summaryrefslogtreecommitdiff
path: root/perllib/phash.ph
diff options
context:
space:
mode:
Diffstat (limited to 'perllib/phash.ph')
-rw-r--r--perllib/phash.ph228
1 files changed, 228 insertions, 0 deletions
diff --git a/perllib/phash.ph b/perllib/phash.ph
new file mode 100644
index 0000000..8feb6b7
--- /dev/null
+++ b/perllib/phash.ph
@@ -0,0 +1,228 @@
+# -*- perl -*-
+#
+# Perfect Minimal Hash Generator written in Perl, which produces
+# C output.
+#
+# Requires the CPAN Graph module (tested against 0.81, 0.83, 0.84)
+#
+
+require 'random_sv_vectors.ph';
+require 'crc64.ph';
+
+#
+# Compute the prehash for a key
+#
+# prehash(key, sv, N)
+#
+sub prehash($$$) {
+ my($key, $n, $sv) = @_;
+ my @c = crc64($sv, $key);
+
+ # Create a bipartite graph...
+ $k1 = (($c[1] & ($n-1)) << 1) + 0; # low word
+ $k2 = (($c[0] & ($n-1)) << 1) + 1; # high word
+
+ return ($k1, $k2);
+}
+
+#
+# Walk the assignment graph, return true on success
+#
+sub walk_graph($$$$) {
+ my($nodeval,$nodeneigh,$n,$v) = @_;
+ my $nx;
+
+ # print STDERR "Vertex $n value $v\n";
+ $$nodeval[$n] = $v;
+
+ foreach $nx (@{$$nodeneigh[$n]}) {
+ # $nx -> [neigh, hash]
+ my ($o, $e) = @$nx;
+
+ # print STDERR "Edge $n,$o value $e: ";
+ my $ov;
+ if (defined($ov = $$nodeval[$o])) {
+ if ($v+$ov != $e) {
+ # Cyclic graph with collision
+ # print STDERR "error, should be ", $v+$ov, "\n";
+ return 0;
+ } else {
+ # print STDERR "ok\n";
+ }
+ } else {
+ return 0 unless (walk_graph($nodeval, $nodeneigh, $o, $e-$v));
+ }
+ }
+ return 1;
+}
+
+#
+# Generate the function assuming a given N.
+#
+# gen_hash_n(N, sv, \%data, run)
+#
+sub gen_hash_n($$$$) {
+ my($n, $sv, $href, $run) = @_;
+ my @keys = keys(%{$href});
+ my $i, $sv;
+ my $gr;
+ my $k, $v;
+ my $gsize = 2*$n;
+ my @nodeval;
+ my @nodeneigh;
+ my %edges;
+
+ for ($i = 0; $i < $gsize; $i++) {
+ $nodeneigh[$i] = [];
+ }
+
+ %edges = ();
+ foreach $k (@keys) {
+ my ($pf1, $pf2) = prehash($k, $n, $sv);
+ ($pf1,$pf2) = ($pf2,$pf1) if ($pf1 > $pf2); # Canonicalize order
+
+ my $pf = "$pf1,$pf2";
+ my $e = ${$href}{$k};
+ my $xkey;
+
+ if (defined($xkey = $edges{$pf})) {
+ next if ($e == ${$href}{$xkey}); # Duplicate hash, safe to ignore
+ if (defined($run)) {
+ print STDERR "$run: Collision: $pf: $k with $xkey\n";
+ }
+ return;
+ }
+
+ # print STDERR "Edge $pf value $e from $k\n";
+
+ $edges{$pf} = $k;
+ push(@{$nodeneigh[$pf1]}, [$pf2, $e]);
+ push(@{$nodeneigh[$pf2]}, [$pf1, $e]);
+ }
+
+ # Now we need to assign values to each vertex, so that for each
+ # edge, the sum of the values for the two vertices give the value
+ # for the edge (which is our hash index.) If we find an impossible
+ # sitation, the graph was cyclic.
+ @nodeval = (undef) x $gsize;
+
+ for ($i = 0; $i < $gsize; $i++) {
+ if (scalar(@{$nodeneigh[$i]})) {
+ # This vertex has neighbors (is used)
+ if (!defined($nodeval[$i])) {
+ # First vertex in a cluster
+ unless (walk_graph(\@nodeval, \@nodeneigh, $i, 0)) {
+ if (defined($run)) {
+ print STDERR "$run: Graph is cyclic\n";
+ }
+ return;
+ }
+ }
+ }
+ }
+
+ # for ($i = 0; $i < $n; $i++) {
+ # print STDERR "Vertex ", $i, ": ", $g[$i], "\n";
+ # }
+
+ if (defined($run)) {
+ printf STDERR "$run: Done: n = $n, sv = [0x%08x, 0x%08x]\n",
+ $$sv[0], $$sv[1];
+ }
+
+ return ($n, $sv, \@nodeval);
+}
+
+#
+# Driver for generating the function
+#
+# gen_perfect_hash(\%data)
+#
+sub gen_perfect_hash($) {
+ my($href) = @_;
+ my @keys = keys(%{$href});
+ my @hashinfo;
+ my $n, $i, $j, $sv, $maxj;
+ my $run = 1;
+
+ # Minimal power of 2 value for N with enough wiggle room.
+ # The scaling constant must be larger than 0.5 in order for the
+ # algorithm to ever terminate.
+ my $room = scalar(@keys)*0.8;
+ $n = 1;
+ while ($n < $room) {
+ $n <<= 1;
+ }
+
+ # Number of times to try...
+ $maxj = scalar @random_sv_vectors;
+
+ for ($i = 0; $i < 4; $i++) {
+ printf STDERR "%d vectors, trying n = %d...\n",
+ scalar @keys, $n;
+ for ($j = 0; $j < $maxj; $j++) {
+ $sv = $random_sv_vectors[$j];
+ @hashinfo = gen_hash_n($n, $sv, $href, $run++);
+ return @hashinfo if (defined(@hashinfo));
+ }
+ $n <<= 1;
+ }
+
+ return;
+}
+
+#
+# Read input file
+#
+sub read_input() {
+ my $key,$val;
+ my %out;
+ my $x = 0;
+
+ while (defined($l = <STDIN>)) {
+ chomp $l;
+ $l =~ s/\s*(\#.*|)$//;
+
+ next if ($l eq '');
+
+ if ($l =~ /^([^=]+)\=([^=]+)$/) {
+ $out{$1} = $2;
+ $x = $2;
+ } else {
+ $out{$l} = $x;
+ }
+ $x++;
+ }
+
+ return %out;
+}
+
+#
+# Verify that the hash table is actually correct...
+#
+sub verify_hash_table($$)
+{
+ my ($href, $hashinfo) = @_;
+ my ($n, $sv, $g) = @{$hashinfo};
+ my $k;
+ my $err = 0;
+
+ foreach $k (keys(%$href)) {
+ my ($pf1, $pf2) = prehash($k, $n, $sv);
+ my $g1 = ${$g}[$pf1];
+ my $g2 = ${$g}[$pf2];
+
+ if ($g1+$g2 != ${$href}{$k}) {
+ printf STDERR "%s(%d,%d): %d+%d = %d != %d\n",
+ $k, $pf1, $pf2, $g1, $g2, $g1+$g2, ${$href}{$k};
+ $err = 1;
+ } else {
+ # printf STDERR "%s: %d+%d = %d ok\n",
+ # $k, $g1, $g2, $g1+$g2;
+ }
+ }
+
+ die "$0: hash validation error\n" if ($err);
+}
+
+1;