diff options
Diffstat (limited to 'phash.pl')
-rwxr-xr-x | phash.pl | 303 |
1 files changed, 303 insertions, 0 deletions
diff --git a/phash.pl b/phash.pl new file mode 100755 index 0000000..15f262a --- /dev/null +++ b/phash.pl @@ -0,0 +1,303 @@ +#!/usr/bin/perl +# +# Perfect Minimal Hash Generator written in Perl, which produces +# C output. +# +# Requires the CPAN Graph module (tested against 0.83) + +use Graph::Undirected; + + +# Produce the same values every time, please... +srand(0); + +# +# 32-bit rotate +# +sub rot($$) { + my($v,$s) = @_; + + return (($v << $s)+($v >> (32-$s))) & 0xffffffff; +} + +# +# Compute the prehash for a key +# +# prehash(key, sv, N) +# +sub prehash($$$) { + my($key, $n, $sv) = @_; + my $c; + my $k1 = 0, $k2 = 0; + + foreach $c (unpack("C*", $key)) { + $k1 = (rot($k1,$sv+2) + rot($k1, 5) + $c) & 0xffffffff; + $k2 = (rot($k2,$sv) + rot($k2, 7) + $c) & 0xffffffff; + } + + return ($k1 % $n, $k2 % $n); +} + +# +# Shuffle a list. +# +sub shuffle(@) { + my(@l) = @_; + my($i, $j); + my $tmp; + + for ($i = scalar(@l)-1; $i > 0; $i--) { + $j = int(rand($i)); + + $tmp = $l[$j]; + $l[$j] = $l[$i]; + $l[$i] = $tmp; + } + + return @l; +} + +# +# Pick a set of F-functions of length N. +# +# ffunc(N) +# +sub ffunc($) { + my($n) = @_; + + return shuffle(0..$n-1); +} + +# +# Walk the assignment graph +# +sub walk_graph($$$) { + my($gr,$n,$v) = @_; + my $nx; + + # print STDERR "Vertex $n value $v\n"; + $gr->set_vertex_attribute($n,"val",$v); + + foreach $nx ($gr->neighbors($n)) { + die unless ($gr->has_edge_attribute($n, $nx, "hash")); + my $e = $gr->get_edge_attribute($n, $nx, "hash"); + + # print STDERR "Edge $n=$nx value $e: "; + + if ($gr->has_vertex_attribute($nx, "val")) { + die if ($v+$gr->get_vertex_attribute($nx, "val") != $e); + # print STDERR "ok\n"; + } else { + walk_graph($gr, $nx, $e-$v); + } + } +} + +# +# Generate the function assuming a given N. +# +# gen_hash_n(N, sv, \%data) +# +sub gen_hash_n($$$) { + my($n, $sv, $href) = @_; + my @keys = keys(%{$href}); + my $i, $sv, @f1, @f2, @g; + my $gr; + my $k, $v; + + @f1 = ffunc($n); + @f2 = ffunc($n); + + print STDERR "*** New graph ***\n"; + + $gr = Graph::Undirected->new; + for ($i = 0; $i < $n; $i++) { + $gr->add_vertex($i); + } + + foreach $k (@keys) { + my ($p1, $p2) = prehash($k, $n, $sv); + my $pf1 = $f1[$p1]; + my $pf2 = $f2[$p2]; + my $e = ${$href}{$k}; + + if ($gr->has_edge($pf1, $pf2)) { + my $xkey = $gr->get_edge_attribute($pf1, $pf2, "key"); + my ($xp1, $xp2) = prehash($xkey, $n, $sv); + print STDERR "Collision: $pf1=$pf2 $k ($p1,$p2) with "; + print STDERR "$xkey ($xp1,$xp2)\n"; + return; + } + + # print STDERR "Edge $pf1=$pf2 value $e from $k ($p1,$p2)\n"; + + $gr->add_edge($pf1, $pf2); + $gr->set_edge_attribute($pf1, $pf2, "hash", $e); + $gr->set_edge_attribute($pf1, $pf2, "key", $k); + } + + # At this point, we're good if the graph is acyclic. + if ($gr->is_cyclic) { + print STDERR "Graph is cyclic\n"; + return; + } + + # print STDERR "Graph:\n$gr\n"; + + # Now we need to assign values to each vertex, so that for each + # edge, the sum of the values for the two vertices give the value + # for the edge (which is our hash index.) Since the graph is + # acyclic, this is always doable. + for ($i = 0; $i < $n; $i++) { + if (!$gr->has_vertex_attribute($i, "val")) { + walk_graph($gr,$i,0); # First vertex in a cluster + } + push(@g, $gr->get_vertex_attribute($i, "val")); + } + + # for ($i = 0; $i < $n; $i++) { + # print STDERR "Vertex ", $i, ": ", $g[$i], "\n"; + # } + + print STDERR "Done: n = $n, sv = $sv\n"; + + return ($n, $sv, \@f1, \@f2, \@g); +} + + +# +# Driver for generating the function +# +# gen_perfect_hash(\%data) +# +sub gen_perfect_hash($) { + my($href) = @_; + my @keys = keys(%{$href}); + my @hashinfo; + my $n, $i, $j, $sv, $maxj; + + # Minimal power of 2 value for N + $n = 1; + while ($n < scalar(@keys)) { + $n <<= 1; + } + + $maxj = 256; # Number of times to try + + for ($i = 0; $i < 4; $i++) { + print STDERR "Trying n = $n...\n"; + for ($j = 0; $j < $maxj; $j++) { + for ($sv = 0; $sv < 32; $sv++) { + @hashinfo = gen_hash_n($n, $sv, $href); + return @hashinfo if (defined(@hashinfo)); + } + } + $n <<= 1; + $maxj >>= 1; + } + + return; +} + +# +# Read input file +# +sub read_input() { + my $key,$val; + my %out; + my $x = 0; + + while (defined($l = <STDIN>)) { + chomp $l; + $l =~ s/\s*(\#.*|)$//; + + next if ($l eq ''); + + if ($l =~ /^([^=]+)\=([^=]+)$/) { + $out{$1} = $2; + $x = $2; + } else { + $out{$l} = $x; + } + $x++; + } + + return %out; +} + +# +# Verify that the hash table is actually correct... +# +sub verify_hash_table($$) +{ + my ($href, $hashinfo) = @_; + my ($n, $sv, $f1, $f2, $g) = @{$hashinfo}; + my $k; + my $err = 0; + + print STDERR "Verify: n = $n, sv = $sv\n"; + + foreach $k (keys(%$href)) { + my ($p1, $p2) = prehash($k, $n, $sv); + my $pf1 = ${$f1}[$p1]; + my $pf2 = ${$f2}[$p2]; + my $g1 = ${$g}[$pf1]; + my $g2 = ${$g}[$pf2]; + + if ($g1+$g2 != ${$href}{$k}) { + printf STDERR "%s(%d,%d): %d=%d, %d+%d = %d != %d\n", + $k, $p1, $p2, $pf1, $pf2, $g1, $g2, $g1+$g2, ${$href}{$k}; + $err = 1; + } else { + printf STDERR "%s: %d+%d = %d ok\n", + $k, $g1, $g2, $g1+$g2; + } + } + + die "$0: hash validation error\n" if ($err); +} + +# +# Main program +# +sub main() { + my $n; + my %data; + my @hashinfo; + my $x, $i; + + %data = read_input(); + @hashinfo = gen_perfect_hash(\%data); + + if (!defined(@hashinfo)) { + die "$0: no hash found\n"; + } + + verify_hash_table(\%data, \@hashinfo); + + ($n, $sv, $f1, $f2, $g) = @hashinfo; + + print "static int HASHNAME_fg1[$n] =\n"; + print "{\n"; + for ($i = 0; $i < $n; $i++) { + print "\t", ${$g}[${$f1}[$i]], "\n"; + } + print "};\n\n"; + + print "static int HASHNAME_fg2[$n] =\n"; + print "{\n"; + for ($i = 0; $i < $n; $i++) { + print "\t", ${$g}[${$f2}[$i]], "\n"; + } + print "};\n\n"; + + print "struct p_hash HASHNAME =\n"; + print "{\n"; + print "\t$n\n"; + print "\t$sv\n"; + print "\tHASHNAME_fg1,\n"; + print "\tHASHNAME_fg2,\n"; + print "};\n"; +} + +main(); |