diff options
author | H. Peter Anvin <hpa@zytor.com> | 2007-08-30 23:42:39 +0000 |
---|---|---|
committer | H. Peter Anvin <hpa@zytor.com> | 2007-08-30 23:42:39 +0000 |
commit | fb5a599c8ac2c67139e2d37c0b7031ea75e1da84 (patch) | |
tree | 45de113bd4ee4e28f5436aaff65fdc6a92cd9ee4 /perllib | |
parent | 74cc5e569c1c8bcdd886734e1ce4c2df741b5b07 (diff) | |
download | nasm-fb5a599c8ac2c67139e2d37c0b7031ea75e1da84.tar.gz nasm-fb5a599c8ac2c67139e2d37c0b7031ea75e1da84.tar.bz2 nasm-fb5a599c8ac2c67139e2d37c0b7031ea75e1da84.zip |
phash.ph: use a bipartite graph to reduce the storage requirements
Since we fold the f- and g-functions together, if we guarantee that g is
bipartite, we can make g twice the size of f without cost. This greatly
improves the odds of generating a smaller hash.
Diffstat (limited to 'perllib')
-rw-r--r-- | perllib/phash.ph | 32 |
1 files changed, 19 insertions, 13 deletions
diff --git a/perllib/phash.ph b/perllib/phash.ph index 438e466..017155d 100644 --- a/perllib/phash.ph +++ b/perllib/phash.ph @@ -3,7 +3,7 @@ # Perfect Minimal Hash Generator written in Perl, which produces # C output. # -# Requires the CPAN Graph module (tested against 0.83) +# Requires the CPAN Graph module (tested against 0.81, 0.83, 0.84) use Graph::Undirected; @@ -65,10 +65,15 @@ sub shuffle(@) { # # ffunc(N) # -sub ffunc($) { - my($n) = @_; +sub ffunc($$$) { + my($n,$s,$i) = @_; + my(@l) = (); - return shuffle(0..$n-1); + while ($n--) { + push(@l, $i); + $i += $s; + } + return shuffle(@l); } # @@ -107,12 +112,13 @@ sub gen_hash_n($$$) { my $i, $sv, @f1, @f2, @g; my $gr; my $k, $v; + my $gsize = 2*$n; - @f1 = ffunc($n); - @f2 = ffunc($n); + @f1 = ffunc($n, 2, 0); + @f2 = ffunc($n, 2, 1); $gr = Graph::Undirected->new; - for ($i = 0; $i < $n; $i++) { + for ($i = 0; $i < $gsize; $i++) { $gr->add_vertex($i); } @@ -149,7 +155,7 @@ sub gen_hash_n($$$) { # edge, the sum of the values for the two vertices give the value # for the edge (which is our hash index.) Since the graph is # acyclic, this is always doable. - for ($i = 0; $i < $n; $i++) { + for ($i = 0; $i < $gsize; $i++) { if (!$gr->has_vertex_attribute($i, "val")) { walk_graph($gr,$i,0); # First vertex in a cluster } @@ -160,7 +166,7 @@ sub gen_hash_n($$$) { # print STDERR "Vertex ", $i, ": ", $g[$i], "\n"; # } - print STDERR "Done: n = $n, sv = $sv\n"; + print STDERR "Done: n = $n, sv = [", join(',', @$sv), "]\n"; return ($n, $sv, \@f1, \@f2, \@g); } @@ -184,8 +190,10 @@ sub gen_perfect_hash($) { my @hashinfo; my $n, $i, $j, $sv, $maxj; - # Minimal power of 2 value for N with enough wiggle room - my $room = scalar(@keys)*1.3; + # Minimal power of 2 value for N with enough wiggle room. + # The scaling constant must be larger than 0.5 in order for the + # algorithm to ever terminate. + my $room = scalar(@keys)*0.8; $n = 1; while ($n < $room) { $n <<= 1; @@ -243,8 +251,6 @@ sub verify_hash_table($$) my $k; my $err = 0; - print STDERR "Verify: n = $n, sv = $sv\n"; - foreach $k (keys(%$href)) { my ($p1, $p2) = prehash($k, $n, $sv); my $pf1 = ${$f1}[$p1]; |