diff options
author:    snappy.mirrorbot@gmail.com <snappy.mirrorbot@gmail.com@03e5f5b5-db94-4691-08a0-1a8bf15f6143>  2011-12-05 21:27:26 +0000
committer: snappy.mirrorbot@gmail.com <snappy.mirrorbot@gmail.com@03e5f5b5-db94-4691-08a0-1a8bf15f6143>  2011-12-05 21:27:26 +0000
commit:    d7eb2dc4133794b62cba691f9be40d1549bc32e2 (patch)
tree:      445d694b5d8e8cfc6c07c116546ac71df385cbbb  (limited to snappy.cc)
parent:    5ed51ce15fc4ff8d2f7235704eb6b0c3f762fb88 (diff)
download:  snappy-d7eb2dc4133794b62cba691f9be40d1549bc32e2.tar.gz
           snappy-d7eb2dc4133794b62cba691f9be40d1549bc32e2.tar.bz2
           snappy-d7eb2dc4133794b62cba691f9be40d1549bc32e2.zip
Speed up decompression by moving the refill check to the end of the loop.
This seems to work because in most of the branches, the compiler can evaluate
“ip_limit_ - ip” in a more efficient way than reloading ip_limit_ from memory
(either by already having the entire expression in a register, or reconstructing
it from “avail”, or something else). Memory loads, even from L1, are seemingly
costly in the big picture at the current decompression speeds.
Microbenchmarks (64-bit, opt mode):
Westmere (Intel Core i7):
Benchmark Time(ns) CPU(ns) Iterations
--------------------------------------------
BM_UFlat/0 74492 74491 187894 1.3GB/s html [ +5.9%]
BM_UFlat/1 712268 712263 19644 940.0MB/s urls [ +3.8%]
BM_UFlat/2 10591 10590 1000000 11.2GB/s jpg [ -6.8%]
BM_UFlat/3 29643 29643 469915 3.0GB/s pdf [ +7.9%]
BM_UFlat/4 304669 304667 45930 1.3GB/s html4 [ +4.8%]
BM_UFlat/5 28508 28507 490077 823.1MB/s cp [ +4.0%]
BM_UFlat/6 12415 12415 1000000 856.5MB/s c [ +8.6%]
BM_UFlat/7 3415 3415 4084723 1039.0MB/s lsp [+18.0%]
BM_UFlat/8 979569 979563 14261 1002.5MB/s xls [ +5.8%]
BM_UFlat/9 230150 230148 60934 630.2MB/s txt1 [ +5.2%]
BM_UFlat/10 197167 197166 71135 605.5MB/s txt2 [ +4.7%]
BM_UFlat/11 607394 607390 23041 670.1MB/s txt3 [ +5.6%]
BM_UFlat/12 808502 808496 17316 568.4MB/s txt4 [ +5.0%]
BM_UFlat/13 372791 372788 37564 1.3GB/s bin [ +3.3%]
BM_UFlat/14 44541 44541 313969 818.8MB/s sum [ +5.7%]
BM_UFlat/15 4833 4833 2898697 834.1MB/s man [ +4.8%]
BM_UFlat/16 79855 79855 175356 1.4GB/s pb [ +4.8%]
BM_UFlat/17 245845 245843 56838 715.0MB/s gaviota [ +5.8%]
Clovertown (Intel Core 2):
Benchmark Time(ns) CPU(ns) Iterations
--------------------------------------------
BM_UFlat/0 107911 107890 100000 905.1MB/s html [ +2.2%]
BM_UFlat/1 1011237 1011041 10000 662.3MB/s urls [ +2.5%]
BM_UFlat/2 26775 26770 523089 4.4GB/s jpg [ +0.0%]
BM_UFlat/3 48103 48095 290618 1.8GB/s pdf [ +3.4%]
BM_UFlat/4 437724 437644 31937 892.6MB/s html4 [ +2.1%]
BM_UFlat/5 39607 39600 358284 592.5MB/s cp [ +2.4%]
BM_UFlat/6 18227 18224 768191 583.5MB/s c [ +2.7%]
BM_UFlat/7 5171 5170 2709437 686.4MB/s lsp [ +3.9%]
BM_UFlat/8 1560291 1559989 8970 629.5MB/s xls [ +3.6%]
BM_UFlat/9 335401 335343 41731 432.5MB/s txt1 [ +3.0%]
BM_UFlat/10 287014 286963 48758 416.0MB/s txt2 [ +2.8%]
BM_UFlat/11 888522 888356 15752 458.1MB/s txt3 [ +2.9%]
BM_UFlat/12 1186600 1186378 10000 387.3MB/s txt4 [ +3.1%]
BM_UFlat/13 572295 572188 24468 855.4MB/s bin [ +2.1%]
BM_UFlat/14 64060 64049 218401 569.4MB/s sum [ +4.1%]
BM_UFlat/15 7264 7263 1916168 555.0MB/s man [ +1.4%]
BM_UFlat/16 108853 108836 100000 1039.1MB/s pb [ +1.7%]
BM_UFlat/17 364289 364223 38419 482.6MB/s gaviota [ +4.9%]
Barcelona (AMD Opteron):
Benchmark Time(ns) CPU(ns) Iterations
--------------------------------------------
BM_UFlat/0 103900 103871 100000 940.2MB/s html [ +8.3%]
BM_UFlat/1 1000435 1000107 10000 669.5MB/s urls [ +6.6%]
BM_UFlat/2 24659 24652 567362 4.8GB/s jpg [ +0.1%]
BM_UFlat/3 48206 48193 291121 1.8GB/s pdf [ +5.0%]
BM_UFlat/4 421980 421850 33174 926.0MB/s html4 [ +7.3%]
BM_UFlat/5 40368 40357 346994 581.4MB/s cp [ +8.7%]
BM_UFlat/6 19836 19830 708695 536.2MB/s c [ +8.0%]
BM_UFlat/7 6100 6098 2292774 581.9MB/s lsp [ +9.0%]
BM_UFlat/8 1693093 1692514 8261 580.2MB/s xls [ +8.0%]
BM_UFlat/9 365991 365886 38225 396.4MB/s txt1 [ +7.1%]
BM_UFlat/10 311330 311238 44950 383.6MB/s txt2 [ +7.6%]
BM_UFlat/11 975037 974737 14376 417.5MB/s txt3 [ +6.9%]
BM_UFlat/12 1303558 1303175 10000 352.6MB/s txt4 [ +7.3%]
BM_UFlat/13 517448 517290 27144 946.2MB/s bin [ +5.5%]
BM_UFlat/14 66537 66518 210352 548.3MB/s sum [ +7.5%]
BM_UFlat/15 7976 7974 1760383 505.6MB/s man [ +5.6%]
BM_UFlat/16 103121 103092 100000 1097.0MB/s pb [ +8.7%]
BM_UFlat/17 391431 391314 35733 449.2MB/s gaviota [ +6.5%]
R=sanjay
git-svn-id: https://snappy.googlecode.com/svn/trunk@54 03e5f5b5-db94-4691-08a0-1a8bf15f6143
Diffstat (limited to 'snappy.cc'):
 snappy.cc | 24 ++++++++++++++++++------
 1 file changed, 18 insertions(+), 6 deletions(-)
@@ -669,13 +669,20 @@ class SnappyDecompressor {
   template <class Writer>
   void DecompressAllTags(Writer* writer) {
     const char* ip = ip_;
-    for ( ;; ) {
-      if (ip_limit_ - ip < 5) {
-        ip_ = ip;
-        if (!RefillTag()) return;
-        ip = ip_;
-      }
+    // We could have put this refill fragment only at the beginning of the loop.
+    // However, duplicating it at the end of each branch gives the compiler more
+    // scope to optimize the <ip_limit_ - ip> expression based on the local
+    // context, which overall increases speed.
+    #define MAYBE_REFILL() \
+        if (ip_limit_ - ip < 5) { \
+          ip_ = ip; \
+          if (!RefillTag()) return; \
+          ip = ip_; \
+        }
+
+    MAYBE_REFILL();
+    for ( ;; ) {
       const unsigned char c = *(reinterpret_cast<const unsigned char*>(ip++));

       if ((c & 0x3) == LITERAL) {
@@ -683,6 +690,7 @@ class SnappyDecompressor {
         if (writer->TryFastAppend(ip, ip_limit_ - ip, literal_length)) {
           DCHECK_LT(literal_length, 61);
           ip += literal_length;
+          MAYBE_REFILL();
           continue;
         }

         if (PREDICT_FALSE(literal_length >= 61)) {
@@ -709,6 +717,7 @@ class SnappyDecompressor {
           return;
         }
         ip += literal_length;
+        MAYBE_REFILL();
       } else {
         const uint32 entry = char_table[c];
         const uint32 trailer = LittleEndian::Load32(ip) & wordmask[entry >> 11];
@@ -722,8 +731,11 @@ class SnappyDecompressor {
         if (!writer->AppendFromSelf(copy_offset + trailer, length)) {
           return;
         }
+        MAYBE_REFILL();
       }
     }
+
+#undef MAYBE_REFILL
   }
 };