summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Geoff Pike <gpike@google.com> 2016-07-06 08:24:48 -0700
committer: Alkis Evlogimenos <alkis@google.com> 2017-01-26 21:42:26 +0100
commit: 27c5d86527532a8a2d73ae0e6d78bdbd626c3590 (patch)
tree: bb86969a885aa5e8239f57b0e3e9171b6572557d
parent: 4a740940803af465c8a4c13819e8c7a5214dbd6b (diff)
download: snappy-27c5d86527532a8a2d73ae0e6d78bdbd626c3590.tar.gz
snappy-27c5d86527532a8a2d73ae0e6d78bdbd626c3590.tar.bz2
snappy-27c5d86527532a8a2d73ae0e6d78bdbd626c3590.zip
Re-work fast path for handling copies in zippy decompression.
This is a performance-tuning change that shouldn't change the behavior of the library.

This adds some complexity but the performance gain might make that worthwhile: With FDO on perflab/haswell, a 4.0% gain (geometric mean).

SAMPLE (before)

Benchmark Time(ns) CPU(ns) Iterations
------------------------------------------------
BM_UFlat/0 36638 36552 100000 2.6GB/s html
BM_UFlat/1 457153 455895 9173 1.4GB/s urls
BM_UFlat/2 5850 5837 685481 19.6GB/s jpg
BM_UFlat/3 122 122 34551988 1.5GB/s jpg_200
BM_UFlat/4 6797 6781 620811 14.1GB/s pdf
BM_UFlat/5 179485 179037 23471 2.1GB/s html4
BM_UFlat/6 142734 142384 29525 1018.7MB/s txt1
BM_UFlat/7 125233 124924 33709 955.6MB/s txt2
BM_UFlat/8 382548 381533 10000 1066.7MB/s txt3
BM_UFlat/9 525614 524297 8018 876.5MB/s txt4
BM_UFlat/10 34946 34868 100000 3.2GB/s pb
BM_UFlat/11 149548 149208 28063 1.2GB/s gaviota
BM_UFlat/12 10684 10663 392580 2.1GB/s cp
BM_UFlat/13 5494 5484 766584 1.9GB/s c
BM_UFlat/14 1691 1688 2488784 2.1GB/s lsp
BM_UFlat/15 676443 674726 6129 1.4GB/s xls
BM_UFlat/16 156 156 26656909 1.2GB/s xls_200
BM_UFlat/17 239911 239297 17558 2.0GB/s bin
BM_UFlat/18 182 182 23072932 1047.9MB/s bin_200
BM_UFlat/19 21544 21499 194484 1.7GB/s sum
BM_UFlat/20 2236 2232 1877810 1.8GB/s man
BM_UFlatSink/0 42266 42179 99732 2.3GB/s html
BM_UFlatSink/1 461810 460633 9055 1.4GB/s urls
BM_UFlatSink/2 5816 5804 632829 19.8GB/s jpg
BM_UFlatSink/3 124 123 34351698 1.5GB/s jpg_200
BM_UFlatSink/4 7173 7157 609929 13.3GB/s pdf
BM_UFlatSink/5 184795 184302 22660 2.1GB/s html4
BM_UFlatSink/6 143552 143223 29272 1012.7MB/s txt1
BM_UFlatSink/7 127160 126890 33178 940.8MB/s txt2
BM_UFlatSink/8 382219 381313 10000 1067.3MB/s txt3
BM_UFlatSink/9 528042 526713 7988 872.5MB/s txt4
BM_UFlatSink/10 41389 41305 100000 2.7GB/s pb
BM_UFlatSink/11 147215 146877 28854 1.2GB/s gaviota
BM_UFlatSink/12 12008 11984 348139 1.9GB/s cp
BM_UFlatSink/13 5444 5433 775084 1.9GB/s c
BM_UFlatSink/14 1647 1644 2552119 2.1GB/s lsp
BM_UFlatSink/15 665011 663424 6320 1.4GB/s xls
BM_UFlatSink/16 153 153 27571837 1.2GB/s xls_200
BM_UFlatSink/17 239735 239169 17411 2.0GB/s bin
BM_UFlatSink/18 183 182 23005573 1046.8MB/s bin_200
BM_UFlatSink/19 22544 22498 187705 1.6GB/s sum
BM_UFlatSink/20 2190 2186 1917894 1.8GB/s man

SAMPLE (after)

Benchmark Time(ns) CPU(ns) Iterations
------------------------------------------------
BM_UFlat/0 33940 33889 100000 2.8GB/s html
BM_UFlat/1 440728 439944 9586 1.5GB/s urls
BM_UFlat/2 5652 5641 744776 20.3GB/s jpg
BM_UFlat/3 123 123 34647884 1.5GB/s jpg_200
BM_UFlat/4 6628 6615 631892 14.4GB/s pdf
BM_UFlat/5 169523 169227 24197 2.3GB/s html4
BM_UFlat/6 144139 143892 29232 1008.0MB/s txt1
BM_UFlat/7 127148 126915 33144 940.6MB/s txt2
BM_UFlat/8 380267 379233 10000 1073.2MB/s txt3
BM_UFlat/9 529495 528194 7957 870.0MB/s txt4
BM_UFlat/10 31844 31784 100000 3.5GB/s pb
BM_UFlat/11 146822 146476 28737 1.2GB/s gaviota
BM_UFlat/12 10784 10762 392176 2.1GB/s cp
BM_UFlat/13 5528 5518 760934 1.9GB/s c
BM_UFlat/14 1721 1719 2449291 2.0GB/s lsp
BM_UFlat/15 673304 671774 6255 1.4GB/s xls
BM_UFlat/16 155 155 27092003 1.2GB/s xls_200
BM_UFlat/17 230424 229902 18285 2.1GB/s bin
BM_UFlat/18 185 184 22818199 1033.9MB/s bin_200
BM_UFlat/19 21035 20996 200765 1.7GB/s sum
BM_UFlat/20 2242 2238 1864380 1.8GB/s man
BM_UFlatSink/0 33487 33405 100000 2.9GB/s html
BM_UFlatSink/1 431108 430226 9764 1.5GB/s urls
BM_UFlatSink/2 5927 5916 648112 19.4GB/s jpg
BM_UFlatSink/3 123 122 34704423 1.5GB/s jpg_200
BM_UFlatSink/4 6472 6461 653462 14.8GB/s pdf
BM_UFlatSink/5 164309 163988 25567 2.3GB/s html4
BM_UFlatSink/6 138274 138020 30311 1050.9MB/s txt1
BM_UFlatSink/7 120844 120637 34708 989.6MB/s txt2
BM_UFlatSink/8 371046 370366 10000 1098.9MB/s txt3
BM_UFlatSink/9 510021 508982 8269 902.9MB/s txt4
BM_UFlatSink/10 30889 30844 100000 3.6GB/s pb
BM_UFlatSink/11 140752 140521 29903 1.2GB/s gaviota
BM_UFlatSink/12 10162 10146 413600 2.3GB/s cp
BM_UFlatSink/13 5264 5256 762398 2.0GB/s c
BM_UFlatSink/14 1622 1619 2606069 2.1GB/s lsp
BM_UFlatSink/15 646897 645756 6512 1.5GB/s xls
BM_UFlatSink/16 150 150 28223595 1.2GB/s xls_200
BM_UFlatSink/17 226096 225650 18629 2.1GB/s bin
BM_UFlatSink/18 185 184 22907935 1035.3MB/s bin_200
BM_UFlatSink/19 21369 21335 198881 1.7GB/s sum
BM_UFlatSink/20 2139 2136 1953637 1.8GB/s man
-rw-r--r-- snappy.cc 38
1 files changed, 30 insertions, 8 deletions
diff --git a/snappy.cc b/snappy.cc
index e2dd546..7b12fb1 100644
--- a/snappy.cc
+++ b/snappy.cc
@@ -1067,19 +1067,41 @@ class SnappyArrayWriter {
if (produced <= offset - 1u) {
return false;
}
- if (len <= 16 && offset >= 8 && space_left >= 16) {
- // Fast path, used for the majority (70-80%) of dynamic invocations.
+ if (offset >= 8 && space_left >= 16) {
UnalignedCopy64(op - offset, op);
UnalignedCopy64(op - offset + 8, op + 8);
- } else {
- if (space_left >= len + kMaxIncrementCopyOverflow) {
- IncrementalCopyFastPath(op - offset, op, len);
+ if (PREDICT_TRUE(len <= 16)) {
+ // Fast path, used for the majority (70-80%) of dynamic invocations.
+ op_ = op + len;
+ return true;
+ }
+ op += 16;
+ // Copy 8 bytes at a time. This will write as many as 7 bytes more
+ // than necessary, so we check if space_left >= len + 7.
+ if (space_left >= len + 7) {
+ const char* src = op - offset;
+ ssize_t l = len - 16; // 16 bytes were already handled, above.
+ do {
+ UnalignedCopy64(src, op);
+ src += 8;
+ op += 8;
+ l -= 8;
+ } while (l > 0);
+ // l is now negative if we wrote extra bytes; adjust op_ accordingly.
+ op_ = op + l;
+ return true;
+ } else if (space_left < len) {
+ return false;
} else {
- if (space_left < len) {
- return false;
- }
+ len -= 16;
IncrementalCopy(op - offset, op, len);
}
+ } else if (space_left >= len + kMaxIncrementCopyOverflow) {
+ IncrementalCopyFastPath(op - offset, op, len);
+ } else if (space_left < len) {
+ return false;
+ } else {
+ IncrementalCopy(op - offset, op, len);
}
op_ = op + len;