author     Geoff Pike <gpike@google.com>  2016-06-28 11:53:11 -0700
committer  Alkis Evlogimenos <alkis@google.com>  2017-01-26 21:39:39 +0100
commit     38a5ec5fcaca58556889c840cbcbb2886e524979 (patch)
tree       945c7a6091c7e523011e3426d228517ddbaf4fbd /snappy.cc
parent     094c67de88f41eae494a3823f8aaf0f77b25b980 (diff)
Re-work fast path that emits copies in zippy compression.
The primary motivation for the change is that FindMatchLength is
likely to discover a difference within the first 8 bytes it compares.
When that happens, we know the total match length is less than 12,
because FindMatchLength is invoked only after a 4-byte match has
already been found. Knowing that the length is less than 12 matters
when emitting a copy, because the two-byte variant of an emitted copy
requires it: that variant can only encode lengths 4 through 11 (and
offsets below 2048).
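
For reference, here is a minimal sketch of the two copy encodings in
play, following Snappy's format description; EmitCopy2Bytes and
EmitCopy3Bytes are illustrative names, not the library's actual
functions:

    #include <cassert>
    #include <cstddef>

    // Two-byte copy (tag COPY_1_BYTE_OFFSET, low bits 01): the 3-bit length
    // field holds len - 4, so len must be in [4, 11]; the 11-bit offset is
    // split as 3 high bits in the tag byte and 8 low bits in the next byte.
    static char* EmitCopy2Bytes(char* op, size_t offset, size_t len) {
      assert(len >= 4 && len < 12);
      assert(offset < 2048);
      *op++ = static_cast<char>(0x01 | ((len - 4) << 2) | ((offset >> 3) & 0xe0));
      *op++ = static_cast<char>(offset & 0xff);
      return op;
    }

    // Three-byte copy (tag COPY_2_BYTE_OFFSET, low bits 10): the 6-bit length
    // field holds len - 1, so len can reach 64; the 16-bit offset follows in
    // little-endian order.
    static char* EmitCopy3Bytes(char* op, size_t offset, size_t len) {
      assert(len >= 1 && len <= 64);
      assert(offset < 65536);
      *op++ = static_cast<char>(0x02 | ((len - 1) << 2));
      *op++ = static_cast<char>(offset & 0xff);
      *op++ = static_cast<char>(offset >> 8);
      return op;
    }

The fast path in this change targets the first form: when the
compressor already knows len < 12 (and the offset is small), it can go
straight to the cheaper two-byte encoding.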
This is a performance-tuning change that should not affect the
library's behavior.
With FDO on perflab/Haswell, the geometric mean for ZFlat/* went from
47,290ns to 45,741ns, an improvement of 3.4% (47,290/45,741 ≈ 1.034).
SAMPLE (before)
Benchmark Time(ns) CPU(ns) Iterations
--------------------------------------------
BM_ZFlat/0 102824 102650 40691 951.4MB/s html (22.31 %)
BM_ZFlat/1 1293512 1290442 3225 518.9MB/s urls (47.78 %)
BM_ZFlat/2 10373 10353 417959 11.1GB/s jpg (99.95 %)
BM_ZFlat/3 268 268 15745324 712.4MB/s jpg_200 (73.00 %)
BM_ZFlat/4 12137 12113 342462 7.9GB/s pdf (83.30 %)
BM_ZFlat/5 430672 429720 9724 909.0MB/s html4 (22.52 %)
BM_ZFlat/6 420541 419636 9833 345.6MB/s txt1 (57.88 %)
BM_ZFlat/7 373829 373158 10000 319.9MB/s txt2 (61.91 %)
BM_ZFlat/8 1119014 1116604 3755 364.5MB/s txt3 (54.99 %)
BM_ZFlat/9 1544203 1540657 2748 298.3MB/s txt4 (66.26 %)
BM_ZFlat/10 91041 90866 46002 1.2GB/s pb (19.68 %)
BM_ZFlat/11 332766 331990 10000 529.5MB/s gaviota (37.72 %)
BM_ZFlat/12 39960 39886 100000 588.3MB/s cp (48.12 %)
BM_ZFlat/13 14493 14465 287181 735.1MB/s c (42.47 %)
BM_ZFlat/14 4447 4440 947927 799.3MB/s lsp (48.37 %)
BM_ZFlat/15 1316362 1313350 3196 747.7MB/s xls (41.23 %)
BM_ZFlat/16 312 311 10000000 613.0MB/s xls_200 (78.00 %)
BM_ZFlat/17 388471 387502 10000 1.2GB/s bin (18.11 %)
BM_ZFlat/18 65 64 64838208 2.9GB/s bin_200 (7.50 %)
BM_ZFlat/19 65900 65787 63099 554.3MB/s sum (48.96 %)
BM_ZFlat/20 6188 6177 681951 652.6MB/s man (59.21 %)
SAMPLE (after)
Benchmark Time(ns) CPU(ns) Iterations
--------------------------------------------
BM_ZFlat/0 99259 99044 42428 986.0MB/s html (22.31 %)
BM_ZFlat/1 1257039 1255276 3341 533.4MB/s urls (47.78 %)
BM_ZFlat/2 10044 10030 405781 11.4GB/s jpg (99.95 %)
BM_ZFlat/3 268 267 15732282 713.3MB/s jpg_200 (73.00 %)
BM_ZFlat/4 11675 11657 358629 8.2GB/s pdf (83.30 %)
BM_ZFlat/5 420951 419818 9739 930.5MB/s html4 (22.52 %)
BM_ZFlat/6 415460 414632 10000 349.8MB/s txt1 (57.88 %)
BM_ZFlat/7 367191 366436 10000 325.8MB/s txt2 (61.91 %)
BM_ZFlat/8 1098345 1096036 3819 371.3MB/s txt3 (54.99 %)
BM_ZFlat/9 1508701 1505306 2758 305.3MB/s txt4 (66.26 %)
BM_ZFlat/10 87195 87031 47289 1.3GB/s pb (19.68 %)
BM_ZFlat/11 322338 321637 10000 546.5MB/s gaviota (37.72 %)
BM_ZFlat/12 36739 36668 100000 639.9MB/s cp (48.12 %)
BM_ZFlat/13 13646 13618 304009 780.9MB/s c (42.47 %)
BM_ZFlat/14 4249 4240 992456 837.0MB/s lsp (48.37 %)
BM_ZFlat/15 1262925 1260012 3314 779.4MB/s xls (41.23 %)
BM_ZFlat/16 308 308 10000000 619.8MB/s xls_200 (78.00 %)
BM_ZFlat/17 379750 378944 10000 1.3GB/s bin (18.11 %)
BM_ZFlat/18 62 62 67443280 3.0GB/s bin_200 (7.50 %)
BM_ZFlat/19 61706 61587 67645 592.1MB/s sum (48.96 %)
BM_ZFlat/20 5968 5958 698974 676.6MB/s man (59.21 %)
Diffstat (limited to 'snappy.cc')
-rw-r--r--  snappy.cc | 72
1 file changed, 42 insertions(+), 30 deletions(-)
@@ -41,7 +41,6 @@ namespace snappy {
 using internal::COPY_1_BYTE_OFFSET;
 using internal::COPY_2_BYTE_OFFSET;
-using internal::COPY_4_BYTE_OFFSET;
 using internal::LITERAL;
 using internal::char_table;
 using internal::kMaximumTagLength;
@@ -199,42 +198,54 @@ static inline char* EmitLiteral(char* op,
   return op + len;
 }
 
-static inline char* EmitCopyLessThan64(char* op, size_t offset, int len) {
+static inline char* EmitCopyAtMost64(char* op, size_t offset, size_t len,
+                                     bool len_less_than_12) {
   assert(len <= 64);
   assert(len >= 4);
   assert(offset < 65536);
+  assert(len_less_than_12 == (len < 12));
 
-  if ((len < 12) && (offset < 2048)) {
-    size_t len_minus_4 = len - 4;
-    assert(len_minus_4 < 8);        // Must fit in 3 bits
-    *op++ = COPY_1_BYTE_OFFSET + ((len_minus_4) << 2) + ((offset >> 8) << 5);
+  if (len_less_than_12 && PREDICT_TRUE(offset < 2048)) {
+    // offset fits in 11 bits.  The 3 highest go in the top of the first byte,
+    // and the rest go in the second byte.
+    *op++ = COPY_1_BYTE_OFFSET + ((len - 4) << 2) + ((offset >> 3) & 0xe0);
     *op++ = offset & 0xff;
   } else {
-    *op++ = COPY_2_BYTE_OFFSET + ((len-1) << 2);
-    LittleEndian::Store16(op, offset);
-    op += 2;
+    // Write 4 bytes, though we only care about 3 of them.  The output buffer
+    // is required to have some slack, so the extra byte won't overrun it.
+    uint32 u = COPY_2_BYTE_OFFSET + ((len - 1) << 2) + (offset << 8);
+    LittleEndian::Store32(op, u);
+    op += 3;
   }
   return op;
 }
 
-static inline char* EmitCopy(char* op, size_t offset, int len) {
-  // Emit 64 byte copies but make sure to keep at least four bytes reserved
-  while (PREDICT_FALSE(len >= 68)) {
-    op = EmitCopyLessThan64(op, offset, 64);
-    len -= 64;
-  }
+static inline char* EmitCopy(char* op, size_t offset, size_t len,
+                             bool len_less_than_12) {
+  assert(len_less_than_12 == (len < 12));
+  if (len_less_than_12) {
+    return EmitCopyAtMost64(op, offset, len, true);
+  } else {
+    // A special case for len <= 64 might help, but so far measurements suggest
+    // it's in the noise.
 
-  // Emit an extra 60 byte copy if have too much data to fit in one copy
-  if (len > 64) {
-    op = EmitCopyLessThan64(op, offset, 60);
-    len -= 60;
-  }
+    // Emit 64 byte copies but make sure to keep at least four bytes reserved.
+    while (PREDICT_FALSE(len >= 68)) {
+      op = EmitCopyAtMost64(op, offset, 64, false);
+      len -= 64;
+    }
 
-  // Emit remainder
-  op = EmitCopyLessThan64(op, offset, len);
-  return op;
-}
+    // One or two copies will now finish the job.
+    if (len > 64) {
+      op = EmitCopyAtMost64(op, offset, 60, false);
+      len -= 60;
+    }
 
+    // Emit remainder.
+    op = EmitCopyAtMost64(op, offset, len, len < 12);
+    return op;
+  }
+}
 
 bool GetUncompressedLength(const char* start, size_t n, size_t* result) {
   uint32 v = 0;
@@ -422,19 +433,20 @@ char* CompressFragment(const char* input,
         // We have a 4-byte match at ip, and no need to emit any
         // "literal bytes" prior to ip.
         const char* base = ip;
-        int matched = 4 + FindMatchLength(candidate + 4, ip + 4, ip_end);
+        pair<size_t, bool> p = FindMatchLength(candidate + 4, ip + 4, ip_end);
+        size_t matched = 4 + p.first;
         ip += matched;
         size_t offset = base - candidate;
         assert(0 == memcmp(base, candidate, matched));
-        op = EmitCopy(op, offset, matched);
-        // We could immediately start working at ip now, but to improve
-        // compression we first update table[Hash(ip - 1, ...)].
-        const char* insert_tail = ip - 1;
+        op = EmitCopy(op, offset, matched, p.second);
         next_emit = ip;
         if (PREDICT_FALSE(ip >= ip_limit)) {
          goto emit_remainder;
         }
-        input_bytes = GetEightBytesAt(insert_tail);
+        // We are now looking for a 4-byte match again.  We read
+        // table[Hash(ip, shift)] for that.  To improve compression,
+        // we also update table[Hash(ip - 1, shift)] and table[Hash(ip, shift)].
+        input_bytes = GetEightBytesAt(ip - 1);
         uint32 prev_hash = HashBytes(GetUint32AtOffset(input_bytes, 0), shift);
         table[prev_hash] = ip - base_ip - 1;
         uint32 cur_hash = HashBytes(GetUint32AtOffset(input_bytes, 1), shift);
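
The other half of this change is not visible in a diff limited to
snappy.cc: FindMatchLength (in snappy-internal.h) now returns a
pair<size_t, bool> rather than a plain length. Its first component is
the number of bytes matched beyond the caller's initial 4-byte match,
and its second component records whether the total match length is
below 12. A minimal, unoptimized sketch of that contract
(FindMatchLengthSketch is a hypothetical name; the real routine
compares 8 bytes at a time):

    #include <cstddef>
    #include <utility>

    // Returns {extra bytes matched after the initial 4-byte match, whether
    // the total match length 4 + extra is less than 12, i.e. extra < 8}.
    static std::pair<size_t, bool> FindMatchLengthSketch(const char* s1,
                                                         const char* s2,
                                                         const char* s2_limit) {
      size_t matched = 0;
      while (s2 + matched < s2_limit && s1[matched] == s2[matched]) {
        ++matched;
      }
      return std::make_pair(matched, matched < 8);
    }

In the real 8-bytes-at-a-time routine the boolean is nearly free: a
mismatch inside the first 8-byte comparison already proves extra < 8,
hence a total length below 12, which is exactly the common case the
commit message expects EmitCopy to turn directly into a two-byte copy.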