diff options
author | Geoff Pike <gpike@google.com> | 2016-06-28 11:53:11 -0700 |
---|---|---|
committer | Alkis Evlogimenos <alkis@google.com> | 2017-01-26 21:39:39 +0100 |
commit | 38a5ec5fcaca58556889c840cbcbb2886e524979 (patch) | |
tree | 945c7a6091c7e523011e3426d228517ddbaf4fbd | |
parent | 094c67de88f41eae494a3823f8aaf0f77b25b980 (diff) | |
download | snappy-38a5ec5fcaca58556889c840cbcbb2886e524979.tar.gz snappy-38a5ec5fcaca58556889c840cbcbb2886e524979.tar.bz2 snappy-38a5ec5fcaca58556889c840cbcbb2886e524979.zip |
Re-work fast path that emits copies in zippy compression.
The primary motivation for the change is that FindMatchLength is
likely to discover a difference in the first 8 bytes it compares.
If that occurs, then we know the length of the match is less than 12,
because FindMatchLength is invoked only after a 4-byte match has
already been found (4 matched bytes plus fewer than 8 newly compared
bytes is less than 12).
When emitting a copy, it is useful to know that the length is less
than 12 because the two-byte variant of an emitted copy requires that.
This is a performance-tuning change that should not affect the
library's behavior.
With FDO on perflab/Haswell the geometric mean for ZFlat/* went from
47,290ns to 45,741ns, an improvement of 3.4%.
SAMPLE (before)
BM_ZFlat/0 102824 102650 40691 951.4MB/s html (22.31 %)
BM_ZFlat/1 1293512 1290442 3225 518.9MB/s urls (47.78 %)
BM_ZFlat/2 10373 10353 417959 11.1GB/s jpg (99.95 %)
BM_ZFlat/3 268 268 15745324 712.4MB/s jpg_200 (73.00 %)
BM_ZFlat/4 12137 12113 342462 7.9GB/s pdf (83.30 %)
BM_ZFlat/5 430672 429720 9724 909.0MB/s html4 (22.52 %)
BM_ZFlat/6 420541 419636 9833 345.6MB/s txt1 (57.88 %)
BM_ZFlat/7 373829 373158 10000 319.9MB/s txt2 (61.91 %)
BM_ZFlat/8 1119014 1116604 3755 364.5MB/s txt3 (54.99 %)
BM_ZFlat/9 1544203 1540657 2748 298.3MB/s txt4 (66.26 %)
BM_ZFlat/10 91041 90866 46002 1.2GB/s pb (19.68 %)
BM_ZFlat/11 332766 331990 10000 529.5MB/s gaviota (37.72 %)
BM_ZFlat/12 39960 39886 100000 588.3MB/s cp (48.12 %)
BM_ZFlat/13 14493 14465 287181 735.1MB/s c (42.47 %)
BM_ZFlat/14 4447 4440 947927 799.3MB/s lsp (48.37 %)
BM_ZFlat/15 1316362 1313350 3196 747.7MB/s xls (41.23 %)
BM_ZFlat/16 312 311 10000000 613.0MB/s xls_200 (78.00 %)
BM_ZFlat/17 388471 387502 10000 1.2GB/s bin (18.11 %)
BM_ZFlat/18 65 64 64838208 2.9GB/s bin_200 (7.50 %)
BM_ZFlat/19 65900 65787 63099 554.3MB/s sum (48.96 %)
BM_ZFlat/20 6188 6177 681951 652.6MB/s man (59.21 %)
SAMPLE (after)
Benchmark Time(ns) CPU(ns) Iterations
--------------------------------------------
BM_ZFlat/0 99259 99044 42428 986.0MB/s html (22.31 %)
BM_ZFlat/1 1257039 1255276 3341 533.4MB/s urls (47.78 %)
BM_ZFlat/2 10044 10030 405781 11.4GB/s jpg (99.95 %)
BM_ZFlat/3 268 267 15732282 713.3MB/s jpg_200 (73.00 %)
BM_ZFlat/4 11675 11657 358629 8.2GB/s pdf (83.30 %)
BM_ZFlat/5 420951 419818 9739 930.5MB/s html4 (22.52 %)
BM_ZFlat/6 415460 414632 10000 349.8MB/s txt1 (57.88 %)
BM_ZFlat/7 367191 366436 10000 325.8MB/s txt2 (61.91 %)
BM_ZFlat/8 1098345 1096036 3819 371.3MB/s txt3 (54.99 %)
BM_ZFlat/9 1508701 1505306 2758 305.3MB/s txt4 (66.26 %)
BM_ZFlat/10 87195 87031 47289 1.3GB/s pb (19.68 %)
BM_ZFlat/11 322338 321637 10000 546.5MB/s gaviota (37.72 %)
BM_ZFlat/12 36739 36668 100000 639.9MB/s cp (48.12 %)
BM_ZFlat/13 13646 13618 304009 780.9MB/s c (42.47 %)
BM_ZFlat/14 4249 4240 992456 837.0MB/s lsp (48.37 %)
BM_ZFlat/15 1262925 1260012 3314 779.4MB/s xls (41.23 %)
BM_ZFlat/16 308 308 10000000 619.8MB/s xls_200 (78.00 %)
BM_ZFlat/17 379750 378944 10000 1.3GB/s bin (18.11 %)
BM_ZFlat/18 62 62 67443280 3.0GB/s bin_200 (7.50 %)
BM_ZFlat/19 61706 61587 67645 592.1MB/s sum (48.96 %)
BM_ZFlat/20 5968 5958 698974 676.6MB/s man (59.21 %)
-rw-r--r-- | snappy-internal.h | 46 | ||||
-rw-r--r-- | snappy.cc | 72 | ||||
-rw-r--r-- | snappy_unittest.cc | 7 |
3 files changed, 76 insertions, 49 deletions
diff --git a/snappy-internal.h b/snappy-internal.h index c4d1f6d..e9ca1a8 100644 --- a/snappy-internal.h +++ b/snappy-internal.h @@ -70,11 +70,12 @@ char* CompressFragment(const char* input, uint16* table, const int table_size); -// Return the largest n such that +// Find the largest n such that // // s1[0,n-1] == s2[0,n-1] // and n <= (s2_limit - s2). // +// Return make_pair(n, n < 8). // Does not read *s2_limit or beyond. // Does not read *(s1 + (s2_limit - s2)) or beyond. // Requires that s2_limit >= s2. @@ -82,11 +83,27 @@ char* CompressFragment(const char* input, // Separate implementation for x86_64, for speed. Uses the fact that // x86_64 is little endian. #if defined(ARCH_K8) -static inline int FindMatchLength(const char* s1, - const char* s2, - const char* s2_limit) { +static inline pair<size_t, bool> FindMatchLength(const char* s1, + const char* s2, + const char* s2_limit) { assert(s2_limit >= s2); - int matched = 0; + size_t matched = 0; + + // This block isn't necessary for correctness; we could just start looping + // immediately. As an optimization though, it is useful. It creates some not + // uncommon code paths that determine, without extra effort, whether the match + // length is less than 8. In short, we are hoping to avoid a conditional + // branch, and perhaps get better code layout from the C++ compiler. + if (PREDICT_TRUE(s2 <= s2_limit - 8)) { + uint64 a1 = UNALIGNED_LOAD64(s1); + uint64 a2 = UNALIGNED_LOAD64(s2); + if (a1 != a2) { + return pair<size_t, bool>(Bits::FindLSBSetNonZero64(a1 ^ a2) >> 3, true); + } else { + matched = 8; + s2 += 8; + } + } // Find out how long the match is. We loop over the data 64 bits at a // time until we find a 64-bit block that doesn't match; then we find @@ -97,14 +114,11 @@ static inline int FindMatchLength(const char* s1, s2 += 8; matched += 8; } else { - // On current (mid-2008) Opteron models there is a 3% more - // efficient code sequence to find the first non-matching byte. 
- // However, what follows is ~10% better on Intel Core 2 and newer, - // and we expect AMD's bsf instruction to improve. uint64 x = UNALIGNED_LOAD64(s2) ^ UNALIGNED_LOAD64(s1 + matched); int matching_bits = Bits::FindLSBSetNonZero64(x); matched += matching_bits >> 3; - return matched; + assert(matched >= 8); + return pair<size_t, bool>(matched, false); } } while (PREDICT_TRUE(s2 < s2_limit)) { @@ -112,15 +126,15 @@ static inline int FindMatchLength(const char* s1, ++s2; ++matched; } else { - return matched; + return pair<size_t, bool>(matched, matched < 8); } } - return matched; + return pair<size_t, bool>(matched, matched < 8); } #else -static inline int FindMatchLength(const char* s1, - const char* s2, - const char* s2_limit) { +static inline pair<size_t, bool> FindMatchLength(const char* s1, + const char* s2, + const char* s2_limit) { // Implementation based on the x86-64 version, above. assert(s2_limit >= s2); int matched = 0; @@ -140,7 +154,7 @@ static inline int FindMatchLength(const char* s1, ++matched; } } - return matched; + return pair<size_t, bool>(matched, matched < 8); } #endif @@ -41,7 +41,6 @@ namespace snappy { using internal::COPY_1_BYTE_OFFSET; using internal::COPY_2_BYTE_OFFSET; -using internal::COPY_4_BYTE_OFFSET; using internal::LITERAL; using internal::char_table; using internal::kMaximumTagLength; @@ -199,42 +198,54 @@ static inline char* EmitLiteral(char* op, return op + len; } -static inline char* EmitCopyLessThan64(char* op, size_t offset, int len) { +static inline char* EmitCopyAtMost64(char* op, size_t offset, size_t len, + bool len_less_than_12) { assert(len <= 64); assert(len >= 4); assert(offset < 65536); + assert(len_less_than_12 == (len < 12)); - if ((len < 12) && (offset < 2048)) { - size_t len_minus_4 = len - 4; - assert(len_minus_4 < 8); // Must fit in 3 bits - *op++ = COPY_1_BYTE_OFFSET + ((len_minus_4) << 2) + ((offset >> 8) << 5); + if (len_less_than_12 && PREDICT_TRUE(offset < 2048)) { + // offset fits in 11 bits. 
The 3 highest go in the top of the first byte, + // and the rest go in the second byte. + *op++ = COPY_1_BYTE_OFFSET + ((len - 4) << 2) + ((offset >> 3) & 0xe0); *op++ = offset & 0xff; } else { - *op++ = COPY_2_BYTE_OFFSET + ((len-1) << 2); - LittleEndian::Store16(op, offset); - op += 2; + // Write 4 bytes, though we only care about 3 of them. The output buffer + // is required to have some slack, so the extra byte won't overrun it. + uint32 u = COPY_2_BYTE_OFFSET + ((len - 1) << 2) + (offset << 8); + LittleEndian::Store32(op, u); + op += 3; } return op; } -static inline char* EmitCopy(char* op, size_t offset, int len) { - // Emit 64 byte copies but make sure to keep at least four bytes reserved - while (PREDICT_FALSE(len >= 68)) { - op = EmitCopyLessThan64(op, offset, 64); - len -= 64; - } +static inline char* EmitCopy(char* op, size_t offset, size_t len, + bool len_less_than_12) { + assert(len_less_than_12 == (len < 12)); + if (len_less_than_12) { + return EmitCopyAtMost64(op, offset, len, true); + } else { + // A special case for len <= 64 might help, but so far measurements suggest + // it's in the noise. - // Emit an extra 60 byte copy if have too much data to fit in one copy - if (len > 64) { - op = EmitCopyLessThan64(op, offset, 60); - len -= 60; - } + // Emit 64 byte copies but make sure to keep at least four bytes reserved. + while (PREDICT_FALSE(len >= 68)) { + op = EmitCopyAtMost64(op, offset, 64, false); + len -= 64; + } - // Emit remainder - op = EmitCopyLessThan64(op, offset, len); - return op; -} + // One or two copies will now finish the job. + if (len > 64) { + op = EmitCopyAtMost64(op, offset, 60, false); + len -= 60; + } + // Emit remainder. 
+ op = EmitCopyAtMost64(op, offset, len, len < 12); + return op; + } +} bool GetUncompressedLength(const char* start, size_t n, size_t* result) { uint32 v = 0; @@ -422,19 +433,20 @@ char* CompressFragment(const char* input, // We have a 4-byte match at ip, and no need to emit any // "literal bytes" prior to ip. const char* base = ip; - int matched = 4 + FindMatchLength(candidate + 4, ip + 4, ip_end); + pair<size_t, bool> p = FindMatchLength(candidate + 4, ip + 4, ip_end); + size_t matched = 4 + p.first; ip += matched; size_t offset = base - candidate; assert(0 == memcmp(base, candidate, matched)); - op = EmitCopy(op, offset, matched); - // We could immediately start working at ip now, but to improve - // compression we first update table[Hash(ip - 1, ...)]. - const char* insert_tail = ip - 1; + op = EmitCopy(op, offset, matched, p.second); next_emit = ip; if (PREDICT_FALSE(ip >= ip_limit)) { goto emit_remainder; } - input_bytes = GetEightBytesAt(insert_tail); + // We are now looking for a 4-byte match again. We read + // table[Hash(ip, shift)] for that. To improve compression, + // we also update table[Hash(ip - 1, shift)] and table[Hash(ip, shift)]. 
+ input_bytes = GetEightBytesAt(ip - 1); uint32 prev_hash = HashBytes(GetUint32AtOffset(input_bytes, 0), shift); table[prev_hash] = ip - base_ip - 1; uint32 cur_hash = HashBytes(GetUint32AtOffset(input_bytes, 1), shift); diff --git a/snappy_unittest.cc b/snappy_unittest.cc index 65ac16a..41f5a98 100644 --- a/snappy_unittest.cc +++ b/snappy_unittest.cc @@ -1054,7 +1054,9 @@ TEST(Snappy, ZeroOffsetCopyValidation) { namespace { int TestFindMatchLength(const char* s1, const char *s2, unsigned length) { - return snappy::internal::FindMatchLength(s1, s2, s2 + length); + pair<size_t, bool> p = snappy::internal::FindMatchLength(s1, s2, s2 + length); + CHECK_EQ(p.first < 8, p.second); + return p.first; } } // namespace @@ -1164,8 +1166,7 @@ TEST(Snappy, FindMatchLengthRandom) { } DataEndingAtUnreadablePage u(s); DataEndingAtUnreadablePage v(t); - int matched = snappy::internal::FindMatchLength( - u.data(), v.data(), v.data() + t.size()); + int matched = TestFindMatchLength(u.data(), v.data(), t.size()); if (matched == t.size()) { EXPECT_EQ(s, t); } else { |