summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: costan <costan@google.com> 2017-08-16 12:38:06 -0700
committer: Victor Costan <pwnall@chromium.org> 2017-08-16 19:18:22 -0700
commit: 632cd0f12856ef7e1feda46021940a98d7117bda (patch)
tree: 6e075081eae94a736d0c528a5d8105a5a2ff9d1c
parent: 77c12adc192ac6620a0f0d340c99149ec56a97a3 (diff)
download: snappy-632cd0f12856ef7e1feda46021940a98d7117bda.tar.gz
          snappy-632cd0f12856ef7e1feda46021940a98d7117bda.tar.bz2
          snappy-632cd0f12856ef7e1feda46021940a98d7117bda.zip
Use 64-bit optimized code path for ARM64.
This is inspired by https://github.com/google/snappy/pull/22.

Benchmark results with the change, Pixel C with Android N2G48B

Benchmark Time(ns) CPU(ns) Iterations
---------------------------------------------------
BM_UFlat/0 119544 119253 1501 818.9MB/s html
BM_UFlat/1 1223950 1208588 163 554.0MB/s urls
BM_UFlat/2 16081 15962 11527 7.2GB/s jpg
BM_UFlat/3 356 352 416666 540.6MB/s jpg_200
BM_UFlat/4 25010 24860 7683 3.8GB/s pdf
BM_UFlat/5 484832 481572 407 811.1MB/s html4
BM_UFlat/6 408410 408713 482 354.9MB/s txt1
BM_UFlat/7 361714 361663 553 330.1MB/s txt2
BM_UFlat/8 1090582 1087912 182 374.1MB/s txt3
BM_UFlat/9 1503127 1503759 133 305.6MB/s txt4
BM_UFlat/10 114183 114285 1715 989.6MB/s pb
BM_UFlat/11 406714 407331 491 431.5MB/s gaviota
BM_UIOVec/0 370397 369888 538 264.0MB/s html
BM_UIOVec/1 3207510 3190000 100 209.9MB/s urls
BM_UIOVec/2 16589 16573 11223 6.9GB/s jpg
BM_UIOVec/3 1052 1052 165289 181.2MB/s jpg_200
BM_UIOVec/4 49151 49184 3985 1.9GB/s pdf
BM_UValidate/0 68115 68095 2893 1.4GB/s html
BM_UValidate/1 792652 792000 250 845.4MB/s urls
BM_UValidate/2 334 334 487804 343.1GB/s jpg
BM_UValidate/3 235 235 666666 809.9MB/s jpg_200
BM_UValidate/4 6126 6130 32626 15.6GB/s pdf
BM_ZFlat/0 292697 290560 678 336.1MB/s html (22.31 %)
BM_ZFlat/1 4062080 4050000 100 165.3MB/s urls (47.78 %)
BM_ZFlat/2 29225 29274 6422 3.9GB/s jpg (99.95 %)
BM_ZFlat/3 1099 1098 163934 173.7MB/s jpg_200 (73.00 %)
BM_ZFlat/4 44117 44233 4205 2.2GB/s pdf (83.30 %)
BM_ZFlat/5 1158058 1157894 171 337.4MB/s html4 (22.52 %)
BM_ZFlat/6 1102983 1093922 181 132.6MB/s txt1 (57.88 %)
BM_ZFlat/7 974142 975490 204 122.4MB/s txt2 (61.91 %)
BM_ZFlat/8 2984670 2990000 100 136.1MB/s txt3 (54.99 %)
BM_ZFlat/9 4100130 4090000 100 112.4MB/s txt4 (66.26 %)
BM_ZFlat/10 276236 275139 716 411.0MB/s pb (19.68 %)
BM_ZFlat/11 760091 759541 262 231.4MB/s gaviota (37.72 %)

Baseline benchmark results, Pixel C with Android N2G48B

Benchmark Time(ns) CPU(ns) Iterations
---------------------------------------------------
BM_UFlat/0 148957 147565 1335 661.8MB/s html
BM_UFlat/1 1527257 1500000 132 446.4MB/s urls
BM_UFlat/2 19589 19397 8764 5.9GB/s jpg
BM_UFlat/3 425 418 408163 455.3MB/s jpg_200
BM_UFlat/4 30096 29552 6497 3.2GB/s pdf
BM_UFlat/5 595933 594594 333 657.0MB/s html4
BM_UFlat/6 516315 514360 383 282.0MB/s txt1
BM_UFlat/7 454653 453514 441 263.2MB/s txt2
BM_UFlat/8 1382687 1361111 144 299.0MB/s txt3
BM_UFlat/9 1967590 1904761 105 241.3MB/s txt4
BM_UFlat/10 148271 144560 1342 782.3MB/s pb
BM_UFlat/11 523997 510471 382 344.4MB/s gaviota
BM_UIOVec/0 478443 465227 417 209.9MB/s html
BM_UIOVec/1 4172860 4060000 100 164.9MB/s urls
BM_UIOVec/2 21470 20975 7342 5.5GB/s jpg
BM_UIOVec/3 1357 1330 75187 143.4MB/s jpg_200
BM_UIOVec/4 63143 61365 3031 1.6GB/s pdf
BM_UValidate/0 86910 85125 2279 1.1GB/s html
BM_UValidate/1 1022256 1000000 195 669.6MB/s urls
BM_UValidate/2 420 417 400000 274.6GB/s jpg
BM_UValidate/3 311 302 571428 630.0MB/s jpg_200
BM_UValidate/4 7778 7584 25445 12.6GB/s pdf
BM_ZFlat/0 469209 457547 424 213.4MB/s html (22.31 %)
BM_ZFlat/1 5633510 5460000 100 122.6MB/s urls (47.78 %)
BM_ZFlat/2 37896 36693 4524 3.1GB/s jpg (99.95 %)
BM_ZFlat/3 1485 1441 123456 132.3MB/s jpg_200 (73.00 %)
BM_ZFlat/4 74870 72775 2652 1.3GB/s pdf (83.30 %)
BM_ZFlat/5 1857321 1785714 112 218.8MB/s html4 (22.52 %)
BM_ZFlat/6 1538723 1492307 130 97.2MB/s txt1 (57.88 %)
BM_ZFlat/7 1338236 1310810 148 91.1MB/s txt2 (61.91 %)
BM_ZFlat/8 4050820 4040000 100 100.7MB/s txt3 (54.99 %)
BM_ZFlat/9 5234940 5230000 100 87.9MB/s txt4 (66.26 %)
BM_ZFlat/10 400309 400000 495 282.7MB/s pb (19.68 %)
BM_ZFlat/11 1063042 1058510 188 166.1MB/s gaviota (37.72 %)
-rw-r--r-- CMakeLists.txt 5
-rw-r--r-- cmake/config.h.in 6
-rw-r--r-- snappy-internal.h 3
-rw-r--r-- snappy-stubs-internal.h 35
4 files changed, 26 insertions, 23 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt
index e9e70c8..2a90a08 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -8,10 +8,7 @@ option(BUILD_SHARED_LIBS "Build shared libraries(DLLs)." OFF)
option(SNAPPY_BUILD_TESTS "Build Snappy's own tests." ON)
include(TestBigEndian)
-test_big_endian(WORDS_BIG_ENDIAN)
-if(WORDS_BIG_ENDIAN)
- add_definitions(-DWORDS_BIGENDIAN=1)
-endif(WORDS_BIG_ENDIAN)
+test_big_endian(SNAPPY_IS_BIG_ENDIAN)
include(CheckIncludeFile)
check_include_file("byteswap.h" HAVE_BYTESWAP_H)
diff --git a/cmake/config.h.in b/cmake/config.h.in
index 64f2648..97cd818 100644
--- a/cmake/config.h.in
+++ b/cmake/config.h.in
@@ -55,8 +55,8 @@
/* Define to 1 if you have the <windows.h> header file. */
#cmakedefine HAVE_WINDOWS_H 1
-/* Define WORDS_BIGENDIAN to 1 if your processor stores words with the most
- significant byte first (like Motorola and SPARC, unlike Intel and VAX). */
-#cmakedefine WORDS_BIGENDIAN 1
+/* Define to 1 if your processor stores words with the most significant byte
+ first (like Motorola and SPARC, unlike Intel and VAX). */
+#cmakedefine SNAPPY_IS_BIG_ENDIAN 1
#endif // THIRD_PARTY_SNAPPY_OPENSOURCE_CMAKE_CONFIG_H_
diff --git a/snappy-internal.h b/snappy-internal.h
index c12637d..4b53d59 100644
--- a/snappy-internal.h
+++ b/snappy-internal.h
@@ -83,7 +83,8 @@ char* CompressFragment(const char* input,
// Requires that s2_limit >= s2.
//
// Separate implementation for 64-bit, little-endian cpus.
-#if defined(ARCH_K8) || (defined(ARCH_PPC) && !defined(WORDS_BIGENDIAN))
+#if !defined(SNAPPY_IS_BIG_ENDIAN) && \
+ (defined(ARCH_K8) || defined(ARCH_PPC) || defined(ARCH_ARM))
static inline std::pair<size_t, bool> FindMatchLength(const char* s1,
const char* s2,
const char* s2_limit) {
diff --git a/snappy-stubs-internal.h b/snappy-stubs-internal.h
index 9898f18..cb605f8 100644
--- a/snappy-stubs-internal.h
+++ b/snappy-stubs-internal.h
@@ -64,6 +64,10 @@
#define ARCH_PPC 1
+#elif defined(__aarch64__)
+
+#define ARCH_ARM 1
+
#endif
// Needed by OS X, among others.
@@ -104,9 +108,10 @@ static const int64 kint64max = static_cast<int64>(0x7FFFFFFFFFFFFFFFLL);
// Potentially unaligned loads and stores.
-// x86 and PowerPC can simply do these loads and stores native.
+// x86, PowerPC, and ARM64 can simply do these loads and stores native.
-#if defined(__i386__) || defined(__x86_64__) || defined(__powerpc__)
+#if defined(__i386__) || defined(__x86_64__) || defined(__powerpc__) || \
+ defined(__aarch64__)
#define UNALIGNED_LOAD16(_p) (*reinterpret_cast<const uint16 *>(_p))
#define UNALIGNED_LOAD32(_p) (*reinterpret_cast<const uint32 *>(_p))
@@ -234,7 +239,7 @@ inline void UNALIGNED_STORE64(void *p, uint64 v) {
#endif
// The following guarantees declaration of the byte swap functions.
-#ifdef WORDS_BIGENDIAN
+#if defined(SNAPPY_IS_BIG_ENDIAN)
#ifdef HAVE_SYS_BYTEORDER_H
#include <sys/byteorder.h>
@@ -291,7 +296,7 @@ inline uint64 bswap_64(uint64 x) {
#endif
-#endif // WORDS_BIGENDIAN
+#endif // defined(SNAPPY_IS_BIG_ENDIAN)
// Convert to little-endian storage, opposite of network format.
// Convert x from host to little endian: x = LittleEndian.FromHost(x);
@@ -305,7 +310,7 @@ inline uint64 bswap_64(uint64 x) {
class LittleEndian {
public:
// Conversion functions.
-#ifdef WORDS_BIGENDIAN
+#if defined(SNAPPY_IS_BIG_ENDIAN)
static uint16 FromHost16(uint16 x) { return bswap_16(x); }
static uint16 ToHost16(uint16 x) { return bswap_16(x); }
@@ -315,7 +320,7 @@ class LittleEndian {
static bool IsLittleEndian() { return false; }
-#else // !defined(WORDS_BIGENDIAN)
+#else // !defined(SNAPPY_IS_BIG_ENDIAN)
static uint16 FromHost16(uint16 x) { return x; }
static uint16 ToHost16(uint16 x) { return x; }
@@ -325,7 +330,7 @@ class LittleEndian {
static bool IsLittleEndian() { return true; }
-#endif // !defined(WORDS_BIGENDIAN)
+#endif // !defined(SNAPPY_IS_BIG_ENDIAN)
// Functions to do unaligned loads and stores in little-endian order.
static uint16 Load16(const void *p) {
@@ -356,9 +361,9 @@ class Bits {
// that it's 0-indexed.
static int FindLSBSetNonZero(uint32 n);
-#if defined(ARCH_K8) || defined(ARCH_PPC)
+#if defined(ARCH_K8) || defined(ARCH_PPC) || defined(ARCH_ARM)
static int FindLSBSetNonZero64(uint64 n);
-#endif // defined(ARCH_K8) || defined(ARCH_PPC)
+#endif // defined(ARCH_K8) || defined(ARCH_PPC) || defined(ARCH_ARM)
private:
// No copying
@@ -376,11 +381,11 @@ inline int Bits::FindLSBSetNonZero(uint32 n) {
return __builtin_ctz(n);
}
-#if defined(ARCH_K8) || defined(ARCH_PPC)
+#if defined(ARCH_K8) || defined(ARCH_PPC) || defined(ARCH_ARM)
inline int Bits::FindLSBSetNonZero64(uint64 n) {
return __builtin_ctzll(n);
}
-#endif // defined(ARCH_K8) || defined(ARCH_PPC)
+#endif // defined(ARCH_K8) || defined(ARCH_PPC) || defined(ARCH_ARM)
#elif defined(_MSC_VER)
@@ -399,13 +404,13 @@ inline int Bits::FindLSBSetNonZero(uint32 n) {
return 32;
}
-#if defined(ARCH_K8) || defined(ARCH_PPC)
+#if defined(ARCH_K8) || defined(ARCH_PPC) || defined(ARCH_ARM)
inline int Bits::FindLSBSetNonZero64(uint64 n) {
unsigned long where;
if (_BitScanForward64(&where, n)) return static_cast<int>(where);
return 64;
}
-#endif // defined(ARCH_K8) || defined(ARCH_PPC)
+#endif // defined(ARCH_K8) || defined(ARCH_PPC) || defined(ARCH_ARM)
#else // Portable versions.
@@ -439,7 +444,7 @@ inline int Bits::FindLSBSetNonZero(uint32 n) {
return rc;
}
-#if defined(ARCH_K8) || defined(ARCH_PPC)
+#if defined(ARCH_K8) || defined(ARCH_PPC) || defined(ARCH_ARM)
// FindLSBSetNonZero64() is defined in terms of FindLSBSetNonZero().
inline int Bits::FindLSBSetNonZero64(uint64 n) {
const uint32 bottombits = static_cast<uint32>(n);
@@ -450,7 +455,7 @@ inline int Bits::FindLSBSetNonZero64(uint64 n) {
return FindLSBSetNonZero(bottombits);
}
}
-#endif // defined(ARCH_K8) || defined(ARCH_PPC)
+#endif // defined(ARCH_K8) || defined(ARCH_PPC) || defined(ARCH_ARM)
#endif // End portable versions.