summaryrefslogtreecommitdiff
path: root/examples/checksum_test.cc
diff options
context:
space:
mode:
Diffstat (limited to 'examples/checksum_test.cc')
-rw-r--r--examples/checksum_test.cc732
1 files changed, 732 insertions, 0 deletions
diff --git a/examples/checksum_test.cc b/examples/checksum_test.cc
new file mode 100644
index 0000000..028922b
--- /dev/null
+++ b/examples/checksum_test.cc
@@ -0,0 +1,732 @@
+/* Copyright (C) 2007 Josh MacDonald */
+
+extern "C" {
+#include "test.h"
+#include <assert.h>
+}
+
+#include <list>
+#include <vector>
+#include <map>
+#include <algorithm>
+
+using std::list;
+using std::map;
+using std::vector;
+
+// MLCG parameters
+// a, a*
+uint32_t good_32bit_values[] = {
+ 1597334677U, // ...
+ 741103597U, 887987685U,
+};
+
+// a, a*
+uint64_t good_64bit_values[] = {
+ 1181783497276652981ULL, 4292484099903637661ULL,
+ 7664345821815920749ULL, // ...
+};
+
+struct true_type { };
+struct false_type { };
+
+template <typename Word>
+int bitsof();
+
+template<>
+int bitsof<uint32_t>() {
+ return 32;
+}
+
+template<>
+int bitsof<uint64_t>() {
+ return 64;
+}
+
+struct plain {
+ int operator()(const uint8_t &c) {
+ return c;
+ }
+};
+
+template <typename Word>
+struct hhash { // take "h" of the high-bits as a hash value for this
+ // checksum, which are the most "distant" in terms of the
+ // spectral test for the rabin_karp MLCG. For short windows,
+ // the high bits aren't enough, XOR "mask" worth of these in.
+ Word operator()(const Word& t, const int &h, const int &mask) {
+ return (t >> h) ^ (t & mask);
+ }
+};
+
+template <typename Word>
+Word good_word();
+
+template<>
+uint32_t good_word<uint32_t>() {
+ return good_32bit_values[0];
+}
+
+template<>
+uint64_t good_word<uint64_t>() {
+ return good_64bit_values[0];
+}
+
+// CLASSES
+
+#define SELF Word, CksumSize, CksumSkip, Permute, Hash, Compaction
+#define MEMBER template <typename Word, \
+ int CksumSize, \
+ int CksumSkip, \
+ typename Permute, \
+ typename Hash, \
+ int Compaction>
+
+MEMBER
+struct cksum_params {
+ typedef Word word_type;
+ typedef Permute permute_type;
+ typedef Hash hash_type;
+
+ enum { cksum_size = CksumSize,
+ cksum_skip = CksumSkip,
+ compaction = Compaction,
+ };
+};
+
+
+MEMBER
+struct rabin_karp {
+ typedef Word word_type;
+ typedef Permute permute_type;
+ typedef Hash hash_type;
+
+ enum { cksum_size = CksumSize,
+ cksum_skip = CksumSkip,
+ compaction = Compaction,
+ };
+
+ // (a^cksum_size-1 c_0) + (a^cksum_size-2 c_1) ...
+ rabin_karp() {
+ multiplier = good_word<Word>();
+ powers = new Word[cksum_size];
+ powers[cksum_size - 1] = 1;
+ for (int i = cksum_size - 2; i >= 0; i--) {
+ powers[i] = powers[i + 1] * multiplier;
+ }
+ product = powers[0] * multiplier;
+ }
+
+ ~rabin_karp() {
+ delete [] powers;
+ }
+
+ Word step(const uint8_t *ptr) {
+ Word h = 0;
+ for (int i = 0; i < cksum_size; i++) {
+ h += permute_type()(ptr[i]) * powers[i];
+ }
+ return h;
+ }
+
+ Word state0(const uint8_t *ptr) {
+ incr_state = step(ptr);
+ return incr_state;
+ }
+
+ Word incr(const uint8_t *ptr) {
+ incr_state = multiplier * incr_state -
+ product * permute_type()(ptr[-1]) +
+ permute_type()(ptr[cksum_size - 1]);
+ return incr_state;
+ }
+
+ Word *powers;
+ Word product;
+ Word multiplier;
+ Word incr_state;
+};
+
+MEMBER
+struct adler32_cksum {
+ typedef Word word_type;
+ typedef Permute permute_type;
+ typedef Hash hash_type;
+
+ enum { cksum_size = CksumSize,
+ cksum_skip = CksumSkip,
+ compaction = Compaction,
+ };
+
+ Word step(const uint8_t *ptr) {
+ return xd3_lcksum (ptr, cksum_size);
+ }
+
+ Word state0(const uint8_t *ptr) {
+ incr_state = step(ptr);
+ return incr_state;
+ }
+
+ Word incr(const uint8_t *ptr) {
+ incr_state = xd3_large_cksum_update (incr_state, ptr - 1, cksum_size);
+ return incr_state;
+ }
+
+ Word incr_state;
+};
+
+// TESTS
+
+template <typename Word>
+struct file_stats {
+ typedef list<const uint8_t*> ptr_list;
+ typedef Word word_type;
+ typedef map<word_type, ptr_list> table_type;
+ typedef typename table_type::iterator table_iterator;
+ typedef typename ptr_list::iterator ptr_iterator;
+
+ int cksum_size;
+ int cksum_skip;
+ int unique;
+ int unique_values;
+ int count;
+ table_type table;
+
+ file_stats(int size, int skip)
+ : cksum_size(size),
+ cksum_skip(skip),
+ unique(0),
+ unique_values(0),
+ count(0) {
+ }
+
+ void reset() {
+ unique = 0;
+ unique_values = 0;
+ count = 0;
+ table.clear();
+ }
+
+ void update(const word_type &word, const uint8_t *ptr) {
+ table_iterator t_i = table.find(word);
+
+ count++;
+
+ if (t_i == table.end()) {
+ table.insert(make_pair(word, ptr_list()));
+ }
+
+ ptr_list &pl = table[word];
+
+ for (ptr_iterator p_i = pl.begin();
+ p_i != pl.end();
+ ++p_i) {
+ if (memcmp(*p_i, ptr, cksum_size) == 0) {
+ return;
+ }
+ }
+
+ unique++;
+ pl.push_back(ptr);
+ }
+
+ void freeze() {
+ unique_values = table.size();
+ table.clear();
+ }
+};
+
+struct test_result_base;
+
+static vector<test_result_base*> all_tests;
+
+struct test_result_base {
+ virtual ~test_result_base() {
+ }
+ virtual void reset() = 0;
+ virtual void print() = 0;
+ virtual void get(const uint8_t* buf, const int buf_size, int iters) = 0;
+ virtual void stat() = 0;
+ virtual int count() = 0;
+ virtual int dups() = 0;
+ virtual double uniqueness() = 0;
+ virtual double fullness() = 0;
+ virtual double collisions() = 0;
+ virtual double coverage() = 0;
+ virtual double compression() = 0;
+ virtual double time() = 0;
+ virtual double score() = 0;
+ virtual void set_score(double min_dups_frac, double min_time) = 0;
+ virtual double total_time() = 0;
+ virtual int total_count() = 0;
+ virtual int total_dups() = 0;
+};
+
+struct compare_h {
+ bool operator()(test_result_base *a,
+ test_result_base *b) {
+ return a->score() < b->score();
+ }
+};
+
+MEMBER
+struct test_result : public test_result_base {
+ typedef Word word_type;
+ typedef Permute permute_type;
+ typedef Hash hash_type;
+
+ enum { cksum_size = CksumSize,
+ cksum_skip = CksumSkip,
+ compaction = Compaction,
+ };
+
+ const char *test_name;
+ file_stats<Word> fstats;
+ int test_size;
+ int n_steps;
+ int n_incrs;
+ int s_bits;
+ int s_mask;
+ int t_entries;
+ int h_bits;
+ int h_buckets_full;
+ double h_score;
+ char *hash_table;
+ long accum_millis;
+ int accum_iters;
+
+ // These are not reset
+ double accum_time;
+ int accum_count;
+ int accum_dups;
+ int accum_colls;
+ int accum_size;
+
+ test_result(const char *name)
+ : test_name(name),
+ fstats(cksum_size, cksum_skip),
+ hash_table(NULL),
+ accum_millis(0),
+ accum_iters(0),
+ accum_time(0.0),
+ accum_count(0),
+ accum_dups(0),
+ accum_colls(0),
+ accum_size(0) {
+ all_tests.push_back(this);
+ }
+
+ ~test_result() {
+ reset();
+ }
+
+ void reset() {
+ // size of file
+ test_size = -1;
+
+ // count
+ n_steps = -1;
+ n_incrs = -1;
+
+ // four values used by new_table()/summarize_table()
+ s_bits = -1;
+ s_mask = -1;
+ t_entries = -1;
+ h_bits = -1;
+ h_buckets_full = -1;
+
+ accum_millis = 0;
+ accum_iters = 0;
+
+ fstats.reset();
+
+ // temporary
+ if (hash_table) {
+ delete(hash_table);
+ hash_table = NULL;
+ }
+ }
+
+ int count() {
+ if (cksum_skip == 1) {
+ return n_incrs;
+ } else {
+ return n_steps;
+ }
+ }
+
+ int dups() {
+ return fstats.count - fstats.unique;
+ }
+
+ int colls() {
+ return fstats.unique - fstats.unique_values;
+ }
+
+ double uniqueness() {
+ return 1.0 - (double) dups() / count();
+ }
+
+ double fullness() {
+ return (double) h_buckets_full / (1 << h_bits);
+ }
+
+ double collisions() {
+ return (double) colls() / fstats.unique;
+ }
+
+ double coverage() {
+ return (double) h_buckets_full / uniqueness() / count();
+ }
+
+ double compression() {
+ return 1.0 - coverage();
+ }
+
+ double time() {
+ return (double) accum_millis / accum_iters;
+ }
+
+ double score() {
+ return h_score;
+ }
+
+ void set_score(double min_compression, double min_time) {
+ h_score = (compression() - 0.99 * min_compression)
+ * (time() - 0.99 * min_time);
+ }
+
+ double total_time() {
+ return accum_time;
+ }
+
+ int total_count() {
+ return accum_count;
+ }
+
+ int total_dups() {
+ return accum_dups;
+ }
+
+ int total_colls() {
+ return accum_dups;
+ }
+
+ void stat() {
+ accum_time += time();
+ accum_count += count();
+ accum_dups += dups();
+ accum_colls += colls();
+ accum_size += test_size;
+ }
+
+ void print() {
+ if (fstats.count != count()) {
+ fprintf(stderr, "internal error: %d != %d\n", fstats.count, count());
+ abort();
+ }
+ printf("%s: (%u#%u) count %u uniq %0.2f%% full %u (%0.4f%% coll %0.4f%%) covers %0.2f%% w/ 2^%d @ %.4f MB/s %u iters\n",
+ test_name,
+ cksum_size,
+ cksum_skip,
+ count(),
+ 100.0 * uniqueness(),
+ h_buckets_full,
+ 100.0 * fullness(),
+ 100.0 * collisions(),
+ 100.0 * coverage(),
+ h_bits,
+ 0.001 * accum_iters * test_size / accum_millis,
+ accum_iters);
+ }
+
+ int size_log2 (int slots)
+ {
+ int bits = bitsof<word_type>() - 1;
+ int i;
+
+ for (i = 3; i <= bits; i += 1) {
+ if (slots <= (1 << i)) {
+ return i - compaction;
+ }
+ }
+
+ return bits;
+ }
+
+ void new_table(int entries) {
+ t_entries = entries;
+ h_bits = size_log2(entries);
+
+ int n = 1 << h_bits;
+
+ s_bits = bitsof<word_type>() - h_bits;
+ s_mask = n - 1;
+
+ hash_table = new char[n / 8];
+ memset(hash_table, 0, n / 8);
+ }
+
+ int get_table_bit(int i) {
+ return hash_table[i/8] & (1 << i%8);
+ }
+
+ int set_table_bit(int i) {
+ return hash_table[i/8] |= (1 << i%8);
+ }
+
+ void summarize_table() {
+ int n = 1 << h_bits;
+ int f = 0;
+ for (int i = 0; i < n; i++) {
+ if (get_table_bit(i)) {
+ f++;
+ }
+ }
+ h_buckets_full = f;
+ }
+
+ void get(const uint8_t* buf, const int buf_size, int test_iters) {
+ rabin_karp<SELF> test;
+ //adler32_cksum<SELF> test;
+ hash_type hash;
+ const uint8_t *ptr;
+ const uint8_t *end;
+ int last_offset;
+ int periods;
+ int stop;
+
+ test_size = buf_size;
+ last_offset = buf_size - cksum_size;
+
+ if (last_offset < 0) {
+ periods = 0;
+ n_steps = 0;
+ n_incrs = 0;
+ stop = -cksum_size;
+ } else {
+ periods = last_offset / cksum_skip;
+ n_steps = periods + 1;
+ n_incrs = last_offset + 1;
+ stop = last_offset - (periods + 1) * cksum_skip;
+ }
+
+ // Compute file stats once.
+ if (fstats.unique_values == 0) {
+ if (cksum_skip == 1) {
+ for (int i = 0; i <= buf_size - cksum_size; i++) {
+ fstats.update(hash(test.step(buf + i), s_bits, s_mask), buf + i);
+ }
+ } else {
+ ptr = buf + last_offset;
+ end = buf + stop;
+
+ for (; ptr != end; ptr -= cksum_skip) {
+ fstats.update(hash(test.step(ptr), s_bits, s_mask), ptr);
+ }
+ }
+ fstats.freeze();
+ }
+
+ long start_test = get_millisecs_now();
+
+ if (cksum_skip != 1) {
+ new_table(n_steps);
+
+ for (int i = 0; i < test_iters; i++) {
+ ptr = buf + last_offset;
+ end = buf + stop;
+
+ for (; ptr != end; ptr -= cksum_skip) {
+ set_table_bit(hash(test.step(ptr), s_bits, s_mask));
+ }
+ }
+
+ summarize_table();
+ }
+
+ stop = buf_size - cksum_size + 1;
+ if (stop < 0) {
+ stop = 0;
+ }
+
+ if (cksum_skip == 1) {
+
+ new_table(n_incrs);
+
+ for (int i = 0; i < test_iters; i++) {
+ ptr = buf;
+ end = buf + stop;
+
+ if (ptr != end) {
+ set_table_bit(hash(test.state0(ptr++), s_bits, s_mask));
+ }
+
+ for (; ptr != end; ptr++) {
+ Word w = test.incr(ptr);
+ assert(w == test.step(ptr));
+ set_table_bit(hash(w, s_bits, s_mask));
+ }
+ }
+
+ summarize_table();
+ }
+
+ accum_iters += test_iters;
+ accum_millis += get_millisecs_now() - start_test;
+ }
+};
+
+template <typename Word>
+void print_array(const char *tname) {
+ printf("static const %s hash_multiplier[64] = {\n", tname);
+ Word p = 1;
+ for (int i = 0; i < 64; i++) {
+ printf(" %uU,\n", p);
+ p *= good_word<Word>();
+ }
+ printf("};\n", tname);
+}
+
+int main(int argc, char** argv) {
+ int i;
+ uint8_t *buf = NULL;
+ size_t buf_len = 0;
+ int ret;
+
+ if (argc <= 1) {
+ fprintf(stderr, "usage: %s file ...\n", argv[0]);
+ return 1;
+ }
+
+ //print_array<uint32_t>("uint32_t");
+
+#define TEST(T,Z,S,P,H,C) test_result<T,Z,S,P,H<T>,C> \
+ _ ## T ## _ ## Z ## _ ## S ## _ ## P ## _ ## H ## _ ## C \
+ (#T "_" #Z "_" #S "_" #P "_" #H "_" #C)
+
+#if 0
+
+ TEST(uint32_t, 4, SKIP, plain, hhash, 0); /* x */ \
+ TEST(uint32_t, 4, SKIP, plain, hhash, 1); /* x */ \
+ TEST(uint32_t, 4, SKIP, plain, hhash, 2); /* x */ \
+ TEST(uint32_t, 4, SKIP, plain, hhash, 3); /* x */ \
+
+#endif
+
+#define TESTS(SKIP) \
+ TEST(uint32_t, 9, SKIP, plain, hhash, 0); /* x */ \
+ TEST(uint32_t, 9, SKIP, plain, hhash, 1); /* x */ \
+ TEST(uint32_t, 9, SKIP, plain, hhash, 2); /* x */ \
+ TEST(uint32_t, 9, SKIP, plain, hhash, 3)
+
+#define TESTS_ALL(SKIP) \
+ TEST(uint32_t, 3, SKIP, plain, hhash, 0); \
+ TEST(uint32_t, 3, SKIP, plain, hhash, 1); \
+ TEST(uint32_t, 4, SKIP, plain, hhash, 0); /* x */ \
+ TEST(uint32_t, 4, SKIP, plain, hhash, 1); /* x */ \
+ TEST(uint32_t, 4, SKIP, plain, hhash, 2); /* x */ \
+ TEST(uint32_t, 4, SKIP, plain, hhash, 3); /* x */ \
+ TEST(uint32_t, 5, SKIP, plain, hhash, 0); \
+ TEST(uint32_t, 5, SKIP, plain, hhash, 1); \
+ TEST(uint32_t, 8, SKIP, plain, hhash, 0); \
+ TEST(uint32_t, 8, SKIP, plain, hhash, 1); \
+ TEST(uint32_t, 9, SKIP, plain, hhash, 0); /* x */ \
+ TEST(uint32_t, 9, SKIP, plain, hhash, 1); /* x */ \
+ TEST(uint32_t, 9, SKIP, plain, hhash, 2); /* x */ \
+ TEST(uint32_t, 9, SKIP, plain, hhash, 3); /* x */ \
+ TEST(uint32_t, 11, SKIP, plain, hhash, 0); /* x */ \
+ TEST(uint32_t, 11, SKIP, plain, hhash, 1); /* x */ \
+ TEST(uint32_t, 13, SKIP, plain, hhash, 0); \
+ TEST(uint32_t, 13, SKIP, plain, hhash, 1); \
+ TEST(uint32_t, 15, SKIP, plain, hhash, 0); /* x */ \
+ TEST(uint32_t, 15, SKIP, plain, hhash, 1); /* x */ \
+ TEST(uint32_t, 16, SKIP, plain, hhash, 0); /* x */ \
+ TEST(uint32_t, 16, SKIP, plain, hhash, 1); /* x */ \
+ TEST(uint32_t, 21, SKIP, plain, hhash, 0); \
+ TEST(uint32_t, 21, SKIP, plain, hhash, 1); \
+ TEST(uint32_t, 34, SKIP, plain, hhash, 0); \
+ TEST(uint32_t, 34, SKIP, plain, hhash, 1); \
+ TEST(uint32_t, 55, SKIP, plain, hhash, 0); \
+ TEST(uint32_t, 55, SKIP, plain, hhash, 1)
+
+ TESTS(1); // *
+// TESTS(2); // *
+// TESTS(3); // *
+// TESTS(5); // *
+// TESTS(8); // *
+// TESTS(9);
+// TESTS(11);
+// TESTS(13); // *
+ TESTS(15);
+// TESTS(16);
+// TESTS(21); // *
+// TESTS(34); // *
+// TESTS(55); // *
+// TESTS(89); // *
+
+ for (i = 1; i < argc; i++) {
+ if ((ret = read_whole_file(argv[i],
+ & buf,
+ & buf_len))) {
+ return 1;
+ }
+
+ fprintf(stderr, "file %s is %zu bytes\n",
+ argv[i], buf_len);
+
+ double min_time = -1.0;
+ double min_compression = 0.0;
+
+ for (vector<test_result_base*>::iterator i = all_tests.begin();
+ i != all_tests.end(); ++i) {
+ test_result_base *test = *i;
+ test->reset();
+
+ int iters = 100;
+ long start_test = get_millisecs_now();
+
+ do {
+ test->get(buf, buf_len, iters);
+ iters *= 3;
+ iters /= 2;
+ } while (get_millisecs_now() - start_test < 2000);
+
+ test->stat();
+
+ if (min_time < 0.0) {
+ min_compression = test->compression();
+ min_time = test->time();
+ }
+
+ if (min_time > test->time()) {
+ min_time = test->time();
+ }
+
+ if (min_compression > test->compression()) {
+ min_compression = test->compression();
+ }
+
+ test->print();
+ }
+
+// for (vector<test_result_base*>::iterator i = all_tests.begin();
+// i != all_tests.end(); ++i) {
+// test_result_base *test = *i;
+// test->set_score(min_compression, min_time);
+// }
+
+// sort(all_tests.begin(), all_tests.end(), compare_h());
+
+// for (vector<test_result_base*>::iterator i = all_tests.begin();
+// i != all_tests.end(); ++i) {
+// test_result_base *test = *i;
+// test->print();
+// }
+
+ free(buf);
+ buf = NULL;
+ }
+
+ return 0;
+}