diff options
Diffstat (limited to 'examples/checksum_test.cc')
-rw-r--r-- | examples/checksum_test.cc | 732 |
1 files changed, 732 insertions, 0 deletions
diff --git a/examples/checksum_test.cc b/examples/checksum_test.cc new file mode 100644 index 0000000..028922b --- /dev/null +++ b/examples/checksum_test.cc @@ -0,0 +1,732 @@ +/* Copyright (C) 2007 Josh MacDonald */ + +extern "C" { +#include "test.h" +#include <assert.h> +} + +#include <list> +#include <vector> +#include <map> +#include <algorithm> + +using std::list; +using std::map; +using std::vector; + +// MLCG parameters +// a, a* +uint32_t good_32bit_values[] = { + 1597334677U, // ... + 741103597U, 887987685U, +}; + +// a, a* +uint64_t good_64bit_values[] = { + 1181783497276652981ULL, 4292484099903637661ULL, + 7664345821815920749ULL, // ... +}; + +struct true_type { }; +struct false_type { }; + +template <typename Word> +int bitsof(); + +template<> +int bitsof<uint32_t>() { + return 32; +} + +template<> +int bitsof<uint64_t>() { + return 64; +} + +struct plain { + int operator()(const uint8_t &c) { + return c; + } +}; + +template <typename Word> +struct hhash { // take "h" of the high-bits as a hash value for this + // checksum, which are the most "distant" in terms of the + // spectral test for the rabin_karp MLCG. For short windows, + // the high bits aren't enough, XOR "mask" worth of these in. + Word operator()(const Word& t, const int &h, const int &mask) { + return (t >> h) ^ (t & mask); + } +}; + +template <typename Word> +Word good_word(); + +template<> +uint32_t good_word<uint32_t>() { + return good_32bit_values[0]; +} + +template<> +uint64_t good_word<uint64_t>() { + return good_64bit_values[0]; +} + +// CLASSES + +#define SELF Word, CksumSize, CksumSkip, Permute, Hash, Compaction +#define MEMBER template <typename Word, \ + int CksumSize, \ + int CksumSkip, \ + typename Permute, \ + typename Hash, \ + int Compaction> + +MEMBER +struct cksum_params { + typedef Word word_type; + typedef Permute permute_type; + typedef Hash hash_type; + + enum { cksum_size = CksumSize, + cksum_skip = CksumSkip, + compaction = Compaction, + }; +}; + + +MEMBER +struct rabin_karp { + typedef Word word_type; + typedef Permute permute_type; + typedef Hash hash_type; + + enum { cksum_size = CksumSize, + cksum_skip = CksumSkip, + compaction = Compaction, + }; + + // (a^cksum_size-1 c_0) + (a^cksum_size-2 c_1) ... + rabin_karp() { + multiplier = good_word<Word>(); + powers = new Word[cksum_size]; + powers[cksum_size - 1] = 1; + for (int i = cksum_size - 2; i >= 0; i--) { + powers[i] = powers[i + 1] * multiplier; + } + product = powers[0] * multiplier; + } + + ~rabin_karp() { + delete [] powers; + } + + Word step(const uint8_t *ptr) { + Word h = 0; + for (int i = 0; i < cksum_size; i++) { + h += permute_type()(ptr[i]) * powers[i]; + } + return h; + } + + Word state0(const uint8_t *ptr) { + incr_state = step(ptr); + return incr_state; + } + + Word incr(const uint8_t *ptr) { + incr_state = multiplier * incr_state - + product * permute_type()(ptr[-1]) + + permute_type()(ptr[cksum_size - 1]); + return incr_state; + } + + Word *powers; + Word product; + Word multiplier; + Word incr_state; +}; + +MEMBER +struct adler32_cksum { + typedef Word word_type; + typedef Permute permute_type; + typedef Hash hash_type; + + enum { cksum_size = CksumSize, + cksum_skip = CksumSkip, + compaction = Compaction, + }; + + Word step(const uint8_t *ptr) { + return xd3_lcksum (ptr, cksum_size); + } + + Word state0(const uint8_t *ptr) { + incr_state = step(ptr); + return incr_state; + } + + Word incr(const uint8_t *ptr) { + incr_state = xd3_large_cksum_update (incr_state, ptr - 1, cksum_size); + return incr_state; + } + + Word incr_state; +}; + +// TESTS + +template <typename Word> +struct file_stats { + typedef list<const uint8_t*> ptr_list; + typedef Word word_type; + typedef map<word_type, ptr_list> table_type; + typedef typename table_type::iterator table_iterator; + typedef typename ptr_list::iterator ptr_iterator; + + int cksum_size; + int cksum_skip; + int unique; + int unique_values; + int count; + table_type table; + + file_stats(int size, int skip) + : cksum_size(size), + cksum_skip(skip), + unique(0), + unique_values(0), + count(0) { + } + + void reset() { + unique = 0; + unique_values = 0; + count = 0; + table.clear(); + } + + void update(const word_type &word, const uint8_t *ptr) { + table_iterator t_i = table.find(word); + + count++; + + if (t_i == table.end()) { + table.insert(make_pair(word, ptr_list())); + } + + ptr_list &pl = table[word]; + + for (ptr_iterator p_i = pl.begin(); + p_i != pl.end(); + ++p_i) { + if (memcmp(*p_i, ptr, cksum_size) == 0) { + return; + } + } + + unique++; + pl.push_back(ptr); + } + + void freeze() { + unique_values = table.size(); + table.clear(); + } +}; + +struct test_result_base; + +static vector<test_result_base*> all_tests; + +struct test_result_base { + virtual ~test_result_base() { + } + virtual void reset() = 0; + virtual void print() = 0; + virtual void get(const uint8_t* buf, const int buf_size, int iters) = 0; + virtual void stat() = 0; + virtual int count() = 0; + virtual int dups() = 0; + virtual double uniqueness() = 0; + virtual double fullness() = 0; + virtual double collisions() = 0; + virtual double coverage() = 0; + virtual double compression() = 0; + virtual double time() = 0; + virtual double score() = 0; + virtual void set_score(double min_dups_frac, double min_time) = 0; + virtual double total_time() = 0; + virtual int total_count() = 0; + virtual int total_dups() = 0; +}; + +struct compare_h { + bool operator()(test_result_base *a, + test_result_base *b) { + return a->score() < b->score(); + } +}; + +MEMBER +struct test_result : public test_result_base { + typedef Word word_type; + typedef Permute permute_type; + typedef Hash hash_type; + + enum { cksum_size = CksumSize, + cksum_skip = CksumSkip, + compaction = Compaction, + }; + + const char *test_name; + file_stats<Word> fstats; + int test_size; + int n_steps; + int n_incrs; + int s_bits; + int s_mask; + int t_entries; + int h_bits; + int h_buckets_full; + double h_score; + char *hash_table; + long accum_millis; + int accum_iters; + + // These are not reset + double accum_time; + int accum_count; + int accum_dups; + int accum_colls; + int accum_size; + + test_result(const char *name) + : test_name(name), + fstats(cksum_size, cksum_skip), + hash_table(NULL), + accum_millis(0), + accum_iters(0), + accum_time(0.0), + accum_count(0), + accum_dups(0), + accum_colls(0), + accum_size(0) { + all_tests.push_back(this); + } + + ~test_result() { + reset(); + } + + void reset() { + // size of file + test_size = -1; + + // count + n_steps = -1; + n_incrs = -1; + + // four values used by new_table()/summarize_table() + s_bits = -1; + s_mask = -1; + t_entries = -1; + h_bits = -1; + h_buckets_full = -1; + + accum_millis = 0; + accum_iters = 0; + + fstats.reset(); + + // temporary + if (hash_table) { + delete(hash_table); + hash_table = NULL; + } + } + + int count() { + if (cksum_skip == 1) { + return n_incrs; + } else { + return n_steps; + } + } + + int dups() { + return fstats.count - fstats.unique; + } + + int colls() { + return fstats.unique - fstats.unique_values; + } + + double uniqueness() { + return 1.0 - (double) dups() / count(); + } + + double fullness() { + return (double) h_buckets_full / (1 << h_bits); + } + + double collisions() { + return (double) colls() / fstats.unique; + } + + double coverage() { + return (double) h_buckets_full / uniqueness() / count(); + } + + double compression() { + return 1.0 - coverage(); + } + + double time() { + return (double) accum_millis / accum_iters; + } + + double score() { + return h_score; + } + + void set_score(double min_compression, double min_time) { + h_score = (compression() - 0.99 * min_compression) + * (time() - 0.99 * min_time); + } + + double total_time() { + return accum_time; + } + + int total_count() { + return accum_count; + } + + int total_dups() { + return accum_dups; + } + + int total_colls() { + return accum_dups; + } + + void stat() { + accum_time += time(); + accum_count += count(); + accum_dups += dups(); + accum_colls += colls(); + accum_size += test_size; + } + + void print() { + if (fstats.count != count()) { + fprintf(stderr, "internal error: %d != %d\n", fstats.count, count()); + abort(); + } + printf("%s: (%u#%u) count %u uniq %0.2f%% full %u (%0.4f%% coll %0.4f%%) covers %0.2f%% w/ 2^%d @ %.4f MB/s %u iters\n", + test_name, + cksum_size, + cksum_skip, + count(), + 100.0 * uniqueness(), + h_buckets_full, + 100.0 * fullness(), + 100.0 * collisions(), + 100.0 * coverage(), + h_bits, + 0.001 * accum_iters * test_size / accum_millis, + accum_iters); + } + + int size_log2 (int slots) + { + int bits = bitsof<word_type>() - 1; + int i; + + for (i = 3; i <= bits; i += 1) { + if (slots <= (1 << i)) { + return i - compaction; + } + } + + return bits; + } + + void new_table(int entries) { + t_entries = entries; + h_bits = size_log2(entries); + + int n = 1 << h_bits; + + s_bits = bitsof<word_type>() - h_bits; + s_mask = n - 1; + + hash_table = new char[n / 8]; + memset(hash_table, 0, n / 8); + } + + int get_table_bit(int i) { + return hash_table[i/8] & (1 << i%8); + } + + int set_table_bit(int i) { + return hash_table[i/8] |= (1 << i%8); + } + + void summarize_table() { + int n = 1 << h_bits; + int f = 0; + for (int i = 0; i < n; i++) { + if (get_table_bit(i)) { + f++; + } + } + h_buckets_full = f; + } + + void get(const uint8_t* buf, const int buf_size, int test_iters) { + rabin_karp<SELF> test; + //adler32_cksum<SELF> test; + hash_type hash; + const uint8_t *ptr; + const uint8_t *end; + int last_offset; + int periods; + int stop; + + test_size = buf_size; + last_offset = buf_size - cksum_size; + + if (last_offset < 0) { + periods = 0; + n_steps = 0; + n_incrs = 0; + stop = -cksum_size; + } else { + periods = last_offset / cksum_skip; + n_steps = periods + 1; + n_incrs = last_offset + 1; + stop = last_offset - (periods + 1) * cksum_skip; + } + + // Compute file stats once. + if (fstats.unique_values == 0) { + if (cksum_skip == 1) { + for (int i = 0; i <= buf_size - cksum_size; i++) { + fstats.update(hash(test.step(buf + i), s_bits, s_mask), buf + i); + } + } else { + ptr = buf + last_offset; + end = buf + stop; + + for (; ptr != end; ptr -= cksum_skip) { + fstats.update(hash(test.step(ptr), s_bits, s_mask), ptr); + } + } + fstats.freeze(); + } + + long start_test = get_millisecs_now(); + + if (cksum_skip != 1) { + new_table(n_steps); + + for (int i = 0; i < test_iters; i++) { + ptr = buf + last_offset; + end = buf + stop; + + for (; ptr != end; ptr -= cksum_skip) { + set_table_bit(hash(test.step(ptr), s_bits, s_mask)); + } + } + + summarize_table(); + } + + stop = buf_size - cksum_size + 1; + if (stop < 0) { + stop = 0; + } + + if (cksum_skip == 1) { + + new_table(n_incrs); + + for (int i = 0; i < test_iters; i++) { + ptr = buf; + end = buf + stop; + + if (ptr != end) { + set_table_bit(hash(test.state0(ptr++), s_bits, s_mask)); + } + + for (; ptr != end; ptr++) { + Word w = test.incr(ptr); + assert(w == test.step(ptr)); + set_table_bit(hash(w, s_bits, s_mask)); + } + } + + summarize_table(); + } + + accum_iters += test_iters; + accum_millis += get_millisecs_now() - start_test; + } +}; + +template <typename Word> +void print_array(const char *tname) { + printf("static const %s hash_multiplier[64] = {\n", tname); + Word p = 1; + for (int i = 0; i < 64; i++) { + printf(" %uU,\n", p); + p *= good_word<Word>(); + } + printf("};\n", tname); +} + +int main(int argc, char** argv) { + int i; + uint8_t *buf = NULL; + size_t buf_len = 0; + int ret; + + if (argc <= 1) { + fprintf(stderr, "usage: %s file ...\n", argv[0]); + return 1; + } + + //print_array<uint32_t>("uint32_t"); + +#define TEST(T,Z,S,P,H,C) test_result<T,Z,S,P,H<T>,C> \ + _ ## T ## _ ## Z ## _ ## S ## _ ## P ## _ ## H ## _ ## C \ + (#T "_" #Z "_" #S "_" #P "_" #H "_" #C) + +#if 0 + + TEST(uint32_t, 4, SKIP, plain, hhash, 0); /* x */ \ + TEST(uint32_t, 4, SKIP, plain, hhash, 1); /* x */ \ + TEST(uint32_t, 4, SKIP, plain, hhash, 2); /* x */ \ + TEST(uint32_t, 4, SKIP, plain, hhash, 3); /* x */ \ + +#endif + +#define TESTS(SKIP) \ + TEST(uint32_t, 9, SKIP, plain, hhash, 0); /* x */ \ + TEST(uint32_t, 9, SKIP, plain, hhash, 1); /* x */ \ + TEST(uint32_t, 9, SKIP, plain, hhash, 2); /* x */ \ + TEST(uint32_t, 9, SKIP, plain, hhash, 3) + +#define TESTS_ALL(SKIP) \ + TEST(uint32_t, 3, SKIP, plain, hhash, 0); \ + TEST(uint32_t, 3, SKIP, plain, hhash, 1); \ + TEST(uint32_t, 4, SKIP, plain, hhash, 0); /* x */ \ + TEST(uint32_t, 4, SKIP, plain, hhash, 1); /* x */ \ + TEST(uint32_t, 4, SKIP, plain, hhash, 2); /* x */ \ + TEST(uint32_t, 4, SKIP, plain, hhash, 3); /* x */ \ + TEST(uint32_t, 5, SKIP, plain, hhash, 0); \ + TEST(uint32_t, 5, SKIP, plain, hhash, 1); \ + TEST(uint32_t, 8, SKIP, plain, hhash, 0); \ + TEST(uint32_t, 8, SKIP, plain, hhash, 1); \ + TEST(uint32_t, 9, SKIP, plain, hhash, 0); /* x */ \ + TEST(uint32_t, 9, SKIP, plain, hhash, 1); /* x */ \ + TEST(uint32_t, 9, SKIP, plain, hhash, 2); /* x */ \ + TEST(uint32_t, 9, SKIP, plain, hhash, 3); /* x */ \ + TEST(uint32_t, 11, SKIP, plain, hhash, 0); /* x */ \ + TEST(uint32_t, 11, SKIP, plain, hhash, 1); /* x */ \ + TEST(uint32_t, 13, SKIP, plain, hhash, 0); \ + TEST(uint32_t, 13, SKIP, plain, hhash, 1); \ + TEST(uint32_t, 15, SKIP, plain, hhash, 0); /* x */ \ + TEST(uint32_t, 15, SKIP, plain, hhash, 1); /* x */ \ + TEST(uint32_t, 16, SKIP, plain, hhash, 0); /* x */ \ + TEST(uint32_t, 16, SKIP, plain, hhash, 1); /* x */ \ + TEST(uint32_t, 21, SKIP, plain, hhash, 0); \ + TEST(uint32_t, 21, SKIP, plain, hhash, 1); \ + TEST(uint32_t, 34, SKIP, plain, hhash, 0); \ + TEST(uint32_t, 34, SKIP, plain, hhash, 1); \ + TEST(uint32_t, 55, SKIP, plain, hhash, 0); \ + TEST(uint32_t, 55, SKIP, plain, hhash, 1) + + TESTS(1); // * +// TESTS(2); // * +// TESTS(3); // * +// TESTS(5); // * +// TESTS(8); // * +// TESTS(9); +// TESTS(11); +// TESTS(13); // * + TESTS(15); +// TESTS(16); +// TESTS(21); // * +// TESTS(34); // * +// TESTS(55); // * +// TESTS(89); // * + + for (i = 1; i < argc; i++) { + if ((ret = read_whole_file(argv[i], + & buf, + & buf_len))) { + return 1; + } + + fprintf(stderr, "file %s is %zu bytes\n", + argv[i], buf_len); + + double min_time = -1.0; + double min_compression = 0.0; + + for (vector<test_result_base*>::iterator i = all_tests.begin(); + i != all_tests.end(); ++i) { + test_result_base *test = *i; + test->reset(); + + int iters = 100; + long start_test = get_millisecs_now(); + + do { + test->get(buf, buf_len, iters); + iters *= 3; + iters /= 2; + } while (get_millisecs_now() - start_test < 2000); + + test->stat(); + + if (min_time < 0.0) { + min_compression = test->compression(); + min_time = test->time(); + } + + if (min_time > test->time()) { + min_time = test->time(); + } + + if (min_compression > test->compression()) { + min_compression = test->compression(); + } + + test->print(); + } + +// for (vector<test_result_base*>::iterator i = all_tests.begin(); +// i != all_tests.end(); ++i) { +// test_result_base *test = *i; +// test->set_score(min_compression, min_time); +// } + +// sort(all_tests.begin(), all_tests.end(), compare_h()); + +// for (vector<test_result_base*>::iterator i = all_tests.begin(); +// i != all_tests.end(); ++i) { +// test_result_base *test = *i; +// test->print(); +// } + + free(buf); + buf = NULL; + } + + return 0; +} |