#pragma once

#include <cstdint>

namespace caffe2 {
namespace math {

// Quantizes floating-point inputs and packs them into a compact "fused"
// byte array.
//
// The fused representation stores [bitwidth][tail][min][max] together with
// the quantized data in one array. Since 8/bitwidth quantized values are
// stored in each byte, the last buckets of some bytes may hold unused bits;
// in total, *tail* buckets are unused.
//
// *bitwidth* and *tail* are encoded at the beginning, followed by two 32-bit
// floating-point values representing min and max:
//
//   | bitwidth | tail | min | max | ... int8 data ... |
//   |    1B    |  1B  |  4B |  4B | ...output_data....|
//
// In output_data, the b-th bucket of the i-th byte stores the i-th value of
// the b-th segment of the input row.
//
// Parameters:
//   input_data    - floating-point values to quantize.
//   output_data   - destination for the fused representation described above.
//                   NOTE(review): the caller presumably must provide room for
//                   the 10-byte header plus the packed data -- confirm
//                   against the implementation.
//   input_size    - presumably the number of floats in input_data -- confirm.
//   bitwidth      - number of bits per quantized value (a byte holds
//                   8/bitwidth values).
//   random        - NOTE(review): presumably selects randomized rounding --
//                   confirm against the implementation.
//   random_buffer - NOTE(review): presumably the random values consumed when
//                   `random` is true -- confirm.
void quantize_and_compress(
    const float* input_data,
    std::uint8_t* output_data,
    std::uint64_t input_size,
    std::uint64_t bitwidth,
    bool random,
    const float* random_buffer);

// Reverses quantize_and_compress: reads a fused array laid out as
//   | bitwidth | tail | min | max | ... int8 data ... |
// and writes the reconstructed floating-point values to output_data.
//
// Parameters:
//   input_data  - fused representation (header followed by packed data).
//   output_data - destination for the dequantized floats.
//   input_size  - NOTE(review): presumably the size of the fused input in
//                 bytes, header included -- confirm against the
//                 implementation.
void decompress_and_dequantize(
    const std::uint8_t* input_data,
    float* output_data,
    std::uint64_t input_size);

} // namespace math
} // namespace caffe2