Diffstat (limited to 'compute/cker/include/cker/NeonTensorUtils.h')
-rw-r--r-- | compute/cker/include/cker/NeonTensorUtils.h | 319 |
1 file changed, 319 insertions, 0 deletions
diff --git a/compute/cker/include/cker/NeonTensorUtils.h b/compute/cker/include/cker/NeonTensorUtils.h
new file mode 100644
index 000000000..4d97dd187
--- /dev/null
+++ b/compute/cker/include/cker/NeonTensorUtils.h
@@ -0,0 +1,319 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_NEON_TENSOR_UTILS_H__
+#define __NNFW_CKER_NEON_TENSOR_UTILS_H__
+
+#include "cker/neon/neon_check.h"
+
+#include <algorithm>
+#include <cassert>
+#include <cmath>
+#include <cstdlib>
+#include <cstring>
+
+#ifdef USE_NEON
+
+#define kFloatWeightsPerNeonLane 4
+
+namespace nnfw
+{
+namespace cker
+{
+
+namespace
+{
+
+// Allocates at least size bytes of uninitialized storage whose alignment is
+// specified by alignment. The size parameter must be an integral multiple of
+// alignment.
+// The caller is responsible for freeing the allocated memory by calling free
+// on the pointer returned in freeing_buffer.
+void *aligned_alloc(size_t alignment, size_t size, void **freeing_buffer)
+{
+  *freeing_buffer = malloc(size + alignment);
+  const size_t offset = ((uintptr_t)*freeing_buffer) % alignment;                          // NOLINT
+  return offset == 0 ? *freeing_buffer : ((char *)*freeing_buffer + (alignment - offset)); // NOLINT
+}
+
+} // namespace
+
+bool NeonIsZeroVector(const float *vector, int v_size)
+{
+  // If v_size is not divisible by kFloatWeightsPerNeonLane, we cannot
+  // use the main vectorized loop, and we need to process sequentially.
+  // postamble_start shows the start index where this should happen.
+  const int postamble_start = v_size - (v_size & (kFloatWeightsPerNeonLane - 1));
+
+  const float32x4_t zero_x4_float = vmovq_n_f32(0.0f);
+  for (int v = 0; v < postamble_start; v += kFloatWeightsPerNeonLane)
+  {
+    const float32x4_t i_x4_float = vld1q_f32(vector + v);
+    uint32x4_t cmp_result = vceqq_f32(i_x4_float, zero_x4_float);
+    if (vgetq_lane_u32(cmp_result, 0) == 0)
+      return false;
+    if (vgetq_lane_u32(cmp_result, 1) == 0)
+      return false;
+    if (vgetq_lane_u32(cmp_result, 2) == 0)
+      return false;
+    if (vgetq_lane_u32(cmp_result, 3) == 0)
+      return false;
+  }
+
+  // Postamble loop
+  for (int v = postamble_start; v < v_size; ++v)
+  {
+    if (vector[v] != 0.0)
+      return false;
+  }
+  return true;
+}
+
+void NeonSymmetricQuantizeFloats(const float *values, const int size, int8_t *quantized_values,
+                                 float *min, float *max, float *scaling_factor)
+{
+  // TODO(raziel): vectorize min/max calculation.
+  auto minmax = std::minmax_element(values, values + size);
+  *min = *minmax.first;
+  *max = *minmax.second;
+  const int kScale = 127;
+  const float range = std::max(std::abs(*min), std::abs(*max));
+  if (range == 0)
+  {
+    memset(quantized_values, 0, size * sizeof(int8_t));
+    *scaling_factor = 1;
+    return;
+  }
+  *scaling_factor = range / kScale;
+  const float scaling_factor_inv = kScale / range;
+
+  const int postamble_start = size - (size & (2 * kFloatWeightsPerNeonLane - 1));
+
+  // Vectorized constants.
+  const float32x4_t q_factor_f32x4 = vmovq_n_f32(scaling_factor_inv);
+  const float32x4_t point5_f32x4 = vmovq_n_f32(0.5);
+  const float32x4_t zero_f32x4 = vmovq_n_f32(0.0);
+  const int32x4_t scale_i32x4 = vmovq_n_s32(kScale);
+  const int32x4_t neg_scale_i32x4 = vmovq_n_s32(-kScale);
+
+  for (int i = 0; i < postamble_start; i += 2 * kFloatWeightsPerNeonLane)
+  {
+    // Implements the vectorized version of the following:
+    //   const int32_t quantized_value =
+    //       static_cast<int32_t>(std::round(scaling_factor_inv * values[i]));
+    // Since the vectorized round intrinsic (vrndqa_f32) is not supported on
+    // all Neon flavors, we use the following method for rounding:
+    //   if (x < 0) (int)(x - 0.5); if (x >= 0) (int)(x + 0.5)
+    float32x4_t value0_f32x4 = vld1q_f32(&values[i]);
+    float32x4_t value1_f32x4 = vld1q_f32(&values[i + kFloatWeightsPerNeonLane]);
+    float32x4_t mul0_f32x4 = vmulq_f32(value0_f32x4, q_factor_f32x4);
+    float32x4_t mul1_f32x4 = vmulq_f32(value1_f32x4, q_factor_f32x4);
+
+    int32x4_t cmp_with_zero0_ui32x4 = (int32x4_t)vcltq_f32(mul0_f32x4, zero_f32x4); // NOLINT
+    int32x4_t cmp_with_zero1_ui32x4 = (int32x4_t)vcltq_f32(mul1_f32x4, zero_f32x4); // NOLINT
+
+    float32x4_t cmp_with_zero0_f32x4 = vcvtq_f32_s32(cmp_with_zero0_ui32x4);
+    float32x4_t cmp_with_zero1_f32x4 = vcvtq_f32_s32(cmp_with_zero1_ui32x4);
+    cmp_with_zero0_f32x4 = vaddq_f32(cmp_with_zero0_f32x4, point5_f32x4);
+    cmp_with_zero1_f32x4 = vaddq_f32(cmp_with_zero1_f32x4, point5_f32x4);
+
+    mul0_f32x4 = vaddq_f32(mul0_f32x4, cmp_with_zero0_f32x4);
+    mul1_f32x4 = vaddq_f32(mul1_f32x4, cmp_with_zero1_f32x4);
+
+    int32x4_t f2i0_i32x4 = vcvtq_s32_f32(mul0_f32x4);
+    int32x4_t f2i1_i32x4 = vcvtq_s32_f32(mul1_f32x4);
+
+    // Implements the vectorized version of the following block:
+    //   quantized_values[i] = std::min(kScale, std::max(-kScale, quantized_value));
+    int32x4_t max0_i32x4 = vmaxq_s32(f2i0_i32x4, neg_scale_i32x4);
+    int32x4_t max1_i32x4 = vmaxq_s32(f2i1_i32x4, neg_scale_i32x4);
+    int32x4_t min0_i32x4 = vminq_s32(max0_i32x4, scale_i32x4);
+    int32x4_t min1_i32x4 = vminq_s32(max1_i32x4, scale_i32x4);
+
+    int16x4_t min0_16x4 = vmovn_s32(min0_i32x4);
+    int16x4_t min1_16x4 = vmovn_s32(min1_i32x4);
+
+    int16x8_t min_16x8 = vcombine_s16(min0_16x4, min1_16x4);
+    int8x8_t min_s8x8 = vqmovn_s16(min_16x8);
+    vst1_s8(&quantized_values[i], min_s8x8);
+  }
+
+  for (int i = postamble_start; i < size; ++i)
+  {
+    const int32_t quantized_value =
+      static_cast<int32_t>(std::round(scaling_factor_inv * values[i]));
+    quantized_values[i] = std::min(kScale, std::max(-kScale, quantized_value));
+  }
+}
+
+void NeonMatrixBatchVectorMultiplyAccumulate(const int8_t *__restrict__ matrix, const int m_rows,
+                                             const int m_cols, const int8_t *__restrict__ vectors,
+                                             const float *scaling_factors, int n_batch,
+                                             float *__restrict__ result, int result_stride)
+{
+  const int kWeightsPerUint32 = 4;
+  const int kWeightsPerNeonLane = 16;
+  // If the number of columns (m_cols) is not divisible by kWeightsPerUint32, we
+  // set a flag and allocate an aligned memory block. The flag tells the kernel
+  // loop below to copy each row into that aligned block before loading it.
+  bool unaligned = false;
+  int8_t *aligned_row = nullptr;
+  void *aligned_row_free = nullptr;
+  if ((m_cols & (kWeightsPerUint32 - 1)) != 0)
+  {
+    unaligned = true;
+    aligned_row = (int8_t *)aligned_alloc(kWeightsPerUint32, m_cols, // NOLINT
+                                          &aligned_row_free);
+  }
+  void *aligned_vec_free = nullptr;
+  int8_t *aligned_vec = (int8_t *)aligned_alloc(kWeightsPerUint32, m_cols, // NOLINT
+                                                &aligned_vec_free);
+
+  // If m_cols is not divisible by kWeightsPerNeonLane, we cannot use the main
+  // vectorized loop for the whole row, and we need to process the tail
+  // sequentially. postamble_start shows the start index where this should happen.
+  const int postamble_start = m_cols - (m_cols & (kWeightsPerNeonLane - 1));
+
+  int batch, row, col;
+  for (batch = 0; batch < n_batch; ++batch)
+  {
+    const float batch_scaling_factor = scaling_factors[batch];
+    // Copy the vector data to an aligned vector.
+    memcpy(aligned_vec, vectors + batch * m_cols, sizeof(int8_t) * m_cols);
+    // Compute the dot-product for every row.
+    for (row = 0; row < m_rows; ++row, result += result_stride)
+    {
+      // Get the address of the first element of the row.
+      int8_t *row_ptr = (int8_t *)matrix + row * m_cols; // NOLINT
+      if (unaligned)
+      {
+        memcpy(aligned_row, row_ptr, sizeof(int8_t) * m_cols);
+        row_ptr = aligned_row;
+      }
+
+      // Initialize the dot product sum for the row to 0.
+      int32x4_t dotprod = vmovq_n_s32(0);
+
+      // Prefetch the row to cache.
+      __builtin_prefetch(row_ptr, 0 /* prefetch for read */, 3 /* temporal locality */);
+
+      // For every block of 16 8-bit elements.
+      col = 0;
+      for (; col < postamble_start; col += kWeightsPerNeonLane)
+      {
+        // Load 16 8-bit values each from the row and the vector to operate on.
+        // Here the assumption is that each buffer is 4-byte aligned.
+        assert(((uintptr_t)(&row_ptr[col]) & (kWeightsPerUint32 - 1)) == 0);
+        const int8x16_t s1_8x16 = vld1q_s8((const int8_t *)(aligned_vec + col));
+        const int8x16_t s2_8x16 = vld1q_s8((const int8_t *)(row_ptr + col));
+        // Multiply the low halves (i.e. the lower 8 8-bit numbers in the registers).
+        int16x8_t prod_16x8 = vmull_s8(vget_low_s8(s1_8x16), vget_low_s8(s2_8x16));
+        // Multiply the high halves (i.e. the upper 8 8-bit numbers in the
+        // registers), and accumulate with the result of the low product.
+        // The assumption here is that overflow will not happen as we quantize
+        // our values to be in the range [-127, 127]. As such the sum of the 2
+        // products always fits in 15 bits (32767 in absolute value).
+        prod_16x8 = vmlal_s8(prod_16x8, vget_high_s8(s1_8x16), vget_high_s8(s2_8x16));
+
+        dotprod = vpadalq_s16(dotprod, prod_16x8);
+      } // for col
+
+      int32_t postable_sum = 0;
+      // Postamble loop.
+      // TODO(raziel): if (ABSL_PREDICT_FALSE(postamble_start < m_rows))
+      if (postamble_start < m_cols)
+      {
+        col = postamble_start;
+        if ((m_cols - postamble_start) >= (kWeightsPerNeonLane >> 1))
+        {
+          // Load 8 8-bit values each from the row and the vector to operate on.
+          // Here the assumption is that each buffer is 4-byte aligned.
+          assert(((uintptr_t)(&row_ptr[col]) & (kWeightsPerUint32 - 1)) == 0);
+          const int8x8_t s1_8x8 = vld1_s8((const int8_t *)(aligned_vec + col));
+          const int8x8_t s2_8x8 = vld1_s8((const int8_t *)(row_ptr + col));
+          const int16x8_t prod_16x8 = vmull_s8(s1_8x8, s2_8x8);
+          dotprod = vpadalq_s16(dotprod, prod_16x8);
+          col += (kWeightsPerNeonLane >> 1);
+        }
+        for (; col < m_cols; ++col)
+        {
+          postable_sum += row_ptr[col] * aligned_vec[col];
+        } // for col
+      }
+      // Add the 4 intermediate sum values to get the final dot-prod value for
+      // this row.
+      int64x2_t pairwiseAdded = vpaddlq_s32(dotprod);
+      int32_t neon_sum = vgetq_lane_s64(pairwiseAdded, 0) + vgetq_lane_s64(pairwiseAdded, 1);
+
+      *result += ((neon_sum + postable_sum) * batch_scaling_factor);
+    } // for row
+  }   // for batch
+
+  if (unaligned)
+  {
+    free(aligned_row_free);
+  }
+  free(aligned_vec_free);
+}
+
+void NeonMatrixBatchVectorMultiplyAccumulate(const float *matrix, int m_rows, int m_cols,
+                                             const float *vector, int n_batch, float *result,
+                                             int result_stride)
+{
+  // If m_cols is not divisible by kFloatWeightsPerNeonLane, we cannot use the
+  // main vectorized loop, and we need to process sequentially. postamble_start
+  // shows the start index where this should happen.
+  const int postamble_start = m_cols - (m_cols & (kFloatWeightsPerNeonLane - 1));
+
+  for (int b = 0; b < n_batch; b++)
+  {
+    float *result_in_batch = result + b * m_rows * result_stride;
+    const float *vector_in_batch = vector + b * m_cols;
+    const float *matrix_row = matrix;
+
+    // Main matrix by vector multiplication loop
+    for (int r = 0; r < m_rows; r++)
+    {
+      float32x4_t acc_32x4 = vmovq_n_f32(0.0);
+      for (int c = 0; c < postamble_start; c += kFloatWeightsPerNeonLane)
+      {
+        // Load 4 float values from the vector and the matrix row.
+        float32x4_t vector_f32x4 = vld1q_f32(vector_in_batch + c);
+        float32x4_t matrix_f32x4 = vld1q_f32(matrix_row + c);
+        // Multiply the vector and matrix row and add to accumulator.
+        acc_32x4 = vmlaq_f32(acc_32x4, matrix_f32x4, vector_f32x4);
+      }
+      // Add the 4 intermediate sum values to get the final dot-prod value for
+      // this row.
+      *result_in_batch += (vgetq_lane_f32(acc_32x4, 0) + vgetq_lane_f32(acc_32x4, 1) +
+                           vgetq_lane_f32(acc_32x4, 2) + vgetq_lane_f32(acc_32x4, 3));
+      for (int c = postamble_start; c < m_cols; c++)
+      {
+        *result_in_batch += matrix_row[c] * vector_in_batch[c];
+      }
+      matrix_row += m_cols;
+      result_in_batch += result_stride;
+    }
+  }
+}
+
+} // namespace cker
+} // namespace nnfw
+
+#endif // USE_NEON
+
+#endif // __NNFW_CKER_NEON_TENSOR_UTILS_H__
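
The rounding scheme inside NeonSymmetricQuantizeFloats is easier to follow in scalar form. Below is a minimal scalar sketch of the same round-half-away-from-zero logic; the helper name is my own and is not part of the file above.

#include <cstdint>

// Scalar equivalent of the vector rounding in NeonSymmetricQuantizeFloats.
// vcltq_f32 produces an all-ones lane (-1 when reinterpreted as int32) for
// negative products, so adding (cmp + 0.5) yields x - 0.5 for x < 0 and
// x + 0.5 for x >= 0; the final conversion truncates toward zero.
static inline int32_t RoundHalfAwayFromZero(float x)
{
  const float bias = (x < 0.0f) ? -0.5f : 0.5f; // mirrors vcvtq_f32_s32((int32x4_t)vcltq_f32(x, 0)) + 0.5
  return static_cast<int32_t>(x + bias);        // truncation toward zero completes the rounding
}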
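For context, here is a rough usage sketch of the two quantized kernels together: quantize a float input vector symmetrically, then accumulate matrix times vector into a float output. The include path, the helper name HybridGemvExample, and the way the scaling factor is composed are assumptions for illustration; real callers typically fold a weight scale into scaling_factors as well.

#include "cker/NeonTensorUtils.h" // assumed include path for this header

#include <cstdint>
#include <vector>

#ifdef USE_NEON
void HybridGemvExample(const int8_t *weights /* m_rows x m_cols, row-major */, int m_rows,
                       int m_cols, const float *input /* m_cols */,
                       float *output /* m_rows, zero-initialized */)
{
  std::vector<int8_t> quantized(m_cols);
  float min = 0.0f, max = 0.0f, scaling_factor = 0.0f;
  nnfw::cker::NeonSymmetricQuantizeFloats(input, m_cols, quantized.data(), &min, &max,
                                          &scaling_factor);

  // The kernel accumulates with "+=", so output keeps any prior partial sums.
  nnfw::cker::NeonMatrixBatchVectorMultiplyAccumulate(weights, m_rows, m_cols, quantized.data(),
                                                      &scaling_factor, /*n_batch=*/1, output,
                                                      /*result_stride=*/1);
}
#endif // USE_NEON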