Diffstat (limited to 'compute/cker/include/cker/NeonTensorUtils.h')
-rw-r--r--  compute/cker/include/cker/NeonTensorUtils.h  319
1 file changed, 319 insertions, 0 deletions
diff --git a/compute/cker/include/cker/NeonTensorUtils.h b/compute/cker/include/cker/NeonTensorUtils.h
new file mode 100644
index 000000000..4d97dd187
--- /dev/null
+++ b/compute/cker/include/cker/NeonTensorUtils.h
@@ -0,0 +1,319 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_NEON_TENSOR_UTILS_H__
+#define __NNFW_CKER_NEON_TENSOR_UTILS_H__
+
+#include "cker/neon/neon_check.h"
+
+#include <algorithm>
+#include <cassert>
+#include <cmath>
+#include <cstdint>
+#include <cstdlib>
+#include <cstring>
+
+#ifdef USE_NEON
+
+#define kFloatWeightsPerNeonLane 4
+
+namespace nnfw
+{
+namespace cker
+{
+
+namespace
+{
+
+// Allocates at least size bytes of uninitialized storage whose alignment is
+// specified by alignment. The size parameter must be an integral multiple of
+// alignment.
+// The caller is responsible for freeing the allocated memory by calling free
+// on the pointer returned in freeing_buffer.
+void *aligned_alloc(size_t alignment, size_t size, void **freeing_buffer)
+{
+ *freeing_buffer = malloc(size + alignment);
+ const size_t offset = ((uintptr_t)*freeing_buffer) % alignment; // NOLINT
+ return offset == 0 ? *freeing_buffer : ((char *)*freeing_buffer + (alignment - offset)); // NOLINT
+}
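+
+// Usage sketch (illustrative only; `to_free` and `buffer` are hypothetical
+// local names, not part of this header):
+//
+//   void *to_free = nullptr;
+//   int8_t *buffer = (int8_t *)aligned_alloc(4 /*alignment*/, 64 /*size*/, &to_free);
+//   // ... use `buffer`, which is 4-byte aligned ...
+//   free(to_free); // always free the raw pointer, not the aligned one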
+
+} // namespace
+
+bool NeonIsZeroVector(const float *vector, int v_size)
+{
+ // If v_size is not divisible by kFloatWeightsPerNeonLane, the trailing
+ // elements cannot be handled by the main vectorized loop and are processed
+ // sequentially. postamble_start is the index where that tail begins.
+ const int postamble_start = v_size - (v_size & (kFloatWeightsPerNeonLane - 1));
+
+ const float32x4_t zero_x4_float = vmovq_n_f32(0.0f);
+ for (int v = 0; v < postamble_start; v += kFloatWeightsPerNeonLane)
+ {
+ const float32x4_t i_x4_float = vld1q_f32(vector + v);
+ uint32x4_t cmp_result = vceqq_f32(i_x4_float, zero_x4_float);
+ if (vgetq_lane_u32(cmp_result, 0) == 0)
+ return false;
+ if (vgetq_lane_u32(cmp_result, 1) == 0)
+ return false;
+ if (vgetq_lane_u32(cmp_result, 2) == 0)
+ return false;
+ if (vgetq_lane_u32(cmp_result, 3) == 0)
+ return false;
+ }
+
+ // Postamble loop
+ for (int v = postamble_start; v < v_size; ++v)
+ {
+ if (vector[v] != 0.0)
+ return false;
+ }
+ return true;
+}
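+
+// Usage sketch (illustrative; `gate_weights` is a hypothetical buffer, and
+// <vector> is assumed to be available in the calling code):
+//
+//   std::vector<float> gate_weights(128, 0.0f);
+//   if (NeonIsZeroVector(gate_weights.data(), static_cast<int>(gate_weights.size())))
+//   {
+//     // The whole vector is zero, so the caller can skip the computation
+//     // that would consume it.
+//   }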
+
+void NeonSymmetricQuantizeFloats(const float *values, const int size, int8_t *quantized_values,
+ float *min, float *max, float *scaling_factor)
+{
+ // TODO(raziel): vectorize min/max calculation.
+ auto minmax = std::minmax_element(values, values + size);
+ *min = *minmax.first;
+ *max = *minmax.second;
+ const int kScale = 127;
+ const float range = std::max(std::abs(*min), std::abs(*max));
+ if (range == 0)
+ {
+ memset(quantized_values, 0, size * sizeof(int8_t));
+ *scaling_factor = 1;
+ return;
+ }
+ *scaling_factor = range / kScale;
+ const float scaling_factor_inv = kScale / range;
+
+ const int postamble_start = size - (size & (2 * kFloatWeightsPerNeonLane - 1));
+
+ // Vectorized constants.
+ const float32x4_t q_factor_f32x4 = vmovq_n_f32(scaling_factor_inv);
+ const float32x4_t point5_f32x4 = vmovq_n_f32(0.5);
+ const float32x4_t zero_f32x4 = vmovq_n_f32(0.0);
+ const int32x4_t scale_i32x4 = vmovq_n_s32(kScale);
+ const int32x4_t neg_scale_i32x4 = vmovq_n_s32(-kScale);
+
+ for (int i = 0; i < postamble_start; i += 2 * kFloatWeightsPerNeonLane)
+ {
+ // Implements the vectorized version of the following:
+ //   const int32_t quantized_value =
+ //       static_cast<int32_t>(std::round(scaling_factor_inv * values[i]));
+ // Since the vectorized rounding intrinsic (vrndaq_f32) is not supported on
+ // all Neon flavors, rounding is done as:
+ //   if (x < 0)  (int)(x - 0.5)
+ //   if (x >= 0) (int)(x + 0.5)
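+ // How the mask trick below works (illustrative walkthrough): vcltq_f32
+ // yields an all-ones lane (-1 when reinterpreted as a signed integer) where
+ // the product is negative and 0 otherwise; converting that mask to float and
+ // adding 0.5 gives -0.5 for negative lanes and +0.5 for the rest, so the
+ // truncating vcvtq_s32_f32 below rounds half away from zero.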
+ float32x4_t value0_f32x4 = vld1q_f32(&values[i]);
+ float32x4_t value1_f32x4 = vld1q_f32(&values[i + kFloatWeightsPerNeonLane]);
+ float32x4_t mul0_f32x4 = vmulq_f32(value0_f32x4, q_factor_f32x4);
+ float32x4_t mul1_f32x4 = vmulq_f32(value1_f32x4, q_factor_f32x4);
+
+ int32x4_t cmp_with_zero0_ui32x4 = (int32x4_t)vcltq_f32(mul0_f32x4, zero_f32x4); // NOLINT
+ int32x4_t cmp_with_zero1_ui32x4 = (int32x4_t)vcltq_f32(mul1_f32x4, zero_f32x4); // NOLINT
+
+ float32x4_t cmp_with_zero0_f32x4 = vcvtq_f32_s32(cmp_with_zero0_ui32x4);
+ float32x4_t cmp_with_zero1_f32x4 = vcvtq_f32_s32(cmp_with_zero1_ui32x4);
+ cmp_with_zero0_f32x4 = vaddq_f32(cmp_with_zero0_f32x4, point5_f32x4);
+ cmp_with_zero1_f32x4 = vaddq_f32(cmp_with_zero1_f32x4, point5_f32x4);
+
+ mul0_f32x4 = vaddq_f32(mul0_f32x4, cmp_with_zero0_f32x4);
+ mul1_f32x4 = vaddq_f32(mul1_f32x4, cmp_with_zero1_f32x4);
+
+ int32x4_t f2i0_i32x4 = vcvtq_s32_f32(mul0_f32x4);
+ int32x4_t f2i1_i32x4 = vcvtq_s32_f32(mul1_f32x4);
+
+ // Implements the vectorized version of the following block:
+ // quantized_values[i] = std::min(kScale, std::max(-kScale,
+ // quantized_value));
+ int32x4_t max0_i32x4 = vmaxq_s32(f2i0_i32x4, neg_scale_i32x4);
+ int32x4_t max1_i32x4 = vmaxq_s32(f2i1_i32x4, neg_scale_i32x4);
+ int32x4_t min0_i32x4 = vminq_s32(max0_i32x4, scale_i32x4);
+ int32x4_t min1_i32x4 = vminq_s32(max1_i32x4, scale_i32x4);
+
+ int16x4_t min0_16x4 = vmovn_s32(min0_i32x4);
+ int16x4_t min1_16x4 = vmovn_s32(min1_i32x4);
+
+ int16x8_t min_16x8 = vcombine_s16(min0_16x4, min1_16x4);
+ int8x8_t min_s8x8 = vqmovn_s16(min_16x8);
+ vst1_s8(&quantized_values[i], min_s8x8);
+ }
+
+ for (int i = postamble_start; i < size; ++i)
+ {
+ const int32_t quantized_value =
+ static_cast<int32_t>(std::round(scaling_factor_inv * values[i]));
+ quantized_values[i] = std::min(kScale, std::max(-kScale, quantized_value));
+ }
+}
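+
+// Usage sketch (illustrative; the buffers below are hypothetical and <vector>
+// is assumed to be available in the calling code):
+//
+//   std::vector<float> input = {-1.0f, -0.25f, 0.0f, 0.5f, 1.0f};
+//   std::vector<int8_t> quantized(input.size());
+//   float min_val, max_val, scale;
+//   NeonSymmetricQuantizeFloats(input.data(), static_cast<int>(input.size()),
+//                               quantized.data(), &min_val, &max_val, &scale);
+//   // quantized[i] * scale approximately reconstructs input[i]; here
+//   // scale == 1.0f / 127 and quantized == {-127, -32, 0, 64, 127}.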
+
+void NeonMatrixBatchVectorMultiplyAccumulate(const int8_t *__restrict__ matrix, const int m_rows,
+ const int m_cols, const int8_t *__restrict__ vectors,
+ const float *scaling_factors, int n_batch,
+ float *__restrict__ result, int result_stride)
+{
+ const int kWeightsPerUint32 = 4;
+ const int kWeightsPerNeonLane = 16;
+ // If m_cols is not divisible by kWeightsPerUint32, we set a flag and
+ // allocate an aligned memory block. The flag tells the kernel loop below to
+ // copy each row into this aligned block before processing it.
+ bool unaligned = false;
+ int8_t *aligned_row = nullptr;
+ void *aligned_row_free = nullptr;
+ if ((m_cols & (kWeightsPerUint32 - 1)) != 0)
+ {
+ unaligned = true;
+ aligned_row = (int8_t *)aligned_alloc(kWeightsPerUint32, m_cols, // NOLINT
+ &aligned_row_free);
+ }
+ void *aligned_vec_free = nullptr;
+ int8_t *aligned_vec = (int8_t *)aligned_alloc(kWeightsPerUint32, m_cols, // NOLINT
+ &aligned_vec_free);
+
+ // If m_cols is not divisible by kWeightsPerNeonLane, the trailing columns
+ // cannot be handled by the main vectorized loop and are processed
+ // sequentially. postamble_start is the column index where that tail begins.
+ const int postamble_start = m_cols - (m_cols & (kWeightsPerNeonLane - 1));
+
+ int batch, row, col;
+ for (batch = 0; batch < n_batch; ++batch)
+ {
+ const float batch_scaling_factor = scaling_factors[batch];
+ // Copy the vector data to an aligned vector.
+ memcpy(aligned_vec, vectors + batch * m_cols, sizeof(int8_t) * m_cols);
+ // Compute the dot product of this batch's vector with every matrix row.
+ for (row = 0; row < m_rows; ++row, result += result_stride)
+ {
+ // Get the address of the first element of the row.
+ int8_t *row_ptr = (int8_t *)matrix + row * m_cols; // NOLINT
+ if (unaligned)
+ {
+ memcpy(aligned_row, row_ptr, sizeof(int8_t) * m_cols);
+ row_ptr = aligned_row;
+ }
+
+ // Initialize the dot product sum for the row to 0.
+ int32x4_t dotprod = vmovq_n_s32(0);
+
+ // Prefetch the row to cache.
+ __builtin_prefetch(row_ptr, 0 /* prefetch for read */, 3 /* temporal locality */);
+
+ // For every block of 16 8-bit elements.
+ col = 0;
+ for (; col < postamble_start; col += kWeightsPerNeonLane)
+ {
+ // Load 16 8-bit values from the row and vector, each, to operate on.
+ // Here the assumption is that each buffer is 4-byte aligned.
+ assert(((uintptr_t)(&row_ptr[col]) & (kWeightsPerUint32 - 1)) == 0);
+ const int8x16_t s1_8x16 = vld1q_s8((const int8_t *)(aligned_vec + col));
+ const int8x16_t s2_8x16 = vld1q_s8((const int8_t *)(row_ptr + col));
+ // Multiply the low bits (i.e. the lower 8 8-bit numbers in the
+ // registers).
+ int16x8_t prod_16x8 = vmull_s8(vget_low_s8(s1_8x16), vget_low_s8(s2_8x16));
+ // Multiply the high bits (i.e. the upper 8 8-bit numbers in the
+ // registers), and accumulate with the result of the low-bits product.
+ // Overflow cannot happen because the values are quantized to the range
+ // [-127, 127], so each 16-bit lane of the sum is at most
+ // 2 * 127 * 127 = 32258, which fits in int16 (32767 in absolute value).
+ prod_16x8 = vmlal_s8(prod_16x8, vget_high_s8(s1_8x16), vget_high_s8(s2_8x16));
+
+ dotprod = vpadalq_s16(dotprod, prod_16x8);
+ } // for col
+
+ int32_t postamble_sum = 0;
+ // Postamble loop.
+ // TODO(raziel): if (ABSL_PREDICT_FALSE(postamble_start < m_rows))
+ if (postamble_start < m_cols)
+ {
+ col = postamble_start;
+ if ((m_cols - postamble_start) >= (kWeightsPerNeonLane >> 1))
+ {
+ // Load 8 8-bit values from the row and the vector to operate on.
+ // Here the assumption is that each buffer is 4-byte aligned.
+ assert(((uintptr_t)(&row_ptr[col]) & (kWeightsPerUint32 - 1)) == 0);
+ const int8x8_t s1_8x8 = vld1_s8((const int8_t *)(aligned_vec + col));
+ const int8x8_t s2_8x8 = vld1_s8((const int8_t *)(row_ptr + col));
+ const int16x8_t prod_16x8 = vmull_s8(s1_8x8, s2_8x8);
+ dotprod = vpadalq_s16(dotprod, prod_16x8);
+ col += (kWeightsPerNeonLane >> 1);
+ }
+ for (; col < m_cols; ++col)
+ {
+ postamble_sum += row_ptr[col] * aligned_vec[col];
+ } // for col
+ }
+ // Add the 4 intermediate sum values to get the final dot-prod value for
+ // this row.
+ int64x2_t pairwiseAdded = vpaddlq_s32(dotprod);
+ int32_t neon_sum = vgetq_lane_s64(pairwiseAdded, 0) + vgetq_lane_s64(pairwiseAdded, 1);
+
+ *result += ((neon_sum + postamble_sum) * batch_scaling_factor);
+ } // for row
+ } // for batch
+
+ if (unaligned)
+ {
+ free(aligned_row_free);
+ }
+ free(aligned_vec_free);
+}
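+
+// Usage sketch for the int8 path (illustrative; all buffers below are
+// hypothetical):
+//
+//   const int rows = 8, cols = 32, batches = 2;
+//   std::vector<int8_t> matrix(rows * cols, 1);
+//   std::vector<int8_t> vectors(batches * cols, 2);
+//   std::vector<float> scales(batches, 0.5f);
+//   std::vector<float> result(batches * rows, 0.0f);
+//   NeonMatrixBatchVectorMultiplyAccumulate(matrix.data(), rows, cols,
+//                                           vectors.data(), scales.data(),
+//                                           batches, result.data(),
+//                                           /*result_stride=*/1);
+//   // Every entry of result accumulates 32 * 1 * 2 * 0.5f = 32.0f.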
+
+void NeonMatrixBatchVectorMultiplyAccumulate(const float *matrix, int m_rows, int m_cols,
+ const float *vector, int n_batch, float *result,
+ int result_stride)
+{
+ // If m_cols is not divisible by kFloatWeightsPerNeonLane, the trailing
+ // columns cannot be handled by the main vectorized loop and are processed
+ // sequentially. postamble_start is the column index where that tail begins.
+ const int postamble_start = m_cols - (m_cols & (kFloatWeightsPerNeonLane - 1));
+
+ for (int b = 0; b < n_batch; b++)
+ {
+ float *result_in_batch = result + b * m_rows * result_stride;
+ const float *vector_in_batch = vector + b * m_cols;
+ const float *matrix_row = matrix;
+
+ // Main matrix by vector multiplication loop
+ for (int r = 0; r < m_rows; r++)
+ {
+ float32x4_t acc_32x4 = vmovq_n_f32(0.0);
+ for (int c = 0; c < postamble_start; c += kFloatWeightsPerNeonLane)
+ {
+ // Load 4 float values from vector and matrix row.
+ float32x4_t vector_f32x4 = vld1q_f32(vector_in_batch + c);
+ float32x4_t matrix_f32x4 = vld1q_f32(matrix_row + c);
+ // Multiply the vector and matrix row and add to accumulator.
+ acc_32x4 = vmlaq_f32(acc_32x4, matrix_f32x4, vector_f32x4);
+ }
+ // Add the 4 intermediate sum values to get the final dot-prod value for
+ // this row.
+ *result_in_batch += (vgetq_lane_f32(acc_32x4, 0) + vgetq_lane_f32(acc_32x4, 1) +
+ vgetq_lane_f32(acc_32x4, 2) + vgetq_lane_f32(acc_32x4, 3));
+ for (int c = postamble_start; c < m_cols; c++)
+ {
+ *result_in_batch += matrix_row[c] * vector_in_batch[c];
+ }
+ matrix_row += m_cols;
+ result_in_batch += result_stride;
+ }
+ }
+}
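+
+// Usage sketch for the float path (illustrative; the buffers below are
+// hypothetical):
+//
+//   const int rows = 4, cols = 16, batches = 1;
+//   std::vector<float> matrix(rows * cols, 0.25f);
+//   std::vector<float> vector(batches * cols, 2.0f);
+//   std::vector<float> result(batches * rows, 0.0f);
+//   NeonMatrixBatchVectorMultiplyAccumulate(matrix.data(), rows, cols,
+//                                           vector.data(), batches,
+//                                           result.data(), /*result_stride=*/1);
+//   // Each of the 4 results accumulates 16 * 0.25f * 2.0f = 8.0f.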
+
+} // namespace cker
+} // namespace nnfw
+
+#endif // USE_NEON
+
+#endif // __NNFW_CKER_NEON_TENSOR_UTILS_H__