Diffstat (limited to 'compute/cker')
-rw-r--r--  compute/cker/CMakeLists.txt | 24
-rw-r--r--  compute/cker/include/cker/CpuBackendThreadpool.h | 51
-rw-r--r--  compute/cker/include/cker/NeonTensorUtils.h | 620
-rw-r--r--  compute/cker/include/cker/PortableTensorUtils.h | 165
-rw-r--r--  compute/cker/include/cker/Shape.h | 18
-rw-r--r--  compute/cker/include/cker/TensorUtils.h | 112
-rw-r--r--  compute/cker/include/cker/Types.h | 99
-rw-r--r--  compute/cker/include/cker/Utils.h | 47
-rw-r--r--  compute/cker/include/cker/eigen/EigenSupport.h | 8
-rw-r--r--  compute/cker/include/cker/eigen/Utils.h | 14
-rw-r--r--  compute/cker/include/cker/eigen/eigen_convolution_helpers.h | 23
-rw-r--r--  compute/cker/include/cker/eigen/eigen_gemm_eigen.h | 95
-rw-r--r--  compute/cker/include/cker/eigen/eigen_spatial_convolutions-inl.h | 346
-rw-r--r--  compute/cker/include/cker/operation/AddN.h | 46
-rw-r--r--  compute/cker/include/cker/operation/AveragePool.h | 139
-rw-r--r--  compute/cker/include/cker/operation/BatchToSpaceND.h | 4
-rw-r--r--  compute/cker/include/cker/operation/BinaryArithmeticOps.h | 70
-rw-r--r--  compute/cker/include/cker/operation/BroadcastTo.h | 2
-rw-r--r--  compute/cker/include/cker/operation/Common.h | 4
-rw-r--r--  compute/cker/include/cker/operation/Comparison.h | 114
-rw-r--r--  compute/cker/include/cker/operation/Concatenation.h | 2
-rw-r--r--  compute/cker/include/cker/operation/Conv.h | 69
-rw-r--r--  compute/cker/include/cker/operation/DepthToSpace.h | 71
-rw-r--r--  compute/cker/include/cker/operation/DepthwiseConv.h | 253
-rw-r--r--  compute/cker/include/cker/operation/Dequantize.h | 151
-rw-r--r--  compute/cker/include/cker/operation/ELU.h | 44
-rw-r--r--  compute/cker/include/cker/operation/Einsum.h | 30
-rw-r--r--  compute/cker/include/cker/operation/Elementwise.h | 38
-rw-r--r--  compute/cker/include/cker/operation/Fill.h | 22
-rw-r--r--  compute/cker/include/cker/operation/FloorDiv.h | 82
-rw-r--r--  compute/cker/include/cker/operation/FullyConnected.h | 63
-rw-r--r--  compute/cker/include/cker/operation/FullyConnectedDense16x1.h | 134
-rw-r--r--  compute/cker/include/cker/operation/FullyConnectedSparse16x1.h | 159
-rw-r--r--  compute/cker/include/cker/operation/FusedBatchNorm.h | 10
-rw-r--r--  compute/cker/include/cker/operation/Helper/BCast.h | 4
-rw-r--r--  compute/cker/include/cker/operation/Helper/MatmulBCast.h | 6
-rw-r--r--  compute/cker/include/cker/operation/Helper/RandomDistributions.h | 22
-rw-r--r--  compute/cker/include/cker/operation/Helper/RandomOp.h | 4
-rw-r--r--  compute/cker/include/cker/operation/Helper/RandomOpCpu.h | 6
-rw-r--r--  compute/cker/include/cker/operation/Helper/Tensor.h | 30
-rw-r--r--  compute/cker/include/cker/operation/InstanceNorm.h | 4
-rw-r--r--  compute/cker/include/cker/operation/L2Normalize.h | 2
-rw-r--r--  compute/cker/include/cker/operation/LSTM.h | 371
-rw-r--r--  compute/cker/include/cker/operation/LeakyReLU.h | 47
-rw-r--r--  compute/cker/include/cker/operation/LogSoftMax.h | 6
-rw-r--r--  compute/cker/include/cker/operation/LogicalAnd.h | 80
-rw-r--r--  compute/cker/include/cker/operation/Logistic.h | 29
-rw-r--r--  compute/cker/include/cker/operation/MatrixBandPart.h | 11
-rw-r--r--  compute/cker/include/cker/operation/MaxPool.h | 12
-rw-r--r--  compute/cker/include/cker/operation/OneHot.h | 2
-rw-r--r--  compute/cker/include/cker/operation/Quantize.h | 409
-rw-r--r--  compute/cker/include/cker/operation/Range.h | 4
-rw-r--r--  compute/cker/include/cker/operation/Reduce.h | 124
-rw-r--r--  compute/cker/include/cker/operation/ReduceMean.h | 41
-rw-r--r--  compute/cker/include/cker/operation/ResizeBilinear.h | 107
-rw-r--r--  compute/cker/include/cker/operation/Round.h | 21
-rw-r--r--  compute/cker/include/cker/operation/Select.h | 4
-rw-r--r--  compute/cker/include/cker/operation/Slice.h | 12
-rw-r--r--  compute/cker/include/cker/operation/SoftMax.h | 392
-rw-r--r--  compute/cker/include/cker/operation/SpaceToBatchND.h | 6
-rw-r--r--  compute/cker/include/cker/operation/StatelessRandomUniform.h | 6
-rw-r--r--  compute/cker/include/cker/operation/StridedSlice.h | 35
-rw-r--r--  compute/cker/include/cker/operation/Tile.h | 2
-rw-r--r--  compute/cker/include/cker/operation/Transpose.h | 6
-rw-r--r--  compute/cker/include/cker/operation/TransposeConv.h | 8
-rw-r--r--  compute/cker/include/cker/operation/optimized/BinaryArithmeticOps.h | 1064
-rw-r--r--  compute/cker/include/cker/operation/optimized/Conv.h | 28
-rw-r--r--  compute/cker/include/cker/operation/optimized/DepthwiseConvFloat.h | 1250
-rw-r--r--  compute/cker/include/cker/operation/optimized/DepthwiseConvUint8.h | 304
-rw-r--r--  compute/cker/include/cker/operation/optimized/Gemm.h | 100
-rw-r--r--  compute/cker/include/cker/operation/optimized/OptimizedUtils.h | 4
-rw-r--r--  compute/cker/include/cker/operation/optimized/integer_ops/DepthwiseConvInt8.h | 2138
-rw-r--r--  compute/cker/include/cker/operation/reference/BatchMatMul.h | 5
-rw-r--r--  compute/cker/include/cker/operation/reference/BinaryArithmeticOps.h | 47
-rw-r--r--  compute/cker/include/cker/operation/reference/Conv.h | 212
-rw-r--r--  compute/cker/include/cker/operation/reference/integer_ops/DepthwiseConvHybrid.h | 122
-rw-r--r--  compute/cker/include/cker/operation/reference/integer_ops/DepthwiseConvUInt8.h | 152
-rw-r--r--  compute/cker/include/cker/ruy/RuySupport.h | 82
-rw-r--r--  compute/cker/include/cker/train/operation/FullyConnected.h | 49
-rw-r--r--  compute/cker/include/cker/train/operation/Loss.h | 77
-rw-r--r--  compute/cker/include/cker/train/operation/ReLU.h | 50
-rw-r--r--  compute/cker/src/Range.test.cc | 70
-rw-r--r--  compute/cker/src/train/FullyConnected.test.cc | 83
-rw-r--r--  compute/cker/src/train/Loss.test.cc | 201
-rw-r--r--  compute/cker/src/train/Relu.test.cc | 107
85 files changed, 9678 insertions, 1409 deletions
diff --git a/compute/cker/CMakeLists.txt b/compute/cker/CMakeLists.txt
index 609dd45a3..d464dccae 100644
--- a/compute/cker/CMakeLists.txt
+++ b/compute/cker/CMakeLists.txt
@@ -8,15 +8,33 @@ target_link_libraries(nnfw_lib_cker INTERFACE gemmlowp)
target_link_libraries(nnfw_lib_cker INTERFACE ruy)
target_link_libraries(nnfw_lib_cker INTERFACE ruy_instrumentation)
target_compile_definitions(nnfw_lib_cker INTERFACE USE_RUY_GEMV)
-if(EXPERIMENTAL_RUY_FEATURE)
- target_compile_definitions(nnfw_lib_cker INTERFACE EXPERIMENTAL_RUY_FEATURE)
-endif(EXPERIMENTAL_RUY_FEATURE)
if(PROFILE_RUY)
target_link_libraries(nnfw_lib_cker INTERFACE ruy_profiler)
endif(PROFILE_RUY)
+if (CMAKE_SYSTEM_PROCESSOR MATCHES "(x86)|(X86)|(amd64)|(AMD64)")
+ target_compile_definitions(nnfw_lib_cker INTERFACE CKER_X86_PLATFORM)
+endif(CMAKE_SYSTEM_PROCESSOR MATCHES "(x86)|(X86)|(amd64)|(AMD64)")
+
target_include_directories(nnfw_lib_cker INTERFACE ${CMAKE_CURRENT_SOURCE_DIR}/include)
# Workaround to avoid warning
# TODO Resolve warning
target_compile_options(nnfw_lib_cker INTERFACE -Wno-attributes)
+
+if(NOT ENABLE_TEST)
+ return()
+endif(NOT ENABLE_TEST)
+
+set(TEST_CKER test_cker)
+
+file(GLOB_RECURSE TESTS "src/*.test.cc")
+
+add_executable(${TEST_CKER} ${TESTS})
+
+target_link_libraries(${TEST_CKER} nnfw_lib_cker)
+target_link_libraries(${TEST_CKER} nnfw_coverage)
+target_link_libraries(${TEST_CKER} gtest gtest_main ${LIB_PTHREAD})
+
+add_test(${TEST_CKER} ${TEST_CKER})
+install(TARGETS ${TEST_CKER} DESTINATION unittest)
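
Note: the ENABLE_TEST block above globs every src/*.test.cc into one gtest binary. A minimal sketch of the kind of test translation unit this glob picks up; the file name, test body, and the use of Shape's initializer-list constructor are assumptions for illustration, not code from this change:

// src/Example.test.cc (hypothetical) -- needs only gtest plus the header-only cker lib.
#include "cker/Shape.h"

#include <gtest/gtest.h>

TEST(CkerShape, FlatSize)
{
  // 2 * 3 * 4 = 24 elements in total (assumes the initializer-list constructor).
  const nnfw::cker::Shape shape{2, 3, 4};
  EXPECT_EQ(shape.FlatSize(), 24);
}
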
diff --git a/compute/cker/include/cker/CpuBackendThreadpool.h b/compute/cker/include/cker/CpuBackendThreadpool.h
new file mode 100644
index 000000000..8ec6140bd
--- /dev/null
+++ b/compute/cker/include/cker/CpuBackendThreadpool.h
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_CPU_BACKEND_THREADPOOL_H_
+#define __NNFW_CKER_CPU_BACKEND_THREADPOOL_H_
+
+#include <ruy/context.h> // from @ruy
+#include <ruy/thread_pool.h> // from @ruy
+
+#include <stdexcept>
+
+namespace nnfw
+{
+namespace cker
+{
+namespace cpu_backend_threadpool
+{
+
+using Task = ruy::Task;
+
+template <typename TaskType>
+void Execute(int tasks_count, TaskType *tasks, ruy::Context *ruy_context)
+{
+ assert(ruy_context != nullptr);
+ assert(tasks_count <= ruy_context->max_num_threads());
+ if (ruy_context == nullptr)
+ {
+ throw std::runtime_error("CpuBackendThreadpool.h: ruy::Context is null");
+ }
+ ruy_context->mutable_thread_pool()->Execute(tasks_count, tasks);
+}
+
+} // namespace cpu_backend_threadpool
+} // namespace cker
+} // namespace nnfw
+
+#endif // __NNFW_CKER_CPU_BACKEND_THREADPOOL_H_
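
A rough usage sketch for the Execute() wrapper above: the caller defines a task type deriving from ruy::Task (aliased as Task in this header) and overrides Run(). The WorkerTask name and its body are made up for illustration; only Execute() and the Task alias come from the new header:

#include "cker/CpuBackendThreadpool.h"

#include <vector>

struct WorkerTask : nnfw::cker::cpu_backend_threadpool::Task
{
  WorkerTask(float *data, int begin, int end) : _data(data), _begin(begin), _end(end) {}
  void Run() override
  {
    for (int i = _begin; i < _end; ++i)
      _data[i] *= 2.0f; // per-slice work
  }
  float *_data;
  int _begin;
  int _end;
};

void RunInParallel(float *data, int size, ruy::Context *ctx)
{
  // tasks_count must not exceed ctx->max_num_threads(), as asserted in Execute().
  std::vector<WorkerTask> tasks;
  tasks.emplace_back(data, 0, size / 2);
  tasks.emplace_back(data, size / 2, size);
  nnfw::cker::cpu_backend_threadpool::Execute(static_cast<int>(tasks.size()), tasks.data(), ctx);
}
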
diff --git a/compute/cker/include/cker/NeonTensorUtils.h b/compute/cker/include/cker/NeonTensorUtils.h
index 246fd9a46..45ad969c3 100644
--- a/compute/cker/include/cker/NeonTensorUtils.h
+++ b/compute/cker/include/cker/NeonTensorUtils.h
@@ -20,11 +20,13 @@
#include <ruy/path.h>
#include <ruy/ruy.h>
-#include <ruy/detect_arm.h>
#include "cker/Types.h"
#include "cker/neon/neon_check.h"
#include "cker/ruy/RuySupport.h"
#include "util/logging.h"
+#if defined __linux__ && defined __aarch64__
+#include <sys/auxv.h>
+#endif
#include <cassert>
#include <cmath>
@@ -41,6 +43,8 @@ namespace cker
namespace
{
+constexpr int kFloatValuesPerNeonVector = 4;
+
// TODO(ahentz): Clean up.
using int8 = std::int8_t;
using uint8 = std::uint8_t;
@@ -49,6 +53,11 @@ using uint16 = std::uint16_t;
using int32 = std::int32_t;
using uint32 = std::uint32_t;
+template <int PerNeonSize> inline int RoundDownVectors(int size)
+{
+ return size & ~(PerNeonSize - 1);
+}
+
// Allocates, at least, size bytes of uninitialized storage whose alignment is
// specified by alignment. The size parameter must be an integral multiple of
// alignment.
@@ -73,14 +82,37 @@ inline int32_t AccumulateNeonLane(const int32x4_t lane)
} // namespace
-#ifdef __aarch64__
+// The implementation of dotprod detection is copied from ruy's internal
+// function DetectDotprod().
+// At the moment it's only implemented on Linux ARM64. Consider syncing again
+// with ruy in the future to share improvements.
+#if defined __linux__ && defined __aarch64__
+inline bool DetectDotprodByLinuxAuxvMethod()
+{
+ // This is the value of HWCAP_ASIMDDP in sufficiently recent Linux headers,
+ // however we need to support building against older headers for the time
+ // being.
+ const int kLocalHwcapAsimddp = 1 << 20;
+ return getauxval(AT_HWCAP) & kLocalHwcapAsimddp;
+}
+#endif
+
+inline bool DetectArmNeonDotprod()
+{
+#if defined __linux__ && defined __aarch64__
+ return DetectDotprodByLinuxAuxvMethod();
+#endif
-bool HasSdotInstruction()
+ return false;
+}
+
+inline bool HasSdotInstruction()
{
- static const bool has_dotprod = ruy::DetectDotprod();
+ static const bool has_dotprod = DetectArmNeonDotprod();
return has_dotprod;
}
+#ifdef __aarch64__
// We interleave vector data to make the dot product logic more efficient.
// Suppose that vectors is:
// a0 a1 a2 a3 a4 a5 ...
@@ -93,13 +125,13 @@ bool HasSdotInstruction()
// e0 e1 e2 e3 f0 f1 f2 f3 ...
// Once the data is interleaved, each 16-byte read from the vectors pointer
// contains 4 bytes from each of 4 vectors.
-const int8_t *ShuffleVectors(const int8_t *vectors, const int n_batch, const int m_cols,
- void **shuffled_vectors_free)
+inline const int8_t *ShuffleVectors(const int8_t *vectors, const int n_batch, const int m_cols,
+ void **shuffled_vectors_free)
{
const int kWeightsPerUint32 = 4;
int8 *shuffled_vectors = reinterpret_cast<int8 *>(
- aligned_alloc(kWeightsPerUint32, n_batch * m_cols, shuffled_vectors_free));
+ aligned_alloc(kWeightsPerUint32, n_batch * m_cols, shuffled_vectors_free));
for (int i = 0; i < n_batch; i += 4)
{
@@ -113,25 +145,25 @@ const int8_t *ShuffleVectors(const int8_t *vectors, const int n_batch, const int
while (unshuffled_vec0_ptr != end_vec0_ptr)
{
asm volatile(
- // This code path requires that (n_cols % 16) == 0 so we can safely
- // read in 16-byte chunks from each row.
- "ld1 {v0.16b}, [%[unshuffled_vec0_ptr]], #16\n"
- "ld1 {v1.16b}, [%[unshuffled_vec1_ptr]], #16\n"
- "ld1 {v2.16b}, [%[unshuffled_vec2_ptr]], #16\n"
- "ld1 {v3.16b}, [%[unshuffled_vec3_ptr]], #16\n"
-
- "st4 {v0.s, v1.s, v2.s, v3.s}[0], [%[shuffled_vectors_ptr]], #16\n"
- "st4 {v0.s, v1.s, v2.s, v3.s}[1], [%[shuffled_vectors_ptr]], #16\n"
- "st4 {v0.s, v1.s, v2.s, v3.s}[2], [%[shuffled_vectors_ptr]], #16\n"
- "st4 {v0.s, v1.s, v2.s, v3.s}[3], [%[shuffled_vectors_ptr]], #16\n"
-
- : [unshuffled_vec0_ptr] "+r"(unshuffled_vec0_ptr),
- [unshuffled_vec1_ptr] "+r"(unshuffled_vec1_ptr),
- [unshuffled_vec2_ptr] "+r"(unshuffled_vec2_ptr),
- [unshuffled_vec3_ptr] "+r"(unshuffled_vec3_ptr),
- [shuffled_vectors_ptr] "+r"(shuffled_vectors_ptr)
- :
- : "v0", "v1", "v2", "v3", "cc", "memory");
+ // This code path requires that (n_cols % 16) == 0 so we can safely
+ // read in 16-byte chunks from each row.
+ "ld1 {v0.16b}, [%[unshuffled_vec0_ptr]], #16\n"
+ "ld1 {v1.16b}, [%[unshuffled_vec1_ptr]], #16\n"
+ "ld1 {v2.16b}, [%[unshuffled_vec2_ptr]], #16\n"
+ "ld1 {v3.16b}, [%[unshuffled_vec3_ptr]], #16\n"
+
+ "st4 {v0.s, v1.s, v2.s, v3.s}[0], [%[shuffled_vectors_ptr]], #16\n"
+ "st4 {v0.s, v1.s, v2.s, v3.s}[1], [%[shuffled_vectors_ptr]], #16\n"
+ "st4 {v0.s, v1.s, v2.s, v3.s}[2], [%[shuffled_vectors_ptr]], #16\n"
+ "st4 {v0.s, v1.s, v2.s, v3.s}[3], [%[shuffled_vectors_ptr]], #16\n"
+
+ : [ unshuffled_vec0_ptr ] "+r"(unshuffled_vec0_ptr),
+ [ unshuffled_vec1_ptr ] "+r"(unshuffled_vec1_ptr),
+ [ unshuffled_vec2_ptr ] "+r"(unshuffled_vec2_ptr),
+ [ unshuffled_vec3_ptr ] "+r"(unshuffled_vec3_ptr),
+ [ shuffled_vectors_ptr ] "+r"(shuffled_vectors_ptr)
+ :
+ : "v0", "v1", "v2", "v3", "cc", "memory");
}
}
@@ -172,104 +204,104 @@ static void DotprodMatrixBatchFourVectorMultiplyAccumulate(const int8_t *__restr
const int8 *mat_ptr3 = matrix + ((row + 3) * m_cols);
asm volatile(
- // Zero out the accumulator registers.
- "dup v0.4s, wzr\n"
- "dup v1.4s, wzr\n"
- "dup v2.4s, wzr\n"
- "dup v3.4s, wzr\n"
-
- "1:\n" // batch_cols_loop
-
- // Read 16 more bytes from a pair of matrix rows.
- "ld1 {v12.16b}, [%[mat_ptr0]], #16\n"
-
- // Prefetch two rows ahead.
- "prfm pldl1strm, [%[mat_ptr2]]\n"
- "prfm pldl1strm, [%[mat_ptr3]]\n"
-
- // Read from input vectors 4 times; 64 bytes total.
- // Each 16-byte register contains parts of 4 vectors; see the
- // shuffle logic above.
-
- // From Benoit, places to look in the future:
- // - Move load instructions further from sdot
- // - Switch loop use-then-reload
- // - Do partial unrolling to use register space better
- "ld1 {v8.16b}, [%[vec_ptr]], #16\n"
- ".word 0x4f8ce100 // sdot v0.4s, v8.16b, v12.4b[0]\n"
- "ld1 {v9.16b}, [%[vec_ptr]], #16\n"
- ".word 0x4face121 // sdot v1.4s, v9.16b, v12.4b[1]\n"
- "ld1 {v10.16b}, [%[vec_ptr]], #16\n"
- ".word 0x4f8ce940 // sdot v0.4s, v10.16b, v12.4b[2]\n"
- "ld1 {v11.16b}, [%[vec_ptr]], #16\n"
- ".word 0x4face961 // sdot v1.4s, v11.16b, v12.4b[3]\n"
-
- // Update prefetch pointers.
- "add %[mat_ptr2], %[mat_ptr2], #16\n"
- "add %[mat_ptr3], %[mat_ptr3], #16\n"
-
- // Re-use those vectors for the next row as well.
- "ld1 {v13.16b}, [%[mat_ptr1]], #16\n"
- ".word 0x4f8de102 // sdot v2.4s, v8.16b, v13.4b[0]\n"
- ".word 0x4fade123 // sdot v3.4s, v9.16b, v13.4b[1]\n"
- ".word 0x4f8de942 // sdot v2.4s, v10.16b, v13.4b[2]\n"
- ".word 0x4fade963 // sdot v3.4s, v11.16b, v13.4b[3]\n"
-
- // If we're not done with these rows, continue.
- "cmp %[mat_ptr0], %[mat_ptr0_end]\n"
- "bne 1b\n" // batch_cols_loop
-
- // Done with the rows, sum the results.
- "add v0.4s, v0.4s, v1.4s\n"
- "add v2.4s, v2.4s, v3.4s\n"
-
- // Convert the per-vector sums to floating point.
- "scvtf v0.4s, v0.4s\n"
- "scvtf v1.4s, v2.4s\n"
-
- // Fetch scale factors.
- "ld1 {v4.4s}, [%[scaling_factors_ptr]]\n"
-
- // Multiply scale factors times sums.
- "fmul v0.4s, v4.4s, v0.4s\n"
- "fmul v1.4s, v4.4s, v1.4s\n"
-
- // Load previous result values.
- // The result position is:
- // result[batch * m_rows + row]
- // Here that is factored into:
- // result_ptr = result + row
- // *result_ptr = res[0]
- // (uint8*)result_ptr += (m_rows * sizeof(float))
- // *result_ptr = res[1]
- // ...
- // Since we're reading two rows at a time, though, we read both
- // result[batch * m_rows + row]
- // and
- // result[batch * m_rows + row + 1]
- "ld2 {v9.s, v10.s}[0], [%[result_ptr]], %[wide_rows]\n"
- "ld2 {v9.s, v10.s}[1], [%[result_ptr]], %[wide_rows]\n"
- "ld2 {v9.s, v10.s}[2], [%[result_ptr]], %[wide_rows]\n"
- "ld2 {v9.s, v10.s}[3], [%[result_ptr]], %[wide_rows]\n"
-
- // Go back to the starting position (subtract wide_rows * 4).
- "sub %[result_ptr], %[result_ptr], %[wide_rows], lsl #2\n"
-
- // Add previous result values.
- "fadd v9.4s, v9.4s, v0.4s\n"
- "fadd v10.4s, v10.4s, v1.4s\n"
-
- // Store results.
- "st2 {v9.s, v10.s}[0], [%[result_ptr]], %[wide_rows]\n"
- "st2 {v9.s, v10.s}[1], [%[result_ptr]], %[wide_rows]\n"
- "st2 {v9.s, v10.s}[2], [%[result_ptr]], %[wide_rows]\n"
- "st2 {v9.s, v10.s}[3], [%[result_ptr]], %[wide_rows]\n"
- : [mat_ptr0] "+r"(mat_ptr0), [mat_ptr1] "+r"(mat_ptr1), [vec_ptr] "+r"(vec_ptr),
- [result_ptr] "+r"(result_ptr), [mat_ptr2] "+r"(mat_ptr2), [mat_ptr3] "+r"(mat_ptr3)
- : [mat_ptr0_end] "r"(mat_ptr0_end), [scaling_factors_ptr] "r"(scaling_factors_ptr),
- [wide_rows] "r"(wide_rows)
- : "x0", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12",
- "v13", "cc", "memory");
+ // Zero out the accumulator registers.
+ "dup v0.4s, wzr\n"
+ "dup v1.4s, wzr\n"
+ "dup v2.4s, wzr\n"
+ "dup v3.4s, wzr\n"
+
+ "1:\n" // batch_cols_loop
+
+ // Read 16 more bytes from a pair of matrix rows.
+ "ld1 {v12.16b}, [%[mat_ptr0]], #16\n"
+
+ // Prefetch two rows ahead.
+ "prfm pldl1strm, [%[mat_ptr2]]\n"
+ "prfm pldl1strm, [%[mat_ptr3]]\n"
+
+ // Read from input vectors 4 times; 64 bytes total.
+ // Each 16-byte register contains parts of 4 vectors; see the
+ // shuffle logic above.
+
+ // From Benoit, places to look in the future:
+ // - Move load instructions further from sdot
+ // - Switch loop use-then-reload
+ // - Do partial unrolling to use register space better
+ "ld1 {v8.16b}, [%[vec_ptr]], #16\n"
+ ".word 0x4f8ce100 // sdot v0.4s, v8.16b, v12.4b[0]\n"
+ "ld1 {v9.16b}, [%[vec_ptr]], #16\n"
+ ".word 0x4face121 // sdot v1.4s, v9.16b, v12.4b[1]\n"
+ "ld1 {v10.16b}, [%[vec_ptr]], #16\n"
+ ".word 0x4f8ce940 // sdot v0.4s, v10.16b, v12.4b[2]\n"
+ "ld1 {v11.16b}, [%[vec_ptr]], #16\n"
+ ".word 0x4face961 // sdot v1.4s, v11.16b, v12.4b[3]\n"
+
+ // Update prefetch pointers.
+ "add %[mat_ptr2], %[mat_ptr2], #16\n"
+ "add %[mat_ptr3], %[mat_ptr3], #16\n"
+
+ // Re-use those vectors for the next row as well.
+ "ld1 {v13.16b}, [%[mat_ptr1]], #16\n"
+ ".word 0x4f8de102 // sdot v2.4s, v8.16b, v13.4b[0]\n"
+ ".word 0x4fade123 // sdot v3.4s, v9.16b, v13.4b[1]\n"
+ ".word 0x4f8de942 // sdot v2.4s, v10.16b, v13.4b[2]\n"
+ ".word 0x4fade963 // sdot v3.4s, v11.16b, v13.4b[3]\n"
+
+ // If we're not done with these rows, continue.
+ "cmp %[mat_ptr0], %[mat_ptr0_end]\n"
+ "bne 1b\n" // batch_cols_loop
+
+ // Done with the rows, sum the results.
+ "add v0.4s, v0.4s, v1.4s\n"
+ "add v2.4s, v2.4s, v3.4s\n"
+
+ // Convert the per-vector sums to floating point.
+ "scvtf v0.4s, v0.4s\n"
+ "scvtf v1.4s, v2.4s\n"
+
+ // Fetch scale factors.
+ "ld1 {v4.4s}, [%[scaling_factors_ptr]]\n"
+
+ // Multiply scale factors times sums.
+ "fmul v0.4s, v4.4s, v0.4s\n"
+ "fmul v1.4s, v4.4s, v1.4s\n"
+
+ // Load previous result values.
+ // The result position is:
+ // result[batch * m_rows + row]
+ // Here that is factored into:
+ // result_ptr = result + row
+ // *result_ptr = res[0]
+ // (uint8*)result_ptr += (m_rows * sizeof(float))
+ // *result_ptr = res[1]
+ // ...
+ // Since we're reading two rows at a time, though, we read both
+ // result[batch * m_rows + row]
+ // and
+ // result[batch * m_rows + row + 1]
+ "ld2 {v9.s, v10.s}[0], [%[result_ptr]], %[wide_rows]\n"
+ "ld2 {v9.s, v10.s}[1], [%[result_ptr]], %[wide_rows]\n"
+ "ld2 {v9.s, v10.s}[2], [%[result_ptr]], %[wide_rows]\n"
+ "ld2 {v9.s, v10.s}[3], [%[result_ptr]], %[wide_rows]\n"
+
+ // Go back to the starting position (subtract wide_rows * 4).
+ "sub %[result_ptr], %[result_ptr], %[wide_rows], lsl #2\n"
+
+ // Add previous result values.
+ "fadd v9.4s, v9.4s, v0.4s\n"
+ "fadd v10.4s, v10.4s, v1.4s\n"
+
+ // Store results.
+ "st2 {v9.s, v10.s}[0], [%[result_ptr]], %[wide_rows]\n"
+ "st2 {v9.s, v10.s}[1], [%[result_ptr]], %[wide_rows]\n"
+ "st2 {v9.s, v10.s}[2], [%[result_ptr]], %[wide_rows]\n"
+ "st2 {v9.s, v10.s}[3], [%[result_ptr]], %[wide_rows]\n"
+ : [ mat_ptr0 ] "+r"(mat_ptr0), [ mat_ptr1 ] "+r"(mat_ptr1), [ vec_ptr ] "+r"(vec_ptr),
+ [ result_ptr ] "+r"(result_ptr), [ mat_ptr2 ] "+r"(mat_ptr2), [ mat_ptr3 ] "+r"(mat_ptr3)
+ : [ mat_ptr0_end ] "r"(mat_ptr0_end), [ scaling_factors_ptr ] "r"(scaling_factors_ptr),
+ [ wide_rows ] "r"(wide_rows)
+ : "x0", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12",
+ "v13", "cc", "memory");
}
}
@@ -277,9 +309,9 @@ static void DotprodMatrixBatchFourVectorMultiplyAccumulate(const int8_t *__restr
}
static void DotprodMatrixBatchFourVectorMultiplyAccumulate(
- const int8_t *__restrict__ matrix, const int m_rows, const int m_cols, const int8_t *vectors,
- const float *scaling_factors, int n_batch, float *__restrict__ result,
- const float *per_channel_scale, const int32_t *input_offset, int32_t *row_sums)
+ const int8_t *__restrict__ matrix, const int m_rows, const int m_cols, const int8_t *vectors,
+ const float *scaling_factors, int n_batch, float *__restrict__ result,
+ const float *per_channel_scale, const int32_t *input_offset, int32_t *row_sums)
{
void *shuffled_vectors_free;
const int8_t *shuffled_vectors = ShuffleVectors(vectors, n_batch, m_cols, &shuffled_vectors_free);
@@ -300,102 +332,102 @@ static void DotprodMatrixBatchFourVectorMultiplyAccumulate(
const int32_t *batch_offsets_ptr = input_offset + batch;
const int32_t is_channel_scale_nullptr = per_channel_scale == nullptr;
const int32_t is_row_sums_nullptr = row_sums_ptr == nullptr;
- asm volatile("dup v0.4s, wzr\n"
- "dup v1.4s, wzr\n"
- "dup v2.4s, wzr\n"
- "dup v3.4s, wzr\n"
- // Load zero points.
- "ld1 {v7.4s}, [%[batch_offsets_ptr]]\n"
- "ld1 {v4.4s}, [%[scaling_factors_ptr]]\n"
- // Zero out zero point accumulators.
- "dup v14.4s, wzr\n"
- "dup v15.4s, wzr\n"
-
- // Load per channel scales if not null.
- "cmp %w[is_channel_scale_nullptr], #0\n"
- "bne 1f\n"
- "ld1r {v16.4s}, [%[channel_scales_ptr]], #4\n"
- "ld1r {v17.4s}, [%[channel_scales_ptr]]\n"
- "fmul v16.4s, v16.4s, v4.4s\n"
- "fmul v17.4s, v17.4s, v4.4s\n"
- "b 2f\n"
- "1:\n"
- "mov v16.16b, v4.16b\n"
- "mov v17.16b, v4.16b\n"
- "2:\n"
- "ld1 {v12.16b}, [%[mat_ptr0]], #16\n"
- "ld1 {v8.16b}, [%[vec_ptr]], #16\n"
- ".word 0x4f8ce100 // sdot v0.4s, v8.16b, v12.4b[0]\n"
- "ld1 {v9.16b}, [%[vec_ptr]], #16\n"
- ".word 0x4face121 // sdot v1.4s, v9.16b, v12.4b[1]\n"
- "ld1 {v10.16b}, [%[vec_ptr]], #16\n"
- ".word 0x4f8ce940 // sdot v0.4s, v10.16b, v12.4b[2]\n"
- "ld1 {v11.16b}, [%[vec_ptr]], #16\n"
- ".word 0x4face961 // sdot v1.4s, v11.16b, v12.4b[3]\n"
- "ld1 {v13.16b}, [%[mat_ptr1]], #16\n"
- ".word 0x4f8de102 // sdot v2.4s, v8.16b, v13.4b[0]\n"
- ".word 0x4fade123 // sdot v3.4s, v9.16b, v13.4b[1]\n"
- ".word 0x4f8de942 // sdot v2.4s, v10.16b, v13.4b[2]\n"
- ".word 0x4fade963 // sdot v3.4s, v11.16b, v13.4b[3]\n"
- "cmp %w[is_row_sums_nullptr], #1\n"
- "bne 3f\n"
- // Accumulate row_sums for zero point calculations.
- "saddlp v12.8h, v12.16b\n"
- "saddlp v13.8h, v13.16b\n"
- "sadalp v14.4s, v12.8h\n"
- "sadalp v15.4s, v13.8h\n"
- "3:\n"
- "cmp %[mat_ptr0], %[mat_ptr0_end]\n"
- "bne 2b\n"
- "add v0.4s, v0.4s, v1.4s\n"
- "add v2.4s, v2.4s, v3.4s\n"
-
- "cmp %w[is_row_sums_nullptr], #1\n"
- "bne 4f\n"
- // Calculate zero point offsets.
- "addv s14, v14.4s\n"
- "addv s15, v15.4s\n"
- "dup v14.4s, v14.s[0]\n"
- "dup v15.4s, v15.s[0]\n"
- "b 5f\n"
- "4:\n"
- "ld1r {v14.4s}, [%[row_sums_ptr]], #4\n"
- "ld1r {v15.4s}, [%[row_sums_ptr]]\n"
- "5:\n"
-
- "mul v14.4s, v14.4s, v7.4s\n"
- "mul v15.4s, v15.4s, v7.4s\n"
- "sub v0.4s, v0.4s, v14.4s\n"
- "sub v2.4s, v2.4s, v15.4s\n"
-
- "scvtf v0.4s, v0.4s\n"
- "scvtf v1.4s, v2.4s\n"
-
- // Multiply scale.
- "fmul v0.4s, v16.4s, v0.4s\n"
- "fmul v1.4s, v17.4s, v1.4s\n"
-
- "ld2 {v9.s, v10.s}[0], [%[result_ptr]], %[wide_rows]\n"
- "ld2 {v9.s, v10.s}[1], [%[result_ptr]], %[wide_rows]\n"
- "ld2 {v9.s, v10.s}[2], [%[result_ptr]], %[wide_rows]\n"
- "ld2 {v9.s, v10.s}[3], [%[result_ptr]], %[wide_rows]\n"
- "sub %[result_ptr], %[result_ptr], %[wide_rows], lsl #2\n"
- "fadd v9.4s, v9.4s, v0.4s\n"
- "fadd v10.4s, v10.4s, v1.4s\n"
- "st2 {v9.s, v10.s}[0], [%[result_ptr]], %[wide_rows]\n"
- "st2 {v9.s, v10.s}[1], [%[result_ptr]], %[wide_rows]\n"
- "st2 {v9.s, v10.s}[2], [%[result_ptr]], %[wide_rows]\n"
- "st2 {v9.s, v10.s}[3], [%[result_ptr]], %[wide_rows]\n"
- : [mat_ptr0] "+r"(mat_ptr0), [mat_ptr1] "+r"(mat_ptr1), [vec_ptr] "+r"(vec_ptr),
- [result_ptr] "+r"(result_ptr), [row_sums_ptr] "+r"(row_sums_ptr)
- : [mat_ptr0_end] "r"(mat_ptr0_end),
- [scaling_factors_ptr] "r"(scaling_factors_ptr), [wide_rows] "r"(wide_rows),
- [channel_scales_ptr] "r"(channel_scales_ptr),
- [batch_offsets_ptr] "r"(batch_offsets_ptr),
- [is_channel_scale_nullptr] "r"(is_channel_scale_nullptr),
- [is_row_sums_nullptr] "r"(is_row_sums_nullptr)
- : "x0", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11",
- "v12", "v13", "v14", "v15", "v16", "v17", "w0", "w1", "cc", "memory");
+ asm volatile(
+ "dup v0.4s, wzr\n"
+ "dup v1.4s, wzr\n"
+ "dup v2.4s, wzr\n"
+ "dup v3.4s, wzr\n"
+ // Load zero points.
+ "ld1 {v7.4s}, [%[batch_offsets_ptr]]\n"
+ "ld1 {v4.4s}, [%[scaling_factors_ptr]]\n"
+ // Zero out zero point accumulators.
+ "dup v14.4s, wzr\n"
+ "dup v15.4s, wzr\n"
+
+ // Load per channel scales if not null.
+ "cmp %w[is_channel_scale_nullptr], #0\n"
+ "bne 1f\n"
+ "ld1r {v16.4s}, [%[channel_scales_ptr]], #4\n"
+ "ld1r {v17.4s}, [%[channel_scales_ptr]]\n"
+ "fmul v16.4s, v16.4s, v4.4s\n"
+ "fmul v17.4s, v17.4s, v4.4s\n"
+ "b 2f\n"
+ "1:\n"
+ "mov v16.16b, v4.16b\n"
+ "mov v17.16b, v4.16b\n"
+ "2:\n"
+ "ld1 {v12.16b}, [%[mat_ptr0]], #16\n"
+ "ld1 {v8.16b}, [%[vec_ptr]], #16\n"
+ ".word 0x4f8ce100 // sdot v0.4s, v8.16b, v12.4b[0]\n"
+ "ld1 {v9.16b}, [%[vec_ptr]], #16\n"
+ ".word 0x4face121 // sdot v1.4s, v9.16b, v12.4b[1]\n"
+ "ld1 {v10.16b}, [%[vec_ptr]], #16\n"
+ ".word 0x4f8ce940 // sdot v0.4s, v10.16b, v12.4b[2]\n"
+ "ld1 {v11.16b}, [%[vec_ptr]], #16\n"
+ ".word 0x4face961 // sdot v1.4s, v11.16b, v12.4b[3]\n"
+ "ld1 {v13.16b}, [%[mat_ptr1]], #16\n"
+ ".word 0x4f8de102 // sdot v2.4s, v8.16b, v13.4b[0]\n"
+ ".word 0x4fade123 // sdot v3.4s, v9.16b, v13.4b[1]\n"
+ ".word 0x4f8de942 // sdot v2.4s, v10.16b, v13.4b[2]\n"
+ ".word 0x4fade963 // sdot v3.4s, v11.16b, v13.4b[3]\n"
+ "cmp %w[is_row_sums_nullptr], #1\n"
+ "bne 3f\n"
+ // Accumulate row_sums for zero point calculations.
+ "saddlp v12.8h, v12.16b\n"
+ "saddlp v13.8h, v13.16b\n"
+ "sadalp v14.4s, v12.8h\n"
+ "sadalp v15.4s, v13.8h\n"
+ "3:\n"
+ "cmp %[mat_ptr0], %[mat_ptr0_end]\n"
+ "bne 2b\n"
+ "add v0.4s, v0.4s, v1.4s\n"
+ "add v2.4s, v2.4s, v3.4s\n"
+
+ "cmp %w[is_row_sums_nullptr], #1\n"
+ "bne 4f\n"
+ // Calculate zero point offsets.
+ "addv s14, v14.4s\n"
+ "addv s15, v15.4s\n"
+ "dup v14.4s, v14.s[0]\n"
+ "dup v15.4s, v15.s[0]\n"
+ "b 5f\n"
+ "4:\n"
+ "ld1r {v14.4s}, [%[row_sums_ptr]], #4\n"
+ "ld1r {v15.4s}, [%[row_sums_ptr]]\n"
+ "5:\n"
+
+ "mul v14.4s, v14.4s, v7.4s\n"
+ "mul v15.4s, v15.4s, v7.4s\n"
+ "sub v0.4s, v0.4s, v14.4s\n"
+ "sub v2.4s, v2.4s, v15.4s\n"
+
+ "scvtf v0.4s, v0.4s\n"
+ "scvtf v1.4s, v2.4s\n"
+
+ // Multiply scale.
+ "fmul v0.4s, v16.4s, v0.4s\n"
+ "fmul v1.4s, v17.4s, v1.4s\n"
+
+ "ld2 {v9.s, v10.s}[0], [%[result_ptr]], %[wide_rows]\n"
+ "ld2 {v9.s, v10.s}[1], [%[result_ptr]], %[wide_rows]\n"
+ "ld2 {v9.s, v10.s}[2], [%[result_ptr]], %[wide_rows]\n"
+ "ld2 {v9.s, v10.s}[3], [%[result_ptr]], %[wide_rows]\n"
+ "sub %[result_ptr], %[result_ptr], %[wide_rows], lsl #2\n"
+ "fadd v9.4s, v9.4s, v0.4s\n"
+ "fadd v10.4s, v10.4s, v1.4s\n"
+ "st2 {v9.s, v10.s}[0], [%[result_ptr]], %[wide_rows]\n"
+ "st2 {v9.s, v10.s}[1], [%[result_ptr]], %[wide_rows]\n"
+ "st2 {v9.s, v10.s}[2], [%[result_ptr]], %[wide_rows]\n"
+ "st2 {v9.s, v10.s}[3], [%[result_ptr]], %[wide_rows]\n"
+ : [ mat_ptr0 ] "+r"(mat_ptr0), [ mat_ptr1 ] "+r"(mat_ptr1), [ vec_ptr ] "+r"(vec_ptr),
+ [ result_ptr ] "+r"(result_ptr), [ row_sums_ptr ] "+r"(row_sums_ptr)
+ : [ mat_ptr0_end ] "r"(mat_ptr0_end), [ scaling_factors_ptr ] "r"(scaling_factors_ptr),
+ [ wide_rows ] "r"(wide_rows), [ channel_scales_ptr ] "r"(channel_scales_ptr),
+ [ batch_offsets_ptr ] "r"(batch_offsets_ptr),
+ [ is_channel_scale_nullptr ] "r"(is_channel_scale_nullptr),
+ [ is_row_sums_nullptr ] "r"(is_row_sums_nullptr)
+ : "x0", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12",
+ "v13", "v14", "v15", "v16", "v17", "w0", "w1", "cc", "memory");
}
}
@@ -425,10 +457,10 @@ static void DotprodMatrixBatchFourVectorMultiplyAccumulate(
//
// We don't use this kernel when n_batch = 1 because the baseline kernel
// is fine for that case.
-void DotprodMatrixBatchPaddedFourVectorMultiplyAccumulate(
- const int8_t *__restrict__ matrix, const int m_rows, const int m_cols, const int8_t *vectors,
- const float *scaling_factors, int n_batch, float *__restrict__ result,
- const float *per_channel_scale, const int32_t *input_offset, int32_t *row_sums)
+inline void DotprodMatrixBatchPaddedFourVectorMultiplyAccumulate(
+ const int8_t *__restrict__ matrix, const int m_rows, const int m_cols, const int8_t *vectors,
+ const float *scaling_factors, int n_batch, float *__restrict__ result,
+ const float *per_channel_scale, const int32_t *input_offset, int32_t *row_sums)
{
const int kWeightsPerUint32 = 4;
@@ -443,14 +475,14 @@ void DotprodMatrixBatchPaddedFourVectorMultiplyAccumulate(
void *padded_vectors_free;
const int padded_vectors_size = batch_round_up * m_cols;
int8_t *padded_vectors = reinterpret_cast<int8_t *>(
- aligned_alloc(kWeightsPerUint32, padded_vectors_size, &padded_vectors_free));
+ aligned_alloc(kWeightsPerUint32, padded_vectors_size, &padded_vectors_free));
memset(padded_vectors, 0, padded_vectors_size);
void *padded_result_free;
const int result_size = n_batch * m_rows * sizeof(float);
const int padded_result_size = batch_round_up * m_rows * sizeof(float);
float *padded_result = reinterpret_cast<float *>(
- aligned_alloc(kWeightsPerUint32, padded_result_size, &padded_result_free));
+ aligned_alloc(kWeightsPerUint32, padded_result_size, &padded_result_free));
memcpy(padded_result, result, result_size);
memset(reinterpret_cast<char *>(padded_result) + result_size, 0,
padded_result_size - result_size);
@@ -462,7 +494,7 @@ void DotprodMatrixBatchPaddedFourVectorMultiplyAccumulate(
void *padded_scaling_factors_free;
const int padded_scaling_factors_size = batch_round_up * sizeof(float);
float *padded_scaling_factors = reinterpret_cast<float *>(
- aligned_alloc(kWeightsPerUint32, padded_scaling_factors_size, &padded_scaling_factors_free));
+ aligned_alloc(kWeightsPerUint32, padded_scaling_factors_size, &padded_scaling_factors_free));
assert(static_cast<int>(n_batch * sizeof(float)) <= padded_scaling_factors_size);
assert(static_cast<int>(batch_round_up * sizeof(float)) <= padded_scaling_factors_size);
memset(padded_scaling_factors, 0, batch_round_up * sizeof(float));
@@ -473,7 +505,7 @@ void DotprodMatrixBatchPaddedFourVectorMultiplyAccumulate(
void *padded_input_offset_free;
const int padded_input_offset_size = batch_round_up * sizeof(int32_t);
int32_t *padded_input_offset = reinterpret_cast<int32_t *>(
- aligned_alloc(kWeightsPerUint32, padded_input_offset_size, &padded_input_offset_free));
+ aligned_alloc(kWeightsPerUint32, padded_input_offset_size, &padded_input_offset_free));
assert(static_cast<int>(n_batch * sizeof(int32_t)) <= padded_input_offset_size);
assert(static_cast<int>(batch_round_up * sizeof(int32_t)) <= padded_input_offset_size);
memset(padded_input_offset, 0, batch_round_up * sizeof(int32_t));
@@ -481,8 +513,8 @@ void DotprodMatrixBatchPaddedFourVectorMultiplyAccumulate(
// Call the main kernel.
DotprodMatrixBatchFourVectorMultiplyAccumulate(
- matrix, m_rows, m_cols, padded_vectors, padded_scaling_factors, batch_round_up,
- padded_result, per_channel_scale, padded_input_offset, row_sums);
+ matrix, m_rows, m_cols, padded_vectors, padded_scaling_factors, batch_round_up, padded_result,
+ per_channel_scale, padded_input_offset, row_sums);
free(padded_input_offset_free);
}
@@ -500,20 +532,40 @@ void DotprodMatrixBatchPaddedFourVectorMultiplyAccumulate(
free(padded_scaling_factors_free);
}
-void DotprodMatrixBatchPaddedFourVectorMultiplyAccumulate(const int8_t *__restrict__ matrix,
- const int m_rows, const int m_cols,
- const int8_t *vectors,
- const float *scaling_factors, int n_batch,
- float *__restrict__ result)
+inline void DotprodMatrixBatchPaddedFourVectorMultiplyAccumulate(
+ const int8_t *__restrict__ matrix, const int m_rows, const int m_cols, const int8_t *vectors,
+ const float *scaling_factors, int n_batch, float *__restrict__ result)
{
DotprodMatrixBatchPaddedFourVectorMultiplyAccumulate(
- matrix, m_rows, m_cols, vectors, scaling_factors, n_batch, result,
- /*per_channel_scale=*/nullptr, /*input_offset=*/nullptr,
- /*row_sums=*/nullptr);
+ matrix, m_rows, m_cols, vectors, scaling_factors, n_batch, result,
+ /*per_channel_scale=*/nullptr, /*input_offset=*/nullptr,
+ /*row_sums=*/nullptr);
}
#endif // __aarch64__
-bool NeonIsZeroVector(const float *vector, int v_size)
+inline void NeonCwiseClipping(float *vector, const int v_size, const float clipping_value)
+{
+ const float32x4_t clipping_value_f32x4 = vmovq_n_f32(clipping_value);
+ const float32x4_t neg_clipping_value_f32x4 = vmovq_n_f32(-clipping_value);
+
+ int i = 0;
+ for (; i <= v_size - kFloatValuesPerNeonVector; i += kFloatValuesPerNeonVector)
+ {
+ // Load from memory to vector.
+ float32x4_t v_f32x4 = vld1q_f32(vector + i);
+ // Clip between clipping_value and -clipping_value.
+ v_f32x4 = vminq_f32(clipping_value_f32x4, v_f32x4);
+ v_f32x4 = vmaxq_f32(neg_clipping_value_f32x4, v_f32x4);
+ // Save to output.
+ vst1q_f32(vector + i, v_f32x4);
+ }
+ for (; i < v_size; i++)
+ {
+ vector[i] = std::max(std::min(clipping_value, vector[i]), -clipping_value);
+ }
+}
+
+inline bool NeonIsZeroVector(const float *vector, int v_size)
{
// If v_size is not divisible by kFloatWeightsPerNeonLane, we cannot
// use the main vectorized loop, and we need to process sequentially.
@@ -544,15 +596,16 @@ bool NeonIsZeroVector(const float *vector, int v_size)
return true;
}
-void NeonCpuBackendGemm(const int8_t *input, const int32_t *bias,
- const int8_t *input_to_gate_weights, int32_t n_batch, int32_t n_input,
- int32_t n_output, int32_t, int32_t *scratch, ruy::Context *ruy_context)
+inline void NeonCpuBackendGemm(const int8_t *input, const int32_t *bias,
+ const int8_t *input_to_gate_weights, int32_t n_batch,
+ int32_t n_input, int32_t n_output, int32_t, int32_t *scratch,
+ ruy::Context *ruy_context)
{
MatrixParams<int8_t> lhs_params;
lhs_params.order = Order::kRowMajor;
lhs_params.rows = n_output;
lhs_params.cols = n_input;
- lhs_params.cacheable = true;
+ lhs_params.cache_policy = CachePolicy::kAlwaysCache;
MatrixParams<int8_t> rhs_params;
rhs_params.order = Order::kColMajor;
@@ -574,19 +627,44 @@ void NeonCpuBackendGemm(const int8_t *input, const int32_t *bias,
ruy::Matrix<int8_t> ruy_lhs;
ruy::Matrix<int8_t> ruy_rhs;
ruy::Matrix<int32_t> ruy_dst;
- ruy_support::MakeRuyMatrix(lhs_params, input_to_gate_weights, &ruy_lhs);
- ruy_support::MakeRuyMatrix(rhs_params, input, &ruy_rhs);
+ // Note that cache is always enabled for input and weight tensors
+ ruy_support::MakeRuyMatrix(lhs_params, input_to_gate_weights, &ruy_lhs, true);
+ ruy_support::MakeRuyMatrix(rhs_params, input, &ruy_rhs, true);
ruy_support::MakeRuyMatrix(dst_params, scratch, &ruy_dst);
- ruy::BasicSpec<int32_t, int32_t> ruy_spec;
- ruy_support::MakeRuySpec(gemm_params, &ruy_spec);
+ ruy::MulParams<int32_t, int32_t> ruy_mul_params;
+ ruy_support::MakeRuyMulParams(gemm_params, &ruy_mul_params);
- constexpr ruy::Path kRuyPath = ruy::kAllPaths;
- ruy::Mul<kRuyPath>(ruy_lhs, ruy_rhs, ruy_spec, ruy_context, &ruy_dst);
+ ruy::Mul(ruy_lhs, ruy_rhs, ruy_mul_params, ruy_context, &ruy_dst);
+}
+
+inline void NeonSub1Vector(const float *vector, int v_size, float *result)
+{
+ // If v_size is not divisible by the vector size, then we need to process the
+ // final few elements sequentially. postamble_start shows the start index
+ // where this should happen.
+ const int postamble_start = RoundDownVectors<kFloatValuesPerNeonVector>(v_size);
+
+ float32x4_t one_f32x4 = vmovq_n_f32(1.0);
+ int v = 0;
+ for (; v < postamble_start; v += kFloatValuesPerNeonVector)
+ {
+ // Load 4 float values from the current pointers of the input column and
+ // subtract from 1.
+ float32x4_t v_f32x4 = vld1q_f32(vector + v);
+ float32x4_t result_f32x4 = vsubq_f32(one_f32x4, v_f32x4);
+ // Save to output.
+ vst1q_f32(result + v, result_f32x4);
+ }
+ for (; v < v_size; v++)
+ {
+ result[v] = 1.0f - vector[v];
+ }
}
-void NeonSymmetricQuantizeFloats(const float *values, const int size, int8_t *quantized_values,
- float *min, float *max, float *scaling_factor)
+inline void NeonSymmetricQuantizeFloats(const float *values, const int size,
+ int8_t *quantized_values, float *min, float *max,
+ float *scaling_factor)
{
// TODO(raziel): vectorize min/max calculation.
auto minmax = std::minmax_element(values, values + size);
@@ -658,15 +736,16 @@ void NeonSymmetricQuantizeFloats(const float *values, const int size, int8_t *qu
for (int i = postamble_start; i < size; ++i)
{
const int32_t quantized_value =
- static_cast<int32_t>(std::round(scaling_factor_inv * values[i]));
+ static_cast<int32_t>(std::round(scaling_factor_inv * values[i]));
quantized_values[i] = std::min(kScale, std::max(-kScale, quantized_value));
}
}
-void NeonMatrixBatchVectorMultiplyAccumulate(const int8_t *__restrict__ matrix, const int m_rows,
- const int m_cols, const int8_t *__restrict__ vectors,
- const float *scaling_factors, int n_batch,
- float *__restrict__ result, int result_stride)
+inline void NeonMatrixBatchVectorMultiplyAccumulate(const int8_t *__restrict__ matrix,
+ const int m_rows, const int m_cols,
+ const int8_t *__restrict__ vectors,
+ const float *scaling_factors, int n_batch,
+ float *__restrict__ result, int result_stride)
{
#ifdef __aarch64__
if (HasSdotInstruction() && m_cols % 16 == 0 && m_rows % 2 == 0 && m_rows >= n_batch)
@@ -751,7 +830,7 @@ void NeonMatrixBatchVectorMultiplyAccumulate(const int8_t *__restrict__ matrix,
// Here the assumption is that each buffer is 4-byte aligned. Otherwise,
// performance may suffer significantly.
assert( // NOLINT
- ((uintptr_t)(&row_ptr[col]) & (kWeightsPerUint32 - 1)) == 0);
+ ((uintptr_t)(&row_ptr[col]) & (kWeightsPerUint32 - 1)) == 0);
const int8x16_t s1_8x16 = vld1q_s8((const int8_t *)(aligned_vec + col));
const int8x16_t s2_8x16 = vld1q_s8((const int8_t *)(row_ptr + col));
// Multiply the low bits (i.e. the lower 8 8bit numbers in the
@@ -776,7 +855,7 @@ void NeonMatrixBatchVectorMultiplyAccumulate(const int8_t *__restrict__ matrix,
// Here the assumption is that each buffer is 4-bytes aligned.
// Otherwise, performance may suffer significantly.
assert( // NOLINT
- ((uintptr_t)(&row_ptr[col]) & (kWeightsPerUint32 - 1)) == 0);
+ ((uintptr_t)(&row_ptr[col]) & (kWeightsPerUint32 - 1)) == 0);
const int8x8_t s1_8x8 = vld1_s8((const int8_t *)(aligned_vec + col));
const int8x8_t s2_8x8 = vld1_s8((const int8_t *)(row_ptr + col));
const int16x8_t prod_16x8 = vmull_s8(s1_8x8, s2_8x8);
@@ -804,9 +883,9 @@ void NeonMatrixBatchVectorMultiplyAccumulate(const int8_t *__restrict__ matrix,
free(aligned_vec_free);
}
-void NeonMatrixBatchVectorMultiplyAccumulate(const float *matrix, int m_rows, int m_cols,
- const float *vector, int n_batch, float *result,
- int result_stride)
+inline void NeonMatrixBatchVectorMultiplyAccumulate(const float *matrix, int m_rows, int m_cols,
+ const float *vector, int n_batch, float *result,
+ int result_stride)
{
// If v_size is not divisible by kWeightsPerNeonLane, we cannot use the main
// vectorized loop, and we need to process sequentially. postamble_start shows
@@ -845,11 +924,12 @@ void NeonMatrixBatchVectorMultiplyAccumulate(const float *matrix, int m_rows, in
}
}
-void NeonMatrixBatchVectorMultiplyAccumulate(const int8_t *__restrict__ matrix, const int m_rows,
- const int m_cols, const int8_t *__restrict__ vectors,
- const float *scaling_factors, int n_batch,
- int32_t *scratch, float *__restrict__ result,
- int result_stride, ruy::Context *ruy_context)
+inline void NeonMatrixBatchVectorMultiplyAccumulate(const int8_t *__restrict__ matrix,
+ const int m_rows, const int m_cols,
+ const int8_t *__restrict__ vectors,
+ const float *scaling_factors, int n_batch,
+ int32_t *scratch, float *__restrict__ result,
+ int result_stride, ruy::Context *ruy_context)
{
if (m_rows % 4 == 0 && result_stride == 1)
{
@@ -872,7 +952,7 @@ void NeonMatrixBatchVectorMultiplyAccumulate(const int8_t *__restrict__ matrix,
const float32x4_t float_val1 = vcvtq_f32_s32(scratch_val1);
const float32x4_t result0 = vmlaq_f32(vld1q_f32(result), float_val0, scaling_factor0);
const float32x4_t result1 =
- vmlaq_f32(vld1q_f32(result + 4 * result_stride), float_val1, scaling_factor1);
+ vmlaq_f32(vld1q_f32(result + 4 * result_stride), float_val1, scaling_factor1);
vst1q_f32(result, result0);
vst1q_f32(result + 4 * result_stride, result1);
}
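
The helpers added near the top of this file (kFloatValuesPerNeonVector and RoundDownVectors) factor out the usual main-loop/postamble split: process as many whole 4-float NEON vectors as fit, then finish the remaining 0..3 elements scalar-wise. A scalar sketch of that control flow, with the intrinsics replaced by an inner loop (illustrative only, not code from this patch):

#include <algorithm>

// Scalar illustration of the loop structure used by NeonCwiseClipping above.
inline void CwiseClippingSketch(float *vector, int v_size, float clipping_value)
{
  constexpr int kPerVector = 4;                    // floats per 128-bit NEON register
  const int main_end = v_size & ~(kPerVector - 1); // RoundDownVectors<4>(v_size)

  int i = 0;
  for (; i < main_end; i += kPerVector)
  {
    // In the real kernel this block is a single vld1q_f32 / vminq_f32 /
    // vmaxq_f32 / vst1q_f32 sequence operating on 4 floats at once.
    for (int j = 0; j < kPerVector; ++j)
      vector[i + j] = std::max(std::min(clipping_value, vector[i + j]), -clipping_value);
  }
  // Postamble: leftover elements handled one at a time.
  for (; i < v_size; ++i)
    vector[i] = std::max(std::min(clipping_value, vector[i]), -clipping_value);
}
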
diff --git a/compute/cker/include/cker/PortableTensorUtils.h b/compute/cker/include/cker/PortableTensorUtils.h
index 54714e214..7e4b01a01 100644
--- a/compute/cker/include/cker/PortableTensorUtils.h
+++ b/compute/cker/include/cker/PortableTensorUtils.h
@@ -45,6 +45,10 @@ public:
return a < 0.f ? 0.f : a;
case FusedActivationFunctionType::kRelu6:
return std::max(0.f, std::min(a, 6.f));
+ case FusedActivationFunctionType::kTanh:
+ return std::tanh(a);
+ case FusedActivationFunctionType::kSigmoid:
+ return 1.0f / (1.0f + std::exp(-a));
default:
// TODO(aselle): More informative fatal error!
exit(1);
@@ -55,8 +59,17 @@ private:
FusedActivationFunctionType act_;
};
-void PortableVectorBatchVectorAssign(const float *vector, int v_size, int n_batch,
- float *batch_vector)
+template <typename T>
+void PortableCwiseClipping(T *vector, const int v_size, const T clipping_value)
+{
+ for (int i = 0; i < v_size; i++)
+ {
+ vector[i] = std::max(std::min(clipping_value, vector[i]), static_cast<T>(-clipping_value));
+ }
+}
+
+inline void PortableVectorBatchVectorAssign(const float *vector, int v_size, int n_batch,
+ float *batch_vector)
{
for (int b = 0; b < n_batch; b++)
{
@@ -64,7 +77,20 @@ void PortableVectorBatchVectorAssign(const float *vector, int v_size, int n_batc
}
}
-bool PortableIsZeroVector(const float *vector, int v_size)
+inline void PortableVectorBatchVectorAdd(const float *vector, int v_size, int n_batch,
+ float *batch_vector)
+{
+ for (int b = 0; b < n_batch; b++)
+ {
+ for (int i = 0; i < v_size; ++i)
+ {
+ batch_vector[i] += vector[i];
+ }
+ batch_vector += v_size;
+ }
+}
+
+inline bool PortableIsZeroVector(const float *vector, int v_size)
{
for (int i = 0; i < v_size; ++i)
{
@@ -74,8 +100,8 @@ bool PortableIsZeroVector(const float *vector, int v_size)
return true;
}
-void PortableApplyActivationToVector(const float *vector, int v_size,
- FusedActivationFunctionType activation, float *result)
+inline void PortableApplyActivationToVector(const float *vector, int v_size,
+ FusedActivationFunctionType activation, float *result)
{
auto activation_func = ActivationFunctor(activation);
for (int v = 0; v < v_size; v++)
@@ -84,8 +110,17 @@ void PortableApplyActivationToVector(const float *vector, int v_size,
}
}
-void PortableSymmetricQuantizeFloats(const float *values, const int size, int8_t *quantized_values,
- float *min_value, float *max_value, float *scaling_factor)
+inline void PortableSub1Vector(const float *vector, int v_size, float *result)
+{
+ for (int v = 0; v < v_size; v++)
+ {
+ *result++ = 1.0f - *vector++;
+ }
+}
+
+inline void PortableSymmetricQuantizeFloats(const float *values, const int size,
+ int8_t *quantized_values, float *min_value,
+ float *max_value, float *scaling_factor)
{
auto minmax = std::minmax_element(values, values + size);
*min_value = *minmax.first;
@@ -103,17 +138,72 @@ void PortableSymmetricQuantizeFloats(const float *values, const int size, int8_t
for (int i = 0; i < size; ++i)
{
const int32_t quantized_value =
- static_cast<int32_t>(std::round(values[i] * scaling_factor_inv));
+ static_cast<int32_t>(std::round(values[i] * scaling_factor_inv));
// Clamp: just in case some odd numeric offset.
quantized_values[i] = std::min(kScale, std::max(-kScale, quantized_value));
}
}
-void PortableMatrixBatchVectorMultiplyAccumulate(const int8_t *__restrict__ matrix,
- const int m_rows, const int m_cols,
- const int8_t *__restrict__ vectors,
- const float *scaling_factors, int n_batch,
- float *__restrict__ result, int result_stride)
+inline void PortableAsymmetricQuantizeFloats(const float *values, const int size,
+ int8_t *quantized_values, float *scaling_factor,
+ int32_t *offset)
+{
+ /* Copied from TensorFlow PortableAsymmetricQuantizeFloats */
+ const int32_t kMinScale = -128;
+ const int32_t kMaxScale = 127;
+ const double qmin_double = kMinScale;
+ const double qmax_double = kMaxScale;
+ const auto minmax = std::minmax_element(values, values + size);
+ const double rmin = static_cast<double>(std::min(0.0f, *minmax.first));
+ const double rmax = static_cast<double>(std::max(0.0f, *minmax.second));
+ if (rmin == rmax)
+ {
+ memset(quantized_values, 0, size * sizeof(int8_t));
+ *scaling_factor = 1;
+ *offset = 0;
+ return;
+ }
+ else
+ {
+ double scale = (rmax - rmin) / (qmax_double - qmin_double);
+ const double zero_point_from_min = qmin_double - rmin / scale;
+ const double zero_point_from_max = qmax_double - rmax / scale;
+ const double zero_point_from_min_error = std::abs(qmin_double) + std::abs(rmin / scale);
+ const double zero_point_from_max_error = std::abs(qmax_double) + std::abs(rmax / scale);
+ const double zero_point_double = zero_point_from_min_error < zero_point_from_max_error
+ ? zero_point_from_min
+ : zero_point_from_max;
+ int8_t nudged_zero_point = 0;
+ if (zero_point_double <= qmin_double)
+ {
+ nudged_zero_point = kMinScale;
+ }
+ else if (zero_point_double >= qmax_double)
+ {
+ nudged_zero_point = kMaxScale;
+ }
+ else
+ {
+ nudged_zero_point = static_cast<int8_t>(round(zero_point_double));
+ }
+ *scaling_factor = scale;
+ *offset = nudged_zero_point;
+ }
+ const float scaling_factor_inv = 1.0f / *scaling_factor;
+ for (int i = 0; i < size; ++i)
+ {
+ const int32_t quantized_value =
+ static_cast<int32_t>(std::round(*offset + values[i] * scaling_factor_inv));
+ quantized_values[i] = std::min(kMaxScale, std::max(kMinScale, quantized_value));
+ }
+}
+
+inline void PortableMatrixBatchVectorMultiplyAccumulate(const int8_t *__restrict__ matrix,
+ const int m_rows, const int m_cols,
+ const int8_t *__restrict__ vectors,
+ const float *scaling_factors, int n_batch,
+ float *__restrict__ result,
+ int result_stride)
{
int batch, row, col;
for (batch = 0; batch < n_batch; ++batch, vectors += m_cols)
@@ -138,20 +228,20 @@ void PortableMatrixBatchVectorMultiplyAccumulate(const int8_t *__restrict__ matr
} // for batch
}
-void PortableMatrixBatchVectorMultiplyAccumulate(const int8_t *__restrict__ matrix,
- const int m_rows, const int m_cols,
- const int8_t *__restrict__ vector,
- const float *scaling_factors, int n_batch,
- int32_t *, float *__restrict__ result,
- int result_stride, ruy::Context *)
+inline void PortableMatrixBatchVectorMultiplyAccumulate(const int8_t *__restrict__ matrix,
+ const int m_rows, const int m_cols,
+ const int8_t *__restrict__ vector,
+ const float *scaling_factors, int n_batch,
+ int32_t *, float *__restrict__ result,
+ int result_stride, ruy::Context *)
{
PortableMatrixBatchVectorMultiplyAccumulate(matrix, m_rows, m_cols, vector, scaling_factors,
n_batch, result, result_stride);
}
-void PortableMatrixBatchVectorMultiplyAccumulate(const float *matrix, int m_rows, int m_cols,
- const float *vector, int n_batch, float *result,
- int result_stride)
+inline void PortableMatrixBatchVectorMultiplyAccumulate(const float *matrix, int m_rows, int m_cols,
+ const float *vector, int n_batch,
+ float *result, int result_stride)
{
float *result_in_batch = result;
for (int b = 0; b < n_batch; b++)
@@ -171,7 +261,36 @@ void PortableMatrixBatchVectorMultiplyAccumulate(const float *matrix, int m_rows
}
}
-void PortableZeroVector(float *vector, int v_size) { std::fill_n(vector, v_size, 0); }
+inline void PortableMeanStddevNormalization(const float *input_vector, float *output_vector,
+ int v_size, int n_batch)
+{
+ for (int batch = 0; batch < n_batch; ++batch)
+ {
+ float sum = 0.0f;
+ for (int i = 0; i < v_size; ++i)
+ {
+ sum += input_vector[i];
+ }
+ const float mean = sum / v_size;
+ float sum_diff_sq = 0.0f;
+ for (int i = 0; i < v_size; ++i)
+ {
+ const float diff = input_vector[i] - mean;
+ sum_diff_sq += diff * diff;
+ }
+ const float variance = sum_diff_sq / v_size;
+ constexpr float kNormalizationConstant = 1e-8f;
+ const float stddev_inv = 1.0f / std::sqrt(variance + kNormalizationConstant);
+ for (int i = 0; i < v_size; ++i)
+ {
+ output_vector[i] = (input_vector[i] - mean) * stddev_inv;
+ }
+ input_vector += v_size;
+ output_vector += v_size;
+ }
+}
+
+inline void PortableZeroVector(float *vector, int v_size) { std::fill_n(vector, v_size, 0); }
} // namespace cker
} // namespace nnfw
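
PortableMeanStddevNormalization above normalizes each batch row to zero mean and unit variance, with a 1e-8 stabilizer under the square root. A small hedged usage example; the caller and the numbers are illustrative only:

#include "cker/PortableTensorUtils.h"

void NormalizeExample()
{
  const float input[4] = {1.0f, 2.0f, 3.0f, 4.0f}; // mean 2.5, variance 1.25
  float output[4];
  nnfw::cker::PortableMeanStddevNormalization(input, output, /*v_size=*/4, /*n_batch=*/1);
  // output ~= {-1.342f, -0.447f, 0.447f, 1.342f}, i.e. (x - 2.5) / sqrt(1.25 + 1e-8)
}
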
diff --git a/compute/cker/include/cker/Shape.h b/compute/cker/include/cker/Shape.h
index 2486f01a6..9269ce9aa 100644
--- a/compute/cker/include/cker/Shape.h
+++ b/compute/cker/include/cker/Shape.h
@@ -136,12 +136,27 @@ public:
std::memcpy(dst_dims, dims_data, dimensions_count * sizeof(int32_t));
}
+ inline void ReplaceWith(const Shape &other)
+ {
+ ReplaceWith(other.DimensionsCount(), other.DimsData());
+ }
+
+ inline void ReplaceWith(Shape &&other)
+ {
+ Resize(0);
+ std::swap(_size, other._size);
+ if (_size <= kMaxSmallSize)
+ std::copy(other._dims, other._dims + kMaxSmallSize, _dims);
+ else
+ _dims_pointer = other._dims_pointer;
+ }
+
template <typename T> inline void BuildFrom(const T &src_iterable)
{
const int dimensions_count = std::distance(src_iterable.begin(), src_iterable.end());
Resize(dimensions_count);
int32_t *data = DimsData();
- for (auto it : src_iterable)
+ for (auto &&it : src_iterable)
{
*data = it;
++data;
@@ -172,7 +187,6 @@ public:
for (int i = 0; i < _size; i++)
{
const int dim = dims_data[i];
- assert(dim >= 1);
buffer_size *= dim;
}
return buffer_size;
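
With the assert(dim >= 1) dropped from FlatSize(), a Shape may now carry zero-sized dimensions; the flat size simply multiplies out to 0 instead of aborting. A small illustration, assuming Shape's initializer-list constructor:

#include "cker/Shape.h"

void ZeroSizedDimExample()
{
  // A batch of 0 rows of 16 features: previously this tripped the assert in
  // FlatSize(); now it is a valid, empty shape.
  const nnfw::cker::Shape empty_batch{0, 16};
  const int flat = empty_batch.FlatSize(); // 0 * 16 == 0
  (void)flat;
}
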
diff --git a/compute/cker/include/cker/TensorUtils.h b/compute/cker/include/cker/TensorUtils.h
index e07c91239..bac79b887 100644
--- a/compute/cker/include/cker/TensorUtils.h
+++ b/compute/cker/include/cker/TensorUtils.h
@@ -31,55 +31,133 @@ namespace nnfw
namespace cker
{
-void VectorBatchVectorAssign(const float *vector, int v_size, int n_batch, float *batch_vector)
+inline void CwiseClipping(float *vector, const int v_size, const float clipping_value)
+{
+ NEON_OR_PORTABLE(CwiseClipping, vector, v_size, clipping_value);
+}
+
+inline void VectorBatchVectorAdd(const float *vector, int v_size, int n_batch, float *batch_vector)
+{
+ PortableVectorBatchVectorAdd(vector, v_size, n_batch, batch_vector);
+}
+
+inline void VectorBatchVectorAssign(const float *vector, int v_size, int n_batch,
+ float *batch_vector)
{
PortableVectorBatchVectorAssign(vector, v_size, n_batch, batch_vector);
}
-bool IsZeroVector(const float *vector, int v_size)
+// Cwise product of two vectors.
+template <typename T>
+inline void VectorVectorCwiseProduct(const T *__restrict__ vector1, const T *__restrict__ vector2,
+ int v_size, T *__restrict__ result)
+{
+ for (int v = 0; v < v_size; v++)
+ {
+ *result++ = *vector1++ * *vector2++;
+ }
+}
+
+// Cwise product and accumulate of two vectors. Since it's a MAC operation, the
+// assumption here is that result array is initialized to valid values.
+template <typename T>
+inline void VectorVectorCwiseProductAccumulate(const T *__restrict__ vector1,
+ const T *__restrict__ vector2, int v_size,
+ T *__restrict__ result)
+{
+ for (int v = 0; v < v_size; v++)
+ {
+ *result++ += *vector1++ * *vector2++;
+ }
+}
+
+// Cwise product of a vector and a batch-vector.
+template <typename T>
+inline void VectorBatchVectorCwiseProduct(const T *vector, int v_size, const T *batch_vector,
+ int n_batch, T *result)
+{
+ for (int b = 0; b < n_batch; b++)
+ {
+ VectorVectorCwiseProduct(vector, batch_vector, v_size, result);
+ // Update the pointers.
+ result += v_size;
+ batch_vector += v_size;
+ }
+}
+
+// Cwise product and accumulate of a vector and a batch-vector. Since it's a MAC
+// operation, the assumption here is that result array is initialized to valid
+// values.
+template <typename T>
+inline void VectorBatchVectorCwiseProductAccumulate(const T *vector, int v_size,
+ const T *batch_vector, int n_batch, T *result)
+{
+ for (int b = 0; b < n_batch; b++)
+ {
+ VectorVectorCwiseProductAccumulate(vector, batch_vector, v_size, result);
+ // Update the pointers.
+ result += v_size;
+ batch_vector += v_size;
+ }
+}
+
+inline bool IsZeroVector(const float *vector, int v_size)
{
return NEON_OR_PORTABLE(IsZeroVector, vector, v_size);
}
-void ApplyActivationToVector(const float *vector, int v_size,
- FusedActivationFunctionType activation, float *result)
+inline void ApplyActivationToVector(const float *vector, int v_size,
+ FusedActivationFunctionType activation, float *result)
{
PortableApplyActivationToVector(vector, v_size, activation, result);
}
-void SymmetricQuantizeFloats(const float *values, const int size, int8_t *quantized_values,
- float *min, float *max, float *scaling_factor)
+inline void Sub1Vector(const float *vector, int v_size, float *result)
+{
+ NEON_OR_PORTABLE(Sub1Vector, vector, v_size, result);
+}
+
+inline void SymmetricQuantizeFloats(const float *values, const int size, int8_t *quantized_values,
+ float *min, float *max, float *scaling_factor)
{
return NEON_OR_PORTABLE(SymmetricQuantizeFloats, values, size, quantized_values, min, max,
scaling_factor);
}
-void MatrixBatchVectorMultiplyAccumulate(const int8_t *matrix, const int m_rows, const int m_cols,
- const int8_t *vector, const float *scaling_factors,
- int n_batch, float *result, int result_stride)
+inline void MatrixBatchVectorMultiplyAccumulate(const int8_t *matrix, const int m_rows,
+ const int m_cols, const int8_t *vector,
+ const float *scaling_factors, int n_batch,
+ float *result, int result_stride)
{
NEON_OR_PORTABLE(MatrixBatchVectorMultiplyAccumulate, matrix, m_rows, m_cols, vector,
scaling_factors, n_batch, result, result_stride);
}
-void MatrixBatchVectorMultiplyAccumulate(const float *matrix, int m_rows, int m_cols,
- const float *vector, int n_batch, float *result,
- int result_stride)
+inline void MatrixBatchVectorMultiplyAccumulate(const float *matrix, int m_rows, int m_cols,
+ const float *vector, int n_batch, float *result,
+ int result_stride)
{
NEON_OR_PORTABLE(MatrixBatchVectorMultiplyAccumulate, matrix, m_rows, m_cols, vector, n_batch,
result, result_stride);
}
-void MatrixBatchVectorMultiplyAccumulate(const int8_t *matrix, const int m_rows, const int m_cols,
- const int8_t *vectors, const float *scaling_factors,
- int n_batch, int32_t *scratch, float *result,
- int result_stride, ruy::Context *ruy_context)
+inline void MatrixBatchVectorMultiplyAccumulate(const int8_t *matrix, const int m_rows,
+ const int m_cols, const int8_t *vectors,
+ const float *scaling_factors, int n_batch,
+ int32_t *scratch, float *result, int result_stride,
+ ruy::Context *ruy_context)
{
NEON_OR_PORTABLE(MatrixBatchVectorMultiplyAccumulate, matrix, m_rows, m_cols, vectors,
scaling_factors, n_batch, scratch, result, result_stride, ruy_context);
}
-void ZeroVector(float *vector, int v_size) { PortableZeroVector(vector, v_size); }
+inline void MeanStddevNormalization(const float *input_vector, float *output_vector, int v_size,
+ int n_batch)
+{
+ PortableMeanStddevNormalization(input_vector, output_vector, v_size, n_batch);
+}
+
+inline void ZeroVector(float *vector, int v_size) { PortableZeroVector(vector, v_size); }
} // namespace cker
} // namespace nnfw
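
The templated cwise helpers added above (VectorVectorCwiseProduct, the accumulate variants, CwiseClipping, and friends) are the building blocks the new LSTM kernel needs for its gate math. A hedged sketch of how they compose for a cell-state update; the buffer names and the 3.0f clip value are illustrative, not taken from this patch:

#include "cker/TensorUtils.h"

void CellStateUpdateSketch(const float *forget_gate, const float *cell_state_in,
                           const float *input_gate, const float *cell_input,
                           float *cell_state_out, int n_cell)
{
  // cell_state_out = forget_gate * cell_state_in   (elementwise product)
  nnfw::cker::VectorVectorCwiseProduct(forget_gate, cell_state_in, n_cell, cell_state_out);
  // cell_state_out += input_gate * cell_input      (elementwise multiply-accumulate)
  nnfw::cker::VectorVectorCwiseProductAccumulate(input_gate, cell_input, n_cell, cell_state_out);
  // Clamp the cell state to [-3.0, 3.0] via the new CwiseClipping dispatch.
  nnfw::cker::CwiseClipping(cell_state_out, n_cell, 3.0f);
}
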
diff --git a/compute/cker/include/cker/Types.h b/compute/cker/include/cker/Types.h
index c0c9313ea..3fd0cf5b6 100644
--- a/compute/cker/include/cker/Types.h
+++ b/compute/cker/include/cker/Types.h
@@ -34,6 +34,8 @@ enum class FusedActivationFunctionType
kRelu6 = 1,
kRelu1 = 2,
kRelu = 3,
+ kTanh = 4,
+ kSigmoid = 6,
};
enum class PaddingType
{
@@ -78,8 +80,6 @@ enum class BroadcastableOpCategory : uint8_t
struct PoolParams
{
- FusedActivationFunctionType activation;
- PaddingType padding_type;
PaddingValues padding_values;
int stride_height;
int stride_width;
@@ -109,6 +109,8 @@ struct SoftmaxParams
int32_t zero_point;
float scale;
float *table;
+ uint8_t *uint8_table1;
+ uint8_t *uint8_table2;
};
struct PackParams
@@ -170,25 +172,25 @@ struct ComparisonParams
struct BinaryArithmeticOpParam
{
// Shape dependent / common to data / op types.
- BroadcastableOpCategory broadcast_category;
+ BroadcastableOpCategory broadcast_category{BroadcastableOpCategory::kNone};
// uint8 inference params.
- int32_t input1_offset;
- int32_t input2_offset;
- int32_t output_offset;
- int32_t output_multiplier;
- int32_t output_shift;
+ int32_t input1_offset = 0;
+ int32_t input2_offset = 0;
+ int32_t output_offset = 0;
+ int32_t output_multiplier = 0;
+ int32_t output_shift = 0;
// Add / Sub, not Mul, uint8 inference params.
- int32_t left_shift;
- int32_t input1_multiplier;
- int32_t input1_shift;
- int32_t input2_multiplier;
- int32_t input2_shift;
+ int32_t left_shift = 0;
+ int32_t input1_multiplier = 0;
+ int32_t input1_shift = 0;
+ int32_t input2_multiplier = 0;
+ int32_t input2_shift = 0;
// uint8, etc, activation params.
- int32_t quantized_activation_min;
- int32_t quantized_activation_max;
+ int32_t quantized_activation_min = 0;
+ int32_t quantized_activation_max = 0;
// float activation params.
- float float_activation_min;
- float float_activation_max;
+ float float_activation_min = 0;
+ float float_activation_max = 0;
// Processed output dimensions.
// Let input "a" be the one that broadcasts in the faster-changing dimension.
@@ -256,9 +258,12 @@ struct FullyConnectedParams
// uint8, etc, activation params.
int32_t quantized_activation_min;
int32_t quantized_activation_max;
- // float activation params.
+ // float activation params
float float_activation_min;
float float_activation_max;
+ // Mark the operands as cacheable if they are unchanging, e.g. weights.
+ bool lhs_cacheable;
+ bool rhs_cacheable;
// FullyConnectedWeightsFormat weights_format;
};
@@ -268,6 +273,27 @@ struct L2NormParams
int32_t input_zero_point;
};
+enum LSTMKernelType
+{
+ kTfLiteLSTMFullKernel = 0,
+ kTfLiteLSTMBasicKernel
+};
+
+struct LSTMParams
+{
+ // Parameters for LSTM version 1.
+ FusedActivationFunctionType activation{FusedActivationFunctionType::kNone};
+ float cell_clip;
+ float proj_clip;
+
+ // Parameters for LSTM version 2.
+ // kTfLiteLSTMBasicKernel is only supported in version 2 or above.
+ LSTMKernelType kernel_type;
+
+ // Parameters for LSTM version 4.
+ bool asymmetric_quantize_inputs;
+};
+
struct GatherParams
{
int32_t axis;
@@ -366,12 +392,24 @@ struct SpaceToDepthParams
int32_t block_size;
};
+struct LeakyReluParams
+{
+ float alpha;
+};
+
enum class Order
{
kColMajor,
kRowMajor
};
+enum class CachePolicy : std::uint8_t
+{
+ kNeverCache,
+ kCacheIfLargeSpeedup,
+ kAlwaysCache,
+};
+
// MatrixParams encapsulates the parameters that Gemm needs about each
// matrix, besides the buffer data pointer.
// Compare to ruy::Matrix, which also encapsulates the data pointer.
@@ -390,10 +428,13 @@ template <typename Scalar> struct MatrixParams
// The zero_point, i.e. which Scalar value is to be interpreted as zero.
// When Scalar is floating-point, this must be 0.
Scalar zero_point = 0;
- // Indicate whether the underlying data will remain unchanged for
- // some period of time. Defaults to false, but should be set to true
- // for unchanging data (e.g. weights buffers in many cases)
- bool cacheable = false;
+ // When the data pointed to by this matrix is constant, so that it is valid
+ // to assume that equality of pointers implies equality of data, a
+ // CachePolicy other than the default kNeverCache may be used. That lets
+ // ruy cache the packing work for the constant data, which can be a large
+ // speedup in matrix*vector and other narrow shapes.
+ CachePolicy cache_policy = CachePolicy::kNeverCache;
};
// Enumeration of broad categories of Gemm.
@@ -442,9 +483,9 @@ enum class QuantizationFlavor
// (only those that need perchannel quantization do).
template <typename AccumScalar, typename DstScalar,
QuantizationFlavor quantization_flavor =
- std::is_floating_point<AccumScalar>::value
- ? QuantizationFlavor::kFloatingPoint
- : QuantizationFlavor::kIntegerWithUniformMultiplier>
+ std::is_floating_point<AccumScalar>::value
+ ? QuantizationFlavor::kFloatingPoint
+ : QuantizationFlavor::kIntegerWithUniformMultiplier>
struct GemmParams
{
// Only for non-floating-point cases. The fixed-point part (i.e. the mantissa)
@@ -471,12 +512,12 @@ struct GemmParams
const AccumScalar *bias = nullptr;
// min clamp bound of destination values.
DstScalar clamp_min = std::is_floating_point<DstScalar>::value
- ? -std::numeric_limits<DstScalar>::infinity()
- : std::numeric_limits<DstScalar>::lowest();
+ ? -std::numeric_limits<DstScalar>::infinity()
+ : std::numeric_limits<DstScalar>::lowest();
// max clamp bound of destination values.
DstScalar clamp_max = std::is_floating_point<DstScalar>::value
- ? std::numeric_limits<DstScalar>::infinity()
- : std::numeric_limits<DstScalar>::max();
+ ? std::numeric_limits<DstScalar>::infinity()
+ : std::numeric_limits<DstScalar>::max();
};
// Validates self-consistency of GemmParams.
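
The MatrixParams hunk above replaces the boolean cacheable flag with a CachePolicy, and FullyConnectedParams gains lhs_cacheable/rhs_cacheable. A minimal sketch of how a caller could bridge the two; the ToCachePolicy and SetCachePolicies helpers are hypothetical and not part of this patch:

// Assumes this lives inside namespace nnfw::cker with cker/Types.h included.
inline CachePolicy ToCachePolicy(bool cacheable)
{
  return cacheable ? CachePolicy::kAlwaysCache : CachePolicy::kNeverCache;
}

inline void SetCachePolicies(const FullyConnectedParams &fc, MatrixParams<float> &lhs,
                             MatrixParams<float> &rhs)
{
  lhs.cache_policy = ToCachePolicy(fc.lhs_cacheable); // weights: constant across calls
  rhs.cache_policy = CachePolicy::kNeverCache;        // activations: change every call
}
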
diff --git a/compute/cker/include/cker/Utils.h b/compute/cker/include/cker/Utils.h
index 2abb998d0..9aae0a957 100644
--- a/compute/cker/include/cker/Utils.h
+++ b/compute/cker/include/cker/Utils.h
@@ -20,6 +20,8 @@
#include "Shape.h"
+#include "neon/neon_check.h"
+
#include <algorithm>
#include <cstdint>
#include <fixedpoint/fixedpoint.h>
@@ -29,6 +31,11 @@ namespace nnfw
namespace cker
{
+template <typename T> struct is_quant8
+{
+ static constexpr bool value = std::is_same<T, uint8_t>::value || std::is_same<T, int8_t>::value;
+};
+
template <typename T>
inline T ActivationFunctionWithMinMax(T x, T output_activation_min, T output_activation_max)
{
@@ -88,8 +95,8 @@ inline int32_t MultiplyByQuantizedMultiplier(int32_t x, int32_t quantized_multip
int left_shift = shift > 0 ? shift : 0;
int right_shift = shift > 0 ? 0 : -shift;
return gemmlowp::RoundingDivideByPOT(
- gemmlowp::SaturatingRoundingDoublingHighMul(x * (1 << left_shift), quantized_multiplier),
- right_shift);
+ gemmlowp::SaturatingRoundingDoublingHighMul(x * (1 << left_shift), quantized_multiplier),
+ right_shift);
}
inline int32_t MultiplyByQuantizedMultiplierGreaterThanOne(int32_t x, int32_t quantized_multiplier,
@@ -103,8 +110,36 @@ inline int32_t MultiplyByQuantizedMultiplierSmallerThanOneExp(int32_t x,
int left_shift)
{
return gemmlowp::RoundingDivideByPOT(
- gemmlowp::SaturatingRoundingDoublingHighMul(x, quantized_multiplier), -left_shift);
+ gemmlowp::SaturatingRoundingDoublingHighMul(x, quantized_multiplier), -left_shift);
+}
+
+#ifdef USE_NEON
+inline int32x4x4_t MultiplyByQuantizedMultiplier4Rows(int32x4x4_t input_val,
+ int32_t quantized_multiplier, int32_t shift)
+{
+ const int left_shift = std::max(shift, 0);
+ const int right_shift = std::min(shift, 0);
+ int32x4x4_t result;
+
+ int32x4_t multiplier_dup = vdupq_n_s32(quantized_multiplier);
+ int32x4_t left_shift_dup = vdupq_n_s32(left_shift);
+ int32x4_t right_shift_dup = vdupq_n_s32(right_shift);
+
+ result.val[0] = vrshlq_s32(
+ vqrdmulhq_s32(vshlq_s32(input_val.val[0], left_shift_dup), multiplier_dup), right_shift_dup);
+
+ result.val[1] = vrshlq_s32(
+ vqrdmulhq_s32(vshlq_s32(input_val.val[1], left_shift_dup), multiplier_dup), right_shift_dup);
+
+ result.val[2] = vrshlq_s32(
+ vqrdmulhq_s32(vshlq_s32(input_val.val[2], left_shift_dup), multiplier_dup), right_shift_dup);
+
+ result.val[3] = vrshlq_s32(
+ vqrdmulhq_s32(vshlq_s32(input_val.val[3], left_shift_dup), multiplier_dup), right_shift_dup);
+
+ return result;
}
+#endif
inline int NodeOffset(int b, int h, int w, int height, int width)
{
@@ -162,7 +197,7 @@ inline void GetInvSqrtQuantizedMultiplierExp(int32_t input, int reverse_shift,
const F3 fixedpoint_input = F3::FromRaw(input >> 1);
const F3 fixedpoint_half_input = SaturatingRoundingMultiplyByPOT<-1>(fixedpoint_input);
const F3 fixedpoint_half_three =
- GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT(F3, (1 << 28) + (1 << 27), 1.5);
+ GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT(F3, (1 << 28) + (1 << 27), 1.5);
// Newton-Raphson iteration
// Naive unoptimized starting guess: x = 1
F3 x = F3::One();
@@ -173,7 +208,7 @@ inline void GetInvSqrtQuantizedMultiplierExp(int32_t input, int reverse_shift,
x = Rescale<3>(fixedpoint_half_three * x - fixedpoint_half_input * x3);
}
const F0 fixedpoint_half_sqrt_2 =
- GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT(F0, 1518500250, std::sqrt(2.) / 2.);
+ GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT(F0, 1518500250, std::sqrt(2.) / 2.);
x = x * fixedpoint_half_sqrt_2;
*output_inv_sqrt = x.raw();
if (*output_shift < 0)
@@ -429,7 +464,7 @@ template <typename T> class SequentialTensorWriter
{
public:
SequentialTensorWriter(const T *input_data, T *output_data)
- : input_data_(input_data), output_ptr_(output_data)
+ : input_data_(input_data), output_ptr_(output_data)
{
}
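
The is_quant8 trait added to Utils.h above is what lets BinaryArithmeticOps.h (further down in this diff) fold its separate uint8_t overloads into one template gated by std::enable_if_t. A self-contained sketch of that dispatch pattern, using a placeholder Dispatch function; the trait is duplicated here only so the sketch stands alone:

#include <cstdint>
#include <type_traits>

template <typename T> struct is_quant8
{
  static constexpr bool value = std::is_same<T, uint8_t>::value || std::is_same<T, int8_t>::value;
};

// Selected when T is uint8_t or int8_t: the quantized code path.
template <typename T>
typename std::enable_if_t<is_quant8<T>::value> Dispatch(const T * /*input*/, T * /*output*/)
{
  // quantized kernel would run here
}

// Selected for every other element type (float, int32_t, ...): the generic path.
template <typename T>
typename std::enable_if_t<!is_quant8<T>::value> Dispatch(const T * /*input*/, T * /*output*/)
{
  // generic kernel would run here
}

// Dispatch<uint8_t>(...) picks the first overload, Dispatch<float>(...) the second.
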
diff --git a/compute/cker/include/cker/eigen/EigenSupport.h b/compute/cker/include/cker/eigen/EigenSupport.h
index 49c34211a..e3b10990e 100644
--- a/compute/cker/include/cker/eigen/EigenSupport.h
+++ b/compute/cker/include/cker/eigen/EigenSupport.h
@@ -39,17 +39,17 @@ namespace eigen_support
// library.
typedef Eigen::TensorMap<Eigen::Tensor<float, 2, Eigen::RowMajor, Eigen::DenseIndex>,
Eigen::Aligned>
- EigenMatrix;
+ EigenMatrix;
typedef Eigen::TensorMap<Eigen::Tensor<const float, 2, Eigen::RowMajor, Eigen::DenseIndex>,
Eigen::Aligned>
- ConstEigenMatrix;
+ ConstEigenMatrix;
typedef Eigen::TensorMap<Eigen::Tensor<float, 4, Eigen::RowMajor, Eigen::DenseIndex>,
Eigen::Aligned>
- EigenTensor;
+ EigenTensor;
typedef Eigen::TensorMap<Eigen::Tensor<const float, 4, Eigen::RowMajor, Eigen::DenseIndex>,
Eigen::Aligned>
- ConstEigenTensor;
+ ConstEigenTensor;
// Utility functions we need for the EigenTensor API.
template <typename Device, typename T> struct MatMulConvFunctor
diff --git a/compute/cker/include/cker/eigen/Utils.h b/compute/cker/include/cker/eigen/Utils.h
index f9c706370..40cb85432 100644
--- a/compute/cker/include/cker/eigen/Utils.h
+++ b/compute/cker/include/cker/eigen/Utils.h
@@ -36,9 +36,9 @@ namespace cker
// Eigen::Map<Eigen::Matrix<const float, ...>>
template <typename Scalar>
using VectorMap = typename std::conditional<
- std::is_const<Scalar>::value,
- Eigen::Map<const Eigen::Matrix<typename std::remove_const<Scalar>::type, Eigen::Dynamic, 1>>,
- Eigen::Map<Eigen::Matrix<Scalar, Eigen::Dynamic, 1>>>::type;
+ std::is_const<Scalar>::value,
+ Eigen::Map<const Eigen::Matrix<typename std::remove_const<Scalar>::type, Eigen::Dynamic, 1>>,
+ Eigen::Map<Eigen::Matrix<Scalar, Eigen::Dynamic, 1>>>::type;
template <typename Scalar> VectorMap<Scalar> MapAsVector(Scalar *data, const Shape &shape)
{
@@ -51,10 +51,10 @@ template <typename Scalar> VectorMap<Scalar> MapAsVector(Scalar *data, const Sha
// above also applies here.
template <typename Scalar>
using MatrixMap = typename std::conditional<
- std::is_const<Scalar>::value,
- Eigen::Map<const Eigen::Matrix<typename std::remove_const<Scalar>::type, Eigen::Dynamic,
- Eigen::Dynamic>>,
- Eigen::Map<Eigen::Matrix<Scalar, Eigen::Dynamic, Eigen::Dynamic>>>::type;
+ std::is_const<Scalar>::value,
+ Eigen::Map<
+ const Eigen::Matrix<typename std::remove_const<Scalar>::type, Eigen::Dynamic, Eigen::Dynamic>>,
+ Eigen::Map<Eigen::Matrix<Scalar, Eigen::Dynamic, Eigen::Dynamic>>>::type;
template <typename Scalar>
MatrixMap<Scalar> MapAsMatrixWithLastDimAsRows(Scalar *data, const Shape &shape)
diff --git a/compute/cker/include/cker/eigen/eigen_convolution_helpers.h b/compute/cker/include/cker/eigen/eigen_convolution_helpers.h
index dc3e2552d..9d4fd2eaf 100644
--- a/compute/cker/include/cker/eigen/eigen_convolution_helpers.h
+++ b/compute/cker/include/cker/eigen/eigen_convolution_helpers.h
@@ -49,20 +49,19 @@ class TensorEvaluatorHasPartialPacket
public:
template <typename TensorEvaluatorT, typename PacketT, typename IndexT>
static auto functionExistsSfinae(
- typename std::enable_if<
- unpacket_traits<PacketT>::masked_load_available &&
- std::is_same<
- PacketT,
- decltype(std::declval<const TensorEvaluatorT>().template partialPacket<PacketT>(
- std::declval<IndexT>(),
- std::declval<typename unpacket_traits<PacketT>::mask_t>()))>::value>::type *)
- -> std::true_type;
+ typename std::enable_if<
+ unpacket_traits<PacketT>::masked_load_available &&
+ std::is_same<PacketT,
+ decltype(std::declval<const TensorEvaluatorT>().template partialPacket<PacketT>(
+ std::declval<IndexT>(),
+ std::declval<typename unpacket_traits<PacketT>::mask_t>()))>::value>::type *)
+ -> std::true_type;
template <typename TensorEvaluatorT, typename PacketT, typename IndexT>
static auto functionExistsSfinae(...) -> std::false_type;
typedef decltype(
- functionExistsSfinae<TensorEvaluatorType, PacketType, IndexType>(nullptr)) status;
+ functionExistsSfinae<TensorEvaluatorType, PacketType, IndexType>(nullptr)) status;
static constexpr bool value = status::value;
};
@@ -71,9 +70,9 @@ public:
// [from, to) range. If the mask bit is 1, element will be loaded/stored.
template <typename Packet>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
- typename std::enable_if<unpacket_traits<Packet>::masked_load_available,
- typename unpacket_traits<Packet>::mask_t>::type
- mask(int from, int to)
+ typename std::enable_if<unpacket_traits<Packet>::masked_load_available,
+ typename unpacket_traits<Packet>::mask_t>::type
+ mask(int from, int to)
{
const Index packet_size = internal::unpacket_traits<Packet>::size;
eigen_assert(0 <= from && to <= (packet_size + 1) && from < to);
diff --git a/compute/cker/include/cker/eigen/eigen_gemm_eigen.h b/compute/cker/include/cker/eigen/eigen_gemm_eigen.h
new file mode 100644
index 000000000..d4f8fc09d
--- /dev/null
+++ b/compute/cker/include/cker/eigen/eigen_gemm_eigen.h
@@ -0,0 +1,95 @@
+/*
+ * Copyright (c) 2023 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_EGIEN_EIGEN_GEMM_EIGEN_H__
+#define __NNFW_CKER_EGIEN_EIGEN_GEMM_EIGEN_H__
+
+// See b/131835803: in TFLite code, because eigen_spatial_convolutions.h does
+// #define Eigen EigenForTFLite, it is difficult to have any #include of Eigen
+// headers in a header file, as that results in name clashes (compilation
+// errors) depending on the order in which these headers are #included.
+// So we have moved the #include of Eigen here, where we have
+// control over the header #include sequence.
+// #include "third_party/eigen3/Eigen/Core"
+// #include "tensorflow/lite/kernels/cpu_backend_context.h"
+// #include "tensorflow/lite/kernels/cpu_backend_gemm_params.h"
+// #include "tensorflow/lite/kernels/internal/common.h"
+// #include "cker/eigen/eigen_convolution_helpers.h"
+#include "cker/operation/Common.h"
+#include "cker/Types.h"
+
+#include <Eigen/Core>
+
+namespace nnfw
+{
+namespace cker
+{
+namespace detail
+{
+
+// tensorflow/tensorflow/lite/kernels/cpu_backend_gemm_eigen.h and cpu_backend_gemm_eigen.cc
+struct GemmImplUsingEigen
+{
+ static void Run(const MatrixParams<float> &lhs_params, const float *lhs_data,
+ const MatrixParams<float> &rhs_params, const float *rhs_data,
+ const MatrixParams<float> &dst_params, float *dst_data,
+ const GemmParams<float, float> &params)
+ {
+ // This code assumes specific storage orders, encoded in these Eigen types.
+ // These assumptions have been checked by TF_LITE_ASSERT's in the public
+ // Gemm entry point already, before the implementation gets to this point.
+ using EigenMatrixMapRowMajorConst =
+ Eigen::Map<const Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>>;
+ using EigenMatrixMapColMajorConst =
+ Eigen::Map<const Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic, Eigen::ColMajor>>;
+ using EigenMatrixMapColMajorMutable =
+ Eigen::Map<Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic, Eigen::ColMajor>>;
+
+ EigenMatrixMapRowMajorConst eigen_lhs(lhs_data, lhs_params.rows, lhs_params.cols);
+ EigenMatrixMapColMajorConst eigen_rhs(rhs_data, rhs_params.rows, rhs_params.cols);
+ EigenMatrixMapColMajorMutable eigen_dst(dst_data, dst_params.rows, dst_params.cols);
+
+ if (rhs_params.cols == 1)
+ {
+ eigen_dst.col(0).noalias() = eigen_lhs * eigen_rhs.col(0);
+ }
+ else if (lhs_params.rows == 1)
+ {
+ eigen_dst.row(0).noalias() = eigen_lhs.row(0) * eigen_rhs;
+ }
+ else
+ {
+ eigen_dst.noalias() = eigen_lhs * eigen_rhs;
+ }
+
+ if (params.bias)
+ {
+ BiasAndClamp(params.clamp_min, params.clamp_max, dst_params.rows, params.bias,
+ dst_params.rows * dst_params.cols, dst_data);
+ }
+ else
+ {
+ eigen_dst = eigen_dst.cwiseMin(params.clamp_max).cwiseMax(params.clamp_min);
+ }
+ }
+};
+
+} // namespace detail
+} // namespace cker
+} // namespace nnfw
+
+#endif // __NNFW_CKER_EGIEN_EIGEN_GEMM_EIGEN_H__
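
For context, a rough caller-side sketch of the new GemmImplUsingEigen. It is illustrative only and assumes MatrixParams also exposes order/rows/cols fields (as its TFLite counterpart does); only zero_point and cache_policy are visible in this diff:

#include "cker/eigen/eigen_gemm_eigen.h"

// 2x3 (row-major) LHS times 3x1 (col-major) RHS into a 2x1 (col-major)
// destination, matching the storage orders Run() assumes.
void ExampleGemm()
{
  using namespace nnfw::cker;
  const float lhs[6] = {1, 2, 3, 4, 5, 6};
  const float rhs[3] = {1, 0, -1};
  float dst[2] = {};

  MatrixParams<float> lhs_params, rhs_params, dst_params;
  lhs_params.order = Order::kRowMajor; lhs_params.rows = 2; lhs_params.cols = 3;
  rhs_params.order = Order::kColMajor; rhs_params.rows = 3; rhs_params.cols = 1;
  dst_params.order = Order::kColMajor; dst_params.rows = 2; dst_params.cols = 1;

  GemmParams<float, float> gemm_params; // no bias; clamp bounds default to +/-infinity
  detail::GemmImplUsingEigen::Run(lhs_params, lhs, rhs_params, rhs, dst_params, dst, gemm_params);
  // dst is now {-2, -2}: the rhs_params.cols == 1 branch in Run() was taken.
}
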
diff --git a/compute/cker/include/cker/eigen/eigen_spatial_convolutions-inl.h b/compute/cker/include/cker/eigen/eigen_spatial_convolutions-inl.h
index 92e1614d1..c931ac518 100644
--- a/compute/cker/include/cker/eigen/eigen_spatial_convolutions-inl.h
+++ b/compute/cker/include/cker/eigen/eigen_spatial_convolutions-inl.h
@@ -62,30 +62,27 @@ template <typename NewDimension, Index Rows, Index Cols, typename ArgType, typen
typename Scalar_, typename Index, typename nocontract_t, typename contract_t, int Side,
int packet_size, bool inner_dim_contiguous, bool inner_dim_reordered, int Alignment>
class TensorContractionInputMapper<
- Scalar_, Index, Side,
- TensorEvaluator<
- const TensorReshapingOp<NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType>>,
- Device>,
- nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment>
+ Scalar_, Index, Side,
+ TensorEvaluator<
+ const TensorReshapingOp<NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType>>, Device>,
+ nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment>
{
public:
typedef Scalar_ Scalar;
typedef TensorContractionInputMapper<
- Scalar, Index, Side,
- TensorEvaluator<
- const TensorReshapingOp<NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType>>,
- Device>,
- nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment>
- Self;
+ Scalar, Index, Side,
+ TensorEvaluator<
+ const TensorReshapingOp<NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType>>, Device>,
+ nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment>
+ Self;
typedef TensorContractionSubMapper<
- Scalar, Index, Side,
- TensorEvaluator<
- const TensorReshapingOp<NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType>>,
- Device>,
- nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment>
- SubMapper;
+ Scalar, Index, Side,
+ TensorEvaluator<
+ const TensorReshapingOp<NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType>>, Device>,
+ nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment>
+ SubMapper;
typedef SubMapper VectorMapper;
typedef SubMapper LinearMapper;
@@ -95,11 +92,11 @@ public:
EIGEN_DEVICE_FUNC
TensorContractionInputMapper(
- const TensorEvaluator<
- const TensorReshapingOp<NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType>>,
- Device> &tensor,
- const nocontract_t &, const nocontract_t &, const contract_t &, const contract_t &)
- : m_impl(tensor.impl().impl())
+ const TensorEvaluator<
+ const TensorReshapingOp<NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType>>, Device>
+ &tensor,
+ const nocontract_t &, const nocontract_t &, const contract_t &, const contract_t &)
+ : m_impl(tensor.impl().impl())
{
Index patch_rows;
Index patch_depth;
@@ -167,7 +164,7 @@ public:
EIGEN_DEVICE_FUNC
TensorContractionInputMapper(const TensorContractionInputMapper &base_mapper)
- : m_impl(base_mapper.m_impl)
+ : m_impl(base_mapper.m_impl)
{
m_patch_cols = base_mapper.m_patch_cols;
m_num_patches = base_mapper.m_num_patches;
@@ -280,11 +277,10 @@ public:
private:
friend class TensorContractionSubMapper<
- Scalar, Index, Side,
- TensorEvaluator<
- const TensorReshapingOp<NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType>>,
- Device>,
- nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment>;
+ Scalar, Index, Side,
+ TensorEvaluator<
+ const TensorReshapingOp<NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType>>, Device>,
+ nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment>;
// Load coefficient from a patch specified by the "within patch offset"
// (patchId) and the precomputed indices of the first element of the patch.
@@ -298,14 +294,14 @@ private:
const Index colOffset = patchOffset / m_fastColStride;
const Index inputCol = colIndex + colOffset * m_in_col_strides;
const Index origInputCol = (m_patch_col_inflate_strides == 1)
- ? inputCol
- : ((inputCol >= 0) ? (inputCol / m_fastInputColStride) : 0);
+ ? inputCol
+ : ((inputCol >= 0) ? (inputCol / m_fastInputColStride) : 0);
const Index rowOffset = patchOffset - colOffset * m_colStride;
const Index inputRow = rowIndex + rowOffset * m_in_row_strides;
const Index origInputRow = (m_patch_row_inflate_strides == 1)
- ? inputRow
- : ((inputRow >= 0) ? (inputRow / m_fastInputRowStride) : 0);
+ ? inputRow
+ : ((inputRow >= 0) ? (inputRow / m_fastInputRowStride) : 0);
if (origInputCol < 0 || origInputRow < 0 || origInputCol >= m_inputCols ||
origInputRow >= m_inputRows || (inputCol != origInputCol * m_patch_col_inflate_strides) ||
(inputRow != origInputRow * m_patch_row_inflate_strides))
@@ -314,7 +310,7 @@ private:
}
const Index depth = patchId - patchOffset * patchDepth();
const Index inputIndex =
- depth + origInputRow * m_rowInputStride + origInputCol * m_colInputStride + otherIndex;
+ depth + origInputRow * m_rowInputStride + origInputCol * m_colInputStride + otherIndex;
return m_impl.coeff(inputIndex);
}
@@ -338,7 +334,7 @@ private:
}
const Index depth = patchId - patchOffset * patchDepth();
const Index inputIndex =
- depth + inputRow * m_rowInputStride + inputCol * m_colInputStride + otherIndex;
+ depth + inputRow * m_rowInputStride + inputCol * m_colInputStride + otherIndex;
return m_impl.coeff(inputIndex);
}
@@ -390,7 +386,7 @@ private:
// span[0] all the way up to (and including) span[1].
const Index depth = patchId - patchOffsets[0] * patchDepth();
const Index inputIndex =
- depth + inputRows[0] * m_rowInputStride + inputCol * m_colInputStride + otherIndex;
+ depth + inputRows[0] * m_rowInputStride + inputCol * m_colInputStride + otherIndex;
return m_impl.template partialPacket<Packet>(inputIndex - span[0],
mask<Packet>(span[0], span[1] + 1));
}
@@ -445,10 +441,10 @@ private:
// Load partial packets and do bit-wise OR to generate required packet
return internal::por<Packet>(
- loadPartialPacketStandard(rowIndex, colIndex, otherIndex, patchIds[0], spans[0],
- patchOffsets2Cols[0], colOffsets[0]),
- loadPartialPacketStandard(rowIndex, colIndex, otherIndex, patchIds[1], spans[1],
- patchOffsets2Cols[1], colOffsets[1]));
+ loadPartialPacketStandard(rowIndex, colIndex, otherIndex, patchIds[0], spans[0],
+ patchOffsets2Cols[0], colOffsets[0]),
+ loadPartialPacketStandard(rowIndex, colIndex, otherIndex, patchIds[1], spans[1],
+ patchOffsets2Cols[1], colOffsets[1]));
}
// Helper function to load a packet that is present in a single column.
@@ -477,7 +473,7 @@ private:
// no padding
const Index depth = patchId - patchOffsets[0] * patchDepth();
const Index inputIndex =
- depth + inputRows[0] * m_rowInputStride + inputCols[0] * m_colInputStride + otherIndex;
+ depth + inputRows[0] * m_rowInputStride + inputCols[0] * m_colInputStride + otherIndex;
return m_impl.template packet<Unaligned>(inputIndex);
}
return packetWithPossibleZero(patchId, rowIndex, colIndex, otherIndex);
@@ -490,7 +486,7 @@ private:
// load.
template <typename PacketT, typename TensorEvaluatorT>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE typename std::enable_if<
- !TensorEvaluatorHasPartialPacket<TensorEvaluatorT, PacketT, Index>::value, PacketT>::type
+ !TensorEvaluatorHasPartialPacket<TensorEvaluatorT, PacketT, Index>::value, PacketT>::type
loadPacketStandard(Index patchId, Index rowIndex, Index colIndex, Index otherIndex) const
{
const Index packetSize = internal::unpacket_traits<Packet>::size;
@@ -538,7 +534,7 @@ private:
// packets.
template <typename PacketT, typename TensorEvaluatorT>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE typename std::enable_if<
- TensorEvaluatorHasPartialPacket<TensorEvaluatorT, PacketT, Index>::value, PacketT>::type
+ TensorEvaluatorHasPartialPacket<TensorEvaluatorT, PacketT, Index>::value, PacketT>::type
loadPacketStandard(Index patchId, Index rowIndex, Index colIndex, Index otherIndex) const
{
const Index packetSize = internal::unpacket_traits<PacketT>::size;
@@ -604,7 +600,7 @@ private:
// no padding
const Index depth = patchId - patchOffset * patchDepth();
const Index inputIndex =
- depth + inputRow * m_rowInputStride + inputCol * m_colInputStride + otherIndex;
+ depth + inputRow * m_rowInputStride + inputCol * m_colInputStride + otherIndex;
return m_impl.template packet<Unaligned>(inputIndex);
}
@@ -627,10 +623,10 @@ private:
computeBaseIndices(Index patchIndex, Index &rowIndex, Index &colIndex, Index &otherIndex) const
{
const size_t NumInputDims =
- array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
+ array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
otherIndex = (NumInputDims == 3) ? 0 : patchIndex / m_fastNumPatches;
const Index patch2DIndex =
- (NumInputDims == 3) ? patchIndex : (patchIndex - otherIndex * m_num_patches);
+ (NumInputDims == 3) ? patchIndex : (patchIndex - otherIndex * m_num_patches);
otherIndex *= m_patchInputStride;
colIndex = patch2DIndex / m_fastOutputRows;
rowIndex = patch2DIndex - colIndex * m_outputRows;
@@ -689,31 +685,28 @@ template <typename NewDimension, Index Rows, Index Cols, typename ArgType, typen
typename Scalar, typename Index, typename nocontract_t, typename contract_t, int Side,
int packet_size, bool inner_dim_contiguous, bool inner_dim_reordered, int Alignment>
class TensorContractionSubMapper<
- Scalar, Index, Side,
- TensorEvaluator<
- const TensorReshapingOp<NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType>>,
- Device>,
- nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment>
+ Scalar, Index, Side,
+ TensorEvaluator<
+ const TensorReshapingOp<NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType>>, Device>,
+ nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment>
{
public:
typedef typename packet_traits<Scalar>::type Packet;
typedef typename packet_traits<Scalar>::half HalfPacket;
typedef TensorContractionInputMapper<
- Scalar, Index, Side,
- TensorEvaluator<
- const TensorReshapingOp<NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType>>,
- Device>,
- nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment>
- ParentMapper;
+ Scalar, Index, Side,
+ TensorEvaluator<
+ const TensorReshapingOp<NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType>>, Device>,
+ nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment>
+ ParentMapper;
typedef TensorContractionSubMapper<
- Scalar, Index, Side,
- TensorEvaluator<
- const TensorReshapingOp<NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType>>,
- Device>,
- nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment>
- Self;
+ Scalar, Index, Side,
+ TensorEvaluator<
+ const TensorReshapingOp<NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType>>, Device>,
+ nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment>
+ Self;
typedef Self LinearMapper;
@@ -722,16 +715,16 @@ public:
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorContractionSubMapper(const ParentMapper &base_mapper,
Index vert_offset,
Index horiz_offset)
- : m_depth_offset(vert_offset), m_col_offset(horiz_offset), m_base_mapper(base_mapper)
+ : m_depth_offset(vert_offset), m_col_offset(horiz_offset), m_base_mapper(base_mapper)
{
m_base_mapper.computeBaseIndices(m_col_offset, m_rowIndex, m_colIndex, m_otherIndex);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorContractionSubMapper(const Self &base_mapper,
Index vert_offset,
Index horiz_offset)
- : m_depth_offset(vert_offset + base_mapper.m_depth_offset),
- m_col_offset(horiz_offset + base_mapper.m_col_offset),
- m_base_mapper(base_mapper.m_base_mapper)
+ : m_depth_offset(vert_offset + base_mapper.m_depth_offset),
+ m_col_offset(horiz_offset + base_mapper.m_col_offset),
+ m_base_mapper(base_mapper.m_base_mapper)
{
m_base_mapper.computeBaseIndices(m_col_offset, m_rowIndex, m_colIndex, m_otherIndex);
}
@@ -766,7 +759,7 @@ public:
{
typedef decltype(m_base_mapper.m_impl) TensorEvaluatorT;
return m_base_mapper.template loadPacketStandard<Packet, TensorEvaluatorT>(
- i + m_depth_offset, m_rowIndex, m_colIndex, m_otherIndex);
+ i + m_depth_offset, m_rowIndex, m_colIndex, m_otherIndex);
}
template <typename Packet> EIGEN_DEVICE_FUNC bool aligned(Index) const { return false; }
@@ -781,7 +774,7 @@ public:
EIGEN_ALWAYS_INLINE Index maxCol(const Index peeled_k) const
{
const Index max_col =
- (m_depth_offset + (peeled_k == 0 ? 0 : peeled_k - 1)) / fastPatchColStride();
+ (m_depth_offset + (peeled_k == 0 ? 0 : peeled_k - 1)) / fastPatchColStride();
return std::min<Index>(1 + max_col, patchCols());
}
@@ -789,8 +782,8 @@ public:
EIGEN_ALWAYS_INLINE Index maxRow(const Index peeled_k, const Index col) const
{
const Index max_row =
- (m_depth_offset + (peeled_k == 0 ? 0 : peeled_k - 1) - col * patchColStride()) /
- fastPatchRowStride();
+ (m_depth_offset + (peeled_k == 0 ? 0 : peeled_k - 1) - col * patchColStride()) /
+ fastPatchRowStride();
return std::min<Index>(1 + max_row, patchRows());
}
@@ -862,7 +855,7 @@ public:
}
template <typename PacketT = Packet>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE typename std::enable_if<
- TensorEvaluatorHasPartialPacket<TensorEvaluatorT, PacketT, Index>::value, PacketT>::type
+ TensorEvaluatorHasPartialPacket<TensorEvaluatorT, PacketT, Index>::value, PacketT>::type
partialPacketNoPadding(const Index depth, const Index baseIndex, Index num_coeffs) const
{
const Index inputIndex = depth + baseIndex;
@@ -913,8 +906,8 @@ public:
const Index input_row = m_rowIndex + row * m_base_mapper.m_in_row_strides;
*orig_row = (m_base_mapper.m_patch_row_inflate_strides == 1)
- ? input_row
- : ((input_row >= 0) ? (input_row / m_base_mapper.m_fastInputRowStride) : 0);
+ ? input_row
+ : ((input_row >= 0) ? (input_row / m_base_mapper.m_fastInputRowStride) : 0);
return (*orig_row < 0 || *orig_row >= m_base_mapper.m_inputRows) ||
(input_row != *orig_row * m_base_mapper.m_patch_row_inflate_strides);
@@ -932,8 +925,8 @@ public:
const Index input_col = m_colIndex + col * m_base_mapper.m_in_col_strides;
*orig_col = (m_base_mapper.m_patch_col_inflate_strides == 1)
- ? input_col
- : ((input_col >= 0) ? (input_col / m_base_mapper.m_fastInputColStride) : 0);
+ ? input_col
+ : ((input_col >= 0) ? (input_col / m_base_mapper.m_fastInputColStride) : 0);
return (*orig_col < 0 || *orig_col >= m_base_mapper.m_inputCols) ||
(input_col != *orig_col * m_base_mapper.m_patch_col_inflate_strides);
@@ -1033,23 +1026,20 @@ template <typename NewDimension, Index Rows, Index Cols, typename ArgType, typen
int packet_size, bool inner_dim_contiguous, bool inner_dim_reordered, int Alignment,
int nr>
struct gemm_pack_rhs<
- Scalar, Index,
- TensorContractionSubMapper<
- Scalar, Index, Rhs,
- TensorEvaluator<
- const TensorReshapingOp<NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType>>,
- Device>,
- nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered,
- Alignment>,
- nr, ColMajor, false, false>
+ Scalar, Index,
+ TensorContractionSubMapper<
+ Scalar, Index, Rhs,
+ TensorEvaluator<
+ const TensorReshapingOp<NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType>>, Device>,
+ nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment>,
+ nr, ColMajor, false, false>
{
typedef TensorContractionSubMapper<
- Scalar, Index, Rhs,
- TensorEvaluator<
- const TensorReshapingOp<NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType>>,
- Device>,
- nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment>
- SubMapper;
+ Scalar, Index, Rhs,
+ TensorEvaluator<
+ const TensorReshapingOp<NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType>>, Device>,
+ nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment>
+ SubMapper;
typedef SubMapper DataMapper;
typedef typename packet_traits<Scalar>::type Packet;
@@ -1159,7 +1149,7 @@ struct gemm_pack_rhs<
const Index idx3 = dm3.baseIndex(r, c);
const Index start_depth =
- ((c == start_col) && (r == start_row)) ? rhs.depthOffset() : 0;
+ ((c == start_col) && (r == start_row)) ? rhs.depthOffset() : 0;
const Index max_depth = rhs.maxDepth(peeled_k - k, start_depth);
eigen_assert((max_depth - start_depth) % packet_size == 0);
@@ -1248,22 +1238,20 @@ template <typename NewDimension, Index Rows, Index Cols, typename ArgType, typen
typename Scalar, typename Index, typename nocontract_t, typename contract_t,
bool inner_dim_contiguous, bool inner_dim_reordered, int Alignment, int nr>
struct gemm_pack_rhs<
- Scalar, Index,
- TensorContractionSubMapper<
- Scalar, Index, Rhs,
- TensorEvaluator<
- const TensorReshapingOp<NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType>>,
- Device>,
- nocontract_t, contract_t, 2, inner_dim_contiguous, inner_dim_reordered, Alignment>,
- nr, ColMajor, false, false>
+ Scalar, Index,
+ TensorContractionSubMapper<
+ Scalar, Index, Rhs,
+ TensorEvaluator<
+ const TensorReshapingOp<NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType>>, Device>,
+ nocontract_t, contract_t, 2, inner_dim_contiguous, inner_dim_reordered, Alignment>,
+ nr, ColMajor, false, false>
{
typedef TensorContractionSubMapper<
- Scalar, Index, Rhs,
- TensorEvaluator<
- const TensorReshapingOp<NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType>>,
- Device>,
- nocontract_t, contract_t, 2, inner_dim_contiguous, inner_dim_reordered, Alignment>
- SubMapper;
+ Scalar, Index, Rhs,
+ TensorEvaluator<
+ const TensorReshapingOp<NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType>>, Device>,
+ nocontract_t, contract_t, 2, inner_dim_contiguous, inner_dim_reordered, Alignment>
+ SubMapper;
typedef SubMapper DataMapper;
typedef typename packet_traits<Scalar>::type Packet;
@@ -1378,7 +1366,7 @@ struct gemm_pack_rhs<
const Index idx3 = dm3.baseIndex(r, c);
const Index start_depth =
- ((c == start_col) && (r == start_row)) ? rhs.depthOffset() : 0;
+ ((c == start_col) && (r == start_row)) ? rhs.depthOffset() : 0;
const Index max_depth = rhs.maxDepth(peeled_k - k, start_depth);
eigen_assert((max_depth - start_depth) % packet_size == 0);
@@ -1472,22 +1460,20 @@ template <typename NewDimension, Index Rows, Index Cols, typename ArgType, typen
typename Scalar, typename Index, typename nocontract_t, typename contract_t,
bool inner_dim_contiguous, bool inner_dim_reordered, int Alignment, int nr>
struct gemm_pack_rhs<
- Scalar, Index,
- TensorContractionSubMapper<
- Scalar, Index, Rhs,
- TensorEvaluator<
- const TensorReshapingOp<NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType>>,
- Device>,
- nocontract_t, contract_t, 1, inner_dim_contiguous, inner_dim_reordered, Alignment>,
- nr, ColMajor, false, false>
+ Scalar, Index,
+ TensorContractionSubMapper<
+ Scalar, Index, Rhs,
+ TensorEvaluator<
+ const TensorReshapingOp<NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType>>, Device>,
+ nocontract_t, contract_t, 1, inner_dim_contiguous, inner_dim_reordered, Alignment>,
+ nr, ColMajor, false, false>
{
typedef TensorContractionSubMapper<
- Scalar, Index, Rhs,
- TensorEvaluator<
- const TensorReshapingOp<NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType>>,
- Device>,
- nocontract_t, contract_t, 1, inner_dim_contiguous, inner_dim_reordered, Alignment>
- SubMapper;
+ Scalar, Index, Rhs,
+ TensorEvaluator<
+ const TensorReshapingOp<NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType>>, Device>,
+ nocontract_t, contract_t, 1, inner_dim_contiguous, inner_dim_reordered, Alignment>
+ SubMapper;
typedef SubMapper DataMapper;
EIGEN_STATIC_ASSERT((nr == 4), YOU_MADE_A_PROGRAMMING_MISTAKE)
@@ -1582,27 +1568,25 @@ struct gemm_pack_rhs<
*/
template <typename Input, typename Kernel, typename OutputKernel = const NoOpOutputKernel>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE static const typename internal::conditional<
- internal::traits<Input>::Layout == ColMajor,
- TensorReshapingOp<
- const DSizes<typename internal::traits<Input>::Index,
- internal::traits<Input>::NumDimensions>,
- const TensorContractionOp<
- const array<IndexPair<typename internal::traits<Input>::Index>, 1>,
- const TensorReshapingOp<const DSizes<typename internal::traits<Input>::Index, 2>,
- const Kernel>,
- const TensorReshapingOp<const DSizes<typename internal::traits<Input>::Index, 2>,
- const TensorImagePatchOp<Dynamic, Dynamic, const Input>>,
- const OutputKernel>>,
- TensorReshapingOp<
- const DSizes<typename internal::traits<Input>::Index,
- internal::traits<Input>::NumDimensions>,
- const TensorContractionOp<
- const array<IndexPair<typename internal::traits<Input>::Index>, 1>,
- const TensorReshapingOp<const DSizes<typename internal::traits<Input>::Index, 2>,
- const TensorImagePatchOp<Dynamic, Dynamic, const Input>>,
- const TensorReshapingOp<const DSizes<typename internal::traits<Input>::Index, 2>,
- const Kernel>,
- const OutputKernel>>>::type
+ internal::traits<Input>::Layout == ColMajor,
+ TensorReshapingOp<
+ const DSizes<typename internal::traits<Input>::Index, internal::traits<Input>::NumDimensions>,
+ const TensorContractionOp<
+ const array<IndexPair<typename internal::traits<Input>::Index>, 1>,
+ const TensorReshapingOp<const DSizes<typename internal::traits<Input>::Index, 2>,
+ const Kernel>,
+ const TensorReshapingOp<const DSizes<typename internal::traits<Input>::Index, 2>,
+ const TensorImagePatchOp<Dynamic, Dynamic, const Input>>,
+ const OutputKernel>>,
+ TensorReshapingOp<
+ const DSizes<typename internal::traits<Input>::Index, internal::traits<Input>::NumDimensions>,
+ const TensorContractionOp<
+ const array<IndexPair<typename internal::traits<Input>::Index>, 1>,
+ const TensorReshapingOp<const DSizes<typename internal::traits<Input>::Index, 2>,
+ const TensorImagePatchOp<Dynamic, Dynamic, const Input>>,
+ const TensorReshapingOp<const DSizes<typename internal::traits<Input>::Index, 2>,
+ const Kernel>,
+ const OutputKernel>>>::type
SpatialConvolution(const Input &input, const Kernel &kernel, const Index row_stride = 1,
const Index col_stride = 1, const PaddingType padding_type = PADDING_SAME,
const Index row_in_stride = 1, const Index col_in_stride = 1,
@@ -1612,11 +1596,11 @@ SpatialConvolution(const Input &input, const Kernel &kernel, const Index row_str
typedef typename internal::traits<Input>::Index TensorIndex;
TensorRef<Tensor<typename internal::traits<Input>::Scalar, internal::traits<Input>::NumDimensions,
internal::traits<Input>::Layout, TensorIndex>>
- in(input);
+ in(input);
TensorRef<
- Tensor<typename internal::traits<Kernel>::Scalar, internal::traits<Kernel>::NumDimensions,
- internal::traits<Kernel>::Layout, TensorIndex>>
- kern(kernel);
+ Tensor<typename internal::traits<Kernel>::Scalar, internal::traits<Kernel>::NumDimensions,
+ internal::traits<Kernel>::Layout, TensorIndex>>
+ kern(kernel);
EIGEN_STATIC_ASSERT(internal::traits<Input>::Layout == internal::traits<Kernel>::Layout,
YOU_MADE_A_PROGRAMMING_MISTAKE)
@@ -1735,46 +1719,46 @@ SpatialConvolution(const Input &input, const Kernel &kernel, const Index row_str
}
if (padding_explicit)
{
- return choose(
- Cond<internal::traits<Input>::Layout == ColMajor>(),
- kernel.reshape(kernel_dims)
- .contract(input
- .extract_image_patches(kernelRows, kernelCols, row_stride, col_stride,
- row_in_stride, col_in_stride,
- /*row_inflate_stride=*/1,
- /*col_inflate_stride=*/1, padding_top,
- padding_bottom, padding_left, padding_right,
- /*padding_value=*/0)
- .reshape(pre_contract_dims),
- contract_dims, output_kernel)
- .reshape(post_contract_dims),
- input
- .extract_image_patches(
- kernelRows, kernelCols, row_stride, col_stride, row_in_stride, col_in_stride,
- /*row_inflate_stride=*/1,
- /*col_inflate_stride=*/1, padding_top, padding_bottom, padding_left, padding_right,
- /*padding_value=*/0)
- .reshape(pre_contract_dims)
- .contract(kernel.reshape(kernel_dims), contract_dims, output_kernel)
- .reshape(post_contract_dims));
+ return choose(Cond<internal::traits<Input>::Layout == ColMajor>(),
+ kernel.reshape(kernel_dims)
+ .contract(input
+ .extract_image_patches(kernelRows, kernelCols, row_stride,
+ col_stride, row_in_stride, col_in_stride,
+ /*row_inflate_stride=*/1,
+ /*col_inflate_stride=*/1, padding_top,
+ padding_bottom, padding_left, padding_right,
+ /*padding_value=*/0)
+ .reshape(pre_contract_dims),
+ contract_dims, output_kernel)
+ .reshape(post_contract_dims),
+ input
+ .extract_image_patches(kernelRows, kernelCols, row_stride, col_stride,
+ row_in_stride, col_in_stride,
+ /*row_inflate_stride=*/1,
+ /*col_inflate_stride=*/1, padding_top, padding_bottom,
+ padding_left, padding_right,
+ /*padding_value=*/0)
+ .reshape(pre_contract_dims)
+ .contract(kernel.reshape(kernel_dims), contract_dims, output_kernel)
+ .reshape(post_contract_dims));
}
else
{
return choose(
- Cond<internal::traits<Input>::Layout == ColMajor>(),
- kernel.reshape(kernel_dims)
- .contract(input
- .extract_image_patches(kernelRows, kernelCols, row_stride, col_stride,
- row_in_stride, col_in_stride, padding_type)
- .reshape(pre_contract_dims),
- contract_dims, output_kernel)
- .reshape(post_contract_dims),
- input
- .extract_image_patches(kernelRows, kernelCols, row_stride, col_stride, row_in_stride,
- col_in_stride, padding_type)
- .reshape(pre_contract_dims)
- .contract(kernel.reshape(kernel_dims), contract_dims, output_kernel)
- .reshape(post_contract_dims));
+ Cond<internal::traits<Input>::Layout == ColMajor>(),
+ kernel.reshape(kernel_dims)
+ .contract(input
+ .extract_image_patches(kernelRows, kernelCols, row_stride, col_stride,
+ row_in_stride, col_in_stride, padding_type)
+ .reshape(pre_contract_dims),
+ contract_dims, output_kernel)
+ .reshape(post_contract_dims),
+ input
+ .extract_image_patches(kernelRows, kernelCols, row_stride, col_stride, row_in_stride,
+ col_in_stride, padding_type)
+ .reshape(pre_contract_dims)
+ .contract(kernel.reshape(kernel_dims), contract_dims, output_kernel)
+ .reshape(post_contract_dims));
}
}
diff --git a/compute/cker/include/cker/operation/AddN.h b/compute/cker/include/cker/operation/AddN.h
new file mode 100644
index 000000000..1704da641
--- /dev/null
+++ b/compute/cker/include/cker/operation/AddN.h
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_ADDN_H__
+#define __NNFW_CKER_ADDN_H__
+
+#include "cker/Shape.h"
+
+namespace nnfw
+{
+namespace cker
+{
+
+template <typename T>
+void AddN(const Shape &input_shape, const size_t num_inputs, const T **input_data, T *output_data)
+{
+ const size_t size = input_shape.FlatSize();
+ for (size_t i = 0; i < size; ++i)
+ {
+ T x = 0;
+ for (size_t j = 0; j < num_inputs; ++j)
+ {
+ x += input_data[j][i];
+ }
+ output_data[i] = x;
+ }
+}
+
+} // namespace cker
+} // namespace nnfw
+
+#endif // __NNFW_CKER_ADDN_H__
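
A short usage sketch for the new AddN kernel. The caller code is illustrative and assumes Shape's initializer_list constructor (mirroring TFLite's RuntimeShape); all inputs share input_shape, and each output element is the sum of the corresponding input elements:

#include "cker/operation/AddN.h"

void ExampleAddN()
{
  const float in0[4] = {1.f, 2.f, 3.f, 4.f};
  const float in1[4] = {10.f, 20.f, 30.f, 40.f};
  const float *inputs[] = {in0, in1};
  float out[4] = {};

  // All tensors use the same 1x2x2x1 shape, i.e. 4 elements.
  const nnfw::cker::Shape shape{1, 2, 2, 1};
  nnfw::cker::AddN(shape, /*num_inputs=*/2, inputs, out);
  // out == {11, 22, 33, 44}
}
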
diff --git a/compute/cker/include/cker/operation/AveragePool.h b/compute/cker/include/cker/operation/AveragePool.h
index 6149cafa7..e10f02ad4 100644
--- a/compute/cker/include/cker/operation/AveragePool.h
+++ b/compute/cker/include/cker/operation/AveragePool.h
@@ -73,10 +73,10 @@ void AveragePool<float>(const PoolParams &params, const Shape &input_shape, cons
int hpad = h + params.padding_values.height;
int wpad = w + params.padding_values.width;
int h_start =
- (hpad < params.filter_height) ? 0 : (hpad - params.filter_height) / stride_height + 1;
+ (hpad < params.filter_height) ? 0 : (hpad - params.filter_height) / stride_height + 1;
int h_end = std::min(hpad / stride_height + 1, output_height);
int w_start =
- (wpad < params.filter_width) ? 0 : (wpad - params.filter_width) / stride_width + 1;
+ (wpad < params.filter_width) ? 0 : (wpad - params.filter_width) / stride_width + 1;
int w_end = std::min(wpad / stride_width + 1, output_width);
// compute elementwise sum
for (int ph = h_start; ph < h_end; ++ph)
@@ -146,11 +146,11 @@ inline void AveragePool16(const PoolParams &params, const Shape &input_shape,
const int filter_y_start = std::max(0, -in_y_origin);
const int filter_y_end = std::min(params.filter_height, input_height - in_y_origin);
const int filter_count =
- (filter_x_end - filter_x_start) * (filter_y_end - filter_y_start);
+ (filter_x_end - filter_x_start) * (filter_y_end - filter_y_start);
memset(acc, 0, tranche_depth * sizeof(acc[0]));
const uint8_t *input_ptr =
- input_data + depth_base +
- depth * (in_x_origin + input_width * (in_y_origin + input_height * batch));
+ input_data + depth_base +
+ depth * (in_x_origin + input_width * (in_y_origin + input_height * batch));
for (int fy = filter_y_start; fy < filter_y_end; fy++)
{
const uint8_t *input_row_ptr = input_ptr + depth * (fy * input_width + filter_x_start);
@@ -283,11 +283,11 @@ inline void AveragePool32(const PoolParams &params, const Shape &input_shape,
const int filter_y_start = std::max(0, -in_y_origin);
const int filter_y_end = std::min(params.filter_height, input_height - in_y_origin);
const int filter_count =
- (filter_x_end - filter_x_start) * (filter_y_end - filter_y_start);
+ (filter_x_end - filter_x_start) * (filter_y_end - filter_y_start);
memset(acc, 0, tranche_depth * sizeof(acc[0]));
const uint8_t *input_ptr =
- input_data + depth_base +
- depth * (in_x_origin + input_width * (in_y_origin + input_height * batch));
+ input_data + depth_base +
+ depth * (in_x_origin + input_width * (in_y_origin + input_height * batch));
for (int fy = filter_y_start; fy < filter_y_end; fy++)
{
const uint8_t *input_row_ptr = input_ptr + depth * (fy * input_width + filter_x_start);
@@ -395,6 +395,129 @@ void AveragePool<uint8_t>(const PoolParams &params, const Shape &input_shape,
}
}
+template <>
+void AveragePool<int8_t>(const PoolParams &params, const Shape &input_shape,
+ const int8_t *input_data, const Shape &output_shape, int8_t *output_data)
+{
+ // Here, and in other pooling ops, in order to maintain locality of reference,
+ // to minimize some recalculations, and to load into NEON vector registers, we
+ // use an inner loop down the depth. Since depths can be large, and we would
+ // otherwise need arbitrarily large temporary storage, we divide the work up into
+ // depth tranches just within the batch loop.
+ static constexpr int kPoolingAccTrancheSize = 256;
+
+ assert(params.quantized_activation_min <= params.quantized_activation_max);
+ assert(input_shape.DimensionsCount() == 4);
+ assert(output_shape.DimensionsCount() == 4);
+ const int batches = MatchingDim(input_shape, 0, output_shape, 0);
+ const int depth = MatchingDim(input_shape, 3, output_shape, 3);
+ const int input_height = input_shape.Dims(1);
+ const int input_width = input_shape.Dims(2);
+ const int output_height = output_shape.Dims(1);
+ const int output_width = output_shape.Dims(2);
+ const int stride_height = params.stride_height;
+ const int stride_width = params.stride_width;
+
+ int32_t acc[kPoolingAccTrancheSize];
+ for (int batch = 0; batch < batches; ++batch)
+ {
+ // We proceed through the depth in tranches (see comment above). The
+ // depth_base is the depth at the beginning of the tranche. The
+ // tranche_depth is the depth dimension of the tranche.
+ for (int depth_base = 0; depth_base < depth; depth_base += kPoolingAccTrancheSize)
+ {
+ const int tranche_depth = std::min(depth - depth_base, kPoolingAccTrancheSize);
+ for (int out_y = 0; out_y < output_height; ++out_y)
+ {
+ for (int out_x = 0; out_x < output_width; ++out_x)
+ {
+ const int in_x_origin = (out_x * stride_width) - params.padding_values.width;
+ const int in_y_origin = (out_y * stride_height) - params.padding_values.height;
+ const int filter_x_start = std::max(0, -in_x_origin);
+ const int filter_x_end = std::min(params.filter_width, input_width - in_x_origin);
+ const int filter_y_start = std::max(0, -in_y_origin);
+ const int filter_y_end = std::min(params.filter_height, input_height - in_y_origin);
+ const int filter_count =
+ (filter_x_end - filter_x_start) * (filter_y_end - filter_y_start);
+ memset(acc, 0, tranche_depth * sizeof(acc[0]));
+ const int8_t *input_ptr =
+ input_data + depth_base +
+ depth * (in_x_origin + input_width * (in_y_origin + input_height * batch));
+ for (int fy = filter_y_start; fy < filter_y_end; fy++)
+ {
+ const int8_t *input_row_ptr = input_ptr + depth * (fy * input_width + filter_x_start);
+ for (int fx = filter_x_start; fx < filter_x_end; fx++)
+ {
+ const int8_t *input_channel_ptr = input_row_ptr;
+ int channel = 0;
+#ifdef USE_NEON
+ for (; channel <= tranche_depth - 16; channel += 16)
+ {
+ int16x4_t acc_reg[4];
+ int8x16_t input_reg = vld1q_s8(input_channel_ptr);
+ input_channel_ptr += 16;
+ acc_reg[0] = vget_low_s16(vmovl_s8(vget_low_s8(input_reg)));
+ acc_reg[1] = vget_high_s16(vmovl_s8(vget_low_s8(input_reg)));
+ acc_reg[2] = vget_low_s16(vmovl_s8(vget_high_s8(input_reg)));
+ acc_reg[3] = vget_high_s16(vmovl_s8(vget_high_s8(input_reg)));
+ for (int i = 0; i < 4; i++)
+ {
+ vst1q_s32(acc + channel + 4 * i,
+ vaddw_s16(vld1q_s32(acc + channel + 4 * i), acc_reg[i]));
+ }
+ }
+ for (; channel <= tranche_depth - 8; channel += 8)
+ {
+ int16x4_t acc_reg[2];
+ int16x8_t input_reg = vmovl_s8(vld1_s8(input_channel_ptr));
+ input_channel_ptr += 8;
+ acc_reg[0] = vget_low_s16(input_reg);
+ acc_reg[1] = vget_high_s16(input_reg);
+ for (int i = 0; i < 2; i++)
+ {
+ vst1q_s32(acc + channel + 4 * i,
+ vaddw_s16(vld1q_s32(acc + channel + 4 * i), acc_reg[i]));
+ }
+ }
+#endif
+ for (; channel < tranche_depth; ++channel)
+ {
+ acc[channel] += *input_channel_ptr++;
+ }
+ input_row_ptr += depth;
+ }
+ }
+ int8_t *output_ptr = output_data + Offset(output_shape, batch, out_y, out_x, depth_base);
+ int channel = 0;
+#ifdef USE_NEON
+ for (; channel <= tranche_depth - 8; channel += 8)
+ {
+ int16_t buf[8];
+ for (int i = 0; i < 8; i++)
+ {
+ buf[i] = acc[channel + i] > 0 ? (acc[channel + i] + filter_count / 2) / filter_count
+ : (acc[channel + i] - filter_count / 2) / filter_count;
+ }
+ int8x8_t buf8 = vqmovn_s16(vld1q_s16(buf));
+ buf8 = vmin_s8(buf8, vdup_n_s8(params.quantized_activation_max));
+ buf8 = vmax_s8(buf8, vdup_n_s8(params.quantized_activation_min));
+ vst1_s8(output_ptr + channel, buf8);
+ }
+#endif
+ for (; channel < tranche_depth; ++channel)
+ {
+ int16_t a = acc[channel] > 0 ? (acc[channel] + filter_count / 2) / filter_count
+ : (acc[channel] - filter_count / 2) / filter_count;
+ a = std::max<int16_t>(a, params.quantized_activation_min);
+ a = std::min<int16_t>(a, params.quantized_activation_max);
+ output_ptr[channel] = static_cast<int8_t>(a);
+ }
+ }
+ }
+ }
+ }
+}
+
} // namespace cker
} // namespace nnfw
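
The scalar tail of the new int8_t AveragePool rounds the accumulated sum to the nearest integer, with ties away from zero, before clamping to the activation range. A tiny standalone check of that rounding expression, separate from the patch:

#include <cassert>

// Same expression as the int8_t AveragePool tail loop above: round-to-nearest
// integer average with ties rounded away from zero.
inline int RoundedAverage(int acc, int filter_count)
{
  return acc > 0 ? (acc + filter_count / 2) / filter_count
                 : (acc - filter_count / 2) / filter_count;
}

void CheckRoundedAverage()
{
  assert(RoundedAverage(7, 4) == 2);   // 1.75 rounds to 2
  assert(RoundedAverage(5, 4) == 1);   // 1.25 rounds to 1
  assert(RoundedAverage(6, 4) == 2);   // 1.5 rounds away from zero to 2
  assert(RoundedAverage(-7, 4) == -2); // -1.75 rounds to -2
}
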
diff --git a/compute/cker/include/cker/operation/BatchToSpaceND.h b/compute/cker/include/cker/operation/BatchToSpaceND.h
index e33b2fba5..980ad48dd 100644
--- a/compute/cker/include/cker/operation/BatchToSpaceND.h
+++ b/compute/cker/include/cker/operation/BatchToSpaceND.h
@@ -43,7 +43,7 @@ inline void GetIndexRange(int spatial_index_dim, int block_shape_dim, int input_
// Similarly, (*end_index) * block_shape_dim is rounded up too (note that
// end_index is exclusive).
*end_index =
- std::min(input_dim, (output_dim - spatial_index_dim + block_shape_dim - 1) / block_shape_dim);
+ std::min(input_dim, (output_dim - spatial_index_dim + block_shape_dim - 1) / block_shape_dim);
}
template <typename T>
@@ -116,7 +116,7 @@ inline void BatchToSpaceND(const Shape &unextended_input1_shape, const T *input1
for (int in_w = in_w_start; in_w < in_w_end; ++in_w)
{
const int out_w =
- in_w * block_shape_width + spatial_offset % block_shape_width - crops_left;
+ in_w * block_shape_width + spatial_offset % block_shape_width - crops_left;
assert(out_w >= 0);
assert(out_w < output_width);
T *out = output_data + Offset(output_shape, out_batch, out_h, out_w, 0);
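
The reindented *end_index expression in GetIndexRange is ordinary ceiling division for non-negative integers: (a + b - 1) / b equals ceil(a / b) when a >= 0 and b > 0. A minimal illustration, separate from the patch:

#include <cassert>

// (a + b - 1) / b == ceil(a / b) for a >= 0, b > 0; GetIndexRange uses this
// form with a = output_dim - spatial_index_dim and b = block_shape_dim.
inline int CeilDiv(int a, int b) { return (a + b - 1) / b; }

void CheckCeilDiv()
{
  assert(CeilDiv(9, 4) == 3); // ceil(2.25)
  assert(CeilDiv(8, 4) == 2); // exact division
  assert(CeilDiv(1, 4) == 1); // ceil(0.25)
}
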
diff --git a/compute/cker/include/cker/operation/BinaryArithmeticOps.h b/compute/cker/include/cker/operation/BinaryArithmeticOps.h
index 8aef1f8c1..c7878496a 100644
--- a/compute/cker/include/cker/operation/BinaryArithmeticOps.h
+++ b/compute/cker/include/cker/operation/BinaryArithmeticOps.h
@@ -139,7 +139,7 @@ inline bool ProcessBroadcastShapes(const Shape &shape0, const Shape &shape1,
// From this point it is assumed contractually that corresponding dimensions
// in shape0 and shape1 are either (a) equal or (b) one or other equals 1.
const bool swap_inputs =
- params->broadcast_category == BroadcastableOpCategory::kSecondInputBroadcastsFast;
+ params->broadcast_category == BroadcastableOpCategory::kSecondInputBroadcastsFast;
const Shape *shape_a = swap_inputs ? &extended_shape1 : &extended_shape0;
const Shape *shape_b = swap_inputs ? &extended_shape0 : &extended_shape1;
@@ -190,34 +190,34 @@ inline bool ProcessBroadcastShapes(const Shape &shape0, const Shape &shape1,
}
template <BinaryArithmeticOpType op_type, typename T>
-inline void BinaryArithmeticOp(const BinaryArithmeticOpParam &params, const Shape &input1_shape,
- const T *input1_data, const Shape &input2_shape,
- const T *input2_data, const Shape &output_shape, T *output_data)
+inline typename std::enable_if_t<!is_quant8<T>::value>
+BinaryArithmeticOp(const BinaryArithmeticOpParam &params, const Shape &input1_shape,
+ const T *input1_data, const Shape &input2_shape, const T *input2_data,
+ const Shape &output_shape, T *output_data)
{
reference::BinaryArithmeticOp(params, input1_shape, input1_data, input2_shape, input2_data,
output_shape, output_data, GetBinaryArtithmeticFn<op_type, T>());
}
-template <BinaryArithmeticOpType op_type>
-inline void BinaryArithmeticOp(const BinaryArithmeticOpParam &params, const Shape &input1_shape,
- const uint8_t *input1_data, const Shape &input2_shape,
- const uint8_t *input2_data, const Shape &output_shape,
- uint8_t *output_data)
+template <BinaryArithmeticOpType op_type, typename T>
+inline typename std::enable_if_t<is_quant8<T>::value>
+BinaryArithmeticOp(const BinaryArithmeticOpParam &params, const Shape &input1_shape,
+ const T *input1_data, const Shape &input2_shape, const T *input2_data,
+ const Shape &output_shape, T *output_data)
{
switch (op_type)
{
case nnfw::cker::BinaryArithmeticOpType::ADD:
case nnfw::cker::BinaryArithmeticOpType::SUB:
- optimized::AddQuant8(params, input1_shape, input1_data, input2_shape, input2_data,
- output_shape, output_data);
+ optimized::Add(params, input1_shape, input1_data, input2_shape, input2_data, output_shape,
+ output_data);
break;
case nnfw::cker::BinaryArithmeticOpType::MUL:
- optimized::MulQuant8(params, input1_shape, const_cast<uint8_t *>(input1_data), input2_shape,
- const_cast<uint8_t *>(input2_data), output_shape, output_data);
+ optimized::Mul(params, input1_shape, input1_data, input2_shape, input2_data, output_shape,
+ output_data);
break;
case nnfw::cker::BinaryArithmeticOpType::DIV:
throw std::runtime_error{"Quant8 Asymm NYI"};
-
default:
assert(false);
break;
@@ -246,9 +246,8 @@ inline void BinaryArithmeticOp(const BinaryArithmeticOpParam &params, const Shap
output_data);
break;
case nnfw::cker::BinaryArithmeticOpType::DIV:
- reference::BinaryArithmeticOp<float>(params, input1_shape, input1_data, input2_shape,
- input2_data, output_shape, output_data,
- GetBinaryArtithmeticFn<op_type, float>());
+ optimized::Div(params, input1_shape, input1_data, input2_shape, input2_data, output_shape,
+ output_data);
break;
default:
assert(false);
@@ -257,33 +256,32 @@ inline void BinaryArithmeticOp(const BinaryArithmeticOpParam &params, const Shap
}
template <BinaryArithmeticOpType op_type, typename T>
-inline void BroadcastBinaryArithmeticOp(BinaryArithmeticOpParam &params, const Shape &input1_shape,
- const T *input1_data, const Shape &input2_shape,
- const T *input2_data, const Shape &output_shape,
- T *output_data)
+inline typename std::enable_if_t<!is_quant8<T>::value>
+BroadcastBinaryArithmeticOp(BinaryArithmeticOpParam &params, const Shape &input1_shape,
+ const T *input1_data, const Shape &input2_shape, const T *input2_data,
+ const Shape &output_shape, T *output_data)
{
reference::BroadcastBinaryArithmeticOpSlow(params, input1_shape, input1_data, input2_shape,
input2_data, output_shape, output_data,
GetBinaryArtithmeticFn<op_type, T>());
}
-template <BinaryArithmeticOpType op_type>
-inline void BroadcastBinaryArithmeticOp(BinaryArithmeticOpParam &params, const Shape &input1_shape,
- const uint8_t *input1_data, const Shape &input2_shape,
- const uint8_t *input2_data, const Shape &output_shape,
- uint8_t *output_data)
+template <BinaryArithmeticOpType op_type, typename T>
+inline typename std::enable_if_t<is_quant8<T>::value>
+BroadcastBinaryArithmeticOp(BinaryArithmeticOpParam &params, const Shape &input1_shape,
+ const T *input1_data, const Shape &input2_shape, const T *input2_data,
+ const Shape &output_shape, T *output_data)
{
switch (op_type)
{
case nnfw::cker::BinaryArithmeticOpType::ADD:
case nnfw::cker::BinaryArithmeticOpType::SUB:
- optimized::BroadcastAddDispatchQuant8(params, input1_shape, input1_data, input2_shape,
- input2_data, output_shape, output_data);
+ optimized::BroadcastAddDispatch(params, input1_shape, input1_data, input2_shape, input2_data,
+ output_shape, output_data);
break;
case nnfw::cker::BinaryArithmeticOpType::MUL:
- optimized::BroadcastMulDispatchQuant8(
- params, input1_shape, const_cast<uint8_t *>(input1_data), input2_shape,
- const_cast<uint8_t *>(input2_data), output_shape, output_data);
+ optimized::BroadcastMulDispatch(params, input1_shape, input1_data, input2_shape, input2_data,
+ output_shape, output_data);
break;
case nnfw::cker::BinaryArithmeticOpType::DIV:
case nnfw::cker::BinaryArithmeticOpType::POW:
@@ -312,11 +310,17 @@ inline void BroadcastBinaryArithmeticOp(BinaryArithmeticOpParam &params, const S
output_shape, output_data);
break;
case nnfw::cker::BinaryArithmeticOpType::SUB:
+ optimized::BroadcastSubDispatch(params, input1_shape, input1_data, input2_shape, input2_data,
+ output_shape, output_data);
+ break;
case nnfw::cker::BinaryArithmeticOpType::DIV:
+ optimized::BroadcastDivDispatch(params, input1_shape, input1_data, input2_shape, input2_data,
+ output_shape, output_data);
+ break;
case nnfw::cker::BinaryArithmeticOpType::POW:
reference::BroadcastBinaryArithmeticOpSlow<float>(
- params, input1_shape, input1_data, input2_shape, input2_data, output_shape, output_data,
- GetBinaryArtithmeticFn<op_type, float>());
+ params, input1_shape, input1_data, input2_shape, input2_data, output_shape, output_data,
+ GetBinaryArtithmeticFn<op_type, float>());
break;
default:
assert(false);
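
The hunk above replaces the uint8_t-only overloads with a pair of templates whose return types are SFINAE-gated on is_quant8<T>, so quantized 8-bit element types and all other element types resolve to different overloads of the same name. A minimal standalone sketch of that dispatch pattern; the is_quant8 trait and AddSketch functions below are local to the example, not the cker definitions:

    // Standalone sketch only: is_quant8 and AddSketch are local to this example.
    #include <cstdint>
    #include <iostream>
    #include <type_traits>

    template <typename T>
    struct is_quant8 : std::integral_constant<bool, std::is_same<T, uint8_t>::value ||
                                                      std::is_same<T, int8_t>::value>
    {
    };

    // Selected for float, int32_t, ...: plain elementwise add.
    template <typename T>
    std::enable_if_t<!is_quant8<T>::value> AddSketch(const T *a, const T *b, T *out, int n)
    {
      for (int i = 0; i < n; ++i)
        out[i] = a[i] + b[i];
    }

    // Selected for uint8_t / int8_t: where zero-point and rescaling logic lives
    // in the real quantized kernels.
    template <typename T>
    std::enable_if_t<is_quant8<T>::value> AddSketch(const T *a, const T *b, T *out, int n)
    {
      for (int i = 0; i < n; ++i)
        out[i] = static_cast<T>(a[i] + b[i]);
    }

    int main()
    {
      float fa[2] = {1.f, 2.f}, fb[2] = {3.f, 4.f}, fo[2];
      uint8_t qa[2] = {1, 2}, qb[2] = {3, 4}, qo[2];
      AddSketch(fa, fb, fo, 2); // resolves to the !is_quant8 overload
      AddSketch(qa, qb, qo, 2); // resolves to the is_quant8 overload
      std::cout << fo[0] << " " << int(qo[0]) << "\n"; // 4 4
    }
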
diff --git a/compute/cker/include/cker/operation/BroadcastTo.h b/compute/cker/include/cker/operation/BroadcastTo.h
index 5068eca96..145deda29 100644
--- a/compute/cker/include/cker/operation/BroadcastTo.h
+++ b/compute/cker/include/cker/operation/BroadcastTo.h
@@ -126,7 +126,7 @@ template <typename Device, typename T> struct BroadcastTo
}
}
};
-} // functor
+} // namespace functor
template <typename T>
inline void BroadcastTo(const Shape &input_shape, T *input_data, const Shape &output_shape,
diff --git a/compute/cker/include/cker/operation/Common.h b/compute/cker/include/cker/operation/Common.h
index d69b38aca..24d4cc4c7 100644
--- a/compute/cker/include/cker/operation/Common.h
+++ b/compute/cker/include/cker/operation/Common.h
@@ -82,7 +82,7 @@ inline void BiasAndClamp(float clamp_min, float clamp_max, int bias_size, const
for (; i < bias_size; i++)
{
array_ptr[i] =
- ActivationFunctionWithMinMax(array_ptr[i] + bias_data[i], clamp_min, clamp_max);
+ ActivationFunctionWithMinMax(array_ptr[i] + bias_data[i], clamp_min, clamp_max);
}
}
#else // not NEON
@@ -91,7 +91,7 @@ inline void BiasAndClamp(float clamp_min, float clamp_max, int bias_size, const
for (int i = 0; i < bias_size; i++)
{
array_data[array_offset + i] = ActivationFunctionWithMinMax(
- array_data[array_offset + i] + bias_data[i], clamp_min, clamp_max);
+ array_data[array_offset + i] + bias_data[i], clamp_min, clamp_max);
}
}
#endif
diff --git a/compute/cker/include/cker/operation/Comparison.h b/compute/cker/include/cker/operation/Comparison.h
index 47eb6034c..ac6af8487 100644
--- a/compute/cker/include/cker/operation/Comparison.h
+++ b/compute/cker/include/cker/operation/Comparison.h
@@ -42,7 +42,7 @@ inline void ComparisonImpl(const Shape &input1_shape, const T *input1_data,
const Shape &output_shape, bool *output_data)
{
const int64_t flatsize = // number of data....
- MatchingFlatSize(input1_shape, input2_shape, output_shape);
+ MatchingFlatSize(input1_shape, input2_shape, output_shape);
for (int64_t i = 0; i < flatsize; ++i)
{
output_data[i] = F(input1_data[i], input2_data[i]);
@@ -79,9 +79,9 @@ inline void ComparisonWithScaling(ComparisonParams &params, const Shape &input1_
const int32_t shifted_input1_val = input1_val * (1 << left_shift);
const int32_t shifted_input2_val = input2_val * (1 << left_shift);
const int32_t scaled_input1_val = MultiplyByQuantizedMultiplierSmallerThanOneExp(
- shifted_input1_val, input1_multiplier, input1_shift);
+ shifted_input1_val, input1_multiplier, input1_shift);
const int32_t scaled_input2_val = MultiplyByQuantizedMultiplierSmallerThanOneExp(
- shifted_input2_val, input2_multiplier, input2_shift);
+ shifted_input2_val, input2_multiplier, input2_shift);
output_data[i] = F(scaled_input1_val, scaled_input2_val);
}
}
@@ -111,8 +111,8 @@ BroadcastComparison4DSlowImpl(const Shape &unextended_input1_shape, const T *inp
for (int c = 0; c < output_shape.Dims(3); ++c)
{
output_data[Offset(output_shape, b, y, x, c)] =
- F(input1_data[SubscriptToIndex(desc1, b, y, x, c)],
- input2_data[SubscriptToIndex(desc2, b, y, x, c)]);
+ F(input1_data[SubscriptToIndex(desc1, b, y, x, c)],
+ input2_data[SubscriptToIndex(desc2, b, y, x, c)]);
}
}
}
@@ -159,15 +159,15 @@ inline void BroadcastComparison4DSlowWithScaling(ComparisonParams &params,
for (int c = 0; c < output_shape.Dims(3); ++c)
{
const int32_t input1_val =
- input1_offset + input1_data[SubscriptToIndex(desc1, b, y, x, c)];
+ input1_offset + input1_data[SubscriptToIndex(desc1, b, y, x, c)];
const int32_t input2_val =
- input2_offset + input2_data[SubscriptToIndex(desc2, b, y, x, c)];
+ input2_offset + input2_data[SubscriptToIndex(desc2, b, y, x, c)];
const int32_t shifted_input1_val = input1_val * (1 << left_shift);
const int32_t shifted_input2_val = input2_val * (1 << left_shift);
const int32_t scaled_input1_val = MultiplyByQuantizedMultiplierSmallerThanOneExp(
- shifted_input1_val, input1_multiplier, input1_shift);
+ shifted_input1_val, input1_multiplier, input1_shift);
const int32_t scaled_input2_val = MultiplyByQuantizedMultiplierSmallerThanOneExp(
- shifted_input2_val, input2_multiplier, input2_shift);
+ shifted_input2_val, input2_multiplier, input2_shift);
output_data[Offset(output_shape, b, y, x, c)] = F(scaled_input1_val, scaled_input2_val);
}
}
@@ -175,55 +175,53 @@ inline void BroadcastComparison4DSlowWithScaling(ComparisonParams &params,
}
}
-#define TFLITE_COMPARISON_OP(name) \
- template <typename T> \
- inline void name(const Shape &input1_shape, const T *input1_data, const Shape &input2_shape, \
- const T *input2_data, const Shape &output_shape, bool *output_data) \
- { \
- Comparison<name##Fn>(input1_shape, input1_data, input2_shape, input2_data, output_shape, \
- output_data); \
- } \
- template <typename T> \
- inline void name##NoScaling(const Shape &input1_shape, const T *input1_data, \
- const Shape &input2_shape, const T *input2_data, \
- const Shape &output_shape, bool *output_data) \
- { \
- ComparisonImpl<T, name##Fn>(input1_shape, input1_data, input2_shape, input2_data, \
- output_shape, output_data); \
- } \
- template <typename T> \
- inline void name##WithScaling(ComparisonParams &params, const Shape &input1_shape, \
- const T *input1_data, const Shape &input2_shape, \
- const T *input2_data, const Shape &output_shape, \
- bool *output_data) \
- { \
- ComparisonWithScaling<T, name##Fn>(params, input1_shape, input1_data, input2_shape, \
- input2_data, output_shape, output_data); \
- } \
- template <typename T> \
- inline void Broadcast4DSlow##name##NoScaling(const Shape &input1_shape, const T *input1_data, \
- const Shape &input2_shape, const T *input2_data, \
- const Shape &output_shape, bool *output_data) \
- { \
- BroadcastComparison4DSlowImpl<T, name##Fn>(input1_shape, input1_data, input2_shape, \
- input2_data, output_shape, output_data); \
- } \
- template <typename T> \
- inline void Broadcast4DSlow##name(const Shape &input1_shape, const T *input1_data, \
- const Shape &input2_shape, const T *input2_data, \
- const Shape &output_shape, bool *output_data) \
- { \
- BroadcastComparison4DSlow<T, name##Fn>(input1_shape, input1_data, input2_shape, input2_data, \
- output_shape, output_data); \
- } \
- template <typename T> \
- inline void Broadcast4DSlow##name##WithScaling(ComparisonParams &params, \
- const Shape &input1_shape, const T *input1_data, \
- const Shape &input2_shape, const T *input2_data, \
- const Shape &output_shape, bool *output_data) \
- { \
- BroadcastComparison4DSlowWithScaling<T, name##Fn>( \
- params, input1_shape, input1_data, input2_shape, input2_data, output_shape, output_data); \
+#define TFLITE_COMPARISON_OP(name) \
+ template <typename T> \
+ inline void name(const Shape &input1_shape, const T *input1_data, const Shape &input2_shape, \
+ const T *input2_data, const Shape &output_shape, bool *output_data) \
+ { \
+ Comparison<name##Fn>(input1_shape, input1_data, input2_shape, input2_data, output_shape, \
+ output_data); \
+ } \
+ template <typename T> \
+ inline void name##NoScaling(const Shape &input1_shape, const T *input1_data, \
+ const Shape &input2_shape, const T *input2_data, \
+ const Shape &output_shape, bool *output_data) \
+ { \
+ ComparisonImpl<T, name##Fn>(input1_shape, input1_data, input2_shape, input2_data, \
+ output_shape, output_data); \
+ } \
+ template <typename T> \
+ inline void name##WithScaling( \
+ ComparisonParams &params, const Shape &input1_shape, const T *input1_data, \
+ const Shape &input2_shape, const T *input2_data, const Shape &output_shape, bool *output_data) \
+ { \
+ ComparisonWithScaling<T, name##Fn>(params, input1_shape, input1_data, input2_shape, \
+ input2_data, output_shape, output_data); \
+ } \
+ template <typename T> \
+ inline void Broadcast4DSlow##name##NoScaling(const Shape &input1_shape, const T *input1_data, \
+ const Shape &input2_shape, const T *input2_data, \
+ const Shape &output_shape, bool *output_data) \
+ { \
+ BroadcastComparison4DSlowImpl<T, name##Fn>(input1_shape, input1_data, input2_shape, \
+ input2_data, output_shape, output_data); \
+ } \
+ template <typename T> \
+ inline void Broadcast4DSlow##name(const Shape &input1_shape, const T *input1_data, \
+ const Shape &input2_shape, const T *input2_data, \
+ const Shape &output_shape, bool *output_data) \
+ { \
+ BroadcastComparison4DSlow<T, name##Fn>(input1_shape, input1_data, input2_shape, input2_data, \
+ output_shape, output_data); \
+ } \
+ template <typename T> \
+ inline void Broadcast4DSlow##name##WithScaling( \
+ ComparisonParams &params, const Shape &input1_shape, const T *input1_data, \
+ const Shape &input2_shape, const T *input2_data, const Shape &output_shape, bool *output_data) \
+ { \
+ BroadcastComparison4DSlowWithScaling<T, name##Fn>( \
+ params, input1_shape, input1_data, input2_shape, input2_data, output_shape, output_data); \
}
TFLITE_COMPARISON_OP(Equal);
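
The reflowed TFLITE_COMPARISON_OP macro above stamps out a family of functions per comparison (name, name##NoScaling, name##WithScaling, and their Broadcast4DSlow variants), all forwarding to shared templates parameterized on a name##Fn functor. A much-reduced standalone miniature of the same token-pasting pattern; the functor and function names below are local to the sketch:

    // Standalone miniature of the macro pattern; names are local to the sketch.
    #include <iostream>

    template <typename T> struct EqualFn
    {
      bool operator()(T a, T b) const { return a == b; }
    };
    template <typename T> struct LessFn
    {
      bool operator()(T a, T b) const { return a < b; }
    };

    template <template <typename> class Fn, typename T>
    void ComparisonSketch(const T *in1, const T *in2, bool *out, int n)
    {
      Fn<T> f;
      for (int i = 0; i < n; ++i)
        out[i] = f(in1[i], in2[i]);
    }

    // One invocation generates one comparison entry point; name##Fn picks the functor.
    #define COMPARISON_OP_SKETCH(name)                                              \
      template <typename T> void name(const T *in1, const T *in2, bool *out, int n) \
      {                                                                             \
        ComparisonSketch<name##Fn, T>(in1, in2, out, n);                            \
      }

    COMPARISON_OP_SKETCH(Equal)
    COMPARISON_OP_SKETCH(Less)

    int main()
    {
      int a[3] = {1, 2, 3}, b[3] = {1, 0, 3};
      bool eq[3], lt[3];
      Equal(a, b, eq, 3); // 1 0 1
      Less(a, b, lt, 3);  // 0 0 0
      std::cout << eq[0] << eq[1] << eq[2] << " " << lt[0] << lt[1] << lt[2] << "\n";
    }
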
diff --git a/compute/cker/include/cker/operation/Concatenation.h b/compute/cker/include/cker/operation/Concatenation.h
index 394123e30..9aaca00b7 100644
--- a/compute/cker/include/cker/operation/Concatenation.h
+++ b/compute/cker/include/cker/operation/Concatenation.h
@@ -142,7 +142,7 @@ inline void ConcatenationWithScaling(const ConcatenationParams &params,
for (int j = 0; j < copy_size; ++j)
{
const int32_t value =
- static_cast<int32_t>(std::round(input_ptr[j] * scale + bias)) + output_zeropoint;
+ static_cast<int32_t>(std::round(input_ptr[j] * scale + bias)) + output_zeropoint;
output_ptr[j] = static_cast<uint8_t>(std::max(std::min(255, value), 0));
}
}
diff --git a/compute/cker/include/cker/operation/Conv.h b/compute/cker/include/cker/operation/Conv.h
index 214f2e612..2572b51ee 100644
--- a/compute/cker/include/cker/operation/Conv.h
+++ b/compute/cker/include/cker/operation/Conv.h
@@ -57,9 +57,9 @@ class Conv
public:
Conv() : _modified_filter_data(), _im2col_shape(4), _need_im2col(false), _prepared(false) {}
- void prepare(const Shape &filter_shape, const float *filter_data, PaddingType padding_type,
- bool &is_replaced_weights, uint32_t dilationWidthFactor,
- uint32_t dilationHeightFactor)
+ void prepareF32(const Shape &filter_shape, const float *filter_data, PaddingType padding_type,
+ bool &is_replaced_weights, uint32_t dilationWidthFactor,
+ uint32_t dilationHeightFactor)
{
if (!_prepared)
{
@@ -71,12 +71,14 @@ public:
}
}
- void prepareQuant(const Shape &input_shape, const Shape &kernel_shape, const Shape &output_shape,
- uint32_t stride_width, uint32_t stride_height)
+ void prepareQ8uPerTensor(const Shape &input_shape, const Shape &kernel_shape,
+ const Shape &output_shape, uint32_t stride_width, uint32_t stride_height,
+ uint32_t dilation_width_factor, uint32_t dilation_height_factor)
{
if (!_prepared)
{
- IsRequiredIm2col(input_shape, kernel_shape, output_shape, stride_width, stride_height);
+ IsRequiredIm2col(input_shape, kernel_shape, output_shape, stride_width, stride_height,
+ dilation_width_factor, dilation_height_factor);
_prepared = true;
}
}
@@ -115,7 +117,8 @@ public:
{
// This means that input or output are dynamic or filter is not constant
IsRequiredIm2col(input_shape, filter_shape, output_shape, params.stride_width,
- params.stride_height);
+ params.stride_height, params.dilation_width_factor,
+ params.dilation_height_factor);
}
int im2col_size = _need_im2col ? _im2col_shape.FlatSize() : 1;
@@ -135,6 +138,29 @@ public:
}
}
+ void operator()(const ConvParams &params, const Shape &input_shape, const uint8_t *input_data,
+ const Shape &filter_shape, const uint8_t *filter_data,
+ const int32_t *filter_zero_point, const Shape &bias_shape,
+ const int32_t *bias_data, const Shape &output_shape, uint8_t *output_data)
+ {
+ reference::Conv<uint8_t, true>(params, _per_channel_output_multiplier.data(),
+ _per_channel_output_shift.data(), input_shape, input_data,
+ filter_shape, filter_data, filter_zero_point, bias_shape,
+ bias_data, output_shape, output_data);
+ }
+
+ void operator()(const ConvParams &params, const Shape &input_shape, const int8_t *input_data,
+ const Shape &filter_shape, const int8_t *filter_data, const Shape &bias_shape,
+ const int32_t *bias_data, const Shape &output_shape, int8_t *output_data)
+ {
+ reference::Conv<int8_t, false>(params, _per_channel_output_multiplier.data(),
+ _per_channel_output_shift.data(), input_shape, input_data,
+ filter_shape, filter_data, nullptr /* filter_zero_point */,
+ bias_shape, bias_data, output_shape, output_data);
+ }
+ std::vector<int32_t> &per_channel_output_multiplier() { return _per_channel_output_multiplier; }
+ std::vector<int> &per_channel_output_shift() { return _per_channel_output_shift; }
+
private:
bool usableMultiThreaded(PaddingType padding_type, uint32_t dilation_width_factor,
int32_t dilation_height_factor)
@@ -154,10 +180,15 @@ private:
}
void IsRequiredIm2col(const Shape &input_shape, const Shape &kernel_shape,
- const Shape &output_shape, uint32_t stride_width, uint32_t stride_height)
+ const Shape &output_shape, uint32_t stride_width, uint32_t stride_height,
+ uint32_t dilation_width_factor, uint32_t dilation_height_factor)
{
- _need_im2col = stride_width != 1 || stride_height != 1 || kernel_shape.Dims(1) != 1 ||
- kernel_shape.Dims(2) != 1;
+ const bool need_dilated_im2col = dilation_width_factor != 1 || dilation_height_factor != 1;
+ const bool need_non_dilated_im2col = stride_width != 1 || stride_height != 1 ||
+ kernel_shape.Dims(1) != 1 || kernel_shape.Dims(2) != 1;
+
+ _need_im2col = need_dilated_im2col || need_non_dilated_im2col;
+
if (_need_im2col)
{
_im2col_shape.SetDim(0, output_shape.Dims(0));
@@ -172,7 +203,25 @@ private:
Shape _im2col_shape;
bool _need_im2col;
bool _prepared;
+ // Per channel output multiplier and shift.
+ std::vector<int32_t> _per_channel_output_multiplier;
+ std::vector<int> _per_channel_output_shift;
+};
+
+struct ConvHybridTempArena
+{
+ ConvHybridTempArena(int batch_size, int input_size)
+ {
+ input_quantized.resize(input_size);
+ // TODO: Optimize the case of batch_size = 1
+ input_scaling_factors.resize(batch_size);
+ input_offsets.resize(batch_size);
+ }
+ std::vector<int8_t> input_quantized;
+ std::vector<float> input_scaling_factors;
+ std::vector<int32_t> input_offsets;
};
+
} // namespace cker
} // namespace nnfw
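
IsRequiredIm2col above now folds the dilation factors into the decision: im2col is required when either dilation factor is not 1, in addition to the previous stride and kernel-size conditions. A standalone sketch of that predicate with shapes reduced to plain ints:

    // Standalone sketch; NeedIm2colSketch mirrors the predicate above.
    #include <cassert>

    bool NeedIm2colSketch(int stride_w, int stride_h, int kernel_h, int kernel_w, int dilation_w,
                          int dilation_h)
    {
      const bool need_dilated_im2col = dilation_w != 1 || dilation_h != 1;
      const bool need_non_dilated_im2col =
        stride_w != 1 || stride_h != 1 || kernel_h != 1 || kernel_w != 1;
      return need_dilated_im2col || need_non_dilated_im2col;
    }

    int main()
    {
      assert(!NeedIm2colSketch(1, 1, 1, 1, 1, 1)); // 1x1 kernel, stride 1, no dilation: direct path
      assert(NeedIm2colSketch(1, 1, 1, 1, 2, 1));  // dilation alone now forces im2col
      assert(NeedIm2colSketch(2, 2, 3, 3, 1, 1));  // strided 3x3 kernel: im2col as before
      return 0;
    }
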
diff --git a/compute/cker/include/cker/operation/DepthToSpace.h b/compute/cker/include/cker/operation/DepthToSpace.h
new file mode 100644
index 000000000..e57fef01d
--- /dev/null
+++ b/compute/cker/include/cker/operation/DepthToSpace.h
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_DEPTH_TO_SPACE_H__
+#define __NNFW_CKER_DEPTH_TO_SPACE_H__
+
+#include "cker/Shape.h"
+#include "cker/Types.h"
+#include <cstring>
+
+namespace nnfw
+{
+namespace cker
+{
+
+template <typename T>
+inline void DepthToSpace(const Shape &unextended_input_shape, const T *input_data,
+ const Shape &unextended_output_shape, T *output_data, int32_t block_size)
+{
+ assert(unextended_input_shape.DimensionsCount() <= 4);
+ assert(unextended_output_shape.DimensionsCount() <= 4);
+ const Shape input_shape = Shape::ExtendedShape(4, unextended_input_shape);
+ const Shape output_shape = Shape::ExtendedShape(4, unextended_output_shape);
+
+ const int input_depth = input_shape.Dims(3);
+ const int input_width = input_shape.Dims(2);
+ const int input_height = input_shape.Dims(1);
+
+ const int output_depth = output_shape.Dims(3);
+ const int batch_size = output_shape.Dims(0);
+
+ // Number of contiguous values that we can copy in one iteration.
+ const int stride = block_size * output_depth;
+
+ for (int batch = 0; batch < batch_size; ++batch)
+ {
+ for (int in_h = 0; in_h < input_height; ++in_h)
+ {
+ const T *input_ptr = input_data + Offset(input_shape, batch, in_h, 0, 0);
+ for (int offset_h = 0; offset_h < block_size; ++offset_h)
+ {
+ const T *src = input_ptr;
+ for (int in_w = 0; in_w < input_width; ++in_w)
+ {
+ memcpy(output_data, src, stride * sizeof(T));
+ output_data += stride;
+ src += input_depth;
+ }
+ input_ptr += stride;
+ }
+ }
+ }
+}
+
+} // namespace cker
+} // namespace nnfw
+
+#endif // __NNFW_CKER_DEPTH_TO_SPACE_H__
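
The new DepthToSpace kernel copies stride = block_size * output_depth contiguous values per input column, once for each of the block_size row offsets. A standalone sketch of the same copy pattern, with a 1x1x1x4 to 1x2x2x1 worked example (NHWC layout, shapes reduced to plain ints):

    // Standalone sketch mirroring the copy pattern above.
    #include <cstring>
    #include <iostream>

    void DepthToSpaceSketch(const float *in, float *out, int batch, int in_h, int in_w, int in_d,
                            int block)
    {
      const int out_d = in_d / (block * block);
      const int stride = block * out_d; // contiguous values copied per input column
      for (int b = 0; b < batch; ++b)
        for (int h = 0; h < in_h; ++h)
        {
          const float *row = in + ((b * in_h + h) * in_w) * in_d; // offset of (b, h, 0, 0)
          for (int off_h = 0; off_h < block; ++off_h)
          {
            const float *src = row;
            for (int w = 0; w < in_w; ++w)
            {
              std::memcpy(out, src, stride * sizeof(float));
              out += stride;
              src += in_d;
            }
            row += stride;
          }
        }
    }

    int main()
    {
      // A 1x1x1x4 input with block_size 2 becomes a 1x2x2x1 output.
      const float in[4] = {0, 1, 2, 3};
      float out[4] = {};
      DepthToSpaceSketch(in, out, 1, 1, 1, 4, 2);
      for (float v : out)
        std::cout << v << " "; // 0 1 2 3
      std::cout << "\n";
    }
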
diff --git a/compute/cker/include/cker/operation/DepthwiseConv.h b/compute/cker/include/cker/operation/DepthwiseConv.h
index 814a9e019..c926ec4f1 100644
--- a/compute/cker/include/cker/operation/DepthwiseConv.h
+++ b/compute/cker/include/cker/operation/DepthwiseConv.h
@@ -22,143 +22,162 @@
#include "cker/Types.h"
#include "cker/Utils.h"
#include "cker/neon/neon_check.h"
+#include "cker/operation/optimized/DepthwiseConvFloat.h"
#include "cker/operation/optimized/DepthwiseConvUint8.h"
+#include "cker/operation/optimized/integer_ops/DepthwiseConvInt8.h"
+#include "cker/operation/reference/integer_ops/DepthwiseConvUInt8.h"
+#include "cker/operation/reference/integer_ops/DepthwiseConvHybrid.h"
+#include "cker/CpuBackendThreadpool.h"
namespace nnfw
{
namespace cker
{
-inline void DepthwiseConv(const DepthwiseConvParams &params, const Shape &input_shape,
- const uint8_t *input_data, const Shape &filter_shape,
- const uint8_t *filter_data, const Shape &bias_shape,
- const int32_t *bias_data, const Shape &output_shape, uint8_t *output_data)
+// TODO(luwa): add multithread to per-channel depthwise_conv
+// DepthwiseConv can run with multiple threads on the dimension specified by thread_dim.
+// Each thread processes output elements along that dimension in the range
+// [thread_start, thread_end).
+// For example, with thread_start = 2, thread_end = 6, and thread_dim = 1, the task
+// computes DepthwiseConv for output rows 2 through 5, i.e. output_data[:, 2:6, :, :]
+// in NumPy-style slicing.
+template <typename T, typename TS> struct DepthwiseConvWorkerTask : cpu_backend_threadpool::Task
{
- const int depth_multiplier = params.depth_multiplier;
- const int32_t output_activation_min = params.quantized_activation_min;
- const int32_t output_activation_max = params.quantized_activation_max;
- const int dilation_width_factor = params.dilation_width_factor;
- const int dilation_height_factor = params.dilation_height_factor;
- assert(dilation_width_factor >= 1);
- assert(dilation_height_factor >= 1);
- UNUSED_RELEASE(dilation_width_factor);
- UNUSED_RELEASE(dilation_height_factor);
- assert(input_shape.DimensionsCount() == 4);
- assert(filter_shape.DimensionsCount() == 4);
- assert(output_shape.DimensionsCount() == 4);
- assert(output_activation_min <= output_activation_max);
- UNUSED_RELEASE(output_activation_min);
- UNUSED_RELEASE(output_activation_max);
- const int output_depth = MatchingDim(filter_shape, 3, output_shape, 3);
- const int input_depth = input_shape.Dims(3);
- assert(output_depth == input_depth * depth_multiplier);
- assert(bias_shape.FlatSize() == output_depth);
- UNUSED_RELEASE(input_depth);
- UNUSED_RELEASE(output_depth);
- UNUSED_RELEASE(depth_multiplier);
-
-// Enable for arm64 except for the Nvidia Linux 4 Tegra (L4T) running on
-// Jetson TX-2. This compiler does not support the offsetof() macro.
-#if defined(__aarch64__)
-// TODO Use below codes
-
-// const int stride_width = params.stride_width;
-// const int stride_height = params.stride_height;
-// const int pad_width = params.padding_values.width;
-// const int pad_height = params.padding_values.height;
-// const int output_shift = params.output_shift;
-//
-// // Call kernel optimized for depthwise convolutions using 3x3 filters if
-// // parameters are supported.
-// if (Fast3x3FilterKernelSupported(
-// input_shape, filter_shape, stride_width, stride_height,
-// dilation_width_factor, dilation_height_factor, pad_width, pad_height,
-// depth_multiplier, output_shape, output_shift)) {
-// DepthwiseConv3x3Filter(params, input_shape, input_data, filter_shape,
-// filter_data, bias_shape, bias_data, output_shape,
-// output_data);
-// return;
-// }
-#endif
-
- optimized::DepthwiseConvGeneral(params, input_shape, input_data, filter_shape, filter_data,
- bias_shape, bias_data, output_shape, output_data);
+ DepthwiseConvWorkerTask(const DepthwiseConvParams &params, const Shape &input_shape,
+ const T *input_data, const Shape &filter_shape, const T *filter_data,
+ const Shape &bias_shape, const TS *bias_data, const Shape &output_shape,
+ T *output_data, int thread_start, int thread_end, int thread_dim)
+ : params_(params), input_shape_(input_shape), input_data_(input_data),
+ filter_shape_(filter_shape), filter_data_(filter_data), bias_shape_(bias_shape),
+ bias_data_(bias_data), output_shape_(output_shape), output_data_(output_data),
+ thread_start_(thread_start), thread_end_(thread_end), thread_dim_(thread_dim)
+ {
+ }
+
+ void Run() override
+ {
+ optimized::DepthwiseConvImpl(params_, input_shape_, input_data_, filter_shape_, filter_data_,
+ bias_shape_, bias_data_, output_shape_, output_data_,
+ thread_start_, thread_end_, thread_dim_);
+ }
+
+private:
+ const DepthwiseConvParams &params_;
+ const Shape &input_shape_;
+ const T *input_data_;
+ const Shape &filter_shape_;
+ const T *filter_data_;
+ const Shape &bias_shape_;
+ const TS *bias_data_;
+ const Shape &output_shape_;
+ T *output_data_;
+ // const CpuFlags& cpu_flags_;
+ int thread_start_;
+ int thread_end_;
+ int thread_dim_;
+};
+
+inline int HowManyConvThreads(const Shape &output_shape, const Shape &filter_shape)
+{
+ // How many scalar multiplications are needed to make it worth using one
+ // more thread
+ static constexpr int kMinMulPerThread = 1 << 13; // 8k
+ const int filter_height = filter_shape.Dims(1);
+ const int filter_width = filter_shape.Dims(2);
+ const int num_muls = output_shape.FlatSize() * filter_height * filter_width;
+ // Try to avoid real runtime divisions if possible by dividing by a
+ // compile-time constant.
+ int thread_count = std::max(1, num_muls / kMinMulPerThread);
+ return thread_count;
+}
+
+inline bool MultithreadAlongBatches(int thread_count, int batches)
+{
+ assert(thread_count >= 2);
+ // If there are fewer batch entries than the number of threads we want to use,
+ // then better do intra-batch-entry multithreading.
+ if (batches < thread_count)
+ {
+ return false;
+ }
+ // If there are at least 2 batch entries to be handed to each thread, then
+ // it's safe to proceed with batch-wise multithreading: each thread will have
+ // an approximately equal number of batch entries to handle, so the load
+ // balancing will be reasonable, and the amount to which the load is not
+ // perfectly balanced will be offset by the inherent advantages of
+ // batch-wise multithreading (each thread is more efficient thanks to working
+ // on larger buffers with less boundary-handling overhead).
+ if (batches >= 2 * thread_count)
+ {
+ return true;
+ }
+ // In the limit case where there is at least one but not much more than one
+ // batch entry per thread, it may be a good idea to do per-batch
+ // multithreading if the number of batch entries is a multiple of the number
+ // of threads, so that each thread will have the same number of batch entries
+ // to process.
+ return ((batches % thread_count) == 0);
}
+template <typename T, typename TS>
inline void DepthwiseConv(const DepthwiseConvParams &params, const Shape &input_shape,
- const float *input_data, const Shape &filter_shape,
- const float *filter_data, const Shape &bias_shape, const float *bias_data,
- const Shape &output_shape, float *output_data)
+ const T *input_data, const Shape &filter_shape, const T *filter_data,
+ const Shape &bias_shape, const TS *bias_data, const Shape &output_shape,
+ T *output_data, ruy::Context *ruy_context)
{
- const int stride_width = params.stride_width;
- const int stride_height = params.stride_height;
- const int dilation_width_factor = params.dilation_width_factor;
- const int dilation_height_factor = params.dilation_height_factor;
- const int pad_width = params.padding_values.width;
- const int pad_height = params.padding_values.height;
- const int depth_multiplier = params.depth_multiplier;
- const float output_activation_min = params.float_activation_min;
- const float output_activation_max = params.float_activation_max;
assert(input_shape.DimensionsCount() == 4);
assert(filter_shape.DimensionsCount() == 4);
assert(output_shape.DimensionsCount() == 4);
- const int batches = MatchingDim(input_shape, 0, output_shape, 0);
- const int output_depth = MatchingDim(filter_shape, 3, output_shape, 3);
- const int input_height = input_shape.Dims(1);
- const int input_width = input_shape.Dims(2);
- const int input_depth = input_shape.Dims(3);
- const int filter_height = filter_shape.Dims(1);
- const int filter_width = filter_shape.Dims(2);
+ int thread_count = HowManyConvThreads(output_shape, filter_shape);
+
+ // NOTE Borrow RuyContext to get max_num_threads setting
+ // TODO Define and use max_num_threads for CPU backend
+ const auto max_threads = (ruy_context == nullptr) ? 1 : ruy_context->max_num_threads();
+
+ thread_count = std::max(1, std::min(thread_count, max_threads));
+ // Cap the number of threads to 2 for float path to avoid regression in
+ // performance (b/132294857).
+ if (std::is_floating_point<T>::value)
+ {
+ thread_count = std::min(thread_count, 2);
+ }
+
+ const int output_batches = output_shape.Dims(0);
const int output_height = output_shape.Dims(1);
- const int output_width = output_shape.Dims(2);
- assert(output_depth == input_depth * depth_multiplier);
- assert(bias_shape.FlatSize() == output_depth);
- UNUSED_RELEASE(output_depth);
- UNUSED_RELEASE(bias_shape);
- for (int b = 0; b < batches; ++b)
+ if (thread_count == 1)
+ {
+ optimized::DepthwiseConvImpl(params, input_shape, input_data, filter_shape, filter_data,
+ bias_shape, bias_data, output_shape, output_data, 0, output_height,
+ 1);
+ return;
+ }
+
+ int thread_dim, thread_dim_size;
+ if (MultithreadAlongBatches(thread_count, output_batches))
+ {
+ thread_dim = 0;
+ thread_dim_size = output_batches;
+ }
+ else
+ {
+ thread_dim = 1;
+ thread_dim_size = output_height;
+ }
+
+ std::vector<DepthwiseConvWorkerTask<T, TS>> tasks;
+ // TODO(b/131746020) don't create new heap allocations every time.
+ // At least we make it a single heap allocation by using reserve().
+ tasks.reserve(thread_count);
+ int thread_start = 0;
+ for (int i = 0; i < thread_count; ++i)
{
- for (int out_y = 0; out_y < output_height; ++out_y)
- {
- for (int out_x = 0; out_x < output_width; ++out_x)
- {
- for (int ic = 0; ic < input_depth; ++ic)
- {
- for (int m = 0; m < depth_multiplier; m++)
- {
- const int oc = m + ic * depth_multiplier;
- const int in_x_origin = (out_x * stride_width) - pad_width;
- const int in_y_origin = (out_y * stride_height) - pad_height;
- float total = 0.f;
- for (int filter_y = 0; filter_y < filter_height; ++filter_y)
- {
- for (int filter_x = 0; filter_x < filter_width; ++filter_x)
- {
- const int in_x = in_x_origin + dilation_width_factor * filter_x;
- const int in_y = in_y_origin + dilation_height_factor * filter_y;
- // If the location is outside the bounds of the input image,
- // use zero as a default value.
- if ((in_x >= 0) && (in_x < input_width) && (in_y >= 0) && (in_y < input_height))
- {
- float input_value = input_data[Offset(input_shape, b, in_y, in_x, ic)];
- float filter_value = filter_data[Offset(filter_shape, 0, filter_y, filter_x, oc)];
- total += (input_value * filter_value);
- }
- }
- }
- float bias_value = 0.0f;
- if (bias_data)
- {
- bias_value = bias_data[oc];
- }
- output_data[Offset(output_shape, b, out_y, out_x, oc)] = ActivationFunctionWithMinMax(
- total + bias_value, output_activation_min, output_activation_max);
- }
- }
- }
- }
+ int thread_end = thread_start + (thread_dim_size - thread_start) / (thread_count - i);
+ tasks.emplace_back(params, input_shape, input_data, filter_shape, filter_data, bias_shape,
+ bias_data, output_shape, output_data, thread_start, thread_end, thread_dim);
+ thread_start = thread_end;
}
+ cpu_backend_threadpool::Execute(tasks.size(), tasks.data(), ruy_context);
}
} // namespace cker
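
The multithreaded DepthwiseConv above sizes the pool from the multiply count (roughly one thread per 8k scalar multiplications, capped by the ruy context's max_num_threads and by 2 for the float path) and then slices the chosen dimension with thread_end = thread_start + remaining / threads_left. A standalone sketch showing that this slicing covers the range exactly once in nearly equal contiguous chunks:

    // Standalone sketch of the work-partitioning arithmetic above.
    #include <iostream>

    int main()
    {
      const int thread_dim_size = 10; // e.g. output_height
      const int thread_count = 3;
      int thread_start = 0;
      for (int i = 0; i < thread_count; ++i)
      {
        // Divide the remaining rows by the number of remaining threads.
        const int thread_end = thread_start + (thread_dim_size - thread_start) / (thread_count - i);
        std::cout << "task " << i << ": [" << thread_start << ", " << thread_end << ")\n";
        thread_start = thread_end;
      }
      // Prints [0, 3), [3, 6), [6, 10): every row is assigned to exactly one task.
    }
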
diff --git a/compute/cker/include/cker/operation/Dequantize.h b/compute/cker/include/cker/operation/Dequantize.h
new file mode 100644
index 000000000..c8c2fd9d4
--- /dev/null
+++ b/compute/cker/include/cker/operation/Dequantize.h
@@ -0,0 +1,151 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_DEQUANTIZE_H__
+#define __NNFW_CKER_DEQUANTIZE_H__
+
+#include "cker/Shape.h"
+#include "cker/Types.h"
+#include "cker/neon/neon_check.h"
+
+namespace nnfw
+{
+namespace cker
+{
+
+#ifdef USE_NEON
+namespace
+{
+inline void ScaleWithNewZeroPoint(const int32x4_t input, const float32x4_t scale_dup,
+ const float32x4_t zero_times_scale_dup, float32x4_t *output)
+{
+#ifdef __ARM_FEATURE_FMA
+ *output = vfmaq_f32(zero_times_scale_dup, vcvtq_f32_s32(input), scale_dup);
+#else
+ *output = vaddq_f32(vmulq_f32(vcvtq_f32_s32(input), scale_dup), zero_times_scale_dup);
+#endif
+}
+} // namespace
+#endif // USE_NEON
+
+inline void Dequantize(const Shape &input_shape, const uint8_t *input_data,
+ const Shape &output_shape, float *output_data, const float scale,
+ const int32_t zero_point)
+{
+ const int flat_size = MatchingFlatSize(input_shape, output_shape);
+
+ int i = 0;
+#ifdef USE_NEON
+ const float32x4_t scale_dup = vdupq_n_f32(static_cast<float>(scale));
+ const float32x4_t zero_times_scale_dup = vdupq_n_f32(static_cast<float>(-zero_point * scale));
+ for (; i <= flat_size - 8; i += 8)
+ {
+ const uint8x8_t input_u8 = vld1_u8(input_data + i);
+ const uint16x8_t input_u16 = vmovl_u8(input_u8);
+ const int16x8_t input_s16 = vreinterpretq_s16_u16(input_u16);
+ const int16x4_t input_s16_low = vget_low_s16(input_s16);
+ const int16x4_t input_s16_high = vget_high_s16(input_s16);
+ const int32x4_t val_low = vmovl_s16(input_s16_low);
+ const int32x4_t val_high = vmovl_s16(input_s16_high);
+
+ float32x4_t result_low, result_high;
+ ScaleWithNewZeroPoint(val_low, scale_dup, zero_times_scale_dup, &result_low);
+ ScaleWithNewZeroPoint(val_high, scale_dup, zero_times_scale_dup, &result_high);
+
+ vst1q_f32(output_data + i, result_low);
+ vst1q_f32(output_data + i + 4, result_high);
+ }
+#endif // NEON
+ for (; i < flat_size; ++i)
+ {
+ const int32_t val = input_data[i];
+ const float result = static_cast<float>(scale * (val - zero_point));
+ output_data[i] = result;
+ }
+}
+
+inline void Dequantize(const Shape &input_shape, const int8_t *input_data,
+ const Shape &output_shape, float *output_data, const float scale,
+ const int32_t zero_point)
+{
+ const int flat_size = MatchingFlatSize(input_shape, output_shape);
+
+ int i = 0;
+#ifdef USE_NEON
+ const float32x4_t scale_dup = vdupq_n_f32(static_cast<float>(scale));
+ const float32x4_t zero_times_scale_dup = vdupq_n_f32(static_cast<float>(-zero_point * scale));
+ for (; i <= flat_size - 8; i += 8)
+ {
+ const int8x8_t input_s8 = vld1_s8(input_data + i);
+ const int16x8_t input_s16 = vmovl_s8(input_s8);
+ const int16x4_t input_s16_low = vget_low_s16(input_s16);
+ const int16x4_t input_s16_high = vget_high_s16(input_s16);
+ const int32x4_t val_low = vmovl_s16(input_s16_low);
+ const int32x4_t val_high = vmovl_s16(input_s16_high);
+
+ float32x4_t result_low, result_high;
+ ScaleWithNewZeroPoint(val_low, scale_dup, zero_times_scale_dup, &result_low);
+ ScaleWithNewZeroPoint(val_high, scale_dup, zero_times_scale_dup, &result_high);
+
+ vst1q_f32(output_data + i, result_low);
+ vst1q_f32(output_data + i + 4, result_high);
+ }
+#endif // NEON
+ for (; i < flat_size; ++i)
+ {
+ const int32_t val = input_data[i];
+ const float result = static_cast<float>(scale * (val - zero_point));
+ output_data[i] = result;
+ }
+}
+
+inline void Dequantize(const Shape &input_shape, const int16_t *input_data,
+ const Shape &output_shape, float *output_data, const float scale,
+ const int32_t zero_point)
+{
+ const int flat_size = MatchingFlatSize(input_shape, output_shape);
+
+ int i = 0;
+#ifdef USE_NEON
+ const float32x4_t scale_dup = vdupq_n_f32(static_cast<float>(scale));
+ const float32x4_t zero_times_scale_dup = vdupq_n_f32(static_cast<float>(-zero_point * scale));
+ for (; i <= flat_size - 8; i += 8)
+ {
+ const int16x4_t input_s16_low = vld1_s16(input_data + i);
+ const int16x4_t input_s16_high = vld1_s16(input_data + i + 4);
+ const int32x4_t val_low = vmovl_s16(input_s16_low);
+ const int32x4_t val_high = vmovl_s16(input_s16_high);
+
+ float32x4_t result_low, result_high;
+ ScaleWithNewZeroPoint(val_low, scale_dup, zero_times_scale_dup, &result_low);
+ ScaleWithNewZeroPoint(val_high, scale_dup, zero_times_scale_dup, &result_high);
+
+ vst1q_f32(output_data + i, result_low);
+ vst1q_f32(output_data + i + 4, result_high);
+ }
+#endif // NEON
+ for (; i < flat_size; ++i)
+ {
+ const int32_t val = input_data[i];
+ const float result = static_cast<float>(scale * (val - zero_point));
+ output_data[i] = result;
+ }
+}
+
+} // namespace cker
+} // namespace nnfw
+
+#endif // __NNFW_CKER_DEQUANTIZE_H__
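
All three Dequantize variants above compute scale * (val - zero_point); the NEON helper rewrites this as val * scale + (-zero_point * scale) so the constant term can be broadcast once (zero_times_scale_dup) and each lane needs only a multiply-add. A scalar sketch with one worked value; DequantizeOneSketch is local to the example:

    // Scalar sketch of the dequantize math above.
    #include <cassert>
    #include <cmath>
    #include <cstdint>

    float DequantizeOneSketch(uint8_t val, float scale, int32_t zero_point)
    {
      const float zero_times_scale = -zero_point * scale; // hoisted out of the loop
      return static_cast<float>(val) * scale + zero_times_scale;
    }

    int main()
    {
      // With scale = 0.5 and zero_point = 128, the quantized value 130 maps to
      // 0.5 * (130 - 128) = 1.0.
      assert(std::fabs(DequantizeOneSketch(130, 0.5f, 128) - 1.0f) < 1e-6f);
      return 0;
    }
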
diff --git a/compute/cker/include/cker/operation/ELU.h b/compute/cker/include/cker/operation/ELU.h
new file mode 100644
index 000000000..6bdd7c62e
--- /dev/null
+++ b/compute/cker/include/cker/operation/ELU.h
@@ -0,0 +1,44 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_ELU_H__
+#define __NNFW_CKER_ELU_H__
+
+#include "cker/Shape.h"
+
+#include <cmath>
+
+namespace nnfw
+{
+namespace cker
+{
+
+inline void ELU(const Shape &input_shape, const float *input_data, const Shape &output_shape,
+ float *output_data)
+{
+ const int flat_size = MatchingFlatSize(input_shape, output_shape);
+ for (int i = 0; i < flat_size; ++i)
+ {
+ const float val = input_data[i];
+ output_data[i] = val < 0.0 ? std::exp(val) - 1 : val;
+ }
+}
+
+} // namespace cker
+} // namespace nnfw
+
+#endif // __NNFW_CKER_ELU_H__
diff --git a/compute/cker/include/cker/operation/Einsum.h b/compute/cker/include/cker/operation/Einsum.h
index 3d1837f47..bb9f88f8d 100644
--- a/compute/cker/include/cker/operation/Einsum.h
+++ b/compute/cker/include/cker/operation/Einsum.h
@@ -177,7 +177,7 @@ inline Shape copyShape(const Shape &shape)
{
return Shape::ExtendedShape(shape.DimensionsCount(), shape);
}
-}
+} // namespace
class Einsum
{
@@ -274,7 +274,7 @@ public:
}
for (int i = 0; i < num_inputs; ++i)
{
- for (int label : free_labels[i])
+ for (auto &&label : free_labels[i])
{
result_labels.push_back(label);
result_shape_dims.push_back(label_to_dim_sizes[label]);
@@ -300,7 +300,7 @@ public:
{
// We inflated the output. Modify result labels accordingly.
Labels inflated_labels;
- for (int label : result_labels)
+ for (auto &&label : result_labels)
{
inflated_labels.insert(inflated_labels.end(), output_label_counts[label], label);
}
@@ -394,8 +394,8 @@ private:
for (int label = 0; label < num_labels; ++label)
{
bool removed = (_output_label_counts[label] == 0);
- bool unique = num_inputs == 1 || _input_label_counts[0][label] == 0 ||
- _input_label_counts[1][label] == 0;
+ bool unique =
+ num_inputs == 1 || _input_label_counts[0][label] == 0 || _input_label_counts[1][label] == 0;
_label_types[label] = getDimensionType(removed, unique);
}
}
@@ -483,8 +483,8 @@ private:
if (inputs[i].shape.DimensionsCount() + 1 < (int32_t)labels->size())
{
throw std::runtime_error{"Expected input " + std::to_string(i) + " to have rank at least " +
- std::to_string(labels->size() - 1) + " but got: " +
- std::to_string(inputs[i].shape.DimensionsCount())};
+ std::to_string(labels->size() - 1) +
+ " but got: " + std::to_string(inputs[i].shape.DimensionsCount())};
}
int ellipsis_axis = -1;
const int num_bcast_dims = inputs[i].shape.DimensionsCount() - labels->size() + 1;
@@ -511,7 +511,7 @@ private:
}
std::vector<bool>::iterator it_input =
- std::find(_input_has_ellipsis.begin(), _input_has_ellipsis.end(), true);
+ std::find(_input_has_ellipsis.begin(), _input_has_ellipsis.end(), true);
if (it_input == _input_has_ellipsis.end() && !_output_has_ellipsis)
{
return;
@@ -645,11 +645,11 @@ private:
// Reduce along the last axis (i.e axis 1) of the rank-2 Tensor.
const int32_t output_size =
- reshape[kBroadcasting] * reshape[kBatch] * reshape[kFree] * reshape[kContract];
+ reshape[kBroadcasting] * reshape[kBatch] * reshape[kFree] * reshape[kContract];
functor::ReduceFunctor<Eigen::ThreadPoolDevice, Reducer>::Reduce(
- device, output->shaped<T, 1>({output_size}),
- input_deduped.shaped<T, 2>({output_size, reshape[kReduce]}), Eigen::array<Index, 1>({1}),
- Reducer());
+ device, output->shaped<T, 1>({output_size}),
+ input_deduped.shaped<T, 2>({output_size, reshape[kReduce]}), Eigen::array<Index, 1>({1}),
+ Reducer());
}
bool shouldSwapFreeAndContract(const Labels &labels,
@@ -775,11 +775,11 @@ private:
Shape inflated_shape;
std::vector<int32_t> strided_shape_dims;
std::vector<int32_t> inflated_shape_dims;
- for (int label : labels)
+ for (auto &&label : labels)
{
const int32_t count = label_counts[label];
const int current_axis =
- should_inflate ? strided_shape_dims.size() : inflated_shape_dims.size();
+ should_inflate ? strided_shape_dims.size() : inflated_shape_dims.size();
const int32_t dim = input.shape.Dims(current_axis);
strided_shape_dims.push_back(dim);
inflated_shape_dims.insert(inflated_shape_dims.end(), count, dim);
@@ -879,7 +879,7 @@ private:
for (size_t i = 0; i < inputs.size(); ++i)
{
const int32_t free_axis =
- inputs[i].shape.DimensionsCount() - (swap_free_and_contract[i] ? 1 : 2);
+ inputs[i].shape.DimensionsCount() - (swap_free_and_contract[i] ? 1 : 2);
output_shape.SetDim(i + old_output_shape.DimensionsCount(), inputs[i].shape.Dims(free_axis));
}
bool adj_x = swap_free_and_contract[0];
diff --git a/compute/cker/include/cker/operation/Elementwise.h b/compute/cker/include/cker/operation/Elementwise.h
index 598a032bb..0e980f18e 100644
--- a/compute/cker/include/cker/operation/Elementwise.h
+++ b/compute/cker/include/cker/operation/Elementwise.h
@@ -66,8 +66,9 @@ inline void Rsqrt(const Shape &input_shape, const float *input_data, const Shape
}
}
-inline void Neg(const Shape &input_shape, const float *input_data, const Shape &output_shape,
- float *output_data)
+template <typename T>
+inline void Neg(const Shape &input_shape, const T *input_data, const Shape &output_shape,
+ T *output_data)
{
const int size = MatchingFlatSize(input_shape, output_shape);
for (int i = 0; i < size; i++)
@@ -86,6 +87,39 @@ inline void Log(const Shape &input_shape, const float *input_data, const Shape &
}
}
+inline void Floor(const Shape &input_shape, const float *input_data, const Shape &output_shape,
+ float *output_data)
+{
+ const int flat_size = MatchingFlatSize(input_shape, output_shape);
+
+ for (int i = 0; i < flat_size; i++)
+ {
+ output_data[i] = std::floor(input_data[i]);
+ }
+}
+
+inline void Sqrt(const Shape &input_shape, const float *input_data, const Shape &output_shape,
+ float *output_data)
+{
+ const int flat_size = MatchingFlatSize(input_shape, output_shape);
+
+ for (int i = 0; i < flat_size; i++)
+ {
+ output_data[i] = std::sqrt(input_data[i]);
+ }
+}
+
+inline void Square(const Shape &input_shape, const float *input_data, const Shape &output_shape,
+ float *output_data)
+{
+ const int flat_size = MatchingFlatSize(input_shape, output_shape);
+
+ for (int i = 0; i < flat_size; i++)
+ {
+ output_data[i] = input_data[i] * input_data[i];
+ }
+}
+
} // namespace cker
} // namespace nnfw
diff --git a/compute/cker/include/cker/operation/Fill.h b/compute/cker/include/cker/operation/Fill.h
index 14daf9839..f88c3a5fb 100644
--- a/compute/cker/include/cker/operation/Fill.h
+++ b/compute/cker/include/cker/operation/Fill.h
@@ -25,26 +25,12 @@ namespace nnfw
namespace cker
{
template <typename T>
-inline void Fill(const Shape &input_shape, int *input_data, const T value_data,
- const Shape &output_shape, T output_data)
+inline void Fill(const T *value_data, const Shape &output_shape, T *output_data)
{
- int input_size = input_shape.FlatSize();
- int output_size = 1;
- for (int i = 0; i < input_size; i++)
+ int output_size = output_shape.FlatSize();
+ for (int i = 0; i < output_size; i++)
{
- output_size *= input_data[i];
- }
-
- if (output_size == output_shape.FlatSize())
- {
- for (int i = 0; i < output_size; i++)
- {
- output_data[i] = *value_data;
- }
- }
- else
- {
- throw std::runtime_error("Cker Fill.h: output's size is not matched inferred size of output");
+ output_data[i] = *value_data;
}
}
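
Fill above now takes only the value pointer, the resolved output shape, and the output buffer; the size check against a dims tensor is gone because the output shape is assumed to be inferred before the kernel runs. A standalone sketch of the simplified contract; FillSketch is local to the example:

    // Standalone sketch of the simplified Fill contract.
    #include <iostream>
    #include <vector>

    template <typename T> void FillSketch(const T *value_data, T *output_data, int output_flat_size)
    {
      for (int i = 0; i < output_flat_size; ++i)
        output_data[i] = *value_data;
    }

    int main()
    {
      const float value = 7.f;
      std::vector<float> out(6); // e.g. a 2x3 output, flat size 6
      FillSketch(&value, out.data(), static_cast<int>(out.size()));
      std::cout << out[0] << " " << out[5] << "\n"; // 7 7
    }
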
diff --git a/compute/cker/include/cker/operation/FloorDiv.h b/compute/cker/include/cker/operation/FloorDiv.h
new file mode 100644
index 000000000..cdb2c2a8b
--- /dev/null
+++ b/compute/cker/include/cker/operation/FloorDiv.h
@@ -0,0 +1,82 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_FLOOR_DIV_H__
+#define __NNFW_CKER_FLOOR_DIV_H__
+
+#include "cker/Shape.h"
+#include "cker/Utils.h"
+#include <cmath>
+#include <functional>
+
+namespace nnfw
+{
+namespace cker
+{
+
+template <typename T>
+inline void FloorDivBroadcast(const Shape &unextended_input1_shape, const T *input1_data,
+ const Shape &unextended_input2_shape, const T *input2_data,
+ const Shape &unextended_output_shape, T *output_data)
+{
+ assert(unextended_input1_shape.DimensionsCount() <= 4);
+ assert(unextended_input2_shape.DimensionsCount() <= 4);
+ assert(unextended_output_shape.DimensionsCount() <= 4);
+ const Shape output_shape = Shape::ExtendedShape(4, unextended_output_shape);
+
+ NdArrayDesc<4> desc1;
+ NdArrayDesc<4> desc2;
+ NdArrayDescsForElementwiseBroadcast(unextended_input1_shape, unextended_input2_shape, &desc1,
+ &desc2);
+
+ for (int b = 0; b < output_shape.Dims(0); ++b)
+ {
+ for (int y = 0; y < output_shape.Dims(1); ++y)
+ {
+ for (int x = 0; x < output_shape.Dims(2); ++x)
+ {
+ for (int c = 0; c < output_shape.Dims(3); ++c)
+ {
+ auto out_idx = Offset(output_shape, b, y, x, c);
+ auto in1_idx = SubscriptToIndex(desc1, b, y, x, c);
+ auto in2_idx = SubscriptToIndex(desc2, b, y, x, c);
+ auto in1_val = input1_data[in1_idx];
+ auto in2_val = input2_data[in2_idx];
+ output_data[out_idx] = std::floor(
+ std::divides<double>()(static_cast<double>(in1_val), static_cast<double>(in2_val)));
+ }
+ }
+ }
+ }
+}
+
+template <typename T>
+inline void FloorDivElementwise(const Shape &shape, const T *input1_data, const T *input2_data,
+ T *output_data)
+{
+
+ int num_elements = shape.FlatSize();
+
+ for (int t = 0; t < num_elements; t++)
+ {
+ output_data[t] = std::floor(std::divides<double>()(static_cast<double>(input1_data[t]),
+ static_cast<double>(input2_data[t])));
+ }
+}
+
+} // namespace cker
+
+} // namespace nnfw
+#endif
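
FloorDiv above takes the floor of the exact quotient computed in double, which matches Python-style floor division rather than C++'s truncating integer division when the operands have opposite signs. A standalone sketch with worked values; FloorDivOneSketch is local to the example:

    // Standalone sketch of the per-element FloorDiv math above.
    #include <cassert>
    #include <cmath>
    #include <functional>

    template <typename T> T FloorDivOneSketch(T a, T b)
    {
      return static_cast<T>(
        std::floor(std::divides<double>()(static_cast<double>(a), static_cast<double>(b))));
    }

    int main()
    {
      assert(FloorDivOneSketch(7, 2) == 3);
      assert(FloorDivOneSketch(-7, 2) == -4); // truncating division would give -3
      assert(std::fabs(FloorDivOneSketch(-7.0f, 2.0f) - (-4.0f)) < 1e-6f);
      return 0;
    }
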
diff --git a/compute/cker/include/cker/operation/FullyConnected.h b/compute/cker/include/cker/operation/FullyConnected.h
index 4280c9ae2..71a2f19ef 100644
--- a/compute/cker/include/cker/operation/FullyConnected.h
+++ b/compute/cker/include/cker/operation/FullyConnected.h
@@ -19,10 +19,14 @@
#define __NNFW_CKER_FULLY_CONNECTED_H__
#include <ruy/context.h>
+#include "cker/operation/FullyConnectedDense16x1.h"
+#include "cker/operation/FullyConnectedSparse16x1.h"
+#include "cker/operation/optimized/Gemm.h"
#include "cker/Shape.h"
#include "cker/Types.h"
#include "cker/Utils.h"
#include "cker/TensorUtils.h"
+#include "cker/neon/neon_check.h"
namespace nnfw
{
@@ -55,6 +59,42 @@ public:
std::vector<int32_t> accum_scratch;
};
+#if defined(CKER_X86_PLATFORM)
+
+// From tensorflow/tensorflow/lite/kernels/internal/optimized/optimized_ops.h
+inline void FullyConnected(const FullyConnectedParams &params, const Shape &input_shape,
+ const float *input_data, const Shape &weights_shape,
+ const float *weights_data, const Shape &,
+ const float *optional_bias_data, const Shape &output_shape,
+ float *output_data)
+{
+ const int dims_count = weights_shape.DimensionsCount();
+ const int input_rows = weights_shape.Dims(dims_count - 1);
+ MatrixParams<float> rhs_params;
+ rhs_params.order = Order::kColMajor;
+ rhs_params.rows = input_rows;
+ rhs_params.cols = input_shape.FlatSize() / input_rows;
+ rhs_params.cache_policy = optimized::DefaultCachePolicy(params.rhs_cacheable);
+
+ MatrixParams<float> lhs_params;
+ lhs_params.order = Order::kRowMajor;
+ lhs_params.cols = weights_shape.Dims(dims_count - 1);
+ lhs_params.rows = FlatSizeSkipDim(weights_shape, dims_count - 1);
+ lhs_params.cache_policy = optimized::DefaultCachePolicy(params.lhs_cacheable);
+ MatrixParams<float> dst_params;
+ dst_params.order = Order::kColMajor;
+ dst_params.rows = output_shape.Dims(output_shape.DimensionsCount() - 1);
+ dst_params.cols = FlatSizeSkipDim(output_shape, output_shape.DimensionsCount() - 1);
+ GemmParams<float, float> gemm_params;
+ gemm_params.bias = optional_bias_data;
+ gemm_params.clamp_min = params.float_activation_min;
+ gemm_params.clamp_max = params.float_activation_max;
+ optimized::Gemm(lhs_params, weights_data, rhs_params, input_data, dst_params, output_data,
+ gemm_params);
+}
+
+#else // CKER_X86_PLATFORM
+
inline void FullyConnected(const FullyConnectedParams &params, const Shape &input_shape,
const float *input_data, const Shape &weights_shape,
const float *weights_data, const Shape &, const float *bias_data,
@@ -86,6 +126,8 @@ inline void FullyConnected(const FullyConnectedParams &params, const Shape &inpu
}
}
+#endif // CKER_X86_PLATFORM
+
inline void FullyConnected(const FullyConnectedParams &params, const Shape &input_shape,
const uint8_t *input_data, const Shape &filter_shape,
const uint8_t *filter_data, const Shape &bias_shape,
@@ -114,7 +156,7 @@ inline void FullyConnected(const FullyConnectedParams &params, const Shape &inpu
const int filter_dim_count = filter_shape.DimensionsCount();
const int batches = FlatSizeSkipDim(output_shape, output_dim_count - 1);
const int output_depth =
- MatchingDim(filter_shape, filter_dim_count - 2, output_shape, output_dim_count - 1);
+ MatchingDim(filter_shape, filter_dim_count - 2, output_shape, output_dim_count - 1);
const int accum_depth = filter_shape.Dims(filter_dim_count - 1);
for (int b = 0; b < batches; ++b)
{
@@ -208,12 +250,13 @@ inline void FullyConnectedHybrid(const FullyConnectedParams &params, const Shape
return;
}
-inline void FullyConnectedSparseWeight(const FullyConnectedParams &params, const Shape &input_shape,
- const float *input_data, const Shape &weights_shape,
- const float *weights_data, const Shape &bias_shape,
- const float *bias_data, const Shape &output_shape,
- float *output_data, int w0_size, const uint16_t *w1_segments,
- const uint16_t *w1_indices)
+inline void FullyConnectedSparseWeightRandom(const FullyConnectedParams &params,
+ const Shape &input_shape, const float *input_data,
+ const Shape &weights_shape, const float *weights_data,
+ const Shape &bias_shape, const float *bias_data,
+ const Shape &output_shape, float *output_data,
+ const uint16_t *w1_segments,
+ const uint16_t *w1_indices)
{
UNUSED_RELEASE(params);
UNUSED_RELEASE(input_shape);
@@ -225,7 +268,7 @@ inline void FullyConnectedSparseWeight(const FullyConnectedParams &params, const
const int weights_dims_count = weights_shape.DimensionsCount();
const int batches = FlatSizeSkipDim(output_shape, output_dims_count - 1);
const int output_depth =
- MatchingDim(weights_shape, weights_dims_count - 2, output_shape, output_dims_count - 1);
+ MatchingDim(weights_shape, weights_dims_count - 2, output_shape, output_dims_count - 1);
const int accum_depth = weights_shape.Dims(weights_dims_count - 1);
UNUSED_RELEASE(bias_shape);
@@ -239,13 +282,13 @@ inline void FullyConnectedSparseWeight(const FullyConnectedParams &params, const
}
for (int b = 0; b < batches; ++b)
{
- for (int idx_0 = 0; idx_0 < w0_size; ++idx_0)
+ for (int idx_0 = 0; idx_0 < output_depth; ++idx_0)
{
for (int pw1 = w1_segments[idx_0]; pw1 < w1_segments[idx_0 + 1]; ++pw1)
{
int idx_1 = w1_indices[pw1];
output_data[b * output_depth + idx_0] +=
- weights_data[pw1] * input_data[b * accum_depth + idx_1];
+ weights_data[pw1] * input_data[b * accum_depth + idx_1];
}
}
}
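
The renamed FullyConnectedSparseWeightRandom walks the weights in a CSR-like layout: w1_segments[idx_0]..w1_segments[idx_0 + 1] delimits the nonzeros owned by output unit idx_0, and w1_indices[pw1] gives the input column of the nonzero stored at weights_data[pw1]. A standalone sketch of that traversal on a tiny 2x4 matrix; the CSR reading of the format is an assumption drawn from the loop structure above:

    // Standalone sketch of the sparse accumulation loop above.
    #include <cstdint>
    #include <iostream>
    #include <vector>

    int main()
    {
      // Dense 2x4 weight matrix with zeros omitted:
      //   row 0: [1, 0, 2, 0]
      //   row 1: [0, 3, 0, 4]
      const std::vector<float> weights = {1.f, 2.f, 3.f, 4.f}; // nonzero values, row by row
      const std::vector<uint16_t> w1_segments = {0, 2, 4};     // row i owns [seg[i], seg[i+1])
      const std::vector<uint16_t> w1_indices = {0, 2, 1, 3};   // input column of each nonzero
      const std::vector<float> input = {1.f, 1.f, 1.f, 1.f};   // one batch, accum_depth = 4

      std::vector<float> output(2, 0.f);
      for (size_t row = 0; row + 1 < w1_segments.size(); ++row)
        for (uint16_t p = w1_segments[row]; p < w1_segments[row + 1]; ++p)
          output[row] += weights[p] * input[w1_indices[p]];

      std::cout << output[0] << " " << output[1] << "\n"; // 3 7
    }
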
diff --git a/compute/cker/include/cker/operation/FullyConnectedDense16x1.h b/compute/cker/include/cker/operation/FullyConnectedDense16x1.h
new file mode 100644
index 000000000..a7e9efd7f
--- /dev/null
+++ b/compute/cker/include/cker/operation/FullyConnectedDense16x1.h
@@ -0,0 +1,134 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/* Copyright (c) 2018 Mozilla
+ 2008-2011 Octasic Inc.
+ 2012-2017 Jean-Marc Valin */
+/*
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
+ CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef __NNFW_CKER_FULLY_CONNECTED_DENSE16x1_H__
+#define __NNFW_CKER_FULLY_CONNECTED_DENSE16x1_H__
+
+#include "cker/Shape.h"
+#include "cker/Types.h"
+#include "cker/Utils.h"
+#include "cker/TensorUtils.h"
+
+namespace nnfw
+{
+namespace cker
+{
+#if defined(__aarch64__) && defined(USE_NEON)
+inline void FullyConnected16x1Float32(const FullyConnectedParams &params, const Shape &input_shape,
+ const float *input_data, const Shape &weights_shape,
+ const float *weights_data, const Shape &,
+ const float *bias_data, const Shape &, float *output_data)
+{
+ int total_input_size = input_shape.FlatSize();
+ int input_size = weights_shape.Dims(1);
+ const int batch_size = total_input_size / input_size;
+ const int num_units = weights_shape.Dims(0);
+
+ float *out = output_data;
+ const float *weights = weights_data;
+ int rows = num_units;
+ int cols = input_size;
+ int col_stride = input_size;
+ const float *x = input_data;
+
+ // Output = bias if bias tensor exists.
+ if (bias_data)
+ {
+ VectorBatchVectorAssign(bias_data, num_units, batch_size, output_data);
+ }
+ else
+ {
+ ZeroVector(output_data, batch_size * num_units);
+ }
+
+ // rows : out, cols : in
+ int i, j;
+ for (i = 0; i < rows; i += 16)
+ {
+ const float *w = &weights[i * col_stride];
+
+ /* keep y[0..15] in registers for duration of inner loop */
+ float *__restrict y = &out[i];
+
+ float32x4_t y0_3 = vld1q_f32(&y[0]);
+ float32x4_t y4_7 = vld1q_f32(&y[4]);
+ float32x4_t y8_11 = vld1q_f32(&y[8]);
+ float32x4_t y12_15 = vld1q_f32(&y[12]);
+
+ for (j = 0; j < cols; j++)
+ {
+ float32x4_t wvec0_3, wvec4_7, wvec8_11, wvec12_15;
+ float32x4_t xj;
+
+ xj = vld1q_dup_f32(&x[j]);
+
+ wvec0_3 = vld1q_f32(&w[0]);
+ y0_3 = vmlaq_f32(y0_3, wvec0_3, xj);
+ wvec4_7 = vld1q_f32(&w[4]);
+ y4_7 = vmlaq_f32(y4_7, wvec4_7, xj);
+ wvec8_11 = vld1q_f32(&w[8]);
+ y8_11 = vmlaq_f32(y8_11, wvec8_11, xj);
+ wvec12_15 = vld1q_f32(&w[12]);
+ y12_15 = vmlaq_f32(y12_15, wvec12_15, xj);
+
+ w += 16;
+ }
+
+ /* save y[0..15] back to memory */
+
+ vst1q_f32(&y[0], y0_3);
+ vst1q_f32(&y[4], y4_7);
+ vst1q_f32(&y[8], y8_11);
+ vst1q_f32(&y[12], y12_15);
+ }
+ if (params.activation != FusedActivationFunctionType::kNone)
+ {
+ // Apply activation function
+ ApplyActivationToVector(output_data, batch_size * num_units, params.activation, output_data);
+ }
+}
+#endif
+} // namespace cker
+} // namespace nnfw
+#endif // __NNFW_CKER_FULLY_CONNECTED_DENSE16x1_H__
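A scalar reading of the NEON kernel above, kept as a sketch only: each outer iteration owns a block of 16 output rows and streams the input one element at a time, which is what the four vmlaq_f32 chains implement. It assumes num_units is a multiple of 16 and the block-major weight layout used by the kernel (16 consecutive weights per input column within a block).

// Scalar equivalent of the 16x1 blocked matrix-vector product (illustrative).
inline void FullyConnected16x1Ref(int rows, int cols, const float *weights, const float *x,
                                  float *y /* already holds bias or zeros */)
{
  for (int i = 0; i < rows; i += 16) // one block of 16 output rows per iteration
  {
    const float *w = &weights[i * cols]; // start of this block's weights
    for (int j = 0; j < cols; ++j)
    {
      const float xj = x[j];
      for (int k = 0; k < 16; ++k) // the NEON code does these 16 MACs as 4 x vmlaq_f32
        y[i + k] += w[k] * xj;
      w += 16; // advance to the next column of the block
    }
  }
}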
diff --git a/compute/cker/include/cker/operation/FullyConnectedSparse16x1.h b/compute/cker/include/cker/operation/FullyConnectedSparse16x1.h
new file mode 100644
index 000000000..df397f73e
--- /dev/null
+++ b/compute/cker/include/cker/operation/FullyConnectedSparse16x1.h
@@ -0,0 +1,159 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/* Copyright (c) 2018 Mozilla
+ 2008-2011 Octasic Inc.
+ 2012-2017 Jean-Marc Valin */
+/*
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
+ CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef __NNFW_CKER_FULLY_CONNECTED_SPARSE16x1_H__
+#define __NNFW_CKER_FULLY_CONNECTED_SPARSE16x1_H__
+
+#include "cker/Shape.h"
+#include "cker/Types.h"
+#include "cker/Utils.h"
+#include "cker/TensorUtils.h"
+
+namespace nnfw
+{
+namespace cker
+{
+inline void FullyConnectedSparseWeight16x1(const FullyConnectedParams &params,
+ const Shape &input_shape, const float *input_data,
+ const Shape &weights_shape, const float *weights_data,
+ const Shape &bias_shape, const float *bias_data,
+ const Shape &output_shape, float *output_data,
+ const uint16_t *w1_segments, const uint16_t *w1_indices)
+{
+ UNUSED_RELEASE(input_shape);
+
+ assert(weights_shape.DimensionsCount() == 2);
+ assert(output_shape.DimensionsCount() == 2);
+
+ const int output_dims_count = output_shape.DimensionsCount();
+ const int weights_dims_count = weights_shape.DimensionsCount();
+ const int batches = FlatSizeSkipDim(output_shape, output_dims_count - 1);
+ const int output_depth =
+ MatchingDim(weights_shape, weights_dims_count - 2, output_shape, output_dims_count - 1);
+ const int accum_depth = weights_shape.Dims(weights_dims_count - 1);
+
+ UNUSED_RELEASE(bias_shape);
+ if (bias_data)
+ {
+ VectorBatchVectorAssign(bias_data, output_depth, batches, output_data);
+ }
+ else
+ {
+ ZeroVector(output_data, batches * output_depth);
+ }
+ for (int b = 0; b < batches; ++b)
+ {
+ int depth_size = output_depth / 16;
+ for (int idx_0 = 0; idx_0 < depth_size; ++idx_0)
+#ifdef USE_NEON
+ {
+ float *__restrict y;
+ y = &output_data[b * output_depth + idx_0 * 16];
+ /* keep y[0..15] in registers for duration of inner loop */
+ float32x4_t y0_3 = vld1q_f32(&y[0]);
+ float32x4_t y4_7 = vld1q_f32(&y[4]);
+ float32x4_t y8_11 = vld1q_f32(&y[8]);
+ float32x4_t y12_15 = vld1q_f32(&y[12]);
+ for (auto pw1 = w1_segments[idx_0]; pw1 < w1_segments[idx_0 + 1]; ++pw1)
+ {
+ auto idx_1 = w1_indices[pw1];
+ float32x4_t xj = vld1q_dup_f32(&input_data[b * accum_depth + idx_1]);
+ float32x4_t wvec;
+
+ wvec = vld1q_f32(&weights_data[0]);
+ y0_3 = vmlaq_f32(y0_3, wvec, xj);
+ wvec = vld1q_f32(&weights_data[4]);
+ y4_7 = vmlaq_f32(y4_7, wvec, xj);
+ wvec = vld1q_f32(&weights_data[8]);
+ y8_11 = vmlaq_f32(y8_11, wvec, xj);
+ wvec = vld1q_f32(&weights_data[12]);
+ y12_15 = vmlaq_f32(y12_15, wvec, xj);
+
+ weights_data += 16;
+ }
+ /* save y[0..15] back to memory */
+ vst1q_f32(&y[0], y0_3);
+ vst1q_f32(&y[4], y4_7);
+ vst1q_f32(&y[8], y8_11);
+ vst1q_f32(&y[12], y12_15);
+ }
+#else
+ {
+ for (auto pw1 = w1_segments[idx_0]; pw1 < w1_segments[idx_0 + 1]; ++pw1)
+ {
+ float *__restrict y;
+ float xj;
+ auto idx_1 = w1_indices[pw1];
+ xj = input_data[b * accum_depth + idx_1];
+ y = &output_data[b * output_depth + idx_0 * 16];
+ y[0] += weights_data[0] * xj;
+ y[1] += weights_data[1] * xj;
+ y[2] += weights_data[2] * xj;
+ y[3] += weights_data[3] * xj;
+ y[4] += weights_data[4] * xj;
+ y[5] += weights_data[5] * xj;
+ y[6] += weights_data[6] * xj;
+ y[7] += weights_data[7] * xj;
+ y[8] += weights_data[8] * xj;
+ y[9] += weights_data[9] * xj;
+ y[10] += weights_data[10] * xj;
+ y[11] += weights_data[11] * xj;
+ y[12] += weights_data[12] * xj;
+ y[13] += weights_data[13] * xj;
+ y[14] += weights_data[14] * xj;
+ y[15] += weights_data[15] * xj;
+ weights_data += 16;
+ }
+ }
+#endif
+ }
+ if (params.activation != FusedActivationFunctionType::kNone)
+ {
+ // Apply activation function
+ ApplyActivationToVector(output_data, batches * output_depth, params.activation, output_data);
+ }
+}
+} // namespace cker
+} // namespace nnfw
+#endif // __NNFW_CKER_FULLY_CONNECTED_SPARSE16x1_H__
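The #else branch above spells out the data layout: w1_segments is a row-pointer array over blocks of 16 output rows, w1_indices stores the input column of each nonzero block, and every nonzero block contributes 16 consecutive weights. A compact reference under those assumptions (output_depth a multiple of 16, weight pointer restarted per batch; a sketch, not code from this patch):

#include <cstdint>

// Reference traversal of the 16x1 block-sparse format (illustrative only).
inline void SparseFC16x1Ref(int batches, int output_depth, int accum_depth, const float *input,
                            const float *weights, const uint16_t *block_segments,
                            const uint16_t *block_indices,
                            float *output /* pre-filled with bias or zeros */)
{
  for (int b = 0; b < batches; ++b)
  {
    const float *w = weights; // restart the weight stream for each batch
    for (int blk = 0; blk < output_depth / 16; ++blk)
    {
      float *y = &output[b * output_depth + blk * 16];
      for (int p = block_segments[blk]; p < block_segments[blk + 1]; ++p)
      {
        const float xj = input[b * accum_depth + block_indices[p]];
        for (int k = 0; k < 16; ++k)
          y[k] += w[k] * xj; // 16 output rows share one input element per nonzero block
        w += 16;
      }
    }
  }
}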
diff --git a/compute/cker/include/cker/operation/FusedBatchNorm.h b/compute/cker/include/cker/operation/FusedBatchNorm.h
index d17a5796b..8a97d8421 100644
--- a/compute/cker/include/cker/operation/FusedBatchNorm.h
+++ b/compute/cker/include/cker/operation/FusedBatchNorm.h
@@ -105,7 +105,7 @@ public:
float rest_size_inv = static_cast<float>(1.0f / static_cast<float>(rest_size));
// This adjustment is for Bessel's correction
float rest_size_adjust =
- static_cast<float>(rest_size) / static_cast<float>(rest_size_minus_one);
+ static_cast<float>(rest_size) / static_cast<float>(rest_size_minus_one);
Eigen::Tensor<float, 1, Eigen::RowMajor> batch_mean(depth);
Eigen::Tensor<float, 1, Eigen::RowMajor> batch_variance(depth);
@@ -117,12 +117,12 @@ public:
batch_variance.device(d) = x_centered.square().sum(reduce_dims) * rest_size_inv;
auto scaling_factor = ((batch_variance + param.epsilon).rsqrt() * scale)
- .eval()
- .reshape(one_by_depth)
- .broadcast(bcast_spec);
+ .eval()
+ .reshape(one_by_depth)
+ .broadcast(bcast_spec);
auto x_scaled = x_centered * scaling_factor;
auto x_shifted =
- (x_scaled + offset.reshape(one_by_depth).broadcast(bcast_spec)).template cast<float>();
+ (x_scaled + offset.reshape(one_by_depth).broadcast(bcast_spec)).template cast<float>();
UNUSED_RELEASE(rest_size_adjust);
diff --git a/compute/cker/include/cker/operation/Helper/BCast.h b/compute/cker/include/cker/operation/Helper/BCast.h
index a0abf2935..211db98ce 100644
--- a/compute/cker/include/cker/operation/Helper/BCast.h
+++ b/compute/cker/include/cker/operation/Helper/BCast.h
@@ -22,7 +22,7 @@
* ToDo : This file will be moved into upper folder when integrate with other
* custom operations.
* And It should merged with EinsumHelper's BCast.
-**/
+ **/
#include "cker/Shape.h"
#include "cker/eigen/EigenSupport.h"
@@ -393,7 +393,7 @@ public:
BCast(const Vec &x, const Vec &y, const bool fewer_dims_optimization = true,
const bool return_flattened_batch_indices = false)
- : BCastList<2>({x, y}, fewer_dims_optimization, return_flattened_batch_indices)
+ : BCastList<2>({x, y}, fewer_dims_optimization, return_flattened_batch_indices)
{
}
diff --git a/compute/cker/include/cker/operation/Helper/MatmulBCast.h b/compute/cker/include/cker/operation/Helper/MatmulBCast.h
index b80ccc0d0..b7d639433 100644
--- a/compute/cker/include/cker/operation/Helper/MatmulBCast.h
+++ b/compute/cker/include/cker/operation/Helper/MatmulBCast.h
@@ -62,13 +62,13 @@ public:
if (!_batch_bcast->IsValid())
return;
- auto x_reshaped = _batch_bcast->x_reshape();
- auto y_reshaped = _batch_bcast->y_reshape();
+ const auto &x_reshaped = _batch_bcast->x_reshape();
+ const auto &y_reshaped = _batch_bcast->y_reshape();
auto output_shape = _batch_bcast->output_shape();
_x_batch_size = std::accumulate(x_reshaped.cbegin(), x_reshaped.cend(), INT32_C(1),
std::multiplies<int32_t>());
- _y_batch_size = std::accumulate(x_reshaped.cbegin(), x_reshaped.cend(), INT32_C(1),
+ _y_batch_size = std::accumulate(y_reshaped.cbegin(), y_reshaped.cend(), INT32_C(1),
std::multiplies<int32_t>());
_output_shape.ReplaceWith(output_shape.size(), output_shape.data());
_output_batch_size = _output_shape.FlatSize();
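The second hunk above is a behavioral fix, not reformatting: _y_batch_size was accumulated over x_reshape() instead of y_reshape(). In isolation, the intended computation is a product over the operand's own reshaped batch dimensions; a hedged sketch with std::vector standing in for the Vec type:

#include <cstdint>
#include <functional>
#include <numeric>
#include <vector>

// Batch size of a broadcast operand = product of its own reshaped batch dimensions.
inline int32_t BatchSizeOf(const std::vector<int32_t> &reshaped_dims)
{
  return std::accumulate(reshaped_dims.cbegin(), reshaped_dims.cend(), INT32_C(1),
                         std::multiplies<int32_t>());
}

// Intended usage mirroring the fixed code:
//   _x_batch_size = BatchSizeOf(x_reshaped);
//   _y_batch_size = BatchSizeOf(y_reshaped); // previously computed from x_reshaped by mistake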
diff --git a/compute/cker/include/cker/operation/Helper/RandomDistributions.h b/compute/cker/include/cker/operation/Helper/RandomDistributions.h
index baeafd7c9..f16e5019d 100644
--- a/compute/cker/include/cker/operation/Helper/RandomDistributions.h
+++ b/compute/cker/include/cker/operation/Helper/RandomDistributions.h
@@ -168,7 +168,7 @@ public:
// Must have lo < hi
UniformDistribution(int32_t lo, int32_t hi)
- : lo_(lo), range_(static_cast<uint32_t>(hi) - static_cast<uint32_t>(lo))
+ : lo_(lo), range_(static_cast<uint32_t>(hi) - static_cast<uint32_t>(lo))
{
}
@@ -207,7 +207,7 @@ public:
// Must have lo < hi
UniformDistribution(int64_t lo, int64_t hi)
- : lo_(lo), range_(static_cast<uint64_t>(hi) - static_cast<uint64_t>(lo))
+ : lo_(lo), range_(static_cast<uint64_t>(hi) - static_cast<uint64_t>(lo))
{
}
@@ -291,22 +291,22 @@ public:
template <typename Generator>
class UniformFullIntDistribution<Generator, int32_t>
- : public UniformFullIntDistribution32<Generator, int32_t>
+ : public UniformFullIntDistribution32<Generator, int32_t>
{
};
template <typename Generator>
class UniformFullIntDistribution<Generator, uint32_t>
- : public UniformFullIntDistribution32<Generator, uint32_t>
+ : public UniformFullIntDistribution32<Generator, uint32_t>
{
};
template <typename Generator>
class UniformFullIntDistribution<Generator, int64_t>
- : public UniformFullIntDistribution64<Generator, int64_t>
+ : public UniformFullIntDistribution64<Generator, int64_t>
{
};
template <typename Generator>
class UniformFullIntDistribution<Generator, uint64_t>
- : public UniformFullIntDistribution64<Generator, uint64_t>
+ : public UniformFullIntDistribution64<Generator, uint64_t>
{
};
@@ -324,7 +324,7 @@ public:
PHILOX_DEVICE_INLINE
explicit SingleSampleAdapter(Generator *gen)
- : generator_(gen), used_result_index_(Generator::kResultElementCount)
+ : generator_(gen), used_result_index_(Generator::kResultElementCount)
{
}
@@ -615,8 +615,8 @@ class TruncatedNormalDistribution<SingleSampleGenerator, double>
public:
// The number of elements that will be returned.
static constexpr int kResultElementCount = (SingleSampleGenerator::kNativeElementCount > 1)
- ? SingleSampleGenerator::kNativeElementCount / 2
- : 1;
+ ? SingleSampleGenerator::kNativeElementCount / 2
+ : 1;
// Cost of generation of a single element (in cycles).
static constexpr int kElementCost = 90;
// Indicate that this distribution may take variable number of samples
@@ -772,7 +772,7 @@ PHILOX_DEVICE_INLINE double Uint64ToDouble(uint32_t x0, uint32_t x1)
}
} // namespace random
-} // namespace tensorflow
-}
+} // namespace cker
+} // namespace nnfw
#endif // __NNFW_CKER_HELPER_RANDOM_DISTRIBUTIONS_H__
diff --git a/compute/cker/include/cker/operation/Helper/RandomOp.h b/compute/cker/include/cker/operation/Helper/RandomOp.h
index 7dc51fe94..6b7049ddf 100644
--- a/compute/cker/include/cker/operation/Helper/RandomOp.h
+++ b/compute/cker/include/cker/operation/Helper/RandomOp.h
@@ -47,6 +47,6 @@ template <class Distribution> struct FillPhiloxRandom<CPUDevice, Distribution>
};
} // namespace functor
-} // namespace tensorflow
-}
+} // namespace cker
+} // namespace nnfw
#endif // __NNFW_CKER_HELPER_RANDOM_OP_H__
diff --git a/compute/cker/include/cker/operation/Helper/RandomOpCpu.h b/compute/cker/include/cker/operation/Helper/RandomOpCpu.h
index 85d267723..c99f69709 100644
--- a/compute/cker/include/cker/operation/Helper/RandomOpCpu.h
+++ b/compute/cker/include/cker/operation/Helper/RandomOpCpu.h
@@ -109,7 +109,7 @@ template <class Distribution> struct FillPhiloxRandomTask<Distribution, true>
{
const int kGroupSize = Distribution::kResultElementCount;
static const int kGeneratorSkipPerOutputGroup =
- kGroupSize * kReservedSamplesPerOutput / PhiloxRandom::kResultElementCount;
+ kGroupSize * kReservedSamplesPerOutput / PhiloxRandom::kResultElementCount;
int64_t offset = 0;
@@ -157,7 +157,7 @@ operator()(random::PhiloxRandom gen, typename Distribution::ResultElementType *d
} // namespace functor
-} // end namespace tensorflow
-}
+} // namespace cker
+} // namespace nnfw
#endif // __NNFW_CKER_HELPER_RANDOM_OP_CPU_H__
diff --git a/compute/cker/include/cker/operation/Helper/Tensor.h b/compute/cker/include/cker/operation/Helper/Tensor.h
index e6ac008a5..ec29a15c3 100644
--- a/compute/cker/include/cker/operation/Helper/Tensor.h
+++ b/compute/cker/include/cker/operation/Helper/Tensor.h
@@ -29,58 +29,58 @@ template <typename T, int NDIMS = 1, typename IndexType = Eigen::DenseIndex> str
{
// Rank-<NDIMS> tensor of scalar type T.
typedef Eigen::TensorMap<Eigen::Tensor<T, NDIMS, Eigen::RowMajor, IndexType>, Eigen::Aligned>
- Tensor;
+ Tensor;
typedef Eigen::TensorMap<Eigen::Tensor<const T, NDIMS, Eigen::RowMajor, IndexType>,
Eigen::Aligned>
- ConstTensor;
+ ConstTensor;
// Unaligned Rank-<NDIMS> tensor of scalar type T.
typedef Eigen::TensorMap<Eigen::Tensor<T, NDIMS, Eigen::RowMajor, IndexType>> UnalignedTensor;
typedef Eigen::TensorMap<Eigen::Tensor<const T, NDIMS, Eigen::RowMajor, IndexType>>
- UnalignedConstTensor;
+ UnalignedConstTensor;
typedef Eigen::TensorMap<Eigen::Tensor<T, NDIMS, Eigen::RowMajor, int>, Eigen::Aligned>
- Tensor32Bit;
+ Tensor32Bit;
// Scalar tensor (implemented as a rank-0 tensor) of scalar type T.
typedef Eigen::TensorMap<Eigen::TensorFixedSize<T, Eigen::Sizes<>, Eigen::RowMajor, IndexType>,
Eigen::Aligned>
- Scalar;
+ Scalar;
typedef Eigen::TensorMap<
- Eigen::TensorFixedSize<const T, Eigen::Sizes<>, Eigen::RowMajor, IndexType>, Eigen::Aligned>
- ConstScalar;
+ Eigen::TensorFixedSize<const T, Eigen::Sizes<>, Eigen::RowMajor, IndexType>, Eigen::Aligned>
+ ConstScalar;
// Unaligned Scalar tensor of scalar type T.
typedef Eigen::TensorMap<Eigen::TensorFixedSize<T, Eigen::Sizes<>, Eigen::RowMajor, IndexType>>
- UnalignedScalar;
+ UnalignedScalar;
typedef Eigen::TensorMap<
- Eigen::TensorFixedSize<const T, Eigen::Sizes<>, Eigen::RowMajor, IndexType>>
- UnalignedConstScalar;
+ Eigen::TensorFixedSize<const T, Eigen::Sizes<>, Eigen::RowMajor, IndexType>>
+ UnalignedConstScalar;
// Rank-1 tensor (vector) of scalar type T.
typedef Eigen::TensorMap<Eigen::Tensor<T, 1, Eigen::RowMajor, IndexType>, Eigen::Aligned> Flat;
typedef Eigen::TensorMap<Eigen::Tensor<const T, 1, Eigen::RowMajor, IndexType>, Eigen::Aligned>
- ConstFlat;
+ ConstFlat;
typedef Eigen::TensorMap<Eigen::Tensor<T, 1, Eigen::RowMajor, IndexType>, Eigen::Aligned> Vec;
typedef Eigen::TensorMap<Eigen::Tensor<const T, 1, Eigen::RowMajor, IndexType>, Eigen::Aligned>
- ConstVec;
+ ConstVec;
// Unaligned Rank-1 tensor (vector) of scalar type T.
typedef Eigen::TensorMap<Eigen::Tensor<T, 1, Eigen::RowMajor, IndexType>> UnalignedFlat;
typedef Eigen::TensorMap<Eigen::Tensor<const T, 1, Eigen::RowMajor, IndexType>>
- UnalignedConstFlat;
+ UnalignedConstFlat;
typedef Eigen::TensorMap<Eigen::Tensor<T, 1, Eigen::RowMajor, IndexType>> UnalignedVec;
typedef Eigen::TensorMap<Eigen::Tensor<const T, 1, Eigen::RowMajor, IndexType>> UnalignedConstVec;
// Rank-2 tensor (matrix) of scalar type T.
typedef Eigen::TensorMap<Eigen::Tensor<T, 2, Eigen::RowMajor, IndexType>, Eigen::Aligned> Matrix;
typedef Eigen::TensorMap<Eigen::Tensor<const T, 2, Eigen::RowMajor, IndexType>, Eigen::Aligned>
- ConstMatrix;
+ ConstMatrix;
// Unaligned Rank-2 tensor (matrix) of scalar type T.
typedef Eigen::TensorMap<Eigen::Tensor<T, 2, Eigen::RowMajor, IndexType>> UnalignedMatrix;
typedef Eigen::TensorMap<Eigen::Tensor<const T, 2, Eigen::RowMajor, IndexType>>
- UnalignedConstMatrix;
+ UnalignedConstMatrix;
};
typedef typename TTypes<float, 1>::Tensor32Bit::Index Index32;
diff --git a/compute/cker/include/cker/operation/InstanceNorm.h b/compute/cker/include/cker/operation/InstanceNorm.h
index 6445e8a2b..8fa8b03bc 100644
--- a/compute/cker/include/cker/operation/InstanceNorm.h
+++ b/compute/cker/include/cker/operation/InstanceNorm.h
@@ -78,8 +78,8 @@ inline void InstanceNorm(const InstanceNormParams &params, const Shape &input_sh
double input_value = input_data[Offset(output_shape, batch, height, width, channel)];
double output_value = input_value * a + b;
output_data[Offset(output_shape, batch, height, width, channel)] =
- ActivationFunctionWithMinMax((float)output_value, output_activation_min,
- output_activation_max);
+ ActivationFunctionWithMinMax((float)output_value, output_activation_min,
+ output_activation_max);
}
}
}
diff --git a/compute/cker/include/cker/operation/L2Normalize.h b/compute/cker/include/cker/operation/L2Normalize.h
index a0075c3d0..c1fca91cc 100644
--- a/compute/cker/include/cker/operation/L2Normalize.h
+++ b/compute/cker/include/cker/operation/L2Normalize.h
@@ -77,7 +77,7 @@ void L2NormalizeQuant8(L2NormParams &params, const Shape &input_shape, const uin
{
int32_t diff = *input_data - input_zero_point;
int32_t rescaled_diff = MultiplyByQuantizedMultiplierSmallerThanOneExp(
- 128 * diff, inv_l2norm_multiplier, inv_l2norm_shift);
+ 128 * diff, inv_l2norm_multiplier, inv_l2norm_shift);
int32_t unclamped_output_val = 128 + rescaled_diff;
int32_t output_val = std::min(static_cast<int32_t>(255),
std::max(static_cast<int32_t>(0), unclamped_output_val));
diff --git a/compute/cker/include/cker/operation/LSTM.h b/compute/cker/include/cker/operation/LSTM.h
new file mode 100644
index 000000000..a8f1f8ca3
--- /dev/null
+++ b/compute/cker/include/cker/operation/LSTM.h
@@ -0,0 +1,371 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_UNIDIRECTIONALSEQUENCELSTM_H__
+#define __NNFW_CKER_UNIDIRECTIONALSEQUENCELSTM_H__
+
+#include "cker/TensorUtils.h"
+#include "cker/Types.h"
+
+namespace nnfw
+{
+namespace cker
+{
+
+// LINT.IfChange
+// Calculates a single LSTM gate.
+//
+// Implements the following formula: (* is matrix multiply)
+// gate = activate(W_input * input + W_aux * aux_input +
+// W_peephole * cell + W_recurrent * prev_output + bias)
+// with layer norm:
+// gate = activate(W_norm * normalize(...) + bias) // not adding bias inside
+//
+// Activation is sigmoid except for the "cell" gate (configurable, usually tanh)
+//
+// Parameters:
+// Input vectors (to LSTM): | Size: | Optional?
+// input | n_input |
+// aux_input | n_aux_input | y (bidir LSTM)
+// Input vectors (persistent states):
+// output_state | n_output |
+// cell_state | n_cell |
+// 'Constant' inputs:
+// input_to_gate_weights | n_cell * n_input |
+// aux_input_to_gate_weights | n_cell * n_aux_input | y (bidir LSTM)
+// recurrent_to_gate_weights | n_cell * n_output |
+// cell_to_gate_weights | n_cell | y (peephole)
+// gate_bias | n_cell |
+// layer_norm_coefficients | n_cell | y (layer norm)
+// Output vector:
+// gate | n_cell |
+// Scalar parameters:
+// n_batch - batch size / number of vectors
+// n_input, n_aux_input, n_output, n_cell - size of vectors.
+// activation - activation to use.
+// is_input_all_zeros, is_aux_input_all_zeros - if input vectors are all zero.
+// use_layer_norm - if doing layer norm LSTM.
+inline void CalculateLstmGateFloat(const float *input, const float *input_to_gate_weights,
+ const float *aux_input, const float *aux_input_to_gate_weights,
+ const float *output_state,
+ const float *recurrent_to_gate_weights, const float *cell_state,
+ const float *cell_to_gate_weights,
+ const float *layer_norm_coefficients, const float *gate_bias,
+ const int n_batch, const int n_input, const int n_aux_input,
+ const int n_output, const int n_cell,
+ const FusedActivationFunctionType activation, float *gate,
+ const bool is_input_all_zeros, const bool is_aux_input_all_zeros)
+{
+ const bool use_peephole = (cell_to_gate_weights != nullptr);
+ const bool use_layer_norm = (layer_norm_coefficients != nullptr);
+
+ // Initialize scratch buffers with bias for regular lstm or initialize with
+ // zero for layer norm lstm.
+ if (use_layer_norm)
+ {
+ std::fill_n(gate, n_cell * n_batch, 0.0f);
+ }
+ else
+ {
+ VectorBatchVectorAssign(gate_bias, n_cell, n_batch, gate);
+ }
+ // For each batch and cell: compute input_weight * input.
+ // Skip if input is all zeros.
+ if (!is_input_all_zeros)
+ {
+ MatrixBatchVectorMultiplyAccumulate(input_to_gate_weights, n_cell, n_input, input, n_batch,
+ gate, /*result_stride=*/1);
+ }
+ // For each batch and cell: compute aux_input_weight * aux_input.
+ // Skip if auxiliary input is not available or all zeros.
+ if (!is_aux_input_all_zeros)
+ {
+ MatrixBatchVectorMultiplyAccumulate(aux_input_to_gate_weights, n_cell, n_aux_input, aux_input,
+ n_batch, gate, /*result_stride=*/1);
+ }
+ // For each batch and cell: compute recurrent_weight * output_state.
+ MatrixBatchVectorMultiplyAccumulate(recurrent_to_gate_weights, n_cell, n_output, output_state,
+ n_batch, gate, /*result_stride=*/1);
+ // For each batch and cell: compute cell_weight .* cell_state (peephole LSTM)
+ if (use_peephole)
+ {
+ VectorBatchVectorCwiseProductAccumulate(cell_to_gate_weights, n_cell, cell_state, n_batch,
+ gate);
+ }
+ // Do layer normalization (if layer norm LSTM)
+ if (use_layer_norm)
+ {
+ MeanStddevNormalization(gate, gate, n_cell, n_batch);
+ VectorBatchVectorCwiseProduct(layer_norm_coefficients, n_cell, gate, n_batch, gate);
+ VectorBatchVectorAdd(gate_bias, n_cell, n_batch, gate);
+ }
+ // Apply activation
+ ApplyActivationToVector(gate, n_batch * n_cell, activation, gate);
+}
+
+// Updates the LSTM cell state, used by both float and hybrid LSTM versions.
+//
+// Implements the following formula:
+// cell_state_new = clip(forget_gate * cell_state + input_gate * cell_gate)
+//
+// With CIFG LSTM, input gate is replaced by (1-forget_gate).
+//
+// Parameters:
+// - n_batch, n_cell: sizes of vectors
+// - cell_state: input/output vector, size n_batch*n_cell
+// - input_gate: input vector, size n_batch*n_cell.
+// - forget_gate: input/scratch vector, size n_batch*n_cell, modified with CIFG
+// - cell_gate: input vector, size n_batch*n_cell.
+// - use_cifg: use 1-forget_gate instead of input_gate.
+// - clip: if > 0, clip the resulting cell state to [-clip, +clip].
+void UpdateLstmCellFloat(int n_batch, int n_cell, float *cell_state, const float *input_gate,
+ float *forget_gate, const float *cell_gate, bool use_cifg, float clip)
+{
+ // Define variable for 4th argument to avoid warning
+ // Compiler warning: passing argument 4 to restrict-qualified parameter aliases with argument 2
+ const float *cwise_product_rhs = cell_state;
+ VectorVectorCwiseProduct(forget_gate, cwise_product_rhs, n_batch * n_cell, cell_state);
+
+ if (use_cifg)
+ {
+ // With CIFG, input_gate = 1-forget_gate. Use the forget_gate array as
+ // scratch, as input_gate array is not allocated in this case. (Be careful
+ // not to write to the scratch before reading the forget gate data.)
+ float *scratch = forget_gate;
+ Sub1Vector(forget_gate, n_batch * n_cell, scratch);
+ VectorVectorCwiseProductAccumulate(cell_gate, scratch, n_batch * n_cell, cell_state);
+ }
+ else
+ {
+ VectorVectorCwiseProductAccumulate(cell_gate, input_gate, n_batch * n_cell, cell_state);
+ }
+ if (clip > 0.0f)
+ {
+ CwiseClipping(cell_state, n_batch * n_cell, clip);
+ }
+}
+
+// Calculates the output state tensor of an LSTM step.
+//
+// Implements the following formula:
+// output_no_projection = output_gate .* activate(cell_state)
+// (elementwise vector product)
+// If no projection is used:
+// output = output_state = output_no_projection
+// With projection:
+// output = output_state = clip(W*output_no_projection + bias)
+//
+// Output might have a different 'stride' than n_batch, so we need to copy.
+//
+// Parameters:
+// - n_batch: batches: the number of distinct vectors in each array.
+// - n_cell, n_output: sizes of vectors.
+// - cell_state, output_gate: input vectors, size n_batch*n_cell.
+// - projection_weights, projection_weights_scale, projection_bias:
+// constant inputs, describing projection matrix and bias.
+// - proj_clip: if > 0, clip the output of the projection.
+//  - output_state: output vector, size n_batch*n_output. Must be contiguous.
+// - scratch: scratch area, size n_batch*n_cell.
+void CalculateLstmOutputFloat(int n_batch, int n_cell, int n_output, const float *cell_state,
+ const float *output_gate, FusedActivationFunctionType activation,
+ const float *projection_weights, const float *projection_bias,
+ const float proj_clip, float *output_state, float *scratch)
+{
+ ApplyActivationToVector(cell_state, n_batch * n_cell, activation, scratch);
+
+ // Define variable for 4th argument to avoid warning
+ // Compiler warning: passing argument 4 to restrict-qualified parameter aliases with argument 2
+ const float *cwise_product_rhs = scratch;
+ VectorVectorCwiseProduct(output_gate, cwise_product_rhs, n_batch * n_cell, scratch);
+
+ const bool use_projection = (projection_weights != nullptr);
+ const bool use_projection_bias = (projection_bias != nullptr);
+
+ if (use_projection)
+ {
+ if (use_projection_bias)
+ {
+ VectorBatchVectorAssign(projection_bias, n_output, n_batch, output_state);
+ }
+ else
+ {
+ std::fill_n(output_state, n_batch * n_output, 0.0f);
+ }
+ MatrixBatchVectorMultiplyAccumulate(projection_weights, n_output, n_cell, scratch, n_batch,
+ output_state, /*result_stride=*/1);
+ if (proj_clip > 0.0f)
+ {
+ CwiseClipping(output_state, n_batch * n_output, proj_clip);
+ }
+ }
+ else
+ {
+ std::copy_n(scratch, n_batch * n_output, output_state);
+ }
+}
+
+// Performs an LSTM batch inference step for input specified by input_ptr.
+// The LSTM cell is specified by the pointers to its weights (*_weights_ptr) and
+// biases (*_bias_ptr), and buffers (*_scratch), along with additional
+// parameters:
+// - params: various LSTM params including activation, clipping, etc.,
+// - n_batch: size of batch,
+// - n_cell: number of cells (or units),
+// - n_input: the input size,
+// - n_aux_input: the auxiliary input size.
+// - n_output: the output size.
+// - output_batch_leading_dim: the leading dimension of the output buffer.
+//
+// Input of size 'n_batch * n_input':
+// input_ptr
+// Input of size 'n_batch * n_aux_input':
+// aux_input_ptr - optional (can be nullptr)
+//
+// LSTM weights:
+// Input weights of size 'n_cell * n_input':
+// input_to_input_weights - optional
+// input_to_forget_weights
+// input_to_cell_weights
+// input_to_output_weights
+// Auxiliary input weights of size 'n_cell * n_aux_input':
+// aux_input_to_input_weights - optional
+// aux_input_to_forget_weights - optional
+// aux_input_to_cell_weights - optional
+// aux_input_to_output_weights - optional
+// Recurrent weights of size 'n_cell * n_output':
+// recurrent_to_input_weights - optional
+// recurrent_to_forget_weights
+// recurrent_to_cell_weights
+//   recurrent_to_output_weights
+// Peephole weights of size 'n_cell', representing diagonal matrices.
+// cell_to_input_weights - optional
+//   cell_to_forget_weights - optional
+// cell_to_output_weights - optional
+// Projection weights of size 'n_output * n_cell'
+// projection_weights_ptr - optional
+// Gate biases of size 'n_cell':
+// input_gate_bias_ptr - optional
+// forget_gate_bias_ptr
+// cell_gate_bias_ptr
+// output_gate_bias_ptr
+//
+// Layer norm coefficients of size 'n_cell', representing diagonal matrices.
+// input_layer_norm_coefficients_ptr - optional
+// forget_layer_norm_coefficients_ptr - optional
+// cell_layer_norm_coefficients_ptr - optional
+// output_layer_norm_coefficients_ptr - optional
+//
+// The pointers to the cell and output state and the output are updated.
+//
+// The pointers input_ptr, aux_input_ptr, and output_ptr point to data aligned
+// in batch_major order, and each step processes batch_size many inputs from
+// input_ptr, and updates batch_size many cell and output states.
+//
+// The output_batch_dim is output.shape[-1], i.e. the outermost dimension of the
+// output tensor, and in most cases will be equal to n_output. It is usually not
+// when we want to store the LSTM output into a slice of the output tensor, e.g.
+// for bidirectional LSTMs with merge_outputs. In this case, the batched
+// operations cannot be used since they assume that the batched outputs are
+// contiguous, and we manually loop over the batched outputs.
+// LINT.IfChange
+inline void LstmStepFloat(
+ const float *input_ptr, const float *input_to_input_weights_ptr,
+ const float *input_to_forget_weights_ptr, const float *input_to_cell_weights_ptr,
+ const float *input_to_output_weights_ptr, const float *aux_input_ptr,
+ const float *aux_input_to_input_weights_ptr, const float *aux_input_to_forget_weights_ptr,
+ const float *aux_input_to_cell_weights_ptr, const float *aux_input_to_output_weights_ptr,
+ const float *recurrent_to_input_weights_ptr, const float *recurrent_to_forget_weights_ptr,
+ const float *recurrent_to_cell_weights_ptr, const float *recurrent_to_output_weights_ptr,
+ const float *cell_to_input_weights_ptr, const float *cell_to_forget_weights_ptr,
+ const float *cell_to_output_weights_ptr, const float *input_layer_norm_coefficients_ptr,
+ const float *forget_layer_norm_coefficients_ptr, const float *cell_layer_norm_coefficients_ptr,
+ const float *output_layer_norm_coefficients_ptr, const float *input_gate_bias_ptr,
+ const float *forget_gate_bias_ptr, const float *cell_gate_bias_ptr,
+ const float *output_gate_bias_ptr, const float *projection_weights_ptr,
+ const float *projection_bias_ptr, const LSTMParams *params, int n_batch, int n_cell, int n_input,
+ int n_aux_input, int n_output, int output_batch_leading_dim, float *output_state_ptr,
+ float *cell_state_ptr, float *scratch0, float *scratch1, float *scratch2, float *scratch3,
+ float *output_ptr)
+{
+ // Since we have already checked that weights are all there or none, we can
+  // check the existence of only one to get the condition.
+ const bool use_cifg = (input_to_input_weights_ptr == nullptr);
+
+ // Make named scratch buffers.
+ float *input_gate_scratch = scratch0;
+ float *forget_gate_scratch = scratch1;
+ float *cell_gate_scratch = scratch2;
+ float *output_gate_scratch = scratch3;
+
+ // Check if inputs are all zeros so we can skip some computations.
+ const bool is_input_all_zeros = IsZeroVector(input_ptr, n_batch * n_input);
+ const bool is_aux_input_all_zeros =
+ (aux_input_ptr == nullptr || IsZeroVector(aux_input_ptr, n_batch * n_aux_input));
+ if (!use_cifg)
+ {
+ // Calculate the input gate. (If not CIFG.)
+ CalculateLstmGateFloat(input_ptr, input_to_input_weights_ptr, aux_input_ptr,
+ aux_input_to_input_weights_ptr, output_state_ptr,
+ recurrent_to_input_weights_ptr, cell_state_ptr,
+ cell_to_input_weights_ptr, input_layer_norm_coefficients_ptr,
+ input_gate_bias_ptr, n_batch, n_input, n_aux_input, n_output, n_cell,
+ /*activation=kTfLiteActSigmoid*/ FusedActivationFunctionType::kSigmoid,
+ input_gate_scratch, is_input_all_zeros, is_aux_input_all_zeros);
+ }
+ // Calculate the forget gate.
+ CalculateLstmGateFloat(input_ptr, input_to_forget_weights_ptr, aux_input_ptr,
+ aux_input_to_forget_weights_ptr, output_state_ptr,
+ recurrent_to_forget_weights_ptr, cell_state_ptr,
+ cell_to_forget_weights_ptr, forget_layer_norm_coefficients_ptr,
+ forget_gate_bias_ptr, n_batch, n_input, n_aux_input, n_output, n_cell,
+ /*activation=kTfLiteActSigmoid*/ FusedActivationFunctionType::kSigmoid,
+ forget_gate_scratch, is_input_all_zeros, is_aux_input_all_zeros);
+ // Calculate the cell update gate.
+ CalculateLstmGateFloat(
+ input_ptr, input_to_cell_weights_ptr, aux_input_ptr, aux_input_to_cell_weights_ptr,
+ output_state_ptr, recurrent_to_cell_weights_ptr, /*cell_state=*/nullptr,
+ /*cell_to_gate_weights=*/nullptr, cell_layer_norm_coefficients_ptr, cell_gate_bias_ptr, n_batch,
+ n_input, n_aux_input, n_output, n_cell, params->activation, cell_gate_scratch,
+ is_input_all_zeros, is_aux_input_all_zeros);
+ // Update the cell state.
+ UpdateLstmCellFloat(n_batch, n_cell, cell_state_ptr, input_gate_scratch, forget_gate_scratch,
+ cell_gate_scratch, use_cifg, params->cell_clip);
+ // Calculate output gate.
+ CalculateLstmGateFloat(input_ptr, input_to_output_weights_ptr, aux_input_ptr,
+ aux_input_to_output_weights_ptr, output_state_ptr,
+ recurrent_to_output_weights_ptr, cell_state_ptr,
+ cell_to_output_weights_ptr, output_layer_norm_coefficients_ptr,
+ output_gate_bias_ptr, n_batch, n_input, n_aux_input, n_output, n_cell,
+ /*activation=kTfLiteActSigmoid*/ FusedActivationFunctionType::kSigmoid,
+ output_gate_scratch, is_input_all_zeros, is_aux_input_all_zeros);
+ // Update the output state.
+ CalculateLstmOutputFloat(n_batch, n_cell, n_output, cell_state_ptr, output_gate_scratch,
+ params->activation, projection_weights_ptr, projection_bias_ptr,
+ params->proj_clip, output_state_ptr, scratch2);
+ // Copy output state to the output. Note that the output's rows may not be
+ // contiguous (output_batch_leading_dim != n_output).
+ for (int b = 0; b < n_batch; b++)
+ {
+ std::copy_n(output_state_ptr + b * n_output, n_output,
+ output_ptr + b * output_batch_leading_dim);
+ }
+}
+
+} // namespace cker
+} // namespace nnfw
+
+#endif // __NNFW_CKER_UNIDIRECTIONALSEQUENCELSTM_H__
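As a reading aid for CalculateLstmGateFloat, the gate formula in its header comment reduces, for a single batch with no layer norm and no auxiliary input, to the scalar loop below. This is a sketch only: sigmoid stands in for the configurable activation, and the peephole term is applied only when its weights exist.

#include <cmath>

// gate[c] = sigmoid(bias[c] + sum_i W_in[c,i]*input[i] + sum_o W_rec[c,o]*prev_output[o]
//                   + (peephole ? W_cell[c]*cell[c] : 0))   for c in [0, n_cell)
inline void LstmGateRefSingleBatch(const float *input, int n_input, const float *prev_output,
                                   int n_output, const float *cell, const float *w_input,
                                   const float *w_recurrent, const float *w_cell /* nullable */,
                                   const float *bias, int n_cell, float *gate)
{
  for (int c = 0; c < n_cell; ++c)
  {
    float acc = bias[c];
    for (int i = 0; i < n_input; ++i)
      acc += w_input[c * n_input + i] * input[i];
    for (int o = 0; o < n_output; ++o)
      acc += w_recurrent[c * n_output + o] * prev_output[o];
    if (w_cell != nullptr)
      acc += w_cell[c] * cell[c]; // peephole: diagonal weight times cell state
    gate[c] = 1.0f / (1.0f + std::exp(-acc)); // sigmoid activation
  }
}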
diff --git a/compute/cker/include/cker/operation/LeakyReLU.h b/compute/cker/include/cker/operation/LeakyReLU.h
new file mode 100644
index 000000000..e12d01bba
--- /dev/null
+++ b/compute/cker/include/cker/operation/LeakyReLU.h
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_LEAKY_RELU_H__
+#define __NNFW_CKER_LEAKY_RELU_H__
+
+#include "cker/Shape.h"
+#include "cker/Types.h"
+
+#include <cmath>
+
+namespace nnfw
+{
+namespace cker
+{
+
+inline void LeakyReLU(const LeakyReluParams &params, const Shape &input_shape,
+ const float *input_data, const Shape &output_shape, float *output_data)
+{
+ const int flat_size = MatchingFlatSize(input_shape, output_shape);
+
+ for (int i = 0; i < flat_size; i++)
+ {
+ const float val = input_data[i];
+ // Note that alpha might be > 1 or < 0, so we don't use std::max here.
+ output_data[i] = val > 0 ? val : val * params.alpha;
+ }
+}
+
+} // namespace cker
+} // namespace nnfw
+
+#endif // __NNFW_CKER_LEAKY_RELU_H__
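Because params.alpha is unconstrained (it may be greater than 1 or negative), the kernel above branches on the sign of the input rather than taking a max. A tiny self-check of that scalar rule, with values chosen for exact float arithmetic (illustrative only):

#include <cassert>

// Scalar form of the kernel above: y = x > 0 ? x : alpha * x.
inline float LeakyReluScalar(float x, float alpha) { return x > 0.0f ? x : alpha * x; }

inline void LeakyReluSanityCheck()
{
  assert(LeakyReluScalar(2.0f, 0.25f) == 2.0f);   // positive values pass through
  assert(LeakyReluScalar(-2.0f, 0.25f) == -0.5f); // negative values are scaled by alpha
  assert(LeakyReluScalar(-2.0f, -0.5f) == 1.0f);  // alpha < 0 flips the sign, hence no std::max
}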
diff --git a/compute/cker/include/cker/operation/LogSoftMax.h b/compute/cker/include/cker/operation/LogSoftMax.h
index 326a44f0c..eb7bdd900 100644
--- a/compute/cker/include/cker/operation/LogSoftMax.h
+++ b/compute/cker/include/cker/operation/LogSoftMax.h
@@ -71,7 +71,7 @@ inline void LogSoftmax(const SoftmaxParams &params, const Shape &input_shape,
for (int c = 0; c < depth; ++c)
{
output_data[(i * depth + c) * inner_size + j] =
- (input_data[(i * depth + c) * inner_size + j] - max) * beta - log_sum;
+ (input_data[(i * depth + c) * inner_size + j] - max) * beta - log_sum;
}
}
}
@@ -124,10 +124,10 @@ inline void LogSoftmax(const SoftmaxParams &params, float input_scale, const Sha
for (int c = 0; c < depth; ++c)
{
const float log_prob =
- scale * input_data[(i * depth + c) * inner_size] * beta - precomputed;
+ scale * input_data[(i * depth + c) * inner_size] * beta - precomputed;
const int32_t prob_quantized = std::rint(log_prob) + params.zero_point;
output_data[(i * depth + c) * inner_size] =
- static_cast<uint8_t>(std::max(std::min(clamp_max, prob_quantized), clamp_min));
+ static_cast<uint8_t>(std::max(std::min(clamp_max, prob_quantized), clamp_min));
}
}
}
diff --git a/compute/cker/include/cker/operation/LogicalAnd.h b/compute/cker/include/cker/operation/LogicalAnd.h
new file mode 100644
index 000000000..e877f5f47
--- /dev/null
+++ b/compute/cker/include/cker/operation/LogicalAnd.h
@@ -0,0 +1,80 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_LOGICAL_AND_H__
+#define __NNFW_CKER_LOGICAL_AND_H__
+
+#include "cker/Shape.h"
+#include "cker/Utils.h"
+
+namespace nnfw
+{
+namespace cker
+{
+
+template <typename T>
+inline void LogicalAndBroadcast(const Shape &unextended_input1_shape, const T *input1_data,
+ const Shape &unextended_input2_shape, const T *input2_data,
+ const Shape &unextended_output_shape, T *output_data)
+{
+ assert(unextended_input1_shape.DimensionsCount() <= 4);
+ assert(unextended_input2_shape.DimensionsCount() <= 4);
+ assert(unextended_output_shape.DimensionsCount() <= 4);
+ const Shape output_shape = Shape::ExtendedShape(4, unextended_output_shape);
+
+ NdArrayDesc<4> desc1;
+ NdArrayDesc<4> desc2;
+ NdArrayDescsForElementwiseBroadcast(unextended_input1_shape, unextended_input2_shape, &desc1,
+ &desc2);
+
+ for (int b = 0; b < output_shape.Dims(0); ++b)
+ {
+ for (int y = 0; y < output_shape.Dims(1); ++y)
+ {
+ for (int x = 0; x < output_shape.Dims(2); ++x)
+ {
+ for (int c = 0; c < output_shape.Dims(3); ++c)
+ {
+ auto out_idx = Offset(output_shape, b, y, x, c);
+ auto in1_idx = SubscriptToIndex(desc1, b, y, x, c);
+ auto in2_idx = SubscriptToIndex(desc2, b, y, x, c);
+ auto in1_val = input1_data[in1_idx];
+ auto in2_val = input2_data[in2_idx];
+ output_data[out_idx] = in1_val && in2_val;
+ }
+ }
+ }
+ }
+}
+
+template <typename T>
+inline void LogicalAndElementwise(const Shape &shape, const T *input1_data, const T *input2_data,
+ T *output_data)
+{
+
+ int num_elements = shape.FlatSize();
+
+ for (int t = 0; t < num_elements; t++)
+ {
+ output_data[t] = input1_data[t] && input2_data[t];
+ }
+}
+
+} // namespace cker
+} // namespace nnfw
+
+#endif // __NNFW_CKER_LOGICAL_AND_H__
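For the broadcast variant, the NdArrayDesc machinery maps each output coordinate back to a (possibly smaller) input coordinate. A hand-rolled 2-D case shows the effect with plain arrays rather than the cker helpers (illustrative shapes):

// input1: 2x4, input2: 1x4 broadcast across both rows, output: 2x4.
inline void LogicalAndBroadcastRows2x4(const bool (&input1)[2][4], const bool (&input2)[4],
                                       bool (&output)[2][4])
{
  for (int r = 0; r < 2; ++r)
    for (int c = 0; c < 4; ++c)
      output[r][c] = input1[r][c] && input2[c]; // input2's index ignores the broadcast dim r
}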
diff --git a/compute/cker/include/cker/operation/Logistic.h b/compute/cker/include/cker/operation/Logistic.h
index 3d3e59e55..e9907729e 100644
--- a/compute/cker/include/cker/operation/Logistic.h
+++ b/compute/cker/include/cker/operation/Logistic.h
@@ -29,12 +29,39 @@ namespace nnfw
namespace cker
{
+/**
+ * @brief Internal scalar_logistic_op operation struct
+ *
+ * @note Recent Eigen3 scalar_logistic_op returns an invalid value on ARM32 when
+ *       the input is the float value 88 (expected: 1, actual: 0).
+ *       As a workaround, we use the old version of the scalar_logistic_op internal struct.
+ *       TODO Remove this workaround
+ */
+template <typename T> struct scalar_logistic_op
+{
+ EIGEN_EMPTY_STRUCT_CTOR(scalar_logistic_op)
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T operator()(const T &x) const
+ {
+ const T one = T(1);
+ return one / (one + Eigen::numext::exp(-x));
+ }
+
+ template <typename Packet>
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet &x) const
+ {
+ const Packet one = Eigen::internal::pset1<Packet>(T(1));
+ return pdiv(one, padd(one, pexp(pnegate(x))));
+ }
+};
+
inline void Logistic(const Shape &input_shape, const float *input_data, const Shape &output_shape,
float *output_data)
{
auto input_map = MapAsVector(input_data, input_shape);
auto output_map = MapAsVector(output_data, output_shape);
- output_map.array() = input_map.array().unaryExpr(Eigen::internal::scalar_logistic_op<float>());
+
+  // Use the old version of scalar_logistic_op
+ output_map.array() = input_map.array().unaryExpr(nnfw::cker::scalar_logistic_op<float>());
}
} // namespace cker
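The workaround struct computes the textbook form 1/(1 + exp(-x)), which saturates correctly for large inputs. A standalone check of the scalar path, including the x = 88 case mentioned in the note (illustrative, not part of the patch):

#include <cassert>
#include <cmath>

// Scalar path of the workaround: 1 / (1 + exp(-x)).
inline float LogisticScalar(float x) { return 1.0f / (1.0f + std::exp(-x)); }

inline void LogisticSanityCheck()
{
  // exp(-88) is ~6e-39, so 1 + exp(-88) rounds to 1.0f and the result saturates to 1,
  // which is the value the ARM32 Eigen3 scalar_logistic_op reportedly got wrong.
  assert(LogisticScalar(88.0f) == 1.0f);
  assert(LogisticScalar(0.0f) == 0.5f);
}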
diff --git a/compute/cker/include/cker/operation/MatrixBandPart.h b/compute/cker/include/cker/operation/MatrixBandPart.h
index 5674ff3ef..ef2868455 100644
--- a/compute/cker/include/cker/operation/MatrixBandPart.h
+++ b/compute/cker/include/cker/operation/MatrixBandPart.h
@@ -43,11 +43,11 @@ void MatrixBandPart(const T num_lower_diags, const T num_upper_diags, const Shap
if (!(num_lower_diags <= row_num))
throw std::runtime_error(
- "MatrixBandPart : num_lower must be negative or less or equal to number of rows");
+ "MatrixBandPart : num_lower must be negative or less or equal to number of rows");
if (!(num_upper_diags <= col_num))
throw std::runtime_error(
- "MatrixBandPart : num_upper must be negative or less or equal to number of columns");
+ "MatrixBandPart : num_upper must be negative or less or equal to number of columns");
std::fill(output_data, output_data + output_shape.FlatSize(), 0); // output matrix init
@@ -60,9 +60,10 @@ void MatrixBandPart(const T num_lower_diags, const T num_upper_diags, const Shap
auto input = input_data + (batch * row_num * col_num + row * col_num);
const T band_start =
- num_lower_diags < 0 ? 0 : std::min(col_num, std::max(T{0}, row - num_lower_diags));
- const T band_end = num_upper_diags < 0 ? col_num : std::min(static_cast<T>(col_num),
- row + num_upper_diags + 1);
+ num_lower_diags < 0 ? 0 : std::min(col_num, std::max(T{0}, row - num_lower_diags));
+ const T band_end = num_upper_diags < 0
+ ? col_num
+ : std::min(static_cast<T>(col_num), row + num_upper_diags + 1);
for (T band_idx = band_start; band_idx < band_end; band_idx++)
{
diff --git a/compute/cker/include/cker/operation/MaxPool.h b/compute/cker/include/cker/operation/MaxPool.h
index ea3fcaca6..5dc84d368 100644
--- a/compute/cker/include/cker/operation/MaxPool.h
+++ b/compute/cker/include/cker/operation/MaxPool.h
@@ -67,10 +67,10 @@ void MaxPool<float>(const PoolParams &params, const Shape &input_shape, const fl
int hpad = h + params.padding_values.height;
int wpad = w + params.padding_values.width;
int h_start =
- (hpad < params.filter_height) ? 0 : (hpad - params.filter_height) / stride_height + 1;
+ (hpad < params.filter_height) ? 0 : (hpad - params.filter_height) / stride_height + 1;
int h_end = std::min(hpad / stride_height + 1, output_height);
int w_start =
- (wpad < params.filter_width) ? 0 : (wpad - params.filter_width) / stride_width + 1;
+ (wpad < params.filter_width) ? 0 : (wpad - params.filter_width) / stride_width + 1;
int w_end = std::min(wpad / stride_width + 1, output_width);
// compute elementwise sum
for (int ph = h_start; ph < h_end; ++ph)
@@ -79,8 +79,8 @@ void MaxPool<float>(const PoolParams &params, const Shape &input_shape, const fl
{
int out_offset = NodeOffset(b, ph, pw, output_height, output_width);
out_mat.col(out_offset) =
- out_mat.col(out_offset)
- .cwiseMax(in_mat.col(NodeOffset(b, h, w, input_height, input_width)));
+ out_mat.col(out_offset)
+ .cwiseMax(in_mat.col(NodeOffset(b, h, w, input_height, input_width)));
}
}
}
@@ -139,8 +139,8 @@ void MaxPool<uint8_t>(const PoolParams &params, const Shape &input_shape, const
const int filter_y_end = std::min(params.filter_height, input_height - in_y_origin);
memset(acc, 0, tranche_depth * sizeof(acc[0]));
const uint8_t *input_ptr =
- input_data + depth_base +
- depth * (in_x_origin + input_width * (in_y_origin + input_height * batch));
+ input_data + depth_base +
+ depth * (in_x_origin + input_width * (in_y_origin + input_height * batch));
for (int fy = filter_y_start; fy < filter_y_end; fy++)
{
const uint8_t *input_row_ptr = input_ptr + depth * (fy * input_width + filter_x_start);
diff --git a/compute/cker/include/cker/operation/OneHot.h b/compute/cker/include/cker/operation/OneHot.h
index c0dbc6df5..ddc27b4c2 100644
--- a/compute/cker/include/cker/operation/OneHot.h
+++ b/compute/cker/include/cker/operation/OneHot.h
@@ -55,7 +55,7 @@ void OneHot(const int32_t depth, const T on_value, const T off_value, int32_t ax
for (int k = 0; k < suffix_dim_size; ++k, ++output_data)
{
*output_data =
- static_cast<int>(indices_data[i * suffix_dim_size + k]) == j ? on_value : off_value;
+ static_cast<int>(indices_data[i * suffix_dim_size + k]) == j ? on_value : off_value;
}
}
}
diff --git a/compute/cker/include/cker/operation/Quantize.h b/compute/cker/include/cker/operation/Quantize.h
index 5c82d111f..7292a199a 100644
--- a/compute/cker/include/cker/operation/Quantize.h
+++ b/compute/cker/include/cker/operation/Quantize.h
@@ -1,5 +1,6 @@
/*
* Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2018 The TensorFlow Authors. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -17,11 +18,14 @@
#ifndef __NNFW_CKER_QUANTIZE_H__
#define __NNFW_CKER_QUANTIZE_H__
+#include "cker/operation/Round.h"
#include "cker/Shape.h"
#include "cker/Types.h"
#include "cker/Utils.h"
-#include <stdexcept>
+#include <cassert>
#include <iostream>
+#include <stdexcept>
+
namespace nnfw
{
namespace cker
@@ -41,6 +45,409 @@ inline void Quantize(const Shape &input_shape, const InputT *input_data, const S
output_data[i] = clamped;
}
}
+
+template <>
+inline void Quantize(const Shape &input_shape, const float *input_data, const Shape &output_shape,
+ int8_t *output_data, const float scale, const int32_t zero_point)
+{
+ const int flat_size = MatchingFlatSize(input_shape, output_shape);
+ static constexpr int32_t min_val = std::numeric_limits<int8_t>::min();
+ static constexpr int32_t max_val = std::numeric_limits<int8_t>::max();
+
+ int i = 0;
+#ifdef USE_NEON
+ const float32x4_t reverse_scale_dup = vdupq_n_f32(1.0f / scale);
+ const int32x4_t zero_point_dup = vdupq_n_s32(zero_point);
+ const int32x4_t min_val_dup = vdupq_n_s32(min_val);
+ const int32x4_t max_val_dup = vdupq_n_s32(max_val);
+
+ for (; i <= flat_size - 8; i += 8)
+ {
+ const float *src_data_ptr = input_data + i;
+ float32x4_t input_val_0 = vld1q_f32(src_data_ptr);
+ float32x4_t input_val_1 = vld1q_f32(src_data_ptr + 4);
+
+ input_val_0 = vmulq_f32(input_val_0, reverse_scale_dup);
+ input_val_1 = vmulq_f32(input_val_1, reverse_scale_dup);
+
+ int32x4_t casted_val_0 = RoundToNearest(input_val_0);
+ int32x4_t casted_val_1 = RoundToNearest(input_val_1);
+
+ casted_val_0 = vaddq_s32(casted_val_0, zero_point_dup);
+ casted_val_1 = vaddq_s32(casted_val_1, zero_point_dup);
+
+ // Clamp the values to fit the target type's range.
+ casted_val_0 = vmaxq_s32(casted_val_0, min_val_dup);
+ casted_val_1 = vmaxq_s32(casted_val_1, min_val_dup);
+ casted_val_0 = vminq_s32(casted_val_0, max_val_dup);
+ casted_val_1 = vminq_s32(casted_val_1, max_val_dup);
+
+ const int16x4_t narrowed_val_0 = vmovn_s32(casted_val_0);
+ const int16x4_t narrowed_val_1 = vmovn_s32(casted_val_1);
+ const int16x8_t combined_val = vcombine_s16(narrowed_val_0, narrowed_val_1);
+ const int8x8_t combined_val_narrowed = vmovn_s16(combined_val);
+ vst1_s8(output_data + i, combined_val_narrowed);
+ }
+#endif // NEON
+
+ for (; i < flat_size; ++i)
+ {
+ const float val = input_data[i];
+ const int32_t unclamped = static_cast<int32_t>(round(val / scale)) + zero_point;
+ const int32_t clamped = std::min(std::max(unclamped, min_val), max_val);
+ output_data[i] = clamped;
+ }
+}
+
+template <>
+inline void Quantize(const Shape &input_shape, const float *input_data, const Shape &output_shape,
+ uint8_t *output_data, const float scale, const int32_t zero_point)
+{
+ const int flat_size = MatchingFlatSize(input_shape, output_shape);
+ static constexpr int32_t min_val = std::numeric_limits<uint8_t>::min();
+ static constexpr int32_t max_val = std::numeric_limits<uint8_t>::max();
+
+ int i = 0;
+#ifdef USE_NEON
+ const float32x4_t reverse_scale_dup = vdupq_n_f32(1.0f / scale);
+ const int32x4_t zero_point_dup = vdupq_n_s32(zero_point);
+ const int32x4_t min_val_dup = vdupq_n_s32(min_val);
+ const int32x4_t max_val_dup = vdupq_n_s32(max_val);
+
+ for (; i <= flat_size - 8; i += 8)
+ {
+ const float *src_data_ptr = input_data + i;
+ float32x4_t input_val_0 = vld1q_f32(src_data_ptr);
+ float32x4_t input_val_1 = vld1q_f32(src_data_ptr + 4);
+
+ input_val_0 = vmulq_f32(input_val_0, reverse_scale_dup);
+ input_val_1 = vmulq_f32(input_val_1, reverse_scale_dup);
+
+ int32x4_t casted_val_0 = RoundToNearest(input_val_0);
+ int32x4_t casted_val_1 = RoundToNearest(input_val_1);
+
+ casted_val_0 = vaddq_s32(casted_val_0, zero_point_dup);
+ casted_val_1 = vaddq_s32(casted_val_1, zero_point_dup);
+
+ // Clamp the values to fit the target type's range.
+ casted_val_0 = vmaxq_s32(casted_val_0, min_val_dup);
+ casted_val_1 = vmaxq_s32(casted_val_1, min_val_dup);
+ casted_val_0 = vminq_s32(casted_val_0, max_val_dup);
+ casted_val_1 = vminq_s32(casted_val_1, max_val_dup);
+
+ const uint16x4_t narrowed_val_0 = vqmovun_s32(casted_val_0);
+ const uint16x4_t narrowed_val_1 = vqmovun_s32(casted_val_1);
+ const uint16x8_t combined_val = vcombine_u16(narrowed_val_0, narrowed_val_1);
+ const uint8x8_t combined_val_narrowed = vmovn_u16(combined_val);
+ vst1_u8(output_data + i, combined_val_narrowed);
+ }
+#endif // NEON
+
+ for (; i < flat_size; ++i)
+ {
+ const float val = input_data[i];
+ const int32_t unclamped = static_cast<int32_t>(round(val / scale)) + zero_point;
+ const int32_t clamped = std::min(std::max(unclamped, min_val), max_val);
+ output_data[i] = clamped;
+ }
+}
+
+template <>
+inline void Quantize(const Shape &input_shape, const float *input_data, const Shape &output_shape,
+ int16_t *output_data, const float scale, const int32_t zero_point)
+{
+ const int flat_size = MatchingFlatSize(input_shape, output_shape);
+ static constexpr int32_t min_val = std::numeric_limits<int16_t>::min();
+ static constexpr int32_t max_val = std::numeric_limits<int16_t>::max();
+
+ int i = 0;
+#ifdef USE_NEON
+ const float32x4_t reverse_scale_dup = vdupq_n_f32(1.0f / scale);
+ const int32x4_t zero_point_dup = vdupq_n_s32(zero_point);
+ const int32x4_t min_val_dup = vdupq_n_s32(min_val);
+ const int32x4_t max_val_dup = vdupq_n_s32(max_val);
+
+ for (; i <= flat_size - 8; i += 8)
+ {
+ const float *src_data_ptr = input_data + i;
+ float32x4_t input_val_0 = vld1q_f32(src_data_ptr);
+ float32x4_t input_val_1 = vld1q_f32(src_data_ptr + 4);
+
+ input_val_0 = vmulq_f32(input_val_0, reverse_scale_dup);
+ input_val_1 = vmulq_f32(input_val_1, reverse_scale_dup);
+
+ int32x4_t casted_val_0 = RoundToNearest(input_val_0);
+ int32x4_t casted_val_1 = RoundToNearest(input_val_1);
+
+ casted_val_0 = vaddq_s32(casted_val_0, zero_point_dup);
+ casted_val_1 = vaddq_s32(casted_val_1, zero_point_dup);
+
+ // Clamp the values to fit the target type's range.
+ casted_val_0 = vmaxq_s32(casted_val_0, min_val_dup);
+ casted_val_1 = vmaxq_s32(casted_val_1, min_val_dup);
+ casted_val_0 = vminq_s32(casted_val_0, max_val_dup);
+ casted_val_1 = vminq_s32(casted_val_1, max_val_dup);
+
+ const int16x4_t narrowed_val_0 = vmovn_s32(casted_val_0);
+ const int16x4_t narrowed_val_1 = vmovn_s32(casted_val_1);
+ vst1_s16(output_data + i, narrowed_val_0);
+ vst1_s16(output_data + i + 4, narrowed_val_1);
+ }
+#endif // NEON
+
+ for (; i < flat_size; ++i)
+ {
+ const float val = input_data[i];
+ const int32_t unclamped = static_cast<int32_t>(round(val / scale)) + zero_point;
+ const int32_t clamped = std::min(std::max(unclamped, min_val), max_val);
+ output_data[i] = clamped;
+ }
+}
+
+inline void Quantize(const int32_t *multiplier, const int32_t *shift, int32_t channel_size,
+ int32_t total_size, int32_t output_zp, int32_t output_min, int32_t output_max,
+ int32_t *scratch, int8_t *output)
+{
+ // Here we're trying to quantize the raw accumulators:
+ // output_channels
+ // data data data data data
+ // rows data data data data data
+ // data data data data data
+ // ....
+ //
+ // In order to minimize the reload of the multipliers & shifts, once we load
+ // the multipliers & shifts, we load & quantize the raw accumulators for every
+ // row.
+#ifdef USE_NEON
+ const int32x4_t output_offset_vec = vdupq_n_s32(output_zp);
+ const int32x4_t output_activation_min_vec = vdupq_n_s32(output_min);
+ const int32x4_t output_activation_max_vec = vdupq_n_s32(output_max);
+ const int32x4_t zeros = vdupq_n_s32(0);
+#endif
+
+ assert(total_size % channel_size == 0);
+ const int32_t rows = total_size / channel_size;
+
+ int c = 0;
+
+#ifdef USE_NEON
+ using gemmlowp::RoundingDivideByPOT;
+ for (; c <= channel_size - 8; c += 8)
+ {
+ int32x4_t out_shift_1 = vld1q_s32(shift + c);
+ int32x4_t out_shift_2 = vld1q_s32(shift + c + 4);
+ int32x4_t left_shift_1 = vmaxq_s32(out_shift_1, zeros);
+ int32x4_t left_shift_2 = vmaxq_s32(out_shift_2, zeros);
+
+ // Right shift will be performed as left shift with negative values.
+ int32x4_t right_shift_1 = vminq_s32(out_shift_1, zeros);
+ int32x4_t right_shift_2 = vminq_s32(out_shift_2, zeros);
+
+ int32x4_t out_mul_1 = vld1q_s32(multiplier + c);
+ int32x4_t out_mul_2 = vld1q_s32(multiplier + c + 4);
+ for (int n = 0; n < rows; ++n)
+ {
+ int loc = n * channel_size + c;
+ int32x4_t acc_1 = vld1q_s32(scratch + loc);
+ int32x4_t acc_2 = vld1q_s32(scratch + loc + 4);
+
+ // Saturating Rounding Doubling High Mul.
+ acc_1 = vshlq_s32(acc_1, left_shift_1);
+ acc_1 = vqrdmulhq_s32(acc_1, out_mul_1);
+ acc_2 = vshlq_s32(acc_2, left_shift_2);
+ acc_2 = vqrdmulhq_s32(acc_2, out_mul_2);
+
+ // Rounding Dividing By POT.
+ acc_1 = vrshlq_s32(acc_1, right_shift_1);
+ acc_2 = vrshlq_s32(acc_2, right_shift_2);
+
+ // Add the output offset.
+ acc_1 = vaddq_s32(acc_1, output_offset_vec);
+ acc_2 = vaddq_s32(acc_2, output_offset_vec);
+
+ // Apply the activation function.
+ acc_1 = vmaxq_s32(acc_1, output_activation_min_vec);
+ acc_1 = vminq_s32(acc_1, output_activation_max_vec);
+ acc_2 = vmaxq_s32(acc_2, output_activation_min_vec);
+ acc_2 = vminq_s32(acc_2, output_activation_max_vec);
+
+ // Saturating cast to int8 and store to destination.
+ const int16x4_t acc_s16_1 = vqmovn_s32(acc_1);
+ const int16x4_t acc_s16_2 = vqmovn_s32(acc_2);
+ const int16x8_t res_s16 = vcombine_s16(acc_s16_1, acc_s16_2);
+ const int8x8_t res_s8 = vqmovn_s16(res_s16);
+ vst1_s8(output + loc, res_s8);
+ }
+ }
+
+#endif // USE_NEON
+ // Handle leftover values, one by one. This is very slow.
+ for (; c < channel_size; c++)
+ {
+ for (int n = 0; n < rows; ++n)
+ {
+ int loc = n * channel_size + c;
+ int32_t acc = scratch[loc];
+ acc = MultiplyByQuantizedMultiplier(acc, multiplier[c], shift[c]);
+ acc += output_zp;
+ acc = std::max(acc, output_min);
+ acc = std::min(acc, output_max);
+ output[loc] = static_cast<int8_t>(acc);
+ }
+ }
+}
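
Not part of the patch: a minimal, hypothetical driver for the per-channel Quantize overload above, assuming rows * channel_size int32 accumulators laid out row-major in scratch and one (multiplier, shift) pair per output channel. The sizes, zero point and clamp values are illustrative only; the include path follows this patch's layout.

#include <cstdint>
#include <vector>

#include "cker/operation/Quantize.h"

void QuantizeAccumulatorsSketch()
{
  const int32_t rows = 2, channels = 16;         // total_size = rows * channels
  std::vector<int32_t> scratch(rows * channels); // raw accumulators, row-major
  std::vector<int32_t> multiplier(channels), shift(channels);
  std::vector<int8_t> output(rows * channels);

  // ... fill scratch from the convolution and multiplier/shift from its per-channel scales ...

  nnfw::cker::Quantize(multiplier.data(), shift.data(), channels, rows * channels,
                       /*output_zp=*/0, /*output_min=*/-128, /*output_max=*/127, scratch.data(),
                       output.data());
}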
+
+template <typename input_type, typename output_type>
+inline void Requantize(const input_type *input_data, int32_t size,
+ int32_t effective_scale_multiplier, int32_t effective_scale_shift,
+ int32_t input_zeropoint, int32_t output_zeropoint, output_type *output_data)
+{
+ assert(!"Requantize: not supported type. It shouldn't reach here.");
+ UNUSED_ALL(input_data, size, effective_scale_multiplier, effective_scale_shift, input_zeropoint,
+ output_zeropoint, output_data);
+}
+
+template <>
+inline void Requantize<uint8_t, int8_t>(const uint8_t *input_data, int32_t size,
+ int32_t effective_scale_multiplier,
+ int32_t effective_scale_shift, int32_t input_zeropoint,
+ int32_t output_zeropoint, int8_t *output_data)
+{
+ static constexpr int32_t kMinOutput = std::numeric_limits<int8_t>::min();
+ static constexpr int32_t kMaxOutput = std::numeric_limits<int8_t>::max();
+
+ int i = 0;
+#ifdef USE_NEON
+ // Constants.
+ const int32x4_t input_zero_point_dup = vdupq_n_s32(-input_zeropoint);
+ const int32x4_t output_zero_point_dup = vdupq_n_s32(output_zeropoint);
+ const int32x4_t min_val_dup = vdupq_n_s32(kMinOutput);
+ const int32x4_t max_val_dup = vdupq_n_s32(kMaxOutput);
+
+ for (; i <= size - 16; i += 16)
+ {
+ const uint8x16_t input_vec = vld1q_u8(input_data + i);
+ const uint16x8_t first_half = vmovl_u8(vget_low_u8(input_vec));
+ const uint16x8_t second_half = vmovl_u8(vget_high_u8(input_vec));
+ int32x4x4_t input;
+ input.val[0] = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(first_half)));
+ input.val[1] = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(first_half)));
+ input.val[2] = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(second_half)));
+ input.val[3] = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(second_half)));
+ input.val[0] = vaddq_s32(input.val[0], input_zero_point_dup);
+ input.val[1] = vaddq_s32(input.val[1], input_zero_point_dup);
+ input.val[2] = vaddq_s32(input.val[2], input_zero_point_dup);
+ input.val[3] = vaddq_s32(input.val[3], input_zero_point_dup);
+
+ int32x4x4_t result =
+ MultiplyByQuantizedMultiplier4Rows(input, effective_scale_multiplier, effective_scale_shift);
+
+ result.val[0] = vaddq_s32(result.val[0], output_zero_point_dup);
+ result.val[1] = vaddq_s32(result.val[1], output_zero_point_dup);
+ result.val[2] = vaddq_s32(result.val[2], output_zero_point_dup);
+ result.val[3] = vaddq_s32(result.val[3], output_zero_point_dup);
+ result.val[0] = vmaxq_s32(vminq_s32(result.val[0], max_val_dup), min_val_dup);
+ result.val[1] = vmaxq_s32(vminq_s32(result.val[1], max_val_dup), min_val_dup);
+ result.val[2] = vmaxq_s32(vminq_s32(result.val[2], max_val_dup), min_val_dup);
+ result.val[3] = vmaxq_s32(vminq_s32(result.val[3], max_val_dup), min_val_dup);
+
+ const int16x4_t narrowed_val_1 = vqmovn_s32(result.val[0]);
+ const int16x4_t narrowed_val_2 = vqmovn_s32(result.val[1]);
+ const int16x4_t narrowed_val_3 = vqmovn_s32(result.val[2]);
+ const int16x4_t narrowed_val_4 = vqmovn_s32(result.val[3]);
+ const int16x8_t output_first_half = vcombine_s16(narrowed_val_1, narrowed_val_2);
+ const int16x8_t output_second_half = vcombine_s16(narrowed_val_3, narrowed_val_4);
+ const int8x8_t narrowed_first_half = vqmovn_s16(output_first_half);
+ const int8x8_t narrowed_second_half = vqmovn_s16(output_second_half);
+ const int8x16_t narrowed_result = vcombine_s8(narrowed_first_half, narrowed_second_half);
+ vst1q_s8(output_data + i, narrowed_result);
+ }
+
+#endif
+ for (; i < size; ++i)
+ {
+ const int32_t input = input_data[i] - input_zeropoint;
+ const int32_t output =
+ MultiplyByQuantizedMultiplier(input, effective_scale_multiplier, effective_scale_shift) +
+ output_zeropoint;
+ const int32_t clamped_output = std::max(std::min(output, kMaxOutput), kMinOutput);
+ output_data[i] = static_cast<int8_t>(clamped_output);
+ }
+}
+
+template <>
+inline void Requantize<int8_t, uint8_t>(const int8_t *input_data, int32_t size,
+ int32_t effective_scale_multiplier,
+ int32_t effective_scale_shift, int32_t input_zeropoint,
+ int32_t output_zeropoint, uint8_t *output_data)
+{
+ static constexpr int32_t kMinOutput = std::numeric_limits<uint8_t>::min();
+ static constexpr int32_t kMaxOutput = std::numeric_limits<uint8_t>::max();
+
+ int i = 0;
+#ifdef USE_NEON
+ // Constants.
+ const int32x4_t input_zero_point_dup = vdupq_n_s32(-input_zeropoint);
+ const int32x4_t output_zero_point_dup = vdupq_n_s32(output_zeropoint);
+ const int32x4_t min_val_dup = vdupq_n_s32(kMinOutput);
+ const int32x4_t max_val_dup = vdupq_n_s32(kMaxOutput);
+
+ for (; i <= size - 16; i += 16)
+ {
+ const int8x16_t input_vec = vld1q_s8(input_data + i);
+ const int16x8_t first_half = vmovl_s8(vget_low_s8(input_vec));
+ const int16x8_t second_half = vmovl_s8(vget_high_s8(input_vec));
+ int32x4x4_t input;
+ input.val[0] = vmovl_s16(vget_low_s16(first_half));
+ input.val[1] = vmovl_s16(vget_high_s16(first_half));
+ input.val[2] = vmovl_s16(vget_low_s16(second_half));
+ input.val[3] = vmovl_s16(vget_high_s16(second_half));
+ input.val[0] = vaddq_s32(input.val[0], input_zero_point_dup);
+ input.val[1] = vaddq_s32(input.val[1], input_zero_point_dup);
+ input.val[2] = vaddq_s32(input.val[2], input_zero_point_dup);
+ input.val[3] = vaddq_s32(input.val[3], input_zero_point_dup);
+
+ int32x4x4_t result =
+ MultiplyByQuantizedMultiplier4Rows(input, effective_scale_multiplier, effective_scale_shift);
+
+ result.val[0] = vaddq_s32(result.val[0], output_zero_point_dup);
+ result.val[1] = vaddq_s32(result.val[1], output_zero_point_dup);
+ result.val[2] = vaddq_s32(result.val[2], output_zero_point_dup);
+ result.val[3] = vaddq_s32(result.val[3], output_zero_point_dup);
+ result.val[0] = vmaxq_s32(vminq_s32(result.val[0], max_val_dup), min_val_dup);
+ result.val[1] = vmaxq_s32(vminq_s32(result.val[1], max_val_dup), min_val_dup);
+ result.val[2] = vmaxq_s32(vminq_s32(result.val[2], max_val_dup), min_val_dup);
+ result.val[3] = vmaxq_s32(vminq_s32(result.val[3], max_val_dup), min_val_dup);
+
+ const uint32x4_t result_val_1_unsigned = vreinterpretq_u32_s32(result.val[0]);
+ const uint32x4_t result_val_2_unsigned = vreinterpretq_u32_s32(result.val[1]);
+ const uint32x4_t result_val_3_unsigned = vreinterpretq_u32_s32(result.val[2]);
+ const uint32x4_t result_val_4_unsigned = vreinterpretq_u32_s32(result.val[3]);
+
+ const uint16x4_t narrowed_val_1 = vqmovn_u32(result_val_1_unsigned);
+ const uint16x4_t narrowed_val_2 = vqmovn_u32(result_val_2_unsigned);
+ const uint16x4_t narrowed_val_3 = vqmovn_u32(result_val_3_unsigned);
+ const uint16x4_t narrowed_val_4 = vqmovn_u32(result_val_4_unsigned);
+ const uint16x8_t output_first_half = vcombine_u16(narrowed_val_1, narrowed_val_2);
+ const uint16x8_t output_second_half = vcombine_u16(narrowed_val_3, narrowed_val_4);
+ const uint8x8_t narrowed_first_half = vqmovn_u16(output_first_half);
+ const uint8x8_t narrowed_second_half = vqmovn_u16(output_second_half);
+ const uint8x16_t narrowed_result = vcombine_u8(narrowed_first_half, narrowed_second_half);
+ vst1q_u8(output_data + i, narrowed_result);
+ }
+
+#endif
+ for (; i < size; ++i)
+ {
+ const int32_t input = input_data[i] - input_zeropoint;
+ const int32_t output =
+ MultiplyByQuantizedMultiplier(input, effective_scale_multiplier, effective_scale_shift) +
+ output_zeropoint;
+ const int32_t clamped_output = std::max(std::min(output, kMaxOutput), kMinOutput);
+ output_data[i] = static_cast<uint8_t>(clamped_output);
+ }
+}
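
Not part of the patch: the effective_scale_multiplier / effective_scale_shift pair passed to Requantize encodes input_scale / output_scale in fixed point. A sketch of preparing it is below; it assumes the same QuantizeMultiplier helper that the SoftMax.h hunk later in this patch calls, and the header path and the name PrepareRequantizeParams are illustrative assumptions.

#include <cstdint>

#include "cker/Utils.h" // assumed location of QuantizeMultiplier

void PrepareRequantizeParams(double input_scale, double output_scale,
                             int32_t *effective_scale_multiplier, int32_t *effective_scale_shift)
{
  // effective_scale = input_scale / output_scale, encoded as a Q31 multiplier plus a
  // left (positive) or right (negative) binary shift.
  const double effective_scale = input_scale / output_scale;
  nnfw::cker::QuantizeMultiplier(effective_scale, effective_scale_multiplier,
                                 effective_scale_shift);
}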
+
} // namespace cker
} // namespace nnfw
diff --git a/compute/cker/include/cker/operation/Range.h b/compute/cker/include/cker/operation/Range.h
index 5c3a773a2..d6ccc68c8 100644
--- a/compute/cker/include/cker/operation/Range.h
+++ b/compute/cker/include/cker/operation/Range.h
@@ -35,8 +35,8 @@ template <typename T> inline int GetSize(T start, T limit, T delta)
}
int size = (std::is_integral<T>::value
- ? ((std::abs(limit - start) + std::abs(delta) - 1) / std::abs(delta))
- : std::ceil(std::abs((limit - start) / delta)));
+ ? ((std::abs(limit - start) + std::abs(delta) - 1) / std::abs(delta))
+ : std::ceil(std::abs((limit - start) / delta)));
return size;
}
diff --git a/compute/cker/include/cker/operation/Reduce.h b/compute/cker/include/cker/operation/Reduce.h
index cf9634a67..02a9eac5e 100644
--- a/compute/cker/include/cker/operation/Reduce.h
+++ b/compute/cker/include/cker/operation/Reduce.h
@@ -21,6 +21,7 @@
#include "cker/Shape.h"
#include "cker/Types.h"
#include "cker/Utils.h"
+#include "cker/neon/neon_check.h"
namespace nnfw
{
@@ -30,6 +31,89 @@ namespace cker
// A generic reduce method that can be used for reduce_sum, reduce_mean, etc.
// This method iterates through input data and reduce elements along the
// dimensions given in axis.
+
+#ifdef USE_NEON
+inline void OptimizedReduceSum(const float *input_data, const Shape &input_shape,
+ float *output_data)
+{
+ const auto input_dims = input_shape.DimsData();
+ const auto input_num_dims = input_shape.DimensionsCount();
+
+ int input_size = 1;
+ int reduce_size = 0;
+ for (int idx = 0; idx < input_num_dims - 1; idx++)
+ {
+ input_size *= input_dims[idx];
+ }
+ reduce_size = input_dims[input_num_dims - 1];
+ int offset = 0;
+ for (int idx = 0; idx < input_size; idx++)
+ {
+ int r_idx = 0;
+    float tmp_data[4] = {0.0f, 0.0f, 0.0f, 0.0f};
+ float32x4_t tmp_data_32x4 = vld1q_f32(tmp_data);
+ for (; r_idx <= reduce_size - 32; r_idx += 32)
+ {
+ float32x4_t a10 = vld1q_f32(input_data + offset + r_idx);
+ float32x4_t a11 = vld1q_f32(input_data + offset + r_idx + 4);
+ float32x4_t a12 = vld1q_f32(input_data + offset + r_idx + 8);
+ float32x4_t a13 = vld1q_f32(input_data + offset + r_idx + 12);
+ float32x4_t a20 = vld1q_f32(input_data + offset + r_idx + 16);
+ float32x4_t a21 = vld1q_f32(input_data + offset + r_idx + 20);
+ float32x4_t a22 = vld1q_f32(input_data + offset + r_idx + 24);
+ float32x4_t a23 = vld1q_f32(input_data + offset + r_idx + 28);
+
+ float32x4_t x0 = vaddq_f32(a10, a20);
+ float32x4_t x1 = vaddq_f32(a11, a21);
+ float32x4_t x2 = vaddq_f32(a12, a22);
+ float32x4_t x3 = vaddq_f32(a13, a23);
+
+ float32x4_t y0 = vaddq_f32(x0, x1);
+ float32x4_t y1 = vaddq_f32(x2, x3);
+ float32x4_t y2 = vaddq_f32(y0, y1);
+ tmp_data_32x4 = vaddq_f32(tmp_data_32x4, y2);
+ }
+ for (; r_idx <= reduce_size - 16; r_idx += 16)
+ {
+ float32x4_t a10 = vld1q_f32(input_data + offset + r_idx);
+ float32x4_t a11 = vld1q_f32(input_data + offset + r_idx + 4);
+ float32x4_t a12 = vld1q_f32(input_data + offset + r_idx + 8);
+ float32x4_t a13 = vld1q_f32(input_data + offset + r_idx + 12);
+
+ float32x4_t x0 = vaddq_f32(a10, a11);
+ float32x4_t x1 = vaddq_f32(a12, a13);
+
+ float32x4_t y0 = vaddq_f32(x0, x1);
+ tmp_data_32x4 = vaddq_f32(tmp_data_32x4, y0);
+ }
+ for (; r_idx <= reduce_size - 8; r_idx += 8)
+ {
+ float32x4_t a1 = vld1q_f32(input_data + offset + r_idx);
+ float32x4_t a2 = vld1q_f32(input_data + offset + r_idx + 4);
+ float32x4_t x = vaddq_f32(a1, a2);
+ tmp_data_32x4 = vaddq_f32(tmp_data_32x4, x);
+ }
+ vst1q_f32(tmp_data, tmp_data_32x4);
+ output_data[idx] = tmp_data[0] + tmp_data[1] + tmp_data[2] + tmp_data[3];
+
+ for (; r_idx < reduce_size; r_idx++)
+ {
+ if (r_idx == 0)
+ {
+ output_data[idx] = input_data[offset];
+ }
+ else
+ {
+ output_data[idx] += input_data[offset + r_idx];
+ }
+ }
+ offset += reduce_size;
+ }
+}
+#endif // NEON
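
For comparison, a scalar sketch (not part of the patch) of what OptimizedReduceSum computes: the tensor is viewed as input_size contiguous rows of reduce_size floats (the innermost axis), and each row is summed into one output element. The last-axis fast path added to ReduceImpl below follows the same row view.

#include <cstdint>

void ReduceSumLastAxisRef(const float *input, int32_t input_size, int32_t reduce_size,
                          float *output)
{
  for (int32_t row = 0; row < input_size; ++row)
  {
    float sum = 0.0f;
    for (int32_t k = 0; k < reduce_size; ++k)
    {
      sum += input[row * reduce_size + k];
    }
    output[row] = sum;
  }
}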
+
template <typename In, typename Out>
inline bool ReduceImpl(const In *input_data, const Shape &input_shape, const Shape &,
const int *axis, const int num_axis, int *input_iter,
@@ -39,6 +123,32 @@ inline bool ReduceImpl(const In *input_data, const Shape &input_shape, const Sha
const auto input_num_dims = input_shape.DimensionsCount();
// Reset input iterator.
+ if (num_axis == 1 && axis[0] == input_num_dims - 1)
+ {
+ int input_size = 1;
+ int reduce_size = 0;
+ for (int idx = 0; idx < input_num_dims - 1; idx++)
+ {
+ input_size *= input_dims[idx];
+ }
+ reduce_size = input_dims[input_num_dims - 1];
+ for (int idx = 0; idx < input_size; idx++)
+ {
+ for (int r_idx = 0; r_idx < reduce_size; r_idx++)
+ {
+ if (r_idx == 0)
+ {
+ output_data[idx] = input_data[idx * reduce_size];
+ }
+ else
+ {
+ output_data[idx] = reducer(output_data[idx], input_data[idx * reduce_size + r_idx]);
+ }
+ }
+ }
+ return true;
+ }
+
for (int idx = 0; idx < input_num_dims; ++idx)
{
input_iter[idx] = 0;
@@ -48,7 +158,7 @@ inline bool ReduceImpl(const In *input_data, const Shape &input_shape, const Sha
{
size_t input_offset = ReducedOutputOffset(input_num_dims, input_dims, input_iter, 0, nullptr);
size_t output_offset =
- ReducedOutputOffset(input_num_dims, input_dims, input_iter, num_axis, axis);
+ ReducedOutputOffset(input_num_dims, input_dims, input_iter, num_axis, axis);
output_data[output_offset] = reducer(output_data[output_offset], input_data[input_offset]);
} while (NextIndex(input_num_dims, input_dims, input_iter));
return true;
@@ -202,12 +312,12 @@ public:
}
// Calculate mean by dividing output_data by num of aggregated element.
- U num_elements_in_axis = 1;
+ size_t num_elements_in_axis = 1;
for (int idx = 0; idx < num_resolved_axis; ++idx)
{
size_t current = static_cast<size_t>(input_shape.Dims(resolved_axis_data()[idx]));
// Overflow prevention.
- if (current > static_cast<size_t>(std::numeric_limits<U>::max() / num_elements_in_axis))
+ if (current > static_cast<size_t>(std::numeric_limits<size_t>::max() / num_elements_in_axis))
{
return false;
}
@@ -220,21 +330,21 @@ public:
if (compute_sum)
{
// TODO(b/116341117): Eliminate float and do this completely in 8bit.
- const float bias = -input_zero_point * scale * num_elements_in_axis + 0.5f;
+ const float bias = -input_zero_point * scale * num_elements_in_axis;
for (size_t idx = 0; idx < num_outputs; ++idx)
{
const U value =
- static_cast<U>(std::round(temp_sum[idx] * scale + bias)) + output_zero_point;
+ static_cast<U>(std::round(temp_sum[idx] * scale + bias)) + output_zero_point;
output_data[idx] = static_cast<T>(value);
}
}
else
{
- const float bias = -input_zero_point * scale + 0.5f;
+ const float bias = -input_zero_point * scale;
for (size_t idx = 0; idx < num_outputs; ++idx)
{
float float_mean =
- static_cast<float>(temp_sum[idx]) / static_cast<float>(num_elements_in_axis);
+ static_cast<float>(temp_sum[idx]) / static_cast<float>(num_elements_in_axis);
float result = std::min(std::round(float_mean * scale + bias) + output_zero_point,
static_cast<float>(std::numeric_limits<T>::max()));
result = std::max(result, static_cast<float>(std::numeric_limits<T>::min()));
diff --git a/compute/cker/include/cker/operation/ReduceMean.h b/compute/cker/include/cker/operation/ReduceMean.h
index 2e4fc6274..924e85037 100644
--- a/compute/cker/include/cker/operation/ReduceMean.h
+++ b/compute/cker/include/cker/operation/ReduceMean.h
@@ -72,9 +72,9 @@ inline bool ReduceMeanImpl(const In *input_data, const Shape &input_shape, const
{
size_t input_offset = ReducedOutputOffset(input_num_dims, input_dims, input_iter, 0, nullptr);
size_t output_offset =
- ReducedOutputOffset(input_num_dims, input_dims, input_iter, num_axis, axis);
+ ReducedOutputOffset(input_num_dims, input_dims, input_iter, num_axis, axis);
output_data[output_offset] =
- reducer(output_data[output_offset], input_data[input_offset], normalizer);
+ reducer(output_data[output_offset], input_data[input_offset], normalizer);
} while (NextIndex(input_num_dims, input_dims, input_iter));
return true;
}
@@ -102,7 +102,7 @@ inline size_t ReduceSumQuantImpl(const In *input_data, const Shape &input_shape,
{
size_t input_offset = ReducedOutputOffset(input_num_dims, input_dims, input_iter, 0, nullptr);
size_t output_offset =
- ReducedOutputOffset(input_num_dims, input_dims, input_iter, num_axis, axis);
+ ReducedOutputOffset(input_num_dims, input_dims, input_iter, num_axis, axis);
temp_sum[output_offset] = reducer(temp_sum[output_offset], input_data[input_offset]);
} while (NextIndex(input_num_dims, input_dims, input_iter));
return normalizer;
@@ -185,8 +185,8 @@ public:
}
size_t normalizer =
- ReduceSumQuantImpl<In>(input_data, input_shape, resolved_axis_data(), num_resolved_axis,
- temp_index_data(), reducer, _temp_sum.data());
+ ReduceSumQuantImpl<In>(input_data, input_shape, resolved_axis_data(), num_resolved_axis,
+ temp_index_data(), reducer, _temp_sum.data());
if (num_outputs > 0)
{
float scale = input_scale / output_scale;
@@ -231,6 +231,37 @@ void MeanQ8Asymm(const Shape &input_shape, const In *input_data, float input_sca
sum_reducer);
}
+template <typename In, typename Out>
+void MeanAxis1And2(const Shape &input_shape, const In *input_data, const Shape &output_shape,
+ Out *output_data)
+{
+ UNUSED_RELEASE(output_shape);
+ assert(input_shape.DimensionsCount() == 4);
+ assert(output_shape.DimensionsCount() == 4);
+
+ const int output_batch = output_shape.Dims(0);
+ const int output_depth = output_shape.Dims(3);
+
+ const int input_height = input_shape.Dims(1);
+ const int input_width = input_shape.Dims(2);
+
+ for (int out_b = 0; out_b < output_batch; ++out_b)
+ {
+ for (int out_d = 0; out_d < output_depth; ++out_d)
+ {
+ float value = 0;
+ for (int in_h = 0; in_h < input_height; ++in_h)
+ {
+ for (int in_w = 0; in_w < input_width; ++in_w)
+ {
+ value += input_data[Offset(input_shape, out_b, in_h, in_w, out_d)];
+ }
+ }
+ output_data[Offset(output_shape, out_b, 0, 0, out_d)] = value / (input_width * input_height);
+ }
+ }
+}
+
} // namespace cker
} // namespace nnfw
diff --git a/compute/cker/include/cker/operation/ResizeBilinear.h b/compute/cker/include/cker/operation/ResizeBilinear.h
index 7fc1e9123..ae5af7bb3 100644
--- a/compute/cker/include/cker/operation/ResizeBilinear.h
+++ b/compute/cker/include/cker/operation/ResizeBilinear.h
@@ -62,7 +62,7 @@ inline void ResizeBilinearKernel2x2(int32_t x0, int32_t x1, int32_t y0, int32_t
// Bottom right corner.
output_data[output_offset + output_x_offset + output_y_offset] =
- (output + ((x1y0 + x1y1) / 2)) / 2;
+ (output + ((x1y0 + x1y1) / 2)) / 2;
}
}
@@ -192,8 +192,8 @@ inline void ResizeBilinearGenericSmallChannel(int32_t batches, int32_t input_hei
&x1);
int32_t input_offset[4] = {
- Offset(input_shape, b, y0, x0, 0), Offset(input_shape, b, y0, x1, 0),
- Offset(input_shape, b, y1, x0, 0), Offset(input_shape, b, y1, x1, 0)};
+ Offset(input_shape, b, y0, x0, 0), Offset(input_shape, b, y0, x1, 0),
+ Offset(input_shape, b, y1, x0, 0), Offset(input_shape, b, y1, x1, 0)};
float scale[4] = {(1 - (input_y - y0)) * (1 - (input_x - x0)),
(1 - (input_y - y0)) * (input_x - x0),
(input_y - y0) * (1 - (input_x - x0)), (input_y - y0) * (input_x - x0)};
@@ -202,8 +202,8 @@ inline void ResizeBilinearGenericSmallChannel(int32_t batches, int32_t input_hei
{
const T *input_ptr = &input_data[d];
*output_ptr++ = static_cast<T>(
- input_ptr[input_offset[0]] * scale[0] + input_ptr[input_offset[1]] * scale[1] +
- input_ptr[input_offset[2]] * scale[2] + input_ptr[input_offset[3]] * scale[3]);
+ input_ptr[input_offset[0]] * scale[0] + input_ptr[input_offset[1]] * scale[1] +
+ input_ptr[input_offset[2]] * scale[2] + input_ptr[input_offset[3]] * scale[3]);
}
}
}
@@ -253,17 +253,102 @@ void ResizeBilinear(ResizeBilinearParams &params, const Shape &input_shape,
int32_t depth = MatchingDim(input_shape, 3, output_shape, 3);
float height_scale = (params.align_corners && params.output_height > 1)
- ? (static_cast<float>(input_height - 1) / (params.output_height - 1))
- : (static_cast<float>(input_height) / params.output_height);
+ ? (static_cast<float>(input_height - 1) / (params.output_height - 1))
+ : (static_cast<float>(input_height) / params.output_height);
float width_scale = (params.align_corners && params.output_width > 1)
- ? (static_cast<float>(input_width - 1) / (params.output_width - 1))
- : (static_cast<float>(input_width) / params.output_width);
+ ? (static_cast<float>(input_width - 1) / (params.output_width - 1))
+ : (static_cast<float>(input_width) / params.output_width);
ResizeBilinearGenericSmallChannel<uint8_t>(
- batches, input_height, input_width, depth, params.output_height, params.output_width,
- height_scale, width_scale, input_shape, input_data, output_data, params.half_pixel_centers);
+ batches, input_height, input_width, depth, params.output_height, params.output_width,
+ height_scale, width_scale, input_shape, input_data, output_data, params.half_pixel_centers);
}
+
+inline void ComputeInterpolationValues(const int32_t value, const int32_t scale_10,
+ const bool half_pixel_centers, int32_t input_size,
+ int32_t *scaled_value, int32_t *lower_bound,
+ int32_t *upper_bound)
+{
+ if (half_pixel_centers)
+ {
+ *scaled_value = value * scale_10 + scale_10 / 2 - (1 << 9);
+ }
+ else
+ {
+ *scaled_value = value * scale_10;
+ }
+ *lower_bound = std::max(*scaled_value / (1 << 10), 0);
+ *upper_bound = std::min(*scaled_value / (1 << 10) + 1, input_size - 1);
+}
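
A small standalone check (not part of the patch) of the 10-bit fixed-point math in ComputeInterpolationValues: resizing a height of 4 to 8 with half_pixel_centers should place output row 3 at source coordinate 1.25, i.e. between input rows 1 and 2 with a fractional offset of 256/1024.

#include <cassert>
#include <cstdint>

void InterpolationValuesCheck()
{
  const int32_t input_size = 4, output_size = 8;
  // Same rounding as the int8 ResizeBilinear below: ((1 << 10) * in + out / 2) / out.
  const int32_t scale_10 = ((1 << 10) * input_size + output_size / 2) / output_size; // 512

  const int32_t value = 3;                                           // output row index
  const int32_t scaled = value * scale_10 + scale_10 / 2 - (1 << 9); // 1280 == 1.25 * 1024
  const int32_t lower = scaled / (1 << 10);                          // 1
  const int32_t upper = scaled / (1 << 10) + 1;                      // 2 (below input_size - 1)
  assert(lower == 1 && upper == 2 && scaled - (lower << 10) == 256);
  (void)lower;
  (void)upper;
}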
+
+inline void ResizeBilinear(const ResizeBilinearParams &op_params,
+ const Shape &unextended_input_shape, const int8_t *input_data,
+ const Shape &unextended_output_shape, int8_t *output_data)
+{
+ // If half_pixel_centers is True, align_corners must be False.
+ assert(!op_params.half_pixel_centers || !op_params.align_corners);
+ assert(unextended_input_shape.DimensionsCount() <= 4);
+ assert(unextended_output_shape.DimensionsCount() <= 4);
+ const Shape input_shape = Shape::ExtendedShape(4, unextended_input_shape);
+ const Shape output_shape = Shape::ExtendedShape(4, unextended_output_shape);
+
+ const int32_t batches = MatchingDim(input_shape, 0, output_shape, 0);
+ const int32_t input_height = input_shape.Dims(1);
+ const int32_t input_width = input_shape.Dims(2);
+ const int32_t depth = MatchingDim(input_shape, 3, output_shape, 3);
+
+ const int32_t output_height = op_params.output_height;
+ const int32_t output_width = op_params.output_width;
+
+ int32_t height_scale_10 = ((1 << 10) * input_height + output_height / 2) / output_height;
+ int32_t width_scale_10 = ((1 << 10) * input_width + output_width / 2) / output_width;
+ if (op_params.align_corners && output_height > 1)
+ {
+ height_scale_10 =
+ ((1 << 10) * (input_height - 1) + (output_height - 1) / 2) / (output_height - 1);
+ }
+ if (op_params.align_corners && output_width > 1)
+ {
+ width_scale_10 = ((1 << 10) * (input_width - 1) + (output_width - 1) / 2) / (output_width - 1);
+ }
+
+ for (int b = 0; b < batches; ++b)
+ {
+ for (int y = 0; y < output_height; ++y)
+ {
+ int32_t input_y, y0, y1;
+ ComputeInterpolationValues(y, height_scale_10, op_params.half_pixel_centers, input_height,
+ &input_y, &y0, &y1);
+ for (int x = 0; x < output_width; ++x)
+ {
+ int32_t input_x, x0, x1;
+ ComputeInterpolationValues(x, width_scale_10, op_params.half_pixel_centers, input_width,
+ &input_x, &x0, &x1);
+ for (int c = 0; c < depth; ++c)
+ {
+ const int64_t output_20_ll =
+ static_cast<int64_t>(input_data[Offset(input_shape, b, y0, x0, c)]) *
+ ((1 << 10) - (input_y - (1 << 10) * y0)) * ((1 << 10) - (input_x - (1 << 10) * x0));
+ const int64_t output_20_lu =
+ static_cast<int64_t>(input_data[Offset(input_shape, b, y1, x0, c)]) *
+ (input_y - (1 << 10) * y0) * ((1 << 10) - (input_x - (1 << 10) * x0));
+ const int64_t output_20_rl =
+ static_cast<int64_t>(input_data[Offset(input_shape, b, y0, x1, c)]) *
+ ((1 << 10) - (input_y - (1 << 10) * y0)) * (input_x - (1 << 10) * x0);
+ const int64_t output_20_ru =
+ static_cast<int64_t>(input_data[Offset(input_shape, b, y1, x1, c)]) *
+ (input_y - (1 << 10) * y0) * (input_x - (1 << 10) * x0);
+ const int64_t output_20 = output_20_ll + output_20_lu + output_20_rl + output_20_ru;
+ const int64_t round = (output_20 > 0) ? (1 << 19) : -(1 << 19);
+ const int8_t interpolation = static_cast<int8_t>((output_20 + round) / (1 << 20));
+ output_data[Offset(output_shape, b, y, x, c)] = interpolation;
+ }
+ }
+ }
+ }
+}
+
} // namespace cker
} // namespace nnfw
diff --git a/compute/cker/include/cker/operation/Round.h b/compute/cker/include/cker/operation/Round.h
index a04a741cf..d67714564 100644
--- a/compute/cker/include/cker/operation/Round.h
+++ b/compute/cker/include/cker/operation/Round.h
@@ -19,6 +19,7 @@
#define __NNFW_CKER_ROUND_H__
#include "cker/Shape.h"
+#include "cker/Utils.h"
#include <cmath>
@@ -41,6 +42,26 @@ inline float RoundToNearest(float value)
}
}
+#ifdef USE_NEON
+
+inline int32x4_t RoundToNearest(const float32x4_t input)
+{
+#if defined(__aarch64__) || defined(__SSSE3__)
+ // Note: vcvtnq_s32_f32 is not available in ARMv7
+ return vcvtnq_s32_f32(input);
+#else
+ static const float32x4_t zero_val_dup = vdupq_n_f32(0.0f);
+ static const float32x4_t point5_val_dup = vdupq_n_f32(0.5f);
+ static const float32x4_t minus_point5_val_dup = vdupq_n_f32(-0.5f);
+
+ const uint32x4_t mask = vcltq_f32(input, zero_val_dup);
+ const float32x4_t round = vbslq_f32(mask, minus_point5_val_dup, point5_val_dup);
+ return vcvtq_s32_f32(vaddq_f32(input, round));
+#endif // defined(__aarch64__) || defined(__SSSE3__)
+}
+
+#endif // NEON
+
inline void Round(const Shape &input_shape, const float *input_data, const Shape &output_shape,
float *output_data)
{
diff --git a/compute/cker/include/cker/operation/Select.h b/compute/cker/include/cker/operation/Select.h
index ab2de94cc..644fe0a0e 100644
--- a/compute/cker/include/cker/operation/Select.h
+++ b/compute/cker/include/cker/operation/Select.h
@@ -34,7 +34,7 @@ void Select(const Shape &input_condition_shape, const D *input_condition_data,
const T *input_y_data, const Shape &output_shape, T *output_data)
{
const int64_t flatsize =
- MatchingFlatSize(input_condition_shape, input_x_shape, input_y_shape, output_shape);
+ MatchingFlatSize(input_condition_shape, input_x_shape, input_y_shape, output_shape);
for (int64_t i = 0; i < flatsize; ++i)
{
output_data[i] = (input_condition_data[i] != 0) ? input_x_data[i] : input_y_data[i];
@@ -101,7 +101,7 @@ void BroadcastSelect4DSlow(const Shape &input_condition_shape, const D *input_co
const int x_index = SubscriptToIndex(desc_x, b, y, x, c);
const int y_index = SubscriptToIndex(desc_y, b, y, x, c);
output_data[Offset(extended_output_shape, b, y, x, c)] =
- input_condition_data[condition_index] ? input_x_data[x_index] : input_y_data[y_index];
+ input_condition_data[condition_index] ? input_x_data[x_index] : input_y_data[y_index];
}
}
}
diff --git a/compute/cker/include/cker/operation/Slice.h b/compute/cker/include/cker/operation/Slice.h
index a072cff8e..ef97fd5d8 100644
--- a/compute/cker/include/cker/operation/Slice.h
+++ b/compute/cker/include/cker/operation/Slice.h
@@ -43,16 +43,16 @@ inline void Slice(const SliceParams &op_params, const Shape &input_shape,
: start_b + op_params.size[0];
const int start_h = begin_count < 3 ? 0 : op_params.begin[begin_count - 3];
const int stop_h = (size_count < 3 || op_params.size[size_count - 3] == -1)
- ? input_shape.Dims(1)
- : start_h + op_params.size[size_count - 3];
+ ? input_shape.Dims(1)
+ : start_h + op_params.size[size_count - 3];
const int start_w = begin_count < 2 ? 0 : op_params.begin[begin_count - 2];
const int stop_w = (size_count < 2 || op_params.size[size_count - 2] == -1)
- ? input_shape.Dims(2)
- : start_w + op_params.size[size_count - 2];
+ ? input_shape.Dims(2)
+ : start_w + op_params.size[size_count - 2];
const int start_d = begin_count < 1 ? 0 : op_params.begin[begin_count - 1];
const int stop_d = (size_count < 1 || op_params.size[size_count - 1] == -1)
- ? input_shape.Dims(3)
- : start_d + op_params.size[size_count - 1];
+ ? input_shape.Dims(3)
+ : start_d + op_params.size[size_count - 1];
for (int in_b = start_b; in_b < stop_b; ++in_b)
{
diff --git a/compute/cker/include/cker/operation/SoftMax.h b/compute/cker/include/cker/operation/SoftMax.h
index 13e50b87a..35ecde4ba 100644
--- a/compute/cker/include/cker/operation/SoftMax.h
+++ b/compute/cker/include/cker/operation/SoftMax.h
@@ -23,6 +23,10 @@
#include "cker/Types.h"
#include "cker/eigen/Utils.h"
+#if __aarch64__ && __clang__
+#define TFLITE_SOFTMAX_USE_UINT16_LUT
+#endif
+
#include <Eigen/Core>
#include <fixedpoint/fixedpoint.h>
#include <cmath>
@@ -32,6 +36,45 @@ namespace nnfw
namespace cker
{
+namespace reference
+{
+
+// Note: this Softmax function supports inputs with any number of dimensions.
+inline void Softmax(const SoftmaxParams &params, const Shape &input_shape, const float *input_data,
+ const Shape &output_shape, float *output_data)
+{
+ const int trailing_dim = input_shape.DimensionsCount() - 1;
+ const int outer_size = MatchingFlatSizeSkipDim(input_shape, trailing_dim, output_shape);
+ const int depth = MatchingDim(input_shape, trailing_dim, output_shape, trailing_dim);
+
+ for (int i = 0; i < outer_size; ++i)
+ {
+ // Find max element value which we'll use to ensure numerical stability
+ // taking advantage of the following equality:
+ // exp(x[i])/sum(exp(x[i])) == exp(x[i]+C)/sum(exp(x[i]+C))
+ float max = std::numeric_limits<float>::lowest();
+ for (int c = 0; c < depth; ++c)
+ {
+ max = std::max(max, input_data[i * depth + c]);
+ }
+
+ // Compute sum.
+ float sum = 0.f;
+ for (int c = 0; c < depth; ++c)
+ {
+ sum += std::exp((input_data[i * depth + c] - max) * static_cast<float>(params.beta));
+ }
+
+ // Compute result.
+ for (int c = 0; c < depth; ++c)
+ {
+ output_data[i * depth + c] =
+ std::exp((input_data[i * depth + c] - max) * static_cast<float>(params.beta)) / sum;
+ }
+ }
+}
+} // namespace reference
+
// Performs softmax along the input of size (input_size * batch_size).
inline void Softmax(const float *in, const int input_size, const int batch_size, const float beta,
float *out)
@@ -88,87 +131,306 @@ inline void Softmax(const SoftmaxParams &params, const Shape &input_shape, const
out_mat.array().rowwise() *= scale;
}
-inline void Softmax(const SoftmaxParams &params, const Shape &input_shape,
- const uint8_t *input_data, const Shape &output_shape, uint8_t *output_data)
-{
- const int32_t input_beta_multiplier = params.input_multiplier;
- const int32_t input_beta_left_shift = params.input_left_shift;
- const int diff_min = params.diff_min;
- // The representation chosen for the input to the exp() function is Q5.26.
- // We need to leave extra space since values that we skip might be as large as
- // -32 before multiplying by input_beta_multiplier, and therefore as large as
- // -16 afterwards. Note that exp(-8) is definitely not insignificant to
- // accumulation, but exp(-16) definitely is.
- static const int kScaledDiffIntegerBits = 5;
- static const int kAccumulationIntegerBits = 12;
- using FixedPointScaledDiff = gemmlowp::FixedPoint<int32_t, kScaledDiffIntegerBits>;
- using FixedPointAccum = gemmlowp::FixedPoint<int32_t, kAccumulationIntegerBits>;
- using FixedPoint0 = gemmlowp::FixedPoint<int32_t, 0>;
+template <typename T> inline int32_t QuantizeSoftmaxOutput(float prob_rescaled, int32_t zero_point)
+{
+ const int32_t prob_rnd = static_cast<int32_t>(std::round(prob_rescaled));
+ return prob_rnd + zero_point;
+}
+
+#if !__aarch64__
+// On ARM64, rounding is faster than add + truncation, so this specialization is not used there.
+template <> inline int32_t QuantizeSoftmaxOutput<uint8_t>(float prob_rescaled, int32_t)
+{
+ return static_cast<int32_t>(prob_rescaled + 0.5f);
+}
+#endif
+
+inline void PopulateSoftmaxLookupTable(float *table, float input_scale, float beta)
+{
+ const float scale = -input_scale * beta;
+ const int32_t max_uint8 = std::numeric_limits<uint8_t>::max();
+ for (int32_t val = 0; val <= max_uint8; ++val)
+ {
+ table[max_uint8 - val] = expf(scale * val);
+ }
+}
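
What the lookup table encodes, as a standalone sketch (not part of the patch): table[255 - v] = exp(-input_scale * beta * v), so indexing the table at (255 - max_val) + input yields exp(input_scale * beta * (input - max_val)), the max-subtracted exponential that the templated Softmax below sums and then normalizes. The scale and index values here are illustrative.

#include <cmath>
#include <cstdint>

void SoftmaxTableSketch()
{
  float table[256];
  const float input_scale = 0.1f, beta = 1.0f;
  const float scale = -input_scale * beta;
  for (int32_t val = 0; val <= 255; ++val)
  {
    table[255 - val] = std::exp(scale * val);
  }

  const int32_t max_val = 200, input = 180;
  const float *table_offset = &table[255 - max_val];
  const float e = table_offset[input]; // == std::exp(input_scale * beta * (input - max_val))
  (void)e;
}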
+template <typename In, typename Out>
+inline void Softmax(const SoftmaxParams &params, const Shape &input_shape, const In *input_data,
+ const Shape &output_shape, Out *output_data)
+{
const int trailing_dim = input_shape.DimensionsCount() - 1;
- const int outer_size = MatchingFlatSizeSkipDim(input_shape, trailing_dim, output_shape);
- const int depth = MatchingDim(input_shape, trailing_dim, output_shape, trailing_dim);
+ const int excluding_last_dim = MatchingFlatSizeSkipDim(input_shape, trailing_dim, output_shape);
+ const int last_dim = MatchingDim(input_shape, trailing_dim, output_shape, trailing_dim);
- for (int i = 0; i < outer_size; ++i)
+ const int32_t clamp_max = std::numeric_limits<Out>::max();
+ const int32_t clamp_min = std::numeric_limits<Out>::min();
+ for (int i = 0; i < excluding_last_dim; ++i)
{
- uint8_t max_in_row = 0;
- for (int c = 0; c < depth; ++c)
+ int32_t max_val = std::numeric_limits<In>::min();
+ // Find max quantized value.
+ for (int j = 0; j < last_dim; ++j)
{
- max_in_row = std::max(max_in_row, input_data[i * depth + c]);
+ max_val = std::max(max_val, static_cast<int32_t>(input_data[j]));
}
- FixedPointAccum sum_of_exps = FixedPointAccum::Zero();
- for (int c = 0; c < depth; ++c)
+ float sum_exp = 0.0f;
+ const int32_t max_uint8 = std::numeric_limits<uint8_t>::max();
+ const float *table_offset = &params.table[max_uint8 - max_val];
+ // Calculate normalizer sum(exp(x)).
+ for (int j = 0; j < last_dim; ++j)
{
- int32_t input_diff = static_cast<int32_t>(input_data[i * depth + c]) - max_in_row;
- if (input_diff >= diff_min)
- {
- const int32_t input_diff_rescaled = MultiplyByQuantizedMultiplierGreaterThanOne(
- input_diff, input_beta_multiplier, input_beta_left_shift);
- const FixedPointScaledDiff scaled_diff_f8 =
- FixedPointScaledDiff::FromRaw(input_diff_rescaled);
- sum_of_exps = sum_of_exps + gemmlowp::Rescale<kAccumulationIntegerBits>(
- exp_on_negative_values(scaled_diff_f8));
- }
+ sum_exp += table_offset[input_data[j]];
}
- int32_t fixed_sum_of_exps = sum_of_exps.raw();
- int headroom_plus_one = CountLeadingZeros(static_cast<uint32_t>(fixed_sum_of_exps));
- // This is the number of bits to the left of the binary point above 1.0.
- // Consider fixed_sum_of_exps=1.25. In that case shifted_scale=0.8 and
- // no later adjustment will be needed.
- int num_bits_over_unit = kAccumulationIntegerBits - headroom_plus_one;
- int32_t shifted_sum_minus_one =
- static_cast<int32_t>((static_cast<uint32_t>(fixed_sum_of_exps) << headroom_plus_one) -
- (static_cast<uint32_t>(1) << 31));
+ const float inv_sum_exp = 1.0f / (sum_exp * params.scale);
+ // Normalize and quantize probabilities.
+ for (int j = 0; j < last_dim; ++j)
+ {
+ const float prob_rescaled = table_offset[input_data[j]] * inv_sum_exp;
+ const int32_t prob_quantized = QuantizeSoftmaxOutput<Out>(prob_rescaled, params.zero_point);
+ output_data[j] = static_cast<Out>(std::max(std::min(clamp_max, prob_quantized), clamp_min));
+ }
+ input_data += last_dim;
+ output_data += last_dim;
+ }
+}
- FixedPoint0 shifted_scale =
- one_over_one_plus_x_for_x_in_0_1(FixedPoint0::FromRaw(shifted_sum_minus_one));
+#ifdef TFLITE_SOFTMAX_USE_UINT16_LUT
+// Looks up each element of <indices> in <table>, returns them in a vector.
+inline uint8x16_t aarch64_lookup_vector(const uint8x16x4_t table[4], uint8x16_t indices)
+{
+ // Look up in 1st quarter of the table: top 2 bits of indices == 00
+ uint8x16_t output1 = vqtbl4q_u8(table[0], indices);
+ // Look up in 2nd quarter of the table: top 2 bits of indices == 01
+ uint8x16_t output2 = vqtbl4q_u8(table[1], veorq_u8(indices, vdupq_n_u8(0x40)));
+ // Look up in 3rd quarter of the table: top 2 bits of indices == 10
+ uint8x16_t output3 = vqtbl4q_u8(table[2], veorq_u8(indices, vdupq_n_u8(0x80)));
+ // Look up in 4th quarter of the table: top 2 bits of indices == 11
+ uint8x16_t output4 = vqtbl4q_u8(table[3], veorq_u8(indices, vdupq_n_u8(0xc0)));
- for (int c = 0; c < depth; ++c)
+ // Combine result of the 4 lookups.
+ return vorrq_u8(vorrq_u8(output1, output2), vorrq_u8(output3, output4));
+}
+
+inline void PopulateSoftmaxUInt8LookupTable(uint8_t *uint8_table1, uint8_t *uint8_table2,
+ float input_scale, float beta)
+{
+ const float scale = input_scale * beta;
+ const int32_t max_uint8 = std::numeric_limits<uint8_t>::max();
+ const int32_t max_uint16 = std::numeric_limits<uint16_t>::max();
+
+ for (int32_t val = 0; val <= max_uint8; ++val)
+ {
+ float input_to_exp = scale * (val - max_uint8);
+ int32_t temp = static_cast<int>(expf(input_to_exp) * max_uint16 + 0.5);
+ temp = std::min(max_uint16, temp);
+ uint8_t part1 = temp >> 8;
+ uint8_t part2 = temp & 0xff;
+ uint8_table1[val] = static_cast<uint8_t>(part1);
+ uint8_table2[val] = static_cast<uint8_t>(part2);
+ }
+}
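
Not part of the patch: each pair of entries in the two uint8 tables stores exp() as a 16-bit fixed-point value (scaled by 65535), split into a high byte (uint8_table1) and a low byte (uint8_table2); the scalar tail of SoftmaxInt8LUT below reassembles it as (part1 << 8) + part2. A standalone sketch of one entry, with illustrative scale and index values:

#include <algorithm>
#include <cmath>
#include <cstdint>

void UInt8LutEntrySketch()
{
  const float scale = (1.0f / 256.0f) * 1.0f; // input_scale * beta
  const int32_t val = 200;                    // table index; 255 means "equal to the row max"
  int32_t q16 = static_cast<int32_t>(std::exp(scale * (val - 255)) * 65535 + 0.5f);
  q16 = std::min(65535, q16);

  const uint8_t part1 = static_cast<uint8_t>(q16 >> 8);   // stored in uint8_table1[val]
  const uint8_t part2 = static_cast<uint8_t>(q16 & 0xff); // stored in uint8_table2[val]
  const int32_t reassembled = (part1 << 8) + part2;       // equals q16
  (void)reassembled;
}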
+
+inline int FindMaxValue(int size, const uint8_t *input_data, uint8_t offset)
+{
+ int32_t max_val = std::numeric_limits<uint8_t>::min();
+ int j = 0;
+
+ uint8x16_t max_val_dup = vdupq_n_u8(max_val);
+ uint8x16_t offset_dup = vdupq_n_u8(offset);
+ for (; j <= size - 16; j += 16)
+ {
+ uint8x16_t input_value = vld1q_u8(input_data + j);
+ input_value = veorq_u8(input_value, offset_dup);
+ max_val_dup = vmaxq_u8(input_value, max_val_dup);
+ }
+ max_val = std::max(max_val, static_cast<int32_t>(vmaxvq_u8(max_val_dup)));
+
+ for (; j < size; ++j)
+ {
+ max_val = std::max(max_val, static_cast<int32_t>(input_data[j] ^ offset));
+ }
+ return max_val;
+}
+
+#ifdef USE_NEON
+// Value_to_store layout:
+// [high_high, high_low, low_high, low_low].
+inline void StoreValue(int32x4x4_t value_to_store, int8_t *output)
+{
+ const int16x8_t result_1 =
+ vcombine_s16(vqmovn_s32(value_to_store.val[1]), vqmovn_s32(value_to_store.val[0]));
+ const int16x8_t result_2 =
+ vcombine_s16(vqmovn_s32(value_to_store.val[3]), vqmovn_s32(value_to_store.val[2]));
+ const int8x16_t result = vcombine_s8(vqmovn_s16(result_2), vqmovn_s16(result_1));
+ vst1q_s8(output, result);
+}
+
+// Value_to_store layout:
+// [high_high, high_low, low_high, low_low].
+inline void StoreValue(int32x4x4_t value_to_store, uint8_t *output)
+{
+ const uint16x8_t result_1 =
+ vcombine_u16(vqmovn_u32(vreinterpretq_u32_s32(value_to_store.val[1])),
+ vqmovn_u32(vreinterpretq_u32_s32(value_to_store.val[0])));
+ const uint16x8_t result_2 =
+ vcombine_u16(vqmovn_u32(vreinterpretq_u32_s32(value_to_store.val[3])),
+ vqmovn_u32(vreinterpretq_u32_s32(value_to_store.val[2])));
+ const uint8x16_t result = vcombine_u8(vqmovn_u16(result_2), vqmovn_u16(result_1));
+ vst1q_u8(output, result);
+}
+
+#endif
+
+template <typename In, typename Out>
+inline void SoftmaxInt8LUT(const SoftmaxParams &params, const Shape &input_shape,
+ const In *input_data, const Shape &output_shape, Out *output_data)
+{
+ const int trailing_dim = input_shape.DimensionsCount() - 1;
+ const int excluding_last_dim = MatchingFlatSizeSkipDim(input_shape, trailing_dim, output_shape);
+ const int last_dim = MatchingDim(input_shape, trailing_dim, output_shape, trailing_dim);
+
+ const int32_t clamp_max = std::numeric_limits<Out>::max();
+ const int32_t clamp_min = std::numeric_limits<Out>::min();
+
+  // The offset lets the input be processed uniformly as uint8.
+  // If the input is uint8, the data is used unchanged.
+  // If the input is int8, it is reinterpreted as uint8 and the offset (0x80)
+  // flips the sign bit, e.g.,
+  // int8 127 becomes 255 in uint8.
+ uint8_t offset = 0;
+ if (std::is_same<In, int8_t>::value)
+ {
+ offset = 0x80;
+ }
+
+ const uint8_t *input_data_uint = reinterpret_cast<const uint8_t *>(input_data);
+
+ // This code uses ARM64-only instructions.
+ // TODO(b/143709993): Port to ARMv7
+
+ // Load the tables into registers. (4*4 128-bit registers)
+ uint8x16x4_t table1[4];
+ table1[0] = vld1q_u8_x4(params.uint8_table1 + 16 * 4 * 0);
+ table1[1] = vld1q_u8_x4(params.uint8_table1 + 16 * 4 * 1);
+ table1[2] = vld1q_u8_x4(params.uint8_table1 + 16 * 4 * 2);
+ table1[3] = vld1q_u8_x4(params.uint8_table1 + 16 * 4 * 3);
+
+ uint8x16x4_t table2[4];
+ table2[0] = vld1q_u8_x4(params.uint8_table2 + 16 * 4 * 0);
+ table2[1] = vld1q_u8_x4(params.uint8_table2 + 16 * 4 * 1);
+ table2[2] = vld1q_u8_x4(params.uint8_table2 + 16 * 4 * 2);
+ table2[3] = vld1q_u8_x4(params.uint8_table2 + 16 * 4 * 3);
+
+ for (int i = 0; i < excluding_last_dim; ++i)
+ {
+ // Find max quantized value.
+ int32_t max_val = FindMaxValue(last_dim, input_data_uint, offset);
+
+ int32_t sum_exp = 0;
+ const int32_t max_uint8 = std::numeric_limits<uint8_t>::max();
+ const uint8_t table_offset = max_uint8 - max_val;
+
+ // Calculate normalizer sum(exp(x)).
+ int sum_j = 0;
+ uint8x16_t table_offset_dup = vdupq_n_u8(table_offset);
+ uint8x16_t offset_dup = vdupq_n_u8(offset);
+ uint32x4_t sum_4 = vdupq_n_u32(0);
+ const int multiplier_shift = 8;
+ for (; sum_j <= last_dim - 16; sum_j += 16)
+ {
+ uint8x16_t input_value = vld1q_u8(input_data_uint + sum_j);
+ input_value = veorq_u8(input_value, offset_dup);
+ input_value = vaddq_u8(input_value, table_offset_dup);
+
+ const uint8x16_t output1 = aarch64_lookup_vector(table1, input_value);
+ const uint8x16_t output2 = aarch64_lookup_vector(table2, input_value);
+
+ uint16x8_t exp_value1 = vshll_n_u8(vget_high_u8(output1), multiplier_shift);
+ uint16x8_t exp_value2 = vshll_n_u8(vget_low_u8(output1), multiplier_shift);
+
+ exp_value1 = vaddw_u8(exp_value1, vget_high_u8(output2));
+ exp_value2 = vaddw_u8(exp_value2, vget_low_u8(output2));
+
+ sum_4 = vpadalq_u16(sum_4, exp_value1);
+ sum_4 = vpadalq_u16(sum_4, exp_value2);
+ }
+ int temp = vgetq_lane_u32(sum_4, 0) + vgetq_lane_u32(sum_4, 1) + vgetq_lane_u32(sum_4, 2) +
+ vgetq_lane_u32(sum_4, 3);
+ sum_exp += temp;
+
+ for (; sum_j < last_dim; ++sum_j)
{
- int32_t input_diff = static_cast<int32_t>(input_data[i * depth + c]) - max_in_row;
- if (input_diff >= diff_min)
- {
- const int32_t input_diff_rescaled = MultiplyByQuantizedMultiplierGreaterThanOne(
- input_diff, input_beta_multiplier, input_beta_left_shift);
- const FixedPointScaledDiff scaled_diff_f8 =
- FixedPointScaledDiff::FromRaw(input_diff_rescaled);
-
- FixedPoint0 exp_in_0 = exp_on_negative_values(scaled_diff_f8);
- int32_t unsat_output = gemmlowp::RoundingDivideByPOT((shifted_scale * exp_in_0).raw(),
- num_bits_over_unit + 31 - 8);
-
- output_data[i * depth + c] = static_cast<uint8_t>(
- std::max(std::min(unsat_output, static_cast<int32_t>(255)), static_cast<int32_t>(0)));
- }
- else
- {
- output_data[i * depth + c] = 0;
- }
+ const uint8_t index = (input_data_uint[sum_j] ^ offset) + table_offset;
+
+ uint8_t part1 = params.uint8_table1[index];
+ uint8_t part2 = params.uint8_table2[index];
+ sum_exp += ((part1 << 8) + part2);
+ }
+
+ const float inv_sum_exp = 1.0f / (sum_exp * params.scale);
+
+ int32_t multiplier, shift;
+ QuantizeMultiplier(inv_sum_exp, &multiplier, &shift);
+
+ // Normalize and quantize probabilities.
+ int j = 0;
+ const int32x4_t output_zp_dup = vdupq_n_s32(params.zero_point);
+ const int32x4_t max_val_dup = vdupq_n_s32(clamp_max);
+ const int32x4_t min_val_dup = vdupq_n_s32(clamp_min);
+
+ for (; j <= last_dim - 16; j += 16)
+ {
+ uint8x16_t input_value = vld1q_u8(input_data_uint + j);
+ input_value = veorq_u8(input_value, offset_dup);
+ input_value = vaddq_u8(input_value, table_offset_dup);
+
+ const uint8x16_t output1 = aarch64_lookup_vector(table1, input_value);
+ const uint8x16_t output2 = aarch64_lookup_vector(table2, input_value);
+
+ uint16x8_t exp_value1 = vshll_n_u8(vget_high_u8(output1), multiplier_shift);
+ uint16x8_t exp_value2 = vshll_n_u8(vget_low_u8(output1), multiplier_shift);
+
+ exp_value1 = vaddw_u8(exp_value1, vget_high_u8(output2));
+ exp_value2 = vaddw_u8(exp_value2, vget_low_u8(output2));
+
+ int32x4x4_t output_value;
+ output_value.val[0] = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(exp_value1)));
+ output_value.val[1] = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(exp_value1)));
+ output_value.val[2] = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(exp_value2)));
+ output_value.val[3] = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(exp_value2)));
+
+ int32x4x4_t temp_val = MultiplyByQuantizedMultiplier4Rows(output_value, multiplier, shift);
+
+ temp_val.val[0] = vaddq_s32(temp_val.val[0], output_zp_dup);
+ temp_val.val[1] = vaddq_s32(temp_val.val[1], output_zp_dup);
+ temp_val.val[2] = vaddq_s32(temp_val.val[2], output_zp_dup);
+ temp_val.val[3] = vaddq_s32(temp_val.val[3], output_zp_dup);
+
+ temp_val.val[0] = vmaxq_s32(vminq_s32(temp_val.val[0], max_val_dup), min_val_dup);
+ temp_val.val[1] = vmaxq_s32(vminq_s32(temp_val.val[1], max_val_dup), min_val_dup);
+ temp_val.val[2] = vmaxq_s32(vminq_s32(temp_val.val[2], max_val_dup), min_val_dup);
+ temp_val.val[3] = vmaxq_s32(vminq_s32(temp_val.val[3], max_val_dup), min_val_dup);
+
+ StoreValue(temp_val, output_data + j);
+ }
+ for (; j < last_dim; ++j)
+ {
+ const uint8_t index = (input_data_uint[j] ^ offset) + table_offset;
+ const uint8_t part1 = params.uint8_table1[index];
+ const uint8_t part2 = params.uint8_table2[index];
+ const int32_t exp_value = (part1 << 8) + part2;
+ const int32_t output_value = MultiplyByQuantizedMultiplier(exp_value, multiplier, shift);
+
+ output_data[j] = static_cast<Out>(
+ std::max(std::min(clamp_max, output_value + params.zero_point), clamp_min));
}
+ input_data_uint += last_dim;
+ output_data += last_dim;
}
}
+#endif
} // namespace cker
} // namespace nnfw
diff --git a/compute/cker/include/cker/operation/SpaceToBatchND.h b/compute/cker/include/cker/operation/SpaceToBatchND.h
index feeb358c9..aff36e2f3 100644
--- a/compute/cker/include/cker/operation/SpaceToBatchND.h
+++ b/compute/cker/include/cker/operation/SpaceToBatchND.h
@@ -79,9 +79,9 @@ inline void SpaceToBatchND(const SpaceToBatchParams &params, const Shape &unexte
else
{
const T *in =
- input_data + Offset(input_shape, input_batch,
- (out_h * block_shape_height + shift_h) - padding_top,
- (out_w * block_shape_width + shift_w) - padding_left, 0);
+ input_data + Offset(input_shape, input_batch,
+ (out_h * block_shape_height + shift_h) - padding_top,
+ (out_w * block_shape_width + shift_w) - padding_left, 0);
memcpy(out, in, depth * sizeof(T));
}
}
diff --git a/compute/cker/include/cker/operation/StatelessRandomUniform.h b/compute/cker/include/cker/operation/StatelessRandomUniform.h
index d5952ae23..dcf649ca1 100644
--- a/compute/cker/include/cker/operation/StatelessRandomUniform.h
+++ b/compute/cker/include/cker/operation/StatelessRandomUniform.h
@@ -72,8 +72,8 @@ void Fill(random::PhiloxRandom random, Tensor *output)
Distribution());
}
-inline void StatelessRandomUniform(const Shape &shape_shape, const int *shape_data,
- const Shape &seed_shape, const int *seed_data,
+inline void StatelessRandomUniform(const Shape &shape_shape, const int32_t *shape_data,
+ const Shape &seed_shape, const int32_t *seed_data,
const Shape &output_shape, float *output_data)
{
Tensor shape_t;
@@ -95,7 +95,7 @@ inline void StatelessRandomUniform(const Shape &shape_shape, const int *shape_da
GenerateKey(seed_t, &key, &counter);
Fill<Eigen::ThreadPoolDevice, random::UniformDistribution<random::PhiloxRandom, float>>(
- random::PhiloxRandom(counter, key), &output_t);
+ random::PhiloxRandom(counter, key), &output_t);
}
} // namespace cker
} // namespace nnfw
diff --git a/compute/cker/include/cker/operation/StridedSlice.h b/compute/cker/include/cker/operation/StridedSlice.h
index c57b4daa0..2f1089575 100644
--- a/compute/cker/include/cker/operation/StridedSlice.h
+++ b/compute/cker/include/cker/operation/StridedSlice.h
@@ -260,12 +260,41 @@ template <typename T>
inline void StridedSlice(const StridedSliceParams &op_params, const Shape &unextended_input_shape,
const T *input_data, const Shape &unextended_output_shape, T *output_data)
{
- // Note that the output_shape is not used herein.
- StridedSliceParams params_copy = op_params;
-
assert(unextended_input_shape.DimensionsCount() <= 4);
assert(unextended_output_shape.DimensionsCount() <= 4);
+ bool optimize = true;
+ int st_count = op_params.strides_count;
+ for (int idx = 0; idx < st_count - 1; idx++)
+ {
+ const int axis_size = unextended_input_shape.Dims(idx);
+ const int start = StartForAxis(op_params, unextended_input_shape, idx);
+ const int stop = StopForAxis(op_params, unextended_input_shape, idx, start);
+ if ((axis_size != 1) && (start != 0 || stop != 0))
+ {
+ optimize = false;
+ break;
+ }
+ }
+
+ if (optimize)
+ {
+ if (op_params.strides[st_count - 1] == 1)
+ {
+ const int start = StartForAxis(op_params, unextended_input_shape, st_count - 1);
+ const int end = StopForAxis(op_params, unextended_input_shape, st_count - 1, start);
+
+ for (int idx = 0; idx < end - start; idx++)
+ {
+ output_data[idx] = input_data[idx + start];
+ }
+ return;
+ }
+ }
+
+ // Note that the output_shape is not used herein.
+ StridedSliceParams params_copy = op_params;
+
const Shape input_shape = Shape::ExtendedShape(4, unextended_input_shape);
const Shape output_shape = Shape::ExtendedShape(4, unextended_output_shape);
diff --git a/compute/cker/include/cker/operation/Tile.h b/compute/cker/include/cker/operation/Tile.h
index 1dcdd9b79..42433468a 100644
--- a/compute/cker/include/cker/operation/Tile.h
+++ b/compute/cker/include/cker/operation/Tile.h
@@ -55,7 +55,7 @@ std::pair<int, int> TileOneDimension(const Shape &in_dimensions, const T *in_dat
{
int stride_size = 0, tiled_stride_size = 0;
std::tie(stride_size, tiled_stride_size) =
- TileOneDimension(in_dimensions, copy_from_data, multipliers, copy_to_data, dimension + 1);
+ TileOneDimension(in_dimensions, copy_from_data, multipliers, copy_to_data, dimension + 1);
copy_from_data += stride_size;
copy_to_data += tiled_stride_size;
total_stride_size += stride_size;
diff --git a/compute/cker/include/cker/operation/Transpose.h b/compute/cker/include/cker/operation/Transpose.h
index 9d8cd340d..52c826c39 100644
--- a/compute/cker/include/cker/operation/Transpose.h
+++ b/compute/cker/include/cker/operation/Transpose.h
@@ -288,7 +288,7 @@ size_t Flatten(const Shape &input_shape, const Shape &output_shape, const Transp
return flat_size;
}
-} // namespace anonymous (util)
+} // namespace
// Transpose2D only deals with typical 2D matrix transpose ops.
// Perform transpose by transposing 4x4 blocks of the input, proceeding from
@@ -555,9 +555,9 @@ void Transpose(const TransposeParams &unshrunk_params, const Shape &unshrunk_inp
const int total_size = shrunk_input_shape.FlatSize();
const int non_flatten_size =
- Flatten(shrunk_input_shape, shrunk_output_shape, shrunk_params,
+ Flatten(shrunk_input_shape, shrunk_output_shape, shrunk_params,
- &non_flatten_input_shape, &non_flatten_output_shape, &non_flatten_params);
+ &non_flatten_input_shape, &non_flatten_output_shape, &non_flatten_params);
assert(non_flatten_params.perm[0] != 0);
for (int i = 0; i < total_size; i += non_flatten_size)
diff --git a/compute/cker/include/cker/operation/TransposeConv.h b/compute/cker/include/cker/operation/TransposeConv.h
index 7db3a1179..d41f86047 100644
--- a/compute/cker/include/cker/operation/TransposeConv.h
+++ b/compute/cker/include/cker/operation/TransposeConv.h
@@ -90,11 +90,11 @@ inline void TransposeConv(const TransposeConvParams &params, const Shape &input_
(out_y < output_height))
{
float input_value =
- input_data[Offset(input_shape, batch, in_y, in_x, in_channel)];
- float filter_value = filter_data[Offset(filter_shape, out_channel, filter_y,
- filter_x, in_channel)];
+ input_data[Offset(input_shape, batch, in_y, in_x, in_channel)];
+ float filter_value =
+ filter_data[Offset(filter_shape, out_channel, filter_y, filter_x, in_channel)];
output_data[Offset(output_shape, batch, out_y, out_x, out_channel)] +=
- input_value * filter_value;
+ input_value * filter_value;
}
}
}
diff --git a/compute/cker/include/cker/operation/optimized/BinaryArithmeticOps.h b/compute/cker/include/cker/operation/optimized/BinaryArithmeticOps.h
index ac5069917..1fe3e1517 100644
--- a/compute/cker/include/cker/operation/optimized/BinaryArithmeticOps.h
+++ b/compute/cker/include/cker/operation/optimized/BinaryArithmeticOps.h
@@ -19,6 +19,8 @@
#define __NNFW_CKER_OPTIMIZED_BINARYARITHMETICOPS_H__
#include <functional>
+#include <limits>
+#include <utility>
#include "cker/neon/neon_check.h"
#include "cker/operation/reference/BinaryArithmeticOps.h"
#include "cker/Shape.h"
@@ -33,8 +35,9 @@ namespace cker
namespace optimized
{
+// Old version: For Sub(float) and Div.
template <typename ElementwiseF, typename ScalarBroadcastF, typename T>
-inline void BinaryBroadcastFiveFold(const BinaryArithmeticOpParam &params,
+inline void BinaryBroadcastFiveFold(const BinaryArithmeticOpParam &params, bool switch_inputs,
const Shape & /* unswitched_input1_shape */,
const T *unswitched_input1_data,
const Shape & /* unswitched_input2_shape */,
@@ -42,11 +45,8 @@ inline void BinaryBroadcastFiveFold(const BinaryArithmeticOpParam &params,
const Shape & /* output_shape */, T *output_data,
ElementwiseF elementwise_f, ScalarBroadcastF scalar_broadcast_f)
{
- const bool use_unswitched =
- params.broadcast_category == BroadcastableOpCategory::kFirstInputBroadcastsFast;
-
- const T *input1_data = use_unswitched ? unswitched_input1_data : unswitched_input2_data;
- const T *input2_data = use_unswitched ? unswitched_input2_data : unswitched_input1_data;
+ const T *input1_data = switch_inputs ? unswitched_input2_data : unswitched_input1_data;
+ const T *input2_data = switch_inputs ? unswitched_input1_data : unswitched_input2_data;
// Fivefold nested loops. The second input resets its position for each
// iteration of the second loop. The first input resets its position at the
@@ -123,29 +123,129 @@ inline void BinaryBroadcastFiveFold(const BinaryArithmeticOpParam &params,
}
}
-inline int32_t quant8_sum(const BinaryArithmeticOpParam &params, const uint8_t input1_data,
- const uint8_t input2_data)
+// New version: For Mul, Add and Sub(quant8)
+template <typename ElementwiseF, typename ScalarBroadcastF, typename T>
+inline void BinaryBroadcastFiveFold(const BinaryArithmeticOpParam &unswitched_params,
+ const Shape & /* unswitched_input1_shape */,
+ const T *unswitched_input1_data,
+ const Shape & /* unswitched_input2_shape */,
+ const T *unswitched_input2_data,
+ const Shape & /* output_shape */, T *output_data,
+ ElementwiseF elementwise_f, ScalarBroadcastF scalar_broadcast_f)
+{
+ BinaryArithmeticOpParam switched_params = unswitched_params;
+ switched_params.input1_offset = unswitched_params.input2_offset;
+ switched_params.input1_multiplier = unswitched_params.input2_multiplier;
+ switched_params.input1_shift = unswitched_params.input2_shift;
+ switched_params.input2_offset = unswitched_params.input1_offset;
+ switched_params.input2_multiplier = unswitched_params.input1_multiplier;
+ switched_params.input2_shift = unswitched_params.input1_shift;
+
+ const bool use_unswitched =
+ unswitched_params.broadcast_category == BroadcastableOpCategory::kFirstInputBroadcastsFast;
+
+ const BinaryArithmeticOpParam &params = use_unswitched ? unswitched_params : switched_params;
+ const T *input1_data = use_unswitched ? unswitched_input1_data : unswitched_input2_data;
+ const T *input2_data = use_unswitched ? unswitched_input2_data : unswitched_input1_data;
+
+  // Fivefold nested loops. The second input resets its position for each
+  // iteration of the second loop. The first input resets its position at the
+  // beginning of the fourth loop. The innermost loop applies the elementwise
+  // functor to contiguous sections of the arrays.
+ T *output_data_ptr = output_data;
+ const T *input1_data_ptr = input1_data;
+ const T *input2_data_reset = input2_data;
+ // In the fivefold pattern, y0, y2 and y4 are not broadcast, and so shared
+ // between input shapes. y3 for input 1 is always broadcast, and so the
+ // dimension there is 1, whereas optionally y1 might be broadcast for
+ // input 2. Put another way, input1.shape.FlatSize = y0 * y1 * y2 * y4,
+ // input2.shape.FlatSize = y0 * y2 * y3 * y4.
+ int y0 = params.broadcast_shape[0];
+ int y1 = params.broadcast_shape[1];
+ int y2 = params.broadcast_shape[2];
+ int y3 = params.broadcast_shape[3];
+ int y4 = params.broadcast_shape[4];
+ if (y4 > 1)
+ {
+ // General fivefold pattern, with y4 > 1 so there is a non-broadcast inner
+ // dimension.
+ for (int i0 = 0; i0 < y0; ++i0)
+ {
+ const T *input2_data_ptr = nullptr;
+ for (int i1 = 0; i1 < y1; ++i1)
+ {
+ input2_data_ptr = input2_data_reset;
+ for (int i2 = 0; i2 < y2; ++i2)
+ {
+ for (int i3 = 0; i3 < y3; ++i3)
+ {
+ elementwise_f(y4, params, input1_data_ptr, input2_data_ptr, output_data_ptr);
+ input2_data_ptr += y4;
+ output_data_ptr += y4;
+ }
+ // We have broadcast y4 of input1 data y3 times, and now move on.
+ input1_data_ptr += y4;
+ }
+ }
+ // We have broadcast y2*y3*y4 of input2 data y1 times, and now move on.
+ input2_data_reset = input2_data_ptr;
+ }
+ }
+ else
+ {
+ // Special case of y4 == 1, in which the innermost loop is a single
+ // element and can be combined with the next (y3) as an inner broadcast.
+ //
+ // Note that this handles the case of pure scalar broadcast when
+ // y0 == y1 == y2 == 1. With low overhead it handles cases such as scalar
+ // broadcast with batch (as y2 > 1).
+ //
+    // NOTE The process is the same as the general case above, except that it
+    // is simplified for y4 == 1 and the loop over y3 is folded into the
+    // scalar_broadcast_f call.
+ for (int i0 = 0; i0 < y0; ++i0)
+ {
+ const T *input2_data_ptr = nullptr;
+ for (int i1 = 0; i1 < y1; ++i1)
+ {
+ input2_data_ptr = input2_data_reset;
+ for (int i2 = 0; i2 < y2; ++i2)
+ {
+ scalar_broadcast_f(y3, params, *input1_data_ptr, input2_data_ptr, output_data_ptr);
+ input2_data_ptr += y3;
+ output_data_ptr += y3;
+ input1_data_ptr += 1;
+ }
+ }
+ input2_data_reset = input2_data_ptr;
+ }
+ }
+}
+
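As a sanity check for the indexing described in the comments above, a plain five-loop scalar version (illustrative only, using add for concreteness since the template accepts any elementwise functor) looks like this; it assumes input1 is laid out as [y0, y1, y2, y4] and input2 as [y0, y2, y3, y4], exactly as the FlatSize relations state.

#include <cstddef>
#include <vector>

// out has shape [y0, y1, y2, y3, y4]; input1 broadcasts over y3, input2 over y1.
std::vector<float> fivefold_add_reference(const std::vector<float> &in1,
                                          const std::vector<float> &in2, int y0, int y1, int y2,
                                          int y3, int y4)
{
  std::vector<float> out(static_cast<size_t>(y0) * y1 * y2 * y3 * y4);
  size_t o = 0;
  for (int i0 = 0; i0 < y0; ++i0)
    for (int i1 = 0; i1 < y1; ++i1)
      for (int i2 = 0; i2 < y2; ++i2)
        for (int i3 = 0; i3 < y3; ++i3)
          for (int i4 = 0; i4 < y4; ++i4)
          {
            const size_t idx1 = ((static_cast<size_t>(i0) * y1 + i1) * y2 + i2) * y4 + i4;
            const size_t idx2 = ((static_cast<size_t>(i0) * y2 + i2) * y3 + i3) * y4 + i4;
            out[o++] = in1[idx1] + in2[idx2];
          }
  return out;
}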
+template <typename T>
+inline typename std::enable_if_t<is_quant8<T>::value, int32_t>
+quant8_sum(const BinaryArithmeticOpParam &params, const T input1_data, const T input2_data)
{
const int32_t input1_val = params.input1_offset + input1_data;
const int32_t input2_val = params.input2_offset + input2_data;
const int32_t shifted_input1_val = input1_val * (1 << params.left_shift);
const int32_t shifted_input2_val = input2_val * (1 << params.left_shift);
const int32_t scaled_input1_val = MultiplyByQuantizedMultiplierSmallerThanOneExp(
- shifted_input1_val, params.input1_multiplier, params.input1_shift);
+ shifted_input1_val, params.input1_multiplier, params.input1_shift);
const int32_t scaled_input2_val = MultiplyByQuantizedMultiplierSmallerThanOneExp(
- shifted_input2_val, params.input2_multiplier, params.input2_shift);
+ shifted_input2_val, params.input2_multiplier, params.input2_shift);
const int32_t raw_sum = scaled_input1_val + scaled_input2_val;
const int32_t raw_output = MultiplyByQuantizedMultiplierSmallerThanOneExp(
- raw_sum, params.output_multiplier, params.output_shift) +
+ raw_sum, params.output_multiplier, params.output_shift) +
params.output_offset;
const int32_t clamped_output = std::min(params.quantized_activation_max,
std::max(params.quantized_activation_min, raw_output));
return clamped_output;
}
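The fixed-point sequence in quant8_sum is the usual dequantize, add, requantize recipe. A float reference of the same computation, under the assumption (not stated in this patch) that the caller fills the input offsets with negated zero points and the output offset with the output zero point, would be:

#include <algorithm>
#include <cmath>
#include <cstdint>

// Illustrative float reference for the quantized add above.
inline uint8_t quant8_sum_reference(uint8_t q1, float scale1, int32_t zero1, // input 1
                                    uint8_t q2, float scale2, int32_t zero2, // input 2
                                    float out_scale, int32_t out_zero)       // output
{
  const float real = scale1 * (static_cast<int32_t>(q1) - zero1) +
                     scale2 * (static_cast<int32_t>(q2) - zero2);
  const int32_t q = static_cast<int32_t>(std::lround(real / out_scale)) + out_zero;
  return static_cast<uint8_t>(std::min(255, std::max(0, q)));
}

The left_shift, per-input multipliers and output multiplier in the real kernel are this same arithmetic folded into 32-bit fixed point.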
-inline void AddElementwiseQuant8(int size, const BinaryArithmeticOpParam &params,
- const uint8_t *input1_data, const uint8_t *input2_data,
- uint8_t *output_data)
+inline void AddElementwise(int size, const BinaryArithmeticOpParam &params,
+ const uint8_t *input1_data, const uint8_t *input2_data,
+ uint8_t *output_data)
{
int i = 0;
@@ -193,9 +293,9 @@ inline void AddElementwiseQuant8(int size, const BinaryArithmeticOpParam &params
const int16x4_t s1_narrowed = vmovn_s32(s1);
const int16x4_t s2_narrowed = vmovn_s32(s2);
const int16x8_t s =
- vaddq_s16(vcombine_s16(s1_narrowed, s2_narrowed), vdupq_n_s16(params.output_offset));
- const uint8x8_t clamped = vmax_u8(output_activation_min_vector,
- vmin_u8(output_activation_max_vector, vqmovun_s16(s)));
+ vaddq_s16(vcombine_s16(s1_narrowed, s2_narrowed), vdupq_n_s16(params.output_offset));
+ const uint8x8_t clamped =
+ vmax_u8(output_activation_min_vector, vmin_u8(output_activation_max_vector, vqmovun_s16(s)));
vst1_u8(output_data + i, clamped);
}
#endif // NEON
@@ -206,12 +306,12 @@ inline void AddElementwiseQuant8(int size, const BinaryArithmeticOpParam &params
const int32_t shifted_input1_val = input1_val * (1 << params.left_shift);
const int32_t shifted_input2_val = input2_val * (1 << params.left_shift);
const int32_t scaled_input1_val = MultiplyByQuantizedMultiplierSmallerThanOneExp(
- shifted_input1_val, params.input1_multiplier, params.input1_shift);
+ shifted_input1_val, params.input1_multiplier, params.input1_shift);
const int32_t scaled_input2_val = MultiplyByQuantizedMultiplierSmallerThanOneExp(
- shifted_input2_val, params.input2_multiplier, params.input2_shift);
+ shifted_input2_val, params.input2_multiplier, params.input2_shift);
const int32_t raw_sum = scaled_input1_val + scaled_input2_val;
const int32_t raw_output = MultiplyByQuantizedMultiplierSmallerThanOneExp(
- raw_sum, params.output_multiplier, params.output_shift) +
+ raw_sum, params.output_multiplier, params.output_shift) +
params.output_offset;
const int32_t clamped_output = std::min(params.quantized_activation_max,
std::max(params.quantized_activation_min, raw_output));
@@ -220,7 +320,248 @@ inline void AddElementwiseQuant8(int size, const BinaryArithmeticOpParam &params
}
inline void AddElementwise(int size, const BinaryArithmeticOpParam &params,
- const float *input1_data, const float *input2_data, float *output_data)
+ const int8_t *input1_data, const int8_t *input2_data,
+ int8_t *output_data)
+{
+ int i = 0;
+#ifdef USE_NEON
+ const int8x16_t output_activation_min_vector = vdupq_n_s8(params.quantized_activation_min);
+ const int8x16_t output_activation_max_vector = vdupq_n_s8(params.quantized_activation_max);
+
+ const int input1_left_shift = params.left_shift + params.input1_shift;
+ const int input2_left_shift = params.left_shift + params.input2_shift;
+ const int32x4_t input1_left_dup = vdupq_n_s32(input1_left_shift);
+ const int32x4_t input2_left_dup = vdupq_n_s32(input2_left_shift);
+
+ const int16x8_t input1_offset_dup = vdupq_n_s16(params.input1_offset);
+ const int16x8_t input2_offset_dup = vdupq_n_s16(params.input2_offset);
+
+ for (; i <= size - 16; i += 16)
+ {
+ const int8x16_t input1_val_original = vld1q_s8(input1_data + i);
+ const int8x16_t input2_val_original = vld1q_s8(input2_data + i);
+
+ const int16x8_t input1_val_s16_high = vmovl_s8(vget_high_s8(input1_val_original));
+ const int16x8_t input1_val_s16_low = vmovl_s8(vget_low_s8(input1_val_original));
+
+ const int16x8_t input2_val_s16_high = vmovl_s8(vget_high_s8(input2_val_original));
+ const int16x8_t input2_val_s16_low = vmovl_s8(vget_low_s8(input2_val_original));
+ const int16x8_t input1_val_high = vaddq_s16(input1_val_s16_high, input1_offset_dup);
+ const int16x8_t input2_val_high = vaddq_s16(input2_val_s16_high, input2_offset_dup);
+ const int16x8_t input1_val_low = vaddq_s16(input1_val_s16_low, input1_offset_dup);
+ const int16x8_t input2_val_low = vaddq_s16(input2_val_s16_low, input2_offset_dup);
+ const int16x4_t input1_val_high_high = vget_high_s16(input1_val_high);
+ const int16x4_t input1_val_high_low = vget_low_s16(input1_val_high);
+ const int16x4_t input1_val_low_high = vget_high_s16(input1_val_low);
+ const int16x4_t input1_val_low_low = vget_low_s16(input1_val_low);
+ const int16x4_t input2_val_high_high = vget_high_s16(input2_val_high);
+ const int16x4_t input2_val_high_low = vget_low_s16(input2_val_high);
+ const int16x4_t input2_val_low_high = vget_high_s16(input2_val_low);
+ const int16x4_t input2_val_low_low = vget_low_s16(input2_val_low);
+ int32x4_t x111 = vmovl_s16(input1_val_low_low);
+ int32x4_t x112 = vmovl_s16(input1_val_low_high);
+ int32x4_t x121 = vmovl_s16(input1_val_high_low);
+ int32x4_t x122 = vmovl_s16(input1_val_high_high);
+ int32x4_t x211 = vmovl_s16(input2_val_low_low);
+ int32x4_t x212 = vmovl_s16(input2_val_low_high);
+ int32x4_t x221 = vmovl_s16(input2_val_high_low);
+ int32x4_t x222 = vmovl_s16(input2_val_high_high);
+
+ x111 = vshlq_s32(x111, input1_left_dup);
+ x112 = vshlq_s32(x112, input1_left_dup);
+ x121 = vshlq_s32(x121, input1_left_dup);
+ x122 = vshlq_s32(x122, input1_left_dup);
+ x211 = vshlq_s32(x211, input2_left_dup);
+ x212 = vshlq_s32(x212, input2_left_dup);
+ x221 = vshlq_s32(x221, input2_left_dup);
+ x222 = vshlq_s32(x222, input2_left_dup);
+ x111 = vqrdmulhq_n_s32(x111, params.input1_multiplier);
+ x112 = vqrdmulhq_n_s32(x112, params.input1_multiplier);
+ x121 = vqrdmulhq_n_s32(x121, params.input1_multiplier);
+ x122 = vqrdmulhq_n_s32(x122, params.input1_multiplier);
+ x211 = vqrdmulhq_n_s32(x211, params.input2_multiplier);
+ x212 = vqrdmulhq_n_s32(x212, params.input2_multiplier);
+ x221 = vqrdmulhq_n_s32(x221, params.input2_multiplier);
+ x222 = vqrdmulhq_n_s32(x222, params.input2_multiplier);
+ int32x4_t s11 = vaddq_s32(x111, x211);
+ int32x4_t s12 = vaddq_s32(x112, x212);
+ int32x4_t s21 = vaddq_s32(x121, x221);
+ int32x4_t s22 = vaddq_s32(x122, x222);
+ s11 = vqrdmulhq_n_s32(s11, params.output_multiplier);
+ s12 = vqrdmulhq_n_s32(s12, params.output_multiplier);
+ s21 = vqrdmulhq_n_s32(s21, params.output_multiplier);
+ s22 = vqrdmulhq_n_s32(s22, params.output_multiplier);
+ using gemmlowp::RoundingDivideByPOT;
+ s11 = RoundingDivideByPOT(s11, -params.output_shift);
+ s12 = RoundingDivideByPOT(s12, -params.output_shift);
+ s21 = RoundingDivideByPOT(s21, -params.output_shift);
+ s22 = RoundingDivideByPOT(s22, -params.output_shift);
+ const int16x4_t s11_narrowed = vmovn_s32(s11);
+ const int16x4_t s12_narrowed = vmovn_s32(s12);
+ const int16x4_t s21_narrowed = vmovn_s32(s21);
+ const int16x4_t s22_narrowed = vmovn_s32(s22);
+ const int16x8_t s1 =
+ vaddq_s16(vcombine_s16(s11_narrowed, s12_narrowed), vdupq_n_s16(params.output_offset));
+ const int16x8_t s2 =
+ vaddq_s16(vcombine_s16(s21_narrowed, s22_narrowed), vdupq_n_s16(params.output_offset));
+ const int8x16_t s = vcombine_s8(vqmovn_s16(s1), vqmovn_s16(s2));
+
+ const int8x16_t clamped =
+ vmaxq_s8(output_activation_min_vector, vminq_s8(output_activation_max_vector, s));
+ vst1q_s8(output_data + i, clamped);
+ }
+#endif // NEON
+
+ for (; i < size; ++i)
+ {
+ const int32_t input1_val = params.input1_offset + input1_data[i];
+ const int32_t input2_val = params.input2_offset + input2_data[i];
+ const int32_t shifted_input1_val = input1_val * (1 << params.left_shift);
+ const int32_t shifted_input2_val = input2_val * (1 << params.left_shift);
+ const int32_t scaled_input1_val = MultiplyByQuantizedMultiplierSmallerThanOneExp(
+ shifted_input1_val, params.input1_multiplier, params.input1_shift);
+ const int32_t scaled_input2_val = MultiplyByQuantizedMultiplierSmallerThanOneExp(
+ shifted_input2_val, params.input2_multiplier, params.input2_shift);
+ const int32_t raw_sum = scaled_input1_val + scaled_input2_val;
+ const int32_t raw_output = MultiplyByQuantizedMultiplierSmallerThanOneExp(
+ raw_sum, params.output_multiplier, params.output_shift) +
+ params.output_offset;
+ const int32_t clamped_output = std::min(params.quantized_activation_max,
+ std::max(params.quantized_activation_min, raw_output));
+ output_data[i] = static_cast<int8_t>(clamped_output);
+ }
+}
+
+struct BinaryOpFuncAddFloat
+{
+#ifdef USE_NEON
+ static inline float32x4_t calculate(const float32x4_t &a, const float32x4_t &b)
+ {
+ return vaddq_f32(a, b);
+ }
+#endif // USE_NEON
+ static inline float calculate(const float a, const float b) { return a + b; }
+};
+
+struct BinaryOpFuncSubFloat
+{
+#ifdef USE_NEON
+ static inline float32x4_t calculate(const float32x4_t &a, const float32x4_t &b)
+ {
+ return vsubq_f32(a, b);
+ }
+#endif // USE_NEON
+ static inline float calculate(const float a, const float b) { return a - b; }
+};
+
+struct BinaryOpFuncMulFloat
+{
+#ifdef USE_NEON
+ static inline float32x4_t calculate(const float32x4_t &a, const float32x4_t &b)
+ {
+ return vmulq_f32(a, b);
+ }
+#endif // USE_NEON
+ static inline float calculate(const float a, const float b) { return a * b; }
+};
+
+struct BinaryOpFuncDivFloat
+{
+#ifdef USE_NEON
+#ifdef __aarch64__
+ static inline float32x4_t calculate(const float32x4_t &a, const float32x4_t &b)
+ {
+ return vdivq_f32(a, b);
+ }
+#endif // __aarch64__
+#endif // USE_NEON
+ static inline float calculate(const float a, const float b) { return a / b; }
+};
+
+template <class BASEOPERATOR> struct BinaryOpFuncSwapArgs
+{
+ template <typename T> static inline T calculate(const T &a, const T &b)
+ {
+ return BASEOPERATOR::calculate(b, a);
+ }
+};
+
+struct BinaryOpActivationFloatNone
+{
+#ifdef USE_NEON
+ static inline float32x4_t applyCeiling(const float32x4_t &value, const float32x4_t &ceilingParam)
+ {
+ (void)ceilingParam; // suppress unused argument warning
+ return value;
+ }
+ static inline float32x4_t applyFloor(const float32x4_t &value, const float32x4_t &floorParam)
+ {
+ (void)floorParam;
+ return value;
+ }
+#endif // USE_NEON
+ static inline float applyCeiling(const float value, const float ceilingParam)
+ {
+ (void)ceilingParam;
+ return value;
+ }
+ static inline float applyFloor(const float value, const float floorParam)
+ {
+ (void)floorParam;
+ return value;
+ }
+};
+
+struct BinaryOpActivationFloatMax
+{
+#ifdef USE_NEON
+ static inline float32x4_t applyCeiling(const float32x4_t &value, const float32x4_t &ceilingParam)
+ {
+ (void)ceilingParam; // suppress unused argument warning
+ return value;
+ }
+ static inline float32x4_t applyFloor(const float32x4_t &value, const float32x4_t &floorParam)
+ {
+ return vmaxq_f32(value, floorParam);
+ }
+#endif // USE_NEON
+ static inline float applyCeiling(const float value, const float ceilingParam)
+ {
+ (void)ceilingParam;
+ return value;
+ }
+ static inline float applyFloor(const float value, const float floorParam)
+ {
+ return std::max(value, floorParam);
+ }
+};
+
+struct BinaryOpActivationFloatMinMax
+{
+#ifdef USE_NEON
+ static inline float32x4_t applyCeiling(const float32x4_t &value, const float32x4_t &ceilingParam)
+ {
+ return vminq_f32(value, ceilingParam);
+ }
+ static inline float32x4_t applyFloor(const float32x4_t &value, const float32x4_t &floorParam)
+ {
+ return vmaxq_f32(value, floorParam);
+ }
+#endif // USE_NEON
+ static inline float applyCeiling(const float value, const float ceilingParam)
+ {
+ return std::min(value, ceilingParam);
+ }
+ static inline float applyFloor(const float value, const float floorParam)
+ {
+ return std::max(value, floorParam);
+ }
+};
+
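The split between an OPERATOR policy and an ACTIVATION policy lets one kernel body below serve every float binary op and every fused clamp mode. A scalar-only illustration of the composition, using nothing but the structs defined above:

// clamp(a - b) to [lo, hi]; the NEON paths below do the same thing per lane.
inline float sub_with_clamp(float a, float b, float lo, float hi)
{
  const float x = BinaryOpFuncSubFloat::calculate(a, b);
  return BinaryOpActivationFloatMinMax::applyCeiling(
    BinaryOpActivationFloatMinMax::applyFloor(x, lo), hi);
}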
+template <class OPERATOR, class ACTIVATION>
+inline void BinaryOpElementwise(int size, const BinaryArithmeticOpParam &params,
+ const float *input1_data, const float *input2_data,
+ float *output_data)
{
int i = 0;
@@ -237,18 +578,18 @@ inline void AddElementwise(int size, const BinaryArithmeticOpParam &params,
auto a21 = vld1q_f32(input2_data + i + 4);
auto a22 = vld1q_f32(input2_data + i + 8);
auto a23 = vld1q_f32(input2_data + i + 12);
- auto x0 = vaddq_f32(a10, a20);
- auto x1 = vaddq_f32(a11, a21);
- auto x2 = vaddq_f32(a12, a22);
- auto x3 = vaddq_f32(a13, a23);
- x0 = vmaxq_f32(activation_min, x0);
- x1 = vmaxq_f32(activation_min, x1);
- x2 = vmaxq_f32(activation_min, x2);
- x3 = vmaxq_f32(activation_min, x3);
- x0 = vminq_f32(activation_max, x0);
- x1 = vminq_f32(activation_max, x1);
- x2 = vminq_f32(activation_max, x2);
- x3 = vminq_f32(activation_max, x3);
+ auto x0 = OPERATOR::calculate(a10, a20);
+ auto x1 = OPERATOR::calculate(a11, a21);
+ auto x2 = OPERATOR::calculate(a12, a22);
+ auto x3 = OPERATOR::calculate(a13, a23);
+ x0 = ACTIVATION::applyFloor(x0, activation_min);
+ x1 = ACTIVATION::applyFloor(x1, activation_min);
+ x2 = ACTIVATION::applyFloor(x2, activation_min);
+ x3 = ACTIVATION::applyFloor(x3, activation_min);
+ x0 = ACTIVATION::applyCeiling(x0, activation_max);
+ x1 = ACTIVATION::applyCeiling(x1, activation_max);
+ x2 = ACTIVATION::applyCeiling(x2, activation_max);
+ x3 = ACTIVATION::applyCeiling(x3, activation_max);
vst1q_f32(output_data + i, x0);
vst1q_f32(output_data + i + 4, x1);
vst1q_f32(output_data + i + 8, x2);
@@ -258,26 +599,101 @@ inline void AddElementwise(int size, const BinaryArithmeticOpParam &params,
{
auto a1 = vld1q_f32(input1_data + i);
auto a2 = vld1q_f32(input2_data + i);
- auto x = vaddq_f32(a1, a2);
- x = vmaxq_f32(activation_min, x);
- x = vminq_f32(activation_max, x);
- vst1q_f32(output_data + i, x);
+      auto x = OPERATOR::calculate(a1, a2); // e.g. vaddq_f32 for Add
+ auto x_clamped =
+ ACTIVATION::applyCeiling(ACTIVATION::applyFloor(x, activation_min), activation_max);
+ vst1q_f32(output_data + i, x_clamped);
}
-#endif // NEON
+#endif // USE_NEON
+ for (; i < size; i++)
+ {
+ auto x = OPERATOR::calculate(input1_data[i], input2_data[i]);
+ output_data[i] = ACTIVATION::applyCeiling(
+ ACTIVATION::applyFloor(x, params.float_activation_min), params.float_activation_max);
+ }
+}
+
+// Broadcast binary op template that can often be used for the inner loop of a
+// more general broadcast op. It applies OPERATOR to a scalar LHS
+// (broadcast_value) and a vector RHS (input2_data). Since this is the float
+// path, only the float activation fields of params are used.
+template <class OPERATOR, class ACTIVATION>
+inline void BinaryOpScalarBroadcast(int size, const BinaryArithmeticOpParam &params,
+ const float broadcast_value, const float *input2_data,
+ float *output_data)
+{
+ int i = 0;
+
+#ifdef USE_NEON
+ const auto activation_min = vdupq_n_f32(params.float_activation_min);
+ const auto activation_max = vdupq_n_f32(params.float_activation_max);
+ const auto broadcast_value_dup = vdupq_n_f32(broadcast_value);
+ for (; i <= size - 16; i += 16)
+ {
+ auto a20 = vld1q_f32(input2_data + i);
+ auto a21 = vld1q_f32(input2_data + i + 4);
+ auto a22 = vld1q_f32(input2_data + i + 8);
+ auto a23 = vld1q_f32(input2_data + i + 12);
+ auto x0 = OPERATOR::calculate(broadcast_value_dup, a20);
+ auto x1 = OPERATOR::calculate(broadcast_value_dup, a21);
+ auto x2 = OPERATOR::calculate(broadcast_value_dup, a22);
+ auto x3 = OPERATOR::calculate(broadcast_value_dup, a23);
+ x0 = ACTIVATION::applyFloor(x0, activation_min);
+ x1 = ACTIVATION::applyFloor(x1, activation_min);
+ x2 = ACTIVATION::applyFloor(x2, activation_min);
+ x3 = ACTIVATION::applyFloor(x3, activation_min);
+ x0 = ACTIVATION::applyCeiling(x0, activation_max);
+ x1 = ACTIVATION::applyCeiling(x1, activation_max);
+ x2 = ACTIVATION::applyCeiling(x2, activation_max);
+ x3 = ACTIVATION::applyCeiling(x3, activation_max);
+ vst1q_f32(output_data + i, x0);
+ vst1q_f32(output_data + i + 4, x1);
+ vst1q_f32(output_data + i + 8, x2);
+ vst1q_f32(output_data + i + 12, x3);
+ }
+ for (; i <= size - 4; i += 4)
+ {
+ auto a2 = vld1q_f32(input2_data + i);
+ auto x = OPERATOR::calculate(broadcast_value_dup, a2);
+ auto x_clamped =
+ ACTIVATION::applyCeiling(ACTIVATION::applyFloor(x, activation_min), activation_max);
+ vst1q_f32(output_data + i, x_clamped);
+ }
+#endif // USE_NEON
for (; i < size; i++)
{
- auto x = input1_data[i] + input2_data[i];
- output_data[i] = ActivationFunctionWithMinMax<float>(x, params.float_activation_min,
- params.float_activation_max);
+ auto x = OPERATOR::calculate(broadcast_value, input2_data[i]);
+ output_data[i] = ACTIVATION::applyCeiling(
+ ACTIVATION::applyFloor(x, params.float_activation_min), params.float_activation_max);
}
}
-inline void AddQuant8(const BinaryArithmeticOpParam &params, const Shape &input1_shape,
- const uint8_t *input1_data, const Shape &input2_shape,
- const uint8_t *input2_data, const Shape &output_shape, uint8_t *output_data)
+using BinaryOpImplFloatFuncs =
+ std::pair<void (*)(int, const BinaryArithmeticOpParam &, const float *, const float *, float *),
+ void (*)(int, const BinaryArithmeticOpParam &, const float, const float *, float *)>;
+
+template <class FUNC>
+inline BinaryOpImplFloatFuncs
+getBinaryOpWithActivationImplFloat(const BinaryArithmeticOpParam &params)
+{
+ if (params.float_activation_max == std::numeric_limits<float>::max())
+ if (params.float_activation_min == std::numeric_limits<float>::lowest())
+ return BinaryOpImplFloatFuncs(BinaryOpElementwise<FUNC, BinaryOpActivationFloatNone>,
+ BinaryOpScalarBroadcast<FUNC, BinaryOpActivationFloatNone>);
+ else
+ return BinaryOpImplFloatFuncs(BinaryOpElementwise<FUNC, BinaryOpActivationFloatMax>,
+ BinaryOpScalarBroadcast<FUNC, BinaryOpActivationFloatMax>);
+ else
+ return BinaryOpImplFloatFuncs(BinaryOpElementwise<FUNC, BinaryOpActivationFloatMinMax>,
+ BinaryOpScalarBroadcast<FUNC, BinaryOpActivationFloatMinMax>);
+}
+
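One way to read the selector above: it inspects the activation range once and returns an (elementwise, scalar-broadcast) function pair whose clamping code is compiled away when the range is unbounded. An illustrative call site (not taken from this patch):

// Fused add-plus-activation over two same-shaped float buffers of length n.
inline void add_with_activation(const BinaryArithmeticOpParam &params, const float *a,
                                const float *b, float *out, int n)
{
  auto funcs = getBinaryOpWithActivationImplFloat<BinaryOpFuncAddFloat>(params);
  (*funcs.first)(n, params, a, b, out); // funcs.second is the scalar-broadcast variant
}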
+template <typename T>
+inline typename std::enable_if_t<is_quant8<T>::value>
+Add(const BinaryArithmeticOpParam &params, const Shape &input1_shape, const T *input1_data,
+ const Shape &input2_shape, const T *input2_data, const Shape &output_shape, T *output_data)
{
const int flat_size = MatchingElementsSize(input1_shape, input2_shape, output_shape);
- AddElementwiseQuant8(flat_size, params, input1_data, input2_data, output_data);
+ AddElementwise(flat_size, params, input1_data, input2_data, output_data);
}
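The templated entry point above serves both uint8_t and int8_t because overload resolution picks the matching AddElementwise defined earlier. A minimal usage sketch (the explicit template argument is only for clarity, and it assumes is_quant8 accepts int8_t, as the kernels above imply):

inline void add_q8_example(const BinaryArithmeticOpParam &params, const Shape &shape,
                           const int8_t *a, const int8_t *b, int8_t *out)
{
  Add<int8_t>(params, shape, a, shape, b, shape, out);
}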
inline void Add(const BinaryArithmeticOpParam &params, const Shape &input1_shape,
@@ -285,15 +701,16 @@ inline void Add(const BinaryArithmeticOpParam &params, const Shape &input1_shape
const Shape &output_shape, float *output_data)
{
const int flat_size = MatchingElementsSize(input1_shape, input2_shape, output_shape);
- AddElementwise(flat_size, params, input1_data, input2_data, output_data);
+ auto implFuncs = getBinaryOpWithActivationImplFloat<BinaryOpFuncAddFloat>(params);
+ (*implFuncs.first)(flat_size, params, input1_data, input2_data, output_data);
}
// Scalar-broadcast add that can be used for inner loop of more general
// broadcast add, so that, for example, scalar-broadcast with batch will still
// be fast.
-inline void AddScalarBroadcastQuant8(int size, const BinaryArithmeticOpParam &params,
- uint8_t broadcast_value, const uint8_t *input2_data,
- uint8_t *output_data)
+inline void AddScalarBroadcast(int size, const BinaryArithmeticOpParam &params,
+ uint8_t broadcast_value, const uint8_t *input2_data,
+ uint8_t *output_data)
{
int i = 0;
int32_t clamped_output;
@@ -304,58 +721,115 @@ inline void AddScalarBroadcastQuant8(int size, const BinaryArithmeticOpParam &pa
}
}
-inline void AddScalarBroadcast(int size, const BinaryArithmeticOpParam &params,
- float broadcast_value, const float *input2_data, float *output_data)
+// Scalar-broadcast add that can be used for inner loop of more general
+// broadcast add, so that, for example, scalar-broadcast with batch will still
+// be fast.
+inline void AddScalarBroadcast(int size, const BinaryArithmeticOpParam &params, int8_t input1_data,
+ const int8_t *input2_data, int8_t *output_data)
{
+ using gemmlowp::RoundingDivideByPOT;
int i = 0;
#ifdef USE_NEON
- const float32x4_t output_activation_min_vector = vdupq_n_f32(params.float_activation_min);
- const float32x4_t output_activation_max_vector = vdupq_n_f32(params.float_activation_max);
- const float32x4_t broadcast_value_dup = vdupq_n_f32(broadcast_value);
- for (; i <= size - 4; i += 4)
- {
- const float32x4_t input2_val_original = vld1q_f32(input2_data + i);
+ const int32x4_t left_shift_dup = vdupq_n_s32(params.left_shift);
+ const int8x8_t output_activation_min_vector = vdup_n_s8(params.quantized_activation_min);
+ const int8x8_t output_activation_max_vector = vdup_n_s8(params.quantized_activation_max);
- const float32x4_t output = vaddq_f32(input2_val_original, broadcast_value_dup);
+ // Process broadcast scalar.
+ const int8x8_t input1_val_original = vdup_n_s8(input1_data);
+ const int16x8_t input1_val_s16 = vmovl_s8(input1_val_original);
+ const int16x8_t input1_val = vaddq_s16(input1_val_s16, vdupq_n_s16(params.input1_offset));
+ const int16x4_t input1_val_high = vget_high_s16(input1_val);
+ const int16x4_t input1_val_low = vget_low_s16(input1_val);
+ int32x4_t x11 = vmovl_s16(input1_val_low);
+ int32x4_t x12 = vmovl_s16(input1_val_high);
+ x11 = vshlq_s32(x11, left_shift_dup);
+ x12 = vshlq_s32(x12, left_shift_dup);
+ x11 = vqrdmulhq_n_s32(x11, params.input1_multiplier);
+ x12 = vqrdmulhq_n_s32(x12, params.input1_multiplier);
+ const int32x4_t input1_shift_dup = vdupq_n_s32(params.input1_shift);
+ x11 = vshlq_s32(x11, input1_shift_dup);
+ x12 = vshlq_s32(x12, input1_shift_dup);
- const float32x4_t clamped =
- vmaxq_f32(output_activation_min_vector, vminq_f32(output_activation_max_vector, output));
- vst1q_f32(output_data + i, clamped);
+ for (; i <= size - 8; i += 8)
+ {
+ const int8x8_t input2_val_original = vld1_s8(input2_data + i);
+ const int16x8_t input2_val_s16 = vmovl_s8(input2_val_original);
+ const int16x8_t input2_val = vaddq_s16(input2_val_s16, vdupq_n_s16(params.input2_offset));
+ const int16x4_t input2_val_high = vget_high_s16(input2_val);
+ const int16x4_t input2_val_low = vget_low_s16(input2_val);
+ int32x4_t x21 = vmovl_s16(input2_val_low);
+ int32x4_t x22 = vmovl_s16(input2_val_high);
+ x21 = vshlq_s32(x21, left_shift_dup);
+ x22 = vshlq_s32(x22, left_shift_dup);
+ x21 = vqrdmulhq_n_s32(x21, params.input2_multiplier);
+ x22 = vqrdmulhq_n_s32(x22, params.input2_multiplier);
+ const int32x4_t input2_shift_dup = vdupq_n_s32(params.input2_shift);
+ x21 = vshlq_s32(x21, input2_shift_dup);
+ x22 = vshlq_s32(x22, input2_shift_dup);
+ int32x4_t s1 = vaddq_s32(x11, x21);
+ int32x4_t s2 = vaddq_s32(x12, x22);
+ s1 = vqrdmulhq_n_s32(s1, params.output_multiplier);
+ s2 = vqrdmulhq_n_s32(s2, params.output_multiplier);
+ s1 = RoundingDivideByPOT(s1, -params.output_shift);
+ s2 = RoundingDivideByPOT(s2, -params.output_shift);
+ const int16x4_t s1_narrowed = vmovn_s32(s1);
+ const int16x4_t s2_narrowed = vmovn_s32(s2);
+ const int16x8_t s =
+ vaddq_s16(vcombine_s16(s1_narrowed, s2_narrowed), vdupq_n_s16(params.output_offset));
+ const int8x8_t clamped =
+ vmax_s8(output_activation_min_vector, vmin_s8(output_activation_max_vector, vqmovn_s16(s)));
+ vst1_s8(output_data + i, clamped);
}
#endif // NEON
- for (; i < size; ++i)
+
+ if (i < size)
{
- auto x = broadcast_value + input2_data[i];
- output_data[i] = ActivationFunctionWithMinMax<float>(x, params.float_activation_min,
- params.float_activation_max);
+ // Process broadcast scalar.
+ const int32_t input1_val = params.input1_offset + input1_data;
+ const int32_t shifted_input1_val = input1_val * (1 << params.left_shift);
+ const int32_t scaled_input1_val = MultiplyByQuantizedMultiplierSmallerThanOneExp(
+ shifted_input1_val, params.input1_multiplier, params.input1_shift);
+
+ for (; i < size; ++i)
+ {
+ const int32_t input2_val = params.input2_offset + input2_data[i];
+ const int32_t shifted_input2_val = input2_val * (1 << params.left_shift);
+ const int32_t scaled_input2_val = MultiplyByQuantizedMultiplierSmallerThanOneExp(
+ shifted_input2_val, params.input2_multiplier, params.input2_shift);
+ const int32_t raw_sum = scaled_input1_val + scaled_input2_val;
+ const int32_t raw_output = MultiplyByQuantizedMultiplierSmallerThanOneExp(
+ raw_sum, params.output_multiplier, params.output_shift) +
+ params.output_offset;
+ const int32_t clamped_output = std::min(
+ params.quantized_activation_max, std::max(params.quantized_activation_min, raw_output));
+ output_data[i] = static_cast<int8_t>(clamped_output);
+ }
}
}
-inline void BroadcastAddDispatchQuant8(const BinaryArithmeticOpParam &params,
- const Shape &input1_shape, const uint8_t *input1_data,
- const Shape &input2_shape, const uint8_t *input2_data,
- const Shape &output_shape, uint8_t *output_data)
+template <typename T>
+inline typename std::enable_if_t<is_quant8<T>::value>
+BroadcastAddDispatch(const BinaryArithmeticOpParam &params, const Shape &input1_shape,
+ const T *input1_data, const Shape &input2_shape, const T *input2_data,
+ const Shape &output_shape, T *output_data)
{
if (params.broadcast_category == BroadcastableOpCategory::kGenericBroadcast)
{
- const std::function<uint8_t(const BinaryArithmeticOpParam &, const uint8_t &, const uint8_t &)>
- fn = [](const BinaryArithmeticOpParam &params, const uint8_t &a,
- const uint8_t &b) -> uint8_t {
- return static_cast<uint8_t>(quant8_sum(params, a, b));
- };
- reference::BroadcastBinaryArithmeticOpSlowQuant8(params, input1_shape, input1_data,
- input2_shape, input2_data, output_shape,
- output_data, fn);
- }
- else
- {
- BinaryBroadcastFiveFold(
- params, input1_shape, input1_data, input2_shape, input2_data, output_shape, output_data,
- static_cast<void (*)(int, const BinaryArithmeticOpParam &, const uint8_t *, const uint8_t *,
- uint8_t *)>(AddElementwiseQuant8),
- static_cast<void (*)(int, const BinaryArithmeticOpParam &, uint8_t, const uint8_t *,
- uint8_t *)>(AddScalarBroadcastQuant8));
+ const std::function<T(const BinaryArithmeticOpParam &, const T &, const T &)> fn =
+ [](const BinaryArithmeticOpParam &params, const T &a, const T &b) {
+ return static_cast<T>(quant8_sum(params, a, b));
+ };
+ reference::BroadcastBinaryArithmeticOpSlow(params, input1_shape, input1_data, input2_shape,
+ input2_data, output_shape, output_data, fn);
+ return;
}
+
+ BinaryBroadcastFiveFold(
+ params, input1_shape, input1_data, input2_shape, input2_data, output_shape, output_data,
+ static_cast<void (*)(int, const BinaryArithmeticOpParam &, const T *, const T *, T *)>(
+ AddElementwise),
+ static_cast<void (*)(int, const BinaryArithmeticOpParam &, T, const T *, T *)>(
+ AddScalarBroadcast));
}
inline void BroadcastAddDispatch(const BinaryArithmeticOpParam &params, const Shape &input1_shape,
@@ -366,18 +840,18 @@ inline void BroadcastAddDispatch(const BinaryArithmeticOpParam &params, const Sh
if (params.broadcast_category == BroadcastableOpCategory::kGenericBroadcast)
{
const std::function<float(const float &, const float &)> fn =
- [](const float &a, const float &b) -> float { return a + b; };
+ [](const float &a, const float &b) -> float { return a + b; };
reference::BroadcastBinaryArithmeticOpSlow(params, input1_shape, input1_data, input2_shape,
input2_data, output_shape, output_data, fn);
}
else
{
+ auto implFuncs = getBinaryOpWithActivationImplFloat<BinaryOpFuncAddFloat>(params);
+
BinaryBroadcastFiveFold(
- params, input1_shape, input1_data, input2_shape, input2_data, output_shape, output_data,
- static_cast<void (*)(int, const BinaryArithmeticOpParam &, const float *, const float *,
- float *)>(AddElementwise),
- static_cast<void (*)(int, const BinaryArithmeticOpParam &, float, const float *, float *)>(
- AddScalarBroadcast));
+ params, params.broadcast_category == BroadcastableOpCategory::kSecondInputBroadcastsFast,
+ input1_shape, input1_data, input2_shape, input2_data, output_shape, output_data,
+ implFuncs.first, implFuncs.second);
}
}
@@ -385,75 +859,57 @@ inline void Sub(const BinaryArithmeticOpParam &params, const Shape &input1_shape
const float *input1_data, const Shape &input2_shape, const float *input2_data,
const Shape &output_shape, float *output_data)
{
- int i = 0;
- const int size = MatchingElementsSize(input1_shape, input2_shape, output_shape);
-#ifdef USE_NEON
- const auto activation_min = vdupq_n_f32(params.float_activation_min);
- const auto activation_max = vdupq_n_f32(params.float_activation_max);
- for (; i <= size - 16; i += 16)
+ const int flat_size = MatchingElementsSize(input1_shape, input2_shape, output_shape);
+ auto implFuncs = getBinaryOpWithActivationImplFloat<BinaryOpFuncSubFloat>(params);
+ (*implFuncs.first)(flat_size, params, input1_data, input2_data, output_data);
+}
+
+inline void BroadcastSubDispatch(const BinaryArithmeticOpParam &params, const Shape &input1_shape,
+ const float *input1_data, const Shape &input2_shape,
+ const float *input2_data, const Shape &output_shape,
+ float *output_data)
+{
+ if (params.broadcast_category == BroadcastableOpCategory::kFirstInputBroadcastsFast)
{
- auto a10 = vld1q_f32(input1_data + i);
- auto a11 = vld1q_f32(input1_data + i + 4);
- auto a12 = vld1q_f32(input1_data + i + 8);
- auto a13 = vld1q_f32(input1_data + i + 12);
- auto a20 = vld1q_f32(input2_data + i);
- auto a21 = vld1q_f32(input2_data + i + 4);
- auto a22 = vld1q_f32(input2_data + i + 8);
- auto a23 = vld1q_f32(input2_data + i + 12);
- auto x0 = vsubq_f32(a10, a20);
- auto x1 = vsubq_f32(a11, a21);
- auto x2 = vsubq_f32(a12, a22);
- auto x3 = vsubq_f32(a13, a23);
- x0 = vmaxq_f32(activation_min, x0);
- x1 = vmaxq_f32(activation_min, x1);
- x2 = vmaxq_f32(activation_min, x2);
- x3 = vmaxq_f32(activation_min, x3);
- x0 = vminq_f32(activation_max, x0);
- x1 = vminq_f32(activation_max, x1);
- x2 = vminq_f32(activation_max, x2);
- x3 = vminq_f32(activation_max, x3);
- vst1q_f32(output_data + i, x0);
- vst1q_f32(output_data + i + 4, x1);
- vst1q_f32(output_data + i + 8, x2);
- vst1q_f32(output_data + i + 12, x3);
+ auto implFuncs = getBinaryOpWithActivationImplFloat<BinaryOpFuncSubFloat>(params);
+ BinaryBroadcastFiveFold(params, false, input1_shape, input1_data, input2_shape, input2_data,
+ output_shape, output_data, implFuncs.first, implFuncs.second);
}
- for (; i <= size - 4; i += 4)
+ else if (params.broadcast_category == BroadcastableOpCategory::kSecondInputBroadcastsFast)
{
- auto a1 = vld1q_f32(input1_data + i);
- auto a2 = vld1q_f32(input2_data + i);
- auto x = vsubq_f32(a1, a2);
- x = vmaxq_f32(activation_min, x);
- x = vminq_f32(activation_max, x);
- vst1q_f32(output_data + i, x);
+ auto implFuncs =
+ getBinaryOpWithActivationImplFloat<BinaryOpFuncSwapArgs<BinaryOpFuncSubFloat>>(params);
+ BinaryBroadcastFiveFold(params, true, input1_shape, input1_data, input2_shape, input2_data,
+ output_shape, output_data, implFuncs.first, implFuncs.second);
}
-#endif // NEON
-
- for (; i < size; i++)
+ else
{
- auto x = input1_data[i] - input2_data[i];
- output_data[i] =
- ActivationFunctionWithMinMax(x, params.float_activation_min, params.float_activation_max);
+ const std::function<float(const float &, const float &)> fn =
+ [](const float &a, const float &b) -> float { return a - b; };
+ reference::BroadcastBinaryArithmeticOpSlow(params, input1_shape, input1_data, input2_shape,
+ input2_data, output_shape, output_data, fn);
}
}
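BinaryOpFuncSwapArgs is what keeps the non-commutative ops correct on the kSecondInputBroadcastsFast path: BinaryBroadcastFiveFold walks the data with the two operands switched, so the wrapper swaps them back per element. A scalar check using only the types defined above:

// With switched operands (b, a), the wrapper still computes a - b.
inline float swapped_sub(float a, float b)
{
  return BinaryOpFuncSwapArgs<BinaryOpFuncSubFloat>::calculate(b, a); // == a - b
}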
-inline int32_t quant8_mul(const BinaryArithmeticOpParam &params, const uint8_t input1_data,
- const uint8_t input2_data)
+template <typename T>
+inline typename std::enable_if_t<is_quant8<T>::value, int32_t>
+quant8_mul(const BinaryArithmeticOpParam &params, const T input1_data, const T input2_data)
{
const int32_t input1_val = params.input1_offset + input1_data;
const int32_t input2_val = params.input2_offset + input2_data;
const int32_t unclamped_result =
- params.output_offset + MultiplyByQuantizedMultiplier(input1_val * input2_val,
- params.output_multiplier,
- params.output_shift);
+ params.output_offset + MultiplyByQuantizedMultiplier(input1_val * input2_val,
+ params.output_multiplier,
+ params.output_shift);
const int32_t clamped_output = std::min(
- params.quantized_activation_max, std::max(params.quantized_activation_min, unclamped_result));
+ params.quantized_activation_max, std::max(params.quantized_activation_min, unclamped_result));
return clamped_output;
}
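Unlike the add path, quant8_mul needs no left_shift or per-input multipliers, because a product has a single combined scale. Written out under the same negated-zero-point assumption as before (an assumption about how the caller fills the params, not something stated here):

// real1 * real2 = s1 * (q1 - z1) * s2 * (q2 - z2)
// q_out         = real1 * real2 / s_out + z_out
//               = (q1 + input1_offset) * (q2 + input2_offset)
//                   * (s1 * s2 / s_out)   // folded into output_multiplier / output_shift
//                 + output_offset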
-inline void MulElementwiseQuant8(int size, const BinaryArithmeticOpParam &params,
- const uint8_t *input1_data, const uint8_t *input2_data,
- uint8_t *output_data)
+inline void MulElementwise(int size, const BinaryArithmeticOpParam &params,
+ const uint8_t *input1_data, const uint8_t *input2_data,
+ uint8_t *output_data)
{
int i = 0;
@@ -495,8 +951,8 @@ inline void MulElementwiseQuant8(int size, const BinaryArithmeticOpParam &params
const auto p1_narrowed = vqmovn_s32(p1);
const auto p2_narrowed = vqmovn_s32(p2);
const auto p = vaddq_s16(vcombine_s16(p1_narrowed, p2_narrowed), output_offset_vector);
- const auto clamped = vmax_u8(output_activation_min_vector,
- vmin_u8(output_activation_max_vector, vqmovun_s16(p)));
+ const auto clamped =
+ vmax_u8(output_activation_min_vector, vmin_u8(output_activation_max_vector, vqmovun_s16(p)));
vst1_u8(output_data + i, clamped);
}
#endif // NEON
@@ -506,76 +962,111 @@ inline void MulElementwiseQuant8(int size, const BinaryArithmeticOpParam &params
const int32_t input1_val = params.input1_offset + input1_data[i];
const int32_t input2_val = params.input2_offset + input2_data[i];
const int32_t unclamped_result =
- params.output_offset + MultiplyByQuantizedMultiplier(input1_val * input2_val,
- params.output_multiplier,
- params.output_shift);
- const int32_t clamped_output =
- std::min(params.quantized_activation_max,
- std::max(params.quantized_activation_min, unclamped_result));
+ params.output_offset + MultiplyByQuantizedMultiplier(input1_val * input2_val,
+ params.output_multiplier,
+ params.output_shift);
+ const int32_t clamped_output = std::min(
+ params.quantized_activation_max, std::max(params.quantized_activation_min, unclamped_result));
output_data[i] = static_cast<uint8_t>(clamped_output);
}
}
inline void MulElementwise(int size, const BinaryArithmeticOpParam &params,
- const float *input1_data, const float *input2_data, float *output_data)
+ const int8_t *input1_data, const int8_t *input2_data,
+ int8_t *output_data)
{
int i = 0;
-
#ifdef USE_NEON
- const auto activation_min = vdupq_n_f32(params.float_activation_min);
- const auto activation_max = vdupq_n_f32(params.float_activation_max);
+ const int16x8_t input1_offset_vector = vdupq_n_s16(params.input1_offset);
+ const int16x8_t input2_offset_vector = vdupq_n_s16(params.input2_offset);
+ const int16x8_t output_offset_vector = vdupq_n_s16(params.output_offset);
+ const auto output_activation_min_vector = vdupq_n_s8(params.quantized_activation_min);
+ const auto output_activation_max_vector = vdupq_n_s8(params.quantized_activation_max);
+ const int left_shift = std::max(0, params.output_shift);
+ const int right_shift = std::max(0, -params.output_shift);
+ const int32x4_t left_shift_vec = vdupq_n_s32(left_shift);
for (; i <= size - 16; i += 16)
{
- auto a10 = vld1q_f32(input1_data + i);
- auto a11 = vld1q_f32(input1_data + i + 4);
- auto a12 = vld1q_f32(input1_data + i + 8);
- auto a13 = vld1q_f32(input1_data + i + 12);
- auto a20 = vld1q_f32(input2_data + i);
- auto a21 = vld1q_f32(input2_data + i + 4);
- auto a22 = vld1q_f32(input2_data + i + 8);
- auto a23 = vld1q_f32(input2_data + i + 12);
- auto x0 = vmulq_f32(a10, a20);
- auto x1 = vmulq_f32(a11, a21);
- auto x2 = vmulq_f32(a12, a22);
- auto x3 = vmulq_f32(a13, a23);
- x0 = vmaxq_f32(activation_min, x0);
- x1 = vmaxq_f32(activation_min, x1);
- x2 = vmaxq_f32(activation_min, x2);
- x3 = vmaxq_f32(activation_min, x3);
- x0 = vminq_f32(activation_max, x0);
- x1 = vminq_f32(activation_max, x1);
- x2 = vminq_f32(activation_max, x2);
- x3 = vminq_f32(activation_max, x3);
- vst1q_f32(output_data + i, x0);
- vst1q_f32(output_data + i + 4, x1);
- vst1q_f32(output_data + i + 8, x2);
- vst1q_f32(output_data + i + 12, x3);
- }
- for (; i <= size - 4; i += 4)
- {
- auto a1 = vld1q_f32(input1_data + i);
- auto a2 = vld1q_f32(input2_data + i);
- auto x = vmulq_f32(a1, a2);
- x = vmaxq_f32(activation_min, x);
- x = vminq_f32(activation_max, x);
- vst1q_f32(output_data + i, x);
+ // We load / store 16 at a time, multiplying as four sets of 4 int32s.
+ const int8x16_t input1_val_original = vld1q_s8(input1_data + i);
+ const int8x16_t input2_val_original = vld1q_s8(input2_data + i);
+
+ const int16x8_t input1_val_s16_high = vmovl_s8(vget_high_s8(input1_val_original));
+ const int16x8_t input1_val_s16_low = vmovl_s8(vget_low_s8(input1_val_original));
+
+ const int16x8_t input2_val_s16_high = vmovl_s8(vget_high_s8(input2_val_original));
+ const int16x8_t input2_val_s16_low = vmovl_s8(vget_low_s8(input2_val_original));
+ const int16x8_t input1_val_high = vaddq_s16(input1_val_s16_high, input1_offset_vector);
+ const int16x8_t input2_val_high = vaddq_s16(input2_val_s16_high, input2_offset_vector);
+ const int16x8_t input1_val_low = vaddq_s16(input1_val_s16_low, input1_offset_vector);
+ const int16x8_t input2_val_low = vaddq_s16(input2_val_s16_low, input2_offset_vector);
+ const int16x4_t input1_val_high_high = vget_high_s16(input1_val_high);
+ const int16x4_t input1_val_high_low = vget_low_s16(input1_val_high);
+ const int16x4_t input1_val_low_high = vget_high_s16(input1_val_low);
+ const int16x4_t input1_val_low_low = vget_low_s16(input1_val_low);
+ const int16x4_t input2_val_high_high = vget_high_s16(input2_val_high);
+ const int16x4_t input2_val_high_low = vget_low_s16(input2_val_high);
+ const int16x4_t input2_val_low_high = vget_high_s16(input2_val_low);
+ const int16x4_t input2_val_low_low = vget_low_s16(input2_val_low);
+
+ auto p1 = vmull_s16(input2_val_high_high, input1_val_high_high);
+ auto p2 = vmull_s16(input2_val_high_low, input1_val_high_low);
+ auto p3 = vmull_s16(input2_val_low_high, input1_val_low_high);
+ auto p4 = vmull_s16(input2_val_low_low, input1_val_low_low);
+
+ p1 = vshlq_s32(p1, left_shift_vec);
+ p2 = vshlq_s32(p2, left_shift_vec);
+ p3 = vshlq_s32(p3, left_shift_vec);
+ p4 = vshlq_s32(p4, left_shift_vec);
+
+ p1 = vqrdmulhq_n_s32(p1, params.output_multiplier);
+ p2 = vqrdmulhq_n_s32(p2, params.output_multiplier);
+ p3 = vqrdmulhq_n_s32(p3, params.output_multiplier);
+ p4 = vqrdmulhq_n_s32(p4, params.output_multiplier);
+ using gemmlowp::RoundingDivideByPOT;
+ p1 = RoundingDivideByPOT(p1, right_shift);
+ p2 = RoundingDivideByPOT(p2, right_shift);
+ p3 = RoundingDivideByPOT(p3, right_shift);
+ p4 = RoundingDivideByPOT(p4, right_shift);
+
+ const auto p1_narrowed = vqmovn_s32(p1);
+ const auto p2_narrowed = vqmovn_s32(p2);
+ const auto p3_narrowed = vqmovn_s32(p3);
+ const auto p4_narrowed = vqmovn_s32(p4);
+
+ const int16x8_t p_part1 =
+ vaddq_s16(vcombine_s16(p2_narrowed, p1_narrowed), output_offset_vector);
+ const int16x8_t p_part2 =
+ vaddq_s16(vcombine_s16(p4_narrowed, p3_narrowed), output_offset_vector);
+ const int8x16_t p = vcombine_s8(vqmovn_s16(p_part2), vqmovn_s16(p_part1));
+
+ const auto clamped =
+ vmaxq_s8(output_activation_min_vector, vminq_s8(output_activation_max_vector, p));
+ vst1q_s8(output_data + i, clamped);
}
#endif // NEON
- for (; i < size; i++)
+ for (; i < size; ++i)
{
- auto x = input1_data[i] * input2_data[i];
- output_data[i] =
- ActivationFunctionWithMinMax(x, params.float_activation_min, params.float_activation_max);
+ const int32_t input1_val = params.input1_offset + input1_data[i];
+ const int32_t input2_val = params.input2_offset + input2_data[i];
+ const int32_t unclamped_result =
+ params.output_offset + MultiplyByQuantizedMultiplier(input1_val * input2_val,
+ params.output_multiplier,
+ params.output_shift);
+ const int32_t clamped_output = std::min(
+ params.quantized_activation_max, std::max(params.quantized_activation_min, unclamped_result));
+ output_data[i] = static_cast<int8_t>(clamped_output);
}
}
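The left_shift / right_shift split at the top of the NEON block mirrors how the scalar helper treats a signed output_shift; roughly (a hedged paraphrase of the cker fixed-point utility, not a new definition):

// MultiplyByQuantizedMultiplier(x, m, shift)
//   ~= RoundingDivideByPOT(SaturatingRoundingDoublingHighMul(x << max(shift, 0), m),
//                          max(-shift, 0))
// The vector path does the same per lane: vshlq_s32 by left_shift_vec,
// vqrdmulhq_n_s32 by the multiplier, then RoundingDivideByPOT by right_shift.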
-inline void MulQuant8(const BinaryArithmeticOpParam &params, const Shape &input1_shape,
- const uint8_t *input1_data, const Shape &input2_shape,
- const uint8_t *input2_data, const Shape &output_shape, uint8_t *output_data)
+template <typename T>
+inline typename std::enable_if_t<is_quant8<T>::value>
+Mul(const BinaryArithmeticOpParam &params, const Shape &input1_shape, const T *input1_data,
+ const Shape &input2_shape, const T *input2_data, const Shape &output_shape, T *output_data)
{
const int flat_size = MatchingElementsSize(input1_shape, input2_shape, output_shape);
- MulElementwiseQuant8(flat_size, params, input1_data, input2_data, output_data);
+ MulElementwise(flat_size, params, input1_data, input2_data, output_data);
}
inline void Mul(const BinaryArithmeticOpParam &params, const Shape &input1_shape,
@@ -583,12 +1074,13 @@ inline void Mul(const BinaryArithmeticOpParam &params, const Shape &input1_shape
const Shape &output_shape, float *output_data)
{
const int flat_size = MatchingElementsSize(input1_shape, input2_shape, output_shape);
- MulElementwise(flat_size, params, input1_data, input2_data, output_data);
+ auto implFuncs = getBinaryOpWithActivationImplFloat<BinaryOpFuncMulFloat>(params);
+ (*implFuncs.first)(flat_size, params, input1_data, input2_data, output_data);
}
-inline void MulSimpleBroadcastQuant8(int size, const BinaryArithmeticOpParam &params,
- const uint8_t broadcast_value, const uint8_t *input2_data,
- uint8_t *output_data)
+inline void MulSimpleBroadcast(int size, const BinaryArithmeticOpParam &params,
+ const uint8_t broadcast_value, const uint8_t *input2_data,
+ uint8_t *output_data)
{
int i = 0;
int32_t clamped_output;
@@ -600,60 +1092,108 @@ inline void MulSimpleBroadcastQuant8(int size, const BinaryArithmeticOpParam &pa
}
// Broadcast mul that can often be used for inner loop of broadcast Mul.
-// This function will handle scalar_value (LHS) * vector_values (RHS).
-// Since it's a float function, input params does not matter here.
inline void MulSimpleBroadcast(int size, const BinaryArithmeticOpParam &params,
- const float broadcast_value, const float *input2_data,
- float *output_data)
+ const int8_t broadcast_value, const int8_t *input2_data,
+ int8_t *output_data)
{
+ const int16_t input1_val = params.input1_offset + broadcast_value;
+
int i = 0;
#ifdef USE_NEON
- const float32x4_t output_activation_min_vector = vdupq_n_f32(params.float_activation_min);
- const float32x4_t output_activation_max_vector = vdupq_n_f32(params.float_activation_max);
- const float32x4_t broadcast_value_dup = vdupq_n_f32(broadcast_value);
- for (; i <= size - 4; i += 4)
+ const auto input2_offset_vector = vdupq_n_s16(params.input2_offset);
+ const auto output_offset_vector = vdupq_n_s16(params.output_offset);
+ const auto output_activation_min_vector = vdupq_n_s8(params.quantized_activation_min);
+ const auto output_activation_max_vector = vdupq_n_s8(params.quantized_activation_max);
+ const int left_shift = std::max(0, params.output_shift);
+ const int right_shift = std::max(0, -params.output_shift);
+ const int32x4_t left_shift_vec = vdupq_n_s32(left_shift);
+ for (; i <= size - 16; i += 16)
{
- const float32x4_t input2_val_original = vld1q_f32(input2_data + i);
+ // We load / store 16 at a time, multiplying as four sets of 4 int32s.
+ const auto input2_val_original = vld1q_s8(input2_data + i);
+ const auto input2_val_s16_high = vmovl_s8(vget_high_s8(input2_val_original));
+ const auto input2_val_s16_low = vmovl_s8(vget_low_s8(input2_val_original));
+
+ const auto input2_val_high = vaddq_s16(input2_val_s16_high, input2_offset_vector);
+ const auto input2_val_low = vaddq_s16(input2_val_s16_low, input2_offset_vector);
+
+ const auto input2_val_low_low = vget_low_s16(input2_val_low);
+ const auto input2_val_low_high = vget_high_s16(input2_val_low);
+ const auto input2_val_high_low = vget_low_s16(input2_val_high);
+ const auto input2_val_high_high = vget_high_s16(input2_val_high);
+
+ auto p1 = vmull_n_s16(input2_val_high_high, input1_val);
+ auto p2 = vmull_n_s16(input2_val_high_low, input1_val);
+ auto p3 = vmull_n_s16(input2_val_low_high, input1_val);
+ auto p4 = vmull_n_s16(input2_val_low_low, input1_val);
+
+ p1 = vshlq_s32(p1, left_shift_vec);
+ p2 = vshlq_s32(p2, left_shift_vec);
+ p3 = vshlq_s32(p3, left_shift_vec);
+ p4 = vshlq_s32(p4, left_shift_vec);
+
+ p1 = vqrdmulhq_n_s32(p1, params.output_multiplier);
+ p2 = vqrdmulhq_n_s32(p2, params.output_multiplier);
+ p3 = vqrdmulhq_n_s32(p3, params.output_multiplier);
+ p4 = vqrdmulhq_n_s32(p4, params.output_multiplier);
+ using gemmlowp::RoundingDivideByPOT;
+ p1 = RoundingDivideByPOT(p1, right_shift);
+ p2 = RoundingDivideByPOT(p2, right_shift);
+ p3 = RoundingDivideByPOT(p3, right_shift);
+ p4 = RoundingDivideByPOT(p4, right_shift);
- const float32x4_t output = vmulq_f32(input2_val_original, broadcast_value_dup);
+ const auto p1_narrowed = vqmovn_s32(p1);
+ const auto p2_narrowed = vqmovn_s32(p2);
+ const auto p3_narrowed = vqmovn_s32(p3);
+ const auto p4_narrowed = vqmovn_s32(p4);
- const float32x4_t clamped =
- vmaxq_f32(output_activation_min_vector, vminq_f32(output_activation_max_vector, output));
- vst1q_f32(output_data + i, clamped);
+ const int16x8_t p_part1 =
+ vaddq_s16(vcombine_s16(p2_narrowed, p1_narrowed), output_offset_vector);
+ const int16x8_t p_part2 =
+ vaddq_s16(vcombine_s16(p4_narrowed, p3_narrowed), output_offset_vector);
+ const int8x16_t p = vcombine_s8(vqmovn_s16(p_part2), vqmovn_s16(p_part1));
+
+ const auto clamped =
+ vmaxq_s8(output_activation_min_vector, vminq_s8(output_activation_max_vector, p));
+ vst1q_s8(output_data + i, clamped);
}
#endif // NEON
for (; i < size; ++i)
{
- float x = broadcast_value * input2_data[i];
- output_data[i] =
- ActivationFunctionWithMinMax(x, params.float_activation_min, params.float_activation_max);
+ const int32_t input2_val = params.input2_offset + input2_data[i];
+ const int32_t unclamped_result =
+ params.output_offset + MultiplyByQuantizedMultiplier(input1_val * input2_val,
+ params.output_multiplier,
+ params.output_shift);
+ const int32_t clamped_output = std::min(
+ params.quantized_activation_max, std::max(params.quantized_activation_min, unclamped_result));
+ output_data[i] = static_cast<int8_t>(clamped_output);
}
}
-inline void BroadcastMulDispatchQuant8(const BinaryArithmeticOpParam &params,
- const Shape &input1_shape, const uint8_t *input1_data,
- const Shape &input2_shape, const uint8_t *input2_data,
- const Shape &output_shape, uint8_t *output_data)
+template <typename T>
+inline typename std::enable_if_t<is_quant8<T>::value>
+BroadcastMulDispatch(const BinaryArithmeticOpParam &params, const Shape &input1_shape,
+ const T *input1_data, const Shape &input2_shape, const T *input2_data,
+ const Shape &output_shape, T *output_data)
{
if (params.broadcast_category == BroadcastableOpCategory::kGenericBroadcast)
{
- const std::function<uint8_t(const BinaryArithmeticOpParam &, const uint8_t &, const uint8_t &)>
- fn = [](const BinaryArithmeticOpParam &params, const uint8_t &a,
- const uint8_t &b) -> uint8_t {
- return static_cast<uint8_t>(quant8_mul(params, a, b));
- };
- reference::BroadcastBinaryArithmeticOpSlowQuant8(params, input1_shape, input1_data,
- input2_shape, input2_data, output_shape,
- output_data, fn);
+ const std::function<T(const BinaryArithmeticOpParam &, const T &, const T &)> fn =
+ [](const BinaryArithmeticOpParam &params, const T &a, const T &b) {
+ return static_cast<T>(quant8_mul(params, a, b));
+ };
+ reference::BroadcastBinaryArithmeticOpSlow(params, input1_shape, input1_data, input2_shape,
+ input2_data, output_shape, output_data, fn);
return;
}
BinaryBroadcastFiveFold(
- params, input1_shape, input1_data, input2_shape, input2_data, output_shape, output_data,
- static_cast<void (*)(int, const BinaryArithmeticOpParam &, const uint8_t *, const uint8_t *,
- uint8_t *)>(MulElementwiseQuant8),
- static_cast<void (*)(int, const BinaryArithmeticOpParam &, uint8_t, const uint8_t *,
- uint8_t *)>(MulSimpleBroadcastQuant8));
+ params, input1_shape, input1_data, input2_shape, input2_data, output_shape, output_data,
+ static_cast<void (*)(int, const BinaryArithmeticOpParam &, const T *, const T *, T *)>(
+ MulElementwise),
+ static_cast<void (*)(int, const BinaryArithmeticOpParam &, T, const T *, T *)>(
+ MulSimpleBroadcast));
}
inline void BroadcastMulDispatch(const BinaryArithmeticOpParam &params, const Shape &input1_shape,
@@ -665,17 +1205,59 @@ inline void BroadcastMulDispatch(const BinaryArithmeticOpParam &params, const Sh
{
// TODO: Use GetBinaryArithmeticFn
const std::function<float(const float &, const float &)> fn =
- [](const float &a, const float &b) -> float { return a * b; };
+ [](const float &a, const float &b) -> float { return a * b; };
reference::BroadcastBinaryArithmeticOpSlow(params, input1_shape, input1_data, input2_shape,
input2_data, output_shape, output_data, fn);
return;
}
- BinaryBroadcastFiveFold(
- params, input1_shape, input1_data, input2_shape, input2_data, output_shape, output_data,
- static_cast<void (*)(int, const BinaryArithmeticOpParam &, const float *, const float *,
- float *)>(MulElementwise),
- static_cast<void (*)(int, const BinaryArithmeticOpParam &, float, const float *, float *)>(
- MulSimpleBroadcast));
+ auto implFuncs = getBinaryOpWithActivationImplFloat<BinaryOpFuncMulFloat>(params);
+ BinaryBroadcastFiveFold(params, input1_shape, input1_data, input2_shape, input2_data,
+ output_shape, output_data, implFuncs.first, implFuncs.second);
+}
+
+inline void Div(const BinaryArithmeticOpParam &params, const Shape &input1_shape,
+ const float *input1_data, const Shape &input2_shape, const float *input2_data,
+ const Shape &output_shape, float *output_data)
+{
+#ifdef __aarch64__
+ const int flat_size = MatchingElementsSize(input1_shape, input2_shape, output_shape);
+ auto implFuncs = getBinaryOpWithActivationImplFloat<BinaryOpFuncDivFloat>(params);
+ (*implFuncs.first)(flat_size, params, input1_data, input2_data, output_data);
+#else
+ const std::function<float(const float &, const float &)> fn =
+ [](const float &a, const float &b) -> float { return a / b; };
+ reference::BinaryArithmeticOp(params, input1_shape, input1_data, input2_shape, input2_data,
+ output_shape, output_data, fn);
+#endif // __aarch64__
+}
+
+inline void BroadcastDivDispatch(const BinaryArithmeticOpParam &params, const Shape &input1_shape,
+ const float *input1_data, const Shape &input2_shape,
+ const float *input2_data, const Shape &output_shape,
+ float *output_data)
+{
+#ifdef __aarch64__
+ if (params.broadcast_category == BroadcastableOpCategory::kFirstInputBroadcastsFast)
+ {
+ auto implFuncs = getBinaryOpWithActivationImplFloat<BinaryOpFuncDivFloat>(params);
+ BinaryBroadcastFiveFold(params, false, input1_shape, input1_data, input2_shape, input2_data,
+ output_shape, output_data, implFuncs.first, implFuncs.second);
+ }
+ else if (params.broadcast_category == BroadcastableOpCategory::kSecondInputBroadcastsFast)
+ {
+ auto implFuncs =
+ getBinaryOpWithActivationImplFloat<BinaryOpFuncSwapArgs<BinaryOpFuncDivFloat>>(params);
+ BinaryBroadcastFiveFold(params, true, input1_shape, input1_data, input2_shape, input2_data,
+ output_shape, output_data, implFuncs.first, implFuncs.second);
+ }
+ else
+#endif // __aarch64__
+ {
+ const std::function<float(const float &, const float &)> fn =
+ [](const float &a, const float &b) -> float { return a / b; };
+ reference::BroadcastBinaryArithmeticOpSlow(params, input1_shape, input1_data, input2_shape,
+ input2_data, output_shape, output_data, fn);
+ }
}
} // namespace optimized
diff --git a/compute/cker/include/cker/operation/optimized/Conv.h b/compute/cker/include/cker/operation/optimized/Conv.h
index 0f620146c..6e0e129c6 100644
--- a/compute/cker/include/cker/operation/optimized/Conv.h
+++ b/compute/cker/include/cker/operation/optimized/Conv.h
@@ -42,13 +42,15 @@ namespace cker
namespace optimized
{
+std::mutex _gemmlowp_mutex;
+
struct GemmlowpOutputPipeline
{
typedef gemmlowp::VectorMap<const int32_t, gemmlowp::VectorShape::Col> ColVectorMap;
typedef std::tuple<gemmlowp::OutputStageBiasAddition<ColVectorMap>,
gemmlowp::OutputStageScaleInt32ByFixedPointAndExponent,
gemmlowp::OutputStageClamp, gemmlowp::OutputStageSaturatingCastToUint8>
- Pipeline;
+ Pipeline;
static Pipeline MakeExp(const int32_t *bias_data, int output_rows, int32_t output_offset,
int32_t output_multiplier, int output_left_shift,
int32_t output_activation_min, int32_t output_activation_max)
@@ -106,7 +108,7 @@ inline void Conv(const ConvParams &params, const Shape &input_shape, const uint8
const int filter_height = filter_shape.Dims(1);
const bool need_dilated_im2col = dilation_width_factor != 1 || dilation_height_factor != 1;
const bool need_im2col =
- stride_width != 1 || stride_height != 1 || filter_width != 1 || filter_height != 1;
+ stride_width != 1 || stride_height != 1 || filter_width != 1 || filter_height != 1;
if (need_dilated_im2col)
{
assert(im2col_data);
@@ -141,7 +143,7 @@ inline void Conv(const ConvParams &params, const Shape &input_shape, const uint8
// the other calls commented out. This is a partial rollback of cl/196819423.
// const int gemm_input_cols = FlatSizeSkipDim(*gemm_input_shape, 3);
const int gemm_input_cols =
- gemm_input_shape->Dims(0) * gemm_input_shape->Dims(1) * gemm_input_shape->Dims(2);
+ gemm_input_shape->Dims(0) * gemm_input_shape->Dims(1) * gemm_input_shape->Dims(2);
const int filter_rows = filter_shape.Dims(0);
// See b/79927784.
// const int filter_cols = FlatSizeSkipDim(filter_shape, 0);
@@ -156,17 +158,19 @@ inline void Conv(const ConvParams &params, const Shape &input_shape, const uint8
assert(bias_shape.FlatSize() == output_rows);
UNUSED_RELEASE(bias_shape);
gemmlowp::MatrixMap<const uint8_t, gemmlowp::MapOrder::RowMajor> filter_matrix(
- filter_data, filter_rows, filter_cols);
+ filter_data, filter_rows, filter_cols);
gemmlowp::MatrixMap<const uint8_t, gemmlowp::MapOrder::ColMajor> input_matrix(
- gemm_input_data, gemm_input_rows, gemm_input_cols);
+ gemm_input_data, gemm_input_rows, gemm_input_cols);
gemmlowp::MatrixMap<uint8_t, gemmlowp::MapOrder::ColMajor> output_matrix(output_data, output_rows,
output_cols);
const auto &output_pipeline =
- GemmlowpOutputPipeline::MakeExp(bias_data, output_rows, output_offset, output_multiplier,
- output_shift, output_activation_min, output_activation_max);
+ GemmlowpOutputPipeline::MakeExp(bias_data, output_rows, output_offset, output_multiplier,
+ output_shift, output_activation_min, output_activation_max);
+
+ std::lock_guard<std::mutex> lock_guard(_gemmlowp_mutex);
gemmlowp::GemmWithOutputPipeline<uint8_t, uint8_t, gemmlowp::L8R8WithLhsNonzeroBitDepthParams>(
- gemm_context, filter_matrix, input_matrix, &output_matrix, filter_offset, input_offset,
- output_pipeline);
+ gemm_context, filter_matrix, input_matrix, &output_matrix, filter_offset, input_offset,
+ output_pipeline);
}
} // namespace optimized
@@ -202,10 +206,10 @@ public:
T *output_data, int output_height, int output_width)
{
const bool is_1x1_kernel =
- (filter_height == 1 && filter_width == 1 && stride_rows == 1 && stride_cols == 1);
+ (filter_height == 1 && filter_width == 1 && stride_rows == 1 && stride_cols == 1);
const bool is_same_height_width =
- (filter_height == input_height && filter_width == input_width && pad_width == 0 &&
- pad_height == 0);
+ (filter_height == input_height && filter_width == input_width && pad_width == 0 &&
+ pad_height == 0);
if (is_1x1_kernel || is_same_height_width)
{
// is_1x1_kernel: For 1x1 kernel, the 2D convolution is reduced to matrix multiplication.
diff --git a/compute/cker/include/cker/operation/optimized/DepthwiseConvFloat.h b/compute/cker/include/cker/operation/optimized/DepthwiseConvFloat.h
new file mode 100644
index 000000000..17b2fc7a2
--- /dev/null
+++ b/compute/cker/include/cker/operation/optimized/DepthwiseConvFloat.h
@@ -0,0 +1,1250 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_OPTIMIZED_DEPTHWISE_CONV_FLOAT_H__
+#define __NNFW_CKER_OPTIMIZED_DEPTHWISE_CONV_FLOAT_H__
+
+#include "cker/Shape.h"
+#include "cker/Types.h"
+#include "cker/Utils.h"
+#include "cker/neon/neon_check.h"
+
+namespace nnfw
+{
+namespace cker
+{
+namespace optimized
+{
+
+// Implementation of float DepthwiseConv
+
+template <bool kAllowStrided, int kFixedInputDepth, int kFixedDepthMultiplier>
+struct FloatDepthwiseConvKernel
+{
+};
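+// The template parameters encode compile-time knowledge used to pick a
+// specialization: kAllowStrided says whether the kernel must support
+// stride != 1, kFixedInputDepth pins the input depth at compile time
+// (0 means "any input depth"), and kFixedDepthMultiplier pins the depth
+// multiplier. The dispatch macro in DepthwiseConvImpl below picks the first
+// specialization whose parameters match the runtime shape.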
+
+#ifdef USE_NEON
+
+template <> struct FloatDepthwiseConvKernel<false, 8, 1>
+{
+ static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
+ const float *input_ptr, int input_ptr_increment, const float *filter_ptr,
+ float *acc_buffer_ptr)
+ {
+ (void)input_depth;
+ (void)depth_multiplier;
+ (void)input_ptr_increment;
+ // Load the filters
+ float32x4_t filter[2];
+ for (int i = 0; i < 2; i++)
+ {
+ filter[i] = vld1q_f32(filter_ptr + 4 * i);
+ }
+ int outp = 0;
+ // Handle 2 output pixels at a time.
+ for (; outp <= num_output_pixels - 2; outp += 2)
+ {
+ // Load the inputs
+ float32x4_t input[4];
+ for (int i = 0; i < 4; i++)
+ {
+ input[i] = vld1q_f32(input_ptr + 4 * i);
+ }
+ input_ptr += 16;
+ // Load the accumulators from acc_buffer
+ float32x4_t acc[4];
+ for (int i = 0; i < 4; i++)
+ {
+ acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i);
+ }
+ // Multiply-accumulate
+ acc[0] = vmlaq_f32(acc[0], input[0], filter[0]);
+ acc[1] = vmlaq_f32(acc[1], input[1], filter[1]);
+ acc[2] = vmlaq_f32(acc[2], input[2], filter[0]);
+ acc[3] = vmlaq_f32(acc[3], input[3], filter[1]);
+ // Store the accumulators back to acc_buffer
+ for (int i = 0; i < 4; i++)
+ {
+ vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]);
+ }
+ acc_buffer_ptr += 16;
+ }
+ // Handle one output pixel at a time.
+ for (; outp < num_output_pixels; outp++)
+ {
+ // Load the inputs
+ float32x4_t input[2];
+ for (int i = 0; i < 2; i++)
+ {
+ input[i] = vld1q_f32(input_ptr + 4 * i);
+ }
+ input_ptr += 8;
+ // Load the accumulators from acc_buffer
+ float32x4_t acc[2];
+ for (int i = 0; i < 2; i++)
+ {
+ acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i);
+ }
+ // Multiply-accumulate
+ for (int i = 0; i < 2; i++)
+ {
+ acc[i] = vmlaq_f32(acc[i], input[i], filter[i]);
+ }
+ // Store the accumulators back to acc_buffer
+ for (int i = 0; i < 2; i++)
+ {
+ vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]);
+ }
+ acc_buffer_ptr += 8;
+ }
+ }
+};
+
+template <> struct FloatDepthwiseConvKernel<false, 2, 1>
+{
+ static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
+ const float *input_ptr, int input_ptr_increment, const float *filter_ptr,
+ float *acc_buffer_ptr)
+ {
+ (void)input_depth;
+ (void)depth_multiplier;
+ (void)input_ptr_increment;
+
+ const float32x2_t filters = vld1_f32(filter_ptr);
+ const float32x4_t filters_dup2 = vcombine_f32(filters, filters);
+ int outp = 0;
+ // Handle 8 output pixels at a time.
+ for (; outp <= num_output_pixels - 8; outp += 8)
+ {
+ // Load the inputs
+ float32x4_t input[4];
+ for (int i = 0; i < 4; i++)
+ {
+ input[i] = vld1q_f32(input_ptr + 4 * i);
+ }
+ input_ptr += 16;
+ // Load the accumulators from acc_buffer
+ float32x4_t acc[4];
+ for (int i = 0; i < 4; i++)
+ {
+ acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i);
+ }
+ // Multiply-accumulate
+ for (int i = 0; i < 4; i++)
+ {
+ acc[i] = vmlaq_f32(acc[i], input[i], filters_dup2);
+ }
+ // Store the accumulators back to acc_buffer
+ for (int i = 0; i < 4; i++)
+ {
+ vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]);
+ }
+ acc_buffer_ptr += 16;
+ }
+ // Handle 4 output pixels at a time.
+ for (; outp <= num_output_pixels - 4; outp += 4)
+ {
+ // Load the inputs
+ float32x4_t input[2];
+ for (int i = 0; i < 2; i++)
+ {
+ input[i] = vld1q_f32(input_ptr + 4 * i);
+ }
+ input_ptr += 8;
+ // Load the accumulators from acc_buffer
+ float32x4_t acc[2];
+ for (int i = 0; i < 2; i++)
+ {
+ acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i);
+ }
+ // Multiply-accumulate
+ for (int i = 0; i < 2; i++)
+ {
+ acc[i] = vmlaq_f32(acc[i], input[i], filters_dup2);
+ }
+ // Store the accumulators back to acc_buffer
+ for (int i = 0; i < 2; i++)
+ {
+ vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]);
+ }
+ acc_buffer_ptr += 8;
+ }
+ // Handle 2 output pixels at a time.
+ for (; outp <= num_output_pixels - 2; outp += 2)
+ {
+ // Load the inputs
+ const float32x4_t input = vld1q_f32(input_ptr);
+ input_ptr += 4;
+ // Load the accumulators from acc_buffer
+ float32x4_t acc = vld1q_f32(acc_buffer_ptr);
+ // Multiply-accumulate
+ acc = vmlaq_f32(acc, input, filters_dup2);
+ // Store the accumulators back to acc_buffer
+ vst1q_f32(acc_buffer_ptr, acc);
+ acc_buffer_ptr += 4;
+ }
+ // Handle 1 output pixel at a time
+ for (; outp < num_output_pixels; outp++)
+ {
+ // Load the inputs
+ const float32x2_t input = vld1_f32(input_ptr);
+ input_ptr += 2;
+ // Load the accumulators from acc_buffer
+ float32x2_t acc = vld1_f32(acc_buffer_ptr);
+ // Multiply-accumulate
+ acc = vmla_f32(acc, input, filters);
+ // Store the accumulators back to acc_buffer
+ vst1_f32(acc_buffer_ptr, acc);
+ acc_buffer_ptr += 2;
+ }
+ }
+};
+
+template <> struct FloatDepthwiseConvKernel<true, 0, 1>
+{
+ static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
+ const float *input_ptr, int input_ptr_increment, const float *filter_ptr,
+ float *acc_buffer_ptr)
+ {
+ (void)depth_multiplier;
+
+ // Handle one output pixel at a time.
+ for (int outp = 0; outp < num_output_pixels; outp++)
+ {
+ const float *local_filter_ptr = filter_ptr;
+ const float *local_input_ptr = input_ptr;
+ int ic = 0;
+ // Handle 16 input channels at a time.
+ for (; ic <= input_depth - 16; ic += 16)
+ {
+ // Load the filters
+ float32x4_t filter_0 = vld1q_f32(local_filter_ptr + 4 * 0);
+ float32x4_t filter_1 = vld1q_f32(local_filter_ptr + 4 * 1);
+ float32x4_t filter_2 = vld1q_f32(local_filter_ptr + 4 * 2);
+ float32x4_t filter_3 = vld1q_f32(local_filter_ptr + 4 * 3);
+ local_filter_ptr += 16;
+ // Load the inputs
+ float32x4_t input_0 = vld1q_f32(local_input_ptr + 4 * 0);
+ float32x4_t input_1 = vld1q_f32(local_input_ptr + 4 * 1);
+ float32x4_t input_2 = vld1q_f32(local_input_ptr + 4 * 2);
+ float32x4_t input_3 = vld1q_f32(local_input_ptr + 4 * 3);
+ local_input_ptr += 16;
+ // Load the accumulators from acc_buffer
+ float32x4_t acc_0 = vld1q_f32(acc_buffer_ptr + 4 * 0);
+ float32x4_t acc_1 = vld1q_f32(acc_buffer_ptr + 4 * 1);
+ float32x4_t acc_2 = vld1q_f32(acc_buffer_ptr + 4 * 2);
+ float32x4_t acc_3 = vld1q_f32(acc_buffer_ptr + 4 * 3);
+ // Multiply-accumulate
+ acc_0 = vmlaq_f32(acc_0, input_0, filter_0);
+ acc_1 = vmlaq_f32(acc_1, input_1, filter_1);
+ acc_2 = vmlaq_f32(acc_2, input_2, filter_2);
+ acc_3 = vmlaq_f32(acc_3, input_3, filter_3);
+ // Store the accumulators back to acc_buffer
+ vst1q_f32(acc_buffer_ptr + 4 * 0, acc_0);
+ vst1q_f32(acc_buffer_ptr + 4 * 1, acc_1);
+ vst1q_f32(acc_buffer_ptr + 4 * 2, acc_2);
+ vst1q_f32(acc_buffer_ptr + 4 * 3, acc_3);
+ acc_buffer_ptr += 16;
+ }
+ // Handle 4 input channels at a time.
+ for (; ic <= input_depth - 4; ic += 4)
+ {
+ // Load the filters
+ float32x4_t filter;
+ filter = vld1q_f32(local_filter_ptr);
+ local_filter_ptr += 4;
+ // Load the inputs
+ float32x4_t input;
+ input = vld1q_f32(local_input_ptr);
+ local_input_ptr += 4;
+ // Load the accumulators from acc_buffer
+ float32x4_t acc;
+ acc = vld1q_f32(acc_buffer_ptr);
+ // Multiply-accumulate
+ acc = vmlaq_f32(acc, input, filter);
+ // Store the accumulators back to acc_buffer
+ vst1q_f32(acc_buffer_ptr, acc);
+ acc_buffer_ptr += 4;
+ }
+ // Handle one input channel at a time.
+ for (; ic < input_depth; ic++)
+ {
+ const float input_val = *local_input_ptr++;
+ const float filter_val = *local_filter_ptr++;
+ *acc_buffer_ptr++ += filter_val * input_val;
+ }
+ input_ptr += input_ptr_increment;
+ }
+ }
+};
+
+template <> struct FloatDepthwiseConvKernel<true, 0, 8>
+{
+ static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
+ const float *input_ptr, int input_ptr_increment, const float *filter_ptr,
+ float *acc_buffer_ptr)
+ {
+ (void)depth_multiplier;
+
+ // Handle one output pixel at a time.
+ for (int outp = 0; outp < num_output_pixels; outp++)
+ {
+ const float *local_filter_ptr = filter_ptr;
+ const float *local_input_ptr = input_ptr;
+ int ic = 0;
+ // Handle 2 input channels at a time.
+ for (; ic <= input_depth - 2; ic += 2)
+ {
+ // Load the filters
+ float32x4_t filter[4];
+ for (int i = 0; i < 4; i++)
+ {
+ filter[i] = vld1q_f32(local_filter_ptr + 4 * i);
+ }
+ local_filter_ptr += 16;
+ // Load the inputs
+ const float32x2_t input = vld1_f32(local_input_ptr);
+ local_input_ptr += 2;
+ // Load the accumulators from acc_buffer
+ float32x4_t acc[4];
+ for (int i = 0; i < 4; i++)
+ {
+ acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i);
+ }
+ // Multiply-accumulate
+ acc[0] = vmlaq_lane_f32(acc[0], filter[0], input, 0);
+ acc[1] = vmlaq_lane_f32(acc[1], filter[1], input, 0);
+ acc[2] = vmlaq_lane_f32(acc[2], filter[2], input, 1);
+ acc[3] = vmlaq_lane_f32(acc[3], filter[3], input, 1);
+ // Store the accumulators back to acc_buffer
+ for (int i = 0; i < 4; i++)
+ {
+ vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]);
+ }
+ acc_buffer_ptr += 16;
+ }
+ // Handle one input channel at a time.
+ for (; ic < input_depth; ic++)
+ {
+ // Load the filters
+ float32x4_t filter[2];
+ for (int i = 0; i < 2; i++)
+ {
+ filter[i] = vld1q_f32(local_filter_ptr + 4 * i);
+ }
+ local_filter_ptr += 8;
+ // Load the inputs
+ const float input_val = *local_input_ptr++;
+ // Load the accumulators from acc_buffer
+ float32x4_t acc[2];
+ for (int i = 0; i < 2; i++)
+ {
+ acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i);
+ }
+ // Multiply-accumulate
+ for (int i = 0; i < 2; i++)
+ {
+ acc[i] = vmlaq_n_f32(acc[i], filter[i], input_val);
+ }
+ // Store the accumulators back to acc_buffer
+ for (int i = 0; i < 2; i++)
+ {
+ vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]);
+ }
+ acc_buffer_ptr += 8;
+ }
+ input_ptr += input_ptr_increment;
+ }
+ }
+};
+
+// Note: this implementation is very slow for input_depths < 8
+// (comparable to the reference implementation); see the specializations for
+// input_depth=3 below.
+template <> struct FloatDepthwiseConvKernel<true, 0, 2>
+{
+ static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
+ const float *input_ptr, int input_ptr_increment, const float *filter_ptr,
+ float *acc_buffer_ptr)
+ {
+ (void)depth_multiplier;
+
+ // Handle one output pixel at a time.
+ for (int outp = 0; outp < num_output_pixels; outp++)
+ {
+ const float *local_filter_ptr = filter_ptr;
+ const float *local_input_ptr = input_ptr;
+ int ic = 0;
+ // Handle 8 input channels at a time.
+ for (; ic <= input_depth - 8; ic += 8)
+ {
+ // Load the filters
+ float32x4_t filter[4];
+ for (int i = 0; i < 4; i++)
+ {
+ filter[i] = vld1q_f32(local_filter_ptr + 4 * i);
+ }
+ local_filter_ptr += 16;
+ // Load the inputs
+ float32x4x2_t input_dup2[2];
+ for (int i = 0; i < 2; i++)
+ {
+ const float32x4_t input = vld1q_f32(local_input_ptr + 4 * i);
+ input_dup2[i] = vzipq_f32(input, input);
+ }
+ local_input_ptr += 8;
+ // Load the accumulators from acc_buffer
+ float32x4_t acc[4];
+ for (int i = 0; i < 4; i++)
+ {
+ acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i);
+ }
+ // Multiply-accumulate
+ acc[0] = vmlaq_f32(acc[0], filter[0], input_dup2[0].val[0]);
+ acc[1] = vmlaq_f32(acc[1], filter[1], input_dup2[0].val[1]);
+ acc[2] = vmlaq_f32(acc[2], filter[2], input_dup2[1].val[0]);
+ acc[3] = vmlaq_f32(acc[3], filter[3], input_dup2[1].val[1]);
+ // Store the accumulators back to acc_buffer
+ for (int i = 0; i < 4; i++)
+ {
+ vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]);
+ }
+ acc_buffer_ptr += 16;
+ }
+ // Handle 4 input channels at a time.
+ for (; ic <= input_depth - 4; ic += 4)
+ {
+ // Load the filters
+ float32x2_t filter[4];
+ for (int i = 0; i < 4; i++)
+ {
+ filter[i] = vld1_f32(local_filter_ptr + 2 * i);
+ }
+ local_filter_ptr += 8;
+ // Load the inputs
+ const float32x4_t input = vld1q_f32(local_input_ptr);
+ local_input_ptr += 4;
+ // Load the accumulators from acc_buffer
+ float32x2_t acc[4];
+ for (int i = 0; i < 4; i++)
+ {
+ acc[i] = vld1_f32(acc_buffer_ptr + 2 * i);
+ }
+ // Multiply-accumulate
+ acc[0] = vmla_lane_f32(acc[0], filter[0], vget_low_f32(input), 0);
+ acc[1] = vmla_lane_f32(acc[1], filter[1], vget_low_f32(input), 1);
+ acc[2] = vmla_lane_f32(acc[2], filter[2], vget_high_f32(input), 0);
+ acc[3] = vmla_lane_f32(acc[3], filter[3], vget_high_f32(input), 1);
+ // Store the accumulators back to acc_buffer
+ for (int i = 0; i < 4; i++)
+ {
+ vst1_f32(acc_buffer_ptr + 2 * i, acc[i]);
+ }
+ acc_buffer_ptr += 8;
+ }
+ // Handle 2 input channels at a time.
+ for (; ic <= input_depth - 2; ic += 2)
+ {
+ // Load the filters
+ const float32x4_t filter = vld1q_f32(local_filter_ptr);
+ local_filter_ptr += 4;
+ // Load the inputs
+ const float32x2_t input = vld1_f32(local_input_ptr);
+ local_input_ptr += 2;
+ // Load the accumulators from acc_buffer
+ float32x2_t acc[2];
+ for (int i = 0; i < 2; i++)
+ {
+ acc[i] = vld1_f32(acc_buffer_ptr + 2 * i);
+ }
+ // Multiply-accumulate
+ acc[0] = vmla_lane_f32(acc[0], vget_low_f32(filter), input, 0);
+ acc[1] = vmla_lane_f32(acc[1], vget_high_f32(filter), input, 1);
+ // Store the accumulators back to acc_buffer
+ for (int i = 0; i < 2; i++)
+ {
+ vst1_f32(acc_buffer_ptr + 2 * i, acc[i]);
+ }
+ acc_buffer_ptr += 4;
+ }
+ // Handle one input channel at a time.
+ for (; ic < input_depth; ic++)
+ {
+ // Load the inputs
+ const float input_val = *local_input_ptr++;
+ // Multiply-accumulate
+ for (int i = 0; i < 2; i++)
+ {
+ acc_buffer_ptr[i] += local_filter_ptr[i] * input_val;
+ }
+ local_filter_ptr += 2;
+ acc_buffer_ptr += 2;
+ }
+ input_ptr += input_ptr_increment;
+ }
+ }
+};
+
+template <> struct FloatDepthwiseConvKernel<true, 3, 2>
+{
+ static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
+ const float *input_ptr, int input_ptr_increment, const float *filter_ptr,
+ float *acc_buffer_ptr)
+ {
+ (void)input_depth;
+ (void)depth_multiplier;
+
+ // Load the filters
+ float32x2_t filter[3];
+ for (int i = 0; i < 3; i++)
+ {
+ filter[i] = vld1_f32(filter_ptr + 2 * i);
+ }
+ // Handle one output pixel at a time.
+ for (int outp = 0; outp < num_output_pixels; outp++)
+ {
+ const float32x2_t input01 = vld1_f32(input_ptr);
+ const float32x2_t input2 = vld1_dup_f32(input_ptr + 2);
+ // Load the accumulators from acc_buffer
+ float32x2_t acc[3];
+ for (int i = 0; i < 3; i++)
+ {
+ acc[i] = vld1_f32(acc_buffer_ptr + 2 * i);
+ }
+      // Multiply-accumulate: for each input channel there are 2 outputs
+ acc[0] = vmla_lane_f32(acc[0], filter[0], input01, 0);
+ acc[1] = vmla_lane_f32(acc[1], filter[1], input01, 1);
+ acc[2] = vmla_lane_f32(acc[2], filter[2], input2, 0);
+ // Store the accumulators back to acc_buffer
+ for (int i = 0; i < 3; i++)
+ {
+ vst1_f32(acc_buffer_ptr + 2 * i, acc[i]);
+ }
+ acc_buffer_ptr += 6;
+ input_ptr += input_ptr_increment;
+ }
+ }
+};
+
+template <> struct FloatDepthwiseConvKernel<true, 3, 4>
+{
+ static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
+ const float *input_ptr, int input_ptr_increment, const float *filter_ptr,
+ float *acc_buffer_ptr)
+ {
+ (void)input_depth;
+ (void)depth_multiplier;
+
+ // Load the filters
+ float32x4_t filter[3];
+ for (int i = 0; i < 3; i++)
+ {
+ filter[i] = vld1q_f32(filter_ptr + 4 * i);
+ }
+ // Handle one output pixel at a time.
+ for (int outp = 0; outp < num_output_pixels; outp++)
+ {
+ // NOTE: we only want 3 values, so we read it as two ops where
+ // the second op just duplicates the lane
+ const float32x2_t input01 = vld1_f32(input_ptr);
+ const float32x2_t input2 = vld1_dup_f32(input_ptr + 2);
+ // Load the accumulators from acc_buffer
+ float32x4_t acc[3];
+ for (int i = 0; i < 3; i++)
+ {
+ acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i);
+ }
+ // Multiply-accumulate all outputs.
+ acc[0] = vmlaq_lane_f32(acc[0], filter[0], input01, 0);
+ acc[1] = vmlaq_lane_f32(acc[1], filter[1], input01, 1);
+ acc[2] = vmlaq_lane_f32(acc[2], filter[2], input2, 0);
+ // Store the accumulators back to acc_buffer
+ for (int i = 0; i < 3; i++)
+ {
+ vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]);
+ }
+ acc_buffer_ptr += 12;
+ input_ptr += input_ptr_increment;
+ }
+ }
+};
+
+template <> struct FloatDepthwiseConvKernel<true, 1, 8>
+{
+ static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
+ const float *input_ptr, int input_ptr_increment, const float *filter_ptr,
+ float *acc_buffer_ptr)
+ {
+ (void)input_depth;
+ (void)depth_multiplier;
+
+ // Load the filters
+ float32x4_t filter[2];
+ for (int i = 0; i < 2; i++)
+ {
+ filter[i] = vld1q_f32(filter_ptr + 4 * i);
+ }
+ // Handle one output pixel at a time.
+ for (int outp = 0; outp < num_output_pixels; outp++)
+ {
+ // Load the inputs
+ const float input_val = *input_ptr;
+ input_ptr += input_ptr_increment;
+ // Load the accumulators from acc_buffer
+ float32x4_t acc[2];
+ for (int i = 0; i < 2; i++)
+ {
+ acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i);
+ }
+ // Multiply-accumulate
+ for (int i = 0; i < 2; i++)
+ {
+ acc[i] = vmlaq_n_f32(acc[i], filter[i], input_val);
+ }
+ // Store the accumulators back to acc_buffer
+ for (int i = 0; i < 2; i++)
+ {
+ vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]);
+ }
+ acc_buffer_ptr += 8;
+ }
+ }
+};
+
+template <> struct FloatDepthwiseConvKernel<true, 1, 32>
+{
+ static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
+ const float *input_ptr, int input_ptr_increment, const float *filter_ptr,
+ float *acc_buffer_ptr)
+ {
+ (void)input_depth;
+ (void)depth_multiplier;
+
+ // Load the filters
+ float32x4_t filter_0 = vld1q_f32(filter_ptr + 4 * 0);
+ float32x4_t filter_1 = vld1q_f32(filter_ptr + 4 * 1);
+ float32x4_t filter_2 = vld1q_f32(filter_ptr + 4 * 2);
+ float32x4_t filter_3 = vld1q_f32(filter_ptr + 4 * 3);
+ float32x4_t filter_4 = vld1q_f32(filter_ptr + 4 * 4);
+ float32x4_t filter_5 = vld1q_f32(filter_ptr + 4 * 5);
+ float32x4_t filter_6 = vld1q_f32(filter_ptr + 4 * 6);
+ float32x4_t filter_7 = vld1q_f32(filter_ptr + 4 * 7);
+
+ // Handle one output pixel at a time.
+ for (int outp = 0; outp < num_output_pixels; outp++)
+ {
+ // Load the inputs
+ const float input_val = *input_ptr;
+ input_ptr += input_ptr_increment;
+ // Load the accumulators from acc_buffer
+ float32x4_t acc_0 = vld1q_f32(acc_buffer_ptr + 4 * 0);
+ float32x4_t acc_1 = vld1q_f32(acc_buffer_ptr + 4 * 1);
+ float32x4_t acc_2 = vld1q_f32(acc_buffer_ptr + 4 * 2);
+ float32x4_t acc_3 = vld1q_f32(acc_buffer_ptr + 4 * 3);
+ float32x4_t acc_4 = vld1q_f32(acc_buffer_ptr + 4 * 4);
+ float32x4_t acc_5 = vld1q_f32(acc_buffer_ptr + 4 * 5);
+ float32x4_t acc_6 = vld1q_f32(acc_buffer_ptr + 4 * 6);
+ float32x4_t acc_7 = vld1q_f32(acc_buffer_ptr + 4 * 7);
+ // Multiply-accumulate
+ acc_0 = vmlaq_n_f32(acc_0, filter_0, input_val);
+ acc_1 = vmlaq_n_f32(acc_1, filter_1, input_val);
+ acc_2 = vmlaq_n_f32(acc_2, filter_2, input_val);
+ acc_3 = vmlaq_n_f32(acc_3, filter_3, input_val);
+ acc_4 = vmlaq_n_f32(acc_4, filter_4, input_val);
+ acc_5 = vmlaq_n_f32(acc_5, filter_5, input_val);
+ acc_6 = vmlaq_n_f32(acc_6, filter_6, input_val);
+ acc_7 = vmlaq_n_f32(acc_7, filter_7, input_val);
+ // Store the accumulators back to acc_buffer
+ vst1q_f32(acc_buffer_ptr + 4 * 0, acc_0);
+ vst1q_f32(acc_buffer_ptr + 4 * 1, acc_1);
+ vst1q_f32(acc_buffer_ptr + 4 * 2, acc_2);
+ vst1q_f32(acc_buffer_ptr + 4 * 3, acc_3);
+ vst1q_f32(acc_buffer_ptr + 4 * 4, acc_4);
+ vst1q_f32(acc_buffer_ptr + 4 * 5, acc_5);
+ vst1q_f32(acc_buffer_ptr + 4 * 6, acc_6);
+ vst1q_f32(acc_buffer_ptr + 4 * 7, acc_7);
+ acc_buffer_ptr += 32;
+ }
+ }
+};
+
+template <> struct FloatDepthwiseConvKernel<true, 1, 20>
+{
+ static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
+ const float *input_ptr, int input_ptr_increment, const float *filter_ptr,
+ float *acc_buffer_ptr)
+ {
+ (void)input_depth;
+ (void)depth_multiplier;
+
+ // Load the filters
+ float32x4_t filter_0 = vld1q_f32(filter_ptr + 4 * 0);
+ float32x4_t filter_1 = vld1q_f32(filter_ptr + 4 * 1);
+ float32x4_t filter_2 = vld1q_f32(filter_ptr + 4 * 2);
+ float32x4_t filter_3 = vld1q_f32(filter_ptr + 4 * 3);
+ float32x4_t filter_4 = vld1q_f32(filter_ptr + 4 * 4);
+
+ // Handle one output pixel at a time.
+ for (int outp = 0; outp < num_output_pixels; outp++)
+ {
+ // Load the inputs
+ const float input_val = *input_ptr;
+ input_ptr += input_ptr_increment;
+ // Load the accumulators from acc_buffer
+ float32x4_t acc_0 = vld1q_f32(acc_buffer_ptr + 4 * 0);
+ float32x4_t acc_1 = vld1q_f32(acc_buffer_ptr + 4 * 1);
+ float32x4_t acc_2 = vld1q_f32(acc_buffer_ptr + 4 * 2);
+ float32x4_t acc_3 = vld1q_f32(acc_buffer_ptr + 4 * 3);
+ float32x4_t acc_4 = vld1q_f32(acc_buffer_ptr + 4 * 4);
+ // Multiply-accumulate
+ acc_0 = vmlaq_n_f32(acc_0, filter_0, input_val);
+ acc_1 = vmlaq_n_f32(acc_1, filter_1, input_val);
+ acc_2 = vmlaq_n_f32(acc_2, filter_2, input_val);
+ acc_3 = vmlaq_n_f32(acc_3, filter_3, input_val);
+ acc_4 = vmlaq_n_f32(acc_4, filter_4, input_val);
+ // Store the accumulators back to acc_buffer
+ vst1q_f32(acc_buffer_ptr + 4 * 0, acc_0);
+ vst1q_f32(acc_buffer_ptr + 4 * 1, acc_1);
+ vst1q_f32(acc_buffer_ptr + 4 * 2, acc_2);
+ vst1q_f32(acc_buffer_ptr + 4 * 3, acc_3);
+ vst1q_f32(acc_buffer_ptr + 4 * 4, acc_4);
+ acc_buffer_ptr += 20;
+ }
+ }
+};
+
+template <> struct FloatDepthwiseConvKernel<true, 0, 16>
+{
+ static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
+ const float *input_ptr, int input_ptr_increment, const float *filter_ptr,
+ float *acc_buffer_ptr)
+ {
+ (void)depth_multiplier;
+
+ // Handle one output pixel at a time.
+ for (int outp = 0; outp < num_output_pixels; outp++)
+ {
+ const float *local_filter_ptr = filter_ptr;
+ const float *local_input_ptr = input_ptr;
+ for (int ic = 0; ic < input_depth; ic++)
+ {
+ // Load the filters
+ float32x4_t filter[4];
+ for (int i = 0; i < 4; i++)
+ {
+ filter[i] = vld1q_f32(local_filter_ptr + 4 * i);
+ }
+ local_filter_ptr += 16;
+ // Load the inputs
+ const float input_val = *local_input_ptr++;
+ // Load the accumulators from acc_buffer
+ float32x4_t acc[4];
+ for (int i = 0; i < 4; i++)
+ {
+ acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i);
+ }
+ // Multiply-accumulate
+ for (int i = 0; i < 4; i++)
+ {
+ acc[i] = vmlaq_n_f32(acc[i], filter[i], input_val);
+ }
+ // Store the accumulators back to acc_buffer
+ for (int i = 0; i < 4; i++)
+ {
+ vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]);
+ }
+ acc_buffer_ptr += 16;
+ }
+ input_ptr += input_ptr_increment;
+ }
+ }
+};
+
+template <> struct FloatDepthwiseConvKernel<true, 8, 1>
+{
+ static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
+ const float *input_ptr, int input_ptr_increment, const float *filter_ptr,
+ float *acc_buffer_ptr)
+ {
+ (void)input_depth;
+ (void)depth_multiplier;
+
+ // Load the filters
+ float32x4_t filter[2];
+ for (int i = 0; i < 2; i++)
+ {
+ filter[i] = vld1q_f32(filter_ptr + 4 * i);
+ }
+ // Handle one output pixel at a time.
+ for (int outp = 0; outp < num_output_pixels; outp++)
+ {
+ // Load the inputs
+ float32x4_t input[2];
+ for (int i = 0; i < 2; i++)
+ {
+ input[i] = vld1q_f32(input_ptr + 4 * i);
+ }
+ // Load the accumulators from acc_buffer
+ float32x4_t acc[2];
+ for (int i = 0; i < 2; i++)
+ {
+ acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i);
+ }
+ // Multiply-accumulate
+ for (int i = 0; i < 2; i++)
+ {
+ acc[i] = vmlaq_f32(acc[i], input[i], filter[i]);
+ }
+ // Store the accumulators back to acc_buffer
+ for (int i = 0; i < 2; i++)
+ {
+ vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]);
+ }
+ acc_buffer_ptr += 8;
+ input_ptr += input_ptr_increment;
+ }
+ }
+};
+
+template <> struct FloatDepthwiseConvKernel<true, 2, 1>
+{
+ static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
+ const float *input_ptr, int input_ptr_increment, const float *filter_ptr,
+ float *acc_buffer_ptr)
+ {
+ (void)input_depth;
+ (void)depth_multiplier;
+
+ float32x2_t filter = vld1_f32(filter_ptr);
+ float32x4_t filter_x4 = vcombine_f32(filter, filter);
+ int outp = 0;
+
+ // Handle two output pixels at a time.
+ for (; outp <= num_output_pixels - 2; outp += 2)
+ {
+ // Load the inputs
+ float32x2_t input_1 = vld1_f32(input_ptr);
+ input_ptr += input_ptr_increment;
+ float32x2_t input_2 = vld1_f32(input_ptr);
+ input_ptr += input_ptr_increment;
+ float32x4_t input = vcombine_f32(input_1, input_2);
+
+ // Load the accumulators from acc_buffer
+ float32x4_t acc = vld1q_f32(acc_buffer_ptr);
+
+ // Multiply-accumulate
+ acc = vmlaq_f32(acc, input, filter_x4);
+
+ // Store the accumulators back to acc_buffer
+ vst1q_f32(acc_buffer_ptr, acc);
+ acc_buffer_ptr += 4;
+ }
+ // Handle one output pixel at a time.
+ for (; outp < num_output_pixels; outp++)
+ {
+ // Load the inputs
+ float32x2_t input = vld1_f32(input_ptr);
+ input_ptr += input_ptr_increment;
+
+ // Load the accumulators from acc_buffer
+ float32x2_t acc = vld1_f32(acc_buffer_ptr);
+
+ // Multiply-accumulate
+ acc = vmla_f32(acc, input, filter);
+
+ // Store the accumulators back to acc_buffer
+ vst1_f32(acc_buffer_ptr, acc);
+ acc_buffer_ptr += 2;
+ }
+ }
+};
+
+template <> struct FloatDepthwiseConvKernel<true, 4, 1>
+{
+ static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
+ const float *input_ptr, int input_ptr_increment, const float *filter_ptr,
+ float *acc_buffer_ptr)
+ {
+ (void)input_depth;
+ (void)depth_multiplier;
+
+ float32x4_t filter = vld1q_f32(filter_ptr);
+
+ // Handle one output pixel at a time.
+ for (int outp = 0; outp < num_output_pixels; outp++)
+ {
+ // Load the inputs
+ float32x4_t input = vld1q_f32(input_ptr);
+ // Load the accumulators from acc_buffer
+ float32x4_t acc = vld1q_f32(acc_buffer_ptr);
+ // Multiply-accumulate
+ acc = vmlaq_f32(acc, input, filter);
+ // Store the accumulators back to acc_buffer
+ vst1q_f32(acc_buffer_ptr, acc);
+ acc_buffer_ptr += 4;
+ input_ptr += input_ptr_increment;
+ }
+ }
+};
+#endif
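+// When USE_NEON is not defined, none of the specializations above are
+// compiled, so DepthwiseConvImpl below always falls back to
+// FloatDepthwiseConvAccumRowGeneric.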
+
+// Accumulates the effect of one row of the filter, on a segment of one row
+// of the output, accessing the corresponding one row of the input.
+template <bool kAllowStrided, int kFixedInputDepth, int kFixedDepthMultiplier>
+void FloatDepthwiseConvAccumRow(int stride, int dilation_factor, int input_depth, int input_width,
+ const float *input_data, int pad_width, int depth_multiplier,
+ int filter_width, const float *filter_data, int out_x_buffer_start,
+ int out_x_buffer_end, int output_depth, float *acc_buffer)
+{
+ // Sanity check parameters. This is important in particular to ensure
+ // that we keep the number of template instantiations minimal, so we don't
+ // increase binary size unnecessarily.
+ static_assert(kFixedDepthMultiplier || !kFixedInputDepth, "");
+ static_assert(kFixedInputDepth || kAllowStrided, "");
+ assert(stride == 1 || kAllowStrided);
+ if (kFixedInputDepth)
+ {
+ assert(input_depth == kFixedInputDepth);
+ }
+ if (kFixedDepthMultiplier)
+ {
+ assert(depth_multiplier == kFixedDepthMultiplier);
+ }
+ assert(output_depth == input_depth * depth_multiplier);
+ const int input_ptr_increment = stride * input_depth;
+ const float *filter_base_ptr = filter_data;
+ for (int filter_x = 0; filter_x < filter_width; ++filter_x)
+ {
+ // For the current (filter_x, filter_y) point in the filter,
+ // compute the boundaries of the corresponding output row segment.
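+    // An output position out_x is in range when its input position
+    //   in_x = out_x * stride - pad_width + dilation_factor * filter_x
+    // lies in [0, input_width); the divisions below compute (before clamping)
+    // the first and one-past-the-last such out_x.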
+ int out_x_loop_start_unclamped = 0;
+ int out_x_loop_end_unclamped = 0;
+ if (kAllowStrided)
+ {
+ if (stride == 2)
+ {
+ out_x_loop_start_unclamped = (pad_width - dilation_factor * filter_x + 1) / 2;
+ out_x_loop_end_unclamped = (pad_width + input_width - dilation_factor * filter_x + 1) / 2;
+ }
+ else if (stride == 4)
+ {
+ out_x_loop_start_unclamped = (pad_width - dilation_factor * filter_x + 3) / 4;
+ out_x_loop_end_unclamped = (pad_width + input_width - dilation_factor * filter_x + 3) / 4;
+ }
+ else
+ {
+ out_x_loop_start_unclamped = (pad_width - dilation_factor * filter_x + stride - 1) / stride;
+ out_x_loop_end_unclamped =
+ (pad_width + input_width - dilation_factor * filter_x + stride - 1) / stride;
+ }
+ }
+ else
+ {
+ out_x_loop_start_unclamped = pad_width - dilation_factor * filter_x;
+ out_x_loop_end_unclamped = pad_width + input_width - dilation_factor * filter_x;
+ }
+    // The kernel will have to iterate over the segment of the
+    // output row from out_x_loop_start to out_x_loop_end.
+ const int out_x_loop_start = std::max(out_x_buffer_start, out_x_loop_start_unclamped);
+ const int out_x_loop_end = std::min(out_x_buffer_end, out_x_loop_end_unclamped);
+
+ float *acc_buffer_ptr = acc_buffer + (out_x_loop_start - out_x_buffer_start) * output_depth;
+ const int in_x_origin = (out_x_loop_start * stride) - pad_width + dilation_factor * filter_x;
+ const float *input_ptr = input_data + in_x_origin * input_depth;
+ const int num_output_pixels = out_x_loop_end - out_x_loop_start;
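+    // input_ptr now points at the first input pixel touched by this filter
+    // tap; the kernels step through the row by input_ptr_increment
+    // (= stride * input_depth) floats per output pixel.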
+ FloatDepthwiseConvKernel<kAllowStrided, kFixedInputDepth, kFixedDepthMultiplier>::Run(
+ num_output_pixels, input_depth, depth_multiplier, input_ptr, input_ptr_increment,
+ filter_base_ptr, acc_buffer_ptr);
+ filter_base_ptr += output_depth;
+ }
+}
+
+// Generic fallback of FloatDepthwiseConvAccumRow: portable and non-templatized.
+inline void FloatDepthwiseConvAccumRowGeneric(int stride, int dilation_factor, int input_depth,
+ int input_width, const float *input_data,
+ int pad_width, int depth_multiplier, int filter_width,
+ const float *filter_data, int out_x_buffer_start,
+ int out_x_buffer_end, int output_depth,
+ float *acc_buffer)
+{
+ const float *filter_base_ptr = filter_data;
+ for (int filter_x = 0; filter_x < filter_width; ++filter_x)
+ {
+ const int out_x_loop_start =
+ std::max(out_x_buffer_start, (pad_width - dilation_factor * filter_x + stride - 1) / stride);
+ const int out_x_loop_end =
+ std::min(out_x_buffer_end,
+ (pad_width + input_width - dilation_factor * filter_x + stride - 1) / stride);
+
+ float *acc_buffer_ptr = acc_buffer + (out_x_loop_start - out_x_buffer_start) * output_depth;
+ const int in_x_origin = (out_x_loop_start * stride) - pad_width + dilation_factor * filter_x;
+ const float *input_ptr = input_data + in_x_origin * input_depth;
+ const int input_ptr_increment = (stride - 1) * input_depth;
+ for (int out_x = out_x_loop_start; out_x < out_x_loop_end; out_x++)
+ {
+ const float *filter_ptr = filter_base_ptr;
+ for (int ic = 0; ic < input_depth; ++ic)
+ {
+ const float input_val = *input_ptr++;
+ for (int m = 0; m < depth_multiplier; m++)
+ {
+ const float filter_val = *filter_ptr++;
+ *acc_buffer_ptr++ += filter_val * input_val;
+ }
+ }
+ input_ptr += input_ptr_increment;
+ }
+ filter_base_ptr += output_depth;
+ }
+}
+
+// Initializes the accumulator buffer with bias values.
+inline void DepthwiseConvInitAccBuffer(int num_output_pixels, int output_depth,
+ const float *bias_data, float *acc_buffer)
+{
+ // TODO(benoitjacob): This might need optimized specializations
+ // for small output_depth values, if that ever becomes an important
+ // case (like it was for some quantized DepthwiseConv cases).
+ for (int i = 0; i < num_output_pixels; i++)
+ {
+ memcpy(acc_buffer + i * output_depth, bias_data, sizeof(acc_buffer[0]) * output_depth);
+ }
+}
+
+// DepthwiseConv can run with multiple threads along the dimension specified by
+// thread_dim. Each thread processes the output elements whose index along
+// thread_dim lies in the range [thread_start, thread_end).
+// For example, with thread_start = 2, thread_end = 6, and thread_dim = 1, a
+// thread computes DepthwiseConv for output rows 2..5, i.e.
+// output_data[:, 2:6, :, :].
+inline void DepthwiseConvImpl(const DepthwiseConvParams &params, const Shape &input_shape,
+ const float *input_data, const Shape &filter_shape,
+ const float *filter_data, const Shape &bias_shape,
+ const float *bias_data, const Shape &output_shape, float *output_data,
+ int thread_start, int thread_end, int thread_dim)
+{
+ UNUSED_RELEASE(bias_shape);
+ const int stride_width = params.stride_width;
+ const int stride_height = params.stride_height;
+ const int pad_width = params.padding_values.width;
+ const int pad_height = params.padding_values.height;
+ const int depth_multiplier = params.depth_multiplier;
+ const float output_activation_min = params.float_activation_min;
+ const float output_activation_max = params.float_activation_max;
+ const int dilation_width_factor = params.dilation_width_factor;
+ const int dilation_height_factor = params.dilation_height_factor;
+ assert(input_shape.DimensionsCount() == 4);
+ assert(filter_shape.DimensionsCount() == 4);
+ assert(output_shape.DimensionsCount() == 4);
+ assert(thread_dim == 0 || thread_dim == 1);
+
+ const int batches = MatchingDim(input_shape, 0, output_shape, 0);
+ const int output_depth = MatchingDim(filter_shape, 3, output_shape, 3);
+ const int input_height = input_shape.Dims(1);
+ const int input_width = input_shape.Dims(2);
+ const int input_depth = input_shape.Dims(3);
+ const int filter_height = filter_shape.Dims(1);
+ const int filter_width = filter_shape.Dims(2);
+ const int output_height = output_shape.Dims(1);
+ const int output_width = output_shape.Dims(2);
+ assert(output_depth == input_depth * depth_multiplier);
+ assert(bias_shape.FlatSize() == output_depth);
+
+ static const int kAccBufferMaxSize = 4832;
+ float acc_buffer[kAccBufferMaxSize];
+ assert(kAccBufferMaxSize >= output_depth);
+ const int kOutputPixelsInAccBuffer = kAccBufferMaxSize / output_depth;
+ const int kAccBufferActualSize = kOutputPixelsInAccBuffer * output_depth;
+ assert(kOutputPixelsInAccBuffer * output_depth <= kAccBufferActualSize);
+ assert(kAccBufferActualSize <= kAccBufferMaxSize);
+ assert(kOutputPixelsInAccBuffer >= 1);
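+  // The output row is processed in chunks of at most kOutputPixelsInAccBuffer
+  // pixels, so the accumulators for one chunk (output_depth floats per pixel)
+  // always fit in the fixed-size stack buffer above.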
+
+ UNUSED_RELEASE(kAccBufferActualSize);
+
+ // row_accum_func will point to the core accumulation function to be used
+ // for this DepthwiseConv op.
+ using row_accum_func_t = decltype(&FloatDepthwiseConvAccumRowGeneric);
+ row_accum_func_t row_accum_func = nullptr;
+
+#define TFMINI_USE_DEPTHWISECONV_KERNEL(ALLOW_STRIDED, FIXED_INPUT_DEPTH, FIXED_DEPTH_MULTIPLIER) \
+ if (!row_accum_func && (stride_width == 1 || ALLOW_STRIDED) && \
+ (input_depth == FIXED_INPUT_DEPTH || FIXED_INPUT_DEPTH == 0) && \
+ depth_multiplier == FIXED_DEPTH_MULTIPLIER) \
+ { \
+ row_accum_func = \
+ FloatDepthwiseConvAccumRow<ALLOW_STRIDED, FIXED_INPUT_DEPTH, FIXED_DEPTH_MULTIPLIER>; \
+ }
+
+#ifdef USE_NEON
+  // We go over our list of kernels in decreasing order of preference
+ // for the cases where multiple kernels could apply.
+
+ // Start with the fastest kernels: AllowStrided=false, fixed input depth.
+
+ TFMINI_USE_DEPTHWISECONV_KERNEL(false, 8, 1)
+ TFMINI_USE_DEPTHWISECONV_KERNEL(false, 2, 1)
+
+ // Next come the strided kernels: AllowStrided=true, fixed input depth.
+ // They are a bit less efficient, but allow stride!=1.
+
+ TFMINI_USE_DEPTHWISECONV_KERNEL(true, 8, 1)
+ TFMINI_USE_DEPTHWISECONV_KERNEL(true, 1, 8)
+ TFMINI_USE_DEPTHWISECONV_KERNEL(true, 1, 20)
+ TFMINI_USE_DEPTHWISECONV_KERNEL(true, 1, 32)
+ TFMINI_USE_DEPTHWISECONV_KERNEL(true, 2, 1)
+ TFMINI_USE_DEPTHWISECONV_KERNEL(true, 3, 2)
+ TFMINI_USE_DEPTHWISECONV_KERNEL(true, 3, 4)
+ TFMINI_USE_DEPTHWISECONV_KERNEL(true, 4, 1)
+
+  // Finally, the kernels allowing a variable input depth;
+ // these are the least efficient but most general kernels.
+
+ TFMINI_USE_DEPTHWISECONV_KERNEL(true, 0, 1)
+ TFMINI_USE_DEPTHWISECONV_KERNEL(true, 0, 2)
+ TFMINI_USE_DEPTHWISECONV_KERNEL(true, 0, 8)
+ TFMINI_USE_DEPTHWISECONV_KERNEL(true, 0, 16)
+
+#endif // USE_NEON
+
+#undef TFMINI_USE_DEPTHWISECONV_KERNEL
+
+ // No matching fast kernel found, use slow fallback.
+ if (!row_accum_func)
+ {
+ row_accum_func = FloatDepthwiseConvAccumRowGeneric;
+ }
+
+ const int input_height_stride = input_shape.Dims(3) * input_shape.Dims(2);
+ const int input_batch_stride = input_height_stride * input_shape.Dims(1);
+ const int filter_height_stride = filter_shape.Dims(3) * filter_shape.Dims(2);
+
+ // Now that we have determined row_accum_func, we can start work.
+ int batch_start = 0;
+ int batch_end = batches;
+ int row_start = 0;
+ int row_end = output_height;
+ int output_ptr_offset = 0;
+
+ switch (thread_dim)
+ {
+ case 0:
+      // Multithread along the batch axis
+ assert(thread_start >= 0);
+ assert(thread_end <= batches);
+ batch_start = thread_start;
+ batch_end = thread_end;
+ output_ptr_offset = batch_start * FlatSizeSkipDim(output_shape, 0);
+ break;
+ case 1:
+      // Multithread along the row axis
+ assert(thread_start >= 0);
+ assert(thread_end <= output_height);
+ row_start = thread_start;
+ row_end = thread_end;
+ output_ptr_offset = row_start * output_width * output_depth;
+ break;
+ }
+
+ float *output_ptr = output_data + output_ptr_offset;
+ int batch_step = (output_height + row_start - row_end) * output_width * output_depth;
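+  // After the row loop below advances output_ptr by (row_end - row_start)
+  // rows, adding batch_step skips the remaining rows of the batch, landing on
+  // this thread's first row of the next batch.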
+
+ for (int b = batch_start; b < batch_end; ++b)
+ {
+ for (int out_y = row_start; out_y < row_end; ++out_y)
+ {
+ const int in_y_origin = (out_y * stride_height) - pad_height;
+ const int filter_y_start =
+ std::max(0, (-in_y_origin + dilation_height_factor - 1) / dilation_height_factor);
+ const int filter_y_end =
+ std::min(filter_height, (input_height - in_y_origin + dilation_height_factor - 1) /
+ dilation_height_factor);
+ for (int out_x_buffer_start = 0; out_x_buffer_start < output_width;
+ out_x_buffer_start += kOutputPixelsInAccBuffer)
+ {
+ const int out_x_buffer_end =
+ std::min(output_width, out_x_buffer_start + kOutputPixelsInAccBuffer);
+        // We call a 'pixel' a group of activations that share all but the
+ // 'depth'/'channel' coordinate. num_output_pixels is the number of
+ // output pixels that we will accumulate in this loop iteration.
+ const int num_output_pixels = out_x_buffer_end - out_x_buffer_start;
+ // Initialize our local accumulator with the bias values, so we don't
+ // have to add them later.
+ DepthwiseConvInitAccBuffer(num_output_pixels, output_depth, bias_data, acc_buffer);
+ // Accumulation loop. Most of the time should be spent in here.
+ for (int filter_y = filter_y_start; filter_y < filter_y_end; ++filter_y)
+ {
+ const int in_y = in_y_origin + dilation_height_factor * filter_y;
+ row_accum_func(stride_width, dilation_width_factor, input_depth, input_width,
+ input_data + in_y * input_height_stride + b * input_batch_stride,
+ pad_width, depth_multiplier, filter_width,
+ filter_data + filter_y * filter_height_stride, out_x_buffer_start,
+ out_x_buffer_end, output_depth, acc_buffer);
+ }
+ // Finished accumulating. Now store to destination.
+ const int num_output_values = output_depth * num_output_pixels;
+ int i = 0;
+// TODO(benoitjacob) optimized code goes here
+#ifdef USE_NEON
+ // Handle 16 values at a time
+ for (; i <= num_output_values - 16; i += 16)
+ {
+ float32x4_t acc[4];
+ for (int k = 0; k < 4; k++)
+ {
+ acc[k] = vld1q_f32(acc_buffer + i + 4 * k);
+ }
+ for (int k = 0; k < 4; k++)
+ {
+ acc[k] = vmaxq_f32(vdupq_n_f32(output_activation_min),
+ vminq_f32(vdupq_n_f32(output_activation_max), acc[k]));
+ }
+ for (int k = 0; k < 4; k++)
+ {
+ vst1q_f32(output_ptr + 4 * k, acc[k]);
+ }
+ output_ptr += 16;
+ }
+ // Handle 4 values at a time
+ for (; i <= num_output_values - 4; i += 4)
+ {
+ float32x4_t acc = vld1q_f32(acc_buffer + i);
+
+ acc = vmaxq_f32(vdupq_n_f32(output_activation_min),
+ vminq_f32(vdupq_n_f32(output_activation_max), acc));
+
+ vst1q_f32(output_ptr, acc);
+ output_ptr += 4;
+ }
+#endif
+ // Handle leftover values, one by one. This is very slow.
+ for (; i < num_output_values; i++)
+ {
+ float acc = acc_buffer[i];
+ acc = std::max(output_activation_min, std::min(output_activation_max, acc));
+
+ *output_ptr++ = acc;
+ }
+ }
+ }
+ output_ptr += batch_step;
+ }
+}
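+
+// Illustrative only (not part of this header): a caller that splits the output
+// rows across workers, with a caller-chosen rows_per_thread, could invoke for
+// worker t:
+//   DepthwiseConvImpl(params, input_shape, input_data, filter_shape,
+//                     filter_data, bias_shape, bias_data, output_shape,
+//                     output_data,
+//                     /*thread_start=*/t * rows_per_thread,
+//                     /*thread_end=*/std::min(output_height, (t + 1) * rows_per_thread),
+//                     /*thread_dim=*/1);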
+
+} // namespace optimized
+} // namespace cker
+} // namespace nnfw
+
+#endif
diff --git a/compute/cker/include/cker/operation/optimized/DepthwiseConvUint8.h b/compute/cker/include/cker/operation/optimized/DepthwiseConvUint8.h
index d383b126d..5ca56fd09 100644
--- a/compute/cker/include/cker/operation/optimized/DepthwiseConvUint8.h
+++ b/compute/cker/include/cker/operation/optimized/DepthwiseConvUint8.h
@@ -32,6 +32,8 @@ namespace cker
{
namespace optimized
{
+namespace depthwise_conv
+{
// Implementation of quantized DepthwiseConv
@@ -44,8 +46,8 @@ struct QuantizedDepthwiseConvKernel
template <> struct QuantizedDepthwiseConvKernel<true, 8, 2>
{
static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
- const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment,
- const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr)
+ const uint8_t *input_ptr, int16_t input_offset, int input_ptr_increment,
+ const uint8_t *filter_ptr, int16_t filter_offset, int32_t *acc_buffer_ptr)
{
(void)input_depth;
(void)depth_multiplier;
@@ -57,7 +59,7 @@ template <> struct QuantizedDepthwiseConvKernel<true, 8, 2>
for (int i = 0; i < 2; i++)
{
filter[i] =
- vaddq_s16(vreinterpretq_s16_u16(vmovl_u8(filter_u8.val[i])), vdupq_n_s16(filter_offset));
+ vaddq_s16(vreinterpretq_s16_u16(vmovl_u8(filter_u8.val[i])), vdupq_n_s16(filter_offset));
}
// Handle one output pixel at a time.
for (int outp = 0; outp < num_output_pixels; outp++)
@@ -80,9 +82,9 @@ template <> struct QuantizedDepthwiseConvKernel<true, 8, 2>
for (int i = 0; i < 2; i++)
{
acc[0].val[i] =
- vmlal_s16(acc[0].val[i], vget_low_s16(filter[i]), vget_low_s16(input_dup2.val[i]));
+ vmlal_s16(acc[0].val[i], vget_low_s16(filter[i]), vget_low_s16(input_dup2.val[i]));
acc[1].val[i] =
- vmlal_s16(acc[1].val[i], vget_high_s16(filter[i]), vget_high_s16(input_dup2.val[i]));
+ vmlal_s16(acc[1].val[i], vget_high_s16(filter[i]), vget_high_s16(input_dup2.val[i]));
}
// Store the accumulators back to acc_buffer
for (int i = 0; i < 2; i++)
@@ -98,8 +100,8 @@ template <> struct QuantizedDepthwiseConvKernel<true, 8, 2>
template <> struct QuantizedDepthwiseConvKernel<false, 8, 1>
{
static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
- const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment,
- const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr)
+ const uint8_t *input_ptr, int16_t input_offset, int input_ptr_increment,
+ const uint8_t *filter_ptr, int16_t filter_offset, int32_t *acc_buffer_ptr)
{
(void)input_depth;
(void)depth_multiplier;
@@ -174,8 +176,8 @@ template <> struct QuantizedDepthwiseConvKernel<false, 8, 1>
template <> struct QuantizedDepthwiseConvKernel<false, 4, 2>
{
static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
- const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment,
- const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr)
+ const uint8_t *input_ptr, int16_t input_offset, int input_ptr_increment,
+ const uint8_t *filter_ptr, int16_t filter_offset, int32_t *acc_buffer_ptr)
{
(void)input_depth;
(void)depth_multiplier;
@@ -206,9 +208,9 @@ template <> struct QuantizedDepthwiseConvKernel<false, 4, 2>
for (int i = 0; i < 2; i++)
{
acc[2 * i + 0] =
- vmlal_s16(acc[2 * i + 0], vget_low_s16(filter), vget_low_s16(input_dup2.val[i]));
+ vmlal_s16(acc[2 * i + 0], vget_low_s16(filter), vget_low_s16(input_dup2.val[i]));
acc[2 * i + 1] =
- vmlal_s16(acc[2 * i + 1], vget_high_s16(filter), vget_high_s16(input_dup2.val[i]));
+ vmlal_s16(acc[2 * i + 1], vget_high_s16(filter), vget_high_s16(input_dup2.val[i]));
}
// Store the accumulators back to acc_buffer
for (int i = 0; i < 4; i++)
@@ -253,8 +255,8 @@ template <> struct QuantizedDepthwiseConvKernel<false, 4, 2>
template <> struct QuantizedDepthwiseConvKernel<false, 2, 8>
{
static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
- const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment,
- const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr)
+ const uint8_t *input_ptr, int16_t input_offset, int input_ptr_increment,
+ const uint8_t *filter_ptr, int16_t filter_offset, int32_t *acc_buffer_ptr)
{
(void)input_depth;
(void)depth_multiplier;
@@ -338,8 +340,8 @@ template <> struct QuantizedDepthwiseConvKernel<false, 2, 8>
template <> struct QuantizedDepthwiseConvKernel<false, 2, 2>
{
static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
- const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment,
- const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr)
+ const uint8_t *input_ptr, int16_t input_offset, int input_ptr_increment,
+ const uint8_t *filter_ptr, int16_t filter_offset, int32_t *acc_buffer_ptr)
{
(void)input_depth;
(void)depth_multiplier;
@@ -409,8 +411,8 @@ template <> struct QuantizedDepthwiseConvKernel<false, 2, 2>
template <> struct QuantizedDepthwiseConvKernel<false, 2, 1>
{
static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
- const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment,
- const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr)
+ const uint8_t *input_ptr, int16_t input_offset, int input_ptr_increment,
+ const uint8_t *filter_ptr, int16_t filter_offset, int32_t *acc_buffer_ptr)
{
(void)input_depth;
(void)depth_multiplier;
@@ -534,8 +536,8 @@ template <> struct QuantizedDepthwiseConvKernel<false, 2, 1>
template <> struct QuantizedDepthwiseConvKernel<false, 1, 2>
{
static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
- const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment,
- const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr)
+ const uint8_t *input_ptr, int16_t input_offset, int input_ptr_increment,
+ const uint8_t *filter_ptr, int16_t filter_offset, int32_t *acc_buffer_ptr)
{
(void)input_depth;
(void)depth_multiplier;
@@ -600,8 +602,8 @@ template <> struct QuantizedDepthwiseConvKernel<false, 1, 2>
template <> struct QuantizedDepthwiseConvKernel<false, 1, 4>
{
static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
- const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment,
- const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr)
+ const uint8_t *input_ptr, int16_t input_offset, int input_ptr_increment,
+ const uint8_t *filter_ptr, int16_t filter_offset, int32_t *acc_buffer_ptr)
{
(void)input_depth;
(void)depth_multiplier;
@@ -703,8 +705,8 @@ template <> struct QuantizedDepthwiseConvKernel<false, 1, 4>
template <> struct QuantizedDepthwiseConvKernel<false, 4, 1>
{
static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
- const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment,
- const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr)
+ const uint8_t *input_ptr, int16_t input_offset, int input_ptr_increment,
+ const uint8_t *filter_ptr, int16_t filter_offset, int32_t *acc_buffer_ptr)
{
(void)input_depth;
(void)depth_multiplier;
@@ -778,8 +780,8 @@ template <> struct QuantizedDepthwiseConvKernel<false, 4, 1>
template <> struct QuantizedDepthwiseConvKernel<false, 4, 4>
{
static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
- const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment,
- const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr)
+ const uint8_t *input_ptr, int16_t input_offset, int input_ptr_increment,
+ const uint8_t *filter_ptr, int16_t filter_offset, int32_t *acc_buffer_ptr)
{
(void)input_depth;
(void)depth_multiplier;
@@ -864,8 +866,8 @@ template <> struct QuantizedDepthwiseConvKernel<false, 4, 4>
template <> struct QuantizedDepthwiseConvKernel<true, 0, 3>
{
static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
- const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment,
- const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr)
+ const uint8_t *input_ptr, int16_t input_offset, int input_ptr_increment,
+ const uint8_t *filter_ptr, int16_t filter_offset, int32_t *acc_buffer_ptr)
{
(void)input_depth;
(void)depth_multiplier;
@@ -873,7 +875,7 @@ template <> struct QuantizedDepthwiseConvKernel<true, 0, 3>
// We will do that by register-level table-look-up using VTBL instructions.
// Here we prepare the registers containing the table-lookup indices.
static const uint8_t dup3_indices_array[3][8] = {
- {0, 0, 0, 1, 1, 1, 2, 2}, {2, 3, 3, 3, 4, 4, 4, 5}, {5, 5, 6, 6, 6, 7, 7, 7}};
+ {0, 0, 0, 1, 1, 1, 2, 2}, {2, 3, 3, 3, 4, 4, 4, 5}, {5, 5, 6, 6, 6, 7, 7, 7}};
uint8x8_t dup3_indices[3];
for (int i = 0; i < 3; i++)
{
@@ -928,9 +930,9 @@ template <> struct QuantizedDepthwiseConvKernel<true, 0, 3>
for (int j = 0; j < 3; j++)
{
acc[0].val[j] =
- vmlal_s16(acc[0].val[j], vget_low_s16(input_dup3[j]), vget_low_s16(filter[j]));
+ vmlal_s16(acc[0].val[j], vget_low_s16(input_dup3[j]), vget_low_s16(filter[j]));
acc[1].val[j] =
- vmlal_s16(acc[1].val[j], vget_high_s16(input_dup3[j]), vget_high_s16(filter[j]));
+ vmlal_s16(acc[1].val[j], vget_high_s16(input_dup3[j]), vget_high_s16(filter[j]));
}
// Store the accumulators back to acc_buffer
for (int i = 0; i < 2; i++)
@@ -944,10 +946,10 @@ template <> struct QuantizedDepthwiseConvKernel<true, 0, 3>
// Handle one input channel at a time.
for (; ic < input_depth; ic++)
{
- const uint16_t input_val = *local_input_ptr++ + input_offset;
+ const int16_t input_val = *local_input_ptr++ + input_offset;
for (int i = 0; i < 3; i++)
{
- const uint16_t filter_val = local_filter_ptr[i] + filter_offset;
+ const int16_t filter_val = local_filter_ptr[i] + filter_offset;
*acc_buffer_ptr++ += static_cast<int32_t>(filter_val) * input_val;
}
local_filter_ptr += 3;
@@ -960,8 +962,8 @@ template <> struct QuantizedDepthwiseConvKernel<true, 0, 3>
template <> struct QuantizedDepthwiseConvKernel<true, 0, 2>
{
static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
- const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment,
- const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr)
+ const uint8_t *input_ptr, int16_t input_offset, int input_ptr_increment,
+ const uint8_t *filter_ptr, int16_t filter_offset, int32_t *acc_buffer_ptr)
{
(void)input_depth;
(void)depth_multiplier;
@@ -1002,9 +1004,9 @@ template <> struct QuantizedDepthwiseConvKernel<true, 0, 2>
for (int j = 0; j < 2; j++)
{
acc[0].val[j] =
- vmlal_s16(acc[0].val[j], vget_low_s16(filter[j]), vget_low_s16(input_dup2.val[j]));
+ vmlal_s16(acc[0].val[j], vget_low_s16(filter[j]), vget_low_s16(input_dup2.val[j]));
acc[1].val[j] =
- vmlal_s16(acc[1].val[j], vget_high_s16(filter[j]), vget_high_s16(input_dup2.val[j]));
+ vmlal_s16(acc[1].val[j], vget_high_s16(filter[j]), vget_high_s16(input_dup2.val[j]));
}
// Store the accumulators back to acc_buffer.
for (int i = 0; i < 2; i++)
@@ -1018,10 +1020,10 @@ template <> struct QuantizedDepthwiseConvKernel<true, 0, 2>
for (; ic < input_depth; ic++)
{
// Load the inputs.
- const uint16_t input_val = *local_input_ptr++ + input_offset;
+ const int16_t input_val = *local_input_ptr++ + input_offset;
for (int i = 0; i < 2; i++)
{
- const uint16_t filter_val = local_filter_ptr[i] + filter_offset;
+ const int16_t filter_val = local_filter_ptr[i] + filter_offset;
*acc_buffer_ptr++ += static_cast<int32_t>(filter_val) * input_val;
}
local_filter_ptr += 2;
@@ -1034,8 +1036,8 @@ template <> struct QuantizedDepthwiseConvKernel<true, 0, 2>
template <> struct QuantizedDepthwiseConvKernel<true, 0, 1>
{
static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
- const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment,
- const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr)
+ const uint8_t *input_ptr, int16_t input_offset, int input_ptr_increment,
+ const uint8_t *filter_ptr, int16_t filter_offset, int32_t *acc_buffer_ptr)
{
(void)input_depth;
(void)depth_multiplier;
@@ -1112,8 +1114,8 @@ template <> struct QuantizedDepthwiseConvKernel<true, 0, 1>
// Handle one input channel at a time.
for (; ic < input_depth; ic++)
{
- const uint16_t input_val = *local_input_ptr++ + input_offset;
- const uint16_t filter_val = *local_filter_ptr++ + filter_offset;
+ const int16_t input_val = *local_input_ptr++ + input_offset;
+ const int16_t filter_val = *local_filter_ptr++ + filter_offset;
*acc_buffer_ptr++ += static_cast<int32_t>(filter_val) * input_val;
}
input_ptr += input_ptr_increment;
@@ -1124,8 +1126,8 @@ template <> struct QuantizedDepthwiseConvKernel<true, 0, 1>
template <> struct QuantizedDepthwiseConvKernel<true, 16, 1>
{
static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
- const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment,
- const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr)
+ const uint8_t *input_ptr, int16_t input_offset, int input_ptr_increment,
+ const uint8_t *filter_ptr, int16_t filter_offset, int32_t *acc_buffer_ptr)
{
(void)input_depth;
(void)depth_multiplier;
@@ -1174,7 +1176,7 @@ template <> struct QuantizedDepthwiseConvKernel<true, 16, 1>
{
acc[2 * i + 0] = vmlal_s16(acc[2 * i + 0], vget_low_s16(input[i]), vget_low_s16(filter[i]));
acc[2 * i + 1] =
- vmlal_s16(acc[2 * i + 1], vget_high_s16(input[i]), vget_high_s16(filter[i]));
+ vmlal_s16(acc[2 * i + 1], vget_high_s16(input[i]), vget_high_s16(filter[i]));
}
// Store the accumulators back to acc_buffer
for (int i = 0; i < 4; i++)
@@ -1189,8 +1191,8 @@ template <> struct QuantizedDepthwiseConvKernel<true, 16, 1>
template <> struct QuantizedDepthwiseConvKernel<true, 8, 1>
{
static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
- const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment,
- const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr)
+ const uint8_t *input_ptr, int16_t input_offset, int input_ptr_increment,
+ const uint8_t *filter_ptr, int16_t filter_offset, int32_t *acc_buffer_ptr)
{
(void)input_depth;
(void)depth_multiplier;
@@ -1228,8 +1230,8 @@ template <> struct QuantizedDepthwiseConvKernel<true, 8, 1>
template <> struct QuantizedDepthwiseConvKernel<true, 1, 16>
{
static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
- const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment,
- const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr)
+ const uint8_t *input_ptr, int16_t input_offset, int input_ptr_increment,
+ const uint8_t *filter_ptr, int16_t filter_offset, int32_t *acc_buffer_ptr)
{
(void)input_depth;
(void)depth_multiplier;
@@ -1253,7 +1255,7 @@ template <> struct QuantizedDepthwiseConvKernel<true, 1, 16>
{
uint8_t input_u8 = *input_ptr;
input_ptr += input_ptr_increment;
- uint16_t input = static_cast<int16_t>(input_u8 + input_offset);
+ int16_t input = static_cast<int16_t>(input_u8) + input_offset;
// Load the accumulators from acc_buffer
int32x4_t acc[4];
for (int i = 0; i < 4; i++)
@@ -1279,8 +1281,8 @@ template <> struct QuantizedDepthwiseConvKernel<true, 1, 16>
template <> struct QuantizedDepthwiseConvKernel<true, 1, 32>
{
static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
- const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment,
- const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr)
+ const uint8_t *input_ptr, int16_t input_offset, int input_ptr_increment,
+ const uint8_t *filter_ptr, int16_t filter_offset, int32_t *acc_buffer_ptr)
{
(void)input_depth;
(void)depth_multiplier;
@@ -1302,7 +1304,7 @@ template <> struct QuantizedDepthwiseConvKernel<true, 1, 32>
{
uint8_t input_u8 = *input_ptr;
input_ptr += input_ptr_increment;
- uint16_t input = static_cast<int16_t>(input_u8 + input_offset);
+ int16_t input = static_cast<int16_t>(input_u8) + input_offset;
// Load the accumulators from acc_buffer
int32x4_t acc_0 = vld1q_s32(acc_buffer_ptr + 4 * 0);
int32x4_t acc_1 = vld1q_s32(acc_buffer_ptr + 4 * 1);
@@ -1338,8 +1340,8 @@ template <> struct QuantizedDepthwiseConvKernel<true, 1, 32>
template <> struct QuantizedDepthwiseConvKernel<true, 1, 20>
{
static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
- const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment,
- const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr)
+ const uint8_t *input_ptr, int16_t input_offset, int input_ptr_increment,
+ const uint8_t *filter_ptr, int16_t filter_offset, int32_t *acc_buffer_ptr)
{
(void)input_depth;
(void)depth_multiplier;
@@ -1363,7 +1365,7 @@ template <> struct QuantizedDepthwiseConvKernel<true, 1, 20>
{
uint8_t input_u8 = *input_ptr;
input_ptr += input_ptr_increment;
- uint16_t input = static_cast<int16_t>(input_u8 + input_offset);
+ int16_t input = static_cast<int16_t>(input_u8) + input_offset;
// Load the accumulators from acc_buffer
int32x4_t acc_0 = vld1q_s32(acc_buffer_ptr + 4 * 0);
int32x4_t acc_1 = vld1q_s32(acc_buffer_ptr + 4 * 1);
@@ -1390,21 +1392,21 @@ template <> struct QuantizedDepthwiseConvKernel<true, 1, 20>
template <> struct QuantizedDepthwiseConvKernel<true, 1, 8>
{
static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
- const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment,
- const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr)
+ const uint8_t *input_ptr, int16_t input_offset, int input_ptr_increment,
+ const uint8_t *filter_ptr, int16_t filter_offset, int32_t *acc_buffer_ptr)
{
(void)input_depth;
(void)depth_multiplier;
// Load the filters, add filter_offset.
const uint8x8_t filter_u8 = vld1_u8(filter_ptr);
const int16x8_t filter =
- vaddq_s16(vreinterpretq_s16_u16(vmovl_u8(filter_u8)), vdupq_n_s16(filter_offset));
+ vaddq_s16(vreinterpretq_s16_u16(vmovl_u8(filter_u8)), vdupq_n_s16(filter_offset));
// Handle one output pixel at a time.
for (int outp = 0; outp < num_output_pixels; outp++)
{
uint8_t input_u8 = *input_ptr;
input_ptr += input_ptr_increment;
- uint16_t input = static_cast<int16_t>(input_u8 + input_offset);
+ int16_t input = static_cast<int16_t>(input_u8) + input_offset;
// Load the accumulators from acc_buffer
int32x4_t acc[2];
for (int i = 0; i < 2; i++)
@@ -1427,8 +1429,8 @@ template <> struct QuantizedDepthwiseConvKernel<true, 1, 8>
template <> struct QuantizedDepthwiseConvKernel<true, 2, 1>
{
static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
- const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment,
- const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr)
+ const uint8_t *input_ptr, int16_t input_offset, int input_ptr_increment,
+ const uint8_t *filter_ptr, int16_t filter_offset, int32_t *acc_buffer_ptr)
{
(void)input_depth;
(void)depth_multiplier;
@@ -1455,7 +1457,7 @@ template <> struct QuantizedDepthwiseConvKernel<true, 2, 1>
input_u16 = vset_lane_u16((reinterpret_cast<const uint16_t *>(input_ptr))[0], input_u16, 1);
input_ptr += input_ptr_increment;
const int16x4_t input_s16 =
- vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vreinterpret_u8_u16(input_u16))));
+ vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vreinterpret_u8_u16(input_u16))));
const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
// Multiply-accumulate.
@@ -1490,8 +1492,8 @@ template <> struct QuantizedDepthwiseConvKernel<true, 2, 1>
template <> struct QuantizedDepthwiseConvKernel<true, 4, 1>
{
static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
- const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment,
- const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr)
+ const uint8_t *input_ptr, int16_t input_offset, int input_ptr_increment,
+ const uint8_t *filter_ptr, int16_t filter_offset, int32_t *acc_buffer_ptr)
{
(void)input_depth;
(void)depth_multiplier;
@@ -1555,8 +1557,8 @@ template <> struct QuantizedDepthwiseConvKernel<true, 4, 1>
template <> struct QuantizedDepthwiseConvKernel<false, 12, 1>
{
static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
- const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment,
- const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr)
+ const uint8_t *input_ptr, int16_t input_offset, int input_ptr_increment,
+ const uint8_t *filter_ptr, int16_t filter_offset, int32_t *acc_buffer_ptr)
{
(void)input_depth;
(void)depth_multiplier;
@@ -1652,9 +1654,9 @@ void QuantizedDepthwiseConvAccumRow(int stride, int dilation_factor, int input_d
else
{
out_x_loop_start_unclampled =
- (pad_width - dilation_factor * filter_x + stride - 1) / stride;
+ (pad_width - dilation_factor * filter_x + stride - 1) / stride;
out_x_loop_end_unclampled =
- (pad_width + input_width - dilation_factor * filter_x + stride - 1) / stride;
+ (pad_width + input_width - dilation_factor * filter_x + stride - 1) / stride;
}
}
else
@@ -1672,8 +1674,8 @@ void QuantizedDepthwiseConvAccumRow(int stride, int dilation_factor, int input_d
const uint8_t *input_ptr = input_data + in_x_origin * input_depth;
const int num_output_pixels = out_x_loop_end - out_x_loop_start;
QuantizedDepthwiseConvKernel<kAllowStrided, kFixedInputDepth, kFixedDepthMultiplier>::Run(
- num_output_pixels, input_depth, depth_multiplier, input_ptr, input_offset,
- input_ptr_increment, filter_base_ptr, filter_offset, acc_buffer_ptr);
+ num_output_pixels, input_depth, depth_multiplier, input_ptr, input_offset,
+ input_ptr_increment, filter_base_ptr, filter_offset, acc_buffer_ptr);
filter_base_ptr += output_depth;
}
}
@@ -1690,11 +1692,11 @@ inline void QuantizedDepthwiseConvAccumRowGeneric(int stride, int dilation_facto
const uint8_t *filter_base_ptr = filter_data;
for (int filter_x = 0; filter_x < filter_width; ++filter_x)
{
- const int out_x_loop_start = std::max(
- out_x_buffer_start, (pad_width - dilation_factor * filter_x + stride - 1) / stride);
+ const int out_x_loop_start =
+ std::max(out_x_buffer_start, (pad_width - dilation_factor * filter_x + stride - 1) / stride);
const int out_x_loop_end =
- std::min(out_x_buffer_end,
- (pad_width + input_width - dilation_factor * filter_x + stride - 1) / stride);
+ std::min(out_x_buffer_end,
+ (pad_width + input_width - dilation_factor * filter_x + stride - 1) / stride);
int32_t *acc_buffer_ptr = acc_buffer + (out_x_loop_start - out_x_buffer_start) * output_depth;
const int in_x_origin = (out_x_loop_start * stride) - pad_width + dilation_factor * filter_x;
@@ -1813,7 +1815,8 @@ inline void DepthwiseConvGeneral(const DepthwiseConvParams &params, const Shape
const uint8_t *input_data, const Shape &filter_shape,
const uint8_t *filter_data, const Shape &bias_shape,
const int32_t *bias_data, const Shape &output_shape,
- uint8_t *output_data)
+ uint8_t *output_data, int thread_start, int thread_end,
+ int thread_dim)
{
(void)bias_shape;
const int stride_width = params.stride_width;
@@ -1852,6 +1855,8 @@ inline void DepthwiseConvGeneral(const DepthwiseConvParams &params, const Shape
assert(kOutputPixelsInAccBuffer * output_depth <= kAccBufferActualSize);
assert(kAccBufferActualSize <= kAccBufferMaxSize);
assert(kOutputPixelsInAccBuffer >= 1);
+ assert(thread_dim == 0 || thread_dim == 1);
+
UNUSED_RELEASE(kAccBufferActualSize);
// row_accum_func will point to the core accumulation function to be used
@@ -1865,7 +1870,7 @@ inline void DepthwiseConvGeneral(const DepthwiseConvParams &params, const Shape
depth_multiplier == FIXED_DEPTH_MULTIPLIER) \
{ \
row_accum_func = \
- QuantizedDepthwiseConvAccumRow<ALLOW_STRIDED, FIXED_INPUT_DEPTH, FIXED_DEPTH_MULTIPLIER>; \
+ QuantizedDepthwiseConvAccumRow<ALLOW_STRIDED, FIXED_INPUT_DEPTH, FIXED_DEPTH_MULTIPLIER>; \
}
#ifdef USE_NEON
@@ -1919,22 +1924,49 @@ inline void DepthwiseConvGeneral(const DepthwiseConvParams &params, const Shape
const int filter_height_stride = filter_shape.Dims(3) * filter_shape.Dims(2);
// Now that we have determined row_accum_func, we can start work.
- uint8_t *output_ptr = output_data;
- for (int b = 0; b < batches; ++b)
+ int batch_start = 0;
+ int batch_end = batches;
+ int row_start = 0;
+ int row_end = output_height;
+ int output_ptr_offset = 0;
+
+ switch (thread_dim)
+ {
+ case 0:
+      // Multithread along the batch axis
+ assert(thread_start >= 0);
+ assert(thread_end <= batches);
+ batch_start = thread_start;
+ batch_end = thread_end;
+ output_ptr_offset = batch_start * FlatSizeSkipDim(output_shape, 0);
+ break;
+ case 1:
+      // Multithread along the row axis
+ assert(thread_start >= 0);
+ assert(thread_end <= output_height);
+ row_start = thread_start;
+ row_end = thread_end;
+ output_ptr_offset = row_start * output_width * output_depth;
+ break;
+ }
+
+ uint8_t *output_ptr = output_data + output_ptr_offset;
+ int batch_step = (output_height + row_start - row_end) * output_width * output_depth;
+ for (int b = batch_start; b < batch_end; ++b)
{
- for (int out_y = 0; out_y < output_height; ++out_y)
+ for (int out_y = row_start; out_y < row_end; ++out_y)
{
const int in_y_origin = (out_y * stride_height) - pad_height;
const int filter_y_start =
- std::max(0, (-in_y_origin + dilation_height_factor - 1) / dilation_height_factor);
+ std::max(0, (-in_y_origin + dilation_height_factor - 1) / dilation_height_factor);
const int filter_y_end =
- std::min(filter_height, (input_height - in_y_origin + dilation_height_factor - 1) /
- dilation_height_factor);
+ std::min(filter_height, (input_height - in_y_origin + dilation_height_factor - 1) /
+ dilation_height_factor);
for (int out_x_buffer_start = 0; out_x_buffer_start < output_width;
out_x_buffer_start += kOutputPixelsInAccBuffer)
{
const int out_x_buffer_end =
- std::min(output_width, out_x_buffer_start + kOutputPixelsInAccBuffer);
+ std::min(output_width, out_x_buffer_start + kOutputPixelsInAccBuffer);
// We call a 'pixel' a group of activation that share all but the
// 'depth'/'channel' coordinate. num_output_pixels is the number of
// output pixels that we will accumulate in this loop iteration.
@@ -1952,7 +1984,7 @@ inline void DepthwiseConvGeneral(const DepthwiseConvParams &params, const Shape
filter_data + filter_y * filter_height_stride, filter_offset,
out_x_buffer_start, out_x_buffer_end, output_depth, acc_buffer);
}
- // Finished accumulating int32 values. Now need to convert them to
+ // Finished accumulating int32_t values. Now need to convert them to
// the final 8bit form and store them.
const int num_output_values = output_depth * num_output_pixels;
int i = 0;
@@ -2113,9 +2145,111 @@ inline void DepthwiseConvGeneral(const DepthwiseConvParams &params, const Shape
}
}
}
+ output_ptr += batch_step;
}
}
+} // namespace depthwise_conv
+
+// template <DepthwiseConvOutputRounding kOutputRounding>
+inline void DepthwiseConvWithRounding(const DepthwiseConvParams &params, const Shape &input_shape,
+ const uint8_t *input_data, const Shape &filter_shape,
+ const uint8_t *filter_data, const Shape &bias_shape,
+ const int32_t *bias_data, const Shape &output_shape,
+ uint8_t *output_data, int thread_start, int thread_end,
+ int thread_dim)
+{
+ const int depth_multiplier = params.depth_multiplier;
+ const int32_t output_activation_min = params.quantized_activation_min;
+ const int32_t output_activation_max = params.quantized_activation_max;
+ const int dilation_width_factor = params.dilation_width_factor;
+ const int dilation_height_factor = params.dilation_height_factor;
+ assert(dilation_width_factor >= 1);
+ assert(dilation_height_factor >= 1);
+ assert(input_shape.DimensionsCount() == 4);
+ assert(filter_shape.DimensionsCount() == 4);
+ assert(output_shape.DimensionsCount() == 4);
+ assert(output_activation_min <= output_activation_max);
+ const int output_depth = MatchingDim(filter_shape, 3, output_shape, 3);
+ const int input_depth = input_shape.Dims(3);
+ assert(output_depth == input_depth * depth_multiplier);
+ assert(bias_shape.FlatSize() == output_depth);
+
+ UNUSED_RELEASE(depth_multiplier);
+ UNUSED_RELEASE(output_activation_min);
+ UNUSED_RELEASE(output_activation_max);
+ UNUSED_RELEASE(dilation_width_factor);
+ UNUSED_RELEASE(dilation_height_factor);
+ UNUSED_RELEASE(output_depth);
+ UNUSED_RELEASE(input_depth);
+
+// Enable for arm64 except for the Nvidia Linux 4 Tegra (L4T) running on
+// Jetson TX-2. This compiler does not support the offsetof() macro.
+#if defined(__aarch64__) && !defined(GOOGLE_L4T)
+// TODO Use the code below
+// // Dispatch to dot-product 3x3 kernels when supported.
+//
+// ruy::Context *ruy_context = cpu_backend_context->ruy_context();
+// const bool has_dot_product_instructions =
+// ruy_context != nullptr &&
+// (ruy_context->GetRuntimeEnabledPaths() & ruy::Path::kNeonDotprod) != ruy::Path::kNone;
+// if (has_dot_product_instructions)
+// {
+// using optimized_ops::depthwise_conv::DotProduct3x3KernelType;
+// DotProduct3x3KernelType kernel_type =
+// optimized_ops::depthwise_conv::CategorizeDotProductKernel(
+// input_shape, filter_shape, params);
+// if (kernel_type != DotProduct3x3KernelType::kNone)
+// {
+// optimized_ops::depthwise_conv::DepthwiseConvDotProduct3x3<
+// DepthwiseConvImplementation::kUseNeon3x3DotProduct>(params, input_shape, input_data,
+// filter_shape, filter_data,
+// bias_shape,
+// bias_data, output_shape,
+// output_data);
+// return;
+// }
+// }
+//
+// // Dispatch to non-dot-product 3x3 kernels when supported.
+//
+// const int stride_width = params.stride_width;
+// const int stride_height = params.stride_height;
+// const int pad_width = params.padding_values.width;
+// const int pad_height = params.padding_values.height;
+// const int output_shift = params.output_shift;
+//
+// // Call kernel optimized for depthwise convolutions using 3x3 filters if
+// // parameters are supported.
+// if (depthwise_conv::Fast3x3FilterKernelSupported(input_shape, filter_shape, stride_width,
+// stride_height, dilation_width_factor,
+// dilation_height_factor, pad_width, pad_height,
+// depth_multiplier, output_shape, output_shift))
+// {
+// depthwise_conv::DepthwiseConv3x3Filter<kOutputRounding>(
+// params, input_shape, input_data, filter_shape, filter_data, bias_shape, bias_data,
+// output_shape, output_data, thread_start, thread_end, thread_dim);
+// return;
+// }
+#endif
+
+ depthwise_conv::DepthwiseConvGeneral(params, input_shape, input_data, filter_shape, filter_data,
+ bias_shape, bias_data, output_shape, output_data,
+ thread_start, thread_end, thread_dim);
+}
+
+inline void DepthwiseConvImpl(const DepthwiseConvParams &params, const Shape &input_shape,
+ const uint8_t *input_data, const Shape &filter_shape,
+ const uint8_t *filter_data, const Shape &bias_shape,
+ const int32_t *bias_data, const Shape &output_shape,
+ uint8_t *output_data, int thread_start, int thread_end,
+ int thread_dim)
+{
+ return DepthwiseConvWithRounding(params, input_shape, input_data, filter_shape, filter_data,
+ bias_shape, bias_data, output_shape, output_data, thread_start,
+ thread_end, thread_dim);
+}
+
} // namespace optimized
} // namespace cker
} // namespace nnfw
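
The thread_start/thread_end/thread_dim parameters added above let a caller split one quantized depthwise convolution across workers, either along the batch axis (thread_dim == 0) or along output rows (thread_dim == 1). A minimal sketch of row-sliced dispatch follows; the RunDepthwiseConvRowSlices helper and the std::thread driver are illustrative assumptions and not part of this patch, and the sketch assumes the depthwise-conv header above is already included. The runtime itself dispatches through its own CPU backend threadpool.

// Illustrative sketch only: row-sliced dispatch of the DepthwiseConvImpl
// entry point added above. Each worker writes a disjoint [row_start, row_end)
// band of output rows, so only the final join is needed for synchronization.
#include <algorithm>
#include <cstdint>
#include <thread>
#include <vector>

namespace sketch
{

inline void RunDepthwiseConvRowSlices(const nnfw::cker::DepthwiseConvParams &params,
                                      const nnfw::cker::Shape &input_shape,
                                      const uint8_t *input_data,
                                      const nnfw::cker::Shape &filter_shape,
                                      const uint8_t *filter_data,
                                      const nnfw::cker::Shape &bias_shape, const int32_t *bias_data,
                                      const nnfw::cker::Shape &output_shape, uint8_t *output_data,
                                      int num_threads)
{
  const int output_height = output_shape.Dims(1);
  const int rows_per_thread = (output_height + num_threads - 1) / num_threads;
  std::vector<std::thread> workers;
  for (int t = 0; t < num_threads; ++t)
  {
    const int row_start = t * rows_per_thread;
    const int row_end = std::min(output_height, row_start + rows_per_thread);
    if (row_start >= row_end)
      break;
    // thread_dim == 1 selects the "multithread along the row axis" path above.
    workers.emplace_back([&, row_start, row_end]() {
      nnfw::cker::optimized::DepthwiseConvImpl(params, input_shape, input_data, filter_shape,
                                               filter_data, bias_shape, bias_data, output_shape,
                                               output_data, row_start, row_end, /*thread_dim=*/1);
    });
  }
  for (auto &w : workers)
    w.join();
}

} // namespace sketch
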
diff --git a/compute/cker/include/cker/operation/optimized/Gemm.h b/compute/cker/include/cker/operation/optimized/Gemm.h
new file mode 100644
index 000000000..cfebef452
--- /dev/null
+++ b/compute/cker/include/cker/operation/optimized/Gemm.h
@@ -0,0 +1,100 @@
+/*
+ * Copyright (c) 2023 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_OPTIMIZED_GEMM_H__
+#define __NNFW_CKER_OPTIMIZED_GEMM_H__
+
+#include "cker/eigen/eigen_gemm_eigen.h"
+#include "cker/Shape.h"
+#include "cker/Types.h"
+
+#include <ruy/context.h>
+
+namespace nnfw
+{
+namespace cker
+{
+namespace optimized
+{
+
+#if defined(CKER_X86_PLATFORM)
+
+/* From tensorflow/tensorflow/lite/kernels/cpu_backend_gemm_x86.h */
+template <typename LhsScalar, typename RhsScalar, typename AccumScalar, typename DstScalar,
+ QuantizationFlavor quantization_flavor>
+struct GemmImplX86
+{
+ static void Run(const MatrixParams<LhsScalar> &, const LhsScalar *,
+ const MatrixParams<RhsScalar> &, const RhsScalar *,
+ const MatrixParams<DstScalar> &, DstScalar *,
+ const GemmParams<AccumScalar, DstScalar, quantization_flavor> &)
+ {
+ static_assert(
+ std::is_floating_point<LhsScalar>::value && std::is_floating_point<RhsScalar>::value &&
+ std::is_floating_point<AccumScalar>::value && std::is_floating_point<DstScalar>::value &&
+ quantization_flavor != QuantizationFlavor::kFloatingPoint,
+ "GemmImplX86 does not supported types other than float yet.");
+ }
+};
+
+// For float, defer to eigen for now.
+template <> struct GemmImplX86<float, float, float, float, QuantizationFlavor::kFloatingPoint>
+{
+ static void Run(const MatrixParams<float> &lhs_params, const float *lhs_data,
+ const MatrixParams<float> &rhs_params, const float *rhs_data,
+ const MatrixParams<float> &dst_params, float *dst_data,
+ const GemmParams<float, float, QuantizationFlavor::kFloatingPoint> &params)
+ {
+ detail::GemmImplUsingEigen::Run(lhs_params, lhs_data, rhs_params, rhs_data, dst_params,
+ dst_data, params);
+ }
+};
+
+/* From tensorflow/tensorflow/lite/kernels/cpu_backend_gemm.h */
+/* GEMM dispatch implementation for x86.
+ */
+template <typename LhsScalar, typename RhsScalar, typename AccumScalar, typename DstScalar,
+ QuantizationFlavor quantization_flavor>
+struct GemmImpl : GemmImplX86<LhsScalar, RhsScalar, AccumScalar, DstScalar, quantization_flavor>
+{
+};
+
+/* From tensorflow/tensorflow/lite/kernels/cpu_backend_gemm.h */
+template <typename LhsScalar, typename RhsScalar, typename AccumScalar, typename DstScalar,
+ QuantizationFlavor quantization_flavor>
+void Gemm(const MatrixParams<LhsScalar> &lhs_params, const LhsScalar *lhs_data,
+ const MatrixParams<RhsScalar> &rhs_params, const RhsScalar *rhs_data,
+ const MatrixParams<DstScalar> &dst_params, DstScalar *dst_data,
+ const GemmParams<AccumScalar, DstScalar, quantization_flavor> &params)
+{
+ // Generic case: dispatch to any backend as a general GEMM.
+ GemmImpl<LhsScalar, RhsScalar, AccumScalar, DstScalar, quantization_flavor>::Run(
+ lhs_params, lhs_data, rhs_params, rhs_data, dst_params, dst_data, params);
+}
+
+// From tensorflow/tensorflow/lite/kernels/cpu_backend_gemm_params.h
+inline CachePolicy DefaultCachePolicy(bool is_constant_data)
+{
+ return is_constant_data ? CachePolicy::kCacheIfLargeSpeedup : CachePolicy::kNeverCache;
+}
+#endif // CKER_X86_PLATFORM
+
+} // namespace optimized
+} // namespace cker
+} // namespace nnfw
+
+#endif // __NNFW_CKER_OPTIMIZED_GEMM_H__
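
Gemm.h above routes every call through GemmImpl, which simply inherits from GemmImplX86; only the all-float kFloatingPoint specialization has a working body (deferring to GemmImplUsingEigen), and any other instantiation stops at the static_assert. A self-contained sketch of this dispatch-by-specialization pattern follows, using hypothetical Toy* names rather than the cker types.

// Sketch of the dispatch-by-specialization pattern used by Gemm.h. The names
// ToyGemmImpl/ToyGemm are hypothetical; only the structure mirrors the header
// above: the primary template rejects unsupported type combinations at compile
// time, and a full specialization provides the float path.
#include <type_traits>

template <typename Lhs, typename Rhs, typename Dst> struct ToyGemmImpl
{
  static void Run(const Lhs *, const Rhs *, Dst *, int)
  {
    static_assert(std::is_floating_point<Lhs>::value && std::is_floating_point<Rhs>::value &&
                    std::is_floating_point<Dst>::value,
                  "only the float specialization is implemented");
  }
};

// Full specialization: the only combination with a real body.
template <> struct ToyGemmImpl<float, float, float>
{
  static void Run(const float *lhs, const float *rhs, float *dst, int n)
  {
    // Stand-in for detail::GemmImplUsingEigen::Run: a naive n x n product.
    for (int r = 0; r < n; ++r)
      for (int c = 0; c < n; ++c)
      {
        float acc = 0.f;
        for (int k = 0; k < n; ++k)
          acc += lhs[r * n + k] * rhs[k * n + c];
        dst[r * n + c] = acc;
      }
  }
};

// Entry point mirroring optimized::Gemm: forwards to the selected implementation.
template <typename Lhs, typename Rhs, typename Dst>
void ToyGemm(const Lhs *lhs, const Rhs *rhs, Dst *dst, int n)
{
  ToyGemmImpl<Lhs, Rhs, Dst>::Run(lhs, rhs, dst, n);
}

Calling ToyGemm<float, float, float> compiles and runs the float body; instantiating it with integer types fails at compile time with the assertion message, mirroring how the x86 path rejects quantized GEMMs it does not yet implement.
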
diff --git a/compute/cker/include/cker/operation/optimized/OptimizedUtils.h b/compute/cker/include/cker/operation/optimized/OptimizedUtils.h
index ae1f9e78e..f5edc94ab 100644
--- a/compute/cker/include/cker/operation/optimized/OptimizedUtils.h
+++ b/compute/cker/include/cker/operation/optimized/OptimizedUtils.h
@@ -111,7 +111,7 @@ inline void ExtractPatchIntoBufferColumn(const Shape &input_shape, int w, int h,
{
const int bottom_row_elements = (bottom_padding * kwidth * in_depth);
const int bottom_start =
- output_row_offset + ((top_padding + (ih_end - ih_start)) * kwidth * in_depth);
+ output_row_offset + ((top_padding + (ih_end - ih_start)) * kwidth * in_depth);
memset(conv_buffer_data + bottom_start, zero_byte, (bottom_row_elements * sizeof(T)));
}
}
@@ -159,7 +159,7 @@ void DilatedIm2col(const ConvParams &params, const Shape &input_shape, const T *
for (int batch = 0; batch < batches; ++batch)
{
const T zero_byte =
- zero_bytes_len > 1 ? static_cast<T>(zero_bytes[batch]) : static_cast<T>(zero_bytes[0]);
+ zero_bytes_len > 1 ? static_cast<T>(zero_bytes[batch]) : static_cast<T>(zero_bytes[0]);
for (int out_y = 0; out_y < output_height; ++out_y)
{
for (int out_x = 0; out_x < output_width; ++out_x)
diff --git a/compute/cker/include/cker/operation/optimized/integer_ops/DepthwiseConvInt8.h b/compute/cker/include/cker/operation/optimized/integer_ops/DepthwiseConvInt8.h
new file mode 100644
index 000000000..bd8497920
--- /dev/null
+++ b/compute/cker/include/cker/operation/optimized/integer_ops/DepthwiseConvInt8.h
@@ -0,0 +1,2138 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_OPTIMIZED_DEPTHWISE_CONV_INT8_H__
+#define __NNFW_CKER_OPTIMIZED_DEPTHWISE_CONV_INT8_H__
+
+#include "cker/CpuBackendThreadpool.h"
+#include "cker/Shape.h"
+#include "cker/Types.h"
+#include "cker/Utils.h"
+#include "cker/neon/neon_check.h"
+#include "cker/operation/Quantize.h"
+
+#include <fixedpoint/fixedpoint.h>
+#include <public/gemmlowp.h>
+
+namespace nnfw
+{
+namespace cker
+{
+namespace optimized_integer_ops
+{
+
+// Category of depthwise convolution output rounding.
+enum class DepthwiseConvOutputRounding
+{
+ kNone = 0, // Invalid: specific method must be specified.
+ kAwayFromZero, // Original method: exact halves rounded away from zero.
+ kUpward, // Halves towards +infinity: adds 0.5 before truncate.
+ // This is where a future kNearestEven would be placed.
+};
+
+// Category of depthwise convolution depth multiplication.
+enum class DepthwiseConvDepthMultiplication
+{
+ kNoMultiplication = 0, // Depth multiplier = 1.
+ kUnitInputDepth, // Input depth = 1, output depth = depth multiplier.
+};
+
+namespace depthwise_conv
+{
+
+// Implementation of quantized DepthwiseConv
+
+template <bool kAllowStrided, int kFixedInputDepth, int kFixedDepthMultiplier>
+struct QuantizedDepthwiseConvKernel
+{
+};
+
+#ifdef USE_NEON
+template <> struct QuantizedDepthwiseConvKernel<true, 8, 2>
+{
+ static void Run(int num_output_pixels, int /* input_depth */, int /* depth_multiplier */,
+ const int8_t *input_ptr, int16_t input_offset, int input_ptr_increment,
+ const int8_t *filter_ptr, int32_t *acc_buffer_ptr)
+ {
+ // Load the filters.
+ int8x8x2_t filter_s8;
+ filter_s8.val[0] = vld1_s8(filter_ptr);
+ filter_s8.val[1] = vld1_s8(filter_ptr + 8);
+ int16x8_t filter[2];
+ for (int i = 0; i < 2; i++)
+ {
+ filter[i] = vmovl_s8(filter_s8.val[i]);
+ }
+ // Handle one output pixel at a time.
+ for (int outp = 0; outp < num_output_pixels; outp++)
+ {
+ // Load the accumulators from acc_buffer
+ int32x4x2_t acc[2];
+ for (int i = 0; i < 2; i++)
+ {
+ acc[i].val[0] = vld1q_s32(acc_buffer_ptr + 4 * i);
+ acc[i].val[1] = vld1q_s32(acc_buffer_ptr + 4 * i + 8);
+ }
+ // Load the inputs, add input_offset.
+ const int8x8_t input_s8 = vld1_s8(input_ptr);
+ input_ptr += input_ptr_increment;
+ const int16x8_t input_s16 = vmovl_s8(input_s8);
+ const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
+ // Duplicate the input values, 2-fold
+ const int16x8x2_t input_dup2 = vzipq_s16(input, input);
+ // Multiply-accumulate
+ for (int i = 0; i < 2; i++)
+ {
+ acc[0].val[i] =
+ vmlal_s16(acc[0].val[i], vget_low_s16(filter[i]), vget_low_s16(input_dup2.val[i]));
+ acc[1].val[i] =
+ vmlal_s16(acc[1].val[i], vget_high_s16(filter[i]), vget_high_s16(input_dup2.val[i]));
+ }
+ // Store the accumulators back to acc_buffer
+ for (int i = 0; i < 2; i++)
+ {
+ vst1q_s32(acc_buffer_ptr + 4 * i, acc[i].val[0]);
+ vst1q_s32(acc_buffer_ptr + 4 * i + 8, acc[i].val[1]);
+ }
+ acc_buffer_ptr += 16;
+ }
+ }
+};
+
+template <> struct QuantizedDepthwiseConvKernel<false, 8, 1>
+{
+ static void Run(int num_output_pixels, int /* input_depth */, int /* depth_multiplier */,
+ const int8_t *input_ptr, int16_t input_offset, int /* input_ptr_increment */,
+ const int8_t *filter_ptr, int32_t *acc_buffer_ptr)
+ {
+ // Load the filters.
+ const int8x8_t filter_s8 = vld1_s8(filter_ptr);
+ const int16x8_t filter = vmovl_s8(filter_s8);
+
+ int outp = 0;
+ // Handle 2 output pixels at a time.
+ for (; outp <= num_output_pixels - 2; outp += 2)
+ {
+ // Load the accumulators from acc_buffer.
+ int32x4_t acc[4];
+ for (int i = 0; i < 4; i++)
+ {
+ acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
+ }
+ // Load the inputs, add input_offset.
+ int8x8_t input_s8[2];
+ for (int i = 0; i < 2; i++)
+ {
+ input_s8[i] = vld1_s8(input_ptr + 8 * i);
+ }
+ input_ptr += 16;
+ int16x8_t input[2];
+ for (int i = 0; i < 2; i++)
+ {
+ input[i] = vmovl_s8(input_s8[i]);
+ }
+ for (int i = 0; i < 2; i++)
+ {
+ input[i] = vaddq_s16(input[i], vdupq_n_s16(input_offset));
+ }
+ // Multiply-accumulate.
+ acc[0] = vmlal_s16(acc[0], vget_low_s16(filter), vget_low_s16(input[0]));
+ acc[1] = vmlal_s16(acc[1], vget_high_s16(filter), vget_high_s16(input[0]));
+ acc[2] = vmlal_s16(acc[2], vget_low_s16(filter), vget_low_s16(input[1]));
+ acc[3] = vmlal_s16(acc[3], vget_high_s16(filter), vget_high_s16(input[1]));
+ // Store the accumulators back to acc_buffer
+ for (int i = 0; i < 4; i++)
+ {
+ vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
+ }
+ acc_buffer_ptr += 16;
+ }
+ // Handle 1 output pixel at a time.
+ for (; outp < num_output_pixels; outp++)
+ {
+ // Load the accumulators from acc_buffer.
+ int32x4_t acc[2];
+ acc[0] = vld1q_s32(acc_buffer_ptr);
+ acc[1] = vld1q_s32(acc_buffer_ptr + 4);
+
+ // Load the inputs, add input_offset.
+ const int8x8_t input_s8 = vld1_s8(input_ptr);
+ input_ptr += 8;
+ const int16x8_t input_s16 = vmovl_s8(input_s8);
+ const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
+ // Multiply-accumulate.
+ acc[0] = vmlal_s16(acc[0], vget_low_s16(filter), vget_low_s16(input));
+ acc[1] = vmlal_s16(acc[1], vget_high_s16(filter), vget_high_s16(input));
+ // Store the accumulators back to acc_buffer
+ vst1q_s32(acc_buffer_ptr, acc[0]);
+ vst1q_s32(acc_buffer_ptr + 4, acc[1]);
+ acc_buffer_ptr += 8;
+ }
+ }
+};
+
+template <> struct QuantizedDepthwiseConvKernel<false, 4, 2>
+{
+ static void Run(int num_output_pixels, int /* input_depth */, int /* depth_multiplier */,
+ const int8_t *input_ptr, int16_t input_offset, int /* input_ptr_increment */,
+ const int8_t *filter_ptr, int32_t *acc_buffer_ptr)
+ {
+ // Load the filters.
+ const int8x8_t filter_s8 = vld1_s8(filter_ptr);
+ const int16x8_t filter = vmovl_s8(filter_s8);
+
+ int outp = 0;
+ // Handle 2 output pixels at a time.
+ for (; outp <= num_output_pixels - 2; outp += 2)
+ {
+ // Load the accumulators from acc_buffer
+ int32x4_t acc[4];
+ for (int i = 0; i < 4; i++)
+ {
+ acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
+ }
+ // Load the inputs, add input_offset.
+ const int8x8_t input_s8 = vld1_s8(input_ptr);
+ input_ptr += 8;
+ const int16x8_t input_s16 = vmovl_s8(input_s8);
+ const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
+ // Duplicate the input values, 2-fold
+ const int16x8x2_t input_dup2 = vzipq_s16(input, input);
+ // Multiply-accumulate
+ for (int i = 0; i < 2; i++)
+ {
+ acc[2 * i + 0] =
+ vmlal_s16(acc[2 * i + 0], vget_low_s16(filter), vget_low_s16(input_dup2.val[i]));
+ acc[2 * i + 1] =
+ vmlal_s16(acc[2 * i + 1], vget_high_s16(filter), vget_high_s16(input_dup2.val[i]));
+ }
+ // Store the accumulators back to acc_buffer
+ for (int i = 0; i < 4; i++)
+ {
+ vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
+ }
+ acc_buffer_ptr += 16;
+ }
+ // Handle one output pixel at a time.
+ for (; outp < num_output_pixels; outp++)
+ {
+ // Load the accumulators from acc_buffer
+ int32x4_t acc[2];
+ for (int i = 0; i < 2; i++)
+ {
+ acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
+ }
+ // Load the inputs, add input_offset.
+ int8x8_t input_s8 = vdup_n_s8(0);
+ input_s8 = vset_lane_s8(input_ptr[0], input_s8, 0);
+ input_s8 = vset_lane_s8(input_ptr[1], input_s8, 1);
+ input_s8 = vset_lane_s8(input_ptr[2], input_s8, 2);
+ input_s8 = vset_lane_s8(input_ptr[3], input_s8, 3);
+ input_ptr += 4;
+ const int16x4_t input_s16 = vget_low_s16(vmovl_s8(input_s8));
+ const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
+ // Duplicate the input values, 2-fold
+ const int16x4x2_t input_dup2 = vzip_s16(input, input);
+ // Multiply-accumulate
+ acc[0] = vmlal_s16(acc[0], vget_low_s16(filter), input_dup2.val[0]);
+ acc[1] = vmlal_s16(acc[1], vget_high_s16(filter), input_dup2.val[1]);
+ // Store the accumulators back to acc_buffer
+ for (int i = 0; i < 2; i++)
+ {
+ vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
+ }
+ acc_buffer_ptr += 8;
+ }
+ }
+};
+
+template <> struct QuantizedDepthwiseConvKernel<false, 2, 8>
+{
+ static void Run(int num_output_pixels, int /* input_depth */, int /* depth_multiplier */,
+ const int8_t *input_ptr, int16_t input_offset, int /* input_ptr_increment */,
+ const int8_t *filter_ptr, int32_t *acc_buffer_ptr)
+ {
+ // Load the filters.
+ int16x8_t filter[2];
+ for (int i = 0; i < 2; i++)
+ {
+ const int8x8_t filter_s8 = vld1_s8(filter_ptr + 8 * i);
+ filter[i] = vmovl_s8(filter_s8);
+ }
+ int outp = 0;
+ // Handle two output pixels at a time.
+ for (; outp <= num_output_pixels - 2; outp += 2)
+ {
+ // Load the accumulators from acc_buffer.
+ int32x4_t acc[8];
+ for (int i = 0; i < 8; i++)
+ {
+ acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
+ }
+ // Load the inputs, add input_offset.
+ int8x8_t input_s8 = vdup_n_s8(0);
+ input_s8 = vset_lane_s8(input_ptr[0], input_s8, 0);
+ input_s8 = vset_lane_s8(input_ptr[1], input_s8, 1);
+ input_s8 = vset_lane_s8(input_ptr[2], input_s8, 2);
+ input_s8 = vset_lane_s8(input_ptr[3], input_s8, 3);
+ input_ptr += 4;
+ const int16x4_t input_s16 = vget_low_s16(vmovl_s8(input_s8));
+ const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
+ // Multiply-accumulate.
+ acc[0] = vmlal_lane_s16(acc[0], vget_low_s16(filter[0]), input, 0);
+ acc[1] = vmlal_lane_s16(acc[1], vget_high_s16(filter[0]), input, 0);
+ acc[2] = vmlal_lane_s16(acc[2], vget_low_s16(filter[1]), input, 1);
+ acc[3] = vmlal_lane_s16(acc[3], vget_high_s16(filter[1]), input, 1);
+ acc[4] = vmlal_lane_s16(acc[4], vget_low_s16(filter[0]), input, 2);
+ acc[5] = vmlal_lane_s16(acc[5], vget_high_s16(filter[0]), input, 2);
+ acc[6] = vmlal_lane_s16(acc[6], vget_low_s16(filter[1]), input, 3);
+ acc[7] = vmlal_lane_s16(acc[7], vget_high_s16(filter[1]), input, 3);
+ // Store the accumulators back to acc_buffer.
+ for (int i = 0; i < 8; i++)
+ {
+ vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
+ }
+ acc_buffer_ptr += 32;
+ }
+ // Handle one output pixel at a time.
+ for (; outp < num_output_pixels; outp++)
+ {
+ // Load the accumulators from acc_buffer.
+ int32x4_t acc[4];
+ for (int i = 0; i < 4; i++)
+ {
+ acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
+ }
+ // Load the inputs, add input_offset.
+ int8x8_t input_s8 = vdup_n_s8(0);
+ input_s8 = vset_lane_s8(input_ptr[0], input_s8, 0);
+ input_s8 = vset_lane_s8(input_ptr[1], input_s8, 1);
+ input_ptr += 2;
+ const int16x4_t input_s16 = vget_low_s16(vmovl_s8(input_s8));
+ const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
+
+ // Multiply-accumulate.
+ acc[0] = vmlal_lane_s16(acc[0], vget_low_s16(filter[0]), input, 0);
+ acc[1] = vmlal_lane_s16(acc[1], vget_high_s16(filter[0]), input, 0);
+ acc[2] = vmlal_lane_s16(acc[2], vget_low_s16(filter[1]), input, 1);
+ acc[3] = vmlal_lane_s16(acc[3], vget_high_s16(filter[1]), input, 1);
+
+ // Store the accumulators back to acc_buffer.
+ for (int i = 0; i < 4; i++)
+ {
+ vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
+ }
+ acc_buffer_ptr += 16;
+ }
+ }
+};
+
+template <> struct QuantizedDepthwiseConvKernel<false, 2, 2>
+{
+ static void Run(int num_output_pixels, int /* input_depth */, int /* depth_multiplier */,
+ const int8_t *input_ptr, int16_t input_offset, int /* input_ptr_increment */,
+ const int8_t *filter_ptr, int32_t *acc_buffer_ptr)
+ {
+ // Load the filters.
+ int8x8_t filter_s8 = vdup_n_s8(0);
+ filter_s8 = vset_lane_s8(filter_ptr[0], filter_s8, 0);
+ filter_s8 = vset_lane_s8(filter_ptr[1], filter_s8, 1);
+ filter_s8 = vset_lane_s8(filter_ptr[2], filter_s8, 2);
+ filter_s8 = vset_lane_s8(filter_ptr[3], filter_s8, 3);
+ const int16x4_t filter = vget_low_s16(vmovl_s8(filter_s8));
+
+ int outp = 0;
+ // Handle 4 output pixels at a time.
+ for (; outp <= num_output_pixels - 4; outp += 4)
+ {
+ // Load the accumulators from acc_buffer
+ int32x4_t acc[4];
+ for (int i = 0; i < 4; i++)
+ {
+ acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
+ }
+
+ // Load the inputs, add input_offset.
+ const int8x8_t input_s8 = vld1_s8(input_ptr);
+ input_ptr += 8;
+ const int16x8_t input_s16 = vmovl_s8(input_s8);
+ const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
+ // Duplicate the input values, 2-fold
+ const int16x8x2_t input_dup2 = vzipq_s16(input, input);
+ // Multiply-accumulate
+ acc[0] = vmlal_s16(acc[0], filter, vget_low_s16(input_dup2.val[0]));
+ acc[1] = vmlal_s16(acc[1], filter, vget_high_s16(input_dup2.val[0]));
+ acc[2] = vmlal_s16(acc[2], filter, vget_low_s16(input_dup2.val[1]));
+ acc[3] = vmlal_s16(acc[3], filter, vget_high_s16(input_dup2.val[1]));
+ // Store the accumulators back to acc_buffer
+ for (int i = 0; i < 4; i++)
+ {
+ vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
+ }
+ acc_buffer_ptr += 16;
+ }
+ // Handle one output pixel at a time.
+ for (; outp < num_output_pixels; outp++)
+ {
+ // Load the accumulators from acc_buffer
+ int32x4_t acc = vld1q_s32(acc_buffer_ptr);
+
+ int8x8_t input_s8 = vdup_n_s8(0);
+ input_s8 = vset_lane_s8(input_ptr[0], input_s8, 0);
+ input_s8 = vset_lane_s8(input_ptr[1], input_s8, 1);
+ input_ptr += 2;
+ const int16x4_t input_s16 = vget_low_s16(vmovl_s8(input_s8));
+ const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
+ // Duplicate the input values, 2-fold
+ const int16x4_t input_dup2 = vzip_s16(input, input).val[0];
+ // Multiply-accumulate
+ acc = vmlal_s16(acc, filter, input_dup2);
+ // Store the accumulators back to acc_buffer
+ vst1q_s32(acc_buffer_ptr, acc);
+ acc_buffer_ptr += 4;
+ }
+ }
+};
+
+template <> struct QuantizedDepthwiseConvKernel<false, 2, 1>
+{
+ static void Run(int num_output_pixels, int /* input_depth */, int /* depth_multiplier */,
+ const int8_t *input_ptr, int16_t input_offset, int /* input_ptr_increment */,
+ const int8_t *filter_ptr, int32_t *acc_buffer_ptr)
+ {
+ // Load the filters.
+ int8x8_t filter_s8 = vdup_n_s8(0);
+ filter_s8 = vset_lane_s8(filter_ptr[0], filter_s8, 0);
+ filter_s8 = vset_lane_s8(filter_ptr[1], filter_s8, 1);
+ filter_s8 = vset_lane_s8(filter_ptr[0], filter_s8, 2);
+ filter_s8 = vset_lane_s8(filter_ptr[1], filter_s8, 3);
+ const int16x4_t filter = vget_low_s16(vmovl_s8(filter_s8));
+
+ int outp = 0;
+ // Handle 8 output pixels at a time.
+ for (; outp <= num_output_pixels - 8; outp += 8)
+ {
+ // Load the accumulators from acc_buffer.
+ int32x4_t acc[4];
+ for (int i = 0; i < 4; i++)
+ {
+ acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
+ }
+ // Load the inputs, add input_offset.
+ int8x8_t input_s8[2];
+ for (int i = 0; i < 2; i++)
+ {
+ input_s8[i] = vld1_s8(input_ptr + 8 * i);
+ }
+ input_ptr += 16;
+ int16x8_t input[2];
+ for (int i = 0; i < 2; i++)
+ {
+ input[i] = vmovl_s8(input_s8[i]);
+ }
+ for (int i = 0; i < 2; i++)
+ {
+ input[i] = vaddq_s16(input[i], vdupq_n_s16(input_offset));
+ }
+
+ // Multiply-accumulate.
+ acc[0] = vmlal_s16(acc[0], filter, vget_low_s16(input[0]));
+ acc[1] = vmlal_s16(acc[1], filter, vget_high_s16(input[0]));
+ acc[2] = vmlal_s16(acc[2], filter, vget_low_s16(input[1]));
+ acc[3] = vmlal_s16(acc[3], filter, vget_high_s16(input[1]));
+ // Store the accumulators back to acc_buffer.
+ for (int i = 0; i < 4; i++)
+ {
+ vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
+ }
+ acc_buffer_ptr += 16;
+ }
+ // Handle 4 output pixels at a time.
+ for (; outp <= num_output_pixels - 4; outp += 4)
+ {
+ // Load the accumulators from acc_buffer.
+ int32x4_t acc[2];
+ for (int i = 0; i < 2; i++)
+ {
+ acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
+ }
+ // Load the inputs, add input_offset.
+ const int8x8_t input_s8 = vld1_s8(input_ptr);
+ input_ptr += 8;
+ const int16x8_t input_s16 = vmovl_s8(input_s8);
+ const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
+
+ // Multiply-accumulate.
+ acc[0] = vmlal_s16(acc[0], filter, vget_low_s16(input));
+ acc[1] = vmlal_s16(acc[1], filter, vget_high_s16(input));
+ // Store the accumulators back to acc_buffer.
+ for (int i = 0; i < 2; i++)
+ {
+ vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
+ }
+ acc_buffer_ptr += 8;
+ }
+ // Handle 2 output pixels at a time.
+ for (; outp <= num_output_pixels - 2; outp += 2)
+ {
+ // Load the accumulators from acc_buffer.
+ int32x4_t acc = vld1q_s32(acc_buffer_ptr);
+ // Load the inputs, add input_offset.
+ int8x8_t input_s8 = vdup_n_s8(0);
+ input_s8 = vset_lane_s8(input_ptr[0], input_s8, 0);
+ input_s8 = vset_lane_s8(input_ptr[1], input_s8, 1);
+ input_s8 = vset_lane_s8(input_ptr[2], input_s8, 2);
+ input_s8 = vset_lane_s8(input_ptr[3], input_s8, 3);
+ input_ptr += 4;
+ const int16x4_t input_s16 = vget_low_s16(vmovl_s8(input_s8));
+ const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
+
+ // Multiply-accumulate.
+ acc = vmlal_s16(acc, filter, input);
+ // Store the accumulators back to acc_buffer.
+ vst1q_s32(acc_buffer_ptr, acc);
+ acc_buffer_ptr += 4;
+ }
+ // Handle 1 output pixel at a time.
+ for (; outp < num_output_pixels; outp++)
+ {
+ // Load the accumulators from acc_buffer.
+ int32x2_t acc = vld1_s32(acc_buffer_ptr);
+ // Load the inputs, add input_offset.
+ int8x8_t input_s8 = vdup_n_s8(0);
+ input_s8 = vset_lane_s8(input_ptr[0], input_s8, 0);
+ input_s8 = vset_lane_s8(input_ptr[1], input_s8, 1);
+ input_ptr += 2;
+ const int16x4_t input_s16 = vget_low_s16(vmovl_s8(input_s8));
+ const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
+
+ // Multiply-accumulate.
+ acc = vget_low_s32(vmlal_s16(vcombine_s32(acc, acc), filter, input));
+ // Store the accumulators back to acc_buffer.
+ vst1_s32(acc_buffer_ptr, acc);
+ acc_buffer_ptr += 2;
+ }
+ }
+};
+
+template <> struct QuantizedDepthwiseConvKernel<false, 1, 2>
+{
+ static void Run(int num_output_pixels, int /* input_depth */, int /* depth_multiplier */,
+ const int8_t *input_ptr, int16_t input_offset, int /* input_ptr_increment */,
+ const int8_t *filter_ptr, int32_t *acc_buffer_ptr)
+ {
+ // Load the filters.
+ int8x8_t filter_s8 = vdup_n_s8(0);
+ filter_s8 = vset_lane_s8(filter_ptr[0], filter_s8, 0);
+ filter_s8 = vset_lane_s8(filter_ptr[1], filter_s8, 1);
+ filter_s8 = vset_lane_s8(filter_ptr[0], filter_s8, 2);
+ filter_s8 = vset_lane_s8(filter_ptr[1], filter_s8, 3);
+ const int16x4_t filter = vget_low_s16(vmovl_s8(filter_s8));
+
+ int outp = 0;
+ // Handle 8 output pixels at a time.
+ for (; outp <= num_output_pixels - 8; outp += 8)
+ {
+ // Load the accumulators from acc_buffer
+ int32x4_t acc[4];
+ for (int i = 0; i < 4; i++)
+ {
+ acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
+ }
+
+ // Load the inputs, add input_offset.
+ const int8x8_t input_s8 = vld1_s8(input_ptr);
+ input_ptr += 8;
+ const int16x8_t input_s16 = vmovl_s8(input_s8);
+ const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
+ // Duplicate the input values, 2-fold
+ const int16x8x2_t input_dup2 = vzipq_s16(input, input);
+ // Multiply-accumulate
+ acc[0] = vmlal_s16(acc[0], filter, vget_low_s16(input_dup2.val[0]));
+ acc[1] = vmlal_s16(acc[1], filter, vget_high_s16(input_dup2.val[0]));
+ acc[2] = vmlal_s16(acc[2], filter, vget_low_s16(input_dup2.val[1]));
+ acc[3] = vmlal_s16(acc[3], filter, vget_high_s16(input_dup2.val[1]));
+ // Store the accumulators back to acc_buffer
+ for (int i = 0; i < 4; i++)
+ {
+ vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
+ }
+ acc_buffer_ptr += 16;
+ }
+ // Handle one output pixel at a time.
+ for (; outp < num_output_pixels; outp++)
+ {
+ // Load the accumulators from acc_buffer
+ int32x2_t acc = vld1_s32(acc_buffer_ptr);
+
+ // Load the inputs, add input_offset.
+      const int16_t input = *input_ptr++ + input_offset;
+
+ // Multiply-accumulate
+ acc = vget_low_s32(vmlal_n_s16(vcombine_s32(acc, acc), filter, input));
+ // Store the accumulators back to acc_buffer
+ vst1_s32(acc_buffer_ptr, acc);
+ acc_buffer_ptr += 2;
+ }
+ }
+};
+
+template <> struct QuantizedDepthwiseConvKernel<false, 1, 4>
+{
+ static void Run(int num_output_pixels, int /* input_depth */, int /* depth_multiplier */,
+ const int8_t *input_ptr, int16_t input_offset, int /* input_ptr_increment */,
+ const int8_t *filter_ptr, int32_t *acc_buffer_ptr)
+ {
+ // Load the filters.
+ int8x8_t filter_s8 = vdup_n_s8(0);
+ filter_s8 = vset_lane_s8(filter_ptr[0], filter_s8, 0);
+ filter_s8 = vset_lane_s8(filter_ptr[1], filter_s8, 1);
+ filter_s8 = vset_lane_s8(filter_ptr[2], filter_s8, 2);
+ filter_s8 = vset_lane_s8(filter_ptr[3], filter_s8, 3);
+ const int16x4_t filter = vget_low_s16(vmovl_s8(filter_s8));
+
+ int outp = 0;
+ // Handle 8 output pixels at a time.
+ for (; outp <= num_output_pixels - 8; outp += 8)
+ {
+ // Load the accumulators from acc_buffer
+ int32x4_t acc[8];
+ for (int i = 0; i < 8; i++)
+ {
+ acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
+ }
+
+ // Load the inputs, add input_offset.
+ int8x8_t input_s8 = vld1_s8(input_ptr);
+ input_ptr += 8;
+ const int16x8_t input_s16 = vmovl_s8(input_s8);
+ const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
+
+ // Multiply-accumulate
+ acc[0] = vmlal_lane_s16(acc[0], filter, vget_low_s16(input), 0);
+ acc[1] = vmlal_lane_s16(acc[1], filter, vget_low_s16(input), 1);
+ acc[2] = vmlal_lane_s16(acc[2], filter, vget_low_s16(input), 2);
+ acc[3] = vmlal_lane_s16(acc[3], filter, vget_low_s16(input), 3);
+ acc[4] = vmlal_lane_s16(acc[4], filter, vget_high_s16(input), 0);
+ acc[5] = vmlal_lane_s16(acc[5], filter, vget_high_s16(input), 1);
+ acc[6] = vmlal_lane_s16(acc[6], filter, vget_high_s16(input), 2);
+ acc[7] = vmlal_lane_s16(acc[7], filter, vget_high_s16(input), 3);
+
+ // Store the accumulators back to acc_buffer
+ for (int i = 0; i < 8; i++)
+ {
+ vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
+ }
+ acc_buffer_ptr += 32;
+ }
+ // Handle 4 output pixels at a time.
+ for (; outp <= num_output_pixels - 4; outp += 4)
+ {
+ // Load the accumulators from acc_buffer
+ int32x4_t acc[4];
+ for (int i = 0; i < 4; i++)
+ {
+ acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
+ }
+
+ // Load the inputs, add input_offset.
+ int8x8_t input_s8 = vdup_n_s8(0);
+ input_s8 = vset_lane_s8(input_ptr[0], input_s8, 0);
+ input_s8 = vset_lane_s8(input_ptr[1], input_s8, 1);
+ input_s8 = vset_lane_s8(input_ptr[2], input_s8, 2);
+ input_s8 = vset_lane_s8(input_ptr[3], input_s8, 3);
+ input_ptr += 4;
+ const int16x4_t input_s16 = vget_low_s16(vmovl_s8(input_s8));
+ const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
+
+ // Multiply-accumulate
+ acc[0] = vmlal_lane_s16(acc[0], filter, input, 0);
+ acc[1] = vmlal_lane_s16(acc[1], filter, input, 1);
+ acc[2] = vmlal_lane_s16(acc[2], filter, input, 2);
+ acc[3] = vmlal_lane_s16(acc[3], filter, input, 3);
+
+ // Store the accumulators back to acc_buffer
+ for (int i = 0; i < 4; i++)
+ {
+ vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
+ }
+ acc_buffer_ptr += 16;
+ }
+ // Handle one output pixel at a time.
+ for (; outp < num_output_pixels; outp++)
+ {
+ // Load the accumulators from acc_buffer
+ int32x4_t acc = vld1q_s32(acc_buffer_ptr);
+
+ // Load the inputs, add input_offset.
+      const int16_t input = *input_ptr++ + input_offset;
+
+ // Multiply-accumulate
+ acc = vmlal_n_s16(acc, filter, input);
+ // Store the accumulators back to acc_buffer
+ vst1q_s32(acc_buffer_ptr, acc);
+ acc_buffer_ptr += 4;
+ }
+ }
+};
+
+template <> struct QuantizedDepthwiseConvKernel<false, 4, 1>
+{
+ static void Run(int num_output_pixels, int /* input_depth */, int /* depth_multiplier */,
+ const int8_t *input_ptr, int16_t input_offset, int /* input_ptr_increment */,
+ const int8_t *filter_ptr, int32_t *acc_buffer_ptr)
+ {
+ // Load the filters.
+ int8x8_t filter_s8 = vdup_n_s8(0);
+ filter_s8 = vset_lane_s8(filter_ptr[0], filter_s8, 0);
+ filter_s8 = vset_lane_s8(filter_ptr[1], filter_s8, 1);
+ filter_s8 = vset_lane_s8(filter_ptr[2], filter_s8, 2);
+ filter_s8 = vset_lane_s8(filter_ptr[3], filter_s8, 3);
+ const int16x4_t filter = vget_low_s16(vmovl_s8(filter_s8));
+
+ int outp = 0;
+ // Handle 4 output pixels at a time.
+ for (; outp <= num_output_pixels - 4; outp += 4)
+ {
+ // Load the accumulators from acc_buffer
+ int32x4_t acc[4];
+ for (int i = 0; i < 4; i++)
+ {
+ acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
+ }
+ // Load the inputs, add input_offset.
+ int16x8_t input[2];
+ for (int i = 0; i < 2; i++)
+ {
+ const int8x8_t input_s8 = vld1_s8(input_ptr + 8 * i);
+ const int16x8_t input_s16 = vmovl_s8(input_s8);
+ input[i] = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
+ }
+ input_ptr += 16;
+ // Multiply-accumulate
+ for (int i = 0; i < 2; i++)
+ {
+ acc[2 * i + 0] = vmlal_s16(acc[2 * i + 0], filter, vget_low_s16(input[i]));
+ acc[2 * i + 1] = vmlal_s16(acc[2 * i + 1], filter, vget_high_s16(input[i]));
+ }
+ // Store the accumulators back to acc_buffer
+ for (int i = 0; i < 4; i++)
+ {
+ vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
+ }
+ acc_buffer_ptr += 16;
+ }
+ // Handle one output pixel at a time.
+ for (; outp < num_output_pixels; outp++)
+ {
+ // Load the accumulators from acc_buffer
+ int32x4_t acc;
+ acc = vld1q_s32(acc_buffer_ptr);
+
+ // Load the inputs, add input_offset.
+ int8x8_t input_s8 = vdup_n_s8(0);
+ input_s8 = vset_lane_s8(input_ptr[0], input_s8, 0);
+ input_s8 = vset_lane_s8(input_ptr[1], input_s8, 1);
+ input_s8 = vset_lane_s8(input_ptr[2], input_s8, 2);
+ input_s8 = vset_lane_s8(input_ptr[3], input_s8, 3);
+ input_ptr += 4;
+ const int16x4_t input_s16 = vget_low_s16(vmovl_s8(input_s8));
+ const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
+ // Multiply-accumulate
+ acc = vmlal_s16(acc, filter, input);
+ // Store the accumulators back to acc_buffer
+ vst1q_s32(acc_buffer_ptr, acc);
+ acc_buffer_ptr += 4;
+ }
+ }
+};
+
+template <> struct QuantizedDepthwiseConvKernel<false, 4, 4>
+{
+ static void Run(int num_output_pixels, int /* input_depth */, int /* depth_multiplier */,
+ const int8_t *input_ptr, int16_t input_offset, int /* input_ptr_increment */,
+ const int8_t *filter_ptr, int32_t *acc_buffer_ptr)
+ {
+ // Load the filters.
+ int16x8_t filter[2];
+ for (int i = 0; i < 2; i++)
+ {
+ const int8x8_t filter_s8 = vld1_s8(filter_ptr + 8 * i);
+ filter[i] = vmovl_s8(filter_s8);
+ }
+
+ int outp = 0;
+ // Handle 2 output pixels at a time.
+ for (; outp <= num_output_pixels - 2; outp += 2)
+ {
+ // Load the accumulators from acc_buffer
+ int32x4_t acc[8];
+ for (int i = 0; i < 8; i++)
+ {
+ acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
+ }
+
+ // Load the inputs, add input_offset.
+ int8x8_t input_s8 = vld1_s8(input_ptr);
+ input_ptr += 8;
+ const int16x8_t input_s16 = vmovl_s8(input_s8);
+ const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
+
+ // Multiply-accumulate
+ acc[0] = vmlal_lane_s16(acc[0], vget_low_s16(filter[0]), vget_low_s16(input), 0);
+ acc[1] = vmlal_lane_s16(acc[1], vget_high_s16(filter[0]), vget_low_s16(input), 1);
+ acc[2] = vmlal_lane_s16(acc[2], vget_low_s16(filter[1]), vget_low_s16(input), 2);
+ acc[3] = vmlal_lane_s16(acc[3], vget_high_s16(filter[1]), vget_low_s16(input), 3);
+ acc[4] = vmlal_lane_s16(acc[4], vget_low_s16(filter[0]), vget_high_s16(input), 0);
+ acc[5] = vmlal_lane_s16(acc[5], vget_high_s16(filter[0]), vget_high_s16(input), 1);
+ acc[6] = vmlal_lane_s16(acc[6], vget_low_s16(filter[1]), vget_high_s16(input), 2);
+ acc[7] = vmlal_lane_s16(acc[7], vget_high_s16(filter[1]), vget_high_s16(input), 3);
+ // Store the accumulators back to acc_buffer
+ for (int i = 0; i < 8; i++)
+ {
+ vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
+ }
+ acc_buffer_ptr += 32;
+ }
+ // Handle one output pixel at a time.
+ for (; outp < num_output_pixels; outp++)
+ {
+ // Load the accumulators from acc_buffer
+ int32x4_t acc[4];
+ for (int i = 0; i < 4; i++)
+ {
+ acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
+ }
+
+ // Load the inputs, add input_offset.
+ int8x8_t input_s8 = vdup_n_s8(0);
+ input_s8 = vset_lane_s8(input_ptr[0], input_s8, 0);
+ input_s8 = vset_lane_s8(input_ptr[1], input_s8, 1);
+ input_s8 = vset_lane_s8(input_ptr[2], input_s8, 2);
+ input_s8 = vset_lane_s8(input_ptr[3], input_s8, 3);
+ input_ptr += 4;
+ const int16x4_t input_s16 = vget_low_s16(vmovl_s8(input_s8));
+ const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
+
+ // Multiply-accumulate
+ acc[0] = vmlal_lane_s16(acc[0], vget_low_s16(filter[0]), input, 0);
+ acc[1] = vmlal_lane_s16(acc[1], vget_high_s16(filter[0]), input, 1);
+ acc[2] = vmlal_lane_s16(acc[2], vget_low_s16(filter[1]), input, 2);
+ acc[3] = vmlal_lane_s16(acc[3], vget_high_s16(filter[1]), input, 3);
+ // Store the accumulators back to acc_buffer
+ for (int i = 0; i < 4; i++)
+ {
+ vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
+ }
+ acc_buffer_ptr += 16;
+ }
+ }
+};
+
+template <> struct QuantizedDepthwiseConvKernel<true, 0, 3>
+{
+ static void Run(int num_output_pixels, int input_depth, int /* depth_multiplier */,
+ const int8_t *input_ptr, int16_t input_offset, int input_ptr_increment,
+ const int8_t *filter_ptr, int32_t *acc_buffer_ptr)
+ {
+ // We will have to duplicate bytes in a NEON register, 3-fold.
+ // We will do that by register-level table-look-up using VTBL instructions.
+ // Here we prepare the registers containing the table-lookup indices.
+ static const int8_t dup3_indices_array[3][8] = {
+ {0, 0, 0, 1, 1, 1, 2, 2}, {2, 3, 3, 3, 4, 4, 4, 5}, {5, 5, 6, 6, 6, 7, 7, 7}};
+ int8x8_t dup3_indices[3];
+ for (int i = 0; i < 3; i++)
+ {
+ dup3_indices[i] = vld1_s8(dup3_indices_array[i]);
+ }
+
+ // Handle one output pixel at a time.
+ for (int outp = 0; outp < num_output_pixels; outp++)
+ {
+ const int8_t *local_filter_ptr = filter_ptr;
+ const int8_t *local_input_ptr = input_ptr;
+ int ic = 0;
+ // Handle 8 input channels at a time.
+ for (; ic <= input_depth - 8; ic += 8)
+ {
+ // Load the filters.
+ int16x8_t filter[3];
+ int8x8x3_t filter_s8;
+ filter_s8.val[0] = vld1_s8(local_filter_ptr);
+ filter_s8.val[1] = vld1_s8(local_filter_ptr + 8);
+ filter_s8.val[2] = vld1_s8(local_filter_ptr + 16);
+ local_filter_ptr += 24;
+ for (int i = 0; i < 3; i++)
+ {
+ filter[i] = vmovl_s8(filter_s8.val[i]);
+ }
+ // Load the inputs, duplicate 3-fold, add input_offset.
+ const int8x8_t input_s8 = vld1_s8(local_input_ptr);
+ local_input_ptr += 8;
+
+ int8x8_t input_s8_dup3[3];
+ for (int i = 0; i < 3; i++)
+ {
+ input_s8_dup3[i] = vtbl1_s8(input_s8, dup3_indices[i]);
+ }
+ int16x8_t input_dup3[3];
+ for (int i = 0; i < 3; i++)
+ {
+ const int16x8_t input_s16_dup3 = vmovl_s8(input_s8_dup3[i]);
+ input_dup3[i] = vaddq_s16(input_s16_dup3, vdupq_n_s16(input_offset));
+ }
+ // Load the accumulators from acc_buffer
+ int32x4x3_t acc[2];
+ for (int i = 0; i < 2; i++)
+ {
+ acc[i].val[0] = vld1q_s32(acc_buffer_ptr + 4 * i);
+ acc[i].val[1] = vld1q_s32(acc_buffer_ptr + 4 * i + 8);
+ acc[i].val[2] = vld1q_s32(acc_buffer_ptr + 4 * i + 16);
+ }
+ // Multiply-accumulate
+ for (int j = 0; j < 3; j++)
+ {
+ acc[0].val[j] =
+ vmlal_s16(acc[0].val[j], vget_low_s16(input_dup3[j]), vget_low_s16(filter[j]));
+ acc[1].val[j] =
+ vmlal_s16(acc[1].val[j], vget_high_s16(input_dup3[j]), vget_high_s16(filter[j]));
+ }
+ // Store the accumulators back to acc_buffer
+ for (int i = 0; i < 2; i++)
+ {
+ vst1q_s32(acc_buffer_ptr + 4 * i, acc[i].val[0]);
+ vst1q_s32(acc_buffer_ptr + 4 * i + 8, acc[i].val[1]);
+ vst1q_s32(acc_buffer_ptr + 4 * i + 16, acc[i].val[2]);
+ }
+ acc_buffer_ptr += 24;
+ }
+ // Handle one input channel at a time.
+ for (; ic < input_depth; ic++)
+ {
+ const int16_t input_val = *local_input_ptr++ + input_offset;
+ for (int i = 0; i < 3; i++)
+ {
+ *acc_buffer_ptr++ += static_cast<int32_t>(local_filter_ptr[i]) * input_val;
+ }
+ local_filter_ptr += 3;
+ }
+ input_ptr += input_ptr_increment;
+ }
+ }
+};
+
+template <> struct QuantizedDepthwiseConvKernel<true, 0, 2>
+{
+ static void Run(int num_output_pixels, int input_depth, int /* depth_multiplier */,
+ const int8_t *input_ptr, int16_t input_offset, int input_ptr_increment,
+ const int8_t *filter_ptr, int32_t *acc_buffer_ptr)
+ {
+ // Handle one output pixel at a time.
+ for (int outp = 0; outp < num_output_pixels; outp++)
+ {
+ const int8_t *local_filter_ptr = filter_ptr;
+ const int8_t *local_input_ptr = input_ptr;
+ int ic = 0;
+ // Handle 8 input channels at a time.
+ for (; ic <= input_depth - 8; ic += 8)
+ {
+ // Load the filters.
+ int16x8_t filter[2];
+ int8x8x2_t filter_s8;
+ filter_s8.val[0] = vld1_s8(local_filter_ptr);
+ filter_s8.val[1] = vld1_s8(local_filter_ptr + 8);
+ local_filter_ptr += 16;
+ for (int i = 0; i < 2; i++)
+ {
+ filter[i] = vmovl_s8(filter_s8.val[i]);
+ }
+ // Load the inputs, add input_offset, duplicate 2-fold.
+ const int8x8_t input_s8 = vld1_s8(local_input_ptr);
+ local_input_ptr += 8;
+ const int16x8_t input_s16 = vmovl_s8(input_s8);
+ const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
+ const int16x8x2_t input_dup2 = vzipq_s16(input, input);
+ // Load the accumulators from acc_buffer.
+ int32x4x2_t acc[2];
+ for (int i = 0; i < 2; i++)
+ {
+ acc[i].val[0] = vld1q_s32(acc_buffer_ptr + 4 * i);
+ acc[i].val[1] = vld1q_s32(acc_buffer_ptr + 4 * i + 8);
+ }
+ // Multiply-accumulate.
+ for (int j = 0; j < 2; j++)
+ {
+ acc[0].val[j] =
+ vmlal_s16(acc[0].val[j], vget_low_s16(filter[j]), vget_low_s16(input_dup2.val[j]));
+ acc[1].val[j] =
+ vmlal_s16(acc[1].val[j], vget_high_s16(filter[j]), vget_high_s16(input_dup2.val[j]));
+ }
+ // Store the accumulators back to acc_buffer.
+ for (int i = 0; i < 2; i++)
+ {
+ vst1q_s32(acc_buffer_ptr + 4 * i, acc[i].val[0]);
+ vst1q_s32(acc_buffer_ptr + 4 * i + 8, acc[i].val[1]);
+ }
+ acc_buffer_ptr += 16;
+ }
+ // Handle one input channel at a time.
+ for (; ic < input_depth; ic++)
+ {
+ // Load the inputs.
+ const int16_t input_val = *local_input_ptr++ + input_offset;
+ for (int i = 0; i < 2; i++)
+ {
+ *acc_buffer_ptr++ += static_cast<int32_t>(local_filter_ptr[i]) * input_val;
+ }
+ local_filter_ptr += 2;
+ }
+ input_ptr += input_ptr_increment;
+ }
+ }
+};
+
+template <> struct QuantizedDepthwiseConvKernel<true, 0, 1>
+{
+ static void Run(int num_output_pixels, int input_depth, int /* depth_multiplier */,
+ const int8_t *input_ptr, int16_t input_offset, int input_ptr_increment,
+ const int8_t *filter_ptr, int32_t *acc_buffer_ptr)
+ {
+ // Handle one output pixel at a time.
+ for (int outp = 0; outp < num_output_pixels; outp++)
+ {
+ const int8_t *local_filter_ptr = filter_ptr;
+ const int8_t *local_input_ptr = input_ptr;
+ int ic = 0;
+ // Handle 16 input channels at a time.
+ for (; ic <= input_depth - 16; ic += 16)
+ {
+ // Load the filters.
+ int8x8_t filter_s8_0 = vld1_s8(local_filter_ptr + 8 * 0);
+ int8x8_t filter_s8_1 = vld1_s8(local_filter_ptr + 8 * 1);
+ local_filter_ptr += 16;
+ int16x8_t filter_0 = vmovl_s8(filter_s8_0);
+ int16x8_t filter_1 = vmovl_s8(filter_s8_1);
+ // Load the inputs, add input_offset.
+ int8x8_t input_s8_0 = vld1_s8(local_input_ptr + 8 * 0);
+ int8x8_t input_s8_1 = vld1_s8(local_input_ptr + 8 * 1);
+ local_input_ptr += 16;
+ int16x8_t input_0 = vmovl_s8(input_s8_0);
+ int16x8_t input_1 = vmovl_s8(input_s8_1);
+ input_0 = vaddq_s16(input_0, vdupq_n_s16(input_offset));
+ input_1 = vaddq_s16(input_1, vdupq_n_s16(input_offset));
+ // Load the accumulators from acc_buffer
+ int32x4_t acc_0 = vld1q_s32(acc_buffer_ptr + 4 * 0);
+ int32x4_t acc_1 = vld1q_s32(acc_buffer_ptr + 4 * 1);
+ int32x4_t acc_2 = vld1q_s32(acc_buffer_ptr + 4 * 2);
+ int32x4_t acc_3 = vld1q_s32(acc_buffer_ptr + 4 * 3);
+ acc_0 = vmlal_s16(acc_0, vget_low_s16(input_0), vget_low_s16(filter_0));
+ acc_1 = vmlal_s16(acc_1, vget_high_s16(input_0), vget_high_s16(filter_0));
+ acc_2 = vmlal_s16(acc_2, vget_low_s16(input_1), vget_low_s16(filter_1));
+ acc_3 = vmlal_s16(acc_3, vget_high_s16(input_1), vget_high_s16(filter_1));
+ // Store the accumulators back to acc_buffer
+ vst1q_s32(acc_buffer_ptr + 4 * 0, acc_0);
+ vst1q_s32(acc_buffer_ptr + 4 * 1, acc_1);
+ vst1q_s32(acc_buffer_ptr + 4 * 2, acc_2);
+ vst1q_s32(acc_buffer_ptr + 4 * 3, acc_3);
+ acc_buffer_ptr += 16;
+ }
+ // Handle 8 input channels at a time.
+ for (; ic <= input_depth - 8; ic += 8)
+ {
+ // Load the filters.
+ const int8x8_t filter_s8 = vld1_s8(local_filter_ptr);
+ local_filter_ptr += 8;
+ const int16x8_t filter = vmovl_s8(filter_s8);
+ // Load the inputs, add input_offset.
+ const int8x8_t input_s8 = vld1_s8(local_input_ptr);
+ local_input_ptr += 8;
+ const int16x8_t input_s16 = vmovl_s8(input_s8);
+ const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
+ // Load the accumulators from acc_buffer
+ int32x4_t acc[2];
+ for (int i = 0; i < 2; i++)
+ {
+ acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
+ }
+ // Multiply-accumulate
+ acc[0] = vmlal_s16(acc[0], vget_low_s16(input), vget_low_s16(filter));
+ acc[1] = vmlal_s16(acc[1], vget_high_s16(input), vget_high_s16(filter));
+ // Store the accumulators back to acc_buffer
+ for (int i = 0; i < 2; i++)
+ {
+ vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
+ }
+ acc_buffer_ptr += 8;
+ }
+ // Handle one input channel at a time.
+ for (; ic < input_depth; ic++)
+ {
+ const int16_t input_val = *local_input_ptr++ + input_offset;
+ const int16_t filter_val = *local_filter_ptr++;
+ *acc_buffer_ptr++ += static_cast<int32_t>(filter_val) * input_val;
+ }
+ input_ptr += input_ptr_increment;
+ }
+ }
+};
+
+template <> struct QuantizedDepthwiseConvKernel<true, 16, 1>
+{
+ static void Run(int num_output_pixels, int /* input_depth */, int /* depth_multiplier */,
+ const int8_t *input_ptr, int16_t input_offset, int input_ptr_increment,
+ const int8_t *filter_ptr, int32_t *acc_buffer_ptr)
+ {
+ // Load the filters.
+ int8x8_t filter_s8[2];
+ for (int i = 0; i < 2; i++)
+ {
+ filter_s8[i] = vld1_s8(filter_ptr + 8 * i);
+ }
+ int16x8_t filter[2];
+ for (int i = 0; i < 2; i++)
+ {
+ filter[i] = vmovl_s8(filter_s8[i]);
+ }
+ // Handle one output pixel at a time.
+ for (int outp = 0; outp < num_output_pixels; outp++)
+ {
+ // Load the inputs, add input_offset.
+ int8x8_t input_s8[2];
+ for (int i = 0; i < 2; i++)
+ {
+ input_s8[i] = vld1_s8(input_ptr + 8 * i);
+ }
+ input_ptr += input_ptr_increment;
+ int16x8_t input[2];
+ for (int i = 0; i < 2; i++)
+ {
+ input[i] = vmovl_s8(input_s8[i]);
+ }
+ for (int i = 0; i < 2; i++)
+ {
+ input[i] = vaddq_s16(input[i], vdupq_n_s16(input_offset));
+ }
+ // Load the accumulators from acc_buffer
+ int32x4_t acc[4];
+ for (int i = 0; i < 4; i++)
+ {
+ acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
+ }
+ // Multiply-accumulate
+ for (int i = 0; i < 2; i++)
+ {
+ acc[2 * i + 0] = vmlal_s16(acc[2 * i + 0], vget_low_s16(input[i]), vget_low_s16(filter[i]));
+ acc[2 * i + 1] =
+ vmlal_s16(acc[2 * i + 1], vget_high_s16(input[i]), vget_high_s16(filter[i]));
+ }
+ // Store the accumulators back to acc_buffer
+ for (int i = 0; i < 4; i++)
+ {
+ vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
+ }
+ acc_buffer_ptr += 16;
+ }
+ }
+};
+
+template <> struct QuantizedDepthwiseConvKernel<true, 8, 1>
+{
+ static void Run(int num_output_pixels, int /* input_depth */, int /* depth_multiplier */,
+ const int8_t *input_ptr, int16_t input_offset, int input_ptr_increment,
+ const int8_t *filter_ptr, int32_t *acc_buffer_ptr)
+ {
+ // Load the filters.
+ const int8x8_t filter_s8 = vld1_s8(filter_ptr);
+ const int16x8_t filter = vmovl_s8(filter_s8);
+ // Handle one output pixel at a time.
+ for (int outp = 0; outp < num_output_pixels; outp++)
+ {
+ // Load the inputs, add input_offset.
+ const int8x8_t input_s8 = vld1_s8(input_ptr);
+ const int16x8_t input_s16 = vmovl_s8(input_s8);
+ const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
+ // Load the accumulators from acc_buffer
+ int32x4_t acc[2];
+ for (int i = 0; i < 2; i++)
+ {
+ acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
+ }
+ // Multiply-accumulate
+ acc[0] = vmlal_s16(acc[0], vget_low_s16(input), vget_low_s16(filter));
+ acc[1] = vmlal_s16(acc[1], vget_high_s16(input), vget_high_s16(filter));
+ // Store the accumulators back to acc_buffer
+ for (int i = 0; i < 2; i++)
+ {
+ vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
+ }
+ acc_buffer_ptr += 8;
+ input_ptr += input_ptr_increment;
+ }
+ }
+};
+
+template <> struct QuantizedDepthwiseConvKernel<true, 1, 16>
+{
+ static void Run(int num_output_pixels, int /* input_depth */, int /* depth_multiplier */,
+ const int8_t *input_ptr, int16_t input_offset, int input_ptr_increment,
+ const int8_t *filter_ptr, int32_t *acc_buffer_ptr)
+ {
+ // Load the filters.
+ int8x8_t filter_s8[2];
+ for (int i = 0; i < 2; i++)
+ {
+ filter_s8[i] = vld1_s8(filter_ptr + 8 * i);
+ }
+ int16x8_t filter[2];
+ for (int i = 0; i < 2; i++)
+ {
+ filter[i] = vmovl_s8(filter_s8[i]);
+ }
+ // Handle one output pixel at a time.
+ for (int outp = 0; outp < num_output_pixels; outp++)
+ {
+ int8_t input_s8 = *input_ptr;
+ input_ptr += input_ptr_increment;
+ int16_t input = static_cast<int16_t>(input_s8 + input_offset);
+ // Load the accumulators from acc_buffer
+ int32x4_t acc[4];
+ for (int i = 0; i < 4; i++)
+ {
+ acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
+ }
+ // Multiply-accumulate
+ for (int i = 0; i < 2; i++)
+ {
+ acc[2 * i + 0] = vmlal_n_s16(acc[2 * i + 0], vget_low_s16(filter[i]), input);
+ acc[2 * i + 1] = vmlal_n_s16(acc[2 * i + 1], vget_high_s16(filter[i]), input);
+ }
+ // Store the accumulators back to acc_buffer
+ for (int i = 0; i < 4; i++)
+ {
+ vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
+ }
+ acc_buffer_ptr += 16;
+ }
+ }
+};
+
+template <> struct QuantizedDepthwiseConvKernel<true, 1, 32>
+{
+ static void Run(int num_output_pixels, int /* input_depth */, int /* depth_multiplier */,
+ const int8_t *input_ptr, int16_t input_offset, int input_ptr_increment,
+ const int8_t *filter_ptr, int32_t *acc_buffer_ptr)
+ {
+ // Load the filters.
+ int8x8_t filter_s8_0 = vld1_s8(filter_ptr + 8 * 0);
+ int8x8_t filter_s8_1 = vld1_s8(filter_ptr + 8 * 1);
+ int8x8_t filter_s8_2 = vld1_s8(filter_ptr + 8 * 2);
+ int8x8_t filter_s8_3 = vld1_s8(filter_ptr + 8 * 3);
+ int16x8_t filter_0 = vmovl_s8(filter_s8_0);
+ int16x8_t filter_1 = vmovl_s8(filter_s8_1);
+ int16x8_t filter_2 = vmovl_s8(filter_s8_2);
+ int16x8_t filter_3 = vmovl_s8(filter_s8_3);
+ // Handle one output pixel at a time.
+ for (int outp = 0; outp < num_output_pixels; outp++)
+ {
+ int8_t input_s8 = *input_ptr;
+ input_ptr += input_ptr_increment;
+ int16_t input = static_cast<int16_t>(input_s8 + input_offset);
+ // Load the accumulators from acc_buffer
+ int32x4_t acc_0 = vld1q_s32(acc_buffer_ptr + 4 * 0);
+ int32x4_t acc_1 = vld1q_s32(acc_buffer_ptr + 4 * 1);
+ int32x4_t acc_2 = vld1q_s32(acc_buffer_ptr + 4 * 2);
+ int32x4_t acc_3 = vld1q_s32(acc_buffer_ptr + 4 * 3);
+ int32x4_t acc_4 = vld1q_s32(acc_buffer_ptr + 4 * 4);
+ int32x4_t acc_5 = vld1q_s32(acc_buffer_ptr + 4 * 5);
+ int32x4_t acc_6 = vld1q_s32(acc_buffer_ptr + 4 * 6);
+ int32x4_t acc_7 = vld1q_s32(acc_buffer_ptr + 4 * 7);
+ // Multiply-accumulate
+ acc_0 = vmlal_n_s16(acc_0, vget_low_s16(filter_0), input);
+ acc_1 = vmlal_n_s16(acc_1, vget_high_s16(filter_0), input);
+ acc_2 = vmlal_n_s16(acc_2, vget_low_s16(filter_1), input);
+ acc_3 = vmlal_n_s16(acc_3, vget_high_s16(filter_1), input);
+ acc_4 = vmlal_n_s16(acc_4, vget_low_s16(filter_2), input);
+ acc_5 = vmlal_n_s16(acc_5, vget_high_s16(filter_2), input);
+ acc_6 = vmlal_n_s16(acc_6, vget_low_s16(filter_3), input);
+ acc_7 = vmlal_n_s16(acc_7, vget_high_s16(filter_3), input);
+ // Store the accumulators back to acc_buffer
+ vst1q_s32(acc_buffer_ptr + 4 * 0, acc_0);
+ vst1q_s32(acc_buffer_ptr + 4 * 1, acc_1);
+ vst1q_s32(acc_buffer_ptr + 4 * 2, acc_2);
+ vst1q_s32(acc_buffer_ptr + 4 * 3, acc_3);
+ vst1q_s32(acc_buffer_ptr + 4 * 4, acc_4);
+ vst1q_s32(acc_buffer_ptr + 4 * 5, acc_5);
+ vst1q_s32(acc_buffer_ptr + 4 * 6, acc_6);
+ vst1q_s32(acc_buffer_ptr + 4 * 7, acc_7);
+ acc_buffer_ptr += 32;
+ }
+ }
+};
+
+template <> struct QuantizedDepthwiseConvKernel<true, 1, 20>
+{
+ static void Run(int num_output_pixels, int /* input_depth */, int /* depth_multiplier */,
+ const int8_t *input_ptr, int16_t input_offset, int input_ptr_increment,
+ const int8_t *filter_ptr, int32_t *acc_buffer_ptr)
+ {
+ // Load the filters.
+ // NEON wants to load 8 bytes at a time, but 20 is not divisible by 8.
+ // We load the first 16 bytes into filter_s8_{0,1} as usual.
+ // Then we load the last 8 bytes into filter_s8_x (x for 'extra').
+ // This is redundant: the first 4 bytes of filter_s8_x are the same
+ // as the last 4 bytes of filter_s8_1.
+ int8x8_t filter_s8_0 = vld1_s8(filter_ptr + 8 * 0);
+ int8x8_t filter_s8_1 = vld1_s8(filter_ptr + 8 * 1);
+ int8x8_t filter_s8_x = vld1_s8(filter_ptr + 8 * 1 + 4);
+ int16x8_t filter_0 = vmovl_s8(filter_s8_0);
+ int16x8_t filter_1 = vmovl_s8(filter_s8_1);
+ int16x8_t filter_x = vmovl_s8(filter_s8_x);
+ // Handle one output pixel at a time.
+ for (int outp = 0; outp < num_output_pixels; outp++)
+ {
+ int8_t input_s8 = *input_ptr;
+ input_ptr += input_ptr_increment;
+ int16_t input = static_cast<int16_t>(input_s8 + input_offset);
+ // Load the accumulators from acc_buffer
+ int32x4_t acc_0 = vld1q_s32(acc_buffer_ptr + 4 * 0);
+ int32x4_t acc_1 = vld1q_s32(acc_buffer_ptr + 4 * 1);
+ int32x4_t acc_2 = vld1q_s32(acc_buffer_ptr + 4 * 2);
+ int32x4_t acc_3 = vld1q_s32(acc_buffer_ptr + 4 * 3);
+ int32x4_t acc_4 = vld1q_s32(acc_buffer_ptr + 4 * 4);
+ // Multiply-accumulate
+ acc_0 = vmlal_n_s16(acc_0, vget_low_s16(filter_0), input);
+ acc_1 = vmlal_n_s16(acc_1, vget_high_s16(filter_0), input);
+ acc_2 = vmlal_n_s16(acc_2, vget_low_s16(filter_1), input);
+ acc_3 = vmlal_n_s16(acc_3, vget_high_s16(filter_1), input);
+ acc_4 = vmlal_n_s16(acc_4, vget_high_s16(filter_x), input);
+ // Store the accumulators back to acc_buffer
+ vst1q_s32(acc_buffer_ptr + 4 * 0, acc_0);
+ vst1q_s32(acc_buffer_ptr + 4 * 1, acc_1);
+ vst1q_s32(acc_buffer_ptr + 4 * 2, acc_2);
+ vst1q_s32(acc_buffer_ptr + 4 * 3, acc_3);
+ vst1q_s32(acc_buffer_ptr + 4 * 4, acc_4);
+ acc_buffer_ptr += 20;
+ }
+ }
+};
+
+template <> struct QuantizedDepthwiseConvKernel<true, 1, 8>
+{
+ static void Run(int num_output_pixels, int /* input_depth */, int /* depth_multiplier */,
+ const int8_t *input_ptr, int16_t input_offset, int input_ptr_increment,
+ const int8_t *filter_ptr, int32_t *acc_buffer_ptr)
+ {
+ // Load the filters.
+ const int8x8_t filter_s8 = vld1_s8(filter_ptr);
+ const int16x8_t filter = vmovl_s8(filter_s8);
+ // Handle one output pixel at a time.
+ for (int outp = 0; outp < num_output_pixels; outp++)
+ {
+ int8_t input_s8 = *input_ptr;
+ input_ptr += input_ptr_increment;
+ int16_t input = static_cast<int16_t>(input_s8 + input_offset);
+ // Load the accumulators from acc_buffer
+ int32x4_t acc[2];
+ for (int i = 0; i < 2; i++)
+ {
+ acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
+ }
+ // Multiply-accumulate
+ acc[0] = vmlal_n_s16(acc[0], vget_low_s16(filter), input);
+ acc[1] = vmlal_n_s16(acc[1], vget_high_s16(filter), input);
+ // Store the accumulators back to acc_buffer
+ for (int i = 0; i < 2; i++)
+ {
+ vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
+ }
+ acc_buffer_ptr += 8;
+ }
+ }
+};
+
+template <> struct QuantizedDepthwiseConvKernel<true, 2, 1>
+{
+ static void Run(int num_output_pixels, int /* input_depth */, int /* depth_multiplier */,
+ const int8_t *input_ptr, int16_t input_offset, int input_ptr_increment,
+ const int8_t *filter_ptr, int32_t *acc_buffer_ptr)
+ {
+ // Load the filters.
+ int8x8_t filter_s8 = vdup_n_s8(0);
+ filter_s8 = vset_lane_s8(filter_ptr[0], filter_s8, 0);
+ filter_s8 = vset_lane_s8(filter_ptr[1], filter_s8, 1);
+ filter_s8 = vset_lane_s8(filter_ptr[0], filter_s8, 2);
+ filter_s8 = vset_lane_s8(filter_ptr[1], filter_s8, 3);
+ const int16x4_t filter = vget_low_s16(vmovl_s8(filter_s8));
+
+ int outp = 0;
+
+ // Handle 2 output pixels at a time.
+ for (; outp <= num_output_pixels - 2; outp += 2)
+ {
+ // Load the accumulators from acc_buffer.
+ int32x4_t acc = vld1q_s32(acc_buffer_ptr);
+ // Load the inputs, add input_offset.
+ int16x4_t input_s16 = vdup_n_s16(0);
+ input_s16 = vset_lane_s16((reinterpret_cast<const int16_t *>(input_ptr))[0], input_s16, 0);
+ input_ptr += input_ptr_increment;
+ input_s16 = vset_lane_s16((reinterpret_cast<const int16_t *>(input_ptr))[0], input_s16, 1);
+ input_ptr += input_ptr_increment;
+ input_s16 = vget_low_s16(vmovl_s8(vreinterpret_s8_s16(input_s16)));
+ const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
+
+ // Multiply-accumulate.
+ acc = vmlal_s16(acc, filter, input);
+ // Store the accumulators back to acc_buffer.
+ vst1q_s32(acc_buffer_ptr, acc);
+ acc_buffer_ptr += 4;
+ }
+
+ // Handle 1 output pixel at a time.
+ for (; outp < num_output_pixels; outp++)
+ {
+ // Load the accumulators from acc_buffer.
+ int32x2_t acc = vld1_s32(acc_buffer_ptr);
+ // Load the inputs, add input_offset.
+ int8x8_t input_s8 = vdup_n_s8(0);
+ input_s8 = vset_lane_s8(input_ptr[0], input_s8, 0);
+ input_s8 = vset_lane_s8(input_ptr[1], input_s8, 1);
+ input_ptr += input_ptr_increment;
+ const int16x4_t input_s16 = vget_low_s16(vmovl_s8(input_s8));
+ const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
+
+ // Multiply-accumulate.
+ acc = vget_low_s32(vmlal_s16(vcombine_s32(acc, acc), filter, input));
+ // Store the accumulators back to acc_buffer.
+ vst1_s32(acc_buffer_ptr, acc);
+ acc_buffer_ptr += 2;
+ }
+ }
+};
+
+template <> struct QuantizedDepthwiseConvKernel<true, 4, 1>
+{
+ static void Run(int num_output_pixels, int /* input_depth */, int /* depth_multiplier */,
+ const int8_t *input_ptr, int16_t input_offset, int input_ptr_increment,
+ const int8_t *filter_ptr, int32_t *acc_buffer_ptr)
+ {
+ if (num_output_pixels <= 0)
+ {
+ return;
+ }
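+ // Note: the last output pixel is handled separately after the main loop
+ // below, which assumes at least one pixel; with zero pixels that tail code
+ // would read and write out of bounds, hence the early return above.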
+
+ // Load the filters.
+ int8x8_t filter_s8 = vdup_n_s8(0);
+ filter_s8 = vset_lane_s8(filter_ptr[0], filter_s8, 0);
+ filter_s8 = vset_lane_s8(filter_ptr[1], filter_s8, 1);
+ filter_s8 = vset_lane_s8(filter_ptr[2], filter_s8, 2);
+ filter_s8 = vset_lane_s8(filter_ptr[3], filter_s8, 3);
+ const int16x4_t filter = vget_low_s16(vmovl_s8(filter_s8));
+
+ int outp = 0;
+
+ // Handle one output pixel at a time until the second-to-last pixel. We stop
+ // there because each iteration reads eight input values while only
+ // processing four.
+ for (; outp < num_output_pixels - 1; outp++)
+ {
+ // Load the accumulators from acc_buffer
+ int32x4_t acc;
+ acc = vld1q_s32(acc_buffer_ptr);
+
+ // Load the inputs, add input_offset.
+ int8x8_t input_s8 = vld1_s8(input_ptr);
+ input_ptr += input_ptr_increment;
+ const int16x4_t input_s16 = vget_low_s16(vmovl_s8(input_s8));
+ const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
+ // Multiply-accumulate
+ acc = vmlal_s16(acc, filter, input);
+ // Store the accumulators back to acc_buffer
+ vst1q_s32(acc_buffer_ptr, acc);
+ acc_buffer_ptr += 4;
+ }
+
+ // Handle the last output pixel.
+ // Load the accumulators from acc_buffer
+ int32x4_t acc;
+ acc = vld1q_s32(acc_buffer_ptr);
+
+ // Load the inputs, add input_offset.
+ int8x8_t input_s8 = vdup_n_s8(0);
+ input_s8 = vset_lane_s8(input_ptr[0], input_s8, 0);
+ input_s8 = vset_lane_s8(input_ptr[1], input_s8, 1);
+ input_s8 = vset_lane_s8(input_ptr[2], input_s8, 2);
+ input_s8 = vset_lane_s8(input_ptr[3], input_s8, 3);
+ const int16x4_t input_s16 = vget_low_s16(vmovl_s8(input_s8));
+ const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
+ // Multiply-accumulate
+ acc = vmlal_s16(acc, filter, input);
+ // Store the accumulators back to acc_buffer
+ vst1q_s32(acc_buffer_ptr, acc);
+ }
+};
+
+template <> struct QuantizedDepthwiseConvKernel<false, 12, 1>
+{
+ static void Run(int num_output_pixels, int /* input_depth */, int /* depth_multiplier */,
+ const int8_t *input_ptr, int16_t input_offset, int input_ptr_increment,
+ const int8_t *filter_ptr, int32_t *acc_buffer_ptr)
+ {
+ // Load the filters.
+ int8x8_t filter_s8_0 = vld1_s8(filter_ptr);
+ int8x8_t filter_s8_1 = vld1_s8(filter_ptr + 4);
+ int16x8_t filter_s16_0 = vmovl_s8(filter_s8_0);
+ int16x8_t filter_s16_1 = vmovl_s8(filter_s8_1);
+ int16x4_t filter_0 = vget_low_s16(filter_s16_0);
+ int16x4_t filter_1 = vget_high_s16(filter_s16_0);
+ int16x4_t filter_2 = vget_high_s16(filter_s16_1);
+
+ // Handle one output pixel at a time.
+ for (int outp = 0; outp < num_output_pixels; outp++)
+ {
+ // Load the inputs, add input_offset.
+ int8x8_t input_s8_0 = vld1_s8(input_ptr);
+ int8x8_t input_s8_1 = vld1_s8(input_ptr + 4);
+ input_ptr += input_ptr_increment;
+ int16x8_t input_0 = vmovl_s8(input_s8_0);
+ int16x8_t input_1 = vmovl_s8(input_s8_1);
+ input_0 = vaddq_s16(input_0, vdupq_n_s16(input_offset));
+ input_1 = vaddq_s16(input_1, vdupq_n_s16(input_offset));
+
+ // Load the accumulators from acc_buffer
+ int32x4_t acc_0 = vld1q_s32(acc_buffer_ptr + 4 * 0);
+ int32x4_t acc_1 = vld1q_s32(acc_buffer_ptr + 4 * 1);
+ int32x4_t acc_2 = vld1q_s32(acc_buffer_ptr + 4 * 2);
+
+ // Multiply-accumulate
+ acc_0 = vmlal_s16(acc_0, vget_low_s16(input_0), filter_0);
+ acc_1 = vmlal_s16(acc_1, vget_high_s16(input_0), filter_1);
+ acc_2 = vmlal_s16(acc_2, vget_high_s16(input_1), filter_2);
+
+ // Store the accumulators back to acc_buffer
+ vst1q_s32(acc_buffer_ptr + 4 * 0, acc_0);
+ vst1q_s32(acc_buffer_ptr + 4 * 1, acc_1);
+ vst1q_s32(acc_buffer_ptr + 4 * 2, acc_2);
+
+ acc_buffer_ptr += 12;
+ }
+ }
+};
+#endif
+
+// Accumulates the effect of one row of the filter, on a segment of one row
+// of the output, accessing the corresponding one row of the input.
+template <bool kAllowStrided, int kFixedInputDepth, int kFixedDepthMultiplier>
+void QuantizedDepthwiseConvAccumRow(int stride, int dilation_factor, int input_depth,
+ int input_width, const int8_t *input_data, int16_t input_offset,
+ int pad_width, int depth_multiplier, int filter_width,
+ const int8_t *filter_data, int out_x_buffer_start,
+ int out_x_buffer_end, int output_depth, int32_t *acc_buffer)
+{
+ // Consistency check parameters. This is important in particular to ensure
+ // that we keep the number of template instantiations minimal, so we don't
+ // increase binary size unnecessarily.
+ static_assert(kFixedDepthMultiplier || !kFixedInputDepth, "");
+ static_assert(kFixedInputDepth || kAllowStrided, "");
+ assert(stride == 1 || kAllowStrided);
+ if (kFixedInputDepth)
+ {
+ assert(input_depth == kFixedInputDepth);
+ }
+ if (kFixedDepthMultiplier)
+ {
+ assert(depth_multiplier == kFixedDepthMultiplier);
+ }
+ assert(output_depth == input_depth * depth_multiplier);
+ const int input_ptr_increment = stride * input_depth;
+ const int8_t *filter_base_ptr = filter_data;
+ for (int filter_x = 0; filter_x < filter_width; ++filter_x)
+ {
+ // For the current (filter_x, filter_y) point in the filter,
+ // compute the boundaries of the corresponding output row segment.
+ int out_x_loop_start_unclamped = 0;
+ int out_x_loop_end_unclamped = 0;
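+ // Note: the stride==2 and stride==4 cases below are the general
+ // ceiling-division formula specialized with compile-time constants so the
+ // compiler can avoid a runtime integer division; the non-strided path
+ // (stride==1) needs no division at all.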
+ if (kAllowStrided)
+ {
+ if (stride == 2)
+ {
+ out_x_loop_start_unclamped = (pad_width - dilation_factor * filter_x + 1) / 2;
+ out_x_loop_end_unclamped = (pad_width + input_width - dilation_factor * filter_x + 1) / 2;
+ }
+ else if (stride == 4)
+ {
+ out_x_loop_start_unclamped = (pad_width - dilation_factor * filter_x + 3) / 4;
+ out_x_loop_end_unclamped = (pad_width + input_width - dilation_factor * filter_x + 3) / 4;
+ }
+ else
+ {
+ out_x_loop_start_unclamped = (pad_width - dilation_factor * filter_x + stride - 1) / stride;
+ out_x_loop_end_unclamped =
+ (pad_width + input_width - dilation_factor * filter_x + stride - 1) / stride;
+ }
+ }
+ else
+ {
+ out_x_loop_start_unclamped = pad_width - dilation_factor * filter_x;
+ out_x_loop_end_unclamped = pad_width + input_width - dilation_factor * filter_x;
+ }
+ // The kernel will have to iterate on the segment of the
+ // output row that starts at out_x_loop_start and ends at out_x_loop_end.
+ const int out_x_loop_start = std::max(out_x_buffer_start, out_x_loop_start_unclamped);
+ const int out_x_loop_end = std::min(out_x_buffer_end, out_x_loop_end_unclamped);
+
+ int32_t *acc_buffer_ptr = acc_buffer + (out_x_loop_start - out_x_buffer_start) * output_depth;
+ const int in_x_origin = (out_x_loop_start * stride) - pad_width + dilation_factor * filter_x;
+ const int8_t *input_ptr = input_data + in_x_origin * input_depth;
+ const int num_output_pixels = out_x_loop_end - out_x_loop_start;
+ QuantizedDepthwiseConvKernel<kAllowStrided, kFixedInputDepth, kFixedDepthMultiplier>::Run(
+ num_output_pixels, input_depth, depth_multiplier, input_ptr, input_offset,
+ input_ptr_increment, filter_base_ptr, acc_buffer_ptr);
+ filter_base_ptr += output_depth;
+ }
+}
+
+// Generic fallback of QuantizedDepthwiseConvAccumRow: portable, non-templatized.
+inline void QuantizedDepthwiseConvAccumRowGeneric(int stride, int dilation_factor, int input_depth,
+ int input_width, const int8_t *input_data,
+ int16_t input_offset, int pad_width,
+ int depth_multiplier, int filter_width,
+ const int8_t *filter_data, int out_x_buffer_start,
+ int out_x_buffer_end, int output_depth,
+ int32_t *acc_buffer)
+{
+ const int8_t *filter_base_ptr = filter_data;
+ for (int filter_x = 0; filter_x < filter_width; ++filter_x)
+ {
+ const int out_x_loop_start =
+ std::max(out_x_buffer_start, (pad_width - dilation_factor * filter_x + stride - 1) / stride);
+ const int out_x_loop_end =
+ std::min(out_x_buffer_end,
+ (pad_width + input_width - dilation_factor * filter_x + stride - 1) / stride);
+
+ int32_t *acc_buffer_ptr = acc_buffer + (out_x_loop_start - out_x_buffer_start) * output_depth;
+ const int in_x_origin = (out_x_loop_start * stride) - pad_width + dilation_factor * filter_x;
+ const int8_t *input_ptr = input_data + in_x_origin * input_depth;
+ const int input_ptr_increment = (stride - 1) * input_depth;
+ for (int out_x = out_x_loop_start; out_x < out_x_loop_end; out_x++)
+ {
+ const int8_t *filter_ptr = filter_base_ptr;
+ for (int ic = 0; ic < input_depth; ++ic)
+ {
+ const int16_t input_val = *input_ptr++ + input_offset;
+ for (int m = 0; m < depth_multiplier; m++)
+ {
+ const int16_t filter_val = *filter_ptr++;
+ *acc_buffer_ptr++ += static_cast<int32_t>(filter_val) * input_val;
+ }
+ }
+ input_ptr += input_ptr_increment;
+ }
+ filter_base_ptr += output_depth;
+ }
+}
+
+// Initializes the accumulator buffer with bias values.
+inline void DepthwiseConvInitAccBuffer(int num_output_pixels, int output_depth,
+ const int32_t *bias_data, int32_t *acc_buffer)
+{
+ int i = 0;
+#ifdef USE_NEON
+ if (output_depth == 1)
+ {
+ const int32x4_t b = vdupq_n_s32(bias_data[0]);
+ for (; i <= num_output_pixels - 16; i += 16)
+ {
+ vst1q_s32(acc_buffer + i + 0, b);
+ vst1q_s32(acc_buffer + i + 4, b);
+ vst1q_s32(acc_buffer + i + 8, b);
+ vst1q_s32(acc_buffer + i + 12, b);
+ }
+ for (; i <= num_output_pixels - 4; i += 4)
+ {
+ vst1q_s32(acc_buffer + i, b);
+ }
+ }
+ else if (output_depth == 2)
+ {
+ int32x4_t b = vdupq_n_s32(bias_data[0]);
+ b = vsetq_lane_s32(bias_data[1], b, 1);
+ b = vsetq_lane_s32(bias_data[1], b, 3);
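+ // b now holds {bias[0], bias[1], bias[0], bias[1]}: the bias pair for two
+ // consecutive output pixels per 128-bit store.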
+ for (; i <= num_output_pixels - 8; i += 8)
+ {
+ vst1q_s32(acc_buffer + 2 * i + 0, b);
+ vst1q_s32(acc_buffer + 2 * i + 4, b);
+ vst1q_s32(acc_buffer + 2 * i + 8, b);
+ vst1q_s32(acc_buffer + 2 * i + 12, b);
+ }
+ for (; i <= num_output_pixels - 2; i += 2)
+ {
+ vst1q_s32(acc_buffer + 2 * i, b);
+ }
+ }
+ else if (output_depth == 4)
+ {
+ const int32x4_t b = vld1q_s32(bias_data);
+ for (; i <= num_output_pixels - 4; i += 4)
+ {
+ vst1q_s32(acc_buffer + 4 * i + 0, b);
+ vst1q_s32(acc_buffer + 4 * i + 4, b);
+ vst1q_s32(acc_buffer + 4 * i + 8, b);
+ vst1q_s32(acc_buffer + 4 * i + 12, b);
+ }
+ for (; i < num_output_pixels; i++)
+ {
+ vst1q_s32(acc_buffer + 4 * i, b);
+ }
+ }
+ else if (output_depth == 8)
+ {
+ const int32x4_t b0 = vld1q_s32(bias_data);
+ const int32x4_t b1 = vld1q_s32(bias_data + 4);
+ for (; i <= num_output_pixels - 2; i += 2)
+ {
+ vst1q_s32(acc_buffer + 8 * i + 0, b0);
+ vst1q_s32(acc_buffer + 8 * i + 4, b1);
+ vst1q_s32(acc_buffer + 8 * i + 8, b0);
+ vst1q_s32(acc_buffer + 8 * i + 12, b1);
+ }
+ for (; i < num_output_pixels; i++)
+ {
+ vst1q_s32(acc_buffer + 8 * i + 0, b0);
+ vst1q_s32(acc_buffer + 8 * i + 4, b1);
+ }
+ }
+ else if (output_depth == 16)
+ {
+ const int32x4_t b0 = vld1q_s32(bias_data);
+ const int32x4_t b1 = vld1q_s32(bias_data + 4);
+ const int32x4_t b2 = vld1q_s32(bias_data + 8);
+ const int32x4_t b3 = vld1q_s32(bias_data + 12);
+ for (; i < num_output_pixels; i++)
+ {
+ vst1q_s32(acc_buffer + 16 * i + 0, b0);
+ vst1q_s32(acc_buffer + 16 * i + 4, b1);
+ vst1q_s32(acc_buffer + 16 * i + 8, b2);
+ vst1q_s32(acc_buffer + 16 * i + 12, b3);
+ }
+ }
+#endif
+ for (; i < num_output_pixels; i++)
+ {
+ memcpy(acc_buffer + i * output_depth, bias_data, sizeof(acc_buffer[0]) * output_depth);
+ }
+}
+
+inline void DepthwiseConvGeneral(const DepthwiseConvParams &params,
+ const int32_t *output_multiplier, const int32_t *output_shift,
+ const Shape &input_shape, const int8_t *input_data,
+ const Shape &filter_shape, const int8_t *filter_data,
+ const Shape & /* bias_shape */, const int32_t *bias_data,
+ const Shape &output_shape, int8_t *output_data, int thread_start,
+ int thread_end, int thread_dim)
+{
+ const int stride_width = params.stride_width;
+ const int stride_height = params.stride_height;
+ const int pad_width = params.padding_values.width;
+ const int pad_height = params.padding_values.height;
+ const int depth_multiplier = params.depth_multiplier;
+ const int32_t output_activation_min = params.quantized_activation_min;
+ const int32_t output_activation_max = params.quantized_activation_max;
+ const int32_t input_offset = params.input_offset;
+ const int32_t output_offset = params.output_offset;
+ const int dilation_width_factor = params.dilation_width_factor;
+ const int dilation_height_factor = params.dilation_height_factor;
+ const int batches = MatchingDim(input_shape, 0, output_shape, 0);
+ const int output_depth = MatchingDim(filter_shape, 3, output_shape, 3);
+ const int input_height = input_shape.Dims(1);
+ const int input_width = input_shape.Dims(2);
+ const int input_depth = input_shape.Dims(3);
+ const int filter_height = filter_shape.Dims(1);
+ const int filter_width = filter_shape.Dims(2);
+ const int output_rows = output_shape.Dims(1);
+ const int output_width = output_shape.Dims(2);
+
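+ // The accumulation buffer below lives on the stack; each output row is
+ // processed in chunks of kOutputPixelsInAccBuffer pixels so that the
+ // bias-initialized int32 accumulators for a whole chunk fit within
+ // kAccBufferMaxSize entries.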
+ static const int kAccBufferMaxSize = 2048;
+ int32_t acc_buffer[kAccBufferMaxSize];
+ assert(kAccBufferMaxSize >= output_depth);
+ const int kOutputPixelsInAccBuffer = kAccBufferMaxSize / output_depth;
+ const int kAccBufferActualSize = kOutputPixelsInAccBuffer * output_depth;
+ UNUSED_RELEASE(kAccBufferActualSize);
+ assert(kOutputPixelsInAccBuffer * output_depth <= kAccBufferActualSize);
+ assert(kAccBufferActualSize <= kAccBufferMaxSize);
+ assert(kOutputPixelsInAccBuffer >= 1);
+ assert(thread_dim == 0 || thread_dim == 1);
+
+ // row_accum_func will point to the core accumulation function to be used
+ // for this DepthwiseConv op.
+ using row_accum_func_t = decltype(&QuantizedDepthwiseConvAccumRowGeneric);
+ row_accum_func_t row_accum_func = nullptr;
+
+#define TFMINI_USE_DEPTHWISECONV_KERNEL(ALLOW_STRIDED, FIXED_INPUT_DEPTH, FIXED_DEPTH_MULTIPLIER) \
+ if (!row_accum_func && (stride_width == 1 || ALLOW_STRIDED) && \
+ (input_depth == FIXED_INPUT_DEPTH || FIXED_INPUT_DEPTH == 0) && \
+ depth_multiplier == FIXED_DEPTH_MULTIPLIER) \
+ { \
+ row_accum_func = \
+ QuantizedDepthwiseConvAccumRow<ALLOW_STRIDED, FIXED_INPUT_DEPTH, FIXED_DEPTH_MULTIPLIER>; \
+ }
+
+#ifdef USE_NEON
+ // We go over our list of kernels in decreasing order of preference
+ // for the cases where multiple kernels could apply.
+
+ // Start with the fastest kernels: AllowStrided=false, fixed input depth.
+
+ TFMINI_USE_DEPTHWISECONV_KERNEL(false, 1, 2)
+ TFMINI_USE_DEPTHWISECONV_KERNEL(false, 2, 2)
+ TFMINI_USE_DEPTHWISECONV_KERNEL(false, 4, 2)
+ TFMINI_USE_DEPTHWISECONV_KERNEL(false, 1, 4)
+ TFMINI_USE_DEPTHWISECONV_KERNEL(false, 4, 1)
+ TFMINI_USE_DEPTHWISECONV_KERNEL(false, 4, 4)
+ TFMINI_USE_DEPTHWISECONV_KERNEL(false, 8, 1)
+ TFMINI_USE_DEPTHWISECONV_KERNEL(false, 2, 8)
+ TFMINI_USE_DEPTHWISECONV_KERNEL(false, 2, 1)
+ TFMINI_USE_DEPTHWISECONV_KERNEL(false, 12, 1)
+
+ // Next come the strided kernels: AllowStrided=true, fixed input depth.
+ // They are a bit less efficient, but allow stride!=1.
+
+ TFMINI_USE_DEPTHWISECONV_KERNEL(true, 8, 2)
+ TFMINI_USE_DEPTHWISECONV_KERNEL(true, 16, 1)
+ TFMINI_USE_DEPTHWISECONV_KERNEL(true, 1, 16)
+ TFMINI_USE_DEPTHWISECONV_KERNEL(true, 1, 20)
+ TFMINI_USE_DEPTHWISECONV_KERNEL(true, 1, 32)
+ TFMINI_USE_DEPTHWISECONV_KERNEL(true, 1, 8)
+ TFMINI_USE_DEPTHWISECONV_KERNEL(true, 8, 1)
+ TFMINI_USE_DEPTHWISECONV_KERNEL(true, 2, 1)
+ TFMINI_USE_DEPTHWISECONV_KERNEL(true, 4, 1)
+
+ // Finally, the kernels allowing a variable input depth;
+ // these are the least efficient but most general kernels.
+
+ TFMINI_USE_DEPTHWISECONV_KERNEL(true, 0, 1)
+ TFMINI_USE_DEPTHWISECONV_KERNEL(true, 0, 2)
+ TFMINI_USE_DEPTHWISECONV_KERNEL(true, 0, 3)
+#endif // USE_NEON
+
+ // No matching fast kernel found, use slow fallback.
+ if (!row_accum_func)
+ {
+ row_accum_func = QuantizedDepthwiseConvAccumRowGeneric;
+ }
+
+#undef TFMINI_USE_DEPTHWISECONV_KERNEL
+
+ const int input_height_stride = input_shape.Dims(3) * input_shape.Dims(2);
+ const int input_batch_stride = input_height_stride * input_shape.Dims(1);
+ const int filter_height_stride = filter_shape.Dims(3) * filter_shape.Dims(2);
+
+ // Now that we have determined row_accum_func, we can start work.
+ int batch_start = 0;
+ int batch_end = batches;
+ int row_start = 0;
+ int row_end = output_rows;
+ int output_ptr_offset = 0;
+
+ switch (thread_dim)
+ {
+ case 0:
+ assert(thread_start >= 0);
+ assert(thread_end <= batches);
+ batch_start = thread_start;
+ batch_end = thread_end;
+ output_ptr_offset = batch_start * FlatSizeSkipDim(output_shape, 0);
+ break;
+ case 1:
+ assert(thread_start >= 0);
+ assert(thread_end <= output_rows);
+ row_start = thread_start;
+ row_end = thread_end;
+ output_ptr_offset = row_start * output_width * output_depth;
+ break;
+ }
+
+ int8_t *output_ptr = output_data + output_ptr_offset;
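+ // When advancing to the next batch, skip over the output rows that this
+ // thread does not process, so output_ptr lands on row_start of that batch.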
+ int batch_step = (output_rows + row_start - row_end) * output_width * output_depth;
+ for (int b = batch_start; b < batch_end; ++b)
+ {
+ for (int out_y = row_start; out_y < row_end; ++out_y)
+ {
+ const int in_y_origin = (out_y * stride_height) - pad_height;
+ const int filter_y_start =
+ std::max(0, (-in_y_origin + dilation_height_factor - 1) / dilation_height_factor);
+ const int filter_y_end =
+ std::min(filter_height, (input_height - in_y_origin + dilation_height_factor - 1) /
+ dilation_height_factor);
+ for (int out_x_buffer_start = 0; out_x_buffer_start < output_width;
+ out_x_buffer_start += kOutputPixelsInAccBuffer)
+ {
+ const int out_x_buffer_end =
+ std::min(output_width, out_x_buffer_start + kOutputPixelsInAccBuffer);
+ // We call a 'pixel' a group of activation values that share all but the
+ // 'depth'/'channel' coordinate. num_output_pixels is the number of
+ // output pixels that we will accumulate in this loop iteration.
+ const int num_output_pixels = out_x_buffer_end - out_x_buffer_start;
+ // Initialize our local accumulator with the bias values, so we don't
+ // have to add them later.
+ DepthwiseConvInitAccBuffer(num_output_pixels, output_depth, bias_data, acc_buffer);
+ // Accumulation loop. Most of the time should be spent in here.
+ for (int filter_y = filter_y_start; filter_y < filter_y_end; ++filter_y)
+ {
+ const int in_y = in_y_origin + dilation_height_factor * filter_y;
+ row_accum_func(stride_width, dilation_width_factor, input_depth, input_width,
+ input_data + in_y * input_height_stride + b * input_batch_stride,
+ input_offset, pad_width, depth_multiplier, filter_width,
+ filter_data + filter_y * filter_height_stride, out_x_buffer_start,
+ out_x_buffer_end, output_depth, acc_buffer);
+ }
+ // Finished accumulating int32_t values. Now we need to convert them to
+ // the final 8-bit form and store them.
+ const int num_output_values = output_depth * num_output_pixels;
+
+ Quantize(output_multiplier, output_shift, output_depth, num_output_values, output_offset,
+ output_activation_min, output_activation_max, acc_buffer, output_ptr);
+
+ output_ptr += num_output_values;
+ }
+ }
+ output_ptr += batch_step;
+ }
+}
+
+} // namespace depthwise_conv
+
+template <DepthwiseConvOutputRounding kOutputRounding>
+inline void DepthwiseConvWithRounding(const DepthwiseConvParams &params,
+ const int32_t *output_multiplier, const int32_t *output_shift,
+ const Shape &input_shape, const int8_t *input_data,
+ const Shape &filter_shape, const int8_t *filter_data,
+ const Shape &bias_shape, const int32_t *bias_data,
+ const Shape &output_shape, int8_t *output_data,
+ int thread_start, int thread_end, int thread_dim)
+{
+ const int depth_multiplier = params.depth_multiplier;
+ const int dilation_width_factor = params.dilation_width_factor;
+ const int dilation_height_factor = params.dilation_height_factor;
+ UNUSED_RELEASE(depth_multiplier);
+ UNUSED_RELEASE(dilation_width_factor);
+ UNUSED_RELEASE(dilation_height_factor);
+ assert(dilation_width_factor >= 1);
+ assert(dilation_height_factor >= 1);
+ assert(input_shape.DimensionsCount() == 4);
+ assert(filter_shape.DimensionsCount() == 4);
+ assert(output_shape.DimensionsCount() == 4);
+ const int output_depth = MatchingDim(filter_shape, 3, output_shape, 3);
+ const int input_depth = input_shape.Dims(3);
+ UNUSED_RELEASE(output_depth);
+ UNUSED_RELEASE(input_depth);
+ assert(output_depth == input_depth * depth_multiplier);
+ assert(bias_shape.FlatSize() == output_depth);
+
+// TODO Enable and use the code below
+#if 0
+// Enable for arm64 except for the Nvidia Linux 4 Tegra (L4T) running on
+// Jetson TX-2. This compiler does not support the offsetof() macro.
+#if defined(__aarch64__) && !defined(GOOGLE_L4T)
+#if defined(__ANDROID__) && defined(__clang__)
+ CpuFlags cpu_flags;
+ GetCpuFlags(&cpu_flags);
+ const bool has_dot_product_instructions = cpu_flags.neon_dotprod;
+
+ // Dispatch to dot-product 3x3 kernels when supported.
+ if (has_dot_product_instructions)
+ {
+ using optimized_ops::depthwise_conv::DotProduct3x3KernelType;
+ DotProduct3x3KernelType kernel_type = optimized_ops::depthwise_conv::CategorizeDotProductKernel<
+ optimized_ops::depthwise_conv::QuantizationType::kPerChannelInt8>(
+ input_shape, filter_shape, output_shape, params, output_shift);
+ if (kernel_type != DotProduct3x3KernelType::kNone)
+ {
+ DepthwiseConvParams params_copy = params;
+ params_copy.output_shift_per_channel = output_shift;
+ params_copy.output_multiplier_per_channel = output_multiplier;
+ optimized_ops::depthwise_conv::DepthwiseConvDotProduct3x3PerChannel<
+ DepthwiseConvImplementation::kUseNeon3x3DotProduct>(
+ params_copy, input_shape, input_data, filter_shape, filter_data, bias_shape, bias_data,
+ output_shape, output_data, thread_start, thread_end, thread_dim);
+ return;
+ }
+ }
+
+#endif
+ // Dispatch to non-dot-product 3x3 kernels when supported.
+
+ const int stride_width = params.stride_width;
+ const int stride_height = params.stride_height;
+ const int pad_width = params.padding_values.width;
+ const int pad_height = params.padding_values.height;
+
+ // Call kernel optimized for depthwise convolutions using 3x3 filters if
+ // parameters are supported.
+ if (optimized_ops::depthwise_conv::Fast3x3FilterKernelSupported<
+ optimized_ops::depthwise_conv::QuantizationType::kPerChannelInt8>(
+ input_shape, filter_shape, stride_width, stride_height, dilation_width_factor,
+ dilation_height_factor, pad_width, pad_height, depth_multiplier, output_shape, 0,
+ output_shift))
+ {
+ optimized_ops::depthwise_conv::DepthwiseConv3x3FilterPerChannel<
+ DepthwiseConvOutputRounding::kUpward>(
+ params, output_multiplier, output_shift, input_shape, input_data, filter_shape, filter_data,
+ bias_shape, bias_data, output_shape, output_data, thread_start, thread_end, thread_dim);
+ return;
+ }
+#endif
+
+#endif /* end of if 0 */
+
+ depthwise_conv::DepthwiseConvGeneral(
+ params, output_multiplier, output_shift, input_shape, input_data, filter_shape, filter_data,
+ bias_shape, bias_data, output_shape, output_data, thread_start, thread_end, thread_dim);
+}
+
+inline void DepthwiseConvImpl(const DepthwiseConvParams &params, const int32_t *output_multiplier,
+ const int32_t *output_shift, const Shape &input_shape,
+ const int8_t *input_data, const Shape &filter_shape,
+ const int8_t *filter_data, const Shape &bias_shape,
+ const int32_t *bias_data, const Shape &output_shape,
+ int8_t *output_data, int thread_start, int thread_end, int thread_dim)
+{
+ return DepthwiseConvWithRounding<DepthwiseConvOutputRounding::kAwayFromZero>(
+ params, output_multiplier, output_shift, input_shape, input_data, filter_shape, filter_data,
+ bias_shape, bias_data, output_shape, output_data, thread_start, thread_end, thread_dim);
+}
+
+template <typename T, typename TS> struct DepthwiseConvWorkerTask : cpu_backend_threadpool::Task
+{
+ DepthwiseConvWorkerTask(const DepthwiseConvParams &params, const int32_t *output_multiplier,
+ const int32_t *output_shift, const Shape &input_shape,
+ const T *input_data, const Shape &filter_shape, const T *filter_data,
+ const Shape &bias_shape, const TS *bias_data, const Shape &output_shape,
+ T *output_data, int thread_start, int thread_end, int thread_dim)
+ : params_(params), output_multiplier_(output_multiplier), output_shift_(output_shift),
+ input_shape_(input_shape), input_data_(input_data), filter_shape_(filter_shape),
+ filter_data_(filter_data), bias_shape_(bias_shape), bias_data_(bias_data),
+ output_shape_(output_shape), output_data_(output_data), thread_start_(thread_start),
+ thread_end_(thread_end), thread_dim_(thread_dim)
+ {
+ }
+
+ void Run() override
+ {
+ DepthwiseConvImpl(params_, output_multiplier_, output_shift_, input_shape_, input_data_,
+ filter_shape_, filter_data_, bias_shape_, bias_data_, output_shape_,
+ output_data_, thread_start_, thread_end_, thread_dim_);
+ }
+
+private:
+ const DepthwiseConvParams &params_;
+ const int32_t *output_multiplier_;
+ const int32_t *output_shift_;
+ const Shape &input_shape_;
+ const T *input_data_;
+ const Shape &filter_shape_;
+ const T *filter_data_;
+ const Shape &bias_shape_;
+ const TS *bias_data_;
+ const Shape &output_shape_;
+ T *output_data_;
+ int thread_start_;
+ int thread_end_;
+ int thread_dim_;
+};
+
+inline int HowManyConvThreads(const Shape &output_shape, const Shape &filter_shape, int thread_dim)
+{
+ constexpr int kMinMulPerThread = 8;
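+ // Heuristic: each thread should handle enough output units to perform at
+ // least roughly kMinMulPerThread multiplications, so tiny workloads are not
+ // split across too many threads.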
+ const int output_units = output_shape.Dims(thread_dim);
+ const int filter_height = filter_shape.Dims(1);
+ const int filter_width = filter_shape.Dims(2);
+ const int num_mul_per_unit =
+ FlatSizeSkipDim(output_shape, thread_dim) * filter_height * filter_width;
+ const int min_units_per_thread = kMinMulPerThread / num_mul_per_unit + 1;
+ int thread_count = output_units / min_units_per_thread;
+ return thread_count;
+}
+
+inline void DepthwiseConvPerChannel(const DepthwiseConvParams &params,
+ const int32_t *output_multiplier, const int32_t *output_shift,
+ const Shape &input_shape, const int8_t *input_data,
+ const Shape &filter_shape, const int8_t *filter_data,
+ const Shape &bias_shape, const int32_t *bias_data,
+ const Shape &output_shape, int8_t *output_data,
+ ruy::Context *ruy_context)
+{
+ UNUSED_ALL(params, output_multiplier, output_shift, input_shape, input_data, filter_shape,
+ filter_data, bias_shape, bias_data, output_shape, output_data, ruy_context);
+
+ assert(input_shape.DimensionsCount() == 4);
+ assert(filter_shape.DimensionsCount() == 4);
+ assert(output_shape.DimensionsCount() == 4);
+
+ const int output_batches = output_shape.Dims(0);
+ const int output_rows = output_shape.Dims(1);
+ int thread_count_batch = HowManyConvThreads(output_shape, filter_shape, 0);
+ int thread_count_row = HowManyConvThreads(output_shape, filter_shape, 1);
+ int thread_dim, thread_count, thread_dim_size;
+ if (thread_count_batch > thread_count_row)
+ {
+ thread_dim = 0;
+ thread_dim_size = output_batches;
+ thread_count = thread_count_batch;
+ }
+ else
+ {
+ thread_dim = 1;
+ thread_dim_size = output_rows;
+ thread_count = thread_count_row;
+ }
+
+ // NOTE Borrow the ruy::Context to get its max_num_threads setting
+ // TODO Define and use max_num_threads for CPU backend
+ const int max_threads = ruy_context->max_num_threads();
+ thread_count = std::max(1, std::min(thread_count, max_threads));
+
+ if (thread_count == 1)
+ {
+ DepthwiseConvImpl(params, output_multiplier, output_shift, input_shape, input_data,
+ filter_shape, filter_data, bias_shape, bias_data, output_shape, output_data,
+ /*thread_start=*/0,
+ /*thread_end=*/output_rows, /*thread_dim=*/1);
+ }
+ else
+ {
+ std::vector<DepthwiseConvWorkerTask<int8_t, int32_t>> tasks;
+ // TODO(b/131746020) don't create new heap allocations every time.
+ // At least we make it a single heap allocation by using reserve().
+ tasks.reserve(thread_count);
+ int thread_start = 0;
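+ // Split the remaining units evenly among the remaining threads; the integer
+ // division keeps the partitions contiguous and covers every unit exactly once.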
+ for (int i = 0; i < thread_count; ++i)
+ {
+ int thread_end = thread_start + (thread_dim_size - thread_start) / (thread_count - i);
+ tasks.emplace_back(params, output_multiplier, output_shift, input_shape, input_data,
+ filter_shape, filter_data, bias_shape, bias_data, output_shape,
+ output_data, thread_start, thread_end, thread_dim);
+ thread_start = thread_end;
+ }
+ cpu_backend_threadpool::Execute(tasks.size(), tasks.data(), ruy_context);
+ }
+}
+
+} // namespace optimized_integer_ops
+} // namespace cker
+} // namespace nnfw
+
+#endif // __NNFW_CKER_OPTIMIZED_DEPTHWISE_CONV_INT8_H__
diff --git a/compute/cker/include/cker/operation/reference/BatchMatMul.h b/compute/cker/include/cker/operation/reference/BatchMatMul.h
index e8ffd4014..1b3020de2 100644
--- a/compute/cker/include/cker/operation/reference/BatchMatMul.h
+++ b/compute/cker/include/cker/operation/reference/BatchMatMul.h
@@ -87,9 +87,8 @@ inline void BatchMatMul(const Shape &lhs_shape, const float *lhs_data, const Sha
{
const float *lhs_ptr2 = lhs_ptr1 + b2 * lhs_ext2;
const float *rhs_ptr2 = rhs_ptr1 + b2 * rhs_ext2;
- float *out_ptr =
- output_data +
- ((b0 * batch_dim1 * batch_dim2) + b1 * batch_dim2 + b2) * lhs_rows * rhs_cols;
+ float *out_ptr = output_data + ((b0 * batch_dim1 * batch_dim2) + b1 * batch_dim2 + b2) *
+ lhs_rows * rhs_cols;
for (int j = 0; j < rhs_cols; ++j)
{
for (int i = 0; i < lhs_rows; ++i)
diff --git a/compute/cker/include/cker/operation/reference/BinaryArithmeticOps.h b/compute/cker/include/cker/operation/reference/BinaryArithmeticOps.h
index f7e39248c..96e1d9127 100644
--- a/compute/cker/include/cker/operation/reference/BinaryArithmeticOps.h
+++ b/compute/cker/include/cker/operation/reference/BinaryArithmeticOps.h
@@ -56,28 +56,22 @@ inline void BinaryArithmeticOp(const BinaryArithmeticOpParam &params, const Shap
const int size = MatchingElementsSize(input1_shape, input2_shape, output_shape);
for (int i = 0; i < size; i++)
{
- output_data[i] =
- ActivationFunctionWithMinMax(fn(input1_data[i], input2_data[i]),
- params.float_activation_min, params.float_activation_max);
+ output_data[i] = ActivationFunctionWithMinMax(
+ fn(input1_data[i], input2_data[i]), params.float_activation_min, params.float_activation_max);
}
}
template <typename T>
-inline void BroadcastBinaryArithmeticOpSlowQuant8(
- const BinaryArithmeticOpParam &params, const Shape &input1_shape, const T *input1_data,
- const Shape &input2_shape, const T *input2_data, const Shape &output_shape, T *output_data,
- const std::function<T(const BinaryArithmeticOpParam &params, const T &, const T &)> &fn)
+inline typename std::enable_if_t<is_quant8<T>::value> BroadcastBinaryArithmeticOpSlow(
+ const BinaryArithmeticOpParam &params, const Shape &input1_shape, const T *input1_data,
+ const Shape &input2_shape, const T *input2_data, const Shape &output_shape, T *output_data,
+ const std::function<T(const BinaryArithmeticOpParam &params, const T &, const T &)> &fn)
{
NdArrayDesc<4> desc1;
NdArrayDesc<4> desc2;
NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1, &desc2);
const Shape extended_output_shape = Shape::ExtendedShape(4, output_shape);
- if ((params.quantized_activation_min < 0) && (params.quantized_activation_max > 255))
- {
- throw std::runtime_error{"Support only for Quant8."};
- }
-
// Comment from tensorflow lite:
//
// In Tensorflow, the dimensions are canonically named (batch_number, row,
@@ -99,11 +93,10 @@ inline void BroadcastBinaryArithmeticOpSlowQuant8(
{
for (int c = 0; c < extended_output_shape.Dims(3); ++c)
{
- output_data[Offset(extended_output_shape, b, y, x, c)] =
- ActivationFunctionWithMinMax<uint8_t>(
- fn(params, input1_data[SubscriptToIndex(desc1, b, y, x, c)],
- input2_data[SubscriptToIndex(desc2, b, y, x, c)]),
- params.quantized_activation_min, params.quantized_activation_max);
+ output_data[Offset(extended_output_shape, b, y, x, c)] = ActivationFunctionWithMinMax<T>(
+ fn(params, input1_data[SubscriptToIndex(desc1, b, y, x, c)],
+ input2_data[SubscriptToIndex(desc2, b, y, x, c)]),
+ params.quantized_activation_min, params.quantized_activation_max);
}
}
}
@@ -143,9 +136,9 @@ inline void BroadcastBinaryArithmeticOpSlow(const BinaryArithmeticOpParam &param
for (int c = 0; c < extended_output_shape.Dims(3); ++c)
{
output_data[Offset(extended_output_shape, b, y, x, c)] = ActivationFunctionWithMinMax<T>(
- fn(input1_data[SubscriptToIndex(desc1, b, y, x, c)],
- input2_data[SubscriptToIndex(desc2, b, y, x, c)]),
- params.quantized_activation_min, params.quantized_activation_max);
+ fn(input1_data[SubscriptToIndex(desc1, b, y, x, c)],
+ input2_data[SubscriptToIndex(desc2, b, y, x, c)]),
+ params.quantized_activation_min, params.quantized_activation_max);
}
}
}
@@ -154,9 +147,9 @@ inline void BroadcastBinaryArithmeticOpSlow(const BinaryArithmeticOpParam &param
template <>
inline void BroadcastBinaryArithmeticOpSlow(
- const BinaryArithmeticOpParam &params, const Shape &input1_shape, const float *input1_data,
- const Shape &input2_shape, const float *input2_data, const Shape &output_shape,
- float *output_data, const std::function<float(const float &, const float &)> &fn)
+ const BinaryArithmeticOpParam &params, const Shape &input1_shape, const float *input1_data,
+ const Shape &input2_shape, const float *input2_data, const Shape &output_shape,
+ float *output_data, const std::function<float(const float &, const float &)> &fn)
{
NdArrayDesc<4> desc1;
NdArrayDesc<4> desc2;
@@ -171,10 +164,10 @@ inline void BroadcastBinaryArithmeticOpSlow(
{
for (int c = 0; c < extended_output_shape.Dims(3); ++c)
{
- output_data[Offset(extended_output_shape, b, y, x, c)] = ActivationFunctionWithMinMax(
- fn(input1_data[SubscriptToIndex(desc1, b, y, x, c)],
- input2_data[SubscriptToIndex(desc2, b, y, x, c)]),
- params.float_activation_min, params.float_activation_max);
+ output_data[Offset(extended_output_shape, b, y, x, c)] =
+ ActivationFunctionWithMinMax(fn(input1_data[SubscriptToIndex(desc1, b, y, x, c)],
+ input2_data[SubscriptToIndex(desc2, b, y, x, c)]),
+ params.float_activation_min, params.float_activation_max);
}
}
}
diff --git a/compute/cker/include/cker/operation/reference/Conv.h b/compute/cker/include/cker/operation/reference/Conv.h
index 86e8b5143..e316083a5 100644
--- a/compute/cker/include/cker/operation/reference/Conv.h
+++ b/compute/cker/include/cker/operation/reference/Conv.h
@@ -98,8 +98,8 @@ inline void Conv(const ConvParams &params, const Shape &input_shape, const float
bias_value = bias_data[out_channel];
}
output_data[Offset(output_shape, batch, out_y, out_x, out_channel)] =
- ActivationFunctionWithMinMax(total + bias_value, output_activation_min,
- output_activation_max);
+ ActivationFunctionWithMinMax(total + bias_value, output_activation_min,
+ output_activation_max);
}
}
}
@@ -183,7 +183,213 @@ inline void Conv(const ConvParams &params, const Shape &input_shape, const uint8
acc = std::max(acc, output_activation_min);
acc = std::min(acc, output_activation_max);
output_data[Offset(output_shape, batch, out_y, out_x, out_channel)] =
- static_cast<uint8_t>(acc);
+ static_cast<uint8_t>(acc);
+ }
+ }
+ }
+ }
+}
+
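+// Per-channel quantized Conv reference kernel. When is_asymmetric is true,
+// each output channel's filter zero point (filter_zeropoint[out_channel]) is
+// subtracted from the filter values before accumulation; otherwise the filter
+// is treated as symmetric and filter_zeropoint is unused.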
+template <typename T, bool is_asymmetric>
+inline void Conv(const ConvParams &params, const int32_t *output_multiplier,
+ const int32_t *output_shift, const Shape &input_shape, const T *input_data,
+ const Shape &filter_shape, const T *filter_data, const int32_t *filter_zeropoint,
+ const Shape &bias_shape, const int32_t *bias_data, const Shape &output_shape,
+ T *output_data)
+
+{
+ UNUSED_RELEASE(bias_shape);
+ // Get parameters.
+ const int32_t input_offset = params.input_offset; // r = s(q - Z)
+ const int stride_width = params.stride_width;
+ const int stride_height = params.stride_height;
+ const int dilation_width_factor = params.dilation_width_factor;
+ const int dilation_height_factor = params.dilation_height_factor;
+ const int pad_width = params.padding_values.width;
+ const int pad_height = params.padding_values.height;
+ const int32_t output_offset = params.output_offset;
+
+ // Set min and max value of the output.
+ const int32_t output_activation_min = params.quantized_activation_min;
+ const int32_t output_activation_max = params.quantized_activation_max;
+
+ // Consistency check.
+ assert(output_activation_min < output_activation_max);
+ assert(input_shape.DimensionsCount() == 4);
+ assert(filter_shape.DimensionsCount() == 4);
+ assert(output_shape.DimensionsCount() == 4);
+ const int batches = MatchingDim(input_shape, 0, output_shape, 0);
+ const int input_depth = MatchingDim(input_shape, 3, filter_shape, 3);
+ const int output_depth = MatchingDim(filter_shape, 0, output_shape, 3);
+ if (bias_data)
+ {
+ assert(bias_shape.FlatSize() == output_depth);
+ }
+
+ // Check dimensions of the tensors.
+ const int input_height = input_shape.Dims(1);
+ const int input_width = input_shape.Dims(2);
+ const int filter_height = filter_shape.Dims(1);
+ const int filter_width = filter_shape.Dims(2);
+ const int output_height = output_shape.Dims(1);
+ const int output_width = output_shape.Dims(2);
+ for (int batch = 0; batch < batches; ++batch)
+ {
+ for (int out_y = 0; out_y < output_height; ++out_y)
+ {
+ const int in_y_origin = (out_y * stride_height) - pad_height;
+ for (int out_x = 0; out_x < output_width; ++out_x)
+ {
+ const int in_x_origin = (out_x * stride_width) - pad_width;
+ for (int out_channel = 0; out_channel < output_depth; ++out_channel)
+ {
+ int32_t acc = 0;
+ for (int filter_y = 0; filter_y < filter_height; ++filter_y)
+ {
+ const int in_y = in_y_origin + dilation_height_factor * filter_y;
+ for (int filter_x = 0; filter_x < filter_width; ++filter_x)
+ {
+ const int in_x = in_x_origin + dilation_width_factor * filter_x;
+
+ // Zero padding by omitting the areas outside the image.
+ const bool is_point_inside_image =
+ (in_x >= 0) && (in_x < input_width) && (in_y >= 0) && (in_y < input_height);
+
+ if (!is_point_inside_image)
+ {
+ continue;
+ }
+
+ for (int in_channel = 0; in_channel < input_depth; ++in_channel)
+ {
+ const T input_val = input_data[Offset(input_shape, batch, in_y, in_x, in_channel)];
+ const T filter_val =
+ filter_data[Offset(filter_shape, out_channel, filter_y, filter_x, in_channel)];
+ if (is_asymmetric)
+ {
+ const int32_t filter_offset = -filter_zeropoint[out_channel];
+ acc += (filter_val + filter_offset) * (input_val + input_offset);
+ }
+ else
+ {
+ // Accumulate with 32 bits accumulator.
+ // In the nudging process during model quantization, we force
+ // the real value 0.0 to be represented by a quantized value. This
+ // guarantees that the input_offset fits in an int8_t, even though
+ // it is represented using int32_t. int32_t += int8_t *
+ // (int8_t - int8_t) so the highest value we can get from each
+ // accumulation is [-127, 127] * ([-128, 127] -
+ // [-128, 127]), which is [-32512, 32512]. log2(32512)
+ // = 14.98, which means we can accumulate at least 2^16
+ // multiplications without overflow. The accumulator is
+ // applied to a filter so the accumulation logic will hold as
+ // long as the filter size (filter_y * filter_x * in_channel)
+ // does not exceed 2^16, which is the case in all the models
+ // we have seen so far.
+ // TODO(jianlijianli): Add a check to make sure the
+ // accumulator depth is smaller than 2^16.
+ acc += filter_val * (input_val + input_offset);
+ UNUSED_RELEASE(filter_zeropoint);
+ }
+ }
+ }
+ }
+
+ if (bias_data)
+ {
+ acc += bias_data[out_channel];
+ }
+ acc = MultiplyByQuantizedMultiplier(acc, output_multiplier[out_channel],
+ output_shift[out_channel]);
+ acc += output_offset;
+ acc = std::max(acc, output_activation_min);
+ acc = std::min(acc, output_activation_max);
+ output_data[Offset(output_shape, batch, out_y, out_x, out_channel)] = static_cast<T>(acc);
+ }
+ }
+ }
+ }
+}
+
+// Slightly modified from tflite 2.13.0 HybridConvPerChannel
+// im2col and im2col_shape are removed since they are not used in the reference kernel.
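+// Supports grouped convolution: groups = input_depth / filter_input_depth, and
+// each output channel reads only the input-channel slice of its group.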
+inline void HybridConvPerChannel(const ConvParams &params, float *scaling_factors_ptr,
+ const Shape &input_shape, const int8_t *input_data,
+ const Shape &filter_shape, const int8_t *filter_data,
+ const Shape &bias_shape, const float *bias_data,
+ const Shape &output_shape, float *output_data,
+ const float *per_channel_scale, const int32_t *input_offset)
+
+{
+ const int stride_width = params.stride_width;
+ const int stride_height = params.stride_height;
+ const int dilation_width_factor = params.dilation_width_factor;
+ const int dilation_height_factor = params.dilation_height_factor;
+ const int pad_width = params.padding_values.width;
+ const int pad_height = params.padding_values.height;
+ const float output_activation_min = params.float_activation_min;
+ const float output_activation_max = params.float_activation_max;
+ assert(input_shape.DimensionsCount() == 4);
+ assert(filter_shape.DimensionsCount() == 4);
+ assert(output_shape.DimensionsCount() == 4);
+ const int batches = MatchingDim(input_shape, 0, output_shape, 0);
+ const int input_depth = input_shape.Dims(3);
+ const int output_depth = MatchingDim(filter_shape, 0, output_shape, 3);
+ if (bias_data)
+ {
+ assert(bias_shape.FlatSize() == output_depth);
+ UNUSED_RELEASE(bias_shape);
+ }
+ const int input_height = input_shape.Dims(1);
+ const int input_width = input_shape.Dims(2);
+ const int filter_height = filter_shape.Dims(1);
+ const int filter_width = filter_shape.Dims(2);
+ const int filter_input_depth = filter_shape.Dims(3);
+ const int groups = input_depth / filter_input_depth;
+ assert(input_depth % filter_input_depth == 0);
+ const int filters_per_group = output_depth / groups;
+ const int output_height = output_shape.Dims(1);
+ const int output_width = output_shape.Dims(2);
+ for (int batch = 0; batch < batches; ++batch)
+ {
+ for (int out_y = 0; out_y < output_height; ++out_y)
+ {
+ for (int out_x = 0; out_x < output_width; ++out_x)
+ {
+ for (int out_channel = 0; out_channel < output_depth; ++out_channel)
+ {
+ auto group = out_channel / filters_per_group;
+ const int in_x_origin = (out_x * stride_width) - pad_width;
+ const int in_y_origin = (out_y * stride_height) - pad_height;
+ int32_t acc = 0;
+ for (int filter_y = 0; filter_y < filter_height; ++filter_y)
+ {
+ for (int filter_x = 0; filter_x < filter_width; ++filter_x)
+ {
+ for (int in_channel = 0; in_channel < filter_input_depth; ++in_channel)
+ {
+ const int in_x = in_x_origin + dilation_width_factor * filter_x;
+ const int in_y = in_y_origin + dilation_height_factor * filter_y;
+ // If the location is outside the bounds of the input image,
+ // use zero as a default value.
+ if ((in_x >= 0) && (in_x < input_width) && (in_y >= 0) && (in_y < input_height))
+ {
+ int32_t input_val = input_data[Offset(input_shape, batch, in_y, in_x,
+ in_channel + group * filter_input_depth)];
+ int32_t filter_val =
+ filter_data[Offset(filter_shape, out_channel, filter_y, filter_x, in_channel)];
+ acc += filter_val * (input_val - input_offset[batch]);
+ }
+ }
+ }
+ }
+ float acc_float = acc * per_channel_scale[out_channel] * scaling_factors_ptr[batch];
+ if (bias_data)
+ {
+ acc_float += bias_data[out_channel];
+ }
+ output_data[Offset(output_shape, batch, out_y, out_x, out_channel)] =
+ ActivationFunctionWithMinMax(acc_float, output_activation_min, output_activation_max);
}
}
}
diff --git a/compute/cker/include/cker/operation/reference/integer_ops/DepthwiseConvHybrid.h b/compute/cker/include/cker/operation/reference/integer_ops/DepthwiseConvHybrid.h
new file mode 100644
index 000000000..9fc58ad3b
--- /dev/null
+++ b/compute/cker/include/cker/operation/reference/integer_ops/DepthwiseConvHybrid.h
@@ -0,0 +1,122 @@
+/*
+ * Copyright (c) 2023 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_REFERENCE_DEPTHWISE_CONV_HYBRID_H__
+#define __NNFW_CKER_REFERENCE_DEPTHWISE_CONV_HYBRID_H__
+
+#include "cker/Shape.h"
+#include "cker/Types.h"
+#include "cker/Utils.h"
+
+namespace nnfw
+{
+namespace cker
+{
+namespace reference_integer_ops
+{
+
+inline void DepthwiseConvHybridPerChannel(const DepthwiseConvParams &params,
+ float *scaling_factors_ptr, const Shape &input_shape,
+ const int8_t *input_data, const Shape &filter_shape,
+ const int8_t *filter_data, const Shape &bias_shape,
+ const float *bias_data, const Shape &output_shape,
+ float *output_data, const float *per_channel_scale,
+ int32_t *input_offset)
+{
+ const int stride_width = params.stride_width;
+ const int stride_height = params.stride_height;
+ const int dilation_width_factor = params.dilation_width_factor;
+ const int dilation_height_factor = params.dilation_height_factor;
+ const int pad_width = params.padding_values.width;
+ const int pad_height = params.padding_values.height;
+ const int depth_multiplier = params.depth_multiplier;
+ const float output_activation_min = params.float_activation_min;
+ const float output_activation_max = params.float_activation_max;
+
+ // Check dimensions of the tensors.
+ assert(input_shape.DimensionsCount() == 4);
+ assert(filter_shape.DimensionsCount() == 4);
+ assert(output_shape.DimensionsCount() == 4);
+
+ const int batches = MatchingDim(input_shape, 0, output_shape, 0);
+ const int output_depth = MatchingDim(filter_shape, 3, output_shape, 3);
+ const int input_height = input_shape.Dims(1);
+ const int input_width = input_shape.Dims(2);
+ const int input_depth = input_shape.Dims(3);
+ const int filter_height = filter_shape.Dims(1);
+ const int filter_width = filter_shape.Dims(2);
+ const int output_height = output_shape.Dims(1);
+ const int output_width = output_shape.Dims(2);
+ const int bias_depth = bias_shape.FlatSize();
+ UNUSED_RELEASE(output_depth);
+ UNUSED_RELEASE(bias_shape);
+ assert(output_depth == input_depth * depth_multiplier);
+ assert(bias_depth == output_depth);
+
+ for (int batch = 0; batch < batches; ++batch)
+ {
+ for (int out_y = 0; out_y < output_height; ++out_y)
+ {
+ for (int out_x = 0; out_x < output_width; ++out_x)
+ {
+ for (int in_channel = 0; in_channel < input_depth; ++in_channel)
+ {
+ for (int m = 0; m < depth_multiplier; ++m)
+ {
+ const int output_channel = m + in_channel * depth_multiplier;
+ const int in_x_origin = (out_x * stride_width) - pad_width;
+ const int in_y_origin = (out_y * stride_height) - pad_height;
+ int32_t acc = 0;
+ for (int filter_y = 0; filter_y < filter_height; ++filter_y)
+ {
+ for (int filter_x = 0; filter_x < filter_width; ++filter_x)
+ {
+ const int in_x = in_x_origin + dilation_width_factor * filter_x;
+ const int in_y = in_y_origin + dilation_height_factor * filter_y;
+ // Zero padding by omitting the areas outside the image.
+ const bool is_point_inside_image =
+ (in_x >= 0) && (in_x < input_width) && (in_y >= 0) && (in_y < input_height);
+ if (is_point_inside_image)
+ {
+ int32_t input_val =
+ input_data[Offset(input_shape, batch, in_y, in_x, in_channel)];
+ int32_t filter_val =
+ filter_data[Offset(filter_shape, 0, filter_y, filter_x, output_channel)];
+ acc += filter_val * (input_val - input_offset[batch]);
+ }
+ }
+ }
+ float acc_float = static_cast<float>(acc);
+ acc_float *= per_channel_scale[output_channel] * scaling_factors_ptr[batch];
+ if (bias_data && output_channel < bias_depth)
+ {
+ acc_float += bias_data[output_channel];
+ }
+ output_data[Offset(output_shape, batch, out_y, out_x, output_channel)] =
+ ActivationFunctionWithMinMax(acc_float, output_activation_min, output_activation_max);
+ }
+ }
+ }
+ }
+ }
+}
+
+} // namespace reference_integer_ops
+} // namespace cker
+} // namespace nnfw
+
+#endif // __NNFW_CKER_REFERENCE_DEPTHWISE_CONV_HYBRID_H__
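To make the per-element arithmetic of DepthwiseConvHybridPerChannel concrete, consider a hypothetical two-tap accumulation with input_offset[0] = -1, per_channel_scale[c] = 0.5, scaling_factors_ptr[0] = 0.1, bias_data[c] = 0.25 and an activation range of [0, 6]; the filter taps are {2, -3} and the corresponding quantized inputs are {10, 4}:

    acc       = 2 * (10 - (-1)) + (-3) * (4 - (-1)) = 22 - 15 = 7
    acc_float = 7 * 0.5 * 0.1 + 0.25 = 0.35 + 0.25 = 0.6
    output    = clamp(0.6, 0, 6) = 0.6

Only the final rescale and bias addition happen in float; the inner loops stay in 32-bit integer arithmetic.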
diff --git a/compute/cker/include/cker/operation/reference/integer_ops/DepthwiseConvUInt8.h b/compute/cker/include/cker/operation/reference/integer_ops/DepthwiseConvUInt8.h
new file mode 100644
index 000000000..025e40705
--- /dev/null
+++ b/compute/cker/include/cker/operation/reference/integer_ops/DepthwiseConvUInt8.h
@@ -0,0 +1,152 @@
+/*
+ * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_REFERENCE_DEPTHWISE_CONV_UINT8_H__
+#define __NNFW_CKER_REFERENCE_DEPTHWISE_CONV_UINT8_H__
+
+#include "cker/Shape.h"
+#include "cker/Types.h"
+#include "cker/Utils.h"
+
+namespace nnfw
+{
+namespace cker
+{
+namespace reference_integer_ops
+{
+inline void DepthwiseConvPerChannel(const DepthwiseConvParams &params,
+ const int32_t *output_multiplier, const int32_t *output_shift,
+ const Shape &input_shape, const uint8_t *input_data,
+ const Shape &filter_shape, const uint8_t *filter_data,
+ const int32_t *filter_zeropoint, const Shape &bias_shape,
+ const int32_t *bias_data, const Shape &output_shape,
+ uint8_t *output_data)
+{
+ // Get parameters.
+ // TODO(b/141565753): Re-introduce ScopedProfilingLabel on Micro.
+ const int stride_width = params.stride_width;
+ const int stride_height = params.stride_height;
+ const int dilation_width_factor = params.dilation_width_factor;
+ const int dilation_height_factor = params.dilation_height_factor;
+ const int pad_width = params.padding_values.width;
+ const int pad_height = params.padding_values.height;
+ const int depth_multiplier = params.depth_multiplier;
+ const int32_t input_offset = params.input_offset;
+ const int32_t output_offset = params.output_offset;
+ const int32_t output_activation_min = params.quantized_activation_min;
+ const int32_t output_activation_max = params.quantized_activation_max;
+
+ // Check dimensions of the tensors.
+ assert(input_shape.DimensionsCount() == 4);
+ assert(filter_shape.DimensionsCount() == 4);
+ assert(output_shape.DimensionsCount() == 4);
+
+ assert(output_activation_min <= output_activation_max);
+ const int batches = MatchingDim(input_shape, 0, output_shape, 0);
+ const int output_depth = MatchingDim(filter_shape, 3, output_shape, 3);
+ const int input_height = input_shape.Dims(1);
+ const int input_width = input_shape.Dims(2);
+ const int input_depth = input_shape.Dims(3);
+ const int filter_height = filter_shape.Dims(1);
+ const int filter_width = filter_shape.Dims(2);
+ const int output_height = output_shape.Dims(1);
+ const int output_width = output_shape.Dims(2);
+ UNUSED_RELEASE(output_depth);
+ UNUSED_RELEASE(bias_shape);
+ assert(output_depth == input_depth * depth_multiplier);
+ assert(bias_shape.FlatSize() == output_depth);
+
+ for (int batch = 0; batch < batches; ++batch)
+ {
+ for (int out_y = 0; out_y < output_height; ++out_y)
+ {
+ for (int out_x = 0; out_x < output_width; ++out_x)
+ {
+ for (int in_channel = 0; in_channel < input_depth; ++in_channel)
+ {
+ for (int m = 0; m < depth_multiplier; ++m)
+ {
+ const int output_channel = m + in_channel * depth_multiplier;
+ const int in_x_origin = (out_x * stride_width) - pad_width;
+ const int in_y_origin = (out_y * stride_height) - pad_height;
+ int32_t acc = 0;
+ for (int filter_y = 0; filter_y < filter_height; ++filter_y)
+ {
+ for (int filter_x = 0; filter_x < filter_width; ++filter_x)
+ {
+ const int in_x = in_x_origin + dilation_width_factor * filter_x;
+ const int in_y = in_y_origin + dilation_height_factor * filter_y;
+ // Zero padding by omitting the areas outside the image.
+ const bool is_point_inside_image =
+ (in_x >= 0) && (in_x < input_width) && (in_y >= 0) && (in_y < input_height);
+ if (is_point_inside_image)
+ {
+ uint8_t input_val =
+ input_data[Offset(input_shape, batch, in_y, in_x, in_channel)];
+ uint8_t filter_val =
+ filter_data[Offset(filter_shape, 0, filter_y, filter_x, output_channel)];
+
+ // { for per-channel
+ // NOTE: The following comment is copied from tflite int8 implementation
+ // It may not be 100% true for uint8 per-channel.
+ //
+ // Accumulate with 32 bits accumulator.
+ // In the nudging process during model quantization, we force
+ // real value of 0.0 be represented by a quantized value. This
+ // guarantees that the input_offset is a int8, even though it
+ // is represented using int32_t.
+ // int32 += int8 * (int8 - int8) so the highest value we can
+ // get from each accumulation is [-127, 127] * ([-128, 127] -
+ // [-128, 127]), which is [-32512, 32512]. log2(32512)
+ // = 14.98, which means we can accumulate at least 2^16
+ // multiplications without overflow. The accumulator is
+ // applied to a filter so the accumulation logic will hold as
+ // long as the filter size (filter_y * filter_x * in_channel)
+ // does not exceed 2^16, which is the case in all the models
+ // we have seen so far.
+ // TODO(jianlijianli): Add a check to make sure the
+ // accumulator depth is smaller than 2^16.
+ const int32_t filter_offset = -filter_zeropoint[output_channel];
+ acc += (filter_val + filter_offset) * (input_val + input_offset);
+ // } for per-channel
+ }
+ }
+ }
+ if (bias_data)
+ {
+ acc += bias_data[output_channel];
+ }
+ acc = MultiplyByQuantizedMultiplier(acc, output_multiplier[output_channel],
+ output_shift[output_channel]);
+ acc += output_offset;
+ acc = std::max(acc, output_activation_min);
+ acc = std::min(acc, output_activation_max);
+          // For q8u per-channel, cast the clamped int32_t accumulator to uint8_t
+ output_data[Offset(output_shape, batch, out_y, out_x, output_channel)] =
+ static_cast<uint8_t>(acc);
+ }
+ }
+ }
+ }
+ }
+}
+
+} // namespace reference_integer_ops
+} // namespace cker
+} // namespace nnfw
+
+#endif // __NNFW_CKER_REFERENCE_DEPTHWISE_CONV_UINT8_H__
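As the copied comment notes, the 2^16 accumulation bound was derived for the int8 case and is only approximate here. In this uint8 per-channel path both (filter_val + filter_offset) and (input_val + input_offset) lie in [-255, 255], so a rough worst-case bound is:

    |(filter_val + filter_offset) * (input_val + input_offset)| <= 255 * 255 = 65025 ~ 2^16
    2^31 / 65025 ~ 3.3 * 10^4 ~ 2^15

i.e. roughly 2^15 multiply-accumulates fit in the int32 accumulator before overflow becomes possible, about half the figure quoted for int8, but still far above typical depthwise filter sizes.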
diff --git a/compute/cker/include/cker/ruy/RuySupport.h b/compute/cker/include/cker/ruy/RuySupport.h
index 9612dd517..14489a804 100644
--- a/compute/cker/include/cker/ruy/RuySupport.h
+++ b/compute/cker/include/cker/ruy/RuySupport.h
@@ -19,7 +19,9 @@
#define __NNFW_CKER_RUY_RUY_SUPPORT_H__
#include <util/ConfigSource.h>
-#include <ruy/context.h>
+#include <ruy/matrix.h>
+#include <ruy/ruy.h>
+#include <cassert>
#include "cker/Types.h"
namespace nnfw
@@ -29,44 +31,66 @@ namespace cker
namespace ruy_support
{
+inline ruy::CachePolicy ToRuyCachePolicy(CachePolicy cache_policy)
+{
+ switch (cache_policy)
+ {
+ case CachePolicy::kNeverCache:
+ return ruy::CachePolicy::kNeverCache;
+ case CachePolicy::kCacheIfLargeSpeedup:
+ return ruy::CachePolicy::kCacheIfLargeSpeedup;
+ case CachePolicy::kAlwaysCache:
+ return ruy::CachePolicy::kAlwaysCache;
+ default:
+ assert(false);
+ return ruy::CachePolicy::kNeverCache;
+ }
+}
+
template <typename Scalar, typename DataPointer>
void MakeRuyMatrix(const MatrixParams<Scalar> &params, DataPointer data_ptr,
- ruy::Matrix<Scalar> *dst)
+ ruy::Matrix<Scalar> *dst, bool use_caching = false)
{
- dst->layout.rows = params.rows;
- dst->layout.cols = params.cols;
- if (params.order == Order::kColMajor)
+ ruy::Order ruy_order =
+ params.order == Order::kColMajor ? ruy::Order::kColMajor : ruy::Order::kRowMajor;
+ ruy::MakeSimpleLayout(params.rows, params.cols, ruy_order, dst->mutable_layout());
+ // Note that ruy::Matrix::data is a ConstCheckingPtr, not a plain pointer.
+ // It does care whether we assign to it a Scalar* or a const Scalar*.
+ dst->set_data(data_ptr);
+ dst->set_zero_point(params.zero_point);
+ if (use_caching)
{
- dst->layout.order = ruy::Order::kColMajor;
- dst->layout.stride = params.rows;
+ dst->set_cache_policy(ToRuyCachePolicy(params.cache_policy));
}
- else
+}
+
+// Integer-quantized case with destination type narrower than int32
+template <typename DstScalar, QuantizationFlavor quantization_flavor>
+void MakeRuyMulParams(const GemmParams<std::int32_t, DstScalar, quantization_flavor> &params,
+ ruy::MulParams<std::int32_t, DstScalar> *ruy_mul_params)
+{
+ static_assert(sizeof(DstScalar) < sizeof(std::int32_t), "");
+ if (quantization_flavor == QuantizationFlavor::kIntegerWithUniformMultiplier)
{
- dst->layout.order = ruy::Order::kRowMajor;
- dst->layout.stride = params.cols;
+ ruy_mul_params->set_multiplier_fixedpoint(params.multiplier_fixedpoint);
+ ruy_mul_params->set_multiplier_exponent(params.multiplier_exponent);
}
- // Note that ruy::Matrix::data is a ConstCheckingPtr, not a plain pointer.
- // It does care whether we assign to it a Scalar* or a const Scalar*.
- dst->data = data_ptr;
- dst->zero_point = params.zero_point;
- dst->cacheable = params.cacheable;
+ if (quantization_flavor == QuantizationFlavor::kIntegerWithPerRowMultiplier)
+ {
+ ruy_mul_params->set_multiplier_fixedpoint_perchannel(params.multiplier_fixedpoint_perchannel);
+ ruy_mul_params->set_multiplier_exponent_perchannel(params.multiplier_exponent_perchannel);
+ }
+ ruy_mul_params->set_bias(params.bias);
+ ruy_mul_params->set_clamp_min(params.clamp_min);
+ ruy_mul_params->set_clamp_max(params.clamp_max);
}
-template <typename GemmParamsType, typename RuySpecType>
-void MakeRuySpec(const GemmParamsType &params, RuySpecType *ruy_spec)
+// Raw-integer case with destination type int32.
+template <QuantizationFlavor quantization_flavor>
+void MakeRuyMulParams(const GemmParams<std::int32_t, std::int32_t, quantization_flavor> &params,
+ ruy::MulParams<std::int32_t, std::int32_t> *ruy_mul_params)
{
- // This validation has already been performed by the Gemm API entry point,
- // but it doesn't hurt to test specifically this again here, where it's
- // being used.
- ValidateGemmParams(params);
-
- ruy_spec->multiplier_fixedpoint = params.multiplier_fixedpoint;
- ruy_spec->multiplier_exponent = params.multiplier_exponent;
- ruy_spec->multiplier_fixedpoint_perchannel = params.multiplier_fixedpoint_perchannel;
- ruy_spec->multiplier_exponent_perchannel = params.multiplier_exponent_perchannel;
- ruy_spec->bias = params.bias;
- ruy_spec->clamp_min = params.clamp_min;
- ruy_spec->clamp_max = params.clamp_max;
+ ruy_mul_params->set_bias(params.bias);
}
} // namespace ruy_support
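The new helpers replace the ruy::Spec-based setup with ruy's MulParams API. The following is a minimal usage sketch, not taken from this repository: the function name, the assumption that the weights sit on the LHS, and the way the parameters are prepared are illustrative; only MakeRuyMatrix, MakeRuyMulParams and the ruy calls themselves come from the code above.

    #include <cstdint>
    #include <ruy/context.h>
    #include <ruy/ruy.h>

    #include "cker/Types.h"
    #include "cker/ruy/RuySupport.h"

    // Illustrative wiring of the helpers above into a quantized int8 GEMM.
    template <nnfw::cker::QuantizationFlavor flavor>
    void SketchQuantizedGemm(const nnfw::cker::MatrixParams<std::int8_t> &lhs_params,
                             const std::int8_t *lhs_data,
                             const nnfw::cker::MatrixParams<std::int8_t> &rhs_params,
                             const std::int8_t *rhs_data,
                             const nnfw::cker::MatrixParams<std::int8_t> &dst_params,
                             std::int8_t *dst_data,
                             const nnfw::cker::GemmParams<std::int32_t, std::int8_t, flavor> &gemm_params,
                             ruy::Context *ruy_context)
    {
      ruy::Matrix<std::int8_t> ruy_lhs, ruy_rhs, ruy_dst;
      // Caching is only worth enabling for the constant (weight) side,
      // assumed here to be the LHS.
      nnfw::cker::ruy_support::MakeRuyMatrix(lhs_params, lhs_data, &ruy_lhs, /*use_caching=*/true);
      nnfw::cker::ruy_support::MakeRuyMatrix(rhs_params, rhs_data, &ruy_rhs);
      nnfw::cker::ruy_support::MakeRuyMatrix(dst_params, dst_data, &ruy_dst);

      // Translate multipliers, bias and clamps into ruy's MulParams.
      ruy::MulParams<std::int32_t, std::int8_t> ruy_mul_params;
      nnfw::cker::ruy_support::MakeRuyMulParams(gemm_params, &ruy_mul_params);

      ruy::Mul(ruy_lhs, ruy_rhs, ruy_mul_params, ruy_context, &ruy_dst);
    }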
diff --git a/compute/cker/include/cker/train/operation/FullyConnected.h b/compute/cker/include/cker/train/operation/FullyConnected.h
new file mode 100644
index 000000000..b0255d287
--- /dev/null
+++ b/compute/cker/include/cker/train/operation/FullyConnected.h
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2023 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_TRAIN_OPERATION_FULLY_CONNECTED_H__
+#define __NNFW_CKER_TRAIN_OPERATION_FULLY_CONNECTED_H__
+
+#include "cker/eigen/Utils.h"
+#include "cker/Shape.h"
+
+namespace nnfw
+{
+namespace cker
+{
+namespace train
+{
+
+template <typename T>
+inline void FullyConnectedBiasGrad(const Shape &incoming_shape, const T *incoming_data,
+ const Shape &grad_shape, T *grad_data)
+{
+ const auto bias_size = grad_shape.FlatSize();
+  if (bias_size != incoming_shape.Dims(incoming_shape.DimensionsCount() - 1) ||
+ bias_size != grad_shape.Dims(0))
+ throw std::runtime_error("cker::FullyConnectedBiasGrad: Unmatched shape");
+
+  const auto in_mat = MapAsMatrixWithLastDimAsRows(incoming_data, incoming_shape);
+ auto grad_mat = MapAsMatrixWithLastDimAsRows(grad_data, grad_shape);
+
+ grad_mat = in_mat.rowwise().sum();
+}
+
+} // namespace train
+} // namespace cker
+} // namespace nnfw
+
+#endif // __NNFW_CKER_TRAIN_OPERATION_FULLY_CONNECTED_H__
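For reference, the bias gradient computed above is simply the incoming gradient summed over every leading (batch-like) element:

    bias_grad[j] = sum over n of incoming[n][j],   j = 0 .. bias_size - 1

With the Eigen mapping used here (last dimension as rows), each such n is one column of in_mat, so rowwise().sum() yields exactly one accumulated value per output unit.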
diff --git a/compute/cker/include/cker/train/operation/Loss.h b/compute/cker/include/cker/train/operation/Loss.h
new file mode 100644
index 000000000..94f49ff07
--- /dev/null
+++ b/compute/cker/include/cker/train/operation/Loss.h
@@ -0,0 +1,77 @@
+/*
+ * Copyright (c) 2023 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_TRAIN_OPERATION_LOSS_H__
+#define __NNFW_CKER_TRAIN_OPERATION_LOSS_H__
+
+#include "cker/Shape.h"
+#include "cker/eigen/Utils.h"
+
+namespace nnfw
+{
+namespace cker
+{
+namespace train
+{
+
+template <typename T>
+inline void MSE(const Shape &y_pred_shape, const T *y_pred_data, const Shape &y_true_shape,
+ const T *y_true_data, const Shape &output_shape, T *output_data)
+{
+ // TODO Consider Reduction
+ if (output_shape != Shape{1})
+ throw std::runtime_error("cker::MSE: output_shape != Shape{1}");
+ if (y_pred_shape != y_true_shape)
+ throw std::runtime_error("cker::MSE: y_pred_shape != y_true_shape");
+
+ const auto y_pred = MapAsMatrixWithLastDimAsRows(y_pred_data, y_pred_shape);
+ const auto y_true = MapAsMatrixWithLastDimAsRows(y_true_data, y_true_shape);
+
+  double squared_sum = 0.0;
+ for (size_t c = 0; c < (size_t)y_pred.cols(); ++c)
+ {
+ for (size_t r = 0; r < (size_t)y_pred.rows(); ++r)
+ {
+ double error = y_pred.coeff(r, c) - y_true.coeff(r, c);
+ squared_sum += (error * error);
+ }
+ }
+
+ auto size = y_pred.cols() * y_pred.rows();
+ output_data[0] = (T)(squared_sum / size);
+}
+
+template <typename T>
+inline void MSEGrad(const Shape &y_pred_shape, const T *y_pred_data, const Shape &y_true_shape,
+ const T *y_true_data, const Shape &grad_shape, T *grad_data)
+{
+ if (y_pred_shape != y_true_shape)
+ throw std::runtime_error("cker::MSEGrad: y_pred_shape != y_true_shape");
+ if (y_pred_shape != grad_shape)
+ throw std::runtime_error("cker::MSEGrad: y_pred_shape != grad_shape");
+
+ const int size = grad_shape.FlatSize();
+ for (int i = 0; i < size; ++i)
+ {
+ grad_data[i] = static_cast<T>(-2 * (y_true_data[i] - y_pred_data[i]) / size);
+ }
+}
+
+} // namespace train
+} // namespace cker
+} // namespace nnfw
+
+#endif // __NNFW_CKER_TRAIN_OPERATION_LOSS_H__
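For reference, the expression in MSEGrad is the elementwise derivative of the mean implemented in MSE above, with N = grad_shape.FlatSize():

    MSE              = (1 / N) * sum_i (y_pred_i - y_true_i)^2
    dMSE / dy_pred_i = (2 / N) * (y_pred_i - y_true_i)
                     = -2 * (y_true_i - y_pred_i) / N

which is exactly the value assigned to grad_data[i].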
diff --git a/compute/cker/include/cker/train/operation/ReLU.h b/compute/cker/include/cker/train/operation/ReLU.h
new file mode 100644
index 000000000..32cf7fa9c
--- /dev/null
+++ b/compute/cker/include/cker/train/operation/ReLU.h
@@ -0,0 +1,50 @@
+/*
+ * Copyright (c) 2023 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_TRAIN_OPERATION_RELU_H__
+#define __NNFW_CKER_TRAIN_OPERATION_RELU_H__
+
+#include "cker/Shape.h"
+#include "cker/eigen/Utils.h"
+
+#include <Eigen/Core>
+
+namespace nnfw
+{
+namespace cker
+{
+namespace train
+{
+
+inline void ReLUGrad(const Shape &output_shape, const float *output_data,
+ const Shape &incoming_shape, const float *incoming_data,
+ const Shape &grad_shape, float *grad_data)
+{
+ const auto output_map = MapAsVector(output_data, output_shape);
+ const auto incoming_map = MapAsVector(incoming_data, incoming_shape);
+ auto grad_map = MapAsVector(grad_data, grad_shape);
+
+ if (output_shape == incoming_shape && output_shape == grad_shape)
+ grad_map.array() = incoming_map.array() * (output_map.array() > 0.0f).template cast<float>();
+ else
+ throw std::runtime_error("cker::ReLUGrad: Unsupported shape");
+}
+
+} // namespace train
+} // namespace cker
+} // namespace nnfw
+
+#endif // __NNFW_CKER_TRAIN_OPERATION_RELU_H__
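For reference, ReLUGrad applies the usual backward rule

    grad_i = incoming_i * 1[output_i > 0]

Masking on the forward output instead of the forward input is equivalent for ReLU, because output_i = max(input_i, 0) is positive exactly when input_i is positive.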
diff --git a/compute/cker/src/Range.test.cc b/compute/cker/src/Range.test.cc
new file mode 100644
index 000000000..e5fe4801f
--- /dev/null
+++ b/compute/cker/src/Range.test.cc
@@ -0,0 +1,70 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cker/operation/Range.h>
+
+#include <gtest/gtest.h>
+#include <vector>
+
+TEST(CKer_Operation, Range)
+{
+ {
+ const int start = 0;
+ const int limit = 10;
+ const int delta = 1;
+ std::vector<int> actual(10);
+ nnfw::cker::Range<int>(&start, &limit, &delta, actual.data());
+
+ for (int i = 0; i < actual.size(); i++)
+ ASSERT_EQ(actual[i], i);
+ }
+
+ {
+ const int start = 3;
+ const int limit = 18;
+ const int delta = 3;
+ std::vector<int> expected = {3, 6, 9, 12, 15};
+ std::vector<int> actual(expected.size());
+ nnfw::cker::Range<int>(&start, &limit, &delta, actual.data());
+
+ for (int i = 0; i < actual.size(); i++)
+ ASSERT_EQ(actual[i], expected[i]);
+ }
+
+ {
+ const float start = 3;
+ const float limit = 1;
+ const float delta = -0.5;
+ std::vector<float> expected = {3, 2.5, 2, 1.5};
+ std::vector<float> actual(expected.size());
+ nnfw::cker::Range<float>(&start, &limit, &delta, actual.data());
+
+ for (int i = 0; i < actual.size(); i++)
+ ASSERT_FLOAT_EQ(actual[i], expected[i]);
+ }
+}
+
+TEST(CKer_Operation, neg_Range)
+{
+ {
+ const int start = 212;
+ const int limit = 10;
+ const int delta = 1;
+ std::vector<int> actual(10);
+
+ EXPECT_ANY_THROW(nnfw::cker::Range<int>(&start, &limit, &delta, actual.data()));
+ }
+}
diff --git a/compute/cker/src/train/FullyConnected.test.cc b/compute/cker/src/train/FullyConnected.test.cc
new file mode 100644
index 000000000..37c2d4a97
--- /dev/null
+++ b/compute/cker/src/train/FullyConnected.test.cc
@@ -0,0 +1,83 @@
+/*
+ * Copyright (c) 2023 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cker/train/operation/FullyConnected.h>
+
+#include <gtest/gtest.h>
+#include <vector>
+
+TEST(CKer_Operation, FullyConnectedBiasGrad)
+{
+ {
+ // Shape: {2, 4}
+ std::vector<float> incoming_backward = {-1, 2, -3, 4, 5, -6, -7, 8};
+ // Shape: {4}
+ std::vector<float> expected_bias_backward = {4, -4, -10, 12};
+ std::vector<float> bias_backward(4);
+
+ nnfw::cker::train::FullyConnectedBiasGrad(
+ nnfw::cker::Shape{2, 4}, incoming_backward.data(),
+ nnfw::cker::Shape{static_cast<int>(bias_backward.size())}, bias_backward.data());
+
+ for (size_t i = 0; i < bias_backward.size(); ++i)
+ ASSERT_EQ(bias_backward[i], expected_bias_backward[i]);
+ }
+
+ {
+ // Shape: {3, 3}
+ std::vector<float> incoming_backward = {-1, 2, -3, 4, 5, -6, -7, 8, 9};
+ // Shape: {3}
+ std::vector<float> expected_bias_backward = {-4, 15, 0};
+ std::vector<float> bias_backward(3);
+
+ nnfw::cker::train::FullyConnectedBiasGrad(
+ nnfw::cker::Shape{3, 3}, incoming_backward.data(),
+ nnfw::cker::Shape{static_cast<int>(bias_backward.size())}, bias_backward.data());
+
+ for (size_t i = 0; i < bias_backward.size(); ++i)
+ ASSERT_EQ(bias_backward[i], expected_bias_backward[i]);
+ }
+
+ {
+ // Shape: {1, 2, 2, 3}
+ std::vector<float> incoming_backward = {-1, 2, -3, 4, 5, -6, -7, 8, 9, -10, -11, 12};
+ // Shape: {3}
+ std::vector<float> expected_bias_backward = {-14, 4, 12};
+ std::vector<float> bias_backward(3);
+
+ nnfw::cker::train::FullyConnectedBiasGrad(
+ nnfw::cker::Shape{1, 2, 2, 3}, incoming_backward.data(),
+ nnfw::cker::Shape{static_cast<int>(bias_backward.size())}, bias_backward.data());
+
+ for (size_t i = 0; i < bias_backward.size(); ++i)
+ ASSERT_EQ(bias_backward[i], expected_bias_backward[i]);
+ }
+}
+
+TEST(CKer_Operation, neg_FullyConnectedBiasGrad)
+{
+ {
+ // Unmatched shape
+ // Shape: {2, 4}
+ std::vector<float> incoming_backward = {-1, 2, -3, 4, 5, -6, -7, 8};
+ // Shape: {3}
+ std::vector<float> bias_backward(3);
+ EXPECT_ANY_THROW(nnfw::cker::train::FullyConnectedBiasGrad(
+ nnfw::cker::Shape{2, 4}, incoming_backward.data(),
+ nnfw::cker::Shape{static_cast<int>(bias_backward.size())},
+      bias_backward.data()));
+ }
+}
diff --git a/compute/cker/src/train/Loss.test.cc b/compute/cker/src/train/Loss.test.cc
new file mode 100644
index 000000000..98568f47a
--- /dev/null
+++ b/compute/cker/src/train/Loss.test.cc
@@ -0,0 +1,201 @@
+/*
+ * Copyright (c) 2023 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cker/train/operation/Loss.h>
+
+#include <gtest/gtest.h>
+#include <vector>
+
+TEST(CKer_Operation, LossMSE)
+{
+ {
+ // Shape: {1, 10} -> m_rows:10, m_cols:1
+ std::vector<int> y_pred = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
+ std::vector<int> y_true = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9};
+ std::vector<int> output(1);
+ std::vector<int> expected = {1};
+
+ nnfw::cker::train::MSE(nnfw::cker::Shape{1, 10}, y_pred.data(), nnfw::cker::Shape{1, 10},
+ y_true.data(), nnfw::cker::Shape{1}, output.data());
+
+ EXPECT_EQ(output[0], expected[0]);
+ }
+
+ {
+ // Shape: {1, 10} -> m_rows:10, m_cols:1
+ std::vector<float> y_pred = {1., 2., 3., 4., 5., 6., 7., 8., 9., 10.};
+ std::vector<float> y_true = {0., 1., 2., 3., 4., 5., 6., 7., 8., 9.};
+ std::vector<float> output(1);
+ std::vector<float> expected = {1.0};
+
+ nnfw::cker::train::MSE(nnfw::cker::Shape{1, 10}, y_pred.data(), nnfw::cker::Shape{1, 10},
+ y_true.data(), nnfw::cker::Shape{1}, output.data());
+
+ EXPECT_FLOAT_EQ(output[0], expected[0]);
+ }
+
+ {
+ // Shape: {2, 3} -> m_rows:3, m_cols:2
+ std::vector<float> y_pred = {27.2, 31.8, 51.9, 10.2, 34.2, 12.4};
+ std::vector<float> y_true = {31.3, 40.3, 29.7, 12.9, 25.8, 11.9};
+ std::vector<float> output(1);
+ std::vector<float> expected = {110.0};
+
+ nnfw::cker::train::MSE(nnfw::cker::Shape{2, 3}, y_pred.data(), nnfw::cker::Shape{2, 3},
+ y_true.data(), nnfw::cker::Shape{1}, output.data());
+
+ EXPECT_FLOAT_EQ(output[0], expected[0]);
+ }
+
+ {
+ // Shape: {2, 3, 4} -> m_rows:4, m_cols:6
+ std::vector<float> y_pred = {1., 2., 3., 4., 1., 2., 3., 4., 1., 2., 3., 4.,
+ 1., 2., 3., 4., 1., 2., 3., 4., 1., 2., 3., 4.};
+ std::vector<float> y_true = {1., 1., 1., 1., 2., 2., 2., 2., 3., 3., 3., 3.,
+ 1., 1., 1., 1., 2., 2., 2., 2., 3., 3., 3., 3.};
+ std::vector<float> output(1);
+ std::vector<float> expected = {2.1666667};
+
+ nnfw::cker::train::MSE(nnfw::cker::Shape{2, 3, 4}, y_pred.data(), nnfw::cker::Shape{2, 3, 4},
+ y_true.data(), nnfw::cker::Shape{1}, output.data());
+
+ EXPECT_FLOAT_EQ(output[0], expected[0]);
+ }
+}
+
+TEST(CKer_Operation, neg_LossMSE)
+{
+ {
+ // Invalid expected value
+ std::vector<float> y_pred = {1., 2., 3., 4., 5., 6., 7., 8., 9., 10.};
+ std::vector<float> y_true = {0., 1., 2., 3., 4., 5., 6., 7., 8., 9.};
+ std::vector<float> output(1);
+ std::vector<float> expected = {-1.0};
+
+    nnfw::cker::train::MSE(nnfw::cker::Shape{1, 10}, y_pred.data(), nnfw::cker::Shape{1, 10},
+                           y_true.data(), nnfw::cker::Shape{1}, output.data());
+
+ EXPECT_NE(output[0], expected[0]);
+ }
+
+ {
+ // Invalid output shape
+ std::vector<float> y_pred = {1., 2., 3., 4., 5., 6., 7., 8., 9., 10.};
+ std::vector<float> y_true = {0., 1., 2., 3., 4., 5., 6., 7., 8., 9.};
+ std::vector<float> output(3);
+ std::vector<float> expected = {1.0};
+
+ EXPECT_ANY_THROW(nnfw::cker::train::MSE(nnfw::cker::Shape{2, 3, 4}, y_pred.data(),
+ nnfw::cker::Shape{2, 3, 4}, y_true.data(),
+ nnfw::cker::Shape{3}, output.data()));
+ }
+
+ {
+    // Different y_pred and y_true shape
+ std::vector<float> y_pred = {1., 2., 3., 4., 5., 6., 7., 8., 9., 10.};
+ std::vector<float> y_true = {0., 1., 2., 3., 4., 5.};
+ std::vector<float> output(1);
+ std::vector<float> expected = {1.0};
+
+ EXPECT_ANY_THROW(nnfw::cker::train::MSE(nnfw::cker::Shape{2, 3, 4}, y_pred.data(),
+ nnfw::cker::Shape{2, 3}, y_true.data(),
+ nnfw::cker::Shape{1}, output.data()));
+ }
+}
+
+TEST(CKer_Operation, LossMSEGrad)
+{
+ {
+ // Shape: {1, 10} -> m_rows:10, m_cols:1
+ std::vector<int> y_pred = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
+ std::vector<int> y_true = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9};
+ std::vector<int> deriv_y_pred(10);
+ std::vector<int> expected = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+
+ nnfw::cker::train::MSEGrad(nnfw::cker::Shape{1, 10}, y_pred.data(), nnfw::cker::Shape{1, 10},
+ y_true.data(), nnfw::cker::Shape{1, 10}, deriv_y_pred.data());
+
+ for (size_t i = 0; i < deriv_y_pred.size(); ++i)
+ EXPECT_EQ(deriv_y_pred[i], expected[i]);
+ }
+
+ {
+ // Shape: {1, 10} -> m_rows:10, m_cols:1
+ std::vector<float> y_pred = {1., 2., 3., 4., 5., 6., 7., 8., 9., 10.};
+ std::vector<float> y_true = {0., 1., 2., 3., 4., 5., 6., 7., 8., 9.};
+ std::vector<float> deriv_y_pred(10);
+ std::vector<float> expected = {0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2};
+
+ nnfw::cker::train::MSEGrad(nnfw::cker::Shape{1, 10}, y_pred.data(), nnfw::cker::Shape{1, 10},
+ y_true.data(), nnfw::cker::Shape{1, 10}, deriv_y_pred.data());
+
+ for (size_t i = 0; i < deriv_y_pred.size(); ++i)
+ EXPECT_FLOAT_EQ(deriv_y_pred[i], expected[i]);
+ }
+
+ {
+ // Shape: {2, 3} -> m_rows:3, m_cols:2
+ std::vector<float> y_pred = {27.2, 31.8, 51.9, 10.2, 34.2, 12.4};
+ std::vector<float> y_true = {31.3, 40.3, 29.7, 12.9, 25.8, 11.9};
+ std::vector<float> deriv_y_pred(6);
+ std::vector<float> expected = {-1.3666667, -2.8333333, 7.4, -0.9, 2.8, 0.1666667};
+
+ nnfw::cker::train::MSEGrad(nnfw::cker::Shape{2, 3}, y_pred.data(), nnfw::cker::Shape{2, 3},
+ y_true.data(), nnfw::cker::Shape{2, 3}, deriv_y_pred.data());
+
+ for (size_t i = 0; i < deriv_y_pred.size(); ++i)
+ EXPECT_FLOAT_EQ(deriv_y_pred[i], expected[i]);
+ }
+}
+
+TEST(CKer_Operation, neg_LossMSEGrad)
+{
+ {
+ // Invalid expected value
+ std::vector<float> y_pred = {27.2, 31.8, 51.9, 10.2, 34.2, 12.4};
+ std::vector<float> y_true = {31.3, 40.3, 29.7, 12.9, 25.8, 11.9};
+ std::vector<float> deriv_y_pred(6);
+ std::vector<float> expected = {1., 1., 1., 1., 1., 1.};
+
+ nnfw::cker::train::MSEGrad(nnfw::cker::Shape{2, 3}, y_pred.data(), nnfw::cker::Shape{2, 3},
+ y_true.data(), nnfw::cker::Shape{2, 3}, deriv_y_pred.data());
+
+ for (size_t i = 0; i < deriv_y_pred.size(); ++i)
+ EXPECT_NE(deriv_y_pred[i], expected[i]);
+ }
+
+ {
+ // Different y_pred and y_true shape
+ std::vector<float> y_pred = {1., 2., 3., 4., 5., 6., 7., 8., 9., 10.};
+ std::vector<float> y_true = {0., 1., 2., 3., 4., 5.};
+ std::vector<float> deriv_y_pred(10);
+
+ EXPECT_ANY_THROW(nnfw::cker::train::MSEGrad(nnfw::cker::Shape{1, 10}, y_pred.data(),
+ nnfw::cker::Shape{2, 3}, y_true.data(),
+ nnfw::cker::Shape{1, 10}, deriv_y_pred.data()));
+ }
+
+ {
+ // Different y_pred and deriv_y_pred shape
+ std::vector<float> y_pred = {1., 2., 3., 4., 5., 6., 7., 8., 9., 10.};
+ std::vector<float> y_true = {0., 1., 2., 3., 4., 5., 6., 7., 8., 9.};
+ std::vector<float> deriv_y_pred(6);
+
+ EXPECT_ANY_THROW(nnfw::cker::train::MSEGrad(nnfw::cker::Shape{1, 10}, y_pred.data(),
+ nnfw::cker::Shape{1, 10}, y_true.data(),
+ nnfw::cker::Shape{2, 3}, deriv_y_pred.data()));
+ }
+}
diff --git a/compute/cker/src/train/Relu.test.cc b/compute/cker/src/train/Relu.test.cc
new file mode 100644
index 000000000..d94411038
--- /dev/null
+++ b/compute/cker/src/train/Relu.test.cc
@@ -0,0 +1,107 @@
+/*
+ * Copyright (c) 2023 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cker/operation/ReLU.h>
+#include <cker/train/operation/ReLU.h>
+
+#include <gtest/gtest.h>
+#include <vector>
+
+namespace
+{
+
+template <typename T> class ReluOpVerifier
+{
+public:
+ ReluOpVerifier(const std::vector<T> &input, const std::vector<T> &expected_output,
+ const std::vector<T> &backprop_output,
+ const std::vector<T> &expected_backprop_input)
+ : _input{input}, _expected_output{expected_output}, _backprop_output{backprop_output},
+ _expected_backprop_input{expected_backprop_input}
+ {
+ EXPECT_TRUE(input.size() == expected_output.size());
+ _output.resize(_expected_output.size());
+ _backprop_input.resize(_expected_backprop_input.size());
+ }
+
+public:
+ void verifyExpected()
+ {
+ nnfw::cker::ReLU(nnfw::cker::Shape{static_cast<int>(_input.size())}, _input.data(),
+ nnfw::cker::Shape{static_cast<int>(_output.size())}, _output.data());
+
+ for (size_t i = 0; i < _output.size(); ++i)
+ ASSERT_EQ(_output[i], _expected_output[i]);
+
+ if (_backprop_output.size() > 0)
+ {
+ nnfw::cker::train::ReLUGrad(
+ nnfw::cker::Shape{static_cast<int>(_output.size())}, _output.data(),
+ nnfw::cker::Shape{static_cast<int>(_backprop_output.size())}, _backprop_output.data(),
+ nnfw::cker::Shape{static_cast<int>(_backprop_input.size())}, _backprop_input.data());
+
+ for (size_t i = 0; i < _backprop_input.size(); ++i)
+ ASSERT_EQ(_backprop_input[i], _expected_backprop_input[i]);
+ }
+ }
+
+private:
+ std::vector<T> _input;
+ std::vector<T> _output;
+ std::vector<T> _expected_output;
+ std::vector<T> _backprop_output;
+ std::vector<T> _backprop_input;
+ std::vector<T> _expected_backprop_input;
+};
+
+} // namespace
+
+TEST(CKer_Operation, ReLU)
+{
+ {
+ std::vector<float> input_forward = {-1, 2, 3, -4};
+ std::vector<float> expected_forward = {0, 2, 3, 0};
+ std::vector<float> incoming_backward = {-5, 6, -7, 8};
+ std::vector<float> expected_backward = {0, 6, -7, 0};
+ ReluOpVerifier<float> verifier{input_forward, expected_forward, incoming_backward,
+ expected_backward};
+ verifier.verifyExpected();
+ }
+
+ {
+ std::vector<float> input_forward = {0, -1, 2, 3, -4, 5, 6, -7};
+ std::vector<float> expected_forward = {0, 0, 2, 3, 0, 5, 6, 0};
+ std::vector<float> incoming_backward = {8, -9, 10, 11, -12, -13, 14, -15};
+ std::vector<float> expected_backward = {0, 0, 10, 11, 0, -13, 14, 0};
+ ReluOpVerifier<float> verifier{input_forward, expected_forward, incoming_backward,
+ expected_backward};
+ verifier.verifyExpected();
+ }
+}
+
+TEST(CKer_Operation, neg_ReLU)
+{
+ {
+ // Unmatched shape
+ std::vector<float> input_forward = {0, -1, 2, 3, -4};
+ std::vector<float> expected_forward = {0, 0, 2, 3, 0};
+ std::vector<float> incoming_backward = {-5, 6, -7, 8};
+ std::vector<float> expected_backward = {0, 6, -7, 0};
+ ReluOpVerifier<float> verifier{input_forward, expected_forward, incoming_backward,
+ expected_backward};
+ EXPECT_ANY_THROW(verifier.verifyExpected());
+ }
+}