Diffstat (limited to 'compute/cker')
85 files changed, 9678 insertions, 1409 deletions
diff --git a/compute/cker/CMakeLists.txt b/compute/cker/CMakeLists.txt index 609dd45a3..d464dccae 100644 --- a/compute/cker/CMakeLists.txt +++ b/compute/cker/CMakeLists.txt @@ -8,15 +8,33 @@ target_link_libraries(nnfw_lib_cker INTERFACE gemmlowp) target_link_libraries(nnfw_lib_cker INTERFACE ruy) target_link_libraries(nnfw_lib_cker INTERFACE ruy_instrumentation) target_compile_definitions(nnfw_lib_cker INTERFACE USE_RUY_GEMV) -if(EXPERIMENTAL_RUY_FEATURE) - target_compile_definitions(nnfw_lib_cker INTERFACE EXPERIMENTAL_RUY_FEATURE) -endif(EXPERIMENTAL_RUY_FEATURE) if(PROFILE_RUY) target_link_libraries(nnfw_lib_cker INTERFACE ruy_profiler) endif(PROFILE_RUY) +if (CMAKE_SYSTEM_PROCESSOR MATCHES "(x86)|(X86)|(amd64)|(AMD64)") + target_compile_definitions(nnfw_lib_cker INTERFACE CKER_X86_PLATFORM) +endif(CMAKE_SYSTEM_PROCESSOR MATCHES "(x86)|(X86)|(amd64)|(AMD64)") + target_include_directories(nnfw_lib_cker INTERFACE ${CMAKE_CURRENT_SOURCE_DIR}/include) # Workaround to avoid warning # TODO Resolve warning target_compile_options(nnfw_lib_cker INTERFACE -Wno-attributes) + +if(NOT ENABLE_TEST) + return() +endif(NOT ENABLE_TEST) + +set(TEST_CKER test_cker) + +file(GLOB_RECURSE TESTS "src/*.test.cc") + +add_executable(${TEST_CKER} ${TESTS}) + +target_link_libraries(${TEST_CKER} nnfw_lib_cker) +target_link_libraries(${TEST_CKER} nnfw_coverage) +target_link_libraries(${TEST_CKER} gtest gtest_main ${LIB_PTHREAD}) + +add_test(${TEST_CKER} ${TEST_CKER}) +install(TARGETS ${TEST_CKER} DESTINATION unittest) diff --git a/compute/cker/include/cker/CpuBackendThreadpool.h b/compute/cker/include/cker/CpuBackendThreadpool.h new file mode 100644 index 000000000..8ec6140bd --- /dev/null +++ b/compute/cker/include/cker/CpuBackendThreadpool.h @@ -0,0 +1,51 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2019 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef __NNFW_CKER_CPU_BACKEND_THREADPOOL_H_ +#define __NNFW_CKER_CPU_BACKEND_THREADPOOL_H_ + +#include <ruy/context.h> // from @ruy +#include <ruy/thread_pool.h> // from @ruy + +#include <stdexcept> + +namespace nnfw +{ +namespace cker +{ +namespace cpu_backend_threadpool +{ + +using Task = ruy::Task; + +template <typename TaskType> +void Execute(int tasks_count, TaskType *tasks, ruy::Context *ruy_context) +{ + assert(ruy_context != nullptr); + assert(tasks_count <= ruy_context->max_num_threads()); + if (ruy_context == nullptr) + { + throw std::runtime_error("CpuBackendThreadpool.h: ruy::Context is null"); + } + ruy_context->mutable_thread_pool()->Execute(tasks_count, tasks); +} + +} // namespace cpu_backend_threadpool +} // namespace cker +} // namespace nnfw + +#endif // __NNFW_CKER_CPU_BACKEND_THREADPOOL_H_ diff --git a/compute/cker/include/cker/NeonTensorUtils.h b/compute/cker/include/cker/NeonTensorUtils.h index 246fd9a46..45ad969c3 100644 --- a/compute/cker/include/cker/NeonTensorUtils.h +++ b/compute/cker/include/cker/NeonTensorUtils.h @@ -20,11 +20,13 @@ #include <ruy/path.h> #include <ruy/ruy.h> -#include <ruy/detect_arm.h> #include "cker/Types.h" #include "cker/neon/neon_check.h" #include "cker/ruy/RuySupport.h" #include "util/logging.h" +#if defined __linux__ && defined __aarch64__ +#include <sys/auxv.h> +#endif #include <cassert> #include <cmath> @@ -41,6 +43,8 @@ namespace cker namespace { +constexpr int kFloatValuesPerNeonVector = 4; + // TODO(ahentz): Clean up. using int8 = std::int8_t; using uint8 = std::uint8_t; @@ -49,6 +53,11 @@ using uint16 = std::uint16_t; using int32 = std::int32_t; using uint32 = std::uint32_t; +template <int PerNeonSize> inline int RoundDownVectors(int size) +{ + return size & ~(PerNeonSize - 1); +} + // Allocates, at least, size bytes of uninitialized storage whose alignment is // specified by alignment. The size parameter must be an integral multiple of // alignment. @@ -73,14 +82,37 @@ inline int32_t AccumulateNeonLane(const int32x4_t lane) } // namespace -#ifdef __aarch64__ +// The implementation of dotprod detection is copied from ruy's internal +// function DetectDotprod(). +// At the moment it's only implemented on Linux ARM64. Consider syncing again +// with ruy in the future to share improvements. +#if defined __linux__ && defined __aarch64__ +inline bool DetectDotprodByLinuxAuxvMethod() +{ + // This is the value of HWCAP_ASIMDDP in sufficiently recent Linux headers, + // however we need to support building against older headers for the time + // being. + const int kLocalHwcapAsimddp = 1 << 20; + return getauxval(AT_HWCAP) & kLocalHwcapAsimddp; +} +#endif + +inline bool DetectArmNeonDotprod() +{ +#if defined __linux__ && defined __aarch64__ + return DetectDotprodByLinuxAuxvMethod(); +#endif -bool HasSdotInstruction() + return false; +} + +inline bool HasSdotInstruction() { - static const bool has_dotprod = ruy::DetectDotprod(); + static const bool has_dotprod = DetectArmNeonDotprod(); return has_dotprod; } +#ifdef __aarch64__ // We interleave vector data to make the dot product logic more efficient. // Suppose that vectors is: // a0 a1 a2 a3 a4 a5 ... @@ -93,13 +125,13 @@ bool HasSdotInstruction() // e0 e1 e2 e3 f0 f1 f2 f3 ... // Once the data is interleaved, each 16-byte read from the vectors pointer // contains 4 bytes from each of 4 vectors. 
-const int8_t *ShuffleVectors(const int8_t *vectors, const int n_batch, const int m_cols, - void **shuffled_vectors_free) +inline const int8_t *ShuffleVectors(const int8_t *vectors, const int n_batch, const int m_cols, + void **shuffled_vectors_free) { const int kWeightsPerUint32 = 4; int8 *shuffled_vectors = reinterpret_cast<int8 *>( - aligned_alloc(kWeightsPerUint32, n_batch * m_cols, shuffled_vectors_free)); + aligned_alloc(kWeightsPerUint32, n_batch * m_cols, shuffled_vectors_free)); for (int i = 0; i < n_batch; i += 4) { @@ -113,25 +145,25 @@ const int8_t *ShuffleVectors(const int8_t *vectors, const int n_batch, const int while (unshuffled_vec0_ptr != end_vec0_ptr) { asm volatile( - // This code path requires that (n_cols % 16) == 0 so we can safely - // read in 16-byte chunks from each row. - "ld1 {v0.16b}, [%[unshuffled_vec0_ptr]], #16\n" - "ld1 {v1.16b}, [%[unshuffled_vec1_ptr]], #16\n" - "ld1 {v2.16b}, [%[unshuffled_vec2_ptr]], #16\n" - "ld1 {v3.16b}, [%[unshuffled_vec3_ptr]], #16\n" - - "st4 {v0.s, v1.s, v2.s, v3.s}[0], [%[shuffled_vectors_ptr]], #16\n" - "st4 {v0.s, v1.s, v2.s, v3.s}[1], [%[shuffled_vectors_ptr]], #16\n" - "st4 {v0.s, v1.s, v2.s, v3.s}[2], [%[shuffled_vectors_ptr]], #16\n" - "st4 {v0.s, v1.s, v2.s, v3.s}[3], [%[shuffled_vectors_ptr]], #16\n" - - : [unshuffled_vec0_ptr] "+r"(unshuffled_vec0_ptr), - [unshuffled_vec1_ptr] "+r"(unshuffled_vec1_ptr), - [unshuffled_vec2_ptr] "+r"(unshuffled_vec2_ptr), - [unshuffled_vec3_ptr] "+r"(unshuffled_vec3_ptr), - [shuffled_vectors_ptr] "+r"(shuffled_vectors_ptr) - : - : "v0", "v1", "v2", "v3", "cc", "memory"); + // This code path requires that (n_cols % 16) == 0 so we can safely + // read in 16-byte chunks from each row. + "ld1 {v0.16b}, [%[unshuffled_vec0_ptr]], #16\n" + "ld1 {v1.16b}, [%[unshuffled_vec1_ptr]], #16\n" + "ld1 {v2.16b}, [%[unshuffled_vec2_ptr]], #16\n" + "ld1 {v3.16b}, [%[unshuffled_vec3_ptr]], #16\n" + + "st4 {v0.s, v1.s, v2.s, v3.s}[0], [%[shuffled_vectors_ptr]], #16\n" + "st4 {v0.s, v1.s, v2.s, v3.s}[1], [%[shuffled_vectors_ptr]], #16\n" + "st4 {v0.s, v1.s, v2.s, v3.s}[2], [%[shuffled_vectors_ptr]], #16\n" + "st4 {v0.s, v1.s, v2.s, v3.s}[3], [%[shuffled_vectors_ptr]], #16\n" + + : [ unshuffled_vec0_ptr ] "+r"(unshuffled_vec0_ptr), + [ unshuffled_vec1_ptr ] "+r"(unshuffled_vec1_ptr), + [ unshuffled_vec2_ptr ] "+r"(unshuffled_vec2_ptr), + [ unshuffled_vec3_ptr ] "+r"(unshuffled_vec3_ptr), + [ shuffled_vectors_ptr ] "+r"(shuffled_vectors_ptr) + : + : "v0", "v1", "v2", "v3", "cc", "memory"); } } @@ -172,104 +204,104 @@ static void DotprodMatrixBatchFourVectorMultiplyAccumulate(const int8_t *__restr const int8 *mat_ptr3 = matrix + ((row + 3) * m_cols); asm volatile( - // Zero out the accumulator registers. - "dup v0.4s, wzr\n" - "dup v1.4s, wzr\n" - "dup v2.4s, wzr\n" - "dup v3.4s, wzr\n" - - "1:\n" // batch_cols_loop - - // Read 16 more bytes from a pair of matrix rows. - "ld1 {v12.16b}, [%[mat_ptr0]], #16\n" - - // Prefetch two rows ahead. - "prfm pldl1strm, [%[mat_ptr2]]\n" - "prfm pldl1strm, [%[mat_ptr3]]\n" - - // Read from input vectors 4 times; 64 bytes total. - // Each 16-byte register contains parts of 4 vectors; see the - // shuffle logic above. 
- - // From Benoit, places to look in the future: - // - Move load instructions further from sdot - // - Switch loop use-then-reload - // - Do partial unrolling to use register space better - "ld1 {v8.16b}, [%[vec_ptr]], #16\n" - ".word 0x4f8ce100 // sdot v0.4s, v8.16b, v12.4b[0]\n" - "ld1 {v9.16b}, [%[vec_ptr]], #16\n" - ".word 0x4face121 // sdot v1.4s, v9.16b, v12.4b[1]\n" - "ld1 {v10.16b}, [%[vec_ptr]], #16\n" - ".word 0x4f8ce940 // sdot v0.4s, v10.16b, v12.4b[2]\n" - "ld1 {v11.16b}, [%[vec_ptr]], #16\n" - ".word 0x4face961 // sdot v1.4s, v11.16b, v12.4b[3]\n" - - // Update prefetch pointers. - "add %[mat_ptr2], %[mat_ptr2], #16\n" - "add %[mat_ptr3], %[mat_ptr3], #16\n" - - // Re-use those vectors for the next row as well. - "ld1 {v13.16b}, [%[mat_ptr1]], #16\n" - ".word 0x4f8de102 // sdot v2.4s, v8.16b, v13.4b[0]\n" - ".word 0x4fade123 // sdot v3.4s, v9.16b, v13.4b[1]\n" - ".word 0x4f8de942 // sdot v2.4s, v10.16b, v13.4b[2]\n" - ".word 0x4fade963 // sdot v3.4s, v11.16b, v13.4b[3]\n" - - // If we're not done with these rows, continue. - "cmp %[mat_ptr0], %[mat_ptr0_end]\n" - "bne 1b\n" // batch_cols_loop - - // Done with the rows, sum the results. - "add v0.4s, v0.4s, v1.4s\n" - "add v2.4s, v2.4s, v3.4s\n" - - // Convert the per-vector sums to floating point. - "scvtf v0.4s, v0.4s\n" - "scvtf v1.4s, v2.4s\n" - - // Fetch scale factors. - "ld1 {v4.4s}, [%[scaling_factors_ptr]]\n" - - // Multiply scale factors times sums. - "fmul v0.4s, v4.4s, v0.4s\n" - "fmul v1.4s, v4.4s, v1.4s\n" - - // Load previous result values. - // The result position is: - // result[batch * m_rows + row] - // Here that is factored into: - // result_ptr = result + row - // *result_ptr = res[0] - // (uint8*)result_ptr += (m_rows * sizeof(float)) - // *result_ptr = res[1] - // ... - // Since we're reading two rows at a time, though, we read both - // result[batch * m_rows + row] - // and - // result[batch * m_rows + row + 1] - "ld2 {v9.s, v10.s}[0], [%[result_ptr]], %[wide_rows]\n" - "ld2 {v9.s, v10.s}[1], [%[result_ptr]], %[wide_rows]\n" - "ld2 {v9.s, v10.s}[2], [%[result_ptr]], %[wide_rows]\n" - "ld2 {v9.s, v10.s}[3], [%[result_ptr]], %[wide_rows]\n" - - // Go back to the starting position (subtract wide_rows * 4). - "sub %[result_ptr], %[result_ptr], %[wide_rows], lsl #2\n" - - // Add previous result values. - "fadd v9.4s, v9.4s, v0.4s\n" - "fadd v10.4s, v10.4s, v1.4s\n" - - // Store results. - "st2 {v9.s, v10.s}[0], [%[result_ptr]], %[wide_rows]\n" - "st2 {v9.s, v10.s}[1], [%[result_ptr]], %[wide_rows]\n" - "st2 {v9.s, v10.s}[2], [%[result_ptr]], %[wide_rows]\n" - "st2 {v9.s, v10.s}[3], [%[result_ptr]], %[wide_rows]\n" - : [mat_ptr0] "+r"(mat_ptr0), [mat_ptr1] "+r"(mat_ptr1), [vec_ptr] "+r"(vec_ptr), - [result_ptr] "+r"(result_ptr), [mat_ptr2] "+r"(mat_ptr2), [mat_ptr3] "+r"(mat_ptr3) - : [mat_ptr0_end] "r"(mat_ptr0_end), [scaling_factors_ptr] "r"(scaling_factors_ptr), - [wide_rows] "r"(wide_rows) - : "x0", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", - "v13", "cc", "memory"); + // Zero out the accumulator registers. + "dup v0.4s, wzr\n" + "dup v1.4s, wzr\n" + "dup v2.4s, wzr\n" + "dup v3.4s, wzr\n" + + "1:\n" // batch_cols_loop + + // Read 16 more bytes from a pair of matrix rows. + "ld1 {v12.16b}, [%[mat_ptr0]], #16\n" + + // Prefetch two rows ahead. + "prfm pldl1strm, [%[mat_ptr2]]\n" + "prfm pldl1strm, [%[mat_ptr3]]\n" + + // Read from input vectors 4 times; 64 bytes total. + // Each 16-byte register contains parts of 4 vectors; see the + // shuffle logic above. 
+ + // From Benoit, places to look in the future: + // - Move load instructions further from sdot + // - Switch loop use-then-reload + // - Do partial unrolling to use register space better + "ld1 {v8.16b}, [%[vec_ptr]], #16\n" + ".word 0x4f8ce100 // sdot v0.4s, v8.16b, v12.4b[0]\n" + "ld1 {v9.16b}, [%[vec_ptr]], #16\n" + ".word 0x4face121 // sdot v1.4s, v9.16b, v12.4b[1]\n" + "ld1 {v10.16b}, [%[vec_ptr]], #16\n" + ".word 0x4f8ce940 // sdot v0.4s, v10.16b, v12.4b[2]\n" + "ld1 {v11.16b}, [%[vec_ptr]], #16\n" + ".word 0x4face961 // sdot v1.4s, v11.16b, v12.4b[3]\n" + + // Update prefetch pointers. + "add %[mat_ptr2], %[mat_ptr2], #16\n" + "add %[mat_ptr3], %[mat_ptr3], #16\n" + + // Re-use those vectors for the next row as well. + "ld1 {v13.16b}, [%[mat_ptr1]], #16\n" + ".word 0x4f8de102 // sdot v2.4s, v8.16b, v13.4b[0]\n" + ".word 0x4fade123 // sdot v3.4s, v9.16b, v13.4b[1]\n" + ".word 0x4f8de942 // sdot v2.4s, v10.16b, v13.4b[2]\n" + ".word 0x4fade963 // sdot v3.4s, v11.16b, v13.4b[3]\n" + + // If we're not done with these rows, continue. + "cmp %[mat_ptr0], %[mat_ptr0_end]\n" + "bne 1b\n" // batch_cols_loop + + // Done with the rows, sum the results. + "add v0.4s, v0.4s, v1.4s\n" + "add v2.4s, v2.4s, v3.4s\n" + + // Convert the per-vector sums to floating point. + "scvtf v0.4s, v0.4s\n" + "scvtf v1.4s, v2.4s\n" + + // Fetch scale factors. + "ld1 {v4.4s}, [%[scaling_factors_ptr]]\n" + + // Multiply scale factors times sums. + "fmul v0.4s, v4.4s, v0.4s\n" + "fmul v1.4s, v4.4s, v1.4s\n" + + // Load previous result values. + // The result position is: + // result[batch * m_rows + row] + // Here that is factored into: + // result_ptr = result + row + // *result_ptr = res[0] + // (uint8*)result_ptr += (m_rows * sizeof(float)) + // *result_ptr = res[1] + // ... + // Since we're reading two rows at a time, though, we read both + // result[batch * m_rows + row] + // and + // result[batch * m_rows + row + 1] + "ld2 {v9.s, v10.s}[0], [%[result_ptr]], %[wide_rows]\n" + "ld2 {v9.s, v10.s}[1], [%[result_ptr]], %[wide_rows]\n" + "ld2 {v9.s, v10.s}[2], [%[result_ptr]], %[wide_rows]\n" + "ld2 {v9.s, v10.s}[3], [%[result_ptr]], %[wide_rows]\n" + + // Go back to the starting position (subtract wide_rows * 4). + "sub %[result_ptr], %[result_ptr], %[wide_rows], lsl #2\n" + + // Add previous result values. + "fadd v9.4s, v9.4s, v0.4s\n" + "fadd v10.4s, v10.4s, v1.4s\n" + + // Store results. 
+ "st2 {v9.s, v10.s}[0], [%[result_ptr]], %[wide_rows]\n" + "st2 {v9.s, v10.s}[1], [%[result_ptr]], %[wide_rows]\n" + "st2 {v9.s, v10.s}[2], [%[result_ptr]], %[wide_rows]\n" + "st2 {v9.s, v10.s}[3], [%[result_ptr]], %[wide_rows]\n" + : [ mat_ptr0 ] "+r"(mat_ptr0), [ mat_ptr1 ] "+r"(mat_ptr1), [ vec_ptr ] "+r"(vec_ptr), + [ result_ptr ] "+r"(result_ptr), [ mat_ptr2 ] "+r"(mat_ptr2), [ mat_ptr3 ] "+r"(mat_ptr3) + : [ mat_ptr0_end ] "r"(mat_ptr0_end), [ scaling_factors_ptr ] "r"(scaling_factors_ptr), + [ wide_rows ] "r"(wide_rows) + : "x0", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", + "v13", "cc", "memory"); } } @@ -277,9 +309,9 @@ static void DotprodMatrixBatchFourVectorMultiplyAccumulate(const int8_t *__restr } static void DotprodMatrixBatchFourVectorMultiplyAccumulate( - const int8_t *__restrict__ matrix, const int m_rows, const int m_cols, const int8_t *vectors, - const float *scaling_factors, int n_batch, float *__restrict__ result, - const float *per_channel_scale, const int32_t *input_offset, int32_t *row_sums) + const int8_t *__restrict__ matrix, const int m_rows, const int m_cols, const int8_t *vectors, + const float *scaling_factors, int n_batch, float *__restrict__ result, + const float *per_channel_scale, const int32_t *input_offset, int32_t *row_sums) { void *shuffled_vectors_free; const int8_t *shuffled_vectors = ShuffleVectors(vectors, n_batch, m_cols, &shuffled_vectors_free); @@ -300,102 +332,102 @@ static void DotprodMatrixBatchFourVectorMultiplyAccumulate( const int32_t *batch_offsets_ptr = input_offset + batch; const int32_t is_channel_scale_nullptr = per_channel_scale == nullptr; const int32_t is_row_sums_nullptr = row_sums_ptr == nullptr; - asm volatile("dup v0.4s, wzr\n" - "dup v1.4s, wzr\n" - "dup v2.4s, wzr\n" - "dup v3.4s, wzr\n" - // Load zero points. - "ld1 {v7.4s}, [%[batch_offsets_ptr]]\n" - "ld1 {v4.4s}, [%[scaling_factors_ptr]]\n" - // Zero out zero point accumulators. - "dup v14.4s, wzr\n" - "dup v15.4s, wzr\n" - - // Load per channel scales if not null. - "cmp %w[is_channel_scale_nullptr], #0\n" - "bne 1f\n" - "ld1r {v16.4s}, [%[channel_scales_ptr]], #4\n" - "ld1r {v17.4s}, [%[channel_scales_ptr]]\n" - "fmul v16.4s, v16.4s, v4.4s\n" - "fmul v17.4s, v17.4s, v4.4s\n" - "b 2f\n" - "1:\n" - "mov v16.16b, v4.16b\n" - "mov v17.16b, v4.16b\n" - "2:\n" - "ld1 {v12.16b}, [%[mat_ptr0]], #16\n" - "ld1 {v8.16b}, [%[vec_ptr]], #16\n" - ".word 0x4f8ce100 // sdot v0.4s, v8.16b, v12.4b[0]\n" - "ld1 {v9.16b}, [%[vec_ptr]], #16\n" - ".word 0x4face121 // sdot v1.4s, v9.16b, v12.4b[1]\n" - "ld1 {v10.16b}, [%[vec_ptr]], #16\n" - ".word 0x4f8ce940 // sdot v0.4s, v10.16b, v12.4b[2]\n" - "ld1 {v11.16b}, [%[vec_ptr]], #16\n" - ".word 0x4face961 // sdot v1.4s, v11.16b, v12.4b[3]\n" - "ld1 {v13.16b}, [%[mat_ptr1]], #16\n" - ".word 0x4f8de102 // sdot v2.4s, v8.16b, v13.4b[0]\n" - ".word 0x4fade123 // sdot v3.4s, v9.16b, v13.4b[1]\n" - ".word 0x4f8de942 // sdot v2.4s, v10.16b, v13.4b[2]\n" - ".word 0x4fade963 // sdot v3.4s, v11.16b, v13.4b[3]\n" - "cmp %w[is_row_sums_nullptr], #1\n" - "bne 3f\n" - // Accumulate row_sums for zero point calculations. - "saddlp v12.8h, v12.16b\n" - "saddlp v13.8h, v13.16b\n" - "sadalp v14.4s, v12.8h\n" - "sadalp v15.4s, v13.8h\n" - "3:\n" - "cmp %[mat_ptr0], %[mat_ptr0_end]\n" - "bne 2b\n" - "add v0.4s, v0.4s, v1.4s\n" - "add v2.4s, v2.4s, v3.4s\n" - - "cmp %w[is_row_sums_nullptr], #1\n" - "bne 4f\n" - // Calculate zero point offsets. 
- "addv s14, v14.4s\n" - "addv s15, v15.4s\n" - "dup v14.4s, v14.s[0]\n" - "dup v15.4s, v15.s[0]\n" - "b 5f\n" - "4:\n" - "ld1r {v14.4s}, [%[row_sums_ptr]], #4\n" - "ld1r {v15.4s}, [%[row_sums_ptr]]\n" - "5:\n" - - "mul v14.4s, v14.4s, v7.4s\n" - "mul v15.4s, v15.4s, v7.4s\n" - "sub v0.4s, v0.4s, v14.4s\n" - "sub v2.4s, v2.4s, v15.4s\n" - - "scvtf v0.4s, v0.4s\n" - "scvtf v1.4s, v2.4s\n" - - // Multiply scale. - "fmul v0.4s, v16.4s, v0.4s\n" - "fmul v1.4s, v17.4s, v1.4s\n" - - "ld2 {v9.s, v10.s}[0], [%[result_ptr]], %[wide_rows]\n" - "ld2 {v9.s, v10.s}[1], [%[result_ptr]], %[wide_rows]\n" - "ld2 {v9.s, v10.s}[2], [%[result_ptr]], %[wide_rows]\n" - "ld2 {v9.s, v10.s}[3], [%[result_ptr]], %[wide_rows]\n" - "sub %[result_ptr], %[result_ptr], %[wide_rows], lsl #2\n" - "fadd v9.4s, v9.4s, v0.4s\n" - "fadd v10.4s, v10.4s, v1.4s\n" - "st2 {v9.s, v10.s}[0], [%[result_ptr]], %[wide_rows]\n" - "st2 {v9.s, v10.s}[1], [%[result_ptr]], %[wide_rows]\n" - "st2 {v9.s, v10.s}[2], [%[result_ptr]], %[wide_rows]\n" - "st2 {v9.s, v10.s}[3], [%[result_ptr]], %[wide_rows]\n" - : [mat_ptr0] "+r"(mat_ptr0), [mat_ptr1] "+r"(mat_ptr1), [vec_ptr] "+r"(vec_ptr), - [result_ptr] "+r"(result_ptr), [row_sums_ptr] "+r"(row_sums_ptr) - : [mat_ptr0_end] "r"(mat_ptr0_end), - [scaling_factors_ptr] "r"(scaling_factors_ptr), [wide_rows] "r"(wide_rows), - [channel_scales_ptr] "r"(channel_scales_ptr), - [batch_offsets_ptr] "r"(batch_offsets_ptr), - [is_channel_scale_nullptr] "r"(is_channel_scale_nullptr), - [is_row_sums_nullptr] "r"(is_row_sums_nullptr) - : "x0", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", - "v12", "v13", "v14", "v15", "v16", "v17", "w0", "w1", "cc", "memory"); + asm volatile( + "dup v0.4s, wzr\n" + "dup v1.4s, wzr\n" + "dup v2.4s, wzr\n" + "dup v3.4s, wzr\n" + // Load zero points. + "ld1 {v7.4s}, [%[batch_offsets_ptr]]\n" + "ld1 {v4.4s}, [%[scaling_factors_ptr]]\n" + // Zero out zero point accumulators. + "dup v14.4s, wzr\n" + "dup v15.4s, wzr\n" + + // Load per channel scales if not null. + "cmp %w[is_channel_scale_nullptr], #0\n" + "bne 1f\n" + "ld1r {v16.4s}, [%[channel_scales_ptr]], #4\n" + "ld1r {v17.4s}, [%[channel_scales_ptr]]\n" + "fmul v16.4s, v16.4s, v4.4s\n" + "fmul v17.4s, v17.4s, v4.4s\n" + "b 2f\n" + "1:\n" + "mov v16.16b, v4.16b\n" + "mov v17.16b, v4.16b\n" + "2:\n" + "ld1 {v12.16b}, [%[mat_ptr0]], #16\n" + "ld1 {v8.16b}, [%[vec_ptr]], #16\n" + ".word 0x4f8ce100 // sdot v0.4s, v8.16b, v12.4b[0]\n" + "ld1 {v9.16b}, [%[vec_ptr]], #16\n" + ".word 0x4face121 // sdot v1.4s, v9.16b, v12.4b[1]\n" + "ld1 {v10.16b}, [%[vec_ptr]], #16\n" + ".word 0x4f8ce940 // sdot v0.4s, v10.16b, v12.4b[2]\n" + "ld1 {v11.16b}, [%[vec_ptr]], #16\n" + ".word 0x4face961 // sdot v1.4s, v11.16b, v12.4b[3]\n" + "ld1 {v13.16b}, [%[mat_ptr1]], #16\n" + ".word 0x4f8de102 // sdot v2.4s, v8.16b, v13.4b[0]\n" + ".word 0x4fade123 // sdot v3.4s, v9.16b, v13.4b[1]\n" + ".word 0x4f8de942 // sdot v2.4s, v10.16b, v13.4b[2]\n" + ".word 0x4fade963 // sdot v3.4s, v11.16b, v13.4b[3]\n" + "cmp %w[is_row_sums_nullptr], #1\n" + "bne 3f\n" + // Accumulate row_sums for zero point calculations. + "saddlp v12.8h, v12.16b\n" + "saddlp v13.8h, v13.16b\n" + "sadalp v14.4s, v12.8h\n" + "sadalp v15.4s, v13.8h\n" + "3:\n" + "cmp %[mat_ptr0], %[mat_ptr0_end]\n" + "bne 2b\n" + "add v0.4s, v0.4s, v1.4s\n" + "add v2.4s, v2.4s, v3.4s\n" + + "cmp %w[is_row_sums_nullptr], #1\n" + "bne 4f\n" + // Calculate zero point offsets. 
+ "addv s14, v14.4s\n" + "addv s15, v15.4s\n" + "dup v14.4s, v14.s[0]\n" + "dup v15.4s, v15.s[0]\n" + "b 5f\n" + "4:\n" + "ld1r {v14.4s}, [%[row_sums_ptr]], #4\n" + "ld1r {v15.4s}, [%[row_sums_ptr]]\n" + "5:\n" + + "mul v14.4s, v14.4s, v7.4s\n" + "mul v15.4s, v15.4s, v7.4s\n" + "sub v0.4s, v0.4s, v14.4s\n" + "sub v2.4s, v2.4s, v15.4s\n" + + "scvtf v0.4s, v0.4s\n" + "scvtf v1.4s, v2.4s\n" + + // Multiply scale. + "fmul v0.4s, v16.4s, v0.4s\n" + "fmul v1.4s, v17.4s, v1.4s\n" + + "ld2 {v9.s, v10.s}[0], [%[result_ptr]], %[wide_rows]\n" + "ld2 {v9.s, v10.s}[1], [%[result_ptr]], %[wide_rows]\n" + "ld2 {v9.s, v10.s}[2], [%[result_ptr]], %[wide_rows]\n" + "ld2 {v9.s, v10.s}[3], [%[result_ptr]], %[wide_rows]\n" + "sub %[result_ptr], %[result_ptr], %[wide_rows], lsl #2\n" + "fadd v9.4s, v9.4s, v0.4s\n" + "fadd v10.4s, v10.4s, v1.4s\n" + "st2 {v9.s, v10.s}[0], [%[result_ptr]], %[wide_rows]\n" + "st2 {v9.s, v10.s}[1], [%[result_ptr]], %[wide_rows]\n" + "st2 {v9.s, v10.s}[2], [%[result_ptr]], %[wide_rows]\n" + "st2 {v9.s, v10.s}[3], [%[result_ptr]], %[wide_rows]\n" + : [ mat_ptr0 ] "+r"(mat_ptr0), [ mat_ptr1 ] "+r"(mat_ptr1), [ vec_ptr ] "+r"(vec_ptr), + [ result_ptr ] "+r"(result_ptr), [ row_sums_ptr ] "+r"(row_sums_ptr) + : [ mat_ptr0_end ] "r"(mat_ptr0_end), [ scaling_factors_ptr ] "r"(scaling_factors_ptr), + [ wide_rows ] "r"(wide_rows), [ channel_scales_ptr ] "r"(channel_scales_ptr), + [ batch_offsets_ptr ] "r"(batch_offsets_ptr), + [ is_channel_scale_nullptr ] "r"(is_channel_scale_nullptr), + [ is_row_sums_nullptr ] "r"(is_row_sums_nullptr) + : "x0", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", + "v13", "v14", "v15", "v16", "v17", "w0", "w1", "cc", "memory"); } } @@ -425,10 +457,10 @@ static void DotprodMatrixBatchFourVectorMultiplyAccumulate( // // We don't use this kernel when n_batch = 1 because the baseline kernel // is fine for that case. 
-void DotprodMatrixBatchPaddedFourVectorMultiplyAccumulate( - const int8_t *__restrict__ matrix, const int m_rows, const int m_cols, const int8_t *vectors, - const float *scaling_factors, int n_batch, float *__restrict__ result, - const float *per_channel_scale, const int32_t *input_offset, int32_t *row_sums) +inline void DotprodMatrixBatchPaddedFourVectorMultiplyAccumulate( + const int8_t *__restrict__ matrix, const int m_rows, const int m_cols, const int8_t *vectors, + const float *scaling_factors, int n_batch, float *__restrict__ result, + const float *per_channel_scale, const int32_t *input_offset, int32_t *row_sums) { const int kWeightsPerUint32 = 4; @@ -443,14 +475,14 @@ void DotprodMatrixBatchPaddedFourVectorMultiplyAccumulate( void *padded_vectors_free; const int padded_vectors_size = batch_round_up * m_cols; int8_t *padded_vectors = reinterpret_cast<int8_t *>( - aligned_alloc(kWeightsPerUint32, padded_vectors_size, &padded_vectors_free)); + aligned_alloc(kWeightsPerUint32, padded_vectors_size, &padded_vectors_free)); memset(padded_vectors, 0, padded_vectors_size); void *padded_result_free; const int result_size = n_batch * m_rows * sizeof(float); const int padded_result_size = batch_round_up * m_rows * sizeof(float); float *padded_result = reinterpret_cast<float *>( - aligned_alloc(kWeightsPerUint32, padded_result_size, &padded_result_free)); + aligned_alloc(kWeightsPerUint32, padded_result_size, &padded_result_free)); memcpy(padded_result, result, result_size); memset(reinterpret_cast<char *>(padded_result) + result_size, 0, padded_result_size - result_size); @@ -462,7 +494,7 @@ void DotprodMatrixBatchPaddedFourVectorMultiplyAccumulate( void *padded_scaling_factors_free; const int padded_scaling_factors_size = batch_round_up * sizeof(float); float *padded_scaling_factors = reinterpret_cast<float *>( - aligned_alloc(kWeightsPerUint32, padded_scaling_factors_size, &padded_scaling_factors_free)); + aligned_alloc(kWeightsPerUint32, padded_scaling_factors_size, &padded_scaling_factors_free)); assert(static_cast<int>(n_batch * sizeof(float)) <= padded_scaling_factors_size); assert(static_cast<int>(batch_round_up * sizeof(float)) <= padded_scaling_factors_size); memset(padded_scaling_factors, 0, batch_round_up * sizeof(float)); @@ -473,7 +505,7 @@ void DotprodMatrixBatchPaddedFourVectorMultiplyAccumulate( void *padded_input_offset_free; const int padded_input_offset_size = batch_round_up * sizeof(int32_t); int32_t *padded_input_offset = reinterpret_cast<int32_t *>( - aligned_alloc(kWeightsPerUint32, padded_input_offset_size, &padded_input_offset_free)); + aligned_alloc(kWeightsPerUint32, padded_input_offset_size, &padded_input_offset_free)); assert(static_cast<int>(n_batch * sizeof(int32_t)) <= padded_input_offset_size); assert(static_cast<int>(batch_round_up * sizeof(int32_t)) <= padded_input_offset_size); memset(padded_input_offset, 0, batch_round_up * sizeof(int32_t)); @@ -481,8 +513,8 @@ void DotprodMatrixBatchPaddedFourVectorMultiplyAccumulate( // Call the main kernel. 
DotprodMatrixBatchFourVectorMultiplyAccumulate( - matrix, m_rows, m_cols, padded_vectors, padded_scaling_factors, batch_round_up, - padded_result, per_channel_scale, padded_input_offset, row_sums); + matrix, m_rows, m_cols, padded_vectors, padded_scaling_factors, batch_round_up, padded_result, + per_channel_scale, padded_input_offset, row_sums); free(padded_input_offset_free); } @@ -500,20 +532,40 @@ void DotprodMatrixBatchPaddedFourVectorMultiplyAccumulate( free(padded_scaling_factors_free); } -void DotprodMatrixBatchPaddedFourVectorMultiplyAccumulate(const int8_t *__restrict__ matrix, - const int m_rows, const int m_cols, - const int8_t *vectors, - const float *scaling_factors, int n_batch, - float *__restrict__ result) +inline void DotprodMatrixBatchPaddedFourVectorMultiplyAccumulate( + const int8_t *__restrict__ matrix, const int m_rows, const int m_cols, const int8_t *vectors, + const float *scaling_factors, int n_batch, float *__restrict__ result) { DotprodMatrixBatchPaddedFourVectorMultiplyAccumulate( - matrix, m_rows, m_cols, vectors, scaling_factors, n_batch, result, - /*per_channel_scale=*/nullptr, /*input_offset=*/nullptr, - /*row_sums=*/nullptr); + matrix, m_rows, m_cols, vectors, scaling_factors, n_batch, result, + /*per_channel_scale=*/nullptr, /*input_offset=*/nullptr, + /*row_sums=*/nullptr); } #endif // __aarch64__ -bool NeonIsZeroVector(const float *vector, int v_size) +inline void NeonCwiseClipping(float *vector, const int v_size, const float clipping_value) +{ + const float32x4_t clipping_value_f32x4 = vmovq_n_f32(clipping_value); + const float32x4_t neg_clipping_value_f32x4 = vmovq_n_f32(-clipping_value); + + int i = 0; + for (; i <= v_size - kFloatValuesPerNeonVector; i += kFloatValuesPerNeonVector) + { + // Load from memory to vector. + float32x4_t v_f32x4 = vld1q_f32(vector + i); + // Clip between clipping_value and -clipping_value. + v_f32x4 = vminq_f32(clipping_value_f32x4, v_f32x4); + v_f32x4 = vmaxq_f32(neg_clipping_value_f32x4, v_f32x4); + // Save to output. + vst1q_f32(vector + i, v_f32x4); + } + for (; i < v_size; i++) + { + vector[i] = std::max(std::min(clipping_value, vector[i]), -clipping_value); + } +} + +inline bool NeonIsZeroVector(const float *vector, int v_size) { // If v_size is not divisible by kFloatWeightsPerNeonLane, we cannot // use the main vectorized loop, and we need to process sequentially. 
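[Not part of the patch — a minimal sketch of the round-down/postamble pattern that the new NEON helpers above (NeonCwiseClipping, NeonSub1Vector) share, assuming the patch's kFloatValuesPerNeonVector and RoundDownVectors plus <arm_neon.h> are in scope; the function name and the negate operation are illustrative only.]

// Illustration: RoundDownVectors<4>(size) clears the low two bits, e.g.
// RoundDownVectors<4>(10) == 8, so elements [0, 8) are handled four at a time
// and elements 8..9 fall into the scalar postamble.
inline void NegateWithPostamble(const float *in, float *out, int v_size) // illustrative name
{
  const int postamble_start = RoundDownVectors<kFloatValuesPerNeonVector>(v_size);
  int i = 0;
  for (; i < postamble_start; i += kFloatValuesPerNeonVector)
  {
    // Vectorized body: process 4 floats per iteration.
    vst1q_f32(out + i, vnegq_f32(vld1q_f32(in + i)));
  }
  for (; i < v_size; ++i)
  {
    // Scalar postamble for the remaining v_size - postamble_start elements.
    out[i] = -in[i];
  }
}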
@@ -544,15 +596,16 @@ bool NeonIsZeroVector(const float *vector, int v_size) return true; } -void NeonCpuBackendGemm(const int8_t *input, const int32_t *bias, - const int8_t *input_to_gate_weights, int32_t n_batch, int32_t n_input, - int32_t n_output, int32_t, int32_t *scratch, ruy::Context *ruy_context) +inline void NeonCpuBackendGemm(const int8_t *input, const int32_t *bias, + const int8_t *input_to_gate_weights, int32_t n_batch, + int32_t n_input, int32_t n_output, int32_t, int32_t *scratch, + ruy::Context *ruy_context) { MatrixParams<int8_t> lhs_params; lhs_params.order = Order::kRowMajor; lhs_params.rows = n_output; lhs_params.cols = n_input; - lhs_params.cacheable = true; + lhs_params.cache_policy = CachePolicy::kAlwaysCache; MatrixParams<int8_t> rhs_params; rhs_params.order = Order::kColMajor; @@ -574,19 +627,44 @@ void NeonCpuBackendGemm(const int8_t *input, const int32_t *bias, ruy::Matrix<int8_t> ruy_lhs; ruy::Matrix<int8_t> ruy_rhs; ruy::Matrix<int32_t> ruy_dst; - ruy_support::MakeRuyMatrix(lhs_params, input_to_gate_weights, &ruy_lhs); - ruy_support::MakeRuyMatrix(rhs_params, input, &ruy_rhs); + // Note that cache is always enabled for input and weight tensors + ruy_support::MakeRuyMatrix(lhs_params, input_to_gate_weights, &ruy_lhs, true); + ruy_support::MakeRuyMatrix(rhs_params, input, &ruy_rhs, true); ruy_support::MakeRuyMatrix(dst_params, scratch, &ruy_dst); - ruy::BasicSpec<int32_t, int32_t> ruy_spec; - ruy_support::MakeRuySpec(gemm_params, &ruy_spec); + ruy::MulParams<int32_t, int32_t> ruy_mul_params; + ruy_support::MakeRuyMulParams(gemm_params, &ruy_mul_params); - constexpr ruy::Path kRuyPath = ruy::kAllPaths; - ruy::Mul<kRuyPath>(ruy_lhs, ruy_rhs, ruy_spec, ruy_context, &ruy_dst); + ruy::Mul(ruy_lhs, ruy_rhs, ruy_mul_params, ruy_context, &ruy_dst); +} + +inline void NeonSub1Vector(const float *vector, int v_size, float *result) +{ + // If v_size is not divisible by the vector size, then we need to process the + // final few elements sequentially. postamble_start shows the start index + // where this should happen. + const int postamble_start = RoundDownVectors<kFloatValuesPerNeonVector>(v_size); + + float32x4_t one_f32x4 = vmovq_n_f32(1.0); + int v = 0; + for (; v < postamble_start; v += kFloatValuesPerNeonVector) + { + // Load 4 float values from the current pointers of the input column and + // subtract from 1. + float32x4_t v_f32x4 = vld1q_f32(vector + v); + float32x4_t result_f32x4 = vsubq_f32(one_f32x4, v_f32x4); + // Save to output. + vst1q_f32(result + v, result_f32x4); + } + for (; v < v_size; v++) + { + result[v] = 1.0f - vector[v]; + } } -void NeonSymmetricQuantizeFloats(const float *values, const int size, int8_t *quantized_values, - float *min, float *max, float *scaling_factor) +inline void NeonSymmetricQuantizeFloats(const float *values, const int size, + int8_t *quantized_values, float *min, float *max, + float *scaling_factor) { // TODO(raziel): vectorize min/max calculation. 
auto minmax = std::minmax_element(values, values + size); @@ -658,15 +736,16 @@ void NeonSymmetricQuantizeFloats(const float *values, const int size, int8_t *qu for (int i = postamble_start; i < size; ++i) { const int32_t quantized_value = - static_cast<int32_t>(std::round(scaling_factor_inv * values[i])); + static_cast<int32_t>(std::round(scaling_factor_inv * values[i])); quantized_values[i] = std::min(kScale, std::max(-kScale, quantized_value)); } } -void NeonMatrixBatchVectorMultiplyAccumulate(const int8_t *__restrict__ matrix, const int m_rows, - const int m_cols, const int8_t *__restrict__ vectors, - const float *scaling_factors, int n_batch, - float *__restrict__ result, int result_stride) +inline void NeonMatrixBatchVectorMultiplyAccumulate(const int8_t *__restrict__ matrix, + const int m_rows, const int m_cols, + const int8_t *__restrict__ vectors, + const float *scaling_factors, int n_batch, + float *__restrict__ result, int result_stride) { #ifdef __aarch64__ if (HasSdotInstruction() && m_cols % 16 == 0 && m_rows % 2 == 0 && m_rows >= n_batch) @@ -751,7 +830,7 @@ void NeonMatrixBatchVectorMultiplyAccumulate(const int8_t *__restrict__ matrix, // Here the assumption is that each buffer is 4-byte aligned. Otherwise, // performance may suffer significantly. assert( // NOLINT - ((uintptr_t)(&row_ptr[col]) & (kWeightsPerUint32 - 1)) == 0); + ((uintptr_t)(&row_ptr[col]) & (kWeightsPerUint32 - 1)) == 0); const int8x16_t s1_8x16 = vld1q_s8((const int8_t *)(aligned_vec + col)); const int8x16_t s2_8x16 = vld1q_s8((const int8_t *)(row_ptr + col)); // Multiply the low bits (i.e. the lower 8 8bit numbers in the @@ -776,7 +855,7 @@ void NeonMatrixBatchVectorMultiplyAccumulate(const int8_t *__restrict__ matrix, // Here the assumption is that each buffer is 4-bytes aligned. // Otherwise, performance may suffer significantly. assert( // NOLINT - ((uintptr_t)(&row_ptr[col]) & (kWeightsPerUint32 - 1)) == 0); + ((uintptr_t)(&row_ptr[col]) & (kWeightsPerUint32 - 1)) == 0); const int8x8_t s1_8x8 = vld1_s8((const int8_t *)(aligned_vec + col)); const int8x8_t s2_8x8 = vld1_s8((const int8_t *)(row_ptr + col)); const int16x8_t prod_16x8 = vmull_s8(s1_8x8, s2_8x8); @@ -804,9 +883,9 @@ void NeonMatrixBatchVectorMultiplyAccumulate(const int8_t *__restrict__ matrix, free(aligned_vec_free); } -void NeonMatrixBatchVectorMultiplyAccumulate(const float *matrix, int m_rows, int m_cols, - const float *vector, int n_batch, float *result, - int result_stride) +inline void NeonMatrixBatchVectorMultiplyAccumulate(const float *matrix, int m_rows, int m_cols, + const float *vector, int n_batch, float *result, + int result_stride) { // If v_size is not divisible by kWeightsPerNeonLane, we cannot use the main // vectorized loop, and we need to process sequentially. 
postamble_start shows @@ -845,11 +924,12 @@ void NeonMatrixBatchVectorMultiplyAccumulate(const float *matrix, int m_rows, in } } -void NeonMatrixBatchVectorMultiplyAccumulate(const int8_t *__restrict__ matrix, const int m_rows, - const int m_cols, const int8_t *__restrict__ vectors, - const float *scaling_factors, int n_batch, - int32_t *scratch, float *__restrict__ result, - int result_stride, ruy::Context *ruy_context) +inline void NeonMatrixBatchVectorMultiplyAccumulate(const int8_t *__restrict__ matrix, + const int m_rows, const int m_cols, + const int8_t *__restrict__ vectors, + const float *scaling_factors, int n_batch, + int32_t *scratch, float *__restrict__ result, + int result_stride, ruy::Context *ruy_context) { if (m_rows % 4 == 0 && result_stride == 1) { @@ -872,7 +952,7 @@ void NeonMatrixBatchVectorMultiplyAccumulate(const int8_t *__restrict__ matrix, const float32x4_t float_val1 = vcvtq_f32_s32(scratch_val1); const float32x4_t result0 = vmlaq_f32(vld1q_f32(result), float_val0, scaling_factor0); const float32x4_t result1 = - vmlaq_f32(vld1q_f32(result + 4 * result_stride), float_val1, scaling_factor1); + vmlaq_f32(vld1q_f32(result + 4 * result_stride), float_val1, scaling_factor1); vst1q_f32(result, result0); vst1q_f32(result + 4 * result_stride, result1); } diff --git a/compute/cker/include/cker/PortableTensorUtils.h b/compute/cker/include/cker/PortableTensorUtils.h index 54714e214..7e4b01a01 100644 --- a/compute/cker/include/cker/PortableTensorUtils.h +++ b/compute/cker/include/cker/PortableTensorUtils.h @@ -45,6 +45,10 @@ public: return a < 0.f ? 0.f : a; case FusedActivationFunctionType::kRelu6: return std::max(0.f, std::min(a, 6.f)); + case FusedActivationFunctionType::kTanh: + return std::tanh(a); + case FusedActivationFunctionType::kSigmoid: + return 1.0f / (1.0f + std::exp(-a)); default: // TODO(aselle): More informative fatal error! 
exit(1); @@ -55,8 +59,17 @@ private: FusedActivationFunctionType act_; }; -void PortableVectorBatchVectorAssign(const float *vector, int v_size, int n_batch, - float *batch_vector) +template <typename T> +void PortableCwiseClipping(T *vector, const int v_size, const T clipping_value) +{ + for (int i = 0; i < v_size; i++) + { + vector[i] = std::max(std::min(clipping_value, vector[i]), static_cast<T>(-clipping_value)); + } +} + +inline void PortableVectorBatchVectorAssign(const float *vector, int v_size, int n_batch, + float *batch_vector) { for (int b = 0; b < n_batch; b++) { @@ -64,7 +77,20 @@ void PortableVectorBatchVectorAssign(const float *vector, int v_size, int n_batc } } -bool PortableIsZeroVector(const float *vector, int v_size) +inline void PortableVectorBatchVectorAdd(const float *vector, int v_size, int n_batch, + float *batch_vector) +{ + for (int b = 0; b < n_batch; b++) + { + for (int i = 0; i < v_size; ++i) + { + batch_vector[i] += vector[i]; + } + batch_vector += v_size; + } +} + +inline bool PortableIsZeroVector(const float *vector, int v_size) { for (int i = 0; i < v_size; ++i) { @@ -74,8 +100,8 @@ bool PortableIsZeroVector(const float *vector, int v_size) return true; } -void PortableApplyActivationToVector(const float *vector, int v_size, - FusedActivationFunctionType activation, float *result) +inline void PortableApplyActivationToVector(const float *vector, int v_size, + FusedActivationFunctionType activation, float *result) { auto activation_func = ActivationFunctor(activation); for (int v = 0; v < v_size; v++) @@ -84,8 +110,17 @@ void PortableApplyActivationToVector(const float *vector, int v_size, } } -void PortableSymmetricQuantizeFloats(const float *values, const int size, int8_t *quantized_values, - float *min_value, float *max_value, float *scaling_factor) +inline void PortableSub1Vector(const float *vector, int v_size, float *result) +{ + for (int v = 0; v < v_size; v++) + { + *result++ = 1.0f - *vector++; + } +} + +inline void PortableSymmetricQuantizeFloats(const float *values, const int size, + int8_t *quantized_values, float *min_value, + float *max_value, float *scaling_factor) { auto minmax = std::minmax_element(values, values + size); *min_value = *minmax.first; @@ -103,17 +138,72 @@ void PortableSymmetricQuantizeFloats(const float *values, const int size, int8_t for (int i = 0; i < size; ++i) { const int32_t quantized_value = - static_cast<int32_t>(std::round(values[i] * scaling_factor_inv)); + static_cast<int32_t>(std::round(values[i] * scaling_factor_inv)); // Clamp: just in case some odd numeric offset. 
quantized_values[i] = std::min(kScale, std::max(-kScale, quantized_value)); } } -void PortableMatrixBatchVectorMultiplyAccumulate(const int8_t *__restrict__ matrix, - const int m_rows, const int m_cols, - const int8_t *__restrict__ vectors, - const float *scaling_factors, int n_batch, - float *__restrict__ result, int result_stride) +inline void PortableAsymmetricQuantizeFloats(const float *values, const int size, + int8_t *quantized_values, float *scaling_factor, + int32_t *offset) +{ + /* Copied from TensorFlow PortableAsymmetricQuantizeFloats */ + const int32_t kMinScale = -128; + const int32_t kMaxScale = 127; + const double qmin_double = kMinScale; + const double qmax_double = kMaxScale; + const auto minmax = std::minmax_element(values, values + size); + const double rmin = static_cast<double>(std::min(0.0f, *minmax.first)); + const double rmax = static_cast<double>(std::max(0.0f, *minmax.second)); + if (rmin == rmax) + { + memset(quantized_values, 0, size * sizeof(int8_t)); + *scaling_factor = 1; + *offset = 0; + return; + } + else + { + double scale = (rmax - rmin) / (qmax_double - qmin_double); + const double zero_point_from_min = qmin_double - rmin / scale; + const double zero_point_from_max = qmax_double - rmax / scale; + const double zero_point_from_min_error = std::abs(qmin_double) + std::abs(rmin / scale); + const double zero_point_from_max_error = std::abs(qmax_double) + std::abs(rmax / scale); + const double zero_point_double = zero_point_from_min_error < zero_point_from_max_error + ? zero_point_from_min + : zero_point_from_max; + int8_t nudged_zero_point = 0; + if (zero_point_double <= qmin_double) + { + nudged_zero_point = kMinScale; + } + else if (zero_point_double >= qmax_double) + { + nudged_zero_point = kMaxScale; + } + else + { + nudged_zero_point = static_cast<int8_t>(round(zero_point_double)); + } + *scaling_factor = scale; + *offset = nudged_zero_point; + } + const float scaling_factor_inv = 1.0f / *scaling_factor; + for (int i = 0; i < size; ++i) + { + const int32_t quantized_value = + static_cast<int32_t>(std::round(*offset + values[i] * scaling_factor_inv)); + quantized_values[i] = std::min(kMaxScale, std::max(kMinScale, quantized_value)); + } +} + +inline void PortableMatrixBatchVectorMultiplyAccumulate(const int8_t *__restrict__ matrix, + const int m_rows, const int m_cols, + const int8_t *__restrict__ vectors, + const float *scaling_factors, int n_batch, + float *__restrict__ result, + int result_stride) { int batch, row, col; for (batch = 0; batch < n_batch; ++batch, vectors += m_cols) @@ -138,20 +228,20 @@ void PortableMatrixBatchVectorMultiplyAccumulate(const int8_t *__restrict__ matr } // for batch } -void PortableMatrixBatchVectorMultiplyAccumulate(const int8_t *__restrict__ matrix, - const int m_rows, const int m_cols, - const int8_t *__restrict__ vector, - const float *scaling_factors, int n_batch, - int32_t *, float *__restrict__ result, - int result_stride, ruy::Context *) +inline void PortableMatrixBatchVectorMultiplyAccumulate(const int8_t *__restrict__ matrix, + const int m_rows, const int m_cols, + const int8_t *__restrict__ vector, + const float *scaling_factors, int n_batch, + int32_t *, float *__restrict__ result, + int result_stride, ruy::Context *) { PortableMatrixBatchVectorMultiplyAccumulate(matrix, m_rows, m_cols, vector, scaling_factors, n_batch, result, result_stride); } -void PortableMatrixBatchVectorMultiplyAccumulate(const float *matrix, int m_rows, int m_cols, - const float *vector, int n_batch, float *result, - int result_stride) 
+inline void PortableMatrixBatchVectorMultiplyAccumulate(const float *matrix, int m_rows, int m_cols, + const float *vector, int n_batch, + float *result, int result_stride) { float *result_in_batch = result; for (int b = 0; b < n_batch; b++) @@ -171,7 +261,36 @@ void PortableMatrixBatchVectorMultiplyAccumulate(const float *matrix, int m_rows } } -void PortableZeroVector(float *vector, int v_size) { std::fill_n(vector, v_size, 0); } +inline void PortableMeanStddevNormalization(const float *input_vector, float *output_vector, + int v_size, int n_batch) +{ + for (int batch = 0; batch < n_batch; ++batch) + { + float sum = 0.0f; + for (int i = 0; i < v_size; ++i) + { + sum += input_vector[i]; + } + const float mean = sum / v_size; + float sum_diff_sq = 0.0f; + for (int i = 0; i < v_size; ++i) + { + const float diff = input_vector[i] - mean; + sum_diff_sq += diff * diff; + } + const float variance = sum_diff_sq / v_size; + constexpr float kNormalizationConstant = 1e-8f; + const float stddev_inv = 1.0f / std::sqrt(variance + kNormalizationConstant); + for (int i = 0; i < v_size; ++i) + { + output_vector[i] = (input_vector[i] - mean) * stddev_inv; + } + input_vector += v_size; + output_vector += v_size; + } +} + +inline void PortableZeroVector(float *vector, int v_size) { std::fill_n(vector, v_size, 0); } } // namespace cker } // namespace nnfw diff --git a/compute/cker/include/cker/Shape.h b/compute/cker/include/cker/Shape.h index 2486f01a6..9269ce9aa 100644 --- a/compute/cker/include/cker/Shape.h +++ b/compute/cker/include/cker/Shape.h @@ -136,12 +136,27 @@ public: std::memcpy(dst_dims, dims_data, dimensions_count * sizeof(int32_t)); } + inline void ReplaceWith(const Shape &other) + { + ReplaceWith(other.DimensionsCount(), other.DimsData()); + } + + inline void ReplaceWith(Shape &&other) + { + Resize(0); + std::swap(_size, other._size); + if (_size <= kMaxSmallSize) + std::copy(other._dims, other._dims + kMaxSmallSize, _dims); + else + _dims_pointer = other._dims_pointer; + } + template <typename T> inline void BuildFrom(const T &src_iterable) { const int dimensions_count = std::distance(src_iterable.begin(), src_iterable.end()); Resize(dimensions_count); int32_t *data = DimsData(); - for (auto it : src_iterable) + for (auto &&it : src_iterable) { *data = it; ++data; @@ -172,7 +187,6 @@ public: for (int i = 0; i < _size; i++) { const int dim = dims_data[i]; - assert(dim >= 1); buffer_size *= dim; } return buffer_size; diff --git a/compute/cker/include/cker/TensorUtils.h b/compute/cker/include/cker/TensorUtils.h index e07c91239..bac79b887 100644 --- a/compute/cker/include/cker/TensorUtils.h +++ b/compute/cker/include/cker/TensorUtils.h @@ -31,55 +31,133 @@ namespace nnfw namespace cker { -void VectorBatchVectorAssign(const float *vector, int v_size, int n_batch, float *batch_vector) +inline void CwiseClipping(float *vector, const int v_size, const float clipping_value) +{ + NEON_OR_PORTABLE(CwiseClipping, vector, v_size, clipping_value); +} + +inline void VectorBatchVectorAdd(const float *vector, int v_size, int n_batch, float *batch_vector) +{ + PortableVectorBatchVectorAdd(vector, v_size, n_batch, batch_vector); +} + +inline void VectorBatchVectorAssign(const float *vector, int v_size, int n_batch, + float *batch_vector) { PortableVectorBatchVectorAssign(vector, v_size, n_batch, batch_vector); } -bool IsZeroVector(const float *vector, int v_size) +// Cwise product of two vectors. 
+template <typename T> +inline void VectorVectorCwiseProduct(const T *__restrict__ vector1, const T *__restrict__ vector2, + int v_size, T *__restrict__ result) +{ + for (int v = 0; v < v_size; v++) + { + *result++ = *vector1++ * *vector2++; + } +} + +// Cwise product and accumulate of two vectors. Since it's a MAC operation, the +// assumption here is that result array is initialized to valid values. +template <typename T> +inline void VectorVectorCwiseProductAccumulate(const T *__restrict__ vector1, + const T *__restrict__ vector2, int v_size, + T *__restrict__ result) +{ + for (int v = 0; v < v_size; v++) + { + *result++ += *vector1++ * *vector2++; + } +} + +// Cwise product of a vector and a batch-vector. +template <typename T> +inline void VectorBatchVectorCwiseProduct(const T *vector, int v_size, const T *batch_vector, + int n_batch, T *result) +{ + for (int b = 0; b < n_batch; b++) + { + VectorVectorCwiseProduct(vector, batch_vector, v_size, result); + // Update the pointers. + result += v_size; + batch_vector += v_size; + } +} + +// Cwise product and accumulate of a vector and a batch-vector. Since it's a MAC +// operation, the assumption here is that result array is initialized to valid +// values. +template <typename T> +inline void VectorBatchVectorCwiseProductAccumulate(const T *vector, int v_size, + const T *batch_vector, int n_batch, T *result) +{ + for (int b = 0; b < n_batch; b++) + { + VectorVectorCwiseProductAccumulate(vector, batch_vector, v_size, result); + // Update the pointers. + result += v_size; + batch_vector += v_size; + } +} + +inline bool IsZeroVector(const float *vector, int v_size) { return NEON_OR_PORTABLE(IsZeroVector, vector, v_size); } -void ApplyActivationToVector(const float *vector, int v_size, - FusedActivationFunctionType activation, float *result) +inline void ApplyActivationToVector(const float *vector, int v_size, + FusedActivationFunctionType activation, float *result) { PortableApplyActivationToVector(vector, v_size, activation, result); } -void SymmetricQuantizeFloats(const float *values, const int size, int8_t *quantized_values, - float *min, float *max, float *scaling_factor) +inline void Sub1Vector(const float *vector, int v_size, float *result) +{ + NEON_OR_PORTABLE(Sub1Vector, vector, v_size, result); +} + +inline void SymmetricQuantizeFloats(const float *values, const int size, int8_t *quantized_values, + float *min, float *max, float *scaling_factor) { return NEON_OR_PORTABLE(SymmetricQuantizeFloats, values, size, quantized_values, min, max, scaling_factor); } -void MatrixBatchVectorMultiplyAccumulate(const int8_t *matrix, const int m_rows, const int m_cols, - const int8_t *vector, const float *scaling_factors, - int n_batch, float *result, int result_stride) +inline void MatrixBatchVectorMultiplyAccumulate(const int8_t *matrix, const int m_rows, + const int m_cols, const int8_t *vector, + const float *scaling_factors, int n_batch, + float *result, int result_stride) { NEON_OR_PORTABLE(MatrixBatchVectorMultiplyAccumulate, matrix, m_rows, m_cols, vector, scaling_factors, n_batch, result, result_stride); } -void MatrixBatchVectorMultiplyAccumulate(const float *matrix, int m_rows, int m_cols, - const float *vector, int n_batch, float *result, - int result_stride) +inline void MatrixBatchVectorMultiplyAccumulate(const float *matrix, int m_rows, int m_cols, + const float *vector, int n_batch, float *result, + int result_stride) { NEON_OR_PORTABLE(MatrixBatchVectorMultiplyAccumulate, matrix, m_rows, m_cols, vector, n_batch, result, 
result_stride); } -void MatrixBatchVectorMultiplyAccumulate(const int8_t *matrix, const int m_rows, const int m_cols, - const int8_t *vectors, const float *scaling_factors, - int n_batch, int32_t *scratch, float *result, - int result_stride, ruy::Context *ruy_context) +inline void MatrixBatchVectorMultiplyAccumulate(const int8_t *matrix, const int m_rows, + const int m_cols, const int8_t *vectors, + const float *scaling_factors, int n_batch, + int32_t *scratch, float *result, int result_stride, + ruy::Context *ruy_context) { NEON_OR_PORTABLE(MatrixBatchVectorMultiplyAccumulate, matrix, m_rows, m_cols, vectors, scaling_factors, n_batch, scratch, result, result_stride, ruy_context); } -void ZeroVector(float *vector, int v_size) { PortableZeroVector(vector, v_size); } +inline void MeanStddevNormalization(const float *input_vector, float *output_vector, int v_size, + int n_batch) +{ + PortableMeanStddevNormalization(input_vector, output_vector, v_size, n_batch); +} + +inline void ZeroVector(float *vector, int v_size) { PortableZeroVector(vector, v_size); } } // namespace cker } // namespace nnfw diff --git a/compute/cker/include/cker/Types.h b/compute/cker/include/cker/Types.h index c0c9313ea..3fd0cf5b6 100644 --- a/compute/cker/include/cker/Types.h +++ b/compute/cker/include/cker/Types.h @@ -34,6 +34,8 @@ enum class FusedActivationFunctionType kRelu6 = 1, kRelu1 = 2, kRelu = 3, + kTanh = 4, + kSigmoid = 6, }; enum class PaddingType { @@ -78,8 +80,6 @@ enum class BroadcastableOpCategory : uint8_t struct PoolParams { - FusedActivationFunctionType activation; - PaddingType padding_type; PaddingValues padding_values; int stride_height; int stride_width; @@ -109,6 +109,8 @@ struct SoftmaxParams int32_t zero_point; float scale; float *table; + uint8_t *uint8_table1; + uint8_t *uint8_table2; }; struct PackParams @@ -170,25 +172,25 @@ struct ComparisonParams struct BinaryArithmeticOpParam { // Shape dependent / common to data / op types. - BroadcastableOpCategory broadcast_category; + BroadcastableOpCategory broadcast_category{BroadcastableOpCategory::kNone}; // uint8 inference params. - int32_t input1_offset; - int32_t input2_offset; - int32_t output_offset; - int32_t output_multiplier; - int32_t output_shift; + int32_t input1_offset = 0; + int32_t input2_offset = 0; + int32_t output_offset = 0; + int32_t output_multiplier = 0; + int32_t output_shift = 0; // Add / Sub, not Mul, uint8 inference params. - int32_t left_shift; - int32_t input1_multiplier; - int32_t input1_shift; - int32_t input2_multiplier; - int32_t input2_shift; + int32_t left_shift = 0; + int32_t input1_multiplier = 0; + int32_t input1_shift = 0; + int32_t input2_multiplier = 0; + int32_t input2_shift = 0; // uint8, etc, activation params. - int32_t quantized_activation_min; - int32_t quantized_activation_max; + int32_t quantized_activation_min = 0; + int32_t quantized_activation_max = 0; // float activation params. - float float_activation_min; - float float_activation_max; + float float_activation_min = 0; + float float_activation_max = 0; // Processed output dimensions. // Let input "a" be the one that broadcasts in the faster-changing dimension. @@ -256,9 +258,12 @@ struct FullyConnectedParams // uint8, etc, activation params. int32_t quantized_activation_min; int32_t quantized_activation_max; - // float activation params. + // float activation params float float_activation_min; float float_activation_max; + // Mark the operands as cacheable if they are unchanging, e.g. weights. 
+ bool lhs_cacheable; + bool rhs_cacheable; // FullyConnectedWeightsFormat weights_format; }; @@ -268,6 +273,27 @@ struct L2NormParams int32_t input_zero_point; }; +enum LSTMKernelType +{ + kTfLiteLSTMFullKernel = 0, + kTfLiteLSTMBasicKernel +}; + +struct LSTMParams +{ + // Parameters for LSTM version 1. + FusedActivationFunctionType activation{FusedActivationFunctionType::kNone}; + float cell_clip; + float proj_clip; + + // Parameters for LSTM version 2. + // kTfLiteLSTMBasicKernel is only supported in version 2 or above. + LSTMKernelType kernel_type; + + // Parameters for LSTM version 4. + bool asymmetric_quantize_inputs; +}; + struct GatherParams { int32_t axis; @@ -366,12 +392,24 @@ struct SpaceToDepthParams int32_t block_size; }; +struct LeakyReluParams +{ + float alpha; +}; + enum class Order { kColMajor, kRowMajor }; +enum class CachePolicy : std::uint8_t +{ + kNeverCache, + kCacheIfLargeSpeedup, + kAlwaysCache, +}; + // MatrixParams encapsulates the parameters that Gemm needs about each // matrix, besides the buffer data pointer. // Compare to ruy::Matrix, which also encapsulates the data pointer. @@ -390,10 +428,13 @@ template <typename Scalar> struct MatrixParams // The zero_point, i.e. which Scalar value is to be interpreted as zero. // When Scalar is floating-point, this must be 0. Scalar zero_point = 0; - // Indicate whether the underlying data will remain unchanged for - // some period of time. Defaults to false, but should be set to true - // for unchanging data (e.g. weights buffers in many cases) - bool cacheable = false; + // When the data pointed to by this matrix is constant data, so that it is + // valid to assume that equality of pointers implies equality of data, + // a CachePolicy may be used instead of the default kNeverCache, + // which will enable ruy to take advantage of this constancy of the data to + // cache the packing work, which can be a large speedup in matrix*vector + // and other narrow shapes. + CachePolicy cache_policy = CachePolicy::kNeverCache; }; // Enumeration of broad categories of Gemm. @@ -442,9 +483,9 @@ enum class QuantizationFlavor // (only those that need perchannel quantization do). template <typename AccumScalar, typename DstScalar, QuantizationFlavor quantization_flavor = - std::is_floating_point<AccumScalar>::value - ? QuantizationFlavor::kFloatingPoint - : QuantizationFlavor::kIntegerWithUniformMultiplier> + std::is_floating_point<AccumScalar>::value + ? QuantizationFlavor::kFloatingPoint + : QuantizationFlavor::kIntegerWithUniformMultiplier> struct GemmParams { // Only for non-floating-point cases. The fixed-point part (i.e. the mantissa) @@ -471,12 +512,12 @@ struct GemmParams const AccumScalar *bias = nullptr; // min clamp bound of destination values. DstScalar clamp_min = std::is_floating_point<DstScalar>::value - ? -std::numeric_limits<DstScalar>::infinity() - : std::numeric_limits<DstScalar>::lowest(); + ? -std::numeric_limits<DstScalar>::infinity() + : std::numeric_limits<DstScalar>::lowest(); // max clamp bound of destination values. DstScalar clamp_max = std::is_floating_point<DstScalar>::value - ? std::numeric_limits<DstScalar>::infinity() - : std::numeric_limits<DstScalar>::max(); + ? std::numeric_limits<DstScalar>::infinity() + : std::numeric_limits<DstScalar>::max(); }; // Validates self-consistency of GemmParams. 
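[Not part of the patch — a minimal sketch of how the new CachePolicy field on MatrixParams is meant to be used, mirroring the lhs_params setup in NeonCpuBackendGemm earlier in this diff; the variable name and dimensions are illustrative only.]

// Illustration: constant data such as weights can be marked kAlwaysCache so ruy may
// reuse the packed matrix across calls; changing data keeps the default kNeverCache.
MatrixParams<int8_t> weights_params; // illustrative name
weights_params.order = Order::kRowMajor;
weights_params.rows = n_output; // illustrative dimensions
weights_params.cols = n_input;
weights_params.cache_policy = CachePolicy::kAlwaysCache; // weights are unchanging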
diff --git a/compute/cker/include/cker/Utils.h b/compute/cker/include/cker/Utils.h index 2abb998d0..9aae0a957 100644 --- a/compute/cker/include/cker/Utils.h +++ b/compute/cker/include/cker/Utils.h @@ -20,6 +20,8 @@ #include "Shape.h" +#include "neon/neon_check.h" + #include <algorithm> #include <cstdint> #include <fixedpoint/fixedpoint.h> @@ -29,6 +31,11 @@ namespace nnfw namespace cker { +template <typename T> struct is_quant8 +{ + static constexpr bool value = std::is_same<T, uint8_t>::value || std::is_same<T, int8_t>::value; +}; + template <typename T> inline T ActivationFunctionWithMinMax(T x, T output_activation_min, T output_activation_max) { @@ -88,8 +95,8 @@ inline int32_t MultiplyByQuantizedMultiplier(int32_t x, int32_t quantized_multip int left_shift = shift > 0 ? shift : 0; int right_shift = shift > 0 ? 0 : -shift; return gemmlowp::RoundingDivideByPOT( - gemmlowp::SaturatingRoundingDoublingHighMul(x * (1 << left_shift), quantized_multiplier), - right_shift); + gemmlowp::SaturatingRoundingDoublingHighMul(x * (1 << left_shift), quantized_multiplier), + right_shift); } inline int32_t MultiplyByQuantizedMultiplierGreaterThanOne(int32_t x, int32_t quantized_multiplier, @@ -103,8 +110,36 @@ inline int32_t MultiplyByQuantizedMultiplierSmallerThanOneExp(int32_t x, int left_shift) { return gemmlowp::RoundingDivideByPOT( - gemmlowp::SaturatingRoundingDoublingHighMul(x, quantized_multiplier), -left_shift); + gemmlowp::SaturatingRoundingDoublingHighMul(x, quantized_multiplier), -left_shift); +} + +#ifdef USE_NEON +inline int32x4x4_t MultiplyByQuantizedMultiplier4Rows(int32x4x4_t input_val, + int32_t quantized_multiplier, int32_t shift) +{ + const int left_shift = std::max(shift, 0); + const int right_shift = std::min(shift, 0); + int32x4x4_t result; + + int32x4_t multiplier_dup = vdupq_n_s32(quantized_multiplier); + int32x4_t left_shift_dup = vdupq_n_s32(left_shift); + int32x4_t right_shift_dup = vdupq_n_s32(right_shift); + + result.val[0] = vrshlq_s32( + vqrdmulhq_s32(vshlq_s32(input_val.val[0], left_shift_dup), multiplier_dup), right_shift_dup); + + result.val[1] = vrshlq_s32( + vqrdmulhq_s32(vshlq_s32(input_val.val[1], left_shift_dup), multiplier_dup), right_shift_dup); + + result.val[2] = vrshlq_s32( + vqrdmulhq_s32(vshlq_s32(input_val.val[2], left_shift_dup), multiplier_dup), right_shift_dup); + + result.val[3] = vrshlq_s32( + vqrdmulhq_s32(vshlq_s32(input_val.val[3], left_shift_dup), multiplier_dup), right_shift_dup); + + return result; } +#endif inline int NodeOffset(int b, int h, int w, int height, int width) { @@ -162,7 +197,7 @@ inline void GetInvSqrtQuantizedMultiplierExp(int32_t input, int reverse_shift, const F3 fixedpoint_input = F3::FromRaw(input >> 1); const F3 fixedpoint_half_input = SaturatingRoundingMultiplyByPOT<-1>(fixedpoint_input); const F3 fixedpoint_half_three = - GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT(F3, (1 << 28) + (1 << 27), 1.5); + GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT(F3, (1 << 28) + (1 << 27), 1.5); // Newton-Raphson iteration // Naive unoptimized starting guess: x = 1 F3 x = F3::One(); @@ -173,7 +208,7 @@ inline void GetInvSqrtQuantizedMultiplierExp(int32_t input, int reverse_shift, x = Rescale<3>(fixedpoint_half_three * x - fixedpoint_half_input * x3); } const F0 fixedpoint_half_sqrt_2 = - GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT(F0, 1518500250, std::sqrt(2.) / 2.); + GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT(F0, 1518500250, std::sqrt(2.) 
/ 2.); x = x * fixedpoint_half_sqrt_2; *output_inv_sqrt = x.raw(); if (*output_shift < 0) @@ -429,7 +464,7 @@ template <typename T> class SequentialTensorWriter { public: SequentialTensorWriter(const T *input_data, T *output_data) - : input_data_(input_data), output_ptr_(output_data) + : input_data_(input_data), output_ptr_(output_data) { } diff --git a/compute/cker/include/cker/eigen/EigenSupport.h b/compute/cker/include/cker/eigen/EigenSupport.h index 49c34211a..e3b10990e 100644 --- a/compute/cker/include/cker/eigen/EigenSupport.h +++ b/compute/cker/include/cker/eigen/EigenSupport.h @@ -39,17 +39,17 @@ namespace eigen_support // library. typedef Eigen::TensorMap<Eigen::Tensor<float, 2, Eigen::RowMajor, Eigen::DenseIndex>, Eigen::Aligned> - EigenMatrix; + EigenMatrix; typedef Eigen::TensorMap<Eigen::Tensor<const float, 2, Eigen::RowMajor, Eigen::DenseIndex>, Eigen::Aligned> - ConstEigenMatrix; + ConstEigenMatrix; typedef Eigen::TensorMap<Eigen::Tensor<float, 4, Eigen::RowMajor, Eigen::DenseIndex>, Eigen::Aligned> - EigenTensor; + EigenTensor; typedef Eigen::TensorMap<Eigen::Tensor<const float, 4, Eigen::RowMajor, Eigen::DenseIndex>, Eigen::Aligned> - ConstEigenTensor; + ConstEigenTensor; // Utility functions we need for the EigenTensor API. template <typename Device, typename T> struct MatMulConvFunctor diff --git a/compute/cker/include/cker/eigen/Utils.h b/compute/cker/include/cker/eigen/Utils.h index f9c706370..40cb85432 100644 --- a/compute/cker/include/cker/eigen/Utils.h +++ b/compute/cker/include/cker/eigen/Utils.h @@ -36,9 +36,9 @@ namespace cker // Eigen::Map<Eigen::Matrix<const float, ...>> template <typename Scalar> using VectorMap = typename std::conditional< - std::is_const<Scalar>::value, - Eigen::Map<const Eigen::Matrix<typename std::remove_const<Scalar>::type, Eigen::Dynamic, 1>>, - Eigen::Map<Eigen::Matrix<Scalar, Eigen::Dynamic, 1>>>::type; + std::is_const<Scalar>::value, + Eigen::Map<const Eigen::Matrix<typename std::remove_const<Scalar>::type, Eigen::Dynamic, 1>>, + Eigen::Map<Eigen::Matrix<Scalar, Eigen::Dynamic, 1>>>::type; template <typename Scalar> VectorMap<Scalar> MapAsVector(Scalar *data, const Shape &shape) { @@ -51,10 +51,10 @@ template <typename Scalar> VectorMap<Scalar> MapAsVector(Scalar *data, const Sha // above also applies here. 
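// For example, a float buffer with Shape {2, 3, 4} maps through MapAsVector to
// a flat 24-element Eigen vector view, and through MapAsMatrixWithLastDimAsRows
// to a 4 x 6 matrix view: the last dimension becomes the rows and the leading
// dimensions are flattened into the columns.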
template <typename Scalar> using MatrixMap = typename std::conditional< - std::is_const<Scalar>::value, - Eigen::Map<const Eigen::Matrix<typename std::remove_const<Scalar>::type, Eigen::Dynamic, - Eigen::Dynamic>>, - Eigen::Map<Eigen::Matrix<Scalar, Eigen::Dynamic, Eigen::Dynamic>>>::type; + std::is_const<Scalar>::value, + Eigen::Map< + const Eigen::Matrix<typename std::remove_const<Scalar>::type, Eigen::Dynamic, Eigen::Dynamic>>, + Eigen::Map<Eigen::Matrix<Scalar, Eigen::Dynamic, Eigen::Dynamic>>>::type; template <typename Scalar> MatrixMap<Scalar> MapAsMatrixWithLastDimAsRows(Scalar *data, const Shape &shape) diff --git a/compute/cker/include/cker/eigen/eigen_convolution_helpers.h b/compute/cker/include/cker/eigen/eigen_convolution_helpers.h index dc3e2552d..9d4fd2eaf 100644 --- a/compute/cker/include/cker/eigen/eigen_convolution_helpers.h +++ b/compute/cker/include/cker/eigen/eigen_convolution_helpers.h @@ -49,20 +49,19 @@ class TensorEvaluatorHasPartialPacket public: template <typename TensorEvaluatorT, typename PacketT, typename IndexT> static auto functionExistsSfinae( - typename std::enable_if< - unpacket_traits<PacketT>::masked_load_available && - std::is_same< - PacketT, - decltype(std::declval<const TensorEvaluatorT>().template partialPacket<PacketT>( - std::declval<IndexT>(), - std::declval<typename unpacket_traits<PacketT>::mask_t>()))>::value>::type *) - -> std::true_type; + typename std::enable_if< + unpacket_traits<PacketT>::masked_load_available && + std::is_same<PacketT, + decltype(std::declval<const TensorEvaluatorT>().template partialPacket<PacketT>( + std::declval<IndexT>(), + std::declval<typename unpacket_traits<PacketT>::mask_t>()))>::value>::type *) + -> std::true_type; template <typename TensorEvaluatorT, typename PacketT, typename IndexT> static auto functionExistsSfinae(...) -> std::false_type; typedef decltype( - functionExistsSfinae<TensorEvaluatorType, PacketType, IndexType>(nullptr)) status; + functionExistsSfinae<TensorEvaluatorType, PacketType, IndexType>(nullptr)) status; static constexpr bool value = status::value; }; @@ -71,9 +70,9 @@ public: // [from, to) range. If the mask bit is 1, element will be loaded/stored. template <typename Packet> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE - typename std::enable_if<unpacket_traits<Packet>::masked_load_available, - typename unpacket_traits<Packet>::mask_t>::type - mask(int from, int to) + typename std::enable_if<unpacket_traits<Packet>::masked_load_available, + typename unpacket_traits<Packet>::mask_t>::type + mask(int from, int to) { const Index packet_size = internal::unpacket_traits<Packet>::size; eigen_assert(0 <= from && to <= (packet_size + 1) && from < to); diff --git a/compute/cker/include/cker/eigen/eigen_gemm_eigen.h b/compute/cker/include/cker/eigen/eigen_gemm_eigen.h new file mode 100644 index 000000000..d4f8fc09d --- /dev/null +++ b/compute/cker/include/cker/eigen/eigen_gemm_eigen.h @@ -0,0 +1,95 @@ +/* + * Copyright (c) 2023 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2019 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __NNFW_CKER_EGIEN_EIGEN_GEMM_EIGEN_H__ +#define __NNFW_CKER_EGIEN_EIGEN_GEMM_EIGEN_H__ + +// See b/131835803: in TFLite code, because eigen_spatial_convolutions.h does +// #define Eigen EigenForTFLite, it is difficult to have any #include of Eigen +// headers in a header file, as that results in name classes (compilation +// errors) depending on the order in which these headers are #included. +// So we have moved the #include of Eigen here, in a .cc file, where we have +// control over the header #include sequence. +// #include "third_party/eigen3/Eigen/Core" +// #include "tensorflow/lite/kernels/cpu_backend_context.h" +// #include "tensorflow/lite/kernels/cpu_backend_gemm_params.h" +// #include "tensorflow/lite/kernels/internal/common.h" +// #include "cker/eigen/eigen_convolution_helpers.h" +#include "cker/operation/Common.h" +#include "cker/Types.h" + +#include <Eigen/Core> + +namespace nnfw +{ +namespace cker +{ +namespace detail +{ + +// tensorflow/tensorflow/lite/kernels/cpu_backend_gemm_eigen.h and cpu_backend_gemm_eigen.cc +struct GemmImplUsingEigen +{ + static void Run(const MatrixParams<float> &lhs_params, const float *lhs_data, + const MatrixParams<float> &rhs_params, const float *rhs_data, + const MatrixParams<float> &dst_params, float *dst_data, + const GemmParams<float, float> ¶ms) + { + // This code assumes specific storage orders, encoded in these Eigen types. + // These assumptions have been checked by TF_LITE_ASSERT's in the public + // Gemm entry point already, before the implementation gets to this point. + using EigenMatrixMapRowMajorConst = + Eigen::Map<const Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>>; + using EigenMatrixMapColMajorConst = + Eigen::Map<const Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic, Eigen::ColMajor>>; + using EigenMatrixMapColMajorMutable = + Eigen::Map<Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic, Eigen::ColMajor>>; + + EigenMatrixMapRowMajorConst eigen_lhs(lhs_data, lhs_params.rows, lhs_params.cols); + EigenMatrixMapColMajorConst eigen_rhs(rhs_data, rhs_params.rows, rhs_params.cols); + EigenMatrixMapColMajorMutable eigen_dst(dst_data, dst_params.rows, dst_params.cols); + + if (rhs_params.cols == 1) + { + eigen_dst.col(0).noalias() = eigen_lhs * eigen_rhs.col(0); + } + else if (lhs_params.rows == 1) + { + eigen_dst.row(0).noalias() = eigen_lhs.row(0) * eigen_rhs; + } + else + { + eigen_dst.noalias() = eigen_lhs * eigen_rhs; + } + + if (params.bias) + { + BiasAndClamp(params.clamp_min, params.clamp_max, dst_params.rows, params.bias, + dst_params.rows * dst_params.cols, dst_data); + } + else + { + eigen_dst = eigen_dst.cwiseMin(params.clamp_max).cwiseMax(params.clamp_min); + } + } +}; + +} // namespace detail +} // namespace cker +} // namespace nnfw + +#endif // __NNFW_CKER_EGIEN_EIGEN_GEMM_EIGEN_H__ diff --git a/compute/cker/include/cker/eigen/eigen_spatial_convolutions-inl.h b/compute/cker/include/cker/eigen/eigen_spatial_convolutions-inl.h index 92e1614d1..c931ac518 100644 --- a/compute/cker/include/cker/eigen/eigen_spatial_convolutions-inl.h +++ b/compute/cker/include/cker/eigen/eigen_spatial_convolutions-inl.h @@ -62,30 +62,27 @@ template <typename NewDimension, Index Rows, Index Cols, typename ArgType, typen typename Scalar_, typename Index, typename nocontract_t, typename contract_t, int Side, int packet_size, bool inner_dim_contiguous, bool inner_dim_reordered, int Alignment> 
class TensorContractionInputMapper< - Scalar_, Index, Side, - TensorEvaluator< - const TensorReshapingOp<NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType>>, - Device>, - nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> + Scalar_, Index, Side, + TensorEvaluator< + const TensorReshapingOp<NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType>>, Device>, + nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> { public: typedef Scalar_ Scalar; typedef TensorContractionInputMapper< - Scalar, Index, Side, - TensorEvaluator< - const TensorReshapingOp<NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType>>, - Device>, - nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> - Self; + Scalar, Index, Side, + TensorEvaluator< + const TensorReshapingOp<NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType>>, Device>, + nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> + Self; typedef TensorContractionSubMapper< - Scalar, Index, Side, - TensorEvaluator< - const TensorReshapingOp<NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType>>, - Device>, - nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> - SubMapper; + Scalar, Index, Side, + TensorEvaluator< + const TensorReshapingOp<NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType>>, Device>, + nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> + SubMapper; typedef SubMapper VectorMapper; typedef SubMapper LinearMapper; @@ -95,11 +92,11 @@ public: EIGEN_DEVICE_FUNC TensorContractionInputMapper( - const TensorEvaluator< - const TensorReshapingOp<NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType>>, - Device> &tensor, - const nocontract_t &, const nocontract_t &, const contract_t &, const contract_t &) - : m_impl(tensor.impl().impl()) + const TensorEvaluator< + const TensorReshapingOp<NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType>>, Device> + &tensor, + const nocontract_t &, const nocontract_t &, const contract_t &, const contract_t &) + : m_impl(tensor.impl().impl()) { Index patch_rows; Index patch_depth; @@ -167,7 +164,7 @@ public: EIGEN_DEVICE_FUNC TensorContractionInputMapper(const TensorContractionInputMapper &base_mapper) - : m_impl(base_mapper.m_impl) + : m_impl(base_mapper.m_impl) { m_patch_cols = base_mapper.m_patch_cols; m_num_patches = base_mapper.m_num_patches; @@ -280,11 +277,10 @@ public: private: friend class TensorContractionSubMapper< - Scalar, Index, Side, - TensorEvaluator< - const TensorReshapingOp<NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType>>, - Device>, - nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment>; + Scalar, Index, Side, + TensorEvaluator< + const TensorReshapingOp<NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType>>, Device>, + nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment>; // Load coefficient from a patch specified by the "within patch offset" // (patchId) and the precomputed indices of the first element of the patch. @@ -298,14 +294,14 @@ private: const Index colOffset = patchOffset / m_fastColStride; const Index inputCol = colIndex + colOffset * m_in_col_strides; const Index origInputCol = (m_patch_col_inflate_strides == 1) - ? inputCol - : ((inputCol >= 0) ? (inputCol / m_fastInputColStride) : 0); + ? 
inputCol + : ((inputCol >= 0) ? (inputCol / m_fastInputColStride) : 0); const Index rowOffset = patchOffset - colOffset * m_colStride; const Index inputRow = rowIndex + rowOffset * m_in_row_strides; const Index origInputRow = (m_patch_row_inflate_strides == 1) - ? inputRow - : ((inputRow >= 0) ? (inputRow / m_fastInputRowStride) : 0); + ? inputRow + : ((inputRow >= 0) ? (inputRow / m_fastInputRowStride) : 0); if (origInputCol < 0 || origInputRow < 0 || origInputCol >= m_inputCols || origInputRow >= m_inputRows || (inputCol != origInputCol * m_patch_col_inflate_strides) || (inputRow != origInputRow * m_patch_row_inflate_strides)) @@ -314,7 +310,7 @@ private: } const Index depth = patchId - patchOffset * patchDepth(); const Index inputIndex = - depth + origInputRow * m_rowInputStride + origInputCol * m_colInputStride + otherIndex; + depth + origInputRow * m_rowInputStride + origInputCol * m_colInputStride + otherIndex; return m_impl.coeff(inputIndex); } @@ -338,7 +334,7 @@ private: } const Index depth = patchId - patchOffset * patchDepth(); const Index inputIndex = - depth + inputRow * m_rowInputStride + inputCol * m_colInputStride + otherIndex; + depth + inputRow * m_rowInputStride + inputCol * m_colInputStride + otherIndex; return m_impl.coeff(inputIndex); } @@ -390,7 +386,7 @@ private: // span[0] all the way upto (and including) span[1]. const Index depth = patchId - patchOffsets[0] * patchDepth(); const Index inputIndex = - depth + inputRows[0] * m_rowInputStride + inputCol * m_colInputStride + otherIndex; + depth + inputRows[0] * m_rowInputStride + inputCol * m_colInputStride + otherIndex; return m_impl.template partialPacket<Packet>(inputIndex - span[0], mask<Packet>(span[0], span[1] + 1)); } @@ -445,10 +441,10 @@ private: // Load partial packets and do bit-wise OR to generate required packet return internal::por<Packet>( - loadPartialPacketStandard(rowIndex, colIndex, otherIndex, patchIds[0], spans[0], - patchOffsets2Cols[0], colOffsets[0]), - loadPartialPacketStandard(rowIndex, colIndex, otherIndex, patchIds[1], spans[1], - patchOffsets2Cols[1], colOffsets[1])); + loadPartialPacketStandard(rowIndex, colIndex, otherIndex, patchIds[0], spans[0], + patchOffsets2Cols[0], colOffsets[0]), + loadPartialPacketStandard(rowIndex, colIndex, otherIndex, patchIds[1], spans[1], + patchOffsets2Cols[1], colOffsets[1])); } // Helper function to load a packet that is present in a single columns. @@ -477,7 +473,7 @@ private: // no padding const Index depth = patchId - patchOffsets[0] * patchDepth(); const Index inputIndex = - depth + inputRows[0] * m_rowInputStride + inputCols[0] * m_colInputStride + otherIndex; + depth + inputRows[0] * m_rowInputStride + inputCols[0] * m_colInputStride + otherIndex; return m_impl.template packet<Unaligned>(inputIndex); } return packetWithPossibleZero(patchId, rowIndex, colIndex, otherIndex); @@ -490,7 +486,7 @@ private: // load. template <typename PacketT, typename TensorEvaluatorT> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE typename std::enable_if< - !TensorEvaluatorHasPartialPacket<TensorEvaluatorT, PacketT, Index>::value, PacketT>::type + !TensorEvaluatorHasPartialPacket<TensorEvaluatorT, PacketT, Index>::value, PacketT>::type loadPacketStandard(Index patchId, Index rowIndex, Index colIndex, Index otherIndex) const { const Index packetSize = internal::unpacket_traits<Packet>::size; @@ -538,7 +534,7 @@ private: // packets. 
template <typename PacketT, typename TensorEvaluatorT> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE typename std::enable_if< - TensorEvaluatorHasPartialPacket<TensorEvaluatorT, PacketT, Index>::value, PacketT>::type + TensorEvaluatorHasPartialPacket<TensorEvaluatorT, PacketT, Index>::value, PacketT>::type loadPacketStandard(Index patchId, Index rowIndex, Index colIndex, Index otherIndex) const { const Index packetSize = internal::unpacket_traits<PacketT>::size; @@ -604,7 +600,7 @@ private: // no padding const Index depth = patchId - patchOffset * patchDepth(); const Index inputIndex = - depth + inputRow * m_rowInputStride + inputCol * m_colInputStride + otherIndex; + depth + inputRow * m_rowInputStride + inputCol * m_colInputStride + otherIndex; return m_impl.template packet<Unaligned>(inputIndex); } @@ -627,10 +623,10 @@ private: computeBaseIndices(Index patchIndex, Index &rowIndex, Index &colIndex, Index &otherIndex) const { const size_t NumInputDims = - array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value; + array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value; otherIndex = (NumInputDims == 3) ? 0 : patchIndex / m_fastNumPatches; const Index patch2DIndex = - (NumInputDims == 3) ? patchIndex : (patchIndex - otherIndex * m_num_patches); + (NumInputDims == 3) ? patchIndex : (patchIndex - otherIndex * m_num_patches); otherIndex *= m_patchInputStride; colIndex = patch2DIndex / m_fastOutputRows; rowIndex = patch2DIndex - colIndex * m_outputRows; @@ -689,31 +685,28 @@ template <typename NewDimension, Index Rows, Index Cols, typename ArgType, typen typename Scalar, typename Index, typename nocontract_t, typename contract_t, int Side, int packet_size, bool inner_dim_contiguous, bool inner_dim_reordered, int Alignment> class TensorContractionSubMapper< - Scalar, Index, Side, - TensorEvaluator< - const TensorReshapingOp<NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType>>, - Device>, - nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> + Scalar, Index, Side, + TensorEvaluator< + const TensorReshapingOp<NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType>>, Device>, + nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> { public: typedef typename packet_traits<Scalar>::type Packet; typedef typename packet_traits<Scalar>::half HalfPacket; typedef TensorContractionInputMapper< - Scalar, Index, Side, - TensorEvaluator< - const TensorReshapingOp<NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType>>, - Device>, - nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> - ParentMapper; + Scalar, Index, Side, + TensorEvaluator< + const TensorReshapingOp<NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType>>, Device>, + nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> + ParentMapper; typedef TensorContractionSubMapper< - Scalar, Index, Side, - TensorEvaluator< - const TensorReshapingOp<NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType>>, - Device>, - nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> - Self; + Scalar, Index, Side, + TensorEvaluator< + const TensorReshapingOp<NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType>>, Device>, + nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> + Self; typedef Self LinearMapper; @@ -722,16 +715,16 @@ public: EIGEN_DEVICE_FUNC 
EIGEN_STRONG_INLINE TensorContractionSubMapper(const ParentMapper &base_mapper, Index vert_offset, Index horiz_offset) - : m_depth_offset(vert_offset), m_col_offset(horiz_offset), m_base_mapper(base_mapper) + : m_depth_offset(vert_offset), m_col_offset(horiz_offset), m_base_mapper(base_mapper) { m_base_mapper.computeBaseIndices(m_col_offset, m_rowIndex, m_colIndex, m_otherIndex); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorContractionSubMapper(const Self &base_mapper, Index vert_offset, Index horiz_offset) - : m_depth_offset(vert_offset + base_mapper.m_depth_offset), - m_col_offset(horiz_offset + base_mapper.m_col_offset), - m_base_mapper(base_mapper.m_base_mapper) + : m_depth_offset(vert_offset + base_mapper.m_depth_offset), + m_col_offset(horiz_offset + base_mapper.m_col_offset), + m_base_mapper(base_mapper.m_base_mapper) { m_base_mapper.computeBaseIndices(m_col_offset, m_rowIndex, m_colIndex, m_otherIndex); } @@ -766,7 +759,7 @@ public: { typedef decltype(m_base_mapper.m_impl) TensorEvaluatorT; return m_base_mapper.template loadPacketStandard<Packet, TensorEvaluatorT>( - i + m_depth_offset, m_rowIndex, m_colIndex, m_otherIndex); + i + m_depth_offset, m_rowIndex, m_colIndex, m_otherIndex); } template <typename Packet> EIGEN_DEVICE_FUNC bool aligned(Index) const { return false; } @@ -781,7 +774,7 @@ public: EIGEN_ALWAYS_INLINE Index maxCol(const Index peeled_k) const { const Index max_col = - (m_depth_offset + (peeled_k == 0 ? 0 : peeled_k - 1)) / fastPatchColStride(); + (m_depth_offset + (peeled_k == 0 ? 0 : peeled_k - 1)) / fastPatchColStride(); return std::min<Index>(1 + max_col, patchCols()); } @@ -789,8 +782,8 @@ public: EIGEN_ALWAYS_INLINE Index maxRow(const Index peeled_k, const Index col) const { const Index max_row = - (m_depth_offset + (peeled_k == 0 ? 0 : peeled_k - 1) - col * patchColStride()) / - fastPatchRowStride(); + (m_depth_offset + (peeled_k == 0 ? 0 : peeled_k - 1) - col * patchColStride()) / + fastPatchRowStride(); return std::min<Index>(1 + max_row, patchRows()); } @@ -862,7 +855,7 @@ public: } template <typename PacketT = Packet> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE typename std::enable_if< - TensorEvaluatorHasPartialPacket<TensorEvaluatorT, PacketT, Index>::value, PacketT>::type + TensorEvaluatorHasPartialPacket<TensorEvaluatorT, PacketT, Index>::value, PacketT>::type partialPacketNoPadding(const Index depth, const Index baseIndex, Index num_coeffs) const { const Index inputIndex = depth + baseIndex; @@ -913,8 +906,8 @@ public: const Index input_row = m_rowIndex + row * m_base_mapper.m_in_row_strides; *orig_row = (m_base_mapper.m_patch_row_inflate_strides == 1) - ? input_row - : ((input_row >= 0) ? (input_row / m_base_mapper.m_fastInputRowStride) : 0); + ? input_row + : ((input_row >= 0) ? (input_row / m_base_mapper.m_fastInputRowStride) : 0); return (*orig_row < 0 || *orig_row >= m_base_mapper.m_inputRows) || (input_row != *orig_row * m_base_mapper.m_patch_row_inflate_strides); @@ -932,8 +925,8 @@ public: const Index input_col = m_colIndex + col * m_base_mapper.m_in_col_strides; *orig_col = (m_base_mapper.m_patch_col_inflate_strides == 1) - ? input_col - : ((input_col >= 0) ? (input_col / m_base_mapper.m_fastInputColStride) : 0); + ? input_col + : ((input_col >= 0) ? 
(input_col / m_base_mapper.m_fastInputColStride) : 0); return (*orig_col < 0 || *orig_col >= m_base_mapper.m_inputCols) || (input_col != *orig_col * m_base_mapper.m_patch_col_inflate_strides); @@ -1033,23 +1026,20 @@ template <typename NewDimension, Index Rows, Index Cols, typename ArgType, typen int packet_size, bool inner_dim_contiguous, bool inner_dim_reordered, int Alignment, int nr> struct gemm_pack_rhs< - Scalar, Index, - TensorContractionSubMapper< - Scalar, Index, Rhs, - TensorEvaluator< - const TensorReshapingOp<NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType>>, - Device>, - nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, - Alignment>, - nr, ColMajor, false, false> + Scalar, Index, + TensorContractionSubMapper< + Scalar, Index, Rhs, + TensorEvaluator< + const TensorReshapingOp<NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType>>, Device>, + nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment>, + nr, ColMajor, false, false> { typedef TensorContractionSubMapper< - Scalar, Index, Rhs, - TensorEvaluator< - const TensorReshapingOp<NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType>>, - Device>, - nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> - SubMapper; + Scalar, Index, Rhs, + TensorEvaluator< + const TensorReshapingOp<NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType>>, Device>, + nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> + SubMapper; typedef SubMapper DataMapper; typedef typename packet_traits<Scalar>::type Packet; @@ -1159,7 +1149,7 @@ struct gemm_pack_rhs< const Index idx3 = dm3.baseIndex(r, c); const Index start_depth = - ((c == start_col) && (r == start_row)) ? rhs.depthOffset() : 0; + ((c == start_col) && (r == start_row)) ? 
rhs.depthOffset() : 0; const Index max_depth = rhs.maxDepth(peeled_k - k, start_depth); eigen_assert((max_depth - start_depth) % packet_size == 0); @@ -1248,22 +1238,20 @@ template <typename NewDimension, Index Rows, Index Cols, typename ArgType, typen typename Scalar, typename Index, typename nocontract_t, typename contract_t, bool inner_dim_contiguous, bool inner_dim_reordered, int Alignment, int nr> struct gemm_pack_rhs< - Scalar, Index, - TensorContractionSubMapper< - Scalar, Index, Rhs, - TensorEvaluator< - const TensorReshapingOp<NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType>>, - Device>, - nocontract_t, contract_t, 2, inner_dim_contiguous, inner_dim_reordered, Alignment>, - nr, ColMajor, false, false> + Scalar, Index, + TensorContractionSubMapper< + Scalar, Index, Rhs, + TensorEvaluator< + const TensorReshapingOp<NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType>>, Device>, + nocontract_t, contract_t, 2, inner_dim_contiguous, inner_dim_reordered, Alignment>, + nr, ColMajor, false, false> { typedef TensorContractionSubMapper< - Scalar, Index, Rhs, - TensorEvaluator< - const TensorReshapingOp<NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType>>, - Device>, - nocontract_t, contract_t, 2, inner_dim_contiguous, inner_dim_reordered, Alignment> - SubMapper; + Scalar, Index, Rhs, + TensorEvaluator< + const TensorReshapingOp<NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType>>, Device>, + nocontract_t, contract_t, 2, inner_dim_contiguous, inner_dim_reordered, Alignment> + SubMapper; typedef SubMapper DataMapper; typedef typename packet_traits<Scalar>::type Packet; @@ -1378,7 +1366,7 @@ struct gemm_pack_rhs< const Index idx3 = dm3.baseIndex(r, c); const Index start_depth = - ((c == start_col) && (r == start_row)) ? rhs.depthOffset() : 0; + ((c == start_col) && (r == start_row)) ? 
rhs.depthOffset() : 0; const Index max_depth = rhs.maxDepth(peeled_k - k, start_depth); eigen_assert((max_depth - start_depth) % packet_size == 0); @@ -1472,22 +1460,20 @@ template <typename NewDimension, Index Rows, Index Cols, typename ArgType, typen typename Scalar, typename Index, typename nocontract_t, typename contract_t, bool inner_dim_contiguous, bool inner_dim_reordered, int Alignment, int nr> struct gemm_pack_rhs< - Scalar, Index, - TensorContractionSubMapper< - Scalar, Index, Rhs, - TensorEvaluator< - const TensorReshapingOp<NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType>>, - Device>, - nocontract_t, contract_t, 1, inner_dim_contiguous, inner_dim_reordered, Alignment>, - nr, ColMajor, false, false> + Scalar, Index, + TensorContractionSubMapper< + Scalar, Index, Rhs, + TensorEvaluator< + const TensorReshapingOp<NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType>>, Device>, + nocontract_t, contract_t, 1, inner_dim_contiguous, inner_dim_reordered, Alignment>, + nr, ColMajor, false, false> { typedef TensorContractionSubMapper< - Scalar, Index, Rhs, - TensorEvaluator< - const TensorReshapingOp<NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType>>, - Device>, - nocontract_t, contract_t, 1, inner_dim_contiguous, inner_dim_reordered, Alignment> - SubMapper; + Scalar, Index, Rhs, + TensorEvaluator< + const TensorReshapingOp<NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType>>, Device>, + nocontract_t, contract_t, 1, inner_dim_contiguous, inner_dim_reordered, Alignment> + SubMapper; typedef SubMapper DataMapper; EIGEN_STATIC_ASSERT((nr == 4), YOU_MADE_A_PROGRAMMING_MISTAKE) @@ -1582,27 +1568,25 @@ struct gemm_pack_rhs< */ template <typename Input, typename Kernel, typename OutputKernel = const NoOpOutputKernel> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE static const typename internal::conditional< - internal::traits<Input>::Layout == ColMajor, - TensorReshapingOp< - const DSizes<typename internal::traits<Input>::Index, - internal::traits<Input>::NumDimensions>, - const TensorContractionOp< - const array<IndexPair<typename internal::traits<Input>::Index>, 1>, - const TensorReshapingOp<const DSizes<typename internal::traits<Input>::Index, 2>, - const Kernel>, - const TensorReshapingOp<const DSizes<typename internal::traits<Input>::Index, 2>, - const TensorImagePatchOp<Dynamic, Dynamic, const Input>>, - const OutputKernel>>, - TensorReshapingOp< - const DSizes<typename internal::traits<Input>::Index, - internal::traits<Input>::NumDimensions>, - const TensorContractionOp< - const array<IndexPair<typename internal::traits<Input>::Index>, 1>, - const TensorReshapingOp<const DSizes<typename internal::traits<Input>::Index, 2>, - const TensorImagePatchOp<Dynamic, Dynamic, const Input>>, - const TensorReshapingOp<const DSizes<typename internal::traits<Input>::Index, 2>, - const Kernel>, - const OutputKernel>>>::type + internal::traits<Input>::Layout == ColMajor, + TensorReshapingOp< + const DSizes<typename internal::traits<Input>::Index, internal::traits<Input>::NumDimensions>, + const TensorContractionOp< + const array<IndexPair<typename internal::traits<Input>::Index>, 1>, + const TensorReshapingOp<const DSizes<typename internal::traits<Input>::Index, 2>, + const Kernel>, + const TensorReshapingOp<const DSizes<typename internal::traits<Input>::Index, 2>, + const TensorImagePatchOp<Dynamic, Dynamic, const Input>>, + const OutputKernel>>, + TensorReshapingOp< + const DSizes<typename internal::traits<Input>::Index, internal::traits<Input>::NumDimensions>, + const 
TensorContractionOp< + const array<IndexPair<typename internal::traits<Input>::Index>, 1>, + const TensorReshapingOp<const DSizes<typename internal::traits<Input>::Index, 2>, + const TensorImagePatchOp<Dynamic, Dynamic, const Input>>, + const TensorReshapingOp<const DSizes<typename internal::traits<Input>::Index, 2>, + const Kernel>, + const OutputKernel>>>::type SpatialConvolution(const Input &input, const Kernel &kernel, const Index row_stride = 1, const Index col_stride = 1, const PaddingType padding_type = PADDING_SAME, const Index row_in_stride = 1, const Index col_in_stride = 1, @@ -1612,11 +1596,11 @@ SpatialConvolution(const Input &input, const Kernel &kernel, const Index row_str typedef typename internal::traits<Input>::Index TensorIndex; TensorRef<Tensor<typename internal::traits<Input>::Scalar, internal::traits<Input>::NumDimensions, internal::traits<Input>::Layout, TensorIndex>> - in(input); + in(input); TensorRef< - Tensor<typename internal::traits<Kernel>::Scalar, internal::traits<Kernel>::NumDimensions, - internal::traits<Kernel>::Layout, TensorIndex>> - kern(kernel); + Tensor<typename internal::traits<Kernel>::Scalar, internal::traits<Kernel>::NumDimensions, + internal::traits<Kernel>::Layout, TensorIndex>> + kern(kernel); EIGEN_STATIC_ASSERT(internal::traits<Input>::Layout == internal::traits<Kernel>::Layout, YOU_MADE_A_PROGRAMMING_MISTAKE) @@ -1735,46 +1719,46 @@ SpatialConvolution(const Input &input, const Kernel &kernel, const Index row_str } if (padding_explicit) { - return choose( - Cond<internal::traits<Input>::Layout == ColMajor>(), - kernel.reshape(kernel_dims) - .contract(input - .extract_image_patches(kernelRows, kernelCols, row_stride, col_stride, - row_in_stride, col_in_stride, - /*row_inflate_stride=*/1, - /*col_inflate_stride=*/1, padding_top, - padding_bottom, padding_left, padding_right, - /*padding_value=*/0) - .reshape(pre_contract_dims), - contract_dims, output_kernel) - .reshape(post_contract_dims), - input - .extract_image_patches( - kernelRows, kernelCols, row_stride, col_stride, row_in_stride, col_in_stride, - /*row_inflate_stride=*/1, - /*col_inflate_stride=*/1, padding_top, padding_bottom, padding_left, padding_right, - /*padding_value=*/0) - .reshape(pre_contract_dims) - .contract(kernel.reshape(kernel_dims), contract_dims, output_kernel) - .reshape(post_contract_dims)); + return choose(Cond<internal::traits<Input>::Layout == ColMajor>(), + kernel.reshape(kernel_dims) + .contract(input + .extract_image_patches(kernelRows, kernelCols, row_stride, + col_stride, row_in_stride, col_in_stride, + /*row_inflate_stride=*/1, + /*col_inflate_stride=*/1, padding_top, + padding_bottom, padding_left, padding_right, + /*padding_value=*/0) + .reshape(pre_contract_dims), + contract_dims, output_kernel) + .reshape(post_contract_dims), + input + .extract_image_patches(kernelRows, kernelCols, row_stride, col_stride, + row_in_stride, col_in_stride, + /*row_inflate_stride=*/1, + /*col_inflate_stride=*/1, padding_top, padding_bottom, + padding_left, padding_right, + /*padding_value=*/0) + .reshape(pre_contract_dims) + .contract(kernel.reshape(kernel_dims), contract_dims, output_kernel) + .reshape(post_contract_dims)); } else { return choose( - Cond<internal::traits<Input>::Layout == ColMajor>(), - kernel.reshape(kernel_dims) - .contract(input - .extract_image_patches(kernelRows, kernelCols, row_stride, col_stride, - row_in_stride, col_in_stride, padding_type) - .reshape(pre_contract_dims), - contract_dims, output_kernel) - .reshape(post_contract_dims), - input - 
.extract_image_patches(kernelRows, kernelCols, row_stride, col_stride, row_in_stride, - col_in_stride, padding_type) - .reshape(pre_contract_dims) - .contract(kernel.reshape(kernel_dims), contract_dims, output_kernel) - .reshape(post_contract_dims)); + Cond<internal::traits<Input>::Layout == ColMajor>(), + kernel.reshape(kernel_dims) + .contract(input + .extract_image_patches(kernelRows, kernelCols, row_stride, col_stride, + row_in_stride, col_in_stride, padding_type) + .reshape(pre_contract_dims), + contract_dims, output_kernel) + .reshape(post_contract_dims), + input + .extract_image_patches(kernelRows, kernelCols, row_stride, col_stride, row_in_stride, + col_in_stride, padding_type) + .reshape(pre_contract_dims) + .contract(kernel.reshape(kernel_dims), contract_dims, output_kernel) + .reshape(post_contract_dims)); } } diff --git a/compute/cker/include/cker/operation/AddN.h b/compute/cker/include/cker/operation/AddN.h new file mode 100644 index 000000000..1704da641 --- /dev/null +++ b/compute/cker/include/cker/operation/AddN.h @@ -0,0 +1,46 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2017 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __NNFW_CKER_ADDN_H__ +#define __NNFW_CKER_ADDN_H__ + +#include "cker/Shape.h" + +namespace nnfw +{ +namespace cker +{ + +template <typename T> +void AddN(const Shape &input_shape, const size_t num_inputs, const T **input_data, T *output_data) +{ + const size_t size = input_shape.FlatSize(); + for (size_t i = 0; i < size; ++i) + { + T x = 0; + for (size_t j = 0; j < num_inputs; ++j) + { + x += input_data[j][i]; + } + output_data[i] = x; + } +} + +} // namespace cker +} // namespace nnfw + +#endif // __NNFW_CKER_ADDN_H__ diff --git a/compute/cker/include/cker/operation/AveragePool.h b/compute/cker/include/cker/operation/AveragePool.h index 6149cafa7..e10f02ad4 100644 --- a/compute/cker/include/cker/operation/AveragePool.h +++ b/compute/cker/include/cker/operation/AveragePool.h @@ -73,10 +73,10 @@ void AveragePool<float>(const PoolParams ¶ms, const Shape &input_shape, cons int hpad = h + params.padding_values.height; int wpad = w + params.padding_values.width; int h_start = - (hpad < params.filter_height) ? 0 : (hpad - params.filter_height) / stride_height + 1; + (hpad < params.filter_height) ? 0 : (hpad - params.filter_height) / stride_height + 1; int h_end = std::min(hpad / stride_height + 1, output_height); int w_start = - (wpad < params.filter_width) ? 0 : (wpad - params.filter_width) / stride_width + 1; + (wpad < params.filter_width) ? 
0 : (wpad - params.filter_width) / stride_width + 1; int w_end = std::min(wpad / stride_width + 1, output_width); // compute elementwise sum for (int ph = h_start; ph < h_end; ++ph) @@ -146,11 +146,11 @@ inline void AveragePool16(const PoolParams ¶ms, const Shape &input_shape, const int filter_y_start = std::max(0, -in_y_origin); const int filter_y_end = std::min(params.filter_height, input_height - in_y_origin); const int filter_count = - (filter_x_end - filter_x_start) * (filter_y_end - filter_y_start); + (filter_x_end - filter_x_start) * (filter_y_end - filter_y_start); memset(acc, 0, tranche_depth * sizeof(acc[0])); const uint8_t *input_ptr = - input_data + depth_base + - depth * (in_x_origin + input_width * (in_y_origin + input_height * batch)); + input_data + depth_base + + depth * (in_x_origin + input_width * (in_y_origin + input_height * batch)); for (int fy = filter_y_start; fy < filter_y_end; fy++) { const uint8_t *input_row_ptr = input_ptr + depth * (fy * input_width + filter_x_start); @@ -283,11 +283,11 @@ inline void AveragePool32(const PoolParams ¶ms, const Shape &input_shape, const int filter_y_start = std::max(0, -in_y_origin); const int filter_y_end = std::min(params.filter_height, input_height - in_y_origin); const int filter_count = - (filter_x_end - filter_x_start) * (filter_y_end - filter_y_start); + (filter_x_end - filter_x_start) * (filter_y_end - filter_y_start); memset(acc, 0, tranche_depth * sizeof(acc[0])); const uint8_t *input_ptr = - input_data + depth_base + - depth * (in_x_origin + input_width * (in_y_origin + input_height * batch)); + input_data + depth_base + + depth * (in_x_origin + input_width * (in_y_origin + input_height * batch)); for (int fy = filter_y_start; fy < filter_y_end; fy++) { const uint8_t *input_row_ptr = input_ptr + depth * (fy * input_width + filter_x_start); @@ -395,6 +395,129 @@ void AveragePool<uint8_t>(const PoolParams ¶ms, const Shape &input_shape, } } +template <> +void AveragePool<int8_t>(const PoolParams ¶ms, const Shape &input_shape, + const int8_t *input_data, const Shape &output_shape, int8_t *output_data) +{ + // Here, and in other pooling ops, in order to maintain locality of reference, + // to minimize some recalculations, and to load into NEON vector registers, we + // use an inner loop down the depth. Since depths can be large and hence we + // would need arbitrarily large temporary storage, we divide the work up into + // depth tranches just within the batch loop. + static constexpr int kPoolingAccTrancheSize = 256; + + assert(params.quantized_activation_min <= params.quantized_activation_max); + assert(input_shape.DimensionsCount() == 4); + assert(output_shape.DimensionsCount() == 4); + const int batches = MatchingDim(input_shape, 0, output_shape, 0); + const int depth = MatchingDim(input_shape, 3, output_shape, 3); + const int input_height = input_shape.Dims(1); + const int input_width = input_shape.Dims(2); + const int output_height = output_shape.Dims(1); + const int output_width = output_shape.Dims(2); + const int stride_height = params.stride_height; + const int stride_width = params.stride_width; + + int32_t acc[kPoolingAccTrancheSize]; + for (int batch = 0; batch < batches; ++batch) + { + // We proceed through the depth in tranches (see comment above). The + // depth_base is the depth at the beginning of the tranche. The + // tranche_depth is the depth dimension of the tranche. 
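    // For example, with depth == 600 and kPoolingAccTrancheSize == 256 the
    // loop below runs three times with tranche_depth equal to 256, 256 and 88,
    // so the fixed-size int32 accumulator array above never has to grow with
    // the tensor depth.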
+ for (int depth_base = 0; depth_base < depth; depth_base += kPoolingAccTrancheSize) + { + const int tranche_depth = std::min(depth - depth_base, kPoolingAccTrancheSize); + for (int out_y = 0; out_y < output_height; ++out_y) + { + for (int out_x = 0; out_x < output_width; ++out_x) + { + const int in_x_origin = (out_x * stride_width) - params.padding_values.width; + const int in_y_origin = (out_y * stride_height) - params.padding_values.height; + const int filter_x_start = std::max(0, -in_x_origin); + const int filter_x_end = std::min(params.filter_width, input_width - in_x_origin); + const int filter_y_start = std::max(0, -in_y_origin); + const int filter_y_end = std::min(params.filter_height, input_height - in_y_origin); + const int filter_count = + (filter_x_end - filter_x_start) * (filter_y_end - filter_y_start); + memset(acc, 0, tranche_depth * sizeof(acc[0])); + const int8_t *input_ptr = + input_data + depth_base + + depth * (in_x_origin + input_width * (in_y_origin + input_height * batch)); + for (int fy = filter_y_start; fy < filter_y_end; fy++) + { + const int8_t *input_row_ptr = input_ptr + depth * (fy * input_width + filter_x_start); + for (int fx = filter_x_start; fx < filter_x_end; fx++) + { + const int8_t *input_channel_ptr = input_row_ptr; + int channel = 0; +#ifdef USE_NEON + for (; channel <= tranche_depth - 16; channel += 16) + { + int16x4_t acc_reg[4]; + int8x16_t input_reg = vld1q_s8(input_channel_ptr); + input_channel_ptr += 16; + acc_reg[0] = vget_low_s16(vmovl_s8(vget_low_s8(input_reg))); + acc_reg[1] = vget_high_s16(vmovl_s8(vget_low_s8(input_reg))); + acc_reg[2] = vget_low_s16(vmovl_s8(vget_high_s8(input_reg))); + acc_reg[3] = vget_high_s16(vmovl_s8(vget_high_s8(input_reg))); + for (int i = 0; i < 4; i++) + { + vst1q_s32(acc + channel + 4 * i, + vaddw_s16(vld1q_s32(acc + channel + 4 * i), acc_reg[i])); + } + } + for (; channel <= tranche_depth - 8; channel += 8) + { + int16x4_t acc_reg[2]; + int16x8_t input_reg = vmovl_s8(vld1_s8(input_channel_ptr)); + input_channel_ptr += 8; + acc_reg[0] = vget_low_s16(input_reg); + acc_reg[1] = vget_high_s16(input_reg); + for (int i = 0; i < 2; i++) + { + vst1q_s32(acc + channel + 4 * i, + vaddw_s16(vld1q_s32(acc + channel + 4 * i), acc_reg[i])); + } + } +#endif + for (; channel < tranche_depth; ++channel) + { + acc[channel] += *input_channel_ptr++; + } + input_row_ptr += depth; + } + } + int8_t *output_ptr = output_data + Offset(output_shape, batch, out_y, out_x, depth_base); + int channel = 0; +#ifdef USE_NEON + for (; channel <= tranche_depth - 8; channel += 8) + { + int16_t buf[8]; + for (int i = 0; i < 8; i++) + { + buf[i] = acc[channel + i] > 0 ? (acc[channel + i] + filter_count / 2) / filter_count + : (acc[channel + i] - filter_count / 2) / filter_count; + } + int8x8_t buf8 = vqmovn_s16(vld1q_s16(buf)); + buf8 = vmin_s8(buf8, vdup_n_s8(params.quantized_activation_max)); + buf8 = vmax_s8(buf8, vdup_n_s8(params.quantized_activation_min)); + vst1_s8(output_ptr + channel, buf8); + } +#endif + for (; channel < tranche_depth; ++channel) + { + int16_t a = acc[channel] > 0 ? 
(acc[channel] + filter_count / 2) / filter_count + : (acc[channel] - filter_count / 2) / filter_count; + a = std::max<int16_t>(a, params.quantized_activation_min); + a = std::min<int16_t>(a, params.quantized_activation_max); + output_ptr[channel] = static_cast<int8_t>(a); + } + } + } + } + } +} + } // namespace cker } // namespace nnfw diff --git a/compute/cker/include/cker/operation/BatchToSpaceND.h b/compute/cker/include/cker/operation/BatchToSpaceND.h index e33b2fba5..980ad48dd 100644 --- a/compute/cker/include/cker/operation/BatchToSpaceND.h +++ b/compute/cker/include/cker/operation/BatchToSpaceND.h @@ -43,7 +43,7 @@ inline void GetIndexRange(int spatial_index_dim, int block_shape_dim, int input_ // Similarly, (*end_index) * block_shape_dim is rounded up too (note that // end_index is exclusive). *end_index = - std::min(input_dim, (output_dim - spatial_index_dim + block_shape_dim - 1) / block_shape_dim); + std::min(input_dim, (output_dim - spatial_index_dim + block_shape_dim - 1) / block_shape_dim); } template <typename T> @@ -116,7 +116,7 @@ inline void BatchToSpaceND(const Shape &unextended_input1_shape, const T *input1 for (int in_w = in_w_start; in_w < in_w_end; ++in_w) { const int out_w = - in_w * block_shape_width + spatial_offset % block_shape_width - crops_left; + in_w * block_shape_width + spatial_offset % block_shape_width - crops_left; assert(out_w >= 0); assert(out_w < output_width); T *out = output_data + Offset(output_shape, out_batch, out_h, out_w, 0); diff --git a/compute/cker/include/cker/operation/BinaryArithmeticOps.h b/compute/cker/include/cker/operation/BinaryArithmeticOps.h index 8aef1f8c1..c7878496a 100644 --- a/compute/cker/include/cker/operation/BinaryArithmeticOps.h +++ b/compute/cker/include/cker/operation/BinaryArithmeticOps.h @@ -139,7 +139,7 @@ inline bool ProcessBroadcastShapes(const Shape &shape0, const Shape &shape1, // From this point it is assumed contractually that corresponding dimensions // in shape0 and shape1 are either (a) equal or (b) one or other equals 1. const bool swap_inputs = - params->broadcast_category == BroadcastableOpCategory::kSecondInputBroadcastsFast; + params->broadcast_category == BroadcastableOpCategory::kSecondInputBroadcastsFast; const Shape *shape_a = swap_inputs ? &extended_shape1 : &extended_shape0; const Shape *shape_b = swap_inputs ? 
&extended_shape0 : &extended_shape1; @@ -190,34 +190,34 @@ inline bool ProcessBroadcastShapes(const Shape &shape0, const Shape &shape1, } template <BinaryArithmeticOpType op_type, typename T> -inline void BinaryArithmeticOp(const BinaryArithmeticOpParam ¶ms, const Shape &input1_shape, - const T *input1_data, const Shape &input2_shape, - const T *input2_data, const Shape &output_shape, T *output_data) +inline typename std::enable_if_t<!is_quant8<T>::value> +BinaryArithmeticOp(const BinaryArithmeticOpParam ¶ms, const Shape &input1_shape, + const T *input1_data, const Shape &input2_shape, const T *input2_data, + const Shape &output_shape, T *output_data) { reference::BinaryArithmeticOp(params, input1_shape, input1_data, input2_shape, input2_data, output_shape, output_data, GetBinaryArtithmeticFn<op_type, T>()); } -template <BinaryArithmeticOpType op_type> -inline void BinaryArithmeticOp(const BinaryArithmeticOpParam ¶ms, const Shape &input1_shape, - const uint8_t *input1_data, const Shape &input2_shape, - const uint8_t *input2_data, const Shape &output_shape, - uint8_t *output_data) +template <BinaryArithmeticOpType op_type, typename T> +inline typename std::enable_if_t<is_quant8<T>::value> +BinaryArithmeticOp(const BinaryArithmeticOpParam ¶ms, const Shape &input1_shape, + const T *input1_data, const Shape &input2_shape, const T *input2_data, + const Shape &output_shape, T *output_data) { switch (op_type) { case nnfw::cker::BinaryArithmeticOpType::ADD: case nnfw::cker::BinaryArithmeticOpType::SUB: - optimized::AddQuant8(params, input1_shape, input1_data, input2_shape, input2_data, - output_shape, output_data); + optimized::Add(params, input1_shape, input1_data, input2_shape, input2_data, output_shape, + output_data); break; case nnfw::cker::BinaryArithmeticOpType::MUL: - optimized::MulQuant8(params, input1_shape, const_cast<uint8_t *>(input1_data), input2_shape, - const_cast<uint8_t *>(input2_data), output_shape, output_data); + optimized::Mul(params, input1_shape, input1_data, input2_shape, input2_data, output_shape, + output_data); break; case nnfw::cker::BinaryArithmeticOpType::DIV: throw std::runtime_error{"Quant8 Asymm NYI"}; - default: assert(false); break; @@ -246,9 +246,8 @@ inline void BinaryArithmeticOp(const BinaryArithmeticOpParam ¶ms, const Shap output_data); break; case nnfw::cker::BinaryArithmeticOpType::DIV: - reference::BinaryArithmeticOp<float>(params, input1_shape, input1_data, input2_shape, - input2_data, output_shape, output_data, - GetBinaryArtithmeticFn<op_type, float>()); + optimized::Div(params, input1_shape, input1_data, input2_shape, input2_data, output_shape, + output_data); break; default: assert(false); @@ -257,33 +256,32 @@ inline void BinaryArithmeticOp(const BinaryArithmeticOpParam ¶ms, const Shap } template <BinaryArithmeticOpType op_type, typename T> -inline void BroadcastBinaryArithmeticOp(BinaryArithmeticOpParam ¶ms, const Shape &input1_shape, - const T *input1_data, const Shape &input2_shape, - const T *input2_data, const Shape &output_shape, - T *output_data) +inline typename std::enable_if_t<!is_quant8<T>::value> +BroadcastBinaryArithmeticOp(BinaryArithmeticOpParam ¶ms, const Shape &input1_shape, + const T *input1_data, const Shape &input2_shape, const T *input2_data, + const Shape &output_shape, T *output_data) { reference::BroadcastBinaryArithmeticOpSlow(params, input1_shape, input1_data, input2_shape, input2_data, output_shape, output_data, GetBinaryArtithmeticFn<op_type, T>()); } -template <BinaryArithmeticOpType op_type> -inline void 
BroadcastBinaryArithmeticOp(BinaryArithmeticOpParam ¶ms, const Shape &input1_shape, - const uint8_t *input1_data, const Shape &input2_shape, - const uint8_t *input2_data, const Shape &output_shape, - uint8_t *output_data) +template <BinaryArithmeticOpType op_type, typename T> +inline typename std::enable_if_t<is_quant8<T>::value> +BroadcastBinaryArithmeticOp(BinaryArithmeticOpParam ¶ms, const Shape &input1_shape, + const T *input1_data, const Shape &input2_shape, const T *input2_data, + const Shape &output_shape, T *output_data) { switch (op_type) { case nnfw::cker::BinaryArithmeticOpType::ADD: case nnfw::cker::BinaryArithmeticOpType::SUB: - optimized::BroadcastAddDispatchQuant8(params, input1_shape, input1_data, input2_shape, - input2_data, output_shape, output_data); + optimized::BroadcastAddDispatch(params, input1_shape, input1_data, input2_shape, input2_data, + output_shape, output_data); break; case nnfw::cker::BinaryArithmeticOpType::MUL: - optimized::BroadcastMulDispatchQuant8( - params, input1_shape, const_cast<uint8_t *>(input1_data), input2_shape, - const_cast<uint8_t *>(input2_data), output_shape, output_data); + optimized::BroadcastMulDispatch(params, input1_shape, input1_data, input2_shape, input2_data, + output_shape, output_data); break; case nnfw::cker::BinaryArithmeticOpType::DIV: case nnfw::cker::BinaryArithmeticOpType::POW: @@ -312,11 +310,17 @@ inline void BroadcastBinaryArithmeticOp(BinaryArithmeticOpParam ¶ms, const S output_shape, output_data); break; case nnfw::cker::BinaryArithmeticOpType::SUB: + optimized::BroadcastSubDispatch(params, input1_shape, input1_data, input2_shape, input2_data, + output_shape, output_data); + break; case nnfw::cker::BinaryArithmeticOpType::DIV: + optimized::BroadcastDivDispatch(params, input1_shape, input1_data, input2_shape, input2_data, + output_shape, output_data); + break; case nnfw::cker::BinaryArithmeticOpType::POW: reference::BroadcastBinaryArithmeticOpSlow<float>( - params, input1_shape, input1_data, input2_shape, input2_data, output_shape, output_data, - GetBinaryArtithmeticFn<op_type, float>()); + params, input1_shape, input1_data, input2_shape, input2_data, output_shape, output_data, + GetBinaryArtithmeticFn<op_type, float>()); break; default: assert(false); diff --git a/compute/cker/include/cker/operation/BroadcastTo.h b/compute/cker/include/cker/operation/BroadcastTo.h index 5068eca96..145deda29 100644 --- a/compute/cker/include/cker/operation/BroadcastTo.h +++ b/compute/cker/include/cker/operation/BroadcastTo.h @@ -126,7 +126,7 @@ template <typename Device, typename T> struct BroadcastTo } } }; -} // functor +} // namespace functor template <typename T> inline void BroadcastTo(const Shape &input_shape, T *input_data, const Shape &output_shape, diff --git a/compute/cker/include/cker/operation/Common.h b/compute/cker/include/cker/operation/Common.h index d69b38aca..24d4cc4c7 100644 --- a/compute/cker/include/cker/operation/Common.h +++ b/compute/cker/include/cker/operation/Common.h @@ -82,7 +82,7 @@ inline void BiasAndClamp(float clamp_min, float clamp_max, int bias_size, const for (; i < bias_size; i++) { array_ptr[i] = - ActivationFunctionWithMinMax(array_ptr[i] + bias_data[i], clamp_min, clamp_max); + ActivationFunctionWithMinMax(array_ptr[i] + bias_data[i], clamp_min, clamp_max); } } #else // not NEON @@ -91,7 +91,7 @@ inline void BiasAndClamp(float clamp_min, float clamp_max, int bias_size, const for (int i = 0; i < bias_size; i++) { array_data[array_offset + i] = ActivationFunctionWithMinMax( - array_data[array_offset + 
i] + bias_data[i], clamp_min, clamp_max); + array_data[array_offset + i] + bias_data[i], clamp_min, clamp_max); } } #endif diff --git a/compute/cker/include/cker/operation/Comparison.h b/compute/cker/include/cker/operation/Comparison.h index 47eb6034c..ac6af8487 100644 --- a/compute/cker/include/cker/operation/Comparison.h +++ b/compute/cker/include/cker/operation/Comparison.h @@ -42,7 +42,7 @@ inline void ComparisonImpl(const Shape &input1_shape, const T *input1_data, const Shape &output_shape, bool *output_data) { const int64_t flatsize = // number of data.... - MatchingFlatSize(input1_shape, input2_shape, output_shape); + MatchingFlatSize(input1_shape, input2_shape, output_shape); for (int64_t i = 0; i < flatsize; ++i) { output_data[i] = F(input1_data[i], input2_data[i]); @@ -79,9 +79,9 @@ inline void ComparisonWithScaling(ComparisonParams ¶ms, const Shape &input1_ const int32_t shifted_input1_val = input1_val * (1 << left_shift); const int32_t shifted_input2_val = input2_val * (1 << left_shift); const int32_t scaled_input1_val = MultiplyByQuantizedMultiplierSmallerThanOneExp( - shifted_input1_val, input1_multiplier, input1_shift); + shifted_input1_val, input1_multiplier, input1_shift); const int32_t scaled_input2_val = MultiplyByQuantizedMultiplierSmallerThanOneExp( - shifted_input2_val, input2_multiplier, input2_shift); + shifted_input2_val, input2_multiplier, input2_shift); output_data[i] = F(scaled_input1_val, scaled_input2_val); } } @@ -111,8 +111,8 @@ BroadcastComparison4DSlowImpl(const Shape &unextended_input1_shape, const T *inp for (int c = 0; c < output_shape.Dims(3); ++c) { output_data[Offset(output_shape, b, y, x, c)] = - F(input1_data[SubscriptToIndex(desc1, b, y, x, c)], - input2_data[SubscriptToIndex(desc2, b, y, x, c)]); + F(input1_data[SubscriptToIndex(desc1, b, y, x, c)], + input2_data[SubscriptToIndex(desc2, b, y, x, c)]); } } } @@ -159,15 +159,15 @@ inline void BroadcastComparison4DSlowWithScaling(ComparisonParams ¶ms, for (int c = 0; c < output_shape.Dims(3); ++c) { const int32_t input1_val = - input1_offset + input1_data[SubscriptToIndex(desc1, b, y, x, c)]; + input1_offset + input1_data[SubscriptToIndex(desc1, b, y, x, c)]; const int32_t input2_val = - input2_offset + input2_data[SubscriptToIndex(desc2, b, y, x, c)]; + input2_offset + input2_data[SubscriptToIndex(desc2, b, y, x, c)]; const int32_t shifted_input1_val = input1_val * (1 << left_shift); const int32_t shifted_input2_val = input2_val * (1 << left_shift); const int32_t scaled_input1_val = MultiplyByQuantizedMultiplierSmallerThanOneExp( - shifted_input1_val, input1_multiplier, input1_shift); + shifted_input1_val, input1_multiplier, input1_shift); const int32_t scaled_input2_val = MultiplyByQuantizedMultiplierSmallerThanOneExp( - shifted_input2_val, input2_multiplier, input2_shift); + shifted_input2_val, input2_multiplier, input2_shift); output_data[Offset(output_shape, b, y, x, c)] = F(scaled_input1_val, scaled_input2_val); } } @@ -175,55 +175,53 @@ inline void BroadcastComparison4DSlowWithScaling(ComparisonParams ¶ms, } } -#define TFLITE_COMPARISON_OP(name) \ - template <typename T> \ - inline void name(const Shape &input1_shape, const T *input1_data, const Shape &input2_shape, \ - const T *input2_data, const Shape &output_shape, bool *output_data) \ - { \ - Comparison<name##Fn>(input1_shape, input1_data, input2_shape, input2_data, output_shape, \ - output_data); \ - } \ - template <typename T> \ - inline void name##NoScaling(const Shape &input1_shape, const T *input1_data, \ - const Shape &input2_shape, 
const T *input2_data, \ - const Shape &output_shape, bool *output_data) \ - { \ - ComparisonImpl<T, name##Fn>(input1_shape, input1_data, input2_shape, input2_data, \ - output_shape, output_data); \ - } \ - template <typename T> \ - inline void name##WithScaling(ComparisonParams ¶ms, const Shape &input1_shape, \ - const T *input1_data, const Shape &input2_shape, \ - const T *input2_data, const Shape &output_shape, \ - bool *output_data) \ - { \ - ComparisonWithScaling<T, name##Fn>(params, input1_shape, input1_data, input2_shape, \ - input2_data, output_shape, output_data); \ - } \ - template <typename T> \ - inline void Broadcast4DSlow##name##NoScaling(const Shape &input1_shape, const T *input1_data, \ - const Shape &input2_shape, const T *input2_data, \ - const Shape &output_shape, bool *output_data) \ - { \ - BroadcastComparison4DSlowImpl<T, name##Fn>(input1_shape, input1_data, input2_shape, \ - input2_data, output_shape, output_data); \ - } \ - template <typename T> \ - inline void Broadcast4DSlow##name(const Shape &input1_shape, const T *input1_data, \ - const Shape &input2_shape, const T *input2_data, \ - const Shape &output_shape, bool *output_data) \ - { \ - BroadcastComparison4DSlow<T, name##Fn>(input1_shape, input1_data, input2_shape, input2_data, \ - output_shape, output_data); \ - } \ - template <typename T> \ - inline void Broadcast4DSlow##name##WithScaling(ComparisonParams ¶ms, \ - const Shape &input1_shape, const T *input1_data, \ - const Shape &input2_shape, const T *input2_data, \ - const Shape &output_shape, bool *output_data) \ - { \ - BroadcastComparison4DSlowWithScaling<T, name##Fn>( \ - params, input1_shape, input1_data, input2_shape, input2_data, output_shape, output_data); \ +#define TFLITE_COMPARISON_OP(name) \ + template <typename T> \ + inline void name(const Shape &input1_shape, const T *input1_data, const Shape &input2_shape, \ + const T *input2_data, const Shape &output_shape, bool *output_data) \ + { \ + Comparison<name##Fn>(input1_shape, input1_data, input2_shape, input2_data, output_shape, \ + output_data); \ + } \ + template <typename T> \ + inline void name##NoScaling(const Shape &input1_shape, const T *input1_data, \ + const Shape &input2_shape, const T *input2_data, \ + const Shape &output_shape, bool *output_data) \ + { \ + ComparisonImpl<T, name##Fn>(input1_shape, input1_data, input2_shape, input2_data, \ + output_shape, output_data); \ + } \ + template <typename T> \ + inline void name##WithScaling( \ + ComparisonParams ¶ms, const Shape &input1_shape, const T *input1_data, \ + const Shape &input2_shape, const T *input2_data, const Shape &output_shape, bool *output_data) \ + { \ + ComparisonWithScaling<T, name##Fn>(params, input1_shape, input1_data, input2_shape, \ + input2_data, output_shape, output_data); \ + } \ + template <typename T> \ + inline void Broadcast4DSlow##name##NoScaling(const Shape &input1_shape, const T *input1_data, \ + const Shape &input2_shape, const T *input2_data, \ + const Shape &output_shape, bool *output_data) \ + { \ + BroadcastComparison4DSlowImpl<T, name##Fn>(input1_shape, input1_data, input2_shape, \ + input2_data, output_shape, output_data); \ + } \ + template <typename T> \ + inline void Broadcast4DSlow##name(const Shape &input1_shape, const T *input1_data, \ + const Shape &input2_shape, const T *input2_data, \ + const Shape &output_shape, bool *output_data) \ + { \ + BroadcastComparison4DSlow<T, name##Fn>(input1_shape, input1_data, input2_shape, input2_data, \ + output_shape, output_data); \ + } \ + template <typename T> 
\ + inline void Broadcast4DSlow##name##WithScaling( \ + ComparisonParams ¶ms, const Shape &input1_shape, const T *input1_data, \ + const Shape &input2_shape, const T *input2_data, const Shape &output_shape, bool *output_data) \ + { \ + BroadcastComparison4DSlowWithScaling<T, name##Fn>( \ + params, input1_shape, input1_data, input2_shape, input2_data, output_shape, output_data); \ } TFLITE_COMPARISON_OP(Equal); diff --git a/compute/cker/include/cker/operation/Concatenation.h b/compute/cker/include/cker/operation/Concatenation.h index 394123e30..9aaca00b7 100644 --- a/compute/cker/include/cker/operation/Concatenation.h +++ b/compute/cker/include/cker/operation/Concatenation.h @@ -142,7 +142,7 @@ inline void ConcatenationWithScaling(const ConcatenationParams ¶ms, for (int j = 0; j < copy_size; ++j) { const int32_t value = - static_cast<int32_t>(std::round(input_ptr[j] * scale + bias)) + output_zeropoint; + static_cast<int32_t>(std::round(input_ptr[j] * scale + bias)) + output_zeropoint; output_ptr[j] = static_cast<uint8_t>(std::max(std::min(255, value), 0)); } } diff --git a/compute/cker/include/cker/operation/Conv.h b/compute/cker/include/cker/operation/Conv.h index 214f2e612..2572b51ee 100644 --- a/compute/cker/include/cker/operation/Conv.h +++ b/compute/cker/include/cker/operation/Conv.h @@ -57,9 +57,9 @@ class Conv public: Conv() : _modified_filter_data(), _im2col_shape(4), _need_im2col(false), _prepared(false) {} - void prepare(const Shape &filter_shape, const float *filter_data, PaddingType padding_type, - bool &is_replaced_weights, uint32_t dilationWidthFactor, - uint32_t dilationHeightFactor) + void prepareF32(const Shape &filter_shape, const float *filter_data, PaddingType padding_type, + bool &is_replaced_weights, uint32_t dilationWidthFactor, + uint32_t dilationHeightFactor) { if (!_prepared) { @@ -71,12 +71,14 @@ public: } } - void prepareQuant(const Shape &input_shape, const Shape &kernel_shape, const Shape &output_shape, - uint32_t stride_width, uint32_t stride_height) + void prepareQ8uPerTensor(const Shape &input_shape, const Shape &kernel_shape, + const Shape &output_shape, uint32_t stride_width, uint32_t stride_height, + uint32_t dilation_width_factor, uint32_t dilation_height_factor) { if (!_prepared) { - IsRequiredIm2col(input_shape, kernel_shape, output_shape, stride_width, stride_height); + IsRequiredIm2col(input_shape, kernel_shape, output_shape, stride_width, stride_height, + dilation_width_factor, dilation_height_factor); _prepared = true; } } @@ -115,7 +117,8 @@ public: { // This means that input or output are dynamic or filter is not constant IsRequiredIm2col(input_shape, filter_shape, output_shape, params.stride_width, - params.stride_height); + params.stride_height, params.dilation_width_factor, + params.dilation_height_factor); } int im2col_size = _need_im2col ? 
_im2col_shape.FlatSize() : 1; @@ -135,6 +138,29 @@ public: } } + void operator()(const ConvParams ¶ms, const Shape &input_shape, const uint8_t *input_data, + const Shape &filter_shape, const uint8_t *filter_data, + const int32_t *filter_zero_point, const Shape &bias_shape, + const int32_t *bias_data, const Shape &output_shape, uint8_t *output_data) + { + reference::Conv<uint8_t, true>(params, _per_channel_output_multiplier.data(), + _per_channel_output_shift.data(), input_shape, input_data, + filter_shape, filter_data, filter_zero_point, bias_shape, + bias_data, output_shape, output_data); + } + + void operator()(const ConvParams ¶ms, const Shape &input_shape, const int8_t *input_data, + const Shape &filter_shape, const int8_t *filter_data, const Shape &bias_shape, + const int32_t *bias_data, const Shape &output_shape, int8_t *output_data) + { + reference::Conv<int8_t, false>(params, _per_channel_output_multiplier.data(), + _per_channel_output_shift.data(), input_shape, input_data, + filter_shape, filter_data, nullptr /* filter_zero_point */, + bias_shape, bias_data, output_shape, output_data); + } + std::vector<int32_t> &per_channel_output_multiplier() { return _per_channel_output_multiplier; } + std::vector<int> &per_channel_output_shift() { return _per_channel_output_shift; } + private: bool usableMultiThreaded(PaddingType padding_type, uint32_t dilation_width_factor, int32_t dilation_height_factor) @@ -154,10 +180,15 @@ private: } void IsRequiredIm2col(const Shape &input_shape, const Shape &kernel_shape, - const Shape &output_shape, uint32_t stride_width, uint32_t stride_height) + const Shape &output_shape, uint32_t stride_width, uint32_t stride_height, + uint32_t dilation_width_factor, uint32_t dilation_height_factor) { - _need_im2col = stride_width != 1 || stride_height != 1 || kernel_shape.Dims(1) != 1 || - kernel_shape.Dims(2) != 1; + const bool need_dilated_im2col = dilation_width_factor != 1 || dilation_height_factor != 1; + const bool need_non_dilated_im2col = stride_width != 1 || stride_height != 1 || + kernel_shape.Dims(1) != 1 || kernel_shape.Dims(2) != 1; + + _need_im2col = need_dilated_im2col || need_non_dilated_im2col; + if (_need_im2col) { _im2col_shape.SetDim(0, output_shape.Dims(0)); @@ -172,7 +203,25 @@ private: Shape _im2col_shape; bool _need_im2col; bool _prepared; + // Per channel output multiplier and shift. + std::vector<int32_t> _per_channel_output_multiplier; + std::vector<int> _per_channel_output_shift; +}; + +struct ConvHybridTempArena +{ + ConvHybridTempArena(int batch_size, int input_size) + { + input_quantized.resize(input_size); + // TODO: Optimize the case of batch_size = 1 + input_scaling_factors.resize(batch_size); + input_offsets.resize(batch_size); + } + std::vector<int8_t> input_quantized; + std::vector<float> input_scaling_factors; + std::vector<int32_t> input_offsets; }; + } // namespace cker } // namespace nnfw diff --git a/compute/cker/include/cker/operation/DepthToSpace.h b/compute/cker/include/cker/operation/DepthToSpace.h new file mode 100644 index 000000000..e57fef01d --- /dev/null +++ b/compute/cker/include/cker/operation/DepthToSpace.h @@ -0,0 +1,71 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2017 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __NNFW_CKER_DEPTH_TO_SPACE_H__ +#define __NNFW_CKER_DEPTH_TO_SPACE_H__ + +#include "cker/Shape.h" +#include "cker/Types.h" + +namespace nnfw +{ +namespace cker +{ + +template <typename T> +inline void DepthToSpace(const Shape &unextended_input_shape, const T *input_data, + const Shape &unextended_output_shape, T *output_data, int32_t block_size) +{ + assert(unextended_input_shape.DimensionsCount() <= 4); + assert(unextended_output_shape.DimensionsCount() <= 4); + const Shape input_shape = Shape::ExtendedShape(4, unextended_input_shape); + const Shape output_shape = Shape::ExtendedShape(4, unextended_output_shape); + + const int input_depth = input_shape.Dims(3); + const int input_width = input_shape.Dims(2); + const int input_height = input_shape.Dims(1); + + const int output_depth = output_shape.Dims(3); + const int batch_size = output_shape.Dims(0); + + // Number of continuous values that we can copy in one interation. + const int stride = block_size * output_depth; + + for (int batch = 0; batch < batch_size; ++batch) + { + for (int in_h = 0; in_h < input_height; ++in_h) + { + const T *input_ptr = input_data + Offset(input_shape, batch, in_h, 0, 0); + for (int offset_h = 0; offset_h < block_size; ++offset_h) + { + const T *src = input_ptr; + for (int in_w = 0; in_w < input_width; ++in_w) + { + memcpy(output_data, src, stride * sizeof(T)); + output_data += stride; + src += input_depth; + } + input_ptr += stride; + } + } + } +} + +} // namespace cker +} // namespace nnfw + +#endif // __NNFW_CKER_SPACE_TO_DEPTH_H__ diff --git a/compute/cker/include/cker/operation/DepthwiseConv.h b/compute/cker/include/cker/operation/DepthwiseConv.h index 814a9e019..c926ec4f1 100644 --- a/compute/cker/include/cker/operation/DepthwiseConv.h +++ b/compute/cker/include/cker/operation/DepthwiseConv.h @@ -22,143 +22,162 @@ #include "cker/Types.h" #include "cker/Utils.h" #include "cker/neon/neon_check.h" +#include "cker/operation/optimized/DepthwiseConvFloat.h" #include "cker/operation/optimized/DepthwiseConvUint8.h" +#include "cker/operation/optimized/integer_ops/DepthwiseConvInt8.h" +#include "cker/operation/reference/integer_ops/DepthwiseConvUInt8.h" +#include "cker/operation/reference/integer_ops/DepthwiseConvHybrid.h" +#include "cker/CpuBackendThreadpool.h" namespace nnfw { namespace cker { -inline void DepthwiseConv(const DepthwiseConvParams ¶ms, const Shape &input_shape, - const uint8_t *input_data, const Shape &filter_shape, - const uint8_t *filter_data, const Shape &bias_shape, - const int32_t *bias_data, const Shape &output_shape, uint8_t *output_data) +// TODO(luwa): add multithread to per-channel depthwise_conv +// DepthwiseConv can run with multi threads on the dim specified by thread_dim. +// Each thread processes output elements on dim, thread_dim, in the range of +// [thread_start, thread_end). +// For example, assume thread_start = 2, thread_end = 6, and thread_dim = 1, it +// means that it will calculate DepthwiseConv for output_data[:, 2:5, :, :]. 
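[Editor's aside, not part of the patch] The comment above describes how the new multithreaded path splits one output dimension into contiguous [thread_start, thread_end) ranges, one per worker task. Below is a minimal standalone sketch of the partitioning arithmetic used further down in DepthwiseConv() (thread_end = thread_start + remaining size / remaining tasks); the sizes and the tiny program are illustrative only, not code from this patch.

    #include <cstdio>

    int main()
    {
      // Split a dimension of size 10 (e.g. output_height) across 3 worker tasks.
      const int dim_size = 10;
      const int thread_count = 3;
      int thread_start = 0;
      for (int i = 0; i < thread_count; ++i)
      {
        const int thread_end = thread_start + (dim_size - thread_start) / (thread_count - i);
        std::printf("task %d handles indices [%d, %d)\n", i, thread_start, thread_end);
        thread_start = thread_end;
      }
      // Prints [0, 3), [3, 6), [6, 10): every index is covered exactly once,
      // and the remainder goes to the last task.
      return 0;
    }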
+template <typename T, typename TS> struct DepthwiseConvWorkerTask : cpu_backend_threadpool::Task { - const int depth_multiplier = params.depth_multiplier; - const int32_t output_activation_min = params.quantized_activation_min; - const int32_t output_activation_max = params.quantized_activation_max; - const int dilation_width_factor = params.dilation_width_factor; - const int dilation_height_factor = params.dilation_height_factor; - assert(dilation_width_factor >= 1); - assert(dilation_height_factor >= 1); - UNUSED_RELEASE(dilation_width_factor); - UNUSED_RELEASE(dilation_height_factor); - assert(input_shape.DimensionsCount() == 4); - assert(filter_shape.DimensionsCount() == 4); - assert(output_shape.DimensionsCount() == 4); - assert(output_activation_min <= output_activation_max); - UNUSED_RELEASE(output_activation_min); - UNUSED_RELEASE(output_activation_max); - const int output_depth = MatchingDim(filter_shape, 3, output_shape, 3); - const int input_depth = input_shape.Dims(3); - assert(output_depth == input_depth * depth_multiplier); - assert(bias_shape.FlatSize() == output_depth); - UNUSED_RELEASE(input_depth); - UNUSED_RELEASE(output_depth); - UNUSED_RELEASE(depth_multiplier); - -// Enable for arm64 except for the Nvidia Linux 4 Tegra (L4T) running on -// Jetson TX-2. This compiler does not support the offsetof() macro. -#if defined(__aarch64__) -// TODO Use below codes - -// const int stride_width = params.stride_width; -// const int stride_height = params.stride_height; -// const int pad_width = params.padding_values.width; -// const int pad_height = params.padding_values.height; -// const int output_shift = params.output_shift; -// -// // Call kernel optimized for depthwise convolutions using 3x3 filters if -// // parameters are supported. 
-// if (Fast3x3FilterKernelSupported( -// input_shape, filter_shape, stride_width, stride_height, -// dilation_width_factor, dilation_height_factor, pad_width, pad_height, -// depth_multiplier, output_shape, output_shift)) { -// DepthwiseConv3x3Filter(params, input_shape, input_data, filter_shape, -// filter_data, bias_shape, bias_data, output_shape, -// output_data); -// return; -// } -#endif - - optimized::DepthwiseConvGeneral(params, input_shape, input_data, filter_shape, filter_data, - bias_shape, bias_data, output_shape, output_data); + DepthwiseConvWorkerTask(const DepthwiseConvParams ¶ms, const Shape &input_shape, + const T *input_data, const Shape &filter_shape, const T *filter_data, + const Shape &bias_shape, const TS *bias_data, const Shape &output_shape, + T *output_data, int thread_start, int thread_end, int thread_dim) + : params_(params), input_shape_(input_shape), input_data_(input_data), + filter_shape_(filter_shape), filter_data_(filter_data), bias_shape_(bias_shape), + bias_data_(bias_data), output_shape_(output_shape), output_data_(output_data), + thread_start_(thread_start), thread_end_(thread_end), thread_dim_(thread_dim) + { + } + + void Run() override + { + optimized::DepthwiseConvImpl(params_, input_shape_, input_data_, filter_shape_, filter_data_, + bias_shape_, bias_data_, output_shape_, output_data_, + thread_start_, thread_end_, thread_dim_); + } + +private: + const DepthwiseConvParams ¶ms_; + const Shape &input_shape_; + const T *input_data_; + const Shape &filter_shape_; + const T *filter_data_; + const Shape &bias_shape_; + const TS *bias_data_; + const Shape &output_shape_; + T *output_data_; + // const CpuFlags& cpu_flags_; + int thread_start_; + int thread_end_; + int thread_dim_; +}; + +inline int HowManyConvThreads(const Shape &output_shape, const Shape &filter_shape) +{ + // How many scalar multiplications are needed to make it worth using one + // more thread + static constexpr int kMinMulPerThread = 1 << 13; // 8k + const int filter_height = filter_shape.Dims(1); + const int filter_width = filter_shape.Dims(2); + const int num_muls = output_shape.FlatSize() * filter_height * filter_width; + // Try to avoid real runtime divisions if possible by dividing by a + // compile-time constant. + int thread_count = std::max(1, num_muls / kMinMulPerThread); + return thread_count; +} + +inline bool MultithreadAlongBatches(int thread_count, int batches) +{ + assert(thread_count >= 2); + // If there are fewer batch entries than the number of threads we want to use, + // then better do intra-batch-entry multithreading. + if (batches < thread_count) + { + return false; + } + // If there are at least 2 batch entries to be handed to each thread, then + // it's safe to proceed with batch-wise multithreading: each thread will have + // approximately equal number of batch entries to handle, so the load + // balancing will be reasonable, and the amount to which the load is not + // perfectly balanced will be offset by the inherent advantages of + // batch-wise multithreading (each thread is more efficient thanks to working + // on larger buffers with less boundary-handling overhead). + if (batches >= 2 * thread_count) + { + return true; + } + // In the limit case were there are at least 1 but not much more than 1 + // batch entries per thread, it may be a good idea to do per-batch + // multithreading if the number of batch entries is a multiple of the number + // of threads, so that each thread will have the same number of batch entries + // to process. 
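  // [Editor's aside, not part of the patch] Worked example of the three rules in
  // MultithreadAlongBatches(), assuming thread_count = 4: batches = 3 returns
  // false above (3 < 4, so the other dimension is split instead); batches = 8
  // returns true above (8 >= 2 * 4, batch-wise threading); batches = 6 falls
  // through to the check below and returns false (6 % 4 != 0), while
  // batches = 4 returns true (4 % 4 == 0).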
+ return ((batches % thread_count) == 0); } +template <typename T, typename TS> inline void DepthwiseConv(const DepthwiseConvParams ¶ms, const Shape &input_shape, - const float *input_data, const Shape &filter_shape, - const float *filter_data, const Shape &bias_shape, const float *bias_data, - const Shape &output_shape, float *output_data) + const T *input_data, const Shape &filter_shape, const T *filter_data, + const Shape &bias_shape, const TS *bias_data, const Shape &output_shape, + T *output_data, ruy::Context *ruy_context) { - const int stride_width = params.stride_width; - const int stride_height = params.stride_height; - const int dilation_width_factor = params.dilation_width_factor; - const int dilation_height_factor = params.dilation_height_factor; - const int pad_width = params.padding_values.width; - const int pad_height = params.padding_values.height; - const int depth_multiplier = params.depth_multiplier; - const float output_activation_min = params.float_activation_min; - const float output_activation_max = params.float_activation_max; assert(input_shape.DimensionsCount() == 4); assert(filter_shape.DimensionsCount() == 4); assert(output_shape.DimensionsCount() == 4); - const int batches = MatchingDim(input_shape, 0, output_shape, 0); - const int output_depth = MatchingDim(filter_shape, 3, output_shape, 3); - const int input_height = input_shape.Dims(1); - const int input_width = input_shape.Dims(2); - const int input_depth = input_shape.Dims(3); - const int filter_height = filter_shape.Dims(1); - const int filter_width = filter_shape.Dims(2); + int thread_count = HowManyConvThreads(output_shape, filter_shape); + + // NOTE Borrow RuyContext to get max_num_threads setting + // TODO Define and use max_num_threads for CPU backend + const auto max_threads = (ruy_context == nullptr) ? 1 : ruy_context->max_num_threads(); + + thread_count = std::max(1, std::min(thread_count, max_threads)); + // Cap the number of threads to 2 for float path to avoid regression in + // performance (b/132294857). + if (std::is_floating_point<T>::value) + { + thread_count = std::min(thread_count, 2); + } + + const int output_batches = output_shape.Dims(0); const int output_height = output_shape.Dims(1); - const int output_width = output_shape.Dims(2); - assert(output_depth == input_depth * depth_multiplier); - assert(bias_shape.FlatSize() == output_depth); - UNUSED_RELEASE(output_depth); - UNUSED_RELEASE(bias_shape); - for (int b = 0; b < batches; ++b) + if (thread_count == 1) + { + optimized::DepthwiseConvImpl(params, input_shape, input_data, filter_shape, filter_data, + bias_shape, bias_data, output_shape, output_data, 0, output_height, + 1); + return; + } + + int thread_dim, thread_dim_size; + if (MultithreadAlongBatches(thread_count, output_batches)) + { + thread_dim = 0; + thread_dim_size = output_batches; + } + else + { + thread_dim = 1; + thread_dim_size = output_height; + } + + std::vector<DepthwiseConvWorkerTask<T, TS>> tasks; + // TODO(b/131746020) don't create new heap allocations every time. + // At least we make it a single heap allocation by using reserve(). 
+ tasks.reserve(thread_count); + int thread_start = 0; + for (int i = 0; i < thread_count; ++i) { - for (int out_y = 0; out_y < output_height; ++out_y) - { - for (int out_x = 0; out_x < output_width; ++out_x) - { - for (int ic = 0; ic < input_depth; ++ic) - { - for (int m = 0; m < depth_multiplier; m++) - { - const int oc = m + ic * depth_multiplier; - const int in_x_origin = (out_x * stride_width) - pad_width; - const int in_y_origin = (out_y * stride_height) - pad_height; - float total = 0.f; - for (int filter_y = 0; filter_y < filter_height; ++filter_y) - { - for (int filter_x = 0; filter_x < filter_width; ++filter_x) - { - const int in_x = in_x_origin + dilation_width_factor * filter_x; - const int in_y = in_y_origin + dilation_height_factor * filter_y; - // If the location is outside the bounds of the input image, - // use zero as a default value. - if ((in_x >= 0) && (in_x < input_width) && (in_y >= 0) && (in_y < input_height)) - { - float input_value = input_data[Offset(input_shape, b, in_y, in_x, ic)]; - float filter_value = filter_data[Offset(filter_shape, 0, filter_y, filter_x, oc)]; - total += (input_value * filter_value); - } - } - } - float bias_value = 0.0f; - if (bias_data) - { - bias_value = bias_data[oc]; - } - output_data[Offset(output_shape, b, out_y, out_x, oc)] = ActivationFunctionWithMinMax( - total + bias_value, output_activation_min, output_activation_max); - } - } - } - } + int thread_end = thread_start + (thread_dim_size - thread_start) / (thread_count - i); + tasks.emplace_back(params, input_shape, input_data, filter_shape, filter_data, bias_shape, + bias_data, output_shape, output_data, thread_start, thread_end, thread_dim); + thread_start = thread_end; } + cpu_backend_threadpool::Execute(tasks.size(), tasks.data(), ruy_context); } } // namespace cker diff --git a/compute/cker/include/cker/operation/Dequantize.h b/compute/cker/include/cker/operation/Dequantize.h new file mode 100644 index 000000000..c8c2fd9d4 --- /dev/null +++ b/compute/cker/include/cker/operation/Dequantize.h @@ -0,0 +1,151 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef __NNFW_CKER_DEQUANTIZE_H__ +#define __NNFW_CKER_DEQUANTIZE_H__ + +#include "cker/Shape.h" +#include "cker/Types.h" +#include "cker/neon/neon_check.h" + +namespace nnfw +{ +namespace cker +{ + +#ifdef USE_NEON +namespace +{ +inline void ScaleWithNewZeroPoint(const int32x4_t input, const float32x4_t scale_dup, + const float32x4_t zero_times_scale_dup, float32x4_t *output) +{ +#ifdef __ARM_FEATURE_FMA + *output = vfmaq_f32(zero_times_scale_dup, vcvtq_f32_s32(input), scale_dup); +#else + *output = vaddq_f32(vmulq_f32(vcvtq_f32_s32(input), scale_dup), zero_times_scale_dup); +#endif +} +} // namespace +#endif // USE_NEON + +inline void Dequantize(const Shape &input_shape, const uint8_t *input_data, + const Shape &output_shape, float *output_data, const float scale, + const int32_t zero_point) +{ + const int flat_size = MatchingFlatSize(input_shape, output_shape); + + int i = 0; +#ifdef USE_NEON + const float32x4_t scale_dup = vdupq_n_f32(static_cast<float>(scale)); + const float32x4_t zero_times_scale_dup = vdupq_n_f32(static_cast<float>(-zero_point * scale)); + for (; i <= flat_size - 8; i += 8) + { + const uint8x8_t input_u8 = vld1_u8(input_data + i); + const uint16x8_t input_u16 = vmovl_u8(input_u8); + const int16x8_t input_s16 = vreinterpretq_s16_u16(input_u16); + const int16x4_t input_s16_low = vget_low_s16(input_s16); + const int16x4_t input_s16_high = vget_high_s16(input_s16); + const int32x4_t val_low = vmovl_s16(input_s16_low); + const int32x4_t val_high = vmovl_s16(input_s16_high); + + float32x4_t result_low, result_high; + ScaleWithNewZeroPoint(val_low, scale_dup, zero_times_scale_dup, &result_low); + ScaleWithNewZeroPoint(val_high, scale_dup, zero_times_scale_dup, &result_high); + + vst1q_f32(output_data + i, result_low); + vst1q_f32(output_data + i + 4, result_high); + } +#endif // NEON + for (; i < flat_size; ++i) + { + const int32_t val = input_data[i]; + const float result = static_cast<float>(scale * (val - zero_point)); + output_data[i] = result; + } +} + +inline void Dequantize(const Shape &input_shape, const int8_t *input_data, + const Shape &output_shape, float *output_data, const float scale, + const int32_t zero_point) +{ + const int flat_size = MatchingFlatSize(input_shape, output_shape); + + int i = 0; +#ifdef USE_NEON + const float32x4_t scale_dup = vdupq_n_f32(static_cast<float>(scale)); + const float32x4_t zero_times_scale_dup = vdupq_n_f32(static_cast<float>(-zero_point * scale)); + for (; i <= flat_size - 8; i += 8) + { + const int8x8_t input_s8 = vld1_s8(input_data + i); + const int16x8_t input_s16 = vmovl_s8(input_s8); + const int16x4_t input_s16_low = vget_low_s16(input_s16); + const int16x4_t input_s16_high = vget_high_s16(input_s16); + const int32x4_t val_low = vmovl_s16(input_s16_low); + const int32x4_t val_high = vmovl_s16(input_s16_high); + + float32x4_t result_low, result_high; + ScaleWithNewZeroPoint(val_low, scale_dup, zero_times_scale_dup, &result_low); + ScaleWithNewZeroPoint(val_high, scale_dup, zero_times_scale_dup, &result_high); + + vst1q_f32(output_data + i, result_low); + vst1q_f32(output_data + i + 4, result_high); + } +#endif // NEON + for (; i < flat_size; ++i) + { + const int32_t val = input_data[i]; + const float result = static_cast<float>(scale * (val - zero_point)); + output_data[i] = result; + } +} + +inline void Dequantize(const Shape &input_shape, const int16_t *input_data, + const Shape &output_shape, float *output_data, const float scale, + const int32_t zero_point) +{ + const int flat_size = 
MatchingFlatSize(input_shape, output_shape); + + int i = 0; +#ifdef USE_NEON + const float32x4_t scale_dup = vdupq_n_f32(static_cast<float>(scale)); + const float32x4_t zero_times_scale_dup = vdupq_n_f32(static_cast<float>(-zero_point * scale)); + for (; i <= flat_size - 8; i += 8) + { + const int16x4_t input_s16_low = vld1_s16(input_data + i); + const int16x4_t input_s16_high = vld1_s16(input_data + i + 4); + const int32x4_t val_low = vmovl_s16(input_s16_low); + const int32x4_t val_high = vmovl_s16(input_s16_high); + + float32x4_t result_low, result_high; + ScaleWithNewZeroPoint(val_low, scale_dup, zero_times_scale_dup, &result_low); + ScaleWithNewZeroPoint(val_high, scale_dup, zero_times_scale_dup, &result_high); + + vst1q_f32(output_data + i, result_low); + vst1q_f32(output_data + i + 4, result_high); + } +#endif // NEON + for (; i < flat_size; ++i) + { + const int32_t val = input_data[i]; + const float result = static_cast<float>(scale * (val - zero_point)); + output_data[i] = result; + } +} + +} // namespace cker +} // namespace nnfw + +#endif // __NNFW_CKER_DEQUANTIZE_H__ diff --git a/compute/cker/include/cker/operation/ELU.h b/compute/cker/include/cker/operation/ELU.h new file mode 100644 index 000000000..6bdd7c62e --- /dev/null +++ b/compute/cker/include/cker/operation/ELU.h @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2018 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __NNFW_CKER_ELU_H__ +#define __NNFW_CKER_ELU_H__ + +#include "cker/Shape.h" + +#include <cmath> + +namespace nnfw +{ +namespace cker +{ + +inline void ELU(const Shape &input_shape, const float *input_data, const Shape &output_shape, + float *output_data) +{ + const int flat_size = MatchingFlatSize(input_shape, output_shape); + for (int i = 0; i < flat_size; ++i) + { + const float val = input_data[i]; + output_data[i] = val < 0.0 ? std::exp(val) - 1 : val; + } +} + +} // namespace cker +} // namespace nnfw + +#endif // __NNFW_CKER_ELU_H__ diff --git a/compute/cker/include/cker/operation/Einsum.h b/compute/cker/include/cker/operation/Einsum.h index 3d1837f47..bb9f88f8d 100644 --- a/compute/cker/include/cker/operation/Einsum.h +++ b/compute/cker/include/cker/operation/Einsum.h @@ -177,7 +177,7 @@ inline Shape copyShape(const Shape &shape) { return Shape::ExtendedShape(shape.DimensionsCount(), shape); } -} +} // namespace class Einsum { @@ -274,7 +274,7 @@ public: } for (int i = 0; i < num_inputs; ++i) { - for (int label : free_labels[i]) + for (auto &&label : free_labels[i]) { result_labels.push_back(label); result_shape_dims.push_back(label_to_dim_sizes[label]); @@ -300,7 +300,7 @@ public: { // We inflated the output. Modify result labels accordingly. 
Labels inflated_labels; - for (int label : result_labels) + for (auto &&label : result_labels) { inflated_labels.insert(inflated_labels.end(), output_label_counts[label], label); } @@ -394,8 +394,8 @@ private: for (int label = 0; label < num_labels; ++label) { bool removed = (_output_label_counts[label] == 0); - bool unique = num_inputs == 1 || _input_label_counts[0][label] == 0 || - _input_label_counts[1][label] == 0; + bool unique = + num_inputs == 1 || _input_label_counts[0][label] == 0 || _input_label_counts[1][label] == 0; _label_types[label] = getDimensionType(removed, unique); } } @@ -483,8 +483,8 @@ private: if (inputs[i].shape.DimensionsCount() + 1 < (int32_t)labels->size()) { throw std::runtime_error{"Expected input " + std::to_string(i) + " to have rank at least " + - std::to_string(labels->size() - 1) + " but got: " + - std::to_string(inputs[i].shape.DimensionsCount())}; + std::to_string(labels->size() - 1) + + " but got: " + std::to_string(inputs[i].shape.DimensionsCount())}; } int ellipsis_axis = -1; const int num_bcast_dims = inputs[i].shape.DimensionsCount() - labels->size() + 1; @@ -511,7 +511,7 @@ private: } std::vector<bool>::iterator it_input = - std::find(_input_has_ellipsis.begin(), _input_has_ellipsis.end(), true); + std::find(_input_has_ellipsis.begin(), _input_has_ellipsis.end(), true); if (it_input == _input_has_ellipsis.end() && !_output_has_ellipsis) { return; @@ -645,11 +645,11 @@ private: // Reduce along the last axis (i.e axis 1) of the rank-2 Tensor. const int32_t output_size = - reshape[kBroadcasting] * reshape[kBatch] * reshape[kFree] * reshape[kContract]; + reshape[kBroadcasting] * reshape[kBatch] * reshape[kFree] * reshape[kContract]; functor::ReduceFunctor<Eigen::ThreadPoolDevice, Reducer>::Reduce( - device, output->shaped<T, 1>({output_size}), - input_deduped.shaped<T, 2>({output_size, reshape[kReduce]}), Eigen::array<Index, 1>({1}), - Reducer()); + device, output->shaped<T, 1>({output_size}), + input_deduped.shaped<T, 2>({output_size, reshape[kReduce]}), Eigen::array<Index, 1>({1}), + Reducer()); } bool shouldSwapFreeAndContract(const Labels &labels, @@ -775,11 +775,11 @@ private: Shape inflated_shape; std::vector<int32_t> strided_shape_dims; std::vector<int32_t> inflated_shape_dims; - for (int label : labels) + for (auto &&label : labels) { const int32_t count = label_counts[label]; const int current_axis = - should_inflate ? strided_shape_dims.size() : inflated_shape_dims.size(); + should_inflate ? strided_shape_dims.size() : inflated_shape_dims.size(); const int32_t dim = input.shape.Dims(current_axis); strided_shape_dims.push_back(dim); inflated_shape_dims.insert(inflated_shape_dims.end(), count, dim); @@ -879,7 +879,7 @@ private: for (size_t i = 0; i < inputs.size(); ++i) { const int32_t free_axis = - inputs[i].shape.DimensionsCount() - (swap_free_and_contract[i] ? 1 : 2); + inputs[i].shape.DimensionsCount() - (swap_free_and_contract[i] ? 
1 : 2); output_shape.SetDim(i + old_output_shape.DimensionsCount(), inputs[i].shape.Dims(free_axis)); } bool adj_x = swap_free_and_contract[0]; diff --git a/compute/cker/include/cker/operation/Elementwise.h b/compute/cker/include/cker/operation/Elementwise.h index 598a032bb..0e980f18e 100644 --- a/compute/cker/include/cker/operation/Elementwise.h +++ b/compute/cker/include/cker/operation/Elementwise.h @@ -66,8 +66,9 @@ inline void Rsqrt(const Shape &input_shape, const float *input_data, const Shape } } -inline void Neg(const Shape &input_shape, const float *input_data, const Shape &output_shape, - float *output_data) +template <typename T> +inline void Neg(const Shape &input_shape, const T *input_data, const Shape &output_shape, + T *output_data) { const int size = MatchingFlatSize(input_shape, output_shape); for (int i = 0; i < size; i++) @@ -86,6 +87,39 @@ inline void Log(const Shape &input_shape, const float *input_data, const Shape & } } +inline void Floor(const Shape &input_shape, const float *input_data, const Shape &output_shape, + float *output_data) +{ + const int flat_size = MatchingFlatSize(input_shape, output_shape); + + for (int i = 0; i < flat_size; i++) + { + output_data[i] = std::floor(input_data[i]); + } +} + +inline void Sqrt(const Shape &input_shape, const float *input_data, const Shape &output_shape, + float *output_data) +{ + const int flat_size = MatchingFlatSize(input_shape, output_shape); + + for (int i = 0; i < flat_size; i++) + { + output_data[i] = std::sqrt(input_data[i]); + } +} + +inline void Square(const Shape &input_shape, const float *input_data, const Shape &output_shape, + float *output_data) +{ + const int flat_size = MatchingFlatSize(input_shape, output_shape); + + for (int i = 0; i < flat_size; i++) + { + output_data[i] = input_data[i] * input_data[i]; + } +} + } // namespace cker } // namespace nnfw diff --git a/compute/cker/include/cker/operation/Fill.h b/compute/cker/include/cker/operation/Fill.h index 14daf9839..f88c3a5fb 100644 --- a/compute/cker/include/cker/operation/Fill.h +++ b/compute/cker/include/cker/operation/Fill.h @@ -25,26 +25,12 @@ namespace nnfw namespace cker { template <typename T> -inline void Fill(const Shape &input_shape, int *input_data, const T value_data, - const Shape &output_shape, T output_data) +inline void Fill(const T *value_data, const Shape &output_shape, T *output_data) { - int input_size = input_shape.FlatSize(); - int output_size = 1; - for (int i = 0; i < input_size; i++) + int output_size = output_shape.FlatSize(); + for (int i = 0; i < output_size; i++) { - output_size *= input_data[i]; - } - - if (output_size == output_shape.FlatSize()) - { - for (int i = 0; i < output_size; i++) - { - output_data[i] = *value_data; - } - } - else - { - throw std::runtime_error("Cker Fill.h: output's size is not matched inferred size of output"); + output_data[i] = *value_data; } } diff --git a/compute/cker/include/cker/operation/FloorDiv.h b/compute/cker/include/cker/operation/FloorDiv.h new file mode 100644 index 000000000..cdb2c2a8b --- /dev/null +++ b/compute/cker/include/cker/operation/FloorDiv.h @@ -0,0 +1,82 @@ +/* + * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2017 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __NNFW_CKER_FLOOR_DIV_H__ +#define __NNFW_CKER_FLOOR_DIV_H__ + +#include "cker/Shape.h" +#include "cker/Utils.h" + +namespace nnfw +{ +namespace cker +{ + +template <typename T> +inline void FloorDivBroadcast(const Shape &unextended_input1_shape, const T *input1_data, + const Shape &unextended_input2_shape, const T *input2_data, + const Shape &unextended_output_shape, T *output_data) +{ + assert(unextended_input1_shape.DimensionsCount() <= 4); + assert(unextended_input2_shape.DimensionsCount() <= 4); + assert(unextended_output_shape.DimensionsCount() <= 4); + const Shape output_shape = Shape::ExtendedShape(4, unextended_output_shape); + + NdArrayDesc<4> desc1; + NdArrayDesc<4> desc2; + NdArrayDescsForElementwiseBroadcast(unextended_input1_shape, unextended_input2_shape, &desc1, + &desc2); + + for (int b = 0; b < output_shape.Dims(0); ++b) + { + for (int y = 0; y < output_shape.Dims(1); ++y) + { + for (int x = 0; x < output_shape.Dims(2); ++x) + { + for (int c = 0; c < output_shape.Dims(3); ++c) + { + auto out_idx = Offset(output_shape, b, y, x, c); + auto in1_idx = SubscriptToIndex(desc1, b, y, x, c); + auto in2_idx = SubscriptToIndex(desc2, b, y, x, c); + auto in1_val = input1_data[in1_idx]; + auto in2_val = input2_data[in2_idx]; + output_data[out_idx] = std::floor( + std::divides<double>()(static_cast<double>(in1_val), static_cast<double>(in2_val))); + } + } + } + } +} + +template <typename T> +inline void FloorDivElementwise(const Shape &shape, const T *input1_data, const T *input2_data, + T *output_data) +{ + + int num_elements = shape.FlatSize(); + + for (int t = 0; t < num_elements; t++) + { + output_data[t] = std::floor(std::divides<double>()(static_cast<double>(input1_data[t]), + static_cast<double>(input2_data[t]))); + } +} + +} // namespace cker + +} // namespace nnfw +#endif diff --git a/compute/cker/include/cker/operation/FullyConnected.h b/compute/cker/include/cker/operation/FullyConnected.h index 4280c9ae2..71a2f19ef 100644 --- a/compute/cker/include/cker/operation/FullyConnected.h +++ b/compute/cker/include/cker/operation/FullyConnected.h @@ -19,10 +19,14 @@ #define __NNFW_CKER_FULLY_CONNECTED_H__ #include <ruy/context.h> +#include "cker/operation/FullyConnectedDense16x1.h" +#include "cker/operation/FullyConnectedSparse16x1.h" +#include "cker/operation/optimized/Gemm.h" #include "cker/Shape.h" #include "cker/Types.h" #include "cker/Utils.h" #include "cker/TensorUtils.h" +#include "cker/neon/neon_check.h" namespace nnfw { @@ -55,6 +59,42 @@ public: std::vector<int32_t> accum_scratch; }; +#if defined(CKER_X86_PLATFORM) + +// From tensorflow/tensorflow/lite/kernels/internal/optimized/optimized_ops.h +inline void FullyConnected(const FullyConnectedParams ¶ms, const Shape &input_shape, + const float *input_data, const Shape &weights_shape, + const float *weights_data, const Shape &, + const float *optional_bias_data, const Shape &output_shape, + float *output_data) +{ + const int dims_count = weights_shape.DimensionsCount(); + const int input_rows = weights_shape.Dims(dims_count - 1); + MatrixParams<float> rhs_params; + rhs_params.order = 
Order::kColMajor; + rhs_params.rows = input_rows; + rhs_params.cols = input_shape.FlatSize() / input_rows; + rhs_params.cache_policy = optimized::DefaultCachePolicy(params.rhs_cacheable); + + MatrixParams<float> lhs_params; + lhs_params.order = Order::kRowMajor; + lhs_params.cols = weights_shape.Dims(dims_count - 1); + lhs_params.rows = FlatSizeSkipDim(weights_shape, dims_count - 1); + lhs_params.cache_policy = optimized::DefaultCachePolicy(params.lhs_cacheable); + MatrixParams<float> dst_params; + dst_params.order = Order::kColMajor; + dst_params.rows = output_shape.Dims(output_shape.DimensionsCount() - 1); + dst_params.cols = FlatSizeSkipDim(output_shape, output_shape.DimensionsCount() - 1); + GemmParams<float, float> gemm_params; + gemm_params.bias = optional_bias_data; + gemm_params.clamp_min = params.float_activation_min; + gemm_params.clamp_max = params.float_activation_max; + optimized::Gemm(lhs_params, weights_data, rhs_params, input_data, dst_params, output_data, + gemm_params); +} + +#else // CKER_X86_PLATFORM + inline void FullyConnected(const FullyConnectedParams ¶ms, const Shape &input_shape, const float *input_data, const Shape &weights_shape, const float *weights_data, const Shape &, const float *bias_data, @@ -86,6 +126,8 @@ inline void FullyConnected(const FullyConnectedParams ¶ms, const Shape &inpu } } +#endif // CKER_X86_PLATFORM + inline void FullyConnected(const FullyConnectedParams ¶ms, const Shape &input_shape, const uint8_t *input_data, const Shape &filter_shape, const uint8_t *filter_data, const Shape &bias_shape, @@ -114,7 +156,7 @@ inline void FullyConnected(const FullyConnectedParams ¶ms, const Shape &inpu const int filter_dim_count = filter_shape.DimensionsCount(); const int batches = FlatSizeSkipDim(output_shape, output_dim_count - 1); const int output_depth = - MatchingDim(filter_shape, filter_dim_count - 2, output_shape, output_dim_count - 1); + MatchingDim(filter_shape, filter_dim_count - 2, output_shape, output_dim_count - 1); const int accum_depth = filter_shape.Dims(filter_dim_count - 1); for (int b = 0; b < batches; ++b) { @@ -208,12 +250,13 @@ inline void FullyConnectedHybrid(const FullyConnectedParams ¶ms, const Shape return; } -inline void FullyConnectedSparseWeight(const FullyConnectedParams ¶ms, const Shape &input_shape, - const float *input_data, const Shape &weights_shape, - const float *weights_data, const Shape &bias_shape, - const float *bias_data, const Shape &output_shape, - float *output_data, int w0_size, const uint16_t *w1_segments, - const uint16_t *w1_indices) +inline void FullyConnectedSparseWeightRandom(const FullyConnectedParams ¶ms, + const Shape &input_shape, const float *input_data, + const Shape &weights_shape, const float *weights_data, + const Shape &bias_shape, const float *bias_data, + const Shape &output_shape, float *output_data, + const uint16_t *w1_segments, + const uint16_t *w1_indices) { UNUSED_RELEASE(params); UNUSED_RELEASE(input_shape); @@ -225,7 +268,7 @@ inline void FullyConnectedSparseWeight(const FullyConnectedParams ¶ms, const const int weights_dims_count = weights_shape.DimensionsCount(); const int batches = FlatSizeSkipDim(output_shape, output_dims_count - 1); const int output_depth = - MatchingDim(weights_shape, weights_dims_count - 2, output_shape, output_dims_count - 1); + MatchingDim(weights_shape, weights_dims_count - 2, output_shape, output_dims_count - 1); const int accum_depth = weights_shape.Dims(weights_dims_count - 1); UNUSED_RELEASE(bias_shape); @@ -239,13 +282,13 @@ inline void 
FullyConnectedSparseWeight(const FullyConnectedParams ¶ms, const } for (int b = 0; b < batches; ++b) { - for (int idx_0 = 0; idx_0 < w0_size; ++idx_0) + for (int idx_0 = 0; idx_0 < output_depth; ++idx_0) { for (int pw1 = w1_segments[idx_0]; pw1 < w1_segments[idx_0 + 1]; ++pw1) { int idx_1 = w1_indices[pw1]; output_data[b * output_depth + idx_0] += - weights_data[pw1] * input_data[b * accum_depth + idx_1]; + weights_data[pw1] * input_data[b * accum_depth + idx_1]; } } } diff --git a/compute/cker/include/cker/operation/FullyConnectedDense16x1.h b/compute/cker/include/cker/operation/FullyConnectedDense16x1.h new file mode 100644 index 000000000..a7e9efd7f --- /dev/null +++ b/compute/cker/include/cker/operation/FullyConnectedDense16x1.h @@ -0,0 +1,134 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2017 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* Copyright (c) 2018 Mozilla + 2008-2011 Octasic Inc. + 2012-2017 Jean-Marc Valin */ +/* + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR + CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*/ + +#ifndef __NNFW_CKER_FULLY_CONNECTED_DENSE16x1_H__ +#define __NNFW_CKER_FULLY_CONNECTED_DENSE16x1_H__ + +#include "cker/Shape.h" +#include "cker/Types.h" +#include "cker/Utils.h" +#include "cker/TensorUtils.h" + +namespace nnfw +{ +namespace cker +{ +#if defined(__aarch64__) && defined(USE_NEON) +inline void FullyConnected16x1Float32(const FullyConnectedParams ¶ms, const Shape &input_shape, + const float *input_data, const Shape &weights_shape, + const float *weights_data, const Shape &, + const float *bias_data, const Shape &, float *output_data) +{ + int total_input_size = input_shape.FlatSize(); + int input_size = weights_shape.Dims(1); + const int batch_size = total_input_size / input_size; + const int num_units = weights_shape.Dims(0); + + float *out = output_data; + const float *weights = weights_data; + int rows = num_units; + int cols = input_size; + int col_stride = input_size; + const float *x = input_data; + + // Output = bias if bias tensor exists. + if (bias_data) + { + VectorBatchVectorAssign(bias_data, num_units, batch_size, output_data); + } + else + { + ZeroVector(output_data, batch_size * num_units); + } + + // rows : out, cols : in + int i, j; + for (i = 0; i < rows; i += 16) + { + const float *w = &weights[i * col_stride]; + + /* keep y[0..15] in registers for duration of inner loop */ + float *__restrict y = &out[i]; + + float32x4_t y0_3 = vld1q_f32(&y[0]); + float32x4_t y4_7 = vld1q_f32(&y[4]); + float32x4_t y8_11 = vld1q_f32(&y[8]); + float32x4_t y12_15 = vld1q_f32(&y[12]); + + for (j = 0; j < cols; j++) + { + float32x4_t wvec0_3, wvec4_7, wvec8_11, wvec12_15; + float32x4_t xj; + + xj = vld1q_dup_f32(&x[j]); + + wvec0_3 = vld1q_f32(&w[0]); + y0_3 = vmlaq_f32(y0_3, wvec0_3, xj); + wvec4_7 = vld1q_f32(&w[4]); + y4_7 = vmlaq_f32(y4_7, wvec4_7, xj); + wvec8_11 = vld1q_f32(&w[8]); + y8_11 = vmlaq_f32(y8_11, wvec8_11, xj); + wvec12_15 = vld1q_f32(&w[12]); + y12_15 = vmlaq_f32(y12_15, wvec12_15, xj); + + w += 16; + } + + /* save y[0..15] back to memory */ + + vst1q_f32(&y[0], y0_3); + vst1q_f32(&y[4], y4_7); + vst1q_f32(&y[8], y8_11); + vst1q_f32(&y[12], y12_15); + } + if (params.activation != FusedActivationFunctionType::kNone) + { + // Apply activation function + ApplyActivationToVector(output_data, batch_size * num_units, params.activation, output_data); + } +} +#endif +} // namespace cker +} // namespace nnfw +#endif // __NNFW_CKER_FULLY_CONNECTED_DENSE16x1_H__ diff --git a/compute/cker/include/cker/operation/FullyConnectedSparse16x1.h b/compute/cker/include/cker/operation/FullyConnectedSparse16x1.h new file mode 100644 index 000000000..df397f73e --- /dev/null +++ b/compute/cker/include/cker/operation/FullyConnectedSparse16x1.h @@ -0,0 +1,159 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2017 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/* Copyright (c) 2018 Mozilla + 2008-2011 Octasic Inc. 
+ 2012-2017 Jean-Marc Valin */ +/* + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR + CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#ifndef __NNFW_CKER_FULLY_CONNECTED_SPARSE16x1_H__ +#define __NNFW_CKER_FULLY_CONNECTED_SPARSE16x1_H__ + +#include "cker/Shape.h" +#include "cker/Types.h" +#include "cker/Utils.h" +#include "cker/TensorUtils.h" + +namespace nnfw +{ +namespace cker +{ +inline void FullyConnectedSparseWeight16x1(const FullyConnectedParams ¶ms, + const Shape &input_shape, const float *input_data, + const Shape &weights_shape, const float *weights_data, + const Shape &bias_shape, const float *bias_data, + const Shape &output_shape, float *output_data, + const uint16_t *w1_segments, const uint16_t *w1_indices) +{ + UNUSED_RELEASE(input_shape); + + assert(weights_shape.DimensionsCount() == 2); + assert(output_shape.DimensionsCount() == 2); + + const int output_dims_count = output_shape.DimensionsCount(); + const int weights_dims_count = weights_shape.DimensionsCount(); + const int batches = FlatSizeSkipDim(output_shape, output_dims_count - 1); + const int output_depth = + MatchingDim(weights_shape, weights_dims_count - 2, output_shape, output_dims_count - 1); + const int accum_depth = weights_shape.Dims(weights_dims_count - 1); + + UNUSED_RELEASE(bias_shape); + if (bias_data) + { + VectorBatchVectorAssign(bias_data, output_depth, batches, output_data); + } + else + { + ZeroVector(output_data, batches * output_depth); + } + for (int b = 0; b < batches; ++b) + { + int depth_size = output_depth / 16; + for (int idx_0 = 0; idx_0 < depth_size; ++idx_0) +#ifdef USE_NEON + { + float *__restrict y; + y = &output_data[b * output_depth + idx_0 * 16]; + /* keep y[0..15] in registers for duration of inner loop */ + float32x4_t y0_3 = vld1q_f32(&y[0]); + float32x4_t y4_7 = vld1q_f32(&y[4]); + float32x4_t y8_11 = vld1q_f32(&y[8]); + float32x4_t y12_15 = vld1q_f32(&y[12]); + for (auto pw1 = w1_segments[idx_0]; pw1 < w1_segments[idx_0 + 1]; ++pw1) + { + auto idx_1 = w1_indices[pw1]; + float32x4_t xj = vld1q_dup_f32(&input_data[b * accum_depth + idx_1]); + float32x4_t wvec; + + wvec = vld1q_f32(&weights_data[0]); + y0_3 = vmlaq_f32(y0_3, wvec, xj); + wvec = vld1q_f32(&weights_data[4]); + y4_7 = vmlaq_f32(y4_7, wvec, xj); + wvec = vld1q_f32(&weights_data[8]); + y8_11 = vmlaq_f32(y8_11, wvec, xj); + wvec = vld1q_f32(&weights_data[12]); + 
y12_15 = vmlaq_f32(y12_15, wvec, xj); + + weights_data += 16; + } + /* save y[0..15] back to memory */ + vst1q_f32(&y[0], y0_3); + vst1q_f32(&y[4], y4_7); + vst1q_f32(&y[8], y8_11); + vst1q_f32(&y[12], y12_15); + } +#else + { + for (auto pw1 = w1_segments[idx_0]; pw1 < w1_segments[idx_0 + 1]; ++pw1) + { + float *__restrict y; + float xj; + auto idx_1 = w1_indices[pw1]; + xj = input_data[b * accum_depth + idx_1]; + y = &output_data[b * output_depth + idx_0 * 16]; + y[0] += weights_data[0] * xj; + y[1] += weights_data[1] * xj; + y[2] += weights_data[2] * xj; + y[3] += weights_data[3] * xj; + y[4] += weights_data[4] * xj; + y[5] += weights_data[5] * xj; + y[6] += weights_data[6] * xj; + y[7] += weights_data[7] * xj; + y[8] += weights_data[8] * xj; + y[9] += weights_data[9] * xj; + y[10] += weights_data[10] * xj; + y[11] += weights_data[11] * xj; + y[12] += weights_data[12] * xj; + y[13] += weights_data[13] * xj; + y[14] += weights_data[14] * xj; + y[15] += weights_data[15] * xj; + weights_data += 16; + } + } +#endif + } + if (params.activation != FusedActivationFunctionType::kNone) + { + // Apply activation function + ApplyActivationToVector(output_data, batches * output_depth, params.activation, output_data); + } +} +} // namespace cker +} // namespace nnfw +#endif // __NNFW_CKER_FULLY_CONNECTED_SPARSE16x1_H__ diff --git a/compute/cker/include/cker/operation/FusedBatchNorm.h b/compute/cker/include/cker/operation/FusedBatchNorm.h index d17a5796b..8a97d8421 100644 --- a/compute/cker/include/cker/operation/FusedBatchNorm.h +++ b/compute/cker/include/cker/operation/FusedBatchNorm.h @@ -105,7 +105,7 @@ public: float rest_size_inv = static_cast<float>(1.0f / static_cast<float>(rest_size)); // This adjustment is for Bessel's correction float rest_size_adjust = - static_cast<float>(rest_size) / static_cast<float>(rest_size_minus_one); + static_cast<float>(rest_size) / static_cast<float>(rest_size_minus_one); Eigen::Tensor<float, 1, Eigen::RowMajor> batch_mean(depth); Eigen::Tensor<float, 1, Eigen::RowMajor> batch_variance(depth); @@ -117,12 +117,12 @@ public: batch_variance.device(d) = x_centered.square().sum(reduce_dims) * rest_size_inv; auto scaling_factor = ((batch_variance + param.epsilon).rsqrt() * scale) - .eval() - .reshape(one_by_depth) - .broadcast(bcast_spec); + .eval() + .reshape(one_by_depth) + .broadcast(bcast_spec); auto x_scaled = x_centered * scaling_factor; auto x_shifted = - (x_scaled + offset.reshape(one_by_depth).broadcast(bcast_spec)).template cast<float>(); + (x_scaled + offset.reshape(one_by_depth).broadcast(bcast_spec)).template cast<float>(); UNUSED_RELEASE(rest_size_adjust); diff --git a/compute/cker/include/cker/operation/Helper/BCast.h b/compute/cker/include/cker/operation/Helper/BCast.h index a0abf2935..211db98ce 100644 --- a/compute/cker/include/cker/operation/Helper/BCast.h +++ b/compute/cker/include/cker/operation/Helper/BCast.h @@ -22,7 +22,7 @@ * ToDo : This file will be moved into upper folder when integrate with other * custom operations. * And It should merged with EinsumHelper's BCast. 
-**/ + **/ #include "cker/Shape.h" #include "cker/eigen/EigenSupport.h" @@ -393,7 +393,7 @@ public: BCast(const Vec &x, const Vec &y, const bool fewer_dims_optimization = true, const bool return_flattened_batch_indices = false) - : BCastList<2>({x, y}, fewer_dims_optimization, return_flattened_batch_indices) + : BCastList<2>({x, y}, fewer_dims_optimization, return_flattened_batch_indices) { } diff --git a/compute/cker/include/cker/operation/Helper/MatmulBCast.h b/compute/cker/include/cker/operation/Helper/MatmulBCast.h index b80ccc0d0..b7d639433 100644 --- a/compute/cker/include/cker/operation/Helper/MatmulBCast.h +++ b/compute/cker/include/cker/operation/Helper/MatmulBCast.h @@ -62,13 +62,13 @@ public: if (!_batch_bcast->IsValid()) return; - auto x_reshaped = _batch_bcast->x_reshape(); - auto y_reshaped = _batch_bcast->y_reshape(); + const auto &x_reshaped = _batch_bcast->x_reshape(); + const auto &y_reshaped = _batch_bcast->y_reshape(); auto output_shape = _batch_bcast->output_shape(); _x_batch_size = std::accumulate(x_reshaped.cbegin(), x_reshaped.cend(), INT32_C(1), std::multiplies<int32_t>()); - _y_batch_size = std::accumulate(x_reshaped.cbegin(), x_reshaped.cend(), INT32_C(1), + _y_batch_size = std::accumulate(y_reshaped.cbegin(), y_reshaped.cend(), INT32_C(1), std::multiplies<int32_t>()); _output_shape.ReplaceWith(output_shape.size(), output_shape.data()); _output_batch_size = _output_shape.FlatSize(); diff --git a/compute/cker/include/cker/operation/Helper/RandomDistributions.h b/compute/cker/include/cker/operation/Helper/RandomDistributions.h index baeafd7c9..f16e5019d 100644 --- a/compute/cker/include/cker/operation/Helper/RandomDistributions.h +++ b/compute/cker/include/cker/operation/Helper/RandomDistributions.h @@ -168,7 +168,7 @@ public: // Must have lo < hi UniformDistribution(int32_t lo, int32_t hi) - : lo_(lo), range_(static_cast<uint32_t>(hi) - static_cast<uint32_t>(lo)) + : lo_(lo), range_(static_cast<uint32_t>(hi) - static_cast<uint32_t>(lo)) { } @@ -207,7 +207,7 @@ public: // Must have lo < hi UniformDistribution(int64_t lo, int64_t hi) - : lo_(lo), range_(static_cast<uint64_t>(hi) - static_cast<uint64_t>(lo)) + : lo_(lo), range_(static_cast<uint64_t>(hi) - static_cast<uint64_t>(lo)) { } @@ -291,22 +291,22 @@ public: template <typename Generator> class UniformFullIntDistribution<Generator, int32_t> - : public UniformFullIntDistribution32<Generator, int32_t> + : public UniformFullIntDistribution32<Generator, int32_t> { }; template <typename Generator> class UniformFullIntDistribution<Generator, uint32_t> - : public UniformFullIntDistribution32<Generator, uint32_t> + : public UniformFullIntDistribution32<Generator, uint32_t> { }; template <typename Generator> class UniformFullIntDistribution<Generator, int64_t> - : public UniformFullIntDistribution64<Generator, int64_t> + : public UniformFullIntDistribution64<Generator, int64_t> { }; template <typename Generator> class UniformFullIntDistribution<Generator, uint64_t> - : public UniformFullIntDistribution64<Generator, uint64_t> + : public UniformFullIntDistribution64<Generator, uint64_t> { }; @@ -324,7 +324,7 @@ public: PHILOX_DEVICE_INLINE explicit SingleSampleAdapter(Generator *gen) - : generator_(gen), used_result_index_(Generator::kResultElementCount) + : generator_(gen), used_result_index_(Generator::kResultElementCount) { } @@ -615,8 +615,8 @@ class TruncatedNormalDistribution<SingleSampleGenerator, double> public: // The number of elements that will be returned. 
static constexpr int kResultElementCount = (SingleSampleGenerator::kNativeElementCount > 1) - ? SingleSampleGenerator::kNativeElementCount / 2 - : 1; + ? SingleSampleGenerator::kNativeElementCount / 2 + : 1; // Cost of generation of a single element (in cycles). static constexpr int kElementCost = 90; // Indicate that this distribution may take variable number of samples @@ -772,7 +772,7 @@ PHILOX_DEVICE_INLINE double Uint64ToDouble(uint32_t x0, uint32_t x1) } } // namespace random -} // namespace tensorflow -} +} // namespace cker +} // namespace nnfw #endif // __NNFW_CKER_HELPER_RANDOM_DISTRIBUTIONS_H__ diff --git a/compute/cker/include/cker/operation/Helper/RandomOp.h b/compute/cker/include/cker/operation/Helper/RandomOp.h index 7dc51fe94..6b7049ddf 100644 --- a/compute/cker/include/cker/operation/Helper/RandomOp.h +++ b/compute/cker/include/cker/operation/Helper/RandomOp.h @@ -47,6 +47,6 @@ template <class Distribution> struct FillPhiloxRandom<CPUDevice, Distribution> }; } // namespace functor -} // namespace tensorflow -} +} // namespace cker +} // namespace nnfw #endif // __NNFW_CKER_HELPER_RANDOM_OP_H__ diff --git a/compute/cker/include/cker/operation/Helper/RandomOpCpu.h b/compute/cker/include/cker/operation/Helper/RandomOpCpu.h index 85d267723..c99f69709 100644 --- a/compute/cker/include/cker/operation/Helper/RandomOpCpu.h +++ b/compute/cker/include/cker/operation/Helper/RandomOpCpu.h @@ -109,7 +109,7 @@ template <class Distribution> struct FillPhiloxRandomTask<Distribution, true> { const int kGroupSize = Distribution::kResultElementCount; static const int kGeneratorSkipPerOutputGroup = - kGroupSize * kReservedSamplesPerOutput / PhiloxRandom::kResultElementCount; + kGroupSize * kReservedSamplesPerOutput / PhiloxRandom::kResultElementCount; int64_t offset = 0; @@ -157,7 +157,7 @@ operator()(random::PhiloxRandom gen, typename Distribution::ResultElementType *d } // namespace functor -} // end namespace tensorflow -} +} // namespace cker +} // namespace nnfw #endif // __NNFW_CKER_HELPER_RANDOM_OP_CPU_H__ diff --git a/compute/cker/include/cker/operation/Helper/Tensor.h b/compute/cker/include/cker/operation/Helper/Tensor.h index e6ac008a5..ec29a15c3 100644 --- a/compute/cker/include/cker/operation/Helper/Tensor.h +++ b/compute/cker/include/cker/operation/Helper/Tensor.h @@ -29,58 +29,58 @@ template <typename T, int NDIMS = 1, typename IndexType = Eigen::DenseIndex> str { // Rank-<NDIMS> tensor of scalar type T. typedef Eigen::TensorMap<Eigen::Tensor<T, NDIMS, Eigen::RowMajor, IndexType>, Eigen::Aligned> - Tensor; + Tensor; typedef Eigen::TensorMap<Eigen::Tensor<const T, NDIMS, Eigen::RowMajor, IndexType>, Eigen::Aligned> - ConstTensor; + ConstTensor; // Unaligned Rank-<NDIMS> tensor of scalar type T. typedef Eigen::TensorMap<Eigen::Tensor<T, NDIMS, Eigen::RowMajor, IndexType>> UnalignedTensor; typedef Eigen::TensorMap<Eigen::Tensor<const T, NDIMS, Eigen::RowMajor, IndexType>> - UnalignedConstTensor; + UnalignedConstTensor; typedef Eigen::TensorMap<Eigen::Tensor<T, NDIMS, Eigen::RowMajor, int>, Eigen::Aligned> - Tensor32Bit; + Tensor32Bit; // Scalar tensor (implemented as a rank-0 tensor) of scalar type T. 
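// (Editorial note: these aliases are thin Eigen::TensorMap views over caller-owned buffers;
//  for example, TTypes<float, 2>::UnalignedConstMatrix m(ptr, rows, cols) wraps a raw
//  row-major buffer as a rows x cols matrix without copying, while the Aligned variants
//  additionally assume a suitably aligned pointer.)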
typedef Eigen::TensorMap<Eigen::TensorFixedSize<T, Eigen::Sizes<>, Eigen::RowMajor, IndexType>, Eigen::Aligned> - Scalar; + Scalar; typedef Eigen::TensorMap< - Eigen::TensorFixedSize<const T, Eigen::Sizes<>, Eigen::RowMajor, IndexType>, Eigen::Aligned> - ConstScalar; + Eigen::TensorFixedSize<const T, Eigen::Sizes<>, Eigen::RowMajor, IndexType>, Eigen::Aligned> + ConstScalar; // Unaligned Scalar tensor of scalar type T. typedef Eigen::TensorMap<Eigen::TensorFixedSize<T, Eigen::Sizes<>, Eigen::RowMajor, IndexType>> - UnalignedScalar; + UnalignedScalar; typedef Eigen::TensorMap< - Eigen::TensorFixedSize<const T, Eigen::Sizes<>, Eigen::RowMajor, IndexType>> - UnalignedConstScalar; + Eigen::TensorFixedSize<const T, Eigen::Sizes<>, Eigen::RowMajor, IndexType>> + UnalignedConstScalar; // Rank-1 tensor (vector) of scalar type T. typedef Eigen::TensorMap<Eigen::Tensor<T, 1, Eigen::RowMajor, IndexType>, Eigen::Aligned> Flat; typedef Eigen::TensorMap<Eigen::Tensor<const T, 1, Eigen::RowMajor, IndexType>, Eigen::Aligned> - ConstFlat; + ConstFlat; typedef Eigen::TensorMap<Eigen::Tensor<T, 1, Eigen::RowMajor, IndexType>, Eigen::Aligned> Vec; typedef Eigen::TensorMap<Eigen::Tensor<const T, 1, Eigen::RowMajor, IndexType>, Eigen::Aligned> - ConstVec; + ConstVec; // Unaligned Rank-1 tensor (vector) of scalar type T. typedef Eigen::TensorMap<Eigen::Tensor<T, 1, Eigen::RowMajor, IndexType>> UnalignedFlat; typedef Eigen::TensorMap<Eigen::Tensor<const T, 1, Eigen::RowMajor, IndexType>> - UnalignedConstFlat; + UnalignedConstFlat; typedef Eigen::TensorMap<Eigen::Tensor<T, 1, Eigen::RowMajor, IndexType>> UnalignedVec; typedef Eigen::TensorMap<Eigen::Tensor<const T, 1, Eigen::RowMajor, IndexType>> UnalignedConstVec; // Rank-2 tensor (matrix) of scalar type T. typedef Eigen::TensorMap<Eigen::Tensor<T, 2, Eigen::RowMajor, IndexType>, Eigen::Aligned> Matrix; typedef Eigen::TensorMap<Eigen::Tensor<const T, 2, Eigen::RowMajor, IndexType>, Eigen::Aligned> - ConstMatrix; + ConstMatrix; // Unaligned Rank-2 tensor (matrix) of scalar type T. 
typedef Eigen::TensorMap<Eigen::Tensor<T, 2, Eigen::RowMajor, IndexType>> UnalignedMatrix; typedef Eigen::TensorMap<Eigen::Tensor<const T, 2, Eigen::RowMajor, IndexType>> - UnalignedConstMatrix; + UnalignedConstMatrix; }; typedef typename TTypes<float, 1>::Tensor32Bit::Index Index32; diff --git a/compute/cker/include/cker/operation/InstanceNorm.h b/compute/cker/include/cker/operation/InstanceNorm.h index 6445e8a2b..8fa8b03bc 100644 --- a/compute/cker/include/cker/operation/InstanceNorm.h +++ b/compute/cker/include/cker/operation/InstanceNorm.h @@ -78,8 +78,8 @@ inline void InstanceNorm(const InstanceNormParams ¶ms, const Shape &input_sh double input_value = input_data[Offset(output_shape, batch, height, width, channel)]; double output_value = input_value * a + b; output_data[Offset(output_shape, batch, height, width, channel)] = - ActivationFunctionWithMinMax((float)output_value, output_activation_min, - output_activation_max); + ActivationFunctionWithMinMax((float)output_value, output_activation_min, + output_activation_max); } } } diff --git a/compute/cker/include/cker/operation/L2Normalize.h b/compute/cker/include/cker/operation/L2Normalize.h index a0075c3d0..c1fca91cc 100644 --- a/compute/cker/include/cker/operation/L2Normalize.h +++ b/compute/cker/include/cker/operation/L2Normalize.h @@ -77,7 +77,7 @@ void L2NormalizeQuant8(L2NormParams ¶ms, const Shape &input_shape, const uin { int32_t diff = *input_data - input_zero_point; int32_t rescaled_diff = MultiplyByQuantizedMultiplierSmallerThanOneExp( - 128 * diff, inv_l2norm_multiplier, inv_l2norm_shift); + 128 * diff, inv_l2norm_multiplier, inv_l2norm_shift); int32_t unclamped_output_val = 128 + rescaled_diff; int32_t output_val = std::min(static_cast<int32_t>(255), std::max(static_cast<int32_t>(0), unclamped_output_val)); diff --git a/compute/cker/include/cker/operation/LSTM.h b/compute/cker/include/cker/operation/LSTM.h new file mode 100644 index 000000000..a8f1f8ca3 --- /dev/null +++ b/compute/cker/include/cker/operation/LSTM.h @@ -0,0 +1,371 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2018 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __NNFW_CKER_UNIDIRECTIONALSEQUENCELSTM_H__ +#define __NNFW_CKER_UNIDIRECTIONALSEQUENCELSTM_H__ + +#include "cker/TensorUtils.h" +#include "cker/Types.h" + +namespace nnfw +{ +namespace cker +{ + +// LINT.IfChange +// Calculates a single LSTM gate. +// +// Implements the following formula: (* is matrix multiply) +// gate = activate(W_input * input + W_aux * aux_input + +// W_peephole * cell + W_recurrent * prev_output + bias) +// with layer norm: +// gate = activate(W_norm * normalize(...) + bias) // not adding bias inside +// +// Activation is sigmoid except for the "cell" gate (configurable, usually tanh) +// +// Parameters: +// Input vectors (to LSTM): | Size: | Optional? 
+// input | n_input | +// aux_input | n_aux_input | y (bidir LSTM) +// Input vectors (persistent states): +// output_state | n_output | +// cell_state | n_cell | +// 'Constant' inputs: +// input_to_gate_weights | n_cell * n_input | +// aux_input_to_gate_weights | n_cell * n_aux_input | y (bidir LSTM) +// recurrent_to_gate_weights | n_cell * n_output | +// cell_to_gate_weights | n_cell | y (peephole) +// gate_bias | n_cell | +// layer_norm_coefficients | n_cell | y (layer norm) +// Output vector: +// gate | n_cell | +// Scalar parameters: +// n_batch - batch size / number of vectors +// n_input, n_aux_input, n_output, n_cell - size of vectors. +// activation - activation to use. +// is_input_all_zeros, is_aux_input_all_zeros - if input vectors are all zero. +// use_layer_norm - if doing layer norm LSTM. +inline void CalculateLstmGateFloat(const float *input, const float *input_to_gate_weights, + const float *aux_input, const float *aux_input_to_gate_weights, + const float *output_state, + const float *recurrent_to_gate_weights, const float *cell_state, + const float *cell_to_gate_weights, + const float *layer_norm_coefficients, const float *gate_bias, + const int n_batch, const int n_input, const int n_aux_input, + const int n_output, const int n_cell, + const FusedActivationFunctionType activation, float *gate, + const bool is_input_all_zeros, const bool is_aux_input_all_zeros) +{ + const bool use_peephole = (cell_to_gate_weights != nullptr); + const bool use_layer_norm = (layer_norm_coefficients != nullptr); + + // Initialize scratch buffers with bias for regular lstm or initialize with + // zero for layer norm lstm. + if (use_layer_norm) + { + std::fill_n(gate, n_cell * n_batch, 0.0f); + } + else + { + VectorBatchVectorAssign(gate_bias, n_cell, n_batch, gate); + } + // For each batch and cell: compute input_weight * input. + // Skip if input is all zeros. + if (!is_input_all_zeros) + { + MatrixBatchVectorMultiplyAccumulate(input_to_gate_weights, n_cell, n_input, input, n_batch, + gate, /*result_stride=*/1); + } + // For each batch and cell: compute aux_input_weight * aux_input. + // Skip if auxiliary input is not available or all zeros. + if (!is_aux_input_all_zeros) + { + MatrixBatchVectorMultiplyAccumulate(aux_input_to_gate_weights, n_cell, n_aux_input, aux_input, + n_batch, gate, /*result_stride=*/1); + } + // For each batch and cell: compute recurrent_weight * output_state. + MatrixBatchVectorMultiplyAccumulate(recurrent_to_gate_weights, n_cell, n_output, output_state, + n_batch, gate, /*result_stride=*/1); + // For each batch and cell: compute cell_weight .* cell_state (peephole LSTM) + if (use_peephole) + { + VectorBatchVectorCwiseProductAccumulate(cell_to_gate_weights, n_cell, cell_state, n_batch, + gate); + } + // Do layer normalization (if layer norm LSTM) + if (use_layer_norm) + { + MeanStddevNormalization(gate, gate, n_cell, n_batch); + VectorBatchVectorCwiseProduct(layer_norm_coefficients, n_cell, gate, n_batch, gate); + VectorBatchVectorAdd(gate_bias, n_cell, n_batch, gate); + } + // Apply activation + ApplyActivationToVector(gate, n_batch * n_cell, activation, gate); +} + +// Updates the LSTM cell state, used by both float and hybrid LSTM versions. +// +// Implements the following formula: +// cell_state_new = clip(forget_gate * cell_state + input_gate * cell_gate) +// +// With CIFG LSTM, input gate is replaced by (1-forget_gate). 
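// (Editorial summary, restating the formulas documented in this header; the peephole,
//  layer-norm, and projection terms are optional and skipped when their pointers are null.)
//
//   gate  = act( W_input*x + W_aux*aux + W_recurrent*h_prev + w_peephole .* c_prev + bias )
//   c_new = clip( forget_gate .* c_prev + input_gate .* cell_gate )   // input_gate = 1 - forget_gate with CIFG
//   h_new = clip( W_proj * (output_gate .* act(c_new)) + b_proj )     // identity when no projection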
+// +// Parameters: +// - n_batch, n_cell: sizes of vectors +// - cell_state: input/output vector, size n_batch*n_cell +// - input_gate: input vector, size n_batch*n_cell. +// - forget_gate: input/scratch vector, size n_batch*n_cell, modified with CIFG +// - cell_gate: input vector, size n_batch*n_cell. +// - use_cifg: use 1-forget_gate instead of input_gate. +// - clip: if > 0, clip the resulting cell state to [-clip, +clip]. +void UpdateLstmCellFloat(int n_batch, int n_cell, float *cell_state, const float *input_gate, + float *forget_gate, const float *cell_gate, bool use_cifg, float clip) +{ + // Define variable for 4th argument to avoid warning + // Compiler warning: passing argument 4 to restrict-qualified parameter aliases with argument 2 + const float *cwise_product_rhs = cell_state; + VectorVectorCwiseProduct(forget_gate, cwise_product_rhs, n_batch * n_cell, cell_state); + + if (use_cifg) + { + // With CIFG, input_gate = 1-forget_gate. Use the forget_gate array as + // scratch, as input_gate array is not allocated in this case. (Be careful + // not to write to the scratch before reading the forget gate data.) + float *scratch = forget_gate; + Sub1Vector(forget_gate, n_batch * n_cell, scratch); + VectorVectorCwiseProductAccumulate(cell_gate, scratch, n_batch * n_cell, cell_state); + } + else + { + VectorVectorCwiseProductAccumulate(cell_gate, input_gate, n_batch * n_cell, cell_state); + } + if (clip > 0.0f) + { + CwiseClipping(cell_state, n_batch * n_cell, clip); + } +} + +// Calculates the output state tensor of an LSTM step. +// +// Implements the following formula: +// output_no_projection = output_gate .* activate(cell_state) +// (elementwise vector product) +// If no projection is used: +// output = output_state = output_no_projection +// With projection: +// output = output_state = clip(W*output_no_projection + bias) +// +// Output might not have a different 'stride' than n_batch, so we need to copy. +// +// Parameters: +// - n_batch: batches: the number of distinct vectors in each array. +// - n_cell, n_output: sizes of vectors. +// - cell_state, output_gate: input vectors, size n_batch*n_cell. +// - projection_weights, projection_weights_scale, projection_bias: +// constant inputs, describing projection matrix and bias. +// - proj_clip: if > 0, clip the output of the projection. +// - output_state: output vector, size n_batch*n_output. Must be contigous. +// - scratch: scratch area, size n_batch*n_cell. 
+void CalculateLstmOutputFloat(int n_batch, int n_cell, int n_output, const float *cell_state, + const float *output_gate, FusedActivationFunctionType activation, + const float *projection_weights, const float *projection_bias, + const float proj_clip, float *output_state, float *scratch) +{ + ApplyActivationToVector(cell_state, n_batch * n_cell, activation, scratch); + + // Define variable for 4th argument to avoid warning + // Compiler warning: passing argument 4 to restrict-qualified parameter aliases with argument 2 + const float *cwise_product_rhs = scratch; + VectorVectorCwiseProduct(output_gate, cwise_product_rhs, n_batch * n_cell, scratch); + + const bool use_projection = (projection_weights != nullptr); + const bool use_projection_bias = (projection_bias != nullptr); + + if (use_projection) + { + if (use_projection_bias) + { + VectorBatchVectorAssign(projection_bias, n_output, n_batch, output_state); + } + else + { + std::fill_n(output_state, n_batch * n_output, 0.0f); + } + MatrixBatchVectorMultiplyAccumulate(projection_weights, n_output, n_cell, scratch, n_batch, + output_state, /*result_stride=*/1); + if (proj_clip > 0.0f) + { + CwiseClipping(output_state, n_batch * n_output, proj_clip); + } + } + else + { + std::copy_n(scratch, n_batch * n_output, output_state); + } +} + +// Performs an LSTM batch inference step for input specified by input_ptr. +// The LSTM cell is specified by the pointers to its weights (*_weights_ptr) and +// biases (*_bias_ptr), and buffers (*_scratch), along with additional +// parameters: +// - params: various LSTM params including activation, clipping, etc., +// - n_batch: size of batch, +// - n_cell: number of cells (or units), +// - n_input: the input size, +// - n_aux_input: the auxiliary input size. +// - n_output: the output size. +// - output_batch_leading_dim: the leading dimension of the output buffer. +// +// Input of size 'n_batch * n_input': +// input_ptr +// Input of size 'n_batch * n_aux_input': +// aux_input_ptr - optional (can be nullptr) +// +// LSTM weights: +// Input weights of size 'n_cell * n_input': +// input_to_input_weights - optional +// input_to_forget_weights +// input_to_cell_weights +// input_to_output_weights +// Auxiliary input weights of size 'n_cell * n_aux_input': +// aux_input_to_input_weights - optional +// aux_input_to_forget_weights - optional +// aux_input_to_cell_weights - optional +// aux_input_to_output_weights - optional +// Recurrent weights of size 'n_cell * n_output': +// recurrent_to_input_weights - optional +// recurrent_to_forget_weights +// recurrent_to_cell_weights +// recurrent_to_input_weights +// Peephole weights of size 'n_cell', representing diagonal matrices. +// cell_to_input_weights - optional +// cell_to_cell_weights - optional +// cell_to_output_weights - optional +// Projection weights of size 'n_output * n_cell' +// projection_weights_ptr - optional +// Gate biases of size 'n_cell': +// input_gate_bias_ptr - optional +// forget_gate_bias_ptr +// cell_gate_bias_ptr +// output_gate_bias_ptr +// +// Layer norm coefficients of size 'n_cell', representing diagonal matrices. +// input_layer_norm_coefficients_ptr - optional +// forget_layer_norm_coefficients_ptr - optional +// cell_layer_norm_coefficients_ptr - optional +// output_layer_norm_coefficients_ptr - optional +// +// The pointers to the cell and output state and the output are updated. 
+// +// The pointers input_ptr, aux_input_ptr, and output_ptr point to data aligned +// in batch_major order, and each step processes batch_size many inputs from +// input_ptr, and updates batch_size many cell and output states. +// +// The output_batch_dim is output.shape[-1], i.e. the outermost dimension of the +// output tensor, and in most cases will be equal to n_output. It is usually not +// when we want to store the LSTM output into a slice of the output tensor, e.g. +// for bidirectional LSTMs with merge_outputs. In this case, the batched +// operations cannot be used since they assume that the batched outputs are +// contiguous, and we manually loop over the batched outputs. +// LINT.IfChange +inline void LstmStepFloat( + const float *input_ptr, const float *input_to_input_weights_ptr, + const float *input_to_forget_weights_ptr, const float *input_to_cell_weights_ptr, + const float *input_to_output_weights_ptr, const float *aux_input_ptr, + const float *aux_input_to_input_weights_ptr, const float *aux_input_to_forget_weights_ptr, + const float *aux_input_to_cell_weights_ptr, const float *aux_input_to_output_weights_ptr, + const float *recurrent_to_input_weights_ptr, const float *recurrent_to_forget_weights_ptr, + const float *recurrent_to_cell_weights_ptr, const float *recurrent_to_output_weights_ptr, + const float *cell_to_input_weights_ptr, const float *cell_to_forget_weights_ptr, + const float *cell_to_output_weights_ptr, const float *input_layer_norm_coefficients_ptr, + const float *forget_layer_norm_coefficients_ptr, const float *cell_layer_norm_coefficients_ptr, + const float *output_layer_norm_coefficients_ptr, const float *input_gate_bias_ptr, + const float *forget_gate_bias_ptr, const float *cell_gate_bias_ptr, + const float *output_gate_bias_ptr, const float *projection_weights_ptr, + const float *projection_bias_ptr, const LSTMParams *params, int n_batch, int n_cell, int n_input, + int n_aux_input, int n_output, int output_batch_leading_dim, float *output_state_ptr, + float *cell_state_ptr, float *scratch0, float *scratch1, float *scratch2, float *scratch3, + float *output_ptr) +{ + // Since we have already checked that weights are all there or none, we can + // check the existence of only one to the get the condition. + const bool use_cifg = (input_to_input_weights_ptr == nullptr); + + // Make named scratch buffers. + float *input_gate_scratch = scratch0; + float *forget_gate_scratch = scratch1; + float *cell_gate_scratch = scratch2; + float *output_gate_scratch = scratch3; + + // Check if inputs are all zeros so we can skip some computations. + const bool is_input_all_zeros = IsZeroVector(input_ptr, n_batch * n_input); + const bool is_aux_input_all_zeros = + (aux_input_ptr == nullptr || IsZeroVector(aux_input_ptr, n_batch * n_aux_input)); + if (!use_cifg) + { + // Calculate the input gate. (If not CIFG.) + CalculateLstmGateFloat(input_ptr, input_to_input_weights_ptr, aux_input_ptr, + aux_input_to_input_weights_ptr, output_state_ptr, + recurrent_to_input_weights_ptr, cell_state_ptr, + cell_to_input_weights_ptr, input_layer_norm_coefficients_ptr, + input_gate_bias_ptr, n_batch, n_input, n_aux_input, n_output, n_cell, + /*activation=kTfLiteActSigmoid*/ FusedActivationFunctionType::kSigmoid, + input_gate_scratch, is_input_all_zeros, is_aux_input_all_zeros); + } + // Calculate the forget gate. 
+ CalculateLstmGateFloat(input_ptr, input_to_forget_weights_ptr, aux_input_ptr, + aux_input_to_forget_weights_ptr, output_state_ptr, + recurrent_to_forget_weights_ptr, cell_state_ptr, + cell_to_forget_weights_ptr, forget_layer_norm_coefficients_ptr, + forget_gate_bias_ptr, n_batch, n_input, n_aux_input, n_output, n_cell, + /*activation=kTfLiteActSigmoid*/ FusedActivationFunctionType::kSigmoid, + forget_gate_scratch, is_input_all_zeros, is_aux_input_all_zeros); + // Calculate the cell update gate. + CalculateLstmGateFloat( + input_ptr, input_to_cell_weights_ptr, aux_input_ptr, aux_input_to_cell_weights_ptr, + output_state_ptr, recurrent_to_cell_weights_ptr, /*cell_state=*/nullptr, + /*cell_to_gate_weights=*/nullptr, cell_layer_norm_coefficients_ptr, cell_gate_bias_ptr, n_batch, + n_input, n_aux_input, n_output, n_cell, params->activation, cell_gate_scratch, + is_input_all_zeros, is_aux_input_all_zeros); + // Update the cell state. + UpdateLstmCellFloat(n_batch, n_cell, cell_state_ptr, input_gate_scratch, forget_gate_scratch, + cell_gate_scratch, use_cifg, params->cell_clip); + // Calculate output gate. + CalculateLstmGateFloat(input_ptr, input_to_output_weights_ptr, aux_input_ptr, + aux_input_to_output_weights_ptr, output_state_ptr, + recurrent_to_output_weights_ptr, cell_state_ptr, + cell_to_output_weights_ptr, output_layer_norm_coefficients_ptr, + output_gate_bias_ptr, n_batch, n_input, n_aux_input, n_output, n_cell, + /*activation=kTfLiteActSigmoid*/ FusedActivationFunctionType::kSigmoid, + output_gate_scratch, is_input_all_zeros, is_aux_input_all_zeros); + // Update the output state. + CalculateLstmOutputFloat(n_batch, n_cell, n_output, cell_state_ptr, output_gate_scratch, + params->activation, projection_weights_ptr, projection_bias_ptr, + params->proj_clip, output_state_ptr, scratch2); + // Copy output state to the output. Note that the output's rows may not be + // contiguous (output_batch_leading_dim != n_output). + for (int b = 0; b < n_batch; b++) + { + std::copy_n(output_state_ptr + b * n_output, n_output, + output_ptr + b * output_batch_leading_dim); + } +} + +} // namespace cker +} // namespace nnfw + +#endif // __NNFW_CKER_UNIDIRECTIONALSEQUENCELSTM_H__ diff --git a/compute/cker/include/cker/operation/LeakyReLU.h b/compute/cker/include/cker/operation/LeakyReLU.h new file mode 100644 index 000000000..e12d01bba --- /dev/null +++ b/compute/cker/include/cker/operation/LeakyReLU.h @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2018 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef __NNFW_CKER_LEKAY_RELU_H__ +#define __NNFW_CKER_LEKAY_RELU_H__ + +#include "cker/Shape.h" +#include "cker/Types.h" + +#include <cmath> + +namespace nnfw +{ +namespace cker +{ + +inline void LeakyReLU(const LeakyReluParams ¶ms, const Shape &input_shape, + const float *input_data, const Shape &output_shape, float *output_data) +{ + const int flat_size = MatchingFlatSize(input_shape, output_shape); + + for (int i = 0; i < flat_size; i++) + { + const float val = input_data[i]; + // Note that alpha might be > 1 or < 0, so we don't use std::max here. + output_data[i] = val > 0 ? val : val * params.alpha; + } +} + +} // namespace cker +} // namespace nnfw + +#endif // __NNFW_CKER_RELU_H__ diff --git a/compute/cker/include/cker/operation/LogSoftMax.h b/compute/cker/include/cker/operation/LogSoftMax.h index 326a44f0c..eb7bdd900 100644 --- a/compute/cker/include/cker/operation/LogSoftMax.h +++ b/compute/cker/include/cker/operation/LogSoftMax.h @@ -71,7 +71,7 @@ inline void LogSoftmax(const SoftmaxParams ¶ms, const Shape &input_shape, for (int c = 0; c < depth; ++c) { output_data[(i * depth + c) * inner_size + j] = - (input_data[(i * depth + c) * inner_size + j] - max) * beta - log_sum; + (input_data[(i * depth + c) * inner_size + j] - max) * beta - log_sum; } } } @@ -124,10 +124,10 @@ inline void LogSoftmax(const SoftmaxParams ¶ms, float input_scale, const Sha for (int c = 0; c < depth; ++c) { const float log_prob = - scale * input_data[(i * depth + c) * inner_size] * beta - precomputed; + scale * input_data[(i * depth + c) * inner_size] * beta - precomputed; const int32_t prob_quantized = std::rint(log_prob) + params.zero_point; output_data[(i * depth + c) * inner_size] = - static_cast<uint8_t>(std::max(std::min(clamp_max, prob_quantized), clamp_min)); + static_cast<uint8_t>(std::max(std::min(clamp_max, prob_quantized), clamp_min)); } } } diff --git a/compute/cker/include/cker/operation/LogicalAnd.h b/compute/cker/include/cker/operation/LogicalAnd.h new file mode 100644 index 000000000..e877f5f47 --- /dev/null +++ b/compute/cker/include/cker/operation/LogicalAnd.h @@ -0,0 +1,80 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2017 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
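A minimal usage sketch for the LeakyReLU kernel added above. It assumes cker::Shape has an initializer-list constructor and that LeakyReluParams (with an alpha member) is defined in cker/Types.h, as the header's includes suggest; buffer names, sizes, and values are illustrative.

#include "cker/operation/LeakyReLU.h"

void RunLeakyReluExample()
{
  nnfw::cker::LeakyReluParams params;
  params.alpha = 0.2f; // negative slope; may be > 1 or < 0

  const nnfw::cker::Shape shape{1, 2, 2, 2};
  const float input[8] = {-2.f, -1.f, 0.f, 1.f, 2.f, 3.f, -0.5f, 4.f};
  float output[8];

  nnfw::cker::LeakyReLU(params, shape, input, shape, output);
  // output: {-0.4, -0.2, 0, 1, 2, 3, -0.1, 4}
}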
+ */ + +#ifndef __NNFW_CKER_LOGICAL_AND_H__ +#define __NNFW_CKER_LOGICAL_AND_H__ + +#include "cker/Shape.h" +#include "cker/Utils.h" + +namespace nnfw +{ +namespace cker +{ + +template <typename T> +inline void LogicalAndBroadcast(const Shape &unextended_input1_shape, const T *input1_data, + const Shape &unextended_input2_shape, const T *input2_data, + const Shape &unextended_output_shape, T *output_data) +{ + assert(unextended_input1_shape.DimensionsCount() <= 4); + assert(unextended_input2_shape.DimensionsCount() <= 4); + assert(unextended_output_shape.DimensionsCount() <= 4); + const Shape output_shape = Shape::ExtendedShape(4, unextended_output_shape); + + NdArrayDesc<4> desc1; + NdArrayDesc<4> desc2; + NdArrayDescsForElementwiseBroadcast(unextended_input1_shape, unextended_input2_shape, &desc1, + &desc2); + + for (int b = 0; b < output_shape.Dims(0); ++b) + { + for (int y = 0; y < output_shape.Dims(1); ++y) + { + for (int x = 0; x < output_shape.Dims(2); ++x) + { + for (int c = 0; c < output_shape.Dims(3); ++c) + { + auto out_idx = Offset(output_shape, b, y, x, c); + auto in1_idx = SubscriptToIndex(desc1, b, y, x, c); + auto in2_idx = SubscriptToIndex(desc2, b, y, x, c); + auto in1_val = input1_data[in1_idx]; + auto in2_val = input2_data[in2_idx]; + output_data[out_idx] = in1_val && in2_val; + } + } + } + } +} + +template <typename T> +inline void LogicalAndElementwise(const Shape &shape, const T *input1_data, const T *input2_data, + T *output_data) +{ + + int num_elements = shape.FlatSize(); + + for (int t = 0; t < num_elements; t++) + { + output_data[t] = input1_data[t] && input2_data[t]; + } +} + +} // namespace cker +} // namespace nnfw + +#endif // __NNFW_CKER_LOGICAL_AND_H__ diff --git a/compute/cker/include/cker/operation/Logistic.h b/compute/cker/include/cker/operation/Logistic.h index 3d3e59e55..e9907729e 100644 --- a/compute/cker/include/cker/operation/Logistic.h +++ b/compute/cker/include/cker/operation/Logistic.h @@ -29,12 +29,39 @@ namespace nnfw namespace cker { +/** + * @brief Internal scalar_logistic_op operation struct + * + * @note Recent Eigen3 scalar_logistic_op return invalid value on ARM32 if + * input value is float type 88 (expected: 1, actual: 0) + * As a workaround, we use old version scalar_logistic_op internal struct + * TODO Remove this workaround + */ +template <typename T> struct scalar_logistic_op +{ + EIGEN_EMPTY_STRUCT_CTOR(scalar_logistic_op) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T operator()(const T &x) const + { + const T one = T(1); + return one / (one + Eigen::numext::exp(-x)); + } + + template <typename Packet> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet &x) const + { + const Packet one = Eigen::internal::pset1<Packet>(T(1)); + return pdiv(one, padd(one, pexp(pnegate(x)))); + } +}; + inline void Logistic(const Shape &input_shape, const float *input_data, const Shape &output_shape, float *output_data) { auto input_map = MapAsVector(input_data, input_shape); auto output_map = MapAsVector(output_data, output_shape); - output_map.array() = input_map.array().unaryExpr(Eigen::internal::scalar_logistic_op<float>()); + + // Use old version scalar_logistic_op + output_map.array() = input_map.array().unaryExpr(nnfw::cker::scalar_logistic_op<float>()); } } // namespace cker diff --git a/compute/cker/include/cker/operation/MatrixBandPart.h b/compute/cker/include/cker/operation/MatrixBandPart.h index 5674ff3ef..ef2868455 100644 --- a/compute/cker/include/cker/operation/MatrixBandPart.h +++ 
b/compute/cker/include/cker/operation/MatrixBandPart.h @@ -43,11 +43,11 @@ void MatrixBandPart(const T num_lower_diags, const T num_upper_diags, const Shap if (!(num_lower_diags <= row_num)) throw std::runtime_error( - "MatrixBandPart : num_lower must be negative or less or equal to number of rows"); + "MatrixBandPart : num_lower must be negative or less or equal to number of rows"); if (!(num_upper_diags <= col_num)) throw std::runtime_error( - "MatrixBandPart : num_upper must be negative or less or equal to number of columns"); + "MatrixBandPart : num_upper must be negative or less or equal to number of columns"); std::fill(output_data, output_data + output_shape.FlatSize(), 0); // output matrix init @@ -60,9 +60,10 @@ void MatrixBandPart(const T num_lower_diags, const T num_upper_diags, const Shap auto input = input_data + (batch * row_num * col_num + row * col_num); const T band_start = - num_lower_diags < 0 ? 0 : std::min(col_num, std::max(T{0}, row - num_lower_diags)); - const T band_end = num_upper_diags < 0 ? col_num : std::min(static_cast<T>(col_num), - row + num_upper_diags + 1); + num_lower_diags < 0 ? 0 : std::min(col_num, std::max(T{0}, row - num_lower_diags)); + const T band_end = num_upper_diags < 0 + ? col_num + : std::min(static_cast<T>(col_num), row + num_upper_diags + 1); for (T band_idx = band_start; band_idx < band_end; band_idx++) { diff --git a/compute/cker/include/cker/operation/MaxPool.h b/compute/cker/include/cker/operation/MaxPool.h index ea3fcaca6..5dc84d368 100644 --- a/compute/cker/include/cker/operation/MaxPool.h +++ b/compute/cker/include/cker/operation/MaxPool.h @@ -67,10 +67,10 @@ void MaxPool<float>(const PoolParams ¶ms, const Shape &input_shape, const fl int hpad = h + params.padding_values.height; int wpad = w + params.padding_values.width; int h_start = - (hpad < params.filter_height) ? 0 : (hpad - params.filter_height) / stride_height + 1; + (hpad < params.filter_height) ? 0 : (hpad - params.filter_height) / stride_height + 1; int h_end = std::min(hpad / stride_height + 1, output_height); int w_start = - (wpad < params.filter_width) ? 0 : (wpad - params.filter_width) / stride_width + 1; + (wpad < params.filter_width) ? 
0 : (wpad - params.filter_width) / stride_width + 1; int w_end = std::min(wpad / stride_width + 1, output_width); // compute elementwise sum for (int ph = h_start; ph < h_end; ++ph) @@ -79,8 +79,8 @@ void MaxPool<float>(const PoolParams ¶ms, const Shape &input_shape, const fl { int out_offset = NodeOffset(b, ph, pw, output_height, output_width); out_mat.col(out_offset) = - out_mat.col(out_offset) - .cwiseMax(in_mat.col(NodeOffset(b, h, w, input_height, input_width))); + out_mat.col(out_offset) + .cwiseMax(in_mat.col(NodeOffset(b, h, w, input_height, input_width))); } } } @@ -139,8 +139,8 @@ void MaxPool<uint8_t>(const PoolParams ¶ms, const Shape &input_shape, const const int filter_y_end = std::min(params.filter_height, input_height - in_y_origin); memset(acc, 0, tranche_depth * sizeof(acc[0])); const uint8_t *input_ptr = - input_data + depth_base + - depth * (in_x_origin + input_width * (in_y_origin + input_height * batch)); + input_data + depth_base + + depth * (in_x_origin + input_width * (in_y_origin + input_height * batch)); for (int fy = filter_y_start; fy < filter_y_end; fy++) { const uint8_t *input_row_ptr = input_ptr + depth * (fy * input_width + filter_x_start); diff --git a/compute/cker/include/cker/operation/OneHot.h b/compute/cker/include/cker/operation/OneHot.h index c0dbc6df5..ddc27b4c2 100644 --- a/compute/cker/include/cker/operation/OneHot.h +++ b/compute/cker/include/cker/operation/OneHot.h @@ -55,7 +55,7 @@ void OneHot(const int32_t depth, const T on_value, const T off_value, int32_t ax for (int k = 0; k < suffix_dim_size; ++k, ++output_data) { *output_data = - static_cast<int>(indices_data[i * suffix_dim_size + k]) == j ? on_value : off_value; + static_cast<int>(indices_data[i * suffix_dim_size + k]) == j ? on_value : off_value; } } } diff --git a/compute/cker/include/cker/operation/Quantize.h b/compute/cker/include/cker/operation/Quantize.h index 5c82d111f..7292a199a 100644 --- a/compute/cker/include/cker/operation/Quantize.h +++ b/compute/cker/include/cker/operation/Quantize.h @@ -1,5 +1,6 @@ /* * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2018 The TensorFlow Authors. All Rights Reserved.* * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
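For orientation before the Quantize.h hunks below: the added specializations implement the usual affine mapping q = clamp(round(x / scale) + zero_point, qmin, qmax), with NEON handling eight values per iteration and a scalar tail loop for the remainder. A hedged scalar sketch of the same mapping (the example values in the comment are illustrative):

#include <algorithm>
#include <cmath>
#include <cstdint>

inline int8_t QuantizeOneToInt8(float x, float scale, int32_t zero_point)
{
  const int32_t q = static_cast<int32_t>(std::round(x / scale)) + zero_point;
  return static_cast<int8_t>(std::min<int32_t>(127, std::max<int32_t>(-128, q)));
}
// Example: scale = 0.5, zero_point = 10, x = 3.2f  ->  round(6.4) + 10 = 16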
@@ -17,11 +18,14 @@ #ifndef __NNFW_CKER_QUANTIZE_H__ #define __NNFW_CKER_QUANTIZE_H__ +#include "cker/operation/Round.h" #include "cker/Shape.h" #include "cker/Types.h" #include "cker/Utils.h" -#include <stdexcept> +#include <cassert> #include <iostream> +#include <stdexcept> + namespace nnfw { namespace cker @@ -41,6 +45,409 @@ inline void Quantize(const Shape &input_shape, const InputT *input_data, const S output_data[i] = clamped; } } + +template <> +inline void Quantize(const Shape &input_shape, const float *input_data, const Shape &output_shape, + int8_t *output_data, const float scale, const int32_t zero_point) +{ + const int flat_size = MatchingFlatSize(input_shape, output_shape); + static constexpr int32_t min_val = std::numeric_limits<int8_t>::min(); + static constexpr int32_t max_val = std::numeric_limits<int8_t>::max(); + + int i = 0; +#ifdef USE_NEON + const float32x4_t reverse_scale_dup = vdupq_n_f32(1.0f / scale); + const int32x4_t zero_point_dup = vdupq_n_s32(zero_point); + const int32x4_t min_val_dup = vdupq_n_s32(min_val); + const int32x4_t max_val_dup = vdupq_n_s32(max_val); + + for (; i <= flat_size - 8; i += 8) + { + const float *src_data_ptr = input_data + i; + float32x4_t input_val_0 = vld1q_f32(src_data_ptr); + float32x4_t input_val_1 = vld1q_f32(src_data_ptr + 4); + + input_val_0 = vmulq_f32(input_val_0, reverse_scale_dup); + input_val_1 = vmulq_f32(input_val_1, reverse_scale_dup); + + int32x4_t casted_val_0 = RoundToNearest(input_val_0); + int32x4_t casted_val_1 = RoundToNearest(input_val_1); + + casted_val_0 = vaddq_s32(casted_val_0, zero_point_dup); + casted_val_1 = vaddq_s32(casted_val_1, zero_point_dup); + + // Clamp the values to fit the target type's range. + casted_val_0 = vmaxq_s32(casted_val_0, min_val_dup); + casted_val_1 = vmaxq_s32(casted_val_1, min_val_dup); + casted_val_0 = vminq_s32(casted_val_0, max_val_dup); + casted_val_1 = vminq_s32(casted_val_1, max_val_dup); + + const int16x4_t narrowed_val_0 = vmovn_s32(casted_val_0); + const int16x4_t narrowed_val_1 = vmovn_s32(casted_val_1); + const int16x8_t combined_val = vcombine_s16(narrowed_val_0, narrowed_val_1); + const int8x8_t combined_val_narrowed = vmovn_s16(combined_val); + vst1_s8(output_data + i, combined_val_narrowed); + } +#endif // NEON + + for (; i < flat_size; ++i) + { + const float val = input_data[i]; + const int32_t unclamped = static_cast<int32_t>(round(val / scale)) + zero_point; + const int32_t clamped = std::min(std::max(unclamped, min_val), max_val); + output_data[i] = clamped; + } +} + +template <> +inline void Quantize(const Shape &input_shape, const float *input_data, const Shape &output_shape, + uint8_t *output_data, const float scale, const int32_t zero_point) +{ + const int flat_size = MatchingFlatSize(input_shape, output_shape); + static constexpr int32_t min_val = std::numeric_limits<uint8_t>::min(); + static constexpr int32_t max_val = std::numeric_limits<uint8_t>::max(); + + int i = 0; +#ifdef USE_NEON + const float32x4_t reverse_scale_dup = vdupq_n_f32(1.0f / scale); + const int32x4_t zero_point_dup = vdupq_n_s32(zero_point); + const int32x4_t min_val_dup = vdupq_n_s32(min_val); + const int32x4_t max_val_dup = vdupq_n_s32(max_val); + + for (; i <= flat_size - 8; i += 8) + { + const float *src_data_ptr = input_data + i; + float32x4_t input_val_0 = vld1q_f32(src_data_ptr); + float32x4_t input_val_1 = vld1q_f32(src_data_ptr + 4); + + input_val_0 = vmulq_f32(input_val_0, reverse_scale_dup); + input_val_1 = vmulq_f32(input_val_1, reverse_scale_dup); + + int32x4_t casted_val_0 
= RoundToNearest(input_val_0); + int32x4_t casted_val_1 = RoundToNearest(input_val_1); + + casted_val_0 = vaddq_s32(casted_val_0, zero_point_dup); + casted_val_1 = vaddq_s32(casted_val_1, zero_point_dup); + + // Clamp the values to fit the target type's range. + casted_val_0 = vmaxq_s32(casted_val_0, min_val_dup); + casted_val_1 = vmaxq_s32(casted_val_1, min_val_dup); + casted_val_0 = vminq_s32(casted_val_0, max_val_dup); + casted_val_1 = vminq_s32(casted_val_1, max_val_dup); + + const uint16x4_t narrowed_val_0 = vqmovun_s32(casted_val_0); + const uint16x4_t narrowed_val_1 = vqmovun_s32(casted_val_1); + const uint16x8_t combined_val = vcombine_u16(narrowed_val_0, narrowed_val_1); + const uint8x8_t combined_val_narrowed = vmovn_u16(combined_val); + vst1_u8(output_data + i, combined_val_narrowed); + } +#endif // NEON + + for (; i < flat_size; ++i) + { + const float val = input_data[i]; + const int32_t unclamped = static_cast<int32_t>(round(val / scale)) + zero_point; + const int32_t clamped = std::min(std::max(unclamped, min_val), max_val); + output_data[i] = clamped; + } +} + +template <> +inline void Quantize(const Shape &input_shape, const float *input_data, const Shape &output_shape, + int16_t *output_data, const float scale, const int32_t zero_point) +{ + const int flat_size = MatchingFlatSize(input_shape, output_shape); + static constexpr int32_t min_val = std::numeric_limits<int16_t>::min(); + static constexpr int32_t max_val = std::numeric_limits<int16_t>::max(); + + int i = 0; +#ifdef USE_NEON + const float32x4_t reverse_scale_dup = vdupq_n_f32(1.0f / scale); + const int32x4_t zero_point_dup = vdupq_n_s32(zero_point); + const int32x4_t min_val_dup = vdupq_n_s32(min_val); + const int32x4_t max_val_dup = vdupq_n_s32(max_val); + + for (; i <= flat_size - 8; i += 8) + { + const float *src_data_ptr = input_data + i; + float32x4_t input_val_0 = vld1q_f32(src_data_ptr); + float32x4_t input_val_1 = vld1q_f32(src_data_ptr + 4); + + input_val_0 = vmulq_f32(input_val_0, reverse_scale_dup); + input_val_1 = vmulq_f32(input_val_1, reverse_scale_dup); + + int32x4_t casted_val_0 = RoundToNearest(input_val_0); + int32x4_t casted_val_1 = RoundToNearest(input_val_1); + + casted_val_0 = vaddq_s32(casted_val_0, zero_point_dup); + casted_val_1 = vaddq_s32(casted_val_1, zero_point_dup); + + // Clamp the values to fit the target type's range. + casted_val_0 = vmaxq_s32(casted_val_0, min_val_dup); + casted_val_1 = vmaxq_s32(casted_val_1, min_val_dup); + casted_val_0 = vminq_s32(casted_val_0, max_val_dup); + casted_val_1 = vminq_s32(casted_val_1, max_val_dup); + + const int16x4_t narrowed_val_0 = vmovn_s32(casted_val_0); + const int16x4_t narrowed_val_1 = vmovn_s32(casted_val_1); + vst1_s16(output_data + i, narrowed_val_0); + vst1_s16(output_data + i + 4, narrowed_val_1); + } +#endif // NEON + + for (; i < flat_size; ++i) + { + const float val = input_data[i]; + const int32_t unclamped = static_cast<int32_t>(round(val / scale)) + zero_point; + const int32_t clamped = std::min(std::max(unclamped, min_val), max_val); + output_data[i] = clamped; + } +} + +inline void Quantize(const int32_t *multiplier, const int32_t *shift, int32_t channel_size, + int32_t total_size, int32_t output_zp, int32_t output_min, int32_t output_max, + int32_t *scratch, int8_t *output) +{ + // Here we're trying to quantize the raw accumulators: + // output_channels + // data data data data data + // rows data data data data data + // data data data data data + // .... 
+ // + // In order to minimize the reload of the multipliers & shifts, once we load + // the multipliers & shifts, we load & quantize the raw accumulators for every + // row. +#ifdef USE_NEON + const int32x4_t output_offset_vec = vdupq_n_s32(output_zp); + const int32x4_t output_activation_min_vec = vdupq_n_s32(output_min); + const int32x4_t output_activation_max_vec = vdupq_n_s32(output_max); + const int32x4_t zeros = vdupq_n_s32(0); +#endif + + assert(total_size % channel_size == 0); + const int32_t rows = total_size / channel_size; + + int c = 0; + +#ifdef USE_NEON + using gemmlowp::RoundingDivideByPOT; + for (; c <= channel_size - 8; c += 8) + { + int32x4_t out_shift_1 = vld1q_s32(shift + c); + int32x4_t out_shift_2 = vld1q_s32(shift + c + 4); + int32x4_t left_shift_1 = vmaxq_s32(out_shift_1, zeros); + int32x4_t left_shift_2 = vmaxq_s32(out_shift_2, zeros); + + // Right shift will be performed as left shift with negative values. + int32x4_t right_shift_1 = vminq_s32(out_shift_1, zeros); + int32x4_t right_shift_2 = vminq_s32(out_shift_2, zeros); + + int32x4_t out_mul_1 = vld1q_s32(multiplier + c); + int32x4_t out_mul_2 = vld1q_s32(multiplier + c + 4); + for (int n = 0; n < rows; ++n) + { + int loc = n * channel_size + c; + int32x4_t acc_1 = vld1q_s32(scratch + loc); + int32x4_t acc_2 = vld1q_s32(scratch + loc + 4); + + // Saturating Rounding Doubling High Mul. + acc_1 = vshlq_s32(acc_1, left_shift_1); + acc_1 = vqrdmulhq_s32(acc_1, out_mul_1); + acc_2 = vshlq_s32(acc_2, left_shift_2); + acc_2 = vqrdmulhq_s32(acc_2, out_mul_2); + + // Rounding Dividing By POT. + acc_1 = vrshlq_s32(acc_1, right_shift_1); + acc_2 = vrshlq_s32(acc_2, right_shift_2); + + // Add the output offset. + acc_1 = vaddq_s32(acc_1, output_offset_vec); + acc_2 = vaddq_s32(acc_2, output_offset_vec); + + // Apply the activation function. + acc_1 = vmaxq_s32(acc_1, output_activation_min_vec); + acc_1 = vminq_s32(acc_1, output_activation_max_vec); + acc_2 = vmaxq_s32(acc_2, output_activation_min_vec); + acc_2 = vminq_s32(acc_2, output_activation_max_vec); + + // Saturating cast to int8 and store to destination. + const int16x4_t acc_s16_1 = vqmovn_s32(acc_1); + const int16x4_t acc_s16_2 = vqmovn_s32(acc_2); + const int16x8_t res_s16 = vcombine_s16(acc_s16_1, acc_s16_2); + const int8x8_t res_s8 = vqmovn_s16(res_s16); + vst1_s8(output + loc, res_s8); + } + } + +#endif // USE_NEON + // Handle leftover values, one by one. This is very slow. + for (; c < channel_size; c++) + { + for (int n = 0; n < rows; ++n) + { + int loc = n * channel_size + c; + int32_t acc = scratch[loc]; + acc = MultiplyByQuantizedMultiplier(acc, multiplier[c], shift[c]); + acc += output_zp; + acc = std::max(acc, output_min); + acc = std::min(acc, output_max); + output[loc] = static_cast<int8_t>(acc); + } + } +} + +template <typename input_type, typename output_type> +inline void Requantize(const input_type *input_data, int32_t size, + int32_t effective_scale_multiplier, int32_t effective_scale_shift, + int32_t input_zeropoint, int32_t output_zeropoint, output_type *output_data) +{ + assert(!"Requantize: not supported type. 
It shouldn't reach here."); + UNUSED_ALL(input_data, size, effective_scale_multiplier, effective_scale_shift, input_zeropoint, + output_zeropoint, output_data); +} + +template <> +inline void Requantize<uint8_t, int8_t>(const uint8_t *input_data, int32_t size, + int32_t effective_scale_multiplier, + int32_t effective_scale_shift, int32_t input_zeropoint, + int32_t output_zeropoint, int8_t *output_data) +{ + static constexpr int32_t kMinOutput = std::numeric_limits<int8_t>::min(); + static constexpr int32_t kMaxOutput = std::numeric_limits<int8_t>::max(); + + int i = 0; +#ifdef USE_NEON + // Constants. + const int32x4_t input_zero_point_dup = vdupq_n_s32(-input_zeropoint); + const int32x4_t output_zero_point_dup = vdupq_n_s32(output_zeropoint); + const int32x4_t min_val_dup = vdupq_n_s32(kMinOutput); + const int32x4_t max_val_dup = vdupq_n_s32(kMaxOutput); + + for (; i <= size - 16; i += 16) + { + const uint8x16_t input_vec = vld1q_u8(input_data + i); + const uint16x8_t first_half = vmovl_u8(vget_low_u8(input_vec)); + const uint16x8_t second_half = vmovl_u8(vget_high_u8(input_vec)); + int32x4x4_t input; + input.val[0] = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(first_half))); + input.val[1] = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(first_half))); + input.val[2] = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(second_half))); + input.val[3] = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(second_half))); + input.val[0] = vaddq_s32(input.val[0], input_zero_point_dup); + input.val[1] = vaddq_s32(input.val[1], input_zero_point_dup); + input.val[2] = vaddq_s32(input.val[2], input_zero_point_dup); + input.val[3] = vaddq_s32(input.val[3], input_zero_point_dup); + + int32x4x4_t result = + MultiplyByQuantizedMultiplier4Rows(input, effective_scale_multiplier, effective_scale_shift); + + result.val[0] = vaddq_s32(result.val[0], output_zero_point_dup); + result.val[1] = vaddq_s32(result.val[1], output_zero_point_dup); + result.val[2] = vaddq_s32(result.val[2], output_zero_point_dup); + result.val[3] = vaddq_s32(result.val[3], output_zero_point_dup); + result.val[0] = vmaxq_s32(vminq_s32(result.val[0], max_val_dup), min_val_dup); + result.val[1] = vmaxq_s32(vminq_s32(result.val[1], max_val_dup), min_val_dup); + result.val[2] = vmaxq_s32(vminq_s32(result.val[2], max_val_dup), min_val_dup); + result.val[3] = vmaxq_s32(vminq_s32(result.val[3], max_val_dup), min_val_dup); + + const int16x4_t narrowed_val_1 = vqmovn_s32(result.val[0]); + const int16x4_t narrowed_val_2 = vqmovn_s32(result.val[1]); + const int16x4_t narrowed_val_3 = vqmovn_s32(result.val[2]); + const int16x4_t narrowed_val_4 = vqmovn_s32(result.val[3]); + const int16x8_t output_first_half = vcombine_s16(narrowed_val_1, narrowed_val_2); + const int16x8_t output_second_half = vcombine_s16(narrowed_val_3, narrowed_val_4); + const int8x8_t narrowed_first_half = vqmovn_s16(output_first_half); + const int8x8_t narrowed_second_half = vqmovn_s16(output_second_half); + const int8x16_t narrowed_result = vcombine_s8(narrowed_first_half, narrowed_second_half); + vst1q_s8(output_data + i, narrowed_result); + } + +#endif + for (; i < size; ++i) + { + const int32_t input = input_data[i] - input_zeropoint; + const int32_t output = + MultiplyByQuantizedMultiplier(input, effective_scale_multiplier, effective_scale_shift) + + output_zeropoint; + const int32_t clamped_output = std::max(std::min(output, kMaxOutput), kMinOutput); + output_data[i] = static_cast<int8_t>(clamped_output); + } +} + +template <> +inline void Requantize<int8_t, uint8_t>(const int8_t 
*input_data, int32_t size, + int32_t effective_scale_multiplier, + int32_t effective_scale_shift, int32_t input_zeropoint, + int32_t output_zeropoint, uint8_t *output_data) +{ + static constexpr int32_t kMinOutput = std::numeric_limits<uint8_t>::min(); + static constexpr int32_t kMaxOutput = std::numeric_limits<uint8_t>::max(); + + int i = 0; +#ifdef USE_NEON + // Constants. + const int32x4_t input_zero_point_dup = vdupq_n_s32(-input_zeropoint); + const int32x4_t output_zero_point_dup = vdupq_n_s32(output_zeropoint); + const int32x4_t min_val_dup = vdupq_n_s32(kMinOutput); + const int32x4_t max_val_dup = vdupq_n_s32(kMaxOutput); + + for (; i <= size - 16; i += 16) + { + const int8x16_t input_vec = vld1q_s8(input_data + i); + const int16x8_t first_half = vmovl_s8(vget_low_s8(input_vec)); + const int16x8_t second_half = vmovl_s8(vget_high_s8(input_vec)); + int32x4x4_t input; + input.val[0] = vmovl_s16(vget_low_s16(first_half)); + input.val[1] = vmovl_s16(vget_high_s16(first_half)); + input.val[2] = vmovl_s16(vget_low_s16(second_half)); + input.val[3] = vmovl_s16(vget_high_s16(second_half)); + input.val[0] = vaddq_s32(input.val[0], input_zero_point_dup); + input.val[1] = vaddq_s32(input.val[1], input_zero_point_dup); + input.val[2] = vaddq_s32(input.val[2], input_zero_point_dup); + input.val[3] = vaddq_s32(input.val[3], input_zero_point_dup); + + int32x4x4_t result = + MultiplyByQuantizedMultiplier4Rows(input, effective_scale_multiplier, effective_scale_shift); + + result.val[0] = vaddq_s32(result.val[0], output_zero_point_dup); + result.val[1] = vaddq_s32(result.val[1], output_zero_point_dup); + result.val[2] = vaddq_s32(result.val[2], output_zero_point_dup); + result.val[3] = vaddq_s32(result.val[3], output_zero_point_dup); + result.val[0] = vmaxq_s32(vminq_s32(result.val[0], max_val_dup), min_val_dup); + result.val[1] = vmaxq_s32(vminq_s32(result.val[1], max_val_dup), min_val_dup); + result.val[2] = vmaxq_s32(vminq_s32(result.val[2], max_val_dup), min_val_dup); + result.val[3] = vmaxq_s32(vminq_s32(result.val[3], max_val_dup), min_val_dup); + + const uint32x4_t result_val_1_unsigned = vreinterpretq_u32_s32(result.val[0]); + const uint32x4_t result_val_2_unsigned = vreinterpretq_u32_s32(result.val[1]); + const uint32x4_t result_val_3_unsigned = vreinterpretq_u32_s32(result.val[2]); + const uint32x4_t result_val_4_unsigned = vreinterpretq_u32_s32(result.val[3]); + + const uint16x4_t narrowed_val_1 = vqmovn_u32(result_val_1_unsigned); + const uint16x4_t narrowed_val_2 = vqmovn_u32(result_val_2_unsigned); + const uint16x4_t narrowed_val_3 = vqmovn_u32(result_val_3_unsigned); + const uint16x4_t narrowed_val_4 = vqmovn_u32(result_val_4_unsigned); + const uint16x8_t output_first_half = vcombine_u16(narrowed_val_1, narrowed_val_2); + const uint16x8_t output_second_half = vcombine_u16(narrowed_val_3, narrowed_val_4); + const uint8x8_t narrowed_first_half = vqmovn_u16(output_first_half); + const uint8x8_t narrowed_second_half = vqmovn_u16(output_second_half); + const uint8x16_t narrowed_result = vcombine_u8(narrowed_first_half, narrowed_second_half); + vst1q_u8(output_data + i, narrowed_result); + } + +#endif + for (; i < size; ++i) + { + const int32_t input = input_data[i] - input_zeropoint; + const int32_t output = + MultiplyByQuantizedMultiplier(input, effective_scale_multiplier, effective_scale_shift) + + output_zeropoint; + const int32_t clamped_output = std::max(std::min(output, kMaxOutput), kMinOutput); + output_data[i] = static_cast<uint8_t>(clamped_output); + } +} + } // namespace cker } // 
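The Requantize specializations above work in fixed point: the caller is expected to fold input_scale / output_scale into the (effective_scale_multiplier, effective_scale_shift) pair consumed by MultiplyByQuantizedMultiplier. As a plain floating-point reference for what that mapping computes (a hedged sketch, not the fixed-point code path above):

#include <algorithm>
#include <cmath>
#include <cstdint>

inline int8_t RequantizeOne(uint8_t in, float input_scale, int32_t input_zp,
                            float output_scale, int32_t output_zp)
{
  const float real_value = (static_cast<int32_t>(in) - input_zp) * input_scale;
  const int32_t q = static_cast<int32_t>(std::round(real_value / output_scale)) + output_zp;
  return static_cast<int8_t>(std::min<int32_t>(127, std::max<int32_t>(-128, q)));
}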
namespace nnfw diff --git a/compute/cker/include/cker/operation/Range.h b/compute/cker/include/cker/operation/Range.h index 5c3a773a2..d6ccc68c8 100644 --- a/compute/cker/include/cker/operation/Range.h +++ b/compute/cker/include/cker/operation/Range.h @@ -35,8 +35,8 @@ template <typename T> inline int GetSize(T start, T limit, T delta) } int size = (std::is_integral<T>::value - ? ((std::abs(limit - start) + std::abs(delta) - 1) / std::abs(delta)) - : std::ceil(std::abs((limit - start) / delta))); + ? ((std::abs(limit - start) + std::abs(delta) - 1) / std::abs(delta)) + : std::ceil(std::abs((limit - start) / delta))); return size; } diff --git a/compute/cker/include/cker/operation/Reduce.h b/compute/cker/include/cker/operation/Reduce.h index cf9634a67..02a9eac5e 100644 --- a/compute/cker/include/cker/operation/Reduce.h +++ b/compute/cker/include/cker/operation/Reduce.h @@ -21,6 +21,7 @@ #include "cker/Shape.h" #include "cker/Types.h" #include "cker/Utils.h" +#include "cker/neon/neon_check.h" namespace nnfw { @@ -30,6 +31,89 @@ namespace cker // A generic reduce method that can be used for reduce_sum, reduce_mean, etc. // This method iterates through input data and reduce elements along the // dimensions given in axis. + +#ifdef USE_NEON +inline void OptimizedReduceSum(const float *input_data, const Shape &input_shape, + float *output_data) +{ + const auto input_dims = input_shape.DimsData(); + const auto input_num_dims = input_shape.DimensionsCount(); + + int input_size = 1; + int reduce_size = 0; + for (int idx = 0; idx < input_num_dims - 1; idx++) + { + input_size *= input_dims[idx]; + } + reduce_size = input_dims[input_num_dims - 1]; + int offset = 0; + for (int idx = 0; idx < input_size; idx++) + { + int r_idx = 0; + float tmp_data[4] = { + 0, + }; + float32x4_t tmp_data_32x4 = vld1q_f32(tmp_data); + for (; r_idx <= reduce_size - 32; r_idx += 32) + { + float32x4_t a10 = vld1q_f32(input_data + offset + r_idx); + float32x4_t a11 = vld1q_f32(input_data + offset + r_idx + 4); + float32x4_t a12 = vld1q_f32(input_data + offset + r_idx + 8); + float32x4_t a13 = vld1q_f32(input_data + offset + r_idx + 12); + float32x4_t a20 = vld1q_f32(input_data + offset + r_idx + 16); + float32x4_t a21 = vld1q_f32(input_data + offset + r_idx + 20); + float32x4_t a22 = vld1q_f32(input_data + offset + r_idx + 24); + float32x4_t a23 = vld1q_f32(input_data + offset + r_idx + 28); + + float32x4_t x0 = vaddq_f32(a10, a20); + float32x4_t x1 = vaddq_f32(a11, a21); + float32x4_t x2 = vaddq_f32(a12, a22); + float32x4_t x3 = vaddq_f32(a13, a23); + + float32x4_t y0 = vaddq_f32(x0, x1); + float32x4_t y1 = vaddq_f32(x2, x3); + float32x4_t y2 = vaddq_f32(y0, y1); + tmp_data_32x4 = vaddq_f32(tmp_data_32x4, y2); + } + for (; r_idx <= reduce_size - 16; r_idx += 16) + { + float32x4_t a10 = vld1q_f32(input_data + offset + r_idx); + float32x4_t a11 = vld1q_f32(input_data + offset + r_idx + 4); + float32x4_t a12 = vld1q_f32(input_data + offset + r_idx + 8); + float32x4_t a13 = vld1q_f32(input_data + offset + r_idx + 12); + + float32x4_t x0 = vaddq_f32(a10, a11); + float32x4_t x1 = vaddq_f32(a12, a13); + + float32x4_t y0 = vaddq_f32(x0, x1); + tmp_data_32x4 = vaddq_f32(tmp_data_32x4, y0); + } + for (; r_idx <= reduce_size - 8; r_idx += 8) + { + float32x4_t a1 = vld1q_f32(input_data + offset + r_idx); + float32x4_t a2 = vld1q_f32(input_data + offset + r_idx + 4); + float32x4_t x = vaddq_f32(a1, a2); + tmp_data_32x4 = vaddq_f32(tmp_data_32x4, x); + } + vst1q_f32(tmp_data, tmp_data_32x4); + output_data[idx] = tmp_data[0] + tmp_data[1] 
+ tmp_data[2] + tmp_data[3]; + + for (; r_idx < reduce_size; r_idx++) + { + if (r_idx == 0) + { + output_data[idx] = input_data[offset]; + } + else + { + output_data[idx] += input_data[offset + r_idx]; + } + } + offset += reduce_size; + } +} +#endif // NEON + template <typename In, typename Out> inline bool ReduceImpl(const In *input_data, const Shape &input_shape, const Shape &, const int *axis, const int num_axis, int *input_iter, @@ -39,6 +123,32 @@ inline bool ReduceImpl(const In *input_data, const Shape &input_shape, const Sha const auto input_num_dims = input_shape.DimensionsCount(); // Reset input iterator. + if (num_axis == 1 && axis[0] == input_num_dims - 1) + { + int input_size = 1; + int reduce_size = 0; + for (int idx = 0; idx < input_num_dims - 1; idx++) + { + input_size *= input_dims[idx]; + } + reduce_size = input_dims[input_num_dims - 1]; + for (int idx = 0; idx < input_size; idx++) + { + for (int r_idx = 0; r_idx < reduce_size; r_idx++) + { + if (r_idx == 0) + { + output_data[idx] = input_data[idx * reduce_size]; + } + else + { + output_data[idx] = reducer(output_data[idx], input_data[idx * reduce_size + r_idx]); + } + } + } + return true; + } + for (int idx = 0; idx < input_num_dims; ++idx) { input_iter[idx] = 0; @@ -48,7 +158,7 @@ inline bool ReduceImpl(const In *input_data, const Shape &input_shape, const Sha { size_t input_offset = ReducedOutputOffset(input_num_dims, input_dims, input_iter, 0, nullptr); size_t output_offset = - ReducedOutputOffset(input_num_dims, input_dims, input_iter, num_axis, axis); + ReducedOutputOffset(input_num_dims, input_dims, input_iter, num_axis, axis); output_data[output_offset] = reducer(output_data[output_offset], input_data[input_offset]); } while (NextIndex(input_num_dims, input_dims, input_iter)); return true; @@ -202,12 +312,12 @@ public: } // Calculate mean by dividing output_data by num of aggregated element. - U num_elements_in_axis = 1; + size_t num_elements_in_axis = 1; for (int idx = 0; idx < num_resolved_axis; ++idx) { size_t current = static_cast<size_t>(input_shape.Dims(resolved_axis_data()[idx])); // Overflow prevention. - if (current > static_cast<size_t>(std::numeric_limits<U>::max() / num_elements_in_axis)) + if (current > static_cast<size_t>(std::numeric_limits<size_t>::max() / num_elements_in_axis)) { return false; } @@ -220,21 +330,21 @@ public: if (compute_sum) { // TODO(b/116341117): Eliminate float and do this completely in 8bit. 
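// Illustrative sketch, not part of the patch: the last-axis fast path added to
// ReduceImpl above is equivalent to this plain row-wise reduction over a
// flattened [outer, reduce] view. Names and shapes here are hypothetical.
template <typename T, typename Reducer>
void ReduceLastAxisReference(const T *input, int outer, int reduce, T *output, Reducer reducer)
{
  for (int i = 0; i < outer; ++i)
  {
    T acc = input[i * reduce]; // seed with the first element, as the fast path does
    for (int r = 1; r < reduce; ++r)
    {
      acc = reducer(acc, input[i * reduce + r]);
    }
    output[i] = acc;
  }
}
// Usage: a 2x4 float input {1,2,3,4, 5,6,7,8} reduced with '+' yields {10, 26}:
//   ReduceLastAxisReference(in, 2, 4, out, [](float a, float b) { return a + b; });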
- const float bias = -input_zero_point * scale * num_elements_in_axis + 0.5f; + const float bias = -input_zero_point * scale * num_elements_in_axis; for (size_t idx = 0; idx < num_outputs; ++idx) { const U value = - static_cast<U>(std::round(temp_sum[idx] * scale + bias)) + output_zero_point; + static_cast<U>(std::round(temp_sum[idx] * scale + bias)) + output_zero_point; output_data[idx] = static_cast<T>(value); } } else { - const float bias = -input_zero_point * scale + 0.5f; + const float bias = -input_zero_point * scale; for (size_t idx = 0; idx < num_outputs; ++idx) { float float_mean = - static_cast<float>(temp_sum[idx]) / static_cast<float>(num_elements_in_axis); + static_cast<float>(temp_sum[idx]) / static_cast<float>(num_elements_in_axis); float result = std::min(std::round(float_mean * scale + bias) + output_zero_point, static_cast<float>(std::numeric_limits<T>::max())); result = std::max(result, static_cast<float>(std::numeric_limits<T>::min())); diff --git a/compute/cker/include/cker/operation/ReduceMean.h b/compute/cker/include/cker/operation/ReduceMean.h index 2e4fc6274..924e85037 100644 --- a/compute/cker/include/cker/operation/ReduceMean.h +++ b/compute/cker/include/cker/operation/ReduceMean.h @@ -72,9 +72,9 @@ inline bool ReduceMeanImpl(const In *input_data, const Shape &input_shape, const { size_t input_offset = ReducedOutputOffset(input_num_dims, input_dims, input_iter, 0, nullptr); size_t output_offset = - ReducedOutputOffset(input_num_dims, input_dims, input_iter, num_axis, axis); + ReducedOutputOffset(input_num_dims, input_dims, input_iter, num_axis, axis); output_data[output_offset] = - reducer(output_data[output_offset], input_data[input_offset], normalizer); + reducer(output_data[output_offset], input_data[input_offset], normalizer); } while (NextIndex(input_num_dims, input_dims, input_iter)); return true; } @@ -102,7 +102,7 @@ inline size_t ReduceSumQuantImpl(const In *input_data, const Shape &input_shape, { size_t input_offset = ReducedOutputOffset(input_num_dims, input_dims, input_iter, 0, nullptr); size_t output_offset = - ReducedOutputOffset(input_num_dims, input_dims, input_iter, num_axis, axis); + ReducedOutputOffset(input_num_dims, input_dims, input_iter, num_axis, axis); temp_sum[output_offset] = reducer(temp_sum[output_offset], input_data[input_offset]); } while (NextIndex(input_num_dims, input_dims, input_iter)); return normalizer; @@ -185,8 +185,8 @@ public: } size_t normalizer = - ReduceSumQuantImpl<In>(input_data, input_shape, resolved_axis_data(), num_resolved_axis, - temp_index_data(), reducer, _temp_sum.data()); + ReduceSumQuantImpl<In>(input_data, input_shape, resolved_axis_data(), num_resolved_axis, + temp_index_data(), reducer, _temp_sum.data()); if (num_outputs > 0) { float scale = input_scale / output_scale; @@ -231,6 +231,37 @@ void MeanQ8Asymm(const Shape &input_shape, const In *input_data, float input_sca sum_reducer); } +template <typename In, typename Out> +void MeanAxis1And2(const Shape &input_shape, const In *input_data, const Shape &output_shape, + Out *output_data) +{ + UNUSED_RELEASE(output_shape); + assert(input_shape.DimensionsCount() == 4); + assert(output_shape.DimensionsCount() == 4); + + const int output_batch = output_shape.Dims(0); + const int output_depth = output_shape.Dims(3); + + const int input_height = input_shape.Dims(1); + const int input_width = input_shape.Dims(2); + + for (int out_b = 0; out_b < output_batch; ++out_b) + { + for (int out_d = 0; out_d < output_depth; ++out_d) + { + float value = 0; + for (int in_h = 
0; in_h < input_height; ++in_h) + { + for (int in_w = 0; in_w < input_width; ++in_w) + { + value += input_data[Offset(input_shape, out_b, in_h, in_w, out_d)]; + } + } + output_data[Offset(output_shape, out_b, 0, 0, out_d)] = value / (input_width * input_height); + } + } +} + } // namespace cker } // namespace nnfw diff --git a/compute/cker/include/cker/operation/ResizeBilinear.h b/compute/cker/include/cker/operation/ResizeBilinear.h index 7fc1e9123..ae5af7bb3 100644 --- a/compute/cker/include/cker/operation/ResizeBilinear.h +++ b/compute/cker/include/cker/operation/ResizeBilinear.h @@ -62,7 +62,7 @@ inline void ResizeBilinearKernel2x2(int32_t x0, int32_t x1, int32_t y0, int32_t // Bottom right corner. output_data[output_offset + output_x_offset + output_y_offset] = - (output + ((x1y0 + x1y1) / 2)) / 2; + (output + ((x1y0 + x1y1) / 2)) / 2; } } @@ -192,8 +192,8 @@ inline void ResizeBilinearGenericSmallChannel(int32_t batches, int32_t input_hei &x1); int32_t input_offset[4] = { - Offset(input_shape, b, y0, x0, 0), Offset(input_shape, b, y0, x1, 0), - Offset(input_shape, b, y1, x0, 0), Offset(input_shape, b, y1, x1, 0)}; + Offset(input_shape, b, y0, x0, 0), Offset(input_shape, b, y0, x1, 0), + Offset(input_shape, b, y1, x0, 0), Offset(input_shape, b, y1, x1, 0)}; float scale[4] = {(1 - (input_y - y0)) * (1 - (input_x - x0)), (1 - (input_y - y0)) * (input_x - x0), (input_y - y0) * (1 - (input_x - x0)), (input_y - y0) * (input_x - x0)}; @@ -202,8 +202,8 @@ inline void ResizeBilinearGenericSmallChannel(int32_t batches, int32_t input_hei { const T *input_ptr = &input_data[d]; *output_ptr++ = static_cast<T>( - input_ptr[input_offset[0]] * scale[0] + input_ptr[input_offset[1]] * scale[1] + - input_ptr[input_offset[2]] * scale[2] + input_ptr[input_offset[3]] * scale[3]); + input_ptr[input_offset[0]] * scale[0] + input_ptr[input_offset[1]] * scale[1] + + input_ptr[input_offset[2]] * scale[2] + input_ptr[input_offset[3]] * scale[3]); } } } @@ -253,17 +253,102 @@ void ResizeBilinear(ResizeBilinearParams ¶ms, const Shape &input_shape, int32_t depth = MatchingDim(input_shape, 3, output_shape, 3); float height_scale = (params.align_corners && params.output_height > 1) - ? (static_cast<float>(input_height - 1) / (params.output_height - 1)) - : (static_cast<float>(input_height) / params.output_height); + ? (static_cast<float>(input_height - 1) / (params.output_height - 1)) + : (static_cast<float>(input_height) / params.output_height); float width_scale = (params.align_corners && params.output_width > 1) - ? (static_cast<float>(input_width - 1) / (params.output_width - 1)) - : (static_cast<float>(input_width) / params.output_width); + ? 
(static_cast<float>(input_width - 1) / (params.output_width - 1)) + : (static_cast<float>(input_width) / params.output_width); ResizeBilinearGenericSmallChannel<uint8_t>( - batches, input_height, input_width, depth, params.output_height, params.output_width, - height_scale, width_scale, input_shape, input_data, output_data, params.half_pixel_centers); + batches, input_height, input_width, depth, params.output_height, params.output_width, + height_scale, width_scale, input_shape, input_data, output_data, params.half_pixel_centers); } + +inline void ComputeInterpolationValues(const int32_t value, const int32_t scale_10, + const bool half_pixel_centers, int32_t input_size, + int32_t *scaled_value, int32_t *lower_bound, + int32_t *upper_bound) +{ + if (half_pixel_centers) + { + *scaled_value = value * scale_10 + scale_10 / 2 - (1 << 9); + } + else + { + *scaled_value = value * scale_10; + } + *lower_bound = std::max(*scaled_value / (1 << 10), 0); + *upper_bound = std::min(*scaled_value / (1 << 10) + 1, input_size - 1); +} + +inline void ResizeBilinear(const ResizeBilinearParams &op_params, + const Shape &unextended_input_shape, const int8_t *input_data, + const Shape &unextended_output_shape, int8_t *output_data) +{ + // If half_pixel_centers is True, align_corners must be False. + assert(!op_params.half_pixel_centers || !op_params.align_corners); + assert(unextended_input_shape.DimensionsCount() <= 4); + assert(unextended_output_shape.DimensionsCount() <= 4); + const Shape input_shape = Shape::ExtendedShape(4, unextended_input_shape); + const Shape output_shape = Shape::ExtendedShape(4, unextended_output_shape); + + const int32_t batches = MatchingDim(input_shape, 0, output_shape, 0); + const int32_t input_height = input_shape.Dims(1); + const int32_t input_width = input_shape.Dims(2); + const int32_t depth = MatchingDim(input_shape, 3, output_shape, 3); + + const int32_t output_height = op_params.output_height; + const int32_t output_width = op_params.output_width; + + int32_t height_scale_10 = ((1 << 10) * input_height + output_height / 2) / output_height; + int32_t width_scale_10 = ((1 << 10) * input_width + output_width / 2) / output_width; + if (op_params.align_corners && output_height > 1) + { + height_scale_10 = + ((1 << 10) * (input_height - 1) + (output_height - 1) / 2) / (output_height - 1); + } + if (op_params.align_corners && output_width > 1) + { + width_scale_10 = ((1 << 10) * (input_width - 1) + (output_width - 1) / 2) / (output_width - 1); + } + + for (int b = 0; b < batches; ++b) + { + for (int y = 0; y < output_height; ++y) + { + int32_t input_y, y0, y1; + ComputeInterpolationValues(y, height_scale_10, op_params.half_pixel_centers, input_height, + &input_y, &y0, &y1); + for (int x = 0; x < output_width; ++x) + { + int32_t input_x, x0, x1; + ComputeInterpolationValues(x, width_scale_10, op_params.half_pixel_centers, input_width, + &input_x, &x0, &x1); + for (int c = 0; c < depth; ++c) + { + const int64_t output_20_ll = + static_cast<int64_t>(input_data[Offset(input_shape, b, y0, x0, c)]) * + ((1 << 10) - (input_y - (1 << 10) * y0)) * ((1 << 10) - (input_x - (1 << 10) * x0)); + const int64_t output_20_lu = + static_cast<int64_t>(input_data[Offset(input_shape, b, y1, x0, c)]) * + (input_y - (1 << 10) * y0) * ((1 << 10) - (input_x - (1 << 10) * x0)); + const int64_t output_20_rl = + static_cast<int64_t>(input_data[Offset(input_shape, b, y0, x1, c)]) * + ((1 << 10) - (input_y - (1 << 10) * y0)) * (input_x - (1 << 10) * x0); + const int64_t output_20_ru = + 
static_cast<int64_t>(input_data[Offset(input_shape, b, y1, x1, c)]) * + (input_y - (1 << 10) * y0) * (input_x - (1 << 10) * x0); + const int64_t output_20 = output_20_ll + output_20_lu + output_20_rl + output_20_ru; + const int64_t round = (output_20 > 0) ? (1 << 19) : -(1 << 19); + const int8_t interpolation = static_cast<int8_t>((output_20 + round) / (1 << 20)); + output_data[Offset(output_shape, b, y, x, c)] = interpolation; + } + } + } + } +} + } // namespace cker } // namespace nnfw diff --git a/compute/cker/include/cker/operation/Round.h b/compute/cker/include/cker/operation/Round.h index a04a741cf..d67714564 100644 --- a/compute/cker/include/cker/operation/Round.h +++ b/compute/cker/include/cker/operation/Round.h @@ -19,6 +19,7 @@ #define __NNFW_CKER_ROUND_H__ #include "cker/Shape.h" +#include "cker/Utils.h" #include <cmath> @@ -41,6 +42,26 @@ inline float RoundToNearest(float value) } } +#ifdef USE_NEON + +inline int32x4_t RoundToNearest(const float32x4_t input) +{ +#if defined(__aarch64__) || defined(__SSSE3__) + // Note: vcvtnq_s32_f32 is not available in ARMv7 + return vcvtnq_s32_f32(input); +#else + static const float32x4_t zero_val_dup = vdupq_n_f32(0.0f); + static const float32x4_t point5_val_dup = vdupq_n_f32(0.5f); + static const float32x4_t minus_point5_val_dup = vdupq_n_f32(-0.5f); + + const uint32x4_t mask = vcltq_f32(input, zero_val_dup); + const float32x4_t round = vbslq_f32(mask, minus_point5_val_dup, point5_val_dup); + return vcvtq_s32_f32(vaddq_f32(input, round)); +#endif // defined(__aarch64__) || defined(__SSSE3__) +} + +#endif // NEON + inline void Round(const Shape &input_shape, const float *input_data, const Shape &output_shape, float *output_data) { diff --git a/compute/cker/include/cker/operation/Select.h b/compute/cker/include/cker/operation/Select.h index ab2de94cc..644fe0a0e 100644 --- a/compute/cker/include/cker/operation/Select.h +++ b/compute/cker/include/cker/operation/Select.h @@ -34,7 +34,7 @@ void Select(const Shape &input_condition_shape, const D *input_condition_data, const T *input_y_data, const Shape &output_shape, T *output_data) { const int64_t flatsize = - MatchingFlatSize(input_condition_shape, input_x_shape, input_y_shape, output_shape); + MatchingFlatSize(input_condition_shape, input_x_shape, input_y_shape, output_shape); for (int64_t i = 0; i < flatsize; ++i) { output_data[i] = (input_condition_data[i] != 0) ? input_x_data[i] : input_y_data[i]; @@ -101,7 +101,7 @@ void BroadcastSelect4DSlow(const Shape &input_condition_shape, const D *input_co const int x_index = SubscriptToIndex(desc_x, b, y, x, c); const int y_index = SubscriptToIndex(desc_y, b, y, x, c); output_data[Offset(extended_output_shape, b, y, x, c)] = - input_condition_data[condition_index] ? input_x_data[x_index] : input_y_data[y_index]; + input_condition_data[condition_index] ? input_x_data[x_index] : input_y_data[y_index]; } } } diff --git a/compute/cker/include/cker/operation/Slice.h b/compute/cker/include/cker/operation/Slice.h index a072cff8e..ef97fd5d8 100644 --- a/compute/cker/include/cker/operation/Slice.h +++ b/compute/cker/include/cker/operation/Slice.h @@ -43,16 +43,16 @@ inline void Slice(const SliceParams &op_params, const Shape &input_shape, : start_b + op_params.size[0]; const int start_h = begin_count < 3 ? 0 : op_params.begin[begin_count - 3]; const int stop_h = (size_count < 3 || op_params.size[size_count - 3] == -1) - ? input_shape.Dims(1) - : start_h + op_params.size[size_count - 3]; + ? 
input_shape.Dims(1) + : start_h + op_params.size[size_count - 3]; const int start_w = begin_count < 2 ? 0 : op_params.begin[begin_count - 2]; const int stop_w = (size_count < 2 || op_params.size[size_count - 2] == -1) - ? input_shape.Dims(2) - : start_w + op_params.size[size_count - 2]; + ? input_shape.Dims(2) + : start_w + op_params.size[size_count - 2]; const int start_d = begin_count < 1 ? 0 : op_params.begin[begin_count - 1]; const int stop_d = (size_count < 1 || op_params.size[size_count - 1] == -1) - ? input_shape.Dims(3) - : start_d + op_params.size[size_count - 1]; + ? input_shape.Dims(3) + : start_d + op_params.size[size_count - 1]; for (int in_b = start_b; in_b < stop_b; ++in_b) { diff --git a/compute/cker/include/cker/operation/SoftMax.h b/compute/cker/include/cker/operation/SoftMax.h index 13e50b87a..35ecde4ba 100644 --- a/compute/cker/include/cker/operation/SoftMax.h +++ b/compute/cker/include/cker/operation/SoftMax.h @@ -23,6 +23,10 @@ #include "cker/Types.h" #include "cker/eigen/Utils.h" +#if __aarch64__ && __clang__ +#define TFLITE_SOFTMAX_USE_UINT16_LUT +#endif + #include <Eigen/Core> #include <fixedpoint/fixedpoint.h> #include <cmath> @@ -32,6 +36,45 @@ namespace nnfw namespace cker { +namespace reference +{ + +// Note. This Softmax function supports all of dimensions +inline void Softmax(const SoftmaxParams ¶ms, const Shape &input_shape, const float *input_data, + const Shape &output_shape, float *output_data) +{ + const int trailing_dim = input_shape.DimensionsCount() - 1; + const int outer_size = MatchingFlatSizeSkipDim(input_shape, trailing_dim, output_shape); + const int depth = MatchingDim(input_shape, trailing_dim, output_shape, trailing_dim); + + for (int i = 0; i < outer_size; ++i) + { + // Find max element value which we'll use to ensure numerical stability + // taking advantage of the following equality: + // exp(x[i])/sum(exp(x[i])) == exp(x[i]+C)/sum(exp(x[i]+C)) + float max = std::numeric_limits<float>::lowest(); + for (int c = 0; c < depth; ++c) + { + max = std::max(max, input_data[i * depth + c]); + } + + // Compute sum. + float sum = 0.f; + for (int c = 0; c < depth; ++c) + { + sum += std::exp((input_data[i * depth + c] - max) * static_cast<float>(params.beta)); + } + + // Compute result. + for (int c = 0; c < depth; ++c) + { + output_data[i * depth + c] = + std::exp((input_data[i * depth + c] - max) * static_cast<float>(params.beta)) / sum; + } + } +} +} // namespace reference + // Performs softmax along the input of size (input_size * batch_size). inline void Softmax(const float *in, const int input_size, const int batch_size, const float beta, float *out) @@ -88,87 +131,306 @@ inline void Softmax(const SoftmaxParams ¶ms, const Shape &input_shape, const out_mat.array().rowwise() *= scale; } -inline void Softmax(const SoftmaxParams ¶ms, const Shape &input_shape, - const uint8_t *input_data, const Shape &output_shape, uint8_t *output_data) -{ - const int32_t input_beta_multiplier = params.input_multiplier; - const int32_t input_beta_left_shift = params.input_left_shift; - const int diff_min = params.diff_min; - // The representation chosen for the input to the exp() function is Q5.26. - // We need to leave extra space since values that we skip might be as large as - // -32 before multiplying by input_beta_multiplier, and therefore as large as - // -16 afterwards. Note that exp(-8) is definitely not insignificant to - // accumulation, but exp(-16) definitely is. 
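// Illustrative sketch, not part of the patch: the numerical-stability trick used
// by the new reference::Softmax above. Subtracting the per-row max before exp()
// leaves the result unchanged, since exp(x - C) / sum(exp(x - C)) ==
// exp(x) / sum(exp(x)), but keeps every exp() argument <= 0 so nothing overflows.
// Hypothetical single-row helper:
#include <algorithm>
#include <cmath>

inline void SoftmaxRowReference(const float *in, int depth, float beta, float *out)
{
  const float max = *std::max_element(in, in + depth);
  float sum = 0.f;
  for (int c = 0; c < depth; ++c)
    sum += std::exp((in[c] - max) * beta);
  for (int c = 0; c < depth; ++c)
    out[c] = std::exp((in[c] - max) * beta) / sum;
}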
- static const int kScaledDiffIntegerBits = 5; - static const int kAccumulationIntegerBits = 12; - using FixedPointScaledDiff = gemmlowp::FixedPoint<int32_t, kScaledDiffIntegerBits>; - using FixedPointAccum = gemmlowp::FixedPoint<int32_t, kAccumulationIntegerBits>; - using FixedPoint0 = gemmlowp::FixedPoint<int32_t, 0>; +template <typename T> inline int32_t QuantizeSoftmaxOutput(float prob_rescaled, int32_t zero_point) +{ + const int32_t prob_rnd = static_cast<int32_t>(std::round(prob_rescaled)); + return prob_rnd + zero_point; +} + +#if !__aarch64__ +// With ARM64, rounding is faster than add + truncation. +template <> inline int32_t QuantizeSoftmaxOutput<uint8_t>(float prob_rescaled, int32_t) +{ + return static_cast<int32_t>(prob_rescaled + 0.5f); +} +#endif + +inline void PopulateSoftmaxLookupTable(float *table, float input_scale, float beta) +{ + const float scale = -input_scale * beta; + const int32_t max_uint8 = std::numeric_limits<uint8_t>::max(); + for (int32_t val = 0; val <= max_uint8; ++val) + { + table[max_uint8 - val] = expf(scale * val); + } +} +template <typename In, typename Out> +inline void Softmax(const SoftmaxParams ¶ms, const Shape &input_shape, const In *input_data, + const Shape &output_shape, Out *output_data) +{ const int trailing_dim = input_shape.DimensionsCount() - 1; - const int outer_size = MatchingFlatSizeSkipDim(input_shape, trailing_dim, output_shape); - const int depth = MatchingDim(input_shape, trailing_dim, output_shape, trailing_dim); + const int excluding_last_dim = MatchingFlatSizeSkipDim(input_shape, trailing_dim, output_shape); + const int last_dim = MatchingDim(input_shape, trailing_dim, output_shape, trailing_dim); - for (int i = 0; i < outer_size; ++i) + const int32_t clamp_max = std::numeric_limits<Out>::max(); + const int32_t clamp_min = std::numeric_limits<Out>::min(); + for (int i = 0; i < excluding_last_dim; ++i) { - uint8_t max_in_row = 0; - for (int c = 0; c < depth; ++c) + int32_t max_val = std::numeric_limits<In>::min(); + // Find max quantized value. + for (int j = 0; j < last_dim; ++j) { - max_in_row = std::max(max_in_row, input_data[i * depth + c]); + max_val = std::max(max_val, static_cast<int32_t>(input_data[j])); } - FixedPointAccum sum_of_exps = FixedPointAccum::Zero(); - for (int c = 0; c < depth; ++c) + float sum_exp = 0.0f; + const int32_t max_uint8 = std::numeric_limits<uint8_t>::max(); + const float *table_offset = ¶ms.table[max_uint8 - max_val]; + // Calculate normalizer sum(exp(x)). + for (int j = 0; j < last_dim; ++j) { - int32_t input_diff = static_cast<int32_t>(input_data[i * depth + c]) - max_in_row; - if (input_diff >= diff_min) - { - const int32_t input_diff_rescaled = MultiplyByQuantizedMultiplierGreaterThanOne( - input_diff, input_beta_multiplier, input_beta_left_shift); - const FixedPointScaledDiff scaled_diff_f8 = - FixedPointScaledDiff::FromRaw(input_diff_rescaled); - sum_of_exps = sum_of_exps + gemmlowp::Rescale<kAccumulationIntegerBits>( - exp_on_negative_values(scaled_diff_f8)); - } + sum_exp += table_offset[input_data[j]]; } - int32_t fixed_sum_of_exps = sum_of_exps.raw(); - int headroom_plus_one = CountLeadingZeros(static_cast<uint32_t>(fixed_sum_of_exps)); - // This is the number of bits to the left of the binary point above 1.0. - // Consider fixed_sum_of_exps=1.25. In that case shifted_scale=0.8 and - // no later adjustment will be needed. 
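// Illustrative sketch, not part of the patch: how the 256-entry table filled by
// PopulateSoftmaxLookupTable above is indexed by the templated Softmax. Since
// table[255 - v] == expf(-input_scale * beta * v), reading the table at
// (255 - max_q) + q yields expf(input_scale * beta * (q - max_q)), i.e. exp() of
// the already max-subtracted, dequantized difference. Hypothetical helper:
#include <cstdint>
#include <limits>

inline float LookupSoftmaxExp(const float *table, uint8_t q, int32_t max_q)
{
  const int32_t max_uint8 = std::numeric_limits<uint8_t>::max();
  return table[(max_uint8 - max_q) + q];
}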
- int num_bits_over_unit = kAccumulationIntegerBits - headroom_plus_one; - int32_t shifted_sum_minus_one = - static_cast<int32_t>((static_cast<uint32_t>(fixed_sum_of_exps) << headroom_plus_one) - - (static_cast<uint32_t>(1) << 31)); + const float inv_sum_exp = 1.0f / (sum_exp * params.scale); + // Normalize and quantize probabilities. + for (int j = 0; j < last_dim; ++j) + { + const float prob_rescaled = table_offset[input_data[j]] * inv_sum_exp; + const int32_t prob_quantized = QuantizeSoftmaxOutput<Out>(prob_rescaled, params.zero_point); + output_data[j] = static_cast<Out>(std::max(std::min(clamp_max, prob_quantized), clamp_min)); + } + input_data += last_dim; + output_data += last_dim; + } +} - FixedPoint0 shifted_scale = - one_over_one_plus_x_for_x_in_0_1(FixedPoint0::FromRaw(shifted_sum_minus_one)); +#ifdef TFLITE_SOFTMAX_USE_UINT16_LUT +// Looks up each element of <indices> in <table>, returns them in a vector. +inline uint8x16_t aarch64_lookup_vector(const uint8x16x4_t table[4], uint8x16_t indices) +{ + // Look up in 1st quarter of the table: top 2 bits of indices == 00 + uint8x16_t output1 = vqtbl4q_u8(table[0], indices); + // Look up in 2nd quarter of the table: top 2 bits of indices == 01 + uint8x16_t output2 = vqtbl4q_u8(table[1], veorq_u8(indices, vdupq_n_u8(0x40))); + // Look up in 3rd quarter of the table: top 2 bits of indices == 10 + uint8x16_t output3 = vqtbl4q_u8(table[2], veorq_u8(indices, vdupq_n_u8(0x80))); + // Look up in 4th quarter of the table: top 2 bits of indices == 11 + uint8x16_t output4 = vqtbl4q_u8(table[3], veorq_u8(indices, vdupq_n_u8(0xc0))); - for (int c = 0; c < depth; ++c) + // Combine result of the 4 lookups. + return vorrq_u8(vorrq_u8(output1, output2), vorrq_u8(output3, output4)); +} + +inline void PopulateSoftmaxUInt8LookupTable(uint8_t *uint8_table1, uint8_t *uint8_table2, + float input_scale, float beta) +{ + const float scale = input_scale * beta; + const int32_t max_uint8 = std::numeric_limits<uint8_t>::max(); + const int32_t max_uint16 = std::numeric_limits<uint16_t>::max(); + + for (int32_t val = 0; val <= max_uint8; ++val) + { + float input_to_exp = scale * (val - max_uint8); + int32_t temp = static_cast<int>(expf(input_to_exp) * max_uint16 + 0.5); + temp = std::min(max_uint16, temp); + uint8_t part1 = temp >> 8; + uint8_t part2 = temp & 0xff; + uint8_table1[val] = static_cast<uint8_t>(part1); + uint8_table2[val] = static_cast<uint8_t>(part2); + } +} + +inline int FindMaxValue(int size, const uint8_t *input_data, uint8_t offset) +{ + int32_t max_val = std::numeric_limits<uint8_t>::min(); + int j = 0; + + uint8x16_t max_val_dup = vdupq_n_u8(max_val); + uint8x16_t offset_dup = vdupq_n_u8(offset); + for (; j <= size - 16; j += 16) + { + uint8x16_t input_value = vld1q_u8(input_data + j); + input_value = veorq_u8(input_value, offset_dup); + max_val_dup = vmaxq_u8(input_value, max_val_dup); + } + max_val = std::max(max_val, static_cast<int32_t>(vmaxvq_u8(max_val_dup))); + + for (; j < size; ++j) + { + max_val = std::max(max_val, static_cast<int32_t>(input_data[j] ^ offset)); + } + return max_val; +} + +#ifdef USE_NEON +// Value_to_store layout: +// [high_high, high_low, low_high, low_low]. 
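// Illustrative sketch, not part of the patch: scalar equivalent of
// aarch64_lookup_vector above. The four uint8x16x4_t quarters together hold one
// 256-byte table, and every output lane is simply table[index]; the XOR with
// 0x40/0x80/0xc0 only re-bases indices into the quarter being probed (indices
// outside that quarter read as zero and are OR-combined away). Hypothetical helper:
#include <cstdint>

inline void LookupBytesReference(const uint8_t table[256], const uint8_t *indices, int n,
                                 uint8_t *out)
{
  for (int i = 0; i < n; ++i)
    out[i] = table[indices[i]];
}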
+inline void StoreValue(int32x4x4_t value_to_store, int8_t *output) +{ + const int16x8_t result_1 = + vcombine_s16(vqmovn_s32(value_to_store.val[1]), vqmovn_s32(value_to_store.val[0])); + const int16x8_t result_2 = + vcombine_s16(vqmovn_s32(value_to_store.val[3]), vqmovn_s32(value_to_store.val[2])); + const int8x16_t result = vcombine_s8(vqmovn_s16(result_2), vqmovn_s16(result_1)); + vst1q_s8(output, result); +} + +// Value_to_store layout: +// [high_high, high_low, low_high, low_low]. +inline void StoreValue(int32x4x4_t value_to_store, uint8_t *output) +{ + const uint16x8_t result_1 = + vcombine_u16(vqmovn_u32(vreinterpretq_u32_s32(value_to_store.val[1])), + vqmovn_u32(vreinterpretq_u32_s32(value_to_store.val[0]))); + const uint16x8_t result_2 = + vcombine_u16(vqmovn_u32(vreinterpretq_u32_s32(value_to_store.val[3])), + vqmovn_u32(vreinterpretq_u32_s32(value_to_store.val[2]))); + const uint8x16_t result = vcombine_u8(vqmovn_u16(result_2), vqmovn_u16(result_1)); + vst1q_u8(output, result); +} + +#endif + +template <typename In, typename Out> +inline void SoftmaxInt8LUT(const SoftmaxParams ¶ms, const Shape &input_shape, + const In *input_data, const Shape &output_shape, Out *output_data) +{ + const int trailing_dim = input_shape.DimensionsCount() - 1; + const int excluding_last_dim = MatchingFlatSizeSkipDim(input_shape, trailing_dim, output_shape); + const int last_dim = MatchingDim(input_shape, trailing_dim, output_shape, trailing_dim); + + const int32_t clamp_max = std::numeric_limits<Out>::max(); + const int32_t clamp_min = std::numeric_limits<Out>::min(); + + // Offset is used to interpret the input data "correctly". + // If the input is uint8, the data will be unchanged. + // If the input is int8, since it will be reinterpret as uint8. + // e.g., + // int8 127 will be applied "offset" to become 255 in uint8. + uint8_t offset = 0; + if (std::is_same<In, int8_t>::value) + { + offset = 0x80; + } + + const uint8_t *input_data_uint = reinterpret_cast<const uint8_t *>(input_data); + + // This code uses ARM64-only instructions. + // TODO(b/143709993): Port to ARMv7 + + // Load the tables into registers. (4*4 128-bit registers) + uint8x16x4_t table1[4]; + table1[0] = vld1q_u8_x4(params.uint8_table1 + 16 * 4 * 0); + table1[1] = vld1q_u8_x4(params.uint8_table1 + 16 * 4 * 1); + table1[2] = vld1q_u8_x4(params.uint8_table1 + 16 * 4 * 2); + table1[3] = vld1q_u8_x4(params.uint8_table1 + 16 * 4 * 3); + + uint8x16x4_t table2[4]; + table2[0] = vld1q_u8_x4(params.uint8_table2 + 16 * 4 * 0); + table2[1] = vld1q_u8_x4(params.uint8_table2 + 16 * 4 * 1); + table2[2] = vld1q_u8_x4(params.uint8_table2 + 16 * 4 * 2); + table2[3] = vld1q_u8_x4(params.uint8_table2 + 16 * 4 * 3); + + for (int i = 0; i < excluding_last_dim; ++i) + { + // Find max quantized value. + int32_t max_val = FindMaxValue(last_dim, input_data_uint, offset); + + int32_t sum_exp = 0; + const int32_t max_uint8 = std::numeric_limits<uint8_t>::max(); + const uint8_t table_offset = max_uint8 - max_val; + + // Calculate normalizer sum(exp(x)). 
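// Illustrative sketch, not part of the patch: the two byte tables filled by
// PopulateSoftmaxUInt8LookupTable above encode one 16-bit value per input byte,
// (uint8_table1[i] << 8) | uint8_table2[i] ~= expf(scale * (i - 255)) * 65535,
// which the scalar tail loops below reassemble like this. Hypothetical helper:
#include <cstdint>

inline int32_t LookupExp16(const uint8_t *table1, const uint8_t *table2, uint8_t index)
{
  const int32_t part1 = table1[index]; // high byte of the 16-bit exp value
  const int32_t part2 = table2[index]; // low byte
  return (part1 << 8) + part2;
}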
+ int sum_j = 0; + uint8x16_t table_offset_dup = vdupq_n_u8(table_offset); + uint8x16_t offset_dup = vdupq_n_u8(offset); + uint32x4_t sum_4 = vdupq_n_u32(0); + const int multiplier_shift = 8; + for (; sum_j <= last_dim - 16; sum_j += 16) + { + uint8x16_t input_value = vld1q_u8(input_data_uint + sum_j); + input_value = veorq_u8(input_value, offset_dup); + input_value = vaddq_u8(input_value, table_offset_dup); + + const uint8x16_t output1 = aarch64_lookup_vector(table1, input_value); + const uint8x16_t output2 = aarch64_lookup_vector(table2, input_value); + + uint16x8_t exp_value1 = vshll_n_u8(vget_high_u8(output1), multiplier_shift); + uint16x8_t exp_value2 = vshll_n_u8(vget_low_u8(output1), multiplier_shift); + + exp_value1 = vaddw_u8(exp_value1, vget_high_u8(output2)); + exp_value2 = vaddw_u8(exp_value2, vget_low_u8(output2)); + + sum_4 = vpadalq_u16(sum_4, exp_value1); + sum_4 = vpadalq_u16(sum_4, exp_value2); + } + int temp = vgetq_lane_u32(sum_4, 0) + vgetq_lane_u32(sum_4, 1) + vgetq_lane_u32(sum_4, 2) + + vgetq_lane_u32(sum_4, 3); + sum_exp += temp; + + for (; sum_j < last_dim; ++sum_j) { - int32_t input_diff = static_cast<int32_t>(input_data[i * depth + c]) - max_in_row; - if (input_diff >= diff_min) - { - const int32_t input_diff_rescaled = MultiplyByQuantizedMultiplierGreaterThanOne( - input_diff, input_beta_multiplier, input_beta_left_shift); - const FixedPointScaledDiff scaled_diff_f8 = - FixedPointScaledDiff::FromRaw(input_diff_rescaled); - - FixedPoint0 exp_in_0 = exp_on_negative_values(scaled_diff_f8); - int32_t unsat_output = gemmlowp::RoundingDivideByPOT((shifted_scale * exp_in_0).raw(), - num_bits_over_unit + 31 - 8); - - output_data[i * depth + c] = static_cast<uint8_t>( - std::max(std::min(unsat_output, static_cast<int32_t>(255)), static_cast<int32_t>(0))); - } - else - { - output_data[i * depth + c] = 0; - } + const uint8_t index = (input_data_uint[sum_j] ^ offset) + table_offset; + + uint8_t part1 = params.uint8_table1[index]; + uint8_t part2 = params.uint8_table2[index]; + sum_exp += ((part1 << 8) + part2); + } + + const float inv_sum_exp = 1.0f / (sum_exp * params.scale); + + int32_t multiplier, shift; + QuantizeMultiplier(inv_sum_exp, &multiplier, &shift); + + // Normalize and quantize probabilities. 
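// Illustrative note, not part of the patch: QuantizeMultiplier splits the float
// inv_sum_exp into a Q0.31 significand plus a power-of-two shift, and
// MultiplyByQuantizedMultiplier(x, multiplier, shift) then approximates
// x * inv_sum_exp in pure integer arithmetic. A hypothetical float reference for
// sanity-checking the fixed-point path:
#include <cmath>
#include <cstdint>

inline int32_t ScaleByFloatReference(int32_t exp_value, float inv_sum_exp)
{
  return static_cast<int32_t>(std::lround(static_cast<double>(exp_value) * inv_sum_exp));
}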
+ int j = 0; + const int32x4_t output_zp_dup = vdupq_n_s32(params.zero_point); + const int32x4_t max_val_dup = vdupq_n_s32(clamp_max); + const int32x4_t min_val_dup = vdupq_n_s32(clamp_min); + + for (; j <= last_dim - 16; j += 16) + { + uint8x16_t input_value = vld1q_u8(input_data_uint + j); + input_value = veorq_u8(input_value, offset_dup); + input_value = vaddq_u8(input_value, table_offset_dup); + + const uint8x16_t output1 = aarch64_lookup_vector(table1, input_value); + const uint8x16_t output2 = aarch64_lookup_vector(table2, input_value); + + uint16x8_t exp_value1 = vshll_n_u8(vget_high_u8(output1), multiplier_shift); + uint16x8_t exp_value2 = vshll_n_u8(vget_low_u8(output1), multiplier_shift); + + exp_value1 = vaddw_u8(exp_value1, vget_high_u8(output2)); + exp_value2 = vaddw_u8(exp_value2, vget_low_u8(output2)); + + int32x4x4_t output_value; + output_value.val[0] = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(exp_value1))); + output_value.val[1] = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(exp_value1))); + output_value.val[2] = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(exp_value2))); + output_value.val[3] = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(exp_value2))); + + int32x4x4_t temp_val = MultiplyByQuantizedMultiplier4Rows(output_value, multiplier, shift); + + temp_val.val[0] = vaddq_s32(temp_val.val[0], output_zp_dup); + temp_val.val[1] = vaddq_s32(temp_val.val[1], output_zp_dup); + temp_val.val[2] = vaddq_s32(temp_val.val[2], output_zp_dup); + temp_val.val[3] = vaddq_s32(temp_val.val[3], output_zp_dup); + + temp_val.val[0] = vmaxq_s32(vminq_s32(temp_val.val[0], max_val_dup), min_val_dup); + temp_val.val[1] = vmaxq_s32(vminq_s32(temp_val.val[1], max_val_dup), min_val_dup); + temp_val.val[2] = vmaxq_s32(vminq_s32(temp_val.val[2], max_val_dup), min_val_dup); + temp_val.val[3] = vmaxq_s32(vminq_s32(temp_val.val[3], max_val_dup), min_val_dup); + + StoreValue(temp_val, output_data + j); + } + for (; j < last_dim; ++j) + { + const uint8_t index = (input_data_uint[j] ^ offset) + table_offset; + const uint8_t part1 = params.uint8_table1[index]; + const uint8_t part2 = params.uint8_table2[index]; + const int32_t exp_value = (part1 << 8) + part2; + const int32_t output_value = MultiplyByQuantizedMultiplier(exp_value, multiplier, shift); + + output_data[j] = static_cast<Out>( + std::max(std::min(clamp_max, output_value + params.zero_point), clamp_min)); } + input_data_uint += last_dim; + output_data += last_dim; } } +#endif } // namespace cker } // namespace nnfw diff --git a/compute/cker/include/cker/operation/SpaceToBatchND.h b/compute/cker/include/cker/operation/SpaceToBatchND.h index feeb358c9..aff36e2f3 100644 --- a/compute/cker/include/cker/operation/SpaceToBatchND.h +++ b/compute/cker/include/cker/operation/SpaceToBatchND.h @@ -79,9 +79,9 @@ inline void SpaceToBatchND(const SpaceToBatchParams ¶ms, const Shape &unexte else { const T *in = - input_data + Offset(input_shape, input_batch, - (out_h * block_shape_height + shift_h) - padding_top, - (out_w * block_shape_width + shift_w) - padding_left, 0); + input_data + Offset(input_shape, input_batch, + (out_h * block_shape_height + shift_h) - padding_top, + (out_w * block_shape_width + shift_w) - padding_left, 0); memcpy(out, in, depth * sizeof(T)); } } diff --git a/compute/cker/include/cker/operation/StatelessRandomUniform.h b/compute/cker/include/cker/operation/StatelessRandomUniform.h index d5952ae23..dcf649ca1 100644 --- a/compute/cker/include/cker/operation/StatelessRandomUniform.h +++ 
b/compute/cker/include/cker/operation/StatelessRandomUniform.h @@ -72,8 +72,8 @@ void Fill(random::PhiloxRandom random, Tensor *output) Distribution()); } -inline void StatelessRandomUniform(const Shape &shape_shape, const int *shape_data, - const Shape &seed_shape, const int *seed_data, +inline void StatelessRandomUniform(const Shape &shape_shape, const int32_t *shape_data, + const Shape &seed_shape, const int32_t *seed_data, const Shape &output_shape, float *output_data) { Tensor shape_t; @@ -95,7 +95,7 @@ inline void StatelessRandomUniform(const Shape &shape_shape, const int *shape_da GenerateKey(seed_t, &key, &counter); Fill<Eigen::ThreadPoolDevice, random::UniformDistribution<random::PhiloxRandom, float>>( - random::PhiloxRandom(counter, key), &output_t); + random::PhiloxRandom(counter, key), &output_t); } } // namespace cker } // namespace nnfw diff --git a/compute/cker/include/cker/operation/StridedSlice.h b/compute/cker/include/cker/operation/StridedSlice.h index c57b4daa0..2f1089575 100644 --- a/compute/cker/include/cker/operation/StridedSlice.h +++ b/compute/cker/include/cker/operation/StridedSlice.h @@ -260,12 +260,41 @@ template <typename T> inline void StridedSlice(const StridedSliceParams &op_params, const Shape &unextended_input_shape, const T *input_data, const Shape &unextended_output_shape, T *output_data) { - // Note that the output_shape is not used herein. - StridedSliceParams params_copy = op_params; - assert(unextended_input_shape.DimensionsCount() <= 4); assert(unextended_output_shape.DimensionsCount() <= 4); + bool optimize = true; + int st_count = op_params.strides_count; + for (int idx = 0; idx < st_count - 1; idx++) + { + const int axis_size = unextended_input_shape.Dims(idx); + const int start = StartForAxis(op_params, unextended_input_shape, idx); + const int stop = StopForAxis(op_params, unextended_input_shape, idx, start); + if ((axis_size != 1) && (start != 0 || stop != 0)) + { + optimize = false; + break; + } + } + + if (optimize) + { + if (op_params.strides[st_count - 1] == 1) + { + const int start = StartForAxis(op_params, unextended_input_shape, st_count - 1); + const int end = StopForAxis(op_params, unextended_input_shape, st_count - 1, start); + + for (int idx = 0; idx < end - start; idx++) + { + output_data[idx] = input_data[idx + start]; + } + return; + } + } + + // Note that the output_shape is not used herein. 
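// Illustrative sketch, not part of the patch: the fast path added to StridedSlice
// above applies when the slice effectively selects only along the innermost axis
// (each leading dimension is trivial) and the last-axis stride is 1; the op then
// degenerates to copying one contiguous range [start, end). Hypothetical values:
#include <algorithm>

// input {10, 20, 30, 40, 50}, start = 1, end = 4  ->  output {20, 30, 40}
inline void SliceLastAxisContiguous(const float *input, int start, int end, float *output)
{
  std::copy(input + start, input + end, output);
}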
+ StridedSliceParams params_copy = op_params; + const Shape input_shape = Shape::ExtendedShape(4, unextended_input_shape); const Shape output_shape = Shape::ExtendedShape(4, unextended_output_shape); diff --git a/compute/cker/include/cker/operation/Tile.h b/compute/cker/include/cker/operation/Tile.h index 1dcdd9b79..42433468a 100644 --- a/compute/cker/include/cker/operation/Tile.h +++ b/compute/cker/include/cker/operation/Tile.h @@ -55,7 +55,7 @@ std::pair<int, int> TileOneDimension(const Shape &in_dimensions, const T *in_dat { int stride_size = 0, tiled_stride_size = 0; std::tie(stride_size, tiled_stride_size) = - TileOneDimension(in_dimensions, copy_from_data, multipliers, copy_to_data, dimension + 1); + TileOneDimension(in_dimensions, copy_from_data, multipliers, copy_to_data, dimension + 1); copy_from_data += stride_size; copy_to_data += tiled_stride_size; total_stride_size += stride_size; diff --git a/compute/cker/include/cker/operation/Transpose.h b/compute/cker/include/cker/operation/Transpose.h index 9d8cd340d..52c826c39 100644 --- a/compute/cker/include/cker/operation/Transpose.h +++ b/compute/cker/include/cker/operation/Transpose.h @@ -288,7 +288,7 @@ size_t Flatten(const Shape &input_shape, const Shape &output_shape, const Transp return flat_size; } -} // namespace anonymous (util) +} // namespace // Transpose2D only deals with typical 2D matrix transpose ops. // Perform transpose by transposing 4x4 blocks of the input, proceeding from @@ -555,9 +555,9 @@ void Transpose(const TransposeParams &unshrunk_params, const Shape &unshrunk_inp const int total_size = shrunk_input_shape.FlatSize(); const int non_flatten_size = - Flatten(shrunk_input_shape, shrunk_output_shape, shrunk_params, + Flatten(shrunk_input_shape, shrunk_output_shape, shrunk_params, - &non_flatten_input_shape, &non_flatten_output_shape, &non_flatten_params); + &non_flatten_input_shape, &non_flatten_output_shape, &non_flatten_params); assert(non_flatten_params.perm[0] != 0); for (int i = 0; i < total_size; i += non_flatten_size) diff --git a/compute/cker/include/cker/operation/TransposeConv.h b/compute/cker/include/cker/operation/TransposeConv.h index 7db3a1179..d41f86047 100644 --- a/compute/cker/include/cker/operation/TransposeConv.h +++ b/compute/cker/include/cker/operation/TransposeConv.h @@ -90,11 +90,11 @@ inline void TransposeConv(const TransposeConvParams ¶ms, const Shape &input_ (out_y < output_height)) { float input_value = - input_data[Offset(input_shape, batch, in_y, in_x, in_channel)]; - float filter_value = filter_data[Offset(filter_shape, out_channel, filter_y, - filter_x, in_channel)]; + input_data[Offset(input_shape, batch, in_y, in_x, in_channel)]; + float filter_value = + filter_data[Offset(filter_shape, out_channel, filter_y, filter_x, in_channel)]; output_data[Offset(output_shape, batch, out_y, out_x, out_channel)] += - input_value * filter_value; + input_value * filter_value; } } } diff --git a/compute/cker/include/cker/operation/optimized/BinaryArithmeticOps.h b/compute/cker/include/cker/operation/optimized/BinaryArithmeticOps.h index ac5069917..1fe3e1517 100644 --- a/compute/cker/include/cker/operation/optimized/BinaryArithmeticOps.h +++ b/compute/cker/include/cker/operation/optimized/BinaryArithmeticOps.h @@ -19,6 +19,8 @@ #define __NNFW_CKER_OPTIMIZED_BINARYARITHMETICOPS_H__ #include <functional> +#include <limits> +#include <utility> #include "cker/neon/neon_check.h" #include "cker/operation/reference/BinaryArithmeticOps.h" #include "cker/Shape.h" @@ -33,8 +35,9 @@ namespace cker namespace 
optimized { +/* Old version: For Sub(float) and Div. */ template <typename ElementwiseF, typename ScalarBroadcastF, typename T> -inline void BinaryBroadcastFiveFold(const BinaryArithmeticOpParam ¶ms, +inline void BinaryBroadcastFiveFold(const BinaryArithmeticOpParam ¶ms, bool switch_inputs, const Shape & /* unswitched_input1_shape */, const T *unswitched_input1_data, const Shape & /* unswitched_input2_shape */, @@ -42,11 +45,8 @@ inline void BinaryBroadcastFiveFold(const BinaryArithmeticOpParam ¶ms, const Shape & /* output_shape */, T *output_data, ElementwiseF elementwise_f, ScalarBroadcastF scalar_broadcast_f) { - const bool use_unswitched = - params.broadcast_category == BroadcastableOpCategory::kFirstInputBroadcastsFast; - - const T *input1_data = use_unswitched ? unswitched_input1_data : unswitched_input2_data; - const T *input2_data = use_unswitched ? unswitched_input2_data : unswitched_input1_data; + const T *input1_data = switch_inputs ? unswitched_input2_data : unswitched_input1_data; + const T *input2_data = switch_inputs ? unswitched_input1_data : unswitched_input2_data; // Fivefold nested loops. The second input resets its position for each // iteration of the second loop. The first input resets its position at the @@ -123,29 +123,129 @@ inline void BinaryBroadcastFiveFold(const BinaryArithmeticOpParam ¶ms, } } -inline int32_t quant8_sum(const BinaryArithmeticOpParam ¶ms, const uint8_t input1_data, - const uint8_t input2_data) +// New version: For Mul, Add and Sub(quant8) +template <typename ElementwiseF, typename ScalarBroadcastF, typename T> +inline void BinaryBroadcastFiveFold(const BinaryArithmeticOpParam &unswitched_params, + const Shape & /* unswitched_input1_shape */, + const T *unswitched_input1_data, + const Shape & /* unswitched_input2_shape */, + const T *unswitched_input2_data, + const Shape & /* output_shape */, T *output_data, + ElementwiseF elementwise_f, ScalarBroadcastF scalar_broadcast_f) +{ + BinaryArithmeticOpParam switched_params = unswitched_params; + switched_params.input1_offset = unswitched_params.input2_offset; + switched_params.input1_multiplier = unswitched_params.input2_multiplier; + switched_params.input1_shift = unswitched_params.input2_shift; + switched_params.input2_offset = unswitched_params.input1_offset; + switched_params.input2_multiplier = unswitched_params.input1_multiplier; + switched_params.input2_shift = unswitched_params.input1_shift; + + const bool use_unswitched = + unswitched_params.broadcast_category == BroadcastableOpCategory::kFirstInputBroadcastsFast; + + const BinaryArithmeticOpParam ¶ms = use_unswitched ? unswitched_params : switched_params; + const T *input1_data = use_unswitched ? unswitched_input1_data : unswitched_input2_data; + const T *input2_data = use_unswitched ? unswitched_input2_data : unswitched_input1_data; + + // Fivefold nested loops. The second input resets its position for each + // iteration of the second loop. The first input resets its position at the + // beginning of the fourth loop. The innermost loop is an elementwise add of + // sections of the arrays. + T *output_data_ptr = output_data; + const T *input1_data_ptr = input1_data; + const T *input2_data_reset = input2_data; + // In the fivefold pattern, y0, y2 and y4 are not broadcast, and so shared + // between input shapes. y3 for input 1 is always broadcast, and so the + // dimension there is 1, whereas optionally y1 might be broadcast for + // input 2. 
Put another way, input1.shape.FlatSize = y0 * y1 * y2 * y4, + // input2.shape.FlatSize = y0 * y2 * y3 * y4. + int y0 = params.broadcast_shape[0]; + int y1 = params.broadcast_shape[1]; + int y2 = params.broadcast_shape[2]; + int y3 = params.broadcast_shape[3]; + int y4 = params.broadcast_shape[4]; + if (y4 > 1) + { + // General fivefold pattern, with y4 > 1 so there is a non-broadcast inner + // dimension. + for (int i0 = 0; i0 < y0; ++i0) + { + const T *input2_data_ptr = nullptr; + for (int i1 = 0; i1 < y1; ++i1) + { + input2_data_ptr = input2_data_reset; + for (int i2 = 0; i2 < y2; ++i2) + { + for (int i3 = 0; i3 < y3; ++i3) + { + elementwise_f(y4, params, input1_data_ptr, input2_data_ptr, output_data_ptr); + input2_data_ptr += y4; + output_data_ptr += y4; + } + // We have broadcast y4 of input1 data y3 times, and now move on. + input1_data_ptr += y4; + } + } + // We have broadcast y2*y3*y4 of input2 data y1 times, and now move on. + input2_data_reset = input2_data_ptr; + } + } + else + { + // Special case of y4 == 1, in which the innermost loop is a single + // element and can be combined with the next (y3) as an inner broadcast. + // + // Note that this handles the case of pure scalar broadcast when + // y0 == y1 == y2 == 1. With low overhead it handles cases such as scalar + // broadcast with batch (as y2 > 1). + // + // NOTE The process is the same as the above general case except + // simplified for y4 == 1 and the loop over y3 is contained within the + // AddScalarBroadcast function. + for (int i0 = 0; i0 < y0; ++i0) + { + const T *input2_data_ptr = nullptr; + for (int i1 = 0; i1 < y1; ++i1) + { + input2_data_ptr = input2_data_reset; + for (int i2 = 0; i2 < y2; ++i2) + { + scalar_broadcast_f(y3, params, *input1_data_ptr, input2_data_ptr, output_data_ptr); + input2_data_ptr += y3; + output_data_ptr += y3; + input1_data_ptr += 1; + } + } + input2_data_reset = input2_data_ptr; + } + } +} + +template <typename T> +inline typename std::enable_if_t<is_quant8<T>::value, int32_t> +quant8_sum(const BinaryArithmeticOpParam ¶ms, const T input1_data, const T input2_data) { const int32_t input1_val = params.input1_offset + input1_data; const int32_t input2_val = params.input2_offset + input2_data; const int32_t shifted_input1_val = input1_val * (1 << params.left_shift); const int32_t shifted_input2_val = input2_val * (1 << params.left_shift); const int32_t scaled_input1_val = MultiplyByQuantizedMultiplierSmallerThanOneExp( - shifted_input1_val, params.input1_multiplier, params.input1_shift); + shifted_input1_val, params.input1_multiplier, params.input1_shift); const int32_t scaled_input2_val = MultiplyByQuantizedMultiplierSmallerThanOneExp( - shifted_input2_val, params.input2_multiplier, params.input2_shift); + shifted_input2_val, params.input2_multiplier, params.input2_shift); const int32_t raw_sum = scaled_input1_val + scaled_input2_val; const int32_t raw_output = MultiplyByQuantizedMultiplierSmallerThanOneExp( - raw_sum, params.output_multiplier, params.output_shift) + + raw_sum, params.output_multiplier, params.output_shift) + params.output_offset; const int32_t clamped_output = std::min(params.quantized_activation_max, std::max(params.quantized_activation_min, raw_output)); return clamped_output; } -inline void AddElementwiseQuant8(int size, const BinaryArithmeticOpParam ¶ms, - const uint8_t *input1_data, const uint8_t *input2_data, - uint8_t *output_data) +inline void AddElementwise(int size, const BinaryArithmeticOpParam ¶ms, + const uint8_t *input1_data, const uint8_t *input2_data, + 
uint8_t *output_data) { int i = 0; @@ -193,9 +293,9 @@ inline void AddElementwiseQuant8(int size, const BinaryArithmeticOpParam ¶ms const int16x4_t s1_narrowed = vmovn_s32(s1); const int16x4_t s2_narrowed = vmovn_s32(s2); const int16x8_t s = - vaddq_s16(vcombine_s16(s1_narrowed, s2_narrowed), vdupq_n_s16(params.output_offset)); - const uint8x8_t clamped = vmax_u8(output_activation_min_vector, - vmin_u8(output_activation_max_vector, vqmovun_s16(s))); + vaddq_s16(vcombine_s16(s1_narrowed, s2_narrowed), vdupq_n_s16(params.output_offset)); + const uint8x8_t clamped = + vmax_u8(output_activation_min_vector, vmin_u8(output_activation_max_vector, vqmovun_s16(s))); vst1_u8(output_data + i, clamped); } #endif // NEON @@ -206,12 +306,12 @@ inline void AddElementwiseQuant8(int size, const BinaryArithmeticOpParam ¶ms const int32_t shifted_input1_val = input1_val * (1 << params.left_shift); const int32_t shifted_input2_val = input2_val * (1 << params.left_shift); const int32_t scaled_input1_val = MultiplyByQuantizedMultiplierSmallerThanOneExp( - shifted_input1_val, params.input1_multiplier, params.input1_shift); + shifted_input1_val, params.input1_multiplier, params.input1_shift); const int32_t scaled_input2_val = MultiplyByQuantizedMultiplierSmallerThanOneExp( - shifted_input2_val, params.input2_multiplier, params.input2_shift); + shifted_input2_val, params.input2_multiplier, params.input2_shift); const int32_t raw_sum = scaled_input1_val + scaled_input2_val; const int32_t raw_output = MultiplyByQuantizedMultiplierSmallerThanOneExp( - raw_sum, params.output_multiplier, params.output_shift) + + raw_sum, params.output_multiplier, params.output_shift) + params.output_offset; const int32_t clamped_output = std::min(params.quantized_activation_max, std::max(params.quantized_activation_min, raw_output)); @@ -220,7 +320,248 @@ inline void AddElementwiseQuant8(int size, const BinaryArithmeticOpParam ¶ms } inline void AddElementwise(int size, const BinaryArithmeticOpParam ¶ms, - const float *input1_data, const float *input2_data, float *output_data) + const int8_t *input1_data, const int8_t *input2_data, + int8_t *output_data) +{ + int i = 0; +#ifdef USE_NEON + const int8x16_t output_activation_min_vector = vdupq_n_s8(params.quantized_activation_min); + const int8x16_t output_activation_max_vector = vdupq_n_s8(params.quantized_activation_max); + + const int input1_left_shift = params.left_shift + params.input1_shift; + const int input2_left_shift = params.left_shift + params.input2_shift; + const int32x4_t input1_left_dup = vdupq_n_s32(input1_left_shift); + const int32x4_t input2_left_dup = vdupq_n_s32(input2_left_shift); + + const int16x8_t input1_offset_dup = vdupq_n_s16(params.input1_offset); + const int16x8_t input2_offset_dup = vdupq_n_s16(params.input2_offset); + + for (; i <= size - 16; i += 16) + { + const int8x16_t input1_val_original = vld1q_s8(input1_data + i); + const int8x16_t input2_val_original = vld1q_s8(input2_data + i); + + const int16x8_t input1_val_s16_high = vmovl_s8(vget_high_s8(input1_val_original)); + const int16x8_t input1_val_s16_low = vmovl_s8(vget_low_s8(input1_val_original)); + + const int16x8_t input2_val_s16_high = vmovl_s8(vget_high_s8(input2_val_original)); + const int16x8_t input2_val_s16_low = vmovl_s8(vget_low_s8(input2_val_original)); + const int16x8_t input1_val_high = vaddq_s16(input1_val_s16_high, input1_offset_dup); + const int16x8_t input2_val_high = vaddq_s16(input2_val_s16_high, input2_offset_dup); + const int16x8_t input1_val_low = vaddq_s16(input1_val_s16_low, 
input1_offset_dup); + const int16x8_t input2_val_low = vaddq_s16(input2_val_s16_low, input2_offset_dup); + const int16x4_t input1_val_high_high = vget_high_s16(input1_val_high); + const int16x4_t input1_val_high_low = vget_low_s16(input1_val_high); + const int16x4_t input1_val_low_high = vget_high_s16(input1_val_low); + const int16x4_t input1_val_low_low = vget_low_s16(input1_val_low); + const int16x4_t input2_val_high_high = vget_high_s16(input2_val_high); + const int16x4_t input2_val_high_low = vget_low_s16(input2_val_high); + const int16x4_t input2_val_low_high = vget_high_s16(input2_val_low); + const int16x4_t input2_val_low_low = vget_low_s16(input2_val_low); + int32x4_t x111 = vmovl_s16(input1_val_low_low); + int32x4_t x112 = vmovl_s16(input1_val_low_high); + int32x4_t x121 = vmovl_s16(input1_val_high_low); + int32x4_t x122 = vmovl_s16(input1_val_high_high); + int32x4_t x211 = vmovl_s16(input2_val_low_low); + int32x4_t x212 = vmovl_s16(input2_val_low_high); + int32x4_t x221 = vmovl_s16(input2_val_high_low); + int32x4_t x222 = vmovl_s16(input2_val_high_high); + + x111 = vshlq_s32(x111, input1_left_dup); + x112 = vshlq_s32(x112, input1_left_dup); + x121 = vshlq_s32(x121, input1_left_dup); + x122 = vshlq_s32(x122, input1_left_dup); + x211 = vshlq_s32(x211, input2_left_dup); + x212 = vshlq_s32(x212, input2_left_dup); + x221 = vshlq_s32(x221, input2_left_dup); + x222 = vshlq_s32(x222, input2_left_dup); + x111 = vqrdmulhq_n_s32(x111, params.input1_multiplier); + x112 = vqrdmulhq_n_s32(x112, params.input1_multiplier); + x121 = vqrdmulhq_n_s32(x121, params.input1_multiplier); + x122 = vqrdmulhq_n_s32(x122, params.input1_multiplier); + x211 = vqrdmulhq_n_s32(x211, params.input2_multiplier); + x212 = vqrdmulhq_n_s32(x212, params.input2_multiplier); + x221 = vqrdmulhq_n_s32(x221, params.input2_multiplier); + x222 = vqrdmulhq_n_s32(x222, params.input2_multiplier); + int32x4_t s11 = vaddq_s32(x111, x211); + int32x4_t s12 = vaddq_s32(x112, x212); + int32x4_t s21 = vaddq_s32(x121, x221); + int32x4_t s22 = vaddq_s32(x122, x222); + s11 = vqrdmulhq_n_s32(s11, params.output_multiplier); + s12 = vqrdmulhq_n_s32(s12, params.output_multiplier); + s21 = vqrdmulhq_n_s32(s21, params.output_multiplier); + s22 = vqrdmulhq_n_s32(s22, params.output_multiplier); + using gemmlowp::RoundingDivideByPOT; + s11 = RoundingDivideByPOT(s11, -params.output_shift); + s12 = RoundingDivideByPOT(s12, -params.output_shift); + s21 = RoundingDivideByPOT(s21, -params.output_shift); + s22 = RoundingDivideByPOT(s22, -params.output_shift); + const int16x4_t s11_narrowed = vmovn_s32(s11); + const int16x4_t s12_narrowed = vmovn_s32(s12); + const int16x4_t s21_narrowed = vmovn_s32(s21); + const int16x4_t s22_narrowed = vmovn_s32(s22); + const int16x8_t s1 = + vaddq_s16(vcombine_s16(s11_narrowed, s12_narrowed), vdupq_n_s16(params.output_offset)); + const int16x8_t s2 = + vaddq_s16(vcombine_s16(s21_narrowed, s22_narrowed), vdupq_n_s16(params.output_offset)); + const int8x16_t s = vcombine_s8(vqmovn_s16(s1), vqmovn_s16(s2)); + + const int8x16_t clamped = + vmaxq_s8(output_activation_min_vector, vminq_s8(output_activation_max_vector, s)); + vst1q_s8(output_data + i, clamped); + } +#endif // NEON + + for (; i < size; ++i) + { + const int32_t input1_val = params.input1_offset + input1_data[i]; + const int32_t input2_val = params.input2_offset + input2_data[i]; + const int32_t shifted_input1_val = input1_val * (1 << params.left_shift); + const int32_t shifted_input2_val = input2_val * (1 << params.left_shift); + const int32_t 
scaled_input1_val = MultiplyByQuantizedMultiplierSmallerThanOneExp( + shifted_input1_val, params.input1_multiplier, params.input1_shift); + const int32_t scaled_input2_val = MultiplyByQuantizedMultiplierSmallerThanOneExp( + shifted_input2_val, params.input2_multiplier, params.input2_shift); + const int32_t raw_sum = scaled_input1_val + scaled_input2_val; + const int32_t raw_output = MultiplyByQuantizedMultiplierSmallerThanOneExp( + raw_sum, params.output_multiplier, params.output_shift) + + params.output_offset; + const int32_t clamped_output = std::min(params.quantized_activation_max, + std::max(params.quantized_activation_min, raw_output)); + output_data[i] = static_cast<int8_t>(clamped_output); + } +} + +struct BinaryOpFuncAddFloat +{ +#ifdef USE_NEON + static inline float32x4_t calculate(const float32x4_t &a, const float32x4_t &b) + { + return vaddq_f32(a, b); + } +#endif // USE_NEON + static inline float calculate(const float a, const float b) { return a + b; } +}; + +struct BinaryOpFuncSubFloat +{ +#ifdef USE_NEON + static inline float32x4_t calculate(const float32x4_t &a, const float32x4_t &b) + { + return vsubq_f32(a, b); + } +#endif // USE_NEON + static inline float calculate(const float a, const float b) { return a - b; } +}; + +struct BinaryOpFuncMulFloat +{ +#ifdef USE_NEON + static inline float32x4_t calculate(const float32x4_t &a, const float32x4_t &b) + { + return vmulq_f32(a, b); + } +#endif // USE_NEON + static inline float calculate(const float a, const float b) { return a * b; } +}; + +struct BinaryOpFuncDivFloat +{ +#ifdef USE_NEON +#ifdef __aarch64__ + static inline float32x4_t calculate(const float32x4_t &a, const float32x4_t &b) + { + return vdivq_f32(a, b); + } +#endif // __aarch64__ +#endif // USE_NEON + static inline float calculate(const float a, const float b) { return a / b; } +}; + +template <class BASEOPERATOR> struct BinaryOpFuncSwapArgs +{ + template <typename T> static inline T calculate(const T &a, const T &b) + { + return BASEOPERATOR::calculate(b, a); + } +}; + +struct BinaryOpActivationFloatNone +{ +#ifdef USE_NEON + static inline float32x4_t applyCeiling(const float32x4_t &value, const float32x4_t &ceilingParam) + { + (void)ceilingParam; // suppress unused argument warning + return value; + } + static inline float32x4_t applyFloor(const float32x4_t &value, const float32x4_t &floorParam) + { + (void)floorParam; + return value; + } +#endif // USE_NEON + static inline float applyCeiling(const float value, const float ceilingParam) + { + (void)ceilingParam; + return value; + } + static inline float applyFloor(const float value, const float floorParam) + { + (void)floorParam; + return value; + } +}; + +struct BinaryOpActivationFloatMax +{ +#ifdef USE_NEON + static inline float32x4_t applyCeiling(const float32x4_t &value, const float32x4_t &ceilingParam) + { + (void)ceilingParam; // suppress unused argument warning + return value; + } + static inline float32x4_t applyFloor(const float32x4_t &value, const float32x4_t &floorParam) + { + return vmaxq_f32(value, floorParam); + } +#endif // USE_NEON + static inline float applyCeiling(const float value, const float ceilingParam) + { + (void)ceilingParam; + return value; + } + static inline float applyFloor(const float value, const float floorParam) + { + return std::max(value, floorParam); + } +}; + +struct BinaryOpActivationFloatMinMax +{ +#ifdef USE_NEON + static inline float32x4_t applyCeiling(const float32x4_t &value, const float32x4_t &ceilingParam) + { + return vminq_f32(value, ceilingParam); + } + static 
inline float32x4_t applyFloor(const float32x4_t &value, const float32x4_t &floorParam) + { + return vmaxq_f32(value, floorParam); + } +#endif // USE_NEON + static inline float applyCeiling(const float value, const float ceilingParam) + { + return std::min(value, ceilingParam); + } + static inline float applyFloor(const float value, const float floorParam) + { + return std::max(value, floorParam); + } +}; + +template <class OPERATOR, class ACTIVATION> +inline void BinaryOpElementwise(int size, const BinaryArithmeticOpParam ¶ms, + const float *input1_data, const float *input2_data, + float *output_data) { int i = 0; @@ -237,18 +578,18 @@ inline void AddElementwise(int size, const BinaryArithmeticOpParam ¶ms, auto a21 = vld1q_f32(input2_data + i + 4); auto a22 = vld1q_f32(input2_data + i + 8); auto a23 = vld1q_f32(input2_data + i + 12); - auto x0 = vaddq_f32(a10, a20); - auto x1 = vaddq_f32(a11, a21); - auto x2 = vaddq_f32(a12, a22); - auto x3 = vaddq_f32(a13, a23); - x0 = vmaxq_f32(activation_min, x0); - x1 = vmaxq_f32(activation_min, x1); - x2 = vmaxq_f32(activation_min, x2); - x3 = vmaxq_f32(activation_min, x3); - x0 = vminq_f32(activation_max, x0); - x1 = vminq_f32(activation_max, x1); - x2 = vminq_f32(activation_max, x2); - x3 = vminq_f32(activation_max, x3); + auto x0 = OPERATOR::calculate(a10, a20); + auto x1 = OPERATOR::calculate(a11, a21); + auto x2 = OPERATOR::calculate(a12, a22); + auto x3 = OPERATOR::calculate(a13, a23); + x0 = ACTIVATION::applyFloor(x0, activation_min); + x1 = ACTIVATION::applyFloor(x1, activation_min); + x2 = ACTIVATION::applyFloor(x2, activation_min); + x3 = ACTIVATION::applyFloor(x3, activation_min); + x0 = ACTIVATION::applyCeiling(x0, activation_max); + x1 = ACTIVATION::applyCeiling(x1, activation_max); + x2 = ACTIVATION::applyCeiling(x2, activation_max); + x3 = ACTIVATION::applyCeiling(x3, activation_max); vst1q_f32(output_data + i, x0); vst1q_f32(output_data + i + 4, x1); vst1q_f32(output_data + i + 8, x2); @@ -258,26 +599,101 @@ inline void AddElementwise(int size, const BinaryArithmeticOpParam ¶ms, { auto a1 = vld1q_f32(input1_data + i); auto a2 = vld1q_f32(input2_data + i); - auto x = vaddq_f32(a1, a2); - x = vmaxq_f32(activation_min, x); - x = vminq_f32(activation_max, x); - vst1q_f32(output_data + i, x); + auto x = OPERATOR::calculate(a1, a2); // vaddq + auto x_clamped = + ACTIVATION::applyCeiling(ACTIVATION::applyFloor(x, activation_min), activation_max); + vst1q_f32(output_data + i, x_clamped); } -#endif // NEON +#endif // USE_NEON + for (; i < size; i++) + { + auto x = OPERATOR::calculate(input1_data[i], input2_data[i]); + output_data[i] = ACTIVATION::applyCeiling( + ACTIVATION::applyFloor(x, params.float_activation_min), params.float_activation_max); + } +} + +// Broadcast binary op template that can often be used for inner loop +// This function will handle scalar_value (LHS) and vector_values (RHS). +// Since it's a float function, input params does not matter here. 
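// Both the elementwise loop above and the scalar-broadcast variant below reduce, per element,
// to the same composition of the OPERATOR / ACTIVATION policy structs. A minimal scalar sketch
// (BinaryOpScalarSketch is a hypothetical helper used only for illustration, not part of the patch):
template <class OPERATOR, class ACTIVATION>
inline float BinaryOpScalarSketch(float a, float b, const BinaryArithmeticOpParam &params)
{
  // apply the binary op, then clamp with the activation policy
  const float x = OPERATOR::calculate(a, b);
  return ACTIVATION::applyCeiling(ACTIVATION::applyFloor(x, params.float_activation_min),
                                  params.float_activation_max);
}
// For example, a float Add fused with ReLU6 ends up as
// BinaryOpElementwise<BinaryOpFuncAddFloat, BinaryOpActivationFloatMinMax>, selected by
// getBinaryOpWithActivationImplFloat<BinaryOpFuncAddFloat>(params) further below.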
+template <class OPERATOR, class ACTIVATION> +inline void BinaryOpScalarBroadcast(int size, const BinaryArithmeticOpParam ¶ms, + const float broadcast_value, const float *input2_data, + float *output_data) +{ + int i = 0; + +#ifdef USE_NEON + const auto activation_min = vdupq_n_f32(params.float_activation_min); + const auto activation_max = vdupq_n_f32(params.float_activation_max); + const auto broadcast_value_dup = vdupq_n_f32(broadcast_value); + for (; i <= size - 16; i += 16) + { + auto a20 = vld1q_f32(input2_data + i); + auto a21 = vld1q_f32(input2_data + i + 4); + auto a22 = vld1q_f32(input2_data + i + 8); + auto a23 = vld1q_f32(input2_data + i + 12); + auto x0 = OPERATOR::calculate(broadcast_value_dup, a20); + auto x1 = OPERATOR::calculate(broadcast_value_dup, a21); + auto x2 = OPERATOR::calculate(broadcast_value_dup, a22); + auto x3 = OPERATOR::calculate(broadcast_value_dup, a23); + x0 = ACTIVATION::applyFloor(x0, activation_min); + x1 = ACTIVATION::applyFloor(x1, activation_min); + x2 = ACTIVATION::applyFloor(x2, activation_min); + x3 = ACTIVATION::applyFloor(x3, activation_min); + x0 = ACTIVATION::applyCeiling(x0, activation_max); + x1 = ACTIVATION::applyCeiling(x1, activation_max); + x2 = ACTIVATION::applyCeiling(x2, activation_max); + x3 = ACTIVATION::applyCeiling(x3, activation_max); + vst1q_f32(output_data + i, x0); + vst1q_f32(output_data + i + 4, x1); + vst1q_f32(output_data + i + 8, x2); + vst1q_f32(output_data + i + 12, x3); + } + for (; i <= size - 4; i += 4) + { + auto a2 = vld1q_f32(input2_data + i); + auto x = OPERATOR::calculate(broadcast_value_dup, a2); + auto x_clamped = + ACTIVATION::applyCeiling(ACTIVATION::applyFloor(x, activation_min), activation_max); + vst1q_f32(output_data + i, x_clamped); + } +#endif // USE_NEON for (; i < size; i++) { - auto x = input1_data[i] + input2_data[i]; - output_data[i] = ActivationFunctionWithMinMax<float>(x, params.float_activation_min, - params.float_activation_max); + auto x = OPERATOR::calculate(broadcast_value, input2_data[i]); + output_data[i] = ACTIVATION::applyCeiling( + ACTIVATION::applyFloor(x, params.float_activation_min), params.float_activation_max); } } -inline void AddQuant8(const BinaryArithmeticOpParam ¶ms, const Shape &input1_shape, - const uint8_t *input1_data, const Shape &input2_shape, - const uint8_t *input2_data, const Shape &output_shape, uint8_t *output_data) +using BinaryOpImplFloatFuncs = + std::pair<void (*)(int, const BinaryArithmeticOpParam &, const float *, const float *, float *), + void (*)(int, const BinaryArithmeticOpParam &, const float, const float *, float *)>; + +template <class FUNC> +inline BinaryOpImplFloatFuncs +getBinaryOpWithActivationImplFloat(const BinaryArithmeticOpParam ¶ms) +{ + if (params.float_activation_max == std::numeric_limits<float>::max()) + if (params.float_activation_min == std::numeric_limits<float>::lowest()) + return BinaryOpImplFloatFuncs(BinaryOpElementwise<FUNC, BinaryOpActivationFloatNone>, + BinaryOpScalarBroadcast<FUNC, BinaryOpActivationFloatNone>); + else + return BinaryOpImplFloatFuncs(BinaryOpElementwise<FUNC, BinaryOpActivationFloatMax>, + BinaryOpScalarBroadcast<FUNC, BinaryOpActivationFloatMax>); + else + return BinaryOpImplFloatFuncs(BinaryOpElementwise<FUNC, BinaryOpActivationFloatMinMax>, + BinaryOpScalarBroadcast<FUNC, BinaryOpActivationFloatMinMax>); +} + +template <typename T> +inline typename std::enable_if_t<is_quant8<T>::value> +Add(const BinaryArithmeticOpParam ¶ms, const Shape &input1_shape, const T *input1_data, + const Shape &input2_shape, 
const T *input2_data, const Shape &output_shape, T *output_data) { const int flat_size = MatchingElementsSize(input1_shape, input2_shape, output_shape); - AddElementwiseQuant8(flat_size, params, input1_data, input2_data, output_data); + AddElementwise(flat_size, params, input1_data, input2_data, output_data); } inline void Add(const BinaryArithmeticOpParam ¶ms, const Shape &input1_shape, @@ -285,15 +701,16 @@ inline void Add(const BinaryArithmeticOpParam ¶ms, const Shape &input1_shape const Shape &output_shape, float *output_data) { const int flat_size = MatchingElementsSize(input1_shape, input2_shape, output_shape); - AddElementwise(flat_size, params, input1_data, input2_data, output_data); + auto implFuncs = getBinaryOpWithActivationImplFloat<BinaryOpFuncAddFloat>(params); + (*implFuncs.first)(flat_size, params, input1_data, input2_data, output_data); } // Scalar-broadcast add that can be used for inner loop of more general // broadcast add, so that, for example, scalar-broadcast with batch will still // be fast. -inline void AddScalarBroadcastQuant8(int size, const BinaryArithmeticOpParam ¶ms, - uint8_t broadcast_value, const uint8_t *input2_data, - uint8_t *output_data) +inline void AddScalarBroadcast(int size, const BinaryArithmeticOpParam ¶ms, + uint8_t broadcast_value, const uint8_t *input2_data, + uint8_t *output_data) { int i = 0; int32_t clamped_output; @@ -304,58 +721,115 @@ inline void AddScalarBroadcastQuant8(int size, const BinaryArithmeticOpParam &pa } } -inline void AddScalarBroadcast(int size, const BinaryArithmeticOpParam ¶ms, - float broadcast_value, const float *input2_data, float *output_data) +// Scalar-broadcast add that can be used for inner loop of more general +// broadcast add, so that, for example, scalar-broadcast with batch will still +// be fast. +inline void AddScalarBroadcast(int size, const BinaryArithmeticOpParam ¶ms, int8_t input1_data, + const int8_t *input2_data, int8_t *output_data) { + using gemmlowp::RoundingDivideByPOT; int i = 0; #ifdef USE_NEON - const float32x4_t output_activation_min_vector = vdupq_n_f32(params.float_activation_min); - const float32x4_t output_activation_max_vector = vdupq_n_f32(params.float_activation_max); - const float32x4_t broadcast_value_dup = vdupq_n_f32(broadcast_value); - for (; i <= size - 4; i += 4) - { - const float32x4_t input2_val_original = vld1q_f32(input2_data + i); + const int32x4_t left_shift_dup = vdupq_n_s32(params.left_shift); + const int8x8_t output_activation_min_vector = vdup_n_s8(params.quantized_activation_min); + const int8x8_t output_activation_max_vector = vdup_n_s8(params.quantized_activation_max); - const float32x4_t output = vaddq_f32(input2_val_original, broadcast_value_dup); + // Process broadcast scalar. 
+ const int8x8_t input1_val_original = vdup_n_s8(input1_data); + const int16x8_t input1_val_s16 = vmovl_s8(input1_val_original); + const int16x8_t input1_val = vaddq_s16(input1_val_s16, vdupq_n_s16(params.input1_offset)); + const int16x4_t input1_val_high = vget_high_s16(input1_val); + const int16x4_t input1_val_low = vget_low_s16(input1_val); + int32x4_t x11 = vmovl_s16(input1_val_low); + int32x4_t x12 = vmovl_s16(input1_val_high); + x11 = vshlq_s32(x11, left_shift_dup); + x12 = vshlq_s32(x12, left_shift_dup); + x11 = vqrdmulhq_n_s32(x11, params.input1_multiplier); + x12 = vqrdmulhq_n_s32(x12, params.input1_multiplier); + const int32x4_t input1_shift_dup = vdupq_n_s32(params.input1_shift); + x11 = vshlq_s32(x11, input1_shift_dup); + x12 = vshlq_s32(x12, input1_shift_dup); - const float32x4_t clamped = - vmaxq_f32(output_activation_min_vector, vminq_f32(output_activation_max_vector, output)); - vst1q_f32(output_data + i, clamped); + for (; i <= size - 8; i += 8) + { + const int8x8_t input2_val_original = vld1_s8(input2_data + i); + const int16x8_t input2_val_s16 = vmovl_s8(input2_val_original); + const int16x8_t input2_val = vaddq_s16(input2_val_s16, vdupq_n_s16(params.input2_offset)); + const int16x4_t input2_val_high = vget_high_s16(input2_val); + const int16x4_t input2_val_low = vget_low_s16(input2_val); + int32x4_t x21 = vmovl_s16(input2_val_low); + int32x4_t x22 = vmovl_s16(input2_val_high); + x21 = vshlq_s32(x21, left_shift_dup); + x22 = vshlq_s32(x22, left_shift_dup); + x21 = vqrdmulhq_n_s32(x21, params.input2_multiplier); + x22 = vqrdmulhq_n_s32(x22, params.input2_multiplier); + const int32x4_t input2_shift_dup = vdupq_n_s32(params.input2_shift); + x21 = vshlq_s32(x21, input2_shift_dup); + x22 = vshlq_s32(x22, input2_shift_dup); + int32x4_t s1 = vaddq_s32(x11, x21); + int32x4_t s2 = vaddq_s32(x12, x22); + s1 = vqrdmulhq_n_s32(s1, params.output_multiplier); + s2 = vqrdmulhq_n_s32(s2, params.output_multiplier); + s1 = RoundingDivideByPOT(s1, -params.output_shift); + s2 = RoundingDivideByPOT(s2, -params.output_shift); + const int16x4_t s1_narrowed = vmovn_s32(s1); + const int16x4_t s2_narrowed = vmovn_s32(s2); + const int16x8_t s = + vaddq_s16(vcombine_s16(s1_narrowed, s2_narrowed), vdupq_n_s16(params.output_offset)); + const int8x8_t clamped = + vmax_s8(output_activation_min_vector, vmin_s8(output_activation_max_vector, vqmovn_s16(s))); + vst1_s8(output_data + i, clamped); } #endif // NEON - for (; i < size; ++i) + + if (i < size) { - auto x = broadcast_value + input2_data[i]; - output_data[i] = ActivationFunctionWithMinMax<float>(x, params.float_activation_min, - params.float_activation_max); + // Process broadcast scalar. 
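// The scalar tail below mirrors the NEON block above: each operand is offset (q + input_offset),
// shifted up by params.left_shift, rescaled by its per-input multiplier/shift via
// MultiplyByQuantizedMultiplierSmallerThanOneExp (a rounding-doubling high multiply followed by a
// rounding right shift), the two results are summed, rescaled once more by the output
// multiplier/shift, offset by params.output_offset, and finally clamped to
// [quantized_activation_min, quantized_activation_max].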
+ const int32_t input1_val = params.input1_offset + input1_data; + const int32_t shifted_input1_val = input1_val * (1 << params.left_shift); + const int32_t scaled_input1_val = MultiplyByQuantizedMultiplierSmallerThanOneExp( + shifted_input1_val, params.input1_multiplier, params.input1_shift); + + for (; i < size; ++i) + { + const int32_t input2_val = params.input2_offset + input2_data[i]; + const int32_t shifted_input2_val = input2_val * (1 << params.left_shift); + const int32_t scaled_input2_val = MultiplyByQuantizedMultiplierSmallerThanOneExp( + shifted_input2_val, params.input2_multiplier, params.input2_shift); + const int32_t raw_sum = scaled_input1_val + scaled_input2_val; + const int32_t raw_output = MultiplyByQuantizedMultiplierSmallerThanOneExp( + raw_sum, params.output_multiplier, params.output_shift) + + params.output_offset; + const int32_t clamped_output = std::min( + params.quantized_activation_max, std::max(params.quantized_activation_min, raw_output)); + output_data[i] = static_cast<int8_t>(clamped_output); + } } } -inline void BroadcastAddDispatchQuant8(const BinaryArithmeticOpParam ¶ms, - const Shape &input1_shape, const uint8_t *input1_data, - const Shape &input2_shape, const uint8_t *input2_data, - const Shape &output_shape, uint8_t *output_data) +template <typename T> +inline typename std::enable_if_t<is_quant8<T>::value> +BroadcastAddDispatch(const BinaryArithmeticOpParam ¶ms, const Shape &input1_shape, + const T *input1_data, const Shape &input2_shape, const T *input2_data, + const Shape &output_shape, T *output_data) { if (params.broadcast_category == BroadcastableOpCategory::kGenericBroadcast) { - const std::function<uint8_t(const BinaryArithmeticOpParam &, const uint8_t &, const uint8_t &)> - fn = [](const BinaryArithmeticOpParam ¶ms, const uint8_t &a, - const uint8_t &b) -> uint8_t { - return static_cast<uint8_t>(quant8_sum(params, a, b)); - }; - reference::BroadcastBinaryArithmeticOpSlowQuant8(params, input1_shape, input1_data, - input2_shape, input2_data, output_shape, - output_data, fn); - } - else - { - BinaryBroadcastFiveFold( - params, input1_shape, input1_data, input2_shape, input2_data, output_shape, output_data, - static_cast<void (*)(int, const BinaryArithmeticOpParam &, const uint8_t *, const uint8_t *, - uint8_t *)>(AddElementwiseQuant8), - static_cast<void (*)(int, const BinaryArithmeticOpParam &, uint8_t, const uint8_t *, - uint8_t *)>(AddScalarBroadcastQuant8)); + const std::function<T(const BinaryArithmeticOpParam &, const T &, const T &)> fn = + [](const BinaryArithmeticOpParam ¶ms, const T &a, const T &b) { + return static_cast<T>(quant8_sum(params, a, b)); + }; + reference::BroadcastBinaryArithmeticOpSlow(params, input1_shape, input1_data, input2_shape, + input2_data, output_shape, output_data, fn); + return; } + + BinaryBroadcastFiveFold( + params, input1_shape, input1_data, input2_shape, input2_data, output_shape, output_data, + static_cast<void (*)(int, const BinaryArithmeticOpParam &, const T *, const T *, T *)>( + AddElementwise), + static_cast<void (*)(int, const BinaryArithmeticOpParam &, T, const T *, T *)>( + AddScalarBroadcast)); } inline void BroadcastAddDispatch(const BinaryArithmeticOpParam ¶ms, const Shape &input1_shape, @@ -366,18 +840,18 @@ inline void BroadcastAddDispatch(const BinaryArithmeticOpParam ¶ms, const Sh if (params.broadcast_category == BroadcastableOpCategory::kGenericBroadcast) { const std::function<float(const float &, const float &)> fn = - [](const float &a, const float &b) -> float { return a + b; }; + [](const 
float &a, const float &b) -> float { return a + b; }; reference::BroadcastBinaryArithmeticOpSlow(params, input1_shape, input1_data, input2_shape, input2_data, output_shape, output_data, fn); } else { + auto implFuncs = getBinaryOpWithActivationImplFloat<BinaryOpFuncAddFloat>(params); + BinaryBroadcastFiveFold( - params, input1_shape, input1_data, input2_shape, input2_data, output_shape, output_data, - static_cast<void (*)(int, const BinaryArithmeticOpParam &, const float *, const float *, - float *)>(AddElementwise), - static_cast<void (*)(int, const BinaryArithmeticOpParam &, float, const float *, float *)>( - AddScalarBroadcast)); + params, params.broadcast_category == BroadcastableOpCategory::kSecondInputBroadcastsFast, + input1_shape, input1_data, input2_shape, input2_data, output_shape, output_data, + implFuncs.first, implFuncs.second); } } @@ -385,75 +859,57 @@ inline void Sub(const BinaryArithmeticOpParam ¶ms, const Shape &input1_shape const float *input1_data, const Shape &input2_shape, const float *input2_data, const Shape &output_shape, float *output_data) { - int i = 0; - const int size = MatchingElementsSize(input1_shape, input2_shape, output_shape); -#ifdef USE_NEON - const auto activation_min = vdupq_n_f32(params.float_activation_min); - const auto activation_max = vdupq_n_f32(params.float_activation_max); - for (; i <= size - 16; i += 16) + const int flat_size = MatchingElementsSize(input1_shape, input2_shape, output_shape); + auto implFuncs = getBinaryOpWithActivationImplFloat<BinaryOpFuncSubFloat>(params); + (*implFuncs.first)(flat_size, params, input1_data, input2_data, output_data); +} + +inline void BroadcastSubDispatch(const BinaryArithmeticOpParam ¶ms, const Shape &input1_shape, + const float *input1_data, const Shape &input2_shape, + const float *input2_data, const Shape &output_shape, + float *output_data) +{ + if (params.broadcast_category == BroadcastableOpCategory::kFirstInputBroadcastsFast) { - auto a10 = vld1q_f32(input1_data + i); - auto a11 = vld1q_f32(input1_data + i + 4); - auto a12 = vld1q_f32(input1_data + i + 8); - auto a13 = vld1q_f32(input1_data + i + 12); - auto a20 = vld1q_f32(input2_data + i); - auto a21 = vld1q_f32(input2_data + i + 4); - auto a22 = vld1q_f32(input2_data + i + 8); - auto a23 = vld1q_f32(input2_data + i + 12); - auto x0 = vsubq_f32(a10, a20); - auto x1 = vsubq_f32(a11, a21); - auto x2 = vsubq_f32(a12, a22); - auto x3 = vsubq_f32(a13, a23); - x0 = vmaxq_f32(activation_min, x0); - x1 = vmaxq_f32(activation_min, x1); - x2 = vmaxq_f32(activation_min, x2); - x3 = vmaxq_f32(activation_min, x3); - x0 = vminq_f32(activation_max, x0); - x1 = vminq_f32(activation_max, x1); - x2 = vminq_f32(activation_max, x2); - x3 = vminq_f32(activation_max, x3); - vst1q_f32(output_data + i, x0); - vst1q_f32(output_data + i + 4, x1); - vst1q_f32(output_data + i + 8, x2); - vst1q_f32(output_data + i + 12, x3); + auto implFuncs = getBinaryOpWithActivationImplFloat<BinaryOpFuncSubFloat>(params); + BinaryBroadcastFiveFold(params, false, input1_shape, input1_data, input2_shape, input2_data, + output_shape, output_data, implFuncs.first, implFuncs.second); } - for (; i <= size - 4; i += 4) + else if (params.broadcast_category == BroadcastableOpCategory::kSecondInputBroadcastsFast) { - auto a1 = vld1q_f32(input1_data + i); - auto a2 = vld1q_f32(input2_data + i); - auto x = vsubq_f32(a1, a2); - x = vmaxq_f32(activation_min, x); - x = vminq_f32(activation_max, x); - vst1q_f32(output_data + i, x); + auto implFuncs = + 
getBinaryOpWithActivationImplFloat<BinaryOpFuncSwapArgs<BinaryOpFuncSubFloat>>(params); + BinaryBroadcastFiveFold(params, true, input1_shape, input1_data, input2_shape, input2_data, + output_shape, output_data, implFuncs.first, implFuncs.second); } -#endif // NEON - - for (; i < size; i++) + else { - auto x = input1_data[i] - input2_data[i]; - output_data[i] = - ActivationFunctionWithMinMax(x, params.float_activation_min, params.float_activation_max); + const std::function<float(const float &, const float &)> fn = + [](const float &a, const float &b) -> float { return a - b; }; + reference::BroadcastBinaryArithmeticOpSlow(params, input1_shape, input1_data, input2_shape, + input2_data, output_shape, output_data, fn); } } -inline int32_t quant8_mul(const BinaryArithmeticOpParam ¶ms, const uint8_t input1_data, - const uint8_t input2_data) +template <typename T> +inline typename std::enable_if_t<is_quant8<T>::value, int32_t> +quant8_mul(const BinaryArithmeticOpParam ¶ms, const T input1_data, const T input2_data) { const int32_t input1_val = params.input1_offset + input1_data; const int32_t input2_val = params.input2_offset + input2_data; const int32_t unclamped_result = - params.output_offset + MultiplyByQuantizedMultiplier(input1_val * input2_val, - params.output_multiplier, - params.output_shift); + params.output_offset + MultiplyByQuantizedMultiplier(input1_val * input2_val, + params.output_multiplier, + params.output_shift); const int32_t clamped_output = std::min( - params.quantized_activation_max, std::max(params.quantized_activation_min, unclamped_result)); + params.quantized_activation_max, std::max(params.quantized_activation_min, unclamped_result)); return clamped_output; } -inline void MulElementwiseQuant8(int size, const BinaryArithmeticOpParam ¶ms, - const uint8_t *input1_data, const uint8_t *input2_data, - uint8_t *output_data) +inline void MulElementwise(int size, const BinaryArithmeticOpParam ¶ms, + const uint8_t *input1_data, const uint8_t *input2_data, + uint8_t *output_data) { int i = 0; @@ -495,8 +951,8 @@ inline void MulElementwiseQuant8(int size, const BinaryArithmeticOpParam ¶ms const auto p1_narrowed = vqmovn_s32(p1); const auto p2_narrowed = vqmovn_s32(p2); const auto p = vaddq_s16(vcombine_s16(p1_narrowed, p2_narrowed), output_offset_vector); - const auto clamped = vmax_u8(output_activation_min_vector, - vmin_u8(output_activation_max_vector, vqmovun_s16(p))); + const auto clamped = + vmax_u8(output_activation_min_vector, vmin_u8(output_activation_max_vector, vqmovun_s16(p))); vst1_u8(output_data + i, clamped); } #endif // NEON @@ -506,76 +962,111 @@ inline void MulElementwiseQuant8(int size, const BinaryArithmeticOpParam ¶ms const int32_t input1_val = params.input1_offset + input1_data[i]; const int32_t input2_val = params.input2_offset + input2_data[i]; const int32_t unclamped_result = - params.output_offset + MultiplyByQuantizedMultiplier(input1_val * input2_val, - params.output_multiplier, - params.output_shift); - const int32_t clamped_output = - std::min(params.quantized_activation_max, - std::max(params.quantized_activation_min, unclamped_result)); + params.output_offset + MultiplyByQuantizedMultiplier(input1_val * input2_val, + params.output_multiplier, + params.output_shift); + const int32_t clamped_output = std::min( + params.quantized_activation_max, std::max(params.quantized_activation_min, unclamped_result)); output_data[i] = static_cast<uint8_t>(clamped_output); } } inline void MulElementwise(int size, const BinaryArithmeticOpParam ¶ms, - const float 
*input1_data, const float *input2_data, float *output_data) + const int8_t *input1_data, const int8_t *input2_data, + int8_t *output_data) { int i = 0; - #ifdef USE_NEON - const auto activation_min = vdupq_n_f32(params.float_activation_min); - const auto activation_max = vdupq_n_f32(params.float_activation_max); + const int16x8_t input1_offset_vector = vdupq_n_s16(params.input1_offset); + const int16x8_t input2_offset_vector = vdupq_n_s16(params.input2_offset); + const int16x8_t output_offset_vector = vdupq_n_s16(params.output_offset); + const auto output_activation_min_vector = vdupq_n_s8(params.quantized_activation_min); + const auto output_activation_max_vector = vdupq_n_s8(params.quantized_activation_max); + const int left_shift = std::max(0, params.output_shift); + const int right_shift = std::max(0, -params.output_shift); + const int32x4_t left_shift_vec = vdupq_n_s32(left_shift); for (; i <= size - 16; i += 16) { - auto a10 = vld1q_f32(input1_data + i); - auto a11 = vld1q_f32(input1_data + i + 4); - auto a12 = vld1q_f32(input1_data + i + 8); - auto a13 = vld1q_f32(input1_data + i + 12); - auto a20 = vld1q_f32(input2_data + i); - auto a21 = vld1q_f32(input2_data + i + 4); - auto a22 = vld1q_f32(input2_data + i + 8); - auto a23 = vld1q_f32(input2_data + i + 12); - auto x0 = vmulq_f32(a10, a20); - auto x1 = vmulq_f32(a11, a21); - auto x2 = vmulq_f32(a12, a22); - auto x3 = vmulq_f32(a13, a23); - x0 = vmaxq_f32(activation_min, x0); - x1 = vmaxq_f32(activation_min, x1); - x2 = vmaxq_f32(activation_min, x2); - x3 = vmaxq_f32(activation_min, x3); - x0 = vminq_f32(activation_max, x0); - x1 = vminq_f32(activation_max, x1); - x2 = vminq_f32(activation_max, x2); - x3 = vminq_f32(activation_max, x3); - vst1q_f32(output_data + i, x0); - vst1q_f32(output_data + i + 4, x1); - vst1q_f32(output_data + i + 8, x2); - vst1q_f32(output_data + i + 12, x3); - } - for (; i <= size - 4; i += 4) - { - auto a1 = vld1q_f32(input1_data + i); - auto a2 = vld1q_f32(input2_data + i); - auto x = vmulq_f32(a1, a2); - x = vmaxq_f32(activation_min, x); - x = vminq_f32(activation_max, x); - vst1q_f32(output_data + i, x); + // We load / store 16 at a time, multiplying as four sets of 4 int32s. 
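// The vector body below widens each int8x16 load to four int32x4 lanes (vmovl_s8 / vmovl_s16
// after adding the input offsets), forms the products with vmull_s16, requantizes with a left
// shift + vqrdmulhq_n_s32(output_multiplier) + RoundingDivideByPOT (the left/right parts split
// out of params.output_shift), then narrows back through int16, adds output_offset, and
// saturates to int8 before the activation clamp.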
+ const int8x16_t input1_val_original = vld1q_s8(input1_data + i); + const int8x16_t input2_val_original = vld1q_s8(input2_data + i); + + const int16x8_t input1_val_s16_high = vmovl_s8(vget_high_s8(input1_val_original)); + const int16x8_t input1_val_s16_low = vmovl_s8(vget_low_s8(input1_val_original)); + + const int16x8_t input2_val_s16_high = vmovl_s8(vget_high_s8(input2_val_original)); + const int16x8_t input2_val_s16_low = vmovl_s8(vget_low_s8(input2_val_original)); + const int16x8_t input1_val_high = vaddq_s16(input1_val_s16_high, input1_offset_vector); + const int16x8_t input2_val_high = vaddq_s16(input2_val_s16_high, input2_offset_vector); + const int16x8_t input1_val_low = vaddq_s16(input1_val_s16_low, input1_offset_vector); + const int16x8_t input2_val_low = vaddq_s16(input2_val_s16_low, input2_offset_vector); + const int16x4_t input1_val_high_high = vget_high_s16(input1_val_high); + const int16x4_t input1_val_high_low = vget_low_s16(input1_val_high); + const int16x4_t input1_val_low_high = vget_high_s16(input1_val_low); + const int16x4_t input1_val_low_low = vget_low_s16(input1_val_low); + const int16x4_t input2_val_high_high = vget_high_s16(input2_val_high); + const int16x4_t input2_val_high_low = vget_low_s16(input2_val_high); + const int16x4_t input2_val_low_high = vget_high_s16(input2_val_low); + const int16x4_t input2_val_low_low = vget_low_s16(input2_val_low); + + auto p1 = vmull_s16(input2_val_high_high, input1_val_high_high); + auto p2 = vmull_s16(input2_val_high_low, input1_val_high_low); + auto p3 = vmull_s16(input2_val_low_high, input1_val_low_high); + auto p4 = vmull_s16(input2_val_low_low, input1_val_low_low); + + p1 = vshlq_s32(p1, left_shift_vec); + p2 = vshlq_s32(p2, left_shift_vec); + p3 = vshlq_s32(p3, left_shift_vec); + p4 = vshlq_s32(p4, left_shift_vec); + + p1 = vqrdmulhq_n_s32(p1, params.output_multiplier); + p2 = vqrdmulhq_n_s32(p2, params.output_multiplier); + p3 = vqrdmulhq_n_s32(p3, params.output_multiplier); + p4 = vqrdmulhq_n_s32(p4, params.output_multiplier); + using gemmlowp::RoundingDivideByPOT; + p1 = RoundingDivideByPOT(p1, right_shift); + p2 = RoundingDivideByPOT(p2, right_shift); + p3 = RoundingDivideByPOT(p3, right_shift); + p4 = RoundingDivideByPOT(p4, right_shift); + + const auto p1_narrowed = vqmovn_s32(p1); + const auto p2_narrowed = vqmovn_s32(p2); + const auto p3_narrowed = vqmovn_s32(p3); + const auto p4_narrowed = vqmovn_s32(p4); + + const int16x8_t p_part1 = + vaddq_s16(vcombine_s16(p2_narrowed, p1_narrowed), output_offset_vector); + const int16x8_t p_part2 = + vaddq_s16(vcombine_s16(p4_narrowed, p3_narrowed), output_offset_vector); + const int8x16_t p = vcombine_s8(vqmovn_s16(p_part2), vqmovn_s16(p_part1)); + + const auto clamped = + vmaxq_s8(output_activation_min_vector, vminq_s8(output_activation_max_vector, p)); + vst1q_s8(output_data + i, clamped); } #endif // NEON - for (; i < size; i++) + for (; i < size; ++i) { - auto x = input1_data[i] * input2_data[i]; - output_data[i] = - ActivationFunctionWithMinMax(x, params.float_activation_min, params.float_activation_max); + const int32_t input1_val = params.input1_offset + input1_data[i]; + const int32_t input2_val = params.input2_offset + input2_data[i]; + const int32_t unclamped_result = + params.output_offset + MultiplyByQuantizedMultiplier(input1_val * input2_val, + params.output_multiplier, + params.output_shift); + const int32_t clamped_output = std::min( + params.quantized_activation_max, std::max(params.quantized_activation_min, unclamped_result)); + output_data[i] = 
static_cast<int8_t>(clamped_output); } } -inline void MulQuant8(const BinaryArithmeticOpParam ¶ms, const Shape &input1_shape, - const uint8_t *input1_data, const Shape &input2_shape, - const uint8_t *input2_data, const Shape &output_shape, uint8_t *output_data) +template <typename T> +inline typename std::enable_if_t<is_quant8<T>::value> +Mul(const BinaryArithmeticOpParam ¶ms, const Shape &input1_shape, const T *input1_data, + const Shape &input2_shape, const T *input2_data, const Shape &output_shape, T *output_data) { const int flat_size = MatchingElementsSize(input1_shape, input2_shape, output_shape); - MulElementwiseQuant8(flat_size, params, input1_data, input2_data, output_data); + MulElementwise(flat_size, params, input1_data, input2_data, output_data); } inline void Mul(const BinaryArithmeticOpParam ¶ms, const Shape &input1_shape, @@ -583,12 +1074,13 @@ inline void Mul(const BinaryArithmeticOpParam ¶ms, const Shape &input1_shape const Shape &output_shape, float *output_data) { const int flat_size = MatchingElementsSize(input1_shape, input2_shape, output_shape); - MulElementwise(flat_size, params, input1_data, input2_data, output_data); + auto implFuncs = getBinaryOpWithActivationImplFloat<BinaryOpFuncMulFloat>(params); + (*implFuncs.first)(flat_size, params, input1_data, input2_data, output_data); } -inline void MulSimpleBroadcastQuant8(int size, const BinaryArithmeticOpParam ¶ms, - const uint8_t broadcast_value, const uint8_t *input2_data, - uint8_t *output_data) +inline void MulSimpleBroadcast(int size, const BinaryArithmeticOpParam ¶ms, + const uint8_t broadcast_value, const uint8_t *input2_data, + uint8_t *output_data) { int i = 0; int32_t clamped_output; @@ -600,60 +1092,108 @@ inline void MulSimpleBroadcastQuant8(int size, const BinaryArithmeticOpParam &pa } // Broadcast mul that can often be used for inner loop of broadcast Mul. -// This function will handle scalar_value (LHS) * vector_values (RHS). -// Since it's a float function, input params does not matter here. inline void MulSimpleBroadcast(int size, const BinaryArithmeticOpParam ¶ms, - const float broadcast_value, const float *input2_data, - float *output_data) + const int8_t broadcast_value, const int8_t *input2_data, + int8_t *output_data) { + const int16_t input1_val = params.input1_offset + broadcast_value; + int i = 0; #ifdef USE_NEON - const float32x4_t output_activation_min_vector = vdupq_n_f32(params.float_activation_min); - const float32x4_t output_activation_max_vector = vdupq_n_f32(params.float_activation_max); - const float32x4_t broadcast_value_dup = vdupq_n_f32(broadcast_value); - for (; i <= size - 4; i += 4) + const auto input2_offset_vector = vdupq_n_s16(params.input2_offset); + const auto output_offset_vector = vdupq_n_s16(params.output_offset); + const auto output_activation_min_vector = vdupq_n_s8(params.quantized_activation_min); + const auto output_activation_max_vector = vdupq_n_s8(params.quantized_activation_max); + const int left_shift = std::max(0, params.output_shift); + const int right_shift = std::max(0, -params.output_shift); + const int32x4_t left_shift_vec = vdupq_n_s32(left_shift); + for (; i <= size - 16; i += 16) { - const float32x4_t input2_val_original = vld1q_f32(input2_data + i); + // We load / store 16 at a time, multiplying as four sets of 4 int32s. 
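// Same requantization flow as MulElementwise above; the broadcast operand is pre-offset once
// into input1_val and folded into every lane with vmull_n_s16 instead of being reloaded per
// iteration.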
+ const auto input2_val_original = vld1q_s8(input2_data + i); + const auto input2_val_s16_high = vmovl_s8(vget_high_s8(input2_val_original)); + const auto input2_val_s16_low = vmovl_s8(vget_low_s8(input2_val_original)); + + const auto input2_val_high = vaddq_s16(input2_val_s16_high, input2_offset_vector); + const auto input2_val_low = vaddq_s16(input2_val_s16_low, input2_offset_vector); + + const auto input2_val_low_low = vget_low_s16(input2_val_low); + const auto input2_val_low_high = vget_high_s16(input2_val_low); + const auto input2_val_high_low = vget_low_s16(input2_val_high); + const auto input2_val_high_high = vget_high_s16(input2_val_high); + + auto p1 = vmull_n_s16(input2_val_high_high, input1_val); + auto p2 = vmull_n_s16(input2_val_high_low, input1_val); + auto p3 = vmull_n_s16(input2_val_low_high, input1_val); + auto p4 = vmull_n_s16(input2_val_low_low, input1_val); + + p1 = vshlq_s32(p1, left_shift_vec); + p2 = vshlq_s32(p2, left_shift_vec); + p3 = vshlq_s32(p3, left_shift_vec); + p4 = vshlq_s32(p4, left_shift_vec); + + p1 = vqrdmulhq_n_s32(p1, params.output_multiplier); + p2 = vqrdmulhq_n_s32(p2, params.output_multiplier); + p3 = vqrdmulhq_n_s32(p3, params.output_multiplier); + p4 = vqrdmulhq_n_s32(p4, params.output_multiplier); + using gemmlowp::RoundingDivideByPOT; + p1 = RoundingDivideByPOT(p1, right_shift); + p2 = RoundingDivideByPOT(p2, right_shift); + p3 = RoundingDivideByPOT(p3, right_shift); + p4 = RoundingDivideByPOT(p4, right_shift); - const float32x4_t output = vmulq_f32(input2_val_original, broadcast_value_dup); + const auto p1_narrowed = vqmovn_s32(p1); + const auto p2_narrowed = vqmovn_s32(p2); + const auto p3_narrowed = vqmovn_s32(p3); + const auto p4_narrowed = vqmovn_s32(p4); - const float32x4_t clamped = - vmaxq_f32(output_activation_min_vector, vminq_f32(output_activation_max_vector, output)); - vst1q_f32(output_data + i, clamped); + const int16x8_t p_part1 = + vaddq_s16(vcombine_s16(p2_narrowed, p1_narrowed), output_offset_vector); + const int16x8_t p_part2 = + vaddq_s16(vcombine_s16(p4_narrowed, p3_narrowed), output_offset_vector); + const int8x16_t p = vcombine_s8(vqmovn_s16(p_part2), vqmovn_s16(p_part1)); + + const auto clamped = + vmaxq_s8(output_activation_min_vector, vminq_s8(output_activation_max_vector, p)); + vst1q_s8(output_data + i, clamped); } #endif // NEON for (; i < size; ++i) { - float x = broadcast_value * input2_data[i]; - output_data[i] = - ActivationFunctionWithMinMax(x, params.float_activation_min, params.float_activation_max); + const int32_t input2_val = params.input2_offset + input2_data[i]; + const int32_t unclamped_result = + params.output_offset + MultiplyByQuantizedMultiplier(input1_val * input2_val, + params.output_multiplier, + params.output_shift); + const int32_t clamped_output = std::min( + params.quantized_activation_max, std::max(params.quantized_activation_min, unclamped_result)); + output_data[i] = static_cast<int8_t>(clamped_output); } } -inline void BroadcastMulDispatchQuant8(const BinaryArithmeticOpParam ¶ms, - const Shape &input1_shape, const uint8_t *input1_data, - const Shape &input2_shape, const uint8_t *input2_data, - const Shape &output_shape, uint8_t *output_data) +template <typename T> +inline typename std::enable_if_t<is_quant8<T>::value> +BroadcastMulDispatch(const BinaryArithmeticOpParam ¶ms, const Shape &input1_shape, + const T *input1_data, const Shape &input2_shape, const T *input2_data, + const Shape &output_shape, T *output_data) { if (params.broadcast_category == 
BroadcastableOpCategory::kGenericBroadcast) { - const std::function<uint8_t(const BinaryArithmeticOpParam &, const uint8_t &, const uint8_t &)> - fn = [](const BinaryArithmeticOpParam ¶ms, const uint8_t &a, - const uint8_t &b) -> uint8_t { - return static_cast<uint8_t>(quant8_mul(params, a, b)); - }; - reference::BroadcastBinaryArithmeticOpSlowQuant8(params, input1_shape, input1_data, - input2_shape, input2_data, output_shape, - output_data, fn); + const std::function<T(const BinaryArithmeticOpParam &, const T &, const T &)> fn = + [](const BinaryArithmeticOpParam ¶ms, const T &a, const T &b) { + return static_cast<T>(quant8_mul(params, a, b)); + }; + reference::BroadcastBinaryArithmeticOpSlow(params, input1_shape, input1_data, input2_shape, + input2_data, output_shape, output_data, fn); return; } BinaryBroadcastFiveFold( - params, input1_shape, input1_data, input2_shape, input2_data, output_shape, output_data, - static_cast<void (*)(int, const BinaryArithmeticOpParam &, const uint8_t *, const uint8_t *, - uint8_t *)>(MulElementwiseQuant8), - static_cast<void (*)(int, const BinaryArithmeticOpParam &, uint8_t, const uint8_t *, - uint8_t *)>(MulSimpleBroadcastQuant8)); + params, input1_shape, input1_data, input2_shape, input2_data, output_shape, output_data, + static_cast<void (*)(int, const BinaryArithmeticOpParam &, const T *, const T *, T *)>( + MulElementwise), + static_cast<void (*)(int, const BinaryArithmeticOpParam &, T, const T *, T *)>( + MulSimpleBroadcast)); } inline void BroadcastMulDispatch(const BinaryArithmeticOpParam ¶ms, const Shape &input1_shape, @@ -665,17 +1205,59 @@ inline void BroadcastMulDispatch(const BinaryArithmeticOpParam ¶ms, const Sh { // TODO: Use GetBinaryArithmeticFn const std::function<float(const float &, const float &)> fn = - [](const float &a, const float &b) -> float { return a * b; }; + [](const float &a, const float &b) -> float { return a * b; }; reference::BroadcastBinaryArithmeticOpSlow(params, input1_shape, input1_data, input2_shape, input2_data, output_shape, output_data, fn); return; } - BinaryBroadcastFiveFold( - params, input1_shape, input1_data, input2_shape, input2_data, output_shape, output_data, - static_cast<void (*)(int, const BinaryArithmeticOpParam &, const float *, const float *, - float *)>(MulElementwise), - static_cast<void (*)(int, const BinaryArithmeticOpParam &, float, const float *, float *)>( - MulSimpleBroadcast)); + auto implFuncs = getBinaryOpWithActivationImplFloat<BinaryOpFuncMulFloat>(params); + BinaryBroadcastFiveFold(params, input1_shape, input1_data, input2_shape, input2_data, + output_shape, output_data, implFuncs.first, implFuncs.second); +} + +inline void Div(const BinaryArithmeticOpParam ¶ms, const Shape &input1_shape, + const float *input1_data, const Shape &input2_shape, const float *input2_data, + const Shape &output_shape, float *output_data) +{ +#ifdef __aarch64__ + const int flat_size = MatchingElementsSize(input1_shape, input2_shape, output_shape); + auto implFuncs = getBinaryOpWithActivationImplFloat<BinaryOpFuncDivFloat>(params); + (*implFuncs.first)(flat_size, params, input1_data, input2_data, output_data); +#else + const std::function<float(const float &, const float &)> fn = + [](const float &a, const float &b) -> float { return a / b; }; + reference::BinaryArithmeticOp(params, input1_shape, input1_data, input2_shape, input2_data, + output_shape, output_data, fn); +#endif // __aarch64__ +} + +inline void BroadcastDivDispatch(const BinaryArithmeticOpParam ¶ms, const Shape &input1_shape, + const float 
*input1_data, const Shape &input2_shape, + const float *input2_data, const Shape &output_shape, + float *output_data) +{ +#ifdef __aarch64__ + if (params.broadcast_category == BroadcastableOpCategory::kFirstInputBroadcastsFast) + { + auto implFuncs = getBinaryOpWithActivationImplFloat<BinaryOpFuncDivFloat>(params); + BinaryBroadcastFiveFold(params, false, input1_shape, input1_data, input2_shape, input2_data, + output_shape, output_data, implFuncs.first, implFuncs.second); + } + else if (params.broadcast_category == BroadcastableOpCategory::kSecondInputBroadcastsFast) + { + auto implFuncs = + getBinaryOpWithActivationImplFloat<BinaryOpFuncSwapArgs<BinaryOpFuncDivFloat>>(params); + BinaryBroadcastFiveFold(params, true, input1_shape, input1_data, input2_shape, input2_data, + output_shape, output_data, implFuncs.first, implFuncs.second); + } + else +#endif // __aarch64__ + { + const std::function<float(const float &, const float &)> fn = + [](const float &a, const float &b) -> float { return a / b; }; + reference::BroadcastBinaryArithmeticOpSlow(params, input1_shape, input1_data, input2_shape, + input2_data, output_shape, output_data, fn); + } } } // namespace optimized diff --git a/compute/cker/include/cker/operation/optimized/Conv.h b/compute/cker/include/cker/operation/optimized/Conv.h index 0f620146c..6e0e129c6 100644 --- a/compute/cker/include/cker/operation/optimized/Conv.h +++ b/compute/cker/include/cker/operation/optimized/Conv.h @@ -42,13 +42,15 @@ namespace cker namespace optimized { +std::mutex _gemmlowp_mutex; + struct GemmlowpOutputPipeline { typedef gemmlowp::VectorMap<const int32_t, gemmlowp::VectorShape::Col> ColVectorMap; typedef std::tuple<gemmlowp::OutputStageBiasAddition<ColVectorMap>, gemmlowp::OutputStageScaleInt32ByFixedPointAndExponent, gemmlowp::OutputStageClamp, gemmlowp::OutputStageSaturatingCastToUint8> - Pipeline; + Pipeline; static Pipeline MakeExp(const int32_t *bias_data, int output_rows, int32_t output_offset, int32_t output_multiplier, int output_left_shift, int32_t output_activation_min, int32_t output_activation_max) @@ -106,7 +108,7 @@ inline void Conv(const ConvParams ¶ms, const Shape &input_shape, const uint8 const int filter_height = filter_shape.Dims(1); const bool need_dilated_im2col = dilation_width_factor != 1 || dilation_height_factor != 1; const bool need_im2col = - stride_width != 1 || stride_height != 1 || filter_width != 1 || filter_height != 1; + stride_width != 1 || stride_height != 1 || filter_width != 1 || filter_height != 1; if (need_dilated_im2col) { assert(im2col_data); @@ -141,7 +143,7 @@ inline void Conv(const ConvParams ¶ms, const Shape &input_shape, const uint8 // the other calls commented out. This is a partial rollback of cl/196819423. // const int gemm_input_cols = FlatSizeSkipDim(*gemm_input_shape, 3); const int gemm_input_cols = - gemm_input_shape->Dims(0) * gemm_input_shape->Dims(1) * gemm_input_shape->Dims(2); + gemm_input_shape->Dims(0) * gemm_input_shape->Dims(1) * gemm_input_shape->Dims(2); const int filter_rows = filter_shape.Dims(0); // See b/79927784. 
// const int filter_cols = FlatSizeSkipDim(filter_shape, 0); @@ -156,17 +158,19 @@ inline void Conv(const ConvParams ¶ms, const Shape &input_shape, const uint8 assert(bias_shape.FlatSize() == output_rows); UNUSED_RELEASE(bias_shape); gemmlowp::MatrixMap<const uint8_t, gemmlowp::MapOrder::RowMajor> filter_matrix( - filter_data, filter_rows, filter_cols); + filter_data, filter_rows, filter_cols); gemmlowp::MatrixMap<const uint8_t, gemmlowp::MapOrder::ColMajor> input_matrix( - gemm_input_data, gemm_input_rows, gemm_input_cols); + gemm_input_data, gemm_input_rows, gemm_input_cols); gemmlowp::MatrixMap<uint8_t, gemmlowp::MapOrder::ColMajor> output_matrix(output_data, output_rows, output_cols); const auto &output_pipeline = - GemmlowpOutputPipeline::MakeExp(bias_data, output_rows, output_offset, output_multiplier, - output_shift, output_activation_min, output_activation_max); + GemmlowpOutputPipeline::MakeExp(bias_data, output_rows, output_offset, output_multiplier, + output_shift, output_activation_min, output_activation_max); + + std::lock_guard<std::mutex> lock_guard(_gemmlowp_mutex); gemmlowp::GemmWithOutputPipeline<uint8_t, uint8_t, gemmlowp::L8R8WithLhsNonzeroBitDepthParams>( - gemm_context, filter_matrix, input_matrix, &output_matrix, filter_offset, input_offset, - output_pipeline); + gemm_context, filter_matrix, input_matrix, &output_matrix, filter_offset, input_offset, + output_pipeline); } } // namespace optimized @@ -202,10 +206,10 @@ public: T *output_data, int output_height, int output_width) { const bool is_1x1_kernel = - (filter_height == 1 && filter_width == 1 && stride_rows == 1 && stride_cols == 1); + (filter_height == 1 && filter_width == 1 && stride_rows == 1 && stride_cols == 1); const bool is_same_height_width = - (filter_height == input_height && filter_width == input_width && pad_width == 0 && - pad_height == 0); + (filter_height == input_height && filter_width == input_width && pad_width == 0 && + pad_height == 0); if (is_1x1_kernel || is_same_height_width) { // is_1x1_kernel: For 1x1 kernel, the 2D convolution is reduced to matrix multiplication. diff --git a/compute/cker/include/cker/operation/optimized/DepthwiseConvFloat.h b/compute/cker/include/cker/operation/optimized/DepthwiseConvFloat.h new file mode 100644 index 000000000..17b2fc7a2 --- /dev/null +++ b/compute/cker/include/cker/operation/optimized/DepthwiseConvFloat.h @@ -0,0 +1,1250 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2017 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef __NNFW_CKER_OPTIMIZED_DEPTHWISE_CONV_FLOAT_H__ +#define __NNFW_CKER_OPTIMIZED_DEPTHWISE_CONV_FLOAT_H__ + +#include "cker/Shape.h" +#include "cker/Types.h" +#include "cker/Utils.h" +#include "cker/neon/neon_check.h" + +namespace nnfw +{ +namespace cker +{ +namespace optimized +{ + +// Implementation of float DepthwiseConv + +template <bool kAllowStrided, int kFixedInputDepth, int kFixedDepthMultiplier> +struct FloatDepthwiseConvKernel +{ +}; + +#ifdef USE_NEON + +template <> struct FloatDepthwiseConvKernel<false, 8, 1> +{ + static void Run(int num_output_pixels, int input_depth, int depth_multiplier, + const float *input_ptr, int input_ptr_increment, const float *filter_ptr, + float *acc_buffer_ptr) + { + (void)input_depth; + (void)depth_multiplier; + (void)input_ptr_increment; + // Load the filters + float32x4_t filter[2]; + for (int i = 0; i < 2; i++) + { + filter[i] = vld1q_f32(filter_ptr + 4 * i); + } + int outp = 0; + // Handle 2 output pixels at a time. + for (; outp <= num_output_pixels - 2; outp += 2) + { + // Load the inputs + float32x4_t input[4]; + for (int i = 0; i < 4; i++) + { + input[i] = vld1q_f32(input_ptr + 4 * i); + } + input_ptr += 16; + // Load the accumulators from acc_buffer + float32x4_t acc[4]; + for (int i = 0; i < 4; i++) + { + acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i); + } + // Multiply-accumulate + acc[0] = vmlaq_f32(acc[0], input[0], filter[0]); + acc[1] = vmlaq_f32(acc[1], input[1], filter[1]); + acc[2] = vmlaq_f32(acc[2], input[2], filter[0]); + acc[3] = vmlaq_f32(acc[3], input[3], filter[1]); + // Store the accumulators back to acc_buffer + for (int i = 0; i < 4; i++) + { + vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 16; + } + // Handle one output pixel at a time. + for (; outp < num_output_pixels; outp++) + { + // Load the inputs + float32x4_t input[2]; + for (int i = 0; i < 2; i++) + { + input[i] = vld1q_f32(input_ptr + 4 * i); + } + input_ptr += 8; + // Load the accumulators from acc_buffer + float32x4_t acc[2]; + for (int i = 0; i < 2; i++) + { + acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i); + } + // Multiply-accumulate + for (int i = 0; i < 2; i++) + { + acc[i] = vmlaq_f32(acc[i], input[i], filter[i]); + } + // Store the accumulators back to acc_buffer + for (int i = 0; i < 2; i++) + { + vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 8; + } + } +}; + +template <> struct FloatDepthwiseConvKernel<false, 2, 1> +{ + static void Run(int num_output_pixels, int input_depth, int depth_multiplier, + const float *input_ptr, int input_ptr_increment, const float *filter_ptr, + float *acc_buffer_ptr) + { + (void)input_depth; + (void)depth_multiplier; + (void)input_ptr_increment; + + const float32x2_t filters = vld1_f32(filter_ptr); + const float32x4_t filters_dup2 = vcombine_f32(filters, filters); + int outp = 0; + // Handle 8 output pixels at a time. + for (; outp <= num_output_pixels - 8; outp += 8) + { + // Load the inputs + float32x4_t input[4]; + for (int i = 0; i < 4; i++) + { + input[i] = vld1q_f32(input_ptr + 4 * i); + } + input_ptr += 16; + // Load the accumulators from acc_buffer + float32x4_t acc[4]; + for (int i = 0; i < 4; i++) + { + acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i); + } + // Multiply-accumulate + for (int i = 0; i < 4; i++) + { + acc[i] = vmlaq_f32(acc[i], input[i], filters_dup2); + } + // Store the accumulators back to acc_buffer + for (int i = 0; i < 4; i++) + { + vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 16; + } + // Handle 4 output pixels at a time. 
+ for (; outp <= num_output_pixels - 4; outp += 4) + { + // Load the inputs + float32x4_t input[2]; + for (int i = 0; i < 2; i++) + { + input[i] = vld1q_f32(input_ptr + 4 * i); + } + input_ptr += 8; + // Load the accumulators from acc_buffer + float32x4_t acc[2]; + for (int i = 0; i < 2; i++) + { + acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i); + } + // Multiply-accumulate + for (int i = 0; i < 2; i++) + { + acc[i] = vmlaq_f32(acc[i], input[i], filters_dup2); + } + // Store the accumulators back to acc_buffer + for (int i = 0; i < 2; i++) + { + vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 8; + } + // Handle 2 output pixels at a time. + for (; outp <= num_output_pixels - 2; outp += 2) + { + // Load the inputs + const float32x4_t input = vld1q_f32(input_ptr); + input_ptr += 4; + // Load the accumulators from acc_buffer + float32x4_t acc = vld1q_f32(acc_buffer_ptr); + // Multiply-accumulate + acc = vmlaq_f32(acc, input, filters_dup2); + // Store the accumulators back to acc_buffer + vst1q_f32(acc_buffer_ptr, acc); + acc_buffer_ptr += 4; + } + // Handle 1 output pixel at a time + for (; outp < num_output_pixels; outp++) + { + // Load the inputs + const float32x2_t input = vld1_f32(input_ptr); + input_ptr += 2; + // Load the accumulators from acc_buffer + float32x2_t acc = vld1_f32(acc_buffer_ptr); + // Multiply-accumulate + acc = vmla_f32(acc, input, filters); + // Store the accumulators back to acc_buffer + vst1_f32(acc_buffer_ptr, acc); + acc_buffer_ptr += 2; + } + } +}; + +template <> struct FloatDepthwiseConvKernel<true, 0, 1> +{ + static void Run(int num_output_pixels, int input_depth, int depth_multiplier, + const float *input_ptr, int input_ptr_increment, const float *filter_ptr, + float *acc_buffer_ptr) + { + (void)depth_multiplier; + + // Handle one output pixel at a time. + for (int outp = 0; outp < num_output_pixels; outp++) + { + const float *local_filter_ptr = filter_ptr; + const float *local_input_ptr = input_ptr; + int ic = 0; + // Handle 16 input channels at a time. + for (; ic <= input_depth - 16; ic += 16) + { + // Load the filters + float32x4_t filter_0 = vld1q_f32(local_filter_ptr + 4 * 0); + float32x4_t filter_1 = vld1q_f32(local_filter_ptr + 4 * 1); + float32x4_t filter_2 = vld1q_f32(local_filter_ptr + 4 * 2); + float32x4_t filter_3 = vld1q_f32(local_filter_ptr + 4 * 3); + local_filter_ptr += 16; + // Load the inputs + float32x4_t input_0 = vld1q_f32(local_input_ptr + 4 * 0); + float32x4_t input_1 = vld1q_f32(local_input_ptr + 4 * 1); + float32x4_t input_2 = vld1q_f32(local_input_ptr + 4 * 2); + float32x4_t input_3 = vld1q_f32(local_input_ptr + 4 * 3); + local_input_ptr += 16; + // Load the accumulators from acc_buffer + float32x4_t acc_0 = vld1q_f32(acc_buffer_ptr + 4 * 0); + float32x4_t acc_1 = vld1q_f32(acc_buffer_ptr + 4 * 1); + float32x4_t acc_2 = vld1q_f32(acc_buffer_ptr + 4 * 2); + float32x4_t acc_3 = vld1q_f32(acc_buffer_ptr + 4 * 3); + // Multiply-accumulate + acc_0 = vmlaq_f32(acc_0, input_0, filter_0); + acc_1 = vmlaq_f32(acc_1, input_1, filter_1); + acc_2 = vmlaq_f32(acc_2, input_2, filter_2); + acc_3 = vmlaq_f32(acc_3, input_3, filter_3); + // Store the accumulators back to acc_buffer + vst1q_f32(acc_buffer_ptr + 4 * 0, acc_0); + vst1q_f32(acc_buffer_ptr + 4 * 1, acc_1); + vst1q_f32(acc_buffer_ptr + 4 * 2, acc_2); + vst1q_f32(acc_buffer_ptr + 4 * 3, acc_3); + acc_buffer_ptr += 16; + } + // Handle 4 input channels at a time. 
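// Each FloatDepthwiseConvKernel specialization accumulates, for every output pixel,
// input_depth * depth_multiplier products into acc_buffer_ptr and then advances input_ptr by
// input_ptr_increment; the loops below are progressively narrower NEON tails of that pattern.
// For this depth_multiplier == 1 case the per-channel body is simply
// acc_buffer_ptr[c] += filter[c] * input[c], as the scalar fallback at the end of this loop
// nest shows.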
+ for (; ic <= input_depth - 4; ic += 4) + { + // Load the filters + float32x4_t filter; + filter = vld1q_f32(local_filter_ptr); + local_filter_ptr += 4; + // Load the inputs + float32x4_t input; + input = vld1q_f32(local_input_ptr); + local_input_ptr += 4; + // Load the accumulators from acc_buffer + float32x4_t acc; + acc = vld1q_f32(acc_buffer_ptr); + // Multiply-accumulate + acc = vmlaq_f32(acc, input, filter); + // Store the accumulators back to acc_buffer + vst1q_f32(acc_buffer_ptr, acc); + acc_buffer_ptr += 4; + } + // Handle one input channel at a time. + for (; ic < input_depth; ic++) + { + const float input_val = *local_input_ptr++; + const float filter_val = *local_filter_ptr++; + *acc_buffer_ptr++ += filter_val * input_val; + } + input_ptr += input_ptr_increment; + } + } +}; + +template <> struct FloatDepthwiseConvKernel<true, 0, 8> +{ + static void Run(int num_output_pixels, int input_depth, int depth_multiplier, + const float *input_ptr, int input_ptr_increment, const float *filter_ptr, + float *acc_buffer_ptr) + { + (void)depth_multiplier; + + // Handle one output pixel at a time. + for (int outp = 0; outp < num_output_pixels; outp++) + { + const float *local_filter_ptr = filter_ptr; + const float *local_input_ptr = input_ptr; + int ic = 0; + // Handle 2 input channels at a time. + for (; ic <= input_depth - 2; ic += 2) + { + // Load the filters + float32x4_t filter[4]; + for (int i = 0; i < 4; i++) + { + filter[i] = vld1q_f32(local_filter_ptr + 4 * i); + } + local_filter_ptr += 16; + // Load the inputs + const float32x2_t input = vld1_f32(local_input_ptr); + local_input_ptr += 2; + // Load the accumulators from acc_buffer + float32x4_t acc[4]; + for (int i = 0; i < 4; i++) + { + acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i); + } + // Multiply-accumulate + acc[0] = vmlaq_lane_f32(acc[0], filter[0], input, 0); + acc[1] = vmlaq_lane_f32(acc[1], filter[1], input, 0); + acc[2] = vmlaq_lane_f32(acc[2], filter[2], input, 1); + acc[3] = vmlaq_lane_f32(acc[3], filter[3], input, 1); + // Store the accumulators back to acc_buffer + for (int i = 0; i < 4; i++) + { + vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 16; + } + // Handle one input channel at a time. + for (; ic < input_depth; ic++) + { + // Load the filters + float32x4_t filter[2]; + for (int i = 0; i < 2; i++) + { + filter[i] = vld1q_f32(local_filter_ptr + 4 * i); + } + local_filter_ptr += 8; + // Load the inputs + const float input_val = *local_input_ptr++; + // Load the accumulators from acc_buffer + float32x4_t acc[2]; + for (int i = 0; i < 2; i++) + { + acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i); + } + // Multiply-accumulate + for (int i = 0; i < 2; i++) + { + acc[i] = vmlaq_n_f32(acc[i], filter[i], input_val); + } + // Store the accumulators back to acc_buffer + for (int i = 0; i < 2; i++) + { + vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 8; + } + input_ptr += input_ptr_increment; + } + } +}; + +// Note this implementation is very slow for input_depths < 8 +// (e.g. comparable to reference implementation) see, specializations for +// input_depth=3 below. +template <> struct FloatDepthwiseConvKernel<true, 0, 2> +{ + static void Run(int num_output_pixels, int input_depth, int depth_multiplier, + const float *input_ptr, int input_ptr_increment, const float *filter_ptr, + float *acc_buffer_ptr) + { + (void)depth_multiplier; + + // Handle one output pixel at a time. 
+ for (int outp = 0; outp < num_output_pixels; outp++) + { + const float *local_filter_ptr = filter_ptr; + const float *local_input_ptr = input_ptr; + int ic = 0; + // Handle 8 input channels at a time. + for (; ic <= input_depth - 8; ic += 8) + { + // Load the filters + float32x4_t filter[4]; + for (int i = 0; i < 4; i++) + { + filter[i] = vld1q_f32(local_filter_ptr + 4 * i); + } + local_filter_ptr += 16; + // Load the inputs + float32x4x2_t input_dup2[2]; + for (int i = 0; i < 2; i++) + { + const float32x4_t input = vld1q_f32(local_input_ptr + 4 * i); + input_dup2[i] = vzipq_f32(input, input); + } + local_input_ptr += 8; + // Load the accumulators from acc_buffer + float32x4_t acc[4]; + for (int i = 0; i < 4; i++) + { + acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i); + } + // Multiply-accumulate + acc[0] = vmlaq_f32(acc[0], filter[0], input_dup2[0].val[0]); + acc[1] = vmlaq_f32(acc[1], filter[1], input_dup2[0].val[1]); + acc[2] = vmlaq_f32(acc[2], filter[2], input_dup2[1].val[0]); + acc[3] = vmlaq_f32(acc[3], filter[3], input_dup2[1].val[1]); + // Store the accumulators back to acc_buffer + for (int i = 0; i < 4; i++) + { + vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 16; + } + // Handle 4 input channels at a time. + for (; ic <= input_depth - 4; ic += 4) + { + // Load the filters + float32x2_t filter[4]; + for (int i = 0; i < 4; i++) + { + filter[i] = vld1_f32(local_filter_ptr + 2 * i); + } + local_filter_ptr += 8; + // Load the inputs + const float32x4_t input = vld1q_f32(local_input_ptr); + local_input_ptr += 4; + // Load the accumulators from acc_buffer + float32x2_t acc[4]; + for (int i = 0; i < 4; i++) + { + acc[i] = vld1_f32(acc_buffer_ptr + 2 * i); + } + // Multiply-accumulate + acc[0] = vmla_lane_f32(acc[0], filter[0], vget_low_f32(input), 0); + acc[1] = vmla_lane_f32(acc[1], filter[1], vget_low_f32(input), 1); + acc[2] = vmla_lane_f32(acc[2], filter[2], vget_high_f32(input), 0); + acc[3] = vmla_lane_f32(acc[3], filter[3], vget_high_f32(input), 1); + // Store the accumulators back to acc_buffer + for (int i = 0; i < 4; i++) + { + vst1_f32(acc_buffer_ptr + 2 * i, acc[i]); + } + acc_buffer_ptr += 8; + } + // Handle 2 input channels at a time. + for (; ic <= input_depth - 2; ic += 2) + { + // Load the filters + const float32x4_t filter = vld1q_f32(local_filter_ptr); + local_filter_ptr += 4; + // Load the inputs + const float32x2_t input = vld1_f32(local_input_ptr); + local_input_ptr += 2; + // Load the accumulators from acc_buffer + float32x2_t acc[2]; + for (int i = 0; i < 2; i++) + { + acc[i] = vld1_f32(acc_buffer_ptr + 2 * i); + } + // Multiply-accumulate + acc[0] = vmla_lane_f32(acc[0], vget_low_f32(filter), input, 0); + acc[1] = vmla_lane_f32(acc[1], vget_high_f32(filter), input, 1); + // Store the accumulators back to acc_buffer + for (int i = 0; i < 2; i++) + { + vst1_f32(acc_buffer_ptr + 2 * i, acc[i]); + } + acc_buffer_ptr += 4; + } + // Handle one input channel at a time. 
+ for (; ic < input_depth; ic++) + { + // Load the inputs + const float input_val = *local_input_ptr++; + // Multiply-accumulate + for (int i = 0; i < 2; i++) + { + acc_buffer_ptr[i] += local_filter_ptr[i] * input_val; + } + local_filter_ptr += 2; + acc_buffer_ptr += 2; + } + input_ptr += input_ptr_increment; + } + } +}; + +template <> struct FloatDepthwiseConvKernel<true, 3, 2> +{ + static void Run(int num_output_pixels, int input_depth, int depth_multiplier, + const float *input_ptr, int input_ptr_increment, const float *filter_ptr, + float *acc_buffer_ptr) + { + (void)input_depth; + (void)depth_multiplier; + + // Load the filters + float32x2_t filter[3]; + for (int i = 0; i < 3; i++) + { + filter[i] = vld1_f32(filter_ptr + 2 * i); + } + // Handle one output pixel at a time. + for (int outp = 0; outp < num_output_pixels; outp++) + { + const float32x2_t input01 = vld1_f32(input_ptr); + const float32x2_t input2 = vld1_dup_f32(input_ptr + 2); + // Load the accumulators from acc_buffer + float32x2_t acc[3]; + for (int i = 0; i < 3; i++) + { + acc[i] = vld1_f32(acc_buffer_ptr + 2 * i); + } + // Multiply-accumulate for each input channel there 2 outputs + acc[0] = vmla_lane_f32(acc[0], filter[0], input01, 0); + acc[1] = vmla_lane_f32(acc[1], filter[1], input01, 1); + acc[2] = vmla_lane_f32(acc[2], filter[2], input2, 0); + // Store the accumulators back to acc_buffer + for (int i = 0; i < 3; i++) + { + vst1_f32(acc_buffer_ptr + 2 * i, acc[i]); + } + acc_buffer_ptr += 6; + input_ptr += input_ptr_increment; + } + } +}; + +template <> struct FloatDepthwiseConvKernel<true, 3, 4> +{ + static void Run(int num_output_pixels, int input_depth, int depth_multiplier, + const float *input_ptr, int input_ptr_increment, const float *filter_ptr, + float *acc_buffer_ptr) + { + (void)input_depth; + (void)depth_multiplier; + + // Load the filters + float32x4_t filter[3]; + for (int i = 0; i < 3; i++) + { + filter[i] = vld1q_f32(filter_ptr + 4 * i); + } + // Handle one output pixel at a time. + for (int outp = 0; outp < num_output_pixels; outp++) + { + // NOTE: we only want 3 values, so we read it as two ops where + // the second op just duplicates the lane + const float32x2_t input01 = vld1_f32(input_ptr); + const float32x2_t input2 = vld1_dup_f32(input_ptr + 2); + // Load the accumulators from acc_buffer + float32x4_t acc[3]; + for (int i = 0; i < 3; i++) + { + acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i); + } + // Multiply-accumulate all outputs. + acc[0] = vmlaq_lane_f32(acc[0], filter[0], input01, 0); + acc[1] = vmlaq_lane_f32(acc[1], filter[1], input01, 1); + acc[2] = vmlaq_lane_f32(acc[2], filter[2], input2, 0); + // Store the accumulators back to acc_buffer + for (int i = 0; i < 3; i++) + { + vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 12; + input_ptr += input_ptr_increment; + } + } +}; + +template <> struct FloatDepthwiseConvKernel<true, 1, 8> +{ + static void Run(int num_output_pixels, int input_depth, int depth_multiplier, + const float *input_ptr, int input_ptr_increment, const float *filter_ptr, + float *acc_buffer_ptr) + { + (void)input_depth; + (void)depth_multiplier; + + // Load the filters + float32x4_t filter[2]; + for (int i = 0; i < 2; i++) + { + filter[i] = vld1q_f32(filter_ptr + 4 * i); + } + // Handle one output pixel at a time. 
+ for (int outp = 0; outp < num_output_pixels; outp++) + { + // Load the inputs + const float input_val = *input_ptr; + input_ptr += input_ptr_increment; + // Load the accumulators from acc_buffer + float32x4_t acc[2]; + for (int i = 0; i < 2; i++) + { + acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i); + } + // Multiply-accumulate + for (int i = 0; i < 2; i++) + { + acc[i] = vmlaq_n_f32(acc[i], filter[i], input_val); + } + // Store the accumulators back to acc_buffer + for (int i = 0; i < 2; i++) + { + vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 8; + } + } +}; + +template <> struct FloatDepthwiseConvKernel<true, 1, 32> +{ + static void Run(int num_output_pixels, int input_depth, int depth_multiplier, + const float *input_ptr, int input_ptr_increment, const float *filter_ptr, + float *acc_buffer_ptr) + { + (void)input_depth; + (void)depth_multiplier; + + // Load the filters + float32x4_t filter_0 = vld1q_f32(filter_ptr + 4 * 0); + float32x4_t filter_1 = vld1q_f32(filter_ptr + 4 * 1); + float32x4_t filter_2 = vld1q_f32(filter_ptr + 4 * 2); + float32x4_t filter_3 = vld1q_f32(filter_ptr + 4 * 3); + float32x4_t filter_4 = vld1q_f32(filter_ptr + 4 * 4); + float32x4_t filter_5 = vld1q_f32(filter_ptr + 4 * 5); + float32x4_t filter_6 = vld1q_f32(filter_ptr + 4 * 6); + float32x4_t filter_7 = vld1q_f32(filter_ptr + 4 * 7); + + // Handle one output pixel at a time. + for (int outp = 0; outp < num_output_pixels; outp++) + { + // Load the inputs + const float input_val = *input_ptr; + input_ptr += input_ptr_increment; + // Load the accumulators from acc_buffer + float32x4_t acc_0 = vld1q_f32(acc_buffer_ptr + 4 * 0); + float32x4_t acc_1 = vld1q_f32(acc_buffer_ptr + 4 * 1); + float32x4_t acc_2 = vld1q_f32(acc_buffer_ptr + 4 * 2); + float32x4_t acc_3 = vld1q_f32(acc_buffer_ptr + 4 * 3); + float32x4_t acc_4 = vld1q_f32(acc_buffer_ptr + 4 * 4); + float32x4_t acc_5 = vld1q_f32(acc_buffer_ptr + 4 * 5); + float32x4_t acc_6 = vld1q_f32(acc_buffer_ptr + 4 * 6); + float32x4_t acc_7 = vld1q_f32(acc_buffer_ptr + 4 * 7); + // Multiply-accumulate + acc_0 = vmlaq_n_f32(acc_0, filter_0, input_val); + acc_1 = vmlaq_n_f32(acc_1, filter_1, input_val); + acc_2 = vmlaq_n_f32(acc_2, filter_2, input_val); + acc_3 = vmlaq_n_f32(acc_3, filter_3, input_val); + acc_4 = vmlaq_n_f32(acc_4, filter_4, input_val); + acc_5 = vmlaq_n_f32(acc_5, filter_5, input_val); + acc_6 = vmlaq_n_f32(acc_6, filter_6, input_val); + acc_7 = vmlaq_n_f32(acc_7, filter_7, input_val); + // Store the accumulators back to acc_buffer + vst1q_f32(acc_buffer_ptr + 4 * 0, acc_0); + vst1q_f32(acc_buffer_ptr + 4 * 1, acc_1); + vst1q_f32(acc_buffer_ptr + 4 * 2, acc_2); + vst1q_f32(acc_buffer_ptr + 4 * 3, acc_3); + vst1q_f32(acc_buffer_ptr + 4 * 4, acc_4); + vst1q_f32(acc_buffer_ptr + 4 * 5, acc_5); + vst1q_f32(acc_buffer_ptr + 4 * 6, acc_6); + vst1q_f32(acc_buffer_ptr + 4 * 7, acc_7); + acc_buffer_ptr += 32; + } + } +}; + +template <> struct FloatDepthwiseConvKernel<true, 1, 20> +{ + static void Run(int num_output_pixels, int input_depth, int depth_multiplier, + const float *input_ptr, int input_ptr_increment, const float *filter_ptr, + float *acc_buffer_ptr) + { + (void)input_depth; + (void)depth_multiplier; + + // Load the filters + float32x4_t filter_0 = vld1q_f32(filter_ptr + 4 * 0); + float32x4_t filter_1 = vld1q_f32(filter_ptr + 4 * 1); + float32x4_t filter_2 = vld1q_f32(filter_ptr + 4 * 2); + float32x4_t filter_3 = vld1q_f32(filter_ptr + 4 * 3); + float32x4_t filter_4 = vld1q_f32(filter_ptr + 4 * 4); + + // Handle one output pixel at a 
time. + for (int outp = 0; outp < num_output_pixels; outp++) + { + // Load the inputs + const float input_val = *input_ptr; + input_ptr += input_ptr_increment; + // Load the accumulators from acc_buffer + float32x4_t acc_0 = vld1q_f32(acc_buffer_ptr + 4 * 0); + float32x4_t acc_1 = vld1q_f32(acc_buffer_ptr + 4 * 1); + float32x4_t acc_2 = vld1q_f32(acc_buffer_ptr + 4 * 2); + float32x4_t acc_3 = vld1q_f32(acc_buffer_ptr + 4 * 3); + float32x4_t acc_4 = vld1q_f32(acc_buffer_ptr + 4 * 4); + // Multiply-accumulate + acc_0 = vmlaq_n_f32(acc_0, filter_0, input_val); + acc_1 = vmlaq_n_f32(acc_1, filter_1, input_val); + acc_2 = vmlaq_n_f32(acc_2, filter_2, input_val); + acc_3 = vmlaq_n_f32(acc_3, filter_3, input_val); + acc_4 = vmlaq_n_f32(acc_4, filter_4, input_val); + // Store the accumulators back to acc_buffer + vst1q_f32(acc_buffer_ptr + 4 * 0, acc_0); + vst1q_f32(acc_buffer_ptr + 4 * 1, acc_1); + vst1q_f32(acc_buffer_ptr + 4 * 2, acc_2); + vst1q_f32(acc_buffer_ptr + 4 * 3, acc_3); + vst1q_f32(acc_buffer_ptr + 4 * 4, acc_4); + acc_buffer_ptr += 20; + } + } +}; + +template <> struct FloatDepthwiseConvKernel<true, 0, 16> +{ + static void Run(int num_output_pixels, int input_depth, int depth_multiplier, + const float *input_ptr, int input_ptr_increment, const float *filter_ptr, + float *acc_buffer_ptr) + { + (void)depth_multiplier; + + // Handle one output pixel at a time. + for (int outp = 0; outp < num_output_pixels; outp++) + { + const float *local_filter_ptr = filter_ptr; + const float *local_input_ptr = input_ptr; + for (int ic = 0; ic < input_depth; ic++) + { + // Load the filters + float32x4_t filter[4]; + for (int i = 0; i < 4; i++) + { + filter[i] = vld1q_f32(local_filter_ptr + 4 * i); + } + local_filter_ptr += 16; + // Load the inputs + const float input_val = *local_input_ptr++; + // Load the accumulators from acc_buffer + float32x4_t acc[4]; + for (int i = 0; i < 4; i++) + { + acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i); + } + // Multiply-accumulate + for (int i = 0; i < 4; i++) + { + acc[i] = vmlaq_n_f32(acc[i], filter[i], input_val); + } + // Store the accumulators back to acc_buffer + for (int i = 0; i < 4; i++) + { + vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 16; + } + input_ptr += input_ptr_increment; + } + } +}; + +template <> struct FloatDepthwiseConvKernel<true, 8, 1> +{ + static void Run(int num_output_pixels, int input_depth, int depth_multiplier, + const float *input_ptr, int input_ptr_increment, const float *filter_ptr, + float *acc_buffer_ptr) + { + (void)input_depth; + (void)depth_multiplier; + + // Load the filters + float32x4_t filter[2]; + for (int i = 0; i < 2; i++) + { + filter[i] = vld1q_f32(filter_ptr + 4 * i); + } + // Handle one output pixel at a time. 
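+    // (Per output pixel this loads the 8 input channels and the 8 accumulators,
+    // multiply-accumulates them against the 8 filter taps held in the two
+    // registers loaded above, and stores the result back.)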
+ for (int outp = 0; outp < num_output_pixels; outp++) + { + // Load the inputs + float32x4_t input[2]; + for (int i = 0; i < 2; i++) + { + input[i] = vld1q_f32(input_ptr + 4 * i); + } + // Load the accumulators from acc_buffer + float32x4_t acc[2]; + for (int i = 0; i < 2; i++) + { + acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i); + } + // Multiply-accumulate + for (int i = 0; i < 2; i++) + { + acc[i] = vmlaq_f32(acc[i], input[i], filter[i]); + } + // Store the accumulators back to acc_buffer + for (int i = 0; i < 2; i++) + { + vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 8; + input_ptr += input_ptr_increment; + } + } +}; + +template <> struct FloatDepthwiseConvKernel<true, 2, 1> +{ + static void Run(int num_output_pixels, int input_depth, int depth_multiplier, + const float *input_ptr, int input_ptr_increment, const float *filter_ptr, + float *acc_buffer_ptr) + { + (void)input_depth; + (void)depth_multiplier; + + float32x2_t filter = vld1_f32(filter_ptr); + float32x4_t filter_x4 = vcombine_f32(filter, filter); + int outp = 0; + + // Handle two output pixels at a time. + for (; outp <= num_output_pixels - 2; outp += 2) + { + // Load the inputs + float32x2_t input_1 = vld1_f32(input_ptr); + input_ptr += input_ptr_increment; + float32x2_t input_2 = vld1_f32(input_ptr); + input_ptr += input_ptr_increment; + float32x4_t input = vcombine_f32(input_1, input_2); + + // Load the accumulators from acc_buffer + float32x4_t acc = vld1q_f32(acc_buffer_ptr); + + // Multiply-accumulate + acc = vmlaq_f32(acc, input, filter_x4); + + // Store the accumulators back to acc_buffer + vst1q_f32(acc_buffer_ptr, acc); + acc_buffer_ptr += 4; + } + // Handle one output pixel at a time. + for (; outp < num_output_pixels; outp++) + { + // Load the inputs + float32x2_t input = vld1_f32(input_ptr); + input_ptr += input_ptr_increment; + + // Load the accumulators from acc_buffer + float32x2_t acc = vld1_f32(acc_buffer_ptr); + + // Multiply-accumulate + acc = vmla_f32(acc, input, filter); + + // Store the accumulators back to acc_buffer + vst1_f32(acc_buffer_ptr, acc); + acc_buffer_ptr += 2; + } + } +}; + +template <> struct FloatDepthwiseConvKernel<true, 4, 1> +{ + static void Run(int num_output_pixels, int input_depth, int depth_multiplier, + const float *input_ptr, int input_ptr_increment, const float *filter_ptr, + float *acc_buffer_ptr) + { + (void)input_depth; + (void)depth_multiplier; + + float32x4_t filter = vld1q_f32(filter_ptr); + + // Handle one output pixel at a time. + for (int outp = 0; outp < num_output_pixels; outp++) + { + // Load the inputs + float32x4_t input = vld1q_f32(input_ptr); + // Load the accumulators from acc_buffer + float32x4_t acc = vld1q_f32(acc_buffer_ptr); + // Multiply-accumulate + acc = vmlaq_f32(acc, input, filter); + // Store the accumulators back to acc_buffer + vst1q_f32(acc_buffer_ptr, acc); + acc_buffer_ptr += 4; + input_ptr += input_ptr_increment; + } + } +}; +#endif + +// Accumulates the effect of one row of the filter, on a segment of one row +// of the output, accessing the corresponding one row of the input. +template <bool kAllowStrided, int kFixedInputDepth, int kFixedDepthMultiplier> +void FloatDepthwiseConvAccumRow(int stride, int dilation_factor, int input_depth, int input_width, + const float *input_data, int pad_width, int depth_multiplier, + int filter_width, const float *filter_data, int out_x_buffer_start, + int out_x_buffer_end, int output_depth, float *acc_buffer) +{ + // Sanity check parameters. 
This is important in particular to ensure + // that we keep the number of template instantiations minimal, so we don't + // increase binary size unnecessarily. + static_assert(kFixedDepthMultiplier || !kFixedInputDepth, ""); + static_assert(kFixedInputDepth || kAllowStrided, ""); + assert(stride == 1 || kAllowStrided); + if (kFixedInputDepth) + { + assert(input_depth == kFixedInputDepth); + } + if (kFixedDepthMultiplier) + { + assert(depth_multiplier == kFixedDepthMultiplier); + } + assert(output_depth == input_depth * depth_multiplier); + const int input_ptr_increment = stride * input_depth; + const float *filter_base_ptr = filter_data; + for (int filter_x = 0; filter_x < filter_width; ++filter_x) + { + // For the current (filter_x, filter_y) point in the filter, + // compute the boundaries of the corresponding output row segment. + int out_x_loop_start_unclamped = 0; + int out_x_loop_end_unclamped = 0; + if (kAllowStrided) + { + if (stride == 2) + { + out_x_loop_start_unclamped = (pad_width - dilation_factor * filter_x + 1) / 2; + out_x_loop_end_unclamped = (pad_width + input_width - dilation_factor * filter_x + 1) / 2; + } + else if (stride == 4) + { + out_x_loop_start_unclamped = (pad_width - dilation_factor * filter_x + 3) / 4; + out_x_loop_end_unclamped = (pad_width + input_width - dilation_factor * filter_x + 3) / 4; + } + else + { + out_x_loop_start_unclamped = (pad_width - dilation_factor * filter_x + stride - 1) / stride; + out_x_loop_end_unclamped = + (pad_width + input_width - dilation_factor * filter_x + stride - 1) / stride; + } + } + else + { + out_x_loop_start_unclamped = pad_width - dilation_factor * filter_x; + out_x_loop_end_unclamped = pad_width + input_width - dilation_factor * filter_x; + } + // The kernel will have to iterate on the segment of the + // output row that starts at out_x_loop_start and out_x_loop_end. + const int out_x_loop_start = std::max(out_x_buffer_start, out_x_loop_start_unclamped); + const int out_x_loop_end = std::min(out_x_buffer_end, out_x_loop_end_unclamped); + + float *acc_buffer_ptr = acc_buffer + (out_x_loop_start - out_x_buffer_start) * output_depth; + const int in_x_origin = (out_x_loop_start * stride) - pad_width + dilation_factor * filter_x; + const float *input_ptr = input_data + in_x_origin * input_depth; + const int num_output_pixels = out_x_loop_end - out_x_loop_start; + FloatDepthwiseConvKernel<kAllowStrided, kFixedInputDepth, kFixedDepthMultiplier>::Run( + num_output_pixels, input_depth, depth_multiplier, input_ptr, input_ptr_increment, + filter_base_ptr, acc_buffer_ptr); + filter_base_ptr += output_depth; + } +} + +// generic fallback of FloatDepthwiseConvAccumRow, portable, non-templatized. 
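+// It processes one output x position at a time with plain scalar loops over
+// input channels and depth-multiplier lanes, and is selected whenever no
+// specialized NEON kernel matches the (stride, input_depth, depth_multiplier)
+// combination.
+//
+// Worked example of the clamping arithmetic below (values chosen purely for
+// illustration): with pad_width = 1, dilation_factor = 1, filter_x = 0 and
+// stride = 2, out_x_loop_start is (1 - 0 + 2 - 1) / 2 = 1, because out_x = 0
+// would read input x = 0 * 2 - 1 + 0 = -1, which lies in the left padding.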
+inline void FloatDepthwiseConvAccumRowGeneric(int stride, int dilation_factor, int input_depth,
+                                              int input_width, const float *input_data,
+                                              int pad_width, int depth_multiplier, int filter_width,
+                                              const float *filter_data, int out_x_buffer_start,
+                                              int out_x_buffer_end, int output_depth,
+                                              float *acc_buffer)
+{
+  const float *filter_base_ptr = filter_data;
+  for (int filter_x = 0; filter_x < filter_width; ++filter_x)
+  {
+    const int out_x_loop_start =
+      std::max(out_x_buffer_start, (pad_width - dilation_factor * filter_x + stride - 1) / stride);
+    const int out_x_loop_end =
+      std::min(out_x_buffer_end,
+               (pad_width + input_width - dilation_factor * filter_x + stride - 1) / stride);
+
+    float *acc_buffer_ptr = acc_buffer + (out_x_loop_start - out_x_buffer_start) * output_depth;
+    const int in_x_origin = (out_x_loop_start * stride) - pad_width + dilation_factor * filter_x;
+    const float *input_ptr = input_data + in_x_origin * input_depth;
+    const int input_ptr_increment = (stride - 1) * input_depth;
+    for (int out_x = out_x_loop_start; out_x < out_x_loop_end; out_x++)
+    {
+      const float *filter_ptr = filter_base_ptr;
+      for (int ic = 0; ic < input_depth; ++ic)
+      {
+        const float input_val = *input_ptr++;
+        for (int m = 0; m < depth_multiplier; m++)
+        {
+          const float filter_val = *filter_ptr++;
+          *acc_buffer_ptr++ += filter_val * input_val;
+        }
+      }
+      input_ptr += input_ptr_increment;
+    }
+    filter_base_ptr += output_depth;
+  }
+}
+
+// Initializes the accumulator buffer with bias values.
+inline void DepthwiseConvInitAccBuffer(int num_output_pixels, int output_depth,
+                                       const float *bias_data, float *acc_buffer)
+{
+  // TODO(benoitjacob): This might need optimized specializations
+  // for small output_depth values, if that ever becomes an important
+  // case (like it was for some quantized DepthwiseConv cases).
+  for (int i = 0; i < num_output_pixels; i++)
+  {
+    memcpy(acc_buffer + i * output_depth, bias_data, sizeof(acc_buffer[0]) * output_depth);
+  }
+}
+
+// DepthwiseConv can run multi-threaded along the dim specified by thread_dim.
+// Each thread processes output elements on dim thread_dim in the range
+// [thread_start, thread_end).
+// For example, with thread_start = 2, thread_end = 6, and thread_dim = 1, a
+// thread computes DepthwiseConv for output_data[:, 2:6, :, :], i.e. output
+// rows 2 through 5.
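+//
+// A minimal sketch of how a caller might partition the row axis across threads
+// (hypothetical illustration only, not part of this file):
+//   const int rows_per_thread = (output_height + num_threads - 1) / num_threads;
+//   for (int t = 0; t < num_threads; ++t)
+//   {
+//     const int start = t * rows_per_thread;
+//     const int end = std::min(output_height, start + rows_per_thread);
+//     if (start < end)
+//       DepthwiseConvImpl(params, input_shape, input_data, filter_shape, filter_data,
+//                         bias_shape, bias_data, output_shape, output_data,
+//                         /*thread_start=*/start, /*thread_end=*/end, /*thread_dim=*/1);
+//   }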
+inline void DepthwiseConvImpl(const DepthwiseConvParams &params, const Shape &input_shape,
+                              const float *input_data, const Shape &filter_shape,
+                              const float *filter_data, const Shape &bias_shape,
+                              const float *bias_data, const Shape &output_shape, float *output_data,
+                              int thread_start, int thread_end, int thread_dim)
+{
+  UNUSED_RELEASE(bias_shape);
+  const int stride_width = params.stride_width;
+  const int stride_height = params.stride_height;
+  const int pad_width = params.padding_values.width;
+  const int pad_height = params.padding_values.height;
+  const int depth_multiplier = params.depth_multiplier;
+  const float output_activation_min = params.float_activation_min;
+  const float output_activation_max = params.float_activation_max;
+  const int dilation_width_factor = params.dilation_width_factor;
+  const int dilation_height_factor = params.dilation_height_factor;
+  assert(input_shape.DimensionsCount() == 4);
+  assert(filter_shape.DimensionsCount() == 4);
+  assert(output_shape.DimensionsCount() == 4);
+  assert(thread_dim == 0 || thread_dim == 1);
+
+  const int batches = MatchingDim(input_shape, 0, output_shape, 0);
+  const int output_depth = MatchingDim(filter_shape, 3, output_shape, 3);
+  const int input_height = input_shape.Dims(1);
+  const int input_width = input_shape.Dims(2);
+  const int input_depth = input_shape.Dims(3);
+  const int filter_height = filter_shape.Dims(1);
+  const int filter_width = filter_shape.Dims(2);
+  const int output_height = output_shape.Dims(1);
+  const int output_width = output_shape.Dims(2);
+  assert(output_depth == input_depth * depth_multiplier);
+  assert(bias_shape.FlatSize() == output_depth);
+
+  static const int kAccBufferMaxSize = 4832;
+  float acc_buffer[kAccBufferMaxSize];
+  assert(kAccBufferMaxSize >= output_depth);
+  const int kOutputPixelsInAccBuffer = kAccBufferMaxSize / output_depth;
+  const int kAccBufferActualSize = kOutputPixelsInAccBuffer * output_depth;
+  assert(kOutputPixelsInAccBuffer * output_depth <= kAccBufferActualSize);
+  assert(kAccBufferActualSize <= kAccBufferMaxSize);
+  assert(kOutputPixelsInAccBuffer >= 1);
+
+  UNUSED_RELEASE(kAccBufferActualSize);
+
+  // row_accum_func will point to the core accumulation function to be used
+  // for this DepthwiseConv op.
+  using row_accum_func_t = decltype(&FloatDepthwiseConvAccumRowGeneric);
+  row_accum_func_t row_accum_func = nullptr;
+
+#define TFMINI_USE_DEPTHWISECONV_KERNEL(ALLOW_STRIDED, FIXED_INPUT_DEPTH, FIXED_DEPTH_MULTIPLIER) \
+  if (!row_accum_func && (stride_width == 1 || ALLOW_STRIDED) &&                                  \
+      (input_depth == FIXED_INPUT_DEPTH || FIXED_INPUT_DEPTH == 0) &&                             \
+      depth_multiplier == FIXED_DEPTH_MULTIPLIER)                                                 \
+  {                                                                                               \
+    row_accum_func =                                                                              \
+      FloatDepthwiseConvAccumRow<ALLOW_STRIDED, FIXED_INPUT_DEPTH, FIXED_DEPTH_MULTIPLIER>;       \
+  }
+
+#ifdef USE_NEON
+  // We go over our list of kernels by decreasing order of preference
+  // for the cases where multiple kernels could apply.
+
+  // Start with the fastest kernels: AllowStrided=false, fixed input depth.
+
+  TFMINI_USE_DEPTHWISECONV_KERNEL(false, 8, 1)
+  TFMINI_USE_DEPTHWISECONV_KERNEL(false, 2, 1)
+
+  // Next come the strided kernels: AllowStrided=true, fixed input depth.
+  // They are a bit less efficient, but allow stride!=1.
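+  // For example (illustration only): a layer with stride 2, input_depth == 3
+  // and depth_multiplier == 2 is rejected by every entry until (true, 3, 2)
+  // below, so row_accum_func becomes FloatDepthwiseConvAccumRow<true, 3, 2>.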
+ + TFMINI_USE_DEPTHWISECONV_KERNEL(true, 8, 1) + TFMINI_USE_DEPTHWISECONV_KERNEL(true, 1, 8) + TFMINI_USE_DEPTHWISECONV_KERNEL(true, 1, 20) + TFMINI_USE_DEPTHWISECONV_KERNEL(true, 1, 32) + TFMINI_USE_DEPTHWISECONV_KERNEL(true, 2, 1) + TFMINI_USE_DEPTHWISECONV_KERNEL(true, 3, 2) + TFMINI_USE_DEPTHWISECONV_KERNEL(true, 3, 4) + TFMINI_USE_DEPTHWISECONV_KERNEL(true, 4, 1) + + // Finally, the kernels allowing a variable input depth, + // these are the least efficient but most general kernels. + + TFMINI_USE_DEPTHWISECONV_KERNEL(true, 0, 1) + TFMINI_USE_DEPTHWISECONV_KERNEL(true, 0, 2) + TFMINI_USE_DEPTHWISECONV_KERNEL(true, 0, 8) + TFMINI_USE_DEPTHWISECONV_KERNEL(true, 0, 16) + +#endif // USE_NEON + +#undef TFMINI_USE_DEPTHWISECONV_KERNEL + + // No matching fast kernel found, use slow fallback. + if (!row_accum_func) + { + row_accum_func = FloatDepthwiseConvAccumRowGeneric; + } + + const int input_height_stride = input_shape.Dims(3) * input_shape.Dims(2); + const int input_batch_stride = input_height_stride * input_shape.Dims(1); + const int filter_height_stride = filter_shape.Dims(3) * filter_shape.Dims(2); + + // Now that we have determined row_accum_func, we can start work. + int batch_start = 0; + int batch_end = batches; + int row_start = 0; + int row_end = output_height; + int output_ptr_offset = 0; + + switch (thread_dim) + { + case 0: + // Multithread along with the batch axis + assert(thread_start >= 0); + assert(thread_end <= batches); + batch_start = thread_start; + batch_end = thread_end; + output_ptr_offset = batch_start * FlatSizeSkipDim(output_shape, 0); + break; + case 1: + // Multithread along with the row axis + assert(thread_start >= 0); + assert(thread_end <= output_height); + row_start = thread_start; + row_end = thread_end; + output_ptr_offset = row_start * output_width * output_depth; + break; + } + + float *output_ptr = output_data + output_ptr_offset; + int batch_step = (output_height + row_start - row_end) * output_width * output_depth; + + for (int b = batch_start; b < batch_end; ++b) + { + for (int out_y = row_start; out_y < row_end; ++out_y) + { + const int in_y_origin = (out_y * stride_height) - pad_height; + const int filter_y_start = + std::max(0, (-in_y_origin + dilation_height_factor - 1) / dilation_height_factor); + const int filter_y_end = + std::min(filter_height, (input_height - in_y_origin + dilation_height_factor - 1) / + dilation_height_factor); + for (int out_x_buffer_start = 0; out_x_buffer_start < output_width; + out_x_buffer_start += kOutputPixelsInAccBuffer) + { + const int out_x_buffer_end = + std::min(output_width, out_x_buffer_start + kOutputPixelsInAccBuffer); + // We call a 'pixel' a group of activation that share all but the + // 'depth'/'channel' coordinate. num_output_pixels is the number of + // output pixels that we will accumulate in this loop iteration. + const int num_output_pixels = out_x_buffer_end - out_x_buffer_start; + // Initialize our local accumulator with the bias values, so we don't + // have to add them later. + DepthwiseConvInitAccBuffer(num_output_pixels, output_depth, bias_data, acc_buffer); + // Accumulation loop. Most of the time should be spent in here. 
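+        // Each iteration adds the contribution of one filter row: in_y is the
+        // input row that filter_y reads for this out_y, and row_accum_func
+        // accumulates it into every output pixel of the current x-buffer window.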
+ for (int filter_y = filter_y_start; filter_y < filter_y_end; ++filter_y) + { + const int in_y = in_y_origin + dilation_height_factor * filter_y; + row_accum_func(stride_width, dilation_width_factor, input_depth, input_width, + input_data + in_y * input_height_stride + b * input_batch_stride, + pad_width, depth_multiplier, filter_width, + filter_data + filter_y * filter_height_stride, out_x_buffer_start, + out_x_buffer_end, output_depth, acc_buffer); + } + // Finished accumulating. Now store to destination. + const int num_output_values = output_depth * num_output_pixels; + int i = 0; +// TODO(benoitjacob) optimized code goes here +#ifdef USE_NEON + // Handle 16 values at a time + for (; i <= num_output_values - 16; i += 16) + { + float32x4_t acc[4]; + for (int k = 0; k < 4; k++) + { + acc[k] = vld1q_f32(acc_buffer + i + 4 * k); + } + for (int k = 0; k < 4; k++) + { + acc[k] = vmaxq_f32(vdupq_n_f32(output_activation_min), + vminq_f32(vdupq_n_f32(output_activation_max), acc[k])); + } + for (int k = 0; k < 4; k++) + { + vst1q_f32(output_ptr + 4 * k, acc[k]); + } + output_ptr += 16; + } + // Handle 4 values at a time + for (; i <= num_output_values - 4; i += 4) + { + float32x4_t acc = vld1q_f32(acc_buffer + i); + + acc = vmaxq_f32(vdupq_n_f32(output_activation_min), + vminq_f32(vdupq_n_f32(output_activation_max), acc)); + + vst1q_f32(output_ptr, acc); + output_ptr += 4; + } +#endif + // Handle leftover values, one by one. This is very slow. + for (; i < num_output_values; i++) + { + float acc = acc_buffer[i]; + acc = std::max(output_activation_min, std::min(output_activation_max, acc)); + + *output_ptr++ = acc; + } + } + } + output_ptr += batch_step; + } +} + +} // namespace optimized +} // namespace cker +} // namespace nnfw + +#endif diff --git a/compute/cker/include/cker/operation/optimized/DepthwiseConvUint8.h b/compute/cker/include/cker/operation/optimized/DepthwiseConvUint8.h index d383b126d..5ca56fd09 100644 --- a/compute/cker/include/cker/operation/optimized/DepthwiseConvUint8.h +++ b/compute/cker/include/cker/operation/optimized/DepthwiseConvUint8.h @@ -32,6 +32,8 @@ namespace cker { namespace optimized { +namespace depthwise_conv +{ // Implementation of quantized DepthwiseConv @@ -44,8 +46,8 @@ struct QuantizedDepthwiseConvKernel template <> struct QuantizedDepthwiseConvKernel<true, 8, 2> { static void Run(int num_output_pixels, int input_depth, int depth_multiplier, - const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment, - const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr) + const uint8_t *input_ptr, int16_t input_offset, int input_ptr_increment, + const uint8_t *filter_ptr, int16_t filter_offset, int32_t *acc_buffer_ptr) { (void)input_depth; (void)depth_multiplier; @@ -57,7 +59,7 @@ template <> struct QuantizedDepthwiseConvKernel<true, 8, 2> for (int i = 0; i < 2; i++) { filter[i] = - vaddq_s16(vreinterpretq_s16_u16(vmovl_u8(filter_u8.val[i])), vdupq_n_s16(filter_offset)); + vaddq_s16(vreinterpretq_s16_u16(vmovl_u8(filter_u8.val[i])), vdupq_n_s16(filter_offset)); } // Handle one output pixel at a time. 
for (int outp = 0; outp < num_output_pixels; outp++) @@ -80,9 +82,9 @@ template <> struct QuantizedDepthwiseConvKernel<true, 8, 2> for (int i = 0; i < 2; i++) { acc[0].val[i] = - vmlal_s16(acc[0].val[i], vget_low_s16(filter[i]), vget_low_s16(input_dup2.val[i])); + vmlal_s16(acc[0].val[i], vget_low_s16(filter[i]), vget_low_s16(input_dup2.val[i])); acc[1].val[i] = - vmlal_s16(acc[1].val[i], vget_high_s16(filter[i]), vget_high_s16(input_dup2.val[i])); + vmlal_s16(acc[1].val[i], vget_high_s16(filter[i]), vget_high_s16(input_dup2.val[i])); } // Store the accumulators back to acc_buffer for (int i = 0; i < 2; i++) @@ -98,8 +100,8 @@ template <> struct QuantizedDepthwiseConvKernel<true, 8, 2> template <> struct QuantizedDepthwiseConvKernel<false, 8, 1> { static void Run(int num_output_pixels, int input_depth, int depth_multiplier, - const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment, - const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr) + const uint8_t *input_ptr, int16_t input_offset, int input_ptr_increment, + const uint8_t *filter_ptr, int16_t filter_offset, int32_t *acc_buffer_ptr) { (void)input_depth; (void)depth_multiplier; @@ -174,8 +176,8 @@ template <> struct QuantizedDepthwiseConvKernel<false, 8, 1> template <> struct QuantizedDepthwiseConvKernel<false, 4, 2> { static void Run(int num_output_pixels, int input_depth, int depth_multiplier, - const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment, - const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr) + const uint8_t *input_ptr, int16_t input_offset, int input_ptr_increment, + const uint8_t *filter_ptr, int16_t filter_offset, int32_t *acc_buffer_ptr) { (void)input_depth; (void)depth_multiplier; @@ -206,9 +208,9 @@ template <> struct QuantizedDepthwiseConvKernel<false, 4, 2> for (int i = 0; i < 2; i++) { acc[2 * i + 0] = - vmlal_s16(acc[2 * i + 0], vget_low_s16(filter), vget_low_s16(input_dup2.val[i])); + vmlal_s16(acc[2 * i + 0], vget_low_s16(filter), vget_low_s16(input_dup2.val[i])); acc[2 * i + 1] = - vmlal_s16(acc[2 * i + 1], vget_high_s16(filter), vget_high_s16(input_dup2.val[i])); + vmlal_s16(acc[2 * i + 1], vget_high_s16(filter), vget_high_s16(input_dup2.val[i])); } // Store the accumulators back to acc_buffer for (int i = 0; i < 4; i++) @@ -253,8 +255,8 @@ template <> struct QuantizedDepthwiseConvKernel<false, 4, 2> template <> struct QuantizedDepthwiseConvKernel<false, 2, 8> { static void Run(int num_output_pixels, int input_depth, int depth_multiplier, - const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment, - const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr) + const uint8_t *input_ptr, int16_t input_offset, int input_ptr_increment, + const uint8_t *filter_ptr, int16_t filter_offset, int32_t *acc_buffer_ptr) { (void)input_depth; (void)depth_multiplier; @@ -338,8 +340,8 @@ template <> struct QuantizedDepthwiseConvKernel<false, 2, 8> template <> struct QuantizedDepthwiseConvKernel<false, 2, 2> { static void Run(int num_output_pixels, int input_depth, int depth_multiplier, - const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment, - const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr) + const uint8_t *input_ptr, int16_t input_offset, int input_ptr_increment, + const uint8_t *filter_ptr, int16_t filter_offset, int32_t *acc_buffer_ptr) { (void)input_depth; (void)depth_multiplier; @@ -409,8 +411,8 @@ template <> struct QuantizedDepthwiseConvKernel<false, 2, 2> 
template <> struct QuantizedDepthwiseConvKernel<false, 2, 1> { static void Run(int num_output_pixels, int input_depth, int depth_multiplier, - const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment, - const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr) + const uint8_t *input_ptr, int16_t input_offset, int input_ptr_increment, + const uint8_t *filter_ptr, int16_t filter_offset, int32_t *acc_buffer_ptr) { (void)input_depth; (void)depth_multiplier; @@ -534,8 +536,8 @@ template <> struct QuantizedDepthwiseConvKernel<false, 2, 1> template <> struct QuantizedDepthwiseConvKernel<false, 1, 2> { static void Run(int num_output_pixels, int input_depth, int depth_multiplier, - const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment, - const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr) + const uint8_t *input_ptr, int16_t input_offset, int input_ptr_increment, + const uint8_t *filter_ptr, int16_t filter_offset, int32_t *acc_buffer_ptr) { (void)input_depth; (void)depth_multiplier; @@ -600,8 +602,8 @@ template <> struct QuantizedDepthwiseConvKernel<false, 1, 2> template <> struct QuantizedDepthwiseConvKernel<false, 1, 4> { static void Run(int num_output_pixels, int input_depth, int depth_multiplier, - const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment, - const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr) + const uint8_t *input_ptr, int16_t input_offset, int input_ptr_increment, + const uint8_t *filter_ptr, int16_t filter_offset, int32_t *acc_buffer_ptr) { (void)input_depth; (void)depth_multiplier; @@ -703,8 +705,8 @@ template <> struct QuantizedDepthwiseConvKernel<false, 1, 4> template <> struct QuantizedDepthwiseConvKernel<false, 4, 1> { static void Run(int num_output_pixels, int input_depth, int depth_multiplier, - const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment, - const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr) + const uint8_t *input_ptr, int16_t input_offset, int input_ptr_increment, + const uint8_t *filter_ptr, int16_t filter_offset, int32_t *acc_buffer_ptr) { (void)input_depth; (void)depth_multiplier; @@ -778,8 +780,8 @@ template <> struct QuantizedDepthwiseConvKernel<false, 4, 1> template <> struct QuantizedDepthwiseConvKernel<false, 4, 4> { static void Run(int num_output_pixels, int input_depth, int depth_multiplier, - const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment, - const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr) + const uint8_t *input_ptr, int16_t input_offset, int input_ptr_increment, + const uint8_t *filter_ptr, int16_t filter_offset, int32_t *acc_buffer_ptr) { (void)input_depth; (void)depth_multiplier; @@ -864,8 +866,8 @@ template <> struct QuantizedDepthwiseConvKernel<false, 4, 4> template <> struct QuantizedDepthwiseConvKernel<true, 0, 3> { static void Run(int num_output_pixels, int input_depth, int depth_multiplier, - const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment, - const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr) + const uint8_t *input_ptr, int16_t input_offset, int input_ptr_increment, + const uint8_t *filter_ptr, int16_t filter_offset, int32_t *acc_buffer_ptr) { (void)input_depth; (void)depth_multiplier; @@ -873,7 +875,7 @@ template <> struct QuantizedDepthwiseConvKernel<true, 0, 3> // We will do that by register-level table-look-up using VTBL instructions. 
// Here we prepare the registers containing the table-lookup indices. static const uint8_t dup3_indices_array[3][8] = { - {0, 0, 0, 1, 1, 1, 2, 2}, {2, 3, 3, 3, 4, 4, 4, 5}, {5, 5, 6, 6, 6, 7, 7, 7}}; + {0, 0, 0, 1, 1, 1, 2, 2}, {2, 3, 3, 3, 4, 4, 4, 5}, {5, 5, 6, 6, 6, 7, 7, 7}}; uint8x8_t dup3_indices[3]; for (int i = 0; i < 3; i++) { @@ -928,9 +930,9 @@ template <> struct QuantizedDepthwiseConvKernel<true, 0, 3> for (int j = 0; j < 3; j++) { acc[0].val[j] = - vmlal_s16(acc[0].val[j], vget_low_s16(input_dup3[j]), vget_low_s16(filter[j])); + vmlal_s16(acc[0].val[j], vget_low_s16(input_dup3[j]), vget_low_s16(filter[j])); acc[1].val[j] = - vmlal_s16(acc[1].val[j], vget_high_s16(input_dup3[j]), vget_high_s16(filter[j])); + vmlal_s16(acc[1].val[j], vget_high_s16(input_dup3[j]), vget_high_s16(filter[j])); } // Store the accumulators back to acc_buffer for (int i = 0; i < 2; i++) @@ -944,10 +946,10 @@ template <> struct QuantizedDepthwiseConvKernel<true, 0, 3> // Handle one input channel at a time. for (; ic < input_depth; ic++) { - const uint16_t input_val = *local_input_ptr++ + input_offset; + const int16_t input_val = *local_input_ptr++ + input_offset; for (int i = 0; i < 3; i++) { - const uint16_t filter_val = local_filter_ptr[i] + filter_offset; + const int16_t filter_val = local_filter_ptr[i] + filter_offset; *acc_buffer_ptr++ += static_cast<int32_t>(filter_val) * input_val; } local_filter_ptr += 3; @@ -960,8 +962,8 @@ template <> struct QuantizedDepthwiseConvKernel<true, 0, 3> template <> struct QuantizedDepthwiseConvKernel<true, 0, 2> { static void Run(int num_output_pixels, int input_depth, int depth_multiplier, - const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment, - const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr) + const uint8_t *input_ptr, int16_t input_offset, int input_ptr_increment, + const uint8_t *filter_ptr, int16_t filter_offset, int32_t *acc_buffer_ptr) { (void)input_depth; (void)depth_multiplier; @@ -1002,9 +1004,9 @@ template <> struct QuantizedDepthwiseConvKernel<true, 0, 2> for (int j = 0; j < 2; j++) { acc[0].val[j] = - vmlal_s16(acc[0].val[j], vget_low_s16(filter[j]), vget_low_s16(input_dup2.val[j])); + vmlal_s16(acc[0].val[j], vget_low_s16(filter[j]), vget_low_s16(input_dup2.val[j])); acc[1].val[j] = - vmlal_s16(acc[1].val[j], vget_high_s16(filter[j]), vget_high_s16(input_dup2.val[j])); + vmlal_s16(acc[1].val[j], vget_high_s16(filter[j]), vget_high_s16(input_dup2.val[j])); } // Store the accumulators back to acc_buffer. for (int i = 0; i < 2; i++) @@ -1018,10 +1020,10 @@ template <> struct QuantizedDepthwiseConvKernel<true, 0, 2> for (; ic < input_depth; ic++) { // Load the inputs. 
- const uint16_t input_val = *local_input_ptr++ + input_offset; + const int16_t input_val = *local_input_ptr++ + input_offset; for (int i = 0; i < 2; i++) { - const uint16_t filter_val = local_filter_ptr[i] + filter_offset; + const int16_t filter_val = local_filter_ptr[i] + filter_offset; *acc_buffer_ptr++ += static_cast<int32_t>(filter_val) * input_val; } local_filter_ptr += 2; @@ -1034,8 +1036,8 @@ template <> struct QuantizedDepthwiseConvKernel<true, 0, 2> template <> struct QuantizedDepthwiseConvKernel<true, 0, 1> { static void Run(int num_output_pixels, int input_depth, int depth_multiplier, - const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment, - const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr) + const uint8_t *input_ptr, int16_t input_offset, int input_ptr_increment, + const uint8_t *filter_ptr, int16_t filter_offset, int32_t *acc_buffer_ptr) { (void)input_depth; (void)depth_multiplier; @@ -1112,8 +1114,8 @@ template <> struct QuantizedDepthwiseConvKernel<true, 0, 1> // Handle one input channel at a time. for (; ic < input_depth; ic++) { - const uint16_t input_val = *local_input_ptr++ + input_offset; - const uint16_t filter_val = *local_filter_ptr++ + filter_offset; + const int16_t input_val = *local_input_ptr++ + input_offset; + const int16_t filter_val = *local_filter_ptr++ + filter_offset; *acc_buffer_ptr++ += static_cast<int32_t>(filter_val) * input_val; } input_ptr += input_ptr_increment; @@ -1124,8 +1126,8 @@ template <> struct QuantizedDepthwiseConvKernel<true, 0, 1> template <> struct QuantizedDepthwiseConvKernel<true, 16, 1> { static void Run(int num_output_pixels, int input_depth, int depth_multiplier, - const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment, - const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr) + const uint8_t *input_ptr, int16_t input_offset, int input_ptr_increment, + const uint8_t *filter_ptr, int16_t filter_offset, int32_t *acc_buffer_ptr) { (void)input_depth; (void)depth_multiplier; @@ -1174,7 +1176,7 @@ template <> struct QuantizedDepthwiseConvKernel<true, 16, 1> { acc[2 * i + 0] = vmlal_s16(acc[2 * i + 0], vget_low_s16(input[i]), vget_low_s16(filter[i])); acc[2 * i + 1] = - vmlal_s16(acc[2 * i + 1], vget_high_s16(input[i]), vget_high_s16(filter[i])); + vmlal_s16(acc[2 * i + 1], vget_high_s16(input[i]), vget_high_s16(filter[i])); } // Store the accumulators back to acc_buffer for (int i = 0; i < 4; i++) @@ -1189,8 +1191,8 @@ template <> struct QuantizedDepthwiseConvKernel<true, 16, 1> template <> struct QuantizedDepthwiseConvKernel<true, 8, 1> { static void Run(int num_output_pixels, int input_depth, int depth_multiplier, - const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment, - const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr) + const uint8_t *input_ptr, int16_t input_offset, int input_ptr_increment, + const uint8_t *filter_ptr, int16_t filter_offset, int32_t *acc_buffer_ptr) { (void)input_depth; (void)depth_multiplier; @@ -1228,8 +1230,8 @@ template <> struct QuantizedDepthwiseConvKernel<true, 8, 1> template <> struct QuantizedDepthwiseConvKernel<true, 1, 16> { static void Run(int num_output_pixels, int input_depth, int depth_multiplier, - const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment, - const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr) + const uint8_t *input_ptr, int16_t input_offset, int input_ptr_increment, + const uint8_t *filter_ptr, int16_t filter_offset, 
int32_t *acc_buffer_ptr) { (void)input_depth; (void)depth_multiplier; @@ -1253,7 +1255,7 @@ template <> struct QuantizedDepthwiseConvKernel<true, 1, 16> { uint8_t input_u8 = *input_ptr; input_ptr += input_ptr_increment; - uint16_t input = static_cast<int16_t>(input_u8 + input_offset); + int16_t input = static_cast<int16_t>(input_u8) + input_offset; // Load the accumulators from acc_buffer int32x4_t acc[4]; for (int i = 0; i < 4; i++) @@ -1279,8 +1281,8 @@ template <> struct QuantizedDepthwiseConvKernel<true, 1, 16> template <> struct QuantizedDepthwiseConvKernel<true, 1, 32> { static void Run(int num_output_pixels, int input_depth, int depth_multiplier, - const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment, - const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr) + const uint8_t *input_ptr, int16_t input_offset, int input_ptr_increment, + const uint8_t *filter_ptr, int16_t filter_offset, int32_t *acc_buffer_ptr) { (void)input_depth; (void)depth_multiplier; @@ -1302,7 +1304,7 @@ template <> struct QuantizedDepthwiseConvKernel<true, 1, 32> { uint8_t input_u8 = *input_ptr; input_ptr += input_ptr_increment; - uint16_t input = static_cast<int16_t>(input_u8 + input_offset); + int16_t input = static_cast<int16_t>(input_u8) + input_offset; // Load the accumulators from acc_buffer int32x4_t acc_0 = vld1q_s32(acc_buffer_ptr + 4 * 0); int32x4_t acc_1 = vld1q_s32(acc_buffer_ptr + 4 * 1); @@ -1338,8 +1340,8 @@ template <> struct QuantizedDepthwiseConvKernel<true, 1, 32> template <> struct QuantizedDepthwiseConvKernel<true, 1, 20> { static void Run(int num_output_pixels, int input_depth, int depth_multiplier, - const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment, - const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr) + const uint8_t *input_ptr, int16_t input_offset, int input_ptr_increment, + const uint8_t *filter_ptr, int16_t filter_offset, int32_t *acc_buffer_ptr) { (void)input_depth; (void)depth_multiplier; @@ -1363,7 +1365,7 @@ template <> struct QuantizedDepthwiseConvKernel<true, 1, 20> { uint8_t input_u8 = *input_ptr; input_ptr += input_ptr_increment; - uint16_t input = static_cast<int16_t>(input_u8 + input_offset); + int16_t input = static_cast<int16_t>(input_u8) + input_offset; // Load the accumulators from acc_buffer int32x4_t acc_0 = vld1q_s32(acc_buffer_ptr + 4 * 0); int32x4_t acc_1 = vld1q_s32(acc_buffer_ptr + 4 * 1); @@ -1390,21 +1392,21 @@ template <> struct QuantizedDepthwiseConvKernel<true, 1, 20> template <> struct QuantizedDepthwiseConvKernel<true, 1, 8> { static void Run(int num_output_pixels, int input_depth, int depth_multiplier, - const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment, - const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr) + const uint8_t *input_ptr, int16_t input_offset, int input_ptr_increment, + const uint8_t *filter_ptr, int16_t filter_offset, int32_t *acc_buffer_ptr) { (void)input_depth; (void)depth_multiplier; // Load the filters, add filter_offset. const uint8x8_t filter_u8 = vld1_u8(filter_ptr); const int16x8_t filter = - vaddq_s16(vreinterpretq_s16_u16(vmovl_u8(filter_u8)), vdupq_n_s16(filter_offset)); + vaddq_s16(vreinterpretq_s16_u16(vmovl_u8(filter_u8)), vdupq_n_s16(filter_offset)); // Handle one output pixel at a time. 
for (int outp = 0; outp < num_output_pixels; outp++) { uint8_t input_u8 = *input_ptr; input_ptr += input_ptr_increment; - uint16_t input = static_cast<int16_t>(input_u8 + input_offset); + int16_t input = static_cast<int16_t>(input_u8) + input_offset; // Load the accumulators from acc_buffer int32x4_t acc[2]; for (int i = 0; i < 2; i++) @@ -1427,8 +1429,8 @@ template <> struct QuantizedDepthwiseConvKernel<true, 1, 8> template <> struct QuantizedDepthwiseConvKernel<true, 2, 1> { static void Run(int num_output_pixels, int input_depth, int depth_multiplier, - const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment, - const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr) + const uint8_t *input_ptr, int16_t input_offset, int input_ptr_increment, + const uint8_t *filter_ptr, int16_t filter_offset, int32_t *acc_buffer_ptr) { (void)input_depth; (void)depth_multiplier; @@ -1455,7 +1457,7 @@ template <> struct QuantizedDepthwiseConvKernel<true, 2, 1> input_u16 = vset_lane_u16((reinterpret_cast<const uint16_t *>(input_ptr))[0], input_u16, 1); input_ptr += input_ptr_increment; const int16x4_t input_s16 = - vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vreinterpret_u8_u16(input_u16)))); + vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vreinterpret_u8_u16(input_u16)))); const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset)); // Multiply-accumulate. @@ -1490,8 +1492,8 @@ template <> struct QuantizedDepthwiseConvKernel<true, 2, 1> template <> struct QuantizedDepthwiseConvKernel<true, 4, 1> { static void Run(int num_output_pixels, int input_depth, int depth_multiplier, - const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment, - const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr) + const uint8_t *input_ptr, int16_t input_offset, int input_ptr_increment, + const uint8_t *filter_ptr, int16_t filter_offset, int32_t *acc_buffer_ptr) { (void)input_depth; (void)depth_multiplier; @@ -1555,8 +1557,8 @@ template <> struct QuantizedDepthwiseConvKernel<true, 4, 1> template <> struct QuantizedDepthwiseConvKernel<false, 12, 1> { static void Run(int num_output_pixels, int input_depth, int depth_multiplier, - const uint8_t *input_ptr, uint16_t input_offset, int input_ptr_increment, - const uint8_t *filter_ptr, uint16_t filter_offset, int32_t *acc_buffer_ptr) + const uint8_t *input_ptr, int16_t input_offset, int input_ptr_increment, + const uint8_t *filter_ptr, int16_t filter_offset, int32_t *acc_buffer_ptr) { (void)input_depth; (void)depth_multiplier; @@ -1652,9 +1654,9 @@ void QuantizedDepthwiseConvAccumRow(int stride, int dilation_factor, int input_d else { out_x_loop_start_unclampled = - (pad_width - dilation_factor * filter_x + stride - 1) / stride; + (pad_width - dilation_factor * filter_x + stride - 1) / stride; out_x_loop_end_unclampled = - (pad_width + input_width - dilation_factor * filter_x + stride - 1) / stride; + (pad_width + input_width - dilation_factor * filter_x + stride - 1) / stride; } } else @@ -1672,8 +1674,8 @@ void QuantizedDepthwiseConvAccumRow(int stride, int dilation_factor, int input_d const uint8_t *input_ptr = input_data + in_x_origin * input_depth; const int num_output_pixels = out_x_loop_end - out_x_loop_start; QuantizedDepthwiseConvKernel<kAllowStrided, kFixedInputDepth, kFixedDepthMultiplier>::Run( - num_output_pixels, input_depth, depth_multiplier, input_ptr, input_offset, - input_ptr_increment, filter_base_ptr, filter_offset, acc_buffer_ptr); + num_output_pixels, input_depth, depth_multiplier, 
input_ptr, input_offset, + input_ptr_increment, filter_base_ptr, filter_offset, acc_buffer_ptr); filter_base_ptr += output_depth; } } @@ -1690,11 +1692,11 @@ inline void QuantizedDepthwiseConvAccumRowGeneric(int stride, int dilation_facto const uint8_t *filter_base_ptr = filter_data; for (int filter_x = 0; filter_x < filter_width; ++filter_x) { - const int out_x_loop_start = std::max( - out_x_buffer_start, (pad_width - dilation_factor * filter_x + stride - 1) / stride); + const int out_x_loop_start = + std::max(out_x_buffer_start, (pad_width - dilation_factor * filter_x + stride - 1) / stride); const int out_x_loop_end = - std::min(out_x_buffer_end, - (pad_width + input_width - dilation_factor * filter_x + stride - 1) / stride); + std::min(out_x_buffer_end, + (pad_width + input_width - dilation_factor * filter_x + stride - 1) / stride); int32_t *acc_buffer_ptr = acc_buffer + (out_x_loop_start - out_x_buffer_start) * output_depth; const int in_x_origin = (out_x_loop_start * stride) - pad_width + dilation_factor * filter_x; @@ -1813,7 +1815,8 @@ inline void DepthwiseConvGeneral(const DepthwiseConvParams ¶ms, const Shape const uint8_t *input_data, const Shape &filter_shape, const uint8_t *filter_data, const Shape &bias_shape, const int32_t *bias_data, const Shape &output_shape, - uint8_t *output_data) + uint8_t *output_data, int thread_start, int thread_end, + int thread_dim) { (void)bias_shape; const int stride_width = params.stride_width; @@ -1852,6 +1855,8 @@ inline void DepthwiseConvGeneral(const DepthwiseConvParams ¶ms, const Shape assert(kOutputPixelsInAccBuffer * output_depth <= kAccBufferActualSize); assert(kAccBufferActualSize <= kAccBufferMaxSize); assert(kOutputPixelsInAccBuffer >= 1); + assert(thread_dim == 0 || thread_dim == 1); + UNUSED_RELEASE(kAccBufferActualSize); // row_accum_func will point to the core accumulation function to be used @@ -1865,7 +1870,7 @@ inline void DepthwiseConvGeneral(const DepthwiseConvParams ¶ms, const Shape depth_multiplier == FIXED_DEPTH_MULTIPLIER) \ { \ row_accum_func = \ - QuantizedDepthwiseConvAccumRow<ALLOW_STRIDED, FIXED_INPUT_DEPTH, FIXED_DEPTH_MULTIPLIER>; \ + QuantizedDepthwiseConvAccumRow<ALLOW_STRIDED, FIXED_INPUT_DEPTH, FIXED_DEPTH_MULTIPLIER>; \ } #ifdef USE_NEON @@ -1919,22 +1924,49 @@ inline void DepthwiseConvGeneral(const DepthwiseConvParams ¶ms, const Shape const int filter_height_stride = filter_shape.Dims(3) * filter_shape.Dims(2); // Now that we have determined row_accum_func, we can start work. 
- uint8_t *output_ptr = output_data; - for (int b = 0; b < batches; ++b) + int batch_start = 0; + int batch_end = batches; + int row_start = 0; + int row_end = output_height; + int output_ptr_offset = 0; + + switch (thread_dim) + { + case 0: + // Multithread along with the batch axis + assert(thread_start >= 0); + assert(thread_end <= batches); + batch_start = thread_start; + batch_end = thread_end; + output_ptr_offset = batch_start * FlatSizeSkipDim(output_shape, 0); + break; + case 1: + // Multithread along with the row axis + assert(thread_start >= 0); + assert(thread_end <= output_height); + row_start = thread_start; + row_end = thread_end; + output_ptr_offset = row_start * output_width * output_depth; + break; + } + + uint8_t *output_ptr = output_data + output_ptr_offset; + int batch_step = (output_height + row_start - row_end) * output_width * output_depth; + for (int b = batch_start; b < batch_end; ++b) { - for (int out_y = 0; out_y < output_height; ++out_y) + for (int out_y = row_start; out_y < row_end; ++out_y) { const int in_y_origin = (out_y * stride_height) - pad_height; const int filter_y_start = - std::max(0, (-in_y_origin + dilation_height_factor - 1) / dilation_height_factor); + std::max(0, (-in_y_origin + dilation_height_factor - 1) / dilation_height_factor); const int filter_y_end = - std::min(filter_height, (input_height - in_y_origin + dilation_height_factor - 1) / - dilation_height_factor); + std::min(filter_height, (input_height - in_y_origin + dilation_height_factor - 1) / + dilation_height_factor); for (int out_x_buffer_start = 0; out_x_buffer_start < output_width; out_x_buffer_start += kOutputPixelsInAccBuffer) { const int out_x_buffer_end = - std::min(output_width, out_x_buffer_start + kOutputPixelsInAccBuffer); + std::min(output_width, out_x_buffer_start + kOutputPixelsInAccBuffer); // We call a 'pixel' a group of activation that share all but the // 'depth'/'channel' coordinate. num_output_pixels is the number of // output pixels that we will accumulate in this loop iteration. @@ -1952,7 +1984,7 @@ inline void DepthwiseConvGeneral(const DepthwiseConvParams ¶ms, const Shape filter_data + filter_y * filter_height_stride, filter_offset, out_x_buffer_start, out_x_buffer_end, output_depth, acc_buffer); } - // Finished accumulating int32 values. Now need to convert them to + // Finished accumulating int32_t values. Now need to convert them to // the final 8bit form and store them. 
const int num_output_values = output_depth * num_output_pixels; int i = 0; @@ -2113,9 +2145,111 @@ inline void DepthwiseConvGeneral(const DepthwiseConvParams ¶ms, const Shape } } } + output_ptr += batch_step; } } +} // namespace depthwise_conv + +// template <DepthwiseConvOutputRounding kOutputRounding> +inline void DepthwiseConvWithRounding(const DepthwiseConvParams ¶ms, const Shape &input_shape, + const uint8_t *input_data, const Shape &filter_shape, + const uint8_t *filter_data, const Shape &bias_shape, + const int32_t *bias_data, const Shape &output_shape, + uint8_t *output_data, int thread_start, int thread_end, + int thread_dim) +{ + const int depth_multiplier = params.depth_multiplier; + const int32_t output_activation_min = params.quantized_activation_min; + const int32_t output_activation_max = params.quantized_activation_max; + const int dilation_width_factor = params.dilation_width_factor; + const int dilation_height_factor = params.dilation_height_factor; + assert(dilation_width_factor >= 1); + assert(dilation_height_factor >= 1); + assert(input_shape.DimensionsCount() == 4); + assert(filter_shape.DimensionsCount() == 4); + assert(output_shape.DimensionsCount() == 4); + assert(output_activation_min <= output_activation_max); + const int output_depth = MatchingDim(filter_shape, 3, output_shape, 3); + const int input_depth = input_shape.Dims(3); + assert(output_depth == input_depth * depth_multiplier); + assert(bias_shape.FlatSize() == output_depth); + + UNUSED_RELEASE(depth_multiplier); + UNUSED_RELEASE(output_activation_min); + UNUSED_RELEASE(output_activation_max); + UNUSED_RELEASE(dilation_width_factor); + UNUSED_RELEASE(dilation_height_factor); + UNUSED_RELEASE(output_depth); + UNUSED_RELEASE(input_depth); + +// Enable for arm64 except for the Nvidia Linux 4 Tegra (L4T) running on +// Jetson TX-2. This compiler does not support the offsetof() macro. +#if defined(__aarch64__) && !defined(GOOGLE_L4T) +// TODO Use below codes +// // Dispatch to dot-product 3x3 kernels when supported. +// +// ruy::Context *ruy_context = cpu_backend_context->ruy_context(); +// const bool has_dot_product_instructions = +// ruy_context != nullptr && +// (ruy_context->GetRuntimeEnabledPaths() & ruy::Path::kNeonDotprod) != ruy::Path::kNone; +// if (has_dot_product_instructions) +// { +// using optimized_ops::depthwise_conv::DotProduct3x3KernelType; +// DotProduct3x3KernelType kernel_type = +// optimized_ops::depthwise_conv::CategorizeDotProductKernel( +// input_shape, filter_shape, params); +// if (kernel_type != DotProduct3x3KernelType::kNone) +// { +// optimized_ops::depthwise_conv::DepthwiseConvDotProduct3x3< +// DepthwiseConvImplementation::kUseNeon3x3DotProduct>(params, input_shape, input_data, +// filter_shape, filter_data, +// bias_shape, +// bias_data, output_shape, +// output_data); +// return; +// } +// } +// +// // Dispatch to non-dot-product 3x3 kernels when supported. +// +// const int stride_width = params.stride_width; +// const int stride_height = params.stride_height; +// const int pad_width = params.padding_values.width; +// const int pad_height = params.padding_values.height; +// const int output_shift = params.output_shift; +// +// // Call kernel optimized for depthwise convolutions using 3x3 filters if +// // parameters are supported. 
+// if (depthwise_conv::Fast3x3FilterKernelSupported(input_shape, filter_shape, stride_width, +// stride_height, dilation_width_factor, +// dilation_height_factor, pad_width, pad_height, +// depth_multiplier, output_shape, output_shift)) +// { +// depthwise_conv::DepthwiseConv3x3Filter<kOutputRounding>( +// params, input_shape, input_data, filter_shape, filter_data, bias_shape, bias_data, +// output_shape, output_data, thread_start, thread_end, thread_dim); +// return; +// } +#endif + + depthwise_conv::DepthwiseConvGeneral(params, input_shape, input_data, filter_shape, filter_data, + bias_shape, bias_data, output_shape, output_data, + thread_start, thread_end, thread_dim); +} + +inline void DepthwiseConvImpl(const DepthwiseConvParams ¶ms, const Shape &input_shape, + const uint8_t *input_data, const Shape &filter_shape, + const uint8_t *filter_data, const Shape &bias_shape, + const int32_t *bias_data, const Shape &output_shape, + uint8_t *output_data, int thread_start, int thread_end, + int thread_dim) +{ + return DepthwiseConvWithRounding(params, input_shape, input_data, filter_shape, filter_data, + bias_shape, bias_data, output_shape, output_data, thread_start, + thread_end, thread_dim); +} + } // namespace optimized } // namespace cker } // namespace nnfw diff --git a/compute/cker/include/cker/operation/optimized/Gemm.h b/compute/cker/include/cker/operation/optimized/Gemm.h new file mode 100644 index 000000000..cfebef452 --- /dev/null +++ b/compute/cker/include/cker/operation/optimized/Gemm.h @@ -0,0 +1,100 @@ +/* + * Copyright (c) 2023 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2019 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __NNFW_CKER_OPTIMIZED_GEMM_H__ +#define __NNFW_CKER_OPTIMIZED_GEMM_H__ + +#include "cker/eigen/eigen_gemm_eigen.h" +#include "cker/Shape.h" +#include "cker/Types.h" + +#include <ruy/context.h> + +namespace nnfw +{ +namespace cker +{ +namespace optimized +{ + +#if defined(CKER_X86_PLATFORM) + +/* From tensorflow/tensorflow/lite/kernels/cpu_backend_gemm_x86.h */ +template <typename LhsScalar, typename RhsScalar, typename AccumScalar, typename DstScalar, + QuantizationFlavor quantization_flavor> +struct GemmImplX86 +{ + static void Run(const MatrixParams<LhsScalar> &, const LhsScalar *, + const MatrixParams<RhsScalar> &, const RhsScalar *, + const MatrixParams<DstScalar> &, DstScalar *, + const GemmParams<AccumScalar, DstScalar, quantization_flavor> &) + { + static_assert( + std::is_floating_point<LhsScalar>::value && std::is_floating_point<RhsScalar>::value && + std::is_floating_point<AccumScalar>::value && std::is_floating_point<DstScalar>::value && + quantization_flavor != QuantizationFlavor::kFloatingPoint, + "GemmImplX86 does not supported types other than float yet."); + } +}; + +// For float, defer to eigen for now. 
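+// The specialization below simply forwards to detail::GemmImplUsingEigen, so on
+// CKER_X86_PLATFORM a float GEMM issued through the Gemm() entry point defined
+// further down ends up in Eigen. Illustrative call, assuming the float
+// MatrixParams/GemmParams have already been filled in by the caller:
+//   optimized::Gemm(lhs_params, lhs_data, rhs_params, rhs_data, dst_params,
+//                   dst_data, gemm_params);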
+template <> struct GemmImplX86<float, float, float, float, QuantizationFlavor::kFloatingPoint>
+{
+  static void Run(const MatrixParams<float> &lhs_params, const float *lhs_data,
+                  const MatrixParams<float> &rhs_params, const float *rhs_data,
+                  const MatrixParams<float> &dst_params, float *dst_data,
+                  const GemmParams<float, float, QuantizationFlavor::kFloatingPoint> &params)
+  {
+    detail::GemmImplUsingEigen::Run(lhs_params, lhs_data, rhs_params, rhs_data, dst_params,
+                                    dst_data, params);
+  }
+};
+
+/* From tensorflow/tensorflow/lite/kernels/cpu_backend_gemm.h */
+/* GEMM dispatch implementation for x86.
+ */
+template <typename LhsScalar, typename RhsScalar, typename AccumScalar, typename DstScalar,
+          QuantizationFlavor quantization_flavor>
+struct GemmImpl : GemmImplX86<LhsScalar, RhsScalar, AccumScalar, DstScalar, quantization_flavor>
+{
+};
+
+/* From tensorflow/tensorflow/lite/kernels/cpu_backend_gemm.h */
+template <typename LhsScalar, typename RhsScalar, typename AccumScalar, typename DstScalar,
+          QuantizationFlavor quantization_flavor>
+void Gemm(const MatrixParams<LhsScalar> &lhs_params, const LhsScalar *lhs_data,
+          const MatrixParams<RhsScalar> &rhs_params, const RhsScalar *rhs_data,
+          const MatrixParams<DstScalar> &dst_params, DstScalar *dst_data,
+          const GemmParams<AccumScalar, DstScalar, quantization_flavor> &params)
+{
+  // Generic case: dispatch to any backend as a general GEMM.
+  GemmImpl<LhsScalar, RhsScalar, AccumScalar, DstScalar, quantization_flavor>::Run(
+    lhs_params, lhs_data, rhs_params, rhs_data, dst_params, dst_data, params);
+}
+
+// From tensorflow/tensorflow/lite/kernels/cpu_backend_gemm_params.h
+inline CachePolicy DefaultCachePolicy(bool is_constant_data)
+{
+  return is_constant_data ? CachePolicy::kCacheIfLargeSpeedup : CachePolicy::kNeverCache;
+}
+#endif // CKER_X86_PLATFORM
+
+} // namespace optimized
+} // namespace cker
+} // namespace nnfw
+
+#endif // __NNFW_CKER_OPTIMIZED_GEMM_H__
diff --git a/compute/cker/include/cker/operation/optimized/OptimizedUtils.h b/compute/cker/include/cker/operation/optimized/OptimizedUtils.h
index ae1f9e78e..f5edc94ab 100644
--- a/compute/cker/include/cker/operation/optimized/OptimizedUtils.h
+++ b/compute/cker/include/cker/operation/optimized/OptimizedUtils.h
@@ -111,7 +111,7 @@ inline void ExtractPatchIntoBufferColumn(const Shape &input_shape, int w, int h,
   {
     const int bottom_row_elements = (bottom_padding * kwidth * in_depth);
     const int bottom_start =
-      output_row_offset + ((top_padding + (ih_end - ih_start)) * kwidth * in_depth);
+      output_row_offset + ((top_padding + (ih_end - ih_start)) * kwidth * in_depth);
     memset(conv_buffer_data + bottom_start, zero_byte, (bottom_row_elements * sizeof(T)));
   }
 }
@@ -159,7 +159,7 @@ void DilatedIm2col(const ConvParams &params, const Shape &input_shape, const T *
   for (int batch = 0; batch < batches; ++batch)
   {
     const T zero_byte =
-      zero_bytes_len > 1 ? static_cast<T>(zero_bytes[batch]) : static_cast<T>(zero_bytes[0]);
+      zero_bytes_len > 1 ? static_cast<T>(zero_bytes[batch]) : static_cast<T>(zero_bytes[0]);
     for (int out_y = 0; out_y < output_height; ++out_y)
     {
       for (int out_x = 0; out_x < output_width; ++out_x)
diff --git a/compute/cker/include/cker/operation/optimized/integer_ops/DepthwiseConvInt8.h b/compute/cker/include/cker/operation/optimized/integer_ops/DepthwiseConvInt8.h
new file mode 100644
index 000000000..bd8497920
--- /dev/null
+++ b/compute/cker/include/cker/operation/optimized/integer_ops/DepthwiseConvInt8.h
@@ -0,0 +1,2138 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd.
All Rights Reserved + * Copyright 2017 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __NNFW_CKER_OPTIMIZED_DEPTHWISE_CONV_INT8_H__ +#define __NNFW_CKER_OPTIMIZED_DEPTHWISE_CONV_INT8_H__ + +#include "cker/CpuBackendThreadpool.h" +#include "cker/Shape.h" +#include "cker/Types.h" +#include "cker/Utils.h" +#include "cker/neon/neon_check.h" +#include "cker/operation/Quantize.h" + +#include <fixedpoint/fixedpoint.h> +#include <public/gemmlowp.h> + +namespace nnfw +{ +namespace cker +{ +namespace optimized_integer_ops +{ + +// Category of depthwise convolution output rounding. +enum class DepthwiseConvOutputRounding +{ + kNone = 0, // Invalid: specific method must be specified. + kAwayFromZero, // Original method: exact halves rounded away from zero. + kUpward, // Halves towards +infinity: adds 0.5 before truncate. + // This is where a future kNearestEven would be placed. +}; + +// Category of depthwise convolution depth multiplication. +enum class DepthwiseConvDepthMultiplication +{ + kNoMultiplication = 0, // Depth multiplier = 1. + kUnitInputDepth, // Input depth = 1, output depth = depth multiplier. +}; + +namespace depthwise_conv +{ + +// Implementation of quantized DepthwiseConv + +template <bool kAllowStrided, int kFixedInputDepth, int kFixedDepthMultiplier> +struct QuantizedDepthwiseConvKernel +{ +}; + +#ifdef USE_NEON +template <> struct QuantizedDepthwiseConvKernel<true, 8, 2> +{ + static void Run(int num_output_pixels, int /* input_depth */, int /* depth_multiplier */, + const int8_t *input_ptr, int16_t input_offset, int input_ptr_increment, + const int8_t *filter_ptr, int32_t *acc_buffer_ptr) + { + // Load the filters. + int8x8x2_t filter_s8; + filter_s8.val[0] = vld1_s8(filter_ptr); + filter_s8.val[1] = vld1_s8(filter_ptr + 8); + int16x8_t filter[2]; + for (int i = 0; i < 2; i++) + { + filter[i] = vmovl_s8(filter_s8.val[i]); + } + // Handle one output pixel at a time. + for (int outp = 0; outp < num_output_pixels; outp++) + { + // Load the accumulators from acc_buffer + int32x4x2_t acc[2]; + for (int i = 0; i < 2; i++) + { + acc[i].val[0] = vld1q_s32(acc_buffer_ptr + 4 * i); + acc[i].val[1] = vld1q_s32(acc_buffer_ptr + 4 * i + 8); + } + // Load the inputs, add input_offset. 
+ const int8x8_t input_s8 = vld1_s8(input_ptr); + input_ptr += input_ptr_increment; + const int16x8_t input_s16 = vmovl_s8(input_s8); + const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset)); + // Duplicate the input values, 2-fold + const int16x8x2_t input_dup2 = vzipq_s16(input, input); + // Multiply-accumulate + for (int i = 0; i < 2; i++) + { + acc[0].val[i] = + vmlal_s16(acc[0].val[i], vget_low_s16(filter[i]), vget_low_s16(input_dup2.val[i])); + acc[1].val[i] = + vmlal_s16(acc[1].val[i], vget_high_s16(filter[i]), vget_high_s16(input_dup2.val[i])); + } + // Store the accumulators back to acc_buffer + for (int i = 0; i < 2; i++) + { + vst1q_s32(acc_buffer_ptr + 4 * i, acc[i].val[0]); + vst1q_s32(acc_buffer_ptr + 4 * i + 8, acc[i].val[1]); + } + acc_buffer_ptr += 16; + } + } +}; + +template <> struct QuantizedDepthwiseConvKernel<false, 8, 1> +{ + static void Run(int num_output_pixels, int /* input_depth */, int /* depth_multiplier */, + const int8_t *input_ptr, int16_t input_offset, int /* input_ptr_increment */, + const int8_t *filter_ptr, int32_t *acc_buffer_ptr) + { + // Load the filters. + const int8x8_t filter_s8 = vld1_s8(filter_ptr); + const int16x8_t filter = vmovl_s8(filter_s8); + + int outp = 0; + // Handle 2 output pixels at a time. + for (; outp <= num_output_pixels - 2; outp += 2) + { + // Load the accumulators from acc_buffer. + int32x4_t acc[4]; + for (int i = 0; i < 4; i++) + { + acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i); + } + // Load the inputs, add input_offset. + int8x8_t input_s8[2]; + for (int i = 0; i < 2; i++) + { + input_s8[i] = vld1_s8(input_ptr + 8 * i); + } + input_ptr += 16; + int16x8_t input[2]; + for (int i = 0; i < 2; i++) + { + input[i] = vmovl_s8(input_s8[i]); + } + for (int i = 0; i < 2; i++) + { + input[i] = vaddq_s16(input[i], vdupq_n_s16(input_offset)); + } + // Multiply-accumulate. + acc[0] = vmlal_s16(acc[0], vget_low_s16(filter), vget_low_s16(input[0])); + acc[1] = vmlal_s16(acc[1], vget_high_s16(filter), vget_high_s16(input[0])); + acc[2] = vmlal_s16(acc[2], vget_low_s16(filter), vget_low_s16(input[1])); + acc[3] = vmlal_s16(acc[3], vget_high_s16(filter), vget_high_s16(input[1])); + // Store the accumulators back to acc_buffer + for (int i = 0; i < 4; i++) + { + vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 16; + } + // Handle 1 output pixel at a time. + for (; outp < num_output_pixels; outp++) + { + // Load the accumulators from acc_buffer. + int32x4_t acc[2]; + acc[0] = vld1q_s32(acc_buffer_ptr); + acc[1] = vld1q_s32(acc_buffer_ptr + 4); + + // Load the inputs, add input_offset. + const int8x8_t input_s8 = vld1_s8(input_ptr); + input_ptr += 8; + const int16x8_t input_s16 = vmovl_s8(input_s8); + const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset)); + // Multiply-accumulate. + acc[0] = vmlal_s16(acc[0], vget_low_s16(filter), vget_low_s16(input)); + acc[1] = vmlal_s16(acc[1], vget_high_s16(filter), vget_high_s16(input)); + // Store the accumulators back to acc_buffer + vst1q_s32(acc_buffer_ptr, acc[0]); + vst1q_s32(acc_buffer_ptr + 4, acc[1]); + acc_buffer_ptr += 8; + } + } +}; + +template <> struct QuantizedDepthwiseConvKernel<false, 4, 2> +{ + static void Run(int num_output_pixels, int /* input_depth */, int /* depth_multiplier */, + const int8_t *input_ptr, int16_t input_offset, int /* input_ptr_increment */, + const int8_t *filter_ptr, int32_t *acc_buffer_ptr) + { + // Load the filters. 
+ const int8x8_t filter_s8 = vld1_s8(filter_ptr); + const int16x8_t filter = vmovl_s8(filter_s8); + + int outp = 0; + // Handle 2 output pixels at a time. + for (; outp <= num_output_pixels - 2; outp += 2) + { + // Load the accumulators from acc_buffer + int32x4_t acc[4]; + for (int i = 0; i < 4; i++) + { + acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i); + } + // Load the inputs, add input_offset. + const int8x8_t input_s8 = vld1_s8(input_ptr); + input_ptr += 8; + const int16x8_t input_s16 = vmovl_s8(input_s8); + const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset)); + // Duplicate the input values, 2-fold + const int16x8x2_t input_dup2 = vzipq_s16(input, input); + // Multiply-accumulate + for (int i = 0; i < 2; i++) + { + acc[2 * i + 0] = + vmlal_s16(acc[2 * i + 0], vget_low_s16(filter), vget_low_s16(input_dup2.val[i])); + acc[2 * i + 1] = + vmlal_s16(acc[2 * i + 1], vget_high_s16(filter), vget_high_s16(input_dup2.val[i])); + } + // Store the accumulators back to acc_buffer + for (int i = 0; i < 4; i++) + { + vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 16; + } + // Handle one output pixel at a time. + for (; outp < num_output_pixels; outp++) + { + // Load the accumulators from acc_buffer + int32x4_t acc[2]; + for (int i = 0; i < 2; i++) + { + acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i); + } + // Load the inputs, add input_offset. + int8x8_t input_s8 = vdup_n_s8(0); + input_s8 = vset_lane_s8(input_ptr[0], input_s8, 0); + input_s8 = vset_lane_s8(input_ptr[1], input_s8, 1); + input_s8 = vset_lane_s8(input_ptr[2], input_s8, 2); + input_s8 = vset_lane_s8(input_ptr[3], input_s8, 3); + input_ptr += 4; + const int16x4_t input_s16 = vget_low_s16(vmovl_s8(input_s8)); + const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset)); + // Duplicate the input values, 2-fold + const int16x4x2_t input_dup2 = vzip_s16(input, input); + // Multiply-accumulate + acc[0] = vmlal_s16(acc[0], vget_low_s16(filter), input_dup2.val[0]); + acc[1] = vmlal_s16(acc[1], vget_high_s16(filter), input_dup2.val[1]); + // Store the accumulators back to acc_buffer + for (int i = 0; i < 2; i++) + { + vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 8; + } + } +}; + +template <> struct QuantizedDepthwiseConvKernel<false, 2, 8> +{ + static void Run(int num_output_pixels, int /* input_depth */, int /* depth_multiplier */, + const int8_t *input_ptr, int16_t input_offset, int /* input_ptr_increment */, + const int8_t *filter_ptr, int32_t *acc_buffer_ptr) + { + // Load the filters. + int16x8_t filter[2]; + for (int i = 0; i < 2; i++) + { + const int8x8_t filter_s8 = vld1_s8(filter_ptr + 8 * i); + filter[i] = vmovl_s8(filter_s8); + } + int outp = 0; + // Handle two output pixels at a time. + for (; outp <= num_output_pixels - 2; outp += 2) + { + // Load the accumulators from acc_buffer. + int32x4_t acc[8]; + for (int i = 0; i < 8; i++) + { + acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i); + } + // Load the inputs, add input_offset. + int8x8_t input_s8 = vdup_n_s8(0); + input_s8 = vset_lane_s8(input_ptr[0], input_s8, 0); + input_s8 = vset_lane_s8(input_ptr[1], input_s8, 1); + input_s8 = vset_lane_s8(input_ptr[2], input_s8, 2); + input_s8 = vset_lane_s8(input_ptr[3], input_s8, 3); + input_ptr += 4; + const int16x4_t input_s16 = vget_low_s16(vmovl_s8(input_s8)); + const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset)); + // Multiply-accumulate. 
+ acc[0] = vmlal_lane_s16(acc[0], vget_low_s16(filter[0]), input, 0); + acc[1] = vmlal_lane_s16(acc[1], vget_high_s16(filter[0]), input, 0); + acc[2] = vmlal_lane_s16(acc[2], vget_low_s16(filter[1]), input, 1); + acc[3] = vmlal_lane_s16(acc[3], vget_high_s16(filter[1]), input, 1); + acc[4] = vmlal_lane_s16(acc[4], vget_low_s16(filter[0]), input, 2); + acc[5] = vmlal_lane_s16(acc[5], vget_high_s16(filter[0]), input, 2); + acc[6] = vmlal_lane_s16(acc[6], vget_low_s16(filter[1]), input, 3); + acc[7] = vmlal_lane_s16(acc[7], vget_high_s16(filter[1]), input, 3); + // Store the accumulators back to acc_buffer. + for (int i = 0; i < 8; i++) + { + vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 32; + } + // Handle one output pixel at a time. + for (; outp < num_output_pixels; outp++) + { + // Load the accumulators from acc_buffer. + int32x4_t acc[4]; + for (int i = 0; i < 4; i++) + { + acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i); + } + // Load the inputs, add input_offset. + int8x8_t input_s8 = vdup_n_s8(0); + input_s8 = vset_lane_s8(input_ptr[0], input_s8, 0); + input_s8 = vset_lane_s8(input_ptr[1], input_s8, 1); + input_ptr += 2; + const int16x4_t input_s16 = vget_low_s16(vmovl_s8(input_s8)); + const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset)); + + // Multiply-accumulate. + acc[0] = vmlal_lane_s16(acc[0], vget_low_s16(filter[0]), input, 0); + acc[1] = vmlal_lane_s16(acc[1], vget_high_s16(filter[0]), input, 0); + acc[2] = vmlal_lane_s16(acc[2], vget_low_s16(filter[1]), input, 1); + acc[3] = vmlal_lane_s16(acc[3], vget_high_s16(filter[1]), input, 1); + + // Store the accumulators back to acc_buffer. + for (int i = 0; i < 4; i++) + { + vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 16; + } + } +}; + +template <> struct QuantizedDepthwiseConvKernel<false, 2, 2> +{ + static void Run(int num_output_pixels, int /* input_depth */, int /* depth_multiplier */, + const int8_t *input_ptr, int16_t input_offset, int /* input_ptr_increment */, + const int8_t *filter_ptr, int32_t *acc_buffer_ptr) + { + // Load the filters. + int8x8_t filter_s8 = vdup_n_s8(0); + filter_s8 = vset_lane_s8(filter_ptr[0], filter_s8, 0); + filter_s8 = vset_lane_s8(filter_ptr[1], filter_s8, 1); + filter_s8 = vset_lane_s8(filter_ptr[2], filter_s8, 2); + filter_s8 = vset_lane_s8(filter_ptr[3], filter_s8, 3); + const int16x4_t filter = vget_low_s16(vmovl_s8(filter_s8)); + + int outp = 0; + // Handle 4 output pixels at a time. + for (; outp <= num_output_pixels - 4; outp += 4) + { + // Load the accumulators from acc_buffer + int32x4_t acc[4]; + for (int i = 0; i < 4; i++) + { + acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i); + } + + // Load the inputs, add input_offset. + const int8x8_t input_s8 = vld1_s8(input_ptr); + input_ptr += 8; + const int16x8_t input_s16 = vmovl_s8(input_s8); + const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset)); + // Duplicate the input values, 2-fold + const int16x8x2_t input_dup2 = vzipq_s16(input, input); + // Multiply-accumulate + acc[0] = vmlal_s16(acc[0], filter, vget_low_s16(input_dup2.val[0])); + acc[1] = vmlal_s16(acc[1], filter, vget_high_s16(input_dup2.val[0])); + acc[2] = vmlal_s16(acc[2], filter, vget_low_s16(input_dup2.val[1])); + acc[3] = vmlal_s16(acc[3], filter, vget_high_s16(input_dup2.val[1])); + // Store the accumulators back to acc_buffer + for (int i = 0; i < 4; i++) + { + vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 16; + } + // Handle one output pixel at a time. 
+ for (; outp < num_output_pixels; outp++) + { + // Load the accumulators from acc_buffer + int32x4_t acc = vld1q_s32(acc_buffer_ptr); + + int8x8_t input_s8 = vdup_n_s8(0); + input_s8 = vset_lane_s8(input_ptr[0], input_s8, 0); + input_s8 = vset_lane_s8(input_ptr[1], input_s8, 1); + input_ptr += 2; + const int16x4_t input_s16 = vget_low_s16(vmovl_s8(input_s8)); + const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset)); + // Duplicate the input values, 2-fold + const int16x4_t input_dup2 = vzip_s16(input, input).val[0]; + // Multiply-accumulate + acc = vmlal_s16(acc, filter, input_dup2); + // Store the accumulators back to acc_buffer + vst1q_s32(acc_buffer_ptr, acc); + acc_buffer_ptr += 4; + } + } +}; + +template <> struct QuantizedDepthwiseConvKernel<false, 2, 1> +{ + static void Run(int num_output_pixels, int /* input_depth */, int /* depth_multiplier */, + const int8_t *input_ptr, int16_t input_offset, int /* input_ptr_increment */, + const int8_t *filter_ptr, int32_t *acc_buffer_ptr) + { + // Load the filters. + int8x8_t filter_s8 = vdup_n_s8(0); + filter_s8 = vset_lane_s8(filter_ptr[0], filter_s8, 0); + filter_s8 = vset_lane_s8(filter_ptr[1], filter_s8, 1); + filter_s8 = vset_lane_s8(filter_ptr[0], filter_s8, 2); + filter_s8 = vset_lane_s8(filter_ptr[1], filter_s8, 3); + const int16x4_t filter = vget_low_s16(vmovl_s8(filter_s8)); + + int outp = 0; + // Handle 8 output pixels at a time. + for (; outp <= num_output_pixels - 8; outp += 8) + { + // Load the accumulators from acc_buffer. + int32x4_t acc[4]; + for (int i = 0; i < 4; i++) + { + acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i); + } + // Load the inputs, add input_offset. + int8x8_t input_s8[2]; + for (int i = 0; i < 2; i++) + { + input_s8[i] = vld1_s8(input_ptr + 8 * i); + } + input_ptr += 16; + int16x8_t input[2]; + for (int i = 0; i < 2; i++) + { + input[i] = vmovl_s8(input_s8[i]); + } + for (int i = 0; i < 2; i++) + { + input[i] = vaddq_s16(input[i], vdupq_n_s16(input_offset)); + } + + // Multiply-accumulate. + acc[0] = vmlal_s16(acc[0], filter, vget_low_s16(input[0])); + acc[1] = vmlal_s16(acc[1], filter, vget_high_s16(input[0])); + acc[2] = vmlal_s16(acc[2], filter, vget_low_s16(input[1])); + acc[3] = vmlal_s16(acc[3], filter, vget_high_s16(input[1])); + // Store the accumulators back to acc_buffer. + for (int i = 0; i < 4; i++) + { + vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 16; + } + // Handle 4 output pixels at a time. + for (; outp <= num_output_pixels - 4; outp += 4) + { + // Load the accumulators from acc_buffer. + int32x4_t acc[2]; + for (int i = 0; i < 2; i++) + { + acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i); + } + // Load the inputs, add input_offset. + const int8x8_t input_s8 = vld1_s8(input_ptr); + input_ptr += 8; + const int16x8_t input_s16 = vmovl_s8(input_s8); + const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset)); + + // Multiply-accumulate. + acc[0] = vmlal_s16(acc[0], filter, vget_low_s16(input)); + acc[1] = vmlal_s16(acc[1], filter, vget_high_s16(input)); + // Store the accumulators back to acc_buffer. + for (int i = 0; i < 2; i++) + { + vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 8; + } + // Handle 2 output pixels at a time. + for (; outp <= num_output_pixels - 2; outp += 2) + { + // Load the accumulators from acc_buffer. + int32x4_t acc = vld1q_s32(acc_buffer_ptr); + // Load the inputs, add input_offset. 
+ int8x8_t input_s8 = vdup_n_s8(0); + input_s8 = vset_lane_s8(input_ptr[0], input_s8, 0); + input_s8 = vset_lane_s8(input_ptr[1], input_s8, 1); + input_s8 = vset_lane_s8(input_ptr[2], input_s8, 2); + input_s8 = vset_lane_s8(input_ptr[3], input_s8, 3); + input_ptr += 4; + const int16x4_t input_s16 = vget_low_s16(vmovl_s8(input_s8)); + const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset)); + + // Multiply-accumulate. + acc = vmlal_s16(acc, filter, input); + // Store the accumulators back to acc_buffer. + vst1q_s32(acc_buffer_ptr, acc); + acc_buffer_ptr += 4; + } + // Handle 1 output pixel at a time. + for (; outp < num_output_pixels; outp++) + { + // Load the accumulators from acc_buffer. + int32x2_t acc = vld1_s32(acc_buffer_ptr); + // Load the inputs, add input_offset. + int8x8_t input_s8 = vdup_n_s8(0); + input_s8 = vset_lane_s8(input_ptr[0], input_s8, 0); + input_s8 = vset_lane_s8(input_ptr[1], input_s8, 1); + input_ptr += 2; + const int16x4_t input_s16 = vget_low_s16(vmovl_s8(input_s8)); + const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset)); + + // Multiply-accumulate. + acc = vget_low_s32(vmlal_s16(vcombine_s32(acc, acc), filter, input)); + // Store the accumulators back to acc_buffer. + vst1_s32(acc_buffer_ptr, acc); + acc_buffer_ptr += 2; + } + } +}; + +template <> struct QuantizedDepthwiseConvKernel<false, 1, 2> +{ + static void Run(int num_output_pixels, int /* input_depth */, int /* depth_multiplier */, + const int8_t *input_ptr, int16_t input_offset, int /* input_ptr_increment */, + const int8_t *filter_ptr, int32_t *acc_buffer_ptr) + { + // Load the filters. + int8x8_t filter_s8 = vdup_n_s8(0); + filter_s8 = vset_lane_s8(filter_ptr[0], filter_s8, 0); + filter_s8 = vset_lane_s8(filter_ptr[1], filter_s8, 1); + filter_s8 = vset_lane_s8(filter_ptr[0], filter_s8, 2); + filter_s8 = vset_lane_s8(filter_ptr[1], filter_s8, 3); + const int16x4_t filter = vget_low_s16(vmovl_s8(filter_s8)); + + int outp = 0; + // Handle 8 output pixels at a time. + for (; outp <= num_output_pixels - 8; outp += 8) + { + // Load the accumulators from acc_buffer + int32x4_t acc[4]; + for (int i = 0; i < 4; i++) + { + acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i); + } + + // Load the inputs, add input_offset. + const int8x8_t input_s8 = vld1_s8(input_ptr); + input_ptr += 8; + const int16x8_t input_s16 = vmovl_s8(input_s8); + const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset)); + // Duplicate the input values, 2-fold + const int16x8x2_t input_dup2 = vzipq_s16(input, input); + // Multiply-accumulate + acc[0] = vmlal_s16(acc[0], filter, vget_low_s16(input_dup2.val[0])); + acc[1] = vmlal_s16(acc[1], filter, vget_high_s16(input_dup2.val[0])); + acc[2] = vmlal_s16(acc[2], filter, vget_low_s16(input_dup2.val[1])); + acc[3] = vmlal_s16(acc[3], filter, vget_high_s16(input_dup2.val[1])); + // Store the accumulators back to acc_buffer + for (int i = 0; i < 4; i++) + { + vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 16; + } + // Handle one output pixel at a time. + for (; outp < num_output_pixels; outp++) + { + // Load the accumulators from acc_buffer + int32x2_t acc = vld1_s32(acc_buffer_ptr); + + // Load the inputs, add input_offset. 
+ const uint32_t input = *input_ptr++ + input_offset; + + // Multiply-accumulate + acc = vget_low_s32(vmlal_n_s16(vcombine_s32(acc, acc), filter, input)); + // Store the accumulators back to acc_buffer + vst1_s32(acc_buffer_ptr, acc); + acc_buffer_ptr += 2; + } + } +}; + +template <> struct QuantizedDepthwiseConvKernel<false, 1, 4> +{ + static void Run(int num_output_pixels, int /* input_depth */, int /* depth_multiplier */, + const int8_t *input_ptr, int16_t input_offset, int /* input_ptr_increment */, + const int8_t *filter_ptr, int32_t *acc_buffer_ptr) + { + // Load the filters. + int8x8_t filter_s8 = vdup_n_s8(0); + filter_s8 = vset_lane_s8(filter_ptr[0], filter_s8, 0); + filter_s8 = vset_lane_s8(filter_ptr[1], filter_s8, 1); + filter_s8 = vset_lane_s8(filter_ptr[2], filter_s8, 2); + filter_s8 = vset_lane_s8(filter_ptr[3], filter_s8, 3); + const int16x4_t filter = vget_low_s16(vmovl_s8(filter_s8)); + + int outp = 0; + // Handle 8 output pixels at a time. + for (; outp <= num_output_pixels - 8; outp += 8) + { + // Load the accumulators from acc_buffer + int32x4_t acc[8]; + for (int i = 0; i < 8; i++) + { + acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i); + } + + // Load the inputs, add input_offset. + int8x8_t input_s8 = vld1_s8(input_ptr); + input_ptr += 8; + const int16x8_t input_s16 = vmovl_s8(input_s8); + const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset)); + + // Multiply-accumulate + acc[0] = vmlal_lane_s16(acc[0], filter, vget_low_s16(input), 0); + acc[1] = vmlal_lane_s16(acc[1], filter, vget_low_s16(input), 1); + acc[2] = vmlal_lane_s16(acc[2], filter, vget_low_s16(input), 2); + acc[3] = vmlal_lane_s16(acc[3], filter, vget_low_s16(input), 3); + acc[4] = vmlal_lane_s16(acc[4], filter, vget_high_s16(input), 0); + acc[5] = vmlal_lane_s16(acc[5], filter, vget_high_s16(input), 1); + acc[6] = vmlal_lane_s16(acc[6], filter, vget_high_s16(input), 2); + acc[7] = vmlal_lane_s16(acc[7], filter, vget_high_s16(input), 3); + + // Store the accumulators back to acc_buffer + for (int i = 0; i < 8; i++) + { + vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 32; + } + // Handle 4 output pixels at a time. + for (; outp <= num_output_pixels - 4; outp += 4) + { + // Load the accumulators from acc_buffer + int32x4_t acc[4]; + for (int i = 0; i < 4; i++) + { + acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i); + } + + // Load the inputs, add input_offset. + int8x8_t input_s8 = vdup_n_s8(0); + input_s8 = vset_lane_s8(input_ptr[0], input_s8, 0); + input_s8 = vset_lane_s8(input_ptr[1], input_s8, 1); + input_s8 = vset_lane_s8(input_ptr[2], input_s8, 2); + input_s8 = vset_lane_s8(input_ptr[3], input_s8, 3); + input_ptr += 4; + const int16x4_t input_s16 = vget_low_s16(vmovl_s8(input_s8)); + const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset)); + + // Multiply-accumulate + acc[0] = vmlal_lane_s16(acc[0], filter, input, 0); + acc[1] = vmlal_lane_s16(acc[1], filter, input, 1); + acc[2] = vmlal_lane_s16(acc[2], filter, input, 2); + acc[3] = vmlal_lane_s16(acc[3], filter, input, 3); + + // Store the accumulators back to acc_buffer + for (int i = 0; i < 4; i++) + { + vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 16; + } + // Handle one output pixel at a time. + for (; outp < num_output_pixels; outp++) + { + // Load the accumulators from acc_buffer + int32x4_t acc = vld1q_s32(acc_buffer_ptr); + + // Load the inputs, add input_offset. 
+ const uint32_t input = *input_ptr++ + input_offset; + + // Multiply-accumulate + acc = vmlal_n_s16(acc, filter, input); + // Store the accumulators back to acc_buffer + vst1q_s32(acc_buffer_ptr, acc); + acc_buffer_ptr += 4; + } + } +}; + +template <> struct QuantizedDepthwiseConvKernel<false, 4, 1> +{ + static void Run(int num_output_pixels, int /* input_depth */, int /* depth_multiplier */, + const int8_t *input_ptr, int16_t input_offset, int /* input_ptr_increment */, + const int8_t *filter_ptr, int32_t *acc_buffer_ptr) + { + // Load the filters. + int8x8_t filter_s8 = vdup_n_s8(0); + filter_s8 = vset_lane_s8(filter_ptr[0], filter_s8, 0); + filter_s8 = vset_lane_s8(filter_ptr[1], filter_s8, 1); + filter_s8 = vset_lane_s8(filter_ptr[2], filter_s8, 2); + filter_s8 = vset_lane_s8(filter_ptr[3], filter_s8, 3); + const int16x4_t filter = vget_low_s16(vmovl_s8(filter_s8)); + + int outp = 0; + // Handle 4 output pixels at a time. + for (; outp <= num_output_pixels - 4; outp += 4) + { + // Load the accumulators from acc_buffer + int32x4_t acc[4]; + for (int i = 0; i < 4; i++) + { + acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i); + } + // Load the inputs, add input_offset. + int16x8_t input[2]; + for (int i = 0; i < 2; i++) + { + const int8x8_t input_s8 = vld1_s8(input_ptr + 8 * i); + const int16x8_t input_s16 = vmovl_s8(input_s8); + input[i] = vaddq_s16(input_s16, vdupq_n_s16(input_offset)); + } + input_ptr += 16; + // Multiply-accumulate + for (int i = 0; i < 2; i++) + { + acc[2 * i + 0] = vmlal_s16(acc[2 * i + 0], filter, vget_low_s16(input[i])); + acc[2 * i + 1] = vmlal_s16(acc[2 * i + 1], filter, vget_high_s16(input[i])); + } + // Store the accumulators back to acc_buffer + for (int i = 0; i < 4; i++) + { + vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 16; + } + // Handle one output pixel at a time. + for (; outp < num_output_pixels; outp++) + { + // Load the accumulators from acc_buffer + int32x4_t acc; + acc = vld1q_s32(acc_buffer_ptr); + + // Load the inputs, add input_offset. + int8x8_t input_s8 = vdup_n_s8(0); + input_s8 = vset_lane_s8(input_ptr[0], input_s8, 0); + input_s8 = vset_lane_s8(input_ptr[1], input_s8, 1); + input_s8 = vset_lane_s8(input_ptr[2], input_s8, 2); + input_s8 = vset_lane_s8(input_ptr[3], input_s8, 3); + input_ptr += 4; + const int16x4_t input_s16 = vget_low_s16(vmovl_s8(input_s8)); + const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset)); + // Multiply-accumulate + acc = vmlal_s16(acc, filter, input); + // Store the accumulators back to acc_buffer + vst1q_s32(acc_buffer_ptr, acc); + acc_buffer_ptr += 4; + } + } +}; + +template <> struct QuantizedDepthwiseConvKernel<false, 4, 4> +{ + static void Run(int num_output_pixels, int /* input_depth */, int /* depth_multiplier */, + const int8_t *input_ptr, int16_t input_offset, int /* input_ptr_increment */, + const int8_t *filter_ptr, int32_t *acc_buffer_ptr) + { + // Load the filters. + int16x8_t filter[2]; + for (int i = 0; i < 2; i++) + { + const int8x8_t filter_s8 = vld1_s8(filter_ptr + 8 * i); + filter[i] = vmovl_s8(filter_s8); + } + + int outp = 0; + // Handle 2 output pixels at a time. + for (; outp <= num_output_pixels - 2; outp += 2) + { + // Load the accumulators from acc_buffer + int32x4_t acc[8]; + for (int i = 0; i < 8; i++) + { + acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i); + } + + // Load the inputs, add input_offset. 
+ int8x8_t input_s8 = vld1_s8(input_ptr); + input_ptr += 8; + const int16x8_t input_s16 = vmovl_s8(input_s8); + const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset)); + + // Multiply-accumulate + acc[0] = vmlal_lane_s16(acc[0], vget_low_s16(filter[0]), vget_low_s16(input), 0); + acc[1] = vmlal_lane_s16(acc[1], vget_high_s16(filter[0]), vget_low_s16(input), 1); + acc[2] = vmlal_lane_s16(acc[2], vget_low_s16(filter[1]), vget_low_s16(input), 2); + acc[3] = vmlal_lane_s16(acc[3], vget_high_s16(filter[1]), vget_low_s16(input), 3); + acc[4] = vmlal_lane_s16(acc[4], vget_low_s16(filter[0]), vget_high_s16(input), 0); + acc[5] = vmlal_lane_s16(acc[5], vget_high_s16(filter[0]), vget_high_s16(input), 1); + acc[6] = vmlal_lane_s16(acc[6], vget_low_s16(filter[1]), vget_high_s16(input), 2); + acc[7] = vmlal_lane_s16(acc[7], vget_high_s16(filter[1]), vget_high_s16(input), 3); + // Store the accumulators back to acc_buffer + for (int i = 0; i < 8; i++) + { + vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 32; + } + // Handle one output pixel at a time. + for (; outp < num_output_pixels; outp++) + { + // Load the accumulators from acc_buffer + int32x4_t acc[4]; + for (int i = 0; i < 4; i++) + { + acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i); + } + + // Load the inputs, add input_offset. + int8x8_t input_s8 = vdup_n_s8(0); + input_s8 = vset_lane_s8(input_ptr[0], input_s8, 0); + input_s8 = vset_lane_s8(input_ptr[1], input_s8, 1); + input_s8 = vset_lane_s8(input_ptr[2], input_s8, 2); + input_s8 = vset_lane_s8(input_ptr[3], input_s8, 3); + input_ptr += 4; + const int16x4_t input_s16 = vget_low_s16(vmovl_s8(input_s8)); + const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset)); + + // Multiply-accumulate + acc[0] = vmlal_lane_s16(acc[0], vget_low_s16(filter[0]), input, 0); + acc[1] = vmlal_lane_s16(acc[1], vget_high_s16(filter[0]), input, 1); + acc[2] = vmlal_lane_s16(acc[2], vget_low_s16(filter[1]), input, 2); + acc[3] = vmlal_lane_s16(acc[3], vget_high_s16(filter[1]), input, 3); + // Store the accumulators back to acc_buffer + for (int i = 0; i < 4; i++) + { + vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 16; + } + } +}; + +template <> struct QuantizedDepthwiseConvKernel<true, 0, 3> +{ + static void Run(int num_output_pixels, int input_depth, int /* depth_multiplier */, + const int8_t *input_ptr, int16_t input_offset, int input_ptr_increment, + const int8_t *filter_ptr, int32_t *acc_buffer_ptr) + { + // We will have to duplicate bytes in a NEON register, 3-fold. + // We will do that by register-level table-look-up using VTBL instructions. + // Here we prepare the registers containing the table-lookup indices. + static const int8_t dup3_indices_array[3][8] = { + {0, 0, 0, 1, 1, 1, 2, 2}, {2, 3, 3, 3, 4, 4, 4, 5}, {5, 5, 6, 6, 6, 7, 7, 7}}; + int8x8_t dup3_indices[3]; + for (int i = 0; i < 3; i++) + { + dup3_indices[i] = vld1_s8(dup3_indices_array[i]); + } + + // Handle one output pixel at a time. + for (int outp = 0; outp < num_output_pixels; outp++) + { + const int8_t *local_filter_ptr = filter_ptr; + const int8_t *local_input_ptr = input_ptr; + int ic = 0; + // Handle 8 input channels at a time. + for (; ic <= input_depth - 8; ic += 8) + { + // Load the filters. 
+ int16x8_t filter[3]; + int8x8x3_t filter_s8; + filter_s8.val[0] = vld1_s8(local_filter_ptr); + filter_s8.val[1] = vld1_s8(local_filter_ptr + 8); + filter_s8.val[2] = vld1_s8(local_filter_ptr + 16); + local_filter_ptr += 24; + for (int i = 0; i < 3; i++) + { + filter[i] = vmovl_s8(filter_s8.val[i]); + } + // Load the inputs, duplicate 3-fold, add input_offset. + const int8x8_t input_s8 = vld1_s8(local_input_ptr); + local_input_ptr += 8; + + int8x8_t input_s8_dup3[3]; + for (int i = 0; i < 3; i++) + { + input_s8_dup3[i] = vtbl1_s8(input_s8, dup3_indices[i]); + } + int16x8_t input_dup3[3]; + for (int i = 0; i < 3; i++) + { + const int16x8_t input_s16_dup3 = vmovl_s8(input_s8_dup3[i]); + input_dup3[i] = vaddq_s16(input_s16_dup3, vdupq_n_s16(input_offset)); + } + // Load the accumulators from acc_buffer + int32x4x3_t acc[2]; + for (int i = 0; i < 2; i++) + { + acc[i].val[0] = vld1q_s32(acc_buffer_ptr + 4 * i); + acc[i].val[1] = vld1q_s32(acc_buffer_ptr + 4 * i + 8); + acc[i].val[2] = vld1q_s32(acc_buffer_ptr + 4 * i + 16); + } + // Multiply-accumulate + for (int j = 0; j < 3; j++) + { + acc[0].val[j] = + vmlal_s16(acc[0].val[j], vget_low_s16(input_dup3[j]), vget_low_s16(filter[j])); + acc[1].val[j] = + vmlal_s16(acc[1].val[j], vget_high_s16(input_dup3[j]), vget_high_s16(filter[j])); + } + // Store the accumulators back to acc_buffer + for (int i = 0; i < 2; i++) + { + vst1q_s32(acc_buffer_ptr + 4 * i, acc[i].val[0]); + vst1q_s32(acc_buffer_ptr + 4 * i + 8, acc[i].val[1]); + vst1q_s32(acc_buffer_ptr + 4 * i + 16, acc[i].val[2]); + } + acc_buffer_ptr += 24; + } + // Handle one input channel at a time. + for (; ic < input_depth; ic++) + { + const int16_t input_val = *local_input_ptr++ + input_offset; + for (int i = 0; i < 3; i++) + { + *acc_buffer_ptr++ += static_cast<int32_t>(local_filter_ptr[i]) * input_val; + } + local_filter_ptr += 3; + } + input_ptr += input_ptr_increment; + } + } +}; + +template <> struct QuantizedDepthwiseConvKernel<true, 0, 2> +{ + static void Run(int num_output_pixels, int input_depth, int /* depth_multiplier */, + const int8_t *input_ptr, int16_t input_offset, int input_ptr_increment, + const int8_t *filter_ptr, int32_t *acc_buffer_ptr) + { + // Handle one output pixel at a time. + for (int outp = 0; outp < num_output_pixels; outp++) + { + const int8_t *local_filter_ptr = filter_ptr; + const int8_t *local_input_ptr = input_ptr; + int ic = 0; + // Handle 8 input channels at a time. + for (; ic <= input_depth - 8; ic += 8) + { + // Load the filters. + int16x8_t filter[2]; + int8x8x2_t filter_s8; + filter_s8.val[0] = vld1_s8(local_filter_ptr); + filter_s8.val[1] = vld1_s8(local_filter_ptr + 8); + local_filter_ptr += 16; + for (int i = 0; i < 2; i++) + { + filter[i] = vmovl_s8(filter_s8.val[i]); + } + // Load the inputs, add input_offset, duplicate 2-fold. + const int8x8_t input_s8 = vld1_s8(local_input_ptr); + local_input_ptr += 8; + const int16x8_t input_s16 = vmovl_s8(input_s8); + const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset)); + const int16x8x2_t input_dup2 = vzipq_s16(input, input); + // Load the accumulators from acc_buffer. + int32x4x2_t acc[2]; + for (int i = 0; i < 2; i++) + { + acc[i].val[0] = vld1q_s32(acc_buffer_ptr + 4 * i); + acc[i].val[1] = vld1q_s32(acc_buffer_ptr + 4 * i + 8); + } + // Multiply-accumulate. 
+ for (int j = 0; j < 2; j++) + { + acc[0].val[j] = + vmlal_s16(acc[0].val[j], vget_low_s16(filter[j]), vget_low_s16(input_dup2.val[j])); + acc[1].val[j] = + vmlal_s16(acc[1].val[j], vget_high_s16(filter[j]), vget_high_s16(input_dup2.val[j])); + } + // Store the accumulators back to acc_buffer. + for (int i = 0; i < 2; i++) + { + vst1q_s32(acc_buffer_ptr + 4 * i, acc[i].val[0]); + vst1q_s32(acc_buffer_ptr + 4 * i + 8, acc[i].val[1]); + } + acc_buffer_ptr += 16; + } + // Handle one input channel at a time. + for (; ic < input_depth; ic++) + { + // Load the inputs. + const int16_t input_val = *local_input_ptr++ + input_offset; + for (int i = 0; i < 2; i++) + { + *acc_buffer_ptr++ += static_cast<int32_t>(local_filter_ptr[i]) * input_val; + } + local_filter_ptr += 2; + } + input_ptr += input_ptr_increment; + } + } +}; + +template <> struct QuantizedDepthwiseConvKernel<true, 0, 1> +{ + static void Run(int num_output_pixels, int input_depth, int /* depth_multiplier */, + const int8_t *input_ptr, int16_t input_offset, int input_ptr_increment, + const int8_t *filter_ptr, int32_t *acc_buffer_ptr) + { + // Handle one output pixel at a time. + for (int outp = 0; outp < num_output_pixels; outp++) + { + const int8_t *local_filter_ptr = filter_ptr; + const int8_t *local_input_ptr = input_ptr; + int ic = 0; + // Handle 16 input channels at a time. + for (; ic <= input_depth - 16; ic += 16) + { + // Load the filters. + int8x8_t filter_s8_0 = vld1_s8(local_filter_ptr + 8 * 0); + int8x8_t filter_s8_1 = vld1_s8(local_filter_ptr + 8 * 1); + local_filter_ptr += 16; + int16x8_t filter_0 = vmovl_s8(filter_s8_0); + int16x8_t filter_1 = vmovl_s8(filter_s8_1); + // Load the inputs, add input_offset. + int8x8_t input_s8_0 = vld1_s8(local_input_ptr + 8 * 0); + int8x8_t input_s8_1 = vld1_s8(local_input_ptr + 8 * 1); + local_input_ptr += 16; + int16x8_t input_0 = vmovl_s8(input_s8_0); + int16x8_t input_1 = vmovl_s8(input_s8_1); + input_0 = vaddq_s16(input_0, vdupq_n_s16(input_offset)); + input_1 = vaddq_s16(input_1, vdupq_n_s16(input_offset)); + // Load the accumulators from acc_buffer + int32x4_t acc_0 = vld1q_s32(acc_buffer_ptr + 4 * 0); + int32x4_t acc_1 = vld1q_s32(acc_buffer_ptr + 4 * 1); + int32x4_t acc_2 = vld1q_s32(acc_buffer_ptr + 4 * 2); + int32x4_t acc_3 = vld1q_s32(acc_buffer_ptr + 4 * 3); + acc_0 = vmlal_s16(acc_0, vget_low_s16(input_0), vget_low_s16(filter_0)); + acc_1 = vmlal_s16(acc_1, vget_high_s16(input_0), vget_high_s16(filter_0)); + acc_2 = vmlal_s16(acc_2, vget_low_s16(input_1), vget_low_s16(filter_1)); + acc_3 = vmlal_s16(acc_3, vget_high_s16(input_1), vget_high_s16(filter_1)); + // Store the accumulators back to acc_buffer + vst1q_s32(acc_buffer_ptr + 4 * 0, acc_0); + vst1q_s32(acc_buffer_ptr + 4 * 1, acc_1); + vst1q_s32(acc_buffer_ptr + 4 * 2, acc_2); + vst1q_s32(acc_buffer_ptr + 4 * 3, acc_3); + acc_buffer_ptr += 16; + } + // Handle 8 input channels at a time. + for (; ic <= input_depth - 8; ic += 8) + { + // Load the filters. + const int8x8_t filter_s8 = vld1_s8(local_filter_ptr); + local_filter_ptr += 8; + const int16x8_t filter = vmovl_s8(filter_s8); + // Load the inputs, add input_offset. 
+ const int8x8_t input_s8 = vld1_s8(local_input_ptr); + local_input_ptr += 8; + const int16x8_t input_s16 = vmovl_s8(input_s8); + const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset)); + // Load the accumulators from acc_buffer + int32x4_t acc[2]; + for (int i = 0; i < 2; i++) + { + acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i); + } + // Multiply-accumulate + acc[0] = vmlal_s16(acc[0], vget_low_s16(input), vget_low_s16(filter)); + acc[1] = vmlal_s16(acc[1], vget_high_s16(input), vget_high_s16(filter)); + // Store the accumulators back to acc_buffer + for (int i = 0; i < 2; i++) + { + vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 8; + } + // Handle one input channel at a time. + for (; ic < input_depth; ic++) + { + const int16_t input_val = *local_input_ptr++ + input_offset; + const int16_t filter_val = *local_filter_ptr++; + *acc_buffer_ptr++ += static_cast<int32_t>(filter_val) * input_val; + } + input_ptr += input_ptr_increment; + } + } +}; + +template <> struct QuantizedDepthwiseConvKernel<true, 16, 1> +{ + static void Run(int num_output_pixels, int /* input_depth */, int /* depth_multiplier */, + const int8_t *input_ptr, int16_t input_offset, int input_ptr_increment, + const int8_t *filter_ptr, int32_t *acc_buffer_ptr) + { + // Load the filters. + int8x8_t filter_s8[2]; + for (int i = 0; i < 2; i++) + { + filter_s8[i] = vld1_s8(filter_ptr + 8 * i); + } + int16x8_t filter[2]; + for (int i = 0; i < 2; i++) + { + filter[i] = vmovl_s8(filter_s8[i]); + } + // Handle one output pixel at a time. + for (int outp = 0; outp < num_output_pixels; outp++) + { + // Load the inputs, add input_offset. + int8x8_t input_s8[2]; + for (int i = 0; i < 2; i++) + { + input_s8[i] = vld1_s8(input_ptr + 8 * i); + } + input_ptr += input_ptr_increment; + int16x8_t input[2]; + for (int i = 0; i < 2; i++) + { + input[i] = vmovl_s8(input_s8[i]); + } + for (int i = 0; i < 2; i++) + { + input[i] = vaddq_s16(input[i], vdupq_n_s16(input_offset)); + } + // Load the accumulators from acc_buffer + int32x4_t acc[4]; + for (int i = 0; i < 4; i++) + { + acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i); + } + // Multiply-accumulate + for (int i = 0; i < 2; i++) + { + acc[2 * i + 0] = vmlal_s16(acc[2 * i + 0], vget_low_s16(input[i]), vget_low_s16(filter[i])); + acc[2 * i + 1] = + vmlal_s16(acc[2 * i + 1], vget_high_s16(input[i]), vget_high_s16(filter[i])); + } + // Store the accumulators back to acc_buffer + for (int i = 0; i < 4; i++) + { + vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 16; + } + } +}; + +template <> struct QuantizedDepthwiseConvKernel<true, 8, 1> +{ + static void Run(int num_output_pixels, int /* input_depth */, int /* depth_multiplier */, + const int8_t *input_ptr, int16_t input_offset, int input_ptr_increment, + const int8_t *filter_ptr, int32_t *acc_buffer_ptr) + { + // Load the filters. + const int8x8_t filter_s8 = vld1_s8(filter_ptr); + const int16x8_t filter = vmovl_s8(filter_s8); + // Handle one output pixel at a time. + for (int outp = 0; outp < num_output_pixels; outp++) + { + // Load the inputs, add input_offset. 
+ const int8x8_t input_s8 = vld1_s8(input_ptr); + const int16x8_t input_s16 = vmovl_s8(input_s8); + const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset)); + // Load the accumulators from acc_buffer + int32x4_t acc[2]; + for (int i = 0; i < 2; i++) + { + acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i); + } + // Multiply-accumulate + acc[0] = vmlal_s16(acc[0], vget_low_s16(input), vget_low_s16(filter)); + acc[1] = vmlal_s16(acc[1], vget_high_s16(input), vget_high_s16(filter)); + // Store the accumulators back to acc_buffer + for (int i = 0; i < 2; i++) + { + vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 8; + input_ptr += input_ptr_increment; + } + } +}; + +template <> struct QuantizedDepthwiseConvKernel<true, 1, 16> +{ + static void Run(int num_output_pixels, int /* input_depth */, int /* depth_multiplier */, + const int8_t *input_ptr, int16_t input_offset, int input_ptr_increment, + const int8_t *filter_ptr, int32_t *acc_buffer_ptr) + { + // Load the filters. + int8x8_t filter_s8[2]; + for (int i = 0; i < 2; i++) + { + filter_s8[i] = vld1_s8(filter_ptr + 8 * i); + } + int16x8_t filter[2]; + for (int i = 0; i < 2; i++) + { + filter[i] = vmovl_s8(filter_s8[i]); + } + // Handle one output pixel at a time. + for (int outp = 0; outp < num_output_pixels; outp++) + { + int8_t input_s8 = *input_ptr; + input_ptr += input_ptr_increment; + int16_t input = static_cast<int16_t>(input_s8 + input_offset); + // Load the accumulators from acc_buffer + int32x4_t acc[4]; + for (int i = 0; i < 4; i++) + { + acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i); + } + // Multiply-accumulate + for (int i = 0; i < 2; i++) + { + acc[2 * i + 0] = vmlal_n_s16(acc[2 * i + 0], vget_low_s16(filter[i]), input); + acc[2 * i + 1] = vmlal_n_s16(acc[2 * i + 1], vget_high_s16(filter[i]), input); + } + // Store the accumulators back to acc_buffer + for (int i = 0; i < 4; i++) + { + vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 16; + } + } +}; + +template <> struct QuantizedDepthwiseConvKernel<true, 1, 32> +{ + static void Run(int num_output_pixels, int /* input_depth */, int /* depth_multiplier */, + const int8_t *input_ptr, int16_t input_offset, int input_ptr_increment, + const int8_t *filter_ptr, int32_t *acc_buffer_ptr) + { + // Load the filters. + int8x8_t filter_s8_0 = vld1_s8(filter_ptr + 8 * 0); + int8x8_t filter_s8_1 = vld1_s8(filter_ptr + 8 * 1); + int8x8_t filter_s8_2 = vld1_s8(filter_ptr + 8 * 2); + int8x8_t filter_s8_3 = vld1_s8(filter_ptr + 8 * 3); + int16x8_t filter_0 = vmovl_s8(filter_s8_0); + int16x8_t filter_1 = vmovl_s8(filter_s8_1); + int16x8_t filter_2 = vmovl_s8(filter_s8_2); + int16x8_t filter_3 = vmovl_s8(filter_s8_3); + // Handle one output pixel at a time. 
+    for (int outp = 0; outp < num_output_pixels; outp++)
+    {
+      int8_t input_s8 = *input_ptr;
+      input_ptr += input_ptr_increment;
+      int16_t input = static_cast<int16_t>(input_s8 + input_offset);
+      // Load the accumulators from acc_buffer
+      int32x4_t acc_0 = vld1q_s32(acc_buffer_ptr + 4 * 0);
+      int32x4_t acc_1 = vld1q_s32(acc_buffer_ptr + 4 * 1);
+      int32x4_t acc_2 = vld1q_s32(acc_buffer_ptr + 4 * 2);
+      int32x4_t acc_3 = vld1q_s32(acc_buffer_ptr + 4 * 3);
+      int32x4_t acc_4 = vld1q_s32(acc_buffer_ptr + 4 * 4);
+      int32x4_t acc_5 = vld1q_s32(acc_buffer_ptr + 4 * 5);
+      int32x4_t acc_6 = vld1q_s32(acc_buffer_ptr + 4 * 6);
+      int32x4_t acc_7 = vld1q_s32(acc_buffer_ptr + 4 * 7);
+      // Multiply-accumulate
+      acc_0 = vmlal_n_s16(acc_0, vget_low_s16(filter_0), input);
+      acc_1 = vmlal_n_s16(acc_1, vget_high_s16(filter_0), input);
+      acc_2 = vmlal_n_s16(acc_2, vget_low_s16(filter_1), input);
+      acc_3 = vmlal_n_s16(acc_3, vget_high_s16(filter_1), input);
+      acc_4 = vmlal_n_s16(acc_4, vget_low_s16(filter_2), input);
+      acc_5 = vmlal_n_s16(acc_5, vget_high_s16(filter_2), input);
+      acc_6 = vmlal_n_s16(acc_6, vget_low_s16(filter_3), input);
+      acc_7 = vmlal_n_s16(acc_7, vget_high_s16(filter_3), input);
+      // Store the accumulators back to acc_buffer
+      vst1q_s32(acc_buffer_ptr + 4 * 0, acc_0);
+      vst1q_s32(acc_buffer_ptr + 4 * 1, acc_1);
+      vst1q_s32(acc_buffer_ptr + 4 * 2, acc_2);
+      vst1q_s32(acc_buffer_ptr + 4 * 3, acc_3);
+      vst1q_s32(acc_buffer_ptr + 4 * 4, acc_4);
+      vst1q_s32(acc_buffer_ptr + 4 * 5, acc_5);
+      vst1q_s32(acc_buffer_ptr + 4 * 6, acc_6);
+      vst1q_s32(acc_buffer_ptr + 4 * 7, acc_7);
+      acc_buffer_ptr += 32;
+    }
+  }
+};
+
+template <> struct QuantizedDepthwiseConvKernel<true, 1, 20>
+{
+  static void Run(int num_output_pixels, int /* input_depth */, int /* depth_multiplier */,
+                  const int8_t *input_ptr, int16_t input_offset, int input_ptr_increment,
+                  const int8_t *filter_ptr, int32_t *acc_buffer_ptr)
+  {
+    // Load the filters.
+    // NEON wants to load 8 bytes at a time, but 20 is not divisible by 8.
+    // We load the first 16 bytes into filter_s8_{0,1} as usual.
+    // Then we load the 8 last bytes into filter_s8_x (x for 'extra').
+    // This is redundant: the first 4 bytes of filter_s8_x are the same
+    // as the last 4 bytes of filter_s8_1.
+    int8x8_t filter_s8_0 = vld1_s8(filter_ptr + 8 * 0);
+    int8x8_t filter_s8_1 = vld1_s8(filter_ptr + 8 * 1);
+    int8x8_t filter_s8_x = vld1_s8(filter_ptr + 8 * 1 + 4);
+    int16x8_t filter_0 = vmovl_s8(filter_s8_0);
+    int16x8_t filter_1 = vmovl_s8(filter_s8_1);
+    int16x8_t filter_x = vmovl_s8(filter_s8_x);
+    // Handle one output pixel at a time.
+ for (int outp = 0; outp < num_output_pixels; outp++) + { + int8_t input_s8 = *input_ptr; + input_ptr += input_ptr_increment; + int16_t input = static_cast<int16_t>(input_s8 + input_offset); + // Load the accumulators from acc_buffer + int32x4_t acc_0 = vld1q_s32(acc_buffer_ptr + 4 * 0); + int32x4_t acc_1 = vld1q_s32(acc_buffer_ptr + 4 * 1); + int32x4_t acc_2 = vld1q_s32(acc_buffer_ptr + 4 * 2); + int32x4_t acc_3 = vld1q_s32(acc_buffer_ptr + 4 * 3); + int32x4_t acc_4 = vld1q_s32(acc_buffer_ptr + 4 * 4); + // Multiply-accumulate + acc_0 = vmlal_n_s16(acc_0, vget_low_s16(filter_0), input); + acc_1 = vmlal_n_s16(acc_1, vget_high_s16(filter_0), input); + acc_2 = vmlal_n_s16(acc_2, vget_low_s16(filter_1), input); + acc_3 = vmlal_n_s16(acc_3, vget_high_s16(filter_1), input); + acc_4 = vmlal_n_s16(acc_4, vget_high_s16(filter_x), input); + // Store the accumulators back to acc_buffer + vst1q_s32(acc_buffer_ptr + 4 * 0, acc_0); + vst1q_s32(acc_buffer_ptr + 4 * 1, acc_1); + vst1q_s32(acc_buffer_ptr + 4 * 2, acc_2); + vst1q_s32(acc_buffer_ptr + 4 * 3, acc_3); + vst1q_s32(acc_buffer_ptr + 4 * 4, acc_4); + acc_buffer_ptr += 20; + } + } +}; + +template <> struct QuantizedDepthwiseConvKernel<true, 1, 8> +{ + static void Run(int num_output_pixels, int /* input_depth */, int /* depth_multiplier */, + const int8_t *input_ptr, int16_t input_offset, int input_ptr_increment, + const int8_t *filter_ptr, int32_t *acc_buffer_ptr) + { + // Load the filters. + const int8x8_t filter_s8 = vld1_s8(filter_ptr); + const int16x8_t filter = vmovl_s8(filter_s8); + // Handle one output pixel at a time. + for (int outp = 0; outp < num_output_pixels; outp++) + { + int8_t input_s8 = *input_ptr; + input_ptr += input_ptr_increment; + int16_t input = static_cast<int16_t>(input_s8 + input_offset); + // Load the accumulators from acc_buffer + int32x4_t acc[2]; + for (int i = 0; i < 2; i++) + { + acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i); + } + // Multiply-accumulate + acc[0] = vmlal_n_s16(acc[0], vget_low_s16(filter), input); + acc[1] = vmlal_n_s16(acc[1], vget_high_s16(filter), input); + // Store the accumulators back to acc_buffer + for (int i = 0; i < 2; i++) + { + vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 8; + } + } +}; + +template <> struct QuantizedDepthwiseConvKernel<true, 2, 1> +{ + static void Run(int num_output_pixels, int /* input_depth */, int /* depth_multiplier */, + const int8_t *input_ptr, int16_t input_offset, int input_ptr_increment, + const int8_t *filter_ptr, int32_t *acc_buffer_ptr) + { + // Load the filters. + int8x8_t filter_s8 = vdup_n_s8(0); + filter_s8 = vset_lane_s8(filter_ptr[0], filter_s8, 0); + filter_s8 = vset_lane_s8(filter_ptr[1], filter_s8, 1); + filter_s8 = vset_lane_s8(filter_ptr[0], filter_s8, 2); + filter_s8 = vset_lane_s8(filter_ptr[1], filter_s8, 3); + const int16x4_t filter = vget_low_s16(vmovl_s8(filter_s8)); + + int outp = 0; + + // Handle 2 output pixels at a time. + for (; outp <= num_output_pixels - 2; outp += 2) + { + // Load the accumulators from acc_buffer. + int32x4_t acc = vld1q_s32(acc_buffer_ptr); + // Load the inputs, add input_offset. 
+ int16x4_t input_s16 = vdup_n_s16(0); + input_s16 = vset_lane_s16((reinterpret_cast<const int16_t *>(input_ptr))[0], input_s16, 0); + input_ptr += input_ptr_increment; + input_s16 = vset_lane_s16((reinterpret_cast<const int16_t *>(input_ptr))[0], input_s16, 1); + input_ptr += input_ptr_increment; + input_s16 = vget_low_s16(vmovl_s8(vreinterpret_s8_s16(input_s16))); + const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset)); + + // Multiply-accumulate. + acc = vmlal_s16(acc, filter, input); + // Store the accumulators back to acc_buffer. + vst1q_s32(acc_buffer_ptr, acc); + acc_buffer_ptr += 4; + } + + // Handle 1 output pixel at a time. + for (; outp < num_output_pixels; outp++) + { + // Load the accumulators from acc_buffer. + int32x2_t acc = vld1_s32(acc_buffer_ptr); + // Load the inputs, add input_offset. + int8x8_t input_s8 = vdup_n_s8(0); + input_s8 = vset_lane_s8(input_ptr[0], input_s8, 0); + input_s8 = vset_lane_s8(input_ptr[1], input_s8, 1); + input_ptr += input_ptr_increment; + const int16x4_t input_s16 = vget_low_s16(vmovl_s8(input_s8)); + const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset)); + + // Multiply-accumulate. + acc = vget_low_s32(vmlal_s16(vcombine_s32(acc, acc), filter, input)); + // Store the accumulators back to acc_buffer. + vst1_s32(acc_buffer_ptr, acc); + acc_buffer_ptr += 2; + } + } +}; + +template <> struct QuantizedDepthwiseConvKernel<true, 4, 1> +{ + static void Run(int num_output_pixels, int /* input_depth */, int /* depth_multiplier */, + const int8_t *input_ptr, int16_t input_offset, int input_ptr_increment, + const int8_t *filter_ptr, int32_t *acc_buffer_ptr) + { + if (num_output_pixels <= 0) + { + return; + } + + // Load the filters. + int8x8_t filter_s8 = vdup_n_s8(0); + filter_s8 = vset_lane_s8(filter_ptr[0], filter_s8, 0); + filter_s8 = vset_lane_s8(filter_ptr[1], filter_s8, 1); + filter_s8 = vset_lane_s8(filter_ptr[2], filter_s8, 2); + filter_s8 = vset_lane_s8(filter_ptr[3], filter_s8, 3); + const int16x4_t filter = vget_low_s16(vmovl_s8(filter_s8)); + + int outp = 0; + + // Handle one output pixel at a time until second to the last pixel. Second + // to the last because we read eight input pixels while only processing + // four. + for (; outp < num_output_pixels - 1; outp++) + { + // Load the accumulators from acc_buffer + int32x4_t acc; + acc = vld1q_s32(acc_buffer_ptr); + + // Load the inputs, add input_offset. + int8x8_t input_s8 = vld1_s8(input_ptr); + input_ptr += input_ptr_increment; + const int16x4_t input_s16 = vget_low_s16(vmovl_s8(input_s8)); + const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset)); + // Multiply-accumulate + acc = vmlal_s16(acc, filter, input); + // Store the accumulators back to acc_buffer + vst1q_s32(acc_buffer_ptr, acc); + acc_buffer_ptr += 4; + } + + // Handle the last output pixel. + // Load the accumulators from acc_buffer + int32x4_t acc; + acc = vld1q_s32(acc_buffer_ptr); + + // Load the inputs, add input_offset. 
+ int8x8_t input_s8 = vdup_n_s8(0); + input_s8 = vset_lane_s8(input_ptr[0], input_s8, 0); + input_s8 = vset_lane_s8(input_ptr[1], input_s8, 1); + input_s8 = vset_lane_s8(input_ptr[2], input_s8, 2); + input_s8 = vset_lane_s8(input_ptr[3], input_s8, 3); + const int16x4_t input_s16 = vget_low_s16(vmovl_s8(input_s8)); + const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset)); + // Multiply-accumulate + acc = vmlal_s16(acc, filter, input); + // Store the accumulators back to acc_buffer + vst1q_s32(acc_buffer_ptr, acc); + } +}; + +template <> struct QuantizedDepthwiseConvKernel<false, 12, 1> +{ + static void Run(int num_output_pixels, int /* input_depth */, int /* depth_multiplier */, + const int8_t *input_ptr, int16_t input_offset, int input_ptr_increment, + const int8_t *filter_ptr, int32_t *acc_buffer_ptr) + { + // Load the filters. + int8x8_t filter_s8_0 = vld1_s8(filter_ptr); + int8x8_t filter_s8_1 = vld1_s8(filter_ptr + 4); + int16x8_t filter_s16_0 = vmovl_s8(filter_s8_0); + int16x8_t filter_s16_1 = vmovl_s8(filter_s8_1); + int16x4_t filter_0 = vget_low_s16(filter_s16_0); + int16x4_t filter_1 = vget_high_s16(filter_s16_0); + int16x4_t filter_2 = vget_high_s16(filter_s16_1); + + // Handle one output pixel at a time. + for (int outp = 0; outp < num_output_pixels; outp++) + { + // Load the inputs, add input_offset. + int8x8_t input_s8_0 = vld1_s8(input_ptr); + int8x8_t input_s8_1 = vld1_s8(input_ptr + 4); + input_ptr += input_ptr_increment; + int16x8_t input_0 = vmovl_s8(input_s8_0); + int16x8_t input_1 = vmovl_s8(input_s8_1); + input_0 = vaddq_s16(input_0, vdupq_n_s16(input_offset)); + input_1 = vaddq_s16(input_1, vdupq_n_s16(input_offset)); + + // Load the accumulators from acc_buffer + int32x4_t acc_0 = vld1q_s32(acc_buffer_ptr + 4 * 0); + int32x4_t acc_1 = vld1q_s32(acc_buffer_ptr + 4 * 1); + int32x4_t acc_2 = vld1q_s32(acc_buffer_ptr + 4 * 2); + + // Multiply-accumulate + acc_0 = vmlal_s16(acc_0, vget_low_s16(input_0), filter_0); + acc_1 = vmlal_s16(acc_1, vget_high_s16(input_0), filter_1); + acc_2 = vmlal_s16(acc_2, vget_high_s16(input_1), filter_2); + + // Store the accumulators back to acc_buffer + vst1q_s32(acc_buffer_ptr + 4 * 0, acc_0); + vst1q_s32(acc_buffer_ptr + 4 * 1, acc_1); + vst1q_s32(acc_buffer_ptr + 4 * 2, acc_2); + + acc_buffer_ptr += 12; + } + } +}; +#endif + +// Accumulates the effect of one row of the filter, on a segment of one row +// of the output, accessing the corresponding one row of the input. +template <bool kAllowStrided, int kFixedInputDepth, int kFixedDepthMultiplier> +void QuantizedDepthwiseConvAccumRow(int stride, int dilation_factor, int input_depth, + int input_width, const int8_t *input_data, int16_t input_offset, + int pad_width, int depth_multiplier, int filter_width, + const int8_t *filter_data, int out_x_buffer_start, + int out_x_buffer_end, int output_depth, int32_t *acc_buffer) +{ + // Consistency check parameters. This is important in particular to ensure + // that we keep the number of template instantiations minimal, so we don't + // increase binary size unnecessarily. 
+ static_assert(kFixedDepthMultiplier || !kFixedInputDepth, ""); + static_assert(kFixedInputDepth || kAllowStrided, ""); + assert(stride == 1 || kAllowStrided); + if (kFixedInputDepth) + { + assert(input_depth == kFixedInputDepth); + } + if (kFixedDepthMultiplier) + { + assert(depth_multiplier == kFixedDepthMultiplier); + } + assert(output_depth == input_depth * depth_multiplier); + const int input_ptr_increment = stride * input_depth; + const int8_t *filter_base_ptr = filter_data; + for (int filter_x = 0; filter_x < filter_width; ++filter_x) + { + // For the current (filter_x, filter_y) point in the filter, + // compute the boundaries of the corresponding output row segment. + int out_x_loop_start_unclamped = 0; + int out_x_loop_end_unclamped = 0; + if (kAllowStrided) + { + if (stride == 2) + { + out_x_loop_start_unclamped = (pad_width - dilation_factor * filter_x + 1) / 2; + out_x_loop_end_unclamped = (pad_width + input_width - dilation_factor * filter_x + 1) / 2; + } + else if (stride == 4) + { + out_x_loop_start_unclamped = (pad_width - dilation_factor * filter_x + 3) / 4; + out_x_loop_end_unclamped = (pad_width + input_width - dilation_factor * filter_x + 3) / 4; + } + else + { + out_x_loop_start_unclamped = (pad_width - dilation_factor * filter_x + stride - 1) / stride; + out_x_loop_end_unclamped = + (pad_width + input_width - dilation_factor * filter_x + stride - 1) / stride; + } + } + else + { + out_x_loop_start_unclamped = pad_width - dilation_factor * filter_x; + out_x_loop_end_unclamped = pad_width + input_width - dilation_factor * filter_x; + } + // The kernel will have to iterate on the segment of the + // output row that starts at out_x_loop_start and out_x_loop_end. + const int out_x_loop_start = std::max(out_x_buffer_start, out_x_loop_start_unclamped); + const int out_x_loop_end = std::min(out_x_buffer_end, out_x_loop_end_unclamped); + + int32_t *acc_buffer_ptr = acc_buffer + (out_x_loop_start - out_x_buffer_start) * output_depth; + const int in_x_origin = (out_x_loop_start * stride) - pad_width + dilation_factor * filter_x; + const int8_t *input_ptr = input_data + in_x_origin * input_depth; + const int num_output_pixels = out_x_loop_end - out_x_loop_start; + QuantizedDepthwiseConvKernel<kAllowStrided, kFixedInputDepth, kFixedDepthMultiplier>::Run( + num_output_pixels, input_depth, depth_multiplier, input_ptr, input_offset, + input_ptr_increment, filter_base_ptr, acc_buffer_ptr); + filter_base_ptr += output_depth; + } +} + +// generic fallback of DepthwiseConvAccumRow, portable, non-templatized. 
+inline void QuantizedDepthwiseConvAccumRowGeneric(int stride, int dilation_factor, int input_depth, + int input_width, const int8_t *input_data, + int16_t input_offset, int pad_width, + int depth_multiplier, int filter_width, + const int8_t *filter_data, int out_x_buffer_start, + int out_x_buffer_end, int output_depth, + int32_t *acc_buffer) +{ + const int8_t *filter_base_ptr = filter_data; + for (int filter_x = 0; filter_x < filter_width; ++filter_x) + { + const int out_x_loop_start = + std::max(out_x_buffer_start, (pad_width - dilation_factor * filter_x + stride - 1) / stride); + const int out_x_loop_end = + std::min(out_x_buffer_end, + (pad_width + input_width - dilation_factor * filter_x + stride - 1) / stride); + + int32_t *acc_buffer_ptr = acc_buffer + (out_x_loop_start - out_x_buffer_start) * output_depth; + const int in_x_origin = (out_x_loop_start * stride) - pad_width + dilation_factor * filter_x; + const int8_t *input_ptr = input_data + in_x_origin * input_depth; + const int input_ptr_increment = (stride - 1) * input_depth; + for (int out_x = out_x_loop_start; out_x < out_x_loop_end; out_x++) + { + const int8_t *filter_ptr = filter_base_ptr; + for (int ic = 0; ic < input_depth; ++ic) + { + const int16_t input_val = *input_ptr++ + input_offset; + for (int m = 0; m < depth_multiplier; m++) + { + const int16_t filter_val = *filter_ptr++; + *acc_buffer_ptr++ += static_cast<int32_t>(filter_val) * input_val; + } + } + input_ptr += input_ptr_increment; + } + filter_base_ptr += output_depth; + } +} + +// Initializes the accumulator buffer with bias values. +inline void DepthwiseConvInitAccBuffer(int num_output_pixels, int output_depth, + const int32_t *bias_data, int32_t *acc_buffer) +{ + int i = 0; +#ifdef USE_NEON + if (output_depth == 1) + { + const int32x4_t b = vdupq_n_s32(bias_data[0]); + for (; i <= num_output_pixels - 16; i += 16) + { + vst1q_s32(acc_buffer + i + 0, b); + vst1q_s32(acc_buffer + i + 4, b); + vst1q_s32(acc_buffer + i + 8, b); + vst1q_s32(acc_buffer + i + 12, b); + } + for (; i <= num_output_pixels - 4; i += 4) + { + vst1q_s32(acc_buffer + i, b); + } + } + else if (output_depth == 2) + { + int32x4_t b = vdupq_n_s32(bias_data[0]); + b = vsetq_lane_s32(bias_data[1], b, 1); + b = vsetq_lane_s32(bias_data[1], b, 3); + for (; i <= num_output_pixels - 8; i += 8) + { + vst1q_s32(acc_buffer + 2 * i + 0, b); + vst1q_s32(acc_buffer + 2 * i + 4, b); + vst1q_s32(acc_buffer + 2 * i + 8, b); + vst1q_s32(acc_buffer + 2 * i + 12, b); + } + for (; i <= num_output_pixels - 2; i += 2) + { + vst1q_s32(acc_buffer + 2 * i, b); + } + } + else if (output_depth == 4) + { + const int32x4_t b = vld1q_s32(bias_data); + for (; i <= num_output_pixels - 4; i += 4) + { + vst1q_s32(acc_buffer + 4 * i + 0, b); + vst1q_s32(acc_buffer + 4 * i + 4, b); + vst1q_s32(acc_buffer + 4 * i + 8, b); + vst1q_s32(acc_buffer + 4 * i + 12, b); + } + for (; i < num_output_pixels; i++) + { + vst1q_s32(acc_buffer + 4 * i, b); + } + } + else if (output_depth == 8) + { + const int32x4_t b0 = vld1q_s32(bias_data); + const int32x4_t b1 = vld1q_s32(bias_data + 4); + for (; i <= num_output_pixels - 2; i += 2) + { + vst1q_s32(acc_buffer + 8 * i + 0, b0); + vst1q_s32(acc_buffer + 8 * i + 4, b1); + vst1q_s32(acc_buffer + 8 * i + 8, b0); + vst1q_s32(acc_buffer + 8 * i + 12, b1); + } + for (; i < num_output_pixels; i++) + { + vst1q_s32(acc_buffer + 8 * i + 0, b0); + vst1q_s32(acc_buffer + 8 * i + 4, b1); + } + } + else if (output_depth == 16) + { + const int32x4_t b0 = vld1q_s32(bias_data); + const int32x4_t b1 = 
vld1q_s32(bias_data + 4); + const int32x4_t b2 = vld1q_s32(bias_data + 8); + const int32x4_t b3 = vld1q_s32(bias_data + 12); + for (; i < num_output_pixels; i++) + { + vst1q_s32(acc_buffer + 16 * i + 0, b0); + vst1q_s32(acc_buffer + 16 * i + 4, b1); + vst1q_s32(acc_buffer + 16 * i + 8, b2); + vst1q_s32(acc_buffer + 16 * i + 12, b3); + } + } +#endif + for (; i < num_output_pixels; i++) + { + memcpy(acc_buffer + i * output_depth, bias_data, sizeof(acc_buffer[0]) * output_depth); + } +} + +inline void DepthwiseConvGeneral(const DepthwiseConvParams ¶ms, + const int32_t *output_multiplier, const int32_t *output_shift, + const Shape &input_shape, const int8_t *input_data, + const Shape &filter_shape, const int8_t *filter_data, + const Shape & /* bias_shape */, const int32_t *bias_data, + const Shape &output_shape, int8_t *output_data, int thread_start, + int thread_end, int thread_dim) +{ + const int stride_width = params.stride_width; + const int stride_height = params.stride_height; + const int pad_width = params.padding_values.width; + const int pad_height = params.padding_values.height; + const int depth_multiplier = params.depth_multiplier; + const int32_t output_activation_min = params.quantized_activation_min; + const int32_t output_activation_max = params.quantized_activation_max; + const int32_t input_offset = params.input_offset; + const int32_t output_offset = params.output_offset; + const int dilation_width_factor = params.dilation_width_factor; + const int dilation_height_factor = params.dilation_height_factor; + const int batches = MatchingDim(input_shape, 0, output_shape, 0); + const int output_depth = MatchingDim(filter_shape, 3, output_shape, 3); + const int input_height = input_shape.Dims(1); + const int input_width = input_shape.Dims(2); + const int input_depth = input_shape.Dims(3); + const int filter_height = filter_shape.Dims(1); + const int filter_width = filter_shape.Dims(2); + const int output_rows = output_shape.Dims(1); + const int output_width = output_shape.Dims(2); + + static const int kAccBufferMaxSize = 2048; + int32_t acc_buffer[kAccBufferMaxSize]; + assert(kAccBufferMaxSize >= output_depth); + const int kOutputPixelsInAccBuffer = kAccBufferMaxSize / output_depth; + const int kAccBufferActualSize = kOutputPixelsInAccBuffer * output_depth; + UNUSED_RELEASE(kAccBufferActualSize); + assert(kOutputPixelsInAccBuffer * output_depth <= kAccBufferActualSize); + assert(kAccBufferActualSize <= kAccBufferMaxSize); + assert(kOutputPixelsInAccBuffer >= 1); + assert(thread_dim == 0 || thread_dim == 1); + + // row_accum_func will point to the core accumulation function to be used + // for this DepthwiseConv op. + using row_accum_func_t = decltype(&QuantizedDepthwiseConvAccumRowGeneric); + row_accum_func_t row_accum_func = nullptr; + +#define TFMINI_USE_DEPTHWISECONV_KERNEL(ALLOW_STRIDED, FIXED_INPUT_DEPTH, FIXED_DEPTH_MULTIPLIER) \ + if (!row_accum_func && (stride_width == 1 || ALLOW_STRIDED) && \ + (input_depth == FIXED_INPUT_DEPTH || FIXED_INPUT_DEPTH == 0) && \ + depth_multiplier == FIXED_DEPTH_MULTIPLIER) \ + { \ + row_accum_func = \ + QuantizedDepthwiseConvAccumRow<ALLOW_STRIDED, FIXED_INPUT_DEPTH, FIXED_DEPTH_MULTIPLIER>; \ + } + +#ifdef USE_NEON + // We go over our list of kernels by decreasing order of preference + // for the cases where multiple kernels could apply. + + // Start with the fastest kernels: AllowStrided=false, fixed input depth. 
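+  // Each entry is (AllowStrided, FixedInputDepth, FixedDepthMultiplier); e.g.
+  // (false, 8, 1) is selected only when stride_width == 1, input_depth == 8
+  // and depth_multiplier == 1.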
+ + TFMINI_USE_DEPTHWISECONV_KERNEL(false, 1, 2) + TFMINI_USE_DEPTHWISECONV_KERNEL(false, 2, 2) + TFMINI_USE_DEPTHWISECONV_KERNEL(false, 4, 2) + TFMINI_USE_DEPTHWISECONV_KERNEL(false, 1, 4) + TFMINI_USE_DEPTHWISECONV_KERNEL(false, 4, 1) + TFMINI_USE_DEPTHWISECONV_KERNEL(false, 4, 4) + TFMINI_USE_DEPTHWISECONV_KERNEL(false, 8, 1) + TFMINI_USE_DEPTHWISECONV_KERNEL(false, 2, 8) + TFMINI_USE_DEPTHWISECONV_KERNEL(false, 2, 1) + TFMINI_USE_DEPTHWISECONV_KERNEL(false, 12, 1) + + // Next come the strided kernels: AllowStrided=true, fixed input depth. + // They are a bit less efficient, but allow stride!=1. + + TFMINI_USE_DEPTHWISECONV_KERNEL(true, 8, 2) + TFMINI_USE_DEPTHWISECONV_KERNEL(true, 16, 1) + TFMINI_USE_DEPTHWISECONV_KERNEL(true, 1, 16) + TFMINI_USE_DEPTHWISECONV_KERNEL(true, 1, 20) + TFMINI_USE_DEPTHWISECONV_KERNEL(true, 1, 32) + TFMINI_USE_DEPTHWISECONV_KERNEL(true, 1, 8) + TFMINI_USE_DEPTHWISECONV_KERNEL(true, 8, 1) + TFMINI_USE_DEPTHWISECONV_KERNEL(true, 2, 1) + TFMINI_USE_DEPTHWISECONV_KERNEL(true, 4, 1) + + // Finally, the kernels allowing a variable input depth, + // these are the least efficient but most general kernels. + + TFMINI_USE_DEPTHWISECONV_KERNEL(true, 0, 1) + TFMINI_USE_DEPTHWISECONV_KERNEL(true, 0, 2) + TFMINI_USE_DEPTHWISECONV_KERNEL(true, 0, 3) +#endif // USE_NEON + + // No matching fast kernel found, use slow fallback. + if (!row_accum_func) + { + row_accum_func = QuantizedDepthwiseConvAccumRowGeneric; + } + +#undef TFMINI_USE_DEPTHWISECONV_KERNEL + + const int input_height_stride = input_shape.Dims(3) * input_shape.Dims(2); + const int input_batch_stride = input_height_stride * input_shape.Dims(1); + const int filter_height_stride = filter_shape.Dims(3) * filter_shape.Dims(2); + + // Now that we have determined row_accum_func, we can start work. + int batch_start = 0; + int batch_end = batches; + int row_start = 0; + int row_end = output_rows; + int output_ptr_offset = 0; + + switch (thread_dim) + { + case 0: + assert(thread_start >= 0); + assert(thread_end <= batches); + batch_start = thread_start; + batch_end = thread_end; + output_ptr_offset = batch_start * FlatSizeSkipDim(output_shape, 0); + break; + case 1: + assert(thread_start >= 0); + assert(thread_end <= output_rows); + row_start = thread_start; + row_end = thread_end; + output_ptr_offset = row_start * output_width * output_depth; + break; + } + + int8_t *output_ptr = output_data + output_ptr_offset; + int batch_step = (output_rows + row_start - row_end) * output_width * output_depth; + for (int b = batch_start; b < batch_end; ++b) + { + for (int out_y = row_start; out_y < row_end; ++out_y) + { + const int in_y_origin = (out_y * stride_height) - pad_height; + const int filter_y_start = + std::max(0, (-in_y_origin + dilation_height_factor - 1) / dilation_height_factor); + const int filter_y_end = + std::min(filter_height, (input_height - in_y_origin + dilation_height_factor - 1) / + dilation_height_factor); + for (int out_x_buffer_start = 0; out_x_buffer_start < output_width; + out_x_buffer_start += kOutputPixelsInAccBuffer) + { + const int out_x_buffer_end = + std::min(output_width, out_x_buffer_start + kOutputPixelsInAccBuffer); + // We call a 'pixel' a group of activation that share all but the + // 'depth'/'channel' coordinate. num_output_pixels is the number of + // output pixels that we will accumulate in this loop iteration. + const int num_output_pixels = out_x_buffer_end - out_x_buffer_start; + // Initialize our local accumulator with the bias values, so we don't + // have to add them later. 
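+          // acc_buffer holds num_output_pixels * output_depth accumulators;
+          // e.g. with output_depth == 8 a chunk covers up to 2048 / 8 == 256
+          // output pixels.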
+ DepthwiseConvInitAccBuffer(num_output_pixels, output_depth, bias_data, acc_buffer); + // Accumulation loop. Most of the time should be spent in here. + for (int filter_y = filter_y_start; filter_y < filter_y_end; ++filter_y) + { + const int in_y = in_y_origin + dilation_height_factor * filter_y; + row_accum_func(stride_width, dilation_width_factor, input_depth, input_width, + input_data + in_y * input_height_stride + b * input_batch_stride, + input_offset, pad_width, depth_multiplier, filter_width, + filter_data + filter_y * filter_height_stride, out_x_buffer_start, + out_x_buffer_end, output_depth, acc_buffer); + } + // Finished accumulating int32_t values. Now need to convert them to + // the final 8bit form and store them. + const int num_output_values = output_depth * num_output_pixels; + + Quantize(output_multiplier, output_shift, output_depth, num_output_values, output_offset, + output_activation_min, output_activation_max, acc_buffer, output_ptr); + + output_ptr += num_output_values; + } + } + output_ptr += batch_step; + } +} + +} // namespace depthwise_conv + +template <DepthwiseConvOutputRounding kOutputRounding> +inline void DepthwiseConvWithRounding(const DepthwiseConvParams ¶ms, + const int32_t *output_multiplier, const int32_t *output_shift, + const Shape &input_shape, const int8_t *input_data, + const Shape &filter_shape, const int8_t *filter_data, + const Shape &bias_shape, const int32_t *bias_data, + const Shape &output_shape, int8_t *output_data, + int thread_start, int thread_end, int thread_dim) +{ + const int depth_multiplier = params.depth_multiplier; + const int dilation_width_factor = params.dilation_width_factor; + const int dilation_height_factor = params.dilation_height_factor; + UNUSED_RELEASE(depth_multiplier); + UNUSED_RELEASE(dilation_width_factor); + UNUSED_RELEASE(dilation_height_factor); + assert(dilation_width_factor >= 1); + assert(dilation_height_factor >= 1); + assert(input_shape.DimensionsCount() == 4); + assert(filter_shape.DimensionsCount() == 4); + assert(output_shape.DimensionsCount() == 4); + const int output_depth = MatchingDim(filter_shape, 3, output_shape, 3); + const int input_depth = input_shape.Dims(3); + UNUSED_RELEASE(output_depth); + UNUSED_RELEASE(input_depth); + assert(output_depth == input_depth * depth_multiplier); + assert(bias_shape.FlatSize() == output_depth); + +// TODO Use below codes +#if 0 +// Enable for arm64 except for the Nvidia Linux 4 Tegra (L4T) running on +// Jetson TX-2. This compiler does not support the offsetof() macro. +#if defined(__aarch64__) && !defined(GOOGLE_L4T) +#if defined(__ANDROID__) && defined(__clang__) + CpuFlags cpu_flags; + GetCpuFlags(&cpu_flags); + const bool has_dot_product_instructions = cpu_flags.neon_dotprod; + + // Dispatch to dot-product 3x3 kernels when supported. 
+ if (has_dot_product_instructions) + { + using optimized_ops::depthwise_conv::DotProduct3x3KernelType; + DotProduct3x3KernelType kernel_type = optimized_ops::depthwise_conv::CategorizeDotProductKernel< + optimized_ops::depthwise_conv::QuantizationType::kPerChannelInt8>( + input_shape, filter_shape, output_shape, params, output_shift); + if (kernel_type != DotProduct3x3KernelType::kNone) + { + DepthwiseConvParams params_copy = params; + params_copy.output_shift_per_channel = output_shift; + params_copy.output_multiplier_per_channel = output_multiplier; + optimized_ops::depthwise_conv::DepthwiseConvDotProduct3x3PerChannel< + DepthwiseConvImplementation::kUseNeon3x3DotProduct>( + params_copy, input_shape, input_data, filter_shape, filter_data, bias_shape, bias_data, + output_shape, output_data, thread_start, thread_end, thread_dim); + return; + } + } + +#endif + // Dispatch to non-dot-product 3x3 kernels when supported. + + const int stride_width = params.stride_width; + const int stride_height = params.stride_height; + const int pad_width = params.padding_values.width; + const int pad_height = params.padding_values.height; + + // Call kernel optimized for depthwise convolutions using 3x3 filters if + // parameters are supported. + if (optimized_ops::depthwise_conv::Fast3x3FilterKernelSupported< + optimized_ops::depthwise_conv::QuantizationType::kPerChannelInt8>( + input_shape, filter_shape, stride_width, stride_height, dilation_width_factor, + dilation_height_factor, pad_width, pad_height, depth_multiplier, output_shape, 0, + output_shift)) + { + optimized_ops::depthwise_conv::DepthwiseConv3x3FilterPerChannel< + DepthwiseConvOutputRounding::kUpward>( + params, output_multiplier, output_shift, input_shape, input_data, filter_shape, filter_data, + bias_shape, bias_data, output_shape, output_data, thread_start, thread_end, thread_dim); + return; + } +#endif + +#endif /* end of if 0 */ + + depthwise_conv::DepthwiseConvGeneral( + params, output_multiplier, output_shift, input_shape, input_data, filter_shape, filter_data, + bias_shape, bias_data, output_shape, output_data, thread_start, thread_end, thread_dim); +} + +inline void DepthwiseConvImpl(const DepthwiseConvParams ¶ms, const int32_t *output_multiplier, + const int32_t *output_shift, const Shape &input_shape, + const int8_t *input_data, const Shape &filter_shape, + const int8_t *filter_data, const Shape &bias_shape, + const int32_t *bias_data, const Shape &output_shape, + int8_t *output_data, int thread_start, int thread_end, int thread_dim) +{ + return DepthwiseConvWithRounding<DepthwiseConvOutputRounding::kAwayFromZero>( + params, output_multiplier, output_shift, input_shape, input_data, filter_shape, filter_data, + bias_shape, bias_data, output_shape, output_data, thread_start, thread_end, thread_dim); +} + +template <typename T, typename TS> struct DepthwiseConvWorkerTask : cpu_backend_threadpool::Task +{ + DepthwiseConvWorkerTask(const DepthwiseConvParams ¶ms, const int32_t *output_multiplier, + const int32_t *output_shift, const Shape &input_shape, + const T *input_data, const Shape &filter_shape, const T *filter_data, + const Shape &bias_shape, const TS *bias_data, const Shape &output_shape, + T *output_data, int thread_start, int thread_end, int thread_dim) + : params_(params), output_multiplier_(output_multiplier), output_shift_(output_shift), + input_shape_(input_shape), input_data_(input_data), filter_shape_(filter_shape), + filter_data_(filter_data), bias_shape_(bias_shape), bias_data_(bias_data), + output_shape_(output_shape), 
output_data_(output_data), thread_start_(thread_start), + thread_end_(thread_end), thread_dim_(thread_dim) + { + } + + void Run() override + { + DepthwiseConvImpl(params_, output_multiplier_, output_shift_, input_shape_, input_data_, + filter_shape_, filter_data_, bias_shape_, bias_data_, output_shape_, + output_data_, thread_start_, thread_end_, thread_dim_); + } + +private: + const DepthwiseConvParams ¶ms_; + const int32_t *output_multiplier_; + const int32_t *output_shift_; + const Shape &input_shape_; + const T *input_data_; + const Shape &filter_shape_; + const T *filter_data_; + const Shape &bias_shape_; + const TS *bias_data_; + const Shape &output_shape_; + T *output_data_; + int thread_start_; + int thread_end_; + int thread_dim_; +}; + +inline int HowManyConvThreads(const Shape &output_shape, const Shape &filter_shape, int thread_dim) +{ + constexpr int kMinMulPerThread = 8; + const int output_units = output_shape.Dims(thread_dim); + const int filter_height = filter_shape.Dims(1); + const int filter_width = filter_shape.Dims(2); + const int num_mul_per_unit = + FlatSizeSkipDim(output_shape, thread_dim) * filter_height * filter_width; + const int min_units_per_thread = kMinMulPerThread / num_mul_per_unit + 1; + int thread_count = output_units / min_units_per_thread; + return thread_count; +} + +inline void DepthwiseConvPerChannel(const DepthwiseConvParams ¶ms, + const int32_t *output_multiplier, const int32_t *output_shift, + const Shape &input_shape, const int8_t *input_data, + const Shape &filter_shape, const int8_t *filter_data, + const Shape &bias_shape, const int32_t *bias_data, + const Shape &output_shape, int8_t *output_data, + ruy::Context *ruy_context) +{ + UNUSED_ALL(params, output_multiplier, output_shift, input_shape, input_data, filter_shape, + filter_data, bias_shape, bias_data, output_shape, output_data, ruy_context); + + assert(input_shape.DimensionsCount() == 4); + assert(filter_shape.DimensionsCount() == 4); + assert(output_shape.DimensionsCount() == 4); + + const int output_batches = output_shape.Dims(0); + const int output_rows = output_shape.Dims(1); + int thread_count_batch = HowManyConvThreads(output_shape, filter_shape, 0); + int thread_count_row = HowManyConvThreads(output_shape, filter_shape, 1); + int thread_dim, thread_count, thread_dim_size; + if (thread_count_batch > thread_count_row) + { + thread_dim = 0; + thread_dim_size = output_batches; + thread_count = thread_count_batch; + } + else + { + thread_dim = 1; + thread_dim_size = output_rows; + thread_count = thread_count_row; + } + + // NOTE Borrow RuyContext to get max_num_threads setting + // TODO Define and use max_num_threads for CPU backend + const int max_threads = ruy_context->max_num_threads(); + thread_count = std::max(1, std::min(thread_count, max_threads)); + + if (thread_count == 1) + { + DepthwiseConvImpl(params, output_multiplier, output_shift, input_shape, input_data, + filter_shape, filter_data, bias_shape, bias_data, output_shape, output_data, + /*thread_start=*/0, + /*thread_end=*/output_rows, /*thread_dim=*/1); + } + else + { + std::vector<DepthwiseConvWorkerTask<int8_t, int32_t>> tasks; + // TODO(b/131746020) don't create new heap allocations every time. + // At least we make it a single heap allocation by using reserve(). 
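+    // The loop below gives each task an even share of the remaining units
+    // along thread_dim; e.g. 10 rows across 4 tasks split as 2, 2, 3, 3.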
+ tasks.reserve(thread_count); + int thread_start = 0; + for (int i = 0; i < thread_count; ++i) + { + int thread_end = thread_start + (thread_dim_size - thread_start) / (thread_count - i); + tasks.emplace_back(params, output_multiplier, output_shift, input_shape, input_data, + filter_shape, filter_data, bias_shape, bias_data, output_shape, + output_data, thread_start, thread_end, thread_dim); + thread_start = thread_end; + } + cpu_backend_threadpool::Execute(tasks.size(), tasks.data(), ruy_context); + } +} + +} // namespace optimized_integer_ops +} // namespace cker +} // namespace nnfw + +#endif // __NNFW_CKER_OPTIMIZED_DEPTHWISE_CONV_INT8_H__ diff --git a/compute/cker/include/cker/operation/reference/BatchMatMul.h b/compute/cker/include/cker/operation/reference/BatchMatMul.h index e8ffd4014..1b3020de2 100644 --- a/compute/cker/include/cker/operation/reference/BatchMatMul.h +++ b/compute/cker/include/cker/operation/reference/BatchMatMul.h @@ -87,9 +87,8 @@ inline void BatchMatMul(const Shape &lhs_shape, const float *lhs_data, const Sha { const float *lhs_ptr2 = lhs_ptr1 + b2 * lhs_ext2; const float *rhs_ptr2 = rhs_ptr1 + b2 * rhs_ext2; - float *out_ptr = - output_data + - ((b0 * batch_dim1 * batch_dim2) + b1 * batch_dim2 + b2) * lhs_rows * rhs_cols; + float *out_ptr = output_data + ((b0 * batch_dim1 * batch_dim2) + b1 * batch_dim2 + b2) * + lhs_rows * rhs_cols; for (int j = 0; j < rhs_cols; ++j) { for (int i = 0; i < lhs_rows; ++i) diff --git a/compute/cker/include/cker/operation/reference/BinaryArithmeticOps.h b/compute/cker/include/cker/operation/reference/BinaryArithmeticOps.h index f7e39248c..96e1d9127 100644 --- a/compute/cker/include/cker/operation/reference/BinaryArithmeticOps.h +++ b/compute/cker/include/cker/operation/reference/BinaryArithmeticOps.h @@ -56,28 +56,22 @@ inline void BinaryArithmeticOp(const BinaryArithmeticOpParam ¶ms, const Shap const int size = MatchingElementsSize(input1_shape, input2_shape, output_shape); for (int i = 0; i < size; i++) { - output_data[i] = - ActivationFunctionWithMinMax(fn(input1_data[i], input2_data[i]), - params.float_activation_min, params.float_activation_max); + output_data[i] = ActivationFunctionWithMinMax( + fn(input1_data[i], input2_data[i]), params.float_activation_min, params.float_activation_max); } } template <typename T> -inline void BroadcastBinaryArithmeticOpSlowQuant8( - const BinaryArithmeticOpParam ¶ms, const Shape &input1_shape, const T *input1_data, - const Shape &input2_shape, const T *input2_data, const Shape &output_shape, T *output_data, - const std::function<T(const BinaryArithmeticOpParam ¶ms, const T &, const T &)> &fn) +inline typename std::enable_if_t<is_quant8<T>::value> BroadcastBinaryArithmeticOpSlow( + const BinaryArithmeticOpParam ¶ms, const Shape &input1_shape, const T *input1_data, + const Shape &input2_shape, const T *input2_data, const Shape &output_shape, T *output_data, + const std::function<T(const BinaryArithmeticOpParam ¶ms, const T &, const T &)> &fn) { NdArrayDesc<4> desc1; NdArrayDesc<4> desc2; NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1, &desc2); const Shape extended_output_shape = Shape::ExtendedShape(4, output_shape); - if ((params.quantized_activation_min < 0) && (params.quantized_activation_max > 255)) - { - throw std::runtime_error{"Support only for Quant8."}; - } - // Comment from tensorflow lite: // // In Tensorflow, the dimensions are canonically named (batch_number, row, @@ -99,11 +93,10 @@ inline void BroadcastBinaryArithmeticOpSlowQuant8( { for (int c = 0; c < 
extended_output_shape.Dims(3); ++c) { - output_data[Offset(extended_output_shape, b, y, x, c)] = - ActivationFunctionWithMinMax<uint8_t>( - fn(params, input1_data[SubscriptToIndex(desc1, b, y, x, c)], - input2_data[SubscriptToIndex(desc2, b, y, x, c)]), - params.quantized_activation_min, params.quantized_activation_max); + output_data[Offset(extended_output_shape, b, y, x, c)] = ActivationFunctionWithMinMax<T>( + fn(params, input1_data[SubscriptToIndex(desc1, b, y, x, c)], + input2_data[SubscriptToIndex(desc2, b, y, x, c)]), + params.quantized_activation_min, params.quantized_activation_max); } } } @@ -143,9 +136,9 @@ inline void BroadcastBinaryArithmeticOpSlow(const BinaryArithmeticOpParam ¶m for (int c = 0; c < extended_output_shape.Dims(3); ++c) { output_data[Offset(extended_output_shape, b, y, x, c)] = ActivationFunctionWithMinMax<T>( - fn(input1_data[SubscriptToIndex(desc1, b, y, x, c)], - input2_data[SubscriptToIndex(desc2, b, y, x, c)]), - params.quantized_activation_min, params.quantized_activation_max); + fn(input1_data[SubscriptToIndex(desc1, b, y, x, c)], + input2_data[SubscriptToIndex(desc2, b, y, x, c)]), + params.quantized_activation_min, params.quantized_activation_max); } } } @@ -154,9 +147,9 @@ inline void BroadcastBinaryArithmeticOpSlow(const BinaryArithmeticOpParam ¶m template <> inline void BroadcastBinaryArithmeticOpSlow( - const BinaryArithmeticOpParam ¶ms, const Shape &input1_shape, const float *input1_data, - const Shape &input2_shape, const float *input2_data, const Shape &output_shape, - float *output_data, const std::function<float(const float &, const float &)> &fn) + const BinaryArithmeticOpParam ¶ms, const Shape &input1_shape, const float *input1_data, + const Shape &input2_shape, const float *input2_data, const Shape &output_shape, + float *output_data, const std::function<float(const float &, const float &)> &fn) { NdArrayDesc<4> desc1; NdArrayDesc<4> desc2; @@ -171,10 +164,10 @@ inline void BroadcastBinaryArithmeticOpSlow( { for (int c = 0; c < extended_output_shape.Dims(3); ++c) { - output_data[Offset(extended_output_shape, b, y, x, c)] = ActivationFunctionWithMinMax( - fn(input1_data[SubscriptToIndex(desc1, b, y, x, c)], - input2_data[SubscriptToIndex(desc2, b, y, x, c)]), - params.float_activation_min, params.float_activation_max); + output_data[Offset(extended_output_shape, b, y, x, c)] = + ActivationFunctionWithMinMax(fn(input1_data[SubscriptToIndex(desc1, b, y, x, c)], + input2_data[SubscriptToIndex(desc2, b, y, x, c)]), + params.float_activation_min, params.float_activation_max); } } } diff --git a/compute/cker/include/cker/operation/reference/Conv.h b/compute/cker/include/cker/operation/reference/Conv.h index 86e8b5143..e316083a5 100644 --- a/compute/cker/include/cker/operation/reference/Conv.h +++ b/compute/cker/include/cker/operation/reference/Conv.h @@ -98,8 +98,8 @@ inline void Conv(const ConvParams ¶ms, const Shape &input_shape, const float bias_value = bias_data[out_channel]; } output_data[Offset(output_shape, batch, out_y, out_x, out_channel)] = - ActivationFunctionWithMinMax(total + bias_value, output_activation_min, - output_activation_max); + ActivationFunctionWithMinMax(total + bias_value, output_activation_min, + output_activation_max); } } } @@ -183,7 +183,213 @@ inline void Conv(const ConvParams ¶ms, const Shape &input_shape, const uint8 acc = std::max(acc, output_activation_min); acc = std::min(acc, output_activation_max); output_data[Offset(output_shape, batch, out_y, out_x, out_channel)] = - static_cast<uint8_t>(acc); + 
static_cast<uint8_t>(acc); + } + } + } + } +} + +template <typename T, bool is_asymmetric> +inline void Conv(const ConvParams ¶ms, const int32_t *output_multiplier, + const int32_t *output_shift, const Shape &input_shape, const T *input_data, + const Shape &filter_shape, const T *filter_data, const int32_t *filter_zeropoint, + const Shape &bias_shape, const int32_t *bias_data, const Shape &output_shape, + T *output_data) + +{ + UNUSED_RELEASE(bias_shape); + // Get parameters. + const int32_t input_offset = params.input_offset; // r = s(q - Z) + const int stride_width = params.stride_width; + const int stride_height = params.stride_height; + const int dilation_width_factor = params.dilation_width_factor; + const int dilation_height_factor = params.dilation_height_factor; + const int pad_width = params.padding_values.width; + const int pad_height = params.padding_values.height; + const int32_t output_offset = params.output_offset; + + // Set min and max value of the output. + const int32_t output_activation_min = params.quantized_activation_min; + const int32_t output_activation_max = params.quantized_activation_max; + + // Consistency check. + assert(output_activation_min < output_activation_max); + assert(input_shape.DimensionsCount() == 4); + assert(filter_shape.DimensionsCount() == 4); + assert(output_shape.DimensionsCount() == 4); + const int batches = MatchingDim(input_shape, 0, output_shape, 0); + const int input_depth = MatchingDim(input_shape, 3, filter_shape, 3); + const int output_depth = MatchingDim(filter_shape, 0, output_shape, 3); + if (bias_data) + { + assert(bias_shape.FlatSize() == output_depth); + } + + // Check dimensions of the tensors. + const int input_height = input_shape.Dims(1); + const int input_width = input_shape.Dims(2); + const int filter_height = filter_shape.Dims(1); + const int filter_width = filter_shape.Dims(2); + const int output_height = output_shape.Dims(1); + const int output_width = output_shape.Dims(2); + for (int batch = 0; batch < batches; ++batch) + { + for (int out_y = 0; out_y < output_height; ++out_y) + { + const int in_y_origin = (out_y * stride_height) - pad_height; + for (int out_x = 0; out_x < output_width; ++out_x) + { + const int in_x_origin = (out_x * stride_width) - pad_width; + for (int out_channel = 0; out_channel < output_depth; ++out_channel) + { + int32_t acc = 0; + for (int filter_y = 0; filter_y < filter_height; ++filter_y) + { + const int in_y = in_y_origin + dilation_height_factor * filter_y; + for (int filter_x = 0; filter_x < filter_width; ++filter_x) + { + const int in_x = in_x_origin + dilation_width_factor * filter_x; + + // Zero padding by omitting the areas outside the image. + const bool is_point_inside_image = + (in_x >= 0) && (in_x < input_width) && (in_y >= 0) && (in_y < input_height); + + if (!is_point_inside_image) + { + continue; + } + + for (int in_channel = 0; in_channel < input_depth; ++in_channel) + { + const T input_val = input_data[Offset(input_shape, batch, in_y, in_x, in_channel)]; + const T filter_val = + filter_data[Offset(filter_shape, out_channel, filter_y, filter_x, in_channel)]; + if (is_asymmetric) + { + const int32_t filter_offset = -filter_zeropoint[out_channel]; + acc += (filter_val + filter_offset) * (input_val + input_offset); + } + else + { + // Accumulate with 32 bits accumulator. + // In the nudging process during model quantization, we force + // real value of 0.0 be represented by a quantized value. 
This + // guarantees that the input_offset is a int8_t, even though + // it is represented using int32_t. int32_t += int8_t * + // (int8_t - int8_t) so the highest value we can get from each + // accumulation is [-127, 127] * ([-128, 127] - + // [-128, 127]), which is [-32512, 32512]. log2(32512) + // = 14.98, which means we can accumulate at least 2^16 + // multiplications without overflow. The accumulator is + // applied to a filter so the accumulation logic will hold as + // long as the filter size (filter_y * filter_x * in_channel) + // does not exceed 2^16, which is the case in all the models + // we have seen so far. + // TODO(jianlijianli): Add a check to make sure the + // accumulator depth is smaller than 2^16. + acc += filter_val * (input_val + input_offset); + UNUSED_RELEASE(filter_zeropoint); + } + } + } + } + + if (bias_data) + { + acc += bias_data[out_channel]; + } + acc = MultiplyByQuantizedMultiplier(acc, output_multiplier[out_channel], + output_shift[out_channel]); + acc += output_offset; + acc = std::max(acc, output_activation_min); + acc = std::min(acc, output_activation_max); + output_data[Offset(output_shape, batch, out_y, out_x, out_channel)] = static_cast<T>(acc); + } + } + } + } +} + +// Slightly modified from tflite 2.13.0 HybridConvPerChannel +// im2col and im2col_shape are removed since it is not used in reference kernel. +inline void HybridConvPerChannel(const ConvParams ¶ms, float *scaling_factors_ptr, + const Shape &input_shape, const int8_t *input_data, + const Shape &filter_shape, const int8_t *filter_data, + const Shape &bias_shape, const float *bias_data, + const Shape &output_shape, float *output_data, + const float *per_channel_scale, const int32_t *input_offset) + +{ + const int stride_width = params.stride_width; + const int stride_height = params.stride_height; + const int dilation_width_factor = params.dilation_width_factor; + const int dilation_height_factor = params.dilation_height_factor; + const int pad_width = params.padding_values.width; + const int pad_height = params.padding_values.height; + const float output_activation_min = params.float_activation_min; + const float output_activation_max = params.float_activation_max; + assert(input_shape.DimensionsCount() == 4); + assert(filter_shape.DimensionsCount() == 4); + assert(output_shape.DimensionsCount() == 4); + const int batches = MatchingDim(input_shape, 0, output_shape, 0); + const int input_depth = input_shape.Dims(3); + const int output_depth = MatchingDim(filter_shape, 0, output_shape, 3); + if (bias_data) + { + assert(bias_shape.FlatSize() == output_depth); + UNUSED_RELEASE(bias_shape); + } + const int input_height = input_shape.Dims(1); + const int input_width = input_shape.Dims(2); + const int filter_height = filter_shape.Dims(1); + const int filter_width = filter_shape.Dims(2); + const int filter_input_depth = filter_shape.Dims(3); + const int groups = input_depth / filter_input_depth; + assert(input_depth % filter_input_depth == 0); + const int filters_per_group = output_depth / groups; + const int output_height = output_shape.Dims(1); + const int output_width = output_shape.Dims(2); + for (int batch = 0; batch < batches; ++batch) + { + for (int out_y = 0; out_y < output_height; ++out_y) + { + for (int out_x = 0; out_x < output_width; ++out_x) + { + for (int out_channel = 0; out_channel < output_depth; ++out_channel) + { + auto group = out_channel / filters_per_group; + const int in_x_origin = (out_x * stride_width) - pad_width; + const int in_y_origin = (out_y * stride_height) - 
pad_height; + int32_t acc = 0; + for (int filter_y = 0; filter_y < filter_height; ++filter_y) + { + for (int filter_x = 0; filter_x < filter_width; ++filter_x) + { + for (int in_channel = 0; in_channel < filter_input_depth; ++in_channel) + { + const int in_x = in_x_origin + dilation_width_factor * filter_x; + const int in_y = in_y_origin + dilation_height_factor * filter_y; + // If the location is outside the bounds of the input image, + // use zero as a default value. + if ((in_x >= 0) && (in_x < input_width) && (in_y >= 0) && (in_y < input_height)) + { + int32_t input_val = input_data[Offset(input_shape, batch, in_y, in_x, + in_channel + group * filter_input_depth)]; + int32_t filter_val = + filter_data[Offset(filter_shape, out_channel, filter_y, filter_x, in_channel)]; + acc += filter_val * (input_val - input_offset[batch]); + } + } + } + } + float acc_float = acc * per_channel_scale[out_channel] * scaling_factors_ptr[batch]; + if (bias_data) + { + acc_float += bias_data[out_channel]; + } + output_data[Offset(output_shape, batch, out_y, out_x, out_channel)] = + ActivationFunctionWithMinMax(acc_float, output_activation_min, output_activation_max); } } } diff --git a/compute/cker/include/cker/operation/reference/integer_ops/DepthwiseConvHybrid.h b/compute/cker/include/cker/operation/reference/integer_ops/DepthwiseConvHybrid.h new file mode 100644 index 000000000..9fc58ad3b --- /dev/null +++ b/compute/cker/include/cker/operation/reference/integer_ops/DepthwiseConvHybrid.h @@ -0,0 +1,122 @@ +/* + * Copyright (c) 2023 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2019 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __NNFW_CKER_REFERENCE_DEPTHWISE_CONV_HYBRID_H__ +#define __NNFW_CKER_REFERENCE_DEPTHWISE_CONV_HYBRID_H__ + +#include "cker/Shape.h" +#include "cker/Types.h" +#include "cker/Utils.h" + +namespace nnfw +{ +namespace cker +{ +namespace reference_integer_ops +{ + +inline void DepthwiseConvHybridPerChannel(const DepthwiseConvParams ¶ms, + float *scaling_factors_ptr, const Shape &input_shape, + const int8_t *input_data, const Shape &filter_shape, + const int8_t *filter_data, const Shape &bias_shape, + const float *bias_data, const Shape &output_shape, + float *output_data, const float *per_channel_scale, + int32_t *input_offset) +{ + const int stride_width = params.stride_width; + const int stride_height = params.stride_height; + const int dilation_width_factor = params.dilation_width_factor; + const int dilation_height_factor = params.dilation_height_factor; + const int pad_width = params.padding_values.width; + const int pad_height = params.padding_values.height; + const int depth_multiplier = params.depth_multiplier; + const float output_activation_min = params.float_activation_min; + const float output_activation_max = params.float_activation_max; + + // Check dimensions of the tensors. 
+ assert(input_shape.DimensionsCount() == 4); + assert(filter_shape.DimensionsCount() == 4); + assert(output_shape.DimensionsCount() == 4); + + const int batches = MatchingDim(input_shape, 0, output_shape, 0); + const int output_depth = MatchingDim(filter_shape, 3, output_shape, 3); + const int input_height = input_shape.Dims(1); + const int input_width = input_shape.Dims(2); + const int input_depth = input_shape.Dims(3); + const int filter_height = filter_shape.Dims(1); + const int filter_width = filter_shape.Dims(2); + const int output_height = output_shape.Dims(1); + const int output_width = output_shape.Dims(2); + const int bias_depth = bias_shape.FlatSize(); + UNUSED_RELEASE(output_depth); + UNUSED_RELEASE(bias_shape); + assert(output_depth == input_depth * depth_multiplier); + assert(bias_depth == output_depth); + + for (int batch = 0; batch < batches; ++batch) + { + for (int out_y = 0; out_y < output_height; ++out_y) + { + for (int out_x = 0; out_x < output_width; ++out_x) + { + for (int in_channel = 0; in_channel < input_depth; ++in_channel) + { + for (int m = 0; m < depth_multiplier; ++m) + { + const int output_channel = m + in_channel * depth_multiplier; + const int in_x_origin = (out_x * stride_width) - pad_width; + const int in_y_origin = (out_y * stride_height) - pad_height; + int32_t acc = 0; + for (int filter_y = 0; filter_y < filter_height; ++filter_y) + { + for (int filter_x = 0; filter_x < filter_width; ++filter_x) + { + const int in_x = in_x_origin + dilation_width_factor * filter_x; + const int in_y = in_y_origin + dilation_height_factor * filter_y; + // Zero padding by omitting the areas outside the image. + const bool is_point_inside_image = + (in_x >= 0) && (in_x < input_width) && (in_y >= 0) && (in_y < input_height); + if (is_point_inside_image) + { + int32_t input_val = + input_data[Offset(input_shape, batch, in_y, in_x, in_channel)]; + int32_t filter_val = + filter_data[Offset(filter_shape, 0, filter_y, filter_x, output_channel)]; + acc += filter_val * (input_val - input_offset[batch]); + } + } + } + float acc_float = static_cast<float>(acc); + acc_float *= per_channel_scale[output_channel] * scaling_factors_ptr[batch]; + if (bias_data && output_channel < bias_depth) + { + acc_float += bias_data[output_channel]; + } + output_data[Offset(output_shape, batch, out_y, out_x, output_channel)] = + ActivationFunctionWithMinMax(acc_float, output_activation_min, output_activation_max); + } + } + } + } + } +} + +} // namespace reference_integer_ops +} // namespace cker +} // namespace nnfw + +#endif // __NNFW_CKER_REFERENCE_DEPTHWISE_CONV_HYBRID_H__ diff --git a/compute/cker/include/cker/operation/reference/integer_ops/DepthwiseConvUInt8.h b/compute/cker/include/cker/operation/reference/integer_ops/DepthwiseConvUInt8.h new file mode 100644 index 000000000..025e40705 --- /dev/null +++ b/compute/cker/include/cker/operation/reference/integer_ops/DepthwiseConvUInt8.h @@ -0,0 +1,152 @@ +/* + * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2019 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __NNFW_CKER_REFERENCE_DEPTHWISE_CONV_UINT8_H__ +#define __NNFW_CKER_REFERENCE_DEPTHWISE_CONV_UINT8_H__ + +#include "cker/Shape.h" +#include "cker/Types.h" +#include "cker/Utils.h" + +namespace nnfw +{ +namespace cker +{ +namespace reference_integer_ops +{ +inline void DepthwiseConvPerChannel(const DepthwiseConvParams ¶ms, + const int32_t *output_multiplier, const int32_t *output_shift, + const Shape &input_shape, const uint8_t *input_data, + const Shape &filter_shape, const uint8_t *filter_data, + const int32_t *filter_zeropoint, const Shape &bias_shape, + const int32_t *bias_data, const Shape &output_shape, + uint8_t *output_data) +{ + // Get parameters. + // TODO(b/141565753): Re-introduce ScopedProfilingLabel on Micro. + const int stride_width = params.stride_width; + const int stride_height = params.stride_height; + const int dilation_width_factor = params.dilation_width_factor; + const int dilation_height_factor = params.dilation_height_factor; + const int pad_width = params.padding_values.width; + const int pad_height = params.padding_values.height; + const int depth_multiplier = params.depth_multiplier; + const int32_t input_offset = params.input_offset; + const int32_t output_offset = params.output_offset; + const int32_t output_activation_min = params.quantized_activation_min; + const int32_t output_activation_max = params.quantized_activation_max; + + // Check dimensions of the tensors. + assert(input_shape.DimensionsCount() == 4); + assert(filter_shape.DimensionsCount() == 4); + assert(output_shape.DimensionsCount() == 4); + + assert(output_activation_min <= output_activation_max); + const int batches = MatchingDim(input_shape, 0, output_shape, 0); + const int output_depth = MatchingDim(filter_shape, 3, output_shape, 3); + const int input_height = input_shape.Dims(1); + const int input_width = input_shape.Dims(2); + const int input_depth = input_shape.Dims(3); + const int filter_height = filter_shape.Dims(1); + const int filter_width = filter_shape.Dims(2); + const int output_height = output_shape.Dims(1); + const int output_width = output_shape.Dims(2); + UNUSED_RELEASE(output_depth); + UNUSED_RELEASE(bias_shape); + assert(output_depth == input_depth * depth_multiplier); + assert(bias_shape.FlatSize() == output_depth); + + for (int batch = 0; batch < batches; ++batch) + { + for (int out_y = 0; out_y < output_height; ++out_y) + { + for (int out_x = 0; out_x < output_width; ++out_x) + { + for (int in_channel = 0; in_channel < input_depth; ++in_channel) + { + for (int m = 0; m < depth_multiplier; ++m) + { + const int output_channel = m + in_channel * depth_multiplier; + const int in_x_origin = (out_x * stride_width) - pad_width; + const int in_y_origin = (out_y * stride_height) - pad_height; + int32_t acc = 0; + for (int filter_y = 0; filter_y < filter_height; ++filter_y) + { + for (int filter_x = 0; filter_x < filter_width; ++filter_x) + { + const int in_x = in_x_origin + dilation_width_factor * filter_x; + const int in_y = in_y_origin + dilation_height_factor * filter_y; + // Zero padding by omitting the areas outside the image. 
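+                // Skipping out-of-range taps is equivalent to padding with the
+                // input zero point, whose contribution is zero once
+                // input_offset is applied.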
+ const bool is_point_inside_image = + (in_x >= 0) && (in_x < input_width) && (in_y >= 0) && (in_y < input_height); + if (is_point_inside_image) + { + uint8_t input_val = + input_data[Offset(input_shape, batch, in_y, in_x, in_channel)]; + uint8_t filter_val = + filter_data[Offset(filter_shape, 0, filter_y, filter_x, output_channel)]; + + // { for per-channel + // NOTE: The following comment is copied from tflite int8 implementation + // It may not be 100% true for uint8 per-channel. + // + // Accumulate with 32 bits accumulator. + // In the nudging process during model quantization, we force + // real value of 0.0 be represented by a quantized value. This + // guarantees that the input_offset is a int8, even though it + // is represented using int32_t. + // int32 += int8 * (int8 - int8) so the highest value we can + // get from each accumulation is [-127, 127] * ([-128, 127] - + // [-128, 127]), which is [-32512, 32512]. log2(32512) + // = 14.98, which means we can accumulate at least 2^16 + // multiplications without overflow. The accumulator is + // applied to a filter so the accumulation logic will hold as + // long as the filter size (filter_y * filter_x * in_channel) + // does not exceed 2^16, which is the case in all the models + // we have seen so far. + // TODO(jianlijianli): Add a check to make sure the + // accumulator depth is smaller than 2^16. + const int32_t filter_offset = -filter_zeropoint[output_channel]; + acc += (filter_val + filter_offset) * (input_val + input_offset); + // } for per-channel + } + } + } + if (bias_data) + { + acc += bias_data[output_channel]; + } + acc = MultiplyByQuantizedMultiplier(acc, output_multiplier[output_channel], + output_shift[output_channel]); + acc += output_offset; + acc = std::max(acc, output_activation_min); + acc = std::min(acc, output_activation_max); + // For q8u per-channel, int8_t -> uint8_t + output_data[Offset(output_shape, batch, out_y, out_x, output_channel)] = + static_cast<uint8_t>(acc); + } + } + } + } + } +} + +} // namespace reference_integer_ops +} // namespace cker +} // namespace nnfw + +#endif // __NNFW_CKER_REFERENCE_DEPTHWISE_CONV_UINT8_H__ diff --git a/compute/cker/include/cker/ruy/RuySupport.h b/compute/cker/include/cker/ruy/RuySupport.h index 9612dd517..14489a804 100644 --- a/compute/cker/include/cker/ruy/RuySupport.h +++ b/compute/cker/include/cker/ruy/RuySupport.h @@ -19,7 +19,9 @@ #define __NNFW_CKER_RUY_RUY_SUPPORT_H__ #include <util/ConfigSource.h> -#include <ruy/context.h> +#include <ruy/matrix.h> +#include <ruy/ruy.h> +#include <cassert> #include "cker/Types.h" namespace nnfw @@ -29,44 +31,66 @@ namespace cker namespace ruy_support { +inline ruy::CachePolicy ToRuyCachePolicy(CachePolicy cache_policy) +{ + switch (cache_policy) + { + case CachePolicy::kNeverCache: + return ruy::CachePolicy::kNeverCache; + case CachePolicy::kCacheIfLargeSpeedup: + return ruy::CachePolicy::kCacheIfLargeSpeedup; + case CachePolicy::kAlwaysCache: + return ruy::CachePolicy::kAlwaysCache; + default: + assert(false); + return ruy::CachePolicy::kNeverCache; + } +} + template <typename Scalar, typename DataPointer> void MakeRuyMatrix(const MatrixParams<Scalar> ¶ms, DataPointer data_ptr, - ruy::Matrix<Scalar> *dst) + ruy::Matrix<Scalar> *dst, bool use_caching = false) { - dst->layout.rows = params.rows; - dst->layout.cols = params.cols; - if (params.order == Order::kColMajor) + ruy::Order ruy_order = + params.order == Order::kColMajor ? 
ruy::Order::kColMajor : ruy::Order::kRowMajor; + ruy::MakeSimpleLayout(params.rows, params.cols, ruy_order, dst->mutable_layout()); + // Note that ruy::Matrix::data is a ConstCheckingPtr, not a plain pointer. + // It does care whether we assign to it a Scalar* or a const Scalar*. + dst->set_data(data_ptr); + dst->set_zero_point(params.zero_point); + if (use_caching) { - dst->layout.order = ruy::Order::kColMajor; - dst->layout.stride = params.rows; + dst->set_cache_policy(ToRuyCachePolicy(params.cache_policy)); } - else +} + +// Integer-quantized case with destination type narrower than int32 +template <typename DstScalar, QuantizationFlavor quantization_flavor> +void MakeRuyMulParams(const GemmParams<std::int32_t, DstScalar, quantization_flavor> ¶ms, + ruy::MulParams<std::int32_t, DstScalar> *ruy_mul_params) +{ + static_assert(sizeof(DstScalar) < sizeof(std::int32_t), ""); + if (quantization_flavor == QuantizationFlavor::kIntegerWithUniformMultiplier) { - dst->layout.order = ruy::Order::kRowMajor; - dst->layout.stride = params.cols; + ruy_mul_params->set_multiplier_fixedpoint(params.multiplier_fixedpoint); + ruy_mul_params->set_multiplier_exponent(params.multiplier_exponent); } - // Note that ruy::Matrix::data is a ConstCheckingPtr, not a plain pointer. - // It does care whether we assign to it a Scalar* or a const Scalar*. - dst->data = data_ptr; - dst->zero_point = params.zero_point; - dst->cacheable = params.cacheable; + if (quantization_flavor == QuantizationFlavor::kIntegerWithPerRowMultiplier) + { + ruy_mul_params->set_multiplier_fixedpoint_perchannel(params.multiplier_fixedpoint_perchannel); + ruy_mul_params->set_multiplier_exponent_perchannel(params.multiplier_exponent_perchannel); + } + ruy_mul_params->set_bias(params.bias); + ruy_mul_params->set_clamp_min(params.clamp_min); + ruy_mul_params->set_clamp_max(params.clamp_max); } -template <typename GemmParamsType, typename RuySpecType> -void MakeRuySpec(const GemmParamsType ¶ms, RuySpecType *ruy_spec) +// Raw-integer case with destination type int32. +template <QuantizationFlavor quantization_flavor> +void MakeRuyMulParams(const GemmParams<std::int32_t, std::int32_t, quantization_flavor> ¶ms, + ruy::MulParams<std::int32_t, std::int32_t> *ruy_mul_params) { - // This validation has already been performed by the Gemm API entry point, - // but it doesn't hurt to test specifically this again here, where it's - // being used. - ValidateGemmParams(params); - - ruy_spec->multiplier_fixedpoint = params.multiplier_fixedpoint; - ruy_spec->multiplier_exponent = params.multiplier_exponent; - ruy_spec->multiplier_fixedpoint_perchannel = params.multiplier_fixedpoint_perchannel; - ruy_spec->multiplier_exponent_perchannel = params.multiplier_exponent_perchannel; - ruy_spec->bias = params.bias; - ruy_spec->clamp_min = params.clamp_min; - ruy_spec->clamp_max = params.clamp_max; + ruy_mul_params->set_bias(params.bias); } } // namespace ruy_support diff --git a/compute/cker/include/cker/train/operation/FullyConnected.h b/compute/cker/include/cker/train/operation/FullyConnected.h new file mode 100644 index 000000000..b0255d287 --- /dev/null +++ b/compute/cker/include/cker/train/operation/FullyConnected.h @@ -0,0 +1,49 @@ +/* + * Copyright (c) 2023 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __NNFW_CKER_TRAIN_OPERATION_FULLY_CONNECTED_H__ +#define __NNFW_CKER_TRAIN_OPERATION_FULLY_CONNECTED_H__ + +#include "cker/eigen/Utils.h" +#include "cker/Shape.h" + +namespace nnfw +{ +namespace cker +{ +namespace train +{ + +template <typename T> +inline void FullyConnectedBiasGrad(const Shape &incomming_shape, const T *incomming_data, + const Shape &grad_shape, T *grad_data) +{ + const auto bias_size = grad_shape.FlatSize(); + if (bias_size != incomming_shape.Dims(incomming_shape.DimensionsCount() - 1) || + bias_size != grad_shape.Dims(0)) + throw std::runtime_error("cker::FullyConnectedBiasGrad: Unmatched shape"); + + const auto in_mat = MapAsMatrixWithLastDimAsRows(incomming_data, incomming_shape); + auto grad_mat = MapAsMatrixWithLastDimAsRows(grad_data, grad_shape); + + grad_mat = in_mat.rowwise().sum(); +} + +} // namespace train +} // namespace cker +} // namespace nnfw + +#endif // __NNFW_CKER_FULLY_CONNECTED_H__ diff --git a/compute/cker/include/cker/train/operation/Loss.h b/compute/cker/include/cker/train/operation/Loss.h new file mode 100644 index 000000000..94f49ff07 --- /dev/null +++ b/compute/cker/include/cker/train/operation/Loss.h @@ -0,0 +1,77 @@ +/* + * Copyright (c) 2023 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef __NNFW_CKER_TRAIN_OPERATION_LOSS_H__ +#define __NNFW_CKER_TRAIN_OPERATION_LOSS_H__ + +#include "cker/Shape.h" +#include "cker/eigen/Utils.h" + +namespace nnfw +{ +namespace cker +{ +namespace train +{ + +template <typename T> +inline void MSE(const Shape &y_pred_shape, const T *y_pred_data, const Shape &y_true_shape, + const T *y_true_data, const Shape &output_shape, T *output_data) +{ + // TODO Consider Reduction + if (output_shape != Shape{1}) + throw std::runtime_error("cker::MSE: output_shape != Shape{1}"); + if (y_pred_shape != y_true_shape) + throw std::runtime_error("cker::MSE: y_pred_shape != y_true_shape"); + + const auto y_pred = MapAsMatrixWithLastDimAsRows(y_pred_data, y_pred_shape); + const auto y_true = MapAsMatrixWithLastDimAsRows(y_true_data, y_true_shape); + + double squared_sum = 0.0f; + for (size_t c = 0; c < (size_t)y_pred.cols(); ++c) + { + for (size_t r = 0; r < (size_t)y_pred.rows(); ++r) + { + double error = y_pred.coeff(r, c) - y_true.coeff(r, c); + squared_sum += (error * error); + } + } + + auto size = y_pred.cols() * y_pred.rows(); + output_data[0] = (T)(squared_sum / size); +} + +template <typename T> +inline void MSEGrad(const Shape &y_pred_shape, const T *y_pred_data, const Shape &y_true_shape, + const T *y_true_data, const Shape &grad_shape, T *grad_data) +{ + if (y_pred_shape != y_true_shape) + throw std::runtime_error("cker::MSEGrad: y_pred_shape != y_true_shape"); + if (y_pred_shape != grad_shape) + throw std::runtime_error("cker::MSEGrad: y_pred_shape != grad_shape"); + + const int size = grad_shape.FlatSize(); + for (int i = 0; i < size; ++i) + { + grad_data[i] = static_cast<T>(-2 * (y_true_data[i] - y_pred_data[i]) / size); + } +} + +} // namespace train +} // namespace cker +} // namespace nnfw + +#endif // __NNFW_CKER_TRAIN_OPERATION_LOSS_H__ diff --git a/compute/cker/include/cker/train/operation/ReLU.h b/compute/cker/include/cker/train/operation/ReLU.h new file mode 100644 index 000000000..32cf7fa9c --- /dev/null +++ b/compute/cker/include/cker/train/operation/ReLU.h @@ -0,0 +1,50 @@ +/* + * Copyright (c) 2023 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef __NNFW_CKER_TRAIN_OPERATION_RELU_H__ +#define __NNFW_CKER_TRAIN_OPERATION_RELU_H__ + +#include "cker/Shape.h" +#include "cker/eigen/Utils.h" + +#include <Eigen/Core> + +namespace nnfw +{ +namespace cker +{ +namespace train +{ + +inline void ReLUGrad(const Shape &output_shape, const float *output_data, + const Shape &incoming_shape, const float *incoming_data, + const Shape &grad_shape, float *grad_data) +{ + const auto output_map = MapAsVector(output_data, output_shape); + const auto incoming_map = MapAsVector(incoming_data, incoming_shape); + auto grad_map = MapAsVector(grad_data, grad_shape); + + if (output_shape == incoming_shape && output_shape == grad_shape) + grad_map.array() = incoming_map.array() * (output_map.array() > 0.0f).template cast<float>(); + else + throw std::runtime_error("cker::ReLUGrad: Unsupported shape"); +} + +} // namespace train +} // namespace cker +} // namespace nnfw + +#endif // __NNFW_CKER_TRAIN_OPERATION_RELU_H__ diff --git a/compute/cker/src/Range.test.cc b/compute/cker/src/Range.test.cc new file mode 100644 index 000000000..e5fe4801f --- /dev/null +++ b/compute/cker/src/Range.test.cc @@ -0,0 +1,70 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include <cker/operation/Range.h> + +#include <gtest/gtest.h> +#include <vector> + +TEST(CKer_Operation, Range) +{ + { + const int start = 0; + const int limit = 10; + const int delta = 1; + std::vector<int> actual(10); + nnfw::cker::Range<int>(&start, &limit, &delta, actual.data()); + + for (int i = 0; i < actual.size(); i++) + ASSERT_EQ(actual[i], i); + } + + { + const int start = 3; + const int limit = 18; + const int delta = 3; + std::vector<int> expected = {3, 6, 9, 12, 15}; + std::vector<int> actual(expected.size()); + nnfw::cker::Range<int>(&start, &limit, &delta, actual.data()); + + for (int i = 0; i < actual.size(); i++) + ASSERT_EQ(actual[i], expected[i]); + } + + { + const float start = 3; + const float limit = 1; + const float delta = -0.5; + std::vector<float> expected = {3, 2.5, 2, 1.5}; + std::vector<float> actual(expected.size()); + nnfw::cker::Range<float>(&start, &limit, &delta, actual.data()); + + for (int i = 0; i < actual.size(); i++) + ASSERT_FLOAT_EQ(actual[i], expected[i]); + } +} + +TEST(CKer_Operation, neg_Range) +{ + { + const int start = 212; + const int limit = 10; + const int delta = 1; + std::vector<int> actual(10); + + EXPECT_ANY_THROW(nnfw::cker::Range<int>(&start, &limit, &delta, actual.data())); + } +} diff --git a/compute/cker/src/train/FullyConnected.test.cc b/compute/cker/src/train/FullyConnected.test.cc new file mode 100644 index 000000000..37c2d4a97 --- /dev/null +++ b/compute/cker/src/train/FullyConnected.test.cc @@ -0,0 +1,83 @@ +/* + * Copyright (c) 2023 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include <cker/train/operation/FullyConnected.h> + +#include <gtest/gtest.h> +#include <vector> + +TEST(CKer_Operation, FullyConnectedBiasGrad) +{ + { + // Shape: {2, 4} + std::vector<float> incoming_backward = {-1, 2, -3, 4, 5, -6, -7, 8}; + // Shape: {4} + std::vector<float> expected_bias_backward = {4, -4, -10, 12}; + std::vector<float> bias_backward(4); + + nnfw::cker::train::FullyConnectedBiasGrad( + nnfw::cker::Shape{2, 4}, incoming_backward.data(), + nnfw::cker::Shape{static_cast<int>(bias_backward.size())}, bias_backward.data()); + + for (size_t i = 0; i < bias_backward.size(); ++i) + ASSERT_EQ(bias_backward[i], expected_bias_backward[i]); + } + + { + // Shape: {3, 3} + std::vector<float> incoming_backward = {-1, 2, -3, 4, 5, -6, -7, 8, 9}; + // Shape: {3} + std::vector<float> expected_bias_backward = {-4, 15, 0}; + std::vector<float> bias_backward(3); + + nnfw::cker::train::FullyConnectedBiasGrad( + nnfw::cker::Shape{3, 3}, incoming_backward.data(), + nnfw::cker::Shape{static_cast<int>(bias_backward.size())}, bias_backward.data()); + + for (size_t i = 0; i < bias_backward.size(); ++i) + ASSERT_EQ(bias_backward[i], expected_bias_backward[i]); + } + + { + // Shape: {1, 2, 2, 3} + std::vector<float> incoming_backward = {-1, 2, -3, 4, 5, -6, -7, 8, 9, -10, -11, 12}; + // Shape: {3} + std::vector<float> expected_bias_backward = {-14, 4, 12}; + std::vector<float> bias_backward(3); + + nnfw::cker::train::FullyConnectedBiasGrad( + nnfw::cker::Shape{1, 2, 2, 3}, incoming_backward.data(), + nnfw::cker::Shape{static_cast<int>(bias_backward.size())}, bias_backward.data()); + + for (size_t i = 0; i < bias_backward.size(); ++i) + ASSERT_EQ(bias_backward[i], expected_bias_backward[i]); + } +} + +TEST(CKer_Operation, neg_FullyConnectedBiasGrad) +{ + { + // Unmatched shape + // Shape: {2, 4} + std::vector<float> incoming_backward = {-1, 2, -3, 4, 5, -6, -7, 8}; + // Shape: {3} + std::vector<float> bias_backward(3); + EXPECT_ANY_THROW(nnfw::cker::train::FullyConnectedBiasGrad( + nnfw::cker::Shape{2, 4}, incoming_backward.data(), + nnfw::cker::Shape{static_cast<int>(bias_backward.size())}, + bias_backward.data());); + } +} diff --git a/compute/cker/src/train/Loss.test.cc b/compute/cker/src/train/Loss.test.cc new file mode 100644 index 000000000..98568f47a --- /dev/null +++ b/compute/cker/src/train/Loss.test.cc @@ -0,0 +1,201 @@ +/* + * Copyright (c) 2023 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+#include <cker/train/operation/Loss.h>
+
+#include <gtest/gtest.h>
+#include <vector>
+
+TEST(CKer_Operation, LossMSE)
+{
+  {
+    // Shape: {1, 10} -> m_rows:10, m_cols:1
+    std::vector<int> y_pred = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
+    std::vector<int> y_true = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9};
+    std::vector<int> output(1);
+    std::vector<int> expected = {1};
+
+    nnfw::cker::train::MSE(nnfw::cker::Shape{1, 10}, y_pred.data(), nnfw::cker::Shape{1, 10},
+                           y_true.data(), nnfw::cker::Shape{1}, output.data());
+
+    EXPECT_EQ(output[0], expected[0]);
+  }
+
+  {
+    // Shape: {1, 10} -> m_rows:10, m_cols:1
+    std::vector<float> y_pred = {1., 2., 3., 4., 5., 6., 7., 8., 9., 10.};
+    std::vector<float> y_true = {0., 1., 2., 3., 4., 5., 6., 7., 8., 9.};
+    std::vector<float> output(1);
+    std::vector<float> expected = {1.0};
+
+    nnfw::cker::train::MSE(nnfw::cker::Shape{1, 10}, y_pred.data(), nnfw::cker::Shape{1, 10},
+                           y_true.data(), nnfw::cker::Shape{1}, output.data());
+
+    EXPECT_FLOAT_EQ(output[0], expected[0]);
+  }
+
+  {
+    // Shape: {2, 3} -> m_rows:3, m_cols:2
+    std::vector<float> y_pred = {27.2, 31.8, 51.9, 10.2, 34.2, 12.4};
+    std::vector<float> y_true = {31.3, 40.3, 29.7, 12.9, 25.8, 11.9};
+    std::vector<float> output(1);
+    std::vector<float> expected = {110.0};
+
+    nnfw::cker::train::MSE(nnfw::cker::Shape{2, 3}, y_pred.data(), nnfw::cker::Shape{2, 3},
+                           y_true.data(), nnfw::cker::Shape{1}, output.data());
+
+    EXPECT_FLOAT_EQ(output[0], expected[0]);
+  }
+
+  {
+    // Shape: {2, 3, 4} -> m_rows:4, m_cols:6
+    std::vector<float> y_pred = {1., 2., 3., 4., 1., 2., 3., 4., 1., 2., 3., 4.,
+                                 1., 2., 3., 4., 1., 2., 3., 4., 1., 2., 3., 4.};
+    std::vector<float> y_true = {1., 1., 1., 1., 2., 2., 2., 2., 3., 3., 3., 3.,
+                                 1., 1., 1., 1., 2., 2., 2., 2., 3., 3., 3., 3.};
+    std::vector<float> output(1);
+    std::vector<float> expected = {2.1666667};
+
+    nnfw::cker::train::MSE(nnfw::cker::Shape{2, 3, 4}, y_pred.data(), nnfw::cker::Shape{2, 3, 4},
+                           y_true.data(), nnfw::cker::Shape{1}, output.data());
+
+    EXPECT_FLOAT_EQ(output[0], expected[0]);
+  }
+}
+
+TEST(CKer_Operation, neg_LossMSE)
+{
+  {
+    // Invalid expected value
+    std::vector<float> y_pred = {1., 2., 3., 4., 5., 6., 7., 8., 9., 10.};
+    std::vector<float> y_true = {0., 1., 2., 3., 4., 5., 6., 7., 8., 9.};
+    std::vector<float> output(1);
+    std::vector<float> expected = {-1.0};
+
+    nnfw::cker::train::MSE(nnfw::cker::Shape{1, 10}, y_pred.data(), nnfw::cker::Shape{1, 10},
+                           y_true.data(), nnfw::cker::Shape{1}, output.data());
+
+    EXPECT_NE(output[0], expected[0]);
+  }
+
+  {
+    // Invalid output shape
+    std::vector<float> y_pred = {1., 2., 3., 4., 5., 6., 7., 8., 9., 10.};
+    std::vector<float> y_true = {0., 1., 2., 3., 4., 5., 6., 7., 8., 9.};
+    std::vector<float> output(3);
+    std::vector<float> expected = {1.0};
+
+    EXPECT_ANY_THROW(nnfw::cker::train::MSE(nnfw::cker::Shape{2, 3, 4}, y_pred.data(),
+                                            nnfw::cker::Shape{2, 3, 4}, y_true.data(),
+                                            nnfw::cker::Shape{3}, output.data()));
+  }
+
+  {
+    // Different y_pred and y_true shape
+    std::vector<float> y_pred = {1., 2., 3., 4., 5., 6., 7., 8., 9., 10.};
+    std::vector<float> y_true = {0., 1., 2., 3., 4., 5.};
+    std::vector<float> output(1);
+    std::vector<float> expected = {1.0};
+
+    EXPECT_ANY_THROW(nnfw::cker::train::MSE(nnfw::cker::Shape{2, 3, 4}, y_pred.data(),
+                                            nnfw::cker::Shape{2, 3}, y_true.data(),
+                                            nnfw::cker::Shape{1}, output.data()));
+  }
+}
+
+TEST(CKer_Operation, LossMSEGrad)
+{
+  {
+    // Shape: {1, 10} -> m_rows:10, m_cols:1
+    std::vector<int> y_pred = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
std::vector<int> y_true = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; + std::vector<int> deriv_y_pred(10); + std::vector<int> expected = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + + nnfw::cker::train::MSEGrad(nnfw::cker::Shape{1, 10}, y_pred.data(), nnfw::cker::Shape{1, 10}, + y_true.data(), nnfw::cker::Shape{1, 10}, deriv_y_pred.data()); + + for (size_t i = 0; i < deriv_y_pred.size(); ++i) + EXPECT_EQ(deriv_y_pred[i], expected[i]); + } + + { + // Shape: {1, 10} -> m_rows:10, m_cols:1 + std::vector<float> y_pred = {1., 2., 3., 4., 5., 6., 7., 8., 9., 10.}; + std::vector<float> y_true = {0., 1., 2., 3., 4., 5., 6., 7., 8., 9.}; + std::vector<float> deriv_y_pred(10); + std::vector<float> expected = {0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2}; + + nnfw::cker::train::MSEGrad(nnfw::cker::Shape{1, 10}, y_pred.data(), nnfw::cker::Shape{1, 10}, + y_true.data(), nnfw::cker::Shape{1, 10}, deriv_y_pred.data()); + + for (size_t i = 0; i < deriv_y_pred.size(); ++i) + EXPECT_FLOAT_EQ(deriv_y_pred[i], expected[i]); + } + + { + // Shape: {2, 3} -> m_rows:3, m_cols:2 + std::vector<float> y_pred = {27.2, 31.8, 51.9, 10.2, 34.2, 12.4}; + std::vector<float> y_true = {31.3, 40.3, 29.7, 12.9, 25.8, 11.9}; + std::vector<float> deriv_y_pred(6); + std::vector<float> expected = {-1.3666667, -2.8333333, 7.4, -0.9, 2.8, 0.1666667}; + + nnfw::cker::train::MSEGrad(nnfw::cker::Shape{2, 3}, y_pred.data(), nnfw::cker::Shape{2, 3}, + y_true.data(), nnfw::cker::Shape{2, 3}, deriv_y_pred.data()); + + for (size_t i = 0; i < deriv_y_pred.size(); ++i) + EXPECT_FLOAT_EQ(deriv_y_pred[i], expected[i]); + } +} + +TEST(CKer_Operation, neg_LossMSEGrad) +{ + { + // Invalid expected value + std::vector<float> y_pred = {27.2, 31.8, 51.9, 10.2, 34.2, 12.4}; + std::vector<float> y_true = {31.3, 40.3, 29.7, 12.9, 25.8, 11.9}; + std::vector<float> deriv_y_pred(6); + std::vector<float> expected = {1., 1., 1., 1., 1., 1.}; + + nnfw::cker::train::MSEGrad(nnfw::cker::Shape{2, 3}, y_pred.data(), nnfw::cker::Shape{2, 3}, + y_true.data(), nnfw::cker::Shape{2, 3}, deriv_y_pred.data()); + + for (size_t i = 0; i < deriv_y_pred.size(); ++i) + EXPECT_NE(deriv_y_pred[i], expected[i]); + } + + { + // Different y_pred and y_true shape + std::vector<float> y_pred = {1., 2., 3., 4., 5., 6., 7., 8., 9., 10.}; + std::vector<float> y_true = {0., 1., 2., 3., 4., 5.}; + std::vector<float> deriv_y_pred(10); + + EXPECT_ANY_THROW(nnfw::cker::train::MSEGrad(nnfw::cker::Shape{1, 10}, y_pred.data(), + nnfw::cker::Shape{2, 3}, y_true.data(), + nnfw::cker::Shape{1, 10}, deriv_y_pred.data())); + } + + { + // Different y_pred and deriv_y_pred shape + std::vector<float> y_pred = {1., 2., 3., 4., 5., 6., 7., 8., 9., 10.}; + std::vector<float> y_true = {0., 1., 2., 3., 4., 5., 6., 7., 8., 9.}; + std::vector<float> deriv_y_pred(6); + + EXPECT_ANY_THROW(nnfw::cker::train::MSEGrad(nnfw::cker::Shape{1, 10}, y_pred.data(), + nnfw::cker::Shape{1, 10}, y_true.data(), + nnfw::cker::Shape{2, 3}, deriv_y_pred.data())); + } +} diff --git a/compute/cker/src/train/Relu.test.cc b/compute/cker/src/train/Relu.test.cc new file mode 100644 index 000000000..d94411038 --- /dev/null +++ b/compute/cker/src/train/Relu.test.cc @@ -0,0 +1,107 @@ +/* + * Copyright (c) 2023 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include <cker/operation/ReLU.h> +#include <cker/train/operation/ReLU.h> + +#include <gtest/gtest.h> +#include <vector> + +namespace +{ + +template <typename T> class ReluOpVerifier +{ +public: + ReluOpVerifier(const std::vector<T> &input, const std::vector<T> &expected_output, + const std::vector<T> &backprop_output, + const std::vector<T> &expected_backprop_input) + : _input{input}, _expected_output{expected_output}, _backprop_output{backprop_output}, + _expected_backprop_input{expected_backprop_input} + { + EXPECT_TRUE(input.size() == expected_output.size()); + _output.resize(_expected_output.size()); + _backprop_input.resize(_expected_backprop_input.size()); + } + +public: + void verifyExpected() + { + nnfw::cker::ReLU(nnfw::cker::Shape{static_cast<int>(_input.size())}, _input.data(), + nnfw::cker::Shape{static_cast<int>(_output.size())}, _output.data()); + + for (size_t i = 0; i < _output.size(); ++i) + ASSERT_EQ(_output[i], _expected_output[i]); + + if (_backprop_output.size() > 0) + { + nnfw::cker::train::ReLUGrad( + nnfw::cker::Shape{static_cast<int>(_output.size())}, _output.data(), + nnfw::cker::Shape{static_cast<int>(_backprop_output.size())}, _backprop_output.data(), + nnfw::cker::Shape{static_cast<int>(_backprop_input.size())}, _backprop_input.data()); + + for (size_t i = 0; i < _backprop_input.size(); ++i) + ASSERT_EQ(_backprop_input[i], _expected_backprop_input[i]); + } + } + +private: + std::vector<T> _input; + std::vector<T> _output; + std::vector<T> _expected_output; + std::vector<T> _backprop_output; + std::vector<T> _backprop_input; + std::vector<T> _expected_backprop_input; +}; + +} // namespace + +TEST(CKer_Operation, ReLU) +{ + { + std::vector<float> input_forward = {-1, 2, 3, -4}; + std::vector<float> expected_forward = {0, 2, 3, 0}; + std::vector<float> incoming_backward = {-5, 6, -7, 8}; + std::vector<float> expected_backward = {0, 6, -7, 0}; + ReluOpVerifier<float> verifier{input_forward, expected_forward, incoming_backward, + expected_backward}; + verifier.verifyExpected(); + } + + { + std::vector<float> input_forward = {0, -1, 2, 3, -4, 5, 6, -7}; + std::vector<float> expected_forward = {0, 0, 2, 3, 0, 5, 6, 0}; + std::vector<float> incoming_backward = {8, -9, 10, 11, -12, -13, 14, -15}; + std::vector<float> expected_backward = {0, 0, 10, 11, 0, -13, 14, 0}; + ReluOpVerifier<float> verifier{input_forward, expected_forward, incoming_backward, + expected_backward}; + verifier.verifyExpected(); + } +} + +TEST(CKer_Operation, neg_ReLU) +{ + { + // Unmatched shape + std::vector<float> input_forward = {0, -1, 2, 3, -4}; + std::vector<float> expected_forward = {0, 0, 2, 3, 0}; + std::vector<float> incoming_backward = {-5, 6, -7, 8}; + std::vector<float> expected_backward = {0, 6, -7, 0}; + ReluOpVerifier<float> verifier{input_forward, expected_forward, incoming_backward, + expected_backward}; + EXPECT_ANY_THROW(verifier.verifyExpected()); + } +} |
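
Note (not part of the patch): the unit tests above exercise each new training kernel in isolation. As a quick orientation, the following minimal sketch shows how the kernels introduced in this diff could be chained for a single FullyConnected -> ReLU -> MSE training step. It assumes only the headers and function signatures visible in this diff; the shapes and values are made up for illustration.

// Hypothetical usage sketch; not part of the diff above.
#include <cker/operation/ReLU.h>
#include <cker/train/operation/FullyConnected.h>
#include <cker/train/operation/Loss.h>
#include <cker/train/operation/ReLU.h>

#include <iostream>
#include <vector>

int main()
{
  // Forward: ReLU over a {2, 3} activation tensor (batch of 2, 3 units).
  std::vector<float> logits = {-1.f, 2.f, -3.f, 4.f, 5.f, -6.f};
  std::vector<float> activations(logits.size());
  nnfw::cker::ReLU(nnfw::cker::Shape{2, 3}, logits.data(), nnfw::cker::Shape{2, 3},
                   activations.data());

  // Loss: MSE reduces to a single scalar, so the output shape must be {1}.
  std::vector<float> targets = {0.f, 2.f, 0.f, 3.f, 5.f, 0.f};
  std::vector<float> loss(1);
  nnfw::cker::train::MSE(nnfw::cker::Shape{2, 3}, activations.data(), nnfw::cker::Shape{2, 3},
                         targets.data(), nnfw::cker::Shape{1}, loss.data());

  // Backward: dLoss/dActivation, then back through the ReLU.
  std::vector<float> grad_act(activations.size());
  nnfw::cker::train::MSEGrad(nnfw::cker::Shape{2, 3}, activations.data(),
                             nnfw::cker::Shape{2, 3}, targets.data(), nnfw::cker::Shape{2, 3},
                             grad_act.data());

  std::vector<float> grad_logits(logits.size());
  nnfw::cker::train::ReLUGrad(nnfw::cker::Shape{2, 3}, activations.data(),
                              nnfw::cker::Shape{2, 3}, grad_act.data(), nnfw::cker::Shape{2, 3},
                              grad_logits.data());

  // Bias gradient of the fully connected layer feeding the ReLU: sum the
  // incoming gradient over the batch dimension, one value per output unit.
  std::vector<float> grad_bias(3);
  nnfw::cker::train::FullyConnectedBiasGrad(nnfw::cker::Shape{2, 3}, grad_logits.data(),
                                            nnfw::cker::Shape{3}, grad_bias.data());

  std::cout << "loss = " << loss[0] << "\n";
  return 0;
}

In this sketch the scalar loss has shape {1}, while MSEGrad, ReLUGrad and FullyConnectedBiasGrad produce gradients shaped like their corresponding inputs, which matches the shape checks enforced inside the kernels.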