diff options
22 files changed, 742 insertions, 303 deletions
diff --git a/.gitignore b/.gitignore index 0eaad0dc7..e7a60c0b1 100644 --- a/.gitignore +++ b/.gitignore @@ -85,3 +85,4 @@ GRTAGS /externals/neon_2_sse /externals/tensorflow /externals/acl +/externals/absl diff --git a/CMakeLists.txt b/CMakeLists.txt index 6238ce073..85a81dd94 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -38,6 +38,7 @@ link_directories(${CMAKE_INSTALL_PREFIX}/lib) # Download configuration option(DOWNLOAD_TENSORFLOW "Download Tensorflow source" ON) +option(DOWNLOAD_ABSL "Download Absl source" ON) option(DOWNLOAD_EIGEN "Download Eigen source" ON) option(DOWNLOAD_FARMHASH "Download farmhash source" ON) option(DOWNLOAD_GEMMLOWP "Download GEMM low precesion library source" ON) @@ -49,6 +50,11 @@ option(DOWNLOAD_ARMCOMPUTE "Download ARM Compute source" ON) option(BUILD_GTEST "Download and build Google Test" ON) nnfw_find_package(GTest QUIET) +# NOTE Workaround to avoid build fail by tensorflow (or acl) package version mismatch on obs build +if(OBS_BUILD) + add_definitions(-DOBS_BUILD) +endif(OBS_BUILD) + # TODO For now Android build is being enabled incrementally so not all subdirectories are added yet. # However we are going to have the same subdirectories with other OS eventually. 
if("${TARGET_OS}" STREQUAL "android") diff --git a/cmake/option/option_armv7l-linux.cmake b/cmake/option/option_armv7l-linux.cmake index 42988bc9d..b295b4a82 100644 --- a/cmake/option/option_armv7l-linux.cmake +++ b/cmake/option/option_armv7l-linux.cmake @@ -18,6 +18,7 @@ set(FLAGS_COMMON ${FLAGS_COMMON} "-mfpu=neon-vfpv4" "-funsafe-math-optimizations" "-ftree-vectorize" + "-fPIC" ) # remove warning from arm cl diff --git a/cmake/packages/AbslSourceConfig.cmake b/cmake/packages/AbslSourceConfig.cmake new file mode 100644 index 000000000..9075b7397 --- /dev/null +++ b/cmake/packages/AbslSourceConfig.cmake @@ -0,0 +1,19 @@ +function(_AbslSource_import) + if(NOT DOWNLOAD_ABSL) + set(AbslSource_FOUND FALSE PARENT_SCOPE) + return() + endif(NOT DOWNLOAD_ABSL) + + nnfw_include(ExternalSourceTools) + nnfw_include(OptionTools) + + # NOTE The following URL comes from TensorFlow 1.12 + envoption(EXTERNAL_DOWNLOAD_SERVER "https://github.com") + set(ABSL_URL ${EXTERNAL_DOWNLOAD_SERVER}/abseil/abseil-cpp/archive/48cd2c3f351ff188bc85684b84a91b6e6d17d896.tar.gz) + ExternalSource_Download("absl" ${ABSL_URL}) + + set(AbslSource_DIR ${absl_SOURCE_DIR} PARENT_SCOPE) + set(AbslSource_FOUND TRUE PARENT_SCOPE) +endfunction(_AbslSource_import) + +_AbslSource_import() diff --git a/cmake/packages/EigenSourceConfig.cmake b/cmake/packages/EigenSourceConfig.cmake index aac30dff5..dd94e069e 100644 --- a/cmake/packages/EigenSourceConfig.cmake +++ b/cmake/packages/EigenSourceConfig.cmake @@ -7,7 +7,7 @@ function(_EigenSource_import) nnfw_include(ExternalSourceTools) nnfw_include(OptionTools) - # NOTE The following URL comes from TensorFlow 1.9 + # NOTE TensorFlow 1.12 downloads Eigen from the following URL envoption(EXTERNAL_DOWNLOAD_SERVER "https://bitbucket.org") set(EIGEN_URL ${EXTERNAL_DOWNLOAD_SERVER}/eigen/eigen/get/fd6845384b86.tar.gz) ExternalSource_Download("eigen" ${EIGEN_URL}) diff --git a/cmake/packages/FarmhashSourceConfig.cmake b/cmake/packages/FarmhashSourceConfig.cmake index 
29bc7f213..802367968 100644 --- a/cmake/packages/FarmhashSourceConfig.cmake +++ b/cmake/packages/FarmhashSourceConfig.cmake @@ -7,7 +7,7 @@ function(_FarmhashSource_import) nnfw_include(ExternalSourceTools) nnfw_include(OptionTools) - # NOTE TensorFlow 1.9 downloads farmhash from the following URL + # NOTE TensorFlow 1.12 downloads farmhash from the following URL envoption(EXTERNAL_DOWNLOAD_SERVER "https://github.com") set(FARMHASH_URL ${EXTERNAL_DOWNLOAD_SERVER}/google/farmhash/archive/816a4ae622e964763ca0862d9dbd19324a1eaf45.tar.gz) ExternalSource_Download("farmhash" ${FARMHASH_URL}) diff --git a/cmake/packages/FlatBuffersConfig.cmake b/cmake/packages/FlatBuffersConfig.cmake index fab08fe39..064673158 100644 --- a/cmake/packages/FlatBuffersConfig.cmake +++ b/cmake/packages/FlatBuffersConfig.cmake @@ -15,18 +15,22 @@ function(_FlatBuffers_import) # From FlatBuffers's CMakeLists.txt list(APPEND FlatBuffers_Compiler_SRCS "${FlatBuffersSource_DIR}/src/idl_gen_cpp.cpp") + list(APPEND FlatBuffers_Compiler_SRCS "${FlatBuffersSource_DIR}/src/idl_gen_dart.cpp") + list(APPEND FlatBuffers_Compiler_SRCS "${FlatBuffersSource_DIR}/src/idl_gen_fbs.cpp") list(APPEND FlatBuffers_Compiler_SRCS "${FlatBuffersSource_DIR}/src/idl_gen_general.cpp") list(APPEND FlatBuffers_Compiler_SRCS "${FlatBuffersSource_DIR}/src/idl_gen_go.cpp") + list(APPEND FlatBuffers_Compiler_SRCS "${FlatBuffersSource_DIR}/src/idl_gen_grpc.cpp") list(APPEND FlatBuffers_Compiler_SRCS "${FlatBuffersSource_DIR}/src/idl_gen_js.cpp") + list(APPEND FlatBuffers_Compiler_SRCS "${FlatBuffersSource_DIR}/src/idl_gen_json_schema.cpp") + list(APPEND FlatBuffers_Compiler_SRCS "${FlatBuffersSource_DIR}/src/idl_gen_lobster.cpp") + list(APPEND FlatBuffers_Compiler_SRCS "${FlatBuffersSource_DIR}/src/idl_gen_lua.cpp") list(APPEND FlatBuffers_Compiler_SRCS "${FlatBuffersSource_DIR}/src/idl_gen_php.cpp") list(APPEND FlatBuffers_Compiler_SRCS "${FlatBuffersSource_DIR}/src/idl_gen_python.cpp") - list(APPEND FlatBuffers_Compiler_SRCS 
"${FlatBuffersSource_DIR}/src/idl_gen_fbs.cpp") - list(APPEND FlatBuffers_Compiler_SRCS "${FlatBuffersSource_DIR}/src/idl_gen_grpc.cpp") - list(APPEND FlatBuffers_Compiler_SRCS "${FlatBuffersSource_DIR}/src/idl_gen_json_schema.cpp") list(APPEND FlatBuffers_Compiler_SRCS "${FlatBuffersSource_DIR}/src/flatc.cpp") list(APPEND FlatBuffers_Compiler_SRCS "${FlatBuffersSource_DIR}/src/flatc_main.cpp") list(APPEND FlatBuffers_Compiler_SRCS "${FlatBuffersSource_DIR}/grpc/src/compiler/cpp_generator.cc") list(APPEND FlatBuffers_Compiler_SRCS "${FlatBuffersSource_DIR}/grpc/src/compiler/go_generator.cc") + list(APPEND FlatBuffers_Compiler_SRCS "${FlatBuffersSource_DIR}/grpc/src/compiler/java_generator.cc") if(NOT TARGET flatbuffers) add_library(flatbuffers ${FlatBuffers_Library_SRCS}) diff --git a/cmake/packages/FlatBuffersSourceConfig.cmake b/cmake/packages/FlatBuffersSourceConfig.cmake index f062c2608..5f142bff7 100644 --- a/cmake/packages/FlatBuffersSourceConfig.cmake +++ b/cmake/packages/FlatBuffersSourceConfig.cmake @@ -7,9 +7,9 @@ function(_FlatBuffersSource_import) nnfw_include(ExternalSourceTools) nnfw_include(OptionTools) - # NOTE TensorFlow 1.9 downloads FlatBuffers from the following URL + # NOTE TensorFlow 1.12 downloads FlatBuffers from the following URL envoption(EXTERNAL_DOWNLOAD_SERVER "https://github.com") - set(FLATBUFFERS_URL ${EXTERNAL_DOWNLOAD_SERVER}/google/flatbuffers/archive/971a68110e4fc1bace10fcb6deeb189e7e1a34ce.tar.gz) + set(FLATBUFFERS_URL ${EXTERNAL_DOWNLOAD_SERVER}/google/flatbuffers/archive/1f5eae5d6a135ff6811724f6c57f911d1f46bb15.tar.gz) ExternalSource_Download("flatbuffers" ${FLATBUFFERS_URL}) set(FlatBuffersSource_DIR ${flatbuffers_SOURCE_DIR} PARENT_SCOPE) diff --git a/cmake/packages/GEMMLowpSourceConfig.cmake b/cmake/packages/GEMMLowpSourceConfig.cmake index f5e73355e..613ff29b5 100644 --- a/cmake/packages/GEMMLowpSourceConfig.cmake +++ b/cmake/packages/GEMMLowpSourceConfig.cmake @@ -7,7 +7,7 @@ function(_GEMMLowpSource_import) 
nnfw_include(ExternalSourceTools) nnfw_include(OptionTools) - # NOTE TensorFlow 1.9 uses the following URL + # NOTE TensorFlow 1.12 uses the following URL envoption(EXTERNAL_DOWNLOAD_SERVER "https://github.com") set(GEMMLOWP_URL ${EXTERNAL_DOWNLOAD_SERVER}/google/gemmlowp/archive/38ebac7b059e84692f53e5938f97a9943c120d98.tar.gz) ExternalSource_Download("gemmlowp" ${GEMMLOWP_URL}) diff --git a/cmake/packages/NEON2SSESourceConfig.cmake b/cmake/packages/NEON2SSESourceConfig.cmake index 6258b536d..b656f5700 100644 --- a/cmake/packages/NEON2SSESourceConfig.cmake +++ b/cmake/packages/NEON2SSESourceConfig.cmake @@ -7,7 +7,7 @@ function(_NEON2SSESource_import) nnfw_include(ExternalSourceTools) nnfw_include(OptionTools) - # NOTE TensorFlow 1.9 downloads NEON2SSE from the following URL + # NOTE TensorFlow 1.12 downloads NEON2SSE from the following URL envoption(EXTERNAL_DOWNLOAD_SERVER "https://github.com") set(NEON2SSE_URL ${EXTERNAL_DOWNLOAD_SERVER}/intel/ARM_NEON_2_x86_SSE/archive/0f77d9d182265259b135dad949230ecbf1a2633d.tar.gz) ExternalSource_Download("neon_2_sse" ${NEON2SSE_URL}) diff --git a/cmake/packages/TensorFlowSourceConfig.cmake b/cmake/packages/TensorFlowSourceConfig.cmake index ee88f9335..5828334c7 100644 --- a/cmake/packages/TensorFlowSourceConfig.cmake +++ b/cmake/packages/TensorFlowSourceConfig.cmake @@ -8,7 +8,7 @@ function(_TensorFlowSource_import) nnfw_include(OptionTools) envoption(EXTERNAL_DOWNLOAD_SERVER "https://github.com") - set(TENSORFLOW_URL ${EXTERNAL_DOWNLOAD_SERVER}/tensorflow/tensorflow/archive/v1.9.0.tar.gz) + set(TENSORFLOW_URL ${EXTERNAL_DOWNLOAD_SERVER}/tensorflow/tensorflow/archive/v1.12.0.tar.gz) ExternalSource_Download("tensorflow" ${TENSORFLOW_URL}) set(TensorFlowSource_DIR ${tensorflow_SOURCE_DIR} PARENT_SCOPE) diff --git a/externals/CMakeLists.txt b/externals/CMakeLists.txt index 5bfc4ad4e..c6f5b5b8b 100644 --- a/externals/CMakeLists.txt +++ b/externals/CMakeLists.txt @@ -2,6 +2,7 @@ set(TENSORFLOW_BASE 
${CMAKE_CURRENT_SOURCE_DIR}/tensorflow) set(TENSORFLOW_LITE_BASE ${TENSORFLOW_BASE}/tensorflow/contrib/lite) # Required source & package +nnfw_find_package(AbslSource REQUIRED) nnfw_find_package(Eigen REQUIRED) nnfw_find_package(FarmhashSource REQUIRED) nnfw_find_package(FlatBuffersSource REQUIRED) @@ -19,10 +20,20 @@ file(GLOB_RECURSE TFLITE_KERNEL_SRCS "${TENSORFLOW_LITE_BASE}/kernels/*.cc") file(GLOB_RECURSE TFLITE_KERNEL_TESTS "${TENSORFLOW_LITE_BASE}/kernels/*test*.cc") list(REMOVE_ITEM TFLITE_KERNEL_SRCS ${TFLITE_KERNEL_TESTS}) +file(GLOB TFLITE_LIB_SRCS "${TENSORFLOW_LITE_BASE}/c/*.c" "${TENSORFLOW_LITE_BASE}/c/*.cc") +file(GLOB TFLITE_LIB_TESTS "${TENSORFLOW_LITE_BASE}/c/*test*.cc") +list(REMOVE_ITEM TFLITE_LIB_SRCS ${TFLITE_LIB_TESTS}) + +file(GLOB TFLITE_API_SRCS "${TENSORFLOW_LITE_BASE}/core/api/*.c" "${TENSORFLOW_LITE_BASE}/core/api/*.cc") +file(GLOB TFLITE_API_TESTS "${TENSORFLOW_LITE_BASE}/core/api/*test*.cc") +list(REMOVE_ITEM TFLITE_API_SRCS ${TFLITE_API_TESTS}) + # We will use our own BuiltinOpResolver list(REMOVE_ITEM TFLITE_KERNEL_SRCS "${TENSORFLOW_LITE_BASE}/kernels/register.cc") list(APPEND TFLITE_SRCS ${TFLITE_CORE_SRCS}) list(APPEND TFLITE_SRCS ${TFLITE_KERNEL_SRCS}) +list(APPEND TFLITE_SRCS ${TFLITE_LIB_SRCS}) +list(APPEND TFLITE_SRCS ${TFLITE_API_SRCS}) list(APPEND TFLITE_SRCS "${TFLITE_DEPEND_DIR}/farmhash/src/farmhash.cc") @@ -36,6 +47,7 @@ if(BUILD_TFLITE_BENCHMARK_MODEL) endif() list(APPEND TFLITE_INCLUDES "${CMAKE_CURRENT_SOURCE_DIR}/tensorflow") +list(APPEND TFLITE_INCLUDES "${TFLITE_DEPEND_DIR}/absl") list(APPEND TFLITE_INCLUDES "${TFLITE_DEPEND_DIR}/gemmlowp") list(APPEND TFLITE_INCLUDES "${TFLITE_DEPEND_DIR}/farmhash/src") list(APPEND TFLITE_INCLUDES "${TFLITE_DEPEND_DIR}/flatbuffers/include") diff --git a/include/NeuralNetworksShim.h b/include/NeuralNetworksShim.h index a7bd745fb..b310a44cd 100644 --- a/include/NeuralNetworksShim.h +++ b/include/NeuralNetworksShim.h @@ -14,7 +14,7 @@ See the License for the specific language 
governing permissions and limitations under the License. ==============================================================================*/ -// NOTE This header is derived from the following file (in TensorFlow) +// NOTE This header is derived from the following file (in TensorFlow v1.12) // 'externals/tensorflow/tensorflow/contrib/lite/nnapi/NeuralNetworksShim.h' #ifndef __NEURAL_NETWORKS_SHIM__ #define __NEURAL_NETWORKS_SHIM__ @@ -68,6 +68,9 @@ typedef int (*ANeuralNetworksModel_identifyInputsAndOutputs_fn)( ANeuralNetworksModel* model, uint32_t inputCount, const uint32_t* inputs, uint32_t outputCount, const uint32_t* outputs); +typedef int (*ANeuralNetworksModel_relaxComputationFloat32toFloat16_fn)( + ANeuralNetworksModel* model, bool allow); + typedef int (*ANeuralNetworksExecution_create_fn)( ANeuralNetworksCompilation* compilation, ANeuralNetworksExecution** execution); @@ -360,6 +363,34 @@ inline int ANeuralNetworksModel_identifyInputsAndOutputs( } /** + * Specifies whether {@link ANEURALNETWORKS_TENSOR_FLOAT32} is allowed to be + * calculated with range and/or precision as low as that of the IEEE 754 16-bit + * floating-point format. By default, {@link ANEURALNETWORKS_TENSOR_FLOAT32} + * must be calculated using at least the range and precision of the IEEE 754 + * 32-bit floating-point format. + * + * @param model The model to be modified. + * @param allow 'true' indicates {@link ANEURALNETWORKS_TENSOR_FLOAT32} may be + * calculated with range and/or precision as low as that of the + * IEEE 754 16-bit floating point format. 'false' indicates + * {@link ANEURALNETWORKS_TENSOR_FLOAT32} must be calculated using + * at least the range and precision of the IEEE 754 32-bit floating + * point format. + * + * Attempting to modify a model once {@link ANeuralNetworksModel_finish} has + * been called will return an error. + * + * Available since API level 28. + * + * See {@link ANeuralNetworksModel} for information on multithreaded usage. 
+ */ +inline int ANeuralNetworksModel_relaxComputationFloat32toFloat16( + ANeuralNetworksModel* model, bool allow) { + LOAD_FUNCTION(ANeuralNetworksModel_relaxComputationFloat32toFloat16); + EXECUTE_FUNCTION_RETURN(model, allow); +} + +/** * Create a {@link ANeuralNetworksCompilation} to compile the given model. * This only creates the object. Compilation is only performed once * {@link ANeuralNetworksCompilation_start} is invoked. diff --git a/include/support/tflite/nnapi_delegate.h b/include/support/tflite/nnapi_delegate.h index a5da8ac39..b396d77f2 100644 --- a/include/support/tflite/nnapi_delegate.h +++ b/include/support/tflite/nnapi_delegate.h @@ -17,18 +17,24 @@ limitations under the License. // NOTE To minimize diff with upstream tensorflow, disable clang-format // clang-format off -// NOTE This header is derived from the following file (in TensorFlow) +// NOTE This header is derived from the following file (in TensorFlow v1.12) // 'externals/tensorflow/tensorflow/contrib/lite/nnapi_delegate.h' #ifndef __NNFW_SUPPORT_TFLITE_NNAPI_DELEGATE_H__ #define __NNFW_SUPPORT_TFLITE_NNAPI_DELEGATE_H__ #include "tensorflow/contrib/lite/allocation.h" +#ifdef OBS_BUILD #include "tensorflow/contrib/lite/context.h" #include "tensorflow/contrib/lite/error_reporter.h" +#else +#include "tensorflow/contrib/lite/c/c_api_internal.h" +#include "tensorflow/contrib/lite/core/api/error_reporter.h" +#endif #include "tensorflow/contrib/lite/interpreter.h" #include "NeuralNetworksShim.h" class ANeuralNetworksModel; +class ANeuralNetworksMemory; class ANeuralNetworksCompilation; namespace nnfw { @@ -62,11 +68,16 @@ class NNAPIDelegate { // Run TfLiteStatus Invoke(::tflite::Interpreter* interpreter); + // Whether the current platform supports NNAPI delegation. 
+ static bool IsSupported(); + private: // The NN API model handle ANeuralNetworksModel* nn_model_ = nullptr; // The NN API compilation handle ANeuralNetworksCompilation* nn_compiled_model_ = nullptr; + // Model status + TfLiteStatus model_status_ = kTfLiteOk; // List of state tensors for LSTM, RNN, SVDF. // NN API does not allow ops to maintain states across multiple diff --git a/libs/support/tflite/src/nnapi_delegate.cpp b/libs/support/tflite/src/nnapi_delegate.cpp index 987cd77c5..57322f531 100644 --- a/libs/support/tflite/src/nnapi_delegate.cpp +++ b/libs/support/tflite/src/nnapi_delegate.cpp @@ -17,47 +17,85 @@ limitations under the License. // NOTE To minimize diff with upstream tensorflow, disable clang-format // clang-format off -// NOTE This code is derived from the following file (in TensorFlow) +// NOTE This code is derived from the following file (in TensorFlow v1.11) // 'externals/tensorflow/tensorflow/contrib/lite/nnapi_delegate.cc' #include "support/tflite/nnapi_delegate.h" #include <fcntl.h> #include <sys/mman.h> #include <sys/stat.h> #include <sys/types.h> +#ifdef OBS_BUILD #include "tensorflow/contrib/lite/builtin_op_data.h" #include "tensorflow/contrib/lite/error_reporter.h" +#else +#include "tensorflow/contrib/lite/c/builtin_op_data.h" +#include "tensorflow/contrib/lite/core/api/error_reporter.h" +#endif #include "tensorflow/contrib/lite/model.h" #include "NeuralNetworksShim.h" #include "NeuralNetworksExShim.h" #ifdef __ANDROID__ +#include <android/log.h> #include <sys/system_properties.h> #endif -namespace nnfw -{ +namespace nnfw { -// TODO(aselle): FATAL leaves resources hanging. -void FATAL(const char* format, ...) { - va_list args; - va_start(args, format); - vfprintf(stderr, format, args); - va_end(args); +void logError(const char* format, ...) 
{ + // stderr is convenient for native tests, but is not captured for apps + va_list args_for_stderr; + va_start(args_for_stderr, format); + vfprintf(stderr, format, args_for_stderr); + va_end(args_for_stderr); + fprintf(stderr, "\n"); fflush(stderr); - exit(1); +#ifdef __ANDROID__ + // produce logcat output for general consumption + va_list args_for_log; + va_start(args_for_log, format); + __android_log_vprint(ANDROID_LOG_ERROR, "tflite", format, args_for_log); + va_end(args_for_log); +#endif } +#define FATAL(...) \ + logError(__VA_ARGS__); \ + exit(1); + // TODO(aselle): Change the error model to use status codes. -#define CHECK_TFLITE_SUCCESS(x) \ - if (x != kTfLiteOk) { \ - FATAL("Aborting since tflite returned failure."); \ +#define CHECK_TFLITE_SUCCESS(x) \ + if (x != kTfLiteOk) { \ + FATAL("Aborting since tflite returned failure nnapi_delegate.cc:%d.", \ + __LINE__); \ } -#define CHECK_NN(x) \ - if (x != ANEURALNETWORKS_NO_ERROR) { \ - FATAL("Aborting since tflite returned failure."); \ +#define CHECK_NN(x) \ + if (x != ANEURALNETWORKS_NO_ERROR) { \ + FATAL("Aborting since NNAPI returned failure nnapi_delegate.cc:%d", \ + __LINE__); \ } +#define RETURN_ERROR_IF_TFLITE_FAILED(x) \ + if (x != kTfLiteOk) { \ + logError( \ + "Returning error since TFLite returned failure nnapi_delegate.cc:%d.", \ + __LINE__); \ + return kTfLiteError; \ + } + +#define RETURN_ERROR_IF_NN_FAILED(x) \ + if (x != ANEURALNETWORKS_NO_ERROR) { \ + logError( \ + "Returning error since NNAPI returned failure nnapi_delegate.cc:%d.", \ + __LINE__); \ + return kTfLiteError; \ + } + +// Tracking of NNAPI operand ids +static const int64_t kOperandIdNotSet = -1; +static const int64_t kOperandNotNeeded = -2; + namespace { int32_t GetAndroidSdkVersion() { @@ -80,7 +118,10 @@ int32_t GetAndroidSdkVersion() { return 0; } -static const int32_t kAndroidSdkVersion = GetAndroidSdkVersion(); +int32_t GetAndroidSdkVersionCached() { + static int32_t androidSdkVersion = GetAndroidSdkVersion(); + return 
androidSdkVersion; +} } // namespace @@ -112,21 +153,16 @@ NNAPIDelegate::~NNAPIDelegate() { } // Adds the tensors of the interpreter to the NN API model. -// Returns the number of operands added. -uint32_t addTensorOperands(tflite::Interpreter* interpreter, +TfLiteStatus addTensorOperands(tflite::Interpreter* interpreter, ANeuralNetworksModel* nn_model, - const std::vector<uint32_t>& skip_list) { + uint32_t* no_of_operands_added, + std::vector<int64_t>* nnapi_ids) { uint32_t next_id = 0; for (size_t i = 0; i < interpreter->tensors_size(); i++) { - // skip temporaries tensors. - bool shouldSkip = false; - for (auto skip_idx : skip_list) { - if (i == skip_idx) { - shouldSkip = true; - break; - } - } - if (shouldSkip) continue; + // Skip temporaries and RNN back-edges. + if ((*nnapi_ids)[i] == kOperandNotNeeded) continue; + + (*nnapi_ids)[i] = int64_t(next_id); int32_t nn_type = 0; // NNAPI requires 32-bit float scale to be zero, tflite doesn't care @@ -160,7 +196,28 @@ uint32_t addTensorOperands(tflite::Interpreter* interpreter, zeroPoint = tensor->params.zero_point; break; default: - FATAL("Unsupported type."); + logError("Unsupported tensor type %d", tensor->type); + return kTfLiteError; + } + if (tensor->dims->size == 0) { + // WORKAROUND Some model have dimension zero + switch (tensor->type) { + case kTfLiteFloat32: + nn_type = ANEURALNETWORKS_FLOAT32; + break; + case kTfLiteInt32: + nn_type = ANEURALNETWORKS_INT32; + break; + default: + logError("NNAPI doesn't support tensors with rank 0 (index %d name %s)", + i, tensor->name); + return kTfLiteError; + } + } + if (tensor->dims->size > 4) { + logError("NNAPI doesn't support tensors with rank > 4 (index %d name %s)", + i, tensor->name); + return kTfLiteError; } // TODO(aselle): Note, many of these are intermediate results. Do I need // to ever specify these sizes. 
I am currently below doing setValue @@ -170,36 +227,53 @@ uint32_t addTensorOperands(tflite::Interpreter* interpreter, ANeuralNetworksOperandType operand_type{ nn_type, static_cast<uint32_t>(tensor->dims->size), reinterpret_cast<uint32_t*>(tensor->dims->data), scale, zeroPoint}; - CHECK_NN(ANeuralNetworksModel_addOperand(nn_model, &operand_type)); + RETURN_ERROR_IF_NN_FAILED( + ANeuralNetworksModel_addOperand(nn_model, &operand_type)); // TODO(aselle): Based on Michael's suggestion, limiting this to read // only memory if (tensor->allocation_type == kTfLiteMmapRo) { if (const NNAPIAllocation* alloc = dynamic_cast<const NNAPIAllocation*>( - static_cast<const ::tflite::Allocation*>(tensor->allocation))) { - CHECK_NN(ANeuralNetworksModel_setOperandValueFromMemory( - nn_model, next_id, alloc->memory(), alloc->offset(tensor->data.raw), - tensor->bytes)); + static_cast<const tflite::Allocation*>(tensor->allocation))) { + RETURN_ERROR_IF_NN_FAILED( + ANeuralNetworksModel_setOperandValueFromMemory( + nn_model, next_id, alloc->memory(), + alloc->offset(tensor->data.raw), tensor->bytes)); } else { - CHECK_NN(ANeuralNetworksModel_setOperandValue( + RETURN_ERROR_IF_NN_FAILED(ANeuralNetworksModel_setOperandValue( nn_model, next_id, tensor->data.raw, tensor->bytes)); } } else if (tensor->bytes == 0) { // These size 0 tensors are optional tensors reserved. - CHECK_NN( + RETURN_ERROR_IF_NN_FAILED( ANeuralNetworksModel_setOperandValue(nn_model, next_id, nullptr, 0)); } ++next_id; } - return next_id; + *no_of_operands_added = next_id; + return kTfLiteOk; +} + +void MapAndAddTensorIds(const int* from_ids_buf, size_t from_ids_count, + std::vector<uint32_t>* into, + const std::vector<int64_t>& map) { + for (size_t i = 0; i < from_ids_count; i++) { + int from_id = from_ids_buf[i]; + if (from_id == kOptionalTensor) { + into->push_back(from_id); + } else { + into->push_back(map[from_id]); + } + } } // Adds the operations and their parameters to the NN API model. 
// 'next-id' is the operand ID of the next operand of the model. -void AddOpsAndParams(tflite::Interpreter* interpreter, - ANeuralNetworksModel* nn_model, uint32_t next_id, - std::vector<int>* model_state_inputs, - std::vector<int>* model_state_outputs) { +TfLiteStatus AddOpsAndParams( + tflite::Interpreter* interpreter, ANeuralNetworksModel* nn_model, + uint32_t next_id, std::vector<int>* model_state_inputs, + std::vector<int>* model_state_outputs, + const std::vector<int64_t>& tensor_id_to_nnapi_id) { for (size_t i = 0; i < interpreter->nodes_size(); i++) { const auto* node_and_registration = interpreter->node_and_registration(i); const TfLiteNode& node = node_and_registration->first; @@ -208,10 +282,11 @@ void AddOpsAndParams(tflite::Interpreter* interpreter, static_cast<tflite::BuiltinOperator>(registration.builtin_code); // Add the parameters. - std::vector<uint32_t> augmented_inputs( - node.inputs->data, node.inputs->data + node.inputs->size); - std::vector<uint32_t> augmented_outputs( - node.outputs->data, node.outputs->data + node.outputs->size); + std::vector<uint32_t> augmented_inputs, augmented_outputs; + MapAndAddTensorIds(node.inputs->data, node.inputs->size, &augmented_inputs, + tensor_id_to_nnapi_id); + MapAndAddTensorIds(node.outputs->data, node.outputs->size, + &augmented_outputs, tensor_id_to_nnapi_id); auto add_scalar_int32 = [&nn_model, &augmented_inputs, &next_id](int value) { @@ -260,47 +335,68 @@ void AddOpsAndParams(tflite::Interpreter* interpreter, model_state_outputs->push_back(tensor_id); next_id++; }; + auto check_and_add_activation = [&add_scalar_int32](int activation) { + if (activation > kTfLiteActRelu6) { + logError("NNAPI only supports RELU, RELU1 and RELU6 activations"); + return kTfLiteError; + } + add_scalar_int32(activation); + return kTfLiteOk; + }; - auto add_add_params = [&add_scalar_int32]() { add_scalar_int32(0); }; + auto add_add_params = [&add_scalar_int32](void* data) { + auto* builtin = 
reinterpret_cast<TfLiteAddParams*>(data); + if (builtin->activation > kTfLiteActRelu6) { + logError("NNAPI only supports RELU, RELU1 and RELU6 activations"); + return kTfLiteError; + } + add_scalar_int32(builtin->activation); + return kTfLiteOk; + }; - auto add_pooling_params = [&add_scalar_int32](void* data) { + auto add_pooling_params = [&add_scalar_int32, + &check_and_add_activation](void* data) { auto builtin = reinterpret_cast<TfLitePoolParams*>(data); add_scalar_int32(builtin->padding); add_scalar_int32(builtin->stride_width); add_scalar_int32(builtin->stride_height); add_scalar_int32(builtin->filter_width); add_scalar_int32(builtin->filter_height); - add_scalar_int32(builtin->activation); + return check_and_add_activation(builtin->activation); }; - auto add_convolution_params = [&add_scalar_int32](void* data) { + auto add_convolution_params = [&add_scalar_int32, + &check_and_add_activation](void* data) { auto builtin = reinterpret_cast<TfLiteConvParams*>(data); add_scalar_int32(builtin->padding); add_scalar_int32(builtin->stride_width); add_scalar_int32(builtin->stride_height); - add_scalar_int32(builtin->activation); + return check_and_add_activation(builtin->activation); }; - auto add_depthwise_conv_params = [&add_scalar_int32](void* data) { + auto add_depthwise_conv_params = [&add_scalar_int32, + &check_and_add_activation](void* data) { auto builtin = reinterpret_cast<TfLiteDepthwiseConvParams*>(data); add_scalar_int32(builtin->padding); add_scalar_int32(builtin->stride_width); add_scalar_int32(builtin->stride_height); add_scalar_int32(builtin->depth_multiplier); - add_scalar_int32(builtin->activation); + return check_and_add_activation(builtin->activation); }; - auto add_fully_connected_params = [&add_scalar_int32](void* data) { + auto add_fully_connected_params = [&check_and_add_activation](void* data) { auto builtin = reinterpret_cast<TfLiteFullyConnectedParams*>(data); - add_scalar_int32(builtin->activation); + return 
check_and_add_activation(builtin->activation); }; auto add_concatenation_params = [&add_scalar_int32](void* data) { auto builtin = reinterpret_cast<TfLiteConcatenationParams*>(data); add_scalar_int32(builtin->axis); if (builtin->activation != kTfLiteActNone) { - FATAL("Concatenation does not support fused activation in NNAPI"); + logError("Concatenation does not support fused activation in NNAPI"); + return kTfLiteError; } + return kTfLiteOk; }; auto add_softmax_params = [&add_scalar_float32](void* data) { @@ -324,6 +420,7 @@ void AddOpsAndParams(tflite::Interpreter* interpreter, // LSTM in NNAPI requires scratch tensor as an output operand. auto add_lstm_scratch_tensor_float32 = [interpreter, &node, &nn_model, &next_id, &augmented_outputs]() { + if (node.temporaries->size == 0) return; int scratch_buffer_index = node.temporaries->data[0]; const TfLiteTensor* tensor = interpreter->tensor(scratch_buffer_index); ANeuralNetworksOperandType operand_type{ @@ -336,7 +433,11 @@ void AddOpsAndParams(tflite::Interpreter* interpreter, }; auto add_mean_params = [&add_scalar_int32](void* data) { +#ifdef OBS_BUILD auto builtin = reinterpret_cast<TfLiteMeanParams*>(data); +#else + auto builtin = reinterpret_cast<TfLiteReducerParams*>(data); +#endif add_scalar_int32(builtin->keep_dims); }; @@ -351,7 +452,7 @@ void AddOpsAndParams(tflite::Interpreter* interpreter, add_scalar_int32(builtin->activation); }; - auto add_squeeze_params = [&add_vector_int32](void* data) { + auto add_squeeze_params = [&](void* data) { const auto* builtin = reinterpret_cast<TfLiteSqueezeParams*>(data); // Note that we add the squeeze dimensions even if the dimensions were // unspecified (empty), as NNAPI requires the operand. 
@@ -382,26 +483,34 @@ void AddOpsAndParams(tflite::Interpreter* interpreter, switch (builtin) { case tflite::BuiltinOperator_ADD: nn_op_type = ANEURALNETWORKS_ADD; - add_add_params(); + RETURN_ERROR_IF_TFLITE_FAILED(add_add_params(node.builtin_data)); break; case tflite::BuiltinOperator_MUL: nn_op_type = ANEURALNETWORKS_MUL; - add_add_params(); + RETURN_ERROR_IF_TFLITE_FAILED(add_add_params(node.builtin_data)); break; case tflite::BuiltinOperator_AVERAGE_POOL_2D: - add_pooling_params(node.builtin_data); + RETURN_ERROR_IF_TFLITE_FAILED(add_pooling_params(node.builtin_data)); nn_op_type = ANEURALNETWORKS_AVERAGE_POOL_2D; break; case tflite::BuiltinOperator_MAX_POOL_2D: - add_pooling_params(node.builtin_data); + RETURN_ERROR_IF_TFLITE_FAILED(add_pooling_params(node.builtin_data)); nn_op_type = ANEURALNETWORKS_MAX_POOL_2D; break; case tflite::BuiltinOperator_L2_POOL_2D: - add_pooling_params(node.builtin_data); + RETURN_ERROR_IF_TFLITE_FAILED(add_pooling_params(node.builtin_data)); nn_op_type = ANEURALNETWORKS_L2_POOL_2D; break; - case tflite::BuiltinOperator_CONV_2D: - add_convolution_params(node.builtin_data); + case tflite::BuiltinOperator_CONV_2D: { + auto builtin = reinterpret_cast<TfLiteConvParams*>(node.builtin_data); + if (builtin->dilation_width_factor != 1 || + builtin->dilation_height_factor != 1 || node.inputs->size != 3) { + logError("NNAPI does not support dilated Conv2D."); + return kTfLiteError; + } + } + RETURN_ERROR_IF_TFLITE_FAILED( + add_convolution_params(node.builtin_data)); nn_op_type = ANEURALNETWORKS_CONV_2D; break; case tflite::BuiltinOperator_RELU: @@ -423,11 +532,13 @@ void AddOpsAndParams(tflite::Interpreter* interpreter, nn_op_type = ANEURALNETWORKS_LOGISTIC; break; case tflite::BuiltinOperator_DEPTHWISE_CONV_2D: - add_depthwise_conv_params(node.builtin_data); + RETURN_ERROR_IF_TFLITE_FAILED( + add_depthwise_conv_params(node.builtin_data)); nn_op_type = ANEURALNETWORKS_DEPTHWISE_CONV_2D; break; case tflite::BuiltinOperator_CONCATENATION: - 
add_concatenation_params(node.builtin_data); + RETURN_ERROR_IF_TFLITE_FAILED( + add_concatenation_params(node.builtin_data)); nn_op_type = ANEURALNETWORKS_CONCATENATION; break; case tflite::BuiltinOperator_SOFTMAX: @@ -435,10 +546,15 @@ void AddOpsAndParams(tflite::Interpreter* interpreter, nn_op_type = ANEURALNETWORKS_SOFTMAX; break; case tflite::BuiltinOperator_FULLY_CONNECTED: - add_fully_connected_params(node.builtin_data); + RETURN_ERROR_IF_TFLITE_FAILED( + add_fully_connected_params(node.builtin_data)); nn_op_type = ANEURALNETWORKS_FULLY_CONNECTED; break; case tflite::BuiltinOperator_RESHAPE: + if (node.inputs->size != 2) { + logError("NNAPI only supports 2-input RESHAPE"); + return kTfLiteError; + } nn_op_type = ANEURALNETWORKS_RESHAPE; // add_reshape_params(node.builtin_data); break; @@ -451,6 +567,10 @@ void AddOpsAndParams(tflite::Interpreter* interpreter, nn_op_type = ANEURALNETWORKS_SPACE_TO_DEPTH; break; case tflite::BuiltinOperator_LSTM: { + if (node.inputs->size + /* no of params */ 3 != 21) { + logError("NNAPI only supports 21-input LSTMs"); + return kTfLiteError; + } duplicate_state_tensor_float32( node.outputs->data[/*kOutputStateTensor*/ 0]); duplicate_state_tensor_float32( @@ -489,10 +609,6 @@ void AddOpsAndParams(tflite::Interpreter* interpreter, add_mean_params(node.builtin_data); nn_op_type = ANEURALNETWORKS_MEAN; break; - case tflite::BuiltinOperator_L2_NORMALIZATION: - nn_op_type = ANEURALNETWORKS_L2_NORMALIZATION; - check_l2normalization_params(node.builtin_data); - break; case tflite::BuiltinOperator_LOCAL_RESPONSE_NORMALIZATION: nn_op_type = ANEURALNETWORKS_LOCAL_RESPONSE_NORMALIZATION; add_lrn_params(node.builtin_data); @@ -500,20 +616,54 @@ void AddOpsAndParams(tflite::Interpreter* interpreter, case tflite::BuiltinOperator_DIV: nnapi_version = 11; // require NNAPI 1.1 nn_op_type = ANEURALNETWORKS_DIV; - add_add_params(); - break; - case tflite::BuiltinOperator_HASHTABLE_LOOKUP: - nn_op_type = ANEURALNETWORKS_HASHTABLE_LOOKUP; + 
RETURN_ERROR_IF_TFLITE_FAILED(check_and_add_activation( + reinterpret_cast<TfLiteDivParams*>(node.builtin_data)->activation)); break; case tflite::BuiltinOperator_SUB: nnapi_version = 11; // require NNAPI 1.1 nn_op_type = ANEURALNETWORKS_SUB; - add_add_params(); + RETURN_ERROR_IF_TFLITE_FAILED(check_and_add_activation( + reinterpret_cast<TfLiteSubParams*>(node.builtin_data)->activation)); break; case tflite::BuiltinOperator_SQUEEZE: - nnapi_version = 11; // require NNAPI 1.1 - nn_op_type = ANEURALNETWORKS_SQUEEZE; + nnapi_version = 11; // requires NNAPI 1.1 add_squeeze_params(node.builtin_data); + nn_op_type = ANEURALNETWORKS_SQUEEZE; + break; + case tflite::BuiltinOperator_TRANSPOSE: + // The permutation input tensor value dictates the output dimensions. + // TODO(b/110888333): Support dynamically-sized tensors in delegates. + if ((node.inputs->size > 1) && + (interpreter->tensor(node.inputs->data[1])->allocation_type != + kTfLiteMmapRo)) { + logError("NNAPI does not yet support dynamic tensors."); + return kTfLiteError; + } + nnapi_version = 11; // require NNAPI 1.1 + nn_op_type = ANEURALNETWORKS_TRANSPOSE; + break; + case tflite::BuiltinOperator_L2_NORMALIZATION: + nn_op_type = ANEURALNETWORKS_L2_NORMALIZATION; + if (reinterpret_cast<TfLiteL2NormParams*>(node.builtin_data) + ->activation != kTfLiteActNone) { + logError( + "NNAPI does not support L2Normalization with fused activations"); + return kTfLiteError; + } + if ((node.inputs->size > 0) && + (interpreter->tensor(node.inputs->data[0])->dims->size != 4)) { + logError("NNAPI only supports input rank 4 for L2Normalization"); + return kTfLiteError; + } + break; + case tflite::BuiltinOperator_HASHTABLE_LOOKUP: + if (interpreter->tensor(node.outputs->data[0])->type != + kTfLiteFloat32) { + logError("NNAPI only support HASHTABLE_LOOKUP with float32 output", + builtin); + return kTfLiteError; + } + nn_op_type = ANEURALNETWORKS_HASHTABLE_LOOKUP; break; case tflite::BuiltinOperator_STRIDED_SLICE: 
add_strided_slice_params(node.builtin_data); @@ -557,10 +707,6 @@ void AddOpsAndParams(tflite::Interpreter* interpreter, augmented_inputs.data(), static_cast<uint32_t>(node.outputs->size), reinterpret_cast<uint32_t*>(node.outputs->data))); continue; - case tflite::BuiltinOperator_TRANSPOSE: - nn_op_type = ANEURALNETWORKS_TRANSPOSE; - // param is almost same as reshape - break; case tflite::BuiltinOperator_NEG: CHECK_NN(ANeuralNetworksModel_addOperationEx( nn_model, ANEURALNETWORKS_NEG_EX, @@ -582,28 +728,71 @@ void AddOpsAndParams(tflite::Interpreter* interpreter, case tflite::BuiltinOperator_EMBEDDING_LOOKUP_SPARSE: case tflite::BuiltinOperator_BIDIRECTIONAL_SEQUENCE_LSTM: case tflite::BuiltinOperator_UNIDIRECTIONAL_SEQUENCE_LSTM: + //case tflite::BuiltinOperator_LOCAL_RESPONSE_NORMALIZATION: case tflite::BuiltinOperator_PADV2: + //case tflite::BuiltinOperator_RESIZE_BILINEAR: case tflite::BuiltinOperator_CALL: case tflite::BuiltinOperator_SKIP_GRAM: + //case tflite::BuiltinOperator_RELU_N1_TO_1: + //case tflite::BuiltinOperator_GATHER: + //case tflite::BuiltinOperator_SPACE_TO_BATCH_ND: + //case tflite::BuiltinOperator_BATCH_TO_SPACE_ND: + //case tflite::BuiltinOperator_TOPK_V2: + //case tflite::BuiltinOperator_SPLIT: + //case tflite::BuiltinOperator_STRIDED_SLICE: + //case tflite::BuiltinOperator_EXP: case tflite::BuiltinOperator_LOG_SOFTMAX: + //case tflite::BuiltinOperator_DEQUANTIZE: case tflite::BuiltinOperator_DELEGATE: + //case tflite::BuiltinOperator_CAST: case tflite::BuiltinOperator_PRELU: case tflite::BuiltinOperator_MAXIMUM: case tflite::BuiltinOperator_MINIMUM: case tflite::BuiltinOperator_ARG_MAX: +#ifndef OBS_BUILD + case tflite::BuiltinOperator_ARG_MIN: +#endif case tflite::BuiltinOperator_GREATER: case tflite::BuiltinOperator_GREATER_EQUAL: case tflite::BuiltinOperator_LESS: case tflite::BuiltinOperator_LESS_EQUAL: + //case tflite::BuiltinOperator_NEG: case tflite::BuiltinOperator_SELECT: case tflite::BuiltinOperator_SLICE: case 
tflite::BuiltinOperator_SIN: + //case tflite::BuiltinOperator_LOG: case tflite::BuiltinOperator_TRANSPOSE_CONV: +#ifndef OBS_BUILD + case tflite::BuiltinOperator_TILE: + case tflite::BuiltinOperator_EXPAND_DIMS: case tflite::BuiltinOperator_SPARSE_TO_DENSE: - FATAL("Op code %d is currently not delegated to NNAPI", builtin); - nn_op_type = -1; // set to invalid + case tflite::BuiltinOperator_EQUAL: + case tflite::BuiltinOperator_NOT_EQUAL: + case tflite::BuiltinOperator_SUM: + case tflite::BuiltinOperator_REDUCE_MAX: + case tflite::BuiltinOperator_REDUCE_MIN: + case tflite::BuiltinOperator_REDUCE_PROD: + case tflite::BuiltinOperator_SQRT: + case tflite::BuiltinOperator_RSQRT: + case tflite::BuiltinOperator_SHAPE: + case tflite::BuiltinOperator_POW: + case tflite::BuiltinOperator_FAKE_QUANT: + case tflite::BuiltinOperator_PACK: + case tflite::BuiltinOperator_LOGICAL_OR: + case tflite::BuiltinOperator_ONE_HOT: + case tflite::BuiltinOperator_LOGICAL_AND: + case tflite::BuiltinOperator_LOGICAL_NOT: + case tflite::BuiltinOperator_UNPACK: + case tflite::BuiltinOperator_FLOOR_DIV: + case tflite::BuiltinOperator_REDUCE_ANY: + case tflite::BuiltinOperator_SQUARE: + case tflite::BuiltinOperator_ZEROS_LIKE: + case tflite::BuiltinOperator_FILL: +#endif + logError("Op code %d is currently not delegated to NNAPI", builtin); + return kTfLiteError; break; - case tflite::BuiltinOperator_CUSTOM: + case tflite::BuiltinOperator_CUSTOM: { std::string custom_name(registration.custom_name); if (custom_name.compare("TensorFlowMax") == 0) { CHECK_NN(ANeuralNetworksModel_addOperationEx( @@ -650,64 +839,99 @@ void AddOpsAndParams(tflite::Interpreter* interpreter, reinterpret_cast<uint32_t*>(node.outputs->data))); continue; } - - FATAL("Custom operations are not supported when using NNAPI."); - nn_op_type = -1; // set to invalid + logError("Custom operations are not supported when using NNAPI."); + return kTfLiteError; + break; + } +#ifdef OBS_BUILD + default: + logError("Op code %d is 
currently not delegated to NNAPI", builtin); + return kTfLiteError; break; +#endif } - //if (nnapi_version == 11 && kAndroidSdkVersion < 28) { + //if (nnapi_version == 11 && GetAndroidSdkVersionCached() < 28) { // FATAL("Op %d needs NNAPI1.1", builtin); //} // Add the operation. - CHECK_NN(ANeuralNetworksModel_addOperation( + RETURN_ERROR_IF_NN_FAILED(ANeuralNetworksModel_addOperation( nn_model, nn_op_type, static_cast<uint32_t>(augmented_inputs.size()), augmented_inputs.data(), static_cast<uint32_t>(augmented_outputs.size()), reinterpret_cast<uint32_t*>(augmented_outputs.data()))); } + return kTfLiteOk; } TfLiteStatus NNAPIDelegate::BuildGraph(::tflite::Interpreter* interpreter) { - // TODO(aselle): This is not correct. need to handle resize invalidation. - if (nn_model_ && nn_compiled_model_) return kTfLiteOk; + if (nn_model_ && nn_compiled_model_) return model_status_; + // TODO(aselle): This is not correct. need to handle resize invalidation. if (!nn_model_) { CHECK_NN(ANeuralNetworksModel_create(&nn_model_)); - // Find all the temporary tensors and put them in a skip_list. - std::vector<uint32_t> skip_list; + // Find which tensors should be added to NNAPI. TFLite has temporaries + // and RNN back-edges which are are not valid for NNAPI. We look through all + // inputs and outputs and mark the mapping in tensor_id_to_nnapi_id with + // kOperandIdNotSet. addTensorOperands will replace those with the + // corresponding NNAPI operand ids and skip kOperandNotNeeded entries. 
+ std::vector<int64_t> tensor_id_to_nnapi_id(interpreter->tensors_size(), + kOperandNotNeeded); + auto set_ids_to_not_set = [&tensor_id_to_nnapi_id](const int* buf, + size_t count) { + for (int j = 0; j < count; j++) { + auto tensor_id = buf[j]; + if (tensor_id != kOptionalTensor) { + tensor_id_to_nnapi_id[tensor_id] = kOperandIdNotSet; + } + } + }; for (size_t i = 0; i < interpreter->nodes_size(); i++) { const auto* node_and_registration = interpreter->node_and_registration(i); const TfLiteNode& node = node_and_registration->first; - if (node.temporaries != nullptr) { - for (int j = 0; j < node.temporaries->size; j++) { - skip_list.push_back(static_cast<uint32_t>(node.temporaries->data[j])); - } - } + set_ids_to_not_set(node.inputs->data, node.inputs->size); + set_ids_to_not_set(node.outputs->data, node.outputs->size); } - - uint32_t next_id = addTensorOperands(interpreter, nn_model_, skip_list); - AddOpsAndParams(interpreter, nn_model_, next_id, &model_states_inputs_, - &model_states_outputs_); - - std::vector<int> augmented_inputs = interpreter->inputs(); - std::vector<int> augmented_outputs = interpreter->outputs(); - - // All state tensors input/output need to be treated as model input/output. 
+ set_ids_to_not_set(interpreter->inputs().data(), + interpreter->inputs().size()); + set_ids_to_not_set(interpreter->outputs().data(), + interpreter->outputs().size()); + + uint32_t next_id = 0; + RETURN_ERROR_IF_TFLITE_FAILED(addTensorOperands( + interpreter, nn_model_, &next_id, &tensor_id_to_nnapi_id)); + RETURN_ERROR_IF_TFLITE_FAILED( + AddOpsAndParams(interpreter, nn_model_, next_id, &model_states_inputs_, + &model_states_outputs_, tensor_id_to_nnapi_id)); + + std::vector<uint32_t> augmented_inputs; + MapAndAddTensorIds(interpreter->inputs().data(), + interpreter->inputs().size(), &augmented_inputs, + tensor_id_to_nnapi_id); augmented_inputs.insert(augmented_inputs.end(), model_states_inputs_.begin(), model_states_inputs_.end()); - augmented_outputs.insert(augmented_outputs.end(), - model_states_outputs_.begin(), - model_states_outputs_.end()); + std::vector<uint32_t> augmented_outputs; + MapAndAddTensorIds(interpreter->outputs().data(), + interpreter->outputs().size(), &augmented_outputs, + tensor_id_to_nnapi_id); + MapAndAddTensorIds(model_states_outputs_.data(), + model_states_outputs_.size(), &augmented_outputs, + tensor_id_to_nnapi_id); CHECK_NN(ANeuralNetworksModel_identifyInputsAndOutputs( nn_model_, static_cast<uint32_t>(augmented_inputs.size()), reinterpret_cast<const uint32_t*>(augmented_inputs.data()), static_cast<uint32_t>(augmented_outputs.size()), reinterpret_cast<const uint32_t*>(augmented_outputs.data()))); + + // TODO Support ANeuralNetworksModel_relaxComputationFloat32toFloat16 + //if (GetAndroidSdkVersionCached() >= 28) { + // CHECK_NN(ANeuralNetworksModel_relaxComputationFloat32toFloat16( + // nn_model_, interpreter->GetAllowFp16PrecisionForFp32())); + //} CHECK_NN(ANeuralNetworksModel_finish(nn_model_)); } if (!nn_compiled_model_) { @@ -719,7 +943,13 @@ TfLiteStatus NNAPIDelegate::BuildGraph(::tflite::Interpreter* interpreter) { TfLiteStatus NNAPIDelegate::Invoke(::tflite::Interpreter* interpreter) { if (!nn_model_) { - 
TF_LITE_ENSURE_STATUS(BuildGraph(interpreter)); + model_status_ = BuildGraph(interpreter); + if (model_status_ != kTfLiteOk) { + logError("Failed to build graph for NNAPI"); + } + } + if (model_status_ != kTfLiteOk) { + return model_status_; } ANeuralNetworksExecution* execution = nullptr; @@ -783,6 +1013,8 @@ TfLiteStatus NNAPIDelegate::Invoke(::tflite::Interpreter* interpreter) { return kTfLiteOk; } +bool NNAPIDelegate::IsSupported() { return nnfw::NNAPIExists(); } + } // namespace nnfw // clang-format on diff --git a/runtimes/neurun/src/kernel/cpu/AvgPoolLayer.cc b/runtimes/neurun/src/kernel/cpu/AvgPoolLayer.cc index 2a6a84e10..f434a6dec 100644 --- a/runtimes/neurun/src/kernel/cpu/AvgPoolLayer.cc +++ b/runtimes/neurun/src/kernel/cpu/AvgPoolLayer.cc @@ -27,14 +27,14 @@ namespace kernel namespace cpu { -#define AVGPOOLING_PARAMETERS \ - uint32_t height = getSizeOfDimension(_inputShape, 1); \ - uint32_t width = getSizeOfDimension(_inputShape, 2); \ - uint32_t outHeight = getSizeOfDimension(_outputShape, 1); \ - uint32_t outWidth = getSizeOfDimension(_outputShape, 2); \ - \ - uint32_t paddingHeight = (uint32_t)_paddingTop; \ - uint32_t paddingWidth = (uint32_t)_paddingLeft; +#define AVGPOOLING_PARAMETERS \ + tflite::PoolParams op_params; \ + op_params.stride_height = _strideHeight; \ + op_params.stride_width = _strideWidth; \ + op_params.filter_height = _kernelHeight; \ + op_params.filter_width = _kernelWidth; \ + op_params.padding_values.height = (int8_t)_paddingTop; \ + op_params.padding_values.width = (int8_t)_paddingLeft; AvgPoolLayer::AvgPoolLayer() : _inputData(nullptr), _outputData(nullptr), _inputShape(), _outputShape(), _paddingLeft(0), @@ -47,31 +47,31 @@ AvgPoolLayer::AvgPoolLayer() bool AvgPoolLayer::averagePoolFloat32() { - AVGPOOLING_PARAMETERS float output_activation_min, output_activation_max; CalculateActivationRangeFloat(_activation, &output_activation_min, &output_activation_max); + op_params.float_activation_min = output_activation_min; + 
op_params.float_activation_max = output_activation_max; - ::tflite::optimized_ops::AveragePool( - reinterpret_cast<const float *>(_inputData), convertShapeToDims(_inputShape), _strideWidth, - _strideHeight, paddingWidth, paddingHeight, _kernelWidth, _kernelHeight, - output_activation_min, output_activation_max, reinterpret_cast<float *>(_outputData), - convertShapeToDims(_outputShape)); + ::tflite::optimized_ops::AveragePool(op_params, convertShapeToTFLiteShape(_inputShape), + reinterpret_cast<const float *>(_inputData), + convertShapeToTFLiteShape(_outputShape), + reinterpret_cast<float *>(_outputData)); return true; } bool AvgPoolLayer::averagePoolQuant8() { - AVGPOOLING_PARAMETERS int32_t output_activation_min = 0; int32_t output_activation_max = 0; CalculateActivationRangeUint8(_activation, _outputShape, &output_activation_min, &output_activation_max); + op_params.quantized_activation_min = output_activation_min; + op_params.quantized_activation_max = output_activation_max; - ::tflite::optimized_ops::AveragePool(_inputData, convertShapeToDims(_inputShape), _strideWidth, - _strideHeight, paddingWidth, paddingHeight, _kernelWidth, - _kernelHeight, output_activation_min, output_activation_max, - _outputData, convertShapeToDims(_outputShape)); + ::tflite::optimized_ops::AveragePool(op_params, convertShapeToTFLiteShape(_inputShape), + _inputData, convertShapeToTFLiteShape(_outputShape), + _outputData); return true; } diff --git a/runtimes/neurun/src/kernel/cpu/ConcatLayer.cc b/runtimes/neurun/src/kernel/cpu/ConcatLayer.cc index 5fe5e3993..be093b437 100644 --- a/runtimes/neurun/src/kernel/cpu/ConcatLayer.cc +++ b/runtimes/neurun/src/kernel/cpu/ConcatLayer.cc @@ -24,6 +24,7 @@ namespace neurun { namespace kernel { + namespace cpu { @@ -36,13 +37,21 @@ ConcatLayer::ConcatLayer() bool ConcatLayer::concatenationFloat32() { - int num_inputs = _inputShapes.size(); - std::vector<::tflite::Dims<4> *> inputDimsPtr(num_inputs); - std::vector<::tflite::Dims<4>> 
inputDims(num_inputs); - for (int i = 0; i < num_inputs; i++) + uint32_t num_inputs = _inputShapes.size(); + + tflite::ConcatenationParams op_params; + op_params.axis = _axis; + op_params.inputs_count = num_inputs; + + std::vector<::tflite::RuntimeShape *> inputDimsPtr; + std::vector<::tflite::RuntimeShape> inputDims; + inputDimsPtr.reserve(num_inputs); + inputDims.reserve(num_inputs); + + for (uint32_t i = 0; i < num_inputs; i++) { - inputDims[i] = convertShapeToDims(_inputShapes[i]); - inputDimsPtr[i] = &inputDims[i]; + inputDims.push_back(convertShapeToTFLiteShape(_inputShapes[i])); + inputDimsPtr.push_back(&inputDims[i]); } std::vector<const float *> inputFloatPtrs; @@ -52,24 +61,44 @@ bool ConcatLayer::concatenationFloat32() inputFloatPtrs.emplace_back(reinterpret_cast<const float *>(ptr)); } - ::tflite::optimized_ops::Concatenation<::tflite::FusedActivationFunctionType::kNone, float>( - getNumberOfDimensions(_outputShape) - _axis - 1, inputFloatPtrs.data(), inputDimsPtr.data(), - num_inputs, reinterpret_cast<float *>(_outputData), convertShapeToDims(_outputShape)); + ::tflite::optimized_ops::Concatenation<float>( + op_params, inputDimsPtr.data(), inputFloatPtrs.data(), + convertShapeToTFLiteShape(_outputShape), reinterpret_cast<float *>(_outputData)); return true; } bool ConcatLayer::concatenationQuant8() { int num_inputs = _inputShapes.size(); - std::vector<::tflite::Dims<4> *> inputDimsPtr(num_inputs); - std::vector<::tflite::Dims<4>> inputDims(num_inputs); - for (int i = 0; i < num_inputs; i++) + + std::vector<int32_t> input_zeropoints(num_inputs); + std::vector<float> input_scales(num_inputs); + for (uint32_t i = 0; i < num_inputs; i++) { - inputDims[i] = convertShapeToDims(_inputShapes[i]); - inputDimsPtr[i] = &inputDims[i]; + input_zeropoints[i] = _inputShapes[i].offset; + input_scales[i] = _inputShapes[i].scale; } - ::tflite::optimized_ops::Concatenation<::tflite::FusedActivationFunctionType::kNone, uint8_t>( - getNumberOfDimensions(_outputShape) - 
_axis - 1, _inputDataPtrs.data(), inputDimsPtr.data(), - num_inputs, _outputData, convertShapeToDims(_outputShape)); + + tflite::ConcatenationParams op_params; + op_params.axis = _axis; + op_params.inputs_count = num_inputs; + op_params.input_zeropoint = input_zeropoints.data(); + op_params.input_scale = input_scales.data(); + op_params.output_zeropoint = _outputShape.offset; + op_params.output_scale = _outputShape.scale; + + std::vector<::tflite::RuntimeShape *> inputDimsPtr; + std::vector<::tflite::RuntimeShape> inputDims; + inputDimsPtr.reserve(num_inputs); + inputDims.reserve(num_inputs); + for (uint32_t i = 0; i < num_inputs; i++) + { + inputDims.push_back(convertShapeToTFLiteShape(_inputShapes[i])); + inputDimsPtr.push_back(&inputDims[i]); + } + + ::tflite::optimized_ops::Concatenation<uint8_t>( + op_params, inputDimsPtr.data(), _inputDataPtrs.data(), + convertShapeToTFLiteShape(_outputShape), _outputData); return true; } diff --git a/runtimes/neurun/src/kernel/cpu/ConvolutionLayer.cc b/runtimes/neurun/src/kernel/cpu/ConvolutionLayer.cc index 81e88e0f0..c694fa75f 100644 --- a/runtimes/neurun/src/kernel/cpu/ConvolutionLayer.cc +++ b/runtimes/neurun/src/kernel/cpu/ConvolutionLayer.cc @@ -33,55 +33,51 @@ static constexpr int kStaticBufferSize = 1605632; static char static_scratch_buffer[kStaticBufferSize]; static std::mutex executionMutex; -#define ANDROID_NN_CONV_PARAMETERS(Type) \ - uint32_t height = getSizeOfDimension(_inputShape, 1); \ - uint32_t width = getSizeOfDimension(_inputShape, 2); \ - uint32_t kernelHeight = getSizeOfDimension(_kernelShape, 1); \ - uint32_t kernelWidth = getSizeOfDimension(_kernelShape, 2); \ - uint32_t outHeight = getSizeOfDimension(_outputShape, 1); \ - uint32_t outWidth = getSizeOfDimension(_outputShape, 2); \ - uint32_t inDepth = getSizeOfDimension(_inputShape, 3); \ - \ - uint32_t paddingHeight = (uint32_t)_paddingTop; \ - uint32_t paddingWidth = (uint32_t)_paddingLeft; \ - \ - ::tflite::Dims<4> im2colDim; \ - 
im2colDim.sizes[3] = (int)getSizeOfDimension(_outputShape, 0); \ - im2colDim.sizes[2] = (int)getSizeOfDimension(_outputShape, 1); \ - im2colDim.sizes[1] = (int)getSizeOfDimension(_outputShape, 2); \ - im2colDim.sizes[0] = (int)inDepth * kernelHeight * kernelWidth; \ - \ - im2colDim.strides[0] = 1; \ - for (int i = 1; i < 4; i++) \ - { \ - im2colDim.strides[i] = im2colDim.strides[i - 1] * im2colDim.sizes[i - 1]; \ - } \ - Type *im2colData = nullptr; \ - uint64_t im2colByteSize = sizeof(Type); \ - std::unique_ptr<Type[]> im2colGuard; \ - for (int i = 0; i < 4; i++) \ - { \ - im2colByteSize *= im2colDim.sizes[i]; \ - } \ - /* http://b/77982879, tflite::optimized_ops::Conv uses int for offsets */ \ - if (im2colByteSize >= 0x7fffffff) \ - { \ - std::cout << "Conv size is too large, not enough memory" << std::endl; \ - return false; \ - } \ - if (im2colByteSize <= kStaticBufferSize) \ - { \ - im2colData = reinterpret_cast<Type *>(static_scratch_buffer); \ - } \ - else \ - { \ - im2colData = new (std::nothrow) Type[im2colByteSize / sizeof(Type)]; \ - if (im2colData == nullptr) \ - { \ - std::cout << "Conv size is too large, not enough memory" << std::endl; \ - return false; \ - } \ - im2colGuard.reset(im2colData); \ +#define ANDROID_NN_CONV_PARAMETERS(Type) \ + uint32_t height = getSizeOfDimension(_inputShape, 1); \ + uint32_t width = getSizeOfDimension(_inputShape, 2); \ + uint32_t kernelHeight = getSizeOfDimension(_kernelShape, 1); \ + uint32_t kernelWidth = getSizeOfDimension(_kernelShape, 2); \ + uint32_t outHeight = getSizeOfDimension(_outputShape, 1); \ + uint32_t outWidth = getSizeOfDimension(_outputShape, 2); \ + uint32_t inDepth = getSizeOfDimension(_inputShape, 3); \ + \ + uint32_t paddingHeight = (uint32_t)_paddingTop; \ + uint32_t paddingWidth = (uint32_t)_paddingLeft; \ + \ + Shape im2colShape; \ + im2colShape.dimensions.resize(4); \ + im2colShape.dimensions[0] = getSizeOfDimension(_outputShape, 0); \ + im2colShape.dimensions[1] = 
getSizeOfDimension(_outputShape, 1); \ + im2colShape.dimensions[2] = getSizeOfDimension(_outputShape, 2); \ + im2colShape.dimensions[3] = inDepth * kernelHeight * kernelWidth; \ + \ + Type *im2colData = nullptr; \ + uint64_t im2colByteSize = sizeof(Type); \ + std::unique_ptr<Type[]> im2colGuard; \ + for (int i = 0; i < 4; i++) \ + { \ + im2colByteSize *= im2colShape.dimensions[i]; \ + } \ + /* http://b/77982879, tflite::optimized_ops::Conv uses int for offsets */ \ + if (im2colByteSize >= 0x7fffffff) \ + { \ + std::cout << "Conv size is too large, not enough memory" << std::endl; \ + return false; \ + } \ + if (im2colByteSize <= kStaticBufferSize) \ + { \ + im2colData = reinterpret_cast<Type *>(static_scratch_buffer); \ + } \ + else \ + { \ + im2colData = new (std::nothrow) Type[im2colByteSize / sizeof(Type)]; \ + if (im2colData == nullptr) \ + { \ + std::cout << "Conv size is too large, not enough memory" << std::endl; \ + return false; \ + } \ + im2colGuard.reset(im2colData); \ } ConvolutionLayer::ConvolutionLayer() @@ -112,19 +108,32 @@ bool ConvolutionLayer::convFloat32() float output_activation_min, output_activation_max; CalculateActivationRangeFloat(_activation, &output_activation_min, &output_activation_max); int32_t dilationWidthFactor = 1, dilationHeightFactor = 1; + + ::tflite::ConvParams op_params; + op_params.padding_type = ::tflite::PaddingType::kSame; + op_params.padding_values.width = paddingWidth; + op_params.padding_values.height = paddingHeight; + op_params.stride_width = _strideWidth; + op_params.stride_height = _strideHeight; + op_params.dilation_width_factor = dilationWidthFactor; + op_params.dilation_height_factor = dilationHeightFactor; + op_params.float_activation_min = output_activation_min; + op_params.float_activation_max = output_activation_max; + ::tflite::optimized_ops::Conv( - reinterpret_cast<const float *>(_inputData), convertShapeToDims(_inputShape), - reinterpret_cast<const float *>(_kernelData), convertShapeToDims(_kernelShape), 
- reinterpret_cast<const float *>(_biasData), convertShapeToDims(_biasShape), _strideWidth, - _strideHeight, dilationWidthFactor, dilationHeightFactor, paddingWidth, paddingHeight, - output_activation_min, output_activation_max, reinterpret_cast<float *>(_outputData), - convertShapeToDims(_outputShape), im2colDataToPass, im2colDim); + op_params, convertShapeToTFLiteShape(_inputShape), + reinterpret_cast<const float *>(_inputData), convertShapeToTFLiteShape(_kernelShape), + reinterpret_cast<const float *>(_kernelData), convertShapeToTFLiteShape(_biasShape), + reinterpret_cast<const float *>(_biasData), convertShapeToTFLiteShape(_outputShape), + reinterpret_cast<float *>(_outputData), convertShapeToTFLiteShape(im2colShape), + im2colDataToPass); return true; } bool ConvolutionLayer::convQuant8() { ANDROID_NN_CONV_PARAMETERS(uint8_t) + int32_t inputOffset = -_inputShape.offset; int32_t kernelOffset = -_kernelShape.offset; int32_t outputOffset = _outputShape.offset; @@ -141,6 +150,24 @@ bool ConvolutionLayer::convQuant8() } CalculateActivationRangeUint8(_activation, _outputShape, &output_activation_min, &output_activation_max); + int32_t dilationWidthFactor = 1, dilationHeightFactor = 1; + + ::tflite::ConvParams op_params; + op_params.padding_type = ::tflite::PaddingType::kSame; + op_params.padding_values.width = paddingWidth; + op_params.padding_values.height = paddingHeight; + op_params.stride_width = _strideWidth; + op_params.stride_height = _strideHeight; + op_params.dilation_width_factor = dilationWidthFactor; + op_params.dilation_height_factor = dilationHeightFactor; + op_params.input_offset = inputOffset; + op_params.weights_offset = kernelOffset; + op_params.output_offset = outputOffset; + op_params.output_multiplier = output_multiplier; + op_params.output_shift = output_shift; + op_params.quantized_activation_min = output_activation_min; + op_params.quantized_activation_max = output_activation_max; + static gemmlowp::GemmContext gemm_context; // Prevent 
concurrent executions that may access the scratch buffer and // gemm_context. @@ -148,11 +175,10 @@ bool ConvolutionLayer::convQuant8() // Alow gemmlowp automatically decide how many threads to use. gemm_context.set_max_num_threads(0); ::tflite::optimized_ops::Conv( - _inputData, convertShapeToDims(_inputShape), inputOffset, _kernelData, - convertShapeToDims(_kernelShape), kernelOffset, reinterpret_cast<const int32_t *>(_biasData), - convertShapeToDims(_biasShape), _strideWidth, _strideHeight, paddingWidth, paddingHeight, - outputOffset, output_multiplier, output_shift, output_activation_min, output_activation_max, - _outputData, convertShapeToDims(_outputShape), im2colData, im2colDim, &gemm_context); + op_params, convertShapeToTFLiteShape(_inputShape), _inputData, + convertShapeToTFLiteShape(_kernelShape), _kernelData, convertShapeToTFLiteShape(_biasShape), + reinterpret_cast<const int32_t *>(_biasData), convertShapeToTFLiteShape(_outputShape), + _outputData, convertShapeToTFLiteShape(im2colShape), im2colData, &gemm_context); return true; } diff --git a/runtimes/neurun/src/kernel/cpu/FullyConnectedLayer.cc b/runtimes/neurun/src/kernel/cpu/FullyConnectedLayer.cc index 41b9afc0c..abe82db5e 100644 --- a/runtimes/neurun/src/kernel/cpu/FullyConnectedLayer.cc +++ b/runtimes/neurun/src/kernel/cpu/FullyConnectedLayer.cc @@ -44,64 +44,39 @@ FullyConnectedLayer::FullyConnectedLayer() static std::mutex executionMutex; bool FullyConnectedLayer::fullyConnectedFloat32() { - float output_activation_min, output_activation_max; - CalculateActivationRangeFloat(_activation, &output_activation_min, &output_activation_max); - // b/80425683, optimized implementation produces incorrect results when the - // number of input elements is the squre of batch_size. 
- uint32_t batch_size = getSizeOfDimension(_outputShape, 0); - uint32_t input_n_elements = getNumberOfElements(_inputShape); - if (batch_size * batch_size == input_n_elements) + int total_input_size = 1; + for (int i = 0; i < _inputShape.dimensions.size(); i++) { - ::tflite::reference_ops::FullyConnected( - reinterpret_cast<const float *>(_inputData), convertShapeToDims(_inputShape), - reinterpret_cast<const float *>(_weightsData), convertShapeToDims(_weightsShape), - reinterpret_cast<const float *>(_biasData), convertShapeToDims(_biasShape), - output_activation_min, output_activation_max, reinterpret_cast<float *>(_outputData), - convertShapeToDims(_outputShape)); - } - else - { - ::tflite::optimized_ops::FullyConnected( - reinterpret_cast<const float *>(_inputData), convertShapeToDims(_inputShape), - reinterpret_cast<const float *>(_weightsData), convertShapeToDims(_weightsShape), - reinterpret_cast<const float *>(_biasData), convertShapeToDims(_biasShape), - output_activation_min, output_activation_max, reinterpret_cast<float *>(_outputData), - convertShapeToDims(_outputShape)); + total_input_size *= _inputShape.dimensions[i]; } + + int input_size = _weightsShape.dimensions[1]; + const int batch_size = total_input_size / input_size; + const int num_units = _weightsShape.dimensions[0]; + + TfLiteFusedActivation act = convertFusedActivation(_activation); + + ::tflite::tensor_utils::VectorBatchVectorAssign(reinterpret_cast<const float *>(_biasData), + num_units, batch_size, + reinterpret_cast<float *>(_outputData)); + + // Compute output += weight * input + ::tflite::tensor_utils::MatrixBatchVectorMultiplyAccumulate( + reinterpret_cast<const float *>(_weightsData), num_units, input_size, + reinterpret_cast<const float *>(_inputData), batch_size, + reinterpret_cast<float *>(_outputData), /*result_stride=*/1); + + // Apply activation function + ::tflite::tensor_utils::ApplyActivationToVector(reinterpret_cast<float *>(_outputData), + batch_size * num_units, act, + 
reinterpret_cast<float *>(_outputData)); + return true; } bool FullyConnectedLayer::fullyConnectedQuant8() { - int32_t inputOffset = -_inputShape.offset; - int32_t weightsOffset = -_weightsShape.offset; - int32_t outputOffset = _outputShape.offset; - float real_multiplier = 0.0; - int32_t output_multiplier = 0; - int32_t output_shift = 0; - int32_t output_activation_min = 0; - int32_t output_activation_max = 0; - // Caution : 'Convolution' can make misleading. It seems it is just math term. - if (!GetQuantizedConvolutionMultipler(_inputShape, _weightsShape, _biasShape, _outputShape, - &real_multiplier) || - !QuantizeMultiplierSmallerThanOne(real_multiplier, &output_multiplier, &output_shift)) - { - return false; - } - CalculateActivationRangeUint8(_activation, _outputShape, &output_activation_min, - &output_activation_max); - static gemmlowp::GemmContext gemm_context; - // Prevent concurrent executions that access gemm_context. - std::unique_lock<std::mutex> lock(executionMutex); - // Alow gemmlowp automatically decide how many threads to use. 
- gemm_context.set_max_num_threads(0); - ::tflite::optimized_ops::FullyConnected( - _inputData, convertShapeToDims(_inputShape), inputOffset, _weightsData, - convertShapeToDims(_weightsShape), weightsOffset, - reinterpret_cast<const int32_t *>(_biasData), convertShapeToDims(_biasShape), outputOffset, - output_multiplier, output_shift, output_activation_min, output_activation_max, _outputData, - convertShapeToDims(_outputShape), &gemm_context); - return true; + throw std::runtime_error{"FullyConnectedLayer : Not tested for TENSOR_QUANT8_ASYMM"}; } void FullyConnectedLayer::configure(uint8_t *inputData, const Shape inputShape, diff --git a/runtimes/neurun/src/kernel/cpu/MaxPoolLayer.cc b/runtimes/neurun/src/kernel/cpu/MaxPoolLayer.cc index 3d96bb401..c4a288b07 100644 --- a/runtimes/neurun/src/kernel/cpu/MaxPoolLayer.cc +++ b/runtimes/neurun/src/kernel/cpu/MaxPoolLayer.cc @@ -26,14 +26,14 @@ namespace kernel namespace cpu { -#define MAXPOOLING_PARAMETERS \ - uint32_t height = getSizeOfDimension(_inputShape, 1); \ - uint32_t width = getSizeOfDimension(_inputShape, 2); \ - uint32_t outHeight = getSizeOfDimension(_outputShape, 1); \ - uint32_t outWidth = getSizeOfDimension(_outputShape, 2); \ - \ - uint32_t paddingHeight = (uint32_t)_paddingTop; \ - uint32_t paddingWidth = (uint32_t)_paddingLeft; +#define MAXPOOLING_PARAMETERS \ + tflite::PoolParams op_params; \ + op_params.stride_height = _strideHeight; \ + op_params.stride_width = _strideWidth; \ + op_params.filter_height = _kernelHeight; \ + op_params.filter_width = _kernelWidth; \ + op_params.padding_values.height = (int8_t)_paddingTop; \ + op_params.padding_values.width = (int8_t)_paddingLeft; MaxPoolLayer::MaxPoolLayer() : _inputData(nullptr), _outputData(nullptr), _inputShape(), _outputShape(), _paddingLeft(0), @@ -46,31 +46,30 @@ MaxPoolLayer::MaxPoolLayer() bool MaxPoolLayer::maxPoolFloat32() { - MAXPOOLING_PARAMETERS float output_activation_min, output_activation_max; CalculateActivationRangeFloat(_activation, 
&output_activation_min, &output_activation_max); + op_params.float_activation_min = output_activation_min; + op_params.float_activation_max = output_activation_max; - ::tflite::optimized_ops::MaxPool( - reinterpret_cast<const float *>(_inputData), convertShapeToDims(_inputShape), _strideWidth, - _strideHeight, paddingWidth, paddingHeight, _kernelWidth, _kernelHeight, - output_activation_min, output_activation_max, reinterpret_cast<float *>(_outputData), - convertShapeToDims(_outputShape)); + ::tflite::optimized_ops::MaxPool(op_params, convertShapeToTFLiteShape(_inputShape), + reinterpret_cast<const float *>(_inputData), + convertShapeToTFLiteShape(_outputShape), + reinterpret_cast<float *>(_outputData)); return true; } bool MaxPoolLayer::maxPoolQuant8() { - MAXPOOLING_PARAMETERS int32_t output_activation_min = 0; int32_t output_activation_max = 0; CalculateActivationRangeUint8(_activation, _outputShape, &output_activation_min, &output_activation_max); + op_params.quantized_activation_min = output_activation_min; + op_params.quantized_activation_max = output_activation_max; - ::tflite::optimized_ops::MaxPool(_inputData, convertShapeToDims(_inputShape), _strideWidth, - _strideHeight, paddingWidth, paddingHeight, _kernelWidth, - _kernelHeight, output_activation_min, output_activation_max, - _outputData, convertShapeToDims(_outputShape)); + ::tflite::optimized_ops::MaxPool(op_params, convertShapeToTFLiteShape(_inputShape), _inputData, + convertShapeToTFLiteShape(_outputShape), _outputData); return true; } diff --git a/runtimes/neurun/src/kernel/cpu/OperationUtils.h b/runtimes/neurun/src/kernel/cpu/OperationUtils.h index 5914d04e3..066b1e573 100644 --- a/runtimes/neurun/src/kernel/cpu/OperationUtils.h +++ b/runtimes/neurun/src/kernel/cpu/OperationUtils.h @@ -23,7 +23,9 @@ #include <limits> #include <vector> +#include "tensorflow/contrib/lite/c/builtin_op_data.h" #include "tensorflow/contrib/lite/kernels/internal/types.h" +#include 
"tensorflow/contrib/lite/kernels/internal/tensor.h" #include "graph/operand/Object.h" #include "graph/operand/DataType.h" @@ -75,6 +77,51 @@ inline ::tflite::Dims<4> convertShapeToDims(const Shape &shape) return dims; } +inline ::tflite::RuntimeShape convertShapeToTFLiteShape(const Shape &shape) +{ + std::vector<int32_t> raw_shape; + raw_shape.resize(4); + + for (uint32_t i = 0; i < 4; ++i) + { + if (i >= shape.dimensions.size()) + { + raw_shape[i] = 1; + } + else + { + raw_shape[i] = shape.dimensions[i]; + } + } + + return ::tflite::GetTensorShape(raw_shape); +} + +inline TfLiteFusedActivation convertFusedActivation(FuseCode act) +{ + if (act == ANEURALNETWORKS_FUSED_NONE) + { + return kTfLiteActNone; + } + + if (act == ANEURALNETWORKS_FUSED_RELU) + { + return kTfLiteActRelu; + } + + if (act == ANEURALNETWORKS_FUSED_RELU1) + { + return kTfLiteActRelu1; + } + + if (act == ANEURALNETWORKS_FUSED_RELU6) + { + return kTfLiteActRelu6; + } + + return kTfLiteActNone; +} + __wur bool QuantizeMultiplierSmallerThanOne(double double_multiplier, int32_t *quantized_multiplier, int32_t *right_shift); diff --git a/runtimes/neurun/src/kernel/cpu/SoftMaxLayer.cc b/runtimes/neurun/src/kernel/cpu/SoftMaxLayer.cc index 4f5a69f2e..c998c65f6 100644 --- a/runtimes/neurun/src/kernel/cpu/SoftMaxLayer.cc +++ b/runtimes/neurun/src/kernel/cpu/SoftMaxLayer.cc @@ -33,45 +33,86 @@ SoftMaxLayer::SoftMaxLayer() // DO NOTHING } +// Performs softmax along the input of size (input_size * batch_size). +void Softmax(const float *in, const int input_size, const int batch_size, const float beta, + float *out) +{ + TF_LITE_ASSERT(input_size > 0); + + // For each batch + for (int b = 0; b < batch_size; b++) + { + // Find the max coeff. + float max_coeff = in[0]; + for (int i = 1; i < input_size; i++) + { + if (in[i] > max_coeff) + max_coeff = in[i]; + } + + // Compute the normalized sum of exps. 
+ float exp_sum = 0.0; + for (int i = 0; i < input_size; i++) + { + out[i] = std::exp((in[i] - max_coeff) * beta); + exp_sum += out[i]; + } + + // Divide by the sum of exps. + float reciprocal_sum_exp = 1.f / exp_sum; + for (int i = 0; i < input_size; i++) + { + out[i] *= reciprocal_sum_exp; + } + + // Advance in and out pointers for the next batch. + in += input_size; + out += input_size; + } +} + bool SoftMaxLayer::softmaxFloat32() { - ::tflite::Dims<4> dim; + Shape shapeIn4D; + if (getNumberOfDimensions(_inputShape) == 2) { uint32_t batch_size = getSizeOfDimension(_inputShape, 0); uint32_t input_size = getNumberOfElements(_inputShape) / batch_size; - Shape shapeIn4D; - shapeIn4D.dimensions = {batch_size, 1, 1, input_size}; - dim = convertShapeToDims(shapeIn4D); + Softmax(reinterpret_cast<const float *>(_inputData), input_size, batch_size, _beta, + reinterpret_cast<float *>(_outputData)); } else if (getNumberOfDimensions(_inputShape) == 4) { - dim = convertShapeToDims(_inputShape); + ::tflite::SoftmaxParams op_params; + op_params.beta = _beta; + ::tflite::optimized_ops::Softmax(op_params, convertShapeToTFLiteShape(_inputShape), + reinterpret_cast<const float *>(_inputData), + convertShapeToTFLiteShape(_outputShape), + reinterpret_cast<float *>(_outputData)); } else { std::cout << "only 2D and 4D tensors supported" << std::endl; return false; } - ::tflite::optimized_ops::Softmax(reinterpret_cast<const float *>(_inputData), dim, _beta, - reinterpret_cast<float *>(_outputData), dim); + return true; } bool SoftMaxLayer::softmaxQuant8() { - ::tflite::Dims<4> dim; + Shape shapeIn4D = _inputShape; + if (getNumberOfDimensions(_inputShape) == 2) { uint32_t batch_size = getSizeOfDimension(_inputShape, 0); uint32_t input_size = getNumberOfElements(_inputShape) / batch_size; - Shape shapeIn4D; shapeIn4D.dimensions = {batch_size, 1, 1, input_size}; - dim = convertShapeToDims(shapeIn4D); } else if (getNumberOfDimensions(_inputShape) == 4) { - dim = 
convertShapeToDims(_inputShape); + shapeIn4D = _inputShape; } else { @@ -94,8 +135,13 @@ bool SoftMaxLayer::softmaxQuant8() return false; } float diff_min = -1.0f * CalculateInputRadius(kScaledDiffIntegerBits, input_left_shift); - ::tflite::optimized_ops::Softmax(_inputData, dim, input_multiplier, input_left_shift, diff_min, - _outputData, dim); + + ::tflite::SoftmaxParams op_params; + op_params.input_multiplier = input_multiplier; + op_params.input_left_shift = input_left_shift; + op_params.diff_min = diff_min; + ::tflite::optimized_ops::Softmax(op_params, convertShapeToTFLiteShape(shapeIn4D), _inputData, + convertShapeToTFLiteShape(shapeIn4D), _outputData); return true; } |