author    | Chunseok Lee <chunseok.lee@samsung.com> | 2020-10-28 12:16:55 +0900
committer | Chunseok Lee <chunseok.lee@samsung.com> | 2020-10-28 12:16:55 +0900
commit    | c55f8a6db48cda9d3a78048338b7f18c4cca62b8 (patch)
tree      | 761ee8e171e5203f5c598ad93b2e7e0bc2e31aa2 /runtime
parent    | 74476a2d0296bdad70a2f7f90bc7419a8b05bffd (diff)
download  | nnfw-c55f8a6db48cda9d3a78048338b7f18c4cca62b8.tar.gz
          | nnfw-c55f8a6db48cda9d3a78048338b7f18c4cca62b8.tar.bz2
          | nnfw-c55f8a6db48cda9d3a78048338b7f18c4cca62b8.zip
Imported Upstream version 1.10.0 (refs: upstream/1.10.0, submit/tizen/20201028.104702, submit/tizen/20201028.031836, accepted/tizen/unified/20201029.124827)
Diffstat (limited to 'runtime')
220 files changed, 8212 insertions(+), 6602 deletions(-)
diff --git a/runtime/contrib/android/api/Android.mk b/runtime/contrib/android/api/Android.mk index a056eff9d..3c768cca5 100644 --- a/runtime/contrib/android/api/Android.mk +++ b/runtime/contrib/android/api/Android.mk @@ -4,7 +4,5 @@ include $(CLEAR_VARS) API_ROOT_PATH := $(LOCAL_PATH) PREBUILT_LIB := -include $(API_ROOT_PATH)/prebuilt/Android.mk +include $(API_ROOT_PATH)/Prebuilt.mk include $(API_ROOT_PATH)/src/main/native/Android.mk - -#$(warning $(PREBUILT_LIB)) diff --git a/runtime/contrib/android/api/Prebuilt.mk b/runtime/contrib/android/api/Prebuilt.mk new file mode 100644 index 000000000..7d9f56582 --- /dev/null +++ b/runtime/contrib/android/api/Prebuilt.mk @@ -0,0 +1,70 @@ +LOCAL_PATH := $(call my-dir) +include $(CLEAR_VARS) + +ifndef ONERT_PREBUILT_LIB_DIR +$(error ONERT_PREBUILT_LIB_DIR is not set) +endif + +# libcircle_loader +include $(CLEAR_VARS) +LOCAL_MODULE := circle_loader +PREBUILT_LIB += circle_loader +LOCAL_SRC_FILES := \ + $(ONERT_PREBUILT_LIB_DIR)/libcircle_loader.so +include $(PREBUILT_SHARED_LIBRARY) + +# libtflite_loader +include $(CLEAR_VARS) +LOCAL_MODULE := tflite_loader +PREBUILT_LIB += tflite_loader +LOCAL_SRC_FILES := \ + $(ONERT_PREBUILT_LIB_DIR)/libtflite_loader.so +include $(PREBUILT_SHARED_LIBRARY) + +# libtensorflowlite_jni +include $(CLEAR_VARS) +LOCAL_MODULE := tensorflowlite_jni +PREBUILT_LIB += tensorflowlite_jni +LOCAL_SRC_FILES := \ + $(ONERT_PREBUILT_LIB_DIR)/libtensorflowlite_jni.so +include $(PREBUILT_SHARED_LIBRARY) + +# libnnfw +include $(CLEAR_VARS) +LOCAL_MODULE := nnfw-dev +PREBUILT_LIB += nnfw-dev +LOCAL_SRC_FILES := \ + $(ONERT_PREBUILT_LIB_DIR)/libnnfw-dev.so +include $(PREBUILT_SHARED_LIBRARY) + +# libonert_core +include $(CLEAR_VARS) +LOCAL_MODULE := onert_core +PREBUILT_LIB += onert_core +LOCAL_SRC_FILES := \ + $(ONERT_PREBUILT_LIB_DIR)/libonert_core.so +include $(PREBUILT_SHARED_LIBRARY) + +# backend_cpu +include $(CLEAR_VARS) +LOCAL_MODULE := backend_cpu +PREBUILT_LIB += backend_cpu +LOCAL_SRC_FILES := \ + $(ONERT_PREBUILT_LIB_DIR)/libbackend_cpu.so +include $(PREBUILT_SHARED_LIBRARY) + +# TODO Support backend acl +# backend_acl +ifeq ($(ONERT_CONTAINS_ACL), 1) + $(error containing acl backend doesn't supported yet) +endif + +# backend_ext +ifneq ($(ONERT_EXT_PREBUILT_LIB), ) +include $(CLEAR_VARS) +LOCAL_MODULE := backend_ext +PREBUILT_LIB += backend_ext +LOCAL_SRC_FILES := \ + $(ONERT_EXT_PREBUILT_LIB) +include $(PREBUILT_SHARED_LIBRARY) +endif diff --git a/runtime/contrib/android/api/build.gradle b/runtime/contrib/android/api/build.gradle index def89eeac..afc53d936 100644 --- a/runtime/contrib/android/api/build.gradle +++ b/runtime/contrib/android/api/build.gradle @@ -8,11 +8,39 @@ android { minSdkVersion 26 targetSdkVersion 29 versionCode 1 - versionName "1.9.0" + versionName "1.10.0" externalNativeBuild { ndkBuild { - arguments "ONERT_API_INC_DIR=${project.projectDir}/../../../onert/api/include" + def onert_header_dir + if (project.hasProperty('onertHeaderDir')) + onert_header_dir = project.onertHeaderDir + else + onert_header_dir = "${project.projectDir}/../../../onert/api/include" + + def onert_lib_dir + if (project.hasProperty('onertLibDir')) + onert_lib_dir = project.onertLibDir + else + onert_lib_dir = "${project.projectDir}/../../../../Product/out/lib" + + def onert_contains_acl + if (project.hasProperty('onertContainsAcl')) + onert_contains_acl = 1 + else + onert_contains_acl = 0 + + def onert_ext_lib + if (project.hasProperty('onertExtLib')) + onert_ext_lib = project.onertExtLib + else + onert_ext_lib = "" + + arguments 
"ONERT_API_INC_DIR=$onert_header_dir", + "ONERT_PREBUILT_LIB_DIR=$onert_lib_dir", + "ONERT_CONTAINS_ACL=$onert_contains_acl", + "ONERT_EXT_PREBUILT_LIB=$onert_ext_lib" + abiFilters 'arm64-v8a' } } diff --git a/runtime/contrib/android/api/prebuilt/Android.mk b/runtime/contrib/android/api/prebuilt/Android.mk deleted file mode 100644 index e8a9f0755..000000000 --- a/runtime/contrib/android/api/prebuilt/Android.mk +++ /dev/null @@ -1,9 +0,0 @@ -LOCAL_PATH := $(call my-dir) -include $(CLEAR_VARS) -PREBUILT_PATH := $(LOCAL_PATH) -include $(PREBUILT_PATH)/backend_cpu/Android.mk -include $(PREBUILT_PATH)/circle_loader/Android.mk -include $(PREBUILT_PATH)/nnfw-dev/Android.mk -include $(PREBUILT_PATH)/onert_core/Android.mk -include $(PREBUILT_PATH)/tensorflowlite_jni/Android.mk -include $(PREBUILT_PATH)/tflite_loader/Android.mk diff --git a/runtime/contrib/android/api/prebuilt/backend_cpu/Android.mk b/runtime/contrib/android/api/prebuilt/backend_cpu/Android.mk deleted file mode 100644 index ccda9ea90..000000000 --- a/runtime/contrib/android/api/prebuilt/backend_cpu/Android.mk +++ /dev/null @@ -1,7 +0,0 @@ -LOCAL_PATH := $(call my-dir) -include $(CLEAR_VARS) -LOCAL_MODULE := backend_cpu -PREBUILT_LIB += backend_cpu -LOCAL_SRC_FILES := \ - libbackend_cpu.so -include $(PREBUILT_SHARED_LIBRARY) diff --git a/runtime/contrib/android/api/prebuilt/backend_cpu/libbackend_cpu.so b/runtime/contrib/android/api/prebuilt/backend_cpu/libbackend_cpu.so deleted file mode 120000 index 3d577cf5c..000000000 --- a/runtime/contrib/android/api/prebuilt/backend_cpu/libbackend_cpu.so +++ /dev/null @@ -1 +0,0 @@ -../../../../../../Product/out/lib/libbackend_cpu.so
\ No newline at end of file diff --git a/runtime/contrib/android/api/prebuilt/circle_loader/Android.mk b/runtime/contrib/android/api/prebuilt/circle_loader/Android.mk deleted file mode 100644 index 2e481e93e..000000000 --- a/runtime/contrib/android/api/prebuilt/circle_loader/Android.mk +++ /dev/null @@ -1,7 +0,0 @@ -LOCAL_PATH := $(call my-dir) -include $(CLEAR_VARS) -LOCAL_MODULE := circle_loader -PREBUILT_LIB += circle_loader -LOCAL_SRC_FILES := \ - libcircle_loader.so -include $(PREBUILT_SHARED_LIBRARY) diff --git a/runtime/contrib/android/api/prebuilt/circle_loader/libcircle_loader.so b/runtime/contrib/android/api/prebuilt/circle_loader/libcircle_loader.so deleted file mode 120000 index 528d7017f..000000000 --- a/runtime/contrib/android/api/prebuilt/circle_loader/libcircle_loader.so +++ /dev/null @@ -1 +0,0 @@ -../../../../../../Product/out/lib/libcircle_loader.so
\ No newline at end of file diff --git a/runtime/contrib/android/api/prebuilt/nnfw-dev/Android.mk b/runtime/contrib/android/api/prebuilt/nnfw-dev/Android.mk deleted file mode 100644 index 10cb8f6f4..000000000 --- a/runtime/contrib/android/api/prebuilt/nnfw-dev/Android.mk +++ /dev/null @@ -1,7 +0,0 @@ -LOCAL_PATH := $(call my-dir) -include $(CLEAR_VARS) -LOCAL_MODULE := nnfw-dev -PREBUILT_LIB += nnfw-dev -LOCAL_SRC_FILES := \ - libnnfw-dev.so -include $(PREBUILT_SHARED_LIBRARY) diff --git a/runtime/contrib/android/api/prebuilt/nnfw-dev/libnnfw-dev.so b/runtime/contrib/android/api/prebuilt/nnfw-dev/libnnfw-dev.so deleted file mode 120000 index 1913db8d7..000000000 --- a/runtime/contrib/android/api/prebuilt/nnfw-dev/libnnfw-dev.so +++ /dev/null @@ -1 +0,0 @@ -../../../../../../Product/out/lib/libnnfw-dev.so
\ No newline at end of file diff --git a/runtime/contrib/android/api/prebuilt/onert_core/Android.mk b/runtime/contrib/android/api/prebuilt/onert_core/Android.mk deleted file mode 100644 index a6682a24f..000000000 --- a/runtime/contrib/android/api/prebuilt/onert_core/Android.mk +++ /dev/null @@ -1,7 +0,0 @@ -LOCAL_PATH := $(call my-dir) -include $(CLEAR_VARS) -LOCAL_MODULE := onert_core -PREBUILT_LIB += onert_core -LOCAL_SRC_FILES := \ - libonert_core.so -include $(PREBUILT_SHARED_LIBRARY) diff --git a/runtime/contrib/android/api/prebuilt/onert_core/libonert_core.so b/runtime/contrib/android/api/prebuilt/onert_core/libonert_core.so deleted file mode 120000 index bafe11cb9..000000000 --- a/runtime/contrib/android/api/prebuilt/onert_core/libonert_core.so +++ /dev/null @@ -1 +0,0 @@ -../../../../../../Product/out/lib/libonert_core.so
\ No newline at end of file diff --git a/runtime/contrib/android/api/prebuilt/tensorflowlite_jni/Android.mk b/runtime/contrib/android/api/prebuilt/tensorflowlite_jni/Android.mk deleted file mode 100644 index 823cf0747..000000000 --- a/runtime/contrib/android/api/prebuilt/tensorflowlite_jni/Android.mk +++ /dev/null @@ -1,7 +0,0 @@ -LOCAL_PATH := $(call my-dir) -include $(CLEAR_VARS) -LOCAL_MODULE := tensorflowlite_jni -PREBUILT_LIB += tensorflowlite_jni -LOCAL_SRC_FILES := \ - libtensorflowlite_jni.so -include $(PREBUILT_SHARED_LIBRARY) diff --git a/runtime/contrib/android/api/prebuilt/tensorflowlite_jni/libtensorflowlite_jni.so b/runtime/contrib/android/api/prebuilt/tensorflowlite_jni/libtensorflowlite_jni.so deleted file mode 120000 index d3d72a5a7..000000000 --- a/runtime/contrib/android/api/prebuilt/tensorflowlite_jni/libtensorflowlite_jni.so +++ /dev/null @@ -1 +0,0 @@ -../../../../../../Product/out/lib/libtensorflowlite_jni.so
\ No newline at end of file diff --git a/runtime/contrib/android/api/prebuilt/tflite_loader/Android.mk b/runtime/contrib/android/api/prebuilt/tflite_loader/Android.mk deleted file mode 100644 index 135ac1dad..000000000 --- a/runtime/contrib/android/api/prebuilt/tflite_loader/Android.mk +++ /dev/null @@ -1,7 +0,0 @@ -LOCAL_PATH := $(call my-dir) -include $(CLEAR_VARS) -LOCAL_MODULE := tflite_loader -PREBUILT_LIB += tflite_loader -LOCAL_SRC_FILES := \ - libtflite_loader.so -include $(PREBUILT_SHARED_LIBRARY) diff --git a/runtime/contrib/android/api/prebuilt/tflite_loader/libtflite_loader.so b/runtime/contrib/android/api/prebuilt/tflite_loader/libtflite_loader.so deleted file mode 120000 index 4c001aec0..000000000 --- a/runtime/contrib/android/api/prebuilt/tflite_loader/libtflite_loader.so +++ /dev/null @@ -1 +0,0 @@ -../../../../../../Product/out/lib/libtflite_loader.so
\ No newline at end of file diff --git a/runtime/contrib/android/api/src/main/native/onert-native-api.cpp b/runtime/contrib/android/api/src/main/native/onert-native-api.cpp index 1644e0f7f..209264d31 100644 --- a/runtime/contrib/android/api/src/main/native/onert-native-api.cpp +++ b/runtime/contrib/android/api/src/main/native/onert-native-api.cpp @@ -121,8 +121,7 @@ JNIEXPORT jboolean JNICALL Java_com_samsung_onert_NativeSessionWrapper_nativeSet if (jni::setInput(handle, params) == false) { - __android_log_print(ANDROID_LOG_ERROR, JTAG, "%s] failed native setOutput", - __PRETTY_FUNCTION__); + __android_log_print(ANDROID_LOG_ERROR, JTAG, "%s] failed native setInput", __PRETTY_FUNCTION__); return JNI_FALSE; } diff --git a/runtime/contrib/android_benchmark_app/CMakeLists.txt b/runtime/contrib/android_benchmark_app/CMakeLists.txt index 55dbf0024..beb279cb9 100644 --- a/runtime/contrib/android_benchmark_app/CMakeLists.txt +++ b/runtime/contrib/android_benchmark_app/CMakeLists.txt @@ -55,7 +55,7 @@ target_link_libraries(android_benchmark_native nnfw_lib_tflite) target_link_libraries(android_benchmark_native nnfw_lib_misc) target_link_libraries(android_benchmark_native log) -nnas_find_package(FlatBuffersSource EXACT 1.11 REQUIRED) +nnas_find_package(FlatBuffersSource EXACT 1.12 REQUIRED) target_include_directories(android_benchmark_native PUBLIC ${FlatBuffersSource_DIR}/include .) add_custom_target(android-benchmark-apk ALL diff --git a/runtime/libs/ndarray/src/ContiguousSpan.cpp b/runtime/libs/benchmark/include/benchmark/MemoryInfo.h index e06cfc2a1..6e8e12ba4 100644 --- a/runtime/libs/ndarray/src/ContiguousSpan.cpp +++ b/runtime/libs/benchmark/include/benchmark/MemoryInfo.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,18 +14,27 @@ * limitations under the License. 
*/ -#include "ndarray/ContiguousSpan.h" +#ifndef __NNFW_BENCHMARK_MEMORY_INFO_H__ +#define __NNFW_BENCHMARK_MEMORY_INFO_H__ -namespace ndarray +#include <cstdint> +#include <string> + +namespace benchmark { -template class ContiguousSpan<float, true>; -template class ContiguousSpan<float, false>; -template class ContiguousSpan<int32_t, true>; -template class ContiguousSpan<int32_t, false>; -template class ContiguousSpan<uint32_t, true>; -template class ContiguousSpan<uint32_t, false>; -template class ContiguousSpan<uint8_t, true>; -template class ContiguousSpan<uint8_t, false>; +bool prepareVmRSS(); +bool prepareVmHWM(); +bool prepareGpuMemory(); +bool preparePssSum(); + +uint32_t getVmRSS(); +uint32_t getVmHWM(); +uint32_t getGpuMemory(const std::string &process_name); +uint32_t getPssSum(); + +std::string getProcessName(); + +} // namespace benchmark -} // namespace ndarray +#endif // __NNFW_BENCHMARK_MEMORY_INFO_H__ diff --git a/runtime/libs/benchmark/include/benchmark/MemoryPoller.h b/runtime/libs/benchmark/include/benchmark/MemoryPoller.h index 48caa3b3a..47db3fd77 100644 --- a/runtime/libs/benchmark/include/benchmark/MemoryPoller.h +++ b/runtime/libs/benchmark/include/benchmark/MemoryPoller.h @@ -57,10 +57,6 @@ public: private: void process(); bool prepareMemoryPolling(); - uint32_t getVmRSS(); - uint32_t getVmHWM(); - uint32_t getGpuMemory(); - uint32_t getPssSum(); private: std::chrono::milliseconds _duration; diff --git a/runtime/libs/benchmark/include/benchmark/Phases.h b/runtime/libs/benchmark/include/benchmark/Phases.h index 936a89742..7d642782a 100644 --- a/runtime/libs/benchmark/include/benchmark/Phases.h +++ b/runtime/libs/benchmark/include/benchmark/Phases.h @@ -50,6 +50,9 @@ public: const MemoryPoller &mem_poll() const { return *_mem_poll; } const Phase &at(const std::string &tag) const { return _phases.at(tag); } + uint32_t mem_before_init() const { return _mem_before_init; } + uint32_t mem_after_run() const { return _mem_after_run; } + private: void run(const std::string &tag, const PhaseFunc &exec, const PhaseFunc *post, uint32_t loop_num, bool option_disable); @@ -58,6 +61,8 @@ private: const PhaseOption _option; std::unordered_map<std::string, Phase> _phases; std::unique_ptr<MemoryPoller> _mem_poll; + uint32_t _mem_before_init; + uint32_t _mem_after_run; }; } // namespace benchmark diff --git a/runtime/libs/benchmark/include/benchmark/Result.h b/runtime/libs/benchmark/include/benchmark/Result.h index 69084b300..7604aa904 100644 --- a/runtime/libs/benchmark/include/benchmark/Result.h +++ b/runtime/libs/benchmark/include/benchmark/Result.h @@ -34,6 +34,8 @@ public: double time[PhaseEnum::END_OF_PHASE][FigureType::END_OF_FIG_TYPE]; uint32_t memory[PhaseEnum::END_OF_PHASE][MemoryType::END_OF_MEM_TYPE]; bool print_memory = false; + uint32_t init_memory = 0; + uint32_t peak_memory = 0; }; // TODO Support not only stdout but also ostream diff --git a/runtime/libs/benchmark/src/MemoryInfo.cpp b/runtime/libs/benchmark/src/MemoryInfo.cpp new file mode 100644 index 000000000..20d262961 --- /dev/null +++ b/runtime/libs/benchmark/src/MemoryInfo.cpp @@ -0,0 +1,169 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "benchmark/MemoryInfo.h" + +#include <vector> +#include <algorithm> +#include <fstream> +#include <sstream> +#include <cassert> +#include <sys/time.h> +#include <sys/resource.h> + +namespace +{ + +const std::string proc_status_path("/proc/self/status"); +const std::string gpu_memory_path("/sys/kernel/debug/mali0/gpu_memory"); +const std::string proc_smaps_path("/proc/self/smaps"); + +bool isStrNumber(const std::string &s) +{ + return !s.empty() && + std::find_if(s.begin(), s.end(), [](char c) { return !std::isdigit(c); }) == s.end(); +} + +std::vector<std::string> splitLine(std::string line, std::string delimiters = " \n\t") +{ + std::vector<std::string> words; + size_t prev = 0, pos; + + while ((pos = line.find_first_of(delimiters, prev)) != std::string::npos) + { + if (pos > prev) + words.emplace_back(line.substr(prev, pos - prev)); + prev = pos + 1; + } + + if (prev < line.length()) + words.emplace_back(line.substr(prev, std::string::npos)); + + return words; +} + +std::vector<std::string> getValueFromFileStatus(const std::string &file, const std::string &key) +{ + std::ifstream ifs(file); + assert(ifs.is_open()); + + std::string line; + std::vector<std::string> val; + + bool found = false; + while (std::getline(ifs, line)) + { + if (line.find(key) != std::string::npos) + { + found = true; + break; + } + } + ifs.close(); + + if (!found) + { + // NOTE. the process which uses gpu resources cannot be there yet at the model-load phase. + // At that time, just return empty. 
+ return val; + } + + val = splitLine(line); + return val; +} + +// Because of smaps' structure, returns sum value as uint32_t +uint32_t getSumValueFromFileSmaps(const std::string &file, const std::string &key) +{ + std::ifstream ifs(file); + assert(ifs.is_open()); + + std::string line; + uint32_t sum = 0; + while (std::getline(ifs, line)) + { + if (line.find(key) != std::string::npos) + { + // an example by splitLine() + // `Pss: 0 kB` + // val[0]: "Pss:", val[1]: "0" val[2]: "kB" + auto val = splitLine(line); + assert(val.size() != 0); + // SwapPss could show so that check where Pss is at the beginning + if (val[0].find("Pss") != 0) + { + continue; + } + sum += std::stoul(val[1]); + } + } + + return sum; +} + +} // namespace + +namespace benchmark +{ + +bool prepareVmRSS() { return std::ifstream(proc_status_path).is_open(); } + +bool prepareVmHWM() { return std::ifstream(proc_status_path).is_open(); } + +bool prepareGpuMemory() { return std::ifstream(gpu_memory_path).is_open(); } + +bool preparePssSum() { return std::ifstream(proc_smaps_path).is_open(); } + +uint32_t getVmRSS() +{ + auto val = getValueFromFileStatus(proc_status_path, "VmRSS"); + if (val.size() == 0) + return 0; + assert(isStrNumber(val[1])); + return std::stoul(val[1]); +} + +uint32_t getVmHWM() +{ + auto val = getValueFromFileStatus(proc_status_path, "VmHWM"); + if (val.size() == 0) + return 0; + // key: value + assert(isStrNumber(val[1])); + return std::stoul(val[1]); +} + +uint32_t getGpuMemory(const std::string &process_name) +{ + assert(!process_name.empty()); + auto val = getValueFromFileStatus(gpu_memory_path, process_name); + if (val.size() == 0) + return 0; + // process_name -> pid -> gpu_mem -> max_gpu_mem + assert(isStrNumber(val[2])); + return std::stoul(val[2]); +} + +uint32_t getPssSum() { return getSumValueFromFileSmaps(proc_smaps_path, "Pss"); } + +std::string getProcessName() +{ + auto val = getValueFromFileStatus(proc_status_path, "Name"); + assert(val.size() >= 2); + return val[1]; +} + +} // namespace benchmark diff --git a/runtime/libs/benchmark/src/MemoryPoller.cpp b/runtime/libs/benchmark/src/MemoryPoller.cpp index 61fdecd46..050b5b163 100644 --- a/runtime/libs/benchmark/src/MemoryPoller.cpp +++ b/runtime/libs/benchmark/src/MemoryPoller.cpp @@ -16,106 +16,13 @@ #include "benchmark/MemoryPoller.h" #include "benchmark/Types.h" +#include "benchmark/MemoryInfo.h" #include <vector> -#include <fstream> -#include <sstream> #include <stdexcept> #include <cassert> #include <iostream> -namespace -{ - -const std::string proc_status_path("/proc/self/status"); -const std::string gpu_memory_path("/sys/kernel/debug/mali0/gpu_memory"); -const std::string proc_smaps_path("/proc/self/smaps"); - -bool isStrNumber(const std::string &s) -{ - return !s.empty() && - std::find_if(s.begin(), s.end(), [](char c) { return !std::isdigit(c); }) == s.end(); -} - -std::vector<std::string> splitLine(std::string line, std::string delimiters = " \n\t") -{ - std::vector<std::string> words; - size_t prev = 0, pos; - - while ((pos = line.find_first_of(delimiters, prev)) != std::string::npos) - { - if (pos > prev) - words.emplace_back(line.substr(prev, pos - prev)); - prev = pos + 1; - } - - if (prev < line.length()) - words.emplace_back(line.substr(prev, std::string::npos)); - - return words; -} - -std::vector<std::string> getValueFromFileStatus(const std::string &file, const std::string &key) -{ - std::ifstream ifs(file); - assert(ifs.is_open()); - - std::string line; - std::vector<std::string> val; - - bool found = false; - while 
(std::getline(ifs, line)) - { - if (line.find(key) != std::string::npos) - { - found = true; - break; - } - } - ifs.close(); - - if (!found) - { - // NOTE. the process which uses gpu resources cannot be there yet at the model-load phase. - // At that time, just return empty. - return val; - } - - val = splitLine(line); - return val; -} - -// Because of smaps' structure, returns sum value as uint32_t -uint32_t getSumValueFromFileSmaps(const std::string &file, const std::string &key) -{ - std::ifstream ifs(file); - assert(ifs.is_open()); - - std::string line; - uint32_t sum = 0; - while (std::getline(ifs, line)) - { - if (line.find(key) != std::string::npos) - { - // an example by splitLine() - // `Pss: 0 kB` - // val[0]: "Pss:", val[1]: "0" val[2]: "kB" - auto val = splitLine(line); - assert(val.size() != 0); - // SwapPss could show so that check where Pss is at the beginning - if (val[0].find("Pss") != 0) - { - continue; - } - sum += std::stoul(val[1]); - } - } - - return sum; -} - -} // namespace - namespace benchmark { @@ -168,7 +75,7 @@ bool MemoryPoller::end(PhaseEnum phase) mem = getVmRSS(); if (_gpu_poll) { - mem += getGpuMemory(); + mem += getGpuMemory(_process_name); } if (mem > _rss_map[phase]) _rss_map[phase] = mem; @@ -176,7 +83,7 @@ bool MemoryPoller::end(PhaseEnum phase) mem = getVmHWM(); if (_gpu_poll) { - mem += getGpuMemory(); + mem += getGpuMemory(_process_name); } _hwm_map[phase] = mem; @@ -208,7 +115,7 @@ void MemoryPoller::process() uint32_t cur_hwm = getVmHWM(); if (_gpu_poll) { - auto gpu_mem = getGpuMemory(); + auto gpu_mem = getGpuMemory(_process_name); cur_rss += gpu_mem; cur_hwm += gpu_mem; } @@ -236,77 +143,33 @@ void MemoryPoller::process() bool MemoryPoller::prepareMemoryPolling() { // VmRSS + if (!prepareVmRSS()) { - std::ifstream ifs(proc_status_path); - if (!ifs.is_open()) - { - std::cerr << "failed to open " << proc_status_path << std::endl; - return false; - } - ifs.close(); + std::cerr << "failed to prepare parsing vmrss" << std::endl; + return false; } // (Additionally) GpuMemory if (_gpu_poll) { - std::ifstream ifs(gpu_memory_path); - if (!ifs.is_open()) + if (!prepareGpuMemory()) { - std::cerr << "failed to open " << gpu_memory_path << std::endl; + std::cerr << "failed to prepare parsing gpu memory" << std::endl; return false; } - ifs.close(); // Needs process name - auto val = getValueFromFileStatus(proc_status_path, "Name"); - assert(val.size() != 0); - _process_name = val[1]; + _process_name = getProcessName(); } // PSS + if (!preparePssSum()) { - std::ifstream ifs(proc_smaps_path); - if (!ifs.is_open()) - { - std::cerr << "failed to open " << proc_smaps_path << std::endl; - return false; - } - ifs.close(); + std::cerr << "failed to prepare parsing pss sum" << std::endl; + return false; } return true; } -uint32_t MemoryPoller::getVmRSS() -{ - auto val = getValueFromFileStatus(proc_status_path, "VmRSS"); - if (val.size() == 0) - return 0; - assert(isStrNumber(val[1])); - return std::stoul(val[1]); -} - -uint32_t MemoryPoller::getVmHWM() -{ - auto val = getValueFromFileStatus(proc_status_path, "VmHWM"); - if (val.size() == 0) - return 0; - // key: value - assert(isStrNumber(val[1])); - return std::stoul(val[1]); -} - -uint32_t MemoryPoller::getGpuMemory() -{ - assert(!_process_name.empty()); - auto val = getValueFromFileStatus(gpu_memory_path, _process_name); - if (val.size() == 0) - return 0; - // process_name -> pid -> gpu_mem -> max_gpu_mem - assert(isStrNumber(val[2])); - return std::stoul(val[2]); -} - -uint32_t MemoryPoller::getPssSum() { return 
getSumValueFromFileSmaps(proc_smaps_path, "Pss"); } - } // namespace benchmark diff --git a/runtime/libs/benchmark/src/Phases.cpp b/runtime/libs/benchmark/src/Phases.cpp index 9ab67cfd9..897b943d3 100644 --- a/runtime/libs/benchmark/src/Phases.cpp +++ b/runtime/libs/benchmark/src/Phases.cpp @@ -17,6 +17,7 @@ #include "benchmark/Phases.h" #include "benchmark/Types.h" +#include "benchmark/MemoryInfo.h" #include <cassert> #include <chrono> @@ -46,8 +47,11 @@ void SleepForMicros(uint64_t micros) namespace benchmark { -Phases::Phases(const PhaseOption &option) : _option(option) +Phases::Phases(const PhaseOption &option) : _option(option), _mem_before_init(0), _mem_after_run(0) { + assert(prepareVmRSS()); + _mem_before_init = getVmHWM(); + if (_option.memory) { _mem_poll = std::make_unique<MemoryPoller>(std::chrono::milliseconds(option.memory_interval), @@ -93,6 +97,8 @@ void Phases::run(const std::string &tag, const PhaseFunc &exec, const PhaseFunc } } + _mem_after_run = getVmHWM(); + if (p == PhaseEnum::END_OF_PHASE) { return; diff --git a/runtime/libs/benchmark/src/Result.cpp b/runtime/libs/benchmark/src/Result.cpp index df573da92..e6cafb91c 100644 --- a/runtime/libs/benchmark/src/Result.cpp +++ b/runtime/libs/benchmark/src/Result.cpp @@ -141,6 +141,15 @@ void printResultMemory(const uint32_t memory[benchmark::PhaseEnum::END_OF_PHASE] } } +void printUsedPeakMemory(uint32_t init_memory, uint32_t peak_memory) +{ + uint32_t used_peak_memory = peak_memory - init_memory; + std::cout << "Used Peak Memory : " << used_peak_memory << " kb" << std::endl; + std::cout << "- HWM after run : " << peak_memory << " kb" << std::endl; + std::cout << "- HWM before init: " << init_memory << " kb" << std::endl; + std::cout << "===================================" << std::endl; +} + } // namespace namespace benchmark @@ -175,6 +184,8 @@ Result::Result(const Phases &phases) } } } + init_memory = phases.mem_before_init(); + peak_memory = phases.mem_after_run(); } void printResult(const Result &result) @@ -185,6 +196,7 @@ void printResult(const Result &result) return; printResultMemory(result.memory); + printUsedPeakMemory(result.init_memory, result.peak_memory); } // TODO There are necessary for a kind of output data file so that it doesn't have to be csv file diff --git a/runtime/libs/misc/include/misc/polymorphic_downcast.h b/runtime/libs/misc/include/misc/polymorphic_downcast.h index 412b864e6..ee885eb70 100644 --- a/runtime/libs/misc/include/misc/polymorphic_downcast.h +++ b/runtime/libs/misc/include/misc/polymorphic_downcast.h @@ -27,9 +27,7 @@ namespace misc template <typename DstType, typename SrcType> inline DstType polymorphic_downcast(SrcType *x) { -#ifndef __ANDROID__ assert(dynamic_cast<DstType>(x) == x); -#endif return static_cast<DstType>(x); } diff --git a/runtime/libs/ndarray/CMakeLists.txt b/runtime/libs/ndarray/CMakeLists.txt deleted file mode 100644 index b040f5115..000000000 --- a/runtime/libs/ndarray/CMakeLists.txt +++ /dev/null @@ -1,19 +0,0 @@ -add_library(ndarray STATIC src/Array.cpp src/ContiguousSpan.cpp) - -set_target_properties(ndarray PROPERTIES POSITION_INDEPENDENT_CODE ON) - -target_include_directories(ndarray PUBLIC include) -#can't make this private because of c++ templates -target_include_directories(ndarray PUBLIC src) - -option(NDARRAY_INLINE_TEMPLATES "Set to ON to disable extern declarations for common types") - -if(${NDARRAY_INLINE_TEMPLATES}) - target_compile_definitions(ndarray PUBLIC -DNDARRAY_INLINE_TEMPLATES=1) -endif() - -target_link_libraries(ndarray PRIVATE nnfw_common) 
-target_link_libraries(ndarray PRIVATE nnfw_coverage) - -add_subdirectory(test) -add_subdirectory(example) diff --git a/runtime/libs/ndarray/example/CMakeLists.txt b/runtime/libs/ndarray/example/CMakeLists.txt deleted file mode 100644 index c4b575dad..000000000 --- a/runtime/libs/ndarray/example/CMakeLists.txt +++ /dev/null @@ -1,4 +0,0 @@ -add_executable(example_no_array example_no_array.cpp) - -add_executable(example_array example_array.cpp) -target_link_libraries(example_array PRIVATE ndarray) diff --git a/runtime/libs/ndarray/example/example_array.cpp b/runtime/libs/ndarray/example/example_array.cpp deleted file mode 100644 index 85d274681..000000000 --- a/runtime/libs/ndarray/example/example_array.cpp +++ /dev/null @@ -1,76 +0,0 @@ -/* - * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "ndarray/Array.h" - -#include <iostream> -#include <iterator> - -using namespace ndarray; - -void gather_array(const Array<float> &input, Array<float> &output, const Array<int> &indices) -{ - assert(indices.shape().rank() == 3); - assert(input.shape().rank() == 3); - assert(indices.shape().dim(1) == input.shape().rank()); - - for (size_t i = 0; i < indices.shape().dim(0); ++i) - { - for (size_t j = 0; j < indices.shape().dim(1); ++j) - { - auto index = indices.slice(i, j); - output.slice(i, j).assign(input.slice(index[0], index[1])); - } - } -} - -int main() -{ - // fill tensor of shape[3,3,4] with sequential numbers from [0..36) - Shape in_shape{3, 3, 4}; - std::vector<float> input_data(in_shape.element_count()); - for (size_t i = 0; i < in_shape.element_count(); ++i) - input_data[i] = i; - - Array<float> input(input_data.data(), in_shape); - - // select column-vectors on main diagonal - Shape indices_shape{1, 3, 2}; - std::vector<int> indices_data(indices_shape.element_count()); - Array<int> indices(indices_data.data(), indices_shape); - - indices.slice(0, 0) = {0, 0}; - indices.slice(0, 1) = {1, 1}; - indices.slice(0, 2) = {2, 2}; - - Shape output_shape{1, 3, 4}; - std::vector<float> output_data(output_shape.element_count()); - - Array<float> output(output_data.data(), output_shape); - - gather_array(input, output, indices); - - for (size_t i = 0; i < indices_shape.dim(0); ++i) - { - for (size_t j = 0; j < indices_shape.dim(1); ++j) - { - auto output_piece = output.slice(i, j); - std::ostream_iterator<int> cout_it(std::cout, ", "); - std::copy(output_piece.begin(), output_piece.end(), cout_it); - std::cout << std::endl; - } - } -} diff --git a/runtime/libs/ndarray/example/example_no_array.cpp b/runtime/libs/ndarray/example/example_no_array.cpp deleted file mode 100644 index 3a4d05dca..000000000 --- a/runtime/libs/ndarray/example/example_no_array.cpp +++ /dev/null @@ -1,85 +0,0 @@ -/* - * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include <array> -#include <vector> -#include <algorithm> -#include <cassert> -#include <iostream> - -void gather_no_array(const float *in_data, const std::array<size_t, 3> &dims, float *out_data, - const std::array<size_t, 3> &out_dims, //[nselections, - const int *indices, const std::array<size_t, 3> &indices_dims) -{ - assert(indices_dims[1] == dims.size()); - - for (int i = 0; i < indices_dims[0]; ++i) - { - for (int j = 0; j < indices_dims[1]; ++j) - { - const int *index_ptr = indices + i * indices_dims[2] * indices_dims[1] + j * indices_dims[2]; - - size_t in_offset = index_ptr[0] * dims[2] * dims[1] + index_ptr[1] * dims[2]; - - const float *in_ptr = in_data + in_offset; - - size_t out_offset = i * out_dims[2] * out_dims[1] + j * out_dims[2]; - - float *out_ptr = out_data + out_offset; - - for (int k = 0; k < dims[2]; ++k) - { - out_ptr[k] = in_ptr[k]; - } - } - } -} - -int main() -{ - std::array<size_t, 3> in_dims{3, 3, 4}; - std::vector<float> input(3 * 3 * 4); - for (size_t i = 0; i < 3 * 3 * 4; ++i) - input[i] = i; - - std::array<size_t, 3> indices_shape{1, 3, 2}; - std::vector<int> indices(1 * 3 * 2); - - indices[0] = 0; - indices[1] = 0; - indices[2] = 1; - indices[3] = 1; - indices[4] = 2; - indices[5] = 2; - - std::array<size_t, 3> output_dims{1, 3, 4}; - std::vector<float> output(1 * 3 * 4); - - gather_no_array(input.data(), in_dims, output.data(), output_dims, indices.data(), indices_shape); - - for (size_t i = 0; i < output_dims[0]; ++i) - { - for (size_t j = 0; j < output_dims[1]; ++j) - { - auto out_ptr = output.data() + i * output_dims[1] * output_dims[2] + j * output_dims[2]; - for (size_t k = 0; k < output_dims[2]; ++k) - { - std::cout << out_ptr[k] << ", "; - } - std::cout << std::endl; - } - } -} diff --git a/runtime/libs/ndarray/include/ndarray/Array.h b/runtime/libs/ndarray/include/ndarray/Array.h deleted file mode 100644 index 3890cc26b..000000000 --- a/runtime/libs/ndarray/include/ndarray/Array.h +++ /dev/null @@ -1,195 +0,0 @@ -/* - * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifndef _NDARRAY_ARRAY_H_ -#define _NDARRAY_ARRAY_H_ - -#include "Common.h" - -#include "ContiguousSpan.h" -#include "Shape.h" - -#if __cplusplus < 201402L -#include "detail/cxx14.h" //integer_sequence and make_index_dequence definitions -#else -#include <utility> -#endif - -#include <algorithm> -#include <cassert> -#include <type_traits> -#include <array> -#include <tuple> -#include <cstddef> - -namespace ndarray -{ - -// there is no index_sequence before c++14 -#if __cplusplus < 201402L - -template <size_t... Nums> using index_sequence = cxx14::index_sequence<Nums...>; - -template <size_t Num> using make_index_sequence = cxx14::make_index_sequence<Num>; - -#else - -template <size_t... Nums> using index_sequence = std::index_sequence<Nums...>; - -template <size_t _Num> using make_index_sequence = std::make_index_sequence<_Num>; - -#endif //__cplusplus < 201402L - -struct Strides -{ - explicit Strides(Shape s) : _strides{} { fillStrides(s); } - - int operator[](size_t idx) const noexcept { return _strides[idx]; } - - // since we don't have c++14 fold expression - template <typename Seq, typename... Ts> struct _calc_offset; - - template <size_t Num, size_t... Nums, typename T, typename... Ts> - struct _calc_offset<index_sequence<Num, Nums...>, T, Ts...> - { - static constexpr size_t get(const std::array<int, 8> &strides, int x, Ts... xs) - { - return _calc_offset<index_sequence<Nums...>, Ts...>::get(strides, xs...) + - x * std::get<Num>(strides); - } - }; - - template <size_t Num, typename T> struct _calc_offset<index_sequence<Num>, T> - { - static constexpr size_t get(const std::array<int, 8> &strides, int x) - { - return x * std::get<Num>(strides); - } - }; - - template <typename Seq, typename... Ts> constexpr size_t offset(Seq, Ts... x) const noexcept - { - // return ( 0 + ... + (std::get<Nums>(_strides) * x)); in c++14 - return _calc_offset<Seq, Ts...>::get(_strides, x...); - } - -private: - void fillStrides(const Shape &s) noexcept - { - int rank = s.rank(); - _strides[rank - 1] = 1; - for (int d = rank - 2; d >= 0; --d) - { - _strides[d] = _strides[d + 1] * s.dim(d + 1); - } - } - - std::array<int, NDARRAY_MAX_DIMENSION_COUNT> _strides; -}; - -template <typename T> class Array -{ -public: - Array(T *data, Shape shape) noexcept : _data(data), _shape(shape), _strides(shape) {} - - Array(const Array &) = delete; - - Array(Array &&a) noexcept : _data(a._data), _shape(a._shape), _strides(a._strides) - { - a._data = nullptr; - } - - template <typename... Ts> T &at(Ts... x) const noexcept { return _at(static_cast<size_t>(x)...); } - - /** - * @brief returns last dimension as ContigniousSpan - * @param x indices of slice to take. See tests for usage details - * @return slice at given position - */ - template <typename... Ts> ContiguousSpan<T, std::is_const<T>::value> slice(Ts... x) noexcept - { - assert(sizeof...(Ts) == _shape.rank() - 1); - return {&at(x..., 0ul), _shape.dim(_shape.rank() - 1)}; - } - - /** - * @brief returns last dimension as ContigniousSpan - * @param x indices of slice to take. See tests for usage details - * @return slice at given position - */ - template <typename... Ts> ContiguousSpan<T, true> slice(Ts... 
x) const noexcept - { - assert(sizeof...(Ts) == _shape.rank() - 1); - return {&at(x..., 0ul), _shape.dim(_shape.rank() - 1)}; - } - - ContiguousSpan<T, std::is_const<T>::value> flat() noexcept - { - return {_data, _shape.element_count()}; - } - - ContiguousSpan<T, true> flat() const noexcept { return {_data, _shape.element_count()}; } - - const Shape &shape() const noexcept { return _shape; } - -private: - template <typename... Ts> T &_at(Ts... x) const noexcept - { - assert(sizeof...(x) == _shape.rank()); - using Indices = make_index_sequence<sizeof...(Ts)>; - return _data[offset(Indices{}, x...)]; - } - - template <typename... Ts, size_t... Nums> - size_t offset(index_sequence<Nums...> seq, Ts... x) const noexcept - { - static_assert( - sizeof...(Ts) == sizeof...(Nums), - "Sanity check failed. Generated index sequence size is not equal to argument count"); - - return _strides.offset(seq, x...); - } - - T *_data; - Shape _shape; - Strides _strides; -}; - -template <typename To, typename From> Array<To> array_cast(Array<From> &&from, Shape newShape) -{ - assert(from.shape().element_count() / (sizeof(To) / sizeof(From)) == newShape.element_count()); - return Array<To>(reinterpret_cast<To *>(from.flat().data()), newShape); -} - -template <typename To, typename From> -Array<const To> array_cast(const Array<From> &from, Shape newShape) -{ - assert(from.shape().element_count() / (sizeof(To) / sizeof(From)) == newShape.element_count()); - return Array<To>(reinterpret_cast<const To *>(from.flat().data()), newShape); -} - -#ifndef NDARRAY_INLINE_TEMPLATES - -extern template class Array<float>; -extern template class Array<int32_t>; -extern template class Array<uint32_t>; -extern template class Array<uint8_t>; - -#endif // NDARRAY_INLINE_TEMPLATES - -} // namespace ndarray - -#endif //_NDARRAY_ARRAY_H_ diff --git a/runtime/libs/ndarray/include/ndarray/Common.h b/runtime/libs/ndarray/include/ndarray/Common.h deleted file mode 100644 index aa0cc6fe2..000000000 --- a/runtime/libs/ndarray/include/ndarray/Common.h +++ /dev/null @@ -1,22 +0,0 @@ -/* - * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef _NDARRAY_COMMON_H_ -#define _NDARRAY_COMMON_H_ - -#define NDARRAY_MAX_DIMENSION_COUNT 8 - -#endif //_NDARRAY_COMMON_H_ diff --git a/runtime/libs/ndarray/include/ndarray/ContiguousSpan.h b/runtime/libs/ndarray/include/ndarray/ContiguousSpan.h deleted file mode 100644 index 8caa6a686..000000000 --- a/runtime/libs/ndarray/include/ndarray/ContiguousSpan.h +++ /dev/null @@ -1,108 +0,0 @@ -/* - * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef _NDARRAY_CONTIGNIOUS_SPAN_H_ -#define _NDARRAY_CONTIGNIOUS_SPAN_H_ - -#include <type_traits> -#include <vector> -#include <cstdint> -#include <cstddef> -#include <cassert> - -namespace ndarray -{ - -template <typename T, bool isConst = false> class ContiguousSpan -{ -public: - using pointer_type = typename std::conditional<isConst, const T *, T *>::type; - using reference_type = typename std::conditional<isConst, const T &, T &>::type; - using iterator_type = pointer_type; - - ContiguousSpan(pointer_type data, size_t len) noexcept : _data(data), _len(len) {} - - template <typename It> - explicit ContiguousSpan(It first, It last) noexcept - : _data(&*first), _len(std::distance(first, last)) - { - } - - ContiguousSpan(const ContiguousSpan &) = delete; - - ContiguousSpan(ContiguousSpan &&s) noexcept : _data(s._data), _len(s._len) { s._data = nullptr; } - - operator ContiguousSpan<T, true>() { return ContiguousSpan<T, true>{_data, _len}; } - - reference_type operator[](size_t idx) const noexcept { return _data[idx]; } - - reference_type at(size_t idx) const noexcept { return _data[idx]; } - - ContiguousSpan<T, isConst> offset(size_t offset) - { - assert(offset <= _len); - return {_data + offset, _len - offset}; - } - - template <typename From, bool _ = isConst> - typename std::enable_if<!_, void>::type assign(const From &f) noexcept - { - assignFrom(std::begin(f), std::end(f)); - } - - template <typename U, bool _ = isConst> - typename std::enable_if<!_, ContiguousSpan &>::type - operator=(std::initializer_list<U> list) noexcept - { - assignFrom(std::begin(list), std::end(list)); - return *this; - } - - template <typename It, bool _ = isConst> - typename std::enable_if<!_, void>::type assignFrom(It first, It last) noexcept - { - std::copy(first, last, begin()); - } - - size_t size() const { return _len; } - - iterator_type begin() const { return iterator_type{_data}; } - - iterator_type end() const { return iterator_type{_data + _len}; } - - pointer_type data() { return _data; } - -private: - pointer_type _data; - size_t _len; -}; - -#ifndef NDARRAY_INLINE_TEMPLATES - -extern template class ContiguousSpan<float, true>; -extern template class ContiguousSpan<float, false>; -extern template class ContiguousSpan<int32_t, true>; -extern template class ContiguousSpan<int32_t, false>; -extern template class ContiguousSpan<uint32_t, true>; -extern template class ContiguousSpan<uint32_t, false>; -extern template class ContiguousSpan<uint8_t, true>; -extern template class ContiguousSpan<uint8_t, false>; - -#endif // NDARRAY_INLINE_TEMPLATES - -} // namespace ndarray - -#endif //_NDARRAY_CONTIGNIOUS_SPAN_H_ diff --git a/runtime/libs/ndarray/include/ndarray/Shape.h b/runtime/libs/ndarray/include/ndarray/Shape.h deleted file mode 100644 index fa58613b8..000000000 --- a/runtime/libs/ndarray/include/ndarray/Shape.h +++ /dev/null @@ -1,66 +0,0 @@ -/* - * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef _NDARRAY_SHAPE_H_ -#define _NDARRAY_SHAPE_H_ - -#include "Common.h" - -#include <array> -#include <cassert> -#include <cstddef> - -namespace ndarray -{ - -class Shape -{ -public: - //_dims{} here and later since array does not have std::initializer_list ctor - // and aggregate initialization is not allowed here - explicit Shape(size_t rank) noexcept : _dims{}, _rank(rank) - { - std::fill(_dims.begin(), _dims.end(), 0); - } - - Shape(std::initializer_list<size_t> list) noexcept : _dims{}, _rank(list.size()) - { - std::copy(list.begin(), list.end(), _dims.begin()); - } - - size_t dim(int i) const noexcept { return _dims.at(i); } - - size_t &dim(int i) noexcept { return _dims.at(i); } - - size_t element_count() const noexcept - { - uint32_t res = 1; - for (size_t i = 0; i < rank(); ++i) - res *= dim(i); - assert(res <= 0xffffffff); - return res; - } - - size_t rank() const noexcept { return _rank; } - -private: - std::array<size_t, NDARRAY_MAX_DIMENSION_COUNT> _dims; - size_t _rank; -}; - -} // namespace ndarray - -#endif //_NDARRAY_SHAPE_H_ diff --git a/runtime/libs/ndarray/src/detail/cxx14.h b/runtime/libs/ndarray/src/detail/cxx14.h deleted file mode 100644 index 81135b3f2..000000000 --- a/runtime/libs/ndarray/src/detail/cxx14.h +++ /dev/null @@ -1,67 +0,0 @@ -/* - * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef _NDARRAY_CXX14_H_ -#define _NDARRAY_CXX14_H_ - -namespace ndarray -{ - -namespace cxx14 -{ - -template <size_t... Nums> struct index_sequence -{ - using value_type = size_t; - - static constexpr std::size_t size() noexcept { return sizeof...(Nums); } -}; - -namespace detail -{ - -template <size_t v, typename Seq> struct _append; - -template <size_t v, size_t... 
Nums> struct _append<v, index_sequence<Nums...>> -{ - using result = index_sequence<Nums..., v>; -}; - -template <size_t Len> struct make_index_sequence -{ - using result = - typename detail::_append<Len - 1, typename make_index_sequence<Len - 1>::result>::result; -}; - -template <> struct make_index_sequence<1> -{ - using result = index_sequence<0>; -}; - -template <> struct make_index_sequence<0> -{ - using result = index_sequence<>; -}; - -} // namespace detail - -template <size_t Num> using make_index_sequence = typename detail::make_index_sequence<Num>::result; - -} // namespace cxx14 - -} // namespace ndarray - -#endif //_NDARRAY_CXX14_H_ diff --git a/runtime/libs/ndarray/test/CMakeLists.txt b/runtime/libs/ndarray/test/CMakeLists.txt deleted file mode 100644 index 16f8779ee..000000000 --- a/runtime/libs/ndarray/test/CMakeLists.txt +++ /dev/null @@ -1,17 +0,0 @@ -if(NOT BUILD_NDARRAY_TEST) - return() -endif() - -add_executable(ndarray_test ndarray_test.cpp) - -target_link_libraries(ndarray_test PRIVATE ndarray) - -nnfw_find_package(GTest) -if(NOT GTest_FOUND) - message(STATUS "GTest not avaialble. Skipping NDArray test build") - return() -endif(NOT GTest_FOUND) - -target_link_libraries(ndarray_test PUBLIC gtest gtest_main ${LIB_PTHREAD}) - -add_test(ndarray_test ndarray_test) diff --git a/runtime/libs/ndarray/test/ndarray_test.cpp b/runtime/libs/ndarray/test/ndarray_test.cpp deleted file mode 100644 index 0aa948c72..000000000 --- a/runtime/libs/ndarray/test/ndarray_test.cpp +++ /dev/null @@ -1,92 +0,0 @@ -/* - * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "gtest/gtest.h" - -#include "ndarray/Array.h" - -using namespace ndarray; - -TEST(NDArray_tests, basic_data_test) -{ - - float raw_data[] = {1, 2, 3, 4}; - - Array<float> data22{raw_data, {2, 2}}; - - ASSERT_FLOAT_EQ(data22.at(0, 0), 1); - ASSERT_FLOAT_EQ(data22.at(0, 1), 2); - ASSERT_FLOAT_EQ(data22.at(1, 0), 3); - ASSERT_FLOAT_EQ(data22.at(1, 1), 4); - - Array<float> data14{raw_data, {1, 4}}; - ASSERT_FLOAT_EQ(data22.at(0, 0), 1); - ASSERT_FLOAT_EQ(data22.at(0, 1), 2); - ASSERT_FLOAT_EQ(data22.at(0, 2), 3); - ASSERT_FLOAT_EQ(data22.at(0, 3), 4); -} - -TEST(NDArray_tests, slice_write_test) -{ - float raw_data[4] = {0}; - - Array<float> data22{raw_data, {2, 2}}; - - data22.slice(1) = {1, 2}; - - ASSERT_FLOAT_EQ(data22.at(0, 0), 0); - ASSERT_FLOAT_EQ(data22.at(0, 1), 0); - ASSERT_FLOAT_EQ(data22.at(1, 0), 1); - ASSERT_FLOAT_EQ(data22.at(1, 1), 2); -} - -TEST(NDArray_tests, slice_read_test) -{ - float raw_data[4] = {1, 2, 3, 4}; - - Array<float> data22{raw_data, {2, 2}}; - - auto slice = data22.slice(1); - - ASSERT_FLOAT_EQ(slice[0], 3); - ASSERT_FLOAT_EQ(slice[1], 4); -} - -TEST(NDArray_tests, multidim_test) -{ - float raw_data[5] = {0, 1, 2, 3, 4}; - - Array<float> data22{raw_data, {1, 1, 1, 1, 5}}; - - ASSERT_FLOAT_EQ(data22.at(0, 0, 0, 0, 0), 0); - ASSERT_FLOAT_EQ(data22.at(0, 0, 0, 0, 1), 1); - ASSERT_FLOAT_EQ(data22.at(0, 0, 0, 0, 2), 2); - ASSERT_FLOAT_EQ(data22.at(0, 0, 0, 0, 3), 3); - ASSERT_FLOAT_EQ(data22.at(0, 0, 0, 0, 4), 4); -} - -TEST(NDArray_tests, slice_assign_test) -{ - std::vector<float> v1{1, 2, 3, 4, 5}; - std::vector<float> v2(5); - - ContiguousSpan<float> span1(v1.begin(), v1.end()); - ContiguousSpan<float> span2(v2.begin(), v2.end()); - - span2.assign(span1); - - ASSERT_EQ(v1, v2); -} diff --git a/runtime/libs/nnapi/CMakeLists.txt b/runtime/libs/nnapi/CMakeLists.txt index a5d9490d1..73f82b909 100644 --- a/runtime/libs/nnapi/CMakeLists.txt +++ b/runtime/libs/nnapi/CMakeLists.txt @@ -1,3 +1,4 @@ -add_subdirectories() +add_library(nnfw_lib_nnapi INTERFACE) -add_library(nnfw_lib_nnapi ALIAS nnfw_lib_nnapi_1_2) +target_include_directories(nnfw_lib_nnapi INTERFACE ${CMAKE_CURRENT_SOURCE_DIR}/include) +target_link_libraries(nnfw_lib_nnapi INTERFACE nnfw-nnapi-header) diff --git a/runtime/libs/nnapi/v1.2/include/NeuralNetworksExShim.h b/runtime/libs/nnapi/include/NeuralNetworksExShim.h index 855613241..855613241 100644 --- a/runtime/libs/nnapi/v1.2/include/NeuralNetworksExShim.h +++ b/runtime/libs/nnapi/include/NeuralNetworksExShim.h diff --git a/runtime/libs/nnapi/v1.2/include/NeuralNetworksLoadHelpers.h b/runtime/libs/nnapi/include/NeuralNetworksLoadHelpers.h index 1c482b54c..1c482b54c 100644 --- a/runtime/libs/nnapi/v1.2/include/NeuralNetworksLoadHelpers.h +++ b/runtime/libs/nnapi/include/NeuralNetworksLoadHelpers.h diff --git a/runtime/libs/nnapi/v1.2/include/NeuralNetworksShim.h b/runtime/libs/nnapi/include/NeuralNetworksShim.h index 80082383f..80082383f 100644 --- a/runtime/libs/nnapi/v1.2/include/NeuralNetworksShim.h +++ b/runtime/libs/nnapi/include/NeuralNetworksShim.h diff --git a/runtime/libs/nnapi/v1.2/include/NeuralNetworksTypes.h b/runtime/libs/nnapi/include/NeuralNetworksTypes.h index d74402749..d74402749 100644 --- a/runtime/libs/nnapi/v1.2/include/NeuralNetworksTypes.h +++ b/runtime/libs/nnapi/include/NeuralNetworksTypes.h diff --git a/runtime/libs/nnapi/v1.1/CMakeLists.txt b/runtime/libs/nnapi/v1.1/CMakeLists.txt deleted file mode 100644 index dc018c60f..000000000 --- a/runtime/libs/nnapi/v1.1/CMakeLists.txt +++ /dev/null @@ -1,4 +0,0 @@ 
-add_library(nnfw_lib_nnapi_1_1 INTERFACE) - -target_include_directories(nnfw_lib_nnapi_1_1 INTERFACE ${CMAKE_CURRENT_SOURCE_DIR}/include) -target_link_libraries(nnfw_lib_nnapi_1_1 INTERFACE nnfw-nnapi-header) diff --git a/runtime/libs/nnapi/v1.1/include/NeuralNetworksExShim.h b/runtime/libs/nnapi/v1.1/include/NeuralNetworksExShim.h deleted file mode 100644 index f684dab90..000000000 --- a/runtime/libs/nnapi/v1.1/include/NeuralNetworksExShim.h +++ /dev/null @@ -1,64 +0,0 @@ -/* Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - Copyright 2017 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -/** - * @file NeuralNetworksExShim.h - * @brief This file contains an actual implementation of - * ANeuralNetworksModel_addOperationEx function - * @ingroup COM_AI_RUNTIME - */ - -#ifndef NN_API_EX_SHIM_H -#define NN_API_EX_SHIM_H - -#include "NeuralNetworksEx.h" -#include "NeuralNetworksLoadHelpers.h" - -typedef int (*ANeuralNetworksModel_addOperationEx_fn)(ANeuralNetworksModel *model, - ANeuralNetworksOperationTypeEx type, - uint32_t inputCount, const uint32_t *inputs, - uint32_t outputCount, - const uint32_t *outputs); - -/** - * @brief Add an extended operation to a model. - * - * @param[in] model The model to be modified. - * @param[in] type The type of extended operation. - * @param[in] inputCount The number of entries in the inputs array. - * @param[in] inputs An array of indexes identifying each operand. - * @param[in] outputCount The number of entries in the outputs array. - * @param[in] outputs An array of indexes identifying each operand. - * - * @note The operands specified by inputs and outputs must have been - * previously added by calls to {@link ANeuralNetworksModel_addOperand}.\n - * Attempting to modify a model once {@link ANeuralNetworksModel_finish} - * has been called will return an error.\n - * See {@link ANeuralNetworksModel} for information on multithreaded usage. - * - * @return ANEURALNETWORKS_NO_ERROR if successful. - */ - -inline int ANeuralNetworksModel_addOperationEx(ANeuralNetworksModel *model, - ANeuralNetworksOperationTypeEx type, - uint32_t inputCount, const uint32_t *inputs, - uint32_t outputCount, const uint32_t *outputs) -{ - LOAD_FUNCTION(ANeuralNetworksModel_addOperationEx); - EXECUTE_FUNCTION_RETURN(model, type, inputCount, inputs, outputCount, outputs); -} - -#endif // NN_API_EX_SHIM_H diff --git a/runtime/libs/nnapi/v1.1/include/NeuralNetworksLoadHelpers.h b/runtime/libs/nnapi/v1.1/include/NeuralNetworksLoadHelpers.h deleted file mode 100644 index 201465f9c..000000000 --- a/runtime/libs/nnapi/v1.1/include/NeuralNetworksLoadHelpers.h +++ /dev/null @@ -1,141 +0,0 @@ -/* Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - Copyright 2017 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -// NOTE To minimize diff with upstream tensorflow, disable clang-format -// clang-format off - -// NOTE This header is derived from part of the following file (in TensorFlow v1.12) -// 'externals/tensorflow/tensorflow/contrib/lite/nnapi/NeuralNetworksShim.h' - -/** - * @file NeuralNetworksLoadHelpers.h - * @ingroup COM_AI_RUNTIME - * @brief This file contains functions to load NN API runtime library - */ - -#ifndef __NEURAL_NETWORKS_LOAD_HELPER_H__ -#define __NEURAL_NETWORKS_LOAD_HELPER_H__ - -#include <dlfcn.h> -#include <stdint.h> -#include <stdio.h> -#include <stdlib.h> - -/** - * @brief Print log data - * @param[in] format Format string of @c printf - * @param[in] args Argument after format string. (Same with @c printf) - */ -#define NNAPI_LOG(format, ...) printf(format "\n", __VA_ARGS__); - -/** - * @brief Create a function pointer named @c fn after loading NN API library - * @param[in] name Name of a function - */ -#define LOAD_FUNCTION(name) \ - static name##_fn fn = reinterpret_cast<name##_fn>(nnfw::loadFunction(#name)); - -/** - * @brief Run @c fn function. @c fn is created by @ref LOAD_FUNCTION - * @param[in] args List of arguments for the function @c fn - */ -#define EXECUTE_FUNCTION(...) \ - if (fn != nullptr) { \ - fn(__VA_ARGS__); \ - } - -/** - * @brief Run @c fn function. @c fn is created by @ref LOAD_FUNCTION - * @param[in] args List of arguments for the function @c fn - * @return the return value of @c fn - */ -#define EXECUTE_FUNCTION_RETURN(...) return fn != nullptr ? fn(__VA_ARGS__) : 0; - -namespace nnfw -{ - -/** - * @brief Load NN API library - * @param[in] name path of NN API library - * @return a symbol table handle of NN API library - */ -inline void* loadLibrary(const char* name) { - // TODO: change RTLD_LOCAL? 
Assumes there can be multiple instances of nn - // api RT - void* handle = nullptr; -#if 1 //#ifdef __ANDROID__ - handle = dlopen(name, RTLD_LAZY | RTLD_LOCAL); - if (handle == nullptr) { - NNAPI_LOG("nnapi error: unable to open library %s", name); - NNAPI_LOG(" %s", dlerror()); - } -#endif - return handle; -} - -/** - * @brief Load libneuralnetworks.so and return handle of library - * @return a symbol table handle of NN API library - */ -inline void* getLibraryHandle() { - static void* handle = loadLibrary("libneuralnetworks.so"); - return handle; -} - -/** - * @brief Return function ptr in libneuralnetworks.so - * @param[in] name Name of function - * @return function pointer - */ -inline void* loadFunction(const char* name) { - void* fn = nullptr; - if (getLibraryHandle() != nullptr) { - fn = dlsym(getLibraryHandle(), name); - } - if (fn == nullptr) { - NNAPI_LOG("nnapi error: unable to open function %s", name); - NNAPI_LOG(" %s", dlerror()); - abort(); - } - else { -#ifdef _GNU_SOURCE - Dl_info info; - if (dladdr(fn, &info)) - { - NNAPI_LOG("nnapi function '%s' is loaded from '%s' ", name, info.dli_fname); - } - else - { - NNAPI_LOG("nnapi function '%s' is failed to load", name); - } - -#endif // _GNU_SOURCE - } - return fn; -} - -/** - * @brief Check if libneuralnetworks.so can be loaded - * @return @c true if loading is successful, otherwise @c false. - */ -inline bool NNAPIExists() { - static bool nnapi_is_available = getLibraryHandle(); - return nnapi_is_available; -} - -} // namespace nnfw - -#endif // __NEURAL_NETWORKS_LOAD_HELPER_H__ diff --git a/runtime/libs/nnapi/v1.1/include/NeuralNetworksShim.h b/runtime/libs/nnapi/v1.1/include/NeuralNetworksShim.h deleted file mode 100644 index 60b16f766..000000000 --- a/runtime/libs/nnapi/v1.1/include/NeuralNetworksShim.h +++ /dev/null @@ -1,709 +0,0 @@ -/* Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - Copyright 2017 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
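The load-helper header removed here (its v1.2 twin is kept, unchanged, under runtime/libs/nnapi/include) implements lazy binding: the library handle and each function pointer are resolved once via dlopen/dlsym and cached in function-local statics. Below is a stripped-down sketch of the same pattern for a single entry point; it is an illustration using only POSIX dlfcn calls, with the opaque ANeuralNetworksModel* replaced by void* so the sketch has no NNAPI header dependency, and with error handling reduced to a message. Link with -ldl where dlopen is not in libc.

    #include <dlfcn.h>

    #include <cstdio>

    // Signature of the NNAPI entry point we want to bind lazily.
    // The real first parameter is ANeuralNetworksModel**; void** stands in
    // for the opaque handle here.
    typedef int (*Model_create_fn)(void** model);

    // Open libneuralnetworks.so once and cache the handle, mirroring
    // getLibraryHandle() in the header above.
    static void* nnapi_handle()
    {
      static void* handle = dlopen("libneuralnetworks.so", RTLD_LAZY | RTLD_LOCAL);
      return handle;
    }

    // Resolve the symbol once and cache it, mirroring LOAD_FUNCTION above.
    static Model_create_fn load_model_create()
    {
      static Model_create_fn fn = nnapi_handle()
          ? reinterpret_cast<Model_create_fn>(dlsym(nnapi_handle(), "ANeuralNetworksModel_create"))
          : nullptr;
      return fn;
    }

    int main()
    {
      void* model = nullptr;
      if (Model_create_fn fn = load_model_create())
        std::printf("ANeuralNetworksModel_create returned %d\n", fn(&model));
      else
        std::printf("libneuralnetworks.so is not available here\n");
      return 0;
    }
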
-==============================================================================*/ - -// NOTE To minimize diff with upstream tensorflow, disable clang-format -// clang-format off - -// NOTE This header is derived from part of the following file (in TensorFlow v1.12) -// 'externals/tensorflow/tensorflow/contrib/lite/nnapi/NeuralNetworksShim.h' -#ifndef __NEURAL_NETWORKS_SHIM__ -#define __NEURAL_NETWORKS_SHIM__ - -#include "NeuralNetworks.h" -#include "NeuralNetworksLoadHelpers.h" - -// nn api function types - -typedef int (*ANeuralNetworksMemory_createFromFd_fn)( - size_t size, int protect, int fd, size_t offset, - ANeuralNetworksMemory** memory); - -typedef void (*ANeuralNetworksMemory_free_fn)(ANeuralNetworksMemory* memory); - -typedef int (*ANeuralNetworksModel_create_fn)(ANeuralNetworksModel** model); - -typedef int (*ANeuralNetworksModel_finish_fn)(ANeuralNetworksModel* model); - -typedef void (*ANeuralNetworksModel_free_fn)(ANeuralNetworksModel* model); - -typedef int (*ANeuralNetworksCompilation_create_fn)( - ANeuralNetworksModel* model, ANeuralNetworksCompilation** compilation); - -typedef void (*ANeuralNetworksCompilation_free_fn)( - ANeuralNetworksCompilation* compilation); - -typedef int (*ANeuralNetworksCompilation_setPreference_fn)( - ANeuralNetworksCompilation* compilation, int32_t preference); - -typedef int (*ANeuralNetworksCompilation_finish_fn)( - ANeuralNetworksCompilation* compilation); - -typedef int (*ANeuralNetworksModel_addOperand_fn)( - ANeuralNetworksModel* model, const ANeuralNetworksOperandType* type); - -typedef int (*ANeuralNetworksModel_setOperandValue_fn)( - ANeuralNetworksModel* model, int32_t index, const void* buffer, - size_t length); - -typedef int (*ANeuralNetworksModel_setOperandValueFromMemory_fn)( - ANeuralNetworksModel* model, int32_t index, - const ANeuralNetworksMemory* memory, size_t offset, size_t length); - -typedef int (*ANeuralNetworksModel_addOperation_fn)( - ANeuralNetworksModel* model, ANeuralNetworksOperationType type, - uint32_t inputCount, const uint32_t* inputs, uint32_t outputCount, - const uint32_t* outputs); - -typedef int (*ANeuralNetworksModel_identifyInputsAndOutputs_fn)( - ANeuralNetworksModel* model, uint32_t inputCount, const uint32_t* inputs, - uint32_t outputCount, const uint32_t* outputs); - -typedef int (*ANeuralNetworksModel_relaxComputationFloat32toFloat16_fn)( - ANeuralNetworksModel* model, bool allow); - -typedef int (*ANeuralNetworksExecution_create_fn)( - ANeuralNetworksCompilation* compilation, - ANeuralNetworksExecution** execution); - -typedef void (*ANeuralNetworksExecution_free_fn)( - ANeuralNetworksExecution* execution); - -typedef int (*ANeuralNetworksExecution_setInput_fn)( - ANeuralNetworksExecution* execution, int32_t index, - const ANeuralNetworksOperandType* type, const void* buffer, size_t length); - -typedef int (*ANeuralNetworksExecution_setInputFromMemory_fn)( - ANeuralNetworksExecution* execution, int32_t index, - const ANeuralNetworksOperandType* type, const ANeuralNetworksMemory* memory, - size_t offset, size_t length); - -typedef int (*ANeuralNetworksExecution_setOutput_fn)( - ANeuralNetworksExecution* execution, int32_t index, - const ANeuralNetworksOperandType* type, void* buffer, size_t length); - -typedef int (*ANeuralNetworksExecution_setOutputFromMemory_fn)( - ANeuralNetworksExecution* execution, int32_t index, - const ANeuralNetworksOperandType* type, const ANeuralNetworksMemory* memory, - size_t offset, size_t length); - -typedef int (*ANeuralNetworksExecution_startCompute_fn)( - 
ANeuralNetworksExecution* execution, ANeuralNetworksEvent** event); - -typedef int (*ANeuralNetworksEvent_wait_fn)(ANeuralNetworksEvent* event); - -typedef void (*ANeuralNetworksEvent_free_fn)(ANeuralNetworksEvent* event); - -/** - * Creates a shared memory object from a file descriptor. - * - * The shared memory is backed by a file descriptor via mmap. - * See {@link ANeuralNetworksMemory} for a description on how to use - * this shared memory. - * - * @param size The requested size in bytes. - * Must not be larger than the file size. - * @param prot The desired memory protection for the mapping. - * It is either PROT_NONE or the bitwise OR of one or - * more of the following flags: PROT_READ, PROT_WRITE. - * @param fd The requested file descriptor. - * The file descriptor has to be mmap-able. The file - * descriptor will be duplicated. - * @param offset The offset to the beginning of the file of the area to map. - * The offset has to be aligned to a page size. - * @param memory The memory object to be created. - * Set to NULL if unsuccessful. - * - * @return ANEURALNETWORKS_NO_ERROR if the request completed normally. - */ -inline int ANeuralNetworksMemory_createFromFd(size_t size, int protect, int fd, - size_t offset, - ANeuralNetworksMemory** memory) { - LOAD_FUNCTION(ANeuralNetworksMemory_createFromFd); - EXECUTE_FUNCTION_RETURN(size, protect, fd, offset, memory); -} - -/** - * Delete a memory object. - * - * Destroys the object used by the run time to keep track of the memory. - * This will free the underlying actual memory if no other code has open - * handles to this memory. - * - * @param memory The memory object to be freed. - */ -inline void ANeuralNetworksMemory_free(ANeuralNetworksMemory* memory) { - LOAD_FUNCTION(ANeuralNetworksMemory_free); - EXECUTE_FUNCTION(memory); -} - -/** - * Create an empty {@link ANeuralNetworksModel}. - * - * <p>This only creates the object. Computation is performed once - * {@link ANeuralNetworksExecution_startCompute} is invoked. - * - * The model should be constructed with calls to - * {@link ANeuralNetworksModel_addOperation} and - * {@link ANeuralNetworksModel_addOperand} - * - * <p>{@link ANeuralNetworksModel_finish} should be called once the model - * has been fully constructed.</p> - * - * <p>{@link ANeuralNetworksModel_free} should be called once the model - * is no longer needed.</p> - * - * @param model The {@link ANeuralNetworksModel} to be created. - * Set to NULL if unsuccessful. - * - * @return ANEURALNETWORKS_NO_ERROR if successful. - */ -inline int ANeuralNetworksModel_create(ANeuralNetworksModel** model) { - LOAD_FUNCTION(ANeuralNetworksModel_create); - EXECUTE_FUNCTION_RETURN(model); -} - -/** - * Destroy a model. - * - * The model need not have been finished by a call to - * {@link ANeuralNetworksModel_finish}. - * - * See {@link ANeuralNetworksModel} for information on multithreaded usage. - * - * @param model The model to be destroyed. Passing NULL is acceptable and - * results in no operation. - */ -inline void ANeuralNetworksModel_free(ANeuralNetworksModel* model) { - LOAD_FUNCTION(ANeuralNetworksModel_free); - EXECUTE_FUNCTION(model); -} - -/** - * Indicate that we have finished modifying a model. Required before - * calling {@link ANeuralNetworksCompilation_compile}. - * - * An application is responsible to make sure that no other thread uses - * the model at the same time. - * - * See {@link ANeuralNetworksModel} for information on multithreaded usage. - * - * @param model The model to be finished. 
- * - * @return ANEURALNETWORKS_NO_ERROR if successful. - */ -inline int ANeuralNetworksModel_finish(ANeuralNetworksModel* model) { - LOAD_FUNCTION(ANeuralNetworksModel_finish); - EXECUTE_FUNCTION_RETURN(model); -} - -/** - * Add an operand to a model. - * - * The order in which the operands are added is important. The first one added - * to a model will have the index value 0, the second 1, etc. These indexes are - * used as operand identifiers in {@link ANeuralNetworksModel_addOperation}, - * {@link ANeuralNetworksExecution_setInput}, - * {@link ANeuralNetworksExecution_setInputFromMemory}, - * {@link ANeuralNetworksExecution_setOutput}, - * {@link ANeuralNetworksExecution_setOutputFromMemory} and - * {@link ANeuralNetworksExecution_setOperandValue}. - * - * To build a model that can accommodate inputs of various sizes, as you may - * want to do for a CNN, set the size of the dimensions that will vary at run - * time to 0. If you do so, provide the full dimensions when calling - * {@link ANeuralNetworksExecution_setInput} or {@link - * ANeuralNetworksExecution_setInputFromMemory}. - * - * Attempting to modify a model once {@link ANeuralNetworksModel_finish} has - * been called will return an error. - * - * See {@link ANeuralNetworksModel} for information on multithreaded usage. - * - * @param model The model to be modified. - * @param type The {@link ANeuralNetworksOperandType} that describes the shape - * of the operand. - * - * @return ANEURALNETWORKS_NO_ERROR if successful. - */ -inline int ANeuralNetworksModel_addOperand( - ANeuralNetworksModel* model, const ANeuralNetworksOperandType* type) { - LOAD_FUNCTION(ANeuralNetworksModel_addOperand); - EXECUTE_FUNCTION_RETURN(model, type); -} - -/** - * Sets an operand to a constant value. - * - * For scalar values, the content of buffer is copied into the model. - * - * For tensor values, a pointer to the buffer is stored within the model. - * The application is responsible for not changing the content of this region - * until all executions using this model have completed. As the data may - * be copied during processing, modifying the data after this call yields - * undefined results. - * - * Attempting to modify a model once {@link ANeuralNetworksModel_finish} has - * been called will return an error. - * - * See {@link ANeuralNetworksModel} for information on multithreaded usage. - * - * @param model The model to be modified. - * @param index The index of the model operand we're setting. - * @param buffer A pointer to the data to use. - * @param length The size in bytes of the data value. - * - * @return ANEURALNETWORKS_NO_ERROR if successful. - */ -inline int ANeuralNetworksModel_setOperandValue(ANeuralNetworksModel* model, - int32_t index, - const void* buffer, - size_t length) { - LOAD_FUNCTION(ANeuralNetworksModel_setOperandValue); - EXECUTE_FUNCTION_RETURN(model, index, buffer, length); -} - -/** - * Sets an operand to a value stored in a memory object. - * - * The content of the memory is not copied. A reference to that memory is stored - * inside the model. The application is responsible for not changing the content - * of the memory region until all executions using this model have completed. - * As the data may be copied during processing, modifying the data after this - * call yields undefined results. - * - * Attempting to modify a model once {@link ANeuralNetworksModel_finish} has - * been called will return an error. - * - * See {@link ANeuralNetworksModel} for information on multithreaded usage. 
- * - * @param model The model to be modified. - * @param index The index of the model operand we're setting. - * @param buffer A pointer to the data to use. - * @param memory The memory containing the data. - * @param offset This specifies the location of the data within the memory. - * The offset is in bytes from the start of memory. - * @param length The size in bytes of the data value. - * - * @return ANEURALNETWORKS_NO_ERROR if successful. - */ -inline int ANeuralNetworksModel_setOperandValueFromMemory( - ANeuralNetworksModel* model, int32_t index, - const ANeuralNetworksMemory* memory, size_t offset, size_t length) { - LOAD_FUNCTION(ANeuralNetworksModel_setOperandValueFromMemory); - EXECUTE_FUNCTION_RETURN(model, index, memory, offset, length); -} - -/** - * Add an operation to a model. - * - * @param model The model to be modified. - * @param type The type of the operation. - * @param inputCount The number of entries in the inputs array. - * @param inputs An array of indexes identifying each operand. - * @param outputCount The number of entries in the outputs array. - * @param outputs An array of indexes identifying each operand. - * - * The operands specified by inputs and outputs must have been - * previously added by calls to {@link ANeuralNetworksModel_addOperand}. - * - * Attempting to modify a model once {@link ANeuralNetworksModel_finish} has - * been called will return an error. - * - * See {@link ANeuralNetworksModel} for information on multithreaded usage. - * - * @return ANEURALNETWORKS_NO_ERROR if successful. - */ -inline int ANeuralNetworksModel_addOperation(ANeuralNetworksModel* model, - ANeuralNetworksOperationType type, - uint32_t inputCount, - const uint32_t* inputs, - uint32_t outputCount, - const uint32_t* outputs) { - LOAD_FUNCTION(ANeuralNetworksModel_addOperation); - EXECUTE_FUNCTION_RETURN(model, type, inputCount, inputs, outputCount, - outputs); -} - -/** - * Specifies which operands will be the model's inputs and outputs. - * - * An operand cannot be used for both input and output. Doing so will - * return an error. - * - * @param model The model to be modified. - * @param inputCount The number of entries in the inputs array. - * @param inputs An array of indexes identifying the input operands. - * @param outputCount The number of entries in the outputs array. - * @param outputs An array of indexes identifying the output operands. - * - * The operands specified by inputs and outputs must have been - * previously added by calls to {@link ANeuralNetworksModel_addOperand}. - * - * Attempting to modify a model once {@link ANeuralNetworksModel_finish} has - * been called will return an error. - * - * See {@link ANeuralNetworksModel} for information on multithreaded usage. - * - */ -inline int ANeuralNetworksModel_identifyInputsAndOutputs( - ANeuralNetworksModel* model, uint32_t inputCount, const uint32_t* inputs, - uint32_t outputCount, const uint32_t* outputs) { - LOAD_FUNCTION(ANeuralNetworksModel_identifyInputsAndOutputs); - EXECUTE_FUNCTION_RETURN(model, inputCount, inputs, outputCount, outputs); -} - -/** - * Specifies whether {@link ANEURALNETWORKS_TENSOR_FLOAT32} is allowed to be - * calculated with range and/or precision as low as that of the IEEE 754 16-bit - * floating-point format. By default, {@link ANEURALNETWORKS_TENSOR_FLOAT32} - * must be calculated using at least the range and precision of the IEEE 754 - * 32-bit floating-point format. - * - * @param model The model to be modified. 
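The wrapper functions documented above are enough to assemble a complete model. Below is a minimal, hedged sketch that builds a single ADD operation (two float tensors plus the fused-activation scalar) through these shims; the operand struct layout and enum values are the standard ones from NeuralNetworks.h, and all error checking is elided for brevity.

    #include "NeuralNetworksShim.h"

    #include <cstdint>
    #include <cstdio>

    int main()
    {
      ANeuralNetworksModel* model = nullptr;
      ANeuralNetworksModel_create(&model);

      // Two 1-D float tensors of four elements, plus an INT32 activation scalar.
      uint32_t dims[1] = {4};
      ANeuralNetworksOperandType tensor4 = {ANEURALNETWORKS_TENSOR_FLOAT32, 1, dims, 0.0f, 0};
      ANeuralNetworksOperandType scalarI32 = {ANEURALNETWORKS_INT32, 0, nullptr, 0.0f, 0};

      ANeuralNetworksModel_addOperand(model, &tensor4);   // operand 0: first input
      ANeuralNetworksModel_addOperand(model, &tensor4);   // operand 1: second input
      ANeuralNetworksModel_addOperand(model, &scalarI32); // operand 2: activation
      ANeuralNetworksModel_addOperand(model, &tensor4);   // operand 3: output

      // The activation operand is a constant baked into the model.
      int32_t act = ANEURALNETWORKS_FUSED_NONE;
      ANeuralNetworksModel_setOperandValue(model, 2, &act, sizeof(act));

      // output(3) = input(0) + input(1), with no fused activation.
      uint32_t addIns[3] = {0, 1, 2};
      uint32_t addOuts[1] = {3};
      ANeuralNetworksModel_addOperation(model, ANEURALNETWORKS_ADD, 3, addIns, 1, addOuts);

      // Operands 0 and 1 are the model inputs, operand 3 the model output.
      uint32_t modelIns[2] = {0, 1};
      uint32_t modelOuts[1] = {3};
      ANeuralNetworksModel_identifyInputsAndOutputs(model, 2, modelIns, 1, modelOuts);

      std::printf("finish: %d\n", ANeuralNetworksModel_finish(model));
      ANeuralNetworksModel_free(model);
      return 0;
    }
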
- * @param allow 'true' indicates {@link ANEURALNETWORKS_TENSOR_FLOAT32} may be - * calculated with range and/or precision as low as that of the - * IEEE 754 16-bit floating point format. 'false' indicates - * {@link ANEURALNETWORKS_TENSOR_FLOAT32} must be calculated using - * at least the range and precision of the IEEE 754 32-bit floating - * point format. - * - * Attempting to modify a model once {@link ANeuralNetworksModel_finish} has - * been called will return an error. - * - * Available since API level 28. - * - * See {@link ANeuralNetworksModel} for information on multithreaded usage. - */ -inline int ANeuralNetworksModel_relaxComputationFloat32toFloat16( - ANeuralNetworksModel* model, bool allow) { - LOAD_FUNCTION(ANeuralNetworksModel_relaxComputationFloat32toFloat16); - EXECUTE_FUNCTION_RETURN(model, allow); -} - -/** - * Create a {@link ANeuralNetworksCompilation} to compile the given model. - * This only creates the object. Compilation is only performed once - * {@link ANeuralNetworksCompilation_start} is invoked. - * - * <p>The provided model must outlive the compilation.</p> - * - * The model must already have been finished by a call to - * {@link ANeuralNetworksModel_finish}. - * - * See {@link ANeuralNetworksCompilation} for information on multithreaded - * usage. - * - * @param model The {@link ANeuralNetworksModel} to be compiled. - * @param compilation The newly created object or NULL if unsuccessful. - * - * @return ANEURALNETWORKS_NO_ERROR if successful, ANEURALNETWORKS_BAD_DATA - * if the model is invalid. - */ -inline int ANeuralNetworksCompilation_create( - ANeuralNetworksModel* model, ANeuralNetworksCompilation** compilation) { - LOAD_FUNCTION(ANeuralNetworksCompilation_create); - EXECUTE_FUNCTION_RETURN(model, compilation); -} - -/** - * Destroy a compilation. - * - * <p>If called on a compilation for which - * {@link ANeuralNetworksCompilation_start} has been called, the - * function will return immediately but will mark the compilation to be deleted - * once the compilation completes. The {@link ANeuralNetworksCompilation_wait} - * will return ERROR_DELETED. - * - * See {@link ANeuralNetworksCompilation} for information on multithreaded - * usage. - * - * @param compilation The compilation to be destroyed. Passing NULL is - * acceptable and results in no operation. - */ -inline void ANeuralNetworksCompilation_free( - ANeuralNetworksCompilation* compilation) { - LOAD_FUNCTION(ANeuralNetworksCompilation_free); - EXECUTE_FUNCTION(compilation); -} - -/** - * Sets the execution preference. - * - * <p>Provides guidance to the runtime when trade-offs are possible.</p> - * - * See {@link ANeuralNetworksCompilation} for information on multithreaded - * usage. - * - * @param compilation The compilation to be modified. - * @param preference Either {@link PREFER_LOW_POWER}, - * {@link PREFER_SINGLE_FAST_ANSWER}, or - * {@link PREFER_SUSTAINED_SPEED}. - * - * @return ANEURALNETWORKS_NO_ERROR if successful. - */ -inline int ANeuralNetworksCompilation_setPreference( - ANeuralNetworksCompilation* compilation, int32_t preference) { - LOAD_FUNCTION(ANeuralNetworksCompilation_setPreference); - EXECUTE_FUNCTION_RETURN(compilation, preference); -} - -/** - * Waits until the compilation completes. - * - * More than one thread can wait on a compilation. When the compilation - * completes, all threads will be released. - * - * See {@link ANeuralNetworksCompilation} for information on multithreaded - * usage. 
- * - * @return ANEURALNETWORKS_NO_ERROR if the compilation completed normally. - */ -inline int ANeuralNetworksCompilation_finish( - ANeuralNetworksCompilation* compilation) { - LOAD_FUNCTION(ANeuralNetworksCompilation_finish); - EXECUTE_FUNCTION_RETURN(compilation); -} -/** - * Create a {@link ANeuralNetworksExecution} to apply the given compilation. - * This only creates the object. Computation is only performed once - * {@link ANeuralNetworksExecution_startCompute} is invoked. - * - * <p>The provided compilation must outlive the execution.</p> - * - * See {@link ANeuralNetworksExecution} for information on multithreaded usage. - * - * @param compilation The {@link ANeuralNetworksCompilation} to be evaluated. - * @param execution The newly created object or NULL if unsuccessful. - * - * @return ANEURALNETWORKS_NO_ERROR if successful, ANEURALNETWORKS_BAD_DATA - * if the compilation is invalid. - */ -inline int ANeuralNetworksExecution_create( - ANeuralNetworksCompilation* compilation, - ANeuralNetworksExecution** execution) { - LOAD_FUNCTION(ANeuralNetworksExecution_create); - EXECUTE_FUNCTION_RETURN(compilation, execution); -} - -/** - * Destroy an execution. - * - * <p>If called on an execution for which - * {@link ANeuralNetworksExecution_startCompute} has been called, the - * function will return immediately but will mark the execution to be deleted - * once the computation completes. The {link ANeuralNetworksExecution_wait} - * will return ANEURALNETWORKS_ERROR_DELETED. - * - * See {@link ANeuralNetworksExecution} for information on multithreaded usage. - * - * @param execution The execution to be destroyed. Passing NULL is acceptable - * and results in no operation. - */ -inline void ANeuralNetworksExecution_free(ANeuralNetworksExecution* execution) { - LOAD_FUNCTION(ANeuralNetworksExecution_free); - EXECUTE_FUNCTION(execution); -} - -/** - * Associate a user buffer with an input of the model of the - * {@link ANeuralNetworksExecution}. - * - * <p>The provided buffer must outlive the execution.</p> - * - * See {@link ANeuralNetworksExecution} for information on multithreaded usage. - * - * @param execution The execution to be modified. - * @param index The index of the input argument we are setting. It is - * an index into the lists passed to - * {@link ANeuralNetworksModel_identifyInputsAndOutputs}. It is not - * the index associated with {@link - * ANeuralNetworksModel_addOperand}. - * @param type The type of the operand. This should be used to specify the - * dimensions that were set to 0 when the operand was added to the - * model. All other properties of the type must be the same as - * specified in the model. If the type is the same as specified - * when the model was built, NULL can be passed. - * @param buffer The buffer containing the data. - * @param length The length in bytes of the buffer. - * - * @return ANEURALNETWORKS_NO_ERROR if successful, ANEURALNETWORKS_BAD_DATA if - * the name is not recognized or the buffer is too small for the input. - */ -inline int ANeuralNetworksExecution_setInput( - ANeuralNetworksExecution* execution, int32_t index, - const ANeuralNetworksOperandType* type, const void* buffer, size_t length) { - LOAD_FUNCTION(ANeuralNetworksExecution_setInput); - EXECUTE_FUNCTION_RETURN(execution, index, type, buffer, length); -} - -/** - * Associate part of a memory object with an input of the model of the - * {@link ANeuralNetworksExecution}. 
- * - * <p>The provided memory must outlive the execution.</p> - * - * See {@link ANeuralNetworksExecution} for information on multithreaded usage. - * - * @param execution The execution to be modified. - * @param index The index of the input argument we are setting. It is - * an index into the lists passed to - * {@link ANeuralNetworksModel_identifyInputsAndOutputs}. It is not - * the index associated with {@link - * ANeuralNetworksModel_addOperand}. - * @param type The type of the operand. This can be used to specify the - * dimensions that were set to 0 when the operand was added to the - * model. All other values must be the same as specified in the - * model. If the type is the same as specified when the model - * was built, NULL can be passed. - * @param memory The memory containing the data. - * @param offset This specifies the location of the data within the memory. - * The offset is in bytes from the start of memory. - * @param length The size in bytes of the data value. - * - * @return ANEURALNETWORKS_NO_ERROR if successful, ANEURALNETWORKS_BAD_DATA if - * the name is not recognized or the buffer is too small for the input. - */ -inline int ANeuralNetworksExecution_setInputFromMemory( - ANeuralNetworksExecution* execution, int32_t index, - const ANeuralNetworksOperandType* type, const ANeuralNetworksMemory* memory, - size_t offset, size_t length) { - LOAD_FUNCTION(ANeuralNetworksExecution_setInputFromMemory); - EXECUTE_FUNCTION_RETURN(execution, index, type, memory, offset, length); -} - -/** - * Associate a user buffer with an output of the model of the - * {@link ANeuralNetworksExecution}. - * - * <p>The provided buffer must outlive the execution.</p> - * - * See {@link ANeuralNetworksExecution} for information on multithreaded usage. - * - * @param execution The execution to be modified. - * @param index The index of the output argument we are setting. It is - * an index into the lists passed to - * {@link ANeuralNetworksModel_identifyInputsAndOutputs}. It is not - * the index associated with {@link - * ANeuralNetworksModel_addOperand}. - * @param type The type of the operand. This can be used to specify the - * dimensions that were set to 0 when the operand was added to the - * model. All other values must be the same as specified in the - * model. If the type is the same as specified when the model - * was built, NULL can be passed. - * @param buffer The buffer where the data is to be written. - * @param length The length in bytes of the buffer. - * - * @return ANEURALNETWORKS_NO_ERROR if successful, ANEURALNETWORKS_BAD_DATA if - * the name is not recognized or the buffer is too small for the output. - */ -inline int ANeuralNetworksExecution_setOutput( - ANeuralNetworksExecution* execution, int32_t index, - const ANeuralNetworksOperandType* type, void* buffer, size_t length) { - LOAD_FUNCTION(ANeuralNetworksExecution_setOutput); - EXECUTE_FUNCTION_RETURN(execution, index, type, buffer, length); -} - -/** - * Associate part of a memory object with an output of the model of the - * {@link ANeuralNetworksExecution}. - * - * <p>The provided memory must outlive the execution.</p> - * - * See {@link ANeuralNetworksExecution} for information on multithreaded usage. - * - * @param execution The execution to be modified. - * @param index The index of the output argument we are setting. It is - * an index into the lists passed to - * {@link ANeuralNetworksModel_identifyInputsAndOutputs}. It is not - * the index associated with {@link - * ANeuralNetworksModel_addOperand}. 
- * @param type The type of the operand. This can be used to specify the - * dimensions that were set to 0 when the operand was added to the - * model. All other values must be the same as specified in the - * model. If the type is the same as specified when the model - * was built, NULL can be passed. - * @param memory The memory where the data is to be stored. - * @param offset This specifies the location of the data within the memory. - * The offset is in bytes from the start of memory. - * @param length The length in bytes of the data value. - * - * @return ANEURALNETWORKS_NO_ERROR if successful, ANEURALNETWORKS_BAD_DATA if - * the name is not recognized or the buffer is too small for the output. - */ -inline int ANeuralNetworksExecution_setOutputFromMemory( - ANeuralNetworksExecution* execution, int32_t index, - const ANeuralNetworksOperandType* type, const ANeuralNetworksMemory* memory, - size_t offset, size_t length) { - LOAD_FUNCTION(ANeuralNetworksExecution_setOutputFromMemory); - EXECUTE_FUNCTION_RETURN(execution, index, type, memory, offset, length); -} - -/** - * Schedule evaluation of the execution. - * - * <p>Schedules evaluation of the execution. Once the model has been - * applied and the outputs are ready to be consumed, the execution will be - * signaled. Use {@link ANeuralNetworksExecution_wait} to wait for that signal. - * </p> - * - * Multiple executions can be scheduled and evaluated concurrently, and - * compilations can be performed concurrently with executions. The runtime makes - * no guarantee on the ordering of the completion of compilations and - * executions. If it's important to the application, the application should - * enforce the ordering by using {@link ANeuralNetworksCompilation_wait} and - * {@link ANeuralNetworksExecution_wait}. - * - * ANeuralNetworksExecution_wait must be called to recuperate the resources used - * by the execution. - * - * See {@link ANeuralNetworksExecution} for information on multithreaded usage. - * - * @param execution The execution to be scheduled and executed. - * - * @return ANEURALNETWORKS_NO_ERROR if successful. - */ -inline int ANeuralNetworksExecution_startCompute( - ANeuralNetworksExecution* execution, ANeuralNetworksEvent** event) { - LOAD_FUNCTION(ANeuralNetworksExecution_startCompute); - EXECUTE_FUNCTION_RETURN(execution, event); -} - -/** - * Waits until the execution completes. - * - * More than one thread can wait on an event. When the execution completes, - * all threads will be released. - * - * See {@link ANeuralNetworksExecution} for information on multithreaded usage. - * - * @return ANEURALNETWORKS_NO_ERROR if the execution completed normally. - */ -inline int ANeuralNetworksEvent_wait(ANeuralNetworksEvent* event) { - LOAD_FUNCTION(ANeuralNetworksEvent_wait); - EXECUTE_FUNCTION_RETURN(event); -} - -/** - * Destroys the event. - * - * See {@link ANeuralNetworksExecution} for information on multithreaded usage. 
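Completing the picture sketched earlier: once a model has been finished, compilation and execution both follow the same create / configure / finish lifecycle documented in these wrappers, and startCompute is asynchronous, so the returned event must be waited on before the output buffer is read. Another hedged illustration, again without error checking; `model` is assumed to be a finished four-element ADD model like the one sketched above.

    #include "NeuralNetworksShim.h"

    // Compile a finished model and run it once on caller-provided buffers.
    int run_once(ANeuralNetworksModel* model, const float* in0, const float* in1, float* out)
    {
      ANeuralNetworksCompilation* compilation = nullptr;
      ANeuralNetworksCompilation_create(model, &compilation);
      ANeuralNetworksCompilation_setPreference(compilation, ANEURALNETWORKS_PREFER_SUSTAINED_SPEED);
      ANeuralNetworksCompilation_finish(compilation);

      ANeuralNetworksExecution* execution = nullptr;
      ANeuralNetworksExecution_create(compilation, &execution);

      // Indices refer to the lists passed to identifyInputsAndOutputs, not to
      // operand indices; passing NULL for the type reuses the model's type.
      ANeuralNetworksExecution_setInput(execution, 0, nullptr, in0, 4 * sizeof(float));
      ANeuralNetworksExecution_setInput(execution, 1, nullptr, in1, 4 * sizeof(float));
      ANeuralNetworksExecution_setOutput(execution, 0, nullptr, out, 4 * sizeof(float));

      // Schedule the evaluation and block until the event is signaled.
      ANeuralNetworksEvent* event = nullptr;
      ANeuralNetworksExecution_startCompute(execution, &event);
      int rc = ANeuralNetworksEvent_wait(event);

      ANeuralNetworksEvent_free(event);
      ANeuralNetworksExecution_free(execution);
      ANeuralNetworksCompilation_free(compilation);
      return rc;
    }
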
- */ -inline void ANeuralNetworksEvent_free(ANeuralNetworksEvent* event) { - LOAD_FUNCTION(ANeuralNetworksEvent_free); - EXECUTE_FUNCTION(event); -} - -#endif // __NEURAL_NETWORKS_SHIM__ diff --git a/runtime/libs/nnapi/v1.2/CMakeLists.txt b/runtime/libs/nnapi/v1.2/CMakeLists.txt deleted file mode 100644 index 21ec3015f..000000000 --- a/runtime/libs/nnapi/v1.2/CMakeLists.txt +++ /dev/null @@ -1,4 +0,0 @@ -add_library(nnfw_lib_nnapi_1_2 INTERFACE) - -target_include_directories(nnfw_lib_nnapi_1_2 INTERFACE ${CMAKE_CURRENT_SOURCE_DIR}/include) -target_link_libraries(nnfw_lib_nnapi_1_2 INTERFACE nnfw-nnapi-header) diff --git a/runtime/nnapi-header/include/NeuralNetworks.h b/runtime/nnapi-header/include/NeuralNetworks.h index 7400806d8..0c54d7582 100644 --- a/runtime/nnapi-header/include/NeuralNetworks.h +++ b/runtime/nnapi-header/include/NeuralNetworks.h @@ -24,8 +24,8 @@ * @file NeuralNetworks.h */ -#ifndef ANDROID_ML_NN_RUNTIME_NEURAL_NETWORKS_H -#define ANDROID_ML_NN_RUNTIME_NEURAL_NETWORKS_H +#ifndef ANDROID_FRAMEWORKS_ML_NN_RUNTIME_NEURAL_NETWORKS_H +#define ANDROID_FRAMEWORKS_ML_NN_RUNTIME_NEURAL_NETWORKS_H /****************************************************************** * @@ -43,16 +43,14 @@ * - DO NOT CHANGE THE LAYOUT OR SIZE OF STRUCTURES */ -// For compatibility with android, check __ANDROID_API__ is defined -// If __ANDROID_API__ is pre-defined, this header may be used for android -#ifndef __ANDROID_API__ -#define __ANDROID_API__ 29 -#define __ANDROID_API_Q__ 29 +// For compatibility with android, check __ANDROID__ is defined +#ifndef __ANDROID__ +#define __ANDROID_API__ 30 #define __INTRODUCED_IN(api_level) typedef struct AHardwareBuffer AHardwareBuffer; #else #include <android/hardware_buffer.h> -#endif // __ANDROID_API__ +#endif // __ANDROID__ #include <stddef.h> #include <stdint.h> #include <sys/cdefs.h> @@ -62,7 +60,11 @@ __BEGIN_DECLS /** * Operand types. * - * The type of operands that can be added to a model. + * The type of an operand in a model. + * + * Types prefaced with ANEURALNETWORKS_TENSOR_* must be used for tensor data (i.e., tensors + * with at least one dimension). Types not prefaced by ANEURALNETWORKS_TENSOR_* represent + * scalar values and must have no dimensions. * * Although we define many types, most operators accept just a few * types. Most used are {@link ANEURALNETWORKS_TENSOR_FLOAT32}, @@ -94,7 +96,6 @@ typedef enum { * real_value = (integer_value - zeroPoint) * scale. */ ANEURALNETWORKS_TENSOR_QUANT8_ASYMM = 5, -#if __ANDROID_API__ >= __ANDROID_API_Q__ /** * An 8 bit boolean scalar value. * @@ -160,7 +161,6 @@ typedef enum { * Available since API level 29. */ ANEURALNETWORKS_TENSOR_QUANT8_SYMM_PER_CHANNEL = 11, - /** * A tensor of 16 bit unsigned integers that represent real numbers. * @@ -175,7 +175,6 @@ typedef enum { * Available since API level 29. */ ANEURALNETWORKS_TENSOR_QUANT16_ASYMM = 12, - /** * A tensor of 8 bit signed integers that represent real numbers. * @@ -188,14 +187,36 @@ typedef enum { * Available since API level 29. */ ANEURALNETWORKS_TENSOR_QUANT8_SYMM = 13, -#endif // __ANDROID_API__ >= __ANDROID_API_Q__ + /** + * A tensor of 8 bit signed integers that represent real numbers. + * + * Attached to this tensor are two numbers that can be used to convert the + * 8 bit integer to the real value and vice versa. These two numbers are: + * - scale: a 32 bit floating point value greater than zero. + * - zeroPoint: a 32 bit integer, in range [-128, 127]. + * + * The formula is: + * real_value = (integer_value - zeroPoint) * scale. 
+ * + * Available since API level 30. + */ + ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED = 14, + /** + * A reference to a model. + * + * {@link ANeuralNetworksModel_setOperandValueFromModel} must be used to set + * the value for an Operand of this type. + * + * Available since API level 30. + */ + ANEURALNETWORKS_MODEL = 15, } OperandCode; /** * Operation types. * - * The type of operations that can be added to a model. + * The type of an operation in a model. * * Available since API level 27. */ @@ -231,6 +252,8 @@ typedef enum { * * {@link ANEURALNETWORKS_TENSOR_FLOAT16} (since API level 29) * * {@link ANEURALNETWORKS_TENSOR_FLOAT32} * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} + * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} (since API level 30) + * * {@link ANEURALNETWORKS_TENSOR_INT32} (since API level 30) * * Supported tensor rank: up to 4 * @@ -238,15 +261,19 @@ typedef enum { * * 0: A tensor. * * 1: A tensor of the same {@link OperandCode}, and compatible dimensions * as input0. - * For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} tensor, + * For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} and + * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} tensor, * the scales and zeroPoint can be different from input0 scale and zeroPoint. * * 2: An {@link ANEURALNETWORKS_INT32} scalar, and has to be one of the * {@link FuseCode} values. Specifies the activation to * invoke on the result. + * For a {@link ANEURALNETWORKS_TENSOR_INT32} tensor, + * the {@link FuseCode} must be "NONE". * * Outputs: * * 0: The sum, a tensor of the same {@link OperandCode} as input0. - * For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} tensor, + * For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} and + * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} tensor, * the scale and zeroPoint can be different from inputs' scale and zeroPoint. * * Available since API level 27. @@ -270,18 +297,20 @@ typedef enum { * * {@link ANEURALNETWORKS_TENSOR_FLOAT16} (since API level 29) * * {@link ANEURALNETWORKS_TENSOR_FLOAT32} * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} + * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} (since API level 30) * * Supported tensor rank: 4, with "NHWC" or "NCHW" data layout. * With the default data layout NHWC, the data is stored in the order of: * [batch, height, width, channels]. Alternatively, the data layout could * be NCHW, the data storage order of: [batch, channels, height, width]. + * NCHW is supported since API level 29. * * Both explicit padding and implicit padding are supported. * * Inputs (explicit padding): * * 0: A 4-D tensor, of shape [batches, height, width, depth], specifying - * the input. Since API level 29, zero batches is supported for this - * tensor. + * the input. + * Since API level 29, zero batches is supported for this tensor. * * 1: An {@link ANEURALNETWORKS_INT32} scalar, specifying the padding on * the left, in the ‘width’ dimension. * * 2: An {@link ANEURALNETWORKS_INT32} scalar, specifying the padding on @@ -307,8 +336,8 @@ typedef enum { * * Inputs (implicit padding): * * 0: A 4-D tensor, of shape [batches, height, width, depth], specifying - * the input. Since API level 29, zero batches is supported for this - * tensor. + * the input. + * Since API level 29, zero batches is supported for this tensor. * * 1: An {@link ANEURALNETWORKS_INT32} scalar, specifying the implicit * padding scheme, has to be one of the * {@link PaddingCode} values. 
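Both asymmetric 8-bit operand types above use the same affine mapping, real_value = (integer_value - zeroPoint) * scale; the new signed variant only changes the storage type and the zeroPoint range. A small self-contained sketch of the conversion in both directions follows; the rounding and clamping policy is ours, not prescribed by the header.

    #include <algorithm>
    #include <cmath>
    #include <cstdint>
    #include <cstdio>
    #include <limits>

    // Dequantize: valid for QUANT8_ASYMM (uint8_t) and QUANT8_ASYMM_SIGNED (int8_t).
    template <typename Q> float dequantize(Q q, float scale, int32_t zeroPoint)
    {
      return (static_cast<int32_t>(q) - zeroPoint) * scale;
    }

    // Quantize: round to nearest, then clamp to the representable range of Q.
    template <typename Q> Q quantize(float real, float scale, int32_t zeroPoint)
    {
      int32_t q = static_cast<int32_t>(std::lround(real / scale)) + zeroPoint;
      int32_t lo = std::numeric_limits<Q>::min();
      int32_t hi = std::numeric_limits<Q>::max();
      return static_cast<Q>(std::min(std::max(q, lo), hi));
    }

    int main()
    {
      const float scale = 0.5f;
      // 3.2 round-trips to the nearest representable value (3.0 for scale 0.5)
      // in either encoding.
      std::printf("%f\n", dequantize<uint8_t>(quantize<uint8_t>(3.2f, scale, 128), scale, 128));
      std::printf("%f\n", dequantize<int8_t>(quantize<int8_t>(3.2f, scale, 0), scale, 0));
      return 0;
    }
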
@@ -330,7 +359,8 @@ typedef enum { * Outputs: * * 0: The output 4-D tensor, of shape * [batches, out_height, out_width, depth]. - * For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} tensor, + * For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} and + * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} tensor, * the scale and zeroPoint must be the same as input0. * * Available since API level 27. @@ -346,8 +376,9 @@ typedef enum { * Supported tensor {@link OperandCode}: * * {@link ANEURALNETWORKS_TENSOR_FLOAT16} (since API level 29) * * {@link ANEURALNETWORKS_TENSOR_FLOAT32} - * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} (full support since API - * level 29, see the input section) + * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} + * (full support since API level 29, see the input section) + * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} (since API level 30) * * Supported tensor rank: up to 4 * @@ -357,6 +388,9 @@ typedef enum { * Before API level 29, all input tensors of * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} * must have the same scale and zeroPoint as the output tensor. + * Input tensors of + * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} + * are allowed to have different scale and zeroPoint. * Since API level 29, zero-sized tensors are supported. * * n: An {@link ANEURALNETWORKS_INT32} scalar, specifying the * concatenation axis. @@ -373,7 +407,7 @@ typedef enum { ANEURALNETWORKS_CONCATENATION = 2, /** - * Performs an 2-D convolution operation. + * Performs a 2-D convolution operation. * * The CONV_2D op sweeps a 2-D filter that can mix channels together over a * batch of images, applying the filter to each window of each image of the @@ -409,31 +443,46 @@ typedef enum { * * * {@link ANEURALNETWORKS_TENSOR_INT32} for bias (scale set to 0.0, * * * each value scaling is separate and equal to input.scale * filter.scales[channel]). * + * Available since API level 30: + * * Quantized signed (since API level 30): + * * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} for input, filter, and output. + * * * {@link ANEURALNETWORKS_TENSOR_INT32} for bias (with scale set to + * * * input.scale * filter.scale). + * + * * Quantized signed with filter symmetric per channel quantization (since API level 30): + * * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} for input, and output. + * * * {@link ANEURALNETWORKS_TENSOR_QUANT8_SYMM_PER_CHANNEL} for filter. + * * * {@link ANEURALNETWORKS_TENSOR_INT32} for bias (scale set to 0.0, + * * * each value scaling is separate and equal to input.scale * filter.scales[channel]). + * * Supported tensor rank: 4, with "NHWC" or "NCHW" data layout. * With the default data layout NHWC, the data is stored in the order of: * [batch, height, width, channels]. Alternatively, the data layout could * be NCHW, the data storage order of: [batch, channels, height, width]. + * NCHW is supported since API level 29. * * Both explicit padding and implicit padding are supported. * * Inputs (explicit padding): * * 0: A 4-D tensor, of shape [batches, height, width, depth_in], - * specifying the input. Since API level 29, zero batches is supported - * for this tensor. + * specifying the input. + * Since API level 29, zero batches is supported for this tensor. * * 1: A 4-D tensor, of shape * [depth_out, filter_height, filter_width, depth_in], specifying the - * filter. For tensor of type - * {@link ANEURALNETWORKS_TENSOR_QUANT8_SYMM_PER_CHANNEL} the channel - * dimension (extraParams.channelQuant.channelDim) must be set to 0. + * filter. 
+ * For tensor of type {@link ANEURALNETWORKS_TENSOR_QUANT8_SYMM_PER_CHANNEL} + * the channel dimension (ANeuralNetworksSymmPerChannelQuantParams::channelDim) + * must be set to 0. * * 2: A 1-D tensor, of shape [depth_out], specifying the bias. For input - * tensor of type {@link ANEURALNETWORKS_TENSOR_FLOAT32} or - * {@link ANEURALNETWORKS_TENSOR_FLOAT16}, the bias must be of the same - * type. For filter tensor of {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM}, + * tensor of type {@link ANEURALNETWORKS_TENSOR_FLOAT32} + * or {@link ANEURALNETWORKS_TENSOR_FLOAT16} the bias must be of the same type. + * For filter tensor of {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} + * and {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED}, * the bias should be of {@link ANEURALNETWORKS_TENSOR_INT32}, with zeroPoint - * of 0 and bias_scale == input_scale * filter_scale. For filter tensor - * of {@link ANEURALNETWORKS_TENSOR_QUANT8_SYMM_PER_CHANNEL}, the bias - * should be of {@link ANEURALNETWORKS_TENSOR_INT32}, with zeroPoint of - * 0 and bias_scale of 0. The actual scale of each value 'i' is equal to + * of 0 and bias_scale == input_scale * filter_scale. + * For filter tensor of {@link ANEURALNETWORKS_TENSOR_QUANT8_SYMM_PER_CHANNEL}, + * the bias should be of {@link ANEURALNETWORKS_TENSOR_INT32}, with zeroPoint of 0 + * and bias_scale of 0. The actual scale of each value 'i' is equal to * bias_scale[i] = input_scale * filter_scale[i]. * * 3: An {@link ANEURALNETWORKS_INT32} scalar, specifying the padding on * the left, in the ‘width’ dimension. @@ -466,22 +515,25 @@ typedef enum { * * Inputs (implicit padding): * * 0: A 4-D tensor, of shape [batches, height, width, depth_in], - * specifying the input. Since API level 29, zero batches is supported - * for this tensor. + * specifying the input. + * Since API level 29, zero batches is supported for this tensor. * * 1: A 4-D tensor, of shape * [depth_out, filter_height, filter_width, depth_in], specifying the - * filter. For tensor of type - * {@link ANEURALNETWORKS_TENSOR_QUANT8_SYMM_PER_CHANNEL} the channel - * dimension (extraParams.channelQuant.channelDim) must be set to 0. + * filter. + * For tensor of type {@link ANEURALNETWORKS_TENSOR_QUANT8_SYMM_PER_CHANNEL} + * the channel dimension (ANeuralNetworksSymmPerChannelQuantParams::channelDim) + * must be set to 0. * * 2: A 1-D tensor, of shape [depth_out], specifying the bias. For input - * tensor of type {@link ANEURALNETWORKS_TENSOR_FLOAT32} or - * {@link ANEURALNETWORKS_TENSOR_FLOAT16}, the bias must be of the same - * type. For filter tensor of {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM}, + * tensor of type {@link ANEURALNETWORKS_TENSOR_FLOAT32} + * or {@link ANEURALNETWORKS_TENSOR_FLOAT16} the bias must be of the same + * type. + * For filter tensor of {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} + * and {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED}, * the bias should be of {@link ANEURALNETWORKS_TENSOR_INT32}, with zeroPoint - * of 0 and bias_scale == input_scale * filter_scale. For filter tensor - * of {@link ANEURALNETWORKS_TENSOR_QUANT8_SYMM_PER_CHANNEL}, the bias - * should be of {@link ANEURALNETWORKS_TENSOR_INT32}, with zeroPoint of - * 0 and bias_scale of 0. The actual scale of each value 'i' is equal to + * of 0 and bias_scale == input_scale * filter_scale. + * For filter tensor of {@link ANEURALNETWORKS_TENSOR_QUANT8_SYMM_PER_CHANNEL}, + * the bias should be of {@link ANEURALNETWORKS_TENSOR_INT32}, with zeroPoint of 0 + * and bias_scale of 0. 
The actual scale of each value 'i' is equal to * bias_scale[i] = input_scale * filter_scale[i]. * * 3: An {@link ANEURALNETWORKS_INT32} scalar, specifying the implicit * padding scheme, has to be one of the @@ -509,10 +561,9 @@ typedef enum { * * Outputs: * * 0: The output 4-D tensor, of shape - * [batches, out_height, out_width, depth_out]. Before API level 29, - * for output tensor of {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM}, - * the following condition must be satisfied: - * output_scale > input_scale * filter_scale + * [batches, out_height, out_width, depth_out]. + * Before API level 29, for output tensor of {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM}, + * the following condition must be satisfied: output_scale > input_scale * filter_scale * * Available since API level 27. */ @@ -559,10 +610,23 @@ typedef enum { * * * {@link ANEURALNETWORKS_TENSOR_INT32} for bias (scale set to 0.0, * * * each value scaling is separate and equal to input.scale * filter.scales[channel]). * + * Available since API level 30: + * * Quantized signed (since API level 30): + * * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} for input, filter, and output. + * * * {@link ANEURALNETWORKS_TENSOR_INT32} for bias (with scale set to + * * * input.scale * filter.scale). + * + * * Quantized signed with filter symmetric per channel quantization (since API level 30): + * * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} for input, and output. + * * * {@link ANEURALNETWORKS_TENSOR_QUANT8_SYMM_PER_CHANNEL} for filter. + * * * {@link ANEURALNETWORKS_TENSOR_INT32} for bias (scale set to 0.0, + * * * each value scaling is separate and equal to input.scale * filter.scales[channel]). + * * Supported tensor rank: 4, with "NHWC" or "NCHW" data layout. * With the default data layout NHWC, the data is stored in the order of: * [batch, height, width, channels]. Alternatively, the data layout could * be NCHW, the data storage order of: [batch, channels, height, width]. + * NCHW is supported since API level 29. * * Both explicit padding and implicit padding are supported. * @@ -570,18 +634,20 @@ typedef enum { * * 0: A 4-D tensor, of shape [batches, height, width, depth_in], * specifying the input. * * 1: A 4-D tensor, of shape [1, filter_height, filter_width, depth_out], - * specifying the filter. For tensor of type - * {@link ANEURALNETWORKS_TENSOR_QUANT8_SYMM_PER_CHANNEL} the channel - * dimension (extraParams.channelQuant.channelDim) must be set to 3. + * specifying the filter. + * For tensor of type {@link ANEURALNETWORKS_TENSOR_QUANT8_SYMM_PER_CHANNEL} + * the channel dimension (ANeuralNetworksSymmPerChannelQuantParams::channelDim) + * must be set to 3. * * 2: A 1-D tensor, of shape [depth_out], specifying the bias. For input - * tensor of type {@link ANEURALNETWORKS_TENSOR_FLOAT32} or - * {@link ANEURALNETWORKS_TENSOR_FLOAT16}, the bias must be of the same - * type. For filter tensor of {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM}, + * tensor of type {@link ANEURALNETWORKS_TENSOR_FLOAT32} + * or {@link ANEURALNETWORKS_TENSOR_FLOAT16} the bias must be of the same type. + * For filter tensor of {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} + * and {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED}, * the bias should be of {@link ANEURALNETWORKS_TENSOR_INT32}, with zeroPoint - * of 0 and bias_scale == input_scale * filter_scale. For filter tensor - * of {@link ANEURALNETWORKS_TENSOR_QUANT8_SYMM_PER_CHANNEL}, the bias - * should be of {@link ANEURALNETWORKS_TENSOR_INT32}, with zeroPoint of - * 0 and bias_scale of 0. 
The actual scale of each value 'i' is equal to + * of 0 and bias_scale == input_scale * filter_scale. + * For filter tensor of {@link ANEURALNETWORKS_TENSOR_QUANT8_SYMM_PER_CHANNEL}, + * the bias should be of {@link ANEURALNETWORKS_TENSOR_INT32}, with zeroPoint of 0 + * and bias_scale of 0. The actual scale of each value 'i' is equal to * bias_scale[i] = input_scale * filter_scale[i]. * * 3: An {@link ANEURALNETWORKS_INT32} scalar, specifying the padding on * the left, in the ‘width’ dimension. @@ -620,14 +686,15 @@ typedef enum { * * 1: A 4-D tensor, of shape [1, filter_height, filter_width, depth_out], * specifying the filter. * * 2: A 1-D tensor, of shape [depth_out], specifying the bias. For input - * tensor of type {@link ANEURALNETWORKS_TENSOR_FLOAT32} or - * {@link ANEURALNETWORKS_TENSOR_FLOAT16}, the bias must be of the same - * type. For filter tensor of {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM}, + * tensor of type {@link ANEURALNETWORKS_TENSOR_FLOAT32} + * or {@link ANEURALNETWORKS_TENSOR_FLOAT16} the bias must be of the same type. + * For filter tensor of {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} + * and {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED}, * the bias should be of {@link ANEURALNETWORKS_TENSOR_INT32}, with zeroPoint - * of 0 and bias_scale == input_scale * filter_scale. For filter tensor - * of {@link ANEURALNETWORKS_TENSOR_QUANT8_SYMM_PER_CHANNEL}, the bias - * should be of {@link ANEURALNETWORKS_TENSOR_INT32}, with zeroPoint of - * 0 and bias_scale of 0. The actual scale of each value 'i' is equal to + * of 0 and bias_scale == input_scale * filter_scale. + * For filter tensor of {@link ANEURALNETWORKS_TENSOR_QUANT8_SYMM_PER_CHANNEL}, + * the bias should be of {@link ANEURALNETWORKS_TENSOR_INT32}, with zeroPoint of 0 + * and bias_scale of 0. The actual scale of each value 'i' is equal to * bias_scale[i] = input_scale * filter_scale[i]. * * 3: An {@link ANEURALNETWORKS_INT32} scalar, specifying the implicit * padding scheme, has to be one of the @@ -654,12 +721,11 @@ typedef enum { * cells between each filter element on height dimension. If this input is set, * input 9 (dilation factor for width) must be specified as well. * Available since API level 29. - * * Outputs: * * 0: The output 4-D tensor, of shape - * [batches, out_height, out_width, depth_out]. Before API level 29, - * for output tensor of {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM}, + * [batches, out_height, out_width, depth_out]. Before API level 29, for + * output tensor of {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM}, * the following condition must be satisfied: * output_scale > input_scale * filter_scale * @@ -686,11 +752,13 @@ typedef enum { * * {@link ANEURALNETWORKS_TENSOR_FLOAT16} (since API level 29) * * {@link ANEURALNETWORKS_TENSOR_FLOAT32} * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} + * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} (since API level 30) * * Supported tensor rank: 4, with "NHWC" or "NCHW" data layout. * With the default data layout NHWC, the data is stored in the order of: * [batch, height, width, channels]. Alternatively, the data layout could * be NCHW, the data storage order of: [batch, channels, height, width]. + * NCHW is supported since API level 29. * * Inputs: * * 0: A 4-D tensor, of shape [batches, height, width, depth_in], @@ -705,7 +773,8 @@ typedef enum { * Outputs: * * 0: The output 4-D tensor, of shape [batch, height*block_size, * width*block_size, depth/(block_size*block_size)]. 
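One rule recurs across the convolution variants documented above: when the filter uses symmetric per-channel quantization, the INT32 bias must use zeroPoint 0 and bias_scale[i] = input_scale * filter_scale[i] for every output channel i. A trivial helper expressing that relation (the names are ours, purely illustrative):

    #include <cstdio>
    #include <vector>

    // Per-channel bias scales for a QUANT8_SYMM_PER_CHANNEL filter:
    // bias_scale[i] = input_scale * filter_scale[i]; the bias zeroPoint is 0.
    std::vector<float> biasScales(float inputScale, const std::vector<float>& filterScales)
    {
      std::vector<float> scales;
      scales.reserve(filterScales.size());
      for (float fs : filterScales)
        scales.push_back(inputScale * fs);
      return scales;
    }

    int main()
    {
      for (float s : biasScales(0.5f, {0.1f, 0.2f, 0.4f}))
        std::printf("%f\n", s); // 0.05, 0.1, 0.2
      return 0;
    }
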
- * For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} tensor, + * For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} and + * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} tensor, * the scale and zeroPoint must be the same as input0. * * Available since API level 27. @@ -723,6 +792,7 @@ typedef enum { * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} * * {@link ANEURALNETWORKS_TENSOR_QUANT8_SYMM} (since API level 29) * * {@link ANEURALNETWORKS_TENSOR_QUANT8_SYMM_PER_CHANNEL} (since API level 29) + * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} (since API level 30) * * Supported output tensor {@link OperandCode}: * * {@link ANEURALNETWORKS_TENSOR_FLOAT16} (since API level 29) @@ -731,7 +801,8 @@ typedef enum { * Supported tensor rank: up to 4 * * Inputs: - * * 0: A tensor. Since API level 29, this tensor may be zero-sized. + * * 0: A tensor. + * Since API level 29, this tensor may be zero-sized. * * Outputs: * * 0: A tensor with the same shape as input0. @@ -761,9 +832,11 @@ typedef enum { * and an error must be reported. * * Supported value tensor {@link OperandCode}: + * * {@link ANEURALNETWORKS_TENSOR_FLOAT16} (since API level 30) * * {@link ANEURALNETWORKS_TENSOR_FLOAT32} - * * {@link ANEURALNETWORKS_TENSOR_INT32} - * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} + * * {@link ANEURALNETWORKS_TENSOR_INT32} (since API level 29) + * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} (since API level 29) + * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} (since API level 30) * * Supported value tensor rank: from 2 * @@ -777,7 +850,8 @@ typedef enum { * * 0: A n-D tensor with the same rank and shape as the Values * tensor, except for the first dimension which has the same size * as Lookups' only dimension. - * For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} tensor, + * For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} and + * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} tensor, * the scale and zeroPoint must be the same as input1. * * Available since API level 27. @@ -816,6 +890,7 @@ typedef enum { * * {@link ANEURALNETWORKS_TENSOR_FLOAT16} (since API level 29) * * {@link ANEURALNETWORKS_TENSOR_FLOAT32} * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} + * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} (since API level 30) * * Supported tensor rank: up to 4. * @@ -826,26 +901,26 @@ typedef enum { * [batch_size, input_size], where "input_size" corresponds to the * number of inputs to the layer, matching the second dimension of * weights, and "batch_size" is calculated by dividing the number of - * elements by "input_size". Since API level 29, zero batch_size is - * supported for this tensor. + * elements by "input_size". + * Since API level 29, zero batch_size is supported for this tensor. * * 1: A 2-D tensor, specifying the weights, of shape * [num_units, input_size], where "num_units" corresponds to the number * of output nodes. * * 2: A 1-D tensor, of shape [num_units], specifying the bias. For input * tensor of {@link ANEURALNETWORKS_TENSOR_FLOAT32}, the bias should - * also be of {@link ANEURALNETWORKS_TENSOR_FLOAT32}. For input tensor - * of {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM}, the bias should be - * of {@link ANEURALNETWORKS_TENSOR_INT32}, with zeroPoint of 0 and - * bias_scale == input_scale * filter_scale. + * also be of {@link ANEURALNETWORKS_TENSOR_FLOAT32}. 
+ * For input tensor of {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} + * and {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED}, + * the bias should be of {@link ANEURALNETWORKS_TENSOR_INT32}, + * with zeroPoint of 0 and bias_scale == input_scale * filter_scale. * * 3: An {@link ANEURALNETWORKS_INT32} scalar, and has to be one of the * {@link FuseCode} values. Specifies the activation to * invoke on the result. * * Outputs: - * * 0: The output tensor, of shape [batch_size, num_units]. Before API - * level 29, for output tensor of {@link - * ANEURALNETWORKS_TENSOR_QUANT8_ASYMM}, the following condition must - * be satisfied: output_scale > input_scale * filter_scale. + * * 0: The output tensor, of shape [batch_size, num_units]. Before API level 29, for + * output tensor of {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM}, the following + * condition must be satisfied: output_scale > input_scale * filter_scale. * * Available since API level 27. */ @@ -911,7 +986,7 @@ typedef enum { ANEURALNETWORKS_HASHTABLE_LOOKUP = 10, /** - * Applies L2 normalization along the depth dimension. + * Applies L2 normalization along the axis dimension. * * The values in the output tensor are computed as: * @@ -919,13 +994,13 @@ typedef enum { * input[batch, row, col, channel] / * sqrt(sum_{c} pow(input[batch, row, col, c], 2)) * - * For input tensor with rank less than 4, independently normalizes each - * 1-D slice along dimension dim. + * By default the axis dimension is the last dimension of the input tensor. * * Supported tensor {@link OperandCode}: * * {@link ANEURALNETWORKS_TENSOR_FLOAT16} (since API level 29) * * {@link ANEURALNETWORKS_TENSOR_FLOAT32} * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} (since API level 29) + * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} (since API level 30) * * Supported tensor rank: up to 4 * Tensors with rank less than 4 are only supported since API level 29. @@ -942,6 +1017,12 @@ typedef enum { * * 0: A tensor of the same {@link OperandCode} and same shape as input0. * For {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM}, * the scale must be 1.f / 128 and the zeroPoint must be 128. + * For {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED}, + * the scale must be 1.f / 128 and the zeroPoint must be 0. + * + * NOTE: Before API level 30, if the elements along an axis are all zeros, + * the result is undefined. Since API level 30, if the elements along an axis + * are all zeros, the result is logical zero. * * Available since API level 27. */ @@ -967,13 +1048,14 @@ typedef enum { * With the default data layout NHWC, the data is stored in the order of: * [batch, height, width, channels]. Alternatively, the data layout could * be NCHW, the data storage order of: [batch, channels, height, width]. + * NCHW is supported since API level 29. * * Both explicit padding and implicit padding are supported. * * Inputs (explicit padding): * * 0: A 4-D tensor, of shape [batches, height, width, depth], specifying - * the input. Since API level 29, zero batches is supported for this - * tensor. + * the input. + * Since API level 29, zero batches is supported for this tensor. * * 1: An {@link ANEURALNETWORKS_INT32} scalar, specifying the padding on * the left, in the ‘width’ dimension. * * 2: An {@link ANEURALNETWORKS_INT32} scalar, specifying the padding on @@ -999,8 +1081,8 @@ typedef enum { * * Inputs (implicit padding): * * 0: A 4-D tensor, of shape [batches, height, width, depth], specifying - * the input. Since API level 29, zero batches is supported for this - * tensor. + * the input. 
+ * Since API level 29, zero batches is supported for this tensor. * * 1: An {@link ANEURALNETWORKS_INT32} scalar, specifying the implicit * padding scheme, has to be one of the * {@link PaddingCode} values. @@ -1095,17 +1177,20 @@ typedef enum { * * {@link ANEURALNETWORKS_TENSOR_FLOAT16} (since API level 29) * * {@link ANEURALNETWORKS_TENSOR_FLOAT32} * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} + * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} (since API level 30) * * Supported tensor rank: up to 4. * * Inputs: - * * 0: A tensor, specifying the input. Since API level 29, this tensor may - * be zero-sized. + * * 0: A tensor, specifying the input. + * Since API level 29, this tensor may be zero-sized. * * Outputs: * * 0: The output tensor of same shape as input0. * For {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM}, * the scale must be 1.f / 256 and the zeroPoint must be 0. + * For {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED}, + * the scale must be 1.f / 256 and the zeroPoint must be -128. * * Available since API level 27. */ @@ -1158,7 +1243,7 @@ typedef enum { * Outputs: * * 0: If the projection type is Sparse: * Output.Dim == { Tensor[0].Dim[0] } - * A tensor of int32 that represents hash signatures, + * A tensor of int32 that represents hash signatures. * * If the projection type is Dense: * Output.Dim == { Tensor[0].Dim[0] * Tensor[0].Dim[1] } @@ -1248,7 +1333,7 @@ typedef enum { * * The projection bias (\f$b_{proj}\f$) may (but not required to) have a * value if the recurrent projection layer exists, and should otherwise * have no value. - * * (API level >= 29) The four layer normalization weights either all have + * * (API level 29 or later) The four layer normalization weights either all have * values or none of them have values. Additionally, if CIFG is used, * input layer normalization weights tensor is omitted and the other layer * normalization weights either all have values or none of them have @@ -1406,18 +1491,20 @@ typedef enum { * * {@link ANEURALNETWORKS_TENSOR_FLOAT16} (since API level 29) * * {@link ANEURALNETWORKS_TENSOR_FLOAT32} * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} + * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} (since API level 30) * * Supported tensor rank: 4, with "NHWC" or "NCHW" data layout. * With the default data layout NHWC, the data is stored in the order of: * [batch, height, width, channels]. Alternatively, the data layout could * be NCHW, the data storage order of: [batch, channels, height, width]. + * NCHW is supported since API level 29. * * Both explicit padding and implicit padding are supported. * * Inputs (explicit padding): * * 0: A 4-D tensor, of shape [batches, height, width, depth], specifying - * the input. Since API level 29, zero batches is supported for this - * tensor. + * the input. + * Since API level 29, zero batches is supported for this tensor. * * 1: An {@link ANEURALNETWORKS_INT32} scalar, specifying the padding on * the left, in the ‘width’ dimension. * * 2: An {@link ANEURALNETWORKS_INT32} scalar, specifying the padding on @@ -1443,8 +1530,8 @@ typedef enum { * * Inputs (implicit padding): * * 0: A 4-D tensor, of shape [batches, height, width, depth], specifying - * the input. Since API level 29, zero batches is supported for this - * tensor. + * the input. + * Since API level 29, zero batches is supported for this tensor. * * 1: An {@link ANEURALNETWORKS_INT32} scalar, specifying the implicit * padding scheme, has to be one of the * {@link PaddingCode} values. 
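The fixed output quantization just described (scale of 1.f / 256 with zeroPoint 0 for ANEURALNETWORKS_TENSOR_QUANT8_ASYMM, or zeroPoint -128 for ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED) follows from the affine rule real = scale * (quantized - zeroPoint): both choices map the full 8-bit range onto [0, 255/256], which is what a sigmoid-style output needs. A minimal standalone check, assuming nothing beyond that rule (the helper name dequant is illustrative):

/* Verifies that both fixed quantizations span the same real range [0, ~0.996]. */
#include <stdio.h>

static float dequant(int q, float scale, int zero_point)
{
    return scale * (float)(q - zero_point);   /* affine dequantization */
}

int main(void)
{
    /* ANEURALNETWORKS_TENSOR_QUANT8_ASYMM: uint8 in [0, 255], zeroPoint 0 */
    printf("asymm        : %f .. %f\n", dequant(0, 1.0f / 256, 0),
           dequant(255, 1.0f / 256, 0));
    /* ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED: int8 in [-128, 127], zeroPoint -128 */
    printf("asymm_signed : %f .. %f\n", dequant(-128, 1.0f / 256, -128),
           dequant(127, 1.0f / 256, -128));
    return 0;
}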
@@ -1466,7 +1553,8 @@ typedef enum { * Outputs: * * 0: The output 4-D tensor, of shape * [batches, out_height, out_width, depth]. - * For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} tensor, + * For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} and + * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} tensor, * the scale and zeroPoint must be the same as input0. * * Available since API level 27. @@ -1496,6 +1584,8 @@ typedef enum { * * {@link ANEURALNETWORKS_TENSOR_FLOAT16} (since API level 29) * * {@link ANEURALNETWORKS_TENSOR_FLOAT32} * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} + * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} (since API level 30) + * * {@link ANEURALNETWORKS_TENSOR_INT32} (since API level 30) * * Supported tensor rank: up to 4 * @@ -1506,10 +1596,13 @@ typedef enum { * * 2: An {@link ANEURALNETWORKS_INT32} scalar, and has to be one of the * {@link FuseCode} values. Specifies the activation to * invoke on the result. + * For a {@link ANEURALNETWORKS_TENSOR_INT32} tensor, + * the {@link FuseCode} must be "NONE". * * Outputs: * * 0: The product, a tensor of the same {@link OperandCode} as input0. - * For output tensor of {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM}, + * For output tensor of {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} + * and {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED}, * the following condition must be satisfied: * output_scale > input1_scale * input2_scale. * @@ -1528,16 +1621,18 @@ typedef enum { * * {@link ANEURALNETWORKS_TENSOR_FLOAT16} (since API level 29) * * {@link ANEURALNETWORKS_TENSOR_FLOAT32} * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} + * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} (since API level 30) * * Supported tensor rank: up to 4. * * Inputs: - * * 0: A tensor, specifying the input. Since API level 29, this tensor may - * be zero-sized. + * * 0: A tensor, specifying the input. + * Since API level 29, this tensor may be zero-sized. * * Outputs: * * 0: The output tensor of same shape as input0. - * For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} tensor, + * For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} and + * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} tensor, * the scale and zeroPoint must be the same as input0. * * Available since API level 27. @@ -1555,16 +1650,18 @@ typedef enum { * * {@link ANEURALNETWORKS_TENSOR_FLOAT16} (since API level 29) * * {@link ANEURALNETWORKS_TENSOR_FLOAT32} * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} + * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} (since API level 30) * * Supported tensor rank: up to 4. * * Inputs: - * * 0: A tensor, specifying the input. Since API level 29, this tensor may - * be zero-sized. + * * 0: A tensor, specifying the input. + * Since API level 29, this tensor may be zero-sized. * * Outputs: * * 0: The output tensor of the same shape as input0. - * For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} tensor, + * For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} and + * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} tensor, * the scale and zeroPoint must be the same as input0. * * Available since API level 27. @@ -1582,16 +1679,18 @@ typedef enum { * * {@link ANEURALNETWORKS_TENSOR_FLOAT16} (since API level 29) * * {@link ANEURALNETWORKS_TENSOR_FLOAT32} * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} + * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} (since API level 30) * * Supported tensor rank: up to 4. * * Inputs: - * * 0: A tensor, specifying the input. Since API level 29, this tensor may - * be zero-sized. 
+ * * 0: A tensor, specifying the input. + * Since API level 29, this tensor may be zero-sized. * * Outputs: * * 0: The output tensor of same shape as input0. - * For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} tensor, + * For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} and + * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} tensor, * the scale and zeroPoint must be the same as input0. * * Available since API level 27. @@ -1608,6 +1707,7 @@ typedef enum { * * {@link ANEURALNETWORKS_TENSOR_FLOAT16} (since API level 29) * * {@link ANEURALNETWORKS_TENSOR_FLOAT32} * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} + * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} (since API level 30) * * Supported tensor rank: up to 4. * @@ -1624,7 +1724,8 @@ typedef enum { * * Outputs: * * 0: The output tensor, of shape specified by the input shape. - * For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} tensor, + * For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} and + * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} tensor, * the scale and zeroPoint must be the same as input0. * * Available since API level 27. @@ -1642,18 +1743,20 @@ typedef enum { * * {@link ANEURALNETWORKS_TENSOR_FLOAT16} (since API level 29) * * {@link ANEURALNETWORKS_TENSOR_FLOAT32} * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} (since API level 29) + * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} (since API level 30) * * Supported tensor rank: 4, with "NHWC" or "NCHW" data layout. * With the default data layout NHWC, the data is stored in the order of: * [batch, height, width, channels]. Alternatively, the data layout could * be NCHW, the data storage order of: [batch, channels, height, width]. + * NCHW is supported since API level 29. * * Both resizing by shape and resizing by scale are supported. * * Inputs (resizing by shape): * * 0: A 4-D tensor, of shape [batches, height, width, depth], specifying - * the input. Since API level 29, zero batches is supported for this - * tensor. + * the input. + * Since API level 29, zero batches is supported for this tensor. * * 1: An {@link ANEURALNETWORKS_INT32} scalar, specifying the output * width of the output tensor. * * 2: An {@link ANEURALNETWORKS_INT32} scalar, specifying the output @@ -1661,6 +1764,17 @@ typedef enum { * * 3: An optional {@link ANEURALNETWORKS_BOOL} scalar, default to false. * Set to true to specify NCHW data layout for input0 and output0. * Available since API level 29. + * * 4: Align corners. An optional {@link ANEURALNETWORKS_BOOL} + * scalar, default to false. If True, the centers of the 4 corner + * pixels of the input and output tensors are aligned, preserving the + * values at the corner pixels. + * Available since API level 30. + * * 5: Half pixel centers. An optional {@link ANEURALNETWORKS_BOOL} + * scalar, default to false. If True, the pixel centers are assumed to + * be at (0.5, 0.5). This is the default behavior of image.resize in + * TF 2.0. If this parameter is True, then align_corners parameter + * must be False. + * Available since API level 30. * * Inputs (resizing by scale, since API level 29): * * 0: A 4-D tensor, of shape [batches, height, width, depth], specifying @@ -1679,10 +1793,24 @@ typedef enum { * {@link ANEURALNETWORKS_FLOAT32} otherwise. * * 3: An optional {@link ANEURALNETWORKS_BOOL} scalar, default to false. * Set to true to specify NCHW data layout for input0 and output0. + * * 4: Align corners. An optional {@link ANEURALNETWORKS_BOOL} + * scalar, default to false. 
If True, the centers of the 4 corner + * pixels of the input and output tensors are aligned, preserving the + * values at the corner pixels. + * Available since API level 30. + * * 5: Half pixel centers. An optional {@link ANEURALNETWORKS_BOOL} + * scalar, default to false. If True, the pixel centers are assumed to + * be at (0.5, 0.5). This is the default behavior of image.resize in + * TF 2.0. If this parameter is True, then align_corners parameter + * must be False. + * Available since API level 30. * * Outputs: * * 0: The output 4-D tensor, of shape * [batches, new_height, new_width, depth]. + * For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} and + * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} tensor, + * the scale and zeroPoint must be the same as input0. * For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} tensor, * the scale and zeroPoint must be the same as input0. * @@ -1762,19 +1890,21 @@ typedef enum { * * {@link ANEURALNETWORKS_TENSOR_FLOAT16} (since API level 29) * * {@link ANEURALNETWORKS_TENSOR_FLOAT32} * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} + * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} (since API level 30) * * Supported tensor rank: up to 4. * Tensors with rank other than 2 or 4 are only supported since API level 29. * * Inputs: - * * 0: A 2-D or 4-D tensor, specifying the tensor to be reshaped. Since - * API level 29, this tensor may be zero-sized. + * * 0: A 2-D or 4-D tensor, specifying the tensor to be reshaped. + * Since API level 29, this tensor may be zero-sized. * * 1: A scalar, specifying the positive scaling factor for the exponent, - * beta. If input0 is of {@link ANEURALNETWORKS_TENSOR_FLOAT32} or - * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM}, the scalar must be of - * {@link ANEURALNETWORKS_FLOAT32}. If input0 is of {@link - * ANEURALNETWORKS_TENSOR_FLOAT16}, then the scalar must be of {@link - * ANEURALNETWORKS_FLOAT16}. + * beta. If input0 is of {@link ANEURALNETWORKS_TENSOR_FLOAT32}, + * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} or + * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED}, the scalar + * must be of {@link ANEURALNETWORKS_FLOAT32}. + * If input0 is of {@link ANEURALNETWORKS_TENSOR_FLOAT16}, then the + * scalar must be of {@link ANEURALNETWORKS_FLOAT16}. * * 2: An optional {@link ANEURALNETWORKS_INT32} scalar, default to -1, * specifying the dimension the activation would be performed on. * Negative index is used to specify axis from the end (e.g. -1 for @@ -1785,6 +1915,8 @@ typedef enum { * * 0: The output tensor of same shape as input0. * For {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM}, * the scale must be 1.f / 256 and the zeroPoint must be 0. + * For {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED}, + * the scale must be 1.f / 256 and the zeroPoint must be -128. * * Available since API level 27. */ @@ -1808,11 +1940,13 @@ typedef enum { * * {@link ANEURALNETWORKS_TENSOR_FLOAT16} (since API level 29) * * {@link ANEURALNETWORKS_TENSOR_FLOAT32} * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} + * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} (since API level 30) * * Supported tensor rank: 4, with "NHWC" or "NCHW" data layout. * With the default data layout NHWC, the data is stored in the order of: * [batch, height, width, channels]. Alternatively, the data layout could * be NCHW, the data storage order of: [batch, channels, height, width]. + * NCHW is supported since API level 29. 
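The resize-by-shape signature documented a little earlier in this section (inputs 0..5, with the two optional API level 30 booleans for align corners and half pixel centers) can be wired up roughly as below. This is a sketch only, assuming that hunk belongs to ANEURALNETWORKS_RESIZE_BILINEAR; the helper name, the first_new_index bookkeeping, the shapes, and the omitted error handling are illustrative, not part of the API.

#include <android/NeuralNetworks.h>
#include <stdint.h>

/* Adds a resize-by-shape operation; 'input0' is an existing 4-D float tensor
 * operand and 'first_new_index' is the number of operands already in the model. */
static int add_resize_by_shape(ANeuralNetworksModel* model, uint32_t input0,
                               uint32_t first_new_index, uint32_t* out_index)
{
    const uint32_t out_dims[4] = {1, 64, 64, 8};               /* illustrative shape */
    const ANeuralNetworksOperandType i32  = {.type = ANEURALNETWORKS_INT32};
    const ANeuralNetworksOperandType b8   = {.type = ANEURALNETWORKS_BOOL};
    const ANeuralNetworksOperandType outt = {.type = ANEURALNETWORKS_TENSOR_FLOAT32,
                                             .dimensionCount = 4,
                                             .dimensions = out_dims};
    const int32_t out_w = 64, out_h = 64;
    const uint8_t nchw = 0;          /* input 3: false -> NHWC layout                */
    const uint8_t align = 0;         /* input 4: must stay false because input 5 ... */
    const uint8_t half_pixel = 1;    /* input 5: ... (half pixel centers) is true    */

    ANeuralNetworksModel_addOperand(model, &i32);   /* input 1: output width       */
    ANeuralNetworksModel_addOperand(model, &i32);   /* input 2: output height      */
    ANeuralNetworksModel_addOperand(model, &b8);    /* input 3: layout             */
    ANeuralNetworksModel_addOperand(model, &b8);    /* input 4: align corners      */
    ANeuralNetworksModel_addOperand(model, &b8);    /* input 5: half pixel centers */
    ANeuralNetworksModel_addOperand(model, &outt);  /* output tensor               */

    const uint32_t in[6] = {input0, first_new_index, first_new_index + 1,
                            first_new_index + 2, first_new_index + 3,
                            first_new_index + 4};
    const uint32_t out[1] = {first_new_index + 5};

    ANeuralNetworksModel_setOperandValue(model, in[1], &out_w, sizeof(out_w));
    ANeuralNetworksModel_setOperandValue(model, in[2], &out_h, sizeof(out_h));
    ANeuralNetworksModel_setOperandValue(model, in[3], &nchw, sizeof(nchw));
    ANeuralNetworksModel_setOperandValue(model, in[4], &align, sizeof(align));
    ANeuralNetworksModel_setOperandValue(model, in[5], &half_pixel, sizeof(half_pixel));

    *out_index = out[0];
    return ANeuralNetworksModel_addOperation(model, ANEURALNETWORKS_RESIZE_BILINEAR,
                                             6, in, 1, out);
}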
* * Inputs: * * 0: A 4-D tensor, of shape [batches, height, width, depth_in], @@ -1827,7 +1961,8 @@ typedef enum { * Outputs: * * 0: The output 4-D tensor, of shape [batches, height/block_size, * width/block_size, depth_in*block_size*block_size]. - * For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} tensor, + * For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} and + * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} tensor, * the scale and zeroPoint must be the same as input0. * * Available since API level 27. @@ -1924,17 +2059,20 @@ typedef enum { * * {@link ANEURALNETWORKS_TENSOR_FLOAT16} (since API level 29) * * {@link ANEURALNETWORKS_TENSOR_FLOAT32} * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} (since API level 29) + * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} (since API level 30) * * Supported tensor rank: up to 4. * * Inputs: - * * 0: A tensor, specifying the input. Since API level 29, this tensor may - * be zero-sized. + * * 0: A tensor, specifying the input. + * Since API level 29, this tensor may be zero-sized. * * Outputs: * * 0: The output tensor of same shape as input0. * For {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM}, * the scale must be 1.f / 128 and the zeroPoint must be 128. + * For {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED}, + * the scale must be 1.f / 128 and the zeroPoint must be 0. * * Available since API level 27. */ @@ -1942,7 +2080,6 @@ typedef enum { // Operations below are available since API level 28. - // TODO: make the description easier to understand. /** * BatchToSpace for N-dimensional tensors. * @@ -1957,11 +2094,13 @@ typedef enum { * * {@link ANEURALNETWORKS_TENSOR_FLOAT16} (since API level 29) * * {@link ANEURALNETWORKS_TENSOR_FLOAT32} * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} + * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} (since API level 30) * * Supported tensor rank: 4, with "NHWC" or "NCHW" data layout. * With the default data layout NHWC, the data is stored in the order of: * [batch, height, width, channels]. Alternatively, the data layout could * be NCHW, the data storage order of: [batch, channels, height, width]. + * NCHW is supported since API level 29. * * Inputs: * * 0: An n-D tensor, specifying the tensor to be reshaped @@ -1974,7 +2113,8 @@ typedef enum { * * Outputs: * * 0: A tensor of the same {@link OperandCode} as input0. - * For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} tensor, + * For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} and + * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} tensor, * the scale and zeroPoint must be the same as input0. * * Available since API level 28. @@ -1988,6 +2128,11 @@ typedef enum { * dimensions. The output is the result of dividing the first input tensor * by the second, optionally modified by an activation function. * + * For inputs of {@link ANEURALNETWORKS_TENSOR_INT32}, performs + * "floor division" ("//" in Python). For example, + * 5 // 2 = 2 + * -5 // 2 = -3 + * * Two dimensions are compatible when: * 1. they are equal, or * 2. one of them is 1 @@ -2008,6 +2153,7 @@ typedef enum { * Supported tensor {@link OperandCode}: * * {@link ANEURALNETWORKS_TENSOR_FLOAT16} (since API level 29) * * {@link ANEURALNETWORKS_TENSOR_FLOAT32} + * * {@link ANEURALNETWORKS_TENSOR_INT32} (since API level 30) * * Supported tensor rank: up to 4 * @@ -2018,6 +2164,8 @@ typedef enum { * * 2: An {@link ANEURALNETWORKS_INT32} scalar, and has to be one of the * {@link FuseCode} values. Specifies the activation to * invoke on the result. 
+ * For a {@link ANEURALNETWORKS_TENSOR_INT32} tensor, + * the {@link FuseCode} must be "NONE". * * Outputs: * * 0: A tensor of the same {@link OperandCode} as input0. @@ -2038,6 +2186,7 @@ typedef enum { * * {@link ANEURALNETWORKS_TENSOR_FLOAT16} (since API level 29) * * {@link ANEURALNETWORKS_TENSOR_FLOAT32} * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} + * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} (since API level 30) * * Supported tensor rank: up to 4 * @@ -2057,23 +2206,27 @@ typedef enum { * * Outputs: * * 0: A tensor of the same {@link OperandCode} as input0. - * For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} tensor, - * the scale and zeroPoint must be same as input0. + * For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} and + * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} tensor, + * the scale and zeroPoint must be the same as input0. + * If all dimensions are reduced and keep_dims is false, the output + * shape is [1]. * * Available since API level 28. */ ANEURALNETWORKS_MEAN = 31, /** - * Pads a tensor with zeros. + * Pads a tensor. * * This operation pads a tensor according to the specified paddings. * * Supported tensor {@link OperandCode}: * * {@link ANEURALNETWORKS_TENSOR_FLOAT16} (since API level 29) * * {@link ANEURALNETWORKS_TENSOR_FLOAT32} - * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} (full support since API - * level 29, see the output section) + * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} + * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} (since API level 30) + * (full support since API level 29, see the output section) * * Supported tensor rank: up to 4 * @@ -2095,7 +2248,8 @@ typedef enum { * of the padding: * output0.dimension[i] = * padding[i, 0] + input0.dimension[i] + padding[i, 1] - * For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} tensor, + * For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} and + * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} tensor, * the scale and zeroPoint must be the same as input0. * * NOTE: Before API level 29, the pad value for @@ -2106,7 +2260,6 @@ typedef enum { */ ANEURALNETWORKS_PAD = 32, - // TODO: make the description easier to understand. /** * SpaceToBatch for N-Dimensional tensors. * @@ -2121,13 +2274,15 @@ typedef enum { * Supported tensor {@link OperandCode}: * * {@link ANEURALNETWORKS_TENSOR_FLOAT16} (since API level 29) * * {@link ANEURALNETWORKS_TENSOR_FLOAT32} - * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} (full support since API - * level 29, see the output section) + * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} + * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} (since API level 30) + * (full support since API level 29, see the output section) * * Supported tensor rank: 4, with "NHWC" or "NCHW" data layout. * With the default data layout NHWC, the data is stored in the order of: * [batch, height, width, channels]. Alternatively, the data layout could * be NCHW, the data storage order of: [batch, channels, height, width]. + * NCHW is supported since API level 29. * * Inputs: * * 0: An n-D tensor, specifying the input. @@ -2148,7 +2303,8 @@ typedef enum { * * Outputs: * * 0: A tensor of the same {@link OperandCode} as input0. - * For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} tensor, + * For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} and + * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} tensor, * the scale and zeroPoint must be the same as input0. 
* * NOTE: Before API level 29, the pad value for @@ -2171,6 +2327,7 @@ typedef enum { * * {@link ANEURALNETWORKS_TENSOR_FLOAT16} (since API level 29) * * {@link ANEURALNETWORKS_TENSOR_FLOAT32} * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} + * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} (since API level 30) * * Supported tensor rank: up to 4 * @@ -2186,8 +2343,11 @@ typedef enum { * * 0: A tensor of the same {@link OperandCode} as input0. Contains the * same data as input, but has one or more dimensions of size 1 * removed. - * For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} tensor, + * For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} and + * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} tensor, * the scale and zeroPoint must be the same as input0. + * If all input dimensions are equal to 1 and are to be squeezed, the + * output shape is [1]. * * Available since API level 28. */ @@ -2206,6 +2366,7 @@ typedef enum { * * {@link ANEURALNETWORKS_TENSOR_FLOAT16} (since API level 29) * * {@link ANEURALNETWORKS_TENSOR_FLOAT32} * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} + * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} (since API level 30) * * Supported tensor rank: up to 4 * @@ -2235,8 +2396,11 @@ typedef enum { * Outputs: * * 0: A tensor of the same {@link OperandCode} as input0 and rank (n - k), * where k is the number of bits set in shrink_axis_mask. - * For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} tensor, + * For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} and + * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} tensor, * the scale and zeroPoint must be the same as input0. + * If shrink_axis_mask is true for all input dimensions, the output + * shape is [1]. * * Available since API level 28. */ @@ -2270,6 +2434,8 @@ typedef enum { * * {@link ANEURALNETWORKS_TENSOR_FLOAT16} (since API level 29) * * {@link ANEURALNETWORKS_TENSOR_FLOAT32} * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} (since API level 29) + * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} (since API level 30) + * * {@link ANEURALNETWORKS_TENSOR_INT32} (since API level 30) * * Supported tensor rank: up to 4 * @@ -2280,10 +2446,13 @@ typedef enum { * * 2: An {@link ANEURALNETWORKS_INT32} scalar, and has to be one of the * {@link FuseCode} values. Specifies the activation to * invoke on the result. + * For a {@link ANEURALNETWORKS_TENSOR_INT32} tensor, + * the {@link FuseCode} must be "NONE". * * Outputs: * * 0: A tensor of the same {@link OperandCode} as input0. - * For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} tensor, + * For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} and + * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} tensor, * the scale and zeroPoint can be different from inputs' scale and zeroPoint. * * Available since API level 28. @@ -2303,6 +2472,7 @@ typedef enum { * * {@link ANEURALNETWORKS_TENSOR_FLOAT16} (since API level 29) * * {@link ANEURALNETWORKS_TENSOR_FLOAT32} * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} + * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} (since API level 30) * * Supported tensor rank: up to 4 * @@ -2314,7 +2484,8 @@ typedef enum { * * Outputs: * * 0: A tensor of the same {@link OperandCode} as input0. - * For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} tensor, + * For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} and + * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} tensor, * the scale and zeroPoint must be the same as input0. * * Available since API level 28. 
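The elementwise subtraction hunk a little above adds ANEURALNETWORKS_TENSOR_INT32 support with the constraint that the fused activation must be "NONE". A sketch of what that looks like when building a model, assuming that hunk is the ANEURALNETWORKS_SUB documentation; the helper name, the first_new_index bookkeeping, the 1-D shape, and the omitted error handling are illustrative:

#include <android/NeuralNetworks.h>
#include <stdint.h>

/* Adds out = a - b on two existing ANEURALNETWORKS_TENSOR_INT32 operands.
 * 'first_new_index' is the number of operands already added to the model. */
static int add_int32_sub(ANeuralNetworksModel* model, uint32_t a, uint32_t b,
                         uint32_t first_new_index, uint32_t* out_index)
{
    const uint32_t dims[1] = {16};                              /* illustrative shape */
    const ANeuralNetworksOperandType int32_tensor = {
        .type = ANEURALNETWORKS_TENSOR_INT32, .dimensionCount = 1, .dimensions = dims};
    const ANeuralNetworksOperandType int32_scalar = {.type = ANEURALNETWORKS_INT32};
    const int32_t fuse = ANEURALNETWORKS_FUSED_NONE;            /* mandatory for INT32 inputs */

    ANeuralNetworksModel_addOperand(model, &int32_scalar);      /* input 2: activation */
    ANeuralNetworksModel_addOperand(model, &int32_tensor);      /* output tensor       */
    ANeuralNetworksModel_setOperandValue(model, first_new_index, &fuse, sizeof(fuse));

    const uint32_t in[3]  = {a, b, first_new_index};
    const uint32_t out[1] = {first_new_index + 1};
    *out_index = out[0];
    return ANeuralNetworksModel_addOperation(model, ANEURALNETWORKS_SUB, 3, in, 1, out);
}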
@@ -2329,6 +2500,7 @@ typedef enum { * Supported tensor {@link OperandCode}: * * {@link ANEURALNETWORKS_TENSOR_FLOAT16} * * {@link ANEURALNETWORKS_TENSOR_FLOAT32} + * * {@link ANEURALNETWORKS_TENSOR_INT32} (since API level 30) * * Supported tensor rank: from 1. * @@ -2350,6 +2522,7 @@ typedef enum { * * {@link ANEURALNETWORKS_TENSOR_FLOAT32} * * {@link ANEURALNETWORKS_TENSOR_INT32} * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} + * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} (since API level 30) * * Supported tensor rank: from 1 * @@ -2361,6 +2534,7 @@ typedef enum { * * Outputs: * * 0: An (n - 1)-D {@link ANEURALNETWORKS_TENSOR_INT32} tensor. + * If input is 1-dimensional, the output shape is [1]. * * Available since API level 29. */ @@ -2376,6 +2550,7 @@ typedef enum { * * {@link ANEURALNETWORKS_TENSOR_FLOAT32} * * {@link ANEURALNETWORKS_TENSOR_INT32} * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} + * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} (since API level 30) * * Supported tensor rank: from 1 * @@ -2387,6 +2562,7 @@ typedef enum { * * Outputs: * * 0: An (n - 1)-D {@link ANEURALNETWORKS_TENSOR_INT32} tensor. + * If input is 1-dimensional, the output shape is [1]. * * Available since API level 29. */ @@ -2419,7 +2595,8 @@ typedef enum { * and height, dw and dh is the log-scale relative correction factor * for the width and height. For input0 of type * {@link ANEURALNETWORKS_TENSOR_QUANT16_ASYMM}, this tensor should be - * of {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM}. Zero num_rois is + * of {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} or + * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED}. Zero num_rois is * supported for this tensor. * * 2: An 1-D {@link ANEURALNETWORKS_TENSOR_INT32} tensor, of shape * [num_rois], specifying the batch index of each box. Boxes with @@ -2441,7 +2618,54 @@ typedef enum { ANEURALNETWORKS_AXIS_ALIGNED_BBOX_TRANSFORM = 41, /** - * Performs a forward LSTM on the input followed by a backward LSTM. + * A recurrent neural network layer that applies an LSTM cell to a + * sequence of inputs in forward and backward directions. + * + * The op supports cross-linking via an auxiliary input. Regular cell feeds + * one input into the two RNN cells in the following way: + * + * INPUT (INPUT_REVERSED) + * | | + * --------------------- + * | FW_LSTM BW_LSTM | + * --------------------- + * | | + * FW_OUT BW_OUT + * + * An op with cross-linking takes two inputs and feeds them into the RNN + * cells in the following way: + * + * AUX_INPUT (AUX_INPUT_REVERSED) + * | | + * INPUT | (INPUT_R'D.)| + * | | | | + * ----------------------- + * | \ / \ / | + * | FW_LSTM BW_LSTM | + * ----------------------- + * | | + * FW_OUT BW_OUT + * + * The cross-linking mode is enabled iff auxiliary input and auxiliary + * weights are present. While stacking this op on top of itself, this + * allows to connect both forward and backward outputs from previous cell + * to the next cell's input. + * + * Since API level 30 parallel linking mode is supported. The mode is + * enabled if auxiliary input is present but auxiliary weights are omitted. + * In this case, the cell feeds inputs into the RNN in the following way: + * + * INPUT (AUX_INPUT_REVERSED) + * | | + * --------------------- + * | FW_LSTM BW_LSTM | + * --------------------- + * | | + * FW_OUT BW_OUT + * + * While stacking this op on top of itself, this allows to connect both + * forward and backward outputs from previous cell to the next cell's + * corresponding inputs. 
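The two usage modes spelled out above (cross-linking vs. the API level 30 parallel linking) are selected purely by which optional operands are supplied. A small sketch of that rule; the enum and function names are illustrative and not part of the NNAPI surface:

/* Mode selection as described above: cross-linking iff both the auxiliary
 * input and the auxiliary weights are present; parallel linking (API level 30)
 * when the auxiliary input is present but the auxiliary weights are omitted. */
typedef enum { LINKING_REGULAR, LINKING_CROSS, LINKING_PARALLEL } LinkingMode;

static LinkingMode select_linking_mode(int has_aux_input, int has_aux_weights)
{
    if (has_aux_input && has_aux_weights)  return LINKING_CROSS;
    if (has_aux_input && !has_aux_weights) return LINKING_PARALLEL;
    return LINKING_REGULAR;   /* single regular input feeds both cells */
}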
* * Supported tensor {@link OperandCode}: * * {@link ANEURALNETWORKS_TENSOR_FLOAT16} @@ -2451,7 +2675,6 @@ typedef enum { * * All input and output tensors must be of the same type. * - * * Inputs: * * 0: The input. * A 3-D tensor of shape: @@ -2543,25 +2766,34 @@ typedef enum { * * 38: The backward input cell state. * A 2-D tensor of shape [batch_size, bw_num_units]. * * 39: The auxiliary input. Optional. - * A 3-D tensor of shape [max_time, batch_size, input_size], where “batch_size” - * corresponds to the batching dimension, and “input_size” is the size - * of the input. - * * 40: The forward auxiliary input-to-input weights. Optional. - * A 2-D tensor of shape [fw_num_units, input_size]. - * * 41: The forward auxiliary input-to-forget weights. Optional. - * A 2-D tensor of shape [fw_num_units, input_size]. - * * 42: The forward auxiliary input-to-cell weights. Optional. - * A 2-D tensor of shape [fw_num_units, input_size]. - * * 43: The forward auxiliary input-to-output weights. Optional. - * A 2-D tensor of shape [fw_num_units, input_size]. - * * 44: The backward auxiliary input-to-input weights. Optional. - * A 2-D tensor of shape [bw_num_units, input_size]. - * * 45: The backward auxiliary input-to-forget weights. Optional. - * A 2-D tensor of shape [bw_num_units, input_size]. - * * 46: The backward auxiliary input-to-cell weights. Optional. - * A 2-D tensor of shape [bw_num_units, input_size]. - * * 47: The backward auxiliary input-to-output weights. Optional. - * A 2-D tensor of shape [bw_num_units, input_size]. + * A 3-D tensor of shape [max_time, batch_size, aux_input_size], + * where “batch_size” corresponds to the batching dimension, and + * “aux_input_size” is the size of the auxiliary input. Optional. See + * the docs above for the usage modes explanation. + * * 40: The forward auxiliary input-to-input weights. + * Optional. See the docs above for the usage modes explanation. + * A 2-D tensor of shape [fw_num_units, aux_input_size]. + * * 41: The forward auxiliary input-to-forget weights. + * Optional. See the docs above for the usage modes explanation. + * A 2-D tensor of shape [fw_num_units, aux_input_size]. + * * 42: The forward auxiliary input-to-cell weights. + * Optional. See the docs above for the usage modes explanation. + * A 2-D tensor of shape [fw_num_units, aux_input_size]. + * * 43: The forward auxiliary input-to-output weights. + * Optional. See the docs above for the usage modes explanation. + * A 2-D tensor of shape [fw_num_units, aux_input_size]. + * * 44: The backward auxiliary input-to-input weights. + * Optional. See the docs above for the usage modes explanation. + * A 2-D tensor of shape [bw_num_units, aux_input_size]. + * * 45: The backward auxiliary input-to-forget weights. + * Optional. See the docs above for the usage modes explanation. + * A 2-D tensor of shape [bw_num_units, aux_input_size]. + * * 46: The backward auxiliary input-to-cell weights. + * Optional. See the docs above for the usage modes explanation. + * A 2-D tensor of shape [bw_num_units, aux_input_size]. + * * 47: The backward auxiliary input-to-output weights. + * Optional. See the docs above for the usage modes explanation. + * A 2-D tensor of shape [bw_num_units, aux_input_size]. * * 48: The activation function. * A value indicating the activation function: * <ul> @@ -2576,17 +2808,17 @@ typedef enum { * then clipping is disabled. 
* If all the input tensors have type {@link ANEURALNETWORKS_TENSOR_FLOAT32}, * this scalar must be of the type {@link ANEURALNETWORKS_FLOAT32}, - * otherwise if all the input tensors have the type {@link - * ANEURALNETWORKS_TENSOR_FLOAT16}, this scalar must be of type {@link - * ANEURALNETWORKS_FLOAT16}. + * otherwise if all the input tensors have the type + * {@link ANEURALNETWORKS_TENSOR_FLOAT16}, this scalar must be + * of type {@link ANEURALNETWORKS_FLOAT16}. * * 50: The clipping threshold for the output from the * projection layer, such that values are bound within * [-proj_clip, proj_clip]. If set to 0.0 then clipping is disabled. * If all the input tensors have type {@link ANEURALNETWORKS_TENSOR_FLOAT32}, * this scalar must be of the type {@link ANEURALNETWORKS_FLOAT32}, - * otherwise if all the input tensors have the type {@link - * ANEURALNETWORKS_TENSOR_FLOAT16}, this scalar must be of type {@link - * ANEURALNETWORKS_FLOAT16}. + * otherwise if all the input tensors have the type + * {@link ANEURALNETWORKS_TENSOR_FLOAT16}, this scalar must be + * of type {@link ANEURALNETWORKS_FLOAT16}. * * 51: merge_outputs * An {@link ANEURALNETWORKS_BOOL} scalar specifying if the outputs * from forward and backward cells should be merged. @@ -2633,8 +2865,36 @@ typedef enum { * A 3-D tensor of shape: * If time-major: [max_time, batch_size, bw_output_size] * If batch-major: [batch_size, max_time, bw_output_size] + * * 2: The forward activation state output. + * A 2-D tensor of shape [batch_size, fw_output_size] containing an + * activation state from the last time step in the sequence. This + * output is optional and can be omitted. If this output is present + * then outputs 3-5 must be present as well. + * Available since API level 30. + * * 3: The forward cell state output. + * A tensor of shape [batch_size, fw_cell_size] containing a cell state + * from the last time step in the sequence. This output is optional + * and can be omitted. If this output is present + * then outputs 2, 4, 5 must be present as well. + * Available since API level 30. + * * 4: The backward activation state output. + * A 2-D tensor of shape [batch_size, bw_output_size] containing an + * activation state from the last time step in the sequence. This + * output is optional and can be omitted. If this output is present + * then outputs 2, 3, 5 must be present as well. + * Available since API level 30. + * * 5: The backward cell state output. + * A tensor of shape [batch_size, bw_cell_size] containing a cell state + * from the last time step in the sequence. This output is optional + * and can be omitted. If this output is present + * then outputs 2-4 must be present as well. + * Available since API level 30. * * Available since API level 29. + * + * Important: As of API level 29, there is no way to get the output state tensors out and NNAPI + * does not maintain internal states. This operator does not support the usage pattern in which + * multiple cells are chained and state tensors are propagated. */ ANEURALNETWORKS_BIDIRECTIONAL_SEQUENCE_LSTM = 42, @@ -2662,8 +2922,8 @@ typedef enum { * * “activation” is the function passed as the “fused_activation_function” * argument (if not “NONE”). * - * The op also supports an auxiliary input. Regular cell feeds one input - * into the two RNN cells in the following way: + * The op supports cross-linking via an auxiliary input. 
Regular cell feeds + * one input into the two RNN cells in the following way: * * INPUT (INPUT_REVERSED) * | | @@ -2673,8 +2933,8 @@ typedef enum { * | | * FW_OUT BW_OUT * - * An op with an auxiliary input takes two inputs and feeds them into the - * RNN cells in the following way: + * An op with cross-linking takes two inputs and feeds them into the RNN + * cells in the following way: * * AUX_INPUT (AUX_INPUT_REVERSED) * | | @@ -2687,9 +2947,26 @@ typedef enum { * | | * FW_OUT BW_OUT * + * The cross-linking mode is enabled iff auxiliary input and auxiliary + * weights are present. While stacking this op on top of itself, this + * allows to connect both forward and backward outputs from previous cell + * to the next cell's input. + * + * Since API level 30 parallel linking mode is supported. The mode is + * enabled if auxiliary input is present but auxiliary weights are omitted. + * In this case, the cell feeds inputs into the RNN in the following way: + * + * INPUT (AUX_INPUT_REVERSED) + * | | + * --------------------- + * | FW_RNN BW_RNN | + * --------------------- + * | | + * FW_OUT BW_OUT + * * While stacking this op on top of itself, this allows to connect both * forward and backward outputs from previous cell to the next cell's - * inputs. + * corresponding inputs. * * Supported tensor {@link OperandCode}: * * {@link ANEURALNETWORKS_TENSOR_FLOAT16} @@ -2722,11 +2999,17 @@ typedef enum { * A 2-D tensor of shape [batchSize, bwNumUnits]. Specifies a hidden * state input for the first time step of the computation. * * 9: auxInput. - * A 3-D tensor. The shape is the same as of the input 0. + * A 3-D tensor. The shape is defined by the input 6 (timeMajor). If + * it is set to true, then the input has a shape [maxTime, batchSize, + * auxInputSize], otherwise the input has a shape [batchSize, maxTime, + * auxInputSize]. Can be omitted. See the docs above for the usage + * modes explanation. * * 10:fwAuxWeights. - * A 2-D tensor of shape [fwNumUnits, inputSize]. + * A 2-D tensor of shape [fwNumUnits, auxInputSize]. Can be omitted. + * See the docs above for the usage modes explanation. * * 11:bwAuxWeights. - * A 2-D tensor of shape [bwNumUnits, inputSize]. + * A 2-D tensor of shape [bwNumUnits, auxInputSize]. Can be omitted. + * See the docs above for the usage modes explanation. * * 12:fusedActivationFunction. * A {@link FuseCode} value indicating the activation function. If * “NONE” is specified then it results in a linear activation. @@ -2752,8 +3035,24 @@ typedef enum { * (timeMajor). If it is set to true, then the shape is set to * [maxTime, batchSize, bwNumUnits], otherwise the shape is set to * [batchSize, maxTime, bwNumUnits]. + * * 2: The forward hidden state output. + * A 2-D tensor of shape [batchSize, fwNumUnits] containing a hidden + * state from the last time step in the sequence. This output is + * optional and can be omitted. If this output is present then output + * 3 must be present as well. + * Available since API level 30. + * * 3: The backward hidden state output. + * A 2-D tensor of shape [batchSize, bwNumUnits] containing a hidden + * state from the last time step in the sequence. This output is + * optional and can be omitted. If this output is present then output + * 2 must be present as well. + * Available since API level 30. * * Available since API level 29. + * + * Important: As of API level 29, there is no way to get the output state tensors out and NNAPI + * does not maintain internal states. 
This operator does not support the usage pattern in which + * multiple cells are chained and state tensors are propagated. */ ANEURALNETWORKS_BIDIRECTIONAL_SEQUENCE_RNN = 43, @@ -2780,6 +3079,7 @@ typedef enum { * * {@link ANEURALNETWORKS_TENSOR_FLOAT16} * * {@link ANEURALNETWORKS_TENSOR_FLOAT32} * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} + * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} (since API level 30) * * Inputs: * * 0: A 2-D Tensor of shape [num_rois, num_classes], specifying the score @@ -2791,7 +3091,11 @@ typedef enum { * order of the boxes corresponds with input0. For input0 of type * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM}, this tensor should be of * {@link ANEURALNETWORKS_TENSOR_QUANT16_ASYMM}, with zeroPoint of 0 and - * scale of 0.125. Zero num_rois is supported for this tensor. + * scale of 0.125. + * For input0 of type {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED}, + * this tensor should be of {@link ANEURALNETWORKS_TENSOR_QUANT16_ASYMM}, + * with zeroPoint of -128 and scale of 0.125. + * Zero num_rois is supported for this tensor. * * 2: A 1-D {@link ANEURALNETWORKS_TENSOR_INT32} tensor, of shape * [num_rois], specifying the batch index of each box. Boxes with * the same batch index are grouped together. @@ -2818,6 +3122,8 @@ typedef enum { * [num_output_rois], specifying the score of each output box. The boxes * are grouped by batches, but the sequential order in each batch is not * guaranteed. For type of {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM}, + * guaranteed. For type of {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} + * or {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED}, * the scale and zero point must be the same as input0. * * 1: A 2-D Tensor of the same {@link OperandCode} as input1, with shape * [num_output_rois, 4], specifying the coordinates of each @@ -2837,7 +3143,7 @@ typedef enum { ANEURALNETWORKS_BOX_WITH_NMS_LIMIT = 44, /** - * Casts a tensor to a new type. + * Casts a tensor to a type. * * This operation ignores the scale and zeroPoint of quanized tensors, * e.g. it treats a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} input @@ -2848,6 +3154,14 @@ typedef enum { * * {@link ANEURALNETWORKS_TENSOR_FLOAT32} * * {@link ANEURALNETWORKS_TENSOR_INT32} * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} + * Since API level 30, casting tensors of the following + * {@link OperandCode} to the same {@link OperandCode} is supported: + * * {@link ANEURALNETWORKS_TENSOR_BOOL8} + * * {@link ANEURALNETWORKS_TENSOR_INT32} + * * {@link ANEURALNETWORKS_TENSOR_QUANT16_ASYMM} + * * {@link ANEURALNETWORKS_TENSOR_QUANT16_SYMM} + * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} + * * {@link ANEURALNETWORKS_TENSOR_QUANT8_SYMM} * * Supported tensor rank: from 1 * @@ -2880,6 +3194,7 @@ typedef enum { * * {@link ANEURALNETWORKS_TENSOR_FLOAT16} * * {@link ANEURALNETWORKS_TENSOR_FLOAT32} * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} + * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} (since API level 30) * * Supported tensor rank: up to 4 * @@ -2894,7 +3209,8 @@ typedef enum { * * Outputs: * * 0: A tensor of the same {@link OperandCode} and same shape as input0. - * For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} tensor, + * For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} and + * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} tensor, * the scale and zeroPoint must be the same as input0. * * Available since API level 29. @@ -2952,14 +3268,14 @@ typedef enum { * * 11: A scalar, score_threshold. 
Boxes with scores lower than the * threshold are filtered before sending to the NMS algorithm. The * scalar must be of {@link ANEURALNETWORKS_FLOAT16} if input0 is of - * {@link ANEURALNETWORKS_TENSOR_FLOAT16} and of {@link - * ANEURALNETWORKS_FLOAT32} if input0 is of {@link - * ANEURALNETWORKS_TENSOR_FLOAT32}. + * {@link ANEURALNETWORKS_TENSOR_FLOAT16} and of + * {@link ANEURALNETWORKS_FLOAT32} if input0 is of + * {@link ANEURALNETWORKS_TENSOR_FLOAT32}. * * 12: A scalar, specifying the IoU threshold for hard NMS. The scalar - * must be of {@link ANEURALNETWORKS_FLOAT16} if input0 is of {@link - * ANEURALNETWORKS_TENSOR_FLOAT16} and of {@link - * ANEURALNETWORKS_FLOAT32} if input0 is of {@link - * ANEURALNETWORKS_TENSOR_FLOAT32}. + * must be of {@link ANEURALNETWORKS_FLOAT16} if input0 is of + * {@link ANEURALNETWORKS_TENSOR_FLOAT16} and of + * {@link ANEURALNETWORKS_FLOAT32} if input0 is of + * {@link ANEURALNETWORKS_TENSOR_FLOAT32}. * * 13: An {@link ANEURALNETWORKS_BOOL} scalar, set to true to include * background class in the list of label map for the output, set * to false to not include the background. When the background @@ -2992,6 +3308,7 @@ typedef enum { * * {@link ANEURALNETWORKS_TENSOR_FLOAT32} * * {@link ANEURALNETWORKS_TENSOR_INT32} * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} + * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} (since API level 30) * * Supported tensor rank: from 1 * @@ -3041,6 +3358,7 @@ typedef enum { * * {@link ANEURALNETWORKS_TENSOR_FLOAT32} * * {@link ANEURALNETWORKS_TENSOR_INT32} * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} + * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} (since API level 30) * * Supported tensor rank: from 1 * @@ -3052,7 +3370,8 @@ typedef enum { * Outputs: * * 0: An (n + 1)-D tensor with the same {@link OperandCode} and data as * input0. - * For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} tensor, + * For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} and + * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} tensor, * the scale and zeroPoint must be the same as input0. * * Available since API level 29. @@ -3078,6 +3397,7 @@ typedef enum { * * {@link ANEURALNETWORKS_TENSOR_FLOAT32} * * {@link ANEURALNETWORKS_TENSOR_INT32} * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} + * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} (since API level 30) * * Supported tensor rank: from 1 * @@ -3092,7 +3412,8 @@ typedef enum { * * Outputs: * * 0: An (n + k - 1)-D tensor with the same {@link OperandCode} as input0. - * For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} tensor, + * For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} and + * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} tensor, * the scale and zeroPoint must be the same as input0. * * Available since API level 29. @@ -3115,6 +3436,7 @@ typedef enum { * * {@link ANEURALNETWORKS_TENSOR_FLOAT16} * * {@link ANEURALNETWORKS_TENSOR_FLOAT32} * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} + * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} (since API level 30) * * Inputs: * * 0: A 4-D Tensor specifying the score of each anchor at each @@ -3132,11 +3454,13 @@ typedef enum { * dimensions is the channel dimension. * * 2: A 2-D Tensor of shape [num_anchors, 4], specifying the shape of each * predefined anchor, with format [x1, y1, x2, y2]. 
For input0 of type - * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM}, this tensor should be of + * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} or + * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED}, this tensor should be of * {@link ANEURALNETWORKS_TENSOR_QUANT16_SYMM}, with scale of 0.125. * * 3: A 2-D Tensor of shape [batches, 2], specifying the size of * each image in the batch, with format [image_height, image_width]. - * For input0 of type {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM}, this + * For input0 of type {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} or + * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED}, this * tensor should be of {@link ANEURALNETWORKS_TENSOR_QUANT16_SYMM}, with * scale of 0.125. * * 4: An {@link ANEURALNETWORKS_FLOAT32} scalar, specifying the ratio @@ -3163,7 +3487,8 @@ typedef enum { * [num_output_rois], specifying the score of each output box. * The boxes are grouped by batches, but the sequential order in * each batch is not guaranteed. For type of - * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM}, the scale and zero + * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} or + * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED}, the scale and zero * point must be the same as input0. * * 1: A tensor of the same {@link OperandCode} as input3, of shape * [num_output_rois, 4], specifying the coordinates of each output @@ -3188,6 +3513,7 @@ typedef enum { * * {@link ANEURALNETWORKS_TENSOR_FLOAT32} * * {@link ANEURALNETWORKS_TENSOR_INT32} * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} + * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} (since API level 30) * * Supported tensor rank: from 1 * @@ -3213,6 +3539,7 @@ typedef enum { * * {@link ANEURALNETWORKS_TENSOR_FLOAT32} * * {@link ANEURALNETWORKS_TENSOR_INT32} * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} + * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} (since API level 30) * * Supported tensor rank: from 1 * @@ -3271,12 +3598,23 @@ typedef enum { * * * {@link ANEURALNETWORKS_TENSOR_INT32} for bias (with scale set to * * * input.scale * filter.scale). * + * * Quantized signed (since API level 30): + * * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} for input, filter, and output. + * * * {@link ANEURALNETWORKS_TENSOR_INT32} for bias (with scale set to + * * * input.scale * filter.scale). + * * * Quantized with symmetric per channel quantization for the filter: * * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} for input, and output. * * * {@link ANEURALNETWORKS_TENSOR_QUANT8_SYMM_PER_CHANNEL} for filter. * * * {@link ANEURALNETWORKS_TENSOR_INT32} for bias (scale set to 0.0, * * * each value scaling is separate and equal to input.scale * filter.scales[channel]). * + * * Quantized signed with filter symmetric per channel quantization (since API level 30): + * * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} for input, and output. + * * * {@link ANEURALNETWORKS_TENSOR_QUANT8_SYMM_PER_CHANNEL} for filter. + * * * {@link ANEURALNETWORKS_TENSOR_INT32} for bias (scale set to 0.0, + * * * each value scaling is separate and equal to input.scale * filter.scales[channel]). + * * Supported tensor rank: 4, with "NHWC" or "NCHW" data layout. * With the default data layout NHWC, the data is stored in the order of: * [batch, height, width, channels]. Alternatively, the data layout could @@ -3295,8 +3633,9 @@ typedef enum { * {@link ANeuralNetworksSymmPerChannelQuantParams}) must be set to 0. * * 2: A 1-D tensor, of shape [depth_out], specifying the bias. 
For input * tensor of type {@link ANEURALNETWORKS_TENSOR_FLOAT32} or - * {@link ANEURALNETWORKS_TENSOR_FLOAT16}, the bias must be of the same - * type. For filter tensor of {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM}, + * {@link ANEURALNETWORKS_TENSOR_FLOAT16}, the bias must be of the same type. + * For filter tensor of {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} and + * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} * the bias should be of {@link ANEURALNETWORKS_TENSOR_INT32}, with zeroPoint * of 0 and bias_scale == input_scale * filter_scale. For filter tensor * of {@link ANEURALNETWORKS_TENSOR_QUANT8_SYMM_PER_CHANNEL}, the bias @@ -3316,7 +3655,7 @@ typedef enum { * * 8: An {@link ANEURALNETWORKS_INT32} scalar, specifying the stride when * walking through input in the ‘height’ dimension. * * 9: An {@link ANEURALNETWORKS_INT32} scalar, specifying the number of - groups. + * groups. * * 10: An {@link ANEURALNETWORKS_INT32} scalar, and has to be one of the * {@link FuseCode} values. Specifies the activation to * invoke on the result. @@ -3330,12 +3669,14 @@ typedef enum { * [depth_out, filter_height, filter_width, depth_group], specifying * the filter, where depth_out must be divisible by num_groups. For * tensor of type {@link ANEURALNETWORKS_TENSOR_QUANT8_SYMM_PER_CHANNEL} - * the channel dimension (channelDim at - * {@link ANeuralNetworksSymmPerChannelQuantParams}) must be set to 0. + * the channel dimension (ANeuralNetworksSymmPerChannelQuantParams::channelDim) + * must be set to 0. * * 2: A 1-D tensor, of shape [depth_out], specifying the bias. For input * tensor of type {@link ANEURALNETWORKS_TENSOR_FLOAT32} or * {@link ANEURALNETWORKS_TENSOR_FLOAT16}, the bias must be of the same - * type. For filter tensor of {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM}, + * {@link ANEURALNETWORKS_TENSOR_FLOAT16}, the bias must be of the same type. + * For filter tensor of {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} and + * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} * the bias should be of {@link ANEURALNETWORKS_TENSOR_INT32}, with zeroPoint * of 0 and bias_scale == input_scale * filter_scale. For filter tensor * of {@link ANEURALNETWORKS_TENSOR_QUANT8_SYMM_PER_CHANNEL}, the bias @@ -3360,7 +3701,8 @@ typedef enum { * Outputs: * * 0: The output 4-D tensor, of shape * [batches, out_height, out_width, depth_out]. - * For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} tensor, + * For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} and + * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} tensor, * the scale and zeroPoint can be different from inputs' scale and zeroPoint. * * Available since API level 29. @@ -3382,6 +3724,7 @@ typedef enum { * * {@link ANEURALNETWORKS_TENSOR_FLOAT16} * * {@link ANEURALNETWORKS_TENSOR_FLOAT32} * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} + * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} (since API level 30) * * Supported tensor rank: 4, with "NHWC" or "NCHW" data layout. * With the default data layout NHWC, the data is stored in the order of: @@ -3398,13 +3741,18 @@ typedef enum { * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM}, this tensor should * be of {@link ANEURALNETWORKS_TENSOR_QUANT16_ASYMM}, with zeroPoint * of 0 and scale of 0.125. + * For input0 of type + * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED}, this tensor + * should be of {@link ANEURALNETWORKS_TENSOR_QUANT16_ASYMM}, with + * zeroPoint of -128 and scale of 0.125. * * 2: An {@link ANEURALNETWORKS_BOOL} scalar, set to true to specify * NCHW data layout for input0. 
Set to false for NHWC. * * Outputs: * * 0: A tensor of the same {@link OperandCode} as input0, with shape * [num_boxes, num_keypoints], specifying score of the keypoints. - * For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} tensor, + * For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} or + * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} tensor, * the scale and zeroPoint can be different from input0 scale and zeroPoint. * * 1: A tensor of the same {@link OperandCode} as input1, with shape * [num_boxes, num_keypoints, 2], specifying the location of @@ -3447,19 +3795,19 @@ typedef enum { * * 0: An n-D tensor, specifying the tensor to be normalized. * * 1: A scalar, specifying gamma, the scale applied to the normalized * tensor. The scalar must be of {@link ANEURALNETWORKS_FLOAT16} if - * input0 is of {@link ANEURALNETWORKS_TENSOR_FLOAT16} and of {@link - * ANEURALNETWORKS_FLOAT32} if input0 is of {@link - * ANEURALNETWORKS_TENSOR_FLOAT32}. + * input0 is of {@link ANEURALNETWORKS_TENSOR_FLOAT16} and of + * {@link ANEURALNETWORKS_FLOAT32} if input0 is of + * {@link ANEURALNETWORKS_TENSOR_FLOAT32}. * * 2: A scalar, specifying beta, the offset applied to the normalized * tensor. The scalar must be of {@link ANEURALNETWORKS_FLOAT16} if - * input0 is of {@link ANEURALNETWORKS_TENSOR_FLOAT16} and of {@link - * ANEURALNETWORKS_FLOAT32} if input0 is of {@link - * ANEURALNETWORKS_TENSOR_FLOAT32}. + * input0 is of {@link ANEURALNETWORKS_TENSOR_FLOAT16} and of + * {@link ANEURALNETWORKS_FLOAT32} if input0 is of + * {@link ANEURALNETWORKS_TENSOR_FLOAT32}. * * 3: A scalar, specifying epsilon, the small value added to variance to * avoid dividing by zero. The scalar must be of {@link ANEURALNETWORKS_FLOAT16} if - * input0 is of {@link ANEURALNETWORKS_TENSOR_FLOAT16} and of {@link - * ANEURALNETWORKS_FLOAT32} if input0 is of {@link - * ANEURALNETWORKS_TENSOR_FLOAT32}. + * input0 is of {@link ANEURALNETWORKS_TENSOR_FLOAT16} and of + * {@link ANEURALNETWORKS_FLOAT32} if input0 is of + * {@link ANEURALNETWORKS_TENSOR_FLOAT32}. * * 4: An {@link ANEURALNETWORKS_BOOL} scalar, set to true to specify * NCHW data layout for input0 and output0. Set to false for NHWC. * @@ -3479,6 +3827,7 @@ typedef enum { * * {@link ANEURALNETWORKS_TENSOR_FLOAT32} * * {@link ANEURALNETWORKS_TENSOR_INT32} * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} + * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} (since API level 30) * * Supported tensor rank: from 1 * @@ -3505,6 +3854,7 @@ typedef enum { * * {@link ANEURALNETWORKS_TENSOR_FLOAT32} * * {@link ANEURALNETWORKS_TENSOR_INT32} * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} + * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} (since API level 30) * * Supported tensor rank: from 1 * @@ -3644,6 +3994,7 @@ typedef enum { * * {@link ANEURALNETWORKS_TENSOR_FLOAT32} * * {@link ANEURALNETWORKS_TENSOR_INT32} * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} + * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} (since API level 30) * * Supported tensor rank: from 1. * @@ -3656,7 +4007,8 @@ typedef enum { * * Outputs: * * 0: A tensor of the same {@link OperandCode} as input0. - * For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} tensor, + * For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} and + * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} tensor, * the scale and zeroPoint can be different from inputs' scale and zeroPoint. * * Available since API level 29. 
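The phrase "the scale and zeroPoint can be different from inputs' scale and zeroPoint" that closes the hunk just above means the result is requantized with the output operand's own parameters rather than inheriting the input's. A minimal sketch of that requantization for the signed 8-bit case, using only the affine (de)quantization rule; the helper name is illustrative:

#include <math.h>
#include <stdint.h>

static int8_t requantize_signed(int8_t q_in, float in_scale, int32_t in_zp,
                                float out_scale, int32_t out_zp)
{
    const float real = in_scale * (float)(q_in - in_zp);   /* dequantize with input params  */
    long q = lroundf(real / out_scale) + out_zp;            /* requantize with output params */
    if (q < -128) q = -128;                                 /* clamp to the int8 range       */
    if (q >  127) q =  127;
    return (int8_t)q;
}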
@@ -3671,6 +4023,7 @@ typedef enum { * * {@link ANEURALNETWORKS_TENSOR_FLOAT32} * * {@link ANEURALNETWORKS_TENSOR_INT32} * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} + * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} (since API level 30) * * Supported tensor rank: from 1. * @@ -3683,7 +4036,8 @@ typedef enum { * * Outputs: * * 0: A tensor of the same {@link OperandCode} as input0. - * For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} tensor, + * For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} and + * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} tensor, * the scale and zeroPoint can be different from inputs' scale and zeroPoint. * * Available since API level 29. @@ -3719,6 +4073,7 @@ typedef enum { * * {@link ANEURALNETWORKS_TENSOR_FLOAT32} * * {@link ANEURALNETWORKS_TENSOR_INT32} * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} + * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} (since API level 30) * * Supported tensor rank: from 1 * @@ -3744,6 +4099,7 @@ typedef enum { * * {@link ANEURALNETWORKS_TENSOR_FLOAT16} * * {@link ANEURALNETWORKS_TENSOR_FLOAT32} * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} + * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} (since API level 30) * * Supported tensor rank: up to 4 * @@ -3761,7 +4117,8 @@ typedef enum { * pad value must be of {@link ANEURALNETWORKS_FLOAT16}. * For input tensor of {@link ANEURALNETWORKS_TENSOR_FLOAT32}, the * pad value must be of {@link ANEURALNETWORKS_FLOAT32}. - * For input tensor of {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM}, + * For input tensor of {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} and + * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED}, * the pad value must be of {@link ANEURALNETWORKS_INT32}. The * scale and zeroPoint are assumed to be the same as in input0. * @@ -3773,7 +4130,8 @@ typedef enum { * of the padding: * output0.dimension[i] = * padding[i, 0] + input0.dimension[i] + padding[i, 1] - * For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} tensor, + * For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} and + * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} tensor, * the scale and zeroPoint must be the same as input0. * * Available since API level 29. @@ -3836,6 +4194,7 @@ typedef enum { * * {@link ANEURALNETWORKS_TENSOR_FLOAT16} * * {@link ANEURALNETWORKS_TENSOR_FLOAT32} * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} + * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} (since API level 30) * * Supported tensor rank: from 1 * @@ -3846,8 +4205,9 @@ typedef enum { * * Outputs: * * 0: A tensor of the same {@link OperandCode} as input0. - * For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} tensor, - * the scale and zeroPoint can be diffent from the input0 scale and zeroPoint. + * For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} and + * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} tensor, + * the scales and zeroPoint can be different from input0 scale and zeroPoint. * * Available since API level 29. */ @@ -3856,14 +4216,23 @@ typedef enum { /** * Quantizes the input tensor. 
* - * The formula is: + * The formula for {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} output tensor is: * * output = max(0, min(255, round(input / scale) + zeroPoint) * - * Supported tensor {@link OperandCode}: + * The formula for {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} output + * tensor is: + * + * output = max(-128, min(127, round(input / scale) + zeroPoint) + * + * Supported input tensor {@link OperandCode}: * * {@link ANEURALNETWORKS_TENSOR_FLOAT16} * * {@link ANEURALNETWORKS_TENSOR_FLOAT32} * + * Supported output tensor {@link OperandCode}: + * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} + * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} (since API level 30) + * * Supported tensor rank: from 1 * * Inputs: @@ -3871,7 +4240,8 @@ typedef enum { * * Outputs: * * 0: The output tensor of same shape as input0, but with - * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM}. + * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} or. + * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED}. * * Available since API level 29. */ @@ -3995,7 +4365,8 @@ typedef enum { * * 1: A scalar {@link ANEURALNETWORKS_INT32}, specifying the number of * independent samples to draw for each row slice. * * 2: A 1-D {@link ANEURALNETWORKS_TENSOR_INT32} tensor with shape [2], - * specifying seeds used to initialize the random distribution. + * specifying seeds used to initialize the random distribution. If both + * provided seeds are 0, both will be randomly generated. * Outputs: * * 0: A 2-D {@link ANEURALNETWORKS_TENSOR_INT32} tensor with shape * [batches, samples], containing the drawn samples. @@ -4026,6 +4397,8 @@ typedef enum { * * Outputs: * * 0: A tensor of the same {@link OperandCode} as input0. + * If all dimensions are reduced and keep_dims is false, the output + * shape is [1]. * * Available since API level 29. */ @@ -4053,6 +4426,8 @@ typedef enum { * * Outputs: * * 0: A tensor of the same {@link OperandCode} as input0. + * If all dimensions are reduced and keep_dims is false, the output + * shape is [1]. * * Available since API level 29. */ @@ -4070,6 +4445,7 @@ typedef enum { * * {@link ANEURALNETWORKS_TENSOR_FLOAT16} * * {@link ANEURALNETWORKS_TENSOR_FLOAT32} * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} + * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} (since API level 30) * * Supported tensor rank: up to 4 * @@ -4082,7 +4458,10 @@ typedef enum { * * Outputs: * * 0: A tensor of the same {@link OperandCode} as input0. - * For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} tensor, + * If all dimensions are reduced and keep_dims is false, the output + * shape is [1]. + * For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} and + * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} tensor, * the scale and zeroPoint must be the same as input0. * * Available since API level 29. @@ -4101,6 +4480,7 @@ typedef enum { * * {@link ANEURALNETWORKS_TENSOR_FLOAT16} * * {@link ANEURALNETWORKS_TENSOR_FLOAT32} * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} + * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} (since API level 30) * * Supported tensor rank: up to 4 * @@ -4113,7 +4493,10 @@ typedef enum { * * Outputs: * * 0: A tensor of the same {@link OperandCode} as input0. - * For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} tensor, + * If all dimensions are reduced and keep_dims is false, the output + * shape is [1]. + * For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} and + * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} tensor, * the scale and zeroPoint must be the same as input0. 
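The two QUANTIZE formulas quoted in the hunk above map directly onto a pair of small helpers. This is only an illustrative sketch of the documented math (round, add zeroPoint, clamp to the range of the output type); the function names are made up for illustration and are not part of the API.

    #include <math.h>
    #include <stdint.h>

    /* QUANT8_ASYMM:        output = max(0,    min(255, round(input / scale) + zeroPoint)) */
    static uint8_t quantize_asymm_u8(float input, float scale, int32_t zeroPoint) {
        long v = lroundf(input / scale) + zeroPoint;
        if (v < 0)   v = 0;
        if (v > 255) v = 255;
        return (uint8_t)v;
    }

    /* QUANT8_ASYMM_SIGNED: output = max(-128, min(127, round(input / scale) + zeroPoint)) */
    static int8_t quantize_asymm_i8(float input, float scale, int32_t zeroPoint) {
        long v = lroundf(input / scale) + zeroPoint;
        if (v < -128) v = -128;
        if (v > 127)  v = 127;
        return (int8_t)v;
    }

Dequantization runs the other way, real_value = (quantized - zeroPoint) * scale, which is why so many of the output descriptions above are free to carry a scale and zeroPoint different from their inputs'.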
* * Available since API level 29. @@ -4142,6 +4525,8 @@ typedef enum { * * Outputs: * * 0: A tensor of the same {@link OperandCode} as input0. + * If all dimensions are reduced and keep_dims is false, the output + * shape is [1]. * * Available since API level 29. */ @@ -4169,6 +4554,8 @@ typedef enum { * * Outputs: * * 0: A tensor of the same {@link OperandCode} as input0. + * If all dimensions are reduced and keep_dims is false, the output + * shape is [1]. * * Available since API level 29. */ @@ -4188,9 +4575,10 @@ typedef enum { * interpolation. * * Supported tensor {@link OperandCode}: - * * {@link ANEURALNETWORKS_TENSOR_FLOAT16} (since API level 29) + * * {@link ANEURALNETWORKS_TENSOR_FLOAT16} * * {@link ANEURALNETWORKS_TENSOR_FLOAT32} * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} + * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} (since API level 30) * * Supported tensor rank: 4, with "NHWC" or "NCHW" data layout. * With the default data layout NHWC, the data is stored in the order of: @@ -4229,7 +4617,8 @@ typedef enum { * Outputs: * * 0: A tensor of the same {@link OperandCode} as input0. The output * shape is [num_rois, out_height, out_width, depth]. - * For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} tensor, + * For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} and + * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} tensor, * the scale and zeroPoint can be different from the input0 scale and zeroPoint. * * Available since API level 29. @@ -4252,6 +4641,7 @@ typedef enum { * * {@link ANEURALNETWORKS_TENSOR_FLOAT16} * * {@link ANEURALNETWORKS_TENSOR_FLOAT32} * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} + * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} (since API level 30) * * Supported tensor rank: 4, with "NHWC" or "NCHW" data layout. * With the default data layout NHWC, the data is stored in the order of: @@ -4262,7 +4652,8 @@ typedef enum { * * 0: A 4-D tensor, specifying the feature map. * * 1: A 2-D Tensor of shape [num_rois, 4], specifying the locations of * the regions of interest, each line with format [x1, y1, x2, y2]. - * For input0 of type {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM}, + * For input0 of type {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} and + * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} tensor, * this tensor should be of {@link ANEURALNETWORKS_TENSOR_QUANT16_ASYMM}, * with zeroPoint of 0 and scale of 0.125. * * 2: An 1-D {@link ANEURALNETWORKS_TENSOR_INT32} tensor, of shape @@ -4282,7 +4673,8 @@ typedef enum { * Outputs: * * 0: A tensor of the same {@link OperandCode} as input0. The output * shape is [num_rois, out_height, out_width, depth]. - * For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} tensor, + * For input0 of type {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} and + * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} tensor, * the scale and zeroPoint must be the same as input0. * * Available since API level 29. @@ -4319,6 +4711,7 @@ typedef enum { * * {@link ANEURALNETWORKS_TENSOR_FLOAT32} * * {@link ANEURALNETWORKS_TENSOR_INT32} * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} + * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} (since API level 30) * * Supported tensor rank: from 1 * @@ -4329,7 +4722,8 @@ typedef enum { * true) or input2 (if false). * * 1: An input tensor of the same shape as input0. * * 2: An input tensor of the same shape and type as input1. 
- * For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} tensor, + * For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} + * and {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} tensor, * the scales and zeroPoint can be different from input1 scale and zeroPoint. * * Outputs: @@ -4337,6 +4731,7 @@ typedef enum { * For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} tensor, * the scale and zeroPoint can be different from inputs' scale and zeroPoint. * + * Available since API level 29. */ ANEURALNETWORKS_SELECT = 84, @@ -4376,6 +4771,7 @@ typedef enum { * * {@link ANEURALNETWORKS_TENSOR_FLOAT32} * * {@link ANEURALNETWORKS_TENSOR_INT32} * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} + * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} (since API level 30) * * Supported tensor rank: from 1 * @@ -4388,7 +4784,8 @@ typedef enum { * * Outputs: * * 0: An n-D tensor of the same type as the input containing the slice. - * For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} tensor, + * For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} and + * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} tensor, * its scale and zeroPoint has to be same as the input0 scale and zeroPoint. * * Available since API level 29. @@ -4403,6 +4800,7 @@ typedef enum { * * {@link ANEURALNETWORKS_TENSOR_FLOAT32} * * {@link ANEURALNETWORKS_TENSOR_INT32} * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} + * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} (since API level 30) * * Supported tensor rank: from 1 * @@ -4415,7 +4813,8 @@ typedef enum { * * Outputs: * * 0 ~ (num_splits - 1): Resulting subtensors. - * For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} tensor, + * For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} and + * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} tensor, * the scale and zeroPoint must be the same as input0. * * Available since API level 29. @@ -4455,6 +4854,7 @@ typedef enum { * * {@link ANEURALNETWORKS_TENSOR_FLOAT32} * * {@link ANEURALNETWORKS_TENSOR_INT32} * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} + * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} (since API level 30) * * Supported tensor rank: from 1 * @@ -4465,7 +4865,8 @@ typedef enum { * * Outputs: * * 0: A tiled tensor of the same {@link OperandCode} and rank as `input`. - * For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} tensor, + * For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} and + * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} tensor, * the scale and zeroPoint must be the same as input0. * * Available since API level 29. @@ -4483,6 +4884,7 @@ typedef enum { * * {@link ANEURALNETWORKS_TENSOR_FLOAT32} * * {@link ANEURALNETWORKS_TENSOR_INT32} * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} + * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} (since API level 30) * * Supported tensor rank: from 1 * @@ -4494,7 +4896,8 @@ typedef enum { * Outputs: * * 0: An n-D tensor of the same type as the input, containing the k * largest elements along each last dimensional slice. - * For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} tensor, + * For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} and + * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} tensor, * the scale and zeroPoint must be the same as input0. * * 1: An n-D tensor of type {@link ANEURALNETWORKS_TENSOR_INT32} * containing the indices of values within the last dimension of input. 
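Several of the REDUCE_* hunks above add the note "If all dimensions are reduced and keep_dims is false, the output shape is [1]". A small shape helper makes that rule concrete; this is a sketch only, and the function name is illustrative rather than anything defined by the header.

    #include <stdbool.h>
    #include <stdint.h>

    /* Output rank/shape of a reduction over the axes marked in `reduce_axis`.
     * keep_dims == true : reduced axes are kept with size 1.
     * keep_dims == false: reduced axes are dropped; if nothing is left, the shape is [1]. */
    static uint32_t reduced_shape(const uint32_t* in_dims, uint32_t in_rank,
                                  const bool* reduce_axis, bool keep_dims,
                                  uint32_t* out_dims) {
        uint32_t out_rank = 0;
        for (uint32_t i = 0; i < in_rank; ++i) {
            if (!reduce_axis[i])     out_dims[out_rank++] = in_dims[i];
            else if (keep_dims)      out_dims[out_rank++] = 1;
        }
        if (out_rank == 0) {         /* all dimensions reduced and keep_dims == false */
            out_dims[0] = 1;
            out_rank = 1;
        }
        return out_rank;
    }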
@@ -4531,6 +4934,18 @@ typedef enum { * * * {@link ANEURALNETWORKS_TENSOR_INT32} for bias (scale set to 0.0, * * * each value scaling is separate and equal to input.scale * filter.scales[channel]). * + * Available since API level 30: + * * Quantized signed (since API level 30): + * * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} for input, filter, and output. + * * * {@link ANEURALNETWORKS_TENSOR_INT32} for bias (with scale set to + * * * input.scale * filter.scale). + * + * * Quantized signed with filter symmetric per channel quantization (since API level 30): + * * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} for input, and output. + * * * {@link ANEURALNETWORKS_TENSOR_QUANT8_SYMM_PER_CHANNEL} for filter. + * * * {@link ANEURALNETWORKS_TENSOR_INT32} for bias (scale set to 0.0, + * * * each value scaling is separate and equal to input.scale * filter.scales[channel]). + * * Supported tensor rank: 4, with "NHWC" or "NCHW" data layout. * With the default data layout NHWC, the data is stored in the order of: * [batch, height, width, channels]. Alternatively, the data layout could @@ -4540,24 +4955,25 @@ typedef enum { * * Inputs (explicit padding): * * 0: A 4-D tensor, of shape [batches, height, width, depth_in], - * specifying the input. Since API level 29, zero batches is supported - * for this tensor. + * specifying the input. + * Since API level 29, zero batches is supported for this tensor. * * 1: A 4-D tensor, of shape * [depth_out, filter_height, filter_width, depth_in], specifying the * filter. For tensor of type * {@link ANEURALNETWORKS_TENSOR_QUANT8_SYMM_PER_CHANNEL} the channel - * dimension (extraParams.channelQuant.channelDim) must be set to 0. + * dimension (ANeuralNetworksSymmPerChannelQuantParams::channelDim) must be set to 0. * * 2: A 1-D tensor, of shape [depth_out], specifying the bias. For input * tensor of type {@link ANEURALNETWORKS_TENSOR_FLOAT32} or - * {@link ANEURALNETWORKS_TENSOR_FLOAT16}, the bias should be of the - * same type. For input tensor of type - * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM}, the bias should be - * of {@link ANEURALNETWORKS_TENSOR_INT32}, with zeroPoint of 0 and - * bias_scale == input_scale * filter_scale. For filter tensor of - * {@link ANEURALNETWORKS_TENSOR_QUANT8_SYMM_PER_CHANNEL}, the bias - * must be of {@link ANEURALNETWORKS_TENSOR_INT32}, with zeroPoint of - * 0 and bias_scale of 0. The actual scale of each value 'i' is equal - * to bias_scale[i] = input_scale * filter_scale[i]. + * {@link ANEURALNETWORKS_TENSOR_FLOAT16}, the bias must be of the + * same type. + * For filter tensor of {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} + * and {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED}, + * the bias should be of {@link ANEURALNETWORKS_TENSOR_INT32}, + * with zeroPoint of 0 and bias_scale == input_scale * filter_scale. + * For filter tensor of {@link ANEURALNETWORKS_TENSOR_QUANT8_SYMM_PER_CHANNEL}, + * the bias must be of {@link ANEURALNETWORKS_TENSOR_INT32}, with zeroPoint of 0 + * and bias_scale of 0. The actual scale of each value 'i' is equal to + * bias_scale[i] = input_scale * filter_scale[i]. * * 3: An {@link ANEURALNETWORKS_INT32} scalar, specifying the padding on * the left, in the ‘width’ dimension. * * 4: An {@link ANEURALNETWORKS_INT32} scalar, specifying the padding on @@ -4578,24 +4994,25 @@ typedef enum { * * Inputs (implicit padding): * * 0: A 4-D tensor, of shape [batches, height, width, depth_in], - * specifying the input. Since API level 29, zero batches is supported - * for this tensor. 
+ * specifying the input. + * Since API level 29, zero batches is supported for this tensor. * * 1: A 4-D tensor, of shape * [depth_out, filter_height, filter_width, depth_in], specifying the * filter. For tensor of type * {@link ANEURALNETWORKS_TENSOR_QUANT8_SYMM_PER_CHANNEL} the channel - * dimension (extraParams.channelQuant.channelDim) must be set to 0. + * dimension (ANeuralNetworksSymmPerChannelQuantParams::channelDim) must be set to 0. * * 2: A 1-D tensor, of shape [depth_out], specifying the bias. For input * tensor of type {@link ANEURALNETWORKS_TENSOR_FLOAT32} or * {@link ANEURALNETWORKS_TENSOR_FLOAT16}, the bias should be of the - * same type. For input tensor of type - * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM}, the bias should be - * of {@link ANEURALNETWORKS_TENSOR_INT32}, with zeroPoint of 0 and - * bias_scale == input_scale * filter_scale. For filter tensor of - * {@link ANEURALNETWORKS_TENSOR_QUANT8_SYMM_PER_CHANNEL}, the bias - * must be of {@link ANEURALNETWORKS_TENSOR_INT32}, with zeroPoint of - * 0 and bias_scale of 0. The actual scale of each value 'i' is equal - * to bias_scale[i] = input_scale * filter_scale[i]. + * same type. + * For filter tensor of {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} + * and {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED}, + * the bias should be of {@link ANEURALNETWORKS_TENSOR_INT32}, + * with zeroPoint of 0 and bias_scale == input_scale * filter_scale. + * For filter tensor of {@link ANEURALNETWORKS_TENSOR_QUANT8_SYMM_PER_CHANNEL}, + * the bias must be of {@link ANEURALNETWORKS_TENSOR_INT32}, with zeroPoint of 0 + * and bias_scale of 0. The actual scale of each value 'i' is equal to + * bias_scale[i] = input_scale * filter_scale[i]. * * 3: An {@link ANEURALNETWORKS_TENSOR_INT32} tensor, specifying the output * tensor shape. * * 4: An {@link ANEURALNETWORKS_INT32} scalar, specifying the implicit @@ -4614,7 +5031,8 @@ typedef enum { * Outputs: * * 0: The output 4-D tensor, of shape * [batches, out_height, out_width, depth_out]. - * For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} tensor, + * For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} and + * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} tensor, * the scale and zeroPoint can be different from inputs' scale and zeroPoint. * * Available since API level 29. @@ -4727,8 +5145,21 @@ typedef enum { * A 3-D tensor of shape: * If time-major: [max_time, batch_size, output_size] * If batch-major: [batch_size, max_time, output_size] + * * 1: A tensor of shape [batch_size, output_size] containing a hidden + * state from the last time step in the sequence. This output is + * optional and can be omitted. If this output is present then + * output #2 must be present as well. + * Available since API level 30. + * * 2: A tensor of shape [batch_size, cell_size] containing a cell state + * from the last time step in the sequence. This output is optional + * and can be omitted. + * Available since API level 30. * * Available since API level 29. + * + * Important: As of API level 29, there is no way to get the output state tensors out and NNAPI + * does not maintain internal states. This operator does not support the usage pattern in which + * multiple cells are chained and state tensors are propagated. */ ANEURALNETWORKS_UNIDIRECTIONAL_SEQUENCE_LSTM = 92, @@ -4784,8 +5215,16 @@ typedef enum { * it is set to 1, then the output has a shape [maxTime, batchSize, * numUnits], otherwise the output has a shape [batchSize, maxTime, * numUnits]. 
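The bias requirements restated in the two padding variants above boil down to one rule: for QUANT8_ASYMM / QUANT8_ASYMM_SIGNED filters the INT32 bias uses bias_scale == input_scale * filter_scale with zeroPoint 0, while for QUANT8_SYMM_PER_CHANNEL filters the bias operand is declared with scale 0 and the effective per-channel scale is input_scale * filter_scales[channel]. A hedged sketch of how a model builder might derive those scales (names are illustrative):

    #include <stdint.h>

    /* Per-tensor quantized filter: one bias scale for the whole bias vector. */
    static float conv_bias_scale(float input_scale, float filter_scale) {
        return input_scale * filter_scale;     /* the bias zeroPoint must be 0 */
    }

    /* Per-channel quantized filter: one effective bias scale per output channel,
     * even though the bias operand itself is declared with scale 0. */
    static void conv_bias_scales_per_channel(float input_scale,
                                             const float* filter_scales,
                                             uint32_t depth_out,
                                             float* bias_scales_out) {
        for (uint32_t c = 0; c < depth_out; ++c)
            bias_scales_out[c] = input_scale * filter_scales[c];
    }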
+ * * 1: A tensor of shape [batchSize, numUnits] containing hidden state + * from the last time step in the sequence. This output is optional + * and can be omitted. + * Available since API level 30. * * Available since API level 29. + * + * Important: As of API level 29, there is no way to get the output state tensors out and NNAPI + * does not maintain internal states. This operator does not support the usage pattern in which + * multiple cells are chained and state tensors are propagated. */ ANEURALNETWORKS_UNIDIRECTIONAL_SEQUENCE_RNN = 93, @@ -4800,6 +5239,7 @@ typedef enum { * * {@link ANEURALNETWORKS_TENSOR_FLOAT16} * * {@link ANEURALNETWORKS_TENSOR_FLOAT32} * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} + * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} (since API level 30) * * Supported tensor rank: 4, with "NHWC" or "NCHW" data layout. * With the default data layout NHWC, the data is stored in the order of: @@ -4817,6 +5257,17 @@ typedef enum { * height of the output tensor. * * 3: An {@link ANEURALNETWORKS_BOOL} scalar, default to false. * Set to true to specify NCHW data layout for input0 and output0. + * * 4: Align corners. An optional {@link ANEURALNETWORKS_BOOL} + * scalar, default to false. If True, the centers of the 4 corner + * pixels of the input and output tensors are aligned, preserving the + * values at the corner pixels. + * Available since API level 30. + * * 5: Half pixel centers. An optional {@link ANEURALNETWORKS_BOOL} + * scalar, default to false. If True, the pixel centers are assumed to + * be at (0.5, 0.5). This is the default behavior of image.resize in + * TF 2.0. If this parameter is True, then align_corners parameter + * must be False. + * Available since API level 30. * * Inputs (resizing by scale): * * 0: A 4-D tensor, of shape [batches, height, width, depth], specifying @@ -4835,16 +5286,377 @@ typedef enum { * {@link ANEURALNETWORKS_FLOAT32} otherwise. * * 3: An {@link ANEURALNETWORKS_BOOL} scalar, default to false. * Set to true to specify NCHW data layout for input0 and output0. + * * 4: Align corners. An optional {@link ANEURALNETWORKS_BOOL} + * scalar, default to false. If True, the centers of the 4 corner + * pixels of the input and output tensors are aligned, preserving the + * values at the corner pixels. + * Available since API level 30. + * * 5: Half pixel centers. An optional {@link ANEURALNETWORKS_BOOL} + * scalar, default to false. If True, the pixel centers are assumed to + * be at (0.5, 0.5). This is the default behavior of image.resize in + * TF 2.0. If this parameter is True, then align_corners parameter + * must be False. + * Available since API level 30. * * Outputs: * * 0: The output 4-D tensor, of shape * [batches, new_height, new_width, depth]. - * For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} tensor, + * For a {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} and + * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} tensor, * the scale and zeroPoint must be the same as input0. * * Available since API level 29. */ ANEURALNETWORKS_RESIZE_NEAREST_NEIGHBOR = 94, + + // Operations below are available since API level 30. + + /** + * Quantized version of {@link ANEURALNETWORKS_LSTM}. + * + * The input and the output use asymmetric quantized types, while the rest + * use symmetric ones. + * + * Inputs: + * * 0: The input to the LSTM cell. + * Type: {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} + * Shape: [batchSize, inputSize] + * * 1: The input-to-input weights. Optional. 
+ * Type: {@link ANEURALNETWORKS_TENSOR_QUANT8_SYMM} + * Shape: [numUnits, inputSize] + * * 2: The input-to-forget weights. + * Type: {@link ANEURALNETWORKS_TENSOR_QUANT8_SYMM} + * Shape: [numUnits, inputSize] + * * 3: The input-to-cell weights. + * Type: {@link ANEURALNETWORKS_TENSOR_QUANT8_SYMM} + * Shape: [numUnits, inputSize] + * * 4: The input-to-output weights. + * Type: {@link ANEURALNETWORKS_TENSOR_QUANT8_SYMM} + * Shape: [numUnits, inputSize] + * * 5: The recurrent-to-input weights. Optional. + * Type: {@link ANEURALNETWORKS_TENSOR_QUANT8_SYMM} + * Shape: [numUnits, outputSize] + * * 6: The recurrent-to-forget weights. + * Type: {@link ANEURALNETWORKS_TENSOR_QUANT8_SYMM} + * Shape: [numUnits, outputSize] + * * 7: The recurrent-to-cell weights. + * Type: {@link ANEURALNETWORKS_TENSOR_QUANT8_SYMM} + * Shape: [numUnits, outputSize] + * * 8: The recurrent-to-output weights. + * Type: {@link ANEURALNETWORKS_TENSOR_QUANT8_SYMM} + * Shape: [numUnits, outputSize] + * * 9: The cell-to-input weights (for peephole). Optional. + * Type: {@link ANEURALNETWORKS_TENSOR_QUANT16_SYMM} + * Shape: [numUnits] + * * 10: The cell-to-forget weights (for peephole). Optional. + * Type: {@link ANEURALNETWORKS_TENSOR_QUANT16_SYMM} + * Shape: [numUnits] + * * 11: The cell-to-output weights (for peephole). Optional. + * Type: {@link ANEURALNETWORKS_TENSOR_QUANT16_SYMM} + * Shape: [numUnits] + * * 12: The input gate bias. Quantized with scale being the + * product of input and weights scales and zeroPoint equal to 0. + * Optional. + * Type: {@link ANEURALNETWORKS_TENSOR_INT32} + * Shape: [numUnits] + * * 13: The forget gate bias. Quantized with scale being the + * product of input and weights scales and zeroPoint equal to 0. + * Type: {@link ANEURALNETWORKS_TENSOR_INT32} + * Shape: [numUnits] + * * 14: The cell bias. Quantized with scale being the + * product of input and weights scales and zeroPoint equal to 0. + * Type: {@link ANEURALNETWORKS_TENSOR_INT32} + * Shape: [numUnits] + * * 15: The output gate bias. Quantized with scale being the + * product of input and weights scales and zeroPoint equal to 0. + * Type: {@link ANEURALNETWORKS_TENSOR_INT32} + * Shape: [numUnits] + * * 16: The projection weights. Optional. + * Type: {@link ANEURALNETWORKS_TENSOR_QUANT8_SYMM} + * Shape: [outputSize, numUnits] + * * 17: The projection bias. Quantized with scale being the + * product of input and weights scales and zeroPoint equal to 0. + * Optional. + * Type: {@link ANEURALNETWORKS_TENSOR_INT32} + * Shape: [outputSize] + * * 18: The output from the previous time step. + * Type: {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} + * Shape: [batchSize, outputSize] + * * 19: The cell state from the previous time step. + * Type: {@link ANEURALNETWORKS_TENSOR_QUANT16_SYMM} + * Shape: [batchSize, numUnits] + * * 20: The input layer normalization weights. Used to rescale + * normalized inputs to activation at input gate. Optional. + * Type: {@link ANEURALNETWORKS_TENSOR_QUANT16_SYMM} + * Shape: [numUnits] + * * 21: The forget layer normalization weights. Used to + * rescale normalized inputs to activation at forget gate. Optional. + * Type: {@link ANEURALNETWORKS_TENSOR_QUANT16_SYMM} + * Shape: [numUnits] + * * 22: The cell layer normalization weights. Used to rescale + * normalized inputs to activation at cell gate. Optional. + * Type: {@link ANEURALNETWORKS_TENSOR_QUANT16_SYMM} + * Shape: [numUnits] + * * 23: The output layer normalization weights. Used to + * rescale normalized inputs to activation at output gate. 
Optional. + * Type: {@link ANEURALNETWORKS_TENSOR_QUANT16_SYMM} + * Shape: [numUnits] + * * 24: The cell clip. If provided the cell state is clipped + * by this value prior to the cell output activation. Optional. + * Type: {@link ANEURALNETWORKS_FLOAT32}. + * * 25: The projection clip. If provided and projection is enabled, + * this is used for clipping the projected values. Optional. + * Type: {@link ANEURALNETWORKS_FLOAT32}. + * * 26: The scale of the intermediate result of matmul, + * i.e. input to layer normalization, at input gate. + * Type: {@link ANEURALNETWORKS_FLOAT32}. + * * 27: The scale of the intermediate result of matmul, + * i.e. input to layer normalization, at forget gate. + * Type: {@link ANEURALNETWORKS_FLOAT32}. + * * 28: The scale of the intermediate result of matmul, + * i.e. input to layer normalization, at cell gate. + * Type: {@link ANEURALNETWORKS_FLOAT32}. + * * 29: The scale of the intermediate result of matmul, + * i.e. input to layer normalization, at output gate. + * Type: {@link ANEURALNETWORKS_FLOAT32}. + * * 30: The zero point of the hidden state, i.e. input to + * projection. + * Type: {@link ANEURALNETWORKS_INT32}. + * * 31: The scale of the hidden state, i.e. input to + * projection. + * Type: {@link ANEURALNETWORKS_FLOAT32}. + * + * Outputs: + * * 0: The output state (out). + * Type: {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} + * Shape: [batchSize, outputSize] + * * 1: The cell state (out). + * Type: {@link ANEURALNETWORKS_TENSOR_QUANT16_SYMM} + * Shape: [batchSize, numUnits] + * * 2: The output. This is effectively the same as the current + * "output state (out)" value. + * Type: {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} + * Shape: [batchSize, outputSize] + * + * Available since API level 30. + */ + ANEURALNETWORKS_QUANTIZED_LSTM = 95, + + /** + * Executes one of the two referenced models as determined by a boolean + * value. + * + * The inputs and outputs of the two referenced models must agree with the + * signature of this operation. That is, if the operation has (3 + n) inputs + * and m outputs, both models must have n inputs and m outputs with the same + * types, ranks (if specified), dimensions (if specified), scales, + * zeroPoints, and other operand parameters as the corresponding operation + * inputs and outputs. + * + * Inputs: + * * 0: A value of type {@link ANEURALNETWORKS_TENSOR_BOOL8} and shape [1] + * that determines which of the two referenced models to execute. + * The operand must have fully specified dimensions. + * * 1: A {@link ANEURALNETWORKS_MODEL} reference to the model to be + * executed if the condition is true. + * * 2: A {@link ANEURALNETWORKS_MODEL} reference to the model to be + * executed if the condition is false. + * * 3 ~ (n + 2): Inputs to be passed to the model selected for execution. + * + * Outputs: + * * 0 ~ (m - 1): Outputs produced by the selected model. + * + * Available since API level 30. + */ + ANEURALNETWORKS_IF = 96, + + /** + * Executes the body model until the condition model outputs false. + * + * The inputs to this operation are the condition model, the body model, + * and operand values for the first iteration of the loop. The values are + * implicitly split into three groups of input-output, state-only, and + * input-only values, as described below. + * + * The outputs of this operation are the final values of input-output + * operands. + * + * Both the condition and body model receive (m + k + n) inputs. + * * The first m (m >= 1) inputs are input-output operands. 
For the first + * iteration, these are initialized from the corresponding inputs of the + * WHILE operation. In subsequent iterations, their values come from the + * corresponding outputs of the body model produced during the previous + * iteration. + * * The next k (k >= 0) inputs are state-only operands. They are similar to + * the input-output operands, except that their values are no longer + * available after the loop terminates. + * * The last n (n >= 0) inputs are input-only operands. Their values come + * from the corresponding inputs of the WHILE operation. + * + * The body model produces (m + k) outputs. + * * The first m outputs are input-output operands. They become the outputs + * of the WHILE operation when a termination condition is reached. + * * The last k outputs are state-only operands. Their values are no longer + * available after the loop terminates. + * + * The numbers m, k, and n are inferred by the runtime as follows: + * m = (WHILE operation output count) + * k = (body model output count) - m + * n = (body model input count) - m - k + * + * The pseudo-code below illustrates the flow of a WHILE operation with + * inputs condition, body, initial_input_output, initial_state, input_only + * (m = 1, k = 1, n = 1): + * + * input_output = initial_input_output + * state = initial_state + * while condition(input_output, state, input_only): + * input_output, state = body(input_output, state, input_only) + * return input_output + * + * To prevent infinite loops, there is an implicit execution timeout + * associated with each loop ("loop timeout duration"). See {@link + * ANeuralNetworksExecution_setLoopTimeout}. + * + * Inputs: + * * 0: A {@link ANEURALNETWORKS_MODEL} reference to the condition + * model. The model must have (m + k + n) inputs with + * the same types, ranks (if specified), dimensions (if specified), + * scales, zeroPoints, and other operand parameters as the + * corresponding inputs of the WHILE operation and exactly one output + * of {@link ANEURALNETWORKS_TENSOR_BOOL8} and shape [1]. + * The output operand must have fully specified dimensions. + * * 1: A {@link ANEURALNETWORKS_MODEL} reference to the body model. + * The model must have (m + k + n) inputs and (m + k) outputs with + * the same types, ranks (if specified), dimensions (if specified), + * scales, zeroPoints, and other operand parameters as the + * corresponding inputs and outputs of the WHILE operation. + * * (m inputs): Initial values for input-output operands. + * * (k inputs): Initial values for state-only operands. + * * (n inputs): Values for input-only operands. + * + * Outputs: + * * 0 ~ (m - 1): Outputs produced by the loop. + * + * Available since API level 30. + */ + ANEURALNETWORKS_WHILE = 97, + + /** + * Computes exponential linear activation on the input tensor element-wise. + * + * The output is calculated using the following formula: + * + * ELU(x) = max(0, x) + min(0, alpha * (exp(x) - 1)) + * + * Supported tensor {@link OperandCode}: + * * {@link ANEURALNETWORKS_TENSOR_FLOAT16} + * * {@link ANEURALNETWORKS_TENSOR_FLOAT32} + * + * Supported tensor rank: from 1. + * + * Inputs: + * * 0: A tensor, specifying the input. May be zero-sized. + * * 1: A scalar, specifying the alpha parameter. + * For input tensor of {@link ANEURALNETWORKS_TENSOR_FLOAT16}, + * the alpha value must be of {@link ANEURALNETWORKS_FLOAT16}. + * For input tensor of {@link ANEURALNETWORKS_TENSOR_FLOAT32}, + * the alpha value must be of {@link ANEURALNETWORKS_FLOAT32}. 
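The operand grouping for ANEURALNETWORKS_WHILE described above is easiest to see in code. The sketch below mirrors the documented m/k/n inference rules; the struct and function names are purely illustrative and not part of the API.

    #include <stdint.h>

    /* m, k, n as inferred by the runtime from the WHILE outputs and the body model. */
    typedef struct { uint32_t m, k, n; } WhileSplit;

    static WhileSplit while_split(uint32_t while_output_count,
                                  uint32_t body_output_count,
                                  uint32_t body_input_count) {
        WhileSplit s;
        s.m = while_output_count;              /* input-output operands */
        s.k = body_output_count - s.m;         /* state-only operands   */
        s.n = body_input_count - s.m - s.k;    /* input-only operands   */
        return s;
    }

With m = 1, k = 1, n = 1 this matches the pseudo-code above: the body's first m outputs feed back into the next iteration and become the WHILE outputs once the condition model yields false.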
+ * + * Outputs: + * * 0: The output tensor of same shape and type as input0. + * + * Available since API level 30. + */ + ANEURALNETWORKS_ELU = 98, + + /** + * Computes hard-swish activation on the input tensor element-wise. + * + * Hard swish activation is introduced in + * https://arxiv.org/pdf/1905.02244.pdf + * + * The output is calculated using the following formula: + * + * h-swish(x) = x * max(0, min(6, (x + 3))) / 6 + + * Supported tensor {@link OperandCode}: + * * {@link ANEURALNETWORKS_TENSOR_FLOAT16} + * * {@link ANEURALNETWORKS_TENSOR_FLOAT32} + * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} + * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} + * + * Supported tensor rank: from 1. + * + * Inputs: + * * 0: A tensor, specifying the input. May be zero-sized. + * + * Outputs: + * * 0: The output tensor of same shape and type as input0. + * Scale and zero point of this tensor may be different from the input + * tensor's parameters. + * + * Available since API level 30. + */ + ANEURALNETWORKS_HARD_SWISH = 99, + + /** + * Creates a tensor filled with a scalar value. + * + * Supported output tensor {@link OperandCode}: + * * {@link ANEURALNETWORKS_TENSOR_FLOAT16} + * * {@link ANEURALNETWORKS_TENSOR_FLOAT32} + * * {@link ANEURALNETWORKS_TENSOR_INT32} + * + * Supported tensor rank: from 1. + * + * Inputs: + * * 0: A 1-D tensor, specifying the desired output tensor shape. + * * 1: A scalar, specifying the value to fill the output tensors with. + * For output tensor of {@link ANEURALNETWORKS_TENSOR_FLOAT16}, + * the scalar must be of {@link ANEURALNETWORKS_FLOAT16}. + * For output tensor of {@link ANEURALNETWORKS_TENSOR_FLOAT32}, + * the scalar must be of {@link ANEURALNETWORKS_FLOAT32}. + * For output tensor of {@link ANEURALNETWORKS_TENSOR_INT32}, + * the scalar must be of {@link ANEURALNETWORKS_INT32}. + * + * Outputs: + * * 0: The output tensor. + * + * Available since API level 30. + */ + ANEURALNETWORKS_FILL = 100, + + /** + * Returns the rank of a tensor. + * + * The rank of a tensor is the number of dimensions in it. Also known as + * "order", "degree", "ndims". + * + * Supported tensor {@link OperandCode}: + * * {@link ANEURALNETWORKS_TENSOR_FLOAT16} + * * {@link ANEURALNETWORKS_TENSOR_FLOAT32} + * * {@link ANEURALNETWORKS_TENSOR_INT32} + * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM} + * * {@link ANEURALNETWORKS_TENSOR_QUANT16_SYMM} + * * {@link ANEURALNETWORKS_TENSOR_BOOL8} + * * {@link ANEURALNETWORKS_TENSOR_QUANT8_SYMM_PER_CHANNEL} + * * {@link ANEURALNETWORKS_TENSOR_QUANT16_ASYMM} + * * {@link ANEURALNETWORKS_TENSOR_QUANT8_SYMM} + * * {@link ANEURALNETWORKS_TENSOR_QUANT8_ASYMM_SIGNED} + * + * Supported tensor rank: from 1. + * + * Inputs: + * * 0: The input tensor. + * + * Outputs: + * * 0: A scalar of {@link ANEURALNETWORKS_INT32}, specifying the rank + * of the input tensor. + * + * Available since API level 30. + */ + ANEURALNETWORKS_RANK = 101, } OperationCode; /** @@ -4880,10 +5692,11 @@ typedef enum { * the same; for odd number of padding, padding to the ending is bigger * than the padding to the beginning by 1. * - * total_padding is a function of input, stride and filter size. + * total_padding is a function of input, stride, dilation and filter size. 
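The two element-wise activations introduced above, ELU and HARD_SWISH, are defined entirely by their formulas, so a direct float reference implementation can be written down as a sketch (float path only; quantized HARD_SWISH additionally involves the operand scale and zeroPoint as described in its hunk):

    #include <math.h>

    /* ELU(x)     = max(0, x) + min(0, alpha * (exp(x) - 1)) */
    static float elu(float x, float alpha) {
        return fmaxf(0.0f, x) + fminf(0.0f, alpha * (expf(x) - 1.0f));
    }

    /* h-swish(x) = x * max(0, min(6, x + 3)) / 6 */
    static float hard_swish(float x) {
        float relu6 = x + 3.0f;
        if (relu6 < 0.0f) relu6 = 0.0f;
        if (relu6 > 6.0f) relu6 = 6.0f;
        return x * relu6 / 6.0f;
    }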
* It could be computed as follows: - * out_size = (input + stride - 1) / stride; - * needed_input = (out_size - 1) * stride + filter_size + * out_size = (input + stride - 1) / stride + * effective_filter_size = (filter_size - 1) * dilation + 1 + * needed_input = (out_size - 1) * stride + effective_filter_size * total_padding = max(0, needed_input - input_size) * The computation is the same for the horizontal and vertical directions. */ @@ -5004,6 +5817,47 @@ typedef enum { * Failure caused by a device not being available. */ ANEURALNETWORKS_UNAVAILABLE_DEVICE = 9, + + /** + * Failure because a deadline could not be met for a task, but future + * deadlines may still be met for the same task after a short delay. + * + * Available since API level 30. + */ + ANEURALNETWORKS_MISSED_DEADLINE_TRANSIENT = 10, + + /** + * Failure because a deadline could not be met for a task, and future + * deadlines will likely also not be met for the same task even after a + * short delay. + * + * Available since API level 30. + */ + ANEURALNETWORKS_MISSED_DEADLINE_PERSISTENT = 11, + + /** + * Failure because of a resource limitation within the driver, but future + * calls for the same task may still succeed after a short delay. + * + * Available since API level 30. + */ + ANEURALNETWORKS_RESOURCE_EXHAUSTED_TRANSIENT = 12, + + /** + * Failure because of a resource limitation within the driver, and future + * calls for the same task will likely also fail even after a short + * delay. + * + * Available since API level 30. + */ + ANEURALNETWORKS_RESOURCE_EXHAUSTED_PERSISTENT = 13, + + /** + * Failure indicating an object is in a dead state. + * + * Available since API level 30. + */ + ANEURALNETWORKS_DEAD_OBJECT = 14, } ResultCode; /** @@ -5024,6 +5878,48 @@ enum { ANEURALNETWORKS_MAX_SIZE_OF_IMMEDIATELY_COPIED_VALUES = 128 }; enum { ANEURALNETWORKS_BYTE_SIZE_OF_CACHE_TOKEN = 32 }; /** + * Different duration measurements. + * + * Durations are measured in nanoseconds. + * + * Available since API level 29. + */ +typedef enum { + // Execution time on hardware (not driver, which runs on host processor). + ANEURALNETWORKS_DURATION_ON_HARDWARE = 0, + // Execution time in driver (including time on hardware). Excludes overhead + // such as that of the runtime itself and the IPC needed for the runtime to + // communicate with the driver. + ANEURALNETWORKS_DURATION_IN_DRIVER = 1, + // Execution time on hardware, after all dependencies have been signaled. + // If no dependencies specified (for example, if the execution was scheduled other + // than with {@link ANeuralNetworksExecution_startComputeWithDependencies}), the + // reported time will be the same as ANEURALNETWORKS_DURATION_ON_HARDWARE. + // Available since API level 30. + ANEURALNETWORKS_FENCED_DURATION_ON_HARDWARE = 2, + // Execution time in driver, after all dependencies have been signaled. Excludes + // overhead such as that of the runtime itself and the IPC needed for the runtime + // to communicate with the driver. + // If no dependencies specified (for example, if the execution was scheduled other + // than with {@link ANeuralNetworksExecution_startComputeWithDependencies}), the + // reported time will be the same as ANEURALNETWORKS_DURATION_IN_DRIVER. + // Available since API level 30. + ANEURALNETWORKS_FENCED_DURATION_IN_DRIVER = 3, +} DurationCode; + +/** + * Relative execution priority. + * + * Available since API level 30. 
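The corrected ANEURALNETWORKS_PADDING_SAME arithmetic above (now accounting for dilation) can be written out directly for one spatial direction. This is a sketch of the documented computation with hypothetical names, including the stated rule that an odd amount of padding puts the extra element at the end.

    #include <stdint.h>

    /* ANEURALNETWORKS_PADDING_SAME, one spatial direction. */
    static void same_padding(int32_t input_size, int32_t stride, int32_t dilation,
                             int32_t filter_size, int32_t* pad_begin, int32_t* pad_end) {
        int32_t out_size              = (input_size + stride - 1) / stride;
        int32_t effective_filter_size = (filter_size - 1) * dilation + 1;
        int32_t needed_input          = (out_size - 1) * stride + effective_filter_size;
        int32_t total_padding         = needed_input - input_size;
        if (total_padding < 0) total_padding = 0;
        *pad_begin = total_padding / 2;            /* smaller half at the beginning        */
        *pad_end   = total_padding - *pad_begin;   /* odd total: the extra goes to the end */
    }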
+ */ +typedef enum { + ANEURALNETWORKS_PRIORITY_LOW = 90, + ANEURALNETWORKS_PRIORITY_MEDIUM = 100, + ANEURALNETWORKS_PRIORITY_HIGH = 110, + ANEURALNETWORKS_PRIORITY_DEFAULT = ANEURALNETWORKS_PRIORITY_MEDIUM, +} PriorityCode; + +/** * ANeuralNetworksMemory is an opaque type that represents memory. * * This type is used to represent shared memory, memory mapped files, @@ -5049,7 +5945,21 @@ enum { ANEURALNETWORKS_BYTE_SIZE_OF_CACHE_TOKEN = 32 }; * of the element type byte size, e.g., a tensor with * {@link ANEURALNETWORKS_TENSOR_FLOAT32} type must be aligned on 4-byte boundary. * + * It is the application's responsibility to ensure that there are no uses of + * the memory after calling {@link ANeuralNetworksMemory_free}. This includes + * any model which references this memory because of a call to + * {@link ANeuralNetworksModel_setOperandValueFromMemory}, any compilation + * created using such a model, any execution object or burst object created + * using such a compilation, or any execution which references this memory + * because of a call to {@link ANeuralNetworksExecution_setInputFromMemory} or + * {@link ANeuralNetworksExecution_setOutputFromMemory}. + * * Available since API level 27. + * + * Starting at API level 30, the application may request creation of device native memory from + * {@link ANeuralNetworksMemoryDesc} to avoid potential memory copying and transformation + * overhead between executions. See also {@link ANeuralNetworksMemoryDesc} and + * {@link ANeuralNetworksMemory_createFromDesc}. */ typedef struct ANeuralNetworksMemory ANeuralNetworksMemory; @@ -5079,9 +5989,10 @@ typedef struct ANeuralNetworksMemory ANeuralNetworksMemory; * modifies a model at a given time. It is however safe for more than one * thread to use the model once {@link ANeuralNetworksModel_finish} has returned.</p> * - * <p>It is also the application's responsibility to ensure that there are no other - * uses of the model after calling {@link ANeuralNetworksModel_free}. - * This includes any compilation or execution object created using the model.</p> + * <p>It is also the application's responsibility to ensure that there are no + * other uses of the model after calling {@link ANeuralNetworksModel_free}. + * This includes any compilation, execution object or burst object created using + * the model.</p> * * Available since API level 27. */ @@ -5119,7 +6030,10 @@ typedef struct ANeuralNetworksModel ANeuralNetworksModel; * * <p>It is also the application's responsibility to ensure that there are no other * uses of the compilation after calling {@link ANeuralNetworksCompilation_free}. - * This includes any execution object created using the compilation.</p> + * This includes any execution object or burst object created using the compilation, + * or any memory descriptor with the compilation as part of one of the roles specified by + * {@link ANeuralNetworksMemoryDesc_addInputRole} or + * {@link ANeuralNetworksMemoryDesc_addOutputRole}.</p> * * Available since API level 27. 
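The DurationCode values introduced above are consumed by the duration-query entry point that API level 29 provides elsewhere in this header. The snippet below is only an illustrative sketch of reading one of them after an execution has completed; it assumes the usual ANeuralNetworksExecution_getDuration signature and that timing was enabled beforehand with ANeuralNetworksExecution_setMeasureTiming, and the include path is a placeholder for wherever this header is vendored.

    #include <stdint.h>
    #include <stdio.h>
    #include "NeuralNetworks.h"   /* adjust to the vendored location of this header */

    /* Illustrative only: `execution` must already have completed with timing enabled. */
    static int print_hardware_time(const ANeuralNetworksExecution* execution) {
        uint64_t ns = 0;
        int status = ANeuralNetworksExecution_getDuration(
            execution, ANEURALNETWORKS_DURATION_ON_HARDWARE, &ns);
        if (status != ANEURALNETWORKS_NO_ERROR) return status;
        printf("on-hardware time: %llu ns\n", (unsigned long long)ns);
        return ANEURALNETWORKS_NO_ERROR;
    }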
*/ @@ -5139,7 +6053,8 @@ typedef struct ANeuralNetworksCompilation ANeuralNetworksCompilation; * {@link ANeuralNetworksExecution_setOutput} or * {@link ANeuralNetworksExecution_setOutputFromMemory}.</li> * <li>Apply the model with one of the following:</li><ul> - * <li>Asynchronously with {@link ANeuralNetworksExecution_startCompute}, + * <li>Asynchronously with {@link ANeuralNetworksExecution_startCompute} + * or with {@link ANeuralNetworksExecution_startComputeWithDependencies}, * waiting for the execution to complete with * {@link ANeuralNetworksEvent_wait}.</li> * <li>Synchronously with {@link ANeuralNetworksExecution_compute}.</li> @@ -5154,38 +6069,54 @@ typedef struct ANeuralNetworksCompilation ANeuralNetworksCompilation; * ({@link ANeuralNetworksModel_setOperandValueFromMemory}).</p> * * <p>An execution cannot be modified once - * {@link ANeuralNetworksExecution_compute} or - * {@link ANeuralNetworksExecution_startCompute} has been called on it.</p> + * {@link ANeuralNetworksExecution_burstCompute}, + * {@link ANeuralNetworksExecution_compute}, + * {@link ANeuralNetworksExecution_startCompute} or + * {@link ANeuralNetworksExecution_startComputeWithDependencies} has been called on it.</p> * * <p>An execution can be applied to a model with - * {@link ANeuralNetworksExecution_compute} or - * {@link ANeuralNetworksExecution_startCompute} only once. Create new + * {@link ANeuralNetworksExecution_burstCompute}, + * {@link ANeuralNetworksExecution_compute}, + * {@link ANeuralNetworksExecution_startCompute} or + * {@link ANeuralNetworksExecution_startComputeWithDependencies} only once. Create new * executions to do new evaluations of the model.</p> * * <p>It is the application's responsibility to make sure that only one thread * modifies an execution at a given time. It is however safe for more than one * thread to use {@link ANeuralNetworksEvent_wait} at the same time.</p> * + * <p>It is also the application's responsibility to ensure that the execution + * either has never been scheduled or has completed (i.e., that + * {@link ANeuralNetworksExecution_burstCompute}, + * {@link ANeuralNetworksExecution_compute}, or + * {@link ANeuralNetworksEvent_wait} has returned) before calling + * {@link ANeuralNetworksExecution_free}.</p>. + * * <p>It is also the application's responsibility to ensure that there are no other * uses of the execution after calling {@link ANeuralNetworksExecution_free}.</p> * * <p>Multiple executions can be scheduled and evaluated concurrently, either by - * means of {@link ANeuralNetworksExecution_compute} (which is synchronous) in - * different threads or by means of - * {@link ANeuralNetworksExecution_startCompute} (which is asynchronous). The - * runtime makes no guarantee on the ordering of completion of executions. If - * it's important to the application, the application should enforce the - * ordering by ensuring that one execution completes before the next is - * scheduled (for example, by scheduling all executions synchronously within a - * single thread, or by scheduling all executions asynchronously and using - * {@link ANeuralNetworksEvent_wait} between calls to - * {@link ANeuralNetworksExecution_startCompute}).</p> + * means of {@link ANeuralNetworksExecution_compute} or + * {@link ANeuralNetworksExecution_burstCompute} (which are synchronous) in + * different threads, or by means of + * {@link ANeuralNetworksExecution_startCompute} or + * {@link ANeuralNetworksExecution_startComputeWithDependencies} (which are asynchronous). 
+ * (Concurrent uses of {@link ANeuralNetworksExecution_burstCompute} must be on + * different burst objects.) The runtime makes no guarantee on the ordering of + * completion of executions. If it's important to the application, the + * application should enforce the ordering by ensuring that one execution + * completes before the next is scheduled (for example, by scheduling all + * executions synchronously within a single thread, or by scheduling all + * executions asynchronously and using {@link ANeuralNetworksEvent_wait} between + * calls to {@link ANeuralNetworksExecution_startCompute}); or by using + * {@link ANeuralNetworksExecution_startComputeWithDependencies} to make the execution wait for a + * list of events to be signaled before starting the actual evaluation.</p> * * Available since API level 27. */ typedef struct ANeuralNetworksExecution ANeuralNetworksExecution; -#if __ANDROID_API__ >= __ANDROID_API_Q__ +#if __ANDROID_API__ >= 29 /** * Parameters for ANEURALNETWORKS_TENSOR_QUANT8_SYMM_PER_CHANNEL operand. */ @@ -5230,7 +6161,7 @@ typedef struct ANeuralNetworksSymmPerChannelQuantParams { * Available since API level 29. */ typedef struct ANeuralNetworksBurst ANeuralNetworksBurst; -#endif // __ANDROID_API__ >= __ANDROID_API_Q__ +#endif // __ANDROID_API__ >= 29 /** * ANeuralNetworksOperandType describes the type of an operand. @@ -5245,7 +6176,9 @@ typedef struct ANeuralNetworksBurst ANeuralNetworksBurst; * * If a tensor operand's type is not fully specified, the dimensions * of the operand are deduced from the operand types and values of the - * operation for which that operand is an output. + * operation for which that operand is an output or from the corresponding + * {@link ANEURALNETWORKS_IF} or {@link ANEURALNETWORKS_WHILE} operation input + * operand type in the case of referenced model input operands. * * <p>In the following situations, a tensor operand type must be fully * specified:<ul> @@ -5254,16 +6187,25 @@ typedef struct ANeuralNetworksBurst ANeuralNetworksBurst; * non-nullptr buffer) or * {@link ANeuralNetworksModel_setOperandValueFromMemory}.</li> * <li>The operand is a model input (see - * {@link ANeuralNetworksModel_identifyInputsAndOutputs}). A - * fully specified tensor operand type must either be provided - * to {@link ANeuralNetworksModel_addOperand}; or it must be - * provided to the corresponding + * {@link ANeuralNetworksModel_identifyInputsAndOutputs}) of the main + * model within a compilation. A fully specified tensor operand type + * must either be provided to {@link ANeuralNetworksModel_addOperand}; + * or it must be provided to the corresponding * {@link ANeuralNetworksExecution_setInput}, or * {@link ANeuralNetworksExecution_setInputFromMemory}. * EXCEPTION: If the input is optional and omitted * (by passing nullptr for buffer to * {@link ANeuralNetworksExecution_setInput}) then it need - * not have a fully specified tensor operand type.</li></ul> + * not have a fully specified tensor operand type.</li> + * <li>The operand is a model output (see + * {@link ANeuralNetworksModel_identifyInputsAndOutputs}) of the main + * model within a compilation and is to be used with {@link + * ANeuralNetworksExecution_startComputeWithDependencies}. 
+ * A fully specified tensor operand type must either be provided + * to {@link ANeuralNetworksModel_addOperand}; or it must be + * provided to the corresponding + * {@link ANeuralNetworksExecution_setOutput}, or + * {@link ANeuralNetworksExecution_setOutputFromMemory}.</li></ul> * * A tensor operand type of specified rank but some number of * unspecified dimensions is represented by setting dimensionCount to @@ -5296,11 +6238,21 @@ typedef struct ANeuralNetworksOperandType { const uint32_t* dimensions; /** - * These two fields are only used for quantized tensors. - * They must be zero for all other types. - * The dequantized value of each entry is (value - zeroPoint) * scale. + * The quantization scale. + * + * Must be 0 when not applicable to an operand type. + * + * See {@link OperandCode}. */ float scale; + + /** + * The quantization zero point. + * + * Must be 0 when not applicable to an operand type. + * + * See {@link OperandCode}. + */ int32_t zeroPoint; } ANeuralNetworksOperandType; @@ -5314,7 +6266,7 @@ typedef int32_t ANeuralNetworksOperationType; */ typedef struct ANeuralNetworksEvent ANeuralNetworksEvent; -#if __ANDROID_API__ >= __ANDROID_API_Q__ +#if __ANDROID_API__ >= 29 /** * ANeuralNetworksDevice is an opaque type that represents a device. @@ -5326,6 +6278,318 @@ typedef struct ANeuralNetworksEvent ANeuralNetworksEvent; */ typedef struct ANeuralNetworksDevice ANeuralNetworksDevice; +#endif // __ANDROID_API__ >= 29 + +#if __ANDROID_API__ >= 30 + +/** + * ANeuralNetworksMemoryDesc is an opaque type that represents a memory descriptor. + * + * A memory descriptor describes the properties of a memory object, and is used by + * {@link ANeuralNetworksMemory_createFromDesc}. + * + * To use: + * - Create a new memory descriptor by calling {@link ANeuralNetworksMemoryDesc_create}. + * - Specify all of the intended input and output roles by calling + * {@link ANeuralNetworksMemoryDesc_addInputRole} and + * {@link ANeuralNetworksMemoryDesc_addOutputRole}. + * - Optionally, specify the memory dimensions by calling + * {@link ANeuralNetworksMemoryDesc_setDimensions}. + * - Complete the memory descriptor with {@link ANeuralNetworksMemoryDesc_finish}. + * - Use the memory descriptor as many times as needed with + * {@link ANeuralNetworksMemory_createFromDesc}. + * - Destroy the memory descriptor with {@link ANeuralNetworksMemoryDesc_free}. + * + * A memory descriptor is completed by calling {@link ANeuralNetworksMemoryDesc_finish}. + * A memory descriptor is destroyed by calling {@link ANeuralNetworksMemoryDesc_free}. + * + * A memory descriptor must not be modified once {@link ANeuralNetworksMemoryDesc_finish} + * has been called on it. + * + * It is the application's responsibility to make sure that only + * one thread modifies a memory descriptor at a given time. It is however + * safe for more than one thread to use the memory descriptor once + * {@link ANeuralNetworksMemoryDesc_finish} has returned. + * + * It is also the application's responsibility to ensure that there are no other + * uses of the memory descriptor after calling {@link ANeuralNetworksMemoryDesc_free}. + * It is however safe to continue using a {@link ANeuralNetworksMemory} object created + * from the memory descriptor. + * + * Available since API level 30. + */ +typedef struct ANeuralNetworksMemoryDesc ANeuralNetworksMemoryDesc; + +/** + * Create a {@link ANeuralNetworksMemoryDesc} with no properties. + * + * This only creates the memory descriptor. 
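The reorganized scale/zeroPoint field documentation above pairs naturally with a concrete operand type. A minimal sketch, assuming the usual ANeuralNetworksOperandType field set (type, dimensionCount, dimensions, scale, zeroPoint) and with example scale/zeroPoint values chosen purely for illustration:

    #include <stdint.h>
    #include "NeuralNetworks.h"   /* adjust to the vendored location of this header */

    /* A 1x224x224x3 QUANT8_ASYMM tensor: real_value = (quantized - zeroPoint) * scale. */
    static const uint32_t kDims[4] = {1, 224, 224, 3};
    static const ANeuralNetworksOperandType kQuantInput = {
        .type           = ANEURALNETWORKS_TENSOR_QUANT8_ASYMM,
        .dimensionCount = 4,
        .dimensions     = kDims,
        .scale          = 0.0078125f,   /* example scale: 1/128 */
        .zeroPoint      = 128,          /* example zero point   */
    };

For non-quantized operand types, both scale and zeroPoint stay 0, as the field comments above require.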
Its properties should be set with calls to + * {@link ANeuralNetworksMemoryDesc_addInputRole}, + * {@link ANeuralNetworksMemoryDesc_addOutputRole}, and + * {@link ANeuralNetworksMemoryDesc_setDimensions}. + * + * {@link ANeuralNetworksMemoryDesc_finish} must be called once all properties have been set. + * + * {@link ANeuralNetworksMemoryDesc_free} must be called once the memory descriptor + * is no longer needed. + * + * Available since API level 30. + * + * @param desc The {@link ANeuralNetworksMemoryDesc} to be created. + * Set to NULL if unsuccessful. + * + * @return ANEURALNETWORKS_NO_ERROR if successful. + */ +int ANeuralNetworksMemoryDesc_create(ANeuralNetworksMemoryDesc** desc) __INTRODUCED_IN(30); + +/** + * Destroy a memory descriptor. + * + * The memory descriptor need not have been finished by a call to + * {@link ANeuralNetworksMemoryDesc_finish}. + * + * See {@link ANeuralNetworksMemoryDesc} for information on multithreaded usage. + * + * Available since API level 30. + * + * @param desc The memory descriptor to be destroyed. Passing NULL is acceptable and + * results in no operation. + */ +void ANeuralNetworksMemoryDesc_free(ANeuralNetworksMemoryDesc* desc) __INTRODUCED_IN(30); + +/** + * Specify that a memory object will be playing the role of an input to an execution created from a + * particular compilation. + * + * The compilation and the input index fully specify an input operand. This function + * may be invoked multiple times on the same memory descriptor with different input operands, + * and the same input operand may be specified on multiple memory descriptors. However, + * specifying the same input operand on the same memory descriptor more than once will + * return an error. + * + * The dimensions of the corresponding model operands of all the roles specified by + * {@link ANeuralNetworksMemoryDesc_addInputRole} and + * {@link ANeuralNetworksMemoryDesc_addOutputRole} must be compatible with each other. Two + * dimensions are incompatible if both ranks are fully specified but have different values, or if + * there is at least one axis that is fully specified in both but has different values. + * + * At least one of {@link ANeuralNetworksMemoryDesc_addInputRole} and + * {@link ANeuralNetworksMemoryDesc_addOutputRole} must be called on a memory descriptor + * before invoking {@link ANeuralNetworksMemoryDesc_finish}. + * + * Attempting to modify a memory descriptor once {@link ANeuralNetworksMemoryDesc_finish} has been + * called will return an error. + * + * See {@link ANeuralNetworksMemoryDesc} for information on multithreaded usage. + * + * Available since API level 30. + * + * @param desc The memory descriptor to be modified. + * @param compilation The compilation object. It must already have been finished by calling + * {@link ANeuralNetworksCompilation_finish}, and must outlive the memory + * descriptor. + * @param index The index of the input argument we are referencing from the compilation. It is + * an index into the inputs list passed to + * {@link ANeuralNetworksModel_identifyInputsAndOutputs}. It is not + * the index associated with {@link ANeuralNetworksModel_addOperand}. + * @param frequency A floating-point value within the range (0.0, 1.0]. Describes how likely the + * memory is to be used in the specified role. This is provided as a hint to + * optimize the case when different roles prefer different memory locations or data + * layouts. + * + * @return ANEURALNETWORKS_NO_ERROR if successful. 
+ */ +int ANeuralNetworksMemoryDesc_addInputRole(ANeuralNetworksMemoryDesc* desc, + const ANeuralNetworksCompilation* compilation, + uint32_t index, float frequency) __INTRODUCED_IN(30); + +/** + * Specify that a memory object will be playing the role of an output to an execution created from a + * particular compilation. + * + * The compilation and the output index fully specify an output operand. This function + * may be invoked multiple times on the same memory descriptor with different output operands, + * and the same output operand may be specified on multiple memory descriptors. However, + * specifying the same output operand on the same memory descriptor object more than once will + * return an error. + * + * The dimensions of the corresponding model operands of all the roles specified by + * {@link ANeuralNetworksMemoryDesc_addInputRole} and + * {@link ANeuralNetworksMemoryDesc_addOutputRole} must be compatible with each other. Two + * dimensions are incompatible if both ranks are fully specified but have different values, or if + * there is at least one axis that is fully specified in both but has different values. + * + * At least one of {@link ANeuralNetworksMemoryDesc_addInputRole} and + * {@link ANeuralNetworksMemoryDesc_addOutputRole} must be called on the memory descriptor + * before invoking {@link ANeuralNetworksMemoryDesc_finish}. + * + * Attempting to modify a memory descriptor once {@link ANeuralNetworksMemoryDesc_finish} has been + * called will return an error. + * + * See {@link ANeuralNetworksMemoryDesc} for information on multithreaded usage. + * + * Available since API level 30. + * + * @param desc The memory descriptor to be modified. + * @param compilation The compilation object. It must already have been finished by calling + * {@link ANeuralNetworksCompilation_finish}, and must outlive the memory + * descriptor. + * @param index The index of the output argument we are referencing from the compilation. It is + * an index into the outputs list passed to + * {@link ANeuralNetworksModel_identifyInputsAndOutputs}. It is not + * the index associated with {@link ANeuralNetworksModel_addOperand}. + * @param frequency A floating-point value within the range (0.0, 1.0]. Describes how likely the + * memory is to be used in the specified role. This is provided as a hint to + * optimize the case when multiple roles prefer different memory locations or data + * layouts. + * + * @return ANEURALNETWORKS_NO_ERROR if successful. + */ +int ANeuralNetworksMemoryDesc_addOutputRole(ANeuralNetworksMemoryDesc* desc, + const ANeuralNetworksCompilation* compilation, + uint32_t index, float frequency) __INTRODUCED_IN(30); + +/** + * Set the dimensional information of the memory descriptor. + * + * The specified dimensions must be compatible with the dimensions of the corresponding model + * operands of all the roles specified by {@link ANeuralNetworksMemoryDesc_addInputRole} and + * {@link ANeuralNetworksMemoryDesc_addOutputRole}. Two dimensions are incompatible if both ranks + * are fully specified but have different values, or if there is at least one axis that is fully + * specified in both but has different values. + * + * Attempting to modify a memory descriptor once {@link ANeuralNetworksMemoryDesc_finish} has been + * called will return an error. + * + * See {@link ANeuralNetworksMemoryDesc} for information on multithreaded usage. + * + * Available since API level 30. + * + * @param desc The memory descriptor to be modified. + * @param rank The number of dimensions. 
Must be 0 for scalars. + * @param dimensions An array of dimensions. An entry with the value 0 indicates that the + * corresponding axis has an unknown size. + * + * @return ANEURALNETWORKS_NO_ERROR if successful. + */ +int ANeuralNetworksMemoryDesc_setDimensions(ANeuralNetworksMemoryDesc* desc, uint32_t rank, + const uint32_t* dimensions) __INTRODUCED_IN(30); + +/** + * Indicate that we have finished modifying a memory descriptor. Required before calling + * {@link ANeuralNetworksMemory_createFromDesc}. + * + * This function must only be called once for a given memory descriptor. + * + * See {@link ANeuralNetworksMemoryDesc} for information on multithreaded usage. + * + * Available since API level 30. + * + * @param desc The memory descriptor to be finished. + * + * @return ANEURALNETWORKS_NO_ERROR if successful. + */ +int ANeuralNetworksMemoryDesc_finish(ANeuralNetworksMemoryDesc* desc) __INTRODUCED_IN(30); + +/** + * Creates a memory object from a memory descriptor. + * + * The memory object is created with an uninitialized buffer. A memory object with an uninitialized + * buffer may only be used according to the roles specified by {@link + * ANeuralNetworksMemoryDesc_addOutputRole}, or as the destination memory in {@link + * ANeuralNetworksMemory_copy}. The buffer of a memory object is initialized after the memory object + * is used as an output in a successful execution, or used as the destination memory in a successful + * {@link ANeuralNetworksMemory_copy}. A memory object with an initialized buffer may be used + * according to all roles specified in {@link ANeuralNetworksMemoryDesc}, or as the source or + * destination memory in {@link ANeuralNetworksMemory_copy}. The buffer of a memory object will + * return to the uninitialized state if the memory object is used as an output in a failed + * execution, or used as the destination memory in a failed {@link ANeuralNetworksMemory_copy}. + * + * The dimensions of the memory descriptor are deduced from the dimensions of the corresponding + * model operands of all the roles specified by {@link ANeuralNetworksMemoryDesc_addInputRole} and + * {@link ANeuralNetworksMemoryDesc_addOutputRole}, as well as the dimensions set by the call to + * {@link ANeuralNetworksMemoryDesc_setDimensions}, if any. The memory descriptor may have + * unspecified dimensions or rank. In such a case, the same memory object may be used with different + * shapes of outputs in different executions. When the memory is used as an input, the input shape + * must be the same as the output shape from the last execution using this memory object as an + * output, or the last {@link ANeuralNetworkMemory_copy} using this memory object as the destination + * memory. Creating a memory object with unspecified dimensions or rank may fail for certain sets of + * roles. + * + * Using the memory in roles or shapes that are not compatible with the rules specified above will + * return an error. + * + * When calling {@link ANeuralNetworksExecution_setInputFromMemory} or + * {@link ANeuralNetworksExecution_setOutputFromMemory} with the memory object, + * both offset and length must be set to zero and the entire memory region will be + * associated with the specified input or output operand. + * + * Calling {@link ANeuralNetworksModel_setOperandValueFromMemory} with the memory created from this + * function will return an error. + * + * {@link ANeuralNetworksMemory_free} must be called once the memory is no longer needed. 
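Before the remaining notes on ANeuralNetworksMemory_createFromDesc, a minimal usage sketch of the memory descriptor calls documented above. It assumes two already finished compilations, here called producer and consumer, whose output 0 and input 0 share a hypothetical {1, 4} float tensor; the function name and the reduced error handling are illustrative only:

#include <android/NeuralNetworks.h>

// Sketch: build a device-preferred memory object that feeds output 0 of one
// compilation into input 0 of another (both assumed to be finished).
ANeuralNetworksMemory* createSharedTensor(const ANeuralNetworksCompilation* producer,
                                          const ANeuralNetworksCompilation* consumer) {
  ANeuralNetworksMemoryDesc* desc = nullptr;
  if (ANeuralNetworksMemoryDesc_create(&desc) != ANEURALNETWORKS_NO_ERROR) return nullptr;

  // Declare the roles the memory will play; frequency 1.0f means "always used".
  ANeuralNetworksMemoryDesc_addOutputRole(desc, producer, /*index=*/0, /*frequency=*/1.0f);
  ANeuralNetworksMemoryDesc_addInputRole(desc, consumer, /*index=*/0, /*frequency=*/1.0f);

  // Optional: pin the shape (hypothetical {1, 4}); a 0 entry would mean "unknown axis".
  const uint32_t dims[] = {1, 4};
  ANeuralNetworksMemoryDesc_setDimensions(desc, 2, dims);

  ANeuralNetworksMemory* memory = nullptr;
  if (ANeuralNetworksMemoryDesc_finish(desc) == ANEURALNETWORKS_NO_ERROR) {
    // May still fail for role sets that cannot work with these dimensions.
    ANeuralNetworksMemory_createFromDesc(desc, &memory);
  }
  // The descriptor does not need to outlive the memory object.
  ANeuralNetworksMemoryDesc_free(desc);
  return memory;  // caller frees with ANeuralNetworksMemory_free
}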
+ * + * Attempting to create memory from an unfinished memory descriptor will return an error. + * + * The provided {@link ANeuralNetworksMemoryDesc} need not outlive the {@link ANeuralNetworksMemory} + * object. + * + * Available since API level 30. + * + * @param desc The memory descriptor. + * @param memory The memory object to be created. + * Set to NULL if unsuccessful. + * + * @return ANEURALNETWORKS_NO_ERROR if successful; ANEURALNETWORKS_OP_FAILED if the memory is + * created with unspecified dimensions or rank and it is not supported for this set of + * roles. + */ +int ANeuralNetworksMemory_createFromDesc(const ANeuralNetworksMemoryDesc* desc, + ANeuralNetworksMemory** memory) __INTRODUCED_IN(30); + +/** + * Copies data from one memory object to another. + * + * If at most one of the src and dst is created from {@link ANeuralNetworksMemory_createFromDesc}, + * the src and dst must have the same logical size: + * - If the memory is created from {@link ANeuralNetworksMemory_createFromFd}, or if it is created + * from {@link ANeuralNetworksMemory_createFromAHardwareBuffer} with format of + * AHARDWAREBUFFER_FORMAT_BLOB, the logical size equals the size of the memory. + * - If the memory is created from {@link ANeuralNetworksMemory_createFromAHardwareBuffer} with a + * format other than AHARDWAREBUFFER_FORMAT_BLOB, the logical size equals the size when there is + * no padding and the data is tightly packed. This function may fail if the AHardwareBuffer + * cannot be accessed. + * - If the memory is created from {@link ANeuralNetworksMemory_createFromDesc}, the logical size + * equals the size indicated by the {@link OperandCode} multiplied by the number of elements. This + * function will fail if the number of elements is unknown. + * + * If both src and dst are created from {@link ANeuralNetworksMemory_createFromDesc}, they must have + * compatible dimensions. Two dimensions are incompatible if both ranks are fully specified but + * have different values, or if there is at least one axis that is fully specified in both but has + * different values. The dst may have unspecified dimensions or rank. In such a case, the dimensions + * of dst will get updated according to the dimensions of the src. + * + * In both cases, if the src is created from {@link ANeuralNetworksMemory_createFromDesc}, it must + * have been used as an output in a successful execution, or used as the destination memory in a + * successful {@link ANeuralNetworksMemory_copy}. + * + * The src and dst may have different data layout, in which case the data copying is performed + * logically with data layout transformation. + * + * Available since API level 30. + * + * @param src The source memory object. + * @param dst The destination memory object. + * + * @return ANEURALNETWORKS_NO_ERROR if successful. + */ +int ANeuralNetworksMemory_copy(const ANeuralNetworksMemory* src, const ANeuralNetworksMemory* dst) + __INTRODUCED_IN(30); + +#endif // __ANDROID_API__ >= 30 + +#if __ANDROID_API__ >= 29 + /** * Get the number of available devices. * @@ -5359,7 +6623,8 @@ int ANeuralNetworks_getDevice(uint32_t devIndex, ANeuralNetworksDevice** device) * @param device The representation of the specified device. * @param name The returned name of the specified device. The name will be in UTF-8 * and will be null-terminated. It will be recognizable as a known device name - * rather than a cryptic string. For devices with feature level 29 and above, the + * rather than a cryptic string. 
For devices with feature level reported by + * {@link ANeuralNetworksDevice_getFeatureLevel} that is 29 and above, the * format of the name is {VENDOR}-{DEVICE}. For devices with feature level 28 * or lower, the format of the name is undefined. * The name will remain valid for the duration of the application. @@ -5439,6 +6704,26 @@ int ANeuralNetworksDevice_getVersion(const ANeuralNetworksDevice* device, const int ANeuralNetworksDevice_getFeatureLevel(const ANeuralNetworksDevice* device, int64_t* featureLevel) __INTRODUCED_IN(29); +#if __ANDROID_API__ >= 30 + +/** + * Wait until the device is in a live state. + * + * A device may encounter internal errors and temporarily enter a dead state. A + * call that uses a device in such a state will return with the error + * {@link ANEURALNETWORKS_DEAD_OBJECT}. ANeuralNetworksDevice_wait will block until + * the device is in a live state. + * + * @param device The representation of the specified device. + * + * @return ANEURALNETWORKS_NO_ERROR if successful. + * + * Available since API level 30. + */ +int ANeuralNetworksDevice_wait(const ANeuralNetworksDevice* device) __INTRODUCED_IN(30); + +#endif // __ANDROID_API__ >= 30 + /** * Get the supported operations for a specified set of devices. If multiple devices * are selected, the supported operation list is a union of supported operations of all @@ -5473,6 +6758,10 @@ int ANeuralNetworksModel_getSupportedOperationsForDevices( * ANeuralNetworksCompilation_create}, where the runtime will attempt to recover * from such failures. * + * The model passed to this function is termed the "main model" of the + * compilation, to distinguish it from other models referred to by an Operand + * of type {@link ANEURALNETWORKS_MODEL} within this compilation. + * * @param model The {@link ANeuralNetworksModel} to be compiled. * @param devices The set of devices. Must not contain duplicates. * @param numDevices The number of devices in the set. @@ -5502,7 +6791,7 @@ int ANeuralNetworksCompilation_createForDevices(ANeuralNetworksModel* model, * data. It is recommended to use the code cache directory provided * by the Android runtime. If not using the code cache directory, the * user should choose a directory local to the application, and is - * responsible to managing the cache entries. + * responsible for managing the cache entries. * @param token The token provided by the user to specify a model must be of length * ANEURALNETWORKS_BYTE_SIZE_OF_CACHE_TOKEN. The user should ensure that * the token is unique to a model within the application. The NNAPI @@ -5525,10 +6814,24 @@ int ANeuralNetworksCompilation_setCaching(ANeuralNetworksCompilation* compilatio * execution has completed and the outputs are ready to be consumed. * </p> * + * If {@link ANeuralNetworksExecution_setTimeout} was called on this execution, + * and the execution is not able to complete before the timeout duration is + * exceeded, then execution may be aborted, in which case + * {@link ANEURALNETWORKS_MISSED_DEADLINE_*} will be returned. If the device has + * a feature level reported by {@link ANeuralNetworksDevice_getFeatureLevel} + * that is lower than 30, then the timeout duration hint will be ignored. + * + * If this execution contains a {@link ANEURALNETWORKS_WHILE} operation, and + * the condition model does not output false within the loop timeout duration, + * then execution will be aborted and {@link ANEURALNETWORKS_MISSED_DEADLINE_*} + * will be returned. + * * See {@link ANeuralNetworksExecution} for information on multithreaded usage. 
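Stepping back to the device and compilation functions earlier in this hunk, a sketch of how they are typically combined: enumerate devices, filter on the feature level that the timeout notes refer to, and compile for a single device. The feature-level threshold of 30, the helper name and the fallback error code are illustrative assumptions:

#include <android/NeuralNetworks.h>

// Sketch: compile a finished model on the first device that reports feature level >= 30.
int compileOnApi30Device(ANeuralNetworksModel* model, ANeuralNetworksCompilation** compilation) {
  uint32_t count = 0;
  ANeuralNetworks_getDeviceCount(&count);
  for (uint32_t i = 0; i < count; ++i) {
    ANeuralNetworksDevice* device = nullptr;
    ANeuralNetworks_getDevice(i, &device);

    int64_t featureLevel = 0;
    ANeuralNetworksDevice_getFeatureLevel(device, &featureLevel);
    if (featureLevel < 30) continue;

    // Optional: block until the device has recovered from a transient dead state.
    ANeuralNetworksDevice_wait(device);

    const ANeuralNetworksDevice* devices[] = {device};
    return ANeuralNetworksCompilation_createForDevices(model, devices, 1, compilation);
  }
  return ANEURALNETWORKS_BAD_DATA;  // no suitable device found (sketch-level choice)
}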
* - * See {@link ANeuralNetworksExecution_startCompute} for asynchronous execution. - * Synchronous execution incurs lower overhead than asynchronous execution. + * See {@link ANeuralNetworksExecution_burstCompute} for burst synchronous execution. + * See {@link ANeuralNetworksExecution_startCompute} for regular asynchronous execution. + * See {@link ANeuralNetworksExecution_startComputeWithDependencies} for + * asynchronous execution with dependencies. * * Available since API level 29. * @@ -5544,9 +6847,10 @@ int ANeuralNetworksExecution_compute(ANeuralNetworksExecution* execution) __INTR * Get the dimensional information of the specified output operand of the model of the * {@link ANeuralNetworksExecution}. * - * On asynchronous execution initiated by {@link ANeuralNetworksExecution_startCompute}, - * {@link ANeuralNetworksEvent_wait} must be called prior to this function to recuperate - * the resources used by the execution. + * The execution must have completed. On asynchronous execution initiated by + * {@link ANeuralNetworksExecution_startCompute} or + * {@link ANeuralNetworksExecution_startComputeWithDependencies}, + * {@link ANeuralNetworksEvent_wait} must be called prior to this function. * * @param execution The execution to be queried. * @param index The index of the output argument we are querying. It is @@ -5569,9 +6873,10 @@ int ANeuralNetworksExecution_getOutputOperandRank(ANeuralNetworksExecution* exec * Get the dimensional information of the specified output operand of the model of the * {@link ANeuralNetworksExecution}. The target output operand cannot be a scalar. * - * On asynchronous execution initiated by {@link ANeuralNetworksExecution_startCompute}, - * {@link ANeuralNetworksEvent_wait} must be called prior to this function to recuperate - * the resources used by the execution. + * The execution must have completed. On asynchronous execution initiated by + * {@link ANeuralNetworksExecution_startCompute} or + * {@link ANeuralNetworksExecution_startComputeWithDependencies}, + * {@link ANeuralNetworksEvent_wait} must be called prior to this function. * * @param execution The execution to be queried. * @param index The index of the output argument we are querying. It is an index into the lists @@ -5625,11 +6930,28 @@ void ANeuralNetworksBurst_free(ANeuralNetworksBurst* burst) __INTRODUCED_IN(29); * <p>Schedules synchronous evaluation of the execution. Returns once the * execution has completed and the outputs are ready to be consumed.</p> * + * If {@link ANeuralNetworksExecution_setTimeout} was called on the execution, + * and the execution is not able to complete before the timeout duration is + * exceeded, then execution may be aborted, in which case + * {@link ANEURALNETWORKS_MISSED_DEADLINE_*} will be returned. + * + * If the execution contains a {@link ANEURALNETWORKS_WHILE} operation, and + * the condition model does not output false within the loop timeout duration, + * then execution will be aborted and {@link ANEURALNETWORKS_MISSED_DEADLINE_*} + * will be returned. If the device has a feature level reported by + * {@link ANeuralNetworksDevice_getFeatureLevel} that is lower than 30, then the + * timeout duration hint will be ignored. + * * <p>There must be at most one {@link ANeuralNetworksExecution} processing at * any given time for any given burst object. Any * {@link ANeuralNetworksExecution} launched before the previous has finished * will result in ANEURALNETWORKS_BAD_STATE.</p> * + * See {@link ANeuralNetworksExecution_compute} for synchronous execution. 
+ * See {@link ANeuralNetworksExecution_startCompute} for regular asynchronous execution. + * See {@link ANeuralNetworksExecution_startComputeWithDependencies} for + * asynchronous execution with dependencies. + * * Available since API level 29. * * @param burst The burst object to execute on. @@ -5656,14 +6978,14 @@ int ANeuralNetworksExecution_burstCompute(ANeuralNetworksExecution* execution, * offset and length must be set to zero and the entire memory region will be * associated with the specified input or output operand. There is no guarantee * that an arbitrary AHardwareBuffer_Format and AHardwareBuffer_UsageFlags combination - * can be used by arbitrary devices. The execution will fail if selected set of devices - * cannot consume the buffer. + * can be used by arbitrary devices. The execution will fail if the selected set of + * devices cannot consume the buffer. * * Calling {@link ANeuralNetworksModel_setOperandValueFromMemory} with shared memory * backed by an AHardwareBuffer of a format other than AHARDWAREBUFFER_FORMAT_BLOB is * disallowed. * - * TODO(miaowang): add documentation about intended usage with introspection API. + * The provided AHardwareBuffer must outlive the ANeuralNetworksMemory object. * * Available since API level 29. * @@ -5686,8 +7008,12 @@ int ANeuralNetworksMemory_createFromAHardwareBuffer(const AHardwareBuffer* ahwb, * * By default, duration is not measured. * - * The {@link ANeuralNetworksExecution} must have been created with + * The {@link ANeuralNetworksExecution} must have been created from an + * {@link ANeuralNetworksCompilation} which in turn was created from * {@link ANeuralNetworksCompilation_createForDevices} with numDevices = 1. + * If the device has a feature level reported by + * {@link ANeuralNetworksDevice_getFeatureLevel} that is lower than 29, then the + * duration will not be measured. * * See {@link ANeuralNetworksExecution} for information on multithreaded usage. * @@ -5702,41 +7028,32 @@ int ANeuralNetworksExecution_setMeasureTiming(ANeuralNetworksExecution* executio __INTRODUCED_IN(29); /** - * Different duration measurements. - * - * Durations are measured in nanoseconds. - * - * Available since API level 29. - */ -typedef enum { - // Execution time on hardware (not driver, which runs on host processor). - ANEURALNETWORKS_DURATION_ON_HARDWARE = 0, - // Execution time in driver (including time on hardware). Excludes overhead - // such as that of the runtime itself and the IPC needed for the runtime to - // communicate with the driver. - ANEURALNETWORKS_DURATION_IN_DRIVER = 1, -} DurationCode; - -/** * Get the time spent in the specified {@link ANeuralNetworksExecution}, in nanoseconds. - * The execution must have completed. * - * Available since API level 29. + * The execution must have completed. On asynchronous execution initiated by + * {@link ANeuralNetworksExecution_startCompute} or + * {@link ANeuralNetworksExecution_startComputeWithDependencies}, + * {@link ANeuralNetworksEvent_wait} must be called prior to this function. * * @param execution The execution to be queried. * @param durationCode The measurement to be queried, specified by {@link DurationCode}. * @param duration The returned duration. If no measurement was requested by - * {@link ANeuralNetworksExecution_setMeasureTiming}, or for some other - * reason the duration is not available, UINT64_MAX will be returned. - * A particular device need not support any given measurement. 
+ * {@link ANeuralNetworksExecution_setMeasureTiming}, if the + * device is has a feature level reported by + * {@link ANeuralNetworksDevice_getFeatureLevel} that is lower + * than 29, or for some other reason the duration is not + * available, UINT64_MAX will be returned. A particular device + * need not support any given measurement. * * @return ANEURALNETWORKS_NO_ERROR if successful. + * + * Available since API level 29. */ int ANeuralNetworksExecution_getDuration(const ANeuralNetworksExecution* execution, int32_t durationCode, uint64_t* duration) __INTRODUCED_IN(29); -#endif // __ANDROID_API__ >= __ANDROID_API_Q__ +#endif // __ANDROID_API__ >= 29 #if __ANDROID_API__ >= 27 @@ -5776,7 +7093,8 @@ int ANeuralNetworksMemory_createFromFd(size_t size, int protect, int fd, size_t * * Available since API level 27. * - * @param memory The memory object to be freed. + * @param memory The memory object to be freed. Passing NULL is acceptable and + * results in no operation. */ void ANeuralNetworksMemory_free(ANeuralNetworksMemory* memory) __INTRODUCED_IN(27); @@ -5784,8 +7102,10 @@ void ANeuralNetworksMemory_free(ANeuralNetworksMemory* memory) __INTRODUCED_IN(2 * Create an empty {@link ANeuralNetworksModel}. * * <p>This only creates the object. Computation is performed once - * {@link ANeuralNetworksExecution_compute} or - * {@link ANeuralNetworksExecution_startCompute} is invoked. + * {@link ANeuralNetworksExecution_burstCompute}, + * {@link ANeuralNetworksExecution_compute}, + * {@link ANeuralNetworksExecution_startCompute} or + * {@link ANeuralNetworksExecution_startComputeWithDependencies} is invoked. * * The model should be constructed with calls to * {@link ANeuralNetworksModel_addOperation} and @@ -5826,8 +7146,8 @@ void ANeuralNetworksModel_free(ANeuralNetworksModel* model) __INTRODUCED_IN(27); * calling {@link ANeuralNetworksCompilation_create} and * {@link ANeuralNetworksCompilation_createForDevices}. * - * An application is responsible to make sure that no other thread uses - * the model at the same time. + * An application must ensure that no other thread uses the model at the same + * time. * * This function must only be called once for a given model. * @@ -5901,11 +7221,13 @@ int ANeuralNetworksModel_addOperand(ANeuralNetworksModel* model, * {@link ANEURALNETWORKS_MAX_SIZE_OF_IMMEDIATELY_COPIED_VALUES} * are immediately copied into the model. * - * For values of length greater than {@link ANEURALNETWORKS_MAX_SIZE_OF_IMMEDIATELY_COPIED_VALUES}, - * a pointer to the buffer is stored within the model. The application is responsible - * for not changing the content of this region until all executions using this model - * have completed. As the data may be copied during processing, modifying the data - * after this call yields undefined results. + * For values of length greater than + * {@link ANEURALNETWORKS_MAX_SIZE_OF_IMMEDIATELY_COPIED_VALUES}, a pointer to + * the buffer is stored within the model. The application must not change the + * content of this region until all executions using this model have + * completed. As the data may be copied during processing, modifying the data + * after this call yields undefined results. The provided buffer must outlive + * this model. * * For large tensors, using {@link ANeuralNetworksModel_setOperandValueFromMemory} * is likely to be more efficient. 
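Following the note above about large tensors, a short sketch contrasting a small constant that is copied into the model with a large constant referenced through a memory object. The operand indices, the file descriptor and the byte count are placeholders, and the helper name is hypothetical:

#include <sys/mman.h>
#include <android/NeuralNetworks.h>

// Sketch: small constants are copied into the model, large ones are referenced
// through a memory object (an mmap-able fd here); the memory must outlive the model.
int setConstants(ANeuralNetworksModel* model, int weightsFd, size_t weightsBytes) {
  // Small value: at most ANEURALNETWORKS_MAX_SIZE_OF_IMMEDIATELY_COPIED_VALUES bytes, copied now.
  const int32_t axis = 1;
  ANeuralNetworksModel_setOperandValue(model, /*index=*/2, &axis, sizeof(axis));

  // Large tensor: referenced, not copied.
  ANeuralNetworksMemory* weights = nullptr;
  int status = ANeuralNetworksMemory_createFromFd(weightsBytes, PROT_READ, weightsFd,
                                                  /*offset=*/0, &weights);
  if (status != ANEURALNETWORKS_NO_ERROR) return status;
  return ANeuralNetworksModel_setOperandValueFromMemory(model, /*index=*/1, weights,
                                                        /*offset=*/0, weightsBytes);
}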
@@ -5930,7 +7252,7 @@ int ANeuralNetworksModel_addOperand(ANeuralNetworksModel* model, int ANeuralNetworksModel_setOperandValue(ANeuralNetworksModel* model, int32_t index, const void* buffer, size_t length) __INTRODUCED_IN(27); -#if __ANDROID_API__ >= __ANDROID_API_Q__ +#if __ANDROID_API__ >= 29 /** * Sets an operand's per channel quantization parameters. @@ -5955,28 +7277,33 @@ int ANeuralNetworksModel_setOperandSymmPerChannelQuantParams( ANeuralNetworksModel* model, int32_t index, const ANeuralNetworksSymmPerChannelQuantParams* channelQuant) __INTRODUCED_IN(29); -#endif // __ANDROID_API__ >= __ANDROID_API_Q__ +#endif // __ANDROID_API__ >= 29 /** * Sets an operand to a value stored in a memory object. * * The content of the memory is not copied. A reference to that memory is stored - * inside the model. The application is responsible for not changing the content - * of the memory region until all executions using this model have completed. - * As the data may be copied during processing, modifying the data after this call - * yields undefined results. + * inside the model. The application must not change the content of the memory + * region until all executions using this model have completed. As the data may + * be copied during processing, modifying the data after this call yields + * undefined results. + * + * <p>The provided memory must outlive this model.</p> * * To indicate that an optional operand should be considered missing, * use {@link ANeuralNetworksModel_setOperandValue} instead, passing nullptr for buffer. * - * Is disallowed to set an operand value with shared memory backed by an AHardwareBuffer + * It is disallowed to set an operand value with shared memory backed by an AHardwareBuffer * of a format other than AHARDWAREBUFFER_FORMAT_BLOB. * + * It is disallowed to set an operand value with memory created from + * {@link ANeuralNetworksMemory_createFromDesc}. + * * Attempting to modify a model once {@link ANeuralNetworksModel_finish} has been * called will return an error. * * See {@link ANeuralNetworksModel} for information on multithreaded usage. - * See {@link ANeuralNetworksMemory_createFromAHardwarBuffer} for information on + * See {@link ANeuralNetworksMemory_createFromAHardwareBuffer} for information on * AHardwareBuffer usage. * * Available since API level 27. @@ -5996,6 +7323,39 @@ int ANeuralNetworksModel_setOperandValueFromMemory(ANeuralNetworksModel* model, size_t offset, size_t length) __INTRODUCED_IN(27); +#if __ANDROID_API__ >= 30 + +/** + * Sets an operand to a value that is a reference to another NNAPI model. + * + * The referenced model must already have been finished by a call to + * {@link ANeuralNetworksModel_finish}. + * + * The {@link ANeuralNetworksModel_relaxComputationFloat32toFloat16} setting of + * referenced models is overridden by that setting of the main model of a + * compilation. + * + * The referenced model must outlive the model referring to it. + * + * Attempting to modify a model once {@link ANeuralNetworksModel_finish} has + * been called will return an error. + * + * See {@link ANeuralNetworksModel} for information on multithreaded usage. + * + * Available since API level 30. + * + * @param model The model to be modified. + * @param index The index of the model operand we're setting. + * @param value The model to be referenced. + * + * @return ANEURALNETWORKS_NO_ERROR if successful. 
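A sketch of the model-reference operand described above (the declaration follows just below). It assumes bodyModel has already been finished and that the operand added here ends up at the hypothetical index 3; the function name is illustrative:

#include <android/NeuralNetworks.h>

// Sketch: add an operand of type ANEURALNETWORKS_MODEL and point it at another model,
// e.g. the body of a control-flow construct. The referenced model must outlive mainModel.
int referenceSubModel(ANeuralNetworksModel* mainModel, const ANeuralNetworksModel* bodyModel) {
  const ANeuralNetworksOperandType modelOperand = {ANEURALNETWORKS_MODEL, 0, nullptr, 0.0f, 0};
  int status = ANeuralNetworksModel_addOperand(mainModel, &modelOperand);
  if (status != ANEURALNETWORKS_NO_ERROR) return status;
  return ANeuralNetworksModel_setOperandValueFromModel(mainModel, /*index=*/3, bodyModel);
}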
+ */ +int ANeuralNetworksModel_setOperandValueFromModel(ANeuralNetworksModel* model, int32_t index, + const ANeuralNetworksModel* value) + __INTRODUCED_IN(30); + +#endif // __ANDROID_API__ >= 30 + /** * Add an operation to a model. * @@ -6060,6 +7420,9 @@ int ANeuralNetworksModel_identifyInputsAndOutputs(ANeuralNetworksModel* model, u * must be calculated using at least the range and precision of the IEEE 754 * 32-bit floating-point format. * + * The relaxComputationFloat32toFloat16 setting of the main model of + * a compilation overrides the values of the referenced models. + * * @param model The model to be modified. * @param allow 'true' indicates {@link ANEURALNETWORKS_TENSOR_FLOAT32} may be * calculated with range and/or precision as low as that of the @@ -6083,7 +7446,11 @@ int ANeuralNetworksModel_relaxComputationFloat32toFloat16(ANeuralNetworksModel* /** * Create a {@link ANeuralNetworksCompilation} to compile the given model. * - * <p>This only creates the object. Compilation is only performed once + * The model passed to this function is termed the "main model" of the + * compilation, to distinguish it from other models referred to by an Operand + * of type {@link ANEURALNETWORKS_MODEL} within this compilation. + * + * <p>This function only creates the object. Compilation is only performed once * {@link ANeuralNetworksCompilation_finish} is invoked.</p> * * <p>{@link ANeuralNetworksCompilation_finish} should be called once @@ -6114,7 +7481,7 @@ int ANeuralNetworksCompilation_create(ANeuralNetworksModel* model, * Destroy a compilation. * * The compilation need not have been finished by a call to - * {@link ANeuralNetworksModel_finish}. + * {@link ANeuralNetworksCompilation_finish}. * * See {@link ANeuralNetworksCompilation} for information on multithreaded usage. * @@ -6128,7 +7495,8 @@ void ANeuralNetworksCompilation_free(ANeuralNetworksCompilation* compilation) __ /** * Sets the execution preference. * - * <p>Provides guidance to the runtime when trade-offs are possible.</p> + * <p>Provides guidance to the runtime when trade-offs are possible. By default the runtime + * uses PREFER_SINGLE_FAST_ANSWER</p> * * See {@link ANeuralNetworksCompilation} for information on multithreaded usage. * @@ -6146,13 +7514,19 @@ int ANeuralNetworksCompilation_setPreference(ANeuralNetworksCompilation* compila /** * Indicate that we have finished modifying a compilation. Required before - * calling {@link ANeuralNetworksExecution_create}. + * calling {@link ANeuralNetworksBurst_create} or + * {@link ANeuralNetworksExecution_create}. * - * An application is responsible to make sure that no other thread uses - * the compilation at the same time. + * An application must ensure that no other thread uses the compilation at the + * same time. * * This function must only be called once for a given compilation. * + * If {@link ANeuralNetworksCompilation_setTimeout} was called on this + * compilation, and the compilation is not able to be finished before the + * timeout duration is exceeded, then compilation may be aborted, in which case + * {@link ANEURALNETWORKS_MISSED_DEADLINE_*} will be returned. + * * See {@link ANeuralNetworksCompilation} for information on multithreaded usage. * * Available since API level 27. @@ -6163,11 +7537,85 @@ int ANeuralNetworksCompilation_setPreference(ANeuralNetworksCompilation* compila */ int ANeuralNetworksCompilation_finish(ANeuralNetworksCompilation* compilation) __INTRODUCED_IN(27); +#if __ANDROID_API__ >= 30 + +/** + * Set the execution priority. 
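Before the details of the priority and timeout hints below, a sketch of how the compilation-side knobs in this area fit together (preference, priority, timeout, finish). It assumes the compilation was created with ANeuralNetworksCompilation_createForDevices for a single device, since the timeout hint is otherwise rejected, and the concrete preference, priority and duration values are arbitrary:

#include <android/NeuralNetworks.h>

// Sketch: configure and finish a compilation created for exactly one device.
int finishTunedCompilation(ANeuralNetworksCompilation* compilation) {
  // Trade-off hint; by default the runtime uses the fast-single-answer preference.
  ANeuralNetworksCompilation_setPreference(compilation, ANEURALNETWORKS_PREFER_SUSTAINED_SPEED);

  // API 30+: relative priority among this application's own executions.
  ANeuralNetworksCompilation_setPriority(compilation, ANEURALNETWORKS_PRIORITY_HIGH);

  // API 30+: hint that compilation should finish within roughly 1 s (nanoseconds).
  ANeuralNetworksCompilation_setTimeout(compilation, 1000000000ull);

  // No further modifications are allowed after this call.
  return ANeuralNetworksCompilation_finish(compilation);
}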
+ * + * Execution priorities are relative to other executions created by the same + * application (specifically same uid) for the same device. Specifically, + * priorities of executions from one application will not affect executions from + * another application. Similarly, priorities of executions on one device will + * not affect executions on another device. + * + * Higher priority executions may use more compute resources than lower priority + * executions, and may preempt or starve lower priority executions. + * + * See {@link ANeuralNetworksCompilation} for information on multithreaded usage. + * + * Available since API level 30. + * + * @param compilation The compilation to be modified. + * @param priority The relative priority of the execution compared to other + * executions created by the application. Must be one of + * ANEURALNETWORKS_PRIORITY_*. + * + * @return ANEURALNETWORKS_NO_ERROR if successful. + */ +int ANeuralNetworksCompilation_setPriority(ANeuralNetworksCompilation* compilation, int priority) + __INTRODUCED_IN(30); + +/** + * Set the maximum expected duration for compiling the model. + * + * If the device is not able to complete the compilation within the specified + * duration, the compilation may be aborted. The timeout duration begins at the + * call to {@link ANeuralNetworksCompilation_finish}. + * + * This timeout duration acts as a hint to drivers, and can be used to both free + * up compute resources within the driver and return control back to the + * application quicker than is possible without the hint. It enables drivers + * that are able to estimate how long a compilation will take to abort the + * compilation before it has even started if the driver believes the compilation + * cannot be completed within the timeout duration. Similarly, it enables + * drivers to abort an ongoing compilation if it is taking too long. However, + * this call does not guarantee that the compilation will complete or abort + * within the timeout duration. + * + * By default (i.e., unless ANeuralNetworksCompilation_setTimeout is called), + * the timeout duration for compiling the model is considered infinite. + * + * The {@link ANeuralNetworksCompilation} must have been created with + * {@link ANeuralNetworksCompilation_createForDevices} with numDevices = 1, + * otherwise this function will fail with ANEURALNETWORKS_BAD_DATA. If the + * device has a feature level reported by + * {@link ANeuralNetworksDevice_getFeatureLevel} that is lower than 30, then the + * timeout duration hint will be ignored. + * + * See {@link ANeuralNetworksCompilation} for information on multithreaded usage. + * + * @param compilation The compilation to be modified. + * @param duration The maximum amount of time in nanoseconds that is expected to + * be spent finishing a compilation. If this duration is exceeded, the + * compilation may be aborted. If set to 0, the timeout duration is + * considered infinite. + * + * @return ANEURALNETWORKS_NO_ERROR if successful. + * + * Available since API level 30. + */ +int ANeuralNetworksCompilation_setTimeout(ANeuralNetworksCompilation* compilation, + uint64_t duration) __INTRODUCED_IN(30); + +#endif // __ANDROID_API__ >= 30 + /** * Create a {@link ANeuralNetworksExecution} to apply the given compilation. * This only creates the object. Computation is only performed once - * {@link ANeuralNetworksExecution_compute} or - * {@link ANeuralNetworksExecution_startCompute} is invoked. 
+ * {@link ANeuralNetworksExecution_burstCompute}, + * {@link ANeuralNetworksExecution_compute}, + * {@link ANeuralNetworksExecution_startCompute} or + * {@link ANeuralNetworksExecution_startComputeWithDependencies} is invoked. * * <p>The provided compilation must outlive the execution.</p> * @@ -6187,12 +7635,16 @@ int ANeuralNetworksExecution_create(ANeuralNetworksCompilation* compilation, /** * Destroy an execution. * - * <p>If called on an execution for which - * {@link ANeuralNetworksExecution_startCompute} has been called, the - * function will return immediately but will mark the execution to be deleted - * once the computation completes. The related {@link ANeuralNetworksEvent} - * will be signaled and the {@link ANeuralNetworksEvent_wait} will return - * ANEURALNETWORKS_ERROR_DELETED. + * <p>The execution need not have been scheduled by a call to + * {@link ANeuralNetworksExecution_burstCompute}, + * {@link ANeuralNetworksExecution_compute}, + * {@link ANeuralNetworksExecution_startCompute} or + * {@link ANeuralNetworksExecution_startComputeWithDependencies}; but if it has been scheduled, + * then the application must not call {@link ANeuralNetworksExecution_free} + * until the execution has completed (i.e., + * {@link ANeuralNetworksExecution_burstCompute}, + * {@link ANeuralNetworksExecution_compute}, or + * {@link ANeuralNetworksEvent_wait} has returned). * * See {@link ANeuralNetworksExecution} for information on multithreaded usage. * @@ -6206,7 +7658,10 @@ void ANeuralNetworksExecution_free(ANeuralNetworksExecution* execution) __INTROD /** * Associate a user buffer with an input of the model of the * {@link ANeuralNetworksExecution}. Evaluation of the execution must not have - * been scheduled. + * been scheduled. Once evaluation of the execution has been scheduled, the + * application must not change the content of the buffer until the execution has + * completed. Evaluation of the execution will not change the content of the + * buffer. * * <p>The provided buffer must outlive the execution.</p> * @@ -6244,9 +7699,12 @@ int ANeuralNetworksExecution_setInput(ANeuralNetworksExecution* execution, int32 size_t length) __INTRODUCED_IN(27); /** - * Associate part of a memory object with an input of the model of the + * Associate a region of a memory object with an input of the model of the * {@link ANeuralNetworksExecution}. Evaluation of the execution must not have - * been scheduled. + * been scheduled. Once evaluation of the execution has been scheduled, the + * application must not change the content of the region until the execution has + * completed. Evaluation of the execution will not change the content of the + * region. * * <p>The provided memory must outlive the execution.</p> * @@ -6255,8 +7713,10 @@ int ANeuralNetworksExecution_setInput(ANeuralNetworksExecution* execution, int32 * buffer and 0 for length. * * See {@link ANeuralNetworksExecution} for information on multithreaded usage. - * See {@link ANeuralNetworksMemory_createFromAHardwarBuffer} for information on + * See {@link ANeuralNetworksMemory_createFromAHardwareBuffer} for information on * AHardwareBuffer usage. + * See {@link ANeuralNetworksMemory_createFromDesc} for information on usage of memory objects + * created from memory descriptors. * * Available since API level 27. * @@ -6290,7 +7750,9 @@ int ANeuralNetworksExecution_setInputFromMemory(ANeuralNetworksExecution* execut /** * Associate a user buffer with an output of the model of the * {@link ANeuralNetworksExecution}. 
Evaluation of the execution must not have - * been scheduled. + * been scheduled. Once evaluation of the execution has been scheduled, the + * application must not change the content of the buffer until the execution has + * completed. * * If the output is optional, you can indicate that it is omitted by * passing nullptr for buffer and 0 for length. @@ -6333,9 +7795,11 @@ int ANeuralNetworksExecution_setOutput(ANeuralNetworksExecution* execution, int3 size_t length) __INTRODUCED_IN(27); /** - * Associate part of a memory object with an output of the model of the + * Associate a region of a memory object with an output of the model of the * {@link ANeuralNetworksExecution}. Evaluation of the execution must not have - * been scheduled. + * been scheduled. Once evaluation of the execution has been scheduled, the + * application must not change the content of the region until the execution has + * completed. * * If the output is optional, you can indicate that it is omitted by * using {@link ANeuralNetworksExecution_setOutput} instead, passing nullptr for @@ -6344,8 +7808,10 @@ int ANeuralNetworksExecution_setOutput(ANeuralNetworksExecution* execution, int3 * <p>The provided memory must outlive the execution.</p> * * See {@link ANeuralNetworksExecution} for information on multithreaded usage. - * See {@link ANeuralNetworksMemory_createFromAHardwarBuffer} for information on + * See {@link ANeuralNetworksMemory_createFromAHardwareBuffer} for information on * AHardwareBuffer usage. + * See {@link ANeuralNetworksMemory_createFromDesc} for information on usage of memory objects + * created from memory descriptors. * * Available since API level 27. * @@ -6385,8 +7851,8 @@ int ANeuralNetworksExecution_setOutputFromMemory(ANeuralNetworksExecution* execu /** * Schedule asynchronous evaluation of the execution. * - * <p>Schedules asynchronous evaluation of the execution. Once the model has - * been applied and the outputs are ready to be consumed, the returned event + * <p>Schedules asynchronous evaluation of the execution. Once the execution + * has completed and the outputs are ready to be consumed, the returned event * will be signaled. Use {@link ANeuralNetworksEvent_wait} to wait for that * event. * </p> @@ -6394,10 +7860,31 @@ int ANeuralNetworksExecution_setOutputFromMemory(ANeuralNetworksExecution* execu * ANeuralNetworksEvent_wait must be called to recuperate the resources used * by the execution. * + * If {@link ANeuralNetworksExecution_setTimeout} was called on this execution, + * and the execution is not able to complete before the timeout duration is + * exceeded, then execution may be aborted, in which case + * {@link ANEURALNETWORKS_MISSED_DEADLINE_*} will be returned through + * {@link ANeuralNetworksExecution_startCompute} or + * {@link ANeuralNetworksEvent_wait} on the event object. If the device has a + * feature level reported by {@link ANeuralNetworksDevice_getFeatureLevel} that + * is lower than 30, then the timeout duration hint will be ignored. + * + * If this execution contains a {@link ANEURALNETWORKS_WHILE} operation, and + * the condition model does not output false within the loop timeout duration, + * then execution will be aborted and {@link ANEURALNETWORKS_MISSED_DEADLINE_*} + * will be returned through {@link ANeuralNetworksEvent_wait} on the event + * object. 
+ * + * If the device can detect before the execution has started that the execution + * will not complete within the timeout duration, the device may choose to skip + * the execution and instead return {@link ANEURALNETWORKS_MISSED_DEADLINE_*}. + * * See {@link ANeuralNetworksExecution} for information on multithreaded usage. * * See {@link ANeuralNetworksExecution_compute} for synchronous execution. - * Synchronous execution incurs lower overhead than asynchronous execution. + * See {@link ANeuralNetworksExecution_burstCompute} for burst synchronous execution. + * See {@link ANeuralNetworksExecution_startComputeWithDependencies} for + * asynchronous execution with dependencies. * * Available since API level 27. * @@ -6405,21 +7892,129 @@ int ANeuralNetworksExecution_setOutputFromMemory(ANeuralNetworksExecution* execu * @param event The event that will be signaled on completion. event is set to * NULL if there's an error. * - * @return ANEURALNETWORKS_NO_ERROR if successful. + * @return ANEURALNETWORKS_NO_ERROR if the evaluation is successfully scheduled. */ int ANeuralNetworksExecution_startCompute(ANeuralNetworksExecution* execution, ANeuralNetworksEvent** event) __INTRODUCED_IN(27); +#if __ANDROID_API__ >= 30 + +/** + * Set the maximum expected duration of the specified execution. + * + * If the device is not able to complete the execution within the specified + * duration, the execution may be aborted. The timeout duration begins at a + * call to one of: + * - {@link ANeuralNetworksExecution_burstCompute} + * - {@link ANeuralNetworksExecution_compute} + * - {@link ANeuralNetworksExecution_startCompute} + * - {@link ANeuralNetworksExecution_startComputeWithDependencies} + * + * This timeout duration acts as a hint to drivers, and can be used to both free + * up compute resources within the driver and return control back to the + * application quicker than is possible without the hint. It enables drivers + * that are able to estimate how long an execution will take to abort the + * execution before it has even started if the driver believes the execution + * cannot be completed within the timeout duration. Similarly, it enables + * drivers to abort an ongoing execution if it is taking too long. However, this + * call does not guarantee that the execution will complete or abort within the + * timeout duration. + * + * By default (i.e., unless ANeuralNetworksExecution_setTimeout is called), + * the timeout duration for execution is considered infinite. + * + * The {@link ANeuralNetworksExecution} must have been created from an + * {@link ANeuralNetworksCompilation} which in turn was created from + * {@link ANeuralNetworksCompilation_createForDevices} with numDevices = 1, + * otherwise this function will fail with ANEURALNETWORKS_BAD_DATA. If the + * device has a feature level reported by + * {@link ANeuralNetworksDevice_getFeatureLevel} that is lower than 30, then the + * timeout duration hint will be ignored. + * + * See {@link ANeuralNetworksExecution} for information on multithreaded usage. + * + * @param execution The execution to be modified. + * @param duration The maximum amount of time in nanoseconds that is expected to + * be spent executing a model. If this duration is exceeded, the execution + * may be aborted. If set to 0, the timeout duration is considered infinite. + * + * @return ANEURALNETWORKS_NO_ERROR if successful. + * + * Available since API level 30. 
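A sketch of the asynchronous path combined with the per-execution timeout hint documented above (its declaration follows just below). The operand indices, buffer sizes and the 500 ms value are placeholders, and the compilation is assumed to target a single device so the hint is honoured:

#include <android/NeuralNetworks.h>

// Sketch: one asynchronous inference with a 500 ms execution timeout hint.
int runAsync(ANeuralNetworksCompilation* compilation,
             const float* in, size_t inBytes, float* out, size_t outBytes) {
  ANeuralNetworksExecution* execution = nullptr;
  int status = ANeuralNetworksExecution_create(compilation, &execution);
  if (status != ANEURALNETWORKS_NO_ERROR) return status;

  // nullptr type: reuse the operand types from the model as-is.
  ANeuralNetworksExecution_setInput(execution, 0, nullptr, in, inBytes);
  ANeuralNetworksExecution_setOutput(execution, 0, nullptr, out, outBytes);
  ANeuralNetworksExecution_setTimeout(execution, 500000000ull);  // nanoseconds, API 30+

  ANeuralNetworksEvent* event = nullptr;
  status = ANeuralNetworksExecution_startCompute(execution, &event);
  if (status == ANEURALNETWORKS_NO_ERROR) {
    status = ANeuralNetworksEvent_wait(event);  // also releases execution resources
    ANeuralNetworksEvent_free(event);
  }
  ANeuralNetworksExecution_free(execution);  // only after the event has been waited on
  return status;
}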
+ */ +int ANeuralNetworksExecution_setTimeout(ANeuralNetworksExecution* execution, uint64_t duration) + __INTRODUCED_IN(30); + +/** + * Set the maximum duration of WHILE loops in the specified execution. + * + * This is a fuzzy per-loop timeout intended to prevent infinite loops. + * + * If a WHILE loop condition model does not output false within the specified + * duration, the execution will be aborted. + * + * See {@link ANeuralNetworks_getDefaultLoopTimeout} and + * {@link ANeuralNetworks_getMaximumLoopTimeout} for the default + * and maximum timeout values. + * + * See {@link ANeuralNetworksExecution} for information on multithreaded usage. + * + * @param execution The execution to be modified. + * @param duration The maximum amount of time in nanoseconds that can be spent + * executing a WHILE loop. If the specified duration value exceeds the value + * produced by {@link ANeuralNetworks_getMaximumLoopTimeout}, it will be + * overridden by that value. + * + * @return ANEURALNETWORKS_NO_ERROR if successful. + * ANEURALNETWORKS_BAD_STATE if execution has started. + * ANEURALNETWORKS_UNEXPECTED_NULL if execution is NULL. + * + * Available since API level 30. + */ +int ANeuralNetworksExecution_setLoopTimeout(ANeuralNetworksExecution* execution, uint64_t duration) + __INTRODUCED_IN(30); + +/** + * Get the default timeout value for WHILE loops. + * + * @return The default timeout value in nanoseconds. + * + * Available since API level 30. + */ +uint64_t ANeuralNetworks_getDefaultLoopTimeout() __INTRODUCED_IN(30); + +/** + * Get the maximum timeout value for WHILE loops. + * + * @return The maximum timeout value in nanoseconds. + * + * Available since API level 30. + */ +uint64_t ANeuralNetworks_getMaximumLoopTimeout() __INTRODUCED_IN(30); + +#endif // __ANDROID_API__ >= 30 + /** * Waits until the execution completes. * * More than one thread can wait on an event. When the execution completes, * all threads will be released. * + * If {@link ANeuralNetworksExecution_setTimeout} was called on the execution + * corresponding to this event, and the execution is not able to complete + * before the duration is exceeded, the execution may be aborted, in which case + * {@link ANEURALNETWORKS_MISSED_DEADLINE_*} will be returned here. + * + * If the execution contains a {@link ANEURALNETWORKS_WHILE} operation, and + * the condition model does not output false within the loop timeout duration, + * the execution will be aborted, and {@link ANEURALNETWORKS_MISSED_DEADLINE_*} + * will be returned here. + * * See {@link ANeuralNetworksExecution} for information on multithreaded usage. * * Available since API level 27. * + * @param event The event that will be signaled on completion. * @return ANEURALNETWORKS_NO_ERROR if the execution completed normally. * ANEURALNETWORKS_UNMAPPABLE if the execution input or output memory cannot * be properly mapped. @@ -6432,13 +8027,140 @@ int ANeuralNetworksEvent_wait(ANeuralNetworksEvent* event) __INTRODUCED_IN(27); * See {@link ANeuralNetworksExecution} for information on multithreaded usage. * * Available since API level 27. + * + * @param event The event object to be destroyed. Passing NULL is acceptable and + * results in no operation. */ void ANeuralNetworksEvent_free(ANeuralNetworksEvent* event) __INTRODUCED_IN(27); #endif // __ANDROID_API__ >= 27 +#if __ANDROID_API__ >= 30 +/** + * Create a {@link ANeuralNetworksEvent} from a sync_fence file descriptor. 
+ * + * The newly created ANeuralNetworksEvent does not take ownership of the provided sync_fence_fd, + * it will instead dup the provided sync_fence_fd and own the duplicate. + * + * @param sync_fence_fd The sync_fence file descriptor. + * @param event The newly created object or NULL if unsuccessful. + * + * @return ANEURALNETWORKS_NO_ERROR if successful. + * + * Available since API level 30. + */ +int ANeuralNetworksEvent_createFromSyncFenceFd(int sync_fence_fd, ANeuralNetworksEvent** event) + __INTRODUCED_IN(30); + +/** + * Get sync_fence file descriptor from the event. + * + * If the ANeuralNetworksEvent is not backed by a sync fence, the sync_fence_fd + * will be set to -1, and ANEURALNETWORKS_BAD_DATA will be returned. + * + * See {@link ANeuralNetworksEvent_createFromSyncFenceFd} and + * {@link ANeuralNetworksExecution_startComputeWithDependencies} to see how to create + * an event backed by a sync fence. + * + * The user takes ownership of the returned fd, and must close the returned file descriptor when + * it is no longer needed. + * + * @param event An event that is backed by a sync fence. + * @param sync_fence_fd The sync_fence file descriptor. The file descriptor will + * be set to -1 if there is an error. + * + * @return ANEURALNETWORKS_NO_ERROR if successful. + * + * Available since API level 30. + */ +int ANeuralNetworksEvent_getSyncFenceFd(const ANeuralNetworksEvent* event, int* sync_fence_fd) + __INTRODUCED_IN(30); + +/** + * Schedule asynchronous evaluation of the execution with dependencies. + * + * The execution will wait for all the depending events to be signaled before + * starting the evaluation. Once the execution has completed and the outputs + * are ready to be consumed, the returned event will be signaled. Depending on which + * devices are handling the execution, the event could be backed by a sync fence. + * Use {@link ANeuralNetworksEvent_wait} to wait for that event. + * + * ANeuralNetworksEvent_wait must be called to recuperate the resources used + * by the execution. + * + * If parts of the execution are scheduled on devices that do not support fenced execution, + * the function call may wait for such parts to finish before returning. + * + * The function will return an error if any of the events in dependencies is already in a bad + * state. After the execution is scheduled, if any of the events in dependencies does not complete + * normally, the execution will fail, and {@link ANeuralNetworksEvent_wait} on the returned + * event will return an error. + * + * The function will return an error if any of the execution outputs has a tensor operand type + * that is not fully specified. + * + * The function can be passed a timeout duration in nanoseconds. This timeout + * duration acts as a hint to drivers in the same way that the timeout durations + * in {@link ANeuralNetworksCompilation_setTimeout} and {@link + * ANeuralNetworksExecution_setTimeout} act as hints to drivers. The duration + * begins when all waitFor sync fences have been signaled, and can be used + * together with {@link ANeuralNetworksExecution_setTimeout} which specifies the + * maximum timeout duration beginning at the call to + * {@link ANeuralNetworksExecution_startComputeWithDependencies}.
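A sketch of the fenced flow documented in this comment (the declaration follows below): gate the execution on an external sync fence, export the completion fence for another component, then wait. The file descriptors, the zero duration hint and the helper name are placeholders:

#include <android/NeuralNetworks.h>

// Sketch: start an execution once an external sync fence signals, then expose the
// completion fence to another component before waiting on it ourselves.
int startAfterFence(ANeuralNetworksExecution* execution, int inputFenceFd, int* outFenceFd) {
  ANeuralNetworksEvent* dependency = nullptr;
  int status = ANeuralNetworksEvent_createFromSyncFenceFd(inputFenceFd, &dependency);
  if (status != ANEURALNETWORKS_NO_ERROR) return status;

  const ANeuralNetworksEvent* deps[] = {dependency};
  ANeuralNetworksEvent* done = nullptr;
  status = ANeuralNetworksExecution_startComputeWithDependencies(
      execution, deps, 1, /*duration=*/0, &done);
  if (status == ANEURALNETWORKS_NO_ERROR) {
    // Sets -1 and returns ANEURALNETWORKS_BAD_DATA if the event is not fence-backed;
    // otherwise the caller owns the returned fd and must close it when done.
    ANeuralNetworksEvent_getSyncFenceFd(done, outFenceFd);
    status = ANeuralNetworksEvent_wait(done);  // recuperates the execution's resources
    ANeuralNetworksEvent_free(done);
  }
  ANeuralNetworksEvent_free(dependency);
  return status;
}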
+ * If the duration is non-zero, the {@link ANeuralNetworksExecution} must have been created + * from an {@link ANeuralNetworksCompilation} which in turn was created from + * {@link ANeuralNetworksCompilation_createForDevices} with numDevices = 1, + * otherwise this function will fail with ANEURALNETWORKS_BAD_DATA. If either + * the timeout duration from {@link ANeuralNetworksExecution_setTimeout} or the + * timeout duration passed to this call is exceeded, the execution may be + * aborted, in which case {@link ANEURALNETWORKS_MISSED_DEADLINE_*} will be + * returned through {@link ANeuralNetworksExecution_startComputeWithDependencies} + * or {@link ANeuralNetworksEvent_wait} on the event object. If the device has a + * feature level reported by {@link ANeuralNetworksDevice_getFeatureLevel} that + * is lower than 30, then the timeout duration hints will be ignored. + * + * If this execution contains a {@link ANEURALNETWORKS_WHILE} operation, and + * the condition model does not output false within the loop timeout duration, + * then execution will be aborted and {@link ANEURALNETWORKS_MISSED_DEADLINE_*} + * will be returned through {@link ANeuralNetworksEvent_wait} on the event + * object. + * + * See {@link ANeuralNetworksExecution} for information on multithreaded usage. + * + * See {@link ANeuralNetworksExecution_compute} for synchronous execution. + * See {@link ANeuralNetworksExecution_burstCompute} for burst synchronous execution. + * See {@link ANeuralNetworksExecution_startCompute} for regular asynchronous execution. + * + * @param execution The execution to be scheduled and executed. + * @param dependencies A set of depending events. The actual evaluation will not start + * until all the events are signaled. + * @param num_dependencies The number of events in the dependencies set. + * @param duration The maximum amount of time in nanoseconds that is expected to + * be spent executing the model after all dependencies are + * signaled. If set to 0, the timeout duration is considered + * infinite. + * @param event The event that will be signaled on completion. event is set to + * NULL if there's an error. + * + * @return ANEURALNETWORKS_NO_ERROR if the evaluation is successfully scheduled. + * + * Available since API level 30. + */ +int ANeuralNetworksExecution_startComputeWithDependencies( + ANeuralNetworksExecution* execution, const ANeuralNetworksEvent* const* dependencies, + uint32_t num_dependencies, uint64_t duration, ANeuralNetworksEvent** event) + __INTRODUCED_IN(30); + +#endif // __ANDROID_API__ >= 30 + __END_DECLS -#endif // ANDROID_ML_NN_RUNTIME_NEURAL_NETWORKS_H +#endif // ANDROID_FRAMEWORKS_ML_NN_RUNTIME_NEURAL_NETWORKS_H + +// For compatibility with android, check __ANDROID__ is defined +#ifndef __ANDROID__ +#undef __ANDROID_API__ +#undef __INTRODUCED_IN +#endif // __ANDROID__ /** @} */ diff --git a/runtime/nnapi-header/include/NeuralNetworksExtensions.h b/runtime/nnapi-header/include/NeuralNetworksExtensions.h index ca2e04567..dd51b0301 100644 --- a/runtime/nnapi-header/include/NeuralNetworksExtensions.h +++ b/runtime/nnapi-header/include/NeuralNetworksExtensions.h @@ -14,8 +14,8 @@ * limitations under the License. 
*/ -#ifndef ANDROID_ML_NN_RUNTIME_NEURAL_NETWORKS_EXTENSIONS_H -#define ANDROID_ML_NN_RUNTIME_NEURAL_NETWORKS_EXTENSIONS_H +#ifndef ANDROID_FRAMEWORKS_ML_NN_RUNTIME_NEURAL_NETWORKS_EXTENSIONS_H +#define ANDROID_FRAMEWORKS_ML_NN_RUNTIME_NEURAL_NETWORKS_EXTENSIONS_H #include "NeuralNetworks.h" @@ -37,7 +37,7 @@ __BEGIN_DECLS -#if __ANDROID_API__ >= __ANDROID_API_Q__ +#if __ANDROID_API__ >= 29 /** * Queries whether an extension is supported by the driver implementation of the specified device. @@ -110,8 +110,8 @@ int ANeuralNetworksModel_setOperandExtensionData(ANeuralNetworksModel* model, in const void* data, size_t length) __INTRODUCED_IN(29); -#endif // __ANDROID_API__ >= __ANDROID_API_Q__ +#endif // __ANDROID_API__ >= 29 __END_DECLS -#endif // ANDROID_ML_NN_RUNTIME_NEURAL_NETWORKS_EXTENSIONS_H +#endif // ANDROID_FRAMEWORKS_ML_NN_RUNTIME_NEURAL_NETWORKS_EXTENSIONS_H diff --git a/runtime/onert/api/CMakeLists.txt b/runtime/onert/api/CMakeLists.txt index 49a5aa071..9c6dd90cc 100644 --- a/runtime/onert/api/CMakeLists.txt +++ b/runtime/onert/api/CMakeLists.txt @@ -9,10 +9,16 @@ add_library(${ONERT_DEV} SHARED ${API_SRC}) set(NNFW_API_HEADERS include/nnfw.h include/nnfw_experimental.h) target_link_libraries(${ONERT_DEV} PUBLIC nnfw-nnapi-header) -target_link_libraries(${ONERT_DEV} PUBLIC onert_core) +target_link_libraries(${ONERT_DEV} PRIVATE onert_core) target_link_libraries(${ONERT_DEV} PRIVATE jsoncpp tflite_loader circle_loader ${LIB_PTHREAD}) target_link_libraries(${ONERT_DEV} PRIVATE nnfw_common) target_link_libraries(${ONERT_DEV} PRIVATE nnfw_coverage) +# NOTE Below line is added to remove warning for android build +# It will be removed after android build uses gold linker +if (ANDROID) + target_link_libraries(${ONERT_DEV} INTERFACE log) +endif (ANDROID) + target_include_directories(${ONERT_DEV} PUBLIC include) set_target_properties(${ONERT_DEV} PROPERTIES PUBLIC_HEADER "${NNFW_API_HEADERS}") diff --git a/runtime/onert/api/include/nnfw_version.h b/runtime/onert/api/include/nnfw_version.h index 42e43760b..8c6ea3994 100644 --- a/runtime/onert/api/include/nnfw_version.h +++ b/runtime/onert/api/include/nnfw_version.h @@ -21,6 +21,6 @@ * NNFW_VERSION is a uint32 value representing nnfw runtime version * in 0xMMmmmmPP, where MM = major, mmmm = minor, PP = patch */ -#define NNFW_VERSION 0x01000900 +#define NNFW_VERSION 0x01000a00 #endif // __NNFW_VERSION_H__ diff --git a/runtime/onert/api/src/nnfw_api_internal.cc b/runtime/onert/api/src/nnfw_api_internal.cc index 81b40703f..aa066e190 100644 --- a/runtime/onert/api/src/nnfw_api_internal.cc +++ b/runtime/onert/api/src/nnfw_api_internal.cc @@ -112,7 +112,16 @@ NNFW_STATUS nnfw_session::load_circle_from_buffer(uint8_t *buffer, size_t size) if (size == 0) return NNFW_STATUS_ERROR; - _subgraphs = onert::circle_loader::loadModel(buffer, size); + try + { + _subgraphs = onert::circle_loader::loadModel(buffer, size); + } + catch (const std::exception &e) + { + std::cerr << "Error during model loading : " << e.what() << std::endl; + return NNFW_STATUS_ERROR; + } + _compiler = std::make_unique<onert::compiler::Compiler>(_subgraphs); _state = State::MODEL_LOADED; diff --git a/runtime/onert/backend/acl_cl/ConstantInitializer.cc b/runtime/onert/backend/acl_cl/ConstantInitializer.cc index 31f1c10eb..b45b91058 100644 --- a/runtime/onert/backend/acl_cl/ConstantInitializer.cc +++ b/runtime/onert/backend/acl_cl/ConstantInitializer.cc @@ -14,6 +14,11 @@ * limitations under the License. 
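The nnfw_version.h hunk above bumps NNFW_VERSION to 0x01000a00; a small sketch of how the 0xMMmmmmPP packing decodes back to "1.10.0", with the macro value inlined rather than pulled from the header:

#include <cstdint>
#include <cstdio>

// Sketch: decode the 0xMMmmmmPP packing used by NNFW_VERSION (value inlined below).
int main() {
  const uint32_t version = 0x01000a00;             // NNFW_VERSION after this patch
  const uint32_t major = (version >> 24) & 0xff;   // MM   -> 1
  const uint32_t minor = (version >> 8) & 0xffff;  // mmmm -> 10
  const uint32_t patch = version & 0xff;           // PP   -> 0
  std::printf("%u.%u.%u\n", major, minor, patch);  // prints "1.10.0"
  return 0;
}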
*/ +#include <AclActivationBuilder.h> +#include <AclFunction.h> +#include <Convert.h> +#include <Swizzle.h> + #include "ConstantInitializer.h" namespace onert @@ -96,6 +101,46 @@ void ConstantInitializer::visit(const ir::operation::SpaceToBatchND &node) } } +void ConstantInitializer::visit(const ir::operation::Reverse &node) +{ + const auto &output_index = node.getOutputs().at(0); + + const auto &input_index = node.getInputs().at(ir::operation::Reverse::Input::INPUT); + const auto &input_obj = _operands.at(input_index); + + const auto &axis_index = node.getInputs().at(ir::operation::Reverse::Input::AXIS); + const auto &axis_obj = _operands.at(axis_index); + + const auto ifm_rank = input_obj.shape().rank(); + const auto frontend_layout = this->_current_op_seq_layout; + + auto output_tensor = this->_tensor_reg->getITensor(output_index); + const auto backend_layout = output_tensor->layout(); + + if (axis_obj.isConstant()) + { + _init_map[axis_index] = [ifm_rank, frontend_layout, backend_layout](const ir::Operand &operand, + backend::ITensor &obj) { + assert(operand.data()); + + const auto axis_value = *(reinterpret_cast<const int32_t *>(operand.data()->base())); + int32_t axis_tmp = axis_value; + if (axis_tmp < 0) + { + axis_tmp = axis_tmp + ifm_rank; + } + + auto axis = + acl_common::ToARMComputeAxis(ifm_rank, axis_tmp, frontend_layout, backend_layout).value(); + + obj.access([&](ITensor &tensor) { + int32_t *into = reinterpret_cast<int32_t *>(tensor.buffer()); + *into = (int32_t)axis; + }); + }; + } +} + } // namespace acl_cl } // namespace backend } // namespace onert diff --git a/runtime/onert/backend/acl_cl/ConstantInitializer.h b/runtime/onert/backend/acl_cl/ConstantInitializer.h index 4f894fd31..9f3acb461 100644 --- a/runtime/onert/backend/acl_cl/ConstantInitializer.h +++ b/runtime/onert/backend/acl_cl/ConstantInitializer.h @@ -38,6 +38,7 @@ public: void visit(const ir::operation::Gather &) final; void visit(const ir::operation::HashtableLookup &) final; void visit(const ir::operation::SpaceToBatchND &) final; + void visit(const ir::operation::Reverse &) final; }; } // namespace acl_cl diff --git a/runtime/onert/backend/acl_cl/KernelGenerator.cc b/runtime/onert/backend/acl_cl/KernelGenerator.cc index 94489253d..cc9afcaeb 100644 --- a/runtime/onert/backend/acl_cl/KernelGenerator.cc +++ b/runtime/onert/backend/acl_cl/KernelGenerator.cc @@ -78,9 +78,9 @@ void KernelGenerator::visit(const ir::operation::BatchToSpaceND &node) const auto block_size_index{ node.getInputs().at(ir::operation::BatchToSpaceND::Input::BLOCK_SIZE)}; - auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index).get(); - auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index).get(); - auto block_size_tensor = _tensor_reg->getAclTensor(block_size_index).get(); + auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index); + auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index); + auto block_size_tensor = _tensor_reg->getAclTensor(block_size_index); assert(_ctx.at(block_size_index).data()); @@ -98,9 +98,9 @@ void KernelGenerator::visit(const ir::operation::BinaryArithmetic &node) const auto activation = node.param().activation; - auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index).get(); - auto lhs_tensor = _tensor_reg->getAclTensor(lhs_index).get(); - auto rhs_tensor = _tensor_reg->getAclTensor(rhs_index).get(); + auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index); + auto lhs_tensor = _tensor_reg->getAclTensor(lhs_index); + auto rhs_tensor = _tensor_reg->getAclTensor(rhs_index); const auto act_info = 
@@ -164,10 +164,10 @@ void KernelGenerator::visit(const ir::operation::Conv2D &node)
                                             ker_width, ker_height);
   const auto activation = node.param().activation;
-  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index).get();
-  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index).get();
-  auto ker_tensor = _tensor_reg->getAclTensor(ker_index).get();
-  auto bias_tensor = _tensor_reg->getAclTensor(bias_index).get();
+  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
+  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
+  auto ker_tensor = _tensor_reg->getAclTensor(ker_index);
+  auto bias_tensor = _tensor_reg->getAclTensor(bias_index);
   const auto conv_info = acl_common::asPadStrideInfo(padding, stride);
   const auto act_info = acl_common::asActivationLayerInfo(activation);
@@ -202,10 +202,10 @@ void KernelGenerator::visit(const ir::operation::DepthwiseConv2D &node)
   const auto multiplier = node.param().multiplier;
   const auto activation = node.param().activation;
-  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index).get();
-  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index).get();
-  auto ker_tensor = _tensor_reg->getAclTensor(ker_index).get();
-  auto bias_tensor = _tensor_reg->getAclTensor(bias_index).get();
+  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
+  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
+  auto ker_tensor = _tensor_reg->getAclTensor(ker_index);
+  auto bias_tensor = _tensor_reg->getAclTensor(bias_index);
   const auto conv_info = acl_common::asPadStrideInfo(padding, stride);
   const auto act_info = acl_common::asActivationLayerInfo(activation);
@@ -240,7 +240,7 @@ void KernelGenerator::visit(const ir::operation::Concat &node)
     return;
   }
-  auto output_tensor = _tensor_reg->getAclTensor(ofm_index).get();
+  auto output_tensor = _tensor_reg->getAclTensor(ofm_index);
   std::vector<::arm_compute::ICLTensor *> input_tensors;
   for (auto &ifm_ind : input_indexes)
     input_tensors.emplace_back(_tensor_reg->getAclTensor(ifm_ind)->handle());
@@ -268,7 +268,7 @@ void KernelGenerator::visit(const ir::operation::Concat &node)
 void KernelGenerator::visit(const ir::operation::FullyConnected &node)
 {
   const auto output_index{node.getOutputs().at(0)};
-  auto output_tensor = _tensor_reg->getAclTensor(output_index).get();
+  auto output_tensor = _tensor_reg->getAclTensor(output_index);
   const auto activation = node.param().activation;
   auto fn = acl_common::kernelGenFullyConnected<acl_common::AclFunction, ::arm_compute::ICLTensor,
@@ -286,8 +286,8 @@ void KernelGenerator::visit(const ir::operation::Reduce &node)
   const auto keep_dims{node.param().keep_dims};
   const auto reduce_type = node.param().reduce_type;
-  auto output_tensor = _tensor_reg->getAclTensor(output_index).get();
-  auto input_tensor = _tensor_reg->getAclTensor(input_index).get();
+  auto output_tensor = _tensor_reg->getAclTensor(output_index);
+  auto input_tensor = _tensor_reg->getAclTensor(input_index);
   // Convert to ACL axes taking into account negative values and possible duplicates.
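The comment above refers to the same negative-axis handling that ArgMax, Split, OneHot, Pack and Unpack use later in this file: a negative axis is first wrapped into [0, rank) and then converted to an ARM Compute axis via acl_common::ToARMComputeAxis with the frontend and backend layouts. A minimal sketch of just the wrapping step; normalizeAxis is a hypothetical name, not a function in this codebase:

#include <cassert>

// Hypothetical helper: wrap a possibly negative frontend axis into [0, rank)
// before it is converted to an ARM Compute axis.
inline int normalizeAxis(int axis, int rank)
{
  assert(rank > 0);
  if (axis < 0)
    axis += rank; // e.g. axis == -1 on a rank-4 tensor becomes 3
  assert(0 <= axis && axis < rank);
  return axis;
}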
const auto &axes = _ctx.at(axes_index); @@ -320,8 +320,8 @@ void KernelGenerator::visit(const ir::operation::Reshape &node) const auto output_index{node.getOutputs().at(0)}; const auto input_index{node.getInputs().at(ir::operation::Reshape::Input::INPUT)}; - auto output_tensor = _tensor_reg->getAclTensor(output_index).get(); - auto input_tensor = _tensor_reg->getAclTensor(input_index).get(); + auto output_tensor = _tensor_reg->getAclTensor(output_index); + auto input_tensor = _tensor_reg->getAclTensor(input_index); // NOTE This operation must not be changed the layout from frontend to backend // So, PermutationOperationPass makes layouts of frontend and backend the same. @@ -351,8 +351,8 @@ void KernelGenerator::visit(const ir::operation::Squeeze &node) (void)dims; (void)ndim; - auto output_tensor = _tensor_reg->getAclTensor(output_index).get(); - auto input_tensor = _tensor_reg->getAclTensor(input_index).get(); + auto output_tensor = _tensor_reg->getAclTensor(output_index); + auto input_tensor = _tensor_reg->getAclTensor(input_index); auto fn = acl_common::generateLayer<arm_compute::CLReshapeLayer>(input_tensor->handle(), output_tensor->handle()); _return_fn = asAclFunction(std::move(fn)); @@ -365,8 +365,8 @@ void KernelGenerator::visit(const ir::operation::Softmax &node) const auto beta = node.param().beta; - auto output_tensor = _tensor_reg->getAclTensor(output_index).get(); - auto input_tensor = _tensor_reg->getAclTensor(input_index).get(); + auto output_tensor = _tensor_reg->getAclTensor(output_index); + auto input_tensor = _tensor_reg->getAclTensor(input_index); auto fn = acl_common::generateLayer<arm_compute::CLSoftmaxLayer>( _tensor_builder->acl_tensor_manager()->internal_buffer_manager(), input_tensor->handle(), @@ -382,8 +382,8 @@ void KernelGenerator::visit(const ir::operation::Slice &node) const auto begins_index{node.getInputs().at(ir::operation::Slice::Input::BEGINS)}; const auto sizes_index{node.getInputs().at(ir::operation::Slice::Input::SIZES)}; - auto outputData_tensor = _tensor_reg->getAclTensor(output_index).get(); - auto inputData_tensor = _tensor_reg->getAclTensor(input_index).get(); + auto outputData_tensor = _tensor_reg->getAclTensor(output_index); + auto inputData_tensor = _tensor_reg->getAclTensor(input_index); const auto frontend_layout = _current_op_seq_layout; const auto backend_layout = inputData_tensor->layout(); @@ -449,8 +449,8 @@ void KernelGenerator::visit(const ir::operation::StridedSlice &node) const auto ends_index{node.getInputs().at(ir::operation::StridedSlice::Input::ENDS)}; const auto strides_index{node.getInputs().at(ir::operation::StridedSlice::Input::STRIDES)}; - auto outputData_tensor = _tensor_reg->getAclTensor(output_index).get(); - auto inputData_tensor = _tensor_reg->getAclTensor(input_index).get(); + auto outputData_tensor = _tensor_reg->getAclTensor(output_index); + auto inputData_tensor = _tensor_reg->getAclTensor(input_index); const auto frontend_layout = _current_op_seq_layout; const auto backend_layout = inputData_tensor->layout(); @@ -523,10 +523,23 @@ void KernelGenerator::visit(const ir::operation::StridedSlice &node) strides_set.set(i, strides[i]); } + // Disable applied dim_correction + if (inputData_tensor->num_dimensions() != inputData_tensor->info()->num_dimensions()) + { + // This means that high dimension's value is 1 and input tensor is applied dim_correction + acl_common::disableDimCorrection(inputData_tensor); + } + auto fn = acl_common::generateLayer<arm_compute::CLStridedSlice>( inputData_tensor->handle(), 
outputData_tensor->handle(), starts_set, ends_set, strides_set, begin_mask, end_mask, shrink_axis_mask); + // Revert disabling applied dim_correction + if (inputData_tensor->dimension(0) == 1) + { + acl_common::enableDimCorrection(inputData_tensor); + } + _return_fn = asAclFunction(std::move(fn)); } @@ -534,22 +547,47 @@ void KernelGenerator::visit(const ir::operation::Transpose &node) { const auto ofm_idx{node.getOutputs().at(0)}; const auto ifm_idx{node.getInputs().at(ir::operation::Transpose::Input::INPUT)}; - const auto &perm{node.param().perm}; + const auto perm_idx{node.getInputs().at(ir::operation::Transpose::Input::PERMUTATION)}; const auto rank = _ctx.at(ifm_idx).shape().rank(); - auto ofm_tensor = _tensor_reg->getAclTensor(ofm_idx).get(); - auto ifm_tensor = _tensor_reg->getAclTensor(ifm_idx).get(); + auto ofm_tensor = _tensor_reg->getAclTensor(ofm_idx); + auto ifm_tensor = _tensor_reg->getAclTensor(ifm_idx); const auto frontend_layout = _current_op_seq_layout; const auto backend_layout = ifm_tensor->layout(); - std::vector<std::int32_t> pv(perm.cbegin(), perm.cend()); - // Reversed - auto backend_pv = ::onert::backend::acl_common::getARMComputePermutationVector( - rank, pv, frontend_layout, backend_layout); + const auto &perms = _ctx.at(perm_idx); + std::vector<int32_t> pv; + if (perms.shape() == ir::Shape{0}) + { + pv.resize(rank); + std::iota(pv.begin(), pv.end(), 0); + std::reverse(pv.begin(), pv.end()); + } + else + { + pv = _ctx.at(perm_idx).asVector<int32_t>(); + } - auto fn = acl_common::generateLayer<::arm_compute::CLPermute>(ifm_tensor->handle(), - ofm_tensor->handle(), backend_pv); + std::unique_ptr<arm_compute::IFunction> fn; + if (rank == 1) + { + fn = acl_common::generateLayer<arm_compute::CLCopy>(ifm_tensor->handle(), ofm_tensor->handle()); + } + else if (rank == 2) + { + assert(pv.size() == 2 && pv.at(0) == 1 && pv.at(1) == 0); + fn = acl_common::generateLayer<arm_compute::CLTranspose>(ifm_tensor->handle(), + ofm_tensor->handle()); + } + else + { + auto backend_pv = + acl_common::getARMComputePermutationVector(rank, pv, frontend_layout, backend_layout); + + fn = acl_common::generateLayer<arm_compute::CLPermute>(ifm_tensor->handle(), + ofm_tensor->handle(), backend_pv); + } _return_fn = asAclFunction(std::move(fn)); } @@ -559,8 +597,8 @@ void KernelGenerator::visit(const ir::operation::ElementwiseActivation &node) const auto ofm_index{node.getOutputs().at(0)}; const auto ifm_index{node.getInputs().at(ir::operation::ElementwiseActivation::Input::INPUT)}; - auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index).get(); - auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index).get(); + auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index); + auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index); const ::arm_compute::ActivationLayerInfo act_info = acl_common::asActivationLayerInfo( node.param().op_type, node.param().alpha, node.param().beta); @@ -577,9 +615,9 @@ void KernelGenerator::visit(const ir::operation::ElementwiseBinary &node) const auto lhs_index{node.getInputs().at(ir::operation::ElementwiseBinary::Input::LHS)}; const auto rhs_index{node.getInputs().at(ir::operation::ElementwiseBinary::Input::RHS)}; - auto output_tensor = _tensor_reg->getAclTensor(output_index).get(); - auto lhs_tensor = _tensor_reg->getAclTensor(lhs_index).get(); - auto rhs_tensor = _tensor_reg->getAclTensor(rhs_index).get(); + auto output_tensor = _tensor_reg->getAclTensor(output_index); + auto lhs_tensor = _tensor_reg->getAclTensor(lhs_index); + auto rhs_tensor = 
_tensor_reg->getAclTensor(rhs_index); std::unique_ptr<arm_compute::IFunction> fn; switch (node.param().op_type) @@ -626,8 +664,8 @@ void KernelGenerator::visit(const ir::operation::ElementwiseUnary &node) const auto output_index{node.getOutputs().at(0)}; const auto input_index{node.getInputs().at(ir::operation::ElementwiseUnary::Input::INPUT)}; - auto output_tensor = _tensor_reg->getAclTensor(output_index).get(); - auto input_tensor = _tensor_reg->getAclTensor(input_index).get(); + auto output_tensor = _tensor_reg->getAclTensor(output_index); + auto input_tensor = _tensor_reg->getAclTensor(input_index); std::unique_ptr<arm_compute::IFunction> fn; switch (node.param().op_type) @@ -647,7 +685,11 @@ void KernelGenerator::visit(const ir::operation::ElementwiseUnary &node) { fn = acl_common::generateLayer<arm_compute::CLCopy>(input_tensor->handle(), output_tensor->handle()); - ; + } + else if (_ctx.at(input_index).typeInfo().type() == ir::DataType::BOOL8) + { + fn = acl_common::generateLayer<arm_compute::CLCastBool>(input_tensor->handle(), + output_tensor->handle()); } else { @@ -719,8 +761,8 @@ void KernelGenerator::visit(const ir::operation::ExpandDims &node) const auto output_index{node.getOutputs().at(0)}; const auto input_index{node.getInputs().at(ir::operation::ExpandDims::Input::INPUT)}; - auto output_tensor = _tensor_reg->getAclTensor(output_index).get(); - auto input_tensor = _tensor_reg->getAclTensor(input_index).get(); + auto output_tensor = _tensor_reg->getAclTensor(output_index); + auto input_tensor = _tensor_reg->getAclTensor(input_index); auto fn = acl_common::generateLayer<arm_compute::CLReshapeLayer>(input_tensor->handle(), output_tensor->handle()); @@ -735,10 +777,10 @@ void KernelGenerator::visit(const ir::operation::InstanceNorm &node) const auto gamma_index{node.getInputs().at(ir::operation::InstanceNorm::Input::GAMMA)}; const auto beta_index{node.getInputs().at(ir::operation::InstanceNorm::Input::BETA)}; - auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index).get(); - auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index).get(); - auto gamma_tensor = _tensor_reg->getAclTensor(gamma_index).get(); - auto beta_tensor = _tensor_reg->getAclTensor(beta_index).get(); + auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index); + auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index); + auto gamma_tensor = _tensor_reg->getAclTensor(gamma_index); + auto beta_tensor = _tensor_reg->getAclTensor(beta_index); auto epsilon = node.param().epsilon; auto activation = node.param().activation; @@ -764,9 +806,9 @@ void KernelGenerator::visit(const ir::operation::Comparison &node) const auto comparison_type = node.param().comparison_type; - auto output_tensor = _tensor_reg->getAclTensor(output_index).get(); - auto input0_tensor = _tensor_reg->getAclTensor(input0_index).get(); - auto input1_tensor = _tensor_reg->getAclTensor(input1_index).get(); + auto output_tensor = _tensor_reg->getAclTensor(output_index); + auto input0_tensor = _tensor_reg->getAclTensor(input0_index); + auto input1_tensor = _tensor_reg->getAclTensor(input1_index); auto fn = acl_common::generateLayer<arm_compute::CLComparison>( input0_tensor->handle(), input1_tensor->handle(), output_tensor->handle(), @@ -775,6 +817,56 @@ void KernelGenerator::visit(const ir::operation::Comparison &node) _return_fn = asAclFunction(std::move(fn)); } +void KernelGenerator::visit(const ir::operation::OneHot &node) +{ + const auto output_idx{node.getOutputs().at(0)}; + const auto 
indices_idx{node.getInputs().at(ir::operation::OneHot::Input::INDICES)}; + const auto depth_idx{node.getInputs().at(ir::operation::OneHot::Input::DEPTH)}; + const auto onvalue_idx{node.getInputs().at(ir::operation::OneHot::Input::ON_VALUE)}; + const auto offvalue_idx{node.getInputs().at(ir::operation::OneHot::Input::OFF_VALUE)}; + const auto depth = _ctx.at(depth_idx).asScalar<int32_t>(); + assert(depth > 0); + + auto output_tensor = _tensor_reg->getAclTensor(output_idx); + auto indices_tensor = _tensor_reg->getAclTensor(indices_idx); + auto onvalue_tensor = _tensor_reg->getAclTensor(onvalue_idx); + + const size_t output_rank = _ctx.at(output_idx).shape().rank(); + const auto frontend_layout = _current_op_seq_layout; + const auto backend_layout = output_tensor->layout(); + int32_t axis = node.param().axis == -1 ? output_rank - 1 : node.param().axis; + axis = acl_common::ToARMComputeAxis(output_rank, axis, frontend_layout, backend_layout).value(); + + if (output_tensor->num_dimensions() != output_tensor->info()->num_dimensions()) + { + // This means that high dimension's value is 1 and output_tensor is applied dim_correction + acl_common::disableDimCorrection(output_tensor); + } + + std::unique_ptr<::arm_compute::IFunction> fn; + const auto &offvalue = _ctx.at(offvalue_idx); + if (offvalue.isConstant()) + { + fn = acl_common::generateLayer<arm_compute::CLOneHot>( + indices_tensor->handle(), onvalue_tensor->handle(), output_tensor->handle(), + acl_common::asPixelValue(offvalue), static_cast<uint32_t>(depth), axis); + } + else + { + auto offvalue_tensor = _tensor_reg->getAclTensor(offvalue_idx); + fn = acl_common::generateLayer<arm_compute::CLOneHot>( + indices_tensor->handle(), onvalue_tensor->handle(), offvalue_tensor->handle(), + output_tensor->handle(), static_cast<uint32_t>(depth), axis); + } + + if (output_tensor->dimension(0) == 1) + { + acl_common::enableDimCorrection(output_tensor); + } + + _return_fn = asAclFunction(std::move(fn)); +} + void KernelGenerator::visit(const ir::operation::Pack &node) { const auto output_index{node.getOutputs().at(0)}; @@ -786,41 +878,39 @@ void KernelGenerator::visit(const ir::operation::Pack &node) for (const auto &input_index : node.getInputs()) input_indexes.emplace_back(input_index); - auto output = _tensor_reg->getAclTensor(output_index).get()->handle(); + auto output = _tensor_reg->getAclTensor(output_index)->handle(); std::vector<arm_compute::ICLTensor *> inputs; for (const auto &input_index : input_indexes) inputs.emplace_back(_tensor_reg->getAclTensor(input_index)->handle()); const auto frontend_layout = _current_op_seq_layout; - const auto backend_layout = _tensor_reg->getAclTensor(output_index).get()->layout(); + const auto backend_layout = _tensor_reg->getAclTensor(output_index)->layout(); if (axis < 0) axis += output_rank; axis = acl_common::ToARMComputeAxis(output_rank, axis, frontend_layout, backend_layout).value(); // Disable applied dim_correction - std::vector<arm_compute::TensorShape> orig_inputs_acl_tensor_shapes; for (const auto &input_index : input_indexes) { - size_t input_rank = _ctx.at(input_index).shape().rank(); const auto &input_tensor = _tensor_reg->getAclTensor(input_index); - orig_inputs_acl_tensor_shapes.emplace_back(input_tensor->info()->tensor_shape()); - assert(input_rank == input_tensor->num_dimensions()); - if (input_rank != input_tensor->info()->num_dimensions()) + if (input_tensor->num_dimensions() != input_tensor->info()->num_dimensions()) { - // This means that high dimension's value is 1 and ifm tensor is 
applied dim_correction - input_tensor->info()->set_tensor_shape(acl_common::asTensorShape( - _ctx.at(input_index).shape(), _current_op_seq_layout, backend_layout, false)); + // This means that high dimension's value is 1 and input tensor is applied dim_correction + acl_common::disableDimCorrection(input_tensor); } } auto fn = acl_common::generateLayer<arm_compute::CLStackLayer>(inputs, axis, output); // Revert disabling applied dim_correction - assert(inputs.size() == orig_inputs_acl_tensor_shapes.size()); - for (size_t i = 0; i < inputs.size(); ++i) + for (const auto &input_index : input_indexes) { - inputs.at(i)->info()->set_tensor_shape(orig_inputs_acl_tensor_shapes.at(i)); + const auto &input_tensor = _tensor_reg->getAclTensor(input_index); + if (input_tensor->dimension(0) == 1) + { + acl_common::enableDimCorrection(input_tensor); + } } _return_fn = asAclFunction(std::move(fn)); @@ -833,7 +923,7 @@ void KernelGenerator::visit(const ir::operation::Pool2D &node) acl_common::convertPoolType(node.param().op_type)); const auto ofm_index{node.getOutputs().at(0)}; - auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index).get(); + auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index); const auto activation = node.param().activation; _return_fn = std::make_unique<exec::FunctionSequence>( asAclFunction(std::move(raw_fn)), @@ -845,8 +935,8 @@ void KernelGenerator::visit(const ir::operation::Permute &node) const auto ofm_idx{node.getOutputs().at(0)}; const auto ifm_idx{node.getInputs().at(0)}; const auto permute_type = node.getPermuteType(); - auto ofm_tensor = _tensor_reg->getAclTensor(ofm_idx).get(); - auto ifm_tensor = _tensor_reg->getAclTensor(ifm_idx).get(); + auto ofm_tensor = _tensor_reg->getAclTensor(ofm_idx); + auto ifm_tensor = _tensor_reg->getAclTensor(ifm_idx); const auto rank = _ctx.at(ofm_idx).shape().rank(); assert(_ctx.at(ifm_idx).shape().rank() == _ctx.at(ofm_idx).shape().rank()); @@ -879,11 +969,10 @@ void KernelGenerator::visit(const ir::operation::Permute &node) void KernelGenerator::visit(const ir::operation::ResizeBilinear &node) { const auto ofm_index{node.getOutputs().at(0)}; - const auto ifm_index{node.getInputs().at(ir::operation::ResizeBilinear::Input::INPUT)}; - auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index).get(); - auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index).get(); + auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index); + auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index); auto fn = acl_common::generateLayer<arm_compute::CLScale>( ifm_tensor->handle(), ofm_tensor->handle(), ::arm_compute::InterpolationPolicy::BILINEAR, @@ -896,11 +985,10 @@ void KernelGenerator::visit(const ir::operation::ResizeBilinear &node) void KernelGenerator::visit(const ir::operation::ResizeNearestNeighbor &node) { const auto ofm_index{node.getOutputs().at(0)}; - const auto ifm_index{node.getInputs().at(ir::operation::ResizeNearestNeighbor::Input::INPUT)}; - auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index).get(); - auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index).get(); + auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index); + auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index); auto fn = acl_common::generateLayer<arm_compute::CLScale>( ifm_tensor->handle(), ofm_tensor->handle(), @@ -925,14 +1013,14 @@ void KernelGenerator::visit(const ir::operation::RNN &node) const auto activation = node.param().activation; - auto output_tensor = _tensor_reg->getAclTensor(output_index).get(); - auto hidden_state_out_tensor = 
_tensor_reg->getAclTensor(hidden_state_out_index).get(); + auto output_tensor = _tensor_reg->getAclTensor(output_index); + auto hidden_state_out_tensor = _tensor_reg->getAclTensor(hidden_state_out_index); - auto input_tensor = _tensor_reg->getAclTensor(input_index).get(); - auto weights_tensor = _tensor_reg->getAclTensor(weights_index).get(); - auto recurrent_weights_tensor = _tensor_reg->getAclTensor(recurrent_weights_index).get(); - auto bias_tensor = _tensor_reg->getAclTensor(bias_index).get(); - auto hidden_state_in_tensor = _tensor_reg->getAclTensor(hidden_state_in_index).get(); + auto input_tensor = _tensor_reg->getAclTensor(input_index); + auto weights_tensor = _tensor_reg->getAclTensor(weights_index); + auto recurrent_weights_tensor = _tensor_reg->getAclTensor(recurrent_weights_index); + auto bias_tensor = _tensor_reg->getAclTensor(bias_index); + auto hidden_state_in_tensor = _tensor_reg->getAclTensor(hidden_state_in_index); auto act_info = ::onert::backend::acl_common::asActivationLayerInfo(activation); auto copy_layer = acl_common::generateLayer<arm_compute::CLCopy>( @@ -954,10 +1042,10 @@ void KernelGenerator::visit(const ir::operation::SpaceToBatchND &node) node.getInputs().at(ir::operation::SpaceToBatchND::Input::BLOCK_SIZE)}; const auto paddings_index{node.getInputs().at(ir::operation::SpaceToBatchND::Input::PADDINGS)}; - auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index).get(); - auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index).get(); - auto block_size_tensor = _tensor_reg->getAclTensor(block_size_index).get(); - auto paddings_tensor = _tensor_reg->getAclTensor(paddings_index).get(); + auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index); + auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index); + auto block_size_tensor = _tensor_reg->getAclTensor(block_size_index); + auto paddings_tensor = _tensor_reg->getAclTensor(paddings_index); assert(_ctx.at(block_size_index).data()); assert(_ctx.at(paddings_index).data()); @@ -976,8 +1064,8 @@ void KernelGenerator::visit(const ir::operation::SpaceToDepth &node) auto block_size = node.param().block_size; - auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index).get(); - auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index).get(); + auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index); + auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index); auto fn = acl_common::generateLayer<arm_compute::CLSpaceToDepthLayer>( ifm_tensor->handle(), ofm_tensor->handle(), block_size); @@ -991,9 +1079,9 @@ void KernelGenerator::visit(const ir::operation::EmbeddingLookup &node) const auto lookups_index{node.getInputs().at(ir::operation::EmbeddingLookup::Input::LOOKUPS)}; const auto values_index{node.getInputs().at(ir::operation::EmbeddingLookup::Input::VALUES)}; - auto output_tensor = _tensor_reg->getAclTensor(output_index).get(); - auto lookups_tensor = _tensor_reg->getAclTensor(lookups_index).get(); - auto values_tensor = _tensor_reg->getAclTensor(values_index).get(); + auto output_tensor = _tensor_reg->getAclTensor(output_index); + auto lookups_tensor = _tensor_reg->getAclTensor(lookups_index); + auto values_tensor = _tensor_reg->getAclTensor(values_index); auto fn = acl_common::generateLayer<arm_compute::CLEmbeddingLookup>( values_tensor->handle(), output_tensor->handle(), lookups_tensor->handle()); @@ -1020,8 +1108,8 @@ void KernelGenerator::visit(const ir::operation::L2Normalization &node) float beta = 0.5f; // pow(reduction, -0.5) = 1 / sqrt(reduction) float bias = 0.0f; // Don't offset the reduction. 
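As the comments in this L2Normalization hunk note, beta = 0.5 turns the reduction into 1/sqrt(reduction) and bias = 0 leaves it unoffset, so the cross-map normalization layer effectively divides each element by the square root of the sum of squares. A rough standalone sketch of that arithmetic, illustrative only; it ignores the layer's radius/alpha parameters and any zero-norm guard:

#include <cmath>
#include <cstddef>
#include <vector>

// Divide every element by sqrt(sum of squares), i.e. an L2 normalization.
std::vector<float> l2_normalize(const std::vector<float> &x)
{
  float sum_sq = 0.0f;
  for (float v : x)
    sum_sq += v * v;
  const float inv_norm = 1.0f / std::sqrt(sum_sq); // pow(reduction, -0.5)
  std::vector<float> y(x.size());
  for (std::size_t i = 0; i < x.size(); ++i)
    y[i] = x[i] * inv_norm;
  return y;
}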
- auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index).get(); - auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index).get(); + auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index); + auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index); const auto norm_info = ::arm_compute::NormalizationLayerInfo(::arm_compute::NormType::CROSS_MAP, radius, alpha, beta, bias, false); @@ -1041,12 +1129,12 @@ void KernelGenerator::visit(const ir::operation::HashtableLookup &node) const auto keys_index{node.getInputs().at(ir::operation::HashtableLookup::Input::KEYS)}; const auto values_index{node.getInputs().at(ir::operation::HashtableLookup::Input::VALUES)}; - auto output_tensor = _tensor_reg->getAclTensor(output_index).get(); - auto hits_tensor = _tensor_reg->getAclTensor(hits_index).get(); + auto output_tensor = _tensor_reg->getAclTensor(output_index); + auto hits_tensor = _tensor_reg->getAclTensor(hits_index); - auto lookups_tensor = _tensor_reg->getAclTensor(lookups_index).get(); - auto keys_tensor = _tensor_reg->getAclTensor(keys_index).get(); - auto values_tensor = _tensor_reg->getAclTensor(values_index).get(); + auto lookups_tensor = _tensor_reg->getAclTensor(lookups_index); + auto keys_tensor = _tensor_reg->getAclTensor(keys_index); + auto values_tensor = _tensor_reg->getAclTensor(values_index); auto fn = acl_common::generateLayer<arm_compute::CLHashtableLookup>( lookups_tensor->handle(), keys_tensor->handle(), values_tensor->handle(), @@ -1061,9 +1149,9 @@ void KernelGenerator::visit(const ir::operation::PReLU &node) const auto ifm_index{node.getInputs().at(ir::operation::PReLU::Input::INPUT)}; const auto alpha_index{node.getInputs().at(ir::operation::PReLU::Input::ALPHA)}; - auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index).get(); - auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index).get(); - auto alpha_tensor = _tensor_reg->getAclTensor(alpha_index).get(); + auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index); + auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index); + auto alpha_tensor = _tensor_reg->getAclTensor(alpha_index); auto fn = acl_common::generateLayer<arm_compute::CLPReluLayer>( ifm_tensor->handle(), alpha_tensor->handle(), ofm_tensor->handle()); @@ -1096,9 +1184,9 @@ void KernelGenerator::visit(const ir::operation::TransposeConv &node) invalid_vertical = ofm_shape.H - (1 + (ifm_shape.H - 1) * stride.vertical) - (ker_shape.H - 1); } - auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index).get(); - auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index).get(); - auto ker_tensor = _tensor_reg->getAclTensor(ker_index).get(); + auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index); + auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index); + auto ker_tensor = _tensor_reg->getAclTensor(ker_index); const auto tconv_info = acl_common::asPadStrideInfo(padding, stride); @@ -1116,9 +1204,9 @@ void KernelGenerator::visit(const ir::operation::SquaredDifference &node) const auto lhs_index{node.getInputs().at(ir::operation::SquaredDifference::Input::LHS)}; const auto rhs_index{node.getInputs().at(ir::operation::SquaredDifference::Input::RHS)}; - auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index).get(); - auto lhs_tensor = _tensor_reg->getAclTensor(lhs_index).get(); - auto rhs_tensor = _tensor_reg->getAclTensor(rhs_index).get(); + auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index); + auto lhs_tensor = _tensor_reg->getAclTensor(lhs_index); + auto rhs_tensor = _tensor_reg->getAclTensor(rhs_index); auto fn = acl_common::generateLayer<arm_compute::CLElementwiseSquaredDiff>( 
lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle()); @@ -1140,9 +1228,9 @@ void KernelGenerator::visit(const ir::operation::TopKV2 &node) const auto k = node.param().k; - auto values_tensor = _tensor_reg->getAclTensor(outputValues_index).get(); - auto indices_tensor = _tensor_reg->getAclTensor(outputIndices_index).get(); - auto input_tensor = _tensor_reg->getAclTensor(inputData_index).get(); + auto values_tensor = _tensor_reg->getAclTensor(outputValues_index); + auto indices_tensor = _tensor_reg->getAclTensor(outputIndices_index); + auto input_tensor = _tensor_reg->getAclTensor(inputData_index); auto fn = acl_common::generateLayer<arm_compute::CLTopKV2>( input_tensor->handle(), k, values_tensor->handle(), indices_tensor->handle()); @@ -1162,9 +1250,9 @@ void KernelGenerator::visit(const ir::operation::Gather &node) const auto axis_value = (axis_raw < 0 ? (ifm_rank + axis_raw) : axis_raw); const int axis = ::onert::backend::acl_common::ToARMComputeAxis(ifm_rank, axis_value).value(); - auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index).get(); - auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index).get(); - auto indices_tensor = _tensor_reg->getAclTensor(indices_index).get(); + auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index); + auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index); + auto indices_tensor = _tensor_reg->getAclTensor(indices_index); // NOTE The frontend layout and backend layout must be the same for this operation. // If not the same, we have to add a stage(?) to perform permutation of output tensor. It @@ -1187,29 +1275,29 @@ void KernelGenerator::visit(const ir::operation::Gather &node) assert(k == indices_tensor->num_dimensions()); // Disable applied dim_correction - const auto orig_ifm_acl_tensor_shape = ifm_tensor->info()->tensor_shape(); if (n != ifm_tensor->info()->num_dimensions()) { // This means that high dimension's value is 1 and ifm tensor is applied dim_correction - const auto ifm = _ctx.at(ifm_index); - ifm_tensor->info()->set_tensor_shape( - acl_common::asTensorShape(ifm.shape(), _current_op_seq_layout, backend_layout, false)); + acl_common::disableDimCorrection(ifm_tensor); } - const auto orig_indice_acl_tensor_shape = indices_tensor->info()->tensor_shape(); if (k != indices_tensor->info()->num_dimensions()) { // This means that high dimension's value is 1 and indices tensor is applied dim_correction - const auto indices = _ctx.at(indices_index); - indices_tensor->info()->set_tensor_shape( - acl_common::asTensorShape(indices.shape(), _current_op_seq_layout, backend_layout, false)); + acl_common::disableDimCorrection(indices_tensor); } auto fn = acl_common::generateLayer<arm_compute::CLGatherEx>( ifm_tensor->handle(), indices_tensor->handle(), ofm_tensor->handle(), axis); // Revert disabling applied dim_correction - ifm_tensor->info()->set_tensor_shape(orig_ifm_acl_tensor_shape); - indices_tensor->info()->set_tensor_shape(orig_indice_acl_tensor_shape); + if (ifm_tensor->dimension(0) == 1) + { + acl_common::enableDimCorrection(ifm_tensor); + } + if (indices_tensor->dimension(0) == 1) + { + acl_common::enableDimCorrection(indices_tensor); + } _return_fn = asAclFunction(std::move(fn)); } @@ -1218,19 +1306,20 @@ void KernelGenerator::visit(const ir::operation::ArgMax &node) { const auto ofm_index{node.getOutputs().at(0)}; const auto ifm_index{node.getInputs().at(ir::operation::ArgMax::Input::INPUT)}; + const auto axis_index{node.getInputs().at(ir::operation::ArgMax::Input::AXIS)}; auto ifm_shape = _ctx.at(ifm_index).shape(); auto ofm_shape 
= _ctx.at(ofm_index).shape(); assert((ifm_shape.rank() - 1) == ofm_shape.rank()); - auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index).get(); - auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index).get(); + auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index); + auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index); const auto ifm_rank = _ctx.at(ifm_index).shape().rank(); auto frontend_layout = _current_op_seq_layout; auto backend_layout = ifm_tensor->layout(); - int axis_value = node.param().axis; + int axis_value = _ctx.at(axis_index).asScalar<int32_t>(); if (axis_value < 0) { axis_value += ifm_rank; @@ -1239,7 +1328,7 @@ void KernelGenerator::visit(const ir::operation::ArgMax &node) auto acl_axis = acl_common::ToARMComputeAxis(ifm_rank, axis_value, frontend_layout, backend_layout).value(); - auto fn = acl_common::generateLayer<arm_compute::CLArgMinMaxLayer>( + auto fn = acl_common::generateLayer<arm_compute::CLArgMinMaxLayerEx>( ifm_tensor->handle(), acl_axis, ofm_tensor->handle(), ::arm_compute::ReductionOperation::ARG_IDX_MAX); @@ -1257,8 +1346,8 @@ void KernelGenerator::visit(const ir::operation::LocalResponseNormalization &nod auto beta = node.param().beta; auto bias = node.param().bias; - auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index).get(); - auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index).get(); + auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index); + auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index); const auto norm_info = ::arm_compute::NormalizationLayerInfo( ::arm_compute::NormType::CROSS_MAP, radius * 2 + 1, alpha, beta, bias, false); @@ -1277,8 +1366,8 @@ void KernelGenerator::visit(const ir::operation::DepthToSpace &node) auto block_size = node.param().block_size; assert(block_size > 0); - auto output_tensor = _tensor_reg->getAclTensor(output_index).get(); - auto input_tensor = _tensor_reg->getAclTensor(input_index).get(); + auto output_tensor = _tensor_reg->getAclTensor(output_index); + auto input_tensor = _tensor_reg->getAclTensor(input_index); auto fn = acl_common::generateLayer<arm_compute::CLDepthToSpaceLayer>( input_tensor->handle(), output_tensor->handle(), block_size); @@ -1289,22 +1378,27 @@ void KernelGenerator::visit(const ir::operation::DepthToSpace &node) void KernelGenerator::visit(const ir::operation::Split &node) { const auto ifm_index{node.getInputs().at(ir::operation::Split::Input::INPUT)}; + const auto axis_index{node.getInputs().at(ir::operation::Split::Input::AXIS)}; assert(node.param().num_splits == static_cast<int>(node.getOutputs().size())); + if (!_ctx.at(axis_index).isConstant()) + { + throw std::runtime_error("Non-constant axis_index NYI for acl_cl backend"); + } const auto ifm_rank = _ctx.at(ifm_index).shape().rank(); std::vector<ir::OperandIndex> output_indexes; for (const auto &output : node.getOutputs()) output_indexes.emplace_back(output); - auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index).get(); + auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index); std::vector<arm_compute::ICLTensor *> output_tensors; for (const auto &ofm_ind : output_indexes) - output_tensors.emplace_back(_tensor_reg->getAclTensor(ofm_ind).get()->handle()); + output_tensors.emplace_back(_tensor_reg->getAclTensor(ofm_ind)->handle()); const auto frontend_layout = _current_op_seq_layout; const auto backend_layout = ifm_tensor->layout(); - auto axis = node.param().axis; + auto axis = _ctx.at(axis_index).asScalar<int32_t>(); if (axis < 0) axis += ifm_rank; axis = acl_common::ToARMComputeAxis(ifm_rank, axis, frontend_layout, 
backend_layout).value(); @@ -1315,6 +1409,60 @@ void KernelGenerator::visit(const ir::operation::Split &node) _return_fn = asAclFunction(std::move(fn)); } +void KernelGenerator::visit(const ir::operation::SplitV &node) +{ + const auto ifm_index{node.getInputs().at(ir::operation::SplitV::Input::INPUT)}; + const auto size_split_index{node.getInputs().at(ir::operation::SplitV::Input::SIZE_SPLITS)}; + const auto split_dim_index{node.getInputs().at(ir::operation::SplitV::Input::SPLIT_DIM)}; + + assert(node.param().num_splits == static_cast<int>(node.getOutputs().size())); + + const size_t ifm_rank = _ctx.at(ifm_index).shape().rank(); + std::vector<ir::OperandIndex> output_indexes; + for (const auto &output : node.getOutputs()) + output_indexes.emplace_back(output); + + auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index); + auto size_split_tensor = _tensor_reg->getAclTensor(size_split_index); + + std::vector<arm_compute::ICLTensor *> output_tensors; + for (const auto &ofm_ind : output_indexes) + output_tensors.emplace_back(_tensor_reg->getAclTensor(ofm_ind)->handle()); + + auto fn = std::make_unique<arm_compute::CLSplitVEx>(); + const auto &split_dim_op = _ctx.at(split_dim_index); + if (split_dim_op.isConstant()) + { + int32_t split_dim = split_dim_op.asScalar<int32_t>(); + uint32_t split_dim_revised = (split_dim < 0) ? (split_dim + ifm_rank) : split_dim; + const auto frontend_layout = _current_op_seq_layout; + const auto backend_layout = ifm_tensor->layout(); + + if (ifm_tensor->num_dimensions() != ifm_tensor->info()->num_dimensions()) + { + // This means that high dimension's value is 1 and ifm tensor is applied dim_correction + acl_common::disableDimCorrection(ifm_tensor); + } + + split_dim_revised = + acl_common::ToARMComputeAxis(ifm_rank, split_dim_revised, frontend_layout, backend_layout) + .value(); + fn->configure(ifm_tensor->handle(), size_split_tensor->handle(), split_dim_revised, + output_tensors, node.param().num_splits); + + if (ifm_tensor->dimension(0) == 1) + { + acl_common::enableDimCorrection(ifm_tensor); + } + } + else + { + throw std::runtime_error("Non-constant split_dim NYI for acl_cl backend"); + } + + _return_fn = asAclFunction(std::move(fn)); +} + void KernelGenerator::visit(const ir::operation::Unpack &node) { const auto input_index{node.getInputs().at(ir::operation::Unpack::Input::INPUT)}; @@ -1326,34 +1474,32 @@ void KernelGenerator::visit(const ir::operation::Unpack &node) for (const auto &output_index : node.getOutputs()) output_indexes.emplace_back(output_index); - auto input = _tensor_reg->getAclTensor(input_index).get()->handle(); + auto input_tensor = _tensor_reg->getAclTensor(input_index); std::vector<arm_compute::ICLTensor *> outputs; for (const auto &output_index : output_indexes) outputs.emplace_back(_tensor_reg->getAclTensor(output_index)->handle()); const auto frontend_layout = _current_op_seq_layout; - const auto backend_layout = _tensor_reg->getAclTensor(input_index).get()->layout(); + const auto backend_layout = _tensor_reg->getAclTensor(input_index)->layout(); if (axis < 0) axis += input_rank; axis = acl_common::ToARMComputeAxis(input_rank, axis, frontend_layout, backend_layout).value(); // Disable applied dim_correction - std::vector<arm_compute::TensorShape> orig_outputs_acl_tensor_shapes; - for (const auto &output_index : output_indexes) + if (input_tensor->num_dimensions() != input_tensor->info()->num_dimensions()) { - size_t output_rank = _ctx.at(output_index).shape().rank(); - const auto &output_tensor = _tensor_reg->getAclTensor(output_index); 
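The Unpack change here follows the same recipe as StridedSlice, OneHot, Pack, Gather and SplitV earlier in this file: disable dim_correction when the tensor's own rank no longer matches its ACL TensorInfo rank, configure the layer, then re-enable it only when the leading dimension is 1 (Pad, just below, deliberately keeps it disabled for 4D tensors). A hedged sketch of that wrapper pattern using the enableDimCorrection/disableDimCorrection helpers this change adds to acl_common; withDimCorrectionDisabled itself is a hypothetical name:

// Sketch only: mirrors the disable/configure/re-enable sequence used by the
// kernels above; configure is whatever layer setup the visitor performs.
template <typename AclTensor, typename ConfigureFn>
void withDimCorrectionDisabled(AclTensor *tensor, ConfigureFn &&configure)
{
  // A rank mismatch means the highest dimension is 1 and dim_correction was
  // applied, which the kernel configured below must not rely on.
  if (tensor->num_dimensions() != tensor->info()->num_dimensions())
    acl_common::disableDimCorrection(tensor);

  configure();

  // Same re-enable condition as the hunks in this file.
  if (tensor->dimension(0) == 1)
    acl_common::enableDimCorrection(tensor);
}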
-    orig_outputs_acl_tensor_shapes.emplace_back(output_tensor->info()->tensor_shape());
-    assert(output_rank == output_tensor->num_dimensions());
-    if (output_rank != output_tensor->info()->num_dimensions())
-    {
-      // This means that high dimension's value is 1 and ifm tensor is applied dim_correction
-      output_tensor->info()->set_tensor_shape(acl_common::asTensorShape(
-          _ctx.at(output_index).shape(), _current_op_seq_layout, backend_layout, false));
-    }
+    // This means that high dimension's value is 1 and input tensor is applied dim_correction
+    acl_common::disableDimCorrection(input_tensor);
   }
-  auto fn = acl_common::generateLayer<arm_compute::CLUnstack>(input, outputs, axis);
+  auto fn =
+      acl_common::generateLayer<arm_compute::CLUnstack>(input_tensor->handle(), outputs, axis);
+
+  // Revert disabling applied dim_correction
+  if (input_tensor->dimension(0) == 1)
+  {
+    acl_common::enableDimCorrection(input_tensor);
+  }
   _return_fn = asAclFunction(std::move(fn));
 }
@@ -1373,11 +1519,11 @@ void KernelGenerator::visit(const ir::operation::Pad &node)
   auto quant_info = ::arm_compute::QuantizationInfo(input_type.scale(), input_type.offset());
   const auto pixel_value = ::arm_compute::PixelValue(0, data_type, quant_info);
-  auto input = _tensor_reg->getAclTensor(input_index).get()->handle();
-  auto output = _tensor_reg->getAclTensor(output_index).get()->handle();
+  auto input = _tensor_reg->getAclTensor(input_index)->handle();
+  auto output = _tensor_reg->getAclTensor(output_index)->handle();
   const auto frontend_layout = _current_op_seq_layout;
-  const auto backend_layout = _tensor_reg->getAclTensor(input_index).get()->layout();
+  const auto backend_layout = _tensor_reg->getAclTensor(input_index)->layout();
   ::arm_compute::PaddingList padding_list;
   padding_list.resize(rank);
@@ -1391,21 +1537,26 @@ void KernelGenerator::visit(const ir::operation::Pad &node)
   }
   // Disable applied dim_correction
-  size_t input_rank = _ctx.at(input_index).shape().rank();
   const auto &input_tensor = _tensor_reg->getAclTensor(input_index);
-  assert(input_rank == input_tensor->num_dimensions());
-  if (input_rank != input_tensor->info()->num_dimensions())
+  if (input_tensor->num_dimensions() != input_tensor->info()->num_dimensions())
   {
-    // This means that high dimension's value is 1 and ifm tensor is applied dim_correction
-    input_tensor->info()->set_tensor_shape(acl_common::asTensorShape(
-        _ctx.at(input_index).shape(), frontend_layout, backend_layout, false));
+    // This means that high dimension's value is 1 and input tensor is applied dim_correction
+    acl_common::disableDimCorrection(input_tensor);
   }
   auto fn =
       acl_common::generateLayer<arm_compute::CLPadLayer>(input, output, padding_list, pixel_value);
-  // Do not revert disabling applied dim_correction CLPadKernel has cl kernel for 4-dimension
-  // It would produce a mistach of result
+  // NOTE Do not revert disabling applied dim_correction for 4D.
+  // It would produce a mismatch of result by incorrect offset_first_element in
+  // ICLKernel::add_tensor_argument<3>().
+  // We have to disable applied dim_correction and not revert enabling for the kernel that slices
+  // 4D to 3D because slicing arm_compute::Window can cause incorrect offset_first_element if the
+  // used tensor is 4D and the tensor's high dimension is 1
+  if (input_tensor->num_dimensions() < 4 && input_tensor->dimension(0) == 1)
+  {
+    acl_common::enableDimCorrection(input_tensor);
+  }
   _return_fn = asAclFunction(std::move(fn));
 }
@@ -1415,8 +1566,8 @@ void KernelGenerator::visit(const ir::operation::ConvertFp32ToFp16 &node)
   const auto ofm_index{node.getOutputs().at(0)};
   const auto ifm_index{node.getInputs().at(ir::operation::ConvertFp32ToFp16::Input::INPUT)};
-  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index).get();
-  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index).get();
+  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
+  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
   auto fn = acl_common::generateLayer<arm_compute::CLDepthConvertLayer>(
       ifm_tensor->handle(), ofm_tensor->handle(), ::arm_compute::ConvertPolicy::SATURATE, 0);
@@ -1429,8 +1580,8 @@ void KernelGenerator::visit(const ir::operation::ConvertFp16ToFp32 &node)
   const auto ofm_index{node.getOutputs().at(0)};
   const auto ifm_index{node.getInputs().at(ir::operation::ConvertFp16ToFp32::Input::INPUT)};
-  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index).get();
-  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index).get();
+  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
+  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
   auto fn = acl_common::generateLayer<arm_compute::CLDepthConvertLayer>(
       ifm_tensor->handle(), ofm_tensor->handle(), ::arm_compute::ConvertPolicy::SATURATE, 0);
@@ -1438,6 +1589,30 @@ void KernelGenerator::visit(const ir::operation::ConvertFp16ToFp32 &node)
   _return_fn = asAclFunction(std::move(fn));
 }
+void KernelGenerator::visit(const ir::operation::Reverse &node)
+{
+  const auto ofm_index{node.getOutputs().at(0)};
+  const auto ifm_index{node.getInputs().at(ir::operation::Reverse::Input::INPUT)};
+  const auto axis_index{node.getInputs().at(ir::operation::Reverse::Input::AXIS)};
+
+  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
+  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
+  auto axis_tensor = _tensor_reg->getAclTensor(axis_index);
+
+  // WORKAROUND: acl-cl backend only allows U32 type for axis
+  // ConstantInitializer will resolve S32 type to U32 type
+  if (_ctx.at(axis_index).isConstant() &&
+      (axis_tensor->handle()->info()->data_type() == arm_compute::DataType::S32))
+  {
+    axis_tensor->handle()->info()->set_data_type(arm_compute::DataType::U32);
+  }
+
+  auto fn = acl_common::generateLayer<arm_compute::CLReverse>(
+      ifm_tensor->handle(), ofm_tensor->handle(), axis_tensor->handle());
+
+  _return_fn = asAclFunction(std::move(fn));
+}
+
 } // namespace acl_cl
 } // namespace backend
 } // namespace onert
diff --git a/runtime/onert/backend/acl_cl/KernelGenerator.h b/runtime/onert/backend/acl_cl/KernelGenerator.h
index d188d6d83..e8a922677 100644
--- a/runtime/onert/backend/acl_cl/KernelGenerator.h
+++ b/runtime/onert/backend/acl_cl/KernelGenerator.h
@@ -59,6 +59,7 @@ public:
   void visit(const ir::operation::InstanceNorm &) override;
   void visit(const ir::operation::Comparison &) override;
   void visit(const ir::operation::LSTM &) override;
+  void visit(const ir::operation::OneHot &) override;
   void visit(const ir::operation::Pack &) override;
   void visit(const ir::operation::Pool2D &) override;
   void visit(const ir::operation::Permute &) override;
@@ -79,10
+80,12 @@ public: void visit(const ir::operation::LocalResponseNormalization &) override; void visit(const ir::operation::DepthToSpace &) override; void visit(const ir::operation::Split &) override; + void visit(const ir::operation::SplitV &) override; void visit(const ir::operation::Unpack &) override; void visit(const ir::operation::Pad &) override; void visit(const ir::operation::ConvertFp32ToFp16 &) override; void visit(const ir::operation::ConvertFp16ToFp32 &) override; + void visit(const ir::operation::Reverse &) override; private: const ir::Operands &_ctx; diff --git a/runtime/onert/backend/acl_common/AclKernelGen.h b/runtime/onert/backend/acl_common/AclKernelGen.h index 372ce689e..257bbd3b4 100644 --- a/runtime/onert/backend/acl_common/AclKernelGen.h +++ b/runtime/onert/backend/acl_common/AclKernelGen.h @@ -30,6 +30,20 @@ namespace backend namespace acl_common { +void enableDimCorrection(IACLTensor *tensor) +{ + size_t input_rank = tensor->num_dimensions(); + const_cast<arm_compute::TensorShape &>(tensor->info()->tensor_shape()) + .set(input_rank - 1, tensor->info()->dimension(input_rank - 1), true); +} + +void disableDimCorrection(IACLTensor *tensor) +{ + size_t input_rank = tensor->num_dimensions(); + const_cast<arm_compute::TensorShape &>(tensor->info()->tensor_shape()) + .set(input_rank - 1, tensor->info()->dimension(input_rank - 1), false); +} + template <typename Layer, typename... Args> std::unique_ptr<arm_compute::IFunction> generateLayer(Args &&... args) { @@ -138,30 +152,27 @@ std::unique_ptr<exec::IFunction> kernelGenLSTM(const ir::operation::LSTM &node, const auto projection_clip = projection_threshold; assert(cell_clip >= 0.f && projection_clip >= 0.f); - auto scratch_buffer_tensor = tensor_reg->getAclTensor(scratch_buffer_index).get(); - auto output_state_out_tensor = tensor_reg->getAclTensor(output_state_out_index).get(); - auto cell_state_out_tensor = tensor_reg->getAclTensor(cell_state_out_index).get(); - auto output_tensor = tensor_reg->getAclTensor(output_index).get(); + auto scratch_buffer_tensor = tensor_reg->getAclTensor(scratch_buffer_index); + auto output_state_out_tensor = tensor_reg->getAclTensor(output_state_out_index); + auto cell_state_out_tensor = tensor_reg->getAclTensor(cell_state_out_index); + auto output_tensor = tensor_reg->getAclTensor(output_index); - auto input_tensor = tensor_reg->getAclTensor(input_index).get(); + auto input_tensor = tensor_reg->getAclTensor(input_index); - auto input_to_forget_weights_tensor = - tensor_reg->getAclTensor(input_to_forget_weights_index).get(); - auto input_to_cell_weights_tensor = tensor_reg->getAclTensor(input_to_cell_weights_index).get(); - auto input_to_output_weights_tensor = - tensor_reg->getAclTensor(input_to_output_weights_index).get(); + auto input_to_forget_weights_tensor = tensor_reg->getAclTensor(input_to_forget_weights_index); + auto input_to_cell_weights_tensor = tensor_reg->getAclTensor(input_to_cell_weights_index); + auto input_to_output_weights_tensor = tensor_reg->getAclTensor(input_to_output_weights_index); auto recurrent_to_forget_weights_tensor = - tensor_reg->getAclTensor(recurrent_to_forget_weights_index).get(); - auto recurrent_to_cell_weights_tensor = - tensor_reg->getAclTensor(recurrent_to_cell_weights_index).get(); + tensor_reg->getAclTensor(recurrent_to_forget_weights_index); + auto recurrent_to_cell_weights_tensor = tensor_reg->getAclTensor(recurrent_to_cell_weights_index); auto recurrent_to_output_weights_tensor = - tensor_reg->getAclTensor(recurrent_to_output_weights_index).get(); 
+ tensor_reg->getAclTensor(recurrent_to_output_weights_index); - auto forget_gate_bias_tensor = tensor_reg->getAclTensor(forget_gate_bias_index).get(); - auto cell_bias_tensor = tensor_reg->getAclTensor(cell_bias_index).get(); - auto output_gate_bias_tensor = tensor_reg->getAclTensor(output_gate_bias_index).get(); - auto output_state_in_tensor = tensor_reg->getAclTensor(output_state_in_index).get(); - auto cell_state_in_tensor = tensor_reg->getAclTensor(cell_state_in_index).get(); + auto forget_gate_bias_tensor = tensor_reg->getAclTensor(forget_gate_bias_index); + auto cell_bias_tensor = tensor_reg->getAclTensor(cell_bias_index); + auto output_gate_bias_tensor = tensor_reg->getAclTensor(output_gate_bias_index); + auto output_state_in_tensor = tensor_reg->getAclTensor(output_state_in_index); + auto cell_state_in_tensor = tensor_reg->getAclTensor(cell_state_in_index); auto act_info = asActivationLayerInfo(activation); @@ -169,13 +180,13 @@ std::unique_ptr<exec::IFunction> kernelGenLSTM(const ir::operation::LSTM &node, if (has_cifg_param) { auto input_to_input_weights_tensor = - tensor_reg->getAclTensor(input_to_input_weights_index).get(); // optional + tensor_reg->getAclTensor(input_to_input_weights_index); // optional auto recurrent_to_input_weights_tensor = - tensor_reg->getAclTensor(recurrent_to_input_weights_index).get(); // optional + tensor_reg->getAclTensor(recurrent_to_input_weights_index); // optional auto cell_to_input_weights_handle = - has_peephole_param ? tensor_reg->getAclTensor(cell_to_input_weights_index).get()->handle() + has_peephole_param ? tensor_reg->getAclTensor(cell_to_input_weights_index)->handle() : nullptr; // optional (non-cifg && peephole) - auto input_gate_bias_tensor = tensor_reg->getAclTensor(input_gate_bias_index).get(); // optional + auto input_gate_bias_tensor = tensor_reg->getAclTensor(input_gate_bias_index); // optional lstm_params.set_cifg_params(input_to_input_weights_tensor->handle(), recurrent_to_input_weights_tensor->handle(), cell_to_input_weights_handle, input_gate_bias_tensor->handle()); @@ -183,19 +194,18 @@ std::unique_ptr<exec::IFunction> kernelGenLSTM(const ir::operation::LSTM &node, if (has_peephole_param) { auto cell_to_forget_weights_tensor = - tensor_reg->getAclTensor(cell_to_forget_weights_index).get(); // optional + tensor_reg->getAclTensor(cell_to_forget_weights_index); // optional auto cell_to_output_weights_tensor = - tensor_reg->getAclTensor(cell_to_output_weights_index).get(); // optional + tensor_reg->getAclTensor(cell_to_output_weights_index); // optional lstm_params.set_peephole_params(cell_to_forget_weights_tensor->handle(), cell_to_output_weights_tensor->handle()); } if (has_projection_param) { - auto projection_weights_tensor = - tensor_reg->getAclTensor(projection_weights_index).get(); // optional - auto projection_bias_handle = - has_projection_bias ? tensor_reg->getAclTensor(projection_bias_index).get()->handle() - : nullptr; // optional + auto projection_weights_tensor = tensor_reg->getAclTensor(projection_weights_index); // optional + auto projection_bias_handle = has_projection_bias + ? 
tensor_reg->getAclTensor(projection_bias_index)->handle() + : nullptr; // optional lstm_params.set_projection_params(projection_weights_tensor->handle(), projection_bias_handle); } @@ -260,10 +270,10 @@ kernelGenFullyConnected(const ir::operation::FullyConnected &node, const ir::Ope reshape.dim(1) = input_size; /* W */ } - auto output_tensor = tensor_reg->getAclTensor(output_index).get(); - const auto input_tensor = tensor_reg->getAclTensor(input_index).get(); - const auto weight_tensor = tensor_reg->getAclTensor(weight_index).get(); - const auto bias_tensor = tensor_reg->getAclTensor(bias_index).get(); + auto output_tensor = tensor_reg->getAclTensor(output_index); + const auto input_tensor = tensor_reg->getAclTensor(input_index); + const auto weight_tensor = tensor_reg->getAclTensor(weight_index); + const auto bias_tensor = tensor_reg->getAclTensor(bias_index); const auto frontend_layout = layout; const auto acl_layout = output_tensor->handle()->info()->data_layout(); @@ -313,8 +323,8 @@ kernelGenPool2D(const T_PoolOp &node, const ir::Operands &operands, VERBOSE(Pool2DParam) << "PAD(L): " << padding.left << std::endl; VERBOSE(Pool2DParam) << "PAD(R): " << padding.right << std::endl; - auto ofm_tensor = tensor_reg->getAclTensor(ofm_index).get(); - auto ifm_tensor = tensor_reg->getAclTensor(ifm_index).get(); + auto ofm_tensor = tensor_reg->getAclTensor(ofm_index); + auto ifm_tensor = tensor_reg->getAclTensor(ifm_index); ::arm_compute::PoolingLayerInfo info{ pooling_type, ::arm_compute::Size2D{kw, kh}, ifm_tensor->info()->data_layout(), diff --git a/runtime/onert/backend/acl_common/AclSubTensorAnalyzer.h b/runtime/onert/backend/acl_common/AclSubTensorAnalyzer.h index 83d7ad6fd..beec95718 100644 --- a/runtime/onert/backend/acl_common/AclSubTensorAnalyzer.h +++ b/runtime/onert/backend/acl_common/AclSubTensorAnalyzer.h @@ -61,8 +61,14 @@ public: for (const auto &ind : inputs) { - // NOTE Not support the case that concat's input is a constant or a input of model - if (_graph.operands().at(ind).isConstant() || _graph.getInputs().contains(ind)) + /** + * NOTE Not support below cases. + * 1. concat's input is a constant. + * 2. concat's input is a input of model. + * 3. concat's input already becomes a subtensor of another concat. 
+ */ + if (_graph.operands().at(ind).isConstant() || _graph.getInputs().contains(ind) || + _parent_map.find(ind) != _parent_map.end()) { return; } diff --git a/runtime/onert/backend/acl_common/AclTensorBuilder.h b/runtime/onert/backend/acl_common/AclTensorBuilder.h index 91452014b..bb7abc95d 100644 --- a/runtime/onert/backend/acl_common/AclTensorBuilder.h +++ b/runtime/onert/backend/acl_common/AclTensorBuilder.h @@ -70,8 +70,6 @@ public: void allocate() override; void postFunctionPrepare() override; - std::unique_ptr<ITensorManager> releaseStaticTensorManager(void) override; - T_AclTensorManager *acl_tensor_manager(void) { return _tensor_mgr.get(); } void setUsesCount(const ir::OperandIndex &index, size_t num_uses) @@ -161,7 +159,6 @@ void AclTensorBuilder<T_ITensor, T_Tensor, T_SubTensor>::registerTensorInfo( else { // SubTensors - assert(!info.isConstant() && "Subtensors of constants are not supported yet."); // Update offset info and emplace @@ -306,13 +303,6 @@ void AclTensorBuilder<T_ITensor, T_Tensor, T_SubTensor>::postFunctionPrepare(voi } template <typename T_ITensor, typename T_Tensor, typename T_SubTensor> -std::unique_ptr<ITensorManager> -AclTensorBuilder<T_ITensor, T_Tensor, T_SubTensor>::releaseStaticTensorManager(void) -{ - return std::move(_tensor_mgr); -} - -template <typename T_ITensor, typename T_Tensor, typename T_SubTensor> void AclTensorBuilder<T_ITensor, T_Tensor, T_SubTensor>::buildTensors(void) { assert(_tensor_mgr->constTensors().size() == 0); diff --git a/runtime/onert/backend/acl_common/AclTensorRegistry.h b/runtime/onert/backend/acl_common/AclTensorRegistry.h index 1ef9f4b35..02d66db99 100644 --- a/runtime/onert/backend/acl_common/AclTensorRegistry.h +++ b/runtime/onert/backend/acl_common/AclTensorRegistry.h @@ -36,17 +36,11 @@ template <typename T_AclTensorManager> class AclTensorRegistry : public ITensorR public: AclTensorRegistry(T_AclTensorManager *tensor_mgr) : _tensor_mgr{tensor_mgr} {} - std::shared_ptr<ITensor> getITensor(const ir::OperandIndex &ind) override - { - return _tensor_mgr->at(ind); - } + ITensor *getITensor(const ir::OperandIndex &ind) override { return _tensor_mgr->at(ind).get(); } - std::shared_ptr<ITensor> getNativeITensor(const ir::OperandIndex &ind) override - { - return getITensor(ind); - } + ITensor *getNativeITensor(const ir::OperandIndex &ind) override { return getITensor(ind); } - auto getAclTensor(const ir::OperandIndex &ind) { return _tensor_mgr->at(ind); } + auto getAclTensor(const ir::OperandIndex &ind) { return _tensor_mgr->at(ind).get(); } private: T_AclTensorManager *_tensor_mgr; diff --git a/runtime/onert/backend/acl_common/Convert.cc b/runtime/onert/backend/acl_common/Convert.cc index 67dcc8192..6ef6a2dc3 100644 --- a/runtime/onert/backend/acl_common/Convert.cc +++ b/runtime/onert/backend/acl_common/Convert.cc @@ -112,6 +112,8 @@ namespace acl_common return ::arm_compute::DataType::S8; case ir::DataType::FLOAT16: return ::arm_compute::DataType::F16; + case ir::DataType::INT64: + return ::arm_compute::DataType::S64; default: throw std::runtime_error("Not supported, yet"); break; @@ -299,6 +301,8 @@ ir::DataType asRuntimeDataType(::arm_compute::DataType data_type) return ir::DataType::QUANT_INT8_SYMM; case ::arm_compute::DataType::F16: return ir::DataType::FLOAT16; + case ::arm_compute::DataType::S64: + return ir::DataType::INT64; default: throw std::runtime_error{"Not supported, yet"}; break; @@ -335,6 +339,27 @@ arm_compute::ReduceOperation convertReduceType(ir::operation::Reduce::ReduceType } } +arm_compute::PixelValue 
asPixelValue(const ir::Operand &operand) +{ + assert(operand.isConstant()); + assert(operand.shape().num_elements() == 1); + switch (operand.typeInfo().type()) + { + case ir::DataType::INT32: + return arm_compute::PixelValue(operand.asScalar<int32_t>()); + case ir::DataType::INT64: + return arm_compute::PixelValue(operand.asScalar<int64_t>()); + case ir::DataType::UINT32: + return arm_compute::PixelValue(operand.asScalar<uint64_t>()); + case ir::DataType::UINT8: + return arm_compute::PixelValue(operand.asScalar<uint8_t>()); + case ir::DataType::FLOAT32: + return arm_compute::PixelValue(operand.asScalar<float>()); + default: + throw std::runtime_error("asPixelValue : Not supported datatype yet"); + } +} + } // namespace acl_common } // namespace backend } // namespace onert diff --git a/runtime/onert/backend/acl_common/Convert.h b/runtime/onert/backend/acl_common/Convert.h index 380321c07..0b36df102 100644 --- a/runtime/onert/backend/acl_common/Convert.h +++ b/runtime/onert/backend/acl_common/Convert.h @@ -17,6 +17,7 @@ #ifndef __ONERT_BACKEND_ACL_COMMON_CONVERT_H__ #define __ONERT_BACKEND_ACL_COMMON_CONVERT_H__ +#include <arm_compute/core/PixelValue.h> #include <arm_compute/core/TensorInfo.h> #include <arm_compute/core/SubTensorInfo.h> #include <arm_compute/core/TensorShape.h> @@ -85,6 +86,8 @@ ir::DataType asRuntimeDataType(::arm_compute::DataType data_type); arm_compute::PoolingType convertPoolType(ir::operation::Pool2D::PoolType pool_type_ir); arm_compute::ReduceOperation convertReduceType(ir::operation::Reduce::ReduceType reduce_type_ir); +arm_compute::PixelValue asPixelValue(const ir::Operand &operand); + } // namespace acl_common } // namespace backend } // namespace onert diff --git a/runtime/onert/backend/acl_neon/KernelGenerator.cc b/runtime/onert/backend/acl_neon/KernelGenerator.cc index 6d53c1245..598d043e7 100644 --- a/runtime/onert/backend/acl_neon/KernelGenerator.cc +++ b/runtime/onert/backend/acl_neon/KernelGenerator.cc @@ -18,7 +18,6 @@ #include <arm_compute/runtime/NEON/NEFunctions.h> // Include all ARM Compute NEON functions #include <arm_compute/runtime/NEON/NEFunctionsEx.h> // Include all ARM Compute EX NEON functions -#include <arm_compute/runtime/CPP/functions/CPPOneHotEx.h> #include <AclActivationBuilder.h> #include <AclFunction.h> @@ -75,15 +74,16 @@ void KernelGenerator::visit(const ir::operation::ArgMax &node) { const auto ofm_index{node.getOutputs().at(0)}; const auto ifm_index{node.getInputs().at(ir::operation::ArgMax::Input::INPUT)}; + const auto axis_index{node.getInputs().at(ir::operation::ArgMax::Input::AXIS)}; const auto ifm_rank = _ctx.at(ifm_index).shape().rank(); - auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index).get(); - auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index).get(); + auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index); + auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index); auto frontend_layout = _current_op_seq_layout; auto backend_layout = ifm_tensor->layout(); - int axis_value = node.param().axis; + int axis_value = _ctx.at(axis_index).asScalar<int32_t>(); if (axis_value < 0) { axis_value += ifm_rank; @@ -106,9 +106,9 @@ void KernelGenerator::visit(const ir::operation::BatchToSpaceND &node) const auto block_size_index{ node.getInputs().at(ir::operation::BatchToSpaceND::Input::BLOCK_SIZE)}; - auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index).get(); - auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index).get(); - auto block_size_tensor = _tensor_reg->getAclTensor(block_size_index).get(); + auto ofm_tensor = 
_tensor_reg->getAclTensor(ofm_index); + auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index); + auto block_size_tensor = _tensor_reg->getAclTensor(block_size_index); assert(_ctx.at(block_size_index).data()); @@ -126,9 +126,9 @@ void KernelGenerator::visit(const ir::operation::BinaryArithmetic &node) const auto activation = node.param().activation; - auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index).get(); - auto lhs_tensor = _tensor_reg->getAclTensor(lhs_index).get(); - auto rhs_tensor = _tensor_reg->getAclTensor(rhs_index).get(); + auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index); + auto lhs_tensor = _tensor_reg->getAclTensor(lhs_index); + auto rhs_tensor = _tensor_reg->getAclTensor(rhs_index); std::unique_ptr<arm_compute::IFunction> fn; switch (node.param().arithmetic_type) @@ -190,10 +190,10 @@ void KernelGenerator::visit(const ir::operation::Conv2D &node) ker_width, ker_height); const auto activation = node.param().activation; - auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index).get(); - auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index).get(); - auto ker_tensor = _tensor_reg->getAclTensor(ker_index).get(); - auto bias_tensor = _tensor_reg->getAclTensor(bias_index).get(); + auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index); + auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index); + auto ker_tensor = _tensor_reg->getAclTensor(ker_index); + auto bias_tensor = _tensor_reg->getAclTensor(bias_index); const auto conv_info = acl_common::asPadStrideInfo(padding, stride); const auto act_info = acl_common::asActivationLayerInfo(activation); @@ -214,8 +214,8 @@ void KernelGenerator::visit(const ir::operation::DepthToSpace &node) auto block_size = node.param().block_size; assert(block_size > 0); - auto output_tensor = _tensor_reg->getAclTensor(output_index).get(); - auto input_tensor = _tensor_reg->getAclTensor(input_index).get(); + auto output_tensor = _tensor_reg->getAclTensor(output_index); + auto input_tensor = _tensor_reg->getAclTensor(input_index); auto fn = acl_common::generateLayer<arm_compute::NEDepthToSpaceLayer>( input_tensor->handle(), output_tensor->handle(), block_size); @@ -245,10 +245,10 @@ void KernelGenerator::visit(const ir::operation::DepthwiseConv2D &node) const auto multiplier = node.param().multiplier; const auto activation = node.param().activation; - auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index).get(); - auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index).get(); - auto ker_tensor = _tensor_reg->getAclTensor(ker_index).get(); - auto bias_tensor = _tensor_reg->getAclTensor(bias_index).get(); + auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index); + auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index); + auto ker_tensor = _tensor_reg->getAclTensor(ker_index); + auto bias_tensor = _tensor_reg->getAclTensor(bias_index); const auto conv_info = acl_common::asPadStrideInfo(padding, stride); const auto act_info = acl_common::asActivationLayerInfo(activation); @@ -282,7 +282,7 @@ void KernelGenerator::visit(const ir::operation::Concat &node) return; } - auto output_tensor = _tensor_reg->getAclTensor(ofm_index).get(); + auto output_tensor = _tensor_reg->getAclTensor(ofm_index); std::vector<::arm_compute::ITensor *> input_tensors; for (const auto &ifm_ind : input_indexes) input_tensors.emplace_back(_tensor_reg->getAclTensor(ifm_ind)->handle()); @@ -312,8 +312,8 @@ void KernelGenerator::visit(const ir::operation::ElementwiseActivation &node) const auto ofm_index{node.getOutputs().at(0)}; const auto 
ifm_index{node.getInputs().at(ir::operation::ElementwiseActivation::Input::INPUT)}; - auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index).get(); - auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index).get(); + auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index); + auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index); const ::arm_compute::ActivationLayerInfo act_info = acl_common::asActivationLayerInfo( node.param().op_type, node.param().alpha, node.param().beta); @@ -343,9 +343,9 @@ void KernelGenerator::visit(const ir::operation::ElementwiseBinary &node) const auto lhs_index{node.getInputs().at(ir::operation::ElementwiseBinary::Input::LHS)}; const auto rhs_index{node.getInputs().at(ir::operation::ElementwiseBinary::Input::RHS)}; - auto output_tensor = _tensor_reg->getAclTensor(output_index).get(); - auto lhs_tensor = _tensor_reg->getAclTensor(lhs_index).get(); - auto rhs_tensor = _tensor_reg->getAclTensor(rhs_index).get(); + auto output_tensor = _tensor_reg->getAclTensor(output_index); + auto lhs_tensor = _tensor_reg->getAclTensor(lhs_index); + auto rhs_tensor = _tensor_reg->getAclTensor(rhs_index); std::unique_ptr<arm_compute::IFunction> fn; switch (node.param().op_type) @@ -390,8 +390,8 @@ void KernelGenerator::visit(const ir::operation::ElementwiseUnary &node) const auto output_index{node.getOutputs().at(0)}; const auto input_index{node.getInputs().at(ir::operation::ElementwiseUnary::Input::INPUT)}; - auto output_tensor = _tensor_reg->getAclTensor(output_index).get(); - auto input_tensor = _tensor_reg->getAclTensor(input_index).get(); + auto output_tensor = _tensor_reg->getAclTensor(output_index); + auto input_tensor = _tensor_reg->getAclTensor(input_index); std::unique_ptr<arm_compute::IFunction> fn; switch (node.param().op_type) @@ -412,6 +412,11 @@ void KernelGenerator::visit(const ir::operation::ElementwiseUnary &node) fn = acl_common::generateLayer<arm_compute::NECopy>(input_tensor->handle(), output_tensor->handle()); } + else if (_ctx.at(input_index).typeInfo().type() == ir::DataType::BOOL8) + { + fn = acl_common::generateLayer<arm_compute::NECastBool>(input_tensor->handle(), + output_tensor->handle()); + } else { fn = acl_common::generateLayer<arm_compute::NECast>( @@ -480,9 +485,9 @@ void KernelGenerator::visit(const ir::operation::EmbeddingLookup &node) const auto lookups_index{node.getInputs().at(ir::operation::EmbeddingLookup::Input::LOOKUPS)}; const auto values_index{node.getInputs().at(ir::operation::EmbeddingLookup::Input::VALUES)}; - auto output_tensor = _tensor_reg->getAclTensor(output_index).get(); - auto lookups_tensor = _tensor_reg->getAclTensor(lookups_index).get(); - auto values_tensor = _tensor_reg->getAclTensor(values_index).get(); + auto output_tensor = _tensor_reg->getAclTensor(output_index); + auto lookups_tensor = _tensor_reg->getAclTensor(lookups_index); + auto values_tensor = _tensor_reg->getAclTensor(values_index); auto fn = acl_common::generateLayer<arm_compute::NEEmbeddingLookup>( values_tensor->handle(), output_tensor->handle(), lookups_tensor->handle()); @@ -493,7 +498,7 @@ void KernelGenerator::visit(const ir::operation::EmbeddingLookup &node) void KernelGenerator::visit(const ir::operation::FullyConnected &node) { const auto output_index{node.getOutputs().at(0)}; - auto output_tensor = _tensor_reg->getAclTensor(output_index).get(); + auto output_tensor = _tensor_reg->getAclTensor(output_index); const auto activation = node.param().activation; auto fn = acl_common::kernelGenFullyConnected<acl_common::AclFunction, ::arm_compute::ITensor, @@ 
-512,12 +517,12 @@ void KernelGenerator::visit(const ir::operation::HashtableLookup &node) const auto keys_index{node.getInputs().at(ir::operation::HashtableLookup::Input::KEYS)}; const auto values_index{node.getInputs().at(ir::operation::HashtableLookup::Input::VALUES)}; - auto output_tensor = _tensor_reg->getAclTensor(output_index).get(); - auto hits_tensor = _tensor_reg->getAclTensor(hits_index).get(); + auto output_tensor = _tensor_reg->getAclTensor(output_index); + auto hits_tensor = _tensor_reg->getAclTensor(hits_index); - auto lookups_tensor = _tensor_reg->getAclTensor(lookups_index).get(); - auto keys_tensor = _tensor_reg->getAclTensor(keys_index).get(); - auto values_tensor = _tensor_reg->getAclTensor(values_index).get(); + auto lookups_tensor = _tensor_reg->getAclTensor(lookups_index); + auto keys_tensor = _tensor_reg->getAclTensor(keys_index); + auto values_tensor = _tensor_reg->getAclTensor(values_index); auto fn = acl_common::generateLayer<arm_compute::NEHashtableLookup>( lookups_tensor->handle(), keys_tensor->handle(), values_tensor->handle(), @@ -539,9 +544,9 @@ void KernelGenerator::visit(const ir::operation::Gather &node) // Converting in reverse order const int axis = ::onert::backend::acl_common::ToARMComputeAxis(ifm_rank, axis_value).value(); - auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index).get(); - auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index).get(); - auto indices_tensor = _tensor_reg->getAclTensor(indices_index).get(); + auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index); + auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index); + auto indices_tensor = _tensor_reg->getAclTensor(indices_index); const auto backend_layout = ofm_tensor->layout(); UNUSED_RELEASE(backend_layout); @@ -567,24 +572,26 @@ void KernelGenerator::visit(const ir::operation::Gather &node) if (n != ifm_tensor->info()->num_dimensions()) { // This means that high dimension's value is 1 and ifm tensor is applied dim_correction - const auto ifm = _ctx.at(ifm_index); - ifm_tensor->info()->set_tensor_shape( - acl_common::asTensorShape(ifm.shape(), _current_op_seq_layout, backend_layout, false)); + acl_common::disableDimCorrection(ifm_tensor); } if (k != indices_tensor->info()->num_dimensions()) { // This means that high dimension's value is 1 and indices tensor is applied dim_correction - const auto indices = _ctx.at(indices_index); - indices_tensor->info()->set_tensor_shape( - acl_common::asTensorShape(indices.shape(), _current_op_seq_layout, backend_layout, false)); + acl_common::disableDimCorrection(indices_tensor); } auto fn = acl_common::generateLayer<arm_compute::NEGatherEx>( ifm_tensor->handle(), indices_tensor->handle(), ofm_tensor->handle(), axis); - // acl_neon doesn't not revert disabling applied dim_correction because acl_neon's kernels would - // use arm_compute::TensorInfo::offset_element_in_bytes() - // It would create an error when the kernel accesses high dimension that its value is 1 + // Revert disabling applied dim_correction + if (ifm_tensor->dimension(0) == 1) + { + acl_common::enableDimCorrection(ifm_tensor); + } + if (indices_tensor->dimension(0) == 1) + { + acl_common::enableDimCorrection(indices_tensor); + } _return_fn = asAclFunction(std::move(fn)); } @@ -596,10 +603,10 @@ void KernelGenerator::visit(const ir::operation::InstanceNorm &node) const auto gamma_index{node.getInputs().at(ir::operation::InstanceNorm::Input::GAMMA)}; const auto beta_index{node.getInputs().at(ir::operation::InstanceNorm::Input::BETA)}; - auto ofm_tensor = 
_tensor_reg->getAclTensor(ofm_index).get(); - auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index).get(); - auto gamma_tensor = _tensor_reg->getAclTensor(gamma_index).get(); - auto beta_tensor = _tensor_reg->getAclTensor(beta_index).get(); + auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index); + auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index); + auto gamma_tensor = _tensor_reg->getAclTensor(gamma_index); + auto beta_tensor = _tensor_reg->getAclTensor(beta_index); auto epsilon = node.param().epsilon; auto activation = node.param().activation; @@ -630,8 +637,8 @@ void KernelGenerator::visit(const ir::operation::L2Normalization &node) float beta = 0.5f; // pow(reduction, -0.5) = 1 / sqrt(reduction) float bias = 0.0f; // Don't offset the reduction. - auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index).get(); - auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index).get(); + auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index); + auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index); const auto norm_info = ::arm_compute::NormalizationLayerInfo(::arm_compute::NormType::CROSS_MAP, radius, alpha, beta, bias, false); @@ -653,8 +660,8 @@ void KernelGenerator::visit(const ir::operation::LocalResponseNormalization &nod auto beta = node.param().beta; auto bias = node.param().bias; - auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index).get(); - auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index).get(); + auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index); + auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index); const auto norm_info = ::arm_compute::NormalizationLayerInfo( ::arm_compute::NormType::CROSS_MAP, radius * 2 + 1, alpha, beta, bias, false); @@ -682,13 +689,13 @@ void KernelGenerator::visit(const ir::operation::Pack &node) for (const auto &input_index : node.getInputs()) input_indexes.emplace_back(input_index); - auto output = _tensor_reg->getAclTensor(output_index).get()->handle(); + auto output = _tensor_reg->getAclTensor(output_index)->handle(); std::vector<arm_compute::ITensor *> inputs; for (const auto &input_index : input_indexes) inputs.emplace_back(_tensor_reg->getAclTensor(input_index)->handle()); const auto frontend_layout = _current_op_seq_layout; - const auto backend_layout = _tensor_reg->getAclTensor(output_index).get()->layout(); + const auto backend_layout = _tensor_reg->getAclTensor(output_index)->layout(); if (axis < 0) axis += output_rank; @@ -697,22 +704,25 @@ void KernelGenerator::visit(const ir::operation::Pack &node) // Disable applied dim_correction for (const auto &input_index : input_indexes) { - size_t input_rank = _ctx.at(input_index).shape().rank(); const auto &input_tensor = _tensor_reg->getAclTensor(input_index); - assert(input_rank == input_tensor->num_dimensions()); - if (input_rank != input_tensor->info()->num_dimensions()) + if (input_tensor->num_dimensions() != input_tensor->info()->num_dimensions()) { - // This means that high dimension's value is 1 and ifm tensor is applied dim_correction - input_tensor->info()->set_tensor_shape(acl_common::asTensorShape( - _ctx.at(input_index).shape(), _current_op_seq_layout, backend_layout, false)); + // This means that high dimension's value is 1 and input tensor is applied dim_correction + acl_common::disableDimCorrection(input_tensor); } } auto fn = acl_common::generateLayer<arm_compute::NEStackLayer>(inputs, axis, output); - // acl_neon doesn't not revert disabling applied dim_correction because acl_neon's kernels would - // use arm_compute::TensorInfo::offset_element_in_bytes() - // It would 
create an error when the kernel accesses high dimension that its value is 1 + // Revert disabling applied dim_correction + for (const auto &input_index : input_indexes) + { + const auto &input_tensor = _tensor_reg->getAclTensor(input_index); + if (input_tensor->dimension(0) == 1) + { + acl_common::enableDimCorrection(input_tensor); + } + } _return_fn = asAclFunction(std::move(fn)); } @@ -727,8 +737,8 @@ void KernelGenerator::visit(const ir::operation::Pad &node) auto rank = _ctx.at(input_index).shape().rank(); auto pad_base = _ctx.at(pad_index).data()->base(); - auto input = _tensor_reg->getAclTensor(input_index).get()->handle(); - auto output = _tensor_reg->getAclTensor(output_index).get()->handle(); + auto input = _tensor_reg->getAclTensor(input_index)->handle(); + auto output = _tensor_reg->getAclTensor(output_index)->handle(); ::arm_compute::PaddingList padding_list; padding_list.resize(rank); @@ -737,7 +747,7 @@ void KernelGenerator::visit(const ir::operation::Pad &node) const int32_t *from = reinterpret_cast<const int32_t *>(pad_base) + (n * 2); const auto frontend_layout = _current_op_seq_layout; - const auto backend_layout = _tensor_reg->getAclTensor(input_index).get()->layout(); + const auto backend_layout = _tensor_reg->getAclTensor(input_index)->layout(); const auto axis = acl_common::ToARMComputeAxis(rank, n, frontend_layout, backend_layout).value(); padding_list[axis] = ::arm_compute::PaddingInfo{from[0], from[1]}; @@ -764,7 +774,7 @@ void KernelGenerator::visit(const ir::operation::Pool2D &node) acl_common::convertPoolType(node.param().op_type)); const auto ofm_index{node.getOutputs().at(0)}; - auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index).get(); + auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index); const auto activation = node.param().activation; _return_fn = std::make_unique<exec::FunctionSequence>( asAclFunction(std::move(raw_fn)), @@ -776,8 +786,8 @@ void KernelGenerator::visit(const ir::operation::Permute &node) const auto ofm_idx{node.getOutputs().at(0)}; const auto ifm_idx{node.getInputs().at(0)}; const auto permute_type = node.getPermuteType(); - auto ofm_tensor = _tensor_reg->getAclTensor(ofm_idx).get(); - auto ifm_tensor = _tensor_reg->getAclTensor(ifm_idx).get(); + auto ofm_tensor = _tensor_reg->getAclTensor(ofm_idx); + auto ifm_tensor = _tensor_reg->getAclTensor(ifm_idx); const auto rank = _ctx.at(ofm_idx).shape().rank(); assert(_ctx.at(ifm_idx).shape().rank() == _ctx.at(ofm_idx).shape().rank()); @@ -812,9 +822,9 @@ void KernelGenerator::visit(const ir::operation::PReLU &node) const auto ifm_index{node.getInputs().at(ir::operation::PReLU::Input::INPUT)}; const auto alpha_index{node.getInputs().at(ir::operation::PReLU::Input::ALPHA)}; - auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index).get(); - auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index).get(); - auto alpha_tensor = _tensor_reg->getAclTensor(alpha_index).get(); + auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index); + auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index); + auto alpha_tensor = _tensor_reg->getAclTensor(alpha_index); auto fn = acl_common::generateLayer<arm_compute::NEPReluLayer>( ifm_tensor->handle(), alpha_tensor->handle(), ofm_tensor->handle()); @@ -828,8 +838,8 @@ void KernelGenerator::visit(const ir::operation::Reduce &node) const auto input_index{node.getInputs().at(ir::operation::Reduce::Input::INPUT)}; const auto axes_index{node.getInputs().at(ir::operation::Reduce::Input::AXES)}; - auto output_tensor = _tensor_reg->getAclTensor(output_index).get(); - auto 
input_tensor = _tensor_reg->getAclTensor(input_index).get(); + auto output_tensor = _tensor_reg->getAclTensor(output_index); + auto input_tensor = _tensor_reg->getAclTensor(input_index); // Convert to ACL axes taking into account negative values and possible duplicates. const auto &axes = _ctx.at(axes_index); @@ -866,8 +876,8 @@ void KernelGenerator::visit(const ir::operation::Reshape &node) const auto output_index{node.getOutputs().at(0)}; const auto input_index{node.getInputs().at(ir::operation::Reshape::Input::INPUT)}; - auto output_tensor = _tensor_reg->getAclTensor(output_index).get(); - auto input_tensor = _tensor_reg->getAclTensor(input_index).get(); + auto output_tensor = _tensor_reg->getAclTensor(output_index); + auto input_tensor = _tensor_reg->getAclTensor(input_index); // NOTE This operation must not be changed the layout from frontend to backend // So, PermutationOperationPass makes layouts of frontend and backend the same. @@ -887,11 +897,10 @@ void KernelGenerator::visit(const ir::operation::Reshape &node) void KernelGenerator::visit(const ir::operation::ResizeBilinear &node) { const auto ofm_index{node.getOutputs().at(0)}; - const auto ifm_index{node.getInputs().at(ir::operation::ResizeBilinear::Input::INPUT)}; - auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index).get(); - auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index).get(); + auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index); + auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index); auto fn = acl_common::generateLayer<arm_compute::NEScale>( ifm_tensor->handle(), ofm_tensor->handle(), ::arm_compute::InterpolationPolicy::BILINEAR, @@ -916,14 +925,14 @@ void KernelGenerator::visit(const ir::operation::RNN &node) const auto activation = node.param().activation; - auto output_tensor = _tensor_reg->getAclTensor(output_index).get(); - auto hidden_state_out_tensor = _tensor_reg->getAclTensor(hidden_state_out_index).get(); + auto output_tensor = _tensor_reg->getAclTensor(output_index); + auto hidden_state_out_tensor = _tensor_reg->getAclTensor(hidden_state_out_index); - auto input_tensor = _tensor_reg->getAclTensor(input_index).get(); - auto weights_tensor = _tensor_reg->getAclTensor(weights_index).get(); - auto recurrent_weights_tensor = _tensor_reg->getAclTensor(recurrent_weights_index).get(); - auto bias_tensor = _tensor_reg->getAclTensor(bias_index).get(); - auto hidden_state_in_tensor = _tensor_reg->getAclTensor(hidden_state_in_index).get(); + auto input_tensor = _tensor_reg->getAclTensor(input_index); + auto weights_tensor = _tensor_reg->getAclTensor(weights_index); + auto recurrent_weights_tensor = _tensor_reg->getAclTensor(recurrent_weights_index); + auto bias_tensor = _tensor_reg->getAclTensor(bias_index); + auto hidden_state_in_tensor = _tensor_reg->getAclTensor(hidden_state_in_index); auto act_info = ::onert::backend::acl_common::asActivationLayerInfo(activation); auto copy_layer = acl_common::generateLayer<arm_compute::NECopy>( @@ -949,8 +958,8 @@ void KernelGenerator::visit(const ir::operation::Squeeze &node) (void)dims; (void)ndim; - auto output_tensor = _tensor_reg->getAclTensor(output_index).get(); - auto input_tensor = _tensor_reg->getAclTensor(input_index).get(); + auto output_tensor = _tensor_reg->getAclTensor(output_index); + auto input_tensor = _tensor_reg->getAclTensor(input_index); auto fn = acl_common::generateLayer<arm_compute::NEReshapeLayer>(input_tensor->handle(), output_tensor->handle()); _return_fn = asAclFunction(std::move(fn)); @@ -962,25 +971,26 @@ void KernelGenerator::visit(const 
ir::operation::Softmax &node) const auto input_index{node.getInputs().at(ir::operation::Softmax::Input::INPUT)}; const auto beta = node.param().beta; - auto output_tensor = _tensor_reg->getAclTensor(output_index).get(); - auto input_tensor = _tensor_reg->getAclTensor(input_index).get(); - const auto frontend_layout = _current_op_seq_layout; - const auto backend_layout = input_tensor->layout(); + auto output_tensor = _tensor_reg->getAclTensor(output_index); + auto input_tensor = _tensor_reg->getAclTensor(input_index); // Disable applied dim_correction - const size_t input_rank = _ctx.at(input_index).shape().rank(); - if (input_rank != input_tensor->info()->num_dimensions()) + if (input_tensor->num_dimensions() != input_tensor->info()->num_dimensions()) { // This means that high dimension's value is 1 and input tensor is applied dim_correction - const auto input = _ctx.at(input_index); - input_tensor->info()->set_tensor_shape( - acl_common::asTensorShape(input.shape(), frontend_layout, backend_layout, false)); + acl_common::disableDimCorrection(input_tensor); } auto fn = acl_common::generateLayer<arm_compute::NESoftmaxLayer>( _tensor_builder->acl_tensor_manager()->internal_buffer_manager(), input_tensor->handle(), output_tensor->handle(), beta); + // Revert disabling applied dim_correction + if (input_tensor->dimension(0) == 1) + { + acl_common::disableDimCorrection(input_tensor); + } + _return_fn = asAclFunction(std::move(fn)); } @@ -992,10 +1002,10 @@ void KernelGenerator::visit(const ir::operation::SpaceToBatchND &node) node.getInputs().at(ir::operation::SpaceToBatchND::Input::BLOCK_SIZE)}; const auto paddings_index{node.getInputs().at(ir::operation::SpaceToBatchND::Input::PADDINGS)}; - auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index).get(); - auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index).get(); - auto block_size_tensor = _tensor_reg->getAclTensor(block_size_index).get(); - auto paddings_tensor = _tensor_reg->getAclTensor(paddings_index).get(); + auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index); + auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index); + auto block_size_tensor = _tensor_reg->getAclTensor(block_size_index); + auto paddings_tensor = _tensor_reg->getAclTensor(paddings_index); assert(_ctx.at(block_size_index).data()); assert(_ctx.at(paddings_index).data()); @@ -1014,8 +1024,8 @@ void KernelGenerator::visit(const ir::operation::SpaceToDepth &node) auto block_size = node.param().block_size; - auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index).get(); - auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index).get(); + auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index); + auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index); auto fn = acl_common::generateLayer<arm_compute::NESpaceToDepthLayer>( ifm_tensor->handle(), ofm_tensor->handle(), block_size); @@ -1027,22 +1037,27 @@ void KernelGenerator::visit(const ir::operation::Split &node) { // TODO Support this op by SubTensor const auto ifm_index{node.getInputs().at(ir::operation::Split::Input::INPUT)}; + const auto axis_index{node.getInputs().at(ir::operation::Split::Input::AXIS)}; assert(node.param().num_splits == static_cast<int>(node.getOutputs().size())); + if (!_ctx.at(axis_index).isConstant()) + { + throw std::runtime_error("Non-constant axis_index NYI for acl_neon backend"); + } const auto ifm_rank = _ctx.at(ifm_index).shape().rank(); std::vector<ir::OperandIndex> output_indexes; for (const auto &output : node.getOutputs()) output_indexes.emplace_back(output); - auto ifm_tensor = 
_tensor_reg->getAclTensor(ifm_index).get(); + auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index); std::vector<arm_compute::ITensor *> output_tensors; for (const auto &ofm_ind : output_indexes) - output_tensors.emplace_back(_tensor_reg->getAclTensor(ofm_ind).get()->handle()); + output_tensors.emplace_back(_tensor_reg->getAclTensor(ofm_ind)->handle()); const auto frontend_layout = _current_op_seq_layout; const auto backend_layout = ifm_tensor->layout(); - auto axis = node.param().axis; + auto axis = _ctx.at(axis_index).asScalar<int32_t>(); if (axis < 0) axis += ifm_rank; axis = acl_common::ToARMComputeAxis(ifm_rank, axis, frontend_layout, backend_layout).value(); @@ -1059,9 +1074,9 @@ void KernelGenerator::visit(const ir::operation::SquaredDifference &node) const auto lhs_index{node.getInputs().at(ir::operation::SquaredDifference::Input::LHS)}; const auto rhs_index{node.getInputs().at(ir::operation::SquaredDifference::Input::RHS)}; - auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index).get(); - auto lhs_tensor = _tensor_reg->getAclTensor(lhs_index).get(); - auto rhs_tensor = _tensor_reg->getAclTensor(rhs_index).get(); + auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index); + auto lhs_tensor = _tensor_reg->getAclTensor(lhs_index); + auto rhs_tensor = _tensor_reg->getAclTensor(rhs_index); auto fn = acl_common::generateLayer<arm_compute::NEElementwiseSquaredDiff>( lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle()); @@ -1076,8 +1091,8 @@ void KernelGenerator::visit(const ir::operation::Slice &node) const auto begins_index{node.getInputs().at(ir::operation::Slice::Input::BEGINS)}; const auto sizes_index{node.getInputs().at(ir::operation::Slice::Input::SIZES)}; - auto outputData_tensor = _tensor_reg->getAclTensor(output_index).get(); - auto inputData_tensor = _tensor_reg->getAclTensor(input_index).get(); + auto outputData_tensor = _tensor_reg->getAclTensor(output_index); + auto inputData_tensor = _tensor_reg->getAclTensor(input_index); const auto frontend_layout = _current_op_seq_layout; const auto backend_layout = inputData_tensor->layout(); @@ -1141,8 +1156,8 @@ void KernelGenerator::visit(const ir::operation::StridedSlice &node) const auto ends_index{node.getInputs().at(ir::operation::StridedSlice::Input::ENDS)}; const auto strides_index{node.getInputs().at(ir::operation::StridedSlice::Input::STRIDES)}; - auto outputData_tensor = _tensor_reg->getAclTensor(output_index).get(); - auto inputData_tensor = _tensor_reg->getAclTensor(input_index).get(); + auto outputData_tensor = _tensor_reg->getAclTensor(output_index); + auto inputData_tensor = _tensor_reg->getAclTensor(input_index); const auto frontend_layout = _current_op_seq_layout; const auto backend_layout = inputData_tensor->layout(); @@ -1211,10 +1226,23 @@ void KernelGenerator::visit(const ir::operation::StridedSlice &node) strides_set.set(i, strides[i]); } + // Disable applied dim_correction + if (inputData_tensor->num_dimensions() != inputData_tensor->info()->num_dimensions()) + { + // This means that high dimension's value is 1 and input tensor is applied dim_correction + acl_common::disableDimCorrection(inputData_tensor); + } + auto fn = acl_common::generateLayer<arm_compute::NEStridedSlice>( inputData_tensor->handle(), outputData_tensor->handle(), starts_set, ends_set, strides_set, begin_mask, end_mask, shrink_axis_mask); + // Revert disabling applied dim_correction + if (inputData_tensor->dimension(0) == 1) + { + acl_common::enableDimCorrection(inputData_tensor); + } + _return_fn = asAclFunction(std::move(fn)); } 
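// ----------------------------------------------------------------------------
// Note on the dim_correction changes in the hunks above: the Gather, Pack,
// Softmax, StridedSlice and Unpack visitors drop the old pattern of manually
// calling info()->set_tensor_shape(asTensorShape(..., /*dim_correction*/ false))
// and instead use acl_common::disableDimCorrection()/enableDimCorrection(),
// now also reverting the correction once the ACL layer has been generated.
// The real helpers live in the acl_common backend and are not shown in this
// diff; the sketch below only illustrates the assumed idea via
// arm_compute::TensorShape::set()'s apply_dim_correction flag. The function
// name, template parameter and usage are illustrative assumptions, not the
// actual onert API.

#include <arm_compute/core/TensorShape.h>

// T_AclTensor stands for any ACL tensor wrapper exposing num_dimensions() and
// info(), i.e. a hypothetical stand-in for the backend's tensor type.
template <typename T_AclTensor> void setDimCorrection(T_AclTensor *tensor, bool apply)
{
  const size_t rank = tensor->num_dimensions();
  if (rank == 0)
    return;
  arm_compute::TensorShape shape = tensor->info()->tensor_shape(); // copy
  // Re-setting the highest dimension with apply_dim_correction=false keeps
  // trailing size-1 dimensions visible to kernels that address memory through
  // TensorInfo::offset_element_in_bytes(); passing true restores the default
  // collapsing behaviour.
  shape.set(rank - 1, shape[rank - 1], apply);
  tensor->info()->set_tensor_shape(shape);
}

// Assumed usage, mirroring the generated kernels above:
//   if (t->num_dimensions() != t->info()->num_dimensions()) setDimCorrection(t, false);
//   ... generate the ACL layer ...
//   if (t->dimension(0) == 1) setDimCorrection(t, true);
// ----------------------------------------------------------------------------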
@@ -1244,9 +1272,9 @@ void KernelGenerator::visit(const ir::operation::TransposeConv &node) invalid_vertical = ofm_shape.H - (1 + (ifm_shape.H - 1) * stride.vertical) - (ker_shape.H - 1); } - auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index).get(); - auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index).get(); - auto ker_tensor = _tensor_reg->getAclTensor(ker_index).get(); + auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index); + auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index); + auto ker_tensor = _tensor_reg->getAclTensor(ker_index); const auto tconv_info = acl_common::asPadStrideInfo(padding, stride); @@ -1261,26 +1289,43 @@ void KernelGenerator::visit(const ir::operation::Transpose &node) { const auto ofm_idx{node.getOutputs().at(0)}; const auto ifm_idx{node.getInputs().at(ir::operation::Transpose::Input::INPUT)}; - const auto &perm{node.param().perm}; + const auto perm_idx{node.getInputs().at(ir::operation::Transpose::Input::PERMUTATION)}; - auto ofm_tensor = _tensor_reg->getAclTensor(ofm_idx).get(); - const auto ifm_tensor = _tensor_reg->getAclTensor(ifm_idx).get(); + auto ofm_tensor = _tensor_reg->getAclTensor(ofm_idx); + const auto ifm_tensor = _tensor_reg->getAclTensor(ifm_idx); const auto frontend_layout = _current_op_seq_layout; const auto backend_layout = ifm_tensor->layout(); - const auto rank = _ctx.at(ifm_idx).shape().rank(); - std::vector<std::int32_t> pv(perm.cbegin(), perm.cend()); - auto backend_pv = ::onert::backend::acl_common::getARMComputePermutationVector( - rank, pv, frontend_layout, backend_layout); - std::unique_ptr<::arm_compute::IFunction> fn; - if (ifm_tensor->num_dimensions() <= 2 && ofm_tensor->num_dimensions() <= 2) + const auto &perms = _ctx.at(perm_idx); + std::vector<int32_t> pv; + if (perms.shape() == ir::Shape{0}) + { + pv.resize(rank); + std::iota(pv.begin(), pv.end(), 0); + std::reverse(pv.begin(), pv.end()); + } + else + { + pv = _ctx.at(perm_idx).asVector<int32_t>(); + } + + std::unique_ptr<arm_compute::IFunction> fn; + if (rank == 1) { + fn = acl_common::generateLayer<arm_compute::NECopy>(ifm_tensor->handle(), ofm_tensor->handle()); + } + else if (rank == 2) + { + assert(pv.size() == 2 && pv.at(0) == 1 && pv.at(1) == 0); fn = acl_common::generateLayer<arm_compute::NETranspose>(ifm_tensor->handle(), ofm_tensor->handle()); } else { + auto backend_pv = + acl_common::getARMComputePermutationVector(rank, pv, frontend_layout, backend_layout); + fn = acl_common::generateLayer<arm_compute::NEPermute>(ifm_tensor->handle(), ofm_tensor->handle(), backend_pv); } @@ -1298,34 +1343,32 @@ void KernelGenerator::visit(const ir::operation::Unpack &node) for (const auto &output_index : node.getOutputs()) output_indexes.emplace_back(output_index); - auto input = _tensor_reg->getAclTensor(input_index).get()->handle(); + auto input_tensor = _tensor_reg->getAclTensor(input_index); std::vector<arm_compute::ITensor *> outputs; for (const auto &output_index : output_indexes) outputs.emplace_back(_tensor_reg->getAclTensor(output_index)->handle()); const auto frontend_layout = _current_op_seq_layout; - const auto backend_layout = _tensor_reg->getAclTensor(input_index).get()->layout(); + const auto backend_layout = _tensor_reg->getAclTensor(input_index)->layout(); if (axis < 0) axis += input_rank; axis = acl_common::ToARMComputeAxis(input_rank, axis, frontend_layout, backend_layout).value(); // Disable applied dim_correction - std::vector<arm_compute::TensorShape> orig_outputs_acl_tensor_shapes; - for (const auto &output_index : output_indexes) + if 
(input_tensor->num_dimensions() != input_tensor->info()->num_dimensions()) { - size_t output_rank = _ctx.at(output_index).shape().rank(); - const auto &output_tensor = _tensor_reg->getAclTensor(output_index); - orig_outputs_acl_tensor_shapes.emplace_back(output_tensor->info()->tensor_shape()); - assert(output_rank == output_tensor->num_dimensions()); - if (output_rank != output_tensor->info()->num_dimensions()) - { - // This means that high dimension's value is 1 and ifm tensor is applied dim_correction - output_tensor->info()->set_tensor_shape(acl_common::asTensorShape( - _ctx.at(output_index).shape(), _current_op_seq_layout, backend_layout, false)); - } + // This means that high dimension's value is 1 and input tensor is applied dim_correction + acl_common::disableDimCorrection(input_tensor); } - auto fn = acl_common::generateLayer<arm_compute::NEUnstack>(input, outputs, axis); + auto fn = + acl_common::generateLayer<arm_compute::NEUnstack>(input_tensor->handle(), outputs, axis); + + // Revert disabling applied dim_correction + if (input_tensor->dimension(0) == 1) + { + acl_common::enableDimCorrection(input_tensor); + } _return_fn = asAclFunction(std::move(fn)); } @@ -1335,8 +1378,8 @@ void KernelGenerator::visit(const ir::operation::ExpandDims &node) const auto output_index{node.getOutputs().at(0)}; const auto input_index{node.getInputs().at(ir::operation::ExpandDims::Input::INPUT)}; - auto output_tensor = _tensor_reg->getAclTensor(output_index).get(); - auto input_tensor = _tensor_reg->getAclTensor(input_index).get(); + auto output_tensor = _tensor_reg->getAclTensor(output_index); + auto input_tensor = _tensor_reg->getAclTensor(input_index); auto fn = acl_common::generateLayer<arm_compute::NEReshapeLayer>(input_tensor->handle(), output_tensor->handle()); @@ -1352,9 +1395,9 @@ void KernelGenerator::visit(const ir::operation::Comparison &node) const auto comparison_type = node.param().comparison_type; - auto output_tensor = _tensor_reg->getAclTensor(output_index).get(); - auto input0_tensor = _tensor_reg->getAclTensor(input0_index).get(); - auto input1_tensor = _tensor_reg->getAclTensor(input1_index).get(); + auto output_tensor = _tensor_reg->getAclTensor(output_index); + auto input0_tensor = _tensor_reg->getAclTensor(input0_index); + auto input1_tensor = _tensor_reg->getAclTensor(input1_index); auto fn = acl_common::generateLayer<arm_compute::NEElementwiseComparison>( input0_tensor->handle(), input1_tensor->handle(), output_tensor->handle(), @@ -1370,15 +1413,20 @@ void KernelGenerator::visit(const ir::operation::OneHot &node) const auto depth_idx{node.getInputs().at(ir::operation::OneHot::Input::DEPTH)}; const auto onvalue_idx{node.getInputs().at(ir::operation::OneHot::Input::ON_VALUE)}; const auto offvalue_idx{node.getInputs().at(ir::operation::OneHot::Input::OFF_VALUE)}; - const auto axis = node.param().axis; - auto output_tensor = _tensor_reg->getAclTensor(out_idx).get(); - auto indices_tensor = _tensor_reg->getAclTensor(indices_idx).get(); - auto depth_tensor = _tensor_reg->getAclTensor(depth_idx).get(); - auto onvalue_tensor = _tensor_reg->getAclTensor(onvalue_idx).get(); - auto offvalue_tensor = _tensor_reg->getAclTensor(offvalue_idx).get(); + auto output_tensor = _tensor_reg->getAclTensor(out_idx); + auto indices_tensor = _tensor_reg->getAclTensor(indices_idx); + auto depth_tensor = _tensor_reg->getAclTensor(depth_idx); + auto onvalue_tensor = _tensor_reg->getAclTensor(onvalue_idx); + auto offvalue_tensor = _tensor_reg->getAclTensor(offvalue_idx); + + const size_t output_rank = 
_ctx.at(out_idx).shape().rank(); + const auto frontend_layout = _current_op_seq_layout; + const auto backend_layout = output_tensor->layout(); + int32_t axis = node.param().axis == -1 ? output_rank - 1 : node.param().axis; + axis = acl_common::ToARMComputeAxis(output_rank, axis, frontend_layout, backend_layout).value(); - auto fn = acl_common::generateLayer<arm_compute::CPPOneHotEx>( + auto fn = acl_common::generateLayer<arm_compute::NEOneHot>( indices_tensor->handle(), depth_tensor->handle(), onvalue_tensor->handle(), offvalue_tensor->handle(), output_tensor->handle(), axis); _return_fn = asAclFunction(std::move(fn)); diff --git a/runtime/onert/backend/cpu/ExternalContext.h b/runtime/onert/backend/cpu/ExternalContext.h index 6627412d2..32e249f5a 100644 --- a/runtime/onert/backend/cpu/ExternalContext.h +++ b/runtime/onert/backend/cpu/ExternalContext.h @@ -39,16 +39,13 @@ public: ExternalContext() : _ruy_context(new ruy::Context) { setMaxNumThreads(onert::util::getConfigInt(onert::util::config::RUY_THREADS)); -#ifdef USE_RUY_GEMV - _ruy_context->cache_policy = ruy::kCacheLHSOnNarrowMul; -#endif } void setMaxNumThreads(int max_num_threads) { const int target_num_threads = max_num_threads > -1 ? max_num_threads : kDefaultNumThreadpoolThreads; - _ruy_context->max_num_threads = target_num_threads; + _ruy_context->set_max_num_threads(target_num_threads); } ruy::Context *ruy_context() const { return _ruy_context.get(); } diff --git a/runtime/onert/backend/cpu/KernelGenerator.cc b/runtime/onert/backend/cpu/KernelGenerator.cc index 74b6f0c6b..5f330ff50 100644 --- a/runtime/onert/backend/cpu/KernelGenerator.cc +++ b/runtime/onert/backend/cpu/KernelGenerator.cc @@ -232,12 +232,10 @@ void KernelGenerator::visit(const ir::OpSequence &op_seq) dyn_ctx->op_seq = &op_seq; dyn_ctx->operations = &_operations_ctx; dyn_ctx->dynamic_shape_inferer = std::move(dyn_shape_inferer); - dyn_ctx->tensor_registry = _tensor_reg; dyn_ctx->dynamic_tensor_manager = _tensor_builder->dynamicTensorManager(); _return_fn_seq->dynamic_tensor_ctx(dyn_ctx); } - _return_fn_seq->enableDynamicShapeInferer(true); _current_op_seq_layout = op_seq.getLayout(); for (const auto &operation_idx : op_seq.operations()) @@ -272,10 +270,10 @@ void KernelGenerator::visit(const ir::operation::Conv2D &node) const auto ker_index{node.getInputs().at(Conv2D::Input::KERNEL)}; const auto bias_index{node.getInputs().at(Conv2D::Input::BIAS)}; - auto ofm_tensor = _tensor_reg->getPortableTensor(ofm_index).get(); - auto ifm_tensor = _tensor_reg->getPortableTensor(ifm_index).get(); - auto ker_tensor = _tensor_reg->getPortableTensor(ker_index).get(); - auto bias_tensor = _tensor_reg->getPortableTensor(bias_index).get(); + auto ofm_tensor = _tensor_reg->getPortableTensor(ofm_index); + auto ifm_tensor = _tensor_reg->getPortableTensor(ifm_index); + auto ker_tensor = _tensor_reg->getPortableTensor(ker_index); + auto bias_tensor = _tensor_reg->getPortableTensor(bias_index); const auto stride = node.param().stride; const auto activation = node.param().activation; @@ -332,10 +330,10 @@ void KernelGenerator::visit(const ir::operation::DepthwiseConv2D &node) const auto multiplier = node.param().multiplier; const auto activation = node.param().activation; - auto ofm_tensor = _tensor_reg->getPortableTensor(ofm_index).get(); - auto ifm_tensor = _tensor_reg->getPortableTensor(ifm_index).get(); - auto ker_tensor = _tensor_reg->getPortableTensor(ker_index).get(); - auto bias_tensor = _tensor_reg->getPortableTensor(bias_index).get(); + auto ofm_tensor = 
_tensor_reg->getPortableTensor(ofm_index); + auto ifm_tensor = _tensor_reg->getPortableTensor(ifm_index); + auto ker_tensor = _tensor_reg->getPortableTensor(ker_index); + auto bias_tensor = _tensor_reg->getPortableTensor(bias_index); auto fn = std::make_unique<ops::DepthwiseConvolutionLayer>(); @@ -353,11 +351,11 @@ void KernelGenerator::visit(const ir::operation::Concat &node) const auto rank = _ctx.at(ofm_index).shape().rank(); const auto axis = ops::getAxis(rank, node.param().axis, _current_op_seq_layout); - auto output_tensor = _tensor_reg->getPortableTensor(ofm_index).get(); + auto output_tensor = _tensor_reg->getPortableTensor(ofm_index); std::vector<const IPortableTensor *> input_tensors; for (auto &ifm_idx : node.getInputs()) - input_tensors.emplace_back(_tensor_reg->getPortableTensor(ifm_idx).get()); + input_tensors.emplace_back(_tensor_reg->getPortableTensor(ifm_idx)); auto fn = std::make_unique<ops::ConcatLayer>(); @@ -372,9 +370,9 @@ void KernelGenerator::visit(const ir::operation::BatchToSpaceND &node) const auto input_index{node.getInputs().at(ir::operation::BatchToSpaceND::INPUT)}; const auto block_size_index{node.getInputs().at(ir::operation::BatchToSpaceND::BLOCK_SIZE)}; - auto output_alloc = _tensor_reg->getPortableTensor(output_index).get(); - auto input_alloc = _tensor_reg->getPortableTensor(input_index).get(); - auto block_size_alloc = _tensor_reg->getPortableTensor(block_size_index).get(); + auto output_alloc = _tensor_reg->getPortableTensor(output_index); + auto input_alloc = _tensor_reg->getPortableTensor(input_index); + auto block_size_alloc = _tensor_reg->getPortableTensor(block_size_index); auto fn = std::make_unique<ops::BatchToSpaceNDLayer>(); @@ -384,7 +382,7 @@ void KernelGenerator::visit(const ir::operation::BatchToSpaceND &node) if (node.getInputs().size() != NNApiInputs) { const auto crops_data_index{node.getInputs().at(ir::operation::BatchToSpaceND::CROPS_DATA)}; - crops_alloc = _tensor_reg->getPortableTensor(crops_data_index).get(); + crops_alloc = _tensor_reg->getPortableTensor(crops_data_index); } fn->configure(input_alloc, output_alloc, block_size_alloc, crops_alloc); @@ -398,9 +396,9 @@ void KernelGenerator::visit(const ir::operation::Fill &node) const auto input_index{node.getInputs().at(ir::operation::Fill::Input::INPUT)}; const auto value_index{node.getInputs().at(ir::operation::Fill::Input::VALUE)}; - auto output_tensor = _tensor_reg->getPortableTensor(output_index).get(); - auto input_tensor = _tensor_reg->getPortableTensor(input_index).get(); - auto value_tensor = _tensor_reg->getPortableTensor(value_index).get(); + auto output_tensor = _tensor_reg->getPortableTensor(output_index); + auto input_tensor = _tensor_reg->getPortableTensor(input_index); + auto value_tensor = _tensor_reg->getPortableTensor(value_index); auto fn = std::make_unique<ops::FillLayer>(); @@ -419,11 +417,10 @@ void KernelGenerator::visit(const ir::operation::FullyConnected &node) const auto bias_index{node.getInputs().at(FullyConnected::Input::BIAS)}; const auto activation = node.param().activation; - auto output_tensor = _tensor_reg->getPortableTensor(output_index).get(); - auto input_tensor = _tensor_reg->getPortableTensor(input_index).get(); - auto weight_tensor = _tensor_reg->getPortableTensor(weight_index).get(); - auto bias_tensor = - bias_index.undefined() ? 
nullptr : _tensor_reg->getPortableTensor(bias_index).get(); + auto output_tensor = _tensor_reg->getPortableTensor(output_index); + auto input_tensor = _tensor_reg->getPortableTensor(input_index); + auto weight_tensor = _tensor_reg->getPortableTensor(weight_index); + auto bias_tensor = bias_index.undefined() ? nullptr : _tensor_reg->getPortableTensor(bias_index); auto fn = std::make_unique<ops::FullyConnectedLayer>(); @@ -438,8 +435,8 @@ void KernelGenerator::visit(const ir::operation::Reshape &node) const auto output_index{node.getOutputs().at(0)}; const auto input_index{node.getInputs().at(ir::operation::Reshape::Input::INPUT)}; - auto output_tensor = _tensor_reg->getPortableTensor(output_index).get(); - auto input_tensor = _tensor_reg->getPortableTensor(input_index).get(); + auto output_tensor = _tensor_reg->getPortableTensor(output_index); + auto input_tensor = _tensor_reg->getPortableTensor(input_index); // optional 2nd input IPortableTensor *shape_tensor = nullptr; @@ -447,7 +444,7 @@ void KernelGenerator::visit(const ir::operation::Reshape &node) if (node.getInputs().size() == 2) { const auto shape_index{node.getInputs().at(ir::operation::Reshape::Input::SHAPE)}; - shape_tensor = _tensor_reg->getPortableTensor(shape_index).get(); + shape_tensor = _tensor_reg->getPortableTensor(shape_index); } auto fn = std::make_unique<ops::ReshapeLayer>(); @@ -461,8 +458,8 @@ void KernelGenerator::visit(const ir::operation::Squeeze &node) const auto output_index{node.getOutputs().at(0)}; const auto input_index{node.getInputs().at(ir::operation::Squeeze::Input::INPUT)}; - auto output_tensor = _tensor_reg->getPortableTensor(output_index).get(); - auto input_tensor = _tensor_reg->getPortableTensor(input_index).get(); + auto output_tensor = _tensor_reg->getPortableTensor(output_index); + auto input_tensor = _tensor_reg->getPortableTensor(input_index); // Squeeze can share same kernel with reshape auto fn = std::make_unique<ops::ReshapeLayer>(); @@ -479,8 +476,8 @@ void KernelGenerator::visit(const ir::operation::Softmax &node) const auto beta = node.param().beta; - auto output_tensor = _tensor_reg->getPortableTensor(output_index).get(); - auto input_tensor = _tensor_reg->getPortableTensor(input_index).get(); + auto output_tensor = _tensor_reg->getPortableTensor(output_index); + auto input_tensor = _tensor_reg->getPortableTensor(input_index); auto fn = std::make_unique<ops::SoftMaxLayer>(); @@ -497,9 +494,9 @@ void KernelGenerator::visit(const ir::operation::BinaryArithmetic &node) const auto activation = node.param().activation; - auto ofm_tensor = _tensor_reg->getPortableTensor(ofm_index).get(); - auto lhs_tensor = _tensor_reg->getPortableTensor(lhs_index).get(); - auto rhs_tensor = _tensor_reg->getPortableTensor(rhs_index).get(); + auto ofm_tensor = _tensor_reg->getPortableTensor(ofm_index); + auto lhs_tensor = _tensor_reg->getPortableTensor(lhs_index); + auto rhs_tensor = _tensor_reg->getPortableTensor(rhs_index); auto fn = std::make_unique<ops::BinaryArithmeticLayer>(); @@ -515,9 +512,9 @@ void KernelGenerator::visit(const ir::operation::Comparison &node) const auto lhs_index{node.getInputs().at(ir::operation::Comparison::Input::INPUT0)}; const auto rhs_index{node.getInputs().at(ir::operation::Comparison::Input::INPUT1)}; - auto ofm_tensor = _tensor_reg->getPortableTensor(ofm_index).get(); - auto lhs_tensor = _tensor_reg->getPortableTensor(lhs_index).get(); - auto rhs_tensor = _tensor_reg->getPortableTensor(rhs_index).get(); + auto ofm_tensor = _tensor_reg->getPortableTensor(ofm_index); + auto 
lhs_tensor = _tensor_reg->getPortableTensor(lhs_index); + auto rhs_tensor = _tensor_reg->getPortableTensor(rhs_index); auto comparison_type = node.param().comparison_type; @@ -534,9 +531,9 @@ void KernelGenerator::visit(const ir::operation::Gather &node) const auto input_index{node.getInputs().at(ir::operation::Gather::Input::INPUT)}; const auto indices_index{node.getInputs().at(ir::operation::Gather::Input::INDICES)}; - auto output_tensor = _tensor_reg->getPortableTensor(output_index).get(); - auto input_tensor = _tensor_reg->getPortableTensor(input_index).get(); - auto indices_tensor = _tensor_reg->getPortableTensor(indices_index).get(); + auto output_tensor = _tensor_reg->getPortableTensor(output_index); + auto input_tensor = _tensor_reg->getPortableTensor(input_index); + auto indices_tensor = _tensor_reg->getPortableTensor(indices_index); const auto backend_layout = output_tensor->layout(); UNUSED_RELEASE(backend_layout); @@ -575,11 +572,11 @@ void KernelGenerator::visit(const ir::operation::OneHot &node) const auto axis = node.param().axis; - auto output_tensor = _tensor_reg->getPortableTensor(output_index).get(); - auto indices_tensor = _tensor_reg->getPortableTensor(indices_index).get(); - auto depth_tensor = _tensor_reg->getPortableTensor(depth_index).get(); - auto onvalue_tensor = _tensor_reg->getPortableTensor(onvalue_index).get(); - auto offvalue_tensor = _tensor_reg->getPortableTensor(offvalue_index).get(); + auto output_tensor = _tensor_reg->getPortableTensor(output_index); + auto indices_tensor = _tensor_reg->getPortableTensor(indices_index); + auto depth_tensor = _tensor_reg->getPortableTensor(depth_index); + auto onvalue_tensor = _tensor_reg->getPortableTensor(onvalue_index); + auto offvalue_tensor = _tensor_reg->getPortableTensor(offvalue_index); assert(indices_tensor->data_type() == OperandType::INT32); assert(axis <= static_cast<int>(indices_tensor->num_dimensions())); @@ -595,10 +592,10 @@ void KernelGenerator::visit(const ir::operation::Einsum &node) { const auto ofm_index{node.getOutputs().at(0)}; - auto output_tensor = _tensor_reg->getPortableTensor(ofm_index).get(); + auto output_tensor = _tensor_reg->getPortableTensor(ofm_index); std::vector<const IPortableTensor *> input_tensors; for (auto &ifm_idx : node.getInputs()) - input_tensors.emplace_back(_tensor_reg->getPortableTensor(ifm_idx).get()); + input_tensors.emplace_back(_tensor_reg->getPortableTensor(ifm_idx)); const auto equation = node.param().equation; @@ -613,7 +610,7 @@ void KernelGenerator::visit(const ir::operation::Custom &node) { auto fill_op_info = [&](const ir::OperandIndexSequence &opSeq, std::vector<custom::TypeInfo> &types, - std::vector<std::shared_ptr<IPortableTensor>> &tensors) { + std::vector<IPortableTensor *> &tensors) { for (auto &idx : opSeq) { const auto &operand = _ctx.at(idx); @@ -642,8 +639,8 @@ void KernelGenerator::visit(const ir::operation::ElementwiseActivation &node) const auto output_index{node.getOutputs().at(0)}; const auto input_index{node.getInputs().at(ir::operation::ElementwiseActivation::Input::INPUT)}; - auto output_tensor = _tensor_reg->getPortableTensor(output_index).get(); - auto input_tensor = _tensor_reg->getPortableTensor(input_index).get(); + auto output_tensor = _tensor_reg->getPortableTensor(output_index); + auto input_tensor = _tensor_reg->getPortableTensor(input_index); auto fn = std::make_unique<ops::ElementwiseActivationLayer>(); @@ -659,9 +656,9 @@ void KernelGenerator::visit(const ir::operation::ElementwiseBinary &node) const auto 
lhs_index{node.getInputs().at(ir::operation::ElementwiseBinary::Input::LHS)}; const auto rhs_index{node.getInputs().at(ir::operation::ElementwiseBinary::Input::RHS)}; - auto output_tensor = _tensor_reg->getPortableTensor(output_index).get(); - auto lhs_tensor = _tensor_reg->getPortableTensor(lhs_index).get(); - auto rhs_tensor = _tensor_reg->getPortableTensor(rhs_index).get(); + auto output_tensor = _tensor_reg->getPortableTensor(output_index); + auto lhs_tensor = _tensor_reg->getPortableTensor(lhs_index); + auto rhs_tensor = _tensor_reg->getPortableTensor(rhs_index); auto fn = std::make_unique<ops::ElementwiseBinaryLayer>(); @@ -676,8 +673,8 @@ void KernelGenerator::visit(const ir::operation::ElementwiseUnary &node) const auto output_index{node.getOutputs().at(0)}; const auto input_index{node.getInputs().at(ir::operation::ElementwiseUnary::Input::INPUT)}; - auto output_tensor = _tensor_reg->getPortableTensor(output_index).get(); - auto input_tensor = _tensor_reg->getPortableTensor(input_index).get(); + auto output_tensor = _tensor_reg->getPortableTensor(output_index); + auto input_tensor = _tensor_reg->getPortableTensor(input_index); auto fn = std::make_unique<ops::ElementwiseUnaryLayer>(); @@ -692,9 +689,9 @@ void KernelGenerator::visit(const ir::operation::ExpandDims &node) const auto input_index{node.getInputs().at(ir::operation::ExpandDims::Input::INPUT)}; const auto axis_index{node.getInputs().at(ir::operation::ExpandDims::Input::AXIS)}; - auto output_tensor = _tensor_reg->getPortableTensor(output_index).get(); - auto input_tensor = _tensor_reg->getPortableTensor(input_index).get(); - auto axis_tensor = _tensor_reg->getPortableTensor(axis_index).get(); + auto output_tensor = _tensor_reg->getPortableTensor(output_index); + auto input_tensor = _tensor_reg->getPortableTensor(input_index); + auto axis_tensor = _tensor_reg->getPortableTensor(axis_index); auto fn = std::make_unique<ops::ExpandDimsLayer>(); @@ -712,11 +709,11 @@ void KernelGenerator::visit(const ir::operation::Pack &node) assert(-rank <= axis && axis < rank); - auto output_tensor = _tensor_reg->getPortableTensor(ofm_index).get(); + auto output_tensor = _tensor_reg->getPortableTensor(ofm_index); std::vector<const IPortableTensor *> input_tensors; for (auto &ifm_idx : node.getInputs()) - input_tensors.emplace_back(_tensor_reg->getPortableTensor(ifm_idx).get()); + input_tensors.emplace_back(_tensor_reg->getPortableTensor(ifm_idx)); auto fn = std::make_unique<ops::PackLayer>(); @@ -734,11 +731,11 @@ void KernelGenerator::visit(const ir::operation::Unpack &node) assert(rank == 0 || (-rank <= axis && axis < rank)); - auto input_tensor = _tensor_reg->getPortableTensor(input_index).get(); + auto input_tensor = _tensor_reg->getPortableTensor(input_index); std::vector<IPortableTensor *> output_tensors; for (auto &output_idx : node.getOutputs()) - output_tensors.emplace_back(_tensor_reg->getPortableTensor(output_idx).get()); + output_tensors.emplace_back(_tensor_reg->getPortableTensor(output_idx)); auto fn = std::make_unique<ops::UnpackLayer>(); @@ -756,8 +753,8 @@ void KernelGenerator::visit(const ir::operation::Pad &node) const auto output_index{node.getOutputs().at(0)}; assert(_ctx.at(pad_index).data()); - auto input = _tensor_reg->getPortableTensor(input_index).get(); - auto output = _tensor_reg->getPortableTensor(output_index).get(); + auto input = _tensor_reg->getPortableTensor(input_index); + auto output = _tensor_reg->getPortableTensor(output_index); auto pad_rank = _ctx.at(pad_index).shape().dim(0); auto pad_base = 
reinterpret_cast<const int32_t *>(_ctx.at(pad_index).data()->base()); @@ -780,13 +777,15 @@ void KernelGenerator::visit(const ir::operation::Transpose &node) { const auto output_index{node.getOutputs().at(0)}; const auto input_index{node.getInputs().at(ir::operation::Transpose::Input::INPUT)}; + const auto perm_index{node.getInputs().at(ir::operation::Transpose::Input::PERMUTATION)}; - auto output_tensor = _tensor_reg->getPortableTensor(output_index).get(); - auto input_tensor = _tensor_reg->getPortableTensor(input_index).get(); + auto output_tensor = _tensor_reg->getPortableTensor(output_index); + auto input_tensor = _tensor_reg->getPortableTensor(input_index); + auto perm_tensor = _tensor_reg->getPortableTensor(perm_index); auto fn = std::make_unique<ops::TransposeLayer>(); - fn->configure(input_tensor, output_tensor, node.param().perm); + fn->configure(input_tensor, perm_tensor, output_tensor); _return_fn = std::move(fn); } @@ -798,9 +797,9 @@ void KernelGenerator::visit(const ir::operation::Reduce &node) const auto axes_index{node.getInputs().at(ir::operation::Reduce::Input::AXES)}; const auto keep_dims = node.param().keep_dims; - auto output_tensor = _tensor_reg->getPortableTensor(output_index).get(); - auto input_tensor = _tensor_reg->getPortableTensor(input_index).get(); - auto axes_tensor = _tensor_reg->getPortableTensor(axes_index).get(); + auto output_tensor = _tensor_reg->getPortableTensor(output_index); + auto input_tensor = _tensor_reg->getPortableTensor(input_index); + auto axes_tensor = _tensor_reg->getPortableTensor(axes_index); if (node.param().reduce_type == ir::operation::Reduce::ReduceType::MEAN) { @@ -828,10 +827,10 @@ void KernelGenerator::visit(const ir::operation::Select &node) const auto true_index{node.getInputs().at(ir::operation::Select::Input::INPUT_TRUE)}; const auto false_index{node.getInputs().at(ir::operation::Select::Input::INPUT_FALSE)}; - auto output_tensor = _tensor_reg->getPortableTensor(output_index).get(); - auto condition_tensor = _tensor_reg->getPortableTensor(condition_index).get(); - auto true_tensor = _tensor_reg->getPortableTensor(true_index).get(); - auto false_tensor = _tensor_reg->getPortableTensor(false_index).get(); + auto output_tensor = _tensor_reg->getPortableTensor(output_index); + auto condition_tensor = _tensor_reg->getPortableTensor(condition_index); + auto true_tensor = _tensor_reg->getPortableTensor(true_index); + auto false_tensor = _tensor_reg->getPortableTensor(false_index); auto fn = std::make_unique<ops::SelectLayer>(); @@ -847,10 +846,10 @@ void KernelGenerator::visit(const ir::operation::Slice &node) const auto begins_index{node.getInputs().at(ir::operation::Slice::Input::BEGINS)}; const auto sizes_index{node.getInputs().at(ir::operation::Slice::Input::SIZES)}; - auto output_tensor = _tensor_reg->getPortableTensor(output_index).get(); - auto input_tensor = _tensor_reg->getPortableTensor(input_index).get(); - auto begins_tensor = _tensor_reg->getPortableTensor(begins_index).get(); - auto sizes_tensor = _tensor_reg->getPortableTensor(sizes_index).get(); + auto output_tensor = _tensor_reg->getPortableTensor(output_index); + auto input_tensor = _tensor_reg->getPortableTensor(input_index); + auto begins_tensor = _tensor_reg->getPortableTensor(begins_index); + auto sizes_tensor = _tensor_reg->getPortableTensor(sizes_index); auto fn = std::make_unique<ops::SliceLayer>(); @@ -867,11 +866,11 @@ void KernelGenerator::visit(const ir::operation::StridedSlice &node) const auto 
ends_index{node.getInputs().at(ir::operation::StridedSlice::Input::ENDS)}; const auto strides_index{node.getInputs().at(ir::operation::StridedSlice::Input::STRIDES)}; - auto output_tensor = _tensor_reg->getPortableTensor(output_index).get(); - auto input_tensor = _tensor_reg->getPortableTensor(input_index).get(); - auto starts_tensor = _tensor_reg->getPortableTensor(starts_index).get(); - auto ends_tensor = _tensor_reg->getPortableTensor(ends_index).get(); - auto strides_tensor = _tensor_reg->getPortableTensor(strides_index).get(); + auto output_tensor = _tensor_reg->getPortableTensor(output_index); + auto input_tensor = _tensor_reg->getPortableTensor(input_index); + auto starts_tensor = _tensor_reg->getPortableTensor(starts_index); + auto ends_tensor = _tensor_reg->getPortableTensor(ends_index); + auto strides_tensor = _tensor_reg->getPortableTensor(strides_index); auto begin_mask = node.param().begin_mask; auto end_mask = node.param().end_mask; @@ -891,19 +890,18 @@ void KernelGenerator::visit(const ir::operation::Split &node) assert(num_splits == static_cast<int>(node.getOutputs().size())); const auto input_idx{node.getInputs().at(ir::operation::Split::Input::INPUT)}; - const auto rank = _ctx.at(input_idx).shape().rank(); - const auto axis = ops::getAxis(rank, node.param().axis, _current_op_seq_layout); - auto axis_resolved = axis < 0 ? axis + rank : axis; + const auto axis_idx{node.getInputs().at(ir::operation::Split::Input::AXIS)}; - auto in_tensor = _tensor_reg->getPortableTensor(input_idx).get(); + auto in_tensor = _tensor_reg->getPortableTensor(input_idx); + auto axis_tensor = _tensor_reg->getPortableTensor(axis_idx); std::vector<IPortableTensor *> out_tensors; for (auto &output_idx : node.getOutputs()) - out_tensors.emplace_back(_tensor_reg->getPortableTensor(output_idx).get()); + out_tensors.emplace_back(_tensor_reg->getPortableTensor(output_idx)); auto fn = std::make_unique<ops::SplitLayer>(); - fn->configure(in_tensor, num_splits, axis_resolved, out_tensors); + fn->configure(in_tensor, axis_tensor, num_splits, out_tensors); _return_fn = std::move(fn); } @@ -913,8 +911,8 @@ void KernelGenerator::visit(const ir::operation::Shape &node) const auto ofm_index{node.getOutputs().at(0)}; const auto ifm_index{node.getInputs().at(ir::operation::Shape::Input::INPUT)}; - auto ofm_tensor = _tensor_reg->getPortableTensor(ofm_index).get(); - auto ifm_tensor = _tensor_reg->getPortableTensor(ifm_index).get(); + auto ofm_tensor = _tensor_reg->getPortableTensor(ofm_index); + auto ifm_tensor = _tensor_reg->getPortableTensor(ifm_index); auto fn = std::make_unique<ops::ShapeLayer>(); @@ -928,18 +926,37 @@ void KernelGenerator::visit(const ir::operation::ResizeBilinear &node) const auto output_index{node.getOutputs().at(0)}; const auto input_index{node.getInputs().at(ir::operation::ResizeBilinear::INPUT)}; - auto output_height = node.param().height_out; - auto output_width = node.param().width_out; auto align_corners = node.param().align_corners; auto half_pixel_centers = node.param().half_pixel_centers; - auto output_tensor = _tensor_reg->getPortableTensor(output_index).get(); - auto input_tensor = _tensor_reg->getPortableTensor(input_index).get(); + auto output_tensor = _tensor_reg->getPortableTensor(output_index); + auto input_tensor = _tensor_reg->getPortableTensor(input_index); auto fn = std::make_unique<ops::ResizeBilinearLayer>(); - fn->configure(input_tensor, output_tensor, output_height, output_width, align_corners, - half_pixel_centers); + if (node.getInputs().size() == 1) + { + 
fn->configure(input_tensor, output_tensor, node.param().height_out, node.param().width_out, + align_corners, half_pixel_centers); + } + else + { + assert(node.getInputs().size() == 2); + const auto size_index{node.getInputs().at(ir::operation::ResizeBilinear::SIZE)}; + auto size_tensor = _tensor_reg->getPortableTensor(size_index); + if (size_tensor->is_constant()) + { + auto size_vec = _ctx.at(size_index).asVector<int32_t>(); + const auto height_out = size_vec[0]; + const auto width_out = size_vec[1]; + fn->configure(input_tensor, output_tensor, height_out, width_out, align_corners, + half_pixel_centers); + } + else + { + fn->configure(input_tensor, output_tensor, size_tensor, align_corners, half_pixel_centers); + } + } _return_fn = std::move(fn); } @@ -950,9 +967,9 @@ void KernelGenerator::visit(const ir::operation::Reverse &node) const auto input_index{node.getInputs().at(ir::operation::Reverse::INPUT)}; const auto axis_index{node.getInputs().at(ir::operation::Reverse::AXIS)}; - auto output_tensor = _tensor_reg->getPortableTensor(output_index).get(); - auto input_tensor = _tensor_reg->getPortableTensor(input_index).get(); - auto axis_tensor = _tensor_reg->getPortableTensor(axis_index).get(); + auto output_tensor = _tensor_reg->getPortableTensor(output_index); + auto input_tensor = _tensor_reg->getPortableTensor(input_index); + auto axis_tensor = _tensor_reg->getPortableTensor(axis_index); auto fn = std::make_unique<ops::ReverseLayer>(); @@ -965,15 +982,15 @@ void KernelGenerator::visit(const ir::operation::ArgMax &node) { const auto output_index{node.getOutputs().at(0)}; const auto input_index{node.getInputs().at(ir::operation::ArgMax::INPUT)}; + const auto axis_index{node.getInputs().at(ir::operation::ArgMax::AXIS)}; - const auto axis = node.param().axis; - - auto output_tensor = _tensor_reg->getPortableTensor(output_index).get(); - auto input_tensor = _tensor_reg->getPortableTensor(input_index).get(); + auto output_tensor = _tensor_reg->getPortableTensor(output_index); + auto input_tensor = _tensor_reg->getPortableTensor(input_index); + auto axis_tensor = _tensor_reg->getPortableTensor(axis_index); auto fn = std::make_unique<ops::ArgMinMaxLayer>(); - fn->configure(input_tensor, output_tensor, axis, /* is_arg_max */ true); + fn->configure(input_tensor, output_tensor, axis_tensor, /* is_arg_max */ true); _return_fn = std::move(fn); } @@ -992,8 +1009,8 @@ void KernelGenerator::visit(const ir::operation::Pool2D &node) ir::calculatePadding(node.param().padding, ifm_shape, ofm_shape, stride, kw, kh); const auto activation = node.param().activation; - auto ofm_tensor = _tensor_reg->getPortableTensor(ofm_index).get(); - auto ifm_tensor = _tensor_reg->getPortableTensor(ifm_index).get(); + auto ofm_tensor = _tensor_reg->getPortableTensor(ofm_index); + auto ifm_tensor = _tensor_reg->getPortableTensor(ifm_index); auto fn = std::make_unique<ops::PoolLayer>(); @@ -1010,9 +1027,9 @@ void KernelGenerator::visit(const ir::operation::Pow &node) const auto lhs_index{node.getInputs().at(ir::operation::Pow::LHS)}; const auto rhs_index{node.getInputs().at(ir::operation::Pow::RHS)}; - auto output_tensor = _tensor_reg->getPortableTensor(output_index).get(); - auto lhs_tensor = _tensor_reg->getPortableTensor(lhs_index).get(); - auto rhs_tensor = _tensor_reg->getPortableTensor(rhs_index).get(); + auto output_tensor = _tensor_reg->getPortableTensor(output_index); + auto lhs_tensor = _tensor_reg->getPortableTensor(lhs_index); + auto rhs_tensor = _tensor_reg->getPortableTensor(rhs_index); auto fn = 
std::make_unique<ops::PowLayer>(); @@ -1026,8 +1043,8 @@ void KernelGenerator::visit(const ir::operation::L2Normalization &node) const auto output_index{node.getOutputs().at(0)}; const auto input_index{node.getInputs().at(0)}; - auto output_alloc = _tensor_reg->getPortableTensor(output_index).get(); - auto input_alloc = _tensor_reg->getPortableTensor(input_index).get(); + auto output_alloc = _tensor_reg->getPortableTensor(output_index); + auto input_alloc = _tensor_reg->getPortableTensor(input_index); auto fn = std::make_unique<ops::L2NormLayer>(); @@ -1043,10 +1060,10 @@ void KernelGenerator::visit(const ir::operation::Range &node) const auto limit_index{node.getInputs().at(ir::operation::Range::LIMIT)}; const auto delta_index{node.getInputs().at(ir::operation::Range::DELTA)}; - auto output_tensor = _tensor_reg->getPortableTensor(output_index).get(); - auto start_tensor = _tensor_reg->getPortableTensor(start_index).get(); - auto limit_tensor = _tensor_reg->getPortableTensor(limit_index).get(); - auto delta_tensor = _tensor_reg->getPortableTensor(delta_index).get(); + auto output_tensor = _tensor_reg->getPortableTensor(output_index); + auto start_tensor = _tensor_reg->getPortableTensor(start_index); + auto limit_tensor = _tensor_reg->getPortableTensor(limit_index); + auto delta_tensor = _tensor_reg->getPortableTensor(delta_index); auto fn = std::make_unique<ops::RangeLayer>(); @@ -1059,8 +1076,8 @@ void KernelGenerator::visit(const ir::operation::Rank &node) const auto ofm_index{node.getOutputs().at(0)}; const auto ifm_index{node.getInputs().at(ir::operation::Shape::Input::INPUT)}; - auto ofm_tensor = _tensor_reg->getPortableTensor(ofm_index).get(); - auto ifm_tensor = _tensor_reg->getPortableTensor(ifm_index).get(); + auto ofm_tensor = _tensor_reg->getPortableTensor(ofm_index); + auto ifm_tensor = _tensor_reg->getPortableTensor(ifm_index); auto fn = std::make_unique<ops::RankLayer>(); @@ -1075,9 +1092,9 @@ void KernelGenerator::visit(const ir::operation::SquaredDifference &node) const auto lhs_index{node.getInputs().at(ir::operation::SquaredDifference::Input::LHS)}; const auto rhs_index{node.getInputs().at(ir::operation::SquaredDifference::Input::RHS)}; - auto ofm_tensor = _tensor_reg->getPortableTensor(ofm_index).get(); - auto lhs_tensor = _tensor_reg->getPortableTensor(lhs_index).get(); - auto rhs_tensor = _tensor_reg->getPortableTensor(rhs_index).get(); + auto ofm_tensor = _tensor_reg->getPortableTensor(ofm_index); + auto lhs_tensor = _tensor_reg->getPortableTensor(lhs_index); + auto rhs_tensor = _tensor_reg->getPortableTensor(rhs_index); auto fn = std::make_unique<ops::SqDiffLayer>(); @@ -1091,9 +1108,9 @@ void KernelGenerator::visit(const ir::operation::Tile &node) const auto input_index{node.getInputs().at(ir::operation::Tile::INPUT)}; const auto multiples_index{node.getInputs().at(ir::operation::Tile::MULTIPLES)}; - auto output_tensor = _tensor_reg->getPortableTensor(output_index).get(); - auto input_tensor = _tensor_reg->getPortableTensor(input_index).get(); - auto multiples_tensor = _tensor_reg->getPortableTensor(multiples_index).get(); + auto output_tensor = _tensor_reg->getPortableTensor(output_index); + auto input_tensor = _tensor_reg->getPortableTensor(input_index); + auto multiples_tensor = _tensor_reg->getPortableTensor(multiples_index); auto fn = std::make_unique<ops::TileLayer>(); @@ -1108,10 +1125,10 @@ void KernelGenerator::visit(const ir::operation::MatrixBandPart &node) const auto num_lower_index{node.getInputs().at(ir::operation::MatrixBandPart::NUM_LOWER_DIAG)}; 
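Nearly every hunk in this KernelGenerator.cc follows the same mechanical change: _tensor_reg->getPortableTensor(...) now hands back a raw IPortableTensor * instead of a std::shared_ptr, so the trailing .get() calls disappear. A minimal sketch of the ownership model this implies, with simplified, illustrative names (the real class is the PortableTensorRegistryTemplate reworked later in this patch): the registry keeps sole ownership through unique_ptr and kernel generators only borrow raw pointers.

// Illustrative sketch only, not part of the patch.
#include <memory>
#include <unordered_map>

struct IPortableTensor { virtual ~IPortableTensor() = default; };

class TensorRegistrySketch
{
public:
  void setNativeTensor(int index, std::unique_ptr<IPortableTensor> tensor)
  {
    _native[index] = std::move(tensor); // registry keeps sole ownership
  }

  IPortableTensor *getPortableTensor(int index)
  {
    auto it = _native.find(index);
    return it != _native.end() ? it->second.get() : nullptr; // borrowed pointer, no ref counting
  }

private:
  std::unordered_map<int, std::unique_ptr<IPortableTensor>> _native;
};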
const auto num_upper_index{node.getInputs().at(ir::operation::MatrixBandPart::NUM_UPPER_DIAG)}; - auto output_tensor = _tensor_reg->getPortableTensor(output_index).get(); - auto input_tensor = _tensor_reg->getPortableTensor(input_index).get(); - auto num_lower_tensor = _tensor_reg->getPortableTensor(num_lower_index).get(); - auto num_upper_tensor = _tensor_reg->getPortableTensor(num_upper_index).get(); + auto output_tensor = _tensor_reg->getPortableTensor(output_index); + auto input_tensor = _tensor_reg->getPortableTensor(input_index); + auto num_lower_tensor = _tensor_reg->getPortableTensor(num_lower_index); + auto num_upper_tensor = _tensor_reg->getPortableTensor(num_upper_index); auto fn = std::make_unique<ops::MatrixBandPartLayer>(); @@ -1125,9 +1142,9 @@ void KernelGenerator::visit(const ir::operation::BatchMatMul &node) const auto lhs_index{node.getInputs().at(ir::operation::BatchMatMul::LHS)}; const auto rhs_index{node.getInputs().at(ir::operation::BatchMatMul::RHS)}; - auto output_tensor = _tensor_reg->getPortableTensor(output_index).get(); - auto lhs_tensor = _tensor_reg->getPortableTensor(lhs_index).get(); - auto rhs_tensor = _tensor_reg->getPortableTensor(rhs_index).get(); + auto output_tensor = _tensor_reg->getPortableTensor(output_index); + auto lhs_tensor = _tensor_reg->getPortableTensor(lhs_index); + auto rhs_tensor = _tensor_reg->getPortableTensor(rhs_index); const auto adj_x = node.param().adj_x; const auto adj_y = node.param().adj_y; @@ -1144,9 +1161,9 @@ void KernelGenerator::visit(const ir::operation::BroadcastTo &node) const auto input_index{node.getInputs().at(ir::operation::BroadcastTo::INPUT)}; const auto shape_index{node.getInputs().at(ir::operation::BroadcastTo::SHAPE)}; - auto output_tensor = _tensor_reg->getPortableTensor(output_index).get(); - auto input_tensor = _tensor_reg->getPortableTensor(input_index).get(); - auto shape_tensor = _tensor_reg->getPortableTensor(shape_index).get(); + auto output_tensor = _tensor_reg->getPortableTensor(output_index); + auto input_tensor = _tensor_reg->getPortableTensor(input_index); + auto shape_tensor = _tensor_reg->getPortableTensor(shape_index); auto fn = std::make_unique<ops::BroadcastToLayer>(); @@ -1159,10 +1176,10 @@ void KernelGenerator::visit(const ir::operation::FusedBatchNorm &node) { const auto ofm_index{node.getOutputs().at(0)}; - auto output_tensor = _tensor_reg->getPortableTensor(ofm_index).get(); + auto output_tensor = _tensor_reg->getPortableTensor(ofm_index); std::vector<const IPortableTensor *> input_tensors; for (auto &ifm_idx : node.getInputs()) - input_tensors.emplace_back(_tensor_reg->getPortableTensor(ifm_idx).get()); + input_tensors.emplace_back(_tensor_reg->getPortableTensor(ifm_idx)); const auto epsilon = node.param().epsilon; const auto is_training = node.param().is_training; @@ -1183,8 +1200,8 @@ void KernelGenerator::visit(const ir::operation::LogSoftmax &node) const auto beta = node.param().beta; const auto axis = node.param().axis; - auto output_tensor = _tensor_reg->getPortableTensor(output_index).get(); - auto input_tensor = _tensor_reg->getPortableTensor(input_index).get(); + auto output_tensor = _tensor_reg->getPortableTensor(output_index); + auto input_tensor = _tensor_reg->getPortableTensor(input_index); auto fn = std::make_unique<ops::LogSoftMaxLayer>(); @@ -1200,10 +1217,10 @@ void KernelGenerator::visit(const ir::operation::SpaceToBatchND &node) const auto block_shape_index{node.getInputs().at(ir::operation::SpaceToBatchND::BLOCK_SIZE)}; const auto 
padding_index{node.getInputs().at(ir::operation::SpaceToBatchND::PADDINGS)}; - auto output_tensor = _tensor_reg->getPortableTensor(output_index).get(); - auto input_tensor = _tensor_reg->getPortableTensor(input_index).get(); - auto block_shape_tensor = _tensor_reg->getPortableTensor(block_shape_index).get(); - auto padding_tensor = _tensor_reg->getPortableTensor(padding_index).get(); + auto output_tensor = _tensor_reg->getPortableTensor(output_index); + auto input_tensor = _tensor_reg->getPortableTensor(input_index); + auto block_shape_tensor = _tensor_reg->getPortableTensor(block_shape_index); + auto padding_tensor = _tensor_reg->getPortableTensor(padding_index); auto fn = std::make_unique<ops::SpaceToBatchNDLayer>(); @@ -1218,8 +1235,8 @@ void KernelGenerator::visit(const ir::operation::SpaceToDepth &node) const auto output_index{node.getOutputs().at(0)}; auto block_size = node.param().block_size; - auto input_tensor = _tensor_reg->getPortableTensor(input_index).get(); - auto output_tensor = _tensor_reg->getPortableTensor(output_index).get(); + auto input_tensor = _tensor_reg->getPortableTensor(input_index); + auto output_tensor = _tensor_reg->getPortableTensor(output_index); auto fn = std::make_unique<ops::SpaceToDepthLayer>(); @@ -1233,9 +1250,9 @@ void KernelGenerator::visit(const ir::operation::StatelessRandomUniform &node) const auto shape_index{node.getInputs().at(ir::operation::StatelessRandomUniform::SHAPE)}; const auto seed_index{node.getInputs().at(ir::operation::StatelessRandomUniform::SEED)}; - auto output_alloc = _tensor_reg->getPortableTensor(output_index).get(); - auto shape_alloc = _tensor_reg->getPortableTensor(shape_index).get(); - auto seed_alloc = _tensor_reg->getPortableTensor(seed_index).get(); + auto output_alloc = _tensor_reg->getPortableTensor(output_index); + auto shape_alloc = _tensor_reg->getPortableTensor(shape_index); + auto seed_alloc = _tensor_reg->getPortableTensor(seed_index); auto fn = std::make_unique<ops::StatelessRandomUniformLayer>(); @@ -1252,13 +1269,13 @@ void KernelGenerator::visit(const ir::operation::SplitV &node) const auto size_splits{node.getInputs().at(ir::operation::SplitV::Input::SIZE_SPLITS)}; const auto split_dim{node.getInputs().at(ir::operation::SplitV::Input::SPLIT_DIM)}; - auto in_tensor = _tensor_reg->getPortableTensor(input_idx).get(); - auto in_size_splits = _tensor_reg->getPortableTensor(size_splits).get(); - auto in_split_dim = _tensor_reg->getPortableTensor(split_dim).get(); + auto in_tensor = _tensor_reg->getPortableTensor(input_idx); + auto in_size_splits = _tensor_reg->getPortableTensor(size_splits); + auto in_split_dim = _tensor_reg->getPortableTensor(split_dim); std::vector<IPortableTensor *> out_tensors; for (auto &output_idx : node.getOutputs()) - out_tensors.emplace_back(_tensor_reg->getPortableTensor(output_idx).get()); + out_tensors.emplace_back(_tensor_reg->getPortableTensor(output_idx)); auto fn = std::make_unique<ops::SplitVLayer>(); diff --git a/runtime/onert/backend/cpu/StaticTensorManager.cc b/runtime/onert/backend/cpu/StaticTensorManager.cc index 78c98dabf..3edac897c 100644 --- a/runtime/onert/backend/cpu/StaticTensorManager.cc +++ b/runtime/onert/backend/cpu/StaticTensorManager.cc @@ -41,7 +41,7 @@ void StaticTensorManager::allocateNonconsts(void) for (auto &pair : _tensors->native_tensors()) { const auto &ind = pair.first; - auto tensor = pair.second; + auto tensor = pair.second.get(); if (!_as_constants[ind] && !tensor->is_dynamic()) { auto *buffer = _nonconst_mgr->getBuffer(ind); @@ -62,13 +62,14 @@ void 
StaticTensorManager::buildTensor(const ir::OperandIndex &ind, assert(!_tensors->getITensor(ind)); if (as_const) { - auto tensor = std::make_shared<ExternalTensor>(tensor_info, backend_layout); - _tensors->setNativeTensor(ind, tensor); + auto tensor = std::make_unique<ExternalTensor>(tensor_info, backend_layout); + _tensors->setNativeTensor(ind, std::move(tensor)); } else { - auto tensor = std::make_shared<Tensor>(tensor_info, backend_layout, _dynamic_tensor_manager); - _tensors->setNativeTensor(ind, tensor); + auto tensor = std::make_unique<Tensor>(tensor_info, backend_layout, + _dynamic_tensor_manager->dynamic_mem_mgr().get()); + _tensors->setNativeTensor(ind, std::move(tensor)); } _as_constants[ind] = as_const; } diff --git a/runtime/onert/core/src/backend/controlflow/UserTensorRegistry.h b/runtime/onert/backend/cpu/Tensor.cc index fa2a2d54c..dac8f898b 100644 --- a/runtime/onert/core/src/backend/controlflow/UserTensorRegistry.h +++ b/runtime/onert/backend/cpu/Tensor.cc @@ -14,23 +14,19 @@ * limitations under the License. */ -#ifndef __ONERT_BACKEND_CONTROLFLOW_USER_TENSOR_REGISTRY__ -#define __ONERT_BACKEND_CONTROLFLOW_USER_TENSOR_REGISTRY__ - -#include "backend/ITensorRegistry.h" -#include "UserTensor.h" +#include "Tensor.h" namespace onert { namespace backend { -namespace controlflow +namespace cpu { -using UserTensorRegistry = PortableTensorRegistryTemplate<UserTensor>; +// `dynamic_cast` not working across library boundaries on NDK +// With this as a key function, `dynamic_cast` works across dl +ExternalTensor::~ExternalTensor() {} -} // namespace controlflow +} // namespace cpu } // namespace backend } // namespace onert - -#endif // __ONERT_BACKEND_CONTROLFLOW_USER_TENSOR_REGISTRY__ diff --git a/runtime/onert/backend/cpu/Tensor.h b/runtime/onert/backend/cpu/Tensor.h index 20e60260c..2ad2ad0fb 100644 --- a/runtime/onert/backend/cpu/Tensor.h +++ b/runtime/onert/backend/cpu/Tensor.h @@ -41,6 +41,7 @@ class ExternalTensor : public Tensor { public: ExternalTensor() = delete; + virtual ~ExternalTensor(); public: ExternalTensor(const ir::OperandInfo &info, const ir::Layout layout) @@ -95,6 +96,21 @@ public: } } + /** + * @brief Reset reference count to zero and release data + */ + void reset_ref() override + { + assert(_data != nullptr); + assert(_num_references > 0); + _num_references = 0; + + _data.reset(); + _buffer = nullptr; + } + + int32_t num_references() override { return _num_references; } + private: std::shared_ptr<const ir::Data> _data; }; diff --git a/runtime/onert/backend/cpu/TensorBuilder.cc b/runtime/onert/backend/cpu/TensorBuilder.cc index 828d52f7c..e6bc55b0b 100644 --- a/runtime/onert/backend/cpu/TensorBuilder.cc +++ b/runtime/onert/backend/cpu/TensorBuilder.cc @@ -85,16 +85,6 @@ void TensorBuilder::allocate() // This is because CPU kernels require `ITensor`s to be allocated before Kernel Generation. 
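The new runtime/onert/backend/cpu/Tensor.cc shown earlier in this hunk is nearly empty on purpose: its only job is to give ExternalTensor an out-of-line virtual destructor, the "key function" the comment refers to. Under the Itanium C++ ABI, a class whose virtual functions are all inline gets its vtable and type_info emitted as weak symbols in every shared object that uses it; across dlopen'ed libraries on Android that can leave dynamic_cast comparing two different type_info copies and failing. A rough sketch of the idea, using made-up names rather than the real classes:

// Illustrative sketch only, not part of the patch.
// widget.h
struct Base { virtual ~Base() = default; };
struct Widget : Base
{
  ~Widget() override; // declared but not defined inline: Widget is given a key function
};

// widget.cc, compiled into exactly one shared library, so Widget's vtable and
// type_info have a single home instead of weak per-library copies
Widget::~Widget() {}

// code in another .so loaded via dlopen() can now do
//   Base *b = ...;
//   auto *w = dynamic_cast<Widget *>(b);
// and get a non-null result for Widget objects, because both sides resolve to
// the same type_info.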
} -std::unique_ptr<ITensorManager> TensorBuilder::releaseStaticTensorManager(void) -{ - return std::move(_static_tensor_mgr); -} - -std::unique_ptr<ITensorManager> TensorBuilder::releaseDynamicTensorManager(void) -{ - return std::move(_dynamic_tensor_mgr); -} - } // namespace cpu } // namespace backend } // namespace onert diff --git a/runtime/onert/backend/cpu/TensorBuilder.h b/runtime/onert/backend/cpu/TensorBuilder.h index b6d5f09cc..448abc229 100644 --- a/runtime/onert/backend/cpu/TensorBuilder.h +++ b/runtime/onert/backend/cpu/TensorBuilder.h @@ -58,12 +58,8 @@ public: void allocate() override; void postFunctionPrepare() override { /* DO NOTHING */} - std::unique_ptr<ITensorManager> releaseStaticTensorManager(void) override; - IDynamicTensorManager *dynamicTensorManager(void) override { return _dynamic_tensor_mgr.get(); } - std::unique_ptr<ITensorManager> releaseDynamicTensorManager(void) override; - private: const std::shared_ptr<cpu_common::TensorRegistry> _tensor_reg; std::unique_ptr<cpu_common::DynamicTensorManager> _dynamic_tensor_mgr; diff --git a/runtime/onert/backend/cpu/ops/ArgMinMaxLayer.cc b/runtime/onert/backend/cpu/ops/ArgMinMaxLayer.cc index d7b0b2bce..2fd284c91 100644 --- a/runtime/onert/backend/cpu/ops/ArgMinMaxLayer.cc +++ b/runtime/onert/backend/cpu/ops/ArgMinMaxLayer.cc @@ -44,24 +44,29 @@ template <typename T> std::function<bool(T, T)> GetComparefunction(bool is_arg_m } } -void ArgMinMaxLayer::configure(const IPortableTensor *input, IPortableTensor *output, int32_t axis, - bool is_arg_max) +void ArgMinMaxLayer::configure(const IPortableTensor *input, IPortableTensor *output, + const IPortableTensor *axis, bool is_arg_max) { _input = input; _output = output; - if (axis < 0) - { - axis += input->num_dimensions(); - } _axis = axis; _is_arg_max = is_arg_max; } void ArgMinMaxLayer::run() { -#define TF_LITE_ARG_MIN_MAX(input_type, axis_type, output_type) \ - ArgMinMax(getTensorShape(_input), reinterpret_cast<const input_type *>(_input->buffer()), \ - getTensorShape(_output), reinterpret_cast<output_type *>(_output->buffer()), _axis, \ + if (_axis->total_size() != sizeof(int32_t)) + { + throw std::runtime_error("ArgMinMax: wrong shape of axis"); + } + auto axis = *reinterpret_cast<const int32_t *>(_axis->buffer()); + if (axis < 0) + { + axis += _input->num_dimensions(); + } +#define TF_LITE_ARG_MIN_MAX(input_type, axis_type, output_type) \ + ArgMinMax(getTensorShape(_input), reinterpret_cast<const input_type *>(_input->buffer()), \ + getTensorShape(_output), reinterpret_cast<output_type *>(_output->buffer()), axis, \ GetComparefunction<input_type>(_is_arg_max)); if (_output->data_type() == ir::DataType::INT32) { diff --git a/runtime/onert/backend/cpu/ops/ArgMinMaxLayer.h b/runtime/onert/backend/cpu/ops/ArgMinMaxLayer.h index d7c021624..4c864cb98 100644 --- a/runtime/onert/backend/cpu/ops/ArgMinMaxLayer.h +++ b/runtime/onert/backend/cpu/ops/ArgMinMaxLayer.h @@ -33,18 +33,18 @@ namespace ops class ArgMinMaxLayer : public ::onert::exec::IFunction { public: - ArgMinMaxLayer() : _input(nullptr), _output(nullptr), _axis(-1), _is_arg_max(true) {} + ArgMinMaxLayer() : _input(nullptr), _output(nullptr), _axis(nullptr), _is_arg_max(true) {} public: - void configure(const IPortableTensor *indices, IPortableTensor *output, int32_t axis, - bool is_arg_max); + void configure(const IPortableTensor *indices, IPortableTensor *output, + const IPortableTensor *axis, bool is_arg_max); void run() override; private: const IPortableTensor *_input; IPortableTensor *_output; - int32_t _axis; + 
const IPortableTensor *_axis; bool _is_arg_max; }; diff --git a/runtime/onert/backend/cpu/ops/BinaryArithmeticLayer.cc b/runtime/onert/backend/cpu/ops/BinaryArithmeticLayer.cc index f50c63375..8e51daad5 100644 --- a/runtime/onert/backend/cpu/ops/BinaryArithmeticLayer.cc +++ b/runtime/onert/backend/cpu/ops/BinaryArithmeticLayer.cc @@ -34,20 +34,21 @@ template <nnfw::cker::BinaryArithmeticOpType arithmetic_type, typename T> void eval(const IPortableTensor *lhs, const IPortableTensor *rhs, IPortableTensor *output, nnfw::cker::BinaryArithmeticOpParam op_params) { - const bool need_broadcast = - nnfw::cker::ProcessBroadcastShapes(getTensorShape(lhs), getTensorShape(rhs), &op_params); + const auto lhs_shape = getTensorShape(lhs); + const auto rhs_shape = getTensorShape(rhs); + const bool need_broadcast = nnfw::cker::ProcessBroadcastShapes(lhs_shape, rhs_shape, &op_params); if (need_broadcast) { nnfw::cker::BroadcastBinaryArithmeticOp<arithmetic_type>( - op_params, getTensorShape(lhs), reinterpret_cast<const T *>(lhs->buffer()), - getTensorShape(rhs), reinterpret_cast<const T *>(rhs->buffer()), getTensorShape(output), + op_params, lhs_shape, reinterpret_cast<const T *>(lhs->buffer()), rhs_shape, + reinterpret_cast<const T *>(rhs->buffer()), getTensorShape(output), reinterpret_cast<T *>(output->buffer())); return; } nnfw::cker::BinaryArithmeticOp<arithmetic_type>( - op_params, getTensorShape(lhs), reinterpret_cast<const T *>(lhs->buffer()), - getTensorShape(rhs), reinterpret_cast<const T *>(rhs->buffer()), getTensorShape(output), + op_params, lhs_shape, reinterpret_cast<const T *>(lhs->buffer()), rhs_shape, + reinterpret_cast<const T *>(rhs->buffer()), getTensorShape(output), reinterpret_cast<T *>(output->buffer())); } diff --git a/runtime/onert/backend/cpu/ops/FullyConnectedLayer.cc b/runtime/onert/backend/cpu/ops/FullyConnectedLayer.cc index 05da33abf..f873a3430 100644 --- a/runtime/onert/backend/cpu/ops/FullyConnectedLayer.cc +++ b/runtime/onert/backend/cpu/ops/FullyConnectedLayer.cc @@ -158,16 +158,30 @@ void FullyConnectedLayer::fullyConnectedSparseWeight() op_params.float_activation_max = output_activation_max; op_params.activation = convertActivationType(_activation); - int w0_size = getTensorShape(_weights).Dims(0); - const uint16_t *w1_segments = _weights->w1_segments(); - const uint16_t *w1_indices = _weights->w1_indices(); + const uint16_t *w1_segments = _weights->sparsity()->w1_segments(); + const uint16_t *w1_indices = _weights->sparsity()->w1_indices(); - nnfw::cker::FullyConnectedSparseWeight( - op_params, getTensorShape(_input), reinterpret_cast<const float *>(_input->buffer()), - getTensorShape(_weights), reinterpret_cast<const float *>(_weights->buffer()), - getTensorShape(_bias), reinterpret_cast<const float *>(_bias ? _bias->buffer() : nullptr), - getTensorShape(_output), reinterpret_cast<float *>(_output->buffer()), w0_size, w1_segments, - w1_indices); + auto block_size = _weights->sparsity()->block_size(); + if (block_size.size() == 0) + { + nnfw::cker::FullyConnectedSparseWeightRandom( + op_params, getTensorShape(_input), reinterpret_cast<const float *>(_input->buffer()), + getTensorShape(_weights), reinterpret_cast<const float *>(_weights->buffer()), + getTensorShape(_bias), reinterpret_cast<const float *>(_bias ? 
_bias->buffer() : nullptr), + getTensorShape(_output), reinterpret_cast<float *>(_output->buffer()), w1_segments, + w1_indices); + } + else if (block_size.size() == 2 && block_size[0] == 16 && block_size[1] == 1) + { + nnfw::cker::FullyConnectedSparseWeight16x1( + op_params, getTensorShape(_input), reinterpret_cast<const float *>(_input->buffer()), + getTensorShape(_weights), reinterpret_cast<const float *>(_weights->buffer()), + getTensorShape(_bias), reinterpret_cast<const float *>(_bias ? _bias->buffer() : nullptr), + getTensorShape(_output), reinterpret_cast<float *>(_output->buffer()), w1_segments, + w1_indices); + } + else + throw std::runtime_error{"FullyConnected: unsupported sparsity"}; } void FullyConnectedLayer::configure(const IPortableTensor *input, const IPortableTensor *weights, @@ -191,7 +205,7 @@ void FullyConnectedLayer::run() { fullyConnectedHybrid(); } - else if (_weights->is_sparse()) + else if (_weights->sparsity()) { fullyConnectedSparseWeight(); } @@ -239,17 +253,11 @@ void FullyConnectedLayer::prepare() const int rows = getTensorShape(_weights).Dims(0); if (rows % 4 == 0) { - const int total_input_size = getTensorShape(_input).FlatSize(); - const int input_size = getTensorShape(_weights).Dims(1); - const int batch_size = total_input_size / input_size; - if (batch_size <= 4) - { - // TODO If it's possible to extract precaching from ruy kernel, - // place this instead of below code + // TODO If it's possible to extract precaching from ruy kernel, + // place this instead of below code - // buffer will be used by ruy kernel as a cache key - _cached_weights = _weights->buffer(); - } + // buffer will be used by ruy kernel as a cache key + _cached_weights = _weights->buffer(); } #endif } diff --git a/runtime/onert/backend/cpu/ops/OperationUtils.h b/runtime/onert/backend/cpu/ops/OperationUtils.h index 98385521a..eb24dd43c 100644 --- a/runtime/onert/backend/cpu/ops/OperationUtils.h +++ b/runtime/onert/backend/cpu/ops/OperationUtils.h @@ -95,27 +95,18 @@ inline nnfw::cker::Shape getTensorShape(const IPortableTensor *tensor) if (tensor == nullptr) return nnfw::cker::Shape(); + const ir::Shape &shape = tensor->get_info().shape(); + assert(tensor->layout() == ir::Layout::NHWC); - constexpr int kMaxSmallSize = 8; - int32_t raw_shape_small[kMaxSmallSize]; - std::vector<int32_t> raw_shape_vec; - auto rank = tensor->num_dimensions(); - int32_t *data = nullptr; - if (rank > kMaxSmallSize) - { - raw_shape_vec.resize(rank); - data = raw_shape_vec.data(); - } - else - { - data = raw_shape_small; - } - for (uint32_t i = 0; i < rank; ++i) + auto rank = shape.rank(); + nnfw::cker::Shape ret(rank); + auto data = ret.DimsData(); + for (int i = 0; i < rank; ++i) { - data[i] = tensor->dimension(i); + data[i] = shape.dim(i); } - return nnfw::cker::Shape(rank, data); + return ret; } inline nnfw::cker::FusedActivationFunctionType diff --git a/runtime/onert/backend/cpu/ops/ReduceLayer.cc b/runtime/onert/backend/cpu/ops/ReduceLayer.cc index bb5f85d60..4a55b2a33 100644 --- a/runtime/onert/backend/cpu/ops/ReduceLayer.cc +++ b/runtime/onert/backend/cpu/ops/ReduceLayer.cc @@ -18,6 +18,7 @@ #include "OperationUtils.h" +#include "cker/neon/neon_check.h" #include <cker/operation/Reduce.h> namespace onert @@ -158,7 +159,7 @@ void evalSumQuantized(const IPortableTensor *input, IPortableTensor *output, ReduceLayer::ReduceLayer() : _input(nullptr), _axes(nullptr), _output(nullptr), _reduce_kernel(new nnfw::cker::Reduce()), - _kernel() + _kernel(), _reduceType(ReduceType::kInvalid) { // DO NOTHING } @@ -171,8 
+172,9 @@ void ReduceLayer::configure(const IPortableTensor *input, const IPortableTensor _input = input; _axes = axes; _output = output; + _reduceType = reduceType; - switch (reduceType) + switch (_reduceType) { case ReduceType::kSum: if (_input->data_type() == OperandType::QUANT_UINT8_ASYMM) @@ -199,13 +201,23 @@ void ReduceLayer::configure(const IPortableTensor *input, const IPortableTensor _kernel = generateKernelGeneric(_input, keep_dims, *_reduce_kernel, ReduceType::kAll); break; default: - throw std::runtime_error{"ReduceSum: Unsupported reduce type"}; + throw std::runtime_error{"Reduce: Unsupported reduce type"}; } } void ReduceLayer::run() { const auto axes = getReducerAxes(_axes); +#ifdef USE_NEON + int32_t rank = _input->num_dimensions(); + if (_input->data_type() == ir::DataType::FLOAT32 && _reduceType == ReduceType::kSum && + axes.size() == 1 && (axes[0] == -1 || axes[0] == rank - 1)) + { + OptimizedReduceSum(reinterpret_cast<const float *>(_input->buffer()), getTensorShape(_input), + reinterpret_cast<float *>(_output->buffer())); + return; + } +#endif // NEON _kernel(_input, _output, axes); } diff --git a/runtime/onert/backend/cpu/ops/ReduceLayer.h b/runtime/onert/backend/cpu/ops/ReduceLayer.h index 332d399bd..8265dd41f 100644 --- a/runtime/onert/backend/cpu/ops/ReduceLayer.h +++ b/runtime/onert/backend/cpu/ops/ReduceLayer.h @@ -17,6 +17,8 @@ #ifndef __ONERT_BACKEND_CPU_OPS_REDUCESUMLAYER_H__ #define __ONERT_BACKEND_CPU_OPS_REDUCESUMLAYER_H__ +#include "cker/neon/neon_check.h" + #include <backend/IPortableTensor.h> #include <exec/IFunction.h> @@ -47,6 +49,7 @@ enum class ReduceType kMin, kAny, kAll, + kInvalid // For debug and initialize }; class ReduceLayer : public ::onert::exec::IFunction @@ -70,6 +73,8 @@ private: std::function<void(const IPortableTensor *input, IPortableTensor *output, const std::vector<int> &axes)> _kernel; + + ReduceType _reduceType; }; } // namespace ops diff --git a/runtime/onert/backend/cpu/ops/ResizeBilinearLayer.cc b/runtime/onert/backend/cpu/ops/ResizeBilinearLayer.cc index 180094bb8..1fe56cb99 100644 --- a/runtime/onert/backend/cpu/ops/ResizeBilinearLayer.cc +++ b/runtime/onert/backend/cpu/ops/ResizeBilinearLayer.cc @@ -28,16 +28,39 @@ namespace ops { ResizeBilinearLayer::ResizeBilinearLayer() - : _input(nullptr), _output(nullptr), _output_height(0), _output_width(0), _align_corners(false), - _half_pixel_centers(false) + : _input(nullptr), _output(nullptr), _size(nullptr), _output_height(0), _output_width(0), + _align_corners(false), _half_pixel_centers(false) { // DO NOTHING } void ResizeBilinearLayer::configure(const IPortableTensor *input, IPortableTensor *output, + const IPortableTensor *size, bool align_corners, + bool half_pixel_centers) +{ + assert(!size->is_constant()); + _input = input; + _output = output; + _size = size; + _align_corners = align_corners; + _half_pixel_centers = half_pixel_centers; +} + +void ResizeBilinearLayer::configure(const IPortableTensor *input, IPortableTensor *output, int32_t output_height, int32_t output_width, bool align_corners, bool half_pixel_centers) { + assert(_size == nullptr); + if (output_height < 0) + { + throw std::runtime_error{"ResizeBilinear: size value must be positive value, output_height = " + + std::to_string(output_height)}; + } + if (output_width < 0) + { + throw std::runtime_error{"ResizeBilinear: size value must be positive value, output_width = " + + std::to_string(output_width)}; + } _input = input; _output = output; _output_height = output_height; @@ -49,10 +72,19 @@ void 
ResizeBilinearLayer::configure(const IPortableTensor *input, IPortableTenso void ResizeBilinearLayer::run() { nnfw::cker::ResizeBilinearParams params; + if (_size == nullptr) + { + params.output_height = _output_height; + params.output_width = _output_width; + } + else + { + const auto size_buf = reinterpret_cast<const int32_t *>(_size->buffer()); + params.output_height = size_buf[0]; + params.output_width = size_buf[1]; + } params.align_corners = _align_corners; params.half_pixel_centers = _half_pixel_centers; - params.output_height = _output_height; - params.output_width = _output_width; switch (_input->data_type()) { diff --git a/runtime/onert/backend/cpu/ops/ResizeBilinearLayer.h b/runtime/onert/backend/cpu/ops/ResizeBilinearLayer.h index fc49b348e..d7ae1c620 100644 --- a/runtime/onert/backend/cpu/ops/ResizeBilinearLayer.h +++ b/runtime/onert/backend/cpu/ops/ResizeBilinearLayer.h @@ -36,7 +36,10 @@ public: ResizeBilinearLayer(); public: - void configure(const IPortableTensor *input1, IPortableTensor *output, int32_t output_height, + void configure(const IPortableTensor *input1, IPortableTensor *output, + const IPortableTensor *size, bool align_corners, bool half_pixel_centers); + + void configure(const IPortableTensor *input, IPortableTensor *output, int32_t output_height, int32_t output_width, bool align_corners, bool half_pixel_centers); void run() override; @@ -44,6 +47,7 @@ public: private: const IPortableTensor *_input; IPortableTensor *_output; + const IPortableTensor *_size; int32_t _output_height; int32_t _output_width; bool _align_corners; diff --git a/runtime/onert/backend/cpu/ops/SoftMaxLayer.cc b/runtime/onert/backend/cpu/ops/SoftMaxLayer.cc index 095e67abc..b42be3042 100644 --- a/runtime/onert/backend/cpu/ops/SoftMaxLayer.cc +++ b/runtime/onert/backend/cpu/ops/SoftMaxLayer.cc @@ -62,7 +62,11 @@ void SoftMaxLayer::softmaxFloat32() } else { - throw std::runtime_error{"only 1D, 2D and 4D tensors supported"}; + nnfw::cker::SoftmaxParams op_params; + op_params.beta = _beta; + nnfw::cker::reference::Softmax( + op_params, getTensorShape(_input), reinterpret_cast<const float *>(_input->buffer()), + getTensorShape(_output), reinterpret_cast<float *>(_output->buffer())); } } diff --git a/runtime/onert/backend/cpu/ops/SplitLayer.cc b/runtime/onert/backend/cpu/ops/SplitLayer.cc index 1f40654c1..922cde2e3 100644 --- a/runtime/onert/backend/cpu/ops/SplitLayer.cc +++ b/runtime/onert/backend/cpu/ops/SplitLayer.cc @@ -29,7 +29,7 @@ namespace cpu namespace ops { -SplitLayer::SplitLayer() : _input(nullptr), _num_splits(0), _axis(0), _outputs() +SplitLayer::SplitLayer() : _input(nullptr), _axis(nullptr), _num_splits(0), _outputs() { // DO NOTHING } @@ -37,7 +37,16 @@ SplitLayer::SplitLayer() : _input(nullptr), _num_splits(0), _axis(0), _outputs() template <typename T> void SplitLayer::split(void) { nnfw::cker::SplitParams op_params; - op_params.axis = _axis; + if (_axis->total_size() != sizeof(int32_t)) + { + throw std::runtime_error("ArgMinMax: wrong shape of axis"); + } + auto axis = *reinterpret_cast<const int32_t *>(_axis->buffer()); + if (axis < 0) + { + axis += _input->num_dimensions(); + } + op_params.axis = axis; op_params.num_split = _num_splits; std::vector<T *> outputPtrs; @@ -53,8 +62,8 @@ template <typename T> void SplitLayer::split(void) getTensorShape(_outputs[0]), outputPtrs.data()); } -void SplitLayer::configure(const IPortableTensor *input, uint16_t num_splits, int16_t axis, - std::vector<IPortableTensor *> &outputs) +void SplitLayer::configure(const IPortableTensor *input, 
const IPortableTensor *axis, + uint16_t num_splits, std::vector<IPortableTensor *> &outputs) { assert(input != nullptr); diff --git a/runtime/onert/backend/cpu/ops/SplitLayer.h b/runtime/onert/backend/cpu/ops/SplitLayer.h index 0719a0063..090f87166 100644 --- a/runtime/onert/backend/cpu/ops/SplitLayer.h +++ b/runtime/onert/backend/cpu/ops/SplitLayer.h @@ -38,15 +38,15 @@ public: public: template <typename T> void split(void); - void configure(const IPortableTensor *input, uint16_t num_splits, int16_t axis, + void configure(const IPortableTensor *input, const IPortableTensor *axis, uint16_t num_splits, std::vector<IPortableTensor *> &outputs); void run() override; private: const IPortableTensor *_input; + const IPortableTensor *_axis; uint16_t _num_splits; - int16_t _axis; std::vector<IPortableTensor *> _outputs; }; diff --git a/runtime/onert/backend/cpu/ops/StridedSliceLayer.cc b/runtime/onert/backend/cpu/ops/StridedSliceLayer.cc index dcbb87734..f77f4d691 100644 --- a/runtime/onert/backend/cpu/ops/StridedSliceLayer.cc +++ b/runtime/onert/backend/cpu/ops/StridedSliceLayer.cc @@ -37,17 +37,17 @@ StridedSliceLayer::StridedSliceLayer() template <typename T> void StridedSliceLayer::stridedSliceImpl() { + const auto input_shape = getTensorShape(_input); + const auto output_shape = getTensorShape(_output); auto op_params = nnfw::cker::buildStridedSliceParams( reinterpret_cast<uint32_t *>(_begin->buffer()), reinterpret_cast<uint32_t *>(_end->buffer()), reinterpret_cast<uint32_t *>(_strides->buffer()), _begin_mask, _end_mask, _shrink_axis_mask, - getTensorShape(_input).DimensionsCount()); + input_shape.DimensionsCount()); - nnfw::cker::checkOutputSize(op_params, getTensorShape(_input), getTensorShape(_output), - getTensorShape(_input).DimensionsCount()); + nnfw::cker::checkOutputSize(op_params, input_shape, output_shape, input_shape.DimensionsCount()); - nnfw::cker::StridedSlice(op_params, getTensorShape(_input), - reinterpret_cast<const T *>(_input->buffer()), getTensorShape(_output), - reinterpret_cast<T *>(_output->buffer())); + nnfw::cker::StridedSlice(op_params, input_shape, reinterpret_cast<const T *>(_input->buffer()), + output_shape, reinterpret_cast<T *>(_output->buffer())); } void StridedSliceLayer::configure(const IPortableTensor *input, const IPortableTensor *begin, diff --git a/runtime/onert/backend/cpu/ops/TransposeLayer.cc b/runtime/onert/backend/cpu/ops/TransposeLayer.cc index 7b232562a..3362c3396 100644 --- a/runtime/onert/backend/cpu/ops/TransposeLayer.cc +++ b/runtime/onert/backend/cpu/ops/TransposeLayer.cc @@ -19,6 +19,7 @@ #include "OperationUtils.h" #include <cker/operation/Transpose.h> +#include <numeric> namespace onert { @@ -29,7 +30,7 @@ namespace cpu namespace ops { -TransposeLayer::TransposeLayer() : _input(nullptr), _output(nullptr), _perm() +TransposeLayer::TransposeLayer() : _input(nullptr), _perm(nullptr), _output(nullptr) { // DO NOTHING } @@ -37,10 +38,23 @@ TransposeLayer::TransposeLayer() : _input(nullptr), _output(nullptr), _perm() template <typename T> void TransposeLayer::transpose() { nnfw::cker::TransposeParams param; - param.perm_count = _perm.size(); - for (size_t i = 0; i < _perm.size(); i++) + assert(_perm->num_dimensions() == 1); + + param.perm_count = _input->num_dimensions(); + if (_perm->dimension(0) == 0) // This means _perm is (n-1...0) + { + const auto begin = param.perm; + const auto end = param.perm + _input->num_dimensions(); + std::iota(begin, end, 0); + std::reverse(begin, end); + } + else { - param.perm[i] = _perm[i]; + 
assert(param.perm_count == static_cast<int>(_perm->dimension(0))); + for (auto i = 0; i < param.perm_count; i++) + { + param.perm[i] = *(reinterpret_cast<const int32_t *>(_perm->buffer()) + i); + } } nnfw::cker::Transpose(param, getTensorShape(_input), @@ -63,8 +77,8 @@ void TransposeLayer::transposeQuant8() transpose<uint8_t>(); } -void TransposeLayer::configure(const IPortableTensor *input, IPortableTensor *output, - const std::vector<int> &perm) +void TransposeLayer::configure(const IPortableTensor *input, const IPortableTensor *perm, + IPortableTensor *output) { _input = input; _perm = perm; diff --git a/runtime/onert/backend/cpu/ops/TransposeLayer.h b/runtime/onert/backend/cpu/ops/TransposeLayer.h index f9cb12770..c8e9f8ae7 100644 --- a/runtime/onert/backend/cpu/ops/TransposeLayer.h +++ b/runtime/onert/backend/cpu/ops/TransposeLayer.h @@ -40,15 +40,15 @@ public: void transposeQuant8(); - void configure(const IPortableTensor *input, IPortableTensor *output, - const std::vector<int> &perm); + void configure(const IPortableTensor *input, const IPortableTensor *perm, + IPortableTensor *output); void run() override; private: const IPortableTensor *_input; + const IPortableTensor *_perm; IPortableTensor *_output; - std::vector<int> _perm; }; } // namespace ops diff --git a/runtime/onert/core/CMakeLists.txt b/runtime/onert/core/CMakeLists.txt index d58b47ced..344b2a972 100644 --- a/runtime/onert/core/CMakeLists.txt +++ b/runtime/onert/core/CMakeLists.txt @@ -13,6 +13,11 @@ target_link_libraries(onert_core PRIVATE nnfw_coverage) target_link_libraries(onert_core PRIVATE dl ${LIB_PTHREAD}) target_link_libraries(onert_core PRIVATE jsoncpp) target_link_libraries(onert_core INTERFACE ruy_instrumentation) +# NOTE Below line is added to remove warning for android build +# It will be removed after android build uses gold linker +if (ANDROID) + target_link_libraries(onert_core INTERFACE log) +endif (ANDROID) if(ENVVAR_ONERT_CONFIG) target_compile_definitions(onert_core PRIVATE ENVVAR_FOR_DEFAULT_CONFIG) diff --git a/runtime/onert/core/include/backend/CustomKernelBuilder.h b/runtime/onert/core/include/backend/CustomKernelBuilder.h index 101272135..cae2fc1a3 100644 --- a/runtime/onert/core/include/backend/CustomKernelBuilder.h +++ b/runtime/onert/core/include/backend/CustomKernelBuilder.h @@ -49,10 +49,10 @@ struct TypeInfo struct CustomKernelConfigParams { - std::vector<std::shared_ptr<backend::IPortableTensor>> input_tensors; + std::vector<backend::IPortableTensor *> input_tensors; std::vector<TypeInfo> input_types; - std::vector<std::shared_ptr<backend::IPortableTensor>> output_tensors; + std::vector<backend::IPortableTensor *> output_tensors; std::vector<TypeInfo> output_types; char *userdata; diff --git a/runtime/onert/core/include/backend/IDynamicTensorManager.h b/runtime/onert/core/include/backend/IDynamicTensorManager.h index 343c52c4a..67cfda24e 100644 --- a/runtime/onert/core/include/backend/IDynamicTensorManager.h +++ b/runtime/onert/core/include/backend/IDynamicTensorManager.h @@ -39,24 +39,12 @@ struct IDynamicTensorManager : public ITensorManager public: /** - * @brief Set new shape and allocate memory for dynamic tensor. - * If a tensor is dynamic tensor and previously allocated memory exists, - * it will be deallocated. - * If a tensor is static tensor (with previously allocated memory by StaticTensorManager), - * tensor->buffer() will be overwrite to the dynamically allocated memory - * @param ind operand index of a tensor - * @param new_shape tensor's new shape. 
While allocating memory for this new_shape, - * tensor's shape is set to new_shape - */ - virtual void applyShape(const ir::OperandIndex &ind, const ir::Shape &new_shape) = 0; - - /** * @brief Plan when to delete a tensor. Note this planning is done at compilation time. * @param op_ind operation index - * @param operand_ind operand index of input operand of first param op. Operand can be static + * @param tensor candidate ITensor to dealloc. Tensor can be static * or dynamic since tensor type may not be clearly known at compilation time. */ - virtual void planDealloc(ir::OperationIndex op_ind, ir::OperandIndex operand_ind) = 0; + virtual void planDealloc(ir::OperationIndex op_ind, backend::ITensor *tensor) = 0; /** * @brief Deallocate input tensors of op if an input tensor is a dynamic tensor and it won't @@ -64,12 +52,6 @@ public: * @note This will work after calling planDealloc */ virtual void deallocInput(ir::OperationIndex op_ind) = 0; - - /** - * @brief Deallocate an output tensor if the tensor is a dynamic tensor - * @note This will work after calling planDealloc - */ - virtual void deallocSubgraphOutput(ir::OperandIndex ind) = 0; }; } // namespace backend diff --git a/runtime/onert/core/include/backend/IPortableTensor.h b/runtime/onert/core/include/backend/IPortableTensor.h index a05b39a33..1b1f05fe1 100644 --- a/runtime/onert/core/include/backend/IPortableTensor.h +++ b/runtime/onert/core/include/backend/IPortableTensor.h @@ -18,6 +18,8 @@ #define __ONERT_BACKEND_I_PORTABLE_TENSOR_H__ #include "backend/ITensor.h" +#include "ir/OperandInfo.h" +#include "ir/Sparsity.h" namespace onert { @@ -36,14 +38,18 @@ namespace backend class IPortableTensor : public ITensor { public: - virtual ~IPortableTensor() = default; - virtual bool is_sparse() const { return false; } - virtual const uint16_t *w1_segments() const { return nullptr; } - virtual const uint16_t *w1_indices() const { return nullptr; } + IPortableTensor(const ir::OperandInfo &info) : _info(info) {} + + virtual ~IPortableTensor(); + virtual const ir::Sparsity *sparsity() const { return nullptr; } + const ir::OperandInfo &get_info() const { return _info; } public: bool has_padding() const final { return false; } void access(const std::function<void(ITensor &tensor)> &fn) final { fn(*this); } + +protected: + ir::OperandInfo _info; }; } // namespace backend diff --git a/runtime/onert/core/include/backend/ITensor.h b/runtime/onert/core/include/backend/ITensor.h index 12b1c5433..b18dd30a2 100644 --- a/runtime/onert/core/include/backend/ITensor.h +++ b/runtime/onert/core/include/backend/ITensor.h @@ -53,13 +53,19 @@ public: virtual void access(const std::function<void(ITensor &tensor)> &fn) = 0; /** - * @brief Return the dynamic tensor manager + * @brief Set the shape to @c shape and possibly re-allocate the buffer * - * If dynamic tensors are not supported, it returns @c nullptr . + * If a tensor is dynamic tensor and previously allocated memory exists, + * it will be deallocated. + * If a tensor is static tensor (with previously allocated memory by StaticTensorManager), + * @c buffer() will be overwriten * - * @return IDynamicTensorManager* DynamicTensorManager + * @param shape tensor's new shape. 
While allocating memory for this new_shape, + * tensor's shape is set to new_shape + * @return true If applying shape is successful + * @return false If not applying shape is not supported (it throws for other errors) */ - virtual IDynamicTensorManager *dynamic_tensor_manager() { return nullptr; } + virtual bool applyShape(const ir::Shape &) { return false; } /** * @brief Return true if the tensor is constant diff --git a/runtime/onert/core/include/backend/ITensorBuilder.h b/runtime/onert/core/include/backend/ITensorBuilder.h index f93ab81ae..97721cf19 100644 --- a/runtime/onert/core/include/backend/ITensorBuilder.h +++ b/runtime/onert/core/include/backend/ITensorBuilder.h @@ -89,14 +89,6 @@ public: // methods for static tensor allocation */ virtual void postFunctionPrepare() = 0; - /** - * @brief Release static @c ITensorManger object which was built - * Before calling this, @c allocate must have been called - * - * @return std::unique_ptr<ITensorManager> Tensor Manager object - */ - virtual std::unique_ptr<ITensorManager> releaseStaticTensorManager(void) = 0; - public: // methods for dynamic tensor allocation /** * @brief Get dynamicTensorManager. If a backend does not support dynamic tensor, exception @@ -108,14 +100,6 @@ public: // methods for dynamic tensor allocation * to the end of execution */ virtual IDynamicTensorManager *dynamicTensorManager(void) { return nullptr; } - - /** - * @brief Release dynamic @c ITensorManger object which was built - * Before calling this, @c allocate must have been called - * - * @return std::unique_ptr<ITensorManager> Tensor Manager object - */ - virtual std::unique_ptr<ITensorManager> releaseDynamicTensorManager(void) { return nullptr; } }; } // namespace backend diff --git a/runtime/onert/core/include/backend/ITensorRegistry.h b/runtime/onert/core/include/backend/ITensorRegistry.h index 88fcb0fcd..b256a1fb8 100644 --- a/runtime/onert/core/include/backend/ITensorRegistry.h +++ b/runtime/onert/core/include/backend/ITensorRegistry.h @@ -43,7 +43,7 @@ struct ITensorRegistry * * @note Return tensor cannot be used longer than dynamic tensor manager */ - virtual std::shared_ptr<ITensor> getITensor(const ir::OperandIndex &) = 0; + virtual ITensor *getITensor(const ir::OperandIndex &) = 0; /** * @brief Returns pointer of ITensor among native tensors * @@ -51,17 +51,14 @@ struct ITensorRegistry * * @note Returned tensor cannot be used longer than dynamic tensor manager */ - virtual std::shared_ptr<ITensor> getNativeITensor(const ir::OperandIndex &) = 0; + virtual ITensor *getNativeITensor(const ir::OperandIndex &) = 0; /** * @brief Set the Migrant Tensor which are from other backends * * @return true if supported * @return false if not supported */ - virtual bool setMigrantTensor(const ir::OperandIndex &, const std::shared_ptr<IPortableTensor> &) - { - return false; - } + virtual bool setMigrantTensor(const ir::OperandIndex &, IPortableTensor *) { return false; } }; } // namespace backend @@ -85,41 +82,37 @@ namespace backend template <typename T_Tensor> class PortableTensorRegistryTemplate : public ITensorRegistry { public: - std::shared_ptr<ITensor> getITensor(const ir::OperandIndex &ind) override + ITensor *getITensor(const ir::OperandIndex &ind) override { static_assert(std::is_base_of<ITensor, T_Tensor>::value, "T_Tensor must derive from ITensor."); - auto external_tensor = _migrant.find(ind); - if (external_tensor != _migrant.end()) - return external_tensor->second; + auto _migrant_tensor = _migrant.find(ind); + if (_migrant_tensor != _migrant.end()) + 
return _migrant_tensor->second; return getNativeTensor(ind); } - std::shared_ptr<ITensor> getNativeITensor(const ir::OperandIndex &ind) override - { - return getNativeTensor(ind); - } + ITensor *getNativeITensor(const ir::OperandIndex &ind) override { return getNativeTensor(ind); } - std::shared_ptr<IPortableTensor> getPortableTensor(const ir::OperandIndex &ind) + IPortableTensor *getPortableTensor(const ir::OperandIndex &ind) { - auto external_tensor = _migrant.find(ind); - if (external_tensor != _migrant.end()) + auto _migrant_tensor = _migrant.find(ind); + if (_migrant_tensor != _migrant.end()) { - if (external_tensor->second) - return external_tensor->second; + if (_migrant_tensor->second) + return _migrant_tensor->second; } return getNativeTensor(ind); } - std::shared_ptr<T_Tensor> getNativeTensor(const ir::OperandIndex &ind) + T_Tensor *getNativeTensor(const ir::OperandIndex &ind) { auto tensor = _native.find(ind); if (tensor != _native.end()) - return tensor->second; + return tensor->second.get(); return nullptr; } - bool setMigrantTensor(const ir::OperandIndex &ind, - const std::shared_ptr<IPortableTensor> &tensor) override + bool setMigrantTensor(const ir::OperandIndex &ind, IPortableTensor *tensor) override { assert(tensor != nullptr); auto itr = _native.find(ind); @@ -129,25 +122,22 @@ public: return true; } - void setNativeTensor(const ir::OperandIndex &ind, const std::shared_ptr<T_Tensor> &tensor) + void setNativeTensor(const ir::OperandIndex &ind, std::unique_ptr<T_Tensor> &&tensor) { assert(tensor != nullptr); auto itr = _migrant.find(ind); if (itr != _migrant.end()) throw std::runtime_error{"Tried to set a native tensor but a migrant tensor already exists."}; - _native[ind] = tensor; + _native[ind] = std::move(tensor); } - const ir::OperandIndexMap<std::shared_ptr<T_Tensor>> &native_tensors() { return _native; } + const ir::OperandIndexMap<std::unique_ptr<T_Tensor>> &native_tensors() { return _native; } - const ir::OperandIndexMap<std::shared_ptr<IPortableTensor>> &migrant_tensors() - { - return _migrant; - } + const ir::OperandIndexMap<IPortableTensor *> &migrant_tensors() { return _migrant; } private: - ir::OperandIndexMap<std::shared_ptr<IPortableTensor>> _migrant; - ir::OperandIndexMap<std::shared_ptr<T_Tensor>> _native; + ir::OperandIndexMap<IPortableTensor *> _migrant; + ir::OperandIndexMap<std::unique_ptr<T_Tensor>> _native; }; } // namespace backend diff --git a/runtime/onert/core/include/backend/cpu_common/DynamicTensorManager.h b/runtime/onert/core/include/backend/cpu_common/DynamicTensorManager.h index e3c8c8666..c4e06aa82 100644 --- a/runtime/onert/core/include/backend/cpu_common/DynamicTensorManager.h +++ b/runtime/onert/core/include/backend/cpu_common/DynamicTensorManager.h @@ -44,14 +44,16 @@ public: virtual ~DynamicTensorManager() = default; - void applyShape(const ir::OperandIndex &ind, const ir::Shape &new_shape) override; - void buildTensor(const ir::OperandIndex &ind, const ir::OperandInfo &tensor_info, ir::Layout backend_layout); - void planDealloc(ir::OperationIndex op_ind, ir::OperandIndex operand_ind) override; + void planDealloc(ir::OperationIndex op_ind, backend::ITensor *tensor) override; void deallocInput(ir::OperationIndex op_ind) override; - void deallocSubgraphOutput(ir::OperandIndex ind) override; + + std::shared_ptr<DynamicMemoryManager> dynamic_mem_mgr() { return _dynamic_mem_mgr; } + +private: + const ITensor *getRawITensor(ir::OperandIndex ind); private: /** @@ -63,7 +65,8 @@ private: // contains list of dynamic tensor index, which can be 
deallocated after running operation // note: this map could contain static tensor index too. Careful use is required. - std::unordered_map<ir::OperationIndex, std::unordered_set<ir::OperandIndex>> _dealloc_tensor_map; + std::unordered_map<ir::OperationIndex, std::unordered_set<backend::ITensor *>> + _dealloc_tensor_map; }; } // namespace cpu_common diff --git a/runtime/onert/core/include/backend/cpu_common/MemoryManager.h b/runtime/onert/core/include/backend/cpu_common/MemoryManager.h index 4be7a1a11..28ec6b803 100644 --- a/runtime/onert/core/include/backend/cpu_common/MemoryManager.h +++ b/runtime/onert/core/include/backend/cpu_common/MemoryManager.h @@ -20,12 +20,14 @@ #include "Allocator.h" #include "backend/IMemoryManager.h" #include "IMemoryPlanner.h" -#include "ir/OperandIndexMap.h" namespace onert { namespace backend { + +class ITensor; + namespace cpu_common { @@ -59,12 +61,12 @@ public: DynamicMemoryManager() = default; virtual ~DynamicMemoryManager() = default; - std::shared_ptr<Allocator> allocate(const ir::OperandIndex &ind, uint32_t capacity); - void deallocate(const ir::OperandIndex &ind); + std::shared_ptr<Allocator> allocate(const ITensor *tensor, uint32_t capacity); + void deallocate(const ITensor *tensor); void deallocate(void); private: - ir::OperandIndexMap<std::shared_ptr<Allocator>> _mem_alloc_map; + std::unordered_map<const ITensor *, std::shared_ptr<Allocator>> _mem_alloc_map; }; } // namespace cpu_common diff --git a/runtime/onert/core/include/backend/cpu_common/StaticTensorManager.h b/runtime/onert/core/include/backend/cpu_common/StaticTensorManager.h index 3f09b7a4a..fa50b551e 100644 --- a/runtime/onert/core/include/backend/cpu_common/StaticTensorManager.h +++ b/runtime/onert/core/include/backend/cpu_common/StaticTensorManager.h @@ -20,7 +20,6 @@ #include "MemoryManager.h" #include "backend/IStaticTensorManager.h" -#include "backend/IDynamicTensorManager.h" #include "ir/OperandIndexMap.h" #include "ir/OperandInfo.h" #include "TensorRegistry.h" @@ -32,11 +31,13 @@ namespace backend namespace cpu_common { +class DynamicTensorManager; + class StaticTensorManager : public backend::IStaticTensorManager { public: StaticTensorManager(const std::shared_ptr<TensorRegistry> ®, - IDynamicTensorManager *dynamic_tensor_manager); + DynamicMemoryManager *dynamic_mem_mgr); virtual ~StaticTensorManager() = default; void allocateConsts(void); @@ -57,7 +58,7 @@ private: std::unique_ptr<MemoryManager> _nonconst_mgr; const std::shared_ptr<TensorRegistry> _tensors; ir::OperandIndexMap<bool> _as_constants; - IDynamicTensorManager *_dynamic_tensor_manager; + DynamicMemoryManager *_dynamic_mem_mgr; }; } // namespace cpu_common diff --git a/runtime/onert/core/include/backend/cpu_common/Tensor.h b/runtime/onert/core/include/backend/cpu_common/Tensor.h index 974501ecb..5fa20e15d 100644 --- a/runtime/onert/core/include/backend/cpu_common/Tensor.h +++ b/runtime/onert/core/include/backend/cpu_common/Tensor.h @@ -29,16 +29,19 @@ namespace backend namespace cpu_common { +class DynamicMemoryManager; + class Tensor : public IPortableTensor { public: Tensor() = delete; + virtual ~Tensor(); public: Tensor(const ir::OperandInfo &info, const ir::Layout layout, - IDynamicTensorManager *dynamic_tensor_manager) - : _info(info), _layout(layout), _buffer(nullptr), _num_references(0), - _dynamic_tensor_manager(dynamic_tensor_manager), _allocator(nullptr) + DynamicMemoryManager *dynamic_mem_mgr) + : IPortableTensor(info), _layout(layout), _buffer(nullptr), _num_references(0), + 
_dynamic_mem_mgr(dynamic_mem_mgr), _allocator(nullptr) { // DO NOTHING } @@ -94,7 +97,7 @@ public: * W : dimension(2) * C : dimension(3) */ - size_t dimension(size_t index) const override { return _info.shape().dim(index); } + size_t dimension(size_t index) const final override { return _info.shape().dim(index); } size_t num_dimensions() const override { return _info.shape().rank(); } size_t total_size() const override { return _info.total_size(); } size_t calcOffset(const ir::Coordinates &coords) const override; @@ -105,10 +108,8 @@ public: bool is_constant() const override { return _info.isConstant(); } bool is_dynamic() const override { return _info.isDynamic(); } void set_dynamic() override { _info.setDynamic(); } - IDynamicTensorManager *dynamic_tensor_manager() override { return _dynamic_tensor_manager; } - bool is_sparse() const override { return _info.typeInfo().sparse(); } - virtual const uint16_t *w1_segments() const override { return _info.typeInfo().w1_segments(); } - virtual const uint16_t *w1_indices() const override { return _info.typeInfo().w1_indices(); } + bool applyShape(const ir::Shape &new_shape) override; + const ir::Sparsity *sparsity() const override { return _info.typeInfo().sparsity(); } virtual void increase_ref() { @@ -118,6 +119,7 @@ public: ++_num_references; } + virtual void decrease_ref() { assert(_buffer != nullptr || _allocator != nullptr); @@ -136,14 +138,34 @@ public: } } + /** + * @brief Reset reference count to zero and release data + */ + virtual void reset_ref() + { + assert(_buffer != nullptr || _allocator != nullptr); + assert(_num_references > 0); + _num_references = 0; + + // Only constant tensor has allocator pointer + if (_buffer != nullptr) + _buffer = nullptr; + else + { + _allocator->release(); + _allocator = nullptr; + } + } + + virtual int32_t num_references() { return _num_references; } + void setShape(const ir::Shape &new_shape) override; protected: - ir::OperandInfo _info; ir::Layout _layout; uint8_t *_buffer; int32_t _num_references; - IDynamicTensorManager *_dynamic_tensor_manager; + DynamicMemoryManager *_dynamic_mem_mgr; private: /** diff --git a/runtime/onert/core/include/compiler/StaticShapeInference.h b/runtime/onert/core/include/compiler/StaticShapeInference.h index b97cb5b7b..5af11074e 100644 --- a/runtime/onert/core/include/compiler/StaticShapeInference.h +++ b/runtime/onert/core/include/compiler/StaticShapeInference.h @@ -70,6 +70,8 @@ private: // TODO Define visitors for operations. List them in alphabetic order. 
void visit(const ir::operation::ArgMax &op) override; void visit(const ir::operation::BatchMatMul &op) override; + void visit(const ir::operation::BCQFullyConnected &op) override; + void visit(const ir::operation::BCQGather &op) override; void visit(const ir::operation::BinaryArithmetic &op) override; void visit(const ir::operation::BroadcastTo &op) override; void visit(const ir::operation::Comparison &op) override; @@ -85,6 +87,7 @@ private: void visit(const ir::operation::Gather &op) override; void visit(const ir::operation::If &op) override; void visit(const ir::operation::L2Normalization &op) override; + void visit(const ir::operation::LSTM &op) override; void visit(const ir::operation::MatrixBandPart &op) override; void visit(const ir::operation::OneHot &op) override; void visit(const ir::operation::Pack &op) override; diff --git a/runtime/onert/core/include/exec/DynamicShapeInference.h b/runtime/onert/core/include/exec/DynamicShapeInference.h index 6f6659659..4a86708d0 100644 --- a/runtime/onert/core/include/exec/DynamicShapeInference.h +++ b/runtime/onert/core/include/exec/DynamicShapeInference.h @@ -51,6 +51,8 @@ public: // Remove TODO when any op starting from the alphabet is added void visit(const ir::operation::ArgMax &op) override; void visit(const ir::operation::BatchMatMul &op) override; + void visit(const ir::operation::BCQFullyConnected &op) override; + void visit(const ir::operation::BCQGather &op) override; void visit(const ir::operation::BinaryArithmetic &op) override; void visit(const ir::operation::BroadcastTo &op) override; void visit(const ir::operation::Comparison &op) override; @@ -65,6 +67,7 @@ public: void visit(const ir::operation::FusedBatchNorm &op) override; void visit(const ir::operation::Gather &op) override; void visit(const ir::operation::L2Normalization &op) override; + void visit(const ir::operation::LSTM &op) override; void visit(const ir::operation::MatrixBandPart &op) override; void visit(const ir::operation::OneHot &op) override; void visit(const ir::operation::Pack &op) override; diff --git a/runtime/onert/core/include/exec/FunctionSequence.h b/runtime/onert/core/include/exec/FunctionSequence.h index 79a58ed00..49f00dba1 100644 --- a/runtime/onert/core/include/exec/FunctionSequence.h +++ b/runtime/onert/core/include/exec/FunctionSequence.h @@ -79,7 +79,6 @@ public: // methods related to dynamic tensor const ir::OpSequence *op_seq = nullptr; const ir::Operations *operations = nullptr; std::shared_ptr<exec::DynamicShapeInferer> dynamic_shape_inferer = nullptr; - std::shared_ptr<backend::ITensorRegistry> tensor_registry = nullptr; backend::IDynamicTensorManager *dynamic_tensor_manager = nullptr; }; @@ -104,14 +103,25 @@ public: // methods related to dynamic tensor */ void enableDynamicShapeInferer(bool enable) { - _enable_dynamic_shape_inferer = _enable_dynamic_shape_inferer && enable; + _enable_dynamic_shape_inferer = _enable_dynamic_shape_inferer || enable; } + /** + * @brief Call this function to initialize vars before running + * @note When we run a model with static tensor input and then run with dynamic tensor input, + * _enable_dynamic_shape_inferer is set to @c false at first run. + * Once _enable_dynamic_shape_inferer is set to @c true it cannot be changed to @c false + * only with calling enableDynamicShapeInferer(). So initializing it to @c false is + * necessary. + * @todo This is a quick fix. Adding this will increase time for run(). Find way to optimize. 
+ */ + void initRunning() { _enable_dynamic_shape_inferer = false; } + protected: std::vector<std::unique_ptr<IFunction>> _functions; protected: - bool _enable_dynamic_shape_inferer = true; + bool _enable_dynamic_shape_inferer = false; std::shared_ptr<DynamicTensorCtx> _dynamic_tensor_ctx = nullptr; }; diff --git a/runtime/onert/core/include/exec/IExecutor.h b/runtime/onert/core/include/exec/IExecutor.h index 6c8bab67c..1d2831dd0 100644 --- a/runtime/onert/core/include/exec/IExecutor.h +++ b/runtime/onert/core/include/exec/IExecutor.h @@ -69,21 +69,6 @@ struct IExecutor using ExecutorMap = std::unordered_map<ir::SubgraphIndex, std::unique_ptr<IExecutor>>; -// TODO Move this structure to suitable place -/** - * @brief Dynamic allocation info for input tensors - * When user sets shape of input having unknown dims after compilation, memory for the input - * should be allocated before executing kernels. This struct contains information to allocate - * memory. - */ -struct DynAllocInfo -{ - /// @brief index of input tensor whose memory needs to be allocated at execution time - ir::OperandIndex ind; -}; - -using DynAllocInfoMap = std::unordered_map<std::shared_ptr<backend::ITensor>, DynAllocInfo>; - } // namespace exec } // namespace onert diff --git a/runtime/onert/core/include/ir/Operand.h b/runtime/onert/core/include/ir/Operand.h index 1b3a43b02..f149a744b 100644 --- a/runtime/onert/core/include/ir/Operand.h +++ b/runtime/onert/core/include/ir/Operand.h @@ -40,6 +40,7 @@ public: { // DO NOTHING } + explicit Operand(const Operand &) = default; public: const Shape &shape(void) const { return _info.shape(); } diff --git a/runtime/onert/core/include/ir/OperandIndexSequence.h b/runtime/onert/core/include/ir/OperandIndexSequence.h index aa01eccaa..2f78cc832 100644 --- a/runtime/onert/core/include/ir/OperandIndexSequence.h +++ b/runtime/onert/core/include/ir/OperandIndexSequence.h @@ -82,6 +82,8 @@ public: public: std::vector<OperandIndex>::const_iterator begin(void) const { return _vec.begin(); } std::vector<OperandIndex>::const_iterator end(void) const { return _vec.end(); } + std::vector<OperandIndex>::iterator begin(void) { return _vec.begin(); } + std::vector<OperandIndex>::iterator end(void) { return _vec.end(); } private: std::vector<OperandIndex> _vec; diff --git a/runtime/onert/core/include/ir/OperandInfo.h b/runtime/onert/core/include/ir/OperandInfo.h index b8e123027..67aeb0e65 100644 --- a/runtime/onert/core/include/ir/OperandInfo.h +++ b/runtime/onert/core/include/ir/OperandInfo.h @@ -117,6 +117,7 @@ public: MemAllocType memAllocType() const { return _alloc_type; } void setAsConstant() { _const = true; } + void setAsNonConst() { _const = false; } bool isConstant() const { // Impossible case: constant and dynamic operand diff --git a/runtime/onert/core/include/ir/Operation.h b/runtime/onert/core/include/ir/Operation.h index 818bd913b..89f7e340d 100644 --- a/runtime/onert/core/include/ir/Operation.h +++ b/runtime/onert/core/include/ir/Operation.h @@ -34,9 +34,12 @@ struct OperationVisitor; class Operation { public: + // TODO Remove default parameter Operation(OperandConstraint input_constr, const OperandIndexSequence &inputs, - const OperandIndexSequence &outputs); - explicit Operation(OperandConstraint input_constr); + const OperandIndexSequence &outputs, + OperandConstraint output_constr = OperandConstraint::createAny()); + explicit Operation(OperandConstraint input_constr, + OperandConstraint output_constr = OperandConstraint::createAny()); Operation(const Operation &) = default; 
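A minimal caller-side sketch of the FunctionSequence change above (not part of this commit; the helper name is illustrative and it assumes run() is the inherited IFunction entry point): because enableDynamicShapeInferer() now OR-latches the flag, initRunning() must reset it before every execution.

#include "exec/FunctionSequence.h"

// Hypothetical caller: reset the latch, then enable dynamic shape inference
// only for this execution when the current inputs are dynamic.
void runFunctionSequence(onert::exec::FunctionSequence &fn_seq, bool inputs_are_dynamic)
{
  fn_seq.initRunning();                                 // reset _enable_dynamic_shape_inferer to false
  fn_seq.enableDynamicShapeInferer(inputs_are_dynamic); // latches to true only when needed
  fn_seq.run();                                         // assumed IFunction::run() entry point
}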
Operation(Operation &&) = default; @@ -62,6 +65,7 @@ public: private: OperandConstraint _input_constr; + OperandConstraint _output_constr; OperandIndexSequence _inputs; OperandIndexSequence _outputs; }; diff --git a/runtime/onert/core/include/ir/Sparsity.h b/runtime/onert/core/include/ir/Sparsity.h new file mode 100644 index 000000000..ad4d8259b --- /dev/null +++ b/runtime/onert/core/include/ir/Sparsity.h @@ -0,0 +1,64 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. +*/ + +#ifndef __ONERT_IR_SPARSITY_H__ +#define __ONERT_IR_SPARSITY_H__ + +#include <cassert> +#include <cstdint> +#include <vector> + +namespace onert +{ +namespace ir +{ + +/** + * @brief Structure for Sparse Tensor + */ +struct Sparsity +{ +public: + Sparsity() = default; + Sparsity(std::vector<uint16_t> &&w1_segments, std::vector<uint16_t> &&w1_indices, + std::vector<int32_t> &&block_size) + : _w1_segments(w1_segments), _w1_indices(w1_indices), _block_size(block_size) + { + } + + /** + * @brief Returns segments array. See compressed sparse row format. + */ + const uint16_t *w1_segments() const { return _w1_segments.data(); } + /** + * @brief Returns indices array. See compressed sparse row format. 
+ */ + const uint16_t *w1_indices() const { return _w1_indices.data(); } + /** + * @brief Returns block size which is used for block sparsity + */ + const std::vector<int32_t> &block_size() const { return _block_size; } + +private: + std::vector<uint16_t> _w1_segments; + std::vector<uint16_t> _w1_indices; + std::vector<int32_t> _block_size; +}; + +} // namespace ir +} // namespace onert + +#endif // __ONERT_IR_SPARSITY_H__ diff --git a/runtime/onert/core/include/ir/TypeInfo.h b/runtime/onert/core/include/ir/TypeInfo.h index 3f7eab4c0..a1ae4d2e4 100644 --- a/runtime/onert/core/include/ir/TypeInfo.h +++ b/runtime/onert/core/include/ir/TypeInfo.h @@ -18,9 +18,11 @@ #define __ONERT_IR_TYPEINFO_H__ #include <cstdint> +#include <memory> #include <vector> #include "ir/DataType.h" +#include "ir/Sparsity.h" namespace onert { @@ -33,7 +35,7 @@ public: TypeInfo() = delete; explicit TypeInfo(DataType type, float scale = 0, int32_t offset = 0) - : _type(type), _scale(scale), _offset(offset), _sparse(false) + : _type(type), _scale(scale), _offset(offset), _sparsity(nullptr) { } @@ -41,18 +43,11 @@ public: DataType type() const { return _type; } float scale() const { return _scale; } int32_t offset() const { return _offset; } - bool sparse() const { return _sparse; } - const uint16_t *w1_segments() const { return _w1_segments.data(); } - const uint16_t *w1_indices() const { return _w1_indices.data(); } + const ir::Sparsity *sparsity() const { return _sparsity.get(); } + void sparsity(std::shared_ptr<ir::Sparsity> sparsity) { _sparsity = sparsity; } public: void type(const DataType type) { _type = type; } - void sparse2DMetadata(std::vector<uint16_t> &&w1_segments, std::vector<uint16_t> &&w1_indices) - { - _sparse = true; - _w1_segments = w1_segments; - _w1_indices = w1_indices; - } private: DataType _type; @@ -60,9 +55,7 @@ private: float _scale; int32_t _offset; // for sparsity - bool _sparse; - std::vector<uint16_t> _w1_segments; - std::vector<uint16_t> _w1_indices; + std::shared_ptr<ir::Sparsity> _sparsity; }; bool operator==(const TypeInfo &lhs, const TypeInfo &rhs); diff --git a/runtime/onert/core/include/ir/operation/ArgMax.h b/runtime/onert/core/include/ir/operation/ArgMax.h index 8400e1f1e..ea7eabb83 100644 --- a/runtime/onert/core/include/ir/operation/ArgMax.h +++ b/runtime/onert/core/include/ir/operation/ArgMax.h @@ -31,12 +31,12 @@ class ArgMax : public Operation public: enum Input { - INPUT + INPUT = 0, + AXIS = 1 }; struct Param { - int axis; DataType output_type; }; diff --git a/runtime/onert/core/include/ir/operation/LSTM.h b/runtime/onert/core/include/ir/operation/LSTM.h index 1e6c00bf3..027bc6b42 100644 --- a/runtime/onert/core/include/ir/operation/LSTM.h +++ b/runtime/onert/core/include/ir/operation/LSTM.h @@ -26,6 +26,7 @@ namespace ir namespace operation { +// This operation supports only unidirectional sequence lstm class LSTM : public Operation { public: @@ -51,6 +52,10 @@ public: PROJECTION_BIAS = 17, OUTPUT_STATE_IN = 18, CELL_STATE_IN = 19, + INPUT_LAYER_NORMALIZATION_WEIGHTS = 20, + FORGET_LAYER_NORMALIZATION_WEIGHTS = 21, + CELL_LAYER_NORMALIZATION_WEIGHTS = 22, + OUTPUT_LAYER_NORMALIZATION_WEIGHTS = 23, }; enum Output @@ -66,6 +71,7 @@ public: Activation activation; float cell_threshold; float projection_threshold; + bool time_major; }; public: @@ -73,6 +79,7 @@ public: public: void accept(OperationVisitor &v) const override; + std::string name() const override; OpCode opcode() const final { return OpCode::LSTM; } public: diff --git 
a/runtime/onert/core/include/ir/operation/ResizeBilinear.h b/runtime/onert/core/include/ir/operation/ResizeBilinear.h index 29aa496d7..ab330c826 100644 --- a/runtime/onert/core/include/ir/operation/ResizeBilinear.h +++ b/runtime/onert/core/include/ir/operation/ResizeBilinear.h @@ -34,10 +34,12 @@ public: enum Input { INPUT = 0, + SIZE = 1, }; struct Param { + // If the input SIZE exists in inputs, height_out and width_out are not set. Ignore these params int32_t height_out; int32_t width_out; bool align_corners; diff --git a/runtime/onert/core/include/ir/operation/ResizeNearestNeighbor.h b/runtime/onert/core/include/ir/operation/ResizeNearestNeighbor.h index e4d810eeb..10827803e 100644 --- a/runtime/onert/core/include/ir/operation/ResizeNearestNeighbor.h +++ b/runtime/onert/core/include/ir/operation/ResizeNearestNeighbor.h @@ -34,10 +34,12 @@ public: enum Input { INPUT = 0, + SIZE = 1, }; struct Param { + // If the input SIZE exists in inputs, height_out and width_out are not set. Ignore these params int32_t height_out; int32_t width_out; bool align_corners; diff --git a/runtime/onert/core/include/ir/operation/Split.h b/runtime/onert/core/include/ir/operation/Split.h index 60e0fdf15..c415941a4 100644 --- a/runtime/onert/core/include/ir/operation/Split.h +++ b/runtime/onert/core/include/ir/operation/Split.h @@ -29,12 +29,12 @@ class Split : public Operation { public: enum Input { - INPUT = 0 + AXIS = 0, + INPUT = 1, }; struct Param { - int axis; int num_splits; }; diff --git a/runtime/onert/core/include/ir/operation/Transpose.h b/runtime/onert/core/include/ir/operation/Transpose.h index 9631f7aaa..665c9bbce 100644 --- a/runtime/onert/core/include/ir/operation/Transpose.h +++ b/runtime/onert/core/include/ir/operation/Transpose.h @@ -34,26 +34,15 @@ public: enum Input { INPUT = 0, // for an n-D tensor, specifying the tensor to be transposed.
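A small illustration of the ir::Sparsity structure introduced above, assuming the usual compressed sparse row convention (w1_segments holds per-row offsets into w1_indices, which holds the column of each nonzero). This is a sketch, not code from this commit; the factory function is hypothetical.

#include "ir/Sparsity.h"

#include <cstdint>
#include <memory>
#include <vector>

// Sketch: a 3x4 weight matrix with nonzeros at (0,1), (1,0) and (1,3).
// Row r owns entries [w1_segments[r], w1_segments[r+1]) of w1_indices.
std::shared_ptr<onert::ir::Sparsity> makeExampleSparsity()
{
  std::vector<uint16_t> segments{0, 1, 3, 3}; // 3 rows -> 4 offsets
  std::vector<uint16_t> indices{1, 0, 3};     // column index of each nonzero
  std::vector<int32_t> block_size{};          // empty: no block sparsity
  return std::make_shared<onert::ir::Sparsity>(std::move(segments), std::move(indices),
                                               std::move(block_size));
}

The result can then be attached to an operand's TypeInfo through the new sparsity(std::shared_ptr<ir::Sparsity>) setter shown in the TypeInfo.h hunk above.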
- }; - - struct Param - { - std::vector<int> perm; + PERMUTATION = 1, }; public: - Transpose(const OperandIndexSequence &inputs, const OperandIndexSequence &outputs, - const Param ¶m); + Transpose(const OperandIndexSequence &inputs, const OperandIndexSequence &outputs); public: void accept(OperationVisitor &v) const override; OpCode opcode() const final { return OpCode::Transpose; } - -public: - const Param ¶m() const { return _param; } - -private: - Param _param; }; } // namespace operation diff --git a/runtime/onert/core/include/util/Config.lst b/runtime/onert/core/include/util/Config.lst index 5077fad69..30f211011 100644 --- a/runtime/onert/core/include/util/Config.lst +++ b/runtime/onert/core/include/util/Config.lst @@ -35,6 +35,7 @@ CONFIG(OP_SEQ_MAX_NODE , int , "0") CONFIG(TRACE_FILEPATH , std::string , "") CONFIG(FP16_ENABLE , bool , "0") CONFIG(RUY_THREADS , int , "-1") +CONFIG(USE_MMAPED_DATA , bool , "0") // Auto-generate all operations diff --git a/runtime/onert/core/include/util/ShapeInference.h b/runtime/onert/core/include/util/ShapeInference.h index 1ebed48f2..701b835d2 100644 --- a/runtime/onert/core/include/util/ShapeInference.h +++ b/runtime/onert/core/include/util/ShapeInference.h @@ -47,7 +47,14 @@ ir::Shape inferArgMaxShape(const ir::Shape &input_shape, int axis, int rank); ir::Shape inferBatchMatMulShape(const ir::Shape &lhs_shape, const ir::Shape &rhs_shape, const ir::operation::BatchMatMul::Param ¶m); -ir::Shape inferBroadcastToShape(const ir::Shape wshape, const int32_t *shape_buffer); +ir::Shape inferBCQFullyConnectedShape(const ir::Shape &in_shape, const ir::Shape &cluster_shape, + const int32_t *cluster_buf); + +ir::Shape inferBCQGatherShape(const ir::Shape &indices_shape, const ir::Shape &cluster_shape, + const int32_t *cluster_buf, int rank, + const ir::operation::BCQGather::Param ¶m); + +ir::Shape inferBroadcastToShape(const ir::Shape shp_shape, const int32_t *shp_buf); ir::Shape inferConcatShape(const Shapes &in_shapes, const ir::operation::Concat::Param ¶m); @@ -63,7 +70,7 @@ ir::Shape inferEltwiseShape(const ir::Shape &lhs_shape, const ir::Shape &rhs_sha ir::Shape inferExpandDimsShape(const ir::Shape &in_shape, int32_t axis); -ir::Shape inferFillShape(const ir::Shape &in_shape, const int32_t *buf); +ir::Shape inferFillShape(const ir::Shape &in_shape, const int32_t *in_buf); ir::Shape inferFullyConnectedShape(const ir::Shape &in_shape, const ir::Shape &ker_shape); @@ -97,12 +104,12 @@ ir::Shape inferResizeBilinearShape(const ir::Shape &in_shape, const int32_t outp ir::Shape inferSelectShape(const ir::Shape &input_cond_shape, const ir::Shape &input_true_shape, const ir::Shape &input_false_shape); -ir::Shape inferSliceShape(const ir::Shape &input_shape, const int32_t *begins, - const int32_t *sizes); +ir::Shape inferSliceShape(const ir::Shape &input_shape, const int32_t *begins_buf, + const int32_t *sizes_buf); ir::Shape inferSpaceToBatchNDShape(const ir::Shape &input_shape, const ir::Shape &block_shape_shape, - const ir::Shape &padding_shape, const int32_t *block_shape_data, - const int32_t *padding_data); + const ir::Shape &padding_shape, const int32_t *block_shape_buf, + const int32_t *padding_buf); ir::Shape inferSplitShape(const ir::Shape input_shape, int axis_value, int num_splits); @@ -132,9 +139,11 @@ StridedSliceParams buildStridedSliceParams(const T *begin, const T *end, const T ir::Shape inferStridedSliceShape(const ir::Shape &input_shape, const StridedSliceParams &op_params, uint32_t rank); -ir::Shape inferTileShape(const ir::Shape &in_shape, 
const int32_t *multiplier); +ir::Shape inferTileShape(const ir::Shape &in_shape, const int32_t *multiplier_buf, + const int32_t multiplier_size); -ir::Shape inferTransposeShape(const ir::Shape &in_shape, const std::vector<int> &perm); +ir::Shape inferTransposeShape(const ir::Shape &in_shape, const int32_t *perm_buf, + const int32_t rank); ir::Shape inferUnpackShape(const ir::Shape &input_shape, int axis, int rank); diff --git a/runtime/onert/core/include/util/Utils.h b/runtime/onert/core/include/util/Utils.h index 847fb6971..8a4eea32b 100644 --- a/runtime/onert/core/include/util/Utils.h +++ b/runtime/onert/core/include/util/Utils.h @@ -22,6 +22,87 @@ #ifndef __ONERT_UTIL_UTILS_H__ #define __ONERT_UTIL_UTILS_H__ +#include "ir/Coordinates.h" +#include "ir/Shape.h" + #define UNUSED_RELEASE(a) (void)(a) +template <size_t from, size_t to, typename Enable = void> struct ForEachDimension +{ + template <typename L, typename... Args> + static void unroll(const onert::ir::Shape &shape, onert::ir::Coordinates &coords, + L &&lambda_function, Args &&... args) + { + static_assert(from < to, "from must not be less than to"); + assert(static_cast<int>(to) <= shape.rank()); + const auto &d = shape.dim(from); + + for (auto v = 0; v < d; v++) + { + coords.set(from, v); + ForEachDimension<from + 1, to>::unroll(shape, coords, std::forward<L>(lambda_function), + std::forward<Args>(args)...); + } + } +}; + +template <size_t from, size_t to> +struct ForEachDimension<from, to, typename std::enable_if<from == to>::type> +{ + template <typename L, typename... Args> + static void unroll(const onert::ir::Shape &shape, onert::ir::Coordinates &coords, + L &&lambda_function, Args &&... args) + { + UNUSED_RELEASE(shape); + assert(static_cast<int>(to) <= shape.rank()); + lambda_function(coords, std::forward<Args>(args)...); + } +}; + +template <typename L, typename... Args> +inline void ShapeLoop(const onert::ir::Shape &shape, L &&lambda_function, Args &&... args) +{ + assert(shape.rank() > 0); + for (auto i = 0; i < shape.rank(); ++i) + { + assert(shape.dim(i) > 0); + } + + onert::ir::Coordinates coords; + switch (shape.rank()) + { + case 0: + coords.set(0, 0); + ForEachDimension<0, 0>::unroll(shape, coords, std::forward<L>(lambda_function), + std::forward<Args>(args)...); + break; + case 1: + ForEachDimension<0, 1>::unroll(shape, coords, std::forward<L>(lambda_function), + std::forward<Args>(args)...); + break; + case 2: + ForEachDimension<0, 2>::unroll(shape, coords, std::forward<L>(lambda_function), + std::forward<Args>(args)...); + break; + case 3: + ForEachDimension<0, 3>::unroll(shape, coords, std::forward<L>(lambda_function), + std::forward<Args>(args)...); + break; + case 4: + ForEachDimension<0, 4>::unroll(shape, coords, std::forward<L>(lambda_function), + std::forward<Args>(args)...); + break; + case 5: + ForEachDimension<0, 5>::unroll(shape, coords, std::forward<L>(lambda_function), + std::forward<Args>(args)...); + break; + case 6: + ForEachDimension<0, 6>::unroll(shape, coords, std::forward<L>(lambda_function), + std::forward<Args>(args)...); + break; + default: + assert(false && "ShapeLoop, 1 <= Shape'rank <= 6"); + break; + } +} #endif // __ONERT_UTIL_UTILS_H__ diff --git a/runtime/libs/ndarray/src/Array.cpp b/runtime/onert/core/src/backend/IPortableTensor.cc index f9c9de9d3..cec34e780 100644 --- a/runtime/libs/ndarray/src/Array.cpp +++ b/runtime/onert/core/src/backend/IPortableTensor.cc @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 Samsung Electronics Co., Ltd. 
All Rights Reserved + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,14 +14,16 @@ * limitations under the License. */ -#include "ndarray/Array.h" +#include "backend/IPortableTensor.h" -namespace ndarray +namespace onert +{ +namespace backend { -template class Array<float>; -template class Array<int32_t>; -template class Array<uint32_t>; -template class Array<uint8_t>; +// `dynamic_cast` not working across library boundaries on NDK +// With this as a key function, `dynamic_cast` works across dl +IPortableTensor::~IPortableTensor() {} -} // namespace ndarray +} // namespace backend +} // namespace onert diff --git a/runtime/onert/core/src/backend/controlflow/BackendContext.h b/runtime/onert/core/src/backend/controlflow/BackendContext.h new file mode 100644 index 000000000..d179bfde4 --- /dev/null +++ b/runtime/onert/core/src/backend/controlflow/BackendContext.h @@ -0,0 +1,60 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __ONERT_BACKEND_CPU_BACKEND_CONTEXT_H__ +#define __ONERT_BACKEND_CPU_BACKEND_CONTEXT_H__ + +#include <backend/BackendContext.h> +#include "ExternalContext.h" + +namespace onert +{ +namespace backend +{ +namespace controlflow +{ + +class BackendContext : public onert::backend::BackendContext +{ +public: + BackendContext(const Backend *backend, const ir::Graph *graph, + std::shared_ptr<ITensorRegistry> tensor_registry = nullptr, + std::shared_ptr<ITensorBuilder> tensor_builder = nullptr, + std::shared_ptr<IConstantInitializer> constant_initializer = nullptr, + std::shared_ptr<IKernelGenerator> kernel_gen = nullptr, + std::shared_ptr<ITensorRegister> tensor_register = nullptr, + std::shared_ptr<IOptimizer> optimizer = nullptr) + : onert::backend::BackendContext(backend, graph, tensor_registry, tensor_builder, + constant_initializer, kernel_gen, tensor_register, + optimizer), + _external_context(std::make_shared<ExternalContext>()) + { + } + + std::shared_ptr<ExternalContext> external_context() { return _external_context; } + +private: + // NOTE ruy context has a thread pool, and when multiple ruy contexts are created, + // the thread pool is also created in duplicate + // TODO Create one ruy context for session + std::shared_ptr<ExternalContext> _external_context; +}; + +} // namespace controlflow +} // namespace backend +} // namespace onert + +#endif // __ONERT_BACKEND_CPU_BACKEND_CONTEXT_H__ diff --git a/runtime/onert/core/src/backend/controlflow/DynamicTensorManager.cc b/runtime/onert/core/src/backend/controlflow/DynamicTensorManager.cc index 1288e4c96..77f02969d 100644 --- a/runtime/onert/core/src/backend/controlflow/DynamicTensorManager.cc +++ b/runtime/onert/core/src/backend/controlflow/DynamicTensorManager.cc @@ -17,8 +17,7 @@ #include "DynamicTensorManager.h" #include "util/logging.h" -#include 
"util/Exceptions.h" -#include "ir/DataType.h" +#include "misc/polymorphic_downcast.h" namespace onert { @@ -33,82 +32,18 @@ DynamicTensorManager::DynamicTensorManager(const std::shared_ptr<TensorRegistry> // DO NOTHING } -void DynamicTensorManager::applyShape(const ir::OperandIndex &ind, const ir::Shape &new_shape) -{ - // NOTE Handle user tensors first - auto user_tensor = _tensors->getNativeUserTensor(ind); - if (user_tensor) - { - // User tensors cannot be reallocated. - auto buffer_size = user_tensor->total_size(); - auto new_size = new_shape.num_elements() * sizeOfDataType(user_tensor->data_type()); - if (buffer_size < new_size) - throw InsufficientBufferSizeException{"Output buffer size is less than output tensor size"}; - user_tensor->setShape(new_shape); - return; - } - - // NOTE Then handle own tensors - auto tensor = _tensors->getNativeOwnTensor(ind); - assert(tensor); - - bool previously_dynamic = tensor->is_dynamic(); - - auto allocTensorMem = [&](bool overwrite = false) { - auto capacity = tensor->total_size(); - auto alloc = _dynamic_mem_mgr->allocate(ind, capacity); - - if (overwrite) - tensor->overwriteBuffer(alloc); - else - tensor->setBuffer(alloc); - }; - - if (!previously_dynamic) - { - // TODO deallocate tensor->buffer() - // issue is that staticTensorManager might have allocate this memory - tensor->setShape(new_shape); - tensor->set_dynamic(); - allocTensorMem(true); - } - else if (tensor->buffer() == nullptr) - { - tensor->setShape(new_shape); - tensor->set_dynamic(); - allocTensorMem(); - } - // when buffer was already allocated and new_shape requires different size - else - { - auto previous_size = tensor->total_size(); - auto new_size = new_shape.num_elements() * sizeOfDataType(tensor->data_type()); - if (previous_size != new_size) - { - _dynamic_mem_mgr->deallocate(ind); - - tensor->setShape(new_shape); - tensor->set_dynamic(); - allocTensorMem(true); - } - else - { // when buffer with same size was already allocated, shape could differ - tensor->setShape(new_shape); - } - } -} - void DynamicTensorManager::buildTensor(const ir::OperandIndex &ind, const ir::OperandInfo &tensor_info, ir::Layout backend_layout) { - auto tensor = std::make_shared<cpu_common::Tensor>(tensor_info, backend_layout, this); - _tensors->setNativeOwnTensor(ind, tensor); + auto tensor = + std::make_unique<cpu_common::Tensor>(tensor_info, backend_layout, _dynamic_mem_mgr.get()); + _tensors->setNativeOwnTensor(ind, std::move(tensor)); } -void DynamicTensorManager::planDealloc(ir::OperationIndex op_ind, ir::OperandIndex operand_ind) +void DynamicTensorManager::planDealloc(ir::OperationIndex op_ind, backend::ITensor *tensor) { - _dealloc_tensor_map[op_ind].emplace(operand_ind); + _dealloc_tensor_map[op_ind].emplace(tensor); } void DynamicTensorManager::deallocInput(ir::OperationIndex op_ind) @@ -118,25 +53,26 @@ void DynamicTensorManager::deallocInput(ir::OperationIndex op_ind) return; auto &input_set = find->second; - for (auto input_ind : input_set) + for (auto *tensor : input_set) { - if (!_tensors->getNativeTensor(input_ind)->is_dynamic()) + if (!tensor->is_dynamic()) continue; - _dynamic_mem_mgr->deallocate(input_ind); - VERBOSE(DynamicTensorManager) << "Deallocating #" << input_ind.value() + _dynamic_mem_mgr->deallocate(tensor); + + auto *cpu_tensor = nnfw::misc::polymorphic_downcast<cpu_common::Tensor *>(tensor); + cpu_tensor->resetBuffer(); + + VERBOSE(DynamicTensorManager) << "Deallocating a tensor " << (void *)tensor << " (input of op_ind: " << op_ind.value() << ")" << std::endl; } } 
-void DynamicTensorManager::deallocSubgraphOutput(ir::OperandIndex output_ind) +const ITensor *DynamicTensorManager::getRawITensor(ir::OperandIndex ind) { - if (!_tensors->getNativeTensor(output_ind)->is_dynamic()) - return; - - _dynamic_mem_mgr->deallocate(output_ind); - VERBOSE(DynamicTensorManager) << "Deallocating #" << output_ind.value() - << " (output of a subgraph)" << std::endl; + auto ptr = _tensors->getITensor(ind); + assert(ptr); + return ptr; } } // namespace controlflow diff --git a/runtime/onert/core/src/backend/controlflow/DynamicTensorManager.h b/runtime/onert/core/src/backend/controlflow/DynamicTensorManager.h index dbe388ba2..fb822a917 100644 --- a/runtime/onert/core/src/backend/controlflow/DynamicTensorManager.h +++ b/runtime/onert/core/src/backend/controlflow/DynamicTensorManager.h @@ -43,14 +43,16 @@ public: virtual ~DynamicTensorManager() = default; - void applyShape(const ir::OperandIndex &ind, const ir::Shape &new_shape) override; - void buildTensor(const ir::OperandIndex &ind, const ir::OperandInfo &tensor_info, ir::Layout backend_layout); - void planDealloc(ir::OperationIndex op_ind, ir::OperandIndex operand_ind) override; + void planDealloc(ir::OperationIndex op_ind, backend::ITensor *tensor) override; void deallocInput(ir::OperationIndex op_ind) override; - void deallocSubgraphOutput(ir::OperandIndex ind) override; + + std::shared_ptr<cpu_common::DynamicMemoryManager> dynamic_mem_mgr() { return _dynamic_mem_mgr; } + +private: + const ITensor *getRawITensor(ir::OperandIndex ind); private: /** @@ -60,9 +62,10 @@ private: std::shared_ptr<cpu_common::DynamicMemoryManager> _dynamic_mem_mgr; const std::shared_ptr<TensorRegistry> _tensors; - // contains list of dynamic tensor index, which can be deallocated after running operation - // note: this map could contain static tensor index too. Careful use is required. - std::unordered_map<ir::OperationIndex, std::unordered_set<ir::OperandIndex>> _dealloc_tensor_map; + // contains list of dynamic tensor, which can be deallocated after running operation + // note: this map could contain static tensor too. Careful use is required. + std::unordered_map<ir::OperationIndex, std::unordered_set<backend::ITensor *>> + _dealloc_tensor_map; }; } // namespace controlflow diff --git a/runtime/onert/core/src/backend/controlflow/ExternalContext.h b/runtime/onert/core/src/backend/controlflow/ExternalContext.h new file mode 100644 index 000000000..58bccb6c6 --- /dev/null +++ b/runtime/onert/core/src/backend/controlflow/ExternalContext.h @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef __ONERT_BACKEND_CONTROLFLOW_EXTERNAL_CONTEXT_H__ +#define __ONERT_BACKEND_CONTROLFLOW_EXTERNAL_CONTEXT_H__ + +#include <backend/IExternalContext.h> +#include <util/ConfigSource.h> +#include <ruy/context.h> + +namespace +{ +const int kDefaultNumThreadpoolThreads = 1; +} + +namespace onert +{ +namespace backend +{ +namespace controlflow +{ + +// TODO Unify this with cpu::ExternalContext +class ExternalContext : public IExternalContext +{ +public: + ExternalContext() : _ruy_context(nullptr) + { + // setMaxNumThreads(onert::util::getConfigInt(onert::util::config::RUY_THREADS)); + } + + void setMaxNumThreads(int max_num_threads) + { + const int target_num_threads = + max_num_threads > -1 ? max_num_threads : kDefaultNumThreadpoolThreads; + _ruy_context->set_max_num_threads(target_num_threads); + } + + ruy::Context *ruy_context() const { return _ruy_context.get(); } + +private: + const std::unique_ptr<ruy::Context> _ruy_context; +}; + +} // namespace controlflow +} // namespace backend +} // namespace onert + +#endif // __ONERT_BACKEND_CONTROLFLOW_EXTERNAL_CONTEXT_H__ diff --git a/runtime/onert/core/src/backend/controlflow/KernelGenerator.cc b/runtime/onert/core/src/backend/controlflow/KernelGenerator.cc index de5a6a5f6..d76ca53e3 100644 --- a/runtime/onert/core/src/backend/controlflow/KernelGenerator.cc +++ b/runtime/onert/core/src/backend/controlflow/KernelGenerator.cc @@ -58,12 +58,10 @@ void KernelGenerator::visit(const ir::OpSequence &op_seq) dyn_ctx->op_seq = &op_seq; dyn_ctx->operations = &_graph.operations(); dyn_ctx->dynamic_shape_inferer = std::move(dyn_shape_inferer); - dyn_ctx->tensor_registry = _tensor_reg; dyn_ctx->dynamic_tensor_manager = _dyn_tensor_manager; _return_fn_seq->dynamic_tensor_ctx(dyn_ctx); } - _return_fn_seq->enableDynamicShapeInferer(true); for (const auto &op_idx : op_seq.operations()) { @@ -78,7 +76,7 @@ void KernelGenerator::visit(const ir::operation::If &node) const auto then_subg_index = node.param().then_subg_index; const auto else_subg_index = node.param().else_subg_index; - std::vector<std::shared_ptr<backend::ITensor>> input_tensors; + std::vector<backend::ITensor *> input_tensors; for (const auto input_index : node.getInputs()) { auto input_tensor = getTensor(input_index); @@ -86,14 +84,11 @@ void KernelGenerator::visit(const ir::operation::If &node) input_tensors.emplace_back(input_tensor); } - std::vector<std::shared_ptr<backend::ITensor>> output_tensors; - exec::DynAllocInfoMap outputs_dyn_alloc_info; + std::vector<backend::ITensor *> output_tensors; for (const auto output_index : node.getOutputs()) { auto output_tensor = getTensor(output_index); - output_tensors.emplace_back(output_tensor); - outputs_dyn_alloc_info[output_tensor] = exec::DynAllocInfo{output_index}; } // IfLayer just set ExecutorMap instead of then and else executor to avoid complexity of @@ -101,8 +96,8 @@ void KernelGenerator::visit(const ir::operation::If &node) const auto cond_tensor = input_tensors.front(); input_tensors.erase(input_tensors.begin()); auto fn = std::make_unique<::onert::backend::controlflow::kernel::IfLayer>( - cond_tensor, input_tensors, output_tensors, node.getOutputs(), _graph, outputs_dyn_alloc_info, - then_subg_index, else_subg_index, _executor_map); + cond_tensor, input_tensors, output_tensors, node.getOutputs(), _graph, then_subg_index, + else_subg_index, _executor_map); _return_fn = std::move(fn); } @@ -113,14 +108,10 @@ void KernelGenerator::visit(const ir::operation::Permute &node) const auto input_index{node.getInputs().at(0)}; // Add 
PermuteLayer - std::vector<std::shared_ptr<ITensor>> output_tensors{getTensor(output_index)}; - std::vector<std::shared_ptr<ITensor>> input_tensors{getTensor(input_index)}; - std::unordered_map<std::shared_ptr<ITensor>, exec::DynAllocInfo> outputs_dyn_alloc_info; - outputs_dyn_alloc_info[output_tensors.at(0)] = exec::DynAllocInfo{output_index}; - - auto fn = - std::make_unique<kernel::PermuteLayer>(input_tensors, output_tensors, outputs_dyn_alloc_info); + std::vector<ITensor *> output_tensors{getTensor(output_index)}; + std::vector<ITensor *> input_tensors{getTensor(input_index)}; + auto fn = std::make_unique<kernel::PermuteLayer>(input_tensors, output_tensors); _return_fn = std::move(fn); } @@ -131,7 +122,7 @@ void KernelGenerator::visit(const ir::operation::While &node) // This op does not support input as a constant, because controlflow backend does not have // TensorBuilder - std::vector<std::shared_ptr<backend::ITensor>> input_tensors; + std::vector<backend::ITensor *> input_tensors; for (const auto input_index : node.getInputs()) { auto input_tensor = getTensor(input_index); @@ -139,29 +130,25 @@ void KernelGenerator::visit(const ir::operation::While &node) input_tensors.emplace_back(input_tensor); } - std::vector<std::shared_ptr<backend::ITensor>> output_tensors; - std::unordered_map<std::shared_ptr<ITensor>, exec::DynAllocInfo> outputs_dyn_alloc_info; + std::vector<backend::ITensor *> output_tensors; for (const auto output_index : node.getOutputs()) { auto output_tensor = getTensor(output_index); - output_tensors.emplace_back(output_tensor); - - outputs_dyn_alloc_info[output_tensor] = exec::DynAllocInfo{output_index}; } // WhileLayer just set ExecutorMap instead of cond and body executor to avoid complexity of // creating executor recusively auto fn = std::make_unique<::onert::backend::controlflow::kernel::WhileLayer>( - input_tensors, output_tensors, node.getOutputs(), _graph, outputs_dyn_alloc_info, - cond_subg_index, body_subg_index, _executor_map); + input_tensors, output_tensors, node.getOutputs(), _graph, cond_subg_index, body_subg_index, + _executor_map); _return_fn = std::move(fn); } -std::shared_ptr<backend::ITensor> KernelGenerator::getTensor(const ir::OperandIndex &index) +backend::ITensor *KernelGenerator::getTensor(const ir::OperandIndex &index) { - std::shared_ptr<backend::ITensor> ret = _tensor_registries.getITensor(index); + backend::ITensor *ret = _tensor_registries.getITensor(index); assert(ret != nullptr); return ret; } diff --git a/runtime/onert/core/src/backend/controlflow/KernelGenerator.h b/runtime/onert/core/src/backend/controlflow/KernelGenerator.h index b84a810e4..ce248913f 100644 --- a/runtime/onert/core/src/backend/controlflow/KernelGenerator.h +++ b/runtime/onert/core/src/backend/controlflow/KernelGenerator.h @@ -56,7 +56,7 @@ public: void visit(const ir::operation::While &) override; private: - std::shared_ptr<backend::ITensor> getTensor(const ir::OperandIndex &index); + backend::ITensor *getTensor(const ir::OperandIndex &index); private: const ir::Graph &_graph; diff --git a/runtime/onert/core/src/backend/controlflow/TensorBuilder.cc b/runtime/onert/core/src/backend/controlflow/TensorBuilder.cc index e5c3f5fd5..7d0ff201f 100644 --- a/runtime/onert/core/src/backend/controlflow/TensorBuilder.cc +++ b/runtime/onert/core/src/backend/controlflow/TensorBuilder.cc @@ -29,8 +29,8 @@ namespace controlflow TensorBuilder::TensorBuilder(const std::shared_ptr<TensorRegistry> &tensor_reg) : _tensor_reg{tensor_reg}, _dynamic_tensor_mgr{new 
DynamicTensorManager(_tensor_reg)}, - _static_tensor_mgr{ - new cpu_common::StaticTensorManager(_tensor_reg->base_reg(), _dynamic_tensor_mgr.get())} + _static_tensor_mgr{new cpu_common::StaticTensorManager( + _tensor_reg->base_reg(), _dynamic_tensor_mgr->dynamic_mem_mgr().get())} { /* empty */ } @@ -101,25 +101,14 @@ void TensorBuilder::allocate() // This is because CPU kernels require `ITensor`s to be allocated before Kernel Generation. } -std::shared_ptr<cpu_common::Tensor> TensorBuilder::nativeOwnTensorAt(const ir::OperandIndex &ind) +IDynamicTensorManager *TensorBuilder::dynamicTensorManager(void) { - return _tensor_reg->getNativeOwnTensor(ind); -} - -std::unique_ptr<ITensorManager> TensorBuilder::releaseStaticTensorManager(void) -{ - return std::move(_static_tensor_mgr); + return _dynamic_tensor_mgr.get(); } -std::unique_ptr<ITensorManager> TensorBuilder::releaseDynamicTensorManager(void) +cpu_common::Tensor *TensorBuilder::nativeOwnTensorAt(const ir::OperandIndex &ind) { - return std::move(_dynamic_tensor_mgr); -} - -void TensorBuilder::setNativeUserTensor(const ir::OperandIndex &ind, - const std::shared_ptr<UserTensor> &tensor) -{ - _tensor_reg->setNativeUserTensor(ind, tensor); + return _tensor_reg->getNativeOwnTensor(ind); } } // namespace controlflow diff --git a/runtime/onert/core/src/backend/controlflow/TensorBuilder.h b/runtime/onert/core/src/backend/controlflow/TensorBuilder.h index 2f2a2c47e..695994761 100644 --- a/runtime/onert/core/src/backend/controlflow/TensorBuilder.h +++ b/runtime/onert/core/src/backend/controlflow/TensorBuilder.h @@ -27,7 +27,6 @@ #include <unordered_map> #include "DynamicTensorManager.h" -#include "UserTensorRegistry.h" namespace onert { @@ -59,20 +58,15 @@ public: void allocate() override; void postFunctionPrepare() override { /* DO NOTHING */} - std::unique_ptr<ITensorManager> releaseStaticTensorManager(void) override; - - IDynamicTensorManager *dynamicTensorManager(void) override { return _dynamic_tensor_mgr.get(); } - - std::unique_ptr<ITensorManager> releaseDynamicTensorManager(void) override; + IDynamicTensorManager *dynamicTensorManager(void) override; /** * @brief Get tensor with a specific OperandIndex. * @param ind OperandIndex for the tensor. There must exist a tensor with this ind. * If not, program will crash with assert or exception. 
- * @return shared_ptr<operand::Tensor> + * @return operand::Tensor * */ - std::shared_ptr<cpu_common::Tensor> nativeOwnTensorAt(const ir::OperandIndex &ind); - void setNativeUserTensor(const ir::OperandIndex &ind, const std::shared_ptr<UserTensor> &tensor); + cpu_common::Tensor *nativeOwnTensorAt(const ir::OperandIndex &ind); private: const std::shared_ptr<TensorRegistry> _tensor_reg; diff --git a/runtime/onert/core/src/backend/controlflow/TensorRegistry.h b/runtime/onert/core/src/backend/controlflow/TensorRegistry.h index 678c5b73b..94f71bb9c 100644 --- a/runtime/onert/core/src/backend/controlflow/TensorRegistry.h +++ b/runtime/onert/core/src/backend/controlflow/TensorRegistry.h @@ -48,7 +48,7 @@ class TensorRegistry : public ITensorRegistry public: TensorRegistry() : _base_reg{new cpu_common::TensorRegistry} {} - std::shared_ptr<ITensor> getITensor(const ir::OperandIndex &ind) override + ITensor *getITensor(const ir::OperandIndex &ind) override { auto base_tensor = _base_reg->getITensor(ind); if (base_tensor) @@ -56,7 +56,7 @@ public: return getNativeUserTensor(ind); } - std::shared_ptr<ITensor> getNativeITensor(const ir::OperandIndex &ind) override + ITensor *getNativeITensor(const ir::OperandIndex &ind) override { auto base_tensor = _base_reg->getNativeITensor(ind); if (base_tensor) @@ -64,7 +64,7 @@ public: return getNativeUserTensor(ind); } - std::shared_ptr<IPortableTensor> getPortableTensor(const ir::OperandIndex &ind) + IPortableTensor *getPortableTensor(const ir::OperandIndex &ind) { auto base_tensor = _base_reg->getPortableTensor(ind); if (base_tensor) @@ -72,7 +72,7 @@ public: return getNativeUserTensor(ind); } - std::shared_ptr<IPortableTensor> getNativeTensor(const ir::OperandIndex &ind) + IPortableTensor *getNativeTensor(const ir::OperandIndex &ind) { auto base_tensor = _base_reg->getNativeTensor(ind); if (base_tensor) @@ -80,21 +80,20 @@ public: return getNativeUserTensor(ind); } - std::shared_ptr<Tensor> getNativeOwnTensor(const ir::OperandIndex &ind) + Tensor *getNativeOwnTensor(const ir::OperandIndex &ind) { return _base_reg->getNativeTensor(ind); } - std::shared_ptr<UserTensor> getNativeUserTensor(const ir::OperandIndex &ind) + UserTensor *getNativeUserTensor(const ir::OperandIndex &ind) { auto tensor = _native_user_tensors.find(ind); if (tensor != _native_user_tensors.end()) - return tensor->second; + return tensor->second.get(); return nullptr; } - bool setMigrantTensor(const ir::OperandIndex &ind, - const std::shared_ptr<IPortableTensor> &tensor) override + bool setMigrantTensor(const ir::OperandIndex &ind, IPortableTensor *tensor) override { assert(tensor); assert(!getITensor(ind)); // For the ind, tensor is not registered yet @@ -102,21 +101,21 @@ public: return true; } - void setNativeOwnTensor(ir::OperandIndex ind, const std::shared_ptr<Tensor> &tensor) + void setNativeOwnTensor(ir::OperandIndex ind, std::unique_ptr<Tensor> &&tensor) { assert(tensor); assert(!getITensor(ind)); // For the ind, tensor is not registered yet - _base_reg->setNativeTensor(ind, tensor); + _base_reg->setNativeTensor(ind, std::move(tensor)); } - void setNativeUserTensor(ir::OperandIndex ind, const std::shared_ptr<UserTensor> &tensor) + void setNativeUserTensor(ir::OperandIndex ind, std::unique_ptr<UserTensor> &&tensor) { assert(tensor); assert(!getITensor(ind)); // For the ind, tensor is not registered yet - _native_user_tensors[ind] = tensor; + _native_user_tensors[ind] = std::move(tensor); } - const ir::OperandIndexMap<std::shared_ptr<UserTensor>> &native_user_tensors() + const 
ir::OperandIndexMap<std::unique_ptr<UserTensor>> &native_user_tensors() { return _native_user_tensors; } @@ -124,7 +123,7 @@ public: private: std::shared_ptr<cpu_common::TensorRegistry> _base_reg; - ir::OperandIndexMap<std::shared_ptr<UserTensor>> _native_user_tensors; + ir::OperandIndexMap<std::unique_ptr<UserTensor>> _native_user_tensors; }; } // namespace controlflow diff --git a/runtime/onert/core/src/backend/controlflow/UserTensor.cc b/runtime/onert/core/src/backend/controlflow/UserTensor.cc index c8e2ebade..5081a90ea 100644 --- a/runtime/onert/core/src/backend/controlflow/UserTensor.cc +++ b/runtime/onert/core/src/backend/controlflow/UserTensor.cc @@ -16,6 +16,9 @@ #include "UserTensor.h" +#include "util/Exceptions.h" +#include "ir/DataType.h" + namespace onert { namespace backend @@ -35,6 +38,16 @@ size_t UserTensor::calcOffset(const ir::Coordinates &coords) const return offset; } +bool UserTensor::applyShape(const ir::Shape &new_shape) +{ + // User tensors cannot be reallocated. + auto new_size = new_shape.num_elements() * ir::sizeOfDataType(data_type()); + if (total_size() < new_size) + throw InsufficientBufferSizeException{"User given buffer size is too small."}; + setShape(new_shape); + return true; +} + } // namespace controlflow } // namespace backend } // namespace onert diff --git a/runtime/onert/core/src/backend/controlflow/UserTensor.h b/runtime/onert/core/src/backend/controlflow/UserTensor.h index 9be33595d..7aa62a8a9 100644 --- a/runtime/onert/core/src/backend/controlflow/UserTensor.h +++ b/runtime/onert/core/src/backend/controlflow/UserTensor.h @@ -38,16 +38,12 @@ namespace controlflow class UserTensor : public IPortableTensor { public: - UserTensor(const ir::OperandInfo &info, ir::Layout layout, uint8_t *buffer, size_t size, - IDynamicTensorManager *dynamic_tensor_manager) - : _info{info}, _layout{layout}, _buffer{buffer}, _size{size}, _dynamic{false}, - _dynamic_tensor_manager{dynamic_tensor_manager} + UserTensor(const ir::OperandInfo &info, ir::Layout layout, uint8_t *buffer, size_t size) + : IPortableTensor{info}, _layout{layout}, _buffer{buffer}, _size{size}, _dynamic{false} { } - UserTensor(const ir::OperandInfo &info, ir::Layout layout, - IDynamicTensorManager *dynamic_tensor_manager) - : UserTensor{info, layout, nullptr, 0, dynamic_tensor_manager} + UserTensor(const ir::OperandInfo &info, ir::Layout layout) : UserTensor{info, layout, nullptr, 0} { } @@ -73,15 +69,13 @@ public: ir::Shape getShape() const override { return _info.shape(); } void setShape(const ir::Shape &new_shape) override { _info.shape(new_shape); } bool is_constant() const override { return false; } - IDynamicTensorManager *dynamic_tensor_manager() override { return _dynamic_tensor_manager; } + bool applyShape(const ir::Shape &) override; private: - ir::OperandInfo _info; ir::Layout _layout; uint8_t *_buffer; size_t _size; bool _dynamic; - IDynamicTensorManager *_dynamic_tensor_manager; }; } // namespace controlflow diff --git a/runtime/onert/core/src/backend/controlflow/kernel/IfLayer.cc b/runtime/onert/core/src/backend/controlflow/kernel/IfLayer.cc index 8377c7183..c0329acd8 100644 --- a/runtime/onert/core/src/backend/controlflow/kernel/IfLayer.cc +++ b/runtime/onert/core/src/backend/controlflow/kernel/IfLayer.cc @@ -30,16 +30,13 @@ namespace controlflow namespace kernel { -IfLayer::IfLayer(const std::shared_ptr<backend::ITensor> &cond_tensor, - const std::vector<std::shared_ptr<backend::ITensor>> input_tensors, - const std::vector<std::shared_ptr<backend::ITensor>> output_tensors, 
+IfLayer::IfLayer(backend::ITensor *cond_tensor, const std::vector<backend::ITensor *> input_tensors, + const std::vector<backend::ITensor *> output_tensors, const ir::OperandIndexSequence &output_indices, const ir::Graph &graph, - const exec::DynAllocInfoMap &outputs_dyn_alloc_info, const ir::SubgraphIndex &then_subg_index, const ir::SubgraphIndex &else_subg_index, exec::ExecutorMap *executor_map) : _cond_tensor{cond_tensor}, _input_tensors{input_tensors}, _output_tensors{output_tensors}, - _output_indices{output_indices}, _graph{graph}, - _outputs_dyn_alloc_info{outputs_dyn_alloc_info}, _then_subg_index{then_subg_index}, + _output_indices{output_indices}, _graph{graph}, _then_subg_index{then_subg_index}, _else_subg_index{else_subg_index}, _executor_map{executor_map} { // At this point, executor_map may not have executors of then subg and else subg @@ -63,21 +60,24 @@ void IfLayer::run() }; exec::ExecutorBase *subg_exec = nullptr; - if (getResultCond(_cond_tensor.get())) + bool cond_result = getResultCond(_cond_tensor); + if (cond_result) { + VERBOSE(If) << "Call to $" << _then_subg_index << " (then)" << std::endl; subg_exec = nnfw::misc::polymorphic_downcast<exec::ExecutorBase *>( _executor_map->at(_then_subg_index).get()); } else { + VERBOSE(If) << "Call to $" << _else_subg_index << " (else)" << std::endl; subg_exec = nnfw::misc::polymorphic_downcast<exec::ExecutorBase *>( _executor_map->at(_else_subg_index).get()); } const auto &subg_graph = subg_exec->graph(); - std::vector<std::shared_ptr<backend::ITensor>> src_tensors; - std::vector<std::shared_ptr<backend::ITensor>> dst_tensors; + std::vector<backend::ITensor *> src_tensors; + std::vector<backend::ITensor *> dst_tensors; // Add tensors used in subgraph or contained in outputs of subgraph assert(subg_graph.getInputs().size() == _input_tensors.size()); assert(subg_graph.getInputs().size() == subg_exec->getInputTensors().size()); @@ -91,9 +91,8 @@ void IfLayer::run() dst_tensors.emplace_back(subg_exec->getInputTensors().at(i)); } } - const auto &subg_inputs_dyn_alloc_info = subg_exec->getInputsDynamicAllocInfo(); const auto permute_op_input_to_subg_input = - std::make_shared<PermuteLayer>(src_tensors, dst_tensors, subg_inputs_dyn_alloc_info); + std::make_shared<PermuteLayer>(src_tensors, dst_tensors); // Add tensors used as output of operation or contained in outputs of operation src_tensors.clear(); @@ -111,7 +110,7 @@ void IfLayer::run() } } const auto permute_subg_output_to_op_output = - std::make_shared<PermuteLayer>(src_tensors, dst_tensors, _outputs_dyn_alloc_info); + std::make_shared<PermuteLayer>(src_tensors, dst_tensors); // Remove copying of unused tensor permute_op_input_to_subg_input->prepare(); @@ -120,6 +119,8 @@ void IfLayer::run() // Copy & run subg_exec->execute(_input_tensors, permute_op_input_to_subg_input); permute_subg_output_to_op_output->run(); + VERBOSE(If) << "Return from $" << (cond_result ? 
_then_subg_index : _else_subg_index) + << std::endl; } } // namespace kernel diff --git a/runtime/onert/core/src/backend/controlflow/kernel/IfLayer.h b/runtime/onert/core/src/backend/controlflow/kernel/IfLayer.h index ef3a6e6f6..1461388dc 100644 --- a/runtime/onert/core/src/backend/controlflow/kernel/IfLayer.h +++ b/runtime/onert/core/src/backend/controlflow/kernel/IfLayer.h @@ -32,11 +32,9 @@ namespace kernel class IfLayer : public ::onert::exec::IFunction { public: - IfLayer(const std::shared_ptr<backend::ITensor> &cond_tensor, - const std::vector<std::shared_ptr<backend::ITensor>> input_tensors, - const std::vector<std::shared_ptr<backend::ITensor>> output_tensors, + IfLayer(backend::ITensor *cond_tensor, const std::vector<backend::ITensor *> input_tensors, + const std::vector<backend::ITensor *> output_tensors, const ir::OperandIndexSequence &output_indices, const ir::Graph &graph, - const exec::DynAllocInfoMap &outputs_dyn_alloc_info, const ir::SubgraphIndex &then_subg_index, const ir::SubgraphIndex &else_subg_index, exec::ExecutorMap *executor_map); @@ -44,12 +42,11 @@ public: void run() override; private: - const std::shared_ptr<backend::ITensor> _cond_tensor; - const std::vector<std::shared_ptr<backend::ITensor>> _input_tensors; - const std::vector<std::shared_ptr<backend::ITensor>> _output_tensors; + backend::ITensor *_cond_tensor; + const std::vector<backend::ITensor *> _input_tensors; + const std::vector<backend::ITensor *> _output_tensors; const ir::OperandIndexSequence &_output_indices; const ir::Graph &_graph; - const exec::DynAllocInfoMap _outputs_dyn_alloc_info; const ir::SubgraphIndex _then_subg_index; const ir::SubgraphIndex _else_subg_index; exec::ExecutorMap *_executor_map; diff --git a/runtime/onert/core/src/backend/controlflow/kernel/PermuteLayer.cc b/runtime/onert/core/src/backend/controlflow/kernel/PermuteLayer.cc index e8f1ea679..49fbb33c4 100644 --- a/runtime/onert/core/src/backend/controlflow/kernel/PermuteLayer.cc +++ b/runtime/onert/core/src/backend/controlflow/kernel/PermuteLayer.cc @@ -54,12 +54,9 @@ void PermuteLayer::run() try { - const auto dst_index = _dst_dyn_alloc_info_map.at(dst_tensor).ind; - auto dyn_tensor_manager = dst_tensor->dynamic_tensor_manager(); - if (!dyn_tensor_manager) + if (!dst_tensor->applyShape(new_shape)) throw std::runtime_error{ "Error: PermuteLayer: output's TensorManager does not support dynamic tensor"}; - dyn_tensor_manager->applyShape(dst_index, new_shape); assert(dst_tensor->buffer() != nullptr); } catch (const std::out_of_range &e) diff --git a/runtime/onert/core/src/backend/controlflow/kernel/PermuteLayer.h b/runtime/onert/core/src/backend/controlflow/kernel/PermuteLayer.h index 403ac770d..8129403a5 100644 --- a/runtime/onert/core/src/backend/controlflow/kernel/PermuteLayer.h +++ b/runtime/onert/core/src/backend/controlflow/kernel/PermuteLayer.h @@ -33,10 +33,7 @@ namespace kernel class PermuteLayer : public onert::exec::IPermuteFunction { public: - PermuteLayer(const std::vector<std::shared_ptr<ITensor>> &src_tensors, - const std::vector<std::shared_ptr<ITensor>> &dst_tensors, - const exec::DynAllocInfoMap &dst_dyn_alloc_info_map) - : _dst_dyn_alloc_info_map{dst_dyn_alloc_info_map} + PermuteLayer(const std::vector<ITensor *> &src_tensors, const std::vector<ITensor *> &dst_tensors) { assert(src_tensors.size() == dst_tensors.size()); _src_tensors = src_tensors; @@ -64,9 +61,6 @@ public: } void run() override; - -private: - const exec::DynAllocInfoMap _dst_dyn_alloc_info_map; }; } // namespace kernel diff --git 
a/runtime/onert/core/src/backend/controlflow/kernel/WhileLayer.cc b/runtime/onert/core/src/backend/controlflow/kernel/WhileLayer.cc index 50936e5f6..225f0dd7c 100644 --- a/runtime/onert/core/src/backend/controlflow/kernel/WhileLayer.cc +++ b/runtime/onert/core/src/backend/controlflow/kernel/WhileLayer.cc @@ -30,16 +30,14 @@ namespace controlflow namespace kernel { -WhileLayer::WhileLayer(const std::vector<std::shared_ptr<backend::ITensor>> &input_tensors, - const std::vector<std::shared_ptr<backend::ITensor>> &output_tensors, +WhileLayer::WhileLayer(const std::vector<backend::ITensor *> input_tensors, + const std::vector<backend::ITensor *> output_tensors, const ir::OperandIndexSequence &output_indices, const ir::Graph &graph, - const exec::DynAllocInfoMap &outputs_dyn_alloc_info, const ir::SubgraphIndex &cond_subg_index, const ir::SubgraphIndex &body_subg_index, exec::ExecutorMap *executor_map) : _cond_subg_index{cond_subg_index}, _body_subg_index{body_subg_index}, _output_indices{output_indices}, _graph{graph}, _input_tensors{input_tensors}, - _output_tensors{output_tensors}, _outputs_dyn_alloc_info{outputs_dyn_alloc_info}, - _executor_map{executor_map} + _output_tensors{output_tensors}, _executor_map{executor_map} { // At this point, executor_map may not have executors of cond subg and body subg } @@ -62,15 +60,13 @@ void WhileLayer::run() _executor_map->at(_body_subg_index).get()); const auto &cond_graph = cond_exec->graph(); - const auto &cond_inputs_dyn_alloc = cond_exec->getInputsDynamicAllocInfo(); const auto &body_graph = body_exec->graph(); - const auto &body_inputs_dyn_alloc = body_exec->getInputsDynamicAllocInfo(); - std::vector<std::shared_ptr<backend::ITensor>> input_tensors; - std::vector<std::shared_ptr<backend::ITensor>> cond_input_tensors; - std::vector<std::shared_ptr<backend::ITensor>> body_input_tensors; - std::vector<std::shared_ptr<backend::ITensor>> body_output_tensors; - std::vector<std::shared_ptr<backend::ITensor>> output_tensors; + std::vector<backend::ITensor *> input_tensors; + std::vector<backend::ITensor *> cond_input_tensors; + std::vector<backend::ITensor *> body_input_tensors; + std::vector<backend::ITensor *> body_output_tensors; + std::vector<backend::ITensor *> output_tensors; // Add only used tensors in cond subgraph assert(cond_graph.getInputs().size() == _input_tensors.size()); @@ -85,7 +81,7 @@ void WhileLayer::run() } } const auto permute_op_input_to_cond_input = - std::make_shared<PermuteLayer>(input_tensors, cond_input_tensors, cond_inputs_dyn_alloc); + std::make_shared<PermuteLayer>(input_tensors, cond_input_tensors); // Add only used tensors among outputs of while operation assert(_output_indices.size() == _input_tensors.size()); @@ -103,7 +99,7 @@ void WhileLayer::run() } } const auto permute_op_input_to_op_output = - std::make_shared<PermuteLayer>(input_tensors, output_tensors, _outputs_dyn_alloc_info); + std::make_shared<PermuteLayer>(input_tensors, output_tensors); // Add all tensors with unused tensors in body subgraph because unused input tensors will be // copied output tensors in body subgraph @@ -111,7 +107,7 @@ void WhileLayer::run() input_tensors = _input_tensors; body_input_tensors = body_exec->getInputTensors(); const auto permute_op_input_to_body_input = - std::make_shared<PermuteLayer>(input_tensors, body_input_tensors, body_inputs_dyn_alloc); + std::make_shared<PermuteLayer>(input_tensors, body_input_tensors); // Add only used tensors in cond subgraph assert(cond_graph.getInputs().size() == 
body_exec->getOutputTensors().size()); @@ -127,8 +123,8 @@ void WhileLayer::run() cond_input_tensors.emplace_back(cond_exec->getInputTensors().at(i)); } } - const auto permute_body_output_to_cond_input = std::make_shared<PermuteLayer>( - body_output_tensors, cond_input_tensors, cond_inputs_dyn_alloc); + const auto permute_body_output_to_cond_input = + std::make_shared<PermuteLayer>(body_output_tensors, cond_input_tensors); // Add only used tensors in body subgraph assert(body_graph.getInputs().size() == body_exec->getOutputTensors().size()); @@ -146,8 +142,8 @@ void WhileLayer::run() body_input_tensors.emplace_back(body_exec->getInputTensors().at(i)); } } - const auto permute_body_output_to_body_input = std::make_shared<PermuteLayer>( - body_output_tensors, body_input_tensors, body_inputs_dyn_alloc); + const auto permute_body_output_to_body_input = + std::make_shared<PermuteLayer>(body_output_tensors, body_input_tensors); // Add only used tensors among outputs of while operation assert(_output_indices.size() == body_exec->getOutputTensors().size()); @@ -165,7 +161,7 @@ void WhileLayer::run() } } const auto permute_body_output_to_op_output = - std::make_shared<PermuteLayer>(body_output_tensors, output_tensors, _outputs_dyn_alloc_info); + std::make_shared<PermuteLayer>(body_output_tensors, output_tensors); // Remove copying of unused tensor permute_op_input_to_cond_input->prepare(); @@ -175,7 +171,9 @@ void WhileLayer::run() permute_body_output_to_body_input->prepare(); permute_body_output_to_op_output->prepare(); + VERBOSE(While) << "Call to $" << _cond_subg_index << " (cond)" << std::endl; cond_exec->execute(_input_tensors, permute_op_input_to_cond_input); + VERBOSE(While) << "Return from $" << _cond_subg_index << std::endl; assert(cond_exec->getOutputTensors().size() == 1); auto &cond_output_tensor = cond_exec->getOutputTensors().at(0); @@ -186,21 +184,27 @@ void WhileLayer::run() }; const auto body_execute_with_op_inputs = [&]() { + VERBOSE(While) << "Call to $" << _body_subg_index << " (body)" << std::endl; body_exec->execute(_input_tensors, permute_op_input_to_body_input); + VERBOSE(While) << "Return from $" << _body_subg_index << std::endl; }; const auto body_execute_with_body_outputs = [&]() { + VERBOSE(While) << "Call to $" << _body_subg_index << " (body)" << std::endl; body_exec->execute(body_exec->getOutputTensors(), permute_body_output_to_body_input); + VERBOSE(While) << "Return from $" << _body_subg_index << std::endl; }; std::function<void()> body_execute = body_execute_with_op_inputs; const auto cond_execute = [&]() { + VERBOSE(While) << "Call to $" << _cond_subg_index << " (cond)" << std::endl; cond_exec->execute(body_exec->getOutputTensors(), permute_body_output_to_cond_input); + VERBOSE(While) << "Return from $" << _cond_subg_index << std::endl; }; auto permute_to_outputs_fn = permute_op_input_to_op_output; // Loop while Cond subgraph's output is true - while (getResultCond(cond_output_tensor.get())) + while (getResultCond(cond_output_tensor)) { body_execute(); cond_execute(); diff --git a/runtime/onert/core/src/backend/controlflow/kernel/WhileLayer.h b/runtime/onert/core/src/backend/controlflow/kernel/WhileLayer.h index ebca8acdc..9dae49281 100644 --- a/runtime/onert/core/src/backend/controlflow/kernel/WhileLayer.h +++ b/runtime/onert/core/src/backend/controlflow/kernel/WhileLayer.h @@ -35,10 +35,9 @@ namespace kernel class WhileLayer : public ::onert::exec::IFunction { public: - WhileLayer(const std::vector<std::shared_ptr<backend::ITensor>> &input_tensors, - const 
std::vector<std::shared_ptr<backend::ITensor>> &output_tensors, + WhileLayer(const std::vector<backend::ITensor *> input_tensors, + const std::vector<backend::ITensor *> output_tensors, const ir::OperandIndexSequence &output_indices, const ir::Graph &graph, - const exec::DynAllocInfoMap &outputs_dyn_alloc_info, const ir::SubgraphIndex &cond_subg_index, const ir::SubgraphIndex &body_subg_index, exec::ExecutorMap *executor_map); @@ -50,9 +49,8 @@ private: const ir::SubgraphIndex _body_subg_index; const ir::OperandIndexSequence &_output_indices; const ir::Graph &_graph; - const std::vector<std::shared_ptr<backend::ITensor>> _input_tensors; - const std::vector<std::shared_ptr<backend::ITensor>> _output_tensors; - const exec::DynAllocInfoMap _outputs_dyn_alloc_info; + const std::vector<backend::ITensor *> _input_tensors; + const std::vector<backend::ITensor *> _output_tensors; exec::ExecutorMap *_executor_map; }; diff --git a/runtime/onert/core/src/backend/cpu_common/DynamicTensorManager.cc b/runtime/onert/core/src/backend/cpu_common/DynamicTensorManager.cc index f7ce3d011..740248ccd 100644 --- a/runtime/onert/core/src/backend/cpu_common/DynamicTensorManager.cc +++ b/runtime/onert/core/src/backend/cpu_common/DynamicTensorManager.cc @@ -17,6 +17,7 @@ #include "backend/cpu_common/DynamicTensorManager.h" #include "util/logging.h" +#include "misc/polymorphic_downcast.h" namespace onert { @@ -31,71 +32,18 @@ DynamicTensorManager::DynamicTensorManager(const std::shared_ptr<TensorRegistry> // DO NOTHING } -void DynamicTensorManager::applyShape(const ir::OperandIndex &ind, const ir::Shape &new_shape) -{ - VERBOSE_F() << ind << std::endl; - - auto tensor = _tensors->getNativeTensor(ind); - assert(tensor); - - bool previously_dynamic = tensor->is_dynamic(); - - auto allocTensorMem = [&](bool overwrite = false) { - auto capacity = tensor->total_size(); - auto alloc = _dynamic_mem_mgr->allocate(ind, capacity); - - if (overwrite) - tensor->overwriteBuffer(alloc); - else - tensor->setBuffer(alloc); - }; - - if (!previously_dynamic) - { - // TODO deallocate tensor->buffer() - // issue is that staticTensorManager might have allocate this memory - tensor->setShape(new_shape); - tensor->set_dynamic(); - allocTensorMem(true); - } - else if (tensor->buffer() == nullptr) - { - tensor->setShape(new_shape); - tensor->set_dynamic(); - allocTensorMem(); - } - // when buffer was already allocated and new_shape requires different size - else - { - auto previous_size = tensor->total_size(); - auto new_size = new_shape.num_elements() * sizeOfDataType(tensor->data_type()); - if (previous_size != new_size) - { - _dynamic_mem_mgr->deallocate(ind); - - tensor->setShape(new_shape); - tensor->set_dynamic(); - allocTensorMem(true); - } - else - { // when buffer with same size was already allocated, shape could differ - tensor->setShape(new_shape); - } - } -} - void DynamicTensorManager::buildTensor(const ir::OperandIndex &ind, const ir::OperandInfo &tensor_info, ir::Layout backend_layout) { assert(_tensors->getNativeTensor(ind) == nullptr); - auto tensor = std::make_shared<Tensor>(tensor_info, backend_layout, this); - _tensors->setNativeTensor(ind, tensor); + auto tensor = std::make_unique<Tensor>(tensor_info, backend_layout, _dynamic_mem_mgr.get()); + _tensors->setNativeTensor(ind, std::move(tensor)); } -void DynamicTensorManager::planDealloc(ir::OperationIndex op_ind, ir::OperandIndex operand_ind) +void DynamicTensorManager::planDealloc(ir::OperationIndex op_ind, backend::ITensor *tensor) { - 
_dealloc_tensor_map[op_ind].emplace(operand_ind); + _dealloc_tensor_map[op_ind].emplace(tensor); } void DynamicTensorManager::deallocInput(ir::OperationIndex op_ind) @@ -105,31 +53,26 @@ void DynamicTensorManager::deallocInput(ir::OperationIndex op_ind) return; auto &input_set = find->second; - for (auto input_ind : input_set) + for (auto *tensor : input_set) { - auto *tensor = _tensors->getNativeTensor(input_ind).get(); if (!tensor->is_dynamic()) continue; - _dynamic_mem_mgr->deallocate(input_ind); - tensor->resetBuffer(); + _dynamic_mem_mgr->deallocate(tensor); - VERBOSE(DynamicTensorManager) << "Deallocating #" << input_ind.value() + auto *cpu_tensor = nnfw::misc::polymorphic_downcast<cpu_common::Tensor *>(tensor); + cpu_tensor->resetBuffer(); + + VERBOSE(DynamicTensorManager) << "Deallocating tensor " << (void *)cpu_tensor << " (input of op_ind: " << op_ind.value() << ")" << std::endl; } } -void DynamicTensorManager::deallocSubgraphOutput(ir::OperandIndex output_ind) +const ITensor *DynamicTensorManager::getRawITensor(ir::OperandIndex ind) { - auto *tensor = _tensors->getNativeTensor(output_ind).get(); - if (!tensor->is_dynamic()) - return; - - _dynamic_mem_mgr->deallocate(output_ind); - tensor->resetBuffer(); - - VERBOSE(DynamicTensorManager) << "Deallocating #" << output_ind.value() - << " (output of a subgraph)" << std::endl; + auto ptr = _tensors->getITensor(ind); + assert(ptr); + return ptr; } } // namespace cpu_common diff --git a/runtime/onert/core/src/backend/cpu_common/MemoryManager.cc b/runtime/onert/core/src/backend/cpu_common/MemoryManager.cc index 8cb9c22ca..9f179d9ee 100644 --- a/runtime/onert/core/src/backend/cpu_common/MemoryManager.cc +++ b/runtime/onert/core/src/backend/cpu_common/MemoryManager.cc @@ -20,6 +20,7 @@ #include "MemoryPlannerFactory.h" #include "util/ConfigSource.h" +#include "util/logging.h" namespace onert { @@ -70,20 +71,20 @@ uint8_t *MemoryManager::getBuffer(const ir::OperandIndex &ind) const return _mem_alloc->base() + mem_blk.offset; } -std::shared_ptr<cpu_common::Allocator> DynamicMemoryManager::allocate(const ir::OperandIndex &ind, +std::shared_ptr<cpu_common::Allocator> DynamicMemoryManager::allocate(const ITensor *tensor, uint32_t capacity) { - auto find = _mem_alloc_map.find(ind); + auto find = _mem_alloc_map.find(tensor); if (find != _mem_alloc_map.end()) throw std::runtime_error("Cannot allocate memory for a tensor. 
It was already allocated."); - _mem_alloc_map[ind] = std::make_shared<cpu_common::Allocator>(capacity); - return _mem_alloc_map[ind]; + _mem_alloc_map[tensor] = std::make_shared<cpu_common::Allocator>(capacity); + return _mem_alloc_map[tensor]; } -void DynamicMemoryManager::deallocate(const ir::OperandIndex &ind) +void DynamicMemoryManager::deallocate(const ITensor *tensor) { - auto find = _mem_alloc_map.find(ind); + auto find = _mem_alloc_map.find(tensor); if (find == _mem_alloc_map.end()) throw std::runtime_error("Cannot find Allocator for the requested index"); diff --git a/runtime/onert/core/src/backend/cpu_common/StaticTensorManager.cc b/runtime/onert/core/src/backend/cpu_common/StaticTensorManager.cc index 440f70c93..cac43babe 100644 --- a/runtime/onert/core/src/backend/cpu_common/StaticTensorManager.cc +++ b/runtime/onert/core/src/backend/cpu_common/StaticTensorManager.cc @@ -27,9 +27,9 @@ namespace cpu_common { StaticTensorManager::StaticTensorManager(const std::shared_ptr<TensorRegistry> ®, - IDynamicTensorManager *dynamic_tensor_manager) + DynamicMemoryManager *dynamic_mem_mgr) : _const_mgr{new DynamicMemoryManager()}, _nonconst_mgr{new MemoryManager()}, _tensors{reg}, - _dynamic_tensor_manager{dynamic_tensor_manager} + _dynamic_mem_mgr{dynamic_mem_mgr} { // DO NOTHING } @@ -39,10 +39,10 @@ void StaticTensorManager::allocateConsts(void) for (auto &pair : _tensors->native_tensors()) { const auto &ind = pair.first; - auto tensor = pair.second; + auto tensor = pair.second.get(); if (_as_constants[ind]) { - auto mem_alloc = _const_mgr->allocate(ind, tensor->total_size()); + auto mem_alloc = _const_mgr->allocate(_tensors->getITensor(ind), tensor->total_size()); tensor->setBuffer(mem_alloc); auto buffer = mem_alloc->base(); VERBOSE(CPU_COMMON_StaticTensorManager) << "CONSTANT TENSOR(#" << ind.value() @@ -59,7 +59,7 @@ void StaticTensorManager::allocateNonconsts(void) for (auto &pair : _tensors->native_tensors()) { const auto &ind = pair.first; - auto tensor = pair.second; + auto tensor = pair.second.get(); if (!_as_constants[ind] && !tensor->is_dynamic()) { auto *buffer = _nonconst_mgr->getBuffer(ind); @@ -80,8 +80,8 @@ void StaticTensorManager::buildTensor(const ir::OperandIndex &ind, bool as_const) { assert(!_tensors->getNativeTensor(ind)); - auto tensor = std::make_shared<Tensor>(tensor_info, backend_layout, _dynamic_tensor_manager); - _tensors->setNativeTensor(ind, tensor); + auto tensor = std::make_unique<Tensor>(tensor_info, backend_layout, _dynamic_mem_mgr); + _tensors->setNativeTensor(ind, std::move(tensor)); _as_constants[ind] = as_const; } diff --git a/runtime/onert/core/src/backend/cpu_common/Tensor.cc b/runtime/onert/core/src/backend/cpu_common/Tensor.cc index f34564dd9..d3dcf9a6d 100644 --- a/runtime/onert/core/src/backend/cpu_common/Tensor.cc +++ b/runtime/onert/core/src/backend/cpu_common/Tensor.cc @@ -16,6 +16,9 @@ #include "backend/cpu_common/Tensor.h" +#include "ir/DataType.h" +#include "backend/cpu_common/MemoryManager.h" + namespace onert { namespace backend @@ -23,6 +26,8 @@ namespace backend namespace cpu_common { +Tensor::~Tensor() {} + size_t Tensor::calcOffset(const ir::Coordinates &coords) const { size_t rank = num_dimensions(); @@ -38,6 +43,55 @@ size_t Tensor::calcOffset(const ir::Coordinates &coords) const void Tensor::setShape(const ir::Shape &new_shape) { _info.shape(new_shape); } +bool Tensor::applyShape(const ir::Shape &new_shape) +{ + bool previously_dynamic = is_dynamic(); + + auto allocTensorMem = [&](bool overwrite = false) { + auto capacity = 
total_size(); + auto alloc = _dynamic_mem_mgr->allocate(this, capacity); + + if (overwrite) + overwriteBuffer(alloc); + else + setBuffer(alloc); + }; + + if (!previously_dynamic) + { + // TODO deallocate tensor->buffer() + // issue is that staticTensorManager might have allocate this memory + setShape(new_shape); + set_dynamic(); + allocTensorMem(true); + } + else if (buffer() == nullptr) + { + setShape(new_shape); + set_dynamic(); + allocTensorMem(); + } + // when buffer was already allocated and new_shape requires different size + else + { + auto previous_size = total_size(); + auto new_size = new_shape.num_elements() * ir::sizeOfDataType(data_type()); + if (previous_size != new_size) + { + _dynamic_mem_mgr->deallocate(this); + + setShape(new_shape); + set_dynamic(); + allocTensorMem(true); + } + else + { // when buffer with same size was already allocated, shape could differ + setShape(new_shape); + } + } + return true; +} + } // namespace cpu_common } // namespace backend } // namespace onert diff --git a/runtime/onert/core/src/compiler/BackendManager.cc b/runtime/onert/core/src/compiler/BackendManager.cc index db7a14a96..0093f50fd 100644 --- a/runtime/onert/core/src/compiler/BackendManager.cc +++ b/runtime/onert/core/src/compiler/BackendManager.cc @@ -70,31 +70,18 @@ void BackendManager::loadBackend(const std::string &backend) } // TODO Remove indentation - // Workaround If backend have dynamic library with "-boost" suffix naming, - // BackendManager load library with "-boost" suffix instead of library without suffix - // This feature is used for custom backend extension to support additional operations { - const std::string backend_boost_so = "libbackend_" + backend + "-boost" + SHARED_LIB_EXT; const std::string backend_so = "libbackend_" + backend + SHARED_LIB_EXT; + void *handle = dlopen(backend_so.c_str(), RTLD_LAZY | RTLD_LOCAL); - void *handle = dlopen(backend_boost_so.c_str(), RTLD_LAZY | RTLD_LOCAL); if (handle == nullptr) { - handle = dlopen(backend_so.c_str(), RTLD_LAZY | RTLD_LOCAL); - - if (handle == nullptr) - { - VERBOSE_F() << "Failed to load backend '" << backend << "' - " << dlerror() << std::endl; - return; - } - - VERBOSE_F() << "Successfully loaded '" << backend << "' - " << backend_so << "\n"; - } - else - { - VERBOSE_F() << "Successfully loaded '" << backend << "' - " << backend_boost_so << "\n"; + VERBOSE_F() << "Failed to load backend '" << backend << "' - " << dlerror() << std::endl; + return; } + VERBOSE_F() << "Successfully loaded '" << backend << "' - " << backend_so << "\n"; + { // load object creator function auto backend_create = (backend_create_t)dlsym(handle, "onert_backend_create"); diff --git a/runtime/onert/core/src/compiler/Compiler.cc b/runtime/onert/core/src/compiler/Compiler.cc index 93dbbc3b5..12b582b35 100644 --- a/runtime/onert/core/src/compiler/Compiler.cc +++ b/runtime/onert/core/src/compiler/Compiler.cc @@ -19,6 +19,7 @@ #include "ParamChecker.h" #include "ExecutorFactory.h" #include "OperationValidator.h" +#include "ShapeValidator.h" #include "Fp32ToFp16Converter.h" #include <backend/controlflow/Config.h> @@ -27,8 +28,12 @@ #include "compiler/ManualScheduler.h" #include "compiler/HEScheduler.h" #include "compiler/StaticShapeInference.h" +#include "compiler/pass/ConstantOutputPass.h" +#include "compiler/pass/OddOutputPass.h" +#include "compiler/pass/PassRunner.h" #include "exec/ExecTime.h" #include "ir/operation/LowerInfo.h" +#include "ir/verifier/Verifier.h" #include "dumper/dot/DotDumper.h" #include "compiler/Linear.h" #include 
"interp/InterpExecutor.h" @@ -132,6 +137,8 @@ std::shared_ptr<exec::ExecutorMap> Compiler::compile(void) backend::controlflow::Config::ID; _options.manual_scheduler_options.opcode_to_backend[ir::OpCode::While] = backend::controlflow::Config::ID; + _options.manual_scheduler_options.opcode_to_backend[ir::OpCode::Permute] = + backend::controlflow::Config::ID; } // FIXME This is a workaround for bcq operations, should remove it @@ -159,10 +166,24 @@ std::shared_ptr<exec::ExecutorMap> Compiler::compile(void) VERBOSE(Compiler) << std::noboolalpha; } + _subgraphs->iterate([&](const ir::SubgraphIndex &, ir::Graph &subg) { + // Mandatory passes + pass::PassRunner{} + .append(std::make_unique<pass::ConstantOutputPass>(subg)) + .append(std::make_unique<pass::OddOutputPass>(subg)) + .run(); + }); + /*************************************************** * Prepare compilation phase ***************************************************/ + // Check shape independent operation feature + // - Operand type + // - Shape independent parameter + _subgraphs->iterate( + [](const onert::ir::SubgraphIndex &, const ir::Graph &subg) { OperationValidator{subg}(); }); + auto executors = std::make_shared<exec::ExecutorMap>(); // Compilable check @@ -229,17 +250,23 @@ std::shared_ptr<exec::ExecutorMap> Compiler::compile(void) inferer.dump(); } - /************************************************************* - * Backend independent analysis & optimization phase finished - *************************************************************/ - - // operation validation + // Shape validation + // TODO Move shape independent feature check from ShapeValidator to OperationValidator + // TODO Move ShapeValidator into shape inference + // - Check input tensor shape validation + // - Check parameter value validation which valid value is depend on input tensor shape + // - Output tensor shape validation check is needless because + // static/dynamic shape inferer will make valid output shape for (auto &pair : lowered_subgs) { auto &lowered_subg = pair.second; - compiler::OperationValidator{lowered_subg->graph()}(); + compiler::ShapeValidator{lowered_subg->graph()}(); } + /************************************************************* + * Backend independent analysis & optimization phase finished + *************************************************************/ + executors = std::make_shared<exec::ExecutorMap>(); for (auto &pair : lowered_subgs) { diff --git a/runtime/onert/core/src/compiler/ExecutorFactory.cc b/runtime/onert/core/src/compiler/ExecutorFactory.cc index 062c6c9c3..bb325ffbc 100644 --- a/runtime/onert/core/src/compiler/ExecutorFactory.cc +++ b/runtime/onert/core/src/compiler/ExecutorFactory.cc @@ -29,6 +29,7 @@ #include "backend/IConstantInitializer.h" #include "backend/IKernelGenerator.h" #include "backend/IOptimizer.h" +#include "backend/IPortableTensor.h" #include "backend/ITensorRegister.h" #include "backend/controlflow/Config.h" #include "backend/controlflow/KernelGenerator.h" @@ -65,23 +66,6 @@ private: std::shared_ptr<backend::IConfig> _config; }; -// TODO Think of a better way to manage TensorManagers -backend::TensorManagerSet createTensorManagerSet(const compiler::TensorBuilders &tensor_builders) -{ - backend::TensorManagerSet tensor_mgrs; - for (auto &tensor_builder : tensor_builders) - { - auto s_tensor_manager = tensor_builder->releaseStaticTensorManager(); - if (s_tensor_manager != nullptr) - tensor_mgrs.insert(std::move(s_tensor_manager)); - - auto d_tensor_manager = tensor_builder->releaseDynamicTensorManager(); - 
if (d_tensor_manager != nullptr) - tensor_mgrs.insert(std::move(d_tensor_manager)); - } - return tensor_mgrs; -} - } // namespace } // namespace onert @@ -172,7 +156,8 @@ void ExecutorFactory::runTensorRegistration(compiler::LoweredGraph *lowered_grap for (const auto op_idx : op_seq) { const auto &op = lowered_graph->graph().operations().at(op_idx); - for (const auto &index : (op.getInputs() | ir::Remove::UNDEFINED) + op.getOutputs()) + for (const auto &index : + (op.getInputs() | ir::Remove::UNDEFINED) + (op.getOutputs() | ir::Remove::UNDEFINED)) { if (!tensor_builder->isRegistered(index) && !model_io.contains(index)) { @@ -200,11 +185,11 @@ void ExecutorFactory::runTensorRegistration(compiler::LoweredGraph *lowered_grap } } -std::vector<std::shared_ptr<backend::ITensor>> +std::vector<backend::ITensor *> ExecutorFactory::initializeModelIOTensors(compiler::LoweredGraph &lowered_graph, const ir::OperandIndexSequence &indices) { - std::vector<std::shared_ptr<backend::ITensor>> ret; + std::vector<backend::ITensor *> ret; // TODO Store controlflow backend in BackendContext std::shared_ptr<backend::controlflow::TensorBuilder> cf_tensor_builder; @@ -227,19 +212,20 @@ ExecutorFactory::initializeModelIOTensors(compiler::LoweredGraph &lowered_graph, for (auto ind : indices) { const auto &operand = lowered_graph.graph().operands().at(ind); - auto tensor = std::make_shared<backend::controlflow::UserTensor>( + auto tensor = std::make_unique<backend::controlflow::UserTensor>( operand.info(), - ir::Layout::NHWC, /* FIXME find op_seq for this operand and use frontend_layout */ - cf_tensor_builder->dynamicTensorManager()); + ir::Layout::NHWC /* FIXME find op_seq for this operand and use frontend_layout */ + ); // Add tensor to controlflow TensorRegistry. - cf_tensor_reg->setNativeUserTensor(ind, tensor); - ret.push_back(tensor); + cf_tensor_reg->setNativeUserTensor(ind, std::move(tensor)); + auto *itensor = cf_tensor_reg->getITensor(ind); + ret.push_back(itensor); } return ret; } -void ExecutorFactory::prepareExternalTensors(compiler::LoweredGraph &lowered_graph) +void ExecutorFactory::prepareMigrantTensors(compiler::LoweredGraph &lowered_graph) { TensorRegistries tensor_regs{lowered_graph.backend_contexts(), true}; @@ -251,13 +237,13 @@ void ExecutorFactory::prepareExternalTensors(compiler::LoweredGraph &lowered_gra ir::Remove::UNDEFINED) { // If an OpSequence input/output tensor does not have a own tensor object, - // it must be using external tensors, so find the tensor from other tensor builders and + // it must be using migrant tensors, so find the tensor from other tensor builders and // set the tensor to this tensor builder if portable if (!backend_ctx->tensor_registry->getITensor(ind)) { auto tensor = tensor_regs.getITensor(ind); assert(tensor); // The tensor must have been registered - auto ptensor = std::dynamic_pointer_cast<backend::IPortableTensor>(tensor); + auto ptensor = dynamic_cast<backend::IPortableTensor *>(tensor); if (ptensor) backend_ctx->tensor_registry->setMigrantTensor(ind, ptensor); } @@ -299,8 +285,8 @@ ExecutorFactory::createLinearExecutor(std::unique_ptr<compiler::LoweredGraph> lo auto order = Linear::linearize(*lowered_graph); runTensorRegistration(lowered_graph.get(), order); - std::vector<std::shared_ptr<backend::ITensor>> input_tensors; - std::vector<std::shared_ptr<backend::ITensor>> output_tensors; + std::vector<backend::ITensor *> input_tensors; + std::vector<backend::ITensor *> output_tensors; if (options.is_primary_subgraph) { input_tensors = 
initializeModelIOTensors(*lowered_graph, lowered_graph->graph().getInputs()); @@ -318,7 +304,7 @@ ExecutorFactory::createLinearExecutor(std::unique_ptr<compiler::LoweredGraph> lo tensor_builder->prepare(); } - prepareExternalTensors(*lowered_graph); + prepareMigrantTensors(*lowered_graph); ExecutionBuilder builder; @@ -370,10 +356,9 @@ ExecutorFactory::createLinearExecutor(std::unique_ptr<compiler::LoweredGraph> lo }); } - backend::TensorManagerSet tensor_mgrs = createTensorManagerSet(tensor_builders); - auto exec = new exec::LinearExecutor{ - std::move(lowered_graph), input_tensors, output_tensors, tensor_regs, - std::move(tensor_mgrs), std::move(code_map), order}; + auto exec = + new exec::LinearExecutor{std::move(lowered_graph), input_tensors, output_tensors, tensor_regs, + std::move(code_map), order}; if (!options.trace_filepath.empty()) { @@ -396,8 +381,8 @@ exec::IExecutor *ExecutorFactory::createDataflowExecutor( auto order = Linear::linearize(*lowered_graph); runTensorRegistration(lowered_graph.get(), order); - std::vector<std::shared_ptr<backend::ITensor>> input_tensors; - std::vector<std::shared_ptr<backend::ITensor>> output_tensors; + std::vector<backend::ITensor *> input_tensors; + std::vector<backend::ITensor *> output_tensors; if (options.is_primary_subgraph) { input_tensors = initializeModelIOTensors(*lowered_graph, lowered_graph->graph().getInputs()); @@ -424,7 +409,7 @@ exec::IExecutor *ExecutorFactory::createDataflowExecutor( tensor_builder->prepare(); } - prepareExternalTensors(*lowered_graph); + prepareMigrantTensors(*lowered_graph); ExecutionBuilder builder; @@ -477,20 +462,16 @@ exec::IExecutor *ExecutorFactory::createDataflowExecutor( }); } - backend::TensorManagerSet tensor_mgrs = createTensorManagerSet(tensor_builders); - exec::ExecutorBase *exec = nullptr; if (parallel) { - exec = new exec::ParallelExecutor{std::move(lowered_graph), input_tensors, - output_tensors, tensor_regs, - std::move(tensor_mgrs), std::move(code_map)}; + exec = new exec::ParallelExecutor{std::move(lowered_graph), input_tensors, output_tensors, + tensor_regs, std::move(code_map)}; } else { - auto dataflow_exec = new exec::DataflowExecutor{std::move(lowered_graph), input_tensors, - output_tensors, tensor_regs, - std::move(tensor_mgrs), std::move(code_map)}; + auto dataflow_exec = new exec::DataflowExecutor{ + std::move(lowered_graph), input_tensors, output_tensors, tensor_regs, std::move(code_map)}; if (options.he_profiling_mode) { std::vector<const backend::Backend *> backends; diff --git a/runtime/onert/core/src/compiler/ExecutorFactory.h b/runtime/onert/core/src/compiler/ExecutorFactory.h index b8893c03b..e76b721ea 100644 --- a/runtime/onert/core/src/compiler/ExecutorFactory.h +++ b/runtime/onert/core/src/compiler/ExecutorFactory.h @@ -46,10 +46,10 @@ private: static void initializeBackendContext(compiler::LoweredGraph *lowered_graph); static void runTensorRegistration(compiler::LoweredGraph *lowered_graph, const std::vector<ir::OpSequenceIndex> &order); - static std::vector<std::shared_ptr<backend::ITensor>> + static std::vector<backend::ITensor *> initializeModelIOTensors(compiler::LoweredGraph &lowered_graph, const ir::OperandIndexSequence &indices); - static void prepareExternalTensors(compiler::LoweredGraph &lowered_graph); + static void prepareMigrantTensors(compiler::LoweredGraph &lowered_graph); static exec::IExecutor * createLinearExecutor(std::unique_ptr<compiler::LoweredGraph> lowered_graph, const compiler::CompilerOptions &options, diff --git 
a/runtime/onert/core/src/compiler/HEScheduler.cc b/runtime/onert/core/src/compiler/HEScheduler.cc index 5653b090e..fe54b0fdd 100644 --- a/runtime/onert/core/src/compiler/HEScheduler.cc +++ b/runtime/onert/core/src/compiler/HEScheduler.cc @@ -34,7 +34,8 @@ namespace compiler static uint32_t getOperationsFlattenedIOSize(const ir::Graph &graph, const ir::Operation &node) { uint32_t size = 0; - for (const auto &ind : (node.getInputs() | ir::Remove::UNDEFINED) + node.getOutputs()) + for (const auto &ind : + (node.getInputs() | ir::Remove::UNDEFINED) + (node.getOutputs() | ir::Remove::UNDEFINED)) { size += graph.operands().at(ind).info().total_size(); } @@ -248,8 +249,9 @@ int64_t HEScheduler::getPermuteTime(const backend::Backend *src_backend, if (time != _exec_time->NOT_FOUND) return time; + // FIXME permute time is not recorded so the control reaches here always // Makes the scheduler prefer keeping computations on one backend - return size / 200; + return size / 400; } int64_t HEScheduler::tryBackend(const ir::Operation &node, const backend::Backend *backend) @@ -370,7 +372,7 @@ int64_t HEScheduler::DFSChildrenMaxRank(const ir::OperationIndex &index) { const auto &node = _graph->operations().at(index); int64_t max_child_rank = 0; - for (const auto &output : node.getOutputs()) + for (const auto &output : node.getOutputs() | ir::Remove::UNDEFINED) { const auto &operand = _graph->operands().at(output); const bool quant = operand.typeInfo().type() == ir::DataType::QUANT_UINT8_ASYMM; diff --git a/runtime/onert/core/src/compiler/Linear.cc b/runtime/onert/core/src/compiler/Linear.cc index 49a989500..39e58fe11 100644 --- a/runtime/onert/core/src/compiler/Linear.cc +++ b/runtime/onert/core/src/compiler/Linear.cc @@ -148,6 +148,9 @@ void Linear::planTensors(const compiler::LoweredGraph &lowered_graph, tensor_builder->notifyFirstUse(ind); } + const auto io_tensors = + (graph.getInputs() + graph.getOutputs()) | ir::Remove::DUPLICATED | ir::Remove::UNDEFINED; + // At each operation, // 1. Scan DEF of outputs. If the DEF, allocate it // 2. Scan USE of inputs. 
Decrease the USE and deallocate if the USE is 0 @@ -182,7 +185,15 @@ void Linear::planTensors(const compiler::LoweredGraph &lowered_graph, // plan for deallocation of dynamic tensor auto dyn_tensor_manager = tensor_builder_map[ind]->dynamicTensorManager(); if (dyn_tensor_manager) - dyn_tensor_manager->planDealloc(op_idx, ind); + { + const auto *backend = + lowered_graph.getLowerInfo(ind)->def_factors().getOnlyElement().backend(); + auto &tensor_registry = lowered_graph.backend_contexts().at(backend)->tensor_registry; + auto *tensor = tensor_registry->getITensor(ind); + assert(tensor); + if (!io_tensors.contains(ind)) // I/O tensors cannot be deallocated + dyn_tensor_manager->planDealloc(op_idx, tensor); + } } } } diff --git a/runtime/onert/core/src/compiler/LoweredGraph.cc b/runtime/onert/core/src/compiler/LoweredGraph.cc index 1489a1884..cdf1a8158 100644 --- a/runtime/onert/core/src/compiler/LoweredGraph.cc +++ b/runtime/onert/core/src/compiler/LoweredGraph.cc @@ -21,6 +21,7 @@ #include "util/logging.h" #include "compiler/pass/ConstantInsertionPass.h" #include "compiler/pass/ConstantLoweringPass.h" +#include "compiler/pass/PassRunner.h" #include "compiler/pass/PermutationOperationPass.h" #include "compiler/pass/PermutationInsertionPass.h" #include "compiler/pass/PermutationEliminationPass.h" @@ -101,14 +102,14 @@ LoweredGraph::LoweredGraph(const ir::Graph &graph, const CompilerOptions &option std::reverse(std::begin(op_seq.operations()), std::end(op_seq.operations())); }); - VERBOSE(OpSequences) << "dump without permutation" << std::endl; + VERBOSE(OpSequences) << "dump before permutation insertion" << std::endl; dumpOpSequences(_op_seqs, _graph.operations()); - pass::ConstantInsertionPass ci_pass(*this); - ci_pass.run(); - - pass::ConstantLoweringPass cl_pass(*this); - cl_pass.run(); + // Mandatory passes + pass::PassRunner{} + .append(std::make_unique<pass::ConstantInsertionPass>(*this)) + .append(std::make_unique<pass::ConstantLoweringPass>(*this)) + .run(); // Set LowerInfo for each operand from the operand::LowerInfo holder manipulateLowerInfo(operands_lower_info, options.is_primary_subgraph); @@ -116,20 +117,17 @@ LoweredGraph::LoweredGraph(const ir::Graph &graph, const CompilerOptions &option dumpLowerInfo(); } - // Run Permutation Passes - { - pass::PermutationOperationPass po_pass(*this); - po_pass.run(); - - pass::PermutationInsertionPass pi_pass(*this); - pi_pass.run(); + // Mandatory passes + pass::PassRunner{} + .append(std::make_unique<pass::PermutationOperationPass>(*this)) + .append(std::make_unique<pass::PermutationInsertionPass>(*this)) + .run(); - pass::PermutationEliminationPass pe_pass(*this); - pe_pass.run(); + // Optimization passes + pass::PassRunner{}.append(std::make_unique<pass::PermutationEliminationPass>(*this)).run(); - VERBOSE(OpSequences) << "dump with permutation" << std::endl; - dumpOpSequences(_op_seqs, _graph.operations()); - } + VERBOSE(OpSequences) << "Dump after permutation insertion" << std::endl; + dumpOpSequences(_op_seqs, _graph.operations()); // Graph verifications { @@ -276,7 +274,7 @@ void LoweredGraph::makeOpSequences( auto &&lower_info = operands_lower_info.at(operand); lower_info->addUsePermuteFactor(ir::operand::PermuteFactor{backend, backend_layout}); } - for (auto operand : node.getOutputs()) + for (auto operand : node.getOutputs() | ir::Remove::UNDEFINED) { auto &&lower_info = operands_lower_info.at(operand); lower_info->addDefPermuteFactor(ir::operand::PermuteFactor{backend, backend_layout}); @@ -340,7 +338,7 @@ void 
LoweredGraph::manipulateLowerInfo( assert(lower_info->def_factors().empty()); lower_info->addDefPermuteFactor(factor); } - for (auto index : _graph.getOutputs()) + for (auto index : _graph.getOutputs() | ir::Remove::UNDEFINED) { auto &&lower_info = operands_lower_info.at(index); lower_info->addUsePermuteFactor(factor); @@ -368,7 +366,7 @@ void LoweredGraph::manipulateLowerInfo( } } } - for (auto index : _graph.getOutputs()) + for (auto index : _graph.getOutputs() | ir::Remove::UNDEFINED) { auto &&lower_info = operands_lower_info.at(index); if (lower_info->def_factors().size() == 0) @@ -496,7 +494,7 @@ bool LoweredGraph::mergeable(const ir::OpSequenceIndex &op_seq_index, branched_set.clear(); // Check for branching down - for (const auto &output : node.getOutputs() | ir::Remove::DUPLICATED) + for (const auto &output : node.getOutputs() | ir::Remove::DUPLICATED | ir::Remove::UNDEFINED) { // TODO Fix this workaround for the case of model outputs that are used by another operation // This is needed since the branching is decided by operation, but for model outputs, @@ -544,7 +542,7 @@ bool LoweredGraph::mergeable(const ir::OpSequenceIndex &op_seq_index, } // node's input == op_seq's output? - for (const auto output : n.getOutputs()) + for (const auto output : n.getOutputs() | ir::Remove::UNDEFINED) { if (node_inputs.contains(output)) { diff --git a/runtime/onert/core/src/compiler/OperationValidator.cc b/runtime/onert/core/src/compiler/OperationValidator.cc index f7f659e3e..0582cf154 100644 --- a/runtime/onert/core/src/compiler/OperationValidator.cc +++ b/runtime/onert/core/src/compiler/OperationValidator.cc @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
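// The OperationValidator hunks that follow strip out every check that depends on concrete tensor
// shapes; those checks are presumably taken over by the new compiler::ShapeValidator referenced in
// the Compiler.cc hunk above, while OperationValidator keeps only shape-independent checks
// (operand data types, constant-ness, scalar parameters). A minimal sketch of that split, reusing
// the real OP_REQUIRES macro defined just below; the helper names and the NHWC assumption are
// illustrative only and not part of this patch:
//
//   // Shape-independent check: stays in OperationValidator
//   void validateDepthToSpaceParams(int32_t block_size)
//   {
//     OP_REQUIRES(block_size > 0);
//   }
//
//   // Shape-dependent check: belongs to ShapeValidator, mirroring the lines removed further down
//   void validateDepthToSpaceShapes(const ir::Shape &in, const ir::Shape &out, int32_t block_size)
//   {
//     OP_REQUIRES(in.rank() == 4 && out.rank() == 4);            // assumes NHWC feature maps
//     OP_REQUIRES(in.dim(3) % (block_size * block_size) == 0);   // channels divisible by block^2
//     OP_REQUIRES(out.dim(3) == in.dim(3) / (block_size * block_size));
//   }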
@@ -16,13 +16,7 @@ #include "OperationValidator.h" -#include <typeinfo> - #include "ir/Graph.h" -#include "ir/operation/LowerInfo.h" - -#include "util/logging.h" -#include "util/Utils.h" #define OP_REQUIRES(EXP) \ do \ @@ -37,33 +31,14 @@ namespace compiler { OperationValidator::OperationValidator(const ir::Graph &graph) - : _graph{graph}, _ctx{graph.operands()}, _current_op_seq_layout{ir::Layout::UNKNOWN} + : _graph{graph}, _ctx{graph.operands()} { } -void OperationValidator::checkUnaryOp(const ir::Operation &node) -{ - const auto output_index{node.getOutputs().at(0)}; - const auto input_index{node.getInputs().at(0)}; - - // Check if I/O types match - OP_REQUIRES(_ctx.at(output_index).typeInfo().type() == _ctx.at(input_index).typeInfo().type()); - - if (_ctx.at(output_index).info().isDynamic()) - return; - - // Check if I/O shapes match - OP_REQUIRES(_ctx.at(output_index).shape() == _ctx.at(input_index).shape()); -} - void OperationValidator::operator()() { - // There is no reason for each subgraph to have subgraphs since compiler has subgraphs when - // creating Compiler assert(_graph.subgraphs() == nullptr); - _current_op_seq_layout = _graph.layout(); - _graph.operations().iterate( [&](const ir::OperationIndex &, const ir::Operation &node) { node.accept(*this); }); } @@ -72,50 +47,23 @@ void OperationValidator::visit(const ir::operation::BatchMatMul &node) { const auto lhs_index(node.getInputs().at(ir::operation::BatchMatMul::Input::LHS)); const auto rhs_index(node.getInputs().at(ir::operation::BatchMatMul::Input::RHS)); - const auto out_index{node.getOutputs().at(0)}; // Constant lhs and rhs is not implemented yet OP_REQUIRES(!_ctx.at(lhs_index).isConstant() && !_ctx.at(rhs_index).isConstant()); - - if (_ctx.at(out_index).info().isDynamic()) - return; - - OP_REQUIRES(_ctx.at(lhs_index).shape().rank() <= 4); - OP_REQUIRES(_ctx.at(rhs_index).shape().rank() <= 4); - OP_REQUIRES(_ctx.at(lhs_index).shape().rank() >= 2); - OP_REQUIRES(_ctx.at(rhs_index).shape().rank() >= 2); } void OperationValidator::visit(const ir::operation::BatchToSpaceND &node) { - const auto ofm_index{node.getOutputs().at(0)}; - if (_ctx.at(ofm_index).info().isDynamic()) - return; - - const auto ifm_index{node.getInputs().at(ir::operation::BatchToSpaceND::Input::INPUT)}; const auto block_size_index{ node.getInputs().at(ir::operation::BatchToSpaceND::Input::BLOCK_SIZE)}; - const auto frontend_layout = _current_op_seq_layout; - const auto input_shape = _ctx.at(ifm_index).shape().asFeature(frontend_layout); - const auto output_shape = _ctx.at(ofm_index).shape().asFeature(frontend_layout); - - // All requirement as per NNAPI specification. - OP_REQUIRES(_ctx.at(ifm_index).shape().rank() == 4); - OP_REQUIRES(_ctx.at(ofm_index).shape().rank() == 4); - OP_REQUIRES(_ctx.at(block_size_index).shape().rank() == 1); - - OP_REQUIRES(_ctx.at(block_size_index).shape().dim(0) == 2); - + // Non-constant block_size is not implemented yet OP_REQUIRES(_ctx.at(block_size_index).isConstant()); - - OP_REQUIRES(input_shape.C == output_shape.C); } void OperationValidator::visit(const ir::operation::Comparison &node) { const auto output_index{node.getOutputs().at(0)}; - // This validator does not check shape. So checking isDynamic() is skipped. 
const auto lhs_index{node.getInputs().at(ir::operation::Comparison::Input::INPUT0)}; const auto rhs_index{node.getInputs().at(ir::operation::Comparison::Input::INPUT1)}; @@ -124,223 +72,20 @@ void OperationValidator::visit(const ir::operation::Comparison &node) OP_REQUIRES(_ctx.at(output_index).typeInfo().type() == ir::DataType::BOOL8); } -void OperationValidator::visit(const ir::operation::Softmax &node) -{ - VERBOSE(Softmax) << "Configure SOFTMAX operation" << std::endl; - - const auto output_index{node.getOutputs().at(0)}; - if (_ctx.at(output_index).info().isDynamic()) - return; - - const auto input_index{node.getInputs().at(0)}; - - OP_REQUIRES(_ctx.at(output_index).shape().rank() == _ctx.at(input_index).shape().rank()); -} - -void OperationValidator::visit(const ir::operation::InstanceNorm &node) -{ - const auto ofm_index{node.getOutputs().at(0)}; - if (_ctx.at(ofm_index).info().isDynamic()) - return; - - const auto ifm_index{node.getInputs().at(ir::operation::InstanceNorm::Input::INPUT)}; - const auto gamma_index{node.getInputs().at(ir::operation::InstanceNorm::Input::GAMMA)}; - const auto beta_index{node.getInputs().at(ir::operation::InstanceNorm::Input::BETA)}; - - OP_REQUIRES(_ctx.at(ifm_index).shape().rank() == 4); - OP_REQUIRES(_ctx.at(ifm_index).shape() == _ctx.at(ofm_index).shape()); - OP_REQUIRES(_ctx.at(gamma_index).shape().rank() == 1); - OP_REQUIRES(_ctx.at(beta_index).shape().rank() == 1); -} - -void OperationValidator::visit(const ir::operation::Pool2D &node) +void OperationValidator::visit(const ir::operation::DepthToSpace &node) { - const auto ofm_index{node.getOutputs().at(0)}; - if (_ctx.at(ofm_index).info().isDynamic()) - return; - - const auto ifm_index{node.getInputs().at(ir::operation::Pool2D::Input::INPUT)}; + int32_t block_size = node.param().block_size; - OP_REQUIRES(_ctx.at(ifm_index).shape().rank() == 4); + OP_REQUIRES(block_size > 0); } -void OperationValidator::visit(const ir::operation::Permute &node) +void OperationValidator::visit(const ir::operation::ElementwiseActivation &node) { - VERBOSE(Permute) << "Configure Permute operation" << std::endl; - const auto output_index{node.getOutputs().at(0)}; - if (_ctx.at(output_index).info().isDynamic()) - return; - const auto input_index{node.getInputs().at(0)}; - OP_REQUIRES(_ctx.at(output_index).shape().rank() == _ctx.at(input_index).shape().rank()); -} - -void OperationValidator::visit(const ir::operation::Reduce &node) -{ - VERBOSE(Permute) << "Configure " + node.name() + " operation" << std::endl; - - const auto output_index{node.getOutputs().at(0)}; - if (_ctx.at(output_index).info().isDynamic()) - return; - - const auto input_index{node.getInputs().at(ir::operation::Reduce::Input::INPUT)}; - const auto input_shape = _ctx.at(input_index).shape(); - const auto output_shape = _ctx.at(output_index).shape(); - - OP_REQUIRES(input_shape.rank() <= 4); - OP_REQUIRES(output_shape.rank() <= input_shape.rank()); - - // NOTE For the 4-dimensions, if the rank of input and output are different, this runtime only - // supports cases reducing height and width or reducing depth. - // TODO We have to support all cases of dimensions up to 4. - // For correct permuting, we have to set output's shape to be equal in dimension position of the - // input. But the positions of the same dimensions in the input and output may be set differently. - // For example {2,3,4,5}(input's shape) can be reduced to {3,5}(output's shape). The original - // output shape should be {1,3,1,5}, but real output shape may be {3,5}. 
If you simply try to - // extend it in 4 dimensions, it should be {1,1,3,5}. - // Even if output shape is changed to {1,3,1,5}, there is another problem. It is that shape of - // output tensor used at next operation is changed to {1,3,1,5} after this operation even if the - // next operation is not desired. - if (input_shape.rank() == 4 && input_shape.rank() != output_shape.rank()) - { - if (output_shape.rank() == 2) - { - // Reducing HW - OP_REQUIRES(input_shape.dim(0) == output_shape.dim(0) && - input_shape.dim(3) == output_shape.dim(1)); - } - else if (output_shape.rank() == 3) - { - // Reducing C or - // (Reducing H and C(input and output) == 1) or (Reducing W and C(input and output) == 1) - OP_REQUIRES((input_shape.dim(0) == output_shape.dim(0) && - input_shape.dim(1) == output_shape.dim(1) && - input_shape.dim(2) == output_shape.dim(2)) || - (input_shape.dim(0) == output_shape.dim(0) && - (input_shape.dim(1) == output_shape.dim(1) || - input_shape.dim(2) == output_shape.dim(1)) && - input_shape.dim(3) == 1 && output_shape.dim(2) == 1)); - } - } -} - -void OperationValidator::visit(const ir::operation::Transpose &node) -{ - const auto output_index{node.getOutputs().at(0)}; - if (_ctx.at(output_index).info().isDynamic()) - return; - - const auto input_index{node.getInputs().at(ir::operation::Transpose::Input::INPUT)}; - const auto &perm{node.param().perm}; - - const auto &output_shape = _ctx.at(output_index).shape(); - const auto &input_shape = _ctx.at(input_index).shape(); - - OP_REQUIRES(input_shape.rank() == static_cast<int>(perm.size())); - OP_REQUIRES(input_shape.rank() == output_shape.rank()); -} - -void OperationValidator::visit(const ir::operation::RNN &node) -{ - // NOTE This validation is for static rnn(non-dynamic shape), but not for dynamic rnn - // TODO Support dynamic rnn - const auto output_index{node.getOutputs().at(ir::operation::RNN::Output::OUTPUT)}; - if (_ctx.at(output_index).info().isDynamic()) - return; - - const auto hidden_state_out_index{ - node.getOutputs().at(ir::operation::RNN::Output::HIDDEN_STATE_OUT)}; - - const auto input_index{node.getInputs().at(ir::operation::RNN::Input::INPUT)}; - const auto weights_index{node.getInputs().at(ir::operation::RNN::Input::WEIGHTS)}; - const auto recurrent_weights_index{ - node.getInputs().at(ir::operation::RNN::Input::RECURRENT_WEIGHTS)}; - const auto bias_index{node.getInputs().at(ir::operation::RNN::Input::BIAS)}; - const auto hidden_state_in_index{node.getInputs().at(ir::operation::RNN::Input::HIDDEN_STATE_IN)}; - - const auto batch_size = _ctx.at(output_index).shape().dim(0); - const auto num_units = _ctx.at(output_index).shape().dim(1); - - OP_REQUIRES(_ctx.at(output_index).shape().rank() == 2 && - _ctx.at(hidden_state_out_index).shape().rank() == 2 && - _ctx.at(input_index).shape().rank() == 2 && - _ctx.at(weights_index).shape().rank() == 2 && - _ctx.at(recurrent_weights_index).shape().rank() == 2 && - _ctx.at(hidden_state_in_index).shape().rank() == 2); - OP_REQUIRES(_ctx.at(bias_index).shape().rank() == 1); - - OP_REQUIRES(batch_size == _ctx.at(input_index).shape().dim(0) && - batch_size == _ctx.at(hidden_state_in_index).shape().dim(0) && - batch_size == _ctx.at(hidden_state_out_index).shape().dim(0)); - OP_REQUIRES(_ctx.at(input_index).shape().dim(1) == _ctx.at(weights_index).shape().dim(1)); - - OP_REQUIRES(num_units == _ctx.at(weights_index).shape().dim(0) && - num_units == _ctx.at(recurrent_weights_index).shape().dim(0) && - num_units == _ctx.at(bias_index).shape().dim(0)); - OP_REQUIRES(num_units == 
_ctx.at(output_index).shape().dim(1) && - num_units == _ctx.at(recurrent_weights_index).shape().dim(1) && - num_units == _ctx.at(hidden_state_in_index).shape().dim(1) && - num_units == _ctx.at(hidden_state_out_index).shape().dim(1)); -} - -void OperationValidator::visit(const ir::operation::SpaceToBatchND &node) -{ - const auto ofm_index{node.getOutputs().at(0)}; - if (_ctx.at(ofm_index).info().isDynamic()) - return; - - const auto ifm_index{node.getInputs().at(ir::operation::SpaceToBatchND::Input::INPUT)}; - const auto block_size_index{ - node.getInputs().at(ir::operation::SpaceToBatchND::Input::BLOCK_SIZE)}; - const auto paddings_index{node.getInputs().at(ir::operation::SpaceToBatchND::Input::PADDINGS)}; - - const auto frontend_layout = _current_op_seq_layout; - const auto input_shape = _ctx.at(ifm_index).shape().asFeature(frontend_layout); - const auto output_shape = _ctx.at(ofm_index).shape().asFeature(frontend_layout); - - // All requirement as per NNAPI specification. - OP_REQUIRES(_ctx.at(ifm_index).shape().rank() == 4); - OP_REQUIRES(_ctx.at(ofm_index).shape().rank() == 4); - OP_REQUIRES(_ctx.at(block_size_index).shape().rank() == 1); - OP_REQUIRES(_ctx.at(paddings_index).shape().rank() == 2); - - OP_REQUIRES(_ctx.at(block_size_index).shape().dim(0) == 2); - OP_REQUIRES(_ctx.at(paddings_index).shape().dim(0) == 2); - OP_REQUIRES(_ctx.at(paddings_index).shape().dim(1) == 2); - - OP_REQUIRES(_ctx.at(block_size_index).isConstant()); - OP_REQUIRES(_ctx.at(paddings_index).isConstant()); - - OP_REQUIRES(input_shape.C == output_shape.C); -} - -void OperationValidator::visit(const ir::operation::SpaceToDepth &node) -{ - const auto ofm_index{node.getOutputs().at(0)}; - if (_ctx.at(ofm_index).info().isDynamic()) - return; - - const auto ifm_index{node.getInputs().at(ir::operation::SpaceToDepth::Input::INPUT)}; - - const auto frontend_layout = _current_op_seq_layout; - const auto input_shape = _ctx.at(ifm_index).shape().asFeature(frontend_layout); - const auto output_shape = _ctx.at(ofm_index).shape().asFeature(frontend_layout); - const auto block_size = node.param().block_size; - - // All assertions as per NNAPI specification. 
- OP_REQUIRES(_ctx.at(ifm_index).shape().rank() == 4); - OP_REQUIRES(_ctx.at(ofm_index).shape().rank() == 4); - OP_REQUIRES((block_size >= 1) && (input_shape.H % block_size == 0) && - (input_shape.W % block_size == 0)); - OP_REQUIRES(input_shape.N == output_shape.N); - OP_REQUIRES(input_shape.C * block_size * block_size == output_shape.C); -} - -void OperationValidator::visit(const ir::operation::ElementwiseActivation &node) -{ - checkUnaryOp(node); + // Check if I/O types match + OP_REQUIRES(_ctx.at(output_index).typeInfo().type() == _ctx.at(input_index).typeInfo().type()); } void OperationValidator::visit(const ir::operation::ElementwiseBinary &node) @@ -358,9 +103,6 @@ void OperationValidator::visit(const ir::operation::ElementwiseUnary &node) const auto output_index{node.getOutputs().at(0)}; const auto input_index{node.getInputs().at(ir::operation::ElementwiseUnary::Input::INPUT)}; - OP_REQUIRES(node.getInputs().size() == 1); - OP_REQUIRES(node.getOutputs().size() == 1); - // Check if I/O types match if (node.param().op_type == ir::operation::ElementwiseUnary::Type::DEQUANTIZE) { @@ -376,47 +118,13 @@ void OperationValidator::visit(const ir::operation::ElementwiseUnary &node) { OP_REQUIRES(_ctx.at(output_index).typeInfo().type() == _ctx.at(input_index).typeInfo().type()); } - - if (_ctx.at(output_index).info().isDynamic()) - return; - - OP_REQUIRES(_ctx.at(output_index).shape() == _ctx.at(input_index).shape()); } void OperationValidator::visit(const ir::operation::EmbeddingLookup &node) { - const auto output_index{node.getOutputs().at(0)}; const auto lookups_index{node.getInputs().at(ir::operation::EmbeddingLookup::Input::LOOKUPS)}; - const auto values_index{node.getInputs().at(ir::operation::EmbeddingLookup::Input::VALUES)}; - - const auto &output_obj = _ctx.at(output_index); - const auto &lookups_obj = _ctx.at(lookups_index); - const auto &values_obj = _ctx.at(values_index); - - // Verify operand here, not at SimpleEmbeddingLookup::configure() to avoid acl's modifying - // TensorShape sometimes(Issue: https://github.sec.samsung.net/STAR/nnfw/issues/729) - { - OP_REQUIRES(lookups_obj.typeInfo().type() == ir::DataType::INT32); - - if (_ctx.at(output_index).info().isDynamic()) - return; - const auto &output_shape = output_obj.shape(); - const auto &lookups_shape = lookups_obj.shape(); - const auto &values_shape = values_obj.shape(); - - OP_REQUIRES(lookups_shape.rank() == 1); - OP_REQUIRES(values_shape.rank() >= 2); - - // output should be a n-D tensor with the same rank and shape as the values tensor, except for - // the first dimension which has the same size as lookups' only dimension. 
- OP_REQUIRES(output_shape.rank() == values_shape.rank()); - OP_REQUIRES(output_shape.dim(0) == lookups_shape.dim(0)); - for (int n = 1; n < output_shape.rank(); ++n) - { - OP_REQUIRES(output_shape.dim(n) == values_shape.dim(n)); - } - } + OP_REQUIRES(_ctx.at(lookups_index).typeInfo().type() == ir::DataType::INT32); } void OperationValidator::visit(const ir::operation::ExpandDims &node) @@ -427,488 +135,35 @@ void OperationValidator::visit(const ir::operation::ExpandDims &node) OP_REQUIRES(_ctx.at(output_index).typeInfo().type() == _ctx.at(input_index).typeInfo().type()); OP_REQUIRES(_ctx.at(axis_index).typeInfo().type() == ir::DataType::INT32); - - if (_ctx.at(axis_index).info().isDynamic()) - return; - OP_REQUIRES(_ctx.at(axis_index).shape().rank() <= 1); } void OperationValidator::visit(const ir::operation::HashtableLookup &node) { - const auto output_index{node.getOutputs().at(ir::operation::HashtableLookup::Output::OUTPUT)}; const auto hits_index{node.getOutputs().at(ir::operation::HashtableLookup::Output::HITS)}; - const auto lookups_index{node.getInputs().at(ir::operation::HashtableLookup::Input::LOOKUPS)}; const auto keys_index{node.getInputs().at(ir::operation::HashtableLookup::Input::KEYS)}; - const auto values_index{node.getInputs().at(ir::operation::HashtableLookup::Input::VALUES)}; - - const auto &output_obj = _ctx.at(output_index); - const auto &hits_obj = _ctx.at(hits_index); - - const auto &lookups_obj = _ctx.at(lookups_index); - const auto &keys_obj = _ctx.at(keys_index); - const auto &values_obj = _ctx.at(values_index); - - OP_REQUIRES(lookups_obj.typeInfo().type() == ir::DataType::INT32); - OP_REQUIRES(keys_obj.typeInfo().type() == ir::DataType::INT32); - OP_REQUIRES(hits_obj.typeInfo().type() == ir::DataType::QUANT_UINT8_ASYMM); - - if (_ctx.at(output_index).info().isDynamic()) - return; - const auto &output_shape = output_obj.shape(); - const auto &lookups_shape = lookups_obj.shape(); - const auto &keys_shape = keys_obj.shape(); - const auto &values_shape = values_obj.shape(); - - OP_REQUIRES(values_shape.rank() == output_shape.rank()); - OP_REQUIRES(lookups_shape.rank() == 1); - OP_REQUIRES(keys_shape.rank() == 1); - OP_REQUIRES(values_shape.dim(0) == keys_shape.dim(0)); - OP_REQUIRES(lookups_shape.dim(0) == output_shape.dim(0)); -} - -void OperationValidator::visit(const ir::operation::TransposeConv &node) -{ - // param check - OP_REQUIRES((node.param().padding.type == ir::PaddingType::SAME) || - (node.param().padding.type == ir::PaddingType::VALID)); - - // shape check - const auto ofm_index{node.getOutputs().at(0)}; - if (_ctx.at(ofm_index).info().isDynamic()) - return; - - const auto ifm_index{node.getInputs().at(ir::operation::TransposeConv::Input::INPUT)}; - const auto ker_index{node.getInputs().at(ir::operation::TransposeConv::Input::KERNEL)}; - - // Only 4D tensors are supported - OP_REQUIRES(_ctx.at(ofm_index).shape().rank() == 4); - OP_REQUIRES(_ctx.at(ofm_index).shape().rank() == _ctx.at(ifm_index).shape().rank()); - OP_REQUIRES(_ctx.at(ofm_index).shape().rank() == _ctx.at(ker_index).shape().rank()); - - const auto frontend_layout = _current_op_seq_layout; - const auto ofm_shape = _ctx.at(ofm_index).shape().asFeature(frontend_layout); - const auto ifm_shape = _ctx.at(ifm_index).shape().asFeature(frontend_layout); - // The kernel has only IHWO layout on frontend - // So ker_shape is treated here below - // I -> N - // H -> H - // W -> W - // O -> C - const auto ker_shape = _ctx.at(ker_index).shape().asFeature(ir::Layout::NHWC); - - OP_REQUIRES(ifm_shape.N == 
ofm_shape.N); - OP_REQUIRES(ifm_shape.C == ker_shape.C); - OP_REQUIRES(ker_shape.N == ofm_shape.C); -} - -void OperationValidator::visit(const ir::operation::Gather &node) -{ - const auto ofm_index{node.getOutputs().at(0)}; - if (_ctx.at(ofm_index).info().isDynamic()) - return; - - const auto ifm_index{node.getInputs().at(ir::operation::Gather::Input::INPUT)}; - const auto indices_index{node.getInputs().at(ir::operation::Gather::Input::INDICES)}; - - const auto ifm_shape = _ctx.at(ifm_index).shape(); - const auto indices_shape = _ctx.at(indices_index).shape(); - const auto ofm_shape = _ctx.at(ofm_index).shape(); - - OP_REQUIRES(ifm_shape.rank() <= 4); - OP_REQUIRES(indices_shape.rank() <= 3); - OP_REQUIRES(ofm_shape.rank() <= 4); -} - -void OperationValidator::visit(const ir::operation::DepthToSpace &node) -{ - // param check - int32_t block_size = node.param().block_size; - - OP_REQUIRES(block_size > 0); - - // shape check - const auto output_index{node.getOutputs().at(0)}; - if (_ctx.at(output_index).info().isDynamic()) - return; - - const auto input_index{node.getInputs().at(ir::operation::DepthToSpace::Input::INPUT)}; - - const auto frontend_layout = _current_op_seq_layout; - const auto output_shape = _ctx.at(output_index).shape().asFeature(frontend_layout); - const auto input_shape = _ctx.at(input_index).shape().asFeature(frontend_layout); - - OP_REQUIRES(_ctx.at(input_index).shape().rank() == 4); - OP_REQUIRES(_ctx.at(output_index).shape().rank() == 4); - - { - OP_REQUIRES(output_shape.N == input_shape.N); - OP_REQUIRES(output_shape.H == input_shape.H * block_size); - OP_REQUIRES(output_shape.W == input_shape.W * block_size); - OP_REQUIRES(input_shape.C % (block_size * block_size) == 0); - OP_REQUIRES(output_shape.C == input_shape.C / (block_size * block_size)); - } + OP_REQUIRES(_ctx.at(lookups_index).typeInfo().type() == ir::DataType::INT32); + OP_REQUIRES(_ctx.at(keys_index).typeInfo().type() == ir::DataType::INT32); + OP_REQUIRES(_ctx.at(hits_index).typeInfo().type() == ir::DataType::QUANT_UINT8_ASYMM); } void OperationValidator::visit(const ir::operation::Pack &node) { - // param check const auto num{node.param().num}; - const auto axis{node.param().axis}; - OP_REQUIRES(num == static_cast<int32_t>(node.getInputs().size())); - - const auto output_index{node.getOutputs().at(0)}; - if (_ctx.at(output_index).info().isDynamic()) - return; - - // shape check - const auto &output_shape = _ctx.at(output_index).shape(); - const auto output_rank = static_cast<int32_t>(output_shape.rank()); - const auto input1_index{node.getInputs().at(0)}; - const auto input_shape = _ctx.at(input1_index).shape(); - - OP_REQUIRES(axis >= -output_rank && axis < output_rank); - for (const auto &index : node.getInputs()) - { - OP_REQUIRES(input_shape == _ctx.at(index).shape()); - } -} - -void OperationValidator::visit(const ir::operation::LSTM &node) -{ - // NOTE This validation is for static rnn(non-dynamic shape), but not for dynamic rnn - // TODO Support dynamic rnn - const auto output_index{node.getOutputs().at(ir::operation::LSTM::Output::OUTPUT)}; - if (_ctx.at(output_index).info().isDynamic()) - return; - - const auto scratch_buffer_index{ - node.getOutputs().at(ir::operation::LSTM::Output::SCRATCH_BUFFER)}; - const auto output_state_out_index{ - node.getOutputs().at(ir::operation::LSTM::Output::OUTPUT_STATE_OUT)}; - const auto cell_state_out_index{ - node.getOutputs().at(ir::operation::LSTM::Output::CELL_STATE_OUT)}; - - const auto input_index{node.getInputs().at(ir::operation::LSTM::Input::INPUT)}; - 
const auto input_to_input_weights_index{ - node.getInputs().at(ir::operation::LSTM::Input::INPUT_TO_INPUT_WEIGHTS)}; - const auto input_to_forget_weights_index{ - node.getInputs().at(ir::operation::LSTM::Input::INPUT_TO_FORGET_WEIGHTS)}; - const auto input_to_cell_weights_index{ - node.getInputs().at(ir::operation::LSTM::Input::INPUT_TO_CELL_WEIGHTS)}; - const auto input_to_output_weights_index{ - node.getInputs().at(ir::operation::LSTM::Input::INPUT_TO_OUTPUT_WEIGHTS)}; - const auto recurrent_to_input_weights_index{ - node.getInputs().at(ir::operation::LSTM::Input::RECURRENT_TO_INPUT_WEIGHTS)}; - const auto recurrent_to_forget_weights_index{ - node.getInputs().at(ir::operation::LSTM::Input::RECURRENT_TO_FORGET_WEIGHTS)}; - const auto recurrent_to_cell_weights_index{ - node.getInputs().at(ir::operation::LSTM::Input::RECURRENT_TO_CELL_WEIGHTS)}; - const auto recurrent_to_output_weights_index{ - node.getInputs().at(ir::operation::LSTM::Input::RECURRENT_TO_OUTPUT_WEIGHTS)}; - const auto cell_to_input_weights_index{ - node.getInputs().at(ir::operation::LSTM::Input::CELL_TO_INPUT_WEIGHTS)}; - const auto cell_to_forget_weights_index{ - node.getInputs().at(ir::operation::LSTM::Input::CELL_TO_FORGET_WEIGHTS)}; - const auto cell_to_output_weights_index{ - node.getInputs().at(ir::operation::LSTM::Input::CELL_TO_OUTPUT_WEIGHTS)}; - const auto input_gate_bias_index{ - node.getInputs().at(ir::operation::LSTM::Input::INPUT_GATE_BIAS)}; - const auto forget_gate_bias_index{ - node.getInputs().at(ir::operation::LSTM::Input::FORGET_GATE_BIAS)}; - const auto cell_bias_index{node.getInputs().at(ir::operation::LSTM::Input::CELL_BIAS)}; - const auto output_gate_bias_index{ - node.getInputs().at(ir::operation::LSTM::Input::OUTPUT_GATE_BIAS)}; - const auto projection_weights_index{ - node.getInputs().at(ir::operation::LSTM::Input::PROJECTION_WEIGHTS)}; - const auto projection_bias_index{ - node.getInputs().at(ir::operation::LSTM::Input::PROJECTION_BIAS)}; - const auto output_state_in_index{ - node.getInputs().at(ir::operation::LSTM::Input::OUTPUT_STATE_IN)}; - const auto cell_state_in_index{node.getInputs().at(ir::operation::LSTM::Input::CELL_STATE_IN)}; - - OP_REQUIRES(_ctx.at(scratch_buffer_index).shape().rank() == 2 && - _ctx.at(output_state_out_index).shape().rank() == 2 && - _ctx.at(cell_state_out_index).shape().rank() == 2 && - _ctx.at(output_index).shape().rank() == 2 && - _ctx.at(input_index).shape().rank() == 2 && - _ctx.at(input_to_input_weights_index).shape().rank() == 2 && - _ctx.at(input_to_forget_weights_index).shape().rank() == 2 && - _ctx.at(input_to_cell_weights_index).shape().rank() == 2 && - _ctx.at(input_to_output_weights_index).shape().rank() == 2 && - _ctx.at(recurrent_to_input_weights_index).shape().rank() == 2 && - _ctx.at(recurrent_to_forget_weights_index).shape().rank() == 2 && - _ctx.at(recurrent_to_cell_weights_index).shape().rank() == 2 && - _ctx.at(recurrent_to_output_weights_index).shape().rank() == 2 && - _ctx.at(projection_weights_index).shape().rank() == 2 && - _ctx.at(output_state_in_index).shape().rank() == 2 && - _ctx.at(cell_state_in_index).shape().rank() == 2); - - OP_REQUIRES(_ctx.at(cell_to_input_weights_index).shape().rank() == 1 && - _ctx.at(cell_to_forget_weights_index).shape().rank() == 1 && - _ctx.at(cell_to_output_weights_index).shape().rank() == 1 && - _ctx.at(input_gate_bias_index).shape().rank() == 1 && - _ctx.at(forget_gate_bias_index).shape().rank() == 1 && - _ctx.at(cell_bias_index).shape().rank() == 1 && - _ctx.at(output_gate_bias_index).shape().rank() == 1 && 
- _ctx.at(projection_bias_index).shape().rank() == 1); - - // CIFG assertion - OP_REQUIRES((_ctx.at(input_to_input_weights_index).shape().dim(0) == 0 && - _ctx.at(input_to_input_weights_index).shape().dim(1) == 0 && - _ctx.at(recurrent_to_input_weights_index).shape().dim(0) == 0 && - _ctx.at(recurrent_to_input_weights_index).shape().dim(1) == 0 && - _ctx.at(input_gate_bias_index).shape().dim(0) == 0 && - _ctx.at(cell_to_input_weights_index).shape().dim(0) == 0) || - (_ctx.at(input_to_input_weights_index).shape().dim(0) != 0 && - _ctx.at(input_to_input_weights_index).shape().dim(1) != 0 && - _ctx.at(recurrent_to_input_weights_index).shape().dim(0) != 0 && - _ctx.at(recurrent_to_input_weights_index).shape().dim(1) != 0 && - _ctx.at(input_gate_bias_index).shape().dim(0) != 0)); - - // Peephole assertion - OP_REQUIRES((_ctx.at(cell_to_forget_weights_index).shape().dim(0) == 0 && - _ctx.at(cell_to_output_weights_index).shape().dim(0) == 0) || - (_ctx.at(cell_to_forget_weights_index).shape().dim(0) != 0 && - _ctx.at(cell_to_output_weights_index).shape().dim(0) != 0)); - - bool has_input_to_input_weights = _ctx.at(input_to_input_weights_index).shape().dim(0) != 0 && - _ctx.at(input_to_input_weights_index).shape().dim(1) != 0; - bool has_recurrent_to_input_weights = - _ctx.at(recurrent_to_input_weights_index).shape().dim(0) != 0 && - _ctx.at(recurrent_to_input_weights_index).shape().dim(1) != 0; - bool has_input_gate_bias = _ctx.at(input_gate_bias_index).shape().dim(0) != 0; - bool has_cell_to_input_weights = _ctx.at(cell_to_input_weights_index).shape().dim(0) != 0; - bool has_cell_to_forget_weights = _ctx.at(cell_to_forget_weights_index).shape().dim(0) != 0; - bool has_cell_to_output_weights = _ctx.at(cell_to_output_weights_index).shape().dim(0) != 0; - bool has_projection_weights = _ctx.at(projection_weights_index).shape().dim(0) != 0 && - _ctx.at(projection_weights_index).shape().dim(1) != 0; - bool has_projection_bias = _ctx.at(projection_bias_index).shape().dim(0); - - // NOTE The cell_to_input_weights do not exist in non-peephole although regular LSTM(non-CIFG). - // true: no CIFG - // false: CIFG - bool has_cifg_param = has_input_to_input_weights && has_recurrent_to_input_weights; - - // NOTE The cell_to_input_weights do not exist in regular CIFG although peephole. - // true: peephole - // false: no peephole - bool has_peephole_param = has_cell_to_forget_weights && has_cell_to_output_weights; - - // NOTE The projection weights may have data but the projection bias may not. 
- bool has_projection_param = has_projection_weights; - - const auto batch_size = _ctx.at(input_index).shape().dim(0); - OP_REQUIRES(batch_size == _ctx.at(output_state_in_index).shape().dim(0) && - batch_size == _ctx.at(cell_state_in_index).shape().dim(0) && - batch_size == _ctx.at(scratch_buffer_index).shape().dim(0) && - batch_size == _ctx.at(output_state_out_index).shape().dim(0) && - batch_size == _ctx.at(cell_state_out_index).shape().dim(0) && - batch_size == _ctx.at(output_index).shape().dim(0)); - - const auto input_size = _ctx.at(input_index).shape().dim(1); - OP_REQUIRES(input_size == _ctx.at(input_to_forget_weights_index).shape().dim(1) && - input_size == _ctx.at(input_to_cell_weights_index).shape().dim(1) && - input_size == _ctx.at(input_to_output_weights_index).shape().dim(1)); - - const auto num_units = _ctx.at(cell_state_out_index).shape().dim(1); - OP_REQUIRES(num_units == _ctx.at(input_to_forget_weights_index).shape().dim(0) && - num_units == _ctx.at(input_to_cell_weights_index).shape().dim(0) && - num_units == _ctx.at(input_to_output_weights_index).shape().dim(0) && - num_units == _ctx.at(recurrent_to_forget_weights_index).shape().dim(0) && - num_units == _ctx.at(recurrent_to_cell_weights_index).shape().dim(0) && - num_units == _ctx.at(recurrent_to_output_weights_index).shape().dim(0) && - num_units == _ctx.at(forget_gate_bias_index).shape().dim(0) && - num_units == _ctx.at(cell_bias_index).shape().dim(0) && - num_units == _ctx.at(output_gate_bias_index).shape().dim(0) && - num_units == _ctx.at(cell_state_in_index).shape().dim(1) && - (((num_units * 3) == _ctx.at(scratch_buffer_index).shape().dim(1)) || - ((num_units * 4) == _ctx.at(scratch_buffer_index).shape().dim(1)))); - - const auto output_size = _ctx.at(output_index).shape().dim(1); - OP_REQUIRES(output_size == _ctx.at(recurrent_to_forget_weights_index).shape().dim(1) && - output_size == _ctx.at(recurrent_to_cell_weights_index).shape().dim(1) && - output_size == _ctx.at(recurrent_to_output_weights_index).shape().dim(1) && - output_size == _ctx.at(output_state_in_index).shape().dim(1) && - output_size == _ctx.at(output_state_out_index).shape().dim(1)); - - if (has_cifg_param) - { - OP_REQUIRES(input_size == _ctx.at(input_to_input_weights_index).shape().dim(1)); - OP_REQUIRES(num_units == _ctx.at(input_to_input_weights_index).shape().dim(0) && - num_units == _ctx.at(recurrent_to_input_weights_index).shape().dim(0) && - (num_units == _ctx.at(cell_to_input_weights_index).shape().dim(0) || - _ctx.at(cell_to_input_weights_index).shape().dim(0) == 0 /* non-peephole */) && - num_units == _ctx.at(input_gate_bias_index).shape().dim(0)); - OP_REQUIRES(output_size == _ctx.at(recurrent_to_input_weights_index).shape().dim(1)); - OP_REQUIRES(has_input_to_input_weights && has_recurrent_to_input_weights && - has_input_gate_bias); - if (has_cell_to_input_weights) - { - // NOTE The cell_to_input_weights exist only in case of non-CIFG and peephole. 
- OP_REQUIRES(has_peephole_param); - } - OP_REQUIRES(_ctx.at(scratch_buffer_index).shape().dim(1) == num_units * 4); - } - else - { - OP_REQUIRES(_ctx.at(scratch_buffer_index).shape().dim(1) == num_units * 3); - } - - if (has_peephole_param) - { - OP_REQUIRES(num_units == _ctx.at(cell_to_forget_weights_index).shape().dim(0) && - num_units == _ctx.at(cell_to_output_weights_index).shape().dim(0) && - (num_units == _ctx.at(cell_to_input_weights_index).shape().dim(0) || - _ctx.at(cell_to_input_weights_index).shape().dim(0) == 0 /* CIFG */)); - } - - if (has_projection_param) - { - OP_REQUIRES(num_units == _ctx.at(projection_weights_index).shape().dim(1)); - OP_REQUIRES(output_size == _ctx.at(projection_weights_index).shape().dim(0)); - if (has_projection_bias) - { - OP_REQUIRES(output_size == _ctx.at(projection_bias_index).shape().dim(0)); - } - } -} - -void OperationValidator::visit(const ir::operation::L2Normalization &node) -{ - const auto ofm_index{node.getOutputs().at(0)}; - if (_ctx.at(ofm_index).info().isDynamic()) - return; - - const auto ifm_index{node.getInputs().at(ir::operation::L2Normalization::Input::INPUT)}; - - auto ifm_shape = _ctx.at(ifm_index).shape(); - auto ofm_shape = _ctx.at(ofm_index).shape(); - - OP_REQUIRES(ifm_shape.rank() == ofm_shape.rank()); - - for (auto i = 0; i < ifm_shape.rank(); i++) - { - OP_REQUIRES(ifm_shape.dim(i) == ofm_shape.dim(i)); - } -} - -void OperationValidator::visit(const ir::operation::Unpack &node) -{ - const auto num{node.param().num}; - OP_REQUIRES(num == static_cast<int32_t>(node.getOutputs().size())); - const auto axis{node.param().axis}; - - const auto output_index{node.getInputs().at(0)}; - if (_ctx.at(output_index).info().isDynamic()) - return; - - const auto input_index{node.getInputs().at(ir::operation::Unpack::Input::INPUT)}; - - const auto &input_shape = _ctx.at(input_index).shape(); - const auto input_rank = static_cast<int32_t>(input_shape.rank()); - - OP_REQUIRES(axis >= -input_rank && axis < input_rank); + OP_REQUIRES(num == static_cast<int32_t>(node.getInputs().size())); } void OperationValidator::visit(const ir::operation::Pad &node) { const auto pad_index{node.getInputs().at(ir::operation::Pad::Input::PAD)}; - OP_REQUIRES(_ctx.at(pad_index).typeInfo().type() == ir::DataType::INT32); - - const auto output_index{node.getInputs().at(0)}; - if (_ctx.at(output_index).info().isDynamic()) - return; - - const auto input_index{node.getInputs().at(ir::operation::Pad::Input::INPUT)}; - - const auto &pad_shape = _ctx.at(pad_index).shape(); - const auto input_rank = static_cast<int32_t>(_ctx.at(input_index).shape().rank()); - - OP_REQUIRES(pad_shape.rank() == 2); - OP_REQUIRES(pad_shape.dim(0) == input_rank); - OP_REQUIRES(pad_shape.dim(1) == 2); - OP_REQUIRES(_ctx.at(input_index).shape().rank() == _ctx.at(output_index).shape().rank()); -} - -void OperationValidator::visit(const ir::operation::Select &node) -{ - const auto output_index{node.getOutputs().at(0)}; - // This validator does not check shape. So checking isDynamic() is skipped. 
- - const auto condition_index{node.getInputs().at(ir::operation::Select::Input::CONDITION)}; - const auto input_true_index{node.getInputs().at(ir::operation::Select::Input::INPUT_TRUE)}; - const auto input_false_index{node.getInputs().at(ir::operation::Select::Input::INPUT_FALSE)}; - UNUSED_RELEASE(output_index); - UNUSED_RELEASE(input_true_index); - UNUSED_RELEASE(input_false_index); - - OP_REQUIRES(_ctx.at(condition_index).typeInfo().type() == ir::DataType::BOOL8); -} - -void OperationValidator::visit(const ir::operation::StridedSlice &node) -{ - const auto output_index{node.getOutputs().at(0)}; - const auto input_index{node.getInputs().at(ir::operation::StridedSlice::Input::INPUT)}; - const auto starts_index{node.getInputs().at(ir::operation::StridedSlice::Input::STARTS)}; - const auto ends_index{node.getInputs().at(ir::operation::StridedSlice::Input::ENDS)}; - const auto strides_index{node.getInputs().at(ir::operation::StridedSlice::Input::STRIDES)}; - - UNUSED_RELEASE(starts_index); - UNUSED_RELEASE(ends_index); - UNUSED_RELEASE(strides_index); - - OP_REQUIRES(_ctx.at(output_index).typeInfo().type() == _ctx.at(input_index).typeInfo().type()); - - if (_ctx.at(output_index).info().isDynamic()) - return; - - OP_REQUIRES(_ctx.at(input_index).shape().rank() <= 4); -} -void OperationValidator::visit(const ir::operation::Split &node) -{ - const auto input_index{node.getInputs().at(ir::operation::Split::Input::INPUT)}; - - if (_ctx.at(input_index).info().isDynamic()) - return; - - const auto num_splits = node.param().num_splits; - const auto input_rank = _ctx.at(input_index).shape().rank(); - const auto axis = node.param().axis < 0 ? node.param().axis + input_rank : node.param().axis; - - OP_REQUIRES(num_splits > 0 && num_splits <= 0xFFFF); - OP_REQUIRES(axis >= 0 && axis < input_rank); - OP_REQUIRES(node.getOutputs().size() == static_cast<uint32_t>(num_splits)); - - OP_REQUIRES(_ctx.at(input_index).shape().dim(axis) % num_splits == 0); -} - -void OperationValidator::visit(const ir::operation::Shape &node) -{ - const auto output_index{node.getOutputs().at(0)}; - if (_ctx.at(output_index).info().isDynamic()) - return; - - const auto input_index{node.getInputs().at(0)}; - UNUSED_RELEASE(input_index); - OP_REQUIRES(_ctx.at(output_index).shape().rank() == 1); + OP_REQUIRES(_ctx.at(pad_index).typeInfo().type() == ir::DataType::INT32); } void OperationValidator::visit(const ir::operation::ResizeBilinear &node) { - const auto output_index{node.getOutputs().at(0)}; - const auto input_index{node.getInputs().at(ir::operation::ResizeBilinear::Input::INPUT)}; - - if (_ctx.at(output_index).info().isDynamic()) - { - return; - } - OP_REQUIRES(_ctx.at(input_index).shape().rank() == 4); - OP_REQUIRES(_ctx.at(output_index).shape().rank() == 4); - auto align_corners = node.param().align_corners; auto half_pixel_centers = node.param().half_pixel_centers; @@ -923,23 +178,31 @@ void OperationValidator::visit(const ir::operation::Reverse &node) OP_REQUIRES(_ctx.at(axis_index).typeInfo().type() == ir::DataType::INT32); OP_REQUIRES(_ctx.at(output_index).typeInfo().type() == _ctx.at(input_index).typeInfo().type()); +} + +void OperationValidator::visit(const ir::operation::SpaceToBatchND &node) +{ + const auto block_size_index{ + node.getInputs().at(ir::operation::SpaceToBatchND::Input::BLOCK_SIZE)}; + const auto paddings_index{node.getInputs().at(ir::operation::SpaceToBatchND::Input::PADDINGS)}; - if (_ctx.at(output_index).info().isDynamic()) - return; - OP_REQUIRES(_ctx.at(output_index).shape() == 
_ctx.at(input_index).shape()); + // Non-constant block_size and padding is not implemented yet + OP_REQUIRES(_ctx.at(block_size_index).isConstant()); + OP_REQUIRES(_ctx.at(paddings_index).isConstant()); } -void OperationValidator::visit(const ir::operation::If &) +void OperationValidator::visit(const ir::operation::SpaceToDepth &node) { - // TODO Add to validate with subgraphs + const auto block_size = node.param().block_size; + OP_REQUIRES(block_size >= 1); } -void OperationValidator::visit(const ir::operation::While &node) +void OperationValidator::visit(const ir::operation::Split &node) { - // This validator does not check shape. So checking isDynamic() is skipped. + const auto num_splits = node.param().num_splits; - OP_REQUIRES(node.getInputs().size() == node.getOutputs().size()); - // TODO Add to validate with subgraphs + OP_REQUIRES(num_splits > 0 && num_splits <= 0xFFFF); + OP_REQUIRES(node.getOutputs().size() == static_cast<uint32_t>(num_splits)); } void OperationValidator::visit(const ir::operation::SquaredDifference &node) @@ -948,105 +211,33 @@ void OperationValidator::visit(const ir::operation::SquaredDifference &node) const auto lhs_index{node.getInputs().at(ir::operation::SquaredDifference::Input::LHS)}; const auto rhs_index{node.getInputs().at(ir::operation::SquaredDifference::Input::RHS)}; - // Check for Type equivalence OP_REQUIRES(_ctx.at(output_index).typeInfo().type() == _ctx.at(lhs_index).typeInfo().type()); OP_REQUIRES(_ctx.at(lhs_index).typeInfo().type() == _ctx.at(rhs_index).typeInfo().type()); - - // Check for dimension constraints - if (_ctx.at(output_index).info().isDynamic()) - return; - - auto output_shape = _ctx.at(output_index).shape(); - auto lhs_shape = _ctx.at(lhs_index).shape(); - auto rhs_shape = _ctx.at(rhs_index).shape(); - // Check for output rank - OP_REQUIRES(output_shape.rank() == std::max(lhs_shape.rank(), rhs_shape.rank())); - auto min_rank = std::min(lhs_shape.rank(), rhs_shape.rank()); - - for (int idx = 1; idx <= min_rank; idx++) - { - int l_idx = lhs_shape.rank() - idx; - int r_idx = rhs_shape.rank() - idx; - int out_idx = output_shape.rank() - idx; - - OP_REQUIRES((l_idx >= 0) && (r_idx >= 0) && (out_idx >= 0)); - - auto l_dims = lhs_shape.dim(l_idx); - auto r_dims = rhs_shape.dim(r_idx); - auto out_dims = output_shape.dim(out_idx); - - OP_REQUIRES(((l_dims == r_dims) && (out_dims == l_dims)) || - ((l_dims == 1) && (out_dims == r_dims)) || ((r_dims == 1) && (out_dims == l_dims))); - } - auto &tmp_shape = (lhs_shape.rank() > rhs_shape.rank()) ? 
lhs_shape : rhs_shape; - for (int idx = min_rank + 1; idx <= output_shape.rank(); idx++) - { - int out_idx = output_shape.rank() - idx; - int tmp_idx = tmp_shape.rank() - idx; - - OP_REQUIRES((out_idx >= 0) && (tmp_idx >= 0) && - (output_shape.dim(out_idx) == tmp_shape.dim(tmp_idx))); - } } -void OperationValidator::visit(const ir::operation::Tile &node) + +void OperationValidator::visit(const ir::operation::StridedSlice &node) { const auto output_index{node.getOutputs().at(0)}; - if (_ctx.at(output_index).info().isDynamic()) - return; - - const auto input_index{node.getInputs().at(0)}; - const auto multiple_index{node.getInputs().at(1)}; + const auto input_index{node.getInputs().at(ir::operation::StridedSlice::Input::INPUT)}; - OP_REQUIRES(_ctx.at(multiple_index).shape().rank() == 1); - OP_REQUIRES(_ctx.at(multiple_index).shape().dim(0) == _ctx.at(input_index).shape().rank()); - OP_REQUIRES(_ctx.at(input_index).shape().rank() == _ctx.at(output_index).shape().rank()); + OP_REQUIRES(_ctx.at(output_index).typeInfo().type() == _ctx.at(input_index).typeInfo().type()); } -void OperationValidator::visit(const ir::operation::Range &node) +void OperationValidator::visit(const ir::operation::TransposeConv &node) { - const auto output_index{node.getOutputs().at(0)}; - const auto start_index{node.getInputs().at(ir::operation::Range::Input::START)}; - const auto limit_index{node.getInputs().at(ir::operation::Range::Input::LIMIT)}; - const auto delta_index{node.getInputs().at(ir::operation::Range::Input::DELTA)}; - - // Check for dimension constraints - if (_ctx.at(output_index).info().isDynamic()) - return; - - OP_REQUIRES(_ctx.at(start_index).shape().rank() == 0); - OP_REQUIRES(_ctx.at(limit_index).shape().rank() == 0); - OP_REQUIRES(_ctx.at(delta_index).shape().rank() == 0); + OP_REQUIRES((node.param().padding.type == ir::PaddingType::SAME) || + (node.param().padding.type == ir::PaddingType::VALID)); } -void OperationValidator::visit(const ir::operation::MatrixBandPart &node) +void OperationValidator::visit(const ir::operation::Unpack &node) { - const auto output_index{node.getOutputs().at(0)}; - const auto input_index{node.getInputs().at(ir::operation::MatrixBandPart::Input::INPUT)}; - const auto num_lower_index{ - node.getInputs().at(ir::operation::MatrixBandPart::Input::NUM_LOWER_DIAG)}; - const auto num_upper_index{ - node.getInputs().at(ir::operation::MatrixBandPart::Input::NUM_UPPER_DIAG)}; - - // Check for dimension constraints - if (_ctx.at(output_index).info().isDynamic()) - return; - - OP_REQUIRES(_ctx.at(input_index).shape().rank() >= 2); // input must be more than 2 dim matrix - OP_REQUIRES(_ctx.at(num_upper_index).shape().rank() == 0); // num_lower must be scalar - OP_REQUIRES(_ctx.at(num_lower_index).shape().rank() == 0); // num_upper must be scalar + const auto num{node.param().num}; + OP_REQUIRES(num == static_cast<int32_t>(node.getOutputs().size())); } -void OperationValidator::visit(const ir::operation::LogSoftmax &node) +void OperationValidator::visit(const ir::operation::While &node) { - VERBOSE(LogSoftmax) << "Configure LOGSOFTMAX operation" << std::endl; - - const auto output_index{node.getOutputs().at(0)}; - if (_ctx.at(output_index).info().isDynamic()) - return; - - const auto input_index{node.getInputs().at(0)}; - - OP_REQUIRES(_ctx.at(output_index).shape().rank() == _ctx.at(input_index).shape().rank()); + OP_REQUIRES(node.getInputs().size() == node.getOutputs().size()); } } // namespace compiler diff --git a/runtime/onert/core/src/compiler/OperationValidator.h 
b/runtime/onert/core/src/compiler/OperationValidator.h index deb6357bb..f884a3765 100644 --- a/runtime/onert/core/src/compiler/OperationValidator.h +++ b/runtime/onert/core/src/compiler/OperationValidator.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,7 +17,6 @@ #ifndef __ONERT_COMPILER_OPERATION_VALIDATOR_H__ #define __ONERT_COMPILER_OPERATION_VALIDATOR_H__ -#include "ir/Layout.h" #include "ir/OperationVisitor.h" namespace onert @@ -47,51 +46,30 @@ public: void visit(const ir::operation::BatchMatMul &node) override; void visit(const ir::operation::BatchToSpaceND &node) override; void visit(const ir::operation::Comparison &node) override; - void visit(const ir::operation::Softmax &node) override; - void visit(const ir::operation::InstanceNorm &node) override; - void visit(const ir::operation::Permute &node) override; - void visit(const ir::operation::Pool2D &node) override; - void visit(const ir::operation::Reduce &node) override; - void visit(const ir::operation::Transpose &node) override; - void visit(const ir::operation::RNN &node) override; - void visit(const ir::operation::SpaceToBatchND &node) override; - void visit(const ir::operation::SpaceToDepth &node) override; + void visit(const ir::operation::DepthToSpace &node) override; void visit(const ir::operation::ElementwiseActivation &node) override; void visit(const ir::operation::ElementwiseBinary &node) override; void visit(const ir::operation::ElementwiseUnary &node) override; void visit(const ir::operation::EmbeddingLookup &node) override; void visit(const ir::operation::ExpandDims &node) override; void visit(const ir::operation::HashtableLookup &node) override; - void visit(const ir::operation::TransposeConv &node) override; - void visit(const ir::operation::Gather &node) override; - void visit(const ir::operation::DepthToSpace &node) override; void visit(const ir::operation::Pack &node) override; - void visit(const ir::operation::LSTM &node) override; - void visit(const ir::operation::L2Normalization &node) override; - void visit(const ir::operation::Unpack &node) override; void visit(const ir::operation::Pad &node) override; - void visit(const ir::operation::Select &node) override; - void visit(const ir::operation::StridedSlice &node) override; - void visit(const ir::operation::Split &node) override; - void visit(const ir::operation::Shape &node) override; void visit(const ir::operation::ResizeBilinear &node) override; void visit(const ir::operation::Reverse &node) override; - void visit(const ir::operation::If &node) override; - void visit(const ir::operation::While &node) override; + void visit(const ir::operation::SpaceToBatchND &node) override; + void visit(const ir::operation::SpaceToDepth &node) override; + void visit(const ir::operation::Split &node) override; void visit(const ir::operation::SquaredDifference &node) override; - void visit(const ir::operation::Tile &node) override; - void visit(const ir::operation::Range &node) override; - void visit(const ir::operation::MatrixBandPart &node) override; - void visit(const ir::operation::LogSoftmax &node) override; - -private: - void checkUnaryOp(const ir::Operation &node); + void visit(const ir::operation::StridedSlice &node) override; + void visit(const ir::operation::TransposeConv &node) override; + void 
visit(const ir::operation::Unpack &node) override; + void visit(const ir::operation::While &node) override; private: // TODO Remove _ctx field const ir::Graph &_graph; const ir::Operands &_ctx; - ir::Layout _current_op_seq_layout; }; } // namespace compiler diff --git a/runtime/onert/core/src/compiler/ShapeValidator.cc b/runtime/onert/core/src/compiler/ShapeValidator.cc new file mode 100644 index 000000000..8be4fe6ec --- /dev/null +++ b/runtime/onert/core/src/compiler/ShapeValidator.cc @@ -0,0 +1,1021 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "ShapeValidator.h" + +#include <typeinfo> + +#include "ir/Graph.h" +#include "ir/operation/LowerInfo.h" + +#include "util/logging.h" +#include "util/Utils.h" + +#define OP_REQUIRES(EXP) \ + do \ + { \ + if (!(EXP)) \ + throw std::runtime_error("ShapeValidator failed at line " + std::to_string(__LINE__)); \ + } while (0) + +namespace onert +{ +namespace compiler +{ + +ShapeValidator::ShapeValidator(const ir::Graph &graph) + : _graph{graph}, _ctx{graph.operands()}, _current_op_seq_layout{ir::Layout::UNKNOWN} +{ +} + +void ShapeValidator::checkUnaryOp(const ir::Operation &node) +{ + const auto output_index{node.getOutputs().at(0)}; + const auto input_index{node.getInputs().at(0)}; + + if (_ctx.at(output_index).info().isDynamic()) + return; + + // Check if I/O shapes match + OP_REQUIRES(_ctx.at(output_index).shape() == _ctx.at(input_index).shape()); +} + +void ShapeValidator::operator()() +{ + // There is no reason for each subgraph to have subgraphs since compiler has subgraphs when + // creating Compiler + assert(_graph.subgraphs() == nullptr); + + _current_op_seq_layout = _graph.layout(); + + _graph.operations().iterate( + [&](const ir::OperationIndex &, const ir::Operation &node) { node.accept(*this); }); +} + +void ShapeValidator::visit(const ir::operation::BatchMatMul &node) +{ + const auto lhs_index(node.getInputs().at(ir::operation::BatchMatMul::Input::LHS)); + const auto rhs_index(node.getInputs().at(ir::operation::BatchMatMul::Input::RHS)); + const auto out_index{node.getOutputs().at(0)}; + + if (_ctx.at(out_index).info().isDynamic()) + return; + + OP_REQUIRES(_ctx.at(lhs_index).shape().rank() <= 4); + OP_REQUIRES(_ctx.at(rhs_index).shape().rank() <= 4); + OP_REQUIRES(_ctx.at(lhs_index).shape().rank() >= 2); + OP_REQUIRES(_ctx.at(rhs_index).shape().rank() >= 2); +} + +void ShapeValidator::visit(const ir::operation::BatchToSpaceND &node) +{ + const auto ofm_index{node.getOutputs().at(0)}; + if (_ctx.at(ofm_index).info().isDynamic()) + return; + + const auto ifm_index{node.getInputs().at(ir::operation::BatchToSpaceND::Input::INPUT)}; + const auto block_size_index{ + node.getInputs().at(ir::operation::BatchToSpaceND::Input::BLOCK_SIZE)}; + + const auto frontend_layout = _current_op_seq_layout; + const auto input_shape = _ctx.at(ifm_index).shape().asFeature(frontend_layout); + const auto output_shape = 
_ctx.at(ofm_index).shape().asFeature(frontend_layout); + + // All requirement as per NNAPI specification. + OP_REQUIRES(_ctx.at(ifm_index).shape().rank() == 4); + OP_REQUIRES(_ctx.at(ofm_index).shape().rank() == 4); + OP_REQUIRES(_ctx.at(block_size_index).shape().rank() == 1); + + OP_REQUIRES(_ctx.at(block_size_index).shape().dim(0) == 2); + + OP_REQUIRES(input_shape.C == output_shape.C); +} + +void ShapeValidator::visit(const ir::operation::BCQFullyConnected &node) +{ + const auto ofm_index{node.getOutputs().at(0)}; + if (_ctx.at(ofm_index).info().isDynamic()) + return; + + const auto ifm_index{node.getInputs().at(ir::operation::BCQFullyConnected::Input::INPUT)}; + const auto weight_scales_index{ + node.getInputs().at(ir::operation::BCQFullyConnected::Input::WEIGHTS_SCALES)}; + const auto weight_binary_index{ + node.getInputs().at(ir::operation::BCQFullyConnected::Input::WEIGHTS_BINARY)}; + const auto weight_cluster_index{ + node.getInputs().at(ir::operation::BCQFullyConnected::Input::WEIGHTS_CLUSTERS)}; + // const auto bias_index{node.getInputs().at(ir::operation::BCQFullyConnected::Input::BIAS)}; + + OP_REQUIRES(_ctx.at(ifm_index).shape().rank() == 2); + OP_REQUIRES(_ctx.at(ofm_index).shape().rank() == 2); + OP_REQUIRES(_ctx.at(weight_scales_index).shape().rank() == 1); + OP_REQUIRES(_ctx.at(weight_binary_index).shape().rank() == 2); + OP_REQUIRES(_ctx.at(weight_cluster_index).shape().rank() == 2); + + OP_REQUIRES(_ctx.at(ifm_index).shape().dim(1) == _ctx.at(ofm_index).shape().dim(1)); + + OP_REQUIRES(_ctx.at(weight_cluster_index).shape().dim(0) > 0); + OP_REQUIRES(_ctx.at(weight_cluster_index).shape().dim(1) == 2); + + // more shape validation will be done inside kernel. + + // TODO Check bias dimension (can be null tensor) +} + +void ShapeValidator::visit(const ir::operation::BCQGather &node) +{ + const auto ofm_index{node.getOutputs().at(0)}; + if (_ctx.at(ofm_index).info().isDynamic()) + return; + + const auto indices_index{node.getInputs().at(ir::operation::BCQGather::Input::INDICES)}; + const auto input_binary_index{node.getInputs().at(ir::operation::BCQGather::Input::INPUT_BINARY)}; + const auto input_scales_index{node.getInputs().at(ir::operation::BCQGather::Input::INPUT_SCALES)}; + const auto input_clusters_index{ + node.getInputs().at(ir::operation::BCQGather::Input::INPUT_CLUSTERS)}; + + OP_REQUIRES(_ctx.at(indices_index).shape().rank() <= 2); // TODO : support rank up to 4 or more + OP_REQUIRES(_ctx.at(input_binary_index).shape().rank() == 2); + OP_REQUIRES(_ctx.at(input_scales_index).shape().rank() == 1); + OP_REQUIRES(_ctx.at(input_clusters_index).shape().rank() == 2); + + OP_REQUIRES(_ctx.at(input_clusters_index).shape().dim(0) > 0); + OP_REQUIRES(_ctx.at(input_clusters_index).shape().dim(1) == 2); + + // more shape validation will be done inside kernel. 
+} + +void ShapeValidator::visit(const ir::operation::Comparison &) +{ + // TODO Shape validation of comparison +} + +void ShapeValidator::visit(const ir::operation::Softmax &node) +{ + const auto output_index{node.getOutputs().at(0)}; + if (_ctx.at(output_index).info().isDynamic()) + return; + + const auto input_index{node.getInputs().at(0)}; + + OP_REQUIRES(_ctx.at(output_index).shape().rank() == _ctx.at(input_index).shape().rank()); +} + +void ShapeValidator::visit(const ir::operation::InstanceNorm &node) +{ + const auto ofm_index{node.getOutputs().at(0)}; + if (_ctx.at(ofm_index).info().isDynamic()) + return; + + const auto ifm_index{node.getInputs().at(ir::operation::InstanceNorm::Input::INPUT)}; + const auto gamma_index{node.getInputs().at(ir::operation::InstanceNorm::Input::GAMMA)}; + const auto beta_index{node.getInputs().at(ir::operation::InstanceNorm::Input::BETA)}; + + OP_REQUIRES(_ctx.at(ifm_index).shape().rank() == 4); + OP_REQUIRES(_ctx.at(ifm_index).shape() == _ctx.at(ofm_index).shape()); + OP_REQUIRES(_ctx.at(gamma_index).shape().rank() == 1); + OP_REQUIRES(_ctx.at(beta_index).shape().rank() == 1); +} + +void ShapeValidator::visit(const ir::operation::Pool2D &node) +{ + const auto ofm_index{node.getOutputs().at(0)}; + if (_ctx.at(ofm_index).info().isDynamic()) + return; + + const auto ifm_index{node.getInputs().at(ir::operation::Pool2D::Input::INPUT)}; + + OP_REQUIRES(_ctx.at(ifm_index).shape().rank() == 4); +} + +void ShapeValidator::visit(const ir::operation::Permute &node) +{ + const auto output_index{node.getOutputs().at(0)}; + if (_ctx.at(output_index).info().isDynamic()) + return; + + const auto input_index{node.getInputs().at(0)}; + + OP_REQUIRES(_ctx.at(output_index).shape().rank() == _ctx.at(input_index).shape().rank()); +} + +void ShapeValidator::visit(const ir::operation::Reduce &node) +{ + const auto output_index{node.getOutputs().at(0)}; + if (_ctx.at(output_index).info().isDynamic()) + return; + + const auto input_index{node.getInputs().at(ir::operation::Reduce::Input::INPUT)}; + const auto input_shape = _ctx.at(input_index).shape(); + const auto output_shape = _ctx.at(output_index).shape(); + + OP_REQUIRES(input_shape.rank() <= 4); + OP_REQUIRES(output_shape.rank() <= input_shape.rank()); + + // NOTE For the 4-dimensions, if the rank of input and output are different, this runtime only + // supports cases reducing height and width or reducing depth. + // TODO We have to support all cases of dimensions up to 4. + // For correct permuting, we have to set output's shape to be equal in dimension position of the + // input. But the positions of the same dimensions in the input and output may be set differently. + // For example {2,3,4,5}(input's shape) can be reduced to {3,5}(output's shape). The original + // output shape should be {1,3,1,5}, but real output shape may be {3,5}. If you simply try to + // extend it in 4 dimensions, it should be {1,1,3,5}. + // Even if output shape is changed to {1,3,1,5}, there is another problem. It is that shape of + // output tensor used at next operation is changed to {1,3,1,5} after this operation even if the + // next operation is not desired. 
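+  // For example, reducing H and W of a {2,3,4,5} input yields a {2,5} output, which the
+  // rank-2 case below accepts because input dim(0) and dim(3) are preserved.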
+ if (input_shape.rank() == 4 && input_shape.rank() != output_shape.rank()) + { + if (output_shape.rank() == 2) + { + // Reducing HW + OP_REQUIRES(input_shape.dim(0) == output_shape.dim(0) && + input_shape.dim(3) == output_shape.dim(1)); + } + else if (output_shape.rank() == 3) + { + // Reducing C or + // (Reducing H and C(input and output) == 1) or (Reducing W and C(input and output) == 1) + OP_REQUIRES((input_shape.dim(0) == output_shape.dim(0) && + input_shape.dim(1) == output_shape.dim(1) && + input_shape.dim(2) == output_shape.dim(2)) || + (input_shape.dim(0) == output_shape.dim(0) && + (input_shape.dim(1) == output_shape.dim(1) || + input_shape.dim(2) == output_shape.dim(1)) && + input_shape.dim(3) == 1 && output_shape.dim(2) == 1)); + } + } +} + +void ShapeValidator::visit(const ir::operation::Transpose &node) +{ + const auto output_index{node.getOutputs().at(0)}; + if (_ctx.at(output_index).info().isDynamic()) + return; + + const auto input_index{node.getInputs().at(ir::operation::Transpose::Input::INPUT)}; + const auto perm_index{node.getInputs().at(ir::operation::Transpose::Input::PERMUTATION)}; + + const auto &output_shape = _ctx.at(output_index).shape(); + const auto &input_shape = _ctx.at(input_index).shape(); + + OP_REQUIRES(_ctx.at(perm_index).shape().num_elements() == 0 || + input_shape.rank() == static_cast<int>(_ctx.at(perm_index).shape().num_elements())); + OP_REQUIRES(input_shape.rank() == output_shape.rank()); +} + +void ShapeValidator::visit(const ir::operation::RNN &node) +{ + // NOTE This validation is for static rnn(non-dynamic shape), but not for dynamic rnn + // TODO Support dynamic rnn + const auto output_index{node.getOutputs().at(ir::operation::RNN::Output::OUTPUT)}; + if (_ctx.at(output_index).info().isDynamic()) + return; + + const auto hidden_state_out_index{ + node.getOutputs().at(ir::operation::RNN::Output::HIDDEN_STATE_OUT)}; + + const auto input_index{node.getInputs().at(ir::operation::RNN::Input::INPUT)}; + const auto weights_index{node.getInputs().at(ir::operation::RNN::Input::WEIGHTS)}; + const auto recurrent_weights_index{ + node.getInputs().at(ir::operation::RNN::Input::RECURRENT_WEIGHTS)}; + const auto bias_index{node.getInputs().at(ir::operation::RNN::Input::BIAS)}; + const auto hidden_state_in_index{node.getInputs().at(ir::operation::RNN::Input::HIDDEN_STATE_IN)}; + + const auto batch_size = _ctx.at(output_index).shape().dim(0); + const auto num_units = _ctx.at(output_index).shape().dim(1); + + OP_REQUIRES(_ctx.at(output_index).shape().rank() == 2 && + _ctx.at(hidden_state_out_index).shape().rank() == 2 && + _ctx.at(input_index).shape().rank() == 2 && + _ctx.at(weights_index).shape().rank() == 2 && + _ctx.at(recurrent_weights_index).shape().rank() == 2 && + _ctx.at(hidden_state_in_index).shape().rank() == 2); + OP_REQUIRES(_ctx.at(bias_index).shape().rank() == 1); + + OP_REQUIRES(batch_size == _ctx.at(input_index).shape().dim(0) && + batch_size == _ctx.at(hidden_state_in_index).shape().dim(0) && + batch_size == _ctx.at(hidden_state_out_index).shape().dim(0)); + OP_REQUIRES(_ctx.at(input_index).shape().dim(1) == _ctx.at(weights_index).shape().dim(1)); + + OP_REQUIRES(num_units == _ctx.at(weights_index).shape().dim(0) && + num_units == _ctx.at(recurrent_weights_index).shape().dim(0) && + num_units == _ctx.at(bias_index).shape().dim(0)); + OP_REQUIRES(num_units == _ctx.at(output_index).shape().dim(1) && + num_units == _ctx.at(recurrent_weights_index).shape().dim(1) && + num_units == _ctx.at(hidden_state_in_index).shape().dim(1) && + num_units == 
_ctx.at(hidden_state_out_index).shape().dim(1)); +} + +void ShapeValidator::visit(const ir::operation::SpaceToBatchND &node) +{ + const auto ofm_index{node.getOutputs().at(0)}; + if (_ctx.at(ofm_index).info().isDynamic()) + return; + + const auto ifm_index{node.getInputs().at(ir::operation::SpaceToBatchND::Input::INPUT)}; + const auto block_size_index{ + node.getInputs().at(ir::operation::SpaceToBatchND::Input::BLOCK_SIZE)}; + const auto paddings_index{node.getInputs().at(ir::operation::SpaceToBatchND::Input::PADDINGS)}; + + const auto frontend_layout = _current_op_seq_layout; + const auto input_shape = _ctx.at(ifm_index).shape().asFeature(frontend_layout); + const auto output_shape = _ctx.at(ofm_index).shape().asFeature(frontend_layout); + + // All requirement as per NNAPI specification. + OP_REQUIRES(_ctx.at(ifm_index).shape().rank() == 4); + OP_REQUIRES(_ctx.at(ofm_index).shape().rank() == 4); + OP_REQUIRES(_ctx.at(block_size_index).shape().rank() == 1); + OP_REQUIRES(_ctx.at(paddings_index).shape().rank() == 2); + + OP_REQUIRES(_ctx.at(block_size_index).shape().dim(0) == 2); + OP_REQUIRES(_ctx.at(paddings_index).shape().dim(0) == 2); + OP_REQUIRES(_ctx.at(paddings_index).shape().dim(1) == 2); + + OP_REQUIRES(input_shape.C == output_shape.C); +} + +void ShapeValidator::visit(const ir::operation::SpaceToDepth &node) +{ + const auto ofm_index{node.getOutputs().at(0)}; + if (_ctx.at(ofm_index).info().isDynamic()) + return; + + const auto ifm_index{node.getInputs().at(ir::operation::SpaceToDepth::Input::INPUT)}; + + const auto frontend_layout = _current_op_seq_layout; + const auto input_shape = _ctx.at(ifm_index).shape().asFeature(frontend_layout); + const auto output_shape = _ctx.at(ofm_index).shape().asFeature(frontend_layout); + const auto block_size = node.param().block_size; + + // All assertions as per NNAPI specification. 
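+  // For example, with block_size == 2 an NHWC input of {1,4,4,3} becomes {1,2,2,12}:
+  // H and W must be divisible by block_size and C is multiplied by block_size * block_size.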
+ OP_REQUIRES(_ctx.at(ifm_index).shape().rank() == 4); + OP_REQUIRES(_ctx.at(ofm_index).shape().rank() == 4); + OP_REQUIRES((input_shape.H % block_size == 0) && (input_shape.W % block_size == 0)); + OP_REQUIRES(input_shape.N == output_shape.N); + OP_REQUIRES(input_shape.C * block_size * block_size == output_shape.C); +} + +void ShapeValidator::visit(const ir::operation::ElementwiseActivation &node) { checkUnaryOp(node); } + +void ShapeValidator::visit(const ir::operation::ElementwiseBinary &) +{ + // TODO Shape validation of ElementwiseBinary +} + +void ShapeValidator::visit(const ir::operation::ElementwiseUnary &node) +{ + const auto output_index{node.getOutputs().at(0)}; + const auto input_index{node.getInputs().at(ir::operation::ElementwiseUnary::Input::INPUT)}; + + if (_ctx.at(output_index).info().isDynamic()) + return; + + OP_REQUIRES(_ctx.at(output_index).shape() == _ctx.at(input_index).shape()); +} + +void ShapeValidator::visit(const ir::operation::EmbeddingLookup &node) +{ + const auto output_index{node.getOutputs().at(0)}; + const auto lookups_index{node.getInputs().at(ir::operation::EmbeddingLookup::Input::LOOKUPS)}; + const auto values_index{node.getInputs().at(ir::operation::EmbeddingLookup::Input::VALUES)}; + + const auto &output_obj = _ctx.at(output_index); + const auto &lookups_obj = _ctx.at(lookups_index); + const auto &values_obj = _ctx.at(values_index); + + // Verify operand here, not at SimpleEmbeddingLookup::configure() to avoid acl's modifying + // TensorShape sometimes(Issue: https://github.sec.samsung.net/STAR/nnfw/issues/729) + { + if (_ctx.at(output_index).info().isDynamic()) + return; + + const auto &output_shape = output_obj.shape(); + const auto &lookups_shape = lookups_obj.shape(); + const auto &values_shape = values_obj.shape(); + + OP_REQUIRES(lookups_shape.rank() == 1); + OP_REQUIRES(values_shape.rank() >= 2); + + // output should be a n-D tensor with the same rank and shape as the values tensor, except for + // the first dimension which has the same size as lookups' only dimension. 
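+    // For example, lookups of shape {3} into values of shape {10,4} produce an output of
+    // shape {3,4}, which satisfies the rank and dimension checks below.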
+ OP_REQUIRES(output_shape.rank() == values_shape.rank()); + OP_REQUIRES(output_shape.dim(0) == lookups_shape.dim(0)); + for (int n = 1; n < output_shape.rank(); ++n) + { + OP_REQUIRES(output_shape.dim(n) == values_shape.dim(n)); + } + } +} + +void ShapeValidator::visit(const ir::operation::ExpandDims &node) +{ + const auto axis_index{node.getInputs().at(ir::operation::ExpandDims::Input::AXIS)}; + + if (_ctx.at(axis_index).info().isDynamic()) + return; + OP_REQUIRES(_ctx.at(axis_index).shape().rank() <= 1); +} + +void ShapeValidator::visit(const ir::operation::HashtableLookup &node) +{ + const auto output_index{node.getOutputs().at(ir::operation::HashtableLookup::Output::OUTPUT)}; + const auto lookups_index{node.getInputs().at(ir::operation::HashtableLookup::Input::LOOKUPS)}; + const auto keys_index{node.getInputs().at(ir::operation::HashtableLookup::Input::KEYS)}; + const auto values_index{node.getInputs().at(ir::operation::HashtableLookup::Input::VALUES)}; + + const auto &output_obj = _ctx.at(output_index); + const auto &lookups_obj = _ctx.at(lookups_index); + const auto &keys_obj = _ctx.at(keys_index); + const auto &values_obj = _ctx.at(values_index); + + if (_ctx.at(output_index).info().isDynamic()) + return; + + const auto &output_shape = output_obj.shape(); + const auto &lookups_shape = lookups_obj.shape(); + const auto &keys_shape = keys_obj.shape(); + const auto &values_shape = values_obj.shape(); + + OP_REQUIRES(values_shape.rank() == output_shape.rank()); + OP_REQUIRES(lookups_shape.rank() == 1); + OP_REQUIRES(keys_shape.rank() == 1); + OP_REQUIRES(values_shape.dim(0) == keys_shape.dim(0)); + OP_REQUIRES(lookups_shape.dim(0) == output_shape.dim(0)); +} + +void ShapeValidator::visit(const ir::operation::TransposeConv &node) +{ + // shape check + const auto ofm_index{node.getOutputs().at(0)}; + if (_ctx.at(ofm_index).info().isDynamic()) + return; + + const auto ifm_index{node.getInputs().at(ir::operation::TransposeConv::Input::INPUT)}; + const auto ker_index{node.getInputs().at(ir::operation::TransposeConv::Input::KERNEL)}; + + // Only 4D tensors are supported + OP_REQUIRES(_ctx.at(ofm_index).shape().rank() == 4); + OP_REQUIRES(_ctx.at(ofm_index).shape().rank() == _ctx.at(ifm_index).shape().rank()); + OP_REQUIRES(_ctx.at(ofm_index).shape().rank() == _ctx.at(ker_index).shape().rank()); + + const auto frontend_layout = _current_op_seq_layout; + const auto ofm_shape = _ctx.at(ofm_index).shape().asFeature(frontend_layout); + const auto ifm_shape = _ctx.at(ifm_index).shape().asFeature(frontend_layout); + // The kernel has only IHWO layout on frontend + // So ker_shape is treated here below + // I -> N + // H -> H + // W -> W + // O -> C + const auto ker_shape = _ctx.at(ker_index).shape().asFeature(ir::Layout::NHWC); + + OP_REQUIRES(ifm_shape.N == ofm_shape.N); + OP_REQUIRES(ifm_shape.C == ker_shape.C); + OP_REQUIRES(ker_shape.N == ofm_shape.C); +} + +void ShapeValidator::visit(const ir::operation::Gather &node) +{ + const auto ofm_index{node.getOutputs().at(0)}; + if (_ctx.at(ofm_index).info().isDynamic()) + return; + + const auto ifm_index{node.getInputs().at(ir::operation::Gather::Input::INPUT)}; + const auto indices_index{node.getInputs().at(ir::operation::Gather::Input::INDICES)}; + + const auto ifm_shape = _ctx.at(ifm_index).shape(); + const auto indices_shape = _ctx.at(indices_index).shape(); + const auto ofm_shape = _ctx.at(ofm_index).shape(); + + OP_REQUIRES(ifm_shape.rank() <= 4); + OP_REQUIRES(indices_shape.rank() <= 3); + OP_REQUIRES(ofm_shape.rank() <= 4); +} + +void 
ShapeValidator::visit(const ir::operation::DepthToSpace &node) +{ + int32_t block_size = node.param().block_size; + + // shape check + const auto output_index{node.getOutputs().at(0)}; + if (_ctx.at(output_index).info().isDynamic()) + return; + + const auto input_index{node.getInputs().at(ir::operation::DepthToSpace::Input::INPUT)}; + + const auto frontend_layout = _current_op_seq_layout; + const auto output_shape = _ctx.at(output_index).shape().asFeature(frontend_layout); + const auto input_shape = _ctx.at(input_index).shape().asFeature(frontend_layout); + + OP_REQUIRES(_ctx.at(input_index).shape().rank() == 4); + OP_REQUIRES(_ctx.at(output_index).shape().rank() == 4); + + { + OP_REQUIRES(output_shape.N == input_shape.N); + OP_REQUIRES(output_shape.H == input_shape.H * block_size); + OP_REQUIRES(output_shape.W == input_shape.W * block_size); + OP_REQUIRES(input_shape.C % (block_size * block_size) == 0); + OP_REQUIRES(output_shape.C == input_shape.C / (block_size * block_size)); + } +} + +void ShapeValidator::visit(const ir::operation::Pack &node) +{ + const auto axis{node.param().axis}; + const auto output_index{node.getOutputs().at(0)}; + if (_ctx.at(output_index).info().isDynamic()) + return; + + // shape check + const auto &output_shape = _ctx.at(output_index).shape(); + const auto output_rank = static_cast<int32_t>(output_shape.rank()); + + const auto input1_index{node.getInputs().at(0)}; + const auto input_shape = _ctx.at(input1_index).shape(); + + OP_REQUIRES(axis >= -output_rank && axis < output_rank); + for (const auto &index : node.getInputs()) + { + OP_REQUIRES(input_shape == _ctx.at(index).shape()); + } +} + +void ShapeValidator::visit(const ir::operation::LSTM &node) +{ + // NOTE This validation is for static rnn(non-dynamic shape), but not for dynamic rnn + // TODO Support dynamic rnn + const auto output_index{node.getOutputs().at(ir::operation::LSTM::Output::OUTPUT)}; + if (_ctx.at(output_index).info().isDynamic()) + return; + + const auto scratch_buffer_index{ + node.getOutputs().at(ir::operation::LSTM::Output::SCRATCH_BUFFER)}; + const auto output_state_out_index{ + node.getOutputs().at(ir::operation::LSTM::Output::OUTPUT_STATE_OUT)}; + const auto cell_state_out_index{ + node.getOutputs().at(ir::operation::LSTM::Output::CELL_STATE_OUT)}; + + const auto input_index{node.getInputs().at(ir::operation::LSTM::Input::INPUT)}; + const auto input_to_input_weights_index{ + node.getInputs().at(ir::operation::LSTM::Input::INPUT_TO_INPUT_WEIGHTS)}; + const auto input_to_forget_weights_index{ + node.getInputs().at(ir::operation::LSTM::Input::INPUT_TO_FORGET_WEIGHTS)}; + const auto input_to_cell_weights_index{ + node.getInputs().at(ir::operation::LSTM::Input::INPUT_TO_CELL_WEIGHTS)}; + const auto input_to_output_weights_index{ + node.getInputs().at(ir::operation::LSTM::Input::INPUT_TO_OUTPUT_WEIGHTS)}; + const auto recurrent_to_input_weights_index{ + node.getInputs().at(ir::operation::LSTM::Input::RECURRENT_TO_INPUT_WEIGHTS)}; + const auto recurrent_to_forget_weights_index{ + node.getInputs().at(ir::operation::LSTM::Input::RECURRENT_TO_FORGET_WEIGHTS)}; + const auto recurrent_to_cell_weights_index{ + node.getInputs().at(ir::operation::LSTM::Input::RECURRENT_TO_CELL_WEIGHTS)}; + const auto recurrent_to_output_weights_index{ + node.getInputs().at(ir::operation::LSTM::Input::RECURRENT_TO_OUTPUT_WEIGHTS)}; + const auto cell_to_input_weights_index{ + node.getInputs().at(ir::operation::LSTM::Input::CELL_TO_INPUT_WEIGHTS)}; + const auto cell_to_forget_weights_index{ + 
node.getInputs().at(ir::operation::LSTM::Input::CELL_TO_FORGET_WEIGHTS)}; + const auto cell_to_output_weights_index{ + node.getInputs().at(ir::operation::LSTM::Input::CELL_TO_OUTPUT_WEIGHTS)}; + const auto input_gate_bias_index{ + node.getInputs().at(ir::operation::LSTM::Input::INPUT_GATE_BIAS)}; + const auto forget_gate_bias_index{ + node.getInputs().at(ir::operation::LSTM::Input::FORGET_GATE_BIAS)}; + const auto cell_bias_index{node.getInputs().at(ir::operation::LSTM::Input::CELL_BIAS)}; + const auto output_gate_bias_index{ + node.getInputs().at(ir::operation::LSTM::Input::OUTPUT_GATE_BIAS)}; + const auto projection_weights_index{ + node.getInputs().at(ir::operation::LSTM::Input::PROJECTION_WEIGHTS)}; + const auto projection_bias_index{ + node.getInputs().at(ir::operation::LSTM::Input::PROJECTION_BIAS)}; + const auto output_state_in_index{ + node.getInputs().at(ir::operation::LSTM::Input::OUTPUT_STATE_IN)}; + const auto cell_state_in_index{node.getInputs().at(ir::operation::LSTM::Input::CELL_STATE_IN)}; + + OP_REQUIRES(_ctx.at(input_index).shape().rank() == _ctx.at(output_index).shape().rank()); + for (int i = 0; i < _ctx.at(input_index).shape().rank() - 1; ++i) + { + OP_REQUIRES(_ctx.at(input_index).shape().dim(i) == _ctx.at(output_index).shape().dim(i)); + } + OP_REQUIRES( + (_ctx.at(output_index).shape().rank() == 2 || _ctx.at(output_index).shape().rank() == 3) && + (_ctx.at(input_index).shape().rank() == 2 || _ctx.at(input_index).shape().rank() == 3) && + _ctx.at(input_to_input_weights_index).shape().rank() == 2 && + _ctx.at(input_to_forget_weights_index).shape().rank() == 2 && + _ctx.at(input_to_cell_weights_index).shape().rank() == 2 && + _ctx.at(input_to_output_weights_index).shape().rank() == 2 && + _ctx.at(recurrent_to_input_weights_index).shape().rank() == 2 && + _ctx.at(recurrent_to_forget_weights_index).shape().rank() == 2 && + _ctx.at(recurrent_to_cell_weights_index).shape().rank() == 2 && + _ctx.at(recurrent_to_output_weights_index).shape().rank() == 2 && + _ctx.at(projection_weights_index).shape().rank() == 2 && + _ctx.at(output_state_in_index).shape().rank() == 2 && + _ctx.at(cell_state_in_index).shape().rank() == 2); + + OP_REQUIRES(_ctx.at(cell_to_input_weights_index).shape().rank() == 1 && + _ctx.at(cell_to_forget_weights_index).shape().rank() == 1 && + _ctx.at(cell_to_output_weights_index).shape().rank() == 1 && + _ctx.at(input_gate_bias_index).shape().rank() == 1 && + _ctx.at(forget_gate_bias_index).shape().rank() == 1 && + _ctx.at(cell_bias_index).shape().rank() == 1 && + _ctx.at(output_gate_bias_index).shape().rank() == 1 && + _ctx.at(projection_bias_index).shape().rank() == 1); + + // CIFG assertion + OP_REQUIRES((_ctx.at(input_to_input_weights_index).shape().dim(0) == 0 && + _ctx.at(input_to_input_weights_index).shape().dim(1) == 0 && + _ctx.at(recurrent_to_input_weights_index).shape().dim(0) == 0 && + _ctx.at(recurrent_to_input_weights_index).shape().dim(1) == 0 && + _ctx.at(input_gate_bias_index).shape().dim(0) == 0 && + _ctx.at(cell_to_input_weights_index).shape().dim(0) == 0) || + (_ctx.at(input_to_input_weights_index).shape().dim(0) != 0 && + _ctx.at(input_to_input_weights_index).shape().dim(1) != 0 && + _ctx.at(recurrent_to_input_weights_index).shape().dim(0) != 0 && + _ctx.at(recurrent_to_input_weights_index).shape().dim(1) != 0 && + _ctx.at(input_gate_bias_index).shape().dim(0) != 0)); + + // Peephole assertion + OP_REQUIRES((_ctx.at(cell_to_forget_weights_index).shape().dim(0) == 0 && + _ctx.at(cell_to_output_weights_index).shape().dim(0) == 0) || + 
(_ctx.at(cell_to_forget_weights_index).shape().dim(0) != 0 && + _ctx.at(cell_to_output_weights_index).shape().dim(0) != 0)); + + bool has_input_to_input_weights = _ctx.at(input_to_input_weights_index).shape().dim(0) != 0 && + _ctx.at(input_to_input_weights_index).shape().dim(1) != 0; + bool has_recurrent_to_input_weights = + _ctx.at(recurrent_to_input_weights_index).shape().dim(0) != 0 && + _ctx.at(recurrent_to_input_weights_index).shape().dim(1) != 0; + bool has_input_gate_bias = _ctx.at(input_gate_bias_index).shape().dim(0) != 0; + bool has_cell_to_input_weights = _ctx.at(cell_to_input_weights_index).shape().dim(0) != 0; + bool has_cell_to_forget_weights = _ctx.at(cell_to_forget_weights_index).shape().dim(0) != 0; + bool has_cell_to_output_weights = _ctx.at(cell_to_output_weights_index).shape().dim(0) != 0; + bool has_projection_weights = _ctx.at(projection_weights_index).shape().dim(0) != 0 && + _ctx.at(projection_weights_index).shape().dim(1) != 0; + bool has_projection_bias = _ctx.at(projection_bias_index).shape().dim(0); + + // NOTE The cell_to_input_weights do not exist in non-peephole although regular LSTM(non-CIFG). + // true: no CIFG + // false: CIFG + bool has_cifg_param = has_input_to_input_weights && has_recurrent_to_input_weights; + + // NOTE The cell_to_input_weights do not exist in regular CIFG although peephole. + // true: peephole + // false: no peephole + bool has_peephole_param = has_cell_to_forget_weights && has_cell_to_output_weights; + + // NOTE The projection weights may have data but the projection bias may not. + bool has_projection_param = has_projection_weights; + + const auto batch_size = (_ctx.at(input_index).shape().rank() == 3 && node.param().time_major) + ? _ctx.at(input_index).shape().dim(1) + : _ctx.at(input_index).shape().dim(0); + OP_REQUIRES(batch_size == _ctx.at(output_state_in_index).shape().dim(0) && + batch_size == _ctx.at(cell_state_in_index).shape().dim(0)); + + const auto input_size = _ctx.at(input_index).shape().dim(_ctx.at(input_index).shape().rank() - 1); + OP_REQUIRES(input_size == _ctx.at(input_to_forget_weights_index).shape().dim(1) && + input_size == _ctx.at(input_to_cell_weights_index).shape().dim(1) && + input_size == _ctx.at(input_to_output_weights_index).shape().dim(1)); + + const auto num_units = _ctx.at(input_to_output_weights_index).shape().dim(0); + OP_REQUIRES(num_units == _ctx.at(input_to_cell_weights_index).shape().dim(0) && + num_units == _ctx.at(input_to_output_weights_index).shape().dim(0) && + num_units == _ctx.at(recurrent_to_forget_weights_index).shape().dim(0) && + num_units == _ctx.at(recurrent_to_cell_weights_index).shape().dim(0) && + num_units == _ctx.at(recurrent_to_output_weights_index).shape().dim(0) && + num_units == _ctx.at(forget_gate_bias_index).shape().dim(0) && + num_units == _ctx.at(cell_bias_index).shape().dim(0) && + num_units == _ctx.at(output_gate_bias_index).shape().dim(0) && + num_units == _ctx.at(cell_state_in_index).shape().dim(1)); + + const auto output_size = + _ctx.at(output_index).shape().dim(_ctx.at(output_index).shape().rank() - 1); + OP_REQUIRES(output_size == _ctx.at(recurrent_to_forget_weights_index).shape().dim(1) && + output_size == _ctx.at(recurrent_to_cell_weights_index).shape().dim(1) && + output_size == _ctx.at(recurrent_to_output_weights_index).shape().dim(1) && + output_size == _ctx.at(output_state_in_index).shape().dim(1)); + + if (has_cifg_param) + { + OP_REQUIRES(input_size == _ctx.at(input_to_input_weights_index).shape().dim(1)); + OP_REQUIRES(num_units == 
_ctx.at(input_to_input_weights_index).shape().dim(0) && + num_units == _ctx.at(recurrent_to_input_weights_index).shape().dim(0) && + (num_units == _ctx.at(cell_to_input_weights_index).shape().dim(0) || + _ctx.at(cell_to_input_weights_index).shape().dim(0) == 0 /* non-peephole */) && + num_units == _ctx.at(input_gate_bias_index).shape().dim(0)); + OP_REQUIRES(output_size == _ctx.at(recurrent_to_input_weights_index).shape().dim(1)); + OP_REQUIRES(has_input_to_input_weights && has_recurrent_to_input_weights && + has_input_gate_bias); + if (has_cell_to_input_weights) + { + // NOTE The cell_to_input_weights exist only in case of non-CIFG and peephole. + OP_REQUIRES(has_peephole_param); + } + if (_ctx.exist(scratch_buffer_index)) + OP_REQUIRES(_ctx.at(scratch_buffer_index).shape().dim(1) == num_units * 4); + } + else + { + if (_ctx.exist(scratch_buffer_index)) + OP_REQUIRES(_ctx.at(scratch_buffer_index).shape().dim(1) == num_units * 3); + } + + if (has_peephole_param) + { + OP_REQUIRES(num_units == _ctx.at(cell_to_forget_weights_index).shape().dim(0) && + num_units == _ctx.at(cell_to_output_weights_index).shape().dim(0) && + (num_units == _ctx.at(cell_to_input_weights_index).shape().dim(0) || + _ctx.at(cell_to_input_weights_index).shape().dim(0) == 0 /* CIFG */)); + } + + if (has_projection_param) + { + OP_REQUIRES(num_units == _ctx.at(projection_weights_index).shape().dim(1)); + OP_REQUIRES(output_size == _ctx.at(projection_weights_index).shape().dim(0)); + if (has_projection_bias) + { + OP_REQUIRES(output_size == _ctx.at(projection_bias_index).shape().dim(0)); + } + } + + if (_ctx.exist(scratch_buffer_index)) + { + OP_REQUIRES(_ctx.at(scratch_buffer_index).shape().rank() == 2); + OP_REQUIRES(batch_size == _ctx.at(scratch_buffer_index).shape().dim(0)); + } + + if (_ctx.exist(output_state_out_index)) + { + OP_REQUIRES(_ctx.at(output_state_out_index).shape().rank() == 2); + OP_REQUIRES(batch_size == _ctx.at(output_state_out_index).shape().dim(0)); + OP_REQUIRES(output_size == _ctx.at(output_state_out_index).shape().dim(1)); + } + + if (_ctx.exist(cell_state_out_index)) + { + OP_REQUIRES(_ctx.at(cell_state_out_index).shape().rank() == 2); + OP_REQUIRES(batch_size == _ctx.at(cell_state_out_index).shape().dim(0)); + OP_REQUIRES(num_units == _ctx.at(cell_state_out_index).shape().dim(1)); + } +} + +void ShapeValidator::visit(const ir::operation::L2Normalization &node) +{ + const auto ofm_index{node.getOutputs().at(0)}; + if (_ctx.at(ofm_index).info().isDynamic()) + return; + + const auto ifm_index{node.getInputs().at(ir::operation::L2Normalization::Input::INPUT)}; + + auto ifm_shape = _ctx.at(ifm_index).shape(); + auto ofm_shape = _ctx.at(ofm_index).shape(); + + OP_REQUIRES(ifm_shape.rank() == ofm_shape.rank()); + + for (auto i = 0; i < ifm_shape.rank(); i++) + { + OP_REQUIRES(ifm_shape.dim(i) == ofm_shape.dim(i)); + } +} + +void ShapeValidator::visit(const ir::operation::Unpack &node) +{ + const auto axis{node.param().axis}; + const auto output_index{node.getInputs().at(0)}; + if (_ctx.at(output_index).info().isDynamic()) + return; + + const auto input_index{node.getInputs().at(ir::operation::Unpack::Input::INPUT)}; + + const auto &input_shape = _ctx.at(input_index).shape(); + const auto input_rank = static_cast<int32_t>(input_shape.rank()); + + OP_REQUIRES(axis >= -input_rank && axis < input_rank); +} + +void ShapeValidator::visit(const ir::operation::Pad &node) +{ + const auto pad_index{node.getInputs().at(ir::operation::Pad::Input::PAD)}; + OP_REQUIRES(_ctx.at(pad_index).typeInfo().type() == 
ir::DataType::INT32); + + const auto output_index{node.getInputs().at(0)}; + if (_ctx.at(output_index).info().isDynamic()) + return; + + const auto input_index{node.getInputs().at(ir::operation::Pad::Input::INPUT)}; + + const auto &pad_shape = _ctx.at(pad_index).shape(); + const auto input_rank = static_cast<int32_t>(_ctx.at(input_index).shape().rank()); + + OP_REQUIRES(pad_shape.rank() == 2); + OP_REQUIRES(pad_shape.dim(0) == input_rank); + OP_REQUIRES(pad_shape.dim(1) == 2); + OP_REQUIRES(_ctx.at(input_index).shape().rank() == _ctx.at(output_index).shape().rank()); +} + +void ShapeValidator::visit(const ir::operation::Select &node) +{ + const auto output_index{node.getOutputs().at(0)}; + // This validator does not check shape. So checking isDynamic() is skipped. + + const auto condition_index{node.getInputs().at(ir::operation::Select::Input::CONDITION)}; + const auto input_true_index{node.getInputs().at(ir::operation::Select::Input::INPUT_TRUE)}; + const auto input_false_index{node.getInputs().at(ir::operation::Select::Input::INPUT_FALSE)}; + UNUSED_RELEASE(output_index); + UNUSED_RELEASE(input_true_index); + UNUSED_RELEASE(input_false_index); + + OP_REQUIRES(_ctx.at(condition_index).typeInfo().type() == ir::DataType::BOOL8); +} + +void ShapeValidator::visit(const ir::operation::StridedSlice &node) +{ + const auto output_index{node.getOutputs().at(0)}; + const auto input_index{node.getInputs().at(ir::operation::StridedSlice::Input::INPUT)}; + + if (_ctx.at(output_index).info().isDynamic()) + return; + + OP_REQUIRES(_ctx.at(input_index).shape().rank() <= 4); +} + +void ShapeValidator::visit(const ir::operation::Split &node) +{ + const auto output_index{node.getOutputs().at(0)}; + if (_ctx.at(output_index).info().isDynamic()) + return; + + const auto input_index{node.getInputs().at(ir::operation::Split::Input::INPUT)}; + const auto axis_index{node.getInputs().at(ir::operation::Split::Input::AXIS)}; + + const auto num_splits = node.param().num_splits; + const auto input_rank = _ctx.at(input_index).shape().rank(); + auto axis = *reinterpret_cast<const int32_t *>(_ctx.at(axis_index).data()->base()); + axis = axis < 0 ? 
axis + input_rank : axis; + + OP_REQUIRES(axis >= 0 && axis < input_rank); + OP_REQUIRES(_ctx.at(input_index).shape().dim(axis) % num_splits == 0); +} + +void ShapeValidator::visit(const ir::operation::Shape &node) +{ + const auto output_index{node.getOutputs().at(0)}; + if (_ctx.at(output_index).info().isDynamic()) + return; + + const auto input_index{node.getInputs().at(0)}; + UNUSED_RELEASE(input_index); + OP_REQUIRES(_ctx.at(output_index).shape().rank() == 1); +} + +void ShapeValidator::visit(const ir::operation::ResizeBilinear &node) +{ + const auto output_index{node.getOutputs().at(0)}; + const auto input_index{node.getInputs().at(ir::operation::ResizeBilinear::Input::INPUT)}; + + if (_ctx.at(output_index).info().isDynamic()) + { + return; + } + OP_REQUIRES(_ctx.at(input_index).shape().rank() == 4); + OP_REQUIRES(_ctx.at(output_index).shape().rank() == 4); +} + +void ShapeValidator::visit(const ir::operation::Reverse &node) +{ + const auto output_index{node.getOutputs().at(0)}; + const auto input_index{node.getInputs().at(ir::operation::Reverse::Input::INPUT)}; + + if (_ctx.at(output_index).info().isDynamic()) + return; + OP_REQUIRES(_ctx.at(output_index).shape() == _ctx.at(input_index).shape()); +} + +void ShapeValidator::visit(const ir::operation::If &) +{ + // TODO Add to validate with subgraphs +} + +void ShapeValidator::visit(const ir::operation::While &) +{ + // This validator does not check shape. So checking isDynamic() is skipped. + // TODO Add to validate with subgraphs +} + +void ShapeValidator::visit(const ir::operation::SquaredDifference &node) +{ + const auto output_index{node.getOutputs().at(0)}; + const auto lhs_index{node.getInputs().at(ir::operation::SquaredDifference::Input::LHS)}; + const auto rhs_index{node.getInputs().at(ir::operation::SquaredDifference::Input::RHS)}; + + // Check for dimension constraints + if (_ctx.at(output_index).info().isDynamic()) + return; + + auto output_shape = _ctx.at(output_index).shape(); + auto lhs_shape = _ctx.at(lhs_index).shape(); + auto rhs_shape = _ctx.at(rhs_index).shape(); + // Check for output rank + OP_REQUIRES(output_shape.rank() == std::max(lhs_shape.rank(), rhs_shape.rank())); + auto min_rank = std::min(lhs_shape.rank(), rhs_shape.rank()); + + for (int idx = 1; idx <= min_rank; idx++) + { + int l_idx = lhs_shape.rank() - idx; + int r_idx = rhs_shape.rank() - idx; + int out_idx = output_shape.rank() - idx; + + OP_REQUIRES((l_idx >= 0) && (r_idx >= 0) && (out_idx >= 0)); + + auto l_dims = lhs_shape.dim(l_idx); + auto r_dims = rhs_shape.dim(r_idx); + auto out_dims = output_shape.dim(out_idx); + + OP_REQUIRES(((l_dims == r_dims) && (out_dims == l_dims)) || + ((l_dims == 1) && (out_dims == r_dims)) || ((r_dims == 1) && (out_dims == l_dims))); + } + auto &tmp_shape = (lhs_shape.rank() > rhs_shape.rank()) ? 
lhs_shape : rhs_shape; + for (int idx = min_rank + 1; idx <= output_shape.rank(); idx++) + { + int out_idx = output_shape.rank() - idx; + int tmp_idx = tmp_shape.rank() - idx; + + OP_REQUIRES((out_idx >= 0) && (tmp_idx >= 0) && + (output_shape.dim(out_idx) == tmp_shape.dim(tmp_idx))); + } +} +void ShapeValidator::visit(const ir::operation::Tile &node) +{ + const auto output_index{node.getOutputs().at(0)}; + if (_ctx.at(output_index).info().isDynamic()) + return; + + const auto input_index{node.getInputs().at(0)}; + const auto multiple_index{node.getInputs().at(1)}; + + OP_REQUIRES(_ctx.at(multiple_index).shape().rank() == 1); + OP_REQUIRES(_ctx.at(multiple_index).shape().dim(0) == _ctx.at(input_index).shape().rank()); + OP_REQUIRES(_ctx.at(input_index).shape().rank() == _ctx.at(output_index).shape().rank()); +} + +void ShapeValidator::visit(const ir::operation::Range &node) +{ + const auto output_index{node.getOutputs().at(0)}; + const auto start_index{node.getInputs().at(ir::operation::Range::Input::START)}; + const auto limit_index{node.getInputs().at(ir::operation::Range::Input::LIMIT)}; + const auto delta_index{node.getInputs().at(ir::operation::Range::Input::DELTA)}; + + // Check for dimension constraints + if (_ctx.at(output_index).info().isDynamic()) + return; + + OP_REQUIRES(_ctx.at(start_index).shape().rank() == 0); + OP_REQUIRES(_ctx.at(limit_index).shape().rank() == 0); + OP_REQUIRES(_ctx.at(delta_index).shape().rank() == 0); +} + +void ShapeValidator::visit(const ir::operation::MatrixBandPart &node) +{ + const auto output_index{node.getOutputs().at(0)}; + const auto input_index{node.getInputs().at(ir::operation::MatrixBandPart::Input::INPUT)}; + const auto num_lower_index{ + node.getInputs().at(ir::operation::MatrixBandPart::Input::NUM_LOWER_DIAG)}; + const auto num_upper_index{ + node.getInputs().at(ir::operation::MatrixBandPart::Input::NUM_UPPER_DIAG)}; + + // Check for dimension constraints + if (_ctx.at(output_index).info().isDynamic()) + return; + + OP_REQUIRES(_ctx.at(input_index).shape().rank() >= 2); // input must be more than 2 dim matrix + OP_REQUIRES(_ctx.at(num_upper_index).shape().rank() == 0); // num_lower must be scalar + OP_REQUIRES(_ctx.at(num_lower_index).shape().rank() == 0); // num_upper must be scalar +} + +void ShapeValidator::visit(const ir::operation::LogSoftmax &node) +{ + const auto output_index{node.getOutputs().at(0)}; + if (_ctx.at(output_index).info().isDynamic()) + return; + + const auto input_index{node.getInputs().at(0)}; + + OP_REQUIRES(_ctx.at(output_index).shape().rank() == _ctx.at(input_index).shape().rank()); +} + +} // namespace compiler +} // namespace onert diff --git a/runtime/onert/core/src/compiler/ShapeValidator.h b/runtime/onert/core/src/compiler/ShapeValidator.h new file mode 100644 index 000000000..f40c098d5 --- /dev/null +++ b/runtime/onert/core/src/compiler/ShapeValidator.h @@ -0,0 +1,102 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef __ONERT_COMPILER_SHAPE_VALIDATOR_H__ +#define __ONERT_COMPILER_SHAPE_VALIDATOR_H__ + +#include "ir/Layout.h" +#include "ir/OperationVisitor.h" + +namespace onert +{ +namespace ir +{ +class Graph; +class Operands; +} // namespace ir +} // namespace onert + +namespace onert +{ +namespace compiler +{ + +class ShapeValidator : public ir::OperationVisitor +{ +public: + ShapeValidator(void) = delete; + ShapeValidator(const ir::Graph &graph); + +public: + void operator()(); + +public: + void visit(const ir::operation::BatchMatMul &node) override; + void visit(const ir::operation::BatchToSpaceND &node) override; + void visit(const ir::operation::BCQFullyConnected &node) override; + void visit(const ir::operation::BCQGather &node) override; + void visit(const ir::operation::Comparison &node) override; + void visit(const ir::operation::Softmax &node) override; + void visit(const ir::operation::InstanceNorm &node) override; + void visit(const ir::operation::Permute &node) override; + void visit(const ir::operation::Pool2D &node) override; + void visit(const ir::operation::Reduce &node) override; + void visit(const ir::operation::Transpose &node) override; + void visit(const ir::operation::RNN &node) override; + void visit(const ir::operation::SpaceToBatchND &node) override; + void visit(const ir::operation::SpaceToDepth &node) override; + void visit(const ir::operation::ElementwiseActivation &node) override; + void visit(const ir::operation::ElementwiseBinary &node) override; + void visit(const ir::operation::ElementwiseUnary &node) override; + void visit(const ir::operation::EmbeddingLookup &node) override; + void visit(const ir::operation::ExpandDims &node) override; + void visit(const ir::operation::HashtableLookup &node) override; + void visit(const ir::operation::TransposeConv &node) override; + void visit(const ir::operation::Gather &node) override; + void visit(const ir::operation::DepthToSpace &node) override; + void visit(const ir::operation::Pack &node) override; + void visit(const ir::operation::LSTM &node) override; + void visit(const ir::operation::L2Normalization &node) override; + void visit(const ir::operation::Unpack &node) override; + void visit(const ir::operation::Pad &node) override; + void visit(const ir::operation::Select &node) override; + void visit(const ir::operation::StridedSlice &node) override; + void visit(const ir::operation::Split &node) override; + void visit(const ir::operation::Shape &node) override; + void visit(const ir::operation::ResizeBilinear &node) override; + void visit(const ir::operation::Reverse &node) override; + void visit(const ir::operation::If &node) override; + void visit(const ir::operation::While &node) override; + void visit(const ir::operation::SquaredDifference &node) override; + void visit(const ir::operation::Tile &node) override; + void visit(const ir::operation::Range &node) override; + void visit(const ir::operation::MatrixBandPart &node) override; + void visit(const ir::operation::LogSoftmax &node) override; + +private: + void checkUnaryOp(const ir::Operation &node); + +private: + // TODO Remove _ctx field + const ir::Graph &_graph; + const ir::Operands &_ctx; + ir::Layout _current_op_seq_layout; +}; + +} // namespace compiler +} // namespace onert + +#endif // __ONERT_COMPILER_SHAPE_VALIDATOR_H__ diff --git a/runtime/onert/core/src/compiler/StaticShapeInference.cc b/runtime/onert/core/src/compiler/StaticShapeInference.cc index 4eba1ff49..df129d98b 100644 --- a/runtime/onert/core/src/compiler/StaticShapeInference.cc +++ 
b/runtime/onert/core/src/compiler/StaticShapeInference.cc @@ -147,16 +147,26 @@ void StaticShapeInferer::visit(const ir::operation::ArgMax &op) const auto input_idx{op.getInputs().at(ir::operation::ArgMax::Input::INPUT)}; const auto &input = _operands.at(input_idx); + const auto axis_idx{op.getInputs().at(ir::operation::ArgMax::Input::AXIS)}; + const auto &axis = _operands.at(axis_idx); + // get mutable output operand const auto output_idx = op.getOutputs().at(0); ir::Operand &output = _operands.at(output_idx); - const auto rank = input.info().shape().rank(); - const auto axis = ((op.param().axis < 0) ? rank + op.param().axis : op.param().axis); - assert(0 <= axis && axis < rank); + if (!axis.isConstant()) + { + output.info().setDynamic(); + _return_has_dynamic_tensor = true; + return; + } + + const auto rank = input.info().shape().rank(); + auto axis_value = axis.asScalar<int32_t>(); + axis_value = axis_value < 0 ? axis_value + rank : axis_value; // re-sizing output shape - ir::Shape new_shape = shape_inference::inferArgMaxShape(input.info().shape(), axis, rank); + ir::Shape new_shape = shape_inference::inferArgMaxShape(input.info().shape(), axis_value, rank); output.info().shape(new_shape); } @@ -165,13 +175,60 @@ void StaticShapeInferer::visit(const ir::operation::BatchMatMul &op) const auto lhs_index = op.getInputs().at(ir::operation::BatchMatMul::Input::LHS); const auto rhs_index = op.getInputs().at(ir::operation::BatchMatMul::Input::RHS); const auto output_index = op.getOutputs().at(0); - const auto lhs = _operands.at(lhs_index); - const auto rhs = _operands.at(rhs_index); + const auto &lhs = _operands.at(lhs_index); + const auto &rhs = _operands.at(rhs_index); auto &output = _operands.at(output_index); auto new_shape = shape_inference::inferBatchMatMulShape(lhs.shape(), rhs.shape(), op.param()); output.info().shape(new_shape); } +void StaticShapeInferer::visit(const ir::operation::BCQFullyConnected &op) +{ + const auto input_idx{op.getInputs().at(ir::operation::BCQFullyConnected::Input::INPUT)}; + const auto &input = _operands.at(input_idx); + + const auto cluster_idx{ + op.getInputs().at(ir::operation::BCQFullyConnected::Input::WEIGHTS_CLUSTERS)}; + const auto &cluster = _operands.at(cluster_idx); + + const auto output_idx = op.getOutputs().at(0); + ir::Operand &output = _operands.at(output_idx); + + auto cluster_buf = reinterpret_cast<const int32_t *>(cluster.data()->base()); + assert(cluster_buf); + + // re-sizing output shape + ir::Shape new_shape = shape_inference::inferBCQFullyConnectedShape( + input.info().shape(), cluster.info().shape(), cluster_buf); + output.info().shape(new_shape); +} + +void StaticShapeInferer::visit(const ir::operation::BCQGather &op) +{ + const auto indices_idx{op.getInputs().at(ir::operation::BCQGather::Input::INDICES)}; + const auto &indices = _operands.at(indices_idx); + + const auto input_binary_idx{op.getInputs().at(ir::operation::BCQGather::Input::INPUT_BINARY)}; + const auto &input_binary = _operands.at(input_binary_idx); + + const auto cluster_idx{op.getInputs().at(ir::operation::BCQGather::Input::INPUT_CLUSTERS)}; + const auto &cluster = _operands.at(cluster_idx); + + const auto output_idx = op.getOutputs().at(0); + ir::Operand &output = _operands.at(output_idx); + + auto cluster_buf = reinterpret_cast<const int32_t *>(cluster.data()->base()); + assert(cluster_buf); + + auto rank = input_binary.shape().rank(); + + // re-sizing output shape + ir::Shape new_shape = shape_inference::inferBCQGatherShape( + indices.info().shape(), 
cluster.info().shape(), cluster_buf, rank, op.param()); + + output.info().shape(new_shape); +} + void StaticShapeInferer::visit(const ir::operation::BinaryArithmetic &op) { handleBinaryArithmeticOp(op, op.getInputs().at(ir::operation::BinaryArithmetic::Input::LHS), @@ -439,6 +496,98 @@ void StaticShapeInferer::visit(const ir::operation::L2Normalization &op) handleSimpleUnaryOp(op, op.getInputs().at(ir::operation::L2Normalization::Input::INPUT)); } +void StaticShapeInferer::visit(const ir::operation::LSTM &op) +{ + const auto output_index{op.getOutputs().at(ir::operation::LSTM::Output::OUTPUT)}; + auto &output = _operands.at(output_index); + + const auto output_state_out_index{ + op.getOutputs().at(ir::operation::LSTM::Output::OUTPUT_STATE_OUT)}; + + const auto cell_state_out_index{op.getOutputs().at(ir::operation::LSTM::Output::CELL_STATE_OUT)}; + + const auto scratch_buffer_index{op.getOutputs().at(ir::operation::LSTM::Output::SCRATCH_BUFFER)}; + + if (output.info().isDynamic() || (_operands.exist(output_state_out_index) && + _operands.at(output_state_out_index).info().isDynamic()) || + (_operands.exist(cell_state_out_index) && + _operands.at(cell_state_out_index).info().isDynamic()) || + (_operands.exist(scratch_buffer_index) && + _operands.at(scratch_buffer_index).info().isDynamic())) + return; + + const auto input_index{op.getInputs().at(ir::operation::LSTM::Input::INPUT)}; + const auto &input = _operands.at(input_index); + + const auto input_to_output_weights_index{ + op.getInputs().at(ir::operation::LSTM::Input::INPUT_TO_OUTPUT_WEIGHTS)}; + const auto &input_to_output_weights = _operands.at(input_to_output_weights_index); + + const auto recurrent_to_output_weights_index{ + op.getInputs().at(ir::operation::LSTM::Input::RECURRENT_TO_OUTPUT_WEIGHTS)}; + const auto &recurrent_to_output_weights = _operands.at(recurrent_to_output_weights_index); + + // re-sizing outputs + const int n_batch = (input.shape().rank() == 3 && op.param().time_major) ? 
input.shape().dim(1) + : input.shape().dim(0); + const int n_cell = input_to_output_weights.shape().dim(0); + const int n_output = recurrent_to_output_weights.shape().dim(1); + if (input.shape().rank() == 3) + { + if (op.param().time_major) + output.info().shape(ir::Shape{input.shape().dim(0), n_batch, n_output}); + else + output.info().shape(ir::Shape{n_batch, input.shape().dim(1), n_output}); + } + else + { + assert(input.shape().rank() == 2); + output.info().shape(ir::Shape{n_batch, n_output}); + } + + if (_operands.exist(output_state_out_index)) + { + auto &output_state_out = _operands.at(output_state_out_index); + output_state_out.info().shape(ir::Shape{n_batch, n_output}); + } + + if (_operands.exist(cell_state_out_index)) + { + auto &cell_state_out = _operands.at(cell_state_out_index); + cell_state_out.info().shape(ir::Shape{n_batch, n_cell}); + } + + if (_operands.exist(scratch_buffer_index)) + { + auto &scratch_buffer = _operands.at(scratch_buffer_index); + + const auto input_to_input_weights_index{ + op.getInputs().at(ir::operation::LSTM::Input::INPUT_TO_INPUT_WEIGHTS)}; + const auto recurrent_to_input_weights_index{ + op.getInputs().at(ir::operation::LSTM::Input::RECURRENT_TO_INPUT_WEIGHTS)}; + + bool has_input_to_input_weights = + _operands.at(input_to_input_weights_index).shape().dim(0) != 0 && + _operands.at(input_to_input_weights_index).shape().dim(1) != 0; + bool has_recurrent_to_input_weights = + _operands.at(recurrent_to_input_weights_index).shape().dim(0) != 0 && + _operands.at(recurrent_to_input_weights_index).shape().dim(1) != 0; + + // NOTE The cell_to_input_weights do not exist in non-peephole although regular LSTM(non-CIFG). + // true: no CIFG + // false: CIFG + bool has_cifg_param = has_input_to_input_weights && has_recurrent_to_input_weights; + if (has_cifg_param) + { + scratch_buffer.info().shape(ir::Shape{n_batch, n_cell * 4}); + } + else + { + scratch_buffer.info().shape(ir::Shape{n_batch, n_cell * 3}); + } + } +} + void StaticShapeInferer::visit(const ir::operation::MatrixBandPart &op) { handleSimpleUnaryOp(op, op.getInputs().at(ir::operation::MatrixBandPart::Input::INPUT)); @@ -683,9 +832,29 @@ void StaticShapeInferer::visit(const ir::operation::ResizeBilinear &op) const auto output_idx = op.getOutputs().at(0); ir::Operand &output = _operands.at(output_idx); + int32_t height_out, width_out; + if (op.getInputs().size() == 2) + { + auto &size = _operands.at(op.getInputs().at(ir::operation::ResizeBilinear::Input::SIZE)); + if (!size.isConstant()) + { + output.info().setDynamic(); + _return_has_dynamic_tensor = true; + return; + } + const auto size_v = size.asVector<std::int32_t>(); + height_out = size_v[0]; + width_out = size_v[1]; + } + else + { + height_out = op.param().height_out; + width_out = op.param().width_out; + } + // Shape inferencing logic based on Params - ir::Shape new_shape = shape_inference::inferResizeBilinearShape( - input.shape(), op.param().height_out, op.param().width_out); + ir::Shape new_shape = + shape_inference::inferResizeBilinearShape(input.shape(), height_out, width_out); // if size_op is from Const, TFLC put the shape of output into tensor if (new_shape != output.shape()) @@ -803,21 +972,35 @@ void StaticShapeInferer::visit(const ir::operation::SpaceToBatchND &op) void StaticShapeInferer::visit(const ir::operation::Split &op) { - const auto input_idx{op.getInputs().at(0)}; + const auto input_idx{op.getInputs().at(ir::operation::Split::Input::INPUT)}; const auto &input = _operands.at(input_idx); - const auto axis = op.param().axis; + 
const auto axis_idx{op.getInputs().at(ir::operation::Split::Input::AXIS)}; + const auto &axis = _operands.at(axis_idx); + + auto outputs = op.getOutputs(); + if (!axis.isConstant()) + { + for (auto output_idx : outputs) + { + ir::Operand &output = _operands.at(output_idx); + output.info().setDynamic(); + } + _return_has_dynamic_tensor = true; + return; + } + + const auto num_splits = op.param().num_splits; const auto rank = input.info().shape().rank(); - auto axis_resolved = axis < 0 ? axis + rank : axis; + auto axis_value = axis.asScalar<int32_t>(); + axis_value = axis_value < 0 ? axis_value + rank : axis_value; - assert(0 <= axis_resolved && axis_resolved < rank); + assert(0 <= axis_value && axis_value < rank); ir::Shape new_shape = - shape_inference::inferSplitShape(input.info().shape(), axis_resolved, num_splits); - auto output_tensors = op.getOutputs(); - for (auto output_idx : output_tensors) + shape_inference::inferSplitShape(input.info().shape(), axis_value, num_splits); + for (auto output_idx : outputs) { ir::Operand &output = _operands.at(output_idx); output.info().shape(new_shape); @@ -838,13 +1021,6 @@ void StaticShapeInferer::visit(const ir::operation::Squeeze &op) const auto output_idx = op.getOutputs().at(0); ir::Operand &output = _operands.at(output_idx); - if (input.info().isDynamic()) - { - output.info().setDynamic(); - _return_has_dynamic_tensor = true; - return; - } - // Squeeze output shape ir::Shape new_shape = shape_inference::inferSqueezeShape(input.info().shape(), op.param()); output.info().shape(new_shape); @@ -909,7 +1085,8 @@ void StaticShapeInferer::visit(const ir::operation::Tile &op) assert(multiplier_buffer); // re-sizing output shape - auto new_shape = shape_inference::inferTileShape(input.info().shape(), multiplier_buffer); + auto new_shape = shape_inference::inferTileShape(input.info().shape(), multiplier_buffer, + multiplier.shape().num_elements()); output.info().shape(new_shape); } @@ -918,14 +1095,43 @@ void StaticShapeInferer::visit(const ir::operation::Transpose &op) const auto input_idx{op.getInputs().at(ir::operation::Transpose::Input::INPUT)}; const auto &input = _operands.at(input_idx); + const auto perm_idx{op.getInputs().at(ir::operation::Transpose::Input::PERMUTATION)}; + const auto &perm = _operands.at(perm_idx); + + // perm.shape() != ir::Shape{0} means that perm is (n-1...0) + // TODO This condition changes to perm.num_elements() == 0 + const auto is_regular_transpose = perm.shape() == ir::Shape{0}; + // get mutable output operand const auto output_idx = op.getOutputs().at(0); - ir::Operand &output = _operands.at(output_idx); - const auto perm{op.param().perm}; - // const auto rank{op.param().rank}; + auto &output = _operands.at(output_idx); + if (!perm.isConstant() && !is_regular_transpose) + { + output.info().setDynamic(); + _return_has_dynamic_tensor = true; + return; + } - // set output shape, based on input and params - ir::Shape new_shape = shape_inference::inferTransposeShape(input.info().shape(), perm); + ir::Shape new_shape; + if (is_regular_transpose) + { + // Call by (n-1...0) + new_shape = shape_inference::inferTransposeShape(input.info().shape(), nullptr, 0); + } + else + { + // Check rank + if (input.info().shape().rank() != static_cast<int>(perm.info().shape().num_elements())) + { + throw std::runtime_error("StaticShapeInferer failed, bad rank size: " + + std::to_string(perm.info().shape().num_elements())); + } + + // set output shape, based on input and params + const auto perm_buf = reinterpret_cast<const int32_t
*>(perm.data()->base()); + new_shape = shape_inference::inferTransposeShape(input.info().shape(), perm_buf, + perm.shape().num_elements()); + } output.info().shape(new_shape); } diff --git a/runtime/onert/core/src/compiler/TensorRegistries.h b/runtime/onert/core/src/compiler/TensorRegistries.h index 8be87b081..e42225cbf 100644 --- a/runtime/onert/core/src/compiler/TensorRegistries.h +++ b/runtime/onert/core/src/compiler/TensorRegistries.h @@ -69,7 +69,7 @@ public: return _cf_tensor_reg; } - std::shared_ptr<backend::ITensor> getITensor(ir::OperandIndex ind) const + backend::ITensor *getITensor(ir::OperandIndex ind) const { for (auto &tensor_reg : _tensor_regs) { diff --git a/runtime/onert/core/src/compiler/pass/ConstantInsertionPass.cc b/runtime/onert/core/src/compiler/pass/ConstantInsertionPass.cc index 647669e46..ef6240894 100644 --- a/runtime/onert/core/src/compiler/pass/ConstantInsertionPass.cc +++ b/runtime/onert/core/src/compiler/pass/ConstantInsertionPass.cc @@ -44,7 +44,7 @@ void ConstantInsertionPass::callback(const ir::OperationIndex &node_index, ir::O const auto key = ReplaceKey{input, factor}; if (_replace_operands_map.count(key) == 0) { - auto new_object = object; + ir::Operand new_object(object); new_object.unsetDef(); // TODO Remove const_case const_cast<ir::OperationIndexSet &>(new_object.getUses()).clear(); @@ -81,7 +81,7 @@ void ConstantInsertionPass::callback(const ir::OperationIndex &node_index, ir::O } // Now this runtime does not support the node making output as constant - for (const auto &output : node.getOutputs()) + for (const auto &output : node.getOutputs() | ir::Remove::DUPLICATED | ir::Remove::UNDEFINED) { UNUSED_RELEASE(output); assert(!_graph.operands().at(output).isConstant()); diff --git a/runtime/onert/core/src/compiler/pass/ConstantOutputPass.cc b/runtime/onert/core/src/compiler/pass/ConstantOutputPass.cc new file mode 100644 index 000000000..c176f6ffb --- /dev/null +++ b/runtime/onert/core/src/compiler/pass/ConstantOutputPass.cc @@ -0,0 +1,68 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "ConstantOutputPass.h" + +#include "ir/Graph.h" +#include "ir/operation/Permute.h" +#include "util/logging.h" + +namespace onert +{ +namespace compiler +{ +namespace pass +{ + +void ConstantOutputPass::callback(const ir::OperandIndex &ind, ir::Operand &obj) +{ + if (!_graph.getOutputs().contains(ind) || !obj.isConstant()) + return; + + auto permute_input_ind = _graph.addOperand(obj.shape(), obj.typeInfo()); + auto &permute_input_obj = _graph.operands().at(permute_input_ind); + + // Move the const data + permute_input_obj.data(obj.shareData()); + obj.releaseData(); + obj.info().setAsNonConst(); + + using ir::operation::Permute; + auto permute_obj = std::make_unique<Permute>(permute_input_ind, ind, Permute::Type::COPY); + auto permute_ind = _graph.operations().push(std::move(permute_obj)); + + permute_input_obj.insertUse(permute_ind); + obj.setDef(permute_ind); + + // Make the operations that uses this operand to use the generated operand + auto orig_uses = obj.getUses(); + for (auto use : orig_uses) + { + permute_input_obj.insertUse(use); + obj.removeUse(use); + _graph.operations().at(use).replaceInputs(ind, permute_input_ind); + } + + VERBOSE(ConstantOutputPass) << "Permute Op inserted for a constant ouput, node index : " + << permute_ind << std::endl; + VERBOSE(ConstantOutputPass) << " - Input (inserted) Operand : " << permute_input_ind + << std::endl; + VERBOSE(ConstantOutputPass) << " - Output(original) Operand : " << ind << std::endl; +} + +} // namespace pass +} // namespace compiler +} // namespace onert diff --git a/runtime/onert/core/src/compiler/pass/ConstantOutputPass.h b/runtime/onert/core/src/compiler/pass/ConstantOutputPass.h new file mode 100644 index 000000000..193dd3a68 --- /dev/null +++ b/runtime/onert/core/src/compiler/pass/ConstantOutputPass.h @@ -0,0 +1,63 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __ONERT_COMPILER_PASS_CONSTANT_OUTPUT_PASS_H__ +#define __ONERT_COMPILER_PASS_CONSTANT_OUTPUT_PASS_H__ + +#include "OperandPass.h" + +namespace onert +{ +namespace compiler +{ +namespace pass +{ + +/** + * @brief Pass to specially handle constant model outputs + * + * As an output buffer is given right before an execution but constant initialization is done at + * prepare phase, the current runtime structure cannot handle when an output is constant. + * To resolve this problem, this pass inserts a Permute layer with a const input and make the model + * output tensor to be its output. + * + * e.g.) + * + * ((Const Output)) + * + * becomes + * + * (Const) -> [Permute] -> ((Output)) + * + * Note that this is a mandatory pass for Graph. 
+ */ +class ConstantOutputPass : public OperandPass +{ +public: + using OperandPass::OperandPass; + +public: + std::string id() final { return "ConstantOutputPass"; } + +public: + void callback(const ir::OperandIndex &i, ir::Operand &o) final; +}; + +} // namespace pass +} // namespace compiler +} // namespace onert + +#endif // __ONERT_COMPILER_PASS_CONSTANT_OUTPUT_PASS_H__ diff --git a/runtime/onert/core/src/compiler/pass/OddOutputPass.cc b/runtime/onert/core/src/compiler/pass/OddOutputPass.cc new file mode 100644 index 000000000..f50fae0d3 --- /dev/null +++ b/runtime/onert/core/src/compiler/pass/OddOutputPass.cc @@ -0,0 +1,90 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "OddOutputPass.h" + +#include "ir/Graph.h" +#include "ir/operation/Permute.h" +#include "util/logging.h" +#include "util/Utils.h" + +namespace onert +{ +namespace compiler +{ +namespace pass +{ + +void OddOutputPass::run() +{ + auto &outputs = _graph.getOutputs(); + + VERBOSE(OddOutputPass) << "Case 1 : An operand which is a model output and a model input" + << std::endl; + for (auto &ind : outputs) + { + if (_graph.getInputs().contains(ind)) + { + auto permute_output_ind = insertPermute(ind); + // Update the output to be the newly added operand + _graph.getOutputs().replace(ind, permute_output_ind); + } + } + + VERBOSE(OddOutputPass) << "Case 2 : Two or more duplicated outputs" << std::endl; + std::unordered_set<ir::OperandIndex> occurrence; + for (auto &ind : outputs) + { + auto &obj = _graph.operands().at(ind); + if (occurrence.count(ind) == 0) + { + occurrence.insert(ind); + continue; + } + + // Panic when it is const; it must have been handled earlier in another pass + UNUSED_RELEASE(obj); + assert(!obj.isConstant()); + + auto permute_output_ind = insertPermute(ind); + ind = permute_output_ind; // Replace output index to fix output duplication + } +} + +ir::OperandIndex OddOutputPass::insertPermute(ir::OperandIndex ind) +{ + auto &obj = _graph.operands().at(ind); + auto output_ind = _graph.addOperand(obj.shape(), obj.typeInfo()); + auto &output_obj = _graph.operands().at(output_ind); + + using ir::operation::Permute; + auto permute_obj = std::make_unique<Permute>(ind, output_ind, Permute::Type::COPY); + auto permute_ind = _graph.operations().push(std::move(permute_obj)); + + output_obj.setDef(permute_ind); + obj.insertUse(permute_ind); + + VERBOSE(OddOutputPass) << "Permute Op inserted for an odd output, node index : " + << permute_ind << std::endl; + VERBOSE(OddOutputPass) << " - Input (original) Operand : " << ind << std::endl; + VERBOSE(OddOutputPass) << " - Output(inserted) Operand : " << output_ind << std::endl; + + return output_ind; +} + +} // namespace pass +} // namespace compiler +} // namespace onert diff --git a/runtime/onert/core/src/compiler/pass/OddOutputPass.h b/runtime/onert/core/src/compiler/pass/OddOutputPass.h new file mode 100644 index 000000000..2accbac60 --- /dev/null +++ 
b/runtime/onert/core/src/compiler/pass/OddOutputPass.h @@ -0,0 +1,89 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __ONERT_COMPILER_PASS_ODD_OUTPUT_PASS_H__ +#define __ONERT_COMPILER_PASS_ODD_OUTPUT_PASS_H__ + +#include <unordered_set> + +#include "Pass.h" +#include "ir/Index.h" + +namespace onert +{ +namespace compiler +{ +namespace pass +{ + +/** + * @brief Pass to specially handle odd outputs in a subgraph + * + * Runtime Graph IR requires every input or output must have distinct tensor index, this is onert's + * restriction. However we allow duplication of indices in the models(or API). So we should + * transform the graph after model-loading. + * + * This is necessary since our API lets users to set different buffers for each input and output so + * it is unavoidable that we must copy the value at runtime. + * + * Note that this is a mandatory pass for Graph. + * + * Case 1 : An operand which is a model output and a model input + * + * Create an operand and insert a Permute(copy) op between them. And change the output to be the + * newly generated operand. + * + * e.g.) + * + * ``` + * ((#0 Input0 and also Output0)) + * becomes + * ((#0 Input0)) -> [#0 Permute] -> ((#1 Output0)) + * ``` + * + * Case 2 : Two or more duplicated outputs + * + * Do the same with Case 1, but between two outputs of the same tensor index. + * + * e.g.) + * + * ``` + * ((#0 Input0)) -> [#0 Some Operation] -> ((#1 Output0 and also Output1)) + * becomes + * ((#0 Input0)) -> [#0 Some Operation] -> ((#1 Output0)) [#1 Permute] -> ((#2 Output1)) + * ``` + * + */ +class OddOutputPass : public Pass +{ +public: + using Pass::Pass; + +public: + std::string id() final { return "OddOutputPass"; } + +public: + void run() override; + +private: + ir::OperandIndex insertPermute(ir::OperandIndex input); +}; + +} // namespace pass +} // namespace compiler +} // namespace onert + +#endif // __ONERT_COMPILER_PASS_ODD_OUTPUT_PASS_H__ diff --git a/runtime/onert/core/src/compiler/pass/PassRunner.cc b/runtime/onert/core/src/compiler/pass/PassRunner.cc new file mode 100644 index 000000000..2a058c8ac --- /dev/null +++ b/runtime/onert/core/src/compiler/pass/PassRunner.cc @@ -0,0 +1,45 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "PassRunner.h" + +namespace onert +{ +namespace compiler +{ +namespace pass +{ + +PassRunner &PassRunner::append(std::unique_ptr<Pass> pass) +{ + _passes.emplace_back(std::move(pass)); + return *this; +} + +void PassRunner::run() +{ + for (auto &pass : _passes) + { + VERBOSE(PassRunner) << "Start running '" << pass->id() << "'" << std::endl; + pass->run(); + VERBOSE(PassRunner) << "Finished running '" << pass->id() << "'" << std::endl; + // TODO Dump graph(LowerInfo, OpSequence, ...)? + } +} + +} // namespace pass +} // namespace compiler +} // namespace onert diff --git a/runtime/onert/core/src/compiler/pass/PassRunner.h b/runtime/onert/core/src/compiler/pass/PassRunner.h new file mode 100644 index 000000000..a43c83f89 --- /dev/null +++ b/runtime/onert/core/src/compiler/pass/PassRunner.h @@ -0,0 +1,53 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __ONERT_COMPILER_PASS_PASS_RUNNER_H__ +#define __ONERT_COMPILER_PASS_PASS_RUNNER_H__ + +#include <initializer_list> +#include <memory> +#include <vector> + +#include "Pass.h" +#include "util/logging.h" + +namespace onert +{ +namespace compiler +{ +namespace pass +{ + +/** + * @brief Composite passes with logging + */ +class PassRunner +{ +public: + PassRunner() = default; + PassRunner &append(std::unique_ptr<Pass> pass); + + void run(); + +private: + std::vector<std::unique_ptr<Pass>> _passes; +}; + +} // namespace pass +} // namespace compiler +} // namespace onert + +#endif // __ONERT_COMPILER_PASS_PASS_RUNNER_H__ diff --git a/runtime/onert/core/src/compiler/pass/PermutationEliminationPass.cc b/runtime/onert/core/src/compiler/pass/PermutationEliminationPass.cc index f01697034..504f1b995 100644 --- a/runtime/onert/core/src/compiler/pass/PermutationEliminationPass.cc +++ b/runtime/onert/core/src/compiler/pass/PermutationEliminationPass.cc @@ -53,6 +53,20 @@ void PermutationEliminationPass::visit(const ir::operation::Permute &node) if (_graph.getOutputs().contains(out_operand)) { + // If the input is a const, we cannot remove it since we cannot put the constant data in the + // output buffer during prepare phase. + auto permute_input = node.getInputs().at(0); + if (_graph.operands().at(permute_input).isConstant()) + return; + // If the input is a model input, we cannot remove it since our API lets users to set different + // buffers for inputs and outputs even though one tensor is both at the same time. + auto permute_output = node.getOutputs().at(0); + if (_graph.getInputs().contains(permute_input) && _graph.getOutputs().contains(permute_output)) + return; + // Likewise, if copying between outputs to outputs, keep it. 
+ if (_graph.getOutputs().contains(permute_input) && _graph.getOutputs().contains(permute_output)) + return; + // Exceptional case : When the output operand is a model output // In this case we keep the output and remove the input diff --git a/runtime/onert/core/src/compiler/pass/PermutationOperationPass.cc b/runtime/onert/core/src/compiler/pass/PermutationOperationPass.cc index c5c95c726..93d125307 100644 --- a/runtime/onert/core/src/compiler/pass/PermutationOperationPass.cc +++ b/runtime/onert/core/src/compiler/pass/PermutationOperationPass.cc @@ -212,7 +212,7 @@ void PermutationOperationPass::changeToKeepLayout(const Operation &node) } } - for (const auto &output : node.getOutputs() | Remove::DUPLICATED) + for (const auto &output : node.getOutputs() | Remove::DUPLICATED | Remove::UNDEFINED) { auto lower_info = _lowered_graph.getLowerInfo(output); lower_info->removeDefPermuteFactor(removed_factor); @@ -279,6 +279,18 @@ void PermutationOperationPass::visit(const ir::operation::Gather &node) } } +void PermutationOperationPass::visit(const ir::operation::OneHot &node) +{ + const auto &output_ind = node.getOutputs().at(0); + const auto &output_obj = _graph.operands().at(output_ind); + const auto &output_shape = output_obj.shape(); + + if (output_shape.rank() >= 4) + { + changeToKeepLayout(node); + } +} + void PermutationOperationPass::visit(const ir::operation::Pack &node) { const auto &input_ind = node.getInputs().at(ir::operation::Reshape::Input::INPUT); diff --git a/runtime/onert/core/src/compiler/pass/PermutationOperationPass.h b/runtime/onert/core/src/compiler/pass/PermutationOperationPass.h index 2dd76b971..cea5de288 100644 --- a/runtime/onert/core/src/compiler/pass/PermutationOperationPass.h +++ b/runtime/onert/core/src/compiler/pass/PermutationOperationPass.h @@ -44,6 +44,7 @@ public: void visit(const ir::operation::Concat &) final; void visit(const ir::operation::ElementwiseBinary &) final; void visit(const ir::operation::ElementwiseUnary &) final; + void visit(const ir::operation::OneHot &) final; void visit(const ir::operation::Pack &) final; void visit(const ir::operation::PReLU &) final; void visit(const ir::operation::SquaredDifference &) final; diff --git a/runtime/onert/core/src/dumper/dot/DotDumper.cc b/runtime/onert/core/src/dumper/dot/DotDumper.cc index 118057f09..8f3cf328c 100644 --- a/runtime/onert/core/src/dumper/dot/DotDumper.cc +++ b/runtime/onert/core/src/dumper/dot/DotDumper.cc @@ -81,11 +81,8 @@ void DotDumper::dump(const std::string &tag) } else { - showing_cond = !object.isConstant(); - } - if (object.isConstant() || _graph.getInputs().contains(index)) - { - showing_cond = showing_cond && (object.getUses().size() > 0); + showing_cond = + !object.isConstant() || (_graph.getInputs() + _graph.getOutputs()).contains(index); } if (showing_cond) { diff --git a/runtime/onert/core/src/exec/DataflowExecutor.cc b/runtime/onert/core/src/exec/DataflowExecutor.cc index a69ae9cdb..53bc3c204 100644 --- a/runtime/onert/core/src/exec/DataflowExecutor.cc +++ b/runtime/onert/core/src/exec/DataflowExecutor.cc @@ -77,14 +77,12 @@ bool DataflowExecutor::noWaitingJobs() [](const std::unique_ptr<Job> &job) { return job == nullptr; }); } -DataflowExecutor::DataflowExecutor( - std::unique_ptr<compiler::LoweredGraph> lowered_graph, - const std::vector<std::shared_ptr<backend::ITensor>> &input_tensors, - const std::vector<std::shared_ptr<backend::ITensor>> &output_tensors, - const compiler::TensorRegistries &tensor_regs, backend::TensorManagerSet &&tensor_mgrs, - compiler::CodeMap 
&&code_map) - : ExecutorBase{std::move(lowered_graph), input_tensors, output_tensors, tensor_regs, - std::move(tensor_mgrs)}, +DataflowExecutor::DataflowExecutor(std::unique_ptr<compiler::LoweredGraph> lowered_graph, + const std::vector<backend::ITensor *> &input_tensors, + const std::vector<backend::ITensor *> &output_tensors, + const compiler::TensorRegistries &tensor_regs, + compiler::CodeMap &&code_map) + : ExecutorBase{std::move(lowered_graph), input_tensors, output_tensors, tensor_regs}, _code_map{std::move(code_map)} { VERBOSE(DataflowExecutor) << "Constructing Dataflow Executor" << std::endl; @@ -161,6 +159,8 @@ void DataflowExecutor::executeImpl() _subject.notifyJobBegin(this, op_seq, backend); + job->fn_seq()->initRunning(); + // check if FunctionSequence needs to handle dynamic tensor bool handle_dynamic_tensor = op_seq->has_dynamic_tensor() || dynamic_input_exists; job->fn_seq()->enableDynamicShapeInferer(handle_dynamic_tensor); diff --git a/runtime/onert/core/src/exec/DataflowExecutor.h b/runtime/onert/core/src/exec/DataflowExecutor.h index 8d60e3e4b..69dfda15c 100644 --- a/runtime/onert/core/src/exec/DataflowExecutor.h +++ b/runtime/onert/core/src/exec/DataflowExecutor.h @@ -50,10 +50,9 @@ public: * @param code_map OpSequence and its code map */ DataflowExecutor(std::unique_ptr<compiler::LoweredGraph> lowered_graph, - const std::vector<std::shared_ptr<backend::ITensor>> &input_tensors, - const std::vector<std::shared_ptr<backend::ITensor>> &output_tensors, - const compiler::TensorRegistries &tensor_regs, - backend::TensorManagerSet &&tensor_mgrs, compiler::CodeMap &&code_map); + const std::vector<backend::ITensor *> &input_tensors, + const std::vector<backend::ITensor *> &output_tensors, + const compiler::TensorRegistries &tensor_regs, compiler::CodeMap &&code_map); void executeImpl() override; diff --git a/runtime/onert/core/src/exec/DynamicShapeInference.cc b/runtime/onert/core/src/exec/DynamicShapeInference.cc index 70bddfce4..0f604c43f 100644 --- a/runtime/onert/core/src/exec/DynamicShapeInference.cc +++ b/runtime/onert/core/src/exec/DynamicShapeInference.cc @@ -23,14 +23,6 @@ namespace onert namespace exec { -inline backend::IDynamicTensorManager * -dynamicTensorManagerOf(const std::shared_ptr<backend::ITensor> &tensor) -{ - if (!tensor->dynamic_tensor_manager()) - throw std::runtime_error{"Dynamic Tensor Manager is not available for this tensor."}; - return tensor->dynamic_tensor_manager(); -} - void DynamicShapeInferer::handleBinaryArithmeticOp(const ir::Operation &op, const ir::OperandIndex lhs_idx, const ir::OperandIndex rhs_idx) @@ -64,7 +56,7 @@ void DynamicShapeInferer::handleBinaryArithmeticOp(const ir::Operation &op, ir::Shape new_shape = shape_inference::inferEltwiseShape(lhs_shape, rhs_shape); - dynamicTensorManagerOf(output)->applyShape(output_idx, new_shape); + output->applyShape(new_shape); assert(output->buffer() != nullptr); } @@ -96,30 +88,32 @@ void DynamicShapeInferer::handleSimpleUnaryOp(const ir::Operation &op, auto output_ind = op.getOutputs().at(0); auto output = _tensor_registry->getITensor(output_ind); - dynamicTensorManagerOf(output)->applyShape(output_ind, output_shape); + output->applyShape(output_shape); assert(output->buffer() != nullptr); } void DynamicShapeInferer::visit(const ir::operation::ArgMax &op) { const auto input_idx{op.getInputs().at(ir::operation::ArgMax::Input::INPUT)}; - const auto &input = _tensor_registry->getITensor(input_idx); - auto input_shape = input->getShape(); + const auto input = 
_tensor_registry->getITensor(input_idx); + + const auto axis_idx{op.getInputs().at(ir::operation::ArgMax::Input::AXIS)}; + const auto axis = _tensor_registry->getITensor(axis_idx); + + auto output_ind = op.getOutputs().at(0); + auto output = _tensor_registry->getITensor(output_ind); if (!input->is_dynamic()) return; + auto input_shape = input->getShape(); + auto axis_value = *reinterpret_cast<const int32_t *>(axis->buffer()); const auto rank = input_shape.rank(); - const auto axis = ((op.param().axis < 0) ? rank + op.param().axis : op.param().axis); - - assert(0 <= axis && axis < rank); - - auto output_ind = op.getOutputs().at(0); - auto output = _tensor_registry->getITensor(output_ind); + axis_value = axis_value < 0 ? axis_value + rank : axis_value; - ir::Shape new_shape = shape_inference::inferArgMaxShape(input_shape, axis, rank); + ir::Shape new_shape = shape_inference::inferArgMaxShape(input_shape, axis_value, rank); - dynamicTensorManagerOf(output)->applyShape(output_ind, new_shape); + output->applyShape(new_shape); assert(output->buffer() != nullptr); } @@ -141,7 +135,68 @@ void DynamicShapeInferer::visit(const ir::operation::BatchMatMul &op) // TODO auto new_shape = shape_inference::inferBatchMatMulShape(lhs_shape, rhs_shape, op.param()); - dynamicTensorManagerOf(output)->applyShape(output_index, new_shape); + output->applyShape(new_shape); +} + +void DynamicShapeInferer::visit(const ir::operation::BCQFullyConnected &op) +{ + const auto input_idx{op.getInputs().at(ir::operation::BCQFullyConnected::Input::INPUT)}; + const auto &input = _tensor_registry->getITensor(input_idx); + + const auto cluster_idx{ + op.getInputs().at(ir::operation::BCQFullyConnected::Input::WEIGHTS_CLUSTERS)}; + const auto &cluster = _tensor_registry->getITensor(cluster_idx); + assert(cluster->is_constant()); + + if (!input->is_dynamic()) + return; + + auto input_shape = input->getShape(); + auto cluster_shape = cluster->getShape(); + + auto cluster_buf = reinterpret_cast<const int32_t *>(cluster->buffer()); + assert(cluster_buf); + + ir::Shape new_shape = + shape_inference::inferBCQFullyConnectedShape(input_shape, cluster_shape, cluster_buf); + + auto output_ind = op.getOutputs().at(0); + auto output = _tensor_registry->getITensor(output_ind); + + output->applyShape(new_shape); + assert(output->buffer() != nullptr); +} + +void DynamicShapeInferer::visit(const ir::operation::BCQGather &op) +{ + const auto indices_idx{op.getInputs().at(ir::operation::BCQGather::Input::INDICES)}; + const auto &indices = _tensor_registry->getITensor(indices_idx); + + const auto input_binary_idx{op.getInputs().at(ir::operation::BCQGather::Input::INPUT_BINARY)}; + const auto &input_binary = _tensor_registry->getITensor(input_binary_idx); + + const auto cluster_idx{op.getInputs().at(ir::operation::BCQGather::Input::INPUT_CLUSTERS)}; + const auto &cluster = _tensor_registry->getITensor(cluster_idx); + assert(cluster->is_constant()); + + if (!indices->is_dynamic()) + return; + + auto indices_shape = indices->getShape(); + auto cluster_shape = cluster->getShape(); + auto rank = input_binary->getShape().rank(); + + auto cluster_buf = reinterpret_cast<const int32_t *>(cluster->buffer()); + assert(cluster_buf); + + ir::Shape new_shape = shape_inference::inferBCQGatherShape(indices_shape, cluster_shape, + cluster_buf, rank, op.param()); + + auto output_ind = op.getOutputs().at(0); + auto output = _tensor_registry->getITensor(output_ind); + + output->applyShape(new_shape); + assert(output->buffer() != nullptr); } void 
DynamicShapeInferer::visit(const ir::operation::BinaryArithmetic &op) @@ -170,7 +225,7 @@ void DynamicShapeInferer::visit(const ir::operation::BroadcastTo &op) shape->getShape(), reinterpret_cast<const int32_t *>(shape->buffer())); // set output shape and output buffer - dynamicTensorManagerOf(output)->applyShape(output_ind, output_shape); + output->applyShape(output_shape); assert(output->buffer() != nullptr); } @@ -236,7 +291,7 @@ void DynamicShapeInferer::visit(const ir::operation::Concat &op) for (auto input_ind : op.getInputs()) { auto input = _tensor_registry->getITensor(input_ind); - if (input != first_input && !isConcatible(first_input.get(), input.get(), op.param().axis)) + if (input != first_input && !isConcatible(first_input, input, op.param().axis)) throw std::runtime_error("input shapes do not match for concat"); } } @@ -255,7 +310,7 @@ void DynamicShapeInferer::visit(const ir::operation::Concat &op) auto output = _tensor_registry->getITensor(output_ind); auto output_shape = shape_inference::inferConcatShape(in_shapes, op.param()); - dynamicTensorManagerOf(output)->applyShape(output_ind, output_shape); + output->applyShape(output_shape); } void DynamicShapeInferer::visit(const ir::operation::Conv2D &op) @@ -278,7 +333,7 @@ void DynamicShapeInferer::visit(const ir::operation::Conv2D &op) ir::Shape output_shape = shape_inference::inferConv2DShape(input_shape, ker_shape, op.param()); - dynamicTensorManagerOf(output)->applyShape(output_ind, output_shape); + output->applyShape(output_shape); assert(output->buffer() != nullptr); } @@ -338,7 +393,7 @@ void DynamicShapeInferer::visit(const ir::operation::ExpandDims &op) auto output_shape = shape_inference::inferExpandDimsShape(input_shape, axis_buf[0]); - dynamicTensorManagerOf(output)->applyShape(output_ind, output_shape); + output->applyShape(output_shape); assert(output->buffer() != nullptr); } @@ -354,14 +409,14 @@ void DynamicShapeInferer::visit(const ir::operation::Fill &op) if ((!input->is_dynamic()) && (!output->is_dynamic())) return; - assert(input.get()->data_type() == ir::DataType::INT32); + assert(input->data_type() == ir::DataType::INT32); auto input_buf = reinterpret_cast<const int32_t *>(input->buffer()); assert(input_buf); auto output_shape = shape_inference::inferFillShape(input_shape, input_buf); - dynamicTensorManagerOf(output)->applyShape(output_ind, output_shape); + output->applyShape(output_shape); assert(output->buffer() != nullptr); } @@ -384,7 +439,7 @@ void DynamicShapeInferer::visit(const ir::operation::FullyConnected &op) auto output_ind = op.getOutputs().at(0); auto output = _tensor_registry->getITensor(output_ind); - dynamicTensorManagerOf(output)->applyShape(output_ind, new_shape); + output->applyShape(new_shape); assert(output->buffer() != nullptr); } @@ -416,7 +471,7 @@ void DynamicShapeInferer::visit(const ir::operation::Gather &op) auto output_ind = op.getOutputs().at(0); auto output = _tensor_registry->getITensor(output_ind); - dynamicTensorManagerOf(output)->applyShape(output_ind, new_shape); + output->applyShape(new_shape); assert(output->buffer() != nullptr); } @@ -425,6 +480,109 @@ void DynamicShapeInferer::visit(const ir::operation::L2Normalization &op) handleSimpleUnaryOp(op, op.getInputs().at(ir::operation::L2Normalization::INPUT)); } +void DynamicShapeInferer::visit(const ir::operation::LSTM &op) +{ + const auto output_index{op.getOutputs().at(ir::operation::LSTM::Output::OUTPUT)}; + auto output = _tensor_registry->getITensor(output_index); + + const auto output_state_out_index{
op.getOutputs().at(ir::operation::LSTM::Output::OUTPUT_STATE_OUT)}; + + const auto cell_state_out_index{op.getOutputs().at(ir::operation::LSTM::Output::CELL_STATE_OUT)}; + + const auto scratch_buffer_index{op.getOutputs().at(ir::operation::LSTM::Output::SCRATCH_BUFFER)}; + + if (!output->is_dynamic() && + !(_tensor_registry->getITensor(output_state_out_index) != nullptr && + _tensor_registry->getITensor(output_state_out_index)->is_dynamic()) && + !(_tensor_registry->getITensor(cell_state_out_index) != nullptr && + _tensor_registry->getITensor(cell_state_out_index)->is_dynamic()) && + !(_tensor_registry->getITensor(scratch_buffer_index) != nullptr && + _tensor_registry->getITensor(cell_state_out_index)->is_dynamic())) + return; + + const auto input_index{op.getInputs().at(ir::operation::LSTM::Input::INPUT)}; + const auto input = _tensor_registry->getITensor(input_index); + const auto input_shape = input->getShape(); + + const auto input_to_output_weights_index{ + op.getInputs().at(ir::operation::LSTM::Input::INPUT_TO_OUTPUT_WEIGHTS)}; + const auto input_to_output_weights = _tensor_registry->getITensor(input_to_output_weights_index); + const auto input_to_output_weights_shape = input_to_output_weights->getShape(); + + const auto recurrent_to_output_weights_index{ + op.getInputs().at(ir::operation::LSTM::Input::RECURRENT_TO_OUTPUT_WEIGHTS)}; + const auto recurrent_to_output_weights = + _tensor_registry->getITensor(recurrent_to_output_weights_index); + const auto recurrent_to_output_weights_shape = recurrent_to_output_weights->getShape(); + + // re-sizing outputs + const int n_batch = + (input_shape.rank() == 3 && op.param().time_major) ? input_shape.dim(1) : input_shape.dim(0); + const int n_cell = input_to_output_weights_shape.dim(0); + const int n_output = recurrent_to_output_weights_shape.dim(1); + if (input_shape.rank() == 3) + { + if (op.param().time_major) + output->applyShape(ir::Shape{input_shape.dim(0), n_batch, n_output}); + else + output->applyShape(ir::Shape{n_batch, input_shape.dim(1), n_output}); + } + else + { + assert(input_shape.rank() == 2); + output->applyShape(ir::Shape{n_batch, n_output}); + } + assert(output->buffer() != nullptr); + + auto output_state_out = _tensor_registry->getITensor(output_state_out_index); + if (output_state_out != nullptr) + { + output_state_out->applyShape(ir::Shape{n_batch, n_output}); + assert(output_state_out->buffer() != nullptr); + } + + auto cell_state_out = _tensor_registry->getITensor(cell_state_out_index); + if (cell_state_out != nullptr) + { + cell_state_out->applyShape(ir::Shape{n_batch, n_cell}); + assert(cell_state_out->buffer() != nullptr); + } + + auto scratch_buffer = _tensor_registry->getITensor(scratch_buffer_index); + if (scratch_buffer != nullptr) + { + const auto input_to_input_weights_index{ + op.getInputs().at(ir::operation::LSTM::Input::INPUT_TO_INPUT_WEIGHTS)}; + const auto recurrent_to_input_weights_index{ + op.getInputs().at(ir::operation::LSTM::Input::RECURRENT_TO_INPUT_WEIGHTS)}; + + const auto input_to_input_weights_shape = + _tensor_registry->getITensor(input_to_input_weights_index)->getShape(); + bool has_input_to_input_weights = + input_to_input_weights_shape.dim(0) != 0 && input_to_input_weights_shape.dim(1) != 0; + + const auto recurrent_to_input_weights_shape = + _tensor_registry->getITensor(recurrent_to_input_weights_index)->getShape(); + bool has_recurrent_to_input_weights = recurrent_to_input_weights_shape.dim(0) != 0 && + recurrent_to_input_weights_shape.dim(1) != 0; + + // NOTE The cell_to_input_weights do 
not exist in non-peephole although regular LSTM(non-CIFG). + // true: no CIFG + // false: CIFG + bool has_cifg_param = has_input_to_input_weights && has_recurrent_to_input_weights; + if (has_cifg_param) + { + scratch_buffer->applyShape(ir::Shape{n_batch, n_cell * 4}); + } + else + { + scratch_buffer->applyShape(ir::Shape{n_batch, n_cell * 3}); + } + assert(scratch_buffer->buffer() != nullptr); + } +} + void DynamicShapeInferer::visit(const ir::operation::MatrixBandPart &op) { handleSimpleUnaryOp(op, op.getInputs().at(ir::operation::MatrixBandPart::INPUT)); @@ -452,7 +610,7 @@ void DynamicShapeInferer::visit(const ir::operation::OneHot &op) const auto axis_val = op.param().axis; ir::Shape new_shape = shape_inference::inferOnehotShape(indices_shape, *depth_buf, axis_val); - dynamicTensorManagerOf(output)->applyShape(output_ind, new_shape); + output->applyShape(new_shape); assert(output->buffer() != nullptr); } @@ -488,7 +646,7 @@ void DynamicShapeInferer::visit(const ir::operation::Pack &op) ir::Shape new_shape = shape_inference::inferPackShape(input_shape, axis, rank, num); - dynamicTensorManagerOf(output)->applyShape(output_ind, new_shape); + output->applyShape(new_shape); assert(output->buffer() != nullptr); } @@ -515,7 +673,7 @@ void DynamicShapeInferer::visit(const ir::operation::Pad &op) shape_inference::inferPadShape(input->getShape(), pad_buf, pad->getShape().num_elements()); // change output shape and reallocate output tensor memory - dynamicTensorManagerOf(output)->applyShape(output_ind, output_shape); + output->applyShape(output_shape); assert(output->buffer() != nullptr); } @@ -567,7 +725,7 @@ void DynamicShapeInferer::visit(const ir::operation::Range &op) *reinterpret_cast<int32_t *>(limit_tensor->buffer()), *reinterpret_cast<int32_t *>(delta_tensor->buffer())); } - dynamicTensorManagerOf(output)->applyShape(output_ind, new_shape); + output->applyShape(new_shape); assert(output->buffer() != nullptr); } @@ -611,7 +769,7 @@ void DynamicShapeInferer::visit(const ir::operation::Reduce &op) ir::Shape new_shape = shape_inference::inferReduceShape(input_shape, axes_vec, keep_dims); - dynamicTensorManagerOf(output)->applyShape(output_ind, new_shape); + output->applyShape(new_shape); assert(output->buffer() != nullptr); } @@ -665,7 +823,7 @@ void DynamicShapeInferer::visit(const ir::operation::Reshape &op) if (output_shape != output->getShape() || output->buffer() == nullptr) { // change on output shape - dynamicTensorManagerOf(output)->applyShape(output_ind, output_shape); + output->applyShape(output_shape); } assert(output->buffer() != nullptr); } @@ -681,7 +839,7 @@ void DynamicShapeInferer::visit(const ir::operation::Reshape &op) if (output_shape != output->getShape() || output->buffer() == nullptr) { // change on output shape - dynamicTensorManagerOf(output)->applyShape(output_ind, output_shape); + output->applyShape(output_shape); } assert(output->buffer() != nullptr); } @@ -705,14 +863,35 @@ void DynamicShapeInferer::visit(const ir::operation::ResizeBilinear &op) return; // getting output shape from input shape and Params - auto output_shape = shape_inference::inferResizeBilinearShape( - input->getShape(), op.param().height_out, op.param().width_out); + int32_t height_out, width_out; + if (op.getInputs().size() == 2) + { + auto size_ind = op.getInputs().at(ir::operation::ResizeBilinear::Input::SIZE); + auto size = _tensor_registry->getITensor(size_ind); + if (size->data_type() == ir::DataType::INT32) + { + auto size_buf = reinterpret_cast<const int32_t *>(size->buffer()); + 
height_out = size_buf[0]; + width_out = size_buf[1]; + } + else + { + throw std::runtime_error("DynamicShapeInferer ResizeBilinear : Unsupported data type"); + } + } + else + { + height_out = op.param().height_out; + width_out = op.param().width_out; + } + auto output_shape = + shape_inference::inferResizeBilinearShape(input->getShape(), height_out, width_out); // if shape is changed, change output shape and reallocate output tensor memory if (output_shape != output->getShape() || output->buffer() == nullptr) { // change on output shape - dynamicTensorManagerOf(output)->applyShape(output_ind, output_shape); + output->applyShape(output_shape); } assert(output->buffer() != nullptr); } @@ -749,7 +928,7 @@ void DynamicShapeInferer::visit(const ir::operation::Select &op) auto output_ind = op.getOutputs().at(0); auto output = _tensor_registry->getITensor(output_ind); - dynamicTensorManagerOf(output)->applyShape(output_ind, new_shape); + output->applyShape(new_shape); assert(output->buffer() != nullptr); } @@ -768,7 +947,7 @@ void DynamicShapeInferer::visit(const ir::operation::Shape &op) ir::Shape output_shape; output_shape.append(input_shape.rank()); - dynamicTensorManagerOf(output)->applyShape(output_ind, output_shape); + output->applyShape(output_shape); assert(output->buffer() != nullptr); } @@ -794,7 +973,7 @@ void DynamicShapeInferer::visit(const ir::operation::Slice &op) ir::Shape new_shape = shape_inference::inferSliceShape(input_shape, begins_buf, sizes_buf); - dynamicTensorManagerOf(output)->applyShape(output_index, new_shape); + output->applyShape(new_shape); assert(output->buffer() != nullptr); } @@ -831,7 +1010,7 @@ void DynamicShapeInferer::visit(const ir::operation::SpaceToBatchND &op) ir::Shape new_shape = shape_inference::inferSpaceToBatchNDShape( input_shape, block_shape_shape, padding_shape, block_shape_data, padding_data); - dynamicTensorManagerOf(output)->applyShape(output_idx, new_shape); + output->applyShape(new_shape); assert(output->buffer() != nullptr); } @@ -840,27 +1019,37 @@ void DynamicShapeInferer::visit(const ir::operation::Split &op) const auto input_idx{op.getInputs().at(ir::operation::Split::Input::INPUT)}; const auto &input = _tensor_registry->getITensor(input_idx); - if (!input->is_dynamic()) + // Return if all tensors are not dynamic + bool has_dynamic = false; + for (const auto output_idx : op.getOutputs()) + { + auto output = _tensor_registry->getITensor(output_idx); + has_dynamic |= output->is_dynamic(); + } + if (!input->is_dynamic() && !has_dynamic) { return; } auto input_shape = input->getShape(); - const auto axis = op.param().axis; + const auto axis_idx{op.getInputs().at(ir::operation::Split::Input::AXIS)}; + const auto &axis = _tensor_registry->getITensor(axis_idx); + + auto axis_value = *reinterpret_cast<const int32_t *>(axis->buffer()); const auto num_splits = op.param().num_splits; const auto rank = input_shape.rank(); - auto axis_resolved = axis < 0 ? axis + rank : axis; + axis_value = axis_value < 0 ? 
axis_value + rank : axis_value; - assert(0 <= axis_resolved && axis_resolved < rank); + assert(0 <= axis_value && axis_value < rank); - ir::Shape new_shape = shape_inference::inferSplitShape(input_shape, axis_resolved, num_splits); + ir::Shape new_shape = shape_inference::inferSplitShape(input_shape, axis_value, num_splits); for (int out_tensor_idx = 0; out_tensor_idx < num_splits; out_tensor_idx++) { auto output_ind = op.getOutputs().at(out_tensor_idx); auto output = _tensor_registry->getITensor(output_ind); - dynamicTensorManagerOf(output)->applyShape(output_ind, new_shape); + output->applyShape(new_shape); assert(output->buffer() != nullptr); } } @@ -889,7 +1078,7 @@ void DynamicShapeInferer::visit(const ir::operation::Squeeze &op) auto output_ind = op.getOutputs().at(0); auto output = _tensor_registry->getITensor(output_ind); - dynamicTensorManagerOf(output)->applyShape(output_ind, new_shape); + output->applyShape(new_shape); assert(output->buffer() != nullptr); } @@ -930,7 +1119,7 @@ void DynamicShapeInferer::visit(const ir::operation::StridedSlice &op) ir::Shape output_shape = onert::shape_inference::inferStridedSliceShape(input_shape, op_params, rank); - dynamicTensorManagerOf(output)->applyShape(output_index, output_shape); + output->applyShape(output_shape); assert(output->buffer() != nullptr); } @@ -952,10 +1141,11 @@ void DynamicShapeInferer::visit(const ir::operation::Tile &op) auto multiplier_buffer = reinterpret_cast<const int32_t *>(multiplier->buffer()); assert(multiplier_buffer); - auto output_shape = shape_inference::inferTileShape(input_shape, multiplier_buffer); + auto output_shape = + shape_inference::inferTileShape(input_shape, multiplier_buffer, multiplier->dimension(0)); // set output shape and output buffer - dynamicTensorManagerOf(output)->applyShape(output_ind, output_shape); + output->applyShape(output_shape); assert(output->buffer() != nullptr); } @@ -967,17 +1157,48 @@ void DynamicShapeInferer::visit(const ir::operation::Transpose &op) // from op, access the buffer of second input to read new shape auto input_ind = op.getInputs().at(ir::operation::Transpose::Input::INPUT); - auto input_tensor = _tensor_registry->getITensor(input_ind); - auto input_shape = input_tensor->getShape(); + auto input = _tensor_registry->getITensor(input_ind); + auto input_shape = input->getShape(); - if (!input_tensor->is_dynamic()) + /* + Here, the state after compilation (static shape inference) could be one of the following: + + input perms output execution-time shape inf required + ------------------------------------ -------------------------------- + case 1) static const static X + case 2) static non-const dynamic O + case 3) dynamic const dynamic O + case 4) dynamic non-const dynamic O + + So, only when both input1 and ouput are static, we can skip dynamic shape inference. 
+ */ + if ((!input->is_dynamic()) && (!output->is_dynamic())) return; - const auto perm{op.param().perm}; - // set output shape, based on input and params - ir::Shape new_shape = shape_inference::inferTransposeShape(input_shape, perm); + auto perm_ind = op.getInputs().at(ir::operation::Transpose::Input::PERMUTATION); + auto perm = _tensor_registry->getITensor(perm_ind); + + ir::Shape new_shape; + // TODO Change perm->dimension(0) == 0 to perm->num_elements() == 0 + if (perm->dimension(0) == 0) // This condition means that perm is (n-1...0) + { + // Call by (n-1...0) + new_shape = shape_inference::inferTransposeShape(input_shape, nullptr, 0); + } + else + { + // Check rank + if (input->num_dimensions() != perm->getShape().num_elements()) + { + throw std::runtime_error("DynamicShapeInferer failed, bad rank size: " + + std::to_string(perm->getShape().num_elements())); + } - dynamicTensorManagerOf(output)->applyShape(output_ind, new_shape); + // set output shape, based on input and params + const auto perm_buffer = reinterpret_cast<const int32_t *>(perm->buffer()); + new_shape = shape_inference::inferTransposeShape(input_shape, perm_buffer, perm->dimension(0)); + } + output->applyShape(new_shape); assert(output->buffer() != nullptr); } @@ -1005,7 +1226,7 @@ void DynamicShapeInferer::visit(const ir::operation::Unpack &op) auto output_ind = op.getOutputs().at(out_tensor_idx); auto output = _tensor_registry->getITensor(output_ind); - dynamicTensorManagerOf(output)->applyShape(output_ind, new_shape); + output->applyShape(new_shape); assert(output->buffer() != nullptr); } diff --git a/runtime/onert/core/src/exec/Execution.cc b/runtime/onert/core/src/exec/Execution.cc index 7feb3ab68..21fdd9c05 100644 --- a/runtime/onert/core/src/exec/Execution.cc +++ b/runtime/onert/core/src/exec/Execution.cc @@ -34,14 +34,13 @@ Execution::Execution(const std::shared_ptr<ExecutorMap> &executors) : _executors void Execution::changeInputShape(const ir::IOIndex &index, const ir::Shape &new_shape) { - // This should be called BEFORE setInput. 
- if (_io_desc.inputs.at(index.value()) != 0) - throw std::runtime_error("Error in calling order"); - // This will be used later to set input tensor dynamic // Note that 'compiled' model will not be updated with new_shape // but new_shape will change model input shape while 'running' the model _io_desc.dynamic_input_shapes[index] = new_shape; + + VERBOSE(Execution) << "Model input shape will be changed at the start of execute()" + << "(index: " << index.value() << ")" << std::endl; } // TODO Remove default parameter diff --git a/runtime/onert/core/src/exec/ExecutionObservers.cc b/runtime/onert/core/src/exec/ExecutionObservers.cc index 060f874de..5883d9a1c 100644 --- a/runtime/onert/core/src/exec/ExecutionObservers.cc +++ b/runtime/onert/core/src/exec/ExecutionObservers.cc @@ -22,6 +22,7 @@ #include "exec/IExecutor.h" #include "misc/polymorphic_downcast.h" #include "ir/OpSequence.h" +#include "util/EventWriter.h" namespace onert { @@ -70,7 +71,7 @@ void ProfileObserver::handleEnd(IExecutor *exec, const ir::OpSequence *op_seq, }; ChromeTracingObserver::ChromeTracingObserver(const std::string &filepath, const ir::Graph &graph) - : _ofs{filepath, std::ofstream::out}, _recorder{}, _collector{&_recorder}, _graph{graph} + : _base_filepath(filepath), _recorder{}, _collector{&_recorder}, _graph{graph} { } @@ -78,7 +79,7 @@ ChromeTracingObserver::~ChromeTracingObserver() { try { - _recorder.writeToFile(_ofs); + EventWriter{_recorder}.writeToFiles(_base_filepath); } catch (const std::exception &e) { diff --git a/runtime/onert/core/src/exec/ExecutionObservers.h b/runtime/onert/core/src/exec/ExecutionObservers.h index ac0076ed2..f8c2acca5 100644 --- a/runtime/onert/core/src/exec/ExecutionObservers.h +++ b/runtime/onert/core/src/exec/ExecutionObservers.h @@ -76,7 +76,7 @@ private: static std::string opSequenceTag(const ir::OpSequence *op_seq, const ir::Operations &operations); private: - std::ofstream _ofs; + const std::string &_base_filepath; EventRecorder _recorder; EventCollector _collector; const ir::Graph &_graph; diff --git a/runtime/onert/core/src/exec/ExecutorBase.cc b/runtime/onert/core/src/exec/ExecutorBase.cc index f835a9675..018a0bba0 100644 --- a/runtime/onert/core/src/exec/ExecutorBase.cc +++ b/runtime/onert/core/src/exec/ExecutorBase.cc @@ -27,38 +27,32 @@ namespace exec { ExecutorBase::ExecutorBase(std::unique_ptr<compiler::LoweredGraph> &&lowered_graph, - const std::vector<std::shared_ptr<backend::ITensor>> &input_tensors, - const std::vector<std::shared_ptr<backend::ITensor>> &output_tensors, - const compiler::TensorRegistries &tensor_regs, - backend::TensorManagerSet &&tensor_mgrs) + const std::vector<backend::ITensor *> &input_tensors, + const std::vector<backend::ITensor *> &output_tensors, + const compiler::TensorRegistries &tensor_regs) : _lowered_graph{std::move(lowered_graph)}, _graph{_lowered_graph->graph()}, - _input_tensors{input_tensors}, _output_tensors{output_tensors}, - _tensor_mgrs{std::move(tensor_mgrs)}, _mutex() + _input_tensors{input_tensors}, _output_tensors{output_tensors}, _mutex() { // TODO Fix the way of knowing whether it is primary or not bool primary_executor = !(_input_tensors.empty() && _output_tensors.empty()); if (!primary_executor) { auto build_input_tensor_list = [&](const onert::ir::OperandIndexSequence &ind_seq) { - std::vector<std::shared_ptr<backend::ITensor>> list; + std::vector<backend::ITensor *> list; for (auto ind : ind_seq) { - std::shared_ptr<backend::ITensor> tensor = tensor_regs.getITensor(ind); + backend::ITensor *tensor = 
tensor_regs.getITensor(ind); assert(tensor != nullptr); - DynAllocInfo dyn_alloc_info{ind}; - _input_to_dyn_alloc_info.emplace(tensor, dyn_alloc_info); list.push_back(tensor); } return list; }; auto build_output_tensor_list = [&](const onert::ir::OperandIndexSequence &ind_seq) { - std::vector<std::shared_ptr<backend::ITensor>> list; + std::vector<backend::ITensor *> list; for (auto ind : ind_seq) { - std::shared_ptr<backend::ITensor> tensor = tensor_regs.getITensor(ind); + backend::ITensor *tensor = tensor_regs.getITensor(ind); assert(tensor != nullptr); - DynAllocInfo dyn_alloc_info{ind}; - _output_to_dyn_alloc_info.emplace(tensor, dyn_alloc_info); list.push_back(tensor); } return list; @@ -66,28 +60,9 @@ ExecutorBase::ExecutorBase(std::unique_ptr<compiler::LoweredGraph> &&lowered_gra _input_tensors = build_input_tensor_list(_graph.getInputs()); _output_tensors = build_output_tensor_list(_graph.getOutputs()); } - else - { - assert(input_tensors.size() == _graph.getInputs().size()); - assert(output_tensors.size() == _graph.getOutputs().size()); - for (uint32_t i = 0; i < input_tensors.size(); i++) - { - auto tensor = input_tensors[i]; - auto ind = _graph.getInputs().at(i); - DynAllocInfo dyn_alloc_info{ind}; - _input_to_dyn_alloc_info.emplace(tensor, dyn_alloc_info); - } - for (uint32_t i = 0; i < output_tensors.size(); i++) - { - auto tensor = output_tensors[i]; - auto ind = _graph.getOutputs().at(i); - DynAllocInfo dyn_alloc_info{ind}; - _output_to_dyn_alloc_info.emplace(tensor, dyn_alloc_info); - } - } } -void ExecutorBase::execute(const std::vector<std::shared_ptr<backend::ITensor>> &src_tensors, +void ExecutorBase::execute(const std::vector<backend::ITensor *> &src_tensors, const std::shared_ptr<IPermuteFunction> &pre_fn) { // For thread-safe, use mutex @@ -108,22 +83,12 @@ void ExecutorBase::execute(const std::vector<std::shared_ptr<backend::ITensor>> // If src_tensor or input_tensor is nullptr, pre_fn does not copy the tensors if (src_tensor != nullptr && input_tensor != nullptr) { - auto dyn_alloc_info = _input_to_dyn_alloc_info.find(_input_tensors[n]); const auto orig_input_shape = input_tensor->getShape(); const auto changed_input_shape = convertShape(src_tensor->getShape(), src_tensor->layout(), input_tensor->layout()); if (orig_input_shape != changed_input_shape) { - if (dyn_alloc_info == _input_to_dyn_alloc_info.end()) - { - // The input_tensor is a dynamic tensor of backend that doesn't support dynamic tensor - throw std::runtime_error("Unknown dim is found at execution time for a backend that " - "does not support dynamic tensor"); - } - else - { - input_tensor->set_dynamic(); - } + input_tensor->set_dynamic(); } } } @@ -147,7 +112,7 @@ void ExecutorBase::execute(const IODescription &desc) for (uint32_t i = 0; i < _input_tensors.size(); ++i) { // TODO Remove dynamic_cast - auto tensor = std::dynamic_pointer_cast<backend::controlflow::UserTensor>(_input_tensors[i]); + auto *tensor = dynamic_cast<backend::controlflow::UserTensor *>(_input_tensors[i]); assert(tensor); auto input_shape = desc.dynamic_input_shapes.find(ir::IOIndex{i}); if (input_shape != desc.dynamic_input_shapes.end()) @@ -155,6 +120,7 @@ void ExecutorBase::execute(const IODescription &desc) tensor->set_dynamic(); tensor->setShape(input_shape->second); } + // TODO Check if (desc.inputs[i] == nullptr) // TODO Better design for ITensor? 
(we need const_cast as ITensor is writable) tensor->setBuffer(static_cast<uint8_t *>(const_cast<void *>(desc.inputs[i]->buffer)), desc.inputs[i]->size); @@ -166,12 +132,12 @@ void ExecutorBase::execute(const IODescription &desc) for (uint32_t i = 0; i < _output_tensors.size(); ++i) { // TODO Remove dynamic_cast - auto tensor = std::dynamic_pointer_cast<backend::controlflow::UserTensor>(_output_tensors[i]); + auto *tensor = dynamic_cast<backend::controlflow::UserTensor *>(_output_tensors[i]); assert(tensor); tensor->set_dynamic(); // It can't be resized but shape could change - // TODO Better design for ITensor? (we need const_cast as ITensor is writable) - tensor->setBuffer(static_cast<uint8_t *>(const_cast<void *>(desc.outputs[i]->buffer)), - desc.outputs[i]->size); + if (desc.outputs[i] == nullptr) + throw std::runtime_error{"Output " + std::to_string(i) + "'s buffer is not set."}; + tensor->setBuffer(static_cast<uint8_t *>(desc.outputs[i]->buffer), desc.outputs[i]->size); } executeImpl(); @@ -218,17 +184,8 @@ void ExecutorBase::handleDynamicInputTensor(ir::IOIndex io_ind, const IODescript auto shape_sig_found = desc.dynamic_input_shapes.find(io_ind); if (shape_sig_found != desc.dynamic_input_shapes.end()) { - auto dyn_alloc_info = _input_to_dyn_alloc_info.find(_input_tensors[io_ind.value()]); - if (dyn_alloc_info == _input_to_dyn_alloc_info.end()) - throw std::runtime_error("Unknown dim is found at execution time for a backend that " - "does not support dynamic tensor"); - auto changed_input_shape = shape_sig_found->second; - auto operand_ind = dyn_alloc_info->second.ind; - - auto dyn_tensor_manager = _input_tensors[io_ind.value()]->dynamic_tensor_manager(); - assert(dyn_tensor_manager); - dyn_tensor_manager->applyShape(operand_ind, changed_input_shape); + _input_tensors[io_ind.value()]->applyShape(changed_input_shape); } } diff --git a/runtime/onert/core/src/exec/ExecutorBase.h b/runtime/onert/core/src/exec/ExecutorBase.h index a13be7dbf..8a6ec9174 100644 --- a/runtime/onert/core/src/exec/ExecutorBase.h +++ b/runtime/onert/core/src/exec/ExecutorBase.h @@ -20,9 +20,7 @@ #include <mutex> #include "IPermuteFunction.h" -#include "Source.h" #include "exec/ExecutionObservers.h" -#include "Sink.h" #include "ShapeConverter.h" #include "exec/IExecutor.h" #include "compiler/LoweredGraph.h" @@ -51,10 +49,9 @@ public: * @param tensor_builders Tensor builders that are currently used */ ExecutorBase(std::unique_ptr<compiler::LoweredGraph> &&lowered_graph, - const std::vector<std::shared_ptr<backend::ITensor>> &input_tensors, - const std::vector<std::shared_ptr<backend::ITensor>> &output_tensors, - const compiler::TensorRegistries &tensor_regs, - backend::TensorManagerSet &&tensor_mgrs); + const std::vector<backend::ITensor *> &input_tensors, + const std::vector<backend::ITensor *> &output_tensors, + const compiler::TensorRegistries &tensor_regs); virtual ~ExecutorBase() = default; @@ -66,7 +63,7 @@ public: * @param src_tensor Tensor list that will be copied to input tensors of this * @param pre_fn The permutation function that copy from src_tensor to input tensors of this */ - void execute(const std::vector<std::shared_ptr<backend::ITensor>> &src_tensors, + void execute(const std::vector<backend::ITensor *> &src_tensors, const std::shared_ptr<IPermuteFunction> &pre_fn); void execute(const IODescription &desc) final; @@ -81,17 +78,9 @@ public: void addObserver(std::unique_ptr<IExecutionObserver> ref) { _subject.add(std::move(ref)); }; - const std::vector<std::shared_ptr<backend::ITensor>> 
&getInputTensors() const - { - return _input_tensors; - } - - const std::vector<std::shared_ptr<backend::ITensor>> &getOutputTensors() const - { - return _output_tensors; - } + const std::vector<backend::ITensor *> &getInputTensors() const { return _input_tensors; } - const DynAllocInfoMap &getInputsDynamicAllocInfo() const { return _input_to_dyn_alloc_info; } + const std::vector<backend::ITensor *> &getOutputTensors() const { return _output_tensors; } protected: /** @@ -104,11 +93,8 @@ protected: std::shared_ptr<ir::OperationIndexMap<int64_t>> _indexed_ranks; std::unique_ptr<compiler::LoweredGraph> _lowered_graph; const ir::Graph &_graph; - std::vector<std::shared_ptr<backend::ITensor>> _input_tensors; - std::vector<std::shared_ptr<backend::ITensor>> _output_tensors; - DynAllocInfoMap _input_to_dyn_alloc_info; - DynAllocInfoMap _output_to_dyn_alloc_info; - backend::TensorManagerSet _tensor_mgrs; + std::vector<backend::ITensor *> _input_tensors; + std::vector<backend::ITensor *> _output_tensors; std::mutex _mutex; private: diff --git a/runtime/onert/core/src/exec/FunctionSequence.cc b/runtime/onert/core/src/exec/FunctionSequence.cc index fb31f7582..8aefa5eeb 100644 --- a/runtime/onert/core/src/exec/FunctionSequence.cc +++ b/runtime/onert/core/src/exec/FunctionSequence.cc @@ -28,9 +28,11 @@ namespace exec void FunctionSequence::run() { - // TODO Find out when `_enable_dynamic_shape_inferer` is true but `_dynamic_tensor_ctx` is false if (_enable_dynamic_shape_inferer && _dynamic_tensor_ctx) { + // acl_cl and acl_neon backend don't support dynamic shape. + // _dynamic_tensor_ctx is always nullptr for acl_cl and acl_neon + // Thus, those two bakends cannot reach here. if (_dynamic_tensor_ctx->op_seq->size() != _functions.size()) throw std::runtime_error("operation and functions should be mapped one by one"); @@ -61,11 +63,6 @@ void FunctionSequence::run() { for (const auto &function : _functions) { - auto *sub_func_seq = dynamic_cast<FunctionSequence *>(function.get()); - if (sub_func_seq != nullptr) - { - sub_func_seq->enableDynamicShapeInferer(false); - } function->run(); } } diff --git a/runtime/onert/core/src/exec/IPermuteFunction.h b/runtime/onert/core/src/exec/IPermuteFunction.h index 6b4d15380..94bc2e436 100644 --- a/runtime/onert/core/src/exec/IPermuteFunction.h +++ b/runtime/onert/core/src/exec/IPermuteFunction.h @@ -50,13 +50,13 @@ private: public: virtual void run() override { - assert(_src_tensors.size() > 0); + // TODO Optimization : Make control does not reach here? 
when (_src_tensors.size() == 0) assert(_src_tensors.size() == _dst_tensors.size()); auto src_it = _src_tensors.begin(); auto dst_it = _dst_tensors.begin(); while (src_it != _src_tensors.end()) { - const auto src_tensor = *src_it; + auto src_tensor = *src_it; auto dst_tensor = *dst_it; if (src_tensor != dst_tensor) { @@ -101,9 +101,8 @@ public: virtual void optimize() = 0; private: - template <class T> - void permute(const std::shared_ptr<backend::ITensor> &src, std::shared_ptr<backend::ITensor> &dst, - size_t rank) + // TODO make src const by proving const access() + template <class T> void permute(backend::ITensor *src, backend::ITensor *dst, size_t rank) { const auto permute_type = [&]() -> PermuteType { if (src->layout() == ir::Layout::NHWC && dst->layout() == ir::Layout::NCHW) @@ -121,127 +120,65 @@ private: }(); auto fn = [&](backend::ITensor &src_tensor) { dst->access([&](backend::ITensor &dst_tensor) { - auto src_buffer = src_tensor.buffer(); - auto src_size = src_tensor.total_size(); - auto dst_buffer = dst_tensor.buffer(); - if (permute_type == PermuteType::COPY) + if (rank == 4 && permute_type != PermuteType::COPY) { - assert(src_tensor.layout() == dst_tensor.layout()); - if (!src_tensor.has_padding() && !dst_tensor.has_padding()) + switch (permute_type) { - assert(src_size <= dst_tensor.total_size()); - memcpy(dst_buffer, src_buffer, src_size); - return; - } - } - switch (rank) - { - case 0: - case 1: - { - const int32_t copy_len = dst_tensor.dimension(0); - - memcpy(dst_buffer, src_buffer, copy_len * sizeof(T)); - break; - } - case 2: - { - const int32_t dim_0 = dst_tensor.dimension(0); - const int32_t copy_len = dst_tensor.dimension(1); - - for (int32_t i = 0; i < dim_0; ++i) + case PermuteType::NHWC_TO_NCHW: { - ir::Coordinates coords{i, 0}; - memcpy(dst_buffer + dst_tensor.calcOffset(coords), - src_buffer + src_tensor.calcOffset(coords), copy_len * sizeof(T)); + ir::FeatureShape shape; + shape.N = dst_tensor.dimension(0); + shape.C = dst_tensor.dimension(1); + shape.H = dst_tensor.dimension(2); + shape.W = dst_tensor.dimension(3); + const feature::nhwc::Reader<T> from(&src_tensor); + feature::nchw::View<T> into(&dst_tensor); + feature::iterate(shape) + << [&](uint32_t batch, uint32_t ch, uint32_t row, uint32_t col) { + const auto value = from.at(batch, row, col, ch); + into.at(batch, ch, row, col) = value; + }; + break; } - break; - } - case 3: - { - const int32_t dim_0 = dst_tensor.dimension(0); - const int32_t dim_1 = dst_tensor.dimension(1); - const int32_t copy_len = dst_tensor.dimension(2); - - for (auto i = 0; i < dim_0; ++i) + case PermuteType::NCHW_TO_NHWC: { - for (auto j = 0; j < dim_1; ++j) - { - ir::Coordinates coords{i, j, 0}; - memcpy(dst_buffer + dst_tensor.calcOffset(coords), - src_buffer + src_tensor.calcOffset(coords), copy_len * sizeof(T)); - } + ir::FeatureShape shape; + shape.N = src_tensor.dimension(0); + shape.C = src_tensor.dimension(1); + shape.H = src_tensor.dimension(2); + shape.W = src_tensor.dimension(3); + const feature::nchw::Reader<T> from(&src_tensor); + feature::nhwc::View<T> into(&dst_tensor); + feature::iterate(shape) + << [&](uint32_t batch, uint32_t ch, uint32_t row, uint32_t col) { + const auto value = from.at(batch, ch, row, col); + into.at(batch, row, col, ch) = value; + }; + break; } - break; - } - case 4: - { - switch (permute_type) + default: { - case PermuteType::NHWC_TO_NCHW: - { - ir::FeatureShape shape; - shape.N = dst_tensor.dimension(0); - shape.C = dst_tensor.dimension(1); - shape.H = dst_tensor.dimension(2); - shape.W = 
dst_tensor.dimension(3); - const feature::nhwc::Reader<T> from(&src_tensor); - feature::nchw::View<T> into(&dst_tensor); - feature::iterate(shape) - << [&](uint32_t batch, uint32_t ch, uint32_t row, uint32_t col) { - const auto value = from.at(batch, row, col, ch); - into.at(batch, ch, row, col) = value; - }; - break; - } - case PermuteType::NCHW_TO_NHWC: - { - ir::FeatureShape shape; - shape.N = src_tensor.dimension(0); - shape.C = src_tensor.dimension(1); - shape.H = src_tensor.dimension(2); - shape.W = src_tensor.dimension(3); - const feature::nchw::Reader<T> from(&src_tensor); - feature::nhwc::View<T> into(&dst_tensor); - feature::iterate(shape) - << [&](uint32_t batch, uint32_t ch, uint32_t row, uint32_t col) { - const auto value = from.at(batch, ch, row, col); - into.at(batch, row, col, ch) = value; - }; - break; - } - case PermuteType::COPY: - { - const int32_t dim_0 = dst_tensor.dimension(0); - const int32_t dim_1 = dst_tensor.dimension(1); - const int32_t dim_2 = dst_tensor.dimension(2); - const int32_t copy_len = dst_tensor.dimension(3); - - for (auto i = 0; i < dim_0; ++i) - { - for (auto j = 0; j < dim_1; ++j) - { - for (auto k = 0; k < dim_2; ++k) - { - ir::Coordinates coords{i, j, k, 0}; - memcpy(dst_buffer + dst_tensor.calcOffset(coords), - src_buffer + src_tensor.calcOffset(coords), copy_len * sizeof(T)); - } - } - } - break; - } - default: - { - throw std::runtime_error("Unsupported Permutation"); - break; - } + throw std::runtime_error("Unsupported Permutation"); + break; } - break; } - default: - throw std::runtime_error("Unsupported rank in permutation"); - break; + } + else if (!src_tensor.has_padding() && !dst_tensor.has_padding()) + { + auto src_size = src_tensor.total_size(); + assert(src_size <= dst_tensor.total_size()); + memcpy(dst_tensor.buffer(), src_tensor.buffer(), src_size); + } + else + { + auto loop_shape = src_tensor.getShape(); + const auto copy_axis = loop_shape.rank() - 1; + const auto copy_len = loop_shape.dim(copy_axis) * sizeof(T); + loop_shape.dim(copy_axis) = 1; + ShapeLoop(loop_shape, [&](const onert::ir::Coordinates &coords) { + memcpy(dst_tensor.buffer() + dst_tensor.calcOffset(coords), + src_tensor.buffer() + src_tensor.calcOffset(coords), copy_len); + }); } }); }; @@ -275,8 +212,8 @@ private: } protected: - std::vector<std::shared_ptr<backend::ITensor>> _src_tensors; - std::vector<std::shared_ptr<backend::ITensor>> _dst_tensors; + std::vector<backend::ITensor *> _src_tensors; + std::vector<backend::ITensor *> _dst_tensors; // TODO Remove this member if it is possible std::vector<size_t> _ranks; }; diff --git a/runtime/onert/core/src/exec/LinearExecutor.cc b/runtime/onert/core/src/exec/LinearExecutor.cc index 69dfe9b9b..6e6ca110f 100644 --- a/runtime/onert/core/src/exec/LinearExecutor.cc +++ b/runtime/onert/core/src/exec/LinearExecutor.cc @@ -51,8 +51,10 @@ void LinearExecutor::executeImpl() _subject.notifyJobBegin(this, op_seq, backend); auto &fn_seq = code.fn_seq; - bool handle_dynamic_tensor = op_seq->has_dynamic_tensor() || hasDynamicInput(); + fn_seq->initRunning(); + + bool handle_dynamic_tensor = op_seq->has_dynamic_tensor() || hasDynamicInput(); fn_seq->enableDynamicShapeInferer(handle_dynamic_tensor); fn_seq->run(); diff --git a/runtime/onert/core/src/exec/LinearExecutor.h b/runtime/onert/core/src/exec/LinearExecutor.h index c224d3f4f..22d00ec30 100644 --- a/runtime/onert/core/src/exec/LinearExecutor.h +++ b/runtime/onert/core/src/exec/LinearExecutor.h @@ -47,13 +47,11 @@ public: * @param code_map OpSequence and its code map */ 
LinearExecutor(std::unique_ptr<compiler::LoweredGraph> lowered_graph, - const std::vector<std::shared_ptr<backend::ITensor>> &input_tensors, - const std::vector<std::shared_ptr<backend::ITensor>> &output_tensors, - const compiler::TensorRegistries &tensor_regs, - backend::TensorManagerSet &&tensor_mgrs, compiler::CodeMap &&code_map, + const std::vector<backend::ITensor *> &input_tensors, + const std::vector<backend::ITensor *> &output_tensors, + const compiler::TensorRegistries &tensor_regs, compiler::CodeMap &&code_map, const std::vector<ir::OpSequenceIndex> &order) - : ExecutorBase{std::move(lowered_graph), input_tensors, output_tensors, tensor_regs, - std::move(tensor_mgrs)} + : ExecutorBase{std::move(lowered_graph), input_tensors, output_tensors, tensor_regs} { for (auto index : order) { diff --git a/runtime/onert/core/src/exec/ParallelExecutor.cc b/runtime/onert/core/src/exec/ParallelExecutor.cc index ab234aacd..676bdb5fa 100644 --- a/runtime/onert/core/src/exec/ParallelExecutor.cc +++ b/runtime/onert/core/src/exec/ParallelExecutor.cc @@ -59,14 +59,13 @@ void ParallelExecutor::notify(uint32_t finished_job_id) _cv_jobs.notify_all(); } -ParallelExecutor::ParallelExecutor( - std::unique_ptr<compiler::LoweredGraph> lowered_graph, - const std::vector<std::shared_ptr<backend::ITensor>> &input_tensors, - const std::vector<std::shared_ptr<backend::ITensor>> &output_tensors, - const compiler::TensorRegistries &tensor_regs, backend::TensorManagerSet &&tensor_mgrs, - compiler::CodeMap &&code_map) - : DataflowExecutor{std::move(lowered_graph), input_tensors, output_tensors, tensor_regs, - std::move(tensor_mgrs), std::move(code_map)} +ParallelExecutor::ParallelExecutor(std::unique_ptr<compiler::LoweredGraph> lowered_graph, + const std::vector<backend::ITensor *> &input_tensors, + const std::vector<backend::ITensor *> &output_tensors, + const compiler::TensorRegistries &tensor_regs, + compiler::CodeMap &&code_map) + : DataflowExecutor{std::move(lowered_graph), input_tensors, output_tensors, tensor_regs, + std::move(code_map)} { VERBOSE(ParallelExecutor) << "Constructing Parallel Executor" << std::endl; } @@ -133,6 +132,8 @@ void ParallelExecutor::executeImpl() notify(job_index); }; + job->fn_seq()->initRunning(); + // dynamic tensor setting bool handle_dynamic_tensor = op_seq->has_dynamic_tensor() || dynamic_input_exists; job->fn_seq()->enableDynamicShapeInferer(handle_dynamic_tensor); diff --git a/runtime/onert/core/src/exec/ParallelExecutor.h b/runtime/onert/core/src/exec/ParallelExecutor.h index 929edfce9..111c20c0c 100644 --- a/runtime/onert/core/src/exec/ParallelExecutor.h +++ b/runtime/onert/core/src/exec/ParallelExecutor.h @@ -51,10 +51,9 @@ public: * @param code_map OpSequence and its code map */ ParallelExecutor(std::unique_ptr<compiler::LoweredGraph> lowered_graph, - const std::vector<std::shared_ptr<backend::ITensor>> &input_tensors, - const std::vector<std::shared_ptr<backend::ITensor>> &output_tensors, - const compiler::TensorRegistries &tensor_regs, - backend::TensorManagerSet &&tensor_mgrs, compiler::CodeMap &&code_map); + const std::vector<backend::ITensor *> &input_tensors, + const std::vector<backend::ITensor *> &output_tensors, + const compiler::TensorRegistries &tensor_regs, compiler::CodeMap &&code_map); void executeImpl() override; diff --git a/runtime/onert/core/src/exec/Sink.h b/runtime/onert/core/src/exec/Sink.h deleted file mode 100644 index 6a99efe60..000000000 --- a/runtime/onert/core/src/exec/Sink.h +++ /dev/null @@ -1,199 +0,0 @@ -/* - * Copyright (c) 2018 Samsung 
Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef __ONERT_EXEC_SINK_H__ -#define __ONERT_EXEC_SINK_H__ - -#include "feature/nchw/Reader.h" -#include "feature/nchw/View.h" -#include "feature/nhwc/Reader.h" -#include "feature/nhwc/View.h" - -#include <cassert> -#include <memory> -#include "util/Utils.h" -#include <misc/feature/IndexIterator.h> - -namespace onert -{ -namespace exec -{ -struct ISink -{ - virtual ~ISink() = default; - - virtual void pull(::onert::backend::ITensor &tensor) const = 0; -}; - -// Create second lever inheritance: the first lever is used as a reference type in use-case places -template <typename T> class ITemplSink : public ISink -{ -public: - ITemplSink(void *output_buffer, const size_t &output_size, const ir::Shape &shape, - const bool copy, ir::Layout io_layout) - : _output_buffer{reinterpret_cast<T *>(output_buffer)}, _output_size{output_size}, - _shape{shape}, _copy{copy}, _io_layout{io_layout} - { - } - -protected: - void pullUnif(onert::backend::ITensor &tensor) const - { - assert(((_io_layout == ir::Layout::NHWC && tensor.layout() == ir::Layout::NCHW) || - (_io_layout == ir::Layout::NCHW && tensor.layout() == ir::Layout::NHWC)) || - _copy); - auto input_buffer = tensor.buffer(); - auto rank = _shape.rank(); - - if (!tensor.has_padding() && rank < 4 + _copy) - { - memcpy(_output_buffer, input_buffer, _output_size); - return; - } - - switch (rank) - { - case 0: - case 1: - { - memcpy(_output_buffer, input_buffer, _output_size); - break; - } - case 2: - { - const int32_t copy_len = _shape.dim(1); - - for (auto i = 0; i < _shape.dim(0); ++i) - { - ir::Coordinates coords{i, 0}; - memcpy(_output_buffer + i * copy_len, input_buffer + tensor.calcOffset(coords), - copy_len * sizeof(T)); - } - break; - } - case 3: - { - const int32_t dim1 = _shape.dim(1); - const int32_t dim2 = _shape.dim(2); - - for (auto i = 0; i < _shape.dim(0); ++i) - { - for (auto j = 0; j < _shape.dim(1); ++j) - { - ir::Coordinates coords{i, j, 0}; - memcpy(_output_buffer + i * dim1 * dim2 + j * dim2, - input_buffer + tensor.calcOffset(coords), dim2 * sizeof(T)); - } - } - break; - } - case 4: - { - if (_copy) - { - const int32_t dim1 = _shape.dim(1); - const int32_t dim2 = _shape.dim(2); - const int32_t dim3 = _shape.dim(3); - - for (auto i = 0; i < _shape.dim(0); ++i) - { - for (auto j = 0; j < _shape.dim(1); ++j) - { - for (auto k = 0; k < _shape.dim(2); ++k) - { - ir::Coordinates coords{i, j, k, 0}; - memcpy(_output_buffer + i * dim1 * dim2 * dim3 + j * dim2 * dim3 + k * dim3, - input_buffer + tensor.calcOffset(coords), dim3 * sizeof(T)); - } - } - } - } - else - { - const auto shape = _shape.asFeature(_io_layout); - - if (_io_layout == ir::Layout::NHWC) - { - const exec::feature::nchw::Reader<T> from(&tensor); - exec::feature::nhwc::View<T> into(shape, _output_buffer, _output_size); - feature::iterate(shape) - << [&](uint32_t batch, uint32_t ch, uint32_t row, uint32_t col) { - const auto value = from.at(batch, ch, row, col); - 
into.at(batch, row, col, ch) = value; - }; - } - else if (_io_layout == ir::Layout::NCHW) - { - const exec::feature::nhwc::Reader<T> from(&tensor); - exec::feature::nchw::View<T> into(shape, _output_buffer, _output_size); - feature::iterate(shape) - << [&](uint32_t batch, uint32_t ch, uint32_t row, uint32_t col) { - const auto value = from.at(batch, row, col, ch); - into.at(batch, ch, row, col) = value; - }; - } - else - { - throw std::runtime_error("Wrong Layout"); - } - } - break; - } - default: - throw std::runtime_error("NYI: rank > 4"); - break; - } - } - -private: - T *_output_buffer; - const size_t _output_size; - const ir::Shape _shape; - const bool _copy; - const ir::Layout _io_layout; -}; - -template <typename T> class PermutateSink final : public ITemplSink<T> -{ -public: - PermutateSink(void *output_buffer, const size_t &output_size, const ir::Shape &shape, - ir::Layout io_layout) - : ITemplSink<T>(output_buffer, output_size, shape, false, io_layout) - { - } - -public: - void pull(onert::backend::ITensor &tensor) const override { ITemplSink<T>::pullUnif(tensor); } -}; - -// Only supports NHWC format front-end(NNAPI) now -template <typename T> class CopySink final : public ITemplSink<T> -{ -public: - CopySink(void *output_buffer, const size_t &output_size, const ir::Shape &shape, - ir::Layout io_layout = ir::Layout::UNKNOWN) - : ITemplSink<T>(output_buffer, output_size, shape, true, io_layout) - { - } - -public: - void pull(onert::backend::ITensor &tensor) const override { ITemplSink<T>::pullUnif(tensor); } -}; - -} // namespace exec -} // namespace onert - -#endif // __ONERT_EXEC_SINK_H__ diff --git a/runtime/onert/core/src/exec/Source.h b/runtime/onert/core/src/exec/Source.h deleted file mode 100644 index fb2be4dd8..000000000 --- a/runtime/onert/core/src/exec/Source.h +++ /dev/null @@ -1,208 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifndef __ONERT_EXEC_SOURCE_H__ -#define __ONERT_EXEC_SOURCE_H__ - -#include "feature/IndexIterator.h" -#include "feature/nchw/Reader.h" -#include "feature/nchw/View.h" -#include "feature/nhwc/Reader.h" -#include "feature/nhwc/View.h" - -#include <cassert> -#include <memory> -#include "util/Utils.h" -#include <ir/Layout.h> -#include "ir/Shape.h" - -namespace onert -{ -namespace exec -{ - -struct ISource -{ - virtual ~ISource() = default; - - virtual void push(::onert::backend::ITensor &tensor) const = 0; -}; - -// Create second lever inheritance: the first lever is used as a reference type in use-case places -template <typename T> class ITemplSource : public ISource -{ -public: - ITemplSource(const void *input_buffer, const size_t &input_size, const ir::Shape &shape, - const bool copy, ir::Layout io_layout) - : _input_buffer{reinterpret_cast<const T *>(input_buffer)}, _input_size{input_size}, - _shape{shape}, _copy(copy), _io_layout{io_layout} - { - } - - virtual void push(::onert::backend::ITensor &tensor) const = 0; - -protected: - void pushUnif(onert::backend::ITensor &tensor) const - { - assert(((_io_layout == ir::Layout::NHWC && tensor.layout() == ir::Layout::NCHW) || - (_io_layout == ir::Layout::NCHW && tensor.layout() == ir::Layout::NHWC)) || - _copy); - auto output_buffer = tensor.buffer(); - auto rank = _shape.rank(); - - if (!tensor.has_padding() && rank < 4 + _copy) - { - memcpy(output_buffer, _input_buffer, _input_size); - return; - } - - switch (rank) - { - case 0: - case 1: - { - memcpy(output_buffer, _input_buffer, _input_size); - break; - } - case 2: - { - const int32_t copy_len = _shape.dim(1); - - for (auto i = 0; i < _shape.dim(0); ++i) - { - ir::Coordinates coords{i, 0}; - memcpy(output_buffer + tensor.calcOffset(coords), _input_buffer + i * copy_len, - copy_len * sizeof(T)); - } - break; - } - case 3: - { - const int32_t dim1 = _shape.dim(1); - const int32_t dim2 = _shape.dim(2); - - for (auto i = 0; i < _shape.dim(0); ++i) - { - for (auto j = 0; j < _shape.dim(1); ++j) - { - ir::Coordinates coords{i, j, 0}; - memcpy(output_buffer + tensor.calcOffset(coords), - _input_buffer + i * dim1 * dim2 + j * dim2, dim2 * sizeof(T)); - } - } - break; - } - case 4: - { - if (_copy) - { - const int32_t dim1 = _shape.dim(1); - const int32_t dim2 = _shape.dim(2); - const int32_t dim3 = _shape.dim(3); - for (auto i = 0; i < _shape.dim(0); ++i) - { - for (auto j = 0; j < _shape.dim(1); ++j) - { - for (auto k = 0; k < _shape.dim(2); ++k) - { - ir::Coordinates coords{i, j, k, 0}; - memcpy(output_buffer + tensor.calcOffset(coords), - _input_buffer + i * dim1 * dim2 * dim3 + j * dim2 * dim3 + k * dim3, - dim3 * sizeof(T)); - } - } - } - } - else - { - const auto shape = _shape.asFeature(_io_layout); - - if (_io_layout == ir::Layout::NCHW) - { - const exec::feature::nchw::Reader<T> from(shape, _input_buffer, _input_size); - exec::feature::nhwc::View<T> into(&tensor); - feature::iterate(shape) - << [&](uint32_t batch, uint32_t ch, uint32_t row, uint32_t col) { - const auto value = from.at(batch, ch, row, col); - into.at(batch, row, col, ch) = value; - }; - } - else if (_io_layout == ir::Layout::NHWC) - { - const exec::feature::nhwc::Reader<T> from(shape, _input_buffer, _input_size); - exec::feature::nchw::View<T> into(&tensor); - feature::iterate(shape) - << [&](uint32_t batch, uint32_t ch, uint32_t row, uint32_t col) { - const auto value = from.at(batch, row, col, ch); - into.at(batch, ch, row, col) = value; - }; - } - else - { - throw std::runtime_error("Wrong Layout"); - } - } - - 
break; - } - default: - throw std::runtime_error("NYI: rank > 4"); - break; - } - } - -private: - const T *_input_buffer; - const size_t _input_size; - const ir::Shape _shape; - const bool _copy; - const ir::Layout _io_layout; -}; - -template <typename T> class PermutateSource final : public ITemplSource<T> -{ -public: - PermutateSource(const void *input_buffer, const size_t &input_size, const ir::Shape &shape, - ir::Layout io_layout) - : ITemplSource<T>(input_buffer, input_size, shape, false, io_layout) - { - } - -public: - void push(onert::backend::ITensor &tensor) const override - { - // do NHWC_TO_NCHW or NCHW_TO_NHWC permutation - ITemplSource<T>::pushUnif(tensor); - } -}; - -template <typename T> class CopySource final : public ITemplSource<T> -{ -public: - CopySource(const void *input_buffer, const size_t &input_size, const ir::Shape &shape, - ir::Layout io_layout = ir::Layout::UNKNOWN) - : ITemplSource<T>(input_buffer, input_size, shape, true, io_layout) - { - } - -public: - void push(onert::backend::ITensor &tensor) const override { ITemplSource<T>::pushUnif(tensor); } -}; - -} // namespace exec -} // namespace onert - -#endif // __ONERT_EXEC_SOURCE_H__ diff --git a/runtime/onert/core/src/interp/Tensor.h b/runtime/onert/core/src/interp/Tensor.h index 008a4b9d4..8b72d537d 100644 --- a/runtime/onert/core/src/interp/Tensor.h +++ b/runtime/onert/core/src/interp/Tensor.h @@ -171,7 +171,6 @@ public: int32_t data_offset() const override { return _info.typeInfo().offset(); } const ir::OperandInfo &tensorInfo() const override { return _info; } uint64_t num_elements() const override { return _info.shape().num_elements(); }; - backend::IDynamicTensorManager *dynamic_tensor_manager() override { return nullptr; } private: const ir::OperandInfo _info; diff --git a/runtime/onert/core/src/ir/Graph.cc b/runtime/onert/core/src/ir/Graph.cc index fe8b1b443..605562ebc 100644 --- a/runtime/onert/core/src/ir/Graph.cc +++ b/runtime/onert/core/src/ir/Graph.cc @@ -103,7 +103,7 @@ void Graph::initializeUseDef() { operations().iterate([&](const OperationIndex &index, const Operation &node) -> void { auto outputs = node.getOutputs(); - for (auto output : outputs) + for (auto output : outputs | ir::Remove::UNDEFINED) { operands().at(output).setDef(index); } diff --git a/runtime/onert/core/src/ir/GraphIterator.cc b/runtime/onert/core/src/ir/GraphIterator.cc index 4bea1a55d..ac67771c4 100644 --- a/runtime/onert/core/src/ir/GraphIterator.cc +++ b/runtime/onert/core/src/ir/GraphIterator.cc @@ -53,7 +53,7 @@ void PostDfsIterator<is_const>::iterate(GraphRef graph, const IterFn &fn) const return; visited[index] = true; - for (const auto output : node.getOutputs() | Remove::DUPLICATED) + for (const auto output : node.getOutputs() | Remove::DUPLICATED | Remove::UNDEFINED) { const auto &operand = graph.operands().at(output); for (const auto &use : operand.getUses()) @@ -86,7 +86,7 @@ void PostDfsIterator<is_const>::iterateOpSeqs(LoweredGraphRef lowered_graph, return; visited[index] = true; - for (const auto output : op_seq.getOutputs() | Remove::DUPLICATED) + for (const auto output : op_seq.getOutputs() | Remove::DUPLICATED | Remove::UNDEFINED) { const auto &operand = lowered_graph.graph().operands().at(output); for (const auto &use : operand.getUses()) diff --git a/runtime/onert/core/src/ir/Operation.cc b/runtime/onert/core/src/ir/Operation.cc index 04be8c0d9..4af878541 100644 --- a/runtime/onert/core/src/ir/Operation.cc +++ b/runtime/onert/core/src/ir/Operation.cc @@ -24,22 +24,33 @@ namespace ir { 
Operation::Operation(OperandConstraint input_constr, const OperandIndexSequence &inputs, - const OperandIndexSequence &outputs) - : _input_constr{input_constr}, _inputs{inputs}, _outputs{outputs} + const OperandIndexSequence &outputs, OperandConstraint output_constr) + : _input_constr{input_constr}, _output_constr{output_constr} { + setInputs(inputs); + setOutputs(outputs); } -Operation::Operation(OperandConstraint input_constr) : _input_constr{input_constr} {} +Operation::Operation(OperandConstraint input_constr, OperandConstraint output_constr) + : _input_constr{input_constr}, _output_constr{output_constr} +{ +} Operation::~Operation() = default; void Operation::setInputs(const OperandIndexSequence &indexes) { - assert(_input_constr.check(indexes.size())); + if (!_input_constr.check(indexes.size())) + throw std::runtime_error{"Invalid number of input tensors for this operation."}; _inputs = indexes; } -void Operation::setOutputs(const OperandIndexSequence &indexes) { _outputs = indexes; } +void Operation::setOutputs(const OperandIndexSequence &indexes) +{ + if (!_output_constr.check(indexes.size())) + throw std::runtime_error{"Invalid number of output tensors for this operation."}; + _outputs = indexes; +} void Operation::replaceInputs(const OperandIndex &from, const OperandIndex &to) { diff --git a/runtime/onert/core/src/ir/OperationDumper.cc b/runtime/onert/core/src/ir/OperationDumper.cc index 48361f464..eecfe81cc 100644 --- a/runtime/onert/core/src/ir/OperationDumper.cc +++ b/runtime/onert/core/src/ir/OperationDumper.cc @@ -40,7 +40,7 @@ void dumpUnaryInputOp(const Operation &node, const std::string &adding_input = " void dumpBinaryInputOp(const Operation &node, const std::string &adding_input = "") { VERBOSE(LIR) << "* " << node.name() << std::endl; - VERBOSE(LIR) << " - Inputs : Input(" << node.getInputs().at(0) << ", " << node.getInputs().at(0) + VERBOSE(LIR) << " - Inputs : Input(" << node.getInputs().at(0) << ", " << node.getInputs().at(1) << ") " << adding_input << std::endl; VERBOSE(LIR) << " - Output : Output(" << node.getOutputs().at(0) << ")" << std::endl; } @@ -72,7 +72,7 @@ OperationDumper::OperationDumper(const std::string &start_msg) VERBOSE(LIR) << start_msg << std::endl; } -void OperationDumper::visit(const ArgMax &node) { dumpUnaryInputOp(node); } +void OperationDumper::visit(const ArgMax &node) { dumpBinaryInputOp(node); } void OperationDumper::visit(const BatchToSpaceND &node) { @@ -82,6 +82,20 @@ void OperationDumper::visit(const BatchToSpaceND &node) dumpUnaryInputOp(node, block_size); } +void OperationDumper::visit(const BCQFullyConnected &node) +{ + VERBOSE(LIR) << "* " << node.name() << std::endl; + VERBOSE(LIR) << " - Inputs : IFM(" << node.getInputs().at(BCQFullyConnected::Input::INPUT) + << ") WeightsBinary(" + << node.getInputs().at(BCQFullyConnected::Input::WEIGHTS_BINARY) + << ") WeightsScales(" + << node.getInputs().at(BCQFullyConnected::Input::WEIGHTS_SCALES) + << ") WeightsClusters(" + << node.getInputs().at(BCQFullyConnected::Input::WEIGHTS_CLUSTERS) << ") Bias(" + << node.getInputs().at(BCQFullyConnected::Input::BIAS) << ")" << std::endl; + VERBOSE(LIR) << " - Output : OFM(" << node.getOutputs().at(0) << ")" << std::endl; +} + void OperationDumper::visit(const BinaryArithmetic &node) { dumpBinaryInputOp(node); } void OperationDumper::visit(const operation::BroadcastTo &node) { dumpBinaryInputOp(node); } @@ -185,6 +199,7 @@ void OperationDumper::visit(const LocalResponseNormalization &node) { dumpUnaryI void OperationDumper::visit(const LSTM &node) { 
+ VERBOSE(LIR) << "* " << node.name() << std::endl; VERBOSE(LIR) << " - Inputs : Input(" << node.getInputs().at(LSTM::Input::INPUT) << ") Input To Input Weights(" << node.getInputs().at(LSTM::Input::INPUT_TO_INPUT_WEIGHTS) @@ -209,12 +224,24 @@ void OperationDumper::visit(const LSTM &node) << node.getInputs().at(LSTM::Input::PROJECTION_WEIGHTS) << ") Projection Bias(" << node.getInputs().at(LSTM::Input::PROJECTION_BIAS) << ") Output State In(" << node.getInputs().at(LSTM::Input::OUTPUT_STATE_IN) << ") Cell State In(" - << node.getInputs().at(LSTM::Input::CELL_STATE_IN) << ")" << std::endl; + << node.getInputs().at(LSTM::Input::CELL_STATE_IN); + if (node.getInputs().size() == 24) + { + VERBOSE(LIR) << ") Input Layer Normalization Weights(" + << node.getInputs().at(LSTM::Input::INPUT_LAYER_NORMALIZATION_WEIGHTS) + << ") Forget Layer Normalization Weights(" + << node.getInputs().at(LSTM::Input::FORGET_LAYER_NORMALIZATION_WEIGHTS) + << ") Cell Layer Normalization Weights(" + << node.getInputs().at(LSTM::Input::CELL_LAYER_NORMALIZATION_WEIGHTS) + << ") Ouput Layer Normalization Weights(" + << node.getInputs().at(LSTM::Input::OUTPUT_LAYER_NORMALIZATION_WEIGHTS); + } + VERBOSE(LIR) << ")" << std::endl; VERBOSE(LIR) << " - Output : Scratch Buffer(" << node.getOutputs().at(LSTM::Output::SCRATCH_BUFFER) << ") Output State Out(" - << node.getInputs().at(LSTM::Output::OUTPUT_STATE_OUT) << ") Cell State Out(" - << node.getInputs().at(LSTM::Output::CELL_STATE_OUT) << ") Output(" - << node.getInputs().at(LSTM::Output::OUTPUT) << ")" << std::endl; + << node.getOutputs().at(LSTM::Output::OUTPUT_STATE_OUT) << ") Cell State Out(" + << node.getOutputs().at(LSTM::Output::CELL_STATE_OUT) << ") Output(" + << node.getOutputs().at(LSTM::Output::OUTPUT) << ")" << std::endl; } void OperationDumper::visit(const Pack &node) { dumpPackingOp(node); } @@ -279,7 +306,37 @@ void OperationDumper::visit(const Reshape &node) dumpUnaryInputOp(node, shape); } -void OperationDumper::visit(const ResizeBilinear &node) { dumpUnaryInputOp(node); } +void OperationDumper::visit(const ResizeBilinear &node) +{ + if (node.getInputs().size() == 1) + { + dumpUnaryInputOp(node); + } + else if (node.getInputs().size() == 2) + { + dumpBinaryInputOp(node); + } + else + { + VERBOSE(LIR) << "* " << node.name() << " is set wrong" << std::endl; + } +} + +void OperationDumper::visit(const ResizeNearestNeighbor &node) +{ + if (node.getInputs().size() == 1) + { + dumpUnaryInputOp(node); + } + else if (node.getInputs().size() == 2) + { + dumpBinaryInputOp(node); + } + else + { + VERBOSE(LIR) << "* " << node.name() << " is set wrong" << std::endl; + } +} void OperationDumper::visit(const Reverse &node) { @@ -336,7 +393,7 @@ void OperationDumper::visit(const SpaceToBatchND &node) void OperationDumper::visit(const SpaceToDepth &node) { dumpUnaryInputOp(node); } -void OperationDumper::visit(const Split &node) { dumpUnaryInputOp(node); } +void OperationDumper::visit(const Split &node) { dumpBinaryInputOp(node); } void OperationDumper::visit(const SquaredDifference &node) { dumpBinaryInputOp(node); } @@ -384,7 +441,7 @@ void OperationDumper::visit(const TransposeConv &node) VERBOSE(LIR) << " - Output : OFM(" << node.getOutputs().at(0) << ")" << std::endl; } -void OperationDumper::visit(const Transpose &node) { dumpUnaryInputOp(node); } +void OperationDumper::visit(const Transpose &node) { dumpBinaryInputOp(node); } void OperationDumper::visit(const Unpack &node) { diff --git a/runtime/onert/core/src/ir/OperationDumper.h 
b/runtime/onert/core/src/ir/OperationDumper.h index e8ab3b3cd..91642ab13 100644 --- a/runtime/onert/core/src/ir/OperationDumper.h +++ b/runtime/onert/core/src/ir/OperationDumper.h @@ -33,6 +33,7 @@ public: public: void visit(const operation::ArgMax &) override; void visit(const operation::BatchToSpaceND &node) override; + void visit(const operation::BCQFullyConnected &node) override; void visit(const operation::BinaryArithmetic &node) override; void visit(const operation::BroadcastTo &) override; void visit(const operation::Comparison &) override; @@ -65,6 +66,7 @@ public: void visit(const operation::Reduce &) override; void visit(const operation::Reshape &node) override; void visit(const operation::ResizeBilinear &) override; + void visit(const operation::ResizeNearestNeighbor &) override; void visit(const operation::Reverse &) override; void visit(const operation::RNN &) override; void visit(const operation::Select &node) override; diff --git a/runtime/onert/core/src/ir/operation/ArgMax.cc b/runtime/onert/core/src/ir/operation/ArgMax.cc index 1275ae43a..f3bd8fd73 100644 --- a/runtime/onert/core/src/ir/operation/ArgMax.cc +++ b/runtime/onert/core/src/ir/operation/ArgMax.cc @@ -31,7 +31,7 @@ void ArgMax::accept(OperationVisitor &v) const { v.visit(*this); } ArgMax::ArgMax(const OperandIndexSequence &inputs, const OperandIndexSequence &outputs, const Param &param) - : Operation{OperandConstraint::createExact(1u), inputs, outputs}, _param{param} + : Operation{OperandConstraint::createExact(2u), inputs, outputs}, _param{param} { } diff --git a/runtime/onert/core/src/ir/operation/BatchToSpaceND.cc b/runtime/onert/core/src/ir/operation/BatchToSpaceND.cc index 9ef2b125f..34be79dd2 100644 --- a/runtime/onert/core/src/ir/operation/BatchToSpaceND.cc +++ b/runtime/onert/core/src/ir/operation/BatchToSpaceND.cc @@ -31,7 +31,7 @@ void BatchToSpaceND::accept(OperationVisitor &v) const { v.visit(*this); } BatchToSpaceND::BatchToSpaceND(const OperandIndexSequence &inputs, const OperandIndexSequence &outputs) - : Operation{OperandConstraint::createExact(3u), inputs, outputs} + : Operation{OperandConstraint::createInRange(2u, 3u), inputs, outputs} { } diff --git a/runtime/onert/core/src/ir/operation/ElementwiseUnary.cc b/runtime/onert/core/src/ir/operation/ElementwiseUnary.cc index 7dfcd4a98..6a0be7eb8 100644 --- a/runtime/onert/core/src/ir/operation/ElementwiseUnary.cc +++ b/runtime/onert/core/src/ir/operation/ElementwiseUnary.cc @@ -32,7 +32,9 @@ void ElementwiseUnary::accept(OperationVisitor &v) const { v.visit(*this); } ElementwiseUnary::ElementwiseUnary(const OperandIndexSequence &inputs, const OperandIndexSequence &outputs, const Param &param) - : Operation{OperandConstraint::createExact(1u), inputs, outputs}, _param{param} + : Operation{OperandConstraint::createExact(1u), inputs, outputs, + OperandConstraint::createExact(1u)}, + _param{param} { } diff --git a/runtime/onert/core/src/ir/operation/Fill.cc b/runtime/onert/core/src/ir/operation/Fill.cc index c44f45aab..b8b97d1c0 100644 --- a/runtime/onert/core/src/ir/operation/Fill.cc +++ b/runtime/onert/core/src/ir/operation/Fill.cc @@ -30,7 +30,7 @@ namespace operation void Fill::accept(OperationVisitor &v) const { v.visit(*this); } Fill::Fill(const OperandIndexSequence &inputs, const OperandIndexSequence &outputs) - : Operation{OperandConstraint::createExact(1u), inputs, outputs} + : Operation{OperandConstraint::createExact(2u), inputs, outputs} { } diff --git a/runtime/onert/core/src/ir/operation/FullyConnected.cc
b/runtime/onert/core/src/ir/operation/FullyConnected.cc index 118ae554a..9837a3137 100644 --- a/runtime/onert/core/src/ir/operation/FullyConnected.cc +++ b/runtime/onert/core/src/ir/operation/FullyConnected.cc @@ -31,7 +31,7 @@ void FullyConnected::accept(OperationVisitor &v) const { v.visit(*this); } FullyConnected::FullyConnected(const OperandIndexSequence &inputs, const OperandIndexSequence &outputs, const Param &param) - : Operation{OperandConstraint::createExact(3u), inputs, outputs}, _param{param} + : Operation{OperandConstraint::createInRange(2u, 3u), inputs, outputs}, _param{param} { } diff --git a/runtime/onert/core/src/ir/operation/LSTM.cc b/runtime/onert/core/src/ir/operation/LSTM.cc index 30a865326..5cd7c793a 100644 --- a/runtime/onert/core/src/ir/operation/LSTM.cc +++ b/runtime/onert/core/src/ir/operation/LSTM.cc @@ -31,10 +31,18 @@ void LSTM::accept(OperationVisitor &v) const { v.visit(*this); } LSTM::LSTM(const OperandIndexSequence &inputs, const OperandIndexSequence &outputs, const Param &param) - : Operation{OperandConstraint::createExact(23u), inputs, outputs}, _param{param} + : Operation{OperandConstraint::createInRange(20u, 24u), inputs, outputs}, _param{param} { } +std::string LSTM::name() const +{ + if (getOutputs().at(Output::SCRATCH_BUFFER).undefined()) + return std::string{"UnidirectionalSequenceLSTM"}; + else + return Operation::name(); +} + } // namespace operation } // namespace ir } // namespace onert diff --git a/runtime/onert/core/src/ir/operation/Pack.cc b/runtime/onert/core/src/ir/operation/Pack.cc index f0908a2c6..784d4162a 100644 --- a/runtime/onert/core/src/ir/operation/Pack.cc +++ b/runtime/onert/core/src/ir/operation/Pack.cc @@ -25,7 +25,7 @@ namespace operation void Pack::accept(OperationVisitor &v) const { v.visit(*this); } Pack::Pack(const OperandIndexSequence &inputs, const OperandIndexSequence &outputs, const Param &param) - : Operation{OperandConstraint::createAtLeast(3u), inputs, outputs}, _param{param} + : Operation{OperandConstraint::createAtLeast(1u), inputs, outputs}, _param{param} { } } // namespace operation diff --git a/runtime/onert/core/src/ir/operation/ResizeBilinear.cc b/runtime/onert/core/src/ir/operation/ResizeBilinear.cc index d0d89f45f..71925bb44 100644 --- a/runtime/onert/core/src/ir/operation/ResizeBilinear.cc +++ b/runtime/onert/core/src/ir/operation/ResizeBilinear.cc @@ -31,7 +31,7 @@ void ResizeBilinear::accept(OperationVisitor &v) const { v.visit(*this); } ResizeBilinear::ResizeBilinear(const OperandIndexSequence &inputs, const OperandIndexSequence &outputs, const Param &param) - : Operation{OperandConstraint::createExact(1u), inputs, outputs}, _param{param} + : Operation{OperandConstraint::createInRange(1u, 2u), inputs, outputs}, _param{param} { } diff --git a/runtime/onert/core/src/ir/operation/ResizeNearestNeighbor.cc b/runtime/onert/core/src/ir/operation/ResizeNearestNeighbor.cc index 9f17af97c..98d0b5f26 100644 --- a/runtime/onert/core/src/ir/operation/ResizeNearestNeighbor.cc +++ b/runtime/onert/core/src/ir/operation/ResizeNearestNeighbor.cc @@ -32,7 +32,7 @@ void ResizeNearestNeighbor::accept(OperationVisitor &v) const { v.visit(*this); ResizeNearestNeighbor::ResizeNearestNeighbor(const OperandIndexSequence &inputs, const OperandIndexSequence &outputs, const Param &param) - : Operation{OperandConstraint::createExact(1u), inputs, outputs}, _param{param} + : Operation{OperandConstraint::createInRange(1u, 2u), inputs, outputs}, _param{param} { } diff --git a/runtime/onert/core/src/ir/operation/Split.cc
b/runtime/onert/core/src/ir/operation/Split.cc index 244884e41..b538e9206 100644 --- a/runtime/onert/core/src/ir/operation/Split.cc +++ b/runtime/onert/core/src/ir/operation/Split.cc @@ -25,7 +25,7 @@ namespace operation void Split::accept(OperationVisitor &v) const { v.visit(*this); } Split::Split(const OperandIndexSequence &inputs, const OperandIndexSequence &outputs, const Param &param) - : Operation{OperandConstraint::createExact(1u), inputs, outputs}, _param{param} + : Operation{OperandConstraint::createExact(2u), inputs, outputs}, _param{param} { } } // namespace operation diff --git a/runtime/onert/core/src/ir/operation/Transpose.cc b/runtime/onert/core/src/ir/operation/Transpose.cc index 3a663fbce..997f98ab0 100644 --- a/runtime/onert/core/src/ir/operation/Transpose.cc +++ b/runtime/onert/core/src/ir/operation/Transpose.cc @@ -29,9 +29,8 @@ namespace operation void Transpose::accept(OperationVisitor &v) const { v.visit(*this); } -Transpose::Transpose(const OperandIndexSequence &inputs, const OperandIndexSequence &outputs, - const Param &param) - : Operation{OperandConstraint::createExact(2u), inputs, outputs}, _param{param} +Transpose::Transpose(const OperandIndexSequence &inputs, const OperandIndexSequence &outputs) + : Operation{OperandConstraint::createExact(2u), inputs, outputs} { } diff --git a/runtime/onert/core/src/ir/verifier/Verifier.cc b/runtime/onert/core/src/ir/verifier/Verifier.cc index 09cbdcf2f..489845971 100644 --- a/runtime/onert/core/src/ir/verifier/Verifier.cc +++ b/runtime/onert/core/src/ir/verifier/Verifier.cc @@ -51,7 +51,7 @@ bool DAGChecker::verify(const Graph &graph) const noexcept visited[index] = true; on_stack[index] = true; - for (auto output : node.getOutputs() | Remove::DUPLICATED) + for (auto output : node.getOutputs() | Remove::DUPLICATED | Remove::UNDEFINED) { const auto &operand = graph.operands().at(output); for (const auto &use : operand.getUses()) @@ -99,7 +99,7 @@ bool EdgeConsistencyChecker::verify(const Graph &graph) const noexcept errors += 1; } } - for (auto operand_index : node.getOutputs()) + for (auto operand_index : node.getOutputs() | ir::Remove::UNDEFINED) { try { diff --git a/runtime/onert/core/src/util/EventCollectorGlobal.cc b/runtime/onert/core/src/util/EventCollectorGlobal.cc index d09b95210..6c03a5b9a 100644 --- a/runtime/onert/core/src/util/EventCollectorGlobal.cc +++ b/runtime/onert/core/src/util/EventCollectorGlobal.cc @@ -21,6 +21,7 @@ #include <iostream> #include "util/ConfigSource.h" +#include "util/EventWriter.h" namespace onert { @@ -39,8 +40,8 @@ EventCollectorGlobal::~EventCollectorGlobal() try { // TODO Need better way for saved file path than the hardcoded path - std::ofstream ofs{"trace.global.json"}; - _recorder.writeToFile(ofs); + EventWriter{_recorder}.writeToFile("trace.global.json", + EventWriter::WriteFormat::CHROME_TRACING); } catch (const std::exception &e) { diff --git a/runtime/onert/core/src/util/EventRecorder.cc b/runtime/onert/core/src/util/EventRecorder.cc index 13a599bed..3714e4f02 100644 --- a/runtime/onert/core/src/util/EventRecorder.cc +++ b/runtime/onert/core/src/util/EventRecorder.cc @@ -16,389 +16,6 @@ #include "util/EventRecorder.h" -#include <sstream> -#include <vector> -#include <unordered_map> -#include <json/json.h> -#include <assert.h> -#include <utility> -#include <map> -#include <set> -#include <stdint.h> - -// json type for Chrome Event Trace -namespace -{ - -std::string quote(const std::string &value) -{ - std::stringstream ss; - ss << '"' << value << '"'; - return ss.str(); -} - -std::string
field(const std::string &k, const std::string &v) -{ - std::stringstream ss; - ss << quote(k) << " : " << quote(v); - return ss.str(); -} - -struct Content // One Entry in Chrome Event Trace -{ - std::vector<std::pair<std::string, std::string>> flds; - std::vector<std::pair<std::string, std::string>> args; -}; - -std::string object(const Content &content) -{ - std::stringstream ss; - - ss << "{ "; - - ss << field(content.flds[0].first, content.flds[0].second); - - for (uint32_t n = 1; n < content.flds.size(); ++n) - { - ss << ", " << field(content.flds.at(n).first, content.flds.at(n).second); - } - - if (content.args.size() > 0) - { - ss << ", " << quote("args") << " : { "; - ss << field(content.args.at(0).first, content.args.at(0).second); - - for (uint32_t n = 1; n < content.args.size(); ++n) - { - ss << ", " << field(content.args.at(n).first, content.args.at(n).second); - } - - ss << "}"; - } - - ss << " }"; - - return ss.str(); -} - -void fill(Content &content, const Event &evt) -{ - content.flds.emplace_back("name", evt.name); - content.flds.emplace_back("pid", "0"); - content.flds.emplace_back("tid", evt.tid); - content.flds.emplace_back("ph", evt.ph); - content.flds.emplace_back("ts", evt.ts); -} - -std::string object(const DurationEvent &evt) -{ - Content content; - - fill(content, evt); - - return ::object(content); -} - -std::string object(const CounterEvent &evt) -{ - Content content; - - fill(content, evt); - - for (auto it = evt.values.begin(); it != evt.values.end(); ++it) - { - content.args.emplace_back(it->first, it->second); - } - - return ::object(content); -} - -} // namespace - -// md table type -namespace -{ - -void writeMDTableRow(std::ostream &os, const std::vector<std::string> &list) -{ - os << "| "; - for (auto &key : list) - { - os << key << " | "; - } - os << "\n"; -} - -struct MDContent -{ - std::string name; - uint64_t begin_ts; - uint64_t end_ts; - uint32_t min_rss; - uint32_t max_rss; - uint32_t min_page_reclaims; - uint32_t max_page_reclaims; - - MDContent() - : begin_ts(0), end_ts(0), min_rss(UINT32_MAX), max_rss(0), min_page_reclaims(UINT32_MAX), - max_page_reclaims(0) - { - // DO NOTHING - } - - virtual ~MDContent() = default; - - void updateRss(uint32_t rss) - { - if (min_rss == UINT32_MAX) - min_rss = rss; - if (max_rss == 0) - max_rss = rss; - - if (min_rss > rss) - min_rss = rss; - else if (max_rss < rss) - max_rss = rss; - } - - void updateMinflt(uint32_t minflt) - { - if (min_page_reclaims == UINT32_MAX) - min_page_reclaims = minflt; - if (max_page_reclaims == 0) - max_page_reclaims = minflt; - - if (min_page_reclaims > minflt) - min_page_reclaims = minflt; - else if (max_page_reclaims < minflt) - max_page_reclaims = minflt; - } - - virtual void write(std::ostream &os) const = 0; -}; - -struct OpSeq : public MDContent -{ - std::string backend; - uint64_t graph_latency; - - struct OpSeqCmp - { - bool operator()(const OpSeq &lhs, const OpSeq &rhs) const - { - return lhs.begin_ts < rhs.begin_ts; - } - bool operator()(const OpSeq &lhs, const OpSeq &rhs) { return lhs.begin_ts < rhs.begin_ts; } - bool operator()(OpSeq &lhs, OpSeq &rhs) { return lhs.begin_ts < rhs.begin_ts; } - }; - - void write(std::ostream &os) const override - { - uint64_t opseq_latency = end_ts - begin_ts; - double opseq_per = static_cast<double>(opseq_latency) / graph_latency * 100.0; - writeMDTableRow(os, {name, backend, std::to_string(opseq_latency), std::to_string(opseq_per), - std::to_string(min_rss), std::to_string(max_rss), - std::to_string(min_page_reclaims), 
std::to_string(max_page_reclaims)}); - } -}; - -struct Graph : public MDContent -{ - std::set<OpSeq, OpSeq::OpSeqCmp> opseqs; - - void setOpSeqs(const std::map<std::string, OpSeq> &name_to_opseq) - { - uint64_t graph_latency = end_ts - begin_ts; - for (auto it : name_to_opseq) - { - auto opseq = it.second; - opseq.graph_latency = graph_latency; - - opseqs.insert(opseq); - - updateRss(opseq.min_rss); - updateRss(opseq.max_rss); - updateMinflt(opseq.min_page_reclaims); - updateMinflt(opseq.max_page_reclaims); - } - } - - void write(std::ostream &os) const override - { - static std::vector<std::string> graph_headers{"latency(us)", "rss_min(kb)", "rss_max(kb)", - "page_reclaims_min", "page_reclaims_max"}; - - static std::vector<std::string> graph_headers_line{"-----------", "-------", "-------", - "-----------------", "-----------------"}; - - // Graph's Header - writeMDTableRow(os, graph_headers); - writeMDTableRow(os, graph_headers_line); - - // Graph's contents - writeMDTableRow(os, {std::to_string(end_ts - begin_ts), std::to_string(min_rss), - std::to_string(max_rss), std::to_string(min_page_reclaims), - std::to_string(max_page_reclaims)}); - - os << "\n"; - - static std::vector<std::string> opseq_headers{ - "OpSeq name", "backend", "latency(us)", "latency(%)", - "rss_min(kb)", "rss_max(kb)", "page_reclaims_min", "page_reclaims_max"}; - - static std::vector<std::string> opseq_headers_line{ - "----------", "-------", "-----------", "-----------", - "-------", "-------", "-----------------", "-----------------"}; - - os << "## OpSequences \n"; - - // OpSeq's Header - writeMDTableRow(os, opseq_headers); - writeMDTableRow(os, opseq_headers_line); - - // OpSeq's contents - for (auto opseq : opseqs) - { - opseq.write(os); - } - - os << "\n"; - } -}; - -struct MDTableBuilder -{ - MDTableBuilder(const std::vector<DurationEvent> &duration_events, - const std::vector<CounterEvent> &counter_events) - : _duration_events(duration_events), _counter_events(counter_events) - { - for (const auto &evt : _counter_events) - { - uint64_t ts = std::stoull(evt.ts); - auto &name = evt.name; - assert(name.compare("maxrss") == 0 || name.compare("minflt") == 0); - assert(evt.values.size() == 1); - auto &val = evt.values.begin()->second; - if (_ts_to_values.find(ts) == _ts_to_values.end()) - { - std::pair<uint32_t, uint32_t> values; - if (name.compare("maxrss") == 0) - values.first = std::stoul(val); - else - values.second = std::stoul(val); - _ts_to_values.insert({ts, values}); - } - else - { - auto &values = _ts_to_values.at(ts); - if (name.compare("maxrss") == 0) - values.first = std::stoul(val); - else - values.second = std::stoul(val); - } - } - } - - MDTableBuilder &build() - { - for (auto &it : divideGraph()) - { - size_t begin_idx = it.first; - size_t end_idx = it.second; - std::map<std::string, OpSeq> name_to_opseq; - for (size_t i = begin_idx + 1; i < end_idx; ++i) - { - const auto &evt = _duration_events[i]; - assert(evt.name.compare("Graph") != 0); - assert(evt.ph.compare("B") == 0 || evt.ph.compare("E") == 0); - if (evt.ph.compare("B") == 0) - { - assert(name_to_opseq.find(evt.name) == name_to_opseq.end()); - name_to_opseq.insert({evt.name, makeOpSeq(evt)}); - } - else - { - assert(name_to_opseq.find(evt.name) != name_to_opseq.end()); - auto &opseq = name_to_opseq.at(evt.name); - updateOpSeq(opseq, evt); - } - } - - _graphs.emplace_back(makeGraph(begin_idx, end_idx, name_to_opseq)); - } - - return *this; - } - - std::vector<std::pair<size_t, size_t>> divideGraph() - { - std::vector<std::pair<size_t, 
size_t>> graph_idx_list; // pair<begin_idx, end_idx> - for (size_t i = 0, begin_idx = 0; i < _duration_events.size(); ++i) - { - const auto &evt = _duration_events.at(i); - if (evt.name.compare("Graph") == 0) - { - if (evt.ph.compare("B") == 0) - begin_idx = i; - else - graph_idx_list.emplace_back(begin_idx, i); - } - } - return graph_idx_list; - } - - OpSeq makeOpSeq(const DurationEvent &evt) - { - OpSeq opseq; - opseq.name = evt.name; - opseq.begin_ts = std::stoull(evt.ts); - opseq.updateRss(_ts_to_values.at(opseq.begin_ts).first); - opseq.updateMinflt(_ts_to_values.at(opseq.begin_ts).second); - opseq.backend = evt.tid; - return opseq; - } - - void updateOpSeq(OpSeq &opseq, const DurationEvent &evt) - { - opseq.end_ts = std::stoull(evt.ts); - opseq.updateRss(_ts_to_values.at(opseq.end_ts).first); - opseq.updateMinflt(_ts_to_values.at(opseq.end_ts).second); - } - - Graph makeGraph(size_t begin_idx, size_t end_idx, - const std::map<std::string, OpSeq> &name_to_opseq) - { - Graph graph; - graph.name = "Graph"; - graph.begin_ts = std::stoull(_duration_events[begin_idx].ts); - graph.updateRss(_ts_to_values.at(graph.begin_ts).first); - graph.updateMinflt(_ts_to_values.at(graph.begin_ts).second); - graph.end_ts = std::stoull(_duration_events[end_idx].ts); - graph.updateRss(_ts_to_values.at(graph.end_ts).first); - graph.updateMinflt(_ts_to_values.at(graph.end_ts).second); - graph.setOpSeqs(name_to_opseq); - return graph; - } - - void write(std::ostream &os) - { - // Write contents - for (size_t i = 0; i < _graphs.size(); ++i) - { - os << "# Graph " << i << "\n"; - _graphs.at(i).write(os); - } - } - - const std::vector<DurationEvent> &_duration_events; - const std::vector<CounterEvent> &_counter_events; - // timestamp to std::pair<maxrss, minflt> - std::unordered_map<uint64_t, std::pair<uint32_t, uint32_t>> _ts_to_values; - std::vector<Graph> _graphs; -}; - -} // namespace - void EventRecorder::emit(const DurationEvent &evt) { std::lock_guard<std::mutex> lock{_mu}; @@ -412,146 +29,3 @@ void EventRecorder::emit(const CounterEvent &evt) _counter_events.push_back(evt); } - -void EventRecorder::writeToFile(std::ostream &os) -{ - std::lock_guard<std::mutex> lock{_mu}; - - switch (_write_format) - { - case WriteFormat::CHROME_TRACING: - writeChromeTrace(os); - break; - case WriteFormat::SNPE_BENCHMARK: - writeSNPEBenchmark(os); - break; - case WriteFormat::MD_TABLE: - writeMDTable(os); - break; - default: - assert(!"Invalid value"); - break; - } -} - -void EventRecorder::writeSNPEBenchmark(std::ostream &os) -{ - Json::Value root; - auto &exec_data = root["Execution_Data"] = Json::Value{Json::objectValue}; - - struct Stat - { - uint64_t sum = 0; - uint64_t count = 0; - uint64_t max = 0; - uint64_t min = std::numeric_limits<uint64_t>::max(); - - void accumulate(uint64_t val) - { - sum += val; - count++; - max = std::max(max, val); - min = std::min(min, val); - } - }; - - // Memory - { - std::unordered_map<std::string, Stat> mem_stats; - for (auto &evt : _counter_events) - { - auto &mem_stat = mem_stats[evt.name]; - uint64_t val = std::stoull(evt.values["value"]); - mem_stat.accumulate(val); - } - - auto &mem = exec_data["memory"] = Json::Value{Json::objectValue}; - for (auto &kv : mem_stats) - { - auto &key = kv.first; - auto &val = kv.second; - mem[key]["Avg_Size"] = val.sum / val.count; - mem[key]["Max_Size"] = val.max; - mem[key]["Min_Size"] = val.min; - mem[key]["Runtime"] = "NA"; - } - } - - // Operation Execution Time - { - // NOTE This assumes _duration_events is sorted by "ts" ascending - - // 2D 
keys : stats[tid][name] - std::unordered_map<std::string, std::unordered_map<std::string, Stat>> stats; - std::unordered_map<std::string, std::unordered_map<std::string, uint64_t>> begin_timestamps; - for (auto &evt : _duration_events) - { - auto &stat = stats[evt.tid][evt.name]; - auto &begin_ts = begin_timestamps[evt.tid][evt.name]; - uint64_t timestamp = std::stoull(evt.ts); - if (evt.ph == "B") - { - if (begin_ts != 0) - throw std::runtime_error{"Invalid Data"}; - begin_ts = timestamp; - } - else if (evt.ph == "E") - { - if (begin_ts == 0 || timestamp < begin_ts) - throw std::runtime_error{"Invalid Data"}; - stat.accumulate(timestamp - begin_ts); - begin_ts = 0; - } - else - throw std::runtime_error{"Invalid Data - invalid value for \"ph\" : \"" + evt.ph + "\""}; - } - - for (auto &kv : begin_timestamps) - for (auto &kv2 : kv.second) - if (kv2.second != 0) - throw std::runtime_error{"Invalid Data - B and E pair does not match."}; - - for (auto &kv : stats) - { - auto &tid = kv.first; - auto &map = kv.second; - auto &json_tid = exec_data[tid] = Json::Value{Json::objectValue}; - for (auto &kv : map) - { - auto &name = kv.first; - auto &val = kv.second; - json_tid[name]["Avg_Time"] = val.sum / val.count; - json_tid[name]["Max_Time"] = val.max; - json_tid[name]["Min_Time"] = val.min; - json_tid[name]["Runtime"] = tid; - } - } - } - - os << root; -} - -void EventRecorder::writeChromeTrace(std::ostream &os) -{ - os << "{\n"; - os << " " << quote("traceEvents") << ": [\n"; - - for (auto &evt : _duration_events) - { - os << " " << object(evt) << ",\n"; - } - - for (auto &evt : _counter_events) - { - os << " " << object(evt) << ",\n"; - } - - os << " { }\n"; - os << " ]\n"; - os << "}\n"; -} - -void EventRecorder::writeMDTable(std::ostream &os) -{ - MDTableBuilder(_duration_events, _counter_events).build().write(os); -} diff --git a/runtime/onert/core/src/util/EventRecorder.h b/runtime/onert/core/src/util/EventRecorder.h index 37ec1a0f1..7af4c7ddb 100644 --- a/runtime/onert/core/src/util/EventRecorder.h +++ b/runtime/onert/core/src/util/EventRecorder.h @@ -21,7 +21,6 @@ #include <memory> #include <mutex> -#include <ostream> #include <vector> struct Event @@ -50,14 +49,6 @@ struct CounterEvent : public Event class EventRecorder { public: - enum class WriteFormat - { - CHROME_TRACING, - SNPE_BENCHMARK, - MD_TABLE, - }; - -public: EventRecorder() = default; public: @@ -66,18 +57,11 @@ public: public: bool empty() { return _duration_events.empty() && _counter_events.empty(); } - void writeToFile(std::ostream &os); - void setWriteFormat(WriteFormat write_format) { _write_format = write_format; } - -private: - void writeSNPEBenchmark(std::ostream &os); - void writeChromeTrace(std::ostream &os); - void writeMDTable(std::ostream &os); + const std::vector<DurationEvent> &duration_events() const { return _duration_events; } + const std::vector<CounterEvent> &counter_events() const { return _counter_events; } private: std::mutex _mu; - // TODO: Allow user to control write_format - WriteFormat _write_format{WriteFormat::SNPE_BENCHMARK}; std::vector<DurationEvent> _duration_events; std::vector<CounterEvent> _counter_events; }; diff --git a/runtime/onert/core/src/util/EventWriter.cc b/runtime/onert/core/src/util/EventWriter.cc new file mode 100644 index 000000000..dacb40e64 --- /dev/null +++ b/runtime/onert/core/src/util/EventWriter.cc @@ -0,0 +1,574 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. 
All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "util/EventWriter.h" + +#include <sstream> +#include <vector> +#include <unordered_map> +#include <json/json.h> +#include <assert.h> +#include <utility> +#include <map> +#include <set> +#include <stdint.h> +#include <fstream> + +// json type for Chrome Event Trace +namespace +{ + +std::string quote(const std::string &value) +{ + std::stringstream ss; + ss << '"' << value << '"'; + return ss.str(); +} + +std::string field(const std::string &k, const std::string &v) +{ + std::stringstream ss; + ss << quote(k) << " : " << quote(v); + return ss.str(); +} + +struct Content // One Entry in Chrome Event Trace +{ + std::vector<std::pair<std::string, std::string>> flds; + std::vector<std::pair<std::string, std::string>> args; +}; + +std::string object(const Content &content) +{ + std::stringstream ss; + + ss << "{ "; + + ss << field(content.flds[0].first, content.flds[0].second); + + for (uint32_t n = 1; n < content.flds.size(); ++n) + { + ss << ", " << field(content.flds.at(n).first, content.flds.at(n).second); + } + + if (content.args.size() > 0) + { + ss << ", " << quote("args") << " : { "; + ss << field(content.args.at(0).first, content.args.at(0).second); + + for (uint32_t n = 1; n < content.args.size(); ++n) + { + ss << ", " << field(content.args.at(n).first, content.args.at(n).second); + } + + ss << "}"; + } + + ss << " }"; + + return ss.str(); +} + +void fill(Content &content, const Event &evt) +{ + content.flds.emplace_back("name", evt.name); + content.flds.emplace_back("pid", "0"); + content.flds.emplace_back("tid", evt.tid); + content.flds.emplace_back("ph", evt.ph); + content.flds.emplace_back("ts", evt.ts); +} + +std::string object(const DurationEvent &evt) +{ + Content content; + + fill(content, evt); + + return ::object(content); +} + +std::string object(const CounterEvent &evt) +{ + Content content; + + fill(content, evt); + + for (auto it = evt.values.begin(); it != evt.values.end(); ++it) + { + content.args.emplace_back(it->first, it->second); + } + + return ::object(content); +} + +} // namespace + +// md table type +namespace +{ + +void writeMDTableRow(std::ostream &os, const std::vector<std::string> &list) +{ + os << "| "; + for (auto &key : list) + { + os << key << " | "; + } + os << "\n"; +} + +struct MDContent +{ + std::string name; + uint64_t begin_ts; + uint64_t end_ts; + uint32_t min_rss; + uint32_t max_rss; + uint32_t min_page_reclaims; + uint32_t max_page_reclaims; + + MDContent() + : begin_ts(0), end_ts(0), min_rss(UINT32_MAX), max_rss(0), min_page_reclaims(UINT32_MAX), + max_page_reclaims(0) + { + // DO NOTHING + } + + virtual ~MDContent() = default; + + void updateRss(uint32_t rss) + { + if (min_rss == UINT32_MAX) + min_rss = rss; + if (max_rss == 0) + max_rss = rss; + + if (min_rss > rss) + min_rss = rss; + else if (max_rss < rss) + max_rss = rss; + } + + void updateMinflt(uint32_t minflt) + { + if (min_page_reclaims == UINT32_MAX) + min_page_reclaims = minflt; + if 
(max_page_reclaims == 0) + max_page_reclaims = minflt; + + if (min_page_reclaims > minflt) + min_page_reclaims = minflt; + else if (max_page_reclaims < minflt) + max_page_reclaims = minflt; + } + + virtual void write(std::ostream &os) const = 0; +}; + +struct OpSeq : public MDContent +{ + std::string backend; + uint64_t graph_latency; + + struct OpSeqCmp + { + bool operator()(const OpSeq &lhs, const OpSeq &rhs) const + { + return lhs.begin_ts < rhs.begin_ts; + } + bool operator()(const OpSeq &lhs, const OpSeq &rhs) { return lhs.begin_ts < rhs.begin_ts; } + bool operator()(OpSeq &lhs, OpSeq &rhs) { return lhs.begin_ts < rhs.begin_ts; } + }; + + void write(std::ostream &os) const override + { + uint64_t opseq_latency = end_ts - begin_ts; + double opseq_per = static_cast<double>(opseq_latency) / graph_latency * 100.0; + writeMDTableRow(os, {name, backend, std::to_string(opseq_latency), std::to_string(opseq_per), + std::to_string(min_rss), std::to_string(max_rss), + std::to_string(min_page_reclaims), std::to_string(max_page_reclaims)}); + } +}; + +struct Graph : public MDContent +{ + std::set<OpSeq, OpSeq::OpSeqCmp> opseqs; + + void setOpSeqs(const std::map<std::string, OpSeq> &name_to_opseq) + { + uint64_t graph_latency = end_ts - begin_ts; + for (auto it : name_to_opseq) + { + auto opseq = it.second; + opseq.graph_latency = graph_latency; + + opseqs.insert(opseq); + + updateRss(opseq.min_rss); + updateRss(opseq.max_rss); + updateMinflt(opseq.min_page_reclaims); + updateMinflt(opseq.max_page_reclaims); + } + } + + void write(std::ostream &os) const override + { + static std::vector<std::string> graph_headers{"latency(us)", "rss_min(kb)", "rss_max(kb)", + "page_reclaims_min", "page_reclaims_max"}; + + static std::vector<std::string> graph_headers_line{"-----------", "-------", "-------", + "-----------------", "-----------------"}; + + // Graph's Header + writeMDTableRow(os, graph_headers); + writeMDTableRow(os, graph_headers_line); + + // Graph's contents + writeMDTableRow(os, {std::to_string(end_ts - begin_ts), std::to_string(min_rss), + std::to_string(max_rss), std::to_string(min_page_reclaims), + std::to_string(max_page_reclaims)}); + + os << "\n"; + + static std::vector<std::string> opseq_headers{ + "OpSeq name", "backend", "latency(us)", "latency(%)", + "rss_min(kb)", "rss_max(kb)", "page_reclaims_min", "page_reclaims_max"}; + + static std::vector<std::string> opseq_headers_line{ + "----------", "-------", "-----------", "-----------", + "-------", "-------", "-----------------", "-----------------"}; + + os << "## OpSequences \n"; + + // OpSeq's Header + writeMDTableRow(os, opseq_headers); + writeMDTableRow(os, opseq_headers_line); + + // OpSeq's contents + for (auto opseq : opseqs) + { + opseq.write(os); + } + + os << "\n"; + } +}; + +struct MDTableBuilder +{ + MDTableBuilder(const std::vector<DurationEvent> &duration_events, + const std::vector<CounterEvent> &counter_events) + : _duration_events(duration_events), _counter_events(counter_events) + { +// when ready with low overhead in release build +#ifdef DEBUG + for (const auto &evt : _counter_events) + { + uint64_t ts = std::stoull(evt.ts); + auto &name = evt.name; + assert(name.compare("maxrss") == 0 || name.compare("minflt") == 0); + assert(evt.values.size() == 1); + auto &val = evt.values.begin()->second; + if (_ts_to_values.find(ts) == _ts_to_values.end()) + { + std::pair<uint32_t, uint32_t> values; + if (name.compare("maxrss") == 0) + values.first = std::stoul(val); + else + values.second = std::stoul(val); + 
_ts_to_values.insert({ts, values}); + } + else + { + auto &values = _ts_to_values.at(ts); + if (name.compare("maxrss") == 0) + values.first = std::stoul(val); + else + values.second = std::stoul(val); + } + } +#endif + } + + MDTableBuilder &build() + { + for (auto &it : divideGraph()) + { + size_t begin_idx = it.first; + size_t end_idx = it.second; + std::map<std::string, OpSeq> name_to_opseq; + for (size_t i = begin_idx + 1; i < end_idx; ++i) + { + const auto &evt = _duration_events[i]; + assert(evt.name.compare("Graph") != 0); + assert(evt.ph.compare("B") == 0 || evt.ph.compare("E") == 0); + if (evt.ph.compare("B") == 0) + { + assert(name_to_opseq.find(evt.name) == name_to_opseq.end()); + name_to_opseq.insert({evt.name, makeOpSeq(evt)}); + } + else + { + assert(name_to_opseq.find(evt.name) != name_to_opseq.end()); + auto &opseq = name_to_opseq.at(evt.name); + updateOpSeq(opseq, evt); + } + } + + _graphs.emplace_back(makeGraph(begin_idx, end_idx, name_to_opseq)); + } + + return *this; + } + + std::vector<std::pair<size_t, size_t>> divideGraph() + { + std::vector<std::pair<size_t, size_t>> graph_idx_list; // pair<begin_idx, end_idx> + for (size_t i = 0, begin_idx = 0; i < _duration_events.size(); ++i) + { + const auto &evt = _duration_events.at(i); + if (evt.name.compare("Graph") == 0) + { + if (evt.ph.compare("B") == 0) + begin_idx = i; + else + graph_idx_list.emplace_back(begin_idx, i); + } + } + return graph_idx_list; + } + + OpSeq makeOpSeq(const DurationEvent &evt) + { + OpSeq opseq; + opseq.name = evt.name; + opseq.begin_ts = std::stoull(evt.ts); + opseq.backend = evt.tid; +#ifdef DEBUG + opseq.updateRss(_ts_to_values.at(opseq.begin_ts).first); + opseq.updateMinflt(_ts_to_values.at(opseq.begin_ts).second); +#else + opseq.updateRss(0); + opseq.updateMinflt(0); +#endif + return opseq; + } + + void updateOpSeq(OpSeq &opseq, const DurationEvent &evt) + { + opseq.end_ts = std::stoull(evt.ts); +#ifdef DEBUG + opseq.updateRss(_ts_to_values.at(opseq.end_ts).first); + opseq.updateMinflt(_ts_to_values.at(opseq.end_ts).second); +#else + opseq.updateRss(0); + opseq.updateMinflt(0); +#endif + } + + Graph makeGraph(size_t begin_idx, size_t end_idx, + const std::map<std::string, OpSeq> &name_to_opseq) + { + Graph graph; + graph.name = "Graph"; + graph.begin_ts = std::stoull(_duration_events[begin_idx].ts); + graph.end_ts = std::stoull(_duration_events[end_idx].ts); + graph.setOpSeqs(name_to_opseq); +#ifdef DEBUG + graph.updateRss(_ts_to_values.at(graph.begin_ts).first); + graph.updateMinflt(_ts_to_values.at(graph.begin_ts).second); + graph.updateRss(_ts_to_values.at(graph.end_ts).first); + graph.updateMinflt(_ts_to_values.at(graph.end_ts).second); +#else + graph.updateRss(0); + graph.updateMinflt(0); +#endif + return graph; + } + + void write(std::ostream &os) + { + // Write contents + for (size_t i = 0; i < _graphs.size(); ++i) + { + os << "# Graph " << i << "\n"; + _graphs.at(i).write(os); + } + } + + const std::vector<DurationEvent> &_duration_events; + const std::vector<CounterEvent> &_counter_events; + // timestamp to std::pair<maxrss, minflt> + std::unordered_map<uint64_t, std::pair<uint32_t, uint32_t>> _ts_to_values; + std::vector<Graph> _graphs; +}; + +} // namespace + +EventWriter::EventWriter(const EventRecorder &recorder) : _recorder(recorder) +{ + // DO NOTHING +} + +void EventWriter::writeToFiles(const std::string &base_filepath) +{ + // Note. 
According to an internal issue, let snpe json as just file name not '.snpe.json' + writeToFile(base_filepath, WriteFormat::SNPE_BENCHMARK); + writeToFile(base_filepath + ".chrome.json", WriteFormat::CHROME_TRACING); + writeToFile(base_filepath + ".table.md", WriteFormat::MD_TABLE); +} + +void EventWriter::writeToFile(const std::string &filepath, WriteFormat write_format) +{ + std::ofstream os{filepath, std::ofstream::out}; + switch (write_format) + { + case WriteFormat::CHROME_TRACING: + writeChromeTrace(os); + break; + case WriteFormat::SNPE_BENCHMARK: + writeSNPEBenchmark(os); + break; + case WriteFormat::MD_TABLE: + writeMDTable(os); + break; + default: + assert(!"Invalid value"); + break; + } +} + +void EventWriter::writeSNPEBenchmark(std::ostream &os) +{ + Json::Value root; + auto &exec_data = root["Execution_Data"] = Json::Value{Json::objectValue}; + + struct Stat + { + uint64_t sum = 0; + uint64_t count = 0; + uint64_t max = 0; + uint64_t min = std::numeric_limits<uint64_t>::max(); + + void accumulate(uint64_t val) + { + sum += val; + count++; + max = std::max(max, val); + min = std::min(min, val); + } + }; + + // Memory + { + std::unordered_map<std::string, Stat> mem_stats; + for (auto &evt : _recorder.counter_events()) + { + auto &mem_stat = mem_stats[evt.name]; + uint64_t val = std::stoull(evt.values.at("value")); + mem_stat.accumulate(val); + } + + auto &mem = exec_data["memory"] = Json::Value{Json::objectValue}; + for (auto &kv : mem_stats) + { + auto &key = kv.first; + auto &val = kv.second; + mem[key]["Avg_Size"] = val.sum / val.count; + mem[key]["Max_Size"] = val.max; + mem[key]["Min_Size"] = val.min; + mem[key]["Runtime"] = "NA"; + } + } + + // Operation Execution Time + { + // NOTE This assumes _duration_events is sorted by "ts" ascending + + // 2D keys : stats[tid][name] + std::unordered_map<std::string, std::unordered_map<std::string, Stat>> stats; + std::unordered_map<std::string, std::unordered_map<std::string, uint64_t>> begin_timestamps; + for (auto &evt : _recorder.duration_events()) + { + auto &stat = stats[evt.tid][evt.name]; + auto &begin_ts = begin_timestamps[evt.tid][evt.name]; + uint64_t timestamp = std::stoull(evt.ts); + if (evt.ph == "B") + { + if (begin_ts != 0) + throw std::runtime_error{"Invalid Data"}; + begin_ts = timestamp; + } + else if (evt.ph == "E") + { + if (begin_ts == 0 || timestamp < begin_ts) + throw std::runtime_error{"Invalid Data"}; + stat.accumulate(timestamp - begin_ts); + begin_ts = 0; + } + else + throw std::runtime_error{"Invalid Data - invalid value for \"ph\" : \"" + evt.ph + "\""}; + } + + for (auto &kv : begin_timestamps) + for (auto &kv2 : kv.second) + if (kv2.second != 0) + throw std::runtime_error{"Invalid Data - B and E pair does not match."}; + + for (auto &kv : stats) + { + auto &tid = kv.first; + auto &map = kv.second; + auto &json_tid = exec_data[tid] = Json::Value{Json::objectValue}; + for (auto &kv : map) + { + auto &name = kv.first; + auto &val = kv.second; + json_tid[name]["Avg_Time"] = val.sum / val.count; + json_tid[name]["Max_Time"] = val.max; + json_tid[name]["Min_Time"] = val.min; + json_tid[name]["Runtime"] = tid; + } + } + } + + os << root; +} + +void EventWriter::writeChromeTrace(std::ostream &os) +{ + os << "{\n"; + os << " " << quote("traceEvents") << ": [\n"; + + for (auto &evt : _recorder.duration_events()) + { + os << " " << object(evt) << ",\n"; + } + + for (auto &evt : _recorder.counter_events()) + { + os << " " << object(evt) << ",\n"; + } + + os << " { }\n"; + os << " ]\n"; + os << "}\n"; +} + +void 
EventWriter::writeMDTable(std::ostream &os) +{ + MDTableBuilder(_recorder.duration_events(), _recorder.counter_events()).build().write(os); +} diff --git a/runtime/onert/core/src/util/EventWriter.h b/runtime/onert/core/src/util/EventWriter.h new file mode 100644 index 000000000..7e838ca82 --- /dev/null +++ b/runtime/onert/core/src/util/EventWriter.h @@ -0,0 +1,51 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __ONERT_UTIL_EVENT_WRITER_H__ +#define __ONERT_UTIL_EVENT_WRITER_H__ + +#include "EventRecorder.h" + +#include <string> +#include <ostream> + +class EventWriter +{ +public: + enum class WriteFormat + { + CHROME_TRACING, + SNPE_BENCHMARK, + MD_TABLE, + }; + +public: + EventWriter(const EventRecorder &recorder); + +public: + void writeToFiles(const std::string &base_filepath); + void writeToFile(const std::string &filepath, WriteFormat write_format); + +private: + void writeSNPEBenchmark(std::ostream &os); + void writeChromeTrace(std::ostream &os); + void writeMDTable(std::ostream &os); + +private: + const EventRecorder &_recorder; +}; + +#endif // __ONERT_UTIL_EVENT_WRITER_H__ diff --git a/runtime/onert/core/src/util/ShapeInference.cc b/runtime/onert/core/src/util/ShapeInference.cc index 95c15049d..0278df4d2 100644 --- a/runtime/onert/core/src/util/ShapeInference.cc +++ b/runtime/onert/core/src/util/ShapeInference.cc @@ -22,6 +22,7 @@ #include "util/logging.h" #include <cassert> +#include <numeric> #include <sstream> #include <cmath> @@ -72,6 +73,19 @@ ir::Shape broadcastShapes(const ir::Shape &lhs_shape, const ir::Shape &rhs_shape } // namespace +namespace bcq +{ +inline int getOutputSize(const ir::Shape &cluster_shape, const int32_t *cluster_buf) +{ + int size = 0; + for (int idx = 0; idx < cluster_shape.dim(0); idx++) + { + size += cluster_buf[idx * 2 + 1]; + } + return size; +} +} // namespace bcq + // // Shape inference // @@ -116,6 +130,11 @@ ir::Shape inferEltwiseShape(const ir::Shape &lhs_shape, const ir::Shape &rhs_sha ir::Shape inferArgMaxShape(const ir::Shape &input_shape, int axis, int rank) { + if (axis < 0 || axis >= rank) + { + throw std::runtime_error("ArgMax shape inference: Wrong axis value " + std::to_string(axis)); + } + ir::Shape out_shape; for (int idx = 0; idx < rank; ++idx) { @@ -259,19 +278,24 @@ ir::Shape inferBatchMatMulShape(const ir::Shape &lhs_shape, const ir::Shape &rhs return output_shape; } -ir::Shape inferBroadcastToShape(const ir::Shape wshape, const int32_t *shape_buffer) +/* + * shp_shape : SHAPE input tensor's shape + * shp_buf : SHAPE input tensor's buffer + */ +ir::Shape inferBroadcastToShape(const ir::Shape shp_shape, const int32_t *shp_buf) { - const int num_elements = wshape.num_elements(); + + const int num_elements = shp_shape.num_elements(); assert(num_elements != 0); - assert(shape_buffer); + assert(shp_buf); ir::Shape new_shape(num_elements); for (int i = 0; i < num_elements; ++i) { - assert(shape_buffer[i] != 0); // It shouldn't be 0. 
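// For the BCQ shape inference added to this file: the clusters tensor is a rank-2
// [num_clusters, 2] array whose second column holds each cluster's size (only that column is
// read here), and bcq::getOutputSize() above sums it. A small worked example with made-up
// cluster values (illustrative only, not part of the patch):
//
//   const int32_t cluster_buf[] = {0, 4, 1, 8};   // two clusters, sizes 4 and 8
//   // bcq::getOutputSize(cluster_shape, cluster_buf) == 4 + 8 == 12,
//   // so inferBCQFullyConnectedShape() below yields {12, in_shape.dim(1)} for the weights.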
- new_shape.dim(i) = shape_buffer[i]; + assert(shp_buf[i] != 0); // It shouldn't be 0. + new_shape.dim(i) = shp_buf[i]; } return new_shape; @@ -305,6 +329,9 @@ ir::Shape inferConcatShape(const Shapes &in_shapes, const ir::operation::Concat: ir::Shape inferConv2DShape(const ir::Shape &in_shape, const ir::Shape &ker_shape, const ir::operation::Conv2D::Param &param, ir::Layout layout) { + if (param.stride.horizontal == 0 || param.stride.vertical == 0) + throw std::runtime_error{"Conv2D: stride values must be positive"}; + auto ifm_shape = in_shape.asFeature(layout); // Kernel format is [depth_out, kernel_height, kernel_width, depth_in] @@ -321,6 +348,9 @@ ir::Shape inferDepthwiseConv2DShape(const ir::Shape &in_shape, const ir::Shape & const ir::operation::DepthwiseConv2D::Param &param, ir::Layout layout) { + if (param.stride.horizontal == 0 || param.stride.vertical == 0) + throw std::runtime_error{"DepthwiseConv2D: stride values must be positive"}; + assert(layout == ir::Layout::NHWC); auto ifm_shape = in_shape.asFeature(layout); @@ -354,13 +384,13 @@ ir::Shape inferExpandDimsShape(const ir::Shape &in_shape, int32_t axis) return out_shape; } -ir::Shape inferFillShape(const ir::Shape &in_shape, const int32_t *buffer) +ir::Shape inferFillShape(const ir::Shape &in_shape, const int32_t *in_buf) { ir::Shape out_shape(in_shape.dim(0)); for (int out_x = 0; out_x < out_shape.rank(); ++out_x) { - out_shape.dim(out_x) = buffer[out_x]; + out_shape.dim(out_x) = in_buf[out_x]; } return out_shape; @@ -380,11 +410,60 @@ ir::Shape inferFullyConnectedShape(const ir::Shape &in_shape, const ir::Shape &k return {ir::Shape({static_cast<int32_t>(batch_size), num_units})}; } +ir::Shape inferBCQFullyConnectedShape(const ir::Shape &in_shape, const ir::Shape &cluster_shape, + const int32_t *cluster_buf) +{ + assert(cluster_shape.rank() == 2); + assert(cluster_shape.dim(1) == 2); + + const auto input_size = in_shape.dim(1); + const auto output_size = bcq::getOutputSize(cluster_shape, cluster_buf); + + return {ir::Shape({output_size, input_size})}; +} + +ir::Shape inferBCQGatherShape(const ir::Shape &indices_shape, const ir::Shape &cluster_shape, + const int32_t *cluster_buf, int rank, + const ir::operation::BCQGather::Param &param) +{ + ir::Shape out_shape; + ir::Shape in_original_shape; + + assert(cluster_shape.rank() == 2); + assert(cluster_shape.dim(1) == 2); + + auto hidden_size = param.input_hidden_size; + auto axis = param.axis; + + in_original_shape.append(bcq::getOutputSize(cluster_shape, cluster_buf)); + in_original_shape.append(hidden_size); + + const int indices_rank = indices_shape.rank(); + for (int idx = 0; idx < rank; ++idx) + { + if (idx == (int)axis) + { + for (int indices_idx = 0; indices_idx < indices_rank; indices_idx++) + { + out_shape.append(indices_shape.dim(indices_idx)); + } + } + else + { + out_shape.append(in_original_shape.dim(idx)); + } + } + + return out_shape; +} + ir::Shape inferGatherShape(const ir::Shape &input_shape, const ir::Shape &indices_shape, int axis, int rank) { ir::Shape out_shape; + const int indices_rank = indices_shape.rank(); + for (int idx = 0; idx < rank; ++idx) { if (idx == axis) @@ -470,6 +549,9 @@ ir::Shape inferPadShape(const ir::Shape &in_shape, const int32_t *pad_buf, const ir::Shape inferPoolShape(const ir::Shape &in_shape, const ir::operation::Pool2D::Param &param, const ir::Layout layout) { + if (param.stride.horizontal == 0 || param.stride.vertical == 0) + throw std::runtime_error{"Pool2D: stride values must be positive"}; + assert(layout == ir::Layout::NHWC); auto ifm_shape
= in_shape.asFeature(layout); const auto out_h_w = calcConvLikeHeightAndWidth(ifm_shape.H, ifm_shape.W, param.kh, param.kw, @@ -482,6 +564,17 @@ ir::Shape inferResizeBilinearShape(const ir::Shape &in_shape, const int32_t outp const int32_t output_width) { assert(in_shape.rank() == 4); + if (output_height < 0) + { + throw std::runtime_error{"ResizeBilinear: size value must be positive value, output_height = " + + std::to_string(output_height)}; + } + if (output_width < 0) + { + throw std::runtime_error{"ResizeBilinear: size value must be positive value, output_width = " + + std::to_string(output_width)}; + } + ir::Shape ret(in_shape.rank()); ret.dim(0) = in_shape.dim(0); @@ -613,7 +706,8 @@ ir::Shape inferSelectShape(const ir::Shape &input_cond_shape, const ir::Shape &i return new_shape; } -ir::Shape inferSliceShape(const ir::Shape &input_shape, const int32_t *begins, const int32_t *sizes) +ir::Shape inferSliceShape(const ir::Shape &input_shape, const int32_t *begins_buf, + const int32_t *sizes_buf) { const uint32_t rank = input_shape.rank(); ir::Shape out_shape(rank); @@ -623,12 +717,12 @@ ir::Shape inferSliceShape(const ir::Shape &input_shape, const int32_t *begins, c const auto input_dim = input_shape.dim(idx); // begin is zero-based - auto begin = begins[idx]; + auto begin = begins_buf[idx]; if (begin < 0) throw std::runtime_error("shape inference Slice: Invalid begin."); // size is one-based - auto size = sizes[idx]; + auto size = sizes_buf[idx]; if (size < -1) throw std::runtime_error("shape inference Slice: Invalid size."); @@ -648,8 +742,8 @@ ir::Shape inferSliceShape(const ir::Shape &input_shape, const int32_t *begins, c } ir::Shape inferSpaceToBatchNDShape(const ir::Shape &input_shape, const ir::Shape &block_shape_shape, - const ir::Shape &padding_shape, const int32_t *block_shape_data, - const int32_t *padding_data) + const ir::Shape &padding_shape, const int32_t *block_shape_buf, + const int32_t *padding_buf) { const uint32_t rank = input_shape.rank(); ir::Shape out_shape(rank); @@ -677,14 +771,14 @@ ir::Shape inferSpaceToBatchNDShape(const ir::Shape &input_shape, const ir::Shape for (int dim = 0; dim < kSpatialDimensionNum; ++dim) { int final_dim_size = - (input_shape.dim(dim + 1) + padding_data[dim * 2] + padding_data[dim * 2 + 1]); + (input_shape.dim(dim + 1) + padding_buf[dim * 2] + padding_buf[dim * 2 + 1]); - assert(final_dim_size % block_shape_data[dim] == 0); + assert(final_dim_size % block_shape_buf[dim] == 0); - out_shape.dim(dim + 1) = final_dim_size / block_shape_data[dim]; + out_shape.dim(dim + 1) = final_dim_size / block_shape_buf[dim]; } - const int output_batch_size = input_shape.dim(0) * block_shape_data[0] * block_shape_data[1]; + const int output_batch_size = input_shape.dim(0) * block_shape_buf[0] * block_shape_buf[1]; const int output_channel_size = input_shape.dim(3); out_shape.dim(0) = output_batch_size; @@ -948,35 +1042,71 @@ ir::Shape inferStridedSliceShape(const ir::Shape &input_shape, const StridedSlic return out_shape; } -ir::Shape inferTileShape(const ir::Shape &in_shape, const int32_t *multiplier) +ir::Shape inferTileShape(const ir::Shape &in_shape, const int32_t *multiplier_buf, + const int32_t multiplier_size) { - // assert(in_shape.rank() == multiplier.rank()); + if (multiplier_size != in_shape.rank()) + { + throw std::runtime_error("inferTileShape failed, input rank: " + + std::to_string(in_shape.rank()) + ", bad multipliers size: " + + std::to_string(multiplier_size) + ""); + } ir::Shape new_Shape(in_shape.rank()); for (int i = 0; i < 
in_shape.rank(); ++i) { - assert(multiplier[i]); // multiplier[i] shuld not be 0. - new_Shape.dim(i) = in_shape.dim(i) * multiplier[i]; + assert(multiplier_buf[i]); // multiplier_buf[i] shuld not be 0. + new_Shape.dim(i) = in_shape.dim(i) * multiplier_buf[i]; } return new_Shape; } -ir::Shape inferTransposeShape(const ir::Shape &in_shape, const std::vector<int> &perm) +ir::Shape inferTransposeShape(const ir::Shape &in_shape, const int32_t *perm_buf, + const int32_t perm_size) { - if (static_cast<int>(perm.size()) > in_shape.rank()) + const auto rank = in_shape.rank(); + if (perm_size > rank) + { + throw std::runtime_error("inferTransposeShape failed, bad permutation size: " + + std::to_string(perm_size)); + } + + const int32_t *perm_data = perm_buf; + std::vector<int32_t> regular_perm_vec; + if (perm_size == 0) + { + // perm_data will be set to (n-1...0) + regular_perm_vec.resize(rank); + std::iota(regular_perm_vec.begin(), regular_perm_vec.end(), 0); + std::reverse(regular_perm_vec.begin(), regular_perm_vec.end()); + perm_data = regular_perm_vec.data(); + } + else { - throw std::runtime_error("inferTransposeShape failed, bad rank size: " + - std::to_string(static_cast<int>(perm.size()))); + assert(rank == perm_size); } - ir::Shape out_shape(static_cast<int>(perm.size())); - for (int idx = 0; idx < static_cast<int>(perm.size()); idx++) + + ir::Shape out_shape(rank); + std::vector<bool> visit_perms(rank, false); + for (int idx = 0; idx < rank; idx++) { - if (perm[idx] < 0 || perm[idx] >= static_cast<int>(perm.size())) + const auto perm_val = perm_data[idx]; + // Check invalid permutation value + if (perm_val < 0 || perm_val >= rank) { - throw std::runtime_error("inferTransposeShape failed, bad perm value: " + - std::to_string(perm[idx])); + throw std::runtime_error("inferTransposeShape failed, bad permutation value: " + + std::to_string(perm_val)); } - out_shape.dim(idx) = in_shape.dim(perm[idx]); + + // Check duplicated permutation value + if (visit_perms.at(perm_val)) + { + throw std::runtime_error("inferTransposeShape failed, duplicated permutation value: " + + std::to_string(perm_val)); + } + visit_perms.at(perm_val) = true; + + out_shape.dim(idx) = in_shape.dim(perm_val); } return out_shape; } diff --git a/runtime/onert/frontend/base_loader/include/base_loader.h b/runtime/onert/frontend/base_loader/include/base_loader.h index 480452e01..d21001e59 100644 --- a/runtime/onert/frontend/base_loader/include/base_loader.h +++ b/runtime/onert/frontend/base_loader/include/base_loader.h @@ -1,4 +1,5 @@ /* + * Copyright 2017 The TensorFlow Authors. All Rights Reserved. * Copyright (c) 2019 Samsung Electronics Co., Ltd. 
All Rights Reserved * * Licensed under the Apache License, Version 2.0 (the "License"); @@ -38,7 +39,7 @@ namespace onert namespace base_loader { -template <typename LoaderDomain, typename SpecificLoader> class BaseLoader +template <typename LoaderDomain> class BaseLoader { protected: using Verifier = typename LoaderDomain::Verifier; @@ -69,6 +70,7 @@ public: explicit BaseLoader(std::unique_ptr<ir::Subgraphs> &subgs) : _base{nullptr}, _pagesize(getpagesize()), _fd(-1), _subgraphs(subgs), _model{nullptr} { + _use_mmaped_data = util::getConfigBool(util::config::USE_MMAPED_DATA); } /** @@ -93,7 +95,6 @@ protected: ir::Activation convertActivation(ActivationFunctionType type); ir::DataType tensorTypeToDataType(TensorType type); ir::OperandIndex tensorIdxToOperandIdx(int32_t tensorIdx); - void deallocateMmappedArea(uint8_t *ptr, size_t size); // Create operands form tflite::Tensor ir::OperandIndex loadOperand(const Tensor *tensor, ir::Graph &subg); @@ -107,7 +108,11 @@ protected: // Load Pool2D param template <typename Param> void loadPool2DOptions(Param &param, const Pool2DOptions *options); +private: + virtual std::unique_ptr<ir::Graph> loadSubgraph(const SubGraph *subg) = 0; // Operations + template <typename OpIR, typename... Args> + const OpIR *loadOperationTo(const Operator *op, ir::Graph &subg, Args &&... args); void loadConv2D(const Operator *op, ir::Graph &subg); void loadDepthwiseConv2D(const Operator *op, ir::Graph &subg); void loadTransposeConv(const Operator *op, ir::Graph &subg); @@ -115,62 +120,50 @@ protected: void loadReshape(const Operator *op, ir::Graph &subg); void loadSoftmax(const Operator *op, ir::Graph &subg); void loadConcatenation(const Operator *op, ir::Graph &subg); - void loadFill(const Operator *op, ir::Graph &subg); void loadFC(const Operator *op, ir::Graph &subg); - template <ir::operation::BinaryArithmetic::ArithmeticType op_type> - void loadBinaryArithmetic(const Operator *op, ir::Graph &subg); + void loadBinaryArithmetic(const Operator *op, ir::Graph &subg, + ir::operation::BinaryArithmetic::ArithmeticType op_type); void loadAddV2(const Operator *op, ir::Graph &subg); void loadPack(const Operator *op, ir::Graph &subg); void loadResizeBilinear(const Operator *op, ir::Graph &subg); void loadResizeNearestNeighbor(const Operator *op, ir::Graph &subg); - void loadSelect(const Operator *op, ir::Graph &subg); - void loadSquaredDifference(const Operator *op, ir::Graph &subg); - void loadTranspose(const Operator *op, ir::Graph &subg); - template <ir::operation::Reduce::ReduceType reduce_type> - void loadReduce(const Operator *op, ir::Graph &subg); + void loadReduce(const Operator *op, ir::Graph &subg, + ir::operation::Reduce::ReduceType reduce_type); void loadReduceAll(const Operator *op, ir::Graph &subg); - void loadReverseV2(const Operator *op, ir::Graph &subg); - void loadPad(const Operator *op, ir::Graph &subg); void loadElementwiseActivation(const Operator *op, ir::Graph &subg, ir::operation::ElementwiseActivation::Type op_type, float alpha = 0.f, float beta = 0.f); - template <ir::operation::ElementwiseBinary::ElementwiseBinaryType op_type> - void loadElementwiseBinary(const Operator *op, ir::Graph &subg); + void loadElementwiseBinary(const Operator *op, ir::Graph &subg, + ir::operation::ElementwiseBinary::ElementwiseBinaryType op_type); void loadElementwiseUnary(const Operator *op, ir::Graph &subg, ir::operation::ElementwiseUnary::Type op_type); - void loadExpandDims(const Operator *op, ir::Graph &subg); void loadGather(const Operator *op, ir::Graph &subg); void
loadCustom(const Operator *op, ir::Graph &subg); - void loadSpaceToBatchND(const Operator *op, ir::Graph &subg); void loadBatchMatMul(const Operator *op, ir::Graph &subg); - void loadBatchToSpaceND(const Operator *op, ir::Graph &subg); void loadSqueeze(const Operator *op, ir::Graph &subg); - void loadPrelu(const Operator *op, ir::Graph &subg); void loadSplit(const Operator *op, ir::Graph &subg); void loadSplitV(const Operator *op, ir::Graph &subg); - void loadSlice(const Operator *op, ir::Graph &subg); void loadStridedSlice(const Operator *op, ir::Graph &subg); void loadUnpack(const Operator *op, ir::Graph &subg); void loadComparison(const Operator *op, ir::Graph &subg); void loadEinsum(const Operator *op, ir::Graph &subg); void loadOneHot(const Operator *op, ir::Graph &subg); - void loadShape(const Operator *op, ir::Graph &subg); void loadIf(const Operator *op, ir::Graph &subg); void loadWhile(const Operator *op, ir::Graph &subg); void loadArgMax(const Operator *op, ir::Graph &subg); - void loadPow(const Operator *op, ir::Graph &subg); - void loadTile(const Operator *op, ir::Graph &subg); - void loadRange(const Operator *op, ir::Graph &subg); - void loadRank(const Operator *op, ir::Graph &subg); - void loadMatrixBandPart(const Operator *op, ir::Graph &subg); - void loadBroadcastTo(const Operator *op, ir::Graph &subg); void loadFusedBatchNorm(const Operator *op, ir::Graph &subg); void loadLogSoftmax(const Operator *op, ir::Graph &subg); void loadSpaceToDepth(const Operator *op, ir::Graph &subg); - void loadStatelessRandomUniform(const Operator *op, ir::Graph &subg); - void loadL2Normalization(const Operator *op, ir::Graph &subg); void loadLeakyRelu(const Operator *op, ir::Graph &subg); + void verifySubgraphIndex(int subg_index) + { + const auto num_subgraphs = _model->subgraphs()->size(); + if (subg_index < 0 || subg_index >= static_cast<int32_t>(num_subgraphs)) + throw std::runtime_error{std::string{"Invalid subgraph index - "} + + std::to_string(subg_index)}; + } + protected: // Base address for mapped region for loading (if needed) uint8_t *_base; @@ -186,10 +179,12 @@ protected: std::unordered_map<ir::OperandIndex, std::string> _tensor_names; // Verifier std::unique_ptr<Verifier> _verifier; + // Boolean flag to use MMAPED_DATA + bool _use_mmaped_data = false; }; -template <typename LoaderDomain, typename SpecificLoader> -void BaseLoader<LoaderDomain, SpecificLoader>::BaseLoader::loadFromFile(const char *file_path) +template <typename LoaderDomain> +void BaseLoader<LoaderDomain>::BaseLoader::loadFromFile(const char *file_path) { _fd = open(file_path, O_RDONLY); if (_fd < 0) @@ -216,22 +211,22 @@ void BaseLoader<LoaderDomain, SpecificLoader>::BaseLoader::loadFromFile(const ch _verifier = std::make_unique<Verifier>(reinterpret_cast<const std::uint8_t *>(_base), size); loadModel(); + munmap(_base, size); close(_fd); } -template <typename LoaderDomain, typename SpecificLoader> -void BaseLoader<LoaderDomain, SpecificLoader>::BaseLoader::loadFromBuffer(uint8_t *buffer, - size_t size) +template <typename LoaderDomain> +void BaseLoader<LoaderDomain>::BaseLoader::loadFromBuffer(uint8_t *buffer, size_t size) { _base = buffer; _verifier = std::make_unique<Verifier>(reinterpret_cast<const std::uint8_t *>(_base), size); loadModel(); } -template <typename LoaderDomain, typename SpecificLoader> -ir::Activation BaseLoader<LoaderDomain, SpecificLoader>::BaseLoader::convertActivation( - const ActivationFunctionType type) +template <typename LoaderDomain> +ir::Activation 
+BaseLoader<LoaderDomain>::BaseLoader::convertActivation(const ActivationFunctionType type) { switch (type) { @@ -246,14 +241,13 @@ ir::Activation BaseLoader<LoaderDomain, SpecificLoader>::BaseLoader::convertActi case ActivationFunctionType::ActivationFunctionType_TANH: return ir::Activation::TANH; default: - throw std::runtime_error(std::string("Unsupported activation type: ") - .append(EnumNameActivationFunctionType(type))); + throw std::runtime_error(std::string("Unsupported or invalid activation type: ") + + std::to_string(static_cast<int>(type))); } } -template <typename LoaderDomain, typename SpecificLoader> -ir::DataType -BaseLoader<LoaderDomain, SpecificLoader>::BaseLoader::tensorTypeToDataType(const TensorType type) +template <typename LoaderDomain> +ir::DataType BaseLoader<LoaderDomain>::BaseLoader::tensorTypeToDataType(const TensorType type) { switch (type) { @@ -275,39 +269,13 @@ BaseLoader<LoaderDomain, SpecificLoader>::BaseLoader::tensorTypeToDataType(const } } -template <typename LoaderDomain, typename SpecificLoader> -ir::OperandIndex -BaseLoader<LoaderDomain, SpecificLoader>::BaseLoader::tensorIdxToOperandIdx(int32_t tensorIdx) +template <typename LoaderDomain> +ir::OperandIndex BaseLoader<LoaderDomain>::BaseLoader::tensorIdxToOperandIdx(int32_t tensorIdx) { return isOptionalInputTensor(tensorIdx) ? ir::OperandIndex() : _tensor_to_operand[tensorIdx]; } -template <typename LoaderDomain, typename SpecificLoader> -void BaseLoader<LoaderDomain, SpecificLoader>::BaseLoader::deallocateMmappedArea(uint8_t *ptr, - size_t size) -{ - // Calculate offset from base address of mapped region - ptrdiff_t unaligned_offset_start = ptr - _base; - ptrdiff_t unaligned_offset_end = unaligned_offset_start + size; - - // Calculated aligned offset from base address of mapped region - // munmap accepts memory address which is a multiple of the pagesize - ptrdiff_t aligned_offset_start = - ((unaligned_offset_start + (_pagesize - 1)) / _pagesize) * _pagesize; - ptrdiff_t aligned_offset_end = (unaligned_offset_end / _pagesize) * _pagesize; - - ptrdiff_t area_size = aligned_offset_end - aligned_offset_start; - if (area_size > 0) - { - // Unmap mapped region for CachedData - if (munmap(_base + aligned_offset_start, area_size) == -1) - { - VERBOSE(BASE_LOADER) << "munmap failed" << std::endl; - } - } -} - -/* Copied from tensorflow lite. 
Need to append copyright */ +/* Copy is copied from tensorflow lite */ template <typename T> bool Copy(const T *data_ptr, std::vector<uint16_t> &arr) { if (data_ptr->values() == nullptr) @@ -324,9 +292,8 @@ template <typename T> bool Copy(const T *data_ptr, std::vector<uint16_t> &arr) return true; } -template <typename LoaderDomain, typename SpecificLoader> -ir::OperandIndex BaseLoader<LoaderDomain, SpecificLoader>::loadOperand(const Tensor *tensor, - ir::Graph &subg) +template <typename LoaderDomain> +ir::OperandIndex BaseLoader<LoaderDomain>::loadOperand(const Tensor *tensor, ir::Graph &subg) { ir::Shape shape; // Shape @@ -386,18 +353,44 @@ ir::OperandIndex BaseLoader<LoaderDomain, SpecificLoader>::loadOperand(const Ten { std::vector<uint16_t> w1_segments; std::vector<uint16_t> w1_indices; - // ignore traversal_order, block_map + // check traversal_order + if (src_sparsity->traversal_order()) + { + const int traversal_order_size = src_sparsity->traversal_order()->size(); + for (int i = 0; i < traversal_order_size; ++i) + { + if (i != src_sparsity->traversal_order()->Get(i)) + throw std::runtime_error("traversal_order [0, 1, ..., n-1] is only supported."); + } + } + // check block_map + int block_rank = 0; + if (src_sparsity->block_map()) + { + block_rank = src_sparsity->block_map()->size(); + for (int i = 0; i < block_rank; ++i) + { + if (i != src_sparsity->block_map()->Get(i)) + throw std::runtime_error("block_map [0, 1, ..., n-1] is only supported."); + } + } // load metadata - const size_t dim_metadata_size = src_sparsity->dim_metadata()->size(); - if (dim_metadata_size != 2) - throw std::runtime_error("sparse tensor is supported only for 2D"); + const int dim_metadata_size = src_sparsity->dim_metadata()->size(); + auto dense_rank = shape.rank(); + if (dense_rank + block_rank != dim_metadata_size) + throw std::runtime_error("sparsity dim_metadata length is wrong."); + bool random_sparsity = dim_metadata_size == 2 && block_rank == 0; + bool block2D_sparsity = dim_metadata_size == 4 && block_rank == 2; + if (dim_metadata_size != !random_sparsity && !block2D_sparsity) + throw std::runtime_error( + "sparsity is supported only for 2D tensor with random or 16x1 block sparsity."); + const auto *src_metadata = src_sparsity->dim_metadata()->Get(0); if (src_metadata->format() != DimensionType::DimensionType_DENSE) throw std::runtime_error("sparse tensor dim[0] is not DENSE"); src_metadata = src_sparsity->dim_metadata()->Get(1); if (src_metadata->format() != DimensionType::DimensionType_SPARSE_CSR) throw std::runtime_error("sparse tensor dim[0] is not SPARSE_CSR"); - auto ParseSparseIndexVector = [src_metadata, &w1_segments, &w1_indices]() { if (src_metadata->array_segments() == nullptr || src_metadata->array_indices() == nullptr) return false; @@ -433,7 +426,17 @@ ir::OperandIndex BaseLoader<LoaderDomain, SpecificLoader>::loadOperand(const Ten }; if (ParseSparseIndexVector() == false) throw std::runtime_error("Error during parsing sparsity index information"); - type_info.sparse2DMetadata(std::move(w1_segments), std::move(w1_indices)); + // Get block size + std::vector<int32_t> block_size; + for (int i = 0; i < block_rank; ++i) + { + auto block_metadata = src_sparsity->dim_metadata()->Get(dense_rank + i); + if (block_metadata->format() != DimensionType::DimensionType_DENSE) + throw std::runtime_error("block dimension must be DENSE."); + block_size.push_back(block_metadata->dense_size()); + } + type_info.sparsity(std::make_shared<ir::Sparsity>(std::move(w1_segments), std::move(w1_indices), + 
std::move(block_size))); } // Create operand const auto operand_index = subg.addOperand(shape, type_info); @@ -450,8 +453,28 @@ ir::OperandIndex BaseLoader<LoaderDomain, SpecificLoader>::loadOperand(const Ten } else // Model is loaded(mmap'd) from a file { - data_obj = std::make_unique<ir::CachedData>(data->data(), data->size()); - deallocateMmappedArea(const_cast<uint8_t *>(data->data()), data->size()); + size_t data_size = data->size(); + ptrdiff_t unaligned_offset_start = data->data() - _base; + ptrdiff_t offset_end = unaligned_offset_start + data_size; + + // Calculated aligned offset from base address of mapped region + // munmap accepts memory address which is a multiple of the pagesize + ptrdiff_t aligned_offset_start = (unaligned_offset_start / _pagesize) * _pagesize; + size_t mmap_size = offset_end - aligned_offset_start; + + if (_use_mmaped_data) + { + data_obj = std::make_unique<ir::MMapedData>(_fd, aligned_offset_start, mmap_size, + unaligned_offset_start, data_size); + } + else + { + size_t offset = unaligned_offset_start - aligned_offset_start; + uint8_t *mmap_base = static_cast<uint8_t *>( + mmap(NULL, mmap_size, PROT_READ, MAP_PRIVATE, _fd, aligned_offset_start)); + data_obj = std::make_unique<ir::CachedData>(mmap_base + offset, data_size); + munmap(mmap_base, mmap_size); + } } subg.setOperandValue(operand_index, std::move(data_obj)); } @@ -465,10 +488,9 @@ ir::OperandIndex BaseLoader<LoaderDomain, SpecificLoader>::loadOperand(const Ten return operand_index; } -template <typename LoaderDomain, typename SpecificLoader> -void BaseLoader<LoaderDomain, SpecificLoader>::loadOperationIO(const Operator *op, - ir::OperandIndexSequence &inputs, - ir::OperandIndexSequence &outputs) +template <typename LoaderDomain> +void BaseLoader<LoaderDomain>::loadOperationIO(const Operator *op, ir::OperandIndexSequence &inputs, + ir::OperandIndexSequence &outputs) { for (const std::int32_t idx : *op->inputs()) { @@ -490,120 +512,116 @@ void BaseLoader<LoaderDomain, SpecificLoader>::loadOperationIO(const Operator *o } } -template <typename LoaderDomain, typename SpecificLoader> +template <typename LoaderDomain> template <typename Param, typename OptionsType> -void BaseLoader<LoaderDomain, SpecificLoader>::loadStridesAndPaddings(Param ¶m, - const OptionsType *options) +void BaseLoader<LoaderDomain>::loadStridesAndPaddings(Param ¶m, const OptionsType *options) { // Strides param.stride.vertical = options->stride_h(); param.stride.horizontal = options->stride_w(); // Paddings - if (options->padding() == Padding::Padding_SAME) - param.padding.type = ir::PaddingType::SAME; - if (options->padding() == Padding::Padding_VALID) - param.padding.type = ir::PaddingType::VALID; + switch (options->padding()) + { + case Padding::Padding_SAME: + param.padding.type = ir::PaddingType::SAME; + break; + case Padding::Padding_VALID: + param.padding.type = ir::PaddingType::VALID; + break; + default: + throw std::runtime_error{"Invalid padding type"}; + } // param paddings indexes unused } -template <typename LoaderDomain, typename SpecificLoader> +template <typename LoaderDomain> template <typename Param> -void BaseLoader<LoaderDomain, SpecificLoader>::loadPool2DOptions(Param ¶m, - const Pool2DOptions *options) +void BaseLoader<LoaderDomain>::loadPool2DOptions(Param ¶m, const Pool2DOptions *options) { // Strides and Paddings + if (options->stride_h() <= 0 || options->stride_w() <= 0) + throw std::runtime_error{"Invalid stride vertical or horizontal - both must be bigger than 0"}; loadStridesAndPaddings(param, options); 
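Note on the constant-tensor hunk above: instead of wrapping the flatbuffer bytes in CachedData and then punching the pages out of the file mapping with the old deallocateMmappedArea, the loader now either keeps an MMapedData view of the file (when the USE_MMAPED_DATA config flag read in the constructor is set) or creates a short-lived private mapping, copies it into CachedData, and unmaps it again. Both paths rely on the same page-alignment arithmetic, because mmap offsets must be multiples of the page size. A minimal sketch of that arithmetic; MappedRange and alignToPage are illustrative names, not part of the runtime:

#include <cstddef>
#include <sys/types.h>
#include <unistd.h>

// Given where a tensor's bytes start inside the mapped model file and how long
// they are, compute a page-aligned window that covers them.
struct MappedRange
{
  off_t aligned_start; // multiple of the page size, valid as an mmap() offset
  size_t map_size;     // bytes to map starting at aligned_start
  size_t data_offset;  // where the tensor data begins inside that mapping
};

inline MappedRange alignToPage(off_t unaligned_start, size_t data_size)
{
  const off_t pagesize = getpagesize();
  const off_t aligned_start = (unaligned_start / pagesize) * pagesize; // round down
  const off_t end = unaligned_start + static_cast<off_t>(data_size);
  return MappedRange{aligned_start, static_cast<size_t>(end - aligned_start),
                     static_cast<size_t>(unaligned_start - aligned_start)};
}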
// Filter width and height // Strides + if (options->filter_width() <= 0 || options->filter_height() <= 0) + throw std::runtime_error{"Invalid filter width or height - both must be bigger than 0"}; param.kw = options->filter_width(); param.kh = options->filter_height(); // Activation param.activation = convertActivation(options->fused_activation_function()); } -template <typename LoaderDomain, typename SpecificLoader> -void BaseLoader<LoaderDomain, SpecificLoader>::loadConv2D(const Operator *op, ir::Graph &subg) +template <typename LoaderDomain> +template <typename OpIR, typename... Args> +const OpIR *BaseLoader<LoaderDomain>::loadOperationTo(const Operator *op, ir::Graph &subg, + Args &&... args) { + static_assert(sizeof...(args) <= 1, "You can't have more than 1 arguments!"); ir::OperandIndexSequence inputs; ir::OperandIndexSequence outputs; loadOperationIO(op, inputs, outputs); + std::unique_ptr<OpIR> new_op(new OpIR(inputs, outputs, std::forward<Args>(args)...)); + auto ret = new_op.get(); + subg.addOperation(std::move(new_op)); + + return ret; +} + +template <typename LoaderDomain> +void BaseLoader<LoaderDomain>::loadConv2D(const Operator *op, ir::Graph &subg) +{ ir::operation::Conv2D::Param param; const auto *options = op->builtin_options_as_Conv2DOptions(); param.activation = convertActivation(options->fused_activation_function()); loadStridesAndPaddings(param, options); - param.dilation.width_factor = options->dilation_w_factor(); param.dilation.height_factor = options->dilation_h_factor(); - std::unique_ptr<ir::Operation> new_op(new ir::operation::Conv2D(inputs, outputs, param)); - subg.addOperation(std::move(new_op)); + loadOperationTo<ir::operation::Conv2D>(op, subg, param); } -template <typename LoaderDomain, typename SpecificLoader> -void BaseLoader<LoaderDomain, SpecificLoader>::loadDepthwiseConv2D(const Operator *op, - ir::Graph &subg) +template <typename LoaderDomain> +void BaseLoader<LoaderDomain>::loadDepthwiseConv2D(const Operator *op, ir::Graph &subg) { - ir::OperandIndexSequence inputs; - ir::OperandIndexSequence outputs; - - loadOperationIO(op, inputs, outputs); - ir::operation::DepthwiseConv2D::Param param; const auto *options = op->builtin_options_as_DepthwiseConv2DOptions(); param.activation = convertActivation(options->fused_activation_function()); loadStridesAndPaddings(param, options); - // Multiplier param.multiplier = options->depth_multiplier(); // Dilation h/w factor unused - std::unique_ptr<ir::Operation> new_op(new ir::operation::DepthwiseConv2D(inputs, outputs, param)); - subg.addOperation(std::move(new_op)); + + loadOperationTo<ir::operation::DepthwiseConv2D>(op, subg, param); } -template <typename LoaderDomain, typename SpecificLoader> -void BaseLoader<LoaderDomain, SpecificLoader>::loadTransposeConv(const Operator *op, - ir::Graph &subg) +template <typename LoaderDomain> +void BaseLoader<LoaderDomain>::loadTransposeConv(const Operator *op, ir::Graph &subg) { - ir::OperandIndexSequence inputs; - ir::OperandIndexSequence outputs; - - loadOperationIO(op, inputs, outputs); - ir::operation::TransposeConv::Param param; const auto *options = op->builtin_options_as_TransposeConvOptions(); loadStridesAndPaddings(param, options); - std::unique_ptr<ir::Operation> new_op(new ir::operation::TransposeConv(inputs, outputs, param)); - subg.addOperation(std::move(new_op)); + + loadOperationTo<ir::operation::TransposeConv>(op, subg, param); } -template <typename LoaderDomain, typename SpecificLoader> -void BaseLoader<LoaderDomain, SpecificLoader>::loadPool2D(const 
Operator *op, ir::Graph &subg, - ir::operation::Pool2D::PoolType op_type) +template <typename LoaderDomain> +void BaseLoader<LoaderDomain>::loadPool2D(const Operator *op, ir::Graph &subg, + ir::operation::Pool2D::PoolType op_type) { - ir::OperandIndexSequence inputs; - ir::OperandIndexSequence outputs; - - loadOperationIO(op, inputs, outputs); - ir::operation::Pool2D::Param param; param.op_type = op_type; const auto *options = op->builtin_options_as_Pool2DOptions(); loadPool2DOptions(param, options); - std::unique_ptr<ir::Operation> new_op(new ir::operation::Pool2D(inputs, outputs, param)); - subg.addOperation(std::move(new_op)); + loadOperationTo<ir::operation::Pool2D>(op, subg, param); } -template <typename LoaderDomain, typename SpecificLoader> -void BaseLoader<LoaderDomain, SpecificLoader>::loadReshape(const Operator *op, ir::Graph &subg) +template <typename LoaderDomain> +void BaseLoader<LoaderDomain>::loadReshape(const Operator *op, ir::Graph &subg) { - ir::OperandIndexSequence inputs; - ir::OperandIndexSequence outputs; - - loadOperationIO(op, inputs, outputs); - ir::operation::Reshape::Param param{}; const auto *options = op->builtin_options_as_ReshapeOptions(); if (options != nullptr) @@ -611,99 +629,64 @@ void BaseLoader<LoaderDomain, SpecificLoader>::loadReshape(const Operator *op, i const auto *new_shape = options->new_shape(); if (new_shape) { - for (uint i = 0; i < new_shape->Length(); ++i) + for (uint i = 0; i < new_shape->size(); ++i) { param.new_shape.push_back(new_shape->Get(i)); } } } - std::unique_ptr<ir::Operation> new_op(new ir::operation::Reshape(inputs, outputs, param)); - subg.addOperation(std::move(new_op)); + loadOperationTo<ir::operation::Reshape>(op, subg, param); } -template <typename LoaderDomain, typename SpecificLoader> -void BaseLoader<LoaderDomain, SpecificLoader>::loadSoftmax(const Operator *op, ir::Graph &subg) +template <typename LoaderDomain> +void BaseLoader<LoaderDomain>::loadSoftmax(const Operator *op, ir::Graph &subg) { - ir::OperandIndexSequence inputs; - ir::OperandIndexSequence outputs; - - loadOperationIO(op, inputs, outputs); - ir::operation::Softmax::Param param; const auto *options = op->builtin_options_as_SoftmaxOptions(); // Beta param.beta = options->beta(); - std::unique_ptr<ir::Operation> new_op(new ir::operation::Softmax(inputs, outputs, param)); - subg.addOperation(std::move(new_op)); + loadOperationTo<ir::operation::Softmax>(op, subg, param); } -template <typename LoaderDomain, typename SpecificLoader> -void BaseLoader<LoaderDomain, SpecificLoader>::loadConcatenation(const Operator *op, - ir::Graph &subg) +template <typename LoaderDomain> +void BaseLoader<LoaderDomain>::loadConcatenation(const Operator *op, ir::Graph &subg) { - ir::OperandIndexSequence inputs; - ir::OperandIndexSequence outputs; - - loadOperationIO(op, inputs, outputs); - ir::operation::Concat::Param param; const auto *options = op->builtin_options_as_ConcatenationOptions(); // Axis param.axis = options->axis(); // activation unused - std::unique_ptr<ir::Operation> new_op(new ir::operation::Concat(inputs, outputs, param)); - subg.addOperation(std::move(new_op)); + loadOperationTo<ir::operation::Concat>(op, subg, param); } -template <typename LoaderDomain, typename SpecificLoader> -void BaseLoader<LoaderDomain, SpecificLoader>::loadFill(const Operator *op, ir::Graph &subg) +template <typename LoaderDomain> +void BaseLoader<LoaderDomain>::loadFC(const Operator *op, ir::Graph &subg) { - ir::OperandIndexSequence inputs; - ir::OperandIndexSequence outputs; - - 
loadOperationIO(op, inputs, outputs); - - std::unique_ptr<ir::Operation> new_op(new ir::operation::Fill(inputs, outputs)); - subg.addOperation(std::move(new_op)); -} + ir::operation::FullyConnected::Param param; + const auto *options = op->builtin_options_as_FullyConnectedOptions(); -template <typename LoaderDomain, typename SpecificLoader> -void BaseLoader<LoaderDomain, SpecificLoader>::loadFC(const Operator *op, ir::Graph &subg) -{ - ir::OperandIndexSequence inputs; - ir::OperandIndexSequence outputs; + param.activation = convertActivation(options->fused_activation_function()); + // weights_format unused - loadOperationIO(op, inputs, outputs); + const auto fc = loadOperationTo<ir::operation::FullyConnected>(op, subg, param); - const auto &input_operand = subg.operands().at(inputs.at(ir::operation::FullyConnected::INPUT)); - auto &weights_operand = subg.operands().at(inputs.at(ir::operation::FullyConnected::WEIGHT)); + const auto &input_operand = + subg.operands().at(fc->getInputs().at(ir::operation::FullyConnected::INPUT)); + auto &weights_operand = + subg.operands().at(fc->getInputs().at(ir::operation::FullyConnected::WEIGHT)); if (input_operand.typeInfo().type() == ir::DataType::FLOAT32 && weights_operand.typeInfo().type() == ir::DataType::QUANT_UINT8_ASYMM) { weights_operand.type(ir::DataType::QUANT_INT8_SYMM); } - - ir::operation::FullyConnected::Param param; - const auto *options = op->builtin_options_as_FullyConnectedOptions(); - - param.activation = convertActivation(options->fused_activation_function()); - // weights_format unused - - std::unique_ptr<ir::Operation> new_op(new ir::operation::FullyConnected(inputs, outputs, param)); - subg.addOperation(std::move(new_op)); } -template <typename LoaderDomain, typename SpecificLoader> -void BaseLoader<LoaderDomain, SpecificLoader>::loadAddV2(const Operator *op, ir::Graph &subg) +template <typename LoaderDomain> +void BaseLoader<LoaderDomain>::loadAddV2(const Operator *op, ir::Graph &subg) { - ir::OperandIndexSequence inputs; - ir::OperandIndexSequence outputs; - - loadOperationIO(op, inputs, outputs); - ir::operation::BinaryArithmetic::Param param; param.arithmetic_type = ir::operation::BinaryArithmetic::ArithmeticType::ADD; @@ -722,21 +705,13 @@ void BaseLoader<LoaderDomain, SpecificLoader>::loadAddV2(const Operator *op, ir: param.activation = convertActivation(fused_activation_func); } - std::unique_ptr<ir::Operation> new_op( - new ir::operation::BinaryArithmetic(inputs, outputs, param)); - subg.addOperation(std::move(new_op)); + loadOperationTo<ir::operation::BinaryArithmetic>(op, subg, param); } -template <typename LoaderDomain, typename SpecificLoader> -template <ir::operation::BinaryArithmetic::ArithmeticType op_type> -void BaseLoader<LoaderDomain, SpecificLoader>::loadBinaryArithmetic(const Operator *op, - ir::Graph &subg) +template <typename LoaderDomain> +void BaseLoader<LoaderDomain>::loadBinaryArithmetic( + const Operator *op, ir::Graph &subg, ir::operation::BinaryArithmetic::ArithmeticType op_type) { - ir::OperandIndexSequence inputs; - ir::OperandIndexSequence outputs; - - loadOperationIO(op, inputs, outputs); - ir::operation::BinaryArithmetic::Param param; param.arithmetic_type = op_type; switch (op_type) @@ -771,172 +746,66 @@ void BaseLoader<LoaderDomain, SpecificLoader>::loadBinaryArithmetic(const Operat break; } - std::unique_ptr<ir::Operation> new_op( - new ir::operation::BinaryArithmetic(inputs, outputs, param)); - subg.addOperation(std::move(new_op)); + loadOperationTo<ir::operation::BinaryArithmetic>(op, subg, 
param); } -template <typename LoaderDomain, typename SpecificLoader> -void BaseLoader<LoaderDomain, SpecificLoader>::loadPack(const Operator *op, ir::Graph &subg) +template <typename LoaderDomain> +void BaseLoader<LoaderDomain>::loadPack(const Operator *op, ir::Graph &subg) { - // This runtime_error will be removed if the one of backend supports this operation - ir::OperandIndexSequence inputs; - ir::OperandIndexSequence outputs; - - loadOperationIO(op, inputs, outputs); - ir::operation::Pack::Param param; const auto *options = op->builtin_options_as_PackOptions(); param.num = options->values_count(); param.axis = options->axis(); - std::unique_ptr<ir::Operation> new_op(new ir::operation::Pack(inputs, outputs, param)); - subg.addOperation(std::move(new_op)); + loadOperationTo<ir::operation::Pack>(op, subg, param); } -template <typename LoaderDomain, typename SpecificLoader> -void BaseLoader<LoaderDomain, SpecificLoader>::loadElementwiseActivation( +template <typename LoaderDomain> +void BaseLoader<LoaderDomain>::loadElementwiseActivation( const Operator *op, ir::Graph &subg, ir::operation::ElementwiseActivation::Type op_type, float alpha, float beta) { - ir::OperandIndexSequence inputs; - ir::OperandIndexSequence outputs; - - loadOperationIO(op, inputs, outputs); - ir::operation::ElementwiseActivation::Param param; param.op_type = op_type; param.alpha = alpha; param.beta = beta; - std::unique_ptr<ir::Operation> new_op( - new ir::operation::ElementwiseActivation(inputs, outputs, param)); - subg.addOperation(std::move(new_op)); + loadOperationTo<ir::operation::ElementwiseActivation>(op, subg, param); } -template <typename LoaderDomain, typename SpecificLoader> -void BaseLoader<LoaderDomain, SpecificLoader>::loadResizeBilinear(const Operator *op, - ir::Graph &subg) +template <typename LoaderDomain> +void BaseLoader<LoaderDomain>::loadResizeBilinear(const Operator *op, ir::Graph &subg) { - ir::OperandIndexSequence inputs; - ir::OperandIndexSequence outputs; - - loadOperationIO(op, inputs, outputs); - auto input = inputs.at(0); - auto size = inputs.at(1); - - // FIXME Handle ResizeBilinearOptions. 
- if (!subg.operands().at(size).isConstant()) - throw std::runtime_error("ResizeBilinear: non-constant 'size' is not supported."); - - std::vector<std::int32_t> size_v = subg.operands().at(size).template asVector<std::int32_t>(); - ir::operation::ResizeBilinear::Param param; - param.height_out = size_v[0]; - param.width_out = size_v[1]; param.align_corners = op->builtin_options_as_ResizeBilinearOptions()->align_corners(); param.half_pixel_centers = op->builtin_options_as_ResizeBilinearOptions()->half_pixel_centers(); - std::unique_ptr<ir::Operation> new_op(new ir::operation::ResizeBilinear({input}, outputs, param)); - subg.addOperation(std::move(new_op)); + loadOperationTo<ir::operation::ResizeBilinear>(op, subg, param); } -template <typename LoaderDomain, typename SpecificLoader> -void BaseLoader<LoaderDomain, SpecificLoader>::loadResizeNearestNeighbor(const Operator *op, - ir::Graph &subg) +template <typename LoaderDomain> +void BaseLoader<LoaderDomain>::loadResizeNearestNeighbor(const Operator *op, ir::Graph &subg) { - ir::OperandIndexSequence inputs; - ir::OperandIndexSequence outputs; - - loadOperationIO(op, inputs, outputs); - auto input = inputs.at(0); - auto size = inputs.at(1); - - if (!subg.operands().at(size).isConstant()) - throw std::runtime_error("ResizeNearestNeighbor: non-constant 'size' is not supported."); - - std::vector<std::int32_t> size_v = subg.operands().at(size).template asVector<std::int32_t>(); - ir::operation::ResizeNearestNeighbor::Param param; - param.height_out = size_v[0]; - param.width_out = size_v[1]; param.align_corners = op->builtin_options_as_ResizeNearestNeighborOptions()->align_corners(); - std::unique_ptr<ir::Operation> new_op( - new ir::operation::ResizeNearestNeighbor({input}, outputs, param)); - subg.addOperation(std::move(new_op)); -} - -template <typename LoaderDomain, typename SpecificLoader> -void BaseLoader<LoaderDomain, SpecificLoader>::loadSelect(const Operator *op, ir::Graph &subg) -{ - ir::OperandIndexSequence inputs; - ir::OperandIndexSequence outputs; - - loadOperationIO(op, inputs, outputs); - - std::unique_ptr<ir::Operation> new_op(new ir::operation::Select(inputs, outputs)); - subg.addOperation(std::move(new_op)); + loadOperationTo<ir::operation::ResizeNearestNeighbor>(op, subg, param); } -template <typename LoaderDomain, typename SpecificLoader> -void BaseLoader<LoaderDomain, SpecificLoader>::loadSquaredDifference(const Operator *op, - ir::Graph &subg) +template <typename LoaderDomain> +void BaseLoader<LoaderDomain>::loadReduce(const Operator *op, ir::Graph &subg, + ir::operation::Reduce::ReduceType reduce_type) { - ir::OperandIndexSequence inputs; - ir::OperandIndexSequence outputs; - - loadOperationIO(op, inputs, outputs); - - std::unique_ptr<ir::Operation> new_op(new ir::operation::SquaredDifference(inputs, outputs)); - subg.addOperation(std::move(new_op)); -} - -template <typename LoaderDomain, typename SpecificLoader> -void BaseLoader<LoaderDomain, SpecificLoader>::loadTranspose(const Operator *op, ir::Graph &subg) -{ - ir::OperandIndexSequence inputs; - ir::OperandIndexSequence outputs; - - loadOperationIO(op, inputs, outputs); - auto input = inputs.at(0); - auto perm = inputs.at(1); - - if (!subg.operands().at(perm).isConstant()) - throw std::runtime_error("Transpose: non-constant 'perm' is not supported."); - - ir::operation::Transpose::Param param; - param.perm = subg.operands().at(perm).template asVector<int>(); - - std::unique_ptr<ir::Operation> new_op(new ir::operation::Transpose({input}, outputs, param)); - 
subg.addOperation(std::move(new_op)); -} - -template <typename LoaderDomain, typename SpecificLoader> -template <ir::operation::Reduce::ReduceType reduce_type> -void BaseLoader<LoaderDomain, SpecificLoader>::loadReduce(const Operator *op, ir::Graph &subg) -{ - ir::OperandIndexSequence inputs; - ir::OperandIndexSequence outputs; - - loadOperationIO(op, inputs, outputs); - ir::operation::Reduce::Param param; param.reduce_type = reduce_type; param.keep_dims = op->builtin_options_as_ReducerOptions()->keep_dims(); - std::unique_ptr<ir::Operation> new_op(new ir::operation::Reduce(inputs, outputs, param)); - subg.addOperation(std::move(new_op)); + loadOperationTo<ir::operation::Reduce>(op, subg, param); } -template <typename LoaderDomain, typename SpecificLoader> -void BaseLoader<LoaderDomain, SpecificLoader>::loadReduceAll(const Operator *op, ir::Graph &subg) +template <typename LoaderDomain> +void BaseLoader<LoaderDomain>::loadReduceAll(const Operator *op, ir::Graph &subg) { - ir::OperandIndexSequence inputs; - ir::OperandIndexSequence outputs; - - loadOperationIO(op, inputs, outputs); - ir::operation::Reduce::Param param; param.reduce_type = ir::operation::Reduce::ReduceType::ALL; if (op->custom_options() == nullptr) @@ -952,64 +821,28 @@ void BaseLoader<LoaderDomain, SpecificLoader>::loadReduceAll(const Operator *op, param.keep_dims = attr_map["keep_dims"].AsBool(); } - std::unique_ptr<ir::Operation> new_op(new ir::operation::Reduce(inputs, outputs, param)); - subg.addOperation(std::move(new_op)); -} - -template <typename LoaderDomain, typename SpecificLoader> -void BaseLoader<LoaderDomain, SpecificLoader>::loadReverseV2(const Operator *op, ir::Graph &subg) -{ - ir::OperandIndexSequence inputs; - ir::OperandIndexSequence outputs; - - loadOperationIO(op, inputs, outputs); - - std::unique_ptr<ir::Operation> new_op(new ir::operation::Reverse(inputs, outputs)); - subg.addOperation(std::move(new_op)); -} - -template <typename LoaderDomain, typename SpecificLoader> -void BaseLoader<LoaderDomain, SpecificLoader>::loadPad(const Operator *op, ir::Graph &subg) -{ - ir::OperandIndexSequence inputs; - ir::OperandIndexSequence outputs; - - loadOperationIO(op, inputs, outputs); - - std::unique_ptr<ir::Operation> new_op(new ir::operation::Pad(inputs, outputs)); - subg.addOperation(std::move(new_op)); + loadOperationTo<ir::operation::Reduce>(op, subg, param); } -template <typename LoaderDomain, typename SpecificLoader> -template <ir::operation::ElementwiseBinary::ElementwiseBinaryType op_type> -void BaseLoader<LoaderDomain, SpecificLoader>::loadElementwiseBinary(const Operator *op, - ir::Graph &subg) +template <typename LoaderDomain> +void BaseLoader<LoaderDomain>::loadElementwiseBinary( + const Operator *op, ir::Graph &subg, + ir::operation::ElementwiseBinary::ElementwiseBinaryType op_type) { - ir::OperandIndexSequence inputs; - ir::OperandIndexSequence outputs; - - loadOperationIO(op, inputs, outputs); - ir::operation::ElementwiseBinary::Param param; param.op_type = op_type; - std::unique_ptr<ir::Operation> new_op( - new ir::operation::ElementwiseBinary(inputs, outputs, param)); - subg.addOperation(std::move(new_op)); + loadOperationTo<ir::operation::ElementwiseBinary>(op, subg, param); } -template <typename LoaderDomain, typename SpecificLoader> -void BaseLoader<LoaderDomain, SpecificLoader>::loadElementwiseUnary( - const Operator *op, ir::Graph &subg, ir::operation::ElementwiseUnary::Type op_type) +template <typename LoaderDomain> +void BaseLoader<LoaderDomain>::loadElementwiseUnary(const Operator *op, 
ir::Graph &subg, + ir::operation::ElementwiseUnary::Type op_type) { - ir::OperandIndexSequence inputs; - ir::OperandIndexSequence outputs; - - loadOperationIO(op, inputs, outputs); - ir::operation::ElementwiseUnary::Param param; param.op_type = op_type; + const auto eu = loadOperationTo<ir::operation::ElementwiseUnary>(op, subg, param); if (op_type == ir::operation::ElementwiseUnary::Type::CAST) { auto qasymm8ToUint8 = [](ir::Operand &operand) { @@ -1018,61 +851,24 @@ void BaseLoader<LoaderDomain, SpecificLoader>::loadElementwiseUnary( operand.type(ir::DataType::UINT8); } }; - qasymm8ToUint8(subg.operands().at(inputs.at(ir::operation::ElementwiseUnary::Input::INPUT))); - qasymm8ToUint8(subg.operands().at(outputs.at(0))); + qasymm8ToUint8( + subg.operands().at(eu->getInputs().at(ir::operation::ElementwiseUnary::Input::INPUT))); + qasymm8ToUint8(subg.operands().at(eu->getOutputs().at(0))); } - - std::unique_ptr<ir::Operation> new_op( - new ir::operation::ElementwiseUnary(inputs, outputs, param)); - subg.addOperation(std::move(new_op)); -} - -template <typename LoaderDomain, typename SpecificLoader> -void BaseLoader<LoaderDomain, SpecificLoader>::loadExpandDims(const Operator *op, ir::Graph &subg) -{ - ir::OperandIndexSequence inputs; - ir::OperandIndexSequence outputs; - - loadOperationIO(op, inputs, outputs); - - std::unique_ptr<ir::Operation> new_op(new ir::operation::ExpandDims(inputs, outputs)); - subg.addOperation(std::move(new_op)); } -template <typename LoaderDomain, typename SpecificLoader> -void BaseLoader<LoaderDomain, SpecificLoader>::loadGather(const Operator *op, ir::Graph &subg) +template <typename LoaderDomain> +void BaseLoader<LoaderDomain>::loadGather(const Operator *op, ir::Graph &subg) { - ir::OperandIndexSequence inputs; - ir::OperandIndexSequence outputs; - - loadOperationIO(op, inputs, outputs); ir::operation::Gather::Param param; param.axis = op->builtin_options_as_GatherOptions()->axis(); - std::unique_ptr<ir::Operation> new_op(new ir::operation::Gather(inputs, outputs, param)); - subg.addOperation(std::move(new_op)); -} - -template <typename LoaderDomain, typename SpecificLoader> -void BaseLoader<LoaderDomain, SpecificLoader>::loadSpaceToBatchND(const Operator *op, - ir::Graph &subg) -{ - ir::OperandIndexSequence inputs; - ir::OperandIndexSequence outputs; - - loadOperationIO(op, inputs, outputs); - - std::unique_ptr<ir::Operation> new_op{new ir::operation::SpaceToBatchND{inputs, outputs}}; - subg.addOperation(std::move(new_op)); + loadOperationTo<ir::operation::Gather>(op, subg, param); } -template <typename LoaderDomain, typename SpecificLoader> -void BaseLoader<LoaderDomain, SpecificLoader>::loadBatchMatMul(const Operator *op, ir::Graph &subg) +template <typename LoaderDomain> +void BaseLoader<LoaderDomain>::loadBatchMatMul(const Operator *op, ir::Graph &subg) { - ir::OperandIndexSequence inputs; - ir::OperandIndexSequence outputs; - - loadOperationIO(op, inputs, outputs); ir::operation::BatchMatMul::Param param; const auto builtin_op = _model->operator_codes()->Get(op->opcode_index())->builtin_code(); @@ -1105,89 +901,21 @@ void BaseLoader<LoaderDomain, SpecificLoader>::loadBatchMatMul(const Operator *o " as " + EnumNameBuiltinOperator(BuiltinOperator::BuiltinOperator_BATCH_MATMUL)); } - std::unique_ptr<ir::Operation> new_op{new ir::operation::BatchMatMul{inputs, outputs, param}}; - subg.addOperation(std::move(new_op)); -} - -template <typename LoaderDomain, typename SpecificLoader> -void BaseLoader<LoaderDomain, SpecificLoader>::loadBatchToSpaceND(const Operator 
*op, - ir::Graph &subg) -{ - ir::OperandIndexSequence inputs; - ir::OperandIndexSequence outputs; - - loadOperationIO(op, inputs, outputs); - - std::unique_ptr<ir::Operation> new_op{new ir::operation::BatchToSpaceND{inputs, outputs}}; - subg.addOperation(std::move(new_op)); -} - -template <typename LoaderDomain, typename SpecificLoader> -void BaseLoader<LoaderDomain, SpecificLoader>::loadMatrixBandPart(const Operator *op, - ir::Graph &subg) -{ - ir::OperandIndexSequence inputs; - ir::OperandIndexSequence outputs; - - loadOperationIO(op, inputs, outputs); - - std::unique_ptr<ir::Operation> new_op(new ir::operation::MatrixBandPart(inputs, outputs)); - subg.addOperation(std::move(new_op)); + loadOperationTo<ir::operation::BatchMatMul>(op, subg, param); } -template <typename LoaderDomain, typename SpecificLoader> -void BaseLoader<LoaderDomain, SpecificLoader>::loadBroadcastTo(const Operator *op, ir::Graph &subg) +template <typename LoaderDomain> +void BaseLoader<LoaderDomain>::loadSpaceToDepth(const Operator *op, ir::Graph &subg) { - ir::OperandIndexSequence inputs; - ir::OperandIndexSequence outputs; - - loadOperationIO(op, inputs, outputs); - - std::unique_ptr<ir::Operation> new_op(new ir::operation::BroadcastTo(inputs, outputs)); - subg.addOperation(std::move(new_op)); -} -template <typename LoaderDomain, typename SpecificLoader> -void BaseLoader<LoaderDomain, SpecificLoader>::loadSpaceToDepth(const Operator *op, ir::Graph &subg) -{ - ir::OperandIndexSequence inputs; - ir::OperandIndexSequence outputs; ir::operation::SpaceToDepth::Param param; - const auto *options = op->builtin_options_as_SpaceToDepthOptions(); - param.block_size = options->block_size(); - loadOperationIO(op, inputs, outputs); - - std::unique_ptr<ir::Operation> new_op(new ir::operation::SpaceToDepth(inputs, outputs, param)); - subg.addOperation(std::move(new_op)); -} - -template <typename LoaderDomain, typename SpecificLoader> -void BaseLoader<LoaderDomain, SpecificLoader>::loadStatelessRandomUniform(const Operator *op, - ir::Graph &subg) -{ - ir::OperandIndexSequence inputs; - ir::OperandIndexSequence outputs; - loadOperationIO(op, inputs, outputs); - - std::unique_ptr<ir::Operation> new_op(new ir::operation::StatelessRandomUniform(inputs, outputs)); - subg.addOperation(std::move(new_op)); + loadOperationTo<ir::operation::SpaceToDepth>(op, subg, param); } -template <typename LoaderDomain, typename SpecificLoader> -void BaseLoader<LoaderDomain, SpecificLoader>::loadRank(const Operator *op, ir::Graph &subg) -{ - ir::OperandIndexSequence inputs; - ir::OperandIndexSequence outputs; - loadOperationIO(op, inputs, outputs); - - std::unique_ptr<ir::Operation> new_op(new ir::operation::Rank(inputs, outputs)); - subg.addOperation(std::move(new_op)); -} - -template <typename LoaderDomain, typename SpecificLoader> -void BaseLoader<LoaderDomain, SpecificLoader>::loadCustom(const Operator *op, ir::Graph &subg) +template <typename LoaderDomain> +void BaseLoader<LoaderDomain>::loadCustom(const Operator *op, ir::Graph &subg) { ir::OperandIndexSequence inputs; ir::OperandIndexSequence outputs; @@ -1237,7 +965,7 @@ void BaseLoader<LoaderDomain, SpecificLoader>::loadCustom(const Operator *op, ir loadReduceAll(op, subg); break; case BuiltinOP::MatrixBandPart: - loadMatrixBandPart(op, subg); + loadOperationTo<ir::operation::MatrixBandPart>(op, subg); break; case BuiltinOP::BatchMatMul: loadBatchMatMul(op, subg); @@ -1246,13 +974,13 @@ void BaseLoader<LoaderDomain, SpecificLoader>::loadCustom(const Operator *op, ir loadEinsum(op, subg); break; 
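The loadCustom cases above show the pattern this refactoring is built around: operations that previously had their own one-off loader (MatrixBandPart here, and BroadcastTo, StatelessRandomUniform, Select, Pad and many others further down) are now created directly through the loadOperationTo<OpIR> helper declared earlier in this header. Roughly, the two usage shapes are as follows; both calls are taken from hunks in this file, and the helper itself resolves the operator's input/output tensor indices, adds the IR node to the subgraph, and returns it:

// Parameter-less operation: a single call reads the inputs/outputs and adds the node.
loadOperationTo<ir::operation::MatrixBandPart>(op, subg);

// Operation with parameters: build the Param first, then forward it. The returned
// pointer lets callers inspect the created node afterwards (see loadFC and loadArgMax).
ir::operation::SpaceToDepth::Param param;
param.block_size = op->builtin_options_as_SpaceToDepthOptions()->block_size();
loadOperationTo<ir::operation::SpaceToDepth>(op, subg, param);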
case BuiltinOP::BroadcastTo: - loadBroadcastTo(op, subg); + loadOperationTo<ir::operation::BroadcastTo>(op, subg); break; case BuiltinOP::FusedBatchNorm: loadFusedBatchNorm(op, subg); break; case BuiltinOP::StatelessRandomUniform: - loadStatelessRandomUniform(op, subg); + loadOperationTo<ir::operation::StatelessRandomUniform>(op, subg); break; case BuiltinOP::Erf: loadElementwiseUnary(op, subg, ir::operation::ElementwiseUnary::Type::ERF); @@ -1285,141 +1013,71 @@ void BaseLoader<LoaderDomain, SpecificLoader>::loadCustom(const Operator *op, ir } } -template <typename LoaderDomain, typename SpecificLoader> -void BaseLoader<LoaderDomain, SpecificLoader>::loadSqueeze(const Operator *op, ir::Graph &subg) +template <typename LoaderDomain> +void BaseLoader<LoaderDomain>::loadSqueeze(const Operator *op, ir::Graph &subg) { - ir::OperandIndexSequence inputs; - ir::OperandIndexSequence outputs; - - loadOperationIO(op, inputs, outputs); - - ir::operation::Squeeze::Param param{}; + ir::operation::Squeeze::Param param; const auto *options = op->builtin_options_as_SqueezeOptions(); const auto *dims = options->squeeze_dims(); if (dims) { - if (dims->Length() > sizeof(param.dims) / sizeof(param.dims[0])) + if (dims->size() > sizeof(param.dims) / sizeof(param.dims[0])) throw std::runtime_error("Squeeze: 'param.ndims' is out of range."); - param.ndim = dims->Length(); + param.ndim = dims->size(); for (int i = 0; i < param.ndim; ++i) param.dims[i] = dims->Get(i); } - std::unique_ptr<ir::Operation> new_op(new ir::operation::Squeeze(inputs, outputs, param)); - subg.addOperation(std::move(new_op)); + loadOperationTo<ir::operation::Squeeze>(op, subg, param); } -template <typename LoaderDomain, typename SpecificLoader> -void BaseLoader<LoaderDomain, SpecificLoader>::loadPrelu(const Operator *op, ir::Graph &subg) +template <typename LoaderDomain> +void BaseLoader<LoaderDomain>::loadSplit(const Operator *op, ir::Graph &subg) { - ir::OperandIndexSequence inputs; - ir::OperandIndexSequence outputs; - - loadOperationIO(op, inputs, outputs); - - std::unique_ptr<ir::Operation> new_op(new ir::operation::PReLU(inputs, outputs)); - subg.addOperation(std::move(new_op)); -} - -template <typename LoaderDomain, typename SpecificLoader> -void BaseLoader<LoaderDomain, SpecificLoader>::loadSplit(const Operator *op, ir::Graph &subg) -{ - ir::OperandIndexSequence inputs; - ir::OperandIndexSequence outputs; - - loadOperationIO(op, inputs, outputs); - // Notice : input order is strange for tflite split - auto input = inputs.at(1); - auto axis = inputs.at(0); - - // FIXME Handle SplitOptions. 
- if (!subg.operands().at(axis).isConstant()) - throw std::runtime_error("Split: non-constant 'axis' is not supported."); - - ir::operation::Split::Param param{}; - param.axis = subg.operands().at(axis).template asScalar<int>(); + ir::operation::Split::Param param; const auto *options = op->builtin_options_as_SplitOptions(); param.num_splits = options->num_splits(); - std::unique_ptr<ir::Operation> new_op(new ir::operation::Split({input}, outputs, param)); - subg.addOperation(std::move(new_op)); + loadOperationTo<ir::operation::Split>(op, subg, param); } -template <typename LoaderDomain, typename SpecificLoader> -void BaseLoader<LoaderDomain, SpecificLoader>::loadSplitV(const Operator *op, ir::Graph &subg) +template <typename LoaderDomain> +void BaseLoader<LoaderDomain>::loadSplitV(const Operator *op, ir::Graph &subg) { - ir::OperandIndexSequence inputs; - ir::OperandIndexSequence outputs; - - loadOperationIO(op, inputs, outputs); - - ir::operation::SplitV::Param param{}; - + ir::operation::SplitV::Param param; const auto *options = op->builtin_options_as_SplitVOptions(); param.num_splits = options->num_splits(); - std::unique_ptr<ir::Operation> new_op(new ir::operation::SplitV(inputs, outputs, param)); - subg.addOperation(std::move(new_op)); -} - -template <typename LoaderDomain, typename SpecificLoader> -void BaseLoader<LoaderDomain, SpecificLoader>::loadSlice(const Operator *op, ir::Graph &subg) -{ - ir::OperandIndexSequence inputs; - ir::OperandIndexSequence outputs; - - loadOperationIO(op, inputs, outputs); - - std::unique_ptr<ir::Operation> new_op{new ir::operation::Slice{inputs, outputs}}; - subg.addOperation(std::move(new_op)); + loadOperationTo<ir::operation::SplitV>(op, subg, param); } -template <typename LoaderDomain, typename SpecificLoader> -void BaseLoader<LoaderDomain, SpecificLoader>::loadStridedSlice(const Operator *op, ir::Graph &subg) +template <typename LoaderDomain> +void BaseLoader<LoaderDomain>::loadStridedSlice(const Operator *op, ir::Graph &subg) { - ir::OperandIndexSequence inputs; - ir::OperandIndexSequence outputs; - - loadOperationIO(op, inputs, outputs); - ir::operation::StridedSlice::Param param; - const auto *options = op->builtin_options_as_StridedSliceOptions(); param.begin_mask = options->begin_mask(); param.end_mask = options->end_mask(); param.shrink_axis_mask = options->shrink_axis_mask(); - std::unique_ptr<ir::Operation> new_op{new ir::operation::StridedSlice{inputs, outputs, param}}; - subg.addOperation(std::move(new_op)); + loadOperationTo<ir::operation::StridedSlice>(op, subg, param); } -template <typename LoaderDomain, typename SpecificLoader> -void BaseLoader<LoaderDomain, SpecificLoader>::loadUnpack(const Operator *op, ir::Graph &subg) +template <typename LoaderDomain> +void BaseLoader<LoaderDomain>::loadUnpack(const Operator *op, ir::Graph &subg) { - ir::OperandIndexSequence inputs; - ir::OperandIndexSequence outputs; - - loadOperationIO(op, inputs, outputs); - ir::operation::Unpack::Param param; const auto *options = op->builtin_options_as_UnpackOptions(); param.num = options->num(); param.axis = options->axis(); - std::unique_ptr<ir::Operation> new_op(new ir::operation::Unpack(inputs, outputs, param)); - subg.addOperation(std::move(new_op)); + loadOperationTo<ir::operation::Unpack>(op, subg, param); } -template <typename LoaderDomain, typename SpecificLoader> -void BaseLoader<LoaderDomain, SpecificLoader>::loadComparison(const Operator *op, ir::Graph &subg) +template <typename LoaderDomain> +void BaseLoader<LoaderDomain>::loadComparison(const 
Operator *op, ir::Graph &subg) { - ir::OperandIndexSequence inputs; - ir::OperandIndexSequence outputs; - - loadOperationIO(op, inputs, outputs); - ir::operation::Comparison::Param param; - const auto builtin_op = _model->operator_codes()->Get(op->opcode_index())->builtin_code(); switch (builtin_op) @@ -1447,24 +1105,13 @@ void BaseLoader<LoaderDomain, SpecificLoader>::loadComparison(const Operator *op std::string("Unsupported operation: ").append(EnumNameBuiltinOperator(builtin_op))); } - std::unique_ptr<ir::Operation> new_op(new ir::operation::Comparison(inputs, outputs, param)); - subg.addOperation(std::move(new_op)); + loadOperationTo<ir::operation::Comparison>(op, subg, param); } -template <typename LoaderDomain, typename SpecificLoader> -void BaseLoader<LoaderDomain, SpecificLoader>::loadEinsum(const Operator *op, ir::Graph &subg) +template <typename LoaderDomain> +void BaseLoader<LoaderDomain>::loadEinsum(const Operator *op, ir::Graph &subg) { - ir::OperandIndexSequence inputs; - ir::OperandIndexSequence outputs; - - loadOperationIO(op, inputs, outputs); ir::operation::Einsum::Param param; - - if (inputs.size() != 2) - { - throw std::runtime_error{"Einsum: NYI input - only support two inputs"}; - } - if (op->custom_options() == nullptr) { throw std::runtime_error{"Einsum: empty equation"}; @@ -1478,24 +1125,16 @@ void BaseLoader<LoaderDomain, SpecificLoader>::loadEinsum(const Operator *op, ir param.equation = attr_map["equation"].ToString(); } - std::unique_ptr<ir::Operation> new_op{new ir::operation::Einsum{inputs, outputs, param}}; - subg.addOperation(std::move(new_op)); + const auto es = loadOperationTo<ir::operation::Einsum>(op, subg, param); + if (es->getInputs().size() != 2) + { + throw std::runtime_error{"Einsum: NYI input - only support two inputs"}; + } } -template <typename LoaderDomain, typename SpecificLoader> -void BaseLoader<LoaderDomain, SpecificLoader>::loadFusedBatchNorm(const Operator *op, - ir::Graph &subg) +template <typename LoaderDomain> +void BaseLoader<LoaderDomain>::loadFusedBatchNorm(const Operator *op, ir::Graph &subg) { - ir::OperandIndexSequence inputs; - ir::OperandIndexSequence outputs; - - loadOperationIO(op, inputs, outputs); ir::operation::FusedBatchNorm::Param param; - - if (inputs.size() != 5) - { - throw std::runtime_error{"FusedBatchNorm: NYI input - only support five inputs"}; - } - if (op->custom_options() == nullptr) { throw std::runtime_error{"FusedBatchNorm: empty option"}; @@ -1511,195 +1150,104 @@ void BaseLoader<LoaderDomain, SpecificLoader>::loadFusedBatchNorm(const Operator param.data_format = attr_map["data_format"].ToString(); } - std::unique_ptr<ir::Operation> new_op{new ir::operation::FusedBatchNorm{inputs, outputs, param}}; - subg.addOperation(std::move(new_op)); + const auto fbn = loadOperationTo<ir::operation::FusedBatchNorm>(op, subg, param); + + if (fbn->getInputs().size() != 5) + { + throw std::runtime_error{"FusedBatchNorm: NYI input - only support five inputs"}; + } } -template <typename LoaderDomain, typename SpecificLoader> -void BaseLoader<LoaderDomain, SpecificLoader>::loadOneHot(const Operator *op, ir::Graph &subg) +template <typename LoaderDomain> +void BaseLoader<LoaderDomain>::loadOneHot(const Operator *op, ir::Graph &subg) { if (op->inputs()->size() != 4 || op->outputs()->size() != 1) throw std::runtime_error("OneHot Op has wrong number of input or output tensors."); - // Set input and output tensors - ir::OperandIndexSequence inputs, outputs; - loadOperationIO(op, inputs, outputs); - // Set parameter - const auto 
axis = op->builtin_options_as_OneHotOptions()->axis(); - std::unique_ptr<ir::Operation> new_op(new ir::operation::OneHot(inputs, outputs, {axis})); - subg.addOperation(std::move(new_op)); -} + ir::operation::OneHot::Param param; + param.axis = op->builtin_options_as_OneHotOptions()->axis(); -template <typename LoaderDomain, typename SpecificLoader> -void BaseLoader<LoaderDomain, SpecificLoader>::loadShape(const Operator *op, ir::Graph &subg) -{ - ir::OperandIndexSequence inputs; - ir::OperandIndexSequence outputs; - - loadOperationIO(op, inputs, outputs); - - // ir::operation::Shape::Param param; - // const auto *options = op->builtin_options_as_ShapeOptions(); - // param.out_type = tensorTypeToDataType(options->out_type()); - - std::unique_ptr<ir::Operation> new_op(new ir::operation::Shape(inputs, outputs /*, param*/)); - subg.addOperation(std::move(new_op)); + loadOperationTo<ir::operation::OneHot>(op, subg, param); } -template <typename LoaderDomain, typename SpecificLoader> -void BaseLoader<LoaderDomain, SpecificLoader>::loadIf(const Operator *op, ir::Graph &subg) +template <typename LoaderDomain> +void BaseLoader<LoaderDomain>::loadIf(const Operator *op, ir::Graph &subg) { - ir::OperandIndexSequence inputs; - ir::OperandIndexSequence outputs; + const auto *options = op->builtin_options_as_IfOptions(); + const int32_t then_index = options->then_subgraph_index(); + const int32_t else_index = options->else_subgraph_index(); - loadOperationIO(op, inputs, outputs); + verifySubgraphIndex(then_index); + verifySubgraphIndex(else_index); ir::operation::If::Param param; - const auto *options = op->builtin_options_as_IfOptions(); - const uint32_t then_index = options->then_subgraph_index(); - const uint32_t else_index = options->else_subgraph_index(); - param.then_subg_index = ir::SubgraphIndex{then_index}; - param.else_subg_index = ir::SubgraphIndex{else_index}; + param.then_subg_index = ir::SubgraphIndex{static_cast<uint32_t>(then_index)}; + param.else_subg_index = ir::SubgraphIndex{static_cast<uint32_t>(else_index)}; - std::unique_ptr<ir::Operation> new_op(new ir::operation::If(inputs, outputs, param)); - subg.addOperation(std::move(new_op)); + loadOperationTo<ir::operation::If>(op, subg, param); } -template <typename LoaderDomain, typename SpecificLoader> -void BaseLoader<LoaderDomain, SpecificLoader>::loadWhile(const Operator *op, ir::Graph &subg) +template <typename LoaderDomain> +void BaseLoader<LoaderDomain>::loadWhile(const Operator *op, ir::Graph &subg) { - ir::OperandIndexSequence inputs; - ir::OperandIndexSequence outputs; + const auto *options = op->builtin_options_as_WhileOptions(); + const int32_t cond_index = options->cond_subgraph_index(); + const int32_t body_index = options->body_subgraph_index(); - loadOperationIO(op, inputs, outputs); + verifySubgraphIndex(cond_index); + verifySubgraphIndex(body_index); ir::operation::While::Param param; - const auto *options = op->builtin_options_as_WhileOptions(); - const uint32_t cond_index = options->cond_subgraph_index(); - const uint32_t body_index = options->body_subgraph_index(); - param.cond_subg_index = ir::SubgraphIndex{cond_index}; - param.body_subg_index = ir::SubgraphIndex{body_index}; + param.cond_subg_index = ir::SubgraphIndex{static_cast<uint32_t>(cond_index)}; + param.body_subg_index = ir::SubgraphIndex{static_cast<uint32_t>(body_index)}; - std::unique_ptr<ir::Operation> new_op(new ir::operation::While(inputs, outputs, param)); - subg.addOperation(std::move(new_op)); + loadOperationTo<ir::operation::While>(op, subg, param); 
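Note on the loadIf/loadWhile hunks above: the subgraph indices are now read as signed 32-bit values and passed through verifySubgraphIndex before being narrowed into ir::SubgraphIndex, so a negative or out-of-range index in a malformed model fails with a clear error instead of silently wrapping. A condensed sketch of that check as a standalone helper (checkedSubgraphIndex is hypothetical; the real code keeps the check in verifySubgraphIndex and performs the cast at the call site):

// Reject indices that do not name an existing subgraph before constructing the IR index.
ir::SubgraphIndex checkedSubgraphIndex(int32_t subg_index, uint32_t num_subgraphs)
{
  if (subg_index < 0 || subg_index >= static_cast<int32_t>(num_subgraphs))
    throw std::runtime_error{std::string{"Invalid subgraph index - "} +
                             std::to_string(subg_index)};
  return ir::SubgraphIndex{static_cast<uint32_t>(subg_index)};
}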
} -template <typename LoaderDomain, typename SpecificLoader> -void BaseLoader<LoaderDomain, SpecificLoader>::loadArgMax(const Operator *op, ir::Graph &subg) +template <typename LoaderDomain> +void BaseLoader<LoaderDomain>::loadArgMax(const Operator *op, ir::Graph &subg) { - ir::OperandIndexSequence inputs; - ir::OperandIndexSequence outputs; - - loadOperationIO(op, inputs, outputs); - - auto inputOperand = subg.operands().at(inputs.at(0)); - auto axisOperand = subg.operands().at(inputs.at(1)); - - if (!axisOperand.isConstant()) - throw std::runtime_error("ArgMax: non-constant 'axis' is not supported."); - if (!(axisOperand.operandSize() == 4 && (axisOperand.typeInfo().type() == ir::DataType::INT32 || - axisOperand.typeInfo().type() == ir::DataType::INT64))) - throw std::runtime_error("ArgMax: `axis` with an int32 or int64 element is only supported."); - ir::operation::ArgMax::Param param; - param.axis = axisOperand.template asVector<int>()[0]; const auto output_type = op->builtin_options_as_ArgMaxOptions()->output_type(); switch (output_type) { case TensorType::TensorType_INT32: case TensorType::TensorType_INT64: + param.output_type = tensorTypeToDataType(output_type); break; default: throw std::runtime_error("ArgMax: `output_type` must be either int32 or int64."); } - param.output_type = tensorTypeToDataType(output_type); - std::unique_ptr<ir::Operation> new_op(new ir::operation::ArgMax(inputs, outputs, param)); - subg.addOperation(std::move(new_op)); -} + auto am = loadOperationTo<ir::operation::ArgMax>(op, subg, param); -template <typename LoaderDomain, typename SpecificLoader> -void BaseLoader<LoaderDomain, SpecificLoader>::loadPow(const Operator *op, ir::Graph &subg) -{ - ir::OperandIndexSequence inputs; - ir::OperandIndexSequence outputs; - - loadOperationIO(op, inputs, outputs); - - std::unique_ptr<ir::Operation> new_op(new ir::operation::Pow(inputs, outputs)); - subg.addOperation(std::move(new_op)); -} - -template <typename LoaderDomain, typename SpecificLoader> -void BaseLoader<LoaderDomain, SpecificLoader>::loadRange(const Operator *op, ir::Graph &subg) -{ - ir::OperandIndexSequence inputs; - ir::OperandIndexSequence outputs; - - loadOperationIO(op, inputs, outputs); - - std::unique_ptr<ir::Operation> new_op(new ir::operation::Range(inputs, outputs)); - subg.addOperation(std::move(new_op)); -} - -template <typename LoaderDomain, typename SpecificLoader> -void BaseLoader<LoaderDomain, SpecificLoader>::loadTile(const Operator *op, ir::Graph &subg) -{ - ir::OperandIndexSequence inputs; - ir::OperandIndexSequence outputs; - - loadOperationIO(op, inputs, outputs); - - auto multiples = inputs.at(ir::operation::Tile::MULTIPLES); - - if (!subg.operands().at(multiples).isConstant()) - throw std::runtime_error("Tile: non-constant 'multiples' is not supported."); - - std::unique_ptr<ir::Operation> new_op(new ir::operation::Tile(inputs, outputs)); - subg.addOperation(std::move(new_op)); + auto &axisOperand = subg.operands().at(am->getInputs().at(ir::operation::ArgMax::Input::AXIS)); + if (!(axisOperand.operandSize() == 4 && (axisOperand.typeInfo().type() == ir::DataType::INT32 || + axisOperand.typeInfo().type() == ir::DataType::INT64))) + throw std::runtime_error("ArgMax: `axis` with an int32 or int64 element is only supported."); } -template <typename LoaderDomain, typename SpecificLoader> -void BaseLoader<LoaderDomain, SpecificLoader>::loadLogSoftmax(const Operator *op, ir::Graph &subg) +template <typename LoaderDomain> +void BaseLoader<LoaderDomain>::loadLogSoftmax(const Operator *op, 
ir::Graph &subg) { - ir::OperandIndexSequence inputs; - ir::OperandIndexSequence outputs; - - loadOperationIO(op, inputs, outputs); - ir::operation::LogSoftmax::Param param; - // In tflite, beta is fixed to 1.0 and axis is fixed to -1. param.beta = 1.0f; param.axis = -1; - std::unique_ptr<ir::Operation> new_op(new ir::operation::LogSoftmax(inputs, outputs, param)); - subg.addOperation(std::move(new_op)); -} - -template <typename LoaderDomain, typename SpecificLoader> -void BaseLoader<LoaderDomain, SpecificLoader>::loadL2Normalization(const Operator *op, - ir::Graph &subg) -{ - ir::OperandIndexSequence inputs; - ir::OperandIndexSequence outputs; - - loadOperationIO(op, inputs, outputs); - - std::unique_ptr<ir::Operation> new_op(new ir::operation::L2Normalization(inputs, outputs)); - subg.addOperation(std::move(new_op)); + loadOperationTo<ir::operation::LogSoftmax>(op, subg, param); } -template <typename LoaderDomain, typename SpecificLoader> -void BaseLoader<LoaderDomain, SpecificLoader>::loadLeakyRelu(const Operator *op, ir::Graph &subg) +template <typename LoaderDomain> +void BaseLoader<LoaderDomain>::loadLeakyRelu(const Operator *op, ir::Graph &subg) { float alpha = op->builtin_options_as_LeakyReluOptions()->alpha(); loadElementwiseActivation(op, subg, ir::operation::ElementwiseActivation::Type::LEAKY_RELU, alpha, 1.f); } -template <typename LoaderDomain, typename SpecificLoader> -void BaseLoader<LoaderDomain, SpecificLoader>::loadOperation(const Operator *op, ir::Graph &subg) +template <typename LoaderDomain> +void BaseLoader<LoaderDomain>::loadOperation(const Operator *op, ir::Graph &subg) { const auto builtin_op = _model->operator_codes()->Get(op->opcode_index())->builtin_code(); @@ -1733,16 +1281,16 @@ void BaseLoader<LoaderDomain, SpecificLoader>::loadOperation(const Operator *op, loadFC(op, subg); return; case BuiltinOperator::BuiltinOperator_ADD: - loadBinaryArithmetic<ir::operation::BinaryArithmetic::ArithmeticType::ADD>(op, subg); + loadBinaryArithmetic(op, subg, ir::operation::BinaryArithmetic::ArithmeticType::ADD); return; case BuiltinOperator::BuiltinOperator_SUB: - loadBinaryArithmetic<ir::operation::BinaryArithmetic::ArithmeticType::SUB>(op, subg); + loadBinaryArithmetic(op, subg, ir::operation::BinaryArithmetic::ArithmeticType::SUB); return; case BuiltinOperator::BuiltinOperator_MUL: - loadBinaryArithmetic<ir::operation::BinaryArithmetic::ArithmeticType::MUL>(op, subg); + loadBinaryArithmetic(op, subg, ir::operation::BinaryArithmetic::ArithmeticType::MUL); return; case BuiltinOperator::BuiltinOperator_DIV: - loadBinaryArithmetic<ir::operation::BinaryArithmetic::ArithmeticType::DIV>(op, subg); + loadBinaryArithmetic(op, subg, ir::operation::BinaryArithmetic::ArithmeticType::DIV); return; case BuiltinOperator::BuiltinOperator_PACK: loadPack(op, subg); @@ -1769,40 +1317,37 @@ void BaseLoader<LoaderDomain, SpecificLoader>::loadOperation(const Operator *op, loadElementwiseUnary(op, subg, ir::operation::ElementwiseUnary::Type::RSQRT); return; case BuiltinOperator::BuiltinOperator_SELECT: - loadSelect(op, subg); - return; case BuiltinOperator::BuiltinOperator_SELECT_V2: - // Use same loader with BuiltinOperator_SELECT - loadSelect(op, subg); + loadOperationTo<ir::operation::Select>(op, subg); return; case BuiltinOperator::BuiltinOperator_SQRT: loadElementwiseUnary(op, subg, ir::operation::ElementwiseUnary::Type::SQRT); return; case BuiltinOperator::BuiltinOperator_SQUARED_DIFFERENCE: - loadSquaredDifference(op, subg); + loadOperationTo<ir::operation::SquaredDifference>(op, subg); 
return; case BuiltinOperator::BuiltinOperator_TANH: loadElementwiseActivation(op, subg, ir::operation::ElementwiseActivation::Type::TANH, 1.f, 1.f); return; case BuiltinOperator::BuiltinOperator_TRANSPOSE: - loadTranspose(op, subg); + loadOperationTo<ir::operation::Transpose>(op, subg); return; case BuiltinOperator::BuiltinOperator_MEAN: - loadReduce<ir::operation::Reduce::ReduceType::MEAN>(op, subg); + loadReduce(op, subg, ir::operation::Reduce::ReduceType::MEAN); return; case BuiltinOperator::BuiltinOperator_REDUCE_ANY: - loadReduce<ir::operation::Reduce::ReduceType::ANY>(op, subg); + loadReduce(op, subg, ir::operation::Reduce::ReduceType::ANY); return; case BuiltinOperator::BuiltinOperator_REDUCE_MAX: - loadReduce<ir::operation::Reduce::ReduceType::MAX>(op, subg); + loadReduce(op, subg, ir::operation::Reduce::ReduceType::MAX); return; case BuiltinOperator::BuiltinOperator_REVERSE_V2: - loadReverseV2(op, subg); + loadOperationTo<ir::operation::Reverse>(op, subg); return; case BuiltinOperator::BuiltinOperator_PAD: case BuiltinOperator::BuiltinOperator_PADV2: - loadPad(op, subg); + loadOperationTo<ir::operation::Pad>(op, subg); return; case BuiltinOperator::BuiltinOperator_LOGISTIC: loadElementwiseActivation(op, subg, ir::operation::ElementwiseActivation::Type::LOGISTIC); @@ -1811,19 +1356,19 @@ void BaseLoader<LoaderDomain, SpecificLoader>::loadOperation(const Operator *op, loadElementwiseUnary(op, subg, ir::operation::ElementwiseUnary::Type::EXP); return; case BuiltinOperator::BuiltinOperator_EXPAND_DIMS: - loadExpandDims(op, subg); + loadOperationTo<ir::operation::ExpandDims>(op, subg); return; case BuiltinOperator::BuiltinOperator_GATHER: loadGather(op, subg); return; case BuiltinOperator::BuiltinOperator_SPACE_TO_BATCH_ND: - loadSpaceToBatchND(op, subg); + loadOperationTo<ir::operation::SpaceToBatchND>(op, subg); return; case BuiltinOperator::BuiltinOperator_BATCH_TO_SPACE_ND: - loadBatchToSpaceND(op, subg); + loadOperationTo<ir::operation::BatchToSpaceND>(op, subg); return; case BuiltinOperator::BuiltinOperator_SUM: - loadReduce<ir::operation::Reduce::ReduceType::SUM>(op, subg); + loadReduce(op, subg, ir::operation::Reduce::ReduceType::SUM); return; case BuiltinOperator::BuiltinOperator_CUSTOM: loadCustom(op, subg); @@ -1832,7 +1377,7 @@ void BaseLoader<LoaderDomain, SpecificLoader>::loadOperation(const Operator *op, loadSqueeze(op, subg); return; case BuiltinOperator::BuiltinOperator_PRELU: - loadPrelu(op, subg); + loadOperationTo<ir::operation::PReLU>(op, subg); return; case BuiltinOperator::BuiltinOperator_SPLIT: loadSplit(op, subg); @@ -1841,7 +1386,7 @@ void BaseLoader<LoaderDomain, SpecificLoader>::loadOperation(const Operator *op, loadSplitV(op, subg); return; case BuiltinOperator::BuiltinOperator_SLICE: - loadSlice(op, subg); + loadOperationTo<ir::operation::Slice>(op, subg); return; case BuiltinOperator::BuiltinOperator_STRIDED_SLICE: loadStridedSlice(op, subg); @@ -1850,10 +1395,10 @@ void BaseLoader<LoaderDomain, SpecificLoader>::loadOperation(const Operator *op, loadUnpack(op, subg); return; case BuiltinOperator::BuiltinOperator_MINIMUM: - loadElementwiseBinary<ir::operation::ElementwiseBinary::ElementwiseBinaryType::MIN>(op, subg); + loadElementwiseBinary(op, subg, ir::operation::ElementwiseBinary::ElementwiseBinaryType::MIN); return; case BuiltinOperator::BuiltinOperator_MAXIMUM: - loadElementwiseBinary<ir::operation::ElementwiseBinary::ElementwiseBinaryType::MAX>(op, subg); + loadElementwiseBinary(op, subg, ir::operation::ElementwiseBinary::ElementwiseBinaryType::MAX); 
return; case BuiltinOperator::BuiltinOperator_CAST: loadElementwiseUnary(op, subg, ir::operation::ElementwiseUnary::Type::CAST); @@ -1879,10 +1424,10 @@ void BaseLoader<LoaderDomain, SpecificLoader>::loadOperation(const Operator *op, loadElementwiseUnary(op, subg, ir::operation::ElementwiseUnary::Type::SIN); return; case BuiltinOperator::BuiltinOperator_SHAPE: - loadShape(op, subg); + loadOperationTo<ir::operation::Shape>(op, subg); return; case BuiltinOperator::BuiltinOperator_REDUCE_PROD: - loadReduce<ir::operation::Reduce::ReduceType::PROD>(op, subg); + loadReduce(op, subg, ir::operation::Reduce::ReduceType::PROD); return; case BuiltinOperator::BuiltinOperator_IF: loadIf(op, subg); @@ -1903,26 +1448,26 @@ void BaseLoader<LoaderDomain, SpecificLoader>::loadOperation(const Operator *op, loadElementwiseUnary(op, subg, ir::operation::ElementwiseUnary::Type::ROUND); return; case BuiltinOperator::BuiltinOperator_POW: - loadPow(op, subg); + loadOperationTo<ir::operation::Pow>(op, subg); return; case BuiltinOperator::BuiltinOperator_LOGICAL_NOT: loadElementwiseUnary(op, subg, ir::operation::ElementwiseUnary::Type::LOGICAL_NOT); return; case BuiltinOperator::BuiltinOperator_LOGICAL_OR: - loadElementwiseBinary<ir::operation::ElementwiseBinary::ElementwiseBinaryType::LOGICAL_OR>( - op, subg); + loadElementwiseBinary(op, subg, + ir::operation::ElementwiseBinary::ElementwiseBinaryType::LOGICAL_OR); return; case BuiltinOperator::BuiltinOperator_FILL: - loadFill(op, subg); + loadOperationTo<ir::operation::Fill>(op, subg); return; case BuiltinOperator::BuiltinOperator_ZEROS_LIKE: loadElementwiseUnary(op, subg, ir::operation::ElementwiseUnary::Type::ZEROS_LIKE); return; case BuiltinOperator::BuiltinOperator_TILE: - loadTile(op, subg); + loadOperationTo<ir::operation::Tile>(op, subg); return; case BuiltinOperator::BuiltinOperator_RANGE: - loadRange(op, subg); + loadOperationTo<ir::operation::Range>(op, subg); return; case BuiltinOperator::BuiltinOperator_BATCH_MATMUL: loadBatchMatMul(op, subg); @@ -1937,13 +1482,13 @@ void BaseLoader<LoaderDomain, SpecificLoader>::loadOperation(const Operator *op, loadSpaceToDepth(op, subg); return; case BuiltinOperator::BuiltinOperator_L2_NORMALIZATION: - loadL2Normalization(op, subg); + loadOperationTo<ir::operation::L2Normalization>(op, subg); break; case BuiltinOperator::BuiltinOperator_LEAKY_RELU: loadLeakyRelu(op, subg); return; case BuiltinOperator::BuiltinOperator_RANK: - loadRank(op, subg); + loadOperationTo<ir::operation::Rank>(op, subg); return; default: throw std::runtime_error( @@ -1951,8 +1496,7 @@ void BaseLoader<LoaderDomain, SpecificLoader>::loadOperation(const Operator *op, } } -template <typename LoaderDomain, typename SpecificLoader> -void BaseLoader<LoaderDomain, SpecificLoader>::loadModel() +template <typename LoaderDomain> void BaseLoader<LoaderDomain>::loadModel() { LoaderDomain::VerifyModelBuffer(*_verifier.get()); _model = LoaderDomain::GetModel(_base); @@ -1967,8 +1511,7 @@ void BaseLoader<LoaderDomain, SpecificLoader>::loadModel() auto subgraphs = std::make_unique<ir::Subgraphs>(); for (uint32_t subgraph_index = 0; subgraph_index < domain_subgraphs->size(); ++subgraph_index) { - auto subg = - static_cast<SpecificLoader *>(this)->loadSubgraph((*_model->subgraphs())[subgraph_index]); + auto subg = loadSubgraph((*_model->subgraphs())[subgraph_index]); subgraphs->push(ir::SubgraphIndex{subgraph_index}, std::move(subg)); } _subgraphs = std::move(subgraphs); diff --git a/runtime/onert/frontend/circle/CMakeLists.txt 
b/runtime/onert/frontend/circle/CMakeLists.txt index 8bcf85dd3..76dca9989 100644 --- a/runtime/onert/frontend/circle/CMakeLists.txt +++ b/runtime/onert/frontend/circle/CMakeLists.txt @@ -8,7 +8,7 @@ add_library(circle_loader SHARED ${CIRCLE_LOADER_SOURCES}) target_include_directories(circle_loader PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include) -target_link_libraries(circle_loader PUBLIC onert_core) +target_link_libraries(circle_loader PRIVATE onert_core) target_link_libraries(circle_loader PRIVATE base_loader nnfw_common nnfw_coverage) target_link_libraries(circle_loader PRIVATE circle_schema) diff --git a/runtime/onert/frontend/circle/src/circle_loader.cc b/runtime/onert/frontend/circle/src/circle_loader.cc index 92a9ee7a5..4565ffc00 100644 --- a/runtime/onert/frontend/circle/src/circle_loader.cc +++ b/runtime/onert/frontend/circle/src/circle_loader.cc @@ -69,7 +69,7 @@ struct LoaderDomain static bool VerifyModelBuffer(Verifier &verifier) { return circle::VerifyModelBuffer(verifier); } }; -class CircleLoader final : public base_loader::BaseLoader<LoaderDomain, CircleLoader> +class CircleLoader final : public base_loader::BaseLoader<LoaderDomain> { protected: void loadInstanceNorm(const Operator *op, ir::Graph &subg); @@ -91,7 +91,8 @@ public: } } - std::unique_ptr<ir::Graph> loadSubgraph(const circle::SubGraph *circle_subg) +private: + std::unique_ptr<ir::Graph> loadSubgraph(const circle::SubGraph *circle_subg) override { auto subg = std::make_unique<ir::Graph>(); // Load tensors diff --git a/runtime/onert/frontend/nnapi/execution.cc b/runtime/onert/frontend/nnapi/execution.cc index ce7da579e..56ca5ef00 100644 --- a/runtime/onert/frontend/nnapi/execution.cc +++ b/runtime/onert/frontend/nnapi/execution.cc @@ -94,12 +94,36 @@ int ANeuralNetworksExecution_setInput(ANeuralNetworksExecution *execution, int32 // Omitted optional input // LSTM operation's some inputs can be optional input + // Transpose operation's permutation input can be optional input if ((buffer == nullptr) && (length == 0)) { + uint32_t dims[1] = {0}; + ANeuralNetworksOperandType compared_shape; + compared_shape.dimensionCount = 1; + compared_shape.dimensions = dims; if (execution->hasUnspecifiedDims(operand_index)) { return ANEURALNETWORKS_NO_ERROR; } + else if (type == nullptr && execution->IsOptionalInput(operand_index)) + { + if (!execution->setOptionalInput(index, type, buffer, length)) + { + VERBOSE(NNAPI::Execution) << "setInput: Fail to set optional input" << std::endl; + return ANEURALNETWORKS_BAD_DATA; + } + return ANEURALNETWORKS_NO_ERROR; + } + // TODO Changes the condition to check zero sized + else if (execution->compareShape(&compared_shape, operand_index)) + { + if (!execution->setInput(index, type, buffer, length)) + { + VERBOSE(NNAPI::Execution) << "setInput: Fail to set input" << std::endl; + return ANEURALNETWORKS_BAD_DATA; + } + return ANEURALNETWORKS_NO_ERROR; + } else { VERBOSE(NNAPI::Execution) << "setInput: Cannot handle fully-specified shape on model build " diff --git a/runtime/onert/frontend/nnapi/wrapper/ANeuralNetworksExecution.cc b/runtime/onert/frontend/nnapi/wrapper/ANeuralNetworksExecution.cc index eb12d7e76..6114b74b0 100644 --- a/runtime/onert/frontend/nnapi/wrapper/ANeuralNetworksExecution.cc +++ b/runtime/onert/frontend/nnapi/wrapper/ANeuralNetworksExecution.cc @@ -98,6 +98,17 @@ bool ANeuralNetworksExecution::compareShape(const ANeuralNetworksOperandType *ty return operand_shape == shape_from_type; } +bool ANeuralNetworksExecution::IsOptionalInput(const onert::ir::OperandIndex index) 
noexcept +{ + const auto &operand_shape = _execution->primary_subgraph().operands().at(index).shape(); + for (int32_t i = 0; i < operand_shape.rank(); ++i) + { + if (operand_shape.dim(i) != 0) + return false; + } + return true; +} + bool ANeuralNetworksExecution::hasUnspecifiedDims(const onert::ir::OperandIndex index) noexcept { const auto operand_shape = _execution->primary_subgraph().operands().at(index).shape(); @@ -148,6 +159,45 @@ bool ANeuralNetworksExecution::setInput(uint32_t index, const ANeuralNetworksOpe return true; } +bool ANeuralNetworksExecution::setOptionalInput(uint32_t index, + const ANeuralNetworksOperandType *type, + const void *buffer, size_t length) noexcept +{ + assert(type == nullptr); + assert(buffer == nullptr); + assert(length == 0); + try + { + onert::ir::IOIndex input_index{index}; + const auto operand_index = getInputOperandIndex(index); + + const auto type_info = _execution->primary_subgraph().operands().at(operand_index).typeInfo(); + const auto shape = (type != nullptr) + ? NNAPIConvert::getShape(type) + : _execution->primary_subgraph().operands().at(operand_index).shape(); + + // ANeuralNetworksExecution::setInput() uses only shape information + ANeuralNetworksOperandType optional_input_type; + optional_input_type.dimensionCount = shape.rank(); + std::vector<uint32_t> dims(optional_input_type.dimensionCount); + for (uint32_t i = 0; i < optional_input_type.dimensionCount; ++i) + { + dims.at(i) = shape.dim(i); + } + optional_input_type.dimensions = dims.data(); + + return setInput(index, &optional_input_type, buffer, length); + } + catch (const std::exception &e) + { + VERBOSE(EXCEPTION) << e.what() << std::endl; + + return false; + } + + return true; +} + bool ANeuralNetworksExecution::setOutput(uint32_t index, const ANeuralNetworksOperandType *type, void *buffer, size_t length) noexcept { diff --git a/runtime/onert/frontend/nnapi/wrapper/ANeuralNetworksExecution.h b/runtime/onert/frontend/nnapi/wrapper/ANeuralNetworksExecution.h index 848ae743f..1f4b868f6 100644 --- a/runtime/onert/frontend/nnapi/wrapper/ANeuralNetworksExecution.h +++ b/runtime/onert/frontend/nnapi/wrapper/ANeuralNetworksExecution.h @@ -35,6 +35,8 @@ public: public: bool setInput(uint32_t index, const ANeuralNetworksOperandType *type, const void *buffer, size_t length) noexcept; + bool setOptionalInput(uint32_t index, const ANeuralNetworksOperandType *type, const void *buffer, + size_t length) noexcept; bool setOutput(uint32_t index, const ANeuralNetworksOperandType *type, void *buffer, size_t length) noexcept; bool startExecute(void) noexcept; @@ -46,6 +48,7 @@ public: const onert::ir::OperandIndex index) noexcept; bool compareShape(const ANeuralNetworksOperandType *type, const onert::ir::OperandIndex index) noexcept; + bool IsOptionalInput(const onert::ir::OperandIndex index) noexcept; bool hasUnspecifiedDims(const onert::ir::OperandIndex index) noexcept; size_t getOperandSize(const onert::ir::OperandIndex index) noexcept; const std::shared_ptr<onert::exec::Execution> instance(void) noexcept; diff --git a/runtime/onert/frontend/nnapi/ANeuralNetworksModel.test.cc b/runtime/onert/frontend/nnapi/wrapper/ANeuralNetworksModel.test.cc index 15a279a7e..bb42f2b08 100644 --- a/runtime/onert/frontend/nnapi/ANeuralNetworksModel.test.cc +++ b/runtime/onert/frontend/nnapi/wrapper/ANeuralNetworksModel.test.cc @@ -16,10 +16,10 @@ #include <gtest/gtest.h> -#include "wrapper/ANeuralNetworksModel.h" +#include "ANeuralNetworksModel.h" -TEST(MODEL, model_build) +TEST(MODEL, neg_model_build) { 
ANeuralNetworksModel model; - ASSERT_EQ(model.isFinished(), false); + ASSERT_FALSE(model.isFinished()); } diff --git a/runtime/onert/frontend/nnapi/wrapper/OperationFactory.cc b/runtime/onert/frontend/nnapi/wrapper/OperationFactory.cc index 8e3d83db4..e6c38f5f8 100644 --- a/runtime/onert/frontend/nnapi/wrapper/OperationFactory.cc +++ b/runtime/onert/frontend/nnapi/wrapper/OperationFactory.cc @@ -708,31 +708,7 @@ OperationFactory::OperationFactory() return new operation::StridedSlice{inputs, outputs, param}; }; - _map[ANEURALNETWORKS_TRANSPOSE] = [](const OperationFactory::Param &init_param, - Operands &operands) { - // TODO make this work with init_param.input_count == 1 (when permutation vector is optional) - - // Inputs - // 0: An n-D tensor, specifying the tensor to be transposed. - // 1: An optional 1-D Tensor of {@link ANEURALNETWORKS_TENSOR_INT32}, - // the permutation of the dimensions of the input tensor. - // The returned tensor's dimension i corresponds to the input dimension - // perm[i]. If perm is not given, it is set to (n-1...0), where n is the - // rank of the input tensor. Hence by default, this operation performs a - // regular matrix transpose on 2-D input Tensors. - assert(init_param.input_count == 2); - assert(init_param.output_count == 1); - - OperandIndexSequence inputs{init_param.inputs[0]}; - OperandIndexSequence outputs{init_param.outputs[0]}; - std::vector<std::int32_t> perm = - operands.at(OperandIndex{init_param.inputs[1]}).asVector<std::int32_t>(); - - operation::Transpose::Param param; - param.perm.assign(perm.cbegin(), perm.cend()); - - return new operation::Transpose{inputs, outputs, param}; - }; + _map[ANEURALNETWORKS_TRANSPOSE] = createSimpleBinaryOp<operation::Transpose>; _map[ANEURALNETWORKS_MUL] = getBinaryArithmeticGenerator(onert::ir::operation::BinaryArithmetic::ArithmeticType::MUL); @@ -982,6 +958,28 @@ OperationFactory::OperationFactory() return new operation::ResizeBilinear{inputs, outputs, param}; }; + _map[ANEURALNETWORKS_RESIZE_NEAREST_NEIGHBOR] = [](const OperationFactory::Param &init_param, + Operands &operands) { + assert((init_param.input_count == 3 || init_param.input_count == 4) && + init_param.output_count == 1); + + OperandIndexSequence outputs{init_param.outputs[0]}; + + // Each input should be interpreted as follows: + // + // 0 -> IFM Index + // 1 -> Height Index + // 2 -> Width Index + OperandIndexSequence inputs{init_param.inputs[0]}; + + operation::ResizeNearestNeighbor::Param param; + param.height_out = operands.at(OperandIndex{init_param.inputs[1]}).asScalar<int32_t>(); + param.width_out = operands.at(OperandIndex{init_param.inputs[2]}).asScalar<int32_t>(); + param.align_corners = false; + // The layout input is not supported yet + return new operation::ResizeNearestNeighbor{inputs, outputs, param}; + }; + _map[ANEURALNETWORKS_RELU1] = getElementwiseActivationGenerator( onert::ir::operation::ElementwiseActivation::Type::RELU, 1.f, -1.f); @@ -1304,6 +1302,105 @@ OperationFactory::OperationFactory() } param.cell_threshold = operands.at(OperandIndex{init_param.inputs[21]}).asScalar<float>(); param.projection_threshold = operands.at(OperandIndex{init_param.inputs[22]}).asScalar<float>(); + // This is initialization to prevent warning or error by static code analyzer. 
LSTM operation + // does not need time_major + param.time_major = false; + + return new operation::LSTM{inputs, outputs, param}; + }; + + _map[ANEURALNETWORKS_UNIDIRECTIONAL_SEQUENCE_LSTM] = [](const OperationFactory::Param &init_param, + Operands &operands) { + assert((init_param.input_count >= 24 || init_param.input_count <= 28) && + (init_param.output_count >= 1 && init_param.output_count <= 3)); + + // Each input should be interpreted as follows: + // + // 0 -> Input Tensor Index + // 1 -> Input to Input Tensor Index + // 2 -> Input to Forget Tensor Index + // 3 -> Input to Cell Tensor Index + // 4 -> Input to Output Tensor Index + // 5 -> Recurrent to Input Weights Tensor Index + // 6 -> Recurrent to Forget Weights Tensor Index + // 7 -> Recurrent to Cell Weights Tensor Index + // 8 -> Recurrent to Output Weights Tensor Index + // 9 -> Cell to Input Weights Tensor Index + // 10 -> Cell to Forget Weights Tensor Index + // 11 -> Cell to Output Weights Tensor Index + // 12 -> Input Gate Bias Tensor Index + // 13 -> Forget Gate Bias Tensor Index + // 14 -> Cell Bias Tensor Index + // 15 -> Output Gate Bias Tensor Index + // 16 -> Projection Weights Tensor Index + // 17 -> Projection Bias Tensor Index + // 18 -> Output State In Tensor Index + // 19 -> Cell State In Tensor Index + assert(init_param.input_count - 3 > 20); + OperandIndexSequence inputs; + for (uint32_t n = 0; n < 20; ++n) + { + inputs.append(OperandIndex{init_param.inputs[n]}); + } + + // 24 -> Input Layer Normalization Weights Tensor Index + // 25 -> Forget Layer Normalization Weights Tensor Index + // 26 -> Cell Layer Normalization Weights Tensor Index + // 27 -> Output Layer Normalization Weights Tensor Index + if (init_param.input_count > 24) + { + for (uint32_t n = 24; n < 28; ++n) + { + if (init_param.input_count > n) + { + inputs.append(OperandIndex{init_param.inputs[n]}); + } + } + } + + // Each output should be interpreted as follows: + // + // 0 -> Output Tensor Index -> 3 + // 1 -> Output State Out Tensor Index + // 2 -> Cell State Out Tensor Index + const OperandIndex scratch_buffer_index; + OperandIndex output_state_index = + init_param.output_count >= 2 ? OperandIndex{init_param.outputs[1]} : OperandIndex(); + OperandIndex cell_state_index = + init_param.output_count >= 3 ? 
OperandIndex{init_param.outputs[2]} : OperandIndex(); + const OperandIndex output_index = OperandIndex{init_param.outputs[0]}; + OperandIndexSequence outputs{scratch_buffer_index, output_state_index, cell_state_index, + output_index}; + + operation::LSTM::Param param; + const auto activation_index = OperandIndex{init_param.inputs[20]}; + switch (operands.at(activation_index).asScalar<int32_t>()) + { + case 0: + param.activation = Activation::NONE; + break; + case 1: + param.activation = Activation::RELU; + break; + case 2: + param.activation = Activation::RELU1; + break; + case 3: + param.activation = Activation::RELU6; + break; + case 4: + param.activation = Activation::TANH; + break; + case 6: + param.activation = Activation::SIGMOID; + break; + default: + throw std::runtime_error("Unsupported activation type"); + break; + } + param.cell_threshold = operands.at(OperandIndex{init_param.inputs[21]}).asScalar<float>(); + param.projection_threshold = operands.at(OperandIndex{init_param.inputs[22]}).asScalar<float>(); + param.time_major = operands.at(OperandIndex{init_param.inputs[23]}).asScalar<bool>(); return new operation::LSTM{inputs, outputs, param}; }; @@ -1406,7 +1503,7 @@ OperationFactory::OperationFactory() // TODO Remove ANEURALNETWORKS_ABS_EX _map[ANEURALNETWORKS_ABS_EX] = _map[ANEURALNETWORKS_ABS]; - _map[ANEURALNETWORKS_ARGMAX] = [](const OperationFactory::Param &init_param, Operands &operands) { + _map[ANEURALNETWORKS_ARGMAX] = [](const OperationFactory::Param &init_param, Operands &) { assert(init_param.input_count == 2 && init_param.output_count == 1); OperandIndexSequence outputs{init_param.outputs[0]}; @@ -1415,10 +1512,9 @@ OperationFactory::OperationFactory() // // 0 -> Input Tensor Index // 1 -> Axis Tensor Index - OperandIndexSequence inputs{init_param.inputs[0]}; + OperandIndexSequence inputs{init_param.inputs[0], init_param.inputs[1]}; operation::ArgMax::Param param; - param.axis = operands.at(OperandIndex{init_param.inputs[1]}).asScalar<std::int32_t>(); // NNAPI ARGMAX output type is always int32 param.output_type = DataType::INT32; @@ -1517,7 +1613,7 @@ OperationFactory::OperationFactory() assert(init_param.input_count == 3); assert(init_param.output_count >= 1); // At least one output tensor and axis - OperandIndexSequence inputs{init_param.inputs[0]}; + OperandIndexSequence inputs{init_param.inputs[1], init_param.inputs[0]}; OperandIndexSequence outputs; for (uint32_t n = 0; n < init_param.output_count; ++n) { @@ -1525,7 +1621,6 @@ OperationFactory::OperationFactory() } operation::Split::Param param; - param.axis = operands.at(OperandIndex{init_param.inputs[1]}).asScalar<std::int32_t>(); param.num_splits = operands.at(OperandIndex{init_param.inputs[2]}).asScalar<std::int32_t>(); return new operation::Split{inputs, outputs, param}; diff --git a/runtime/onert/frontend/tflite/CMakeLists.txt b/runtime/onert/frontend/tflite/CMakeLists.txt index fcadf5223..604a9e4cb 100644 --- a/runtime/onert/frontend/tflite/CMakeLists.txt +++ b/runtime/onert/frontend/tflite/CMakeLists.txt @@ -8,7 +8,7 @@ add_library(tflite_loader SHARED ${TFLITE_LOADER_SOURCES}) target_include_directories(tflite_loader PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include) -target_link_libraries(tflite_loader PUBLIC onert_core) +target_link_libraries(tflite_loader PRIVATE onert_core) target_link_libraries(tflite_loader PRIVATE base_loader nnfw_common nnfw_coverage) install(TARGETS tflite_loader DESTINATION lib) diff --git a/runtime/onert/frontend/tflite/src/tflite_loader.cc 
b/runtime/onert/frontend/tflite/src/tflite_loader.cc index 7eef15717..fe4295ada 100644 --- a/runtime/onert/frontend/tflite/src/tflite_loader.cc +++ b/runtime/onert/frontend/tflite/src/tflite_loader.cc @@ -62,7 +62,7 @@ struct LoaderDomain } }; -class TFLiteLoader final : public base_loader::BaseLoader<LoaderDomain, TFLiteLoader> +class TFLiteLoader final : public base_loader::BaseLoader<LoaderDomain> { public: using BaseLoader::BaseLoader; @@ -78,7 +78,8 @@ public: } } - std::unique_ptr<ir::Graph> loadSubgraph(const onert_tflite::SubGraph *tflite_subg) +private: + std::unique_ptr<ir::Graph> loadSubgraph(const onert_tflite::SubGraph *tflite_subg) override { auto subg = std::make_unique<ir::Graph>(); // Load tensors diff --git a/runtime/onert/test/graph/Index.cc b/runtime/onert/test/graph/Index.cc index 358e64c82..2d110e326 100644 --- a/runtime/onert/test/graph/Index.cc +++ b/runtime/onert/test/graph/Index.cc @@ -20,7 +20,7 @@ using Index = ::onert::util::Index<uint32_t, struct TestTag>; -TEST(Index, index_test) +TEST(Index, neg_index_test) { Index idx1{1u}; Index idx2{2u}; diff --git a/runtime/onert/test/graph/operand/IndexSet.cc b/runtime/onert/test/graph/operand/IndexSet.cc index 6215e0d24..6ef425a2d 100644 --- a/runtime/onert/test/graph/operand/IndexSet.cc +++ b/runtime/onert/test/graph/operand/IndexSet.cc @@ -21,7 +21,7 @@ using onert::ir::OperandIndex; using onert::ir::OperandIndexSequence; -TEST(graph_OperandIndexSequence, append) +TEST(graph_OperandIndexSequence, neg_append) { OperandIndexSequence iset{0, 2, 4, 8}; @@ -42,7 +42,7 @@ TEST(graph_OperandIndexSequence, append) ASSERT_FALSE(iset.contains(OperandIndex{11})); } -TEST(graph_OperandIndexSequence, replace) +TEST(graph_OperandIndexSequence, neg_replace) { OperandIndexSequence iset{0, 1, 2, 3}; diff --git a/runtime/onert/test/graph/operand/LayoutSet.cc b/runtime/onert/test/graph/operand/LayoutSet.cc index e35bddd8b..ef965a41e 100644 --- a/runtime/onert/test/graph/operand/LayoutSet.cc +++ b/runtime/onert/test/graph/operand/LayoutSet.cc @@ -21,7 +21,22 @@ using onert::ir::Layout; using onert::ir::LayoutSet; -TEST(graph_operand_LayoutSet, layout_set_operators) +TEST(graph_operand_LayoutSet, neg_add_remove) +{ + LayoutSet set{Layout::NCHW}; + set.remove(Layout::NHWC); + ASSERT_EQ(set.size(), 1); + set.add(Layout::NHWC); + ASSERT_EQ(set.size(), 2); + set.remove(Layout::NHWC); + ASSERT_EQ(set.size(), 1); + set.remove(Layout::NCHW); + ASSERT_EQ(set.size(), 0); + set.remove(Layout::NCHW); + ASSERT_EQ(set.size(), 0); +} + +TEST(graph_operand_LayoutSet, set_operators) { LayoutSet set1{Layout::NCHW}; LayoutSet set2{Layout::NHWC}; diff --git a/runtime/onert/test/graph/operand/Set.cc b/runtime/onert/test/graph/operand/Set.cc index 0d35b5581..ffee417b8 100644 --- a/runtime/onert/test/graph/operand/Set.cc +++ b/runtime/onert/test/graph/operand/Set.cc @@ -18,7 +18,7 @@ #include "ir/Operands.h" -TEST(graph_operand_Set, set_test) +TEST(graph_operand_Set, neg_set_test) { onert::ir::Operands set; diff --git a/runtime/onert/test/graph/operand/UseDef.cc b/runtime/onert/test/graph/operand/UseDef.cc index cd2cdb739..a8686eb18 100644 --- a/runtime/onert/test/graph/operand/UseDef.cc +++ b/runtime/onert/test/graph/operand/UseDef.cc @@ -31,7 +31,7 @@ using Mock = onert_test::ir::SimpleMock; } // namespace -TEST(graph_operand_usedef, usedef_test) +TEST(graph_operand_usedef, neg_usedef_test) { onert::ir::Graph graph; onert::ir::verifier::DAGChecker verifier; @@ -62,7 +62,7 @@ TEST(graph_operand_usedef, usedef_test) graph.finishBuilding(); - 
ASSERT_EQ(verifier.verify(graph), true); + ASSERT_TRUE(verifier.verify(graph)); // Check def ASSERT_EQ(graph.operands().at(operand_index1).getDef(), mocknode_index1); diff --git a/runtime/onert/test/graph/operation/SetIO.cc b/runtime/onert/test/graph/operation/SetIO.cc index 378c5b4b9..22068ff58 100644 --- a/runtime/onert/test/graph/operation/SetIO.cc +++ b/runtime/onert/test/graph/operation/SetIO.cc @@ -62,7 +62,7 @@ TEST(graph_operation_setIO, operation_setIO_conv) ASSERT_EQ(conv->getInputs().at(Index{0}).value(), 8); } -TEST(graph_operation_setIO, operation_setIO_concat) +TEST(graph_operation_setIO, neg_operation_setIO_concat) { onert::ir::Graph graph; diff --git a/runtime/onert/test/graph/verifier/Verifier.cc b/runtime/onert/test/graph/verifier/Verifier.cc index f8c7557e3..3bce2746c 100644 --- a/runtime/onert/test/graph/verifier/Verifier.cc +++ b/runtime/onert/test/graph/verifier/Verifier.cc @@ -45,5 +45,54 @@ TEST(Verifier, dag_checker) onert::ir::verifier::DAGChecker verifier; - ASSERT_EQ(verifier.verify(graph), true); + ASSERT_TRUE(verifier.verify(graph)); +} + +TEST(Verifier, neg_edge_consistency_checker_1) +{ + onert::ir::Graph graph; + + onert::ir::Shape shape{3}; + onert::ir::TypeInfo type{onert::ir::DataType::INT32}; + + auto operand1 = graph.addOperand(shape, type); + auto operand2 = graph.addOperand(shape, type); + + graph.addInput(operand1); + graph.addOutput(operand2); + + auto mock_op = std::make_unique<Mock>(IndexSet{operand1}, IndexSet{operand2}); + auto op_ind = graph.addOperation(std::move(mock_op)); + + graph.finishBuilding(); + + graph.operands().at(operand1).removeUse(op_ind); // Manipulate the operand alone + + onert::ir::verifier::EdgeConsistencyChecker verifier; + ASSERT_FALSE(verifier.verify(graph)); +} + +TEST(Verifier, neg_edge_consistency_checker_2) +{ + onert::ir::Graph graph; + + onert::ir::Shape shape{3}; + onert::ir::TypeInfo type{onert::ir::DataType::INT32}; + + auto operand1 = graph.addOperand(shape, type); + auto operand2 = graph.addOperand(shape, type); + + graph.addInput(operand1); + graph.addOutput(operand2); + + auto mock_op = std::make_unique<Mock>(IndexSet{operand1}, IndexSet{operand2}); + auto mock_op_ptr = mock_op.get(); + auto op_ind = graph.addOperation(std::move(mock_op)); + + graph.finishBuilding(); + + mock_op_ptr->setInputs({operand2}); // Manipulate the operation alone + + onert::ir::verifier::EdgeConsistencyChecker verifier; + ASSERT_FALSE(verifier.verify(graph)); } diff --git a/runtime/onert/test/util/ShapeInference.cc b/runtime/onert/test/util/ShapeInference.cc index aab33fab5..a5f0af5ee 100644 --- a/runtime/onert/test/util/ShapeInference.cc +++ b/runtime/onert/test/util/ShapeInference.cc @@ -34,7 +34,7 @@ TEST(ShapeInference, Elementwise) ASSERT_EQ(infered_out_shape.dim(3), 3); } -TEST(ShapeInference, IncorrectElementwise) +TEST(ShapeInference, neg_Elementwise) { Shape lhs_shape{1, 299, 299, 3}; Shape rhs_shape{5, 3}; @@ -123,6 +123,18 @@ TEST(ShapeInference, Pool2DNodeExplicit) ASSERT_EQ(infered_out_shape.asFeature(Layout::NHWC).C, 20); } +TEST(ShapeInference, neg_Pool2DNode_InvalidStride) +{ + Shape in_shape{10, 6, 12, 20}; + Stride stride{0, 7}; + Padding padding{PaddingType::SAME}; + + operation::Pool2D::Param avg_pool_param{ + operation::Pool2D::PoolType::AVG, 3, 6, stride, padding, Activation::NONE}; + ASSERT_THROW(onert::shape_inference::inferPoolShape(in_shape, avg_pool_param), + std::runtime_error); +} + TEST(ShapeInference, Conv2D) { Shape in_shape{10, 6, 12, 20}; @@ -159,6 +171,17 @@ TEST(ShapeInference, Conv2D) 
ASSERT_EQ(infered_out_shape.asFeature(Layout::NHWC).C, 30); } +TEST(ShapeInference, neg_Conv2D_InvalidStride) +{ + Shape in_shape{10, 6, 12, 20}; + Shape ker_shape{30, 3, 6, 20}; + + operation::Conv2D::Param param{Stride{0, 0}, Padding{PaddingType::VALID}, Activation::NONE, + Dilation{1, 1}}; + ASSERT_THROW(onert::shape_inference::inferConv2DShape(in_shape, ker_shape, param), + std::runtime_error); +} + TEST(ShapeInference, DepthwiseConv2D) { Shape in_shape{10, 6, 12, 20}; @@ -195,6 +218,17 @@ TEST(ShapeInference, DepthwiseConv2D) ASSERT_EQ(infered_out_shape.asFeature(Layout::NHWC).C, 60); } +TEST(ShapeInference, neg_DepthwiseConv2D_InvalidSride) +{ + Shape in_shape{10, 6, 12, 20}; + Shape ker_shape{1, 3, 6, 60}; + + operation::DepthwiseConv2D::Param param{Stride{3, 0}, Padding{PaddingType::VALID}, 3, + Activation::NONE}; + ASSERT_THROW(onert::shape_inference::inferDepthwiseConv2DShape(in_shape, ker_shape, param), + std::runtime_error); +} + TEST(ShapeInference, Concat) { { @@ -328,7 +362,8 @@ TEST(ShapeInference, Transpose) // pre-conditions ASSERT_EQ(in_shape.rank(), perm.size()); ASSERT_EQ(expected.rank(), perm.size()); - auto inferred_out_shape = onert::shape_inference::inferTransposeShape(in_shape, perm); + auto inferred_out_shape = + onert::shape_inference::inferTransposeShape(in_shape, perm.data(), perm.size()); // post-conditions ASSERT_EQ(inferred_out_shape.rank(), perm.size()); for (int32_t dim = 0; dim < expected.rank(); dim++) @@ -369,12 +404,141 @@ TEST(ShapeInference, neg_Transpose) { std::vector<int> perm = {2, 0, 1, 0}; // int32_t rank = 3; - ASSERT_THROW(onert::shape_inference::inferTransposeShape(in_shape, perm), std::runtime_error); + ASSERT_THROW(onert::shape_inference::inferTransposeShape(in_shape, perm.data(), perm.size()), + std::runtime_error); } // Invalid parameter value { std::vector<int> perm = {2, 0, 3}; // int32_t rank = 3; - ASSERT_THROW(onert::shape_inference::inferTransposeShape(in_shape, perm), std::runtime_error); + ASSERT_THROW(onert::shape_inference::inferTransposeShape(in_shape, perm.data(), perm.size()), + std::runtime_error); + } +} + +TEST(ShapeInference, Gather) +{ + auto check = [&](Shape &input, Shape &indices, Shape &expected, int32_t axis) { + int rank = input.rank(); + auto actual = onert::shape_inference::inferGatherShape(input, indices, axis, rank); + + ASSERT_EQ(actual.rank(), expected.rank()); + + for (int32_t dim = 0; dim < expected.rank(); dim++) + ASSERT_EQ(actual.dim(dim), expected.dim(dim)); + }; + + // check for 2-D, 3-D, axis 0 + { + Shape input{3, 4}; + Shape indices{1, 1, 2}; + int32_t axis = 0; + Shape expected{1, 1, 2, 4}; + check(input, indices, expected, axis); + } + + // check for 2-D, 3-D, axis 1 + { + Shape input{3, 4}; + Shape indices{1, 2, 1}; + int32_t axis = 1; + Shape expected{3, 1, 2, 1}; + check(input, indices, expected, axis); + } + + // check for 3-D, 2-D, axis 0 + { + Shape input{2, 3, 4}; + Shape indices{1, 2}; + int32_t axis = 0; + Shape expected{1, 2, 3, 4}; + check(input, indices, expected, axis); + } + + // check for 3-D, 2-D, axis 2 + { + Shape input{2, 3, 4}; + Shape indices{2, 1}; + int32_t axis = 2; + Shape expected{2, 3, 2, 1}; + check(input, indices, expected, axis); + } + + // check for 4D, axis 0 + { + Shape input{1, 2, 3, 4}; + Shape indices{2}; + int32_t axis = 0; + Shape expected{2, 2, 3, 4}; + check(input, indices, expected, axis); + } +} + +TEST(ShapeInference, BCQFullyConnected) +{ + auto check = [&](Shape &in_shape, Shape &cluster_shape, std::vector<int> cluster, + Shape &expected) { + auto 
actual = onert::shape_inference::inferBCQFullyConnectedShape(in_shape, cluster_shape, + cluster.data()); + ASSERT_EQ(actual.rank(), expected.rank()); + + for (int32_t dim = 0; dim < expected.rank(); dim++) + ASSERT_EQ(actual.dim(dim), expected.dim(dim)); + }; + + { + Shape in_shape{10, 1}; + Shape cluster_shape{3, 2}; + std::vector<int> cluster = {1, 10, 2, 10, 3, 10}; + + Shape expected{30, 1}; + check(in_shape, cluster_shape, cluster, expected); + } + + { + Shape in_shape{1, 1}; + Shape cluster_shape{1, 2}; + std::vector<int> cluster = {3, 50}; + + Shape expected{50, 1}; + check(in_shape, cluster_shape, cluster, expected); + } +} + +TEST(ShapeInference, BCQGather) +{ + auto check = [&](Shape &indices_shape, Shape &cluster_shape, std::vector<int> cluster, + uint32_t hidden_size, uint32_t axis, int rank, Shape &expected) { + operation::BCQGather::Param param{hidden_size, axis}; + auto actual = onert::shape_inference::inferBCQGatherShape(indices_shape, cluster_shape, + cluster.data(), rank, param); + ASSERT_EQ(actual.rank(), expected.rank()); + + for (int32_t dim = 0; dim < expected.rank(); dim++) + ASSERT_EQ(actual.dim(dim), expected.dim(dim)); + }; + + { + Shape indices_shape{5, 1}; + Shape cluster_shape{3, 2}; + std::vector<int> cluster = {1, 10, 2, 10, 3, 10}; + uint32_t hidden_size = 10; + uint32_t axis = 0; + int rank = 2; + + Shape expected{5, 1, 10}; + check(indices_shape, cluster_shape, cluster, hidden_size, axis, rank, expected); + } + + { + Shape indices_shape{5, 1}; + Shape cluster_shape{3, 2}; + std::vector<int> cluster = {1, 10, 2, 10, 3, 10}; + uint32_t hidden_size = 10; + uint32_t axis = 1; + int rank = 2; + + Shape expected{30, 5, 1}; + check(indices_shape, cluster_shape, cluster, hidden_size, axis, rank, expected); } } |
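The loader change running through the circle_loader.cc and tflite_loader.cc hunks above replaces the CRTP-style BaseLoader<LoaderDomain, SpecificLoader> with a plain BaseLoader<LoaderDomain> whose loadSubgraph is a private virtual override, so loadModel no longer needs the static_cast to the derived loader. Below is a minimal, self-contained C++ sketch of that pattern only; Graph, SubGraphHandle, CircleDomain, and the member names other than loadSubgraph/loadModel are stand-ins for illustration, not the actual onert declarations.

#include <memory>
#include <vector>

struct Graph {};          // stand-in for ir::Graph
struct SubGraphHandle {}; // stand-in for the flatbuffer SubGraph pointer

template <typename LoaderDomain> class BaseLoader
{
public:
  virtual ~BaseLoader() = default;

  void loadModel()
  {
    // Dispatch through the virtual; this replaces the previous
    // static_cast<SpecificLoader *>(this)->loadSubgraph(...) call.
    for (const auto &handle : _subgraph_handles)
      _subgraphs.push_back(loadSubgraph(handle));
  }

private:
  virtual std::unique_ptr<Graph> loadSubgraph(const SubGraphHandle &handle) = 0;

protected:
  std::vector<SubGraphHandle> _subgraph_handles;
  std::vector<std::unique_ptr<Graph>> _subgraphs;
};

struct CircleDomain {}; // stand-in for the circle LoaderDomain

class CircleLoader final : public BaseLoader<CircleDomain>
{
private:
  // Private override, mirroring the "private:" section added in circle_loader.cc.
  std::unique_ptr<Graph> loadSubgraph(const SubGraphHandle &) override
  {
    return std::make_unique<Graph>(); // the real loader fills tensors and operations here
  }
};

int main()
{
  CircleLoader loader;
  loader.loadModel(); // no subgraph handles registered in this sketch, so this is a no-op
  return 0;
}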
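The execution.cc and ANeuralNetworksExecution.cc hunks above add handling for omitted optional inputs (buffer == nullptr, length == 0, type == nullptr): IsOptionalInput reports an operand as optional when every dimension registered for it in the model is zero, and setOptionalInput then rebuilds an operand type from that model shape before forwarding to setInput. The following is a small stand-alone sketch of the dimension check only, with Shape as a stand-in for onert::ir::Shape.

#include <cassert>
#include <cstdint>
#include <vector>

// Stand-in for onert::ir::Shape, just enough for the check below.
struct Shape
{
  std::vector<int32_t> dims;
  int32_t rank() const { return static_cast<int32_t>(dims.size()); }
  int32_t dim(int32_t i) const { return dims[i]; }
};

// Mirrors ANeuralNetworksExecution::IsOptionalInput: an operand counts as an
// omitted optional input only if every one of its dimensions is 0.
bool isOptionalInput(const Shape &operand_shape)
{
  for (int32_t i = 0; i < operand_shape.rank(); ++i)
  {
    if (operand_shape.dim(i) != 0)
      return false;
  }
  return true;
}

int main()
{
  assert(isOptionalInput(Shape{{0}}));        // e.g. an omitted 1-D operand such as Transpose's permutation
  assert(!isOptionalInput(Shape{{1, 3, 3}})); // a regular, fully specified operand
  return 0;
}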