author    Chunseok Lee <chunseok.lee@samsung.com>  2022-04-15 19:15:11 +0900
committer Chunseok Lee <chunseok.lee@samsung.com>  2022-04-15 19:15:11 +0900
commit    3ad689f0803519e343c36d5700646e86059df961 (patch)
tree      862346c401a5577518fa7f042532aa931b53aa0e /runtime/onert/backend
parent    ac6e4dd7b480e83b586ef533d7b29a8a97eb48fe (diff)
Imported Upstream version 1.20.0 (tags: upstream/1.20.0, submit/tizen/20220415.103159)
Diffstat (limited to 'runtime/onert/backend')
170 files changed, 2266 insertions, 24518 deletions
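The bulk of this import is a deduplication refactor: the hand-written BackendContext classes in acl_cl, acl_neon, and gpu_cl collapse into shared templates in the new cl_common library (plus acl_common::AclBackendContext for the two ACL backends). As orientation before the diff itself, a compressed sketch of that pattern — the type names mirror the real ones, but this is an illustration, not the actual headers:

    // Sketch: one template-parameterized common context replaces per-backend copies.
    #include <iostream>
    #include <memory>

    // Stands in for onert::backend::cl_common::BackendContext.
    template <typename T_TensorBuilder, typename T_KernelGenerator> struct CommonBackendContext
    {
      std::shared_ptr<T_TensorBuilder> tensor_builder;
      std::shared_ptr<T_KernelGenerator> kernel_gen;

      void genKernels() { kernel_gen->generate(); } // shared logic lives here once
    };

    // A backend supplies only its concrete types...
    struct TensorBuilder { /* backend-specific tensor planning */ };
    struct KernelGenerator
    {
      void generate() { std::cout << "generate acl_cl kernels\n"; }
    };

    // ...and its BackendContext becomes an alias instead of a duplicated class:
    using BackendContext = CommonBackendContext<TensorBuilder, KernelGenerator>;

    int main()
    {
      BackendContext ctx{std::make_shared<TensorBuilder>(), std::make_shared<KernelGenerator>()};
      ctx.genKernels();
      return 0;
    }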
diff --git a/runtime/onert/backend/CMakeLists.txt b/runtime/onert/backend/CMakeLists.txt index 4b21e0ace..c43160ba7 100644 --- a/runtime/onert/backend/CMakeLists.txt +++ b/runtime/onert/backend/CMakeLists.txt @@ -1,9 +1,14 @@ +# Backend common libs set(LIB_ONERT_BACKEND_ACL_COMMON onert_backend_acl_common) +set(LIB_ONERT_BACKEND_CL_COMMON onert_backend_cl_common) +add_subdirectory(cl_common) +add_subdirectory(acl_common) +# Backends add_subdirectory(cpu) add_subdirectory(acl_cl) add_subdirectory(acl_neon) -add_subdirectory(acl_common) add_subdirectory(ruy) add_subdirectory(gpu_cl) add_subdirectory(xnnpack) +add_subdirectory(trix) diff --git a/runtime/onert/backend/acl_cl/BackendContext.cc b/runtime/onert/backend/acl_cl/BackendContext.cc deleted file mode 100644 index 5595043ca..000000000 --- a/runtime/onert/backend/acl_cl/BackendContext.cc +++ /dev/null @@ -1,242 +0,0 @@ -/* - * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "BackendContext.h" - -#include "TensorBuilder.h" -#include "KernelGenerator.h" -#include "Optimizer.h" -#include "util/logging.h" -#include "ir/Index.h" -#include "ir/OperandIndexMap.h" -#include "ir/OperandIndexSequence.h" - -namespace onert -{ -namespace backend -{ -namespace acl_cl -{ - -void BackendContext::initConsts() -{ - _data.graph->operations().iterate([&](const ir::OperationIndex &, const ir::Operation &op) { - constant_initializer->setLayout(graph()->layout()); - op.accept(*constant_initializer); - }); - - _data.graph->operands().iterate([&](const ir::OperandIndex &ind, const ir::Operand &operand) { - if (_data.external_operands.contains(ind) || !operand.isConstant()) - return; - const auto &obj = graph()->operands().at(ind); - if (obj.isConstant() && !constant_initializer->exist(ind)) - { - constant_initializer->registerDefaultInitializer(ind, obj); - } - }); - - constant_initializer->run(); -} - -void BackendContext::planTensors() -{ - ir::OperandIndexMap<uint32_t> uses_map; - ir::OperandIndexMap<uint32_t> def_map; - ir::OperandIndexSequence constants; - - // Prepare scanning - _data.graph->operands().iterate([&](const ir::OperandIndex &ind, const ir::Operand &obj) { - if (_data.external_operands.contains(ind)) - return; - - uses_map[ind] = obj.getUses().size(); - def_map[ind] = obj.getDef().valid() ? 1 : 0; - - if (obj.isConstant()) - constants.append(ind); - - if (!tensor_builder->isRegistered(ind)) - { - // These tensors do not exist in any operation (No use and def) - const auto info = obj.info(); - const auto layout = _data.operand_layouts.at(ind); - // TODO Change tensor info to have permuted shape - tensor_builder->registerTensorInfo(ind, info, layout); - } - }); - - // Start scanning to do notify{First|Last}Use for each tensor - - // If a tensor is a constant, increase the use of the tensor and allocate it first. - // Increasing use count here makes the tensor never be deallocated, i.e it they will be - // deallocated last. 
- VERBOSE(planTensors) << "TENSORS as CONSTANT" << std::endl; - for (const auto &ind : constants) - { - uses_map[ind]++; - tensor_builder->notifyFirstUse(ind); - } - - // At each operation, - // 1. Scan DEF of outputs. If the DEF, allocate it - // 2. Scan DEF of inputs. If variable tensor, allocate it - // 3. Scan USE of inputs. Decrease the USE and deallocate if the USE is 0 - for (const auto op_ind : _data.op_order) - { - const auto &op = graph()->operations().at(op_ind); - auto op_inputs = op.getInputs() | ir::Remove::DUPLICATED | ir::Remove::UNDEFINED; - auto op_outputs = op.getOutputs() | ir::Remove::DUPLICATED | ir::Remove::UNDEFINED; - - // Define outputs - for (const auto &ind : op_outputs) - { - if (!tensor_builder->isRegistered(ind)) - continue; - assert(def_map.find(ind) != def_map.end()); - if (def_map[ind]) - { - def_map[ind] = 0; - tensor_builder->notifyFirstUse(ind); - } - } - - // Scan variable tensors - // This tensor has features like constant. But OperandInfo and LowerInfo treat them as - // non-constant because of less memory usage by memory planning in here - for (const auto &ind : op_inputs) - { - if (!tensor_builder->isRegistered(ind)) - continue; - const auto &operand = graph()->operands().at(ind); - if (operand.info().isVariable()) - { - // The variable tensor with buffer is not supported yet - assert(operand.data() == nullptr); - assert(operand.getUses().size() == 1 && !operand.getDef().valid()); - assert(uses_map[ind] == 1 && def_map[ind] == 0); - tensor_builder->notifyFirstUse(ind); - } - } - - for (const auto &ind : op_inputs) - { - if (!tensor_builder->isRegistered(ind)) - continue; - assert(uses_map.find(ind) != uses_map.end()); - assert(uses_map[ind] > 0); - uses_map[ind]--; - if (uses_map[ind] == 0) - { - // plan for deallocation of static tensornode - tensor_builder->notifyLastUse(ind); - } - } - } - - _data.graph->operands().iterate([&](const ir::OperandIndex &ind, const ir::Operand &) { - if (uses_map[ind] == 0) - { - tensor_builder->notifyLastUse(ind); - } - }); - - // Dispose and validate - for (const auto &ind : constants) - { - --uses_map[ind]; - if (uses_map[ind] == 0) // To prevent notifyLastUse from being called twice - { - tensor_builder->notifyLastUse(ind); - } - } - - assert( - std::all_of(uses_map.begin(), uses_map.end(), - [](std::pair<const ir::OperandIndex, uint32_t> it) { return it.second == 0; })); - - assert( - std::all_of(def_map.begin(), def_map.end(), - [](std::pair<const ir::OperandIndex, uint32_t> it) { return it.second == 0; })); -} - -ITensorRegistry *BackendContext::genTensors() -{ - optimizer->optimize(); - - graph()->operands().iterate([&](const ir::OperandIndex &ind, const ir::Operand &obj) { - if (external_operands().contains(ind)) - return; - - const auto frontend_layout = graph()->layout(); - const auto backend_layout = operand_layouts().at(ind); - ir::OperandInfo backend_info{permuteShape(obj.shape(), frontend_layout, backend_layout), - obj.typeInfo(), obj.info().memAllocType(), obj.isConstant()}; - tensor_builder->registerTensorInfo(ind, backend_info, backend_layout); - }); - - // TODO Get compiler options from compiler, and use it rather than getting it from Env - if (util::getConfigString(util::config::EXECUTOR) == "Linear") - { - planTensors(); - } - else - { - // For the executors that does not have fixed linear execution order: - // To make tensors never be deallocated, this is a workaround to use static memory planner - graph()->operands().iterate([&](const ir::OperandIndex &ind, const ir::Operand &) { - if 
(tensor_builder->isRegistered(ind)) - tensor_builder->notifyFirstUse(ind); - }); - } - - tensor_builder->prepare(); - - return tensor_registry.get(); -} - -FunctionMap BackendContext::genKernels() -{ - FunctionMap ret; - - for (auto op_ind : _data.op_order) - { - auto fn_seq = kernel_gen->generate(op_ind); - ret.emplace_back(op_ind, std::move(fn_seq)); - } - - tensor_builder->allocate(); - initConsts(); - - // NOTE For memory optimization, we want to free some operand data - const_cast<ir::Graph &>(*_data.graph) - .operands() - .iterate([&](const ir::OperandIndex &, ir::Operand &obj) { obj.releaseData(); }); - - for (auto &it : ret) - { - auto &fn_seq = it.second; - fn_seq->iterate([&](exec::IFunction &ifunc) { - ifunc.prepare(); - tensor_builder->postFunctionPrepare(); - }); - } - - return ret; -} - -} // namespace acl_cl -} // namespace backend -} // namespace onert diff --git a/runtime/onert/backend/acl_cl/BackendContext.h b/runtime/onert/backend/acl_cl/BackendContext.h index 2638046ca..5da915825 100644 --- a/runtime/onert/backend/acl_cl/BackendContext.h +++ b/runtime/onert/backend/acl_cl/BackendContext.h @@ -17,10 +17,11 @@ #ifndef __ONERT_BACKEND_ACL_CL_BACKEND_CONTEXT_H__ #define __ONERT_BACKEND_ACL_CL_BACKEND_CONTEXT_H__ -#include <backend/BackendContext.h> -#include "TensorBuilder.h" +#include <AclBackendContext.h> + #include "ConstantInitializer.h" #include "KernelGenerator.h" +#include "TensorBuilder.h" namespace onert { @@ -31,33 +32,8 @@ namespace acl_cl class Optimizer; -class BackendContext : public onert::backend::BackendContext -{ -public: - BackendContext(const Backend *backend, ContextData &&data, - std::shared_ptr<ITensorRegistry> tensor_registry = nullptr, - std::shared_ptr<TensorBuilder> tensor_builder = nullptr, - std::shared_ptr<ConstantInitializer> constant_initializer = nullptr, - std::shared_ptr<KernelGenerator> kernel_gen = nullptr) - : onert::backend::BackendContext(backend, std::move(data), tensor_registry), - tensor_builder{tensor_builder}, constant_initializer{constant_initializer}, kernel_gen{ - kernel_gen} - { - } - - ITensorRegistry *genTensors() override; - FunctionMap genKernels() override; - -private: - void initConsts(); - void planTensors(); - -public: - std::shared_ptr<TensorBuilder> tensor_builder; - std::shared_ptr<ConstantInitializer> constant_initializer; - std::shared_ptr<KernelGenerator> kernel_gen; - std::shared_ptr<Optimizer> optimizer; -}; +using BackendContext = + acl_common::AclBackendContext<TensorBuilder, ConstantInitializer, KernelGenerator, Optimizer>; } // namespace acl_cl } // namespace backend diff --git a/runtime/onert/backend/acl_cl/ConstantInitializer.cc b/runtime/onert/backend/acl_cl/ConstantInitializer.cc index 54b2a7a08..0431bb198 100644 --- a/runtime/onert/backend/acl_cl/ConstantInitializer.cc +++ b/runtime/onert/backend/acl_cl/ConstantInitializer.cc @@ -58,21 +58,7 @@ void ConstantInitializer::visit(const ir::operation::SpaceToBatchND &node) if (block_size_obj.isConstant()) { - _init_map[block_size_index] = [](const ir::Operand &model_obj, backend::ITensor &obj) { - assert(model_obj.data()); - const auto &shape = model_obj.shape(); - const auto base = reinterpret_cast<const int32_t *>(model_obj.data()->base()); - assert(model_obj.shape().rank() == 1); - obj.access([&](ITensor &tensor) { - for (size_t i = 0; i < shape.num_elements(); ++i) - { - const int32_t value = base[shape.num_elements() - i - 1]; - int32_t *into = reinterpret_cast<int32_t *>(tensor.buffer() + - tensor.calcOffset({static_cast<int32_t>(i)})); - *into = 
value; - } - }); - }; + _init_map[block_size_index] = acl_common::initReverseOrder<int32_t>; } const auto &paddings_index = node.getInputs().at(ir::operation::SpaceToBatchND::PADDINGS); diff --git a/runtime/onert/backend/acl_cl/Optimizer.cc b/runtime/onert/backend/acl_cl/Optimizer.cc index 12e805ee5..a9ce888ee 100644 --- a/runtime/onert/backend/acl_cl/Optimizer.cc +++ b/runtime/onert/backend/acl_cl/Optimizer.cc @@ -16,12 +16,12 @@ #include "Optimizer.h" -#include "ParentInfo.h" +#include <AclSubTensorAnalyzer.h> -#include <cassert> #include <compiler/LoweredGraph.h> #include <util/logging.h> -#include "AclSubTensorAnalyzer.h" + +#include <cassert> namespace onert { diff --git a/runtime/onert/backend/acl_common/AclBackendContext.h b/runtime/onert/backend/acl_common/AclBackendContext.h new file mode 100644 index 000000000..b8d027476 --- /dev/null +++ b/runtime/onert/backend/acl_common/AclBackendContext.h @@ -0,0 +1,106 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __ONERT_BACKEND_ACL_COMMON_ACLBACKEND_CONTEXT_H__ +#define __ONERT_BACKEND_ACL_COMMON_ACLBACKEND_CONTEXT_H__ + +#include <backend/BackendContext.h> +#include <ir/Index.h> +#include <ir/OperandIndexMap.h> +#include <ir/OperandIndexSequence.h> +#include <util/logging.h> + +#include <cl_common/BackendContext.h> + +namespace onert +{ +namespace backend +{ +namespace acl_common +{ + +// TODO Find better way to handle common code (reduce template) +template <typename T_TensorBuilder, typename T_ConstantInitializer, typename T_KernelGenerator, + typename T_Optimizer> +class AclBackendContext + : public onert::backend::cl_common::BackendContext<T_TensorBuilder, T_ConstantInitializer, + T_KernelGenerator> +{ +public: + AclBackendContext(const Backend *backend, ContextData &&data, + std::shared_ptr<ITensorRegistry> tensor_registry = nullptr, + std::shared_ptr<T_TensorBuilder> tensor_builder = nullptr, + std::shared_ptr<T_ConstantInitializer> constant_initializer = nullptr, + std::shared_ptr<T_KernelGenerator> kernel_gen = nullptr) + : onert::backend::cl_common::BackendContext<T_TensorBuilder, T_ConstantInitializer, + T_KernelGenerator>( + backend, std::move(data), tensor_registry, tensor_builder, constant_initializer, kernel_gen) + { + // DO NOTHING + } + + ITensorRegistry *genTensors() override + { + optimizer->optimize(); + + this->graph()->operands().iterate([&](const ir::OperandIndex &ind, const ir::Operand &obj) { + if (this->external_operands().contains(ind)) + return; + + const auto frontend_layout = this->graph()->layout(); + const auto backend_layout = this->operand_layouts().at(ind); + ir::OperandInfo backend_info{permuteShape(obj.shape(), frontend_layout, backend_layout), + obj.typeInfo(), obj.info().memAllocType(), obj.isConstant()}; + this->tensor_builder->registerTensorInfo(ind, backend_info, backend_layout); + }); + + // TODO Get compiler options from compiler, and use it rather than getting it from Env + if 
(util::getConfigString(util::config::EXECUTOR) == "Linear") + { + this->planTensors(); + } + else + { + // For the executors that does not have fixed linear execution order: + // To make tensors never be deallocated, this is a workaround to use static memory planner + this->graph()->operands().iterate([&](const ir::OperandIndex &ind, const ir::Operand &) { + if (this->tensor_builder->isRegistered(ind)) + this->tensor_builder->notifyFirstUse(ind); + }); + } + + this->tensor_builder->prepare(); + + return this->tensor_registry.get(); + } + +protected: + void registerTensorInfo(const ir::OperandIndex &ind, const ir::OperandInfo &info, + ir::Layout backend_layout) override + { + this->tensor_builder->registerTensorInfo(ind, info, backend_layout); + } + +public: + // TODO Make it private + std::shared_ptr<T_Optimizer> optimizer; +}; + +} // namespace acl_common +} // namespace backend +} // namespace onert + +#endif // __ONERT_BACKEND_ACL_COMMON_ACLBACKEND_CONTEXT_H__ diff --git a/runtime/onert/backend/acl_common/AclConstantInitializer.h b/runtime/onert/backend/acl_common/AclConstantInitializer.h index b7f66b50e..65659ad50 100644 --- a/runtime/onert/backend/acl_common/AclConstantInitializer.h +++ b/runtime/onert/backend/acl_common/AclConstantInitializer.h @@ -153,6 +153,23 @@ void permuteInit(const onert::ir::Operand &model_obj, onert::backend::ITensor &o Init<T>(model_obj, obj, copy, frontend_layout); } +// Pre-defined initializer - fill reverse order +template <typename T> void initReverseOrder(const ir::Operand &model_obj, backend::ITensor &obj) +{ + assert(model_obj.data()); + const auto &shape = model_obj.shape(); + const auto base = reinterpret_cast<const T *>(model_obj.data()->base()); + assert(model_obj.shape().rank() == 1); + obj.access([&](ITensor &tensor) { + for (size_t i = 0; i < shape.num_elements(); ++i) + { + const T value = base[shape.num_elements() - i - 1]; + T *into = reinterpret_cast<T *>(tensor.buffer() + tensor.calcOffset({static_cast<T>(i)})); + *into = value; + } + }); +} + class AclConstantInitializer : public ir::OperationVisitor { public: diff --git a/runtime/onert/backend/acl_common/AclSubTensorAnalyzer.h b/runtime/onert/backend/acl_common/AclSubTensorAnalyzer.h index 60f4ebf7e..a0bbe7c3c 100644 --- a/runtime/onert/backend/acl_common/AclSubTensorAnalyzer.h +++ b/runtime/onert/backend/acl_common/AclSubTensorAnalyzer.h @@ -17,9 +17,10 @@ #ifndef __ONERT_BACKEND_ACL_COMMON_ACL_SUB_TENSOR_ANALYZER_H__ #define __ONERT_BACKEND_ACL_COMMON_ACL_SUB_TENSOR_ANALYZER_H__ +#include <cl_common/ParentInfo.h> + #include <ir/OperationVisitor.h> #include <ir/Graph.h> -#include "ParentInfo.h" namespace onert { @@ -94,21 +95,21 @@ public: } coordinate_info.set(axis, axis_point); - _parent_map.emplace( - input_index, acl_common::ParentInfo{output_index, _current_op_layout, coordinate_info}); + _parent_map.emplace(input_index, + cl_common::ParentInfo{output_index, _current_op_layout, coordinate_info}); axis_point += input_shape.dim(axis); } } - std::unordered_map<ir::OperandIndex, ParentInfo> &&releaseParentMap() + std::unordered_map<ir::OperandIndex, cl_common::ParentInfo> &&releaseParentMap() { return std::move(_parent_map); } private: const ir::Graph &_graph; - std::unordered_map<ir::OperandIndex, ParentInfo> _parent_map; + std::unordered_map<ir::OperandIndex, cl_common::ParentInfo> _parent_map; ir::Layout _current_op_layout{ir::Layout::UNKNOWN}; bool usePadding{false}; }; diff --git a/runtime/onert/backend/acl_common/AclTensorBuilder.h 
b/runtime/onert/backend/acl_common/AclTensorBuilder.h index 7c1c5dd9a..e008fd6f5 100644 --- a/runtime/onert/backend/acl_common/AclTensorBuilder.h +++ b/runtime/onert/backend/acl_common/AclTensorBuilder.h @@ -17,18 +17,21 @@ #ifndef __ONERT_BACKEND_ACL_COMMON_TEMPL_TENSOR_BUILDER_H__ #define __ONERT_BACKEND_ACL_COMMON_TEMPL_TENSOR_BUILDER_H__ -#include <memory> -#include <queue> - -#include <arm_compute/core/Types.h> -#include "ir/OperandIndexMap.h" -#include <ir/Operands.h> #include "AclTensorManager.h" #include "AclTensorRegistry.h" -#include <memory> -#include "ParentInfo.h" + +#include <cl_common/LifetimeMap.h> +#include <cl_common/ParentInfo.h> + +#include <ir/OperandIndexMap.h> +#include <ir/Operands.h> #include <util/Utils.h> +#include <arm_compute/core/Types.h> + +#include <memory> +#include <queue> + namespace onert { namespace backend @@ -36,16 +39,12 @@ namespace backend namespace acl_common { -enum class UsesType -{ - FIRST, - LAST -}; - template <typename T_ITensor, typename T_Tensor, typename T_SubTensor> class AclTensorBuilder { public: using T_AclTensorManager = AclTensorManager<T_ITensor, T_Tensor, T_SubTensor>; + // TODO Remove this alias and direct usage of this type + using UsesType = cl_common::UsesType; AclTensorBuilder(const ir::Operands &operands, T_AclTensorManager *tensor_mgr); @@ -76,7 +75,7 @@ public: _uses_count_map[index] = num_uses; } - void parent_map(std::unordered_map<ir::OperandIndex, ParentInfo> &&parent_map) + void parent_map(std::unordered_map<ir::OperandIndex, cl_common::ParentInfo> &&parent_map) { _parent_map = std::move(parent_map); } @@ -104,10 +103,10 @@ private: std::unique_ptr<T_AclTensorManager> _tensor_mgr; // for linear executor - std::vector<std::pair<UsesType, ir::OperandIndex>> _lifetime_seq; + cl_common::LifetimeSeq _lifetime_seq; // Extra info for concat elimination - ir::OperandIndexMap<ParentInfo> _parent_map; + ir::OperandIndexMap<cl_common::ParentInfo> _parent_map; }; } // namespace acl_common @@ -217,55 +216,7 @@ void AclTensorBuilder<T_ITensor, T_Tensor, T_SubTensor>::prepare(void) template <typename T_ITensor, typename T_Tensor, typename T_SubTensor> void AclTensorBuilder<T_ITensor, T_Tensor, T_SubTensor>::allocate(void) { - // Update lifetime sequence to apply subtensor optimization - - std::unordered_map<ir::OperandIndex, ir::OperandIndex> root_map; - std::function<ir::OperandIndex &(ir::OperandIndex)> find_root = - [&](ir::OperandIndex ind) -> ir::OperandIndex & { - ir::OperandIndex &ret = root_map[ind]; - - // We know the root parent value already - if (ret.valid()) - return ret; - - auto itr = _parent_map.find(ind); - if (itr == _parent_map.end()) - { - // If there is no parent, let's store the value of itself - return ret = ind; - } - else - { - return ret = find_root(itr->second.parent); - } - }; - - ir::OperandIndexMap<bool> first_use_check; - ir::OperandIndexMap<bool> last_use_check; - std::map<size_t, std::pair<UsesType, ir::OperandIndex>> lifetime_map; - for (size_t i = 0; i < _lifetime_seq.size(); i++) - { - auto &entry = _lifetime_seq[i]; - if (entry.first != UsesType::FIRST) - continue; - auto root_ind = find_root(entry.second); - if (first_use_check[root_ind]) - continue; - first_use_check[root_ind] = true; - lifetime_map[i] = {UsesType::FIRST, root_ind}; - } - - for (int i = _lifetime_seq.size() - 1; i >= 0; i--) - { - auto &entry = _lifetime_seq[i]; - if (entry.first != UsesType::LAST) - continue; - auto root_ind = find_root(entry.second); - if (last_use_check[root_ind]) - continue; - last_use_check[root_ind] = 
true; - lifetime_map[i] = {UsesType::LAST, root_ind}; - } + auto lifetime_map = cl_common::createLifetimeMap(_lifetime_seq, _parent_map); for (auto &entry : lifetime_map) { diff --git a/runtime/onert/backend/acl_common/CMakeLists.txt b/runtime/onert/backend/acl_common/CMakeLists.txt index d3ae5acf7..8d409a47c 100644 --- a/runtime/onert/backend/acl_common/CMakeLists.txt +++ b/runtime/onert/backend/acl_common/CMakeLists.txt @@ -12,6 +12,7 @@ target_include_directories(${LIB_ONERT_BACKEND_ACL_COMMON} PUBLIC ${CMAKE_CURREN target_link_libraries(${LIB_ONERT_BACKEND_ACL_COMMON} PUBLIC onert_core) target_link_libraries(${LIB_ONERT_BACKEND_ACL_COMMON} PUBLIC arm_compute arm_compute_ex) target_link_libraries(${LIB_ONERT_BACKEND_ACL_COMMON} PUBLIC nnfw_lib_misc) +target_link_libraries(${LIB_ONERT_BACKEND_ACL_COMMON} PUBLIC ${LIB_ONERT_BACKEND_CL_COMMON}) target_link_libraries(${LIB_ONERT_BACKEND_ACL_COMMON} PRIVATE nnfw_common) target_link_libraries(${LIB_ONERT_BACKEND_ACL_COMMON} PRIVATE nnfw_coverage) diff --git a/runtime/onert/backend/acl_neon/BackendContext.cc b/runtime/onert/backend/acl_neon/BackendContext.cc deleted file mode 100644 index 4de3de02d..000000000 --- a/runtime/onert/backend/acl_neon/BackendContext.cc +++ /dev/null @@ -1,243 +0,0 @@ -/* - * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "BackendContext.h" - -#include "TensorBuilder.h" -#include "KernelGenerator.h" -#include "Optimizer.h" -#include "util/logging.h" -#include "ir/Index.h" -#include "ir/OperandIndexMap.h" -#include "ir/OperandIndexSequence.h" - -namespace onert -{ -namespace backend -{ -namespace acl_neon -{ - -void BackendContext::initConsts() -{ - _data.graph->operations().iterate([&](const ir::OperationIndex &, const ir::Operation &op) { - constant_initializer->setLayout(graph()->layout()); - op.accept(*constant_initializer); - }); - - _data.graph->operands().iterate([&](const ir::OperandIndex &ind, const ir::Operand &operand) { - if (_data.external_operands.contains(ind) || !operand.isConstant()) - return; - const auto &obj = graph()->operands().at(ind); - if (obj.isConstant() && !constant_initializer->exist(ind)) - { - constant_initializer->registerDefaultInitializer(ind, obj); - } - }); - - constant_initializer->run(); -} - -void BackendContext::planTensors() -{ - ir::OperandIndexMap<uint32_t> uses_map; - ir::OperandIndexMap<uint32_t> def_map; - ir::OperandIndexSequence constants; - - // Prepare scanning - _data.graph->operands().iterate([&](const ir::OperandIndex &ind, const ir::Operand &obj) { - if (_data.external_operands.contains(ind)) - return; - - uses_map[ind] = obj.getUses().size(); - def_map[ind] = obj.getDef().valid() ? 
1 : 0; - - if (obj.isConstant()) - constants.append(ind); - - if (!tensor_builder->isRegistered(ind)) - { - // These tensors do not exist in any operation (No use and def) - const auto info = obj.info(); - const auto layout = _data.operand_layouts.at(ind); - // TODO Change tensor info to have permuted shape - tensor_builder->registerTensorInfo(ind, info, layout); - } - }); - - // Start scanning to do notify{First|Last}Use for each tensor - - // If a tensor is a constant, increase the use of the tensor and allocate it first. - // Increasing use count here makes the tensor never be deallocated, i.e it they will be - // deallocated last. - VERBOSE(planTensors) << "TENSORS as CONSTANT" << std::endl; - for (const auto &ind : constants) - { - uses_map[ind]++; - tensor_builder->notifyFirstUse(ind); - } - - // At each operation, - // 1. Scan DEF of outputs. If the DEF, allocate it - // 2. Scan DEF of inputs. If variable tensor, allocate it - // 3. Scan USE of inputs. Decrease the USE and deallocate if the USE is 0 - for (const auto op_ind : _data.op_order) - { - auto op_inputs = - graph()->operations().at(op_ind).getInputs() | ir::Remove::DUPLICATED | ir::Remove::UNDEFINED; - auto op_outputs = graph()->operations().at(op_ind).getOutputs() | ir::Remove::DUPLICATED | - ir::Remove::UNDEFINED; - - // Define outputs - for (const auto &ind : op_outputs) - { - if (!tensor_builder->isRegistered(ind)) - continue; - assert(def_map.find(ind) != def_map.end()); - if (def_map[ind]) - { - def_map[ind] = 0; - tensor_builder->notifyFirstUse(ind); - } - } - - // Scan variable tensors - // This tensor has features like constant. But OperandInfo and LowerInfo treat them as - // non-constant because of less memory usage by memory planning in here - for (const auto &ind : op_inputs) - { - if (!tensor_builder->isRegistered(ind)) - continue; - const auto &operand = graph()->operands().at(ind); - if (operand.info().isVariable()) - { - // The variable tensor with buffer is not supported yet - assert(operand.data() == nullptr); - assert(operand.getUses().size() == 1 && !operand.getDef().valid()); - assert(uses_map[ind] == 1 && def_map[ind] == 0); - tensor_builder->notifyFirstUse(ind); - } - } - - for (const auto &ind : op_inputs) - { - if (!tensor_builder->isRegistered(ind)) - continue; - assert(uses_map.find(ind) != uses_map.end()); - assert(uses_map[ind] > 0); - uses_map[ind]--; - if (uses_map[ind] == 0) - { - // plan for deallocation of static tensornode - tensor_builder->notifyLastUse(ind); - } - } - } - - _data.graph->operands().iterate([&](const ir::OperandIndex &ind, const ir::Operand &) { - if (uses_map[ind] == 0) - { - tensor_builder->notifyLastUse(ind); - } - }); - - // Dispose and validate - for (const auto &ind : constants) - { - --uses_map[ind]; - if (uses_map[ind] == 0) // To prevent notifyLastUse from being called twice - { - tensor_builder->notifyLastUse(ind); - } - } - - assert( - std::all_of(uses_map.begin(), uses_map.end(), - [](std::pair<const ir::OperandIndex, uint32_t> it) { return it.second == 0; })); - - assert( - std::all_of(def_map.begin(), def_map.end(), - [](std::pair<const ir::OperandIndex, uint32_t> it) { return it.second == 0; })); -} - -ITensorRegistry *BackendContext::genTensors() -{ - optimizer->optimize(); - - graph()->operands().iterate([&](const ir::OperandIndex &ind, const ir::Operand &obj) { - if (external_operands().contains(ind)) - return; - - const auto frontend_layout = graph()->layout(); - const auto backend_layout = operand_layouts().at(ind); - ir::OperandInfo 
backend_info{permuteShape(obj.shape(), frontend_layout, backend_layout), - obj.typeInfo(), obj.info().memAllocType(), obj.isConstant()}; - tensor_builder->registerTensorInfo(ind, backend_info, backend_layout); - }); - - // TODO Get compiler options from compiler, and use it rather than getting it from Env - if (util::getConfigString(util::config::EXECUTOR) == "Linear") - { - planTensors(); - } - else - { - // For the executors that does not have fixed linear execution order: - // To make tensors never be deallocated, this is a workaround to use static memory planner - graph()->operands().iterate([&](const ir::OperandIndex &ind, const ir::Operand &) { - if (tensor_builder->isRegistered(ind)) - tensor_builder->notifyFirstUse(ind); - }); - } - - tensor_builder->prepare(); - - return tensor_registry.get(); -} - -FunctionMap BackendContext::genKernels() -{ - FunctionMap ret; - - for (auto op_ind : _data.op_order) - { - auto fn_seq = kernel_gen->generate(op_ind); - ret.emplace_back(op_ind, std::move(fn_seq)); - } - - tensor_builder->allocate(); - initConsts(); - - // NOTE For memory optimization, we want to free some operand data - const_cast<ir::Graph &>(*_data.graph) - .operands() - .iterate([&](const ir::OperandIndex &, ir::Operand &obj) { obj.releaseData(); }); - - for (auto &it : ret) - { - auto &fn_seq = it.second; - fn_seq->iterate([&](exec::IFunction &ifunc) { - ifunc.prepare(); - tensor_builder->postFunctionPrepare(); - }); - } - - return ret; -} - -} // namespace acl_neon -} // namespace backend -} // namespace onert diff --git a/runtime/onert/backend/acl_neon/BackendContext.h b/runtime/onert/backend/acl_neon/BackendContext.h index 35d777f7b..b73dd188e 100644 --- a/runtime/onert/backend/acl_neon/BackendContext.h +++ b/runtime/onert/backend/acl_neon/BackendContext.h @@ -17,10 +17,11 @@ #ifndef __ONERT_BACKEND_ACL_NEON_BACKEND_CONTEXT_H__ #define __ONERT_BACKEND_ACL_NEON_BACKEND_CONTEXT_H__ -#include <backend/BackendContext.h> -#include "TensorBuilder.h" +#include <AclBackendContext.h> + #include "ConstantInitializer.h" #include "KernelGenerator.h" +#include "TensorBuilder.h" namespace onert { @@ -31,34 +32,8 @@ namespace acl_neon class Optimizer; -class BackendContext : public onert::backend::BackendContext -{ -public: - BackendContext(const Backend *backend, ContextData &&data, - std::shared_ptr<ITensorRegistry> tensor_registry = nullptr, - std::shared_ptr<TensorBuilder> tensor_builder = nullptr, - std::shared_ptr<ConstantInitializer> constant_initializer = nullptr, - std::shared_ptr<KernelGenerator> kernel_gen = nullptr) - : onert::backend::BackendContext(backend, std::move(data), tensor_registry), - tensor_builder{tensor_builder}, constant_initializer{constant_initializer}, kernel_gen{ - kernel_gen} - { - } - - ITensorRegistry *genTensors() override; - FunctionMap genKernels() override; - -private: - void initConsts(); - void planTensors(); - -public: - // TODO Make it private - std::shared_ptr<TensorBuilder> tensor_builder; - std::shared_ptr<ConstantInitializer> constant_initializer; - std::shared_ptr<KernelGenerator> kernel_gen; - std::shared_ptr<Optimizer> optimizer; -}; +using BackendContext = + acl_common::AclBackendContext<TensorBuilder, ConstantInitializer, KernelGenerator, Optimizer>; } // namespace acl_neon } // namespace backend diff --git a/runtime/onert/backend/acl_neon/ConstantInitializer.cc b/runtime/onert/backend/acl_neon/ConstantInitializer.cc index 35da7c952..1bd702756 100644 --- a/runtime/onert/backend/acl_neon/ConstantInitializer.cc +++ 
b/runtime/onert/backend/acl_neon/ConstantInitializer.cc @@ -37,21 +37,7 @@ void ConstantInitializer::visit(const ir::operation::SpaceToBatchND &node) if (block_size_obj.isConstant()) { - _init_map[block_size_index] = [](const ir::Operand &model_obj, backend::ITensor &obj) { - assert(model_obj.data()); - const auto &shape = model_obj.shape(); - const auto base = reinterpret_cast<const int32_t *>(model_obj.data()->base()); - assert(model_obj.shape().rank() == 1); - obj.access([&](ITensor &tensor) { - for (size_t i = 0; i < shape.num_elements(); ++i) - { - const int32_t value = base[shape.num_elements() - i - 1]; - int32_t *into = reinterpret_cast<int32_t *>(tensor.buffer() + - tensor.calcOffset({static_cast<int32_t>(i)})); - *into = value; - } - }); - }; + _init_map[block_size_index] = acl_common::initReverseOrder<int32_t>; } const auto &paddings_index = node.getInputs().at(ir::operation::SpaceToBatchND::PADDINGS); diff --git a/runtime/onert/backend/acl_neon/Optimizer.cc b/runtime/onert/backend/acl_neon/Optimizer.cc index 781103f9c..283edd174 100644 --- a/runtime/onert/backend/acl_neon/Optimizer.cc +++ b/runtime/onert/backend/acl_neon/Optimizer.cc @@ -16,12 +16,12 @@ #include "Optimizer.h" -#include "ParentInfo.h" +#include <AclSubTensorAnalyzer.h> -#include <cassert> #include <compiler/LoweredGraph.h> #include <util/logging.h> -#include "AclSubTensorAnalyzer.h" + +#include <cassert> namespace onert { diff --git a/runtime/onert/backend/cl_common/CMakeLists.txt b/runtime/onert/backend/cl_common/CMakeLists.txt new file mode 100644 index 000000000..c75129696 --- /dev/null +++ b/runtime/onert/backend/cl_common/CMakeLists.txt @@ -0,0 +1,7 @@ +file(GLOB_RECURSE SOURCES "src/*.cc") + +add_library(${LIB_ONERT_BACKEND_CL_COMMON} STATIC ${SOURCES}) + +target_include_directories(${LIB_ONERT_BACKEND_CL_COMMON} PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include) +set_target_properties(${LIB_ONERT_BACKEND_CL_COMMON} PROPERTIES POSITION_INDEPENDENT_CODE ON) +target_link_libraries(${LIB_ONERT_BACKEND_CL_COMMON} PUBLIC onert_core) diff --git a/runtime/onert/backend/cl_common/include/cl_common/BackendContext.h b/runtime/onert/backend/cl_common/include/cl_common/BackendContext.h new file mode 100644 index 000000000..7bb72d74e --- /dev/null +++ b/runtime/onert/backend/cl_common/include/cl_common/BackendContext.h @@ -0,0 +1,236 @@ +/* + * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
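Both ACL ConstantInitializer hunks above replace a per-backend lambda with the shared acl_common::initReverseOrder<T> helper. A minimal standalone sketch of the behavior that helper implements — raw buffers stand in for the real ir::Operand and ITensor interfaces:

    #include <cstddef>
    #include <cstdint>
    #include <iostream>

    // Element i of the destination gets element (n - 1 - i) of the source,
    // i.e. the 1-D constant is copied in reverse order.
    template <typename T> void initReverseOrder(const T *src, T *dst, std::size_t num_elements)
    {
      for (std::size_t i = 0; i < num_elements; ++i)
        dst[i] = src[num_elements - 1 - i];
    }

    int main()
    {
      const int32_t block_size[2] = {2, 3}; // e.g. a SpaceToBatchND BLOCK_SIZE operand
      int32_t reversed[2] = {0, 0};
      initReverseOrder(block_size, reversed, 2);
      std::cout << reversed[0] << " " << reversed[1] << "\n"; // prints "3 2"
      return 0;
    }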
+ */ + +#ifndef __ONERT_BACKEND_CL_COMMON_BACKEND_CONTEXT_H__ +#define __ONERT_BACKEND_CL_COMMON_BACKEND_CONTEXT_H__ + +#include <backend/BackendContext.h> +#include <ir/Index.h> +#include <ir/OperandIndexMap.h> +#include <ir/OperandIndexSequence.h> +#include <util/logging.h> + +namespace onert +{ +namespace backend +{ +namespace cl_common +{ + +// TODO Find better way to handle common code (reduce template) +template <typename T_TensorBuilder, typename T_ConstantInitializer, typename T_KernelGenerator> +class BackendContext : public onert::backend::BackendContext +{ +public: + BackendContext(const Backend *backend, ContextData &&data, + std::shared_ptr<ITensorRegistry> tensor_registry = nullptr, + std::shared_ptr<T_TensorBuilder> tensor_builder = nullptr, + std::shared_ptr<T_ConstantInitializer> constant_initializer = nullptr, + std::shared_ptr<T_KernelGenerator> kernel_gen = nullptr) + : onert::backend::BackendContext(backend, std::move(data), tensor_registry), + tensor_builder{tensor_builder}, constant_initializer{constant_initializer}, kernel_gen{ + kernel_gen} + { + } + + FunctionMap genKernels() override + { + FunctionMap ret; + + // kernel_gen + for (auto op_ind : _data.op_order) + { + auto fn_seq = kernel_gen->generate(op_ind); + ret.emplace_back(op_ind, std::move(fn_seq)); + } + + tensor_builder->allocate(); + initConsts(); + + // NOTE For memory optimization, we want to free some operand data + const_cast<ir::Graph &>(*_data.graph) + .operands() + .iterate([&](const ir::OperandIndex &, ir::Operand &obj) { obj.releaseData(); }); + + for (auto &it : ret) + { + auto &fn_seq = it.second; + fn_seq->iterate([&](exec::IFunction &ifunc) { + ifunc.prepare(); + tensor_builder->postFunctionPrepare(); + }); + } + + return ret; + } + +protected: + void initConsts() + { + _data.graph->operations().iterate([&](const ir::OperationIndex &, const ir::Operation &op) { + constant_initializer->setLayout(graph()->layout()); + op.accept(*constant_initializer); + }); + + _data.graph->operands().iterate([&](const ir::OperandIndex &ind, const ir::Operand &operand) { + if (_data.external_operands.contains(ind) || !operand.isConstant()) + return; + const auto &obj = graph()->operands().at(ind); + if (obj.isConstant() && !constant_initializer->exist(ind)) + { + constant_initializer->registerDefaultInitializer(ind, obj); + } + }); + + constant_initializer->run(); + } + + virtual void registerTensorInfo(const ir::OperandIndex &ind, const ir::OperandInfo &info, + ir::Layout backend_layout) = 0; + + void planTensors() + { + ir::OperandIndexMap<uint32_t> uses_map; + ir::OperandIndexMap<uint32_t> def_map; + ir::OperandIndexSequence constants; + + // Prepare scanning + _data.graph->operands().iterate([&](const ir::OperandIndex &ind, const ir::Operand &obj) { + if (_data.external_operands.contains(ind)) + return; + + uses_map[ind] = obj.getUses().size(); + def_map[ind] = obj.getDef().valid() ? 1 : 0; + + if (obj.isConstant()) + constants.append(ind); + + if (!tensor_builder->isRegistered(ind)) + { + // These tensors do not exist in any operation (No use and def) + const auto info = obj.info(); + const auto layout = _data.operand_layouts.at(ind); + // TODO Change tensor info to have permuted shape + registerTensorInfo(ind, info, layout); + } + }); + + // Start scanning to do notify{First|Last}Use for each tensor + + // If a tensor is a constant, increase the use of the tensor and allocate it first. + // Increasing use count here makes the tensor never be deallocated, i.e it they will be + // deallocated last. 
+ VERBOSE(planTensors) << "TENSORS as CONSTANT" << std::endl; + for (const auto &ind : constants) + { + uses_map[ind]++; + tensor_builder->notifyFirstUse(ind); + } + + // At each operation, + // 1. Scan DEF of outputs. If the DEF, allocate it + // 2. Scan DEF of inputs. If variable tensor, allocate it + // 3. Scan USE of inputs. Decrease the USE and deallocate if the USE is 0 + for (const auto op_ind : _data.op_order) + { + const auto &op = graph()->operations().at(op_ind); + auto op_inputs = op.getInputs() | ir::Remove::DUPLICATED | ir::Remove::UNDEFINED; + auto op_outputs = op.getOutputs() | ir::Remove::DUPLICATED | ir::Remove::UNDEFINED; + + // Define outputs + for (const auto &ind : op_outputs) + { + if (!tensor_builder->isRegistered(ind)) + continue; + assert(def_map.find(ind) != def_map.end()); + if (def_map[ind]) + { + def_map[ind] = 0; + tensor_builder->notifyFirstUse(ind); + } + } + + // Scan variable tensors + // This tensor has features like constant. But OperandInfo and LowerInfo treat them as + // non-constant because of less memory usage by memory planning in here + for (const auto &ind : op_inputs) + { + if (!tensor_builder->isRegistered(ind)) + continue; + const auto &operand = graph()->operands().at(ind); + if (operand.info().isVariable()) + { + // The variable tensor with buffer is not supported yet + assert(operand.data() == nullptr); + assert(operand.getUses().size() == 1 && !operand.getDef().valid()); + assert(uses_map[ind] == 1 && def_map[ind] == 0); + tensor_builder->notifyFirstUse(ind); + } + } + + for (const auto &ind : op_inputs) + { + if (!tensor_builder->isRegistered(ind)) + continue; + assert(uses_map.find(ind) != uses_map.end()); + assert(uses_map[ind] > 0); + uses_map[ind]--; + if (uses_map[ind] == 0) + { + // plan for deallocation of static tensornode + tensor_builder->notifyLastUse(ind); + } + } + } + + _data.graph->operands().iterate([&](const ir::OperandIndex &ind, const ir::Operand &) { + if (uses_map[ind] == 0) + { + tensor_builder->notifyLastUse(ind); + } + }); + + // Dispose and validate + for (const auto &ind : constants) + { + --uses_map[ind]; + if (uses_map[ind] == 0) // To prevent notifyLastUse from being called twice + { + tensor_builder->notifyLastUse(ind); + } + } + + assert( + std::all_of(uses_map.begin(), uses_map.end(), + [](std::pair<const ir::OperandIndex, uint32_t> it) { return it.second == 0; })); + + assert( + std::all_of(def_map.begin(), def_map.end(), + [](std::pair<const ir::OperandIndex, uint32_t> it) { return it.second == 0; })); + } + +public: + // TODO Make it protected + std::shared_ptr<T_TensorBuilder> tensor_builder; + std::shared_ptr<T_ConstantInitializer> constant_initializer; + std::shared_ptr<T_KernelGenerator> kernel_gen; +}; + +} // namespace cl_common +} // namespace backend +} // namespace onert + +#endif // __ONERT_BACKEND_CL_COMMON_BACKEND_CONTEXT_H__ diff --git a/runtime/onert/backend/cl_common/include/cl_common/LifetimeMap.h b/runtime/onert/backend/cl_common/include/cl_common/LifetimeMap.h new file mode 100644 index 000000000..5fe5eec79 --- /dev/null +++ b/runtime/onert/backend/cl_common/include/cl_common/LifetimeMap.h @@ -0,0 +1,51 @@ +/* + * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __ONERT_BACKEND_CL_COMMON_LIFETIME_MAP_H__ +#define __ONERT_BACKEND_CL_COMMON_LIFETIME_MAP_H__ + +#include "cl_common/ParentInfo.h" + +#include <ir/OperandIndexMap.h> + +#include <map> +#include <vector> + +namespace onert +{ +namespace backend +{ +namespace cl_common +{ + +// TODO Abstract UserType into LifetimeMap and LifetimeSeq +enum class UsesType +{ + FIRST, + LAST +}; + +// TODO Define class or struct for LifetimeMap and LifetimeSeq +using LifetimeMap = std::map<size_t, std::pair<UsesType, ir::OperandIndex>>; +using LifetimeSeq = std::vector<std::pair<UsesType, ir::OperandIndex>>; + +LifetimeMap createLifetimeMap(LifetimeSeq &seq, ir::OperandIndexMap<ParentInfo> &parent_map); + +} // namespace cl_common +} // namespace backend +} // namespace onert + +#endif // __ONERT_BACKEND_CL_COMMON_LIFETIME_MAP_H__ diff --git a/runtime/onert/backend/gpu_cl/ParentInfo.h b/runtime/onert/backend/cl_common/include/cl_common/ParentInfo.h index d7cb2d4fb..510211cb7 100644 --- a/runtime/onert/backend/gpu_cl/ParentInfo.h +++ b/runtime/onert/backend/cl_common/include/cl_common/ParentInfo.h @@ -14,8 +14,8 @@ * limitations under the License. */ -#ifndef __ONERT_BACKEND_PARENT_INFO_H__ -#define __ONERT_BACKEND_PARENT_INFO_H__ +#ifndef __ONERT_BACKEND_CL_COMMON_PARENT_INFO_H__ +#define __ONERT_BACKEND_CL_COMMON_PARENT_INFO_H__ #include <ir/Index.h> #include <ir/Coordinates.h> @@ -24,7 +24,7 @@ namespace onert { namespace backend { -namespace gpu_cl +namespace cl_common { /** @@ -37,8 +37,8 @@ struct ParentInfo ir::Coordinates coordinates; }; -} // namespace gpu_cl +} // namespace cl_common } // namespace backend } // namespace onert -#endif // __ONERT_BACKEND_ACL_COMMON_PARENT_INFO_H__ +#endif // __ONERT_BACKEND_CL_COMMON_PARENT_INFO_H__ diff --git a/runtime/onert/backend/cl_common/src/LifetimeMap.cc b/runtime/onert/backend/cl_common/src/LifetimeMap.cc new file mode 100644 index 000000000..0b17c58fb --- /dev/null +++ b/runtime/onert/backend/cl_common/src/LifetimeMap.cc @@ -0,0 +1,85 @@ +/* + * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
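LifetimeMap.h above only declares createLifetimeMap; its definition follows in LifetimeMap.cc below. A self-contained analogue of the de-duplication it performs, with plain ints standing in for ir::OperandIndex and the parent map reduced to child-to-parent ids — a sketch of the algorithm, not the onert API:

    #include <cstddef>
    #include <iostream>
    #include <map>
    #include <unordered_map>
    #include <utility>
    #include <vector>

    enum class UsesType { FIRST, LAST };
    using LifetimeSeq = std::vector<std::pair<UsesType, int>>;
    using LifetimeMap = std::map<std::size_t, std::pair<UsesType, int>>;

    // Resolve a sub-tensor to its root parent (identity when it has no parent).
    int findRoot(int ind, const std::unordered_map<int, int> &parent)
    {
      auto it = parent.find(ind);
      return it == parent.end() ? ind : findRoot(it->second, parent);
    }

    // Keep only the earliest FIRST and the latest LAST event per root tensor,
    // so a sub-tensor's lifetime folds into its parent's.
    LifetimeMap createLifetimeMap(const LifetimeSeq &seq,
                                  const std::unordered_map<int, int> &parent)
    {
      LifetimeMap out;
      std::unordered_map<int, bool> first_seen, last_seen;
      for (std::size_t i = 0; i < seq.size(); ++i)
      {
        if (seq[i].first != UsesType::FIRST)
          continue;
        const int root = findRoot(seq[i].second, parent);
        if (!first_seen[root])
        {
          first_seen[root] = true;
          out[i] = {UsesType::FIRST, root};
        }
      }
      for (std::size_t i = seq.size(); i-- > 0;)
      {
        if (seq[i].first != UsesType::LAST)
          continue;
        const int root = findRoot(seq[i].second, parent);
        if (!last_seen[root])
        {
          last_seen[root] = true;
          out[i] = {UsesType::LAST, root};
        }
      }
      return out;
    }

    int main()
    {
      // Operand 1 is a sub-tensor of operand 0, so its events fold into 0's.
      const std::unordered_map<int, int> parent{{1, 0}};
      const LifetimeSeq seq{
        {UsesType::FIRST, 0}, {UsesType::FIRST, 1}, {UsesType::LAST, 0}, {UsesType::LAST, 1}};
      for (const auto &e : createLifetimeMap(seq, parent))
        std::cout << e.first << ": " << (e.second.first == UsesType::FIRST ? "FIRST" : "LAST")
                  << " of root " << e.second.second << "\n";
      // Prints "0: FIRST of root 0" and "3: LAST of root 0".
      return 0;
    }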
+ */ + +#include "cl_common/LifetimeMap.h" + +#include <unordered_map> + +namespace onert +{ +namespace backend +{ +namespace cl_common +{ + +LifetimeMap createLifetimeMap(LifetimeSeq &lifetime_seq, + ir::OperandIndexMap<ParentInfo> &parent_map) +{ + // Update lifetime sequence to apply subtensor optimization + std::unordered_map<ir::OperandIndex, ir::OperandIndex> root_map; + std::function<ir::OperandIndex &(ir::OperandIndex)> find_root = + [&](ir::OperandIndex ind) -> ir::OperandIndex & { + ir::OperandIndex &ret = root_map[ind]; + + // We know the root parent value already + if (ret.valid()) + return ret; + + auto itr = parent_map.find(ind); + if (itr == parent_map.end()) + { + // If there is no parent, let's store the value of itself + return ret = ind; + } + else + { + return ret = find_root(itr->second.parent); + } + }; + + ir::OperandIndexMap<bool> first_use_check; + ir::OperandIndexMap<bool> last_use_check; + LifetimeMap lifetime_map; + for (size_t i = 0; i < lifetime_seq.size(); i++) + { + auto &entry = lifetime_seq[i]; + if (entry.first != UsesType::FIRST) + continue; + auto root_ind = find_root(entry.second); + if (first_use_check[root_ind]) + continue; + first_use_check[root_ind] = true; + lifetime_map[i] = {UsesType::FIRST, root_ind}; + } + + for (int i = lifetime_seq.size() - 1; i >= 0; i--) + { + auto &entry = lifetime_seq[i]; + if (entry.first != UsesType::LAST) + continue; + auto root_ind = find_root(entry.second); + if (last_use_check[root_ind]) + continue; + last_use_check[root_ind] = true; + lifetime_map[i] = {UsesType::LAST, root_ind}; + } + + return lifetime_map; +} + +} // namespace cl_common +} // namespace backend +} // namespace onert diff --git a/runtime/onert/backend/cpu/ops/OperationUtils.cc b/runtime/onert/backend/cpu/ops/OperationUtils.cc index 8ac875842..aa4ef352e 100644 --- a/runtime/onert/backend/cpu/ops/OperationUtils.cc +++ b/runtime/onert/backend/cpu/ops/OperationUtils.cc @@ -194,7 +194,7 @@ void CalculateActivationRangeQuantized(ir::Activation activation, const IPortabl } else { - std::cout << "Unsupported fused activation function." 
<< std::endl; + throw std::runtime_error{"Unsupported fused activation function."}; } } diff --git a/runtime/onert/backend/cpu/ops/OperationUtils.h b/runtime/onert/backend/cpu/ops/OperationUtils.h index ac2fbb84f..1fefc3228 100644 --- a/runtime/onert/backend/cpu/ops/OperationUtils.h +++ b/runtime/onert/backend/cpu/ops/OperationUtils.h @@ -18,19 +18,19 @@ #define __NNFW_SUPPORT_NNAPI_OPERATION_UTILS_H__ #include <backend/IPortableTensor.h> - -#include <cker/Shape.h> -#include <cker/Types.h> -#include <iostream> #include <ir/DataType.h> -#include <ir/InternalType.h> #include <ir/Operand.h> #include <ir/Padding.h> +#include <util/CalculateActivationRange.h> + +#include <cker/Shape.h> +#include <cker/Types.h> #include <limits> #include <vector> using OperandType = onert::ir::DataType; +using namespace onert::util; namespace onert { @@ -166,40 +166,6 @@ void GetQuantizedConvolutionMultipliersAndShifts( int num_channels, std::vector<int32_t> &per_channel_output_multiplier, std::vector<int> &per_channel_output_shift); -template <typename T> -void CalculateActivationRange(ir::Activation activation, T *activation_min, T *activation_max) -{ - if (activation == ir::Activation::RELU) - { - *activation_min = 0; - *activation_max = std::numeric_limits<T>::max(); - } - else if (activation == ir::Activation::RELU6) - { - *activation_min = 0; - *activation_max = 6; - } - else if (activation == ir::Activation::RELU1) - { - *activation_min = -1; - *activation_max = 1; - } - else if (activation == ir::Activation::SIGMOID) - { - *activation_min = 0; - *activation_max = 1; - } - else if (activation == ir::Activation::NONE) - { - *activation_min = std::numeric_limits<T>::lowest(); - *activation_max = std::numeric_limits<T>::max(); - } - else - { - std::cout << "Unsupported fused activation function." 
<< std::endl; - } -} - void CalculateActivationRangeQuantized(ir::Activation activation, const IPortableTensor *output, int32_t *act_min, int32_t *act_max); diff --git a/runtime/onert/backend/gpu_cl/Backend.h b/runtime/onert/backend/gpu_cl/Backend.h index dc0b8596c..d67ba1602 100644 --- a/runtime/onert/backend/gpu_cl/Backend.h +++ b/runtime/onert/backend/gpu_cl/Backend.h @@ -22,13 +22,13 @@ #include "BackendContext.h" #include "Config.h" -#include "ClTensorRegistry.h" +#include "TensorRegistry.h" #include "KernelGenerator.h" #include "TensorManager.h" #include "TensorBuilder.h" -#include "open_cl/Environment.h" -#include "open_cl/Status.h" +#include "tensorflow/lite/delegates/gpu/cl/environment.h" +#include "tensorflow/lite/delegates/gpu/common/status.h" namespace onert { @@ -50,22 +50,22 @@ public: const auto &operands = data.graph->operands(); auto context = std::make_unique<gpu_cl::BackendContext>(this, std::move(data)); - auto environment = std::make_shared<Environment>(); + auto environment = std::make_shared<tflite::gpu::cl::Environment>(); if (!CreateEnvironment(environment.get()).ok()) { return nullptr; } auto tm = createTensorManager(&environment->context()); - auto tr = std::make_shared<ClTensorRegistry<TensorManager>>(tm); + auto tr = std::make_shared<TensorRegistry>(tm); - InferenceContext::CreateInferenceInfo create_info; - create_info.precision = CalculationsPrecision::F32; + tflite::gpu::cl::InferenceContext::CreateInferenceInfo create_info; + create_info.precision = tflite::gpu::cl::CalculationsPrecision::F32; create_info.storage_type = - GetStorageTypeWithMinimalMemoryConsumption(environment->device().GetInfo()); - create_info.hints.Add(ModelHints::kFastestInference); + tflite::gpu::cl::GetStorageTypeWithMinimalMemoryConsumption(environment->device().GetInfo()); + create_info.hints.Add(tflite::gpu::cl::ModelHints::kFastestInference); - auto cc = std::make_shared<CreationContext>(); + auto cc = std::make_shared<tflite::gpu::cl::CreationContext>(); cc->device = environment->GetDevicePtr(); cc->context = &environment->context(); cc->queue = environment->queue(); diff --git a/runtime/onert/backend/gpu_cl/BackendContext.cc b/runtime/onert/backend/gpu_cl/BackendContext.cc index 6c3ac81a2..ec9442155 100644 --- a/runtime/onert/backend/gpu_cl/BackendContext.cc +++ b/runtime/onert/backend/gpu_cl/BackendContext.cc @@ -33,147 +33,26 @@ namespace backend namespace gpu_cl { -void BackendContext::initConsts() +void BackendContext::registerTensorInfo(const ir::OperandIndex &ind, const ir::OperandInfo &info, + ir::Layout backend_layout) { - _data.graph->operations().iterate([&](const ir::OperationIndex &, const ir::Operation &op) { - constant_initializer->setLayout(graph()->layout()); - op.accept(*constant_initializer); - }); - _data.graph->operands().iterate([&](const ir::OperandIndex &ind, const ir::Operand &operand) { - if (_data.external_operands.contains(ind) || !operand.isConstant()) - return; - const auto &obj = graph()->operands().at(ind); - if (obj.isConstant() && !constant_initializer->exist(ind)) - { - constant_initializer->registerDefaultInitializer(ind, obj); - } - }); - - constant_initializer->run(); + TensorType type = TensorType::TENSOR_TYPE_VALID; + tensor_builder->registerTensorInfo(ind, info, backend_layout, type); } -void BackendContext::planTensors() +ITensorRegistry *BackendContext::genTensors() { - ir::OperandIndexMap<uint32_t> uses_map; - ir::OperandIndexMap<uint32_t> def_map; - ir::OperandIndexSequence constants; - - // Prepare scanning - 
_data.graph->operands().iterate([&](const ir::OperandIndex &ind, const ir::Operand &obj) { - if (_data.external_operands.contains(ind)) - return; - uses_map[ind] = obj.getUses().size(); - def_map[ind] = obj.getDef().valid() ? 1 : 0; - - if (obj.isConstant()) - constants.append(ind); - - if (!tensor_builder->isRegistered(ind)) - { - // These tensors do not exist in any operation (No use and def) - const auto info = obj.info(); - const auto layout = _data.operand_layouts.at(ind); - // TODO Change tensor info to have permuted shape - tensor_builder->registerTensorInfo(ind, info, layout); - } - }); - - // Start scanning to do notify{First|Last}Use for each tensor + ir::OperandIndexMap<TensorType> type_map; - // If a tensor is a constant, increase the use of the tensor and allocate it first. - // Increasing use count here makes the tensor never be deallocated, i.e it they will be - // deallocated last. - VERBOSE(planTensors) << "TENSORS as CONSTANT" << std::endl; - for (const auto &ind : constants) + for (const auto &ind : graph()->getInputs()) { - uses_map[ind]++; - tensor_builder->notifyFirstUse(ind); + type_map[ind] = TensorType::TENSOR_TYPE_INPUT; } - // At each operation, - // 1. Scan DEF of outputs. If the DEF, allocate it - // 2. Scan DEF of inputs. If variable tensor, allocate it - // 3. Scan USE of inputs. Decrease the USE and deallocate if the USE is 0 - for (const auto op_ind : _data.op_order) + for (const auto &ind : graph()->getOutputs()) { - const auto &op = graph()->operations().at(op_ind); - auto op_inputs = op.getInputs() | ir::Remove::DUPLICATED | ir::Remove::UNDEFINED; - auto op_outputs = op.getOutputs() | ir::Remove::DUPLICATED | ir::Remove::UNDEFINED; - - // Define outputs - for (const auto &ind : op_outputs) - { - if (!tensor_builder->isRegistered(ind)) - continue; - assert(def_map.find(ind) != def_map.end()); - if (def_map[ind]) - { - def_map[ind] = 0; - tensor_builder->notifyFirstUse(ind); - } - } - - // Scan variable tensors - // This tensor has features like constant. 
But OperandInfo and LowerInfo treat them as - // non-constant because of less memory usage by memory planning in here - for (const auto &ind : op_inputs) - { - if (!tensor_builder->isRegistered(ind)) - continue; - const auto &operand = graph()->operands().at(ind); - if (operand.info().isVariable()) - { - // The variable tensor with buffer is not supported yet - assert(operand.data() == nullptr); - assert(operand.getUses().size() == 1 && !operand.getDef().valid()); - assert(uses_map[ind] == 1 && def_map[ind] == 0); - tensor_builder->notifyFirstUse(ind); - } - } - - for (const auto &ind : op_inputs) - { - if (!tensor_builder->isRegistered(ind)) - continue; - assert(uses_map.find(ind) != uses_map.end()); - assert(uses_map[ind] > 0); - uses_map[ind]--; - if (uses_map[ind] == 0) - { - // plan for deallocation of static tensornode - tensor_builder->notifyLastUse(ind); - } - } + type_map[ind] = TensorType::TENSOR_TYPE_OUTPUT; } - - _data.graph->operands().iterate([&](const ir::OperandIndex &ind, const ir::Operand &) { - if (uses_map[ind] == 0) - { - tensor_builder->notifyLastUse(ind); - } - }); - - // Dispose and validate - for (const auto &ind : constants) - { - --uses_map[ind]; - if (uses_map[ind] == 0) // To prevent notifyLastUse from being called twice - { - tensor_builder->notifyLastUse(ind); - } - } - - assert( - std::all_of(uses_map.begin(), uses_map.end(), - [](std::pair<const ir::OperandIndex, uint32_t> it) { return it.second == 0; })); - - assert( - std::all_of(def_map.begin(), def_map.end(), - [](std::pair<const ir::OperandIndex, uint32_t> it) { return it.second == 0; })); -} - -ITensorRegistry *BackendContext::genTensors() -{ graph()->operands().iterate([&](const ir::OperandIndex &ind, const ir::Operand &obj) { if (external_operands().contains(ind)) return; @@ -182,7 +61,11 @@ ITensorRegistry *BackendContext::genTensors() const auto backend_layout = operand_layouts().at(ind); ir::OperandInfo backend_info{permuteShape(obj.shape(), frontend_layout, backend_layout), obj.typeInfo(), obj.info().memAllocType(), obj.isConstant()}; - tensor_builder->registerTensorInfo(ind, backend_info, backend_layout); + if (obj.isConstant()) + { + type_map[ind] = TensorType::TENSOR_TYPE_INPUT; + } + tensor_builder->registerTensorInfo(ind, backend_info, backend_layout, type_map[ind]); }); // TODO Get compiler options from compiler, and use it rather than getting it from Env @@ -199,44 +82,10 @@ ITensorRegistry *BackendContext::genTensors() tensor_builder->notifyFirstUse(ind); }); } - tensor_builder->prepare(); - return tensor_registry.get(); } -FunctionMap BackendContext::genKernels() -{ - FunctionMap ret; - - // kernel_gen - for (auto op_ind : _data.op_order) - { - auto fn_seq = kernel_gen->generate(op_ind); - ret.emplace_back(op_ind, std::move(fn_seq)); - } - - tensor_builder->allocate(); - - initConsts(); - - // NOTE For memory optimization, we want to free some operand data - const_cast<ir::Graph &>(*_data.graph) - .operands() - .iterate([&](const ir::OperandIndex &, ir::Operand &obj) { obj.releaseData(); }); - - for (auto &it : ret) - { - auto &fn_seq = it.second; - fn_seq->iterate([&](exec::IFunction &ifunc) { - ifunc.prepare(); - tensor_builder->postFunctionPrepare(); - }); - } - - return ret; -} - } // namespace gpu_cl } // namespace backend } // namespace onert diff --git a/runtime/onert/backend/gpu_cl/BackendContext.h b/runtime/onert/backend/gpu_cl/BackendContext.h index f17489e7a..7412d2bce 100644 --- a/runtime/onert/backend/gpu_cl/BackendContext.h +++ 
b/runtime/onert/backend/gpu_cl/BackendContext.h
@@ -20,10 +20,12 @@
 #include <backend/BackendContext.h>
 #include <util/ConfigSource.h>
+#include <cl_common/BackendContext.h>
+
 #include "ConstantInitializer.h"
 #include "KernelGenerator.h"
 #include "TensorBuilder.h"
-#include "open_cl/InferenceContext.h"
+#include "tensorflow/lite/delegates/gpu/cl/inference_context.h"
 namespace onert
 {
@@ -32,31 +34,28 @@ namespace backend
 namespace gpu_cl
 {
-class BackendContext : public onert::backend::BackendContext
+class BackendContext
+  : public onert::backend::cl_common::BackendContext<TensorBuilder, ConstantInitializer,
+                                                     KernelGenerator>
 {
 public:
   BackendContext(const Backend *backend, ContextData &&data,
-                 std::shared_ptr<ITensorRegistry> tensor_registry = nullptr,
+                 std::shared_ptr<TensorRegistry> tensor_registry = nullptr,
                  std::shared_ptr<TensorBuilder> tensor_builder = nullptr,
                  std::shared_ptr<ConstantInitializer> constant_initializer = nullptr,
                  std::shared_ptr<KernelGenerator> kernel_gen = nullptr)
-    : onert::backend::BackendContext(backend, std::move(data), tensor_registry),
-      tensor_builder{tensor_builder}, constant_initializer{constant_initializer}, kernel_gen{
-                                                                                    kernel_gen}
+    : onert::backend::cl_common::BackendContext<TensorBuilder, ConstantInitializer,
+                                                KernelGenerator>(
+        backend, std::move(data), tensor_registry, tensor_builder, constant_initializer, kernel_gen)
   {
+    // DO NOTHING
   }
   ITensorRegistry *genTensors() override;
-  FunctionMap genKernels() override;
-
-private:
-  void initConsts();
-  void planTensors();
-public:
-  std::shared_ptr<TensorBuilder> tensor_builder;
-  std::shared_ptr<ConstantInitializer> constant_initializer;
-  std::shared_ptr<KernelGenerator> kernel_gen;
+protected:
+  void registerTensorInfo(const ir::OperandIndex &ind, const ir::OperandInfo &info,
+                          ir::Layout backend_layout) override;
 };
 } // namespace gpu_cl
diff --git a/runtime/onert/backend/gpu_cl/CMakeLists.txt b/runtime/onert/backend/gpu_cl/CMakeLists.txt
index 49bae37f8..eb1964214 100644
--- a/runtime/onert/backend/gpu_cl/CMakeLists.txt
+++ b/runtime/onert/backend/gpu_cl/CMakeLists.txt
@@ -1,14 +1,14 @@
 set(LIB_ONERT_BACKEND_GPU_CL onert_backend_gpu_cl)
+if(NOT BUILD_GPU_CL)
+  return()
+endif(NOT BUILD_GPU_CL)
+
 nnas_find_package(Opencl_Headers QUIET)
 if(NOT Opencl_Headers_FOUND)
   return()
 endif(NOT Opencl_Headers_FOUND)
-if(NOT BUILD_GPU_CL)
-  return()
-endif(NOT BUILD_GPU_CL)
-
 nnas_find_package(Farmhash QUIET)
 if(NOT Farmhash_FOUND)
   return()
@@ -19,18 +19,32 @@ if(NOT Abseil_FOUND)
   return()
 endif(NOT Abseil_FOUND)
-file(GLOB_RECURSE SOURCES "*.cc")
+nnfw_find_package(Fp16 QUIET)
+if(NOT Fp16_FOUND)
+  return()
+endif(NOT Fp16_FOUND)
+nnas_find_package(TensorFlowGpu QUIET)
+if(NOT TensorFlowGpu_FOUND)
+  message(FATAL_ERROR "TensorFlowGpu lib not found")
+  return()
+endif(NOT TensorFlowGpu_FOUND)
+
+file(GLOB_RECURSE SOURCES "*.cc")
 add_library(${LIB_ONERT_BACKEND_GPU_CL} SHARED ${SOURCES})
 target_include_directories(${LIB_ONERT_BACKEND_GPU_CL} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR})
+target_include_directories(${LIB_ONERT_BACKEND_GPU_CL} PRIVATE ${TENSORFLOWGPU_SOURCE_DIR})
 target_link_libraries(${LIB_ONERT_BACKEND_GPU_CL} PRIVATE abseil)
 target_link_libraries(${LIB_ONERT_BACKEND_GPU_CL} PRIVATE dl)
 target_link_libraries(${LIB_ONERT_BACKEND_GPU_CL} PRIVATE farmhash)
-target_link_libraries(${LIB_ONERT_BACKEND_GPU_CL} PRIVATE Headers)
+target_link_libraries(${LIB_ONERT_BACKEND_GPU_CL} INTERFACE Open_CL_Headers)
+target_link_libraries(${LIB_ONERT_BACKEND_GPU_CL} PRIVATE fp16)
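+# TensorFlowGpu (linked next) is the TFLite GPU delegate library located above;
+# the gpu_cl sources now include its headers under tensorflow/lite/delegates/gpu/.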
+target_link_libraries(${LIB_ONERT_BACKEND_GPU_CL} PRIVATE TensorFlowGpu) target_link_libraries(${LIB_ONERT_BACKEND_GPU_CL} PRIVATE onert_core) +target_link_libraries(${LIB_ONERT_BACKEND_GPU_CL} PRIVATE ${LIB_ONERT_BACKEND_CL_COMMON}) target_link_libraries(${LIB_ONERT_BACKEND_GPU_CL} PRIVATE nnfw_common) target_link_libraries(${LIB_ONERT_BACKEND_GPU_CL} PRIVATE nnfw_coverage) diff --git a/runtime/onert/backend/gpu_cl/ClConstantInitializer.cc b/runtime/onert/backend/gpu_cl/ClConstantInitializer.cc index b3ef2f560..05dd8e2a3 100644 --- a/runtime/onert/backend/gpu_cl/ClConstantInitializer.cc +++ b/runtime/onert/backend/gpu_cl/ClConstantInitializer.cc @@ -93,6 +93,9 @@ void ClConstantInitializer::registerPermuteInitializer(const ir::OperandIndex &i case DataType::FLOAT32: _init_map[index] = std::bind(permuteInit<float>, _1, _2, _current_layout); break; + case DataType::INT32: + _init_map[index] = std::bind(permuteInit<int32_t>, _1, _2, _current_layout); + break; default: throw std::runtime_error("Not supported, yet"); break; diff --git a/runtime/onert/backend/gpu_cl/ClConstantInitializer.h b/runtime/onert/backend/gpu_cl/ClConstantInitializer.h index d7d21e847..95e228acd 100644 --- a/runtime/onert/backend/gpu_cl/ClConstantInitializer.h +++ b/runtime/onert/backend/gpu_cl/ClConstantInitializer.h @@ -17,8 +17,6 @@ #ifndef __ONERT_COMPILER_GPU_CL_CLCONSTANT_INITIALIZER_H__ #define __ONERT_COMPILER_GPU_CL_CLCONSTANT_INITIALIZER_H__ -#include "ClTensorRegistry.h" - #include <unordered_map> #include <functional> diff --git a/runtime/onert/backend/gpu_cl/ClFunction.h b/runtime/onert/backend/gpu_cl/ClFunction.h index 9d3d69092..5e8a11a84 100644 --- a/runtime/onert/backend/gpu_cl/ClFunction.h +++ b/runtime/onert/backend/gpu_cl/ClFunction.h @@ -22,9 +22,9 @@ #include <vector> #include <memory> -#include "open_cl/kernels/GpuOperation.h" -#include "open_cl/ClCommandQueue.h" -#include "open_cl/Status.h" +#include "tensorflow/lite/delegates/gpu/cl/kernels/gpu_operation.h" +#include "tensorflow/lite/delegates/gpu/cl/cl_command_queue.h" +#include "tensorflow/lite/delegates/gpu/common/status.h" namespace onert { @@ -32,19 +32,18 @@ namespace backend { namespace gpu_cl { - class ClFunction : public ::onert::exec::IFunction { public: ClFunction() : _gpu_operations(), _creation_context() {} public: - void configure(std::shared_ptr<CreationContext> creation_context) + void configure(std::shared_ptr<tflite::gpu::cl::CreationContext> creation_context) { _creation_context = creation_context; } - void add_operation(std::unique_ptr<GPUOperation> gpu_operation) + void add_operation(std::unique_ptr<tflite::gpu::cl::GPUOperation> gpu_operation) { _gpu_operations.push_back(std::move(gpu_operation)); } @@ -57,6 +56,10 @@ public: { throw std::runtime_error("Failed to AddToQueue."); } + if (!_creation_context->queue->WaitForCompletion().ok()) + { + throw std::runtime_error("Failed to WaitForCompletion."); + } } } @@ -77,8 +80,8 @@ public: } private: - std::vector<std::unique_ptr<GPUOperation>> _gpu_operations; - std::shared_ptr<CreationContext> _creation_context; + std::vector<std::unique_ptr<tflite::gpu::cl::GPUOperation>> _gpu_operations; + std::shared_ptr<tflite::gpu::cl::CreationContext> _creation_context; }; } // namespace gpu_cl diff --git a/runtime/onert/backend/gpu_cl/ClMemoryManager.h b/runtime/onert/backend/gpu_cl/ClMemoryManager.h deleted file mode 100644 index 3bac0d51d..000000000 --- a/runtime/onert/backend/gpu_cl/ClMemoryManager.h +++ /dev/null @@ -1,135 +0,0 @@ -/* - * Copyright (c) 2021 Samsung Electronics Co., 
Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef __ONERT_BACKEND_ACL_COMMON_MEMORY_MANAGER_H__ -#define __ONERT_BACKEND_ACL_COMMON_MEMORY_MANAGER_H__ - -#include <cassert> - -#include "ir/OperandIndexMap.h" -#include "ir/Shape.h" -#include "open_cl/ClContext.h" -#include "open_cl/InferenceContext.h" -#include "open_cl/Status.h" -#include "open_cl/StorageTypeUtil.h" -#include "open_cl/TensorType.h" -#include "util/logging.h" - -namespace onert -{ -namespace backend -{ -namespace gpu_cl -{ - -template <typename T_ITensor, typename T_Tensor> class ClMemoryManager -{ -public: - ClMemoryManager(CLContext *context) : _context{context} {} - - virtual ~ClMemoryManager() = default; - - virtual void allocate(void) - { - for (const auto &tensor_entry : _tensors) - { - auto tensor = tensor_entry.second; - const auto &t = tensor_reserver_.Get(tensor_entry.first.value()); - const auto &shape = t->shape; - const auto &descriptor = t->descriptor; - if (!CreateTensor(*_context, shape, descriptor, tensor->handle()).ok()) - { - return; - } - } - } - - virtual void deallocate(void) - { - // NYI - } - - virtual void startLifetime(const ir::OperandIndex &) - { /* DO NOTHING */ - } - virtual void finishLifetime(const ir::OperandIndex &) - { /* DO NOTHING */ - } - - void buildTensor(const ir::OperandIndex &ind, const ir::OperandInfo &info, - InferenceContext::CreateInferenceInfo create_info, - std::shared_ptr<Environment> environment, DeviceInfo &device_info) - { - ValueId max_id = 0; - auto data_type = DeduceDataTypeFromPrecision(create_info.precision); - const auto shape = info.shape(); - - auto tensor = std::make_shared<T_Tensor>(shape.rank(), shape, environment); - _tensors[ind] = tensor; - - BHWC t_shape; - switch (shape.rank()) - { - case 1: - // B layout - t_shape = BHWC(shape.dim(0), 1, 1, 1); - break; - case 2: - // BC layout - t_shape = BHWC(shape.dim(0), 1, 1, shape.dim(1)); - break; - case 3: - // BWC layout - t_shape = BHWC(shape.dim(0), 1, shape.dim(1), shape.dim(2)); - break; - case 4: - // BHWC layout - t_shape = BHWC(shape.dim(0), shape.dim(1), shape.dim(2), shape.dim(3)); - break; - default: - break; - } - - TensorStorageType storage_type = create_info.storage_type; - Layout layout = t_shape.b == 1 ? 
Layout::HWC : Layout::BHWC; - - ValueId id = ind.value(); - storage_type = SelectBestStorageType(device_info, t_shape, storage_type, data_type, layout); - auto dummy = std::make_shared<InferenceContext::DummyTensor>(); - dummy->shape = t_shape; - dummy->descriptor = TensorDescriptor{data_type, storage_type, layout}; - tensor_reserver_.Add(id, dummy); - - max_id = std::max(max_id, id); - - tensor_reserver_.SetNext(max_id + 1); - } - - ir::OperandIndexMap<std::shared_ptr<T_Tensor>> &tensors(void) { return _tensors; } - - InferenceContext::TensorReserver &tensorReservers(void) { return tensor_reserver_; } - -private: - ir::OperandIndexMap<std::shared_ptr<T_Tensor>> _tensors; - InferenceContext::TensorReserver tensor_reserver_; - CLContext *_context; -}; - -} // namespace gpu_cl -} // namespace backend -} // namespace onert - -#endif // __ONERT_BACKEND_ACL_COMMON_MEMORY_MANAGER_H__ diff --git a/runtime/onert/backend/gpu_cl/ClTensorBuilder.h b/runtime/onert/backend/gpu_cl/ClTensorBuilder.h deleted file mode 100644 index 951bbd844..000000000 --- a/runtime/onert/backend/gpu_cl/ClTensorBuilder.h +++ /dev/null @@ -1,289 +0,0 @@ -/* - * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef __ONERT_BACKEND_CL_TENSOR_BUILDER_H__ -#define __ONERT_BACKEND_CL_TENSOR_BUILDER_H__ - -#include <memory> -#include <queue> - -#include "ClTensorManager.h" -#include "ClTensorRegistry.h" -#include "ParentInfo.h" - -#include "open_cl/TensorType.h" -#include "open_cl/TensorTypeUtil.h" -#include "open_cl/ClDevice.h" -#include "open_cl/InferenceContext.h" - -#include "ir/OperandIndexMap.h" -#include "ir/OperandIndexSequence.h" -#include <ir/Operands.h> -#include <util/Utils.h> - -namespace onert -{ -namespace backend -{ -namespace gpu_cl -{ - -enum class UsesType -{ - FIRST, - LAST -}; - -template <typename T_ITensor, typename T_Tensor> class ClTensorBuilder -{ -public: - using T_ClTensorManager = ClTensorManager<T_ITensor, T_Tensor>; - - ClTensorBuilder(const ir::Operands &operands, T_ClTensorManager *tensor_mgr, - InferenceContext::CreateInferenceInfo create_info, - const std::shared_ptr<Environment> &environment); - - /** - * @brief Register tensor information to allocate on ACL-CL backend - * @param[in] ind Operand index - * @param[in] info Tensor information - * @param[in] layout Tensor data layout - */ - void registerTensorInfo(const ir::OperandIndex &ind, const ir::OperandInfo &info, - ir::Layout backend_layout); - - void notifyFirstUse(const ir::OperandIndex &); - void notifyLastUse(const ir::OperandIndex &); - - bool isRegistered(const ir::OperandIndex &) const; - - void prepare(); - void allocate(); - void postFunctionPrepare(); - - T_ClTensorManager *cl_tensor_manager(void) { return _tensor_mgr.get(); } - - void setUsesCount(const ir::OperandIndex &index, size_t num_uses) - { - assert(_uses_count_map.find(index) != _uses_count_map.end() ? 
_uses_count_map[index] == num_uses - : true); - _uses_count_map[index] = num_uses; - } - - void parent_map(std::unordered_map<ir::OperandIndex, ParentInfo> &&parent_map) - { - _parent_map = std::move(parent_map); - } - - bool areSubTensorsOf(const ir::OperandIndex &parent, const ir::OperandIndexSequence &seq); - - /** - * @brief Check child tensor is allocated as subtensor of parent tensor - * @param[in] parent Index of parent - * @param[in] child Index of child - * @return @c true if child is allocated as subtensor of parent, otherwise @c false - */ - bool isSubTensorOf(const ir::OperandIndex &parent, const ir::OperandIndex &child); - -private: - void buildTensors(void); - ir::OperandIndex findRootParent(ir::OperandIndex index); - -private: - const ir::Operands &_operands; - ir::OperandIndexMap<ir::OperandInfo> _tensor_info_map; - ir::OperandIndexMap<ir::Layout> _tensor_layout_map; - ir::OperandIndexMap<size_t> _uses_count_map; - - std::unique_ptr<T_ClTensorManager> _tensor_mgr; - InferenceContext::CreateInferenceInfo _create_info; - std::shared_ptr<Environment> _environment; - - // for linear executor - std::vector<std::pair<UsesType, ir::OperandIndex>> _lifetime_seq; - - // Extra info for concat elimination - ir::OperandIndexMap<ParentInfo> _parent_map; -}; - -} // namespace gpu_cl -} // namespace backend -} // namespace onert - -#include <cassert> -#include <stack> - -#include "util/logging.h" - -namespace onert -{ -namespace backend -{ -namespace gpu_cl -{ - -template <typename T_ITensor, typename T_Tensor> -ClTensorBuilder<T_ITensor, T_Tensor>::ClTensorBuilder( - const ir::Operands &operands, T_ClTensorManager *tensor_mgr, - InferenceContext::CreateInferenceInfo create_info, - const std::shared_ptr<Environment> &environment) - : _operands{operands}, _tensor_mgr{tensor_mgr}, _create_info{create_info}, _environment{ - environment} -{ - assert(_tensor_mgr); -} - -template <typename T_ITensor, typename T_Tensor> -void ClTensorBuilder<T_ITensor, T_Tensor>::registerTensorInfo(const ir::OperandIndex &ind, - const ir::OperandInfo &info, - ir::Layout backend_layout) -{ - assert(_tensor_mgr->constTensors().size() == 0); - assert(_tensor_mgr->nonconstTensors().size() == 0); - - _uses_count_map[ind] = _operands.at(ind).getUses().size(); - - _tensor_info_map.emplace(ind, info); - _tensor_layout_map.insert({ind, backend_layout}); -} - -template <typename T_ITensor, typename T_Tensor> -void ClTensorBuilder<T_ITensor, T_Tensor>::notifyFirstUse(const ir::OperandIndex &ind) -{ - _lifetime_seq.emplace_back(UsesType::FIRST, ind); -} - -template <typename T_ITensor, typename T_Tensor> -void ClTensorBuilder<T_ITensor, T_Tensor>::notifyLastUse(const ir::OperandIndex &ind) -{ - _lifetime_seq.emplace_back(UsesType::LAST, ind); -} - -template <typename T_ITensor, typename T_Tensor> -bool ClTensorBuilder<T_ITensor, T_Tensor>::isRegistered(const ir::OperandIndex &ind) const -{ - return _tensor_info_map.find(ind) != _tensor_info_map.end(); -} - -template <typename T_ITensor, typename T_Tensor> -void ClTensorBuilder<T_ITensor, T_Tensor>::prepare(void) -{ - buildTensors(); -} - -template <typename T_ITensor, typename T_Tensor> -void ClTensorBuilder<T_ITensor, T_Tensor>::allocate(void) -{ - // Update lifetime sequence to apply subtensor optimization - - std::unordered_map<ir::OperandIndex, ir::OperandIndex> root_map; - std::function<ir::OperandIndex &(ir::OperandIndex)> find_root = - [&](ir::OperandIndex ind) -> ir::OperandIndex & { - ir::OperandIndex &ret = root_map[ind]; - - // We know the root parent value 
already - if (ret.valid()) - return ret; - - auto itr = _parent_map.find(ind); - if (itr == _parent_map.end()) - { - // If there is no parent, let's store the value of itself - return ret = ind; - } - else - { - return ret = find_root(itr->second.parent); - } - }; - - ir::OperandIndexMap<bool> first_use_check; - ir::OperandIndexMap<bool> last_use_check; - std::map<size_t, std::pair<UsesType, ir::OperandIndex>> lifetime_map; - for (size_t i = 0; i < _lifetime_seq.size(); i++) - { - auto &entry = _lifetime_seq[i]; - if (entry.first != UsesType::FIRST) - continue; - auto root_ind = find_root(entry.second); - if (first_use_check[root_ind]) - continue; - first_use_check[root_ind] = true; - lifetime_map[i] = {UsesType::FIRST, root_ind}; - } - - for (int i = _lifetime_seq.size() - 1; i >= 0; i--) - { - auto &entry = _lifetime_seq[i]; - if (entry.first != UsesType::LAST) - continue; - auto root_ind = find_root(entry.second); - if (last_use_check[root_ind]) - continue; - last_use_check[root_ind] = true; - lifetime_map[i] = {UsesType::LAST, root_ind}; - } - - for (auto &entry : lifetime_map) - { - auto &use = entry.second; - auto use_type = use.first; - auto use_index = use.second; - assert(use_index.valid()); - if (use_type == UsesType::FIRST) - _tensor_mgr->startLifetime(use_index); - else - _tensor_mgr->finishLifetime(use_index); - } - - _tensor_mgr->allocateConsts(); - - // TODO Since `_parent_map` is filled for all Concat nodes even if the node this backend uses - // After refactoring BackendContext we can uncomment this - // assert(_tensor_info_map.size() == - // _tensor_mgr->nonconstTensors().size() + num of constants of _tensor_info_map + - // _parent_map.size()); - _tensor_mgr->allocateNonconsts(); -} - -template <typename T_ITensor, typename T_Tensor> -void ClTensorBuilder<T_ITensor, T_Tensor>::postFunctionPrepare(void) -{ - _tensor_mgr->tryDeallocConstants(); -} - -template <typename T_ITensor, typename T_Tensor> -void ClTensorBuilder<T_ITensor, T_Tensor>::buildTensors(void) -{ - assert(_tensor_mgr->constTensors().size() == 0); - assert(_tensor_mgr->nonconstTensors().size() == 0); - // Normal tensors - for (auto &entry : _tensor_info_map) - { - auto ind = entry.first; - if (_parent_map.count(ind) > 0) - continue; - - const auto &info = entry.second; - _tensor_mgr->buildTensor(ind, info, _create_info, _environment, _environment->device().info_); - } -} - -} // namespace gpu_cl -} // namespace backend -} // namespace onert - -#endif // __ONERT_BACKEND_ACL_COMMON_TEMPL_TENSOR_BUILDER_H__ diff --git a/runtime/onert/backend/gpu_cl/ClTensorManager.h b/runtime/onert/backend/gpu_cl/ClTensorManager.h deleted file mode 100644 index 49a11730f..000000000 --- a/runtime/onert/backend/gpu_cl/ClTensorManager.h +++ /dev/null @@ -1,235 +0,0 @@ -/* - * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifndef __ONERT_BACKEND_ACL_COMMON_TENSOR_MANAGER_H__ -#define __ONERT_BACKEND_ACL_COMMON_TENSOR_MANAGER_H__ - -#include "ClMemoryManager.h" - -#include "open_cl/InferenceContext.h" -#include "open_cl/TensorType.h" - -#include "ir/OperandInfo.h" -#include "ir/OperandIndexMap.h" - -namespace onert -{ -namespace backend -{ -namespace gpu_cl -{ - -template <typename T_ITensor, typename T_Tensor> class ClTensorManager -{ -public: - using T_ClMemoryManager = ClMemoryManager<T_ITensor, T_Tensor>; - - ClTensorManager(T_ClMemoryManager *const_mgr, T_ClMemoryManager *nonconst_mgr); - - virtual ~ClTensorManager() = default; - - void allocateConsts(void); - void allocateNonconsts(void); - void deallocateConsts(void); - void deallocateNonconsts(void); - - void buildTensor(const ir::OperandIndex &ind, const ir::OperandInfo &info, - InferenceContext::CreateInferenceInfo create_info, - std::shared_ptr<Environment> environment, DeviceInfo &device_info); - - std::shared_ptr<T_ITensor> findTensorAsParent(const ir::OperandIndex &ind); - - void startLifetime(const ir::OperandIndex &ind); - void finishLifetime(const ir::OperandIndex &ind); - - std::shared_ptr<T_ITensor> at(const ir::OperandIndex &ind); - std::shared_ptr<InferenceContext::DummyTensor> atR(const ir::OperandIndex &ind); - - InferenceContext::TensorReserver &constTensorReservers(void); - InferenceContext::TensorReserver &nonconstTensorReservers(void); - - ir::OperandIndexMap<std::shared_ptr<T_Tensor>> &constTensors(void); - ir::OperandIndexMap<std::shared_ptr<T_Tensor>> &nonconstTensors(void); - - void iterate(const std::function<void(const ir::OperandIndex &)> &fn); - - void tryDeallocConstants(void); - -private: - std::unique_ptr<T_ClMemoryManager> _const_mgr; - std::unique_ptr<T_ClMemoryManager> _nonconst_mgr; - ir::OperandIndexMap<T_ClMemoryManager &> _ind_to_mgr; -}; - -} // namespace gpu_cl -} // namespace backend -} // namespace onert - -#include <cassert> -#include "util/logging.h" - -namespace onert -{ -namespace backend -{ -namespace gpu_cl -{ - -template <typename T_ITensor, typename T_Tensor> -ClTensorManager<T_ITensor, T_Tensor>::ClTensorManager(T_ClMemoryManager *const_mgr, - T_ClMemoryManager *nonconst_mgr) - : _const_mgr{const_mgr}, _nonconst_mgr{nonconst_mgr} -{ - // DO NOTHING -} - -template <typename T_ITensor, typename T_Tensor> -void ClTensorManager<T_ITensor, T_Tensor>::allocateConsts(void) -{ - _const_mgr->allocate(); -} - -template <typename T_ITensor, typename T_Tensor> -void ClTensorManager<T_ITensor, T_Tensor>::allocateNonconsts(void) -{ - _nonconst_mgr->allocate(); -} - -template <typename T_ITensor, typename T_Tensor> -void ClTensorManager<T_ITensor, T_Tensor>::deallocateConsts(void) -{ - _const_mgr->deallocate(); -} - -template <typename T_ITensor, typename T_Tensor> -void ClTensorManager<T_ITensor, T_Tensor>::deallocateNonconsts(void) -{ - _nonconst_mgr->deallocate(); -} - -template <typename T_ITensor, typename T_Tensor> -void ClTensorManager<T_ITensor, T_Tensor>::buildTensor( - const ir::OperandIndex &ind, const ir::OperandInfo &info, - InferenceContext::CreateInferenceInfo create_info, std::shared_ptr<Environment> environment, - DeviceInfo &device_info) -{ - assert(_ind_to_mgr.find(ind) == _ind_to_mgr.end()); - - if (info.isConstant()) - { - _const_mgr->buildTensor(ind, info, create_info, environment, device_info); - _ind_to_mgr.insert({ind, *_const_mgr}); - } - else - { - _nonconst_mgr->buildTensor(ind, info, create_info, environment, device_info); - _ind_to_mgr.insert({ind, *_nonconst_mgr}); - } -} - 
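ClTensorManager::buildTensor above routes each operand exactly once to either the constant or the non-constant memory manager and records the choice in _ind_to_mgr; the replacement TensorManager introduced later in this patch keeps the same dispatch. A minimal self-contained sketch of the idea (Pool, ManagerSketch, and plain int indices are illustrative stand-ins, not names from this codebase):

  #include <cassert>
  #include <memory>
  #include <unordered_map>

  struct Pool // stand-in for the const/nonconst memory manager
  {
    void buildTensor(int ind) { (void)ind; /* reserve bookkeeping for this operand */ }
  };

  class ManagerSketch
  {
  public:
    ManagerSketch(std::unique_ptr<Pool> const_mgr, std::unique_ptr<Pool> nonconst_mgr)
      : _const_mgr{std::move(const_mgr)}, _nonconst_mgr{std::move(nonconst_mgr)}
    {
    }

    void buildTensor(int ind, bool is_constant)
    {
      assert(_ind_to_mgr.find(ind) == _ind_to_mgr.end()); // each operand is routed exactly once
      Pool *mgr = is_constant ? _const_mgr.get() : _nonconst_mgr.get();
      mgr->buildTensor(ind);
      _ind_to_mgr.emplace(ind, mgr); // later lifetime/lookup calls reuse this routing
    }

  private:
    std::unique_ptr<Pool> _const_mgr;
    std::unique_ptr<Pool> _nonconst_mgr;
    std::unordered_map<int, Pool *> _ind_to_mgr;
  };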
-template <typename T_ITensor, typename T_Tensor> -void ClTensorManager<T_ITensor, T_Tensor>::startLifetime(const ir::OperandIndex &ind) -{ - assert(_ind_to_mgr.find(ind) != _ind_to_mgr.end()); - _ind_to_mgr.at(ind).startLifetime(ind); -} - -template <typename T_ITensor, typename T_Tensor> -void ClTensorManager<T_ITensor, T_Tensor>::finishLifetime(const ir::OperandIndex &ind) -{ - assert(_ind_to_mgr.find(ind) != _ind_to_mgr.end()); - _ind_to_mgr.at(ind).finishLifetime(ind); -} - -template <typename T_ITensor, typename T_Tensor> -std::shared_ptr<T_ITensor> ClTensorManager<T_ITensor, T_Tensor>::at(const ir::OperandIndex &ind) -{ - if (_ind_to_mgr.find(ind) == _ind_to_mgr.end()) - return nullptr; - - auto &tensors = _ind_to_mgr.at(ind).tensors(); - if (tensors.find(ind) != tensors.end()) - { - return tensors.at(ind); - } - - return nullptr; -} - -template <typename T_ITensor, typename T_Tensor> -ir::OperandIndexMap<std::shared_ptr<T_Tensor>> & -ClTensorManager<T_ITensor, T_Tensor>::constTensors(void) -{ - return _const_mgr->tensors(); -} - -template <typename T_ITensor, typename T_Tensor> -ir::OperandIndexMap<std::shared_ptr<T_Tensor>> & -ClTensorManager<T_ITensor, T_Tensor>::nonconstTensors(void) -{ - return _nonconst_mgr->tensors(); -} - -template <typename T_ITensor, typename T_Tensor> -std::shared_ptr<InferenceContext::DummyTensor> -ClTensorManager<T_ITensor, T_Tensor>::atR(const ir::OperandIndex &ind) -{ - if (_nonconst_mgr->tensorReservers().HaveTensor(ind.value())) - { - return _nonconst_mgr->tensorReservers().Get(ind.value()); - } - else if (_const_mgr->tensorReservers().HaveTensor(ind.value())) - { - return _const_mgr->tensorReservers().Get(ind.value()); - } - return nullptr; -} - -template <typename T_ITensor, typename T_Tensor> -InferenceContext::TensorReserver &ClTensorManager<T_ITensor, T_Tensor>::constTensorReservers(void) -{ - return _const_mgr->tensorReservers(); -} - -template <typename T_ITensor, typename T_Tensor> -InferenceContext::TensorReserver & -ClTensorManager<T_ITensor, T_Tensor>::nonconstTensorReservers(void) -{ - return _nonconst_mgr->tensorReservers(); -} - -template <typename T_ITensor, typename T_Tensor> -void ClTensorManager<T_ITensor, T_Tensor>::iterate( - const std::function<void(const ir::OperandIndex &)> &fn) -{ - for (auto it : _nonconst_mgr->tensors()) - fn(it.first); - - for (auto it : _const_mgr->tensors()) - fn(it.first); -} - -template <typename T_ITensor, typename T_Tensor> -void ClTensorManager<T_ITensor, T_Tensor>::tryDeallocConstants(void) -{ - // NYI -} - -} // namespace gpu_cl -} // namespace backend -} // namespace onert - -#endif // __ONERT_BACKEND_ACL_COMMON_TENSOR_MANAGER_H__ diff --git a/runtime/onert/backend/gpu_cl/Config.cc b/runtime/onert/backend/gpu_cl/Config.cc index 067a2070f..9959a471b 100644 --- a/runtime/onert/backend/gpu_cl/Config.cc +++ b/runtime/onert/backend/gpu_cl/Config.cc @@ -17,8 +17,11 @@ #include "Config.h" #include <dlfcn.h> -#include "open_cl/OpenclWrapper.h" -#include "open_cl/Status.h" + +#include "tensorflow/lite/delegates/gpu/common/status.h" +#include "tensorflow/lite/delegates/gpu/cl/opencl_wrapper.h" + +using namespace tflite::gpu::cl; namespace onert { @@ -26,12 +29,9 @@ namespace backend { namespace gpu_cl { - -Config::~Config() { UnloadOpenCL(_handle); } - bool Config::initialize() { - if (LoadOpenCL(&_handle).ok()) + if (LoadOpenCL().ok()) { return true; } diff --git a/runtime/onert/backend/gpu_cl/Config.h b/runtime/onert/backend/gpu_cl/Config.h index aa5a51a15..6a455bbb5 100644 --- 
a/runtime/onert/backend/gpu_cl/Config.h +++ b/runtime/onert/backend/gpu_cl/Config.h @@ -31,7 +31,7 @@ namespace gpu_cl class Config : public IConfig { public: - virtual ~Config(); + virtual ~Config() {} public: std::string id() override { return "gpu_cl"; } diff --git a/runtime/onert/backend/gpu_cl/KernelGenerator.cc b/runtime/onert/backend/gpu_cl/KernelGenerator.cc index a84867f8c..04edc3928 100644 --- a/runtime/onert/backend/gpu_cl/KernelGenerator.cc +++ b/runtime/onert/backend/gpu_cl/KernelGenerator.cc @@ -19,13 +19,14 @@ #include "KernelGenerator.h" -#include "ClTensorRegistry.h" #include "ClFunction.h" #include "TensorManager.h" -#include "open_cl/selectors/ConvolutionSelector.h" -#include "open_cl/selectors/DwConvolutionSelector.h" -#include "open_cl/selectors/SimpleSelectors.h" +#include "tensorflow/lite/delegates/gpu/common/shape.h" +#include "tensorflow/lite/delegates/gpu/cl/tensor.h" +#include "tensorflow/lite/delegates/gpu/cl/selectors/convolution_selector.h" +#include "tensorflow/lite/delegates/gpu/cl/selectors/dw_convolution_selector.h" +#include "tensorflow/lite/delegates/gpu/cl/selectors/simple_selectors.h" #include "ir/Operations.h" #include "ir/Operations.Include.h" @@ -37,6 +38,9 @@ #include "util/logging.h" #include "util/Utils.h" +using namespace tflite::gpu; +using namespace tflite::gpu::cl; + namespace onert { namespace backend @@ -60,14 +64,14 @@ void UpdatePadding(const ir::PaddingType type, const BHWC &input_shape, AttrT *a } } -gpu_cl::PoolingType convertPoolType(ir::operation::Pool2D::PoolType type_ir) +PoolingType convertPoolType(ir::operation::Pool2D::PoolType type_ir) { switch (type_ir) { case ir::operation::Pool2D::PoolType::AVG: - return gpu_cl::PoolingType::AVERAGE; + return PoolingType::AVERAGE; case ir::operation::Pool2D::PoolType::MAX: - return gpu_cl::PoolingType::MAX; + return PoolingType::MAX; default: throw std::runtime_error("gpu_Cl KernelGenerator : Not supported operation yet"); } @@ -75,7 +79,7 @@ gpu_cl::PoolingType convertPoolType(ir::operation::Pool2D::PoolType type_ir) KernelGenerator::KernelGenerator(const ir::Graph &graph, const std::shared_ptr<TensorBuilder> &tensor_builder, - const std::shared_ptr<ClTensorRegistry<TensorManager>> &tensor_reg, + const std::shared_ptr<TensorRegistry> &tensor_reg, const std::shared_ptr<CreationContext> &creation_context) : basic::KernelGeneratorBase{graph}, _ctx(graph.operands()), _operations_ctx(graph.operations()), _current_layout{graph.layout()}, @@ -190,7 +194,7 @@ void KernelGenerator::visit(const ir::operation::Conv2D &node) auto bias_tensor = _tensor_reg->getClTensor(bias); auto output_tensor = _tensor_reg->getClTensor(output); - gpu_cl::Convolution2DAttributes attr; + Convolution2DAttributes attr; attr.strides = ToHW(param.stride.vertical, param.stride.horizontal); attr.dilations = HW(std::max(static_cast<u_int32_t>(1), param.dilation.height_factor), std::max(static_cast<u_int32_t>(1), param.dilation.width_factor)); @@ -237,7 +241,7 @@ void KernelGenerator::visit(const ir::operation::Conv2D &node) { std::unique_ptr<GPUOperation> gpu_op_1; OperationDef op_def_1; - std::shared_ptr<Tensor> new_tensor = std::make_shared<Tensor>(); + std::shared_ptr<cl::Tensor> new_tensor = std::make_shared<cl::Tensor>(); _new_tensors[output] = new_tensor; if (!CreateTensor(*_creation_context->context, output_shape, @@ -334,9 +338,9 @@ void KernelGenerator::visit(const ir::operation::DepthwiseConv2D &node) const int filter_width = ker_shape.w; const int output_depth = out_shape.c; - InternalTensor<OHWI, DataType::FLOAT32> 
weights; + tflite::gpu::Tensor<OHWI, DataType::FLOAT32> weights; weights.id = attr.weights.id; - weights.shape = OHWI(output_depth, filter_height, filter_width, input_depth); + weights.shape = tflite::gpu::OHWI(output_depth, filter_height, filter_width, input_depth); weights.data.resize(weights.shape.DimensionsProduct()); float *dst = &weights.data[0]; for (int j = 0; j < output_depth; ++j) @@ -387,7 +391,7 @@ void KernelGenerator::visit(const ir::operation::DepthwiseConv2D &node) { std::unique_ptr<GPUOperation> gpu_op_1; OperationDef op_def_1; - std::shared_ptr<Tensor> new_tensor = std::make_shared<Tensor>(); + std::shared_ptr<cl::Tensor> new_tensor = std::make_shared<cl::Tensor>(); _new_tensors[ofm_index] = new_tensor; if (!CreateTensor(*_creation_context->context, out_shape, diff --git a/runtime/onert/backend/gpu_cl/KernelGenerator.h b/runtime/onert/backend/gpu_cl/KernelGenerator.h index 3e341b111..91fd3cd9d 100644 --- a/runtime/onert/backend/gpu_cl/KernelGenerator.h +++ b/runtime/onert/backend/gpu_cl/KernelGenerator.h @@ -17,11 +17,13 @@ #ifndef __ONERT_BACKEND_GPU_CL_KERNEL_GENERATOR_H__ #define __ONERT_BACKEND_GPU_CL_KERNEL_GENERATOR_H__ -#include "ClTensorRegistry.h" +#include "TensorRegistry.h" #include "backend/basic/TensorRegistry.h" #include "TensorBuilder.h" #include "TensorManager.h" +#include "tensorflow/lite/delegates/gpu/api.h" + #include <backend/CustomKernelBuilder.h> #include <backend/basic/KernelGeneratorBase.h> #include <ir/Operands.h> @@ -39,8 +41,8 @@ class KernelGenerator : public basic::KernelGeneratorBase { public: KernelGenerator(const ir::Graph &graph, const std::shared_ptr<TensorBuilder> &tensor_builder, - const std::shared_ptr<ClTensorRegistry<TensorManager>> &tensor_reg, - const std::shared_ptr<CreationContext> &creation_context); + const std::shared_ptr<TensorRegistry> &tensor_reg, + const std::shared_ptr<tflite::gpu::cl::CreationContext> &creation_context); std::unique_ptr<exec::FunctionSequence> generate(ir::OperationIndex ind) override; @@ -58,9 +60,9 @@ private: const ir::Operations &_operations_ctx; ir::Layout _current_layout; std::shared_ptr<TensorBuilder> _tensor_builder; - std::shared_ptr<ClTensorRegistry<TensorManager>> _tensor_reg; - std::shared_ptr<CreationContext> _creation_context; - ir::OperandIndexMap<std::shared_ptr<Tensor>> _new_tensors; + std::shared_ptr<TensorRegistry> _tensor_reg; + std::shared_ptr<tflite::gpu::cl::CreationContext> _creation_context; + ir::OperandIndexMap<std::shared_ptr<tflite::gpu::cl::Tensor>> _new_tensors; }; } // namespace gpu_cl diff --git a/runtime/onert/backend/gpu_cl/MemoryManager.h b/runtime/onert/backend/gpu_cl/MemoryManager.h new file mode 100644 index 000000000..a3b9b39de --- /dev/null +++ b/runtime/onert/backend/gpu_cl/MemoryManager.h @@ -0,0 +1,157 @@ +/* + * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+#ifndef __ONERT_BACKEND_GPU_CL_MEMORY_MANAGER_H__
+#define __ONERT_BACKEND_GPU_CL_MEMORY_MANAGER_H__
+
+#include "ex/InferenceContextEx.h"
+#include "operand/CLTensor.h"
+
+#include "ir/OperandIndexMap.h"
+#include "ir/OperandInfo.h"
+#include "util/logging.h"
+
+#include "tensorflow/lite/delegates/gpu/cl/cl_context.h"
+#include "tensorflow/lite/delegates/gpu/common/status.h"
+#include "tensorflow/lite/delegates/gpu/cl/storage_type_util.h"
+#include "tensorflow/lite/delegates/gpu/cl/tensor_type.h"
+
+#include <cassert>
+
+namespace onert
+{
+namespace backend
+{
+namespace gpu_cl
+{
+
+class MemoryManager
+{
+public:
+  MemoryManager(tflite::gpu::cl::CLContext *context) : _context{context} {}
+
+  ~MemoryManager() = default;
+
+  void allocate(void)
+  {
+    for (const auto &tensor_entry : _tensors)
+    {
+      auto tensor = tensor_entry.second;
+      auto type = tensor->get_type();
+
+      const auto &t = tensor_reserver_.Get(tensor_entry.first.value());
+      const auto &shape = t->shape;
+      const auto &descriptor = t->descriptor;
+      if (!CreateTensor(*_context, shape, descriptor, tensor->handle()).ok())
+      {
+        throw std::runtime_error("Failed to CreateTensor");
+      }
+      switch (type)
+      {
+        case TensorType::TENSOR_TYPE_INPUT:
+          tensor->writeConvertInit();
+          break;
+        case TensorType::TENSOR_TYPE_OUTPUT:
+          tensor->readConvertInit();
+          break;
+        default:
+          break;
+      }
+    }
+  }
+
+  void deallocate(void)
+  {
+    // NYI
+  }
+
+  void startLifetime(const ir::OperandIndex &)
+  { /* DO NOTHING */
+  }
+  void finishLifetime(const ir::OperandIndex &)
+  { /* DO NOTHING */
+  }
+
+  void buildTensor(const ir::OperandIndex &ind, const ir::OperandInfo &info,
+                   tflite::gpu::cl::InferenceContext::CreateInferenceInfo create_info,
+                   std::shared_ptr<tflite::gpu::cl::Environment> environment,
+                   tflite::gpu::cl::DeviceInfo &device_info, TensorType type)
+  {
+    tflite::gpu::ValueId max_id = 0;
+    auto data_type = DeduceDataTypeFromPrecision(create_info.precision);
+    const auto shape = info.shape();
+
+    auto tensor = std::make_shared<operand::CLTensor>(shape.rank(), shape, environment, type);
+    _tensors[ind] = tensor;
+    tflite::gpu::BHWC t_shape;
+    switch (shape.rank())
+    {
+      case 1:
+        // B layout
+        t_shape = tflite::gpu::BHWC(shape.dim(0), 1, 1, 1);
+        break;
+      case 2:
+        // BC layout
+        t_shape = tflite::gpu::BHWC(shape.dim(0), 1, 1, shape.dim(1));
+        break;
+      case 3:
+        // BWC layout
+        t_shape = tflite::gpu::BHWC(shape.dim(0), 1, shape.dim(1), shape.dim(2));
+        break;
+      case 4:
+        // BHWC layout
+        t_shape = tflite::gpu::BHWC(shape.dim(0), shape.dim(1), shape.dim(2), shape.dim(3));
+        break;
+      default:
+        break;
+    }
+
+    tflite::gpu::cl::TensorStorageType storage_type = create_info.storage_type;
+    tflite::gpu::Layout layout =
+      t_shape.b == 1 ?
tflite::gpu::Layout::HWC : tflite::gpu::Layout::BHWC; + + tflite::gpu::ValueId id = ind.value(); + storage_type = + tflite::gpu::cl::SelectBestStorageType(device_info, t_shape, storage_type, data_type, layout); + auto dummy = std::make_shared<InferenceContextEx::DummyTensor>(); + dummy->shape = t_shape; + dummy->descriptor = tflite::gpu::cl::TensorDescriptor{data_type, storage_type, layout}; + tensor_reserver_.Add(id, dummy); + + max_id = std::max(max_id, id); + + tensor_reserver_.SetNext(max_id + 1); + } + + ir::OperandIndexMap<std::shared_ptr<operand::CLTensor>> &tensors(void) { return _tensors; } + + InferenceContextEx::TensorReserverEx &tensorReservers(void) { return tensor_reserver_; } + +private: + ir::OperandIndexMap<std::shared_ptr<operand::CLTensor>> _tensors; + InferenceContextEx::TensorReserverEx tensor_reserver_; + tflite::gpu::cl::CLContext *_context; +}; + +} // namespace gpu_cl +} // namespace backend +} // namespace onert + +#endif // __ONERT_BACKEND_GPU_CL_MEMORY_MANAGER_H__ diff --git a/runtime/onert/backend/gpu_cl/TensorBuilder.cc b/runtime/onert/backend/gpu_cl/TensorBuilder.cc new file mode 100644 index 000000000..e71733427 --- /dev/null +++ b/runtime/onert/backend/gpu_cl/TensorBuilder.cc @@ -0,0 +1,135 @@ +/* + * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+#include <memory>
+#include <queue>
+
+#include "TensorBuilder.h"
+
+#include "TensorManager.h"
+
+#include "tensorflow/lite/delegates/gpu/cl/tensor_type.h"
+#include "tensorflow/lite/delegates/gpu/cl/tensor_type_util.h"
+#include "tensorflow/lite/delegates/gpu/cl/cl_device.h"
+#include "tensorflow/lite/delegates/gpu/cl/inference_context.h"
+
+#include "ir/OperandIndexMap.h"
+#include "ir/OperandIndexSequence.h"
+#include <ir/Operands.h>
+#include <util/Utils.h>
+
+#include <cassert>
+#include <stack>
+
+#include "util/logging.h"
+
+namespace onert
+{
+namespace backend
+{
+namespace gpu_cl
+{
+
+using UsesType = cl_common::UsesType;
+
+TensorBuilder::TensorBuilder(const ir::Operands &operands, TensorManager *tensor_mgr,
+                             tflite::gpu::cl::InferenceContext::CreateInferenceInfo create_info,
+                             const std::shared_ptr<tflite::gpu::cl::Environment> &environment)
+  : _operands{operands}, _tensor_mgr{tensor_mgr}, _create_info{create_info}, _environment{
+                                                                               environment}
+{
+  assert(_tensor_mgr);
+}
+
+void TensorBuilder::registerTensorInfo(const ir::OperandIndex &ind, const ir::OperandInfo &info,
+                                       ir::Layout backend_layout, TensorType type)
+{
+  assert(_tensor_mgr->constTensors().size() == 0);
+  assert(_tensor_mgr->nonconstTensors().size() == 0);
+
+  _uses_count_map[ind] = _operands.at(ind).getUses().size();
+
+  _tensor_info_map.emplace(ind, info);
+  _tensor_type_map.emplace(ind, type);
+
+  _tensor_layout_map.insert({ind, backend_layout});
+}
+
+void TensorBuilder::notifyFirstUse(const ir::OperandIndex &ind)
+{
+  _lifetime_seq.emplace_back(UsesType::FIRST, ind);
+}
+
+void TensorBuilder::notifyLastUse(const ir::OperandIndex &ind)
+{
+  _lifetime_seq.emplace_back(UsesType::LAST, ind);
+}
+
+bool TensorBuilder::isRegistered(const ir::OperandIndex &ind) const
+{
+  return _tensor_info_map.find(ind) != _tensor_info_map.end();
+}
+
+void TensorBuilder::prepare(void) { buildTensors(); }
+
+void TensorBuilder::allocate(void)
+{
+  auto lifetime_map = cl_common::createLifetimeMap(_lifetime_seq, _parent_map);
+
+  for (auto &entry : lifetime_map)
+  {
+    auto &use = entry.second;
+    auto use_type = use.first;
+    auto use_index = use.second;
+    assert(use_index.valid());
+    if (use_type == UsesType::FIRST)
+      _tensor_mgr->startLifetime(use_index);
+    else
+      _tensor_mgr->finishLifetime(use_index);
+  }
+
+  _tensor_mgr->allocateConsts();
+
+  // TODO `_parent_map` is filled for all Concat nodes, even ones this backend does not handle.
+  // After refactoring BackendContext we can uncomment this assert:
+  // assert(_tensor_info_map.size() ==
+  //        _tensor_mgr->nonconstTensors().size() + num of constants of _tensor_info_map +
+  //        _parent_map.size());
+  _tensor_mgr->allocateNonconsts();
+}
+
+void TensorBuilder::postFunctionPrepare(void) { _tensor_mgr->tryDeallocConstants(); }
+
+void TensorBuilder::buildTensors(void)
+{
+  assert(_tensor_mgr->constTensors().size() == 0);
+  assert(_tensor_mgr->nonconstTensors().size() == 0);
+  // Normal tensors
+  for (auto &entry : _tensor_info_map)
+  {
+    auto ind = entry.first;
+    if (_parent_map.count(ind) > 0)
+      continue;
+    auto type = _tensor_type_map.at(ind);
+    const auto &info = entry.second;
+    _tensor_mgr->buildTensor(ind, info, _create_info, _environment, _environment->device().info_,
+                             type);
+  }
+}
+
+} // namespace gpu_cl
+} // namespace backend
+} // namespace onert
diff --git a/runtime/onert/backend/gpu_cl/TensorBuilder.h b/runtime/onert/backend/gpu_cl/TensorBuilder.h
index d55358191..2a5cb8b5e 100644
--- a/runtime/onert/backend/gpu_cl/TensorBuilder.h
+++
b/runtime/onert/backend/gpu_cl/TensorBuilder.h
@@ -17,10 +17,13 @@
 #ifndef __ONERT_BACKEND_GPU_CL_TENSOR_BUILDER_H__
 #define __ONERT_BACKEND_GPU_CL_TENSOR_BUILDER_H__
-#include <backend/basic/TensorBuilder.h>
-#include "operand/ICLTensor.h"
-#include "operand/CLTensor.h"
-#include "ClTensorBuilder.h"
+#include "TensorManager.h"
+
+#include <cl_common/LifetimeMap.h>
+#include <cl_common/ParentInfo.h>
+
+#include <ir/Operands.h>
+#include <ir/OperandIndexSequence.h>
 namespace onert
 {
@@ -28,8 +31,76 @@ namespace backend
 {
 namespace gpu_cl
 {
+class TensorBuilder
+{
+public:
+  TensorBuilder(const ir::Operands &operands, TensorManager *tensor_mgr,
+                tflite::gpu::cl::InferenceContext::CreateInferenceInfo create_info,
+                const std::shared_ptr<tflite::gpu::cl::Environment> &environment);
+
+  /**
+   * @brief Register tensor information to allocate on the gpu_cl backend
+   * @param[in] ind    Operand index
+   * @param[in] info   Tensor information
+   * @param[in] layout Tensor data layout
+   */
+  void registerTensorInfo(const ir::OperandIndex &ind, const ir::OperandInfo &info,
+                          ir::Layout backend_layout, TensorType type);
+
+  void notifyFirstUse(const ir::OperandIndex &);
+  void notifyLastUse(const ir::OperandIndex &);
+
+  bool isRegistered(const ir::OperandIndex &) const;
+
+  void prepare();
+  void allocate();
+  void postFunctionPrepare();
+
+  TensorManager *cl_tensor_manager(void) { return _tensor_mgr.get(); }
+
+  void setUsesCount(const ir::OperandIndex &index, size_t num_uses)
+  {
+    assert(_uses_count_map.find(index) != _uses_count_map.end() ? _uses_count_map[index] == num_uses
+                                                                : true);
+    _uses_count_map[index] = num_uses;
+  }
+
+  void parent_map(std::unordered_map<ir::OperandIndex, cl_common::ParentInfo> &&parent_map)
+  {
+    _parent_map = std::move(parent_map);
+  }
+
+  bool areSubTensorsOf(const ir::OperandIndex &parent, const ir::OperandIndexSequence &seq);
+
+  /**
+   * @brief Check whether a child tensor is allocated as a subtensor of a parent tensor
+   * @param[in] parent Index of parent
+   * @param[in] child  Index of child
+   * @return @c true if child is allocated as subtensor of parent, otherwise @c false
+   */
+  bool isSubTensorOf(const ir::OperandIndex &parent, const ir::OperandIndex &child);
+
+private:
+  void buildTensors(void);
+  ir::OperandIndex findRootParent(ir::OperandIndex index);
+
+private:
+  const ir::Operands &_operands;
+  ir::OperandIndexMap<ir::OperandInfo> _tensor_info_map;
+  ir::OperandIndexMap<ir::Layout> _tensor_layout_map;
+  ir::OperandIndexMap<TensorType> _tensor_type_map;
+  ir::OperandIndexMap<size_t> _uses_count_map;
+
+  std::unique_ptr<TensorManager> _tensor_mgr;
+  tflite::gpu::cl::InferenceContext::CreateInferenceInfo _create_info;
+  std::shared_ptr<tflite::gpu::cl::Environment> _environment;
+
+  // for linear executor
+  cl_common::LifetimeSeq _lifetime_seq;
-using TensorBuilder = ClTensorBuilder<operand::ICLTensor, operand::CLTensor>;
+  // Extra info for concat elimination
+  ir::OperandIndexMap<cl_common::ParentInfo> _parent_map;
+};
 } // namespace gpu_cl
 } // namespace backend
diff --git a/runtime/onert/backend/gpu_cl/open_cl/kernels/ConvCommon.h b/runtime/onert/backend/gpu_cl/TensorBuilderHelper.h
index 4700381dc..7290ff5da 100644
--- a/runtime/onert/backend/gpu_cl/open_cl/kernels/ConvCommon.h
+++ b/runtime/onert/backend/gpu_cl/TensorBuilderHelper.h
@@ -1,12 +1,11 @@
 /*
  * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright 2019 The TensorFlow Authors. All Rights Reserved.
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, @@ -15,8 +14,11 @@ * limitations under the License. */ -#ifndef __ONERT_BACKEND_GPU_CL_OPENCL_KERNELS_CONV_COMMON_H__ -#define __ONERT_BACKEND_GPU_CL_OPENCL_KERNELS_CONV_COMMON_H__ +#ifndef __ONERT_BACKEND_GPU_CL_TENSOR_BUILDER_HELPER_H__ +#define __ONERT_BACKEND_GPU_CL_TENSOR_BUILDER_HELPER_H__ + +#include "absl/status/status.h" +#include "tensorflow/lite/delegates/gpu/common/shape.h" namespace onert { @@ -25,20 +27,18 @@ namespace backend namespace gpu_cl { -enum class ConvWeightsLayout +enum TensorType { - kUnknown, - kOHWIOGroupI4O4, + TENSOR_TYPE_VALID = 0, + TENSOR_TYPE_INPUT = 1, + TENSOR_TYPE_OUTPUT = 2, + TENSOR_TYPE_DELETE = 3 }; -struct ConvWeightsDescription -{ - ConvWeightsLayout layout; - int output_group_size; -}; +absl::Status ExtractAxisFromIndex(int dims, int index, tflite::gpu::Axis *axis); } // namespace gpu_cl } // namespace backend } // namespace onert -#endif // __ONERT_BACKEND_GPU_CL_OPENCL_KERNELS_CONV_COMMON_H__ +#endif // __ONERT_BACKEND_GPU_CL_TENSOR_BUILDER_HELPER_H__ diff --git a/runtime/onert/backend/gpu_cl/TensorManager.cc b/runtime/onert/backend/gpu_cl/TensorManager.cc new file mode 100644 index 000000000..9fe0605ac --- /dev/null +++ b/runtime/onert/backend/gpu_cl/TensorManager.cc @@ -0,0 +1,138 @@ +/* + * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "TensorManager.h" + +#include <util/logging.h> + +#include <cassert> + +namespace onert +{ +namespace backend +{ +namespace gpu_cl +{ + +TensorManager::TensorManager(MemoryManager *const_mgr, MemoryManager *nonconst_mgr) + : _const_mgr{const_mgr}, _nonconst_mgr{nonconst_mgr} +{ + // DO NOTHING +} + +void TensorManager::allocateConsts(void) { _const_mgr->allocate(); } + +void TensorManager::allocateNonconsts(void) { _nonconst_mgr->allocate(); } + +void TensorManager::deallocateConsts(void) { _const_mgr->deallocate(); } + +void TensorManager::deallocateNonconsts(void) { _nonconst_mgr->deallocate(); } + +void TensorManager::buildTensor(const ir::OperandIndex &ind, const ir::OperandInfo &info, + tflite::gpu::cl::InferenceContext::CreateInferenceInfo create_info, + std::shared_ptr<tflite::gpu::cl::Environment> environment, + tflite::gpu::cl::DeviceInfo &device_info, TensorType type) +{ + assert(_ind_to_mgr.find(ind) == _ind_to_mgr.end()); + + if (info.isConstant()) + { + _const_mgr->buildTensor(ind, info, create_info, environment, device_info, type); + _ind_to_mgr.insert({ind, *_const_mgr}); + } + else + { + _nonconst_mgr->buildTensor(ind, info, create_info, environment, device_info, type); + _ind_to_mgr.insert({ind, *_nonconst_mgr}); + } +} + +void TensorManager::startLifetime(const ir::OperandIndex &ind) +{ + assert(_ind_to_mgr.find(ind) != _ind_to_mgr.end()); + _ind_to_mgr.at(ind).startLifetime(ind); +} + +void TensorManager::finishLifetime(const ir::OperandIndex &ind) +{ + assert(_ind_to_mgr.find(ind) != _ind_to_mgr.end()); + _ind_to_mgr.at(ind).finishLifetime(ind); +} + +std::shared_ptr<operand::ICLTensor> TensorManager::at(const ir::OperandIndex &ind) +{ + if (_ind_to_mgr.find(ind) == _ind_to_mgr.end()) + return nullptr; + + auto &tensors = _ind_to_mgr.at(ind).tensors(); + if (tensors.find(ind) != tensors.end()) + { + return tensors.at(ind); + } + + return nullptr; +} + +ir::OperandIndexMap<std::shared_ptr<operand::CLTensor>> &TensorManager::constTensors(void) +{ + return _const_mgr->tensors(); +} + +ir::OperandIndexMap<std::shared_ptr<operand::CLTensor>> &TensorManager::nonconstTensors(void) +{ + return _nonconst_mgr->tensors(); +} + +std::shared_ptr<InferenceContextEx::DummyTensor> TensorManager::atR(const ir::OperandIndex &ind) +{ + if (_nonconst_mgr->tensorReservers().HaveTensor(ind.value())) + { + return _nonconst_mgr->tensorReservers().Get(ind.value()); + } + else if (_const_mgr->tensorReservers().HaveTensor(ind.value())) + { + return _const_mgr->tensorReservers().Get(ind.value()); + } + return nullptr; +} + +InferenceContextEx::TensorReserverEx &TensorManager::constTensorReservers(void) +{ + return _const_mgr->tensorReservers(); +} + +InferenceContextEx::TensorReserverEx &TensorManager::nonconstTensorReservers(void) +{ + return _nonconst_mgr->tensorReservers(); +} + +void TensorManager::iterate(const std::function<void(const ir::OperandIndex &)> &fn) +{ + for (auto it : _nonconst_mgr->tensors()) + fn(it.first); + + for (auto it : _const_mgr->tensors()) + fn(it.first); +} + +void TensorManager::tryDeallocConstants(void) +{ + // NYI +} + +} // namespace gpu_cl +} // namespace backend +} // namespace onert diff --git a/runtime/onert/backend/gpu_cl/TensorManager.h b/runtime/onert/backend/gpu_cl/TensorManager.h index 111b5f8a7..52abc579a 100644 --- a/runtime/onert/backend/gpu_cl/TensorManager.h +++ b/runtime/onert/backend/gpu_cl/TensorManager.h @@ -14,15 +14,16 @@ * limitations under the License. 
*/ -#ifndef __ONERT_BACKEND_CL_TENSOR_MANAGER_H__ -#define __ONERT_BACKEND_CL_TENSOR_MANAGER_H__ +#ifndef __ONERT_BACKEND_GPU_CL_TENSOR_MANAGER_H__ +#define __ONERT_BACKEND_GPU_CL_TENSOR_MANAGER_H__ -#include "ClMemoryManager.h" -#include "ClTensorManager.h" -#include "open_cl/ClContext.h" -#include "operand/CLTensor.h" -#include "operand/ICLTensor.h" -#include "util/logging.h" +#include "MemoryManager.h" + +#include "tensorflow/lite/delegates/gpu/cl/inference_context.h" +#include "tensorflow/lite/delegates/gpu/cl/tensor_type.h" + +#include "ir/OperandInfo.h" +#include "ir/OperandIndexMap.h" namespace onert { @@ -31,13 +32,50 @@ namespace backend namespace gpu_cl { -using MemoryManager = ClMemoryManager<operand::ICLTensor, operand::CLTensor>; +class TensorManager +{ +public: + TensorManager(MemoryManager *const_mgr, MemoryManager *nonconst_mgr); + + virtual ~TensorManager() = default; + + void allocateConsts(void); + void allocateNonconsts(void); + void deallocateConsts(void); + void deallocateNonconsts(void); + + void buildTensor(const ir::OperandIndex &ind, const ir::OperandInfo &info, + tflite::gpu::cl::InferenceContext::CreateInferenceInfo create_info, + std::shared_ptr<tflite::gpu::cl::Environment> environment, + tflite::gpu::cl::DeviceInfo &device_info, TensorType type); + + std::shared_ptr<operand::ICLTensor> findTensorAsParent(const ir::OperandIndex &ind); + + void startLifetime(const ir::OperandIndex &ind); + void finishLifetime(const ir::OperandIndex &ind); + + std::shared_ptr<operand::ICLTensor> at(const ir::OperandIndex &ind); + std::shared_ptr<InferenceContextEx::DummyTensor> atR(const ir::OperandIndex &ind); + + InferenceContextEx::TensorReserverEx &constTensorReservers(void); + InferenceContextEx::TensorReserverEx &nonconstTensorReservers(void); + + ir::OperandIndexMap<std::shared_ptr<operand::CLTensor>> &constTensors(void); + ir::OperandIndexMap<std::shared_ptr<operand::CLTensor>> &nonconstTensors(void); + + void iterate(const std::function<void(const ir::OperandIndex &)> &fn); + + void tryDeallocConstants(void); -using TensorManager = ClTensorManager<operand::ICLTensor, operand::CLTensor>; +private: + std::unique_ptr<MemoryManager> _const_mgr; + std::unique_ptr<MemoryManager> _nonconst_mgr; + ir::OperandIndexMap<MemoryManager &> _ind_to_mgr; +}; -inline TensorManager *createTensorManager(CLContext *context) +inline TensorManager *createTensorManager(tflite::gpu::cl::CLContext *context) { - VERBOSE(createTensorManager) << "ClTensorManager" << std::endl; + VERBOSE(createTensorManager) << "GPU-CL TensorManager" << std::endl; return new TensorManager(new MemoryManager(context), new MemoryManager(context)); } @@ -45,4 +83,4 @@ inline TensorManager *createTensorManager(CLContext *context) } // namespace backend } // namespace onert -#endif // __ONERT_BACKEND_ACL_CL_TENSOR_MANAGER_H__ +#endif // __ONERT_BACKEND_GPU_CL_TENSOR_MANAGER_H__ diff --git a/runtime/onert/backend/gpu_cl/ClTensorRegistry.h b/runtime/onert/backend/gpu_cl/TensorRegistry.h index 1f0018bd1..6f17aff54 100644 --- a/runtime/onert/backend/gpu_cl/ClTensorRegistry.h +++ b/runtime/onert/backend/gpu_cl/TensorRegistry.h @@ -17,6 +17,8 @@ #ifndef __ONERT_BACKEND_GPU_CL_TENSOR_REGISTRY_H__ #define __ONERT_BACKEND_GPU_CL_TENSOR_REGISTRY_H__ +#include "TensorManager.h" + #include "backend/ITensorRegistry.h" namespace onert @@ -27,14 +29,14 @@ namespace gpu_cl { /** - * @brief Tensor registry class for acl backends + * @brief Tensor registry class for gpu-cl backends * - * This is implemented as a wrapper of 
AclTensorManager. + * This is implemented as a wrapper of TensorManager. */ -template <typename T_ClTensorManager> class ClTensorRegistry : public ITensorRegistry +class TensorRegistry : public ITensorRegistry { public: - ClTensorRegistry(T_ClTensorManager *tensor_mgr) : _tensor_mgr{tensor_mgr} {} + TensorRegistry(TensorManager *tensor_mgr) : _tensor_mgr{tensor_mgr} {} ITensor *getITensor(const ir::OperandIndex &ind) override { return _tensor_mgr->at(ind).get(); } @@ -45,7 +47,7 @@ public: auto getClTensorReserver(const ir::OperandIndex &ind) { return _tensor_mgr->atR(ind); } private: - T_ClTensorManager *_tensor_mgr; + TensorManager *_tensor_mgr; }; } // namespace gpu_cl diff --git a/runtime/onert/backend/gpu_cl/ex/InferenceContextEx.h b/runtime/onert/backend/gpu_cl/ex/InferenceContextEx.h new file mode 100644 index 000000000..f67387904 --- /dev/null +++ b/runtime/onert/backend/gpu_cl/ex/InferenceContextEx.h @@ -0,0 +1,108 @@ +/* + * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2019 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __ONERT_BACKEND_GPU_CL_INFERENCE_CONTEXT_EX_H__ +#define __ONERT_BACKEND_GPU_CL_INFERENCE_CONTEXT_EX_H__ + +#include "tensorflow/lite/delegates/gpu/cl/inference_context.h" +#include "tensorflow/lite/delegates/gpu/common/model.h" +#include "absl/strings/str_cat.h" + +namespace onert +{ +namespace backend +{ +namespace gpu_cl +{ + +class InferenceContextEx : public tflite::gpu::cl::InferenceContext +{ +public: + struct DummyTensor + { + tflite::gpu::BHWC shape; + tflite::gpu::cl::TensorDescriptor descriptor; + + bool operator==(const DummyTensor &b) const + { + return shape == b.shape && descriptor == b.descriptor; + } + }; + + class TensorReserverEx + { + public: + tflite::gpu::ValueId Add(const std::shared_ptr<DummyTensor> &dummy) + { + reservations_[next_] = dummy; + return next_++; + } + void Add(tflite::gpu::ValueId id, const std::shared_ptr<DummyTensor> &dummy) + { + reservations_[id] = dummy; + } + void SetNext(tflite::gpu::ValueId id) { next_ = id; } + bool HaveTensor(tflite::gpu::ValueId id) + { + return reservations_.find(id) != reservations_.end(); + } + std::shared_ptr<DummyTensor> Get(tflite::gpu::ValueId id) { return reservations_[id]; } + + std::vector<std::pair<tflite::gpu::ValueId, tflite::gpu::cl::TensorDescriptor>> + GetTensorDescs() const + { + std::vector<std::pair<tflite::gpu::ValueId, tflite::gpu::cl::TensorDescriptor>> result; + for (auto &v : reservations_) + { + tflite::gpu::cl::TensorDescriptor desc = v.second->descriptor; + desc.shape.b = v.second->shape.b; + desc.shape.h = v.second->shape.h; + desc.shape.w = v.second->shape.w; + desc.shape.d = 1; + desc.shape.c = v.second->shape.c; + result.push_back({v.first, desc}); + } + return result; + } + + void Add(const std::vector<std::pair<tflite::gpu::ValueId, tflite::gpu::cl::TensorDescriptor>> + &tensors) + { + for (auto &v : tensors) + { + auto dummy = std::make_shared<DummyTensor>(); 
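+        // Rebuild each reservation from the flattened descriptor list; GetTensorDescs()
+        // above pins depth (d) to 1, so only b/h/w/c are restored here.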
+ dummy->descriptor = v.second; + dummy->shape.b = v.second.shape.b; + dummy->shape.h = v.second.shape.h; + dummy->shape.w = v.second.shape.w; + dummy->shape.c = v.second.shape.c; + Add(v.first, dummy); + } + } + + private: + // absl::flat_hash_map<ValueId, DummyTensor> reservations_; + std::unordered_map<tflite::gpu::ValueId, std::shared_ptr<DummyTensor>> reservations_; + tflite::gpu::ValueId next_ = 0; + }; +}; + +} // namespace gpu_cl +} // namespace backend +} // namespace onert + +#endif // __ONERT_BACKEND_GPU_CL_INFERENCE_CONTEXT_EX_H__ diff --git a/runtime/onert/backend/gpu_cl/open_cl/Api.cc b/runtime/onert/backend/gpu_cl/open_cl/Api.cc deleted file mode 100644 index 10bf87c38..000000000 --- a/runtime/onert/backend/gpu_cl/open_cl/Api.cc +++ /dev/null @@ -1,202 +0,0 @@ -/* - * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright 2020 The TensorFlow Authors. All Rights Reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "Api.h" - -namespace onert -{ -namespace backend -{ -namespace gpu_cl -{ -namespace -{ - -struct ObjectTypeGetter -{ - ObjectType operator()(absl::monostate) const { return ObjectType::UNKNOWN; } - ObjectType operator()(OpenClBuffer) const { return ObjectType::OPENCL_BUFFER; } - ObjectType operator()(OpenClTexture) const { return ObjectType::OPENCL_TEXTURE; } - ObjectType operator()(CpuMemory) const { return ObjectType::CPU_MEMORY; } -}; - -struct ObjectValidityChecker -{ - bool operator()(absl::monostate) const { return false; } - bool operator()(OpenClBuffer obj) const { return obj.memobj; } - bool operator()(OpenClTexture obj) const { return obj.memobj; } - bool operator()(CpuMemory obj) const - { - return obj.data != nullptr && obj.size_bytes > 0 && - (data_type == DataType::UNKNOWN || obj.size_bytes % SizeOf(data_type) == 0); - } - DataType data_type; -}; - -} // namespace - -bool IsValid(const ObjectDef &def) -{ - return def.data_type != DataType::UNKNOWN && def.data_layout != DataLayout::UNKNOWN && - def.object_type != ObjectType::UNKNOWN; -} - -ObjectType GetType(const TensorObject &object) { return absl::visit(ObjectTypeGetter{}, object); } - -bool IsValid(const TensorObjectDef &def) { return IsValid(def.object_def); } - -bool IsValid(const TensorObjectDef &def, const TensorObject &object) -{ - return GetType(object) == def.object_def.object_type && - absl::visit(ObjectValidityChecker{def.object_def.data_type}, object); -} - -bool IsObjectPresent(ObjectType type, const TensorObject &obj) -{ - switch (type) - { - case ObjectType::CPU_MEMORY: - return absl::holds_alternative<CpuMemory>(obj); - case ObjectType::OPENCL_BUFFER: - return absl::holds_alternative<OpenClBuffer>(obj); - case ObjectType::OPENCL_TEXTURE: - return absl::holds_alternative<OpenClTexture>(obj); - case ObjectType::UNKNOWN: - return false; - } - return false; -} - -uint32_t NumElements(const TensorObjectDef &def) -{ - const auto &d = def.dimensions; - switch (def.object_def.data_layout) - { - case DataLayout::BHWC: - return 
d.product(); - case DataLayout::HWDC4: - case DataLayout::HDWC4: - case DataLayout::DHWC4: - return d.b * d.h * d.w * AlignByN(d.c, 4); - case DataLayout::UNKNOWN: - return 0; - } - return 0; -} - -int GetPosition(const InferenceOptions &options, InferencePriority p) -{ - if (options.priority1 == p) - return 1; - if (options.priority2 == p) - return 2; - if (options.priority3 == p) - return 3; - return 4; // least important -} - -PriorityImportance GetRelativeImportance(const InferenceOptions &options, InferencePriority p1, - InferencePriority p2) -{ - int p1_position = GetPosition(options, p1); - int p2_position = GetPosition(options, p2); - if (p1_position == p2_position) - return PriorityImportance::UNKNOWN; - return p1_position < p2_position ? PriorityImportance::HIGHER : PriorityImportance::LOWER; -} - -bool IsValid(const InferenceOptions &options) -{ - if (options.usage == InferenceUsage::UNKNOWN) - { - return false; - } - if (options.priority1 == InferencePriority::UNKNOWN || - options.priority2 == InferencePriority::UNKNOWN || - options.priority3 == InferencePriority::UNKNOWN) - { - return false; - } - if (options.priority1 == InferencePriority::AUTO) - { - return false; - } - if (options.priority2 == InferencePriority::AUTO && options.priority3 != InferencePriority::AUTO) - { - return false; - } - if (options.priority1 == options.priority2 || options.priority1 == options.priority3) - { - return false; - } - if (options.priority2 == options.priority3 && options.priority2 != InferencePriority::AUTO) - { - return false; - } - return true; -} - -// Implementation note: this resolution logic is shared between GL and CL -// backends, but they might have their own logic. Thus, the function is defined -// here just for code re-use purposes. -void ResolveAutoPriority(InferenceOptions *options) -{ - // priority1 cannot be AUTO as it would make options invalid. - if (options->priority2 == InferencePriority::AUTO) - { - switch (options->priority1) - { - case InferencePriority::MIN_LATENCY: - options->priority2 = InferencePriority::MIN_MEMORY_USAGE; - options->priority3 = InferencePriority::MAX_PRECISION; - return; - case InferencePriority::MIN_MEMORY_USAGE: - options->priority2 = InferencePriority::MAX_PRECISION; - options->priority3 = InferencePriority::MIN_LATENCY; - return; - case InferencePriority::MAX_PRECISION: - options->priority2 = InferencePriority::MIN_LATENCY; - options->priority3 = InferencePriority::MIN_MEMORY_USAGE; - return; - case InferencePriority::UNKNOWN: - case InferencePriority::AUTO: - // Invalid and unreachable option. - return; - } - } - - if (options->priority3 == InferencePriority::AUTO) - { - // Simply add the missing priority - if (GetPosition(*options, InferencePriority::MIN_LATENCY) == 4) - { - options->priority3 = InferencePriority::MIN_LATENCY; - } - else if (GetPosition(*options, InferencePriority::MAX_PRECISION) == 4) - { - options->priority3 = InferencePriority::MAX_PRECISION; - } - else if (GetPosition(*options, InferencePriority::MIN_MEMORY_USAGE) == 4) - { - options->priority3 = InferencePriority::MIN_MEMORY_USAGE; - } - } -} - -} // namespace gpu_cl -} // namespace backend -} // namespace onert
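[Editor's sketch, not part of the patch] A quick worked example of the priority helpers above. With priority1 = MIN_LATENCY, priority2 = MAX_PRECISION, priority3 = AUTO:

    GetPosition(options, InferencePriority::MIN_LATENCY) == 1
    GetPosition(options, InferencePriority::MAX_PRECISION) == 2
    GetPosition(options, InferencePriority::MIN_MEMORY_USAGE) == 4  // missing -> least important
    GetRelativeImportance(options, InferencePriority::MIN_LATENCY,
                          InferencePriority::MAX_PRECISION) == PriorityImportance::HIGHER

and IsValid(options) holds, matching the "VALID" combinations documented in Api.h below.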
diff --git a/runtime/onert/backend/gpu_cl/open_cl/Api.h b/runtime/onert/backend/gpu_cl/open_cl/Api.h deleted file mode 100644 index 35be3d99c..000000000 --- a/runtime/onert/backend/gpu_cl/open_cl/Api.h +++ /dev/null @@ -1,359 +0,0 @@ -/* - * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright 2020 The TensorFlow Authors. All Rights Reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef __ONERT_BACKEND_GPU_CL_OPENCL_API_H__ -#define __ONERT_BACKEND_GPU_CL_OPENCL_API_H__ - -// Usage example: -// -// // Builder is created from a model using GPU-specific parameters. -// std::unique_ptr<InferenceBuilder> builder = ...; -// -// // input data is coming from a texture -// // output data goes to CPU -// builder->SetInputObjectDef(0, {DataType::FLOAT16, DataLayout::DHWC4, -// ObjectType::OPENCL_TEXTURE, true}); -// builder->SetOutputObjectDef(0, {DataType::FLOAT32, DataLayout::BHWC, -// ObjectType::CPU_MEMORY, false}); -// std::unique_ptr<InferenceRunner> runner; -// RETURN_IF_ERROR(builder->Build(&runner)); // may take significant time. -// RETURN_IF_ERROR( -// runner->SetInputObject(0, OpenClTexture{texture_mem})); -// RETURN_IF_ERROR(runner->Run()); - -#include <cstdint> -#include <memory> -#include <vector> - -#include "absl/types/span.h" -#include "absl/types/variant.h" -#include "DataType.h" -#include "Status.h" -#include "Util.h" - -namespace onert -{ -namespace backend -{ -namespace gpu_cl -{ - -// Common abbreviations: -// B - batch -// H - height -// W - width -// C - channels -// D - depth := DivideRoundUp(C, 4) -// C4 - the constant 4. -enum class DataLayout -{ - UNKNOWN, - BHWC, - DHWC4, - HWDC4, - HDWC4, -}; - -enum class ObjectType -{ - UNKNOWN, - CPU_MEMORY, - OPENCL_TEXTURE, - OPENCL_BUFFER, -}; - -struct OpenClBuffer -{ - OpenClBuffer() = default; - explicit OpenClBuffer(cl_mem new_memobj) : memobj(new_memobj) {} - - cl_mem memobj = nullptr; -}; - -struct OpenClTexture -{ - OpenClTexture() = default; - explicit OpenClTexture(cl_mem new_memobj) : memobj(new_memobj) {} - - cl_mem memobj = nullptr; - // TODO(akulik): should it specify texture format? -}; - -struct CpuMemory -{ - CpuMemory() = default; - CpuMemory(void *new_data, size_t new_size_bytes) : data(new_data), size_bytes(new_size_bytes) {} - - void *data = nullptr; - size_t size_bytes = 0; -}; - -template <typename T> inline CpuMemory MakeCpuMemory(absl::Span<T> t) -{ - CpuMemory m; - m.data = t.data(); - m.size_bytes = t.size() * sizeof(T); - return m; -} - -template <typename T> inline CpuMemory MakeReadableCpuMemory(absl::Span<const T> t) -{ - CpuMemory m; - m.data = const_cast<T *>(t.data()); - m.size_bytes = t.size() * sizeof(T); - return m; -} - -// Defines object representation. -struct ObjectDef -{ - DataType data_type = DataType::UNKNOWN; - DataLayout data_layout = DataLayout::UNKNOWN; - ObjectType object_type = ObjectType::UNKNOWN; - - // If true, then object is managed externally and needs to be provided to - // InferenceRunner by a user before running inference. - // - // User-provided objects will not be re-used internally for any purpose to - // lower overall memory usage.
- bool user_provided = false; - - bool operator==(const ObjectDef &other) const - { - return data_type == other.data_type && data_layout == other.data_layout && - object_type == other.object_type && user_provided == other.user_provided; - } -}; - -bool IsValid(const ObjectDef &def); - -struct Dimensions -{ - Dimensions() : b(1), h(1), w(1), c(1) {} - - Dimensions(int32_t batch, int32_t height, int32_t width, int32_t channels) - : b(batch), h(height), w(width), c(channels) - { - } - - int32_t d() const { return DivideRoundUp(c, 4); } - - int32_t product() const { return b * h * w * c; } - - bool operator==(const Dimensions &other) const - { - return b == other.b && h == other.h && w == other.w && c == other.c; - } - - int32_t b; - int32_t h; - int32_t w; - int32_t c; -}; - -// Connects tensor shape with corresponding object definition. -struct TensorObjectDef -{ - // Dimensions semantics are defined by the corresponding DataLayout. - Dimensions dimensions; - ObjectDef object_def; - - bool operator==(const TensorObjectDef &other) const - { - return dimensions == other.dimensions && object_def == other.object_def; - } -}; - -// @return true if tensor object def is defined. -bool IsValid(const TensorObjectDef &def); - -// @return the number of elements in a tensor object. -uint32_t NumElements(const TensorObjectDef &def); - -using TensorObject = absl::variant<absl::monostate, CpuMemory, OpenClBuffer, OpenClTexture>; - -// @return true if object is set and corresponding values are defined. -bool IsValid(const TensorObjectDef &def, const TensorObject &object); - -ObjectType GetType(const TensorObject &object); - -// @return true if corresponding object is set for the given type -bool IsObjectPresent(ObjectType type, const TensorObject &obj); - -class InferenceRunner; - -// Allows inspecting and changing input and output definitions before a graph is -// prepared for inference. -class InferenceBuilder -{ -public: - virtual ~InferenceBuilder() {} - - // Returns inference graph input and output definitions. - virtual std::vector<TensorObjectDef> inputs() const = 0; - virtual std::vector<TensorObjectDef> outputs() const = 0; - - // Sets a new shape for the input if the underlying implementation and graph - // structure allow dynamic tensors. - virtual absl::Status SetInputShape(int index, const Dimensions &dimensions) = 0; - - // Updates object definitions for the given index. The implementation may allow - // different layouts and/or data type conversions between objects - // defined in a graph and the given objects, for example: - // input '0' is DataType::FLOAT32, DataLayout::BHWC. - // A user, however, has an input in DataType::FLOAT16, DataLayout::DHWC4. - // An implementation may allow this transformation to happen automatically - // under the hood. - virtual absl::Status SetInputObjectDef(int index, ObjectDef def) = 0; - virtual absl::Status SetOutputObjectDef(int index, ObjectDef def) = 0; - virtual absl::Status SetAllInputObjectDefsTo(ObjectDef def) - { - auto input_defs = inputs(); - for (size_t i = 0; i < input_defs.size(); ++i) - { - RETURN_IF_ERROR(SetInputObjectDef(i, def)); - } - return absl::OkStatus(); - } - virtual absl::Status SetAllOutputObjectDefsTo(ObjectDef def) - { - auto output_defs = outputs(); - for (size_t i = 0; i < output_defs.size(); ++i) - { - RETURN_IF_ERROR(SetOutputObjectDef(i, def)); - } - return absl::OkStatus(); - }
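[Editor's sketch, not part of the patch] For illustration, a caller feeding user-provided FP16/DHWC4 OpenCL buffers into a graph declared as FP32/BHWC could configure all inputs at once via the helper above; "builder" is assumed to exist:

    ObjectDef def;
    def.data_type = DataType::FLOAT16;
    def.data_layout = DataLayout::DHWC4;
    def.object_type = ObjectType::OPENCL_BUFFER;
    def.user_provided = true;
    RETURN_IF_ERROR(builder->SetAllInputObjectDefsTo(def));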
- - // Creates a new instance of the inference runner. InferenceBuilder stays valid - // and can be used to create another inference runner if needed. - // - // This method may take significant time to prepare a new inference runner. For - // example, it may require compiling OpenCL programs. - virtual absl::Status Build(std::unique_ptr<InferenceRunner> *runner) = 0; -}; - -// Runs a prepared inference. Every object marked as external needs to be set -// prior to calling the Run method. -class InferenceRunner -{ -public: - virtual ~InferenceRunner() {} - - // Returns inference graph input and output definitions. - virtual std::vector<TensorObjectDef> inputs() const = 0; - virtual std::vector<TensorObjectDef> outputs() const = 0; - - // Getters provide access to underlying objects for the given index. - // Setters set or change the external object for the given index. Note: - // objects need to match the object definitions set earlier in InferenceBuilder. - - virtual absl::Status GetInputObject(int index, TensorObject *object) = 0; - virtual absl::Status GetOutputObject(int index, TensorObject *object) = 0; - virtual absl::Status SetInputObject(int index, TensorObject object) = 0; - virtual absl::Status SetOutputObject(int index, TensorObject object) = 0; - - virtual absl::Status Run() = 0; -}; - -// Encapsulates compilation/runtime tradeoffs. -enum class InferenceUsage -{ - UNKNOWN, - - // InferenceRunner will be used only once. Therefore, it is important to - // minimize bootstrap time as well. - FAST_SINGLE_ANSWER, - - // Prefer maximizing the throughput. The same inference runner will be used - // repeatedly on different inputs. - SUSTAINED_SPEED, -}; - -// Defines aspects to control while instantiating a runner. -enum class InferencePriority -{ - UNKNOWN, - - AUTO, - - MIN_LATENCY, - - MAX_PRECISION, - - MIN_MEMORY_USAGE, -}; - -struct InferenceOptions -{ - InferenceUsage usage = InferenceUsage::SUSTAINED_SPEED; - - // Ordered priorities provide better understanding of desired semantics, - // where priority(n) is more important than priority(n+1). - // AUTO priority is needed when a single priority is the most important - // factor. For example, priority1 = InferencePriority::MIN_LATENCY and leaving - // everything else to AUTO would result in a configuration that achieves maximum - // performance. - // - // AUTO priority can only be used when higher priorities are fully specified. - // For example: - // VALID: priority1 = MIN_LATENCY, priority2 = AUTO, priority3 = AUTO - // VALID: priority1 = MIN_LATENCY, priority2 = MAX_PRECISION, - // priority3 = AUTO - // INVALID: priority1 = AUTO, priority2 = MIN_LATENCY, priority3 = AUTO - // INVALID: priority1 = MIN_LATENCY, priority2 = AUTO, - // priority3 = MAX_PRECISION - // Invalid priorities will result in an error. - InferencePriority priority1 = InferencePriority::MAX_PRECISION; - - InferencePriority priority2 = InferencePriority::AUTO; - - InferencePriority priority3 = InferencePriority::AUTO; -}; - -// Returns a position number for the priority. If priority is missing, -// then it would return 'max num priorities + 1'. -int GetPosition(const InferenceOptions &options, InferencePriority p); - -// Returns true if options are valid. -bool IsValid(const InferenceOptions &options); - -// Resolves AUTO priorities and specifies them explicitly. -// Note: no one should assume that these mappings will not change. -// Technically this function is declared here for code re-use purposes and -// it should by no means be treated as the canonical way to resolve AUTO. -void ResolveAutoPriority(InferenceOptions *options);
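[Editor's sketch, not part of the patch] A worked example of the AUTO resolution implemented in Api.cc above: starting from the defaults, setting only the top priority leaves the rest to be filled in deterministically:

    InferenceOptions options;
    options.usage = InferenceUsage::FAST_SINGLE_ANSWER;
    options.priority1 = InferencePriority::MIN_LATENCY;  // priority2/3 default to AUTO
    ResolveAutoPriority(&options);
    // Now: priority2 == MIN_MEMORY_USAGE, priority3 == MAX_PRECISION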
- -enum class PriorityImportance -{ - UNKNOWN, - HIGHER, - LOWER, -}; - -// If both p1 and p2 are not present in options, return UNKNOWN -// If p1 is present, but p2 is not, return HIGHER -// If p2 is present, but p1 is not, return LOWER -// If both are present, and p1 is more important, return HIGHER, otherwise, -// LOWER. -PriorityImportance GetRelativeImportance(const InferenceOptions &options, InferencePriority p1, - InferencePriority p2); -} // namespace gpu_cl -} // namespace backend -} // namespace onert - -#endif // __ONERT_BACKEND_GPU_CL_OPENCL_API_H__ diff --git a/runtime/onert/backend/gpu_cl/open_cl/Arguments.cc b/runtime/onert/backend/gpu_cl/open_cl/Arguments.cc deleted file mode 100644 index a7f86bffc..000000000 --- a/runtime/onert/backend/gpu_cl/open_cl/Arguments.cc +++ /dev/null @@ -1,926 +0,0 @@ -/* - * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright 2020 The TensorFlow Authors. All Rights Reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "Arguments.h" - -#include "absl/strings/ascii.h" -#include "absl/strings/str_cat.h" -#include "absl/strings/str_replace.h" -#include "absl/strings/str_split.h" -#include "absl/strings/substitute.h" - -#include "AccessType.h" -#include "TensorType.h" -#include "DataType.h" - -namespace onert -{ -namespace backend -{ -namespace gpu_cl -{ - -namespace -{ - -bool IsWordSymbol(char symbol) { return absl::ascii_isalnum(symbol) || symbol == '_'; } - -std::string GetNextWord(const std::string &code, size_t first_position) -{ - size_t pos = first_position; - char t = code[pos]; - while (IsWordSymbol(t)) - { - pos++; - t = code[pos]; - } - return code.substr(first_position, pos - first_position); -} - -size_t FindEnclosingBracket(const std::string &text, size_t first_pos, char bracket) -{ - const std::map<char, char> brackets = { - {'(', ')'}, - {'{', '}'}, - {'[', ']'}, - {'<', '>'}, - }; - char b_open = bracket; - auto it = brackets.find(b_open); - if (it == brackets.end()) - { - return -1; - } - char b_close = it->second; - size_t pos = first_pos; - int opened = 1; - int closed = 0; - while (opened != closed && pos < text.size()) - { - if (text[pos] == b_open) - { - opened++; - } - else if (text[pos] == b_close) - { - closed++; - } - pos++; - } - if (opened == closed) - { - return pos; - } - else - { - return -1; - } -} - -absl::Status ParseArgsInsideBrackets(const std::string &text, size_t open_bracket_pos, - size_t *close_bracket_pos, std::vector<std::string> *args) -{ - *close_bracket_pos = FindEnclosingBracket(text, open_bracket_pos + 1, text[open_bracket_pos]); - if (*close_bracket_pos == static_cast<size_t>(-1)) - { - return absl::NotFoundError("Enclosing bracket not found"); - } - std::string str_args = - text.substr(open_bracket_pos + 1, *close_bracket_pos - open_bracket_pos - 2); - std::vector<absl::string_view> words = absl::StrSplit(str_args, ','); - args->reserve(words.size()); - for (const auto &word : words) - {
- absl::string_view arg = absl::StripAsciiWhitespace(word); - if (!arg.empty()) - { - args->push_back(std::string(arg)); - } - } - return absl::OkStatus(); -} - -void ReplaceAllWords(const std::string &old_word, const std::string &new_word, std::string *str) -{ - size_t position = str->find(old_word); - while (position != std::string::npos) - { - char prev = position == 0 ? '.' : (*str)[position - 1]; - char next = position + old_word.size() < str->size() ? (*str)[position + old_word.size()] : '.'; - if (IsWordSymbol(prev) || IsWordSymbol(next)) - { - position = str->find(old_word, position + 1); - continue; - } - str->replace(position, old_word.size(), new_word); - position = str->find(old_word, position + new_word.size()); - } -} - -std::string RenameArg(const std::vector<std::string> &object_names, const std::string &postfix, - const std::string &arg_name) -{ - for (const auto &object_name : object_names) - { - if (absl::StartsWith(arg_name, object_name) && arg_name.size() > object_name.size() && - arg_name[object_name.size()] == '_') - { - return object_name + postfix + - arg_name.substr(object_name.size(), arg_name.size() - object_name.size()); - } - } - return arg_name + postfix; -} - -void AppendArgument(const std::string &arg, std::string *args) -{ - if (!args->empty()) - { - absl::StrAppend(args, ",\n "); - } - absl::StrAppend(args, arg); -} - -std::string GetImageModifier(AccessType access) -{ - switch (access) - { - case AccessType::READ: - return "__read_only"; - case AccessType::WRITE: - return "__write_only"; - case AccessType::READ_WRITE: - return "__read_write"; - default: - throw std::runtime_error("Invalid AccessType"); - } -} - -std::string GetDefaultSamplers(const DeviceInfo &device_info) -{ - std::string result; - result += "__constant sampler_t smp_none = CLK_NORMALIZED_COORDS_FALSE | " - "CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;\n"; - if (device_info.IsAdreno3xx()) - { - // Unfortunately, CLK_ADDRESS_CLAMP is very slow on Adreno3xx and - // we can observe huge register overhead when compared to other modes. - - // While using CLK_ADDRESS_NONE with out-of-range image coordinates is - // undefined in the OpenCL specification, we have observed that - // CLK_ADDRESS_NONE works like CLK_ADDRESS_CLAMP for out-of-range image - // coordinates for RGBA F16/F32 textures on Adreno3xx devices. Using - // CLK_ADDRESS_NONE is significantly faster than CLK_ADDRESS_CLAMP on Adreno - // 3xx. 
- result += "__constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | " - "CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;\n"; - } - else - { - result += "__constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | " - "CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;\n"; - } - - return result; -} - -} // namespace - -// Static -constexpr char Arguments::kArgsPrefix[]; - -Arguments::Arguments(Arguments &&args) - : int_values_(std::move(args.int_values_)), - shared_int4s_data_(std::move(args.shared_int4s_data_)), - float_values_(std::move(args.float_values_)), - shared_float4s_data_(std::move(args.shared_float4s_data_)), buffers_(std::move(args.buffers_)), - images2d_(std::move(args.images2d_)), image2d_arrays_(std::move(args.image2d_arrays_)), - images3d_(std::move(args.images3d_)), image_buffers_(std::move(args.image_buffers_)), - custom_memories_(std::move(args.custom_memories_)), object_refs_(std::move(args.object_refs_)), - objects_(std::move(args.objects_)) -{ -} -Arguments &Arguments::operator=(Arguments &&args) -{ - if (this != &args) - { - int_values_ = std::move(args.int_values_); - shared_int4s_data_ = std::move(args.shared_int4s_data_); - float_values_ = std::move(args.float_values_); - shared_float4s_data_ = std::move(args.shared_float4s_data_); - buffers_ = std::move(args.buffers_); - images2d_ = std::move(args.images2d_); - image2d_arrays_ = std::move(args.image2d_arrays_); - images3d_ = std::move(args.images3d_); - image_buffers_ = std::move(args.image_buffers_); - custom_memories_ = std::move(args.custom_memories_); - object_refs_ = std::move(args.object_refs_); - objects_ = std::move(args.objects_); - } - return *this; -} - -void Arguments::AddFloat(const std::string &name, float value) -{ - float_values_[name].value = value; -} -void Arguments::AddInt(const std::string &name, int value) { int_values_[name].value = value; } -void Arguments::AddBuffer(const std::string &name, const GPUBufferDescriptor &desc) -{ - buffers_[name] = desc; -} -void Arguments::AddImage2D(const std::string &name, const GPUImage2DDescriptor &desc) -{ - images2d_[name] = desc; -} - -void Arguments::AddImage2DArray(const std::string &name, const GPUImage2DArrayDescriptor &desc) -{ - image2d_arrays_[name] = desc; -} - -void Arguments::AddImage3D(const std::string &name, const GPUImage3DDescriptor &desc) -{ - images3d_[name] = desc; -} - -void Arguments::AddImageBuffer(const std::string &name, const GPUImageBufferDescriptor &desc) -{ - image_buffers_[name] = desc; -} - -void Arguments::AddCustomMemory(const std::string &name, const GPUCustomMemoryDescriptor &desc) -{ - custom_memories_[name] = desc; -} - -void Arguments::AddObjectRef(const std::string &name, AccessType access_type, - GPUObjectDescriptorPtr &&descriptor_ptr) -{ - descriptor_ptr->SetAccess(access_type); - object_refs_[name] = {std::move(descriptor_ptr)}; -} - -void Arguments::AddObject(const std::string &name, GPUObjectDescriptorPtr &&descriptor_ptr) -{ - descriptor_ptr->SetAccess(AccessType::READ); - objects_[name] = {nullptr, std::move(descriptor_ptr)}; -} - -void Arguments::AddGPUResources(const std::string &name, const GPUResources &resources) -{ - for (const auto &r : resources.ints) - { - AddInt(absl::StrCat(name, "_", r)); - } - for (const auto &r : resources.floats) - { - AddFloat(absl::StrCat(name, "_", r)); - } - for (const auto &r : resources.buffers) - { - AddBuffer(absl::StrCat(name, "_", r.first), r.second); - } - for (const auto &r : resources.images2d) - { - AddImage2D(absl::StrCat(name, "_", r.first), r.second); - } - for (const 
auto &r : resources.image2d_arrays) - { - AddImage2DArray(absl::StrCat(name, "_", r.first), r.second); - } - for (const auto &r : resources.images3d) - { - AddImage3D(absl::StrCat(name, "_", r.first), r.second); - } - for (const auto &r : resources.image_buffers) - { - AddImageBuffer(absl::StrCat(name, "_", r.first), r.second); - } - for (const auto &r : resources.custom_memories) - { - AddCustomMemory(absl::StrCat(name, "_", r.first), r.second); - } -} - -absl::Status Arguments::SetInt(const std::string &name, int value) -{ - auto it = int_values_.find(name); - if (it == int_values_.end()) - { - return absl::NotFoundError(absl::StrCat("No int argument with name - ", name)); - } - it->second.value = value; - if (it->second.active) - { - shared_int4s_data_[it->second.offset] = value; - } - return absl::OkStatus(); -} - -absl::Status Arguments::SetFloat(const std::string &name, float value) -{ - auto it = float_values_.find(name); - if (it == float_values_.end()) - { - return absl::NotFoundError(absl::StrCat("No float argument with name - ", name)); - } - it->second.value = value; - if (it->second.active) - { - shared_float4s_data_[it->second.offset] = value; - } - return absl::OkStatus(); -} - -absl::Status Arguments::SetImage2D(const std::string &name, cl_mem memory) -{ - auto it = images2d_.find(name); - if (it == images2d_.end()) - { - return absl::NotFoundError(absl::StrCat("No image2D argument with name - ", name)); - } - it->second.memory = memory; - return absl::OkStatus(); -} - -absl::Status Arguments::SetBuffer(const std::string &name, cl_mem memory) -{ - auto it = buffers_.find(name); - if (it == buffers_.end()) - { - return absl::NotFoundError(absl::StrCat("No buffer argument with name - ", name)); - } - it->second.memory = memory; - return absl::OkStatus(); -} - -absl::Status Arguments::SetImage2DArray(const std::string &name, cl_mem memory) -{ - auto it = image2d_arrays_.find(name); - if (it == image2d_arrays_.end()) - { - return absl::NotFoundError(absl::StrCat("No image2D array argument with name - ", name)); - } - it->second.memory = memory; - return absl::OkStatus(); -} - -absl::Status Arguments::SetImage3D(const std::string &name, cl_mem memory) -{ - auto it = images3d_.find(name); - if (it == images3d_.end()) - { - return absl::NotFoundError(absl::StrCat("No image3D argument with name - ", name)); - } - it->second.memory = memory; - return absl::OkStatus(); -} - -absl::Status Arguments::SetImageBuffer(const std::string &name, cl_mem memory) -{ - auto it = image_buffers_.find(name); - if (it == image_buffers_.end()) - { - return absl::NotFoundError(absl::StrCat("No image buffer argument with name - ", name)); - } - it->second.memory = memory; - return absl::OkStatus(); -} - -absl::Status Arguments::SetCustomMemory(const std::string &name, cl_mem memory) -{ - auto it = custom_memories_.find(name); - if (it == custom_memories_.end()) - { - return absl::NotFoundError(absl::StrCat("No custom memory argument with name - ", name)); - } - it->second.memory = memory; - return absl::OkStatus(); -} - -absl::Status Arguments::SetObjectRef(const std::string &name, const GPUObject *object) -{ - auto it = object_refs_.find(name); - if (it == object_refs_.end()) - { - return absl::NotFoundError(absl::StrCat("No object ref with name - ", name)); - } - GPUResourcesWithValue resources; - RETURN_IF_ERROR(object->GetGPUResources(it->second.descriptor.get(), &resources)); - return SetGPUResources(name, resources); -} - -absl::Status Arguments::SetGPUResources(const std::string &name, - const 
GPUResourcesWithValue &resources) -{ - for (const auto &r : resources.ints) - { - RETURN_IF_ERROR(SetInt(absl::StrCat(name, "_", r.first), r.second)); - } - for (const auto &r : resources.floats) - { - RETURN_IF_ERROR(SetFloat(absl::StrCat(name, "_", r.first), r.second)); - } - for (const auto &r : resources.buffers) - { - RETURN_IF_ERROR(SetBuffer(absl::StrCat(name, "_", r.first), r.second)); - } - for (const auto &r : resources.images2d) - { - RETURN_IF_ERROR(SetImage2D(absl::StrCat(name, "_", r.first), r.second)); - } - for (const auto &r : resources.image2d_arrays) - { - RETURN_IF_ERROR(SetImage2DArray(absl::StrCat(name, "_", r.first), r.second)); - } - for (const auto &r : resources.images3d) - { - RETURN_IF_ERROR(SetImage3D(absl::StrCat(name, "_", r.first), r.second)); - } - for (const auto &r : resources.image_buffers) - { - RETURN_IF_ERROR(SetImageBuffer(absl::StrCat(name, "_", r.first), r.second)); - } - for (const auto &r : resources.custom_memories) - { - RETURN_IF_ERROR(SetCustomMemory(absl::StrCat(name, "_", r.first), r.second)); - } - return absl::OkStatus(); -} -void Arguments::RenameArgs(const std::string &postfix, std::string *code) const -{ - size_t next_position = code->find(kArgsPrefix); - while (next_position != std::string::npos) - { - size_t arg_pos = next_position + strlen(kArgsPrefix); - std::string arg_name = GetNextWord(*code, arg_pos); - code->replace(arg_pos, arg_name.size(), arg_name + postfix); - next_position = code->find(kArgsPrefix, arg_pos + arg_name.size()); - } -} - -absl::Status Arguments::Merge(Arguments &&args, const std::string &postfix) -{ - std::vector<std::string> object_names; - object_names.reserve(args.object_refs_.size() + args.objects_.size()); - for (auto &v : args.object_refs_) - { - object_names.push_back(v.first); - const std::string name = v.first + postfix; - if (object_refs_.find(name) != object_refs_.end()) - { - return absl::InvalidArgumentError( - absl::StrCat("Object reference name collision. Name - ", name)); - } - object_refs_[name] = {std::move(v.second.descriptor)}; - } - for (auto &v : args.objects_) - { - object_names.push_back(v.first); - const std::string name = v.first + postfix; - if (objects_.find(name) != objects_.end()) - { - return absl::InvalidArgumentError(absl::StrCat("Object name collision. 
Name - ", name)); - } - objects_[name] = {std::move(v.second.obj_ptr), std::move(v.second.descriptor)}; - } - for (const auto &v : args.int_values_) - { - AddInt(RenameArg(object_names, postfix, v.first), v.second.value); - } - for (const auto &v : args.float_values_) - { - AddFloat(RenameArg(object_names, postfix, v.first), v.second.value); - } - for (const auto &v : args.buffers_) - { - AddBuffer(RenameArg(object_names, postfix, v.first), v.second); - } - for (const auto &v : args.images2d_) - { - AddImage2D(RenameArg(object_names, postfix, v.first), v.second); - } - for (const auto &v : args.image2d_arrays_) - { - AddImage2DArray(RenameArg(object_names, postfix, v.first), v.second); - } - for (const auto &v : args.images3d_) - { - AddImage3D(RenameArg(object_names, postfix, v.first), v.second); - } - for (const auto &v : args.image_buffers_) - { - AddImageBuffer(RenameArg(object_names, postfix, v.first), v.second); - } - for (const auto &v : args.custom_memories_) - { - AddCustomMemory(RenameArg(object_names, postfix, v.first), v.second); - } - return absl::OkStatus(); -} - -absl::Status Arguments::TransformToCLCode(const DeviceInfo &device_info, - const std::map<std::string, std::string> &linkables, - std::string *code) -{ - RETURN_IF_ERROR(AddObjectArgs()); - RETURN_IF_ERROR(ResolveSelectorsPass(linkables, code)); - ResolveArgsPass(device_info, code); - *code = absl::Substitute(*code, GetListOfArgs()); - *code = GetDefaultSamplers(device_info) + *code; - return absl::OkStatus(); -} - -std::string Arguments::GetListOfArgs() -{ - std::string result; - for (auto &t : buffers_) - { - const std::string type_name = t.second.data_type == DataType::FLOAT32 ? "float" : "half"; - std::string attributes; - for (const auto &attr : t.second.attributes) - { - attributes += absl::StrCat(" __attribute__((", attr, "))"); - } - AppendArgument(absl::StrCat(MemoryTypeToCLType(t.second.memory_type), " ", - ToCLDataType(t.second.data_type, t.second.element_size), "* ", - t.first, attributes), - &result); - } - for (auto &t : image_buffers_) - { - AppendArgument( - absl::StrCat(GetImageModifier(t.second.access_type), " image1d_buffer_t ", t.first), &result); - } - for (auto &t : images2d_) - { - AppendArgument(absl::StrCat(GetImageModifier(t.second.access_type), " image2d_t ", t.first), - &result); - } - for (auto &t : image2d_arrays_) - { - AppendArgument( - absl::StrCat(GetImageModifier(t.second.access_type), " image2d_array_t ", t.first), &result); - } - for (auto &t : images3d_) - { - AppendArgument(absl::StrCat(GetImageModifier(t.second.access_type), " image3d_t ", t.first), - &result); - } - for (auto &t : custom_memories_) - { - AppendArgument(absl::StrCat(t.second.type_name, " ", t.first), &result); - } - for (uint32_t i = 0; i < shared_int4s_data_.size() / 4; ++i) - { - AppendArgument(absl::StrCat("int4 shared_int4_", i), &result); - } - for (uint32_t i = 0; i < shared_float4s_data_.size() / 4; ++i) - { - AppendArgument(absl::StrCat("float4 shared_float4_", i), &result); - } - return result; -} - -absl::Status Arguments::Bind(cl_kernel kernel, int offset) -{ - for (auto &t : buffers_) - { - const int error_code = clSetKernelArg(kernel, offset, sizeof(cl_mem), &t.second.memory); - if (error_code != CL_SUCCESS) - { - return absl::UnknownError(absl::StrCat("Failed to set kernel arguments - ", - CLErrorCodeToString(error_code), "(at index - ", - offset, ")")); - } - offset++; - } - for (auto &t : image_buffers_) - { - const int error_code = clSetKernelArg(kernel, offset, sizeof(cl_mem), 
&t.second.memory); - if (error_code != CL_SUCCESS) - { - return absl::UnknownError(absl::StrCat("Failed to set kernel arguments - ", - CLErrorCodeToString(error_code), "(at index - ", - offset, ")")); - } - offset++; - } - for (auto &t : images2d_) - { - const int error_code = clSetKernelArg(kernel, offset, sizeof(cl_mem), &t.second.memory); - if (error_code != CL_SUCCESS) - { - return absl::UnknownError(absl::StrCat("Failed to set kernel arguments - ", - CLErrorCodeToString(error_code), "(at index - ", - offset, ")")); - } - offset++; - } - for (auto &t : image2d_arrays_) - { - const int error_code = clSetKernelArg(kernel, offset, sizeof(cl_mem), &t.second.memory); - if (error_code != CL_SUCCESS) - { - return absl::UnknownError(absl::StrCat("Failed to set kernel arguments - ", - CLErrorCodeToString(error_code), "(at index - ", - offset, ")")); - } - offset++; - } - for (auto &t : images3d_) - { - const int error_code = clSetKernelArg(kernel, offset, sizeof(cl_mem), &t.second.memory); - if (error_code != CL_SUCCESS) - { - return absl::UnknownError(absl::StrCat("Failed to set kernel arguments - ", - CLErrorCodeToString(error_code), "(at index - ", - offset, ")")); - } - offset++; - } - for (auto &t : custom_memories_) - { - const int error_code = clSetKernelArg(kernel, offset, sizeof(cl_mem), &t.second.memory); - if (error_code != CL_SUCCESS) - { - return absl::UnknownError(absl::StrCat("Failed to set kernel arguments - ", - CLErrorCodeToString(error_code), "(at index - ", - offset, ")")); - } - offset++; - } - for (size_t i = 0; i < shared_int4s_data_.size() / 4; ++i) - { - const int error_code = - clSetKernelArg(kernel, offset, sizeof(int32_t) * 4, &shared_int4s_data_[i * 4]); - if (error_code != CL_SUCCESS) - { - return absl::UnknownError(absl::StrCat("Failed to set kernel arguments - ", - CLErrorCodeToString(error_code), "(at index - ", - offset, ")")); - } - offset++; - } - for (size_t i = 0; i < shared_float4s_data_.size() / 4; ++i) - { - const int error_code = - clSetKernelArg(kernel, offset, sizeof(int32_t) * 4, &shared_float4s_data_[i * 4]); - if (error_code != CL_SUCCESS) - { - return absl::UnknownError(absl::StrCat("Failed to set kernel arguments - ", - CLErrorCodeToString(error_code), "(at index - ", - offset, ")")); - } - offset++; - } - return absl::OkStatus(); -} - -std::string Arguments::AddActiveArgument(const std::string &arg_name, bool) -{ - { - auto it = int_values_.find(arg_name); - if (it != int_values_.end()) - { - int int_index; - if (it->second.active) - { - int_index = it->second.offset; - } - else - { - it->second.active = true; - it->second.offset = shared_int4s_data_.size(); - int_index = it->second.offset; - shared_int4s_data_.push_back(it->second.value); - } - std::string index = std::to_string(int_index / 4); - std::string postfixes[4] = {"x", "y", "z", "w"}; - return "shared_int4_" + index + "." + postfixes[int_index % 4]; - } - } - { - auto it = float_values_.find(arg_name); - if (it != float_values_.end()) - { - int float_index; - if (it->second.active) - { - float_index = it->second.offset; - } - else - { - it->second.active = true; - it->second.offset = shared_float4s_data_.size(); - float_index = it->second.offset; - shared_float4s_data_.push_back(it->second.value); - } - std::string index = std::to_string(float_index / 4); - std::string postfixes[4] = {"x", "y", "z", "w"}; - return "shared_float4_" + index + "." 
+ postfixes[float_index % 4]; - } - } - return arg_name; -} - -void Arguments::ResolveArgsPass(const DeviceInfo &device_info, std::string *code) -{ - bool use_f32_for_half_arguments = device_info.IsPowerVR(); - size_t position = 0; - size_t next_position = code->find(kArgsPrefix); - while (next_position != std::string::npos) - { - size_t arg_pos = next_position; - next_position += strlen(kArgsPrefix); - std::string object_name = GetNextWord(*code, next_position); - std::string new_name = AddActiveArgument(object_name, use_f32_for_half_arguments); - code->replace(arg_pos, object_name.size() + strlen(kArgsPrefix), new_name); - position = arg_pos + new_name.size(); - next_position = code->find(kArgsPrefix, position); - } - - int shared_int4s_aligned_size = AlignByN(shared_int4s_data_.size(), 4); - shared_int4s_data_.resize(shared_int4s_aligned_size); - int shared_float4s_aligned_size = AlignByN(shared_float4s_data_.size(), 4); - shared_float4s_data_.resize(shared_float4s_aligned_size); -} - -void Arguments::ResolveObjectNames(const std::string &object_name, - const std::vector<std::string> &member_names, std::string *code) -{ - for (const auto &member_name : member_names) - { - const std::string new_name = kArgsPrefix + object_name + "_" + member_name; - ReplaceAllWords(member_name, new_name, code); - } -} - -GPUObjectDescriptor *Arguments::GetObjectDescriptor(const std::string &object_name) const -{ - { - auto it = object_refs_.find(object_name); - if (it != object_refs_.end()) - { - return it->second.descriptor.get(); - } - } - { - auto it = objects_.find(object_name); - if (it != objects_.end()) - { - return it->second.descriptor.get(); - } - } - return nullptr; -} - -absl::Status Arguments::ResolveSelector(const std::map<std::string, std::string> &linkables, - const std::string &object_name, const std::string &selector, - const std::vector<std::string> &args, - const std::vector<std::string> &template_args, - std::string *result) -{ - const GPUObjectDescriptor *desc_ptr = GetObjectDescriptor(object_name); - if (!desc_ptr) - { - return absl::NotFoundError(absl::StrCat("No object with name - ", object_name)); - } - auto names = desc_ptr->GetGPUResources().GetNames(); - const auto *tensor_desc = dynamic_cast<const TensorDescriptor *>(desc_ptr); - if (tensor_desc && selector == "Write") - { - auto it = linkables.find(object_name); - if (it != linkables.end()) - { - if (desc_ptr->GetAccess() != AccessType::WRITE && - desc_ptr->GetAccess() != AccessType::READ_WRITE) - { - return absl::FailedPreconditionError( - absl::StrCat("Object with name - ", object_name, " should have Write access.")); - } - std::string value_name, x_coord, y_coord, s_coord; - RETURN_IF_ERROR(tensor_desc->GetLinkingContextFromWriteSelector(args, &value_name, &x_coord, - &y_coord, &s_coord)); - // x_coord can have batch size property of link_object - ResolveObjectNames(object_name, names, &x_coord); - *result = it->second; - ReplaceAllWords("in_out_value", value_name, result); - ReplaceAllWords("X_COORD", x_coord, result); - ReplaceAllWords("Y_COORD", y_coord, result); - ReplaceAllWords("S_COORD", s_coord, result); - RETURN_IF_ERROR(ResolveSelectorsPass({}, result)); - } - } - std::string patch; - RETURN_IF_ERROR(desc_ptr->PerformSelector(selector, args, template_args, &patch)); - ResolveObjectNames(object_name, names, &patch); - *result += patch; - return absl::OkStatus(); -} - -absl::Status Arguments::ResolveSelectorsPass(const std::map<std::string, std::string> &linkables, - std::string *code) -{ - std::string result; - 
size_t position = 0; - size_t next_position = code->find(kArgsPrefix); - while (next_position != std::string::npos) - { - size_t arg_pos = next_position; - next_position += strlen(kArgsPrefix); - std::string object_name = GetNextWord(*code, next_position); - char next = (*code)[next_position + object_name.size()]; - if (next == '.') - { - next_position += object_name.size() + 1; - std::string selector_name = GetNextWord(*code, next_position); - next_position += selector_name.size(); - next = (*code)[next_position]; - std::vector<std::string> template_args; - if (next == '<') - { - size_t close_bracket_pos; - RETURN_IF_ERROR( - ParseArgsInsideBrackets(*code, next_position, &close_bracket_pos, &template_args)); - next_position = close_bracket_pos; - next = (*code)[next_position]; - } - if (next != '(') - { - return absl::NotFoundError( - absl::StrCat("Expected ( after ", object_name, ".", selector_name, " call")); - } - std::vector<std::string> args; - size_t close_bracket_pos; - RETURN_IF_ERROR(ParseArgsInsideBrackets(*code, next_position, &close_bracket_pos, &args)); - for (auto &arg : args) - { - RETURN_IF_ERROR(ResolveSelectorsPass({}, &arg)); - } - std::string patch; - RETURN_IF_ERROR( - ResolveSelector(linkables, object_name, selector_name, args, template_args, &patch)); - code->replace(arg_pos, close_bracket_pos - arg_pos, patch); - position = arg_pos + patch.size(); - } - else - { - position = arg_pos + strlen(kArgsPrefix); - } - next_position = code->find(kArgsPrefix, position); - } - return absl::OkStatus(); -} - -absl::Status Arguments::AllocateObjects(CLContext *context) -{ - for (auto &t : objects_) - { - RETURN_IF_ERROR(t.second.descriptor->CreateGPUObject(context, &t.second.obj_ptr)); - } - return absl::OkStatus(); -} - -void Arguments::ReleaseCPURepresentation() -{ - for (auto &t : objects_) - { - t.second.descriptor->Release(); - } -} - -absl::Status Arguments::AddObjectArgs() -{ - for (auto &t : objects_) - { - AddGPUResources(t.first, t.second.descriptor->GetGPUResources()); - GPUResourcesWithValue resources; - RETURN_IF_ERROR(t.second.obj_ptr->GetGPUResources(t.second.descriptor.get(), &resources)); - RETURN_IF_ERROR(SetGPUResources(t.first, resources)); - } - for (auto &t : object_refs_) - { - AddGPUResources(t.first, t.second.descriptor->GetGPUResources()); - } - return absl::OkStatus(); -} - -} // namespace gpu_cl -} // namespace backend -} // namespace onert diff --git a/runtime/onert/backend/gpu_cl/open_cl/Arguments.h b/runtime/onert/backend/gpu_cl/open_cl/Arguments.h deleted file mode 100644 index 0c6ce1edf..000000000 --- a/runtime/onert/backend/gpu_cl/open_cl/Arguments.h +++ /dev/null @@ -1,175 +0,0 @@ -/* - * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright 2020 The TensorFlow Authors. All Rights Reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifndef __ONERT_BACKEND_GPU_CL_OPENCL_ARGUMENTS_H__ -#define __ONERT_BACKEND_GPU_CL_OPENCL_ARGUMENTS_H__ - -#include <map> -#include <string> -#include <vector> - -#include "ClDevice.h" -#include "GpuObject.h" -#include "OpenclWrapper.h" - -#include "AccessType.h" -#include "Types.h" -#include "Util.h" -#include "Status.h" - -namespace onert -{ -namespace backend -{ -namespace gpu_cl -{ - -class ArgumentsBinder -{ -public: - virtual absl::Status SetInt(const std::string &name, int value) = 0; - virtual absl::Status SetFloat(const std::string &name, float value) = 0; - virtual ~ArgumentsBinder() = default; -}; - -class Arguments : public ArgumentsBinder -{ -public: - Arguments() = default; - void AddFloat(const std::string &name, float value = 0.0f); - void AddInt(const std::string &name, int value = 0); - void AddObjectRef(const std::string &name, AccessType access_type, - GPUObjectDescriptorPtr &&descriptor_ptr); - void AddObject(const std::string &name, GPUObjectDescriptorPtr &&descriptor_ptr); - - absl::Status SetInt(const std::string &name, int value) override; - absl::Status SetFloat(const std::string &name, float value) override; - absl::Status SetObjectRef(const std::string &name, const GPUObject *object); - - absl::Status Bind(cl_kernel kernel, int offset = 0); - - void RenameArgs(const std::string &postfix, std::string *code) const; - absl::Status Merge(Arguments &&args, const std::string &postfix); - - absl::Status AllocateObjects(CLContext *context); - void ReleaseCPURepresentation(); - absl::Status TransformToCLCode(const DeviceInfo &device_info, - const std::map<std::string, std::string> &linkables, - std::string *code); - - // Move only - Arguments(Arguments &&args); - Arguments &operator=(Arguments &&args); - Arguments(const Arguments &) = delete; - Arguments &operator=(const Arguments &) = delete; - - ~Arguments() override = default; - -private: - void AddBuffer(const std::string &name, const GPUBufferDescriptor &desc); - void AddImage2D(const std::string &name, const GPUImage2DDescriptor &desc); - void AddImage2DArray(const std::string &name, const GPUImage2DArrayDescriptor &desc); - void AddImage3D(const std::string &name, const GPUImage3DDescriptor &desc); - void AddImageBuffer(const std::string &name, const GPUImageBufferDescriptor &desc); - void AddCustomMemory(const std::string &name, const GPUCustomMemoryDescriptor &desc); - - absl::Status SetImage2D(const std::string &name, cl_mem memory); - absl::Status SetBuffer(const std::string &name, cl_mem memory); - absl::Status SetImage2DArray(const std::string &name, cl_mem memory); - absl::Status SetImage3D(const std::string &name, cl_mem memory); - absl::Status SetImageBuffer(const std::string &name, cl_mem memory); - absl::Status SetCustomMemory(const std::string &name, cl_mem memory); - - std::string GetListOfArgs(); - - std::string AddActiveArgument(const std::string &arg_name, bool use_f32_for_halfs); - void AddGPUResources(const std::string &name, const GPUResources &resources); - - absl::Status SetGPUResources(const std::string &name, const GPUResourcesWithValue &resources); - - absl::Status AddObjectArgs(); - - void ResolveArgsPass(const DeviceInfo &device_info, std::string *code); - absl::Status ResolveSelectorsPass(const std::map<std::string, std::string> &linkables, - std::string *code); - - absl::Status ResolveSelector(const std::map<std::string, std::string> &linkables, - const std::string &object_name, const std::string &selector, - const std::vector<std::string> &args, - const 
std::vector<std::string> &template_args, std::string *result); - - void ResolveObjectNames(const std::string &object_name, - const std::vector<std::string> &member_names, std::string *code); - - GPUObjectDescriptor *GetObjectDescriptor(const std::string &object_name) const; - - static constexpr char kArgsPrefix[] = "args."; - - struct IntValue - { - int value; - - // Many uniforms are generated automatically and never used; to reduce - // the amount of data transferred, we add this optimization. - bool active = false; - - // Offset into the shared uniform storage. - uint32_t offset = -1; - }; - std::map<std::string, IntValue> int_values_; - std::vector<int32_t> shared_int4s_data_; - - struct FloatValue - { - float value; - - // Many uniforms are generated automatically and never used; to reduce - // the amount of data transferred, we add this optimization. - bool active = false; - - // Offset into the shared uniform storage. - uint32_t offset = -1; - }; - std::map<std::string, FloatValue> float_values_; - std::vector<float> shared_float4s_data_; - - std::map<std::string, GPUBufferDescriptor> buffers_; - std::map<std::string, GPUImage2DDescriptor> images2d_; - std::map<std::string, GPUImage2DArrayDescriptor> image2d_arrays_; - std::map<std::string, GPUImage3DDescriptor> images3d_; - std::map<std::string, GPUImageBufferDescriptor> image_buffers_; - std::map<std::string, GPUCustomMemoryDescriptor> custom_memories_; - - struct ObjectRefArg - { - GPUObjectDescriptorPtr descriptor; - }; - std::map<std::string, ObjectRefArg> object_refs_; - - struct ObjectArg - { - GPUObjectPtr obj_ptr; - GPUObjectDescriptorPtr descriptor; - }; - std::map<std::string, ObjectArg> objects_; -}; - -} // namespace gpu_cl -} // namespace backend -} // namespace onert - -#endif // __ONERT_BACKEND_GPU_CL_OPENCL_ARGUMENTS_H__
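[Editor's sketch, not part of the patch] How a scalar flows through the packed-uniform scheme backed by shared_float4s_data_ above; the kernel fragment and the name "alpha" are made up:

    Arguments args;
    args.AddFloat("alpha", 1.5f);               // stored inactive until referenced
    std::string code = "value *= args.alpha;";  // hypothetical kernel source
    // During TransformToCLCode, ResolveArgsPass rewrites "args.alpha" into a
    // packed access such as "shared_float4_0.x", GetListOfArgs appends
    // "float4 shared_float4_0" to the kernel argument list, and Bind() uploads
    // the packed value with clSetKernelArg.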
diff --git a/runtime/onert/backend/gpu_cl/open_cl/Buffer.cc b/runtime/onert/backend/gpu_cl/open_cl/Buffer.cc deleted file mode 100644 index 64c071921..000000000 --- a/runtime/onert/backend/gpu_cl/open_cl/Buffer.cc +++ /dev/null @@ -1,234 +0,0 @@ -/* - * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright 2019 The TensorFlow Authors. All Rights Reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "Buffer.h" - -#include <string> - -#include "ClContext.h" -#include "DataType.h" -#include "GpuObject.h" -#include "Util.h" -#include "Status.h" - -namespace onert -{ -namespace backend -{ -namespace gpu_cl -{ -namespace -{ - -absl::Status CreateBuffer(size_t size_in_bytes, bool gpu_read_only, const void *data, - CLContext *context, Buffer *result) -{ - cl_mem buffer; - RETURN_IF_ERROR(CreateCLBuffer(context->context(), size_in_bytes, gpu_read_only, - const_cast<void *>(data), &buffer)); - *result = Buffer(buffer, size_in_bytes); - - return absl::OkStatus(); -} - -} // namespace - -BufferDescriptor::BufferDescriptor(BufferDescriptor &&desc) - : GPUObjectDescriptor(std::move(desc)), element_type(desc.element_type), - element_size(desc.element_size), memory_type(desc.memory_type), - attributes(std::move(desc.attributes)), size(desc.size), data(std::move(desc.data)) -{ -} - -BufferDescriptor &BufferDescriptor::operator=(BufferDescriptor &&desc) -{ - if (this != &desc) - { - std::swap(element_type, desc.element_type); - std::swap(element_size, desc.element_size); - std::swap(memory_type, desc.memory_type); - attributes = std::move(desc.attributes); - std::swap(size, desc.size); - data = std::move(desc.data); - GPUObjectDescriptor::operator=(std::move(desc)); - } - return *this; -} - -void BufferDescriptor::Release() { data.clear(); } - -GPUResources BufferDescriptor::GetGPUResources() const -{ - GPUResources resources; - GPUBufferDescriptor desc; - desc.data_type = element_type; - desc.access_type = access_type_; - desc.element_size = element_size; - desc.memory_type = memory_type; - desc.attributes = attributes; - resources.buffers.push_back({"buffer", desc}); - return resources; -} - -absl::Status BufferDescriptor::PerformSelector(const std::string &selector, - const std::vector<std::string> &args, - const std::vector<std::string> &template_args, - std::string *result) const -{ - if (selector == "Read") - { - return PerformReadSelector(args, result); - } - else if (selector == "GetPtr") - { - return PerformGetPtrSelector(args, template_args, result); - } - else - { - return absl::NotFoundError( - absl::StrCat("BufferDescriptor doesn't have a selector with name - ", selector)); - } -} - -absl::Status BufferDescriptor::PerformReadSelector(const std::vector<std::string> &args, - std::string *result) const -{ - if (args.size() != 1) - { - return absl::NotFoundError( - absl::StrCat("BufferDescriptor Read requires one argument, but ", args.size(), " was passed")); - } - *result = absl::StrCat("buffer[", args[0], "]"); - return absl::OkStatus(); -} - -absl::Status BufferDescriptor::PerformGetPtrSelector(const std::vector<std::string> &args, - const std::vector<std::string> &template_args, - std::string *result) const -{ - if (args.size() > 1) - { - return absl::NotFoundError(absl::StrCat( - "BufferDescriptor GetPtr requires one or zero arguments, but ", args.size(), " was passed")); - } - if (template_args.size() > 1) - { - return absl::NotFoundError(absl::StrCat("BufferDescriptor GetPtr requires one or zero template " - "arguments, but ", - template_args.size(), " was passed")); - } - std::string conversion; - if (template_args.size() == 1) - { - const std::string type_name = ToCLDataType(element_type, element_size); - if (type_name != template_args[0]) - { - conversion = absl::StrCat("(", MemoryTypeToCLType(memory_type), " ", template_args[0], "*)&"); - } - } - if (args.empty()) - { - *result = absl::StrCat(conversion, "buffer"); - } - else if (conversion.empty()) - { - *result =
absl::StrCat("(buffer + ", args[0], ")"); - } - else - { - *result = absl::StrCat(conversion, "buffer[", args[0], "]"); - } - return absl::OkStatus(); -} - -absl::Status BufferDescriptor::CreateGPUObject(CLContext *context, GPUObjectPtr *result) const -{ - Buffer gpu_buffer; - RETURN_IF_ERROR(gpu_buffer.CreateFromBufferDescriptor(*this, context)); - *result = absl::make_unique<Buffer>(std::move(gpu_buffer)); - return absl::OkStatus(); -} - -Buffer::Buffer(cl_mem buffer, size_t size_in_bytes) : buffer_(buffer), size_(size_in_bytes) {} - -Buffer::Buffer(Buffer &&buffer) : buffer_(buffer.buffer_), size_(buffer.size_) -{ - buffer.buffer_ = nullptr; - buffer.size_ = 0; -} - -Buffer &Buffer::operator=(Buffer &&buffer) -{ - if (this != &buffer) - { - Release(); - std::swap(size_, buffer.size_); - std::swap(buffer_, buffer.buffer_); - } - return *this; -} - -void Buffer::Release() -{ - if (buffer_) - { - clReleaseMemObject(buffer_); - buffer_ = nullptr; - size_ = 0; - } -} - -absl::Status Buffer::GetGPUResources(const GPUObjectDescriptor *obj_ptr, - GPUResourcesWithValue *resources) const -{ - const auto *buffer_desc = dynamic_cast<const BufferDescriptor *>(obj_ptr); - if (!buffer_desc) - { - return absl::InvalidArgumentError("Expected BufferDescriptor on input."); - } - - resources->buffers.push_back({"buffer", buffer_}); - return absl::OkStatus(); -} - -absl::Status Buffer::CreateFromBufferDescriptor(const BufferDescriptor &desc, CLContext *context) -{ - bool read_only = desc.memory_type == MemoryType::CONSTANT; - uint8_t *data_ptr = desc.data.empty() ? nullptr : const_cast<unsigned char *>(desc.data.data()); - size_ = desc.size; - return CreateCLBuffer(context->context(), desc.size, read_only, data_ptr, &buffer_); -} - -absl::Status CreateReadOnlyBuffer(size_t size_in_bytes, CLContext *context, Buffer *result) -{ - return CreateBuffer(size_in_bytes, true, nullptr, context, result); -} - -absl::Status CreateReadOnlyBuffer(size_t size_in_bytes, const void *data, CLContext *context, - Buffer *result) -{ - return CreateBuffer(size_in_bytes, true, data, context, result); -} - -absl::Status CreateReadWriteBuffer(size_t size_in_bytes, CLContext *context, Buffer *result) -{ - return CreateBuffer(size_in_bytes, false, nullptr, context, result); -} - -} // namespace gpu_cl -} // namespace backend -} // namespace onert diff --git a/runtime/onert/backend/gpu_cl/open_cl/Buffer.h b/runtime/onert/backend/gpu_cl/open_cl/Buffer.h deleted file mode 100644 index 39e97be6d..000000000 --- a/runtime/onert/backend/gpu_cl/open_cl/Buffer.h +++ /dev/null @@ -1,121 +0,0 @@ -/* - * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright 2019 The TensorFlow Authors. All Rights Reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */
-
-#ifndef __ONERT_BACKEND_GPU_CL_OPENCL_BUFFER_H__
-#define __ONERT_BACKEND_GPU_CL_OPENCL_BUFFER_H__
-
-#include "absl/strings/str_cat.h"
-#include "absl/types/span.h"
-
-#include "ClCommandQueue.h"
-#include "ClContext.h"
-#include "GpuObject.h"
-#include "OpenclWrapper.h"
-#include "DataType.h"
-#include "Util.h"
-#include "Status.h"
-
-namespace onert
-{
-namespace backend
-{
-namespace gpu_cl
-{
-
-struct BufferDescriptor : public GPUObjectDescriptor
-{
-  DataType element_type;
-  int element_size;
-  MemoryType memory_type = MemoryType::GLOBAL;
-  std::vector<std::string> attributes;
-
-  // optional
-  int size = 0;
-  std::vector<uint8_t> data;
-
-  BufferDescriptor() = default;
-  BufferDescriptor(const BufferDescriptor &) = default;
-  BufferDescriptor &operator=(const BufferDescriptor &) = default;
-  BufferDescriptor(BufferDescriptor &&desc);
-  BufferDescriptor &operator=(BufferDescriptor &&desc);
-
-  absl::Status PerformSelector(const std::string &selector, const std::vector<std::string> &args,
-                               const std::vector<std::string> &template_args,
-                               std::string *result) const override;
-
-  GPUResources GetGPUResources() const override;
-  absl::Status PerformReadSelector(const std::vector<std::string> &args, std::string *result) const;
-  absl::Status PerformGetPtrSelector(const std::vector<std::string> &args,
-                                     const std::vector<std::string> &template_args,
-                                     std::string *result) const;
-
-  absl::Status CreateGPUObject(CLContext *context, GPUObjectPtr *result) const override;
-  void Release() override;
-};
-
-// Buffer represents linear GPU data storage with an arbitrary data format.
-// Buffer is movable but not copyable.
-class Buffer : public GPUObject
-{
-public:
-  Buffer() {} // just for using Buffer as a class member
-  Buffer(cl_mem buffer, size_t size_in_bytes);
-
-  // Move only
-  Buffer(Buffer &&buffer);
-  Buffer &operator=(Buffer &&buffer);
-  Buffer(const Buffer &) = delete;
-  Buffer &operator=(const Buffer &) = delete;
-
-  virtual ~Buffer() { Release(); }
-
-  // for profiling and memory statistics
-  uint64_t GetMemorySizeInBytes() const { return size_; }
-
-  cl_mem GetMemoryPtr() const { return buffer_; }
-
-  // Writes data to the buffer. Data should point to a region whose size in bytes
-  // exactly matches size_in_bytes (the constructor parameter).
-  template <typename T> absl::Status WriteData(CLCommandQueue *queue, const std::vector<T> *data);
-
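// A minimal usage sketch for the Buffer API above, assuming an already-created
// CLContext `ctx` and a CLCommandQueue `queue` (both names are illustrative,
// not part of the original sources); the queue helpers used here are blocking:
//
//   std::vector<float> host(1024, 1.0f);
//   Buffer buf;
//   RETURN_IF_ERROR(CreateReadWriteBuffer(host.size() * sizeof(float), &ctx, &buf));
//   RETURN_IF_ERROR(buf.WriteData(&queue, &host));     // blocking upload
//   std::vector<float> readback(1024);
//   RETURN_IF_ERROR(buf.ReadData(&queue, &readback));  // blocking download
-  // Reads data from Buffer into CPU memory.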
- template <typename T> absl::Status ReadData(CLCommandQueue *queue, std::vector<T> *result) const; - - absl::Status GetGPUResources(const GPUObjectDescriptor *obj_ptr, - GPUResourcesWithValue *resources) const override; - - absl::Status CreateFromBufferDescriptor(const BufferDescriptor &desc, CLContext *context); - -private: - void Release(); - - cl_mem buffer_ = nullptr; - size_t size_ = 0; -}; - -absl::Status CreateReadOnlyBuffer(size_t size_in_bytes, CLContext *context, Buffer *result); - -absl::Status CreateReadOnlyBuffer(size_t size_in_bytes, const void *data, CLContext *context, - Buffer *result); - -absl::Status CreateReadWriteBuffer(size_t size_in_bytes, CLContext *context, Buffer *result); - -} // namespace gpu_cl -} // namespace backend -} // namespace onert - -#endif // __ONERT_BACKEND_GPU_CL_OPENCL_BUFFER_H__ diff --git a/runtime/onert/backend/gpu_cl/open_cl/ClCommandQueue.cc b/runtime/onert/backend/gpu_cl/open_cl/ClCommandQueue.cc deleted file mode 100644 index d147b7b13..000000000 --- a/runtime/onert/backend/gpu_cl/open_cl/ClCommandQueue.cc +++ /dev/null @@ -1,359 +0,0 @@ -/* - * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright 2019 The TensorFlow Authors. All Rights Reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "ClCommandQueue.h" - -#include <algorithm> -#include <map> -#include <string> -#include <vector> -#include <limits> - -#include "absl/strings/str_cat.h" -#include "ClDevice.h" -#include "ClEvent.h" -#include "Util.h" -#include "Types.h" -#include "Status.h" - -namespace onert -{ -namespace backend -{ -namespace gpu_cl -{ - -using namespace std; - -CLCommandQueue::CLCommandQueue(cl_command_queue queue, bool has_ownership) - : queue_(queue), has_ownership_(has_ownership) -{ -} - -CLCommandQueue::CLCommandQueue(CLCommandQueue &&queue) - : queue_(queue.queue_), has_ownership_(queue.has_ownership_) -{ - queue.queue_ = nullptr; -} - -CLCommandQueue &CLCommandQueue::operator=(CLCommandQueue &&queue) -{ - if (this != &queue) - { - Release(); - std::swap(queue_, queue.queue_); - has_ownership_ = queue.has_ownership_; - } - return *this; -} - -CLCommandQueue::~CLCommandQueue() { Release(); } - -void CLCommandQueue::Release() -{ - if (has_ownership_ && queue_) - { - clReleaseCommandQueue(queue_); - queue_ = nullptr; - } -} - -absl::Status CLCommandQueue::Dispatch(const CLKernel &kernel, const int3 &work_groups_count, - const int3 &work_group_size, CLEvent *event) -{ - std::vector<size_t> local(3); - std::vector<size_t> global(3); - for (int i = 0; i < 3; ++i) - { - local[i] = work_group_size[i]; - global[i] = work_groups_count[i] * work_group_size[i]; - } - cl_event resulting_event; - const int error_code = - clEnqueueNDRangeKernel(queue_, kernel.kernel(), 3, nullptr, global.data(), local.data(), 0, - nullptr, event ? 
&resulting_event : nullptr); - if (event) - { - *event = CLEvent(resulting_event); - } - if (error_code != CL_SUCCESS) - { - return absl::UnknownError( - absl::StrCat("Failed to clEnqueueNDRangeKernel - ", CLErrorCodeToString(error_code))); - } - return absl::OkStatus(); -} - -absl::Status CLCommandQueue::Dispatch(const CLKernel &kernel, const int3 &work_groups_count, - const int3 &work_group_size) -{ - return Dispatch(kernel, work_groups_count, work_group_size, nullptr); -} - -absl::Status CLCommandQueue::EnqueueEvent(CLEvent *event) -{ - cl_event resulting_event; - const int error_code = clEnqueueMarker(queue_, &resulting_event); - *event = CLEvent(resulting_event); - if (error_code != CL_SUCCESS) - { - return absl::UnknownError( - absl::StrCat("Failed to clEnqueueMarker - ", CLErrorCodeToString(error_code))); - } - return absl::OkStatus(); -} - -absl::Status CLCommandQueue::EnqueueWriteImage(cl_mem memory, int3 region, const void *data) -{ - const size_t origin[] = {0, 0, 0}; - const size_t r[] = {static_cast<size_t>(region.x), static_cast<size_t>(region.y), - static_cast<size_t>(region.z)}; - auto error_code = - clEnqueueWriteImage(queue_, memory, CL_TRUE, origin, r, 0, 0, data, 0, nullptr, nullptr); - if (error_code != CL_SUCCESS) - { - return absl::UnknownError(absl::StrCat("Failed to upload data to GPU (clEnqueueWriteImage) - ", - CLErrorCodeToString(error_code))); - } - - return absl::OkStatus(); -} - -absl::Status CLCommandQueue::EnqueueReadImage(cl_mem memory, int3 region, void *data) -{ - const size_t origin[] = {0, 0, 0}; - const size_t r[] = {static_cast<size_t>(region.x), static_cast<size_t>(region.y), - static_cast<size_t>(region.z)}; - auto error_code = - clEnqueueReadImage(queue_, memory, CL_TRUE, origin, r, 0, 0, data, 0, nullptr, nullptr); - if (error_code != CL_SUCCESS) - { - return absl::UnknownError(absl::StrCat("Failed to read data from GPU (clEnqueueReadImage) - ", - CLErrorCodeToString(error_code))); - } - - return absl::OkStatus(); -} - -absl::Status CLCommandQueue::EnqueueWriteBuffer(cl_mem memory, size_t size_in_bytes, - const void *data) -{ - auto error_code = - clEnqueueWriteBuffer(queue_, memory, CL_TRUE, 0, size_in_bytes, data, 0, nullptr, nullptr); - if (error_code != CL_SUCCESS) - { - return absl::UnknownError(absl::StrCat("Failed to upload data to GPU (clEnqueueWriteBuffer) - ", - CLErrorCodeToString(error_code))); - } - return absl::OkStatus(); -} - -absl::Status CLCommandQueue::EnqueueReadBuffer(cl_mem memory, size_t size_in_bytes, void *data) -{ - auto error_code = - clEnqueueReadBuffer(queue_, memory, CL_TRUE, 0, size_in_bytes, data, 0, nullptr, nullptr); - if (error_code != CL_SUCCESS) - { - return absl::UnknownError(absl::StrCat("Failed to read data from GPU (clEnqueueReadBuffer) - ", - CLErrorCodeToString(error_code))); - } - return absl::OkStatus(); -} - -absl::Status CLCommandQueue::WaitForCompletion() -{ - auto error_code = clFinish(queue_); - if (error_code != CL_SUCCESS) - { - return absl::UnknownError( - absl::StrCat("Failed to clFinish - ", CLErrorCodeToString(error_code))); - } - return absl::OkStatus(); -} - -ProfilingCommandQueue::ProfilingCommandQueue(cl_command_queue queue) : CLCommandQueue(queue, true) -{ - events_.reserve(128); -} - -ProfilingCommandQueue::ProfilingCommandQueue(ProfilingCommandQueue &&queue) - : CLCommandQueue(std::move(queue)), events_(std::move(queue.events_)), - current_label_(std::move(queue.current_label_)) -{ -} - -ProfilingCommandQueue &ProfilingCommandQueue::operator=(ProfilingCommandQueue &&queue) -{ - if 
(this != &queue)
-  {
-    events_ = std::move(queue.events_);
-    current_label_ = std::move(queue.current_label_);
-    CLCommandQueue::operator=(std::move(queue));
-  }
-  return *this;
-}
-
-void ProfilingCommandQueue::SetEventsLabel(const std::string &name) { current_label_ = name; }
-
-void ProfilingCommandQueue::ResetMeasurements() { events_.clear(); }
-
-absl::Status ProfilingCommandQueue::Dispatch(const CLKernel &kernel, const int3 &work_groups_count,
-                                             const int3 &work_group_size)
-{
-  events_.push_back(CLEvent());
-  RETURN_IF_ERROR(CLCommandQueue::Dispatch(kernel, work_groups_count, work_group_size,
-                                           &events_[events_.size() - 1]));
-  events_.back().SetName(current_label_);
-  return absl::OkStatus();
-}
-
-absl::Status
-ProfilingCommandQueue::GetBestWorkGroupIndex(const CLKernel &kernel, const DeviceInfo &device_info,
-                                             const std::vector<int3> &work_groups_count,
-                                             const std::vector<int3> &work_group_sizes, int *index)
-{
-  // Some Adreno 3xx can report wrong numbers for some events
-  const bool possible_bug_with_events = device_info.IsAdreno3xx();
-  events_.resize(work_group_sizes.size());
-  for (size_t i = 0; i < work_group_sizes.size(); ++i)
-  {
-    RETURN_IF_ERROR(
-      CLCommandQueue::Dispatch(kernel, work_groups_count[i], work_group_sizes[i], &events_[i]));
-
-    // Slows down the memory leak on Mali for some kernels
-    if (device_info.IsMali() && i % 8 == 7)
-    {
-      events_[i - 7].Wait();
-    }
-    if (possible_bug_with_events)
-    {
-      // Try to increase the probability of a correct result.
-      RETURN_IF_ERROR(WaitForCompletion());
-    }
-  }
-
-  RETURN_IF_ERROR(WaitForCompletion());
-
-  // Release the memory of some kernel pools on Mali.
-  if (device_info.IsMali())
-  {
-    RETURN_IF_ERROR(kernel.ReInit());
-  }
-
-  int minimum_index = 0;
-  double minimum_time = std::numeric_limits<double>::max();
-  if (possible_bug_with_events)
-  { // try to filter out suspicious results
-    double average_time = 0.0;
-    int average_samples_count = 0;
-    for (size_t i = 0; i < work_group_sizes.size(); ++i)
-    {
-      if (events_[i].GetEventTimeMs() < 100 * 1000)
-      { // 100 sec
-        average_time += events_[i].GetEventTimeMs();
-        average_samples_count++;
-      }
-    }
-    if (average_samples_count == 0)
-    {
-      throw std::runtime_error("Average time cannot be computed: no valid events");
-    }
-    else
-    {
-      average_time /= average_samples_count;
-    }
-
-    for (size_t i = 0; i < work_group_sizes.size(); ++i)
-    {
-      double time = events_[i].GetEventTimeMs();
-      if (time < minimum_time && time >= 0.1 * average_time)
-      {
-        minimum_index = i;
-        minimum_time = time;
-      }
-    }
-  }
-  else
-  {
-    for (size_t i = 0; i < work_group_sizes.size(); ++i)
-    {
-      double time = events_[i].GetEventTimeMs();
-      if (time < minimum_time)
-      {
-        minimum_index = i;
-        minimum_time = time;
-      }
-    }
-  }
-
-  *index = minimum_index;
-
-  return absl::OkStatus();
-}
-
-absl::Status CreateCLCommandQueue(const CLDevice &device, const CLContext &context,
-                                  CLCommandQueue *result)
-{
-  int error_code;
-  cl_command_queue queue = clCreateCommandQueue(context.context(), device.id(), 0, &error_code);
-  if (!queue)
-  {
-    return absl::UnknownError(
-      absl::StrCat("Failed to create a command queue - ", CLErrorCodeToString(error_code)));
-  }
-  *result = CLCommandQueue(queue, true);
-  return absl::OkStatus();
-}
-
-double ProfilingCommandQueue::GetQueueExecutionTimeMs() const
-{
-  const uint64_t start = events_.front().GetStartedTimeNs();
-  const uint64_t end = events_.back().GetFinishedTimeNs();
-  const uint64_t time_ns = (end - start);
-
-  return static_cast<double>(time_ns) / 1000000.0;
-}
-
-double
ProfilingCommandQueue::GetSumOfEventsTimeMs() const
-{
-  double sum = 0.0;
-  for (uint32_t i = 0; i < events_.size(); ++i)
-  {
-    sum += events_[i].GetEventTimeMs();
-  }
-  return sum;
-}
-
-absl::Status CreateProfilingCommandQueue(const CLDevice &device, const CLContext &context,
-                                         ProfilingCommandQueue *result)
-{
-  int error_code;
-  cl_command_queue queue =
-    clCreateCommandQueue(context.context(), device.id(), CL_QUEUE_PROFILING_ENABLE, &error_code);
-  if (!queue)
-  {
-    return absl::UnknownError(
-      absl::StrCat("Failed to create a command queue - ", CLErrorCodeToString(error_code)));
-  }
-
-  *result = ProfilingCommandQueue(queue);
-  return absl::OkStatus();
-}
-
-} // namespace gpu_cl
-} // namespace backend
-} // namespace onert
diff --git a/runtime/onert/backend/gpu_cl/open_cl/ClCommandQueue.h b/runtime/onert/backend/gpu_cl/open_cl/ClCommandQueue.h
deleted file mode 100644
index 81f93fd23..000000000
--- a/runtime/onert/backend/gpu_cl/open_cl/ClCommandQueue.h
+++ /dev/null
@@ -1,157 +0,0 @@
-/*
- * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright 2019 The TensorFlow Authors. All Rights Reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __ONERT_BACKEND_GPU_CL_OPENCL_CL_COMMAND_QUEUE_H__
-#define __ONERT_BACKEND_GPU_CL_OPENCL_CL_COMMAND_QUEUE_H__
-
-#include <cstdint>
-#include <string>
-#include <vector>
-
-#include "absl/time/time.h"
-#include "ClContext.h"
-#include "ClDevice.h"
-#include "ClEvent.h"
-#include "ClKernel.h"
-#include "OpenclWrapper.h"
-#include "Types.h"
-#include "Status.h"
-
-namespace onert
-{
-namespace backend
-{
-namespace gpu_cl
-{
-
-struct ProfilingInfo
-{
-  struct DispatchInfo
-  {
-    std::string label;
-    absl::Duration duration;
-  };
-
-  std::vector<DispatchInfo> dispatches;
-
-  absl::Duration GetTotalTime() const;
-
-  // Returns a report (a string of lines delimited by \n).
-  // This method uses GPU counters and measures GPU time only.
-  // The report has the following structure:
-  // Per kernel timing (K kernels):
-  // conv2d 3.2ms
-  // ...
-  // --------------------
-  // Accumulated time per operation type:
-  // conv2d - 14.5ms
-  // ....
-  // --------------------
-  // Ideal total time: 23.4ms // Total time for all kernels
-  std::string GetDetailedReport() const;
-};
-
-// A wrapper around an OpenCL command queue
-class CLCommandQueue
-{
-public:
-  CLCommandQueue() {}
-  CLCommandQueue(cl_command_queue queue, bool has_ownership);
-
-  // Move only
-  CLCommandQueue(CLCommandQueue &&queue);
-  CLCommandQueue &operator=(CLCommandQueue &&queue);
-  CLCommandQueue(const CLCommandQueue &) = delete;
-  CLCommandQueue &operator=(const CLCommandQueue &) = delete;
-
-  virtual ~CLCommandQueue();
-
-  cl_command_queue queue() const { return queue_; }
-
-  virtual absl::Status Dispatch(const CLKernel &kernel, const int3 &work_groups_count,
-                                const int3 &work_group_size);
-
-  absl::Status Dispatch(const CLKernel &kernel, const int3 &work_groups_count,
-                        const int3 &work_group_size, CLEvent *event);
-
-  absl::Status EnqueueEvent(CLEvent *event);
-
-  absl::Status EnqueueWriteImage(cl_mem memory, int3 region, const void *data);
-  absl::Status EnqueueReadImage(cl_mem memory, int3 region, void *data);
-
-  absl::Status EnqueueWriteBuffer(cl_mem memory, size_t size_in_bytes, const void *data);
-  absl::Status EnqueueReadBuffer(cl_mem memory, size_t size_in_bytes, void *data);
-
-  absl::Status WaitForCompletion();
-
-protected:
-  void Release();
-
-  cl_command_queue queue_ = nullptr;
-  bool has_ownership_ = false;
-};
-
-class ProfilingCommandQueue : public CLCommandQueue
-{
-public:
-  ProfilingCommandQueue() {}
-  explicit ProfilingCommandQueue(cl_command_queue queue);
-
-  // Move only
-  ProfilingCommandQueue(ProfilingCommandQueue &&queue);
-  ProfilingCommandQueue &operator=(ProfilingCommandQueue &&queue);
-  ProfilingCommandQueue(const ProfilingCommandQueue &) = delete;
-  ProfilingCommandQueue &operator=(const ProfilingCommandQueue &) = delete;
-
-  absl::Status Dispatch(const CLKernel &kernel, const int3 &work_groups_count,
-                        const int3 &work_group_size) override;
-
-  // Writes the index of the fastest work group among work_group_sizes
-  absl::Status GetBestWorkGroupIndex(const CLKernel &kernel, const DeviceInfo &device_info,
-                                     const std::vector<int3> &work_groups_count,
-                                     const std::vector<int3> &work_group_sizes, int *index);
-
-  // Call ResetMeasurements() to start a new series of measurements
-  void ResetMeasurements();
-
-  double GetQueueExecutionTimeMs() const;
-
-  // Unlike GetQueueExecutionTimeMs, this number does not include the time spent
-  // between kernels (kernel launches or preparation) on the GPU. It is usually
-  // 5-10% lower than GetQueueExecutionTimeMs, because that 5-10% is spent on
-  // something else (such as kernel launches or preparation).
-  double GetSumOfEventsTimeMs() const;
-
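// A typical profiling flow with this class, sketched under the assumption that
// `device`, `context`, `kernel`, and the dispatch sizes are already set up
// (names are illustrative):
//
//   ProfilingCommandQueue queue;
//   RETURN_IF_ERROR(CreateProfilingCommandQueue(device, context, &queue));
//   queue.SetEventsLabel("conv2d");
//   RETURN_IF_ERROR(queue.Dispatch(kernel, work_groups_count, work_group_size));
//   RETURN_IF_ERROR(queue.WaitForCompletion());
//   double total_ms = queue.GetQueueExecutionTimeMs(); // includes gaps between kernels
//   double kernel_ms = queue.GetSumOfEventsTimeMs();   // kernel execution time only
-  // This label will be used for all subsequent dispatches.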
- void SetEventsLabel(const std::string &name); - -private: - std::vector<CLEvent> events_; - std::string current_label_; -}; - -absl::Status CreateCLCommandQueue(const CLDevice &device, const CLContext &context, - CLCommandQueue *result); - -absl::Status CreateProfilingCommandQueue(const CLDevice &device, const CLContext &context, - ProfilingCommandQueue *result); - -} // namespace gpu_cl -} // namespace backend -} // namespace onert - -#endif // __ONERT_BACKEND_GPU_CL_OPENCL_CL_COMMAND_QUEUE_H__ diff --git a/runtime/onert/backend/gpu_cl/open_cl/ClContext.cc b/runtime/onert/backend/gpu_cl/open_cl/ClContext.cc deleted file mode 100644 index 3289ff914..000000000 --- a/runtime/onert/backend/gpu_cl/open_cl/ClContext.cc +++ /dev/null @@ -1,177 +0,0 @@ -/* - * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright 2019 The TensorFlow Authors. All Rights Reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "ClContext.h" - -#include "absl/strings/str_cat.h" -#include "ClImageFormat.h" -#include "Util.h" -#include "Status.h" - -namespace onert -{ -namespace backend -{ -namespace gpu_cl -{ -namespace -{ - -std::vector<cl_image_format> GetSupportedImage2DFormats(cl_context context, cl_mem_flags flags) -{ - cl_uint num_image_formats; - cl_int error = clGetSupportedImageFormats(context, flags, CL_MEM_OBJECT_IMAGE2D, 0, nullptr, - &num_image_formats); - if (error != CL_SUCCESS) - { - return {}; - } - - std::vector<cl_image_format> result(num_image_formats); - error = clGetSupportedImageFormats(context, flags, CL_MEM_OBJECT_IMAGE2D, num_image_formats, - &result[0], nullptr); - if (error != CL_SUCCESS) - { - return {}; - } - return result; -} - -bool IsEqualToImageFormat(cl_image_format image_format, DataType data_type, int num_channels) -{ - return image_format.image_channel_data_type == ToImageChannelType(data_type) && - image_format.image_channel_order == ToChannelOrder(num_channels); -} - -void AddSupportedImageFormats(cl_context context, DeviceInfo *info) -{ - auto supported_formats = GetSupportedImage2DFormats(context, CL_MEM_READ_WRITE); - for (auto format : supported_formats) - { - info->supports_r_f16_tex2d = - info->supports_r_f16_tex2d || IsEqualToImageFormat(format, DataType::FLOAT16, 1); - info->supports_rg_f16_tex2d = - info->supports_rg_f16_tex2d || IsEqualToImageFormat(format, DataType::FLOAT16, 2); - info->supports_rgb_f16_tex2d = - info->supports_rgb_f16_tex2d || IsEqualToImageFormat(format, DataType::FLOAT16, 3); - info->supports_rgba_f16_tex2d = - info->supports_rgba_f16_tex2d || IsEqualToImageFormat(format, DataType::FLOAT16, 4); - info->supports_r_f32_tex2d = - info->supports_r_f32_tex2d || IsEqualToImageFormat(format, DataType::FLOAT32, 1); - info->supports_rg_f32_tex2d = - info->supports_rg_f32_tex2d || IsEqualToImageFormat(format, DataType::FLOAT32, 2); - info->supports_rgb_f32_tex2d = - info->supports_rgb_f32_tex2d || IsEqualToImageFormat(format, DataType::FLOAT32, 3); - info->supports_rgba_f32_tex2d = - 
info->supports_rgba_f32_tex2d || IsEqualToImageFormat(format, DataType::FLOAT32, 4); - } -} - -absl::Status CreateCLContext(const CLDevice &device, cl_context_properties *properties, - CLContext *result) -{ - int error_code; - cl_device_id device_id = device.id(); - cl_context context = clCreateContext(properties, 1, &device_id, nullptr, nullptr, &error_code); - if (!context) - { - return absl::UnknownError( - absl::StrCat("Failed to create a compute context - ", CLErrorCodeToString(error_code))); - } - AddSupportedImageFormats(context, &device.info_); - - *result = CLContext(context, true); - return absl::OkStatus(); -} - -} // namespace - -CLContext::CLContext(cl_context context, bool has_ownership) - : context_(context), has_ownership_(has_ownership) -{ -} - -CLContext::CLContext(CLContext &&context) - : context_(context.context_), has_ownership_(context.has_ownership_) -{ - context.context_ = nullptr; -} - -CLContext &CLContext::operator=(CLContext &&context) -{ - if (this != &context) - { - Release(); - std::swap(context_, context.context_); - has_ownership_ = context.has_ownership_; - } - return *this; -} - -CLContext::~CLContext() { Release(); } - -void CLContext::Release() -{ - if (has_ownership_ && context_) - { - clReleaseContext(context_); - context_ = nullptr; - } -} - -bool CLContext::IsFloatTexture2DSupported(int num_channels, DataType data_type, - cl_mem_flags flags) const -{ - auto supported_formats = GetSupportedImage2DFormats(context_, flags); - for (auto format : supported_formats) - { - if (format.image_channel_data_type == ToImageChannelType(data_type) && - format.image_channel_order == ToChannelOrder(num_channels)) - { - return true; - } - } - - return false; -} - -absl::Status CreateCLContext(const CLDevice &device, CLContext *result) -{ - return CreateCLContext(device, nullptr, result); -} - -absl::Status CreateCLGLContext(const CLDevice &device, cl_context_properties egl_context, - cl_context_properties egl_display, CLContext *result) -{ - if (!device.SupportsExtension("cl_khr_gl_sharing")) - { - return absl::UnavailableError("Device doesn't support CL-GL sharing."); - } - cl_context_properties platform = reinterpret_cast<cl_context_properties>(device.platform()); - cl_context_properties props[] = {CL_GL_CONTEXT_KHR, - egl_context, - CL_EGL_DISPLAY_KHR, - egl_display, - CL_CONTEXT_PLATFORM, - platform, - 0}; - return CreateCLContext(device, props, result); -} - -} // namespace gpu_cl -} // namespace backend -} // namespace onert diff --git a/runtime/onert/backend/gpu_cl/open_cl/ClContext.h b/runtime/onert/backend/gpu_cl/open_cl/ClContext.h deleted file mode 100644 index cf1d0d2d2..000000000 --- a/runtime/onert/backend/gpu_cl/open_cl/ClContext.h +++ /dev/null @@ -1,68 +0,0 @@ -/* - * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright 2019 The TensorFlow Authors. All Rights Reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifndef __ONERT_BACKEND_GPU_CL_OPENCL_CL_CONTEXT_H__ -#define __ONERT_BACKEND_GPU_CL_OPENCL_CL_CONTEXT_H__ - -#include "ClDevice.h" -#include "OpenclWrapper.h" -#include "DataType.h" -#include "Status.h" - -namespace onert -{ -namespace backend -{ -namespace gpu_cl -{ - -// A RAII wrapper around opencl context -class CLContext -{ -public: - CLContext() {} - CLContext(cl_context context, bool has_ownership); - - // Move only - CLContext(CLContext &&context); - CLContext &operator=(CLContext &&context); - CLContext(const CLContext &) = delete; - CLContext &operator=(const CLContext &) = delete; - - ~CLContext(); - - cl_context context() const { return context_; } - - bool IsFloatTexture2DSupported(int num_channels, DataType data_type, - cl_mem_flags flags = CL_MEM_READ_WRITE) const; - -private: - void Release(); - - cl_context context_ = nullptr; - bool has_ownership_ = false; -}; - -absl::Status CreateCLContext(const CLDevice &device, CLContext *result); -absl::Status CreateCLGLContext(const CLDevice &device, cl_context_properties egl_context, - cl_context_properties egl_display, CLContext *result); - -} // namespace gpu_cl -} // namespace backend -} // namespace onert - -#endif // __ONERT_BACKEND_GPU_CL_OPENCL_CL_CONTEXT_H__ diff --git a/runtime/onert/backend/gpu_cl/open_cl/ClDevice.cc b/runtime/onert/backend/gpu_cl/open_cl/ClDevice.cc deleted file mode 100644 index 8dede139c..000000000 --- a/runtime/onert/backend/gpu_cl/open_cl/ClDevice.cc +++ /dev/null @@ -1,448 +0,0 @@ -/* - * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright 2019 The TensorFlow Authors. All Rights Reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "ClDevice.h" - -#include <algorithm> -#include <string> -#include <vector> - -#include "Util.h" -#include "Status.h" - -#include "absl/strings/numbers.h" -#include "absl/strings/str_split.h" - -namespace onert -{ -namespace backend -{ -namespace gpu_cl -{ - -template <> std::string GetDeviceInfo<std::string>(cl_device_id id, cl_device_info info) -{ - size_t size; - cl_int error = clGetDeviceInfo(id, info, 0, nullptr, &size); - if (error != CL_SUCCESS) - { - return ""; - } - - std::string result(size - 1, 0); - error = clGetDeviceInfo(id, info, size, &result[0], nullptr); - if (error != CL_SUCCESS) - { - return ""; - } - return result; -} - -namespace -{ -template <typename T> T GetPlatformInfo(cl_platform_id id, cl_platform_info info) -{ - T result; - cl_int error = clGetPlatformInfo(id, info, sizeof(T), &result, nullptr); - if (error != CL_SUCCESS) - { - return -1; - } - return result; -} - -std::string GetPlatformInfo(cl_platform_id id, cl_platform_info info) -{ - size_t size; - cl_int error = clGetPlatformInfo(id, info, 0, nullptr, &size); - if (error != CL_SUCCESS) - { - return ""; - } - - std::string result(size - 1, 0); - error = clGetPlatformInfo(id, info, size, &result[0], nullptr); - if (error != CL_SUCCESS) - { - return ""; - } - return result; -} - -void GetDeviceWorkDimsSizes(cl_device_id id, int3 *result) -{ - int dims_count = GetDeviceInfo<cl_uint>(id, CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS); - if (dims_count < 3) - { - return; - } - std::vector<size_t> limits(dims_count); - cl_int error = clGetDeviceInfo(id, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(size_t) * dims_count, - limits.data(), nullptr); - if (error != CL_SUCCESS) - { - return; - } - // dims_count must be at least 3 according to spec - result->x = limits[0]; - result->y = limits[1]; - result->z = limits[2]; -} - -OpenCLVersion ParseCLVersion(const std::string &version) -{ - const auto first_dot_pos = version.find_first_of('.'); - if (first_dot_pos == std::string::npos) - { - return OpenCLVersion::CL_1_0; - } - const int major = version[first_dot_pos - 1] - '0'; - const int minor = version[first_dot_pos + 1] - '0'; - - if (major == 1) - { - if (minor == 2) - { - return OpenCLVersion::CL_1_2; - } - else if (minor == 1) - { - return OpenCLVersion::CL_1_1; - } - else - { - return OpenCLVersion::CL_1_0; - } - } - else if (major == 2) - { - if (minor == 2) - { - return OpenCLVersion::CL_2_2; - } - else if (minor == 1) - { - return OpenCLVersion::CL_2_1; - } - else - { - return OpenCLVersion::CL_2_0; - } - } - else if (major == 3) - { - return OpenCLVersion::CL_3_0; - } - else - { - return OpenCLVersion::CL_1_0; - } -} - -Vendor ParseVendor(const std::string &device_name, const std::string &vendor_name) -{ - std::string d_name = device_name; - std::string v_name = vendor_name; - std::transform(d_name.begin(), d_name.end(), d_name.begin(), ::tolower); - std::transform(v_name.begin(), v_name.end(), v_name.begin(), ::tolower); - if (d_name.find("qualcomm") != std::string::npos || v_name.find("qualcomm") != std::string::npos) - { - return Vendor::kQualcomm; - } - else if (d_name.find("mali") != std::string::npos || v_name.find("mali") != std::string::npos) - { - return Vendor::kMali; - } - else if (d_name.find("power") != std::string::npos || v_name.find("power") != std::string::npos) - { - return Vendor::kPowerVR; - } - else if (d_name.find("nvidia") != std::string::npos || v_name.find("nvidia") != std::string::npos) - { - return Vendor::kNvidia; - } - else if (d_name.find("advanced micro devices") != 
std::string::npos ||
-           v_name.find("advanced micro devices") != std::string::npos)
-  {
-    return Vendor::kAMD;
-  }
-  else if (d_name.find("intel") != std::string::npos || v_name.find("intel") != std::string::npos)
-  {
-    return Vendor::kIntel;
-  }
-  else
-  {
-    return Vendor::kUnknown;
-  }
-}
-
-// Checks that gpu_version belongs to the range min_version-max_version;
-// min_version is included and max_version is excluded.
-bool IsGPUVersionInRange(int gpu_version, int min_version, int max_version)
-{
-  return gpu_version >= min_version && gpu_version < max_version;
-}
-} // namespace
-
-DeviceInfo DeviceInfoFromDeviceID(cl_device_id id)
-{
-  DeviceInfo info;
-  const auto device_name = GetDeviceInfo<std::string>(id, CL_DEVICE_NAME);
-  const auto vendor_name = GetDeviceInfo<std::string>(id, CL_DEVICE_VENDOR);
-  const auto opencl_c_version = GetDeviceInfo<std::string>(id, CL_DEVICE_OPENCL_C_VERSION);
-  info.vendor = ParseVendor(device_name, vendor_name);
-  if (info.vendor == Vendor::kQualcomm)
-  {
-    info.adreno_info = AdrenoInfo(opencl_c_version);
-  }
-  else if (info.vendor == Vendor::kMali)
-  {
-    info.mali_info = MaliInfo(device_name);
-  }
-  info.cl_version = ParseCLVersion(opencl_c_version);
-  info.extensions = absl::StrSplit(GetDeviceInfo<std::string>(id, CL_DEVICE_EXTENSIONS), ' ');
-
-  info.supports_fp16 = false;
-  info.supports_image3d_writes = false;
-  for (const auto &ext : info.extensions)
-  {
-    if (ext == "cl_khr_fp16")
-    {
-      info.supports_fp16 = true;
-    }
-    if (ext == "cl_khr_3d_image_writes")
-    {
-      info.supports_image3d_writes = true;
-    }
-  }
-
-  cl_device_fp_config f32_config =
-    GetDeviceInfo<cl_device_fp_config>(id, CL_DEVICE_SINGLE_FP_CONFIG);
-  info.supports_fp32_rtn = f32_config & CL_FP_ROUND_TO_NEAREST;
-
-  if (info.supports_fp16)
-  {
-    cl_device_fp_config f16_config;
-    auto status = GetDeviceInfo<cl_device_fp_config>(id, CL_DEVICE_HALF_FP_CONFIG, &f16_config);
-    // AMD supports cl_khr_fp16 but CL_DEVICE_HALF_FP_CONFIG is empty.
-    if (status.ok() && info.vendor != Vendor::kAMD)
-    {
-      info.supports_fp16_rtn = f16_config & CL_FP_ROUND_TO_NEAREST;
-    }
-    else
-    { // happens on PowerVR
-      f16_config = f32_config;
-      info.supports_fp16_rtn = info.supports_fp32_rtn;
-    }
-  }
-  else
-  {
-    info.supports_fp16_rtn = false;
-  }
-
-  if (info.vendor == Vendor::kPowerVR && !info.supports_fp16)
-  {
-    // PowerVR doesn't fully support fp16 and so doesn't list this extension.
-    // But it can use fp16 in MADs and as buffer/texture types, so we use it.
- info.supports_fp16 = true; - info.supports_fp16_rtn = info.supports_fp32_rtn; - } - - if (!info.supports_image3d_writes && - ((info.vendor == Vendor::kQualcomm && - IsGPUVersionInRange(info.adreno_info.gpu_version, 400, 500)) || - info.vendor == Vendor::kNvidia)) - { - // in local tests Adreno 430 can write in image 3d, at least on small sizes, - // but it doesn't have cl_khr_3d_image_writes in list of available - // extensions - // The same for NVidia - info.supports_image3d_writes = true; - } - info.compute_units_count = GetDeviceInfo<cl_uint>(id, CL_DEVICE_MAX_COMPUTE_UNITS); - info.image2d_max_width = GetDeviceInfo<size_t>(id, CL_DEVICE_IMAGE2D_MAX_WIDTH); - info.image2d_max_height = GetDeviceInfo<size_t>(id, CL_DEVICE_IMAGE2D_MAX_HEIGHT); - info.buffer_max_size = GetDeviceInfo<cl_ulong>(id, CL_DEVICE_MAX_MEM_ALLOC_SIZE); - if (info.cl_version >= OpenCLVersion::CL_1_2) - { - info.image_buffer_max_size = GetDeviceInfo<size_t>(id, CL_DEVICE_IMAGE_MAX_BUFFER_SIZE); - info.image_array_max_layers = GetDeviceInfo<size_t>(id, CL_DEVICE_IMAGE_MAX_ARRAY_SIZE); - } - info.image3d_max_width = GetDeviceInfo<size_t>(id, CL_DEVICE_IMAGE3D_MAX_WIDTH); - info.image3d_max_height = GetDeviceInfo<size_t>(id, CL_DEVICE_IMAGE2D_MAX_HEIGHT); - info.image3d_max_depth = GetDeviceInfo<size_t>(id, CL_DEVICE_IMAGE3D_MAX_DEPTH); - int3 max_work_group_sizes; - GetDeviceWorkDimsSizes(id, &max_work_group_sizes); - info.max_work_group_size_x = max_work_group_sizes.x; - info.max_work_group_size_y = max_work_group_sizes.y; - info.max_work_group_size_z = max_work_group_sizes.z; - - if (info.IsIntel()) - { - if (info.SupportsExtension("cl_intel_required_subgroup_size")) - { - size_t sub_groups_count; - cl_int status = clGetDeviceInfo(id, 0x4108 /*CL_DEVICE_SUB_GROUP_SIZES_INTEL*/, 0, nullptr, - &sub_groups_count); - if (status == CL_SUCCESS) - { - std::vector<size_t> sub_group_sizes(sub_groups_count); - status = - clGetDeviceInfo(id, 0x4108 /*CL_DEVICE_SUB_GROUP_SIZES_INTEL*/, - sizeof(size_t) * sub_groups_count, sub_group_sizes.data(), nullptr); - if (status == CL_SUCCESS) - { - for (size_t i = 0; i < sub_groups_count; ++i) - { - info.supported_subgroup_sizes.push_back(sub_group_sizes[i]); - } - } - } - } - } - return info; -} - -CLDevice::CLDevice(cl_device_id id, cl_platform_id platform_id) - : info_(DeviceInfoFromDeviceID(id)), id_(id), platform_id_(platform_id) -{ -} - -CLDevice::CLDevice(const CLDevice &device) - : info_(device.info_), id_(device.id_), platform_id_(device.platform_id_) -{ -} - -CLDevice &CLDevice::operator=(const CLDevice &device) -{ - if (this != &device) - { - info_ = device.info_; - id_ = device.id_; - platform_id_ = device.platform_id_; - } - return *this; -} - -CLDevice::CLDevice(CLDevice &&device) - : info_(std::move(device.info_)), id_(device.id_), platform_id_(device.platform_id_) -{ - device.id_ = nullptr; - device.platform_id_ = nullptr; -} - -CLDevice &CLDevice::operator=(CLDevice &&device) -{ - if (this != &device) - { - id_ = nullptr; - platform_id_ = nullptr; - info_ = std::move(device.info_); - std::swap(id_, device.id_); - std::swap(platform_id_, device.platform_id_); - } - return *this; -} - -bool CLDevice::SupportsFP16() const { return info_.supports_fp16; } - -bool CLDevice::SupportsExtension(const std::string &extension) const -{ - return info_.SupportsExtension(extension); -} - -bool CLDevice::SupportsTextureArray() const { return info_.SupportsTextureArray(); } - -bool CLDevice::SupportsImageBuffer() const { return info_.SupportsImageBuffer(); } - -bool 
CLDevice::SupportsImage3D() const { return info_.SupportsImage3D(); } - -bool CLDevice::SupportsFP32RTN() const { return info_.supports_fp32_rtn; } - -bool CLDevice::SupportsFP16RTN() const { return info_.supports_fp16_rtn; } - -std::string CLDevice::GetPlatformVersion() const -{ - return GetPlatformInfo(platform_id_, CL_PLATFORM_VERSION); -} - -bool CLDevice::IsCL20OrHigher() const { return info_.IsCL20OrHigher(); } - -bool CLDevice::SupportsSubGroupWithSize(int sub_group_size) const -{ - return info_.SupportsSubGroupWithSize(sub_group_size); -} - -bool CLDevice::IsAdreno() const { return info_.IsAdreno(); } - -bool CLDevice::IsAdreno3xx() const { return info_.IsAdreno3xx(); } - -bool CLDevice::IsAdreno4xx() const { return info_.IsAdreno4xx(); } - -bool CLDevice::IsAdreno5xx() const { return info_.IsAdreno5xx(); } - -bool CLDevice::IsAdreno6xx() const { return info_.IsAdreno6xx(); } - -bool CLDevice::IsAdreno6xxOrHigher() const { return info_.IsAdreno6xxOrHigher(); } - -bool CLDevice::IsPowerVR() const { return info_.IsPowerVR(); } - -bool CLDevice::IsNvidia() const { return info_.IsNvidia(); } - -bool CLDevice::IsMali() const { return info_.IsMali(); } - -bool CLDevice::IsAMD() const { return info_.IsAMD(); } - -bool CLDevice::IsIntel() const { return info_.IsIntel(); } - -bool CLDevice::SupportsOneLayerTextureArray() const { return info_.SupportsOneLayerTextureArray(); } - -void CLDevice::DisableOneLayerTextureArray() -{ - info_.adreno_info.support_one_layer_texture_array = false; -} - -absl::Status CreateDefaultGPUDevice(CLDevice *result) -{ - cl_uint num_platforms; - clGetPlatformIDs(0, nullptr, &num_platforms); - if (num_platforms == 0) - { - return absl::UnknownError("No supported OpenCL platform."); - } - std::vector<cl_platform_id> platforms(num_platforms); - clGetPlatformIDs(num_platforms, platforms.data(), nullptr); - - cl_platform_id platform_id = platforms[0]; - cl_uint num_devices; - clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_GPU, 0, nullptr, &num_devices); - if (num_devices == 0) - { - return absl::UnknownError("No GPU on current platform."); - } - - std::vector<cl_device_id> devices(num_devices); - clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_GPU, num_devices, devices.data(), nullptr); - - *result = CLDevice(devices[0], platform_id); - return absl::OkStatus(); -} - -} // namespace gpu_cl -} // namespace backend -} // namespace onert diff --git a/runtime/onert/backend/gpu_cl/open_cl/ClDevice.h b/runtime/onert/backend/gpu_cl/open_cl/ClDevice.h deleted file mode 100644 index 6e740fe97..000000000 --- a/runtime/onert/backend/gpu_cl/open_cl/ClDevice.h +++ /dev/null @@ -1,119 +0,0 @@ -/* - * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright 2019 The TensorFlow Authors. All Rights Reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifndef __ONERT_BACKEND_GPU_CL_OPENCL_CL_DEVICE_H__ -#define __ONERT_BACKEND_GPU_CL_OPENCL_CL_DEVICE_H__ - -#include <string> -#include <vector> - -#include "DeviceInfo.h" -#include "OpenclWrapper.h" -#include "Util.h" -#include "Types.h" -#include "Status.h" - -namespace onert -{ -namespace backend -{ -namespace gpu_cl -{ - -// A wrapper around opencl device id -class CLDevice -{ -public: - CLDevice() = default; - CLDevice(cl_device_id id, cl_platform_id platform_id); - - CLDevice(CLDevice &&device); - CLDevice &operator=(CLDevice &&device); - CLDevice(const CLDevice &); - CLDevice &operator=(const CLDevice &); - - ~CLDevice() {} - - cl_device_id id() const { return id_; } - cl_platform_id platform() const { return platform_id_; } - std::string GetPlatformVersion() const; - - Vendor vendor() const { return info_.vendor; } - OpenCLVersion cl_version() const { return info_.cl_version; } - bool SupportsFP16() const; - bool SupportsTextureArray() const; - bool SupportsImageBuffer() const; - bool SupportsImage3D() const; - bool SupportsExtension(const std::string &extension) const; - bool SupportsFP32RTN() const; - bool SupportsFP16RTN() const; - bool IsCL20OrHigher() const; - bool SupportsSubGroupWithSize(int sub_group_size) const; - bool IsAdreno() const; - bool IsAdreno3xx() const; - bool IsAdreno4xx() const; - bool IsAdreno5xx() const; - bool IsAdreno6xx() const; - bool IsAdreno6xxOrHigher() const; - bool IsPowerVR() const; - bool IsNvidia() const; - bool IsMali() const; - bool IsAMD() const; - bool IsIntel() const; - - // To track bug on some Adreno. b/131099086 - bool SupportsOneLayerTextureArray() const; - void DisableOneLayerTextureArray(); - - const DeviceInfo &GetInfo() const { return info_; } - // We update device info during context creation, so as supported texture - // formats can be requested from context only. - mutable DeviceInfo info_; - -private: - cl_device_id id_ = nullptr; - cl_platform_id platform_id_ = nullptr; -}; - -absl::Status CreateDefaultGPUDevice(CLDevice *result); - -template <typename T> T GetDeviceInfo(cl_device_id id, cl_device_info info) -{ - T result; - cl_int error = clGetDeviceInfo(id, info, sizeof(T), &result, nullptr); - if (error != CL_SUCCESS) - { - return -1; - } - return result; -} - -template <typename T> absl::Status GetDeviceInfo(cl_device_id id, cl_device_info info, T *result) -{ - cl_int error = clGetDeviceInfo(id, info, sizeof(T), result, nullptr); - if (error != CL_SUCCESS) - { - return absl::InvalidArgumentError(CLErrorCodeToString(error)); - } - return absl::OkStatus(); -} - -} // namespace gpu_cl -} // namespace backend -} // namespace onert - -#endif // __ONERT_BACKEND_GPU_CL_OPENCL_CL_DEVICE_H__ diff --git a/runtime/onert/backend/gpu_cl/open_cl/ClErrors.h b/runtime/onert/backend/gpu_cl/open_cl/ClErrors.h deleted file mode 100644 index 48cd2fb00..000000000 --- a/runtime/onert/backend/gpu_cl/open_cl/ClErrors.h +++ /dev/null @@ -1,48 +0,0 @@ -/* - * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright 2019 The TensorFlow Authors. All Rights Reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef __ONERT_BACKEND_GPU_CL_OPENCL_CL_ERRORS_H__ -#define __ONERT_BACKEND_GPU_CL_OPENCL_CL_ERRORS_H__ - -#include <string> - -#include "Util.h" -#include "Status.h" - -namespace onert -{ -namespace backend -{ -namespace gpu_cl -{ - -// @return if error_code is success, then return OK status. Otherwise translates -// error code into a message. -inline absl::Status GetOpenCLError(cl_int error_code) -{ - if (error_code == CL_SUCCESS) - { - return absl::OkStatus(); - } - return absl::InternalError("OpenCL error: " + CLErrorCodeToString(error_code)); -} - -} // namespace gpu_cl -} // namespace backend -} // namespace onert - -#endif // __ONERT_BACKEND_GPU_CL_OPENCL_CL_ERRORS_H__ diff --git a/runtime/onert/backend/gpu_cl/open_cl/ClEvent.cc b/runtime/onert/backend/gpu_cl/open_cl/ClEvent.cc deleted file mode 100644 index beb64a9a8..000000000 --- a/runtime/onert/backend/gpu_cl/open_cl/ClEvent.cc +++ /dev/null @@ -1,88 +0,0 @@ -/* - * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright 2019 The TensorFlow Authors. All Rights Reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "ClEvent.h" - -#include "OpenclWrapper.h" - -namespace onert -{ -namespace backend -{ -namespace gpu_cl -{ -CLEvent::CLEvent(cl_event event) : event_(event) {} - -CLEvent::CLEvent(CLEvent &&event) : event_(event.event_), name_(std::move(event.name_)) -{ - event.event_ = nullptr; -} - -CLEvent &CLEvent::operator=(CLEvent &&event) -{ - if (this != &event) - { - Release(); - std::swap(event_, event.event_); - name_ = std::move(event.name_); - } - return *this; -} - -uint64_t CLEvent::GetStartedTimeNs() const -{ - cl_ulong time_ns; - clGetEventProfilingInfo(event_, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &time_ns, nullptr); - return time_ns; -} - -uint64_t CLEvent::GetFinishedTimeNs() const -{ - cl_ulong time_ns; - clGetEventProfilingInfo(event_, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &time_ns, nullptr); - return time_ns; -} - -double CLEvent::GetEventTimeMs() const -{ - const uint64_t start = GetStartedTimeNs(); - const uint64_t end = GetFinishedTimeNs(); - const uint64_t time_ns = (end - start); - - return static_cast<double>(time_ns) * 1e-6; -} - -uint64_t CLEvent::GetEventTimeNs() const { return GetFinishedTimeNs() - GetStartedTimeNs(); } - -void CLEvent::SetName(const std::string &name) { name_ = name; } - -void CLEvent::Wait() const { clWaitForEvents(1, &event_); } - -CLEvent::~CLEvent() { Release(); } - -void CLEvent::Release() -{ - if (event_) - { - clReleaseEvent(event_); - event_ = nullptr; - } -} - -} // namespace gpu_cl -} // namespace backend -} // namespace onert diff --git a/runtime/onert/backend/gpu_cl/open_cl/ClEvent.h b/runtime/onert/backend/gpu_cl/open_cl/ClEvent.h deleted file mode 100644 index 265409ffe..000000000 --- a/runtime/onert/backend/gpu_cl/open_cl/ClEvent.h +++ /dev/null @@ -1,75 +0,0 @@ -/* - * Copyright (c) 2021 
Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright 2019 The TensorFlow Authors. All Rights Reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef __ONERT_BACKEND_GPU_CL_OPENCL_CL_EVENT_H__ -#define __ONERT_BACKEND_GPU_CL_OPENCL_CL_EVENT_H__ - -#include <cstdint> -#include <string> - -#include "OpenclWrapper.h" - -namespace onert -{ -namespace backend -{ -namespace gpu_cl -{ - -// A RAII wrapper around opencl event -class CLEvent -{ -public: - CLEvent() {} - explicit CLEvent(cl_event event); - - // Move only - CLEvent(CLEvent &&event); - CLEvent &operator=(CLEvent &&event); - CLEvent(const CLEvent &) = delete; - CLEvent &operator=(const CLEvent &) = delete; - - ~CLEvent(); - - uint64_t GetStartedTimeNs() const; - uint64_t GetFinishedTimeNs() const; - - double GetEventTimeMs() const; - uint64_t GetEventTimeNs() const; - - void Wait() const; - - cl_event event() const { return event_; } - - bool is_valid() const { return event_ != nullptr; } - - void SetName(const std::string &name); - std::string GetName() const { return name_; } - -private: - void Release(); - - cl_event event_ = nullptr; - - std::string name_; // optional, for profiling mostly -}; - -} // namespace gpu_cl -} // namespace backend -} // namespace onert - -#endif // __ONERT_BACKEND_GPU_CL_OPENCL_CL_EVENT_H__ diff --git a/runtime/onert/backend/gpu_cl/open_cl/ClImageFormat.cc b/runtime/onert/backend/gpu_cl/open_cl/ClImageFormat.cc deleted file mode 100644 index 247a63d39..000000000 --- a/runtime/onert/backend/gpu_cl/open_cl/ClImageFormat.cc +++ /dev/null @@ -1,59 +0,0 @@ -/* - * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright 2019 The TensorFlow Authors. All Rights Reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "ClImageFormat.h" - -namespace onert -{ -namespace backend -{ -namespace gpu_cl -{ - -cl_channel_order ToChannelOrder(int num_channels) -{ - switch (num_channels) - { - case 1: - return CL_R; - case 2: - return CL_RG; - case 3: - return CL_RGB; - case 4: - return CL_RGBA; - default: - return -1; - } -} - -cl_channel_type ToImageChannelType(DataType data_type) -{ - switch (data_type) - { - case DataType::FLOAT32: - return CL_FLOAT; - case DataType::FLOAT16: - return CL_HALF_FLOAT; - default: - return -1; - } -} - -} // namespace gpu_cl -} // namespace backend -} // namespace onert diff --git a/runtime/onert/backend/gpu_cl/open_cl/ClImageFormat.h b/runtime/onert/backend/gpu_cl/open_cl/ClImageFormat.h deleted file mode 100644 index a763746bd..000000000 --- a/runtime/onert/backend/gpu_cl/open_cl/ClImageFormat.h +++ /dev/null @@ -1,39 +0,0 @@ -/* - * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright 2019 The TensorFlow Authors. All Rights Reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef __ONERT_BACKEND_GPU_CL_OPENCL_CL_IMAGE_FORMAT_H__ -#define __ONERT_BACKEND_GPU_CL_OPENCL_CL_IMAGE_FORMAT_H__ - -#include "OpenclWrapper.h" -#include "DataType.h" - -namespace onert -{ -namespace backend -{ -namespace gpu_cl -{ - -cl_channel_order ToChannelOrder(int num_channels); - -cl_channel_type ToImageChannelType(DataType data_type); - -} // namespace gpu_cl -} // namespace backend -} // namespace onert - -#endif // __ONERT_BACKEND_GPU_CL_OPENCL_CL_IMAGE_FORMAT_H__ diff --git a/runtime/onert/backend/gpu_cl/open_cl/ClKernel.cc b/runtime/onert/backend/gpu_cl/open_cl/ClKernel.cc deleted file mode 100644 index f7745b9ac..000000000 --- a/runtime/onert/backend/gpu_cl/open_cl/ClKernel.cc +++ /dev/null @@ -1,171 +0,0 @@ -/* - * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright 2019 The TensorFlow Authors. All Rights Reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "ClKernel.h" - -#include "absl/strings/str_cat.h" -#include "ClProgram.h" -#include "Util.h" -#include "Status.h" - -namespace onert -{ -namespace backend -{ -namespace gpu_cl -{ -namespace -{ - -absl::Status GetKernelMaxWorkGroupSize(cl_kernel kernel, cl_device_id device_id, int *result) -{ - size_t max_work_group_size; - cl_int error_code = clGetKernelWorkGroupInfo(kernel, device_id, CL_KERNEL_WORK_GROUP_SIZE, - sizeof(size_t), &max_work_group_size, nullptr); - if (error_code != CL_SUCCESS) - { - return absl::UnknownError(absl::StrCat("Failed to get info CL_KERNEL_WORK_GROUP_SIZE ", - CLErrorCodeToString(error_code))); - } - *result = static_cast<int>(max_work_group_size); - return absl::OkStatus(); -} - -absl::Status GetKernelPrivateMemorySize(cl_kernel kernel, cl_device_id device_id, int *result) -{ - cl_ulong private_mem_size; - cl_int error_code = clGetKernelWorkGroupInfo(kernel, device_id, CL_KERNEL_PRIVATE_MEM_SIZE, - sizeof(cl_ulong), &private_mem_size, nullptr); - if (error_code != CL_SUCCESS) - { - return absl::UnknownError(absl::StrCat("Failed to get info CL_KERNEL_PRIVATE_MEM_SIZE ", - CLErrorCodeToString(error_code))); - } - *result = static_cast<int>(private_mem_size); - return absl::OkStatus(); -} - -} // namespace - -CLKernel::CLKernel(CLKernel &&kernel) - : info_(kernel.info_), binding_counter_(kernel.binding_counter_), - function_name_(std::move(kernel.function_name_)), program_(kernel.program_), - kernel_(kernel.kernel_) -{ - kernel.kernel_ = nullptr; -} - -CLKernel &CLKernel::operator=(CLKernel &&kernel) -{ - if (this != &kernel) - { - Release(); - std::swap(info_, kernel.info_); - std::swap(binding_counter_, kernel.binding_counter_); - function_name_ = std::move(kernel.function_name_); - std::swap(program_, kernel.program_); - std::swap(kernel_, kernel.kernel_); - } - return *this; -} - -CLKernel::~CLKernel() { Release(); } - -absl::Status CLKernel::ReInit() const -{ - clReleaseKernel(kernel_); - cl_kernel *kern_ptr = const_cast<cl_kernel *>(&kernel_); - int error_code; - *kern_ptr = clCreateKernel(program_, function_name_.c_str(), &error_code); - if (!kernel_ || error_code != CL_SUCCESS) - { - *kern_ptr = nullptr; - return absl::UnknownError( - absl::StrCat("Failed to create ", function_name_, CLErrorCodeToString(error_code))); - } - return absl::OkStatus(); -} - -void CLKernel::Release() -{ - if (kernel_) - { - clReleaseKernel(kernel_); - clReleaseProgram(program_); - kernel_ = nullptr; - } -} - -absl::Status CLKernel::CreateFromProgram(const CLProgram &program, const std::string &function_name) -{ - int error_code; - function_name_ = function_name; - kernel_ = clCreateKernel(program.program(), function_name.c_str(), &error_code); - if (!kernel_ || error_code != CL_SUCCESS) - { - kernel_ = nullptr; - return absl::UnknownError( - absl::StrCat("Failed to create ", function_name, CLErrorCodeToString(error_code))); - } - - program_ = program.program(); - clRetainProgram(program_); - - RETURN_IF_ERROR( - GetKernelPrivateMemorySize(kernel_, program.GetDeviceId(), &info_.private_memory_size)); - RETURN_IF_ERROR( - GetKernelMaxWorkGroupSize(kernel_, program.GetDeviceId(), &info_.max_work_group_size)); - return absl::OkStatus(); -} - -absl::Status CLKernel::SetMemory(int index, cl_mem memory) -{ - return SetBytes(index, &memory, sizeof(cl_mem)); -} - -absl::Status CLKernel::SetMemoryAuto(cl_mem memory) -{ - return SetBytesAuto(&memory, sizeof(cl_mem)); -} - -absl::Status CLKernel::SetBytes(int index, const void *ptr, int length) const -{ - const int 
error_code = clSetKernelArg(kernel_, index, length, ptr); - if (error_code != CL_SUCCESS) - { - return absl::UnknownError( - absl::StrCat("Failed to set kernel arguments - ", CLErrorCodeToString(error_code))); - } - return absl::OkStatus(); -} - -absl::Status CLKernel::SetBytesAuto(const void *ptr, int length) -{ - const int error_code = clSetKernelArg(kernel_, binding_counter_, length, ptr); - if (error_code != CL_SUCCESS) - { - return absl::UnknownError(absl::StrCat("Failed to set kernel arguments - ", - CLErrorCodeToString(error_code), "(at index - ", - binding_counter_, ")")); - } - binding_counter_++; - return absl::OkStatus(); -} - -} // namespace gpu_cl -} // namespace backend -} // namespace onert diff --git a/runtime/onert/backend/gpu_cl/open_cl/ClKernel.h b/runtime/onert/backend/gpu_cl/open_cl/ClKernel.h deleted file mode 100644 index 9575b7946..000000000 --- a/runtime/onert/backend/gpu_cl/open_cl/ClKernel.h +++ /dev/null @@ -1,101 +0,0 @@ -/* - * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright 2019 The TensorFlow Authors. All Rights Reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef __ONERT_BACKEND_GPU_CL_OPENCL_CL_KERNEL_H__ -#define __ONERT_BACKEND_GPU_CL_OPENCL_CL_KERNEL_H__ - -#include <string> - -#include "ClContext.h" -#include "ClDevice.h" -#include "ClProgram.h" -#include "OpenclWrapper.h" -#include "Status.h" - -namespace onert -{ -namespace backend -{ -namespace gpu_cl -{ - -struct KernelInfo -{ - int private_memory_size = 0; - int max_work_group_size = 0; -}; - -// Arguments binding to CLKernel can be manual or automatic -// In manual you specify binding index explicitly -// In automatic binding, index auto-incremented with every binding call -// Also, if you use automatic mode you must call ResetBindingCounter -// before parameters binding -class CLKernel -{ -public: - CLKernel() {} - - // Move only - CLKernel(CLKernel &&kernel); - CLKernel &operator=(CLKernel &&kernel); - CLKernel(const CLKernel &) = delete; - CLKernel &operator=(const CLKernel &) = delete; - - ~CLKernel(); - - cl_kernel kernel() const { return kernel_; } - - absl::Status CreateFromProgram(const CLProgram &program, const std::string &function_name); - - absl::Status SetMemory(int index, cl_mem memory); - absl::Status SetMemoryAuto(cl_mem memory); - template <typename T> absl::Status SetBytes(int index, const T &value) const - { - return SetBytes(index, static_cast<const void *>(&value), sizeof(T)); - } - template <typename T> absl::Status SetBytesAuto(const T &value) - { - return SetBytesAuto(static_cast<const void *>(&value), sizeof(T)); - } - - int GetBindingCounter() const { return binding_counter_; } - void ResetBindingCounter() { binding_counter_ = 0; } - - // Do not use this function - // workaround for Mali memory leak - absl::Status ReInit() const; - - KernelInfo info_; - -private: - void Release(); - absl::Status SetBytes(int index, const void *ptr, int length) const; - absl::Status SetBytesAuto(const void *ptr, int length); 
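// A minimal usage sketch of the automatic binding mode described above
// (illustrative only; the program, kernel name, buffers, and scalar are
// hypothetical, not part of the original source):
//
//   CLKernel kernel;
//   RETURN_IF_ERROR(kernel.CreateFromProgram(program, "my_kernel"));
//   kernel.ResetBindingCounter();
//   RETURN_IF_ERROR(kernel.SetMemoryAuto(src_buffer));    // becomes arg 0
//   RETURN_IF_ERROR(kernel.SetMemoryAuto(dst_buffer));    // becomes arg 1
//   RETURN_IF_ERROR(kernel.SetBytesAuto(element_count));  // becomes arg 2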
- - int binding_counter_ = -1; - - std::string function_name_ = ""; - // reference to program from which kernel was created - cl_program program_ = nullptr; - cl_kernel kernel_ = nullptr; -}; - -} // namespace gpu_cl -} // namespace backend -} // namespace onert - -#endif // __ONERT_BACKEND_GPU_CL_OPENCL_CL_KERNEL_H__ diff --git a/runtime/onert/backend/gpu_cl/open_cl/ClMemory.cc b/runtime/onert/backend/gpu_cl/open_cl/ClMemory.cc deleted file mode 100644 index fd3bc5579..000000000 --- a/runtime/onert/backend/gpu_cl/open_cl/ClMemory.cc +++ /dev/null @@ -1,46 +0,0 @@ -/* - * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright 2019 The TensorFlow Authors. All Rights Reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "ClMemory.h" - -namespace onert -{ -namespace backend -{ -namespace gpu_cl -{ - -cl_mem_flags ToClMemFlags(AccessType access_type) -{ - switch (access_type) - { - case AccessType::READ: - return CL_MEM_READ_ONLY; - case AccessType::WRITE: - return CL_MEM_WRITE_ONLY; - case AccessType::READ_WRITE: - return CL_MEM_READ_WRITE; - default: - throw std::runtime_error("Invalid AccessType"); - } - - return CL_MEM_READ_ONLY; // unreachable -} - -} // namespace gpu_cl -} // namespace backend -} // namespace onert diff --git a/runtime/onert/backend/gpu_cl/open_cl/ClMemory.h b/runtime/onert/backend/gpu_cl/open_cl/ClMemory.h deleted file mode 100644 index c704ec71f..000000000 --- a/runtime/onert/backend/gpu_cl/open_cl/ClMemory.h +++ /dev/null @@ -1,100 +0,0 @@ -/* - * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright 2019 The TensorFlow Authors. All Rights Reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef __ONERT_BACKEND_GPU_CL_OPENCL_CL_MEMORY_H__ -#define __ONERT_BACKEND_GPU_CL_OPENCL_CL_MEMORY_H__ - -#include <algorithm> - -#include "OpenclWrapper.h" -#include "AccessType.h" - -namespace onert -{ -namespace backend -{ -namespace gpu_cl -{ - -// RAII wrapper for OpenCL memory object. -// -// Image is moveable but not copyable. -class CLMemory -{ -public: - // Creates invalid object. 
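// (Default construction yields exactly such an invalid wrapper: no cl_mem,
// no ownership.) A minimal ownership sketch, assuming `buf` came from a
// successful clCreateBuffer call elsewhere:
//
//   CLMemory owned(buf, /*has_ownership=*/true); // clReleaseMemObject runs on destruction
//   cl_mem raw = owned.Release();                // detaches; the caller owns `buf` again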
- CLMemory() : CLMemory(nullptr, false) {}
-
- CLMemory(cl_mem memory, bool has_ownership) : memory_(memory), has_ownership_(has_ownership) {}
-
- // Move-only
- CLMemory(const CLMemory &) = delete;
- CLMemory &operator=(const CLMemory &) = delete;
- CLMemory(CLMemory &&image) : memory_(image.memory_), has_ownership_(image.has_ownership_)
- {
- image.memory_ = nullptr;
- }
-
- ~CLMemory() { Invalidate(); }
-
- CLMemory &operator=(CLMemory &&image)
- {
- if (this != &image)
- {
- Invalidate();
- std::swap(memory_, image.memory_);
- has_ownership_ = image.has_ownership_;
- }
- return *this;
- }
-
- cl_mem memory() const { return memory_; }
-
- bool is_valid() const { return memory_ != nullptr; }
-
- // @return true if this object actually owns the corresponding CL memory
- // and manages its lifetime.
- bool has_ownership() const { return has_ownership_; }
-
- cl_mem Release()
- {
- cl_mem to_return = memory_;
- memory_ = nullptr;
- return to_return;
- }
-
-private:
- void Invalidate()
- {
- if (memory_ && has_ownership_)
- {
- clReleaseMemObject(memory_);
- }
- memory_ = nullptr;
- }
-
- cl_mem memory_ = nullptr;
- bool has_ownership_ = false;
-};
-
-cl_mem_flags ToClMemFlags(AccessType access_type);
-
-} // namespace gpu_cl
-} // namespace backend
-} // namespace onert
-
-#endif // __ONERT_BACKEND_GPU_CL_OPENCL_CL_MEMORY_H__
diff --git a/runtime/onert/backend/gpu_cl/open_cl/ClProgram.cc b/runtime/onert/backend/gpu_cl/open_cl/ClProgram.cc
deleted file mode 100644
index c72b01a73..000000000
--- a/runtime/onert/backend/gpu_cl/open_cl/ClProgram.cc
+++ /dev/null
@@ -1,224 +0,0 @@
-/*
- * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright 2019 The TensorFlow Authors. All Rights Reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */ - -#include "ClProgram.h" - -#include <cstdint> -#include <cstring> -#include <vector> - -#include "absl/strings/str_cat.h" -#include "absl/types/span.h" -#include "Util.h" -#include "Status.h" - -namespace onert -{ -namespace backend -{ -namespace gpu_cl -{ -namespace -{ - -std::string GetProgramBuildInfo(cl_program program, cl_device_id id, cl_program_build_info info) -{ - size_t size; - cl_int error_code = clGetProgramBuildInfo(program, id, info, 0, nullptr, &size); - if (error_code != CL_SUCCESS) - { - return absl::StrCat("Failed to GetProgramBuildInfo - ", CLErrorCodeToString(error_code)); - } - - std::string result(size - 1, 0); - error_code = clGetProgramBuildInfo(program, id, info, size, &result[0], nullptr); - if (error_code != CL_SUCCESS) - { - return absl::StrCat("Failed to GetProgramBuildInfo - ", CLErrorCodeToString(error_code)); - } - return result; -} - -absl::Status GetBinarySize(cl_program program, size_t *binary_size) -{ - cl_int error_code = - clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), binary_size, nullptr); - if (error_code != CL_SUCCESS) - { - return absl::UnknownError( - absl::StrCat("Failed to get program binary size - ", CLErrorCodeToString(error_code))); - } - return absl::OkStatus(); -} - -absl::Status BuildProgram(cl_program program, const CLDevice &device, - const std::string &compiler_options) -{ - const int error_code = - clBuildProgram(program, 0, nullptr, compiler_options.c_str(), nullptr, nullptr); - if (error_code != CL_SUCCESS) - { - return absl::UnknownError( - absl::StrCat("Failed to build program executable - ", CLErrorCodeToString(error_code), - GetProgramBuildInfo(program, device.id(), CL_PROGRAM_BUILD_LOG))); - } - - return absl::OkStatus(); -} - -std::string CompilerOptionToString(const CLDevice &device, CompilerOptions option) -{ - switch (option) - { - case CompilerOptions::ADRENO_FULL_SIMD_LINE: - if (device.info_.adreno_info.gpu_version < 500) - { - return "-qcom-accelerate-16-bit"; - } - else - { - return "-qcom-accelerate-16-bit=true"; - } - case CompilerOptions::ADRENO_MORE_WAVES: - if (device.info_.adreno_info.gpu_version >= 500) - { - return "-qcom-accelerate-16-bit=false"; - } - else - { - return ""; - } - case CompilerOptions::POWERVR_FP16: - return "-cl-fast-relaxed-math"; - case CompilerOptions::CL_OPT_DISABLE: - return "-cl-opt-disable"; - case CompilerOptions::CL_2_0: - return "-cl-std=CL2.0"; - case CompilerOptions::CL_3_0: - return "-cl-std=CL3.0"; - } - return ""; -} - -} // namespace - -std::string CompilerOptionsToString(const CLDevice &device, - const std::vector<CompilerOptions> &compiler_options) -{ - std::string result; - for (auto option : compiler_options) - { - absl::StrAppend(&result, CompilerOptionToString(device, option), " "); - } - return result; -} - -CLProgram::CLProgram(cl_program program, cl_device_id device_id) - : program_(program), device_id_(device_id) -{ -} - -CLProgram::CLProgram(CLProgram &&program) - : program_(program.program_), device_id_(program.device_id_) -{ - program.program_ = nullptr; -} - -CLProgram &CLProgram::operator=(CLProgram &&program) -{ - if (this != &program) - { - Release(); - std::swap(program_, program.program_); - std::swap(device_id_, program.device_id_); - } - return *this; -} - -CLProgram::~CLProgram() { Release(); } - -void CLProgram::Release() -{ - if (program_) - { - clReleaseProgram(program_); - program_ = nullptr; - } -} - -absl::Status CLProgram::GetBinary(std::vector<uint8_t> *result) const -{ - size_t binary_size; - 
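// The calls below follow the standard two-step OpenCL query pattern: first
// ask for the binary size (CL_PROGRAM_BINARY_SIZES, via GetBinarySize above),
// then fetch CL_PROGRAM_BINARIES into a buffer of exactly that size.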
RETURN_IF_ERROR(GetBinarySize(program_, &binary_size)); - result->resize(result->size() + binary_size); - uint8_t *binary_ptr = result->data() + result->size() - binary_size; - cl_int error_code = - clGetProgramInfo(program_, CL_PROGRAM_BINARIES, binary_size, &binary_ptr, nullptr); - if (error_code != CL_SUCCESS) - { - return absl::UnknownError( - absl::StrCat("Failed to get program binary - ", CLErrorCodeToString(error_code))); - } - return absl::OkStatus(); -} - -absl::Status CreateCLProgram(const std::string &code, const std::string &compiler_options, - const CLContext &context, const CLDevice &device, CLProgram *result) -{ - int error_code; - const char *source = code.c_str(); - - cl_program program = - clCreateProgramWithSource(context.context(), 1, &source, nullptr, &error_code); - if (!program || error_code != CL_SUCCESS) - { - return absl::UnknownError( - absl::StrCat("Failed to create compute program - ", CLErrorCodeToString(error_code))); - } - - *result = CLProgram(program, device.id()); - RETURN_IF_ERROR(BuildProgram(program, device, compiler_options)); - return absl::OkStatus(); -} - -absl::Status CreateCLProgramFromBinary(const CLContext &context, const CLDevice &device, - absl::Span<const uint8_t> binary, CLProgram *result) -{ - cl_int binary_status; - cl_int error_code; - cl_device_id devices_list[] = {device.id()}; - size_t binary_size = binary.size(); - const uint8_t *binary_pointer = binary.data(); - cl_program program = clCreateProgramWithBinary(context.context(), 1, devices_list, &binary_size, - &binary_pointer, &binary_status, &error_code); - if (binary_status != CL_SUCCESS) - { - return absl::UnknownError(absl::StrCat( - "Something wrong with binary after clCreateProgramWithBinary - ", binary_status)); - } - if (error_code != CL_SUCCESS) - { - return absl::UnknownError( - absl::StrCat("Failed to create program - ", CLErrorCodeToString(error_code))); - } - *result = CLProgram(program, device.id()); - return BuildProgram(program, device, ""); -} - -} // namespace gpu_cl -} // namespace backend -} // namespace onert diff --git a/runtime/onert/backend/gpu_cl/open_cl/ClProgram.h b/runtime/onert/backend/gpu_cl/open_cl/ClProgram.h deleted file mode 100644 index d039ff698..000000000 --- a/runtime/onert/backend/gpu_cl/open_cl/ClProgram.h +++ /dev/null @@ -1,98 +0,0 @@ -/* - * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright 2019 The TensorFlow Authors. All Rights Reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef __ONERT_BACKEND_GPU_CL_OPENCL_CL_PROGRAM_H__ -#define __ONERT_BACKEND_GPU_CL_OPENCL_CL_PROGRAM_H__ - -#include <cstdint> -#include <vector> - -#include "ClContext.h" -#include "ClDevice.h" -#include "OpenclWrapper.h" - -namespace onert -{ -namespace backend -{ -namespace gpu_cl -{ - -enum class CompilerOptions -{ - // ADRENO_FULL_SIMD_LINE: - // Adreno can have 2 sizes for SIMD size. - // On Adreno 4xx/5xx it is 32/64, on Adreno6xx it is 64/128. 
- // Some of our algorithms actually rely on the exact size, for example on
- // the full SIMD size, so we need this define.
- // This define is actually -qcom-accelerate-16-bit, but it controls SIMD
- // size.
- ADRENO_FULL_SIMD_LINE,
- ADRENO_MORE_WAVES,
- POWERVR_FP16,
- CL_OPT_DISABLE,
- CL_2_0,
- CL_3_0,
-};
-
-std::string CompilerOptionsToString(const CLDevice &device,
- const std::vector<CompilerOptions> &compiler_options);
-
-class CLProgram
-{
-public:
- CLProgram() {}
- CLProgram(cl_program program, cl_device_id device_id);
-
- // Move only
- CLProgram(CLProgram &&program);
- CLProgram &operator=(CLProgram &&program);
- CLProgram(const CLProgram &) = delete;
- CLProgram &operator=(const CLProgram &) = delete;
-
- ~CLProgram();
-
- cl_program program() const { return program_; }
-
- // Return the cl_device_id associated with the program object.
- // This can be the device associated with the context on which the program
- // object was created, or the device that was specified when the program
- // object was created using clCreateProgramWithBinary.
- cl_device_id GetDeviceId() const { return device_id_; }
-
- absl::Status GetBinary(std::vector<uint8_t> *result) const;
-
-private:
- void Release();
-
- cl_program program_ = nullptr;
-
- // reference
- cl_device_id device_id_ = nullptr;
-};
-
-absl::Status CreateCLProgram(const std::string &code, const std::string &compiler_options,
- const CLContext &context, const CLDevice &device, CLProgram *result);
-
-absl::Status CreateCLProgramFromBinary(const CLContext &context, const CLDevice &device,
- absl::Span<const uint8_t> binary, CLProgram *result);
-
-} // namespace gpu_cl
-} // namespace backend
-} // namespace onert
-
-#endif // __ONERT_BACKEND_GPU_CL_OPENCL_CL_PROGRAM_H__
diff --git a/runtime/onert/backend/gpu_cl/open_cl/DataType.cc b/runtime/onert/backend/gpu_cl/open_cl/DataType.cc
deleted file mode 100644
index ce2aa8298..000000000
--- a/runtime/onert/backend/gpu_cl/open_cl/DataType.cc
+++ /dev/null
@@ -1,122 +0,0 @@
-/*
- * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright 2019 The TensorFlow Authors. All Rights Reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */ - -#include "DataType.h" - -#include <stddef.h> -#include <string> - -namespace onert -{ -namespace backend -{ -namespace gpu_cl -{ - -size_t SizeOf(DataType data_type) -{ - switch (data_type) - { - case DataType::UINT8: - case DataType::INT8: - return 1; - case DataType::FLOAT16: - case DataType::INT16: - case DataType::UINT16: - return 2; - case DataType::FLOAT32: - case DataType::INT32: - case DataType::UINT32: - return 4; - case DataType::FLOAT64: - case DataType::INT64: - case DataType::UINT64: - return 8; - case DataType::UNKNOWN: - return 0; - } - return 0; -} - -std::string ToString(DataType data_type) -{ - switch (data_type) - { - case DataType::FLOAT16: - return "float16"; - case DataType::FLOAT32: - return "float32"; - case DataType::FLOAT64: - return "float64"; - case DataType::INT16: - return "int16"; - case DataType::INT32: - return "int32"; - case DataType::INT64: - return "int64"; - case DataType::INT8: - return "int8"; - case DataType::UINT16: - return "uint16"; - case DataType::UINT32: - return "uint32"; - case DataType::UINT64: - return "uint64"; - case DataType::UINT8: - return "uint8"; - case DataType::UNKNOWN: - return "unknown"; - } - return "undefined"; -} - -std::string ToCLDataType(DataType data_type, int vec_size) -{ - const std::string postfix = vec_size == 1 ? "" : std::to_string(vec_size); - switch (data_type) - { - case DataType::FLOAT16: - return "half" + postfix; - case DataType::FLOAT32: - return "float" + postfix; - case DataType::FLOAT64: - return "double" + postfix; - case DataType::INT16: - return "short" + postfix; - case DataType::INT32: - return "int" + postfix; - case DataType::INT64: - return "long" + postfix; - case DataType::INT8: - return "char" + postfix; - case DataType::UINT16: - return "ushort" + postfix; - case DataType::UINT32: - return "uint" + postfix; - case DataType::UINT64: - return "ulong" + postfix; - case DataType::UINT8: - return "uchar" + postfix; - case DataType::UNKNOWN: - return "unknown"; - } - return "undefined"; -} - -} // namespace gpu_cl -} // namespace backend -} // namespace onert diff --git a/runtime/onert/backend/gpu_cl/open_cl/DataType.h b/runtime/onert/backend/gpu_cl/open_cl/DataType.h deleted file mode 100644 index 2a5afd551..000000000 --- a/runtime/onert/backend/gpu_cl/open_cl/DataType.h +++ /dev/null @@ -1,57 +0,0 @@ -/* - * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright 2019 The TensorFlow Authors. All Rights Reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifndef __ONERT_BACKEND_GPU_CL_OPENCL_DATA_TYPE_H__ -#define __ONERT_BACKEND_GPU_CL_OPENCL_DATA_TYPE_H__ - -#include <stddef.h> -#include <string> - -namespace onert -{ -namespace backend -{ -namespace gpu_cl -{ - -enum class DataType -{ - UNKNOWN = 0, - FLOAT16 = 1, - FLOAT32 = 2, - FLOAT64 = 3, - UINT8 = 4, - INT8 = 5, - UINT16 = 6, - INT16 = 7, - UINT32 = 8, - INT32 = 9, - UINT64 = 10, - INT64 = 11, -}; - -size_t SizeOf(DataType type); - -std::string ToString(DataType t); - -std::string ToCLDataType(DataType data_type, int vec_size = 1); - -} // namespace gpu_cl -} // namespace backend -} // namespace onert - -#endif // __ONERT_BACKEND_GPU_CL_OPENCL_DATA_TYPE_H__ diff --git a/runtime/onert/backend/gpu_cl/open_cl/DeviceInfo.cc b/runtime/onert/backend/gpu_cl/open_cl/DeviceInfo.cc deleted file mode 100644 index 2966fad75..000000000 --- a/runtime/onert/backend/gpu_cl/open_cl/DeviceInfo.cc +++ /dev/null @@ -1,383 +0,0 @@ -/* - * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright 2020 The TensorFlow Authors. All Rights Reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "DeviceInfo.h" - -#include <algorithm> -#include <map> -#include <string> -#include <vector> - -#include "absl/strings/numbers.h" -#include "absl/strings/str_split.h" - -namespace onert -{ -namespace backend -{ -namespace gpu_cl -{ - -namespace -{ -// check that gpu_version belong to range min_version-max_version -// min_version is included and max_version is excluded. -bool IsGPUVersionInRange(int gpu_version, int min_version, int max_version) -{ - return gpu_version >= min_version && gpu_version < max_version; -} - -MaliGPU GetMaliGPUVersion(const std::string &device_name) -{ - const std::map<std::string, MaliGPU> kMapping = { - {"T604", MaliGPU::T604}, {"T622", MaliGPU::T622}, {"T624", MaliGPU::T624}, - {"T628", MaliGPU::T628}, {"T658", MaliGPU::T658}, {"T678", MaliGPU::T678}, - {"T720", MaliGPU::T720}, {"T760", MaliGPU::T760}, {"T820", MaliGPU::T820}, - {"T830", MaliGPU::T830}, {"T860", MaliGPU::T860}, {"T880", MaliGPU::T880}, - {"G31", MaliGPU::G31}, {"G51", MaliGPU::G51}, {"G71", MaliGPU::G71}, - {"G52", MaliGPU::G52}, {"G72", MaliGPU::G72}, {"G76", MaliGPU::G76}, - {"G57", MaliGPU::G57}, {"G77", MaliGPU::G77}, {"G68", MaliGPU::G68}, - {"G78", MaliGPU::G78}, - }; - for (const auto &v : kMapping) - { - if (device_name.find(v.first) != std::string::npos) - { - return v.second; - } - } - return MaliGPU::UNKNOWN; -} - -} // namespace - -// There is no rule for gpu version encoding, but we found these samples: -// Version: OpenCL C 2.0 Adreno(TM) 540 // Pixel 2 -// Version: OpenCL C 2.0 Adreno(TM) 630 // Sony Compact XZ2 -// Version: OpenCL C 2.0 Adreno(TM) 630 // Pixel 3 -// Version: OpenCL C 2.0 Adreno(TM) 540 // Samsung S8 -// Version: OpenCL C 1.2 Adreno(TM) 430 // HTC One M9 -// Version: OpenCL C 2.0 Adreno(TM) 530 // Samsung S7 Edge -// Version: OpenCL C 1.2 Adreno(TM) 405 // Motorola Moto G(4) -// After the number string ends. 
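// e.g. from "OpenCL C 2.0 Adreno(TM) 540" the parsed version would be 540.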
-// It is assumed that the <vendor-specific information> for Adreno GPUs has -// the following format: -// <text?><space?>Adreno(TM)<space><text?><version> -// Returns -1 if vendor-specific information cannot be parsed -int GetAdrenoGPUVersion(const std::string &gpu_version) -{ - const std::string gpu = absl::AsciiStrToLower(gpu_version); - const std::vector<absl::string_view> words = absl::StrSplit(gpu, ' '); - size_t i = 0; - for (; i < words.size(); ++i) - { - if (words[i].find("adreno") != words[i].npos) - { - break; - } - } - i += 1; - for (; i < words.size(); ++i) - { - int number; - bool is_number = absl::SimpleAtoi(words[i], &number); - // Adreno GPUs starts from 2xx, but opencl support should be only from 3xx - if (is_number && number >= 300) - { - return number; - } - } - return -1; -} - -std::string VendorToString(Vendor v) -{ - switch (v) - { - case Vendor::kQualcomm: - return "Qualcomm"; - case Vendor::kMali: - return "Mali"; - case Vendor::kPowerVR: - return "PowerVR"; - case Vendor::kNvidia: - return "NVIDIA"; - case Vendor::kAMD: - return "AMD"; - case Vendor::kIntel: - return "Intel"; - case Vendor::kUnknown: - return "unknown vendor"; - default: - return "Error"; - } -} - -std::string OpenCLVersionToString(OpenCLVersion version) -{ - switch (version) - { - case OpenCLVersion::CL_1_0: - return "1.0"; - case OpenCLVersion::CL_1_1: - return "1.1"; - case OpenCLVersion::CL_1_2: - return "1.2"; - case OpenCLVersion::CL_2_0: - return "2.0"; - case OpenCLVersion::CL_2_1: - return "2.1"; - case OpenCLVersion::CL_2_2: - return "2.2"; - case OpenCLVersion::CL_3_0: - return "3.0"; - default: - return "Error"; - } -} - -AdrenoInfo::AdrenoInfo(const std::string &device_version) - : gpu_version(GetAdrenoGPUVersion(device_version)) -{ -} - -int AdrenoInfo::GetMaximumWavesCount() const -{ - if (gpu_version < 400) - { - return -1; // Adreno 3xx does not support it currently - } - else if (gpu_version >= 400 && gpu_version < 500) - { - return -1; // Adreno 4xx does not support it currently - } - else if (gpu_version >= 500 && gpu_version < 600) - { - return -1; // Adreno 5xx does not support it currently - } - else if (gpu_version >= 600 && gpu_version < 700) - { - return gpu_version == 640 ? 30 : 16; - } - else - { - return -1; // Adreno 7xx and higher does not exist yet - } -} - -int AdrenoInfo::GetRegisterMemorySizePerComputeUnit() const -{ - if (gpu_version < 400) - { - return -1; // Adreno 3xx does not support it currently - } - else if (gpu_version >= 400 && gpu_version < 500) - { - return -1; // Adreno 4xx does not support it currently - } - else if (gpu_version >= 500 && gpu_version < 600) - { - return -1; // Adreno 5xx does not support it currently - } - else if (gpu_version >= 600 && gpu_version < 700) - { - return gpu_version == 640 ? 128 * 144 * 16 : 128 * 96 * 16; - } - else - { - return -1; // Adreno 7xx and higher does not exist yet - } -} - -int AdrenoInfo::GetMaximumWavesCount(int register_footprint_per_tread, bool full_wave) const -{ - const int register_usage_per_wave = GetWaveSize(full_wave) * register_footprint_per_tread; - const int possible_waves_count = GetRegisterMemorySizePerComputeUnit() / register_usage_per_wave; - return std::min(possible_waves_count, GetMaximumWavesCount()); -} - -int AdrenoInfo::GetWaveSize(bool full_wave) const -{ - if (gpu_version < 400) - { - return -1; // Adreno 3xx does not support it currently - } - else if (gpu_version < 600) - { - return full_wave ? 64 : 32; - } - else - { - return full_wave ? 
128 : 64; - } -} - -MaliInfo::MaliInfo(const std::string &device_name) : gpu_version(GetMaliGPUVersion(device_name)) {} - -bool MaliInfo::IsMaliT6xx() const -{ - return gpu_version == MaliGPU::T604 || gpu_version == MaliGPU::T622 || - gpu_version == MaliGPU::T624 || gpu_version == MaliGPU::T628 || - gpu_version == MaliGPU::T658 || gpu_version == MaliGPU::T678; -} - -bool MaliInfo::IsMaliT7xx() const -{ - return gpu_version == MaliGPU::T720 || gpu_version == MaliGPU::T760; -} - -bool MaliInfo::IsMaliT8xx() const -{ - return gpu_version == MaliGPU::T820 || gpu_version == MaliGPU::T830 || - gpu_version == MaliGPU::T860 || gpu_version == MaliGPU::T880; -} - -bool MaliInfo::IsMidgard() const { return IsMaliT6xx() || IsMaliT7xx() || IsMaliT8xx(); } - -bool MaliInfo::IsBifrostGen1() const -{ - return gpu_version == MaliGPU::G31 || gpu_version == MaliGPU::G51 || gpu_version == MaliGPU::G71; -} - -bool MaliInfo::IsBifrostGen2() const -{ - return gpu_version == MaliGPU::G52 || gpu_version == MaliGPU::G72; -} - -bool MaliInfo::IsBifrostGen3() const { return gpu_version == MaliGPU::G76; } - -bool MaliInfo::IsBifrost() const { return IsBifrostGen1() || IsBifrostGen2() || IsBifrostGen3(); } - -bool MaliInfo::IsValhall() const -{ - return gpu_version == MaliGPU::G57 || gpu_version == MaliGPU::G77 || - gpu_version == MaliGPU::G68 || gpu_version == MaliGPU::G78; -} - -bool DeviceInfo::SupportsTextureArray() const { return cl_version >= OpenCLVersion::CL_1_2; } - -bool DeviceInfo::SupportsImageBuffer() const { return cl_version >= OpenCLVersion::CL_1_2; } - -bool DeviceInfo::SupportsImage3D() const -{ - if (vendor == Vendor::kMali) - { - // On Mali T880 read_imageh doesn't compile with image3d_t - return false; - } - return supports_image3d_writes; -} - -bool DeviceInfo::SupportsFloatImage2D(DataType data_type, int channels) const -{ - if (channels == 1) - { - return data_type == DataType::FLOAT32 ? supports_r_f32_tex2d : supports_r_f16_tex2d; - } - else if (channels == 2) - { - return data_type == DataType::FLOAT32 ? supports_rg_f32_tex2d : supports_rg_f16_tex2d; - } - else if (channels == 3) - { - return data_type == DataType::FLOAT32 ? supports_rgb_f32_tex2d : supports_rgb_f16_tex2d; - } - else if (channels == 4) - { - return data_type == DataType::FLOAT32 ? 
supports_rgba_f32_tex2d : supports_rgba_f16_tex2d; - } - else - { - return false; - } -} - -bool DeviceInfo::SupportsOneLayerTextureArray() const -{ - return !IsAdreno() || adreno_info.support_one_layer_texture_array; -} - -bool DeviceInfo::SupportsExtension(const std::string &extension) const -{ - for (const auto &ext : extensions) - { - if (ext == extension) - { - return true; - } - } - return false; -} - -bool DeviceInfo::IsCL20OrHigher() const -{ - return cl_version != OpenCLVersion::CL_1_0 && cl_version != OpenCLVersion::CL_1_1 && - cl_version != OpenCLVersion::CL_1_2; -} - -bool DeviceInfo::SupportsSubGroupWithSize(int sub_group_size) const -{ - for (auto subgroup_size : supported_subgroup_sizes) - { - if (sub_group_size == subgroup_size) - { - return true; - } - } - return false; -} - -bool DeviceInfo::IsAdreno() const { return vendor == Vendor::kQualcomm; } - -bool DeviceInfo::IsAdreno3xx() const -{ - return IsAdreno() && IsGPUVersionInRange(adreno_info.gpu_version, 300, 400); -} - -bool DeviceInfo::IsAdreno4xx() const -{ - return IsAdreno() && IsGPUVersionInRange(adreno_info.gpu_version, 400, 500); -} - -bool DeviceInfo::IsAdreno5xx() const -{ - return IsAdreno() && IsGPUVersionInRange(adreno_info.gpu_version, 500, 600); -} - -bool DeviceInfo::IsAdreno6xx() const -{ - return IsAdreno() && IsGPUVersionInRange(adreno_info.gpu_version, 600, 700); -} - -bool DeviceInfo::IsAdreno6xxOrHigher() const -{ - return IsAdreno() && adreno_info.gpu_version >= 600; -} - -bool DeviceInfo::IsPowerVR() const { return vendor == Vendor::kPowerVR; } - -bool DeviceInfo::IsNvidia() const { return vendor == Vendor::kNvidia; } - -bool DeviceInfo::IsMali() const { return vendor == Vendor::kMali; } - -bool DeviceInfo::IsAMD() const { return vendor == Vendor::kAMD; } - -bool DeviceInfo::IsIntel() const { return vendor == Vendor::kIntel; } - -} // namespace gpu_cl -} // namespace backend -} // namespace onert diff --git a/runtime/onert/backend/gpu_cl/open_cl/DeviceInfo.h b/runtime/onert/backend/gpu_cl/open_cl/DeviceInfo.h deleted file mode 100644 index 85d7d4c80..000000000 --- a/runtime/onert/backend/gpu_cl/open_cl/DeviceInfo.h +++ /dev/null @@ -1,203 +0,0 @@ -/* - * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright 2020 The TensorFlow Authors. All Rights Reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifndef __ONERT_BACKEND_GPU_CL_OPENCL_DEVICE_INFO_H__ -#define __ONERT_BACKEND_GPU_CL_OPENCL_DEVICE_INFO_H__ - -#include <string> -#include <vector> - -#include "DataType.h" - -// for use only in device_info.cc, but keep here to make tests -int GetAdrenoGPUVersion(const std::string &gpu_version); - -namespace onert -{ -namespace backend -{ -namespace gpu_cl -{ - -enum class Vendor -{ - kQualcomm, - kMali, - kPowerVR, - kNvidia, - kAMD, - kIntel, - kUnknown -}; -std::string VendorToString(Vendor v); - -enum class OpenCLVersion -{ - UNKNOWN, - CL_1_0, - CL_1_1, - CL_1_2, - CL_2_0, - CL_2_1, - CL_2_2, - CL_3_0 -}; -std::string OpenCLVersionToString(OpenCLVersion version); - -struct AdrenoInfo -{ - AdrenoInfo() = default; - explicit AdrenoInfo(const std::string &device_version); - int gpu_version = -1; // can be, for example, 405/430/540/530/630 etc. - - // This function returns some not very documented physical parameter of - // Adreno6xx GPU. - // We obtained it using Snapdragon Profiler. - int GetMaximumWavesCount() const; - - // returns amount of register memory per CU(Compute Unit) in bytes. - int GetRegisterMemorySizePerComputeUnit() const; - - // returns maximum possible amount of waves based on register usage. - int GetMaximumWavesCount(int register_footprint_per_tread, bool full_wave = true) const; - - int GetWaveSize(bool full_wave) const; - - // Not supported on some Adreno devices with specific driver version. - // b/131099086 - bool support_one_layer_texture_array = true; -}; - -enum class MaliGPU -{ - T604, - T622, - T624, - T628, - T658, - T678, - T720, - T760, - T820, - T830, - T860, - T880, - G31, - G51, - G71, - G52, - G72, - G76, - G57, - G77, - G68, - G78, - UNKNOWN -}; - -struct MaliInfo -{ - MaliInfo() = default; - explicit MaliInfo(const std::string &device_name); - MaliGPU gpu_version = MaliGPU::UNKNOWN; - - bool IsMaliT6xx() const; - bool IsMaliT7xx() const; - bool IsMaliT8xx() const; - bool IsMidgard() const; - bool IsBifrostGen1() const; - bool IsBifrostGen2() const; - bool IsBifrostGen3() const; - bool IsBifrost() const; - bool IsValhall() const; -}; - -struct DeviceInfo -{ - DeviceInfo() = default; - - bool IsAdreno() const; - bool IsAdreno3xx() const; - bool IsAdreno4xx() const; - bool IsAdreno5xx() const; - bool IsAdreno6xx() const; - bool IsAdreno6xxOrHigher() const; - bool IsPowerVR() const; - bool IsNvidia() const; - bool IsMali() const; - bool IsAMD() const; - bool IsIntel() const; - - bool SupportsTextureArray() const; - bool SupportsImageBuffer() const; - bool SupportsImage3D() const; - - bool SupportsFloatImage2D(DataType data_type, int channels) const; - - // To track bug on some Adreno. 
b/131099086
- bool SupportsOneLayerTextureArray() const;
-
- bool SupportsExtension(const std::string &extension) const;
- bool IsCL20OrHigher() const;
- bool SupportsSubGroupWithSize(int sub_group_size) const;
-
- std::vector<std::string> extensions;
- bool supports_fp16 = false;
- bool supports_image3d_writes = false;
- Vendor vendor = Vendor::kUnknown;
- OpenCLVersion cl_version = OpenCLVersion::UNKNOWN;
- int compute_units_count = 0;
- uint64_t buffer_max_size = 0;
- uint64_t image2d_max_width = 0;
- uint64_t image2d_max_height = 0;
- uint64_t image_buffer_max_size = 0;
- uint64_t image_array_max_layers = 0;
- uint64_t image3d_max_width = 0;
- uint64_t image3d_max_height = 0;
- uint64_t image3d_max_depth = 0;
- int max_work_group_size_x = 0;
- int max_work_group_size_y = 0;
- int max_work_group_size_z = 0;
- std::vector<int> supported_subgroup_sizes;
-
- // rtn is ROUND_TO_NEAREST
- // with rtn, precision is much better than with rtz (ROUND_TO_ZERO)
- // Adreno 3xx supports only rtz; Adreno 4xx and newer support rtn
- // Mali from T6xx supports rtn
- // PowerVR supports only rtz
- bool supports_fp32_rtn = false;
- bool supports_fp16_rtn = false;
-
- bool supports_r_f16_tex2d = false;
- bool supports_rg_f16_tex2d = false;
- bool supports_rgb_f16_tex2d = false;
- bool supports_rgba_f16_tex2d = false;
-
- bool supports_r_f32_tex2d = false;
- bool supports_rg_f32_tex2d = false;
- bool supports_rgb_f32_tex2d = false;
- bool supports_rgba_f32_tex2d = false;
-
- AdrenoInfo adreno_info;
- MaliInfo mali_info;
-};
-
-} // namespace gpu_cl
-} // namespace backend
-} // namespace onert
-
-#endif // __ONERT_BACKEND_GPU_CL_OPENCL_DEVICE_INFO_H__
diff --git a/runtime/onert/backend/gpu_cl/open_cl/Environment.cc b/runtime/onert/backend/gpu_cl/open_cl/Environment.cc
deleted file mode 100644
index b558f0377..000000000
--- a/runtime/onert/backend/gpu_cl/open_cl/Environment.cc
+++ /dev/null
@@ -1,276 +0,0 @@
-/*
- * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright 2019 The TensorFlow Authors. All Rights Reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */ - -#include "Environment.h" - -#include <string> -#include <vector> - -#include "Util.h" -#include "Shape.h" -#include "Status.h" - -namespace onert -{ -namespace backend -{ -namespace gpu_cl -{ - -Environment::Environment(CLDevice &&device, CLContext &&context, CLCommandQueue &&queue, - ProfilingCommandQueue &&profiling_queue) - : device_(std::move(device)), context_(std::move(context)), queue_(std::move(queue)), - profiling_queue_(std::move(profiling_queue)) -{ -} - -Environment::Environment(Environment &&environment) - : device_(std::move(environment.device_)), context_(std::move(environment.context_)), - queue_(std::move(environment.queue_)), - profiling_queue_(std::move(environment.profiling_queue_)), - program_cache_(std::move(environment.program_cache_)) -{ -} - -Environment &Environment::operator=(Environment &&environment) -{ - if (this != &environment) - { - device_ = std::move(environment.device_); - context_ = std::move(environment.context_); - queue_ = std::move(environment.queue_); - profiling_queue_ = std::move(environment.profiling_queue_); - program_cache_ = std::move(environment.program_cache_); - } - return *this; -} - -absl::Status Environment::Init() -{ - if (device().IsAdreno() && device().SupportsTextureArray()) - { - // Some Adreno < 600 have bug with one layer texture array. b/131099086 - // If we have one layer texture array and will write smt from kernel to this - // texture, we will get zeroes instead of actual values. - // The same kernel will work, if we use texture array with more than one - // layer. - if (device().info_.adreno_info.gpu_version < 600) - { - GetDevicePtr()->DisableOneLayerTextureArray(); - } - } - return absl::OkStatus(); -} - -void Environment::SetHighPerformance() const -{ - // TODO(sorokin) use cl_perf_hint if available -} - -void Environment::SetDefaultPerformance() const -{ - // TODO(sorokin) use cl_perf_hint if available -} - -void Environment::SetLowPerformance() const -{ - // TODO(sorokin) use cl_perf_hint if available -} - -std::vector<CalculationsPrecision> Environment::GetSupportedPrecisions() const -{ - std::vector<CalculationsPrecision> precisions; - for (CalculationsPrecision precision : - {CalculationsPrecision::F32, CalculationsPrecision::F32_F16, CalculationsPrecision::F16}) - { - if (IsSupported(precision)) - { - precisions.push_back(precision); - } - } - return precisions; -} - -bool Environment::IsSupported(CalculationsPrecision precision) const -{ - switch (precision) - { - case CalculationsPrecision::F32_F16: - case CalculationsPrecision::F16: - return device_.SupportsFP16(); - case CalculationsPrecision::F32: - return true; - } - return false; -} - -std::vector<TensorStorageType> Environment::GetSupportedStorages() const -{ - std::vector<TensorStorageType> storage_types; - for (auto storage_type : - {TensorStorageType::TEXTURE_2D, TensorStorageType::BUFFER, TensorStorageType::TEXTURE_ARRAY, - TensorStorageType::IMAGE_BUFFER, TensorStorageType::TEXTURE_3D}) - { - if (IsSupported(storage_type)) - { - storage_types.push_back(storage_type); - } - } - return storage_types; -} - -std::vector<TensorStorageType> Environment::GetSupportedStoragesWithHWZeroClampSupport() const -{ - std::vector<TensorStorageType> storage_types; - for (auto storage_type : {TensorStorageType::TEXTURE_2D, TensorStorageType::TEXTURE_ARRAY, - TensorStorageType::TEXTURE_3D}) - { - if (IsSupported(storage_type)) - { - storage_types.push_back(storage_type); - } - } - return storage_types; -} - -bool Environment::IsSupported(TensorStorageType 
storage_type) const -{ - switch (storage_type) - { - case TensorStorageType::TEXTURE_2D: - return !device_.IsAMD(); - case TensorStorageType::BUFFER: - return true; - case TensorStorageType::TEXTURE_ARRAY: - return !device_.IsAMD() && device_.SupportsTextureArray(); - case TensorStorageType::IMAGE_BUFFER: - return (device_.IsAdreno() || device_.IsAMD() || device_.IsNvidia()) && - device_.SupportsImageBuffer(); - case TensorStorageType::TEXTURE_3D: - return !device_.IsAMD() && device_.SupportsImage3D(); - case TensorStorageType::SINGLE_TEXTURE_2D: - return false; - case TensorStorageType::UNKNOWN: - return false; - } - return false; -} - -TensorStorageType GetFastestStorageType(const DeviceInfo &gpu_info) -{ - if (gpu_info.IsAdreno()) - { - if (gpu_info.IsAdreno6xxOrHigher()) - { - return TensorStorageType::TEXTURE_ARRAY; - } - else - { - return TensorStorageType::TEXTURE_2D; - } - } - else if (gpu_info.IsPowerVR()) - { - return TensorStorageType::TEXTURE_2D; - } - else if (gpu_info.IsMali()) - { - const MaliInfo mali_info = gpu_info.mali_info; - if (mali_info.IsMaliT8xx() || mali_info.IsBifrostGen3() || mali_info.IsValhall()) - { - return TensorStorageType::TEXTURE_2D; - } - else - { - return TensorStorageType::BUFFER; - } - } - else if (gpu_info.IsNvidia()) - { - return gpu_info.SupportsImageBuffer() ? TensorStorageType::IMAGE_BUFFER - : TensorStorageType::BUFFER; - } - else if (gpu_info.IsAMD()) - { - return gpu_info.SupportsImageBuffer() ? TensorStorageType::IMAGE_BUFFER - : TensorStorageType::BUFFER; - } - else if (gpu_info.IsIntel()) - { - return TensorStorageType::BUFFER; - } - return TensorStorageType::BUFFER; -} - -TensorStorageType GetStorageTypeWithMinimalMemoryConsumption(const DeviceInfo &gpu_info) -{ - if (gpu_info.IsAdreno()) - { - if (gpu_info.IsAdreno3xx() || gpu_info.IsAdreno4xx()) - { - return TensorStorageType::BUFFER; - } - else - { - return TensorStorageType::IMAGE_BUFFER; - } - } - else if (gpu_info.IsPowerVR()) - { - return TensorStorageType::BUFFER; - } - else if (gpu_info.IsMali()) - { - return TensorStorageType::BUFFER; - } - else if (gpu_info.IsNvidia()) - { - return gpu_info.SupportsImageBuffer() ? TensorStorageType::IMAGE_BUFFER - : TensorStorageType::BUFFER; - } - else if (gpu_info.IsAMD()) - { - return gpu_info.SupportsImageBuffer() ? TensorStorageType::IMAGE_BUFFER - : TensorStorageType::BUFFER; - } - else if (gpu_info.IsIntel()) - { - return TensorStorageType::BUFFER; - } - return TensorStorageType::BUFFER; -} - -absl::Status CreateEnvironment(Environment *result) -{ - CLDevice gpu; - RETURN_IF_ERROR(CreateDefaultGPUDevice(&gpu)); - - CLContext context; - RETURN_IF_ERROR(CreateCLContext(gpu, &context)); - CLCommandQueue queue; - RETURN_IF_ERROR(CreateCLCommandQueue(gpu, context, &queue)); - ProfilingCommandQueue profiling_queue; - RETURN_IF_ERROR(CreateProfilingCommandQueue(gpu, context, &profiling_queue)); - - *result = - Environment(std::move(gpu), std::move(context), std::move(queue), std::move(profiling_queue)); - return result->Init(); -} - -} // namespace gpu_cl -} // namespace backend -} // namespace onert diff --git a/runtime/onert/backend/gpu_cl/open_cl/Environment.h b/runtime/onert/backend/gpu_cl/open_cl/Environment.h deleted file mode 100644 index 47866b563..000000000 --- a/runtime/onert/backend/gpu_cl/open_cl/Environment.h +++ /dev/null @@ -1,90 +0,0 @@ -/* - * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright 2019 The TensorFlow Authors. All Rights Reserved. 
- * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef __ONERT_BACKEND_GPU_CL_OPENCL_ENVIRONMENT_H__ -#define __ONERT_BACKEND_GPU_CL_OPENCL_ENVIRONMENT_H__ - -#include "ClCommandQueue.h" -#include "ClContext.h" -#include "ClDevice.h" -#include "DeviceInfo.h" -#include "Precision.h" -#include "TensorType.h" -#include "DataType.h" -#include "ProgramCache.h" -#include "Status.h" - -namespace onert -{ -namespace backend -{ -namespace gpu_cl -{ - -class Environment -{ -public: - Environment() = default; - explicit Environment(CLDevice &&device, CLContext &&context, CLCommandQueue &&queue, - ProfilingCommandQueue &&profiling_queue); - // Move only - Environment(Environment &&environment); - Environment &operator=(Environment &&environment); - Environment(const Environment &) = delete; - Environment &operator=(const Environment &) = delete; - - const CLDevice &device() const { return device_; } - CLDevice *GetDevicePtr() { return &device_; } - const CLDevice *GetDevicePtr() const { return &device_; } - CLContext &context() { return context_; } - CLCommandQueue *queue() { return &queue_; } - ProfilingCommandQueue *profiling_queue() { return &profiling_queue_; } - ProgramCache *program_cache() { return &program_cache_; } - const ProgramCache *program_cache() const { return &program_cache_; } - - std::vector<CalculationsPrecision> GetSupportedPrecisions() const; - bool IsSupported(CalculationsPrecision precision) const; - std::vector<TensorStorageType> GetSupportedStorages() const; - // returns storage types that support zero clamping when reading OOB in HW - // (Height/Width) dimensions. - std::vector<TensorStorageType> GetSupportedStoragesWithHWZeroClampSupport() const; - bool IsSupported(TensorStorageType storage_type) const; - - absl::Status Init(); - - void SetHighPerformance() const; - void SetDefaultPerformance() const; - void SetLowPerformance() const; // for energy saving - -private: - CLDevice device_; - CLContext context_; - CLCommandQueue queue_; - ProfilingCommandQueue profiling_queue_; - ProgramCache program_cache_; -}; - -TensorStorageType GetFastestStorageType(const DeviceInfo &gpu_info); -TensorStorageType GetStorageTypeWithMinimalMemoryConsumption(const DeviceInfo &gpu_info); - -absl::Status CreateEnvironment(Environment *result); - -} // namespace gpu_cl -} // namespace backend -} // namespace onert - -#endif // __ONERT_BACKEND_GPU_CL_OPENCL_ENVIRONMENT_H__ diff --git a/runtime/onert/backend/gpu_cl/open_cl/GpuObject.h b/runtime/onert/backend/gpu_cl/open_cl/GpuObject.h deleted file mode 100644 index a31630235..000000000 --- a/runtime/onert/backend/gpu_cl/open_cl/GpuObject.h +++ /dev/null @@ -1,222 +0,0 @@ -/* - * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright 2020 The TensorFlow Authors. All Rights Reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef __ONERT_BACKEND_GPU_CL_OPENCL_GPU_OBJECT_H__ -#define __ONERT_BACKEND_GPU_CL_OPENCL_GPU_OBJECT_H__ - -#include <map> -#include <memory> -#include <string> -#include <vector> - -#include "ClContext.h" -#include "OpenclWrapper.h" -#include "AccessType.h" -#include "DataType.h" - -namespace onert -{ -namespace backend -{ -namespace gpu_cl -{ - -struct GPUImage2DDescriptor -{ - DataType data_type = DataType::UNKNOWN; - AccessType access_type = AccessType::UNKNOWN; - cl_mem memory = nullptr; -}; - -struct GPUImage3DDescriptor -{ - DataType data_type = DataType::UNKNOWN; - AccessType access_type = AccessType::UNKNOWN; - cl_mem memory = nullptr; -}; - -struct GPUImage2DArrayDescriptor -{ - DataType data_type = DataType::UNKNOWN; - AccessType access_type = AccessType::UNKNOWN; - cl_mem memory = nullptr; -}; - -struct GPUImageBufferDescriptor -{ - DataType data_type = DataType::UNKNOWN; - AccessType access_type = AccessType::UNKNOWN; - cl_mem memory = nullptr; -}; - -struct GPUCustomMemoryDescriptor -{ - std::string type_name = ""; - cl_mem memory = nullptr; -}; - -enum class MemoryType -{ - GLOBAL, - CONSTANT, - LOCAL -}; - -std::string MemoryTypeToCLType(MemoryType type); - -struct GPUBufferDescriptor -{ - DataType data_type = DataType::UNKNOWN; - AccessType access_type = AccessType::UNKNOWN; - int element_size = 0; - MemoryType memory_type = MemoryType::GLOBAL; - std::vector<std::string> attributes; - cl_mem memory = nullptr; -}; - -struct GPUResources -{ - std::vector<std::string> ints; - std::vector<std::string> floats; - std::vector<std::pair<std::string, GPUBufferDescriptor>> buffers; - std::vector<std::pair<std::string, GPUImage2DDescriptor>> images2d; - std::vector<std::pair<std::string, GPUImage2DArrayDescriptor>> image2d_arrays; - std::vector<std::pair<std::string, GPUImage3DDescriptor>> images3d; - std::vector<std::pair<std::string, GPUImageBufferDescriptor>> image_buffers; - std::vector<std::pair<std::string, GPUCustomMemoryDescriptor>> custom_memories; - - std::vector<std::string> GetNames() const - { - std::vector<std::string> names = ints; - names.insert(names.end(), floats.begin(), floats.end()); - for (const auto &obj : buffers) - { - names.push_back(obj.first); - } - for (const auto &obj : images2d) - { - names.push_back(obj.first); - } - for (const auto &obj : image2d_arrays) - { - names.push_back(obj.first); - } - for (const auto &obj : images3d) - { - names.push_back(obj.first); - } - for (const auto &obj : image_buffers) - { - names.push_back(obj.first); - } - for (const auto &obj : custom_memories) - { - names.push_back(obj.first); - } - return names; - } -}; - -struct GPUResourcesWithValue -{ - std::vector<std::pair<std::string, int>> ints; - std::vector<std::pair<std::string, float>> floats; - std::vector<std::pair<std::string, cl_mem>> buffers; - std::vector<std::pair<std::string, cl_mem>> images2d; - std::vector<std::pair<std::string, cl_mem>> image2d_arrays; - std::vector<std::pair<std::string, cl_mem>> images3d; - std::vector<std::pair<std::string, cl_mem>> image_buffers; - std::vector<std::pair<std::string, cl_mem>> custom_memories; 
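// Mirrors GPUResources above: the same categories, but each name is now
// paired with its concrete runtime value (an int/float scalar or a bound
// cl_mem handle).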
-}; - -class GPUObject; - -class GPUObjectDescriptor -{ -public: - GPUObjectDescriptor() = default; - GPUObjectDescriptor(const GPUObjectDescriptor &) = default; - GPUObjectDescriptor &operator=(const GPUObjectDescriptor &) = default; - GPUObjectDescriptor(GPUObjectDescriptor &&obj_desc) : state_vars_(std::move(obj_desc.state_vars_)) - { - } - GPUObjectDescriptor &operator=(GPUObjectDescriptor &&obj_desc) - { - if (this != &obj_desc) - { - state_vars_ = std::move(obj_desc.state_vars_); - } - return *this; - } - virtual ~GPUObjectDescriptor() = default; - - void SetStateVar(const std::string &key, const std::string &value) const - { - state_vars_[key] = value; - } - - virtual std::string PerformConstExpr(const std::string &) const { return ""; } - - virtual absl::Status PerformSelector(const std::string &, const std::vector<std::string> &, - const std::vector<std::string> &, std::string *result) const - { - *result = ""; - return absl::OkStatus(); - } - virtual GPUResources GetGPUResources() const { return GPUResources(); } - - virtual absl::Status CreateGPUObject(CLContext *, std::unique_ptr<GPUObject> *) const - { - return absl::OkStatus(); - } - virtual void Release() {} - - void SetAccess(AccessType access_type) { access_type_ = access_type; } - AccessType GetAccess() const { return access_type_; } - -protected: - // friend flatbuffers::Offset<data::GPUObjectDescriptor> Encode( - // const GPUObjectDescriptor& desc, flatbuffers::FlatBufferBuilder* builder); - // friend void Decode(const data::GPUObjectDescriptor* fb_obj, - // GPUObjectDescriptor* obj); - mutable std::map<std::string, std::string> state_vars_; - AccessType access_type_ = AccessType::UNKNOWN; -}; - -using GPUObjectDescriptorPtr = std::unique_ptr<GPUObjectDescriptor>; - -class GPUObject -{ -public: - GPUObject() = default; - // Move only - GPUObject(GPUObject &&obj_desc) = default; - GPUObject &operator=(GPUObject &&obj_desc) = default; - GPUObject(const GPUObject &) = delete; - GPUObject &operator=(const GPUObject &) = delete; - virtual ~GPUObject() = default; - virtual absl::Status GetGPUResources(const GPUObjectDescriptor *obj_ptr, - GPUResourcesWithValue *resources) const = 0; -}; - -using GPUObjectPtr = std::unique_ptr<GPUObject>; - -} // namespace gpu_cl -} // namespace backend -} // namespace onert - -#endif // __ONERT_BACKEND_GPU_CL_OPENCL_GPU_OBJECT_H__ diff --git a/runtime/onert/backend/gpu_cl/open_cl/InferenceContext.cc b/runtime/onert/backend/gpu_cl/open_cl/InferenceContext.cc deleted file mode 100644 index afb7e2950..000000000 --- a/runtime/onert/backend/gpu_cl/open_cl/InferenceContext.cc +++ /dev/null @@ -1,71 +0,0 @@ -/* - * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright 2019 The TensorFlow Authors. All Rights Reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "InferenceContext.h" - -#include <algorithm> -#include <cmath> -#include <cstdint> -#include <map> -#include <memory> -#include <string> -#include <vector> -#include <unordered_map> - -#include "Buffer.h" -#include "ClDevice.h" - -#include "kernels/GpuOperation.h" -#include "ModelHints.h" -#include "Precision.h" -#include "StorageTypeUtil.h" -#include "TensorType.h" -#include "DataType.h" -#include "Model.h" -#include "Operations.h" -#include "Shape.h" -#include "Types.h" -#include "Util.h" - -namespace onert -{ -namespace backend -{ -namespace gpu_cl -{ - -CLNode::CLNode(CLNode &&node) - : operation(std::move(node.operation)), inputs(std::move(node.inputs)), - outputs(std::move(node.outputs)), name(std::move(node.name)) -{ -} - -CLNode &CLNode::operator=(CLNode &&node) -{ - if (this != &node) - { - operation = std::move(node.operation); - inputs = std::move(node.inputs); - outputs = std::move(node.outputs); - name = std::move(node.name); - } - return *this; -} - -} // namespace gpu_cl -} // namespace backend -} // namespace onert diff --git a/runtime/onert/backend/gpu_cl/open_cl/InferenceContext.h b/runtime/onert/backend/gpu_cl/open_cl/InferenceContext.h deleted file mode 100644 index ebe2c5313..000000000 --- a/runtime/onert/backend/gpu_cl/open_cl/InferenceContext.h +++ /dev/null @@ -1,143 +0,0 @@ -/* - * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright 2019 The TensorFlow Authors. All Rights Reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef __ONERT_BACKEND_GPU_CL_OPENCL_INFERENCE_CONTEXT_H__ -#define __ONERT_BACKEND_GPU_CL_OPENCL_INFERENCE_CONTEXT_H__ - -#include <cstdint> -#include <functional> -#include <map> -#include <memory> -#include <vector> -#include <unordered_map> - -#include "Buffer.h" -#include "ClCommandQueue.h" -#include "Environment.h" -#include "GpuObject.h" -#include "kernels/GpuOperation.h" -#include "ModelHints.h" -#include "OpenclWrapper.h" -#include "Precision.h" -#include "TensorType.h" -#include "Model.h" -#include "InternalTensor.h" - -namespace onert -{ -namespace backend -{ -namespace gpu_cl -{ - -struct CLNode -{ - std::unique_ptr<GPUOperation> operation; - std::vector<ValueId> inputs; - std::vector<ValueId> outputs; - - // Mostly for debug purposes. 
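// (typically the kernel name of the underlying GPUOperation, kept so that
// profiling and error output stays readable)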
- std::string name;
-
-  CLNode() = default;
-
-  CLNode(CLNode &&node);
-  CLNode &operator=(CLNode &&node);
-  CLNode(const CLNode &) = delete;
-  CLNode &operator=(const CLNode &) = delete;
-};
-
-class InferenceContext
-{
-public:
-  struct CreateInferenceInfo
-  {
-    CalculationsPrecision precision;
-    TensorStorageType storage_type;
-    ModelHints hints;
-  };
-
-  struct DummyTensor
-  {
-    BHWC shape;
-    TensorDescriptor descriptor;
-
-    bool operator==(const DummyTensor &b) const
-    {
-      return shape == b.shape && descriptor == b.descriptor;
-    }
-  };
-
-  class TensorReserver
-  {
-  public:
-    // Take the shared_ptr by non-const value so that std::move below actually
-    // moves; moving from a const value silently copies instead.
-    ValueId Add(std::shared_ptr<DummyTensor> dummy)
-    {
-      reservations_[next_] = std::move(dummy);
-      return next_++;
-    }
-    void Add(ValueId id, std::shared_ptr<DummyTensor> dummy)
-    {
-      reservations_[id] = std::move(dummy);
-    }
-    void SetNext(ValueId id) { next_ = id; }
-    bool HaveTensor(ValueId id) { return reservations_.find(id) != reservations_.end(); }
-    std::shared_ptr<DummyTensor> Get(ValueId id) { return reservations_[id]; }
-
-    std::vector<std::pair<ValueId, TensorDescriptor>> GetTensorDescs() const
-    {
-      std::vector<std::pair<ValueId, TensorDescriptor>> result;
-      for (auto &v : reservations_)
-      {
-        TensorDescriptor desc = v.second->descriptor;
-        desc.shape.b = v.second->shape.b;
-        desc.shape.h = v.second->shape.h;
-        desc.shape.w = v.second->shape.w;
-        desc.shape.d = 1;
-        desc.shape.c = v.second->shape.c;
-        result.push_back({v.first, desc});
-      }
-      return result;
-    }
-
-    void Add(const std::vector<std::pair<ValueId, TensorDescriptor>> &tensors)
-    {
-      for (auto &v : tensors)
-      {
-        auto dummy = std::make_shared<DummyTensor>();
-        dummy->descriptor = v.second;
-        dummy->shape.b = v.second.shape.b;
-        dummy->shape.h = v.second.shape.h;
-        dummy->shape.w = v.second.shape.w;
-        dummy->shape.c = v.second.shape.c;
-        Add(v.first, dummy);
-      }
-    }
-
-  private:
-    std::unordered_map<ValueId, std::shared_ptr<DummyTensor>> reservations_;
-    ValueId next_ = 0;
-  };
-
-private:
-};
-
-} // namespace gpu_cl
-} // namespace backend
-} // namespace onert
-
-#endif // __ONERT_BACKEND_GPU_CL_OPENCL_INFERENCE_CONTEXT_H__
diff --git a/runtime/onert/backend/gpu_cl/open_cl/InternalTensor.h b/runtime/onert/backend/gpu_cl/open_cl/InternalTensor.h
deleted file mode 100644
index f0423db86..000000000
--- a/runtime/onert/backend/gpu_cl/open_cl/InternalTensor.h
+++ /dev/null
@@ -1,106 +0,0 @@
-/*
- * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright 2019 The TensorFlow Authors. All Rights Reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __ONERT_BACKEND_GPU_CL_OPENCL_INTERNAL_TENSOR_H__
-#define __ONERT_BACKEND_GPU_CL_OPENCL_INTERNAL_TENSOR_H__
-
-#include <stdint.h>
-
-#include <vector>
-
-#include "DataType.h"
-#include "Shape.h"
-
-namespace onert
-{
-namespace backend
-{
-namespace gpu_cl
-{
-namespace internal_tensor
-{
-
-// Meta function: given an element type, returns the container type used to
-// hold tensor data.
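// A minimal sketch of how this meta function is used (illustrative only, not
// part of the original header): with the specializations that follow,
//   using F32Storage = internal_tensor::StorageType<DataType::FLOAT32>::value;
//   static_assert(std::is_same<F32Storage, std::vector<float>>::value,
//                 "FLOAT32 tensors are backed by std::vector<float>");
// so InternalTensor<Shape, Type> selects its backing container at compile time
// from the element type alone.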
-template <DataType Type> struct StorageType; - -template <> struct StorageType<DataType::FLOAT32> -{ - using value = std::vector<float>; -}; - -template <> struct StorageType<DataType::INT32> -{ - using value = std::vector<int32_t>; -}; - -} // namespace internal_tensor - -template <typename ShapeT, DataType Type> struct InternalTensor -{ - using ShapeType = ShapeT; - - constexpr static DataType kType = Type; - - using TensorStorageType = typename internal_tensor::StorageType<Type>::value; - - // Opaque id of a tensor. - int64_t id = -1; - - ShapeType shape; - - TensorStorageType data; -}; - -// TensorRef is a reference to another tensor. If an object should never hold -// tensor data, then TensorRef should be used instead. -template <typename ShapeT> struct TensorRef -{ - using ShapeType = ShapeT; - - DataType type = DataType::UNKNOWN; - - ShapeT shape; - - // Opaque reference to a tensor. Upstream component is responsible for - // resolving this reference into an actual tensor. - int64_t ref = -1; - - // Specifies if the tensor should be a variable input tensor that must be an - // output as well as an input to the graph. - bool is_variable_input = false; -}; - -template <typename ShapeT, DataType Type> constexpr DataType InternalTensor<ShapeT, Type>::kType; - -template <typename ShapeT, DataType Type> -InternalTensor<ShapeT, Type> MakeZeroTensor(const ShapeT &shape) -{ - InternalTensor<ShapeT, Type> tensor; - tensor.shape = shape; - tensor.data = - typename InternalTensor<ShapeT, Type>::TensorStorageType(shape.DimensionsProduct(), 0); - return tensor; -} - -using TensorFloat32 = InternalTensor<BHWC, DataType::FLOAT32>; -using Tensor5DFloat32 = InternalTensor<BHWDC, DataType::FLOAT32>; - -} // namespace gpu_cl -} // namespace backend -} // namespace onert - -#endif // __ONERT_BACKEND_GPU_CL_OPENCL_INTERNAL_TENSOR_H__ diff --git a/runtime/onert/backend/gpu_cl/open_cl/LinearStorage.cc b/runtime/onert/backend/gpu_cl/open_cl/LinearStorage.cc deleted file mode 100644 index 3889d4369..000000000 --- a/runtime/onert/backend/gpu_cl/open_cl/LinearStorage.cc +++ /dev/null @@ -1,265 +0,0 @@ -/* - * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright 2019 The TensorFlow Authors. All Rights Reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */
-
-#include "LinearStorage.h"
-
-#include "absl/strings/str_cat.h"
-#include "DataType.h"
-#include "Status.h"
-
-namespace onert
-{
-namespace backend
-{
-namespace gpu_cl
-{
-
-TensorLinearDescriptor::TensorLinearDescriptor(TensorLinearDescriptor &&desc)
-  : GPUObjectDescriptor(std::move(desc)), storage_type(desc.storage_type),
-    element_type(desc.element_type), memory_type(desc.memory_type), size(desc.size),
-    data(std::move(desc.data))
-{
-}
-
-TensorLinearDescriptor &TensorLinearDescriptor::operator=(TensorLinearDescriptor &&desc)
-{
-  if (this != &desc)
-  {
-    std::swap(storage_type, desc.storage_type);
-    std::swap(element_type, desc.element_type);
-    std::swap(memory_type, desc.memory_type);
-    std::swap(size, desc.size);
-    data = std::move(desc.data);
-    GPUObjectDescriptor::operator=(std::move(desc));
-  }
-  return *this;
-}
-
-void TensorLinearDescriptor::Release() { data.clear(); }
-
-GPUResources TensorLinearDescriptor::GetGPUResources() const
-{
-  GPUResources resources;
-  resources.ints.push_back("length");
-  if (storage_type == LinearStorageType::BUFFER)
-  {
-    GPUBufferDescriptor desc;
-    desc.data_type = element_type;
-    desc.access_type = access_type_;
-    desc.element_size = 4;
-    desc.memory_type = memory_type;
-    resources.buffers.push_back({"buffer", desc});
-  }
-  else
-  {
-    GPUImage2DDescriptor desc;
-    desc.data_type = element_type;
-    desc.access_type = access_type_;
-    resources.images2d.push_back({"tex2d", desc});
-  }
-  return resources;
-}
-
-absl::Status TensorLinearDescriptor::PerformSelector(const std::string &selector,
-                                                     const std::vector<std::string> &args,
-                                                     const std::vector<std::string> &,
-                                                     std::string *result) const
-{
-  if (selector == "Length")
-  {
-    *result = "length";
-    return absl::OkStatus();
-  }
-  else if (selector == "Read")
-  {
-    return PerformReadSelector(args, result);
-  }
-  else if (selector == "GetPtr")
-  {
-    if (storage_type != LinearStorageType::BUFFER)
-    {
-      return absl::InvalidArgumentError(
-        "GetPtr selector supported for LinearStorageType::BUFFER only.");
-    }
-    *result = "buffer";
-    return absl::OkStatus();
-  }
-  else
-  {
-    return absl::NotFoundError(
-      absl::StrCat("TensorLinearDescriptor has no selector with name - ", selector));
-  }
-}
-
-absl::Status TensorLinearDescriptor::PerformReadSelector(const std::vector<std::string> &args,
-                                                         std::string *result) const
-{
-  if (args.size() != 1)
-  {
-    return absl::NotFoundError(absl::StrCat(
-      "TensorLinearDescriptor Read requires one argument, but ", args.size(), " were passed"));
-  }
-  if (storage_type == LinearStorageType::BUFFER)
-  {
-    *result = absl::StrCat("buffer[", args[0], "]");
-    return absl::OkStatus();
-  }
-  else
-  {
-    const std::string read = element_type == DataType::FLOAT16 ? "read_imageh" : "read_imagef";
-    *result = absl::StrCat(read, "(tex2d, smp_none, (int2)(", args[0], ", 0))");
-    return absl::OkStatus();
-  }
-}
-
-absl::Status TensorLinearDescriptor::CreateGPUObject(CLContext *context, GPUObjectPtr *result) const
-{
-  LinearStorage gpu_storage;
-  RETURN_IF_ERROR(gpu_storage.CreateFromTensorLinearDescriptor(*this, context));
-  *result = absl::make_unique<LinearStorage>(std::move(gpu_storage));
-  return absl::OkStatus();
-}
-
-void TensorLinearDescriptor::UploadLinearData(const InternalTensor<Linear, DataType::FLOAT32> &src,
-                                              int aligned_size)
-{
-  size = aligned_size == 0 ? DivideRoundUp(src.shape.v, 4) : aligned_size;
-  if (element_type == DataType::FLOAT32)
-  {
-    data.resize(size * sizeof(float) * 4);
-    float *gpu_data = reinterpret_cast<float *>(data.data());
-    for (int i = 0; i < size * 4; ++i)
-    {
-      if (i < src.shape.v)
-      {
-        gpu_data[i] = src.data[i];
-      }
-      else
-      {
-        gpu_data[i] = 0.0f;
-      }
-    }
-  }
-  // TODO: FLOAT16 is not supported yet. A sketch of the missing branch:
-  //
-  // else {
-  //   data.resize(size * sizeof(half) * 4);
-  //   half* gpu_data = reinterpret_cast<half*>(data.data());
-  //   for (int i = 0; i < size * 4; ++i) {
-  //     if (i < src.shape.v) {
-  //       gpu_data[i] = src.data[i];
-  //     } else {
-  //       gpu_data[i] = 0.0f;
-  //     }
-  //   }
-  // }
-}
-
-void LinearStorage::Release()
-{
-  if (memory_)
-  {
-    clReleaseMemObject(memory_);
-    memory_ = nullptr;
-  }
-}
-
-LinearStorage::LinearStorage(LinearStorage &&storage)
-  : GPUObject(std::move(storage)), memory_(storage.memory_), depth_(storage.depth_),
-    storage_type_(storage.storage_type_)
-{
-  storage.memory_ = nullptr;
-}
-
-LinearStorage &LinearStorage::operator=(LinearStorage &&storage)
-{
-  if (this != &storage)
-  {
-    Release();
-    std::swap(memory_, storage.memory_);
-    std::swap(depth_, storage.depth_);
-    std::swap(storage_type_, storage.storage_type_);
-    GPUObject::operator=(std::move(storage));
-  }
-  return *this;
-}
-
-absl::Status LinearStorage::GetGPUResources(const GPUObjectDescriptor *obj_ptr,
-                                            GPUResourcesWithValue *resources) const
-{
-  const auto *linear_desc = dynamic_cast<const TensorLinearDescriptor *>(obj_ptr);
-  if (!linear_desc)
-  {
-    return absl::InvalidArgumentError("Expected TensorLinearDescriptor on input.");
-  }
-
-  resources->ints.push_back({"length", depth_});
-
-  if (storage_type_ == LinearStorageType::BUFFER)
-  {
-    resources->buffers.push_back({"buffer", memory_});
-  }
-  else
-  {
-    resources->images2d.push_back({"tex2d", memory_});
-  }
-
-  return absl::OkStatus();
-}
-
-absl::Status LinearStorage::CreateFromTensorLinearDescriptor(const TensorLinearDescriptor &desc,
-                                                             CLContext *context)
-{
-  storage_type_ = desc.storage_type;
-  depth_ = desc.size;
-  uint8_t *data_ptr = desc.data.empty() ? nullptr : const_cast<unsigned char *>(desc.data.data());
-  if (storage_type_ == LinearStorageType::BUFFER)
-  {
-    bool read_only = desc.memory_type == MemoryType::CONSTANT;
-    // TODO: FLOAT16 is not supported yet. A sketch of the missing case:
-    //
-    // const int float4_size = desc.element_type == DataType::FLOAT32
-    //                           ? sizeof(float) * 4
-    //                           : sizeof(half) * 4;
-    const int float4_size = sizeof(float) * 4;
-    return CreateCLBuffer(context->context(), depth_ * float4_size, read_only, data_ptr, &memory_);
-  }
-  else
-  {
-    return CreateRGBAImage2D(context->context(), depth_, 1,
-                             DataTypeToChannelType(desc.element_type), data_ptr, &memory_);
-  }
-}
-
-LinearStorageType DeduceLinearStorageType(TensorStorageType tensor_storage_type)
-{
-  if (tensor_storage_type == TensorStorageType::BUFFER)
-  {
-    return LinearStorageType::BUFFER;
-  }
-  else
-  {
-    return LinearStorageType::TEXTURE_2D;
-  }
-}
-
-} // namespace gpu_cl
-} // namespace backend
-} // namespace onert
diff --git a/runtime/onert/backend/gpu_cl/open_cl/LinearStorage.h b/runtime/onert/backend/gpu_cl/open_cl/LinearStorage.h
deleted file mode 100644
index f6c3ac82f..000000000
--- a/runtime/onert/backend/gpu_cl/open_cl/LinearStorage.h
+++ /dev/null
@@ -1,110 +0,0 @@
-/*
- * Copyright (c) 2021 Samsung Electronics Co., Ltd.
All Rights Reserved - * Copyright 2019 The TensorFlow Authors. All Rights Reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef __ONERT_BACKEND_GPU_CL_OPENCL_LINEAR_STORAGE_H__ -#define __ONERT_BACKEND_GPU_CL_OPENCL_LINEAR_STORAGE_H__ - -#include <string> -#include <utility> - -#include "absl/strings/str_cat.h" -#include "absl/types/span.h" -#include "GpuObject.h" -#include "OpenclWrapper.h" -#include "TensorType.h" -#include "Util.h" -#include "DataType.h" -#include "Status.h" -#include "Types.h" - -namespace onert -{ -namespace backend -{ -namespace gpu_cl -{ - -enum class LinearStorageType -{ - BUFFER, - TEXTURE_2D -}; - -struct TensorLinearDescriptor : public GPUObjectDescriptor -{ - LinearStorageType storage_type; - DataType element_type; // FLOAT32 or FLOAT16 - MemoryType memory_type = MemoryType::GLOBAL; // applicable for BUFFER - - // optional - int size = 0; - std::vector<uint8_t> data; - - TensorLinearDescriptor() = default; - TensorLinearDescriptor(const TensorLinearDescriptor &) = default; - TensorLinearDescriptor &operator=(const TensorLinearDescriptor &) = default; - TensorLinearDescriptor(TensorLinearDescriptor &&desc); - TensorLinearDescriptor &operator=(TensorLinearDescriptor &&desc); - - void UploadLinearData(const InternalTensor<Linear, DataType::FLOAT32> &src, int aligned_size = 0); - - absl::Status PerformSelector(const std::string &selector, const std::vector<std::string> &args, - const std::vector<std::string> &template_args, - std::string *result) const override; - - GPUResources GetGPUResources() const override; - absl::Status PerformReadSelector(const std::vector<std::string> &args, std::string *result) const; - - absl::Status CreateGPUObject(CLContext *context, GPUObjectPtr *result) const override; - void Release() override; -}; - -LinearStorageType DeduceLinearStorageType(TensorStorageType tensor_storage_type); - -// Represent GPU 1D-array of FLT4(float4/half4) values -// Can use inside texture2d or buffer -class LinearStorage : public GPUObject -{ -public: - LinearStorage() {} - ~LinearStorage() override { Release(); } - - // Move only - LinearStorage(LinearStorage &&storage); - LinearStorage &operator=(LinearStorage &&storage); - LinearStorage(const LinearStorage &) = delete; - LinearStorage &operator=(const LinearStorage &) = delete; - - absl::Status GetGPUResources(const GPUObjectDescriptor *obj_ptr, - GPUResourcesWithValue *resources) const override; - - absl::Status CreateFromTensorLinearDescriptor(const TensorLinearDescriptor &desc, - CLContext *context); - -private: - void Release(); - - cl_mem memory_ = nullptr; - int depth_; - LinearStorageType storage_type_; -}; - -} // namespace gpu_cl -} // namespace backend -} // namespace onert - -#endif // __ONERT_BACKEND_GPU_CL_OPENCL_LINEAR_STORAGE_H__ diff --git a/runtime/onert/backend/gpu_cl/open_cl/Model.h b/runtime/onert/backend/gpu_cl/open_cl/Model.h deleted file mode 100644 index f434bb22f..000000000 --- a/runtime/onert/backend/gpu_cl/open_cl/Model.h +++ /dev/null @@ -1,56 +0,0 
@@
-/*
- * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright 2019 The TensorFlow Authors. All Rights Reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __ONERT_BACKEND_GPU_CL_OPENCL_MODEL_H__
-#define __ONERT_BACKEND_GPU_CL_OPENCL_MODEL_H__
-
-#include <string>
-
-#include "absl/types/any.h"
-#include "InternalTensor.h"
-
-namespace onert
-{
-namespace backend
-{
-namespace gpu_cl
-{
-
-// This is yet another representation of a CNN graph; its primary purpose is to
-// simplify graph manipulation.
-
-using ValueId = uint32_t;
-
-// Used to emulate quantized behavior.
-struct QuantizationParams
-{
-  float min = 0;
-  float max = 0;
-  float scale = 0;
-};
-
-struct Operation
-{
-  std::string type;
-  absl::any attributes;
-};
-
-} // namespace gpu_cl
-} // namespace backend
-} // namespace onert
-
-#endif // __ONERT_BACKEND_GPU_CL_OPENCL_MODEL_H__
diff --git a/runtime/onert/backend/gpu_cl/open_cl/ModelHints.h b/runtime/onert/backend/gpu_cl/open_cl/ModelHints.h
deleted file mode 100644
index 474c56b2a..000000000
--- a/runtime/onert/backend/gpu_cl/open_cl/ModelHints.h
+++ /dev/null
@@ -1,67 +0,0 @@
-/*
- * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright 2019 The TensorFlow Authors. All Rights Reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __ONERT_BACKEND_GPU_CL_OPENCL_MODEL_HINTS_H__
-#define __ONERT_BACKEND_GPU_CL_OPENCL_MODEL_HINTS_H__
-
-#include <cstdint>
-
-namespace onert
-{
-namespace backend
-{
-namespace gpu_cl
-{
-
-struct ModelHints
-{
-  using ModelHint = uint64_t;
-
-  // By default we want the fastest inference.
-  static constexpr ModelHint kFastestInference = 0x00000000;
-  // Can improve compilation time, but inference can be slower.
-  static constexpr ModelHint kReduceKernelsCount = 0x00000001;
-  // Can improve tuning time, but inference can be slower.
-  static constexpr ModelHint kFastTuning = 0x00000002;
-
-  // Experimental.
-  // Can improve performance and memory consumption, but can slow down
-  // initialization a lot and create more kernels.
- static constexpr ModelHint kAllowSpecialKernels = 0x00000004; - - void Add(ModelHint hint) - { - if (hint == kFastestInference) - { - hints = kFastestInference; - } - else - { - hints |= hint; - } - } - - bool Check(ModelHint hint) const { return hints & hint; } - - uint64_t hints = kFastestInference; -}; - -} // namespace gpu_cl -} // namespace backend -} // namespace onert - -#endif // __ONERT_BACKEND_GPU_CL_OPENCL_MODEL_HINTS_H__ diff --git a/runtime/onert/backend/gpu_cl/open_cl/OpenclWrapper.cc b/runtime/onert/backend/gpu_cl/open_cl/OpenclWrapper.cc deleted file mode 100644 index dbaf6faf6..000000000 --- a/runtime/onert/backend/gpu_cl/open_cl/OpenclWrapper.cc +++ /dev/null @@ -1,407 +0,0 @@ -/* - * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright 2019 The TensorFlow Authors. All Rights Reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#if defined(_WIN32) -#define __WINDOWS__ -#endif - -#include "OpenclWrapper.h" - -#ifdef __WINDOWS__ -#include <windows.h> -#else -#include <dlfcn.h> -#endif - -#include <string> - -#include "absl/strings/str_cat.h" -#include "Status.h" - -namespace onert -{ -namespace backend -{ -namespace gpu_cl -{ - -#ifdef __ANDROID__ -#define LoadFunction(function) \ - if (use_wrapper) \ - { \ - function = reinterpret_cast<PFN_##function>(loadOpenCLPointer(#function)); \ - } \ - else \ - { \ - function = reinterpret_cast<PFN_##function>(dlsym(*libopencl, #function)); \ - } -#elif defined(__WINDOWS__) -#define LoadFunction(function) \ - function = reinterpret_cast<PFN_##function>(GetProcAddress(libopencl, #function)); -#else -#define LoadFunction(function) \ - function = reinterpret_cast<PFN_##function>(dlsym(*libopencl, #function)); -#endif - -#ifdef __WINDOWS__ -void LoadOpenCLFunctions(HMODULE libopencl); -#else -void LoadOpenCLFunctions(void **libopencl, bool use_wrapper); -#endif - -absl::Status LoadOpenCL(void **libopencl) -{ -#ifdef __WINDOWS__ - HMODULE libopencl = LoadLibraryA("OpenCL.dll"); - if (libopencl) - { - LoadOpenCLFunctions(libopencl); - return absl::OkStatus(); - } - else - { - DWORD error_code = GetLastError(); - return absl::UnknownError( - absl::StrCat("Can not open OpenCL library on this device, error code - ", error_code)); - } -#else - *libopencl = dlopen("libOpenCL.so", RTLD_NOW | RTLD_LOCAL); - if (*libopencl) - { - LoadOpenCLFunctions(libopencl, false); - return absl::OkStatus(); - } - // record error - std::string error(dlerror()); -#ifdef __ANDROID__ - // Pixel phone or auto? 
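// A minimal sketch (illustrative, not from this file) of the probing pattern
// the fallback below hand-unrolls: try vendor-specific loader names in order
// and stop at the first one that dlopen() accepts, e.g.
//   for (const char *so : {"libOpenCL-pixel.so", "libOpenCL-car.so"})
//     if ((*libopencl = dlopen(so, RTLD_NOW | RTLD_LOCAL))) break;
// Pixel-style wrappers also export an enableOpenCL() entry point that must be
// called before symbols are resolved, which is why the wrapper path resolves
// functions through loadOpenCLPointer() instead of plain dlsym().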
- *libopencl = dlopen("libOpenCL-pixel.so", RTLD_NOW | RTLD_LOCAL); - if (!*libopencl) - { - *libopencl = dlopen("libOpenCL-car.so", RTLD_NOW | RTLD_LOCAL); - } - if (*libopencl) - { - typedef void (*enableOpenCL_t)(); - enableOpenCL_t enableOpenCL = - reinterpret_cast<enableOpenCL_t>(dlsym(*libopencl, "enableOpenCL")); - enableOpenCL(); - LoadOpenCLFunctions(libopencl, true); - return absl::OkStatus(); - } -#endif - return absl::UnknownError(absl::StrCat("Can not open OpenCL library on this device - ", error)); -#endif -} - -void UnloadOpenCL(void *libopencl) -{ - if (libopencl) - { - dlclose(libopencl); - } -} - -#ifdef __WINDOWS__ -void LoadOpenCLFunctions(HMODULE libopencl) -{ -#else -#ifdef __ANDROID__ -void LoadOpenCLFunctions(void **libopencl, bool use_wrapper) -{ - typedef void *(*loadOpenCLPointer_t)(const char *name); - loadOpenCLPointer_t loadOpenCLPointer; - if (use_wrapper) - { - loadOpenCLPointer = - reinterpret_cast<loadOpenCLPointer_t>(dlsym(*libopencl, "loadOpenCLPointer")); - } -#else -void LoadOpenCLFunctions(void **libopencl, bool) -{ -#endif // __ANDROID__ -#endif // __WINDOWS__ - - LoadFunction(clGetPlatformIDs); - LoadFunction(clGetPlatformInfo); - LoadFunction(clGetDeviceIDs); - LoadFunction(clGetDeviceInfo); - LoadFunction(clCreateSubDevices); - LoadFunction(clRetainDevice); - LoadFunction(clReleaseDevice); - LoadFunction(clCreateContext); - LoadFunction(clCreateContextFromType); - LoadFunction(clRetainContext); - LoadFunction(clReleaseContext); - LoadFunction(clGetContextInfo); - LoadFunction(clCreateCommandQueueWithProperties); - LoadFunction(clRetainCommandQueue); - LoadFunction(clReleaseCommandQueue); - LoadFunction(clGetCommandQueueInfo); - LoadFunction(clCreateBuffer); - LoadFunction(clCreateSubBuffer); - LoadFunction(clCreateImage); - LoadFunction(clCreatePipe); - LoadFunction(clRetainMemObject); - LoadFunction(clReleaseMemObject); - LoadFunction(clGetSupportedImageFormats); - LoadFunction(clGetMemObjectInfo); - LoadFunction(clGetImageInfo); - LoadFunction(clGetPipeInfo); - LoadFunction(clSetMemObjectDestructorCallback); - LoadFunction(clSVMAlloc); - LoadFunction(clSVMFree); - LoadFunction(clCreateSamplerWithProperties); - LoadFunction(clRetainSampler); - LoadFunction(clReleaseSampler); - LoadFunction(clGetSamplerInfo); - LoadFunction(clCreateProgramWithSource); - LoadFunction(clCreateProgramWithBinary); - LoadFunction(clCreateProgramWithBuiltInKernels); - LoadFunction(clRetainProgram); - LoadFunction(clReleaseProgram); - LoadFunction(clBuildProgram); - LoadFunction(clCompileProgram); - LoadFunction(clLinkProgram); - LoadFunction(clUnloadPlatformCompiler); - LoadFunction(clGetProgramInfo); - LoadFunction(clGetProgramBuildInfo); - LoadFunction(clCreateKernel); - LoadFunction(clCreateKernelsInProgram); - LoadFunction(clRetainKernel); - LoadFunction(clReleaseKernel); - LoadFunction(clSetKernelArg); - LoadFunction(clSetKernelArgSVMPointer); - LoadFunction(clSetKernelExecInfo); - LoadFunction(clGetKernelInfo); - LoadFunction(clGetKernelArgInfo); - LoadFunction(clGetKernelWorkGroupInfo); - LoadFunction(clWaitForEvents); - LoadFunction(clGetEventInfo); - LoadFunction(clCreateUserEvent); - LoadFunction(clRetainEvent); - LoadFunction(clReleaseEvent); - LoadFunction(clSetUserEventStatus); - LoadFunction(clSetEventCallback); - LoadFunction(clGetEventProfilingInfo); - LoadFunction(clFlush); - LoadFunction(clFinish); - LoadFunction(clEnqueueReadBuffer); - LoadFunction(clEnqueueReadBufferRect); - LoadFunction(clEnqueueWriteBuffer); - 
LoadFunction(clEnqueueWriteBufferRect);
-  LoadFunction(clEnqueueFillBuffer);
-  LoadFunction(clEnqueueCopyBuffer);
-  LoadFunction(clEnqueueCopyBufferRect);
-  LoadFunction(clEnqueueReadImage);
-  LoadFunction(clEnqueueWriteImage);
-  LoadFunction(clEnqueueFillImage);
-  LoadFunction(clEnqueueCopyImage);
-  LoadFunction(clEnqueueCopyImageToBuffer);
-  LoadFunction(clEnqueueCopyBufferToImage);
-  LoadFunction(clEnqueueMapBuffer);
-  LoadFunction(clEnqueueMapImage);
-  LoadFunction(clEnqueueUnmapMemObject);
-  LoadFunction(clEnqueueMigrateMemObjects);
-  LoadFunction(clEnqueueNDRangeKernel);
-  LoadFunction(clEnqueueNativeKernel);
-  LoadFunction(clEnqueueMarkerWithWaitList);
-  LoadFunction(clEnqueueBarrierWithWaitList);
-  LoadFunction(clEnqueueSVMFree);
-  LoadFunction(clEnqueueSVMMemcpy);
-  LoadFunction(clEnqueueSVMMemFill);
-  LoadFunction(clEnqueueSVMMap);
-  LoadFunction(clEnqueueSVMUnmap);
-  LoadFunction(clGetExtensionFunctionAddressForPlatform);
-  LoadFunction(clCreateImage2D);
-  LoadFunction(clCreateImage3D);
-  LoadFunction(clEnqueueMarker);
-  LoadFunction(clEnqueueWaitForEvents);
-  LoadFunction(clEnqueueBarrier);
-  LoadFunction(clUnloadCompiler);
-  LoadFunction(clGetExtensionFunctionAddress);
-  LoadFunction(clCreateCommandQueue);
-  LoadFunction(clCreateSampler);
-  LoadFunction(clEnqueueTask);
-
-  // OpenGL sharing
-  LoadFunction(clCreateFromGLBuffer);
-  LoadFunction(clCreateFromGLTexture);
-  LoadFunction(clEnqueueAcquireGLObjects);
-  LoadFunction(clEnqueueReleaseGLObjects);
-
-  // cl_khr_egl_event extension
-  LoadFunction(clCreateEventFromEGLSyncKHR);
-
-  // EGL sharing
-  LoadFunction(clCreateFromEGLImageKHR);
-  LoadFunction(clEnqueueAcquireEGLObjectsKHR);
-  LoadFunction(clEnqueueReleaseEGLObjectsKHR);
-} // LoadOpenCLFunctions
-
-// Definitions of the global function pointers declared in OpenclWrapper.h.
-// They stay null (no OpenCL support) until LoadOpenCLFunctions resolves them.
-PFN_clGetPlatformIDs clGetPlatformIDs;
-PFN_clGetPlatformInfo clGetPlatformInfo;
-PFN_clGetDeviceIDs clGetDeviceIDs;
-PFN_clGetDeviceInfo clGetDeviceInfo;
-PFN_clCreateSubDevices clCreateSubDevices;
-PFN_clRetainDevice clRetainDevice;
-PFN_clReleaseDevice clReleaseDevice;
-PFN_clCreateContext clCreateContext;
-PFN_clCreateContextFromType clCreateContextFromType;
-PFN_clRetainContext clRetainContext;
-PFN_clReleaseContext clReleaseContext;
-PFN_clGetContextInfo clGetContextInfo;
-PFN_clCreateCommandQueueWithProperties clCreateCommandQueueWithProperties;
-PFN_clRetainCommandQueue clRetainCommandQueue;
-PFN_clReleaseCommandQueue clReleaseCommandQueue;
-PFN_clGetCommandQueueInfo clGetCommandQueueInfo;
-PFN_clCreateBuffer clCreateBuffer;
-PFN_clCreateSubBuffer clCreateSubBuffer;
-PFN_clCreateImage clCreateImage;
-PFN_clCreatePipe clCreatePipe;
-PFN_clRetainMemObject clRetainMemObject;
-PFN_clReleaseMemObject clReleaseMemObject;
-PFN_clGetSupportedImageFormats clGetSupportedImageFormats;
-PFN_clGetMemObjectInfo clGetMemObjectInfo;
-PFN_clGetImageInfo clGetImageInfo;
-PFN_clGetPipeInfo clGetPipeInfo;
-PFN_clSetMemObjectDestructorCallback clSetMemObjectDestructorCallback;
-PFN_clSVMAlloc clSVMAlloc;
-PFN_clSVMFree clSVMFree;
-PFN_clCreateSamplerWithProperties clCreateSamplerWithProperties;
-PFN_clRetainSampler clRetainSampler;
-PFN_clReleaseSampler clReleaseSampler;
-PFN_clGetSamplerInfo clGetSamplerInfo;
-PFN_clCreateProgramWithSource clCreateProgramWithSource;
-PFN_clCreateProgramWithBinary clCreateProgramWithBinary;
-PFN_clCreateProgramWithBuiltInKernels clCreateProgramWithBuiltInKernels;
-PFN_clRetainProgram clRetainProgram;
-PFN_clReleaseProgram clReleaseProgram;
-PFN_clBuildProgram clBuildProgram;
-PFN_clCompileProgram
clCompileProgram; -PFN_clLinkProgram clLinkProgram; -PFN_clUnloadPlatformCompiler clUnloadPlatformCompiler; -PFN_clGetProgramInfo clGetProgramInfo; -PFN_clGetProgramBuildInfo clGetProgramBuildInfo; -PFN_clCreateKernel clCreateKernel; -PFN_clCreateKernelsInProgram clCreateKernelsInProgram; -PFN_clRetainKernel clRetainKernel; -PFN_clReleaseKernel clReleaseKernel; -PFN_clSetKernelArg clSetKernelArg; -PFN_clSetKernelArgSVMPointer clSetKernelArgSVMPointer; -PFN_clSetKernelExecInfo clSetKernelExecInfo; -PFN_clGetKernelInfo clGetKernelInfo; -PFN_clGetKernelArgInfo clGetKernelArgInfo; -PFN_clGetKernelWorkGroupInfo clGetKernelWorkGroupInfo; -PFN_clWaitForEvents clWaitForEvents; -PFN_clGetEventInfo clGetEventInfo; -PFN_clCreateUserEvent clCreateUserEvent; -PFN_clRetainEvent clRetainEvent; -PFN_clReleaseEvent clReleaseEvent; -PFN_clSetUserEventStatus clSetUserEventStatus; -PFN_clSetEventCallback clSetEventCallback; -PFN_clGetEventProfilingInfo clGetEventProfilingInfo; -PFN_clFlush clFlush; -PFN_clFinish clFinish; -PFN_clEnqueueReadBuffer clEnqueueReadBuffer; -PFN_clEnqueueReadBufferRect clEnqueueReadBufferRect; -PFN_clEnqueueWriteBuffer clEnqueueWriteBuffer; -PFN_clEnqueueWriteBufferRect clEnqueueWriteBufferRect; -PFN_clEnqueueFillBuffer clEnqueueFillBuffer; -PFN_clEnqueueCopyBuffer clEnqueueCopyBuffer; -PFN_clEnqueueCopyBufferRect clEnqueueCopyBufferRect; -PFN_clEnqueueReadImage clEnqueueReadImage; -PFN_clEnqueueWriteImage clEnqueueWriteImage; -PFN_clEnqueueFillImage clEnqueueFillImage; -PFN_clEnqueueCopyImage clEnqueueCopyImage; -PFN_clEnqueueCopyImageToBuffer clEnqueueCopyImageToBuffer; -PFN_clEnqueueCopyBufferToImage clEnqueueCopyBufferToImage; -PFN_clEnqueueMapBuffer clEnqueueMapBuffer; -PFN_clEnqueueMapImage clEnqueueMapImage; -PFN_clEnqueueUnmapMemObject clEnqueueUnmapMemObject; -PFN_clEnqueueMigrateMemObjects clEnqueueMigrateMemObjects; -PFN_clEnqueueNDRangeKernel clEnqueueNDRangeKernel; -PFN_clEnqueueNativeKernel clEnqueueNativeKernel; -PFN_clEnqueueMarkerWithWaitList clEnqueueMarkerWithWaitList; -PFN_clEnqueueBarrierWithWaitList clEnqueueBarrierWithWaitList; -PFN_clEnqueueSVMFree clEnqueueSVMFree; -PFN_clEnqueueSVMMemcpy clEnqueueSVMMemcpy; -PFN_clEnqueueSVMMemFill clEnqueueSVMMemFill; -PFN_clEnqueueSVMMap clEnqueueSVMMap; -PFN_clEnqueueSVMUnmap clEnqueueSVMUnmap; -PFN_clGetExtensionFunctionAddressForPlatform clGetExtensionFunctionAddressForPlatform; -PFN_clCreateImage2D clCreateImage2D; -PFN_clCreateImage3D clCreateImage3D; -PFN_clEnqueueMarker clEnqueueMarker; -PFN_clEnqueueWaitForEvents clEnqueueWaitForEvents; -PFN_clEnqueueBarrier clEnqueueBarrier; -PFN_clUnloadCompiler clUnloadCompiler; -PFN_clGetExtensionFunctionAddress clGetExtensionFunctionAddress; -PFN_clCreateCommandQueue clCreateCommandQueue; -PFN_clCreateSampler clCreateSampler; -PFN_clEnqueueTask clEnqueueTask; - -// OpenGL sharing -PFN_clCreateFromGLBuffer clCreateFromGLBuffer; -PFN_clCreateFromGLTexture clCreateFromGLTexture; -PFN_clEnqueueAcquireGLObjects clEnqueueAcquireGLObjects; -PFN_clEnqueueReleaseGLObjects clEnqueueReleaseGLObjects; - -// cl_khr_egl_event extension -PFN_clCreateEventFromEGLSyncKHR clCreateEventFromEGLSyncKHR; - -// EGL sharing -PFN_clCreateFromEGLImageKHR clCreateFromEGLImageKHR; -PFN_clEnqueueAcquireEGLObjectsKHR clEnqueueAcquireEGLObjectsKHR; -PFN_clEnqueueReleaseEGLObjectsKHR clEnqueueReleaseEGLObjectsKHR; - -cl_mem CreateImage2DLegacy(cl_context context, cl_mem_flags flags, - const cl_image_format *image_format, const cl_image_desc *image_desc, - void *host_ptr, cl_int *errcode_ret) -{ - if 
(clCreateImage) - { // clCreateImage available since OpenCL 1.2 - return clCreateImage(context, flags, image_format, image_desc, host_ptr, errcode_ret); - } - else - { - return clCreateImage2D(context, flags, image_format, image_desc->image_width, - image_desc->image_height, image_desc->image_row_pitch, host_ptr, - errcode_ret); - } -} - -cl_mem CreateImage3DLegacy(cl_context context, cl_mem_flags flags, - const cl_image_format *image_format, const cl_image_desc *image_desc, - void *host_ptr, cl_int *errcode_ret) -{ - if (clCreateImage) - { // clCreateImage available since OpenCL 1.2 - return clCreateImage(context, flags, image_format, image_desc, host_ptr, errcode_ret); - } - else - { - return clCreateImage3D(context, flags, image_format, image_desc->image_width, - image_desc->image_height, image_desc->image_depth, - image_desc->image_row_pitch, image_desc->image_slice_pitch, host_ptr, - errcode_ret); - } -} -} // namespace gpu_cl -} // namespace backend -} // namespace onert diff --git a/runtime/onert/backend/gpu_cl/open_cl/OpenclWrapper.h b/runtime/onert/backend/gpu_cl/open_cl/OpenclWrapper.h deleted file mode 100644 index 021f8735a..000000000 --- a/runtime/onert/backend/gpu_cl/open_cl/OpenclWrapper.h +++ /dev/null @@ -1,560 +0,0 @@ -/* - * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright 2019 The TensorFlow Authors. All Rights Reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifndef __ONERT_BACKEND_GPU_CL_OPENCL_WRAPPERE_H__ -#define __ONERT_BACKEND_GPU_CL_OPENCL_WRAPPERE_H__ - -#include "CL/cl.h" -#include "CL/cl_egl.h" -#include "CL/cl_ext.h" -#include "CL/cl_gl.h" -#include "CL/cl_platform.h" -#include "Status.h" - -namespace onert -{ -namespace backend -{ -namespace gpu_cl -{ - -absl::Status LoadOpenCL(void **libopencl); -void UnloadOpenCL(void *libopencl); - -typedef cl_int(CL_API_CALL *PFN_clGetPlatformIDs)( - cl_uint /* num_entries */, cl_platform_id * /* platforms */, - cl_uint * /* num_platforms */) CL_API_SUFFIX__VERSION_1_0; -typedef cl_int(CL_API_CALL *PFN_clGetPlatformInfo)( - cl_platform_id /* platform */, cl_platform_info /* param_name */, size_t /* param_value_size */, - void * /* param_value */, size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0; -typedef cl_int(CL_API_CALL *PFN_clGetDeviceIDs)( - cl_platform_id /* platform */, cl_device_type /* device_type */, cl_uint /* num_entries */, - cl_device_id * /* devices */, cl_uint * /* num_devices */) CL_API_SUFFIX__VERSION_1_0; -typedef cl_int(CL_API_CALL *PFN_clGetDeviceInfo)( - cl_device_id /* device */, cl_device_info /* param_name */, size_t /* param_value_size */, - void * /* param_value */, size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0; -typedef cl_int(CL_API_CALL *PFN_clCreateSubDevices)( - cl_device_id /* in_device */, const cl_device_partition_property * /* properties */, - cl_uint /* num_devices */, cl_device_id * /* out_devices */, - cl_uint * /* num_devices_ret */) CL_API_SUFFIX__VERSION_1_2; -typedef cl_int(CL_API_CALL *PFN_clRetainDevice)(cl_device_id /* device */) - CL_API_SUFFIX__VERSION_1_2; -typedef cl_int(CL_API_CALL *PFN_clReleaseDevice)(cl_device_id /* device */) - CL_API_SUFFIX__VERSION_1_2; -typedef cl_context(CL_API_CALL *PFN_clCreateContext)( - const cl_context_properties * /* properties */, cl_uint /* num_devices */, - const cl_device_id * /* devices */, - void(CL_CALLBACK * /* pfn_notify */)(const char *, const void *, size_t, void *), - void * /* user_data */, cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0; -typedef cl_context(CL_API_CALL *PFN_clCreateContextFromType)( - const cl_context_properties * /* properties */, cl_device_type /* device_type */, - void(CL_CALLBACK * /* pfn_notify*/)(const char *, const void *, size_t, void *), - void * /* user_data */, cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0; -typedef cl_int(CL_API_CALL *PFN_clRetainContext)(cl_context /* context */) - CL_API_SUFFIX__VERSION_1_0; -typedef cl_int(CL_API_CALL *PFN_clReleaseContext)(cl_context /* context */) - CL_API_SUFFIX__VERSION_1_0; -typedef cl_int(CL_API_CALL *PFN_clGetContextInfo)( - cl_context /* context */, cl_context_info /* param_name */, size_t /* param_value_size */, - void * /* param_value */, size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0; -typedef cl_command_queue(CL_API_CALL *PFN_clCreateCommandQueueWithProperties)( - cl_context /* context */, cl_device_id /* device */, const cl_queue_properties * /* properties */, - cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_2_0; -typedef cl_int(CL_API_CALL *PFN_clRetainCommandQueue)(cl_command_queue /* command_queue */) - CL_API_SUFFIX__VERSION_1_0; -typedef cl_int(CL_API_CALL *PFN_clReleaseCommandQueue)(cl_command_queue /* command_queue */) - CL_API_SUFFIX__VERSION_1_0; -typedef cl_int(CL_API_CALL *PFN_clGetCommandQueueInfo)( - cl_command_queue /* command_queue */, cl_command_queue_info /* param_name */, - size_t /* param_value_size */, void * /* param_value 
*/, - size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0; -typedef cl_mem(CL_API_CALL *PFN_clCreateBuffer)( - cl_context /* context */, cl_mem_flags /* flags */, size_t /* size */, void * /* host_ptr */, - cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0; -typedef cl_mem(CL_API_CALL *PFN_clCreateSubBuffer)( - cl_mem /* buffer */, cl_mem_flags /* flags */, cl_buffer_create_type /* buffer_create_type */, - const void * /* buffer_create_info */, cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_1; -typedef cl_mem(CL_API_CALL *PFN_clCreateImage)( - cl_context /* context */, cl_mem_flags /* flags */, const cl_image_format * /* image_format */, - const cl_image_desc * /* image_desc */, void * /* host_ptr */, - cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_2; -typedef cl_mem(CL_API_CALL *PFN_clCreatePipe)( - cl_context /* context */, cl_mem_flags /* flags */, cl_uint /* pipe_packet_size */, - cl_uint /* pipe_max_packets */, const cl_pipe_properties * /* properties */, - cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_2_0; -typedef cl_int(CL_API_CALL *PFN_clRetainMemObject)(cl_mem /* memobj */) CL_API_SUFFIX__VERSION_1_0; -typedef cl_int(CL_API_CALL *PFN_clReleaseMemObject)(cl_mem /* memobj */) CL_API_SUFFIX__VERSION_1_0; -typedef cl_int(CL_API_CALL *PFN_clGetSupportedImageFormats)( - cl_context /* context */, cl_mem_flags /* flags */, cl_mem_object_type /* image_type */, - cl_uint /* num_entries */, cl_image_format * /* image_formats */, - cl_uint * /* num_image_formats */) CL_API_SUFFIX__VERSION_1_0; -typedef cl_int(CL_API_CALL *PFN_clGetMemObjectInfo)( - cl_mem /* memobj */, cl_mem_info /* param_name */, size_t /* param_value_size */, - void * /* param_value */, size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0; -typedef cl_int(CL_API_CALL *PFN_clGetImageInfo)( - cl_mem /* image */, cl_image_info /* param_name */, size_t /* param_value_size */, - void * /* param_value */, size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0; -typedef cl_int(CL_API_CALL *PFN_clGetPipeInfo)( - cl_mem /* pipe */, cl_pipe_info /* param_name */, size_t /* param_value_size */, - void * /* param_value */, size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_2_0; -typedef cl_int(CL_API_CALL *PFN_clSetMemObjectDestructorCallback)( - cl_mem /* memobj */, - void(CL_CALLBACK * /*pfn_notify*/)(cl_mem /* memobj */, void * /*user_data*/), - void * /*user_data */) CL_API_SUFFIX__VERSION_1_1; -typedef void *(CL_API_CALL *PFN_clSVMAlloc)(cl_context /* context */, cl_svm_mem_flags /* flags */, - size_t /* size */, - cl_uint /* alignment */)CL_API_SUFFIX__VERSION_2_0; -typedef void(CL_API_CALL *PFN_clSVMFree)(cl_context /* context */, - void * /* svm_pointer */) CL_API_SUFFIX__VERSION_2_0; -typedef cl_sampler(CL_API_CALL *PFN_clCreateSamplerWithProperties)( - cl_context /* context */, const cl_sampler_properties * /* normalized_coords */, - cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_2_0; -typedef cl_int(CL_API_CALL *PFN_clRetainSampler)(cl_sampler /* sampler */) - CL_API_SUFFIX__VERSION_1_0; -typedef cl_int(CL_API_CALL *PFN_clReleaseSampler)(cl_sampler /* sampler */) - CL_API_SUFFIX__VERSION_1_0; -typedef cl_int(CL_API_CALL *PFN_clGetSamplerInfo)( - cl_sampler /* sampler */, cl_sampler_info /* param_name */, size_t /* param_value_size */, - void * /* param_value */, size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0; -typedef cl_program(CL_API_CALL *PFN_clCreateProgramWithSource)( - cl_context /* context */, cl_uint /* count */, const char ** /* 
strings */, - const size_t * /* lengths */, cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0; -typedef cl_program(CL_API_CALL *PFN_clCreateProgramWithBinary)( - cl_context /* context */, cl_uint /* num_devices */, const cl_device_id * /* device_list */, - const size_t * /* lengths */, const unsigned char ** /* binaries */, cl_int * /* binary_status */, - cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0; -typedef cl_program(CL_API_CALL *PFN_clCreateProgramWithBuiltInKernels)( - cl_context /* context */, cl_uint /* num_devices */, const cl_device_id * /* device_list */, - const char * /* kernel_names */, cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_2; -typedef cl_int(CL_API_CALL *PFN_clRetainProgram)(cl_program /* program */) - CL_API_SUFFIX__VERSION_1_0; -typedef cl_int(CL_API_CALL *PFN_clReleaseProgram)(cl_program /* program */) - CL_API_SUFFIX__VERSION_1_0; -typedef cl_int(CL_API_CALL *PFN_clBuildProgram)( - cl_program /* program */, cl_uint /* num_devices */, const cl_device_id * /* device_list */, - const char * /* options */, - void(CL_CALLBACK * /* pfn_notify */)(cl_program /* program */, void * /* user_data */), - void * /* user_data */) CL_API_SUFFIX__VERSION_1_0; -typedef cl_int(CL_API_CALL *PFN_clCompileProgram)( - cl_program /* program */, cl_uint /* num_devices */, const cl_device_id * /* device_list */, - const char * /* options */, cl_uint /* num_input_headers */, - const cl_program * /* input_headers */, const char ** /* header_include_names */, - void(CL_CALLBACK * /* pfn_notify */)(cl_program /* program */, void * /* user_data */), - void * /* user_data */) CL_API_SUFFIX__VERSION_1_2; -typedef cl_program(CL_API_CALL *PFN_clLinkProgram)( - cl_context /* context */, cl_uint /* num_devices */, const cl_device_id * /* device_list */, - const char * /* options */, cl_uint /* num_input_programs */, - const cl_program * /* input_programs */, - void(CL_CALLBACK * /* pfn_notify */)(cl_program /* program */, void * /* user_data */), - void * /* user_data */, cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_2; -typedef cl_int(CL_API_CALL *PFN_clUnloadPlatformCompiler)(cl_platform_id /* platform */) - CL_API_SUFFIX__VERSION_1_2; -typedef cl_int(CL_API_CALL *PFN_clGetProgramInfo)( - cl_program /* program */, cl_program_info /* param_name */, size_t /* param_value_size */, - void * /* param_value */, size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0; -typedef cl_int(CL_API_CALL *PFN_clGetProgramBuildInfo)( - cl_program /* program */, cl_device_id /* device */, cl_program_build_info /* param_name */, - size_t /* param_value_size */, void * /* param_value */, - size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0; -typedef cl_kernel(CL_API_CALL *PFN_clCreateKernel)( - cl_program /* program */, const char * /* kernel_name */, - cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0; -typedef cl_int(CL_API_CALL *PFN_clCreateKernelsInProgram)( - cl_program /* program */, cl_uint /* num_kernels */, cl_kernel * /* kernels */, - cl_uint * /* num_kernels_ret */) CL_API_SUFFIX__VERSION_1_0; -typedef cl_int(CL_API_CALL *PFN_clRetainKernel)(cl_kernel /* kernel */) CL_API_SUFFIX__VERSION_1_0; -typedef cl_int(CL_API_CALL *PFN_clReleaseKernel)(cl_kernel /* kernel */) CL_API_SUFFIX__VERSION_1_0; -typedef cl_int(CL_API_CALL *PFN_clSetKernelArg)(cl_kernel /* kernel */, cl_uint /* arg_index */, - size_t /* arg_size */, const void * /* arg_value */) - CL_API_SUFFIX__VERSION_1_0; -typedef cl_int(CL_API_CALL *PFN_clSetKernelArgSVMPointer)( - cl_kernel /* kernel 
*/, cl_uint /* arg_index */, - const void * /* arg_value */) CL_API_SUFFIX__VERSION_2_0; -typedef cl_int(CL_API_CALL *PFN_clSetKernelExecInfo)( - cl_kernel /* kernel */, cl_kernel_exec_info /* param_name */, size_t /* param_value_size */, - const void * /* param_value */) CL_API_SUFFIX__VERSION_2_0; -typedef cl_int(CL_API_CALL *PFN_clGetKernelInfo)( - cl_kernel /* kernel */, cl_kernel_info /* param_name */, size_t /* param_value_size */, - void * /* param_value */, size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0; -typedef cl_int(CL_API_CALL *PFN_clGetKernelArgInfo)( - cl_kernel /* kernel */, cl_uint /* arg_indx */, cl_kernel_arg_info /* param_name */, - size_t /* param_value_size */, void * /* param_value */, - size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_2; -typedef cl_int(CL_API_CALL *PFN_clGetKernelWorkGroupInfo)( - cl_kernel /* kernel */, cl_device_id /* device */, cl_kernel_work_group_info /* param_name */, - size_t /* param_value_size */, void * /* param_value */, - size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0; -typedef cl_int(CL_API_CALL *PFN_clWaitForEvents)( - cl_uint /* num_events */, const cl_event * /* event_list */) CL_API_SUFFIX__VERSION_1_0; -typedef cl_int(CL_API_CALL *PFN_clGetEventInfo)( - cl_event /* event */, cl_event_info /* param_name */, size_t /* param_value_size */, - void * /* param_value */, size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0; -typedef cl_event(CL_API_CALL *PFN_clCreateUserEvent)( - cl_context /* context */, cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_1; -typedef cl_int(CL_API_CALL *PFN_clRetainEvent)(cl_event /* event */) CL_API_SUFFIX__VERSION_1_0; -typedef cl_int(CL_API_CALL *PFN_clReleaseEvent)(cl_event /* event */) CL_API_SUFFIX__VERSION_1_0; -typedef cl_int(CL_API_CALL *PFN_clSetUserEventStatus)( - cl_event /* event */, cl_int /* execution_status */) CL_API_SUFFIX__VERSION_1_1; -typedef cl_int(CL_API_CALL *PFN_clSetEventCallback)( - cl_event /* event */, cl_int /* command_exec_callback_type */, - void(CL_CALLBACK * /* pfn_notify */)(cl_event, cl_int, void *), - void * /* user_data */) CL_API_SUFFIX__VERSION_1_1; -typedef cl_int(CL_API_CALL *PFN_clGetEventProfilingInfo)( - cl_event /* event */, cl_profiling_info /* param_name */, size_t /* param_value_size */, - void * /* param_value */, size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0; -typedef cl_int(CL_API_CALL *PFN_clFlush)(cl_command_queue /* command_queue */) - CL_API_SUFFIX__VERSION_1_0; -typedef cl_int(CL_API_CALL *PFN_clFinish)(cl_command_queue /* command_queue */) - CL_API_SUFFIX__VERSION_1_0; -typedef cl_int(CL_API_CALL *PFN_clEnqueueReadBuffer)( - cl_command_queue /* command_queue */, cl_mem /* buffer */, cl_bool /* blocking_read */, - size_t /* offset */, size_t /* size */, void * /* ptr */, cl_uint /* num_events_in_wait_list */, - const cl_event * /* event_wait_list */, cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0; -typedef cl_int(CL_API_CALL *PFN_clEnqueueReadBufferRect)( - cl_command_queue /* command_queue */, cl_mem /* buffer */, cl_bool /* blocking_read */, - const size_t * /* buffer_offset */, const size_t * /* host_offset */, const size_t * /* region */, - size_t /* buffer_row_pitch */, size_t /* buffer_slice_pitch */, size_t /* host_row_pitch */, - size_t /* host_slice_pitch */, void * /* ptr */, cl_uint /* num_events_in_wait_list */, - const cl_event * /* event_wait_list */, cl_event * /* event */) CL_API_SUFFIX__VERSION_1_1; -typedef cl_int(CL_API_CALL 
*PFN_clEnqueueWriteBuffer)( - cl_command_queue /* command_queue */, cl_mem /* buffer */, cl_bool /* blocking_write */, - size_t /* offset */, size_t /* size */, const void * /* ptr */, - cl_uint /* num_events_in_wait_list */, const cl_event * /* event_wait_list */, - cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0; -typedef cl_int(CL_API_CALL *PFN_clEnqueueWriteBufferRect)( - cl_command_queue /* command_queue */, cl_mem /* buffer */, cl_bool /* blocking_write */, - const size_t * /* buffer_offset */, const size_t * /* host_offset */, const size_t * /* region */, - size_t /* buffer_row_pitch */, size_t /* buffer_slice_pitch */, size_t /* host_row_pitch */, - size_t /* host_slice_pitch */, const void * /* ptr */, cl_uint /* num_events_in_wait_list */, - const cl_event * /* event_wait_list */, cl_event * /* event */) CL_API_SUFFIX__VERSION_1_1; -typedef cl_int(CL_API_CALL *PFN_clEnqueueFillBuffer)( - cl_command_queue /* command_queue */, cl_mem /* buffer */, const void * /* pattern */, - size_t /* pattern_size */, size_t /* offset */, size_t /* size */, - cl_uint /* num_events_in_wait_list */, const cl_event * /* event_wait_list */, - cl_event * /* event */) CL_API_SUFFIX__VERSION_1_2; -typedef cl_int(CL_API_CALL *PFN_clEnqueueCopyBuffer)( - cl_command_queue /* command_queue */, cl_mem /* src_buffer */, cl_mem /* dst_buffer */, - size_t /* src_offset */, size_t /* dst_offset */, size_t /* size */, - cl_uint /* num_events_in_wait_list */, const cl_event * /* event_wait_list */, - cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0; -typedef cl_int(CL_API_CALL *PFN_clEnqueueCopyBufferRect)( - cl_command_queue /* command_queue */, cl_mem /* src_buffer */, cl_mem /* dst_buffer */, - const size_t * /* src_origin */, const size_t * /* dst_origin */, const size_t * /* region */, - size_t /* src_row_pitch */, size_t /* src_slice_pitch */, size_t /* dst_row_pitch */, - size_t /* dst_slice_pitch */, cl_uint /* num_events_in_wait_list */, - const cl_event * /* event_wait_list */, cl_event * /* event */) CL_API_SUFFIX__VERSION_1_1; -typedef cl_int(CL_API_CALL *PFN_clEnqueueReadImage)( - cl_command_queue /* command_queue */, cl_mem /* image */, cl_bool /* blocking_read */, - const size_t * /* origin[3] */, const size_t * /* region[3] */, size_t /* row_pitch */, - size_t /* slice_pitch */, void * /* ptr */, cl_uint /* num_events_in_wait_list */, - const cl_event * /* event_wait_list */, cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0; -typedef cl_int(CL_API_CALL *PFN_clEnqueueWriteImage)( - cl_command_queue /* command_queue */, cl_mem /* image */, cl_bool /* blocking_write */, - const size_t * /* origin[3] */, const size_t * /* region[3] */, size_t /* input_row_pitch */, - size_t /* input_slice_pitch */, const void * /* ptr */, cl_uint /* num_events_in_wait_list */, - const cl_event * /* event_wait_list */, cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0; -typedef cl_int(CL_API_CALL *PFN_clEnqueueFillImage)( - cl_command_queue /* command_queue */, cl_mem /* image */, const void * /* fill_color */, - const size_t * /* origin[3] */, const size_t * /* region[3] */, - cl_uint /* num_events_in_wait_list */, const cl_event * /* event_wait_list */, - cl_event * /* event */) CL_API_SUFFIX__VERSION_1_2; -typedef cl_int(CL_API_CALL *PFN_clEnqueueCopyImage)( - cl_command_queue /* command_queue */, cl_mem /* src_image */, cl_mem /* dst_image */, - const size_t * /* src_origin[3] */, const size_t * /* dst_origin[3] */, - const size_t * /* region[3] */, cl_uint /* num_events_in_wait_list */, - const cl_event * 
/* event_wait_list */, cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0; -typedef cl_int(CL_API_CALL *PFN_clEnqueueCopyImageToBuffer)( - cl_command_queue /* command_queue */, cl_mem /* src_image */, cl_mem /* dst_buffer */, - const size_t * /* src_origin[3] */, const size_t * /* region[3] */, size_t /* dst_offset */, - cl_uint /* num_events_in_wait_list */, const cl_event * /* event_wait_list */, - cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0; -typedef cl_int(CL_API_CALL *PFN_clEnqueueCopyBufferToImage)( - cl_command_queue /* command_queue */, cl_mem /* src_buffer */, cl_mem /* dst_image */, - size_t /* src_offset */, const size_t * /* dst_origin[3] */, const size_t * /* region[3] */, - cl_uint /* num_events_in_wait_list */, const cl_event * /* event_wait_list */, - cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0; -typedef void *(CL_API_CALL *PFN_clEnqueueMapBuffer)( - cl_command_queue /* command_queue */, cl_mem /* buffer */, cl_bool /* blocking_map */, - cl_map_flags /* map_flags */, size_t /* offset */, size_t /* size */, - cl_uint /* num_events_in_wait_list */, const cl_event * /* event_wait_list */, - cl_event * /* event */, cl_int * /* errcode_ret */)CL_API_SUFFIX__VERSION_1_0; -typedef void *(CL_API_CALL *PFN_clEnqueueMapImage)( - cl_command_queue /* command_queue */, cl_mem /* image */, cl_bool /* blocking_map */, - cl_map_flags /* map_flags */, const size_t * /* origin[3] */, const size_t * /* region[3] */, - size_t * /* image_row_pitch */, size_t * /* image_slice_pitch */, - cl_uint /* num_events_in_wait_list */, const cl_event * /* event_wait_list */, - cl_event * /* event */, cl_int * /* errcode_ret */)CL_API_SUFFIX__VERSION_1_0; -typedef cl_int(CL_API_CALL *PFN_clEnqueueUnmapMemObject)( - cl_command_queue /* command_queue */, cl_mem /* memobj */, void * /* mapped_ptr */, - cl_uint /* num_events_in_wait_list */, const cl_event * /* event_wait_list */, - cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0; -typedef cl_int(CL_API_CALL *PFN_clEnqueueMigrateMemObjects)( - cl_command_queue /* command_queue */, cl_uint /* num_mem_objects */, - const cl_mem * /* mem_objects */, cl_mem_migration_flags /* flags */, - cl_uint /* num_events_in_wait_list */, const cl_event * /* event_wait_list */, - cl_event * /* event */) CL_API_SUFFIX__VERSION_1_2; -typedef cl_int(CL_API_CALL *PFN_clEnqueueNDRangeKernel)( - cl_command_queue /* command_queue */, cl_kernel /* kernel */, cl_uint /* work_dim */, - const size_t * /* global_work_offset */, const size_t * /* global_work_size */, - const size_t * /* local_work_size */, cl_uint /* num_events_in_wait_list */, - const cl_event * /* event_wait_list */, cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0; -typedef cl_int(CL_API_CALL *PFN_clEnqueueNativeKernel)( - cl_command_queue /* command_queue */, void(CL_CALLBACK * /*user_func*/)(void *), - void * /* args */, size_t /* cb_args */, cl_uint /* num_mem_objects */, - const cl_mem * /* mem_list */, const void ** /* args_mem_loc */, - cl_uint /* num_events_in_wait_list */, const cl_event * /* event_wait_list */, - cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0; -typedef cl_int(CL_API_CALL *PFN_clEnqueueMarkerWithWaitList)( - cl_command_queue /* command_queue */, cl_uint /* num_events_in_wait_list */, - const cl_event * /* event_wait_list */, cl_event * /* event */) CL_API_SUFFIX__VERSION_1_2; -typedef cl_int(CL_API_CALL *PFN_clEnqueueBarrierWithWaitList)( - cl_command_queue /* command_queue */, cl_uint /* num_events_in_wait_list */, - const cl_event * /* event_wait_list */, cl_event * /* 
event */) CL_API_SUFFIX__VERSION_1_2; -typedef cl_int(CL_API_CALL *PFN_clEnqueueSVMFree)( - cl_command_queue /* command_queue */, cl_uint /* num_svm_pointers */, - void *[] /* svm_pointers[] */, - void(CL_CALLBACK * /*pfn_free_func*/)(cl_command_queue /* queue */, - cl_uint /* num_svm_pointers */, - void *[] /* svm_pointers[] */, void * /* user_data */), - void * /* user_data */, cl_uint /* num_events_in_wait_list */, - const cl_event * /* event_wait_list */, cl_event * /* event */) CL_API_SUFFIX__VERSION_2_0; -typedef cl_int(CL_API_CALL *PFN_clEnqueueSVMMemcpy)( - cl_command_queue /* command_queue */, cl_bool /* blocking_copy */, void * /* dst_ptr */, - const void * /* src_ptr */, size_t /* size */, cl_uint /* num_events_in_wait_list */, - const cl_event * /* event_wait_list */, cl_event * /* event */) CL_API_SUFFIX__VERSION_2_0; -typedef cl_int(CL_API_CALL *PFN_clEnqueueSVMMemFill)( - cl_command_queue /* command_queue */, void * /* svm_ptr */, const void * /* pattern */, - size_t /* pattern_size */, size_t /* size */, cl_uint /* num_events_in_wait_list */, - const cl_event * /* event_wait_list */, cl_event * /* event */) CL_API_SUFFIX__VERSION_2_0; -typedef cl_int(CL_API_CALL *PFN_clEnqueueSVMMap)( - cl_command_queue /* command_queue */, cl_bool /* blocking_map */, cl_map_flags /* flags */, - void * /* svm_ptr */, size_t /* size */, cl_uint /* num_events_in_wait_list */, - const cl_event * /* event_wait_list */, cl_event * /* event */) CL_API_SUFFIX__VERSION_2_0; -typedef cl_int(CL_API_CALL *PFN_clEnqueueSVMUnmap)( - cl_command_queue /* command_queue */, void * /* svm_ptr */, cl_uint /* num_events_in_wait_list */, - const cl_event * /* event_wait_list */, cl_event * /* event */) CL_API_SUFFIX__VERSION_2_0; -typedef void *(CL_API_CALL *PFN_clGetExtensionFunctionAddressForPlatform)( - cl_platform_id /* platform */, const char * /* func_name */)CL_API_SUFFIX__VERSION_1_2; -typedef cl_mem(CL_API_CALL *PFN_clCreateImage2D)(cl_context /* context */, cl_mem_flags /* flags */, - const cl_image_format * /* image_format */, - size_t /* image_width */, - size_t /* image_height */, - size_t /* image_row_pitch */, - void * /* host_ptr */, cl_int * /* errcode_ret */); -typedef cl_mem(CL_API_CALL *PFN_clCreateImage3D)( - cl_context /* context */, cl_mem_flags /* flags */, const cl_image_format * /* image_format */, - size_t /* image_width */, size_t /* image_height */, size_t /* image_depth */, - size_t /* image_row_pitch */, size_t /* image_slice_pitch */, void * /* host_ptr */, - cl_int * /* errcode_ret */); -typedef cl_int(CL_API_CALL *PFN_clEnqueueMarker)(cl_command_queue /* command_queue */, - cl_event * /* event */); -typedef cl_int(CL_API_CALL *PFN_clEnqueueWaitForEvents)(cl_command_queue /* command_queue */, - cl_uint /* num_events */, - const cl_event * /* event_list */); -typedef cl_int(CL_API_CALL *PFN_clEnqueueBarrier)(cl_command_queue /* command_queue */); -typedef cl_int(CL_API_CALL *PFN_clUnloadCompiler)(); -typedef void *(CL_API_CALL *PFN_clGetExtensionFunctionAddress)(const char * /* func_name */); -typedef cl_command_queue(CL_API_CALL *PFN_clCreateCommandQueue)( - cl_context /* context */, cl_device_id /* device */, cl_command_queue_properties /* properties */, - cl_int * /* errcode_ret */); -typedef cl_sampler(CL_API_CALL *PFN_clCreateSampler)(cl_context /* context */, - cl_bool /* normalized_coords */, - cl_addressing_mode /* addressing_mode */, - cl_filter_mode /* filter_mode */, - cl_int * /* errcode_ret */); -typedef cl_int(CL_API_CALL *PFN_clEnqueueTask)(cl_command_queue /* 
command_queue */, - cl_kernel /* kernel */, - cl_uint /* num_events_in_wait_list */, - const cl_event * /* event_wait_list */, - cl_event * /* event */); - -// OpenGL sharing -typedef cl_mem(CL_API_CALL *PFN_clCreateFromGLBuffer)(cl_context, cl_mem_flags, cl_GLuint, int *); -typedef cl_mem(CL_API_CALL *PFN_clCreateFromGLTexture)( - cl_context /* context */, cl_mem_flags /* flags */, cl_GLenum /* target */, - cl_GLint /* miplevel */, cl_GLuint /* texture */, - cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_2; -typedef cl_int(CL_API_CALL *PFN_clEnqueueAcquireGLObjects)(cl_command_queue /* command_queue */, - cl_uint /* num_objects */, - const cl_mem * /* mem_objects */, - cl_uint /* num_events_in_wait_list */, - const cl_event * /* event_wait_list */, - cl_event * /* event */); -typedef cl_int(CL_API_CALL *PFN_clEnqueueReleaseGLObjects)( - cl_command_queue /* command_queue */, cl_uint /* num_objects */, const cl_mem * /* mem_objects */, - cl_uint /* num_events_in_wait_list */, const cl_event * /* event_wait_list */, - cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0; - -// cl_khr_egl_event extension - -// CLeglDisplayKHR is an opaque handle to an EGLDisplay -typedef void *CLeglDisplayKHR; - -// CLeglSyncKHR is an opaque handle to an EGLSync object -typedef void *CLeglSyncKHR; - -typedef cl_event(CL_API_CALL *PFN_clCreateEventFromEGLSyncKHR)(cl_context /* context */, - CLeglSyncKHR /* sync */, - CLeglDisplayKHR /* display */, - cl_int * /* errcode_ret */); - -// EGL sharing -typedef cl_mem(CL_API_CALL *PFN_clCreateFromEGLImageKHR)( - cl_context /*context*/, CLeglDisplayKHR /*display*/, CLeglImageKHR /*image*/, - cl_mem_flags /*flags*/, const cl_egl_image_properties_khr * /*properties*/, - cl_int * /*errcode_ret*/); -typedef cl_int(CL_API_CALL *PFN_clEnqueueAcquireEGLObjectsKHR)( - cl_command_queue /*command_queue*/, cl_uint /*num_objects*/, const cl_mem * /*mem_objects*/, - cl_uint /*num_events_in_wait_list*/, const cl_event * /*event_wait_list*/, cl_event * /*event*/); -typedef cl_int(CL_API_CALL *PFN_clEnqueueReleaseEGLObjectsKHR)( - cl_command_queue /*command_queue*/, cl_uint /*num_objects*/, const cl_mem * /*mem_objects*/, - cl_uint /*num_events_in_wait_list*/, const cl_event * /*event_wait_list*/, cl_event * /*event*/); - -extern PFN_clGetPlatformIDs clGetPlatformIDs; -extern PFN_clGetPlatformInfo clGetPlatformInfo; -extern PFN_clGetDeviceIDs clGetDeviceIDs; -extern PFN_clGetDeviceInfo clGetDeviceInfo; -extern PFN_clCreateSubDevices clCreateSubDevices; -extern PFN_clRetainDevice clRetainDevice; -extern PFN_clReleaseDevice clReleaseDevice; -extern PFN_clCreateContext clCreateContext; -extern PFN_clCreateContextFromType clCreateContextFromType; -extern PFN_clRetainContext clRetainContext; -extern PFN_clReleaseContext clReleaseContext; -extern PFN_clGetContextInfo clGetContextInfo; -extern PFN_clCreateCommandQueueWithProperties clCreateCommandQueueWithProperties; -extern PFN_clRetainCommandQueue clRetainCommandQueue; -extern PFN_clReleaseCommandQueue clReleaseCommandQueue; -extern PFN_clGetCommandQueueInfo clGetCommandQueueInfo; -extern PFN_clCreateBuffer clCreateBuffer; -extern PFN_clCreateSubBuffer clCreateSubBuffer; -extern PFN_clCreateImage clCreateImage; -extern PFN_clCreatePipe clCreatePipe; -extern PFN_clRetainMemObject clRetainMemObject; -extern PFN_clReleaseMemObject clReleaseMemObject; -extern PFN_clGetSupportedImageFormats clGetSupportedImageFormats; -extern PFN_clGetMemObjectInfo clGetMemObjectInfo; -extern PFN_clGetImageInfo clGetImageInfo; -extern PFN_clGetPipeInfo 
clGetPipeInfo; -extern PFN_clSetMemObjectDestructorCallback clSetMemObjectDestructorCallback; -extern PFN_clSVMAlloc clSVMAlloc; -extern PFN_clSVMFree clSVMFree; -extern PFN_clCreateSamplerWithProperties clCreateSamplerWithProperties; -extern PFN_clRetainSampler clRetainSampler; -extern PFN_clReleaseSampler clReleaseSampler; -extern PFN_clGetSamplerInfo clGetSamplerInfo; -extern PFN_clCreateProgramWithSource clCreateProgramWithSource; -extern PFN_clCreateProgramWithBinary clCreateProgramWithBinary; -extern PFN_clCreateProgramWithBuiltInKernels clCreateProgramWithBuiltInKernels; -extern PFN_clRetainProgram clRetainProgram; -extern PFN_clReleaseProgram clReleaseProgram; -extern PFN_clBuildProgram clBuildProgram; -extern PFN_clCompileProgram clCompileProgram; -extern PFN_clLinkProgram clLinkProgram; -extern PFN_clUnloadPlatformCompiler clUnloadPlatformCompiler; -extern PFN_clGetProgramInfo clGetProgramInfo; -extern PFN_clGetProgramBuildInfo clGetProgramBuildInfo; -extern PFN_clCreateKernel clCreateKernel; -extern PFN_clCreateKernelsInProgram clCreateKernelsInProgram; -extern PFN_clRetainKernel clRetainKernel; -extern PFN_clReleaseKernel clReleaseKernel; -extern PFN_clSetKernelArg clSetKernelArg; -extern PFN_clSetKernelArgSVMPointer clSetKernelArgSVMPointer; -extern PFN_clSetKernelExecInfo clSetKernelExecInfo; -extern PFN_clGetKernelInfo clGetKernelInfo; -extern PFN_clGetKernelArgInfo clGetKernelArgInfo; -extern PFN_clGetKernelWorkGroupInfo clGetKernelWorkGroupInfo; -extern PFN_clWaitForEvents clWaitForEvents; -extern PFN_clGetEventInfo clGetEventInfo; -extern PFN_clCreateUserEvent clCreateUserEvent; -extern PFN_clRetainEvent clRetainEvent; -extern PFN_clReleaseEvent clReleaseEvent; -extern PFN_clSetUserEventStatus clSetUserEventStatus; -extern PFN_clSetEventCallback clSetEventCallback; -extern PFN_clGetEventProfilingInfo clGetEventProfilingInfo; -extern PFN_clFlush clFlush; -extern PFN_clFinish clFinish; -extern PFN_clEnqueueReadBuffer clEnqueueReadBuffer; -extern PFN_clEnqueueReadBufferRect clEnqueueReadBufferRect; -extern PFN_clEnqueueWriteBuffer clEnqueueWriteBuffer; -extern PFN_clEnqueueWriteBufferRect clEnqueueWriteBufferRect; -extern PFN_clEnqueueFillBuffer clEnqueueFillBuffer; -extern PFN_clEnqueueCopyBuffer clEnqueueCopyBuffer; -extern PFN_clEnqueueCopyBufferRect clEnqueueCopyBufferRect; -extern PFN_clEnqueueReadImage clEnqueueReadImage; -extern PFN_clEnqueueWriteImage clEnqueueWriteImage; -extern PFN_clEnqueueFillImage clEnqueueFillImage; -extern PFN_clEnqueueCopyImage clEnqueueCopyImage; -extern PFN_clEnqueueCopyImageToBuffer clEnqueueCopyImageToBuffer; -extern PFN_clEnqueueCopyBufferToImage clEnqueueCopyBufferToImage; -extern PFN_clEnqueueMapBuffer clEnqueueMapBuffer; -extern PFN_clEnqueueMapImage clEnqueueMapImage; -extern PFN_clEnqueueUnmapMemObject clEnqueueUnmapMemObject; -extern PFN_clEnqueueMigrateMemObjects clEnqueueMigrateMemObjects; -extern PFN_clEnqueueNDRangeKernel clEnqueueNDRangeKernel; -extern PFN_clEnqueueNativeKernel clEnqueueNativeKernel; -extern PFN_clEnqueueMarkerWithWaitList clEnqueueMarkerWithWaitList; -extern PFN_clEnqueueBarrierWithWaitList clEnqueueBarrierWithWaitList; -extern PFN_clEnqueueSVMFree clEnqueueSVMFree; -extern PFN_clEnqueueSVMMemcpy clEnqueueSVMMemcpy; -extern PFN_clEnqueueSVMMemFill clEnqueueSVMMemFill; -extern PFN_clEnqueueSVMMap clEnqueueSVMMap; -extern PFN_clEnqueueSVMUnmap clEnqueueSVMUnmap; -extern PFN_clGetExtensionFunctionAddressForPlatform clGetExtensionFunctionAddressForPlatform; -extern PFN_clCreateImage2D clCreateImage2D; -extern 
PFN_clCreateImage3D clCreateImage3D; -extern PFN_clEnqueueMarker clEnqueueMarker; -extern PFN_clEnqueueWaitForEvents clEnqueueWaitForEvents; -extern PFN_clEnqueueBarrier clEnqueueBarrier; -extern PFN_clUnloadCompiler clUnloadCompiler; -extern PFN_clGetExtensionFunctionAddress clGetExtensionFunctionAddress; -extern PFN_clCreateCommandQueue clCreateCommandQueue; -extern PFN_clCreateSampler clCreateSampler; -extern PFN_clEnqueueTask clEnqueueTask; - -// OpenGL sharing -extern PFN_clCreateFromGLBuffer clCreateFromGLBuffer; -extern PFN_clCreateFromGLTexture clCreateFromGLTexture; -extern PFN_clEnqueueAcquireGLObjects clEnqueueAcquireGLObjects; -extern PFN_clEnqueueReleaseGLObjects clEnqueueReleaseGLObjects; - -// cl_khr_egl_event extension -extern PFN_clCreateEventFromEGLSyncKHR clCreateEventFromEGLSyncKHR; - -// EGL sharing -extern PFN_clCreateFromEGLImageKHR clCreateFromEGLImageKHR; -extern PFN_clEnqueueAcquireEGLObjectsKHR clEnqueueAcquireEGLObjectsKHR; -extern PFN_clEnqueueReleaseEGLObjectsKHR clEnqueueReleaseEGLObjectsKHR; - -// For convenient image creation -// It uses clCreateImage if it is available (clCreateImage is available since cl 1.2), -// otherwise it will use the legacy clCreateImage2D -cl_mem CreateImage2DLegacy(cl_context context, cl_mem_flags flags, - const cl_image_format *image_format, const cl_image_desc *image_desc, - void *host_ptr, cl_int *errcode_ret); - -// It uses clCreateImage if it is available (clCreateImage is available since cl 1.2), -// otherwise it will use the legacy clCreateImage3D -cl_mem CreateImage3DLegacy(cl_context context, cl_mem_flags flags, - const cl_image_format *image_format, const cl_image_desc *image_desc, - void *host_ptr, cl_int *errcode_ret); - -} // namespace gpu_cl -} // namespace backend -} // namespace onert - -#endif // __ONERT_BACKEND_GPU_CL_OPENCL_WRAPPERE_H__ diff --git a/runtime/onert/backend/gpu_cl/open_cl/Operations.cc b/runtime/onert/backend/gpu_cl/open_cl/Operations.cc deleted file mode 100644 index 2608b5364..000000000 --- a/runtime/onert/backend/gpu_cl/open_cl/Operations.cc +++ /dev/null @@ -1,704 +0,0 @@ -/* - * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright 2020 The TensorFlow Authors. All Rights Reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License.
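A minimal sketch of what the 2D helper declared above can look like, assuming the loader leaves the clCreateImage pointer null on pre-OpenCL-1.2 platforms (the real definition lives in the wrapper's .cc file, which this hunk does not show):

cl_mem CreateImage2DLegacy(cl_context context, cl_mem_flags flags,
                           const cl_image_format *image_format, const cl_image_desc *image_desc,
                           void *host_ptr, cl_int *errcode_ret)
{
  if (clCreateImage) // Entry point exists only since OpenCL 1.2.
  {
    return clCreateImage(context, flags, image_format, image_desc, host_ptr, errcode_ret);
  }
  // Deprecated OpenCL 1.1 path: width, height and pitch come from the descriptor.
  return clCreateImage2D(context, flags, image_format, image_desc->image_width,
                         image_desc->image_height, image_desc->image_row_pitch, host_ptr,
                         errcode_ret);
}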
- */ - -#include "Operations.h" -#include "open_cl/Operations.h" - -#include <algorithm> -#include <cstdint> -#include <set> -#include <string> -#include <utility> -#include <vector> -#include <unordered_map> - -#include "absl/container/flat_hash_map.h" - -#include "Shape.h" -#include "Status.h" -#include "InternalTensor.h" - -namespace onert -{ -namespace backend -{ -namespace gpu_cl -{ - -Padding2D &Padding2D::operator=(const Padding2D &value) -{ - prepended = value.prepended; - appended = value.appended; - return *this; -} - -bool Padding2D::operator==(const Padding2D &value) -{ - return this->prepended == value.prepended && this->appended == value.appended; -} - -bool Padding2D::operator!=(const Padding2D &value) { return !(*this == value); } - -Padding2D &Padding2D::operator-(const Padding2D &value) -{ - prepended.h -= value.prepended.h; - prepended.w -= value.prepended.w; - appended.h -= value.appended.h; - appended.w -= value.appended.w; - return *this; -} - -Padding3D &Padding3D::operator=(const Padding3D &value) -{ - prepended = value.prepended; - appended = value.appended; - return *this; -} - -bool Padding3D::operator==(const Padding3D &value) -{ - return this->prepended == value.prepended && this->appended == value.appended; -} - -bool Padding3D::operator!=(const Padding3D &value) { return !(*this == value); } - -Padding3D &Padding3D::operator-(const Padding3D &value) -{ - prepended.h -= value.prepended.h; - prepended.w -= value.prepended.w; - prepended.d -= value.prepended.d; - appended.h -= value.appended.h; - appended.w -= value.appended.w; - appended.d -= value.appended.d; - return *this; -} - -std::string ToString(enum OperationType op) -{ - switch (op) - { - // case OperationType::ABS: - // return "abs"; - case OperationType::ADD: - return "add"; - // case OperationType::CONCAT: - // return "concat"; - // case OperationType::COS: - // return "cos"; - // case OperationType::EXP: - // return "exp"; - // case OperationType::LOG: - // return "log"; - // case OperationType::NEG: - // return "neg"; - // case OperationType::POOLING_2D: - // return "pooling_2d"; - // case OperationType::REDUCE_MAXIMUM: - // return "reduce_maximum"; - // case OperationType::REDUCE_MINIMUM: - // return "reduce_minimum"; - // case OperationType::REDUCE_PRODUCT: - // return "reduce_product"; - // case OperationType::REDUCE_SUM: - // return "reduce_sum"; - // case OperationType::RESIZE: - // return "resize"; - // case OperationType::RELU: - // return "relu"; - // case OperationType::RSQRT: - // return "rsqrt"; - // case OperationType::SQRT: - // return "sqrt"; - // case OperationType::SQUARE: - // return "square"; - case OperationType::UNKNOWN: - return "unknown_operation"; - } - return ""; -} - -OperationType OperationTypeFromString(const std::string &name) -{ - static const auto operations = new std::unordered_map<std::string, OperationType>({ - // {"abs", OperationType::ABS}, - {"add", OperationType::ADD}, - // {"concat", OperationType::CONCAT}, - // {"cos", OperationType::COS}, - // {"exp", OperationType::EXP}, - // {"log", OperationType::LOG}, - // {"neg", OperationType::NEG}, - // {"pooling_2d", OperationType::POOLING_2D}, - // {"reduce_maximum", OperationType::REDUCE_MAXIMUM}, - // {"reduce_minimum", OperationType::REDUCE_MINIMUM}, - // {"reduce_product", OperationType::REDUCE_PRODUCT}, - // {"reduce_sum", OperationType::REDUCE_SUM}, - // {"relu", OperationType::RELU}, - // {"resize", OperationType::RESIZE}, - // {"rsqrt", OperationType::RSQRT}, - // {"sqrt", OperationType::SQRT}, - // 
{"square", OperationType::SQUARE}, - }); - auto op = operations->find(name); - return op == operations->end() ? OperationType::UNKNOWN : op->second; -} - -namespace -{ - -template <typename T> T DivideRoundUp(T n, T divisor) { return (n - 1) / divisor + 1; } - -int32_t CalculateOutputSizeBeforeStrides(int32_t input, int32_t kernel, int32_t padding, - int32_t dilation) -{ - const int32_t dilated_kernel = (kernel - 1) * dilation + 1; - return input + padding - dilated_kernel + 1; -} - -template <Axis T> -int32_t CalculateOutputWithoutStrides(const BHWC &input, const Convolution2DAttributes &attr) -{ - return CalculateOutputSizeBeforeStrides( - input.get<T>(), attr.weights.shape.get<T>(), - attr.padding.prepended.get<T>() + attr.padding.appended.get<T>(), attr.dilations.get<T>()); -} - -template <Axis T> -int32_t CalculateOutputWithoutStrides(const BHWDC &input, const Convolution3DAttributes &attr) -{ - return CalculateOutputSizeBeforeStrides( - input.get<T>(), attr.weights.shape.get<T>(), - attr.padding.prepended.get<T>() + attr.padding.appended.get<T>(), attr.dilations.get<T>()); -} - -template <Axis T> -int32_t CalculateOutputWithoutStrides(const BHWC &input, const Pooling2DAttributes &attr) -{ - return CalculateOutputSizeBeforeStrides(input.get<T>(), attr.kernel.get<T>(), - attr.padding.prepended.get<T>() + - attr.padding.appended.get<T>(), - /*dilation=*/1); -} - -template <Axis T> -int32_t CalculateOutputWithoutStrides(const BHWDC &input, const Pooling3DAttributes &attr) -{ - return CalculateOutputSizeBeforeStrides(input.get<T>(), attr.kernel.get<T>(), - attr.padding.prepended.get<T>() + - attr.padding.appended.get<T>(), - /*dilation=*/1); -} - -template <Axis T> -int32_t CalculateOutput(const BHWC &input, const ConvolutionTransposedAttributes &attr) -{ - return (input.get<T>() - 1) * attr.stride.get<T>() - - (attr.padding.prepended.get<T>() + attr.padding.appended.get<T>()) + - attr.weights.shape.get<T>() + attr.adjacent.get<T>(); -} - -template <Axis T> -int32_t CalculateOutput(const BHWDC &input, const ConvolutionTransposed3DAttributes &attr) -{ - return (input.get<T>() - 1) * attr.stride.get<T>() - - (attr.padding.prepended.get<T>() + attr.padding.appended.get<T>()) + - attr.weights.shape.get<T>(); -} - -inline int32_t StridedSize(int32_t size, int32_t stride) -{ - return stride == 0 ? -1 : DivideRoundUp(size, stride); -} - -template <Axis AxisT, typename AttrT> int32_t CalculateOutput(const BHWC &input, const AttrT &attr) -{ - return StridedSize(CalculateOutputWithoutStrides<AxisT>(input, attr), - attr.strides.template get<AxisT>()); -} - -template <Axis AxisT, typename AttrT> int32_t CalculateOutput(const BHWDC &input, const AttrT &attr) -{ - return StridedSize(CalculateOutputWithoutStrides<AxisT>(input, attr), - attr.strides.template get<AxisT>()); -} - -int32_t CalculateSamePadding(int32_t input, int32_t kernel, int32_t dilation, int32_t stride) -{ - const int32_t dilated_kernel = (kernel - 1) * dilation + 1; - return std::max(0, dilated_kernel - (input - 1) % stride - 1); -} - -// Returns a padding that should be present to make sure image size stays -// the same. -template <Axis AxisT> -int32_t CalculateSamePadding(const BHWC &input, const Convolution2DAttributes &attr) -{ - return CalculateSamePadding(input.get<AxisT>(), attr.weights.shape.get<AxisT>(), - attr.dilations.get<AxisT>(), attr.strides.get<AxisT>()); -} - -// Returns a padding that should be present to make sure image size stays -// the same. 
-template <Axis AxisT> -int32_t CalculateSamePadding(const BHWDC &input, const Convolution3DAttributes &attr) -{ - return CalculateSamePadding(input.get<AxisT>(), attr.weights.shape.get<AxisT>(), - attr.dilations.get<AxisT>(), attr.strides.get<AxisT>()); -} - -template <Axis AxisT> -int32_t CalculateSamePadding(const BHWC &input, const ConvolutionTransposedAttributes &attr) -{ - return CalculateSamePadding(input.get<AxisT>(), attr.weights.shape.get<AxisT>(), - /*dilation=*/1, attr.stride.get<AxisT>()); -} - -template <Axis AxisT> -int32_t CalculateSamePadding(const BHWDC &input, const ConvolutionTransposed3DAttributes &attr) -{ - return CalculateSamePadding(input.get<AxisT>(), attr.weights.shape.get<AxisT>(), - /*dilation=*/1, attr.stride.get<AxisT>()); -} - -template <Axis AxisT> -int32_t CalculateSamePadding(const BHWC &input, const Pooling2DAttributes &attr) -{ - return CalculateSamePadding(input.get<AxisT>(), attr.kernel.get<AxisT>(), - /*dilation=*/1, attr.strides.get<AxisT>()); -} - -template <Axis AxisT> -int32_t CalculateSamePadding(const BHWDC &input, const Pooling3DAttributes &attr) -{ - return CalculateSamePadding(input.get<AxisT>(), attr.kernel.get<AxisT>(), - /*dilation=*/1, attr.strides.get<AxisT>()); -} - -template <Axis AxisT> -int32_t CalculateSamePadding(const BHWC &input, const MaxUnpooling2DAttributes &attr) -{ - return CalculateSamePadding(input.get<AxisT>(), attr.kernel.get<AxisT>(), - /*dilation=*/1, attr.strides.get<AxisT>()); -} - -template <Axis AxisT> -int32_t CalculateSamePadding(const BHWDC &input, const MaxUnpooling3DAttributes &attr) -{ - return CalculateSamePadding(input.get<AxisT>(), attr.kernel.get<AxisT>(), - /*dilation=*/1, attr.strides.get<AxisT>()); -} - -Padding2D MakeSamePadding(const BHWC &input, const ConvolutionTransposedAttributes &attr) -{ - int32_t padding_height = CalculateSamePadding<Axis::HEIGHT>(input, attr); - int32_t padding_width = CalculateSamePadding<Axis::WIDTH>(input, attr); - Padding2D padding; - padding.prepended = HW(padding_height / 2, padding_width / 2); - padding.appended = HW(padding_height - padding_height / 2, padding_width - padding_width / 2); - return padding; -} - -Padding3D MakeSamePadding(const BHWDC &input, const ConvolutionTransposed3DAttributes &attr) -{ - int32_t padding_height = CalculateSamePadding<Axis::HEIGHT>(input, attr); - int32_t padding_width = CalculateSamePadding<Axis::WIDTH>(input, attr); - int32_t padding_depth = CalculateSamePadding<Axis::DEPTH>(input, attr); - Padding3D padding; - padding.prepended = HWD(padding_height / 2, padding_width / 2, padding_depth / 2); - padding.appended = HWD(padding_height - padding_height / 2, padding_width - padding_width / 2, - padding_depth - padding_depth / 2); - return padding; -} - -// If padding depends on input, convert it into fixed padding. -template <class AttrT> Padding2D MakeSamePadding(const BHWC &input, const AttrT &attr) -{ - int32_t padding_height = CalculateSamePadding<Axis::HEIGHT>(input, attr); - int32_t padding_width = CalculateSamePadding<Axis::WIDTH>(input, attr); - Padding2D padding; - padding.prepended = HW(padding_height / 2, padding_width / 2); - padding.appended = HW(padding_height - padding_height / 2, padding_width - padding_width / 2); - return padding; -} - -// If padding depends on input, convert it into fixed padding. 
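The MakeSamePadding overloads above split the total per-axis padding between the two sides; the integer halving puts the extra pixel on the appended side when the total is odd:

// With padding_height = 3: prepended = 3 / 2 = 1, appended = 3 - 1 = 2.
constexpr int32_t total_padding = 3;
constexpr int32_t prepended = total_padding / 2;
constexpr int32_t appended = total_padding - prepended;
static_assert(prepended == 1 && appended == 2, "odd padding favors the appended side");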
-template <class AttrT> Padding3D MakeSamePadding(const BHWDC &input, const AttrT &attr) -{ - int32_t padding_height = CalculateSamePadding<Axis::HEIGHT>(input, attr); - int32_t padding_width = CalculateSamePadding<Axis::WIDTH>(input, attr); - int32_t padding_depth = CalculateSamePadding<Axis::DEPTH>(input, attr); - Padding3D padding; - padding.prepended = HWD(padding_height / 2, padding_width / 2, padding_depth / 2); - padding.appended = HWD(padding_height - padding_height / 2, padding_width - padding_width / 2, - padding_depth - padding_depth / 2); - return padding; -} - -} // namespace - -BHWC CalculateOutputShape(const BHWC &input, const MaxUnpooling2DAttributes &attr) -{ - return BHWC( - input.b, input.h * attr.strides.h - attr.padding.prepended.h - attr.padding.appended.h, - input.w * attr.strides.w - attr.padding.prepended.w - attr.padding.appended.w, input.c); -} - -BHWDC CalculateOutputShape(const BHWDC &input, const MaxUnpooling3DAttributes &attr) -{ - return BHWDC( - input.b, input.h * attr.strides.h - attr.padding.prepended.h - attr.padding.appended.h, - input.w * attr.strides.w - attr.padding.prepended.w - attr.padding.appended.w, - input.d * attr.strides.d - attr.padding.prepended.d - attr.padding.appended.d, input.c); -} - -BHWC CalculateOutputShape(const BHWC &input, const Pooling2DAttributes &attr) -{ - return BHWC(input.b, CalculateOutput<Axis::HEIGHT>(input, attr), - CalculateOutput<Axis::WIDTH>(input, attr), input.c); -} - -BHWDC CalculateOutputShape(const BHWDC &input, const Pooling3DAttributes &attr) -{ - return BHWDC(input.b, CalculateOutput<Axis::HEIGHT>(input, attr), - CalculateOutput<Axis::WIDTH>(input, attr), CalculateOutput<Axis::DEPTH>(input, attr), - input.c); -} - -BHWC CalculateOutputShape(const BHWC &input, const Convolution2DAttributes &attr) -{ - return BHWC(input.b, CalculateOutput<Axis::HEIGHT>(input, attr), - CalculateOutput<Axis::WIDTH>(input, attr), - attr.weights.shape.get<Axis::OUTPUT_CHANNELS>()); -} - -BHWDC CalculateOutputShape(const BHWDC &input, const Convolution3DAttributes &attr) -{ - return BHWDC(input.b, CalculateOutput<Axis::HEIGHT>(input, attr), - CalculateOutput<Axis::WIDTH>(input, attr), CalculateOutput<Axis::DEPTH>(input, attr), - attr.weights.shape.get<Axis::OUTPUT_CHANNELS>()); -} - -BHWC CalculateOutputShape(const BHWC &input, const ConvolutionTransposedAttributes &attr) -{ - return BHWC(input.b, CalculateOutput<Axis::HEIGHT>(input, attr), - CalculateOutput<Axis::WIDTH>(input, attr), - attr.weights.shape.get<Axis::OUTPUT_CHANNELS>()); -} - -BHWDC CalculateOutputShape(const BHWDC &input, const ConvolutionTransposed3DAttributes &attr) -{ - return BHWDC(input.b, CalculateOutput<Axis::HEIGHT>(input, attr), - CalculateOutput<Axis::WIDTH>(input, attr), CalculateOutput<Axis::DEPTH>(input, attr), - attr.weights.shape.get<Axis::OUTPUT_CHANNELS>()); -} - -BHWC CalculateOutputShape(const BHWC &input, const DepthwiseConvolution2DAttributes &attr) -{ - return BHWC(input.b, CalculateOutput<Axis::HEIGHT>(input, attr), - CalculateOutput<Axis::WIDTH>(input, attr), - attr.weights.shape.get<Axis::OUTPUT_CHANNELS>() * - attr.weights.shape.get<Axis::INPUT_CHANNELS>()); -} - -BHWDC CalculateOutputShape(const BHWDC &input, const DepthwiseConvolution3DAttributes &attr) -{ - return BHWDC(input.b, CalculateOutput<Axis::HEIGHT>(input, attr), - CalculateOutput<Axis::WIDTH>(input, attr), CalculateOutput<Axis::DEPTH>(input, attr), - attr.weights.shape.get<Axis::OUTPUT_CHANNELS>() * - attr.weights.shape.get<Axis::INPUT_CHANNELS>()); -} - -BHWC 
CalculateOutputShape(const BHWC &input, const SliceAttributes &attr) -{ - (void)input; - return BHWC(StridedSize(attr.ends.b - attr.starts.b, attr.strides.b), - StridedSize(attr.ends.h - attr.starts.h, attr.strides.h), - StridedSize(attr.ends.w - attr.starts.w, attr.strides.w), - StridedSize(attr.ends.c - attr.starts.c, attr.strides.c)); -} - -BHWDC CalculateOutputShape(const BHWDC &input, const Slice3DAttributes &attr) -{ - (void)input; - return BHWDC(StridedSize(attr.ends.b - attr.starts.b, attr.strides.b), - StridedSize(attr.ends.h - attr.starts.h, attr.strides.h), - StridedSize(attr.ends.w - attr.starts.w, attr.strides.w), - StridedSize(attr.ends.d - attr.starts.d, attr.strides.d), - StridedSize(attr.ends.c - attr.starts.c, attr.strides.c)); -} - -BHWC CalculateOutputShape(const BHWC &input, const PadAttributes &attr) -{ - return BHWC( - attr.appended.b + attr.prepended.b + input.b, attr.appended.h + attr.prepended.h + input.h, - attr.appended.w + attr.prepended.w + input.w, attr.appended.c + attr.prepended.c + input.c); -} - -BHWDC CalculateOutputShape(const BHWDC &input, const Pad3DAttributes &attr) -{ - return BHWDC( - attr.appended.b + attr.prepended.b + input.b, attr.appended.h + attr.prepended.h + input.h, - attr.appended.w + attr.prepended.w + input.w, attr.appended.d + attr.prepended.d + input.d, - attr.appended.c + attr.prepended.c + input.c); -} - -BHWC CalculateOutputShape(const BHWC &input, const FullyConnectedAttributes &attr) -{ - return BHWC(input.b, 1, 1, attr.weights.shape.o); -} - -BHWC CalculateOutputShape(const BHWC &input, const MeanAttributes &attr) -{ - const int b = attr.dims.find(Axis::BATCH) == attr.dims.end() ? input.b : 1; - const int h = attr.dims.find(Axis::HEIGHT) == attr.dims.end() ? input.h : 1; - const int w = attr.dims.find(Axis::WIDTH) == attr.dims.end() ? input.w : 1; - const int c = attr.dims.find(Axis::CHANNELS) == attr.dims.end() ? 
input.c : 1; - return BHWC(b, h, w, c); -} - -absl::Status CalculateOutputShape(const std::vector<BHWC> &input, const ConcatAttributes &attr, - BHWC *output_shape) -{ - BHWC new_shape = input[0]; - switch (attr.axis) - { - case Axis::CHANNELS: - for (size_t i = 1; i < input.size(); i++) - { - if (input[i].h != new_shape.h || input[i].w != new_shape.w || input[i].b != new_shape.b) - { - return absl::InvalidArgumentError( - "Height, Width and Batch must be the same when concatenating " - "by channels axis"); - } - new_shape.c += input[i].c; - } - break; - case Axis::HEIGHT: - for (size_t i = 1; i < input.size(); i++) - { - if (input[i].w != new_shape.w || input[i].c != new_shape.c || input[i].b != new_shape.b) - { - return absl::InvalidArgumentError( - "Channels, Width and Batch must be the same when concatenating " - "by height axis"); - } - new_shape.h += input[i].h; - } - break; - case Axis::WIDTH: - for (size_t i = 1; i < input.size(); i++) - { - if (input[i].h != new_shape.h || input[i].c != new_shape.c || input[i].b != new_shape.b) - { - return absl::InvalidArgumentError( - "Height, Channels and Batch must be the same when concatenating " - "by width axis"); - } - new_shape.w += input[i].w; - } - break; - case Axis::BATCH: - for (size_t i = 1; i < input.size(); i++) - { - if (input[i].h != new_shape.h || input[i].c != new_shape.c || input[i].w != new_shape.w) - { - return absl::InvalidArgumentError( - "Width, Height and Channels must be the same when concatenating " - "by batch axis"); - } - new_shape.b += input[i].b; - } - break; - default: - return absl::InvalidArgumentError("Invalid axis"); - break; - } - *output_shape = new_shape; - return absl::OkStatus(); -} - -absl::Status CalculateOutputShape(const std::vector<BHWDC> &input, const ConcatAttributes &attr, - BHWDC *output_shape) -{ - BHWDC new_shape = input[0]; - switch (attr.axis) - { - case Axis::CHANNELS: - for (size_t i = 1; i < input.size(); ++i) - { - if (input[i].h != new_shape.h || input[i].w != new_shape.w || input[i].d != new_shape.d || - input[i].b != new_shape.b) - { - return absl::InvalidArgumentError("Height, Width, Batch and Depth must be the same when " - "concatenating " - "by channels axis"); - } - new_shape.c += input[i].c; - } - break; - case Axis::HEIGHT: - for (size_t i = 1; i < input.size(); ++i) - { - if (input[i].w != new_shape.w || input[i].c != new_shape.c || input[i].d != new_shape.d || - input[i].b != new_shape.b) - { - return absl::InvalidArgumentError( - "Width, Depth, Batch and Channels must be the same when " - "concatenating " - "by height axis"); - } - new_shape.h += input[i].h; - } - break; - case Axis::WIDTH: - for (size_t i = 1; i < input.size(); ++i) - { - if (input[i].h != new_shape.h || input[i].c != new_shape.c || input[i].d != new_shape.d || - input[i].b != new_shape.b) - { - return absl::InvalidArgumentError( - "Height, Depth, Batch and Channels must be the same when " - "concatenating " - "by width axis"); - } - new_shape.w += input[i].w; - } - break; - case Axis::DEPTH: - for (size_t i = 1; i < input.size(); ++i) - { - if (input[i].w != new_shape.w || input[i].h != new_shape.h || input[i].c != new_shape.c || - input[i].b != new_shape.b) - { - return absl::InvalidArgumentError( - "Width, Height, Batch and Channels must be the same when " - "concatenating " - "by depth axis"); - } - new_shape.d += input[i].d; - } - break; - case Axis::BATCH: - for (size_t i = 1; i < input.size(); ++i) - { - if (input[i].w != new_shape.w || input[i].h != new_shape.h || input[i].c != new_shape.c || - 
input[i].d != new_shape.d) - { - return absl::InvalidArgumentError( - "Width, Height, Depth and Channels must be the same when " - "concatenating " - "by batch axis"); - } - new_shape.b += input[i].b; - } - break; - default: - return absl::InvalidArgumentError("Invalid axis"); - } - *output_shape = new_shape; - return absl::OkStatus(); -} - -Padding2D CalculateSamePadding(const BHWC &input, const Convolution2DAttributes &attr) -{ - return MakeSamePadding(input, attr); -} - -Padding3D CalculateSamePadding(const BHWDC &input, const Convolution3DAttributes &attr) -{ - return MakeSamePadding(input, attr); -} - -Padding2D CalculateSamePadding(const BHWC &input, const ConvolutionTransposedAttributes &attr) -{ - return MakeSamePadding(input, attr); -} - -Padding3D CalculateSamePadding(const BHWDC &input, const ConvolutionTransposed3DAttributes &attr) -{ - return MakeSamePadding(input, attr); -} - -Padding2D CalculateSamePadding(const BHWC &input, const DepthwiseConvolution2DAttributes &attr) -{ - return MakeSamePadding(input, attr); -} - -Padding3D CalculateSamePadding(const BHWDC &input, const DepthwiseConvolution3DAttributes &attr) -{ - return MakeSamePadding(input, attr); -} - -Padding2D CalculateSamePadding(const BHWC &input, const Pooling2DAttributes &attr) -{ - return MakeSamePadding(input, attr); -} - -Padding3D CalculateSamePadding(const BHWDC &input, const Pooling3DAttributes &attr) -{ - return MakeSamePadding(input, attr); -} - -Padding2D CalculateSamePadding(const BHWC &input, const MaxUnpooling2DAttributes &attr) -{ - return MakeSamePadding(input, attr); -} - -Padding3D CalculateSamePadding(const BHWDC &input, const MaxUnpooling3DAttributes &attr) -{ - return MakeSamePadding(input, attr); -} - -float CalculateResizeScale(int32_t input_size, int32_t output_size, const Resize2DAttributes &attr) -{ - return attr.align_corners && input_size > 1 && output_size > 1 - ? static_cast<float>(input_size - 1) / (output_size - 1) - : static_cast<float>(input_size) / output_size; -} - -float CalculateResizeScale(int32_t input_size, int32_t output_size, const Resize3DAttributes &attr) -{ - return attr.align_corners && input_size > 1 && output_size > 1 - ? static_cast<float>(input_size - 1) / (output_size - 1) - : static_cast<float>(input_size) / output_size; -} - -BHWC CalculateOutputShape(const BHWC &input, const Resize2DAttributes &attr) -{ - return BHWC(input.b, attr.new_shape.h, attr.new_shape.w, input.c); -} - -BHWDC CalculateOutputShape(const BHWDC &input, const Resize3DAttributes &attr) -{ - return BHWDC(input.b, attr.new_shape.h, attr.new_shape.w, attr.new_shape.d, input.c); -} - -BHWC CalculateOutputShape(const BHWC &input, const TransposeAttributes &attr) -{ - return BHWC(input.get(attr.perm.b), input.get(attr.perm.h), input.get(attr.perm.w), - input.get(attr.perm.c)); -} - -BHWDC CalculateOutputShape(const BHWDC &input, const Transpose3DAttributes &attr) -{ - return BHWDC(input.get(attr.perm.b), input.get(attr.perm.h), input.get(attr.perm.w), - input.get(attr.perm.d), input.get(attr.perm.c)); -} - -} // namespace gpu_cl -} // namespace backend -} // namespace onert diff --git a/runtime/onert/backend/gpu_cl/open_cl/Operations.h b/runtime/onert/backend/gpu_cl/open_cl/Operations.h deleted file mode 100644 index 825eb90a4..000000000 --- a/runtime/onert/backend/gpu_cl/open_cl/Operations.h +++ /dev/null @@ -1,586 +0,0 @@ -/* - * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright 2020 The TensorFlow Authors. All Rights Reserved. 
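CalculateResizeScale, defined a few lines above for both 2D and 3D attributes, picks between two formulas; a numeric illustration, assuming the Resize2DAttributes type from Operations.h:

// Upscaling a 4-pixel axis to 8 pixels:
Resize2DAttributes attr;
attr.align_corners = true;
float corner_scale = CalculateResizeScale(4, 8, attr); // (4 - 1) / (8 - 1) = 3/7: corner centers map onto corner centers
attr.align_corners = false;
float plain_scale = CalculateResizeScale(4, 8, attr);  // 4 / 8 = 0.5
// align_corners only takes effect when both sizes are greater than 1.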
- * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef __ONERT_BACKEND_GPU_CL_OPENCL_OPERATIONS_H__ -#define __ONERT_BACKEND_GPU_CL_OPENCL_OPERATIONS_H__ - -#include <cstdint> -#include <set> -#include <string> -#include <vector> - -#include "absl/types/variant.h" - -#include "DataType.h" -#include "Shape.h" -#include "Status.h" -#include "InternalTensor.h" - -namespace onert -{ -namespace backend -{ -namespace gpu_cl -{ - -enum class OperationType -{ - UNKNOWN = 0, - // ABS, - ADD, - // BATCH_TO_SPACE, - // BATCH_NORMALIZATION, - // BATCHED_MATMUL, - // CONCAT, - // CONST, - // CONVOLUTION_2D, - // CONVOLUTION_TRANSPOSED, - // COPY, - // COS, - // DEPTHWISE_CONVOLUTION, - // DIV, - // ELU, - // EQUAL, - // EXP, - // FULLY_CONNECTED, - // GREATER, - // GREATER_EQUAL, - // HARD_SWISH, - // LESS, - // LESS_EQUAL, - // LOG, - // LSTM, - // MAXIMUM, - // MAX_UNPOOLING_2D, - // MEAN, - // MEAN_STDDEV_NORMALIZATION, - // MINIMUM, - // MUL, - // NEG, - // NOT_EQUAL, - // PAD, - // POOLING_2D, - // POW, - // PRELU, - // Used to accurately run inference on quantized models. - // QUANTIZE_AND_DEQUANTIZE, - // REDUCE_MAXIMUM, - // REDUCE_MINIMUM, - // REDUCE_PRODUCT, - // REDUCE_SUM, - // RELU, - // RESHAPE, - // RESIZE, - // RSQRT, - // SIGMOID, - // SIN, - // SLICE, - // SOFTMAX, - // SPACE_TO_BATCH, - // SPACE_TO_DEPTH, - // SQRT, - // SQUARE, - // SQUARED_DIFF, - // SUB, - // TANH, - // TRANSPOSE, -}; - -std::string ToString(enum OperationType op); - -OperationType OperationTypeFromString(const std::string &name); - -typedef absl::variant<absl::monostate, InternalTensor<HWC, DataType::FLOAT32>, - InternalTensor<Linear, DataType::FLOAT32>, float> - TensorOrScalar; - -struct Padding2D -{ - Padding2D() = default; - Padding2D(const Padding2D &); - Padding2D &operator=(const Padding2D &value); - bool operator==(const Padding2D &value); - bool operator!=(const Padding2D &value); - Padding2D &operator-(const Padding2D &value); - - // Padding values for every axis (if needed), where 'prepended' defines - // padding for the beginning of each axis and 'appended' represents end part - // of the corresponding axis. - HW prepended = HW(-1, -1); - HW appended = HW(-1, -1); -}; - -struct Padding3D -{ - Padding3D() = default; - Padding3D(const Padding3D &); - Padding3D &operator=(const Padding3D &value); - bool operator==(const Padding3D &value); - bool operator!=(const Padding3D &value); - Padding3D &operator-(const Padding3D &value); - // Padding values for every axis (if needed), where 'prepended' defines - // padding for the beginning of each axis and 'appended' represents end part - // of the corresponding axis. 
- HWD prepended = HWD(0, 0, 0); - HWD appended = HWD(0, 0, 0); -}; - -struct Crop2D : public Padding2D -{ -}; - -struct SpaceToBatchAttributes -{ - HW block; - Padding2D padding; -}; - -struct BatchToSpaceAttributes -{ - HW block; - Crop2D crop; -}; - -enum class PoolingType -{ - UNDEFINED = 0, - - // average pooling - AVERAGE = 1, - - // max pooling - MAX = 2, -}; - -struct Pooling2DAttributes -{ - PoolingType type = PoolingType::UNDEFINED; - // Strides for every axis. - HW strides = HW(-1, -1); - HW kernel = HW(-1, -1); - Padding2D padding; - // NOTE(akulik): technically the number of outputs from Pooling node indicates - // whether indices are needed or not, but I decided to keep it inside - // attributes to simplify processing. - bool output_indices = false; -}; - -struct Pooling3DAttributes -{ - PoolingType type = PoolingType::UNDEFINED; - // Strides for every axis. - HWD strides = HWD(0, 0, 0); - HWD kernel = HWD(0, 0, 0); - Padding3D padding; - // NOTE(akulik): technically the number of outputs from Pooling node indicates - // whether indices are needed or not, but I decided to keep it inside - // attributes to simplify processing. - bool output_indices = false; -}; - -struct MaxUnpooling2DAttributes -{ - // Strides for every axis. - HW strides = HW(-1, -1); - HW kernel = HW(-1, -1); - Padding2D padding; -}; - -struct MaxUnpooling3DAttributes -{ - // Strides for every axis. - HWD strides = HWD(0, 0, 0); - HWD kernel = HWD(0, 0, 0); - Padding3D padding; -}; - -struct MeanAttributes -{ - // The vector of dimensions to calculate mean along. - std::set<Axis> dims; -}; - -struct ConcatAttributes -{ - // Defines the axis to concatenate along. - Axis axis = Axis::UNKNOWN; -}; - -// @return shape of a tensor after MaxUnpooling2D operation is applied to -// the given input. -BHWC CalculateOutputShape(const BHWC &input, const MaxUnpooling2DAttributes &attr); - -// @return shape of a tensor after MaxUnpooling3D operation is applied to -// the given input. -BHWDC CalculateOutputShape(const BHWDC &input, const MaxUnpooling3DAttributes &attr); - -// @return shape of a tensor after Pooling2D operation is applied to the given -// input. -BHWC CalculateOutputShape(const BHWC &input, const Pooling2DAttributes &attr); - -// @return shape of a tensor after Pooling3D operation is applied to the given -// input. -BHWDC CalculateOutputShape(const BHWDC &input, const Pooling3DAttributes &attr); - -// @return shape of a tensor after Concat operation is applied to the given -// input. -absl::Status CalculateOutputShape(const std::vector<BHWC> &input, const ConcatAttributes &attr, - BHWC *output_shape); - -// @return shape of a tensor after Concat operation is applied to the given -// input. -absl::Status CalculateOutputShape(const std::vector<BHWDC> &input, const ConcatAttributes &attr, - BHWDC *output_shape); - -// @return padding for pooling operation to make sure the output keeps the same shape -// as the given input. -Padding2D CalculateSamePadding(const BHWC &input, const Pooling2DAttributes &attr); - -// @return padding for pooling operation to make sure the output keeps the same shape -// as the given input. -Padding3D CalculateSamePadding(const BHWDC &input, const Pooling3DAttributes &attr); - -// @return padding for max unpooling operation to make sure the output keeps the same -// shape as the given input. -Padding2D CalculateSamePadding(const BHWC &input, const MaxUnpooling2DAttributes &attr); - -// @return padding for max unpooling operation to make sure the output keeps the same -// shape as the given input. 
-Padding3D CalculateSamePadding(const BHWDC &input, const MaxUnpooling3DAttributes &attr); - -struct Convolution2DAttributes -{ - HW strides = HW(1, 1); // Along each axis. - HW dilations = HW(1, 1); // Along each axis. - Padding2D padding; - - InternalTensor<OHWI, DataType::FLOAT32> weights; - InternalTensor<Linear, DataType::FLOAT32> bias; // optional -}; - -struct Convolution3DAttributes -{ - HWD strides = HWD(0, 0, 0); // Along each axis. - HWD dilations = HWD(0, 0, 0); // Along each axis. - Padding3D padding; - - InternalTensor<OHWDI, DataType::FLOAT32> weights; - InternalTensor<Linear, DataType::FLOAT32> bias; // optional -}; - -// @return shape of a tensor after Convolution2D operation is applied to -// the given input. -BHWC CalculateOutputShape(const BHWC &input, const Convolution2DAttributes &attr); - -// @return shape of a tensor after Convolution3D operation is applied to -// the given input. -BHWDC CalculateOutputShape(const BHWDC &input, const Convolution3DAttributes &attr); - -// @return padding for convolution operation to make sure the output keeps the same -// shape as the given input. -Padding2D CalculateSamePadding(const BHWC &input, const Convolution2DAttributes &attr); - -// @return padding for convolution operation to make sure the output keeps the same -// shape as the given input. -Padding3D CalculateSamePadding(const BHWDC &input, const Convolution3DAttributes &attr); - -struct ConvolutionTransposedAttributes -{ - HW stride = HW(1, 1); // Along each axis. - HW adjacent; // TODO(sorokin): No op on Flow. - Padding2D padding; - - InternalTensor<OHWI, DataType::FLOAT32> weights; - InternalTensor<Linear, DataType::FLOAT32> bias; // optional -}; - -struct ConvolutionTransposed3DAttributes -{ - HWD stride = HWD(0, 0, 0); // Along each axis. - Padding3D padding; - - InternalTensor<OHWDI, DataType::FLOAT32> weights; - InternalTensor<Linear, DataType::FLOAT32> bias; // optional -}; - -Padding2D CalculateSamePadding(const BHWC &input, const ConvolutionTransposedAttributes &attr); - -Padding3D CalculateSamePadding(const BHWDC &input, const ConvolutionTransposed3DAttributes &attr); - -// @return shape of a tensor after ConvolutionTransposed operation is applied to -// the given input. -BHWC CalculateOutputShape(const BHWC &input, const ConvolutionTransposedAttributes &attr); - -// @return shape of a tensor after ConvolutionTransposed3D operation is applied to -// the given -// input. -BHWDC CalculateOutputShape(const BHWDC &input, const ConvolutionTransposed3DAttributes &attr); - -struct DepthwiseConvolution2DAttributes : public Convolution2DAttributes -{ -}; -struct DepthwiseConvolution3DAttributes : public Convolution3DAttributes -{ -}; - -// @return shape of a tensor after DepthwiseConvolution2D operation is applied -// to the given input. -BHWC CalculateOutputShape(const BHWC &input, const DepthwiseConvolution2DAttributes &attr); - -// @return shape of a tensor after DepthwiseConvolution3D operation is applied -// to the given input. -BHWDC CalculateOutputShape(const BHWDC &input, const DepthwiseConvolution3DAttributes &attr); - -// @return padding for depthwise convolution operation to make sure the output keeps -// the same shape as the given input. -Padding2D CalculateSamePadding(const BHWC &input, const DepthwiseConvolution2DAttributes &attr); - -// @return padding for depthwise convolution operation to make sure the output keeps -// the same shape as the given input. 
-Padding3D CalculateSamePadding(const BHWDC &input, const DepthwiseConvolution3DAttributes &attr); - -// f(x):= { -// if x < 0 : x -> alpha * x -// if x >= 0 : x -> min(clip, x) -// } -// -// Examples: -// - ReLU: clip = 0, alpha = 0 -// - ReLU6: clip = 6, alpha = 0 -// - Leaky ReLU: clip = 0, alpha = a -struct ReLUAttributes -{ - // clip <= 0 means it is not set. - float clip = 0; - - float alpha = 0; -}; - -struct PReLUAttributes -{ - // clip <= 0 means it is not set. - float clip = 0; - - // If alpha is linear, then it is sharded across the CHANNELS axis, otherwise - // full shape alpha is required. - absl::variant<InternalTensor<Linear, DataType::FLOAT32>, InternalTensor<HWC, DataType::FLOAT32>> - alpha; -}; - -struct ReduceAttributes -{ - Axis axis = Axis::UNKNOWN; -}; - -struct SoftmaxAttributes -{ - Axis axis = Axis::UNKNOWN; -}; - -enum LstmKernelType -{ - FULL = 0, - BASIC = 1, // Currently, only basic is supported. -}; - -struct LstmAttributes -{ - LstmKernelType kernel_type = LstmKernelType::BASIC; -}; - -enum class SamplingType -{ - UNKNOWN = 0, - NEAREST = 1, - BILINEAR = 2, -}; - -struct Resize2DAttributes -{ - HW new_shape; - - SamplingType type = SamplingType::UNKNOWN; - - // If true, the centers of the 4 corner pixels of the input and output tensors - // are aligned, preserving the values at the corner pixels. Defaults to false. - bool align_corners = false; - - bool half_pixel_centers = false; -}; - -// TODO(b/147771327): rename to Resize3D -struct Resize3DAttributes -{ - HWD new_shape; - - SamplingType type = SamplingType::NEAREST; - - // If true, the centers of the 8 corner pixels of the input and output tensors - // are aligned, preserving the values at the corner pixels. Defaults to false. - bool align_corners = false; - - bool half_pixel_centers = false; -}; - -float CalculateResizeScale(int32_t input_size, int32_t output_size, const Resize2DAttributes &attr); - -float CalculateResizeScale(int32_t input_size, int32_t output_size, const Resize3DAttributes &attr); - -// @return shape of a tensor after scale operation is applied to the given -// input. -BHWC CalculateOutputShape(const BHWC &input, const Resize2DAttributes &attr); - -// @return shape of a tensor after scale operation is applied to the given -// input. -BHWDC CalculateOutputShape(const BHWDC &input, const Resize3DAttributes &attr); - -enum class PaddingContentType -{ - ZEROS = 0, - REFLECT = 1, - EDGE = 2, -}; - -struct PadAttributes -{ - PaddingContentType type = PaddingContentType::ZEROS; - - BHWC prepended; - BHWC appended; -}; - -// @return shape of a tensor after Pad operation is applied to the given input. -BHWC CalculateOutputShape(const BHWC &input, const PadAttributes &attr); - -struct Pad3DAttributes -{ - PaddingContentType type = PaddingContentType::ZEROS; - - BHWDC prepended; - BHWDC appended; -}; - -// @return shape of a tensor after Pad3D operation is applied to the given -// input. -BHWDC CalculateOutputShape(const BHWDC &input, const Pad3DAttributes &attr); - -struct ConstTensorAttributes -{ - InternalTensor<BHWC, DataType::FLOAT32> tensor; -}; - -// Simple slicing without advanced support for shrinking, reverse slicing etc. -struct SliceAttributes -{ - // Specifies start and end dimensions for slicing. - BHWC starts; - BHWC ends; - - // Stride should be >= 1. - BHWC strides; -}; - -// @return shape of a tensor after Slice2D operation is applied to the given -// input. 
-BHWC CalculateOutputShape(const BHWC &input, const SliceAttributes &attr); - -// Simple slicing without advanced support for shrinking, reverse slicing etc. -struct Slice3DAttributes -{ - // Specifies start and end dimensions for slicing. - BHWDC starts; - BHWDC ends; - - // Stride should be >= 1. - BHWDC strides; -}; - -// @return shape of a tensor after Slice3D operation is applied to the given -// input. -BHWDC CalculateOutputShape(const BHWDC &input, const Slice3DAttributes &attr); - -struct FullyConnectedAttributes -{ - InternalTensor<OHWI, DataType::FLOAT32> weights; - InternalTensor<Linear, DataType::FLOAT32> bias; -}; - -// @return shape of a tensor after FullyConnected operation is applied to -// the given input. -BHWC CalculateOutputShape(const BHWC &input, const FullyConnectedAttributes &attr); - -// @return shape of a tensor after Mean operation is applied to the given input. -BHWC CalculateOutputShape(const BHWC &input, const MeanAttributes &attr); - -struct ElementwiseAttributes -{ - TensorOrScalar param; - // For an elementwise operation with 2 inputs op(A, B), runtime_tensor_is_second is - // true when the runtime tensor is B (in the second position). This is important for - // ops that are non-commutative, for example subtract. - bool runtime_tensor_is_second = false; -}; - -struct ReshapeAttributes -{ - BHWC new_shape; -}; - -struct Reshape3DAttributes -{ - BHWDC new_shape; -}; - -struct TransposeAttributes -{ - // A permutation of the dimensions of the input tensor - BHWC perm; -}; - -// @return shape of a tensor after Transpose operation is applied to -// the given input. -BHWC CalculateOutputShape(const BHWC &input, const TransposeAttributes &attr); - -struct Transpose3DAttributes -{ - // A permutation of the dimensions of the input tensor - BHWDC perm; -}; - -// @return shape of a tensor after Transpose3D operation is applied to -// the given input. -BHWDC CalculateOutputShape(const BHWDC &input, const Transpose3DAttributes &attr); - -struct SpaceToDepthAttributes -{ - int block_size; -}; - -// These help perform a combination of Quantize & Dequantize to adjust float -// values like quantized inference would. -struct QuantizeAndDequantizeAttributes -{ - float min = 0; - float max = 0; - float scale = 0; -}; - -} // namespace gpu_cl -} // namespace backend -} // namespace onert - -#endif // __ONERT_BACKEND_GPU_CL_OPENCL_OPERATIONS_H__ diff --git a/runtime/onert/backend/gpu_cl/open_cl/Precision.cc b/runtime/onert/backend/gpu_cl/open_cl/Precision.cc deleted file mode 100644 index bd908bd43..000000000 --- a/runtime/onert/backend/gpu_cl/open_cl/Precision.cc +++ /dev/null @@ -1,56 +0,0 @@ -/* - * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright 2019 The TensorFlow Authors. All Rights Reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License.
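The Slice output shape declared above is plain strided arithmetic, StridedSize(ends - starts, strides) per axis; a short sketch assuming BHWC's (b, h, w, c) constructor:

SliceAttributes attr;
attr.starts = BHWC(0, 0, 0, 0);
attr.ends = BHWC(1, 5, 8, 4);
attr.strides = BHWC(1, 2, 2, 1);
// h keeps indices {0, 2, 4} and w keeps {0, 2, 4, 6}:
// StridedSize(5, 2) = 3, StridedSize(8, 2) = 4 -> output is BHWC(1, 3, 4, 4).
BHWC out = CalculateOutputShape(BHWC(1, 5, 8, 4), attr); // the input argument itself is unused here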
- */ - -#include "Precision.h" - -namespace onert -{ -namespace backend -{ -namespace gpu_cl -{ - -std::string ToString(CalculationsPrecision precision) -{ - switch (precision) - { - case CalculationsPrecision::F32_F16: - return "CalculationsPrecision::F32_F16"; - case CalculationsPrecision::F32: - return "CalculationsPrecision::F32"; - case CalculationsPrecision::F16: - return "CalculationsPrecision::F16"; - } - return " "; -} - -DataType DeduceDataTypeFromPrecision(CalculationsPrecision precision) -{ - if (precision == CalculationsPrecision::F32) - { - return DataType::FLOAT32; - } - else - { - return DataType::FLOAT16; - } - return DataType::UNKNOWN; -} - -} // namespace gpu_cl -} // namespace backend -} // namespace onert diff --git a/runtime/onert/backend/gpu_cl/open_cl/Precision.h b/runtime/onert/backend/gpu_cl/open_cl/Precision.h deleted file mode 100644 index cb910c783..000000000 --- a/runtime/onert/backend/gpu_cl/open_cl/Precision.h +++ /dev/null @@ -1,53 +0,0 @@ -/* - * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright 2019 The TensorFlow Authors. All Rights Reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef __ONERT_BACKEND_GPU_CL_OPENCL_PRECISION_H__ -#define __ONERT_BACKEND_GPU_CL_OPENCL_PRECISION_H__ - -#include <string> - -#include "DataType.h" - -namespace onert -{ -namespace backend -{ -namespace gpu_cl -{ - -enum class CalculationsPrecision -{ - F32, - F32_F16, - F16 -}; -// F32 - all data and all math ops in F32 -// F16 - all data and all math ops in F16 -// F32_F16 - as F16, but some operations (Convolution, -// DepthwiseConvolution, FullyConnected, ConvolutionTransposed) -// have an accumulator in F32; usually it calculates 4 mads in F16, sums them, -// then converts this partial sum to F32 and adds it to the accumulator. - -DataType DeduceDataTypeFromPrecision(CalculationsPrecision precision); - -std::string ToString(CalculationsPrecision precision); - -} // namespace gpu_cl -} // namespace backend -} // namespace onert - -#endif // __ONERT_BACKEND_GPU_CL_OPENCL_PRECISION_H__ diff --git a/runtime/onert/backend/gpu_cl/open_cl/ProgramCache.cc b/runtime/onert/backend/gpu_cl/open_cl/ProgramCache.cc deleted file mode 100644 index 350d7a1c5..000000000 --- a/runtime/onert/backend/gpu_cl/open_cl/ProgramCache.cc +++ /dev/null @@ -1,97 +0,0 @@ -/* - * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright 2019 The TensorFlow Authors. All Rights Reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
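Given DeduceDataTypeFromPrecision above, only full F32 keeps float storage; mixed F32_F16 still stores tensors in half precision and differs only in how the listed ops accumulate. A quick check of that mapping (using <cassert>):

assert(DeduceDataTypeFromPrecision(CalculationsPrecision::F32) == DataType::FLOAT32);
assert(DeduceDataTypeFromPrecision(CalculationsPrecision::F32_F16) == DataType::FLOAT16);
assert(DeduceDataTypeFromPrecision(CalculationsPrecision::F16) == DataType::FLOAT16);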
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "ProgramCache.h" - -#include <cstdint> -#include <string> - -#include "ClProgram.h" -#include "Status.h" -#include "Util.h" -#include "farmhash.h" - -namespace onert -{ -namespace backend -{ -namespace gpu_cl -{ - -ProgramCache::ProgramDescriptor::ProgramDescriptor(const std::string &code_text, - const std::string &options, - bool use_fingerprints) - : code(code_text), compiler_options(options), use_fingerprint(use_fingerprints) -{ - const uint64_t code_fingerprint = ::util::Fingerprint64(code); - const uint64_t options_fingerprint = ::util::Fingerprint64(compiler_options); - fingerprint = code_fingerprint + options_fingerprint; -} - -ProgramCache::ProgramDescriptor::ProgramDescriptor(uint64_t fingerprints) - : fingerprint(fingerprints), use_fingerprint(true) -{ -} - -ProgramCache::ProgramCache(ProgramCache &&program_cache) - : use_fingerprints_(program_cache.use_fingerprints_), - programs_(std::move(program_cache.programs_)) -{ -} - -ProgramCache &ProgramCache::operator=(ProgramCache &&program_cache) -{ - if (this != &program_cache) - { - use_fingerprints_ = program_cache.use_fingerprints_; - programs_ = std::move(program_cache.programs_); - } - return *this; -} - -absl::Status ProgramCache::GetOrCreateCLKernel(const std::string &code, - const std::string &function_name, - const std::vector<CompilerOptions> &compiler_options, - const CLContext &context, const CLDevice &device, - CLKernel *result) -{ - const std::string options = CompilerOptionsToString(device, compiler_options); - ProgramDescriptor desc{code, options, use_fingerprints_}; - auto it = programs_.find(desc); - if (it != programs_.end()) - { - return result->CreateFromProgram(it->second, function_name); - } - - CLProgram program; - RETURN_IF_ERROR(CreateCLProgram(code, options, context, device, &program)); - RETURN_IF_ERROR(result->CreateFromProgram(program, function_name)); - programs_.insert(std::make_pair(std::move(desc), std::move(program))); - return absl::OkStatus(); -} - -absl::Status ProgramCache::GetOrCreateCLKernel(const std::string &code, - const std::string &function_name, - const CLContext &context, const CLDevice &device, - CLKernel *result) -{ - return GetOrCreateCLKernel(code, function_name, {}, context, device, result); -} - -} // namespace gpu_cl -} // namespace backend -} // namespace onert diff --git a/runtime/onert/backend/gpu_cl/open_cl/ProgramCache.h b/runtime/onert/backend/gpu_cl/open_cl/ProgramCache.h deleted file mode 100644 index 3f5ee0215..000000000 --- a/runtime/onert/backend/gpu_cl/open_cl/ProgramCache.h +++ /dev/null @@ -1,112 +0,0 @@ -/* - * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright 2019 The TensorFlow Authors. All Rights Reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifndef __ONERT_BACKEND_GPU_CL_OPENCL_PROGRAM_CACHE_H__ -#define __ONERT_BACKEND_GPU_CL_OPENCL_PROGRAM_CACHE_H__ - -#include <cstdint> -#include <string> -#include <vector> - -#include "absl/container/flat_hash_map.h" -#include "absl/types/span.h" -#include "ClContext.h" -#include "ClDevice.h" -#include "ClKernel.h" -#include "ClProgram.h" -#include "Status.h" - -namespace onert -{ -namespace backend -{ -namespace gpu_cl -{ - -class ProgramCache -{ -public: - ProgramCache() = default; - - ProgramCache(ProgramCache &&program_cache); - ProgramCache &operator=(ProgramCache &&program_cache); - ProgramCache(const ProgramCache &) = delete; - ProgramCache &operator=(const ProgramCache &) = delete; - - absl::Status GetOrCreateCLKernel(const std::string &code, const std::string &function_name, - const std::vector<CompilerOptions> &compiler_options, - const CLContext &context, const CLDevice &device, - CLKernel *result); - - absl::Status GetOrCreateCLKernel(const std::string &code, const std::string &function_name, - const CLContext &context, const CLDevice &device, - CLKernel *result); - -private: - struct ProgramDescriptor - { - ProgramDescriptor() = default; - ProgramDescriptor(const std::string &code_text, const std::string &options, - bool use_fingerprint); - explicit ProgramDescriptor(uint64_t fingerprint); - - std::string code; - std::string compiler_options; - uint64_t fingerprint; - bool use_fingerprint; - }; - struct ProgramDescriptorHasher - { - std::size_t operator()(const ProgramDescriptor &k) const - { - if (k.use_fingerprint) - { - return std::hash<uint64_t>()(k.fingerprint); - } - else - { - return std::hash<std::string>()(k.code) + std::hash<std::string>()(k.compiler_options); - } - } - }; - struct ProgramDescriptorEqual - { - bool operator()(const ProgramDescriptor &a, const ProgramDescriptor &b) const - { - if (a.use_fingerprint && b.use_fingerprint) - { - return a.fingerprint == b.fingerprint; - } - else - { - return a.compiler_options == b.compiler_options && a.code == b.code; - } - } - }; - - // There is a low probability of a hash collision when cache is deserialized - // because only fingerprints are serialized instead of full source code. - bool use_fingerprints_ = false; - absl::flat_hash_map<ProgramDescriptor, CLProgram, ProgramDescriptorHasher, ProgramDescriptorEqual> - programs_; -}; - -} // namespace gpu_cl -} // namespace backend -} // namespace onert - -#endif // __ONERT_BACKEND_GPU_CL_OPENCL_PROGRAM_CACHE_H__ diff --git a/runtime/onert/backend/gpu_cl/open_cl/Shape.cc b/runtime/onert/backend/gpu_cl/open_cl/Shape.cc deleted file mode 100644 index 5a2374516..000000000 --- a/runtime/onert/backend/gpu_cl/open_cl/Shape.cc +++ /dev/null @@ -1,141 +0,0 @@ -/* - * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright 2019 The TensorFlow Authors. All Rights Reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
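A minimal usage sketch for the cache declared above; the kernel source string, entry-point name, and the context/device objects are illustrative placeholders:

ProgramCache cache;
CLKernel kernel;
// The first call compiles and inserts the program; a later call with identical
// source and compiler options hits the descriptor map and only rebuilds the
// CLKernel from the cached CLProgram.
absl::Status status = cache.GetOrCreateCLKernel(kernel_source, "main_function", context, device, &kernel);
if (!status.ok())
{
  // Compilation failed; the status message carries the build error.
}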
- */
-
-#include "Shape.h"
-
-#include <stdint.h>
-
-#include <string>
-#include <vector>
-
-#include "absl/strings/str_cat.h"
-#include "absl/strings/str_join.h"
-
-namespace onert
-{
-namespace backend
-{
-namespace gpu_cl
-{
-namespace
-{
-
-struct GetAxisByIndexFunc
-{
- template <Layout T> Axis operator()() const { return GetAxis<T>(index); }
- int32_t index;
-};
-
-struct GetIndexByAxisFunc
-{
- template <Layout T> int operator()() const { return GetAxisIndex<T>(axis); }
- Axis axis;
-};
-
-struct NumAxisFunc
-{
- template <Layout T> int operator()() const { return Size<T>(); }
-};
-
-} // namespace
-
-std::string ToString(Axis axis)
-{
- switch (axis)
- {
- case Axis::BATCH:
- return "batch";
- case Axis::CHANNELS:
- return "channels";
- case Axis::INPUT_CHANNELS:
- return "input_channels";
- case Axis::OUTPUT_CHANNELS:
- return "output_channels";
- case Axis::HEIGHT:
- return "height";
- case Axis::WIDTH:
- return "width";
- case Axis::VALUE:
- return "value";
- case Axis::DEPTH:
- return "depth";
- case Axis::UNKNOWN:
- return "unknown";
- }
- return "undefined";
-}
-
-std::string ToString(Layout layout)
-{
- switch (layout)
- {
- case Layout::SCALAR:
- return "scalar";
- case Layout::LINEAR:
- return "linear";
- case Layout::HW:
- return "hw";
- case Layout::HWD:
- return "hwd";
- case Layout::CHW:
- return "chw";
- case Layout::HWC:
- return "hwc";
- case Layout::HWDC:
- return "hwdc";
- case Layout::OHWI:
- return "ohwi";
- case Layout::IHWO:
- return "ihwo";
- case Layout::OIHW:
- return "oihw";
- case Layout::IOHW:
- return "iohw";
- case Layout::BHWC:
- return "bhwc";
- case Layout::BHWDC:
- return "bhwdc";
- case Layout::OHWDI:
- return "ohwdi";
- case Layout::UNKNOWN:
- return "unknown";
- }
- return "undefined";
-}
-
-Axis GetAxis(Layout layout, int32_t index)
-{
- return DispatchByLayout(layout, GetAxisByIndexFunc{index});
-}
-
-int GetAxisIndex(Layout layout, Axis axis)
-{
- return DispatchByLayout(layout, GetIndexByAxisFunc{axis});
-}
-
-bool HasAxis(Layout layout, Axis axis) { return GetAxisIndex(layout, axis) >= 0; }
-
-int Size(Layout layout) { return DispatchByLayout(layout, NumAxisFunc()); }
-
-std::string ToString(const Shape &s)
-{
- return absl::StrCat("{", ToString(s.layout), ", {", absl::StrJoin(s.dimensions, ", "), "}}");
-}
-
-} // namespace gpu_cl
-} // namespace backend
-} // namespace onert
diff --git a/runtime/onert/backend/gpu_cl/open_cl/Shape.h b/runtime/onert/backend/gpu_cl/open_cl/Shape.h
deleted file mode 100644
index 3767e106f..000000000
--- a/runtime/onert/backend/gpu_cl/open_cl/Shape.h
+++ /dev/null
@@ -1,668 +0,0 @@
-/*
- * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright 2019 The TensorFlow Authors. All Rights Reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */ - -#ifndef __ONERT_BACKEND_GPU_CL_OPENCL_SHAPE_H__ -#define __ONERT_BACKEND_GPU_CL_OPENCL_SHAPE_H__ - -#include <stddef.h> -#include <stdint.h> - -#include <array> -#include <functional> -#include <numeric> -#include <string> -#include <utility> -#include <vector> - -namespace onert -{ -namespace backend -{ -namespace gpu_cl -{ - -enum class Axis -{ - UNKNOWN = 0, - CHANNELS = 1, - INPUT_CHANNELS = 2, - OUTPUT_CHANNELS = 3, - HEIGHT = 4, - WIDTH = 5, - BATCH = 6, - VALUE = 7, - DEPTH = 8, -}; - -std::string ToString(Axis t); - -// Layout represents axis order. -enum class Layout -{ - UNKNOWN = 0, - SCALAR = 1, - LINEAR = 2, - HW = 3, - CHW = 4, - HWC = 5, - OIHW = 6, - OHWI = 7, - IHWO = 8, - IOHW = 9, - BHWC = 10, - HWDC = 11, - BHWDC = 12, - HWD = 13, - OHWDI = 14, -}; - -std::string ToString(Layout l); - -// Returns number of axis for the fixed layout. -template <Layout T> constexpr int Size(); - -// Returns number of axis for the given layout. -int Size(Layout layout); - -// Returns Axis for the given index and fixed layout. -template <Layout T> constexpr Axis GetAxis(int index); - -// Returns axis for the given layout and index. -Axis GetAxis(Layout layout, int32_t index); - -// Returns axis index for the given axis and fixed layout. -template <Layout T> constexpr int GetAxisIndex(Axis axis); - -// Returns axis index for the given layout and axis. -int GetAxisIndex(Layout layout, Axis axis); - -// Checks if fixed layout has given axis -template <Layout T> constexpr bool HasAxis(Axis axis); - -// Checks if given layout has given axis -bool HasAxis(Layout layout, Axis axis); - -// Stores Layout(axis set and order) and value for dimensions. -struct Shape -{ - Shape() : layout(Layout::UNKNOWN), dimensions() {} - - explicit Shape(Layout t) : layout(t), dimensions(Size(t)) {} - - Shape(Layout t, std::vector<int32_t> d) : layout(t), dimensions(std::move(d)) {} - - bool operator==(const Shape &other) const - { - return (layout == other.layout) && (dimensions == other.dimensions); - } - - bool operator!=(const Shape &other) const { return !operator==(other); } - - // All methods below are matching same methods defined in StrongShape to - // make sure generic algorithms work both ways. - - // Returns back a dimension or -1 if it is not found. - template <Axis D> int32_t get() const; - int32_t get(Axis axis) const; - - template <Axis D> bool set(int32_t t); - bool set(Axis axis, int32_t t); - - Axis axis(int index) const { return GetAxis(layout, index); } - - int index(Axis axis) const { return GetAxisIndex(layout, axis); } - - bool has(Axis axis) const { return HasAxis(layout, axis); } - - int64_t DimensionsProduct() const - { - return std::accumulate(dimensions.begin(), dimensions.end(), 1ll, std::multiplies<int64_t>()); - } - - Layout layout = Layout::UNKNOWN; - - std::vector<int32_t> dimensions; -}; - -std::string ToString(const Shape &s); - -// StrongShape provides convenient explicit access to dimensions stored in -// shape, e.g. StrongShape<Layout::HW> s; provides s.h and s.w accessors. -// -// There is a conversion possible both ways between Shape and StrongShape. -// -// OIHW oihw; // specific shape -// Shape l = oihw.ToShape(); -// -// OHWI other; // notice not the same but compatible shape. -// if (!other.Adopt(l)) { -// // error handling -// } -// -// StrongShape supports the following set of operations: -// -// // Returns number of axis in the shape class. 
-// static constexpr int size(); -// -// // Returns Axis for the given index or Axis::UNKNOWN if index -// // falls outside of the defined range in this shape. -// static constexpr Axis axis(int index); -// -// // Returns index for the given axis or -1 if axis is not defined in this -// // shape. -// static constexpr int index(Axis axis); -// -// // Getters -// int32_t get(int index) const; -// int32_t get(Axis axis) const; -// int32_t get<Axis>() const; -// -// // Setters that return false if set was not successful. -// bool set(int index, int32_t v); -// bool set(Axis axis, int32_t v); -// bool set<Axis>(int32_t v); -// -// // Returns shape's layout. -// static const Layout layout; -// -// // Turns specific shape into generic shape. -// Shape ToShape() const; -// -// // Copies all dimensions from the given shape. -// bool Adopt(const Shape&); -// -template <Layout L> struct StrongShape; - -using Scalar = StrongShape<Layout::SCALAR>; -using Linear = StrongShape<Layout::LINEAR>; -using HW = StrongShape<Layout::HW>; -using HWD = StrongShape<Layout::HWD>; - -// Common tensor shape for CNN models working with images. -using CHW = StrongShape<Layout::CHW>; -using HWC = StrongShape<Layout::HWC>; -using HWDC = StrongShape<Layout::HWDC>; -using BHWC = StrongShape<Layout::BHWC>; -using BHWDC = StrongShape<Layout::BHWDC>; - -// Tensor shape used in convolution_2d weights. -using OIHW = StrongShape<Layout::OIHW>; -using OHWI = StrongShape<Layout::OHWI>; -using IHWO = StrongShape<Layout::IHWO>; -using IOHW = StrongShape<Layout::IOHW>; - -// Tensor shape used in convolution_3d weights. -using OHWDI = StrongShape<Layout::OHWDI>; - -// ----------------------------------------------------------------------------- -// Everything below are internal implementation details. -// ----------------------------------------------------------------------------- - -namespace internal_shape -{ - -template <Axis T> struct AxisTraits; - -#define TFLITE_GPU_AXIS_TRAITS(AxisName, HolderName) \ - template <> struct AxisTraits<Axis::AxisName> \ - { \ - struct Holder \ - { \ - int32_t HolderName; \ - \ - protected: \ - int32_t operator()() const { return HolderName; } \ - void operator()(int32_t v) { HolderName = v; } \ - }; \ - \ - using dimension_holder_type = Holder; \ - } - -TFLITE_GPU_AXIS_TRAITS(CHANNELS, c); -TFLITE_GPU_AXIS_TRAITS(HEIGHT, h); -TFLITE_GPU_AXIS_TRAITS(WIDTH, w); -TFLITE_GPU_AXIS_TRAITS(INPUT_CHANNELS, i); -TFLITE_GPU_AXIS_TRAITS(OUTPUT_CHANNELS, o); -TFLITE_GPU_AXIS_TRAITS(BATCH, b); -TFLITE_GPU_AXIS_TRAITS(VALUE, v); -TFLITE_GPU_AXIS_TRAITS(DEPTH, d); - -#undef TFLITE_GPU_AXIS_TRAITS - -template <int N, Axis... As> struct StrongShapeImpl; - -template <int N> struct StrongShapeImpl<N> -{ - static constexpr int size() { return N; } - - static constexpr Axis axis(int) { return Axis::UNKNOWN; } - - static constexpr int index(Axis) { return -1; } - - static constexpr bool has(Axis) { return false; } - - int32_t get(Axis) const { return -1; } - - int32_t get(int) const { return -1; } - - template <Axis B> int32_t get() const { return -1; } - - bool set(Axis, int32_t) { return false; } - - bool set(int, int32_t) { return false; } - - template <Axis B> bool set(int32_t) { return false; } -}; - -// Used to deduce number of axis, and to be a child of a proper holder to -// provide access to the dimension by name -template <int N, Axis A, Axis... 
As> -struct StrongShapeImpl<N, A, As...> : public AxisTraits<A>::dimension_holder_type, - public StrongShapeImpl<N + 1, As...> -{ - using dimension_holder_type = typename AxisTraits<A>::dimension_holder_type; - - using rest_type = StrongShapeImpl<N + 1, As...>; - - StrongShapeImpl() : dimension_holder_type{0}, rest_type() {} - - template <typename... Ts> - explicit StrongShapeImpl(int32_t t, Ts... ts) : dimension_holder_type{t}, rest_type(ts...) - { - } - - static constexpr Axis axis(int index) { return index == N ? A : rest_type::axis(index); } - - static constexpr int index(Axis axis) { return axis == A ? N : rest_type::index(axis); } - - static constexpr bool has(Axis axis) { return axis == A ? true : rest_type::has(axis); } - - int32_t get(Axis axis) const - { - return axis == A ? dimension_holder_type::operator()() : rest_type::get(axis); - } - - template <Axis B> int32_t get() const - { - return B == A ? dimension_holder_type::operator()() : rest_type::template get<B>(); - } - - int32_t get(int index) const - { - return index == N ? dimension_holder_type::operator()() : rest_type::get(index); - } - - bool set(Axis axis, int32_t t) - { - if (axis == A) - { - dimension_holder_type::operator()(t); - return true; - } - return rest_type::set(axis, t); - } - - bool set(int index, int32_t t) - { - if (index == N) - { - dimension_holder_type::operator()(t); - return true; - } - return rest_type::set(index, t); - } - - template <Axis B> bool set(int32_t t) - { - if (A == B) - { - dimension_holder_type::operator()(t); - return true; - } - return rest_type::template set<B>(t); - } -}; - -template <Layout T> struct LayoutTraits; - -#define TFLITE_GPU_LAYOUT_TRAITS(LayoutName, ...) \ - template <> struct LayoutTraits<Layout::LayoutName> \ - { \ - using strong_shape_type = StrongShapeImpl<0, __VA_ARGS__>; \ - } - -TFLITE_GPU_LAYOUT_TRAITS(HW, Axis::HEIGHT, Axis::WIDTH); -TFLITE_GPU_LAYOUT_TRAITS(HWD, Axis::HEIGHT, Axis::WIDTH, Axis::DEPTH); -TFLITE_GPU_LAYOUT_TRAITS(OHWI, Axis::OUTPUT_CHANNELS, Axis::HEIGHT, Axis::WIDTH, - Axis::INPUT_CHANNELS); -TFLITE_GPU_LAYOUT_TRAITS(OIHW, Axis::OUTPUT_CHANNELS, Axis::INPUT_CHANNELS, Axis::HEIGHT, - Axis::WIDTH); -TFLITE_GPU_LAYOUT_TRAITS(IOHW, Axis::INPUT_CHANNELS, Axis::OUTPUT_CHANNELS, Axis::HEIGHT, - Axis::WIDTH); -TFLITE_GPU_LAYOUT_TRAITS(IHWO, Axis::INPUT_CHANNELS, Axis::HEIGHT, Axis::WIDTH, - Axis::OUTPUT_CHANNELS); -TFLITE_GPU_LAYOUT_TRAITS(CHW, Axis::CHANNELS, Axis::HEIGHT, Axis::WIDTH); -TFLITE_GPU_LAYOUT_TRAITS(HWC, Axis::HEIGHT, Axis::WIDTH, Axis::CHANNELS); -TFLITE_GPU_LAYOUT_TRAITS(HWDC, Axis::HEIGHT, Axis::WIDTH, Axis::DEPTH, Axis::CHANNELS); -TFLITE_GPU_LAYOUT_TRAITS(LINEAR, Axis::VALUE); -TFLITE_GPU_LAYOUT_TRAITS(SCALAR, Axis::VALUE); -TFLITE_GPU_LAYOUT_TRAITS(BHWC, Axis::BATCH, Axis::HEIGHT, Axis::WIDTH, Axis::CHANNELS); -TFLITE_GPU_LAYOUT_TRAITS(BHWDC, Axis::BATCH, Axis::HEIGHT, Axis::WIDTH, Axis::DEPTH, - Axis::CHANNELS); -TFLITE_GPU_LAYOUT_TRAITS(OHWDI, Axis::OUTPUT_CHANNELS, Axis::HEIGHT, Axis::WIDTH, Axis::DEPTH, - Axis::INPUT_CHANNELS); - -#undef TFLITE_GPU_LAYOUT_TRAITS - -template <> struct LayoutTraits<Layout::UNKNOWN> -{ - using strong_shape_type = StrongShapeImpl<0>; -}; - -template <Axis A> struct DimensionGetterFixedAxisFunc -{ - template <Layout T> int32_t operator()() const - { - constexpr int i = GetAxisIndex<T>(A); - return i >= 0 && i < l->dimensions.size() ? 
l->dimensions[i] : -1; - } - const Shape *l; -}; - -struct DimensionGetterFunc -{ - template <Layout T> int32_t operator()() const - { - uint32_t i = GetAxisIndex<T>(axis); - return i < l->dimensions.size() ? l->dimensions[i] : -1; - } - Axis axis; - const Shape *l; -}; - -template <Axis A> struct DimensionSetterFixedAxisFunc -{ - template <Layout T> bool operator()() const - { - constexpr uint32_t i = GetAxisIndex<T>(A); - if (i < l->dimensions.size()) - { - l->dimensions[i] = v; - return true; - } - return false; - } - Shape *l; - int32_t v; -}; - -struct DimensionSetterFunc -{ - template <Layout T> bool operator()() const - { - uint32_t i = GetAxisIndex<T>(axis); - if (i < l->dimensions.size()) - { - l->dimensions[i] = v; - return true; - } - return false; - } - Axis axis; - Shape *l; - int32_t v; -}; - -template <Layout L> struct ToShapeFunc -{ - template <Layout T> bool operator()() const - { - for (int i = 0; i < StrongShape<L>::size(); ++i) - { - int index = GetAxisIndex<T>(StrongShape<L>::axis(i)); - if (index < 0) - return false; - shape->set(i, l.dimensions[index]); - } - return true; - } - - StrongShape<L> *shape; - const Shape &l; -}; - -} // namespace internal_shape - -// template <Axis... As> -template <Layout L> struct StrongShape : public internal_shape::LayoutTraits<L>::strong_shape_type -{ - using strong_shape_type = typename internal_shape::LayoutTraits<L>::strong_shape_type; - StrongShape() = default; - - template <typename... Ts> explicit StrongShape(Ts... t) : strong_shape_type(t...) {} - - constexpr static Layout layout = L; - - bool operator==(const StrongShape<L> &shape) const - { - // TODO(akulik): implement better alternative. - return this->ToShape() == shape.ToShape(); - } - - bool operator!=(const StrongShape<L> &shape) const - { - // TODO(akulik): implement better alternative. - return this->ToShape() != shape.ToShape(); - } - bool empty() const { return DimensionsProduct() == 0; } - - // Turns StrongShape into generic shape. - Shape ToShape() const - { - std::vector<int32_t> dimensions(StrongShape::size()); - for (int i = 0; i < StrongShape::size(); ++i) - { - dimensions[i] = StrongShape::get(i); - } - return Shape(L, std::move(dimensions)); - } - - // @return all dimensions multiplied - int64_t DimensionsProduct() const - { - int64_t product = 1; - for (int i = 0; i < StrongShape::size(); ++i) - { - product *= StrongShape::get(i); - } - return product; - } - - // Translates given coordinates of the layout into a linear index assuming - // dimensions are sorted in tensor access order e.g. if you access - // foobar[i][j][k] order of coordinates should be i,j,k. - int64_t LinearIndex(const std::array<int32_t, StrongShape::size()> &coordinates) const - { - int64_t index = coordinates[0]; - for (int i = 1; i < StrongShape::size(); ++i) - { - index = index * StrongShape::get(i) + coordinates[i]; - } - return index; - } - - // Copies all dimensions from the given generic shape into specific shape. - // It requires shape to have all axis defined in the given - // StrongShape. For example: - // - If this shape is OHWI but given shape is OIHW, Adopt will copy all - // dimensions and return true. - // - If this shape is OIHW but input shape is HW, Adopt will copy H and W - // dimensions and return true, but if this shape is HW and given shape - // OIHW, then Adopt will return false because not all axis are present in - // the input shape. - // - // @return false if generic shape is not compatible. 
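-//
-// A short illustrative sketch (the layout and dimension values here are
-// invented for the example, not taken from this file):
-//
-//   Shape generic(Layout::OHWI, {8, 3, 3, 4});
-//   OHWI weights; // strongly-typed view with .o/.h/.w/.i accessors
-//   if (weights.Adopt(generic))
-//   {
-//     // weights.o == 8, weights.h == 3, weights.w == 3, weights.i == 4
-//   }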
- bool Adopt(const Shape &shape) - { - return DispatchByLayout(shape.layout, internal_shape::ToShapeFunc<L>{this, shape}); - } - - // For all axis defined in a given shape copies values to this shape. - // Therefore, it is possible to copy dimensions from CHW to BCHW, but not - // the other way around. - // - // BCHW bchw; - // CHW chw; - // bchw.CopyAllGivenAxis(chw); --> true - // chw.CopyAllGivenAxis(bchw); --> false - // - // @return false if axis in source shape is not defined here, thus value - // was not copied. - template <Layout B> bool CopyAllGivenAxis(const StrongShape<B> &source) - { - for (int i = 0; i < source.size(); ++i) - { - if (!StrongShape::set(source.axis(i), source.get(i))) - { - return false; - } - } - return true; - } - - // For all axis defined in this shape copies values from the given shape. - // - // BCHW bchw; - // CHW chw; - // bchw.CopyAllDefinedAxis(chw); --> false - // chw.CopyAllDefinedAxis(bchw); --> true - // - // @return false if given shape does not have axis defined here, - // therefore a value was not copied. - template <Layout B> bool CopyAllDefinedAxis(const StrongShape<B> &source) - { - for (int i = 0; i < StrongShape::size(); ++i) - { - int source_index = source.index(StrongShape::axis(i)); - if (source_index < 0) - { - return false; - } - StrongShape::set(i, source.get(source_index)); // always true - } - return true; - } - - // Copies values only for matching axis. - template <Layout B> void CopyMatchingAxis(const StrongShape<B> &source) - { - for (int i = 0; i < StrongShape::size(); ++i) - { - StrongShape::set(source.axis(i), source.get(i)); - } - } - - // AbslHash function for using in flat hash containers. - template <typename H> friend H AbslHashValue(H hash_state, const StrongShape &strong_shape) - { - for (size_t i = 0; i < strong_shape.size(); ++i) - { - hash_state = H::combine(std::move(hash_state), strong_shape.get(i)); - } - return hash_state; - } -}; - -template <Layout T> inline std::string ToString(const StrongShape<T> &s) -{ - return ToString(s.ToShape()); -} - -template <Layout L> constexpr Layout StrongShape<L>::layout; - -template <class F> -auto DispatchByLayout(Layout type, F f) -> decltype(f.template operator()<Layout::UNKNOWN>()) -{ - switch (type) - { - case Layout::HW: - return f.template operator()<Layout::HW>(); - case Layout::HWD: - return f.template operator()<Layout::HWD>(); - case Layout::HWC: - return f.template operator()<Layout::HWC>(); - case Layout::HWDC: - return f.template operator()<Layout::HWDC>(); - case Layout::CHW: - return f.template operator()<Layout::CHW>(); - case Layout::OIHW: - return f.template operator()<Layout::OIHW>(); - case Layout::IOHW: - return f.template operator()<Layout::IOHW>(); - case Layout::OHWI: - return f.template operator()<Layout::OHWI>(); - case Layout::IHWO: - return f.template operator()<Layout::IHWO>(); - case Layout::LINEAR: - return f.template operator()<Layout::LINEAR>(); - case Layout::SCALAR: - return f.template operator()<Layout::SCALAR>(); - case Layout::BHWC: - return f.template operator()<Layout::BHWC>(); - case Layout::BHWDC: - return f.template operator()<Layout::BHWDC>(); - case Layout::OHWDI: - return f.template operator()<Layout::OHWDI>(); - case Layout::UNKNOWN: - return f.template operator()<Layout::UNKNOWN>(); - } - return f.template operator()<Layout::UNKNOWN>(); -} - -template <Layout T> constexpr int Size() { return StrongShape<T>::size(); } - -template <Layout T> constexpr Axis GetAxis(int index) { return StrongShape<T>::axis(index); } - -template <Layout 
T> constexpr int GetAxisIndex(Axis axis) { return StrongShape<T>::index(axis); } - -template <Layout T> constexpr bool HasAxis(Axis axis) { return StrongShape<T>::has(axis); } - -template <Axis D> inline int32_t Shape::get() const -{ - return DispatchByLayout(layout, internal_shape::DimensionGetterFixedAxisFunc<D>{this}); -} - -inline int32_t Shape::get(Axis axis) const -{ - return DispatchByLayout(layout, internal_shape::DimensionGetterFunc{axis, this}); -} - -template <Axis D> inline bool Shape::set(int32_t t) -{ - return DispatchByLayout(layout, internal_shape::DimensionSetterFixedAxisFunc<D>{this, t}); -} - -inline bool Shape::set(Axis axis, int32_t t) -{ - return DispatchByLayout(layout, internal_shape::DimensionSetterFunc{axis, this, t}); -} - -} // namespace gpu_cl -} // namespace backend -} // namespace onert - -#endif // __ONERT_BACKEND_GPU_CL_OPENCL_SHAPE_H__ diff --git a/runtime/onert/backend/gpu_cl/open_cl/Spi.h b/runtime/onert/backend/gpu_cl/open_cl/Spi.h deleted file mode 100644 index c1d65b67e..000000000 --- a/runtime/onert/backend/gpu_cl/open_cl/Spi.h +++ /dev/null @@ -1,94 +0,0 @@ -/* - * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright 2019 The TensorFlow Authors. All Rights Reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef __ONERT_BACKEND_GPU_CL_OPEN_CL_SPI_H__ -#define __ONERT_BACKEND_GPU_CL_OPEN_CL_SPI_H__ - -#include <cstdint> - -#include "Api.h" -#include "AccessType.h" -#include "Status.h" - -// Contains only service provider-related interfaces. Users should not use them -// directly. - -namespace onert -{ -namespace backend -{ -namespace gpu_cl -{ - -// Converts a tensor object into another one. -class TensorObjectConverter -{ -public: - virtual ~TensorObjectConverter() = default; - - virtual absl::Status Convert(const TensorObject &input, const TensorObject &output) = 0; -}; - -class TensorObjectConverterBuilder -{ -public: - virtual ~TensorObjectConverterBuilder() = default; - - virtual bool IsSupported(const TensorObjectDef &input, const TensorObjectDef &output) const = 0; - - virtual absl::Status MakeConverter(const TensorObjectDef &input, const TensorObjectDef &output, - std::unique_ptr<TensorObjectConverter> *converter) = 0; -}; - -// Connects tensor definition provided by a user (external) with tensor -// definition used by the inference engine (internal). -struct TensorTieDef -{ - uint32_t id; - AccessType access_type; - TensorObjectDef internal_def; - TensorObjectDef external_def; -}; - -// Connects external tensor object to internal tensor object and provides -// functionality to copy data to/from external object to internal. 
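-//
-// A plausible call sequence for a concrete implementation (a hedged sketch;
-// this header defines only the interface below):
-//
-//   RETURN_IF_ERROR(tie->SetExternalObject(user_object));
-//   RETURN_IF_ERROR(tie->CopyFromExternalObject()); // push input before a run
-//   // ... run inference ...
-//   RETURN_IF_ERROR(tie->CopyToExternalObject());   // pull output after a run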
-class TensorTie -{ -public: - explicit TensorTie(const TensorTieDef &def) : def_(def) {} - - virtual ~TensorTie() = default; - - virtual absl::Status SetExternalObject(TensorObject obj) = 0; - - virtual TensorObject GetExternalObject() = 0; - - virtual absl::Status CopyToExternalObject() = 0; - - virtual absl::Status CopyFromExternalObject() = 0; - - const TensorTieDef &def() const { return def_; } - -private: - const TensorTieDef def_; -}; - -} // namespace gpu_cl -} // namespace backend -} // namespace onert - -#endif // __ONERT_BACKEND_GPU_CL_OPEN_CL_SPI_H__ diff --git a/runtime/onert/backend/gpu_cl/open_cl/Status.h b/runtime/onert/backend/gpu_cl/open_cl/Status.h deleted file mode 100644 index 6295a7e77..000000000 --- a/runtime/onert/backend/gpu_cl/open_cl/Status.h +++ /dev/null @@ -1,29 +0,0 @@ -/* - * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright 2020 The TensorFlow Authors. All Rights Reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef __ONERT_BACKEND_GPU_CL_OPENCL_STATUS_H__ -#define __ONERT_BACKEND_GPU_CL_OPENCL_STATUS_H__ - -#include "absl/status/status.h" // IWYU pragma: export -#define RETURN_IF_ERROR(s) \ - { \ - auto c = (s); \ - if (!c.ok()) \ - return c; \ - } // IWYU pragma: export - -#endif // __ONERT_BACKEND_GPU_CL_OPENCL_STATUS_H__ diff --git a/runtime/onert/backend/gpu_cl/open_cl/StorageTypeUtil.cc b/runtime/onert/backend/gpu_cl/open_cl/StorageTypeUtil.cc deleted file mode 100644 index eada697ac..000000000 --- a/runtime/onert/backend/gpu_cl/open_cl/StorageTypeUtil.cc +++ /dev/null @@ -1,149 +0,0 @@ -/* - * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright 2020 The TensorFlow Authors. All Rights Reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "StorageTypeUtil.h" - -#include "TensorType.h" -#include "DataType.h" -#include "Shape.h" -#include "Util.h" - -namespace onert -{ -namespace backend -{ -namespace gpu_cl -{ - -bool CanCreateTensorWithShape(const DeviceInfo &device_info, const BHWDC &shape, - const TensorDescriptor &descriptor) -{ - const int slices = DivideRoundUp(shape.c, 4); - switch (descriptor.storage_type) - { - case TensorStorageType::BUFFER: - { - const uint64_t flt4_size = 4 * (descriptor.data_type == DataType::FLOAT32 ? 
4 : 2);
- const uint64_t buffer_size = shape.b * shape.w * shape.h * shape.d * slices * flt4_size;
- return buffer_size <= device_info.buffer_max_size;
- }
- case TensorStorageType::IMAGE_BUFFER:
- return (uint64_t)shape.b * shape.w * shape.h * shape.d * slices <=
- device_info.image_buffer_max_size;
- case TensorStorageType::TEXTURE_3D:
- if (device_info.cl_version < OpenCLVersion::CL_1_2 && slices == 1)
- {
- // clCreateImage3D (used in CL 1.0/1.1) cannot create an image with
- // depth = 1 per the specification.
- return false;
- }
- return (uint64_t)shape.w * shape.b <= device_info.image3d_max_width &&
- (uint64_t)shape.h <= device_info.image3d_max_height &&
- (uint64_t)slices * shape.d <= device_info.image3d_max_depth;
- case TensorStorageType::TEXTURE_ARRAY:
- // Bug on some Adreno. b/131099086
- if (slices == 1 && !device_info.SupportsOneLayerTextureArray())
- {
- return false;
- }
- return (uint64_t)shape.w * shape.b <= device_info.image2d_max_width &&
- (uint64_t)shape.h <= device_info.image2d_max_height &&
- (uint64_t)slices * shape.d <= device_info.image_array_max_layers;
- case TensorStorageType::TEXTURE_2D:
- return (uint64_t)shape.w * shape.b * shape.d <= device_info.image2d_max_width &&
- (uint64_t)shape.h * slices <= device_info.image2d_max_height;
- case TensorStorageType::SINGLE_TEXTURE_2D:
- return (uint64_t)shape.c <= 4 &&
- device_info.SupportsFloatImage2D(descriptor.data_type, shape.c) &&
- (uint64_t)shape.w * shape.b * shape.d <= device_info.image2d_max_width &&
- (uint64_t)shape.h <= device_info.image2d_max_height;
- default:
- return false;
- }
-}
-
-bool CanCreateTensorWithShape(const DeviceInfo &device_info, const BHWC &shape,
- const TensorDescriptor &descriptor)
-{
- const BHWDC shape5D(shape.b, shape.h, shape.w, 1, shape.c);
- return CanCreateTensorWithShape(device_info, shape5D, descriptor);
-}
-
-TensorStorageType SelectBestStorageType(const DeviceInfo &device_info, const BHWC &shape,
- const TensorStorageType &desired, const DataType &data_type,
- const Layout &layout)
-{
- if (CanCreateTensorWithShape(device_info, shape, TensorDescriptor{data_type, desired, layout}))
- {
- return desired;
- }
- auto GetBestTypeAfterTextureArray = [&]() {
- if (device_info.SupportsImageBuffer() &&
- CanCreateTensorWithShape(
- device_info, shape, TensorDescriptor{data_type, TensorStorageType::IMAGE_BUFFER, layout}))
- {
- return TensorStorageType::IMAGE_BUFFER;
- }
- else
- {
- return TensorStorageType::BUFFER;
- }
- };
- auto GetBestTypeAfterTexture2D = [&]() {
- if (device_info.SupportsTextureArray() &&
- CanCreateTensorWithShape(
- device_info, shape,
- TensorDescriptor{data_type, TensorStorageType::TEXTURE_ARRAY, layout}))
- {
- return TensorStorageType::TEXTURE_ARRAY;
- }
- else
- {
- return GetBestTypeAfterTextureArray();
- }
- };
- auto GetBestTypeAfterTexture3D = [&]() {
- if (CanCreateTensorWithShape(
- device_info, shape, TensorDescriptor{data_type, TensorStorageType::TEXTURE_2D, layout}))
- {
- return TensorStorageType::TEXTURE_2D;
- }
- else
- {
- return GetBestTypeAfterTexture2D();
- }
- };
- switch (desired)
- {
- case TensorStorageType::TEXTURE_2D:
- case TensorStorageType::SINGLE_TEXTURE_2D:
- return GetBestTypeAfterTexture2D();
- case TensorStorageType::TEXTURE_ARRAY:
- return GetBestTypeAfterTextureArray();
- case TensorStorageType::TEXTURE_3D:
- return GetBestTypeAfterTexture3D();
- case TensorStorageType::IMAGE_BUFFER:
- case TensorStorageType::BUFFER:
- return TensorStorageType::BUFFER;
- default:
- return TensorStorageType::BUFFER;
- }
-}
-
-} //
namespace gpu_cl -} // namespace backend -} // namespace onert diff --git a/runtime/onert/backend/gpu_cl/open_cl/StorageTypeUtil.h b/runtime/onert/backend/gpu_cl/open_cl/StorageTypeUtil.h deleted file mode 100644 index a84c3865f..000000000 --- a/runtime/onert/backend/gpu_cl/open_cl/StorageTypeUtil.h +++ /dev/null @@ -1,47 +0,0 @@ -/* - * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright 2020 The TensorFlow Authors. All Rights Reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef __ONERT_BACKEND_GPU_CL_OPENCL_STORAGE_TYPE_UTIL_H__ -#define __ONERT_BACKEND_GPU_CL_OPENCL_STORAGE_TYPE_UTIL_H__ - -#include "DeviceInfo.h" -#include "TensorType.h" -#include "DataType.h" -#include "Shape.h" - -namespace onert -{ -namespace backend -{ -namespace gpu_cl -{ - -bool CanCreateTensorWithShape(const DeviceInfo &device_info, const BHWDC &shape, - const TensorDescriptor &descriptor); - -bool CanCreateTensorWithShape(const DeviceInfo &device_info, const BHWC &shape, - const TensorDescriptor &descriptor); - -TensorStorageType SelectBestStorageType(const DeviceInfo &device_info, const BHWC &shape, - const TensorStorageType &desired, const DataType &data_type, - const Layout &layout); - -} // namespace gpu_cl -} // namespace backend -} // namespace onert - -#endif // __ONERT_BACKEND_GPU_CL_OPENCL_STORAGE_TYPE_UTIL_H__ diff --git a/runtime/onert/backend/gpu_cl/open_cl/Tensor.cc b/runtime/onert/backend/gpu_cl/open_cl/Tensor.cc deleted file mode 100644 index 983e0d29d..000000000 --- a/runtime/onert/backend/gpu_cl/open_cl/Tensor.cc +++ /dev/null @@ -1,690 +0,0 @@ -/* - * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright 2019 The TensorFlow Authors. All Rights Reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "Tensor.h" - -#include <cstring> -#include <vector> - -#include "absl/strings/str_cat.h" - -#include "Buffer.h" -#include "ClImageFormat.h" -#include "ClMemory.h" -#include "GpuObject.h" -#include "TensorType.h" -#include "InternalTensor.h" -#include "DataType.h" -#include "Status.h" - -namespace onert -{ -namespace backend -{ -namespace gpu_cl -{ -namespace -{ - -absl::Status AllocateTensorMemory(const CLContext &context, const BHWDC &shape, - const TensorDescriptor &descriptor, const void *data_ptr, - CLMemory *result) -{ - const int slices = DivideRoundUp(shape.c, 4); - cl_mem_flags mem_flags = CL_MEM_READ_WRITE; - if (data_ptr) - { - mem_flags |= CL_MEM_COPY_HOST_PTR; - } - switch (descriptor.storage_type) - { - case TensorStorageType::BUFFER: - case TensorStorageType::IMAGE_BUFFER: - { - const size_t data_size = - shape.b * shape.w * shape.h * shape.d * slices * 4 * SizeOf(descriptor.data_type); - cl_int error_code; - cl_mem memory = clCreateBuffer(context.context(), mem_flags, data_size, - const_cast<void *>(data_ptr), &error_code); - if (!memory) - { - return absl::UnknownError(absl::StrCat( - "Failed to allocate device memory (clCreateBuffer): ", CLErrorCodeToString(error_code))); - } - *result = CLMemory(memory, true); - return absl::OkStatus(); - } - case TensorStorageType::TEXTURE_2D: - { - cl_image_desc desc; - desc.image_type = CL_MEM_OBJECT_IMAGE2D; - desc.image_width = shape.w * shape.b * shape.d; - desc.image_height = shape.h * slices; - desc.image_depth = 0; - desc.image_row_pitch = 0; - desc.image_slice_pitch = 0; - desc.num_mip_levels = 0; - desc.num_samples = 0; - desc.buffer = nullptr; - - cl_image_format format; - format.image_channel_order = CL_RGBA; - format.image_channel_data_type = ToImageChannelType(descriptor.data_type); - - cl_int error_code; - cl_mem memory = CreateImage2DLegacy(context.context(), mem_flags, &format, &desc, - const_cast<void *>(data_ptr), &error_code); - if (error_code != CL_SUCCESS) - { - return absl::UnknownError(absl::StrCat("Failed to create 2D texture (clCreateImage): ", - CLErrorCodeToString(error_code))); - } - - *result = CLMemory(memory, true); - return absl::OkStatus(); - } - case TensorStorageType::TEXTURE_3D: - { - cl_image_desc desc; - desc.image_type = CL_MEM_OBJECT_IMAGE3D; - desc.image_width = shape.w * shape.b; - desc.image_height = shape.h; - desc.image_depth = slices * shape.d; - desc.image_row_pitch = 0; - desc.image_slice_pitch = 0; - desc.num_mip_levels = 0; - desc.num_samples = 0; - desc.buffer = nullptr; - - cl_image_format format; - format.image_channel_order = CL_RGBA; - format.image_channel_data_type = ToImageChannelType(descriptor.data_type); - - cl_int error_code; - cl_mem memory = CreateImage3DLegacy(context.context(), mem_flags, &format, &desc, - const_cast<void *>(data_ptr), &error_code); - if (error_code != CL_SUCCESS) - { - return absl::UnknownError(absl::StrCat("Failed to create 3D texture (clCreateImage): ", - CLErrorCodeToString(error_code))); - } - - *result = CLMemory(memory, true); - return absl::OkStatus(); - } - case TensorStorageType::TEXTURE_ARRAY: - { - cl_image_desc desc; - desc.image_type = CL_MEM_OBJECT_IMAGE2D_ARRAY; - desc.image_width = shape.w * shape.b; - desc.image_height = shape.h; - desc.image_depth = 0; - desc.image_array_size = slices * shape.d; - desc.image_row_pitch = 0; - desc.image_slice_pitch = 0; - desc.num_mip_levels = 0; - desc.num_samples = 0; - desc.buffer = nullptr; - - cl_image_format format; - format.image_channel_order = CL_RGBA; - 
format.image_channel_data_type = ToImageChannelType(descriptor.data_type);
-
- cl_int error_code;
- cl_mem memory = clCreateImage(context.context(), mem_flags, &format, &desc,
- const_cast<void *>(data_ptr), &error_code);
- if (error_code != CL_SUCCESS)
- {
- return absl::UnknownError(absl::StrCat(
- "Failed to create 2D texture array (clCreateImage): ", CLErrorCodeToString(error_code)));
- }
-
- *result = CLMemory(memory, true);
- return absl::OkStatus();
- }
-
- case TensorStorageType::SINGLE_TEXTURE_2D:
- {
- if (slices != 1)
- {
- return absl::InvalidArgumentError(absl::StrCat(
- "SINGLE_TEXTURE_2D supports only channels in range [1-4], but ", shape.c, " was provided"));
- }
- cl_image_desc desc;
- desc.image_type = CL_MEM_OBJECT_IMAGE2D;
- desc.image_width = shape.w * shape.b * shape.d;
- desc.image_height = shape.h;
- desc.image_depth = 0;
- desc.image_row_pitch = 0;
- desc.image_slice_pitch = 0;
- desc.num_mip_levels = 0;
- desc.num_samples = 0;
- desc.buffer = nullptr;
-
- cl_image_format format;
- if (context.IsFloatTexture2DSupported(shape.c, descriptor.data_type))
- {
- format.image_channel_order = ToChannelOrder(shape.c);
- format.image_channel_data_type = ToImageChannelType(descriptor.data_type);
- }
- else
- {
- return absl::InvalidArgumentError(
- absl::StrCat("This device doesn't support ", shape.c, "-channel textures."));
- }
-
- cl_int error_code;
- cl_mem memory = CreateImage2DLegacy(context.context(), mem_flags, &format, &desc,
- const_cast<void *>(data_ptr), &error_code);
- if (error_code != CL_SUCCESS)
- {
- return absl::UnknownError(absl::StrCat(
- "Failed to create single 2D texture (clCreateImage): ", CLErrorCodeToString(error_code)));
- }
-
- *result = CLMemory(memory, true);
- return absl::OkStatus();
- }
-
- default:
- return absl::InternalError("Unsupported tensor storage type");
- }
-}
-
-absl::Status CreateImageBufferFromBuffer(const CLContext &context, cl_mem memory,
- DataType data_type, int width, cl_mem *result)
-{
- cl_image_format format;
- cl_image_desc desc;
- std::memset(&desc, 0, sizeof(desc));
- desc.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
- desc.image_width = width;
- desc.mem_object = memory;
-
- format.image_channel_data_type = ToImageChannelType(data_type);
- format.image_channel_order = CL_RGBA;
-
- cl_int error_code;
- *result =
- clCreateImage(context.context(), CL_MEM_READ_WRITE, &format, &desc, nullptr, &error_code);
- if (error_code != CL_SUCCESS)
- {
- return absl::UnknownError(absl::StrCat("Failed to create Image from Buffer (clCreateImage): ",
- CLErrorCodeToString(error_code)));
- }
- return absl::OkStatus();
-}
-
-absl::Status CreateTensor(const CLContext &context, const BHWDC &shape,
- const TensorDescriptor &descriptor, cl_mem memory, Tensor *result)
-{
- const bool memory_owner = memory == nullptr;
- if (memory_owner)
- {
- CLMemory mem;
- RETURN_IF_ERROR(AllocateTensorMemory(context, shape, descriptor, nullptr, &mem));
- memory = mem.Release();
- }
- if (descriptor.storage_type == TensorStorageType::IMAGE_BUFFER)
- {
- cl_mem image_memory;
- RETURN_IF_ERROR(CreateImageBufferFromBuffer(
- context, memory, descriptor.data_type,
- shape.b * shape.w * shape.h * shape.d * DivideRoundUp(shape.c, 4), &image_memory));
- *result = Tensor(memory, memory_owner, image_memory, shape, descriptor);
- }
- else
- {
- *result = Tensor(memory, memory_owner, shape, descriptor);
- }
- return absl::OkStatus();
-}
-
-absl::Status CreateTensorShared(const CLContext &context, const BHWDC &shape,
- const TensorDescriptor &descriptor, cl_mem memory,
Tensor *result)
-{
- const bool memory_owner = false;
- if (descriptor.storage_type == TensorStorageType::IMAGE_BUFFER)
- {
- cl_mem image_memory;
- RETURN_IF_ERROR(CreateImageBufferFromBuffer(
- context, memory, descriptor.data_type,
- shape.b * shape.w * shape.h * shape.d * DivideRoundUp(shape.c, 4), &image_memory));
- *result = Tensor(memory, memory_owner, image_memory, shape, descriptor);
- }
- else
- {
- *result = Tensor(memory, memory_owner, shape, descriptor);
- }
- return absl::OkStatus();
-}
-
-} // namespace
-
-absl::Status TensorDescriptor::CreateGPUObject(CLContext *context, GPUObjectPtr *result) const
-{
- Tensor gpu_tensor;
- RETURN_IF_ERROR(gpu_tensor.CreateFromDescriptor(*this, context));
- *result = absl::make_unique<Tensor>(std::move(gpu_tensor));
- return absl::OkStatus();
-}
-
-Tensor::Tensor(cl_mem memory, bool memory_owner, const BHWC &shape,
- const TensorDescriptor &descriptor)
- : memory_(memory), image_buffer_memory_(nullptr), memory_owner_(memory_owner),
- shape_(shape.b, shape.h, shape.w, 1, shape.c), descriptor_(descriptor)
-{
-}
-
-Tensor::Tensor(cl_mem memory, bool memory_owner, const BHWDC &shape,
- const TensorDescriptor &descriptor)
- : memory_(memory), image_buffer_memory_(nullptr), memory_owner_(memory_owner), shape_(shape),
- descriptor_(descriptor)
-{
-}
-
-Tensor::Tensor(cl_mem memory, bool memory_owner, cl_mem image_buffer_memory, const BHWC &shape,
- const TensorDescriptor &descriptor)
- : memory_(memory), image_buffer_memory_(image_buffer_memory), memory_owner_(memory_owner),
- shape_(shape.b, shape.h, shape.w, 1, shape.c), descriptor_(descriptor)
-{
-}
-
-Tensor::Tensor(cl_mem memory, bool memory_owner, cl_mem image_buffer_memory, const BHWDC &shape,
- const TensorDescriptor &descriptor)
- : memory_(memory), image_buffer_memory_(image_buffer_memory), memory_owner_(memory_owner),
- shape_(shape), descriptor_(descriptor)
-{
-}
-
-Tensor::Tensor(Tensor &&tensor)
- : memory_(tensor.memory_), image_buffer_memory_(tensor.image_buffer_memory_),
- memory_owner_(tensor.memory_owner_), shape_(tensor.shape_), descriptor_(tensor.descriptor_)
-{
- tensor.memory_ = nullptr;
- tensor.image_buffer_memory_ = nullptr;
-}
-
-Tensor &Tensor::operator=(Tensor &&tensor)
-{
- if (this != &tensor)
- {
- Release();
- std::swap(memory_, tensor.memory_);
- std::swap(image_buffer_memory_, tensor.image_buffer_memory_);
- std::swap(memory_owner_, tensor.memory_owner_);
- std::swap(shape_, tensor.shape_);
- std::swap(descriptor_, tensor.descriptor_);
- }
- return *this;
-}
-
-void Tensor::Release()
-{
- // image_buffer_memory_ is always owned by this object
- if (image_buffer_memory_)
- {
- clReleaseMemObject(image_buffer_memory_);
- image_buffer_memory_ = nullptr;
- }
- if (memory_owner_ && memory_)
- {
- clReleaseMemObject(memory_);
- memory_ = nullptr;
- }
-}
-
-absl::Status Tensor::GetGPUResources(const GPUObjectDescriptor *obj_ptr,
- GPUResourcesWithValue *resources) const
-{
- const auto *buffer_desc = dynamic_cast<const BufferDescriptor *>(obj_ptr);
- if (buffer_desc)
- {
- if (descriptor_.storage_type != TensorStorageType::BUFFER)
- {
- return absl::InvalidArgumentError("Tensor can be used with BufferDescriptor only with "
- "TensorStorageType::BUFFER.");
- }
- resources->buffers.push_back({"buffer", memory_});
- return absl::OkStatus();
- }
- const auto *tensor_desc = dynamic_cast<const TensorDescriptor *>(obj_ptr);
- if (!tensor_desc)
- {
- return absl::InvalidArgumentError("Expected TensorDescriptor on input.");
- }
- if (descriptor_.HasAxis(Axis::WIDTH))
- {
-
resources->ints.push_back({"width", Width()}); - resources->ints.push_back({"width_div2", Width() / 2}); - resources->ints.push_back({"width_div4", Width() / 4}); - resources->ints.push_back({"width_batched", Width() * Batch()}); - resources->ints.push_back({"width_batched_div2", Width() * Batch() / 2}); - resources->ints.push_back({"width_batched_div4", Width() * Batch() / 4}); - } - if (descriptor_.HasAxis(Axis::HEIGHT)) - { - resources->ints.push_back({"height", Height()}); - } - if (descriptor_.HasAxis(Axis::CHANNELS)) - { - resources->ints.push_back({"slices", Slices()}); - resources->ints.push_back({"channels", Channels()}); - } - if (descriptor_.HasAxis(Axis::BATCH)) - { - resources->ints.push_back({"batch", Batch()}); - } - if (descriptor_.HasAxis(Axis::DEPTH)) - { - resources->ints.push_back({"depth", Depth()}); - } - - if (descriptor_.storage_type == TensorStorageType::BUFFER) - { - resources->buffers.push_back({"buffer", memory_}); - } - else if (descriptor_.storage_type == TensorStorageType::TEXTURE_2D || - descriptor_.storage_type == TensorStorageType::SINGLE_TEXTURE_2D) - { - resources->images2d.push_back({"image2d", memory_}); - } - else if (descriptor_.storage_type == TensorStorageType::TEXTURE_ARRAY) - { - resources->image2d_arrays.push_back({"image2d_array", memory_}); - } - else if (descriptor_.storage_type == TensorStorageType::TEXTURE_3D) - { - resources->images3d.push_back({"image3d", memory_}); - } - else if (descriptor_.storage_type == TensorStorageType::IMAGE_BUFFER) - { - if (obj_ptr->GetAccess() == AccessType::READ) - { - resources->image_buffers.push_back({"image_buffer", image_buffer_memory_}); - } - else - { - resources->buffers.push_back({"buffer", memory_}); - } - } - - return absl::OkStatus(); -} - -int3 Tensor::GetFullTensorRegion() const -{ - switch (descriptor_.storage_type) - { - case TensorStorageType::BUFFER: - case TensorStorageType::TEXTURE_ARRAY: - case TensorStorageType::TEXTURE_3D: - case TensorStorageType::IMAGE_BUFFER: - return {shape_.w * shape_.b, shape_.h, shape_.d * Slices()}; - case TensorStorageType::TEXTURE_2D: - return {shape_.w * shape_.b * shape_.d, shape_.h * Slices(), 1}; - case TensorStorageType::SINGLE_TEXTURE_2D: - return {shape_.w * shape_.b * shape_.d, shape_.h, 1}; - case TensorStorageType::UNKNOWN: - return {-1, -1, -1}; - } - return {-1, -1, -1}; -} - -absl::Status Tensor::IsValid(const BHWC &shape) const -{ - if (shape.b != shape_.b) - { - return absl::InvalidArgumentError("Shape batch does not match tensor batch"); - } - if (shape.w != shape_.w) - { - return absl::InvalidArgumentError("Shape width does not match tensor width"); - } - if (shape.h != shape_.h) - { - return absl::InvalidArgumentError("Shape height does not match tensor height"); - } - if (shape.c != shape_.c) - { - return absl::InvalidArgumentError("Shape channels does not match tensor channels"); - } - return absl::OkStatus(); -} - -absl::Status Tensor::IsValid(const BHWDC &shape) const -{ - if (shape.b != shape_.b) - { - return absl::InvalidArgumentError("Shape batch does not match tensor batch"); - } - if (shape.w != shape_.w) - { - return absl::InvalidArgumentError("Shape width does not match tensor width"); - } - if (shape.h != shape_.h) - { - return absl::InvalidArgumentError("Shape height does not match tensor height"); - } - if (shape.d != shape_.d) - { - return absl::InvalidArgumentError("Shape depth does not match tensor depth"); - } - if (shape.c != shape_.c) - { - return absl::InvalidArgumentError("Shape channels does not match tensor channels"); 
- } - return absl::OkStatus(); -} - -int Tensor::GetAlignedChannels() const -{ - return descriptor_.storage_type == TensorStorageType::SINGLE_TEXTURE_2D ? shape_.c - : AlignByN(shape_.c, 4); -} - -uint64_t Tensor::GetMemorySizeInBytes() const -{ - const uint64_t flt_size = static_cast<uint64_t>(SizeOf(descriptor_.data_type)); - const uint64_t flt4_size = 4 * flt_size; - switch (descriptor_.storage_type) - { - case TensorStorageType::BUFFER: - case TensorStorageType::IMAGE_BUFFER: - case TensorStorageType::TEXTURE_ARRAY: - case TensorStorageType::TEXTURE_2D: - case TensorStorageType::TEXTURE_3D: - return flt4_size * shape_.b * shape_.w * shape_.h * shape_.d * Slices(); - case TensorStorageType::SINGLE_TEXTURE_2D: - return flt_size * shape_.w * shape_.h * shape_.c * shape_.b * shape_.d; - default: - return 0; - } -} - -cl_mem Tensor::GetMemoryPtr() const -{ - return descriptor_.storage_type == TensorStorageType::IMAGE_BUFFER ? image_buffer_memory_ - : memory_; -} - -cl_mem Tensor::GetMemoryPtrForWriting() const { return memory_; } - -absl::Status Tensor::WriteDataBHWDC(absl::Span<const float> in, CLCommandQueue *queue) -{ - void *data_ptr = nullptr; - const int aligned_channels = GetAlignedChannels(); - const int elements_count = shape_.b * shape_.w * shape_.h * shape_.d * aligned_channels; - - const size_t data_size = elements_count * SizeOf(descriptor_.data_type); - std::vector<float> data_f; - data_f.resize(elements_count); - data_ptr = data_f.data(); - DataFromBHWDC(in, shape_, descriptor_, absl::MakeSpan(data_f.data(), data_f.size())); - - switch (descriptor_.storage_type) - { - case TensorStorageType::BUFFER: - case TensorStorageType::IMAGE_BUFFER: - RETURN_IF_ERROR(queue->EnqueueWriteBuffer(memory_, data_size, data_ptr)); - break; - case TensorStorageType::TEXTURE_ARRAY: - case TensorStorageType::TEXTURE_2D: - case TensorStorageType::TEXTURE_3D: - case TensorStorageType::SINGLE_TEXTURE_2D: - RETURN_IF_ERROR(queue->EnqueueWriteImage(memory_, GetFullTensorRegion(), data_ptr)); - break; - default: - return absl::InternalError("Unsupported tensor storage type"); - } - - return absl::OkStatus(); -} - -absl::Status Tensor::WriteData(CLCommandQueue *queue, const TensorFloat32 &src) -{ - RETURN_IF_ERROR(IsValid(src.shape)); - return WriteDataBHWDC(absl::MakeConstSpan(src.data), queue); -} - -absl::Status Tensor::WriteData(CLCommandQueue *queue, - const InternalTensor<Linear, DataType::FLOAT32> &src) -{ - return WriteDataBHWDC(absl::MakeConstSpan(src.data), queue); -} - -absl::Status Tensor::WriteData(CLCommandQueue *queue, - const InternalTensor<HWC, DataType::FLOAT32> &src) -{ - return WriteDataBHWDC(absl::MakeConstSpan(src.data), queue); -} - -absl::Status Tensor::WriteData(CLCommandQueue *queue, const Tensor5DFloat32 &src) -{ - RETURN_IF_ERROR(IsValid(src.shape)); - return WriteDataBHWDC(absl::MakeConstSpan(src.data), queue); -} - -absl::Status Tensor::ReadDataBHWDC(absl::Span<float> out, CLCommandQueue *queue) const -{ - void *data_ptr = nullptr; - const int aligned_channels = GetAlignedChannels(); - const int elements_count = shape_.b * shape_.w * shape_.h * shape_.d * aligned_channels; - const size_t data_size = elements_count * SizeOf(descriptor_.data_type); - - std::vector<float> data_f; - data_f.resize(elements_count); - data_ptr = data_f.data(); - switch (descriptor_.storage_type) - { - case TensorStorageType::BUFFER: - case TensorStorageType::IMAGE_BUFFER: - RETURN_IF_ERROR(queue->EnqueueReadBuffer(memory_, data_size, data_ptr)); - break; - case TensorStorageType::TEXTURE_ARRAY: - 
case TensorStorageType::TEXTURE_2D: - case TensorStorageType::TEXTURE_3D: - case TensorStorageType::SINGLE_TEXTURE_2D: - RETURN_IF_ERROR(queue->EnqueueReadImage(memory_, GetFullTensorRegion(), data_ptr)); - break; - default: - return absl::InternalError("Unsupported tensor storage type"); - } - - if (descriptor_.data_type == DataType::FLOAT32) - { - DataToBHWDC(absl::MakeConstSpan(data_f.data(), data_f.size()), shape_, descriptor_, out); - } - - return absl::OkStatus(); -} - -absl::Status Tensor::ReadData(CLCommandQueue *queue, TensorFloat32 *dst) const -{ - RETURN_IF_ERROR(IsValid(dst->shape)); - return ReadDataBHWDC(absl::MakeSpan(dst->data), queue); -} - -absl::Status Tensor::ReadData(CLCommandQueue *queue, Tensor5DFloat32 *dst) const -{ - RETURN_IF_ERROR(IsValid(dst->shape)); - return ReadDataBHWDC(absl::MakeSpan(dst->data), queue); -} - -absl::Status Tensor::CreateFromDescriptor(const TensorDescriptor &desc, CLContext *context) -{ - shape_ = desc.shape; - descriptor_.data_type = desc.data_type; - descriptor_.storage_type = desc.storage_type; - descriptor_.layout = desc.layout; - memory_owner_ = true; - CLMemory memory; - uint8_t *data_ptr = desc.data.empty() ? nullptr : const_cast<unsigned char *>(desc.data.data()); - RETURN_IF_ERROR(AllocateTensorMemory(*context, shape_, descriptor_, data_ptr, &memory)); - memory_ = memory.Release(); - if (desc.storage_type == TensorStorageType::IMAGE_BUFFER) - { - RETURN_IF_ERROR(CreateImageBufferFromBuffer(*context, memory_, desc.data_type, - shape_.b * shape_.w * shape_.h * shape_.d * - DivideRoundUp(shape_.c, 4), - &image_buffer_memory_)); - } - return absl::OkStatus(); -} - -absl::Status CreateTensor(const CLContext &context, const BHWC &shape, - const TensorDescriptor &descriptor, Tensor *result) -{ - const BHWDC shape5D(shape.b, shape.h, shape.w, 1, shape.c); - return CreateTensor(context, shape5D, descriptor, nullptr, result); -} - -absl::Status CreateTensor(const CLContext &context, const BHWDC &shape, - const TensorDescriptor &descriptor, Tensor *result) -{ - return CreateTensor(context, shape, descriptor, nullptr, result); -} - -absl::Status CreateSharedTensor(const CLContext &context, cl_mem memory, const BHWC &shape, - const TensorDescriptor &descriptor, Tensor *result) -{ - const BHWDC shape5D(shape.b, shape.h, shape.w, 1, shape.c); - return CreateTensorShared(context, shape5D, descriptor, memory, result); -} - -absl::Status CreateSharedTensor(const CLContext &context, cl_mem memory, const BHWDC &shape, - const TensorDescriptor &descriptor, Tensor *result) -{ - return CreateTensorShared(context, shape, descriptor, memory, result); -} - -absl::Status AllocateTensorMemory(const CLContext &context, const BHWC &shape, - const TensorDescriptor &descriptor, CLMemory *result) -{ - const BHWDC shape5D(shape.b, shape.h, shape.w, 1, shape.c); - return AllocateTensorMemory(context, shape5D, descriptor, nullptr, result); -} - -absl::Status AllocateTensorMemory(const CLContext &context, const BHWDC &shape, - const TensorDescriptor &descriptor, CLMemory *result) -{ - return AllocateTensorMemory(context, shape, descriptor, nullptr, result); -} - -} // namespace gpu_cl -} // namespace backend -} // namespace onert diff --git a/runtime/onert/backend/gpu_cl/open_cl/Tensor.h b/runtime/onert/backend/gpu_cl/open_cl/Tensor.h deleted file mode 100644 index b1930a423..000000000 --- a/runtime/onert/backend/gpu_cl/open_cl/Tensor.h +++ /dev/null @@ -1,142 +0,0 @@ -/* - * Copyright (c) 2021 Samsung Electronics Co., Ltd. 
All Rights Reserved - * Copyright 2019 The TensorFlow Authors. All Rights Reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef __ONERT_BACKEND_GPU_CL_OPENCL_TENSOR_H__ -#define __ONERT_BACKEND_GPU_CL_OPENCL_TENSOR_H__ - -#include <cstdint> -#include <memory> - -#include "absl/types/span.h" -#include "ClCommandQueue.h" -#include "OpenclWrapper.h" -#include "ClContext.h" -#include "ClDevice.h" -#include "ClMemory.h" -#include "GpuObject.h" -#include "TensorType.h" -#include "Util.h" -#include "DataType.h" -#include "Shape.h" -#include "Status.h" -#include "InternalTensor.h" -#include "Types.h" - -namespace onert -{ -namespace backend -{ -namespace gpu_cl -{ - -class Tensor : public GPUObject -{ -public: - Tensor() : memory_(nullptr), image_buffer_memory_(nullptr), memory_owner_(true) {} - Tensor(cl_mem memory, bool memory_owner, const BHWC &shape, const TensorDescriptor &descriptor); - Tensor(cl_mem memory, bool memory_owner, const BHWDC &shape, const TensorDescriptor &descriptor); - Tensor(cl_mem memory, bool memory_owner, cl_mem image_buffer_memory, const BHWC &shape, - const TensorDescriptor &descriptor); - Tensor(cl_mem memory, bool memory_owner, cl_mem image_buffer_memory, const BHWDC &shape, - const TensorDescriptor &descriptor); - - // Move only - Tensor(Tensor &&tensor); - Tensor &operator=(Tensor &&tensor); - Tensor(const Tensor &) = delete; - Tensor &operator=(const Tensor &) = delete; - - virtual ~Tensor() { Release(); } - - absl::Status GetGPUResources(const GPUObjectDescriptor *obj_ptr, - GPUResourcesWithValue *resources) const override; - - int Width() const { return shape_.w; } - int Height() const { return shape_.h; } - int Depth() const { return shape_.d; } - int Channels() const { return shape_.c; } - int Slices() const { return DivideRoundUp(shape_.c, 4); } - int Batch() const { return shape_.b; } - TensorDescriptor GetDescriptor() const { return descriptor_; } - DataType GetDataType() const { return descriptor_.data_type; } - TensorStorageType GetStorageType() const { return descriptor_.storage_type; } - - // for profiling and memory statistics - uint64_t GetMemorySizeInBytes() const; - - cl_mem GetMemoryPtr() const; - - // This function returns buffer memory ptr for IMAGE_BUFFER instead of image - // memory ptr. 
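-//
-// Hedged usage sketch (the read/write pairing is inferred from
-// Tensor::GetGPUResources in Tensor.cc, not stated explicitly in this header):
-//
-//   cl_mem read_mem = tensor.GetMemoryPtr();            // image view for reads
-//   cl_mem write_mem = tensor.GetMemoryPtrForWriting(); // raw buffer for writes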
- cl_mem GetMemoryPtrForWriting() const; - - absl::Status WriteData(CLCommandQueue *queue, const TensorFloat32 &src); - absl::Status WriteData(CLCommandQueue *queue, - const InternalTensor<Linear, DataType::FLOAT32> &src); - absl::Status WriteData(CLCommandQueue *queue, const InternalTensor<HWC, DataType::FLOAT32> &src); - - absl::Status WriteData(CLCommandQueue *queue, const Tensor5DFloat32 &src); - absl::Status ReadData(CLCommandQueue *queue, TensorFloat32 *dst) const; - absl::Status ReadData(CLCommandQueue *queue, Tensor5DFloat32 *dst) const; - - absl::Status CreateFromDescriptor(const TensorDescriptor &desc, CLContext *context); - -private: - absl::Status IsValid(const BHWC &shape) const; - absl::Status IsValid(const BHWDC &shape) const; - - int GetChannelsAlignment() const; - int GetAlignedChannels() const; - - absl::Status WriteDataBHWDC(absl::Span<const float> in, CLCommandQueue *queue); - absl::Status ReadDataBHWDC(absl::Span<float> out, CLCommandQueue *queue) const; - - int3 GetFullTensorRegion() const; - void Release(); - - cl_mem memory_; - cl_mem image_buffer_memory_; // for TensorStorageType::IMAGE_BUFFER only - bool memory_owner_; - BHWDC shape_; - TensorDescriptor descriptor_; -}; - -using TensorPtr = std::shared_ptr<Tensor>; - -absl::Status AllocateTensorMemory(const CLContext &context, const BHWC &shape, - const TensorDescriptor &descriptor, CLMemory *result); - -absl::Status AllocateTensorMemory(const CLContext &context, const BHWDC &shape, - const TensorDescriptor &descriptor, CLMemory *result); - -absl::Status CreateTensor(const CLContext &context, const BHWC &shape, - const TensorDescriptor &descriptor, Tensor *result); - -absl::Status CreateTensor(const CLContext &context, const BHWDC &shape, - const TensorDescriptor &descriptor, Tensor *result); - -absl::Status CreateSharedTensor(const CLContext &context, cl_mem memory, const BHWC &shape, - const TensorDescriptor &descriptor, Tensor *result); - -absl::Status CreateSharedTensor(const CLContext &context, cl_mem memory, const BHWDC &shape, - const TensorDescriptor &descriptor, Tensor *result); - -} // namespace gpu_cl -} // namespace backend -} // namespace onert - -#endif // __ONERT_BACKEND_GPU_CL_OPENCL_TENSOR_H__ diff --git a/runtime/onert/backend/gpu_cl/open_cl/TensorType.cc b/runtime/onert/backend/gpu_cl/open_cl/TensorType.cc deleted file mode 100644 index 7ede38795..000000000 --- a/runtime/onert/backend/gpu_cl/open_cl/TensorType.cc +++ /dev/null @@ -1,1116 +0,0 @@ -/* - * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright 2019 The TensorFlow Authors. All Rights Reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "TensorType.h" - -#include "absl/strings/str_cat.h" -#include "absl/strings/substitute.h" -#include "Shape.h" -#include "DataType.h" - -namespace onert -{ -namespace backend -{ -namespace gpu_cl -{ -namespace -{ - -std::string GetWriteImageFromDataType(DataType data_type) -{ - if (data_type == DataType::FLOAT32) - { - return "write_imagef"; - } - else if (data_type == DataType::FLOAT16) - { - return "write_imageh"; - } - else - { - throw std::runtime_error("Not supported data type"); - } -} - -} // namespace - -std::string TextureAddressModeToString(TextureAddressMode address_mode) -{ - switch (address_mode) - { - case TextureAddressMode::DONT_CARE: - return "smp_none"; - case TextureAddressMode::ZERO: - return "smp_zero"; - } - return ""; -} - -std::string ToString(TensorStorageType type) -{ - switch (type) - { - case TensorStorageType::UNKNOWN: - return "TensorStorageType::UNKNOWN"; - case TensorStorageType::BUFFER: - return "TensorStorageType::BUFFER"; - case TensorStorageType::TEXTURE_ARRAY: - return "TensorStorageType::TEXTURE_ARRAY"; - case TensorStorageType::TEXTURE_2D: - return "TensorStorageType::TEXTURE_2D"; - case TensorStorageType::TEXTURE_3D: - return "TensorStorageType::TEXTURE_3D"; - case TensorStorageType::SINGLE_TEXTURE_2D: - return "TensorStorageType::SINGLE_TEXTURE_2D"; - case TensorStorageType::IMAGE_BUFFER: - return "TensorStorageType::IMAGE_BUFFER"; - } - return ""; -} - -TensorDescriptor::TensorDescriptor(TensorDescriptor &&desc) - : GPUObjectDescriptor(std::move(desc)), data_type(desc.data_type), - storage_type(desc.storage_type), layout(desc.layout), shape(desc.shape), - data(std::move(desc.data)) -{ -} -TensorDescriptor &TensorDescriptor::operator=(TensorDescriptor &&desc) -{ - if (this != &desc) - { - std::swap(data_type, desc.data_type); - std::swap(storage_type, desc.storage_type); - std::swap(layout, desc.layout); - std::swap(shape, desc.shape); - data = std::move(desc.data); - GPUObjectDescriptor::operator=(std::move(desc)); - } - return *this; -} - -GPUResources TensorDescriptor::GetGPUResources() const -{ - GPUResources resources; - if (HasAxis(Axis::WIDTH)) - { - resources.ints.push_back("width"); - resources.ints.push_back("width_div2"); - resources.ints.push_back("width_div4"); - resources.ints.push_back("width_batched"); - resources.ints.push_back("width_batched_div2"); - resources.ints.push_back("width_batched_div4"); - } - if (HasAxis(Axis::HEIGHT)) - { - resources.ints.push_back("height"); - } - if (HasAxis(Axis::CHANNELS)) - { - resources.ints.push_back("slices"); - resources.ints.push_back("channels"); - } - if (HasAxis(Axis::BATCH)) - { - resources.ints.push_back("batch"); - } - if (HasAxis(Axis::DEPTH)) - { - resources.ints.push_back("depth"); - } - if (storage_type == TensorStorageType::BUFFER) - { - GPUBufferDescriptor desc; - desc.data_type = data_type; - desc.access_type = access_type_; - desc.element_size = 4; - auto it1 = state_vars_.find("ElementsX2"); - if (it1 != state_vars_.end() && it1->second == "true") - { - desc.element_size = 8; - } - auto it2 = state_vars_.find("ElementsX4"); - if (it2 != state_vars_.end() && it2->second == "true") - { - desc.element_size = 16; - } - resources.buffers.push_back({"buffer", desc}); - } - else if (storage_type == TensorStorageType::SINGLE_TEXTURE_2D || - storage_type == TensorStorageType::TEXTURE_2D) - { - GPUImage2DDescriptor desc; - desc.data_type = data_type; - desc.access_type = access_type_; - resources.images2d.push_back({"image2d", desc}); - } - else if (storage_type == 
TensorStorageType::TEXTURE_ARRAY) - { - GPUImage2DArrayDescriptor desc; - desc.data_type = data_type; - desc.access_type = access_type_; - resources.image2d_arrays.push_back({"image2d_array", desc}); - } - else if (storage_type == TensorStorageType::TEXTURE_3D) - { - GPUImage3DDescriptor desc; - desc.data_type = data_type; - desc.access_type = access_type_; - resources.images3d.push_back({"image3d", desc}); - } - else if (storage_type == TensorStorageType::IMAGE_BUFFER) - { - if (access_type_ == AccessType::READ) - { - GPUImageBufferDescriptor desc; - desc.data_type = data_type; - desc.access_type = access_type_; - resources.image_buffers.push_back({"image_buffer", desc}); - } - else - { - GPUBufferDescriptor desc; - desc.data_type = data_type; - desc.access_type = access_type_; - desc.element_size = 4; - resources.buffers.push_back({"buffer", desc}); - } - } - return resources; -} - -absl::Status TensorDescriptor::PerformSelector(const std::string &selector, - const std::vector<std::string> &args, - const std::vector<std::string> &template_args, - std::string *result) const -{ - if (selector == "Width") - { - *result = GetWidth(); - return absl::OkStatus(); - } - else if (selector == "Height") - { - *result = "height"; - return absl::OkStatus(); - } - else if (selector == "Slices") - { - *result = "slices"; - return absl::OkStatus(); - } - else if (selector == "SliceStride") - { - *result = GetSliceStride(); - return absl::OkStatus(); - } - else if (selector == "Channels") - { - *result = "channels"; - return absl::OkStatus(); - } - else if (selector == "Batch") - { - if (HasAxis(Axis::BATCH)) - { - *result = "batch"; - } - else - { - *result = "1"; - } - return absl::OkStatus(); - } - else if (selector == "Depth") - { - *result = "depth"; - return absl::OkStatus(); - } - else if (selector == "SetBatchRef") - { - if (args.size() != 1) - { - return absl::InvalidArgumentError("Unsupported arguments in SetBatchRef selector"); - } - state_vars_["batch_id"] = args[0]; - *result = ""; - return absl::OkStatus(); - } - else if (selector == "Read") - { - return PerformReadSelector(args, template_args, result); - } - else if (selector == "Write") - { - return PerformWriteSelector(args, result); - } - else if (selector == "WriteLinear") - { - return PerformWriteLinearSelector(args, result); - } - else if (selector == "GetAddress") - { - return PerformGetAddressSelector(args, result); - } - else if (selector == "GetPtrWithSliceOffset") - { - return PerformGetPtrWithSliceOffsetSelector(args, result); - } - else if (selector == "GetWHOffset") - { - return PerformGetWHOffsetSelector(args, result); - } - else if (selector == "GetHandle") - { - return PerformGetHandleSelector(args, result); - } - else - { - return absl::NotFoundError( - absl::StrCat("TensorDescriptor don't have selector with name - ", selector)); - } -} - -absl::Status TensorDescriptor::PerformReadSelector(const std::vector<std::string> &args, - const std::vector<std::string> &template_args, - std::string *result) const -{ - DataType read_as_type = data_type; - if (!template_args.empty()) - { - if (template_args.size() != 1) - { - return absl::NotFoundError("Unrecognized Read selector template arguments."); - } - else - { - RETURN_IF_ERROR(GetDataTypeFromTemplateArgs(template_args[0], &read_as_type)); - } - } - if (args.size() == 1) - { // function overload for 1D linear types. 
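- // (Illustrative expansions, assuming a float32 tensor; these strings are not
- // in the original source. For linear storage, tensor.Read(addr) in a kernel
- // template becomes
- //   BUFFER:       buffer[addr]
- //   IMAGE_BUFFER: read_imagef(image_buffer, addr)
- // while the multi-coordinate form handled below, e.g. Read(X, Y, S) on a
- // TEXTURE_2D tensor with HWC layout, becomes
- //   read_imagef(image2d, smp_none, (int2)((X), (Y) * slices + (S))).)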
-    if (storage_type == TensorStorageType::BUFFER ||
-        storage_type == TensorStorageType::IMAGE_BUFFER)
-    {
-      *result = Read(read_as_type, args[0]);
-      return absl::OkStatus();
-    }
-    else
-    {
-      return absl::InvalidArgumentError(
-        "Read selector with single argument can be used only with linear "
-        "storage types (BUFFER or IMAGE_BUFFER)");
-    }
-  }
-  std::string xc;
-  std::string yc;
-  std::string zc;
-  std::string sc;
-  std::string bc;
-  bool parsed = ParseCoordsFromArgs(args, 0, &xc, &yc, &zc, &sc, &bc);
-  if (args.size() < 2 || !parsed)
-  {
-    return absl::NotFoundError("Unrecognized Read selector");
-  }
-
-  *result = Read(read_as_type, GetGlobalAddressNoDeclaration(xc, yc, zc, sc, bc));
-  return absl::OkStatus();
-}
-
-absl::Status TensorDescriptor::GetLinkingContextFromWriteSelector(
-  const std::vector<std::string> &args, std::string *value_name, std::string *x_coord,
-  std::string *y_coord, std::string *s_coord) const
-{
-  std::string xc;
-  std::string yc;
-  std::string zc;
-  std::string sc;
-  std::string bc;
-  bool parsed = ParseCoordsFromArgs(args, 1, &xc, &yc, &zc, &sc, &bc);
-  if (args.size() < 2 || !parsed)
-  {
-    return absl::NotFoundError("Unrecognized Write selector");
-  }
-  *value_name = args[0];
-  if (HasAxis(Axis::BATCH) && !IsBatchedWidth())
-  {
-    *x_coord = absl::StrCat("((", xc, ") * batch + (", bc, "))");
-  }
-  else
-  {
-    *x_coord = absl::StrCat("(", xc, ")");
-  }
-  *y_coord = absl::StrCat("(", yc, ")");
-  *s_coord = absl::StrCat("(", sc, ")");
-  return absl::OkStatus();
-}
-
-absl::Status TensorDescriptor::PerformWriteSelector(const std::vector<std::string> &args,
-                                                    std::string *result) const
-{
-  std::string xc;
-  std::string yc;
-  std::string zc;
-  std::string sc;
-  std::string bc;
-  bool parsed = ParseCoordsFromArgs(args, 1, &xc, &yc, &zc, &sc, &bc);
-  if (args.size() < 2 || !parsed)
-  {
-    return absl::NotFoundError("Unrecognized Write selector");
-  }
-  *result = Write(args[0], GetGlobalAddressNoDeclaration(xc, yc, zc, sc, bc));
-  return absl::OkStatus();
-}
-
-absl::Status TensorDescriptor::PerformWriteLinearSelector(const std::vector<std::string> &args,
-                                                          std::string *result) const
-{
-  if (storage_type != TensorStorageType::BUFFER && storage_type != TensorStorageType::IMAGE_BUFFER)
-  {
-    return absl::InvalidArgumentError("WriteLinear selector can be used only with linear "
-                                      "storage types (BUFFER/IMAGE_BUFFER)");
-  }
-  if (args.size() != 2)
-  {
-    return absl::NotFoundError("Unrecognized WriteLinear selector");
-  }
-  *result = Write(args[0], "(" + args[1] + ")");
-  return absl::OkStatus();
-}
-
-std::string TensorDescriptor::Read(DataType read_as_type, const std::string &global_address) const
-{
-  const std::string read_as = read_as_type == DataType::FLOAT16 ? "read_imageh" : "read_imagef";
-  std::string image_type;
-  if (storage_type == TensorStorageType::TEXTURE_2D ||
-      storage_type == TensorStorageType::SINGLE_TEXTURE_2D)
-  {
-    image_type = "image2d";
-  }
-  else if (storage_type == TensorStorageType::TEXTURE_3D)
-  {
-    image_type = "image3d";
-  }
-  else if (storage_type == TensorStorageType::TEXTURE_ARRAY)
-  {
-    image_type = "image2d_array";
-  }
-  switch (storage_type)
-  {
-    case TensorStorageType::BUFFER:
-      if (read_as_type == data_type)
-      {
-        return absl::StrCat("buffer[", global_address, "]");
-      }
-      else
-      {
-        const std::string conversion =
-          read_as_type == DataType::FLOAT16 ? "convert_half4" : "convert_float4";
-        return absl::StrCat(conversion, "(buffer[", global_address, "])");
-      }
-    case TensorStorageType::TEXTURE_2D:
-    case TensorStorageType::TEXTURE_3D:
-    case TensorStorageType::SINGLE_TEXTURE_2D:
-    case TensorStorageType::TEXTURE_ARRAY:
-      return absl::StrCat(read_as, "(", image_type,
-                          ", " + TextureAddressModeToString(ModeFromState()) + ", ",
-                          global_address, ")");
-    case TensorStorageType::IMAGE_BUFFER:
-      return absl::StrCat(read_as, "(image_buffer, ", global_address, ")");
-    case TensorStorageType::UNKNOWN:
-      return "";
-  }
-  return "";
-}
-
-std::string TensorDescriptor::Write(const std::string &var_name,
-                                    const std::string &global_address) const
-{
-  std::string image_type;
-  if (storage_type == TensorStorageType::TEXTURE_2D ||
-      storage_type == TensorStorageType::SINGLE_TEXTURE_2D)
-  {
-    image_type = "image2d";
-  }
-  else if (storage_type == TensorStorageType::TEXTURE_3D)
-  {
-    image_type = "image3d";
-  }
-  else if (storage_type == TensorStorageType::TEXTURE_ARRAY)
-  {
-    image_type = "image2d_array";
-  }
-  switch (storage_type)
-  {
-    case TensorStorageType::BUFFER:
-    case TensorStorageType::IMAGE_BUFFER:
-      return absl::StrCat("buffer[", global_address, "] = ", var_name, ";\n");
-    case TensorStorageType::TEXTURE_2D:
-    case TensorStorageType::TEXTURE_3D:
-    case TensorStorageType::SINGLE_TEXTURE_2D:
-    case TensorStorageType::TEXTURE_ARRAY:
-      return absl::StrCat(GetWriteImageFromDataType(data_type), "(", image_type, ", ",
-                          global_address, ", ", var_name, ");\n");
-    case TensorStorageType::UNKNOWN:
-      return "";
-  }
-  return "";
-}
-
-absl::Status TensorDescriptor::PerformGetAddressSelector(const std::vector<std::string> &args,
-                                                         std::string *result) const
-{
-  std::string xc;
-  std::string yc;
-  std::string zc;
-  std::string sc;
-  std::string bc;
-  bool parsed = ParseCoordsFromArgs(args, 1, &xc, &yc, &zc, &sc, &bc);
-  if (args.size() < 3 || !parsed)
-  {
-    return absl::NotFoundError("Unrecognized GetAddress selector");
-  }
-
-  *result = DeclareAddress(args[0], GetGlobalAddressNoDeclaration(xc, yc, zc, sc, bc));
-  return absl::OkStatus();
-}
-
-absl::Status
-TensorDescriptor::PerformGetPtrWithSliceOffsetSelector(const std::vector<std::string> &args,
-                                                       std::string *result) const
-{
-  if (storage_type != TensorStorageType::BUFFER)
-  {
-    return absl::InvalidArgumentError(
-      "GetPtrWithSliceOffset selector can be used only with BUFFER");
-  }
-  if (args.size() != 1)
-  {
-    return absl::NotFoundError(
-      absl::StrCat("GetPtrWithSliceOffset requires one argument (slice coordinate), but ",
-                   args.size(), " were passed"));
-  }
-  *result = absl::StrCat("buffer + ", args[0], " * ", GetSliceStride());
-  return absl::OkStatus();
-}
-
-absl::Status TensorDescriptor::PerformGetWHOffsetSelector(const std::vector<std::string> &args,
-                                                          std::string *result) const
-{
-  if (storage_type != TensorStorageType::BUFFER && storage_type != TensorStorageType::IMAGE_BUFFER)
-  {
-    return absl::InvalidArgumentError(
-      "GetWHOffset selector can be used only with BUFFER/IMAGE_BUFFER");
-  }
-  if (args.size() != 2)
-  {
-    return absl::NotFoundError(
-      absl::StrCat("GetWHOffset requires two arguments (X and Y coordinates), but ",
-                   args.size(), " were passed"));
-  }
-  if (HasAxis(Axis::BATCH) && !IsBatchedWidth())
-  {
-    auto it = state_vars_.find("batch_id");
-    std::string batch_id;
-    if (it == state_vars_.end())
-    {
-      return absl::NotFoundError(
-        "batch_id not found. It should be set up by the SetBatchRef() 
method"); - } - else - { - batch_id = it->second; - } - *result = absl::StrCat("((", args[1], ") * ", GetWidth(), " + (", args[0], ")) * batch + (", - batch_id, ")"); - } - else - { - *result = absl::StrCat("(", args[1], ") * ", GetWidth(), " + (", args[0], ")"); - } - return absl::OkStatus(); -} - -absl::Status TensorDescriptor::PerformGetHandleSelector(const std::vector<std::string> &args, - std::string *result) const -{ - if (!args.empty()) - { - return absl::NotFoundError( - absl::StrCat("GetHandle does not require arguments, but ", args.size(), " was passed")); - } - switch (storage_type) - { - case TensorStorageType::BUFFER: - *result = "buffer"; - return absl::OkStatus(); - case TensorStorageType::IMAGE_BUFFER: - if (access_type_ == AccessType::READ) - { - *result = "image_buffer"; - } - else - { - *result = "buffer"; - } - return absl::OkStatus(); - case TensorStorageType::TEXTURE_2D: - case TensorStorageType::SINGLE_TEXTURE_2D: - *result = "image2d"; - return absl::OkStatus(); - case TensorStorageType::TEXTURE_ARRAY: - *result = "image2d_array"; - return absl::OkStatus(); - case TensorStorageType::TEXTURE_3D: - *result = "image3d"; - return absl::OkStatus(); - case TensorStorageType::UNKNOWN: - return absl::UnavailableError("Unknown type"); - } - return absl::UnavailableError("Unknown type"); -} - -std::string TensorDescriptor::DeclareAddress(const std::string &var_name, - const std::string &address) const -{ - return absl::StrCat(StorageTypeToAddressType(), " ", var_name, " = ", address, ";"); -} - -std::string TensorDescriptor::StorageTypeToAddressType() const -{ - switch (storage_type) - { - case TensorStorageType::BUFFER: - case TensorStorageType::IMAGE_BUFFER: - return "int"; - case TensorStorageType::TEXTURE_2D: - case TensorStorageType::SINGLE_TEXTURE_2D: - return "int2"; - case TensorStorageType::TEXTURE_ARRAY: - case TensorStorageType::TEXTURE_3D: - return "int4"; - case TensorStorageType::UNKNOWN: - return ""; - } - return ""; -} - -std::string TensorDescriptor::GetGlobalAddressNoDeclarationWHS(const std::string &x, - const std::string &y, - const std::string &s) const -{ - switch (storage_type) - { - case TensorStorageType::BUFFER: - case TensorStorageType::IMAGE_BUFFER: - { - return absl::Substitute("((($2) * height + ($1)) * $3 + ($0))", x, y, s, GetWidth()); - } - case TensorStorageType::TEXTURE_2D: - return absl::Substitute("(int2)(($0), ($1) * slices + ($2))", x, y, s); - case TensorStorageType::SINGLE_TEXTURE_2D: - return absl::StrCat("(int2)(", x, ", ", y, ")"); - case TensorStorageType::TEXTURE_ARRAY: - case TensorStorageType::TEXTURE_3D: - return absl::StrCat("(int4)(", x, ", ", y, ", ", s, ", 0)"); - case TensorStorageType::UNKNOWN: - return "error"; - } - return "error"; -} - -std::string TensorDescriptor::GetGlobalAddressNoDeclarationWHSB(const std::string &x, - const std::string &y, - const std::string &s, - const std::string &b) const -{ - switch (storage_type) - { - case TensorStorageType::BUFFER: - case TensorStorageType::IMAGE_BUFFER: - return absl::Substitute("(((($3) * height + $2) * width + ($1)) * batch + ($0))", b, x, y, s); - case TensorStorageType::TEXTURE_2D: - return absl::Substitute("(int2)(($0) * batch + ($1), ($2) * slices + ($3))", x, b, y, s); - case TensorStorageType::SINGLE_TEXTURE_2D: - return absl::Substitute("(int2)(($0) * batch + ($1), ($2))", x, b, y); - case TensorStorageType::TEXTURE_ARRAY: - case TensorStorageType::TEXTURE_3D: - return absl::Substitute("(int4)(($0) * batch + ($1), ($2), ($3), 0)", x, b, y, s); - default: - throw 
std::runtime_error("Unknown storage type"); - } -} - -std::string TensorDescriptor::GetGlobalAddressNoDeclarationWHDS(const std::string &x, - const std::string &y, - const std::string &z, - const std::string &s) const -{ - switch (storage_type) - { - case TensorStorageType::BUFFER: - case TensorStorageType::IMAGE_BUFFER: - { - return absl::Substitute("(((($3) * slices + ($2)) * height + ($1)) * $4 + ($0))", x, y, s, z, - GetWidth()); - } - case TensorStorageType::TEXTURE_2D: - return absl::Substitute("(int2)(($0) * depth + ($1), ($2) * slices + ($3))", x, z, y, s); - case TensorStorageType::SINGLE_TEXTURE_2D: - return absl::Substitute("(int2)(($0) * depth + ($1), ($2))", x, z, y); - case TensorStorageType::TEXTURE_ARRAY: - case TensorStorageType::TEXTURE_3D: - return absl::Substitute("(int4)(($0), ($1), ($2) * slices + ($3), 0)", x, y, z, s); - case TensorStorageType::UNKNOWN: - return "error"; - } - return "error"; -} - -std::string TensorDescriptor::GetGlobalAddressNoDeclarationWHDSB(const std::string &x, - const std::string &y, - const std::string &z, - const std::string &s, - const std::string &b) const -{ - switch (storage_type) - { - case TensorStorageType::BUFFER: - case TensorStorageType::IMAGE_BUFFER: - return absl::Substitute("((((($4) * slices + ($3)) * height + $2) * width + ($1)) * batch + " - "($0))", - b, x, y, s, z); - case TensorStorageType::TEXTURE_2D: - return absl::Substitute("(int2)((($0) * batch + ($1)) * depth + ($2), ($3) * slices + ($4))", - x, b, z, y, s); - case TensorStorageType::SINGLE_TEXTURE_2D: - return absl::Substitute("(int2)((($0) * batch + ($1)) * depth + ($2), ($3))", x, b, z, y); - case TensorStorageType::TEXTURE_ARRAY: - case TensorStorageType::TEXTURE_3D: - return absl::Substitute("(int4)(($0) * batch + ($1), ($2), ($3) * slices + ($4), 0)", x, b, y, - z, s); - default: - throw std::runtime_error("Unknown storage type"); - } -} - -std::string TensorDescriptor::GetGlobalAddressNoDeclaration(const std::string &xc, - const std::string &yc, - const std::string &zc, - const std::string &sc, - const std::string &bc) const -{ - if (layout == Layout::HWC || (IsBatchedWidth() && layout == Layout::BHWC)) - { - return GetGlobalAddressNoDeclarationWHS(xc, yc, sc); - } - else if (layout == Layout::BHWC) - { - return GetGlobalAddressNoDeclarationWHSB(xc, yc, sc, bc); - } - else if (layout == Layout::HWDC || (IsBatchedWidth() && layout == Layout::BHWDC)) - { - return GetGlobalAddressNoDeclarationWHDS(xc, yc, zc, sc); - } - else if (layout == Layout::BHWDC) - { - return GetGlobalAddressNoDeclarationWHDSB(xc, yc, zc, sc, bc); - } - else - { - throw std::runtime_error("Unsupported layout"); - } -} - -absl::Status TensorDescriptor::GetDataTypeFromTemplateArgs(const std::string &template_arg, - DataType *result) const -{ - std::string read_type = template_arg; - if (read_type == "FLT" || read_type == "ACCUM_FLT") - { - auto it = state_vars_.find(read_type); - if (it == state_vars_.end()) - { - return absl::UnavailableError( - absl::StrCat("Read selector template argument ", read_type, " uninitialized.")); - } - else - { - read_type = it->second; - } - } - - if (read_type == "half") - { - *result = DataType::FLOAT16; - } - else if (read_type == "float") - { - *result = DataType::FLOAT32; - } - else - { - return absl::NotFoundError( - absl::StrCat("Unrecognized Read selector template argument - ", read_type)); - } - return absl::OkStatus(); -} - -bool TensorDescriptor::HasAxis(Axis axis) const -{ - if (axis == Axis::WIDTH || axis == Axis::HEIGHT || axis == Axis::CHANNELS) 
- { - return true; - } - if (axis == Axis::BATCH && (layout == Layout::BHWC || layout == Layout::BHWDC)) - { - return true; - } - if (axis == Axis::DEPTH && (layout == Layout::HWDC || layout == Layout::BHWDC)) - { - return true; - } - return false; -} - -void TensorDescriptor::SetTextureAddressMode(TextureAddressMode mode) -{ - if (mode == TextureAddressMode::ZERO) - { - state_vars_["TextureMode"] = "ZERO"; - } - else - { - state_vars_["TextureMode"] = "DONT_CARE"; - } -} - -bool TensorDescriptor::ParseCoordsFromArgs(const std::vector<std::string> &args, int offset, - std::string *xc, std::string *yc, std::string *zc, - std::string *sc, std::string *bc) const -{ - if (HasAxis(Axis::WIDTH)) - { - if ((size_t)offset >= args.size()) - return false; - *xc = args[offset++]; - } - if (HasAxis(Axis::HEIGHT)) - { - if ((size_t)offset >= args.size()) - return false; - *yc = args[offset++]; - } - if (HasAxis(Axis::DEPTH)) - { - if ((size_t)offset >= args.size()) - return false; - *zc = args[offset++]; - } - if (HasAxis(Axis::CHANNELS)) - { - if ((size_t)offset >= args.size()) - { - auto it = state_vars_.find("slice_id"); - if (it == state_vars_.end()) - { - return false; - } - else - { - *sc = it->second; - } - } - else - { - *sc = args[offset++]; - } - } - if (HasAxis(Axis::BATCH) && !IsBatchedWidth()) - { - if ((size_t)offset >= args.size()) - { - auto it = state_vars_.find("batch_id"); - if (it == state_vars_.end()) - { - return false; - } - else - { - *bc = it->second; - } - } - else - { - *bc = args[offset++]; - } - } - return true; -} - -bool TensorDescriptor::IsBatchedWidth() const -{ - auto it = state_vars_.find("BatchedWidth"); - return it != state_vars_.end() && it->second == "true"; -} - -std::string TensorDescriptor::GetWidth() const -{ - std::string div; - auto it1 = state_vars_.find("ElementsX2"); - if (it1 != state_vars_.end() && it1->second == "true") - { - div = "_div2"; - } - auto it2 = state_vars_.find("ElementsX4"); - if (it2 != state_vars_.end() && it2->second == "true") - { - div = "_div4"; - } - auto it = state_vars_.find("BatchedWidth"); - if (it != state_vars_.end() && it->second == "true") - { - return "width_batched" + div; - } - else - { - return "width" + div; - } -} - -std::string TensorDescriptor::GetSliceStride() const -{ - if (IsBatchedWidth()) - { - return GetWidth() + " * height"; - } - else - { - if (HasAxis(Axis::BATCH)) - { - return GetWidth() + " * height * batch"; - } - else - { - return GetWidth() + " * height"; - } - } -} - -TextureAddressMode TensorDescriptor::ModeFromState() const -{ - auto it = state_vars_.find("TextureMode"); - if (it != state_vars_.end()) - { - if (it->second == "ZERO") - { - return TextureAddressMode::ZERO; - } - else - { - return TextureAddressMode::DONT_CARE; - } - } - else - { - return TextureAddressMode::DONT_CARE; - } -} - -void TensorDescriptor::UploadData(const InternalTensor<HWC, DataType::FLOAT32> &src) -{ - shape = BHWDC(1, src.shape.h, src.shape.w, 1, src.shape.c); - UploadData(absl::MakeConstSpan(src.data)); -} - -void TensorDescriptor::UploadData(const InternalTensor<Linear, DataType::FLOAT32> &src) -{ - shape = BHWDC(1, 1, 1, 1, src.shape.v); - UploadData(absl::MakeConstSpan(src.data)); -} - -void TensorDescriptor::UploadData(absl::Span<const float> src) -{ - int aligned_channels = - storage_type == TensorStorageType::SINGLE_TEXTURE_2D ? 
shape.c : AlignByN(shape.c, 4); - int elements_count = shape.b * shape.w * shape.h * shape.d * aligned_channels; - data.resize(elements_count * SizeOf(data_type)); - if (data_type == DataType::FLOAT32) - { - float *gpu_data = reinterpret_cast<float *>(data.data()); - DataFromBHWDC(src, shape, *this, absl::MakeSpan(gpu_data, elements_count)); - } -} - -bool TensorDescriptor::SupportsZeroClamp(const Axis &axis) const -{ - switch (storage_type) - { - case TensorStorageType::UNKNOWN: - return false; - case TensorStorageType::BUFFER: - case TensorStorageType::IMAGE_BUFFER: - return false; - case TensorStorageType::TEXTURE_ARRAY: - case TensorStorageType::TEXTURE_2D: - case TensorStorageType::SINGLE_TEXTURE_2D: - return axis == Axis::WIDTH || axis == Axis::HEIGHT; - case TensorStorageType::TEXTURE_3D: - return axis == Axis::WIDTH || axis == Axis::HEIGHT || axis == Axis::DEPTH; - } - return false; -} - -bool TensorDescriptor::CanReadOutOfBorder(const Axis &) const -{ - switch (storage_type) - { - case TensorStorageType::UNKNOWN: - return false; - case TensorStorageType::BUFFER: - return false; - case TensorStorageType::IMAGE_BUFFER: - case TensorStorageType::TEXTURE_2D: - case TensorStorageType::TEXTURE_3D: - case TensorStorageType::SINGLE_TEXTURE_2D: - case TensorStorageType::TEXTURE_ARRAY: - return true; - } - return false; -} - -bool TensorDescriptor::IsLinear() const -{ - return storage_type == TensorStorageType::BUFFER || - storage_type == TensorStorageType::IMAGE_BUFFER; -} - -bool TensorDescriptor::ReturnsZeroForNegOneRead() const -{ - return storage_type == TensorStorageType::IMAGE_BUFFER; -} - -namespace -{ -int GetLinearIndex(const TensorDescriptor &desc, const BHWDC &shape, int b, int x, int y, int d, - int s, int sub_c) -{ - const int slices = DivideRoundUp(shape.c, 4); - switch (desc.storage_type) - { - case TensorStorageType::BUFFER: - case TensorStorageType::IMAGE_BUFFER: - case TensorStorageType::TEXTURE_ARRAY: - case TensorStorageType::TEXTURE_3D: - return ((((d * slices + s) * shape.h + y) * shape.w + x) * shape.b + b) * 4 + - sub_c; // DSHWBC4 - case TensorStorageType::TEXTURE_2D: - return ((((y * slices + s) * shape.w + x) * shape.b + b) * shape.d + d) * 4 + - sub_c; // HSWBDC4 - case TensorStorageType::SINGLE_TEXTURE_2D: - return (((y * shape.w + x) * shape.b + b) * shape.d + d) * shape.c + sub_c; // HWBDC - default: - return -1; - } - return -1; -} - -int GetChannelsAlignment(const TensorDescriptor &desc, const BHWDC &shape) -{ - return desc.storage_type == TensorStorageType::SINGLE_TEXTURE_2D ? 
shape.c : 4; -} -} // namespace - -template <typename T> -void DataFromBHWDC(absl::Span<const float> src, const BHWDC &shape, const TensorDescriptor &desc, - absl::Span<T> dst) -{ - const int channels_alignment = GetChannelsAlignment(desc, shape); - const int slices = DivideRoundUp(shape.c, 4); - for (int b = 0; b < shape.b; ++b) - { - for (int s = 0; s < slices; ++s) - { - for (int y = 0; y < shape.h; ++y) - { - for (int x = 0; x < shape.w; ++x) - { - for (int d = 0; d < shape.d; ++d) - { - for (int c = 0; c < channels_alignment; ++c) - { - float value; - if (s * 4 + c < shape.c) - { - const int cpu_index = shape.LinearIndex({b, y, x, d, s * 4 + c}); - value = src[cpu_index]; - } - else - { - value = 0.0f; - } - int gpu_index = GetLinearIndex(desc, shape, b, x, y, d, s, c); - dst[gpu_index] = value; - } - } - } - } - } - } -} - -template void DataFromBHWDC<float>(absl::Span<const float> src, const BHWDC &shape, - const TensorDescriptor &desc, absl::Span<float> dst); - -template <typename T> -void DataToBHWDC(absl::Span<const T> src, const BHWDC &shape, const TensorDescriptor &desc, - absl::Span<float> dst) -{ - const int channels_alignment = GetChannelsAlignment(desc, shape); - const int slices = DivideRoundUp(shape.c, 4); - for (int b = 0; b < shape.b; ++b) - { - for (int s = 0; s < slices; ++s) - { - for (int y = 0; y < shape.h; ++y) - { - for (int x = 0; x < shape.w; ++x) - { - for (int d = 0; d < shape.d; ++d) - { - for (int c = 0; c < channels_alignment; ++c) - { - if (s * 4 + c >= shape.c) - { - continue; - } - int cpu_index = shape.LinearIndex({b, y, x, d, s * 4 + c}); - int gpu_index = GetLinearIndex(desc, shape, b, x, y, d, s, c); - dst[cpu_index] = src[gpu_index]; - } - } - } - } - } - } -} - -template void DataToBHWDC<float>(absl::Span<const float> src, const BHWDC &shape, - const TensorDescriptor &desc, absl::Span<float> dst); - -} // namespace gpu_cl -} // namespace backend -} // namespace onert diff --git a/runtime/onert/backend/gpu_cl/open_cl/TensorType.h b/runtime/onert/backend/gpu_cl/open_cl/TensorType.h deleted file mode 100644 index 45523783f..000000000 --- a/runtime/onert/backend/gpu_cl/open_cl/TensorType.h +++ /dev/null @@ -1,188 +0,0 @@ -/* - * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright 2019 The TensorFlow Authors. All Rights Reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifndef __ONERT_BACKEND_GPU_CL_OPENCL_TENSOR_TYPE_H__ -#define __ONERT_BACKEND_GPU_CL_OPENCL_TENSOR_TYPE_H__ - -#include <cstddef> -#include <string> - -#include "absl/types/span.h" -#include "GpuObject.h" -#include "DataType.h" -#include "InternalTensor.h" -#include "Shape.h" - -namespace onert -{ -namespace backend -{ -namespace gpu_cl -{ - -enum class TextureAddressMode -{ - DONT_CARE, // translated to CLK_ADDRESS_NONE - ZERO, // translated to CLK_ADDRESS_CLAMP -}; - -std::string TextureAddressModeToString(TextureAddressMode address_mode); - -enum class TensorStorageType -{ - UNKNOWN, - BUFFER, - IMAGE_BUFFER, - TEXTURE_2D, - TEXTURE_3D, - TEXTURE_ARRAY, - SINGLE_TEXTURE_2D -}; - -struct TensorDescriptor : public GPUObjectDescriptor -{ - TensorDescriptor() = default; - TensorDescriptor(DataType dt, TensorStorageType st, Layout l) - : data_type(dt), storage_type(st), layout(l) - { - } - - TensorDescriptor(const TensorDescriptor &) = default; - TensorDescriptor &operator=(const TensorDescriptor &) = default; - TensorDescriptor(TensorDescriptor &&desc); - TensorDescriptor &operator=(TensorDescriptor &&desc); - - bool operator==(const TensorDescriptor &d) const - { - return data_type == d.data_type && storage_type == d.storage_type && layout == d.layout; - } - - bool operator!=(const TensorDescriptor &d) const { return !(*this == d); } - - absl::Status PerformSelector(const std::string &selector, const std::vector<std::string> &args, - const std::vector<std::string> &template_args, - std::string *result) const override; - - GPUResources GetGPUResources() const override; - - absl::Status CreateGPUObject(CLContext *context, GPUObjectPtr *result) const override; - void Release() override { data.clear(); } - - bool HasAxis(Axis axis) const; - void SetTextureAddressMode(TextureAddressMode mode); - - absl::Status GetLinkingContextFromWriteSelector(const std::vector<std::string> &args, - std::string *value_name, std::string *x_coord, - std::string *y_coord, std::string *s_coord) const; - - void UploadData(const InternalTensor<HWC, DataType::FLOAT32> &src); - void UploadData(const InternalTensor<Linear, DataType::FLOAT32> &src); - - bool SupportsZeroClamp(const Axis &axis) const; - bool CanReadOutOfBorder(const Axis &axis) const; - bool IsLinear() const; - - // applicable only for types that: IsLinear -> true. - // In this case for address we have 1d component - addr (int) - // If for addr == -1 this linear storage type returns FLT4(0.0), this function - // returns true, otherwise false - bool ReturnsZeroForNegOneRead() const; - - DataType data_type = DataType::UNKNOWN; - TensorStorageType storage_type = TensorStorageType::UNKNOWN; - // This field describes logical layout, actual(physical) GPU layout can be - // totally different. 
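- // (Worked example, not in the original header: for the linear storage types
- // "physically different" concretely means DSHWBC4 order, channels packed four
- // to a slice. GetLinearIndex() in TensorType.cc above computes, for the
- // BUFFER/IMAGE_BUFFER case,
- //   idx = ((((d * slices + s) * h + y) * w + x) * batch + b) * 4 + sub_c
- // so for a 1x1x1x8 tensor (slices == 2) logical channel 5 splits into
- // s == 1, sub_c == 1 and lands at idx == (0 * 2 + 1) * 4 + 1 == 5.)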
- Layout layout = Layout::UNKNOWN; // Supported layouts is HWC, BHWC, HWDC, BHWDC - - // optional - BHWDC shape; - std::vector<uint8_t> data; - -private: - absl::Status PerformReadSelector(const std::vector<std::string> &args, - const std::vector<std::string> &template_args, - std::string *result) const; - - absl::Status PerformGetAddressSelector(const std::vector<std::string> &args, - std::string *result) const; - - absl::Status PerformGetPtrWithSliceOffsetSelector(const std::vector<std::string> &args, - std::string *result) const; - - absl::Status PerformGetWHOffsetSelector(const std::vector<std::string> &args, - std::string *result) const; - - absl::Status PerformGetHandleSelector(const std::vector<std::string> &args, - std::string *result) const; - - std::string DeclareAddress(const std::string &var_name, const std::string &address) const; - - std::string StorageTypeToAddressType() const; - - absl::Status PerformWriteSelector(const std::vector<std::string> &args, - std::string *result) const; - - absl::Status PerformWriteLinearSelector(const std::vector<std::string> &args, - std::string *result) const; - - std::string Read(DataType read_as_type, const std::string &global_address) const; - std::string Write(const std::string &var_name, const std::string &global_address) const; - - bool IsBatchedWidth() const; - - std::string GetWidth() const; - std::string GetSliceStride() const; - - TextureAddressMode ModeFromState() const; - - absl::Status GetDataTypeFromTemplateArgs(const std::string &template_arg, DataType *result) const; - - std::string GetGlobalAddressNoDeclarationWHS(const std::string &x, const std::string &y, - const std::string &s) const; - std::string GetGlobalAddressNoDeclarationWHSB(const std::string &x, const std::string &y, - const std::string &s, const std::string &b) const; - std::string GetGlobalAddressNoDeclarationWHDS(const std::string &x, const std::string &y, - const std::string &z, const std::string &s) const; - std::string GetGlobalAddressNoDeclarationWHDSB(const std::string &x, const std::string &y, - const std::string &z, const std::string &s, - const std::string &b) const; - std::string GetGlobalAddressNoDeclaration(const std::string &xc, const std::string &yc, - const std::string &zc, const std::string &sc, - const std::string &bc) const; - - bool ParseCoordsFromArgs(const std::vector<std::string> &args, int offset, std::string *xc, - std::string *yc, std::string *zc, std::string *sc, - std::string *bc) const; - - void UploadData(absl::Span<const float> src); -}; - -template <typename T> -void DataFromBHWDC(absl::Span<const float> src, const BHWDC &shape, const TensorDescriptor &desc, - absl::Span<T> dst); - -template <typename T> -void DataToBHWDC(absl::Span<const T> src, const BHWDC &shape, const TensorDescriptor &desc, - absl::Span<float> dst); - -std::string ToString(TensorStorageType type); - -} // namespace gpu_cl -} // namespace backend -} // namespace onert - -#endif // __ONERT_BACKEND_GPU_CL_OPENCL_TENSOR_TYPE_H__ diff --git a/runtime/onert/backend/gpu_cl/open_cl/TensorTypeUtil.cc b/runtime/onert/backend/gpu_cl/open_cl/TensorTypeUtil.cc deleted file mode 100644 index b1f8309e4..000000000 --- a/runtime/onert/backend/gpu_cl/open_cl/TensorTypeUtil.cc +++ /dev/null @@ -1,90 +0,0 @@ -/* - * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright 2019 The TensorFlow Authors. All Rights Reserved. 
- * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "TensorTypeUtil.h" - -namespace onert -{ -namespace backend -{ -namespace gpu_cl -{ - -ObjectType ToObjectType(TensorStorageType type) -{ - switch (type) - { - case TensorStorageType::IMAGE_BUFFER: - case TensorStorageType::BUFFER: - return ObjectType::OPENCL_BUFFER; - case TensorStorageType::SINGLE_TEXTURE_2D: - case TensorStorageType::TEXTURE_2D: - case TensorStorageType::TEXTURE_ARRAY: - case TensorStorageType::TEXTURE_3D: - return ObjectType::OPENCL_TEXTURE; - default: - return ObjectType::UNKNOWN; - } -} - -DataLayout ToDataLayout(TensorStorageType type) -{ - switch (type) - { - case TensorStorageType::BUFFER: - return DataLayout::DHWC4; - case TensorStorageType::IMAGE_BUFFER: - return DataLayout::DHWC4; - case TensorStorageType::SINGLE_TEXTURE_2D: - return DataLayout::BHWC; - case TensorStorageType::TEXTURE_2D: - return DataLayout::HDWC4; - case TensorStorageType::TEXTURE_ARRAY: - return DataLayout::DHWC4; - case TensorStorageType::TEXTURE_3D: - return DataLayout::DHWC4; - default: - return DataLayout::UNKNOWN; - } -} - -TensorStorageType ToTensorStorageType(ObjectType object_type, DataLayout data_layout) -{ - switch (object_type) - { - case ObjectType::OPENCL_BUFFER: - return TensorStorageType::BUFFER; - case ObjectType::OPENCL_TEXTURE: - switch (data_layout) - { - case DataLayout::BHWC: - return TensorStorageType::SINGLE_TEXTURE_2D; - case DataLayout::DHWC4: - return TensorStorageType::TEXTURE_ARRAY; - case DataLayout::HDWC4: - return TensorStorageType::TEXTURE_2D; - default: - return TensorStorageType::UNKNOWN; - } - default: - return TensorStorageType::UNKNOWN; - } -} - -} // namespace gpu_cl -} // namespace backend -} // namespace onert diff --git a/runtime/onert/backend/gpu_cl/open_cl/TensorTypeUtil.h b/runtime/onert/backend/gpu_cl/open_cl/TensorTypeUtil.h deleted file mode 100644 index f56fc3d83..000000000 --- a/runtime/onert/backend/gpu_cl/open_cl/TensorTypeUtil.h +++ /dev/null @@ -1,41 +0,0 @@ -/* - * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright 2019 The TensorFlow Authors. All Rights Reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifndef __ONERT_BACKEND_GPU_CL_OPENCL_TENSOR_TYPE_UTIL_H__ -#define __ONERT_BACKEND_GPU_CL_OPENCL_TENSOR_TYPE_UTIL_H__ - -#include "Api.h" -#include "TensorType.h" - -namespace onert -{ -namespace backend -{ -namespace gpu_cl -{ - -ObjectType ToObjectType(TensorStorageType type); - -DataLayout ToDataLayout(TensorStorageType type); - -TensorStorageType ToTensorStorageType(ObjectType object_type, DataLayout data_layout); - -} // namespace gpu_cl -} // namespace backend -} // namespace onert - -#endif // __ONERT_BACKEND_GPU_CL_OPENCL_TENSOR_TYPE_UTIL_H__ diff --git a/runtime/onert/backend/gpu_cl/open_cl/Texture2d.cc b/runtime/onert/backend/gpu_cl/open_cl/Texture2d.cc deleted file mode 100644 index ae25e85d0..000000000 --- a/runtime/onert/backend/gpu_cl/open_cl/Texture2d.cc +++ /dev/null @@ -1,237 +0,0 @@ -/* - * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright 2019 The TensorFlow Authors. All Rights Reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "Texture2d.h" - -namespace onert -{ -namespace backend -{ -namespace gpu_cl -{ -namespace -{ - -// Creates new 4-channel 2D texture with cl_channel_type elements -absl::Status CreateTexture2D(int width, int height, DataType type, void *data, CLContext *context, - Texture2D *result) -{ - cl_mem texture; - cl_channel_type channel_type = DataTypeToChannelType(type); - RETURN_IF_ERROR( - CreateRGBAImage2D(context->context(), width, height, channel_type, data, &texture)); - *result = Texture2D(texture, width, height, channel_type); - - return absl::OkStatus(); -} -} // namespace - -Texture2DDescriptor::Texture2DDescriptor(Texture2DDescriptor &&desc) - : GPUObjectDescriptor(std::move(desc)), element_type(desc.element_type), - normalized(desc.normalized), normalized_type(desc.normalized_type), size(desc.size), - data(std::move(desc.data)) -{ -} - -Texture2DDescriptor &Texture2DDescriptor::operator=(Texture2DDescriptor &&desc) -{ - if (this != &desc) - { - std::swap(element_type, desc.element_type); - std::swap(normalized, desc.normalized); - std::swap(normalized_type, desc.normalized_type); - std::swap(size, desc.size); - data = std::move(desc.data); - GPUObjectDescriptor::operator=(std::move(desc)); - } - return *this; -} - -void Texture2DDescriptor::Release() { data.clear(); } - -GPUResources Texture2DDescriptor::GetGPUResources() const -{ - GPUResources resources; - GPUImage2DDescriptor desc; - desc.data_type = element_type; - desc.access_type = access_type_; - resources.images2d.push_back({"tex2d", desc}); - return resources; -} - -absl::Status Texture2DDescriptor::PerformSelector(const std::string &selector, - const std::vector<std::string> &args, - const std::vector<std::string> &, - std::string *result) const -{ - if (selector == "Read") - { - return PerformReadSelector(args, result); - } - else - { - return absl::NotFoundError( - absl::StrCat("Texture2DDescriptor don't have selector with name - ", selector)); - } -} - -absl::Status 
Texture2DDescriptor::PerformReadSelector(const std::vector<std::string> &args,
-                                         std::string *result) const
-{
-  if (args.size() != 2)
-  {
-    return absl::NotFoundError(
-      absl::StrCat("Texture2DDescriptor Read requires two arguments, but ", args.size(),
-                   " were passed"));
-  }
-  std::string read;
-  switch (element_type)
-  {
-    case DataType::FLOAT32:
-      read = "read_imagef";
-      break;
-    case DataType::FLOAT16:
-      read = "read_imageh";
-      break;
-    case DataType::INT8:
-    case DataType::INT16:
-    case DataType::INT32:
-      if (normalized)
-      {
-        read = normalized_type == DataType::FLOAT16 ? "read_imageh" : "read_imagef";
-      }
-      else
-      {
-        read = "read_imagei";
-      }
-      break;
-    case DataType::UINT8:
-    case DataType::UINT16:
-    case DataType::UINT32:
-      if (normalized)
-      {
-        read = normalized_type == DataType::FLOAT16 ? "read_imageh" : "read_imagef";
-      }
-      else
-      {
-        read = "read_imageui";
-      }
-      break;
-    default:
-      read = "unknown_type";
-      break;
-  }
-  *result = absl::StrCat(read, "(tex2d, smp_none, (int2)(", args[0], ", " + args[1] + "))");
-  return absl::OkStatus();
-}
-
-absl::Status Texture2DDescriptor::CreateGPUObject(CLContext *context, GPUObjectPtr *result) const
-{
-  Texture2D gpu_texture;
-  RETURN_IF_ERROR(gpu_texture.CreateFromTexture2DDescriptor(*this, context));
-  *result = absl::make_unique<Texture2D>(std::move(gpu_texture));
-  return absl::OkStatus();
-}
-
-Texture2D::Texture2D(cl_mem texture, int width, int height, cl_channel_type type)
-  : texture_(texture), width_(width), height_(height), channel_type_(type)
-{
-}
-
-Texture2D::Texture2D(Texture2D &&texture)
-  : texture_(texture.texture_), width_(texture.width_), height_(texture.height_),
-    channel_type_(texture.channel_type_)
-{
-  texture.texture_ = nullptr;
-  texture.width_ = 0;
-  texture.height_ = 0;
-}
-
-Texture2D &Texture2D::operator=(Texture2D &&texture)
-{
-  if (this != &texture)
-  {
-    Release();
-    std::swap(channel_type_, texture.channel_type_);
-    std::swap(width_, texture.width_);
-    std::swap(height_, texture.height_);
-    std::swap(texture_, texture.texture_);
-  }
-  return *this;
-}
-
-void Texture2D::Release()
-{
-  if (texture_)
-  {
-    clReleaseMemObject(texture_);
-    texture_ = nullptr;
-    width_ = 0;
-    height_ = 0;
-  }
-}
-
-absl::Status Texture2D::GetGPUResources(const GPUObjectDescriptor *obj_ptr,
-                                        GPUResourcesWithValue *resources) const
-{
-  const auto *texture_desc = dynamic_cast<const Texture2DDescriptor *>(obj_ptr);
-  if (!texture_desc)
-  {
-    return absl::InvalidArgumentError("Expected Texture2DDescriptor on input.");
-  }
-
-  resources->images2d.push_back({"tex2d", texture_});
-  return absl::OkStatus();
-}
-
-absl::Status Texture2D::CreateFromTexture2DDescriptor(const Texture2DDescriptor &desc,
-                                                      CLContext *context)
-{
-  width_ = desc.size.x;
-  height_ = desc.size.y;
-  channel_type_ = DataTypeToChannelType(desc.element_type, desc.normalized);
-  uint8_t *data_ptr = desc.data.empty() ? 
nullptr : const_cast<unsigned char *>(desc.data.data()); - return CreateRGBAImage2D(context->context(), desc.size.x, desc.size.y, channel_type_, data_ptr, - &texture_); -} - -// Creates new 4-channel 2D texture with f32 elements -absl::Status CreateTexture2DRGBA32F(int width, int height, CLContext *context, Texture2D *result) -{ - return CreateTexture2D(width, height, DataType::FLOAT32, nullptr, context, result); -} - -// Creates new 4-channel 2D texture with f16 elements -absl::Status CreateTexture2DRGBA16F(int width, int height, CLContext *context, Texture2D *result) -{ - return CreateTexture2D(width, height, DataType::FLOAT16, nullptr, context, result); -} - -absl::Status CreateTexture2DRGBA(DataType type, int width, int height, CLContext *context, - Texture2D *result) -{ - return CreateTexture2D(width, height, type, nullptr, context, result); -} - -absl::Status CreateTexture2DRGBA(DataType type, int width, int height, void *data, - CLContext *context, Texture2D *result) -{ - return CreateTexture2D(width, height, type, data, context, result); -} - -} // namespace gpu_cl -} // namespace backend -} // namespace onert diff --git a/runtime/onert/backend/gpu_cl/open_cl/Texture2d.h b/runtime/onert/backend/gpu_cl/open_cl/Texture2d.h deleted file mode 100644 index 264507079..000000000 --- a/runtime/onert/backend/gpu_cl/open_cl/Texture2d.h +++ /dev/null @@ -1,160 +0,0 @@ -/* - * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright 2019 The TensorFlow Authors. All Rights Reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef __ONERT_BACKEND_GPU_CL_OPENCL_TEXTURE2D_H__ -#define __ONERT_BACKEND_GPU_CL_OPENCL_TEXTURE2D_H__ - -#include "absl/strings/str_cat.h" -#include "absl/types/span.h" -#include "ClCommandQueue.h" -#include "ClContext.h" -#include "GpuObject.h" -#include "OpenclWrapper.h" -#include "TensorType.h" -#include "Util.h" -#include "DataType.h" -#include "Status.h" - -namespace onert -{ -namespace backend -{ -namespace gpu_cl -{ - -struct Texture2DDescriptor : public GPUObjectDescriptor -{ - DataType element_type; - bool normalized = false; // used with INT data types, if normalized, we read - // in kernel float data. 
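- // (Clarifying note: when normalized is true, integer textures are sampled
- // with read_imagef/read_imageh, so kernels see float values rather than raw
- // integers; when it is false they are sampled with read_imagei/read_imageui.
- // normalized_type below picks the float flavor - see PerformReadSelector in
- // Texture2d.cc above.)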
- DataType normalized_type; // can be FLOAT32 or FLOAT16, using with normalized - // = true - - // optional - int2 size = int2(0, 0); - std::vector<uint8_t> data; - - Texture2DDescriptor() = default; - Texture2DDescriptor(const Texture2DDescriptor &) = default; - Texture2DDescriptor &operator=(const Texture2DDescriptor &) = default; - Texture2DDescriptor(Texture2DDescriptor &&desc); - Texture2DDescriptor &operator=(Texture2DDescriptor &&desc); - - absl::Status PerformSelector(const std::string &selector, const std::vector<std::string> &args, - const std::vector<std::string> &template_args, - std::string *result) const override; - - GPUResources GetGPUResources() const override; - absl::Status PerformReadSelector(const std::vector<std::string> &args, std::string *result) const; - - absl::Status CreateGPUObject(CLContext *context, GPUObjectPtr *result) const override; - void Release() override; -}; - -// Texture2D represent formatted GPU data storage. -// Texture2D is moveable but not copyable. -class Texture2D : public GPUObject -{ -public: - Texture2D() {} // just for using Texture2D as a class members - Texture2D(cl_mem texture, int width, int height, cl_channel_type type); - - // Move only - Texture2D(Texture2D &&texture); - Texture2D &operator=(Texture2D &&texture); - Texture2D(const Texture2D &) = delete; - Texture2D &operator=(const Texture2D &) = delete; - - virtual ~Texture2D() { Release(); } - - cl_mem GetMemoryPtr() const { return texture_; } - - // Writes data to a texture. Data should point to a region that - // has exact width * height * sizeof(pixel) bytes. - template <typename T> absl::Status WriteData(CLCommandQueue *queue, const absl::Span<T> data); - - // Reads data from Texture2D into CPU memory. - template <typename T> absl::Status ReadData(CLCommandQueue *queue, std::vector<T> *result) const; - - absl::Status GetGPUResources(const GPUObjectDescriptor *obj_ptr, - GPUResourcesWithValue *resources) const override; - - absl::Status CreateFromTexture2DDescriptor(const Texture2DDescriptor &desc, CLContext *context); - -private: - void Release(); - - cl_mem texture_ = nullptr; - int width_; - int height_; - cl_channel_type channel_type_; -}; - -using Texture2DPtr = std::shared_ptr<Texture2D>; - -// Creates new 4-channel 2D texture with f32 elements -absl::Status CreateTexture2DRGBA32F(int width, int height, CLContext *context, Texture2D *result); - -// Creates new 4-channel 2D texture with f16 elements -absl::Status CreateTexture2DRGBA16F(int width, int height, CLContext *context, Texture2D *result); - -absl::Status CreateTexture2DRGBA(DataType type, int width, int height, CLContext *context, - Texture2D *result); - -absl::Status CreateTexture2DRGBA(DataType type, int width, int height, void *data, - CLContext *context, Texture2D *result); - -template <typename T> -absl::Status Texture2D::WriteData(CLCommandQueue *queue, const absl::Span<T> data) -{ - const int element_size = ChannelTypeToSizeInBytes(channel_type_); - if (sizeof(T) % element_size != 0) - { - return absl::InvalidArgumentError( - "Template type T has not suitable element type for created texture."); - } - if (4 * width_ * height_ * element_size != data.size() * sizeof(T)) - { - return absl::InvalidArgumentError( - "absl::Span<T> data size is different from texture allocated size."); - } - - RETURN_IF_ERROR(queue->EnqueueWriteImage(texture_, int3(width_, height_, 1), data.data())); - - return absl::OkStatus(); -} - -template <typename T> -absl::Status Texture2D::ReadData(CLCommandQueue *queue, std::vector<T> 
*result) const -{ - const int element_size = ChannelTypeToSizeInBytes(channel_type_); - if (sizeof(T) != element_size) - { - return absl::InvalidArgumentError("Pixel format is different."); - } - - const int elements_count = width_ * height_ * 4; - result->resize(elements_count); - - return queue->EnqueueReadImage(texture_, int3(width_, height_, 1), result->data()); -} - -} // namespace gpu_cl -} // namespace backend -} // namespace onert - -#endif // __ONERT_BACKEND_GPU_CL_OPENCL_TEXTURE2D_H__ diff --git a/runtime/onert/backend/gpu_cl/open_cl/Types.h b/runtime/onert/backend/gpu_cl/open_cl/Types.h deleted file mode 100644 index f3cf33450..000000000 --- a/runtime/onert/backend/gpu_cl/open_cl/Types.h +++ /dev/null @@ -1,183 +0,0 @@ -/* - * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright 2019 The TensorFlow Authors. All Rights Reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef __ONERT_BACKEND_GPU_CL_OPENCL_TYPES_H__ -#define __ONERT_BACKEND_GPU_CL_OPENCL_TYPES_H__ - -#include <array> -#include <cstddef> -#include <cstdint> - -namespace onert -{ -namespace backend -{ -namespace gpu_cl -{ - -// TODO(akulik): make these types Google-style compliant. - -template <typename T> struct alignas(sizeof(T)) Vec4 -{ - union { - struct - { - T x, y, z, w; - }; - std::array<T, 4> data_; - }; - - Vec4() : Vec4(T(0.0f)) {} - - template <typename S> Vec4(S x_, S y_, S z_, S w_) : x(x_), y(y_), z(z_), w(w_) {} - explicit Vec4(T v) : x(v), y(v), z(v), w(v) {} - - template <typename S> explicit Vec4(S v) : x(v), y(v), z(v), w(v) {} - - Vec4(const Vec4 &f) : x(f.x), y(f.y), z(f.z), w(f.w) {} - - template <typename S> Vec4(const Vec4<S> &f) : x(f.x), y(f.y), z(f.z), w(f.w) {} - - Vec4 &operator=(const Vec4 &other) - { - x = other.x; - y = other.y; - z = other.z; - w = other.w; - return *this; - } - - static constexpr int size() { return 4; } - - T &operator[](size_t n) { return data_[n]; } - T operator[](size_t n) const { return data_[n]; } - - bool operator==(const Vec4 &value) const - { - return data_[0] == value[0] && data_[1] == value[1] && data_[2] == value[2] && - data_[3] == value[3]; - } - bool operator!=(const Vec4 &value) const { return !(this->operator==(value)); } -}; - -template <typename T> struct alignas(sizeof(T)) Vec3 -{ - union { - struct - { - T x, y, z; - }; - std::array<T, 3> data_; - }; - - Vec3() : Vec3(T(0.0f)) {} - - template <typename S> constexpr Vec3(S x_, S y_, S z_) : x(x_), y(y_), z(z_) {} - explicit Vec3(T v) : x(v), y(v), z(v) {} - - template <typename S> explicit Vec3(S v) : x(v), y(v), z(v) {} - - Vec3(const Vec3 &f) : x(f.x), y(f.y), z(f.z) {} - - template <typename S> Vec3(const Vec3<S> &f) : x(f.x), y(f.y), z(f.z) {} - - Vec3 &operator=(const Vec3 &other) - { - x = other.x; - y = other.y; - z = other.z; - return *this; - } - - static constexpr int size() { return 3; } - - T &operator[](size_t n) { return data_[n]; } - T operator[](size_t n) const { return data_[n]; } - bool operator==(const Vec3 &value) const - 
{ - return data_[0] == value[0] && data_[1] == value[1] && data_[2] == value[2]; - } - bool operator!=(const Vec3 &value) const { return !(this->operator==(value)); } -}; - -template <typename T> struct alignas(sizeof(T)) Vec2 -{ - union { - struct - { - T x, y; - }; - std::array<T, 2> data_; - }; - - Vec2() : Vec2(T(0.0f)) {} - - template <typename S> Vec2(S x_, S y_) : x(x_), y(y_) {} - explicit Vec2(T v) : x(v), y(v) {} - - template <typename S> explicit Vec2(S v) : x(v), y(v) {} - - Vec2(const Vec2 &f) : x(f.x), y(f.y) {} - - template <typename S> Vec2(const Vec2<S> &f) : x(f.x), y(f.y) {} - - Vec2 &operator=(const Vec2 &other) - { - x = other.x; - y = other.y; - return *this; - } - - bool operator==(const Vec2 &value) const { return data_[0] == value[0] && data_[1] == value[1]; } - - bool operator!=(const Vec2 &value) const { return !(this->operator==(value)); } - - static constexpr int size() { return 2; } - - T &operator[](size_t n) { return data_[n]; } - T operator[](size_t n) const { return data_[n]; } -}; - -using float2 = Vec2<float>; -using byte2 = Vec2<int8_t>; -using ubyte2 = Vec2<uint8_t>; -using short2 = Vec2<int16_t>; -using ushort2 = Vec2<uint16_t>; -using int2 = Vec2<int32_t>; -using uint2 = Vec2<uint32_t>; - -using float3 = Vec3<float>; -using byte3 = Vec3<int8_t>; -using ubyte3 = Vec3<uint8_t>; -using short3 = Vec3<int16_t>; -using ushort3 = Vec3<uint16_t>; -using int3 = Vec3<int32_t>; -using uint3 = Vec3<uint32_t>; - -using float4 = Vec4<float>; -using byte4 = Vec4<int8_t>; -using ubyte4 = Vec4<uint8_t>; -using short4 = Vec4<int16_t>; -using ushort4 = Vec4<uint16_t>; -using int4 = Vec4<int32_t>; -using uint4 = Vec4<uint32_t>; - -} // namespace gpu_cl -} // namespace backend -} // namespace onert - -#endif // __ONERT_BACKEND_GPU_CL_OPENCL_TYPES_H__ diff --git a/runtime/onert/backend/gpu_cl/open_cl/Util.cc b/runtime/onert/backend/gpu_cl/open_cl/Util.cc deleted file mode 100644 index 9f5a8388b..000000000 --- a/runtime/onert/backend/gpu_cl/open_cl/Util.cc +++ /dev/null @@ -1,264 +0,0 @@ -/* - * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright 2019 The TensorFlow Authors. All Rights Reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "Util.h" - -#include "absl/strings/str_cat.h" -#include "absl/strings/substitute.h" -#include "Status.h" -namespace onert -{ -namespace backend -{ -namespace gpu_cl -{ - -std::string CLErrorCodeToString(cl_int error_code) -{ - switch (error_code) - { - case CL_SUCCESS: - return "Success"; - case CL_DEVICE_NOT_FOUND: - return "Device not found"; - case CL_DEVICE_NOT_AVAILABLE: - return "Device not available"; - case CL_COMPILER_NOT_AVAILABLE: - return "Compiler not available"; - case CL_MEM_OBJECT_ALLOCATION_FAILURE: - return "Memory object allocation failure"; - case CL_OUT_OF_RESOURCES: - return "Out of resources"; - case CL_OUT_OF_HOST_MEMORY: - return "Out of host memory"; - case CL_PROFILING_INFO_NOT_AVAILABLE: - return "Profiling information not available"; - case CL_MEM_COPY_OVERLAP: - return "Memory copy overlap"; - case CL_IMAGE_FORMAT_MISMATCH: - return "Image format mismatch"; - case CL_IMAGE_FORMAT_NOT_SUPPORTED: - return "Image format not supported"; - case CL_BUILD_PROGRAM_FAILURE: - return "Build program failure"; - case CL_MAP_FAILURE: - return "Mapping failure"; - case CL_MISALIGNED_SUB_BUFFER_OFFSET: - return "Misaligned sub-buffer offset"; - case CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST: - return "Execution status error for events in wait list"; - case CL_COMPILE_PROGRAM_FAILURE: - return "Compile program failure"; - case CL_LINKER_NOT_AVAILABLE: - return "Linker not available"; - case CL_LINK_PROGRAM_FAILURE: - return "Link program failure"; - case CL_DEVICE_PARTITION_FAILED: - return "Device partition failed"; - case CL_KERNEL_ARG_INFO_NOT_AVAILABLE: - return "Kernel argument information not available"; - - case CL_INVALID_VALUE: - return "Invalid value"; - case CL_INVALID_DEVICE_TYPE: - return "Invalid device type"; - case CL_INVALID_PLATFORM: - return "Invalid platform"; - case CL_INVALID_DEVICE: - return "Invalid device"; - case CL_INVALID_CONTEXT: - return "Invalid context"; - case CL_INVALID_QUEUE_PROPERTIES: - return "Invalid queue properties"; - case CL_INVALID_COMMAND_QUEUE: - return "Invalid command queue"; - case CL_INVALID_HOST_PTR: - return "Invalid host pointer"; - case CL_INVALID_MEM_OBJECT: - return "Invalid memory object"; - case CL_INVALID_IMAGE_FORMAT_DESCRIPTOR: - return "Invalid image format descriptor"; - case CL_INVALID_IMAGE_SIZE: - return "Invalid image size"; - case CL_INVALID_SAMPLER: - return "Invalid sampler"; - case CL_INVALID_BINARY: - return "Invalid binary"; - case CL_INVALID_BUILD_OPTIONS: - return "Invalid build options"; - case CL_INVALID_PROGRAM: - return "Invalid program"; - case CL_INVALID_PROGRAM_EXECUTABLE: - return "Invalid program executable"; - case CL_INVALID_KERNEL_NAME: - return "Invalid kernel name"; - case CL_INVALID_KERNEL_DEFINITION: - return "Invalid kernel definition"; - case CL_INVALID_KERNEL: - return "Invalid kernel"; - case CL_INVALID_ARG_INDEX: - return "Invalid argument index"; - case CL_INVALID_ARG_VALUE: - return "Invalid argument value"; - case CL_INVALID_ARG_SIZE: - return "Invalid argument size"; - case CL_INVALID_KERNEL_ARGS: - return "Invalid kernel arguments"; - case CL_INVALID_WORK_DIMENSION: - return "Invalid work dimension"; - case CL_INVALID_WORK_GROUP_SIZE: - return "Invalid work group size"; - case CL_INVALID_WORK_ITEM_SIZE: - return "Invalid work item size"; - case CL_INVALID_GLOBAL_OFFSET: - return "Invalid global offset"; - case CL_INVALID_EVENT_WAIT_LIST: - return "Invalid event wait list"; - case CL_INVALID_EVENT: - return "Invalid event"; - case CL_INVALID_OPERATION: - 
return "Invalid operation";
- case CL_INVALID_GL_OBJECT:
- return "Invalid GL object";
- case CL_INVALID_BUFFER_SIZE:
- return "Invalid buffer size";
- case CL_INVALID_MIP_LEVEL:
- return "Invalid mip-level";
- case CL_INVALID_GLOBAL_WORK_SIZE:
- return "Invalid global work size";
- case CL_INVALID_PROPERTY:
- return "Invalid property";
- case CL_INVALID_IMAGE_DESCRIPTOR:
- return "Invalid image descriptor";
- case CL_INVALID_COMPILER_OPTIONS:
- return "Invalid compiler options";
- case CL_INVALID_LINKER_OPTIONS:
- return "Invalid linker options";
- case CL_INVALID_DEVICE_PARTITION_COUNT:
- return "Invalid device partition count";
- case CL_INVALID_PIPE_SIZE:
- return "Invalid pipe size";
- case CL_INVALID_DEVICE_QUEUE:
- return "Invalid device queue";
- case CL_INVALID_GL_SHAREGROUP_REFERENCE_KHR:
- return "Invalid GL sharegroup reference KHR";
-
- default:
- return "Unknown OpenCL error";
- }
-}
-
-int ChannelTypeToSizeInBytes(cl_channel_type type)
-{
- switch (type)
- {
- case CL_FLOAT:
- return 4;
- default:
- return 0;
- }
-}
-
-absl::Status CreateCLBuffer(cl_context context, int size_in_bytes, bool read_only, void *data,
- cl_mem *result)
-{
- cl_mem_flags flags = read_only ? CL_MEM_READ_ONLY : CL_MEM_READ_WRITE;
- if (data)
- {
- flags |= CL_MEM_COPY_HOST_PTR;
- }
- cl_int error_code;
- *result = clCreateBuffer(context, flags, size_in_bytes, data, &error_code);
- if (!*result)
- {
- return absl::UnknownError(absl::StrCat("Failed to allocate device memory (clCreateBuffer): ",
- CLErrorCodeToString(error_code)));
- }
- return absl::OkStatus();
-}
-
-cl_channel_type DataTypeToChannelType(DataType type, bool normalized)
-{
- switch (type)
- {
- case DataType::FLOAT32:
- return CL_FLOAT;
- case DataType::INT8:
- return normalized ? CL_SNORM_INT8 : CL_SIGNED_INT8;
- case DataType::UINT8:
- return normalized ? CL_UNORM_INT8 : CL_UNSIGNED_INT8;
- case DataType::INT16:
- return normalized ? CL_SNORM_INT16 : CL_SIGNED_INT16;
- case DataType::UINT16:
- return normalized ?
CL_UNORM_INT16 : CL_UNSIGNED_INT16;
- case DataType::INT32:
- return CL_SIGNED_INT32;
- case DataType::UINT32:
- return CL_UNSIGNED_INT32;
- default:
- return CL_FLOAT;
- }
-}
-
-absl::Status CreateRGBAImage2D(cl_context context, int width, int height,
- cl_channel_type channel_type, void *data, cl_mem *result)
-{
- cl_image_desc desc;
- desc.image_type = CL_MEM_OBJECT_IMAGE2D;
- desc.image_width = width;
- desc.image_height = height;
- desc.image_depth = 0;
- desc.image_row_pitch = 0;
- desc.image_slice_pitch = 0;
- desc.num_mip_levels = 0;
- desc.num_samples = 0;
- desc.buffer = nullptr;
-
- cl_image_format format;
- format.image_channel_order = CL_RGBA;
- format.image_channel_data_type = channel_type;
-
- cl_mem_flags flags = CL_MEM_READ_WRITE;
- if (data)
- {
- flags |= CL_MEM_COPY_HOST_PTR;
- }
-
- cl_int error_code;
- *result = CreateImage2DLegacy(context, flags, &format, &desc, data, &error_code);
- if (error_code != CL_SUCCESS)
- {
- return absl::UnknownError(absl::StrCat("Failed to create 2D texture (clCreateImage): ",
- CLErrorCodeToString(error_code)));
- }
- return absl::OkStatus();
-}
-
-std::string GetXStrideCorrected(const std::string &src_x, const std::string &batch_size,
- const std::string &stride_x, const std::string &padding_x)
-{
- // TODO(sorokin) check perf and optimize with floor() if needed
- // int p0 = src_x / batch_size;
- // int b0 = src_x % batch_size;
- // return p0 * stride_x * batch_size + b0 + padding_x;
- return absl::Substitute("((($0) / $1) * $2 * $1 + (($0) % $1) + $3)", src_x, batch_size, stride_x,
- padding_x);
-}
-
-} // namespace gpu_cl
-} // namespace backend
-} // namespace onert
diff --git a/runtime/onert/backend/gpu_cl/open_cl/Util.h b/runtime/onert/backend/gpu_cl/open_cl/Util.h
deleted file mode 100644
index 996c564f4..000000000
--- a/runtime/onert/backend/gpu_cl/open_cl/Util.h
+++ /dev/null
@@ -1,278 +0,0 @@
-/*
- * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright 2019 The TensorFlow Authors. All Rights Reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __ONERT_BACKEND_GPU_CL_OPENCL_UTIL_H__
-#define __ONERT_BACKEND_GPU_CL_OPENCL_UTIL_H__
-
-#include <string>
-
-#include "absl/types/span.h"
-#include "OpenclWrapper.h"
-#include "DataType.h"
-#include "InternalTensor.h"
-#include "Status.h"
-#include "Types.h"
-
-namespace onert
-{
-namespace backend
-{
-namespace gpu_cl
-{
-// Calculates correct X coordinate when stride != 1 and batch != 1 for layouts
-// with B after W (for example HWBC4) and WB stored in one axis of GPU
-// resources.
-std::string GetXStrideCorrected(const std::string &src_x, const std::string &batch_size,
- const std::string &stride_x, const std::string &padding_x);
-
-// @param n must be non-negative
-// @param divisor must be greater than zero
-template <typename T, typename N> T DivideRoundUp(T n, N divisor)
-{
- const T div = static_cast<T>(divisor);
- const T q = n / div;
- return n % div == 0 ?
q : q + 1; -} - -template <> inline uint3 DivideRoundUp(uint3 n, uint3 divisor) -{ - return uint3(DivideRoundUp(n.x, divisor.x), DivideRoundUp(n.y, divisor.y), - DivideRoundUp(n.z, divisor.z)); -} - -// @param number or its components must be greater than zero -// @param n must be greater than zero -template <typename T, typename N> T AlignByN(T number, N n) { return DivideRoundUp(number, n) * n; } - -std::string CLErrorCodeToString(cl_int error_code); - -int ChannelTypeToSizeInBytes(cl_channel_type type); - -template <DataType S, typename T> -void CopyLinearFLT4(const InternalTensor<Linear, S> &src, absl::Span<T> dst) -{ - const int dst_depth = dst.size(); - for (int d = 0; d < dst_depth; ++d) - { - T val; - for (int i = 0; i < 4; ++i) - { - const int dst_ch = d * 4 + i; - val[i] = dst_ch >= src.shape.v ? 0.0f : src.data[dst_ch]; - } - dst[d] = val; - } -} - -absl::Status CreateCLBuffer(cl_context context, int size_in_bytes, bool read_only, void *data, - cl_mem *result); - -cl_channel_type DataTypeToChannelType(DataType type, bool normalized = false); -absl::Status CreateRGBAImage2D(cl_context context, int width, int height, - cl_channel_type channel_type, void *data, cl_mem *result); - -template <DataType S, typename T> -void RearrangeWeightsToOHWIOGroupI4O4(const InternalTensor<OHWI, S> &weights, int out_group_size, - absl::Span<T> dst) -{ - const int dst_slices = DivideRoundUp(weights.shape.o, 4); - const int src_slices = DivideRoundUp(weights.shape.i, 4); - const int dst_groups = DivideRoundUp(dst_slices, out_group_size); - - int counter = 0; - for (int d = 0; d < dst_groups; ++d) - { - for (int y = 0; y < weights.shape.h; ++y) - { - for (int x = 0; x < weights.shape.w; ++x) - { - for (int s = 0; s < src_slices; ++s) - { - for (int d_group = 0; d_group < out_group_size; ++d_group) - { - for (int j = 0; j < 4; ++j) - { - T filter; - for (int i = 0; i < 4; ++i) - { - const int s_ch = s * 4 + j; - const int d_ch = (d * out_group_size + d_group) * 4 + i; - if (s_ch < weights.shape.i && d_ch < weights.shape.o) - { - const int f_index = weights.shape.LinearIndex({d_ch, y, x, s_ch}); - filter[i] = weights.data[f_index]; - } - else - { - filter[i] = 0.0f; - } - } - dst[counter++] = filter; - } - } - } - } - } - } -} - -template <DataType S, typename T> -void RearrangeWeightsToODHWIOGroupI4O4(const InternalTensor<OHWDI, S> &weights, int out_group_size, - absl::Span<T> dst) -{ - const int dst_slices = DivideRoundUp(weights.shape.o, 4); - const int src_slices = DivideRoundUp(weights.shape.i, 4); - const int dst_groups = DivideRoundUp(dst_slices, out_group_size); - - int counter = 0; - for (int d = 0; d < dst_groups; ++d) - { - for (int z = 0; z < weights.shape.d; ++z) - { - for (int y = 0; y < weights.shape.h; ++y) - { - for (int x = 0; x < weights.shape.w; ++x) - { - for (int s = 0; s < src_slices; ++s) - { - for (int d_group = 0; d_group < out_group_size; ++d_group) - { - for (int j = 0; j < 4; ++j) - { - T filter; - for (int i = 0; i < 4; ++i) - { - const int s_ch = s * 4 + j; - const int d_ch = (d * out_group_size + d_group) * 4 + i; - if (s_ch < weights.shape.i && d_ch < weights.shape.o) - { - const int f_index = weights.shape.LinearIndex({d_ch, y, x, z, s_ch}); - filter[i] = weights.data[f_index]; - } - else - { - filter[i] = 0.0f; - } - } - dst[counter++] = filter; - } - } - } - } - } - } - } -} - -template <DataType S, typename T> -void RearrangeWeightsToI4HWIOOGroupO4(const InternalTensor<OHWI, S> &weights, int out_group_size, - absl::Span<T> dst) -{ - const int dst_slices = 
DivideRoundUp(weights.shape.o, 4); - const int src_slices = DivideRoundUp(weights.shape.i, 4); - const int dst_groups = DivideRoundUp(dst_slices, out_group_size); - - int counter = 0; - for (int j = 0; j < 4; ++j) - { - for (int y = 0; y < weights.shape.h; ++y) - { - for (int x = 0; x < weights.shape.w; ++x) - { - for (int s = 0; s < src_slices; ++s) - { - for (int d = 0; d < dst_groups; ++d) - { - for (int d_group = 0; d_group < out_group_size; ++d_group) - { - T filter; - for (int i = 0; i < 4; ++i) - { - const int s_ch = s * 4 + j; - const int d_ch = (d * out_group_size + d_group) * 4 + i; - if (s_ch < weights.shape.i && d_ch < weights.shape.o) - { - const int f_index = weights.shape.LinearIndex({d_ch, y, x, s_ch}); - filter[i] = weights.data[f_index]; - } - else - { - filter[i] = 0.0f; - } - } - dst[counter++] = filter; - } - } - } - } - } - } -} - -template <DataType S, typename T> -void RearrangeWeightsToI4DHWIOOGroupO4(const InternalTensor<OHWDI, S> &weights, int out_group_size, - absl::Span<T> dst) -{ - const int dst_slices = DivideRoundUp(weights.shape.o, 4); - const int src_slices = DivideRoundUp(weights.shape.i, 4); - const int dst_groups = DivideRoundUp(dst_slices, out_group_size); - - int counter = 0; - for (int j = 0; j < 4; ++j) - { - for (int z = 0; z < weights.shape.d; ++z) - { - for (int y = 0; y < weights.shape.h; ++y) - { - for (int x = 0; x < weights.shape.w; ++x) - { - for (int s = 0; s < src_slices; ++s) - { - for (int d = 0; d < dst_groups; ++d) - { - for (int d_group = 0; d_group < out_group_size; ++d_group) - { - T filter; - for (int i = 0; i < 4; ++i) - { - const int s_ch = s * 4 + j; - const int d_ch = (d * out_group_size + d_group) * 4 + i; - if (s_ch < weights.shape.i && d_ch < weights.shape.o) - { - const int f_index = weights.shape.LinearIndex({d_ch, y, x, z, s_ch}); - filter[i] = weights.data[f_index]; - } - else - { - filter[i] = 0.0f; - } - } - dst[counter++] = filter; - } - } - } - } - } - } - } -} - -} // namespace gpu_cl -} // namespace backend -} // namespace onert - -#endif // __ONERT_BACKEND_GPU_CL_OPENCL_UTIL_H__ diff --git a/runtime/onert/backend/gpu_cl/open_cl/WinogradUtil.cc b/runtime/onert/backend/gpu_cl/open_cl/WinogradUtil.cc deleted file mode 100644 index 5f1103ad9..000000000 --- a/runtime/onert/backend/gpu_cl/open_cl/WinogradUtil.cc +++ /dev/null @@ -1,178 +0,0 @@ -/* - * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright 2019 The TensorFlow Authors. All Rights Reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */
-
-#include "open_cl/WinogradUtil.h"
-
-#include <cmath>
-#include <vector>
-
-#include "open_cl/DataType.h"
-#include "open_cl/Shape.h"
-#include "open_cl/Tensor.h"
-
-namespace onert
-{
-namespace backend
-{
-namespace
-{
-// Matrices for Winograd transformations were computed with the method described
-// here https://openreview.net/pdf?id=H1ZaRZVKg
-std::vector<float> GetTransposedMatrixForWinograd(int width, int height)
-{
- const float kDelta = std::sqrt(2.0f) / 2.0f;
- std::vector<float> px(width);
-
- px[0] = 0.0f;
- const int points_count = (width - 1) / 2;
- for (int i = 0; i < points_count; ++i)
- {
- px[i * 2 + 1] = kDelta * (i + 1.0f);
- px[i * 2 + 2] = -kDelta * (i + 1.0f);
- }
- px[width - 1] = 1.0f;
-
- std::vector<float> py(width, 1.0f);
- py[width - 1] = 0.0f;
-
- std::vector<float> result(height * width);
- for (int y = 0; y < width; ++y)
- {
- for (int x = 0; x < height; ++x)
- {
- result[x * width + y] = std::pow(px[y], 1.0f * x) * std::pow(py[y], (height - 1.0f) - x);
- }
- }
- return result;
-}
-
-std::vector<float> GetInversedMatrixForWinograd(int rank)
-{
- auto matrix = GetTransposedMatrixForWinograd(rank, rank);
- std::vector<float> inverted(rank * rank, 0.0f);
- for (int i = 0; i < rank; ++i)
- {
- inverted[i * rank + i] = 1.0f;
- }
-
- for (int i = 1; i < rank - 1; ++i)
- {
- float inv_t = 1.0f / matrix[i * rank + i];
- for (int x = i; x < rank; ++x)
- {
- matrix[i * rank + x] *= inv_t;
- }
- for (int x = 0; x < rank; ++x)
- {
- inverted[i * rank + x] *= inv_t;
- }
-
- for (int y = 0; y < rank; ++y)
- {
- if (y == i)
- continue;
- float t = matrix[y * rank + i];
- for (int x = i; x < rank; ++x)
- {
- matrix[y * rank + x] -= t * matrix[i * rank + x];
- }
- for (int x = 0; x < rank; ++x)
- {
- inverted[y * rank + x] -= t * inverted[i * rank + x];
- }
- }
- }
-
- return inverted;
-}
-
-std::vector<float> Multiply(const std::vector<float> &a_mat, const std::vector<float> &b_mat, int m,
- int n, int k)
-{
- std::vector<float> result(m * k);
- for (int y = 0; y < m; ++y)
- {
- for (int x = 0; x < k; ++x)
- {
- float sum = 0.0f;
- for (int i = 0; i < n; ++i)
- {
- sum += a_mat[y * n + i] * b_mat[i * k + x];
- }
- result[y * k + x] = sum;
- }
- }
- return result;
-}
-} // namespace
-
-std::vector<float> AtMatrixForWinograd4x4To6x6() { return GetTransposedMatrixForWinograd(6, 4); }
-
-std::vector<float> BtMatrixForWinograd4x4To6x6() { return GetInversedMatrixForWinograd(6); }
-
-void RearrangeWeightsToWinograd4x4To6x6Weights(
- const gpu_cl::InternalTensor<gpu_cl::OHWI, gpu_cl::DataType::FLOAT32> &src_weights,
- gpu_cl::InternalTensor<gpu_cl::OHWI, gpu_cl::DataType::FLOAT32> *dst_weights)
-{
- gpu_cl::OHWI dst_shape;
- dst_shape.o = src_weights.shape.o;
- dst_shape.h = 6;
- dst_shape.w = 6;
- dst_shape.i = src_weights.shape.i;
- dst_weights->shape = dst_shape;
- dst_weights->data.resize(dst_shape.DimensionsProduct());
-
- auto gt_mat = GetTransposedMatrixForWinograd(6, 3);
- std::vector<float> g_mat(gt_mat.size());
- for (int y = 0; y < 3; ++y)
- {
- for (int x = 0; x < 6; ++x)
- {
- g_mat[x * 3 + y] = gt_mat[y * 6 + x];
- }
- }
-
- for (int d = 0; d < src_weights.shape.o; ++d)
- {
- for (int s = 0; s < src_weights.shape.i; ++s)
- {
- std::vector<float> in_vals(9);
- for (int y = 0; y < 3; ++y)
- {
- for (int x = 0; x < 3; ++x)
- {
- const int f_index = src_weights.shape.LinearIndex({d, y, x, s});
- in_vals[y * 3 + x] = src_weights.data[f_index];
- }
- }
-
- auto temp_vals = Multiply(g_mat, in_vals, 6, 3, 3);
- auto out_vals = Multiply(temp_vals, gt_mat, 6, 3, 6);
-
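// [Editor's annotation, not part of the upstream diff] The two Multiply calls
// above implement the Winograd F(4x4, 3x3) weight transform W' = G * W * G^T:
// gt_mat holds G^T (3x6), g_mat holds G (6x3), and in_vals is one 3x3 filter
// slice W, so temp_vals = G * W is 6x3 and out_vals = (G * W) * G^T is the 6x6
// tile that the loop below scatters into dst_weights.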
for (int y = 0; y < 6; ++y)
- {
- for (int x = 0; x < 6; ++x)
- {
- const int f_index = dst_shape.LinearIndex({d, y, x, s});
- dst_weights->data[f_index] = out_vals[y * 6 + x];
- }
- }
- }
- }
-}
-
-} // namespace backend
-} // namespace onert
diff --git a/runtime/onert/backend/gpu_cl/open_cl/WinogradUtil.h b/runtime/onert/backend/gpu_cl/open_cl/WinogradUtil.h
deleted file mode 100644
index 32e21760d..000000000
--- a/runtime/onert/backend/gpu_cl/open_cl/WinogradUtil.h
+++ /dev/null
@@ -1,48 +0,0 @@
-/*
- * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright 2019 The TensorFlow Authors. All Rights Reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __ONERT_BACKEND_GPU_CL_OPENCL_WINOGRAD_UTIL_H__
-#define __ONERT_BACKEND_GPU_CL_OPENCL_WINOGRAD_UTIL_H__
-
-#include <vector>
-
-#include "open_cl/DataType.h"
-#include "open_cl/Shape.h"
-#include "open_cl/InternalTensor.h"
-
-namespace onert
-{
-namespace backend
-{
-
-// Matrices for Winograd transformations obtained with the method described here
-// https://openreview.net/pdf?id=H1ZaRZVKg
-
-// returns the transposed A matrix (6 x 4) as an array (24 values) for Winograd4x4To6x6
-std::vector<float> AtMatrixForWinograd4x4To6x6();
-
-// returns the transposed B matrix (6 x 6) as an array (36 values) for Winograd4x4To6x6
-std::vector<float> BtMatrixForWinograd4x4To6x6();
-
-void RearrangeWeightsToWinograd4x4To6x6Weights(
- const gpu_cl::InternalTensor<gpu_cl::OHWI, gpu_cl::DataType::FLOAT32> &src_weights,
- gpu_cl::InternalTensor<gpu_cl::OHWI, gpu_cl::DataType::FLOAT32> *dst_weights);
-
-} // namespace backend
-} // namespace onert
-
-#endif // __ONERT_BACKEND_GPU_CL_OPENCL_WINOGRAD_UTIL_H__
diff --git a/runtime/onert/backend/gpu_cl/open_cl/WorkgroupSelection.cc b/runtime/onert/backend/gpu_cl/open_cl/WorkgroupSelection.cc
deleted file mode 100644
index 847c2a2aa..000000000
--- a/runtime/onert/backend/gpu_cl/open_cl/WorkgroupSelection.cc
+++ /dev/null
@@ -1,258 +0,0 @@
-/*
- * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright 2019 The TensorFlow Authors. All Rights Reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "WorkgroupSelection.h"
-
-#include <math.h>
-
-#include <set>
-#include <vector>
-
-#include "Util.h"
-
-namespace onert
-{
-namespace backend
-{
-namespace gpu_cl
-{
-
-namespace
-{
-
-template <typename T>
-void AddCornerCases(const T &grid, int max_work_group_total_size, const T &max_work_group_sizes,
- WorkGroupSizeAlignment x_alignment, WorkGroupSizeAlignment y_alignment,
- WorkGroupSizeAlignment z_alignment, std::vector<T> *work_groups)
-{
- for (int x = 1; x <= 4; ++x)
- {
- for (int y = 1; y <= 4; ++y)
- {
- for (int z = 1; z <= 4; ++z)
- {
- u_int32_t wg_x = DivideRoundUp(grid.x, x);
- u_int32_t wg_y = DivideRoundUp(grid.y, y);
- u_int32_t wg_z = DivideRoundUp(grid.z, z);
- if (wg_x > static_cast<u_int32_t>(max_work_group_sizes.x) ||
- wg_y > static_cast<u_int32_t>(max_work_group_sizes.y) ||
- wg_z > static_cast<u_int32_t>(max_work_group_sizes.z) ||
- wg_x * wg_y * wg_z > static_cast<u_int32_t>(max_work_group_total_size))
- {
- continue;
- }
- if (x_alignment == WorkGroupSizeAlignment::PRECISE && grid.x % wg_x != 0)
- {
- continue;
- }
- if (y_alignment == WorkGroupSizeAlignment::PRECISE && grid.y % wg_y != 0)
- {
- continue;
- }
- if (z_alignment == WorkGroupSizeAlignment::PRECISE && grid.z % wg_z != 0)
- {
- continue;
- }
- work_groups->push_back({wg_x, wg_y, wg_z});
- }
- }
- }
-
- // this will add at least {1, 1, 1} always.
- for (u_int32_t x = 1; x <= 4; ++x)
- {
- for (u_int32_t y = 1; y <= 4; ++y)
- {
- for (u_int32_t z = 1; z <= 4; ++z)
- {
- if (x > static_cast<u_int32_t>(max_work_group_sizes.x) ||
- y > static_cast<u_int32_t>(max_work_group_sizes.y) ||
- z > static_cast<u_int32_t>(max_work_group_sizes.z) ||
- x * y * z > static_cast<u_int32_t>(max_work_group_total_size))
- {
- continue;
- }
- if (x_alignment == WorkGroupSizeAlignment::PRECISE && grid.x % x != 0)
- {
- continue;
- }
- if (y_alignment == WorkGroupSizeAlignment::PRECISE && grid.y % y != 0)
- {
- continue;
- }
- if (z_alignment == WorkGroupSizeAlignment::PRECISE && grid.z % z != 0)
- {
- continue;
- }
- work_groups->push_back({x, y, z});
- }
- }
- }
-}
-
-std::vector<int> GetDivisors(int number)
-{
- const int max_divisor = static_cast<int>(sqrt(number));
- std::vector<int> divisors;
- // we don't know the number of divisors in advance, so this is just a heuristic.
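// [Editor's annotation, not part of the upstream diff] A worked example of the
// loop below, assuming number = 12: max_divisor = 3, and the pairs (i, 12 / i)
// are appended in visit order, yielding {1, 12, 2, 6, 3, 4}.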
- divisors.reserve(max_divisor / 3 + 1);
- for (int i = 1; i <= max_divisor; ++i)
- {
- const int d = number / i;
- if (i * d == number)
- {
- divisors.push_back(i);
- if (d != i)
- {
- divisors.push_back(d);
- }
- }
- }
- return divisors;
-}
-
-std::vector<int> GetDivisorsForRange(int number, int range)
-{
- const int last_number = number + range;
- const int max_divisor = static_cast<int>(sqrt(last_number));
- std::set<int> divisors;
- for (int i = 1; i <= max_divisor; ++i)
- {
- const int remainder = number % i;
- // iterate through the numbers in our range that are divisible by i
- const int first_number = number + (i - remainder) % i;
- if (first_number <= last_number)
- {
- divisors.insert(i);
- }
- for (int j = first_number; j <= last_number; j += i)
- {
- const int d = j / i;
- if (d != i)
- {
- divisors.insert(d);
- }
- }
- }
- return std::vector<int>(divisors.begin(), divisors.end());
-}
-
-} // namespace
-
-std::vector<int> GetPossibleSizes(int number, WorkGroupSizeAlignment z_alignment)
-{
- if (z_alignment == WorkGroupSizeAlignment::PRECISE)
- {
- // use only potential sizes that cover the grid precisely:
- // work group size * k (k is an integer) == grid_size
- return GetDivisors(number);
- }
- else
- {
- // when choosing a work group size we may also use sizes whose multiple
- // slightly overshoots the grid, so this heuristic looks for potential
- // sizes that satisfy both of these:
- // work group size * k (k is an integer) <= grid_size + 5
- // work group size * k (k is an integer) >= grid_size
- return GetDivisorsForRange(number, 5);
- }
-}
-
-template <typename T>
-std::vector<T>
-GenerateWorkGroupSizes(const T &grid, int min_work_group_total_size, int max_work_group_total_size,
- const T &max_work_group_sizes, WorkGroupSizeAlignment x_alignment,
- WorkGroupSizeAlignment y_alignment, WorkGroupSizeAlignment z_alignment)
-{
- std::vector<T> work_groups;
- work_groups.reserve(64);
-
- std::vector<int> sizes_x = GetPossibleSizes(grid.x, x_alignment);
- std::vector<int> sizes_y = GetPossibleSizes(grid.y, y_alignment);
- std::vector<int> sizes_z = GetPossibleSizes(grid.z, z_alignment);
-
- for (auto x : sizes_x)
- {
- if (static_cast<int>(x) > static_cast<int>(max_work_group_sizes.x))
- continue;
- for (auto y : sizes_y)
- {
- if (static_cast<int>(y) > static_cast<int>(max_work_group_sizes.y))
- continue;
- for (auto z : sizes_z)
- {
- if (static_cast<int>(z) > static_cast<int>(max_work_group_sizes.z))
- continue;
- const int work_group_size = x * y * z;
- if (work_group_size < min_work_group_total_size ||
- work_group_size > max_work_group_total_size)
- continue;
- work_groups.push_back({x, y, z});
- }
- }
- }
-
- return work_groups;
-}
-
-// Specializations of GenerateWorkGroupSizes for int3 and uint3
-
-template std::vector<int3> GenerateWorkGroupSizes(const int3 &grid, int min_work_group_total_size,
- int max_work_group_total_size,
- const int3 &max_work_group_sizes,
- WorkGroupSizeAlignment x_alignment,
- WorkGroupSizeAlignment y_alignment,
- WorkGroupSizeAlignment z_alignment);
-
-template std::vector<uint3> GenerateWorkGroupSizes(const uint3 &grid, int min_work_group_total_size,
- int max_work_group_total_size,
- const uint3 &max_work_group_sizes,
- WorkGroupSizeAlignment x_alignment,
- WorkGroupSizeAlignment y_alignment,
- WorkGroupSizeAlignment z_alignment);
-
-template <typename T>
-void GenerateWorkGroupSizesAlignedToGrid(const T &grid, const T &max_work_group_size,
- const int max_work_group_invocations,
- std::vector<T>
*work_groups)
-{
- auto alignment = WorkGroupSizeAlignment::PRECISE;
- *work_groups =
- GenerateWorkGroupSizes<T>(grid, /*min_work_group_total_size = */ 32, max_work_group_invocations,
- max_work_group_size, alignment, alignment, alignment);
- // If the grid is too small, the call above cannot generate any work groups.
- if (work_groups->empty())
- {
- AddCornerCases(grid, max_work_group_invocations, max_work_group_size, alignment, alignment,
- alignment, work_groups);
- }
-}
-
-// Specializations of GenerateWorkGroupSizesAlignedToGrid for int3 and uint3
-
-template void GenerateWorkGroupSizesAlignedToGrid(const int3 &grid, const int3 &max_work_group_size,
- const int max_work_group_invocations,
- std::vector<int3> *work_groups);
-
-template void GenerateWorkGroupSizesAlignedToGrid(const uint3 &grid,
- const uint3 &max_work_group_size,
- const int max_work_group_invocations,
- std::vector<uint3> *work_groups);
-
-} // namespace gpu_cl
-} // namespace backend
-} // namespace onert
diff --git a/runtime/onert/backend/gpu_cl/open_cl/WorkgroupSelection.h b/runtime/onert/backend/gpu_cl/open_cl/WorkgroupSelection.h
deleted file mode 100644
index b0702ac7c..000000000
--- a/runtime/onert/backend/gpu_cl/open_cl/WorkgroupSelection.h
+++ /dev/null
@@ -1,59 +0,0 @@
-/*
- * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright 2019 The TensorFlow Authors. All Rights Reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __ONERT_BACKEND_GPU_CL_OPENCL_WORK_GROUP_SELECTION_H__
-#define __ONERT_BACKEND_GPU_CL_OPENCL_WORK_GROUP_SELECTION_H__
-
-#include <vector>
-
-namespace onert
-{
-namespace backend
-{
-namespace gpu_cl
-{
-
-// PRECISE assumes that WorkGroupSize * k = GridSize;
-// NO_ALIGNMENT imposes no restrictions.
-// We need PRECISE when the kernel has no boundary check;
-// if it does have the check, either PRECISE or NO_ALIGNMENT can be used.
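[Editor's annotation, not part of the upstream diff] A minimal usage sketch of the declarations below; the 13x8x4 grid and the 256x256x64 / 256-invocation device limits are illustrative assumptions, not values taken from this backend:

  // Collect work group sizes whose dimensions divide a 13x8x4 grid exactly
  // (PRECISE alignment is hardwired inside the helper).
  std::vector<int3> work_groups;
  GenerateWorkGroupSizesAlignedToGrid(int3(13, 8, 4), int3(256, 256, 64),
                                      256, &work_groups);
  // The helper keeps candidates whose total size lies in [32, 256]; if nothing
  // survives (a very small grid), AddCornerCases falls back to near-minimal
  // sizes and always admits at least {1, 1, 1}.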
-enum class WorkGroupSizeAlignment -{ - PRECISE, - NO_ALIGNMENT -}; - -std::vector<int> GetPossibleSizes(int number, WorkGroupSizeAlignment z_alignment); - -// Specializations exist for int3 and uint3 in the .cc file - -template <typename T> -std::vector<T> -GenerateWorkGroupSizes(const T &grid, int min_work_group_total_size, int max_work_group_total_size, - const T &max_work_group_sizes, WorkGroupSizeAlignment x_alignment, - WorkGroupSizeAlignment y_alignment, WorkGroupSizeAlignment z_alignment); - -template <typename T> -void GenerateWorkGroupSizesAlignedToGrid(const T &grid, const T &max_work_group_size, - const int max_work_group_invocations, - std::vector<T> *work_groups); - -} // namespace gpu_cl -} // namespace backend -} // namespace onert - -#endif // __ONERT_BACKEND_GPU_CL_OPENCL_WORK_GROUP_SELECTION_H__ diff --git a/runtime/onert/backend/gpu_cl/open_cl/kernels/Add.cc b/runtime/onert/backend/gpu_cl/open_cl/kernels/Add.cc deleted file mode 100644 index 09100fe1f..000000000 --- a/runtime/onert/backend/gpu_cl/open_cl/kernels/Add.cc +++ /dev/null @@ -1,64 +0,0 @@ -/* - * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright 2019 The TensorFlow Authors. All Rights Reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "Add.h" - -#include <cstring> -#include <string> - -#include "absl/strings/str_cat.h" -#include "Util.h" -#include "open_cl/Util.h" - -namespace onert -{ -namespace backend -{ -namespace gpu_cl -{ - -GPUOperation CreateAdd(const OperationDef &definition, const std::vector<int> &channels, - int dst_channels) -{ - GPUOperation add(definition); - int dst_depth = DivideRoundUp(dst_channels, 4); - int src0_depth = DivideRoundUp(channels[0], 4); - add.elementwise_ = true; - add.linkable_ = dst_depth == src0_depth; - if (src0_depth < dst_depth) - { - add.check_src_channels_size_ = true; - } - for (uint32_t i = 1; i < definition.src_tensors.size(); ++i) - { - const std::string tensor_name = absl::StrCat("src_data_", i); - auto src_desc = definition.src_tensors[i]; - if (definition.IsBatchSupported()) - { - src_desc.SetStateVar("BatchedWidth", "true"); - } - add.AddSrcTensor(tensor_name, src_desc); - add.code_ += "if (S_COORD < args." + tensor_name + ".Slices()) {\n"; - add.code_ += " in_out_value += args." + tensor_name + ".Read(X_COORD, Y_COORD, S_COORD);\n"; - add.code_ += "}\n"; - } - return add; -} - -} // namespace gpu_cl -} // namespace backend -} // namespace onert diff --git a/runtime/onert/backend/gpu_cl/open_cl/kernels/Add.h b/runtime/onert/backend/gpu_cl/open_cl/kernels/Add.h deleted file mode 100644 index 2335a901c..000000000 --- a/runtime/onert/backend/gpu_cl/open_cl/kernels/Add.h +++ /dev/null @@ -1,43 +0,0 @@ -/* - * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright 2019 The TensorFlow Authors. All Rights Reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __ONERT_BACKEND_GPU_CL_OPENCL_KERNELS_ADD_H__
-#define __ONERT_BACKEND_GPU_CL_OPENCL_KERNELS_ADD_H__
-
-#include <string>
-#include <vector>
-
-#include "GpuOperation.h"
-#include "open_cl/Operations.h"
-
-namespace onert
-{
-namespace backend
-{
-namespace gpu_cl
-{
-
-// The Add operation supports input tensors with unequal channel counts (this makes
-// it possible to remove a Padding operation that pads the channels dimension with zeroes)
-GPUOperation CreateAdd(const OperationDef &definition, const std::vector<int> &channels,
- int dst_channels);
-
-} // namespace gpu_cl
-} // namespace backend
-} // namespace onert
-
-#endif // __ONERT_BACKEND_GPU_CL_OPENCL_KERNELS_ADD_H__
diff --git a/runtime/onert/backend/gpu_cl/open_cl/kernels/ConvBuffer1x1.cc b/runtime/onert/backend/gpu_cl/open_cl/kernels/ConvBuffer1x1.cc
deleted file mode 100644
index 1b9014fdf..000000000
--- a/runtime/onert/backend/gpu_cl/open_cl/kernels/ConvBuffer1x1.cc
+++ /dev/null
@@ -1,480 +0,0 @@
-/*
- * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright 2019 The TensorFlow Authors. All Rights Reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */ - -#include "open_cl/kernels/ConvBuffer1x1.h" - -#include <array> -#include <string> -#include <utility> - -#include "open_cl/ClDevice.h" -#include "open_cl/kernels/Util.h" -#include "open_cl/kernels/WorkGroupPicking.h" -#include "open_cl/Precision.h" -#include "open_cl/TensorType.h" -#include "open_cl/Status.h" - -namespace onert -{ -namespace backend -{ -namespace gpu_cl -{ -namespace -{ - -// element_size must be 1, 2 or 4 -// 1 - is FLT4 -// 2 - is FLT8 -// 4 - is FLT16 -// This function generates code for arithmetic part of convolution -std::string GetComputationPart(const int3 &block_size, int element_size, - CalculationsPrecision precision) -{ - const std::string hexes[16] = {"0", "1", "2", "3", "4", "5", "6", "7", - "8", "9", "a", "b", "c", "d", "e", "f"}; - std::string c; - for (int z = 0; z < block_size.z; ++z) - { - const std::string z_s = std::to_string(z); - c += " FLT16 W" + z_s + " = weights_cache[" + z_s + "];\n"; - for (int y = 0; y < block_size.y; ++y) - { - for (int x = 0; x < block_size.x; ++x) - { - std::string s_index = std::to_string(y * block_size.x + x); - for (int e = 0; e < element_size; ++e) - { - std::string r_index = z_s + std::to_string(y) + std::to_string(x * element_size + e); - const std::string f0 = "W" + z_s + ".s0123"; - const std::string f1 = "W" + z_s + ".s4567"; - const std::string f2 = "W" + z_s + ".s89ab"; - const std::string f3 = "W" + z_s + ".scdef"; - switch (precision) - { - case CalculationsPrecision::F32: - case CalculationsPrecision::F16: - c += " r" + r_index + " += " + f0 + " * s" + s_index + ".s" + hexes[e * 4 + 0] + - ";\n"; - c += " r" + r_index + " += " + f1 + " * s" + s_index + ".s" + hexes[e * 4 + 1] + - ";\n"; - c += " r" + r_index + " += " + f2 + " * s" + s_index + ".s" + hexes[e * 4 + 2] + - ";\n"; - c += " r" + r_index + " += " + f3 + " * s" + s_index + ".s" + hexes[e * 4 + 3] + - ";\n"; - break; - case CalculationsPrecision::F32_F16: - c += " r" + r_index + " += convert_float4(" + f0 + " * s" + s_index + ".s" + - hexes[e * 4 + 0] + " + " + f1 + " * s" + s_index + ".s" + hexes[e * 4 + 1] + - " + " + f2 + " * s" + s_index + ".s" + hexes[e * 4 + 2] + " + " + f3 + " * s" + - s_index + ".s" + hexes[e * 4 + 3] + ");\n"; - break; - } - } - } - } - } - return c; -} - -ConvBuffer1x1::ConvParams GetBestParams(const DeviceInfo &device_info, - const OperationDef &definition, const BHWC &shape, int, - int dst_depth) -{ - ConvBuffer1x1::ConvParams conv_params; - conv_params.element_size = 4; - conv_params.block_size = int3(1, 1, 1); - if (!device_info.IsMali()) - { - return conv_params; - } - bool can_use_flt8 = - (shape.w * shape.b) % 2 == 0 && definition.precision != CalculationsPrecision::F32; - bool is_midgard = device_info.IsMali() && device_info.mali_info.IsMidgard(); - if (is_midgard) - { - if (can_use_flt8) - { - conv_params.element_size = 8; - } - if (definition.precision == CalculationsPrecision::F16 || !can_use_flt8) - { - conv_params.block_size.x = 2; - } - return conv_params; - } - - int task_size = shape.w * shape.b * shape.h * dst_depth; - int block_size = GetRecommendedBlockSizeForConv(device_info, definition.precision, task_size); - - if (!can_use_flt8 && block_size > 4) - { - block_size = 4; - } - - if (can_use_flt8 && block_size >= 2) - { - conv_params.element_size = 8; - block_size /= 2; - } - if (block_size == 4) - { - conv_params.block_size.x = 2; - if (definition.precision == CalculationsPrecision::F32 && dst_depth < 32) - { - conv_params.block_size.y = 2; - } - else - { - conv_params.block_size.z = 2; - } - } - 
else if (block_size == 2) - { - if (dst_depth >= 32) - { - conv_params.block_size.z = 2; - } - else - { - conv_params.block_size.x = 2; - } - } - - return conv_params; -} - -ConvBuffer1x1::ConvParams GetBestParams(const DeviceInfo &device_info, - const OperationDef &definition, int, int) -{ - ConvBuffer1x1::ConvParams conv_params; - conv_params.element_size = 4; - conv_params.block_size = int3(1, 1, 1); - if (device_info.IsMali() && definition.precision == CalculationsPrecision::F16 && - device_info.compute_units_count <= 4) - { - conv_params.block_size.x *= 2; - } - return conv_params; -} - -} // namespace - -ConvBuffer1x1::ConvBuffer1x1(const OperationDef &definition, const ConvParams &conv_params) - : GPUOperation(definition), conv_params_(conv_params) -{ - code_ = GenerateConvBuffer1x1(definition_, conv_params_, &args_); - work_group_size_ = int3(2, 4, 1); -} - -ConvBuffer1x1::ConvBuffer1x1(ConvBuffer1x1 &&operation) - : GPUOperation(std::move(operation)), conv_params_(std::move(operation.conv_params_)) -{ -} - -ConvBuffer1x1 &ConvBuffer1x1::operator=(ConvBuffer1x1 &&operation) -{ - if (this != &operation) - { - std::swap(conv_params_, operation.conv_params_); - GPUOperation::operator=(std::move(operation)); - } - return *this; -} - -std::string ConvBuffer1x1::GenerateConvBuffer1x1(const OperationDef &op_def, - const ConvBuffer1x1::ConvParams &conv_params, - Arguments *) -{ - auto src_desc = op_def.src_tensors[0]; - if (op_def.IsBatchSupported()) - { - src_desc.SetStateVar("BatchedWidth", "true"); - } - if (conv_params_.element_size == 8) - { - src_desc.SetStateVar("ElementsX2", "true"); - } - else if (conv_params_.element_size == 16) - { - src_desc.SetStateVar("ElementsX4", "true"); - } - AddSrcTensor("src_tensor", src_desc); - if (op_def.src_tensors.size() == 2) - { - // dynamic weights - BufferDescriptor desc; - desc.element_type = op_def.src_tensors[1].data_type; - desc.element_size = 16; - desc.memory_type = MemoryType::GLOBAL; - AddSrcBuffer("weights", desc); - } - - auto dst_desc = op_def.dst_tensors[0]; - if (op_def.IsBatchSupported()) - { - dst_desc.SetStateVar("BatchedWidth", "true"); - } - AddDstTensor("dst_tensor", dst_desc); - - std::string c = GetCommonDefines(op_def.precision); - switch (op_def.precision) - { - case CalculationsPrecision::F32: - c += "#define FLT8 float8\n"; - c += "#define FLT16 float16\n"; - break; - case CalculationsPrecision::F32_F16: - case CalculationsPrecision::F16: - c += "#define FLT8 half8\n"; - c += "#define FLT16 half16\n"; - break; - } - - const int3 block_size = conv_params.block_size; - const int element_size = conv_params.element_size / 4; - - c += "__kernel void main_function(\n"; - c += "$0) {\n"; - c += " int X = get_global_id(0) * " + std::to_string(block_size.x * element_size) + ";\n"; - c += " int X_SRC = get_global_id(0) * " + std::to_string(block_size.x) + ";\n"; - c += " int Y = get_global_id(1) * " + std::to_string(block_size.y) + ";\n"; - c += " int Z = get_global_id(2) * " + std::to_string(block_size.z) + ";\n"; - c += " if (X >= args.dst_tensor.Width() || Y >= args.dst_tensor.Height() || " - "Z >= args.dst_tensor.Slices()) return;\n"; - if (conv_params.different_weights_for_height) - { - c += " __global FLT16* weights_cache = args.weights.GetPtr() + (Z * " - "args.src_tensor.Height() + " - "Y * " + - std::to_string(block_size.z) + - ") * " - "args.src_tensor.Slices();\n"; - } - else - { - c += " __global FLT16* weights_cache = args.weights.GetPtr() + Z * " - "args.src_tensor.Slices();\n"; - } - for (int z = 0; z < 
block_size.z; ++z) - { - const std::string z_s = std::to_string(z); - c += " ACCUM_FLT4 bias_val_" + z_s + " = TO_ACCUM_TYPE(args.biases.Read(Z + " + z_s + "));\n"; - for (int y = 0; y < block_size.y; ++y) - { - for (int x = 0; x < block_size.x * element_size; ++x) - { - c += " ACCUM_FLT4 r" + z_s + std::to_string(y) + std::to_string(x) + " = bias_val_" + z_s + - ";\n"; - } - } - } - for (int x = 0; x < block_size.x; ++x) - { - std::string x_s = std::to_string(x); - c += " int xc" + x_s + " = min(X_SRC + " + std::to_string(x) + - ", args.src_tensor.Width() - 1);\n"; - } - for (int y = 0; y < block_size.y; ++y) - { - std::string y_s = std::to_string(y); - c += " int yc" + y_s + " = min(Y + " + y_s + ", args.src_tensor.Height() - 1);\n"; - } - for (int y = 0; y < block_size.y; ++y) - { - std::string y_s = std::to_string(y); - for (int x = 0; x < block_size.x; ++x) - { - std::string x_s = std::to_string(x); - std::string i_s = std::to_string(y * block_size.x + x); - c += " int src_addr_" + i_s + " = (yc" + y_s + ") * args.src_tensor.Width() + (xc" + x_s + - ");\n"; - } - } - c += " for (int s = 0; s < args.src_tensor.Slices(); ++s) {\n"; - for (int y = 0; y < block_size.y; ++y) - { - std::string y_s = std::to_string(y); - for (int x = 0; x < block_size.x; ++x) - { - std::string x_s = std::to_string(x); - std::string i_s = std::to_string(y * block_size.x + x); - c += " FLT" + std::to_string(element_size * 4) + " s" + i_s + - " = args.src_tensor.Read(src_addr_" + i_s + ");\n"; - } - } - c += GetComputationPart(block_size, element_size, op_def.precision); - for (int i = 0; i < block_size.x * block_size.y; ++i) - { - std::string i_s = std::to_string(i); - c += " src_addr_" + i_s + " += args.src_tensor.SliceStride();\n"; - } - c += " weights_cache += " + std::to_string(block_size.z) + ";\n"; - c += " }\n"; // SRC_SLICES - - for (int z = 0; z < block_size.z; ++z) - { - const std::string z_s = std::to_string(z); - if (z != 0) - { - c += " if (Z + " + z_s + " >= args.dst_tensor.Slices()) return;\n"; - } - for (int y = 0; y < block_size.y; ++y) - { - const std::string y_s = std::to_string(y); - for (int x = 0; x < block_size.x * element_size; ++x) - { - const std::string x_s = std::to_string(x); - c += " if (X + " + x_s + " < args.dst_tensor.Width() && Y + " + y_s + - " < args.dst_tensor.Height()) {\n"; - c += " FLT4 res = TO_FLT4(r" + z_s + y_s + x_s + ");\n"; - c += " args.dst_tensor.Write(res, X + " + x_s + ", Y + " + y_s + ", Z + " + z_s + ");\n"; - c += " }\n"; - } - } - } - c += "}\n"; - return c; -} - -int3 ConvBuffer1x1::GetGridSize() const -{ - const int dst_width_elements = - DivideRoundUp(dst_[0]->Width() * dst_[0]->Batch(), (conv_params_.element_size / 4)); - const int grid_x = DivideRoundUp(dst_width_elements, conv_params_.block_size.x); - const int grid_y = DivideRoundUp(dst_[0]->Height(), conv_params_.block_size.y); - const int grid_z = DivideRoundUp(dst_[0]->Slices(), conv_params_.block_size.z); - return int3(grid_x, grid_y, grid_z); -} - -void ConvBuffer1x1::GetPossibleKernelWorkGroups(TuningType tuning_type, - const DeviceInfo &device_info, - const KernelInfo &kernel_info, - std::vector<int3> *work_groups) const -{ - GetPossibleWorkGroupsConv(tuning_type, device_info, kernel_info, grid_size_, work_groups); -} - -bool IsConvBuffer1x1Supported(const OperationDef &definition, const Convolution2DAttributes &attr) -{ - auto src_storage_type = definition.src_tensors[0].storage_type; - return src_storage_type == TensorStorageType::BUFFER && attr.weights.shape.w == 1 && - 
attr.weights.shape.h == 1 && attr.dilations.w == 1 && attr.dilations.h == 1 && - attr.strides.w == 1 && attr.strides.h == 1 && attr.padding.prepended.w == 0 && - attr.padding.prepended.h == 0 && attr.padding.appended.w == 0 && - attr.padding.appended.h == 0; -} - -bool IsConvBuffer1x1Supported(const OperationDef &definition, const BHWC &weights_shape, - const Convolution2DAttributes &attr) -{ - auto src_storage_type = definition.src_tensors[0].storage_type; - return src_storage_type == TensorStorageType::BUFFER && weights_shape.w == 1 && - weights_shape.h == 1 && attr.dilations.w == 1 && attr.dilations.h == 1 && - attr.strides.w == 1 && attr.strides.h == 1 && attr.padding.prepended.w == 0 && - attr.padding.prepended.h == 0 && attr.padding.appended.w == 0 && - attr.padding.appended.h == 0; -} - -ConvBuffer1x1 CreateConvBuffer1x1(const DeviceInfo &device_info, const OperationDef &definition, - const Convolution2DAttributes &attr, const BHWC *shape) -{ - const int dst_depth = DivideRoundUp(attr.weights.shape.o, 4); - const int src_depth = DivideRoundUp(attr.weights.shape.i, 4); - ConvBuffer1x1::ConvParams conv_params; - if (shape) - { - conv_params = GetBestParams(device_info, definition, *shape, src_depth, dst_depth); - } - else - { - conv_params = GetBestParams(device_info, definition, src_depth, dst_depth); - } - ConvBuffer1x1 result(definition, conv_params); - result.UploadData(attr.weights, attr.bias); - return result; -} - -ConvBuffer1x1 CreateConvBuffer1x1(const DeviceInfo &device_info, const OperationDef &definition, - const FullyConnectedAttributes &attr, const BHWC *shape) -{ - const int dst_depth = DivideRoundUp(attr.weights.shape.o, 4); - const int src_depth = DivideRoundUp(attr.weights.shape.i, 4); - ConvBuffer1x1::ConvParams conv_params; - if (shape) - { - conv_params = GetBestParams(device_info, definition, *shape, src_depth, dst_depth); - } - else - { - conv_params = GetBestParams(device_info, definition, src_depth, dst_depth); - } - conv_params.block_size.x *= conv_params.block_size.y; - conv_params.block_size.y = 1; - ConvBuffer1x1 result(definition, conv_params); - result.UploadData(attr.weights, attr.bias); - return result; -} - -ConvBuffer1x1 CreateConvBuffer1x1Wino4x4To6x6(const DeviceInfo &device_info, - const OperationDef &definition, - const Convolution2DAttributes &attr, - const BHWC *shape) -{ - const int dst_depth = DivideRoundUp(attr.weights.shape.o, 4); - const int src_depth = DivideRoundUp(attr.weights.shape.i, 4); - ConvBuffer1x1::ConvParams conv_params; - if (shape) - { - conv_params = GetBestParams(device_info, definition, *shape, src_depth, dst_depth); - } - else - { - conv_params = GetBestParams(device_info, definition, src_depth, dst_depth); - } - conv_params.block_size.x *= conv_params.block_size.y; - conv_params.block_size.y = 1; - conv_params.different_weights_for_height = true; - ConvBuffer1x1 result(definition, conv_params); - result.UploadDataForWinograd4x4To6x6(attr.weights); - return result; -} - -ConvBuffer1x1 CreateConvBuffer1x1DynamicWeights(const DeviceInfo &device_info, - const OperationDef &definition, - const Convolution2DAttributes &attr, - const BHWC &weights_shape, const BHWC *dst_shape) -{ - const int dst_depth = DivideRoundUp(weights_shape.b, 4); - const int src_depth = DivideRoundUp(weights_shape.c, 4); - ConvBuffer1x1::ConvParams conv_params; - if (dst_shape) - { - conv_params = GetBestParams(device_info, definition, *dst_shape, src_depth, dst_depth); - } - else - { - conv_params = GetBestParams(device_info, definition, src_depth, 
dst_depth);
- }
- ConvBuffer1x1 result(definition, conv_params);
- result.UploadBiases(attr.bias);
- return result;
-}
-
-} // namespace gpu_cl
-} // namespace backend
-} // namespace onert
diff --git a/runtime/onert/backend/gpu_cl/open_cl/kernels/ConvBuffer1x1.h b/runtime/onert/backend/gpu_cl/open_cl/kernels/ConvBuffer1x1.h
deleted file mode 100644
index 0abd6051f..000000000
--- a/runtime/onert/backend/gpu_cl/open_cl/kernels/ConvBuffer1x1.h
+++ /dev/null
@@ -1,205 +0,0 @@
-/*
- * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright 2019 The TensorFlow Authors. All Rights Reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __ONERT_BACKEND_GPU_CL_OPENCL_KERNELS_CONV_BUFFER_1X1_H__
-#define __ONERT_BACKEND_GPU_CL_OPENCL_KERNELS_CONV_BUFFER_1X1_H__
-
-#include "open_cl/Buffer.h"
-#include "open_cl/ClKernel.h"
-#include "open_cl/kernels/ConvCommon.h"
-#include "open_cl/kernels/GpuOperation.h"
-#include "open_cl/kernels/Util.h"
-#include "open_cl/LinearStorage.h"
-#include "open_cl/Precision.h"
-#include "open_cl/InternalTensor.h"
-#include "open_cl/Util.h"
-#include "open_cl/DataType.h"
-#include "open_cl/Operations.h"
-#include "open_cl/Shape.h"
-#include "open_cl/Status.h"
-#include "open_cl/Tensor.h"
-#include "open_cl/Types.h"
-#include "open_cl/WinogradUtil.h"
-
-namespace onert
-{
-namespace backend
-{
-namespace gpu_cl
-{
-
-class ConvBuffer1x1 : public GPUOperation
-{
-public:
- ConvBuffer1x1() = default;
-
- // Move only
- ConvBuffer1x1(ConvBuffer1x1 &&operation);
- ConvBuffer1x1 &operator=(ConvBuffer1x1 &&operation);
- ConvBuffer1x1(const ConvBuffer1x1 &) = delete;
- ConvBuffer1x1 &operator=(const ConvBuffer1x1 &) = delete;
-
- void GetPossibleKernelWorkGroups(TuningType tuning_type, const DeviceInfo &device_info,
- const KernelInfo &kernel_info,
- std::vector<int3> *work_groups) const override;
- int3 GetGridSize() const override;
-
- ConvWeightsDescription GetConvWeightsDescription() const
- {
- ConvWeightsDescription desc;
- desc.layout = ConvWeightsLayout::kOHWIOGroupI4O4;
- desc.output_group_size = conv_params_.block_size.z;
- return desc;
- }
-
- struct ConvParams
- {
- int3 block_size = int3(1, 1, 1);
- int element_size = 4; // can be 4, 8 or 16
-
- // By default a 2d convolution uses the same weights for all W/H positions, but in
- // some cases separate weights for the H dimension are needed, and the convolution
- // kernel requires only very small modifications to support this.
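// [Editor's annotation, not part of the upstream diff] The flag below is set by
// CreateConvBuffer1x1Wino4x4To6x6 (see the .cc above): with Winograd-transformed
// weights each output row Y needs its own weight slice, so the generated kernel
// offsets weights_cache by (Z * src_tensor.Height() + Y * block_size.z) *
// src_tensor.Slices() instead of Z * src_tensor.Slices().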
- bool different_weights_for_height = false; - }; - -private: - ConvBuffer1x1(const OperationDef &definition, const ConvParams &conv_params); - friend ConvBuffer1x1 CreateConvBuffer1x1(const DeviceInfo &device_info, - const OperationDef &definition, - const Convolution2DAttributes &attr, const BHWC *shape); - friend ConvBuffer1x1 CreateConvBuffer1x1(const DeviceInfo &device_info, - const OperationDef &definition, - const FullyConnectedAttributes &attr, const BHWC *shape); - friend ConvBuffer1x1 CreateConvBuffer1x1Wino4x4To6x6(const DeviceInfo &device_info, - const OperationDef &definition, - const Convolution2DAttributes &attr, - const BHWC *shape); - friend ConvBuffer1x1 CreateConvBuffer1x1DynamicWeights(const DeviceInfo &device_info, - const OperationDef &definition, - const Convolution2DAttributes &attr, - const BHWC &weights_shape, - const BHWC *dst_shape); - - template <DataType T> - void UploadData(const InternalTensor<OHWI, T> &weights, const InternalTensor<Linear, T> &biases); - template <DataType T> void UploadDataForWinograd4x4To6x6(const InternalTensor<OHWI, T> &weights); - - template <DataType T> void UploadWeights(const InternalTensor<OHWI, T> &weights); - - template <DataType T> void UploadBiases(const InternalTensor<Linear, T> &biases); - - std::string GenerateConvBuffer1x1(const OperationDef &op_def, - const ConvBuffer1x1::ConvParams &conv_params, Arguments *args); - - ConvParams conv_params_; -}; - -template <DataType T> -void ConvBuffer1x1::UploadData(const InternalTensor<OHWI, T> &weights, - const InternalTensor<Linear, T> &biases) -{ - UploadWeights(weights); - UploadBiases(biases); -} - -template <DataType T> -void ConvBuffer1x1::UploadDataForWinograd4x4To6x6(const InternalTensor<OHWI, T> &weights) -{ - InternalTensor<OHWI, T> wino_weights; - RearrangeWeightsToWinograd4x4To6x6Weights(weights, &wino_weights); - UploadWeights(wino_weights); - InternalTensor<Linear, DataType::FLOAT32> bias; - bias.shape = Linear(weights.shape.o); - bias.data.resize(weights.shape.o, 0.0f); - UploadBiases(bias); -} - -template <DataType T> void ConvBuffer1x1::UploadWeights(const InternalTensor<OHWI, T> &weights) -{ - const int dst_depth = DivideRoundUp(weights.shape.o, 4); - const int src_depth = DivideRoundUp(weights.shape.i, 4); - - const bool f32_weights = definition_.precision == CalculationsPrecision::F32; - const int float4_size = sizeof(float4); - // TODO - // f32_weights ? sizeof(float4) : sizeof(half4); - - const int dst_depth_aligned = AlignByN(dst_depth, conv_params_.block_size.z); - const int elements_count = weights.shape.h * weights.shape.w * src_depth * dst_depth_aligned * 4; - - BufferDescriptor desc; - desc.element_type = f32_weights ? 
DataType::FLOAT32 : DataType::FLOAT16; - desc.element_size = 16; - desc.memory_type = MemoryType::GLOBAL; - desc.size = float4_size * elements_count; - desc.data.resize(desc.size); - - if (f32_weights) - { - float4 *ptr = reinterpret_cast<float4 *>(desc.data.data()); - RearrangeWeightsToOHWIOGroupI4O4(weights, conv_params_.block_size.z, - absl::MakeSpan(ptr, elements_count)); - } - // else - // { - // half4 *ptr = reinterpret_cast<half4 *>(desc.data.data()); - // RearrangeWeightsToOHWIOGroupI4O4(weights, conv_params_.block_size.z, - // absl::MakeSpan(ptr, elements_count)); - // } - - args_.AddObject("weights", absl::make_unique<BufferDescriptor>(std::move(desc))); -} - -template <DataType T> void ConvBuffer1x1::UploadBiases(const InternalTensor<Linear, T> &biases) -{ - TensorLinearDescriptor desc; - desc.storage_type = LinearStorageType::BUFFER; - desc.element_type = definition_.GetDataType(); - int depth = AlignByN(biases.shape.v, 4 * conv_params_.block_size.z) / 4; - desc.UploadLinearData(biases, depth); - args_.AddObject("biases", absl::make_unique<TensorLinearDescriptor>(std::move(desc))); -} - -bool IsConvBuffer1x1Supported(const OperationDef &definition, const Convolution2DAttributes &attr); - -bool IsConvBuffer1x1Supported(const OperationDef &definition, const BHWC &weights_shape, - const Convolution2DAttributes &attr); - -ConvBuffer1x1 CreateConvBuffer1x1(const DeviceInfo &device_info, const OperationDef &definition, - const Convolution2DAttributes &attr, const BHWC *shape = nullptr); - -ConvBuffer1x1 CreateConvBuffer1x1(const DeviceInfo &device_info, const OperationDef &definition, - const FullyConnectedAttributes &attr, - const BHWC *shape = nullptr); - -ConvBuffer1x1 CreateConvBuffer1x1DynamicWeights(const DeviceInfo &device_info, - const OperationDef &definition, - const Convolution2DAttributes &attr, - const BHWC &weights_shape, - const BHWC *dst_shape = nullptr); - -ConvBuffer1x1 CreateConvBuffer1x1Wino4x4To6x6(const DeviceInfo &device_info, - const OperationDef &definition, - const Convolution2DAttributes &attr, - const BHWC *shape = nullptr); - -} // namespace gpu_cl -} // namespace backend -} // namespace onert - -#endif // __ONERT_BACKEND_GPU_CL_OPENCL_KERNELS_CONV_BUFFER_1X1_H__ diff --git a/runtime/onert/backend/gpu_cl/open_cl/kernels/ConvConstants.cc b/runtime/onert/backend/gpu_cl/open_cl/kernels/ConvConstants.cc deleted file mode 100644 index 0a51bab5c..000000000 --- a/runtime/onert/backend/gpu_cl/open_cl/kernels/ConvConstants.cc +++ /dev/null @@ -1,282 +0,0 @@ -/* - * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright 2019 The TensorFlow Authors. All Rights Reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "open_cl/kernels/ConvConstants.h" - -#include <string> -#include <utility> - -#include "open_cl/kernels/Util.h" -#include "open_cl/kernels/WorkGroupPicking.h" -#include "open_cl/Precision.h" - -namespace onert -{ -namespace backend -{ -namespace gpu_cl -{ -namespace -{ -// Adreno can provide up to ~3-4KB of constant memory, but in some cases even -// 3KB can have very bad performance. -int GetAdrenoOptimalMaxConstantSize(int gpu_version) -{ - if (gpu_version < 600) - { - return 256 * 10; // 2.5KB - } - else - { - return 256 * 14; // 3.5KB - } -} - -int GetOptimalMaxConstantSize(const DeviceInfo &info) -{ - if (!info.IsAdreno()) - { - // In general we do not expect that this kernel will be used with non Adreno - // so as it tuned for __constant memory that have big profit on Adreno - return 1024; // 1KB - } - else - { - return GetAdrenoOptimalMaxConstantSize(info.adreno_info.gpu_version); - } -} - -std::string GenerateConvolutionConstantCode(const OperationDef &op_def, const OHWI &weights_shape, - bool stride_correction, GPUOperation *op) -{ - auto src_desc = op_def.src_tensors[0]; - src_desc.SetTextureAddressMode(TextureAddressMode::ZERO); - if (op_def.IsBatchSupported()) - { - src_desc.SetStateVar("BatchedWidth", "true"); - } - op->AddSrcTensor("src_tensor", src_desc); - - auto dst_desc = op_def.dst_tensors[0]; - if (op_def.IsBatchSupported()) - { - dst_desc.SetStateVar("BatchedWidth", "true"); - } - op->AddDstTensor("dst_tensor", dst_desc); - - std::string c = GetCommonDefines(op_def.precision); - - const int out_z = DivideRoundUp(weights_shape.o, 4); - const std::string kOutZ = std::to_string(out_z); - const int src_depth = DivideRoundUp(weights_shape.i, 4); - - const auto src_tensor_type = op_def.src_tensors[0].storage_type; - const bool manual_clamp = src_tensor_type == TensorStorageType::BUFFER || - src_tensor_type == TensorStorageType::IMAGE_BUFFER; - - switch (op_def.precision) - { - case CalculationsPrecision::F32: - case CalculationsPrecision::F16: - c += "#define CONV4(R, SRC, F, i) \\\n"; - c += " R += SRC.x * F[i + 0]; \\\n"; - c += " R += SRC.y * F[i + 1]; \\\n"; - c += " R += SRC.z * F[i + 2]; \\\n"; - c += " R += SRC.w * F[i + 3]; \n"; - - c += "#define CONV3(R, SRC, F, i) \\\n"; - c += " R += SRC.x * F[i + 0]; \\\n"; - c += " R += SRC.y * F[i + 1]; \\\n"; - c += " R += SRC.z * F[i + 2]; \n"; - - c += "#define CONV2(R, SRC, F, i) \\\n"; - c += " R += SRC.x * F[i + 0]; \\\n"; - c += " R += SRC.y * F[i + 1]; \n"; - - c += "#define CONV1(R, SRC, F, i) \\\n"; - c += " R += SRC * F[i + 0]; \n"; - break; - case CalculationsPrecision::F32_F16: - c += "#define CONV4(R, SRC, F, i) \\\n"; - c += " R += convert_float4(SRC.x * F[i + 0] + SRC.y * F[i + 1]"; - c += " + SRC.z * F[i + 2] + SRC.w * F[i + 3]);\n"; - - c += "#define CONV3(R, SRC, F, i) \\\n"; - c += " R += convert_float4(SRC.x * F[i + 0] + SRC.y * F[i + 1]"; - c += " + SRC.z * F[i + 2]);\n"; - - c += "#define CONV2(R, SRC, F, i) \\\n"; - c += " R += convert_float4(SRC.x * F[i + 0] + SRC.y * F[i + 1]);\n"; - - c += "#define CONV1(R, SRC, F, i) \\\n"; - c += " R += convert_float4(SRC * F[i + 0]);\n"; - break; - } - - const std::string postfixes[] = {".x", ".xy", ".xyz", ""}; - - c += "__kernel void main_function(\n"; - c += "$0) {\n"; - c += " int X = get_global_id(0);\n"; - c += " int Y = get_global_id(1);\n"; - c += " if (X >= args.dst_tensor.Width() || Y >= args.dst_tensor.Height()) " - "return;\n"; - if (stride_correction) - { - c += " int start_x = " + - GetXStrideCorrectedV2("X", 
"args.src_tensor.Batch()", "args.stride_x", "args.padding_x") + - ";\n"; - } - else - { - if (op_def.IsBatchSupported()) - { - c += " int start_x = X * args.stride_x + args.padding_x * " - "args.src_tensor.Batch();\n"; - } - else - { - c += " int start_x = X * args.stride_x + args.padding_x;\n"; - } - } - c += " int start_y = Y * args.stride_y + args.padding_y;\n"; - c += " ACCUM_FLT4 r[" + kOutZ + "];\n"; - c += " for (int i = 0; i < " + kOutZ + "; ++i) {\n"; - c += " r[i] = (ACCUM_FLT4)(0.0f, 0.0f, 0.0f, 0.0f);\n"; - c += " }\n"; - int filters_counter = 0; - for (int s = 0; s < src_depth; ++s) - { - const int ch_count = std::min(4, weights_shape.i - s * 4); - const std::string s_conv = "CONV" + std::to_string(ch_count); - const std::string s_count = ch_count == 1 ? "" : std::to_string(ch_count); - const std::string s_type = absl::StrCat("FLT", s_count); - const std::string s_postfix = postfixes[ch_count - 1]; - const std::string dilation_x = - op_def.IsBatchSupported() ? "args.dilation_x * args.src_tensor.Batch()" : "args.dilation_x"; - for (int ky = 0; ky < weights_shape.h; ++ky) - { - std::string s_y = absl::StrCat("(start_y + ", ky, " * args.dilation_y)"); - if (manual_clamp) - { - c += " {\n"; - c += " bool y_out = " + s_y + " < 0 || " + s_y + " >= args.src_tensor.Height();\n"; - } - for (int kx = 0; kx < weights_shape.w; ++kx) - { - c += " {\n"; - std::string s_x = absl::StrCat("(start_x + ", kx, " * " + dilation_x + ")"); - if (manual_clamp) - { - c += " bool x_out = " + s_x + "< 0 || " + s_x + ">= args.src_tensor.Width();\n"; - c += " " + s_type + " src = x_out || y_out ?"; - c += "(" + s_type + ")(0.0) : args.src_tensor.Read(" + s_x + ", " + s_y + ", " + - std::to_string(s) + ")" + s_postfix + ";\n"; - } - else - { - c += " " + s_type + " src = args.src_tensor.Read(" + s_x + ", " + s_y + ", " + - std::to_string(s) + ")" + s_postfix + ";\n"; - } - for (int d = 0; d < out_z; ++d) - { - c += " " + s_conv + "(r[" + std::to_string(d) + "], src, args.weigths.GetPtr(),"; - c += " " + std::to_string(filters_counter) + ");\n"; - filters_counter += ch_count; - } - c += " }\n"; - } - if (manual_clamp) - { - c += " }\n"; - } - } - } - for (int i = 0; i < out_z; ++i) - { - std::string s_i = std::to_string(i); - c += " {\n"; - c += " FLT4 res = TO_FLT4(r[" + s_i + "]) + args.biases.Read(" + s_i + ");\n"; - c += " args.dst_tensor.Write(res, X, Y, " + s_i + ");\n"; - c += " }\n"; - } - c += "}\n"; - return c; -} - -} // namespace - -bool IsConvConstantsSupported(const DeviceInfo &device_info, const OperationDef &definition, - const Convolution2DAttributes &attr) -{ - if (device_info.IsAMD() && definition.precision != CalculationsPrecision::F32 && - definition.src_tensors[0].storage_type != TensorStorageType::BUFFER) - { - // BUG, some AMD gpus crashe without it - return false; - } - - const auto &w_shape = attr.weights.shape; - const int dst_channels = AlignByN(w_shape.o, 4); - const int filters_count = w_shape.i * dst_channels * w_shape.h * w_shape.w; - const int float_size = sizeof(float); - // TODO F32 and F16 - // definition.precision == CalculationsPrecision::F32 ? 
sizeof(float) : sizeof(half); - const int filters_buffer_size = filters_count * float_size; - const int kConstantMaxSize = GetOptimalMaxConstantSize(device_info); - const int flt4_registers = DivideRoundUp(w_shape.o, 4); - return filters_buffer_size <= kConstantMaxSize && flt4_registers <= 8; -} - -GPUOperation CreateConvConstants(const DeviceInfo &device_info, const OperationDef &definition, - const Convolution2DAttributes &attr) -{ - GPUOperation op(definition); - UploadWeightsForConvConstants(attr.weights, definition.precision, &op); - op.args_.AddInt("stride_x", attr.strides.w); - op.args_.AddInt("stride_y", attr.strides.h); - op.args_.AddInt("padding_x", -attr.padding.prepended.w); - op.args_.AddInt("padding_y", -attr.padding.prepended.h); - op.args_.AddInt("dilation_x", attr.dilations.w); - op.args_.AddInt("dilation_y", attr.dilations.h); - op.tensor_to_grid_ = TensorToGrid::kWBToX_HDToY_ZIs1; - - const bool stride_correction = definition.IsBatchSupported() && attr.strides.w != 1; - op.code_ = - GenerateConvolutionConstantCode(definition, attr.weights.shape, stride_correction, &op); - if (definition.precision == CalculationsPrecision::F16 && device_info.IsAdreno3xx()) - { - op.compiler_options_.push_back(CompilerOptions::ADRENO_FULL_SIMD_LINE); - } - if (definition.precision != CalculationsPrecision::F32 && device_info.IsPowerVR()) - { - // BUG, some PowerVRs (GE8320) produce incorrect result without it - op.compiler_options_.push_back(CompilerOptions::CL_OPT_DISABLE); - } - - TensorLinearDescriptor desc; - desc.storage_type = LinearStorageType::BUFFER; - desc.element_type = definition.GetDataType(); - desc.memory_type = MemoryType::CONSTANT; - desc.UploadLinearData(attr.bias); - op.args_.AddObject("biases", absl::make_unique<TensorLinearDescriptor>(std::move(desc))); - return op; -} - -} // namespace gpu_cl -} // namespace backend -} // namespace onert diff --git a/runtime/onert/backend/gpu_cl/open_cl/kernels/ConvConstants.h b/runtime/onert/backend/gpu_cl/open_cl/kernels/ConvConstants.h deleted file mode 100644 index be6670c53..000000000 --- a/runtime/onert/backend/gpu_cl/open_cl/kernels/ConvConstants.h +++ /dev/null @@ -1,137 +0,0 @@ -/* - * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright 2019 The TensorFlow Authors. All Rights Reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifndef __ONERT_BACKEND_GPU_CL_OPENCL_KERNELS_CONV_CONSTANTS_H__ -#define __ONERT_BACKEND_GPU_CL_OPENCL_KERNELS_CONV_CONSTANTS_H__ - -#include "open_cl/Buffer.h" -#include "open_cl/kernels/GpuOperation.h" -#include "open_cl/LinearStorage.h" -#include "open_cl/Tensor.h" -#include "open_cl/Util.h" -#include "open_cl/DataType.h" -#include "open_cl/Operations.h" -#include "open_cl/Shape.h" -#include "open_cl/Status.h" -#include "open_cl/Tensor.h" -#include "open_cl/Types.h" - -namespace onert -{ -namespace backend -{ -namespace gpu_cl -{ - -template <DataType S, typename T> -void RearrangeWeightsForConvConstants(const InternalTensor<OHWI, S> &weights, absl::Span<T> dst) -{ - const int dst_depth = DivideRoundUp(weights.shape.o, 4); - const int src_depth = DivideRoundUp(weights.shape.i, 4); - const int kernel_x = weights.shape.w; - const int kernel_y = weights.shape.h; - - int counter = 0; - for (int s = 0; s < src_depth; ++s) - { - for (int y = 0; y < kernel_y; ++y) - { - for (int x = 0; x < kernel_x; ++x) - { - for (int d = 0; d < dst_depth; ++d) - { - const int channels_count = std::min(4, weights.shape.i - s * 4); - T filters[4]; - for (int i = 0; i < 4; ++i) - { - for (int j = 0; j < channels_count; ++j) - { - const int s_ch = s * 4 + j; - const int d_ch = d * 4 + i; - if (s_ch < weights.shape.i && d_ch < weights.shape.o) - { - const int f_index = weights.shape.LinearIndex({d_ch, y, x, s_ch}); - filters[i][j] = weights.data[f_index]; - } - else - { - filters[i][j] = 0.0f; - } - } - } - T filters_new[4]; - for (int i = 0; i < 4; ++i) - { - for (int j = 0; j < 4; ++j) - { - filters_new[i][j] = filters[j][i]; - } - } - for (int i = 0; i < channels_count; ++i) - { - dst[counter++] = filters_new[i]; - } - } - } - } - } -} - -template <DataType T> -void UploadWeightsForConvConstants(const InternalTensor<OHWI, T> &weights, - CalculationsPrecision precision, GPUOperation *op) -{ - const int dst_depth = DivideRoundUp(weights.shape.o, 4); - const int kernel_x = weights.shape.w; - const int kernel_y = weights.shape.h; - - const bool f32_weights = precision == CalculationsPrecision::F32; - const int float_size = f32_weights ? 4 : 2; - const int float_count = weights.shape.i * dst_depth * 4 * kernel_x * kernel_y; - - BufferDescriptor desc; - desc.element_type = f32_weights ? 
DataType::FLOAT32 : DataType::FLOAT16; - desc.element_size = 4; - desc.memory_type = MemoryType::CONSTANT; - desc.size = float_size * float_count; - desc.data.resize(desc.size); - - if (f32_weights) - { - float4 *ptr = reinterpret_cast<float4 *>(desc.data.data()); - RearrangeWeightsForConvConstants(weights, absl::MakeSpan(ptr, float_count / 4)); - } - // else - // { - // half4 *ptr = reinterpret_cast<half4 *>(desc.data.data()); - // RearrangeWeightsForConvConstants(weights, absl::MakeSpan(ptr, float_count / 4)); - // } - - op->args_.AddObject("weigths", absl::make_unique<BufferDescriptor>(std::move(desc))); -} - -bool IsConvConstantsSupported(const DeviceInfo &device_info, const OperationDef &definition, - const Convolution2DAttributes &attr); - -GPUOperation CreateConvConstants(const DeviceInfo &device_info, const OperationDef &definition, - const Convolution2DAttributes &attr); - -} // namespace gpu_cl -} // namespace backend -} // namespace onert - -#endif // __ONERT_BACKEND_GPU_CL_OPENCL_KERNELS_CONV_CONSTANTS_H__ diff --git a/runtime/onert/backend/gpu_cl/open_cl/kernels/ConvPowervr.cc b/runtime/onert/backend/gpu_cl/open_cl/kernels/ConvPowervr.cc deleted file mode 100644 index 5cb0c2719..000000000 --- a/runtime/onert/backend/gpu_cl/open_cl/kernels/ConvPowervr.cc +++ /dev/null @@ -1,1653 +0,0 @@ -/* - * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright 2019 The TensorFlow Authors. All Rights Reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "open_cl/kernels/ConvPowervr.h" - -#include <algorithm> -#include <string> -#include <utility> - -#include "absl/strings/substitute.h" -#include "open_cl/kernels/Util.h" -#include "open_cl/kernels/WorkGroupPicking.h" -#include "open_cl/Precision.h" -#include "open_cl/TensorType.h" -#include "open_cl/DataType.h" -#include "open_cl/Shape.h" -#include "open_cl/Status.h" - -namespace onert -{ -namespace backend -{ -namespace gpu_cl -{ -namespace -{ -std::string GenerateUploadByThreads(const std::string &local_ptr_name, - const std::string &global_ptr_name, - const std::string &global_offset_name, - const std::string &lid_name, int total_work_items, - int elements_to_upload) -{ - std::string c; - std::string offset = global_offset_name.empty() ? 
"" : global_offset_name + " + "; - const int groups = elements_to_upload / total_work_items; - const int reminder = elements_to_upload % total_work_items; - for (int i = 0; i < groups; ++i) - { - c += " " + local_ptr_name + "[" + lid_name + " + " + std::to_string(total_work_items * i) + - "] = " + global_ptr_name + "[" + offset + lid_name + " + " + - std::to_string(total_work_items * i) + "];\n"; - } - if (reminder != 0) - { - c += " if (" + lid_name + " < " + std::to_string(reminder) + ") {\n"; - c += " " + local_ptr_name + "[" + lid_name + " + " + - std::to_string(total_work_items * groups) + "] = " + global_ptr_name + "[" + offset + - lid_name + " + " + std::to_string(total_work_items * groups) + "];\n"; - c += " }\n"; - } - return c; -} - -std::string GenerateAsyncUpload(const std::string &local_ptr_name, - const std::string &global_ptr_name, - const std::string &global_offset_name, int elements_to_upload) -{ - std::string c; - std::string offset = global_offset_name.empty() ? "" : " + " + global_offset_name; - c += " async_work_group_copy(" + local_ptr_name + ", " + global_ptr_name + offset + ", " + - std::to_string(elements_to_upload) + ", 0);\n"; - return c; -} - -std::string GenerateBlockCoords(const int4 &block_size, const int3 &work_group_launch_order, - bool linear_spatial, bool need_depth) -{ - std::string c; - int3 launch_remap; - launch_remap[work_group_launch_order.x] = 0; - launch_remap[work_group_launch_order.y] = 1; - launch_remap[work_group_launch_order.z] = 2; - if (linear_spatial) - { - if (work_group_launch_order[0] == 0) - { - c += " int linear_spatial = get_global_id(0);\n"; - } - else - { - c += " int linear_spatial = get_group_id(" + std::to_string(launch_remap[0]) + - ") * get_local_size(0) + get_local_id(0);\n"; - } - if (need_depth) - { - c += " int DST_X = (linear_spatial % args.task_size_x) * " + std::to_string(block_size.x) + - ";\n"; - c += " linear_spatial = linear_spatial / args.task_size_x;\n"; - c += " int DST_Y = (linear_spatial % args.task_size_y) * " + std::to_string(block_size.y) + - ";\n"; - c += " int DST_Z = (linear_spatial / args.task_size_y) * " + std::to_string(block_size.z) + - ";\n"; - } - else - { - c += " int DST_Y = (linear_spatial / args.task_size_x) * " + std::to_string(block_size.y) + - ";\n"; - c += " int DST_X = (linear_spatial % args.task_size_x) * " + std::to_string(block_size.x) + - ";\n"; - } - if (work_group_launch_order[1] == 1) - { - c += " int DST_S = get_global_id(1) * " + std::to_string(block_size.w) + ";\n"; - } - else - { - c += " int DST_S = (get_group_id(" + std::to_string(launch_remap[1]) + - ") * get_local_size(1) + get_local_id(1)) * " + std::to_string(block_size.w) + ";\n"; - } - } - else - { - if (work_group_launch_order[0] == 0) - { - c += " int DST_X = get_global_id(0) * " + std::to_string(block_size.x) + ";\n"; - } - else - { - c += " int DST_X = (get_group_id(" + std::to_string(launch_remap[0]) + - ") * get_local_size(0) + get_local_id(0)) * " + std::to_string(block_size.x) + ";\n"; - } - std::string global_id_1; - if (work_group_launch_order[1] == 1) - { - global_id_1 = "get_global_id(1)"; - } - else - { - global_id_1 = "(get_group_id(" + std::to_string(launch_remap[1]) + - ") * get_local_size(1) + get_local_id(1))"; - } - if (need_depth) - { - c += " int linear_id_1 = " + global_id_1 + ";\n"; - c += - " int DST_Z = (linear_id_1 / args.task_size_y) * " + std::to_string(block_size.z) + ";\n"; - c += - " int DST_Y = (linear_id_1 % args.task_size_y) * " + std::to_string(block_size.y) + ";\n"; - } - else - { - c 
+= " int DST_Y = " + global_id_1 + " * " + std::to_string(block_size.y) + ";\n"; - } - if (work_group_launch_order[2] == 2) - { - c += " int DST_S = get_global_id(2) * " + std::to_string(block_size.w) + ";\n"; - } - else - { - c += " int DST_S = (get_group_id(" + std::to_string(launch_remap[2]) + - ") * get_local_size(2) + get_local_id(2)) * " + std::to_string(block_size.w) + ";\n"; - } - } - - return c; -} -} // namespace - -ConvPowerVR::ConvPowerVR(const OperationDef &definition, const Convolution2DAttributes &attr, - const DeviceInfo &device_info, const BHWC *dst_shape) - : GPUOperation(definition), stride_(attr.strides.w, attr.strides.h, 1, 1), - padding_(-attr.padding.prepended.w, -attr.padding.prepended.h, 0, 0), - kernel_size_(attr.weights.shape.w, attr.weights.shape.h, 1, 1), - dilation_(attr.dilations.w, attr.dilations.h, 1, 1), - conv_params_(GuessBestParams(device_info, definition, attr, dst_shape)) -{ -} - -ConvPowerVR::ConvPowerVR(const OperationDef &definition, const Convolution2DAttributes &attr, - const BHWC &weights_shape, const DeviceInfo &device_info, - const BHWC *dst_shape) - : GPUOperation(definition), stride_(attr.strides.w, attr.strides.h, 1, 1), - padding_(-attr.padding.prepended.w, -attr.padding.prepended.h, 0, 0), - kernel_size_(weights_shape.w, weights_shape.h, 1, 1), - dilation_(attr.dilations.w, attr.dilations.h, 1, 1), - conv_params_(GuessBestParams(device_info, definition, attr, weights_shape, dst_shape)) -{ -} - -ConvPowerVR::ConvPowerVR(const OperationDef &definition, const FullyConnectedAttributes &attr, - const DeviceInfo &device_info, const BHWC *dst_shape) - : GPUOperation(definition), stride_(1, 1, 1, 1), padding_(0, 0, 0, 0), kernel_size_(1, 1, 1, 1), - dilation_(1, 1, 1, 1), conv_params_(GuessBestParams(device_info, definition, attr, dst_shape)) -{ -} - -ConvPowerVR::ConvPowerVR(const OperationDef &definition) - : GPUOperation(definition), stride_(1, 1, 1, 1), padding_(0, 0, 0, 0), kernel_size_(1, 1, 1, 1), - dilation_(1, 1, 1, 1) -{ -} - -ConvPowerVR::ConvPowerVR(ConvPowerVR &&operation) - : GPUOperation(std::move(operation)), stride_(operation.stride_), padding_(operation.padding_), - kernel_size_(operation.kernel_size_), dilation_(operation.dilation_), - conv_params_(operation.conv_params_) -{ -} - -ConvPowerVR::ConvPowerVR(const OperationDef &definition, const Convolution3DAttributes &attr, - const DeviceInfo &device_info, const BHWDC *dst_shape) - : GPUOperation(definition), stride_(attr.strides.w, attr.strides.h, attr.strides.d, 1), - padding_(-attr.padding.prepended.w, -attr.padding.prepended.h, -attr.padding.prepended.d, 0), - kernel_size_(attr.weights.shape.w, attr.weights.shape.h, attr.weights.shape.d, 1), - dilation_(attr.dilations.w, attr.dilations.h, attr.dilations.d, 1), - conv_params_(GuessBestParams(device_info, definition, attr, dst_shape)) -{ -} - -ConvPowerVR &ConvPowerVR::operator=(ConvPowerVR &&operation) -{ - if (this != &operation) - { - std::swap(stride_, operation.stride_); - std::swap(padding_, operation.padding_); - std::swap(kernel_size_, operation.kernel_size_); - std::swap(dilation_, operation.dilation_); - std::swap(conv_params_, operation.conv_params_); - GPUOperation::operator=(std::move(operation)); - } - return *this; -} - -void ConvPowerVR::GenerateCode(const DeviceInfo &device_info) -{ - if (conv_params_.linear_spatial) - { - grid_dimension_ = 2; - } - const bool stride_correction = definition_.IsBatchSupported() && stride_.x != 1; - code_ = GenerateConv(device_info, definition_, stride_correction, conv_params_); 
- if (definition_.precision == CalculationsPrecision::F16 && device_info.IsPowerVR()) - { - compiler_options_.push_back(CompilerOptions::POWERVR_FP16); - } - if (conv_params_.IsPrivateMemBroadcast() && device_info.IsCL20OrHigher()) - { - compiler_options_.push_back(CompilerOptions::CL_2_0); - } - bool kernel_is_trivial = conv_params_.x_kernel_is_1 && conv_params_.y_kernel_is_1; - if (definition_.src_tensors[0].HasAxis(Axis::DEPTH)) - { - kernel_is_trivial = kernel_is_trivial & conv_params_.z_kernel_is_1; - } - if (device_info.IsAdreno3xx() && definition_.precision == CalculationsPrecision::F16 && - kernel_is_trivial) - { - compiler_options_.push_back(CompilerOptions::ADRENO_FULL_SIMD_LINE); - } -} - -absl::Status ConvPowerVR::BindArguments(ArgumentsBinder *args) -{ - if (!conv_params_.x_kernel_is_1) - { - RETURN_IF_ERROR(args->SetInt("stride_x", stride_.x)); - RETURN_IF_ERROR(args->SetInt("padding_x", padding_.x * src_[0]->Batch())); - RETURN_IF_ERROR(args->SetInt("kernel_size_x", kernel_size_.x)); - RETURN_IF_ERROR(args->SetInt("dilation_x", dilation_.x * src_[0]->Batch())); - } - if (!conv_params_.y_kernel_is_1) - { - RETURN_IF_ERROR(args->SetInt("stride_y", stride_.y)); - RETURN_IF_ERROR(args->SetInt("padding_y", padding_.y)); - RETURN_IF_ERROR(args->SetInt("kernel_size_y", kernel_size_.y)); - RETURN_IF_ERROR(args->SetInt("dilation_y", dilation_.y)); - } - if (definition_.src_tensors[0].HasAxis(Axis::DEPTH) && !conv_params_.z_kernel_is_1) - { - RETURN_IF_ERROR(args->SetInt("stride_z", stride_.z)); - RETURN_IF_ERROR(args->SetInt("padding_z", padding_.z)); - RETURN_IF_ERROR(args->SetInt("kernel_size_z", kernel_size_.z)); - RETURN_IF_ERROR(args->SetInt("dilation_z", dilation_.z)); - } - if (conv_params_.linear_spatial) - { - const int grid_x = - DivideRoundUp(dst_[0]->Width() * dst_[0]->Batch(), conv_params_.block_size.x); - RETURN_IF_ERROR(args->SetInt("task_size_x", grid_x)); - } - if (definition_.src_tensors[0].HasAxis(Axis::DEPTH)) - { - const int task_size_y = DivideRoundUp(dst_[0]->Height(), conv_params_.block_size.y); - RETURN_IF_ERROR(args->SetInt("task_size_y", task_size_y)); - } - return absl::OkStatus(); -} - -int3 ConvPowerVR::GetGridSize() const -{ - const int task_size_x = - DivideRoundUp(dst_[0]->Width() * dst_[0]->Batch(), conv_params_.block_size.x); - const int task_size_y = DivideRoundUp(dst_[0]->Height(), conv_params_.block_size.y); - const int task_size_z = DivideRoundUp(dst_[0]->Depth(), conv_params_.block_size.z); - const int task_size_s = DivideRoundUp(dst_[0]->Slices(), conv_params_.block_size.w); - int3 wg; - - if (conv_params_.linear_spatial) - { - int grid_x = task_size_x * task_size_y; - if (definition_.src_tensors[0].HasAxis(Axis::DEPTH)) - { - grid_x *= task_size_z; - } - return int3(grid_x, task_size_s, 1); - } - else - { - int grid_y = task_size_y; - if (definition_.src_tensors[0].HasAxis(Axis::DEPTH)) - { - grid_y *= task_size_z; - } - return int3(task_size_x, grid_y, task_size_s); - } -} - -void ConvPowerVR::GetPossibleKernelWorkGroups(TuningType tuning_type, const DeviceInfo &device_info, - const KernelInfo &kernel_info, - std::vector<int3> *work_groups) const -{ - if (conv_params_.weights_upload_type == WeightsUploadType::LOCAL_MEM_ASYNC_SUBGROUP || - conv_params_.weights_upload_type == WeightsUploadType::LOCAL_MEM_BY_THREADS || - conv_params_.fixed_work_group_size) - { - work_groups->push_back(work_group_size_); - return; - } - GetPossibleWorkGroupsConv(tuning_type, device_info, kernel_info, grid_size_, work_groups); -} - -std::string 
ConvPowerVR::GenerateConv(const DeviceInfo &device_info, const OperationDef &op_def, - bool stride_correction, const ConvParams &conv_params) -{ - auto src_desc = op_def.src_tensors[0]; - src_desc.SetTextureAddressMode(TextureAddressMode::ZERO); - if (op_def.IsBatchSupported()) - { - src_desc.SetStateVar("BatchedWidth", "true"); - } - AddSrcTensor("src_tensor", src_desc); - if (op_def.src_tensors.size() == 2) - { - // dynamic weights - BufferDescriptor desc; - desc.element_type = op_def.src_tensors[1].data_type; - desc.element_size = 4; - desc.memory_type = - conv_params.weights_upload_type == ConvPowerVR::WeightsUploadType::CONSTANT_MEM - ? MemoryType::CONSTANT - : MemoryType::GLOBAL; - - AddSrcBuffer("weights", desc); - } - - const auto &src_def = op_def.src_tensors[0]; - - auto generate_id = [&](const std::string &x, const std::string &y, const std::string &z) { - std::string id; - if (src_def.HasAxis(Axis::WIDTH)) - { - id += "_w" + x; - } - if (src_def.HasAxis(Axis::HEIGHT)) - { - id += "_h" + y; - } - if (src_def.HasAxis(Axis::DEPTH)) - { - id += "_d" + z; - } - return id; - }; - - auto generate_id_full = [&](const std::string &x, const std::string &y, const std::string &z, - const std::string &s) { return generate_id(x, y, z) + "_s" + s; }; - - auto generate_check = [&](const std::string &x, const std::string &y, const std::string &z) { - std::string check; - const std::vector<Axis> axes{Axis::WIDTH, Axis::HEIGHT, Axis::DEPTH}; - const std::vector<std::string> names{"in_x", "in_y", "in_z"}; - const std::vector<bool> is_1{conv_params_.x_kernel_is_1, conv_params_.y_kernel_is_1, - conv_params_.z_kernel_is_1}; - const std::vector<std::string> coords{x, y, z}; - for (size_t i = 0; i < axes.size(); ++i) - { - const auto &axis = axes[i]; - if (src_def.HasAxis(axis) && !src_def.SupportsZeroClamp(axis) && !is_1[i]) - { - if (!check.empty()) - { - check += " && "; - } - check += names[i] + coords[i]; - } - } - return check; - }; - - auto dst_desc = op_def.dst_tensors[0]; - if (op_def.IsBatchSupported()) - { - dst_desc.SetStateVar("BatchedWidth", "true"); - } - AddDstTensor("dst_tensor", dst_desc); - - if (!conv_params_.x_kernel_is_1) - { - args_.AddInt("stride_x"); - args_.AddInt("padding_x"); - args_.AddInt("kernel_size_x"); - args_.AddInt("dilation_x"); - } - if (!conv_params_.y_kernel_is_1) - { - args_.AddInt("stride_y"); - args_.AddInt("padding_y"); - args_.AddInt("kernel_size_y"); - args_.AddInt("dilation_y"); - } - if (src_def.HasAxis(Axis::DEPTH) && !conv_params_.z_kernel_is_1) - { - args_.AddInt("stride_z"); - args_.AddInt("padding_z"); - args_.AddInt("kernel_size_z"); - args_.AddInt("dilation_z"); - } - if (conv_params_.linear_spatial) - { - args_.AddInt("task_size_x"); - } - if (src_def.HasAxis(Axis::DEPTH)) - { - args_.AddInt("task_size_y"); - } - - const bool need_local_mem = - conv_params.weights_upload_type == ConvPowerVR::WeightsUploadType::LOCAL_MEM_BY_THREADS || - conv_params.weights_upload_type == ConvPowerVR::WeightsUploadType::LOCAL_MEM_ASYNC_SUBGROUP; - - const int local_mem_size = conv_params.block_size.w * 4 * conv_params.src_depth_loop_size; - - const bool use_simd_broadcast = conv_params.IsPrivateMemBroadcast(); - const int simd_size = conv_params.simd_size; - - const bool late_oob_check = need_local_mem || use_simd_broadcast; - - const std::string weights_space = - conv_params.weights_upload_type == ConvPowerVR::WeightsUploadType::CONSTANT_MEM ? 
"__constant" - : "__global"; - - const std::string weights_data_type = - conv_params.weights_data_type == DataType::FLOAT32 ? "float4" : "half4"; - - const std::string weights_global_ptr = weights_space + " " + weights_data_type + "*"; - - std::string c = GetCommonDefines(op_def.precision); - if (use_simd_broadcast) - { - if (device_info.cl_version == OpenCLVersion::CL_2_0) - { - c += "#pragma OPENCL EXTENSION cl_khr_subgroups : enable\n"; - } - else if (device_info.SupportsExtension("cl_intel_subgroups")) - { - c += "#pragma OPENCL EXTENSION cl_intel_subgroups : enable\n"; - } - } - const int4 block_size = conv_params.block_size; - if (conv_params.fixed_work_group_size) - { - c += "__attribute__((reqd_work_group_size(" + std::to_string(work_group_size_.x) + ", " + - std::to_string(work_group_size_.y) + ", " + std::to_string(work_group_size_.z) + ")))\n"; - } - if (use_simd_broadcast && device_info.IsIntel()) - { - c += "__attribute__((intel_reqd_sub_group_size(" + std::to_string(simd_size) + ")))\n"; - } - std::string dst_oob_check; - if (src_def.HasAxis(Axis::DEPTH)) - { - if (conv_params.linear_spatial) - { - dst_oob_check = "DST_Z >= args.dst_tensor.Depth() || DST_S >= " - "args.dst_tensor.Slices()"; - } - else - { - dst_oob_check = "DST_X >= args.dst_tensor.Width() || DST_Z >= " - "args.dst_tensor.Depth() || DST_S >= args.dst_tensor.Slices()"; - } - } - else - { - if (conv_params.linear_spatial) - { - dst_oob_check = "DST_Y >= args.dst_tensor.Height() || DST_S >= " - "args.dst_tensor.Slices()"; - } - else - { - dst_oob_check = "DST_X >= args.dst_tensor.Width() || DST_Y >= " - "args.dst_tensor.Height() || DST_S >= args.dst_tensor.Slices()"; - } - } - c += "__kernel void main_function(\n"; - c += "$0) {\n"; - c += GenerateBlockCoords(conv_params.block_size, work_group_launch_order_, - conv_params.linear_spatial, src_def.HasAxis(Axis::DEPTH)); - if (!late_oob_check) - { - c += " if (" + dst_oob_check + ") {\n"; - c += " return;\n"; - c += " }\n"; - } - if (conv_params.weights_upload_type == ConvPowerVR::WeightsUploadType::LOCAL_MEM_BY_THREADS) - { - if (conv_params.linear_spatial) - { - c += " int lid = get_local_id(0);\n"; - } - else - { - c += " int lid = get_local_id(1) * " + std::to_string(work_group_size_.x) + - " + get_local_id(0);\n"; - } - } - if (use_simd_broadcast) - { - c += " int simd_id = get_sub_group_local_id();\n"; - } - for (int s = 0; s < block_size.w; ++s) - { - const std::string sind = std::to_string(s); - for (int z = 0; z < block_size.z; ++z) - { - const std::string zind = std::to_string(z); - for (int y = 0; y < block_size.y; ++y) - { - const std::string yind = std::to_string(y); - for (int x = 0; x < block_size.x; ++x) - { - const std::string xind = std::to_string(x); - c += " ACCUM_FLT4 r" + generate_id_full(xind, yind, zind, sind) + - " = (ACCUM_FLT4)(0.0f, 0.0f, 0.0f, 0.0f);\n"; - } - } - } - } - if (!conv_params_.x_kernel_is_1) - { - for (int x = 0; x < block_size.x; ++x) - { - const std::string xind = std::to_string(x); - const std::string xc = "(DST_X + " + xind + ")"; - if (stride_correction) - { - c += " int xc" + xind + " = " + - GetXStrideCorrected(xc, "args.src_tensor.Batch()", "args.stride_x", "args.padding_x") + - ";\n"; - } - else - { - c += " int xc" + xind + " = " + xc + " * args.stride_x + args.padding_x;\n"; - } - } - } - else - { - for (int x = 0; x < block_size.x; ++x) - { - const std::string xind = std::to_string(x); - c += " int xc" + xind + " = DST_X + " + xind + ";\n"; - if (!src_def.CanReadOutOfBorder(Axis::WIDTH)) - { - c += " xc" + xind 
+ " = clamp(xc" + xind + ", 0, args.src_tensor.Width() - 1);\n"; - } - } - } - if (!conv_params_.y_kernel_is_1) - { - for (int y = 0; y < block_size.y; ++y) - { - const std::string yind = std::to_string(y); - const std::string yc = "(DST_Y + " + yind + ")"; - c += " int yc" + yind + " = " + yc + " * args.stride_y + args.padding_y;\n"; - } - } - else - { - for (int y = 0; y < block_size.y; ++y) - { - const std::string yind = std::to_string(y); - c += " int yc" + yind + " = DST_Y + " + yind + ";\n"; - if (!src_def.CanReadOutOfBorder(Axis::HEIGHT)) - { - c += " yc" + yind + " = clamp(yc" + yind + ", 0, args.src_tensor.Height() - 1);\n"; - } - } - } - if (src_def.HasAxis(Axis::DEPTH)) - { - if (!conv_params_.z_kernel_is_1) - { - for (int z = 0; z < block_size.z; ++z) - { - const std::string zind = std::to_string(z); - const std::string zc = "(DST_Z + " + zind + ")"; - c += " int zc" + zind + " = " + zc + " * args.stride_z + args.padding_z;\n"; - } - } - else - { - for (int z = 0; z < block_size.z; ++z) - { - const std::string zind = std::to_string(z); - c += " int zc" + zind + " = DST_Z + " + zind + ";\n"; - if (!src_def.CanReadOutOfBorder(Axis::DEPTH)) - { - c += " zc" + zind + " = clamp(zc" + zind + ", 0, args.src_tensor.Depth() - 1);\n"; - } - } - } - } - bool trivial_kernel_size = conv_params_.x_kernel_is_1 && conv_params_.y_kernel_is_1; - if (src_def.HasAxis(Axis::DEPTH)) - { - trivial_kernel_size = trivial_kernel_size && conv_params_.z_kernel_is_1; - } - if (need_local_mem) - { - c += " __local " + weights_data_type + " weights_cache[" + std::to_string(local_mem_size) + - "];\n"; - } - else if (conv_params.AreWeightsBuffer()) - { - c += " " + weights_global_ptr + " weights_cache;\n"; - } - else if (!trivial_kernel_size) - { - c += " int filter_offset = 0;\n"; - } - if (conv_params.AreWeightsBuffer()) - { - if (conv_params.different_weights_for_height) - { - c += " " + weights_global_ptr + - " filters_loc = args.weights.GetPtr() + (DST_S * " - "args.src_tensor.Height() + DST_Y * " + - std::to_string(block_size.w) + ") * 4 * args.src_tensor.Slices();\n"; - } - else - { - std::string kernel_spatial_offset = ""; - if (!conv_params_.x_kernel_is_1) - { - kernel_spatial_offset += " * args.kernel_size_x"; - } - if (!conv_params_.y_kernel_is_1) - { - kernel_spatial_offset += " * args.kernel_size_y"; - } - if (src_def.HasAxis(Axis::DEPTH) && !conv_params_.z_kernel_is_1) - { - kernel_spatial_offset += " * args.kernel_size_z"; - } - c += " " + weights_global_ptr + - " filters_loc = args.weights.GetPtr() + DST_S * 4 * " - "args.src_tensor.Slices()" + - kernel_spatial_offset + ";\n"; - } - } - if (src_def.HasAxis(Axis::DEPTH) && !conv_params_.z_kernel_is_1) - { - c += " for (int kz = 0; kz < args.kernel_size_z; ++kz) {\n"; - for (int z = 0; z < block_size.z; ++z) - { - const std::string zck = "zck" + std::to_string(z); - c += " int zck" + std::to_string(z) + " = kz * args.dilation_z + zc" + std::to_string(z) + - ";\n"; - if (!src_def.SupportsZeroClamp(Axis::DEPTH)) - { - c += " bool in_z" + std::to_string(z) + " = " + zck + " >= 0 && " + zck + - " < args.src_tensor.Depth();\n"; - if (!src_def.CanReadOutOfBorder(Axis::DEPTH)) - { - c += " " + zck + " = clamp(" + zck + ", 0, args.src_tensor.Depth() - 1);\n"; - } - } - } - } - if (!conv_params_.y_kernel_is_1) - { - c += " for (int ky = 0; ky < args.kernel_size_y; ++ky) {\n"; - for (int y = 0; y < block_size.y; ++y) - { - const std::string yck = "yck" + std::to_string(y); - c += " int " + yck + " = ky * args.dilation_y + yc" + std::to_string(y) + ";\n"; - 
if (!src_def.SupportsZeroClamp(Axis::HEIGHT)) - { - c += " bool in_y" + std::to_string(y) + " = " + yck + " >= 0 && " + yck + - " < args.src_tensor.Height();\n"; - if (!src_def.CanReadOutOfBorder(Axis::HEIGHT)) - { - c += " " + yck + " = clamp(" + yck + ", 0, args.src_tensor.Height() - 1);\n"; - } - } - } - } - if (!conv_params_.x_kernel_is_1) - { - c += " for (int kx = 0; kx < args.kernel_size_x; ++kx) {\n"; - for (int x = 0; x < block_size.x; ++x) - { - const std::string xck = "xck" + std::to_string(x); - c += " int xck" + std::to_string(x) + " = kx * args.dilation_x + xc" + std::to_string(x) + - ";\n"; - if (!src_def.SupportsZeroClamp(Axis::WIDTH)) - { - c += " bool in_x" + std::to_string(x) + " = " + xck + " >= 0 && " + xck + - " < args.src_tensor.Width();\n"; - if (!src_def.CanReadOutOfBorder(Axis::WIDTH)) - { - c += " " + xck + " = clamp(" + xck + ", 0, args.src_tensor.Width() - 1);\n"; - } - } - } - } - const bool need_multiple_slice_strides = - src_def.ReturnsZeroForNegOneRead() && !trivial_kernel_size; - for (int z = 0; z < block_size.z; ++z) - { - const std::string zind = std::to_string(z); - for (int y = 0; y < block_size.y; ++y) - { - const std::string yind = std::to_string(y); - for (int x = 0; x < block_size.x; ++x) - { - const std::string xind = std::to_string(x); - std::string xc = conv_params.x_kernel_is_1 ? "xc" + xind : "xck" + xind; - std::string yc = conv_params.y_kernel_is_1 ? "yc" + yind : "yck" + yind; - const std::string id = generate_id(xind, yind, zind); - std::string coords = "" + xc + ", " + yc; - if (src_def.HasAxis(Axis::DEPTH)) - { - std::string zc = conv_params.z_kernel_is_1 ? "zc" + zind : "zck" + zind; - coords += ", " + zc; - } - if (src_def.IsLinear()) - { - c += " args.src_tensor.GetAddress(addr" + id + ", " + coords + ", 0);\n"; - if (need_multiple_slice_strides) - { - const std::string check = generate_check(xind, yind, zind); - c += " addr" + id + " = select(-1, addr" + id + ", (" + check + "));\n"; - c += - " int ds" + id + " = select(0, args.src_tensor.SliceStride(), (" + check + "));\n"; - } - } - } - } - } - if (src_def.IsLinear() && !need_multiple_slice_strides) - { - c += " int ds = args.src_tensor.SliceStride();\n"; - } - - auto declare_src = [&]() { - for (int z = 0; z < block_size.z; ++z) - { - const std::string zind = std::to_string(z); - for (int y = 0; y < block_size.y; ++y) - { - const std::string yind = std::to_string(y); - for (int x = 0; x < block_size.x; ++x) - { - const std::string xind = std::to_string(x); - const std::string id = generate_id(xind, yind, zind); - c += " " + weights_data_type + " src" + id + ";\n"; - } - } - } - }; - const bool conditional_read = device_info.IsMali(); - auto read_src = [&]() { - const std::string cl_type = ToCLDataType(conv_params.weights_data_type); - for (int z = 0; z < block_size.z; ++z) - { - const std::string zind = std::to_string(z); - for (int y = 0; y < block_size.y; ++y) - { - const std::string yind = std::to_string(y); - for (int x = 0; x < block_size.x; ++x) - { - const std::string xind = std::to_string(x); - std::string id = generate_id(xind, yind, zind); - const std::string check = generate_check(xind, yind, zind); - std::string address; - if (src_def.IsLinear()) - { - address = "addr" + id; - } - else - { - std::string xc = conv_params.x_kernel_is_1 ? "xc" + xind : "xck" + xind; - std::string yc = conv_params.y_kernel_is_1 ? "yc" + yind : "yck" + yind; - address = "" + xc + ", " + yc; - if (src_def.HasAxis(Axis::DEPTH)) - { - std::string zc = conv_params.z_kernel_is_1 ? 
"zc" + zind : "zck" + zind; - address += ", " + zc; - } - address += ", s"; - } - if (src_def.ReturnsZeroForNegOneRead()) - { - c += " src" + id + " = args.src_tensor.Read<" + cl_type + ">(" + address + ");\n"; - const std::string ds = trivial_kernel_size ? "ds" : "ds" + id; - c += " " + address + " += " + ds + ";\n"; - } - else - { - if (!check.empty()) - { - if (conditional_read) - { - c += " src" + id + " = " + check + " ? args.src_tensor.Read<" + cl_type + ">(" + - address + ") : (FLT4)(0.0f);\n"; - } - else - { - c += " src" + id + " = args.src_tensor.Read<" + cl_type + ">(" + address + - ") * (FLT)(" + check + ");\n"; - } - } - else - { - c += " src" + id + " = args.src_tensor.Read<" + cl_type + ">(" + address + ");\n"; - } - if (src_def.IsLinear()) - { - c += " " + address + " += ds;\n"; - } - } - } - } - } - }; - const bool weights_type_as_accum_type = !(op_def.precision == CalculationsPrecision::F32_F16 && - conv_params.weights_data_type == DataType::FLOAT16); - auto conv_core = [&](int shared_offset) { - const std::string channels[] = {"x", "y", "z", "w"}; - for (int s = 0; s < block_size.w; ++s) - { - const std::string sind = std::to_string(s); - if (weights_type_as_accum_type) - { - for (int ch = 0; ch < 4; ++ch) - { - for (int z = 0; z < block_size.z; ++z) - { - const std::string zind = std::to_string(z); - for (int y = 0; y < block_size.y; ++y) - { - const std::string yind = std::to_string(y); - for (int x = 0; x < block_size.x; ++x) - { - const std::string xind = std::to_string(x); - std::string R = "r" + generate_id_full(xind, yind, zind, sind); - std::string S = "src" + generate_id(xind, yind, zind); - if (use_simd_broadcast) - { - int simd_id = (s * 4 + ch + shared_offset) / simd_size; - int thread_id = (s * 4 + ch + shared_offset) % simd_size; - std::string w_val_x = "sub_group_broadcast(simd_w" + std::to_string(simd_id) + - ".x, " + std::to_string(thread_id) + "u)"; - std::string w_val_y = "sub_group_broadcast(simd_w" + std::to_string(simd_id) + - ".y, " + std::to_string(thread_id) + "u)"; - std::string w_val_z = "sub_group_broadcast(simd_w" + std::to_string(simd_id) + - ".z, " + std::to_string(thread_id) + "u)"; - std::string w_val_w = "sub_group_broadcast(simd_w" + std::to_string(simd_id) + - ".w, " + std::to_string(thread_id) + "u)"; - c += " " + R + ".x += " + w_val_x + " * " + S + "." + channels[ch] + ";\n"; - c += " " + R + ".y += " + w_val_y + " * " + S + "." + channels[ch] + ";\n"; - c += " " + R + ".z += " + w_val_z + " * " + S + "." + channels[ch] + ";\n"; - c += " " + R + ".w += " + w_val_w + " * " + S + "." + channels[ch] + ";\n"; - } - else - { - const std::string weight_id = std::to_string(s * 4 + ch + shared_offset); - std::string w_val; - if (conv_params.AreWeightsBuffer()) - { - w_val = "weights_cache[" + weight_id + "]"; - } - else - { - w_val = "f" + weight_id; - } - c += " " + R + " += " + w_val + " * " + S + "." 
+ channels[ch] + ";\n"; - } - } - } - } - } - } - else - { // F32_F16 precision and weights type is float16 - for (int z = 0; z < block_size.z; ++z) - { - const std::string zind = std::to_string(z); - for (int y = 0; y < block_size.y; ++y) - { - const std::string yind = std::to_string(y); - for (int x = 0; x < block_size.x; ++x) - { - const std::string xind = std::to_string(x); - std::string R = "r" + generate_id_full(xind, yind, zind, sind); - std::string S = "src" + generate_id(xind, yind, zind); - std::vector<std::string> F(4); - for (int i = 0; i < 4; ++i) - { - std::string weight_id = std::to_string(s * 4 + i + shared_offset); - if (conv_params.AreWeightsBuffer()) - { - F[i] = "weights_cache[" + weight_id + "]"; - } - else - { - F[i] = "f" + weight_id; - } - } - c += " " + R + " += convert_float4(" + S + ".x * " + F[0] + " + " + S + ".y * " + - F[1] + " + " + S + ".z * " + F[2] + " + " + S + ".w * " + F[3] + ");\n"; - } - } - } - } - } - }; - - c += " int s = 0;\n"; - c += " do {\n"; - declare_src(); - const int total_work_items = work_group_size_.x * work_group_size_.y * work_group_size_.z; - if (conv_params.weights_upload_type == ConvPowerVR::WeightsUploadType::LOCAL_MEM_ASYNC_SUBGROUP) - { - c += GenerateAsyncUpload("weights_cache", "filters_loc", - /*global_offset_name*/ "", local_mem_size); - } - else if (conv_params.weights_upload_type == ConvPowerVR::WeightsUploadType::LOCAL_MEM_BY_THREADS) - { - c += " barrier(CLK_LOCAL_MEM_FENCE);\n"; - c += - GenerateUploadByThreads("weights_cache", "filters_loc", - /*global_offset_name*/ "", "lid", total_work_items, local_mem_size); - } - else if (use_simd_broadcast) - { - int parts = local_mem_size / simd_size; - int reminder = local_mem_size % simd_size; - for (int i = 0; i < parts; ++i) - { - c += " FLT4 simd_w" + std::to_string(i) + " = filters_loc[simd_id + " + - std::to_string(i * simd_size) + "];\n"; - } - if (reminder) - { - c += " FLT4 simd_w" + std::to_string(parts) + ";\n"; - c += " if (simd_id < " + std::to_string(reminder) + ") {\n"; - c += " simd_w" + std::to_string(parts) + " = filters_loc[simd_id + " + - std::to_string(parts * simd_size) + "];\n"; - c += " }\n"; - } - } - else if (conv_params.AreWeightsBuffer()) - { // GLOBAL_MEM/CONSTANT_MEM - c += " weights_cache = filters_loc;\n"; - } - else - { // TEXTURES_MEM - for (int dst_s = 0; dst_s < block_size.w; ++dst_s) - { - std::string f_y = trivial_kernel_size ? 
"s" : "filter_offset"; - if (conv_params.different_weights_for_height) - { - f_y = "DST_Y * args.src_tensor.Slices() + s"; - } - c += absl::Substitute( - R"( FLT4 f$2 = args.weights0.Read(DST_S + $0, $1); - FLT4 f$3 = args.weights1.Read(DST_S + $0, $1); - FLT4 f$4 = args.weights2.Read(DST_S + $0, $1); - FLT4 f$5 = args.weights3.Read(DST_S + $0, $1); -)", - dst_s, f_y, dst_s * 4 + 0, dst_s * 4 + 1, dst_s * 4 + 2, dst_s * 4 + 3); - } - if (!trivial_kernel_size) - { - c += " filter_offset++;\n"; - } - } - read_src(); - c += " s += 1;\n"; - if (conv_params.weights_upload_type == ConvPowerVR::WeightsUploadType::LOCAL_MEM_BY_THREADS) - { - c += " barrier(CLK_LOCAL_MEM_FENCE);\n"; - } - conv_core(0); - for (int i = 1; i < conv_params.src_depth_loop_size; ++i) - { - read_src(); - conv_core(i * block_size.w * 4); - c += " s += 1;\n"; - } - if (conv_params.AreWeightsBuffer()) - { - c += " filters_loc += " + std::to_string(local_mem_size) + ";\n"; - } - c += " } while (s < args.src_tensor.Slices());\n"; - if (!conv_params.x_kernel_is_1) - { - c += " };\n"; - } - if (!conv_params.y_kernel_is_1) - { - c += " };\n"; - } - if (src_def.HasAxis(Axis::DEPTH) && !conv_params_.z_kernel_is_1) - { - c += " };\n"; - } - if (conv_params.AreWeightsBuffer()) - { - if (conv_params.weights_upload_type == ConvPowerVR::WeightsUploadType::LOCAL_MEM_ASYNC_SUBGROUP) - { - c += GenerateAsyncUpload("weights_cache", "args.biases.GetPtr()", "DST_S", block_size.w); - } - else if (conv_params.weights_upload_type == - ConvPowerVR::WeightsUploadType::LOCAL_MEM_BY_THREADS) - { - c += " barrier(CLK_LOCAL_MEM_FENCE);\n"; - c += GenerateUploadByThreads("weights_cache", "args.biases.GetPtr()", "DST_S", "lid", - total_work_items, block_size.w); - c += " barrier(CLK_LOCAL_MEM_FENCE);\n"; - } - else - { - c += " weights_cache = args.biases.GetPtr() + DST_S;\n"; - } - } - if (late_oob_check) - { - c += " if (" + dst_oob_check + ") {\n"; - c += " return;\n"; - c += " }\n"; - } - - auto generate_dst_check = [&](int x, int y, int z) { - std::string check; - const std::vector<Axis> axes{Axis::WIDTH, Axis::HEIGHT, Axis::DEPTH}; - const std::vector<std::string> names{"Width()", "Height()", "Depth()"}; - std::vector<std::string> coords(3); - coords[0] = "DST_X + " + std::to_string(x); - coords[1] = "DST_Y + " + std::to_string(y); - coords[2] = "DST_Z + " + std::to_string(z); - const std::vector<int> ids{x, y, z}; - for (size_t i = 0; i < axes.size(); ++i) - { - const auto &axis = axes[i]; - if (src_def.HasAxis(axis) && ids[i] != 0) - { - if (!check.empty()) - { - check += " && "; - } - check += coords[i] + " < args.dst_tensor." 
+ names[i]; - } - } - return check; - }; - - for (int s = 0; s < block_size.w; ++s) - { - const std::string sind = std::to_string(s); - c += " if (DST_S + " + sind + " >= args.dst_tensor.Slices()) return;\n"; - c += " {\n"; - if (conv_params.AreWeightsBuffer()) - { - c += " FLT4 bias_val = TO_FLT4(weights_cache[" + sind + "]);\n"; - } - else - { - c += " FLT4 bias_val = args.biases.Read(DST_S + " + sind + ");\n"; - } - for (int z = 0; z < block_size.z; ++z) - { - const std::string zind = std::to_string(z); - for (int y = 0; y < block_size.y; ++y) - { - const std::string yind = std::to_string(y); - for (int x = 0; x < block_size.x; ++x) - { - const std::string xind = std::to_string(x); - const std::string id = generate_id_full(xind, yind, zind, sind); - const std::string check = generate_dst_check(x, y, z); - std::string coords = "DST_X + " + xind + ", DST_Y + " + yind; - if (src_def.HasAxis(Axis::DEPTH)) - { - coords += ", DST_Z + " + zind; - } - coords += ", DST_S + " + sind; - if (!check.empty()) - { - c += " if (" + check + ") {\n"; - } - else - { - c += " {\n"; - } - c += " FLT4 res = TO_FLT4(r" + id + ") + bias_val;\n"; - c += " args.dst_tensor.Write(res, " + coords + ");\n"; - c += " }\n"; - } - } - } - c += " }\n"; - } - c += "}\n"; - return c; -} - -ConvPowerVR::ConvParams -ConvPowerVR::GuessBestParams(const DeviceInfo &device_info, const OperationDef &definition, - int src_depth, int dst_depth, bool x_kernel_is_1, bool y_kernel_is_1, - bool different_weights_for_height, const BHWC *dst_shape) -{ - ConvParams conv_params; - conv_params.linear_spatial = false; - conv_params.weights_data_type = DeduceDataTypeFromPrecision(definition.precision); - conv_params.x_kernel_is_1 = x_kernel_is_1; - conv_params.y_kernel_is_1 = y_kernel_is_1; - conv_params.different_weights_for_height = different_weights_for_height; - if (device_info.IsNvidia()) - { - if (different_weights_for_height) - { - work_group_size_ = int3(32, 1, 1); - work_group_launch_order_ = int3(2, 0, 1); - conv_params.fixed_work_group_size = true; - } - else - { - conv_params.linear_spatial = true; - work_group_size_ = int3(32, 1, 1); - work_group_launch_order_ = int3(1, 0, 2); - conv_params.fixed_work_group_size = true; - } - conv_params.block_size = int4(2, 1, 1, 4); - conv_params.src_depth_loop_size = 1; - conv_params.weights_upload_type = WeightsUploadType::LOCAL_MEM_BY_THREADS; - if (dst_depth % 4 == 0 || dst_depth >= 8) - { - conv_params.block_size.w = 4; - } - else if (dst_depth % 2 == 0 || dst_depth >= 4) - { - conv_params.block_size.w = 2; - } - else - { - conv_params.block_size.w = dst_depth; - } - if (dst_shape) - { - int task_size = dst_shape->w * dst_shape->b * dst_shape->h * dst_depth; - float task_size_per_cu = static_cast<float>(task_size) / device_info.compute_units_count; - int block_size = - conv_params.block_size.x * conv_params.block_size.y * conv_params.block_size.w; - float threads_per_cu = task_size_per_cu / block_size; - float warps_per_cu = threads_per_cu / 32 /*warp_size*/; - if (warps_per_cu < 8.0f) - { - conv_params.block_size.x = 1; - } - if (warps_per_cu < 4.0f && conv_params.block_size.w >= 4) - { - conv_params.block_size.w /= 2; - } - if (warps_per_cu < 2.0f && conv_params.block_size.w >= 2) - { - conv_params.block_size.w /= 2; - } - } - if (src_depth % 2 == 0) - { - conv_params.src_depth_loop_size = 2; - } - if (src_depth % 4 == 0 && conv_params.block_size.w <= 2) - { - conv_params.src_depth_loop_size = 4; - } - } - else if (device_info.IsPowerVR()) - { - if (different_weights_for_height) - { - 
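- // Mirrors the Nvidia branch above: with per-row weights the spatial grid
- // stays non-linear, but the fixed 32x1x1 work-group is kept.
- 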
work_group_size_ = int3(32, 1, 1); - work_group_launch_order_ = int3(2, 0, 1); - conv_params.fixed_work_group_size = true; - } - else - { - conv_params.linear_spatial = true; - work_group_size_ = int3(32, 1, 1); - work_group_launch_order_ = int3(1, 0, 2); - conv_params.fixed_work_group_size = true; - } - conv_params.weights_data_type = - definition.precision == CalculationsPrecision::F16 ? DataType::FLOAT16 : DataType::FLOAT32; - conv_params.block_size = int4(1, 1, 1, 4); - conv_params.src_depth_loop_size = 1; - conv_params.weights_upload_type = WeightsUploadType::LOCAL_MEM_ASYNC_SUBGROUP; - if (dst_depth % 8 == 0 || dst_depth >= 32) - { - conv_params.block_size.w = 8; - } - else if (dst_depth % 4 == 0 || dst_depth >= 8) - { - conv_params.block_size.w = 4; - } - else if (dst_depth % 2 == 0 || dst_depth >= 4) - { - conv_params.block_size.w = 2; - } - else - { - conv_params.block_size.w = dst_depth; - } - if (definition.precision == CalculationsPrecision::F16) - { - conv_params.block_size.w = std::min(4, conv_params.block_size.w); - if (src_depth % 2 == 0) - { - conv_params.src_depth_loop_size = 2; - } - if (src_depth % 4 == 0 && conv_params.block_size.w <= 2) - { - conv_params.src_depth_loop_size = 4; - } - if (conv_params.block_size.w == 1) - { - if (src_depth % 2 == 0) - { - conv_params.src_depth_loop_size = 2; - } - if (src_depth % 4 == 0) - { - conv_params.src_depth_loop_size = 4; - } - if (src_depth <= 8) - { - conv_params.src_depth_loop_size = src_depth; - } - } - conv_params.block_size.x = 2; - } - } - else if (device_info.IsAMD()) - { - if (different_weights_for_height) - { - work_group_size_ = int3(32, 1, 1); - work_group_launch_order_ = int3(2, 0, 1); - conv_params.fixed_work_group_size = true; - } - else - { - work_group_size_ = int3(8, 4, 1); - work_group_launch_order_ = int3(2, 0, 1); - conv_params.fixed_work_group_size = true; - } - - conv_params.block_size = int4(2, 1, 1, 1); - if (x_kernel_is_1 && y_kernel_is_1) - { - conv_params.block_size.y = 2; - } - conv_params.src_depth_loop_size = 1; - conv_params.weights_upload_type = WeightsUploadType::CONSTANT_MEM; - if (dst_depth % 8 == 0 || dst_depth >= 32) - { - conv_params.block_size.w = 8; - } - else if (dst_depth % 4 == 0 || dst_depth >= 8) - { - conv_params.block_size.w = 4; - } - else if (dst_depth % 2 == 0 || dst_depth >= 4) - { - conv_params.block_size.w = 2; - } - else - { - conv_params.block_size.w = 1; - } - if (src_depth % 2 == 0 && src_depth >= 16) - { - conv_params.src_depth_loop_size = 2; - } - } - else if (device_info.IsMali()) - { - int block_size = 2; - if (dst_shape) - { - int task_size = dst_shape->w * dst_shape->b * dst_shape->h * dst_depth; - block_size = GetRecommendedBlockSizeForConv(device_info, definition.precision, task_size); - } - if (!x_kernel_is_1 || !y_kernel_is_1) - { - block_size = std::min(block_size, 4); - } - if (block_size == 8) - { - if (dst_depth == 1 || dst_depth == 3) - { - conv_params.block_size = int4(2, 2, 1, 1); - } - else - { - conv_params.block_size = int4(2, 2, 1, 2); - } - } - else if (block_size == 4) - { - if (dst_depth == 1 || dst_depth == 3) - { - conv_params.block_size = int4(2, 2, 1, 1); - } - else - { - conv_params.block_size = int4(2, 1, 1, 2); - } - } - else if (block_size == 2) - { - conv_params.block_size = int4(2, 1, 1, 1); - } - else - { - conv_params.block_size = int4(1, 1, 1, 1); - } - conv_params.src_depth_loop_size = 1; - MaliInfo mali_info = device_info.mali_info; - if (src_depth % 2 == 0 && block_size <= 2 && !mali_info.IsMidgard()) - { - 
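- // Midgard-era Malis skip this unrolling; newer Malis read two source
- // slices per loop iteration here, or four below when block_size is 1 and
- // the precision is F16.
- 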
conv_params.src_depth_loop_size = 2; - } - if (src_depth % 4 == 0 && block_size == 1 && !mali_info.IsMidgard() && - definition.precision == CalculationsPrecision::F16) - { - conv_params.src_depth_loop_size = 4; - } - work_group_size_ = int3(4, 4, 1); - work_group_launch_order_ = int3(0, 1, 2); - conv_params.fixed_work_group_size = false; - conv_params.weights_upload_type = WeightsUploadType::GLOBAL_MEM; - } - else if (device_info.IsAdreno()) - { - conv_params.block_size = int4(2, 2, 1, 2); - if (device_info.IsAdreno3xx()) - { - if (definition.precision == CalculationsPrecision::F16) - { - conv_params.block_size = int4(2, 2, 1, 2); - } - else if (definition.precision == CalculationsPrecision::F32_F16) - { - conv_params.block_size = int4(2, 1, 1, 2); - } - else - { // F32 - conv_params.block_size = int4(2, 2, 1, 1); - } - } - work_group_size_ = int3(8, 2, 1); - work_group_launch_order_ = int3(0, 1, 2); - conv_params.fixed_work_group_size = false; - conv_params.src_depth_loop_size = 1; - if (definition.src_tensors.size() == 2) - { - // dynamic weights supported only with buffers. - conv_params.weights_upload_type = WeightsUploadType::GLOBAL_MEM; - } - else - { - conv_params.weights_upload_type = WeightsUploadType::TEXTURES_MEM_X4; - } - } - else if (device_info.IsIntel()) - { - if (different_weights_for_height) - { - work_group_size_ = int3(16, 1, 1); - work_group_launch_order_ = int3(0, 1, 2); - conv_params.fixed_work_group_size = true; - } - else - { - conv_params.linear_spatial = true; - work_group_size_ = int3(16, 1, 1); - work_group_launch_order_ = int3(0, 1, 2); - conv_params.fixed_work_group_size = true; - } - conv_params.block_size = int4(1, 1, 1, 4); - conv_params.src_depth_loop_size = 1; - int sub_group_size = 16; - const bool supports_subgroups = device_info.SupportsExtension("cl_khr_subgroups") || - device_info.SupportsExtension("cl_intel_subgroups"); - if (definition.precision != CalculationsPrecision::F32_F16 && supports_subgroups && - device_info.SupportsExtension("cl_intel_required_subgroup_size") && - device_info.SupportsSubGroupWithSize(sub_group_size)) - { - conv_params.weights_upload_type = WeightsUploadType::PRIVATE_MEM_SIMD_BROADCAST; - conv_params.simd_size = sub_group_size; - } - else - { - conv_params.weights_upload_type = WeightsUploadType::LOCAL_MEM_BY_THREADS; - } - if (dst_depth % 4 == 0 || dst_depth >= 8) - { - conv_params.block_size.w = 4; - } - else if (dst_depth % 2 == 0 || dst_depth >= 4) - { - conv_params.block_size.w = 2; - } - else - { - conv_params.block_size.w = dst_depth; - } - if (src_depth % 2 == 0) - { - conv_params.src_depth_loop_size = 2; - } - if (src_depth % 4 == 0 && conv_params.block_size.w <= 2) - { - conv_params.src_depth_loop_size = 4; - } - } - else - { - conv_params.block_size = int4(1, 1, 1, 4); - work_group_size_ = int3(8, 2, 1); - work_group_launch_order_ = int3(0, 1, 2); - conv_params.fixed_work_group_size = false; - conv_params.src_depth_loop_size = 1; - conv_params.weights_upload_type = WeightsUploadType::GLOBAL_MEM; - if (dst_depth % 4 == 0 || dst_depth >= 8) - { - conv_params.block_size.w = 4; - } - else if (dst_depth % 2 == 0 || dst_depth >= 4) - { - conv_params.block_size.w = 2; - } - else - { - conv_params.block_size.w = dst_depth; - } - if (src_depth % 2 == 0) - { - conv_params.src_depth_loop_size = 2; - } - if (src_depth % 4 == 0 && conv_params.block_size.w <= 2) - { - conv_params.src_depth_loop_size = 4; - } - } - - return conv_params; -} - -ConvPowerVR::ConvParams ConvPowerVR::GuessBestParams(const DeviceInfo &device_info, - 
const OperationDef &definition, - const Convolution2DAttributes &attr, - const BHWC *dst_shape) -{ - const int dst_depth = DivideRoundUp(attr.weights.shape.o, 4); - const int src_depth = DivideRoundUp(attr.weights.shape.i, 4); - const bool x_kernel_is_1 = attr.weights.shape.w == 1 && attr.strides.w == 1 && - attr.dilations.w == 1 && attr.padding.prepended.w == 0 && - attr.padding.appended.w == 0; - const bool y_kernel_is_1 = attr.weights.shape.h == 1 && attr.strides.h == 1 && - attr.dilations.h == 1 && attr.padding.prepended.h == 0 && - attr.padding.appended.h == 0; - return GuessBestParams(device_info, definition, src_depth, dst_depth, x_kernel_is_1, - y_kernel_is_1, false, dst_shape); -} - -ConvPowerVR::ConvParams ConvPowerVR::GuessBestParams(const DeviceInfo &device_info, - const OperationDef &definition, - const Convolution3DAttributes &attr, - const BHWDC *dst_shape) -{ - const int dst_depth = DivideRoundUp(attr.weights.shape.o, 4); - const int src_depth = DivideRoundUp(attr.weights.shape.i, 4); - const bool x_kernel_is_1 = attr.weights.shape.w == 1 && attr.strides.w == 1 && - attr.dilations.w == 1 && attr.padding.prepended.w == 0 && - attr.padding.appended.w == 0; - const bool y_kernel_is_1 = attr.weights.shape.h == 1 && attr.strides.h == 1 && - attr.dilations.h == 1 && attr.padding.prepended.h == 0 && - attr.padding.appended.h == 0; - const bool z_kernel_is_1 = attr.weights.shape.d == 1 && attr.strides.d == 1 && - attr.dilations.d == 1 && attr.padding.prepended.d == 0 && - attr.padding.appended.d == 0; - - ConvPowerVR::ConvParams result; - BHWC shape; - if (dst_shape) - { - shape.b = dst_shape->b; - shape.h = dst_shape->h * dst_shape->d; - shape.w = dst_shape->w; - shape.c = dst_shape->c; - result = GuessBestParams(device_info, definition, src_depth, dst_depth, x_kernel_is_1, - y_kernel_is_1, false, &shape); - } - else - { - result = GuessBestParams(device_info, definition, src_depth, dst_depth, x_kernel_is_1, - y_kernel_is_1, false, nullptr); - } - result.z_kernel_is_1 = z_kernel_is_1; - return result; -} - -ConvPowerVR::ConvParams ConvPowerVR::GuessBestParams(const DeviceInfo &device_info, - const OperationDef &definition, - const Convolution2DAttributes &attr, - const BHWC &weights_shape, - const BHWC *dst_shape) -{ - const int dst_depth = DivideRoundUp(weights_shape.b, 4); - const int src_depth = DivideRoundUp(weights_shape.c, 4); - const bool x_kernel_is_1 = weights_shape.w == 1 && attr.strides.w == 1 && attr.dilations.w == 1 && - attr.padding.prepended.w == 0 && attr.padding.appended.w == 0; - const bool y_kernel_is_1 = weights_shape.h == 1 && attr.strides.h == 1 && attr.dilations.h == 1 && - attr.padding.prepended.h == 0 && attr.padding.appended.h == 0; - return GuessBestParams(device_info, definition, src_depth, dst_depth, x_kernel_is_1, - y_kernel_is_1, false, dst_shape); -} - -ConvPowerVR::ConvParams ConvPowerVR::GuessBestParams(const DeviceInfo &device_info, - const OperationDef &definition, - const FullyConnectedAttributes &attr, - const BHWC *dst_shape) -{ - const int dst_depth = DivideRoundUp(attr.weights.shape.o, 4); - const int src_depth = DivideRoundUp(attr.weights.shape.i, 4); - ConvPowerVR::ConvParams params = - GuessBestParams(device_info, definition, src_depth, dst_depth, true, true, false, dst_shape); - work_group_size_.x *= work_group_size_.y; - work_group_size_.y = 1; - params.block_size.x *= params.block_size.y; - params.block_size.y = 1; - return params; -} - -ConvPowerVR::ConvParams ConvPowerVR::GuessBestParamsWinograd(const DeviceInfo &device_info, - 
const OperationDef &definition, - const Convolution2DAttributes &attr, - const BHWC *dst_shape) -{ - const int dst_depth = DivideRoundUp(attr.weights.shape.o, 4); - const int src_depth = DivideRoundUp(attr.weights.shape.i, 4); - ConvPowerVR::ConvParams params = - GuessBestParams(device_info, definition, src_depth, dst_depth, true, true, true, dst_shape); - params.block_size.x *= params.block_size.y; - params.block_size.y = 1; - return params; -} - -ConvPowerVR CreateConvPowerVR(const DeviceInfo &device_info, const OperationDef &definition, - const Convolution2DAttributes &attr, const BHWC *dst_shape) -{ - ConvPowerVR result(definition, attr, device_info, dst_shape); - result.GenerateCode(device_info); - result.UploadData(attr.weights, attr.bias); - return result; -} - -ConvPowerVR CreateConvPowerVR(const DeviceInfo &device_info, const OperationDef &definition, - const FullyConnectedAttributes &attr, const BHWC *dst_shape) -{ - ConvPowerVR result(definition, attr, device_info, dst_shape); - result.GenerateCode(device_info); - result.UploadData(attr.weights, attr.bias); - return result; -} - -ConvPowerVR CreateConvPowerVRDynamicWeights(const DeviceInfo &device_info, - const OperationDef &definition, - const Convolution2DAttributes &attr, - const BHWC &weights_shape, const BHWC *dst_shape) -{ - ConvPowerVR result(definition, attr, weights_shape, device_info, dst_shape); - result.GenerateCode(device_info); - result.UploadBias(attr.bias); - return result; -} - -ConvPowerVR CreateConvPowerVRWino4x4To6x6(const DeviceInfo &device_info, - const OperationDef &definition, - const Convolution2DAttributes &attr, - const BHWC *dst_shape) -{ - ConvPowerVR result(definition); - result.conv_params_ = result.GuessBestParamsWinograd(device_info, definition, attr, dst_shape); - result.GenerateCode(device_info); - result.UploadDataForWinograd4x4To6x6(attr.weights); - return result; -} - -ConvPowerVR CreateConvPowerVR3D(const DeviceInfo &device_info, const OperationDef &definition, - const Convolution3DAttributes &attr, const BHWDC *dst_shape) -{ - ConvPowerVR result(definition, attr, device_info, dst_shape); - result.GenerateCode(device_info); - result.UploadWeights(attr.weights); - result.UploadBias(attr.bias); - return result; -} - -} // namespace gpu_cl -} // namespace backend -} // namespace onert diff --git a/runtime/onert/backend/gpu_cl/open_cl/kernels/ConvPowervr.h b/runtime/onert/backend/gpu_cl/open_cl/kernels/ConvPowervr.h deleted file mode 100644 index f83f05730..000000000 --- a/runtime/onert/backend/gpu_cl/open_cl/kernels/ConvPowervr.h +++ /dev/null @@ -1,413 +0,0 @@ -/* - * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright 2019 The TensorFlow Authors. All Rights Reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifndef __ONERT_BACKEND_GPU_CL_OPENCL_KERNELS_CONV_POWERVR_H__ -#define __ONERT_BACKEND_GPU_CL_OPENCL_KERNELS_CONV_POWERVR_H__ - -#include <cstring> -#include <vector> - -#include "open_cl/Buffer.h" -#include "open_cl/ClDevice.h" -#include "open_cl/kernels/ConvCommon.h" -#include "open_cl/kernels/GpuOperation.h" -#include "open_cl/kernels/Util.h" -#include "open_cl/LinearStorage.h" -#include "open_cl/Tensor.h" -#include "open_cl/Texture2d.h" -#include "open_cl/Util.h" -#include "open_cl/DataType.h" -#include "open_cl/Operations.h" -#include "open_cl/Shape.h" -#include "open_cl/Status.h" -#include "open_cl/Tensor.h" -#include "open_cl/Types.h" -#include "open_cl/WinogradUtil.h" - -namespace onert -{ -namespace backend -{ -namespace gpu_cl -{ - -class ConvPowerVR : public GPUOperation -{ -public: - ConvPowerVR() = default; - void GetPossibleKernelWorkGroups(TuningType tuning_type, const DeviceInfo &device_info, - const KernelInfo &kernel_info, - std::vector<int3> *work_groups) const override; - absl::Status BindArguments(ArgumentsBinder *args) override; - int3 GetGridSize() const override; - - ConvWeightsDescription GetConvWeightsDescription() const - { - ConvWeightsDescription desc; - desc.layout = ConvWeightsLayout::kOHWIOGroupI4O4; - desc.output_group_size = conv_params_.block_size.w; - return desc; - } - - // Move only - ConvPowerVR(ConvPowerVR &&operation); - ConvPowerVR &operator=(ConvPowerVR &&operation); - ConvPowerVR(const ConvPowerVR &) = delete; - ConvPowerVR &operator=(const ConvPowerVR &) = delete; - -private: - enum class WeightsUploadType - { - LOCAL_MEM_ASYNC_SUBGROUP, // we use it for PowerVR with workgroup size = 32 - LOCAL_MEM_BY_THREADS, - GLOBAL_MEM, - CONSTANT_MEM, - PRIVATE_MEM_SIMD_BROADCAST, - TEXTURES_MEM_X4, // 4 textures for weights - }; - - struct ConvParams - { - // Usually we use this combinations for CalculationPrecision: - // F32: all F32 - // F16: all F16 - // F32_F16: all besides accumulator is F16, including weights - // But for PowerVR we can achieve better performance in F32_F16 with F32 - // weights, so for PowerVR in this kernel we have F32 weights for - // F32_F16 precision mode - DataType weights_data_type; // used for weights and biases - int4 block_size; // WHDS - bool fixed_work_group_size; - bool linear_spatial; // spatial dimensions are Width/Height/Depth - bool different_weights_for_height; - int src_depth_loop_size; - WeightsUploadType weights_upload_type; - bool x_kernel_is_1; - bool y_kernel_is_1; - bool z_kernel_is_1; - - // used only with PRIVATE_MEM_SIMD_BROADCAST - int simd_size = 1; - - bool AreWeightsBuffer() const - { - return weights_upload_type != WeightsUploadType::TEXTURES_MEM_X4; - } - - bool IsPrivateMemBroadcast() const - { - return weights_upload_type == WeightsUploadType::PRIVATE_MEM_SIMD_BROADCAST; - } - }; - - ConvPowerVR(const OperationDef &definition, const Convolution2DAttributes &attr, - const DeviceInfo &device_info, const BHWC *dst_shape = nullptr); - ConvPowerVR(const OperationDef &definition, const Convolution2DAttributes &attr, - const BHWC &weights_shape, const DeviceInfo &device_info, - const BHWC *dst_shape = nullptr); - ConvPowerVR(const OperationDef &definition, const FullyConnectedAttributes &attr, - const DeviceInfo &device_info, const BHWC *dst_shape = nullptr); - explicit ConvPowerVR(const OperationDef &definition); - ConvPowerVR(const OperationDef &definition, const Convolution3DAttributes &attr, - const DeviceInfo &device_info, const BHWDC *dst_shape = nullptr); - - void GenerateCode(const 
DeviceInfo &device_info); - - template <DataType T> - void UploadData(const InternalTensor<OHWI, T> &weights, const InternalTensor<Linear, T> &biases); - template <DataType T> void UploadDataForWinograd4x4To6x6(const InternalTensor<OHWI, T> &weights); - - template <DataType T> void UploadWeights(const InternalTensor<OHWI, T> &weights); - - template <DataType T> void UploadWeights(const InternalTensor<OHWDI, T> &weights); - - template <DataType T> void UploadBias(const InternalTensor<Linear, T> &bias); - - friend ConvPowerVR CreateConvPowerVR(const DeviceInfo &device_info, - const OperationDef &definition, - const Convolution2DAttributes &attr, const BHWC *dst_shape); - - friend ConvPowerVR CreateConvPowerVR(const DeviceInfo &device_info, - const OperationDef &definition, - const FullyConnectedAttributes &attr, const BHWC *dst_shape); - - friend ConvPowerVR CreateConvPowerVRDynamicWeights(const DeviceInfo &device_info, - const OperationDef &definition, - const Convolution2DAttributes &attr, - const BHWC &weights_shape, - const BHWC *dst_shape); - - friend ConvPowerVR CreateConvPowerVRWino4x4To6x6(const DeviceInfo &device_info, - const OperationDef &definition, - const Convolution2DAttributes &attr, - const BHWC *dst_shape); - - friend ConvPowerVR CreateConvPowerVR3D(const DeviceInfo &device_info, - const OperationDef &definition, - const Convolution3DAttributes &attr, - const BHWDC *dst_shape); - - ConvParams GuessBestParams(const DeviceInfo &device_info, const OperationDef &definition, - const Convolution2DAttributes &attr, const BHWC *dst_shape = nullptr); - ConvParams GuessBestParams(const DeviceInfo &device_info, const OperationDef &definition, - const Convolution2DAttributes &attr, const BHWC &weights_shape, - const BHWC *dst_shape = nullptr); - ConvParams GuessBestParams(const DeviceInfo &device_info, const OperationDef &definition, - const FullyConnectedAttributes &attr, const BHWC *dst_shape = nullptr); - ConvParams GuessBestParamsWinograd(const DeviceInfo &device_info, const OperationDef &definition, - const Convolution2DAttributes &attr, - const BHWC *dst_shape = nullptr); - ConvParams GuessBestParams(const DeviceInfo &device_info, const OperationDef &definition, - const Convolution3DAttributes &attr, const BHWDC *dst_shape = nullptr); - ConvParams GuessBestParams(const DeviceInfo &device_info, const OperationDef &definition, - int src_depth, int dst_depth, bool x_kernel_is_1, bool y_kernel_is_1, - bool different_weights_for_height, const BHWC *dst_shape = nullptr); - - std::string GenerateConv(const DeviceInfo &device_info, const OperationDef &op_def, - bool stride_correction, const ConvParams &conv_params); - - int4 stride_; - int4 padding_; - int4 kernel_size_; - int4 dilation_; - ConvParams conv_params_; -}; - -template <DataType T> -void ConvPowerVR::UploadData(const InternalTensor<OHWI, T> &weights, - const InternalTensor<Linear, T> &biases) -{ - UploadWeights(weights); - UploadBias(biases); -} - -template <DataType T> -void ConvPowerVR::UploadDataForWinograd4x4To6x6(const InternalTensor<OHWI, T> &weights) -{ - InternalTensor<OHWI, T> wino_weights; - RearrangeWeightsToWinograd4x4To6x6Weights(weights, &wino_weights); - UploadWeights(wino_weights); - InternalTensor<Linear, DataType::FLOAT32> biases; - biases.shape = Linear(weights.shape.o); - biases.data.resize(weights.shape.o, 0.0f); - UploadBias(biases); -} - -template <DataType T> void ConvPowerVR::UploadBias(const InternalTensor<Linear, T> &bias) -{ - BufferDescriptor desc; - desc.element_type = 
conv_params_.weights_data_type; - desc.element_size = 4; - desc.memory_type = - conv_params_.weights_upload_type == ConvPowerVR::WeightsUploadType::CONSTANT_MEM - ? MemoryType::CONSTANT - : MemoryType::GLOBAL; - const int float_size = sizeof(float); - // TODO - // conv_params_.weights_data_type == DataType::FLOAT32 ? sizeof(float) : sizeof(half); - int aligned_channels = AlignByN(bias.shape.v, 4 * conv_params_.block_size.w); - desc.size = float_size * aligned_channels; - desc.data.resize(desc.size); - if (conv_params_.weights_data_type == DataType::FLOAT32) - { - float *gpu_data = reinterpret_cast<float *>(desc.data.data()); - for (int i = 0; i < aligned_channels; ++i) - { - gpu_data[i] = i < bias.shape.v ? bias.data[i] : 0.0f; - } - } - // else - // { - // half *gpu_data = reinterpret_cast<half *>(desc.data.data()); - // for (int i = 0; i < aligned_channels; ++i) - // { - // gpu_data[i] = i < bias.shape.v ? bias.data[i] : 0.0f; - // } - // } - args_.AddObject("biases", absl::make_unique<BufferDescriptor>(std::move(desc))); -} - -template <DataType T> void ConvPowerVR::UploadWeights(const InternalTensor<OHWI, T> &weights) -{ - const int dst_slices = AlignByN(DivideRoundUp(weights.shape.o, 4), conv_params_.block_size.w); - const int src_slices = DivideRoundUp(weights.shape.i, 4); - - const bool f32_weights = conv_params_.weights_data_type == DataType::FLOAT32; - const int float4_size = sizeof(float4); - // TODO - // f32_weights ? sizeof(float4) : sizeof(half4); - - const int elements_count = weights.shape.h * weights.shape.w * src_slices * dst_slices * 4; - - std::vector<uint8_t> data(float4_size * elements_count); - - if (f32_weights) - { - float4 *ptr = reinterpret_cast<float4 *>(data.data()); - if (conv_params_.AreWeightsBuffer()) - { - RearrangeWeightsToOHWIOGroupI4O4(weights, conv_params_.block_size.w, - absl::MakeSpan(ptr, elements_count)); - } - else - { - RearrangeWeightsToI4HWIOOGroupO4(weights, conv_params_.block_size.w, - absl::MakeSpan(ptr, elements_count)); - } - } - // else - // { - // half4 *ptr = reinterpret_cast<half4 *>(data.data()); - // if (conv_params_.AreWeightsBuffer()) - // { - // RearrangeWeightsToOHWIOGroupI4O4(weights, conv_params_.block_size.w, - // absl::MakeSpan(ptr, elements_count)); - // } - // else - // { - // RearrangeWeightsToI4HWIOOGroupO4(weights, conv_params_.block_size.w, - // absl::MakeSpan(ptr, elements_count)); - // } - // } - if (conv_params_.AreWeightsBuffer()) - { - BufferDescriptor desc; - desc.element_type = conv_params_.weights_data_type; - desc.element_size = 4; - desc.memory_type = - conv_params_.weights_upload_type == ConvPowerVR::WeightsUploadType::CONSTANT_MEM - ? 
MemoryType::CONSTANT - : MemoryType::GLOBAL; - desc.size = float4_size * elements_count; - desc.data = std::move(data); - args_.AddObject("weights", absl::make_unique<BufferDescriptor>(std::move(desc))); - } - else - { - const int texture_width = dst_slices; - const int texture_height = src_slices * weights.shape.h * weights.shape.w; - const int sub_size = float4_size * texture_width * texture_height; - for (int i = 0; i < 4; ++i) - { - Texture2DDescriptor desc; - desc.element_type = conv_params_.weights_data_type; - desc.size = int2(texture_width, texture_height); - desc.data.resize(sub_size); - std::memcpy(desc.data.data(), data.data() + sub_size * i, sub_size); - const std::string name = "weights" + std::to_string(i); - args_.AddObject(name, absl::make_unique<Texture2DDescriptor>(std::move(desc))); - } - } -} - -template <DataType T> void ConvPowerVR::UploadWeights(const InternalTensor<OHWDI, T> &weights) -{ - const int block_size = conv_params_.block_size.w; - const int dst_slices = AlignByN(DivideRoundUp(weights.shape.o, 4), block_size); - const int src_slices = DivideRoundUp(weights.shape.i, 4); - - const int elements_count = - weights.shape.d * weights.shape.h * weights.shape.w * src_slices * dst_slices * 4; - const bool f32_weights = definition_.precision == CalculationsPrecision::F32; - - const int float4_size = f32_weights ? 16 : 8; - - std::vector<uint8_t> data(float4_size * elements_count); - - if (f32_weights) - { - float4 *ptr = reinterpret_cast<float4 *>(data.data()); - if (conv_params_.AreWeightsBuffer()) - { - RearrangeWeightsToODHWIOGroupI4O4(weights, conv_params_.block_size.w, - absl::MakeSpan(ptr, elements_count)); - } - else - { - RearrangeWeightsToI4DHWIOOGroupO4(weights, conv_params_.block_size.w, - absl::MakeSpan(ptr, elements_count)); - } - } - // else - // { - // half4 *ptr = reinterpret_cast<half4 *>(data.data()); - // if (conv_params_.AreWeightsBuffer()) - // { - // RearrangeWeightsToODHWIOGroupI4O4(weights, conv_params_.block_size.w, - // absl::MakeSpan(ptr, elements_count)); - // } - // else - // { - // RearrangeWeightsToI4DHWIOOGroupO4(weights, conv_params_.block_size.w, - // absl::MakeSpan(ptr, elements_count)); - // } - // } - - if (conv_params_.AreWeightsBuffer()) - { - BufferDescriptor desc; - desc.element_type = f32_weights ? DataType::FLOAT32 : DataType::FLOAT16; - desc.element_size = 4; - desc.size = float4_size * elements_count; - desc.data = std::move(data); - args_.AddObject("weights", absl::make_unique<BufferDescriptor>(std::move(desc))); - } - else - { - const int texture_width = dst_slices; - const int texture_height = src_slices * weights.shape.d * weights.shape.h * weights.shape.w; - int sub_size = float4_size * texture_width * texture_height; - for (int i = 0; i < 4; ++i) - { - Texture2DDescriptor desc; - desc.element_type = f32_weights ? 
DataType::FLOAT32 : DataType::FLOAT16; - desc.size = int2(texture_width, texture_height); - desc.data.resize(sub_size); - memcpy(desc.data.data(), data.data() + sub_size * i, sub_size); - const std::string name = "weights" + std::to_string(i); - args_.AddObject(name, absl::make_unique<Texture2DDescriptor>(std::move(desc))); - } - } -} - -ConvPowerVR CreateConvPowerVR(const DeviceInfo &device_info, const OperationDef &definition, - const Convolution2DAttributes &attr, const BHWC *dst_shape = nullptr); - -ConvPowerVR CreateConvPowerVR(const DeviceInfo &device_info, const OperationDef &definition, - const FullyConnectedAttributes &attr, - const BHWC *dst_shape = nullptr); - -ConvPowerVR CreateConvPowerVRDynamicWeights(const DeviceInfo &device_info, - const OperationDef &definition, - const Convolution2DAttributes &attr, - const BHWC &weights_shape, - const BHWC *dst_shape = nullptr); - -ConvPowerVR CreateConvPowerVRWino4x4To6x6(const DeviceInfo &device_info, - const OperationDef &definition, - const Convolution2DAttributes &attr, - const BHWC *dst_shape = nullptr); - -ConvPowerVR CreateConvPowerVR3D(const DeviceInfo &device_info, const OperationDef &definition, - const Convolution3DAttributes &attr, - const BHWDC *dst_shape = nullptr); - -} // namespace gpu_cl -} // namespace backend -} // namespace onert - -#endif // __ONERT_BACKEND_GPU_CL_OPENCL_KERNELS_CONV_POWERVR_H__ diff --git a/runtime/onert/backend/gpu_cl/open_cl/kernels/ConvWeightsConverter.cc b/runtime/onert/backend/gpu_cl/open_cl/kernels/ConvWeightsConverter.cc deleted file mode 100644 index 95172bd05..000000000 --- a/runtime/onert/backend/gpu_cl/open_cl/kernels/ConvWeightsConverter.cc +++ /dev/null @@ -1,143 +0,0 @@ -/* - * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright 2019 The TensorFlow Authors. All Rights Reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -#include "open_cl/kernels/ConvWeightsConverter.h" - -#include <string> - -#include "open_cl/kernels/Util.h" -#include "open_cl/kernels/WorkGroupPicking.h" - -namespace onert -{ -namespace backend -{ -namespace gpu_cl -{ - -ConverterToConvWeights::ConverterToConvWeights(const OperationDef &definition, - const ConvWeightsDescription &conv_weights_desc) - : GPUOperation(definition), conv_weights_desc_(conv_weights_desc) -{ - code_ = GetConverterToConvWeightsCode(definition_, conv_weights_desc_); -} - -ConverterToConvWeights::ConverterToConvWeights(ConverterToConvWeights &&operation) - : GPUOperation(std::move(operation)), conv_weights_desc_(operation.conv_weights_desc_) -{ -} - -ConverterToConvWeights &ConverterToConvWeights::operator=(ConverterToConvWeights &&operation) -{ - if (this != &operation) - { - conv_weights_desc_ = operation.conv_weights_desc_; - GPUOperation::operator=(std::move(operation)); - } - return *this; -} - -std::string ConverterToConvWeights::GetConverterToConvWeightsCode( - const OperationDef &op_def, const ConvWeightsDescription &conv_weights_desc) -{ - AddSrcTensor("src_tensor", op_def.src_tensors[0]); - AddDstTensor("dst_tensor", op_def.dst_tensors[0]); - args_.AddFloat("mask_x"); - args_.AddFloat("mask_y"); - args_.AddFloat("mask_z"); - args_.AddFloat("mask_w"); - - std::string c = GetCommonDefines(op_def.precision); - c += "__kernel void main_function(\n"; - c += "$0) {\n"; - c += " int GROUP_SIZE = " + std::to_string(conv_weights_desc.output_group_size) + ";\n"; - c += " int O = get_global_id(0) * 4;\n"; - c += " int I = get_global_id(1);\n"; - c += " int Z = get_global_id(2);\n"; - c += " int W = Z % args.src_tensor.Width();\n"; - c += " int H = Z / args.src_tensor.Width();\n"; - c += " if (O >= args.src_tensor.Batch() || I >= args.src_tensor.Slices() || " - "H >= args.src_tensor.Height()) return;\n"; - c += " FLT4 v0 = args.src_tensor.Read(W, H, I, O + 0);\n"; - c += " FLT4 v1 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);\n"; - c += " FLT4 v2 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);\n"; - c += " FLT4 v3 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);\n"; - c += " if (O + 1 < args.src_tensor.Batch()) {\n"; - c += " v1 = args.src_tensor.Read(W, H, I, O + 1);\n"; - c += " }\n"; - c += " if (O + 2 < args.src_tensor.Batch()) {\n"; - c += " v2 = args.src_tensor.Read(W, H, I, O + 2);\n"; - c += " }\n"; - c += " if (O + 3 < args.src_tensor.Batch()) {\n"; - c += " v3 = args.src_tensor.Read(W, H, I, O + 3);\n"; - c += " }\n"; - c += " if (I == args.src_tensor.Slices() - 1) {\n"; - c += " FLT4 mask = (FLT4)(args.mask_x, args.mask_y, args.mask_z, " - "args.mask_w);\n"; - c += " v0 *= mask;\n"; - c += " v1 *= mask;\n"; - c += " v2 *= mask;\n"; - c += " v3 *= mask;\n"; - c += " }\n"; - c += " FLT4 r0 = (FLT4)(v0.x, v1.x, v2.x, v3.x);\n"; - c += " FLT4 r1 = (FLT4)(v0.y, v1.y, v2.y, v3.y);\n"; - c += " FLT4 r2 = (FLT4)(v0.z, v1.z, v2.z, v3.z);\n"; - c += " FLT4 r3 = (FLT4)(v0.w, v1.w, v2.w, v3.w);\n"; - c += " int d_index = O / (GROUP_SIZE * 4);\n"; - c += " int k_index = (O % (GROUP_SIZE * 4)) / 4;\n"; - c += " int dst_offset = (((d_index * args.src_tensor.Height() + H) * " - "args.src_tensor.Width() + W) * " - "args.src_tensor.Slices() + I) * GROUP_SIZE + " - "k_index;\n"; - c += " int address0 = dst_offset * 4 + 0;\n"; - c += " int address1 = dst_offset * 4 + 1;\n"; - c += " int address2 = dst_offset * 4 + 2;\n"; - c += " int address3 = dst_offset * 4 + 3;\n"; - c += " args.dst_tensor.WriteLinear(r0, dst_offset * 4 + 0)\n;"; - c += " args.dst_tensor.WriteLinear(r1, dst_offset * 4 + 1)\n;"; - c += " 
args.dst_tensor.WriteLinear(r2, dst_offset * 4 + 2)\n;"; - c += " args.dst_tensor.WriteLinear(r3, dst_offset * 4 + 3)\n;"; - c += "}\n"; - return c; -} - -absl::Status ConverterToConvWeights::BindArguments(ArgumentsBinder *args) -{ - float4 mask = GetMaskForLastPlane(src_[0]->Channels()); - RETURN_IF_ERROR(args->SetFloat("mask_x", mask.x)); - RETURN_IF_ERROR(args->SetFloat("mask_y", mask.y)); - RETURN_IF_ERROR(args->SetFloat("mask_z", mask.z)); - return args->SetFloat("mask_w", mask.w); -} - -int3 ConverterToConvWeights::GetGridSize() const -{ - const int grid_x = - DivideRoundUp(AlignByN(src_[0]->Batch(), 4 * conv_weights_desc_.output_group_size), 4); - const int grid_y = src_[0]->Slices(); - const int grid_z = src_[0]->Width() * src_[0]->Height(); - return int3(grid_x, grid_y, grid_z); -} - -ConverterToConvWeights CreateConverterToConvWeights(const OperationDef &definition, - const ConvWeightsDescription &conv_weights_desc) -{ - return ConverterToConvWeights(definition, conv_weights_desc); -} - -} // namespace gpu_cl -} // namespace backend -} // namespace onert diff --git a/runtime/onert/backend/gpu_cl/open_cl/kernels/ConvWeightsConverter.h b/runtime/onert/backend/gpu_cl/open_cl/kernels/ConvWeightsConverter.h deleted file mode 100644 index bb68977eb..000000000 --- a/runtime/onert/backend/gpu_cl/open_cl/kernels/ConvWeightsConverter.h +++ /dev/null @@ -1,68 +0,0 @@ -/* - * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright 2019 The TensorFlow Authors. All Rights Reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifndef __ONERT_BACKEND_GPU_CL_OPENCL_KERNELS_CONV_WEIGHTS_CONVERTER_H__ -#define __ONERT_BACKEND_GPU_CL_OPENCL_KERNELS_CONV_WEIGHTS_CONVERTER_H__ - -#include "open_cl/ClCommandQueue.h" -#include "open_cl/ClKernel.h" -#include "open_cl/kernels/ConvCommon.h" -#include "open_cl/kernels/GpuOperation.h" -#include "open_cl/Status.h" -#include "open_cl/Types.h" - -namespace onert -{ -namespace backend -{ -namespace gpu_cl -{ - -class ConverterToConvWeights : public GPUOperation -{ -public: - ConverterToConvWeights(const OperationDef &definition, - const ConvWeightsDescription &conv_weights_desc); - absl::Status BindArguments(ArgumentsBinder *args) override; - int3 GetGridSize() const override; - - // Move only - ConverterToConvWeights(ConverterToConvWeights &&operation); - ConverterToConvWeights &operator=(ConverterToConvWeights &&operation); - ConverterToConvWeights(const ConverterToConvWeights &) = delete; - ConverterToConvWeights &operator=(const ConverterToConvWeights &) = delete; - -private: - std::string GetConverterToConvWeightsCode(const OperationDef &op_def, - const ConvWeightsDescription &conv_weights_desc); - - ConvWeightsDescription conv_weights_desc_; -}; - -// We expect src BHWC tensor and we assume that B is O, H = H, W = W, C is I -// as dst we expect Tensor with storage type BUFFER and -// dst.b * dst.h * dst.w * dst.c = AlignByN(src.b, 4) * src.h * src.w -// AlignByN(src.c, 4) -ConverterToConvWeights -CreateConverterToConvWeights(const OperationDef &definition, - const ConvWeightsDescription &conv_weights_desc); - -} // namespace gpu_cl -} // namespace backend -} // namespace onert - -#endif // __ONERT_BACKEND_GPU_CL_OPENCL_KERNELS_CONV_WEIGHTS_CONVERTER_H__ diff --git a/runtime/onert/backend/gpu_cl/open_cl/kernels/Converter.cc b/runtime/onert/backend/gpu_cl/open_cl/kernels/Converter.cc deleted file mode 100644 index cc2bc41d4..000000000 --- a/runtime/onert/backend/gpu_cl/open_cl/kernels/Converter.cc +++ /dev/null @@ -1,592 +0,0 @@ -/* - * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright 2019 The TensorFlow Authors. All Rights Reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "Converter.h" - -#include <algorithm> -#include <array> -#include <string> - -#include "open_cl/Arguments.h" -#include "open_cl/ClCommandQueue.h" -#include "open_cl/ClErrors.h" -#include "open_cl/kernels/Util.h" -#include "open_cl/Precision.h" -#include "open_cl/InternalTensor.h" -#include "open_cl/TensorType.h" -#include "open_cl/TensorTypeUtil.h" -#include "open_cl/Util.h" - -namespace onert -{ -namespace backend -{ -namespace gpu_cl -{ -namespace -{ - -class OpenClConverterImpl : public TensorObjectConverter -{ -public: - virtual absl::Status Init(const TensorObjectDef &input_def, const TensorObjectDef &output_def, - Environment *environment) = 0; - -protected: - absl::Status DispatchKernel(cl_mem buffer_mem, Tensor *tensor) - { - kernel_.ResetBindingCounter(); - RETURN_IF_ERROR(kernel_.SetMemoryAuto(buffer_mem)); - RETURN_IF_ERROR(args_.SetObjectRef("tensor", tensor)); - RETURN_IF_ERROR(args_.Bind(kernel_.kernel(), kernel_.GetBindingCounter())); - const int3 grid = int3(tensor->Width() * tensor->Batch(), tensor->Height(), tensor->Slices()); - const int3 work_group_size = {16, 8, 1}; - const int3 work_groups_count = GetWorkGroupsCount(grid, work_group_size); - return queue_->Dispatch(kernel_, work_groups_count, work_group_size); - } - - Arguments args_; - BHWC shape_; - CLKernel kernel_; - TensorDescriptor tensor_descriptor_; - CLCommandQueue *queue_ = nullptr; - const CLContext *context_ = nullptr; -}; - -bool IsSupportedDataType(DataType type) -{ - return type == DataType::FLOAT16 || type == DataType::FLOAT32; -} - -bool IsBHWCOpenCLBuffer(const ObjectDef &def) -{ - return IsSupportedDataType(def.data_type) && def.object_type == ObjectType::OPENCL_BUFFER && - def.data_layout == DataLayout::BHWC; -} - -bool IsOpenCLTensor(const ObjectDef &def) -{ - const bool is_buffer_tensor = - def.object_type == ObjectType::OPENCL_BUFFER && def.data_layout == DataLayout::DHWC4; - const bool is_image2d_tensor = - def.object_type == ObjectType::OPENCL_TEXTURE && def.data_layout == DataLayout::HDWC4; - const bool is_image2d_array_tensor = - def.object_type == ObjectType::OPENCL_TEXTURE && def.data_layout == DataLayout::DHWC4; - const bool is_single_image_tensor = - def.object_type == ObjectType::OPENCL_TEXTURE && def.data_layout == DataLayout::BHWC; - return IsSupportedDataType(def.data_type) && (is_buffer_tensor || is_image2d_tensor || - is_image2d_array_tensor || is_single_image_tensor); -} - -absl::Status GetOpenCLMemory(const TensorObject &obj, cl_mem *memory) -{ - auto texture = absl::get_if<OpenClTexture>(&obj); - auto buffer = absl::get_if<OpenClBuffer>(&obj); - if (texture && texture->memobj) - { - *memory = texture->memobj; - } - else if (buffer && buffer->memobj) - { - *memory = buffer->memobj; - } - else - { - return absl::InvalidArgumentError("Missing OpenCL object."); - } - return absl::OkStatus(); -} - -// Implements conversion from OpenCL tensor to another OpenCL tensor. 
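A minimal usage sketch of the converter interface assembled in this file, assuming an already-initialized Environment and two valid cl_mem handles (src_mem and dst_mem are hypothetical names, and the Dimensions constructor call is an assumption); converters are obtained through NewConverterBuilder(), declared at the end of this file:

// Describe both endpoints: a FLOAT32 DHWC4 OpenCL buffer and a FLOAT32 DHWC4
// OpenCL texture with identical dimensions, so the tensor-to-tensor path applies.
TensorObjectDef src_def;
src_def.object_def.data_type = DataType::FLOAT32;
src_def.object_def.object_type = ObjectType::OPENCL_BUFFER;
src_def.object_def.data_layout = DataLayout::DHWC4;
src_def.dimensions = Dimensions(/*b=*/1, /*h=*/8, /*w=*/8, /*c=*/16); // assumed ctor

TensorObjectDef dst_def = src_def;
dst_def.object_def.object_type = ObjectType::OPENCL_TEXTURE;

auto builder = NewConverterBuilder(&environment);
std::unique_ptr<TensorObjectConverter> converter;
if (builder->IsSupported(src_def, dst_def) &&
    builder->MakeConverter(src_def, dst_def, &converter).ok())
{
  // Aggregate initialization of OpenClBuffer/OpenClTexture is assumed here;
  // both wrap a single cl_mem memobj.
  absl::Status st = converter->Convert(OpenClBuffer{src_mem}, OpenClTexture{dst_mem});
}

The builder walks the IsSupported() predicates in order and instantiates the first converter class that accepts the pair, as shown in OpenClTensorConverterBuilder::MakeConverter below.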
-class TensorToTensorConverter : public OpenClConverterImpl -{ -public: - static bool IsSupported(const ObjectDef &input, const ObjectDef &output) - { - return IsOpenCLTensor(input) && IsOpenCLTensor(output); - } - - absl::Status Init(const TensorObjectDef &input_def, const TensorObjectDef &output_def, - Environment *environment) final - { - src_tensor_descriptor_.layout = Layout::BHWC; - src_tensor_descriptor_.storage_type = - ToTensorStorageType(input_def.object_def.object_type, input_def.object_def.data_layout); - src_tensor_descriptor_.data_type = input_def.object_def.data_type; - args_.AddObjectRef("src_tensor", AccessType::READ, - absl::make_unique<TensorDescriptor>(src_tensor_descriptor_)); - - dst_tensor_descriptor_.layout = Layout::BHWC; - dst_tensor_descriptor_.storage_type = - ToTensorStorageType(output_def.object_def.object_type, output_def.object_def.data_layout); - dst_tensor_descriptor_.data_type = output_def.object_def.data_type; - args_.AddObjectRef("dst_tensor", AccessType::WRITE, - absl::make_unique<TensorDescriptor>(dst_tensor_descriptor_)); - - const bool need_fp16_support = input_def.object_def.data_type == DataType::FLOAT16 || - output_def.object_def.data_type == DataType::FLOAT16; - const std::string out_data_type = ToCLDataType(output_def.object_def.data_type); - std::string shader_src; - if (need_fp16_support) - { - shader_src += "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n"; - } - shader_src += - R"(__kernel void tensor_to_tensor($0) { - int linear_id = get_global_id(0); - int x = linear_id / args.dst_tensor.Batch(); - int b = linear_id % args.dst_tensor.Batch(); - int y = get_global_id(1); - int d = get_global_id(2); - if (x >= args.dst_tensor.Width() || y >= args.dst_tensor.Height() || d >= args.dst_tensor.Slices()) return; -)"; - shader_src += - " " + out_data_type + "4 input = args.src_tensor.Read<" + out_data_type + ">(x, y, d, b);\n"; - shader_src += " args.dst_tensor.Write(input, x, y, d, b);\n}"; - queue_ = environment->queue(); - context_ = &environment->context(); - shape_ = BHWC(input_def.dimensions.b, input_def.dimensions.h, input_def.dimensions.w, - input_def.dimensions.c); - RETURN_IF_ERROR(args_.TransformToCLCode(environment->device().info_, {}, &shader_src)); - return environment->program_cache()->GetOrCreateCLKernel( - shader_src, "tensor_to_tensor", environment->context(), environment->device(), &kernel_); - } - - absl::Status Convert(const TensorObject &input_obj, const TensorObject &output_obj) override - { - cl_mem in_memory = nullptr; - RETURN_IF_ERROR(GetOpenCLMemory(input_obj, &in_memory)); - cl_mem out_memory = nullptr; - RETURN_IF_ERROR(GetOpenCLMemory(output_obj, &out_memory)); - - Tensor src_tensor; - RETURN_IF_ERROR( - CreateSharedTensor(*context_, in_memory, shape_, src_tensor_descriptor_, &src_tensor)); - Tensor dst_tensor; - RETURN_IF_ERROR( - CreateSharedTensor(*context_, out_memory, shape_, dst_tensor_descriptor_, &dst_tensor)); - - RETURN_IF_ERROR(args_.SetObjectRef("src_tensor", &src_tensor)); - RETURN_IF_ERROR(args_.SetObjectRef("dst_tensor", &dst_tensor)); - - RETURN_IF_ERROR(args_.Bind(kernel_.kernel())); - const int3 grid = - int3(dst_tensor.Width() * dst_tensor.Batch(), dst_tensor.Height(), dst_tensor.Slices()); - const int3 work_group_size = {16, 8, 1}; - const int3 work_groups_count = GetWorkGroupsCount(grid, work_group_size); - return queue_->Dispatch(kernel_, work_groups_count, work_group_size); - } - -private: - TensorDescriptor src_tensor_descriptor_; - TensorDescriptor dst_tensor_descriptor_; -}; - -// 
Implements conversion from OpenCL-specific tensor layout to BHWC OpenCL -// buffer. -class TensorToBHWCBufferConverter : public OpenClConverterImpl -{ -public: - static bool IsSupported(const ObjectDef &input, const ObjectDef &output) - { - return IsOpenCLTensor(input) && IsBHWCOpenCLBuffer(output); - } - - absl::Status Init(const TensorObjectDef &input_def, const TensorObjectDef &output_def, - Environment *environment) final - { - TensorStorageType src_tensor_type = - ToTensorStorageType(input_def.object_def.object_type, input_def.object_def.data_layout); - tensor_descriptor_.layout = Layout::BHWC; - tensor_descriptor_.storage_type = src_tensor_type; - tensor_descriptor_.data_type = input_def.object_def.data_type; - args_.AddObjectRef("tensor", AccessType::READ, - absl::make_unique<TensorDescriptor>(tensor_descriptor_)); - - const bool need_fp16_support = input_def.object_def.data_type == DataType::FLOAT16 || - output_def.object_def.data_type == DataType::FLOAT16; - std::string shader_src; - if (need_fp16_support) - { - shader_src += "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n"; - } - const std::string out_data_type = ToCLDataType(output_def.object_def.data_type); - shader_src += "__kernel void tensor_to_bhwc("; - shader_src += "__global " + out_data_type + "* dst, $0) {\n"; - shader_src += R"( int linear_id = get_global_id(0); - int x = linear_id / args.tensor.Batch(); - int b = linear_id % args.tensor.Batch(); - int y = get_global_id(1); - int d = get_global_id(2); - if (x >= args.tensor.Width() || y >= args.tensor.Height() || d >= args.tensor.Slices()) return; -)"; - shader_src += - " " + out_data_type + "4 input = args.tensor.Read<" + out_data_type + ">(x, y, d, b);\n"; - shader_src += R"( int c = d * 4; - int index = ((b * args.tensor.Height() + y) * args.tensor.Width() + x) * args.tensor.Channels() + c; - - dst[index] = input.x; - if (c + 1 < args.tensor.Channels()) { - dst[index + 1] = input.y; - } - if (c + 2 < args.tensor.Channels()) { - dst[index + 2] = input.z; - } - if (c + 3 < args.tensor.Channels()) { - dst[index + 3] = input.w; - } -})"; - queue_ = environment->queue(); - context_ = &environment->context(); - shape_ = BHWC(input_def.dimensions.b, input_def.dimensions.h, input_def.dimensions.w, - input_def.dimensions.c); - RETURN_IF_ERROR(args_.TransformToCLCode(environment->device().info_, {}, &shader_src)); - return environment->program_cache()->GetOrCreateCLKernel( - shader_src, "tensor_to_bhwc", environment->context(), environment->device(), &kernel_); - } - - absl::Status Convert(const TensorObject &input_obj, const TensorObject &output_obj) override - { - auto output = absl::get_if<OpenClBuffer>(&output_obj); - if (!output || !output->memobj) - { - return absl::InvalidArgumentError("Missing output in tensor_to_bhwc converter"); - } - - cl_mem in_memory = nullptr; - RETURN_IF_ERROR(GetOpenCLMemory(input_obj, &in_memory)); - Tensor tensor; - RETURN_IF_ERROR(CreateSharedTensor(*context_, in_memory, shape_, tensor_descriptor_, &tensor)); - return DispatchKernel(output->memobj, &tensor); - } -}; - -// Implements conversion from BHWC OpenCL buffer to OpenCL-specific tensor -// layout. 
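The address arithmetic shared by this converter and the previous one can be read directly off the kernel strings above; a plain C++ restatement, for illustration only:

// BHWC linear offset of element (b, y, x, c), exactly as the kernels compute it:
inline int BhwcIndex(int H, int W, int C, int b, int y, int x, int c)
{
  return ((b * H + y) * W + x) * C + c;
}
// On the tensor side, channels are packed four per slice: channel c lives in
// slice d = c / 4, in component c % 4 of the FLT4 value. That packing is why
// the kernels guard the channel tail with the c + 1 < Channels(),
// c + 2 < Channels(), c + 3 < Channels() checks.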
-class BHWCBufferToTensorConverter : public OpenClConverterImpl -{ -public: - static bool IsSupported(const ObjectDef &input, const ObjectDef &output) - { - return IsBHWCOpenCLBuffer(input) && IsOpenCLTensor(output); - } - - std::pair<std::string, std::string> GetFromBhwcKernel(const TensorObjectDef &input_def, - const TensorObjectDef &) const - { - return std::make_pair("__global " + ToCLDataType(input_def.object_def.data_type) + "* src", - R"(int c = d * 4; - int index = ((b * args.tensor.Height() + y) * args.tensor.Width() + x) * args.tensor.Channels() + c; - result.x = src[index]; - result.y = c + 1 < args.tensor.Channels() ? src[index + 1] : 1; - result.z = c + 2 < args.tensor.Channels() ? src[index + 2] : 2; - result.w = c + 3 < args.tensor.Channels() ? src[index + 3] : 3; -)"); - } - - absl::Status Init(const TensorObjectDef &input_def, const TensorObjectDef &output_def, - Environment *environment) final - { - auto params_kernel = GetFromBhwcKernel(input_def, output_def); - - TensorStorageType dst_tensor_type = - ToTensorStorageType(output_def.object_def.object_type, output_def.object_def.data_layout); - tensor_descriptor_.layout = Layout::BHWC; - tensor_descriptor_.storage_type = dst_tensor_type; - tensor_descriptor_.data_type = output_def.object_def.data_type; - args_.AddObjectRef("tensor", AccessType::WRITE, - absl::make_unique<TensorDescriptor>(tensor_descriptor_)); - - const bool need_fp16_support = input_def.object_def.data_type == DataType::FLOAT16 || - output_def.object_def.data_type == DataType::FLOAT16; - std::string shader_src; - if (need_fp16_support) - { - shader_src += "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n"; - } - const std::string in_data_type = ToCLDataType(input_def.object_def.data_type); - const std::string out_data_type = ToCLDataType(output_def.object_def.data_type); - shader_src += "__kernel void bhwc_to_tensor("; - shader_src += "__global " + in_data_type + "* src, $0) {\n"; - - shader_src += R"( int linear_id = get_global_id(0); - int x = linear_id / args.tensor.Batch(); - int b = linear_id % args.tensor.Batch(); - int y = get_global_id(1); - int d = get_global_id(2); - - if (x >= args.tensor.Width() || y >= args.tensor.Height() || d >= args.tensor.Slices()) return; -)"; - shader_src += " " + out_data_type + "4 result;\n"; - shader_src += R"( int c = d * 4; - int index = ((b * args.tensor.Height() + y) * args.tensor.Width() + x) * args.tensor.Channels() + c; - result.x = src[index]; - result.y = c + 1 < args.tensor.Channels() ? src[index + 1] : 1; - result.z = c + 2 < args.tensor.Channels() ? src[index + 2] : 2; - result.w = c + 3 < args.tensor.Channels() ? 
src[index + 3] : 3; -)"; - shader_src += " args.tensor.Write(result, x, y, d, b);\n}"; - queue_ = environment->queue(); - context_ = &environment->context(); - shape_ = BHWC(output_def.dimensions.b, output_def.dimensions.h, output_def.dimensions.w, - output_def.dimensions.c); - RETURN_IF_ERROR(args_.TransformToCLCode(environment->device().info_, {}, &shader_src)); - return environment->program_cache()->GetOrCreateCLKernel( - shader_src, "bhwc_to_tensor", environment->context(), environment->device(), &kernel_); - } - - absl::Status Convert(const TensorObject &input_obj, const TensorObject &output_obj) override - { - auto input = absl::get_if<OpenClBuffer>(&input_obj); - if (!input || !input->memobj) - { - return absl::InvalidArgumentError("Missing input in bhwc_to_tensor converter"); - } - cl_mem out_memory = nullptr; - RETURN_IF_ERROR(GetOpenCLMemory(output_obj, &out_memory)); - Tensor tensor; - RETURN_IF_ERROR(CreateSharedTensor(*context_, out_memory, shape_, tensor_descriptor_, &tensor)); - return DispatchKernel(input->memobj, &tensor); - } -}; - -std::array<size_t, 3> CalculateTextureRegion(const TensorObjectDef &def) -{ - const auto &dims = def.dimensions; - std::array<size_t, 3> region = {0, 0, 1}; - switch (ToTensorStorageType(def.object_def.object_type, def.object_def.data_layout)) - { - case TensorStorageType::SINGLE_TEXTURE_2D: - region[0] = static_cast<size_t>(dims.w * dims.b); - region[1] = static_cast<size_t>(dims.h); - break; - case TensorStorageType::TEXTURE_2D: - region[0] = static_cast<size_t>(dims.w * dims.b); - region[1] = static_cast<size_t>(dims.h * dims.d()); - break; - case TensorStorageType::TEXTURE_ARRAY: - region[0] = static_cast<size_t>(dims.w * dims.b); - region[1] = static_cast<size_t>(dims.h); - region[2] = static_cast<size_t>(dims.d()); - break; - default: - break; - } - return region; -} - -bool IsOpenClTextureOrBuffer(ObjectType type) -{ - return type == ObjectType::OPENCL_BUFFER || type == ObjectType::OPENCL_TEXTURE; -} - -// Copies data from one object of the same type and layout to another object. 
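The buffer path of this copier moves a whole tensor with a single clEnqueueCopyBuffer call; the byte count in Copy() mirrors the slice-aligned storage layout described above. A worked instance of that size expression:

// bytes = SizeOf(data_type) * shape.w * shape.h * AlignByN(shape.c, 4) * shape.b
// For FLOAT32 with BHWC = (1, 8, 8, 3), channels align up from 3 to 4, so the
// enqueued copy covers 4 * 8 * 8 * 4 * 1 = 1024 bytes.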
-class TrivialCopier : public OpenClConverterImpl -{ -public: - static bool IsSupported(const ObjectDef &input, const ObjectDef &output) - { - return IsOpenClTextureOrBuffer(input.object_type) && input.data_type == output.data_type && - input.object_type == output.object_type && input.data_layout == output.data_layout; - } - - absl::Status Init(const TensorObjectDef &input_def, const TensorObjectDef &output_def, - Environment *environment) final - { - shape_ = BHWC(input_def.dimensions.b, input_def.dimensions.h, input_def.dimensions.w, - input_def.dimensions.c); - data_type_ = input_def.object_def.data_type; - queue_ = environment->queue(); - region_ = CalculateTextureRegion(output_def); - return absl::OkStatus(); - } - - absl::Status Convert(const TensorObject &input_obj, const TensorObject &output_obj) override - { - auto texture_input = absl::get_if<OpenClTexture>(&input_obj); - auto texture_output = absl::get_if<OpenClTexture>(&output_obj); - if (texture_input && texture_output) - { - return Copy(*texture_input, *texture_output); - } - auto buffer_input = absl::get_if<OpenClBuffer>(&input_obj); - auto buffer_output = absl::get_if<OpenClBuffer>(&output_obj); - if (buffer_input && buffer_output) - { - return Copy(*buffer_input, *buffer_output); - } - return absl::InternalError("Unexpected object"); - } - - absl::Status Copy(const OpenClBuffer &input, const OpenClBuffer &output) - { - if (input.memobj == output.memobj) - { - return absl::OkStatus(); - } - return GetOpenCLError(clEnqueueCopyBuffer(queue_->queue(), input.memobj, output.memobj, 0, 0, - SizeOf(data_type_) * shape_.w * shape_.h * - AlignByN(shape_.c, 4) * shape_.b, - 0, nullptr, nullptr)); - } - - absl::Status Copy(const OpenClTexture &input, const OpenClTexture &output) - { - if (input.memobj == output.memobj) - { - return absl::OkStatus(); - } - size_t origin[3] = {0, 0, 0}; - return GetOpenCLError(clEnqueueCopyImage(queue_->queue(), input.memobj, output.memobj, origin, - origin, region_.data(), 0, nullptr, nullptr)); - } - -private: - DataType data_type_ = DataType::UNKNOWN; - std::array<size_t, 3> region_; -}; - -// Copies data from/to CPU into a tensor. -class CpuCopier : public OpenClConverterImpl -{ -public: - static bool IsSupported(const ObjectDef &input, const ObjectDef &output) - { - return input.data_type == output.data_type && input.data_layout == output.data_layout && - ((input.object_type == ObjectType::CPU_MEMORY && - IsOpenClTextureOrBuffer(output.object_type)) || - (output.object_type == ObjectType::CPU_MEMORY && - IsOpenClTextureOrBuffer(input.object_type))); - } - - absl::Status Init(const TensorObjectDef &input_def, const TensorObjectDef &output_def, - Environment *environment) final - { - - region_ = CalculateTextureRegion( - input_def.object_def.object_type == ObjectType::CPU_MEMORY ? 
output_def : input_def); - queue_ = environment->queue(); - return absl::OkStatus(); - } - - absl::Status Convert(const TensorObject &input_obj, const TensorObject &output_obj) override - { - auto cpu_input = absl::get_if<CpuMemory>(&input_obj); - auto cpu_output = absl::get_if<CpuMemory>(&output_obj); - - if (cpu_input) - { - auto texture_output = absl::get_if<OpenClTexture>(&output_obj); - if (texture_output) - { - return queue_->EnqueueWriteImage(texture_output->memobj, - int3(region_[0], region_[1], region_[2]), cpu_input->data); - } - auto buffer_output = absl::get_if<OpenClBuffer>(&output_obj); - if (buffer_output) - { - return queue_->EnqueueWriteBuffer(buffer_output->memobj, cpu_input->size_bytes, - cpu_input->data); - } - } - else if (cpu_output) - { - auto texture_input = absl::get_if<OpenClTexture>(&input_obj); - if (texture_input) - { - return queue_->EnqueueReadImage(texture_input->memobj, - int3(region_[0], region_[1], region_[2]), cpu_output->data); - } - auto buffer_input = absl::get_if<OpenClBuffer>(&input_obj); - if (buffer_input) - { - return queue_->EnqueueReadBuffer(buffer_input->memobj, cpu_output->size_bytes, - cpu_output->data); - } - } - return absl::InternalError("Unexpected object"); - } - -private: - std::array<size_t, 3> region_; -}; - -class OpenClTensorConverterBuilder : public TensorObjectConverterBuilder -{ -public: - explicit OpenClTensorConverterBuilder(Environment *environment) : environment_(environment) {} - - bool IsSupported(const TensorObjectDef &input, const TensorObjectDef &output) const final - { - const auto &input_def = input.object_def; - const auto &output_def = output.object_def; - return input.dimensions == output.dimensions && - (TrivialCopier::IsSupported(input_def, output_def) || - TensorToTensorConverter::IsSupported(input_def, output_def) || - CpuCopier::IsSupported(input_def, output_def) || - TensorToBHWCBufferConverter::IsSupported(input_def, output_def) || - BHWCBufferToTensorConverter::IsSupported(input_def, output_def)); - } - - absl::Status MakeConverter(const TensorObjectDef &input, const TensorObjectDef &output, - std::unique_ptr<TensorObjectConverter> *converter) final - { - std::unique_ptr<OpenClConverterImpl> impl; - const auto &input_def = input.object_def; - const auto &output_def = output.object_def; - if (TrivialCopier::IsSupported(input_def, output_def)) - { - impl = absl::make_unique<TrivialCopier>(); - } - else if (TensorToTensorConverter::IsSupported(input_def, output_def)) - { - impl = absl::make_unique<TensorToTensorConverter>(); - } - else if (CpuCopier::IsSupported(input_def, output_def)) - { - impl = absl::make_unique<CpuCopier>(); - } - else if (TensorToBHWCBufferConverter::IsSupported(input_def, output_def)) - { - impl = absl::make_unique<TensorToBHWCBufferConverter>(); - } - else if (BHWCBufferToTensorConverter::IsSupported(input_def, output_def)) - { - impl = absl::make_unique<BHWCBufferToTensorConverter>(); - } - else - { - return absl::UnimplementedError("Unsupported conversion"); - } - RETURN_IF_ERROR(impl->Init(input, output, environment_)); - *converter = std::move(impl); - return absl::OkStatus(); - } - - Environment *environment_; -}; - -} // namespace - -std::unique_ptr<TensorObjectConverterBuilder> NewConverterBuilder(Environment *environment) -{ - return absl::make_unique<OpenClTensorConverterBuilder>(environment); -} - -} // namespace gpu_cl -} // namespace backend -} // namespace onert diff --git a/runtime/onert/backend/gpu_cl/open_cl/kernels/Converter.h 
b/runtime/onert/backend/gpu_cl/open_cl/kernels/Converter.h deleted file mode 100644 index d69ec85bb..000000000 --- a/runtime/onert/backend/gpu_cl/open_cl/kernels/Converter.h +++ /dev/null @@ -1,40 +0,0 @@ -/* - * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright 2019 The TensorFlow Authors. All Rights Reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef __ONERT_BACKEND_GPU_CL_OPENCL_KERNELS_CONVERTER_H__ -#define __ONERT_BACKEND_GPU_CL_OPENCL_KERNELS_CONVERTER_H__ - -#include <memory> - -#include "open_cl/Environment.h" -#include "open_cl/Spi.h" - -namespace onert -{ -namespace backend -{ -namespace gpu_cl -{ -// Supports conversions from BHWC to internal OpenCL tensor representation and -// back. Also supports F16/F32. -std::unique_ptr<TensorObjectConverterBuilder> NewConverterBuilder(Environment *environment); - -} // namespace gpu_cl -} // namespace backend -} // namespace onert - -#endif // __ONERT_BACKEND_GPU_CL_OPENCL_KERNELS_CONVERTER_H__ diff --git a/runtime/onert/backend/gpu_cl/open_cl/kernels/DepthwiseConv.cc b/runtime/onert/backend/gpu_cl/open_cl/kernels/DepthwiseConv.cc deleted file mode 100644 index e409fef47..000000000 --- a/runtime/onert/backend/gpu_cl/open_cl/kernels/DepthwiseConv.cc +++ /dev/null @@ -1,382 +0,0 @@ -/* - * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright 2019 The TensorFlow Authors. All Rights Reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "DepthwiseConv.h" - -#include <string> -#include <utility> -#include <vector> - -#include "open_cl/ClDevice.h" -#include "open_cl/kernels/Util.h" -#include "open_cl/kernels/WorkGroupPicking.h" -#include "open_cl/LinearStorage.h" - -namespace onert -{ -namespace backend -{ -namespace gpu_cl -{ -namespace -{ - -bool IsSpecializedCase(int channel_multiplier) -{ - return channel_multiplier == 1 || channel_multiplier == 2 || channel_multiplier == 4; -} - -std::string GetSrcValue(int channel_multiplier, const std::string coords) -{ - std::string c; - if (channel_multiplier == 1) - { - c += " FLT4 src_final = args.src_tensor.Read(" + coords + ", S);\n"; - } - else if (channel_multiplier == 2) - { - c += " int s_layer = S / 2;\n"; - c += " FLT4 src = args.src_tensor.Read(" + coords + ", s_layer);\n"; - c += " FLT2 t0 = S % 2 == 0 ? 
src.xy : src.zw;\n";
-    c += "  FLT4 src_final = (FLT4)(t0.x, t0.x, t0.y, t0.y);\n";
-  }
-  else if (channel_multiplier == 4)
-  {
-    c += "  int s_layer = S / 4;\n";
-    c += "  FLT4 src = args.src_tensor.Read(" + coords + ", s_layer);\n";
-    c += "  FLT t0 = src.x;\n";
-    c += "  int remainder = S % 4;\n";
-    c += "  if (remainder == 1) t0 = src.y;\n";
-    c += "  if (remainder == 2) t0 = src.z;\n";
-    c += "  if (remainder == 3) t0 = src.w;\n";
-    c += "  FLT4 src_final = (FLT4)(t0, t0, t0, t0);\n";
-  }
-  else
-  {
-    c += "  int s_layer = S / args.ch_multiplier;\n";
-    c += "  FLT4 src = args.src_tensor.Read(" + coords + ", s_layer);\n";
-    c += "  int s_offset = (S % args.ch_multiplier) * 4;\n";
-    c += "  FLT4 src_final;\n";
-    c += "  FLT temp_arr[4] = {src.x, src.y, src.z, src.w};\n";
-    c += "  src_final.x = temp_arr[(s_offset + 0) / args.ch_multiplier];\n";
-    c += "  src_final.y = temp_arr[(s_offset + 1) / args.ch_multiplier];\n";
-    c += "  src_final.z = temp_arr[(s_offset + 2) / args.ch_multiplier];\n";
-    c += "  src_final.w = temp_arr[(s_offset + 3) / args.ch_multiplier];\n";
-  }
-
-  return c;
-}
-
-std::string GenerateDepthwiseConvolutionCode(const OperationDef &op_def, bool stride_correction,
-                                             int channel_multiplier, bool weights_are_buffer,
-                                             bool dynamic_weights, GPUOperation *op)
-{
-  auto src_desc = op_def.src_tensors[0];
-  src_desc.SetTextureAddressMode(TextureAddressMode::ZERO);
-  if (op_def.IsBatchSupported())
-  {
-    src_desc.SetStateVar("BatchedWidth", "true");
-  }
-  op->AddSrcTensor("src_tensor", src_desc);
-  if (dynamic_weights)
-  {
-    op->AddSrcTensor("weights", op_def.src_tensors[1]);
-  }
-
-  auto dst_desc = op_def.dst_tensors[0];
-  if (op_def.IsBatchSupported())
-  {
-    dst_desc.SetStateVar("BatchedWidth", "true");
-  }
-  op->AddDstTensor("dst_tensor", dst_desc);
-
-  const auto src_tensor_type = op_def.src_tensors[0].storage_type;
-
-  std::string c = GetCommonDefines(op_def.precision);
-
-  const bool manual_clamp = src_tensor_type == TensorStorageType::BUFFER ||
-                            src_tensor_type == TensorStorageType::IMAGE_BUFFER;
-
-  c += "__kernel void main_function(\n";
-  c += "$0) {\n";
-  c += "  int X = get_global_id(0);\n";
-  if (op_def.dst_tensors[0].HasAxis(Axis::DEPTH))
-  {
-    c += "  int linear_id_1 = get_global_id(1);\n";
-    c += "  int Y = linear_id_1 / args.dst_tensor.Depth();\n";
-    c += "  int Z = linear_id_1 % args.dst_tensor.Depth();\n";
-  }
-  else
-  {
-    c += "  int Y = get_global_id(1);\n";
-  }
-  c += "  int S = get_global_id(2);\n";
-  c += "  if (X >= args.dst_tensor.Width() || Y >= args.dst_tensor.Height() || "
-       "S >= args.dst_tensor.Slices()) { \n";
-  c += "    return; \n";
-  c += "  } \n";
-  c += "  ACCUM_FLT4 r = (ACCUM_FLT4)(0.0f, 0.0f, 0.0f, 0.0f);\n";
-  if (stride_correction)
-  {
-    c += "  int x_offseted = " +
-         GetXStrideCorrectedV2("X", "args.src_tensor.Batch()", "args.stride_x", "args.padding_x") +
-         ";\n";
-  }
-  else
-  {
-    if (op_def.IsBatchSupported())
-    {
-      c += "  int x_offseted = X * args.stride_x + args.padding_x * "
-           "args.src_tensor.Batch();\n";
-    }
-    else
-    {
-      c += "  int x_offseted = X * args.stride_x + args.padding_x;\n";
-    }
-  }
-  c += "  int y_offseted = Y * args.stride_y + args.padding_y;\n";
-  if (!dynamic_weights)
-  {
-    std::string weights_offset = "args.kernel_size_x * args.kernel_size_y";
-    if (op_def.dst_tensors[0].HasAxis(Axis::DEPTH))
-    {
-      c += "  int z_offseted = Z * args.stride_z + args.padding_z;\n";
-      weights_offset += " * args.kernel_size_z";
-    }
-    if (weights_are_buffer)
-    {
-      c += "  int fx_c = S * " + weights_offset + ";\n";
-    }
-    else
-    {
-      c += "  int fx_c = 0;\n";
-    }
-  }
-
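// For reference, the generic GetSrcValue() path above works out as follows
// for ch_multiplier == 3 and output slice S == 4 (a hand-worked example, not
// emitted code):
//   s_layer  = 4 / 3 = 1
//   s_offset = (4 % 3) * 4 = 4
//   src_final = { temp_arr[(4 + 0) / 3], temp_arr[(4 + 1) / 3],
//                 temp_arr[(4 + 2) / 3], temp_arr[(4 + 3) / 3] }
//             = { src.y, src.y, src.z, src.z }
// i.e. each source channel is replicated ch_multiplier times along the output
// channel axis, which is the expected depthwise channel-multiplier layout.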
std::string kernel_size_x = dynamic_weights ? "args.weights.Width()" : "args.kernel_size_x"; - std::string kernel_size_y = dynamic_weights ? "args.weights.Height()" : "args.kernel_size_y"; - std::string kernel_size_z = dynamic_weights ? "args.weights.Depth()" : "args.kernel_size_z"; - - std::string flat_coords = "x_c, y_c"; - if (manual_clamp) - { - std::string check = "!outside_x && !outside_y"; - if (op_def.dst_tensors[0].HasAxis(Axis::DEPTH)) - { - check += " && !outside_z"; - flat_coords += ", z_c"; - c += " for (int kz = 0; kz < " + kernel_size_z + "; ++kz) {\n"; - c += " int z_c = z_offseted + kz * args.dilation_z;\n"; - c += " bool outside_z = z_c < 0 || z_c >= args.src_tensor.Depth();\n"; - } - c += " for (int ky = 0; ky < " + kernel_size_y + "; ++ky) {\n"; - c += " int y_c = y_offseted + ky * args.dilation_y;\n"; - c += " bool outside_y = y_c < 0 || y_c >= args.src_tensor.Height();\n"; - c += " for (int kx = 0; kx < " + kernel_size_x + "; ++kx) {\n"; - const std::string dilation_x = - op_def.IsBatchSupported() ? "args.dilation_x * args.src_tensor.Batch()" : "args.dilation_x"; - c += " int x_c = x_offseted + kx * " + dilation_x + ";\n"; - c += " bool outside_x = x_c < 0 || x_c >= args.src_tensor.Width();\n"; - c += " if (" + check + ") {\n"; - if (dynamic_weights) - { - c += " FLT4 f = args.weights.Read(kx, ky, S);\n"; - } - else - { - if (weights_are_buffer) - { - c += " FLT4 f = args.weights.Read(fx_c);\n"; - } - else - { - c += " FLT4 f = args.weights.Read(fx_c, S);\n"; - } - } - c += GetSrcValue(channel_multiplier, flat_coords); - c += " r += TO_ACCUM_TYPE(src_final * f);\n"; - c += " };\n"; - if (!dynamic_weights) - { - c += " fx_c++;\n"; - } - c += " }\n"; - c += " }\n"; - if (op_def.dst_tensors[0].HasAxis(Axis::DEPTH)) - { - c += " }\n"; - } - } - else - { // Texture types with ZERO clamping - if (op_def.dst_tensors[0].HasAxis(Axis::DEPTH)) - { - flat_coords += ", z_c"; - c += " for (int kz = 0; kz < " + kernel_size_z + "; ++kz) {\n"; - c += " int z_c = z_offseted + kz * args.dilation_z;\n"; - if (src_tensor_type != TensorStorageType::TEXTURE_3D) - { // Only TEXTURE_3D supports clamping - // in DEPTH dimension - c += " if (z_c < 0 || z_c >= args.src_tensor.Depth()) {\n"; - c += " fx_c += args.kernel_size_y * args.kernel_size_x;\n"; - c += " continue;\n"; - c += " }\n"; - } - } - c += " for (int ky = 0; ky < " + kernel_size_y + "; ++ky) {\n"; - c += " int y_c = y_offseted + ky * args.dilation_y;\n"; - c += " for (int kx = 0; kx < " + kernel_size_x + "; ++kx) {\n"; - const std::string dilation_x = - op_def.IsBatchSupported() ? 
"args.dilation_x * args.src_tensor.Batch()" : "args.dilation_x"; - c += " int x_c = x_offseted + kx * " + dilation_x + ";\n"; - c += GetSrcValue(channel_multiplier, flat_coords); - if (dynamic_weights) - { - c += " FLT4 f = args.weights.Read(kx, ky, S);\n"; - } - else - { - if (weights_are_buffer) - { - c += " FLT4 f = args.weights.Read(fx_c);\n"; - } - else - { - c += " FLT4 f = args.weights.Read(fx_c, S);\n"; - } - c += " fx_c++;\n"; - } - c += " r += TO_ACCUM_TYPE(src_final * f);\n"; - c += " }\n"; - c += " }\n"; - if (op_def.dst_tensors[0].HasAxis(Axis::DEPTH)) - { - c += " }\n"; - } - } - c += " FLT4 res0 = TO_FLT4(r) + args.biases.Read(S);\n"; - if (op_def.dst_tensors[0].HasAxis(Axis::DEPTH)) - { - c += " args.dst_tensor.Write(res0, X, Y, Z, S);\n"; - } - else - { - c += " args.dst_tensor.Write(res0, X, Y, S);\n"; - } - c += "}\n"; - - return c; -} -} // namespace - -GPUOperation CreateDepthwiseConvolution2D(const DeviceInfo &device_info, - const OperationDef &definition, - const DepthwiseConvolution2DAttributes &attr) -{ - bool weights_are_buffer = device_info.IsMali(); - GPUOperation op(definition); - op.args_.AddInt("kernel_size_x", attr.weights.shape.w); - op.args_.AddInt("stride_x", attr.strides.w); - op.args_.AddInt("padding_x", -attr.padding.prepended.w); - op.args_.AddInt("dilation_x", attr.dilations.w); - op.args_.AddInt("kernel_size_y", attr.weights.shape.h); - op.args_.AddInt("stride_y", attr.strides.h); - op.args_.AddInt("padding_y", -attr.padding.prepended.h); - op.args_.AddInt("dilation_y", attr.dilations.h); - if (!IsSpecializedCase(attr.weights.shape.o)) - { - op.args_.AddInt("ch_multiplier", attr.weights.shape.o); - } - const bool stride_correction = definition.IsBatchSupported() && attr.strides.w != 1; - op.code_ = GenerateDepthwiseConvolutionCode(definition, stride_correction, attr.weights.shape.o, - weights_are_buffer, false, &op); - UploadWeightsForDWConv2D(attr.weights, weights_are_buffer, definition.precision, &op); - op.tensor_to_grid_ = TensorToGrid::kWBToX_HDToY_SToZ; - - TensorLinearDescriptor desc; - desc.storage_type = - weights_are_buffer ? LinearStorageType::BUFFER : LinearStorageType::TEXTURE_2D; - desc.element_type = definition.GetDataType(); - desc.UploadLinearData(attr.bias); - op.args_.AddObject("biases", absl::make_unique<TensorLinearDescriptor>(std::move(desc))); - return op; -} - -GPUOperation -CreateDepthwiseConvolution2DDynamicWeights(const DeviceInfo &device_info, - const OperationDef &definition, - const DepthwiseConvolution2DAttributes &attr) -{ - GPUOperation op(definition); - op.args_.AddInt("stride_x", attr.strides.w); - op.args_.AddInt("padding_x", -attr.padding.prepended.w); - op.args_.AddInt("dilation_x", attr.dilations.w); - op.args_.AddInt("stride_y", attr.strides.h); - op.args_.AddInt("padding_y", -attr.padding.prepended.h); - op.args_.AddInt("dilation_y", attr.dilations.h); - const bool stride_correction = definition.IsBatchSupported() && attr.strides.w != 1; - op.code_ = GenerateDepthwiseConvolutionCode(definition, stride_correction, 1, false, true, &op); - op.tensor_to_grid_ = TensorToGrid::kWBToX_HDToY_SToZ; - - TensorLinearDescriptor desc; - desc.storage_type = - device_info.IsMali() ? 
LinearStorageType::BUFFER : LinearStorageType::TEXTURE_2D; - desc.element_type = definition.GetDataType(); - desc.UploadLinearData(attr.bias); - op.args_.AddObject("biases", absl::make_unique<TensorLinearDescriptor>(std::move(desc))); - return op; -} - -GPUOperation CreateDepthwiseConvolution3D(const DeviceInfo &device_info, - const OperationDef &definition, - const DepthwiseConvolution3DAttributes &attr) -{ - bool weights_are_buffer = device_info.IsMali(); - GPUOperation op(definition); - op.args_.AddInt("kernel_size_x", attr.weights.shape.w); - op.args_.AddInt("stride_x", attr.strides.w); - op.args_.AddInt("padding_x", -attr.padding.prepended.w); - op.args_.AddInt("dilation_x", attr.dilations.w); - op.args_.AddInt("kernel_size_y", attr.weights.shape.h); - op.args_.AddInt("stride_y", attr.strides.h); - op.args_.AddInt("padding_y", -attr.padding.prepended.h); - op.args_.AddInt("dilation_y", attr.dilations.h); - op.args_.AddInt("kernel_size_z", attr.weights.shape.d); - op.args_.AddInt("stride_z", attr.strides.d); - op.args_.AddInt("padding_z", -attr.padding.prepended.d); - op.args_.AddInt("dilation_z", attr.dilations.d); - if (!IsSpecializedCase(attr.weights.shape.o)) - { - op.args_.AddInt("ch_multiplier", attr.weights.shape.o); - } - const bool stride_correction = definition.IsBatchSupported() && attr.strides.w != 1; - op.code_ = GenerateDepthwiseConvolutionCode(definition, stride_correction, attr.weights.shape.o, - weights_are_buffer, false, &op); - UploadWeightsForDWConv3D(attr.weights, weights_are_buffer, definition.precision, &op); - op.tensor_to_grid_ = TensorToGrid::kWBToX_HDToY_SToZ; - - TensorLinearDescriptor desc; - desc.storage_type = - weights_are_buffer ? LinearStorageType::BUFFER : LinearStorageType::TEXTURE_2D; - desc.element_type = definition.GetDataType(); - desc.UploadLinearData(attr.bias); - op.args_.AddObject("biases", absl::make_unique<TensorLinearDescriptor>(std::move(desc))); - return op; -} - -} // namespace gpu_cl -} // namespace backend -} // namespace onert diff --git a/runtime/onert/backend/gpu_cl/open_cl/kernels/DepthwiseConv.h b/runtime/onert/backend/gpu_cl/open_cl/kernels/DepthwiseConv.h deleted file mode 100644 index cbadd9fde..000000000 --- a/runtime/onert/backend/gpu_cl/open_cl/kernels/DepthwiseConv.h +++ /dev/null @@ -1,233 +0,0 @@ -/* - * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright 2019 The TensorFlow Authors. All Rights Reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifndef __ONERT_BACKEND_GPU_CL_OPENCL_KERNELS_DEPTHWISE_CONV_H__ -#define __ONERT_BACKEND_GPU_CL_OPENCL_KERNELS_DEPTHWISE_CONV_H__ - -#include <vector> - -#include "open_cl/Buffer.h" -#include "open_cl/kernels/GpuOperation.h" -#include "open_cl/LinearStorage.h" -#include "open_cl/Tensor.h" -#include "open_cl/Texture2d.h" -#include "open_cl/Util.h" -#include "open_cl/DataType.h" -#include "open_cl/Operations.h" -#include "open_cl/Shape.h" -#include "open_cl/Status.h" -#include "open_cl/Tensor.h" -#include "open_cl/Types.h" - -namespace onert -{ -namespace backend -{ -namespace gpu_cl -{ - -template <DataType S, typename T> -void RearrangeWeightsForDWConv2D(const InternalTensor<OHWI, S> &weights, absl::Span<T> dst) -{ - const int dst_channels = weights.shape.i * weights.shape.o; - const int dst_depth = DivideRoundUp(dst_channels, 4); - const int kernel_x = weights.shape.w; - const int kernel_y = weights.shape.h; - - int counter = 0; - for (int d = 0; d < dst_depth; ++d) - { - for (int y = 0; y < kernel_y; ++y) - { - for (int x = 0; x < kernel_x; ++x) - { - T filter_val; - for (int i = 0; i < 4; ++i) - { - const int d_ch = d * 4 + i; - if (d_ch < dst_channels) - { - const int f_index = - weights.shape.LinearIndex({d_ch % weights.shape.o, y, x, d_ch / weights.shape.o}); - filter_val[i] = weights.data[f_index]; - } - else - { - filter_val[i] = 0.0f; - } - } - dst[counter++] = filter_val; - } - } - } -} - -template <DataType T> -void UploadWeightsForDWConv2D(const InternalTensor<OHWI, T> &weights, bool weights_are_buffer, - CalculationsPrecision precision, GPUOperation *op) -{ - const int dst_channels = weights.shape.i * weights.shape.o; - const int dst_slices = DivideRoundUp(dst_channels, 4); - const int kernel_x = weights.shape.w; - const int kernel_y = weights.shape.h; - - const int elements_count = kernel_x * kernel_y * dst_slices; - - const bool fp32_weights = precision == CalculationsPrecision::F32; - const int float4_size = fp32_weights ? 16 : 8; - - std::vector<uint8_t> data(float4_size * elements_count); - - if (fp32_weights) - { - float4 *ptr = reinterpret_cast<float4 *>(data.data()); - RearrangeWeightsForDWConv2D(weights, absl::MakeSpan(ptr, elements_count)); - } - // TODO - // It doesn't support F16 yet. I will try to add it later. - // - // else { - // half4* ptr = reinterpret_cast<half4*>(data.data()); - // RearrangeWeightsForDWConv2D(weights, absl::MakeSpan(ptr, elements_count)); - // } - - if (weights_are_buffer) - { - BufferDescriptor desc; - desc.element_type = fp32_weights ? DataType::FLOAT32 : DataType::FLOAT16; - desc.element_size = 4; - desc.size = float4_size * elements_count; - desc.data = std::move(data); - op->args_.AddObject("weights", absl::make_unique<BufferDescriptor>(desc)); - } - else - { - Texture2DDescriptor desc; - desc.element_type = fp32_weights ? 
DataType::FLOAT32 : DataType::FLOAT16; - desc.size = int2(kernel_x * kernel_y, dst_slices); - desc.data = std::move(data); - op->args_.AddObject("weights", absl::make_unique<Texture2DDescriptor>(desc)); - } -} - -template <DataType S, typename T> -void RearrangeWeightsForDWConv3D(const InternalTensor<OHWDI, S> &weights, absl::Span<T> dst) -{ - const int dst_channels = weights.shape.i * weights.shape.o; - const int dst_slices = DivideRoundUp(dst_channels, 4); - const int kernel_x = weights.shape.w; - const int kernel_y = weights.shape.h; - const int kernel_z = weights.shape.d; - - int counter = 0; - for (int d = 0; d < dst_slices; ++d) - { - for (int z = 0; z < kernel_z; ++z) - { - for (int y = 0; y < kernel_y; ++y) - { - for (int x = 0; x < kernel_x; ++x) - { - T filter_val; - for (int i = 0; i < 4; ++i) - { - const int d_ch = d * 4 + i; - if (d_ch < dst_channels) - { - const int f_index = weights.shape.LinearIndex( - {d_ch % weights.shape.o, y, x, z, d_ch / weights.shape.o}); - filter_val[i] = weights.data[f_index]; - } - else - { - filter_val[i] = 0.0f; - } - } - dst[counter++] = filter_val; - } - } - } - } -} - -template <DataType T> -void UploadWeightsForDWConv3D(const InternalTensor<OHWDI, T> &weights, bool weights_are_buffer, - CalculationsPrecision precision, GPUOperation *op) -{ - const int dst_channels = weights.shape.i * weights.shape.o; - const int dst_slices = DivideRoundUp(dst_channels, 4); - const int kernel_x = weights.shape.w; - const int kernel_y = weights.shape.h; - const int kernel_z = weights.shape.d; - - const int elements_count = kernel_x * kernel_y * kernel_z * dst_slices; - - const bool fp32_weights = precision == CalculationsPrecision::F32; - const int float4_size = fp32_weights ? 16 : 8; - - std::vector<uint8_t> data(float4_size * elements_count); - - if (fp32_weights) - { - float4 *ptr = reinterpret_cast<float4 *>(data.data()); - RearrangeWeightsForDWConv3D(weights, absl::MakeSpan(ptr, elements_count)); - } - // TODO - // It doesn't support F16 yet. I will try to add it later. - // - // else { - // half4* ptr = reinterpret_cast<half4*>(data.data()); - // RearrangeWeightsForDWConv3D(weights, absl::MakeSpan(ptr, elements_count)); - // } - - if (weights_are_buffer) - { - BufferDescriptor desc; - desc.element_type = fp32_weights ? DataType::FLOAT32 : DataType::FLOAT16; - desc.element_size = 4; - desc.size = float4_size * elements_count; - desc.data = std::move(data); - op->args_.AddObject("weights", absl::make_unique<BufferDescriptor>(std::move(desc))); - } - else - { - Texture2DDescriptor desc; - desc.element_type = fp32_weights ? 
DataType::FLOAT32 : DataType::FLOAT16; - desc.size = int2(kernel_x * kernel_y * kernel_z, dst_slices); - desc.data = std::move(data); - op->args_.AddObject("weights", absl::make_unique<Texture2DDescriptor>(std::move(desc))); - } -} - -GPUOperation CreateDepthwiseConvolution2D(const DeviceInfo &device_info, - const OperationDef &definition, - const DepthwiseConvolution2DAttributes &attr); - -GPUOperation -CreateDepthwiseConvolution2DDynamicWeights(const DeviceInfo &device_info, - const OperationDef &definition, - const DepthwiseConvolution2DAttributes &attr); - -GPUOperation CreateDepthwiseConvolution3D(const DeviceInfo &device_info, - const OperationDef &definition, - const DepthwiseConvolution3DAttributes &attr); - -} // namespace gpu_cl -} // namespace backend -} // namespace onert - -#endif // __ONERT_BACKEND_GPU_CL_OPENCL_KERNELS_DEPTHWISE_CONV_H__ diff --git a/runtime/onert/backend/gpu_cl/open_cl/kernels/DepthwiseConv3x3.cc b/runtime/onert/backend/gpu_cl/open_cl/kernels/DepthwiseConv3x3.cc deleted file mode 100644 index 89a14f14d..000000000 --- a/runtime/onert/backend/gpu_cl/open_cl/kernels/DepthwiseConv3x3.cc +++ /dev/null @@ -1,358 +0,0 @@ -/* - * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright 2019 The TensorFlow Authors. All Rights Reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "DepthwiseConv3x3.h" - -#include <string> -#include <utility> - -#include "open_cl/kernels/Util.h" -#include "open_cl/kernels/WorkGroupPicking.h" -#include "open_cl/Precision.h" -#include "open_cl/Status.h" - -namespace onert -{ -namespace backend -{ -namespace gpu_cl -{ - -DepthwiseConv3x3::DepthwiseConv3x3(const OperationDef &definition, bool weights_are_buffer, - bool local_mem_uploads, const DeviceInfo &device_info) - : GPUOperation(definition), local_mem_uploads_(local_mem_uploads) -{ - work_group_size_ = int3(8, 4, 1); - code_ = GenerateDepthwiseConvCode(definition_, weights_are_buffer, local_mem_uploads_); - - if (definition_.precision == CalculationsPrecision::F16 && device_info.IsPowerVR()) - { - compiler_options_.push_back(CompilerOptions::POWERVR_FP16); - } -} - -DepthwiseConv3x3::DepthwiseConv3x3(DepthwiseConv3x3 &&operation) - : GPUOperation(std::move(operation)), local_mem_uploads_(operation.local_mem_uploads_) -{ -} - -DepthwiseConv3x3 &DepthwiseConv3x3::operator=(DepthwiseConv3x3 &&operation) -{ - if (this != &operation) - { - std::swap(local_mem_uploads_, operation.local_mem_uploads_); - GPUOperation::operator=(std::move(operation)); - } - return *this; -} - -std::string DepthwiseConv3x3::GenerateDepthwiseConvCode(const OperationDef &op_def, - bool weights_are_buffer, - bool local_mem_uploads) -{ - auto src_desc = op_def.src_tensors[0]; - src_desc.SetTextureAddressMode(TextureAddressMode::ZERO); - AddSrcTensor("src_tensor", src_desc); - AddDstTensor("dst_tensor", op_def.dst_tensors[0]); - - const auto src_tensor_type = op_def.src_tensors[0].storage_type; - - const bool manual_clamp = src_tensor_type == TensorStorageType::BUFFER || - src_tensor_type == TensorStorageType::IMAGE_BUFFER; - - std::string c = GetCommonDefines(op_def.precision); - if (local_mem_uploads) - { - c += "__attribute__((reqd_work_group_size(8, 4, 1)))\n"; - } - c += "__kernel void main_function(\n"; - c += "$0) {\n"; - if (op_def.dst_tensors[0].HasAxis(Axis::BATCH)) - { - c += " int linear_id = get_global_id(0);\n"; - c += " int X = (linear_id / args.dst_tensor.Batch()) * 2;\n"; - c += " int B = linear_id % args.dst_tensor.Batch();\n"; - c += " args.dst_tensor.SetBatchRef(B);\n"; - c += " args.src_tensor.SetBatchRef(B);\n"; - } - else - { - c += " int X = get_global_id(0) * 2;\n"; - } - c += " int Y = get_global_id(1) * 2;\n"; - c += " int S = get_global_id(2);\n"; - c += " ACCUM_FLT4 r0 = (ACCUM_FLT4)(0.0f);\n"; - c += " ACCUM_FLT4 r1 = (ACCUM_FLT4)(0.0f);\n"; - c += " ACCUM_FLT4 r2 = (ACCUM_FLT4)(0.0f);\n"; - c += " ACCUM_FLT4 r3 = (ACCUM_FLT4)(0.0f);\n"; - if (!local_mem_uploads) - { - c += " if (X >= args.dst_tensor.Width() || Y >= args.dst_tensor.Height() " - "|| S >= args.dst_tensor.Slices()) { \n"; - c += " return; \n"; - c += " } \n"; - } - if (local_mem_uploads) - { - c += " __local FLT4 f[10];\n"; - c += " event_t e = async_work_group_copy(f, args.weights.GetPtr() + S * " - "10, 10, 0);\n"; - c += " wait_group_events(1, &e);\n"; - } - else if (weights_are_buffer) - { - c += " __global FLT4* f = args.weights.GetPtr() + S * 10;\n"; - } - c += " FLT4 s0;\n"; - c += " FLT4 s1;\n"; - c += " FLT4 s2;\n"; - c += " FLT4 s3;\n"; - std::string W[9] = {"f0", "f1", "f2", "f3", "f4", "f5", "f6", "f7", "f8"}; - std::string bias = "bias"; - std::string xc[4] = {"X - 1", "X", "X + 1", "X + 2"}; - std::string yc[4] = {"Y - 1", "Y", "Y + 1", "Y + 2"}; - if (!weights_are_buffer) - { - c += " FLT4 f0 = args.weights.Read(0, S);\n"; - c += " FLT4 f1 = args.weights.Read(1, S);\n"; - c += " FLT4 f2 = 
args.weights.Read(2, S);\n"; - c += " FLT4 f3 = args.weights.Read(3, S);\n"; - c += " FLT4 f4 = args.weights.Read(4, S);\n"; - c += " FLT4 f5 = args.weights.Read(5, S);\n"; - c += " FLT4 f6 = args.weights.Read(6, S);\n"; - c += " FLT4 f7 = args.weights.Read(7, S);\n"; - c += " FLT4 f8 = args.weights.Read(8, S);\n"; - } - if (manual_clamp) - { - c += " int x0 = X - 1;\n"; - c += " int x1 = X;\n"; - c += " int x2 = X + 1;\n"; - c += " int x3 = X + 2;\n"; - c += " int y0 = Y - 1;\n"; - c += " int y1 = Y;\n"; - c += " int y2 = Y + 1;\n"; - c += " int y3 = Y + 2;\n"; - c += " bool x0_in = x0 >= 0 && x0 < args.dst_tensor.Width();\n"; - c += " bool x1_in = x1 >= 0 && x1 < args.dst_tensor.Width();\n"; - c += " bool x2_in = x2 >= 0 && x2 < args.dst_tensor.Width();\n"; - c += " bool x3_in = x3 >= 0 && x3 < args.dst_tensor.Width();\n"; - c += " bool y0_in = y0 >= 0 && y0 < args.dst_tensor.Height();\n"; - c += " bool y1_in = y1 >= 0 && y1 < args.dst_tensor.Height();\n"; - c += " bool y2_in = y2 >= 0 && y2 < args.dst_tensor.Height();\n"; - c += " bool y3_in = y3 >= 0 && y3 < args.dst_tensor.Height();\n"; - c += " x0 = clamp(x0, 0, args.dst_tensor.Width() - 1);\n"; - c += " x1 = clamp(x1, 0, args.dst_tensor.Width() - 1);\n"; - c += " x2 = clamp(x2, 0, args.dst_tensor.Width() - 1);\n"; - c += " x3 = clamp(x3, 0, args.dst_tensor.Width() - 1);\n"; - c += " y0 = clamp(y0, 0, args.dst_tensor.Height() - 1);\n"; - c += " y1 = clamp(y1, 0, args.dst_tensor.Height() - 1);\n"; - c += " y2 = clamp(y2, 0, args.dst_tensor.Height() - 1);\n"; - c += " y3 = clamp(y3, 0, args.dst_tensor.Height() - 1);\n"; - if (src_tensor_type == TensorStorageType::BUFFER) - { - c += " __global FLT4* src_loc = " - "args.src_tensor.GetPtrWithSliceOffset(S);\n"; - } - xc[0] = "x0"; - xc[1] = "x1"; - xc[2] = "x2"; - xc[3] = "x3"; - yc[0] = "y0"; - yc[1] = "y1"; - yc[2] = "y2"; - yc[3] = "y3"; - } - if (local_mem_uploads || weights_are_buffer) - { - W[0] = "f[0]"; - W[1] = "f[1]"; - W[2] = "f[2]"; - W[3] = "f[3]"; - W[4] = "f[4]"; - W[5] = "f[5]"; - W[6] = "f[6]"; - W[7] = "f[7]"; - W[8] = "f[8]"; - bias = "f[9]"; - } - auto read_4x_line = [&](int y) { - if (src_tensor_type == TensorStorageType::BUFFER) - { - const std::string y_in = "y" + std::to_string(y) + "_in"; - c += " s0 = src_loc[args.src_tensor.GetWHOffset(" + xc[0] + ", " + yc[y] + - ")] * (FLT)(x0_in && " + y_in + ");\n"; - c += " s1 = src_loc[args.src_tensor.GetWHOffset(" + xc[1] + ", " + yc[y] + - ")] * (FLT)(x1_in && " + y_in + ");\n"; - c += " s2 = src_loc[args.src_tensor.GetWHOffset(" + xc[2] + ", " + yc[y] + - ")] * (FLT)(x2_in && " + y_in + ");\n"; - c += " s3 = src_loc[args.src_tensor.GetWHOffset(" + xc[3] + ", " + yc[y] + - ")] * (FLT)(x3_in && " + y_in + ");\n"; - } - else if (src_tensor_type == TensorStorageType::IMAGE_BUFFER) - { - const std::string y_in = "y" + std::to_string(y) + "_in"; - c += " s0 = args.src_tensor.Read(" + xc[0] + ", " + yc[y] + ", S) * (FLT)(x0_in && " + - y_in + ");\n"; - c += " s1 = args.src_tensor.Read(" + xc[1] + ", " + yc[y] + ", S) * (FLT)(x1_in && " + - y_in + ");\n"; - c += " s2 = args.src_tensor.Read(" + xc[2] + ", " + yc[y] + ", S) * (FLT)(x2_in && " + - y_in + ");\n"; - c += " s3 = args.src_tensor.Read(" + xc[3] + ", " + yc[y] + ", S) * (FLT)(x3_in && " + - y_in + ");\n"; - } - else - { - c += " s0 = args.src_tensor.Read(" + xc[0] + ", " + yc[y] + ", S);\n"; - c += " s1 = args.src_tensor.Read(" + xc[1] + ", " + yc[y] + ", S);\n"; - c += " s2 = args.src_tensor.Read(" + xc[2] + ", " + yc[y] + ", S);\n"; - c += " s3 = 
args.src_tensor.Read(" + xc[3] + ", " + yc[y] + ", S);\n"; - } - }; - c += " {\n"; - read_4x_line(0); - c += " r0 += TO_ACCUM_TYPE(" + W[0] + " * s0);\n"; - c += " r0 += TO_ACCUM_TYPE(" + W[1] + " * s1);\n"; - c += " r1 += TO_ACCUM_TYPE(" + W[0] + " * s1);\n"; - c += " r0 += TO_ACCUM_TYPE(" + W[2] + " * s2);\n"; - c += " r1 += TO_ACCUM_TYPE(" + W[1] + " * s2);\n"; - c += " r1 += TO_ACCUM_TYPE(" + W[2] + " * s3);\n"; - c += " }\n"; - c += " {\n"; - read_4x_line(1); - c += " r0 += TO_ACCUM_TYPE(" + W[3] + " * s0);\n"; - c += " r2 += TO_ACCUM_TYPE(" + W[0] + " * s0);\n"; - c += " r0 += TO_ACCUM_TYPE(" + W[4] + " * s1);\n"; - c += " r1 += TO_ACCUM_TYPE(" + W[3] + " * s1);\n"; - c += " r2 += TO_ACCUM_TYPE(" + W[1] + " * s1);\n"; - c += " r3 += TO_ACCUM_TYPE(" + W[0] + " * s1);\n"; - c += " r0 += TO_ACCUM_TYPE(" + W[5] + " * s2);\n"; - c += " r1 += TO_ACCUM_TYPE(" + W[4] + " * s2);\n"; - c += " r2 += TO_ACCUM_TYPE(" + W[2] + " * s2);\n"; - c += " r3 += TO_ACCUM_TYPE(" + W[1] + " * s2);\n"; - c += " r1 += TO_ACCUM_TYPE(" + W[5] + " * s3);\n"; - c += " r3 += TO_ACCUM_TYPE(" + W[2] + " * s3);\n"; - c += " }\n"; - c += " {\n"; - read_4x_line(2); - c += " r0 += TO_ACCUM_TYPE(" + W[6] + " * s0);\n"; - c += " r2 += TO_ACCUM_TYPE(" + W[3] + " * s0);\n"; - c += " r0 += TO_ACCUM_TYPE(" + W[7] + " * s1);\n"; - c += " r1 += TO_ACCUM_TYPE(" + W[6] + " * s1);\n"; - c += " r2 += TO_ACCUM_TYPE(" + W[4] + " * s1);\n"; - c += " r3 += TO_ACCUM_TYPE(" + W[3] + " * s1);\n"; - c += " r0 += TO_ACCUM_TYPE(" + W[8] + " * s2);\n"; - c += " r1 += TO_ACCUM_TYPE(" + W[7] + " * s2);\n"; - c += " r2 += TO_ACCUM_TYPE(" + W[5] + " * s2);\n"; - c += " r3 += TO_ACCUM_TYPE(" + W[4] + " * s2);\n"; - c += " r1 += TO_ACCUM_TYPE(" + W[8] + " * s3);\n"; - c += " r3 += TO_ACCUM_TYPE(" + W[5] + " * s3);\n"; - c += " }\n"; - c += " {\n"; - read_4x_line(3); - c += " r2 += TO_ACCUM_TYPE(" + W[6] + " * s0);\n"; - c += " r2 += TO_ACCUM_TYPE(" + W[7] + " * s1);\n"; - c += " r3 += TO_ACCUM_TYPE(" + W[6] + " * s1);\n"; - c += " r2 += TO_ACCUM_TYPE(" + W[8] + " * s2);\n"; - c += " r3 += TO_ACCUM_TYPE(" + W[7] + " * s2);\n"; - c += " r3 += TO_ACCUM_TYPE(" + W[8] + " * s3);\n"; - c += " }\n"; - if (!weights_are_buffer) - { - c += " FLT4 bias = args.weights.Read(9, S);\n"; - } - c += " r0 += TO_ACCUM_TYPE(" + bias + ");\n"; - c += " r1 += TO_ACCUM_TYPE(" + bias + ");\n"; - c += " r2 += TO_ACCUM_TYPE(" + bias + ");\n"; - c += " r3 += TO_ACCUM_TYPE(" + bias + ");\n"; - if (local_mem_uploads) - { - c += " if (X >= args.dst_tensor.Width() || Y >= args.dst_tensor.Height() " - "|| S >= args.dst_tensor.Slices()) { \n"; - c += " return; \n"; - c += " } \n"; - } - c += " if(X + 0 < args.dst_tensor.Width() && Y + 0 < " - "args.dst_tensor.Height()) {\n"; - c += " FLT4 result = TO_FLT4(r0);\n"; - c += " args.dst_tensor.Write(result, X + 0, Y + 0, S)\n"; - c += " }\n"; - c += " if(X + 1 < args.dst_tensor.Width() && Y + 0 < " - "args.dst_tensor.Height()) {\n"; - c += " FLT4 result = TO_FLT4(r1);\n"; - c += " args.dst_tensor.Write(result, X + 1, Y + 0, S)\n"; - c += " }\n"; - c += " if(X + 0 < args.dst_tensor.Width() && Y + 1 < " - "args.dst_tensor.Height()) {\n"; - c += " FLT4 result = TO_FLT4(r2);\n"; - c += " args.dst_tensor.Write(result, X + 0, Y + 1, S)\n"; - c += " }\n"; - c += " if(X + 1 < args.dst_tensor.Width() && Y + 1 < " - "args.dst_tensor.Height()) {\n"; - c += " FLT4 result = TO_FLT4(r3);\n"; - c += " args.dst_tensor.Write(result, X + 1, Y + 1, S)\n"; - c += " }\n"; - c += "}\n"; - - return c; -} - -int3 DepthwiseConv3x3::GetGridSize() const -{ - 
const int grid_x = DivideRoundUp(dst_[0]->Width(), 2) * dst_[0]->Batch(); - const int grid_y = DivideRoundUp(dst_[0]->Height(), 2); - const int grid_z = dst_[0]->Slices(); - return int3(grid_x, grid_y, grid_z); -} - -void DepthwiseConv3x3::GetPossibleKernelWorkGroups(TuningType tuning_type, - const DeviceInfo &device_info, - const KernelInfo &kernel_info, - std::vector<int3> *work_groups) const -{ - if (local_mem_uploads_) - { - work_groups->push_back(work_group_size_); - } - else - { - GetPossibleWorkGroups(tuning_type, device_info, kernel_info, grid_size_, work_groups); - } -} - -bool IsDepthwiseConv3x3Supported(const DepthwiseConvolution2DAttributes &attr) -{ - return attr.weights.shape.o == 1 && attr.dilations.w == 1 && attr.dilations.h == 1 && - attr.weights.shape.w == 3 && attr.weights.shape.h == 3 && attr.strides.w == 1 && - attr.strides.h == 1 && attr.padding.prepended.w == 1 && attr.padding.prepended.h == 1 && - attr.padding.appended.w == 1 && attr.padding.appended.h == 1; -} - -DepthwiseConv3x3 CreateDepthwiseConv3x3(const DeviceInfo &device_info, - const OperationDef &definition, - const DepthwiseConvolution2DAttributes &attr) -{ - bool weights_are_buffer = device_info.IsPowerVR() || device_info.IsMali(); - bool local_mem_uploads = weights_are_buffer && device_info.IsPowerVR(); - DepthwiseConv3x3 result(definition, weights_are_buffer, local_mem_uploads, device_info); - result.UploadWeightsAndBiases(attr.weights, attr.bias, weights_are_buffer); - return result; -} - -} // namespace gpu_cl -} // namespace backend -} // namespace onert diff --git a/runtime/onert/backend/gpu_cl/open_cl/kernels/DepthwiseConv3x3.h b/runtime/onert/backend/gpu_cl/open_cl/kernels/DepthwiseConv3x3.h deleted file mode 100644 index 8c571105a..000000000 --- a/runtime/onert/backend/gpu_cl/open_cl/kernels/DepthwiseConv3x3.h +++ /dev/null @@ -1,177 +0,0 @@ -/* - * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright 2019 The TensorFlow Authors. All Rights Reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifndef __ONERT_BACKEND_GPU_CL_OPENCL_KERNELS_DEPTHWISE_CONV_3X3_H__ -#define __ONERT_BACKEND_GPU_CL_OPENCL_KERNELS_DEPTHWISE_CONV_3X3_H__ - -#include <memory> -#include <vector> - -#include "open_cl/Buffer.h" -#include "open_cl/kernels/GpuOperation.h" -#include "open_cl/Tensor.h" -#include "open_cl/Texture2d.h" -#include "open_cl/Util.h" -#include "open_cl/DataType.h" -#include "open_cl/Operations.h" -#include "open_cl/Shape.h" -#include "open_cl/Status.h" -#include "open_cl/Tensor.h" -#include "open_cl/Types.h" - -namespace onert -{ -namespace backend -{ -namespace gpu_cl -{ - -class DepthwiseConv3x3 : public GPUOperation -{ -public: - DepthwiseConv3x3() = default; - void GetPossibleKernelWorkGroups(TuningType tuning_type, const DeviceInfo &device_info, - const KernelInfo &kernel_info, - std::vector<int3> *work_groups) const override; - int3 GetGridSize() const override; - - // Move only - DepthwiseConv3x3(DepthwiseConv3x3 &&operation); - DepthwiseConv3x3 &operator=(DepthwiseConv3x3 &&operation); - DepthwiseConv3x3(const DepthwiseConv3x3 &) = delete; - DepthwiseConv3x3 &operator=(const DepthwiseConv3x3 &) = delete; - -private: - explicit DepthwiseConv3x3(const OperationDef &definition, bool weights_are_buffer, - bool local_mem_uploads, const DeviceInfo &device_info); - template <DataType T> - void UploadWeightsAndBiases(const InternalTensor<OHWI, T> &weights, - const InternalTensor<Linear, T> &biases, bool weights_are_buffer); - - friend DepthwiseConv3x3 CreateDepthwiseConv3x3(const DeviceInfo &device_info, - const OperationDef &definition, - const DepthwiseConvolution2DAttributes &attr); - - template <DataType S, typename T> - void RearrangeWeightsAndBiasesData(const InternalTensor<OHWI, S> &weights, - const InternalTensor<Linear, S> &biases, absl::Span<T> dst); - - std::string GenerateDepthwiseConvCode(const OperationDef &op_def, bool weights_are_buffer, - bool local_mem_uploads); - - bool local_mem_uploads_; -}; - -template <DataType T> -void DepthwiseConv3x3::UploadWeightsAndBiases(const InternalTensor<OHWI, T> &weights, - const InternalTensor<Linear, T> &biases, - bool weights_are_buffer) -{ - const int src_depth = DivideRoundUp(weights.shape.i, 4); - int texture_width = 10; // 3x3 kernel + 1 bias - int texture_height = src_depth; - const int elements_count = texture_width * texture_height; - const bool fp32_weights = definition_.precision == CalculationsPrecision::F32; - const int float4_size = fp32_weights ? 16 : 8; - - std::vector<uint8_t> data(float4_size * elements_count); - if (fp32_weights) - { - float4 *ptr = reinterpret_cast<float4 *>(data.data()); - RearrangeWeightsAndBiasesData(weights, biases, absl::MakeSpan(ptr, elements_count)); - } - // TODO - // It doesn't support F16 yet. I will try to add it later. - // - // else { - // half4* ptr = reinterpret_cast<half4*>(data.data()); - // RearrangeWeightsAndBiasesData(weights, biases, - // absl::MakeSpan(ptr, elements_count)); - // } - - if (weights_are_buffer) - { - BufferDescriptor desc; - desc.element_type = fp32_weights ? DataType::FLOAT32 : DataType::FLOAT16; - desc.element_size = 4; - desc.size = float4_size * elements_count; - desc.data = std::move(data); - args_.AddObject("weights", absl::make_unique<BufferDescriptor>(std::move(desc))); - } - else - { - Texture2DDescriptor desc; - desc.element_type = fp32_weights ? 
DataType::FLOAT32 : DataType::FLOAT16; - desc.size = int2(texture_width, texture_height); - desc.data = std::move(data); - args_.AddObject("weights", absl::make_unique<Texture2DDescriptor>(std::move(desc))); - } -} - -template <DataType S, typename T> -void DepthwiseConv3x3::RearrangeWeightsAndBiasesData(const InternalTensor<OHWI, S> &weights, - const InternalTensor<Linear, S> &biases, - absl::Span<T> dst) -{ - const int src_depth = DivideRoundUp(weights.shape.i, 4); - - int counter = 0; - for (int s = 0; s < src_depth; ++s) - { - for (int y = 0; y < 3; ++y) - { - for (int x = 0; x < 3; ++x) - { - T filter_val; - for (int i = 0; i < 4; ++i) - { - const int s_ch = s * 4 + i; - if (s_ch < weights.shape.i) - { - const int f_index = weights.shape.LinearIndex({0, y, x, s_ch}); - filter_val[i] = weights.data[f_index]; - } - else - { - filter_val[i] = 0.0f; - } - } - dst[counter++] = filter_val; - } - } - - T bias_val; - for (int i = 0; i < 4; ++i) - { - const int dst_ch = s * 4 + i; - bias_val[i] = dst_ch >= biases.shape.v ? 0.0f : biases.data[dst_ch]; - } - dst[counter++] = bias_val; - } -} - -bool IsDepthwiseConv3x3Supported(const DepthwiseConvolution2DAttributes &attr); - -DepthwiseConv3x3 CreateDepthwiseConv3x3(const DeviceInfo &device_info, - const OperationDef &definition, - const DepthwiseConvolution2DAttributes &attr); - -} // namespace gpu_cl -} // namespace backend -} // namespace onert - -#endif // __ONERT_BACKEND_GPU_CL_OPENCL_KERNELS_DEPTHWISE_CONV_3X3_H__ diff --git a/runtime/onert/backend/gpu_cl/open_cl/kernels/GpuOperation.cc b/runtime/onert/backend/gpu_cl/open_cl/kernels/GpuOperation.cc deleted file mode 100644 index 8839d9687..000000000 --- a/runtime/onert/backend/gpu_cl/open_cl/kernels/GpuOperation.cc +++ /dev/null @@ -1,385 +0,0 @@ -/* - * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright 2019 The TensorFlow Authors. All Rights Reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "GpuOperation.h" - -#include "Util.h" -#include "WorkGroupPicking.h" -#include "open_cl/AccessType.h" - -namespace onert -{ -namespace backend -{ -namespace gpu_cl -{ -namespace -{ - -std::string GetElementWiseCode(const OperationDef &op_def, bool check_src_slices) -{ - std::string c = GetCommonDefines(op_def.precision); - - c += "__kernel void main_function(\n"; - c += "$0) {\n"; - c += " int X = get_global_id(0);\n"; - c += " int Y = get_global_id(1);\n"; - c += " int Z = get_global_id(2);\n"; - c += " if (X >= args.dst_tensor.Width() || Y >= args.dst_tensor.Height() || " - "Z >= args.dst_tensor.Slices()) return; \n"; - if (check_src_slices) - { - c += " FLT4 src = (FLT4)(0.0f);\n"; - c += " if (Z < args.src_tensor.Slices()) {\n"; - c += " src = args.src_tensor.Read(X, Y, Z);\n"; - c += " }\n"; - } - else - { - c += " FLT4 src = args.src_tensor.Read(X, Y, Z);\n"; - } - c += " args.dst_tensor.Write(src, X, Y, Z);\n"; - c += "} \n"; - return c; -} - -int3 GetWorkGroupsCount(int grid_dimension, const int3 &grid_size, const int3 &work_group_size, - const int3 &work_group_launch_order) -{ - int3 work_groups_count; - if (grid_dimension == 1) - { - work_groups_count.x = DivideRoundUp(grid_size.x, work_group_size.x); - work_groups_count.y = 1; - work_groups_count.z = 1; - } - else if (grid_dimension == 2) - { - int3 wgs; - wgs.x = DivideRoundUp(grid_size.x, work_group_size.x); - wgs.y = DivideRoundUp(grid_size.y, work_group_size.y); - work_groups_count.x = wgs[work_group_launch_order[0]]; - work_groups_count.y = wgs[work_group_launch_order[1]]; - work_groups_count.z = 1; - } - else - { // grid_dimension == 3 - int3 wgs; - wgs.x = DivideRoundUp(grid_size.x, work_group_size.x); - wgs.y = DivideRoundUp(grid_size.y, work_group_size.y); - wgs.z = DivideRoundUp(grid_size.z, work_group_size.z); - work_groups_count.x = wgs[work_group_launch_order[0]]; - work_groups_count.y = wgs[work_group_launch_order[1]]; - work_groups_count.z = wgs[work_group_launch_order[2]]; - } - return work_groups_count; -} - -} // namespace - -DataType OperationDef::GetDataType() const { return DeduceDataTypeFromPrecision(precision); } - -DataType OperationDef::GetPrimaryDataType() const { return src_tensors[0].data_type; } -TensorStorageType OperationDef::GetPrimaryStorageType() const -{ - return src_tensors[0].storage_type; -} - -bool OperationDef::IsBatchSupported() const -{ - for (const auto &src : src_tensors) - { - if (HasAxis(src.layout, Axis::BATCH)) - { - return true; - } - } - for (const auto &dst : dst_tensors) - { - if (HasAxis(dst.layout, Axis::BATCH)) - { - return true; - } - } - return false; -} - -GPUOperation::GPUOperation(const OperationDef &definition) : definition_(definition) {} - -void GPUOperation::SetSrc(Tensor *ptr, int index) -{ - if (index >= (int)src_.size()) - { - src_.resize(index + 1, nullptr); - } - src_[index] = ptr; -} - -void GPUOperation::SetDst(Tensor *ptr, int index) -{ - if (index >= (int)dst_.size()) - { - dst_.resize(index + 1, nullptr); - } - dst_[index] = ptr; -} - -GPUOperation::GPUOperation(GPUOperation &&operation) - : args_(std::move(operation.args_)), code_(std::move(operation.code_)), - work_group_size_(operation.work_group_size_), - compiler_options_(std::move(operation.compiler_options_)), - tensor_to_grid_(operation.tensor_to_grid_), elementwise_(operation.elementwise_), - linkable_(operation.linkable_), check_src_channels_size_(operation.check_src_channels_size_), - definition_(std::move(operation.definition_)), src_(std::move(operation.src_)), - 
dst_(std::move(operation.dst_)), kernel_(std::move(operation.kernel_)), - grid_dimension_(operation.grid_dimension_), - work_group_launch_order_(operation.work_group_launch_order_), grid_size_(operation.grid_size_), - src_tensors_names_(std::move(operation.src_tensors_names_)), - dst_tensors_names_(std::move(operation.dst_tensors_names_)), - work_groups_count_(operation.work_groups_count_), linkable_count_(operation.linkable_count_), - elementwise_code_(std::move(operation.elementwise_code_)) -{ -} - -GPUOperation &GPUOperation::operator=(GPUOperation &&operation) -{ - if (this != &operation) - { - args_ = std::move(operation.args_); - code_ = std::move(operation.code_); - std::swap(work_group_size_, operation.work_group_size_); - compiler_options_ = std::move(operation.compiler_options_); - tensor_to_grid_ = operation.tensor_to_grid_; - elementwise_ = operation.elementwise_; - linkable_ = operation.linkable_; - check_src_channels_size_ = operation.check_src_channels_size_; - definition_ = std::move(operation.definition_); - src_ = std::move(operation.src_); - dst_ = std::move(operation.dst_); - kernel_ = std::move(operation.kernel_); - std::swap(grid_dimension_, operation.grid_dimension_); - std::swap(work_group_launch_order_, operation.work_group_launch_order_); - std::swap(grid_size_, operation.grid_size_); - src_tensors_names_ = std::move(operation.src_tensors_names_); - dst_tensors_names_ = std::move(operation.dst_tensors_names_); - std::swap(work_groups_count_, operation.work_groups_count_); - std::swap(linkable_count_, operation.linkable_count_); - elementwise_code_ = std::move(operation.elementwise_code_); - } - return *this; -} - -absl::Status GPUOperation::AddOperation(GPUOperation *operation) -{ - linkable_count_ += 1; - std::string code = operation->code_; - std::string unique_postfix = absl::StrCat("_link", linkable_count_); - operation->args_.RenameArgs(unique_postfix, &code); - elementwise_code_ += "{\n" + code + "\n}\n"; - RETURN_IF_ERROR(args_.Merge(std::move(operation->args_), unique_postfix)); - for (size_t i = 0; i < operation->src_tensors_names_.size(); ++i) - { - definition_.src_tensors.push_back(operation->definition_.src_tensors[i + 1]); - src_tensors_names_.push_back(operation->src_tensors_names_[i] + unique_postfix); - } - for (size_t i = 0; i < operation->dst_tensors_names_.size(); ++i) - { - dst_tensors_names_.push_back(operation->dst_tensors_names_[i] + unique_postfix); - } - return absl::OkStatus(); -} - -void GPUOperation::AddSrcTensor(const std::string &tensor_name, const TensorDescriptor &desc) -{ - src_tensors_names_.push_back(tensor_name); - auto desc_new = std::make_unique<TensorDescriptor>(desc); - args_.AddObjectRef(tensor_name, AccessType::READ, std::move(desc_new)); -} - -void GPUOperation::AddSrcBuffer(const std::string &buffer_name, const BufferDescriptor &desc) -{ - src_tensors_names_.push_back(buffer_name); - auto desc_new = std::make_unique<BufferDescriptor>(desc); - args_.AddObjectRef(buffer_name, AccessType::READ, std::move(desc_new)); -} - -void GPUOperation::AddDstTensor(const std::string &tensor_name, const TensorDescriptor &desc) -{ - dst_tensors_names_.push_back(tensor_name); - auto desc_new = std::make_unique<TensorDescriptor>(desc); - args_.AddObjectRef(tensor_name, AccessType::WRITE, std::move(desc_new)); -} - -absl::Status GPUOperation::UpdateParams() -{ - for (size_t i = 0; i < src_tensors_names_.size(); ++i) - { - RETURN_IF_ERROR(args_.SetObjectRef(src_tensors_names_[i], src_[i])); - } - for (size_t i = 0; i < 
dst_tensors_names_.size(); ++i)
-  {
-    RETURN_IF_ERROR(args_.SetObjectRef(dst_tensors_names_[i], dst_[i]));
-  }
-  RETURN_IF_ERROR(BindArguments(&args_));
-  grid_size_ = GetGridSize();
-  work_groups_count_ =
-    GetWorkGroupsCount(grid_dimension_, grid_size_, work_group_size_, work_group_launch_order_);
-  return absl::OkStatus();
-}
-
-absl::Status GPUOperation::AssembleCode(const DeviceInfo &device_info, CLContext *context)
-{
-  if (elementwise_)
-  {
-    auto src_desc = absl::make_unique<TensorDescriptor>(definition_.src_tensors[0]);
-    if (definition_.IsBatchSupported())
-    {
-      src_desc->SetStateVar("BatchedWidth", "true");
-    }
-    src_tensors_names_.insert(src_tensors_names_.begin(), "src_tensor");
-    args_.AddObjectRef("src_tensor", AccessType::READ, std::move(src_desc));
-
-    auto dst_desc = absl::make_unique<TensorDescriptor>(definition_.dst_tensors[0]);
-    if (definition_.IsBatchSupported())
-    {
-      dst_desc->SetStateVar("BatchedWidth", "true");
-    }
-    dst_tensors_names_.insert(dst_tensors_names_.begin(), "dst_tensor");
-    args_.AddObjectRef("dst_tensor", AccessType::WRITE, std::move(dst_desc));
-
-    elementwise_code_ = "{\n" + code_ + "\n}\n" + elementwise_code_;
-    code_ = GetElementWiseCode(definition_, check_src_channels_size_);
-    RETURN_IF_ERROR(args_.AllocateObjects(context));
-    RETURN_IF_ERROR(
-      args_.TransformToCLCode(device_info, {{dst_tensors_names_[0], elementwise_code_}}, &code_));
-  }
-  else
-  {
-    RETURN_IF_ERROR(args_.AllocateObjects(context));
-    RETURN_IF_ERROR(
-      args_.TransformToCLCode(device_info, {{dst_tensors_names_[0], elementwise_code_}}, &code_));
-  }
-  return absl::OkStatus();
-}
-
-absl::Status GPUOperation::Compile(const CreationContext &creation_context)
-{
-  RETURN_IF_ERROR(AssembleCode(creation_context.GetDeviceInfo(), creation_context.context));
-  RETURN_IF_ERROR(creation_context.cache->GetOrCreateCLKernel(
-    code_, "main_function", compiler_options_, *creation_context.context, *creation_context.device,
-    &kernel_));
-  return PostCompileCheck(creation_context.device->info_, kernel_.info_);
-}
-
-absl::Status GPUOperation::CompileDeserialized(const CreationContext &creation_context)
-{
-  return creation_context.cache->GetOrCreateCLKernel(code_, "main_function", compiler_options_,
-                                                     *creation_context.context,
-                                                     *creation_context.device, &kernel_);
-}
-
-void GPUOperation::GetPossibleKernelWorkGroups(TuningType tuning_type,
-                                               const DeviceInfo &device_info,
-                                               const KernelInfo &kernel_info,
-                                               std::vector<int3> *work_groups) const
-{
-  GetPossibleWorkGroups(tuning_type, device_info, kernel_info, grid_size_, work_groups);
-}
-
-absl::Status GPUOperation::Tune(const TuningParameters &params)
-{
-  std::vector<int3> possible_work_groups;
-  GetPossibleKernelWorkGroups(params.tuning_type, *params.info, kernel_.info_,
-                              &possible_work_groups);
-  if (possible_work_groups.empty())
-  {
-    return absl::NotFoundError("Cannot find a work_group size to launch the kernel");
-  }
-  if (possible_work_groups.size() == 1)
-  {
-    work_group_size_ = possible_work_groups[0];
-    work_groups_count_ =
-      GetWorkGroupsCount(grid_dimension_, grid_size_, work_group_size_, work_group_launch_order_);
-    return absl::OkStatus();
-  }
-  else
-  {
-    std::vector<int3> work_groups_count(possible_work_groups.size());
-    for (size_t i = 0; i < work_groups_count.size(); ++i)
-    {
-      work_groups_count[i] = GetWorkGroupsCount(grid_dimension_, grid_size_,
-                                                possible_work_groups[i], work_group_launch_order_);
-    }
-    RETURN_IF_ERROR(args_.Bind(kernel_.kernel()));
-    int best_work_group_index;
-
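    // args_ must already be bound to the kernel (done just above) before the
    // trial dispatches; GetBestWorkGroupIndex() presumably times the kernel
    // once per candidate work-group size and returns the index of the fastest.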
RETURN_IF_ERROR(params.queue->GetBestWorkGroupIndex( - kernel_, *params.info, work_groups_count, possible_work_groups, &best_work_group_index)); - work_group_size_ = possible_work_groups[best_work_group_index]; - work_groups_count_ = - GetWorkGroupsCount(grid_dimension_, grid_size_, work_group_size_, work_group_launch_order_); - return absl::OkStatus(); - } -} - -int3 GPUOperation::GetGridSize() const -{ - if (elementwise_ || tensor_to_grid_ == TensorToGrid::kWBToX_HDToY_SToZ) - { - const int grid_x = dst_[0]->Width() * dst_[0]->Batch(); - const int grid_y = dst_[0]->Height() * dst_[0]->Depth(); - const int grid_z = dst_[0]->Slices(); - return int3(grid_x, grid_y, grid_z); - } - if (tensor_to_grid_ == TensorToGrid::kWBToX_HDToY_ZIs1) - { - const int grid_x = dst_[0]->Width() * dst_[0]->Batch(); - const int grid_y = dst_[0]->Height() * dst_[0]->Depth(); - const int grid_z = 1; - return int3(grid_x, grid_y, grid_z); - } - if (tensor_to_grid_ == TensorToGrid::kWBToX_HToY_DToZ) - { - const int grid_x = dst_[0]->Width() * dst_[0]->Batch(); - const int grid_y = dst_[0]->Height(); - const int grid_z = dst_[0]->Depth(); - return int3(grid_x, grid_y, grid_z); - } - if (tensor_to_grid_ == TensorToGrid::kBToX_YIs1_ZIs1) - { - const int grid_x = dst_[0]->Batch(); - const int grid_y = 1; - const int grid_z = 1; - return int3(grid_x, grid_y, grid_z); - } - return grid_size_; -} - -void GPUOperation::AddUniquePostfix(const std::string &unique_postfix) -{ - for (uint32_t i = 0; i < src_tensors_names_.size(); ++i) - { - src_tensors_names_[i] += unique_postfix; - } - for (uint32_t i = 0; i < dst_tensors_names_.size(); ++i) - { - dst_tensors_names_[i] += unique_postfix; - } -} - -} // namespace gpu_cl -} // namespace backend -} // namespace onert diff --git a/runtime/onert/backend/gpu_cl/open_cl/kernels/GpuOperation.h b/runtime/onert/backend/gpu_cl/open_cl/kernels/GpuOperation.h deleted file mode 100644 index 4f531c629..000000000 --- a/runtime/onert/backend/gpu_cl/open_cl/kernels/GpuOperation.h +++ /dev/null @@ -1,203 +0,0 @@ -/* - * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright 2019 The TensorFlow Authors. All Rights Reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */
-
-#ifndef __ONERT_BACKEND_GPU_CL_OPENCL_KERNELS_GPU_OPERATION_H__
-#define __ONERT_BACKEND_GPU_CL_OPENCL_KERNELS_GPU_OPERATION_H__
-
-#include <string>
-#include <vector>
-
-#include "TuningParameters.h"
-
-#include "open_cl/Arguments.h"
-#include "open_cl/Buffer.h"
-#include "open_cl/ClCommandQueue.h"
-#include "open_cl/ClContext.h"
-#include "open_cl/ClDevice.h"
-#include "open_cl/ClKernel.h"
-#include "open_cl/ClProgram.h"
-#include "open_cl/DataType.h"
-#include "open_cl/DeviceInfo.h"
-#include "open_cl/Precision.h"
-#include "open_cl/ProgramCache.h"
-#include "open_cl/Tensor.h"
-#include "open_cl/TensorType.h"
-#include "open_cl/Types.h"
-#include "open_cl/Status.h"
-
-namespace onert
-{
-namespace backend
-{
-namespace gpu_cl
-{
-
-// kCustom: default value
-//   GPUOperation::GetGridSize must be overridden
-// kWBToX_HDToY_SToZ:
-//   grid_x = dst_[0]->Width() * dst_[0]->Batch();
-//   grid_y = dst_[0]->Height() * dst_[0]->Depth();
-//   grid_z = dst_[0]->Slices();
-// kWBToX_HDToY_ZIs1:
-//   grid_x = dst_[0]->Width() * dst_[0]->Batch();
-//   grid_y = dst_[0]->Height() * dst_[0]->Depth();
-//   grid_z = 1;
-// kWBToX_HToY_DToZ:
-//   grid_x = dst_[0]->Width() * dst_[0]->Batch();
-//   grid_y = dst_[0]->Height();
-//   grid_z = dst_[0]->Depth();
-// kBToX_YIs1_ZIs1:
-//   grid_x = dst_[0]->Batch();
-//   grid_y = 1;
-//   grid_z = 1;
-enum class TensorToGrid
-{
-  kCustom,
-  kWBToX_HDToY_SToZ,
-  kWBToX_HDToY_ZIs1,
-  kWBToX_HToY_DToZ,
-  kBToX_YIs1_ZIs1
-};
-
-struct CreationContext
-{
-  const CLDevice *device;
-  CLContext *context;
-  CLCommandQueue *queue;
-  ProgramCache *cache;
-
-  const DeviceInfo &GetDeviceInfo() const { return device->info_; }
-};
-
-struct OperationDef
-{
-  CalculationsPrecision precision;
-  std::vector<TensorDescriptor> src_tensors;
-  std::vector<TensorDescriptor> dst_tensors;
-
-  // Returns FLOAT32 for F32 precision and FLOAT16 for F16 precision.
-  DataType GetDataType() const;
-  // "Primary" means the first src tensor: the first tensor usually defines
-  // the structure of the kernel and the types of all other resources
-  // (biases and the like).
-  DataType GetPrimaryDataType() const;
-  TensorStorageType GetPrimaryStorageType() const;
-  bool IsBatchSupported() const;
-};
-
-// GPUOperation represents an implementation of a neural-network operation on
-// the GPU. A GPUOperation can absorb other GPU operations when elementwise_
-// is set; the combined GPUOperation then replaces the whole sequence
-// Op + op0 + op1 + ...
-// The intended usage is therefore: create a GPUOperation, create all the
-// GPUOperations that will (probably) be attached to it, attach them with
-// AddOperation(), and call Compile() on the combined operation only. Do not
-// call Compile() on an operation that has been attached; it is useless and
-// may be an error.
-class GPUOperation
-{
-public:
-  GPUOperation() = default;
-  explicit GPUOperation(const OperationDef &definition);
-  virtual ~GPUOperation() = default;
-  // Move only
-  GPUOperation(GPUOperation &&operation);
-  GPUOperation &operator=(GPUOperation &&operation);
-  GPUOperation(const GPUOperation &) = delete;
-  GPUOperation &operator=(const GPUOperation &) = delete;
-
-  absl::Status AddOperation(GPUOperation *operation);
-
-  void SetSrc(Tensor *ptr, int index = 0);
-  void SetDst(Tensor *ptr, int index = 0);
-
-  // Should be called after the inputs/outputs change.
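  // A typical call sequence, sketched from this header (the factory call and
  // tensor variables are illustrative only, not prescribed by this class):
  //   GPUOperation op = CreateDepthwiseConvolution2D(device_info, def, attr);
  //   RETURN_IF_ERROR(op.Compile(creation_context));
  //   op.SetSrc(&src_tensor);
  //   op.SetDst(&dst_tensor);
  //   RETURN_IF_ERROR(op.UpdateParams()); // rebind tensors, recompute grid
  //   RETURN_IF_ERROR(op.AddToQueue(queue));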
-  absl::Status UpdateParams();
-
-  absl::Status AddToQueue(CLCommandQueue *queue)
-  {
-    RETURN_IF_ERROR(args_.Bind(kernel_.kernel()));
-    return queue->Dispatch(kernel_, work_groups_count_, work_group_size_);
-  }
-
-  virtual void GetPossibleKernelWorkGroups(TuningType tuning_type, const DeviceInfo &device_info,
-                                           const KernelInfo &kernel_info,
-                                           std::vector<int3> *work_groups) const;
-
-  absl::Status Tune(const TuningParameters &params);
-
-  absl::Status AssembleCode(const DeviceInfo &device_info, CLContext *context);
-
-  absl::Status Compile(const CreationContext &creation_context);
-
-  absl::Status CompileDeserialized(const CreationContext &creation_context);
-
-  virtual absl::Status PostCompileCheck(const DeviceInfo &, const KernelInfo &)
-  {
-    return absl::OkStatus();
-  }
-
-  const OperationDef &GetDefinition() const { return definition_; }
-
-  void AddSrcTensor(const std::string &tensor_name, const TensorDescriptor &desc);
-  void AddSrcBuffer(const std::string &buffer_name, const BufferDescriptor &desc);
-  void AddDstTensor(const std::string &tensor_name, const TensorDescriptor &desc);
-
-  bool IsLinkable() const { return elementwise_ && linkable_; }
-
-  // for linking
-  void AddUniquePostfix(const std::string &unique_postfix);
-
-  Arguments args_;
-  std::string code_;
-  int3 work_group_size_ = int3(8, 4, 1);
-  std::vector<CompilerOptions> compiler_options_;
-  // not applicable to elementwise
-  TensorToGrid tensor_to_grid_ = TensorToGrid::kCustom;
-
-  bool elementwise_ = false;
-  // applicable only with elementwise_ = true;
-  bool linkable_ = true; // by default every elementwise is linkable
-  // applicable only with elementwise_ = true;
-  bool check_src_channels_size_ = false;
-
-protected:
-  virtual absl::Status BindArguments(ArgumentsBinder *) { return absl::OkStatus(); }
-  virtual int3 GetGridSize() const;
-
-  // Defines operation calculation precision and format of src/dst tensors.
-  OperationDef definition_;
-  std::vector<Tensor *> src_;
-  std::vector<Tensor *> dst_;
-  CLKernel kernel_;
-  int grid_dimension_ = 3; // can be 1, 2 or 3
-  int3 work_group_launch_order_ = int3(0, 1, 2);
-  int3 grid_size_ = int3(0, 0, 0);
-  std::vector<std::string> src_tensors_names_;
-  std::vector<std::string> dst_tensors_names_;
-
-private:
-  int3 work_groups_count_ = int3(0, 0, 0);
-  int linkable_count_ = 0;
-  std::string elementwise_code_; // temporary, used during op construction
-};
-
-} // namespace gpu_cl
-} // namespace backend
-} // namespace onert
-
-#endif // __ONERT_BACKEND_GPU_CL_OPENCL_KERNELS_GPU_OPERATION_H__
diff --git a/runtime/onert/backend/gpu_cl/open_cl/kernels/Pooling.cc b/runtime/onert/backend/gpu_cl/open_cl/kernels/Pooling.cc
deleted file mode 100644
index ceeab2f39..000000000
--- a/runtime/onert/backend/gpu_cl/open_cl/kernels/Pooling.cc
+++ /dev/null
@@ -1,400 +0,0 @@
-/*
- * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright 2019 The TensorFlow Authors. All Rights Reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */ - -#include "Pooling.h" - -#include <string> - -#include "Util.h" -#include "open_cl/Status.h" - -namespace onert -{ -namespace backend -{ -namespace gpu_cl -{ -namespace -{ - -std::string GetAveragePoolingKernelCode(const OperationDef &op_def, bool stride_correction, - GPUOperation *op) -{ - auto src_desc = op_def.src_tensors[0]; - - src_desc.SetTextureAddressMode(TextureAddressMode::ZERO); - - if (op_def.IsBatchSupported()) - { - src_desc.SetStateVar("BatchedWidth", "true"); - } - op->AddSrcTensor("src_tensor", src_desc); - auto dst_desc = op_def.dst_tensors[0]; - if (op_def.IsBatchSupported()) - { - dst_desc.SetStateVar("BatchedWidth", "true"); - } - op->AddDstTensor("dst_tensor", dst_desc); - - std::map<Axis, std::string> axis_to_src_coord = { - {Axis::WIDTH, "x_c"}, {Axis::HEIGHT, "y_c"}, {Axis::DEPTH, "d_c"}, - {Axis::CHANNELS, "Z"}, {Axis::BATCH, "B"}, - }; - - std::map<Axis, std::string> axis_to_dst_coord = { - {Axis::WIDTH, "X"}, {Axis::HEIGHT, "Y"}, {Axis::DEPTH, "D"}, - {Axis::CHANNELS, "Z"}, {Axis::BATCH, "B"}, - }; - - std::vector<std::string> src_coords; - std::vector<std::string> dst_coords; - for (auto axis : {Axis::WIDTH, Axis::HEIGHT, Axis::DEPTH, Axis::CHANNELS}) - { - if (op_def.dst_tensors[0].HasAxis(axis)) - { - dst_coords.push_back(axis_to_dst_coord[axis]); - } - if (op_def.src_tensors[0].HasAxis(axis)) - { - src_coords.push_back(axis_to_src_coord[axis]); - } - } - std::string src_coord = src_coords[0]; - for (size_t i = 1; i < src_coords.size(); ++i) - { - src_coord += ", " + src_coords[i]; - } - std::string dst_coord = dst_coords[0]; - for (size_t i = 1; i < dst_coords.size(); ++i) - { - dst_coord += ", " + dst_coords[i]; - } - - const bool manual_clamp = op_def.src_tensors[0].storage_type == TensorStorageType::BUFFER || - op_def.src_tensors[0].storage_type == TensorStorageType::IMAGE_BUFFER; - - std::string c = GetCommonDefines(op_def.precision); - c += "__kernel void main_function(\n"; - c += "$0) {\n"; - c += " int X = get_global_id(0);\n"; - if (op_def.dst_tensors[0].HasAxis(Axis::DEPTH)) - { - c += " int linear_id_1 = get_global_id(1);\n"; - c += " int Y = linear_id_1 / args.dst_tensor.Depth();\n"; - c += " int D = linear_id_1 % args.dst_tensor.Depth();\n"; - } - else - { - c += " int Y = get_global_id(1);\n"; - } - c += " int Z = get_global_id(2);\n"; - c += " if (X >= args.dst_tensor.Width() || Y >= args.dst_tensor.Height() || " - "Z >= args.dst_tensor.Slices()) { \n"; - c += " return; \n"; - c += " } \n"; - c += " float4 r = (float4)(0.0f);\n"; - c += " float window_size = 0.0;\n"; - if (stride_correction) - { - c += " int xs = " + - GetXStrideCorrectedV2("X", "args.src_tensor.Batch()", "args.stride_x", "args.padding_x") + - ";\n"; - } - else - { - if (op_def.IsBatchSupported()) - { - c += " int xs = X * args.stride_x + args.padding_x * " - "args.src_tensor.Batch();\n"; - } - else - { - c += " int xs = X * args.stride_x + args.padding_x;\n"; - } - } - c += " int ys = Y * args.stride_y + args.padding_y;\n"; - if (op_def.dst_tensors[0].HasAxis(Axis::DEPTH)) - { - c += " int ds = D * args.stride_z + args.padding_z;\n"; - c += " for (int kz = 0; kz < args.kernel_size_z; ++kz) {\n"; - c += " int d_c = ds + kz;\n"; - c += " if (d_c < 0 || d_c >= args.src_tensor.Depth()) continue;\n"; - } - c += " for (int ky = 0; ky < args.kernel_size_y; ++ky) {\n"; - c += " int y_c = ys + ky;\n"; - c += " bool outside_y = y_c < 0 || y_c >= args.src_tensor.Height();\n"; - c += " for (int kx = 0; kx < args.kernel_size_x; ++kx) {\n"; - if (op_def.IsBatchSupported()) - { - c += 
" int x_c = xs + kx * args.src_tensor.Batch();\n"; - } - else - { - c += " int x_c = xs + kx;\n"; - } - c += " bool outside = outside_y || x_c < 0 || x_c >= " - "args.src_tensor.Width();\n"; - if (manual_clamp) - { - c += " r += !outside ? args.src_tensor.Read<float>(" + src_coord + - ") : " - "(float4)(0.0f);\n"; - } - else - { - c += " r += args.src_tensor.Read<float>(" + src_coord + ");\n"; - } - c += " window_size += !outside ? 1.0 : 0.0;\n"; - c += " }\n"; - c += " }\n"; - if (op_def.dst_tensors[0].HasAxis(Axis::DEPTH)) - { - c += " } // Depth\n"; - } - // If window_size==0, window covered nothing. This situation is a sign of - // incorrectly constructed operation. NaNs are expected as output. - c += " FLT4 result = TO_FLT4(r / window_size);\n"; - c += " args.dst_tensor.Write(result, " + dst_coord + ");\n"; - c += "}\n"; - - return c; -} - -std::string GetMaxPoolingKernelCode(const OperationDef &op_def, bool stride_correction, - bool output_indices, GPUOperation *op) -{ - auto src_desc = op_def.src_tensors[0]; - if (op_def.IsBatchSupported()) - { - src_desc.SetStateVar("BatchedWidth", "true"); - } - op->AddSrcTensor("src_tensor", src_desc); - auto dst_desc = op_def.dst_tensors[0]; - if (op_def.IsBatchSupported()) - { - dst_desc.SetStateVar("BatchedWidth", "true"); - } - op->AddDstTensor("dst_tensor", dst_desc); - if (output_indices) - { - auto dst_ind_desc = op_def.dst_tensors[1]; - if (op_def.IsBatchSupported()) - { - dst_ind_desc.SetStateVar("BatchedWidth", "true"); - } - op->AddDstTensor("dst_indices", dst_ind_desc); - } - - std::map<Axis, std::string> axis_to_src_coord = { - {Axis::WIDTH, "x_c"}, {Axis::HEIGHT, "y_c"}, {Axis::DEPTH, "d_c"}, - {Axis::CHANNELS, "Z"}, {Axis::BATCH, "B"}, - }; - - std::map<Axis, std::string> axis_to_dst_coord = { - {Axis::WIDTH, "X"}, {Axis::HEIGHT, "Y"}, {Axis::DEPTH, "D"}, - {Axis::CHANNELS, "Z"}, {Axis::BATCH, "B"}, - }; - - std::vector<std::string> src_coords; - std::vector<std::string> dst_coords; - for (auto axis : {Axis::WIDTH, Axis::HEIGHT, Axis::DEPTH, Axis::CHANNELS}) - { - if (op_def.dst_tensors[0].HasAxis(axis)) - { - dst_coords.push_back(axis_to_dst_coord[axis]); - } - if (op_def.src_tensors[0].HasAxis(axis)) - { - src_coords.push_back(axis_to_src_coord[axis]); - } - } - std::string src_coord = src_coords[0]; - for (size_t i = 1; i < src_coords.size(); ++i) - { - src_coord += ", " + src_coords[i]; - } - std::string dst_coord = dst_coords[0]; - for (size_t i = 1; i < dst_coords.size(); ++i) - { - dst_coord += ", " + dst_coords[i]; - } - - std::string c = GetCommonDefines(op_def.precision); - c += "__kernel void main_function(\n"; - c += "$0) {\n"; - c += " int X = get_global_id(0);\n"; - if (op_def.dst_tensors[0].HasAxis(Axis::DEPTH)) - { - c += " int linear_id_1 = get_global_id(1);\n"; - c += " int Y = linear_id_1 / args.dst_tensor.Depth();\n"; - c += " int D = linear_id_1 % args.dst_tensor.Depth();\n"; - } - else - { - c += " int Y = get_global_id(1);\n"; - } - c += " int Z = get_global_id(2);\n"; - c += " if (X >= args.dst_tensor.Width() || Y >= args.dst_tensor.Height() || " - "Z >= args.dst_tensor.Slices()) { \n"; - c += " return; \n"; - c += " } \n"; - c += " FLT4 maximum = (FLT4)(-10000.0f);\n"; - if (output_indices) - { - c += " FLT4 indexes = (FLT4)(0.0f);\n"; - } - if (stride_correction) - { - c += " int xs = " + - GetXStrideCorrectedV2("X", "args.src_tensor.Batch()", "args.stride_x", "args.padding_x") + - ";\n"; - } - else - { - if (op_def.IsBatchSupported()) - { - c += " int xs = X * args.stride_x + args.padding_x * " - 
"args.src_tensor.Batch();\n"; - } - else - { - c += " int xs = X * args.stride_x + args.padding_x;\n"; - } - } - c += " int ys = Y * args.stride_y + args.padding_y;\n"; - c += " for (int ky = 0; ky < args.kernel_size_y; ++ky) {\n"; - c += " int y_c = ys + ky;\n"; - c += " if (y_c < 0 || y_c >= args.src_tensor.Height()) continue;\n"; - c += " for (int kx = 0; kx < args.kernel_size_x; ++kx) {\n"; - if (op_def.IsBatchSupported()) - { - c += " int x_c = xs + kx * args.src_tensor.Batch();\n"; - } - else - { - c += " int x_c = xs + kx;\n"; - } - c += " if (x_c < 0 || x_c >= args.src_tensor.Width()) continue;\n"; - if (op_def.dst_tensors[0].HasAxis(Axis::DEPTH)) - { - c += " int ds = D * args.stride_z + args.padding_z;\n"; - c += " for (int kz = 0; kz < args.kernel_size_z; ++kz) {\n"; - c += " int d_c = ds + kz;\n"; - c += " if (d_c < 0 || d_c >= args.src_tensor.Depth()) continue;\n"; - } - c += " FLT4 src = args.src_tensor.Read(" + src_coord + ");\n"; - if (output_indices) - { - if (op_def.dst_tensors[0].HasAxis(Axis::DEPTH)) - { - c += " FLT index_counter = (FLT)((ky * args.kernel_size_x + kx) * " - "args.kernel_size_z + kz) + (FLT)(0.1f);\n"; - } - else - { - c += " FLT index_counter = (FLT)(ky * args.kernel_size_x + kx) + " - "(FLT)(0.1f);\n"; - } - c += " if (src.x > maximum.x) {\n"; - c += " indexes.x = index_counter;\n"; - c += " maximum.x = src.x;\n"; - c += " }\n"; - c += " if (src.y > maximum.y) {\n"; - c += " indexes.y = index_counter;\n"; - c += " maximum.y = src.y;\n"; - c += " }\n"; - c += " if (src.z > maximum.z) {\n"; - c += " indexes.z = index_counter;\n"; - c += " maximum.z = src.z;\n"; - c += " }\n"; - c += " if (src.w > maximum.w) {\n"; - c += " indexes.w = index_counter;\n"; - c += " maximum.w = src.w;\n"; - c += " }\n"; - } - else - { - c += " maximum = max(src, maximum);\n"; - } - if (op_def.dst_tensors[0].HasAxis(Axis::DEPTH)) - { - c += " } // Depth\n"; - } - c += " }\n"; - c += " }\n"; - c += " args.dst_tensor.Write(maximum, " + dst_coord + ");\n"; - if (output_indices) - { - c += " args.dst_indices.Write(indexes, " + dst_coord + ");\n"; - } - c += "}\n"; - - return c; -} -} // namespace - -GPUOperation CreatePooling(const OperationDef &definition, const Pooling2DAttributes &attr) -{ - GPUOperation op(definition); - op.args_.AddInt("kernel_size_x", attr.kernel.w); - op.args_.AddInt("padding_x", -attr.padding.prepended.w); - op.args_.AddInt("stride_x", attr.strides.w); - op.args_.AddInt("kernel_size_y", attr.kernel.h); - op.args_.AddInt("padding_y", -attr.padding.prepended.h); - op.args_.AddInt("stride_y", attr.strides.h); - - const bool stride_correction = definition.IsBatchSupported() && attr.strides.w != 1; - if (attr.type == PoolingType::AVERAGE) - { - op.code_ = GetAveragePoolingKernelCode(definition, stride_correction, &op); - } - else if (attr.type == PoolingType::MAX) - { - op.code_ = GetMaxPoolingKernelCode(definition, stride_correction, attr.output_indices, &op); - } - op.tensor_to_grid_ = TensorToGrid::kWBToX_HDToY_SToZ; - return op; -} - -GPUOperation CreatePooling(const OperationDef &definition, const Pooling3DAttributes &attr) -{ - GPUOperation op(definition); - op.args_.AddInt("kernel_size_x", attr.kernel.w); - op.args_.AddInt("padding_x", -attr.padding.prepended.w); - op.args_.AddInt("stride_x", attr.strides.w); - op.args_.AddInt("kernel_size_y", attr.kernel.h); - op.args_.AddInt("padding_y", -attr.padding.prepended.h); - op.args_.AddInt("stride_y", attr.strides.h); - op.args_.AddInt("kernel_size_z", attr.kernel.d); - op.args_.AddInt("padding_z", 
-attr.padding.prepended.d);
-  op.args_.AddInt("stride_z", attr.strides.d);
-  const bool stride_correction = definition.IsBatchSupported() && attr.strides.w != 1;
-  if (attr.type == PoolingType::AVERAGE)
-  {
-    op.code_ = GetAveragePoolingKernelCode(definition, stride_correction, &op);
-  }
-  else if (attr.type == PoolingType::MAX)
-  {
-    op.code_ = GetMaxPoolingKernelCode(definition, stride_correction, attr.output_indices, &op);
-  }
-  op.tensor_to_grid_ = TensorToGrid::kWBToX_HDToY_SToZ;
-  return op;
-}
-
-} // namespace gpu_cl
-} // namespace backend
-} // namespace onert
diff --git a/runtime/onert/backend/gpu_cl/open_cl/kernels/Pooling.h b/runtime/onert/backend/gpu_cl/open_cl/kernels/Pooling.h
deleted file mode 100644
index 166d81591..000000000
--- a/runtime/onert/backend/gpu_cl/open_cl/kernels/Pooling.h
+++ /dev/null
@@ -1,43 +0,0 @@
-/*
- * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright 2019 The TensorFlow Authors. All Rights Reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __ONERT_BACKEND_GPU_CL_OPENCL_KERNELS_POOLING_H__
-#define __ONERT_BACKEND_GPU_CL_OPENCL_KERNELS_POOLING_H__
-
-#include "GpuOperation.h"
-
-#include "open_cl/Operations.h"
-#include "open_cl/Precision.h"
-#include "open_cl/ClKernel.h"
-#include "open_cl/Tensor.h"
-
-namespace onert
-{
-namespace backend
-{
-namespace gpu_cl
-{
-
-GPUOperation CreatePooling(const OperationDef &definition, const Pooling2DAttributes &attr);
-
-GPUOperation CreatePooling(const OperationDef &definition, const Pooling3DAttributes &attr);
-
-} // namespace gpu_cl
-} // namespace backend
-} // namespace onert
-
-#endif // __ONERT_BACKEND_GPU_CL_OPENCL_KERNELS_POOLING_H__
diff --git a/runtime/onert/backend/gpu_cl/open_cl/kernels/Relu.cc b/runtime/onert/backend/gpu_cl/open_cl/kernels/Relu.cc
deleted file mode 100644
index 37f87e599..000000000
--- a/runtime/onert/backend/gpu_cl/open_cl/kernels/Relu.cc
+++ /dev/null
@@ -1,80 +0,0 @@
-/*
- * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright 2019 The TensorFlow Authors. All Rights Reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
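For orientation, a sketch of how the CreatePooling factories above are typically driven. The OperationDef is normally assembled by the operation-selection layer, so def is assumed here to be already populated with matching src/dst tensor descriptors; HW is assumed to be the height/width pair from open_cl/Shape.h:

Pooling2DAttributes attr;
attr.type = PoolingType::AVERAGE;
attr.kernel = HW(2, 2);            // becomes args kernel_size_y / kernel_size_x
attr.strides = HW(2, 2);           // becomes args stride_y / stride_x
attr.padding.prepended = HW(0, 0); // stored negated as padding_y / padding_x
attr.output_indices = false;       // second dst tensor, MAX pooling only

GPUOperation op = CreatePooling(def, attr);
// op.code_ now holds the generated OpenCL source, and tensor_to_grid_ is
// kWBToX_HDToY_SToZ, i.e. grid = (W*B, H*D, Slices) of the dst tensor.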
- */ - -#include "Relu.h" - -#include <string> -#include "Util.h" -#include "GpuOperation.h" -#include "absl/strings/str_cat.h" -#include "open_cl/Precision.h" - -namespace onert -{ -namespace backend -{ -namespace gpu_cl -{ - -GPUOperation CreateReLU(const OperationDef &definition, const ReLUAttributes &attr) -{ - GPUOperation op(definition); - op.elementwise_ = true; - - std::string min_func; - if (attr.alpha != 0.0f) - { - min_func = "min(in_out_value * args.alpha, (FLT)(0.0f))"; - if (definition.precision == CalculationsPrecision::F32) - { - op.args_.AddFloat("alpha", attr.alpha); - } - else - { -#ifdef FIXME_PORTING_HALF_REQIRED - op.args_.AddHalf("alpha", half(attr.alpha)); -#endif - } - } - else - { - min_func = "(FLT)(0.0f)"; - } - if (attr.clip != 0.0f) - { - if (definition.precision == CalculationsPrecision::F32) - { - op.args_.AddFloat("clip", attr.clip); - } - else - { -#ifdef FIXME_PORTING_HALF_REQIRED - op.args_.AddHalf("clip", half(attr.clip)); -#endif - } - op.code_ = absl::StrCat("in_out_value = clamp(in_out_value, " + min_func + ", args.clip);"); - } - else - { - op.code_ = absl::StrCat("in_out_value = max(in_out_value, ", min_func, ");"); - } - return op; -} - -} // namespace gpu_cl -} // namespace backend -} // namespace onert diff --git a/runtime/onert/backend/gpu_cl/open_cl/kernels/Relu.h b/runtime/onert/backend/gpu_cl/open_cl/kernels/Relu.h deleted file mode 100644 index eb6b1ad1d..000000000 --- a/runtime/onert/backend/gpu_cl/open_cl/kernels/Relu.h +++ /dev/null @@ -1,40 +0,0 @@ -/* - * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright 2020 The TensorFlow Authors. All Rights Reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef __ONERT_BACKEND_GPU_CL_OPEN_CL_KERNELS_RELU_H__ -#define __ONERT_BACKEND_GPU_CL_OPEN_CL_KERNELS_RELU_H__ - -#include "open_cl/ClKernel.h" -#include "GpuOperation.h" -#include "open_cl/Precision.h" -#include "open_cl/Tensor.h" -#include "open_cl/Types.h" -#include "open_cl/Operations.h" -namespace onert -{ -namespace backend -{ -namespace gpu_cl -{ - -GPUOperation CreateReLU(const OperationDef &definition, const ReLUAttributes &attr); - -} // namespace gpu_cl -} // namespace backend -} // namespace onert - -#endif // __ONERT_BACKEND_GPU_CL_OPEN_CL_KERNELS_RELU_H__ diff --git a/runtime/onert/backend/gpu_cl/open_cl/kernels/Reshape.cc b/runtime/onert/backend/gpu_cl/open_cl/kernels/Reshape.cc deleted file mode 100644 index cdd3e8364..000000000 --- a/runtime/onert/backend/gpu_cl/open_cl/kernels/Reshape.cc +++ /dev/null @@ -1,111 +0,0 @@ -/* - * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright 2019 The TensorFlow Authors. All Rights Reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
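Because CreateReLU above marks the operation as elementwise_, its entire contribution is the single in_out_value statement in code_, which the linking machinery splices into the consumer kernel. For illustration, the snippet it generates for three common attribute combinations (F32 precision, so alpha and clip are bound as float arguments):

// alpha == 0, clip == 0 (plain ReLU):
in_out_value = max(in_out_value, (FLT)(0.0f));
// alpha != 0, clip == 0 (leaky ReLU):
in_out_value = max(in_out_value, min(in_out_value * args.alpha, (FLT)(0.0f)));
// alpha == 0, clip != 0 (e.g. ReLU6):
in_out_value = clamp(in_out_value, (FLT)(0.0f), args.clip);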
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "Reshape.h" - -#include <string> - -#include "Util.h" -#include "open_cl/Status.h" - -namespace onert -{ -namespace backend -{ -namespace gpu_cl -{ -namespace -{ -std::string GetReshapeCode(const OperationDef &op_def) -{ - std::string c = GetCommonDefines(op_def.precision); - c += "__kernel void main_function(\n"; - c += "$0) {\n"; - if (op_def.dst_tensors[0].HasAxis(Axis::BATCH)) - { - c += " int linear_id = get_global_id(0);\n"; - c += " int X = linear_id / args.dst_tensor.Batch();\n"; - c += " int B = linear_id % args.dst_tensor.Batch();\n"; - c += " args.dst_tensor.SetBatchRef(B);\n"; - } - else - { - c += " int X = get_global_id(0);\n"; - } - c += " int Y = get_global_id(1);\n"; - c += " int Z = get_global_id(2);\n"; - c += " if (X >= args.dst_tensor.Width() || Y >= args.dst_tensor.Height() || " - "Z >= args.dst_tensor.Slices()) { \n"; - c += " return; \n"; - c += " } \n"; - c += " FLT temps[4];\n"; - c += " temps[0] = (FLT)(0.0f);\n"; - c += " temps[1] = (FLT)(0.0f);\n"; - c += " temps[2] = (FLT)(0.0f);\n"; - c += " temps[3] = (FLT)(0.0f);\n"; - if (op_def.dst_tensors[0].HasAxis(Axis::BATCH)) - { - c += " int base = B;\n"; - } - else - { - c += " int base = 0;\n"; - } - c += " base = ((base * args.dst_tensor.Height() + Y) * " - "args.dst_tensor.Width() + X) * args.dst_tensor.Channels() + Z * 4;\n"; - c += " for (int i = 0; i < 4; ++i) {\n"; - c += " int dst_channel = Z * 4 + i;\n"; - c += " if (dst_channel < args.dst_tensor.Channels()) {;\n"; - c += " int p = base + i;\n"; - c += " int src_c = p % args.src_tensor.Channels();\n"; - c += " p = p / args.src_tensor.Channels();\n"; - c += " int src_x = p % args.src_tensor.Width();\n"; - c += " p = p / args.src_tensor.Width();\n"; - c += " int src_y = p % args.src_tensor.Height();\n"; - if (op_def.src_tensors[0].HasAxis(Axis::BATCH)) - { - c += " int src_b = p / args.src_tensor.Height();\n"; - c += " args.src_tensor.SetBatchRef(src_b);\n"; - } - c += " int src_z = src_c / 4;\n"; - c += " int src_sub_ch = src_c % 4;\n"; - c += " FLT4 t = args.src_tensor.Read(src_x, src_y, src_z);\n"; - c += " FLT t_ar[4] = {t.x, t.y, t.z, t.w};\n"; - c += " temps[i] = t_ar[src_sub_ch];\n"; - c += " }\n"; - c += " }\n"; - c += " FLT4 result = (FLT4)(temps[0], temps[1], temps[2], temps[3]);\n"; - c += " args.dst_tensor.Write(result, X, Y, Z);\n"; - c += "}\n"; - return c; -} - -} // namespace - -GPUOperation CreateReshape(const OperationDef &definition) -{ - GPUOperation op(definition); - op.AddSrcTensor("src_tensor", definition.src_tensors[0]); - op.AddDstTensor("dst_tensor", definition.dst_tensors[0]); - op.code_ = GetReshapeCode(definition); - op.tensor_to_grid_ = TensorToGrid::kWBToX_HDToY_SToZ; - return op; -} - -} // namespace gpu_cl -} // namespace backend -} // namespace onert diff --git a/runtime/onert/backend/gpu_cl/open_cl/kernels/Reshape.h b/runtime/onert/backend/gpu_cl/open_cl/kernels/Reshape.h deleted file mode 100644 index 4f7c5ea38..000000000 --- a/runtime/onert/backend/gpu_cl/open_cl/kernels/Reshape.h +++ /dev/null @@ -1,41 +0,0 @@ -/* - * Copyright (c) 2021 Samsung Electronics Co., Ltd. 
All Rights Reserved - * Copyright 2019 The TensorFlow Authors. All Rights Reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef __ONERT_BACKEND_GPU_CL_OPENCL_KERNELS_RESHAPE_H__ -#define __ONERT_BACKEND_GPU_CL_OPENCL_KERNELS_RESHAPE_H__ - -#include "GpuOperation.h" - -#include "open_cl/Operations.h" -#include "open_cl/Precision.h" -#include "open_cl/ClKernel.h" -#include "open_cl/Tensor.h" - -namespace onert -{ -namespace backend -{ -namespace gpu_cl -{ - -GPUOperation CreateReshape(const OperationDef &definition); - -} // namespace gpu_cl -} // namespace backend -} // namespace onert - -#endif // __ONERT_BACKEND_GPU_CL_OPENCL_KERNELS_RESHAPE_H__ diff --git a/runtime/onert/backend/gpu_cl/open_cl/kernels/Reshapex4.cc b/runtime/onert/backend/gpu_cl/open_cl/kernels/Reshapex4.cc deleted file mode 100644 index 13010e791..000000000 --- a/runtime/onert/backend/gpu_cl/open_cl/kernels/Reshapex4.cc +++ /dev/null @@ -1,96 +0,0 @@ -/* - * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright 2019 The TensorFlow Authors. All Rights Reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
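The generated Reshape kernel above works by flattening each destination element into a linear BHWC index (base + i) and unravelling that index back into source coordinates, which is what lets it reshape across arbitrary channel boundaries. A worked trace with small, hypothetical shapes:

// dst: H=1, W=2, C=6; element X=1, Y=0, Z=1, i=0 (dst channel 4):
//   base = ((0 * 1 + 0) * 2 + 1) * 6 + 1 * 4 = 10, so p = 10
// src: H=2, W=2, C=3:
//   src_c = 10 % 3 = 1;  p = 10 / 3 = 3
//   src_x = 3 % 2 = 1;   p = 3 / 2 = 1
//   src_y = 1 % 2 = 1
//   src_z = 1 / 4 = 0, src_sub_ch = 1  -> the kernel reads src(1, 1, slice 0).y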
- */ - -#include "Reshape.h" - -#include <string> - -#include "Util.h" -#include "open_cl/Status.h" - -namespace onert -{ -namespace backend -{ -namespace gpu_cl -{ -namespace -{ - -std::string GetReshapeCode(const OperationDef &op_def) -{ - std::string c = GetCommonDefines(op_def.precision); - c += "__kernel void main_function(\n"; - c += "$0) {\n"; - if (op_def.dst_tensors[0].HasAxis(Axis::BATCH)) - { - c += " int linear_id = get_global_id(0);\n"; - c += " int X = linear_id / args.dst_tensor.Batch();\n"; - c += " int B = linear_id % args.dst_tensor.Batch();\n"; - c += " args.dst_tensor.SetBatchRef(B);\n"; - } - else - { - c += " int X = get_global_id(0);\n"; - } - c += " int Y = get_global_id(1);\n"; - c += " int Z = get_global_id(2);\n"; - c += " if (X >= args.dst_tensor.Width() || Y >= args.dst_tensor.Height() || " - "Z >= args.dst_tensor.Slices()) { \n"; - c += " return; \n"; - c += " } \n"; - if (op_def.dst_tensors[0].HasAxis(Axis::BATCH)) - { - c += " int dst_bhwc4 = B;\n"; - } - else - { - c += " int dst_bhwc4 = 0;\n"; - } - c += " dst_bhwc4 = ((dst_bhwc4 * args.dst_tensor.Height() + Y) * " - "args.dst_tensor.Width() + X) * args.dst_tensor.Slices() + Z;\n"; - c += " int src_z = dst_bhwc4 % args.src_tensor.Slices();\n"; - c += " dst_bhwc4 = dst_bhwc4 / args.src_tensor.Slices();\n"; - c += " int src_x = dst_bhwc4 % args.src_tensor.Width();\n"; - c += " dst_bhwc4 = dst_bhwc4 / args.src_tensor.Width();\n"; - c += " int src_y = dst_bhwc4 % args.src_tensor.Height();\n"; - if (op_def.src_tensors[0].HasAxis(Axis::BATCH)) - { - c += " int src_b = dst_bhwc4 / args.src_tensor.Height();\n"; - c += " args.src_tensor.SetBatchRef(src_b);\n"; - } - c += " FLT4 result = args.src_tensor.Read(src_x, src_y, src_z);\n"; - c += " args.dst_tensor.Write(result, X, Y, Z);\n"; - c += "}\n"; - return c; -} - -} // namespace - -GPUOperation CreateReshapex4(const OperationDef &definition) -{ - GPUOperation op(definition); - op.AddSrcTensor("src_tensor", definition.src_tensors[0]); - op.AddDstTensor("dst_tensor", definition.dst_tensors[0]); - op.code_ = GetReshapeCode(definition); - op.tensor_to_grid_ = TensorToGrid::kWBToX_HDToY_SToZ; - return op; -} - -} // namespace gpu_cl -} // namespace backend -} // namespace onert diff --git a/runtime/onert/backend/gpu_cl/open_cl/kernels/Reshapex4.h b/runtime/onert/backend/gpu_cl/open_cl/kernels/Reshapex4.h deleted file mode 100644 index 8988e8bd4..000000000 --- a/runtime/onert/backend/gpu_cl/open_cl/kernels/Reshapex4.h +++ /dev/null @@ -1,42 +0,0 @@ -/* - * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright 2019 The TensorFlow Authors. All Rights Reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
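The Reshapex4 variant above copies whole FLT4 slices and never splits a slice across channels, so it is only correct when both channel counts are multiples of 4 (see the header that follows). A plausible host-side dispatch between the two kernels, as a sketch (the helper name SelectReshape is hypothetical):

GPUOperation SelectReshape(int src_channels, int dst_channels, const OperationDef &def)
{
  // Slice-wise copy is valid only if no FLT4 slice straddles a reshape
  // boundary, i.e. both tensors have channel counts aligned to 4.
  if (src_channels % 4 == 0 && dst_channels % 4 == 0)
  {
    return CreateReshapex4(def);
  }
  // General per-channel path from Reshape.cc (slower but always correct).
  return CreateReshape(def);
}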
- */ - -#ifndef __ONERT_BACKEND_GPU_CL_OPEN_CL_KERNELS_RESHAPEX4_H__ -#define __ONERT_BACKEND_GPU_CL_OPEN_CL_KERNELS_RESHAPEX4_H__ - -#include "GpuOperation.h" - -#include "open_cl/Operations.h" -#include "open_cl/Precision.h" -#include "open_cl/ClKernel.h" -#include "open_cl/Tensor.h" - -namespace onert -{ -namespace backend -{ -namespace gpu_cl -{ - -// More optimized, but require src_channels % 4 == 0 and dst_channels % 4 == 0 -GPUOperation CreateReshapex4(const OperationDef &definition); - -} // namespace gpu_cl -} // namespace backend -} // namespace onert - -#endif // __ONERT_BACKEND_GPU_CL_OPEN_CL_KERNELS_RESHAPEX4_H__ diff --git a/runtime/onert/backend/gpu_cl/open_cl/kernels/Softmax.cc b/runtime/onert/backend/gpu_cl/open_cl/kernels/Softmax.cc deleted file mode 100644 index 4ee164d82..000000000 --- a/runtime/onert/backend/gpu_cl/open_cl/kernels/Softmax.cc +++ /dev/null @@ -1,86 +0,0 @@ -/* - * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright 2019 The TensorFlow Authors. All Rights Reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "Softmax.h" - -#include <string> - -#include "Util.h" -#include "WorkGroupPicking.h" -#include "GpuOperation.h" -#include "open_cl/Status.h" - -namespace onert -{ -namespace backend -{ -namespace gpu_cl -{ - -namespace -{ -std::string GetSoftmaxKernelCode(const OperationDef &op_def) -{ - std::string c = GetCommonDefines(op_def.precision); - c += "__kernel void main_function(\n"; - c += "$0) {\n"; - c += " int X = get_global_id(0);\n"; - c += " int Y = get_global_id(1);\n"; - c += " if (X >= args.dst_tensor.Width() || Y >= args.dst_tensor.Height()) " - "return; \n"; - c += " float sum = 0.0f;\n"; - c += " for (int d = 0; d < args.dst_tensor.Slices(); ++d) {\n"; - c += " float4 t = args.src_tensor.Read<float>(X, Y, d);\n"; - c += " sum += exp(t.x);\n"; - c += " if (d * 4 + 1 < args.dst_tensor.Channels()) sum += exp(t.y);\n"; - c += " if (d * 4 + 2 < args.dst_tensor.Channels()) sum += exp(t.z);\n"; - c += " if (d * 4 + 3 < args.dst_tensor.Channels()) sum += exp(t.w);\n"; - c += " }\n"; - c += " for (int d = 0; d < args.dst_tensor.Slices(); ++d) {\n"; - c += " float4 t = args.src_tensor.Read<float>(X, Y, d);\n"; - c += " t = exp(t) / sum;\n"; - c += " FLT4 result = TO_FLT4(t);\n"; - c += " args.dst_tensor.Write(result, X, Y, d);\n"; - c += " }\n"; - c += "}\n"; - return c; -} -} // namespace - -GPUOperation CreateSoftmax(const OperationDef &definition) -{ - GPUOperation op(definition); - auto src_desc = definition.src_tensors[0]; - if (definition.IsBatchSupported()) - { - src_desc.SetStateVar("BatchedWidth", "true"); - } - op.AddSrcTensor("src_tensor", src_desc); - auto dst_desc = definition.dst_tensors[0]; - if (definition.IsBatchSupported()) - { - dst_desc.SetStateVar("BatchedWidth", "true"); - } - op.AddDstTensor("dst_tensor", dst_desc); - op.code_ = GetSoftmaxKernelCode(definition); - op.tensor_to_grid_ = TensorToGrid::kWBToX_HDToY_ZIs1; - return op; -} - -} // namespace gpu_cl -} // 
namespace backend -} // namespace onert diff --git a/runtime/onert/backend/gpu_cl/open_cl/kernels/Softmax.h b/runtime/onert/backend/gpu_cl/open_cl/kernels/Softmax.h deleted file mode 100644 index 594bab042..000000000 --- a/runtime/onert/backend/gpu_cl/open_cl/kernels/Softmax.h +++ /dev/null @@ -1,40 +0,0 @@ -/* - * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright 2019 The TensorFlow Authors. All Rights Reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef __ONERT_BACKEND_GPU_CL_OPEN_CL_KERNELS_SOFTMAX_H__ -#define __ONERT_BACKEND_GPU_CL_OPEN_CL_KERNELS_SOFTMAX_H__ - -#include "open_cl/ClKernel.h" -#include "GpuOperation.h" -#include "open_cl/Precision.h" -#include "open_cl/Tensor.h" -#include "open_cl/Types.h" - -namespace onert -{ -namespace backend -{ -namespace gpu_cl -{ - -GPUOperation CreateSoftmax(const OperationDef &definition); - -} // namespace gpu_cl -} // namespace backend -} // namespace onert - -#endif // __ONERT_BACKEND_GPU_CL_OPEN_CL_KERNELS_SOFTMAX_H__ diff --git a/runtime/onert/backend/gpu_cl/open_cl/kernels/Softmax1x1.cc b/runtime/onert/backend/gpu_cl/open_cl/kernels/Softmax1x1.cc deleted file mode 100644 index 590952dca..000000000 --- a/runtime/onert/backend/gpu_cl/open_cl/kernels/Softmax1x1.cc +++ /dev/null @@ -1,138 +0,0 @@ -/* - * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright 2019 The TensorFlow Authors. All Rights Reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
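The softmax kernel above makes two passes over the channel slices: one to accumulate the denominator, one to normalize. The per-lane guards exist because channels are packed four to a slice, so the last slice may contain unused lanes. A quick trace with 6 channels (Slices() == 2), for illustration:

// d == 0: lanes x..w are channels 0..3 -> all four exp() terms are added.
// d == 1: lane x is channel 4 (added unconditionally), lane y is channel 5
//         (added because 1*4+1 = 5 < 6); lanes z and w fail the guard.
// The second pass divides every lane by the same sum; the unused lanes of the
// last slice hold garbage after the division, but are never read back.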
- */ - -#include "Softmax1x1.h" - -#include <string> - -#include "Util.h" -#include "open_cl/Status.h" - -namespace onert -{ -namespace backend -{ -namespace gpu_cl -{ - -Softmax1x1::Softmax1x1(const OperationDef &definition) : GPUOperation(definition) -{ - work_group_size_ = int3(32, 1, 1); - code_ = GetSoftmaxKernelCode(definition_); -} - -Softmax1x1::Softmax1x1(Softmax1x1 &&kernel) : GPUOperation(std::move(kernel)) {} - -Softmax1x1 &Softmax1x1::operator=(Softmax1x1 &&kernel) -{ - if (this != &kernel) - { - GPUOperation::operator=(std::move(kernel)); - } - return *this; -} - -std::string Softmax1x1::GetSoftmaxKernelCode(const OperationDef &op_def) -{ - AddSrcTensor("src_tensor", op_def.src_tensors[0]); - AddDstTensor("dst_tensor", op_def.dst_tensors[0]); - args_.AddFloat("mask_x"); - args_.AddFloat("mask_y"); - args_.AddFloat("mask_z"); - args_.AddFloat("mask_w"); - args_.AddInt("slices_x32"); - - std::string c = GetCommonDefines(op_def.precision); - c += "__kernel void main_function(\n"; - c += "$0) {\n"; - if (op_def.IsBatchSupported()) - { - c += " int batch_id = get_global_id(1);\n"; - c += " if (batch_id >= args.dst_tensor.Batch()) return;\n"; - c += " args.dst_tensor.SetBatchRef(batch_id);\n"; - c += " args.src_tensor.SetBatchRef(batch_id);\n"; - } - c += " float4 mask = (float4)(args.mask_x, args.mask_y, args.mask_z, " - "args.mask_w);\n"; - c += " int offset = 0;\n"; - c += " float sum = 0.0f;\n"; - c += " int s = 0;\n"; - c += " int tid = get_local_id(0);\n"; - c += " do {\n"; - c += " int z = offset + tid;\n"; - c += " if (z < args.dst_tensor.Slices()) {\n"; - c += " float4 mask_temp = z == args.dst_tensor.Slices() - 1 ? mask : " - "(float4)(1.0f);\n"; - c += " float4 src = args.src_tensor.Read<float>(0, 0, z);\n"; - c += " sum += dot(mask_temp, exp(src));\n"; - c += " offset += 32;\n"; - c += " }\n"; - c += " s++;\n"; - c += " } while (s < args.slices_x32);\n"; - c += "\n"; - c += " __local float4 tmp[8];\n"; - c += " __local float* tmpx1 = (__local float*)tmp;\n"; - c += " tmpx1[tid] = sum;\n"; - c += " barrier(CLK_LOCAL_MEM_FENCE);\n"; - c += " if (tid == 0) {\n"; - c += " sum = dot((float4)(1.0f), tmp[0]);\n"; - c += " sum += dot((float4)(1.0f), tmp[1]);\n"; - c += " sum += dot((float4)(1.0f), tmp[2]);\n"; - c += " sum += dot((float4)(1.0f), tmp[3]);\n"; - c += " sum += dot((float4)(1.0f), tmp[4]);\n"; - c += " sum += dot((float4)(1.0f), tmp[5]);\n"; - c += " sum += dot((float4)(1.0f), tmp[6]);\n"; - c += " sum += dot((float4)(1.0f), tmp[7]);\n"; - c += " tmpx1[0] = 1.0f / sum;\n"; - c += " }\n"; - c += " barrier(CLK_LOCAL_MEM_FENCE);\n"; - c += " sum = tmpx1[0];\n"; - c += "\n"; - c += " offset = 0;\n"; - c += " s = 0;\n"; - c += " do {\n"; - c += " int z = offset + tid;\n"; - c += " if (z < args.dst_tensor.Slices()) {\n"; - c += " FLT4 res = TO_FLT4(exp(args.src_tensor.Read<float>(0, 0, " - "z))*sum);\n"; - c += " args.dst_tensor.Write(res, 0, 0, z);\n"; - c += " offset += 32;\n"; - c += " }\n"; - c += " s++;\n"; - c += " } while (s < args.slices_x32);\n"; - c += "}\n"; - return c; -} - -absl::Status Softmax1x1::BindArguments(ArgumentsBinder *args) -{ - float4 mask = GetMaskForLastPlane(src_[0]->Channels()); - RETURN_IF_ERROR(args->SetFloat("mask_x", mask.x)); - RETURN_IF_ERROR(args->SetFloat("mask_y", mask.y)); - RETURN_IF_ERROR(args->SetFloat("mask_z", mask.z)); - RETURN_IF_ERROR(args->SetFloat("mask_w", mask.w)); - RETURN_IF_ERROR(args->SetInt("slices_x32", DivideRoundUp(src_[0]->Slices(), 32))); - return absl::OkStatus(); -} - -int3 Softmax1x1::GetGridSize() const { 
return int3(32, dst_[0]->Batch(), 1); } - -Softmax1x1 CreateSoftmax1x1(const OperationDef &definition) { return Softmax1x1(definition); } - -} // namespace gpu_cl -} // namespace backend -} // namespace onert diff --git a/runtime/onert/backend/gpu_cl/open_cl/kernels/Softmax1x1.h b/runtime/onert/backend/gpu_cl/open_cl/kernels/Softmax1x1.h deleted file mode 100644 index da375d457..000000000 --- a/runtime/onert/backend/gpu_cl/open_cl/kernels/Softmax1x1.h +++ /dev/null @@ -1,61 +0,0 @@ -/* - * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright 2019 The TensorFlow Authors. All Rights Reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef __ONERT_BACKEND_GPU_CL_OPEN_CL_KERNELS_SOFTMAX1X1_H__ -#define __ONERT_BACKEND_GPU_CL_OPEN_CL_KERNELS_SOFTMAX1X1_H__ - -#include "GpuOperation.h" - -#include "open_cl/Precision.h" -#include "open_cl/ClKernel.h" -#include "open_cl/Tensor.h" - -namespace onert -{ -namespace backend -{ -namespace gpu_cl -{ - -class Softmax1x1 : public GPUOperation -{ -public: - Softmax1x1() = default; - explicit Softmax1x1(const OperationDef &definition); - - absl::Status BindArguments(ArgumentsBinder *args) override; - int3 GetGridSize() const override; - - // Move only - Softmax1x1(Softmax1x1 &&kernel); - Softmax1x1 &operator=(Softmax1x1 &&kernel); - Softmax1x1(const Softmax1x1 &) = delete; - Softmax1x1 &operator=(const Softmax1x1 &) = delete; - - friend Softmax1x1 CreateSoftmax1x1(); - -private: - std::string GetSoftmaxKernelCode(const OperationDef &op_def); -}; - -Softmax1x1 CreateSoftmax1x1(const OperationDef &definition); - -} // namespace gpu_cl -} // namespace backend -} // namespace onert - -#endif // __ONERT_BACKEND_GPU_CL_OPEN_CL_KERNELS_SOFTMAX1X1_H__ diff --git a/runtime/onert/backend/gpu_cl/open_cl/kernels/TuningParameters.h b/runtime/onert/backend/gpu_cl/open_cl/kernels/TuningParameters.h deleted file mode 100644 index 3d99b4fda..000000000 --- a/runtime/onert/backend/gpu_cl/open_cl/kernels/TuningParameters.h +++ /dev/null @@ -1,48 +0,0 @@ -/* - * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright 2019 The TensorFlow Authors. All Rights Reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
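To make BindArguments and GetGridSize above concrete, here is what gets bound for a hypothetical 1x1 feature map with 70 channels and batch 1:

// Channels() == 70 -> Slices() == DivideRoundUp(70, 4) == 18
// mask       == GetMaskForLastPlane(70) == (1, 1, 0, 0)  // 70 % 4 == 2 live lanes
// slices_x32 == DivideRoundUp(18, 32) == 1               // one pass of the do-loop
// grid       == (32, Batch() == 1, 1) with work_group_size_ (32, 1, 1): the 32
// threads each handle slices z = tid, tid + 32, ... and reduce via __local tmp[8].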
- */ - -#ifndef __ONERT_BACKEND_GPU_CL_KERNELS_TUNING_PARAMETERS_H__ -#define __ONERT_BACKEND_GPU_CL_KERNELS_TUNING_PARAMETERS_H__ - -#include "open_cl/ClCommandQueue.h" -#include "open_cl/DeviceInfo.h" - -namespace onert -{ -namespace backend -{ -namespace gpu_cl -{ - -enum class TuningType -{ - EXHAUSTIVE, - FAST -}; - -struct TuningParameters -{ - ProfilingCommandQueue *queue; - const DeviceInfo *info; - TuningType tuning_type = TuningType::EXHAUSTIVE; -}; - -} // namespace gpu_cl -} // namespace backend -} // namespace onert - -#endif // __ONERT_BACKEND_GPU_CL_KERNELS_TUNING_PARAMETERS_H__ diff --git a/runtime/onert/backend/gpu_cl/open_cl/kernels/Util.cc b/runtime/onert/backend/gpu_cl/open_cl/kernels/Util.cc deleted file mode 100644 index df42c66e8..000000000 --- a/runtime/onert/backend/gpu_cl/open_cl/kernels/Util.cc +++ /dev/null @@ -1,230 +0,0 @@ -/* - * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright 2019 The TensorFlow Authors. All Rights Reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "Util.h" - -#include <cfloat> -#include <cmath> -#include <string> -#include <vector> - -#include "absl/strings/str_cat.h" -#include "absl/strings/substitute.h" -#include "open_cl/Precision.h" -#include "open_cl/DataType.h" - -namespace onert -{ -namespace backend -{ -namespace gpu_cl -{ - -std::string GetCommonDefines(CalculationsPrecision precision) -{ - std::string result; - - switch (precision) - { - case CalculationsPrecision::F32: - result += "#pragma OPENCL EXTENSION cl_khr_3d_image_writes : enable\n"; - result += "#define ACCUM_FLT4 float4\n"; - result += "#define FLT float\n"; - result += "#define FLT2 float2\n"; - result += "#define FLT3 float3\n"; - result += "#define FLT4 float4\n"; - result += "#define TO_FLT4 convert_float4\n"; - result += "#define TO_ACCUM_TYPE convert_float4\n"; - result += "#define TO_ACCUM_FLT convert_float\n"; - break; - case CalculationsPrecision::F16: - result += "#pragma OPENCL EXTENSION cl_khr_3d_image_writes : enable\n"; - result += "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n"; - result += "#define ACCUM_FLT4 half4\n"; - result += "#define FLT half\n"; - result += "#define FLT2 half2\n"; - result += "#define FLT3 half3\n"; - result += "#define FLT4 half4\n"; - result += "#define TO_FLT4 convert_half4\n"; - result += "#define TO_ACCUM_TYPE convert_half4\n"; - result += "#define TO_ACCUM_FLT convert_half\n"; - break; - case CalculationsPrecision::F32_F16: - result += "#pragma OPENCL EXTENSION cl_khr_3d_image_writes : enable\n"; - result += "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n"; - result += "#define ACCUM_FLT4 float4\n"; - result += "#define FLT half\n"; - result += "#define FLT2 half2\n"; - result += "#define FLT3 half3\n"; - result += "#define FLT4 half4\n"; - result += "#define TO_FLT4 convert_half4\n"; - result += "#define TO_ACCUM_TYPE convert_float4\n"; - result += "#define TO_ACCUM_FLT convert_float\n"; - break; - } - return result; -} - -std::string 
GetXStrideCorrectedV2(const std::string &src_x, const std::string &batch_size, - const std::string &stride_x, const std::string &padding_x) -{ - // int p0 = src_x / batch_size;\n"; - // int b0 = src_x % batch_size;\n"; - // return (p0 * stride_x + padding_x) * batch_size + b0;\n"; - return absl::Substitute("(((($0) / $1) * $2 + $3) * $1 + ($0) % $1)", src_x, batch_size, stride_x, - padding_x); -} - -float4 GetMaskForLastPlane(int channels) -{ - float4 mask = float4(0.0f); - const int reminder = channels % 4 == 0 ? 4 : channels % 4; - for (int i = 0; i < reminder; ++i) - { - mask[i] = 1.0f; - } - return mask; -} - -int3 GetFirstSuitableWorkGroup(const std::vector<int3> &wgs, int max_wg_size) -{ - for (const auto &wg : wgs) - { - const int wg_size = wg.x * wg.y * wg.z; - if (wg_size <= max_wg_size) - { - return wg; - } - } - return {1, 1, 1}; -} - -int GetRecommendedBlockSizeForConv(const DeviceInfo &device_info, CalculationsPrecision precision, - int task_size) -{ - const float task_size_per_cu = task_size / static_cast<float>(device_info.compute_units_count); - int block_size = 1; - float threshold_1 = FLT_MAX; - float threshold_2 = FLT_MAX; - float threshold_4 = FLT_MAX; - if (!device_info.IsMali()) - { - return 1; - } - MaliInfo mali_info = device_info.mali_info; - switch (precision) - { - case CalculationsPrecision::F16: - if (mali_info.IsBifrostGen1()) - { - threshold_1 = 256.0f; - threshold_2 = 256.0f * 4.0f; - threshold_4 = 256.0f * 8.0f; - } - else if (mali_info.IsBifrostGen2()) - { - threshold_1 = 256.0f * 2.0f; - threshold_2 = 256.0f * 8.0f; - threshold_4 = 256.0f * 16.0f; - } - else if (mali_info.IsBifrostGen3() || mali_info.IsValhall()) - { - threshold_1 = 256.0f; - threshold_2 = 256.0f * 6.0f; - threshold_4 = 256.0f * 16.0f; - } - else if (mali_info.IsMidgard()) - { - threshold_1 = 256.0f * 4.0f; - threshold_2 = 256.0f * 16.0f; - } - break; - case CalculationsPrecision::F32_F16: - if (mali_info.IsBifrostGen1()) - { - threshold_1 = 256.0f; - threshold_2 = 256.0f * 3.0f; - threshold_4 = 256.0f * 32.0f; - } - else if (mali_info.IsBifrostGen2()) - { - threshold_1 = 256.0f * 2.0f; - threshold_2 = 256.0f * 8.0f; - } - else if (mali_info.IsBifrostGen3() || mali_info.IsValhall()) - { - threshold_1 = 256.0f; - threshold_2 = 256.0f * 8.0f; - } - else if (mali_info.IsMidgard()) - { - threshold_1 = 256.0f * 4.0f; - } - break; - case CalculationsPrecision::F32: - if (mali_info.IsBifrostGen1()) - { - threshold_1 = 256.0f; - threshold_2 = 256.0f * 4.0f; - } - else if (mali_info.IsBifrostGen2()) - { - threshold_1 = 128.0f; - threshold_2 = 256.0f * 4.0f; - } - else if (mali_info.IsBifrostGen3() || mali_info.IsValhall()) - { - threshold_1 = 256.0f; - threshold_2 = 256.0f * 12.0f; - } - else if (mali_info.IsMidgard()) - { - threshold_1 = 256.0f * 16.0f; - } - break; - } - if (task_size_per_cu <= threshold_1) - { - block_size = 1; - } - else if (task_size_per_cu <= threshold_2) - { - block_size = 2; - } - else if (task_size_per_cu <= threshold_4) - { - block_size = 4; - } - else - { - block_size = 8; - } - return block_size; -} - -int3 GetWorkGroupsCount(const int3 &grid_size, const int3 &work_group_size) -{ - int3 work_groups_count; - work_groups_count.x = DivideRoundUp(grid_size.x, work_group_size.x); - work_groups_count.y = DivideRoundUp(grid_size.y, work_group_size.y); - work_groups_count.z = DivideRoundUp(grid_size.z, work_group_size.z); - return work_groups_count; -} - -} // namespace gpu_cl -} // namespace backend -} // namespace onert diff --git 
a/runtime/onert/backend/gpu_cl/open_cl/kernels/Util.h b/runtime/onert/backend/gpu_cl/open_cl/kernels/Util.h
deleted file mode 100644
index 8363862c1..000000000
--- a/runtime/onert/backend/gpu_cl/open_cl/kernels/Util.h
+++ /dev/null
@@ -1,73 +0,0 @@
-/*
- * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright 2019 The TensorFlow Authors. All Rights Reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __ONERT_BACKEND_GPU_CL_OPENCL_KERNELS_UTIL_H__
-#define __ONERT_BACKEND_GPU_CL_OPENCL_KERNELS_UTIL_H__
-
-#include <string>
-#include <vector>
-
-#include "open_cl/DeviceInfo.h"
-#include "open_cl/Precision.h"
-#include "open_cl/DataType.h"
-#include "open_cl/Shape.h"
-#include "open_cl/Tensor.h"
-#include "open_cl/Types.h"
-#include "open_cl/Util.h"
-
-namespace onert
-{
-namespace backend
-{
-namespace gpu_cl
-{
-
-std::string GetCommonDefines(CalculationsPrecision precision);
-
-// Calculates the correct X coordinate when stride != 1 and batch != 1 for
-// layouts with B after W (for example HWBC4), where W and B are stored in one
-// axis of the GPU resources.
-std::string GetXStrideCorrected(const std::string &src_x, const std::string &batch_size,
-                                const std::string &stride_x, const std::string &padding_x);
-
-// Calculates the correct X coordinate when stride != 1 and batch != 1 for
-// layouts with B after W (for example HWBC4), where W and B are stored in one
-// axis of the GPU resources.
-std::string GetXStrideCorrectedV2(const std::string &src_x, const std::string &batch_size,
                                   const std::string &stride_x, const std::string &padding_x);
-
-// Returns a float4 mask for the last plane (a batch of 4 channels), assuming
-// the plane size is 4. For example, with 7 channels the data is aligned to 8,
-// but the 8th channel is empty, so the last plane gets the mask (1, 1, 1, 0).
-float4 GetMaskForLastPlane(int channels);
-
-// Returns the first work group in wgs whose size does not exceed max_wg_size;
-// if no group in wgs is suitable, returns {1, 1, 1}.
-int3 GetFirstSuitableWorkGroup(const std::vector<int3> &wgs, int max_wg_size);
-
-// task_size is the number of FLT4 elements to process.
-int GetRecommendedBlockSizeForConv(const DeviceInfo &device, CalculationsPrecision precision,
-                                   int task_size);
-
-int3 GetWorkGroupsCount(const int3 &grid_size, const int3 &work_group_size);
-} // namespace gpu_cl
-} // namespace backend
-} // namespace onert
-
-#endif // __ONERT_BACKEND_GPU_CL_OPENCL_KERNELS_UTIL_H__
diff --git a/runtime/onert/backend/gpu_cl/open_cl/kernels/WorkGroupPicking.cc b/runtime/onert/backend/gpu_cl/open_cl/kernels/WorkGroupPicking.cc
deleted file mode 100644
index 214fec271..000000000
--- a/runtime/onert/backend/gpu_cl/open_cl/kernels/WorkGroupPicking.cc
+++ /dev/null
@@ -1,348 +0,0 @@
-/*
- * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright 2019 The TensorFlow Authors. All Rights Reserved.
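To make the substitution template in Util.cc above concrete: the pooling kernels call GetXStrideCorrectedV2("X", "args.src_tensor.Batch()", "args.stride_x", "args.padding_x"), which expands to the OpenCL expression below. It splits the fused width/batch coordinate apart, applies stride and padding to the spatial part only, then re-interleaves the batch:

((((X) / args.src_tensor.Batch()) * args.stride_x + args.padding_x)
  * args.src_tensor.Batch() + (X) % args.src_tensor.Batch())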
- * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "WorkGroupPicking.h" - -#include <algorithm> -#include <limits> -#include <set> -#include <vector> - -#include "open_cl/Util.h" -#include "open_cl/Types.h" - -namespace onert -{ -namespace backend -{ -namespace gpu_cl -{ - -namespace -{ - -std::vector<int2> Get2DWorkgroupsEqualTo128() -{ - return {{128, 1}, {64, 2}, {32, 4}, {16, 8}, {8, 16}, {4, 32}, {2, 64}, {1, 128}}; -} - -std::vector<int3> GenerateWorkGroupSizesXYMultipleOf(int multiplier, int3 grid, - const KernelInfo &kernel_info, - const DeviceInfo &device_info, - WorkGroupSizeAlignment z_alignment) -{ - std::vector<int3> work_groups; - work_groups.reserve(32); - - std::vector<int> possible_z_sizes = GetPossibleSizes(grid.z, z_alignment); - - for (int x = 1; x <= kernel_info.max_work_group_size; x *= 2) - { - for (int y = 1; y <= kernel_info.max_work_group_size; y *= 2) - { - int work_group_size_xy = x * y; - if (work_group_size_xy % multiplier != 0 || - work_group_size_xy > kernel_info.max_work_group_size) - { - continue; - } - for (auto z : possible_z_sizes) - { - if (work_group_size_xy * z > kernel_info.max_work_group_size) - { - continue; - } - if (x <= device_info.max_work_group_size_x && y <= device_info.max_work_group_size_y && - z <= device_info.max_work_group_size_z) - { - work_groups.push_back({x, y, z}); - } - } - } - } - return work_groups; -} - -std::vector<int3> GenerateWorkGroupSizesXMultipleOf(int multiplier, int3 grid, - const KernelInfo &kernel_info, - const DeviceInfo &device_info, - WorkGroupSizeAlignment z_alignment) -{ - std::vector<int3> work_groups; - work_groups.reserve(32); - - std::vector<int> possible_z_sizes = GetPossibleSizes(grid.z, z_alignment); - std::vector<int> possible_y_sizes = GetPossibleSizes(grid.y, WorkGroupSizeAlignment::PRECISE); - - for (int x = multiplier; x <= kernel_info.max_work_group_size && x < grid.x + multiplier; - x += multiplier) - { - for (auto y : possible_y_sizes) - { - for (auto z : possible_z_sizes) - { - if (x <= device_info.max_work_group_size_x && y <= device_info.max_work_group_size_y && - z <= device_info.max_work_group_size_z && x * y * z <= kernel_info.max_work_group_size) - { - work_groups.push_back({x, y, z}); - } - } - } - } - return work_groups; -} - -void GetWorkGroupsAlignedToGrid(const DeviceInfo &device_info, const KernelInfo &kernel_info, - const int3 &grid, std::vector<int3> *work_groups) -{ - int3 max_wg_size; - max_wg_size.x = device_info.max_work_group_size_x; - max_wg_size.y = device_info.max_work_group_size_y; - max_wg_size.z = device_info.max_work_group_size_z; - GenerateWorkGroupSizesAlignedToGrid(grid, max_wg_size, kernel_info.max_work_group_size, - work_groups); -} - -int GetPenalty(int grid_size, int group_size) -{ - const int reminder = grid_size % group_size; - return reminder == 0 ? 
0 : group_size - reminder; -} - -int GetPenalty(int2 grid_size, int2 group_size) -{ - const int p_x = GetPenalty(grid_size.x, group_size.x); - const int p_y = GetPenalty(grid_size.y, group_size.y); - return p_x * grid_size.y + p_y * grid_size.x + p_x * p_y; -} - -int GetMaxSizeWithMinPenalty(int size, int max_size) -{ - int best_size = 128; - int min_penalty = GetPenalty(size, best_size); - for (int i = 2; i * 128 <= max_size; ++i) - { - if (GetPenalty(size, i * 128) == min_penalty) - { - best_size = i * 128; - } - } - return best_size; -} - -int2 GetMaxSizeWithMinPenalty(int2 size, int max_size) -{ - std::vector<int2> base_groups = Get2DWorkgroupsEqualTo128(); - int min_penalty = std::numeric_limits<int>::max(); - for (const auto &group : base_groups) - { - min_penalty = std::min(GetPenalty(size, group), min_penalty); - } - for (const auto &group : base_groups) - { - for (int y = 1; y * group.y <= max_size; ++y) - { - int new_group_y = y * group.y; - for (int x = 1; x * group.x <= max_size; ++x) - { - int new_group_x = x * group.x; - if (new_group_x * new_group_y > max_size) - { - break; - } - if (GetPenalty(size, int2(new_group_x, new_group_y)) == min_penalty) - { - return int2(new_group_x, new_group_y); - } - } - } - } - return int2(0, 0); -} - -int GetBiggestDividerWithPriority(int number, int max_divider) -{ - if (number % 8 == 0 && 8 <= max_divider) - { - return 8; - } - if (number % 4 == 0 && 4 <= max_divider) - { - return 4; - } - if (number % 2 == 0 && 2 <= max_divider) - { - return 2; - } - for (int i = max_divider; i != 0; i--) - { - if (number % i == 0) - { - return i; - } - } - return 1; -} - -int GetBiggestDivider(int number, int max_divider) -{ - for (int i = max_divider; i != 0; i--) - { - if (number % i == 0) - { - return i; - } - } - return 1; -} - -} // namespace - -int3 GetWorkGroupXY128ConvLinear(const int3 &grid) -{ - int grid_z = GetBiggestDividerWithPriority(grid.z, 4); - if (grid.x <= 128) - { - return int3(128, 1, grid_z); - } - int grid_x = GetMaxSizeWithMinPenalty(grid.x, 512 / grid_z); - return {grid_x, 1, grid_z}; -} - -int3 GetWorkGroupXY128Conv(const int3 &grid) -{ - int grid_z = GetBiggestDividerWithPriority(grid.z, 4); - if (grid.x <= 16 && grid.y <= 8) - { - return int3(16, 8, grid_z); - } - int2 grid_xy = GetMaxSizeWithMinPenalty(int2(grid.x, grid.y), 512 / grid_z); - return int3(grid_xy.x, grid_xy.y, grid_z); -} - -// int3 GetWorkGroupXY128Simple(const int3& grid) { return int3(16, 8, 1); } - -int3 GetWorkGroup(const int3 &grid, int max_size) -{ - int wg_z = GetBiggestDividerWithPriority(grid.z, 8); - int wg_xy_size = max_size / wg_z; - int wg_x = std::min(DivideRoundUp(grid.x, 2), wg_xy_size); - int wg_y = std::min(wg_xy_size / wg_x, grid.y); - return int3(wg_x, wg_y, wg_z); -} - -int3 GetWorkGroupConv(const int3 &grid, int max_size, int max_z_size) -{ - int wg_z = GetBiggestDivider(grid.z, max_z_size); - int wg_xy_size = std::min(256, max_size) / wg_z; - int wg_x = std::min(grid.x, wg_xy_size); - int wg_y = std::min(wg_xy_size / wg_x, grid.y); - if (wg_y == grid.y && grid.y % 2 == 0) - { - wg_y = grid.y / 2; - } - return int3(wg_x, wg_y, wg_z); -} - -void GetPossibleWorkGroupsXYMultipleOf(int multiplier, const DeviceInfo &device_info, - const KernelInfo &kernel_info, const int3 &grid, - WorkGroupSizeAlignment z_alignment, - std::vector<int3> *work_groups) -{ - *work_groups = - GenerateWorkGroupSizesXYMultipleOf(multiplier, grid, kernel_info, device_info, z_alignment); -} - -void GetPossibleWorkGroupsXMultipleOf(int multiplier, const DeviceInfo 
&device_info, - const KernelInfo &kernel_info, const int3 &grid, - WorkGroupSizeAlignment z_alignment, - std::vector<int3> *work_groups) -{ - *work_groups = - GenerateWorkGroupSizesXMultipleOf(multiplier, grid, kernel_info, device_info, z_alignment); -} - -bool XY128RequiresMoreWorkGroupsThenXY128Linear(int width, int height) -{ - int planar_work_groups = DivideRoundUp(width * height, 128); - auto base_work_groups = Get2DWorkgroupsEqualTo128(); - bool have_equal_work_groups = false; - for (auto &work_group : base_work_groups) - { - int x_groups = DivideRoundUp(width, work_group.x); - int y_groups = DivideRoundUp(height, work_group.y); - int xy_groups = x_groups * y_groups; - if (xy_groups == planar_work_groups) - { - have_equal_work_groups = true; - break; - } - } - return !have_equal_work_groups; -} - -void GetPossibleWorkGroups(TuningType tuning_type, const DeviceInfo &device_info, - const KernelInfo &kernel_info, const int3 &grid, - std::vector<int3> *work_groups) -{ - switch (tuning_type) - { - case TuningType::FAST: - work_groups->push_back(GetWorkGroup(grid, kernel_info.max_work_group_size)); - return; - case TuningType::EXHAUSTIVE: - { - GetWorkGroupsAlignedToGrid(device_info, kernel_info, grid, work_groups); - return; - } - default: - work_groups->push_back({8, 4, 1}); - return; - } -} - -void GetPossibleWorkGroupsConv(TuningType tuning_type, const DeviceInfo &device_info, - const KernelInfo &kernel_info, const int3 &grid, - std::vector<int3> *work_groups) -{ - switch (tuning_type) - { - case TuningType::FAST: - { - int max_z_size = 16; - if (device_info.IsAdreno()) - { - max_z_size = device_info.IsAdreno3xx() ? 16 : 64; - } - max_z_size = std::min(max_z_size, device_info.max_work_group_size_z); - work_groups->push_back(GetWorkGroupConv(grid, kernel_info.max_work_group_size, max_z_size)); - return; - } - case TuningType::EXHAUSTIVE: - { - GetWorkGroupsAlignedToGrid(device_info, kernel_info, grid, work_groups); - return; - } - default: - work_groups->push_back({8, 4, 1}); - return; - } -} - -} // namespace gpu_cl -} // namespace backend -} // namespace onert diff --git a/runtime/onert/backend/gpu_cl/open_cl/kernels/WorkGroupPicking.h b/runtime/onert/backend/gpu_cl/open_cl/kernels/WorkGroupPicking.h deleted file mode 100644 index c19890de1..000000000 --- a/runtime/onert/backend/gpu_cl/open_cl/kernels/WorkGroupPicking.h +++ /dev/null @@ -1,67 +0,0 @@ -/* - * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright 2019 The TensorFlow Authors. All Rights Reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
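As a concrete trace of the FAST tuning path above, take a grid of (100, 56, 24) and max_work_group_size 256 (hypothetical but typical values); GetWorkGroup then evaluates to:

// wg_z       = GetBiggestDividerWithPriority(24, 8) = 8    // 24 % 8 == 0
// wg_xy_size = 256 / 8 = 32
// wg_x       = min(DivideRoundUp(100, 2), 32) = min(50, 32) = 32
// wg_y       = min(32 / 32, 56) = 1
// result: work group (32, 1, 8), exactly 256 threads per group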
- */ - -#ifndef __ONERT_BACKEND_GPU_CL_OPENCL_KERNELS_WORK_GROUP_PICKING_H__ -#define __ONERT_BACKEND_GPU_CL_OPENCL_KERNELS_WORK_GROUP_PICKING_H__ - -#include <vector> - -#include "TuningParameters.h" - -#include "open_cl/ClKernel.h" -#include "open_cl/DeviceInfo.h" -#include "open_cl/Types.h" -#include "open_cl/WorkgroupSelection.h" - -namespace onert -{ -namespace backend -{ -namespace gpu_cl -{ - -// multiplier must be a power of two -void GetPossibleWorkGroupsXYMultipleOf(int multiplier, const DeviceInfo &device_info, - const KernelInfo &kernel_info, const int3 &grid, - WorkGroupSizeAlignment z_alignment, - std::vector<int3> *work_groups); - -void GetPossibleWorkGroupsXMultipleOf(int multiplier, const DeviceInfo &device_info, - const KernelInfo &kernel_info, const int3 &grid, - WorkGroupSizeAlignment z_alignment, - std::vector<int3> *work_groups); - -int3 GetWorkGroupXY128ConvLinear(const int3 &grid); - -int3 GetWorkGroupXY128Simple(const int3 &grid); -int3 GetWorkGroupXY128Conv(const int3 &grid); - -bool XY128RequiresMoreWorkGroupsThenXY128Linear(int width, int height); - -void GetPossibleWorkGroups(TuningType tuning_type, const DeviceInfo &device_info, - const KernelInfo &kernel_info, const int3 &grid, - std::vector<int3> *work_groups); - -void GetPossibleWorkGroupsConv(TuningType tuning_type, const DeviceInfo &device_info, - const KernelInfo &kernel_info, const int3 &grid, - std::vector<int3> *work_groups); - -} // namespace gpu_cl -} // namespace backend -} // namespace onert - -#endif // __ONERT_BACKEND_GPU_CL_OPENCL_KERNELS_WORK_GROUP_PICKING_H__ diff --git a/runtime/onert/backend/gpu_cl/open_cl/selectors/ConvolutionSelector.cc b/runtime/onert/backend/gpu_cl/open_cl/selectors/ConvolutionSelector.cc deleted file mode 100644 index eac6f3270..000000000 --- a/runtime/onert/backend/gpu_cl/open_cl/selectors/ConvolutionSelector.cc +++ /dev/null @@ -1,249 +0,0 @@ -/* - * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright 2019 The TensorFlow Authors. All Rights Reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
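
For callers, the header above is a fill-a-vector API: FAST appends one heuristic choice, EXHAUSTIVE appends every grid-aligned candidate for later on-device timing. A hedged usage sketch (it assumes the deleted gpu_cl headers are on the include path; the grid value is made up):

    #include <vector>
    #include "open_cl/kernels/WorkGroupPicking.h"

    using namespace onert::backend::gpu_cl;

    std::vector<int3> PickCandidates(const DeviceInfo &device_info, const KernelInfo &kernel_info)
    {
      const int3 grid(128, 64, 16); // hypothetical dispatch grid
      std::vector<int3> work_groups;
      GetPossibleWorkGroups(TuningType::FAST, device_info, kernel_info, grid, &work_groups);
      return work_groups; // one candidate for FAST, many for EXHAUSTIVE
    }
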
- */ -#include "ConvolutionSelector.h" - -#include "absl/memory/memory.h" -#include "open_cl/kernels/ConvBuffer1x1.h" -#include "open_cl/kernels/ConvConstants.h" -#include "open_cl/kernels/ConvPowervr.h" -#include "open_cl/kernels/ConvWeightsConverter.h" -#include "open_cl/kernels/WorkGroupPicking.h" -#include "open_cl/TensorType.h" -#include "open_cl/Util.h" - -namespace onert -{ -namespace backend -{ -namespace gpu_cl -{ -namespace -{ - -std::unique_ptr<GPUOperation> SelectConvolutionAdreno(const Convolution2DAttributes &attr, - const BHWC &dst_shape, - const DeviceInfo &device_info, - const OperationDef &op_def, ModelHints) -{ - if (IsConvConstantsSupported(device_info, op_def, attr)) - { - GPUOperation conv = CreateConvConstants(device_info, op_def, attr); - return absl::make_unique<GPUOperation>(std::move(conv)); - } - else - { - ConvPowerVR conv = CreateConvPowerVR(device_info, op_def, attr, &dst_shape); - return absl::make_unique<ConvPowerVR>(std::move(conv)); - } -} - -std::unique_ptr<GPUOperation> SelectConvolutionWinogradAdreno(const Convolution2DAttributes &attr, - const BHWC &dst_shape, - const DeviceInfo &device_info, - const OperationDef &op_def, - ModelHints) -{ - ConvPowerVR conv = CreateConvPowerVRWino4x4To6x6(device_info, op_def, attr, &dst_shape); - return absl::make_unique<ConvPowerVR>(std::move(conv)); -} - -std::unique_ptr<GPUOperation> -SelectConvolutionDynamicWeightsAdreno(const Convolution2DAttributes &attr, - const BHWC &weights_shape, const BHWC &dst_shape, - const DeviceInfo &device_info, const OperationDef &op_def, - ModelHints, ConvWeightsDescription *weights_desc) -{ - ConvPowerVR conv = - CreateConvPowerVRDynamicWeights(device_info, op_def, attr, weights_shape, &dst_shape); - *weights_desc = conv.GetConvWeightsDescription(); - return absl::make_unique<ConvPowerVR>(std::move(conv)); -} - -std::unique_ptr<GPUOperation> SelectConvolutionNVidia(const Convolution2DAttributes &attr, - const BHWC &dst_shape, - const DeviceInfo &device_info, - const OperationDef &op_def) -{ - if (IsConvConstantsSupported(device_info, op_def, attr)) - { - GPUOperation conv = CreateConvConstants(device_info, op_def, attr); - return absl::make_unique<GPUOperation>(std::move(conv)); - } - else - { - ConvPowerVR conv = CreateConvPowerVR(device_info, op_def, attr, &dst_shape); - return absl::make_unique<ConvPowerVR>(std::move(conv)); - } -} - -std::unique_ptr<GPUOperation> SelectConvolutionPowerVR(const Convolution2DAttributes &attr, - const DeviceInfo &device_info, - const OperationDef &op_def) -{ - ConvPowerVR conv = CreateConvPowerVR(device_info, op_def, attr); - return absl::make_unique<ConvPowerVR>(std::move(conv)); -} - -std::unique_ptr<GPUOperation> SelectConvolutionMali(const Convolution2DAttributes &attr, - const BHWC &dst_shape, - const DeviceInfo &device_info, - const OperationDef &op_def) -{ - if (op_def.src_tensors[0].storage_type == TensorStorageType::BUFFER && - IsConvBuffer1x1Supported(op_def, attr)) - { - ConvBuffer1x1 conv = CreateConvBuffer1x1(device_info, op_def, attr, &dst_shape); - return absl::make_unique<ConvBuffer1x1>(std::move(conv)); - } - else - { - ConvPowerVR conv = CreateConvPowerVR(device_info, op_def, attr, &dst_shape); - return absl::make_unique<ConvPowerVR>(std::move(conv)); - } -} - -std::unique_ptr<GPUOperation> SelectConvolutionWinogradMali(const Convolution2DAttributes &attr, - const BHWC &dst_shape, - const DeviceInfo &device_info, - const OperationDef &op_def) -{ - if (op_def.src_tensors[0].storage_type == TensorStorageType::BUFFER) - { - 
ConvBuffer1x1 conv = CreateConvBuffer1x1Wino4x4To6x6(device_info, op_def, attr, &dst_shape); - return absl::make_unique<ConvBuffer1x1>(std::move(conv)); - } - else - { - ConvPowerVR conv = CreateConvPowerVRWino4x4To6x6(device_info, op_def, attr, &dst_shape); - return absl::make_unique<ConvPowerVR>(std::move(conv)); - } -} - -std::unique_ptr<GPUOperation> -SelectConvolutionDynamicWeightsMali(const Convolution2DAttributes &attr, const BHWC &weights_shape, - const BHWC &dst_shape, const DeviceInfo &device_info, - const OperationDef &op_def, ModelHints, - ConvWeightsDescription *weights_desc) -{ - if (op_def.src_tensors[0].storage_type == TensorStorageType::BUFFER && - IsConvBuffer1x1Supported(op_def, weights_shape, attr)) - { - ConvBuffer1x1 conv = - CreateConvBuffer1x1DynamicWeights(device_info, op_def, attr, weights_shape, &dst_shape); - *weights_desc = conv.GetConvWeightsDescription(); - return absl::make_unique<ConvBuffer1x1>(std::move(conv)); - } - else - { - ConvPowerVR conv = - CreateConvPowerVRDynamicWeights(device_info, op_def, attr, weights_shape, &dst_shape); - *weights_desc = conv.GetConvWeightsDescription(); - return absl::make_unique<ConvPowerVR>(std::move(conv)); - } -} - -} // namespace - -std::unique_ptr<GPUOperation> SelectConvolution(const Convolution2DAttributes &attr, - const BHWC &dst_shape, - const DeviceInfo &device_info, - const OperationDef &op_def, ModelHints hints) -{ - if (device_info.IsAdreno()) - { - return SelectConvolutionAdreno(attr, dst_shape, device_info, op_def, hints); - } - else if (device_info.IsPowerVR() || device_info.IsAMD() || device_info.IsIntel()) - { - return SelectConvolutionPowerVR(attr, device_info, op_def); - } - else if (device_info.IsNvidia()) - { - return SelectConvolutionNVidia(attr, dst_shape, device_info, op_def); - } - else if (device_info.IsMali()) - { - return SelectConvolutionMali(attr, dst_shape, device_info, op_def); - } - else - { - return SelectConvolutionAdreno(attr, dst_shape, device_info, op_def, hints); - } -} - -std::unique_ptr<GPUOperation> SelectConvolutionForWinograd(const Convolution2DAttributes &attr, - const BHWC &dst_shape, - const DeviceInfo &device_info, - const OperationDef &op_def, - ModelHints hints) -{ - if (device_info.IsAdreno()) - { - return SelectConvolutionWinogradAdreno(attr, dst_shape, device_info, op_def, hints); - } - else if (device_info.IsPowerVR() || device_info.IsAMD() || device_info.IsNvidia() || - device_info.IsIntel()) - { - ConvPowerVR conv = CreateConvPowerVRWino4x4To6x6(device_info, op_def, attr, &dst_shape); - return absl::make_unique<ConvPowerVR>(std::move(conv)); - } - else if (device_info.IsMali()) - { - return SelectConvolutionWinogradMali(attr, dst_shape, device_info, op_def); - } - else - { - return SelectConvolutionWinogradAdreno(attr, dst_shape, device_info, op_def, hints); - } -} - -std::unique_ptr<GPUOperation> -SelectConvolutionWithDynamicWeights(const Convolution2DAttributes &attr, const BHWC &weights_shape, - const BHWC &dst_shape, const DeviceInfo &device_info, - const OperationDef &op_def, ModelHints hints, - ConvWeightsDescription *weights_desc) -{ - if (device_info.IsAdreno()) - { - return SelectConvolutionDynamicWeightsAdreno(attr, weights_shape, dst_shape, device_info, - op_def, hints, weights_desc); - } - else if (device_info.IsMali()) - { - return SelectConvolutionDynamicWeightsMali(attr, weights_shape, dst_shape, device_info, op_def, - hints, weights_desc); - } - else - { - ConvPowerVR conv = - CreateConvPowerVRDynamicWeights(device_info, op_def, attr, weights_shape, 
&dst_shape); - *weights_desc = conv.GetConvWeightsDescription(); - return absl::make_unique<ConvPowerVR>(std::move(conv)); - } -} - -std::unique_ptr<GPUOperation> -SelectConverterToConvWeights(const ConvWeightsDescription &weights_desc, const OperationDef &op_def, - ModelHints) -{ - ConverterToConvWeights converter = ConverterToConvWeights(op_def, weights_desc); - return absl::make_unique<ConverterToConvWeights>(std::move(converter)); -} - -} // namespace gpu_cl -} // namespace backend -} // namespace onert diff --git a/runtime/onert/backend/gpu_cl/open_cl/selectors/ConvolutionSelector.h b/runtime/onert/backend/gpu_cl/open_cl/selectors/ConvolutionSelector.h deleted file mode 100644 index d45eea8bd..000000000 --- a/runtime/onert/backend/gpu_cl/open_cl/selectors/ConvolutionSelector.h +++ /dev/null @@ -1,62 +0,0 @@ -/* - * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright 2019 The TensorFlow Authors. All Rights Reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef __ONERT_BACKEND_GPU_CL_OPENCL_SELECTORS_CONVOLUTION_SELECTOR_H__ -#define __ONERT_BACKEND_GPU_CL_OPENCL_SELECTORS_CONVOLUTION_SELECTOR_H__ - -#include <memory> - -#include "open_cl/kernels/ConvCommon.h" -#include "open_cl/kernels/GpuOperation.h" -#include "open_cl/ModelHints.h" -#include "open_cl/Operations.h" -#include "open_cl/Shape.h" -#include "open_cl/Status.h" - -namespace onert -{ -namespace backend -{ -namespace gpu_cl -{ - -std::unique_ptr<GPUOperation> SelectConvolution(const Convolution2DAttributes &attr, - const BHWC &dst_shape, - const DeviceInfo &device_info, - const OperationDef &op_def, ModelHints hints); - -std::unique_ptr<GPUOperation> SelectConvolutionForWinograd(const Convolution2DAttributes &attr, - const BHWC &dst_shape, - const DeviceInfo &device_info, - const OperationDef &op_def, - ModelHints hints); - -std::unique_ptr<GPUOperation> -SelectConvolutionWithDynamicWeights(const Convolution2DAttributes &attr, const BHWC &weights_shape, - const BHWC &dst_shape, const DeviceInfo &device_info, - const OperationDef &op_def, ModelHints hints, - ConvWeightsDescription *weights_desc); - -std::unique_ptr<GPUOperation> -SelectConverterToConvWeights(const ConvWeightsDescription &weights_desc, const OperationDef &op_def, - ModelHints hints); - -} // namespace gpu_cl -} // namespace backend -} // namespace onert - -#endif // __ONERT_BACKEND_GPU_CL_OPENCL_SELECTORS_CONVOLUTION_SELECTOR_H__ diff --git a/runtime/onert/backend/gpu_cl/open_cl/selectors/DwConvolutionSelector.cc b/runtime/onert/backend/gpu_cl/open_cl/selectors/DwConvolutionSelector.cc deleted file mode 100644 index f07eef689..000000000 --- a/runtime/onert/backend/gpu_cl/open_cl/selectors/DwConvolutionSelector.cc +++ /dev/null @@ -1,107 +0,0 @@ -/* - * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright 2020 The TensorFlow Authors. All Rights Reserved. 
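
The selector above hides all vendor dispatch behind one call: Adreno, PowerVR/AMD/Intel, NVidia, and Mali each get a specialized kernel, and unknown GPUs fall back to the Adreno path. A hedged call-site sketch (a default-constructed ModelHints is assumed valid here):

    #include "open_cl/selectors/ConvolutionSelector.h"

    using namespace onert::backend::gpu_cl;

    std::unique_ptr<GPUOperation> MakeConv(const Convolution2DAttributes &attr,
                                           const BHWC &dst_shape, const DeviceInfo &device_info,
                                           const OperationDef &op_def)
    {
      // Returns a ready-to-compile GPUOperation specialized for the device.
      return SelectConvolution(attr, dst_shape, device_info, op_def, ModelHints());
    }
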
- * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "DwConvolutionSelector.h" - -#include "absl/memory/memory.h" -#include "open_cl/ClDevice.h" -#include "open_cl/kernels/DepthwiseConv.h" -#include "open_cl/kernels/DepthwiseConv3x3.h" -#include "open_cl/Precision.h" - -namespace onert -{ -namespace backend -{ -namespace gpu_cl -{ -namespace -{ - -std::unique_ptr<GPUOperation> -SelectDWConvolutionAdreno(const DepthwiseConvolution2DAttributes &attr, - const DeviceInfo &device_info, const OperationDef &op_def) -{ - if (IsDepthwiseConv3x3Supported(attr)) - { - return absl::make_unique<DepthwiseConv3x3>(CreateDepthwiseConv3x3(device_info, op_def, attr)); - } - else - { - return absl::make_unique<GPUOperation>(CreateDepthwiseConvolution2D(device_info, op_def, attr)); - } -} - -std::unique_ptr<GPUOperation> -SelectDWConvolutionPowerVR(const DepthwiseConvolution2DAttributes &attr, - const DeviceInfo &device_info, const OperationDef &op_def) -{ - if (IsDepthwiseConv3x3Supported(attr)) - { - return absl::make_unique<DepthwiseConv3x3>(CreateDepthwiseConv3x3(device_info, op_def, attr)); - } - else - { - return absl::make_unique<GPUOperation>(CreateDepthwiseConvolution2D(device_info, op_def, attr)); - } -} - -std::unique_ptr<GPUOperation> SelectDWConvolutionMali(const DepthwiseConvolution2DAttributes &attr, - const DeviceInfo &device_info, - const OperationDef &op_def) -{ - const auto storage_type = op_def.src_tensors[0].storage_type; - bool buffer_type = - storage_type == TensorStorageType::BUFFER || storage_type == TensorStorageType::IMAGE_BUFFER; - const MaliInfo mali_info = device_info.mali_info; - if (IsDepthwiseConv3x3Supported(attr) && !mali_info.IsMidgard() && !buffer_type && - op_def.precision != CalculationsPrecision::F32) - { - return absl::make_unique<DepthwiseConv3x3>(CreateDepthwiseConv3x3(device_info, op_def, attr)); - } - else - { - return absl::make_unique<GPUOperation>(CreateDepthwiseConvolution2D(device_info, op_def, attr)); - } -} -} // namespace - -std::unique_ptr<GPUOperation> SelectDWConvolution(const DepthwiseConvolution2DAttributes &attr, - const DeviceInfo &device_info, - const OperationDef &op_def) -{ - if (device_info.IsAdreno()) - { - return SelectDWConvolutionAdreno(attr, device_info, op_def); - } - else if (device_info.IsPowerVR()) - { - return SelectDWConvolutionPowerVR(attr, device_info, op_def); - } - else if (device_info.IsMali()) - { - return SelectDWConvolutionMali(attr, device_info, op_def); - } - else - { - return SelectDWConvolutionAdreno(attr, device_info, op_def); - } -} - -} // namespace gpu_cl -} // namespace backend -} // namespace onert diff --git a/runtime/onert/backend/gpu_cl/open_cl/selectors/DwConvolutionSelector.h b/runtime/onert/backend/gpu_cl/open_cl/selectors/DwConvolutionSelector.h deleted file mode 100644 index 2fa40c5c3..000000000 --- a/runtime/onert/backend/gpu_cl/open_cl/selectors/DwConvolutionSelector.h +++ /dev/null @@ -1,42 +0,0 @@ -/* - * Copyright (c) 2021 Samsung Electronics Co., Ltd. 
All Rights Reserved - * Copyright 2020 The TensorFlow Authors. All Rights Reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef __ONERT_BACKEND_GPU_CL_OPENCL_SELECTORS_DW_CONVOLUTION_SELECTOR_H__ -#define __ONERT_BACKEND_GPU_CL_OPENCL_SELECTORS_DW_CONVOLUTION_SELECTOR_H__ - -#include <memory> - -#include "open_cl/kernels/GpuOperation.h" -#include "open_cl/Operations.h" -#include "open_cl/Status.h" - -namespace onert -{ -namespace backend -{ -namespace gpu_cl -{ - -std::unique_ptr<GPUOperation> SelectDWConvolution(const DepthwiseConvolution2DAttributes &attr, - const DeviceInfo &device_info, - const OperationDef &op_def); - -} // namespace gpu_cl -} // namespace backend -} // namespace onert - -#endif // __ONERT_BACKEND_GPU_CL_OPENCL_SELECTORS_DW_CONVOLUTION_SELECTOR_H__ diff --git a/runtime/onert/backend/gpu_cl/open_cl/selectors/SimpleSelectors.cc b/runtime/onert/backend/gpu_cl/open_cl/selectors/SimpleSelectors.cc deleted file mode 100644 index ac514b26c..000000000 --- a/runtime/onert/backend/gpu_cl/open_cl/selectors/SimpleSelectors.cc +++ /dev/null @@ -1,98 +0,0 @@ -/* - * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright 2020 The TensorFlow Authors. All Rights Reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
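
The depthwise selector just removed gates its fast 3x3 kernel on the device: on Mali it additionally requires a post-Midgard GPU, a non-buffer source tensor, and sub-F32 precision. A hedged call-site sketch (assumes the deleted gpu_cl headers):

    #include "open_cl/selectors/DwConvolutionSelector.h"

    using namespace onert::backend::gpu_cl;

    std::unique_ptr<GPUOperation> MakeDwConv(const DepthwiseConvolution2DAttributes &attr,
                                             const DeviceInfo &device_info,
                                             const OperationDef &op_def)
    {
      // Falls back to the generic DepthwiseConv kernel whenever the
      // specialized 3x3 path is not applicable on this device.
      return SelectDWConvolution(attr, device_info, op_def);
    }
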
- */ - -#include "SimpleSelectors.h" - -#include <memory> -#include <set> - -#include "open_cl/kernels/Add.h" -#include "open_cl/kernels/DepthwiseConv.h" -#include "open_cl/kernels/Pooling.h" -#include "open_cl/kernels/Relu.h" -#include "open_cl/kernels/Reshape.h" -#include "open_cl/kernels/Reshapex4.h" -#include "open_cl/kernels/Softmax.h" -#include "open_cl/kernels/Softmax1x1.h" - -namespace onert -{ -namespace backend -{ -namespace gpu_cl -{ - -void SelectAdd(const OperationDef &op_def, const std::vector<int> &channels, int dst_channels, - std::unique_ptr<GPUOperation> *ptr) -{ - GPUOperation operation = CreateAdd(op_def, channels, dst_channels); - *ptr = std::make_unique<GPUOperation>(std::move(operation)); -} - -std::unique_ptr<GPUOperation> -SelectDWConvolutionDynamicWeights(const DepthwiseConvolution2DAttributes &attr, - const DeviceInfo &device_info, const OperationDef &op_def) -{ - return absl::make_unique<GPUOperation>( - CreateDepthwiseConvolution2DDynamicWeights(device_info, op_def, attr)); -} - -std::unique_ptr<GPUOperation> SelectPooling(const Pooling2DAttributes &attr, - const OperationDef &op_def) -{ - GPUOperation operation = CreatePooling(op_def, attr); - return absl::make_unique<GPUOperation>(std::move(operation)); -} - -std::unique_ptr<GPUOperation> SelectReLU(const ReLUAttributes &attr, const OperationDef &op_def) -{ - return absl::make_unique<GPUOperation>(CreateReLU(op_def, attr)); -} - -void SelectReshape(int src_channels, int dst_channels, const OperationDef &op_def, - std::unique_ptr<GPUOperation> *ptr) -{ - if (src_channels % 4 == 0 && dst_channels % 4 == 0) - { - GPUOperation operation = CreateReshapex4(op_def); - *ptr = std::make_unique<GPUOperation>(std::move(operation)); - } - else - { - GPUOperation operation = CreateReshape(op_def); - *ptr = std::make_unique<GPUOperation>(std::move(operation)); - } -} - -void SelectSoftmax(const BHWC &shape, const OperationDef &op_def, - std::unique_ptr<GPUOperation> *ptr) -{ - if (shape.w == 1 && shape.h == 1) - { - Softmax1x1 operation = CreateSoftmax1x1(op_def); - *ptr = absl::make_unique<Softmax1x1>(std::move(operation)); - } - else - { - GPUOperation operation = CreateSoftmax(op_def); - *ptr = absl::make_unique<GPUOperation>(std::move(operation)); - } -} - -} // namespace gpu_cl -} // namespace backend -} // namespace onert diff --git a/runtime/onert/backend/gpu_cl/open_cl/selectors/SimpleSelectors.h b/runtime/onert/backend/gpu_cl/open_cl/selectors/SimpleSelectors.h deleted file mode 100644 index 2c5837a1d..000000000 --- a/runtime/onert/backend/gpu_cl/open_cl/selectors/SimpleSelectors.h +++ /dev/null @@ -1,57 +0,0 @@ -/* - * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright 2020 The TensorFlow Authors. All Rights Reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
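
SelectSoftmax and SelectReshape above pick vectorized kernels only when the shape allows it: Softmax1x1 needs a 1x1 spatial shape, Reshapex4 needs both channel counts divisible by four. A hedged call-site sketch (shape and channel values are illustrative):

    #include <memory>
    #include "open_cl/selectors/SimpleSelectors.h"

    using namespace onert::backend::gpu_cl;

    void BuildKernels(const OperationDef &op_def)
    {
      std::unique_ptr<GPUOperation> softmax;
      SelectSoftmax(BHWC(1, 1, 1, 1024), op_def, &softmax); // h == w == 1 -> Softmax1x1

      std::unique_ptr<GPUOperation> reshape;
      SelectReshape(64, 128, op_def, &reshape); // both % 4 == 0 -> Reshapex4
    }
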
- */ - -#ifndef __ONERT_BACKEND_GPU_CL_OPENCL_SELECTORS_SIMPLE_SELECTORS_H__ -#define __ONERT_BACKEND_GPU_CL_OPENCL_SELECTORS_SIMPLE_SELECTORS_H__ - -#include <memory> - -#include "open_cl/ClDevice.h" -#include "open_cl/kernels/GpuOperation.h" -#include "open_cl/Operations.h" -#include "open_cl/Shape.h" - -namespace onert -{ -namespace backend -{ -namespace gpu_cl -{ - -void SelectAdd(const OperationDef &op_def, const std::vector<int> &channels, int dst_channels, - std::unique_ptr<GPUOperation> *ptr); - -std::unique_ptr<GPUOperation> -SelectDWConvolutionDynamicWeights(const DepthwiseConvolution2DAttributes &attr, - const DeviceInfo &device_info, const OperationDef &op_def); - -std::unique_ptr<GPUOperation> SelectPooling(const Pooling2DAttributes &attr, - const OperationDef &op_def); - -std::unique_ptr<GPUOperation> SelectReLU(const ReLUAttributes &attr, const OperationDef &op_def); - -void SelectReshape(int src_channels, int dst_channels, const OperationDef &op_def, - std::unique_ptr<GPUOperation> *ptr); - -void SelectSoftmax(const BHWC &shape, const OperationDef &op_def, - std::unique_ptr<GPUOperation> *ptr); - -} // namespace gpu_cl -} // namespace backend -} // namespace onert - -#endif // __ONERT_BACKEND_GPU_CL_OPENCL_SELECTORS_SIMPLE_SELECTORS_H__ diff --git a/runtime/onert/backend/gpu_cl/operand/CLTensor.cc b/runtime/onert/backend/gpu_cl/operand/CLTensor.cc index 6dd9bd252..d3ed102a1 100644 --- a/runtime/onert/backend/gpu_cl/operand/CLTensor.cc +++ b/runtime/onert/backend/gpu_cl/operand/CLTensor.cc @@ -16,10 +16,12 @@ #include "CLTensor.h" -#include "open_cl/Buffer.h" -#include "open_cl/ClContext.h" -#include "open_cl/Tensor.h" -#include "open_cl/TensorType.h" +#include "tensorflow/lite/delegates/gpu/cl/buffer.h" +#include "tensorflow/lite/delegates/gpu/cl/cl_context.h" +#include "tensorflow/lite/delegates/gpu/cl/tensor.h" +#include "tensorflow/lite/delegates/gpu/cl/tensor_type.h" + +using namespace tflite::gpu::cl; namespace onert { @@ -30,16 +32,15 @@ namespace gpu_cl namespace operand { -CLTensor::CLTensor(size_t rank, ir::Shape shape, std::shared_ptr<Environment> environment) - : ICLTensor{rank, shape, environment}, _tensor(std::make_shared<Tensor>()) +CLTensor::CLTensor(size_t rank, ir::Shape shape, + std::shared_ptr<tflite::gpu::cl::Environment> environment, TensorType type) + : ICLTensor{rank, shape, environment, type}, _tensor(std::make_shared<Tensor>()) { } -const Tensor *CLTensor::handle() const { return _tensor.get(); } - -Tensor *CLTensor::handle() { return _tensor.get(); } +const tflite::gpu::cl::Tensor *CLTensor::handle() const { return _tensor.get(); } -void CLTensor::setBuffer(void *host_ptr) { (void)host_ptr; } +tflite::gpu::cl::Tensor *CLTensor::handle() { return _tensor.get(); } } // namespace operand } // namespace gpu_cl diff --git a/runtime/onert/backend/gpu_cl/operand/CLTensor.h b/runtime/onert/backend/gpu_cl/operand/CLTensor.h index 7d2e70a99..f2153f430 100644 --- a/runtime/onert/backend/gpu_cl/operand/CLTensor.h +++ b/runtime/onert/backend/gpu_cl/operand/CLTensor.h @@ -19,9 +19,9 @@ #include "ICLTensor.h" -#include "open_cl/Buffer.h" -#include "open_cl/ClContext.h" -#include "open_cl/Tensor.h" +#include "tensorflow/lite/delegates/gpu/cl/buffer.h" +#include "tensorflow/lite/delegates/gpu/cl/cl_context.h" +#include "tensorflow/lite/delegates/gpu/cl/tensor.h" namespace onert { @@ -38,11 +38,12 @@ public: CLTensor() = delete; public: - CLTensor(size_t rank, ir::Shape shape, std::shared_ptr<Environment> environment); + CLTensor(size_t rank, ir::Shape shape, 
std::shared_ptr<tflite::gpu::cl::Environment> environment, + TensorType type); public: - const Tensor *handle() const override; - Tensor *handle() override; + const tflite::gpu::cl::Tensor *handle() const override; + tflite::gpu::cl::Tensor *handle() override; public: /** Set given buffer as the buffer of the tensor @@ -55,7 +56,7 @@ public: void setBuffer(void *host_ptr); private: - std::shared_ptr<Tensor> _tensor; + std::shared_ptr<tflite::gpu::cl::Tensor> _tensor; }; } // namespace operand diff --git a/runtime/onert/backend/gpu_cl/operand/ICLTensor.cc b/runtime/onert/backend/gpu_cl/operand/ICLTensor.cc index 3f070be0c..a95f78056 100644 --- a/runtime/onert/backend/gpu_cl/operand/ICLTensor.cc +++ b/runtime/onert/backend/gpu_cl/operand/ICLTensor.cc @@ -16,11 +16,11 @@ #include "ICLTensor.h" -#include "open_cl/Api.h" -#include "open_cl/Spi.h" -#include "open_cl/OpenclWrapper.h" -#include "open_cl/TensorTypeUtil.h" -#include "open_cl/kernels/Converter.h" +#include "tensorflow/lite/delegates/gpu/api.h" +#include "tensorflow/lite/delegates/gpu/spi.h" +#include "tensorflow/lite/delegates/gpu/cl/opencl_wrapper.h" +#include "tensorflow/lite/delegates/gpu/cl/tensor_type_util.h" +#include "tensorflow/lite/delegates/gpu/cl/kernels/converter.h" namespace onert { @@ -31,6 +31,10 @@ namespace gpu_cl namespace operand { +using namespace tflite::gpu; +using namespace tflite::gpu::cl; +using namespace tflite::gpu::internal_tensor; + void ICLTensor::access(const std::function<void(ITensor &tensor)> &fn) { if (total_size() == 0) @@ -39,100 +43,133 @@ void ICLTensor::access(const std::function<void(ITensor &tensor)> &fn) fn(*this); } -void ICLTensor::enqueueWriteBuffer(const void *ptr, bool) +void ICLTensor::writeConvertInit() { - const float *arr = (float *)ptr; - TensorObject input_obj = MakeReadableCpuMemory(absl::MakeSpan(arr, total_size() / 4)); + TensorObjectDef input_def; + input_def.dimensions.b = handle()->Batch(); + input_def.dimensions.h = handle()->Height(); + input_def.dimensions.w = handle()->Width(); + input_def.dimensions.c = handle()->Channels(); + input_def.object_def.data_layout = DataLayout::BHWC; + input_def.object_def.data_type = DataType::FLOAT32; + input_def.object_def.object_type = ObjectType::CPU_MEMORY; + input_def.object_def.user_provided = true; - TensorObject output_obj; + TensorObjectDef permute_def = input_def; + permute_def.object_def.object_type = ToObjectType(handle()->GetStorageType()); - if (handle()->GetStorageType() == TensorStorageType::BUFFER) + auto dims = permute_def.dimensions; + const BHWC shape(dims.b, dims.h, dims.w, dims.c); + const TensorDescriptor desc{ + permute_def.object_def.data_type, + ToTensorStorageType(permute_def.object_def.object_type, permute_def.object_def.data_layout), + Layout::BHWC}; + if (!AllocateTensorMemory(_environment->context(), shape, desc, &_cl_memory).ok()) { - output_obj = OpenClBuffer{handle()->GetMemoryPtr()}; + throw std::runtime_error("Failed to AllocateTensorMemory"); } - else if (handle()->GetStorageType() == TensorStorageType::IMAGE_BUFFER) + + TensorObjectDef output_def = permute_def; + output_def.object_def.data_layout = ToDataLayout(handle()->GetStorageType()); + output_def.object_def.data_type = handle()->GetDataType(); + input_def.object_def.user_provided = false; + + _converter_builder = NewConverterBuilder(_environment.get()); + if (!_converter_builder->MakeConverter(input_def, permute_def, &_converter_to).ok()) { - output_obj = OpenClBuffer{handle()->GetMemoryPtrForWriting()}; + throw std::runtime_error("Failed to make 
converter_to"); } - else + if (!_converter_builder->MakeConverter(permute_def, output_def, &_converter_from).ok()) { - output_obj = OpenClTexture{handle()->GetMemoryPtr()}; + throw std::runtime_error("Failed to make converter_from"); } +} + +void ICLTensor::readConvertInit() +{ + _converter_builder = NewConverterBuilder(_environment.get()); TensorObjectDef input_def; input_def.dimensions.b = handle()->Batch(); input_def.dimensions.h = handle()->Height(); input_def.dimensions.w = handle()->Width(); input_def.dimensions.c = handle()->Channels(); - input_def.object_def.data_layout = DataLayout::BHWC; - input_def.object_def.data_type = DataType::FLOAT32; - input_def.object_def.object_type = ObjectType::CPU_MEMORY; - input_def.object_def.user_provided = true; + input_def.object_def.data_layout = ToDataLayout(handle()->GetStorageType()); + input_def.object_def.data_type = handle()->GetDataType(); + input_def.object_def.object_type = ToObjectType(handle()->GetStorageType()); + input_def.object_def.user_provided = false; - TensorObjectDef tmp_def; - tmp_def.dimensions.b = handle()->Batch(); - tmp_def.dimensions.h = handle()->Height(); - tmp_def.dimensions.w = handle()->Width(); - tmp_def.dimensions.c = handle()->Channels(); - tmp_def.object_def.data_layout = DataLayout::BHWC; - tmp_def.object_def.data_type = DataType::FLOAT32; - tmp_def.object_def.object_type = ToObjectType(handle()->GetStorageType()); - tmp_def.object_def.user_provided = true; - - auto dims = tmp_def.dimensions; + TensorObjectDef permute_def = input_def; + permute_def.object_def.data_layout = DataLayout::BHWC; + permute_def.object_def.data_type = DataType::FLOAT32; + permute_def.object_def.user_provided = true; + + auto dims = permute_def.dimensions; const BHWC shape(dims.b, dims.h, dims.w, dims.c); const TensorDescriptor desc{ - tmp_def.object_def.data_type, - ToTensorStorageType(tmp_def.object_def.object_type, tmp_def.object_def.data_layout), + permute_def.object_def.data_type, + ToTensorStorageType(permute_def.object_def.object_type, permute_def.object_def.data_layout), Layout::BHWC}; if (!AllocateTensorMemory(_environment->context(), shape, desc, &_cl_memory).ok()) { - throw std::runtime_error("AllocateTensorMemory error."); + throw std::runtime_error("Failed to AllocateTensorMemory"); } - TensorObject tmp_obj; - if (tmp_def.object_def.object_type == ObjectType::OPENCL_TEXTURE) + + TensorObjectDef output_def = permute_def; + output_def.object_def.object_type = ObjectType::CPU_MEMORY; + + if (!_converter_builder->MakeConverter(input_def, permute_def, &_converter_from).ok()) { - tmp_obj = OpenClTexture{_cl_memory.memory()}; + throw std::runtime_error("Failed to make converter_from"); } - else + if (!_converter_builder->MakeConverter(permute_def, output_def, &_converter_to).ok()) { - tmp_obj = OpenClBuffer{_cl_memory.memory()}; + throw std::runtime_error("Failed to make converter_to"); } +} - TensorObjectDef output_def = input_def; - output_def.dimensions.b = handle()->Batch(); - output_def.dimensions.h = handle()->Height(); - output_def.dimensions.w = handle()->Width(); - output_def.dimensions.c = handle()->Channels(); - output_def.object_def.data_layout = ToDataLayout(handle()->GetStorageType()); - output_def.object_def.data_type = handle()->GetDataType(); - output_def.object_def.object_type = ToObjectType(handle()->GetStorageType()); +void ICLTensor::enqueueWriteBuffer(const void *ptr, bool) +{ + TensorObject input_obj = + MakeReadableCpuMemory(absl::MakeSpan(static_cast<const float *>(ptr), _shape.num_elements())); - 
_converter_builder = NewConverterBuilder(_environment.get()); - if (!_converter_builder->MakeConverter(input_def, tmp_def, &_converter_cpu).ok()) + TensorObject output_obj; + + TensorObject permute_obj; + if (ToObjectType(handle()->GetStorageType()) == ObjectType::OPENCL_TEXTURE) { - throw std::runtime_error("MakeConverter<_converter_cpu> error."); + permute_obj = OpenClTexture{_cl_memory.memory()}; } - if (!_converter_builder->MakeConverter(tmp_def, output_def, &_converter_bhwc).ok()) + else { - throw std::runtime_error("MakeConverter<_converter_bhwc> error."); + permute_obj = OpenClBuffer{_cl_memory.memory()}; } - if (!_converter_cpu->Convert(input_obj, tmp_obj).ok()) + if (handle()->GetStorageType() == TensorStorageType::BUFFER) { - throw std::runtime_error("[w] _converter_cpu Convert error."); + output_obj = OpenClBuffer{handle()->GetMemoryPtr()}; } - if (!_converter_bhwc->Convert(tmp_obj, output_obj).ok()) + else if (handle()->GetStorageType() == TensorStorageType::IMAGE_BUFFER) { - throw std::runtime_error("[w] _converter_bhwc Convert error."); + output_obj = OpenClBuffer{handle()->GetMemoryPtrForWriting()}; + } + else + { + output_obj = OpenClTexture{handle()->GetMemoryPtr()}; + } + + if (!_converter_to->Convert(input_obj, permute_obj).ok()) + { + throw std::runtime_error("Failed to write cl buffer from cpu memory"); + } + if (!_converter_from->Convert(permute_obj, output_obj).ok()) + { + throw std::runtime_error("Failed to change layout"); } } void ICLTensor::enqueueReadBuffer(void *ptr, bool) { - float *arr = (float *)ptr; - TensorObject output_obj = MakeCpuMemory(absl::MakeSpan(arr, total_size() / 4)); - TensorObject input_obj; if (handle()->GetStorageType() == TensorStorageType::BUFFER) @@ -148,72 +185,26 @@ void ICLTensor::enqueueReadBuffer(void *ptr, bool) input_obj = OpenClTexture{handle()->GetMemoryPtr()}; } - TensorObjectDef input_def; - input_def.dimensions.b = handle()->Batch(); - input_def.dimensions.h = handle()->Height(); - input_def.dimensions.w = handle()->Width(); - input_def.dimensions.c = handle()->Channels(); - input_def.object_def.data_layout = ToDataLayout(handle()->GetStorageType()); - input_def.object_def.data_type = handle()->GetDataType(); - input_def.object_def.object_type = ToObjectType(handle()->GetStorageType()); - input_def.object_def.user_provided = false; - - TensorObjectDef tmp_def; - tmp_def.dimensions.b = handle()->Batch(); - tmp_def.dimensions.h = handle()->Height(); - tmp_def.dimensions.w = handle()->Width(); - tmp_def.dimensions.c = handle()->Channels(); - tmp_def.object_def.data_layout = DataLayout::BHWC; - tmp_def.object_def.data_type = DataType::FLOAT32; - tmp_def.object_def.object_type = ToObjectType(handle()->GetStorageType()); - tmp_def.object_def.user_provided = true; - - auto dims = tmp_def.dimensions; - const BHWC shape(dims.b, dims.h, dims.w, dims.c); - const TensorDescriptor desc{ - tmp_def.object_def.data_type, - ToTensorStorageType(tmp_def.object_def.object_type, tmp_def.object_def.data_layout), - Layout::BHWC}; - if (!AllocateTensorMemory(_environment->context(), shape, desc, &_cl_memory).ok()) + TensorObject permute_obj; + if (ToObjectType(handle()->GetStorageType()) == ObjectType::OPENCL_TEXTURE) { - throw std::runtime_error("AllocateTensorMemory error."); - } - TensorObject tmp_obj; - if (tmp_def.object_def.object_type == ObjectType::OPENCL_TEXTURE) - { - tmp_obj = OpenClTexture{_cl_memory.memory()}; + permute_obj = OpenClTexture{_cl_memory.memory()}; } else { - tmp_obj = OpenClBuffer{_cl_memory.memory()}; + permute_obj = 
OpenClBuffer{_cl_memory.memory()}; } - TensorObjectDef output_def = input_def; - output_def.dimensions.b = handle()->Batch(); - output_def.dimensions.h = handle()->Height(); - output_def.dimensions.w = handle()->Width(); - output_def.dimensions.c = handle()->Channels(); - output_def.object_def.data_layout = DataLayout::BHWC; - output_def.object_def.data_type = DataType::FLOAT32; - output_def.object_def.object_type = ObjectType::CPU_MEMORY; - output_def.object_def.user_provided = true; - _converter_builder = NewConverterBuilder(_environment.get()); - if (!_converter_builder->MakeConverter(input_def, tmp_def, &_converter_bhwc).ok()) - { - throw std::runtime_error("MakeConverter<_converter_bhwc> error."); - } - if (!_converter_builder->MakeConverter(tmp_def, output_def, &_converter_cpu).ok()) - { - throw std::runtime_error("MakeConverter<_converter_cpu> error."); - } + TensorObject output_obj = + MakeCpuMemory(absl::MakeSpan(static_cast<float *>(ptr), _shape.num_elements())); - if (!_converter_bhwc->Convert(input_obj, tmp_obj).ok()) + if (!_converter_from->Convert(input_obj, permute_obj).ok()) { - throw std::runtime_error("[r] _converter_bhwc Convert error."); + throw std::runtime_error("Failed to change layout"); } - if (!_converter_cpu->Convert(tmp_obj, output_obj).ok()) + if (!_converter_to->Convert(permute_obj, output_obj).ok()) { - throw std::runtime_error("[r] _converter_cpu Convert error."); + throw std::runtime_error("Failed to read cl buffer"); } } diff --git a/runtime/onert/backend/gpu_cl/operand/ICLTensor.h b/runtime/onert/backend/gpu_cl/operand/ICLTensor.h index 28e905d48..b8ad4469f 100644 --- a/runtime/onert/backend/gpu_cl/operand/ICLTensor.h +++ b/runtime/onert/backend/gpu_cl/operand/ICLTensor.h @@ -19,11 +19,14 @@ #include <backend/ITensor.h> -#include "open_cl/Api.h" -#include "open_cl/Spi.h" -#include "open_cl/ClCommandQueue.h" -#include "open_cl/kernels/Converter.h" -#include "open_cl/Tensor.h" +#include "tensorflow/lite/delegates/gpu/api.h" +#include "tensorflow/lite/delegates/gpu/spi.h" +#include "tensorflow/lite/delegates/gpu/cl/cl_command_queue.h" +#include "tensorflow/lite/delegates/gpu/cl/kernels/converter.h" +#include "tensorflow/lite/delegates/gpu/cl/tensor.h" +#include "tensorflow/lite/delegates/gpu/cl/environment.h" + +#include "TensorBuilderHelper.h" namespace onert { @@ -43,19 +46,18 @@ public: ICLTensor(ICLTensor &&) = default; ICLTensor &operator=(ICLTensor &&) = default; - ICLTensor(size_t rank, ir::Shape shape, std::shared_ptr<Environment> environment) - : _rank{rank}, _shape{shape}, _environment(environment) + ICLTensor(size_t rank, ir::Shape shape, std::shared_ptr<tflite::gpu::cl::Environment> environment, + TensorType type) + : _rank{rank}, _shape{shape}, _environment(environment), _type(type) { } public: uint8_t *buffer() const final { return reinterpret_cast<uint8_t *>(handle()->GetMemoryPtr()); } size_t total_size() const final { return _shape.num_elements() * sizeof(float); } - size_t calcOffset(const ir::Coordinates &coords) const final + size_t calcOffset(const ir::Coordinates &) const final { - // NYI - (void)coords; - return 0; + throw std::runtime_error("ICLTensor::calcOffset() is not supported."); } ir::Layout layout() const final { return ir::Layout::NHWC; } ir::DataType data_type() const final { return ir::DataType::FLOAT32; } @@ -83,19 +85,24 @@ public: void enqueueWriteBuffer(const void *ptr, bool blocking = true) final; void enqueueReadBuffer(void *ptr, bool blocking = true) final; + void writeConvertInit(); + void readConvertInit(); + 
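
The two init methods declared here build the converter pair that the enqueue calls consume: writes go CPU memory -> BHWC staging buffer -> device storage layout, and reads run the same chain in reverse. A hedged caller-side sketch (the tensor and host pointers are hypothetical; the device storage must already be allocated):

    void RoundTrip(onert::backend::gpu_cl::operand::ICLTensor &t,
                   const float *host_in, float *host_out)
    {
      t.writeConvertInit();          // builds _converter_to / _converter_from
      t.enqueueWriteBuffer(host_in); // CPU -> BHWC staging -> device layout

      t.readConvertInit();
      t.enqueueReadBuffer(host_out); // device layout -> BHWC staging -> CPU
    }
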
TensorType get_type() { return _type; } + public: - virtual const Tensor *handle() const = 0; - virtual Tensor *handle() = 0; + virtual const tflite::gpu::cl::Tensor *handle() const = 0; + virtual tflite::gpu::cl::Tensor *handle() = 0; private: protected: size_t _rank; // Actual rank (reflects extended rank) ir::Shape _shape; - std::shared_ptr<Environment> _environment; - std::unique_ptr<TensorObjectConverterBuilder> _converter_builder; - CLMemory _cl_memory; - std::unique_ptr<TensorObjectConverter> _converter_cpu; - std::unique_ptr<TensorObjectConverter> _converter_bhwc; + std::shared_ptr<tflite::gpu::cl::Environment> _environment; + TensorType _type; + std::unique_ptr<tflite::gpu::TensorObjectConverterBuilder> _converter_builder; + tflite::gpu::cl::CLMemory _cl_memory; + std::unique_ptr<tflite::gpu::TensorObjectConverter> _converter_to; + std::unique_ptr<tflite::gpu::TensorObjectConverter> _converter_from; }; } // namespace operand diff --git a/runtime/onert/backend/ruy/ops/OperationUtils.h b/runtime/onert/backend/ruy/ops/OperationUtils.h index 5dfdc7ec5..716400c1f 100644 --- a/runtime/onert/backend/ruy/ops/OperationUtils.h +++ b/runtime/onert/backend/ruy/ops/OperationUtils.h @@ -18,17 +18,17 @@ #define __ONERT_BACKEND_RUY_OPS_OPERATION_UTILS_H__ #include <backend/IPortableTensor.h> +#include <ir/DataType.h> +#include <ir/Padding.h> +#include <util/CalculateActivationRange.h> #include <ruy/Shape.h> #include <ruy/Types.h> -#include <iostream> -#include <ir/DataType.h> -#include <ir/InternalType.h> -#include <ir/Padding.h> #include <limits> using OperandType = onert::ir::DataType; +using namespace onert::util; namespace onert { @@ -79,40 +79,6 @@ inline nnfw::ruy::FusedActivationFunctionType convertActivationType(const ir::Ac } } -template <typename T> -void CalculateActivationRange(ir::Activation activation, T *activation_min, T *activation_max) -{ - if (activation == ir::Activation::RELU) - { - *activation_min = 0; - *activation_max = std::numeric_limits<T>::max(); - } - else if (activation == ir::Activation::RELU6) - { - *activation_min = 0; - *activation_max = 6; - } - else if (activation == ir::Activation::RELU1) - { - *activation_min = -1; - *activation_max = 1; - } - else if (activation == ir::Activation::SIGMOID) - { - *activation_min = 0; - *activation_max = 1; - } - else if (activation == ir::Activation::NONE) - { - *activation_min = std::numeric_limits<T>::lowest(); - *activation_max = std::numeric_limits<T>::max(); - } - else - { - std::cout << "Unsupported fused activation function." << std::endl; - } -} - nnfw::ruy::PaddingType getPaddingType(ir::PaddingType ir_padding_type); } // namespace ops diff --git a/runtime/onert/backend/trix/Backend.h b/runtime/onert/backend/trix/Backend.h new file mode 100644 index 000000000..a63839720 --- /dev/null +++ b/runtime/onert/backend/trix/Backend.h @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
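
With the local template removed, ruy now shares CalculateActivationRange from util. A hedged sketch of the helper's contract, assuming the relocated function keeps the signature the deleted copy had:

    #include <util/CalculateActivationRange.h>
    #include <ir/InternalType.h>

    void ActivationExample()
    {
      float lo = 0.0f, hi = 0.0f;
      // RELU6 clamps to [0, 6]; NONE would yield the full numeric range.
      onert::util::CalculateActivationRange(onert::ir::Activation::RELU6, &lo, &hi);
      // lo == 0.0f, hi == 6.0f
    }
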
+ */ + +#ifndef __ONERT_BACKEND_TRIX_BACKEND_H__ +#define __ONERT_BACKEND_TRIX_BACKEND_H__ + +#include "BackendContext.h" +#include "Config.h" +#include "KernelGenerator.h" + +#include <backend/Backend.h> + +#include <memory> + +namespace onert +{ +namespace backend +{ +namespace trix +{ + +class Backend : public ::onert::backend::Backend +{ +public: + Backend() : _config{std::make_shared<Config>()} {} + + std::shared_ptr<IConfig> config() const override { return _config; } + + std::unique_ptr<onert::backend::BackendContext> newContext(ContextData &&data) const override + { + auto &graph = *data.graph; + auto context = std::make_unique<BackendContext>(this, std::move(data)); + auto tr = std::make_shared<basic::TensorRegistry>(); + auto tb = std::make_shared<TensorBuilder>(tr); + context->tensor_registry = tr; + context->tensor_builder = tb; + context->kernel_gen = std::make_shared<KernelGenerator>(graph, tb, tr, context->dev_context()); + return context; + } + +private: + std::shared_ptr<IConfig> _config; +}; + +} // namespace trix +} // namespace backend +} // namespace onert + +#endif // __ONERT_BACKEND_TRIX_BACKEND_H__ diff --git a/runtime/onert/backend/trix/BackendContext.cc b/runtime/onert/backend/trix/BackendContext.cc new file mode 100644 index 000000000..e46b11d20 --- /dev/null +++ b/runtime/onert/backend/trix/BackendContext.cc @@ -0,0 +1,64 @@ +/* + * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "BackendContext.h" + +#include "TensorBuilder.h" +#include "KernelGenerator.h" +#include "util/logging.h" +#include "ir/Index.h" +#include "ir/OperandIndexMap.h" +#include "ir/OperandIndexSequence.h" +#include "backend/basic/BackendContextHelpers.h" + +namespace onert +{ +namespace backend +{ +namespace trix +{ + +ITensorRegistry *BackendContext::genTensors() { return basic::genTensors(*this); } + +FunctionMap BackendContext::genKernels() +{ + FunctionMap ret; + + for (auto op_ind : _data.op_order) + { + auto fn_seq = kernel_gen->generate(op_ind); + ret.emplace_back(op_ind, std::move(fn_seq)); + } + + basic::initConsts(*this); + + // NOTE For memory optimization, we want to free some operand data + const_cast<ir::Graph &>(*_data.graph) + .operands() + .iterate([&](const ir::OperandIndex &, ir::Operand &obj) { obj.releaseData(); }); + + for (auto &it : ret) + { + auto &fn_seq = it.second; + fn_seq->iterate([&](exec::IFunction &ifunc) { ifunc.prepare(); }); + } + + return ret; +} + +} // namespace trix +} // namespace backend +} // namespace onert diff --git a/runtime/onert/backend/trix/BackendContext.h b/runtime/onert/backend/trix/BackendContext.h new file mode 100644 index 000000000..c0734c46d --- /dev/null +++ b/runtime/onert/backend/trix/BackendContext.h @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __ONERT_BACKEND_TRIX_BACKEND_CONTEXT_H__ +#define __ONERT_BACKEND_TRIX_BACKEND_CONTEXT_H__ + +#include <backend/BackendContext.h> +#include "TensorBuilder.h" +#include "KernelGenerator.h" +#include "DevContext.h" + +namespace onert +{ +namespace backend +{ +namespace trix +{ + +class BackendContext : public onert::backend::BackendContext +{ +public: + BackendContext(const Backend *backend, ContextData &&data, + std::shared_ptr<ITensorRegistry> tensor_registry = nullptr, + std::shared_ptr<TensorBuilder> tensor_builder = nullptr, + std::shared_ptr<KernelGenerator> kernel_gen = nullptr) + : onert::backend::BackendContext(backend, std::move(data), tensor_registry), + tensor_builder{tensor_builder}, kernel_gen{kernel_gen}, _dev_context(new DevContext) + { + } + + ITensorRegistry *genTensors() override; + FunctionMap genKernels() override; + + std::shared_ptr<DevContext> dev_context() { return _dev_context; } + +public: + // TODO Make it private + std::shared_ptr<TensorBuilder> tensor_builder; + std::shared_ptr<KernelGenerator> kernel_gen; + +private: + std::shared_ptr<DevContext> _dev_context; +}; + +} // namespace trix +} // namespace backend +} // namespace onert + +#endif // __ONERT_BACKEND_TRIX_BACKEND_CONTEXT_H__ diff --git a/runtime/onert/backend/trix/CMakeLists.txt b/runtime/onert/backend/trix/CMakeLists.txt new file mode 100644 index 000000000..5455757ca --- /dev/null +++ b/runtime/onert/backend/trix/CMakeLists.txt @@ -0,0 +1,24 @@ +set(LIB_ONERT_BACKEND_TRIX onert_backend_trix) + +nnfw_find_package(TRIXEngine EXACT 2.5.0 QUIET) +if(NOT TRIXEngine_FOUND) + return() +endif(NOT TRIXEngine_FOUND) + +file(GLOB_RECURSE SOURCES "*.cc") + +add_library(${LIB_ONERT_BACKEND_TRIX} SHARED ${SOURCES}) + +target_link_libraries(${LIB_ONERT_BACKEND_TRIX} PRIVATE onert_core) +target_link_libraries(${LIB_ONERT_BACKEND_TRIX} PRIVATE trix_engine) +target_link_libraries(${LIB_ONERT_BACKEND_TRIX} PRIVATE nnfw_common) +target_link_libraries(${LIB_ONERT_BACKEND_TRIX} PRIVATE nnfw_coverage) + +set_target_properties(${LIB_ONERT_BACKEND_TRIX} PROPERTIES OUTPUT_NAME backend_trix) + +if(CMAKE_BUILD_TYPE_LC STREQUAL "release") + add_custom_command(TARGET ${LIB_ONERT_BACKEND_TRIX} POST_BUILD + COMMAND ${CMAKE_STRIP} "--strip-unneeded" $<TARGET_FILE_NAME:${LIB_ONERT_BACKEND_TRIX}>) +endif() + +install(TARGETS ${LIB_ONERT_BACKEND_TRIX} DESTINATION lib) diff --git a/runtime/onert/backend/gpu_cl/open_cl/AccessType.h b/runtime/onert/backend/trix/Config.cc index 81efd666f..c23326423 100644 --- a/runtime/onert/backend/gpu_cl/open_cl/AccessType.h +++ b/runtime/onert/backend/trix/Config.cc @@ -1,12 +1,11 @@ /* - * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright 2019 The TensorFlow Authors. All Rights Reserved. + * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
* You may obtain a copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, @@ -15,25 +14,19 @@ * limitations under the License. */ -#ifndef __ONERT_BACKEND_GPU_CL_OPENCL_ACCESS_TYPE_H__ -#define __ONERT_BACKEND_GPU_CL_OPENCL_ACCESS_TYPE_H__ +#include "Config.h" namespace onert { namespace backend { -namespace gpu_cl +namespace trix { -enum class AccessType -{ - UNKNOWN, - READ, - WRITE, - READ_WRITE, -}; -} // namespace gpu_cl +bool Config::initialize() { return true; } + +ir::Layout Config::supportLayout(const ir::Operation &, ir::Layout) { return ir::Layout::NHWC; } + +} // namespace trix } // namespace backend } // namespace onert - -#endif // __ONERT_BACKEND_GPU_CL_OPENCL_ACCESS_TYPE_H__ diff --git a/runtime/onert/backend/trix/Config.h b/runtime/onert/backend/trix/Config.h new file mode 100644 index 000000000..799047d6f --- /dev/null +++ b/runtime/onert/backend/trix/Config.h @@ -0,0 +1,48 @@ +/* + * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __ONERT_BACKEND_TRIX_CONFIG_H__ +#define __ONERT_BACKEND_TRIX_CONFIG_H__ + +#include <backend/IConfig.h> +#include <memory> +#include <util/ITimer.h> + +namespace onert +{ +namespace backend +{ +namespace trix +{ + +class Config : public IConfig +{ +public: + std::string id() override { return "trix"; } + bool initialize() override; + ir::Layout supportLayout(const ir::Operation &node, ir::Layout frontend_layout) override; + bool supportPermutation() override { return true; } + bool supportDynamicTensor() override { return false; } + bool supportFP16() override { return false; } + + std::unique_ptr<util::ITimer> timer() override { return std::make_unique<util::CPUTimer>(); } +}; + +} // namespace trix +} // namespace backend +} // namespace onert + +#endif // __ONERT_BACKEND_TRIX_CONFIG_H__ diff --git a/runtime/onert/backend/trix/DevContext.h b/runtime/onert/backend/trix/DevContext.h new file mode 100644 index 000000000..482932fd4 --- /dev/null +++ b/runtime/onert/backend/trix/DevContext.h @@ -0,0 +1,118 @@ +/* + * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
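
Config above pins the trix backend to a fixed contract: NHWC layout, permutation allowed, no dynamic tensors, no FP16. A hedged sketch of a core-side query (the checks shown are illustrative, not the runtime's actual selection logic):

    #include "Config.h"

    bool TrixLooksUsable()
    {
      onert::backend::trix::Config cfg;
      // id() is the key used to select the backend ("trix").
      return cfg.initialize() && cfg.id() == "trix" && !cfg.supportDynamicTensor();
    }
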
+ */ + +#ifndef __ONERT_BACKEND_TRIX_DEV_CONTEXT_H__ +#define __ONERT_BACKEND_TRIX_DEV_CONTEXT_H__ + +#include <libnpuhost.h> + +namespace onert +{ +namespace backend +{ +namespace trix +{ + +class DevContext +{ +public: + DevContext() + { + auto device_count = getnumNPUdeviceByType(NPUCOND_TRIV2_CONN_SOCIP); + if (device_count <= 0) + { + throw std::runtime_error("Unable to find TRIV2 NPU device"); + } + + // Use NPU 0 device + if (getNPUdeviceByType(&_dev_handle, NPUCOND_TRIV2_CONN_SOCIP, 0) < 0) + { + throw std::runtime_error("Failed to get TRIV2 NPU device handle"); + } + } + + ~DevContext() + { + if (_dev_handle != nullptr) + { + unregisterNPUmodel_all(_dev_handle); + putNPUdevice(_dev_handle); + } + } + + npudev_h getDev() { return _dev_handle; } + + template <typename T> void setDataInfo(tensors_data_info *info, std::vector<T *> &tensors) + { + info->num_info = static_cast<uint32_t>(tensors.size()); + + for (uint32_t idx = 0; idx < info->num_info; ++idx) + { + info->info[idx].layout = convertDataLayout(tensors[idx]->layout()); + info->info[idx].type = convertDataType(tensors[idx]->data_type()); + } + } + + template <typename T> void setBuffer(generic_buffers *buf, std::vector<T *> &tensors) + { + buf->num_buffers = static_cast<uint32_t>(tensors.size()); + + for (uint32_t idx = 0; idx < buf->num_buffers; ++idx) + { + buf->bufs[idx].addr = tensors[idx]->buffer(); + buf->bufs[idx].size = static_cast<uint64_t>(tensors[idx]->total_size()); + buf->bufs[idx].type = BUFFER_MAPPED; + } + } + +private: + data_layout convertDataLayout(const ir::Layout layout) + { + switch (layout) + { + case ir::Layout::NCHW: + return DATA_LAYOUT_NCHW; + case ir::Layout::NHWC: + return DATA_LAYOUT_NHWC; + default: + throw std::runtime_error("Unknown Layout"); + } + } + + data_type convertDataType(const ir::DataType type) + { + switch (type) + { + case ir::DataType::QUANT_UINT8_ASYMM: + return DATA_TYPE_QASYMM8; + case ir::DataType::QUANT_INT16_SYMM: + return DATA_TYPE_QSYMM16; + default: + throw std::runtime_error("Unsupported data type"); + } + } + +private: + // NPU device handle + // TODO Support multicore npu device + npudev_h _dev_handle; +}; + +} // namespace trix +} // namespace backend +} // namespace onert + +#endif // __ONERT_BACKEND_TRIX_DEV_CONTEXT_H__ diff --git a/runtime/onert/backend/trix/KernelGenerator.cc b/runtime/onert/backend/trix/KernelGenerator.cc new file mode 100644 index 000000000..68e6840dd --- /dev/null +++ b/runtime/onert/backend/trix/KernelGenerator.cc @@ -0,0 +1,83 @@ +/* + * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "KernelGenerator.h" + +#include "ops/BulkLayer.h" + +#include <backend/Backend.h> +#include <backend/IConfig.h> +#include <memory> +#include <util/Utils.h> +#include <util/logging.h> +#include <exec/DynamicShapeInferer.h> + +#include <stdexcept> + +namespace onert +{ +namespace backend +{ +namespace trix +{ + +KernelGenerator::KernelGenerator(const ir::Graph &graph, + const std::shared_ptr<TensorBuilder> &tensor_builder, + const std::shared_ptr<basic::TensorRegistry> &tensor_reg, + const std::shared_ptr<DevContext> &dev_context) + : basic::KernelGeneratorBase{graph}, + _ctx(graph.operands()), _operations_ctx{graph.operations()}, _current_layout{graph.layout()}, + _tensor_builder(tensor_builder), _tensor_reg{tensor_reg}, _dev_context{dev_context} +{ + // DO NOTHING +} + +std::unique_ptr<exec::FunctionSequence> KernelGenerator::generate(ir::OperationIndex ind) +{ + auto ret = std::make_unique<exec::FunctionSequence>(); + ret->enableDynamicShapeInferer(false); + + const auto &op = _graph.operations().at(ind); + op.accept(*this); + ret->append(releaseFunction()); + return ret; +} + +void KernelGenerator::visit(const ir::operation::Bulk &node) +{ + using ir::operation::Bulk; + + std::vector<IPortableTensor *> output_tensors; + for (auto &ofm_idx : node.getOutputs()) + output_tensors.emplace_back(_tensor_reg->getPortableTensor(ofm_idx)); + + std::vector<const IPortableTensor *> input_tensors; + for (auto &ifm_idx : node.getInputs()) + input_tensors.emplace_back(_tensor_reg->getPortableTensor(ifm_idx)); + + // parameters + const auto binary_path = node.param().binary_path; + + auto fn = std::make_unique<ops::BulkLayer>(); + + fn->configure(input_tensors, output_tensors, binary_path, _dev_context); + + _return_fn = std::move(fn); +} + +} // namespace trix +} // namespace backend +} // namespace onert diff --git a/runtime/onert/backend/trix/KernelGenerator.h b/runtime/onert/backend/trix/KernelGenerator.h new file mode 100644 index 000000000..d87dc6952 --- /dev/null +++ b/runtime/onert/backend/trix/KernelGenerator.h @@ -0,0 +1,61 @@ +/* + * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
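generate() above follows a one-operation, one-FunctionSequence contract: the visitor turns the Bulk node into a configured kernel, and the returned sequence is what the executor later runs. A self-contained sketch of that contract — all types here are illustrative stand-ins, not the onert exec API:

#include <functional>
#include <iostream>
#include <memory>
#include <vector>

// A "kernel": the configured, ready-to-run unit a visitor produces per node.
struct Function
{
  std::function<void()> body;
  void run() { body(); }
};

// The per-operation container generate() returns to the executor.
struct FunctionSequence
{
  std::vector<std::unique_ptr<Function>> fns;
  void append(std::unique_ptr<Function> f) { fns.push_back(std::move(f)); }
  void run()
  {
    for (auto &f : fns)
      f->run();
  }
};

int main()
{
  FunctionSequence seq; // generate() would build one of these per operation
  seq.append(std::make_unique<Function>(Function{[] { std::cout << "bulk kernel\n"; }}));
  seq.run();
}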
+ */ + +#ifndef __ONERT_BACKEND_TRIX_KERNEL_GENERATOR_H__ +#define __ONERT_BACKEND_TRIX_KERNEL_GENERATOR_H__ + +#include "TensorBuilder.h" +#include "backend/basic/TensorRegistry.h" +#include "Tensor.h" +#include "DevContext.h" + +#include <backend/basic/KernelGeneratorBase.h> +#include <ir/Operands.h> +#include <ir/Operations.h> + +namespace onert +{ +namespace backend +{ +namespace trix +{ + +class KernelGenerator : public basic::KernelGeneratorBase +{ +public: + KernelGenerator(const ir::Graph &graph, const std::shared_ptr<TensorBuilder> &tensor_builder, + const std::shared_ptr<basic::TensorRegistry> &tensor_reg, + const std::shared_ptr<DevContext> &dev_context); + + std::unique_ptr<exec::FunctionSequence> generate(ir::OperationIndex op_ind) override; + +private: + void visit(const ir::operation::Bulk &node) override; + +private: + const ir::Operands &_ctx; + const ir::Operations &_operations_ctx; + ir::Layout _current_layout; + std::shared_ptr<TensorBuilder> _tensor_builder; + std::shared_ptr<basic::TensorRegistry> _tensor_reg; + const std::shared_ptr<DevContext> _dev_context; +}; + +} // namespace trix +} // namespace backend +} // namespace onert + +#endif // __ONERT_BACKEND_TRIX_KERNEL_GENERATOR_H__ diff --git a/runtime/onert/backend/acl_common/ParentInfo.h b/runtime/onert/backend/trix/Tensor.h index 708436327..5138cee71 100644 --- a/runtime/onert/backend/acl_common/ParentInfo.h +++ b/runtime/onert/backend/trix/Tensor.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,31 +14,24 @@ * limitations under the License. */ -#ifndef __ONERT_BACKEND_ACL_COMMON_PARENT_INFO_H__ -#define __ONERT_BACKEND_ACL_COMMON_PARENT_INFO_H__ +#ifndef __ONERT_BACKEND_TRIX_TENSOR_H__ +#define __ONERT_BACKEND_TRIX_TENSOR_H__ -#include <ir/Index.h> -#include <ir/Coordinates.h> +#include <backend/basic/Tensor.h> +#include <ir/Data.h> namespace onert { namespace backend { -namespace acl_common +namespace trix { -/** - * @brief Struct to represent parent operand in child operand - */ -struct ParentInfo -{ - ir::OperandIndex parent; - ir::Layout frontend_layout; - ir::Coordinates coordinates; -}; +using Tensor = basic::Tensor; +using ExternalTensor = basic::ExternalTensor; -} // namespace acl_common +} // namespace trix } // namespace backend } // namespace onert -#endif // __ONERT_BACKEND_ACL_COMMON_PARENT_INFO_H__ +#endif // __ONERT_BACKEND_TRIX_TENSOR_H__ diff --git a/runtime/onert/backend/gpu_cl/open_cl/GpuObject.cc b/runtime/onert/backend/trix/TensorBuilder.h index 774f8151f..ac6ca0f9a 100644 --- a/runtime/onert/backend/gpu_cl/open_cl/GpuObject.cc +++ b/runtime/onert/backend/trix/TensorBuilder.h @@ -1,12 +1,11 @@ /* - * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright 2020 The TensorFlow Authors. All Rights Reserved. + * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
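Note that KernelGenerator stores _dev_context as a shared_ptr and hands it to every BulkLayer it configures: the generated kernels can outlive the generator, so ownership of the device context has to be shared rather than borrowed. A self-contained sketch of that lifetime rule — the types below are illustrative:

#include <iostream>
#include <memory>

// Stand-in for the trix DevContext; prints when the device is finally released.
struct DevContext
{
  ~DevContext() { std::cout << "device released\n"; }
};

// Stand-in for a generated kernel that shares ownership of the context.
struct Kernel
{
  std::shared_ptr<DevContext> ctx;
};

int main()
{
  Kernel k;
  {
    auto ctx = std::make_shared<DevContext>(); // owned by the "generator"
    k.ctx = ctx;                               // the kernel shares ownership
  } // generator scope ends, but the context survives...
  std::cout << "kernel still usable\n";
} // ...until the last owner (the kernel) goes away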
* You may obtain a copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, @@ -15,30 +14,22 @@ * limitations under the License. */ -#include "GpuObject.h" +#ifndef __ONERT_BACKEND_TRIX_TENSOR_BUILDER_H__ +#define __ONERT_BACKEND_TRIX_TENSOR_BUILDER_H__ + +#include <backend/basic/TensorBuilder.h> namespace onert { namespace backend { -namespace gpu_cl +namespace trix { -std::string MemoryTypeToCLType(MemoryType type) -{ - switch (type) - { - case MemoryType::GLOBAL: - return "__global"; - case MemoryType::CONSTANT: - return "__constant"; - break; - case MemoryType::LOCAL: - return "__local"; - } - return ""; -} +using TensorBuilder = basic::TensorBuilder; -} // namespace gpu_cl +} // namespace trix } // namespace backend } // namespace onert + +#endif // __ONERT_BACKEND_TRIX_TENSOR_BUILDER_H__ diff --git a/runtime/onert/backend/trix/ops/BulkLayer.cc b/runtime/onert/backend/trix/ops/BulkLayer.cc new file mode 100644 index 000000000..71fdf3f0d --- /dev/null +++ b/runtime/onert/backend/trix/ops/BulkLayer.cc @@ -0,0 +1,120 @@ +/* + * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
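Tensor.h and TensorBuilder.h above show the cheapest way to stand up a new backend: alias the shared basic:: implementations instead of subclassing them, and only replace a type when the device actually needs different behaviour. Reduced to a sketch — the mock basic namespace below is illustrative:

#include <iostream>

// Mock of a shared implementation namespace like onert's basic::.
namespace basic
{
struct Tensor
{
  const char *where() const { return "host memory"; }
};
} // namespace basic

// The trix/Tensor.h pattern: zero-cost reuse via a type alias, no subclass.
namespace trix
{
using Tensor = basic::Tensor;
} // namespace trix

int main()
{
  trix::Tensor t;
  std::cout << t.where() << "\n"; // behaves exactly like basic::Tensor
}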
+ */
+
+#include "BulkLayer.h"
+#include <util/logging.h>
+
+#include <libnpuhost.h>
+
+#include <cstdlib>
+#include <stdexcept>
+#include <string>
+
+namespace onert
+{
+namespace backend
+{
+namespace trix
+{
+namespace ops
+{
+
+BulkLayer::BulkLayer() : _inputs(), _outputs(), _model_id(0), _meta(nullptr), _dev_context(nullptr)
+{
+  // DO NOTHING
+}
+
+BulkLayer::~BulkLayer() { free(_meta); }
+
+void BulkLayer::configure(const std::vector<const IPortableTensor *> &inputs,
+                          std::vector<IPortableTensor *> &outputs,
+                          const std::string &binary_path,
+                          const std::shared_ptr<DevContext> &dev_context)
+{
+  _inputs = inputs;
+  _outputs = outputs;
+  _dev_context = dev_context;
+
+  _meta = getNPUmodel_metadata(binary_path.c_str(), false);
+  if (_meta == nullptr)
+  {
+    throw std::runtime_error("Unable to extract the model metadata");
+  }
+
+  generic_buffer model_file;
+  model_file.type = BUFFER_FILE;
+  model_file.filepath = binary_path.c_str();
+  model_file.size = _meta->size;
+
+  if (registerNPUmodel(dev_context->getDev(), &model_file, &_model_id) < 0)
+  {
+    throw std::runtime_error("Failed to register npu model");
+  }
+}
+
+void BulkLayer::run()
+{
+  // Validate the I/O segment counts before creating a request so a mismatch
+  // does not leak a request id
+  if (_meta->input_seg_num != _inputs.size())
+  {
+    throw std::runtime_error("input size does not match to model input seg num");
+  }
+
+  if (_meta->output_seg_num != _outputs.size())
+  {
+    throw std::runtime_error("output size does not match to model output seg num");
+  }
+
+  int req_id;
+  if (createNPU_request(_dev_context->getDev(), _model_id, &req_id))
+  {
+    throw std::runtime_error("Unable to create NPU request with model id (" +
+                             std::to_string(_model_id) + ")");
+  }
+
+  tensors_data_info in_info;
+  tensors_data_info out_info;
+  _dev_context->setDataInfo<const IPortableTensor>(&in_info, _inputs);
+  _dev_context->setDataInfo<IPortableTensor>(&out_info, _outputs);
+
+  input_buffers input_buf;
+  output_buffers output_buf;
+  _dev_context->setBuffer<const IPortableTensor>(&input_buf, _inputs);
+  _dev_context->setBuffer<IPortableTensor>(&output_buf, _outputs);
+
+  if (setNPU_requestData(_dev_context->getDev(), req_id, &input_buf, &in_info, &output_buf,
+                         &out_info))
+  {
+    throw std::runtime_error("Unable to set NPU request data for model id (" +
+                             std::to_string(_model_id) + ")");
+  }
+
+  if (submitNPU_request(_dev_context->getDev(), req_id))
+  {
+    throw std::runtime_error("Unable to submit NPU request with req id (" +
+                             std::to_string(req_id) + ")");
+  }
+
+  if (removeNPU_request(_dev_context->getDev(), req_id))
+  {
+    throw std::runtime_error("Unable to remove NPU request with req id (" +
+                             std::to_string(req_id) + ")");
+  }
+}
+
+void BulkLayer::prepare()
+{
+  // DO NOTHING
+}
+
+} // namespace ops
+} // namespace trix
+} // namespace backend
+} // namespace onert
diff --git a/runtime/onert/backend/trix/ops/BulkLayer.h b/runtime/onert/backend/trix/ops/BulkLayer.h
new file mode 100644
index 000000000..f7080ccad
--- /dev/null
+++ b/runtime/onert/backend/trix/ops/BulkLayer.h
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
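run() above steps through create request -> set data -> submit -> remove, throwing on each failure; since the calls between create and remove can still throw, a scope guard is the usual way to guarantee the request id is always released. A self-contained sketch of that guard — create_request/remove_request are hypothetical stand-ins, not the real createNPU_request/removeNPU_request signatures:

#include <iostream>
#include <stdexcept>

// Stand-ins for the request create/remove pair.
static int g_next_req = 1;
int create_request(int /*model_id*/, int *req) { *req = g_next_req++; return 0; }
int remove_request(int req) { std::cout << "removed req " << req << "\n"; return 0; }

// Scope guard: the request is removed on every exit path, including throws.
class RequestGuard
{
public:
  explicit RequestGuard(int model_id)
  {
    if (create_request(model_id, &_req) != 0)
      throw std::runtime_error("unable to create NPU request");
  }
  ~RequestGuard() { remove_request(_req); } // runs on success and on throw
  int id() const { return _req; }

private:
  int _req;
};

int main()
{
  try
  {
    RequestGuard req{42};
    std::cout << "submitting req " << req.id() << "\n";
    throw std::runtime_error("simulated submit failure");
  }
  catch (const std::exception &e)
  {
    std::cout << "error: " << e.what() << "\n"; // the request was still removed
  }
}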
+ */
+
+#ifndef __ONERT_BACKEND_TRIX_OPS_BULKLAYER_H__
+#define __ONERT_BACKEND_TRIX_OPS_BULKLAYER_H__
+
+#include <backend/IPortableTensor.h>
+#include "../DevContext.h"
+
+#include <exec/IFunction.h>
+
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <vector>
+
+namespace onert
+{
+namespace backend
+{
+namespace trix
+{
+namespace ops
+{
+
+class BulkLayer : public ::onert::exec::IFunction
+{
+public:
+  BulkLayer();
+  ~BulkLayer();
+
+public:
+  void configure(const std::vector<const IPortableTensor *> &inputs,
+                 std::vector<IPortableTensor *> &outputs, const std::string &binary_path,
+                 const std::shared_ptr<DevContext> &dev_context);
+
+  void run() override;
+
+  void prepare() override;
+
+private:
+  std::vector<const IPortableTensor *> _inputs;
+  std::vector<IPortableTensor *> _outputs;
+
+  uint32_t _model_id;
+  npubin_meta *_meta;
+  std::shared_ptr<DevContext> _dev_context;
+};
+
+} // namespace ops
+} // namespace trix
+} // namespace backend
+} // namespace onert
+
+#endif // __ONERT_BACKEND_TRIX_OPS_BULKLAYER_H__
diff --git a/runtime/onert/backend/trix/trix.cc b/runtime/onert/backend/trix/trix.cc
new file mode 100644
index 000000000..816fb4406
--- /dev/null
+++ b/runtime/onert/backend/trix/trix.cc
@@ -0,0 +1,24 @@
+/*
+ * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
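BulkLayer above implements the exec::IFunction lifecycle: configure() binds tensors and registers the model once, prepare() is a pre-execution hook (a no-op here), and run() executes once per inference. A self-contained sketch of that contract — the interface below is illustrative, not the real exec::IFunction:

#include <iostream>

// Illustrative reduction of the IFunction contract.
struct IFunctionSketch
{
  virtual ~IFunctionSketch() = default;
  virtual void prepare() {} // optional pre-run setup
  virtual void run() = 0;   // called once per inference
};

struct EchoLayer : IFunctionSketch
{
  void run() override { std::cout << "inference step\n"; }
};

int main()
{
  EchoLayer layer; // configure(...) would bind tensors here
  layer.prepare(); // once, before the first run
  layer.run();     // per inference
  layer.run();
}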
+ */
+
+#include "Backend.h"
+
+extern "C" {
+
+onert::backend::Backend *onert_backend_create() { return new onert::backend::trix::Backend; }
+
+void onert_backend_destroy(onert::backend::Backend *backend) { delete backend; }
+}
diff --git a/runtime/onert/backend/xnnpack/ops/OperationUtils.h b/runtime/onert/backend/xnnpack/ops/OperationUtils.h
index 5102e32dd..fe93fccc0 100644
--- a/runtime/onert/backend/xnnpack/ops/OperationUtils.h
+++ b/runtime/onert/backend/xnnpack/ops/OperationUtils.h
@@ -17,10 +17,10 @@
 #ifndef __ONERT_BACKEND_XNNPACK_OPS_OPERATION_UTILS_H__
 #define __ONERT_BACKEND_XNNPACK_OPS_OPERATION_UTILS_H__

-// duplicated from cpu/ops/OperationUtils.h
+#include <ir/DataType.h>
 #include <ir/InternalType.h>
 #include <ir/Padding.h>
-#include <ir/DataType.h>
+#include <util/CalculateActivationRange.h>

 namespace onert
 {
@@ -32,40 +32,7 @@ namespace ops
 {

 using OperandType = ir::DataType;
-
-template <typename T>
-void CalculateActivationRange(ir::Activation activation, T *activation_min, T *activation_max)
-{
-  if (activation == ir::Activation::RELU)
-  {
-    *activation_min = 0;
-    *activation_max = std::numeric_limits<T>::max();
-  }
-  else if (activation == ir::Activation::RELU6)
-  {
-    *activation_min = 0;
-    *activation_max = 6;
-  }
-  else if (activation == ir::Activation::RELU1)
-  {
-    *activation_min = -1;
-    *activation_max = 1;
-  }
-  else if (activation == ir::Activation::SIGMOID)
-  {
-    *activation_min = 0;
-    *activation_max = 1;
-  }
-  else if (activation == ir::Activation::NONE)
-  {
-    *activation_min = std::numeric_limits<T>::lowest();
-    *activation_max = std::numeric_limits<T>::max();
-  }
-  else
-  {
-    throw std::runtime_error{"Unsupported fused activation function"};
-  }
-}
+using namespace onert::util; // CalculateActivationRange

 } // namespace ops
 } // namespace xnnpack
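trix.cc above exists so the backend can be built as a plugin: the extern "C" linkage keeps onert_backend_create and onert_backend_destroy unmangled, which is what lets a loader find them with dlsym. A sketch of how such a loader could consume them — the library name is an assumption, and onert's real loader lives elsewhere in the runtime:

#include <dlfcn.h>
#include <iostream>

int main()
{
  // Hypothetical library name for the trix backend plugin.
  void *handle = dlopen("libbackend_trix.so", RTLD_LAZY | RTLD_LOCAL);
  if (handle == nullptr)
  {
    std::cerr << "dlopen failed: " << dlerror() << "\n";
    return 1;
  }

  // C linkage keeps the symbol names unmangled, so dlsym can find them.
  using create_fn = void *(*)();
  using destroy_fn = void (*)(void *);
  auto create = reinterpret_cast<create_fn>(dlsym(handle, "onert_backend_create"));
  auto destroy = reinterpret_cast<destroy_fn>(dlsym(handle, "onert_backend_destroy"));
  if (create != nullptr && destroy != nullptr)
  {
    void *backend = create(); // really an onert::backend::Backend *
    destroy(backend);
  }

  dlclose(handle);
  return 0;
}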