author     Chunseok Lee <chunseok.lee@samsung.com>   2021-08-23 13:25:15 +0900
committer  Chunseok Lee <chunseok.lee@samsung.com>   2021-08-23 13:25:15 +0900
commit     f4cf19e579a19c5346ccb2aad55bfd251065e447 (patch)
tree       5d436b11f89be0e8a8289ea82b773da6402c1add /runtime/onert
parent     589bb1db6db6784efe21b3fbbfbfdb79aaa5f14e (diff)
download   nnfw-f4cf19e579a19c5346ccb2aad55bfd251065e447.tar.gz
           nnfw-f4cf19e579a19c5346ccb2aad55bfd251065e447.tar.bz2
           nnfw-f4cf19e579a19c5346ccb2aad55bfd251065e447.zip
Diffstat (limited to 'runtime/onert')
-rw-r--r--  runtime/onert/api/include/nnfw_experimental.h | 56
-rw-r--r--  runtime/onert/api/include/nnfw_version.h | 2
-rw-r--r--  runtime/onert/api/src/nnfw_api.cc | 25
-rw-r--r--  runtime/onert/api/src/nnfw_api_internal.cc | 244
-rw-r--r--  runtime/onert/api/src/nnfw_api_internal.h | 13
-rw-r--r--  runtime/onert/backend/CMakeLists.txt | 1
-rw-r--r--  runtime/onert/backend/cpu/KernelGenerator.cc | 2
-rw-r--r--  runtime/onert/backend/cpu/ops/ElementwiseBinaryLayer.cc | 31
-rw-r--r--  runtime/onert/backend/cpu/ops/ElementwiseBinaryLayer.h | 1
-rw-r--r--  runtime/onert/backend/cpu/ops/PoolLayer.cc | 6
-rw-r--r--  runtime/onert/backend/gpu_cl/Backend.h | 91
-rw-r--r--  runtime/onert/backend/gpu_cl/BackendContext.cc | 242
-rw-r--r--  runtime/onert/backend/gpu_cl/BackendContext.h | 66
-rw-r--r--  runtime/onert/backend/gpu_cl/CMakeLists.txt | 44
-rw-r--r--  runtime/onert/backend/gpu_cl/ClConstantInitializer.cc | 104
-rw-r--r--  runtime/onert/backend/gpu_cl/ClConstantInitializer.h | 140
-rw-r--r--  runtime/onert/backend/gpu_cl/ClFunction.h | 88
-rw-r--r--  runtime/onert/backend/gpu_cl/ClMemoryManager.h | 135
-rw-r--r--  runtime/onert/backend/gpu_cl/ClTensorBuilder.h | 289
-rw-r--r--  runtime/onert/backend/gpu_cl/ClTensorManager.h | 235
-rw-r--r--  runtime/onert/backend/gpu_cl/ClTensorRegistry.h | 55
-rw-r--r--  runtime/onert/backend/gpu_cl/Config.cc | 48
-rw-r--r--  runtime/onert/backend/gpu_cl/Config.h | 53
-rw-r--r--  runtime/onert/backend/gpu_cl/ConstantInitializer.cc | 35
-rw-r--r--  runtime/onert/backend/gpu_cl/ConstantInitializer.h | 43
-rw-r--r--  runtime/onert/backend/gpu_cl/KernelGenerator.cc | 593
-rw-r--r--  runtime/onert/backend/gpu_cl/KernelGenerator.h | 70
-rw-r--r--  runtime/onert/backend/gpu_cl/ParentInfo.h | 44
-rw-r--r--  runtime/onert/backend/gpu_cl/TensorBuilder.h | 38
-rw-r--r--  runtime/onert/backend/gpu_cl/TensorManager.h | 48
-rw-r--r--  runtime/onert/backend/gpu_cl/gpu_cl.cc | 33
-rw-r--r--  runtime/onert/backend/gpu_cl/open_cl/AccessType.h | 39
-rw-r--r--  runtime/onert/backend/gpu_cl/open_cl/Api.cc | 202
-rw-r--r--  runtime/onert/backend/gpu_cl/open_cl/Api.h | 359
-rw-r--r--  runtime/onert/backend/gpu_cl/open_cl/Arguments.cc | 926
-rw-r--r--  runtime/onert/backend/gpu_cl/open_cl/Arguments.h | 175
-rw-r--r--  runtime/onert/backend/gpu_cl/open_cl/Buffer.cc | 234
-rw-r--r--  runtime/onert/backend/gpu_cl/open_cl/Buffer.h | 121
-rw-r--r--  runtime/onert/backend/gpu_cl/open_cl/ClCommandQueue.cc | 359
-rw-r--r--  runtime/onert/backend/gpu_cl/open_cl/ClCommandQueue.h | 157
-rw-r--r--  runtime/onert/backend/gpu_cl/open_cl/ClContext.cc | 177
-rw-r--r--  runtime/onert/backend/gpu_cl/open_cl/ClContext.h | 68
-rw-r--r--  runtime/onert/backend/gpu_cl/open_cl/ClDevice.cc | 448
-rw-r--r--  runtime/onert/backend/gpu_cl/open_cl/ClDevice.h | 119
-rw-r--r--  runtime/onert/backend/gpu_cl/open_cl/ClErrors.h | 48
-rw-r--r--  runtime/onert/backend/gpu_cl/open_cl/ClEvent.cc | 88
-rw-r--r--  runtime/onert/backend/gpu_cl/open_cl/ClEvent.h | 75
-rw-r--r--  runtime/onert/backend/gpu_cl/open_cl/ClImageFormat.cc | 59
-rw-r--r--  runtime/onert/backend/gpu_cl/open_cl/ClImageFormat.h | 39
-rw-r--r--  runtime/onert/backend/gpu_cl/open_cl/ClKernel.cc | 171
-rw-r--r--  runtime/onert/backend/gpu_cl/open_cl/ClKernel.h | 101
-rw-r--r--  runtime/onert/backend/gpu_cl/open_cl/ClMemory.cc | 46
-rw-r--r--  runtime/onert/backend/gpu_cl/open_cl/ClMemory.h | 100
-rw-r--r--  runtime/onert/backend/gpu_cl/open_cl/ClProgram.cc | 224
-rw-r--r--  runtime/onert/backend/gpu_cl/open_cl/ClProgram.h | 98
-rw-r--r--  runtime/onert/backend/gpu_cl/open_cl/DataType.cc | 122
-rw-r--r--  runtime/onert/backend/gpu_cl/open_cl/DataType.h | 57
-rw-r--r--  runtime/onert/backend/gpu_cl/open_cl/DeviceInfo.cc | 383
-rw-r--r--  runtime/onert/backend/gpu_cl/open_cl/DeviceInfo.h | 203
-rw-r--r--  runtime/onert/backend/gpu_cl/open_cl/Environment.cc | 276
-rw-r--r--  runtime/onert/backend/gpu_cl/open_cl/Environment.h | 90
-rw-r--r--  runtime/onert/backend/gpu_cl/open_cl/GpuObject.cc | 44
-rw-r--r--  runtime/onert/backend/gpu_cl/open_cl/GpuObject.h | 222
-rw-r--r--  runtime/onert/backend/gpu_cl/open_cl/InferenceContext.cc | 71
-rw-r--r--  runtime/onert/backend/gpu_cl/open_cl/InferenceContext.h | 143
-rw-r--r--  runtime/onert/backend/gpu_cl/open_cl/InternalTensor.h | 106
-rw-r--r--  runtime/onert/backend/gpu_cl/open_cl/LinearStorage.cc | 265
-rw-r--r--  runtime/onert/backend/gpu_cl/open_cl/LinearStorage.h | 110
-rw-r--r--  runtime/onert/backend/gpu_cl/open_cl/Model.h | 56
-rw-r--r--  runtime/onert/backend/gpu_cl/open_cl/ModelHints.h | 67
-rw-r--r--  runtime/onert/backend/gpu_cl/open_cl/OpenclWrapper.cc | 407
-rw-r--r--  runtime/onert/backend/gpu_cl/open_cl/OpenclWrapper.h | 560
-rw-r--r--  runtime/onert/backend/gpu_cl/open_cl/Operations.cc | 704
-rw-r--r--  runtime/onert/backend/gpu_cl/open_cl/Operations.h | 586
-rw-r--r--  runtime/onert/backend/gpu_cl/open_cl/Precision.cc | 56
-rw-r--r--  runtime/onert/backend/gpu_cl/open_cl/Precision.h | 53
-rw-r--r--  runtime/onert/backend/gpu_cl/open_cl/ProgramCache.cc | 97
-rw-r--r--  runtime/onert/backend/gpu_cl/open_cl/ProgramCache.h | 112
-rw-r--r--  runtime/onert/backend/gpu_cl/open_cl/Shape.cc | 141
-rw-r--r--  runtime/onert/backend/gpu_cl/open_cl/Shape.h | 668
-rw-r--r--  runtime/onert/backend/gpu_cl/open_cl/Spi.h | 94
-rw-r--r--  runtime/onert/backend/gpu_cl/open_cl/Status.h | 29
-rw-r--r--  runtime/onert/backend/gpu_cl/open_cl/StorageTypeUtil.cc | 149
-rw-r--r--  runtime/onert/backend/gpu_cl/open_cl/StorageTypeUtil.h | 47
-rw-r--r--  runtime/onert/backend/gpu_cl/open_cl/Tensor.cc | 690
-rw-r--r--  runtime/onert/backend/gpu_cl/open_cl/Tensor.h | 142
-rw-r--r--  runtime/onert/backend/gpu_cl/open_cl/TensorType.cc | 1116
-rw-r--r--  runtime/onert/backend/gpu_cl/open_cl/TensorType.h | 188
-rw-r--r--  runtime/onert/backend/gpu_cl/open_cl/TensorTypeUtil.cc | 90
-rw-r--r--  runtime/onert/backend/gpu_cl/open_cl/TensorTypeUtil.h | 41
-rw-r--r--  runtime/onert/backend/gpu_cl/open_cl/Texture2d.cc | 237
-rw-r--r--  runtime/onert/backend/gpu_cl/open_cl/Texture2d.h | 160
-rw-r--r--  runtime/onert/backend/gpu_cl/open_cl/Types.h | 183
-rw-r--r--  runtime/onert/backend/gpu_cl/open_cl/Util.cc | 264
-rw-r--r--  runtime/onert/backend/gpu_cl/open_cl/Util.h | 278
-rw-r--r--  runtime/onert/backend/gpu_cl/open_cl/WinogradUtil.cc | 178
-rw-r--r--  runtime/onert/backend/gpu_cl/open_cl/WinogradUtil.h | 48
-rw-r--r--  runtime/onert/backend/gpu_cl/open_cl/WorkgroupSelection.cc | 258
-rw-r--r--  runtime/onert/backend/gpu_cl/open_cl/WorkgroupSelection.h | 59
-rw-r--r--  runtime/onert/backend/gpu_cl/open_cl/kernels/Add.cc | 64
-rw-r--r--  runtime/onert/backend/gpu_cl/open_cl/kernels/Add.h | 43
-rw-r--r--  runtime/onert/backend/gpu_cl/open_cl/kernels/ConvBuffer1x1.cc | 480
-rw-r--r--  runtime/onert/backend/gpu_cl/open_cl/kernels/ConvBuffer1x1.h | 205
-rw-r--r--  runtime/onert/backend/gpu_cl/open_cl/kernels/ConvCommon.h | 44
-rw-r--r--  runtime/onert/backend/gpu_cl/open_cl/kernels/ConvConstants.cc | 282
-rw-r--r--  runtime/onert/backend/gpu_cl/open_cl/kernels/ConvConstants.h | 137
-rw-r--r--  runtime/onert/backend/gpu_cl/open_cl/kernels/ConvPowervr.cc | 1653
-rw-r--r--  runtime/onert/backend/gpu_cl/open_cl/kernels/ConvPowervr.h | 413
-rw-r--r--  runtime/onert/backend/gpu_cl/open_cl/kernels/ConvWeightsConverter.cc | 143
-rw-r--r--  runtime/onert/backend/gpu_cl/open_cl/kernels/ConvWeightsConverter.h | 68
-rw-r--r--  runtime/onert/backend/gpu_cl/open_cl/kernels/Converter.cc | 592
-rw-r--r--  runtime/onert/backend/gpu_cl/open_cl/kernels/Converter.h | 40
-rw-r--r--  runtime/onert/backend/gpu_cl/open_cl/kernels/DepthwiseConv.cc | 382
-rw-r--r--  runtime/onert/backend/gpu_cl/open_cl/kernels/DepthwiseConv.h | 233
-rw-r--r--  runtime/onert/backend/gpu_cl/open_cl/kernels/DepthwiseConv3x3.cc | 358
-rw-r--r--  runtime/onert/backend/gpu_cl/open_cl/kernels/DepthwiseConv3x3.h | 177
-rw-r--r--  runtime/onert/backend/gpu_cl/open_cl/kernels/GpuOperation.cc | 385
-rw-r--r--  runtime/onert/backend/gpu_cl/open_cl/kernels/GpuOperation.h | 203
-rw-r--r--  runtime/onert/backend/gpu_cl/open_cl/kernels/Pooling.cc | 400
-rw-r--r--  runtime/onert/backend/gpu_cl/open_cl/kernels/Pooling.h | 43
-rw-r--r--  runtime/onert/backend/gpu_cl/open_cl/kernels/Relu.cc | 80
-rw-r--r--  runtime/onert/backend/gpu_cl/open_cl/kernels/Relu.h | 40
-rw-r--r--  runtime/onert/backend/gpu_cl/open_cl/kernels/Reshape.cc | 111
-rw-r--r--  runtime/onert/backend/gpu_cl/open_cl/kernels/Reshape.h | 41
-rw-r--r--  runtime/onert/backend/gpu_cl/open_cl/kernels/Reshapex4.cc | 96
-rw-r--r--  runtime/onert/backend/gpu_cl/open_cl/kernels/Reshapex4.h | 42
-rw-r--r--  runtime/onert/backend/gpu_cl/open_cl/kernels/Softmax.cc | 86
-rw-r--r--  runtime/onert/backend/gpu_cl/open_cl/kernels/Softmax.h | 40
-rw-r--r--  runtime/onert/backend/gpu_cl/open_cl/kernels/Softmax1x1.cc | 138
-rw-r--r--  runtime/onert/backend/gpu_cl/open_cl/kernels/Softmax1x1.h | 61
-rw-r--r--  runtime/onert/backend/gpu_cl/open_cl/kernels/TuningParameters.h | 48
-rw-r--r--  runtime/onert/backend/gpu_cl/open_cl/kernels/Util.cc | 230
-rw-r--r--  runtime/onert/backend/gpu_cl/open_cl/kernels/Util.h | 73
-rw-r--r--  runtime/onert/backend/gpu_cl/open_cl/kernels/WorkGroupPicking.cc | 348
-rw-r--r--  runtime/onert/backend/gpu_cl/open_cl/kernels/WorkGroupPicking.h | 67
-rw-r--r--  runtime/onert/backend/gpu_cl/open_cl/selectors/ConvolutionSelector.cc | 249
-rw-r--r--  runtime/onert/backend/gpu_cl/open_cl/selectors/ConvolutionSelector.h | 62
-rw-r--r--  runtime/onert/backend/gpu_cl/open_cl/selectors/DwConvolutionSelector.cc | 107
-rw-r--r--  runtime/onert/backend/gpu_cl/open_cl/selectors/DwConvolutionSelector.h | 42
-rw-r--r--  runtime/onert/backend/gpu_cl/open_cl/selectors/SimpleSelectors.cc | 98
-rw-r--r--  runtime/onert/backend/gpu_cl/open_cl/selectors/SimpleSelectors.h | 57
-rw-r--r--  runtime/onert/backend/gpu_cl/operand/CLTensor.cc | 47
-rw-r--r--  runtime/onert/backend/gpu_cl/operand/CLTensor.h | 66
-rw-r--r--  runtime/onert/backend/gpu_cl/operand/ICLTensor.cc | 223
-rw-r--r--  runtime/onert/backend/gpu_cl/operand/ICLTensor.h | 106
-rw-r--r--  runtime/onert/core/include/compiler/Compiler.h | 26
-rw-r--r--  runtime/onert/core/include/compiler/LoweredGraph.h | 5
-rw-r--r--  runtime/onert/core/include/exec/Execution.h | 107
-rw-r--r--  runtime/onert/core/include/exec/IExecutor.h | 7
-rw-r--r--  runtime/onert/core/include/exec/IODescription.h | 1
-rw-r--r--  runtime/onert/core/include/ir/Graph.h | 36
-rw-r--r--  runtime/onert/core/include/ir/OperandIndexSequence.h | 7
-rw-r--r--  runtime/onert/core/include/ir/operation/ElementwiseBinary.h | 1
-rw-r--r--  runtime/onert/core/include/util/Config.lst | 2
-rw-r--r--  runtime/onert/core/src/compiler/Compiler.cc | 400
-rw-r--r--  runtime/onert/core/src/compiler/LoweredGraph.cc | 79
-rw-r--r--  runtime/onert/core/src/exec/Execution.cc | 237
-rw-r--r--  runtime/onert/core/src/exec/ExecutorBase.cc | 4
-rw-r--r--  runtime/onert/core/src/exec/ExecutorBase.h | 3
-rw-r--r--  runtime/onert/core/src/exec/LinearExecutor.cc | 52
-rw-r--r--  runtime/onert/core/src/interp/InterpExecutor.h | 5
-rw-r--r--  runtime/onert/core/src/ir/operation/ElementwiseBinary.cc | 1
-rw-r--r--  runtime/onert/frontend/base_loader/include/base_loader.h | 11
-rw-r--r--  runtime/onert/frontend/circle/src/circle_loader.cc | 4
-rw-r--r--  runtime/onert/frontend/tflite/src/tflite_loader.cc | 5
165 files changed, 27014 insertions, 40 deletions
diff --git a/runtime/onert/api/include/nnfw_experimental.h b/runtime/onert/api/include/nnfw_experimental.h
index 94f781988..b20447e9e 100644
--- a/runtime/onert/api/include/nnfw_experimental.h
+++ b/runtime/onert/api/include/nnfw_experimental.h
@@ -96,4 +96,60 @@ NNFW_STATUS nnfw_input_tensorindex(nnfw_session *session, const char *tensorname
*/
NNFW_STATUS nnfw_output_tensorindex(nnfw_session *session, const char *tensorname, uint32_t *index);
+/**
+ * @brief Set the backend for each operation in the session
+ *
+ * This function assigns backends (acl_cl, acl_neon, cpu) to each operation in the session.
+ * If successful, the function returns @c NNFW_STATUS_NO_ERROR. Otherwise, the function returns
+ * @c NNFW_STATUS_ERROR.
+ *
+ * @note The argument specifying backends must be in the format
+ * "OP_BACKEND_MAP=\"0=acl_cl;1=cpu;2=acl_cl\"".
+ *
+ * @param[in] session the session object
+ * @param[in] backend_settings String containing backend assignments indexed by operation sequence
+ * @return @c NNFW_STATUS_NO_ERROR if successful
+ */
+NNFW_STATUS nnfw_set_backends_per_operation(nnfw_session *session, const char *backend_settings);
+
+/**
+ * @brief Prepare session to be ready for inference
+ *
+ * This phase may finalize model compilation, scheduling, and additional settings.
+ *
+ * @param[in] session the session to be prepared
+ * @return @c NNFW_STATUS_NO_ERROR if successful
+ */
+NNFW_STATUS nnfw_prepare_pipeline(nnfw_session *session, const char *map_file_path = nullptr);
+
+/**
+ * @brief Set input buffer
+ *
+ * This function must be called after {@link nnfw_prepare_pipeline}. The \p inputs given to this
+ * function can be reused for many inferences. Each entry of \p lengths must be greater than or
+ * equal to what the corresponding operand requires. If you pass empty \p inputs to this function,
+ * it will join all pipeline threads.
+ *
+ * @param[in] session Session to which the input is to be set
+ * @param[in] inputs Raw buffers for input; it must be a pointer to \p std::vector<void *> for a
+ * multiple-input model
+ * @param[in] lengths Sizes in bytes of the input buffers; it must be a pointer to
+ * \p std::vector<uint32_t> for a multiple-input model
+ *
+ * @return @c NNFW_STATUS_NO_ERROR if successful
+ */
+NNFW_STATUS nnfw_push_pipeline_input(nnfw_session *session, void *inputs, void *lengths);
+
+/**
+ * @brief Get last outputs of partitioned model in session
+ *
+ * This function must be called after {@link nnfw_prepare_pipeline}. The \p outputs filled in by
+ * this function must be cleared by the caller for memory management.
+ *
+ * @param[in] session Session from which the last outputs are to be extracted
+ * @param[out] outputs Raw buffer for outputs; it must be a pointer to \p std::vector<void *> for a
+ * multiple-output model
+ *
+ * @return @c NNFW_STATUS_NO_ERROR if successful
+ */
+NNFW_STATUS nnfw_pop_pipeline_output(nnfw_session *session, void *outputs);
+
#endif // __NNFW_EXPERIMENTAL_H__
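For orientation, here is a minimal caller-side sketch of how the pipeline entry points added above fit together, based only on the signatures and comments in this header. The session helpers (nnfw_create_session, nnfw_load_model_from_file, nnfw_close_session) are the existing ones from nnfw.h; the model path, map file path, backend string, and buffer contents are placeholders.

#include <vector>
#include "nnfw.h"
#include "nnfw_experimental.h"

void pipeline_example()
{
  nnfw_session *session = nullptr;
  nnfw_create_session(&session);
  nnfw_load_model_from_file(session, "path/to/model.nnpackage"); // placeholder path

  // Assign a backend to each operation; string follows the format in the @note above
  // (exact accepted form may vary).
  nnfw_set_backends_per_operation(session, "0=acl_cl;1=cpu;2=acl_cl");

  // Compile the partitioned model and start one worker thread per partition.
  nnfw_prepare_pipeline(session, "path/to/partition_map.json"); // placeholder map file

  // Buffers are handed over as pointers to std::vector, cast to void * by the C API.
  std::vector<void *> inputs{/* raw input buffers */};
  std::vector<uint32_t> lengths{/* byte size of each input buffer */};
  nnfw_push_pipeline_input(session, &inputs, &lengths);

  std::vector<void *> outputs;
  nnfw_pop_pipeline_output(session, &outputs); // waits until a result is available

  // An empty input vector signals the end of the stream and joins the pipeline threads.
  std::vector<void *> end_of_stream;
  std::vector<uint32_t> no_lengths;
  nnfw_push_pipeline_input(session, &end_of_stream, &no_lengths);

  nnfw_close_session(session);
}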
diff --git a/runtime/onert/api/include/nnfw_version.h b/runtime/onert/api/include/nnfw_version.h
index 1210e274f..6624ae676 100644
--- a/runtime/onert/api/include/nnfw_version.h
+++ b/runtime/onert/api/include/nnfw_version.h
@@ -21,6 +21,6 @@
* NNFW_VERSION is a uint32 value representing nnfw runtime version
* in 0xMMmmmmPP, where MM = major, mmmm = minor, PP = patch
*/
-#define NNFW_VERSION 0x01000f00
+#define NNFW_VERSION 0x01001100
#endif // __NNFW_VERSION_H__
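A worked decoding of the bumped value, following the 0xMMmmmmPP layout described in the comment above (the helper constants below are illustrative, not part of the patch):

#include <cstdint>
#include "nnfw_version.h"

// 0x01000f00 -> MM = 0x01, mmmm = 0x000f, PP = 0x00  => 1.15.0 (before this change)
// 0x01001100 -> MM = 0x01, mmmm = 0x0011, PP = 0x00  => 1.17.0 (after this change)
constexpr uint32_t kMajor = (NNFW_VERSION >> 24) & 0xff;   // 1
constexpr uint32_t kMinor = (NNFW_VERSION >> 8)  & 0xffff; // 17
constexpr uint32_t kPatch =  NNFW_VERSION        & 0xff;   // 0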
diff --git a/runtime/onert/api/src/nnfw_api.cc b/runtime/onert/api/src/nnfw_api.cc
index 4eba4ecec..b69dd83e4 100644
--- a/runtime/onert/api/src/nnfw_api.cc
+++ b/runtime/onert/api/src/nnfw_api.cc
@@ -367,3 +367,28 @@ NNFW_STATUS nnfw_output_tensorindex(nnfw_session *session, const char *tensornam
NNFW_RETURN_ERROR_IF_NULL(session);
return session->output_tensorindex(tensorname, index);
}
+
+NNFW_STATUS nnfw_set_backends_per_operation(nnfw_session *session, const char *backend_settings)
+{
+ NNFW_RETURN_ERROR_IF_NULL(session);
+ return session->set_backends_per_operation(backend_settings);
+}
+
+NNFW_STATUS nnfw_prepare_pipeline(nnfw_session *session, const char *map_file_path)
+{
+ NNFW_RETURN_ERROR_IF_NULL(session);
+ return session->prepare_pipeline(map_file_path);
+}
+
+NNFW_STATUS nnfw_push_pipeline_input(nnfw_session *session, void *inputs, void *lengths)
+{
+ NNFW_RETURN_ERROR_IF_NULL(session);
+ return session->push_pipeline_input((std::vector<void *> *)inputs,
+ (std::vector<uint32_t> *)lengths);
+}
+
+NNFW_STATUS nnfw_pop_pipeline_output(nnfw_session *session, void *outputs)
+{
+ NNFW_RETURN_ERROR_IF_NULL(session);
+ return session->pop_pipeline_output((std::vector<void *> *)outputs);
+}
diff --git a/runtime/onert/api/src/nnfw_api_internal.cc b/runtime/onert/api/src/nnfw_api_internal.cc
index 316bafb52..1a3aaf9e9 100644
--- a/runtime/onert/api/src/nnfw_api_internal.cc
+++ b/runtime/onert/api/src/nnfw_api_internal.cc
@@ -158,7 +158,7 @@ void setConfigKeyValues(const CfgKeyValues &keyValues)
} // namespace
nnfw_session::nnfw_session()
- : _subgraphs{nullptr}, _execution{nullptr},
+ : _subgraphs{nullptr}, _compiler{nullptr}, _execution{nullptr},
_kernel_registry{std::make_shared<onert::api::CustomKernelRegistry>()}, _tracing_ctx{nullptr}
{
// DO NOTHING
@@ -277,6 +277,7 @@ NNFW_STATUS nnfw_session::load_model_from_nnpackage(const char *package_dir)
std::string manifest_file_name = package_path + "/metadata/MANIFEST";
std::ifstream mfs(manifest_file_name);
+ _package_file_path = package_path;
// extract the filename of the first(index 0) model
// e.g. In MANIFEST file, { "models" : [ "firstmodel.tflite", "2nd.tflite" ] }
Json::Value root;
@@ -361,6 +362,51 @@ NNFW_STATUS nnfw_session::prepare()
return NNFW_STATUS_NO_ERROR;
}
+NNFW_STATUS nnfw_session::prepare_pipeline(const char *map_file_path)
+{
+ // NOTE. If users want to run prepare_pipeline() more than one time, this could be removed.
+ if (!isStateModelLoaded())
+ {
+ std::cerr << "Error during model prepare pipeline : ";
+ if (isStateInitialized())
+ {
+ std::cerr << "prepare_pipeline should be run once";
+ }
+ else
+ {
+ std::cerr << "invalid state";
+ }
+ std::cerr << std::endl;
+ return NNFW_STATUS_INVALID_STATE;
+ }
+
+ try
+ {
+ _subgraphs.reset();
+ std::vector<std::shared_ptr<onert::exec::ExecutorMap>> executor_maps =
+ _compiler->compile(_package_file_path.c_str(), map_file_path);
+
+ for (auto it = executor_maps.begin(); it != executor_maps.end(); ++it)
+ {
+ _executions.push_back(std::make_shared<onert::exec::Execution>(*it));
+ }
+ make_dependency();
+ _threads.resize(_executions.size());
+ for (uint32_t i = 0; i < _threads.size(); i++)
+ {
+ _threads[i] = std::thread(&onert::exec::Execution::runInference, _executions[i].get());
+ }
+ }
+ catch (const std::exception &e)
+ {
+ std::cerr << "Error during model prepare : " << e.what() << std::endl;
+ return NNFW_STATUS_ERROR;
+ }
+
+ _state = State::PREPARED;
+ return NNFW_STATUS_NO_ERROR;
+}
+
NNFW_STATUS nnfw_session::run()
{
if (!isStatePreparedOrFinishedRun())
@@ -370,6 +416,12 @@ NNFW_STATUS nnfw_session::run()
return NNFW_STATUS_INVALID_STATE;
}
+ if (!_executions.empty())
+ {
+ std::cerr << "Error during nnfw_session::run : not supported for pipeline run" << std::endl;
+ return NNFW_STATUS_ERROR;
+ }
+
try
{
_execution->execute();
@@ -399,6 +451,13 @@ NNFW_STATUS nnfw_session::run_async()
return NNFW_STATUS_INVALID_STATE;
}
+ if (!_executions.empty())
+ {
+ std::cerr << "Error during nnfw_session::run_async : not supported for pipeline run"
+ << std::endl;
+ return NNFW_STATUS_ERROR;
+ }
+
_execution->startExecute();
_state = State::RUNNING;
@@ -414,6 +473,12 @@ NNFW_STATUS nnfw_session::await()
return NNFW_STATUS_ERROR;
}
+ if (!_executions.empty())
+ {
+ std::cerr << "Error during nnfw_session::await : not supported for pipeline run" << std::endl;
+ return NNFW_STATUS_ERROR;
+ }
+
_execution->waitFinish();
_state = State::FINISHED_RUN;
@@ -437,6 +502,13 @@ NNFW_STATUS nnfw_session::set_input(uint32_t index, NNFW_TYPE /*type*/, const vo
return NNFW_STATUS_ERROR;
}
+ if (!_executions.empty())
+ {
+ std::cerr << "Error during nnfw_session::set_input : not supported for pipeline run"
+ << std::endl;
+ return NNFW_STATUS_ERROR;
+ }
+
try
{
_execution->setInput(onert::ir::IOIndex(index), buffer, length);
@@ -466,6 +538,13 @@ NNFW_STATUS nnfw_session::set_output(uint32_t index, NNFW_TYPE /*type*/, void *b
return NNFW_STATUS_ERROR;
}
+ if (!_executions.empty())
+ {
+ std::cerr << "Error during nnfw_session::set_output : not supported for pipeline run"
+ << std::endl;
+ return NNFW_STATUS_ERROR;
+ }
+
try
{
_execution->setOutput(onert::ir::IOIndex(index), buffer, length);
@@ -532,7 +611,14 @@ NNFW_STATUS nnfw_session::set_input_layout(uint32_t index, NNFW_LAYOUT layout)
std::cerr << "Error during nnfw_session::set_input_layout, not supported layout" << std::endl;
return NNFW_STATUS_ERROR;
}
- _execution->setInputLayout(onert::ir::IOIndex(index), convertLayout(layout));
+ if (_execution)
+ {
+ _execution->setInputLayout(onert::ir::IOIndex(index), convertLayout(layout));
+ }
+ else
+ {
+ _executions.at(0)->setInputLayout(onert::ir::IOIndex(index), convertLayout(layout));
+ }
}
catch (const std::exception &e)
{
@@ -553,7 +639,15 @@ NNFW_STATUS nnfw_session::set_output_layout(uint32_t index, NNFW_LAYOUT layout)
<< std::endl;
return NNFW_STATUS_ERROR;
}
- _execution->setOutputLayout(onert::ir::IOIndex(index), convertLayout(layout));
+ if (_execution)
+ {
+ _execution->setOutputLayout(onert::ir::IOIndex(index), convertLayout(layout));
+ }
+ else
+ {
+ _executions.at(_executions.size() - 1)
+ ->setOutputLayout(onert::ir::IOIndex(index), convertLayout(layout));
+ }
}
catch (const std::exception &e)
{
@@ -633,7 +727,14 @@ NNFW_STATUS nnfw_session::apply_tensorinfo(uint32_t index, nnfw_tensorinfo ti)
}
else // when called after nnfw_session::prepare()
{
- _execution->changeInputShape(onert::ir::IOIndex(index), new_shape);
+ if (_execution)
+ {
+ _execution->changeInputShape(onert::ir::IOIndex(index), new_shape);
+ }
+ else
+ {
+ _executions.at(0)->changeInputShape(onert::ir::IOIndex(index), new_shape);
+ }
}
return NNFW_STATUS_NO_ERROR;
@@ -667,7 +768,17 @@ NNFW_STATUS nnfw_session::input_tensorinfo(uint32_t index, nnfw_tensorinfo *ti)
auto opidx = primary_subgraph()->getInputs().at(index);
auto shape = primary_subgraph()->operands().at(opidx).shape();
if (isStatePreparedOrFinishedRun())
- shape = _execution->getInputShape(onert::ir::IOIndex{index});
+ {
+ if (_execution)
+ {
+ shape = _execution->getInputShape(onert::ir::IOIndex{index});
+ }
+ else
+ {
+ shape = _executions.at(0)->getInputShape(onert::ir::IOIndex{index});
+ }
+ }
+
ti->rank = shape.rank();
for (int j = 0; j < ti->rank; ++j)
{
@@ -708,7 +819,16 @@ NNFW_STATUS nnfw_session::output_tensorinfo(uint32_t index, nnfw_tensorinfo *ti)
auto shape = primary_subgraph()->operands().at(opidx).shape();
// If it is called after `nnfw_run` then get the shape from Execution, not from the graph
if (isStateFinishedRun())
- shape = _execution->getOutputShape(onert::ir::IOIndex{index});
+ {
+ if (_execution)
+ {
+ shape = _execution->getOutputShape(onert::ir::IOIndex{index});
+ }
+ else
+ {
+ shape = _executions.at(_executions.size() - 1)->getOutputShape(onert::ir::IOIndex{index});
+ }
+ }
ti->rank = shape.rank();
for (int j = 0; j < ti->rank; ++j)
{
@@ -724,6 +844,89 @@ NNFW_STATUS nnfw_session::output_tensorinfo(uint32_t index, nnfw_tensorinfo *ti)
return NNFW_STATUS_NO_ERROR;
}
+
+void nnfw_session::make_dependency()
+{
+ for (uint32_t out_exe = 0; out_exe < _executions.size(); out_exe++)
+ {
+ auto out_graph = _executions[out_exe]->primary_subgraph();
+ for (uint32_t in_exe = 0; in_exe < _executions.size(); in_exe++)
+ {
+ if (out_exe == in_exe)
+ continue;
+ auto in_graph = _executions[in_exe]->primary_subgraph();
+ for (auto out = out_graph._name_to_output_begin(); out != out_graph._name_to_output_end();
+ out++)
+ {
+ auto out_opidx = out_graph.getOutputs().at(out->second);
+ auto out_shape = out_graph.operands().at(out_opidx).shape();
+ for (auto in = in_graph._name_to_input_begin(); in != in_graph._name_to_input_end(); in++)
+ {
+ if (out->first != in->first)
+ continue;
+
+ auto in_opidx = in_graph.getInputs().at(in->second);
+ auto in_shape = in_graph.operands().at(in_opidx).shape();
+ if (out_shape.rank() != in_shape.rank())
+ continue;
+
+ bool is_same = true;
+ for (int32_t i = 0; i < out_shape.rank(); i++)
+ {
+ if (out_shape.dim(i) != in_shape.dim(i))
+ {
+ is_same = false;
+ break;
+ }
+ }
+
+ if (is_same)
+ _executions[out_exe]->pushNextExe(_executions[in_exe], out->second, in->second);
+ }
+ }
+ }
+ }
+}
+
+NNFW_STATUS nnfw_session::push_pipeline_input(std::vector<void *> *inputs,
+ std::vector<uint32_t> *lengths)
+{
+ static uint32_t count = 0;
+ if (inputs->empty())
+ {
+ _executions[0]->setFinish();
+ for (uint32_t i = 0; i < _threads.size(); i++)
+ {
+ _threads[i].join();
+ }
+ return NNFW_STATUS_NO_ERROR;
+ }
+ _executions[0]->asyncIoDescSemWait();
+ _executions[0]->createNewAsyncDesc(count++);
+ for (uint32_t i = 0; i < inputs->size(); i++)
+ {
+ _executions[0]->executeAsyncInput(onert::ir::IOIndex(i), inputs->at(i), lengths->at(i));
+ }
+ _executions[0]->asyncIoDescSemPost();
+ return NNFW_STATUS_NO_ERROR;
+}
+
+NNFW_STATUS nnfw_session::pop_pipeline_output(std::vector<void *> *outputs)
+{
+ auto results = _executions[_executions.size() - 1]->getAsyncResults();
+ while (results->empty())
+ {
+ if (_executions[_executions.size() - 1]->stopWait())
+ return NNFW_STATUS_ERROR;
+ }
+
+ auto result = results->front();
+ results->pop_front();
+ for (uint32_t i = 0; i < result.size(); i++)
+ outputs->push_back(result[i]);
+ return NNFW_STATUS_NO_ERROR;
+}
+
NNFW_STATUS nnfw_session::register_custom_operation(const std::string &id,
nnfw_custom_eval eval_func)
{
@@ -864,14 +1067,19 @@ const onert::ir::Graph *nnfw_session::primary_subgraph()
{
if (_subgraphs)
{
- assert(!_execution);
+ assert(!_execution && _executions.empty());
return _subgraphs->primary().get();
}
else
{
- assert(_execution);
+ assert(_execution || !_executions.empty());
// TODO Remove const_cast
// We assumed the graph will not change after compilation, but shape could change
+ if (!_executions.empty())
+ {
+ return &_executions[0]->primary_parentgraph();
+ }
+
return &_execution->primary_subgraph();
}
}
@@ -930,7 +1138,7 @@ bool nnfw_session::isStateInitialized()
{
assert(!_subgraphs);
assert(!_compiler);
- assert(!_execution);
+ assert(!_execution && _executions.empty());
return true;
}
else
@@ -945,7 +1153,7 @@ bool nnfw_session::isStateModelLoaded()
{
assert(_subgraphs);
assert(_compiler);
- assert(!_execution);
+ assert(!_execution && _executions.empty());
return true;
}
else
@@ -960,7 +1168,7 @@ bool nnfw_session::isStatePrepared()
{
assert(!_subgraphs);
assert(_compiler);
- assert(_execution);
+ assert(_execution || !_executions.empty());
return true;
}
else
@@ -975,7 +1183,7 @@ bool nnfw_session::isStateRunning()
{
assert(!_subgraphs);
assert(_compiler);
- assert(_execution);
+ assert(_execution || !_executions.empty());
return true;
}
return false;
@@ -987,7 +1195,7 @@ bool nnfw_session::isStateFinishedRun()
{
assert(!_subgraphs);
assert(_compiler);
- assert(_execution);
+ assert(_execution || !_executions.empty());
return true;
}
else
@@ -1010,3 +1218,13 @@ NNFW_STATUS nnfw_session::output_tensorindex(const char *tensorname, uint32_t *i
{
return getTensorIndexImpl(*primary_subgraph(), tensorname, index, false);
}
+
+NNFW_STATUS nnfw_session::set_backends_per_operation(const char *backend_settings)
+{
+ if (backend_settings == NULL)
+ {
+ return NNFW_STATUS_ERROR;
+ }
+ _compiler->set_backend_from_str(backend_settings);
+ return NNFW_STATUS_NO_ERROR;
+}
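To make the chaining rule in make_dependency() above explicit: two executions are connected when an output tensor of one and an input tensor of another have the same name and the same shape. A small standalone illustration of that predicate (not onert code; the struct is a hypothetical stand-in for an operand's name and shape):

#include <cstdint>
#include <string>
#include <vector>

struct TensorDesc // hypothetical stand-in for an operand's name and shape
{
  std::string name;
  std::vector<int32_t> dims;
};

// Mirrors the name comparison plus the rank/dim comparison done in make_dependency().
bool connects(const TensorDesc &producer_output, const TensorDesc &consumer_input)
{
  return producer_output.name == consumer_input.name && producer_output.dims == consumer_input.dims;
}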
diff --git a/runtime/onert/api/src/nnfw_api_internal.h b/runtime/onert/api/src/nnfw_api_internal.h
index b13962907..6d75d894f 100644
--- a/runtime/onert/api/src/nnfw_api_internal.h
+++ b/runtime/onert/api/src/nnfw_api_internal.h
@@ -25,6 +25,8 @@
#include <string>
#include <memory>
+#include <thread>
+#include <vector>
namespace onert
{
@@ -100,6 +102,7 @@ public:
NNFW_STATUS load_model_from_nnpackage(const char *package_file_path);
NNFW_STATUS prepare();
+ NNFW_STATUS prepare_pipeline(const char *map_file_path);
NNFW_STATUS run();
NNFW_STATUS run_async();
@@ -123,6 +126,9 @@ public:
NNFW_STATUS set_available_backends(const char *backends);
NNFW_STATUS set_op_backend(const char *op, const char *backend);
+ // accessor
+ std::vector<std::shared_ptr<onert::exec::Execution>> *get_executions() { return &_executions; }
+
//
// Internal-only API
//
@@ -135,10 +141,14 @@ public:
//
// Experimental API
//
+ void make_dependency();
+ NNFW_STATUS push_pipeline_input(std::vector<void *> *inputs, std::vector<uint32_t> *lengths);
+ NNFW_STATUS pop_pipeline_output(std::vector<void *> *outputs);
NNFW_STATUS register_custom_operation(const std::string &id, nnfw_custom_eval eval_func);
NNFW_STATUS input_tensorindex(const char *tensorname, uint32_t *index);
NNFW_STATUS output_tensorindex(const char *tensorname, uint32_t *index);
+ NNFW_STATUS set_backends_per_operation(const char *backend_settings);
private:
const onert::ir::Graph *primary_subgraph();
@@ -155,6 +165,9 @@ private:
std::unique_ptr<onert::compiler::Compiler> _compiler;
std::unique_ptr<onert::exec::Execution> _execution;
std::shared_ptr<onert::api::CustomKernelRegistry> _kernel_registry;
+ std::vector<std::thread> _threads;
+ std::vector<std::shared_ptr<onert::exec::Execution>> _executions;
+ std::string _package_file_path;
std::unique_ptr<onert::util::TracingCtx> _tracing_ctx;
};
diff --git a/runtime/onert/backend/CMakeLists.txt b/runtime/onert/backend/CMakeLists.txt
index dc038c975..4b21e0ace 100644
--- a/runtime/onert/backend/CMakeLists.txt
+++ b/runtime/onert/backend/CMakeLists.txt
@@ -5,4 +5,5 @@ add_subdirectory(acl_cl)
add_subdirectory(acl_neon)
add_subdirectory(acl_common)
add_subdirectory(ruy)
+add_subdirectory(gpu_cl)
add_subdirectory(xnnpack)
diff --git a/runtime/onert/backend/cpu/KernelGenerator.cc b/runtime/onert/backend/cpu/KernelGenerator.cc
index d5096ff09..59fb68d55 100644
--- a/runtime/onert/backend/cpu/KernelGenerator.cc
+++ b/runtime/onert/backend/cpu/KernelGenerator.cc
@@ -130,6 +130,8 @@ convertElementwiseBinaryType(ir::operation::ElementwiseBinary::ElementwiseBinary
{
switch (type_ir)
{
+ case ir::operation::ElementwiseBinary::ElementwiseBinaryType::FLOOR_DIV:
+ return ops::ElementwiseBinaryType::kFloorDiv;
case ir::operation::ElementwiseBinary::ElementwiseBinaryType::LOGICAL_AND:
return ops::ElementwiseBinaryType::kLogicalAnd;
case ir::operation::ElementwiseBinary::ElementwiseBinaryType::LOGICAL_OR:
diff --git a/runtime/onert/backend/cpu/ops/ElementwiseBinaryLayer.cc b/runtime/onert/backend/cpu/ops/ElementwiseBinaryLayer.cc
index 1704c7cc6..391bf512c 100644
--- a/runtime/onert/backend/cpu/ops/ElementwiseBinaryLayer.cc
+++ b/runtime/onert/backend/cpu/ops/ElementwiseBinaryLayer.cc
@@ -18,6 +18,7 @@
#include "OperationUtils.h"
+#include <cker/operation/FloorDiv.h>
#include <cker/operation/LogicalAnd.h>
#include <cker/operation/LogicalOr.h>
#include <cker/operation/MaxMin.h>
@@ -34,6 +35,22 @@ namespace ops
namespace
{
template <typename T>
+void FloorDivGeneric(const IPortableTensor *lhs, const IPortableTensor *rhs,
+ IPortableTensor *output)
+{
+ if (!HaveSameShapes(lhs, rhs))
+ {
+ nnfw::cker::FloorDivBroadcast<T>(getShape(lhs), getBuffer<T>(lhs), getShape(rhs),
+ getBuffer<T>(rhs), getShape(output), getBuffer<T>(output));
+ }
+ else
+ {
+ nnfw::cker::FloorDivElementwise<T>(getShape(lhs), getBuffer<T>(lhs), getBuffer<T>(rhs),
+ getBuffer<T>(output));
+ }
+}
+
+template <typename T>
void logicalAndGeneric(const IPortableTensor *lhs, const IPortableTensor *rhs,
IPortableTensor *output)
{
@@ -101,6 +118,20 @@ void ElementwiseBinaryLayer::configure(const IPortableTensor *lhs, const IPortab
switch (op_type)
{
+ case ElementwiseBinaryType::kFloorDiv:
+ if (_lhs->data_type() == OperandType::FLOAT32)
+ {
+ _kernel = FloorDivGeneric<float>;
+ }
+ else if (_lhs->data_type() == OperandType::INT32)
+ {
+ _kernel = FloorDivGeneric<int32_t>;
+ }
+ else
+ {
+ throw std::runtime_error{"Max: unsupported data type"};
+ }
+ break;
case ElementwiseBinaryType::kLogicalAnd:
if ((_lhs->data_type() == OperandType::BOOL8) && (_rhs->data_type() == OperandType::BOOL8))
{
diff --git a/runtime/onert/backend/cpu/ops/ElementwiseBinaryLayer.h b/runtime/onert/backend/cpu/ops/ElementwiseBinaryLayer.h
index 052747a4c..af3bb63c7 100644
--- a/runtime/onert/backend/cpu/ops/ElementwiseBinaryLayer.h
+++ b/runtime/onert/backend/cpu/ops/ElementwiseBinaryLayer.h
@@ -32,6 +32,7 @@ namespace ops
enum class ElementwiseBinaryType
{
+ kFloorDiv,
kLogicalAnd,
kLogicalOr,
kMax,
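A quick note on the FLOOR_DIV support wired up above: floor division rounds the quotient toward negative infinity, which differs from C/C++ truncating division only when the operands have opposite signs. A reference sketch, assuming cker's FloorDiv follows the usual TensorFlow Lite definition (std::floor of the real quotient):

#include <cmath>
#include <cstdint>

// Assumed element-wise reference semantics for FloorDiv.
int32_t floor_div(int32_t lhs, int32_t rhs)
{
  return static_cast<int32_t>(std::floor(static_cast<double>(lhs) / static_cast<double>(rhs)));
}

// floor_div( 7,  2) ==  3  (same as truncating division)
// floor_div( 7, -2) == -4  (truncating division gives -3)
// floor_div(-7,  2) == -4  (truncating division gives -3)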
diff --git a/runtime/onert/backend/cpu/ops/PoolLayer.cc b/runtime/onert/backend/cpu/ops/PoolLayer.cc
index 101b6f266..088ca5fd7 100644
--- a/runtime/onert/backend/cpu/ops/PoolLayer.cc
+++ b/runtime/onert/backend/cpu/ops/PoolLayer.cc
@@ -79,7 +79,11 @@ PoolLayer::PoolLayer() : _input(nullptr), _output(nullptr), _kernel()
op_params.filter_height = kernelHeight; \
op_params.filter_width = kernelWidth; \
op_params.padding_values.height = (int8_t)paddingTop; \
- op_params.padding_values.width = (int8_t)paddingLeft;
+ op_params.padding_values.width = (int8_t)paddingLeft; \
+ op_params.float_activation_min = 0; \
+ op_params.float_activation_max = 0; \
+ op_params.quantized_activation_min = 0; \
+ op_params.quantized_activation_max = 0;
void PoolLayer::configure(const IPortableTensor *input, const uint32_t paddingLeft, const uint32_t,
const uint32_t paddingTop, const uint32_t, const uint32_t strideWidth,
diff --git a/runtime/onert/backend/gpu_cl/Backend.h b/runtime/onert/backend/gpu_cl/Backend.h
new file mode 100644
index 000000000..dc0b8596c
--- /dev/null
+++ b/runtime/onert/backend/gpu_cl/Backend.h
@@ -0,0 +1,91 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __ONERT_BACKEND_GPU_CL_BACKEND_H__
+#define __ONERT_BACKEND_GPU_CL_BACKEND_H__
+
+#include <backend/Backend.h>
+#include <memory>
+
+#include "BackendContext.h"
+#include "Config.h"
+#include "ClTensorRegistry.h"
+#include "KernelGenerator.h"
+#include "TensorManager.h"
+#include "TensorBuilder.h"
+
+#include "open_cl/Environment.h"
+#include "open_cl/Status.h"
+
+namespace onert
+{
+namespace backend
+{
+namespace gpu_cl
+{
+
+class Backend : public ::onert::backend::Backend
+{
+public:
+ Backend() : _config{std::make_shared<Config>()} {}
+
+ std::shared_ptr<IConfig> config() const override { return _config; }
+
+ std::unique_ptr<onert::backend::BackendContext> newContext(ContextData &&data) const override
+ {
+ const auto &graph = *data.graph;
+ const auto &operands = data.graph->operands();
+ auto context = std::make_unique<gpu_cl::BackendContext>(this, std::move(data));
+
+ auto environment = std::make_shared<Environment>();
+ if (!CreateEnvironment(environment.get()).ok())
+ {
+ return nullptr;
+ }
+ auto tm = createTensorManager(&environment->context());
+
+ auto tr = std::make_shared<ClTensorRegistry<TensorManager>>(tm);
+
+ InferenceContext::CreateInferenceInfo create_info;
+ create_info.precision = CalculationsPrecision::F32;
+ create_info.storage_type =
+ GetStorageTypeWithMinimalMemoryConsumption(environment->device().GetInfo());
+ create_info.hints.Add(ModelHints::kFastestInference);
+
+ auto cc = std::make_shared<CreationContext>();
+ cc->device = environment->GetDevicePtr();
+ cc->context = &environment->context();
+ cc->queue = environment->queue();
+ cc->cache = environment->program_cache();
+
+ auto tb = std::make_shared<TensorBuilder>(operands, tm, create_info, environment);
+ context->tensor_registry = tr;
+ context->tensor_builder = tb;
+
+ context->kernel_gen = std::make_shared<KernelGenerator>(graph, tb, tr, cc);
+ context->constant_initializer = std::make_shared<ConstantInitializer>(operands, tr);
+ return context;
+ }
+
+private:
+ std::shared_ptr<IConfig> _config;
+};
+
+} // namespace gpu_cl
+} // namespace backend
+} // namespace onert
+
+#endif // __ONERT_BACKEND_GPU_CL_BACKEND_H__
diff --git a/runtime/onert/backend/gpu_cl/BackendContext.cc b/runtime/onert/backend/gpu_cl/BackendContext.cc
new file mode 100644
index 000000000..6c3ac81a2
--- /dev/null
+++ b/runtime/onert/backend/gpu_cl/BackendContext.cc
@@ -0,0 +1,242 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "BackendContext.h"
+
+#include "ConstantInitializer.h"
+#include "TensorBuilder.h"
+#include "KernelGenerator.h"
+
+#include "util/logging.h"
+#include "ir/Index.h"
+#include "ir/Operations.h"
+#include "ir/OperandIndexMap.h"
+#include "ir/OperandIndexSequence.h"
+
+namespace onert
+{
+namespace backend
+{
+namespace gpu_cl
+{
+
+void BackendContext::initConsts()
+{
+ _data.graph->operations().iterate([&](const ir::OperationIndex &, const ir::Operation &op) {
+ constant_initializer->setLayout(graph()->layout());
+ op.accept(*constant_initializer);
+ });
+ _data.graph->operands().iterate([&](const ir::OperandIndex &ind, const ir::Operand &operand) {
+ if (_data.external_operands.contains(ind) || !operand.isConstant())
+ return;
+ const auto &obj = graph()->operands().at(ind);
+ if (obj.isConstant() && !constant_initializer->exist(ind))
+ {
+ constant_initializer->registerDefaultInitializer(ind, obj);
+ }
+ });
+
+ constant_initializer->run();
+}
+
+void BackendContext::planTensors()
+{
+ ir::OperandIndexMap<uint32_t> uses_map;
+ ir::OperandIndexMap<uint32_t> def_map;
+ ir::OperandIndexSequence constants;
+
+ // Prepare scanning
+ _data.graph->operands().iterate([&](const ir::OperandIndex &ind, const ir::Operand &obj) {
+ if (_data.external_operands.contains(ind))
+ return;
+ uses_map[ind] = obj.getUses().size();
+ def_map[ind] = obj.getDef().valid() ? 1 : 0;
+
+ if (obj.isConstant())
+ constants.append(ind);
+
+ if (!tensor_builder->isRegistered(ind))
+ {
+ // These tensors do not exist in any operation (No use and def)
+ const auto info = obj.info();
+ const auto layout = _data.operand_layouts.at(ind);
+ // TODO Change tensor info to have permuted shape
+ tensor_builder->registerTensorInfo(ind, info, layout);
+ }
+ });
+
+ // Start scanning to do notify{First|Last}Use for each tensor
+
+ // If a tensor is a constant, increase the use of the tensor and allocate it first.
+ // Increasing the use count here makes the tensor never be deallocated, i.e., it will be
+ // deallocated last.
+ VERBOSE(planTensors) << "TENSORS as CONSTANT" << std::endl;
+ for (const auto &ind : constants)
+ {
+ uses_map[ind]++;
+ tensor_builder->notifyFirstUse(ind);
+ }
+
+ // At each operation,
+ // 1. Scan DEF of outputs. If it is the DEF, allocate it
+ // 2. Scan DEF of inputs. If it is a variable tensor, allocate it
+ // 3. Scan USE of inputs. Decrease the USE and deallocate if the USE is 0
+ for (const auto op_ind : _data.op_order)
+ {
+ const auto &op = graph()->operations().at(op_ind);
+ auto op_inputs = op.getInputs() | ir::Remove::DUPLICATED | ir::Remove::UNDEFINED;
+ auto op_outputs = op.getOutputs() | ir::Remove::DUPLICATED | ir::Remove::UNDEFINED;
+
+ // Define outputs
+ for (const auto &ind : op_outputs)
+ {
+ if (!tensor_builder->isRegistered(ind))
+ continue;
+ assert(def_map.find(ind) != def_map.end());
+ if (def_map[ind])
+ {
+ def_map[ind] = 0;
+ tensor_builder->notifyFirstUse(ind);
+ }
+ }
+
+ // Scan variable tensors
+ // These tensors have constant-like characteristics, but OperandInfo and LowerInfo treat them as
+ // non-constant so that memory planning here uses less memory
+ for (const auto &ind : op_inputs)
+ {
+ if (!tensor_builder->isRegistered(ind))
+ continue;
+ const auto &operand = graph()->operands().at(ind);
+ if (operand.info().isVariable())
+ {
+ // The variable tensor with buffer is not supported yet
+ assert(operand.data() == nullptr);
+ assert(operand.getUses().size() == 1 && !operand.getDef().valid());
+ assert(uses_map[ind] == 1 && def_map[ind] == 0);
+ tensor_builder->notifyFirstUse(ind);
+ }
+ }
+
+ for (const auto &ind : op_inputs)
+ {
+ if (!tensor_builder->isRegistered(ind))
+ continue;
+ assert(uses_map.find(ind) != uses_map.end());
+ assert(uses_map[ind] > 0);
+ uses_map[ind]--;
+ if (uses_map[ind] == 0)
+ {
+ // plan for deallocation of static tensor node
+ tensor_builder->notifyLastUse(ind);
+ }
+ }
+ }
+
+ _data.graph->operands().iterate([&](const ir::OperandIndex &ind, const ir::Operand &) {
+ if (uses_map[ind] == 0)
+ {
+ tensor_builder->notifyLastUse(ind);
+ }
+ });
+
+ // Dispose and validate
+ for (const auto &ind : constants)
+ {
+ --uses_map[ind];
+ if (uses_map[ind] == 0) // To prevent notifyLastUse from being called twice
+ {
+ tensor_builder->notifyLastUse(ind);
+ }
+ }
+
+ assert(
+ std::all_of(uses_map.begin(), uses_map.end(),
+ [](std::pair<const ir::OperandIndex, uint32_t> it) { return it.second == 0; }));
+
+ assert(
+ std::all_of(def_map.begin(), def_map.end(),
+ [](std::pair<const ir::OperandIndex, uint32_t> it) { return it.second == 0; }));
+}
+
+ITensorRegistry *BackendContext::genTensors()
+{
+ graph()->operands().iterate([&](const ir::OperandIndex &ind, const ir::Operand &obj) {
+ if (external_operands().contains(ind))
+ return;
+
+ const auto frontend_layout = graph()->layout();
+ const auto backend_layout = operand_layouts().at(ind);
+ ir::OperandInfo backend_info{permuteShape(obj.shape(), frontend_layout, backend_layout),
+ obj.typeInfo(), obj.info().memAllocType(), obj.isConstant()};
+ tensor_builder->registerTensorInfo(ind, backend_info, backend_layout);
+ });
+
+ // TODO Get compiler options from compiler, and use it rather than getting it from Env
+ if (util::getConfigString(util::config::EXECUTOR) == "Linear")
+ {
+ planTensors();
+ }
+ else
+ {
+ // For executors that do not have a fixed linear execution order:
+ // this is a workaround that uses the static memory planner but never deallocates tensors
+ graph()->operands().iterate([&](const ir::OperandIndex &ind, const ir::Operand &) {
+ if (tensor_builder->isRegistered(ind))
+ tensor_builder->notifyFirstUse(ind);
+ });
+ }
+
+ tensor_builder->prepare();
+
+ return tensor_registry.get();
+}
+
+FunctionMap BackendContext::genKernels()
+{
+ FunctionMap ret;
+
+ // kernel_gen
+ for (auto op_ind : _data.op_order)
+ {
+ auto fn_seq = kernel_gen->generate(op_ind);
+ ret.emplace_back(op_ind, std::move(fn_seq));
+ }
+
+ tensor_builder->allocate();
+
+ initConsts();
+
+ // NOTE For memory optimization, we want to free some operand data
+ const_cast<ir::Graph &>(*_data.graph)
+ .operands()
+ .iterate([&](const ir::OperandIndex &, ir::Operand &obj) { obj.releaseData(); });
+
+ for (auto &it : ret)
+ {
+ auto &fn_seq = it.second;
+ fn_seq->iterate([&](exec::IFunction &ifunc) {
+ ifunc.prepare();
+ tensor_builder->postFunctionPrepare();
+ });
+ }
+
+ return ret;
+}
+
+} // namespace gpu_cl
+} // namespace backend
+} // namespace onert
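The planTensors() routine above is plain reference counting: every tensor starts with its number of uses (plus one extra if it is a constant, so constants are released last), gets notifyFirstUse() when its defining operation is reached, and notifyLastUse() when its remaining use count drops to zero. A toy, self-contained illustration of the counting idea (not onert code; the tensor names and two-operation graph are made up):

#include <cstdint>
#include <iostream>
#include <map>
#include <string>
#include <vector>

int main()
{
  // Toy graph: op0 reads constant t0 and defines t1; op1 reads t1 and defines t2.
  std::map<std::string, uint32_t> uses = {{"t0", 1}, {"t1", 1}, {"t2", 0}};

  uses["t0"]++; // constants get one extra use so they are only released at the very end
  std::cout << "first use: t0 (constant, allocated up front)\n";

  struct Op { std::vector<std::string> inputs; std::string output; };
  const std::vector<Op> order = {{{"t0"}, "t1"}, {{"t1"}, "t2"}};

  for (const auto &op : order)
  {
    std::cout << "first use: " << op.output << " (defined here)\n";
    for (const auto &in : op.inputs)
      if (--uses[in] == 0)
        std::cout << "last use:  " << in << " (can be deallocated)\n";
  }

  if (--uses["t0"] == 0) // constants are released after all operations, as in the final loop above
    std::cout << "last use:  t0 (constant released last)\n";
}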
diff --git a/runtime/onert/backend/gpu_cl/BackendContext.h b/runtime/onert/backend/gpu_cl/BackendContext.h
new file mode 100644
index 000000000..f17489e7a
--- /dev/null
+++ b/runtime/onert/backend/gpu_cl/BackendContext.h
@@ -0,0 +1,66 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __ONERT_BACKEND_GPU_CL_BACKEND_CONTEXT_H__
+#define __ONERT_BACKEND_GPU_CL_BACKEND_CONTEXT_H__
+
+#include <backend/BackendContext.h>
+#include <util/ConfigSource.h>
+
+#include "ConstantInitializer.h"
+#include "KernelGenerator.h"
+#include "TensorBuilder.h"
+#include "open_cl/InferenceContext.h"
+
+namespace onert
+{
+namespace backend
+{
+namespace gpu_cl
+{
+
+class BackendContext : public onert::backend::BackendContext
+{
+public:
+ BackendContext(const Backend *backend, ContextData &&data,
+ std::shared_ptr<ITensorRegistry> tensor_registry = nullptr,
+ std::shared_ptr<TensorBuilder> tensor_builder = nullptr,
+ std::shared_ptr<ConstantInitializer> constant_initializer = nullptr,
+ std::shared_ptr<KernelGenerator> kernel_gen = nullptr)
+ : onert::backend::BackendContext(backend, std::move(data), tensor_registry),
+ tensor_builder{tensor_builder}, constant_initializer{constant_initializer}, kernel_gen{
+ kernel_gen}
+ {
+ }
+
+ ITensorRegistry *genTensors() override;
+ FunctionMap genKernels() override;
+
+private:
+ void initConsts();
+ void planTensors();
+
+public:
+ std::shared_ptr<TensorBuilder> tensor_builder;
+ std::shared_ptr<ConstantInitializer> constant_initializer;
+ std::shared_ptr<KernelGenerator> kernel_gen;
+};
+
+} // namespace gpu_cl
+} // namespace backend
+} // namespace onert
+
+#endif // __ONERT_BACKEND_GPU_CL_BACKEND_CONTEXT_H__
diff --git a/runtime/onert/backend/gpu_cl/CMakeLists.txt b/runtime/onert/backend/gpu_cl/CMakeLists.txt
new file mode 100644
index 000000000..49bae37f8
--- /dev/null
+++ b/runtime/onert/backend/gpu_cl/CMakeLists.txt
@@ -0,0 +1,44 @@
+set(LIB_ONERT_BACKEND_GPU_CL onert_backend_gpu_cl)
+
+nnas_find_package(Opencl_Headers QUIET)
+if(NOT Opencl_Headers_FOUND)
+ return()
+endif(NOT Opencl_Headers_FOUND)
+
+if(NOT BUILD_GPU_CL)
+ return()
+endif(NOT BUILD_GPU_CL)
+
+nnas_find_package(Farmhash QUIET)
+if(NOT Farmhash_FOUND)
+ return()
+endif(NOT Farmhash_FOUND)
+
+nnas_find_package(Abseil QUIET)
+if(NOT Abseil_FOUND)
+ return()
+endif(NOT Abseil_FOUND)
+
+file(GLOB_RECURSE SOURCES "*.cc")
+
+
+add_library(${LIB_ONERT_BACKEND_GPU_CL} SHARED ${SOURCES})
+
+target_include_directories(${LIB_ONERT_BACKEND_GPU_CL} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR})
+
+target_link_libraries(${LIB_ONERT_BACKEND_GPU_CL} PRIVATE abseil)
+target_link_libraries(${LIB_ONERT_BACKEND_GPU_CL} PRIVATE dl)
+target_link_libraries(${LIB_ONERT_BACKEND_GPU_CL} PRIVATE farmhash)
+target_link_libraries(${LIB_ONERT_BACKEND_GPU_CL} PRIVATE Headers)
+target_link_libraries(${LIB_ONERT_BACKEND_GPU_CL} PRIVATE onert_core)
+target_link_libraries(${LIB_ONERT_BACKEND_GPU_CL} PRIVATE nnfw_common)
+target_link_libraries(${LIB_ONERT_BACKEND_GPU_CL} PRIVATE nnfw_coverage)
+
+set_target_properties(${LIB_ONERT_BACKEND_GPU_CL} PROPERTIES OUTPUT_NAME backend_gpu_cl)
+
+if(CMAKE_BUILD_TYPE_LC STREQUAL "release")
+ add_custom_command(TARGET ${LIB_ONERT_BACKEND_GPU_CL} POST_BUILD
+ COMMAND ${CMAKE_STRIP} "--strip-unneeded" $<TARGET_FILE_NAME:${LIB_ONERT_BACKEND_GPU_CL}>)
+endif()
+
+install(TARGETS ${LIB_ONERT_BACKEND_GPU_CL} DESTINATION lib)
diff --git a/runtime/onert/backend/gpu_cl/ClConstantInitializer.cc b/runtime/onert/backend/gpu_cl/ClConstantInitializer.cc
new file mode 100644
index 000000000..b3ef2f560
--- /dev/null
+++ b/runtime/onert/backend/gpu_cl/ClConstantInitializer.cc
@@ -0,0 +1,104 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "ClConstantInitializer.h"
+
+namespace onert
+{
+namespace backend
+{
+namespace gpu_cl
+{
+
+ClConstantInitializer::ClConstantInitializer(const ir::Operands &operands,
+ const std::shared_ptr<ITensorRegistry> &tensor_reg)
+ : _operands{operands}, _tensor_reg{tensor_reg}, _current_layout{ir::Layout::UNKNOWN}
+{
+ // DO NOTHING
+}
+
+void ClConstantInitializer::copyInputInitialize(const ir::Operation &node, uint32_t index)
+{
+ assert(node.getInputs().size() > index);
+
+ const auto &input_index = node.getInputs().at(index);
+ if (input_index.valid())
+ {
+ const auto &input_obj = _operands.at(input_index);
+ registerCopyInitializer(input_index, input_obj);
+ }
+}
+
+void ClConstantInitializer::permuteInputInitialize(const ir::Operation &node, uint32_t index)
+{
+ assert(node.getInputs().size() > index);
+
+ const auto &input_index = node.getInputs().at(index);
+ const auto &input_obj = _operands.at(input_index);
+ registerPermuteInitializer(input_index, input_obj);
+}
+
+// NOTE Workaround for 16b float type. Here, this is enough since only the size in bytes matters.
+using float16 = uint16_t;
+
+void ClConstantInitializer::registerCopyInitializer(const ir::OperandIndex &index,
+ const ir::Operand &obj)
+{
+ // For only CONSTANTS
+ // TODO Add to check if tensor has been allocated
+ if (!obj.isConstant())
+ return;
+
+ const auto type = obj.typeInfo().type();
+ using ir::DataType;
+
+ switch (type)
+ {
+ case DataType::FLOAT32:
+ _init_map[index] = copyInit<float>;
+ break;
+ default:
+ throw std::runtime_error("Not supported, yet");
+ break;
+ }
+}
+
+void ClConstantInitializer::registerPermuteInitializer(const ir::OperandIndex &index,
+ const ir::Operand &obj)
+{
+ // For only CONSTANTS
+ // TODO Add to check if tensor has been allocated
+ if (!obj.isConstant())
+ return;
+
+ const auto type = obj.typeInfo().type();
+ using ir::DataType;
+ using namespace std::placeholders;
+
+ switch (type)
+ {
+ case DataType::FLOAT32:
+ _init_map[index] = std::bind(permuteInit<float>, _1, _2, _current_layout);
+ break;
+ default:
+ throw std::runtime_error("Not supported, yet");
+ break;
+ }
+}
+
+} // namespace gpu_cl
+} // namespace backend
+} // namespace onert
diff --git a/runtime/onert/backend/gpu_cl/ClConstantInitializer.h b/runtime/onert/backend/gpu_cl/ClConstantInitializer.h
new file mode 100644
index 000000000..d7d21e847
--- /dev/null
+++ b/runtime/onert/backend/gpu_cl/ClConstantInitializer.h
@@ -0,0 +1,140 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __ONERT_COMPILER_GPU_CL_CLCONSTANT_INITIALIZER_H__
+#define __ONERT_COMPILER_GPU_CL_CLCONSTANT_INITIALIZER_H__
+
+#include "ClTensorRegistry.h"
+
+#include <unordered_map>
+#include <functional>
+
+#include <ir/Coordinates.h>
+#include <ir/Layout.h>
+#include <ir/Operand.h>
+#include <ir/Operands.h>
+#include <ir/OperationVisitor.h>
+#include <backend/ITensorRegistry.h>
+#include <util/logging.h>
+
+namespace onert
+{
+namespace backend
+{
+namespace gpu_cl
+{
+
+template <typename T>
+static void Init(const onert::ir::Operand &model_obj, onert::backend::ITensor &obj, const bool copy,
+ const onert::ir::Layout frontend_layout = onert::ir::Layout::UNKNOWN)
+{
+ const auto shape = model_obj.shape();
+ assert(model_obj.data());
+ obj.access([&](::onert::backend::ITensor &tensor) {
+ switch (shape.rank())
+ {
+ case 0:
+ case 1:
+ case 2:
+ case 3:
+ case 4:
+ if (copy)
+ {
+ tensor.enqueueWriteBuffer(model_obj.data()->base(), true);
+ }
+ else
+ {
+ // NYI
+ (void)frontend_layout;
+ throw std::runtime_error{"Not yet supported"};
+ }
+ break;
+ default:
+ throw std::runtime_error{"Not yet supported"};
+ }
+ });
+}
+
+template <typename T>
+void copyInit(const onert::ir::Operand &model_obj, onert::backend::ITensor &obj)
+{
+ Init<T>(model_obj, obj, true);
+}
+
+template <typename T>
+void permuteInit(const onert::ir::Operand &model_obj, onert::backend::ITensor &obj,
+ const onert::ir::Layout frontend_layout)
+{
+ const bool copy = frontend_layout == obj.layout();
+ Init<T>(model_obj, obj, copy, frontend_layout);
+}
+
+class ClConstantInitializer : public ir::OperationVisitor
+{
+public:
+ void run()
+ {
+ assert(_tensor_reg);
+ for (const auto &it : _init_map)
+ {
+ const auto &ind = it.first;
+ const auto &fn = it.second;
+
+ const auto &model_obj = _operands.at(ind);
+ auto tensor_obj = _tensor_reg->getNativeITensor(ind);
+ assert(tensor_obj != nullptr);
+ fn(model_obj, *tensor_obj);
+ VERBOSE(FillOperandData) << "Fill data for operand " << ind << std::endl;
+ }
+ _init_map.clear();
+ }
+
+public:
+ ClConstantInitializer(const ir::Operands &operands,
+ const std::shared_ptr<ITensorRegistry> &tensor_reg);
+
+public:
+ using Initializer = std::function<void(const ir::Operand &, backend::ITensor &)>;
+
+public:
+ void registerDefaultInitializer(const ir::OperandIndex &index, const ir::Operand &obj)
+ {
+ registerPermuteInitializer(index, obj);
+ }
+ void registerCopyInitializer(const ir::OperandIndex &index, const ir::Operand &obj);
+ void registerPermuteInitializer(const ir::OperandIndex &index, const ir::Operand &obj);
+
+public:
+ void setLayout(ir::Layout layout) { _current_layout = layout; }
+ bool exist(const ir::OperandIndex &ind) { return _init_map.find(ind) != _init_map.end(); }
+
+public:
+protected:
+ void copyInputInitialize(const ir::Operation &node, uint32_t index);
+ void permuteInputInitialize(const ir::Operation &node, uint32_t index);
+
+protected:
+ const ir::Operands &_operands;
+ std::shared_ptr<ITensorRegistry> _tensor_reg;
+ std::unordered_map<ir::OperandIndex, Initializer> _init_map;
+ ir::Layout _current_layout;
+};
+
+} // namespace gpu_cl
+} // namespace backend
+} // namespace onert
+
+#endif // __ONERT_COMPILER_GPU_CL_CLCONSTANT_INITIALIZER_H__
diff --git a/runtime/onert/backend/gpu_cl/ClFunction.h b/runtime/onert/backend/gpu_cl/ClFunction.h
new file mode 100644
index 000000000..9d3d69092
--- /dev/null
+++ b/runtime/onert/backend/gpu_cl/ClFunction.h
@@ -0,0 +1,88 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __ONERT_GPU_CL_OPEN_CL_FUNCTION_H__
+#define __ONERT_GPU_CL_OPEN_CL_FUNCTION_H__
+
+#include <exec/IFunction.h>
+
+#include <vector>
+#include <memory>
+
+#include "open_cl/kernels/GpuOperation.h"
+#include "open_cl/ClCommandQueue.h"
+#include "open_cl/Status.h"
+
+namespace onert
+{
+namespace backend
+{
+namespace gpu_cl
+{
+
+class ClFunction : public ::onert::exec::IFunction
+{
+public:
+ ClFunction() : _gpu_operations(), _creation_context() {}
+
+public:
+ void configure(std::shared_ptr<CreationContext> creation_context)
+ {
+ _creation_context = creation_context;
+ }
+
+ void add_operation(std::unique_ptr<GPUOperation> gpu_operation)
+ {
+ _gpu_operations.push_back(std::move(gpu_operation));
+ }
+
+ void run() override
+ {
+ for (const auto &gpu_operation : _gpu_operations)
+ {
+ if (!gpu_operation->AddToQueue(_creation_context->queue).ok())
+ {
+ throw std::runtime_error("Failed to AddToQueue.");
+ }
+ }
+ }
+
+ void prepare() override
+ {
+ for (const auto &gpu_operation : _gpu_operations)
+ {
+ if (!gpu_operation->Compile(*_creation_context).ok())
+ {
+ throw std::runtime_error("Failed to Compile.");
+ }
+
+ if (!gpu_operation->UpdateParams().ok())
+ {
+ throw std::runtime_error("Failed to UpdateParams.");
+ }
+ }
+ }
+
+private:
+ std::vector<std::unique_ptr<GPUOperation>> _gpu_operations;
+ std::shared_ptr<CreationContext> _creation_context;
+};
+
+} // namespace gpu_cl
+} // namespace backend
+} // namespace onert
+
+#endif // __ONERT_GPU_CL_OPEN_CL_FUNCTION_H__
diff --git a/runtime/onert/backend/gpu_cl/ClMemoryManager.h b/runtime/onert/backend/gpu_cl/ClMemoryManager.h
new file mode 100644
index 000000000..3bac0d51d
--- /dev/null
+++ b/runtime/onert/backend/gpu_cl/ClMemoryManager.h
@@ -0,0 +1,135 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __ONERT_BACKEND_GPU_CL_MEMORY_MANAGER_H__
+#define __ONERT_BACKEND_GPU_CL_MEMORY_MANAGER_H__
+
+#include <cassert>
+
+#include "ir/OperandIndexMap.h"
+#include "ir/Shape.h"
+#include "open_cl/ClContext.h"
+#include "open_cl/InferenceContext.h"
+#include "open_cl/Status.h"
+#include "open_cl/StorageTypeUtil.h"
+#include "open_cl/TensorType.h"
+#include "util/logging.h"
+
+namespace onert
+{
+namespace backend
+{
+namespace gpu_cl
+{
+
+template <typename T_ITensor, typename T_Tensor> class ClMemoryManager
+{
+public:
+ ClMemoryManager(CLContext *context) : _context{context} {}
+
+ virtual ~ClMemoryManager() = default;
+
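+  // Creates the device-side tensors recorded by buildTensor(); note that a failed
+  // CreateTensor silently stops allocation of the remaining tensors.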
+ virtual void allocate(void)
+ {
+ for (const auto &tensor_entry : _tensors)
+ {
+ auto tensor = tensor_entry.second;
+ const auto &t = tensor_reserver_.Get(tensor_entry.first.value());
+ const auto &shape = t->shape;
+ const auto &descriptor = t->descriptor;
+ if (!CreateTensor(*_context, shape, descriptor, tensor->handle()).ok())
+ {
+ return;
+ }
+ }
+ }
+
+ virtual void deallocate(void)
+ {
+ // NYI
+ }
+
+ virtual void startLifetime(const ir::OperandIndex &)
+ { /* DO NOTHING */
+ }
+ virtual void finishLifetime(const ir::OperandIndex &)
+ { /* DO NOTHING */
+ }
+
+ void buildTensor(const ir::OperandIndex &ind, const ir::OperandInfo &info,
+ InferenceContext::CreateInferenceInfo create_info,
+ std::shared_ptr<Environment> environment, DeviceInfo &device_info)
+ {
+ ValueId max_id = 0;
+ auto data_type = DeduceDataTypeFromPrecision(create_info.precision);
+ const auto shape = info.shape();
+
+ auto tensor = std::make_shared<T_Tensor>(shape.rank(), shape, environment);
+ _tensors[ind] = tensor;
+
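+    // Map the onert shape (rank 1-4) onto BHWC, padding missing dimensions with 1;
+    // other ranks leave t_shape default-constructed.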
+ BHWC t_shape;
+ switch (shape.rank())
+ {
+ case 1:
+ // B layout
+ t_shape = BHWC(shape.dim(0), 1, 1, 1);
+ break;
+ case 2:
+ // BC layout
+ t_shape = BHWC(shape.dim(0), 1, 1, shape.dim(1));
+ break;
+ case 3:
+ // BWC layout
+ t_shape = BHWC(shape.dim(0), 1, shape.dim(1), shape.dim(2));
+ break;
+ case 4:
+ // BHWC layout
+ t_shape = BHWC(shape.dim(0), shape.dim(1), shape.dim(2), shape.dim(3));
+ break;
+ default:
+ break;
+ }
+
+ TensorStorageType storage_type = create_info.storage_type;
+ Layout layout = t_shape.b == 1 ? Layout::HWC : Layout::BHWC;
+
+ ValueId id = ind.value();
+ storage_type = SelectBestStorageType(device_info, t_shape, storage_type, data_type, layout);
+ auto dummy = std::make_shared<InferenceContext::DummyTensor>();
+ dummy->shape = t_shape;
+ dummy->descriptor = TensorDescriptor{data_type, storage_type, layout};
+ tensor_reserver_.Add(id, dummy);
+
+ max_id = std::max(max_id, id);
+
+ tensor_reserver_.SetNext(max_id + 1);
+ }
+
+ ir::OperandIndexMap<std::shared_ptr<T_Tensor>> &tensors(void) { return _tensors; }
+
+ InferenceContext::TensorReserver &tensorReservers(void) { return tensor_reserver_; }
+
+private:
+ ir::OperandIndexMap<std::shared_ptr<T_Tensor>> _tensors;
+ InferenceContext::TensorReserver tensor_reserver_;
+ CLContext *_context;
+};
+
+} // namespace gpu_cl
+} // namespace backend
+} // namespace onert
+
+#endif // __ONERT_BACKEND_GPU_CL_MEMORY_MANAGER_H__
diff --git a/runtime/onert/backend/gpu_cl/ClTensorBuilder.h b/runtime/onert/backend/gpu_cl/ClTensorBuilder.h
new file mode 100644
index 000000000..951bbd844
--- /dev/null
+++ b/runtime/onert/backend/gpu_cl/ClTensorBuilder.h
@@ -0,0 +1,289 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __ONERT_BACKEND_CL_TENSOR_BUILDER_H__
+#define __ONERT_BACKEND_CL_TENSOR_BUILDER_H__
+
+#include <memory>
+#include <queue>
+
+#include "ClTensorManager.h"
+#include "ClTensorRegistry.h"
+#include "ParentInfo.h"
+
+#include "open_cl/TensorType.h"
+#include "open_cl/TensorTypeUtil.h"
+#include "open_cl/ClDevice.h"
+#include "open_cl/InferenceContext.h"
+
+#include "ir/OperandIndexMap.h"
+#include "ir/OperandIndexSequence.h"
+#include <ir/Operands.h>
+#include <util/Utils.h>
+
+namespace onert
+{
+namespace backend
+{
+namespace gpu_cl
+{
+
+enum class UsesType
+{
+ FIRST,
+ LAST
+};
+
+template <typename T_ITensor, typename T_Tensor> class ClTensorBuilder
+{
+public:
+ using T_ClTensorManager = ClTensorManager<T_ITensor, T_Tensor>;
+
+ ClTensorBuilder(const ir::Operands &operands, T_ClTensorManager *tensor_mgr,
+ InferenceContext::CreateInferenceInfo create_info,
+ const std::shared_ptr<Environment> &environment);
+
+ /**
+   * @brief Register tensor information to allocate on the gpu_cl backend
+ * @param[in] ind Operand index
+ * @param[in] info Tensor information
+ * @param[in] layout Tensor data layout
+ */
+ void registerTensorInfo(const ir::OperandIndex &ind, const ir::OperandInfo &info,
+ ir::Layout backend_layout);
+
+ void notifyFirstUse(const ir::OperandIndex &);
+ void notifyLastUse(const ir::OperandIndex &);
+
+ bool isRegistered(const ir::OperandIndex &) const;
+
+ void prepare();
+ void allocate();
+ void postFunctionPrepare();
+
+ T_ClTensorManager *cl_tensor_manager(void) { return _tensor_mgr.get(); }
+
+ void setUsesCount(const ir::OperandIndex &index, size_t num_uses)
+ {
+ assert(_uses_count_map.find(index) != _uses_count_map.end() ? _uses_count_map[index] == num_uses
+ : true);
+ _uses_count_map[index] = num_uses;
+ }
+
+ void parent_map(std::unordered_map<ir::OperandIndex, ParentInfo> &&parent_map)
+ {
+ _parent_map = std::move(parent_map);
+ }
+
+ bool areSubTensorsOf(const ir::OperandIndex &parent, const ir::OperandIndexSequence &seq);
+
+ /**
+ * @brief Check child tensor is allocated as subtensor of parent tensor
+ * @param[in] parent Index of parent
+ * @param[in] child Index of child
+ * @return @c true if child is allocated as subtensor of parent, otherwise @c false
+ */
+ bool isSubTensorOf(const ir::OperandIndex &parent, const ir::OperandIndex &child);
+
+private:
+ void buildTensors(void);
+ ir::OperandIndex findRootParent(ir::OperandIndex index);
+
+private:
+ const ir::Operands &_operands;
+ ir::OperandIndexMap<ir::OperandInfo> _tensor_info_map;
+ ir::OperandIndexMap<ir::Layout> _tensor_layout_map;
+ ir::OperandIndexMap<size_t> _uses_count_map;
+
+ std::unique_ptr<T_ClTensorManager> _tensor_mgr;
+ InferenceContext::CreateInferenceInfo _create_info;
+ std::shared_ptr<Environment> _environment;
+
+ // for linear executor
+ std::vector<std::pair<UsesType, ir::OperandIndex>> _lifetime_seq;
+
+ // Extra info for concat elimination
+ ir::OperandIndexMap<ParentInfo> _parent_map;
+};
+
+} // namespace gpu_cl
+} // namespace backend
+} // namespace onert
+
+#include <cassert>
+#include <stack>
+
+#include "util/logging.h"
+
+namespace onert
+{
+namespace backend
+{
+namespace gpu_cl
+{
+
+template <typename T_ITensor, typename T_Tensor>
+ClTensorBuilder<T_ITensor, T_Tensor>::ClTensorBuilder(
+ const ir::Operands &operands, T_ClTensorManager *tensor_mgr,
+ InferenceContext::CreateInferenceInfo create_info,
+ const std::shared_ptr<Environment> &environment)
+ : _operands{operands}, _tensor_mgr{tensor_mgr}, _create_info{create_info}, _environment{
+ environment}
+{
+ assert(_tensor_mgr);
+}
+
+template <typename T_ITensor, typename T_Tensor>
+void ClTensorBuilder<T_ITensor, T_Tensor>::registerTensorInfo(const ir::OperandIndex &ind,
+ const ir::OperandInfo &info,
+ ir::Layout backend_layout)
+{
+ assert(_tensor_mgr->constTensors().size() == 0);
+ assert(_tensor_mgr->nonconstTensors().size() == 0);
+
+ _uses_count_map[ind] = _operands.at(ind).getUses().size();
+
+ _tensor_info_map.emplace(ind, info);
+ _tensor_layout_map.insert({ind, backend_layout});
+}
+
+template <typename T_ITensor, typename T_Tensor>
+void ClTensorBuilder<T_ITensor, T_Tensor>::notifyFirstUse(const ir::OperandIndex &ind)
+{
+ _lifetime_seq.emplace_back(UsesType::FIRST, ind);
+}
+
+template <typename T_ITensor, typename T_Tensor>
+void ClTensorBuilder<T_ITensor, T_Tensor>::notifyLastUse(const ir::OperandIndex &ind)
+{
+ _lifetime_seq.emplace_back(UsesType::LAST, ind);
+}
+
+template <typename T_ITensor, typename T_Tensor>
+bool ClTensorBuilder<T_ITensor, T_Tensor>::isRegistered(const ir::OperandIndex &ind) const
+{
+ return _tensor_info_map.find(ind) != _tensor_info_map.end();
+}
+
+template <typename T_ITensor, typename T_Tensor>
+void ClTensorBuilder<T_ITensor, T_Tensor>::prepare(void)
+{
+ buildTensors();
+}
+
+template <typename T_ITensor, typename T_Tensor>
+void ClTensorBuilder<T_ITensor, T_Tensor>::allocate(void)
+{
+ // Update lifetime sequence to apply subtensor optimization
+
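+  // Resolve every operand to its root parent so that subtensors (e.g. concat children)
+  // share the parent's lifetime.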
+ std::unordered_map<ir::OperandIndex, ir::OperandIndex> root_map;
+ std::function<ir::OperandIndex &(ir::OperandIndex)> find_root =
+ [&](ir::OperandIndex ind) -> ir::OperandIndex & {
+ ir::OperandIndex &ret = root_map[ind];
+
+ // We know the root parent value already
+ if (ret.valid())
+ return ret;
+
+ auto itr = _parent_map.find(ind);
+ if (itr == _parent_map.end())
+ {
+ // If there is no parent, let's store the value of itself
+ return ret = ind;
+ }
+ else
+ {
+ return ret = find_root(itr->second.parent);
+ }
+ };
+
+ ir::OperandIndexMap<bool> first_use_check;
+ ir::OperandIndexMap<bool> last_use_check;
+ std::map<size_t, std::pair<UsesType, ir::OperandIndex>> lifetime_map;
+ for (size_t i = 0; i < _lifetime_seq.size(); i++)
+ {
+ auto &entry = _lifetime_seq[i];
+ if (entry.first != UsesType::FIRST)
+ continue;
+ auto root_ind = find_root(entry.second);
+ if (first_use_check[root_ind])
+ continue;
+ first_use_check[root_ind] = true;
+ lifetime_map[i] = {UsesType::FIRST, root_ind};
+ }
+
+ for (int i = _lifetime_seq.size() - 1; i >= 0; i--)
+ {
+ auto &entry = _lifetime_seq[i];
+ if (entry.first != UsesType::LAST)
+ continue;
+ auto root_ind = find_root(entry.second);
+ if (last_use_check[root_ind])
+ continue;
+ last_use_check[root_ind] = true;
+ lifetime_map[i] = {UsesType::LAST, root_ind};
+ }
+
+ for (auto &entry : lifetime_map)
+ {
+ auto &use = entry.second;
+ auto use_type = use.first;
+ auto use_index = use.second;
+ assert(use_index.valid());
+ if (use_type == UsesType::FIRST)
+ _tensor_mgr->startLifetime(use_index);
+ else
+ _tensor_mgr->finishLifetime(use_index);
+ }
+
+ _tensor_mgr->allocateConsts();
+
+  // TODO `_parent_map` is filled for all Concat nodes, even ones this backend does not execute,
+  //      so the assertion below does not hold yet.
+  //      After refactoring BackendContext we can uncomment this:
+ // assert(_tensor_info_map.size() ==
+ // _tensor_mgr->nonconstTensors().size() + num of constants of _tensor_info_map +
+ // _parent_map.size());
+ _tensor_mgr->allocateNonconsts();
+}
+
+template <typename T_ITensor, typename T_Tensor>
+void ClTensorBuilder<T_ITensor, T_Tensor>::postFunctionPrepare(void)
+{
+ _tensor_mgr->tryDeallocConstants();
+}
+
+template <typename T_ITensor, typename T_Tensor>
+void ClTensorBuilder<T_ITensor, T_Tensor>::buildTensors(void)
+{
+ assert(_tensor_mgr->constTensors().size() == 0);
+ assert(_tensor_mgr->nonconstTensors().size() == 0);
+ // Normal tensors
+ for (auto &entry : _tensor_info_map)
+ {
+ auto ind = entry.first;
+ if (_parent_map.count(ind) > 0)
+ continue;
+
+ const auto &info = entry.second;
+ _tensor_mgr->buildTensor(ind, info, _create_info, _environment, _environment->device().info_);
+ }
+}
+
+} // namespace gpu_cl
+} // namespace backend
+} // namespace onert
+
+#endif // __ONERT_BACKEND_CL_TENSOR_BUILDER_H__
diff --git a/runtime/onert/backend/gpu_cl/ClTensorManager.h b/runtime/onert/backend/gpu_cl/ClTensorManager.h
new file mode 100644
index 000000000..49a11730f
--- /dev/null
+++ b/runtime/onert/backend/gpu_cl/ClTensorManager.h
@@ -0,0 +1,235 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __ONERT_BACKEND_GPU_CL_TENSOR_MANAGER_H__
+#define __ONERT_BACKEND_GPU_CL_TENSOR_MANAGER_H__
+
+#include "ClMemoryManager.h"
+
+#include "open_cl/InferenceContext.h"
+#include "open_cl/TensorType.h"
+
+#include "ir/OperandInfo.h"
+#include "ir/OperandIndexMap.h"
+
+namespace onert
+{
+namespace backend
+{
+namespace gpu_cl
+{
+
+template <typename T_ITensor, typename T_Tensor> class ClTensorManager
+{
+public:
+ using T_ClMemoryManager = ClMemoryManager<T_ITensor, T_Tensor>;
+
+ ClTensorManager(T_ClMemoryManager *const_mgr, T_ClMemoryManager *nonconst_mgr);
+
+ virtual ~ClTensorManager() = default;
+
+ void allocateConsts(void);
+ void allocateNonconsts(void);
+ void deallocateConsts(void);
+ void deallocateNonconsts(void);
+
+ void buildTensor(const ir::OperandIndex &ind, const ir::OperandInfo &info,
+ InferenceContext::CreateInferenceInfo create_info,
+ std::shared_ptr<Environment> environment, DeviceInfo &device_info);
+
+ std::shared_ptr<T_ITensor> findTensorAsParent(const ir::OperandIndex &ind);
+
+ void startLifetime(const ir::OperandIndex &ind);
+ void finishLifetime(const ir::OperandIndex &ind);
+
+ std::shared_ptr<T_ITensor> at(const ir::OperandIndex &ind);
+ std::shared_ptr<InferenceContext::DummyTensor> atR(const ir::OperandIndex &ind);
+
+ InferenceContext::TensorReserver &constTensorReservers(void);
+ InferenceContext::TensorReserver &nonconstTensorReservers(void);
+
+ ir::OperandIndexMap<std::shared_ptr<T_Tensor>> &constTensors(void);
+ ir::OperandIndexMap<std::shared_ptr<T_Tensor>> &nonconstTensors(void);
+
+ void iterate(const std::function<void(const ir::OperandIndex &)> &fn);
+
+ void tryDeallocConstants(void);
+
+private:
+ std::unique_ptr<T_ClMemoryManager> _const_mgr;
+ std::unique_ptr<T_ClMemoryManager> _nonconst_mgr;
+ ir::OperandIndexMap<T_ClMemoryManager &> _ind_to_mgr;
+};
+
+} // namespace gpu_cl
+} // namespace backend
+} // namespace onert
+
+#include <cassert>
+#include "util/logging.h"
+
+namespace onert
+{
+namespace backend
+{
+namespace gpu_cl
+{
+
+template <typename T_ITensor, typename T_Tensor>
+ClTensorManager<T_ITensor, T_Tensor>::ClTensorManager(T_ClMemoryManager *const_mgr,
+ T_ClMemoryManager *nonconst_mgr)
+ : _const_mgr{const_mgr}, _nonconst_mgr{nonconst_mgr}
+{
+ // DO NOTHING
+}
+
+template <typename T_ITensor, typename T_Tensor>
+void ClTensorManager<T_ITensor, T_Tensor>::allocateConsts(void)
+{
+ _const_mgr->allocate();
+}
+
+template <typename T_ITensor, typename T_Tensor>
+void ClTensorManager<T_ITensor, T_Tensor>::allocateNonconsts(void)
+{
+ _nonconst_mgr->allocate();
+}
+
+template <typename T_ITensor, typename T_Tensor>
+void ClTensorManager<T_ITensor, T_Tensor>::deallocateConsts(void)
+{
+ _const_mgr->deallocate();
+}
+
+template <typename T_ITensor, typename T_Tensor>
+void ClTensorManager<T_ITensor, T_Tensor>::deallocateNonconsts(void)
+{
+ _nonconst_mgr->deallocate();
+}
+
+template <typename T_ITensor, typename T_Tensor>
+void ClTensorManager<T_ITensor, T_Tensor>::buildTensor(
+ const ir::OperandIndex &ind, const ir::OperandInfo &info,
+ InferenceContext::CreateInferenceInfo create_info, std::shared_ptr<Environment> environment,
+ DeviceInfo &device_info)
+{
+ assert(_ind_to_mgr.find(ind) == _ind_to_mgr.end());
+
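+  // Route constant operands to _const_mgr and all others to _nonconst_mgr, and remember
+  // the owning manager for later lifetime/lookup calls.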
+ if (info.isConstant())
+ {
+ _const_mgr->buildTensor(ind, info, create_info, environment, device_info);
+ _ind_to_mgr.insert({ind, *_const_mgr});
+ }
+ else
+ {
+ _nonconst_mgr->buildTensor(ind, info, create_info, environment, device_info);
+ _ind_to_mgr.insert({ind, *_nonconst_mgr});
+ }
+}
+
+template <typename T_ITensor, typename T_Tensor>
+void ClTensorManager<T_ITensor, T_Tensor>::startLifetime(const ir::OperandIndex &ind)
+{
+ assert(_ind_to_mgr.find(ind) != _ind_to_mgr.end());
+ _ind_to_mgr.at(ind).startLifetime(ind);
+}
+
+template <typename T_ITensor, typename T_Tensor>
+void ClTensorManager<T_ITensor, T_Tensor>::finishLifetime(const ir::OperandIndex &ind)
+{
+ assert(_ind_to_mgr.find(ind) != _ind_to_mgr.end());
+ _ind_to_mgr.at(ind).finishLifetime(ind);
+}
+
+template <typename T_ITensor, typename T_Tensor>
+std::shared_ptr<T_ITensor> ClTensorManager<T_ITensor, T_Tensor>::at(const ir::OperandIndex &ind)
+{
+ if (_ind_to_mgr.find(ind) == _ind_to_mgr.end())
+ return nullptr;
+
+ auto &tensors = _ind_to_mgr.at(ind).tensors();
+ if (tensors.find(ind) != tensors.end())
+ {
+ return tensors.at(ind);
+ }
+
+ return nullptr;
+}
+
+template <typename T_ITensor, typename T_Tensor>
+ir::OperandIndexMap<std::shared_ptr<T_Tensor>> &
+ClTensorManager<T_ITensor, T_Tensor>::constTensors(void)
+{
+ return _const_mgr->tensors();
+}
+
+template <typename T_ITensor, typename T_Tensor>
+ir::OperandIndexMap<std::shared_ptr<T_Tensor>> &
+ClTensorManager<T_ITensor, T_Tensor>::nonconstTensors(void)
+{
+ return _nonconst_mgr->tensors();
+}
+
+template <typename T_ITensor, typename T_Tensor>
+std::shared_ptr<InferenceContext::DummyTensor>
+ClTensorManager<T_ITensor, T_Tensor>::atR(const ir::OperandIndex &ind)
+{
+ if (_nonconst_mgr->tensorReservers().HaveTensor(ind.value()))
+ {
+ return _nonconst_mgr->tensorReservers().Get(ind.value());
+ }
+ else if (_const_mgr->tensorReservers().HaveTensor(ind.value()))
+ {
+ return _const_mgr->tensorReservers().Get(ind.value());
+ }
+ return nullptr;
+}
+
+template <typename T_ITensor, typename T_Tensor>
+InferenceContext::TensorReserver &ClTensorManager<T_ITensor, T_Tensor>::constTensorReservers(void)
+{
+ return _const_mgr->tensorReservers();
+}
+
+template <typename T_ITensor, typename T_Tensor>
+InferenceContext::TensorReserver &
+ClTensorManager<T_ITensor, T_Tensor>::nonconstTensorReservers(void)
+{
+ return _nonconst_mgr->tensorReservers();
+}
+
+template <typename T_ITensor, typename T_Tensor>
+void ClTensorManager<T_ITensor, T_Tensor>::iterate(
+ const std::function<void(const ir::OperandIndex &)> &fn)
+{
+ for (auto it : _nonconst_mgr->tensors())
+ fn(it.first);
+
+ for (auto it : _const_mgr->tensors())
+ fn(it.first);
+}
+
+template <typename T_ITensor, typename T_Tensor>
+void ClTensorManager<T_ITensor, T_Tensor>::tryDeallocConstants(void)
+{
+ // NYI
+}
+
+} // namespace gpu_cl
+} // namespace backend
+} // namespace onert
+
+#endif // __ONERT_BACKEND_GPU_CL_TENSOR_MANAGER_H__
diff --git a/runtime/onert/backend/gpu_cl/ClTensorRegistry.h b/runtime/onert/backend/gpu_cl/ClTensorRegistry.h
new file mode 100644
index 000000000..1f0018bd1
--- /dev/null
+++ b/runtime/onert/backend/gpu_cl/ClTensorRegistry.h
@@ -0,0 +1,55 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __ONERT_BACKEND_GPU_CL_TENSOR_REGISTRY_H__
+#define __ONERT_BACKEND_GPU_CL_TENSOR_REGISTRY_H__
+
+#include "backend/ITensorRegistry.h"
+
+namespace onert
+{
+namespace backend
+{
+namespace gpu_cl
+{
+
+/**
+ * @brief Tensor registry class for the gpu_cl backend
+ *
+ * This is implemented as a wrapper of ClTensorManager.
+ */
+template <typename T_ClTensorManager> class ClTensorRegistry : public ITensorRegistry
+{
+public:
+ ClTensorRegistry(T_ClTensorManager *tensor_mgr) : _tensor_mgr{tensor_mgr} {}
+
+ ITensor *getITensor(const ir::OperandIndex &ind) override { return _tensor_mgr->at(ind).get(); }
+
+ ITensor *getNativeITensor(const ir::OperandIndex &ind) override { return getITensor(ind); }
+
+ auto getClTensor(const ir::OperandIndex &ind) { return _tensor_mgr->at(ind).get(); }
+
+ auto getClTensorReserver(const ir::OperandIndex &ind) { return _tensor_mgr->atR(ind); }
+
+private:
+ T_ClTensorManager *_tensor_mgr;
+};
+
+} // namespace gpu_cl
+} // namespace backend
+} // namespace onert
+
+#endif // __ONERT_BACKEND_GPU_CL_TENSOR_REGISTRY_H__
diff --git a/runtime/onert/backend/gpu_cl/Config.cc b/runtime/onert/backend/gpu_cl/Config.cc
new file mode 100644
index 000000000..067a2070f
--- /dev/null
+++ b/runtime/onert/backend/gpu_cl/Config.cc
@@ -0,0 +1,48 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Config.h"
+
+#include <dlfcn.h>
+#include "open_cl/OpenclWrapper.h"
+#include "open_cl/Status.h"
+
+namespace onert
+{
+namespace backend
+{
+namespace gpu_cl
+{
+
+Config::~Config() { UnloadOpenCL(_handle); }
+
+bool Config::initialize() { return LoadOpenCL(&_handle).ok(); }
+
+ir::Layout Config::supportLayout(const ir::Operation &, ir::Layout) { return ir::Layout::NHWC; }
+
+} // namespace gpu_cl
+} // namespace backend
+} // namespace onert
diff --git a/runtime/onert/backend/gpu_cl/Config.h b/runtime/onert/backend/gpu_cl/Config.h
new file mode 100644
index 000000000..aa5a51a15
--- /dev/null
+++ b/runtime/onert/backend/gpu_cl/Config.h
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __ONERT_BACKEND_GPU_CL_CONFIG_H__
+#define __ONERT_BACKEND_GPU_CL_CONFIG_H__
+
+#include <backend/IConfig.h>
+#include <memory>
+#include <util/ITimer.h>
+
+namespace onert
+{
+namespace backend
+{
+namespace gpu_cl
+{
+
+class Config : public IConfig
+{
+public:
+ virtual ~Config();
+
+public:
+ std::string id() override { return "gpu_cl"; }
+ bool initialize() override;
+ ir::Layout supportLayout(const ir::Operation &node, ir::Layout frontend_layout) override;
+ bool supportPermutation() override { return true; }
+ bool supportDynamicTensor() override { return false; }
+ bool supportFP16() override { return true; }
+ std::unique_ptr<util::ITimer> timer() override { return std::make_unique<util::CPUTimer>(); }
+
+private:
+ void *_handle = nullptr;
+};
+
+} // namespace gpu_cl
+} // namespace backend
+} // namespace onert
+
+#endif // __ONERT_BACKEND_GPU_CL_CONFIG_H__
diff --git a/runtime/onert/backend/gpu_cl/ConstantInitializer.cc b/runtime/onert/backend/gpu_cl/ConstantInitializer.cc
new file mode 100644
index 000000000..7976abea9
--- /dev/null
+++ b/runtime/onert/backend/gpu_cl/ConstantInitializer.cc
@@ -0,0 +1,35 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "ConstantInitializer.h"
+
+namespace onert
+{
+namespace backend
+{
+namespace gpu_cl
+{
+
+ConstantInitializer::ConstantInitializer(const ir::Operands &operands,
+ const std::shared_ptr<ITensorRegistry> &tensor_reg)
+ : ClConstantInitializer{operands, tensor_reg}
+{
+ // DO NOTHING
+}
+
+} // namespace gpu_cl
+} // namespace backend
+} // namespace onert
diff --git a/runtime/onert/backend/gpu_cl/ConstantInitializer.h b/runtime/onert/backend/gpu_cl/ConstantInitializer.h
new file mode 100644
index 000000000..ce8131af2
--- /dev/null
+++ b/runtime/onert/backend/gpu_cl/ConstantInitializer.h
@@ -0,0 +1,43 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __ONERT_BACKEND_GPU_CL_CONSTANT_INITIALIZER_H__
+#define __ONERT_BACKEND_GPU_CL_CONSTANT_INITIALIZER_H__
+
+#include "ClConstantInitializer.h"
+
+namespace onert
+{
+namespace backend
+{
+namespace gpu_cl
+{
+
+class ConstantInitializer : public ClConstantInitializer
+{
+public:
+ ConstantInitializer(const ir::Operands &operands,
+ const std::shared_ptr<ITensorRegistry> &tensor_reg);
+
+public:
+ using ClConstantInitializer::visit;
+};
+
+} // namespace gpu_cl
+} // namespace backend
+} // namespace onert
+
+#endif // __ONERT_BACKEND_GPU_CL_CONSTANT_INITIALIZER_H__
diff --git a/runtime/onert/backend/gpu_cl/KernelGenerator.cc b/runtime/onert/backend/gpu_cl/KernelGenerator.cc
new file mode 100644
index 000000000..a84867f8c
--- /dev/null
+++ b/runtime/onert/backend/gpu_cl/KernelGenerator.cc
@@ -0,0 +1,593 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <algorithm>
+#include <cstring>
+#include <stdexcept>
+
+#include <backend/basic/KernelGeneratorBase.h>
+
+#include "KernelGenerator.h"
+
+#include "ClTensorRegistry.h"
+#include "ClFunction.h"
+#include "TensorManager.h"
+
+#include "open_cl/selectors/ConvolutionSelector.h"
+#include "open_cl/selectors/DwConvolutionSelector.h"
+#include "open_cl/selectors/SimpleSelectors.h"
+
+#include "ir/Operations.h"
+#include "ir/Operations.Include.h"
+#include "ir/Index.h"
+#include "ir/DataType.h"
+#include "ir/InternalType.h"
+#include "exec/NopFunction.h"
+#include "exec/FunctionSequence.h"
+#include "util/logging.h"
+#include "util/Utils.h"
+
+namespace onert
+{
+namespace backend
+{
+namespace gpu_cl
+{
+
+HW ToHW(int32_t h, int32_t w) { return HW(h > 0 ? h : 1, w > 0 ? w : 1); }
+
+template <typename AttrT>
+void UpdatePadding(const ir::PaddingType type, const BHWC &input_shape, AttrT *attr)
+{
+ if (type == ir::PaddingType::SAME)
+ {
+ attr->padding = CalculateSamePadding(input_shape, *attr);
+ }
+ else
+ {
+ attr->padding.prepended = HW(0, 0);
+ attr->padding.appended = HW(0, 0);
+ }
+}
+
+gpu_cl::PoolingType convertPoolType(ir::operation::Pool2D::PoolType type_ir)
+{
+ switch (type_ir)
+ {
+ case ir::operation::Pool2D::PoolType::AVG:
+ return gpu_cl::PoolingType::AVERAGE;
+ case ir::operation::Pool2D::PoolType::MAX:
+ return gpu_cl::PoolingType::MAX;
+ default:
+ throw std::runtime_error("gpu_Cl KernelGenerator : Not supported operation yet");
+ }
+}
+
+KernelGenerator::KernelGenerator(const ir::Graph &graph,
+ const std::shared_ptr<TensorBuilder> &tensor_builder,
+ const std::shared_ptr<ClTensorRegistry<TensorManager>> &tensor_reg,
+ const std::shared_ptr<CreationContext> &creation_context)
+ : basic::KernelGeneratorBase{graph}, _ctx(graph.operands()),
+ _operations_ctx(graph.operations()), _current_layout{graph.layout()},
+ _tensor_builder(tensor_builder), _tensor_reg(tensor_reg), _creation_context(creation_context)
+{
+}
+
+std::unique_ptr<exec::FunctionSequence> KernelGenerator::generate(ir::OperationIndex ind)
+{
+ auto ret = std::make_unique<exec::FunctionSequence>();
+ ret->enableDynamicShapeInferer(false);
+
+ const auto &op = _graph.operations().at(ind);
+ op.accept(*this);
+ ret->append(releaseFunction());
+ return ret;
+}
+
+void KernelGenerator::visit(const ir::operation::BinaryArithmetic &node)
+{
+ const auto ofm_index{node.getOutputs().at(0)};
+ const auto lhs_index{node.getInputs().at(ir::operation::BinaryArithmetic::Input::LHS)};
+ const auto rhs_index{node.getInputs().at(ir::operation::BinaryArithmetic::Input::RHS)};
+
+ // const auto activation = node.param().activation;
+
+ OperationDef op_def;
+ op_def.precision = CalculationsPrecision::F32;
+
+ op_def.src_tensors.push_back(_tensor_reg->getClTensorReserver(lhs_index)->descriptor);
+ auto lhs_shape = _tensor_reg->getClTensorReserver(lhs_index)->shape;
+
+ op_def.src_tensors.push_back(_tensor_reg->getClTensorReserver(rhs_index)->descriptor);
+ auto rhs_shape = _tensor_reg->getClTensorReserver(rhs_index)->shape;
+
+ op_def.dst_tensors.push_back(_tensor_reg->getClTensorReserver(ofm_index)->descriptor);
+ auto out_shape = _tensor_reg->getClTensorReserver(ofm_index)->shape;
+
+ auto fn = std::make_unique<ClFunction>();
+
+ std::unique_ptr<GPUOperation> gpu_op;
+ switch (node.param().arithmetic_type)
+ {
+ case ir::operation::BinaryArithmetic::ArithmeticType::ADD:
+ {
+ std::vector<int> channels(2);
+ channels[0] = lhs_shape.c;
+ channels[1] = rhs_shape.c;
+ SelectAdd(op_def, channels, out_shape.c, &gpu_op);
+
+ auto ofm_tensor = _tensor_reg->getClTensor(ofm_index);
+ auto lhs_tensor = _tensor_reg->getClTensor(lhs_index);
+ auto rhs_tensor = _tensor_reg->getClTensor(rhs_index);
+ gpu_op->SetSrc(lhs_tensor->handle(), ir::operation::BinaryArithmetic::Input::LHS);
+ gpu_op->SetSrc(rhs_tensor->handle(), ir::operation::BinaryArithmetic::Input::RHS);
+ gpu_op->SetDst(ofm_tensor->handle(), 0);
+
+ fn->configure(_creation_context);
+ fn->add_operation(std::move(gpu_op));
+ break;
+ }
+ case ir::operation::BinaryArithmetic::ArithmeticType::SUB:
+ {
+ // NYI
+ break;
+ }
+ case ir::operation::BinaryArithmetic::ArithmeticType::MUL:
+ {
+ // NYI
+ break;
+ }
+ case ir::operation::BinaryArithmetic::ArithmeticType::DIV:
+ {
+ // NYI
+ break;
+ }
+ default:
+ assert(false && "The BinaryArithmetic operation supports only binary arithmetic operations");
+ break;
+ }
+
+ _return_fn = std::move(fn);
+}
+
+void KernelGenerator::visit(const ir::operation::Conv2D &node)
+{
+ auto output{node.getOutputs().at(0)};
+
+ auto input{node.getInputs().at(ir::operation::Conv2D::INPUT)};
+ auto kernel{node.getInputs().at(ir::operation::Conv2D::KERNEL)};
+ auto bias{node.getInputs().at(ir::operation::Conv2D::BIAS)};
+
+ const auto param = node.param();
+
+ OperationDef op_def;
+ op_def.precision = CalculationsPrecision::F32;
+
+ op_def.src_tensors.push_back(_tensor_reg->getClTensorReserver(input)->descriptor);
+
+ auto input_shape = _tensor_reg->getClTensorReserver(input)->shape;
+ auto kernel_shape = _tensor_reg->getClTensorReserver(kernel)->shape;
+ auto output_shape = _tensor_reg->getClTensorReserver(output)->shape;
+ auto bias_shape = _tensor_reg->getClTensorReserver(bias)->shape;
+
+ op_def.dst_tensors.push_back(_tensor_reg->getClTensorReserver(output)->descriptor);
+
+ ModelHints hints;
+ std::unique_ptr<GPUOperation> gpu_op; // = InitSingleOpSubgraph(inputs, outputs, gpu_subgraph);
+
+ auto input_tensor = _tensor_reg->getClTensor(input);
+ auto kernel_tensor = _tensor_reg->getClTensor(kernel);
+ auto bias_tensor = _tensor_reg->getClTensor(bias);
+ auto output_tensor = _tensor_reg->getClTensor(output);
+
+ gpu_cl::Convolution2DAttributes attr;
+ attr.strides = ToHW(param.stride.vertical, param.stride.horizontal);
+  attr.dilations = HW(std::max(static_cast<uint32_t>(1), param.dilation.height_factor),
+                      std::max(static_cast<uint32_t>(1), param.dilation.width_factor));
+
+  bool is_weight = _ctx.at(kernel).isConstant();
+
+ if (is_weight)
+ {
+ attr.weights.id = kernel.value();
+ attr.weights.shape.o = kernel_shape.b;
+ attr.weights.shape.h = kernel_shape.h;
+ attr.weights.shape.w = kernel_shape.w;
+ attr.weights.shape.i = kernel_shape.c;
+ attr.weights.data.resize(kernel_shape.DimensionsProduct());
+ memcpy(attr.weights.data.data(), _ctx.at(kernel).data()->base(), kernel_tensor->total_size());
+ }
+
+ attr.bias.id = bias.value();
+ // TODO Modify
+ attr.bias.shape.v = bias_shape.b != 1 ? bias_shape.b : bias_shape.c;
+ attr.bias.data.resize(bias_shape.DimensionsProduct());
+ memcpy(attr.bias.data.data(), _ctx.at(bias).data()->base(), bias_tensor->total_size());
+
+ UpdatePadding(param.padding.type, input_shape, &attr);
+
+ gpu_op = SelectConvolution(attr, output_shape, _creation_context->GetDeviceInfo(), op_def, hints);
+ gpu_op->SetSrc(input_tensor->handle(), ir::operation::Conv2D::INPUT);
+
+ auto fn = std::make_unique<ClFunction>();
+
+ fn->configure(_creation_context);
+
+ const auto activation = node.param().activation;
+
+ switch (activation)
+ {
+ case ir::Activation::NONE:
+ {
+ gpu_op->SetDst(output_tensor->handle(), 0);
+ fn->add_operation(std::move(gpu_op));
+ break;
+ }
+ case ir::Activation::RELU6:
+ {
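+      // RELU6 is not fused into the convolution: write the result to an intermediate
+      // tensor, then append a separate ReLU operation with clip = 6.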
+ std::unique_ptr<GPUOperation> gpu_op_1;
+ OperationDef op_def_1;
+ std::shared_ptr<Tensor> new_tensor = std::make_shared<Tensor>();
+
+ _new_tensors[output] = new_tensor;
+ if (!CreateTensor(*_creation_context->context, output_shape,
+ _tensor_reg->getClTensorReserver(output)->descriptor, new_tensor.get())
+ .ok())
+ {
+ throw std::runtime_error("Error CreateTensor.");
+ }
+
+ gpu_op->SetDst(new_tensor.get(), 0);
+ fn->add_operation(std::move(gpu_op));
+ op_def_1.precision = CalculationsPrecision::F32;
+ op_def_1.src_tensors.push_back(_tensor_reg->getClTensorReserver(output)->descriptor);
+ op_def_1.dst_tensors.push_back(_tensor_reg->getClTensorReserver(output)->descriptor);
+
+ // - ReLU6: clip = 6, alpha = 0
+ ReLUAttributes attr_1;
+ attr_1.clip = 6;
+ attr_1.alpha = 0;
+ gpu_op_1 = SelectReLU(attr_1, op_def_1);
+
+ gpu_op_1->SetSrc(new_tensor.get(), 0);
+ gpu_op_1->SetDst(output_tensor->handle(), 0);
+ fn->add_operation(std::move(gpu_op_1));
+ break;
+ }
+ default:
+ {
+ throw std::runtime_error("gpu_cl KernelGenerator : Not supported operation yet");
+ }
+ }
+
+ _return_fn = std::move(fn);
+}
+
+void KernelGenerator::visit(const ir::operation::DepthwiseConv2D &node)
+{
+ using ir::operation::DepthwiseConv2D;
+
+ const auto ofm_index{node.getOutputs().at(0)};
+ const auto ifm_index{node.getInputs().at(DepthwiseConv2D::Input::INPUT)};
+ const auto ker_index{node.getInputs().at(DepthwiseConv2D::Input::KERNEL)};
+ const auto bias_index{node.getInputs().at(DepthwiseConv2D::Input::BIAS)};
+
+ const auto stride = node.param().stride;
+ const auto dilation = node.param().dilation;
+ const auto padding = node.param().padding;
+
+ const auto multiplier = node.param().multiplier;
+
+ auto ofm_tensor = _tensor_reg->getClTensor(ofm_index);
+ auto ifm_tensor = _tensor_reg->getClTensor(ifm_index);
+ auto ker_tensor = _tensor_reg->getClTensor(ker_index);
+ auto bias_tensor = _tensor_reg->getClTensor(bias_index);
+
+  bool is_weight = _ctx.at(ker_index).isConstant();
+ OperationDef op_def;
+ op_def.precision = CalculationsPrecision::F32;
+
+ op_def.src_tensors.push_back(_tensor_reg->getClTensorReserver(ifm_index)->descriptor);
+ auto input_shape = _tensor_reg->getClTensorReserver(ifm_index)->shape;
+
+ auto ker_shape = _tensor_reg->getClTensorReserver(ker_index)->shape;
+
+ op_def.dst_tensors.push_back(_tensor_reg->getClTensorReserver(ofm_index)->descriptor);
+ auto out_shape = _tensor_reg->getClTensorReserver(ofm_index)->shape;
+ auto bias_shape = _tensor_reg->getClTensorReserver(bias_index)->shape;
+
+ DepthwiseConvolution2DAttributes attr;
+ attr.strides = ToHW(stride.vertical, stride.horizontal);
+  attr.dilations = HW(std::max(static_cast<uint32_t>(1), dilation.height_factor),
+                      std::max(static_cast<uint32_t>(1), dilation.width_factor));
+
+ if (is_weight)
+ {
+ attr.weights.id = ker_index.value();
+ attr.weights.shape.o = ker_shape.b;
+ attr.weights.shape.h = ker_shape.h;
+ attr.weights.shape.w = ker_shape.w;
+ attr.weights.shape.i = ker_shape.c;
+ attr.weights.data.resize(ker_shape.DimensionsProduct());
+ memcpy(attr.weights.data.data(), _ctx.at(ker_index).data()->base(), ker_tensor->total_size());
+ }
+ attr.bias.id = bias_index.value();
+ attr.bias.shape.v = bias_shape.b != 1 ? bias_shape.b : bias_shape.c;
+ attr.bias.data.resize(bias_shape.DimensionsProduct());
+ memcpy(attr.bias.data.data(), _ctx.at(bias_index).data()->base(), bias_tensor->total_size());
+ UpdatePadding(padding.type, input_shape, &attr);
+
+ if (multiplier != 1)
+ {
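+    // Repack the runtime depthwise weights: for each output channel j, gather elements
+    // j, j + output_depth, ... into a contiguous HxW block.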
+ const int input_depth = input_shape.c;
+ const int filter_height = ker_shape.h;
+ const int filter_width = ker_shape.w;
+ const int output_depth = out_shape.c;
+
+ InternalTensor<OHWI, DataType::FLOAT32> weights;
+ weights.id = attr.weights.id;
+ weights.shape = OHWI(output_depth, filter_height, filter_width, input_depth);
+ weights.data.resize(weights.shape.DimensionsProduct());
+ float *dst = &weights.data[0];
+ for (int j = 0; j < output_depth; ++j)
+ {
+ const float *src = attr.weights.data.data() + j;
+ for (int i = 0; i < filter_height * filter_width; ++i)
+ {
+ *dst = *src;
+ dst++;
+ src += output_depth;
+ }
+ }
+ attr.weights = std::move(weights);
+ }
+
+ auto fn = std::make_unique<ClFunction>();
+ std::unique_ptr<GPUOperation> gpu_op;
+
+ if (is_weight)
+ {
+ gpu_op = SelectDWConvolution(attr, _creation_context->GetDeviceInfo(), op_def);
+ }
+ else
+ {
+ if (ker_shape.b != 1)
+ {
+ throw std::runtime_error(
+ "No support of depthwise runtime weights with channel multiplier != 1");
+ }
+ gpu_op = SelectDWConvolutionDynamicWeights(attr, _creation_context->GetDeviceInfo(), op_def);
+ }
+
+ gpu_op->SetSrc(ifm_tensor->handle(), ir::operation::DepthwiseConv2D::Input::INPUT);
+
+ fn->configure(_creation_context);
+
+ const auto activation = node.param().activation;
+
+ switch (activation)
+ {
+ case ir::Activation::NONE:
+ {
+ gpu_op->SetDst(ofm_tensor->handle(), 0);
+ fn->add_operation(std::move(gpu_op));
+ break;
+ }
+ case ir::Activation::RELU6:
+ {
+ std::unique_ptr<GPUOperation> gpu_op_1;
+ OperationDef op_def_1;
+ std::shared_ptr<Tensor> new_tensor = std::make_shared<Tensor>();
+
+ _new_tensors[ofm_index] = new_tensor;
+ if (!CreateTensor(*_creation_context->context, out_shape,
+ _tensor_reg->getClTensorReserver(ofm_index)->descriptor, new_tensor.get())
+ .ok())
+ {
+ throw std::runtime_error("Error CreateTensor.");
+ }
+
+ gpu_op->SetDst(new_tensor.get(), 0);
+ fn->add_operation(std::move(gpu_op));
+ op_def_1.precision = CalculationsPrecision::F32;
+ op_def_1.src_tensors.push_back(_tensor_reg->getClTensorReserver(ofm_index)->descriptor);
+ op_def_1.dst_tensors.push_back(_tensor_reg->getClTensorReserver(ofm_index)->descriptor);
+
+ // - ReLU6: clip = 6, alpha = 0
+ ReLUAttributes attr_1;
+ attr_1.clip = 6;
+ attr_1.alpha = 0;
+ gpu_op_1 = SelectReLU(attr_1, op_def_1);
+
+ gpu_op_1->SetSrc(new_tensor.get(), 0);
+ gpu_op_1->SetDst(ofm_tensor->handle(), 0);
+ fn->add_operation(std::move(gpu_op_1));
+ break;
+ }
+ default:
+ {
+ throw std::runtime_error("gpu_cl KernelGenerator : Not supported operation yet");
+ }
+ }
+
+ _return_fn = std::move(fn);
+}
+
+void KernelGenerator::visit(const ir::operation::ElementwiseActivation &node)
+{
+ std::unique_ptr<GPUOperation> gpu_op;
+ auto fn = std::make_unique<ClFunction>();
+
+ switch (node.param().op_type)
+ {
+ case ir::operation::ElementwiseActivation::Type::LEAKY_RELU:
+ case ir::operation::ElementwiseActivation::Type::RELU:
+ {
+ const auto output_index{node.getOutputs().at(0)};
+ const auto input_index{
+ node.getInputs().at(ir::operation::ElementwiseActivation::Input::INPUT)};
+
+ OperationDef op_def;
+ op_def.precision = CalculationsPrecision::F32;
+ auto output_tensor = _tensor_reg->getClTensor(output_index);
+ auto input_tensor = _tensor_reg->getClTensor(input_index);
+ op_def.dst_tensors.push_back(_tensor_reg->getClTensorReserver(output_index)->descriptor);
+ op_def.src_tensors.push_back(_tensor_reg->getClTensorReserver(input_index)->descriptor);
+
+ ReLUAttributes attr;
+ if (ir::operation::ElementwiseActivation::Type::LEAKY_RELU == node.param().op_type)
+ {
+ attr.alpha = node.param().alpha;
+ attr.clip = 0;
+ }
+ else
+ {
+ attr.alpha = node.param().beta;
+ attr.clip = node.param().alpha;
+ }
+ gpu_op = SelectReLU(attr, op_def);
+ gpu_op->SetSrc(input_tensor->handle(), ir::operation::ElementwiseActivation::Input::INPUT);
+ gpu_op->SetDst(output_tensor->handle(), 0);
+ fn->configure(_creation_context);
+ fn->add_operation(std::move(gpu_op));
+
+ _return_fn = std::move(fn);
+ break;
+ }
+ default:
+ throw std::runtime_error("gpu_cl KernelGenerator : Not supported operation yet");
+ }
+}
+
+void KernelGenerator::visit(const ir::operation::Pool2D &node)
+{
+ const auto output_index{node.getOutputs().at(0)};
+ const auto input_index{node.getInputs().at(ir::operation::Pool2D::Input::INPUT)};
+
+ OperationDef op_def;
+ op_def.precision = CalculationsPrecision::F32;
+
+ op_def.src_tensors.push_back(_tensor_reg->getClTensorReserver(input_index)->descriptor);
+ auto input_shape = _tensor_reg->getClTensorReserver(input_index)->shape;
+
+ op_def.dst_tensors.push_back(_tensor_reg->getClTensorReserver(output_index)->descriptor);
+
+ const auto kh = node.param().kh;
+ const auto kw = node.param().kw;
+ const auto stride = node.param().stride;
+ const auto op_type = convertPoolType(node.param().op_type);
+
+ Pooling2DAttributes attributes;
+ attributes.type = op_type;
+ attributes.kernel = HW(kh > 0 ? kh : 1, kw > 0 ? kw : 1);
+ attributes.strides =
+ HW(stride.vertical > 0 ? stride.vertical : 1, stride.horizontal > 0 ? stride.horizontal : 1);
+
+ if (node.param().padding.type == ir::PaddingType::SAME)
+ {
+ attributes.padding = CalculateSamePadding(input_shape, attributes);
+ }
+ else
+ {
+ attributes.padding.prepended = HW(0, 0);
+ attributes.padding.appended = HW(0, 0);
+ }
+
+ auto fn = std::make_unique<ClFunction>();
+ std::unique_ptr<GPUOperation> gpu_op;
+ gpu_op = SelectPooling(attributes, op_def);
+
+ auto input_tensor = _tensor_reg->getClTensor(input_index);
+ auto output_tensor = _tensor_reg->getClTensor(output_index);
+
+ gpu_op->SetSrc(input_tensor->handle(), ir::operation::Pool2D::Input::INPUT);
+ gpu_op->SetDst(output_tensor->handle(), 0);
+
+ fn->configure(_creation_context);
+ fn->add_operation(std::move(gpu_op));
+
+ _return_fn = std::move(fn);
+}
+
+void KernelGenerator::visit(const ir::operation::Reshape &node)
+{
+ const auto output_index{node.getOutputs().at(0)};
+ const auto input_index{node.getInputs().at(ir::operation::Reshape::Input::INPUT)};
+
+ OperationDef op_def;
+ op_def.precision = CalculationsPrecision::F32;
+
+ op_def.src_tensors.push_back(_tensor_reg->getClTensorReserver(input_index)->descriptor);
+ auto input_shape = _tensor_reg->getClTensorReserver(input_index)->shape;
+
+ op_def.dst_tensors.push_back(_tensor_reg->getClTensorReserver(output_index)->descriptor);
+ auto output_shape = _tensor_reg->getClTensorReserver(output_index)->shape;
+
+ ReshapeAttributes attr;
+ attr.new_shape = output_shape;
+
+ auto fn = std::make_unique<ClFunction>();
+ std::unique_ptr<GPUOperation> gpu_op;
+ const int src_channels = input_shape.c;
+ SelectReshape(src_channels, attr.new_shape.c, op_def, &gpu_op);
+
+ auto input_tensor = _tensor_reg->getClTensor(input_index);
+ auto output_tensor = _tensor_reg->getClTensor(output_index);
+ gpu_op->SetSrc(input_tensor->handle(), ir::operation::Reshape::Input::INPUT);
+ gpu_op->SetDst(output_tensor->handle(), 0);
+
+ fn->configure(_creation_context);
+ fn->add_operation(std::move(gpu_op));
+
+ _return_fn = std::move(fn);
+}
+
+void KernelGenerator::visit(const ir::operation::Softmax &node)
+{
+ const auto output_index{node.getOutputs().at(0)};
+ const auto input_index{node.getInputs().at(ir::operation::Softmax::Input::INPUT)};
+
+ const auto beta = node.param().beta;
+
+ if (beta != 1.0)
+ {
+ throw std::runtime_error("Softmax.beta != 1 is not supported in gpu_cl");
+ }
+
+ OperationDef op_def;
+ op_def.precision = CalculationsPrecision::F32;
+
+ op_def.dst_tensors.push_back(_tensor_reg->getClTensorReserver(output_index)->descriptor);
+
+ op_def.src_tensors.push_back(_tensor_reg->getClTensorReserver(input_index)->descriptor);
+ auto input_shape = _tensor_reg->getClTensorReserver(input_index)->shape;
+
+ auto fn = std::make_unique<ClFunction>();
+
+ std::unique_ptr<GPUOperation> gpu_op;
+ SelectSoftmax(input_shape, op_def, &gpu_op);
+ auto output_tensor = _tensor_reg->getClTensor(output_index);
+ auto input_tensor = _tensor_reg->getClTensor(input_index);
+
+ gpu_op->SetSrc(input_tensor->handle(), ir::operation::Softmax::Input::INPUT);
+ gpu_op->SetDst(output_tensor->handle(), 0);
+
+ fn->configure(_creation_context);
+ fn->add_operation(std::move(gpu_op));
+
+ _return_fn = std::move(fn);
+}
+
+} // namespace gpu_cl
+} // namespace backend
+} // namespace onert
diff --git a/runtime/onert/backend/gpu_cl/KernelGenerator.h b/runtime/onert/backend/gpu_cl/KernelGenerator.h
new file mode 100644
index 000000000..3e341b111
--- /dev/null
+++ b/runtime/onert/backend/gpu_cl/KernelGenerator.h
@@ -0,0 +1,70 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __ONERT_BACKEND_GPU_CL_KERNEL_GENERATOR_H__
+#define __ONERT_BACKEND_GPU_CL_KERNEL_GENERATOR_H__
+
+#include "ClTensorRegistry.h"
+#include "backend/basic/TensorRegistry.h"
+#include "TensorBuilder.h"
+#include "TensorManager.h"
+
+#include <backend/CustomKernelBuilder.h>
+#include <backend/basic/KernelGeneratorBase.h>
+#include <ir/Operands.h>
+#include <ir/Operations.h>
+#include <ir/Operations.Include.h>
+
+namespace onert
+{
+namespace backend
+{
+namespace gpu_cl
+{
+
+class KernelGenerator : public basic::KernelGeneratorBase
+{
+public:
+ KernelGenerator(const ir::Graph &graph, const std::shared_ptr<TensorBuilder> &tensor_builder,
+ const std::shared_ptr<ClTensorRegistry<TensorManager>> &tensor_reg,
+ const std::shared_ptr<CreationContext> &creation_context);
+
+ std::unique_ptr<exec::FunctionSequence> generate(ir::OperationIndex ind) override;
+
+private:
+ void visit(const ir::operation::BinaryArithmetic &) override;
+ void visit(const ir::operation::Conv2D &) override;
+ void visit(const ir::operation::DepthwiseConv2D &) override;
+ void visit(const ir::operation::ElementwiseActivation &) override;
+ void visit(const ir::operation::Pool2D &) override;
+ void visit(const ir::operation::Reshape &) override;
+ void visit(const ir::operation::Softmax &) override;
+
+private:
+ const ir::Operands &_ctx;
+ const ir::Operations &_operations_ctx;
+ ir::Layout _current_layout;
+ std::shared_ptr<TensorBuilder> _tensor_builder;
+ std::shared_ptr<ClTensorRegistry<TensorManager>> _tensor_reg;
+ std::shared_ptr<CreationContext> _creation_context;
+ ir::OperandIndexMap<std::shared_ptr<Tensor>> _new_tensors;
+};
+
+} // namespace gpu_cl
+} // namespace backend
+} // namespace onert
+
+#endif // __ONERT_BACKEND_GPU_CL_KERNEL_GENERATOR_H__
diff --git a/runtime/onert/backend/gpu_cl/ParentInfo.h b/runtime/onert/backend/gpu_cl/ParentInfo.h
new file mode 100644
index 000000000..d7cb2d4fb
--- /dev/null
+++ b/runtime/onert/backend/gpu_cl/ParentInfo.h
@@ -0,0 +1,44 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __ONERT_BACKEND_GPU_CL_PARENT_INFO_H__
+#define __ONERT_BACKEND_GPU_CL_PARENT_INFO_H__
+
+#include <ir/Index.h>
+#include <ir/Coordinates.h>
+
+namespace onert
+{
+namespace backend
+{
+namespace gpu_cl
+{
+
+/**
+ * @brief Struct to represent parent operand in child operand
+ */
+struct ParentInfo
+{
+ ir::OperandIndex parent;
+ ir::Layout frontend_layout;
+ ir::Coordinates coordinates;
+};
+
+} // namespace gpu_cl
+} // namespace backend
+} // namespace onert
+
+#endif // __ONERT_BACKEND_GPU_CL_PARENT_INFO_H__
diff --git a/runtime/onert/backend/gpu_cl/TensorBuilder.h b/runtime/onert/backend/gpu_cl/TensorBuilder.h
new file mode 100644
index 000000000..d55358191
--- /dev/null
+++ b/runtime/onert/backend/gpu_cl/TensorBuilder.h
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __ONERT_BACKEND_GPU_CL_TENSOR_BUILDER_H__
+#define __ONERT_BACKEND_GPU_CL_TENSOR_BUILDER_H__
+
+#include <backend/basic/TensorBuilder.h>
+#include "operand/ICLTensor.h"
+#include "operand/CLTensor.h"
+#include "ClTensorBuilder.h"
+
+namespace onert
+{
+namespace backend
+{
+namespace gpu_cl
+{
+
+using TensorBuilder = ClTensorBuilder<operand::ICLTensor, operand::CLTensor>;
+
+} // namespace gpu_cl
+} // namespace backend
+} // namespace onert
+
+#endif // __ONERT_BACKEND_GPU_CL_TENSOR_BUILDER_H__
diff --git a/runtime/onert/backend/gpu_cl/TensorManager.h b/runtime/onert/backend/gpu_cl/TensorManager.h
new file mode 100644
index 000000000..111b5f8a7
--- /dev/null
+++ b/runtime/onert/backend/gpu_cl/TensorManager.h
@@ -0,0 +1,48 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __ONERT_BACKEND_CL_TENSOR_MANAGER_H__
+#define __ONERT_BACKEND_CL_TENSOR_MANAGER_H__
+
+#include "ClMemoryManager.h"
+#include "ClTensorManager.h"
+#include "open_cl/ClContext.h"
+#include "operand/CLTensor.h"
+#include "operand/ICLTensor.h"
+#include "util/logging.h"
+
+namespace onert
+{
+namespace backend
+{
+namespace gpu_cl
+{
+
+using MemoryManager = ClMemoryManager<operand::ICLTensor, operand::CLTensor>;
+
+using TensorManager = ClTensorManager<operand::ICLTensor, operand::CLTensor>;
+
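+// Creates a TensorManager whose two MemoryManagers back constant and non-constant tensors
+// respectively (see ClTensorManager::buildTensor).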
+inline TensorManager *createTensorManager(CLContext *context)
+{
+ VERBOSE(createTensorManager) << "ClTensorManager" << std::endl;
+ return new TensorManager(new MemoryManager(context), new MemoryManager(context));
+}
+
+} // namespace gpu_cl
+} // namespace backend
+} // namespace onert
+
+#endif // __ONERT_BACKEND_CL_TENSOR_MANAGER_H__
diff --git a/runtime/onert/backend/gpu_cl/gpu_cl.cc b/runtime/onert/backend/gpu_cl/gpu_cl.cc
new file mode 100644
index 000000000..b771d6d29
--- /dev/null
+++ b/runtime/onert/backend/gpu_cl/gpu_cl.cc
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Backend.h"
+
+#include <util/logging.h>
+
+extern "C" {
+onert::backend::Backend *onert_backend_create()
+{
+ VERBOSE(onert_backend_create) << "'gpu_cl' loaded\n";
+ return new onert::backend::gpu_cl::Backend;
+}
+
+void onert_backend_destroy(onert::backend::Backend *backend)
+{
+ VERBOSE(onert_backend_destroy) << "'gpu_cl' unloaded\n";
+ delete backend;
+}
+}
diff --git a/runtime/onert/backend/gpu_cl/open_cl/AccessType.h b/runtime/onert/backend/gpu_cl/open_cl/AccessType.h
new file mode 100644
index 000000000..81efd666f
--- /dev/null
+++ b/runtime/onert/backend/gpu_cl/open_cl/AccessType.h
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __ONERT_BACKEND_GPU_CL_OPENCL_ACCESS_TYPE_H__
+#define __ONERT_BACKEND_GPU_CL_OPENCL_ACCESS_TYPE_H__
+
+namespace onert
+{
+namespace backend
+{
+namespace gpu_cl
+{
+enum class AccessType
+{
+ UNKNOWN,
+ READ,
+ WRITE,
+ READ_WRITE,
+};
+
+} // namespace gpu_cl
+} // namespace backend
+} // namespace onert
+
+#endif // __ONERT_BACKEND_GPU_CL_OPENCL_ACCESS_TYPE_H__
diff --git a/runtime/onert/backend/gpu_cl/open_cl/Api.cc b/runtime/onert/backend/gpu_cl/open_cl/Api.cc
new file mode 100644
index 000000000..10bf87c38
--- /dev/null
+++ b/runtime/onert/backend/gpu_cl/open_cl/Api.cc
@@ -0,0 +1,202 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Api.h"
+
+namespace onert
+{
+namespace backend
+{
+namespace gpu_cl
+{
+namespace
+{
+
+struct ObjectTypeGetter
+{
+ ObjectType operator()(absl::monostate) const { return ObjectType::UNKNOWN; }
+ ObjectType operator()(OpenClBuffer) const { return ObjectType::OPENCL_BUFFER; }
+ ObjectType operator()(OpenClTexture) const { return ObjectType::OPENCL_TEXTURE; }
+ ObjectType operator()(CpuMemory) const { return ObjectType::CPU_MEMORY; }
+};
+
+struct ObjectValidityChecker
+{
+ bool operator()(absl::monostate) const { return false; }
+ bool operator()(OpenClBuffer obj) const { return obj.memobj; }
+ bool operator()(OpenClTexture obj) const { return obj.memobj; }
+ bool operator()(CpuMemory obj) const
+ {
+ return obj.data != nullptr && obj.size_bytes > 0 &&
+ (data_type == DataType::UNKNOWN || obj.size_bytes % SizeOf(data_type) == 0);
+ }
+ DataType data_type;
+};
+
+} // namespace
+
+bool IsValid(const ObjectDef &def)
+{
+ return def.data_type != DataType::UNKNOWN && def.data_layout != DataLayout::UNKNOWN &&
+ def.object_type != ObjectType::UNKNOWN;
+}
+
+ObjectType GetType(const TensorObject &object) { return absl::visit(ObjectTypeGetter{}, object); }
+
+bool IsValid(const TensorObjectDef &def) { return IsValid(def.object_def); }
+
+bool IsValid(const TensorObjectDef &def, const TensorObject &object)
+{
+ return GetType(object) == def.object_def.object_type &&
+ absl::visit(ObjectValidityChecker{def.object_def.data_type}, object);
+}
+
+bool IsObjectPresent(ObjectType type, const TensorObject &obj)
+{
+ switch (type)
+ {
+ case ObjectType::CPU_MEMORY:
+ return absl::holds_alternative<CpuMemory>(obj);
+ case ObjectType::OPENCL_BUFFER:
+ return absl::holds_alternative<OpenClBuffer>(obj);
+ case ObjectType::OPENCL_TEXTURE:
+ return absl::holds_alternative<OpenClTexture>(obj);
+ case ObjectType::UNKNOWN:
+ return false;
+ }
+ return false;
+}
+
+uint32_t NumElements(const TensorObjectDef &def)
+{
+ const auto &d = def.dimensions;
+ switch (def.object_def.data_layout)
+ {
+ case DataLayout::BHWC:
+ return d.product();
+ case DataLayout::HWDC4:
+ case DataLayout::HDWC4:
+ case DataLayout::DHWC4:
+ return d.b * d.h * d.w * AlignByN(d.c, 4);
+ case DataLayout::UNKNOWN:
+ return 0;
+ }
+ return 0;
+}
+
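+// Illustrative example: for a DHWC4 tensor object with b = 1, h = 8, w = 8,
+// c = 10, NumElements returns 1 * 8 * 8 * AlignByN(10, 4) = 768, since
+// channels are padded to a multiple of 4.
+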
+int GetPosition(const InferenceOptions &options, InferencePriority p)
+{
+ if (options.priority1 == p)
+ return 1;
+ if (options.priority2 == p)
+ return 2;
+ if (options.priority3 == p)
+ return 3;
+ return 4; // least important
+}
+
+PriorityImportance GetRelativeImportance(const InferenceOptions &options, InferencePriority p1,
+ InferencePriority p2)
+{
+ int p1_position = GetPosition(options, p1);
+ int p2_position = GetPosition(options, p2);
+ if (p1_position == p2_position)
+ return PriorityImportance::UNKNOWN;
+ return p1_position < p2_position ? PriorityImportance::HIGHER : PriorityImportance::LOWER;
+}
+
+bool IsValid(const InferenceOptions &options)
+{
+ if (options.usage == InferenceUsage::UNKNOWN)
+ {
+ return false;
+ }
+ if (options.priority1 == InferencePriority::UNKNOWN ||
+ options.priority2 == InferencePriority::UNKNOWN ||
+ options.priority3 == InferencePriority::UNKNOWN)
+ {
+ return false;
+ }
+ if (options.priority1 == InferencePriority::AUTO)
+ {
+ return false;
+ }
+ if (options.priority2 == InferencePriority::AUTO && options.priority3 != InferencePriority::AUTO)
+ {
+ return false;
+ }
+ if (options.priority1 == options.priority2 || options.priority1 == options.priority3)
+ {
+ return false;
+ }
+ if (options.priority2 == options.priority3 && options.priority2 != InferencePriority::AUTO)
+ {
+ return false;
+ }
+ return true;
+}
+
+// Implementation note: this resolution logic is shared between GL and CL
+// backends, but each might have its own logic. Thus, the function is defined
+// here just for code re-use purposes.
+void ResolveAutoPriority(InferenceOptions *options)
+{
+ // priority1 can not be AUTO as it would make options invalid.
+ if (options->priority2 == InferencePriority::AUTO)
+ {
+ switch (options->priority1)
+ {
+ case InferencePriority::MIN_LATENCY:
+ options->priority2 = InferencePriority::MIN_MEMORY_USAGE;
+ options->priority3 = InferencePriority::MAX_PRECISION;
+ return;
+ case InferencePriority::MIN_MEMORY_USAGE:
+ options->priority2 = InferencePriority::MAX_PRECISION;
+ options->priority3 = InferencePriority::MIN_LATENCY;
+ return;
+ case InferencePriority::MAX_PRECISION:
+ options->priority2 = InferencePriority::MIN_LATENCY;
+ options->priority3 = InferencePriority::MIN_MEMORY_USAGE;
+ return;
+ case InferencePriority::UNKNOWN:
+ case InferencePriority::AUTO:
+ // Invalid and unreachable option.
+ return;
+ }
+ }
+
+ if (options->priority3 == InferencePriority::AUTO)
+ {
+ // Simply add missing priority
+ if (GetPosition(*options, InferencePriority::MIN_LATENCY) == 4)
+ {
+ options->priority3 = InferencePriority::MIN_LATENCY;
+ }
+ else if (GetPosition(*options, InferencePriority::MAX_PRECISION) == 4)
+ {
+ options->priority3 = InferencePriority::MAX_PRECISION;
+ }
+ else if (GetPosition(*options, InferencePriority::MIN_MEMORY_USAGE) == 4)
+ {
+ options->priority3 = InferencePriority::MIN_MEMORY_USAGE;
+ }
+ }
+}
+
+} // namespace gpu_cl
+} // namespace backend
+} // namespace onert
diff --git a/runtime/onert/backend/gpu_cl/open_cl/Api.h b/runtime/onert/backend/gpu_cl/open_cl/Api.h
new file mode 100644
index 000000000..35be3d99c
--- /dev/null
+++ b/runtime/onert/backend/gpu_cl/open_cl/Api.h
@@ -0,0 +1,359 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __ONERT_BACKEND_GPU_CL_OPENCL_API_H__
+#define __ONERT_BACKEND_GPU_CL_OPENCL_API_H__
+
+// Usage example:
+//
+// // Builder is created from a model using GPU-specific parameters.
+// std::unique_ptr<InferenceBuilder> builder = ...;
+//
+// // input data is coming from a texture
+// // output data goes to CPU
+// builder->SetInputObjectDef(0, {DataType::FLOAT16, DataLayout::DHWC4,
+//                                ObjectType::OPENCL_TEXTURE, true});
+// builder->SetOutputObjectDef(0, {DataType::FLOAT32, DataLayout::BHWC,
+// ObjectType::CPU_MEMORY, false});
+// std::unique_ptr<InferenceRunner> runner;
+// RETURN_IF_ERROR(builder->Build(&runner)); // may take significant time.
+// RETURN_IF_ERROR(
+//     runner->SetInputObject(0, OpenClTexture{texture_memobj}));
+// RETURN_IF_ERROR(runner->Run());
+
+#include <cstdint>
+#include <memory>
+#include <vector>
+
+#include "absl/types/span.h"
+#include "absl/types/variant.h"
+#include "DataType.h"
+#include "Status.h"
+#include "Util.h"
+
+namespace onert
+{
+namespace backend
+{
+namespace gpu_cl
+{
+
+// Common abbreviations:
+// B - batch
+// H - height
+// W - width
+// C - channels
+// D - depth := DivideRoundUp(C, 4)
+// C4 - the constant 4.
+enum class DataLayout
+{
+ UNKNOWN,
+ BHWC,
+ DHWC4,
+ HWDC4,
+ HDWC4,
+};
+
+enum class ObjectType
+{
+ UNKNOWN,
+ CPU_MEMORY,
+ OPENCL_TEXTURE,
+ OPENCL_BUFFER,
+};
+
+struct OpenClBuffer
+{
+ OpenClBuffer() = default;
+ explicit OpenClBuffer(cl_mem new_memobj) : memobj(new_memobj) {}
+
+ cl_mem memobj = nullptr;
+};
+
+struct OpenClTexture
+{
+ OpenClTexture() = default;
+ explicit OpenClTexture(cl_mem new_memobj) : memobj(new_memobj) {}
+
+ cl_mem memobj = nullptr;
+ // TODO(akulik): should it specify texture format?
+};
+
+struct CpuMemory
+{
+ CpuMemory() = default;
+ CpuMemory(void *new_data, size_t new_size_bytes) : data(new_data), size_bytes(new_size_bytes) {}
+
+ void *data = nullptr;
+ size_t size_bytes = 0;
+};
+
+template <typename T> inline CpuMemory MakeCpuMemory(absl::Span<T> t)
+{
+ CpuMemory m;
+ m.data = t.data();
+ m.size_bytes = t.size() * sizeof(T);
+ return m;
+}
+
+template <typename T> inline CpuMemory MakeReadableCpuMemory(absl::Span<const T> t)
+{
+ CpuMemory m;
+ m.data = const_cast<T *>(t.data());
+ m.size_bytes = t.size() * sizeof(T);
+ return m;
+}
+
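+// Illustrative usage (a sketch; "def" and "host" are hypothetical names):
+//   std::vector<float> host(NumElements(def));
+//   TensorObject obj = MakeCpuMemory(absl::MakeSpan(host));
+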
+// Defines object representation.
+struct ObjectDef
+{
+ DataType data_type = DataType::UNKNOWN;
+ DataLayout data_layout = DataLayout::UNKNOWN;
+ ObjectType object_type = ObjectType::UNKNOWN;
+
+  // If true, then the object is managed externally and needs to be provided
+  // to the InferenceRunner by the user before running inference.
+ //
+ // User-provided objects will not be re-used internally for any purpose to
+ // lower overall memory usage.
+ bool user_provided = false;
+
+ bool operator==(const ObjectDef &other) const
+ {
+ return data_type == other.data_type && data_layout == other.data_layout &&
+ object_type == other.object_type && user_provided == other.user_provided;
+ }
+};
+
+bool IsValid(const ObjectDef &def);
+
+struct Dimensions
+{
+ Dimensions() : b(1), h(1), w(1), c(1) {}
+
+ Dimensions(int32_t batch, int32_t height, int32_t width, int32_t channels)
+ : b(batch), h(height), w(width), c(channels)
+ {
+ }
+
+ int32_t d() const { return DivideRoundUp(c, 4); }
+
+ int32_t product() const { return b * h * w * c; }
+
+ bool operator==(const Dimensions &other) const
+ {
+ return b == other.b && h == other.h && w == other.w && c == other.c;
+ }
+
+ int32_t b;
+ int32_t h;
+ int32_t w;
+ int32_t c;
+};
+
+// Connects tensor shape with corresponding object definition.
+struct TensorObjectDef
+{
+  // Dimensions semantics are defined by the corresponding DataLayout.
+ Dimensions dimensions;
+ ObjectDef object_def;
+
+ bool operator==(const TensorObjectDef &other) const
+ {
+ return dimensions == other.dimensions && object_def == other.object_def;
+ }
+};
+
+// @return true if tensor object def is defined.
+bool IsValid(const TensorObjectDef &def);
+
+// @return the number of elements in a tensor object.
+uint32_t NumElements(const TensorObjectDef &def);
+
+using TensorObject = absl::variant<absl::monostate, CpuMemory, OpenClBuffer, OpenClTexture>;
+
+// @return true if object is set and corresponding values are defined.
+bool IsValid(const TensorObjectDef &def, const TensorObject &object);
+
+ObjectType GetType(const TensorObject &object);
+
+// @return true if corresponding object is set for the given type
+bool IsObjectPresent(ObjectType type, const TensorObject &obj);
+
+class InferenceRunner;
+
+// Allows inspecting and changing input and output definitions before a graph
+// is prepared for inference.
+class InferenceBuilder
+{
+public:
+ virtual ~InferenceBuilder() {}
+
+ // Returns inference graph inputs and outputs definitions.
+ virtual std::vector<TensorObjectDef> inputs() const = 0;
+ virtual std::vector<TensorObjectDef> outputs() const = 0;
+
+  // Sets a new shape for the input if the underlying implementation and graph
+  // structure allow dynamic tensors.
+ virtual absl::Status SetInputShape(int index, const Dimensions &dimensions) = 0;
+
+  // Updates object definitions for the given index. The implementation may
+  // allow different layouts and/or data type conversions between objects
+  // defined in a graph and the given objects, for example:
+  // input '0' is DataType::FLOAT32, DataLayout::BHWC.
+  // A user, however, has an input in DataType::FLOAT16, DataLayout::DHWC4.
+  // An implementation may allow this transformation to happen automatically
+  // under the hood.
+ virtual absl::Status SetInputObjectDef(int index, ObjectDef def) = 0;
+ virtual absl::Status SetOutputObjectDef(int index, ObjectDef def) = 0;
+ virtual absl::Status SetAllInputObjectDefsTo(ObjectDef def)
+ {
+ auto input_defs = inputs();
+ for (size_t i = 0; i < input_defs.size(); ++i)
+ {
+ RETURN_IF_ERROR(SetInputObjectDef(i, def));
+ }
+ return absl::OkStatus();
+ }
+ virtual absl::Status SetAllOutputObjectDefsTo(ObjectDef def)
+ {
+ auto output_defs = outputs();
+ for (size_t i = 0; i < output_defs.size(); ++i)
+ {
+ RETURN_IF_ERROR(SetOutputObjectDef(i, def));
+ }
+ return absl::OkStatus();
+ }
+
+  // Creates a new instance of the inference runner. The InferenceBuilder stays
+  // valid and can be used to create another inference runner if needed.
+  //
+  // This method may take significant time to prepare a new inference runner.
+  // For example, it may need to compile OpenCL kernels.
+ virtual absl::Status Build(std::unique_ptr<InferenceRunner> *runner) = 0;
+};
+
+// Runs prepared inference. Every object marked as external needs to be set
+// prior to calling the Run method.
+class InferenceRunner
+{
+public:
+ virtual ~InferenceRunner() {}
+
+ // Returns inference graph inputs and outputs definitions.
+ virtual std::vector<TensorObjectDef> inputs() const = 0;
+ virtual std::vector<TensorObjectDef> outputs() const = 0;
+
+  // Getters provide access to underlying objects for the given index.
+  // Setters allow setting or changing an external object for the given index.
+  // Note, the object needs to match the object definition set earlier in
+  // InferenceBuilder.
+
+ virtual absl::Status GetInputObject(int index, TensorObject *object) = 0;
+ virtual absl::Status GetOutputObject(int index, TensorObject *object) = 0;
+ virtual absl::Status SetInputObject(int index, TensorObject object) = 0;
+ virtual absl::Status SetOutputObject(int index, TensorObject object) = 0;
+
+ virtual absl::Status Run() = 0;
+};
+
+// Encapsulated compilation/runtime tradeoffs.
+enum class InferenceUsage
+{
+ UNKNOWN,
+
+ // InferenceRunner will be used only once. Therefore, it is important to
+ // minimize bootstrap time as well.
+ FAST_SINGLE_ANSWER,
+
+ // Prefer maximizing the throughput. Same inference runner will be used
+ // repeatedly on different inputs.
+ SUSTAINED_SPEED,
+};
+
+// Defines aspects to control while instantiating a runner.
+enum class InferencePriority
+{
+ UNKNOWN,
+
+ AUTO,
+
+ MIN_LATENCY,
+
+ MAX_PRECISION,
+
+ MIN_MEMORY_USAGE,
+};
+
+struct InferenceOptions
+{
+ InferenceUsage usage = InferenceUsage::SUSTAINED_SPEED;
+
+ // Ordered priorities provide better understanding of desired semantics,
+ // where priority(n) is more important than priority(n+1).
+ // AUTO priority is needed when a single priority is the most important
+  // factor. For example, setting priority1 = InferencePriority::MIN_LATENCY
+  // and leaving everything else to AUTO would result in a configuration that
+  // achieves maximum performance.
+ //
+ // AUTO priority can only be used when higher priorities are fully specified.
+ // For example:
+ // VALID: priority1 = MIN_LATENCY, priority2 = AUTO, priority3 = AUTO
+ // VALID: priority1 = MIN_LATENCY, priority2 = MAX_PRECISION,
+ // priority3 = AUTO
+ // INVALID: priority1 = AUTO, priority2 = MIN_LATENCY, priority3 = AUTO
+ // INVALID: priority1 = MIN_LATENCY, priority2 = AUTO,
+ // priority3 = MAX_PRECISION
+ // Invalid priorities will result in error.
+ InferencePriority priority1 = InferencePriority::MAX_PRECISION;
+
+ InferencePriority priority2 = InferencePriority::AUTO;
+
+ InferencePriority priority3 = InferencePriority::AUTO;
+};
+
+// Returns a position number for the priority. If priority is missing,
+// then it returns 'max num priorities + 1'.
+int GetPosition(const InferenceOptions &options, InferencePriority p);
+
+// Returns true if options are valid.
+bool IsValid(const InferenceOptions &options);
+
+// Resolves AUTO priorities and specifies them explicitly.
+// Note, no one should assume that these mappings will not change.
+// Technically this function is declared here for code re-use purposes, and
+// by no means should it be treated as the canonical way to resolve AUTO.
+void ResolveAutoPriority(InferenceOptions *options);
+
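+// For example, with priority1 = MIN_LATENCY and the other priorities left at
+// AUTO, the current mapping resolves priority2 = MIN_MEMORY_USAGE and
+// priority3 = MAX_PRECISION (see ResolveAutoPriority in Api.cc).
+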
+enum class PriorityImportance
+{
+ UNKNOWN,
+ HIGHER,
+ LOWER,
+};
+
+// If neither p1 nor p2 is present in options, returns UNKNOWN.
+// If p1 is present but p2 is not, returns HIGHER.
+// If p2 is present but p1 is not, returns LOWER.
+// If both are present and p1 is more important, returns HIGHER; otherwise,
+// LOWER.
+PriorityImportance GetRelativeImportance(const InferenceOptions &options, InferencePriority p1,
+ InferencePriority p2);
+} // namespace gpu_cl
+} // namespace backend
+} // namespace onert
+
+#endif // __ONERT_BACKEND_GPU_CL_OPENCL_API_H__
diff --git a/runtime/onert/backend/gpu_cl/open_cl/Arguments.cc b/runtime/onert/backend/gpu_cl/open_cl/Arguments.cc
new file mode 100644
index 000000000..a7f86bffc
--- /dev/null
+++ b/runtime/onert/backend/gpu_cl/open_cl/Arguments.cc
@@ -0,0 +1,926 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Arguments.h"
+
+#include "absl/strings/ascii.h"
+#include "absl/strings/str_cat.h"
+#include "absl/strings/str_replace.h"
+#include "absl/strings/str_split.h"
+#include "absl/strings/substitute.h"
+
+#include "AccessType.h"
+#include "TensorType.h"
+#include "DataType.h"
+
+namespace onert
+{
+namespace backend
+{
+namespace gpu_cl
+{
+
+namespace
+{
+
+bool IsWordSymbol(char symbol) { return absl::ascii_isalnum(symbol) || symbol == '_'; }
+
+std::string GetNextWord(const std::string &code, size_t first_position)
+{
+ size_t pos = first_position;
+ char t = code[pos];
+ while (IsWordSymbol(t))
+ {
+ pos++;
+ t = code[pos];
+ }
+ return code.substr(first_position, pos - first_position);
+}
+
+size_t FindEnclosingBracket(const std::string &text, size_t first_pos, char bracket)
+{
+ const std::map<char, char> brackets = {
+ {'(', ')'},
+ {'{', '}'},
+ {'[', ']'},
+ {'<', '>'},
+ };
+ char b_open = bracket;
+ auto it = brackets.find(b_open);
+ if (it == brackets.end())
+ {
+ return -1;
+ }
+ char b_close = it->second;
+ size_t pos = first_pos;
+ int opened = 1;
+ int closed = 0;
+ while (opened != closed && pos < text.size())
+ {
+ if (text[pos] == b_open)
+ {
+ opened++;
+ }
+ else if (text[pos] == b_close)
+ {
+ closed++;
+ }
+ pos++;
+ }
+ if (opened == closed)
+ {
+ return pos;
+ }
+ else
+ {
+ return -1;
+ }
+}
+
+absl::Status ParseArgsInsideBrackets(const std::string &text, size_t open_bracket_pos,
+ size_t *close_bracket_pos, std::vector<std::string> *args)
+{
+ *close_bracket_pos = FindEnclosingBracket(text, open_bracket_pos + 1, text[open_bracket_pos]);
+ if (*close_bracket_pos == static_cast<size_t>(-1))
+ {
+ return absl::NotFoundError("Not found enclosing bracket");
+ }
+ std::string str_args =
+ text.substr(open_bracket_pos + 1, *close_bracket_pos - open_bracket_pos - 2);
+ std::vector<absl::string_view> words = absl::StrSplit(str_args, ',');
+ args->reserve(words.size());
+ for (const auto &word : words)
+ {
+ absl::string_view arg = absl::StripAsciiWhitespace(word);
+ if (!arg.empty())
+ {
+ args->push_back(std::string(arg));
+ }
+ }
+ return absl::OkStatus();
+}
+
+void ReplaceAllWords(const std::string &old_word, const std::string &new_word, std::string *str)
+{
+ size_t position = str->find(old_word);
+ while (position != std::string::npos)
+ {
+ char prev = position == 0 ? '.' : (*str)[position - 1];
+ char next = position + old_word.size() < str->size() ? (*str)[position + old_word.size()] : '.';
+ if (IsWordSymbol(prev) || IsWordSymbol(next))
+ {
+ position = str->find(old_word, position + 1);
+ continue;
+ }
+ str->replace(position, old_word.size(), new_word);
+ position = str->find(old_word, position + new_word.size());
+ }
+}
+
+std::string RenameArg(const std::vector<std::string> &object_names, const std::string &postfix,
+ const std::string &arg_name)
+{
+ for (const auto &object_name : object_names)
+ {
+ if (absl::StartsWith(arg_name, object_name) && arg_name.size() > object_name.size() &&
+ arg_name[object_name.size()] == '_')
+ {
+ return object_name + postfix +
+ arg_name.substr(object_name.size(), arg_name.size() - object_name.size());
+ }
+ }
+ return arg_name + postfix;
+}
+
+void AppendArgument(const std::string &arg, std::string *args)
+{
+ if (!args->empty())
+ {
+ absl::StrAppend(args, ",\n ");
+ }
+ absl::StrAppend(args, arg);
+}
+
+std::string GetImageModifier(AccessType access)
+{
+ switch (access)
+ {
+ case AccessType::READ:
+ return "__read_only";
+ case AccessType::WRITE:
+ return "__write_only";
+ case AccessType::READ_WRITE:
+ return "__read_write";
+ default:
+ throw std::runtime_error("Invalid AccessType");
+ }
+}
+
+std::string GetDefaultSamplers(const DeviceInfo &device_info)
+{
+ std::string result;
+ result += "__constant sampler_t smp_none = CLK_NORMALIZED_COORDS_FALSE | "
+ "CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;\n";
+ if (device_info.IsAdreno3xx())
+ {
+ // Unfortunately, CLK_ADDRESS_CLAMP is very slow on Adreno3xx and
+ // we can observe huge register overhead when compared to other modes.
+
+ // While using CLK_ADDRESS_NONE with out-of-range image coordinates is
+ // undefined in the OpenCL specification, we have observed that
+ // CLK_ADDRESS_NONE works like CLK_ADDRESS_CLAMP for out-of-range image
+ // coordinates for RGBA F16/F32 textures on Adreno3xx devices. Using
+ // CLK_ADDRESS_NONE is significantly faster than CLK_ADDRESS_CLAMP on Adreno
+ // 3xx.
+ result += "__constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | "
+ "CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;\n";
+ }
+ else
+ {
+ result += "__constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | "
+ "CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;\n";
+ }
+
+ return result;
+}
+
+} // namespace
+
+// Static
+constexpr char Arguments::kArgsPrefix[];
+
+Arguments::Arguments(Arguments &&args)
+ : int_values_(std::move(args.int_values_)),
+ shared_int4s_data_(std::move(args.shared_int4s_data_)),
+ float_values_(std::move(args.float_values_)),
+ shared_float4s_data_(std::move(args.shared_float4s_data_)), buffers_(std::move(args.buffers_)),
+ images2d_(std::move(args.images2d_)), image2d_arrays_(std::move(args.image2d_arrays_)),
+ images3d_(std::move(args.images3d_)), image_buffers_(std::move(args.image_buffers_)),
+ custom_memories_(std::move(args.custom_memories_)), object_refs_(std::move(args.object_refs_)),
+ objects_(std::move(args.objects_))
+{
+}
+Arguments &Arguments::operator=(Arguments &&args)
+{
+ if (this != &args)
+ {
+ int_values_ = std::move(args.int_values_);
+ shared_int4s_data_ = std::move(args.shared_int4s_data_);
+ float_values_ = std::move(args.float_values_);
+ shared_float4s_data_ = std::move(args.shared_float4s_data_);
+ buffers_ = std::move(args.buffers_);
+ images2d_ = std::move(args.images2d_);
+ image2d_arrays_ = std::move(args.image2d_arrays_);
+ images3d_ = std::move(args.images3d_);
+ image_buffers_ = std::move(args.image_buffers_);
+ custom_memories_ = std::move(args.custom_memories_);
+ object_refs_ = std::move(args.object_refs_);
+ objects_ = std::move(args.objects_);
+ }
+ return *this;
+}
+
+void Arguments::AddFloat(const std::string &name, float value)
+{
+ float_values_[name].value = value;
+}
+void Arguments::AddInt(const std::string &name, int value) { int_values_[name].value = value; }
+void Arguments::AddBuffer(const std::string &name, const GPUBufferDescriptor &desc)
+{
+ buffers_[name] = desc;
+}
+void Arguments::AddImage2D(const std::string &name, const GPUImage2DDescriptor &desc)
+{
+ images2d_[name] = desc;
+}
+
+void Arguments::AddImage2DArray(const std::string &name, const GPUImage2DArrayDescriptor &desc)
+{
+ image2d_arrays_[name] = desc;
+}
+
+void Arguments::AddImage3D(const std::string &name, const GPUImage3DDescriptor &desc)
+{
+ images3d_[name] = desc;
+}
+
+void Arguments::AddImageBuffer(const std::string &name, const GPUImageBufferDescriptor &desc)
+{
+ image_buffers_[name] = desc;
+}
+
+void Arguments::AddCustomMemory(const std::string &name, const GPUCustomMemoryDescriptor &desc)
+{
+ custom_memories_[name] = desc;
+}
+
+void Arguments::AddObjectRef(const std::string &name, AccessType access_type,
+ GPUObjectDescriptorPtr &&descriptor_ptr)
+{
+ descriptor_ptr->SetAccess(access_type);
+ object_refs_[name] = {std::move(descriptor_ptr)};
+}
+
+void Arguments::AddObject(const std::string &name, GPUObjectDescriptorPtr &&descriptor_ptr)
+{
+ descriptor_ptr->SetAccess(AccessType::READ);
+ objects_[name] = {nullptr, std::move(descriptor_ptr)};
+}
+
+void Arguments::AddGPUResources(const std::string &name, const GPUResources &resources)
+{
+ for (const auto &r : resources.ints)
+ {
+ AddInt(absl::StrCat(name, "_", r));
+ }
+ for (const auto &r : resources.floats)
+ {
+ AddFloat(absl::StrCat(name, "_", r));
+ }
+ for (const auto &r : resources.buffers)
+ {
+ AddBuffer(absl::StrCat(name, "_", r.first), r.second);
+ }
+ for (const auto &r : resources.images2d)
+ {
+ AddImage2D(absl::StrCat(name, "_", r.first), r.second);
+ }
+ for (const auto &r : resources.image2d_arrays)
+ {
+ AddImage2DArray(absl::StrCat(name, "_", r.first), r.second);
+ }
+ for (const auto &r : resources.images3d)
+ {
+ AddImage3D(absl::StrCat(name, "_", r.first), r.second);
+ }
+ for (const auto &r : resources.image_buffers)
+ {
+ AddImageBuffer(absl::StrCat(name, "_", r.first), r.second);
+ }
+ for (const auto &r : resources.custom_memories)
+ {
+ AddCustomMemory(absl::StrCat(name, "_", r.first), r.second);
+ }
+}
+
+absl::Status Arguments::SetInt(const std::string &name, int value)
+{
+ auto it = int_values_.find(name);
+ if (it == int_values_.end())
+ {
+ return absl::NotFoundError(absl::StrCat("No int argument with name - ", name));
+ }
+ it->second.value = value;
+ if (it->second.active)
+ {
+ shared_int4s_data_[it->second.offset] = value;
+ }
+ return absl::OkStatus();
+}
+
+absl::Status Arguments::SetFloat(const std::string &name, float value)
+{
+ auto it = float_values_.find(name);
+ if (it == float_values_.end())
+ {
+ return absl::NotFoundError(absl::StrCat("No float argument with name - ", name));
+ }
+ it->second.value = value;
+ if (it->second.active)
+ {
+ shared_float4s_data_[it->second.offset] = value;
+ }
+ return absl::OkStatus();
+}
+
+absl::Status Arguments::SetImage2D(const std::string &name, cl_mem memory)
+{
+ auto it = images2d_.find(name);
+ if (it == images2d_.end())
+ {
+ return absl::NotFoundError(absl::StrCat("No image2D argument with name - ", name));
+ }
+ it->second.memory = memory;
+ return absl::OkStatus();
+}
+
+absl::Status Arguments::SetBuffer(const std::string &name, cl_mem memory)
+{
+ auto it = buffers_.find(name);
+ if (it == buffers_.end())
+ {
+ return absl::NotFoundError(absl::StrCat("No buffer argument with name - ", name));
+ }
+ it->second.memory = memory;
+ return absl::OkStatus();
+}
+
+absl::Status Arguments::SetImage2DArray(const std::string &name, cl_mem memory)
+{
+ auto it = image2d_arrays_.find(name);
+ if (it == image2d_arrays_.end())
+ {
+ return absl::NotFoundError(absl::StrCat("No image2D array argument with name - ", name));
+ }
+ it->second.memory = memory;
+ return absl::OkStatus();
+}
+
+absl::Status Arguments::SetImage3D(const std::string &name, cl_mem memory)
+{
+ auto it = images3d_.find(name);
+ if (it == images3d_.end())
+ {
+ return absl::NotFoundError(absl::StrCat("No image3D argument with name - ", name));
+ }
+ it->second.memory = memory;
+ return absl::OkStatus();
+}
+
+absl::Status Arguments::SetImageBuffer(const std::string &name, cl_mem memory)
+{
+ auto it = image_buffers_.find(name);
+ if (it == image_buffers_.end())
+ {
+ return absl::NotFoundError(absl::StrCat("No image buffer argument with name - ", name));
+ }
+ it->second.memory = memory;
+ return absl::OkStatus();
+}
+
+absl::Status Arguments::SetCustomMemory(const std::string &name, cl_mem memory)
+{
+ auto it = custom_memories_.find(name);
+ if (it == custom_memories_.end())
+ {
+ return absl::NotFoundError(absl::StrCat("No custom memory argument with name - ", name));
+ }
+ it->second.memory = memory;
+ return absl::OkStatus();
+}
+
+absl::Status Arguments::SetObjectRef(const std::string &name, const GPUObject *object)
+{
+ auto it = object_refs_.find(name);
+ if (it == object_refs_.end())
+ {
+ return absl::NotFoundError(absl::StrCat("No object ref with name - ", name));
+ }
+ GPUResourcesWithValue resources;
+ RETURN_IF_ERROR(object->GetGPUResources(it->second.descriptor.get(), &resources));
+ return SetGPUResources(name, resources);
+}
+
+absl::Status Arguments::SetGPUResources(const std::string &name,
+ const GPUResourcesWithValue &resources)
+{
+ for (const auto &r : resources.ints)
+ {
+ RETURN_IF_ERROR(SetInt(absl::StrCat(name, "_", r.first), r.second));
+ }
+ for (const auto &r : resources.floats)
+ {
+ RETURN_IF_ERROR(SetFloat(absl::StrCat(name, "_", r.first), r.second));
+ }
+ for (const auto &r : resources.buffers)
+ {
+ RETURN_IF_ERROR(SetBuffer(absl::StrCat(name, "_", r.first), r.second));
+ }
+ for (const auto &r : resources.images2d)
+ {
+ RETURN_IF_ERROR(SetImage2D(absl::StrCat(name, "_", r.first), r.second));
+ }
+ for (const auto &r : resources.image2d_arrays)
+ {
+ RETURN_IF_ERROR(SetImage2DArray(absl::StrCat(name, "_", r.first), r.second));
+ }
+ for (const auto &r : resources.images3d)
+ {
+ RETURN_IF_ERROR(SetImage3D(absl::StrCat(name, "_", r.first), r.second));
+ }
+ for (const auto &r : resources.image_buffers)
+ {
+ RETURN_IF_ERROR(SetImageBuffer(absl::StrCat(name, "_", r.first), r.second));
+ }
+ for (const auto &r : resources.custom_memories)
+ {
+ RETURN_IF_ERROR(SetCustomMemory(absl::StrCat(name, "_", r.first), r.second));
+ }
+ return absl::OkStatus();
+}
+void Arguments::RenameArgs(const std::string &postfix, std::string *code) const
+{
+ size_t next_position = code->find(kArgsPrefix);
+ while (next_position != std::string::npos)
+ {
+ size_t arg_pos = next_position + strlen(kArgsPrefix);
+ std::string arg_name = GetNextWord(*code, arg_pos);
+ code->replace(arg_pos, arg_name.size(), arg_name + postfix);
+ next_position = code->find(kArgsPrefix, arg_pos + arg_name.size());
+ }
+}
+
+absl::Status Arguments::Merge(Arguments &&args, const std::string &postfix)
+{
+ std::vector<std::string> object_names;
+ object_names.reserve(args.object_refs_.size() + args.objects_.size());
+ for (auto &v : args.object_refs_)
+ {
+ object_names.push_back(v.first);
+ const std::string name = v.first + postfix;
+ if (object_refs_.find(name) != object_refs_.end())
+ {
+ return absl::InvalidArgumentError(
+ absl::StrCat("Object reference name collision. Name - ", name));
+ }
+ object_refs_[name] = {std::move(v.second.descriptor)};
+ }
+ for (auto &v : args.objects_)
+ {
+ object_names.push_back(v.first);
+ const std::string name = v.first + postfix;
+ if (objects_.find(name) != objects_.end())
+ {
+ return absl::InvalidArgumentError(absl::StrCat("Object name collision. Name - ", name));
+ }
+ objects_[name] = {std::move(v.second.obj_ptr), std::move(v.second.descriptor)};
+ }
+ for (const auto &v : args.int_values_)
+ {
+ AddInt(RenameArg(object_names, postfix, v.first), v.second.value);
+ }
+ for (const auto &v : args.float_values_)
+ {
+ AddFloat(RenameArg(object_names, postfix, v.first), v.second.value);
+ }
+ for (const auto &v : args.buffers_)
+ {
+ AddBuffer(RenameArg(object_names, postfix, v.first), v.second);
+ }
+ for (const auto &v : args.images2d_)
+ {
+ AddImage2D(RenameArg(object_names, postfix, v.first), v.second);
+ }
+ for (const auto &v : args.image2d_arrays_)
+ {
+ AddImage2DArray(RenameArg(object_names, postfix, v.first), v.second);
+ }
+ for (const auto &v : args.images3d_)
+ {
+ AddImage3D(RenameArg(object_names, postfix, v.first), v.second);
+ }
+ for (const auto &v : args.image_buffers_)
+ {
+ AddImageBuffer(RenameArg(object_names, postfix, v.first), v.second);
+ }
+ for (const auto &v : args.custom_memories_)
+ {
+ AddCustomMemory(RenameArg(object_names, postfix, v.first), v.second);
+ }
+ return absl::OkStatus();
+}
+
+absl::Status Arguments::TransformToCLCode(const DeviceInfo &device_info,
+ const std::map<std::string, std::string> &linkables,
+ std::string *code)
+{
+ RETURN_IF_ERROR(AddObjectArgs());
+ RETURN_IF_ERROR(ResolveSelectorsPass(linkables, code));
+ ResolveArgsPass(device_info, code);
+ *code = absl::Substitute(*code, GetListOfArgs());
+ *code = GetDefaultSamplers(device_info) + *code;
+ return absl::OkStatus();
+}
+
+std::string Arguments::GetListOfArgs()
+{
+ std::string result;
+ for (auto &t : buffers_)
+ {
+ const std::string type_name = t.second.data_type == DataType::FLOAT32 ? "float" : "half";
+ std::string attributes;
+ for (const auto &attr : t.second.attributes)
+ {
+ attributes += absl::StrCat(" __attribute__((", attr, "))");
+ }
+ AppendArgument(absl::StrCat(MemoryTypeToCLType(t.second.memory_type), " ",
+ ToCLDataType(t.second.data_type, t.second.element_size), "* ",
+ t.first, attributes),
+ &result);
+ }
+ for (auto &t : image_buffers_)
+ {
+ AppendArgument(
+ absl::StrCat(GetImageModifier(t.second.access_type), " image1d_buffer_t ", t.first), &result);
+ }
+ for (auto &t : images2d_)
+ {
+ AppendArgument(absl::StrCat(GetImageModifier(t.second.access_type), " image2d_t ", t.first),
+ &result);
+ }
+ for (auto &t : image2d_arrays_)
+ {
+ AppendArgument(
+ absl::StrCat(GetImageModifier(t.second.access_type), " image2d_array_t ", t.first), &result);
+ }
+ for (auto &t : images3d_)
+ {
+ AppendArgument(absl::StrCat(GetImageModifier(t.second.access_type), " image3d_t ", t.first),
+ &result);
+ }
+ for (auto &t : custom_memories_)
+ {
+ AppendArgument(absl::StrCat(t.second.type_name, " ", t.first), &result);
+ }
+ for (uint32_t i = 0; i < shared_int4s_data_.size() / 4; ++i)
+ {
+ AppendArgument(absl::StrCat("int4 shared_int4_", i), &result);
+ }
+ for (uint32_t i = 0; i < shared_float4s_data_.size() / 4; ++i)
+ {
+ AppendArgument(absl::StrCat("float4 shared_float4_", i), &result);
+ }
+ return result;
+}
+
+absl::Status Arguments::Bind(cl_kernel kernel, int offset)
+{
+ for (auto &t : buffers_)
+ {
+ const int error_code = clSetKernelArg(kernel, offset, sizeof(cl_mem), &t.second.memory);
+ if (error_code != CL_SUCCESS)
+ {
+ return absl::UnknownError(absl::StrCat("Failed to set kernel arguments - ",
+ CLErrorCodeToString(error_code), "(at index - ",
+ offset, ")"));
+ }
+ offset++;
+ }
+ for (auto &t : image_buffers_)
+ {
+ const int error_code = clSetKernelArg(kernel, offset, sizeof(cl_mem), &t.second.memory);
+ if (error_code != CL_SUCCESS)
+ {
+ return absl::UnknownError(absl::StrCat("Failed to set kernel arguments - ",
+ CLErrorCodeToString(error_code), "(at index - ",
+ offset, ")"));
+ }
+ offset++;
+ }
+ for (auto &t : images2d_)
+ {
+ const int error_code = clSetKernelArg(kernel, offset, sizeof(cl_mem), &t.second.memory);
+ if (error_code != CL_SUCCESS)
+ {
+ return absl::UnknownError(absl::StrCat("Failed to set kernel arguments - ",
+ CLErrorCodeToString(error_code), "(at index - ",
+ offset, ")"));
+ }
+ offset++;
+ }
+ for (auto &t : image2d_arrays_)
+ {
+ const int error_code = clSetKernelArg(kernel, offset, sizeof(cl_mem), &t.second.memory);
+ if (error_code != CL_SUCCESS)
+ {
+ return absl::UnknownError(absl::StrCat("Failed to set kernel arguments - ",
+ CLErrorCodeToString(error_code), "(at index - ",
+ offset, ")"));
+ }
+ offset++;
+ }
+ for (auto &t : images3d_)
+ {
+ const int error_code = clSetKernelArg(kernel, offset, sizeof(cl_mem), &t.second.memory);
+ if (error_code != CL_SUCCESS)
+ {
+ return absl::UnknownError(absl::StrCat("Failed to set kernel arguments - ",
+ CLErrorCodeToString(error_code), "(at index - ",
+ offset, ")"));
+ }
+ offset++;
+ }
+ for (auto &t : custom_memories_)
+ {
+ const int error_code = clSetKernelArg(kernel, offset, sizeof(cl_mem), &t.second.memory);
+ if (error_code != CL_SUCCESS)
+ {
+ return absl::UnknownError(absl::StrCat("Failed to set kernel arguments - ",
+ CLErrorCodeToString(error_code), "(at index - ",
+ offset, ")"));
+ }
+ offset++;
+ }
+ for (size_t i = 0; i < shared_int4s_data_.size() / 4; ++i)
+ {
+ const int error_code =
+ clSetKernelArg(kernel, offset, sizeof(int32_t) * 4, &shared_int4s_data_[i * 4]);
+ if (error_code != CL_SUCCESS)
+ {
+ return absl::UnknownError(absl::StrCat("Failed to set kernel arguments - ",
+ CLErrorCodeToString(error_code), "(at index - ",
+ offset, ")"));
+ }
+ offset++;
+ }
+ for (size_t i = 0; i < shared_float4s_data_.size() / 4; ++i)
+ {
+ const int error_code =
+ clSetKernelArg(kernel, offset, sizeof(int32_t) * 4, &shared_float4s_data_[i * 4]);
+ if (error_code != CL_SUCCESS)
+ {
+ return absl::UnknownError(absl::StrCat("Failed to set kernel arguments - ",
+ CLErrorCodeToString(error_code), "(at index - ",
+ offset, ")"));
+ }
+ offset++;
+ }
+ return absl::OkStatus();
+}
+
+std::string Arguments::AddActiveArgument(const std::string &arg_name, bool)
+{
+ {
+ auto it = int_values_.find(arg_name);
+ if (it != int_values_.end())
+ {
+ int int_index;
+ if (it->second.active)
+ {
+ int_index = it->second.offset;
+ }
+ else
+ {
+ it->second.active = true;
+ it->second.offset = shared_int4s_data_.size();
+ int_index = it->second.offset;
+ shared_int4s_data_.push_back(it->second.value);
+ }
+ std::string index = std::to_string(int_index / 4);
+ std::string postfixes[4] = {"x", "y", "z", "w"};
+ return "shared_int4_" + index + "." + postfixes[int_index % 4];
+ }
+ }
+ {
+ auto it = float_values_.find(arg_name);
+ if (it != float_values_.end())
+ {
+ int float_index;
+ if (it->second.active)
+ {
+ float_index = it->second.offset;
+ }
+ else
+ {
+ it->second.active = true;
+ it->second.offset = shared_float4s_data_.size();
+ float_index = it->second.offset;
+ shared_float4s_data_.push_back(it->second.value);
+ }
+ std::string index = std::to_string(float_index / 4);
+ std::string postfixes[4] = {"x", "y", "z", "w"};
+ return "shared_float4_" + index + "." + postfixes[float_index % 4];
+ }
+ }
+ return arg_name;
+}
+
+void Arguments::ResolveArgsPass(const DeviceInfo &device_info, std::string *code)
+{
+ bool use_f32_for_half_arguments = device_info.IsPowerVR();
+ size_t position = 0;
+ size_t next_position = code->find(kArgsPrefix);
+ while (next_position != std::string::npos)
+ {
+ size_t arg_pos = next_position;
+ next_position += strlen(kArgsPrefix);
+ std::string object_name = GetNextWord(*code, next_position);
+ std::string new_name = AddActiveArgument(object_name, use_f32_for_half_arguments);
+ code->replace(arg_pos, object_name.size() + strlen(kArgsPrefix), new_name);
+ position = arg_pos + new_name.size();
+ next_position = code->find(kArgsPrefix, position);
+ }
+
+ int shared_int4s_aligned_size = AlignByN(shared_int4s_data_.size(), 4);
+ shared_int4s_data_.resize(shared_int4s_aligned_size);
+ int shared_float4s_aligned_size = AlignByN(shared_float4s_data_.size(), 4);
+ shared_float4s_data_.resize(shared_float4s_aligned_size);
+}
+
+void Arguments::ResolveObjectNames(const std::string &object_name,
+ const std::vector<std::string> &member_names, std::string *code)
+{
+ for (const auto &member_name : member_names)
+ {
+ const std::string new_name = kArgsPrefix + object_name + "_" + member_name;
+ ReplaceAllWords(member_name, new_name, code);
+ }
+}
+
+GPUObjectDescriptor *Arguments::GetObjectDescriptor(const std::string &object_name) const
+{
+ {
+ auto it = object_refs_.find(object_name);
+ if (it != object_refs_.end())
+ {
+ return it->second.descriptor.get();
+ }
+ }
+ {
+ auto it = objects_.find(object_name);
+ if (it != objects_.end())
+ {
+ return it->second.descriptor.get();
+ }
+ }
+ return nullptr;
+}
+
+absl::Status Arguments::ResolveSelector(const std::map<std::string, std::string> &linkables,
+ const std::string &object_name, const std::string &selector,
+ const std::vector<std::string> &args,
+ const std::vector<std::string> &template_args,
+ std::string *result)
+{
+ const GPUObjectDescriptor *desc_ptr = GetObjectDescriptor(object_name);
+ if (!desc_ptr)
+ {
+ return absl::NotFoundError(absl::StrCat("No object with name - ", object_name));
+ }
+ auto names = desc_ptr->GetGPUResources().GetNames();
+ const auto *tensor_desc = dynamic_cast<const TensorDescriptor *>(desc_ptr);
+ if (tensor_desc && selector == "Write")
+ {
+ auto it = linkables.find(object_name);
+ if (it != linkables.end())
+ {
+ if (desc_ptr->GetAccess() != AccessType::WRITE &&
+ desc_ptr->GetAccess() != AccessType::READ_WRITE)
+ {
+ return absl::FailedPreconditionError(
+ absl::StrCat("Object with name - ", object_name, " should have Write access."));
+ }
+ std::string value_name, x_coord, y_coord, s_coord;
+ RETURN_IF_ERROR(tensor_desc->GetLinkingContextFromWriteSelector(args, &value_name, &x_coord,
+ &y_coord, &s_coord));
+ // x_coord can have batch size property of link_object
+ ResolveObjectNames(object_name, names, &x_coord);
+ *result = it->second;
+ ReplaceAllWords("in_out_value", value_name, result);
+ ReplaceAllWords("X_COORD", x_coord, result);
+ ReplaceAllWords("Y_COORD", y_coord, result);
+ ReplaceAllWords("S_COORD", s_coord, result);
+ RETURN_IF_ERROR(ResolveSelectorsPass({}, result));
+ }
+ }
+ std::string patch;
+ RETURN_IF_ERROR(desc_ptr->PerformSelector(selector, args, template_args, &patch));
+ ResolveObjectNames(object_name, names, &patch);
+ *result += patch;
+ return absl::OkStatus();
+}
+
+absl::Status Arguments::ResolveSelectorsPass(const std::map<std::string, std::string> &linkables,
+ std::string *code)
+{
+ std::string result;
+ size_t position = 0;
+ size_t next_position = code->find(kArgsPrefix);
+ while (next_position != std::string::npos)
+ {
+ size_t arg_pos = next_position;
+ next_position += strlen(kArgsPrefix);
+ std::string object_name = GetNextWord(*code, next_position);
+ char next = (*code)[next_position + object_name.size()];
+ if (next == '.')
+ {
+ next_position += object_name.size() + 1;
+ std::string selector_name = GetNextWord(*code, next_position);
+ next_position += selector_name.size();
+ next = (*code)[next_position];
+ std::vector<std::string> template_args;
+ if (next == '<')
+ {
+ size_t close_bracket_pos;
+ RETURN_IF_ERROR(
+ ParseArgsInsideBrackets(*code, next_position, &close_bracket_pos, &template_args));
+ next_position = close_bracket_pos;
+ next = (*code)[next_position];
+ }
+ if (next != '(')
+ {
+ return absl::NotFoundError(
+ absl::StrCat("Expected ( after ", object_name, ".", selector_name, " call"));
+ }
+ std::vector<std::string> args;
+ size_t close_bracket_pos;
+ RETURN_IF_ERROR(ParseArgsInsideBrackets(*code, next_position, &close_bracket_pos, &args));
+ for (auto &arg : args)
+ {
+ RETURN_IF_ERROR(ResolveSelectorsPass({}, &arg));
+ }
+ std::string patch;
+ RETURN_IF_ERROR(
+ ResolveSelector(linkables, object_name, selector_name, args, template_args, &patch));
+ code->replace(arg_pos, close_bracket_pos - arg_pos, patch);
+ position = arg_pos + patch.size();
+ }
+ else
+ {
+ position = arg_pos + strlen(kArgsPrefix);
+ }
+ next_position = code->find(kArgsPrefix, position);
+ }
+ return absl::OkStatus();
+}
+
+absl::Status Arguments::AllocateObjects(CLContext *context)
+{
+ for (auto &t : objects_)
+ {
+ RETURN_IF_ERROR(t.second.descriptor->CreateGPUObject(context, &t.second.obj_ptr));
+ }
+ return absl::OkStatus();
+}
+
+void Arguments::ReleaseCPURepresentation()
+{
+ for (auto &t : objects_)
+ {
+ t.second.descriptor->Release();
+ }
+}
+
+absl::Status Arguments::AddObjectArgs()
+{
+ for (auto &t : objects_)
+ {
+ AddGPUResources(t.first, t.second.descriptor->GetGPUResources());
+ GPUResourcesWithValue resources;
+ RETURN_IF_ERROR(t.second.obj_ptr->GetGPUResources(t.second.descriptor.get(), &resources));
+ RETURN_IF_ERROR(SetGPUResources(t.first, resources));
+ }
+ for (auto &t : object_refs_)
+ {
+ AddGPUResources(t.first, t.second.descriptor->GetGPUResources());
+ }
+ return absl::OkStatus();
+}
+
+} // namespace gpu_cl
+} // namespace backend
+} // namespace onert
diff --git a/runtime/onert/backend/gpu_cl/open_cl/Arguments.h b/runtime/onert/backend/gpu_cl/open_cl/Arguments.h
new file mode 100644
index 000000000..0c6ce1edf
--- /dev/null
+++ b/runtime/onert/backend/gpu_cl/open_cl/Arguments.h
@@ -0,0 +1,175 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __ONERT_BACKEND_GPU_CL_OPENCL_ARGUMENTS_H__
+#define __ONERT_BACKEND_GPU_CL_OPENCL_ARGUMENTS_H__
+
+#include <map>
+#include <string>
+#include <vector>
+
+#include "ClDevice.h"
+#include "GpuObject.h"
+#include "OpenclWrapper.h"
+
+#include "AccessType.h"
+#include "Types.h"
+#include "Util.h"
+#include "Status.h"
+
+namespace onert
+{
+namespace backend
+{
+namespace gpu_cl
+{
+
+class ArgumentsBinder
+{
+public:
+ virtual absl::Status SetInt(const std::string &name, int value) = 0;
+ virtual absl::Status SetFloat(const std::string &name, float value) = 0;
+ virtual ~ArgumentsBinder() = default;
+};
+
+class Arguments : public ArgumentsBinder
+{
+public:
+ Arguments() = default;
+ void AddFloat(const std::string &name, float value = 0.0f);
+ void AddInt(const std::string &name, int value = 0);
+ void AddObjectRef(const std::string &name, AccessType access_type,
+ GPUObjectDescriptorPtr &&descriptor_ptr);
+ void AddObject(const std::string &name, GPUObjectDescriptorPtr &&descriptor_ptr);
+
+ absl::Status SetInt(const std::string &name, int value) override;
+ absl::Status SetFloat(const std::string &name, float value) override;
+ absl::Status SetObjectRef(const std::string &name, const GPUObject *object);
+
+ absl::Status Bind(cl_kernel kernel, int offset = 0);
+
+ void RenameArgs(const std::string &postfix, std::string *code) const;
+ absl::Status Merge(Arguments &&args, const std::string &postfix);
+
+ absl::Status AllocateObjects(CLContext *context);
+ void ReleaseCPURepresentation();
+ absl::Status TransformToCLCode(const DeviceInfo &device_info,
+ const std::map<std::string, std::string> &linkables,
+ std::string *code);
+
+ // Move only
+ Arguments(Arguments &&args);
+ Arguments &operator=(Arguments &&args);
+ Arguments(const Arguments &) = delete;
+ Arguments &operator=(const Arguments &) = delete;
+
+ ~Arguments() override = default;
+
+private:
+ void AddBuffer(const std::string &name, const GPUBufferDescriptor &desc);
+ void AddImage2D(const std::string &name, const GPUImage2DDescriptor &desc);
+ void AddImage2DArray(const std::string &name, const GPUImage2DArrayDescriptor &desc);
+ void AddImage3D(const std::string &name, const GPUImage3DDescriptor &desc);
+ void AddImageBuffer(const std::string &name, const GPUImageBufferDescriptor &desc);
+ void AddCustomMemory(const std::string &name, const GPUCustomMemoryDescriptor &desc);
+
+ absl::Status SetImage2D(const std::string &name, cl_mem memory);
+ absl::Status SetBuffer(const std::string &name, cl_mem memory);
+ absl::Status SetImage2DArray(const std::string &name, cl_mem memory);
+ absl::Status SetImage3D(const std::string &name, cl_mem memory);
+ absl::Status SetImageBuffer(const std::string &name, cl_mem memory);
+ absl::Status SetCustomMemory(const std::string &name, cl_mem memory);
+
+ std::string GetListOfArgs();
+
+ std::string AddActiveArgument(const std::string &arg_name, bool use_f32_for_halfs);
+ void AddGPUResources(const std::string &name, const GPUResources &resources);
+
+ absl::Status SetGPUResources(const std::string &name, const GPUResourcesWithValue &resources);
+
+ absl::Status AddObjectArgs();
+
+ void ResolveArgsPass(const DeviceInfo &device_info, std::string *code);
+ absl::Status ResolveSelectorsPass(const std::map<std::string, std::string> &linkables,
+ std::string *code);
+
+ absl::Status ResolveSelector(const std::map<std::string, std::string> &linkables,
+ const std::string &object_name, const std::string &selector,
+ const std::vector<std::string> &args,
+ const std::vector<std::string> &template_args, std::string *result);
+
+ void ResolveObjectNames(const std::string &object_name,
+ const std::vector<std::string> &member_names, std::string *code);
+
+ GPUObjectDescriptor *GetObjectDescriptor(const std::string &object_name) const;
+
+ static constexpr char kArgsPrefix[] = "args.";
+
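+  // Illustrative example: an int argument named "size" referenced in kernel
+  // code as "args.size" is rewritten by ResolveArgsPass into an element of
+  // the shared uniform storage, e.g. "shared_int4_0.x".
+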
+ struct IntValue
+ {
+ int value;
+
+    // Many uniforms are generated automatically and never used; to reduce the
+    // amount of data transferred, we add this optimization.
+ bool active = false;
+
+ // offset to shared uniform storage.
+ uint32_t offset = -1;
+ };
+ std::map<std::string, IntValue> int_values_;
+ std::vector<int32_t> shared_int4s_data_;
+
+ struct FloatValue
+ {
+ float value;
+
+    // Many uniforms are generated automatically and never used; to reduce the
+    // amount of data transferred, we add this optimization.
+ bool active = false;
+
+ // offset to shared uniform storage.
+ uint32_t offset = -1;
+ };
+ std::map<std::string, FloatValue> float_values_;
+ std::vector<float> shared_float4s_data_;
+
+ std::map<std::string, GPUBufferDescriptor> buffers_;
+ std::map<std::string, GPUImage2DDescriptor> images2d_;
+ std::map<std::string, GPUImage2DArrayDescriptor> image2d_arrays_;
+ std::map<std::string, GPUImage3DDescriptor> images3d_;
+ std::map<std::string, GPUImageBufferDescriptor> image_buffers_;
+ std::map<std::string, GPUCustomMemoryDescriptor> custom_memories_;
+
+ struct ObjectRefArg
+ {
+ GPUObjectDescriptorPtr descriptor;
+ };
+ std::map<std::string, ObjectRefArg> object_refs_;
+
+ struct ObjectArg
+ {
+ GPUObjectPtr obj_ptr;
+ GPUObjectDescriptorPtr descriptor;
+ };
+ std::map<std::string, ObjectArg> objects_;
+};
+
+} // namespace gpu_cl
+} // namespace backend
+} // namespace onert
+
+#endif // __ONERT_BACKEND_GPU_CL_OPENCL_ARGUMENTS_H__
diff --git a/runtime/onert/backend/gpu_cl/open_cl/Buffer.cc b/runtime/onert/backend/gpu_cl/open_cl/Buffer.cc
new file mode 100644
index 000000000..64c071921
--- /dev/null
+++ b/runtime/onert/backend/gpu_cl/open_cl/Buffer.cc
@@ -0,0 +1,234 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Buffer.h"
+
+#include <string>
+
+#include "ClContext.h"
+#include "DataType.h"
+#include "GpuObject.h"
+#include "Util.h"
+#include "Status.h"
+
+namespace onert
+{
+namespace backend
+{
+namespace gpu_cl
+{
+namespace
+{
+
+absl::Status CreateBuffer(size_t size_in_bytes, bool gpu_read_only, const void *data,
+ CLContext *context, Buffer *result)
+{
+ cl_mem buffer;
+ RETURN_IF_ERROR(CreateCLBuffer(context->context(), size_in_bytes, gpu_read_only,
+ const_cast<void *>(data), &buffer));
+ *result = Buffer(buffer, size_in_bytes);
+
+ return absl::OkStatus();
+}
+
+} // namespace
+
+BufferDescriptor::BufferDescriptor(BufferDescriptor &&desc)
+ : GPUObjectDescriptor(std::move(desc)), element_type(desc.element_type),
+ element_size(desc.element_size), memory_type(desc.memory_type),
+ attributes(std::move(desc.attributes)), size(desc.size), data(std::move(desc.data))
+{
+}
+
+BufferDescriptor &BufferDescriptor::operator=(BufferDescriptor &&desc)
+{
+ if (this != &desc)
+ {
+ std::swap(element_type, desc.element_type);
+ std::swap(element_size, desc.element_size);
+ std::swap(memory_type, desc.memory_type);
+ attributes = std::move(desc.attributes);
+ std::swap(size, desc.size);
+ data = std::move(desc.data);
+ GPUObjectDescriptor::operator=(std::move(desc));
+ }
+ return *this;
+}
+
+void BufferDescriptor::Release() { data.clear(); }
+
+GPUResources BufferDescriptor::GetGPUResources() const
+{
+ GPUResources resources;
+ GPUBufferDescriptor desc;
+ desc.data_type = element_type;
+ desc.access_type = access_type_;
+ desc.element_size = element_size;
+ desc.memory_type = memory_type;
+ desc.attributes = attributes;
+ resources.buffers.push_back({"buffer", desc});
+ return resources;
+}
+
+absl::Status BufferDescriptor::PerformSelector(const std::string &selector,
+ const std::vector<std::string> &args,
+ const std::vector<std::string> &template_args,
+ std::string *result) const
+{
+ if (selector == "Read")
+ {
+ return PerformReadSelector(args, result);
+ }
+ else if (selector == "GetPtr")
+ {
+ return PerformGetPtrSelector(args, template_args, result);
+ }
+ else
+ {
+ return absl::NotFoundError(
+ absl::StrCat("BufferDescriptor don't have selector with name - ", selector));
+ }
+}
+
+absl::Status BufferDescriptor::PerformReadSelector(const std::vector<std::string> &args,
+ std::string *result) const
+{
+ if (args.size() != 1)
+ {
+ return absl::NotFoundError(
+ absl::StrCat("BufferDescriptor Read require one argument, but ", args.size(), " was passed"));
+ }
+ *result = absl::StrCat("buffer[", args[0], "]");
+ return absl::OkStatus();
+}
+
+absl::Status BufferDescriptor::PerformGetPtrSelector(const std::vector<std::string> &args,
+ const std::vector<std::string> &template_args,
+ std::string *result) const
+{
+ if (args.size() > 1)
+ {
+ return absl::NotFoundError(absl::StrCat(
+ "BufferDescriptor GetPtr require one or zero arguments, but ", args.size(), " was passed"));
+ }
+ if (template_args.size() > 1)
+ {
+ return absl::NotFoundError(absl::StrCat("BufferDescriptor GetPtr require one or zero teemplate "
+ "arguments, but ",
+ template_args.size(), " was passed"));
+ }
+ std::string conversion;
+ if (template_args.size() == 1)
+ {
+ const std::string type_name = ToCLDataType(element_type, element_size);
+ if (type_name != template_args[0])
+ {
+ conversion = absl::StrCat("(", MemoryTypeToCLType(memory_type), " ", template_args[0], "*)&");
+ }
+ }
+ if (args.empty())
+ {
+ *result = absl::StrCat(conversion, "buffer");
+ }
+ else if (conversion.empty())
+ {
+ *result = absl::StrCat("(buffer + ", args[0], ")");
+ }
+ else
+ {
+ *result = absl::StrCat(conversion, "buffer[", args[0], "]");
+ }
+ return absl::OkStatus();
+}
+
+absl::Status BufferDescriptor::CreateGPUObject(CLContext *context, GPUObjectPtr *result) const
+{
+ Buffer gpu_buffer;
+ RETURN_IF_ERROR(gpu_buffer.CreateFromBufferDescriptor(*this, context));
+ *result = absl::make_unique<Buffer>(std::move(gpu_buffer));
+ return absl::OkStatus();
+}
+
+Buffer::Buffer(cl_mem buffer, size_t size_in_bytes) : buffer_(buffer), size_(size_in_bytes) {}
+
+Buffer::Buffer(Buffer &&buffer) : buffer_(buffer.buffer_), size_(buffer.size_)
+{
+ buffer.buffer_ = nullptr;
+ buffer.size_ = 0;
+}
+
+Buffer &Buffer::operator=(Buffer &&buffer)
+{
+ if (this != &buffer)
+ {
+ Release();
+ std::swap(size_, buffer.size_);
+ std::swap(buffer_, buffer.buffer_);
+ }
+ return *this;
+}
+
+void Buffer::Release()
+{
+ if (buffer_)
+ {
+ clReleaseMemObject(buffer_);
+ buffer_ = nullptr;
+ size_ = 0;
+ }
+}
+
+absl::Status Buffer::GetGPUResources(const GPUObjectDescriptor *obj_ptr,
+ GPUResourcesWithValue *resources) const
+{
+ const auto *buffer_desc = dynamic_cast<const BufferDescriptor *>(obj_ptr);
+ if (!buffer_desc)
+ {
+ return absl::InvalidArgumentError("Expected BufferDescriptor on input.");
+ }
+
+ resources->buffers.push_back({"buffer", buffer_});
+ return absl::OkStatus();
+}
+
+absl::Status Buffer::CreateFromBufferDescriptor(const BufferDescriptor &desc, CLContext *context)
+{
+ bool read_only = desc.memory_type == MemoryType::CONSTANT;
+ uint8_t *data_ptr = desc.data.empty() ? nullptr : const_cast<unsigned char *>(desc.data.data());
+ size_ = desc.size;
+ return CreateCLBuffer(context->context(), desc.size, read_only, data_ptr, &buffer_);
+}
+
+absl::Status CreateReadOnlyBuffer(size_t size_in_bytes, CLContext *context, Buffer *result)
+{
+ return CreateBuffer(size_in_bytes, true, nullptr, context, result);
+}
+
+absl::Status CreateReadOnlyBuffer(size_t size_in_bytes, const void *data, CLContext *context,
+ Buffer *result)
+{
+ return CreateBuffer(size_in_bytes, true, data, context, result);
+}
+
+absl::Status CreateReadWriteBuffer(size_t size_in_bytes, CLContext *context, Buffer *result)
+{
+ return CreateBuffer(size_in_bytes, false, nullptr, context, result);
+}
+
+} // namespace gpu_cl
+} // namespace backend
+} // namespace onert
diff --git a/runtime/onert/backend/gpu_cl/open_cl/Buffer.h b/runtime/onert/backend/gpu_cl/open_cl/Buffer.h
new file mode 100644
index 000000000..39e97be6d
--- /dev/null
+++ b/runtime/onert/backend/gpu_cl/open_cl/Buffer.h
@@ -0,0 +1,121 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __ONERT_BACKEND_GPU_CL_OPENCL_BUFFER_H__
+#define __ONERT_BACKEND_GPU_CL_OPENCL_BUFFER_H__
+
+#include "absl/strings/str_cat.h"
+#include "absl/types/span.h"
+
+#include "ClCommandQueue.h"
+#include "ClContext.h"
+#include "GpuObject.h"
+#include "OpenclWrapper.h"
+#include "DataType.h"
+#include "Util.h"
+#include "Status.h"
+
+namespace onert
+{
+namespace backend
+{
+namespace gpu_cl
+{
+
+struct BufferDescriptor : public GPUObjectDescriptor
+{
+ DataType element_type;
+ int element_size;
+ MemoryType memory_type = MemoryType::GLOBAL;
+ std::vector<std::string> attributes;
+
+ // optional
+ int size = 0;
+ std::vector<uint8_t> data;
+
+ BufferDescriptor() = default;
+ BufferDescriptor(const BufferDescriptor &) = default;
+ BufferDescriptor &operator=(const BufferDescriptor &) = default;
+ BufferDescriptor(BufferDescriptor &&desc);
+ BufferDescriptor &operator=(BufferDescriptor &&desc);
+
+ absl::Status PerformSelector(const std::string &selector, const std::vector<std::string> &args,
+ const std::vector<std::string> &template_args,
+ std::string *result) const override;
+
+ GPUResources GetGPUResources() const override;
+ absl::Status PerformReadSelector(const std::vector<std::string> &args, std::string *result) const;
+ absl::Status PerformGetPtrSelector(const std::vector<std::string> &args,
+ const std::vector<std::string> &template_args,
+ std::string *result) const;
+
+ absl::Status CreateGPUObject(CLContext *context, GPUObjectPtr *result) const override;
+ void Release() override;
+};
+
+// Buffer represents linear GPU data storage with an arbitrary data format.
+// Buffer is movable but not copyable.
+class Buffer : public GPUObject
+{
+public:
+  Buffer() {} // just for using Buffer as a class member
+ Buffer(cl_mem buffer, size_t size_in_bytes);
+
+ // Move only
+ Buffer(Buffer &&buffer);
+ Buffer &operator=(Buffer &&buffer);
+ Buffer(const Buffer &) = delete;
+ Buffer &operator=(const Buffer &) = delete;
+
+ virtual ~Buffer() { Release(); }
+
+ // for profiling and memory statistics
+ uint64_t GetMemorySizeInBytes() const { return size_; }
+
+ cl_mem GetMemoryPtr() const { return buffer_; }
+
+  // Writes data to the buffer. The data should point to a region whose size in
+  // bytes exactly matches size_in_bytes (the constructor parameter).
+ template <typename T> absl::Status WriteData(CLCommandQueue *queue, const std::vector<T> *data);
+
+ // Reads data from Buffer into CPU memory.
+ template <typename T> absl::Status ReadData(CLCommandQueue *queue, std::vector<T> *result) const;
+
+ absl::Status GetGPUResources(const GPUObjectDescriptor *obj_ptr,
+ GPUResourcesWithValue *resources) const override;
+
+ absl::Status CreateFromBufferDescriptor(const BufferDescriptor &desc, CLContext *context);
+
+private:
+ void Release();
+
+ cl_mem buffer_ = nullptr;
+ size_t size_ = 0;
+};
+
+absl::Status CreateReadOnlyBuffer(size_t size_in_bytes, CLContext *context, Buffer *result);
+
+absl::Status CreateReadOnlyBuffer(size_t size_in_bytes, const void *data, CLContext *context,
+ Buffer *result);
+
+absl::Status CreateReadWriteBuffer(size_t size_in_bytes, CLContext *context, Buffer *result);
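+
+// A minimal usage sketch (illustrative only; it assumes a valid CLContext
+// `context` created elsewhere, e.g. via CreateCLContext):
+//
+//   std::vector<float> host_data(256, 1.0f);
+//   Buffer weights;
+//   absl::Status status = CreateReadOnlyBuffer(host_data.size() * sizeof(float),
+//                                              host_data.data(), &context, &weights);
+//   if (status.ok())
+//   {
+//     // weights.GetMemoryPtr() can now be bound as a kernel argument,
+//     // e.g. via CLKernel::SetMemory.
+//   }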
+
+} // namespace gpu_cl
+} // namespace backend
+} // namespace onert
+
+#endif // __ONERT_BACKEND_GPU_CL_OPENCL_BUFFER_H__
diff --git a/runtime/onert/backend/gpu_cl/open_cl/ClCommandQueue.cc b/runtime/onert/backend/gpu_cl/open_cl/ClCommandQueue.cc
new file mode 100644
index 000000000..d147b7b13
--- /dev/null
+++ b/runtime/onert/backend/gpu_cl/open_cl/ClCommandQueue.cc
@@ -0,0 +1,359 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "ClCommandQueue.h"
+
+#include <algorithm>
+#include <map>
+#include <string>
+#include <vector>
+#include <limits>
+
+#include "absl/strings/str_cat.h"
+#include "ClDevice.h"
+#include "ClEvent.h"
+#include "Util.h"
+#include "Types.h"
+#include "Status.h"
+
+namespace onert
+{
+namespace backend
+{
+namespace gpu_cl
+{
+
+CLCommandQueue::CLCommandQueue(cl_command_queue queue, bool has_ownership)
+ : queue_(queue), has_ownership_(has_ownership)
+{
+}
+
+CLCommandQueue::CLCommandQueue(CLCommandQueue &&queue)
+ : queue_(queue.queue_), has_ownership_(queue.has_ownership_)
+{
+ queue.queue_ = nullptr;
+}
+
+CLCommandQueue &CLCommandQueue::operator=(CLCommandQueue &&queue)
+{
+ if (this != &queue)
+ {
+ Release();
+ std::swap(queue_, queue.queue_);
+ has_ownership_ = queue.has_ownership_;
+ }
+ return *this;
+}
+
+CLCommandQueue::~CLCommandQueue() { Release(); }
+
+void CLCommandQueue::Release()
+{
+ if (has_ownership_ && queue_)
+ {
+ clReleaseCommandQueue(queue_);
+ queue_ = nullptr;
+ }
+}
+
+absl::Status CLCommandQueue::Dispatch(const CLKernel &kernel, const int3 &work_groups_count,
+ const int3 &work_group_size, CLEvent *event)
+{
+ std::vector<size_t> local(3);
+ std::vector<size_t> global(3);
+ for (int i = 0; i < 3; ++i)
+ {
+ local[i] = work_group_size[i];
+ global[i] = work_groups_count[i] * work_group_size[i];
+ }
+ cl_event resulting_event;
+ const int error_code =
+ clEnqueueNDRangeKernel(queue_, kernel.kernel(), 3, nullptr, global.data(), local.data(), 0,
+ nullptr, event ? &resulting_event : nullptr);
+ if (event)
+ {
+ *event = CLEvent(resulting_event);
+ }
+ if (error_code != CL_SUCCESS)
+ {
+ return absl::UnknownError(
+ absl::StrCat("Failed to clEnqueueNDRangeKernel - ", CLErrorCodeToString(error_code)));
+ }
+ return absl::OkStatus();
+}
+
+absl::Status CLCommandQueue::Dispatch(const CLKernel &kernel, const int3 &work_groups_count,
+ const int3 &work_group_size)
+{
+ return Dispatch(kernel, work_groups_count, work_group_size, nullptr);
+}
+
+absl::Status CLCommandQueue::EnqueueEvent(CLEvent *event)
+{
+ cl_event resulting_event;
+ const int error_code = clEnqueueMarker(queue_, &resulting_event);
+ *event = CLEvent(resulting_event);
+ if (error_code != CL_SUCCESS)
+ {
+ return absl::UnknownError(
+ absl::StrCat("Failed to clEnqueueMarker - ", CLErrorCodeToString(error_code)));
+ }
+ return absl::OkStatus();
+}
+
+absl::Status CLCommandQueue::EnqueueWriteImage(cl_mem memory, int3 region, const void *data)
+{
+ const size_t origin[] = {0, 0, 0};
+ const size_t r[] = {static_cast<size_t>(region.x), static_cast<size_t>(region.y),
+ static_cast<size_t>(region.z)};
+ auto error_code =
+ clEnqueueWriteImage(queue_, memory, CL_TRUE, origin, r, 0, 0, data, 0, nullptr, nullptr);
+ if (error_code != CL_SUCCESS)
+ {
+ return absl::UnknownError(absl::StrCat("Failed to upload data to GPU (clEnqueueWriteImage) - ",
+ CLErrorCodeToString(error_code)));
+ }
+
+ return absl::OkStatus();
+}
+
+absl::Status CLCommandQueue::EnqueueReadImage(cl_mem memory, int3 region, void *data)
+{
+ const size_t origin[] = {0, 0, 0};
+ const size_t r[] = {static_cast<size_t>(region.x), static_cast<size_t>(region.y),
+ static_cast<size_t>(region.z)};
+ auto error_code =
+ clEnqueueReadImage(queue_, memory, CL_TRUE, origin, r, 0, 0, data, 0, nullptr, nullptr);
+ if (error_code != CL_SUCCESS)
+ {
+ return absl::UnknownError(absl::StrCat("Failed to read data from GPU (clEnqueueReadImage) - ",
+ CLErrorCodeToString(error_code)));
+ }
+
+ return absl::OkStatus();
+}
+
+absl::Status CLCommandQueue::EnqueueWriteBuffer(cl_mem memory, size_t size_in_bytes,
+ const void *data)
+{
+ auto error_code =
+ clEnqueueWriteBuffer(queue_, memory, CL_TRUE, 0, size_in_bytes, data, 0, nullptr, nullptr);
+ if (error_code != CL_SUCCESS)
+ {
+ return absl::UnknownError(absl::StrCat("Failed to upload data to GPU (clEnqueueWriteBuffer) - ",
+ CLErrorCodeToString(error_code)));
+ }
+ return absl::OkStatus();
+}
+
+absl::Status CLCommandQueue::EnqueueReadBuffer(cl_mem memory, size_t size_in_bytes, void *data)
+{
+ auto error_code =
+ clEnqueueReadBuffer(queue_, memory, CL_TRUE, 0, size_in_bytes, data, 0, nullptr, nullptr);
+ if (error_code != CL_SUCCESS)
+ {
+ return absl::UnknownError(absl::StrCat("Failed to read data from GPU (clEnqueueReadBuffer) - ",
+ CLErrorCodeToString(error_code)));
+ }
+ return absl::OkStatus();
+}
+
+absl::Status CLCommandQueue::WaitForCompletion()
+{
+ auto error_code = clFinish(queue_);
+ if (error_code != CL_SUCCESS)
+ {
+ return absl::UnknownError(
+ absl::StrCat("Failed to clFinish - ", CLErrorCodeToString(error_code)));
+ }
+ return absl::OkStatus();
+}
+
+ProfilingCommandQueue::ProfilingCommandQueue(cl_command_queue queue) : CLCommandQueue(queue, true)
+{
+ events_.reserve(128);
+}
+
+ProfilingCommandQueue::ProfilingCommandQueue(ProfilingCommandQueue &&queue)
+ : CLCommandQueue(std::move(queue)), events_(std::move(queue.events_)),
+ current_label_(std::move(queue.current_label_))
+{
+}
+
+ProfilingCommandQueue &ProfilingCommandQueue::operator=(ProfilingCommandQueue &&queue)
+{
+ if (this != &queue)
+ {
+ events_ = std::move(queue.events_);
+ current_label_ = std::move(queue.current_label_);
+ CLCommandQueue::operator=(std::move(queue));
+ }
+ return *this;
+}
+
+void ProfilingCommandQueue::SetEventsLabel(const std::string &name) { current_label_ = name; }
+
+void ProfilingCommandQueue::ResetMeasurements() { events_.clear(); }
+
+absl::Status ProfilingCommandQueue::Dispatch(const CLKernel &kernel, const int3 &work_groups_count,
+ const int3 &work_group_size)
+{
+ events_.push_back(CLEvent());
+ RETURN_IF_ERROR(CLCommandQueue::Dispatch(kernel, work_groups_count, work_group_size,
+ &events_[events_.size() - 1]));
+ events_.back().SetName(current_label_);
+ return absl::OkStatus();
+}
+
+absl::Status
+ProfilingCommandQueue::GetBestWorkGroupIndex(const CLKernel &kernel, const DeviceInfo &device_info,
+ const std::vector<int3> &work_groups_count,
+ const std::vector<int3> &work_group_sizes, int *index)
+{
+ // Some Adreno 3xx can have wrong numbers for some events
+ const bool possible_bug_with_events = device_info.IsAdreno3xx();
+ events_.resize(work_group_sizes.size());
+ for (size_t i = 0; i < work_group_sizes.size(); ++i)
+ {
+ RETURN_IF_ERROR(
+ CLCommandQueue::Dispatch(kernel, work_groups_count[i], work_group_sizes[i], &events_[i]));
+
+    // Periodically wait to slow down the memory leak on Mali for some kernels.
+ if (device_info.IsMali() && i % 8 == 7)
+ {
+ events_[i - 7].Wait();
+ }
+ if (possible_bug_with_events)
+ {
+      // Try to increase the probability of getting correct timings.
+ RETURN_IF_ERROR(WaitForCompletion());
+ }
+ }
+
+ RETURN_IF_ERROR(WaitForCompletion());
+
+ // To release memory of some kernel pool on Mali.
+ if (device_info.IsMali())
+ {
+ RETURN_IF_ERROR(kernel.ReInit());
+ }
+
+ int minimum_index = 0;
+ double minimum_time = std::numeric_limits<double>::max();
+ if (possible_bug_with_events)
+ { // we will try to cut out suspicious results
+ double average_time = 0.0;
+ int average_samples_count = 0;
+ for (size_t i = 0; i < work_group_sizes.size(); ++i)
+ {
+ if (events_[i].GetEventTimeMs() < 100 * 1000)
+ { // 100 sec
+ average_time += events_[i].GetEventTimeMs();
+ average_samples_count++;
+ }
+ }
+ if (average_samples_count == 0)
+ {
+      throw std::runtime_error("No valid profiling events; cannot compute the average time");
+ }
+ else
+ {
+ average_time /= average_samples_count;
+ }
+
+ for (size_t i = 0; i < work_group_sizes.size(); ++i)
+ {
+ double time = events_[i].GetEventTimeMs();
+ if (time < minimum_time && time >= 0.1 * average_time)
+ {
+ minimum_index = i;
+ minimum_time = time;
+ }
+ }
+ }
+ else
+ {
+ for (size_t i = 0; i < work_group_sizes.size(); ++i)
+ {
+ double time = events_[i].GetEventTimeMs();
+ if (time < minimum_time)
+ {
+ minimum_index = i;
+ minimum_time = time;
+ }
+ }
+ }
+
+ *index = minimum_index;
+
+ return absl::OkStatus();
+}
+
+absl::Status CreateCLCommandQueue(const CLDevice &device, const CLContext &context,
+ CLCommandQueue *result)
+{
+ int error_code;
+ cl_command_queue queue = clCreateCommandQueue(context.context(), device.id(), 0, &error_code);
+ if (!queue)
+ {
+ return absl::UnknownError(
+ absl::StrCat("Failed to create a command queue - ", CLErrorCodeToString(error_code)));
+ }
+ *result = CLCommandQueue(queue, true);
+ return absl::OkStatus();
+}
+
+double ProfilingCommandQueue::GetQueueExecutionTimeMs() const
+{
+ const uint64_t start = events_.front().GetStartedTimeNs();
+ const uint64_t end = events_.back().GetFinishedTimeNs();
+ const uint64_t time_ns = (end - start);
+
+ return static_cast<double>(time_ns) / 1000000.0;
+}
+
+double ProfilingCommandQueue::GetSumOfEventsTimeMs() const
+{
+ double sum = 0.0;
+ for (uint32_t i = 0; i < events_.size(); ++i)
+ {
+ sum += events_[i].GetEventTimeMs();
+ }
+ return sum;
+}
+
+absl::Status CreateProfilingCommandQueue(const CLDevice &device, const CLContext &context,
+ ProfilingCommandQueue *result)
+{
+ int error_code;
+ cl_command_queue queue =
+ clCreateCommandQueue(context.context(), device.id(), CL_QUEUE_PROFILING_ENABLE, &error_code);
+ if (!queue)
+ {
+ return absl::UnknownError(
+ absl::StrCat("Failed to create a command queue - ", CLErrorCodeToString(error_code)));
+ }
+
+ *result = ProfilingCommandQueue(queue);
+ return absl::OkStatus();
+}
+
+} // namespace gpu_cl
+} // namespace backend
+} // namespace onert
diff --git a/runtime/onert/backend/gpu_cl/open_cl/ClCommandQueue.h b/runtime/onert/backend/gpu_cl/open_cl/ClCommandQueue.h
new file mode 100644
index 000000000..81f93fd23
--- /dev/null
+++ b/runtime/onert/backend/gpu_cl/open_cl/ClCommandQueue.h
@@ -0,0 +1,157 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __ONERT_BACKEND_GPU_CL_OPENCL_CL_COMMAND_QUEUE_H__
+#define __ONERT_BACKEND_GPU_CL_OPENCL_CL_COMMAND_QUEUE_H__
+
+#include <cstdint>
+#include <string>
+#include <vector>
+
+#include "absl/time/time.h"
+#include "ClContext.h"
+#include "ClDevice.h"
+#include "ClEvent.h"
+#include "ClKernel.h"
+#include "OpenclWrapper.h"
+#include "Types.h"
+#include "Status.h"
+
+namespace onert
+{
+namespace backend
+{
+namespace gpu_cl
+{
+
+struct ProfilingInfo
+{
+ struct DispatchInfo
+ {
+ std::string label;
+ absl::Duration duration;
+ };
+
+ std::vector<DispatchInfo> dispatches;
+
+ absl::Duration GetTotalTime() const;
+
+  // Returns a report (string of lines delimited by \n).
+  // This method uses GPU counters and measures GPU time only.
+  // The report has the following structure:
+ // Per kernel timing(K kernels):
+ // conv2d 3.2ms
+ // ...
+ // --------------------
+ // Accumulated time per operation type:
+ // conv2d - 14.5ms
+ // ....
+ // --------------------
+ // Ideal total time: 23.4ms // Total time for all kernels
+ std::string GetDetailedReport() const;
+};
+
+// A wrapper around an OpenCL command queue
+class CLCommandQueue
+{
+public:
+ CLCommandQueue() {}
+ CLCommandQueue(cl_command_queue queue, bool has_ownership);
+
+ // Move only
+ CLCommandQueue(CLCommandQueue &&queue);
+ CLCommandQueue &operator=(CLCommandQueue &&queue);
+ CLCommandQueue(const CLCommandQueue &) = delete;
+ CLCommandQueue &operator=(const CLCommandQueue &) = delete;
+
+ virtual ~CLCommandQueue();
+
+ cl_command_queue queue() const { return queue_; }
+
+ virtual absl::Status Dispatch(const CLKernel &kernel, const int3 &work_groups_count,
+ const int3 &work_group_size);
+
+ absl::Status Dispatch(const CLKernel &kernel, const int3 &work_groups_count,
+ const int3 &work_group_size, CLEvent *event);
+
+ absl::Status EnqueueEvent(CLEvent *event);
+
+ absl::Status EnqueueWriteImage(cl_mem memory, int3 region, const void *data);
+ absl::Status EnqueueReadImage(cl_mem memory, int3 region, void *data);
+
+ absl::Status EnqueueWriteBuffer(cl_mem memory, size_t size_in_bytes, const void *data);
+ absl::Status EnqueueReadBuffer(cl_mem memory, size_t size_in_bytes, void *data);
+
+ absl::Status WaitForCompletion();
+
+protected:
+ void Release();
+
+ cl_command_queue queue_ = nullptr;
+ bool has_ownership_ = false;
+};
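+
+// A minimal usage sketch (illustrative only; `device`, `context` and `kernel`
+// are assumed to be valid objects, and int3 is assumed to provide an
+// (x, y, z) constructor):
+//
+//   CLCommandQueue queue;
+//   RETURN_IF_ERROR(CreateCLCommandQueue(device, context, &queue));
+//   RETURN_IF_ERROR(queue.Dispatch(kernel, int3(8, 8, 1) /* work groups */,
+//                                  int3(16, 16, 1) /* work group size */));
+//   RETURN_IF_ERROR(queue.WaitForCompletion());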
+
+class ProfilingCommandQueue : public CLCommandQueue
+{
+public:
+ ProfilingCommandQueue() {}
+ explicit ProfilingCommandQueue(cl_command_queue queue);
+
+ // Move only
+ ProfilingCommandQueue(ProfilingCommandQueue &&queue);
+ ProfilingCommandQueue &operator=(ProfilingCommandQueue &&queue);
+ ProfilingCommandQueue(const ProfilingCommandQueue &) = delete;
+ ProfilingCommandQueue &operator=(const ProfilingCommandQueue &) = delete;
+
+ absl::Status Dispatch(const CLKernel &kernel, const int3 &work_groups_count,
+ const int3 &work_group_size) override;
+
+  // Writes the index of the fastest work group among work_group_sizes.
+ absl::Status GetBestWorkGroupIndex(const CLKernel &kernel, const DeviceInfo &device_info,
+ const std::vector<int3> &work_groups_count,
+ const std::vector<int3> &work_group_sizes, int *index);
+
+  // Call ResetMeasurements() to start a new series of measurements.
+ void ResetMeasurements();
+
+ double GetQueueExecutionTimeMs() const;
+
+  // Unlike GetQueueExecutionTimeMs, this number does not include the time spent
+  // between kernels (kernel launches or preparation) on the GPU. It is usually
+  // 5-10% lower than GetQueueExecutionTimeMs, because that 5-10% is spent on
+  // something else (likely kernel launches or preparation).
+ double GetSumOfEventsTimeMs() const;
+
+ // This label will be used for all subsequent dispatches.
+ void SetEventsLabel(const std::string &name);
+
+private:
+ std::vector<CLEvent> events_;
+ std::string current_label_;
+};
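+
+// A minimal profiling sketch (illustrative only; assumes a queue obtained from
+// CreateProfilingCommandQueue below and an already prepared kernel):
+//
+//   profiling_queue.SetEventsLabel("conv2d");
+//   RETURN_IF_ERROR(profiling_queue.Dispatch(kernel, work_groups_count, work_group_size));
+//   RETURN_IF_ERROR(profiling_queue.WaitForCompletion());
+//   double queue_ms = profiling_queue.GetQueueExecutionTimeMs();
+//   double kernels_ms = profiling_queue.GetSumOfEventsTimeMs();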
+
+absl::Status CreateCLCommandQueue(const CLDevice &device, const CLContext &context,
+ CLCommandQueue *result);
+
+absl::Status CreateProfilingCommandQueue(const CLDevice &device, const CLContext &context,
+ ProfilingCommandQueue *result);
+
+} // namespace gpu_cl
+} // namespace backend
+} // namespace onert
+
+#endif // __ONERT_BACKEND_GPU_CL_OPENCL_CL_COMMAND_QUEUE_H__
diff --git a/runtime/onert/backend/gpu_cl/open_cl/ClContext.cc b/runtime/onert/backend/gpu_cl/open_cl/ClContext.cc
new file mode 100644
index 000000000..3289ff914
--- /dev/null
+++ b/runtime/onert/backend/gpu_cl/open_cl/ClContext.cc
@@ -0,0 +1,177 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "ClContext.h"
+
+#include "absl/strings/str_cat.h"
+#include "ClImageFormat.h"
+#include "Util.h"
+#include "Status.h"
+
+namespace onert
+{
+namespace backend
+{
+namespace gpu_cl
+{
+namespace
+{
+
+std::vector<cl_image_format> GetSupportedImage2DFormats(cl_context context, cl_mem_flags flags)
+{
+ cl_uint num_image_formats;
+ cl_int error = clGetSupportedImageFormats(context, flags, CL_MEM_OBJECT_IMAGE2D, 0, nullptr,
+ &num_image_formats);
+ if (error != CL_SUCCESS)
+ {
+ return {};
+ }
+
+ std::vector<cl_image_format> result(num_image_formats);
+ error = clGetSupportedImageFormats(context, flags, CL_MEM_OBJECT_IMAGE2D, num_image_formats,
+ &result[0], nullptr);
+ if (error != CL_SUCCESS)
+ {
+ return {};
+ }
+ return result;
+}
+
+bool IsEqualToImageFormat(cl_image_format image_format, DataType data_type, int num_channels)
+{
+ return image_format.image_channel_data_type == ToImageChannelType(data_type) &&
+ image_format.image_channel_order == ToChannelOrder(num_channels);
+}
+
+void AddSupportedImageFormats(cl_context context, DeviceInfo *info)
+{
+ auto supported_formats = GetSupportedImage2DFormats(context, CL_MEM_READ_WRITE);
+ for (auto format : supported_formats)
+ {
+ info->supports_r_f16_tex2d =
+ info->supports_r_f16_tex2d || IsEqualToImageFormat(format, DataType::FLOAT16, 1);
+ info->supports_rg_f16_tex2d =
+ info->supports_rg_f16_tex2d || IsEqualToImageFormat(format, DataType::FLOAT16, 2);
+ info->supports_rgb_f16_tex2d =
+ info->supports_rgb_f16_tex2d || IsEqualToImageFormat(format, DataType::FLOAT16, 3);
+ info->supports_rgba_f16_tex2d =
+ info->supports_rgba_f16_tex2d || IsEqualToImageFormat(format, DataType::FLOAT16, 4);
+ info->supports_r_f32_tex2d =
+ info->supports_r_f32_tex2d || IsEqualToImageFormat(format, DataType::FLOAT32, 1);
+ info->supports_rg_f32_tex2d =
+ info->supports_rg_f32_tex2d || IsEqualToImageFormat(format, DataType::FLOAT32, 2);
+ info->supports_rgb_f32_tex2d =
+ info->supports_rgb_f32_tex2d || IsEqualToImageFormat(format, DataType::FLOAT32, 3);
+ info->supports_rgba_f32_tex2d =
+ info->supports_rgba_f32_tex2d || IsEqualToImageFormat(format, DataType::FLOAT32, 4);
+ }
+}
+
+absl::Status CreateCLContext(const CLDevice &device, cl_context_properties *properties,
+ CLContext *result)
+{
+ int error_code;
+ cl_device_id device_id = device.id();
+ cl_context context = clCreateContext(properties, 1, &device_id, nullptr, nullptr, &error_code);
+ if (!context)
+ {
+ return absl::UnknownError(
+ absl::StrCat("Failed to create a compute context - ", CLErrorCodeToString(error_code)));
+ }
+ AddSupportedImageFormats(context, &device.info_);
+
+ *result = CLContext(context, true);
+ return absl::OkStatus();
+}
+
+} // namespace
+
+CLContext::CLContext(cl_context context, bool has_ownership)
+ : context_(context), has_ownership_(has_ownership)
+{
+}
+
+CLContext::CLContext(CLContext &&context)
+ : context_(context.context_), has_ownership_(context.has_ownership_)
+{
+ context.context_ = nullptr;
+}
+
+CLContext &CLContext::operator=(CLContext &&context)
+{
+ if (this != &context)
+ {
+ Release();
+ std::swap(context_, context.context_);
+ has_ownership_ = context.has_ownership_;
+ }
+ return *this;
+}
+
+CLContext::~CLContext() { Release(); }
+
+void CLContext::Release()
+{
+ if (has_ownership_ && context_)
+ {
+ clReleaseContext(context_);
+ context_ = nullptr;
+ }
+}
+
+bool CLContext::IsFloatTexture2DSupported(int num_channels, DataType data_type,
+ cl_mem_flags flags) const
+{
+ auto supported_formats = GetSupportedImage2DFormats(context_, flags);
+ for (auto format : supported_formats)
+ {
+ if (format.image_channel_data_type == ToImageChannelType(data_type) &&
+ format.image_channel_order == ToChannelOrder(num_channels))
+ {
+ return true;
+ }
+ }
+
+ return false;
+}
+
+absl::Status CreateCLContext(const CLDevice &device, CLContext *result)
+{
+ return CreateCLContext(device, nullptr, result);
+}
+
+absl::Status CreateCLGLContext(const CLDevice &device, cl_context_properties egl_context,
+ cl_context_properties egl_display, CLContext *result)
+{
+ if (!device.SupportsExtension("cl_khr_gl_sharing"))
+ {
+ return absl::UnavailableError("Device doesn't support CL-GL sharing.");
+ }
+ cl_context_properties platform = reinterpret_cast<cl_context_properties>(device.platform());
+ cl_context_properties props[] = {CL_GL_CONTEXT_KHR,
+ egl_context,
+ CL_EGL_DISPLAY_KHR,
+ egl_display,
+ CL_CONTEXT_PLATFORM,
+ platform,
+ 0};
+ return CreateCLContext(device, props, result);
+}
+
+} // namespace gpu_cl
+} // namespace backend
+} // namespace onert
diff --git a/runtime/onert/backend/gpu_cl/open_cl/ClContext.h b/runtime/onert/backend/gpu_cl/open_cl/ClContext.h
new file mode 100644
index 000000000..cf1d0d2d2
--- /dev/null
+++ b/runtime/onert/backend/gpu_cl/open_cl/ClContext.h
@@ -0,0 +1,68 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __ONERT_BACKEND_GPU_CL_OPENCL_CL_CONTEXT_H__
+#define __ONERT_BACKEND_GPU_CL_OPENCL_CL_CONTEXT_H__
+
+#include "ClDevice.h"
+#include "OpenclWrapper.h"
+#include "DataType.h"
+#include "Status.h"
+
+namespace onert
+{
+namespace backend
+{
+namespace gpu_cl
+{
+
+// A RAII wrapper around an OpenCL context
+class CLContext
+{
+public:
+ CLContext() {}
+ CLContext(cl_context context, bool has_ownership);
+
+ // Move only
+ CLContext(CLContext &&context);
+ CLContext &operator=(CLContext &&context);
+ CLContext(const CLContext &) = delete;
+ CLContext &operator=(const CLContext &) = delete;
+
+ ~CLContext();
+
+ cl_context context() const { return context_; }
+
+ bool IsFloatTexture2DSupported(int num_channels, DataType data_type,
+ cl_mem_flags flags = CL_MEM_READ_WRITE) const;
+
+private:
+ void Release();
+
+ cl_context context_ = nullptr;
+ bool has_ownership_ = false;
+};
+
+absl::Status CreateCLContext(const CLDevice &device, CLContext *result);
+absl::Status CreateCLGLContext(const CLDevice &device, cl_context_properties egl_context,
+ cl_context_properties egl_display, CLContext *result);
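+
+// A minimal usage sketch (illustrative only; `device` is assumed to come from
+// CreateDefaultGPUDevice):
+//
+//   CLContext context;
+//   RETURN_IF_ERROR(CreateCLContext(device, &context));
+//   if (context.IsFloatTexture2DSupported(4, DataType::FLOAT16))
+//   {
+//     // RGBA fp16 2D textures can be used for tensor storage.
+//   }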
+
+} // namespace gpu_cl
+} // namespace backend
+} // namespace onert
+
+#endif // __ONERT_BACKEND_GPU_CL_OPENCL_CL_CONTEXT_H__
diff --git a/runtime/onert/backend/gpu_cl/open_cl/ClDevice.cc b/runtime/onert/backend/gpu_cl/open_cl/ClDevice.cc
new file mode 100644
index 000000000..8dede139c
--- /dev/null
+++ b/runtime/onert/backend/gpu_cl/open_cl/ClDevice.cc
@@ -0,0 +1,448 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "ClDevice.h"
+
+#include <algorithm>
+#include <string>
+#include <vector>
+
+#include "Util.h"
+#include "Status.h"
+
+#include "absl/strings/numbers.h"
+#include "absl/strings/str_split.h"
+
+namespace onert
+{
+namespace backend
+{
+namespace gpu_cl
+{
+
+template <> std::string GetDeviceInfo<std::string>(cl_device_id id, cl_device_info info)
+{
+ size_t size;
+ cl_int error = clGetDeviceInfo(id, info, 0, nullptr, &size);
+ if (error != CL_SUCCESS)
+ {
+ return "";
+ }
+
+ std::string result(size - 1, 0);
+ error = clGetDeviceInfo(id, info, size, &result[0], nullptr);
+ if (error != CL_SUCCESS)
+ {
+ return "";
+ }
+ return result;
+}
+
+namespace
+{
+template <typename T> T GetPlatformInfo(cl_platform_id id, cl_platform_info info)
+{
+ T result;
+ cl_int error = clGetPlatformInfo(id, info, sizeof(T), &result, nullptr);
+ if (error != CL_SUCCESS)
+ {
+ return -1;
+ }
+ return result;
+}
+
+std::string GetPlatformInfo(cl_platform_id id, cl_platform_info info)
+{
+ size_t size;
+ cl_int error = clGetPlatformInfo(id, info, 0, nullptr, &size);
+ if (error != CL_SUCCESS)
+ {
+ return "";
+ }
+
+ std::string result(size - 1, 0);
+ error = clGetPlatformInfo(id, info, size, &result[0], nullptr);
+ if (error != CL_SUCCESS)
+ {
+ return "";
+ }
+ return result;
+}
+
+void GetDeviceWorkDimsSizes(cl_device_id id, int3 *result)
+{
+ int dims_count = GetDeviceInfo<cl_uint>(id, CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS);
+ if (dims_count < 3)
+ {
+ return;
+ }
+ std::vector<size_t> limits(dims_count);
+ cl_int error = clGetDeviceInfo(id, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(size_t) * dims_count,
+ limits.data(), nullptr);
+ if (error != CL_SUCCESS)
+ {
+ return;
+ }
+ // dims_count must be at least 3 according to spec
+ result->x = limits[0];
+ result->y = limits[1];
+ result->z = limits[2];
+}
+
+OpenCLVersion ParseCLVersion(const std::string &version)
+{
+ const auto first_dot_pos = version.find_first_of('.');
+ if (first_dot_pos == std::string::npos)
+ {
+ return OpenCLVersion::CL_1_0;
+ }
+ const int major = version[first_dot_pos - 1] - '0';
+ const int minor = version[first_dot_pos + 1] - '0';
+
+ if (major == 1)
+ {
+ if (minor == 2)
+ {
+ return OpenCLVersion::CL_1_2;
+ }
+ else if (minor == 1)
+ {
+ return OpenCLVersion::CL_1_1;
+ }
+ else
+ {
+ return OpenCLVersion::CL_1_0;
+ }
+ }
+ else if (major == 2)
+ {
+ if (minor == 2)
+ {
+ return OpenCLVersion::CL_2_2;
+ }
+ else if (minor == 1)
+ {
+ return OpenCLVersion::CL_2_1;
+ }
+ else
+ {
+ return OpenCLVersion::CL_2_0;
+ }
+ }
+ else if (major == 3)
+ {
+ return OpenCLVersion::CL_3_0;
+ }
+ else
+ {
+ return OpenCLVersion::CL_1_0;
+ }
+}
+
+Vendor ParseVendor(const std::string &device_name, const std::string &vendor_name)
+{
+ std::string d_name = device_name;
+ std::string v_name = vendor_name;
+ std::transform(d_name.begin(), d_name.end(), d_name.begin(), ::tolower);
+ std::transform(v_name.begin(), v_name.end(), v_name.begin(), ::tolower);
+ if (d_name.find("qualcomm") != std::string::npos || v_name.find("qualcomm") != std::string::npos)
+ {
+ return Vendor::kQualcomm;
+ }
+ else if (d_name.find("mali") != std::string::npos || v_name.find("mali") != std::string::npos)
+ {
+ return Vendor::kMali;
+ }
+ else if (d_name.find("power") != std::string::npos || v_name.find("power") != std::string::npos)
+ {
+ return Vendor::kPowerVR;
+ }
+ else if (d_name.find("nvidia") != std::string::npos || v_name.find("nvidia") != std::string::npos)
+ {
+ return Vendor::kNvidia;
+ }
+ else if (d_name.find("advanced micro devices") != std::string::npos ||
+ v_name.find("advanced micro devices") != std::string::npos)
+ {
+ return Vendor::kAMD;
+ }
+ else if (d_name.find("intel") != std::string::npos || v_name.find("intel") != std::string::npos)
+ {
+ return Vendor::kIntel;
+ }
+ else
+ {
+ return Vendor::kUnknown;
+ }
+}
+
+// Checks that gpu_version belongs to the range [min_version, max_version):
+// min_version is included and max_version is excluded.
+bool IsGPUVersionInRange(int gpu_version, int min_version, int max_version)
+{
+ return gpu_version >= min_version && gpu_version < max_version;
+}
+} // namespace
+
+DeviceInfo DeviceInfoFromDeviceID(cl_device_id id)
+{
+ DeviceInfo info;
+ const auto device_name = GetDeviceInfo<std::string>(id, CL_DEVICE_NAME);
+ const auto vendor_name = GetDeviceInfo<std::string>(id, CL_DEVICE_VENDOR);
+ const auto opencl_c_version = GetDeviceInfo<std::string>(id, CL_DEVICE_OPENCL_C_VERSION);
+ info.vendor = ParseVendor(device_name, vendor_name);
+ if (info.vendor == Vendor::kQualcomm)
+ {
+ info.adreno_info = AdrenoInfo(opencl_c_version);
+ }
+ else if (info.vendor == Vendor::kMali)
+ {
+ info.mali_info = MaliInfo(device_name);
+ }
+ info.cl_version = ParseCLVersion(opencl_c_version);
+ info.extensions = absl::StrSplit(GetDeviceInfo<std::string>(id, CL_DEVICE_EXTENSIONS), ' ');
+
+ info.supports_fp16 = false;
+ info.supports_image3d_writes = false;
+ for (const auto &ext : info.extensions)
+ {
+ if (ext == "cl_khr_fp16")
+ {
+ info.supports_fp16 = true;
+ }
+ if (ext == "cl_khr_3d_image_writes")
+ {
+ info.supports_image3d_writes = true;
+ }
+ }
+
+ cl_device_fp_config f32_config =
+ GetDeviceInfo<cl_device_fp_config>(id, CL_DEVICE_SINGLE_FP_CONFIG);
+ info.supports_fp32_rtn = f32_config & CL_FP_ROUND_TO_NEAREST;
+
+ if (info.supports_fp16)
+ {
+ cl_device_fp_config f16_config;
+ auto status = GetDeviceInfo<cl_device_fp_config>(id, CL_DEVICE_HALF_FP_CONFIG, &f16_config);
+ // AMD supports cl_khr_fp16 but CL_DEVICE_HALF_FP_CONFIG is empty.
+ if (status.ok() && info.vendor != Vendor::kAMD)
+ {
+ info.supports_fp16_rtn = f16_config & CL_FP_ROUND_TO_NEAREST;
+ }
+ else
+ { // happens on PowerVR
+ f16_config = f32_config;
+ info.supports_fp16_rtn = info.supports_fp32_rtn;
+ }
+ }
+ else
+ {
+ info.supports_fp16_rtn = false;
+ }
+
+ if (info.vendor == Vendor::kPowerVR && !info.supports_fp16)
+ {
+    // PowerVR doesn't have full fp16 support and so doesn't list this
+    // extension. But it does support fp16 in MADs and as buffer/texture types,
+    // so we use it anyway.
+ info.supports_fp16 = true;
+ info.supports_fp16_rtn = info.supports_fp32_rtn;
+ }
+
+ if (!info.supports_image3d_writes &&
+ ((info.vendor == Vendor::kQualcomm &&
+ IsGPUVersionInRange(info.adreno_info.gpu_version, 400, 500)) ||
+ info.vendor == Vendor::kNvidia))
+ {
+    // In local tests, Adreno 430 can write to 3D images, at least for small
+    // sizes, but it doesn't list cl_khr_3d_image_writes among its available
+    // extensions. The same holds for NVIDIA.
+ info.supports_image3d_writes = true;
+ }
+ info.compute_units_count = GetDeviceInfo<cl_uint>(id, CL_DEVICE_MAX_COMPUTE_UNITS);
+ info.image2d_max_width = GetDeviceInfo<size_t>(id, CL_DEVICE_IMAGE2D_MAX_WIDTH);
+ info.image2d_max_height = GetDeviceInfo<size_t>(id, CL_DEVICE_IMAGE2D_MAX_HEIGHT);
+ info.buffer_max_size = GetDeviceInfo<cl_ulong>(id, CL_DEVICE_MAX_MEM_ALLOC_SIZE);
+ if (info.cl_version >= OpenCLVersion::CL_1_2)
+ {
+ info.image_buffer_max_size = GetDeviceInfo<size_t>(id, CL_DEVICE_IMAGE_MAX_BUFFER_SIZE);
+ info.image_array_max_layers = GetDeviceInfo<size_t>(id, CL_DEVICE_IMAGE_MAX_ARRAY_SIZE);
+ }
+ info.image3d_max_width = GetDeviceInfo<size_t>(id, CL_DEVICE_IMAGE3D_MAX_WIDTH);
+  info.image3d_max_height = GetDeviceInfo<size_t>(id, CL_DEVICE_IMAGE3D_MAX_HEIGHT);
+ info.image3d_max_depth = GetDeviceInfo<size_t>(id, CL_DEVICE_IMAGE3D_MAX_DEPTH);
+ int3 max_work_group_sizes;
+ GetDeviceWorkDimsSizes(id, &max_work_group_sizes);
+ info.max_work_group_size_x = max_work_group_sizes.x;
+ info.max_work_group_size_y = max_work_group_sizes.y;
+ info.max_work_group_size_z = max_work_group_sizes.z;
+
+ if (info.IsIntel())
+ {
+ if (info.SupportsExtension("cl_intel_required_subgroup_size"))
+ {
+ size_t sub_groups_count;
+ cl_int status = clGetDeviceInfo(id, 0x4108 /*CL_DEVICE_SUB_GROUP_SIZES_INTEL*/, 0, nullptr,
+ &sub_groups_count);
+ if (status == CL_SUCCESS)
+ {
+ std::vector<size_t> sub_group_sizes(sub_groups_count);
+ status =
+ clGetDeviceInfo(id, 0x4108 /*CL_DEVICE_SUB_GROUP_SIZES_INTEL*/,
+ sizeof(size_t) * sub_groups_count, sub_group_sizes.data(), nullptr);
+ if (status == CL_SUCCESS)
+ {
+ for (size_t i = 0; i < sub_groups_count; ++i)
+ {
+ info.supported_subgroup_sizes.push_back(sub_group_sizes[i]);
+ }
+ }
+ }
+ }
+ }
+ return info;
+}
+
+CLDevice::CLDevice(cl_device_id id, cl_platform_id platform_id)
+ : info_(DeviceInfoFromDeviceID(id)), id_(id), platform_id_(platform_id)
+{
+}
+
+CLDevice::CLDevice(const CLDevice &device)
+ : info_(device.info_), id_(device.id_), platform_id_(device.platform_id_)
+{
+}
+
+CLDevice &CLDevice::operator=(const CLDevice &device)
+{
+ if (this != &device)
+ {
+ info_ = device.info_;
+ id_ = device.id_;
+ platform_id_ = device.platform_id_;
+ }
+ return *this;
+}
+
+CLDevice::CLDevice(CLDevice &&device)
+ : info_(std::move(device.info_)), id_(device.id_), platform_id_(device.platform_id_)
+{
+ device.id_ = nullptr;
+ device.platform_id_ = nullptr;
+}
+
+CLDevice &CLDevice::operator=(CLDevice &&device)
+{
+ if (this != &device)
+ {
+ id_ = nullptr;
+ platform_id_ = nullptr;
+ info_ = std::move(device.info_);
+ std::swap(id_, device.id_);
+ std::swap(platform_id_, device.platform_id_);
+ }
+ return *this;
+}
+
+bool CLDevice::SupportsFP16() const { return info_.supports_fp16; }
+
+bool CLDevice::SupportsExtension(const std::string &extension) const
+{
+ return info_.SupportsExtension(extension);
+}
+
+bool CLDevice::SupportsTextureArray() const { return info_.SupportsTextureArray(); }
+
+bool CLDevice::SupportsImageBuffer() const { return info_.SupportsImageBuffer(); }
+
+bool CLDevice::SupportsImage3D() const { return info_.SupportsImage3D(); }
+
+bool CLDevice::SupportsFP32RTN() const { return info_.supports_fp32_rtn; }
+
+bool CLDevice::SupportsFP16RTN() const { return info_.supports_fp16_rtn; }
+
+std::string CLDevice::GetPlatformVersion() const
+{
+ return GetPlatformInfo(platform_id_, CL_PLATFORM_VERSION);
+}
+
+bool CLDevice::IsCL20OrHigher() const { return info_.IsCL20OrHigher(); }
+
+bool CLDevice::SupportsSubGroupWithSize(int sub_group_size) const
+{
+ return info_.SupportsSubGroupWithSize(sub_group_size);
+}
+
+bool CLDevice::IsAdreno() const { return info_.IsAdreno(); }
+
+bool CLDevice::IsAdreno3xx() const { return info_.IsAdreno3xx(); }
+
+bool CLDevice::IsAdreno4xx() const { return info_.IsAdreno4xx(); }
+
+bool CLDevice::IsAdreno5xx() const { return info_.IsAdreno5xx(); }
+
+bool CLDevice::IsAdreno6xx() const { return info_.IsAdreno6xx(); }
+
+bool CLDevice::IsAdreno6xxOrHigher() const { return info_.IsAdreno6xxOrHigher(); }
+
+bool CLDevice::IsPowerVR() const { return info_.IsPowerVR(); }
+
+bool CLDevice::IsNvidia() const { return info_.IsNvidia(); }
+
+bool CLDevice::IsMali() const { return info_.IsMali(); }
+
+bool CLDevice::IsAMD() const { return info_.IsAMD(); }
+
+bool CLDevice::IsIntel() const { return info_.IsIntel(); }
+
+bool CLDevice::SupportsOneLayerTextureArray() const { return info_.SupportsOneLayerTextureArray(); }
+
+void CLDevice::DisableOneLayerTextureArray()
+{
+ info_.adreno_info.support_one_layer_texture_array = false;
+}
+
+absl::Status CreateDefaultGPUDevice(CLDevice *result)
+{
+ cl_uint num_platforms;
+ clGetPlatformIDs(0, nullptr, &num_platforms);
+ if (num_platforms == 0)
+ {
+ return absl::UnknownError("No supported OpenCL platform.");
+ }
+ std::vector<cl_platform_id> platforms(num_platforms);
+ clGetPlatformIDs(num_platforms, platforms.data(), nullptr);
+
+ cl_platform_id platform_id = platforms[0];
+ cl_uint num_devices;
+ clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_GPU, 0, nullptr, &num_devices);
+ if (num_devices == 0)
+ {
+ return absl::UnknownError("No GPU on current platform.");
+ }
+
+ std::vector<cl_device_id> devices(num_devices);
+ clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_GPU, num_devices, devices.data(), nullptr);
+
+ *result = CLDevice(devices[0], platform_id);
+ return absl::OkStatus();
+}
+
+} // namespace gpu_cl
+} // namespace backend
+} // namespace onert
diff --git a/runtime/onert/backend/gpu_cl/open_cl/ClDevice.h b/runtime/onert/backend/gpu_cl/open_cl/ClDevice.h
new file mode 100644
index 000000000..6e740fe97
--- /dev/null
+++ b/runtime/onert/backend/gpu_cl/open_cl/ClDevice.h
@@ -0,0 +1,119 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __ONERT_BACKEND_GPU_CL_OPENCL_CL_DEVICE_H__
+#define __ONERT_BACKEND_GPU_CL_OPENCL_CL_DEVICE_H__
+
+#include <string>
+#include <vector>
+
+#include "DeviceInfo.h"
+#include "OpenclWrapper.h"
+#include "Util.h"
+#include "Types.h"
+#include "Status.h"
+
+namespace onert
+{
+namespace backend
+{
+namespace gpu_cl
+{
+
+// A wrapper around an OpenCL device id
+class CLDevice
+{
+public:
+ CLDevice() = default;
+ CLDevice(cl_device_id id, cl_platform_id platform_id);
+
+ CLDevice(CLDevice &&device);
+ CLDevice &operator=(CLDevice &&device);
+ CLDevice(const CLDevice &);
+ CLDevice &operator=(const CLDevice &);
+
+ ~CLDevice() {}
+
+ cl_device_id id() const { return id_; }
+ cl_platform_id platform() const { return platform_id_; }
+ std::string GetPlatformVersion() const;
+
+ Vendor vendor() const { return info_.vendor; }
+ OpenCLVersion cl_version() const { return info_.cl_version; }
+ bool SupportsFP16() const;
+ bool SupportsTextureArray() const;
+ bool SupportsImageBuffer() const;
+ bool SupportsImage3D() const;
+ bool SupportsExtension(const std::string &extension) const;
+ bool SupportsFP32RTN() const;
+ bool SupportsFP16RTN() const;
+ bool IsCL20OrHigher() const;
+ bool SupportsSubGroupWithSize(int sub_group_size) const;
+ bool IsAdreno() const;
+ bool IsAdreno3xx() const;
+ bool IsAdreno4xx() const;
+ bool IsAdreno5xx() const;
+ bool IsAdreno6xx() const;
+ bool IsAdreno6xxOrHigher() const;
+ bool IsPowerVR() const;
+ bool IsNvidia() const;
+ bool IsMali() const;
+ bool IsAMD() const;
+ bool IsIntel() const;
+
+ // To track bug on some Adreno. b/131099086
+ bool SupportsOneLayerTextureArray() const;
+ void DisableOneLayerTextureArray();
+
+ const DeviceInfo &GetInfo() const { return info_; }
+  // We update the device info during context creation, because the supported
+  // texture formats can only be queried from a context.
+ mutable DeviceInfo info_;
+
+private:
+ cl_device_id id_ = nullptr;
+ cl_platform_id platform_id_ = nullptr;
+};
+
+absl::Status CreateDefaultGPUDevice(CLDevice *result);
+
+template <typename T> T GetDeviceInfo(cl_device_id id, cl_device_info info)
+{
+ T result;
+ cl_int error = clGetDeviceInfo(id, info, sizeof(T), &result, nullptr);
+ if (error != CL_SUCCESS)
+ {
+ return -1;
+ }
+ return result;
+}
+
+template <typename T> absl::Status GetDeviceInfo(cl_device_id id, cl_device_info info, T *result)
+{
+ cl_int error = clGetDeviceInfo(id, info, sizeof(T), result, nullptr);
+ if (error != CL_SUCCESS)
+ {
+ return absl::InvalidArgumentError(CLErrorCodeToString(error));
+ }
+ return absl::OkStatus();
+}
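+
+// A minimal usage sketch (illustrative only):
+//
+//   CLDevice device;
+//   RETURN_IF_ERROR(CreateDefaultGPUDevice(&device));
+//   if (device.IsMali() && device.SupportsFP16())
+//   {
+//     // e.g. prefer fp16 kernels on Mali.
+//   }
+//   cl_uint compute_units = GetDeviceInfo<cl_uint>(device.id(), CL_DEVICE_MAX_COMPUTE_UNITS);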
+
+} // namespace gpu_cl
+} // namespace backend
+} // namespace onert
+
+#endif // __ONERT_BACKEND_GPU_CL_OPENCL_CL_DEVICE_H__
diff --git a/runtime/onert/backend/gpu_cl/open_cl/ClErrors.h b/runtime/onert/backend/gpu_cl/open_cl/ClErrors.h
new file mode 100644
index 000000000..48cd2fb00
--- /dev/null
+++ b/runtime/onert/backend/gpu_cl/open_cl/ClErrors.h
@@ -0,0 +1,48 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __ONERT_BACKEND_GPU_CL_OPENCL_CL_ERRORS_H__
+#define __ONERT_BACKEND_GPU_CL_OPENCL_CL_ERRORS_H__
+
+#include <string>
+
+#include "Util.h"
+#include "Status.h"
+
+namespace onert
+{
+namespace backend
+{
+namespace gpu_cl
+{
+
+// @return OK status if error_code is CL_SUCCESS; otherwise an error status with
+// the error code translated into a message.
+inline absl::Status GetOpenCLError(cl_int error_code)
+{
+ if (error_code == CL_SUCCESS)
+ {
+ return absl::OkStatus();
+ }
+ return absl::InternalError("OpenCL error: " + CLErrorCodeToString(error_code));
+}
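+
+// A minimal usage sketch (illustrative only; `queue` is assumed to be a valid
+// cl_command_queue):
+//
+//   RETURN_IF_ERROR(GetOpenCLError(clFinish(queue)));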
+
+} // namespace gpu_cl
+} // namespace backend
+} // namespace onert
+
+#endif // __ONERT_BACKEND_GPU_CL_OPENCL_CL_ERRORS_H__
diff --git a/runtime/onert/backend/gpu_cl/open_cl/ClEvent.cc b/runtime/onert/backend/gpu_cl/open_cl/ClEvent.cc
new file mode 100644
index 000000000..beb64a9a8
--- /dev/null
+++ b/runtime/onert/backend/gpu_cl/open_cl/ClEvent.cc
@@ -0,0 +1,88 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "ClEvent.h"
+
+#include "OpenclWrapper.h"
+
+namespace onert
+{
+namespace backend
+{
+namespace gpu_cl
+{
+CLEvent::CLEvent(cl_event event) : event_(event) {}
+
+CLEvent::CLEvent(CLEvent &&event) : event_(event.event_), name_(std::move(event.name_))
+{
+ event.event_ = nullptr;
+}
+
+CLEvent &CLEvent::operator=(CLEvent &&event)
+{
+ if (this != &event)
+ {
+ Release();
+ std::swap(event_, event.event_);
+ name_ = std::move(event.name_);
+ }
+ return *this;
+}
+
+uint64_t CLEvent::GetStartedTimeNs() const
+{
+ cl_ulong time_ns;
+ clGetEventProfilingInfo(event_, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &time_ns, nullptr);
+ return time_ns;
+}
+
+uint64_t CLEvent::GetFinishedTimeNs() const
+{
+ cl_ulong time_ns;
+ clGetEventProfilingInfo(event_, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &time_ns, nullptr);
+ return time_ns;
+}
+
+double CLEvent::GetEventTimeMs() const
+{
+ const uint64_t start = GetStartedTimeNs();
+ const uint64_t end = GetFinishedTimeNs();
+ const uint64_t time_ns = (end - start);
+
+ return static_cast<double>(time_ns) * 1e-6;
+}
+
+uint64_t CLEvent::GetEventTimeNs() const { return GetFinishedTimeNs() - GetStartedTimeNs(); }
+
+void CLEvent::SetName(const std::string &name) { name_ = name; }
+
+void CLEvent::Wait() const { clWaitForEvents(1, &event_); }
+
+CLEvent::~CLEvent() { Release(); }
+
+void CLEvent::Release()
+{
+ if (event_)
+ {
+ clReleaseEvent(event_);
+ event_ = nullptr;
+ }
+}
+
+} // namespace gpu_cl
+} // namespace backend
+} // namespace onert
diff --git a/runtime/onert/backend/gpu_cl/open_cl/ClEvent.h b/runtime/onert/backend/gpu_cl/open_cl/ClEvent.h
new file mode 100644
index 000000000..265409ffe
--- /dev/null
+++ b/runtime/onert/backend/gpu_cl/open_cl/ClEvent.h
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __ONERT_BACKEND_GPU_CL_OPENCL_CL_EVENT_H__
+#define __ONERT_BACKEND_GPU_CL_OPENCL_CL_EVENT_H__
+
+#include <cstdint>
+#include <string>
+
+#include "OpenclWrapper.h"
+
+namespace onert
+{
+namespace backend
+{
+namespace gpu_cl
+{
+
+// A RAII wrapper around an OpenCL event
+class CLEvent
+{
+public:
+ CLEvent() {}
+ explicit CLEvent(cl_event event);
+
+ // Move only
+ CLEvent(CLEvent &&event);
+ CLEvent &operator=(CLEvent &&event);
+ CLEvent(const CLEvent &) = delete;
+ CLEvent &operator=(const CLEvent &) = delete;
+
+ ~CLEvent();
+
+ uint64_t GetStartedTimeNs() const;
+ uint64_t GetFinishedTimeNs() const;
+
+ double GetEventTimeMs() const;
+ uint64_t GetEventTimeNs() const;
+
+ void Wait() const;
+
+ cl_event event() const { return event_; }
+
+ bool is_valid() const { return event_ != nullptr; }
+
+ void SetName(const std::string &name);
+ std::string GetName() const { return name_; }
+
+private:
+ void Release();
+
+ cl_event event_ = nullptr;
+
+ std::string name_; // optional, for profiling mostly
+};
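+
+// A minimal usage sketch (illustrative only; timing queries are meaningful only
+// when the event comes from a queue created with CL_QUEUE_PROFILING_ENABLE):
+//
+//   CLEvent event;
+//   RETURN_IF_ERROR(queue.Dispatch(kernel, work_groups_count, work_group_size, &event));
+//   event.Wait();
+//   double elapsed_ms = event.GetEventTimeMs();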
+
+} // namespace gpu_cl
+} // namespace backend
+} // namespace onert
+
+#endif // __ONERT_BACKEND_GPU_CL_OPENCL_CL_EVENT_H__
diff --git a/runtime/onert/backend/gpu_cl/open_cl/ClImageFormat.cc b/runtime/onert/backend/gpu_cl/open_cl/ClImageFormat.cc
new file mode 100644
index 000000000..247a63d39
--- /dev/null
+++ b/runtime/onert/backend/gpu_cl/open_cl/ClImageFormat.cc
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "ClImageFormat.h"
+
+namespace onert
+{
+namespace backend
+{
+namespace gpu_cl
+{
+
+cl_channel_order ToChannelOrder(int num_channels)
+{
+ switch (num_channels)
+ {
+ case 1:
+ return CL_R;
+ case 2:
+ return CL_RG;
+ case 3:
+ return CL_RGB;
+ case 4:
+ return CL_RGBA;
+ default:
+ return -1;
+ }
+}
+
+cl_channel_type ToImageChannelType(DataType data_type)
+{
+ switch (data_type)
+ {
+ case DataType::FLOAT32:
+ return CL_FLOAT;
+ case DataType::FLOAT16:
+ return CL_HALF_FLOAT;
+ default:
+ return -1;
+ }
+}
+
+} // namespace gpu_cl
+} // namespace backend
+} // namespace onert
diff --git a/runtime/onert/backend/gpu_cl/open_cl/ClImageFormat.h b/runtime/onert/backend/gpu_cl/open_cl/ClImageFormat.h
new file mode 100644
index 000000000..a763746bd
--- /dev/null
+++ b/runtime/onert/backend/gpu_cl/open_cl/ClImageFormat.h
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __ONERT_BACKEND_GPU_CL_OPENCL_CL_IMAGE_FORMAT_H__
+#define __ONERT_BACKEND_GPU_CL_OPENCL_CL_IMAGE_FORMAT_H__
+
+#include "OpenclWrapper.h"
+#include "DataType.h"
+
+namespace onert
+{
+namespace backend
+{
+namespace gpu_cl
+{
+
+cl_channel_order ToChannelOrder(int num_channels);
+
+cl_channel_type ToImageChannelType(DataType data_type);
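+
+// A minimal usage sketch (illustrative only): filling a cl_image_format for an
+// RGBA fp16 2D texture.
+//
+//   cl_image_format format;
+//   format.image_channel_order = ToChannelOrder(4);                         // CL_RGBA
+//   format.image_channel_data_type = ToImageChannelType(DataType::FLOAT16); // CL_HALF_FLOAT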
+
+} // namespace gpu_cl
+} // namespace backend
+} // namespace onert
+
+#endif // __ONERT_BACKEND_GPU_CL_OPENCL_CL_IMAGE_FORMAT_H__
diff --git a/runtime/onert/backend/gpu_cl/open_cl/ClKernel.cc b/runtime/onert/backend/gpu_cl/open_cl/ClKernel.cc
new file mode 100644
index 000000000..f7745b9ac
--- /dev/null
+++ b/runtime/onert/backend/gpu_cl/open_cl/ClKernel.cc
@@ -0,0 +1,171 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "ClKernel.h"
+
+#include "absl/strings/str_cat.h"
+#include "ClProgram.h"
+#include "Util.h"
+#include "Status.h"
+
+namespace onert
+{
+namespace backend
+{
+namespace gpu_cl
+{
+namespace
+{
+
+absl::Status GetKernelMaxWorkGroupSize(cl_kernel kernel, cl_device_id device_id, int *result)
+{
+ size_t max_work_group_size;
+ cl_int error_code = clGetKernelWorkGroupInfo(kernel, device_id, CL_KERNEL_WORK_GROUP_SIZE,
+ sizeof(size_t), &max_work_group_size, nullptr);
+ if (error_code != CL_SUCCESS)
+ {
+ return absl::UnknownError(absl::StrCat("Failed to get info CL_KERNEL_WORK_GROUP_SIZE ",
+ CLErrorCodeToString(error_code)));
+ }
+ *result = static_cast<int>(max_work_group_size);
+ return absl::OkStatus();
+}
+
+absl::Status GetKernelPrivateMemorySize(cl_kernel kernel, cl_device_id device_id, int *result)
+{
+ cl_ulong private_mem_size;
+ cl_int error_code = clGetKernelWorkGroupInfo(kernel, device_id, CL_KERNEL_PRIVATE_MEM_SIZE,
+ sizeof(cl_ulong), &private_mem_size, nullptr);
+ if (error_code != CL_SUCCESS)
+ {
+ return absl::UnknownError(absl::StrCat("Failed to get info CL_KERNEL_PRIVATE_MEM_SIZE ",
+ CLErrorCodeToString(error_code)));
+ }
+ *result = static_cast<int>(private_mem_size);
+ return absl::OkStatus();
+}
+
+} // namespace
+
+CLKernel::CLKernel(CLKernel &&kernel)
+ : info_(kernel.info_), binding_counter_(kernel.binding_counter_),
+ function_name_(std::move(kernel.function_name_)), program_(kernel.program_),
+ kernel_(kernel.kernel_)
+{
+ kernel.kernel_ = nullptr;
+}
+
+CLKernel &CLKernel::operator=(CLKernel &&kernel)
+{
+ if (this != &kernel)
+ {
+ Release();
+ std::swap(info_, kernel.info_);
+ std::swap(binding_counter_, kernel.binding_counter_);
+ function_name_ = std::move(kernel.function_name_);
+ std::swap(program_, kernel.program_);
+ std::swap(kernel_, kernel.kernel_);
+ }
+ return *this;
+}
+
+CLKernel::~CLKernel() { Release(); }
+
+absl::Status CLKernel::ReInit() const
+{
+ clReleaseKernel(kernel_);
+ cl_kernel *kern_ptr = const_cast<cl_kernel *>(&kernel_);
+ int error_code;
+ *kern_ptr = clCreateKernel(program_, function_name_.c_str(), &error_code);
+ if (!kernel_ || error_code != CL_SUCCESS)
+ {
+ *kern_ptr = nullptr;
+ return absl::UnknownError(
+ absl::StrCat("Failed to create ", function_name_, CLErrorCodeToString(error_code)));
+ }
+ return absl::OkStatus();
+}
+
+void CLKernel::Release()
+{
+ if (kernel_)
+ {
+ clReleaseKernel(kernel_);
+ clReleaseProgram(program_);
+ kernel_ = nullptr;
+ }
+}
+
+absl::Status CLKernel::CreateFromProgram(const CLProgram &program, const std::string &function_name)
+{
+ int error_code;
+ function_name_ = function_name;
+ kernel_ = clCreateKernel(program.program(), function_name.c_str(), &error_code);
+ if (!kernel_ || error_code != CL_SUCCESS)
+ {
+ kernel_ = nullptr;
+ return absl::UnknownError(
+ absl::StrCat("Failed to create ", function_name, CLErrorCodeToString(error_code)));
+ }
+
+ program_ = program.program();
+ clRetainProgram(program_);
+
+ RETURN_IF_ERROR(
+ GetKernelPrivateMemorySize(kernel_, program.GetDeviceId(), &info_.private_memory_size));
+ RETURN_IF_ERROR(
+ GetKernelMaxWorkGroupSize(kernel_, program.GetDeviceId(), &info_.max_work_group_size));
+ return absl::OkStatus();
+}
+
+absl::Status CLKernel::SetMemory(int index, cl_mem memory)
+{
+ return SetBytes(index, &memory, sizeof(cl_mem));
+}
+
+absl::Status CLKernel::SetMemoryAuto(cl_mem memory)
+{
+ return SetBytesAuto(&memory, sizeof(cl_mem));
+}
+
+absl::Status CLKernel::SetBytes(int index, const void *ptr, int length) const
+{
+ const int error_code = clSetKernelArg(kernel_, index, length, ptr);
+ if (error_code != CL_SUCCESS)
+ {
+ return absl::UnknownError(
+ absl::StrCat("Failed to set kernel arguments - ", CLErrorCodeToString(error_code)));
+ }
+ return absl::OkStatus();
+}
+
+absl::Status CLKernel::SetBytesAuto(const void *ptr, int length)
+{
+ const int error_code = clSetKernelArg(kernel_, binding_counter_, length, ptr);
+ if (error_code != CL_SUCCESS)
+ {
+ return absl::UnknownError(absl::StrCat("Failed to set kernel arguments - ",
+ CLErrorCodeToString(error_code), "(at index - ",
+ binding_counter_, ")"));
+ }
+ binding_counter_++;
+ return absl::OkStatus();
+}
+
+} // namespace gpu_cl
+} // namespace backend
+} // namespace onert
diff --git a/runtime/onert/backend/gpu_cl/open_cl/ClKernel.h b/runtime/onert/backend/gpu_cl/open_cl/ClKernel.h
new file mode 100644
index 000000000..9575b7946
--- /dev/null
+++ b/runtime/onert/backend/gpu_cl/open_cl/ClKernel.h
@@ -0,0 +1,101 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __ONERT_BACKEND_GPU_CL_OPENCL_CL_KERNEL_H__
+#define __ONERT_BACKEND_GPU_CL_OPENCL_CL_KERNEL_H__
+
+#include <string>
+
+#include "ClContext.h"
+#include "ClDevice.h"
+#include "ClProgram.h"
+#include "OpenclWrapper.h"
+#include "Status.h"
+
+namespace onert
+{
+namespace backend
+{
+namespace gpu_cl
+{
+
+struct KernelInfo
+{
+ int private_memory_size = 0;
+ int max_work_group_size = 0;
+};
+
+// Arguments can be bound to a CLKernel either manually or automatically.
+// In manual mode you specify the binding index explicitly.
+// In automatic mode the index is auto-incremented with every binding call.
+// When using automatic mode, you must call ResetBindingCounter before
+// binding the parameters.
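+//
+// A minimal sketch of automatic binding (illustrative; `program`, `src_mem`,
+// `dst_mem`, and `elements` are assumed to exist elsewhere):
+//
+//   CLKernel kernel;
+//   RETURN_IF_ERROR(kernel.CreateFromProgram(program, "my_kernel"));
+//   kernel.ResetBindingCounter();
+//   RETURN_IF_ERROR(kernel.SetMemoryAuto(src_mem));  // arg 0
+//   RETURN_IF_ERROR(kernel.SetMemoryAuto(dst_mem));  // arg 1
+//   RETURN_IF_ERROR(kernel.SetBytesAuto(elements));  // arg 2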
+class CLKernel
+{
+public:
+ CLKernel() {}
+
+ // Move only
+ CLKernel(CLKernel &&kernel);
+ CLKernel &operator=(CLKernel &&kernel);
+ CLKernel(const CLKernel &) = delete;
+ CLKernel &operator=(const CLKernel &) = delete;
+
+ ~CLKernel();
+
+ cl_kernel kernel() const { return kernel_; }
+
+ absl::Status CreateFromProgram(const CLProgram &program, const std::string &function_name);
+
+ absl::Status SetMemory(int index, cl_mem memory);
+ absl::Status SetMemoryAuto(cl_mem memory);
+ template <typename T> absl::Status SetBytes(int index, const T &value) const
+ {
+ return SetBytes(index, static_cast<const void *>(&value), sizeof(T));
+ }
+ template <typename T> absl::Status SetBytesAuto(const T &value)
+ {
+ return SetBytesAuto(static_cast<const void *>(&value), sizeof(T));
+ }
+
+ int GetBindingCounter() const { return binding_counter_; }
+ void ResetBindingCounter() { binding_counter_ = 0; }
+
+  // Do not use this function directly; it is only a workaround
+  // for a Mali driver memory leak.
+ absl::Status ReInit() const;
+
+ KernelInfo info_;
+
+private:
+ void Release();
+ absl::Status SetBytes(int index, const void *ptr, int length) const;
+ absl::Status SetBytesAuto(const void *ptr, int length);
+
+ int binding_counter_ = -1;
+
+ std::string function_name_ = "";
+ // reference to program from which kernel was created
+ cl_program program_ = nullptr;
+ cl_kernel kernel_ = nullptr;
+};
+
+} // namespace gpu_cl
+} // namespace backend
+} // namespace onert
+
+#endif // __ONERT_BACKEND_GPU_CL_OPENCL_CL_KERNEL_H__
diff --git a/runtime/onert/backend/gpu_cl/open_cl/ClMemory.cc b/runtime/onert/backend/gpu_cl/open_cl/ClMemory.cc
new file mode 100644
index 000000000..fd3bc5579
--- /dev/null
+++ b/runtime/onert/backend/gpu_cl/open_cl/ClMemory.cc
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "ClMemory.h"
+
+namespace onert
+{
+namespace backend
+{
+namespace gpu_cl
+{
+
+cl_mem_flags ToClMemFlags(AccessType access_type)
+{
+ switch (access_type)
+ {
+ case AccessType::READ:
+ return CL_MEM_READ_ONLY;
+ case AccessType::WRITE:
+ return CL_MEM_WRITE_ONLY;
+ case AccessType::READ_WRITE:
+ return CL_MEM_READ_WRITE;
+ default:
+ throw std::runtime_error("Invalid AccessType");
+ }
+
+ return CL_MEM_READ_ONLY; // unreachable
+}
+
+} // namespace gpu_cl
+} // namespace backend
+} // namespace onert
diff --git a/runtime/onert/backend/gpu_cl/open_cl/ClMemory.h b/runtime/onert/backend/gpu_cl/open_cl/ClMemory.h
new file mode 100644
index 000000000..c704ec71f
--- /dev/null
+++ b/runtime/onert/backend/gpu_cl/open_cl/ClMemory.h
@@ -0,0 +1,100 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __ONERT_BACKEND_GPU_CL_OPENCL_CL_MEMORY_H__
+#define __ONERT_BACKEND_GPU_CL_OPENCL_CL_MEMORY_H__
+
+#include <algorithm>
+
+#include "OpenclWrapper.h"
+#include "AccessType.h"
+
+namespace onert
+{
+namespace backend
+{
+namespace gpu_cl
+{
+
+// RAII wrapper for OpenCL memory object.
+//
+// The wrapper is movable but not copyable.
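+//
+// A minimal usage sketch (illustrative; `context` and `size_in_bytes` are
+// assumptions, not defined in this file):
+//
+//   cl_int error_code;
+//   cl_mem buf = clCreateBuffer(context, ToClMemFlags(AccessType::READ_WRITE),
+//                               size_in_bytes, nullptr, &error_code);
+//   CLMemory memory(buf, /*has_ownership=*/true);  // Released automatically.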
+class CLMemory
+{
+public:
+ // Creates invalid object.
+ CLMemory() : CLMemory(nullptr, false) {}
+
+ CLMemory(cl_mem memory, bool has_ownership) : memory_(memory), has_ownership_(has_ownership) {}
+
+ // Move-only
+ CLMemory(const CLMemory &) = delete;
+ CLMemory &operator=(const CLMemory &) = delete;
+ CLMemory(CLMemory &&image) : memory_(image.memory_), has_ownership_(image.has_ownership_)
+ {
+ image.memory_ = nullptr;
+ }
+
+ ~CLMemory() { Invalidate(); }
+
+ CLMemory &operator=(CLMemory &&image)
+ {
+ if (this != &image)
+ {
+ Invalidate();
+ std::swap(memory_, image.memory_);
+ has_ownership_ = image.has_ownership_;
+ }
+ return *this;
+ }
+
+ cl_mem memory() const { return memory_; }
+
+ bool is_valid() const { return memory_ != nullptr; }
+
+  // @return true if this object actually owns the corresponding CL memory
+  // and manages its lifetime.
+ bool has_ownership() const { return has_ownership_; }
+
+ cl_mem Release()
+ {
+ cl_mem to_return = memory_;
+ memory_ = nullptr;
+ return to_return;
+ }
+
+private:
+ void Invalidate()
+ {
+ if (memory_ && has_ownership_)
+ {
+ clReleaseMemObject(memory_);
+ }
+ memory_ = nullptr;
+ }
+
+ cl_mem memory_ = nullptr;
+ bool has_ownership_ = false;
+};
+
+cl_mem_flags ToClMemFlags(AccessType access_type);
+
+} // namespace gpu_cl
+} // namespace backend
+} // namespace onert
+
+#endif // __ONERT_BACKEND_GPU_CL_OPENCL_CL_MEMORY_H__
diff --git a/runtime/onert/backend/gpu_cl/open_cl/ClProgram.cc b/runtime/onert/backend/gpu_cl/open_cl/ClProgram.cc
new file mode 100644
index 000000000..c72b01a73
--- /dev/null
+++ b/runtime/onert/backend/gpu_cl/open_cl/ClProgram.cc
@@ -0,0 +1,224 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "ClProgram.h"
+
+#include <cstdint>
+#include <cstring>
+#include <vector>
+
+#include "absl/strings/str_cat.h"
+#include "absl/types/span.h"
+#include "Util.h"
+#include "Status.h"
+
+namespace onert
+{
+namespace backend
+{
+namespace gpu_cl
+{
+namespace
+{
+
+std::string GetProgramBuildInfo(cl_program program, cl_device_id id, cl_program_build_info info)
+{
+ size_t size;
+ cl_int error_code = clGetProgramBuildInfo(program, id, info, 0, nullptr, &size);
+ if (error_code != CL_SUCCESS)
+ {
+ return absl::StrCat("Failed to GetProgramBuildInfo - ", CLErrorCodeToString(error_code));
+ }
+
+ std::string result(size - 1, 0);
+ error_code = clGetProgramBuildInfo(program, id, info, size, &result[0], nullptr);
+ if (error_code != CL_SUCCESS)
+ {
+ return absl::StrCat("Failed to GetProgramBuildInfo - ", CLErrorCodeToString(error_code));
+ }
+ return result;
+}
+
+absl::Status GetBinarySize(cl_program program, size_t *binary_size)
+{
+ cl_int error_code =
+ clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), binary_size, nullptr);
+ if (error_code != CL_SUCCESS)
+ {
+ return absl::UnknownError(
+ absl::StrCat("Failed to get program binary size - ", CLErrorCodeToString(error_code)));
+ }
+ return absl::OkStatus();
+}
+
+absl::Status BuildProgram(cl_program program, const CLDevice &device,
+ const std::string &compiler_options)
+{
+ const int error_code =
+ clBuildProgram(program, 0, nullptr, compiler_options.c_str(), nullptr, nullptr);
+ if (error_code != CL_SUCCESS)
+ {
+ return absl::UnknownError(
+ absl::StrCat("Failed to build program executable - ", CLErrorCodeToString(error_code),
+ GetProgramBuildInfo(program, device.id(), CL_PROGRAM_BUILD_LOG)));
+ }
+
+ return absl::OkStatus();
+}
+
+std::string CompilerOptionToString(const CLDevice &device, CompilerOptions option)
+{
+ switch (option)
+ {
+ case CompilerOptions::ADRENO_FULL_SIMD_LINE:
+ if (device.info_.adreno_info.gpu_version < 500)
+ {
+ return "-qcom-accelerate-16-bit";
+ }
+ else
+ {
+ return "-qcom-accelerate-16-bit=true";
+ }
+ case CompilerOptions::ADRENO_MORE_WAVES:
+ if (device.info_.adreno_info.gpu_version >= 500)
+ {
+ return "-qcom-accelerate-16-bit=false";
+ }
+ else
+ {
+ return "";
+ }
+ case CompilerOptions::POWERVR_FP16:
+ return "-cl-fast-relaxed-math";
+ case CompilerOptions::CL_OPT_DISABLE:
+ return "-cl-opt-disable";
+ case CompilerOptions::CL_2_0:
+ return "-cl-std=CL2.0";
+ case CompilerOptions::CL_3_0:
+ return "-cl-std=CL3.0";
+ }
+ return "";
+}
+
+} // namespace
+
+std::string CompilerOptionsToString(const CLDevice &device,
+ const std::vector<CompilerOptions> &compiler_options)
+{
+ std::string result;
+ for (auto option : compiler_options)
+ {
+ absl::StrAppend(&result, CompilerOptionToString(device, option), " ");
+ }
+ return result;
+}
+
+CLProgram::CLProgram(cl_program program, cl_device_id device_id)
+ : program_(program), device_id_(device_id)
+{
+}
+
+CLProgram::CLProgram(CLProgram &&program)
+ : program_(program.program_), device_id_(program.device_id_)
+{
+ program.program_ = nullptr;
+}
+
+CLProgram &CLProgram::operator=(CLProgram &&program)
+{
+ if (this != &program)
+ {
+ Release();
+ std::swap(program_, program.program_);
+ std::swap(device_id_, program.device_id_);
+ }
+ return *this;
+}
+
+CLProgram::~CLProgram() { Release(); }
+
+void CLProgram::Release()
+{
+ if (program_)
+ {
+ clReleaseProgram(program_);
+ program_ = nullptr;
+ }
+}
+
+absl::Status CLProgram::GetBinary(std::vector<uint8_t> *result) const
+{
+ size_t binary_size;
+ RETURN_IF_ERROR(GetBinarySize(program_, &binary_size));
+ result->resize(result->size() + binary_size);
+ uint8_t *binary_ptr = result->data() + result->size() - binary_size;
+ cl_int error_code =
+ clGetProgramInfo(program_, CL_PROGRAM_BINARIES, binary_size, &binary_ptr, nullptr);
+ if (error_code != CL_SUCCESS)
+ {
+ return absl::UnknownError(
+ absl::StrCat("Failed to get program binary - ", CLErrorCodeToString(error_code)));
+ }
+ return absl::OkStatus();
+}
+
+absl::Status CreateCLProgram(const std::string &code, const std::string &compiler_options,
+ const CLContext &context, const CLDevice &device, CLProgram *result)
+{
+ int error_code;
+ const char *source = code.c_str();
+
+ cl_program program =
+ clCreateProgramWithSource(context.context(), 1, &source, nullptr, &error_code);
+ if (!program || error_code != CL_SUCCESS)
+ {
+ return absl::UnknownError(
+ absl::StrCat("Failed to create compute program - ", CLErrorCodeToString(error_code)));
+ }
+
+ *result = CLProgram(program, device.id());
+ RETURN_IF_ERROR(BuildProgram(program, device, compiler_options));
+ return absl::OkStatus();
+}
+
+absl::Status CreateCLProgramFromBinary(const CLContext &context, const CLDevice &device,
+ absl::Span<const uint8_t> binary, CLProgram *result)
+{
+ cl_int binary_status;
+ cl_int error_code;
+ cl_device_id devices_list[] = {device.id()};
+ size_t binary_size = binary.size();
+ const uint8_t *binary_pointer = binary.data();
+ cl_program program = clCreateProgramWithBinary(context.context(), 1, devices_list, &binary_size,
+ &binary_pointer, &binary_status, &error_code);
+ if (binary_status != CL_SUCCESS)
+ {
+ return absl::UnknownError(absl::StrCat(
+ "Something wrong with binary after clCreateProgramWithBinary - ", binary_status));
+ }
+ if (error_code != CL_SUCCESS)
+ {
+ return absl::UnknownError(
+ absl::StrCat("Failed to create program - ", CLErrorCodeToString(error_code)));
+ }
+ *result = CLProgram(program, device.id());
+ return BuildProgram(program, device, "");
+}
+
+} // namespace gpu_cl
+} // namespace backend
+} // namespace onert
diff --git a/runtime/onert/backend/gpu_cl/open_cl/ClProgram.h b/runtime/onert/backend/gpu_cl/open_cl/ClProgram.h
new file mode 100644
index 000000000..d039ff698
--- /dev/null
+++ b/runtime/onert/backend/gpu_cl/open_cl/ClProgram.h
@@ -0,0 +1,98 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __ONERT_BACKEND_GPU_CL_OPENCL_CL_PROGRAM_H__
+#define __ONERT_BACKEND_GPU_CL_OPENCL_CL_PROGRAM_H__
+
+#include <cstdint>
+#include <vector>
+
+#include "ClContext.h"
+#include "ClDevice.h"
+#include "OpenclWrapper.h"
+
+namespace onert
+{
+namespace backend
+{
+namespace gpu_cl
+{
+
+enum class CompilerOptions
+{
+ // ADRENO_FULL_SIMD_LINE:
+  //   Adreno can have 2 SIMD sizes.
+  //   On Adreno 4xx/5xx it is 32/64, on Adreno 6xx it is 64/128.
+  //   Some of our algorithms actually rely on the exact size, for example on
+  //   the full SIMD size, so we need this option.
+  //   The option is actually -qcom-accelerate-16-bit, but it controls SIMD
+  //   size.
+ ADRENO_FULL_SIMD_LINE,
+ ADRENO_MORE_WAVES,
+ POWERVR_FP16,
+ CL_OPT_DISABLE,
+ CL_2_0,
+ CL_3_0,
+};
+
+std::string CompilerOptionsToString(const CLDevice &device,
+ const std::vector<CompilerOptions> &compiler_options);
+
+class CLProgram
+{
+public:
+ CLProgram() {}
+ CLProgram(cl_program program, cl_device_id device_id);
+
+ // Move only
+ CLProgram(CLProgram &&program);
+ CLProgram &operator=(CLProgram &&program);
+ CLProgram(const CLProgram &) = delete;
+ CLProgram &operator=(const CLProgram &) = delete;
+
+ ~CLProgram();
+
+ cl_program program() const { return program_; }
+
+  // Returns the cl_device_id associated with the program object.
+  // This can be the device associated with the context on which the program
+  // object was created, or the device that was specified when the program
+  // object was created using clCreateProgramWithBinary.
+ cl_device_id GetDeviceId() const { return device_id_; }
+
+ absl::Status GetBinary(std::vector<uint8_t> *result) const;
+
+private:
+ void Release();
+
+ cl_program program_ = nullptr;
+
+ // reference
+ cl_device_id device_id_ = nullptr;
+};
+
+absl::Status CreateCLProgram(const std::string &code, const std::string &compiler_options,
+ const CLContext &context, const CLDevice &device, CLProgram *result);
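+// A minimal sketch of compiling with explicit options (illustrative; `code`,
+// `context`, and `device` are assumptions, not defined in this header):
+//
+//   const std::string options = CompilerOptionsToString(
+//     device, {CompilerOptions::CL_2_0, CompilerOptions::CL_OPT_DISABLE});
+//   CLProgram program;
+//   RETURN_IF_ERROR(CreateCLProgram(code, options, context, device, &program));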
+
+absl::Status CreateCLProgramFromBinary(const CLContext &context, const CLDevice &device,
+ absl::Span<const uint8_t> binary, CLProgram *result);
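+// A minimal binary round-trip sketch, e.g. for on-disk caching (illustrative;
+// `program`, `context`, and `device` are assumptions):
+//
+//   std::vector<uint8_t> binary;
+//   RETURN_IF_ERROR(program.GetBinary(&binary));
+//   CLProgram restored;
+//   RETURN_IF_ERROR(CreateCLProgramFromBinary(context, device,
+//                                             absl::MakeConstSpan(binary),
+//                                             &restored));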
+
+} // namespace gpu_cl
+} // namespace backend
+} // namespace onert
+
+#endif // __ONERT_BACKEND_GPU_CL_OPENCL_CL_PROGRAM_H__
diff --git a/runtime/onert/backend/gpu_cl/open_cl/DataType.cc b/runtime/onert/backend/gpu_cl/open_cl/DataType.cc
new file mode 100644
index 000000000..ce2aa8298
--- /dev/null
+++ b/runtime/onert/backend/gpu_cl/open_cl/DataType.cc
@@ -0,0 +1,122 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "DataType.h"
+
+#include <stddef.h>
+#include <string>
+
+namespace onert
+{
+namespace backend
+{
+namespace gpu_cl
+{
+
+size_t SizeOf(DataType data_type)
+{
+ switch (data_type)
+ {
+ case DataType::UINT8:
+ case DataType::INT8:
+ return 1;
+ case DataType::FLOAT16:
+ case DataType::INT16:
+ case DataType::UINT16:
+ return 2;
+ case DataType::FLOAT32:
+ case DataType::INT32:
+ case DataType::UINT32:
+ return 4;
+ case DataType::FLOAT64:
+ case DataType::INT64:
+ case DataType::UINT64:
+ return 8;
+ case DataType::UNKNOWN:
+ return 0;
+ }
+ return 0;
+}
+
+std::string ToString(DataType data_type)
+{
+ switch (data_type)
+ {
+ case DataType::FLOAT16:
+ return "float16";
+ case DataType::FLOAT32:
+ return "float32";
+ case DataType::FLOAT64:
+ return "float64";
+ case DataType::INT16:
+ return "int16";
+ case DataType::INT32:
+ return "int32";
+ case DataType::INT64:
+ return "int64";
+ case DataType::INT8:
+ return "int8";
+ case DataType::UINT16:
+ return "uint16";
+ case DataType::UINT32:
+ return "uint32";
+ case DataType::UINT64:
+ return "uint64";
+ case DataType::UINT8:
+ return "uint8";
+ case DataType::UNKNOWN:
+ return "unknown";
+ }
+ return "undefined";
+}
+
+std::string ToCLDataType(DataType data_type, int vec_size)
+{
+ const std::string postfix = vec_size == 1 ? "" : std::to_string(vec_size);
+ switch (data_type)
+ {
+ case DataType::FLOAT16:
+ return "half" + postfix;
+ case DataType::FLOAT32:
+ return "float" + postfix;
+ case DataType::FLOAT64:
+ return "double" + postfix;
+ case DataType::INT16:
+ return "short" + postfix;
+ case DataType::INT32:
+ return "int" + postfix;
+ case DataType::INT64:
+ return "long" + postfix;
+ case DataType::INT8:
+ return "char" + postfix;
+ case DataType::UINT16:
+ return "ushort" + postfix;
+ case DataType::UINT32:
+ return "uint" + postfix;
+ case DataType::UINT64:
+ return "ulong" + postfix;
+ case DataType::UINT8:
+ return "uchar" + postfix;
+ case DataType::UNKNOWN:
+ return "unknown";
+ }
+ return "undefined";
+}
+
+} // namespace gpu_cl
+} // namespace backend
+} // namespace onert
diff --git a/runtime/onert/backend/gpu_cl/open_cl/DataType.h b/runtime/onert/backend/gpu_cl/open_cl/DataType.h
new file mode 100644
index 000000000..2a5afd551
--- /dev/null
+++ b/runtime/onert/backend/gpu_cl/open_cl/DataType.h
@@ -0,0 +1,57 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __ONERT_BACKEND_GPU_CL_OPENCL_DATA_TYPE_H__
+#define __ONERT_BACKEND_GPU_CL_OPENCL_DATA_TYPE_H__
+
+#include <stddef.h>
+#include <string>
+
+namespace onert
+{
+namespace backend
+{
+namespace gpu_cl
+{
+
+enum class DataType
+{
+ UNKNOWN = 0,
+ FLOAT16 = 1,
+ FLOAT32 = 2,
+ FLOAT64 = 3,
+ UINT8 = 4,
+ INT8 = 5,
+ UINT16 = 6,
+ INT16 = 7,
+ UINT32 = 8,
+ INT32 = 9,
+ UINT64 = 10,
+ INT64 = 11,
+};
+
+size_t SizeOf(DataType type);
+
+std::string ToString(DataType t);
+
+std::string ToCLDataType(DataType data_type, int vec_size = 1);
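+// For example (illustrative): ToCLDataType(DataType::FLOAT16, 4) yields "half4",
+// and ToCLDataType(DataType::FLOAT32) yields "float".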
+
+} // namespace gpu_cl
+} // namespace backend
+} // namespace onert
+
+#endif // __ONERT_BACKEND_GPU_CL_OPENCL_DATA_TYPE_H__
diff --git a/runtime/onert/backend/gpu_cl/open_cl/DeviceInfo.cc b/runtime/onert/backend/gpu_cl/open_cl/DeviceInfo.cc
new file mode 100644
index 000000000..2966fad75
--- /dev/null
+++ b/runtime/onert/backend/gpu_cl/open_cl/DeviceInfo.cc
@@ -0,0 +1,383 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "DeviceInfo.h"
+
+#include <algorithm>
+#include <map>
+#include <string>
+#include <vector>
+
+#include "absl/strings/numbers.h"
+#include "absl/strings/str_split.h"
+
+namespace onert
+{
+namespace backend
+{
+namespace gpu_cl
+{
+
+namespace
+{
+// Checks that gpu_version belongs to the range [min_version, max_version):
+// min_version is included and max_version is excluded.
+bool IsGPUVersionInRange(int gpu_version, int min_version, int max_version)
+{
+ return gpu_version >= min_version && gpu_version < max_version;
+}
+
+MaliGPU GetMaliGPUVersion(const std::string &device_name)
+{
+ const std::map<std::string, MaliGPU> kMapping = {
+ {"T604", MaliGPU::T604}, {"T622", MaliGPU::T622}, {"T624", MaliGPU::T624},
+ {"T628", MaliGPU::T628}, {"T658", MaliGPU::T658}, {"T678", MaliGPU::T678},
+ {"T720", MaliGPU::T720}, {"T760", MaliGPU::T760}, {"T820", MaliGPU::T820},
+ {"T830", MaliGPU::T830}, {"T860", MaliGPU::T860}, {"T880", MaliGPU::T880},
+ {"G31", MaliGPU::G31}, {"G51", MaliGPU::G51}, {"G71", MaliGPU::G71},
+ {"G52", MaliGPU::G52}, {"G72", MaliGPU::G72}, {"G76", MaliGPU::G76},
+ {"G57", MaliGPU::G57}, {"G77", MaliGPU::G77}, {"G68", MaliGPU::G68},
+ {"G78", MaliGPU::G78},
+ };
+ for (const auto &v : kMapping)
+ {
+ if (device_name.find(v.first) != std::string::npos)
+ {
+ return v.second;
+ }
+ }
+ return MaliGPU::UNKNOWN;
+}
+
+} // namespace
+
+// There is no rule for gpu version encoding, but we found these samples:
+// Version: OpenCL C 2.0 Adreno(TM) 540 // Pixel 2
+// Version: OpenCL C 2.0 Adreno(TM) 630 // Sony Compact XZ2
+// Version: OpenCL C 2.0 Adreno(TM) 630 // Pixel 3
+// Version: OpenCL C 2.0 Adreno(TM) 540 // Samsung S8
+// Version: OpenCL C 1.2 Adreno(TM) 430 // HTC One M9
+// Version: OpenCL C 2.0 Adreno(TM) 530 // Samsung S7 Edge
+// Version: OpenCL C 1.2 Adreno(TM) 405 // Motorola Moto G(4)
+// The string ends right after the version number.
+// It is assumed that the <vendor-specific information> for Adreno GPUs has
+// the following format:
+// <text?><space?>Adreno(TM)<space><text?><version>
+// Returns -1 if the vendor-specific information cannot be parsed.
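+// For example (illustrative): GetAdrenoGPUVersion("OpenCL C 2.0 Adreno(TM) 540")
+// returns 540.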
+int GetAdrenoGPUVersion(const std::string &gpu_version)
+{
+ const std::string gpu = absl::AsciiStrToLower(gpu_version);
+ const std::vector<absl::string_view> words = absl::StrSplit(gpu, ' ');
+ size_t i = 0;
+ for (; i < words.size(); ++i)
+ {
+ if (words[i].find("adreno") != words[i].npos)
+ {
+ break;
+ }
+ }
+ i += 1;
+ for (; i < words.size(); ++i)
+ {
+ int number;
+ bool is_number = absl::SimpleAtoi(words[i], &number);
+    // Adreno GPUs start from 2xx, but OpenCL support begins only at 3xx
+ if (is_number && number >= 300)
+ {
+ return number;
+ }
+ }
+ return -1;
+}
+
+std::string VendorToString(Vendor v)
+{
+ switch (v)
+ {
+ case Vendor::kQualcomm:
+ return "Qualcomm";
+ case Vendor::kMali:
+ return "Mali";
+ case Vendor::kPowerVR:
+ return "PowerVR";
+ case Vendor::kNvidia:
+ return "NVIDIA";
+ case Vendor::kAMD:
+ return "AMD";
+ case Vendor::kIntel:
+ return "Intel";
+ case Vendor::kUnknown:
+ return "unknown vendor";
+ default:
+ return "Error";
+ }
+}
+
+std::string OpenCLVersionToString(OpenCLVersion version)
+{
+ switch (version)
+ {
+ case OpenCLVersion::CL_1_0:
+ return "1.0";
+ case OpenCLVersion::CL_1_1:
+ return "1.1";
+ case OpenCLVersion::CL_1_2:
+ return "1.2";
+ case OpenCLVersion::CL_2_0:
+ return "2.0";
+ case OpenCLVersion::CL_2_1:
+ return "2.1";
+ case OpenCLVersion::CL_2_2:
+ return "2.2";
+ case OpenCLVersion::CL_3_0:
+ return "3.0";
+ default:
+ return "Error";
+ }
+}
+
+AdrenoInfo::AdrenoInfo(const std::string &device_version)
+ : gpu_version(GetAdrenoGPUVersion(device_version))
+{
+}
+
+int AdrenoInfo::GetMaximumWavesCount() const
+{
+ if (gpu_version < 400)
+ {
+ return -1; // Adreno 3xx does not support it currently
+ }
+ else if (gpu_version >= 400 && gpu_version < 500)
+ {
+ return -1; // Adreno 4xx does not support it currently
+ }
+ else if (gpu_version >= 500 && gpu_version < 600)
+ {
+ return -1; // Adreno 5xx does not support it currently
+ }
+ else if (gpu_version >= 600 && gpu_version < 700)
+ {
+ return gpu_version == 640 ? 30 : 16;
+ }
+ else
+ {
+    return -1; // Adreno 7xx and higher do not exist yet
+ }
+}
+
+int AdrenoInfo::GetRegisterMemorySizePerComputeUnit() const
+{
+ if (gpu_version < 400)
+ {
+ return -1; // Adreno 3xx does not support it currently
+ }
+ else if (gpu_version >= 400 && gpu_version < 500)
+ {
+ return -1; // Adreno 4xx does not support it currently
+ }
+ else if (gpu_version >= 500 && gpu_version < 600)
+ {
+ return -1; // Adreno 5xx does not support it currently
+ }
+ else if (gpu_version >= 600 && gpu_version < 700)
+ {
+ return gpu_version == 640 ? 128 * 144 * 16 : 128 * 96 * 16;
+ }
+ else
+ {
+    return -1; // Adreno 7xx and higher do not exist yet
+ }
+}
+
+int AdrenoInfo::GetMaximumWavesCount(int register_footprint_per_tread, bool full_wave) const
+{
+ const int register_usage_per_wave = GetWaveSize(full_wave) * register_footprint_per_tread;
+ const int possible_waves_count = GetRegisterMemorySizePerComputeUnit() / register_usage_per_wave;
+ return std::min(possible_waves_count, GetMaximumWavesCount());
+}
+
+int AdrenoInfo::GetWaveSize(bool full_wave) const
+{
+ if (gpu_version < 400)
+ {
+ return -1; // Adreno 3xx does not support it currently
+ }
+ else if (gpu_version < 600)
+ {
+ return full_wave ? 64 : 32;
+ }
+ else
+ {
+ return full_wave ? 128 : 64;
+ }
+}
+
+MaliInfo::MaliInfo(const std::string &device_name) : gpu_version(GetMaliGPUVersion(device_name)) {}
+
+bool MaliInfo::IsMaliT6xx() const
+{
+ return gpu_version == MaliGPU::T604 || gpu_version == MaliGPU::T622 ||
+ gpu_version == MaliGPU::T624 || gpu_version == MaliGPU::T628 ||
+ gpu_version == MaliGPU::T658 || gpu_version == MaliGPU::T678;
+}
+
+bool MaliInfo::IsMaliT7xx() const
+{
+ return gpu_version == MaliGPU::T720 || gpu_version == MaliGPU::T760;
+}
+
+bool MaliInfo::IsMaliT8xx() const
+{
+ return gpu_version == MaliGPU::T820 || gpu_version == MaliGPU::T830 ||
+ gpu_version == MaliGPU::T860 || gpu_version == MaliGPU::T880;
+}
+
+bool MaliInfo::IsMidgard() const { return IsMaliT6xx() || IsMaliT7xx() || IsMaliT8xx(); }
+
+bool MaliInfo::IsBifrostGen1() const
+{
+ return gpu_version == MaliGPU::G31 || gpu_version == MaliGPU::G51 || gpu_version == MaliGPU::G71;
+}
+
+bool MaliInfo::IsBifrostGen2() const
+{
+ return gpu_version == MaliGPU::G52 || gpu_version == MaliGPU::G72;
+}
+
+bool MaliInfo::IsBifrostGen3() const { return gpu_version == MaliGPU::G76; }
+
+bool MaliInfo::IsBifrost() const { return IsBifrostGen1() || IsBifrostGen2() || IsBifrostGen3(); }
+
+bool MaliInfo::IsValhall() const
+{
+ return gpu_version == MaliGPU::G57 || gpu_version == MaliGPU::G77 ||
+ gpu_version == MaliGPU::G68 || gpu_version == MaliGPU::G78;
+}
+
+bool DeviceInfo::SupportsTextureArray() const { return cl_version >= OpenCLVersion::CL_1_2; }
+
+bool DeviceInfo::SupportsImageBuffer() const { return cl_version >= OpenCLVersion::CL_1_2; }
+
+bool DeviceInfo::SupportsImage3D() const
+{
+ if (vendor == Vendor::kMali)
+ {
+ // On Mali T880 read_imageh doesn't compile with image3d_t
+ return false;
+ }
+ return supports_image3d_writes;
+}
+
+bool DeviceInfo::SupportsFloatImage2D(DataType data_type, int channels) const
+{
+ if (channels == 1)
+ {
+ return data_type == DataType::FLOAT32 ? supports_r_f32_tex2d : supports_r_f16_tex2d;
+ }
+ else if (channels == 2)
+ {
+ return data_type == DataType::FLOAT32 ? supports_rg_f32_tex2d : supports_rg_f16_tex2d;
+ }
+ else if (channels == 3)
+ {
+ return data_type == DataType::FLOAT32 ? supports_rgb_f32_tex2d : supports_rgb_f16_tex2d;
+ }
+ else if (channels == 4)
+ {
+ return data_type == DataType::FLOAT32 ? supports_rgba_f32_tex2d : supports_rgba_f16_tex2d;
+ }
+ else
+ {
+ return false;
+ }
+}
+
+bool DeviceInfo::SupportsOneLayerTextureArray() const
+{
+ return !IsAdreno() || adreno_info.support_one_layer_texture_array;
+}
+
+bool DeviceInfo::SupportsExtension(const std::string &extension) const
+{
+ for (const auto &ext : extensions)
+ {
+ if (ext == extension)
+ {
+ return true;
+ }
+ }
+ return false;
+}
+
+bool DeviceInfo::IsCL20OrHigher() const
+{
+ return cl_version != OpenCLVersion::CL_1_0 && cl_version != OpenCLVersion::CL_1_1 &&
+ cl_version != OpenCLVersion::CL_1_2;
+}
+
+bool DeviceInfo::SupportsSubGroupWithSize(int sub_group_size) const
+{
+ for (auto subgroup_size : supported_subgroup_sizes)
+ {
+ if (sub_group_size == subgroup_size)
+ {
+ return true;
+ }
+ }
+ return false;
+}
+
+bool DeviceInfo::IsAdreno() const { return vendor == Vendor::kQualcomm; }
+
+bool DeviceInfo::IsAdreno3xx() const
+{
+ return IsAdreno() && IsGPUVersionInRange(adreno_info.gpu_version, 300, 400);
+}
+
+bool DeviceInfo::IsAdreno4xx() const
+{
+ return IsAdreno() && IsGPUVersionInRange(adreno_info.gpu_version, 400, 500);
+}
+
+bool DeviceInfo::IsAdreno5xx() const
+{
+ return IsAdreno() && IsGPUVersionInRange(adreno_info.gpu_version, 500, 600);
+}
+
+bool DeviceInfo::IsAdreno6xx() const
+{
+ return IsAdreno() && IsGPUVersionInRange(adreno_info.gpu_version, 600, 700);
+}
+
+bool DeviceInfo::IsAdreno6xxOrHigher() const
+{
+ return IsAdreno() && adreno_info.gpu_version >= 600;
+}
+
+bool DeviceInfo::IsPowerVR() const { return vendor == Vendor::kPowerVR; }
+
+bool DeviceInfo::IsNvidia() const { return vendor == Vendor::kNvidia; }
+
+bool DeviceInfo::IsMali() const { return vendor == Vendor::kMali; }
+
+bool DeviceInfo::IsAMD() const { return vendor == Vendor::kAMD; }
+
+bool DeviceInfo::IsIntel() const { return vendor == Vendor::kIntel; }
+
+} // namespace gpu_cl
+} // namespace backend
+} // namespace onert
diff --git a/runtime/onert/backend/gpu_cl/open_cl/DeviceInfo.h b/runtime/onert/backend/gpu_cl/open_cl/DeviceInfo.h
new file mode 100644
index 000000000..85d7d4c80
--- /dev/null
+++ b/runtime/onert/backend/gpu_cl/open_cl/DeviceInfo.h
@@ -0,0 +1,203 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __ONERT_BACKEND_GPU_CL_OPENCL_DEVICE_INFO_H__
+#define __ONERT_BACKEND_GPU_CL_OPENCL_DEVICE_INFO_H__
+
+#include <string>
+#include <vector>
+
+#include "DataType.h"
+
+// For use only in DeviceInfo.cc, but kept here so that it can be tested.
+int GetAdrenoGPUVersion(const std::string &gpu_version);
+
+namespace onert
+{
+namespace backend
+{
+namespace gpu_cl
+{
+
+enum class Vendor
+{
+ kQualcomm,
+ kMali,
+ kPowerVR,
+ kNvidia,
+ kAMD,
+ kIntel,
+ kUnknown
+};
+std::string VendorToString(Vendor v);
+
+enum class OpenCLVersion
+{
+ UNKNOWN,
+ CL_1_0,
+ CL_1_1,
+ CL_1_2,
+ CL_2_0,
+ CL_2_1,
+ CL_2_2,
+ CL_3_0
+};
+std::string OpenCLVersionToString(OpenCLVersion version);
+
+struct AdrenoInfo
+{
+ AdrenoInfo() = default;
+ explicit AdrenoInfo(const std::string &device_version);
+ int gpu_version = -1; // can be, for example, 405/430/540/530/630 etc.
+
+  // This function returns a poorly documented physical parameter of
+  // Adreno 6xx GPUs.
+ // We obtained it using Snapdragon Profiler.
+ int GetMaximumWavesCount() const;
+
+  // Returns the amount of register memory per CU (Compute Unit) in bytes.
+  int GetRegisterMemorySizePerComputeUnit() const;
+
+  // Returns the maximum possible number of waves based on register usage.
+ int GetMaximumWavesCount(int register_footprint_per_tread, bool full_wave = true) const;
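+  // For example (illustrative), with a 128 * 96 * 16-byte register file (as on
+  // some Adreno 6xx parts), a full wave of 128 threads, and a footprint of
+  // 96 bytes per thread: min(196608 / (128 * 96), GetMaximumWavesCount()) = 16.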
+
+ int GetWaveSize(bool full_wave) const;
+
+ // Not supported on some Adreno devices with specific driver version.
+ // b/131099086
+ bool support_one_layer_texture_array = true;
+};
+
+enum class MaliGPU
+{
+ T604,
+ T622,
+ T624,
+ T628,
+ T658,
+ T678,
+ T720,
+ T760,
+ T820,
+ T830,
+ T860,
+ T880,
+ G31,
+ G51,
+ G71,
+ G52,
+ G72,
+ G76,
+ G57,
+ G77,
+ G68,
+ G78,
+ UNKNOWN
+};
+
+struct MaliInfo
+{
+ MaliInfo() = default;
+ explicit MaliInfo(const std::string &device_name);
+ MaliGPU gpu_version = MaliGPU::UNKNOWN;
+
+ bool IsMaliT6xx() const;
+ bool IsMaliT7xx() const;
+ bool IsMaliT8xx() const;
+ bool IsMidgard() const;
+ bool IsBifrostGen1() const;
+ bool IsBifrostGen2() const;
+ bool IsBifrostGen3() const;
+ bool IsBifrost() const;
+ bool IsValhall() const;
+};
+
+struct DeviceInfo
+{
+ DeviceInfo() = default;
+
+ bool IsAdreno() const;
+ bool IsAdreno3xx() const;
+ bool IsAdreno4xx() const;
+ bool IsAdreno5xx() const;
+ bool IsAdreno6xx() const;
+ bool IsAdreno6xxOrHigher() const;
+ bool IsPowerVR() const;
+ bool IsNvidia() const;
+ bool IsMali() const;
+ bool IsAMD() const;
+ bool IsIntel() const;
+
+ bool SupportsTextureArray() const;
+ bool SupportsImageBuffer() const;
+ bool SupportsImage3D() const;
+
+ bool SupportsFloatImage2D(DataType data_type, int channels) const;
+
+ // To track bug on some Adreno. b/131099086
+ bool SupportsOneLayerTextureArray() const;
+
+ bool SupportsExtension(const std::string &extension) const;
+ bool IsCL20OrHigher() const;
+ bool SupportsSubGroupWithSize(int sub_group_size) const;
+
+ std::vector<std::string> extensions;
+ bool supports_fp16 = false;
+ bool supports_image3d_writes = false;
+ Vendor vendor = Vendor::kUnknown;
+ OpenCLVersion cl_version = OpenCLVersion::UNKNOWN;
+ int compute_units_count = 0;
+ uint64_t buffer_max_size = 0;
+ uint64_t image2d_max_width = 0;
+ uint64_t image2d_max_height = 0;
+ uint64_t image_buffer_max_size = 0;
+ uint64_t image_array_max_layers = 0;
+ uint64_t image3d_max_width = 0;
+ uint64_t image3d_max_height = 0;
+ uint64_t image3d_max_depth = 0;
+ int max_work_group_size_x = 0;
+ int max_work_group_size_y = 0;
+ int max_work_group_size_z = 0;
+ std::vector<int> supported_subgroup_sizes;
+
+ // rtn is ROUND_TO_NEAREST
+  // With rtn, precision is much better than with rtz (ROUND_TO_ZERO).
+  // Adreno 3xx supports only rtz; Adreno 4xx and newer support rtn.
+  // Mali from T6xx supports rtn.
+  // PowerVR supports only rtz.
+ bool supports_fp32_rtn = false;
+ bool supports_fp16_rtn = false;
+
+ bool supports_r_f16_tex2d = false;
+ bool supports_rg_f16_tex2d = false;
+ bool supports_rgb_f16_tex2d = false;
+ bool supports_rgba_f16_tex2d = false;
+
+ bool supports_r_f32_tex2d = false;
+ bool supports_rg_f32_tex2d = false;
+ bool supports_rgb_f32_tex2d = false;
+ bool supports_rgba_f32_tex2d = false;
+
+ AdrenoInfo adreno_info;
+ MaliInfo mali_info;
+};
+
+} // namespace gpu_cl
+} // namespace backend
+} // namespace onert
+
+#endif // __ONERT_BACKEND_GPU_CL_OPENCL_DEVICE_INFO_H__
diff --git a/runtime/onert/backend/gpu_cl/open_cl/Environment.cc b/runtime/onert/backend/gpu_cl/open_cl/Environment.cc
new file mode 100644
index 000000000..b558f0377
--- /dev/null
+++ b/runtime/onert/backend/gpu_cl/open_cl/Environment.cc
@@ -0,0 +1,276 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Environment.h"
+
+#include <string>
+#include <vector>
+
+#include "Util.h"
+#include "Shape.h"
+#include "Status.h"
+
+namespace onert
+{
+namespace backend
+{
+namespace gpu_cl
+{
+
+Environment::Environment(CLDevice &&device, CLContext &&context, CLCommandQueue &&queue,
+ ProfilingCommandQueue &&profiling_queue)
+ : device_(std::move(device)), context_(std::move(context)), queue_(std::move(queue)),
+ profiling_queue_(std::move(profiling_queue))
+{
+}
+
+Environment::Environment(Environment &&environment)
+ : device_(std::move(environment.device_)), context_(std::move(environment.context_)),
+ queue_(std::move(environment.queue_)),
+ profiling_queue_(std::move(environment.profiling_queue_)),
+ program_cache_(std::move(environment.program_cache_))
+{
+}
+
+Environment &Environment::operator=(Environment &&environment)
+{
+ if (this != &environment)
+ {
+ device_ = std::move(environment.device_);
+ context_ = std::move(environment.context_);
+ queue_ = std::move(environment.queue_);
+ profiling_queue_ = std::move(environment.profiling_queue_);
+ program_cache_ = std::move(environment.program_cache_);
+ }
+ return *this;
+}
+
+absl::Status Environment::Init()
+{
+ if (device().IsAdreno() && device().SupportsTextureArray())
+ {
+    // Some Adreno GPUs < 600 have a bug with one-layer texture arrays. b/131099086
+    // If we have a one-layer texture array and write something from a kernel to
+    // this texture, we get zeroes instead of the actual values.
+    // The same kernel works if we use a texture array with more than one
+    // layer.
+ if (device().info_.adreno_info.gpu_version < 600)
+ {
+ GetDevicePtr()->DisableOneLayerTextureArray();
+ }
+ }
+ return absl::OkStatus();
+}
+
+void Environment::SetHighPerformance() const
+{
+ // TODO(sorokin) use cl_perf_hint if available
+}
+
+void Environment::SetDefaultPerformance() const
+{
+ // TODO(sorokin) use cl_perf_hint if available
+}
+
+void Environment::SetLowPerformance() const
+{
+ // TODO(sorokin) use cl_perf_hint if available
+}
+
+std::vector<CalculationsPrecision> Environment::GetSupportedPrecisions() const
+{
+ std::vector<CalculationsPrecision> precisions;
+ for (CalculationsPrecision precision :
+ {CalculationsPrecision::F32, CalculationsPrecision::F32_F16, CalculationsPrecision::F16})
+ {
+ if (IsSupported(precision))
+ {
+ precisions.push_back(precision);
+ }
+ }
+ return precisions;
+}
+
+bool Environment::IsSupported(CalculationsPrecision precision) const
+{
+ switch (precision)
+ {
+ case CalculationsPrecision::F32_F16:
+ case CalculationsPrecision::F16:
+ return device_.SupportsFP16();
+ case CalculationsPrecision::F32:
+ return true;
+ }
+ return false;
+}
+
+std::vector<TensorStorageType> Environment::GetSupportedStorages() const
+{
+ std::vector<TensorStorageType> storage_types;
+ for (auto storage_type :
+ {TensorStorageType::TEXTURE_2D, TensorStorageType::BUFFER, TensorStorageType::TEXTURE_ARRAY,
+ TensorStorageType::IMAGE_BUFFER, TensorStorageType::TEXTURE_3D})
+ {
+ if (IsSupported(storage_type))
+ {
+ storage_types.push_back(storage_type);
+ }
+ }
+ return storage_types;
+}
+
+std::vector<TensorStorageType> Environment::GetSupportedStoragesWithHWZeroClampSupport() const
+{
+ std::vector<TensorStorageType> storage_types;
+ for (auto storage_type : {TensorStorageType::TEXTURE_2D, TensorStorageType::TEXTURE_ARRAY,
+ TensorStorageType::TEXTURE_3D})
+ {
+ if (IsSupported(storage_type))
+ {
+ storage_types.push_back(storage_type);
+ }
+ }
+ return storage_types;
+}
+
+bool Environment::IsSupported(TensorStorageType storage_type) const
+{
+ switch (storage_type)
+ {
+ case TensorStorageType::TEXTURE_2D:
+ return !device_.IsAMD();
+ case TensorStorageType::BUFFER:
+ return true;
+ case TensorStorageType::TEXTURE_ARRAY:
+ return !device_.IsAMD() && device_.SupportsTextureArray();
+ case TensorStorageType::IMAGE_BUFFER:
+ return (device_.IsAdreno() || device_.IsAMD() || device_.IsNvidia()) &&
+ device_.SupportsImageBuffer();
+ case TensorStorageType::TEXTURE_3D:
+ return !device_.IsAMD() && device_.SupportsImage3D();
+ case TensorStorageType::SINGLE_TEXTURE_2D:
+ return false;
+ case TensorStorageType::UNKNOWN:
+ return false;
+ }
+ return false;
+}
+
+TensorStorageType GetFastestStorageType(const DeviceInfo &gpu_info)
+{
+ if (gpu_info.IsAdreno())
+ {
+ if (gpu_info.IsAdreno6xxOrHigher())
+ {
+ return TensorStorageType::TEXTURE_ARRAY;
+ }
+ else
+ {
+ return TensorStorageType::TEXTURE_2D;
+ }
+ }
+ else if (gpu_info.IsPowerVR())
+ {
+ return TensorStorageType::TEXTURE_2D;
+ }
+ else if (gpu_info.IsMali())
+ {
+ const MaliInfo mali_info = gpu_info.mali_info;
+ if (mali_info.IsMaliT8xx() || mali_info.IsBifrostGen3() || mali_info.IsValhall())
+ {
+ return TensorStorageType::TEXTURE_2D;
+ }
+ else
+ {
+ return TensorStorageType::BUFFER;
+ }
+ }
+ else if (gpu_info.IsNvidia())
+ {
+ return gpu_info.SupportsImageBuffer() ? TensorStorageType::IMAGE_BUFFER
+ : TensorStorageType::BUFFER;
+ }
+ else if (gpu_info.IsAMD())
+ {
+ return gpu_info.SupportsImageBuffer() ? TensorStorageType::IMAGE_BUFFER
+ : TensorStorageType::BUFFER;
+ }
+ else if (gpu_info.IsIntel())
+ {
+ return TensorStorageType::BUFFER;
+ }
+ return TensorStorageType::BUFFER;
+}
+
+TensorStorageType GetStorageTypeWithMinimalMemoryConsumption(const DeviceInfo &gpu_info)
+{
+ if (gpu_info.IsAdreno())
+ {
+ if (gpu_info.IsAdreno3xx() || gpu_info.IsAdreno4xx())
+ {
+ return TensorStorageType::BUFFER;
+ }
+ else
+ {
+ return TensorStorageType::IMAGE_BUFFER;
+ }
+ }
+ else if (gpu_info.IsPowerVR())
+ {
+ return TensorStorageType::BUFFER;
+ }
+ else if (gpu_info.IsMali())
+ {
+ return TensorStorageType::BUFFER;
+ }
+ else if (gpu_info.IsNvidia())
+ {
+ return gpu_info.SupportsImageBuffer() ? TensorStorageType::IMAGE_BUFFER
+ : TensorStorageType::BUFFER;
+ }
+ else if (gpu_info.IsAMD())
+ {
+ return gpu_info.SupportsImageBuffer() ? TensorStorageType::IMAGE_BUFFER
+ : TensorStorageType::BUFFER;
+ }
+ else if (gpu_info.IsIntel())
+ {
+ return TensorStorageType::BUFFER;
+ }
+ return TensorStorageType::BUFFER;
+}
+
+absl::Status CreateEnvironment(Environment *result)
+{
+ CLDevice gpu;
+ RETURN_IF_ERROR(CreateDefaultGPUDevice(&gpu));
+
+ CLContext context;
+ RETURN_IF_ERROR(CreateCLContext(gpu, &context));
+ CLCommandQueue queue;
+ RETURN_IF_ERROR(CreateCLCommandQueue(gpu, context, &queue));
+ ProfilingCommandQueue profiling_queue;
+ RETURN_IF_ERROR(CreateProfilingCommandQueue(gpu, context, &profiling_queue));
+
+ *result =
+ Environment(std::move(gpu), std::move(context), std::move(queue), std::move(profiling_queue));
+ return result->Init();
+}
+
+} // namespace gpu_cl
+} // namespace backend
+} // namespace onert
diff --git a/runtime/onert/backend/gpu_cl/open_cl/Environment.h b/runtime/onert/backend/gpu_cl/open_cl/Environment.h
new file mode 100644
index 000000000..47866b563
--- /dev/null
+++ b/runtime/onert/backend/gpu_cl/open_cl/Environment.h
@@ -0,0 +1,90 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __ONERT_BACKEND_GPU_CL_OPENCL_ENVIRONMENT_H__
+#define __ONERT_BACKEND_GPU_CL_OPENCL_ENVIRONMENT_H__
+
+#include "ClCommandQueue.h"
+#include "ClContext.h"
+#include "ClDevice.h"
+#include "DeviceInfo.h"
+#include "Precision.h"
+#include "TensorType.h"
+#include "DataType.h"
+#include "ProgramCache.h"
+#include "Status.h"
+
+namespace onert
+{
+namespace backend
+{
+namespace gpu_cl
+{
+
+class Environment
+{
+public:
+ Environment() = default;
+ explicit Environment(CLDevice &&device, CLContext &&context, CLCommandQueue &&queue,
+ ProfilingCommandQueue &&profiling_queue);
+ // Move only
+ Environment(Environment &&environment);
+ Environment &operator=(Environment &&environment);
+ Environment(const Environment &) = delete;
+ Environment &operator=(const Environment &) = delete;
+
+ const CLDevice &device() const { return device_; }
+ CLDevice *GetDevicePtr() { return &device_; }
+ const CLDevice *GetDevicePtr() const { return &device_; }
+ CLContext &context() { return context_; }
+ CLCommandQueue *queue() { return &queue_; }
+ ProfilingCommandQueue *profiling_queue() { return &profiling_queue_; }
+ ProgramCache *program_cache() { return &program_cache_; }
+ const ProgramCache *program_cache() const { return &program_cache_; }
+
+ std::vector<CalculationsPrecision> GetSupportedPrecisions() const;
+ bool IsSupported(CalculationsPrecision precision) const;
+ std::vector<TensorStorageType> GetSupportedStorages() const;
+  // Returns storage types that support zero clamping when reading out of
+  // bounds (OOB) in the HW (Height/Width) dimensions.
+ std::vector<TensorStorageType> GetSupportedStoragesWithHWZeroClampSupport() const;
+ bool IsSupported(TensorStorageType storage_type) const;
+
+ absl::Status Init();
+
+ void SetHighPerformance() const;
+ void SetDefaultPerformance() const;
+ void SetLowPerformance() const; // for energy saving
+
+private:
+ CLDevice device_;
+ CLContext context_;
+ CLCommandQueue queue_;
+ ProfilingCommandQueue profiling_queue_;
+ ProgramCache program_cache_;
+};
+
+TensorStorageType GetFastestStorageType(const DeviceInfo &gpu_info);
+TensorStorageType GetStorageTypeWithMinimalMemoryConsumption(const DeviceInfo &gpu_info);
+
+absl::Status CreateEnvironment(Environment *result);
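+// A minimal setup sketch (illustrative only):
+//
+//   Environment env;
+//   RETURN_IF_ERROR(CreateEnvironment(&env));
+//   if (env.IsSupported(CalculationsPrecision::F16))
+//   {
+//     // Prefer reduced-precision kernels where supported.
+//   }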
+
+} // namespace gpu_cl
+} // namespace backend
+} // namespace onert
+
+#endif // __ONERT_BACKEND_GPU_CL_OPENCL_ENVIRONMENT_H__
diff --git a/runtime/onert/backend/gpu_cl/open_cl/GpuObject.cc b/runtime/onert/backend/gpu_cl/open_cl/GpuObject.cc
new file mode 100644
index 000000000..774f8151f
--- /dev/null
+++ b/runtime/onert/backend/gpu_cl/open_cl/GpuObject.cc
@@ -0,0 +1,44 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "GpuObject.h"
+
+namespace onert
+{
+namespace backend
+{
+namespace gpu_cl
+{
+
+std::string MemoryTypeToCLType(MemoryType type)
+{
+ switch (type)
+ {
+ case MemoryType::GLOBAL:
+ return "__global";
+    case MemoryType::CONSTANT:
+      return "__constant";
+ case MemoryType::LOCAL:
+ return "__local";
+ }
+ return "";
+}
+
+} // namespace gpu_cl
+} // namespace backend
+} // namespace onert
diff --git a/runtime/onert/backend/gpu_cl/open_cl/GpuObject.h b/runtime/onert/backend/gpu_cl/open_cl/GpuObject.h
new file mode 100644
index 000000000..a31630235
--- /dev/null
+++ b/runtime/onert/backend/gpu_cl/open_cl/GpuObject.h
@@ -0,0 +1,222 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __ONERT_BACKEND_GPU_CL_OPENCL_GPU_OBJECT_H__
+#define __ONERT_BACKEND_GPU_CL_OPENCL_GPU_OBJECT_H__
+
+#include <map>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "ClContext.h"
+#include "OpenclWrapper.h"
+#include "AccessType.h"
+#include "DataType.h"
+
+namespace onert
+{
+namespace backend
+{
+namespace gpu_cl
+{
+
+struct GPUImage2DDescriptor
+{
+ DataType data_type = DataType::UNKNOWN;
+ AccessType access_type = AccessType::UNKNOWN;
+ cl_mem memory = nullptr;
+};
+
+struct GPUImage3DDescriptor
+{
+ DataType data_type = DataType::UNKNOWN;
+ AccessType access_type = AccessType::UNKNOWN;
+ cl_mem memory = nullptr;
+};
+
+struct GPUImage2DArrayDescriptor
+{
+ DataType data_type = DataType::UNKNOWN;
+ AccessType access_type = AccessType::UNKNOWN;
+ cl_mem memory = nullptr;
+};
+
+struct GPUImageBufferDescriptor
+{
+ DataType data_type = DataType::UNKNOWN;
+ AccessType access_type = AccessType::UNKNOWN;
+ cl_mem memory = nullptr;
+};
+
+struct GPUCustomMemoryDescriptor
+{
+ std::string type_name = "";
+ cl_mem memory = nullptr;
+};
+
+enum class MemoryType
+{
+ GLOBAL,
+ CONSTANT,
+ LOCAL
+};
+
+std::string MemoryTypeToCLType(MemoryType type);
+
+struct GPUBufferDescriptor
+{
+ DataType data_type = DataType::UNKNOWN;
+ AccessType access_type = AccessType::UNKNOWN;
+ int element_size = 0;
+ MemoryType memory_type = MemoryType::GLOBAL;
+ std::vector<std::string> attributes;
+ cl_mem memory = nullptr;
+};
+
+struct GPUResources
+{
+ std::vector<std::string> ints;
+ std::vector<std::string> floats;
+ std::vector<std::pair<std::string, GPUBufferDescriptor>> buffers;
+ std::vector<std::pair<std::string, GPUImage2DDescriptor>> images2d;
+ std::vector<std::pair<std::string, GPUImage2DArrayDescriptor>> image2d_arrays;
+ std::vector<std::pair<std::string, GPUImage3DDescriptor>> images3d;
+ std::vector<std::pair<std::string, GPUImageBufferDescriptor>> image_buffers;
+ std::vector<std::pair<std::string, GPUCustomMemoryDescriptor>> custom_memories;
+
+ std::vector<std::string> GetNames() const
+ {
+ std::vector<std::string> names = ints;
+ names.insert(names.end(), floats.begin(), floats.end());
+ for (const auto &obj : buffers)
+ {
+ names.push_back(obj.first);
+ }
+ for (const auto &obj : images2d)
+ {
+ names.push_back(obj.first);
+ }
+ for (const auto &obj : image2d_arrays)
+ {
+ names.push_back(obj.first);
+ }
+ for (const auto &obj : images3d)
+ {
+ names.push_back(obj.first);
+ }
+ for (const auto &obj : image_buffers)
+ {
+ names.push_back(obj.first);
+ }
+ for (const auto &obj : custom_memories)
+ {
+ names.push_back(obj.first);
+ }
+ return names;
+ }
+};
+
+struct GPUResourcesWithValue
+{
+ std::vector<std::pair<std::string, int>> ints;
+ std::vector<std::pair<std::string, float>> floats;
+ std::vector<std::pair<std::string, cl_mem>> buffers;
+ std::vector<std::pair<std::string, cl_mem>> images2d;
+ std::vector<std::pair<std::string, cl_mem>> image2d_arrays;
+ std::vector<std::pair<std::string, cl_mem>> images3d;
+ std::vector<std::pair<std::string, cl_mem>> image_buffers;
+ std::vector<std::pair<std::string, cl_mem>> custom_memories;
+};
+
+class GPUObject;
+
+class GPUObjectDescriptor
+{
+public:
+ GPUObjectDescriptor() = default;
+ GPUObjectDescriptor(const GPUObjectDescriptor &) = default;
+ GPUObjectDescriptor &operator=(const GPUObjectDescriptor &) = default;
+  GPUObjectDescriptor(GPUObjectDescriptor &&obj_desc)
+    : state_vars_(std::move(obj_desc.state_vars_)), access_type_(obj_desc.access_type_)
+  {
+  }
+  GPUObjectDescriptor &operator=(GPUObjectDescriptor &&obj_desc)
+  {
+    if (this != &obj_desc)
+    {
+      state_vars_ = std::move(obj_desc.state_vars_);
+      access_type_ = obj_desc.access_type_;
+    }
+    return *this;
+  }
+ virtual ~GPUObjectDescriptor() = default;
+
+ void SetStateVar(const std::string &key, const std::string &value) const
+ {
+ state_vars_[key] = value;
+ }
+
+ virtual std::string PerformConstExpr(const std::string &) const { return ""; }
+
+ virtual absl::Status PerformSelector(const std::string &, const std::vector<std::string> &,
+ const std::vector<std::string> &, std::string *result) const
+ {
+ *result = "";
+ return absl::OkStatus();
+ }
+ virtual GPUResources GetGPUResources() const { return GPUResources(); }
+
+ virtual absl::Status CreateGPUObject(CLContext *, std::unique_ptr<GPUObject> *) const
+ {
+ return absl::OkStatus();
+ }
+ virtual void Release() {}
+
+ void SetAccess(AccessType access_type) { access_type_ = access_type; }
+ AccessType GetAccess() const { return access_type_; }
+
+protected:
+ // friend flatbuffers::Offset<data::GPUObjectDescriptor> Encode(
+ // const GPUObjectDescriptor& desc, flatbuffers::FlatBufferBuilder* builder);
+ // friend void Decode(const data::GPUObjectDescriptor* fb_obj,
+ // GPUObjectDescriptor* obj);
+ mutable std::map<std::string, std::string> state_vars_;
+ AccessType access_type_ = AccessType::UNKNOWN;
+};
+
+using GPUObjectDescriptorPtr = std::unique_ptr<GPUObjectDescriptor>;
+
+class GPUObject
+{
+public:
+ GPUObject() = default;
+ // Move only
+ GPUObject(GPUObject &&obj_desc) = default;
+ GPUObject &operator=(GPUObject &&obj_desc) = default;
+ GPUObject(const GPUObject &) = delete;
+ GPUObject &operator=(const GPUObject &) = delete;
+ virtual ~GPUObject() = default;
+ virtual absl::Status GetGPUResources(const GPUObjectDescriptor *obj_ptr,
+ GPUResourcesWithValue *resources) const = 0;
+};
+
+using GPUObjectPtr = std::unique_ptr<GPUObject>;
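+
+// Illustrative sketch (not part of the original sources): a minimal descriptor /
+// object pair wired through the interfaces above. The names ExampleCounterDescriptor
+// and ExampleCounter are hypothetical; they only demonstrate the contract that a
+// descriptor declares resource *names* while the matching object supplies *values*.
+class ExampleCounterDescriptor : public GPUObjectDescriptor
+{
+public:
+  GPUResources GetGPUResources() const override
+  {
+    GPUResources resources;
+    resources.ints.push_back("counter"); // name referenced from generated kernel code
+    return resources;
+  }
+};
+
+class ExampleCounter : public GPUObject
+{
+public:
+  explicit ExampleCounter(int value) : value_(value) {}
+  absl::Status GetGPUResources(const GPUObjectDescriptor *,
+                               GPUResourcesWithValue *resources) const override
+  {
+    resources->ints.push_back({"counter", value_}); // value bound at argument-binding time
+    return absl::OkStatus();
+  }
+
+private:
+  int value_ = 0;
+};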
+
+} // namespace gpu_cl
+} // namespace backend
+} // namespace onert
+
+#endif // __ONERT_BACKEND_GPU_CL_OPENCL_GPU_OBJECT_H__
diff --git a/runtime/onert/backend/gpu_cl/open_cl/InferenceContext.cc b/runtime/onert/backend/gpu_cl/open_cl/InferenceContext.cc
new file mode 100644
index 000000000..afb7e2950
--- /dev/null
+++ b/runtime/onert/backend/gpu_cl/open_cl/InferenceContext.cc
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "InferenceContext.h"
+
+#include <algorithm>
+#include <cmath>
+#include <cstdint>
+#include <map>
+#include <memory>
+#include <string>
+#include <vector>
+#include <unordered_map>
+
+#include "Buffer.h"
+#include "ClDevice.h"
+
+#include "kernels/GpuOperation.h"
+#include "ModelHints.h"
+#include "Precision.h"
+#include "StorageTypeUtil.h"
+#include "TensorType.h"
+#include "DataType.h"
+#include "Model.h"
+#include "Operations.h"
+#include "Shape.h"
+#include "Types.h"
+#include "Util.h"
+
+namespace onert
+{
+namespace backend
+{
+namespace gpu_cl
+{
+
+CLNode::CLNode(CLNode &&node)
+ : operation(std::move(node.operation)), inputs(std::move(node.inputs)),
+ outputs(std::move(node.outputs)), name(std::move(node.name))
+{
+}
+
+CLNode &CLNode::operator=(CLNode &&node)
+{
+ if (this != &node)
+ {
+ operation = std::move(node.operation);
+ inputs = std::move(node.inputs);
+ outputs = std::move(node.outputs);
+ name = std::move(node.name);
+ }
+ return *this;
+}
+
+} // namespace gpu_cl
+} // namespace backend
+} // namespace onert
diff --git a/runtime/onert/backend/gpu_cl/open_cl/InferenceContext.h b/runtime/onert/backend/gpu_cl/open_cl/InferenceContext.h
new file mode 100644
index 000000000..ebe2c5313
--- /dev/null
+++ b/runtime/onert/backend/gpu_cl/open_cl/InferenceContext.h
@@ -0,0 +1,143 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __ONERT_BACKEND_GPU_CL_OPENCL_INFERENCE_CONTEXT_H__
+#define __ONERT_BACKEND_GPU_CL_OPENCL_INFERENCE_CONTEXT_H__
+
+#include <cstdint>
+#include <functional>
+#include <map>
+#include <memory>
+#include <vector>
+#include <unordered_map>
+
+#include "Buffer.h"
+#include "ClCommandQueue.h"
+#include "Environment.h"
+#include "GpuObject.h"
+#include "kernels/GpuOperation.h"
+#include "ModelHints.h"
+#include "OpenclWrapper.h"
+#include "Precision.h"
+#include "TensorType.h"
+#include "Model.h"
+#include "InternalTensor.h"
+
+namespace onert
+{
+namespace backend
+{
+namespace gpu_cl
+{
+
+struct CLNode
+{
+ std::unique_ptr<GPUOperation> operation;
+ std::vector<ValueId> inputs;
+ std::vector<ValueId> outputs;
+
+ // Mostly for debug purposes.
+ std::string name;
+
+ CLNode() = default;
+
+ CLNode(CLNode &&node);
+ CLNode &operator=(CLNode &&node);
+ CLNode(const CLNode &) = delete;
+ CLNode &operator=(const CLNode &) = delete;
+};
+
+class InferenceContext
+{
+public:
+ struct CreateInferenceInfo
+ {
+ CalculationsPrecision precision;
+ TensorStorageType storage_type;
+ ModelHints hints;
+ };
+
+ struct DummyTensor
+ {
+ BHWC shape;
+ TensorDescriptor descriptor;
+
+ bool operator==(const DummyTensor &b) const
+ {
+ return shape == b.shape && descriptor == b.descriptor;
+ }
+ };
+
+ class TensorReserver
+ {
+ public:
+    ValueId Add(std::shared_ptr<DummyTensor> dummy)
+ {
+ reservations_[next_] = std::move(dummy);
+ return next_++;
+ }
+    void Add(ValueId id, std::shared_ptr<DummyTensor> dummy)
+ {
+ reservations_[id] = std::move(dummy);
+ }
+ void SetNext(ValueId id) { next_ = id; }
+ bool HaveTensor(ValueId id) { return reservations_.find(id) != reservations_.end(); }
+ std::shared_ptr<DummyTensor> Get(ValueId id) { return reservations_[id]; }
+
+ std::vector<std::pair<ValueId, TensorDescriptor>> GetTensorDescs() const
+ {
+ std::vector<std::pair<ValueId, TensorDescriptor>> result;
+ for (auto &v : reservations_)
+ {
+ TensorDescriptor desc = v.second->descriptor;
+ desc.shape.b = v.second->shape.b;
+ desc.shape.h = v.second->shape.h;
+ desc.shape.w = v.second->shape.w;
+ desc.shape.d = 1;
+ desc.shape.c = v.second->shape.c;
+ result.push_back({v.first, desc});
+ }
+ return result;
+ }
+
+ void Add(const std::vector<std::pair<ValueId, TensorDescriptor>> &tensors)
+ {
+ for (auto &v : tensors)
+ {
+ auto dummy = std::make_shared<DummyTensor>();
+ dummy->descriptor = v.second;
+ dummy->shape.b = v.second.shape.b;
+ dummy->shape.h = v.second.shape.h;
+ dummy->shape.w = v.second.shape.w;
+ dummy->shape.c = v.second.shape.c;
+ Add(v.first, dummy);
+ }
+ }
+
+ private:
+ std::unordered_map<ValueId, std::shared_ptr<DummyTensor>> reservations_;
+ ValueId next_ = 0;
+ };
+
+private:
+};
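+
+// Illustrative sketch (not part of the original sources): registering one dummy
+// tensor with TensorReserver and reading back the (id, descriptor) pairs. The
+// 1x8x8x4 shape and the helper name are made up for illustration only.
+inline bool ExampleReserveTensor()
+{
+  InferenceContext::TensorReserver reserver;
+  auto dummy = std::make_shared<InferenceContext::DummyTensor>();
+  dummy->shape.b = 1;
+  dummy->shape.h = 8;
+  dummy->shape.w = 8;
+  dummy->shape.c = 4;
+  const ValueId id = reserver.Add(dummy); // ids are assigned sequentially, starting at 0
+  const auto descs = reserver.GetTensorDescs();
+  return reserver.HaveTensor(id) && descs.size() == 1;
+}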
+
+} // namespace gpu_cl
+} // namespace backend
+} // namespace onert
+
+#endif // __ONERT_BACKEND_GPU_CL_OPENCL_INFERENCE_CONTEXT_H__
diff --git a/runtime/onert/backend/gpu_cl/open_cl/InternalTensor.h b/runtime/onert/backend/gpu_cl/open_cl/InternalTensor.h
new file mode 100644
index 000000000..f0423db86
--- /dev/null
+++ b/runtime/onert/backend/gpu_cl/open_cl/InternalTensor.h
@@ -0,0 +1,106 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __ONERT_BACKEND_GPU_CL_OPENCL_INTERNAL_TENSOR_H__
+#define __ONERT_BACKEND_GPU_CL_OPENCL_INTERNAL_TENSOR_H__
+
+#include <stdint.h>
+
+#include <vector>
+
+#include "DataType.h"
+#include "Shape.h"
+
+namespace onert
+{
+namespace backend
+{
+namespace gpu_cl
+{
+namespace internal_tensor
+{
+
+// Meta-function that maps an element type to the container type used for tensor data.
+template <DataType Type> struct StorageType;
+
+template <> struct StorageType<DataType::FLOAT32>
+{
+ using value = std::vector<float>;
+};
+
+template <> struct StorageType<DataType::INT32>
+{
+ using value = std::vector<int32_t>;
+};
+
+} // namespace internal_tensor
+
+template <typename ShapeT, DataType Type> struct InternalTensor
+{
+ using ShapeType = ShapeT;
+
+ constexpr static DataType kType = Type;
+
+ using TensorStorageType = typename internal_tensor::StorageType<Type>::value;
+
+ // Opaque id of a tensor.
+ int64_t id = -1;
+
+ ShapeType shape;
+
+ TensorStorageType data;
+};
+
+// TensorRef is a reference to another tensor. If an object should never hold
+// tensor data, then TensorRef should be used instead.
+template <typename ShapeT> struct TensorRef
+{
+ using ShapeType = ShapeT;
+
+ DataType type = DataType::UNKNOWN;
+
+ ShapeT shape;
+
+ // Opaque reference to a tensor. Upstream component is responsible for
+ // resolving this reference into an actual tensor.
+ int64_t ref = -1;
+
+ // Specifies if the tensor should be a variable input tensor that must be an
+ // output as well as an input to the graph.
+ bool is_variable_input = false;
+};
+
+template <typename ShapeT, DataType Type> constexpr DataType InternalTensor<ShapeT, Type>::kType;
+
+template <typename ShapeT, DataType Type>
+InternalTensor<ShapeT, Type> MakeZeroTensor(const ShapeT &shape)
+{
+ InternalTensor<ShapeT, Type> tensor;
+ tensor.shape = shape;
+ tensor.data =
+ typename InternalTensor<ShapeT, Type>::TensorStorageType(shape.DimensionsProduct(), 0);
+ return tensor;
+}
+
+using TensorFloat32 = InternalTensor<BHWC, DataType::FLOAT32>;
+using Tensor5DFloat32 = InternalTensor<BHWDC, DataType::FLOAT32>;
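+
+// Illustrative sketch (not part of the original sources): building a zero-filled
+// host-side tensor. The 1x2x2x4 shape and the helper name are arbitrary.
+inline TensorFloat32 MakeExampleZeroTensor()
+{
+  BHWC shape;
+  shape.b = 1;
+  shape.h = 2;
+  shape.w = 2;
+  shape.c = 4;
+  return MakeZeroTensor<BHWC, DataType::FLOAT32>(shape); // data holds 1*2*2*4 = 16 zeros
+}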
+
+} // namespace gpu_cl
+} // namespace backend
+} // namespace onert
+
+#endif // __ONERT_BACKEND_GPU_CL_OPENCL_INTERNAL_TENSOR_H__
diff --git a/runtime/onert/backend/gpu_cl/open_cl/LinearStorage.cc b/runtime/onert/backend/gpu_cl/open_cl/LinearStorage.cc
new file mode 100644
index 000000000..3889d4369
--- /dev/null
+++ b/runtime/onert/backend/gpu_cl/open_cl/LinearStorage.cc
@@ -0,0 +1,265 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "LinearStorage.h"
+
+#include "absl/strings/str_cat.h"
+#include "DataType.h"
+#include "Status.h"
+
+namespace onert
+{
+namespace backend
+{
+namespace gpu_cl
+{
+
+TensorLinearDescriptor::TensorLinearDescriptor(TensorLinearDescriptor &&desc)
+ : GPUObjectDescriptor(std::move(desc)), storage_type(desc.storage_type),
+ element_type(desc.element_type), memory_type(desc.memory_type), size(desc.size),
+ data(std::move(desc.data))
+{
+}
+
+TensorLinearDescriptor &TensorLinearDescriptor::operator=(TensorLinearDescriptor &&desc)
+{
+ if (this != &desc)
+ {
+ std::swap(storage_type, desc.storage_type);
+ std::swap(element_type, desc.element_type);
+ std::swap(memory_type, desc.memory_type);
+ std::swap(size, desc.size);
+ data = std::move(desc.data);
+ GPUObjectDescriptor::operator=(std::move(desc));
+ }
+ return *this;
+}
+
+void TensorLinearDescriptor::Release() { data.clear(); }
+
+GPUResources TensorLinearDescriptor::GetGPUResources() const
+{
+ GPUResources resources;
+ resources.ints.push_back("length");
+ if (storage_type == LinearStorageType::BUFFER)
+ {
+ GPUBufferDescriptor desc;
+ desc.data_type = element_type;
+ desc.access_type = access_type_;
+ desc.element_size = 4;
+ desc.memory_type = memory_type;
+ resources.buffers.push_back({"buffer", desc});
+ }
+ else
+ {
+ GPUImage2DDescriptor desc;
+ desc.data_type = element_type;
+ desc.access_type = access_type_;
+ resources.images2d.push_back({"tex2d", desc});
+ }
+ return resources;
+}
+
+absl::Status TensorLinearDescriptor::PerformSelector(const std::string &selector,
+ const std::vector<std::string> &args,
+ const std::vector<std::string> &,
+ std::string *result) const
+{
+ if (selector == "Length")
+ {
+ *result = "length";
+ return absl::OkStatus();
+ }
+ else if (selector == "Read")
+ {
+ return PerformReadSelector(args, result);
+ }
+ else if (selector == "GetPtr")
+ {
+ if (storage_type != LinearStorageType::BUFFER)
+ {
+ return absl::InvalidArgumentError(
+ "GetPtr selector supported for LinearStorageType::BUFFER only.");
+ }
+ *result = "buffer";
+ return absl::OkStatus();
+ }
+ else
+ {
+ return absl::NotFoundError(
+ absl::StrCat("TensorLinearDescriptor don't have selector with name - ", selector));
+ }
+}
+
+absl::Status TensorLinearDescriptor::PerformReadSelector(const std::vector<std::string> &args,
+ std::string *result) const
+{
+ if (args.size() != 1)
+ {
+ return absl::NotFoundError(absl::StrCat(
+ "TensorLinearDescriptor Read require one argument, but ", args.size(), " was passed"));
+ }
+ if (storage_type == LinearStorageType::BUFFER)
+ {
+ *result = absl::StrCat("buffer[", args[0], "]");
+ return absl::OkStatus();
+ }
+ else
+ {
+ const std::string read = element_type == DataType::FLOAT16 ? "read_imageh" : "read_imagef";
+ *result = absl::StrCat(read, "(tex2d, smp_none, (int2)(", args[0], ", 0))");
+ return absl::OkStatus();
+ }
+}
+
+absl::Status TensorLinearDescriptor::CreateGPUObject(CLContext *context, GPUObjectPtr *result) const
+{
+ LinearStorage gpu_storage;
+ RETURN_IF_ERROR(gpu_storage.CreateFromTensorLinearDescriptor(*this, context));
+ *result = absl::make_unique<LinearStorage>(std::move(gpu_storage));
+ return absl::OkStatus();
+}
+
+void TensorLinearDescriptor::UploadLinearData(const InternalTensor<Linear, DataType::FLOAT32> &src,
+ int aligned_size)
+{
+ size = aligned_size == 0 ? DivideRoundUp(src.shape.v, 4) : aligned_size;
+ if (element_type == DataType::FLOAT32)
+ {
+ data.resize(size * sizeof(float) * 4);
+ float *gpu_data = reinterpret_cast<float *>(data.data());
+ for (int i = 0; i < size * 4; ++i)
+ {
+ if (i < src.shape.v)
+ {
+ gpu_data[i] = src.data[i];
+ }
+ else
+ {
+ gpu_data[i] = 0.0f;
+ }
+ }
+ }
+  // TODO: Support FLOAT16 as well. The commented-out block below sketches the
+  // intended half-precision path.
+  //
+ // else {
+ // data.resize(size * sizeof(half) * 4);
+ // half* gpu_data = reinterpret_cast<half*>(data.data());
+ // for (int i = 0; i < size * 4; ++i) {
+ // if (i < src.shape.v) {
+ // gpu_data[i] = src.data[i];
+ // } else {
+ // gpu_data[i] = 0.0f;
+ // }
+ // }
+ // }
+}
+
+void LinearStorage::Release()
+{
+ if (memory_)
+ {
+ clReleaseMemObject(memory_);
+ memory_ = nullptr;
+ }
+}
+
+LinearStorage::LinearStorage(LinearStorage &&storage)
+ : GPUObject(std::move(storage)), memory_(storage.memory_), depth_(storage.depth_),
+ storage_type_(storage.storage_type_)
+{
+ storage.memory_ = nullptr;
+}
+
+LinearStorage &LinearStorage::operator=(LinearStorage &&storage)
+{
+ if (this != &storage)
+ {
+ Release();
+ std::swap(memory_, storage.memory_);
+ std::swap(depth_, storage.depth_);
+ std::swap(storage_type_, storage.storage_type_);
+ GPUObject::operator=(std::move(storage));
+ }
+ return *this;
+}
+
+absl::Status LinearStorage::GetGPUResources(const GPUObjectDescriptor *obj_ptr,
+ GPUResourcesWithValue *resources) const
+{
+ const auto *linear_desc = dynamic_cast<const TensorLinearDescriptor *>(obj_ptr);
+ if (!linear_desc)
+ {
+ return absl::InvalidArgumentError("Expected TensorLinearDescriptor on input.");
+ }
+
+ resources->ints.push_back({"length", depth_});
+
+ if (storage_type_ == LinearStorageType::BUFFER)
+ {
+ resources->buffers.push_back({"buffer", memory_});
+ }
+ else
+ {
+ resources->images2d.push_back({"tex2d", memory_});
+ }
+
+ return absl::OkStatus();
+}
+
+absl::Status LinearStorage::CreateFromTensorLinearDescriptor(const TensorLinearDescriptor &desc,
+ CLContext *context)
+{
+ storage_type_ = desc.storage_type;
+ depth_ = desc.size;
+ uint8_t *data_ptr = desc.data.empty() ? nullptr : const_cast<unsigned char *>(desc.data.data());
+ if (storage_type_ == LinearStorageType::BUFFER)
+ {
+ bool read_only = desc.memory_type == MemoryType::CONSTANT;
+    // TODO: Support FLOAT16 as well. The commented-out element size below is the
+    // intended half-precision value.
+    //
+ // const int float4_size = desc.element_type == DataType::FLOAT32
+ // ? sizeof(float) * 4
+ // : sizeof(half) * 4;
+ const int float4_size = sizeof(float) * 4;
+ return CreateCLBuffer(context->context(), depth_ * float4_size, read_only, data_ptr, &memory_);
+ }
+ else
+ {
+ return CreateRGBAImage2D(context->context(), depth_, 1,
+ DataTypeToChannelType(desc.element_type), data_ptr, &memory_);
+ }
+}
+
+LinearStorageType DeduceLinearStorageType(TensorStorageType tensor_storage_type)
+{
+ if (tensor_storage_type == TensorStorageType::BUFFER)
+ {
+ return LinearStorageType::BUFFER;
+ }
+ else
+ {
+ return LinearStorageType::TEXTURE_2D;
+ }
+}
+
+} // namespace gpu_cl
+} // namespace backend
+} // namespace onert
diff --git a/runtime/onert/backend/gpu_cl/open_cl/LinearStorage.h b/runtime/onert/backend/gpu_cl/open_cl/LinearStorage.h
new file mode 100644
index 000000000..f6c3ac82f
--- /dev/null
+++ b/runtime/onert/backend/gpu_cl/open_cl/LinearStorage.h
@@ -0,0 +1,110 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __ONERT_BACKEND_GPU_CL_OPENCL_LINEAR_STORAGE_H__
+#define __ONERT_BACKEND_GPU_CL_OPENCL_LINEAR_STORAGE_H__
+
+#include <string>
+#include <utility>
+
+#include "absl/strings/str_cat.h"
+#include "absl/types/span.h"
+#include "GpuObject.h"
+#include "OpenclWrapper.h"
+#include "TensorType.h"
+#include "Util.h"
+#include "DataType.h"
+#include "Status.h"
+#include "Types.h"
+
+namespace onert
+{
+namespace backend
+{
+namespace gpu_cl
+{
+
+enum class LinearStorageType
+{
+ BUFFER,
+ TEXTURE_2D
+};
+
+struct TensorLinearDescriptor : public GPUObjectDescriptor
+{
+ LinearStorageType storage_type;
+ DataType element_type; // FLOAT32 or FLOAT16
+ MemoryType memory_type = MemoryType::GLOBAL; // applicable for BUFFER
+
+ // optional
+ int size = 0;
+ std::vector<uint8_t> data;
+
+ TensorLinearDescriptor() = default;
+ TensorLinearDescriptor(const TensorLinearDescriptor &) = default;
+ TensorLinearDescriptor &operator=(const TensorLinearDescriptor &) = default;
+ TensorLinearDescriptor(TensorLinearDescriptor &&desc);
+ TensorLinearDescriptor &operator=(TensorLinearDescriptor &&desc);
+
+ void UploadLinearData(const InternalTensor<Linear, DataType::FLOAT32> &src, int aligned_size = 0);
+
+ absl::Status PerformSelector(const std::string &selector, const std::vector<std::string> &args,
+ const std::vector<std::string> &template_args,
+ std::string *result) const override;
+
+ GPUResources GetGPUResources() const override;
+ absl::Status PerformReadSelector(const std::vector<std::string> &args, std::string *result) const;
+
+ absl::Status CreateGPUObject(CLContext *context, GPUObjectPtr *result) const override;
+ void Release() override;
+};
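+
+// Illustrative sketch (not part of the original sources): filling a
+// TensorLinearDescriptor from six host floats. CLContext creation and the call
+// to CreateGPUObject() are omitted; the values and the helper name are made up.
+inline TensorLinearDescriptor MakeExampleLinearDescriptor()
+{
+  InternalTensor<Linear, DataType::FLOAT32> src;
+  src.shape.v = 6;
+  src.data.assign(6, 1.0f);
+
+  TensorLinearDescriptor desc;
+  desc.storage_type = LinearStorageType::BUFFER;
+  desc.element_type = DataType::FLOAT32;
+  desc.memory_type = MemoryType::CONSTANT;
+  desc.UploadLinearData(src); // pads the six floats to two float4 elements (eight floats)
+  return desc;
+}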
+
+LinearStorageType DeduceLinearStorageType(TensorStorageType tensor_storage_type);
+
+// Represents a GPU 1D array of FLT4 (float4/half4) values that can be backed
+// by either a texture2d or a buffer.
+class LinearStorage : public GPUObject
+{
+public:
+ LinearStorage() {}
+ ~LinearStorage() override { Release(); }
+
+ // Move only
+ LinearStorage(LinearStorage &&storage);
+ LinearStorage &operator=(LinearStorage &&storage);
+ LinearStorage(const LinearStorage &) = delete;
+ LinearStorage &operator=(const LinearStorage &) = delete;
+
+ absl::Status GetGPUResources(const GPUObjectDescriptor *obj_ptr,
+ GPUResourcesWithValue *resources) const override;
+
+ absl::Status CreateFromTensorLinearDescriptor(const TensorLinearDescriptor &desc,
+ CLContext *context);
+
+private:
+ void Release();
+
+ cl_mem memory_ = nullptr;
+ int depth_;
+ LinearStorageType storage_type_;
+};
+
+} // namespace gpu_cl
+} // namespace backend
+} // namespace onert
+
+#endif // __ONERT_BACKEND_GPU_CL_OPENCL_LINEAR_STORAGE_H__
diff --git a/runtime/onert/backend/gpu_cl/open_cl/Model.h b/runtime/onert/backend/gpu_cl/open_cl/Model.h
new file mode 100644
index 000000000..f434bb22f
--- /dev/null
+++ b/runtime/onert/backend/gpu_cl/open_cl/Model.h
@@ -0,0 +1,56 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __ONERT_BACKEND_GPU_CL_OPENCL_MODEL_H__
+#define __ONERT_BACKEND_GPU_CL_OPENCL_MODEL_H__
+
+#include <string>
+
+#include "absl/types/any.h"
+#include "InternalTensor.h"
+
+namespace onert
+{
+namespace backend
+{
+namespace gpu_cl
+{
+
+// This is yet another representation of a CNN graph. Its primary purpose is to
+// simplify graph manipulation.
+
+using ValueId = uint32_t;
+
+// Used to emulate quantized behavior.
+struct QuantizationParams
+{
+ float min = 0;
+ float max = 0;
+ float scale = 0;
+};
+
+struct Operation
+{
+ std::string type;
+ absl::any attributes;
+};
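+
+// Illustrative sketch (not part of the original sources): how an Operation
+// carries operation-specific data through the type-erased `attributes` field.
+// ExamplePadAttributes and the "PAD_EXAMPLE" type string are hypothetical.
+struct ExamplePadAttributes
+{
+  int left;
+  int right;
+};
+
+inline Operation MakeExampleOperation()
+{
+  Operation op;
+  op.type = "PAD_EXAMPLE";
+  op.attributes = ExamplePadAttributes{1, 1}; // retrieved later via absl::any_cast
+  return op;
+}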
+
+} // namespace gpu_cl
+} // namespace backend
+} // namespace onert
+
+#endif // __ONERT_BACKEND_GPU_CL_OPENCL_MODEL_H__
diff --git a/runtime/onert/backend/gpu_cl/open_cl/ModelHints.h b/runtime/onert/backend/gpu_cl/open_cl/ModelHints.h
new file mode 100644
index 000000000..474c56b2a
--- /dev/null
+++ b/runtime/onert/backend/gpu_cl/open_cl/ModelHints.h
@@ -0,0 +1,67 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __ONERT_BACKEND_GPU_CL_OPENCL_MODEL_HINTS_H__
+#define __ONERT_BACKEND_GPU_CL_OPENCL_MODEL_HINTS_H__
+
+#include <cstdint>
+
+namespace onert
+{
+namespace backend
+{
+namespace gpu_cl
+{
+
+struct ModelHints
+{
+ using ModelHint = uint64_t;
+
+ // By default we want the fastest inference.
+ static constexpr ModelHint kFastestInference = 0x00000000;
+ // Can improve compilation time, but inference can be slower.
+ static constexpr ModelHint kReduceKernelsCount = 0x00000001;
+ // Can improve tuning time, but inference can be slower.
+ static constexpr ModelHint kFastTuning = 0x00000002;
+
+ // Experimental.
+  // Can improve performance and memory consumption, but slows down
+  // initialization a lot and creates more kernels.
+ static constexpr ModelHint kAllowSpecialKernels = 0x00000004;
+
+ void Add(ModelHint hint)
+ {
+ if (hint == kFastestInference)
+ {
+ hints = kFastestInference;
+ }
+ else
+ {
+ hints |= hint;
+ }
+ }
+
+ bool Check(ModelHint hint) const { return hints & hint; }
+
+ uint64_t hints = kFastestInference;
+};
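+
+// Illustrative sketch (not part of the original sources): combining hints and
+// querying them; the helper name is made up.
+inline bool ExampleWantsFastTuning()
+{
+  ModelHints hints;
+  hints.Add(ModelHints::kFastTuning);
+  hints.Add(ModelHints::kReduceKernelsCount);
+  return hints.Check(ModelHints::kFastTuning); // true: both hint bits are now set
+}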
+
+} // namespace gpu_cl
+} // namespace backend
+} // namespace onert
+
+#endif // __ONERT_BACKEND_GPU_CL_OPENCL_MODEL_HINTS_H__
diff --git a/runtime/onert/backend/gpu_cl/open_cl/OpenclWrapper.cc b/runtime/onert/backend/gpu_cl/open_cl/OpenclWrapper.cc
new file mode 100644
index 000000000..dbaf6faf6
--- /dev/null
+++ b/runtime/onert/backend/gpu_cl/open_cl/OpenclWrapper.cc
@@ -0,0 +1,407 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#if defined(_WIN32)
+#define __WINDOWS__
+#endif
+
+#include "OpenclWrapper.h"
+
+#ifdef __WINDOWS__
+#include <windows.h>
+#else
+#include <dlfcn.h>
+#endif
+
+#include <string>
+
+#include "absl/strings/str_cat.h"
+#include "Status.h"
+
+namespace onert
+{
+namespace backend
+{
+namespace gpu_cl
+{
+
+#ifdef __ANDROID__
+#define LoadFunction(function) \
+ if (use_wrapper) \
+ { \
+ function = reinterpret_cast<PFN_##function>(loadOpenCLPointer(#function)); \
+ } \
+ else \
+ { \
+ function = reinterpret_cast<PFN_##function>(dlsym(*libopencl, #function)); \
+ }
+#elif defined(__WINDOWS__)
+#define LoadFunction(function) \
+ function = reinterpret_cast<PFN_##function>(GetProcAddress(libopencl, #function));
+#else
+#define LoadFunction(function) \
+ function = reinterpret_cast<PFN_##function>(dlsym(*libopencl, #function));
+#endif
+
+#ifdef __WINDOWS__
+void LoadOpenCLFunctions(HMODULE libopencl);
+#else
+void LoadOpenCLFunctions(void **libopencl, bool use_wrapper);
+#endif
+
+absl::Status LoadOpenCL(void **libopencl)
+{
+#ifdef __WINDOWS__
+  // Note: the local handle must not reuse the parameter name `libopencl`;
+  // redeclaring a parameter in the outermost block of a function is ill-formed.
+  HMODULE opencl_module = LoadLibraryA("OpenCL.dll");
+  if (opencl_module)
+  {
+    LoadOpenCLFunctions(opencl_module);
+ return absl::OkStatus();
+ }
+ else
+ {
+ DWORD error_code = GetLastError();
+ return absl::UnknownError(
+ absl::StrCat("Can not open OpenCL library on this device, error code - ", error_code));
+ }
+#else
+ *libopencl = dlopen("libOpenCL.so", RTLD_NOW | RTLD_LOCAL);
+ if (*libopencl)
+ {
+ LoadOpenCLFunctions(libopencl, false);
+ return absl::OkStatus();
+ }
+ // record error
+ std::string error(dlerror());
+#ifdef __ANDROID__
+  // Fall back to the Pixel phone / Android Automotive OpenCL wrapper libraries.
+ *libopencl = dlopen("libOpenCL-pixel.so", RTLD_NOW | RTLD_LOCAL);
+ if (!*libopencl)
+ {
+ *libopencl = dlopen("libOpenCL-car.so", RTLD_NOW | RTLD_LOCAL);
+ }
+ if (*libopencl)
+ {
+ typedef void (*enableOpenCL_t)();
+ enableOpenCL_t enableOpenCL =
+ reinterpret_cast<enableOpenCL_t>(dlsym(*libopencl, "enableOpenCL"));
+ enableOpenCL();
+ LoadOpenCLFunctions(libopencl, true);
+ return absl::OkStatus();
+ }
+#endif
+ return absl::UnknownError(absl::StrCat("Can not open OpenCL library on this device - ", error));
+#endif
+}
+
+void UnloadOpenCL(void *libopencl)
+{
+ if (libopencl)
+ {
+ dlclose(libopencl);
+ }
+}
+
+#ifdef __WINDOWS__
+void LoadOpenCLFunctions(HMODULE libopencl)
+{
+#else
+#ifdef __ANDROID__
+void LoadOpenCLFunctions(void **libopencl, bool use_wrapper)
+{
+ typedef void *(*loadOpenCLPointer_t)(const char *name);
+ loadOpenCLPointer_t loadOpenCLPointer;
+ if (use_wrapper)
+ {
+ loadOpenCLPointer =
+ reinterpret_cast<loadOpenCLPointer_t>(dlsym(*libopencl, "loadOpenCLPointer"));
+ }
+#else
+void LoadOpenCLFunctions(void **libopencl, bool)
+{
+#endif // __ANDROID__
+#endif // __WINDOWS__
+
+ LoadFunction(clGetPlatformIDs);
+ LoadFunction(clGetPlatformInfo);
+ LoadFunction(clGetDeviceIDs);
+ LoadFunction(clGetDeviceInfo);
+ LoadFunction(clCreateSubDevices);
+ LoadFunction(clRetainDevice);
+ LoadFunction(clReleaseDevice);
+ LoadFunction(clCreateContext);
+ LoadFunction(clCreateContextFromType);
+ LoadFunction(clRetainContext);
+ LoadFunction(clReleaseContext);
+ LoadFunction(clGetContextInfo);
+ LoadFunction(clCreateCommandQueueWithProperties);
+ LoadFunction(clRetainCommandQueue);
+ LoadFunction(clReleaseCommandQueue);
+ LoadFunction(clGetCommandQueueInfo);
+ LoadFunction(clCreateBuffer);
+ LoadFunction(clCreateSubBuffer);
+ LoadFunction(clCreateImage);
+ LoadFunction(clCreatePipe);
+ LoadFunction(clRetainMemObject);
+ LoadFunction(clReleaseMemObject);
+ LoadFunction(clGetSupportedImageFormats);
+ LoadFunction(clGetMemObjectInfo);
+ LoadFunction(clGetImageInfo);
+ LoadFunction(clGetPipeInfo);
+ LoadFunction(clSetMemObjectDestructorCallback);
+ LoadFunction(clSVMAlloc);
+ LoadFunction(clSVMFree);
+ LoadFunction(clCreateSamplerWithProperties);
+ LoadFunction(clRetainSampler);
+ LoadFunction(clReleaseSampler);
+ LoadFunction(clGetSamplerInfo);
+ LoadFunction(clCreateProgramWithSource);
+ LoadFunction(clCreateProgramWithBinary);
+ LoadFunction(clCreateProgramWithBuiltInKernels);
+ LoadFunction(clRetainProgram);
+ LoadFunction(clReleaseProgram);
+ LoadFunction(clBuildProgram);
+ LoadFunction(clCompileProgram);
+ LoadFunction(clLinkProgram);
+ LoadFunction(clUnloadPlatformCompiler);
+ LoadFunction(clGetProgramInfo);
+ LoadFunction(clGetProgramBuildInfo);
+ LoadFunction(clCreateKernel);
+ LoadFunction(clCreateKernelsInProgram);
+ LoadFunction(clRetainKernel);
+ LoadFunction(clReleaseKernel);
+ LoadFunction(clSetKernelArg);
+ LoadFunction(clSetKernelArgSVMPointer);
+ LoadFunction(clSetKernelExecInfo);
+ LoadFunction(clGetKernelInfo);
+ LoadFunction(clGetKernelArgInfo);
+ LoadFunction(clGetKernelWorkGroupInfo);
+ LoadFunction(clWaitForEvents);
+ LoadFunction(clGetEventInfo);
+ LoadFunction(clCreateUserEvent);
+ LoadFunction(clRetainEvent);
+ LoadFunction(clReleaseEvent);
+ LoadFunction(clSetUserEventStatus);
+ LoadFunction(clSetEventCallback);
+ LoadFunction(clGetEventProfilingInfo);
+ LoadFunction(clFlush);
+ LoadFunction(clFinish);
+ LoadFunction(clEnqueueReadBuffer);
+ LoadFunction(clEnqueueReadBufferRect);
+ LoadFunction(clEnqueueWriteBuffer);
+ LoadFunction(clEnqueueWriteBufferRect);
+ LoadFunction(clEnqueueFillBuffer);
+ LoadFunction(clEnqueueCopyBuffer);
+ LoadFunction(clEnqueueCopyBufferRect);
+ LoadFunction(clEnqueueReadImage);
+ LoadFunction(clEnqueueWriteImage);
+ LoadFunction(clEnqueueFillImage);
+ LoadFunction(clEnqueueCopyImage);
+ LoadFunction(clEnqueueCopyImageToBuffer);
+ LoadFunction(clEnqueueCopyBufferToImage);
+ LoadFunction(clEnqueueMapBuffer);
+ LoadFunction(clEnqueueMapImage);
+ LoadFunction(clEnqueueUnmapMemObject);
+ LoadFunction(clEnqueueMigrateMemObjects);
+ LoadFunction(clEnqueueNDRangeKernel);
+ LoadFunction(clEnqueueNativeKernel);
+ LoadFunction(clEnqueueMarkerWithWaitList);
+ LoadFunction(clEnqueueBarrierWithWaitList);
+ LoadFunction(clEnqueueSVMFree);
+ LoadFunction(clEnqueueSVMMemcpy);
+ LoadFunction(clEnqueueSVMMemFill);
+ LoadFunction(clEnqueueSVMMap);
+ LoadFunction(clEnqueueSVMUnmap);
+ LoadFunction(clGetExtensionFunctionAddressForPlatform);
+ LoadFunction(clCreateImage2D);
+ LoadFunction(clCreateImage3D);
+ LoadFunction(clEnqueueMarker);
+ LoadFunction(clEnqueueWaitForEvents);
+ LoadFunction(clEnqueueBarrier);
+ LoadFunction(clUnloadCompiler);
+ LoadFunction(clGetExtensionFunctionAddress);
+ LoadFunction(clCreateCommandQueue);
+ LoadFunction(clCreateSampler);
+ LoadFunction(clEnqueueTask);
+
+ // OpenGL sharing
+ LoadFunction(clCreateFromGLBuffer);
+ LoadFunction(clCreateFromGLTexture);
+ LoadFunction(clEnqueueAcquireGLObjects);
+ LoadFunction(clEnqueueReleaseGLObjects);
+
+ // cl_khr_egl_event extension
+ LoadFunction(clCreateEventFromEGLSyncKHR);
+
+ // EGL sharing
+ LoadFunction(clCreateFromEGLImageKHR);
+ LoadFunction(clEnqueueAcquireEGLObjectsKHR);
+ LoadFunction(clEnqueueReleaseEGLObjectsKHR);
+} // LoadOpenCLFunctions
+
+// Definitions of the dynamically loaded OpenCL entry points. They stay null
+// until LoadOpenCLFunctions() assigns the real function addresses.
+PFN_clGetPlatformIDs clGetPlatformIDs;
+PFN_clGetPlatformInfo clGetPlatformInfo;
+PFN_clGetDeviceIDs clGetDeviceIDs;
+PFN_clGetDeviceInfo clGetDeviceInfo;
+PFN_clCreateSubDevices clCreateSubDevices;
+PFN_clRetainDevice clRetainDevice;
+PFN_clReleaseDevice clReleaseDevice;
+PFN_clCreateContext clCreateContext;
+PFN_clCreateContextFromType clCreateContextFromType;
+PFN_clRetainContext clRetainContext;
+PFN_clReleaseContext clReleaseContext;
+PFN_clGetContextInfo clGetContextInfo;
+PFN_clCreateCommandQueueWithProperties clCreateCommandQueueWithProperties;
+PFN_clRetainCommandQueue clRetainCommandQueue;
+PFN_clReleaseCommandQueue clReleaseCommandQueue;
+PFN_clGetCommandQueueInfo clGetCommandQueueInfo;
+PFN_clCreateBuffer clCreateBuffer;
+PFN_clCreateSubBuffer clCreateSubBuffer;
+PFN_clCreateImage clCreateImage;
+PFN_clCreatePipe clCreatePipe;
+PFN_clRetainMemObject clRetainMemObject;
+PFN_clReleaseMemObject clReleaseMemObject;
+PFN_clGetSupportedImageFormats clGetSupportedImageFormats;
+PFN_clGetMemObjectInfo clGetMemObjectInfo;
+PFN_clGetImageInfo clGetImageInfo;
+PFN_clGetPipeInfo clGetPipeInfo;
+PFN_clSetMemObjectDestructorCallback clSetMemObjectDestructorCallback;
+PFN_clSVMAlloc clSVMAlloc;
+PFN_clSVMFree clSVMFree;
+PFN_clCreateSamplerWithProperties clCreateSamplerWithProperties;
+PFN_clRetainSampler clRetainSampler;
+PFN_clReleaseSampler clReleaseSampler;
+PFN_clGetSamplerInfo clGetSamplerInfo;
+PFN_clCreateProgramWithSource clCreateProgramWithSource;
+PFN_clCreateProgramWithBinary clCreateProgramWithBinary;
+PFN_clCreateProgramWithBuiltInKernels clCreateProgramWithBuiltInKernels;
+PFN_clRetainProgram clRetainProgram;
+PFN_clReleaseProgram clReleaseProgram;
+PFN_clBuildProgram clBuildProgram;
+PFN_clCompileProgram clCompileProgram;
+PFN_clLinkProgram clLinkProgram;
+PFN_clUnloadPlatformCompiler clUnloadPlatformCompiler;
+PFN_clGetProgramInfo clGetProgramInfo;
+PFN_clGetProgramBuildInfo clGetProgramBuildInfo;
+PFN_clCreateKernel clCreateKernel;
+PFN_clCreateKernelsInProgram clCreateKernelsInProgram;
+PFN_clRetainKernel clRetainKernel;
+PFN_clReleaseKernel clReleaseKernel;
+PFN_clSetKernelArg clSetKernelArg;
+PFN_clSetKernelArgSVMPointer clSetKernelArgSVMPointer;
+PFN_clSetKernelExecInfo clSetKernelExecInfo;
+PFN_clGetKernelInfo clGetKernelInfo;
+PFN_clGetKernelArgInfo clGetKernelArgInfo;
+PFN_clGetKernelWorkGroupInfo clGetKernelWorkGroupInfo;
+PFN_clWaitForEvents clWaitForEvents;
+PFN_clGetEventInfo clGetEventInfo;
+PFN_clCreateUserEvent clCreateUserEvent;
+PFN_clRetainEvent clRetainEvent;
+PFN_clReleaseEvent clReleaseEvent;
+PFN_clSetUserEventStatus clSetUserEventStatus;
+PFN_clSetEventCallback clSetEventCallback;
+PFN_clGetEventProfilingInfo clGetEventProfilingInfo;
+PFN_clFlush clFlush;
+PFN_clFinish clFinish;
+PFN_clEnqueueReadBuffer clEnqueueReadBuffer;
+PFN_clEnqueueReadBufferRect clEnqueueReadBufferRect;
+PFN_clEnqueueWriteBuffer clEnqueueWriteBuffer;
+PFN_clEnqueueWriteBufferRect clEnqueueWriteBufferRect;
+PFN_clEnqueueFillBuffer clEnqueueFillBuffer;
+PFN_clEnqueueCopyBuffer clEnqueueCopyBuffer;
+PFN_clEnqueueCopyBufferRect clEnqueueCopyBufferRect;
+PFN_clEnqueueReadImage clEnqueueReadImage;
+PFN_clEnqueueWriteImage clEnqueueWriteImage;
+PFN_clEnqueueFillImage clEnqueueFillImage;
+PFN_clEnqueueCopyImage clEnqueueCopyImage;
+PFN_clEnqueueCopyImageToBuffer clEnqueueCopyImageToBuffer;
+PFN_clEnqueueCopyBufferToImage clEnqueueCopyBufferToImage;
+PFN_clEnqueueMapBuffer clEnqueueMapBuffer;
+PFN_clEnqueueMapImage clEnqueueMapImage;
+PFN_clEnqueueUnmapMemObject clEnqueueUnmapMemObject;
+PFN_clEnqueueMigrateMemObjects clEnqueueMigrateMemObjects;
+PFN_clEnqueueNDRangeKernel clEnqueueNDRangeKernel;
+PFN_clEnqueueNativeKernel clEnqueueNativeKernel;
+PFN_clEnqueueMarkerWithWaitList clEnqueueMarkerWithWaitList;
+PFN_clEnqueueBarrierWithWaitList clEnqueueBarrierWithWaitList;
+PFN_clEnqueueSVMFree clEnqueueSVMFree;
+PFN_clEnqueueSVMMemcpy clEnqueueSVMMemcpy;
+PFN_clEnqueueSVMMemFill clEnqueueSVMMemFill;
+PFN_clEnqueueSVMMap clEnqueueSVMMap;
+PFN_clEnqueueSVMUnmap clEnqueueSVMUnmap;
+PFN_clGetExtensionFunctionAddressForPlatform clGetExtensionFunctionAddressForPlatform;
+PFN_clCreateImage2D clCreateImage2D;
+PFN_clCreateImage3D clCreateImage3D;
+PFN_clEnqueueMarker clEnqueueMarker;
+PFN_clEnqueueWaitForEvents clEnqueueWaitForEvents;
+PFN_clEnqueueBarrier clEnqueueBarrier;
+PFN_clUnloadCompiler clUnloadCompiler;
+PFN_clGetExtensionFunctionAddress clGetExtensionFunctionAddress;
+PFN_clCreateCommandQueue clCreateCommandQueue;
+PFN_clCreateSampler clCreateSampler;
+PFN_clEnqueueTask clEnqueueTask;
+
+// OpenGL sharing
+PFN_clCreateFromGLBuffer clCreateFromGLBuffer;
+PFN_clCreateFromGLTexture clCreateFromGLTexture;
+PFN_clEnqueueAcquireGLObjects clEnqueueAcquireGLObjects;
+PFN_clEnqueueReleaseGLObjects clEnqueueReleaseGLObjects;
+
+// cl_khr_egl_event extension
+PFN_clCreateEventFromEGLSyncKHR clCreateEventFromEGLSyncKHR;
+
+// EGL sharing
+PFN_clCreateFromEGLImageKHR clCreateFromEGLImageKHR;
+PFN_clEnqueueAcquireEGLObjectsKHR clEnqueueAcquireEGLObjectsKHR;
+PFN_clEnqueueReleaseEGLObjectsKHR clEnqueueReleaseEGLObjectsKHR;
+
+cl_mem CreateImage2DLegacy(cl_context context, cl_mem_flags flags,
+ const cl_image_format *image_format, const cl_image_desc *image_desc,
+ void *host_ptr, cl_int *errcode_ret)
+{
+ if (clCreateImage)
+ { // clCreateImage available since OpenCL 1.2
+ return clCreateImage(context, flags, image_format, image_desc, host_ptr, errcode_ret);
+ }
+ else
+ {
+ return clCreateImage2D(context, flags, image_format, image_desc->image_width,
+ image_desc->image_height, image_desc->image_row_pitch, host_ptr,
+ errcode_ret);
+ }
+}
+
+cl_mem CreateImage3DLegacy(cl_context context, cl_mem_flags flags,
+ const cl_image_format *image_format, const cl_image_desc *image_desc,
+ void *host_ptr, cl_int *errcode_ret)
+{
+ if (clCreateImage)
+ { // clCreateImage available since OpenCL 1.2
+ return clCreateImage(context, flags, image_format, image_desc, host_ptr, errcode_ret);
+ }
+ else
+ {
+ return clCreateImage3D(context, flags, image_format, image_desc->image_width,
+ image_desc->image_height, image_desc->image_depth,
+ image_desc->image_row_pitch, image_desc->image_slice_pitch, host_ptr,
+ errcode_ret);
+ }
+}
+} // namespace gpu_cl
+} // namespace backend
+} // namespace onert
diff --git a/runtime/onert/backend/gpu_cl/open_cl/OpenclWrapper.h b/runtime/onert/backend/gpu_cl/open_cl/OpenclWrapper.h
new file mode 100644
index 000000000..021f8735a
--- /dev/null
+++ b/runtime/onert/backend/gpu_cl/open_cl/OpenclWrapper.h
@@ -0,0 +1,560 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __ONERT_BACKEND_GPU_CL_OPENCL_WRAPPER_H__
+#define __ONERT_BACKEND_GPU_CL_OPENCL_WRAPPER_H__
+
+#include "CL/cl.h"
+#include "CL/cl_egl.h"
+#include "CL/cl_ext.h"
+#include "CL/cl_gl.h"
+#include "CL/cl_platform.h"
+#include "Status.h"
+
+namespace onert
+{
+namespace backend
+{
+namespace gpu_cl
+{
+
+absl::Status LoadOpenCL(void **libopencl);
+void UnloadOpenCL(void *libopencl);
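+
+// Illustrative sketch (not part of the original sources): the intended lifetime of
+// the library handle. The helper name is made up; call UnloadOpenCL(*handle) once
+// no OpenCL entry point is needed anymore.
+inline absl::Status ExampleLoadOpenCL(void **handle)
+{
+  const absl::Status status = LoadOpenCL(handle);
+  if (!status.ok())
+  {
+    return status; // e.g. libOpenCL.so is not present on this device
+  }
+  // From here on, the cl* entry points declared below can be called.
+  return absl::OkStatus();
+}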
+
+typedef cl_int(CL_API_CALL *PFN_clGetPlatformIDs)(
+ cl_uint /* num_entries */, cl_platform_id * /* platforms */,
+ cl_uint * /* num_platforms */) CL_API_SUFFIX__VERSION_1_0;
+typedef cl_int(CL_API_CALL *PFN_clGetPlatformInfo)(
+ cl_platform_id /* platform */, cl_platform_info /* param_name */, size_t /* param_value_size */,
+ void * /* param_value */, size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
+typedef cl_int(CL_API_CALL *PFN_clGetDeviceIDs)(
+ cl_platform_id /* platform */, cl_device_type /* device_type */, cl_uint /* num_entries */,
+ cl_device_id * /* devices */, cl_uint * /* num_devices */) CL_API_SUFFIX__VERSION_1_0;
+typedef cl_int(CL_API_CALL *PFN_clGetDeviceInfo)(
+ cl_device_id /* device */, cl_device_info /* param_name */, size_t /* param_value_size */,
+ void * /* param_value */, size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
+typedef cl_int(CL_API_CALL *PFN_clCreateSubDevices)(
+ cl_device_id /* in_device */, const cl_device_partition_property * /* properties */,
+ cl_uint /* num_devices */, cl_device_id * /* out_devices */,
+ cl_uint * /* num_devices_ret */) CL_API_SUFFIX__VERSION_1_2;
+typedef cl_int(CL_API_CALL *PFN_clRetainDevice)(cl_device_id /* device */)
+ CL_API_SUFFIX__VERSION_1_2;
+typedef cl_int(CL_API_CALL *PFN_clReleaseDevice)(cl_device_id /* device */)
+ CL_API_SUFFIX__VERSION_1_2;
+typedef cl_context(CL_API_CALL *PFN_clCreateContext)(
+ const cl_context_properties * /* properties */, cl_uint /* num_devices */,
+ const cl_device_id * /* devices */,
+ void(CL_CALLBACK * /* pfn_notify */)(const char *, const void *, size_t, void *),
+ void * /* user_data */, cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
+typedef cl_context(CL_API_CALL *PFN_clCreateContextFromType)(
+ const cl_context_properties * /* properties */, cl_device_type /* device_type */,
+ void(CL_CALLBACK * /* pfn_notify*/)(const char *, const void *, size_t, void *),
+ void * /* user_data */, cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
+typedef cl_int(CL_API_CALL *PFN_clRetainContext)(cl_context /* context */)
+ CL_API_SUFFIX__VERSION_1_0;
+typedef cl_int(CL_API_CALL *PFN_clReleaseContext)(cl_context /* context */)
+ CL_API_SUFFIX__VERSION_1_0;
+typedef cl_int(CL_API_CALL *PFN_clGetContextInfo)(
+ cl_context /* context */, cl_context_info /* param_name */, size_t /* param_value_size */,
+ void * /* param_value */, size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
+typedef cl_command_queue(CL_API_CALL *PFN_clCreateCommandQueueWithProperties)(
+ cl_context /* context */, cl_device_id /* device */, const cl_queue_properties * /* properties */,
+ cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_2_0;
+typedef cl_int(CL_API_CALL *PFN_clRetainCommandQueue)(cl_command_queue /* command_queue */)
+ CL_API_SUFFIX__VERSION_1_0;
+typedef cl_int(CL_API_CALL *PFN_clReleaseCommandQueue)(cl_command_queue /* command_queue */)
+ CL_API_SUFFIX__VERSION_1_0;
+typedef cl_int(CL_API_CALL *PFN_clGetCommandQueueInfo)(
+ cl_command_queue /* command_queue */, cl_command_queue_info /* param_name */,
+ size_t /* param_value_size */, void * /* param_value */,
+ size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
+typedef cl_mem(CL_API_CALL *PFN_clCreateBuffer)(
+ cl_context /* context */, cl_mem_flags /* flags */, size_t /* size */, void * /* host_ptr */,
+ cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
+typedef cl_mem(CL_API_CALL *PFN_clCreateSubBuffer)(
+ cl_mem /* buffer */, cl_mem_flags /* flags */, cl_buffer_create_type /* buffer_create_type */,
+ const void * /* buffer_create_info */, cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_1;
+typedef cl_mem(CL_API_CALL *PFN_clCreateImage)(
+ cl_context /* context */, cl_mem_flags /* flags */, const cl_image_format * /* image_format */,
+ const cl_image_desc * /* image_desc */, void * /* host_ptr */,
+ cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_2;
+typedef cl_mem(CL_API_CALL *PFN_clCreatePipe)(
+ cl_context /* context */, cl_mem_flags /* flags */, cl_uint /* pipe_packet_size */,
+ cl_uint /* pipe_max_packets */, const cl_pipe_properties * /* properties */,
+ cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_2_0;
+typedef cl_int(CL_API_CALL *PFN_clRetainMemObject)(cl_mem /* memobj */) CL_API_SUFFIX__VERSION_1_0;
+typedef cl_int(CL_API_CALL *PFN_clReleaseMemObject)(cl_mem /* memobj */) CL_API_SUFFIX__VERSION_1_0;
+typedef cl_int(CL_API_CALL *PFN_clGetSupportedImageFormats)(
+ cl_context /* context */, cl_mem_flags /* flags */, cl_mem_object_type /* image_type */,
+ cl_uint /* num_entries */, cl_image_format * /* image_formats */,
+ cl_uint * /* num_image_formats */) CL_API_SUFFIX__VERSION_1_0;
+typedef cl_int(CL_API_CALL *PFN_clGetMemObjectInfo)(
+ cl_mem /* memobj */, cl_mem_info /* param_name */, size_t /* param_value_size */,
+ void * /* param_value */, size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
+typedef cl_int(CL_API_CALL *PFN_clGetImageInfo)(
+ cl_mem /* image */, cl_image_info /* param_name */, size_t /* param_value_size */,
+ void * /* param_value */, size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
+typedef cl_int(CL_API_CALL *PFN_clGetPipeInfo)(
+ cl_mem /* pipe */, cl_pipe_info /* param_name */, size_t /* param_value_size */,
+ void * /* param_value */, size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_2_0;
+typedef cl_int(CL_API_CALL *PFN_clSetMemObjectDestructorCallback)(
+ cl_mem /* memobj */,
+ void(CL_CALLBACK * /*pfn_notify*/)(cl_mem /* memobj */, void * /*user_data*/),
+ void * /*user_data */) CL_API_SUFFIX__VERSION_1_1;
+typedef void *(CL_API_CALL *PFN_clSVMAlloc)(cl_context /* context */, cl_svm_mem_flags /* flags */,
+ size_t /* size */,
+ cl_uint /* alignment */)CL_API_SUFFIX__VERSION_2_0;
+typedef void(CL_API_CALL *PFN_clSVMFree)(cl_context /* context */,
+ void * /* svm_pointer */) CL_API_SUFFIX__VERSION_2_0;
+typedef cl_sampler(CL_API_CALL *PFN_clCreateSamplerWithProperties)(
+ cl_context /* context */, const cl_sampler_properties * /* normalized_coords */,
+ cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_2_0;
+typedef cl_int(CL_API_CALL *PFN_clRetainSampler)(cl_sampler /* sampler */)
+ CL_API_SUFFIX__VERSION_1_0;
+typedef cl_int(CL_API_CALL *PFN_clReleaseSampler)(cl_sampler /* sampler */)
+ CL_API_SUFFIX__VERSION_1_0;
+typedef cl_int(CL_API_CALL *PFN_clGetSamplerInfo)(
+ cl_sampler /* sampler */, cl_sampler_info /* param_name */, size_t /* param_value_size */,
+ void * /* param_value */, size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
+typedef cl_program(CL_API_CALL *PFN_clCreateProgramWithSource)(
+ cl_context /* context */, cl_uint /* count */, const char ** /* strings */,
+ const size_t * /* lengths */, cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
+typedef cl_program(CL_API_CALL *PFN_clCreateProgramWithBinary)(
+ cl_context /* context */, cl_uint /* num_devices */, const cl_device_id * /* device_list */,
+ const size_t * /* lengths */, const unsigned char ** /* binaries */, cl_int * /* binary_status */,
+ cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
+typedef cl_program(CL_API_CALL *PFN_clCreateProgramWithBuiltInKernels)(
+ cl_context /* context */, cl_uint /* num_devices */, const cl_device_id * /* device_list */,
+ const char * /* kernel_names */, cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_2;
+typedef cl_int(CL_API_CALL *PFN_clRetainProgram)(cl_program /* program */)
+ CL_API_SUFFIX__VERSION_1_0;
+typedef cl_int(CL_API_CALL *PFN_clReleaseProgram)(cl_program /* program */)
+ CL_API_SUFFIX__VERSION_1_0;
+typedef cl_int(CL_API_CALL *PFN_clBuildProgram)(
+ cl_program /* program */, cl_uint /* num_devices */, const cl_device_id * /* device_list */,
+ const char * /* options */,
+ void(CL_CALLBACK * /* pfn_notify */)(cl_program /* program */, void * /* user_data */),
+ void * /* user_data */) CL_API_SUFFIX__VERSION_1_0;
+typedef cl_int(CL_API_CALL *PFN_clCompileProgram)(
+ cl_program /* program */, cl_uint /* num_devices */, const cl_device_id * /* device_list */,
+ const char * /* options */, cl_uint /* num_input_headers */,
+ const cl_program * /* input_headers */, const char ** /* header_include_names */,
+ void(CL_CALLBACK * /* pfn_notify */)(cl_program /* program */, void * /* user_data */),
+ void * /* user_data */) CL_API_SUFFIX__VERSION_1_2;
+typedef cl_program(CL_API_CALL *PFN_clLinkProgram)(
+ cl_context /* context */, cl_uint /* num_devices */, const cl_device_id * /* device_list */,
+ const char * /* options */, cl_uint /* num_input_programs */,
+ const cl_program * /* input_programs */,
+ void(CL_CALLBACK * /* pfn_notify */)(cl_program /* program */, void * /* user_data */),
+ void * /* user_data */, cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_2;
+typedef cl_int(CL_API_CALL *PFN_clUnloadPlatformCompiler)(cl_platform_id /* platform */)
+ CL_API_SUFFIX__VERSION_1_2;
+typedef cl_int(CL_API_CALL *PFN_clGetProgramInfo)(
+ cl_program /* program */, cl_program_info /* param_name */, size_t /* param_value_size */,
+ void * /* param_value */, size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
+typedef cl_int(CL_API_CALL *PFN_clGetProgramBuildInfo)(
+ cl_program /* program */, cl_device_id /* device */, cl_program_build_info /* param_name */,
+ size_t /* param_value_size */, void * /* param_value */,
+ size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
+typedef cl_kernel(CL_API_CALL *PFN_clCreateKernel)(
+ cl_program /* program */, const char * /* kernel_name */,
+ cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
+typedef cl_int(CL_API_CALL *PFN_clCreateKernelsInProgram)(
+ cl_program /* program */, cl_uint /* num_kernels */, cl_kernel * /* kernels */,
+ cl_uint * /* num_kernels_ret */) CL_API_SUFFIX__VERSION_1_0;
+typedef cl_int(CL_API_CALL *PFN_clRetainKernel)(cl_kernel /* kernel */) CL_API_SUFFIX__VERSION_1_0;
+typedef cl_int(CL_API_CALL *PFN_clReleaseKernel)(cl_kernel /* kernel */) CL_API_SUFFIX__VERSION_1_0;
+typedef cl_int(CL_API_CALL *PFN_clSetKernelArg)(cl_kernel /* kernel */, cl_uint /* arg_index */,
+ size_t /* arg_size */, const void * /* arg_value */)
+ CL_API_SUFFIX__VERSION_1_0;
+typedef cl_int(CL_API_CALL *PFN_clSetKernelArgSVMPointer)(
+ cl_kernel /* kernel */, cl_uint /* arg_index */,
+ const void * /* arg_value */) CL_API_SUFFIX__VERSION_2_0;
+typedef cl_int(CL_API_CALL *PFN_clSetKernelExecInfo)(
+ cl_kernel /* kernel */, cl_kernel_exec_info /* param_name */, size_t /* param_value_size */,
+ const void * /* param_value */) CL_API_SUFFIX__VERSION_2_0;
+typedef cl_int(CL_API_CALL *PFN_clGetKernelInfo)(
+ cl_kernel /* kernel */, cl_kernel_info /* param_name */, size_t /* param_value_size */,
+ void * /* param_value */, size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
+typedef cl_int(CL_API_CALL *PFN_clGetKernelArgInfo)(
+ cl_kernel /* kernel */, cl_uint /* arg_indx */, cl_kernel_arg_info /* param_name */,
+ size_t /* param_value_size */, void * /* param_value */,
+ size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_2;
+typedef cl_int(CL_API_CALL *PFN_clGetKernelWorkGroupInfo)(
+ cl_kernel /* kernel */, cl_device_id /* device */, cl_kernel_work_group_info /* param_name */,
+ size_t /* param_value_size */, void * /* param_value */,
+ size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
+typedef cl_int(CL_API_CALL *PFN_clWaitForEvents)(
+ cl_uint /* num_events */, const cl_event * /* event_list */) CL_API_SUFFIX__VERSION_1_0;
+typedef cl_int(CL_API_CALL *PFN_clGetEventInfo)(
+ cl_event /* event */, cl_event_info /* param_name */, size_t /* param_value_size */,
+ void * /* param_value */, size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
+typedef cl_event(CL_API_CALL *PFN_clCreateUserEvent)(
+ cl_context /* context */, cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_1;
+typedef cl_int(CL_API_CALL *PFN_clRetainEvent)(cl_event /* event */) CL_API_SUFFIX__VERSION_1_0;
+typedef cl_int(CL_API_CALL *PFN_clReleaseEvent)(cl_event /* event */) CL_API_SUFFIX__VERSION_1_0;
+typedef cl_int(CL_API_CALL *PFN_clSetUserEventStatus)(
+ cl_event /* event */, cl_int /* execution_status */) CL_API_SUFFIX__VERSION_1_1;
+typedef cl_int(CL_API_CALL *PFN_clSetEventCallback)(
+ cl_event /* event */, cl_int /* command_exec_callback_type */,
+ void(CL_CALLBACK * /* pfn_notify */)(cl_event, cl_int, void *),
+ void * /* user_data */) CL_API_SUFFIX__VERSION_1_1;
+typedef cl_int(CL_API_CALL *PFN_clGetEventProfilingInfo)(
+ cl_event /* event */, cl_profiling_info /* param_name */, size_t /* param_value_size */,
+ void * /* param_value */, size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
+typedef cl_int(CL_API_CALL *PFN_clFlush)(cl_command_queue /* command_queue */)
+ CL_API_SUFFIX__VERSION_1_0;
+typedef cl_int(CL_API_CALL *PFN_clFinish)(cl_command_queue /* command_queue */)
+ CL_API_SUFFIX__VERSION_1_0;
+typedef cl_int(CL_API_CALL *PFN_clEnqueueReadBuffer)(
+ cl_command_queue /* command_queue */, cl_mem /* buffer */, cl_bool /* blocking_read */,
+ size_t /* offset */, size_t /* size */, void * /* ptr */, cl_uint /* num_events_in_wait_list */,
+ const cl_event * /* event_wait_list */, cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0;
+typedef cl_int(CL_API_CALL *PFN_clEnqueueReadBufferRect)(
+ cl_command_queue /* command_queue */, cl_mem /* buffer */, cl_bool /* blocking_read */,
+ const size_t * /* buffer_offset */, const size_t * /* host_offset */, const size_t * /* region */,
+ size_t /* buffer_row_pitch */, size_t /* buffer_slice_pitch */, size_t /* host_row_pitch */,
+ size_t /* host_slice_pitch */, void * /* ptr */, cl_uint /* num_events_in_wait_list */,
+ const cl_event * /* event_wait_list */, cl_event * /* event */) CL_API_SUFFIX__VERSION_1_1;
+typedef cl_int(CL_API_CALL *PFN_clEnqueueWriteBuffer)(
+ cl_command_queue /* command_queue */, cl_mem /* buffer */, cl_bool /* blocking_write */,
+ size_t /* offset */, size_t /* size */, const void * /* ptr */,
+ cl_uint /* num_events_in_wait_list */, const cl_event * /* event_wait_list */,
+ cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0;
+typedef cl_int(CL_API_CALL *PFN_clEnqueueWriteBufferRect)(
+ cl_command_queue /* command_queue */, cl_mem /* buffer */, cl_bool /* blocking_write */,
+ const size_t * /* buffer_offset */, const size_t * /* host_offset */, const size_t * /* region */,
+ size_t /* buffer_row_pitch */, size_t /* buffer_slice_pitch */, size_t /* host_row_pitch */,
+ size_t /* host_slice_pitch */, const void * /* ptr */, cl_uint /* num_events_in_wait_list */,
+ const cl_event * /* event_wait_list */, cl_event * /* event */) CL_API_SUFFIX__VERSION_1_1;
+typedef cl_int(CL_API_CALL *PFN_clEnqueueFillBuffer)(
+ cl_command_queue /* command_queue */, cl_mem /* buffer */, const void * /* pattern */,
+ size_t /* pattern_size */, size_t /* offset */, size_t /* size */,
+ cl_uint /* num_events_in_wait_list */, const cl_event * /* event_wait_list */,
+ cl_event * /* event */) CL_API_SUFFIX__VERSION_1_2;
+typedef cl_int(CL_API_CALL *PFN_clEnqueueCopyBuffer)(
+ cl_command_queue /* command_queue */, cl_mem /* src_buffer */, cl_mem /* dst_buffer */,
+ size_t /* src_offset */, size_t /* dst_offset */, size_t /* size */,
+ cl_uint /* num_events_in_wait_list */, const cl_event * /* event_wait_list */,
+ cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0;
+typedef cl_int(CL_API_CALL *PFN_clEnqueueCopyBufferRect)(
+ cl_command_queue /* command_queue */, cl_mem /* src_buffer */, cl_mem /* dst_buffer */,
+ const size_t * /* src_origin */, const size_t * /* dst_origin */, const size_t * /* region */,
+ size_t /* src_row_pitch */, size_t /* src_slice_pitch */, size_t /* dst_row_pitch */,
+ size_t /* dst_slice_pitch */, cl_uint /* num_events_in_wait_list */,
+ const cl_event * /* event_wait_list */, cl_event * /* event */) CL_API_SUFFIX__VERSION_1_1;
+typedef cl_int(CL_API_CALL *PFN_clEnqueueReadImage)(
+ cl_command_queue /* command_queue */, cl_mem /* image */, cl_bool /* blocking_read */,
+ const size_t * /* origin[3] */, const size_t * /* region[3] */, size_t /* row_pitch */,
+ size_t /* slice_pitch */, void * /* ptr */, cl_uint /* num_events_in_wait_list */,
+ const cl_event * /* event_wait_list */, cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0;
+typedef cl_int(CL_API_CALL *PFN_clEnqueueWriteImage)(
+ cl_command_queue /* command_queue */, cl_mem /* image */, cl_bool /* blocking_write */,
+ const size_t * /* origin[3] */, const size_t * /* region[3] */, size_t /* input_row_pitch */,
+ size_t /* input_slice_pitch */, const void * /* ptr */, cl_uint /* num_events_in_wait_list */,
+ const cl_event * /* event_wait_list */, cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0;
+typedef cl_int(CL_API_CALL *PFN_clEnqueueFillImage)(
+ cl_command_queue /* command_queue */, cl_mem /* image */, const void * /* fill_color */,
+ const size_t * /* origin[3] */, const size_t * /* region[3] */,
+ cl_uint /* num_events_in_wait_list */, const cl_event * /* event_wait_list */,
+ cl_event * /* event */) CL_API_SUFFIX__VERSION_1_2;
+typedef cl_int(CL_API_CALL *PFN_clEnqueueCopyImage)(
+ cl_command_queue /* command_queue */, cl_mem /* src_image */, cl_mem /* dst_image */,
+ const size_t * /* src_origin[3] */, const size_t * /* dst_origin[3] */,
+ const size_t * /* region[3] */, cl_uint /* num_events_in_wait_list */,
+ const cl_event * /* event_wait_list */, cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0;
+typedef cl_int(CL_API_CALL *PFN_clEnqueueCopyImageToBuffer)(
+ cl_command_queue /* command_queue */, cl_mem /* src_image */, cl_mem /* dst_buffer */,
+ const size_t * /* src_origin[3] */, const size_t * /* region[3] */, size_t /* dst_offset */,
+ cl_uint /* num_events_in_wait_list */, const cl_event * /* event_wait_list */,
+ cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0;
+typedef cl_int(CL_API_CALL *PFN_clEnqueueCopyBufferToImage)(
+ cl_command_queue /* command_queue */, cl_mem /* src_buffer */, cl_mem /* dst_image */,
+ size_t /* src_offset */, const size_t * /* dst_origin[3] */, const size_t * /* region[3] */,
+ cl_uint /* num_events_in_wait_list */, const cl_event * /* event_wait_list */,
+ cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0;
+typedef void *(CL_API_CALL *PFN_clEnqueueMapBuffer)(
+ cl_command_queue /* command_queue */, cl_mem /* buffer */, cl_bool /* blocking_map */,
+ cl_map_flags /* map_flags */, size_t /* offset */, size_t /* size */,
+ cl_uint /* num_events_in_wait_list */, const cl_event * /* event_wait_list */,
+ cl_event * /* event */, cl_int * /* errcode_ret */)CL_API_SUFFIX__VERSION_1_0;
+typedef void *(CL_API_CALL *PFN_clEnqueueMapImage)(
+ cl_command_queue /* command_queue */, cl_mem /* image */, cl_bool /* blocking_map */,
+ cl_map_flags /* map_flags */, const size_t * /* origin[3] */, const size_t * /* region[3] */,
+ size_t * /* image_row_pitch */, size_t * /* image_slice_pitch */,
+ cl_uint /* num_events_in_wait_list */, const cl_event * /* event_wait_list */,
+ cl_event * /* event */, cl_int * /* errcode_ret */)CL_API_SUFFIX__VERSION_1_0;
+typedef cl_int(CL_API_CALL *PFN_clEnqueueUnmapMemObject)(
+ cl_command_queue /* command_queue */, cl_mem /* memobj */, void * /* mapped_ptr */,
+ cl_uint /* num_events_in_wait_list */, const cl_event * /* event_wait_list */,
+ cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0;
+typedef cl_int(CL_API_CALL *PFN_clEnqueueMigrateMemObjects)(
+ cl_command_queue /* command_queue */, cl_uint /* num_mem_objects */,
+ const cl_mem * /* mem_objects */, cl_mem_migration_flags /* flags */,
+ cl_uint /* num_events_in_wait_list */, const cl_event * /* event_wait_list */,
+ cl_event * /* event */) CL_API_SUFFIX__VERSION_1_2;
+typedef cl_int(CL_API_CALL *PFN_clEnqueueNDRangeKernel)(
+ cl_command_queue /* command_queue */, cl_kernel /* kernel */, cl_uint /* work_dim */,
+ const size_t * /* global_work_offset */, const size_t * /* global_work_size */,
+ const size_t * /* local_work_size */, cl_uint /* num_events_in_wait_list */,
+ const cl_event * /* event_wait_list */, cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0;
+typedef cl_int(CL_API_CALL *PFN_clEnqueueNativeKernel)(
+ cl_command_queue /* command_queue */, void(CL_CALLBACK * /*user_func*/)(void *),
+ void * /* args */, size_t /* cb_args */, cl_uint /* num_mem_objects */,
+ const cl_mem * /* mem_list */, const void ** /* args_mem_loc */,
+ cl_uint /* num_events_in_wait_list */, const cl_event * /* event_wait_list */,
+ cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0;
+typedef cl_int(CL_API_CALL *PFN_clEnqueueMarkerWithWaitList)(
+ cl_command_queue /* command_queue */, cl_uint /* num_events_in_wait_list */,
+ const cl_event * /* event_wait_list */, cl_event * /* event */) CL_API_SUFFIX__VERSION_1_2;
+typedef cl_int(CL_API_CALL *PFN_clEnqueueBarrierWithWaitList)(
+ cl_command_queue /* command_queue */, cl_uint /* num_events_in_wait_list */,
+ const cl_event * /* event_wait_list */, cl_event * /* event */) CL_API_SUFFIX__VERSION_1_2;
+typedef cl_int(CL_API_CALL *PFN_clEnqueueSVMFree)(
+ cl_command_queue /* command_queue */, cl_uint /* num_svm_pointers */,
+ void *[] /* svm_pointers[] */,
+ void(CL_CALLBACK * /*pfn_free_func*/)(cl_command_queue /* queue */,
+ cl_uint /* num_svm_pointers */,
+ void *[] /* svm_pointers[] */, void * /* user_data */),
+ void * /* user_data */, cl_uint /* num_events_in_wait_list */,
+ const cl_event * /* event_wait_list */, cl_event * /* event */) CL_API_SUFFIX__VERSION_2_0;
+typedef cl_int(CL_API_CALL *PFN_clEnqueueSVMMemcpy)(
+ cl_command_queue /* command_queue */, cl_bool /* blocking_copy */, void * /* dst_ptr */,
+ const void * /* src_ptr */, size_t /* size */, cl_uint /* num_events_in_wait_list */,
+ const cl_event * /* event_wait_list */, cl_event * /* event */) CL_API_SUFFIX__VERSION_2_0;
+typedef cl_int(CL_API_CALL *PFN_clEnqueueSVMMemFill)(
+ cl_command_queue /* command_queue */, void * /* svm_ptr */, const void * /* pattern */,
+ size_t /* pattern_size */, size_t /* size */, cl_uint /* num_events_in_wait_list */,
+ const cl_event * /* event_wait_list */, cl_event * /* event */) CL_API_SUFFIX__VERSION_2_0;
+typedef cl_int(CL_API_CALL *PFN_clEnqueueSVMMap)(
+ cl_command_queue /* command_queue */, cl_bool /* blocking_map */, cl_map_flags /* flags */,
+ void * /* svm_ptr */, size_t /* size */, cl_uint /* num_events_in_wait_list */,
+ const cl_event * /* event_wait_list */, cl_event * /* event */) CL_API_SUFFIX__VERSION_2_0;
+typedef cl_int(CL_API_CALL *PFN_clEnqueueSVMUnmap)(
+ cl_command_queue /* command_queue */, void * /* svm_ptr */, cl_uint /* num_events_in_wait_list */,
+ const cl_event * /* event_wait_list */, cl_event * /* event */) CL_API_SUFFIX__VERSION_2_0;
+typedef void *(CL_API_CALL *PFN_clGetExtensionFunctionAddressForPlatform)(
+ cl_platform_id /* platform */, const char * /* func_name */)CL_API_SUFFIX__VERSION_1_2;
+typedef cl_mem(CL_API_CALL *PFN_clCreateImage2D)(cl_context /* context */, cl_mem_flags /* flags */,
+ const cl_image_format * /* image_format */,
+ size_t /* image_width */,
+ size_t /* image_height */,
+ size_t /* image_row_pitch */,
+ void * /* host_ptr */, cl_int * /* errcode_ret */);
+typedef cl_mem(CL_API_CALL *PFN_clCreateImage3D)(
+ cl_context /* context */, cl_mem_flags /* flags */, const cl_image_format * /* image_format */,
+ size_t /* image_width */, size_t /* image_height */, size_t /* image_depth */,
+ size_t /* image_row_pitch */, size_t /* image_slice_pitch */, void * /* host_ptr */,
+ cl_int * /* errcode_ret */);
+typedef cl_int(CL_API_CALL *PFN_clEnqueueMarker)(cl_command_queue /* command_queue */,
+ cl_event * /* event */);
+typedef cl_int(CL_API_CALL *PFN_clEnqueueWaitForEvents)(cl_command_queue /* command_queue */,
+ cl_uint /* num_events */,
+ const cl_event * /* event_list */);
+typedef cl_int(CL_API_CALL *PFN_clEnqueueBarrier)(cl_command_queue /* command_queue */);
+typedef cl_int(CL_API_CALL *PFN_clUnloadCompiler)();
+typedef void *(CL_API_CALL *PFN_clGetExtensionFunctionAddress)(const char * /* func_name */);
+typedef cl_command_queue(CL_API_CALL *PFN_clCreateCommandQueue)(
+ cl_context /* context */, cl_device_id /* device */, cl_command_queue_properties /* properties */,
+ cl_int * /* errcode_ret */);
+typedef cl_sampler(CL_API_CALL *PFN_clCreateSampler)(cl_context /* context */,
+ cl_bool /* normalized_coords */,
+ cl_addressing_mode /* addressing_mode */,
+ cl_filter_mode /* filter_mode */,
+ cl_int * /* errcode_ret */);
+typedef cl_int(CL_API_CALL *PFN_clEnqueueTask)(cl_command_queue /* command_queue */,
+ cl_kernel /* kernel */,
+ cl_uint /* num_events_in_wait_list */,
+ const cl_event * /* event_wait_list */,
+ cl_event * /* event */);
+
+// OpenGL sharing
+typedef cl_mem(CL_API_CALL *PFN_clCreateFromGLBuffer)(cl_context, cl_mem_flags, cl_GLuint, int *);
+typedef cl_mem(CL_API_CALL *PFN_clCreateFromGLTexture)(
+ cl_context /* context */, cl_mem_flags /* flags */, cl_GLenum /* target */,
+ cl_GLint /* miplevel */, cl_GLuint /* texture */,
+ cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_2;
+typedef cl_int(CL_API_CALL *PFN_clEnqueueAcquireGLObjects)(cl_command_queue /* command_queue */,
+ cl_uint /* num_objects */,
+ const cl_mem * /* mem_objects */,
+ cl_uint /* num_events_in_wait_list */,
+ const cl_event * /* event_wait_list */,
+ cl_event * /* event */);
+typedef cl_int(CL_API_CALL *PFN_clEnqueueReleaseGLObjects)(
+ cl_command_queue /* command_queue */, cl_uint /* num_objects */, const cl_mem * /* mem_objects */,
+ cl_uint /* num_events_in_wait_list */, const cl_event * /* event_wait_list */,
+ cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0;
+
+// cl_khr_egl_event extension
+
+// CLeglDisplayKHR is an opaque handle to an EGLDisplay
+typedef void *CLeglDisplayKHR;
+
+// CLeglSyncKHR is an opaque handle to an EGLSync object
+typedef void *CLeglSyncKHR;
+
+typedef cl_event(CL_API_CALL *PFN_clCreateEventFromEGLSyncKHR)(cl_context /* context */,
+ CLeglSyncKHR /* sync */,
+ CLeglDisplayKHR /* display */,
+ cl_int * /* errcode_ret */);
+
+// EGL sharing
+typedef cl_mem(CL_API_CALL *PFN_clCreateFromEGLImageKHR)(
+ cl_context /*context*/, CLeglDisplayKHR /*display*/, CLeglImageKHR /*image*/,
+ cl_mem_flags /*flags*/, const cl_egl_image_properties_khr * /*properties*/,
+ cl_int * /*errcode_ret*/);
+typedef cl_int(CL_API_CALL *PFN_clEnqueueAcquireEGLObjectsKHR)(
+ cl_command_queue /*command_queue*/, cl_uint /*num_objects*/, const cl_mem * /*mem_objects*/,
+ cl_uint /*num_events_in_wait_list*/, const cl_event * /*event_wait_list*/, cl_event * /*event*/);
+typedef cl_int(CL_API_CALL *PFN_clEnqueueReleaseEGLObjectsKHR)(
+ cl_command_queue /*command_queue*/, cl_uint /*num_objects*/, const cl_mem * /*mem_objects*/,
+ cl_uint /*num_events_in_wait_list*/, const cl_event * /*event_wait_list*/, cl_event * /*event*/);
+
+extern PFN_clGetPlatformIDs clGetPlatformIDs;
+extern PFN_clGetPlatformInfo clGetPlatformInfo;
+extern PFN_clGetDeviceIDs clGetDeviceIDs;
+extern PFN_clGetDeviceInfo clGetDeviceInfo;
+extern PFN_clCreateSubDevices clCreateSubDevices;
+extern PFN_clRetainDevice clRetainDevice;
+extern PFN_clReleaseDevice clReleaseDevice;
+extern PFN_clCreateContext clCreateContext;
+extern PFN_clCreateContextFromType clCreateContextFromType;
+extern PFN_clRetainContext clRetainContext;
+extern PFN_clReleaseContext clReleaseContext;
+extern PFN_clGetContextInfo clGetContextInfo;
+extern PFN_clCreateCommandQueueWithProperties clCreateCommandQueueWithProperties;
+extern PFN_clRetainCommandQueue clRetainCommandQueue;
+extern PFN_clReleaseCommandQueue clReleaseCommandQueue;
+extern PFN_clGetCommandQueueInfo clGetCommandQueueInfo;
+extern PFN_clCreateBuffer clCreateBuffer;
+extern PFN_clCreateSubBuffer clCreateSubBuffer;
+extern PFN_clCreateImage clCreateImage;
+extern PFN_clCreatePipe clCreatePipe;
+extern PFN_clRetainMemObject clRetainMemObject;
+extern PFN_clReleaseMemObject clReleaseMemObject;
+extern PFN_clGetSupportedImageFormats clGetSupportedImageFormats;
+extern PFN_clGetMemObjectInfo clGetMemObjectInfo;
+extern PFN_clGetImageInfo clGetImageInfo;
+extern PFN_clGetPipeInfo clGetPipeInfo;
+extern PFN_clSetMemObjectDestructorCallback clSetMemObjectDestructorCallback;
+extern PFN_clSVMAlloc clSVMAlloc;
+extern PFN_clSVMFree clSVMFree;
+extern PFN_clCreateSamplerWithProperties clCreateSamplerWithProperties;
+extern PFN_clRetainSampler clRetainSampler;
+extern PFN_clReleaseSampler clReleaseSampler;
+extern PFN_clGetSamplerInfo clGetSamplerInfo;
+extern PFN_clCreateProgramWithSource clCreateProgramWithSource;
+extern PFN_clCreateProgramWithBinary clCreateProgramWithBinary;
+extern PFN_clCreateProgramWithBuiltInKernels clCreateProgramWithBuiltInKernels;
+extern PFN_clRetainProgram clRetainProgram;
+extern PFN_clReleaseProgram clReleaseProgram;
+extern PFN_clBuildProgram clBuildProgram;
+extern PFN_clCompileProgram clCompileProgram;
+extern PFN_clLinkProgram clLinkProgram;
+extern PFN_clUnloadPlatformCompiler clUnloadPlatformCompiler;
+extern PFN_clGetProgramInfo clGetProgramInfo;
+extern PFN_clGetProgramBuildInfo clGetProgramBuildInfo;
+extern PFN_clCreateKernel clCreateKernel;
+extern PFN_clCreateKernelsInProgram clCreateKernelsInProgram;
+extern PFN_clRetainKernel clRetainKernel;
+extern PFN_clReleaseKernel clReleaseKernel;
+extern PFN_clSetKernelArg clSetKernelArg;
+extern PFN_clSetKernelArgSVMPointer clSetKernelArgSVMPointer;
+extern PFN_clSetKernelExecInfo clSetKernelExecInfo;
+extern PFN_clGetKernelInfo clGetKernelInfo;
+extern PFN_clGetKernelArgInfo clGetKernelArgInfo;
+extern PFN_clGetKernelWorkGroupInfo clGetKernelWorkGroupInfo;
+extern PFN_clWaitForEvents clWaitForEvents;
+extern PFN_clGetEventInfo clGetEventInfo;
+extern PFN_clCreateUserEvent clCreateUserEvent;
+extern PFN_clRetainEvent clRetainEvent;
+extern PFN_clReleaseEvent clReleaseEvent;
+extern PFN_clSetUserEventStatus clSetUserEventStatus;
+extern PFN_clSetEventCallback clSetEventCallback;
+extern PFN_clGetEventProfilingInfo clGetEventProfilingInfo;
+extern PFN_clFlush clFlush;
+extern PFN_clFinish clFinish;
+extern PFN_clEnqueueReadBuffer clEnqueueReadBuffer;
+extern PFN_clEnqueueReadBufferRect clEnqueueReadBufferRect;
+extern PFN_clEnqueueWriteBuffer clEnqueueWriteBuffer;
+extern PFN_clEnqueueWriteBufferRect clEnqueueWriteBufferRect;
+extern PFN_clEnqueueFillBuffer clEnqueueFillBuffer;
+extern PFN_clEnqueueCopyBuffer clEnqueueCopyBuffer;
+extern PFN_clEnqueueCopyBufferRect clEnqueueCopyBufferRect;
+extern PFN_clEnqueueReadImage clEnqueueReadImage;
+extern PFN_clEnqueueWriteImage clEnqueueWriteImage;
+extern PFN_clEnqueueFillImage clEnqueueFillImage;
+extern PFN_clEnqueueCopyImage clEnqueueCopyImage;
+extern PFN_clEnqueueCopyImageToBuffer clEnqueueCopyImageToBuffer;
+extern PFN_clEnqueueCopyBufferToImage clEnqueueCopyBufferToImage;
+extern PFN_clEnqueueMapBuffer clEnqueueMapBuffer;
+extern PFN_clEnqueueMapImage clEnqueueMapImage;
+extern PFN_clEnqueueUnmapMemObject clEnqueueUnmapMemObject;
+extern PFN_clEnqueueMigrateMemObjects clEnqueueMigrateMemObjects;
+extern PFN_clEnqueueNDRangeKernel clEnqueueNDRangeKernel;
+extern PFN_clEnqueueNativeKernel clEnqueueNativeKernel;
+extern PFN_clEnqueueMarkerWithWaitList clEnqueueMarkerWithWaitList;
+extern PFN_clEnqueueBarrierWithWaitList clEnqueueBarrierWithWaitList;
+extern PFN_clEnqueueSVMFree clEnqueueSVMFree;
+extern PFN_clEnqueueSVMMemcpy clEnqueueSVMMemcpy;
+extern PFN_clEnqueueSVMMemFill clEnqueueSVMMemFill;
+extern PFN_clEnqueueSVMMap clEnqueueSVMMap;
+extern PFN_clEnqueueSVMUnmap clEnqueueSVMUnmap;
+extern PFN_clGetExtensionFunctionAddressForPlatform clGetExtensionFunctionAddressForPlatform;
+extern PFN_clCreateImage2D clCreateImage2D;
+extern PFN_clCreateImage3D clCreateImage3D;
+extern PFN_clEnqueueMarker clEnqueueMarker;
+extern PFN_clEnqueueWaitForEvents clEnqueueWaitForEvents;
+extern PFN_clEnqueueBarrier clEnqueueBarrier;
+extern PFN_clUnloadCompiler clUnloadCompiler;
+extern PFN_clGetExtensionFunctionAddress clGetExtensionFunctionAddress;
+extern PFN_clCreateCommandQueue clCreateCommandQueue;
+extern PFN_clCreateSampler clCreateSampler;
+extern PFN_clEnqueueTask clEnqueueTask;
+
+// OpenGL sharing
+extern PFN_clCreateFromGLBuffer clCreateFromGLBuffer;
+extern PFN_clCreateFromGLTexture clCreateFromGLTexture;
+extern PFN_clEnqueueAcquireGLObjects clEnqueueAcquireGLObjects;
+extern PFN_clEnqueueReleaseGLObjects clEnqueueReleaseGLObjects;
+
+// cl_khr_egl_event extension
+extern PFN_clCreateEventFromEGLSyncKHR clCreateEventFromEGLSyncKHR;
+
+// EGL sharing
+extern PFN_clCreateFromEGLImageKHR clCreateFromEGLImageKHR;
+extern PFN_clEnqueueAcquireEGLObjectsKHR clEnqueueAcquireEGLObjectsKHR;
+extern PFN_clEnqueueReleaseEGLObjectsKHR clEnqueueReleaseEGLObjectsKHR;
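+
+// Illustrative sketch (an assumption about the loader, not necessarily this
+// backend's implementation): extern pointers like the ones above are typically
+// resolved at runtime by dlopen-ing the vendor library and dlsym-ing each symbol.
+//
+// void *libopencl = dlopen("libOpenCL.so", RTLD_NOW | RTLD_LOCAL);
+// if (libopencl != nullptr)
+// {
+//   clGetPlatformIDs =
+//     reinterpret_cast<PFN_clGetPlatformIDs>(dlsym(libopencl, "clGetPlatformIDs"));
+//   // ... repeated for every function pointer declared above ...
+// }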
+
+// For convenient image creation.
+// Uses clCreateImage if it is available (clCreateImage has been available since CL 1.2);
+// otherwise it falls back to the legacy clCreateImage2D.
+cl_mem CreateImage2DLegacy(cl_context context, cl_mem_flags flags,
+ const cl_image_format *image_format, const cl_image_desc *image_desc,
+ void *host_ptr, cl_int *errcode_ret);
+
+// Uses clCreateImage if it is available (clCreateImage has been available since CL 1.2);
+// otherwise it falls back to the legacy clCreateImage3D.
+cl_mem CreateImage3DLegacy(cl_context context, cl_mem_flags flags,
+ const cl_image_format *image_format, const cl_image_desc *image_desc,
+ void *host_ptr, cl_int *errcode_ret);
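+
+// A minimal sketch of how such a fallback could look (an assumption for
+// illustration, not the declared functions' actual implementation; the name
+// CreateImage2DLegacySketch is hypothetical):
+//
+// cl_mem CreateImage2DLegacySketch(cl_context context, cl_mem_flags flags,
+//                                  const cl_image_format *image_format,
+//                                  const cl_image_desc *image_desc, void *host_ptr,
+//                                  cl_int *errcode_ret)
+// {
+//   if (clCreateImage != nullptr) // resolved only on OpenCL 1.2+ drivers
+//     return clCreateImage(context, flags, image_format, image_desc, host_ptr, errcode_ret);
+//   return clCreateImage2D(context, flags, image_format, image_desc->image_width,
+//                          image_desc->image_height, image_desc->image_row_pitch,
+//                          host_ptr, errcode_ret);
+// }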
+
+} // namespace gpu_cl
+} // namespace backend
+} // namespace onert
+
+#endif // __ONERT_BACKEND_GPU_CL_OPENCL_WRAPPERE_H__
diff --git a/runtime/onert/backend/gpu_cl/open_cl/Operations.cc b/runtime/onert/backend/gpu_cl/open_cl/Operations.cc
new file mode 100644
index 000000000..2608b5364
--- /dev/null
+++ b/runtime/onert/backend/gpu_cl/open_cl/Operations.cc
@@ -0,0 +1,704 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Operations.h"
+#include "open_cl/Operations.h"
+
+#include <algorithm>
+#include <cstdint>
+#include <set>
+#include <string>
+#include <utility>
+#include <vector>
+#include <unordered_map>
+
+#include "absl/container/flat_hash_map.h"
+
+#include "Shape.h"
+#include "Status.h"
+#include "InternalTensor.h"
+
+namespace onert
+{
+namespace backend
+{
+namespace gpu_cl
+{
+
+Padding2D &Padding2D::operator=(const Padding2D &value)
+{
+ prepended = value.prepended;
+ appended = value.appended;
+ return *this;
+}
+
+bool Padding2D::operator==(const Padding2D &value)
+{
+ return this->prepended == value.prepended && this->appended == value.appended;
+}
+
+bool Padding2D::operator!=(const Padding2D &value) { return !(*this == value); }
+
+Padding2D &Padding2D::operator-(const Padding2D &value)
+{
+ prepended.h -= value.prepended.h;
+ prepended.w -= value.prepended.w;
+ appended.h -= value.appended.h;
+ appended.w -= value.appended.w;
+ return *this;
+}
+
+Padding3D &Padding3D::operator=(const Padding3D &value)
+{
+ prepended = value.prepended;
+ appended = value.appended;
+ return *this;
+}
+
+bool Padding3D::operator==(const Padding3D &value)
+{
+ return this->prepended == value.prepended && this->appended == value.appended;
+}
+
+bool Padding3D::operator!=(const Padding3D &value) { return !(*this == value); }
+
+Padding3D &Padding3D::operator-(const Padding3D &value)
+{
+ prepended.h -= value.prepended.h;
+ prepended.w -= value.prepended.w;
+ prepended.d -= value.prepended.d;
+ appended.h -= value.appended.h;
+ appended.w -= value.appended.w;
+ appended.d -= value.appended.d;
+ return *this;
+}
+
+std::string ToString(enum OperationType op)
+{
+ switch (op)
+ {
+ // case OperationType::ABS:
+ // return "abs";
+ case OperationType::ADD:
+ return "add";
+ // case OperationType::CONCAT:
+ // return "concat";
+ // case OperationType::COS:
+ // return "cos";
+ // case OperationType::EXP:
+ // return "exp";
+ // case OperationType::LOG:
+ // return "log";
+ // case OperationType::NEG:
+ // return "neg";
+ // case OperationType::POOLING_2D:
+ // return "pooling_2d";
+ // case OperationType::REDUCE_MAXIMUM:
+ // return "reduce_maximum";
+ // case OperationType::REDUCE_MINIMUM:
+ // return "reduce_minimum";
+ // case OperationType::REDUCE_PRODUCT:
+ // return "reduce_product";
+ // case OperationType::REDUCE_SUM:
+ // return "reduce_sum";
+ // case OperationType::RESIZE:
+ // return "resize";
+ // case OperationType::RELU:
+ // return "relu";
+ // case OperationType::RSQRT:
+ // return "rsqrt";
+ // case OperationType::SQRT:
+ // return "sqrt";
+ // case OperationType::SQUARE:
+ // return "square";
+ case OperationType::UNKNOWN:
+ return "unknown_operation";
+ }
+ return "";
+}
+
+OperationType OperationTypeFromString(const std::string &name)
+{
+ static const auto operations = new std::unordered_map<std::string, OperationType>({
+ // {"abs", OperationType::ABS},
+ {"add", OperationType::ADD},
+ // {"concat", OperationType::CONCAT},
+ // {"cos", OperationType::COS},
+ // {"exp", OperationType::EXP},
+ // {"log", OperationType::LOG},
+ // {"neg", OperationType::NEG},
+ // {"pooling_2d", OperationType::POOLING_2D},
+ // {"reduce_maximum", OperationType::REDUCE_MAXIMUM},
+ // {"reduce_minimum", OperationType::REDUCE_MINIMUM},
+ // {"reduce_product", OperationType::REDUCE_PRODUCT},
+ // {"reduce_sum", OperationType::REDUCE_SUM},
+ // {"relu", OperationType::RELU},
+ // {"resize", OperationType::RESIZE},
+ // {"rsqrt", OperationType::RSQRT},
+ // {"sqrt", OperationType::SQRT},
+ // {"square", OperationType::SQUARE},
+ });
+ auto op = operations->find(name);
+ return op == operations->end() ? OperationType::UNKNOWN : op->second;
+}
+
+namespace
+{
+
+template <typename T> T DivideRoundUp(T n, T divisor) { return (n - 1) / divisor + 1; }
+
+int32_t CalculateOutputSizeBeforeStrides(int32_t input, int32_t kernel, int32_t padding,
+ int32_t dilation)
+{
+ const int32_t dilated_kernel = (kernel - 1) * dilation + 1;
+ return input + padding - dilated_kernel + 1;
+}
+
+template <Axis T>
+int32_t CalculateOutputWithoutStrides(const BHWC &input, const Convolution2DAttributes &attr)
+{
+ return CalculateOutputSizeBeforeStrides(
+ input.get<T>(), attr.weights.shape.get<T>(),
+ attr.padding.prepended.get<T>() + attr.padding.appended.get<T>(), attr.dilations.get<T>());
+}
+
+template <Axis T>
+int32_t CalculateOutputWithoutStrides(const BHWDC &input, const Convolution3DAttributes &attr)
+{
+ return CalculateOutputSizeBeforeStrides(
+ input.get<T>(), attr.weights.shape.get<T>(),
+ attr.padding.prepended.get<T>() + attr.padding.appended.get<T>(), attr.dilations.get<T>());
+}
+
+template <Axis T>
+int32_t CalculateOutputWithoutStrides(const BHWC &input, const Pooling2DAttributes &attr)
+{
+ return CalculateOutputSizeBeforeStrides(input.get<T>(), attr.kernel.get<T>(),
+ attr.padding.prepended.get<T>() +
+ attr.padding.appended.get<T>(),
+ /*dilation=*/1);
+}
+
+template <Axis T>
+int32_t CalculateOutputWithoutStrides(const BHWDC &input, const Pooling3DAttributes &attr)
+{
+ return CalculateOutputSizeBeforeStrides(input.get<T>(), attr.kernel.get<T>(),
+ attr.padding.prepended.get<T>() +
+ attr.padding.appended.get<T>(),
+ /*dilation=*/1);
+}
+
+template <Axis T>
+int32_t CalculateOutput(const BHWC &input, const ConvolutionTransposedAttributes &attr)
+{
+ return (input.get<T>() - 1) * attr.stride.get<T>() -
+ (attr.padding.prepended.get<T>() + attr.padding.appended.get<T>()) +
+ attr.weights.shape.get<T>() + attr.adjacent.get<T>();
+}
+
+template <Axis T>
+int32_t CalculateOutput(const BHWDC &input, const ConvolutionTransposed3DAttributes &attr)
+{
+ return (input.get<T>() - 1) * attr.stride.get<T>() -
+ (attr.padding.prepended.get<T>() + attr.padding.appended.get<T>()) +
+ attr.weights.shape.get<T>();
+}
+
+inline int32_t StridedSize(int32_t size, int32_t stride)
+{
+ return stride == 0 ? -1 : DivideRoundUp(size, stride);
+}
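+
+// Worked example (illustrative only): for input = 7, kernel = 3, dilation = 1 and
+// total padding = 2, the dilated kernel is (3 - 1) * 1 + 1 = 3, so the size before
+// strides is 7 + 2 - 3 + 1 = 7; with stride 2, StridedSize(7, 2) then gives
+// DivideRoundUp(7, 2) = (7 - 1) / 2 + 1 = 4 output elements.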
+
+template <Axis AxisT, typename AttrT> int32_t CalculateOutput(const BHWC &input, const AttrT &attr)
+{
+ return StridedSize(CalculateOutputWithoutStrides<AxisT>(input, attr),
+ attr.strides.template get<AxisT>());
+}
+
+template <Axis AxisT, typename AttrT> int32_t CalculateOutput(const BHWDC &input, const AttrT &attr)
+{
+ return StridedSize(CalculateOutputWithoutStrides<AxisT>(input, attr),
+ attr.strides.template get<AxisT>());
+}
+
+int32_t CalculateSamePadding(int32_t input, int32_t kernel, int32_t dilation, int32_t stride)
+{
+ const int32_t dilated_kernel = (kernel - 1) * dilation + 1;
+ return std::max(0, dilated_kernel - (input - 1) % stride - 1);
+}
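+
+// Worked example (illustrative only): input = 5, kernel = 3, dilation = 1, stride = 2
+// gives dilated_kernel = 3 and (5 - 1) % 2 = 0, so the total SAME padding is
+// max(0, 3 - 0 - 1) = 2, which keeps the strided output at DivideRoundUp(5, 2) = 3.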
+
+// Returns a padding that should be present to make sure image size stays
+// the same.
+template <Axis AxisT>
+int32_t CalculateSamePadding(const BHWC &input, const Convolution2DAttributes &attr)
+{
+ return CalculateSamePadding(input.get<AxisT>(), attr.weights.shape.get<AxisT>(),
+ attr.dilations.get<AxisT>(), attr.strides.get<AxisT>());
+}
+
+// Returns a padding that should be present to make sure image size stays
+// the same.
+template <Axis AxisT>
+int32_t CalculateSamePadding(const BHWDC &input, const Convolution3DAttributes &attr)
+{
+ return CalculateSamePadding(input.get<AxisT>(), attr.weights.shape.get<AxisT>(),
+ attr.dilations.get<AxisT>(), attr.strides.get<AxisT>());
+}
+
+template <Axis AxisT>
+int32_t CalculateSamePadding(const BHWC &input, const ConvolutionTransposedAttributes &attr)
+{
+ return CalculateSamePadding(input.get<AxisT>(), attr.weights.shape.get<AxisT>(),
+ /*dilation=*/1, attr.stride.get<AxisT>());
+}
+
+template <Axis AxisT>
+int32_t CalculateSamePadding(const BHWDC &input, const ConvolutionTransposed3DAttributes &attr)
+{
+ return CalculateSamePadding(input.get<AxisT>(), attr.weights.shape.get<AxisT>(),
+ /*dilation=*/1, attr.stride.get<AxisT>());
+}
+
+template <Axis AxisT>
+int32_t CalculateSamePadding(const BHWC &input, const Pooling2DAttributes &attr)
+{
+ return CalculateSamePadding(input.get<AxisT>(), attr.kernel.get<AxisT>(),
+ /*dilation=*/1, attr.strides.get<AxisT>());
+}
+
+template <Axis AxisT>
+int32_t CalculateSamePadding(const BHWDC &input, const Pooling3DAttributes &attr)
+{
+ return CalculateSamePadding(input.get<AxisT>(), attr.kernel.get<AxisT>(),
+ /*dilation=*/1, attr.strides.get<AxisT>());
+}
+
+template <Axis AxisT>
+int32_t CalculateSamePadding(const BHWC &input, const MaxUnpooling2DAttributes &attr)
+{
+ return CalculateSamePadding(input.get<AxisT>(), attr.kernel.get<AxisT>(),
+ /*dilation=*/1, attr.strides.get<AxisT>());
+}
+
+template <Axis AxisT>
+int32_t CalculateSamePadding(const BHWDC &input, const MaxUnpooling3DAttributes &attr)
+{
+ return CalculateSamePadding(input.get<AxisT>(), attr.kernel.get<AxisT>(),
+ /*dilation=*/1, attr.strides.get<AxisT>());
+}
+
+Padding2D MakeSamePadding(const BHWC &input, const ConvolutionTransposedAttributes &attr)
+{
+ int32_t padding_height = CalculateSamePadding<Axis::HEIGHT>(input, attr);
+ int32_t padding_width = CalculateSamePadding<Axis::WIDTH>(input, attr);
+ Padding2D padding;
+ padding.prepended = HW(padding_height / 2, padding_width / 2);
+ padding.appended = HW(padding_height - padding_height / 2, padding_width - padding_width / 2);
+ return padding;
+}
+
+Padding3D MakeSamePadding(const BHWDC &input, const ConvolutionTransposed3DAttributes &attr)
+{
+ int32_t padding_height = CalculateSamePadding<Axis::HEIGHT>(input, attr);
+ int32_t padding_width = CalculateSamePadding<Axis::WIDTH>(input, attr);
+ int32_t padding_depth = CalculateSamePadding<Axis::DEPTH>(input, attr);
+ Padding3D padding;
+ padding.prepended = HWD(padding_height / 2, padding_width / 2, padding_depth / 2);
+ padding.appended = HWD(padding_height - padding_height / 2, padding_width - padding_width / 2,
+ padding_depth - padding_depth / 2);
+ return padding;
+}
+
+// If padding depends on input, convert it into fixed padding.
+template <class AttrT> Padding2D MakeSamePadding(const BHWC &input, const AttrT &attr)
+{
+ int32_t padding_height = CalculateSamePadding<Axis::HEIGHT>(input, attr);
+ int32_t padding_width = CalculateSamePadding<Axis::WIDTH>(input, attr);
+ Padding2D padding;
+ padding.prepended = HW(padding_height / 2, padding_width / 2);
+ padding.appended = HW(padding_height - padding_height / 2, padding_width - padding_width / 2);
+ return padding;
+}
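+
+// Example of the split above (illustrative only): a total height padding of 3
+// becomes prepended.h = 3 / 2 = 1 and appended.h = 3 - 1 = 2.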
+
+// If padding depends on input, convert it into fixed padding.
+template <class AttrT> Padding3D MakeSamePadding(const BHWDC &input, const AttrT &attr)
+{
+ int32_t padding_height = CalculateSamePadding<Axis::HEIGHT>(input, attr);
+ int32_t padding_width = CalculateSamePadding<Axis::WIDTH>(input, attr);
+ int32_t padding_depth = CalculateSamePadding<Axis::DEPTH>(input, attr);
+ Padding3D padding;
+ padding.prepended = HWD(padding_height / 2, padding_width / 2, padding_depth / 2);
+ padding.appended = HWD(padding_height - padding_height / 2, padding_width - padding_width / 2,
+ padding_depth - padding_depth / 2);
+ return padding;
+}
+
+} // namespace
+
+BHWC CalculateOutputShape(const BHWC &input, const MaxUnpooling2DAttributes &attr)
+{
+ return BHWC(
+ input.b, input.h * attr.strides.h - attr.padding.prepended.h - attr.padding.appended.h,
+ input.w * attr.strides.w - attr.padding.prepended.w - attr.padding.appended.w, input.c);
+}
+
+BHWDC CalculateOutputShape(const BHWDC &input, const MaxUnpooling3DAttributes &attr)
+{
+ return BHWDC(
+ input.b, input.h * attr.strides.h - attr.padding.prepended.h - attr.padding.appended.h,
+ input.w * attr.strides.w - attr.padding.prepended.w - attr.padding.appended.w,
+ input.d * attr.strides.d - attr.padding.prepended.d - attr.padding.appended.d, input.c);
+}
+
+BHWC CalculateOutputShape(const BHWC &input, const Pooling2DAttributes &attr)
+{
+ return BHWC(input.b, CalculateOutput<Axis::HEIGHT>(input, attr),
+ CalculateOutput<Axis::WIDTH>(input, attr), input.c);
+}
+
+BHWDC CalculateOutputShape(const BHWDC &input, const Pooling3DAttributes &attr)
+{
+ return BHWDC(input.b, CalculateOutput<Axis::HEIGHT>(input, attr),
+ CalculateOutput<Axis::WIDTH>(input, attr), CalculateOutput<Axis::DEPTH>(input, attr),
+ input.c);
+}
+
+BHWC CalculateOutputShape(const BHWC &input, const Convolution2DAttributes &attr)
+{
+ return BHWC(input.b, CalculateOutput<Axis::HEIGHT>(input, attr),
+ CalculateOutput<Axis::WIDTH>(input, attr),
+ attr.weights.shape.get<Axis::OUTPUT_CHANNELS>());
+}
+
+BHWDC CalculateOutputShape(const BHWDC &input, const Convolution3DAttributes &attr)
+{
+ return BHWDC(input.b, CalculateOutput<Axis::HEIGHT>(input, attr),
+ CalculateOutput<Axis::WIDTH>(input, attr), CalculateOutput<Axis::DEPTH>(input, attr),
+ attr.weights.shape.get<Axis::OUTPUT_CHANNELS>());
+}
+
+BHWC CalculateOutputShape(const BHWC &input, const ConvolutionTransposedAttributes &attr)
+{
+ return BHWC(input.b, CalculateOutput<Axis::HEIGHT>(input, attr),
+ CalculateOutput<Axis::WIDTH>(input, attr),
+ attr.weights.shape.get<Axis::OUTPUT_CHANNELS>());
+}
+
+BHWDC CalculateOutputShape(const BHWDC &input, const ConvolutionTransposed3DAttributes &attr)
+{
+ return BHWDC(input.b, CalculateOutput<Axis::HEIGHT>(input, attr),
+ CalculateOutput<Axis::WIDTH>(input, attr), CalculateOutput<Axis::DEPTH>(input, attr),
+ attr.weights.shape.get<Axis::OUTPUT_CHANNELS>());
+}
+
+BHWC CalculateOutputShape(const BHWC &input, const DepthwiseConvolution2DAttributes &attr)
+{
+ return BHWC(input.b, CalculateOutput<Axis::HEIGHT>(input, attr),
+ CalculateOutput<Axis::WIDTH>(input, attr),
+ attr.weights.shape.get<Axis::OUTPUT_CHANNELS>() *
+ attr.weights.shape.get<Axis::INPUT_CHANNELS>());
+}
+
+BHWDC CalculateOutputShape(const BHWDC &input, const DepthwiseConvolution3DAttributes &attr)
+{
+ return BHWDC(input.b, CalculateOutput<Axis::HEIGHT>(input, attr),
+ CalculateOutput<Axis::WIDTH>(input, attr), CalculateOutput<Axis::DEPTH>(input, attr),
+ attr.weights.shape.get<Axis::OUTPUT_CHANNELS>() *
+ attr.weights.shape.get<Axis::INPUT_CHANNELS>());
+}
+
+BHWC CalculateOutputShape(const BHWC &input, const SliceAttributes &attr)
+{
+ (void)input;
+ return BHWC(StridedSize(attr.ends.b - attr.starts.b, attr.strides.b),
+ StridedSize(attr.ends.h - attr.starts.h, attr.strides.h),
+ StridedSize(attr.ends.w - attr.starts.w, attr.strides.w),
+ StridedSize(attr.ends.c - attr.starts.c, attr.strides.c));
+}
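+
+// Worked example (illustrative only): starts.h = 1, ends.h = 7, strides.h = 2
+// yields StridedSize(6, 2) = 3 rows; the input shape itself is unused because
+// starts and ends are already absolute coordinates.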
+
+BHWDC CalculateOutputShape(const BHWDC &input, const Slice3DAttributes &attr)
+{
+ (void)input;
+ return BHWDC(StridedSize(attr.ends.b - attr.starts.b, attr.strides.b),
+ StridedSize(attr.ends.h - attr.starts.h, attr.strides.h),
+ StridedSize(attr.ends.w - attr.starts.w, attr.strides.w),
+ StridedSize(attr.ends.d - attr.starts.d, attr.strides.d),
+ StridedSize(attr.ends.c - attr.starts.c, attr.strides.c));
+}
+
+BHWC CalculateOutputShape(const BHWC &input, const PadAttributes &attr)
+{
+ return BHWC(
+ attr.appended.b + attr.prepended.b + input.b, attr.appended.h + attr.prepended.h + input.h,
+ attr.appended.w + attr.prepended.w + input.w, attr.appended.c + attr.prepended.c + input.c);
+}
+
+BHWDC CalculateOutputShape(const BHWDC &input, const Pad3DAttributes &attr)
+{
+ return BHWDC(
+ attr.appended.b + attr.prepended.b + input.b, attr.appended.h + attr.prepended.h + input.h,
+ attr.appended.w + attr.prepended.w + input.w, attr.appended.d + attr.prepended.d + input.d,
+ attr.appended.c + attr.prepended.c + input.c);
+}
+
+BHWC CalculateOutputShape(const BHWC &input, const FullyConnectedAttributes &attr)
+{
+ return BHWC(input.b, 1, 1, attr.weights.shape.o);
+}
+
+BHWC CalculateOutputShape(const BHWC &input, const MeanAttributes &attr)
+{
+ const int b = attr.dims.find(Axis::BATCH) == attr.dims.end() ? input.b : 1;
+ const int h = attr.dims.find(Axis::HEIGHT) == attr.dims.end() ? input.h : 1;
+ const int w = attr.dims.find(Axis::WIDTH) == attr.dims.end() ? input.w : 1;
+ const int c = attr.dims.find(Axis::CHANNELS) == attr.dims.end() ? input.c : 1;
+ return BHWC(b, h, w, c);
+}
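+
+// Worked example (illustrative only): input BHWC(1, 16, 16, 8) with
+// attr.dims = {Axis::HEIGHT, Axis::WIDTH} reduces to BHWC(1, 1, 1, 8).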
+
+absl::Status CalculateOutputShape(const std::vector<BHWC> &input, const ConcatAttributes &attr,
+ BHWC *output_shape)
+{
+ BHWC new_shape = input[0];
+ switch (attr.axis)
+ {
+ case Axis::CHANNELS:
+ for (size_t i = 1; i < input.size(); i++)
+ {
+ if (input[i].h != new_shape.h || input[i].w != new_shape.w || input[i].b != new_shape.b)
+ {
+ return absl::InvalidArgumentError(
+ "Height, Width and Batch must be the same when concatenating "
+ "by channels axis");
+ }
+ new_shape.c += input[i].c;
+ }
+ break;
+ case Axis::HEIGHT:
+ for (size_t i = 1; i < input.size(); i++)
+ {
+ if (input[i].w != new_shape.w || input[i].c != new_shape.c || input[i].b != new_shape.b)
+ {
+ return absl::InvalidArgumentError(
+ "Channels, Width and Batch must be the same when concatenating "
+ "by height axis");
+ }
+ new_shape.h += input[i].h;
+ }
+ break;
+ case Axis::WIDTH:
+ for (size_t i = 1; i < input.size(); i++)
+ {
+ if (input[i].h != new_shape.h || input[i].c != new_shape.c || input[i].b != new_shape.b)
+ {
+ return absl::InvalidArgumentError(
+ "Height, Channels and Batch must be the same when concatenating "
+ "by width axis");
+ }
+ new_shape.w += input[i].w;
+ }
+ break;
+ case Axis::BATCH:
+ for (size_t i = 1; i < input.size(); i++)
+ {
+ if (input[i].h != new_shape.h || input[i].c != new_shape.c || input[i].w != new_shape.w)
+ {
+ return absl::InvalidArgumentError(
+ "Width, Height and Channels must be the same when concatenating "
+ "by batch axis");
+ }
+ new_shape.b += input[i].b;
+ }
+ break;
+ default:
+ return absl::InvalidArgumentError("Invalid axis");
+ break;
+ }
+ *output_shape = new_shape;
+ return absl::OkStatus();
+}
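+
+// Usage sketch (illustrative only): concatenating BHWC(1, 8, 8, 3) and
+// BHWC(1, 8, 8, 5) with attr.axis == Axis::CHANNELS produces BHWC(1, 8, 8, 8);
+// any mismatch in the non-concatenated dimensions returns InvalidArgumentError.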
+
+absl::Status CalculateOutputShape(const std::vector<BHWDC> &input, const ConcatAttributes &attr,
+ BHWDC *output_shape)
+{
+ BHWDC new_shape = input[0];
+ switch (attr.axis)
+ {
+ case Axis::CHANNELS:
+ for (size_t i = 1; i < input.size(); ++i)
+ {
+ if (input[i].h != new_shape.h || input[i].w != new_shape.w || input[i].d != new_shape.d ||
+ input[i].b != new_shape.b)
+ {
+ return absl::InvalidArgumentError("Height, Width, Batch and Depth must be the same when "
+ "concatenating "
+ "by channels axis");
+ }
+ new_shape.c += input[i].c;
+ }
+ break;
+ case Axis::HEIGHT:
+ for (size_t i = 1; i < input.size(); ++i)
+ {
+ if (input[i].w != new_shape.w || input[i].c != new_shape.c || input[i].d != new_shape.d ||
+ input[i].b != new_shape.b)
+ {
+ return absl::InvalidArgumentError(
+ "Width, Depth, Batch and Channels must be the same when "
+ "concatenating "
+ "by height axis");
+ }
+ new_shape.h += input[i].h;
+ }
+ break;
+ case Axis::WIDTH:
+ for (size_t i = 1; i < input.size(); ++i)
+ {
+ if (input[i].h != new_shape.h || input[i].c != new_shape.c || input[i].d != new_shape.d ||
+ input[i].b != new_shape.b)
+ {
+ return absl::InvalidArgumentError(
+ "Height, Depth, Batch and Channels must be the same when "
+ "concatenating "
+ "by width axis");
+ }
+ new_shape.w += input[i].w;
+ }
+ break;
+ case Axis::DEPTH:
+ for (size_t i = 1; i < input.size(); ++i)
+ {
+ if (input[i].w != new_shape.w || input[i].h != new_shape.h || input[i].c != new_shape.c ||
+ input[i].b != new_shape.b)
+ {
+ return absl::InvalidArgumentError(
+ "Width, Height, Batch and Channels must be the same when "
+ "concatenating "
+ "by depth axis");
+ }
+ new_shape.d += input[i].d;
+ }
+ break;
+ case Axis::BATCH:
+ for (size_t i = 1; i < input.size(); ++i)
+ {
+ if (input[i].w != new_shape.w || input[i].h != new_shape.h || input[i].c != new_shape.c ||
+ input[i].d != new_shape.d)
+ {
+ return absl::InvalidArgumentError(
+ "Width, Height, Depth and Channels must be the same when "
+ "concatenating "
+ "by batch axis");
+ }
+ new_shape.b += input[i].b;
+ }
+ break;
+ default:
+ return absl::InvalidArgumentError("Invalid axis");
+ }
+ *output_shape = new_shape;
+ return absl::OkStatus();
+}
+
+Padding2D CalculateSamePadding(const BHWC &input, const Convolution2DAttributes &attr)
+{
+ return MakeSamePadding(input, attr);
+}
+
+Padding3D CalculateSamePadding(const BHWDC &input, const Convolution3DAttributes &attr)
+{
+ return MakeSamePadding(input, attr);
+}
+
+Padding2D CalculateSamePadding(const BHWC &input, const ConvolutionTransposedAttributes &attr)
+{
+ return MakeSamePadding(input, attr);
+}
+
+Padding3D CalculateSamePadding(const BHWDC &input, const ConvolutionTransposed3DAttributes &attr)
+{
+ return MakeSamePadding(input, attr);
+}
+
+Padding2D CalculateSamePadding(const BHWC &input, const DepthwiseConvolution2DAttributes &attr)
+{
+ return MakeSamePadding(input, attr);
+}
+
+Padding3D CalculateSamePadding(const BHWDC &input, const DepthwiseConvolution3DAttributes &attr)
+{
+ return MakeSamePadding(input, attr);
+}
+
+Padding2D CalculateSamePadding(const BHWC &input, const Pooling2DAttributes &attr)
+{
+ return MakeSamePadding(input, attr);
+}
+
+Padding3D CalculateSamePadding(const BHWDC &input, const Pooling3DAttributes &attr)
+{
+ return MakeSamePadding(input, attr);
+}
+
+Padding2D CalculateSamePadding(const BHWC &input, const MaxUnpooling2DAttributes &attr)
+{
+ return MakeSamePadding(input, attr);
+}
+
+Padding3D CalculateSamePadding(const BHWDC &input, const MaxUnpooling3DAttributes &attr)
+{
+ return MakeSamePadding(input, attr);
+}
+
+float CalculateResizeScale(int32_t input_size, int32_t output_size, const Resize2DAttributes &attr)
+{
+ return attr.align_corners && input_size > 1 && output_size > 1
+ ? static_cast<float>(input_size - 1) / (output_size - 1)
+ : static_cast<float>(input_size) / output_size;
+}
+
+float CalculateResizeScale(int32_t input_size, int32_t output_size, const Resize3DAttributes &attr)
+{
+ return attr.align_corners && input_size > 1 && output_size > 1
+ ? static_cast<float>(input_size - 1) / (output_size - 1)
+ : static_cast<float>(input_size) / output_size;
+}
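+
+// Worked example (illustrative only): input_size = 4, output_size = 8 gives a scale
+// of (4 - 1) / (8 - 1) = 3/7 with align_corners, versus 4 / 8 = 0.5 without it.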
+
+BHWC CalculateOutputShape(const BHWC &input, const Resize2DAttributes &attr)
+{
+ return BHWC(input.b, attr.new_shape.h, attr.new_shape.w, input.c);
+}
+
+BHWDC CalculateOutputShape(const BHWDC &input, const Resize3DAttributes &attr)
+{
+ return BHWDC(input.b, attr.new_shape.h, attr.new_shape.w, attr.new_shape.d, input.c);
+}
+
+BHWC CalculateOutputShape(const BHWC &input, const TransposeAttributes &attr)
+{
+ return BHWC(input.get(attr.perm.b), input.get(attr.perm.h), input.get(attr.perm.w),
+ input.get(attr.perm.c));
+}
+
+BHWDC CalculateOutputShape(const BHWDC &input, const Transpose3DAttributes &attr)
+{
+ return BHWDC(input.get(attr.perm.b), input.get(attr.perm.h), input.get(attr.perm.w),
+ input.get(attr.perm.d), input.get(attr.perm.c));
+}
+
+} // namespace gpu_cl
+} // namespace backend
+} // namespace onert
diff --git a/runtime/onert/backend/gpu_cl/open_cl/Operations.h b/runtime/onert/backend/gpu_cl/open_cl/Operations.h
new file mode 100644
index 000000000..825eb90a4
--- /dev/null
+++ b/runtime/onert/backend/gpu_cl/open_cl/Operations.h
@@ -0,0 +1,586 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __ONERT_BACKEND_GPU_CL_OPENCL_OPERATIONS_H__
+#define __ONERT_BACKEND_GPU_CL_OPENCL_OPERATIONS_H__
+
+#include <cstdint>
+#include <set>
+#include <string>
+#include <vector>
+
+#include "absl/types/variant.h"
+
+#include "DataType.h"
+#include "Shape.h"
+#include "Status.h"
+#include "InternalTensor.h"
+
+namespace onert
+{
+namespace backend
+{
+namespace gpu_cl
+{
+
+enum class OperationType
+{
+ UNKNOWN = 0,
+ // ABS,
+ ADD,
+ // BATCH_TO_SPACE,
+ // BATCH_NORMALIZATION,
+ // BATCHED_MATMUL,
+ // CONCAT,
+ // CONST,
+ // CONVOLUTION_2D,
+ // CONVOLUTION_TRANSPOSED,
+ // COPY,
+ // COS,
+ // DEPTHWISE_CONVOLUTION,
+ // DIV,
+ // ELU,
+ // EQUAL,
+ // EXP,
+ // FULLY_CONNECTED,
+ // GREATER,
+ // GREATER_EQUAL,
+ // HARD_SWISH,
+ // LESS,
+ // LESS_EQUAL,
+ // LOG,
+ // LSTM,
+ // MAXIMUM,
+ // MAX_UNPOOLING_2D,
+ // MEAN,
+ // MEAN_STDDEV_NORMALIZATION,
+ // MINIMUM,
+ // MUL,
+ // NEG,
+ // NOT_EQUAL,
+ // PAD,
+ // POOLING_2D,
+ // POW,
+ // PRELU,
+ // Used to accurately run inference on quantized models.
+ // QUANTIZE_AND_DEQUANTIZE,
+ // REDUCE_MAXIMUM,
+ // REDUCE_MINIMUM,
+ // REDUCE_PRODUCT,
+ // REDUCE_SUM,
+ // RELU,
+ // RESHAPE,
+ // RESIZE,
+ // RSQRT,
+ // SIGMOID,
+ // SIN,
+ // SLICE,
+ // SOFTMAX,
+ // SPACE_TO_BATCH,
+ // SPACE_TO_DEPTH,
+ // SQRT,
+ // SQUARE,
+ // SQUARED_DIFF,
+ // SUB,
+ // TANH,
+ // TRANSPOSE,
+};
+
+std::string ToString(enum OperationType op);
+
+OperationType OperationTypeFromString(const std::string &name);
+
+typedef absl::variant<absl::monostate, InternalTensor<HWC, DataType::FLOAT32>,
+ InternalTensor<Linear, DataType::FLOAT32>, float>
+ TensorOrScalar;
+
+struct Padding2D
+{
+ Padding2D() = default;
+ Padding2D(const Padding2D &);
+ Padding2D &operator=(const Padding2D &value);
+ bool operator==(const Padding2D &value);
+ bool operator!=(const Padding2D &value);
+ Padding2D &operator-(const Padding2D &value);
+
+ // Padding values for every axis (if needed), where 'prepended' defines the
+ // padding at the beginning of each axis and 'appended' the padding at the end
+ // of the corresponding axis.
+ HW prepended = HW(-1, -1);
+ HW appended = HW(-1, -1);
+};
+
+struct Padding3D
+{
+ Padding3D() = default;
+ Padding3D(const Padding3D &);
+ Padding3D &operator=(const Padding3D &value);
+ bool operator==(const Padding3D &value);
+ bool operator!=(const Padding3D &value);
+ Padding3D &operator-(const Padding3D &value);
+ // Padding values for every axis (if needed), where 'prepended' defines the
+ // padding at the beginning of each axis and 'appended' the padding at the end
+ // of the corresponding axis.
+ HWD prepended = HWD(0, 0, 0);
+ HWD appended = HWD(0, 0, 0);
+};
+
+struct Crop2D : public Padding2D
+{
+};
+
+struct SpaceToBatchAttributes
+{
+ HW block;
+ Padding2D padding;
+};
+
+struct BatchToSpaceAttributes
+{
+ HW block;
+ Crop2D crop;
+};
+
+enum class PoolingType
+{
+ UNDEFINED = 0,
+
+ // average pooling
+ AVERAGE = 1,
+
+ // max pooling
+ MAX = 2,
+};
+
+struct Pooling2DAttributes
+{
+ PoolingType type = PoolingType::UNDEFINED;
+ // Strides for every axis.
+ HW strides = HW(-1, -1);
+ HW kernel = HW(-1, -1);
+ Padding2D padding;
+ // NOTE(akulik): technically the number of outputs from Pooling node indicates
+ // whether indices are needed or not, but I decided to keep it inside
+ // attributes to simplify processing.
+ bool output_indices = false;
+};
+
+struct Pooling3DAttributes
+{
+ PoolingType type = PoolingType::UNDEFINED;
+ // Strides for every axis.
+ HWD strides = HWD(0, 0, 0);
+ HWD kernel = HWD(0, 0, 0);
+ Padding3D padding;
+ // NOTE(akulik): technically the number of outputs from Pooling node indicates
+ // whether indices are needed or not, but I decided to keep it inside
+ // attributes to simplify processing.
+ bool output_indices = false;
+};
+
+struct MaxUnpooling2DAttributes
+{
+ // Strides for every axis.
+ HW strides = HW(-1, -1);
+ HW kernel = HW(-1, -1);
+ Padding2D padding;
+};
+
+struct MaxUnpooling3DAttributes
+{
+ // Strides for every axis.
+ HWD strides = HWD(0, 0, 0);
+ HWD kernel = HWD(0, 0, 0);
+ Padding3D padding;
+};
+
+struct MeanAttributes
+{
+ // The set of dimensions to calculate the mean along.
+ std::set<Axis> dims;
+};
+
+struct ConcatAttributes
+{
+ // Defines the axis along which to concatenate.
+ Axis axis = Axis::UNKNOWN;
+};
+
+// @return shape of a tensor after MaxUnpooling2D operation is applied to
+// the given input.
+BHWC CalculateOutputShape(const BHWC &input, const MaxUnpooling2DAttributes &attr);
+
+// @return shape of a tensor after MaxUnpooling3D operation is applied to
+// the given input.
+BHWDC CalculateOutputShape(const BHWDC &input, const MaxUnpooling3DAttributes &attr);
+
+// @return shape of a tensor after Pooling2D operation is applied to the given
+// input.
+BHWC CalculateOutputShape(const BHWC &input, const Pooling2DAttributes &attr);
+
+// @return shape of a tensor after Pooling3D operation is applied to the given
+// input.
+BHWDC CalculateOutputShape(const BHWDC &input, const Pooling3DAttributes &attr);
+
+// @return shape of a tensor after Concat operation is applied to the given
+// input.
+absl::Status CalculateOutputShape(const std::vector<BHWC> &input, const ConcatAttributes &attr,
+ BHWC *output_shape);
+
+// @return shape of a tensor after Concat operation is applied to the given
+// input.
+absl::Status CalculateOutputShape(const std::vector<BHWDC> &input, const ConcatAttributes &attr,
+ BHWDC *output_shape);
+
+// @return padding for the pooling operation to make sure the output keeps the
+// same shape as the given input.
+Padding2D CalculateSamePadding(const BHWC &input, const Pooling2DAttributes &attr);
+
+// @return padding for the pooling operation to make sure the output keeps the
+// same shape as the given input.
+Padding3D CalculateSamePadding(const BHWDC &input, const Pooling3DAttributes &attr);
+
+// @return padding for the max unpooling operation to make sure the output keeps
+// the same shape as the given input.
+Padding2D CalculateSamePadding(const BHWC &input, const MaxUnpooling2DAttributes &attr);
+
+// @return padding for the max unpooling operation to make sure the output keeps
+// the same shape as the given input.
+Padding3D CalculateSamePadding(const BHWDC &input, const MaxUnpooling3DAttributes &attr);
+
+struct Convolution2DAttributes
+{
+ HW strides = HW(1, 1); // Along each axis.
+ HW dilations = HW(1, 1); // Along each axis.
+ Padding2D padding;
+
+ InternalTensor<OHWI, DataType::FLOAT32> weights;
+ InternalTensor<Linear, DataType::FLOAT32> bias; // optional
+};
+
+struct Convolution3DAttributes
+{
+ HWD strides = HWD(0, 0, 0); // Along each axis.
+ HWD dilations = HWD(0, 0, 0); // Along each axis.
+ Padding3D padding;
+
+ InternalTensor<OHWDI, DataType::FLOAT32> weights;
+ InternalTensor<Linear, DataType::FLOAT32> bias; // optional
+};
+
+// @return shape of a tensor after Convolution2D operation is applied to
+// the given input.
+BHWC CalculateOutputShape(const BHWC &input, const Convolution2DAttributes &attr);
+
+// @return shape of a tensor after Convolution3D operation is applied to
+// the given input.
+BHWDC CalculateOutputShape(const BHWDC &input, const Convolution3DAttributes &attr);
+
+// @return padding for the convolution operation to make sure the output keeps
+// the same shape as the given input.
+Padding2D CalculateSamePadding(const BHWC &input, const Convolution2DAttributes &attr);
+
+// @return padding for the convolution operation to make sure the output keeps
+// the same shape as the given input.
+Padding3D CalculateSamePadding(const BHWDC &input, const Convolution3DAttributes &attr);
+
+struct ConvolutionTransposedAttributes
+{
+ HW stride = HW(1, 1); // Along each axis.
+ HW adjacent; // TODO(sorokin): No op on Flow.
+ Padding2D padding;
+
+ InternalTensor<OHWI, DataType::FLOAT32> weights;
+ InternalTensor<Linear, DataType::FLOAT32> bias; // optional
+};
+
+struct ConvolutionTransposed3DAttributes
+{
+ HWD stride = HWD(0, 0, 0); // Along each axis.
+ Padding3D padding;
+
+ InternalTensor<OHWDI, DataType::FLOAT32> weights;
+ InternalTensor<Linear, DataType::FLOAT32> bias; // optional
+};
+
+Padding2D CalculateSamePadding(const BHWC &input, const ConvolutionTransposedAttributes &attr);
+
+Padding3D CalculateSamePadding(const BHWDC &input, const ConvolutionTransposed3DAttributes &attr);
+
+// @return shape of a tensor after ConvolutionTransposed operation is applied to
+// the given input.
+BHWC CalculateOutputShape(const BHWC &input, const ConvolutionTransposedAttributes &attr);
+
+// @return shape of a tensor after ConvolutionTransposed3D operation is applied
+// to the given input.
+BHWDC CalculateOutputShape(const BHWDC &input, const ConvolutionTransposed3DAttributes &attr);
+
+struct DepthwiseConvolution2DAttributes : public Convolution2DAttributes
+{
+};
+struct DepthwiseConvolution3DAttributes : public Convolution3DAttributes
+{
+};
+
+// @return shape of a tensor after DepthwiseConvolution2D operation is applied
+// to the given input.
+BHWC CalculateOutputShape(const BHWC &input, const DepthwiseConvolution2DAttributes &attr);
+
+// @return shape of a tensor after DepthwiseConvolution3D operation is applied
+// to the given input.
+BHWDC CalculateOutputShape(const BHWDC &input, const DepthwiseConvolution3DAttributes &attr);
+
+// @return padding for the depthwise convolution operation to make sure the
+// output keeps the same shape as the given input.
+Padding2D CalculateSamePadding(const BHWC &input, const DepthwiseConvolution2DAttributes &attr);
+
+// @return padding for the depthwise convolution operation to make sure the
+// output keeps the same shape as the given input.
+Padding3D CalculateSamePadding(const BHWDC &input, const DepthwiseConvolution3DAttributes &attr);
+
+// f(x):= {
+// if x < 0 : x -> alpha * x
+// if x >= 0 : x -> min(clip, x)
+// }
+//
+// Examples:
+// - ReLU: clip = 0, alpha = 0
+// - ReLU6: clip = 6, alpha = 0
+// - Leaky ReLU: clip = 0, alpha = a
+struct ReLUAttributes
+{
+ // clip <= 0 means it is not set.
+ float clip = 0;
+
+ float alpha = 0;
+};
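+
+// Illustrative scalar sketch of the piecewise definition above (an assumption,
+// not an API of this header; the name ApplyReLUSketch is hypothetical):
+//
+// inline float ApplyReLUSketch(float x, const ReLUAttributes &a)
+// {
+//   if (x < 0.0f)
+//     return a.alpha * x;
+//   return (a.clip > 0.0f && x > a.clip) ? a.clip : x;
+// }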
+
+struct PReLUAttributes
+{
+ // clip <= 0 means it is not set.
+ float clip = 0;
+
+ // If alpha is linear, then it is sharded across the CHANNELS axis; otherwise
+ // a full-shape alpha is required.
+ absl::variant<InternalTensor<Linear, DataType::FLOAT32>, InternalTensor<HWC, DataType::FLOAT32>>
+ alpha;
+};
+
+struct ReduceAttributes
+{
+ Axis axis = Axis::UNKNOWN;
+};
+
+struct SoftmaxAttributes
+{
+ Axis axis = Axis::UNKNOWN;
+};
+
+enum LstmKernelType
+{
+ FULL = 0,
+ BASIC = 1, // Currently, only basic is supported.
+};
+
+struct LstmAttributes
+{
+ LstmKernelType kernel_type = LstmKernelType::BASIC;
+};
+
+enum class SamplingType
+{
+ UNKNOWN = 0,
+ NEAREST = 1,
+ BILINEAR = 2,
+};
+
+struct Resize2DAttributes
+{
+ HW new_shape;
+
+ SamplingType type = SamplingType::UNKNOWN;
+
+ // If true, the centers of the 4 corner pixels of the input and output tensors
+ // are aligned, preserving the values at the corner pixels. Defaults to false.
+ bool align_corners = false;
+
+ bool half_pixel_centers = false;
+};
+
+// TODO(b/147771327): rename to Resize3D
+struct Resize3DAttributes
+{
+ HWD new_shape;
+
+ SamplingType type = SamplingType::NEAREST;
+
+ // If true, the centers of the 8 corner pixels of the input and output tensors
+ // are aligned, preserving the values at the corner pixels. Defaults to false.
+ bool align_corners = false;
+
+ bool half_pixel_centers = false;
+};
+
+float CalculateResizeScale(int32_t input_size, int32_t output_size, const Resize2DAttributes &attr);
+
+float CalculateResizeScale(int32_t input_size, int32_t output_size, const Resize3DAttributes &attr);
+
+// @return shape of a tensor after scale operation is applied to the given
+// input.
+BHWC CalculateOutputShape(const BHWC &input, const Resize2DAttributes &attr);
+
+// @return shape of a tensor after scale operation is applied to the given
+// input.
+BHWDC CalculateOutputShape(const BHWDC &input, const Resize3DAttributes &attr);
+
+enum class PaddingContentType
+{
+ ZEROS = 0,
+ REFLECT = 1,
+ EDGE = 2,
+};
+
+struct PadAttributes
+{
+ PaddingContentType type = PaddingContentType::ZEROS;
+
+ BHWC prepended;
+ BHWC appended;
+};
+
+// @return shape of a tensor after Pad operation is applied to the given input.
+BHWC CalculateOutputShape(const BHWC &input, const PadAttributes &attr);
+
+struct Pad3DAttributes
+{
+ PaddingContentType type = PaddingContentType::ZEROS;
+
+ BHWDC prepended;
+ BHWDC appended;
+};
+
+// @return shape of a tensor after Pad3D operation is applied to the given
+// input.
+BHWDC CalculateOutputShape(const BHWDC &input, const Pad3DAttributes &attr);
+
+struct ConstTensorAttributes
+{
+ InternalTensor<BHWC, DataType::FLOAT32> tensor;
+};
+
+// Simple slicing without advanced support for shrinking, reverse slicing etc.
+struct SliceAttributes
+{
+ // Specifies start and end dimensions for slicing.
+ BHWC starts;
+ BHWC ends;
+
+ // Stride should be >= 1.
+ BHWC strides;
+};
+
+// @return shape of a tensor after Slice2D operation is applied to the given
+// input.
+BHWC CalculateOutputShape(const BHWC &input, const SliceAttributes &attr);
+
+// Simple slicing without advanced support for shrinking, reverse slicing etc.
+struct Slice3DAttributes
+{
+ // Specifies start and end dimensions for slicing.
+ BHWDC starts;
+ BHWDC ends;
+
+ // Stride should be >= 1.
+ BHWDC strides;
+};
+
+// @return shape of a tensor after Slice3D operation is applied to the given
+// input.
+BHWDC CalculateOutputShape(const BHWDC &input, const Slice3DAttributes &attr);
+
+struct FullyConnectedAttributes
+{
+ InternalTensor<OHWI, DataType::FLOAT32> weights;
+ InternalTensor<Linear, DataType::FLOAT32> bias;
+};
+
+// @return shape of a tensor after FullyConnected operation is applied to
+// the given input.
+BHWC CalculateOutputShape(const BHWC &input, const FullyConnectedAttributes &attr);
+
+// @return shape of a tensor after Mean operation is applied to the given input.
+BHWC CalculateOutputShape(const BHWC &input, const MeanAttributes &attr);
+
+struct ElementwiseAttributes
+{
+ TensorOrScalar param;
+ // For an elementwise operation with 2 inputs op(A, B), runtime_tensor_is_second
+ // is true when the runtime tensor is B (in the second position). This matters
+ // for non-commutative ops, for example subtraction.
+ bool runtime_tensor_is_second = false;
+};
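+// Example (illustrative): for SUB computed as op(A, B) = A - B, with the constant
+// operand stored in `param` and the runtime tensor as B, runtime_tensor_is_second
+// is set to true so the kernel evaluates param - runtime_tensor rather than
+// runtime_tensor - param.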
+
+struct ReshapeAttributes
+{
+ BHWC new_shape;
+};
+
+struct Reshape3DAttributes
+{
+ BHWDC new_shape;
+};
+
+struct TransposeAttributes
+{
+ // A permutation of the dimensions of input tensor
+ BHWC perm;
+};
+
+// @return shape of a tensor after Transpose operation is applied to
+// the given input.
+BHWC CalculateOutputShape(const BHWC &input, const TransposeAttributes &attr);
+
+struct Transpose3DAttributes
+{
+ // A permutation of the dimensions of input tensor
+ BHWDC perm;
+};
+
+// @return shape of a tensor after Transpose3D operation is applied to
+// the given input.
+BHWDC CalculateOutputShape(const BHWDC &input, const Transpose3DAttributes &attr);
+
+struct SpaceToDepthAttributes
+{
+ int block_size;
+};
+
+// These help perform a combination of Quantize & Dequantize to adjust float
+// values like quantized inference would.
+struct QuantizeAndDequantizeAttributes
+{
+ float min = 0;
+ float max = 0;
+ float scale = 0;
+};
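+// Worked example (assuming the standard fake-quantization scheme, which is not
+// defined in this header): with min = -1, max = 1 and scale = 2 / 255, an input
+// v is clamped to [min, max] and snapped to the nearest representable level,
+// i.e. q = round((v - min) / scale) and v' = q * scale + min.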
+
+} // namespace gpu_cl
+} // namespace backend
+} // namespace onert
+
+#endif // __ONERT_BACKEND_GPU_CL_OPENCL_OPERATIONS_H__
diff --git a/runtime/onert/backend/gpu_cl/open_cl/Precision.cc b/runtime/onert/backend/gpu_cl/open_cl/Precision.cc
new file mode 100644
index 000000000..bd908bd43
--- /dev/null
+++ b/runtime/onert/backend/gpu_cl/open_cl/Precision.cc
@@ -0,0 +1,56 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Precision.h"
+
+namespace onert
+{
+namespace backend
+{
+namespace gpu_cl
+{
+
+std::string ToString(CalculationsPrecision precision)
+{
+ switch (precision)
+ {
+ case CalculationsPrecision::F32_F16:
+ return "CalculationsPrecision::F32_F16";
+ case CalculationsPrecision::F32:
+ return "CalculationsPrecision::F32";
+ case CalculationsPrecision::F16:
+ return "CalculationsPrecision::F16";
+ }
+ return " ";
+}
+
+DataType DeduceDataTypeFromPrecision(CalculationsPrecision precision)
+{
+ if (precision == CalculationsPrecision::F32)
+ {
+ return DataType::FLOAT32;
+ }
+ return DataType::FLOAT16;
+}
+
+} // namespace gpu_cl
+} // namespace backend
+} // namespace onert
diff --git a/runtime/onert/backend/gpu_cl/open_cl/Precision.h b/runtime/onert/backend/gpu_cl/open_cl/Precision.h
new file mode 100644
index 000000000..cb910c783
--- /dev/null
+++ b/runtime/onert/backend/gpu_cl/open_cl/Precision.h
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __ONERT_BACKEND_GPU_CL_OPENCL_PRECISION_H__
+#define __ONERT_BACKEND_GPU_CL_OPENCL_PRECISION_H__
+
+#include <string>
+
+#include "DataType.h"
+
+namespace onert
+{
+namespace backend
+{
+namespace gpu_cl
+{
+
+enum class CalculationsPrecision
+{
+ F32,
+ F32_F16,
+ F16
+};
+// F32 - all data and all math ops in F32.
+// F16 - all data and all math ops in F16.
+// F32_F16 - as F16, but some operations (Convolution, DepthwiseConvolution,
+// FullyConnected, ConvolutionTransposed) keep their accumulator in F32: typically
+// four MADs are computed in F16 and summed, then the partial sum is converted to
+// F32 and added to the accumulator.
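+// For example, DeduceDataTypeFromPrecision maps F32 to DataType::FLOAT32 and
+// both F16 and F32_F16 to DataType::FLOAT16, since tensor data is stored in F16
+// for the mixed-precision mode.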
+
+DataType DeduceDataTypeFromPrecision(CalculationsPrecision precision);
+
+std::string ToString(CalculationsPrecision precision);
+
+} // namespace gpu_cl
+} // namespace backend
+} // namespace onert
+
+#endif // __ONERT_BACKEND_GPU_CL_OPENCL_PRECISION_H__
diff --git a/runtime/onert/backend/gpu_cl/open_cl/ProgramCache.cc b/runtime/onert/backend/gpu_cl/open_cl/ProgramCache.cc
new file mode 100644
index 000000000..350d7a1c5
--- /dev/null
+++ b/runtime/onert/backend/gpu_cl/open_cl/ProgramCache.cc
@@ -0,0 +1,97 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "ProgramCache.h"
+
+#include <cstdint>
+#include <string>
+
+#include "ClProgram.h"
+#include "Status.h"
+#include "Util.h"
+#include "farmhash.h"
+
+namespace onert
+{
+namespace backend
+{
+namespace gpu_cl
+{
+
+ProgramCache::ProgramDescriptor::ProgramDescriptor(const std::string &code_text,
+ const std::string &options,
+ bool use_fingerprints)
+ : code(code_text), compiler_options(options), use_fingerprint(use_fingerprints)
+{
+ const uint64_t code_fingerprint = ::util::Fingerprint64(code);
+ const uint64_t options_fingerprint = ::util::Fingerprint64(compiler_options);
+ fingerprint = code_fingerprint + options_fingerprint;
+}
+
+ProgramCache::ProgramDescriptor::ProgramDescriptor(uint64_t fingerprints)
+ : fingerprint(fingerprints), use_fingerprint(true)
+{
+}
+
+ProgramCache::ProgramCache(ProgramCache &&program_cache)
+ : use_fingerprints_(program_cache.use_fingerprints_),
+ programs_(std::move(program_cache.programs_))
+{
+}
+
+ProgramCache &ProgramCache::operator=(ProgramCache &&program_cache)
+{
+ if (this != &program_cache)
+ {
+ use_fingerprints_ = program_cache.use_fingerprints_;
+ programs_ = std::move(program_cache.programs_);
+ }
+ return *this;
+}
+
+absl::Status ProgramCache::GetOrCreateCLKernel(const std::string &code,
+ const std::string &function_name,
+ const std::vector<CompilerOptions> &compiler_options,
+ const CLContext &context, const CLDevice &device,
+ CLKernel *result)
+{
+ const std::string options = CompilerOptionsToString(device, compiler_options);
+ ProgramDescriptor desc{code, options, use_fingerprints_};
+ auto it = programs_.find(desc);
+ if (it != programs_.end())
+ {
+ return result->CreateFromProgram(it->second, function_name);
+ }
+
+ CLProgram program;
+ RETURN_IF_ERROR(CreateCLProgram(code, options, context, device, &program));
+ RETURN_IF_ERROR(result->CreateFromProgram(program, function_name));
+ programs_.insert(std::make_pair(std::move(desc), std::move(program)));
+ return absl::OkStatus();
+}
+
+absl::Status ProgramCache::GetOrCreateCLKernel(const std::string &code,
+ const std::string &function_name,
+ const CLContext &context, const CLDevice &device,
+ CLKernel *result)
+{
+ return GetOrCreateCLKernel(code, function_name, {}, context, device, result);
+}
+
+} // namespace gpu_cl
+} // namespace backend
+} // namespace onert
diff --git a/runtime/onert/backend/gpu_cl/open_cl/ProgramCache.h b/runtime/onert/backend/gpu_cl/open_cl/ProgramCache.h
new file mode 100644
index 000000000..3f5ee0215
--- /dev/null
+++ b/runtime/onert/backend/gpu_cl/open_cl/ProgramCache.h
@@ -0,0 +1,112 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __ONERT_BACKEND_GPU_CL_OPENCL_PROGRAM_CACHE_H__
+#define __ONERT_BACKEND_GPU_CL_OPENCL_PROGRAM_CACHE_H__
+
+#include <cstdint>
+#include <string>
+#include <vector>
+
+#include "absl/container/flat_hash_map.h"
+#include "absl/types/span.h"
+#include "ClContext.h"
+#include "ClDevice.h"
+#include "ClKernel.h"
+#include "ClProgram.h"
+#include "Status.h"
+
+namespace onert
+{
+namespace backend
+{
+namespace gpu_cl
+{
+
+class ProgramCache
+{
+public:
+ ProgramCache() = default;
+
+ ProgramCache(ProgramCache &&program_cache);
+ ProgramCache &operator=(ProgramCache &&program_cache);
+ ProgramCache(const ProgramCache &) = delete;
+ ProgramCache &operator=(const ProgramCache &) = delete;
+
+ absl::Status GetOrCreateCLKernel(const std::string &code, const std::string &function_name,
+ const std::vector<CompilerOptions> &compiler_options,
+ const CLContext &context, const CLDevice &device,
+ CLKernel *result);
+
+ absl::Status GetOrCreateCLKernel(const std::string &code, const std::string &function_name,
+ const CLContext &context, const CLDevice &device,
+ CLKernel *result);
+
+private:
+ struct ProgramDescriptor
+ {
+ ProgramDescriptor() = default;
+ ProgramDescriptor(const std::string &code_text, const std::string &options,
+ bool use_fingerprint);
+ explicit ProgramDescriptor(uint64_t fingerprint);
+
+ std::string code;
+ std::string compiler_options;
+ uint64_t fingerprint;
+ bool use_fingerprint;
+ };
+ struct ProgramDescriptorHasher
+ {
+ std::size_t operator()(const ProgramDescriptor &k) const
+ {
+ if (k.use_fingerprint)
+ {
+ return std::hash<uint64_t>()(k.fingerprint);
+ }
+ else
+ {
+ return std::hash<std::string>()(k.code) + std::hash<std::string>()(k.compiler_options);
+ }
+ }
+ };
+ struct ProgramDescriptorEqual
+ {
+ bool operator()(const ProgramDescriptor &a, const ProgramDescriptor &b) const
+ {
+ if (a.use_fingerprint && b.use_fingerprint)
+ {
+ return a.fingerprint == b.fingerprint;
+ }
+ else
+ {
+ return a.compiler_options == b.compiler_options && a.code == b.code;
+ }
+ }
+ };
+
+ // There is a low probability of a hash collision when cache is deserialized
+ // because only fingerprints are serialized instead of full source code.
+ bool use_fingerprints_ = false;
+ absl::flat_hash_map<ProgramDescriptor, CLProgram, ProgramDescriptorHasher, ProgramDescriptorEqual>
+ programs_;
+};
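+// Minimal usage sketch (illustrative; `code`, `context` and `device` are assumed
+// to be provided by the surrounding backend, and "main_function" is a placeholder
+// kernel name):
+//
+//   ProgramCache cache;
+//   CLKernel kernel;
+//   RETURN_IF_ERROR(
+//     cache.GetOrCreateCLKernel(code, "main_function", context, device, &kernel));
+//   // A later call with identical code and compiler options reuses the cached CLProgram.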
+
+} // namespace gpu_cl
+} // namespace backend
+} // namespace onert
+
+#endif // __ONERT_BACKEND_GPU_CL_OPENCL_PROGRAM_CACHE_H__
diff --git a/runtime/onert/backend/gpu_cl/open_cl/Shape.cc b/runtime/onert/backend/gpu_cl/open_cl/Shape.cc
new file mode 100644
index 000000000..5a2374516
--- /dev/null
+++ b/runtime/onert/backend/gpu_cl/open_cl/Shape.cc
@@ -0,0 +1,141 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Shape.h"
+
+#include <stdint.h>
+
+#include <string>
+#include <vector>
+
+#include "absl/strings/str_cat.h"
+#include "absl/strings/str_join.h"
+
+namespace onert
+{
+namespace backend
+{
+namespace gpu_cl
+{
+namespace
+{
+
+struct GetAxisByIndexFunc
+{
+ template <Layout T> Axis operator()() const { return GetAxis<T>(index); }
+ int32_t index;
+};
+
+struct GetIndexByAxisFunc
+{
+ template <Layout T> int operator()() const { return GetAxisIndex<T>(axis); }
+ Axis axis;
+};
+
+struct NumAxisFunc
+{
+ template <Layout T> int operator()() const { return Size<T>(); }
+};
+
+} // namespace
+
+std::string ToString(Axis axis)
+{
+ switch (axis)
+ {
+ case Axis::BATCH:
+ return "batch";
+ case Axis::CHANNELS:
+ return "channels";
+ case Axis::INPUT_CHANNELS:
+ return "input_channels";
+ case Axis::OUTPUT_CHANNELS:
+ return "output_channels";
+ case Axis::HEIGHT:
+ return "height";
+ case Axis::WIDTH:
+ return "width";
+ case Axis::VALUE:
+ return "value";
+ case Axis::DEPTH:
+ return "depth";
+ case Axis::UNKNOWN:
+ return "unknown";
+ }
+ return "undefined";
+}
+
+std::string ToString(Layout layout)
+{
+ switch (layout)
+ {
+ case Layout::SCALAR:
+ return "scalar";
+ case Layout::LINEAR:
+ return "linear";
+ case Layout::HW:
+ return "hw";
+ case Layout::HWD:
+ return "hwd";
+ case Layout::CHW:
+ return "chw";
+ case Layout::HWC:
+ return "hwc";
+ case Layout::HWDC:
+ return "hwdc";
+ case Layout::OHWI:
+ return "ohwi";
+ case Layout::IHWO:
+ return "ihwo";
+ case Layout::OIHW:
+ return "oihw";
+ case Layout::IOHW:
+ return "iohw";
+ case Layout::BHWC:
+ return "bhwc";
+ case Layout::BHWDC:
+ return "bhwdc";
+ case Layout::OHWDI:
+ return "ohwdi";
+ case Layout::UNKNOWN:
+ return "unknown";
+ }
+ return "undefined";
+}
+
+Axis GetAxis(Layout layout, int32_t index)
+{
+ return DispatchByLayout(layout, GetAxisByIndexFunc{index});
+}
+
+int GetAxisIndex(Layout layout, Axis axis)
+{
+ return DispatchByLayout(layout, GetIndexByAxisFunc{axis});
+}
+
+bool HasAxis(Layout layout, Axis axis) { return GetAxisIndex(layout, axis) >= 0; }
+
+int Size(Layout layout) { return DispatchByLayout(layout, NumAxisFunc()); }
+
+std::string ToString(const Shape &s)
+{
+ return absl::StrCat("{", ToString(s.layout), ", {", absl::StrJoin(s.dimensions, ", "), "}}");
+}
+
+} // namespace gpu_cl
+} // namespace backend
+} // namespace onert
diff --git a/runtime/onert/backend/gpu_cl/open_cl/Shape.h b/runtime/onert/backend/gpu_cl/open_cl/Shape.h
new file mode 100644
index 000000000..3767e106f
--- /dev/null
+++ b/runtime/onert/backend/gpu_cl/open_cl/Shape.h
@@ -0,0 +1,668 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __ONERT_BACKEND_GPU_CL_OPENCL_SHAPE_H__
+#define __ONERT_BACKEND_GPU_CL_OPENCL_SHAPE_H__
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <array>
+#include <functional>
+#include <numeric>
+#include <string>
+#include <utility>
+#include <vector>
+
+namespace onert
+{
+namespace backend
+{
+namespace gpu_cl
+{
+
+enum class Axis
+{
+ UNKNOWN = 0,
+ CHANNELS = 1,
+ INPUT_CHANNELS = 2,
+ OUTPUT_CHANNELS = 3,
+ HEIGHT = 4,
+ WIDTH = 5,
+ BATCH = 6,
+ VALUE = 7,
+ DEPTH = 8,
+};
+
+std::string ToString(Axis t);
+
+// Layout represents axis order.
+enum class Layout
+{
+ UNKNOWN = 0,
+ SCALAR = 1,
+ LINEAR = 2,
+ HW = 3,
+ CHW = 4,
+ HWC = 5,
+ OIHW = 6,
+ OHWI = 7,
+ IHWO = 8,
+ IOHW = 9,
+ BHWC = 10,
+ HWDC = 11,
+ BHWDC = 12,
+ HWD = 13,
+ OHWDI = 14,
+};
+
+std::string ToString(Layout l);
+
+// Returns the number of axes for the fixed layout.
+template <Layout T> constexpr int Size();
+
+// Returns the number of axes for the given layout.
+int Size(Layout layout);
+
+// Returns Axis for the given index and fixed layout.
+template <Layout T> constexpr Axis GetAxis(int index);
+
+// Returns axis for the given layout and index.
+Axis GetAxis(Layout layout, int32_t index);
+
+// Returns axis index for the given axis and fixed layout.
+template <Layout T> constexpr int GetAxisIndex(Axis axis);
+
+// Returns axis index for the given layout and axis.
+int GetAxisIndex(Layout layout, Axis axis);
+
+// Checks whether the fixed layout has the given axis.
+template <Layout T> constexpr bool HasAxis(Axis axis);
+
+// Checks whether the given layout has the given axis.
+bool HasAxis(Layout layout, Axis axis);
+
+// Stores the Layout (axis set and order) and the values of the dimensions.
+struct Shape
+{
+ Shape() : layout(Layout::UNKNOWN), dimensions() {}
+
+ explicit Shape(Layout t) : layout(t), dimensions(Size(t)) {}
+
+ Shape(Layout t, std::vector<int32_t> d) : layout(t), dimensions(std::move(d)) {}
+
+ bool operator==(const Shape &other) const
+ {
+ return (layout == other.layout) && (dimensions == other.dimensions);
+ }
+
+ bool operator!=(const Shape &other) const { return !operator==(other); }
+
+ // All methods below match the corresponding methods defined in StrongShape so
+ // that generic algorithms work both ways.
+
+ // Returns the dimension, or -1 if it is not found.
+ template <Axis D> int32_t get() const;
+ int32_t get(Axis axis) const;
+
+ template <Axis D> bool set(int32_t t);
+ bool set(Axis axis, int32_t t);
+
+ Axis axis(int index) const { return GetAxis(layout, index); }
+
+ int index(Axis axis) const { return GetAxisIndex(layout, axis); }
+
+ bool has(Axis axis) const { return HasAxis(layout, axis); }
+
+ int64_t DimensionsProduct() const
+ {
+ return std::accumulate(dimensions.begin(), dimensions.end(), 1ll, std::multiplies<int64_t>());
+ }
+
+ Layout layout = Layout::UNKNOWN;
+
+ std::vector<int32_t> dimensions;
+};
+
+std::string ToString(const Shape &s);
+
+// StrongShape provides convenient explicit access to dimensions stored in
+// shape, e.g. StrongShape<Layout::HW> s; provides s.h and s.w accessors.
+//
+// There is a conversion possible both ways between Shape and StrongShape.
+//
+// OIHW oihw; // specific shape
+// Shape l = oihw.ToShape();
+//
+// OHWI other; // note: not the same layout, but a compatible shape.
+// if (!other.Adopt(l)) {
+// // error handling
+// }
+//
+// StrongShape supports the following set of operations:
+//
+// // Returns number of axis in the shape class.
+// static constexpr int size();
+//
+// // Returns Axis for the given index or Axis::UNKNOWN if index
+// // falls outside of the defined range in this shape.
+// static constexpr Axis axis(int index);
+//
+// // Returns index for the given axis or -1 if axis is not defined in this
+// // shape.
+// static constexpr int index(Axis axis);
+//
+// // Getters
+// int32_t get(int index) const;
+// int32_t get(Axis axis) const;
+// int32_t get<Axis>() const;
+//
+// // Setters that return false if set was not successful.
+// bool set(int index, int32_t v);
+// bool set(Axis axis, int32_t v);
+// bool set<Axis>(int32_t v);
+//
+// // Returns shape's layout.
+// static const Layout layout;
+//
+// // Turns specific shape into generic shape.
+// Shape ToShape() const;
+//
+// // Copies all dimensions from the given shape.
+// bool Adopt(const Shape&);
+//
+template <Layout L> struct StrongShape;
+
+using Scalar = StrongShape<Layout::SCALAR>;
+using Linear = StrongShape<Layout::LINEAR>;
+using HW = StrongShape<Layout::HW>;
+using HWD = StrongShape<Layout::HWD>;
+
+// Common tensor shape for CNN models working with images.
+using CHW = StrongShape<Layout::CHW>;
+using HWC = StrongShape<Layout::HWC>;
+using HWDC = StrongShape<Layout::HWDC>;
+using BHWC = StrongShape<Layout::BHWC>;
+using BHWDC = StrongShape<Layout::BHWDC>;
+
+// Tensor shape used in convolution_2d weights.
+using OIHW = StrongShape<Layout::OIHW>;
+using OHWI = StrongShape<Layout::OHWI>;
+using IHWO = StrongShape<Layout::IHWO>;
+using IOHW = StrongShape<Layout::IOHW>;
+
+// Tensor shape used in convolution_3d weights.
+using OHWDI = StrongShape<Layout::OHWDI>;
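+// Minimal usage sketch (illustrative only; the values are arbitrary):
+//
+//   BHWC input(1, 224, 224, 3);                        // b, h, w, c in layout order
+//   int32_t channels = input.get<Axis::CHANNELS>();    // 3
+//   OHWI weights;
+//   weights.Adopt(Shape(Layout::OIHW, {32, 3, 3, 3})); // copies O, H, W, I by axis
+//   Shape generic = input.ToShape();                   // back to a generic Shape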
+
+// -----------------------------------------------------------------------------
+// Everything below this point is internal implementation detail.
+// -----------------------------------------------------------------------------
+
+namespace internal_shape
+{
+
+template <Axis T> struct AxisTraits;
+
+#define TFLITE_GPU_AXIS_TRAITS(AxisName, HolderName) \
+ template <> struct AxisTraits<Axis::AxisName> \
+ { \
+ struct Holder \
+ { \
+ int32_t HolderName; \
+ \
+ protected: \
+ int32_t operator()() const { return HolderName; } \
+ void operator()(int32_t v) { HolderName = v; } \
+ }; \
+ \
+ using dimension_holder_type = Holder; \
+ }
+
+TFLITE_GPU_AXIS_TRAITS(CHANNELS, c);
+TFLITE_GPU_AXIS_TRAITS(HEIGHT, h);
+TFLITE_GPU_AXIS_TRAITS(WIDTH, w);
+TFLITE_GPU_AXIS_TRAITS(INPUT_CHANNELS, i);
+TFLITE_GPU_AXIS_TRAITS(OUTPUT_CHANNELS, o);
+TFLITE_GPU_AXIS_TRAITS(BATCH, b);
+TFLITE_GPU_AXIS_TRAITS(VALUE, v);
+TFLITE_GPU_AXIS_TRAITS(DEPTH, d);
+
+#undef TFLITE_GPU_AXIS_TRAITS
+
+template <int N, Axis... As> struct StrongShapeImpl;
+
+template <int N> struct StrongShapeImpl<N>
+{
+ static constexpr int size() { return N; }
+
+ static constexpr Axis axis(int) { return Axis::UNKNOWN; }
+
+ static constexpr int index(Axis) { return -1; }
+
+ static constexpr bool has(Axis) { return false; }
+
+ int32_t get(Axis) const { return -1; }
+
+ int32_t get(int) const { return -1; }
+
+ template <Axis B> int32_t get() const { return -1; }
+
+ bool set(Axis, int32_t) { return false; }
+
+ bool set(int, int32_t) { return false; }
+
+ template <Axis B> bool set(int32_t) { return false; }
+};
+
+// Used to deduce the number of axes, and to inherit from the proper holder so
+// that each dimension can be accessed by name.
+template <int N, Axis A, Axis... As>
+struct StrongShapeImpl<N, A, As...> : public AxisTraits<A>::dimension_holder_type,
+ public StrongShapeImpl<N + 1, As...>
+{
+ using dimension_holder_type = typename AxisTraits<A>::dimension_holder_type;
+
+ using rest_type = StrongShapeImpl<N + 1, As...>;
+
+ StrongShapeImpl() : dimension_holder_type{0}, rest_type() {}
+
+ template <typename... Ts>
+ explicit StrongShapeImpl(int32_t t, Ts... ts) : dimension_holder_type{t}, rest_type(ts...)
+ {
+ }
+
+ static constexpr Axis axis(int index) { return index == N ? A : rest_type::axis(index); }
+
+ static constexpr int index(Axis axis) { return axis == A ? N : rest_type::index(axis); }
+
+ static constexpr bool has(Axis axis) { return axis == A ? true : rest_type::has(axis); }
+
+ int32_t get(Axis axis) const
+ {
+ return axis == A ? dimension_holder_type::operator()() : rest_type::get(axis);
+ }
+
+ template <Axis B> int32_t get() const
+ {
+ return B == A ? dimension_holder_type::operator()() : rest_type::template get<B>();
+ }
+
+ int32_t get(int index) const
+ {
+ return index == N ? dimension_holder_type::operator()() : rest_type::get(index);
+ }
+
+ bool set(Axis axis, int32_t t)
+ {
+ if (axis == A)
+ {
+ dimension_holder_type::operator()(t);
+ return true;
+ }
+ return rest_type::set(axis, t);
+ }
+
+ bool set(int index, int32_t t)
+ {
+ if (index == N)
+ {
+ dimension_holder_type::operator()(t);
+ return true;
+ }
+ return rest_type::set(index, t);
+ }
+
+ template <Axis B> bool set(int32_t t)
+ {
+ if (A == B)
+ {
+ dimension_holder_type::operator()(t);
+ return true;
+ }
+ return rest_type::template set<B>(t);
+ }
+};
+
+template <Layout T> struct LayoutTraits;
+
+#define TFLITE_GPU_LAYOUT_TRAITS(LayoutName, ...) \
+ template <> struct LayoutTraits<Layout::LayoutName> \
+ { \
+ using strong_shape_type = StrongShapeImpl<0, __VA_ARGS__>; \
+ }
+
+TFLITE_GPU_LAYOUT_TRAITS(HW, Axis::HEIGHT, Axis::WIDTH);
+TFLITE_GPU_LAYOUT_TRAITS(HWD, Axis::HEIGHT, Axis::WIDTH, Axis::DEPTH);
+TFLITE_GPU_LAYOUT_TRAITS(OHWI, Axis::OUTPUT_CHANNELS, Axis::HEIGHT, Axis::WIDTH,
+ Axis::INPUT_CHANNELS);
+TFLITE_GPU_LAYOUT_TRAITS(OIHW, Axis::OUTPUT_CHANNELS, Axis::INPUT_CHANNELS, Axis::HEIGHT,
+ Axis::WIDTH);
+TFLITE_GPU_LAYOUT_TRAITS(IOHW, Axis::INPUT_CHANNELS, Axis::OUTPUT_CHANNELS, Axis::HEIGHT,
+ Axis::WIDTH);
+TFLITE_GPU_LAYOUT_TRAITS(IHWO, Axis::INPUT_CHANNELS, Axis::HEIGHT, Axis::WIDTH,
+ Axis::OUTPUT_CHANNELS);
+TFLITE_GPU_LAYOUT_TRAITS(CHW, Axis::CHANNELS, Axis::HEIGHT, Axis::WIDTH);
+TFLITE_GPU_LAYOUT_TRAITS(HWC, Axis::HEIGHT, Axis::WIDTH, Axis::CHANNELS);
+TFLITE_GPU_LAYOUT_TRAITS(HWDC, Axis::HEIGHT, Axis::WIDTH, Axis::DEPTH, Axis::CHANNELS);
+TFLITE_GPU_LAYOUT_TRAITS(LINEAR, Axis::VALUE);
+TFLITE_GPU_LAYOUT_TRAITS(SCALAR, Axis::VALUE);
+TFLITE_GPU_LAYOUT_TRAITS(BHWC, Axis::BATCH, Axis::HEIGHT, Axis::WIDTH, Axis::CHANNELS);
+TFLITE_GPU_LAYOUT_TRAITS(BHWDC, Axis::BATCH, Axis::HEIGHT, Axis::WIDTH, Axis::DEPTH,
+ Axis::CHANNELS);
+TFLITE_GPU_LAYOUT_TRAITS(OHWDI, Axis::OUTPUT_CHANNELS, Axis::HEIGHT, Axis::WIDTH, Axis::DEPTH,
+ Axis::INPUT_CHANNELS);
+
+#undef TFLITE_GPU_LAYOUT_TRAITS
+
+template <> struct LayoutTraits<Layout::UNKNOWN>
+{
+ using strong_shape_type = StrongShapeImpl<0>;
+};
+
+template <Axis A> struct DimensionGetterFixedAxisFunc
+{
+ template <Layout T> int32_t operator()() const
+ {
+ constexpr int i = GetAxisIndex<T>(A);
+ return i >= 0 && i < l->dimensions.size() ? l->dimensions[i] : -1;
+ }
+ const Shape *l;
+};
+
+struct DimensionGetterFunc
+{
+ template <Layout T> int32_t operator()() const
+ {
+ uint32_t i = GetAxisIndex<T>(axis);
+ return i < l->dimensions.size() ? l->dimensions[i] : -1;
+ }
+ Axis axis;
+ const Shape *l;
+};
+
+template <Axis A> struct DimensionSetterFixedAxisFunc
+{
+ template <Layout T> bool operator()() const
+ {
+ constexpr uint32_t i = GetAxisIndex<T>(A);
+ if (i < l->dimensions.size())
+ {
+ l->dimensions[i] = v;
+ return true;
+ }
+ return false;
+ }
+ Shape *l;
+ int32_t v;
+};
+
+struct DimensionSetterFunc
+{
+ template <Layout T> bool operator()() const
+ {
+ uint32_t i = GetAxisIndex<T>(axis);
+ if (i < l->dimensions.size())
+ {
+ l->dimensions[i] = v;
+ return true;
+ }
+ return false;
+ }
+ Axis axis;
+ Shape *l;
+ int32_t v;
+};
+
+template <Layout L> struct ToShapeFunc
+{
+ template <Layout T> bool operator()() const
+ {
+ for (int i = 0; i < StrongShape<L>::size(); ++i)
+ {
+ int index = GetAxisIndex<T>(StrongShape<L>::axis(i));
+ if (index < 0)
+ return false;
+ shape->set(i, l.dimensions[index]);
+ }
+ return true;
+ }
+
+ StrongShape<L> *shape;
+ const Shape &l;
+};
+
+} // namespace internal_shape
+
+// template <Axis... As>
+template <Layout L> struct StrongShape : public internal_shape::LayoutTraits<L>::strong_shape_type
+{
+ using strong_shape_type = typename internal_shape::LayoutTraits<L>::strong_shape_type;
+ StrongShape() = default;
+
+ template <typename... Ts> explicit StrongShape(Ts... t) : strong_shape_type(t...) {}
+
+ constexpr static Layout layout = L;
+
+ bool operator==(const StrongShape<L> &shape) const
+ {
+ // TODO(akulik): implement better alternative.
+ return this->ToShape() == shape.ToShape();
+ }
+
+ bool operator!=(const StrongShape<L> &shape) const
+ {
+ // TODO(akulik): implement better alternative.
+ return this->ToShape() != shape.ToShape();
+ }
+ bool empty() const { return DimensionsProduct() == 0; }
+
+ // Turns StrongShape into generic shape.
+ Shape ToShape() const
+ {
+ std::vector<int32_t> dimensions(StrongShape::size());
+ for (int i = 0; i < StrongShape::size(); ++i)
+ {
+ dimensions[i] = StrongShape::get(i);
+ }
+ return Shape(L, std::move(dimensions));
+ }
+
+ // @return all dimensions multiplied
+ int64_t DimensionsProduct() const
+ {
+ int64_t product = 1;
+ for (int i = 0; i < StrongShape::size(); ++i)
+ {
+ product *= StrongShape::get(i);
+ }
+ return product;
+ }
+
+ // Translates the given coordinates of the layout into a linear index, assuming
+ // dimensions are ordered in tensor access order, e.g. when accessing
+ // foobar[i][j][k] the coordinates should be passed as i, j, k.
+ int64_t LinearIndex(const std::array<int32_t, StrongShape::size()> &coordinates) const
+ {
+ int64_t index = coordinates[0];
+ for (int i = 1; i < StrongShape::size(); ++i)
+ {
+ index = index * StrongShape::get(i) + coordinates[i];
+ }
+ return index;
+ }
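+ // For example, for an HWC tensor with h = 2, w = 3, c = 4,
+ // LinearIndex({i, j, k}) == (i * 3 + j) * 4 + k.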
+
+ // Copies all dimensions from the given generic shape into this specific shape.
+ // It requires the generic shape to define every axis present in this
+ // StrongShape. For example:
+ //   - If this shape is OHWI and the given shape is OIHW, Adopt copies all
+ //     dimensions and returns true.
+ //   - If this shape is HW and the given shape is OIHW, Adopt copies the H and W
+ //     dimensions and returns true; but if this shape is OIHW and the given
+ //     shape is HW, Adopt returns false because not all axes are present in the
+ //     input shape.
+ //
+ // @return false if the generic shape is not compatible.
+ bool Adopt(const Shape &shape)
+ {
+ return DispatchByLayout(shape.layout, internal_shape::ToShapeFunc<L>{this, shape});
+ }
+
+ // For every axis defined in the given shape, copies its value into this shape.
+ // Therefore, it is possible to copy dimensions from CHW to BCHW, but not
+ // the other way around.
+ //
+ // BCHW bchw;
+ // CHW chw;
+ // bchw.CopyAllGivenAxis(chw); --> true
+ // chw.CopyAllGivenAxis(bchw); --> false
+ //
+ // @return false if an axis in the source shape is not defined here, and thus
+ // its value was not copied.
+ template <Layout B> bool CopyAllGivenAxis(const StrongShape<B> &source)
+ {
+ for (int i = 0; i < source.size(); ++i)
+ {
+ if (!StrongShape::set(source.axis(i), source.get(i)))
+ {
+ return false;
+ }
+ }
+ return true;
+ }
+
+ // For every axis defined in this shape, copies its value from the given shape.
+ //
+ // BCHW bchw;
+ // CHW chw;
+ // bchw.CopyAllDefinedAxis(chw); --> false
+ // chw.CopyAllDefinedAxis(bchw); --> true
+ //
+ // @return false if the given shape does not define an axis that is defined
+ // here, in which case that value was not copied.
+ template <Layout B> bool CopyAllDefinedAxis(const StrongShape<B> &source)
+ {
+ for (int i = 0; i < StrongShape::size(); ++i)
+ {
+ int source_index = source.index(StrongShape::axis(i));
+ if (source_index < 0)
+ {
+ return false;
+ }
+ StrongShape::set(i, source.get(source_index)); // always true
+ }
+ return true;
+ }
+
+ // Copies values only for matching axes.
+ template <Layout B> void CopyMatchingAxis(const StrongShape<B> &source)
+ {
+ for (int i = 0; i < StrongShape::size(); ++i)
+ {
+ StrongShape::set(source.axis(i), source.get(i));
+ }
+ }
+
+ // AbslHash function for use in flat hash containers.
+ template <typename H> friend H AbslHashValue(H hash_state, const StrongShape &strong_shape)
+ {
+ for (size_t i = 0; i < strong_shape.size(); ++i)
+ {
+ hash_state = H::combine(std::move(hash_state), strong_shape.get(i));
+ }
+ return hash_state;
+ }
+};
+
+template <Layout T> inline std::string ToString(const StrongShape<T> &s)
+{
+ return ToString(s.ToShape());
+}
+
+template <Layout L> constexpr Layout StrongShape<L>::layout;
+
+template <class F>
+auto DispatchByLayout(Layout type, F f) -> decltype(f.template operator()<Layout::UNKNOWN>())
+{
+ switch (type)
+ {
+ case Layout::HW:
+ return f.template operator()<Layout::HW>();
+ case Layout::HWD:
+ return f.template operator()<Layout::HWD>();
+ case Layout::HWC:
+ return f.template operator()<Layout::HWC>();
+ case Layout::HWDC:
+ return f.template operator()<Layout::HWDC>();
+ case Layout::CHW:
+ return f.template operator()<Layout::CHW>();
+ case Layout::OIHW:
+ return f.template operator()<Layout::OIHW>();
+ case Layout::IOHW:
+ return f.template operator()<Layout::IOHW>();
+ case Layout::OHWI:
+ return f.template operator()<Layout::OHWI>();
+ case Layout::IHWO:
+ return f.template operator()<Layout::IHWO>();
+ case Layout::LINEAR:
+ return f.template operator()<Layout::LINEAR>();
+ case Layout::SCALAR:
+ return f.template operator()<Layout::SCALAR>();
+ case Layout::BHWC:
+ return f.template operator()<Layout::BHWC>();
+ case Layout::BHWDC:
+ return f.template operator()<Layout::BHWDC>();
+ case Layout::OHWDI:
+ return f.template operator()<Layout::OHWDI>();
+ case Layout::UNKNOWN:
+ return f.template operator()<Layout::UNKNOWN>();
+ }
+ return f.template operator()<Layout::UNKNOWN>();
+}
+
+template <Layout T> constexpr int Size() { return StrongShape<T>::size(); }
+
+template <Layout T> constexpr Axis GetAxis(int index) { return StrongShape<T>::axis(index); }
+
+template <Layout T> constexpr int GetAxisIndex(Axis axis) { return StrongShape<T>::index(axis); }
+
+template <Layout T> constexpr bool HasAxis(Axis axis) { return StrongShape<T>::has(axis); }
+
+template <Axis D> inline int32_t Shape::get() const
+{
+ return DispatchByLayout(layout, internal_shape::DimensionGetterFixedAxisFunc<D>{this});
+}
+
+inline int32_t Shape::get(Axis axis) const
+{
+ return DispatchByLayout(layout, internal_shape::DimensionGetterFunc{axis, this});
+}
+
+template <Axis D> inline bool Shape::set(int32_t t)
+{
+ return DispatchByLayout(layout, internal_shape::DimensionSetterFixedAxisFunc<D>{this, t});
+}
+
+inline bool Shape::set(Axis axis, int32_t t)
+{
+ return DispatchByLayout(layout, internal_shape::DimensionSetterFunc{axis, this, t});
+}
+
+} // namespace gpu_cl
+} // namespace backend
+} // namespace onert
+
+#endif // __ONERT_BACKEND_GPU_CL_OPENCL_SHAPE_H__
diff --git a/runtime/onert/backend/gpu_cl/open_cl/Spi.h b/runtime/onert/backend/gpu_cl/open_cl/Spi.h
new file mode 100644
index 000000000..c1d65b67e
--- /dev/null
+++ b/runtime/onert/backend/gpu_cl/open_cl/Spi.h
@@ -0,0 +1,94 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __ONERT_BACKEND_GPU_CL_OPEN_CL_SPI_H__
+#define __ONERT_BACKEND_GPU_CL_OPEN_CL_SPI_H__
+
+#include <cstdint>
+
+#include "Api.h"
+#include "AccessType.h"
+#include "Status.h"
+
+// Contains only service provider-related interfaces. Users should not use them
+// directly.
+
+namespace onert
+{
+namespace backend
+{
+namespace gpu_cl
+{
+
+// Converts a tensor object into another one.
+class TensorObjectConverter
+{
+public:
+ virtual ~TensorObjectConverter() = default;
+
+ virtual absl::Status Convert(const TensorObject &input, const TensorObject &output) = 0;
+};
+
+class TensorObjectConverterBuilder
+{
+public:
+ virtual ~TensorObjectConverterBuilder() = default;
+
+ virtual bool IsSupported(const TensorObjectDef &input, const TensorObjectDef &output) const = 0;
+
+ virtual absl::Status MakeConverter(const TensorObjectDef &input, const TensorObjectDef &output,
+ std::unique_ptr<TensorObjectConverter> *converter) = 0;
+};
+
+// Connects tensor definition provided by a user (external) with tensor
+// definition used by the inference engine (internal).
+struct TensorTieDef
+{
+ uint32_t id;
+ AccessType access_type;
+ TensorObjectDef internal_def;
+ TensorObjectDef external_def;
+};
+
+// Connects external tensor object to internal tensor object and provides
+// functionality to copy data to/from external object to internal.
+class TensorTie
+{
+public:
+ explicit TensorTie(const TensorTieDef &def) : def_(def) {}
+
+ virtual ~TensorTie() = default;
+
+ virtual absl::Status SetExternalObject(TensorObject obj) = 0;
+
+ virtual TensorObject GetExternalObject() = 0;
+
+ virtual absl::Status CopyToExternalObject() = 0;
+
+ virtual absl::Status CopyFromExternalObject() = 0;
+
+ const TensorTieDef &def() const { return def_; }
+
+private:
+ const TensorTieDef def_;
+};
+
+} // namespace gpu_cl
+} // namespace backend
+} // namespace onert
+
+#endif // __ONERT_BACKEND_GPU_CL_OPEN_CL_SPI_H__
diff --git a/runtime/onert/backend/gpu_cl/open_cl/Status.h b/runtime/onert/backend/gpu_cl/open_cl/Status.h
new file mode 100644
index 000000000..6295a7e77
--- /dev/null
+++ b/runtime/onert/backend/gpu_cl/open_cl/Status.h
@@ -0,0 +1,29 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __ONERT_BACKEND_GPU_CL_OPENCL_STATUS_H__
+#define __ONERT_BACKEND_GPU_CL_OPENCL_STATUS_H__
+
+#include "absl/status/status.h" // IWYU pragma: export
+#define RETURN_IF_ERROR(s) \
+ { \
+ auto c = (s); \
+ if (!c.ok()) \
+ return c; \
+ } // IWYU pragma: export
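+// Usage (illustrative): RETURN_IF_ERROR(SomeCallReturningAbslStatus());
+// propagates any non-OK absl::Status to the caller and continues otherwise.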
+
+#endif // __ONERT_BACKEND_GPU_CL_OPENCL_STATUS_H__
diff --git a/runtime/onert/backend/gpu_cl/open_cl/StorageTypeUtil.cc b/runtime/onert/backend/gpu_cl/open_cl/StorageTypeUtil.cc
new file mode 100644
index 000000000..eada697ac
--- /dev/null
+++ b/runtime/onert/backend/gpu_cl/open_cl/StorageTypeUtil.cc
@@ -0,0 +1,149 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "StorageTypeUtil.h"
+
+#include "TensorType.h"
+#include "DataType.h"
+#include "Shape.h"
+#include "Util.h"
+
+namespace onert
+{
+namespace backend
+{
+namespace gpu_cl
+{
+
+bool CanCreateTensorWithShape(const DeviceInfo &device_info, const BHWDC &shape,
+ const TensorDescriptor &descriptor)
+{
+ const int slices = DivideRoundUp(shape.c, 4);
+ switch (descriptor.storage_type)
+ {
+ case TensorStorageType::BUFFER:
+ {
+ const uint64_t flt4_size = 4 * (descriptor.data_type == DataType::FLOAT32 ? 4 : 2);
+ const uint64_t buffer_size = shape.b * shape.w * shape.h * shape.d * slices * flt4_size;
+ return buffer_size <= device_info.buffer_max_size;
+ }
+ case TensorStorageType::IMAGE_BUFFER:
+ return (uint64_t)shape.b * shape.w * shape.h * shape.d * slices <=
+ device_info.image_buffer_max_size;
+ case TensorStorageType::TEXTURE_3D:
+ if (device_info.cl_version < OpenCLVersion::CL_1_2 && slices == 1)
+ {
+ // clCreateImage3D (used in CL 1.0/1.1) cannot create an image with
+ // depth = 1 per the specification.
+ return false;
+ }
+ return (uint64_t)shape.w * shape.b <= device_info.image3d_max_width &&
+ (uint64_t)shape.h <= device_info.image3d_max_height &&
+ (uint64_t)slices * shape.d <= device_info.image3d_max_depth;
+ case TensorStorageType::TEXTURE_ARRAY:
+ // Bug on some Adreno. b/131099086
+ if (slices == 1 && !device_info.SupportsOneLayerTextureArray())
+ {
+ return false;
+ }
+ return (uint64_t)shape.w * shape.b <= device_info.image2d_max_width &&
+ (uint64_t)shape.h <= device_info.image2d_max_height &&
+ (uint64_t)slices * shape.d <= device_info.image_array_max_layers;
+ case TensorStorageType::TEXTURE_2D:
+ return (uint64_t)shape.w * shape.b * shape.d <= device_info.image2d_max_width &&
+ (uint64_t)shape.h * slices <= device_info.image2d_max_height;
+ case TensorStorageType::SINGLE_TEXTURE_2D:
+ return (uint64_t)shape.c <= 4 &&
+ device_info.SupportsFloatImage2D(descriptor.data_type, shape.c) &&
+ (uint64_t)shape.w * shape.b * shape.d <= device_info.image2d_max_width &&
+ (uint64_t)shape.h <= device_info.image2d_max_height;
+ default:
+ return false;
+ }
+}
+
+bool CanCreateTensorWithShape(const DeviceInfo &device_info, const BHWC &shape,
+ const TensorDescriptor &descriptor)
+{
+ const BHWDC shape5D(shape.b, shape.h, shape.w, 1, shape.c);
+ return CanCreateTensorWithShape(device_info, shape5D, descriptor);
+}
+
+TensorStorageType SelectBestStorageType(const DeviceInfo &device_info, const BHWC &shape,
+ const TensorStorageType &desired, const DataType &data_type,
+ const Layout &layout)
+{
+ if (CanCreateTensorWithShape(device_info, shape, TensorDescriptor{data_type, desired, layout}))
+ {
+ return desired;
+ }
+ auto GetBestTypeAfterTextureArray = [&]() {
+ if (device_info.SupportsImageBuffer() &&
+ CanCreateTensorWithShape(
+ device_info, shape, TensorDescriptor{data_type, TensorStorageType::IMAGE_BUFFER, layout}))
+ {
+ return TensorStorageType::IMAGE_BUFFER;
+ }
+ else
+ {
+ return TensorStorageType::BUFFER;
+ }
+ };
+ auto GetBestTypeAfterTexture2D = [&]() {
+ if (device_info.SupportsTextureArray() &&
+ CanCreateTensorWithShape(
+ device_info, shape,
+ TensorDescriptor{data_type, TensorStorageType::TEXTURE_ARRAY, layout}))
+ {
+ return TensorStorageType::TEXTURE_ARRAY;
+ }
+ else
+ {
+ return GetBestTypeAfterTextureArray();
+ }
+ };
+ auto GetBestTypeAfterTexture3D = [&]() {
+ if (CanCreateTensorWithShape(
+ device_info, shape, TensorDescriptor{data_type, TensorStorageType::TEXTURE_2D, layout}))
+ {
+ return TensorStorageType::TEXTURE_2D;
+ }
+ else
+ {
+ return GetBestTypeAfterTexture2D();
+ }
+ };
+ switch (desired)
+ {
+ case TensorStorageType::TEXTURE_2D:
+ case TensorStorageType::SINGLE_TEXTURE_2D:
+ return GetBestTypeAfterTexture2D();
+ case TensorStorageType::TEXTURE_ARRAY:
+ return GetBestTypeAfterTextureArray();
+ case TensorStorageType::TEXTURE_3D:
+ return GetBestTypeAfterTexture3D();
+ case TensorStorageType::IMAGE_BUFFER:
+ case TensorStorageType::BUFFER:
+ return TensorStorageType::BUFFER;
+ default:
+ return TensorStorageType::BUFFER;
+ }
+}
+
+} // namespace gpu_cl
+} // namespace backend
+} // namespace onert
diff --git a/runtime/onert/backend/gpu_cl/open_cl/StorageTypeUtil.h b/runtime/onert/backend/gpu_cl/open_cl/StorageTypeUtil.h
new file mode 100644
index 000000000..a84c3865f
--- /dev/null
+++ b/runtime/onert/backend/gpu_cl/open_cl/StorageTypeUtil.h
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __ONERT_BACKEND_GPU_CL_OPENCL_STORAGE_TYPE_UTIL_H__
+#define __ONERT_BACKEND_GPU_CL_OPENCL_STORAGE_TYPE_UTIL_H__
+
+#include "DeviceInfo.h"
+#include "TensorType.h"
+#include "DataType.h"
+#include "Shape.h"
+
+namespace onert
+{
+namespace backend
+{
+namespace gpu_cl
+{
+
+bool CanCreateTensorWithShape(const DeviceInfo &device_info, const BHWDC &shape,
+ const TensorDescriptor &descriptor);
+
+bool CanCreateTensorWithShape(const DeviceInfo &device_info, const BHWC &shape,
+ const TensorDescriptor &descriptor);
+
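+// Returns `desired` if a tensor with the given shape fits the device limits;
+// otherwise falls back along TEXTURE_3D -> TEXTURE_2D -> TEXTURE_ARRAY ->
+// IMAGE_BUFFER -> BUFFER until a storage type that fits (and is supported by the
+// device) is found.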
+TensorStorageType SelectBestStorageType(const DeviceInfo &device_info, const BHWC &shape,
+ const TensorStorageType &desired, const DataType &data_type,
+ const Layout &layout);
+
+} // namespace gpu_cl
+} // namespace backend
+} // namespace onert
+
+#endif // __ONERT_BACKEND_GPU_CL_OPENCL_STORAGE_TYPE_UTIL_H__
diff --git a/runtime/onert/backend/gpu_cl/open_cl/Tensor.cc b/runtime/onert/backend/gpu_cl/open_cl/Tensor.cc
new file mode 100644
index 000000000..983e0d29d
--- /dev/null
+++ b/runtime/onert/backend/gpu_cl/open_cl/Tensor.cc
@@ -0,0 +1,690 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Tensor.h"
+
+#include <cstring>
+#include <vector>
+
+#include "absl/strings/str_cat.h"
+
+#include "Buffer.h"
+#include "ClImageFormat.h"
+#include "ClMemory.h"
+#include "GpuObject.h"
+#include "TensorType.h"
+#include "InternalTensor.h"
+#include "DataType.h"
+#include "Status.h"
+
+namespace onert
+{
+namespace backend
+{
+namespace gpu_cl
+{
+namespace
+{
+
+absl::Status AllocateTensorMemory(const CLContext &context, const BHWDC &shape,
+ const TensorDescriptor &descriptor, const void *data_ptr,
+ CLMemory *result)
+{
+ const int slices = DivideRoundUp(shape.c, 4);
+ cl_mem_flags mem_flags = CL_MEM_READ_WRITE;
+ if (data_ptr)
+ {
+ mem_flags |= CL_MEM_COPY_HOST_PTR;
+ }
+ switch (descriptor.storage_type)
+ {
+ case TensorStorageType::BUFFER:
+ case TensorStorageType::IMAGE_BUFFER:
+ {
+ const size_t data_size =
+ shape.b * shape.w * shape.h * shape.d * slices * 4 * SizeOf(descriptor.data_type);
+ cl_int error_code;
+ cl_mem memory = clCreateBuffer(context.context(), mem_flags, data_size,
+ const_cast<void *>(data_ptr), &error_code);
+ if (!memory)
+ {
+ return absl::UnknownError(absl::StrCat(
+ "Failed to allocate device memory (clCreateBuffer): ", CLErrorCodeToString(error_code)));
+ }
+ *result = CLMemory(memory, true);
+ return absl::OkStatus();
+ }
+ case TensorStorageType::TEXTURE_2D:
+ {
+ cl_image_desc desc;
+ desc.image_type = CL_MEM_OBJECT_IMAGE2D;
+ desc.image_width = shape.w * shape.b * shape.d;
+ desc.image_height = shape.h * slices;
+ desc.image_depth = 0;
+ desc.image_row_pitch = 0;
+ desc.image_slice_pitch = 0;
+ desc.num_mip_levels = 0;
+ desc.num_samples = 0;
+ desc.buffer = nullptr;
+
+ cl_image_format format;
+ format.image_channel_order = CL_RGBA;
+ format.image_channel_data_type = ToImageChannelType(descriptor.data_type);
+
+ cl_int error_code;
+ cl_mem memory = CreateImage2DLegacy(context.context(), mem_flags, &format, &desc,
+ const_cast<void *>(data_ptr), &error_code);
+ if (error_code != CL_SUCCESS)
+ {
+ return absl::UnknownError(absl::StrCat("Failed to create 2D texture (clCreateImage): ",
+ CLErrorCodeToString(error_code)));
+ }
+
+ *result = CLMemory(memory, true);
+ return absl::OkStatus();
+ }
+ case TensorStorageType::TEXTURE_3D:
+ {
+ cl_image_desc desc;
+ desc.image_type = CL_MEM_OBJECT_IMAGE3D;
+ desc.image_width = shape.w * shape.b;
+ desc.image_height = shape.h;
+ desc.image_depth = slices * shape.d;
+ desc.image_row_pitch = 0;
+ desc.image_slice_pitch = 0;
+ desc.num_mip_levels = 0;
+ desc.num_samples = 0;
+ desc.buffer = nullptr;
+
+ cl_image_format format;
+ format.image_channel_order = CL_RGBA;
+ format.image_channel_data_type = ToImageChannelType(descriptor.data_type);
+
+ cl_int error_code;
+ cl_mem memory = CreateImage3DLegacy(context.context(), mem_flags, &format, &desc,
+ const_cast<void *>(data_ptr), &error_code);
+ if (error_code != CL_SUCCESS)
+ {
+ return absl::UnknownError(absl::StrCat("Failed to create 3D texture (clCreateImage): ",
+ CLErrorCodeToString(error_code)));
+ }
+
+ *result = CLMemory(memory, true);
+ return absl::OkStatus();
+ }
+ case TensorStorageType::TEXTURE_ARRAY:
+ {
+ cl_image_desc desc;
+ desc.image_type = CL_MEM_OBJECT_IMAGE2D_ARRAY;
+ desc.image_width = shape.w * shape.b;
+ desc.image_height = shape.h;
+ desc.image_depth = 0;
+ desc.image_array_size = slices * shape.d;
+ desc.image_row_pitch = 0;
+ desc.image_slice_pitch = 0;
+ desc.num_mip_levels = 0;
+ desc.num_samples = 0;
+ desc.buffer = nullptr;
+
+ cl_image_format format;
+ format.image_channel_order = CL_RGBA;
+ format.image_channel_data_type = ToImageChannelType(descriptor.data_type);
+
+ cl_int error_code;
+ cl_mem memory = clCreateImage(context.context(), mem_flags, &format, &desc,
+ const_cast<void *>(data_ptr), &error_code);
+ if (error_code != CL_SUCCESS)
+ {
+ return absl::UnknownError(absl::StrCat(
+ "Failed to create 2D texture array (clCreateImage): ", CLErrorCodeToString(error_code)));
+ }
+
+ *result = CLMemory(memory, true);
+ return absl::OkStatus();
+ }
+
+ case TensorStorageType::SINGLE_TEXTURE_2D:
+ {
+ if (slices != 1)
+ {
+ return absl::InvalidArgumentError(absl::StrCat(
+ "SINGLE_TEXTURE_2D supports only channels in range [1-4], but ", shape.c, " was provided"));
+ }
+ cl_image_desc desc;
+ desc.image_type = CL_MEM_OBJECT_IMAGE2D;
+ desc.image_width = shape.w * shape.b * shape.d;
+ desc.image_height = shape.h;
+ desc.image_depth = 0;
+ desc.image_row_pitch = 0;
+ desc.image_slice_pitch = 0;
+ desc.num_mip_levels = 0;
+ desc.num_samples = 0;
+ desc.buffer = nullptr;
+
+ cl_image_format format;
+ if (context.IsFloatTexture2DSupported(shape.c, descriptor.data_type))
+ {
+ format.image_channel_order = ToChannelOrder(shape.c);
+ format.image_channel_data_type = ToImageChannelType(descriptor.data_type);
+ }
+ else
+ {
+ return absl::InvalidArgumentError(
+ absl::StrCat("This device doesn't support ", shape.c, "-channel textures."));
+ }
+
+ cl_int error_code;
+ cl_mem memory = CreateImage2DLegacy(context.context(), mem_flags, &format, &desc,
+ const_cast<void *>(data_ptr), &error_code);
+ if (error_code != CL_SUCCESS)
+ {
+ return absl::UnknownError(absl::StrCat(
+ "Failed to create single 2D texture (clCreateImage): ", CLErrorCodeToString(error_code)));
+ }
+
+ *result = CLMemory(memory, true);
+ return absl::OkStatus();
+ }
+
+ default:
+ return absl::InternalError("Unsupported tensor storage type");
+ }
+}
+
+absl::Status CreateImageBufferFromBuffer(const CLContext &context, cl_mem memory,
+ DataType data_type, int width, cl_mem *result)
+{
+ cl_image_format format;
+ cl_image_desc desc;
+ std::memset(&desc, 0, sizeof(desc));
+ desc.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
+ desc.image_width = width;
+ desc.mem_object = memory;
+
+ format.image_channel_data_type = ToImageChannelType(data_type);
+ format.image_channel_order = CL_RGBA;
+
+ cl_int error_code;
+ *result =
+ clCreateImage(context.context(), CL_MEM_READ_WRITE, &format, &desc, nullptr, &error_code);
+ if (error_code != CL_SUCCESS)
+ {
+ return absl::UnknownError(absl::StrCat("Failed to create Image from Buffer (clCreateImage): ",
+ CLErrorCodeToString(error_code)));
+ }
+ return absl::OkStatus();
+}
+
+absl::Status CreateTensor(const CLContext &context, const BHWDC &shape,
+ const TensorDescriptor &descriptor, cl_mem memory, Tensor *result)
+{
+ const bool memory_owner = memory == nullptr;
+ if (memory_owner)
+ {
+ CLMemory mem;
+ RETURN_IF_ERROR(AllocateTensorMemory(context, shape, descriptor, nullptr, &mem));
+ memory = mem.Release();
+ }
+ if (descriptor.storage_type == TensorStorageType::IMAGE_BUFFER)
+ {
+ cl_mem image_memory;
+ RETURN_IF_ERROR(CreateImageBufferFromBuffer(
+ context, memory, descriptor.data_type,
+ shape.b * shape.w * shape.h * shape.d * DivideRoundUp(shape.c, 4), &image_memory));
+ *result = Tensor(memory, memory_owner, image_memory, shape, descriptor);
+ }
+ else
+ {
+ *result = Tensor(memory, memory_owner, shape, descriptor);
+ }
+ return absl::OkStatus();
+}
+
+absl::Status CreateTensorShared(const CLContext &context, const BHWDC &shape,
+ const TensorDescriptor &descriptor, cl_mem memory, Tensor *result)
+{
+ const bool memory_owner = false;
+ if (descriptor.storage_type == TensorStorageType::IMAGE_BUFFER)
+ {
+ cl_mem image_memory;
+ RETURN_IF_ERROR(CreateImageBufferFromBuffer(
+ context, memory, descriptor.data_type,
+ shape.b * shape.w * shape.h * shape.d * DivideRoundUp(shape.c, 4), &image_memory));
+ *result = Tensor(memory, memory_owner, image_memory, shape, descriptor);
+ }
+ else
+ {
+ *result = Tensor(memory, memory_owner, shape, descriptor);
+ }
+ return absl::OkStatus();
+}
+
+} // namespace
+
+absl::Status TensorDescriptor::CreateGPUObject(CLContext *context, GPUObjectPtr *result) const
+{
+ Tensor gpu_tensor;
+ RETURN_IF_ERROR(gpu_tensor.CreateFromDescriptor(*this, context));
+ *result = absl::make_unique<Tensor>(std::move(gpu_tensor));
+ return absl::OkStatus();
+}
+
+Tensor::Tensor(cl_mem memory, bool memory_owner, const BHWC &shape,
+ const TensorDescriptor &descriptor)
+ : memory_(memory), image_buffer_memory_(nullptr), memory_owner_(memory_owner),
+ shape_(shape.b, shape.h, shape.w, 1, shape.c), descriptor_(descriptor)
+{
+}
+
+Tensor::Tensor(cl_mem memory, bool memory_owner, const BHWDC &shape,
+ const TensorDescriptor &descriptor)
+ : memory_(memory), image_buffer_memory_(nullptr), memory_owner_(memory_owner), shape_(shape),
+ descriptor_(descriptor)
+{
+}
+
+Tensor::Tensor(cl_mem memory, bool memory_owner, cl_mem image_buffer_memory, const BHWC &shape,
+ const TensorDescriptor &descriptor)
+ : memory_(memory), image_buffer_memory_(image_buffer_memory), memory_owner_(memory_owner),
+ shape_(shape.b, shape.h, shape.w, 1, shape.c), descriptor_(descriptor)
+{
+}
+
+Tensor::Tensor(cl_mem memory, bool memory_owner, cl_mem image_buffer_memory, const BHWDC &shape,
+ const TensorDescriptor &descriptor)
+ : memory_(memory), image_buffer_memory_(image_buffer_memory), memory_owner_(memory_owner),
+ shape_(shape), descriptor_(descriptor)
+{
+}
+
+Tensor::Tensor(Tensor &&tensor)
+ : memory_(tensor.memory_), image_buffer_memory_(tensor.image_buffer_memory_),
+ memory_owner_(tensor.memory_owner_), shape_(tensor.shape_), descriptor_(tensor.descriptor_)
+{
+ tensor.memory_ = nullptr;
+ tensor.image_buffer_memory_ = nullptr;
+}
+
+Tensor &Tensor::operator=(Tensor &&tensor)
+{
+ if (this != &tensor)
+ {
+ Release();
+ std::swap(memory_, tensor.memory_);
+ std::swap(image_buffer_memory_, tensor.image_buffer_memory_);
+ std::swap(memory_owner_, tensor.memory_owner_);
+ std::swap(shape_, tensor.shape_);
+ std::swap(descriptor_, tensor.descriptor_);
+ }
+ return *this;
+}
+
+void Tensor::Release()
+{
+ // image_buffer_memory_ is always owned by this object.
+ if (image_buffer_memory_)
+ {
+ clReleaseMemObject(image_buffer_memory_);
+ image_buffer_memory_ = nullptr;
+ }
+ if (memory_owner_ && memory_)
+ {
+ clReleaseMemObject(memory_);
+ memory_ = nullptr;
+ }
+}
+
+absl::Status Tensor::GetGPUResources(const GPUObjectDescriptor *obj_ptr,
+ GPUResourcesWithValue *resources) const
+{
+ const auto *buffer_desc = dynamic_cast<const BufferDescriptor *>(obj_ptr);
+ if (buffer_desc)
+ {
+ if (descriptor_.storage_type != TensorStorageType::BUFFER)
+ {
+ return absl::InvalidArgumentError("Tensor can be used with BufferDescriptor only with "
+ "TensorStorageType::BUFFER.");
+ }
+ resources->buffers.push_back({"buffer", memory_});
+ return absl::OkStatus();
+ }
+ const auto *tensor_desc = dynamic_cast<const TensorDescriptor *>(obj_ptr);
+ if (!tensor_desc)
+ {
+ return absl::InvalidArgumentError("Expected TensorDescriptor on input.");
+ }
+ if (descriptor_.HasAxis(Axis::WIDTH))
+ {
+ resources->ints.push_back({"width", Width()});
+ resources->ints.push_back({"width_div2", Width() / 2});
+ resources->ints.push_back({"width_div4", Width() / 4});
+ resources->ints.push_back({"width_batched", Width() * Batch()});
+ resources->ints.push_back({"width_batched_div2", Width() * Batch() / 2});
+ resources->ints.push_back({"width_batched_div4", Width() * Batch() / 4});
+ }
+ if (descriptor_.HasAxis(Axis::HEIGHT))
+ {
+ resources->ints.push_back({"height", Height()});
+ }
+ if (descriptor_.HasAxis(Axis::CHANNELS))
+ {
+ resources->ints.push_back({"slices", Slices()});
+ resources->ints.push_back({"channels", Channels()});
+ }
+ if (descriptor_.HasAxis(Axis::BATCH))
+ {
+ resources->ints.push_back({"batch", Batch()});
+ }
+ if (descriptor_.HasAxis(Axis::DEPTH))
+ {
+ resources->ints.push_back({"depth", Depth()});
+ }
+
+ if (descriptor_.storage_type == TensorStorageType::BUFFER)
+ {
+ resources->buffers.push_back({"buffer", memory_});
+ }
+ else if (descriptor_.storage_type == TensorStorageType::TEXTURE_2D ||
+ descriptor_.storage_type == TensorStorageType::SINGLE_TEXTURE_2D)
+ {
+ resources->images2d.push_back({"image2d", memory_});
+ }
+ else if (descriptor_.storage_type == TensorStorageType::TEXTURE_ARRAY)
+ {
+ resources->image2d_arrays.push_back({"image2d_array", memory_});
+ }
+ else if (descriptor_.storage_type == TensorStorageType::TEXTURE_3D)
+ {
+ resources->images3d.push_back({"image3d", memory_});
+ }
+ else if (descriptor_.storage_type == TensorStorageType::IMAGE_BUFFER)
+ {
+ if (obj_ptr->GetAccess() == AccessType::READ)
+ {
+ resources->image_buffers.push_back({"image_buffer", image_buffer_memory_});
+ }
+ else
+ {
+ resources->buffers.push_back({"buffer", memory_});
+ }
+ }
+
+ return absl::OkStatus();
+}
+
+int3 Tensor::GetFullTensorRegion() const
+{
+ switch (descriptor_.storage_type)
+ {
+ case TensorStorageType::BUFFER:
+ case TensorStorageType::TEXTURE_ARRAY:
+ case TensorStorageType::TEXTURE_3D:
+ case TensorStorageType::IMAGE_BUFFER:
+ return {shape_.w * shape_.b, shape_.h, shape_.d * Slices()};
+ case TensorStorageType::TEXTURE_2D:
+ return {shape_.w * shape_.b * shape_.d, shape_.h * Slices(), 1};
+ case TensorStorageType::SINGLE_TEXTURE_2D:
+ return {shape_.w * shape_.b * shape_.d, shape_.h, 1};
+ case TensorStorageType::UNKNOWN:
+ return {-1, -1, -1};
+ }
+ return {-1, -1, -1};
+}
+
+absl::Status Tensor::IsValid(const BHWC &shape) const
+{
+ if (shape.b != shape_.b)
+ {
+ return absl::InvalidArgumentError("Shape batch does not match tensor batch");
+ }
+ if (shape.w != shape_.w)
+ {
+ return absl::InvalidArgumentError("Shape width does not match tensor width");
+ }
+ if (shape.h != shape_.h)
+ {
+ return absl::InvalidArgumentError("Shape height does not match tensor height");
+ }
+ if (shape.c != shape_.c)
+ {
+ return absl::InvalidArgumentError("Shape channels does not match tensor channels");
+ }
+ return absl::OkStatus();
+}
+
+absl::Status Tensor::IsValid(const BHWDC &shape) const
+{
+ if (shape.b != shape_.b)
+ {
+ return absl::InvalidArgumentError("Shape batch does not match tensor batch");
+ }
+ if (shape.w != shape_.w)
+ {
+ return absl::InvalidArgumentError("Shape width does not match tensor width");
+ }
+ if (shape.h != shape_.h)
+ {
+ return absl::InvalidArgumentError("Shape height does not match tensor height");
+ }
+ if (shape.d != shape_.d)
+ {
+ return absl::InvalidArgumentError("Shape depth does not match tensor depth");
+ }
+ if (shape.c != shape_.c)
+ {
+ return absl::InvalidArgumentError("Shape channels does not match tensor channels");
+ }
+ return absl::OkStatus();
+}
+
+int Tensor::GetAlignedChannels() const
+{
+ return descriptor_.storage_type == TensorStorageType::SINGLE_TEXTURE_2D ? shape_.c
+ : AlignByN(shape_.c, 4);
+}
+
+uint64_t Tensor::GetMemorySizeInBytes() const
+{
+ const uint64_t flt_size = static_cast<uint64_t>(SizeOf(descriptor_.data_type));
+ const uint64_t flt4_size = 4 * flt_size;
+ switch (descriptor_.storage_type)
+ {
+ case TensorStorageType::BUFFER:
+ case TensorStorageType::IMAGE_BUFFER:
+ case TensorStorageType::TEXTURE_ARRAY:
+ case TensorStorageType::TEXTURE_2D:
+ case TensorStorageType::TEXTURE_3D:
+ return flt4_size * shape_.b * shape_.w * shape_.h * shape_.d * Slices();
+ case TensorStorageType::SINGLE_TEXTURE_2D:
+ return flt_size * shape_.w * shape_.h * shape_.c * shape_.b * shape_.d;
+ default:
+ return 0;
+ }
+}
+
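+// For IMAGE_BUFFER storage the tensor keeps two handles: memory_ (the plain
+// buffer that owns the data) and image_buffer_memory_ (an image view created
+// over that buffer). GetMemoryPtr() hands out the image view, which kernels
+// read through, while GetMemoryPtrForWriting() and the host transfer paths use
+// the underlying buffer; this mirrors the binding logic in GetGPUResources().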
+cl_mem Tensor::GetMemoryPtr() const
+{
+ return descriptor_.storage_type == TensorStorageType::IMAGE_BUFFER ? image_buffer_memory_
+ : memory_;
+}
+
+cl_mem Tensor::GetMemoryPtrForWriting() const { return memory_; }
+
+absl::Status Tensor::WriteDataBHWDC(absl::Span<const float> in, CLCommandQueue *queue)
+{
+ void *data_ptr = nullptr;
+ const int aligned_channels = GetAlignedChannels();
+ const int elements_count = shape_.b * shape_.w * shape_.h * shape_.d * aligned_channels;
+
+ const size_t data_size = elements_count * SizeOf(descriptor_.data_type);
+ std::vector<float> data_f;
+ data_f.resize(elements_count);
+ data_ptr = data_f.data();
+ DataFromBHWDC(in, shape_, descriptor_, absl::MakeSpan(data_f.data(), data_f.size()));
+
+ switch (descriptor_.storage_type)
+ {
+ case TensorStorageType::BUFFER:
+ case TensorStorageType::IMAGE_BUFFER:
+ RETURN_IF_ERROR(queue->EnqueueWriteBuffer(memory_, data_size, data_ptr));
+ break;
+ case TensorStorageType::TEXTURE_ARRAY:
+ case TensorStorageType::TEXTURE_2D:
+ case TensorStorageType::TEXTURE_3D:
+ case TensorStorageType::SINGLE_TEXTURE_2D:
+ RETURN_IF_ERROR(queue->EnqueueWriteImage(memory_, GetFullTensorRegion(), data_ptr));
+ break;
+ default:
+ return absl::InternalError("Unsupported tensor storage type");
+ }
+
+ return absl::OkStatus();
+}
+
+absl::Status Tensor::WriteData(CLCommandQueue *queue, const TensorFloat32 &src)
+{
+ RETURN_IF_ERROR(IsValid(src.shape));
+ return WriteDataBHWDC(absl::MakeConstSpan(src.data), queue);
+}
+
+absl::Status Tensor::WriteData(CLCommandQueue *queue,
+ const InternalTensor<Linear, DataType::FLOAT32> &src)
+{
+ return WriteDataBHWDC(absl::MakeConstSpan(src.data), queue);
+}
+
+absl::Status Tensor::WriteData(CLCommandQueue *queue,
+ const InternalTensor<HWC, DataType::FLOAT32> &src)
+{
+ return WriteDataBHWDC(absl::MakeConstSpan(src.data), queue);
+}
+
+absl::Status Tensor::WriteData(CLCommandQueue *queue, const Tensor5DFloat32 &src)
+{
+ RETURN_IF_ERROR(IsValid(src.shape));
+ return WriteDataBHWDC(absl::MakeConstSpan(src.data), queue);
+}
+
+absl::Status Tensor::ReadDataBHWDC(absl::Span<float> out, CLCommandQueue *queue) const
+{
+ void *data_ptr = nullptr;
+ const int aligned_channels = GetAlignedChannels();
+ const int elements_count = shape_.b * shape_.w * shape_.h * shape_.d * aligned_channels;
+ const size_t data_size = elements_count * SizeOf(descriptor_.data_type);
+
+ std::vector<float> data_f;
+ data_f.resize(elements_count);
+ data_ptr = data_f.data();
+ switch (descriptor_.storage_type)
+ {
+ case TensorStorageType::BUFFER:
+ case TensorStorageType::IMAGE_BUFFER:
+ RETURN_IF_ERROR(queue->EnqueueReadBuffer(memory_, data_size, data_ptr));
+ break;
+ case TensorStorageType::TEXTURE_ARRAY:
+ case TensorStorageType::TEXTURE_2D:
+ case TensorStorageType::TEXTURE_3D:
+ case TensorStorageType::SINGLE_TEXTURE_2D:
+ RETURN_IF_ERROR(queue->EnqueueReadImage(memory_, GetFullTensorRegion(), data_ptr));
+ break;
+ default:
+ return absl::InternalError("Unsupported tensor storage type");
+ }
+
+ if (descriptor_.data_type == DataType::FLOAT32)
+ {
+ DataToBHWDC(absl::MakeConstSpan(data_f.data(), data_f.size()), shape_, descriptor_, out);
+ }
+
+ return absl::OkStatus();
+}
+
+absl::Status Tensor::ReadData(CLCommandQueue *queue, TensorFloat32 *dst) const
+{
+ RETURN_IF_ERROR(IsValid(dst->shape));
+ return ReadDataBHWDC(absl::MakeSpan(dst->data), queue);
+}
+
+absl::Status Tensor::ReadData(CLCommandQueue *queue, Tensor5DFloat32 *dst) const
+{
+ RETURN_IF_ERROR(IsValid(dst->shape));
+ return ReadDataBHWDC(absl::MakeSpan(dst->data), queue);
+}
+
+absl::Status Tensor::CreateFromDescriptor(const TensorDescriptor &desc, CLContext *context)
+{
+ shape_ = desc.shape;
+ descriptor_.data_type = desc.data_type;
+ descriptor_.storage_type = desc.storage_type;
+ descriptor_.layout = desc.layout;
+ memory_owner_ = true;
+ CLMemory memory;
+ uint8_t *data_ptr = desc.data.empty() ? nullptr : const_cast<unsigned char *>(desc.data.data());
+ RETURN_IF_ERROR(AllocateTensorMemory(*context, shape_, descriptor_, data_ptr, &memory));
+ memory_ = memory.Release();
+ if (desc.storage_type == TensorStorageType::IMAGE_BUFFER)
+ {
+ RETURN_IF_ERROR(CreateImageBufferFromBuffer(*context, memory_, desc.data_type,
+ shape_.b * shape_.w * shape_.h * shape_.d *
+ DivideRoundUp(shape_.c, 4),
+ &image_buffer_memory_));
+ }
+ return absl::OkStatus();
+}
+
+absl::Status CreateTensor(const CLContext &context, const BHWC &shape,
+ const TensorDescriptor &descriptor, Tensor *result)
+{
+ const BHWDC shape5D(shape.b, shape.h, shape.w, 1, shape.c);
+ return CreateTensor(context, shape5D, descriptor, nullptr, result);
+}
+
+absl::Status CreateTensor(const CLContext &context, const BHWDC &shape,
+ const TensorDescriptor &descriptor, Tensor *result)
+{
+ return CreateTensor(context, shape, descriptor, nullptr, result);
+}
+
+absl::Status CreateSharedTensor(const CLContext &context, cl_mem memory, const BHWC &shape,
+ const TensorDescriptor &descriptor, Tensor *result)
+{
+ const BHWDC shape5D(shape.b, shape.h, shape.w, 1, shape.c);
+ return CreateTensorShared(context, shape5D, descriptor, memory, result);
+}
+
+absl::Status CreateSharedTensor(const CLContext &context, cl_mem memory, const BHWDC &shape,
+ const TensorDescriptor &descriptor, Tensor *result)
+{
+ return CreateTensorShared(context, shape, descriptor, memory, result);
+}
+
+absl::Status AllocateTensorMemory(const CLContext &context, const BHWC &shape,
+ const TensorDescriptor &descriptor, CLMemory *result)
+{
+ const BHWDC shape5D(shape.b, shape.h, shape.w, 1, shape.c);
+ return AllocateTensorMemory(context, shape5D, descriptor, nullptr, result);
+}
+
+absl::Status AllocateTensorMemory(const CLContext &context, const BHWDC &shape,
+ const TensorDescriptor &descriptor, CLMemory *result)
+{
+ return AllocateTensorMemory(context, shape, descriptor, nullptr, result);
+}
+
+} // namespace gpu_cl
+} // namespace backend
+} // namespace onert
diff --git a/runtime/onert/backend/gpu_cl/open_cl/Tensor.h b/runtime/onert/backend/gpu_cl/open_cl/Tensor.h
new file mode 100644
index 000000000..b1930a423
--- /dev/null
+++ b/runtime/onert/backend/gpu_cl/open_cl/Tensor.h
@@ -0,0 +1,142 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __ONERT_BACKEND_GPU_CL_OPENCL_TENSOR_H__
+#define __ONERT_BACKEND_GPU_CL_OPENCL_TENSOR_H__
+
+#include <cstdint>
+#include <memory>
+
+#include "absl/types/span.h"
+#include "ClCommandQueue.h"
+#include "OpenclWrapper.h"
+#include "ClContext.h"
+#include "ClDevice.h"
+#include "ClMemory.h"
+#include "GpuObject.h"
+#include "TensorType.h"
+#include "Util.h"
+#include "DataType.h"
+#include "Shape.h"
+#include "Status.h"
+#include "InternalTensor.h"
+#include "Types.h"
+
+namespace onert
+{
+namespace backend
+{
+namespace gpu_cl
+{
+
+class Tensor : public GPUObject
+{
+public:
+ Tensor() : memory_(nullptr), image_buffer_memory_(nullptr), memory_owner_(true) {}
+ Tensor(cl_mem memory, bool memory_owner, const BHWC &shape, const TensorDescriptor &descriptor);
+ Tensor(cl_mem memory, bool memory_owner, const BHWDC &shape, const TensorDescriptor &descriptor);
+ Tensor(cl_mem memory, bool memory_owner, cl_mem image_buffer_memory, const BHWC &shape,
+ const TensorDescriptor &descriptor);
+ Tensor(cl_mem memory, bool memory_owner, cl_mem image_buffer_memory, const BHWDC &shape,
+ const TensorDescriptor &descriptor);
+
+ // Move only
+ Tensor(Tensor &&tensor);
+ Tensor &operator=(Tensor &&tensor);
+ Tensor(const Tensor &) = delete;
+ Tensor &operator=(const Tensor &) = delete;
+
+ virtual ~Tensor() { Release(); }
+
+ absl::Status GetGPUResources(const GPUObjectDescriptor *obj_ptr,
+ GPUResourcesWithValue *resources) const override;
+
+ int Width() const { return shape_.w; }
+ int Height() const { return shape_.h; }
+ int Depth() const { return shape_.d; }
+ int Channels() const { return shape_.c; }
+ int Slices() const { return DivideRoundUp(shape_.c, 4); }
+ int Batch() const { return shape_.b; }
+ TensorDescriptor GetDescriptor() const { return descriptor_; }
+ DataType GetDataType() const { return descriptor_.data_type; }
+ TensorStorageType GetStorageType() const { return descriptor_.storage_type; }
+
+ // for profiling and memory statistics
+ uint64_t GetMemorySizeInBytes() const;
+
+ cl_mem GetMemoryPtr() const;
+
+  // For IMAGE_BUFFER storage this returns the underlying buffer memory ptr
+  // instead of the image memory ptr.
+ cl_mem GetMemoryPtrForWriting() const;
+
+ absl::Status WriteData(CLCommandQueue *queue, const TensorFloat32 &src);
+ absl::Status WriteData(CLCommandQueue *queue,
+ const InternalTensor<Linear, DataType::FLOAT32> &src);
+ absl::Status WriteData(CLCommandQueue *queue, const InternalTensor<HWC, DataType::FLOAT32> &src);
+
+ absl::Status WriteData(CLCommandQueue *queue, const Tensor5DFloat32 &src);
+ absl::Status ReadData(CLCommandQueue *queue, TensorFloat32 *dst) const;
+ absl::Status ReadData(CLCommandQueue *queue, Tensor5DFloat32 *dst) const;
+
+ absl::Status CreateFromDescriptor(const TensorDescriptor &desc, CLContext *context);
+
+private:
+ absl::Status IsValid(const BHWC &shape) const;
+ absl::Status IsValid(const BHWDC &shape) const;
+
+ int GetChannelsAlignment() const;
+ int GetAlignedChannels() const;
+
+ absl::Status WriteDataBHWDC(absl::Span<const float> in, CLCommandQueue *queue);
+ absl::Status ReadDataBHWDC(absl::Span<float> out, CLCommandQueue *queue) const;
+
+ int3 GetFullTensorRegion() const;
+ void Release();
+
+ cl_mem memory_;
+ cl_mem image_buffer_memory_; // for TensorStorageType::IMAGE_BUFFER only
+ bool memory_owner_;
+ BHWDC shape_;
+ TensorDescriptor descriptor_;
+};
+
+using TensorPtr = std::shared_ptr<Tensor>;
+
+absl::Status AllocateTensorMemory(const CLContext &context, const BHWC &shape,
+ const TensorDescriptor &descriptor, CLMemory *result);
+
+absl::Status AllocateTensorMemory(const CLContext &context, const BHWDC &shape,
+ const TensorDescriptor &descriptor, CLMemory *result);
+
+absl::Status CreateTensor(const CLContext &context, const BHWC &shape,
+ const TensorDescriptor &descriptor, Tensor *result);
+
+absl::Status CreateTensor(const CLContext &context, const BHWDC &shape,
+ const TensorDescriptor &descriptor, Tensor *result);
+
+absl::Status CreateSharedTensor(const CLContext &context, cl_mem memory, const BHWC &shape,
+ const TensorDescriptor &descriptor, Tensor *result);
+
+absl::Status CreateSharedTensor(const CLContext &context, cl_mem memory, const BHWDC &shape,
+ const TensorDescriptor &descriptor, Tensor *result);
+
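+// A minimal usage sketch (illustrative only; it assumes an already initialized
+// CLContext `context` and CLCommandQueue `queue`, the usual BHWC(b, h, w, c)
+// constructor, and that errors are handled with RETURN_IF_ERROR at the call
+// site):
+//
+//   TensorDescriptor desc(DataType::FLOAT32, TensorStorageType::TEXTURE_2D, Layout::BHWC);
+//   Tensor tensor;
+//   RETURN_IF_ERROR(CreateTensor(context, BHWC(1, 8, 8, 3), desc, &tensor));
+//
+//   TensorFloat32 src;
+//   src.shape = BHWC(1, 8, 8, 3);
+//   src.data.resize(1 * 8 * 8 * 3, 0.0f);
+//   RETURN_IF_ERROR(tensor.WriteData(&queue, src));
+//
+//   TensorFloat32 dst;
+//   dst.shape = src.shape;
+//   dst.data.resize(src.data.size());
+//   RETURN_IF_ERROR(tensor.ReadData(&queue, &dst));
+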
+} // namespace gpu_cl
+} // namespace backend
+} // namespace onert
+
+#endif // __ONERT_BACKEND_GPU_CL_OPENCL_TENSOR_H__
diff --git a/runtime/onert/backend/gpu_cl/open_cl/TensorType.cc b/runtime/onert/backend/gpu_cl/open_cl/TensorType.cc
new file mode 100644
index 000000000..7ede38795
--- /dev/null
+++ b/runtime/onert/backend/gpu_cl/open_cl/TensorType.cc
@@ -0,0 +1,1116 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "TensorType.h"
+
+#include "absl/strings/str_cat.h"
+#include "absl/strings/substitute.h"
+#include "Shape.h"
+#include "DataType.h"
+
+namespace onert
+{
+namespace backend
+{
+namespace gpu_cl
+{
+namespace
+{
+
+std::string GetWriteImageFromDataType(DataType data_type)
+{
+ if (data_type == DataType::FLOAT32)
+ {
+ return "write_imagef";
+ }
+ else if (data_type == DataType::FLOAT16)
+ {
+ return "write_imageh";
+ }
+ else
+ {
+    throw std::runtime_error("Unsupported data type");
+ }
+}
+
+} // namespace
+
+std::string TextureAddressModeToString(TextureAddressMode address_mode)
+{
+ switch (address_mode)
+ {
+ case TextureAddressMode::DONT_CARE:
+ return "smp_none";
+ case TextureAddressMode::ZERO:
+ return "smp_zero";
+ }
+ return "";
+}
+
+std::string ToString(TensorStorageType type)
+{
+ switch (type)
+ {
+ case TensorStorageType::UNKNOWN:
+ return "TensorStorageType::UNKNOWN";
+ case TensorStorageType::BUFFER:
+ return "TensorStorageType::BUFFER";
+ case TensorStorageType::TEXTURE_ARRAY:
+ return "TensorStorageType::TEXTURE_ARRAY";
+ case TensorStorageType::TEXTURE_2D:
+ return "TensorStorageType::TEXTURE_2D";
+ case TensorStorageType::TEXTURE_3D:
+ return "TensorStorageType::TEXTURE_3D";
+ case TensorStorageType::SINGLE_TEXTURE_2D:
+ return "TensorStorageType::SINGLE_TEXTURE_2D";
+ case TensorStorageType::IMAGE_BUFFER:
+ return "TensorStorageType::IMAGE_BUFFER";
+ }
+ return "";
+}
+
+TensorDescriptor::TensorDescriptor(TensorDescriptor &&desc)
+ : GPUObjectDescriptor(std::move(desc)), data_type(desc.data_type),
+ storage_type(desc.storage_type), layout(desc.layout), shape(desc.shape),
+ data(std::move(desc.data))
+{
+}
+TensorDescriptor &TensorDescriptor::operator=(TensorDescriptor &&desc)
+{
+ if (this != &desc)
+ {
+ std::swap(data_type, desc.data_type);
+ std::swap(storage_type, desc.storage_type);
+ std::swap(layout, desc.layout);
+ std::swap(shape, desc.shape);
+ data = std::move(desc.data);
+ GPUObjectDescriptor::operator=(std::move(desc));
+ }
+ return *this;
+}
+
+GPUResources TensorDescriptor::GetGPUResources() const
+{
+ GPUResources resources;
+ if (HasAxis(Axis::WIDTH))
+ {
+ resources.ints.push_back("width");
+ resources.ints.push_back("width_div2");
+ resources.ints.push_back("width_div4");
+ resources.ints.push_back("width_batched");
+ resources.ints.push_back("width_batched_div2");
+ resources.ints.push_back("width_batched_div4");
+ }
+ if (HasAxis(Axis::HEIGHT))
+ {
+ resources.ints.push_back("height");
+ }
+ if (HasAxis(Axis::CHANNELS))
+ {
+ resources.ints.push_back("slices");
+ resources.ints.push_back("channels");
+ }
+ if (HasAxis(Axis::BATCH))
+ {
+ resources.ints.push_back("batch");
+ }
+ if (HasAxis(Axis::DEPTH))
+ {
+ resources.ints.push_back("depth");
+ }
+ if (storage_type == TensorStorageType::BUFFER)
+ {
+ GPUBufferDescriptor desc;
+ desc.data_type = data_type;
+ desc.access_type = access_type_;
+ desc.element_size = 4;
+ auto it1 = state_vars_.find("ElementsX2");
+ if (it1 != state_vars_.end() && it1->second == "true")
+ {
+ desc.element_size = 8;
+ }
+ auto it2 = state_vars_.find("ElementsX4");
+ if (it2 != state_vars_.end() && it2->second == "true")
+ {
+ desc.element_size = 16;
+ }
+ resources.buffers.push_back({"buffer", desc});
+ }
+ else if (storage_type == TensorStorageType::SINGLE_TEXTURE_2D ||
+ storage_type == TensorStorageType::TEXTURE_2D)
+ {
+ GPUImage2DDescriptor desc;
+ desc.data_type = data_type;
+ desc.access_type = access_type_;
+ resources.images2d.push_back({"image2d", desc});
+ }
+ else if (storage_type == TensorStorageType::TEXTURE_ARRAY)
+ {
+ GPUImage2DArrayDescriptor desc;
+ desc.data_type = data_type;
+ desc.access_type = access_type_;
+ resources.image2d_arrays.push_back({"image2d_array", desc});
+ }
+ else if (storage_type == TensorStorageType::TEXTURE_3D)
+ {
+ GPUImage3DDescriptor desc;
+ desc.data_type = data_type;
+ desc.access_type = access_type_;
+ resources.images3d.push_back({"image3d", desc});
+ }
+ else if (storage_type == TensorStorageType::IMAGE_BUFFER)
+ {
+ if (access_type_ == AccessType::READ)
+ {
+ GPUImageBufferDescriptor desc;
+ desc.data_type = data_type;
+ desc.access_type = access_type_;
+ resources.image_buffers.push_back({"image_buffer", desc});
+ }
+ else
+ {
+ GPUBufferDescriptor desc;
+ desc.data_type = data_type;
+ desc.access_type = access_type_;
+ desc.element_size = 4;
+ resources.buffers.push_back({"buffer", desc});
+ }
+ }
+ return resources;
+}
+
+absl::Status TensorDescriptor::PerformSelector(const std::string &selector,
+ const std::vector<std::string> &args,
+ const std::vector<std::string> &template_args,
+ std::string *result) const
+{
+ if (selector == "Width")
+ {
+ *result = GetWidth();
+ return absl::OkStatus();
+ }
+ else if (selector == "Height")
+ {
+ *result = "height";
+ return absl::OkStatus();
+ }
+ else if (selector == "Slices")
+ {
+ *result = "slices";
+ return absl::OkStatus();
+ }
+ else if (selector == "SliceStride")
+ {
+ *result = GetSliceStride();
+ return absl::OkStatus();
+ }
+ else if (selector == "Channels")
+ {
+ *result = "channels";
+ return absl::OkStatus();
+ }
+ else if (selector == "Batch")
+ {
+ if (HasAxis(Axis::BATCH))
+ {
+ *result = "batch";
+ }
+ else
+ {
+ *result = "1";
+ }
+ return absl::OkStatus();
+ }
+ else if (selector == "Depth")
+ {
+ *result = "depth";
+ return absl::OkStatus();
+ }
+ else if (selector == "SetBatchRef")
+ {
+ if (args.size() != 1)
+ {
+ return absl::InvalidArgumentError("Unsupported arguments in SetBatchRef selector");
+ }
+ state_vars_["batch_id"] = args[0];
+ *result = "";
+ return absl::OkStatus();
+ }
+ else if (selector == "Read")
+ {
+ return PerformReadSelector(args, template_args, result);
+ }
+ else if (selector == "Write")
+ {
+ return PerformWriteSelector(args, result);
+ }
+ else if (selector == "WriteLinear")
+ {
+ return PerformWriteLinearSelector(args, result);
+ }
+ else if (selector == "GetAddress")
+ {
+ return PerformGetAddressSelector(args, result);
+ }
+ else if (selector == "GetPtrWithSliceOffset")
+ {
+ return PerformGetPtrWithSliceOffsetSelector(args, result);
+ }
+ else if (selector == "GetWHOffset")
+ {
+ return PerformGetWHOffsetSelector(args, result);
+ }
+ else if (selector == "GetHandle")
+ {
+ return PerformGetHandleSelector(args, result);
+ }
+ else
+ {
+    return absl::NotFoundError(
+      absl::StrCat("TensorDescriptor doesn't have a selector with name - ", selector));
+ }
+}
+
+absl::Status TensorDescriptor::PerformReadSelector(const std::vector<std::string> &args,
+ const std::vector<std::string> &template_args,
+ std::string *result) const
+{
+ DataType read_as_type = data_type;
+ if (!template_args.empty())
+ {
+ if (template_args.size() != 1)
+ {
+ return absl::NotFoundError("Unrecognized Read selector template arguments.");
+ }
+ else
+ {
+ RETURN_IF_ERROR(GetDataTypeFromTemplateArgs(template_args[0], &read_as_type));
+ }
+ }
+ if (args.size() == 1)
+ { // function overload for 1D linear types.
+ if (storage_type == TensorStorageType::BUFFER ||
+ storage_type == TensorStorageType::IMAGE_BUFFER)
+ {
+ *result = Read(read_as_type, args[0]);
+ return absl::OkStatus();
+ }
+ else
+ {
+      return absl::InvalidArgumentError(
+        "Read selector with a single argument can be used only with linear "
+        "storage types (BUFFER or IMAGE_BUFFER)");
+ }
+ }
+ std::string xc;
+ std::string yc;
+ std::string zc;
+ std::string sc;
+ std::string bc;
+ bool parsed = ParseCoordsFromArgs(args, 0, &xc, &yc, &zc, &sc, &bc);
+ if (args.size() < 2 || !parsed)
+ {
+ return absl::NotFoundError("Unrecognized Read selector");
+ }
+
+ *result = Read(read_as_type, GetGlobalAddressNoDeclaration(xc, yc, zc, sc, bc));
+ return absl::OkStatus();
+}
+
+absl::Status TensorDescriptor::GetLinkingContextFromWriteSelector(
+ const std::vector<std::string> &args, std::string *value_name, std::string *x_coord,
+ std::string *y_coord, std::string *s_coord) const
+{
+ std::string xc;
+ std::string yc;
+ std::string zc;
+ std::string sc;
+ std::string bc;
+ bool parsed = ParseCoordsFromArgs(args, 1, &xc, &yc, &zc, &sc, &bc);
+ if (args.size() < 2 || !parsed)
+ {
+ return absl::NotFoundError("Unrecognized Write selector");
+ }
+ *value_name = args[0];
+ if (HasAxis(Axis::BATCH) && !IsBatchedWidth())
+ {
+ *x_coord = absl::StrCat("((", xc, ") * batch + (", bc, "))");
+ }
+ else
+ {
+ *x_coord = absl::StrCat("(", xc, ")");
+ }
+ *y_coord = absl::StrCat("(", yc, ")");
+ *s_coord = absl::StrCat("(", sc, ")");
+ return absl::OkStatus();
+}
+
+absl::Status TensorDescriptor::PerformWriteSelector(const std::vector<std::string> &args,
+ std::string *result) const
+{
+ std::string xc;
+ std::string yc;
+ std::string zc;
+ std::string sc;
+ std::string bc;
+ bool parsed = ParseCoordsFromArgs(args, 1, &xc, &yc, &zc, &sc, &bc);
+ if (args.size() < 2 || !parsed)
+ {
+ return absl::NotFoundError("Unrecognized Write selector");
+ }
+ *result = Write(args[0], GetGlobalAddressNoDeclaration(xc, yc, zc, sc, bc));
+ return absl::OkStatus();
+}
+
+absl::Status TensorDescriptor::PerformWriteLinearSelector(const std::vector<std::string> &args,
+ std::string *result) const
+{
+ if (storage_type != TensorStorageType::BUFFER && storage_type != TensorStorageType::IMAGE_BUFFER)
+ {
+    return absl::InvalidArgumentError("WriteLinear selector can be used only with linear "
+                                      "storage types (BUFFER/IMAGE_BUFFER)");
+ }
+ if (args.size() != 2)
+ {
+ return absl::NotFoundError("Unrecognized WriteLinear selector");
+ }
+ *result = Write(args[0], "(" + args[1] + ")");
+ return absl::OkStatus();
+}
+
+std::string TensorDescriptor::Read(DataType read_as_type, const std::string &global_address) const
+{
+ const std::string read_as = read_as_type == DataType::FLOAT16 ? "read_imageh" : "read_imagef";
+ std::string image_type;
+ if (storage_type == TensorStorageType::TEXTURE_2D ||
+ storage_type == TensorStorageType::SINGLE_TEXTURE_2D)
+ {
+ image_type = "image2d";
+ }
+ else if (storage_type == TensorStorageType::TEXTURE_3D)
+ {
+ image_type = "image3d";
+ }
+ else if (storage_type == TensorStorageType::TEXTURE_ARRAY)
+ {
+ image_type = "image2d_array";
+ }
+ switch (storage_type)
+ {
+ case TensorStorageType::BUFFER:
+ if (read_as_type == data_type)
+ {
+ return absl::StrCat("buffer[", global_address, "]");
+ }
+ else
+ {
+ const std::string conversion =
+ read_as_type == DataType::FLOAT16 ? "convert_half4" : "convert_float4";
+ return absl::StrCat(conversion, "(buffer[", global_address, "])");
+ }
+ case TensorStorageType::TEXTURE_2D:
+ case TensorStorageType::TEXTURE_3D:
+ case TensorStorageType::SINGLE_TEXTURE_2D:
+ case TensorStorageType::TEXTURE_ARRAY:
+ return absl::StrCat(read_as, "(", image_type,
+ ", " + TextureAddressModeToString(ModeFromState()) + ", ", global_address,
+ ")");
+ case TensorStorageType::IMAGE_BUFFER:
+ return absl::StrCat(read_as, "(image_buffer, ", global_address, ")");
+ case TensorStorageType::UNKNOWN:
+ return "";
+ }
+ return "";
+}
+
+std::string TensorDescriptor::Write(const std::string &var_name,
+ const std::string &global_address) const
+{
+ std::string image_type;
+ if (storage_type == TensorStorageType::TEXTURE_2D ||
+ storage_type == TensorStorageType::SINGLE_TEXTURE_2D)
+ {
+ image_type = "image2d";
+ }
+ else if (storage_type == TensorStorageType::TEXTURE_3D)
+ {
+ image_type = "image3d";
+ }
+ else if (storage_type == TensorStorageType::TEXTURE_ARRAY)
+ {
+ image_type = "image2d_array";
+ }
+ switch (storage_type)
+ {
+ case TensorStorageType::BUFFER:
+ case TensorStorageType::IMAGE_BUFFER:
+ return absl::StrCat("buffer[", global_address, "] = ", var_name, ";\n");
+ case TensorStorageType::TEXTURE_2D:
+ case TensorStorageType::TEXTURE_3D:
+ case TensorStorageType::SINGLE_TEXTURE_2D:
+ case TensorStorageType::TEXTURE_ARRAY:
+ return absl::StrCat(GetWriteImageFromDataType(data_type), "(", image_type, ", ",
+ global_address, ", ", var_name, ");\n");
+ case TensorStorageType::UNKNOWN:
+ return "";
+ }
+ return "";
+}
+
+absl::Status TensorDescriptor::PerformGetAddressSelector(const std::vector<std::string> &args,
+ std::string *result) const
+{
+ std::string xc;
+ std::string yc;
+ std::string zc;
+ std::string sc;
+ std::string bc;
+ bool parsed = ParseCoordsFromArgs(args, 1, &xc, &yc, &zc, &sc, &bc);
+ if (args.size() < 3 || !parsed)
+ {
+ return absl::NotFoundError("Unrecognized GetAddress selector");
+ }
+
+ *result = DeclareAddress(args[0], GetGlobalAddressNoDeclaration(xc, yc, zc, sc, bc));
+ return absl::OkStatus();
+}
+
+absl::Status
+TensorDescriptor::PerformGetPtrWithSliceOffsetSelector(const std::vector<std::string> &args,
+ std::string *result) const
+{
+ if (storage_type != TensorStorageType::BUFFER)
+ {
+ return absl::InvalidArgumentError(
+ "GetPtrWithSliceOffset selector can be used only with BUFFER");
+ }
+ if (args.size() != 1)
+ {
+    return absl::NotFoundError(
+      absl::StrCat("GetPtrWithSliceOffset requires one argument (slice coordinate), but ",
+                   args.size(), " were passed"));
+ }
+ *result = absl::StrCat("buffer + ", args[0], " * ", GetSliceStride());
+ return absl::OkStatus();
+}
+
+absl::Status TensorDescriptor::PerformGetWHOffsetSelector(const std::vector<std::string> &args,
+ std::string *result) const
+{
+ if (storage_type != TensorStorageType::BUFFER && storage_type != TensorStorageType::IMAGE_BUFFER)
+ {
+ return absl::InvalidArgumentError(
+ "GetWHOffset selector can be used only with BUFFER/IMAGE_BUFFER");
+ }
+ if (args.size() != 2)
+ {
+    return absl::NotFoundError(absl::StrCat(
+      "GetWHOffset requires two arguments (X and Y coordinates), but ", args.size(),
+      " were passed"));
+ }
+ if (HasAxis(Axis::BATCH) && !IsBatchedWidth())
+ {
+ auto it = state_vars_.find("batch_id");
+ std::string batch_id;
+ if (it == state_vars_.end())
+ {
+      return absl::NotFoundError(
+        "batch_id not found. It should be set up by the SetBatchRef() method.");
+ }
+ else
+ {
+ batch_id = it->second;
+ }
+ *result = absl::StrCat("((", args[1], ") * ", GetWidth(), " + (", args[0], ")) * batch + (",
+ batch_id, ")");
+ }
+ else
+ {
+ *result = absl::StrCat("(", args[1], ") * ", GetWidth(), " + (", args[0], ")");
+ }
+ return absl::OkStatus();
+}
+
+absl::Status TensorDescriptor::PerformGetHandleSelector(const std::vector<std::string> &args,
+ std::string *result) const
+{
+ if (!args.empty())
+ {
+    return absl::NotFoundError(
+      absl::StrCat("GetHandle does not take arguments, but ", args.size(), " were passed"));
+ }
+ switch (storage_type)
+ {
+ case TensorStorageType::BUFFER:
+ *result = "buffer";
+ return absl::OkStatus();
+ case TensorStorageType::IMAGE_BUFFER:
+ if (access_type_ == AccessType::READ)
+ {
+ *result = "image_buffer";
+ }
+ else
+ {
+ *result = "buffer";
+ }
+ return absl::OkStatus();
+ case TensorStorageType::TEXTURE_2D:
+ case TensorStorageType::SINGLE_TEXTURE_2D:
+ *result = "image2d";
+ return absl::OkStatus();
+ case TensorStorageType::TEXTURE_ARRAY:
+ *result = "image2d_array";
+ return absl::OkStatus();
+ case TensorStorageType::TEXTURE_3D:
+ *result = "image3d";
+ return absl::OkStatus();
+ case TensorStorageType::UNKNOWN:
+ return absl::UnavailableError("Unknown type");
+ }
+ return absl::UnavailableError("Unknown type");
+}
+
+std::string TensorDescriptor::DeclareAddress(const std::string &var_name,
+ const std::string &address) const
+{
+ return absl::StrCat(StorageTypeToAddressType(), " ", var_name, " = ", address, ";");
+}
+
+std::string TensorDescriptor::StorageTypeToAddressType() const
+{
+ switch (storage_type)
+ {
+ case TensorStorageType::BUFFER:
+ case TensorStorageType::IMAGE_BUFFER:
+ return "int";
+ case TensorStorageType::TEXTURE_2D:
+ case TensorStorageType::SINGLE_TEXTURE_2D:
+ return "int2";
+ case TensorStorageType::TEXTURE_ARRAY:
+ case TensorStorageType::TEXTURE_3D:
+ return "int4";
+ case TensorStorageType::UNKNOWN:
+ return "";
+ }
+ return "";
+}
+
+std::string TensorDescriptor::GetGlobalAddressNoDeclarationWHS(const std::string &x,
+ const std::string &y,
+ const std::string &s) const
+{
+ switch (storage_type)
+ {
+ case TensorStorageType::BUFFER:
+ case TensorStorageType::IMAGE_BUFFER:
+ {
+ return absl::Substitute("((($2) * height + ($1)) * $3 + ($0))", x, y, s, GetWidth());
+ }
+ case TensorStorageType::TEXTURE_2D:
+ return absl::Substitute("(int2)(($0), ($1) * slices + ($2))", x, y, s);
+ case TensorStorageType::SINGLE_TEXTURE_2D:
+ return absl::StrCat("(int2)(", x, ", ", y, ")");
+ case TensorStorageType::TEXTURE_ARRAY:
+ case TensorStorageType::TEXTURE_3D:
+ return absl::StrCat("(int4)(", x, ", ", y, ", ", s, ", 0)");
+ case TensorStorageType::UNKNOWN:
+ return "error";
+ }
+ return "error";
+}
+
+std::string TensorDescriptor::GetGlobalAddressNoDeclarationWHSB(const std::string &x,
+ const std::string &y,
+ const std::string &s,
+ const std::string &b) const
+{
+ switch (storage_type)
+ {
+ case TensorStorageType::BUFFER:
+ case TensorStorageType::IMAGE_BUFFER:
+ return absl::Substitute("(((($3) * height + $2) * width + ($1)) * batch + ($0))", b, x, y, s);
+ case TensorStorageType::TEXTURE_2D:
+ return absl::Substitute("(int2)(($0) * batch + ($1), ($2) * slices + ($3))", x, b, y, s);
+ case TensorStorageType::SINGLE_TEXTURE_2D:
+ return absl::Substitute("(int2)(($0) * batch + ($1), ($2))", x, b, y);
+ case TensorStorageType::TEXTURE_ARRAY:
+ case TensorStorageType::TEXTURE_3D:
+ return absl::Substitute("(int4)(($0) * batch + ($1), ($2), ($3), 0)", x, b, y, s);
+ default:
+ throw std::runtime_error("Unknown storage type");
+ }
+}
+
+std::string TensorDescriptor::GetGlobalAddressNoDeclarationWHDS(const std::string &x,
+ const std::string &y,
+ const std::string &z,
+ const std::string &s) const
+{
+ switch (storage_type)
+ {
+ case TensorStorageType::BUFFER:
+ case TensorStorageType::IMAGE_BUFFER:
+ {
+ return absl::Substitute("(((($3) * slices + ($2)) * height + ($1)) * $4 + ($0))", x, y, s, z,
+ GetWidth());
+ }
+ case TensorStorageType::TEXTURE_2D:
+ return absl::Substitute("(int2)(($0) * depth + ($1), ($2) * slices + ($3))", x, z, y, s);
+ case TensorStorageType::SINGLE_TEXTURE_2D:
+ return absl::Substitute("(int2)(($0) * depth + ($1), ($2))", x, z, y);
+ case TensorStorageType::TEXTURE_ARRAY:
+ case TensorStorageType::TEXTURE_3D:
+ return absl::Substitute("(int4)(($0), ($1), ($2) * slices + ($3), 0)", x, y, z, s);
+ case TensorStorageType::UNKNOWN:
+ return "error";
+ }
+ return "error";
+}
+
+std::string TensorDescriptor::GetGlobalAddressNoDeclarationWHDSB(const std::string &x,
+ const std::string &y,
+ const std::string &z,
+ const std::string &s,
+ const std::string &b) const
+{
+ switch (storage_type)
+ {
+ case TensorStorageType::BUFFER:
+ case TensorStorageType::IMAGE_BUFFER:
+ return absl::Substitute("((((($4) * slices + ($3)) * height + $2) * width + ($1)) * batch + "
+ "($0))",
+ b, x, y, s, z);
+ case TensorStorageType::TEXTURE_2D:
+ return absl::Substitute("(int2)((($0) * batch + ($1)) * depth + ($2), ($3) * slices + ($4))",
+ x, b, z, y, s);
+ case TensorStorageType::SINGLE_TEXTURE_2D:
+ return absl::Substitute("(int2)((($0) * batch + ($1)) * depth + ($2), ($3))", x, b, z, y);
+ case TensorStorageType::TEXTURE_ARRAY:
+ case TensorStorageType::TEXTURE_3D:
+ return absl::Substitute("(int4)(($0) * batch + ($1), ($2), ($3) * slices + ($4), 0)", x, b, y,
+ z, s);
+ default:
+ throw std::runtime_error("Unknown storage type");
+ }
+}
+
+std::string TensorDescriptor::GetGlobalAddressNoDeclaration(const std::string &xc,
+ const std::string &yc,
+ const std::string &zc,
+ const std::string &sc,
+ const std::string &bc) const
+{
+ if (layout == Layout::HWC || (IsBatchedWidth() && layout == Layout::BHWC))
+ {
+ return GetGlobalAddressNoDeclarationWHS(xc, yc, sc);
+ }
+ else if (layout == Layout::BHWC)
+ {
+ return GetGlobalAddressNoDeclarationWHSB(xc, yc, sc, bc);
+ }
+ else if (layout == Layout::HWDC || (IsBatchedWidth() && layout == Layout::BHWDC))
+ {
+ return GetGlobalAddressNoDeclarationWHDS(xc, yc, zc, sc);
+ }
+ else if (layout == Layout::BHWDC)
+ {
+ return GetGlobalAddressNoDeclarationWHDSB(xc, yc, zc, sc, bc);
+ }
+ else
+ {
+ throw std::runtime_error("Unsupported layout");
+ }
+}
+
+absl::Status TensorDescriptor::GetDataTypeFromTemplateArgs(const std::string &template_arg,
+ DataType *result) const
+{
+ std::string read_type = template_arg;
+ if (read_type == "FLT" || read_type == "ACCUM_FLT")
+ {
+ auto it = state_vars_.find(read_type);
+ if (it == state_vars_.end())
+ {
+      return absl::UnavailableError(
+        absl::StrCat("Read selector template argument ", read_type, " is uninitialized."));
+ }
+ else
+ {
+ read_type = it->second;
+ }
+ }
+
+ if (read_type == "half")
+ {
+ *result = DataType::FLOAT16;
+ }
+ else if (read_type == "float")
+ {
+ *result = DataType::FLOAT32;
+ }
+ else
+ {
+ return absl::NotFoundError(
+ absl::StrCat("Unrecognized Read selector template argument - ", read_type));
+ }
+ return absl::OkStatus();
+}
+
+bool TensorDescriptor::HasAxis(Axis axis) const
+{
+ if (axis == Axis::WIDTH || axis == Axis::HEIGHT || axis == Axis::CHANNELS)
+ {
+ return true;
+ }
+ if (axis == Axis::BATCH && (layout == Layout::BHWC || layout == Layout::BHWDC))
+ {
+ return true;
+ }
+ if (axis == Axis::DEPTH && (layout == Layout::HWDC || layout == Layout::BHWDC))
+ {
+ return true;
+ }
+ return false;
+}
+
+void TensorDescriptor::SetTextureAddressMode(TextureAddressMode mode)
+{
+ if (mode == TextureAddressMode::ZERO)
+ {
+ state_vars_["TextureMode"] = "ZERO";
+ }
+ else
+ {
+ state_vars_["TextureMode"] = "DONT_CARE";
+ }
+}
+
+bool TensorDescriptor::ParseCoordsFromArgs(const std::vector<std::string> &args, int offset,
+ std::string *xc, std::string *yc, std::string *zc,
+ std::string *sc, std::string *bc) const
+{
+ if (HasAxis(Axis::WIDTH))
+ {
+ if ((size_t)offset >= args.size())
+ return false;
+ *xc = args[offset++];
+ }
+ if (HasAxis(Axis::HEIGHT))
+ {
+ if ((size_t)offset >= args.size())
+ return false;
+ *yc = args[offset++];
+ }
+ if (HasAxis(Axis::DEPTH))
+ {
+ if ((size_t)offset >= args.size())
+ return false;
+ *zc = args[offset++];
+ }
+ if (HasAxis(Axis::CHANNELS))
+ {
+ if ((size_t)offset >= args.size())
+ {
+ auto it = state_vars_.find("slice_id");
+ if (it == state_vars_.end())
+ {
+ return false;
+ }
+ else
+ {
+ *sc = it->second;
+ }
+ }
+ else
+ {
+ *sc = args[offset++];
+ }
+ }
+ if (HasAxis(Axis::BATCH) && !IsBatchedWidth())
+ {
+ if ((size_t)offset >= args.size())
+ {
+ auto it = state_vars_.find("batch_id");
+ if (it == state_vars_.end())
+ {
+ return false;
+ }
+ else
+ {
+ *bc = it->second;
+ }
+ }
+ else
+ {
+ *bc = args[offset++];
+ }
+ }
+ return true;
+}
+
+bool TensorDescriptor::IsBatchedWidth() const
+{
+ auto it = state_vars_.find("BatchedWidth");
+ return it != state_vars_.end() && it->second == "true";
+}
+
+std::string TensorDescriptor::GetWidth() const
+{
+ std::string div;
+ auto it1 = state_vars_.find("ElementsX2");
+ if (it1 != state_vars_.end() && it1->second == "true")
+ {
+ div = "_div2";
+ }
+ auto it2 = state_vars_.find("ElementsX4");
+ if (it2 != state_vars_.end() && it2->second == "true")
+ {
+ div = "_div4";
+ }
+ auto it = state_vars_.find("BatchedWidth");
+ if (it != state_vars_.end() && it->second == "true")
+ {
+ return "width_batched" + div;
+ }
+ else
+ {
+ return "width" + div;
+ }
+}
+
+std::string TensorDescriptor::GetSliceStride() const
+{
+ if (IsBatchedWidth())
+ {
+ return GetWidth() + " * height";
+ }
+ else
+ {
+ if (HasAxis(Axis::BATCH))
+ {
+ return GetWidth() + " * height * batch";
+ }
+ else
+ {
+ return GetWidth() + " * height";
+ }
+ }
+}
+
+TextureAddressMode TensorDescriptor::ModeFromState() const
+{
+ auto it = state_vars_.find("TextureMode");
+ if (it != state_vars_.end())
+ {
+ if (it->second == "ZERO")
+ {
+ return TextureAddressMode::ZERO;
+ }
+ else
+ {
+ return TextureAddressMode::DONT_CARE;
+ }
+ }
+ else
+ {
+ return TextureAddressMode::DONT_CARE;
+ }
+}
+
+void TensorDescriptor::UploadData(const InternalTensor<HWC, DataType::FLOAT32> &src)
+{
+ shape = BHWDC(1, src.shape.h, src.shape.w, 1, src.shape.c);
+ UploadData(absl::MakeConstSpan(src.data));
+}
+
+void TensorDescriptor::UploadData(const InternalTensor<Linear, DataType::FLOAT32> &src)
+{
+ shape = BHWDC(1, 1, 1, 1, src.shape.v);
+ UploadData(absl::MakeConstSpan(src.data));
+}
+
+void TensorDescriptor::UploadData(absl::Span<const float> src)
+{
+ int aligned_channels =
+ storage_type == TensorStorageType::SINGLE_TEXTURE_2D ? shape.c : AlignByN(shape.c, 4);
+ int elements_count = shape.b * shape.w * shape.h * shape.d * aligned_channels;
+ data.resize(elements_count * SizeOf(data_type));
+ if (data_type == DataType::FLOAT32)
+ {
+ float *gpu_data = reinterpret_cast<float *>(data.data());
+ DataFromBHWDC(src, shape, *this, absl::MakeSpan(gpu_data, elements_count));
+ }
+}
+
+bool TensorDescriptor::SupportsZeroClamp(const Axis &axis) const
+{
+ switch (storage_type)
+ {
+ case TensorStorageType::UNKNOWN:
+ return false;
+ case TensorStorageType::BUFFER:
+ case TensorStorageType::IMAGE_BUFFER:
+ return false;
+ case TensorStorageType::TEXTURE_ARRAY:
+ case TensorStorageType::TEXTURE_2D:
+ case TensorStorageType::SINGLE_TEXTURE_2D:
+ return axis == Axis::WIDTH || axis == Axis::HEIGHT;
+ case TensorStorageType::TEXTURE_3D:
+ return axis == Axis::WIDTH || axis == Axis::HEIGHT || axis == Axis::DEPTH;
+ }
+ return false;
+}
+
+bool TensorDescriptor::CanReadOutOfBorder(const Axis &) const
+{
+ switch (storage_type)
+ {
+ case TensorStorageType::UNKNOWN:
+ return false;
+ case TensorStorageType::BUFFER:
+ return false;
+ case TensorStorageType::IMAGE_BUFFER:
+ case TensorStorageType::TEXTURE_2D:
+ case TensorStorageType::TEXTURE_3D:
+ case TensorStorageType::SINGLE_TEXTURE_2D:
+ case TensorStorageType::TEXTURE_ARRAY:
+ return true;
+ }
+ return false;
+}
+
+bool TensorDescriptor::IsLinear() const
+{
+ return storage_type == TensorStorageType::BUFFER ||
+ storage_type == TensorStorageType::IMAGE_BUFFER;
+}
+
+bool TensorDescriptor::ReturnsZeroForNegOneRead() const
+{
+ return storage_type == TensorStorageType::IMAGE_BUFFER;
+}
+
+namespace
+{
+int GetLinearIndex(const TensorDescriptor &desc, const BHWDC &shape, int b, int x, int y, int d,
+ int s, int sub_c)
+{
+ const int slices = DivideRoundUp(shape.c, 4);
+ switch (desc.storage_type)
+ {
+ case TensorStorageType::BUFFER:
+ case TensorStorageType::IMAGE_BUFFER:
+ case TensorStorageType::TEXTURE_ARRAY:
+ case TensorStorageType::TEXTURE_3D:
+ return ((((d * slices + s) * shape.h + y) * shape.w + x) * shape.b + b) * 4 +
+ sub_c; // DSHWBC4
+ case TensorStorageType::TEXTURE_2D:
+ return ((((y * slices + s) * shape.w + x) * shape.b + b) * shape.d + d) * 4 +
+ sub_c; // HSWBDC4
+ case TensorStorageType::SINGLE_TEXTURE_2D:
+ return (((y * shape.w + x) * shape.b + b) * shape.d + d) * shape.c + sub_c; // HWBDC
+ default:
+ return -1;
+ }
+ return -1;
+}
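+
+// Worked example for the BUFFER/IMAGE_BUFFER (DSHWBC4) case above: for a BHWDC
+// shape of (b=1, h=2, w=2, d=1, c=3), slices = DivideRoundUp(3, 4) = 1, so the
+// element at (b=0, y=1, x=0, d=0, channel=2) has s=0, sub_c=2 and maps to
+// ((((0*1+0)*2+1)*2+0)*1+0)*4 + 2 = 10. Channel index 3 of that slice is
+// padding and is filled with 0.0f by DataFromBHWDC below.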
+
+int GetChannelsAlignment(const TensorDescriptor &desc, const BHWDC &shape)
+{
+ return desc.storage_type == TensorStorageType::SINGLE_TEXTURE_2D ? shape.c : 4;
+}
+} // namespace
+
+template <typename T>
+void DataFromBHWDC(absl::Span<const float> src, const BHWDC &shape, const TensorDescriptor &desc,
+ absl::Span<T> dst)
+{
+ const int channels_alignment = GetChannelsAlignment(desc, shape);
+ const int slices = DivideRoundUp(shape.c, 4);
+ for (int b = 0; b < shape.b; ++b)
+ {
+ for (int s = 0; s < slices; ++s)
+ {
+ for (int y = 0; y < shape.h; ++y)
+ {
+ for (int x = 0; x < shape.w; ++x)
+ {
+ for (int d = 0; d < shape.d; ++d)
+ {
+ for (int c = 0; c < channels_alignment; ++c)
+ {
+ float value;
+ if (s * 4 + c < shape.c)
+ {
+ const int cpu_index = shape.LinearIndex({b, y, x, d, s * 4 + c});
+ value = src[cpu_index];
+ }
+ else
+ {
+ value = 0.0f;
+ }
+ int gpu_index = GetLinearIndex(desc, shape, b, x, y, d, s, c);
+ dst[gpu_index] = value;
+ }
+ }
+ }
+ }
+ }
+ }
+}
+
+template void DataFromBHWDC<float>(absl::Span<const float> src, const BHWDC &shape,
+ const TensorDescriptor &desc, absl::Span<float> dst);
+
+template <typename T>
+void DataToBHWDC(absl::Span<const T> src, const BHWDC &shape, const TensorDescriptor &desc,
+ absl::Span<float> dst)
+{
+ const int channels_alignment = GetChannelsAlignment(desc, shape);
+ const int slices = DivideRoundUp(shape.c, 4);
+ for (int b = 0; b < shape.b; ++b)
+ {
+ for (int s = 0; s < slices; ++s)
+ {
+ for (int y = 0; y < shape.h; ++y)
+ {
+ for (int x = 0; x < shape.w; ++x)
+ {
+ for (int d = 0; d < shape.d; ++d)
+ {
+ for (int c = 0; c < channels_alignment; ++c)
+ {
+ if (s * 4 + c >= shape.c)
+ {
+ continue;
+ }
+ int cpu_index = shape.LinearIndex({b, y, x, d, s * 4 + c});
+ int gpu_index = GetLinearIndex(desc, shape, b, x, y, d, s, c);
+ dst[cpu_index] = src[gpu_index];
+ }
+ }
+ }
+ }
+ }
+ }
+}
+
+template void DataToBHWDC<float>(absl::Span<const float> src, const BHWDC &shape,
+ const TensorDescriptor &desc, absl::Span<float> dst);
+
+} // namespace gpu_cl
+} // namespace backend
+} // namespace onert
diff --git a/runtime/onert/backend/gpu_cl/open_cl/TensorType.h b/runtime/onert/backend/gpu_cl/open_cl/TensorType.h
new file mode 100644
index 000000000..45523783f
--- /dev/null
+++ b/runtime/onert/backend/gpu_cl/open_cl/TensorType.h
@@ -0,0 +1,188 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __ONERT_BACKEND_GPU_CL_OPENCL_TENSOR_TYPE_H__
+#define __ONERT_BACKEND_GPU_CL_OPENCL_TENSOR_TYPE_H__
+
+#include <cstddef>
+#include <string>
+
+#include "absl/types/span.h"
+#include "GpuObject.h"
+#include "DataType.h"
+#include "InternalTensor.h"
+#include "Shape.h"
+
+namespace onert
+{
+namespace backend
+{
+namespace gpu_cl
+{
+
+enum class TextureAddressMode
+{
+ DONT_CARE, // translated to CLK_ADDRESS_NONE
+ ZERO, // translated to CLK_ADDRESS_CLAMP
+};
+
+std::string TextureAddressModeToString(TextureAddressMode address_mode);
+
+enum class TensorStorageType
+{
+ UNKNOWN,
+ BUFFER,
+ IMAGE_BUFFER,
+ TEXTURE_2D,
+ TEXTURE_3D,
+ TEXTURE_ARRAY,
+ SINGLE_TEXTURE_2D
+};
+
+struct TensorDescriptor : public GPUObjectDescriptor
+{
+ TensorDescriptor() = default;
+ TensorDescriptor(DataType dt, TensorStorageType st, Layout l)
+ : data_type(dt), storage_type(st), layout(l)
+ {
+ }
+
+ TensorDescriptor(const TensorDescriptor &) = default;
+ TensorDescriptor &operator=(const TensorDescriptor &) = default;
+ TensorDescriptor(TensorDescriptor &&desc);
+ TensorDescriptor &operator=(TensorDescriptor &&desc);
+
+ bool operator==(const TensorDescriptor &d) const
+ {
+ return data_type == d.data_type && storage_type == d.storage_type && layout == d.layout;
+ }
+
+ bool operator!=(const TensorDescriptor &d) const { return !(*this == d); }
+
+ absl::Status PerformSelector(const std::string &selector, const std::vector<std::string> &args,
+ const std::vector<std::string> &template_args,
+ std::string *result) const override;
+
+ GPUResources GetGPUResources() const override;
+
+ absl::Status CreateGPUObject(CLContext *context, GPUObjectPtr *result) const override;
+ void Release() override { data.clear(); }
+
+ bool HasAxis(Axis axis) const;
+ void SetTextureAddressMode(TextureAddressMode mode);
+
+ absl::Status GetLinkingContextFromWriteSelector(const std::vector<std::string> &args,
+ std::string *value_name, std::string *x_coord,
+ std::string *y_coord, std::string *s_coord) const;
+
+ void UploadData(const InternalTensor<HWC, DataType::FLOAT32> &src);
+ void UploadData(const InternalTensor<Linear, DataType::FLOAT32> &src);
+
+ bool SupportsZeroClamp(const Axis &axis) const;
+ bool CanReadOutOfBorder(const Axis &axis) const;
+ bool IsLinear() const;
+
+  // Applicable only to storage types for which IsLinear() returns true.
+  // For such types an address is a single 1D component - addr (int).
+  // Returns true if a read at addr == -1 yields FLT4(0.0) for this linear
+  // storage type, false otherwise.
+ bool ReturnsZeroForNegOneRead() const;
+
+ DataType data_type = DataType::UNKNOWN;
+ TensorStorageType storage_type = TensorStorageType::UNKNOWN;
+  // This field describes the logical layout; the actual (physical) GPU layout
+  // can be completely different.
+  Layout layout = Layout::UNKNOWN; // Supported layouts are HWC, BHWC, HWDC, BHWDC
+
+ // optional
+ BHWDC shape;
+ std::vector<uint8_t> data;
+
+private:
+ absl::Status PerformReadSelector(const std::vector<std::string> &args,
+ const std::vector<std::string> &template_args,
+ std::string *result) const;
+
+ absl::Status PerformGetAddressSelector(const std::vector<std::string> &args,
+ std::string *result) const;
+
+ absl::Status PerformGetPtrWithSliceOffsetSelector(const std::vector<std::string> &args,
+ std::string *result) const;
+
+ absl::Status PerformGetWHOffsetSelector(const std::vector<std::string> &args,
+ std::string *result) const;
+
+ absl::Status PerformGetHandleSelector(const std::vector<std::string> &args,
+ std::string *result) const;
+
+ std::string DeclareAddress(const std::string &var_name, const std::string &address) const;
+
+ std::string StorageTypeToAddressType() const;
+
+ absl::Status PerformWriteSelector(const std::vector<std::string> &args,
+ std::string *result) const;
+
+ absl::Status PerformWriteLinearSelector(const std::vector<std::string> &args,
+ std::string *result) const;
+
+ std::string Read(DataType read_as_type, const std::string &global_address) const;
+ std::string Write(const std::string &var_name, const std::string &global_address) const;
+
+ bool IsBatchedWidth() const;
+
+ std::string GetWidth() const;
+ std::string GetSliceStride() const;
+
+ TextureAddressMode ModeFromState() const;
+
+ absl::Status GetDataTypeFromTemplateArgs(const std::string &template_arg, DataType *result) const;
+
+ std::string GetGlobalAddressNoDeclarationWHS(const std::string &x, const std::string &y,
+ const std::string &s) const;
+ std::string GetGlobalAddressNoDeclarationWHSB(const std::string &x, const std::string &y,
+ const std::string &s, const std::string &b) const;
+ std::string GetGlobalAddressNoDeclarationWHDS(const std::string &x, const std::string &y,
+ const std::string &z, const std::string &s) const;
+ std::string GetGlobalAddressNoDeclarationWHDSB(const std::string &x, const std::string &y,
+ const std::string &z, const std::string &s,
+ const std::string &b) const;
+ std::string GetGlobalAddressNoDeclaration(const std::string &xc, const std::string &yc,
+ const std::string &zc, const std::string &sc,
+ const std::string &bc) const;
+
+ bool ParseCoordsFromArgs(const std::vector<std::string> &args, int offset, std::string *xc,
+ std::string *yc, std::string *zc, std::string *sc,
+ std::string *bc) const;
+
+ void UploadData(absl::Span<const float> src);
+};
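+
+// Typical TensorDescriptor construction (illustrative): a descriptor is fully
+// specified by its data type, storage type and logical layout, e.g.
+//   TensorDescriptor desc(DataType::FLOAT32, TensorStorageType::BUFFER, Layout::HWC);
+// The shape and data members are only filled when the descriptor carries
+// constant data to upload (see UploadData()).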
+
+template <typename T>
+void DataFromBHWDC(absl::Span<const float> src, const BHWDC &shape, const TensorDescriptor &desc,
+ absl::Span<T> dst);
+
+template <typename T>
+void DataToBHWDC(absl::Span<const T> src, const BHWDC &shape, const TensorDescriptor &desc,
+ absl::Span<float> dst);
+
+std::string ToString(TensorStorageType type);
+
+} // namespace gpu_cl
+} // namespace backend
+} // namespace onert
+
+#endif // __ONERT_BACKEND_GPU_CL_OPENCL_TENSOR_TYPE_H__
diff --git a/runtime/onert/backend/gpu_cl/open_cl/TensorTypeUtil.cc b/runtime/onert/backend/gpu_cl/open_cl/TensorTypeUtil.cc
new file mode 100644
index 000000000..b1f8309e4
--- /dev/null
+++ b/runtime/onert/backend/gpu_cl/open_cl/TensorTypeUtil.cc
@@ -0,0 +1,90 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "TensorTypeUtil.h"
+
+namespace onert
+{
+namespace backend
+{
+namespace gpu_cl
+{
+
+ObjectType ToObjectType(TensorStorageType type)
+{
+ switch (type)
+ {
+ case TensorStorageType::IMAGE_BUFFER:
+ case TensorStorageType::BUFFER:
+ return ObjectType::OPENCL_BUFFER;
+ case TensorStorageType::SINGLE_TEXTURE_2D:
+ case TensorStorageType::TEXTURE_2D:
+ case TensorStorageType::TEXTURE_ARRAY:
+ case TensorStorageType::TEXTURE_3D:
+ return ObjectType::OPENCL_TEXTURE;
+ default:
+ return ObjectType::UNKNOWN;
+ }
+}
+
+DataLayout ToDataLayout(TensorStorageType type)
+{
+ switch (type)
+ {
+ case TensorStorageType::BUFFER:
+ return DataLayout::DHWC4;
+ case TensorStorageType::IMAGE_BUFFER:
+ return DataLayout::DHWC4;
+ case TensorStorageType::SINGLE_TEXTURE_2D:
+ return DataLayout::BHWC;
+ case TensorStorageType::TEXTURE_2D:
+ return DataLayout::HDWC4;
+ case TensorStorageType::TEXTURE_ARRAY:
+ return DataLayout::DHWC4;
+ case TensorStorageType::TEXTURE_3D:
+ return DataLayout::DHWC4;
+ default:
+ return DataLayout::UNKNOWN;
+ }
+}
+
+TensorStorageType ToTensorStorageType(ObjectType object_type, DataLayout data_layout)
+{
+ switch (object_type)
+ {
+ case ObjectType::OPENCL_BUFFER:
+ return TensorStorageType::BUFFER;
+ case ObjectType::OPENCL_TEXTURE:
+ switch (data_layout)
+ {
+ case DataLayout::BHWC:
+ return TensorStorageType::SINGLE_TEXTURE_2D;
+ case DataLayout::DHWC4:
+ return TensorStorageType::TEXTURE_ARRAY;
+ case DataLayout::HDWC4:
+ return TensorStorageType::TEXTURE_2D;
+ default:
+ return TensorStorageType::UNKNOWN;
+ }
+ default:
+ return TensorStorageType::UNKNOWN;
+ }
+}
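+
+// Note: ToObjectType()/ToDataLayout() followed by ToTensorStorageType() is not
+// an exact inverse. IMAGE_BUFFER maps to (OPENCL_BUFFER, DHWC4) and comes back
+// as BUFFER, and TEXTURE_3D maps to (OPENCL_TEXTURE, DHWC4) and comes back as
+// TEXTURE_ARRAY.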
+
+} // namespace gpu_cl
+} // namespace backend
+} // namespace onert
diff --git a/runtime/onert/backend/gpu_cl/open_cl/TensorTypeUtil.h b/runtime/onert/backend/gpu_cl/open_cl/TensorTypeUtil.h
new file mode 100644
index 000000000..f56fc3d83
--- /dev/null
+++ b/runtime/onert/backend/gpu_cl/open_cl/TensorTypeUtil.h
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __ONERT_BACKEND_GPU_CL_OPENCL_TENSOR_TYPE_UTIL_H__
+#define __ONERT_BACKEND_GPU_CL_OPENCL_TENSOR_TYPE_UTIL_H__
+
+#include "Api.h"
+#include "TensorType.h"
+
+namespace onert
+{
+namespace backend
+{
+namespace gpu_cl
+{
+
+ObjectType ToObjectType(TensorStorageType type);
+
+DataLayout ToDataLayout(TensorStorageType type);
+
+TensorStorageType ToTensorStorageType(ObjectType object_type, DataLayout data_layout);
+
+} // namespace gpu_cl
+} // namespace backend
+} // namespace onert
+
+#endif // __ONERT_BACKEND_GPU_CL_OPENCL_TENSOR_TYPE_UTIL_H__
diff --git a/runtime/onert/backend/gpu_cl/open_cl/Texture2d.cc b/runtime/onert/backend/gpu_cl/open_cl/Texture2d.cc
new file mode 100644
index 000000000..ae25e85d0
--- /dev/null
+++ b/runtime/onert/backend/gpu_cl/open_cl/Texture2d.cc
@@ -0,0 +1,237 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Texture2d.h"
+
+namespace onert
+{
+namespace backend
+{
+namespace gpu_cl
+{
+namespace
+{
+
+// Creates new 4-channel 2D texture with cl_channel_type elements
+absl::Status CreateTexture2D(int width, int height, DataType type, void *data, CLContext *context,
+ Texture2D *result)
+{
+ cl_mem texture;
+ cl_channel_type channel_type = DataTypeToChannelType(type);
+ RETURN_IF_ERROR(
+ CreateRGBAImage2D(context->context(), width, height, channel_type, data, &texture));
+ *result = Texture2D(texture, width, height, channel_type);
+
+ return absl::OkStatus();
+}
+} // namespace
+
+Texture2DDescriptor::Texture2DDescriptor(Texture2DDescriptor &&desc)
+ : GPUObjectDescriptor(std::move(desc)), element_type(desc.element_type),
+ normalized(desc.normalized), normalized_type(desc.normalized_type), size(desc.size),
+ data(std::move(desc.data))
+{
+}
+
+Texture2DDescriptor &Texture2DDescriptor::operator=(Texture2DDescriptor &&desc)
+{
+ if (this != &desc)
+ {
+ std::swap(element_type, desc.element_type);
+ std::swap(normalized, desc.normalized);
+ std::swap(normalized_type, desc.normalized_type);
+ std::swap(size, desc.size);
+ data = std::move(desc.data);
+ GPUObjectDescriptor::operator=(std::move(desc));
+ }
+ return *this;
+}
+
+void Texture2DDescriptor::Release() { data.clear(); }
+
+GPUResources Texture2DDescriptor::GetGPUResources() const
+{
+ GPUResources resources;
+ GPUImage2DDescriptor desc;
+ desc.data_type = element_type;
+ desc.access_type = access_type_;
+ resources.images2d.push_back({"tex2d", desc});
+ return resources;
+}
+
+absl::Status Texture2DDescriptor::PerformSelector(const std::string &selector,
+ const std::vector<std::string> &args,
+ const std::vector<std::string> &,
+ std::string *result) const
+{
+ if (selector == "Read")
+ {
+ return PerformReadSelector(args, result);
+ }
+ else
+ {
+    return absl::NotFoundError(
+      absl::StrCat("Texture2DDescriptor doesn't have a selector with name - ", selector));
+ }
+}
+
+absl::Status Texture2DDescriptor::PerformReadSelector(const std::vector<std::string> &args,
+ std::string *result) const
+{
+ if (args.size() != 2)
+ {
+    return absl::NotFoundError(absl::StrCat("Texture2DDescriptor Read requires two arguments, but ",
+                                             args.size(), " were passed"));
+ }
+ std::string read;
+ switch (element_type)
+ {
+ case DataType::FLOAT32:
+ read = "read_imagef";
+ break;
+ case DataType::FLOAT16:
+ read = "read_imageh";
+ break;
+ case DataType::INT8:
+ case DataType::INT16:
+ case DataType::INT32:
+ if (normalized)
+ {
+ read = normalized_type == DataType::FLOAT16 ? "read_imageh" : "read_imagef";
+ }
+ else
+ {
+ read = "read_imagei";
+ }
+ break;
+ case DataType::UINT8:
+ case DataType::UINT16:
+ case DataType::UINT32:
+ if (normalized)
+ {
+ read = normalized_type == DataType::FLOAT16 ? "read_imageh" : "read_imagef";
+ }
+ else
+ {
+ read = "read_imageui";
+ }
+ break;
+ default:
+ read = "unknown_type";
+ break;
+ }
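+  // e.g. for FLOAT32 with args {"X", "Y"} this produces:
+  //   read_imagef(tex2d, smp_none, (int2)(X, Y))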
+ *result = absl::StrCat(read, "(tex2d, smp_none, (int2)(", args[0], ", " + args[1] + "))");
+ return absl::OkStatus();
+}
+
+absl::Status Texture2DDescriptor::CreateGPUObject(CLContext *context, GPUObjectPtr *result) const
+{
+ Texture2D gpu_texture;
+ RETURN_IF_ERROR(gpu_texture.CreateFromTexture2DDescriptor(*this, context));
+ *result = absl::make_unique<Texture2D>(std::move(gpu_texture));
+ return absl::OkStatus();
+}
+
+Texture2D::Texture2D(cl_mem texture, int width, int height, cl_channel_type type)
+ : texture_(texture), width_(width), height_(height), channel_type_(type)
+{
+}
+
+Texture2D::Texture2D(Texture2D &&texture)
+ : texture_(texture.texture_), width_(texture.width_), height_(texture.height_),
+ channel_type_(texture.channel_type_)
+{
+ texture.texture_ = nullptr;
+ texture.width_ = 0;
+ texture.height_ = 0;
+}
+
+Texture2D &Texture2D::operator=(Texture2D &&texture)
+{
+ if (this != &texture)
+ {
+ Release();
+ std::swap(channel_type_, texture.channel_type_);
+ std::swap(width_, texture.width_);
+ std::swap(height_, texture.height_);
+ std::swap(texture_, texture.texture_);
+ }
+ return *this;
+}
+
+void Texture2D::Release()
+{
+ if (texture_)
+ {
+ clReleaseMemObject(texture_);
+ texture_ = nullptr;
+ width_ = 0;
+ height_ = 0;
+ }
+}
+
+absl::Status Texture2D::GetGPUResources(const GPUObjectDescriptor *obj_ptr,
+ GPUResourcesWithValue *resources) const
+{
+ const auto *texture_desc = dynamic_cast<const Texture2DDescriptor *>(obj_ptr);
+ if (!texture_desc)
+ {
+ return absl::InvalidArgumentError("Expected Texture2DDescriptor on input.");
+ }
+
+ resources->images2d.push_back({"tex2d", texture_});
+ return absl::OkStatus();
+}
+
+absl::Status Texture2D::CreateFromTexture2DDescriptor(const Texture2DDescriptor &desc,
+ CLContext *context)
+{
+ width_ = desc.size.x;
+ height_ = desc.size.y;
+ channel_type_ = DataTypeToChannelType(desc.element_type, desc.normalized);
+ uint8_t *data_ptr = desc.data.empty() ? nullptr : const_cast<unsigned char *>(desc.data.data());
+ return CreateRGBAImage2D(context->context(), desc.size.x, desc.size.y, channel_type_, data_ptr,
+ &texture_);
+}
+
+// Creates new 4-channel 2D texture with f32 elements
+absl::Status CreateTexture2DRGBA32F(int width, int height, CLContext *context, Texture2D *result)
+{
+ return CreateTexture2D(width, height, DataType::FLOAT32, nullptr, context, result);
+}
+
+// Creates new 4-channel 2D texture with f16 elements
+absl::Status CreateTexture2DRGBA16F(int width, int height, CLContext *context, Texture2D *result)
+{
+ return CreateTexture2D(width, height, DataType::FLOAT16, nullptr, context, result);
+}
+
+absl::Status CreateTexture2DRGBA(DataType type, int width, int height, CLContext *context,
+ Texture2D *result)
+{
+ return CreateTexture2D(width, height, type, nullptr, context, result);
+}
+
+absl::Status CreateTexture2DRGBA(DataType type, int width, int height, void *data,
+ CLContext *context, Texture2D *result)
+{
+ return CreateTexture2D(width, height, type, data, context, result);
+}
+
+} // namespace gpu_cl
+} // namespace backend
+} // namespace onert
diff --git a/runtime/onert/backend/gpu_cl/open_cl/Texture2d.h b/runtime/onert/backend/gpu_cl/open_cl/Texture2d.h
new file mode 100644
index 000000000..264507079
--- /dev/null
+++ b/runtime/onert/backend/gpu_cl/open_cl/Texture2d.h
@@ -0,0 +1,160 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __ONERT_BACKEND_GPU_CL_OPENCL_TEXTURE2D_H__
+#define __ONERT_BACKEND_GPU_CL_OPENCL_TEXTURE2D_H__
+
+#include "absl/strings/str_cat.h"
+#include "absl/types/span.h"
+#include "ClCommandQueue.h"
+#include "ClContext.h"
+#include "GpuObject.h"
+#include "OpenclWrapper.h"
+#include "TensorType.h"
+#include "Util.h"
+#include "DataType.h"
+#include "Status.h"
+
+namespace onert
+{
+namespace backend
+{
+namespace gpu_cl
+{
+
+struct Texture2DDescriptor : public GPUObjectDescriptor
+{
+ DataType element_type;
+  bool normalized = false; // used with integer data types; if normalized, the
+                           // kernel reads the data as float.
+  DataType normalized_type; // can be FLOAT32 or FLOAT16; used when normalized
+                            // is true
+
+ // optional
+ int2 size = int2(0, 0);
+ std::vector<uint8_t> data;
+
+ Texture2DDescriptor() = default;
+ Texture2DDescriptor(const Texture2DDescriptor &) = default;
+ Texture2DDescriptor &operator=(const Texture2DDescriptor &) = default;
+ Texture2DDescriptor(Texture2DDescriptor &&desc);
+ Texture2DDescriptor &operator=(Texture2DDescriptor &&desc);
+
+ absl::Status PerformSelector(const std::string &selector, const std::vector<std::string> &args,
+ const std::vector<std::string> &template_args,
+ std::string *result) const override;
+
+ GPUResources GetGPUResources() const override;
+ absl::Status PerformReadSelector(const std::vector<std::string> &args, std::string *result) const;
+
+ absl::Status CreateGPUObject(CLContext *context, GPUObjectPtr *result) const override;
+ void Release() override;
+};
+
+// Texture2D represents formatted GPU data storage.
+// Texture2D is movable but not copyable.
+class Texture2D : public GPUObject
+{
+public:
+  Texture2D() {} // just for using Texture2D as a class member
+ Texture2D(cl_mem texture, int width, int height, cl_channel_type type);
+
+ // Move only
+ Texture2D(Texture2D &&texture);
+ Texture2D &operator=(Texture2D &&texture);
+ Texture2D(const Texture2D &) = delete;
+ Texture2D &operator=(const Texture2D &) = delete;
+
+ virtual ~Texture2D() { Release(); }
+
+ cl_mem GetMemoryPtr() const { return texture_; }
+
+ // Writes data to a texture. Data should point to a region that
+ // has exact width * height * sizeof(pixel) bytes.
+ template <typename T> absl::Status WriteData(CLCommandQueue *queue, const absl::Span<T> data);
+
+ // Reads data from Texture2D into CPU memory.
+ template <typename T> absl::Status ReadData(CLCommandQueue *queue, std::vector<T> *result) const;
+
+ absl::Status GetGPUResources(const GPUObjectDescriptor *obj_ptr,
+ GPUResourcesWithValue *resources) const override;
+
+ absl::Status CreateFromTexture2DDescriptor(const Texture2DDescriptor &desc, CLContext *context);
+
+private:
+ void Release();
+
+ cl_mem texture_ = nullptr;
+ int width_;
+ int height_;
+ cl_channel_type channel_type_;
+};
+
+using Texture2DPtr = std::shared_ptr<Texture2D>;
+
+// Creates new 4-channel 2D texture with f32 elements
+absl::Status CreateTexture2DRGBA32F(int width, int height, CLContext *context, Texture2D *result);
+
+// Creates new 4-channel 2D texture with f16 elements
+absl::Status CreateTexture2DRGBA16F(int width, int height, CLContext *context, Texture2D *result);
+
+absl::Status CreateTexture2DRGBA(DataType type, int width, int height, CLContext *context,
+ Texture2D *result);
+
+absl::Status CreateTexture2DRGBA(DataType type, int width, int height, void *data,
+ CLContext *context, Texture2D *result);
+
+template <typename T>
+absl::Status Texture2D::WriteData(CLCommandQueue *queue, const absl::Span<T> data)
+{
+ const int element_size = ChannelTypeToSizeInBytes(channel_type_);
+ if (sizeof(T) % element_size != 0)
+ {
+    return absl::InvalidArgumentError(
+      "Template type T does not have a suitable element type for the created texture.");
+ }
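+  // The texture holds width_ * height_ RGBA pixels, i.e. 4 * width_ * height_
+  // channel elements of element_size bytes each.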
+ if (4 * width_ * height_ * element_size != data.size() * sizeof(T))
+ {
+ return absl::InvalidArgumentError(
+ "absl::Span<T> data size is different from texture allocated size.");
+ }
+
+ RETURN_IF_ERROR(queue->EnqueueWriteImage(texture_, int3(width_, height_, 1), data.data()));
+
+ return absl::OkStatus();
+}
+
+template <typename T>
+absl::Status Texture2D::ReadData(CLCommandQueue *queue, std::vector<T> *result) const
+{
+ const int element_size = ChannelTypeToSizeInBytes(channel_type_);
+ if (sizeof(T) != element_size)
+ {
+ return absl::InvalidArgumentError("Pixel format is different.");
+ }
+
+ const int elements_count = width_ * height_ * 4;
+ result->resize(elements_count);
+
+ return queue->EnqueueReadImage(texture_, int3(width_, height_, 1), result->data());
+}
+
+} // namespace gpu_cl
+} // namespace backend
+} // namespace onert
+
+#endif // __ONERT_BACKEND_GPU_CL_OPENCL_TEXTURE2D_H__
diff --git a/runtime/onert/backend/gpu_cl/open_cl/Types.h b/runtime/onert/backend/gpu_cl/open_cl/Types.h
new file mode 100644
index 000000000..f3cf33450
--- /dev/null
+++ b/runtime/onert/backend/gpu_cl/open_cl/Types.h
@@ -0,0 +1,183 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __ONERT_BACKEND_GPU_CL_OPENCL_TYPES_H__
+#define __ONERT_BACKEND_GPU_CL_OPENCL_TYPES_H__
+
+#include <array>
+#include <cstddef>
+#include <cstdint>
+
+namespace onert
+{
+namespace backend
+{
+namespace gpu_cl
+{
+
+// TODO(akulik): make these types Google-style compliant.
+
+template <typename T> struct alignas(sizeof(T)) Vec4
+{
+ union {
+ struct
+ {
+ T x, y, z, w;
+ };
+ std::array<T, 4> data_;
+ };
+
+ Vec4() : Vec4(T(0.0f)) {}
+
+ template <typename S> Vec4(S x_, S y_, S z_, S w_) : x(x_), y(y_), z(z_), w(w_) {}
+ explicit Vec4(T v) : x(v), y(v), z(v), w(v) {}
+
+ template <typename S> explicit Vec4(S v) : x(v), y(v), z(v), w(v) {}
+
+ Vec4(const Vec4 &f) : x(f.x), y(f.y), z(f.z), w(f.w) {}
+
+ template <typename S> Vec4(const Vec4<S> &f) : x(f.x), y(f.y), z(f.z), w(f.w) {}
+
+ Vec4 &operator=(const Vec4 &other)
+ {
+ x = other.x;
+ y = other.y;
+ z = other.z;
+ w = other.w;
+ return *this;
+ }
+
+ static constexpr int size() { return 4; }
+
+ T &operator[](size_t n) { return data_[n]; }
+ T operator[](size_t n) const { return data_[n]; }
+
+ bool operator==(const Vec4 &value) const
+ {
+ return data_[0] == value[0] && data_[1] == value[1] && data_[2] == value[2] &&
+ data_[3] == value[3];
+ }
+ bool operator!=(const Vec4 &value) const { return !(this->operator==(value)); }
+};
+
+template <typename T> struct alignas(sizeof(T)) Vec3
+{
+ union {
+ struct
+ {
+ T x, y, z;
+ };
+ std::array<T, 3> data_;
+ };
+
+ Vec3() : Vec3(T(0.0f)) {}
+
+ template <typename S> constexpr Vec3(S x_, S y_, S z_) : x(x_), y(y_), z(z_) {}
+ explicit Vec3(T v) : x(v), y(v), z(v) {}
+
+ template <typename S> explicit Vec3(S v) : x(v), y(v), z(v) {}
+
+ Vec3(const Vec3 &f) : x(f.x), y(f.y), z(f.z) {}
+
+ template <typename S> Vec3(const Vec3<S> &f) : x(f.x), y(f.y), z(f.z) {}
+
+ Vec3 &operator=(const Vec3 &other)
+ {
+ x = other.x;
+ y = other.y;
+ z = other.z;
+ return *this;
+ }
+
+ static constexpr int size() { return 3; }
+
+ T &operator[](size_t n) { return data_[n]; }
+ T operator[](size_t n) const { return data_[n]; }
+ bool operator==(const Vec3 &value) const
+ {
+ return data_[0] == value[0] && data_[1] == value[1] && data_[2] == value[2];
+ }
+ bool operator!=(const Vec3 &value) const { return !(this->operator==(value)); }
+};
+
+template <typename T> struct alignas(sizeof(T)) Vec2
+{
+ union {
+ struct
+ {
+ T x, y;
+ };
+ std::array<T, 2> data_;
+ };
+
+ Vec2() : Vec2(T(0.0f)) {}
+
+ template <typename S> Vec2(S x_, S y_) : x(x_), y(y_) {}
+ explicit Vec2(T v) : x(v), y(v) {}
+
+ template <typename S> explicit Vec2(S v) : x(v), y(v) {}
+
+ Vec2(const Vec2 &f) : x(f.x), y(f.y) {}
+
+ template <typename S> Vec2(const Vec2<S> &f) : x(f.x), y(f.y) {}
+
+ Vec2 &operator=(const Vec2 &other)
+ {
+ x = other.x;
+ y = other.y;
+ return *this;
+ }
+
+ bool operator==(const Vec2 &value) const { return data_[0] == value[0] && data_[1] == value[1]; }
+
+ bool operator!=(const Vec2 &value) const { return !(this->operator==(value)); }
+
+ static constexpr int size() { return 2; }
+
+ T &operator[](size_t n) { return data_[n]; }
+ T operator[](size_t n) const { return data_[n]; }
+};
+
+using float2 = Vec2<float>;
+using byte2 = Vec2<int8_t>;
+using ubyte2 = Vec2<uint8_t>;
+using short2 = Vec2<int16_t>;
+using ushort2 = Vec2<uint16_t>;
+using int2 = Vec2<int32_t>;
+using uint2 = Vec2<uint32_t>;
+
+using float3 = Vec3<float>;
+using byte3 = Vec3<int8_t>;
+using ubyte3 = Vec3<uint8_t>;
+using short3 = Vec3<int16_t>;
+using ushort3 = Vec3<uint16_t>;
+using int3 = Vec3<int32_t>;
+using uint3 = Vec3<uint32_t>;
+
+using float4 = Vec4<float>;
+using byte4 = Vec4<int8_t>;
+using ubyte4 = Vec4<uint8_t>;
+using short4 = Vec4<int16_t>;
+using ushort4 = Vec4<uint16_t>;
+using int4 = Vec4<int32_t>;
+using uint4 = Vec4<uint32_t>;
+
+} // namespace gpu_cl
+} // namespace backend
+} // namespace onert
+
+#endif // __ONERT_BACKEND_GPU_CL_OPENCL_TYPES_H__
diff --git a/runtime/onert/backend/gpu_cl/open_cl/Util.cc b/runtime/onert/backend/gpu_cl/open_cl/Util.cc
new file mode 100644
index 000000000..9f5a8388b
--- /dev/null
+++ b/runtime/onert/backend/gpu_cl/open_cl/Util.cc
@@ -0,0 +1,264 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Util.h"
+
+#include "absl/strings/str_cat.h"
+#include "absl/strings/substitute.h"
+#include "Status.h"
+namespace onert
+{
+namespace backend
+{
+namespace gpu_cl
+{
+
+std::string CLErrorCodeToString(cl_int error_code)
+{
+ switch (error_code)
+ {
+ case CL_SUCCESS:
+ return "Success";
+ case CL_DEVICE_NOT_FOUND:
+ return "Device not found";
+ case CL_DEVICE_NOT_AVAILABLE:
+ return "Device not available";
+ case CL_COMPILER_NOT_AVAILABLE:
+ return "Compiler not available";
+ case CL_MEM_OBJECT_ALLOCATION_FAILURE:
+ return "Memory object allocation failure";
+ case CL_OUT_OF_RESOURCES:
+ return "Out of resources";
+ case CL_OUT_OF_HOST_MEMORY:
+ return "Out of host memory";
+ case CL_PROFILING_INFO_NOT_AVAILABLE:
+ return "Profiling information not available";
+ case CL_MEM_COPY_OVERLAP:
+ return "Memory copy overlap";
+ case CL_IMAGE_FORMAT_MISMATCH:
+ return "Image format mismatch";
+ case CL_IMAGE_FORMAT_NOT_SUPPORTED:
+ return "Image format not supported";
+ case CL_BUILD_PROGRAM_FAILURE:
+ return "Build program failure";
+ case CL_MAP_FAILURE:
+ return "Mapping failure";
+ case CL_MISALIGNED_SUB_BUFFER_OFFSET:
+ return "Misaligned sub-buffer offset";
+ case CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST:
+ return "Execution status error for events in wait list";
+ case CL_COMPILE_PROGRAM_FAILURE:
+ return "Compile program failure";
+ case CL_LINKER_NOT_AVAILABLE:
+ return "Linker not available";
+ case CL_LINK_PROGRAM_FAILURE:
+ return "Link program failure";
+ case CL_DEVICE_PARTITION_FAILED:
+ return "Device partition failed";
+ case CL_KERNEL_ARG_INFO_NOT_AVAILABLE:
+ return "Kernel argument information not available";
+
+ case CL_INVALID_VALUE:
+ return "Invalid value";
+ case CL_INVALID_DEVICE_TYPE:
+ return "Invalid device type";
+ case CL_INVALID_PLATFORM:
+ return "Invalid platform";
+ case CL_INVALID_DEVICE:
+ return "Invalid device";
+ case CL_INVALID_CONTEXT:
+ return "Invalid context";
+ case CL_INVALID_QUEUE_PROPERTIES:
+ return "Invalid queue properties";
+ case CL_INVALID_COMMAND_QUEUE:
+ return "Invalid command queue";
+ case CL_INVALID_HOST_PTR:
+ return "Invalid host pointer";
+ case CL_INVALID_MEM_OBJECT:
+ return "Invalid memory object";
+ case CL_INVALID_IMAGE_FORMAT_DESCRIPTOR:
+ return "Invalid image format descriptor";
+ case CL_INVALID_IMAGE_SIZE:
+ return "Invalid image size";
+ case CL_INVALID_SAMPLER:
+ return "Invalid sampler";
+ case CL_INVALID_BINARY:
+ return "Invalid binary";
+ case CL_INVALID_BUILD_OPTIONS:
+ return "Invalid build options";
+ case CL_INVALID_PROGRAM:
+ return "Invalid program";
+ case CL_INVALID_PROGRAM_EXECUTABLE:
+ return "Invalid program executable";
+ case CL_INVALID_KERNEL_NAME:
+ return "Invalid kernel name";
+ case CL_INVALID_KERNEL_DEFINITION:
+ return "Invalid kernel definition";
+ case CL_INVALID_KERNEL:
+ return "Invalid kernel";
+ case CL_INVALID_ARG_INDEX:
+ return "Invalid argument index";
+ case CL_INVALID_ARG_VALUE:
+ return "Invalid argument value";
+ case CL_INVALID_ARG_SIZE:
+ return "Invalid argument size";
+ case CL_INVALID_KERNEL_ARGS:
+ return "Invalid kernel arguments";
+ case CL_INVALID_WORK_DIMENSION:
+ return "Invalid work dimension";
+ case CL_INVALID_WORK_GROUP_SIZE:
+ return "Invalid work group size";
+ case CL_INVALID_WORK_ITEM_SIZE:
+ return "Invalid work item size";
+ case CL_INVALID_GLOBAL_OFFSET:
+ return "Invalid global offset";
+ case CL_INVALID_EVENT_WAIT_LIST:
+ return "Invalid event wait list";
+ case CL_INVALID_EVENT:
+ return "Invalid event";
+ case CL_INVALID_OPERATION:
+ return "Invalid operation";
+ case CL_INVALID_GL_OBJECT:
+ return "Invalid GL object";
+ case CL_INVALID_BUFFER_SIZE:
+ return "Invalid buffer size";
+ case CL_INVALID_MIP_LEVEL:
+ return "Invalid mip-level";
+ case CL_INVALID_GLOBAL_WORK_SIZE:
+ return "Invalid global work size";
+ case CL_INVALID_PROPERTY:
+ return "Invalid property";
+ case CL_INVALID_IMAGE_DESCRIPTOR:
+ return "Invalid image descriptor";
+ case CL_INVALID_COMPILER_OPTIONS:
+ return "Invalid compiler options";
+ case CL_INVALID_LINKER_OPTIONS:
+ return "Invalid linker options";
+ case CL_INVALID_DEVICE_PARTITION_COUNT:
+ return "Invalid device partition count";
+ case CL_INVALID_PIPE_SIZE:
+ return "Invalid pipe size";
+ case CL_INVALID_DEVICE_QUEUE:
+ return "Invalid device queue";
+ case CL_INVALID_GL_SHAREGROUP_REFERENCE_KHR:
+ return "Invalid GL sharegroup reference KHR";
+
+ default:
+ return "Unknown OpenCL";
+ }
+}
+
+int ChannelTypeToSizeInBytes(cl_channel_type type)
+{
+ switch (type)
+ {
+ case CL_FLOAT:
+ return 4;
+ default:
+ return 0;
+ }
+}
+
+absl::Status CreateCLBuffer(cl_context context, int size_in_bytes, bool read_only, void *data,
+ cl_mem *result)
+{
+ cl_mem_flags flags = read_only ? CL_MEM_READ_ONLY : CL_MEM_READ_WRITE;
+ if (data)
+ {
+ flags |= CL_MEM_COPY_HOST_PTR;
+ }
+ cl_int error_code;
+ *result = clCreateBuffer(context, flags, size_in_bytes, data, &error_code);
+ if (!*result)
+ {
+ return absl::UnknownError(absl::StrCat("Failed to allocate device memory (clCreateBuffer): ",
+ CLErrorCodeToString(error_code)));
+ }
+ return absl::OkStatus();
+}
+
+cl_channel_type DataTypeToChannelType(DataType type, bool normalized)
+{
+ switch (type)
+ {
+ case DataType::FLOAT32:
+ return CL_FLOAT;
+ case DataType::INT8:
+ return normalized ? CL_SNORM_INT8 : CL_SIGNED_INT8;
+ case DataType::UINT8:
+ return normalized ? CL_UNORM_INT8 : CL_UNSIGNED_INT8;
+ case DataType::INT16:
+ return normalized ? CL_SNORM_INT16 : CL_SIGNED_INT16;
+ case DataType::UINT16:
+ return normalized ? CL_UNORM_INT16 : CL_UNSIGNED_INT16;
+ case DataType::INT32:
+ return CL_SIGNED_INT32;
+ case DataType::UINT32:
+ return CL_UNSIGNED_INT32;
+ default:
+ return CL_FLOAT;
+ }
+}
+
+absl::Status CreateRGBAImage2D(cl_context context, int width, int height,
+ cl_channel_type channel_type, void *data, cl_mem *result)
+{
+ cl_image_desc desc;
+ desc.image_type = CL_MEM_OBJECT_IMAGE2D;
+ desc.image_width = width;
+ desc.image_height = height;
+ desc.image_depth = 0;
+ desc.image_row_pitch = 0;
+ desc.image_slice_pitch = 0;
+ desc.num_mip_levels = 0;
+ desc.num_samples = 0;
+ desc.buffer = nullptr;
+
+ cl_image_format format;
+ format.image_channel_order = CL_RGBA;
+ format.image_channel_data_type = channel_type;
+
+ cl_mem_flags flags = CL_MEM_READ_WRITE;
+ if (data)
+ {
+ flags |= CL_MEM_COPY_HOST_PTR;
+ }
+
+ cl_int error_code;
+ *result = CreateImage2DLegacy(context, flags, &format, &desc, data, &error_code);
+ if (error_code != CL_SUCCESS)
+ {
+ return absl::UnknownError(absl::StrCat("Failed to create 2D texture (clCreateImage): ",
+ CLErrorCodeToString(error_code)));
+ }
+ return absl::OkStatus();
+}
+
+std::string GetXStrideCorrected(const std::string &src_x, const std::string &batch_size,
+ const std::string &stride_x, const std::string &padding_x)
+{
+ // TODO(sorokin) check perf and optimize with floor() if needed
+ // int p0 = src_x / batch_size;\n";
+ // int b0 = src_x % batch_size;\n";
+ // return p0 * stride_x * batch_size + b0 + padding_x;\n";
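+  // e.g. with src_x = "x", batch_size = "4", stride_x = "2", padding_x = "p"
+  // the returned string is "(((x) / 4) * 2 * 4 + ((x) % 4) + p)".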
+ return absl::Substitute("((($0) / $1) * $2 * $1 + (($0) % $1) + $3)", src_x, batch_size, stride_x,
+ padding_x);
+}
+
+} // namespace gpu_cl
+} // namespace backend
+} // namespace onert
diff --git a/runtime/onert/backend/gpu_cl/open_cl/Util.h b/runtime/onert/backend/gpu_cl/open_cl/Util.h
new file mode 100644
index 000000000..996c564f4
--- /dev/null
+++ b/runtime/onert/backend/gpu_cl/open_cl/Util.h
@@ -0,0 +1,278 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __ONERT_BACKEND_GPU_CL_OPENCL_UTIL_H__
+#define __ONERT_BACKEND_GPU_CL_OPENCL_UTIL_H__
+
+#include <string>
+
+#include "absl/types/span.h"
+#include "OpenclWrapper.h"
+#include "DataType.h"
+#include "InternalTensor.h"
+#include "Status.h"
+#include "Types.h"
+
+namespace onert
+{
+namespace backend
+{
+namespace gpu_cl
+{
+// Calculates correct X coordinate when stride != 1 and batch != 1 for layouts
+// with B after W (for example HWBC4) and WB stored in one axis of GPU
+// resources.
+std::string GetXStrideCorrected(const std::string &src_x, const std::string &batch_size,
+ const std::string &stride_x, const std::string &padding_x);
+
+// @param n must be non-negative
+// @param divisor must be greater than zero
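+// e.g. DivideRoundUp(10, 4) == 3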
+template <typename T, typename N> T DivideRoundUp(T n, N divisor)
+{
+ const T div = static_cast<T>(divisor);
+ const T q = n / div;
+ return n % div == 0 ? q : q + 1;
+}
+
+template <> inline uint3 DivideRoundUp(uint3 n, uint3 divisor)
+{
+ return uint3(DivideRoundUp(n.x, divisor.x), DivideRoundUp(n.y, divisor.y),
+ DivideRoundUp(n.z, divisor.z));
+}
+
+// @param number or its components must be greater than zero
+// @param n must be greater than zero
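+// e.g. AlignByN(10, 4) == 12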
+template <typename T, typename N> T AlignByN(T number, N n) { return DivideRoundUp(number, n) * n; }
+
+std::string CLErrorCodeToString(cl_int error_code);
+
+int ChannelTypeToSizeInBytes(cl_channel_type type);
+
+template <DataType S, typename T>
+void CopyLinearFLT4(const InternalTensor<Linear, S> &src, absl::Span<T> dst)
+{
+ const int dst_depth = dst.size();
+ for (int d = 0; d < dst_depth; ++d)
+ {
+ T val;
+ for (int i = 0; i < 4; ++i)
+ {
+ const int dst_ch = d * 4 + i;
+ val[i] = dst_ch >= src.shape.v ? 0.0f : src.data[dst_ch];
+ }
+ dst[d] = val;
+ }
+}
+
+absl::Status CreateCLBuffer(cl_context context, int size_in_bytes, bool read_only, void *data,
+ cl_mem *result);
+
+cl_channel_type DataTypeToChannelType(DataType type, bool normalized = false);
+absl::Status CreateRGBAImage2D(cl_context context, int width, int height,
+ cl_channel_type channel_type, void *data, cl_mem *result);
+
+template <DataType S, typename T>
+void RearrangeWeightsToOHWIOGroupI4O4(const InternalTensor<OHWI, S> &weights, int out_group_size,
+ absl::Span<T> dst)
+{
+ const int dst_slices = DivideRoundUp(weights.shape.o, 4);
+ const int src_slices = DivideRoundUp(weights.shape.i, 4);
+ const int dst_groups = DivideRoundUp(dst_slices, out_group_size);
+
+ int counter = 0;
+ for (int d = 0; d < dst_groups; ++d)
+ {
+ for (int y = 0; y < weights.shape.h; ++y)
+ {
+ for (int x = 0; x < weights.shape.w; ++x)
+ {
+ for (int s = 0; s < src_slices; ++s)
+ {
+ for (int d_group = 0; d_group < out_group_size; ++d_group)
+ {
+ for (int j = 0; j < 4; ++j)
+ {
+ T filter;
+ for (int i = 0; i < 4; ++i)
+ {
+ const int s_ch = s * 4 + j;
+ const int d_ch = (d * out_group_size + d_group) * 4 + i;
+ if (s_ch < weights.shape.i && d_ch < weights.shape.o)
+ {
+ const int f_index = weights.shape.LinearIndex({d_ch, y, x, s_ch});
+ filter[i] = weights.data[f_index];
+ }
+ else
+ {
+ filter[i] = 0.0f;
+ }
+ }
+ dst[counter++] = filter;
+ }
+ }
+ }
+ }
+ }
+ }
+}
+
+template <DataType S, typename T>
+void RearrangeWeightsToODHWIOGroupI4O4(const InternalTensor<OHWDI, S> &weights, int out_group_size,
+ absl::Span<T> dst)
+{
+ const int dst_slices = DivideRoundUp(weights.shape.o, 4);
+ const int src_slices = DivideRoundUp(weights.shape.i, 4);
+ const int dst_groups = DivideRoundUp(dst_slices, out_group_size);
+
+ int counter = 0;
+ for (int d = 0; d < dst_groups; ++d)
+ {
+ for (int z = 0; z < weights.shape.d; ++z)
+ {
+ for (int y = 0; y < weights.shape.h; ++y)
+ {
+ for (int x = 0; x < weights.shape.w; ++x)
+ {
+ for (int s = 0; s < src_slices; ++s)
+ {
+ for (int d_group = 0; d_group < out_group_size; ++d_group)
+ {
+ for (int j = 0; j < 4; ++j)
+ {
+ T filter;
+ for (int i = 0; i < 4; ++i)
+ {
+ const int s_ch = s * 4 + j;
+ const int d_ch = (d * out_group_size + d_group) * 4 + i;
+ if (s_ch < weights.shape.i && d_ch < weights.shape.o)
+ {
+ const int f_index = weights.shape.LinearIndex({d_ch, y, x, z, s_ch});
+ filter[i] = weights.data[f_index];
+ }
+ else
+ {
+ filter[i] = 0.0f;
+ }
+ }
+ dst[counter++] = filter;
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+}
+
+template <DataType S, typename T>
+void RearrangeWeightsToI4HWIOOGroupO4(const InternalTensor<OHWI, S> &weights, int out_group_size,
+ absl::Span<T> dst)
+{
+ const int dst_slices = DivideRoundUp(weights.shape.o, 4);
+ const int src_slices = DivideRoundUp(weights.shape.i, 4);
+ const int dst_groups = DivideRoundUp(dst_slices, out_group_size);
+
+ int counter = 0;
+ for (int j = 0; j < 4; ++j)
+ {
+ for (int y = 0; y < weights.shape.h; ++y)
+ {
+ for (int x = 0; x < weights.shape.w; ++x)
+ {
+ for (int s = 0; s < src_slices; ++s)
+ {
+ for (int d = 0; d < dst_groups; ++d)
+ {
+ for (int d_group = 0; d_group < out_group_size; ++d_group)
+ {
+ T filter;
+ for (int i = 0; i < 4; ++i)
+ {
+ const int s_ch = s * 4 + j;
+ const int d_ch = (d * out_group_size + d_group) * 4 + i;
+ if (s_ch < weights.shape.i && d_ch < weights.shape.o)
+ {
+ const int f_index = weights.shape.LinearIndex({d_ch, y, x, s_ch});
+ filter[i] = weights.data[f_index];
+ }
+ else
+ {
+ filter[i] = 0.0f;
+ }
+ }
+ dst[counter++] = filter;
+ }
+ }
+ }
+ }
+ }
+ }
+}
+
+template <DataType S, typename T>
+void RearrangeWeightsToI4DHWIOOGroupO4(const InternalTensor<OHWDI, S> &weights, int out_group_size,
+ absl::Span<T> dst)
+{
+ const int dst_slices = DivideRoundUp(weights.shape.o, 4);
+ const int src_slices = DivideRoundUp(weights.shape.i, 4);
+ const int dst_groups = DivideRoundUp(dst_slices, out_group_size);
+
+ int counter = 0;
+ for (int j = 0; j < 4; ++j)
+ {
+ for (int z = 0; z < weights.shape.d; ++z)
+ {
+ for (int y = 0; y < weights.shape.h; ++y)
+ {
+ for (int x = 0; x < weights.shape.w; ++x)
+ {
+ for (int s = 0; s < src_slices; ++s)
+ {
+ for (int d = 0; d < dst_groups; ++d)
+ {
+ for (int d_group = 0; d_group < out_group_size; ++d_group)
+ {
+ T filter;
+ for (int i = 0; i < 4; ++i)
+ {
+ const int s_ch = s * 4 + j;
+ const int d_ch = (d * out_group_size + d_group) * 4 + i;
+ if (s_ch < weights.shape.i && d_ch < weights.shape.o)
+ {
+ const int f_index = weights.shape.LinearIndex({d_ch, y, x, z, s_ch});
+ filter[i] = weights.data[f_index];
+ }
+ else
+ {
+ filter[i] = 0.0f;
+ }
+ }
+ dst[counter++] = filter;
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+}
+
+} // namespace gpu_cl
+} // namespace backend
+} // namespace onert
+
+#endif // __ONERT_BACKEND_GPU_CL_OPENCL_UTIL_H__
diff --git a/runtime/onert/backend/gpu_cl/open_cl/WinogradUtil.cc b/runtime/onert/backend/gpu_cl/open_cl/WinogradUtil.cc
new file mode 100644
index 000000000..5f1103ad9
--- /dev/null
+++ b/runtime/onert/backend/gpu_cl/open_cl/WinogradUtil.cc
@@ -0,0 +1,178 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "open_cl/WinogradUtil.h"
+
+#include <cmath>
+#include <vector>
+
+#include "open_cl/DataType.h"
+#include "open_cl/Shape.h"
+#include "open_cl/Tensor.h"
+
+namespace onert
+{
+namespace backend
+{
+namespace
+{
+// Matrices for Winograd transformations were computed with the method described
+// here https://openreview.net/pdf?id=H1ZaRZVKg
+std::vector<float> GetTransposedMatrixForWinograd(int width, int height)
+{
+ const float kDelta = std::sqrt(2.0f) / 2.0f;
+ std::vector<float> px(width);
+
+ px[0] = 0.0f;
+ const int points_count = (width - 1) / 2;
+ for (int i = 0; i < points_count; ++i)
+ {
+ px[i * 2 + 1] = kDelta * (i + 1.0f);
+ px[i * 2 + 2] = -kDelta * (i + 1.0f);
+ }
+ px[width - 1] = 1.0f;
+
+ std::vector<float> py(width, 1.0f);
+ py[width - 1] = 0.0f;
+
+ std::vector<float> result(height * width);
+ for (int y = 0; y < width; ++y)
+ {
+ for (int x = 0; x < height; ++x)
+ {
+ result[x * width + y] = std::pow(px[y], 1.0f * x) * std::pow(py[y], (height - 1.0f) - x);
+ }
+ }
+ return result;
+}
+
+std::vector<float> GetInversedMatrixForWinograd(int rank)
+{
+ auto matrix = GetTransposedMatrixForWinograd(rank, rank);
+ std::vector<float> inverted(rank * rank, 0.0f);
+ for (int i = 0; i < rank; ++i)
+ {
+ inverted[i * rank + i] = 1.0f;
+ }
+
+ for (int i = 1; i < rank - 1; ++i)
+ {
+ float inv_t = 1.0f / matrix[i * rank + i];
+ for (int x = i; x < rank; ++x)
+ {
+ matrix[i * rank + x] *= inv_t;
+ }
+ for (int x = 0; x < rank; ++x)
+ {
+ inverted[i * rank + x] *= inv_t;
+ }
+
+ for (int y = 0; y < rank; ++y)
+ {
+ if (y == i)
+ continue;
+ float t = matrix[y * rank + i];
+ for (int x = i; x < rank; ++x)
+ {
+ matrix[y * rank + x] -= t * matrix[i * rank + x];
+ }
+ for (int x = 0; x < rank; ++x)
+ {
+ inverted[y * rank + x] -= t * inverted[i * rank + x];
+ }
+ }
+ }
+
+ return inverted;
+}
+
+std::vector<float> Multiply(const std::vector<float> &a_mat, const std::vector<float> &b_mat, int m,
+ int n, int k)
+{
+ std::vector<float> result(m * k);
+ for (int y = 0; y < m; ++y)
+ {
+ for (int x = 0; x < k; ++x)
+ {
+ float sum = 0.0f;
+ for (int i = 0; i < n; ++i)
+ {
+ sum += a_mat[y * n + i] * b_mat[i * k + x];
+ }
+ result[y * k + x] = sum;
+ }
+ }
+ return result;
+}
+} // namespace
+
+std::vector<float> AtMatrixForWinograd4x4To6x6() { return GetTransposedMatrixForWinograd(6, 4); }
+
+std::vector<float> BtMatrixForWinograd4x4To6x6() { return GetInversedMatrixForWinograd(6); }
+
+void RearrangeWeightsToWinograd4x4To6x6Weights(
+ const gpu_cl::InternalTensor<gpu_cl::OHWI, gpu_cl::DataType::FLOAT32> &src_weights,
+ gpu_cl::InternalTensor<gpu_cl::OHWI, gpu_cl::DataType::FLOAT32> *dst_weights)
+{
+ gpu_cl::OHWI dst_shape;
+ dst_shape.o = src_weights.shape.o;
+ dst_shape.h = 6;
+ dst_shape.w = 6;
+ dst_shape.i = src_weights.shape.i;
+ dst_weights->shape = dst_shape;
+ dst_weights->data.resize(dst_shape.DimensionsProduct());
+
+ auto gt_mat = GetTransposedMatrixForWinograd(6, 3);
+ std::vector<float> g_mat(gt_mat.size());
+ for (int y = 0; y < 3; ++y)
+ {
+ for (int x = 0; x < 6; ++x)
+ {
+ g_mat[x * 3 + y] = gt_mat[y * 6 + x];
+ }
+ }
+
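+  // Each 3x3 filter W is transformed into a 6x6 filter as G * W * G^T, where
+  // G is the 6x3 Winograd transform matrix (g_mat) and G^T is gt_mat.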
+ for (int d = 0; d < src_weights.shape.o; ++d)
+ {
+ for (int s = 0; s < src_weights.shape.i; ++s)
+ {
+ std::vector<float> in_vals(9);
+ for (int y = 0; y < 3; ++y)
+ {
+ for (int x = 0; x < 3; ++x)
+ {
+ const int f_index = src_weights.shape.LinearIndex({d, y, x, s});
+ in_vals[y * 3 + x] = src_weights.data[f_index];
+ }
+ }
+
+ auto temp_vals = Multiply(g_mat, in_vals, 6, 3, 3);
+ auto out_vals = Multiply(temp_vals, gt_mat, 6, 3, 6);
+ for (int y = 0; y < 6; ++y)
+ {
+ for (int x = 0; x < 6; ++x)
+ {
+ const int f_index = dst_shape.LinearIndex({d, y, x, s});
+ dst_weights->data[f_index] = out_vals[y * 6 + x];
+ }
+ }
+ }
+ }
+}
+
+} // namespace backend
+} // namespace onert
diff --git a/runtime/onert/backend/gpu_cl/open_cl/WinogradUtil.h b/runtime/onert/backend/gpu_cl/open_cl/WinogradUtil.h
new file mode 100644
index 000000000..32e21760d
--- /dev/null
+++ b/runtime/onert/backend/gpu_cl/open_cl/WinogradUtil.h
@@ -0,0 +1,48 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __ONERT_BACKEND_GPU_CL_OPENCL_WINOGRAD_UTIL_H__
+#define __ONERT_BACKEND_GPU_CL_OPENCL_WINOGRAD_UTIL_H__
+
+#include <vector>
+
+#include "open_cl/DataType.h"
+#include "open_cl/Shape.h"
+#include "open_cl/InternalTensor.h"
+
+namespace onert
+{
+namespace backend
+{
+
+// Matrices for Winograd transformations were computed with the method described here
+// https://openreview.net/pdf?id=H1ZaRZVKg
+
+// returns the transposed A matrix (6 x 4) as an array (24 values) for Winograd4x4To6x6
+std::vector<float> AtMatrixForWinograd4x4To6x6();
+
+// returns the transposed B matrix (6 x 6) as an array (36 values) for Winograd4x4To6x6
+std::vector<float> BtMatrixForWinograd4x4To6x6();
+
+void RearrangeWeightsToWinograd4x4To6x6Weights(
+ const gpu_cl::InternalTensor<gpu_cl::OHWI, gpu_cl::DataType::FLOAT32> &src_weights,
+ gpu_cl::InternalTensor<gpu_cl::OHWI, gpu_cl::DataType::FLOAT32> *dst_weights);
+
+} // namespace backend
+} // namespace onert
+
+#endif // __ONERT_BACKEND_GPU_CL_OPENCL_WINOGRAD_UTIL_H__
diff --git a/runtime/onert/backend/gpu_cl/open_cl/WorkgroupSelection.cc b/runtime/onert/backend/gpu_cl/open_cl/WorkgroupSelection.cc
new file mode 100644
index 000000000..847c2a2aa
--- /dev/null
+++ b/runtime/onert/backend/gpu_cl/open_cl/WorkgroupSelection.cc
@@ -0,0 +1,258 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "WorkgroupSelection.h"
+
+#include <math.h>
+
+#include <set>
+#include <vector>
+
+#include "Util.h"
+
+namespace onert
+{
+namespace backend
+{
+namespace gpu_cl
+{
+
+namespace
+{
+
+template <typename T>
+void AddCornerCases(const T &grid, int max_work_group_total_size, const T &max_work_group_sizes,
+ WorkGroupSizeAlignment x_alignment, WorkGroupSizeAlignment y_alignment,
+ WorkGroupSizeAlignment z_alignment, std::vector<T> *work_groups)
+{
+ for (int x = 1; x <= 4; ++x)
+ {
+ for (int y = 1; y <= 4; ++y)
+ {
+ for (int z = 1; z <= 4; ++z)
+ {
+ u_int32_t wg_x = DivideRoundUp(grid.x, x);
+ u_int32_t wg_y = DivideRoundUp(grid.y, y);
+ u_int32_t wg_z = DivideRoundUp(grid.z, z);
+ if (wg_x > static_cast<u_int32_t>(max_work_group_sizes.x) ||
+ wg_y > static_cast<u_int32_t>(max_work_group_sizes.y) ||
+ wg_z > static_cast<u_int32_t>(max_work_group_sizes.z) ||
+ wg_x * wg_y * wg_z > static_cast<u_int32_t>(max_work_group_total_size))
+ {
+ continue;
+ }
+ if (x_alignment == WorkGroupSizeAlignment::PRECISE && grid.x % wg_x != 0)
+ {
+ continue;
+ }
+ if (y_alignment == WorkGroupSizeAlignment::PRECISE && grid.y % wg_y != 0)
+ {
+ continue;
+ }
+ if (z_alignment == WorkGroupSizeAlignment::PRECISE && grid.z % wg_z != 0)
+ {
+ continue;
+ }
+ work_groups->push_back({wg_x, wg_y, wg_z});
+ }
+ }
+ }
+
+  // this always adds at least {1, 1, 1}.
+ for (u_int32_t x = 1; x <= 4; ++x)
+ {
+ for (u_int32_t y = 1; y <= 4; ++y)
+ {
+ for (u_int32_t z = 1; z <= 4; ++z)
+ {
+ if (x > static_cast<u_int32_t>(max_work_group_sizes.x) ||
+ y > static_cast<u_int32_t>(max_work_group_sizes.y) ||
+ z > static_cast<u_int32_t>(max_work_group_sizes.z) ||
+ x * y * z > static_cast<u_int32_t>(max_work_group_total_size))
+ {
+ continue;
+ }
+ if (x_alignment == WorkGroupSizeAlignment::PRECISE && grid.x % x != 0)
+ {
+ continue;
+ }
+ if (y_alignment == WorkGroupSizeAlignment::PRECISE && grid.y % y != 0)
+ {
+ continue;
+ }
+ if (z_alignment == WorkGroupSizeAlignment::PRECISE && grid.z % z != 0)
+ {
+ continue;
+ }
+ work_groups->push_back({x, y, z});
+ }
+ }
+ }
+}
+
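+// Returns all divisors of `number` (unordered), e.g. GetDivisors(12) -> {1, 12, 2, 6, 3, 4}.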
+std::vector<int> GetDivisors(int number)
+{
+ const int max_divisor = static_cast<int>(sqrt(number));
+ std::vector<int> divisors;
+  // we don't know the number of divisors, so this is just a heuristic.
+ divisors.reserve(max_divisor / 3 + 1);
+ for (int i = 1; i <= max_divisor; ++i)
+ {
+ const int d = number / i;
+ if (i * d == number)
+ {
+ divisors.push_back(i);
+ if (d != i)
+ {
+ divisors.push_back(d);
+ }
+ }
+ }
+ return divisors;
+}
+
+std::vector<int> GetDivisorsForRange(int number, int range)
+{
+ const int last_number = number + range;
+ const int max_divisor = static_cast<int>(sqrt(last_number));
+ std::set<int> divisors;
+ for (int i = 1; i <= max_divisor; ++i)
+ {
+    const int remainder = number % i;
+    // iterate through the numbers in our range that are divisible by i
+    const int first_number = number + (i - remainder) % i;
+ if (first_number <= last_number)
+ {
+ divisors.insert(i);
+ }
+ for (int j = first_number; j <= last_number; j += i)
+ {
+ const int d = j / i;
+ if (d != i)
+ {
+ divisors.insert(d);
+ }
+ }
+ }
+ return std::vector<int>(divisors.begin(), divisors.end());
+}
+
+} // namespace
+
+std::vector<int> GetPossibleSizes(int number, WorkGroupSizeAlignment z_alignment)
+{
+ if (z_alignment == WorkGroupSizeAlignment::PRECISE)
+ {
+    // as potential sizes we use only sizes that cover the grid precisely:
+    // work group size * k (k is an integer) == grid_size
+ return GetDivisors(number);
+ }
+ else
+ {
+    // when choosing a work group size we may also use sizes where
+    // work group size * k (k is an integer) != grid_size (slightly bigger),
+    // so this heuristic looks for potential sizes that satisfy
+    // both of these: work group size * k (k is an integer) <= grid_size + 5
+    // and:           work group size * k (k is an integer) >= grid_size
+ return GetDivisorsForRange(number, 5);
+ }
+}
+
+template <typename T>
+std::vector<T>
+GenerateWorkGroupSizes(const T &grid, int min_work_group_total_size, int max_work_group_total_size,
+ const T &max_work_group_sizes, WorkGroupSizeAlignment x_alignment,
+ WorkGroupSizeAlignment y_alignment, WorkGroupSizeAlignment z_alignment)
+{
+ std::vector<T> work_groups;
+ work_groups.reserve(64);
+
+ std::vector<int> sizes_x = GetPossibleSizes(grid.x, x_alignment);
+ std::vector<int> sizes_y = GetPossibleSizes(grid.y, y_alignment);
+ std::vector<int> sizes_z = GetPossibleSizes(grid.z, z_alignment);
+
+ for (auto x : sizes_x)
+ {
+ if (static_cast<int>(x) > static_cast<int>(max_work_group_sizes.x))
+ continue;
+ for (auto y : sizes_y)
+ {
+ if (static_cast<int>(y) > static_cast<int>(max_work_group_sizes.y))
+ continue;
+ for (auto z : sizes_z)
+ {
+ if (static_cast<int>(z) > static_cast<int>(max_work_group_sizes.z))
+ continue;
+ const int work_group_size = x * y * z;
+ if (work_group_size < min_work_group_total_size ||
+ work_group_size > max_work_group_total_size)
+ continue;
+ work_groups.push_back({x, y, z});
+ }
+ }
+ }
+
+ return work_groups;
+}
+
+// Specializations of GenerateWorkGroupSizes for int3 and uint3
+
+template std::vector<int3> GenerateWorkGroupSizes(const int3 &grid, int min_work_group_total_size,
+ int max_work_group_total_size,
+ const int3 &max_work_group_sizes,
+ WorkGroupSizeAlignment x_alignment,
+ WorkGroupSizeAlignment y_alignment,
+ WorkGroupSizeAlignment z_alignment);
+
+template std::vector<uint3> GenerateWorkGroupSizes(const uint3 &grid, int min_work_group_total_size,
+ int max_work_group_total_size,
+ const uint3 &max_work_group_sizes,
+ WorkGroupSizeAlignment x_alignment,
+ WorkGroupSizeAlignment y_alignment,
+ WorkGroupSizeAlignment z_alignment);
+
+template <typename T>
+void GenerateWorkGroupSizesAlignedToGrid(const T &grid, const T &max_work_group_size,
+ const int max_work_group_invocations,
+ std::vector<T> *work_groups)
+{
+ auto alignment = WorkGroupSizeAlignment::PRECISE;
+ *work_groups =
+ GenerateWorkGroupSizes<T>(grid, /*min_work_group_total_size = */ 32, max_work_group_invocations,
+ max_work_group_size, alignment, alignment, alignment);
+  // If the grid parameter is too small, GenerateWorkGroupSizes above may not produce any work groups.
+ if (work_groups->empty())
+ {
+ AddCornerCases(grid, max_work_group_invocations, max_work_group_size, alignment, alignment,
+ alignment, work_groups);
+ }
+}
+
+// Specializations of GenerateWorkGroupSizesAlignedToGrid for int3 and uint3
+
+template void GenerateWorkGroupSizesAlignedToGrid(const int3 &grid, const int3 &max_work_group_size,
+ const int max_work_group_invocations,
+ std::vector<int3> *work_groups);
+
+template void GenerateWorkGroupSizesAlignedToGrid(const uint3 &grid,
+ const uint3 &max_work_group_size,
+ const int max_work_group_invocations,
+ std::vector<uint3> *work_groups);
+
+} // namespace gpu_cl
+} // namespace backend
+} // namespace onert
diff --git a/runtime/onert/backend/gpu_cl/open_cl/WorkgroupSelection.h b/runtime/onert/backend/gpu_cl/open_cl/WorkgroupSelection.h
new file mode 100644
index 000000000..b0702ac7c
--- /dev/null
+++ b/runtime/onert/backend/gpu_cl/open_cl/WorkgroupSelection.h
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __ONERT_BACKEND_GPU_CL_OPENCL_WORK_GROUP_SELECTION_H__
+#define __ONERT_BACKEND_GPU_CL_OPENCL_WORK_GROUP_SELECTION_H__
+
+#include <vector>
+
+namespace onert
+{
+namespace backend
+{
+namespace gpu_cl
+{
+
+// PRECISE assumes that WorkGroupSize * k = GridSize;
+// NO_ALIGNMENT imposes no restrictions.
+// We need PRECISE when the kernel has no boundary checks;
+// if it has the checks, either PRECISE or NO_ALIGNMENT can be used.
+enum class WorkGroupSizeAlignment
+{
+ PRECISE,
+ NO_ALIGNMENT
+};
+
+std::vector<int> GetPossibleSizes(int number, WorkGroupSizeAlignment z_alignment);
+
+// Specializations exist for int3 and uint3 in the .cc file
+
+template <typename T>
+std::vector<T>
+GenerateWorkGroupSizes(const T &grid, int min_work_group_total_size, int max_work_group_total_size,
+ const T &max_work_group_sizes, WorkGroupSizeAlignment x_alignment,
+ WorkGroupSizeAlignment y_alignment, WorkGroupSizeAlignment z_alignment);
+
+template <typename T>
+void GenerateWorkGroupSizesAlignedToGrid(const T &grid, const T &max_work_group_size,
+ const int max_work_group_invocations,
+ std::vector<T> *work_groups);
+
+} // namespace gpu_cl
+} // namespace backend
+} // namespace onert
+
+#endif // __ONERT_BACKEND_GPU_CL_OPENCL_WORK_GROUP_SELECTION_H__
diff --git a/runtime/onert/backend/gpu_cl/open_cl/kernels/Add.cc b/runtime/onert/backend/gpu_cl/open_cl/kernels/Add.cc
new file mode 100644
index 000000000..09100fe1f
--- /dev/null
+++ b/runtime/onert/backend/gpu_cl/open_cl/kernels/Add.cc
@@ -0,0 +1,64 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Add.h"
+
+#include <cstring>
+#include <string>
+
+#include "absl/strings/str_cat.h"
+#include "Util.h"
+#include "open_cl/Util.h"
+
+namespace onert
+{
+namespace backend
+{
+namespace gpu_cl
+{
+
+GPUOperation CreateAdd(const OperationDef &definition, const std::vector<int> &channels,
+ int dst_channels)
+{
+ GPUOperation add(definition);
+ int dst_depth = DivideRoundUp(dst_channels, 4);
+ int src0_depth = DivideRoundUp(channels[0], 4);
+ add.elementwise_ = true;
+ add.linkable_ = dst_depth == src0_depth;
+ if (src0_depth < dst_depth)
+ {
+ add.check_src_channels_size_ = true;
+ }
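+  // For each extra input this emits a guarded accumulation into in_out_value,
+  // e.g. for i == 1:
+  //   if (S_COORD < args.src_data_1.Slices()) {
+  //     in_out_value += args.src_data_1.Read(X_COORD, Y_COORD, S_COORD);
+  //   }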
+ for (uint32_t i = 1; i < definition.src_tensors.size(); ++i)
+ {
+ const std::string tensor_name = absl::StrCat("src_data_", i);
+ auto src_desc = definition.src_tensors[i];
+ if (definition.IsBatchSupported())
+ {
+ src_desc.SetStateVar("BatchedWidth", "true");
+ }
+ add.AddSrcTensor(tensor_name, src_desc);
+ add.code_ += "if (S_COORD < args." + tensor_name + ".Slices()) {\n";
+ add.code_ += " in_out_value += args." + tensor_name + ".Read(X_COORD, Y_COORD, S_COORD);\n";
+ add.code_ += "}\n";
+ }
+ return add;
+}
+
+} // namespace gpu_cl
+} // namespace backend
+} // namespace onert
diff --git a/runtime/onert/backend/gpu_cl/open_cl/kernels/Add.h b/runtime/onert/backend/gpu_cl/open_cl/kernels/Add.h
new file mode 100644
index 000000000..2335a901c
--- /dev/null
+++ b/runtime/onert/backend/gpu_cl/open_cl/kernels/Add.h
@@ -0,0 +1,43 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __ONERT_BACKEND_GPU_CL_OPENCL_KERNELS_ADD_H__
+#define __ONERT_BACKEND_GPU_CL_OPENCL_KERNELS_ADD_H__
+
+#include <string>
+#include <vector>
+
+#include "GpuOperation.h"
+#include "open_cl/Operations.h"
+
+namespace onert
+{
+namespace backend
+{
+namespace gpu_cl
+{
+
+// The Add operation supports input tensors with unequal channel counts (which
+// makes it possible to remove a Padding operation that pads the channels
+// dimension with zeroes).
+GPUOperation CreateAdd(const OperationDef &definition, const std::vector<int> &channels,
+ int dst_channels);
+
+} // namespace gpu_cl
+} // namespace backend
+} // namespace onert
+
+#endif // __ONERT_BACKEND_GPU_CL_OPENCL_KERNELS_ADD_H__
diff --git a/runtime/onert/backend/gpu_cl/open_cl/kernels/ConvBuffer1x1.cc b/runtime/onert/backend/gpu_cl/open_cl/kernels/ConvBuffer1x1.cc
new file mode 100644
index 000000000..1b9014fdf
--- /dev/null
+++ b/runtime/onert/backend/gpu_cl/open_cl/kernels/ConvBuffer1x1.cc
@@ -0,0 +1,480 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "open_cl/kernels/ConvBuffer1x1.h"
+
+#include <array>
+#include <string>
+#include <utility>
+
+#include "open_cl/ClDevice.h"
+#include "open_cl/kernels/Util.h"
+#include "open_cl/kernels/WorkGroupPicking.h"
+#include "open_cl/Precision.h"
+#include "open_cl/TensorType.h"
+#include "open_cl/Status.h"
+
+namespace onert
+{
+namespace backend
+{
+namespace gpu_cl
+{
+namespace
+{
+
+// element_size must be 1, 2 or 4
+// 1 - is FLT4
+// 2 - is FLT8
+// 4 - is FLT16
+// This function generates code for the arithmetic part of the convolution.
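+// For block_size (1, 1, 1), element_size 1 and F32 precision it emits, e.g.:
+//   FLT16 W0 = weights_cache[0];
+//   r000 += W0.s0123 * s0.s0;
+//   r000 += W0.s4567 * s0.s1;
+//   r000 += W0.s89ab * s0.s2;
+//   r000 += W0.scdef * s0.s3;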
+std::string GetComputationPart(const int3 &block_size, int element_size,
+ CalculationsPrecision precision)
+{
+ const std::string hexes[16] = {"0", "1", "2", "3", "4", "5", "6", "7",
+ "8", "9", "a", "b", "c", "d", "e", "f"};
+ std::string c;
+ for (int z = 0; z < block_size.z; ++z)
+ {
+ const std::string z_s = std::to_string(z);
+ c += " FLT16 W" + z_s + " = weights_cache[" + z_s + "];\n";
+ for (int y = 0; y < block_size.y; ++y)
+ {
+ for (int x = 0; x < block_size.x; ++x)
+ {
+ std::string s_index = std::to_string(y * block_size.x + x);
+ for (int e = 0; e < element_size; ++e)
+ {
+ std::string r_index = z_s + std::to_string(y) + std::to_string(x * element_size + e);
+ const std::string f0 = "W" + z_s + ".s0123";
+ const std::string f1 = "W" + z_s + ".s4567";
+ const std::string f2 = "W" + z_s + ".s89ab";
+ const std::string f3 = "W" + z_s + ".scdef";
+ switch (precision)
+ {
+ case CalculationsPrecision::F32:
+ case CalculationsPrecision::F16:
+ c += " r" + r_index + " += " + f0 + " * s" + s_index + ".s" + hexes[e * 4 + 0] +
+ ";\n";
+ c += " r" + r_index + " += " + f1 + " * s" + s_index + ".s" + hexes[e * 4 + 1] +
+ ";\n";
+ c += " r" + r_index + " += " + f2 + " * s" + s_index + ".s" + hexes[e * 4 + 2] +
+ ";\n";
+ c += " r" + r_index + " += " + f3 + " * s" + s_index + ".s" + hexes[e * 4 + 3] +
+ ";\n";
+ break;
+ case CalculationsPrecision::F32_F16:
+ c += " r" + r_index + " += convert_float4(" + f0 + " * s" + s_index + ".s" +
+ hexes[e * 4 + 0] + " + " + f1 + " * s" + s_index + ".s" + hexes[e * 4 + 1] +
+ " + " + f2 + " * s" + s_index + ".s" + hexes[e * 4 + 2] + " + " + f3 + " * s" +
+ s_index + ".s" + hexes[e * 4 + 3] + ");\n";
+ break;
+ }
+ }
+ }
+ }
+ }
+ return c;
+}
+
+ConvBuffer1x1::ConvParams GetBestParams(const DeviceInfo &device_info,
+ const OperationDef &definition, const BHWC &shape, int,
+ int dst_depth)
+{
+ ConvBuffer1x1::ConvParams conv_params;
+ conv_params.element_size = 4;
+ conv_params.block_size = int3(1, 1, 1);
+ if (!device_info.IsMali())
+ {
+ return conv_params;
+ }
+ bool can_use_flt8 =
+ (shape.w * shape.b) % 2 == 0 && definition.precision != CalculationsPrecision::F32;
+ bool is_midgard = device_info.IsMali() && device_info.mali_info.IsMidgard();
+ if (is_midgard)
+ {
+ if (can_use_flt8)
+ {
+ conv_params.element_size = 8;
+ }
+ if (definition.precision == CalculationsPrecision::F16 || !can_use_flt8)
+ {
+ conv_params.block_size.x = 2;
+ }
+ return conv_params;
+ }
+
+ int task_size = shape.w * shape.b * shape.h * dst_depth;
+ int block_size = GetRecommendedBlockSizeForConv(device_info, definition.precision, task_size);
+
+ if (!can_use_flt8 && block_size > 4)
+ {
+ block_size = 4;
+ }
+
+ if (can_use_flt8 && block_size >= 2)
+ {
+ conv_params.element_size = 8;
+ block_size /= 2;
+ }
+ if (block_size == 4)
+ {
+ conv_params.block_size.x = 2;
+ if (definition.precision == CalculationsPrecision::F32 && dst_depth < 32)
+ {
+ conv_params.block_size.y = 2;
+ }
+ else
+ {
+ conv_params.block_size.z = 2;
+ }
+ }
+ else if (block_size == 2)
+ {
+ if (dst_depth >= 32)
+ {
+ conv_params.block_size.z = 2;
+ }
+ else
+ {
+ conv_params.block_size.x = 2;
+ }
+ }
+
+ return conv_params;
+}
+
+ConvBuffer1x1::ConvParams GetBestParams(const DeviceInfo &device_info,
+ const OperationDef &definition, int, int)
+{
+ ConvBuffer1x1::ConvParams conv_params;
+ conv_params.element_size = 4;
+ conv_params.block_size = int3(1, 1, 1);
+ if (device_info.IsMali() && definition.precision == CalculationsPrecision::F16 &&
+ device_info.compute_units_count <= 4)
+ {
+ conv_params.block_size.x *= 2;
+ }
+ return conv_params;
+}
+
+} // namespace
+
+ConvBuffer1x1::ConvBuffer1x1(const OperationDef &definition, const ConvParams &conv_params)
+ : GPUOperation(definition), conv_params_(conv_params)
+{
+ code_ = GenerateConvBuffer1x1(definition_, conv_params_, &args_);
+ work_group_size_ = int3(2, 4, 1);
+}
+
+ConvBuffer1x1::ConvBuffer1x1(ConvBuffer1x1 &&operation)
+ : GPUOperation(std::move(operation)), conv_params_(std::move(operation.conv_params_))
+{
+}
+
+ConvBuffer1x1 &ConvBuffer1x1::operator=(ConvBuffer1x1 &&operation)
+{
+ if (this != &operation)
+ {
+ std::swap(conv_params_, operation.conv_params_);
+ GPUOperation::operator=(std::move(operation));
+ }
+ return *this;
+}
+
+std::string ConvBuffer1x1::GenerateConvBuffer1x1(const OperationDef &op_def,
+ const ConvBuffer1x1::ConvParams &conv_params,
+ Arguments *)
+{
+ auto src_desc = op_def.src_tensors[0];
+ if (op_def.IsBatchSupported())
+ {
+ src_desc.SetStateVar("BatchedWidth", "true");
+ }
+ if (conv_params_.element_size == 8)
+ {
+ src_desc.SetStateVar("ElementsX2", "true");
+ }
+ else if (conv_params_.element_size == 16)
+ {
+ src_desc.SetStateVar("ElementsX4", "true");
+ }
+ AddSrcTensor("src_tensor", src_desc);
+ if (op_def.src_tensors.size() == 2)
+ {
+ // dynamic weights
+ BufferDescriptor desc;
+ desc.element_type = op_def.src_tensors[1].data_type;
+ desc.element_size = 16;
+ desc.memory_type = MemoryType::GLOBAL;
+ AddSrcBuffer("weights", desc);
+ }
+
+ auto dst_desc = op_def.dst_tensors[0];
+ if (op_def.IsBatchSupported())
+ {
+ dst_desc.SetStateVar("BatchedWidth", "true");
+ }
+ AddDstTensor("dst_tensor", dst_desc);
+
+ std::string c = GetCommonDefines(op_def.precision);
+ switch (op_def.precision)
+ {
+ case CalculationsPrecision::F32:
+ c += "#define FLT8 float8\n";
+ c += "#define FLT16 float16\n";
+ break;
+ case CalculationsPrecision::F32_F16:
+ case CalculationsPrecision::F16:
+ c += "#define FLT8 half8\n";
+ c += "#define FLT16 half16\n";
+ break;
+ }
+
+ const int3 block_size = conv_params.block_size;
+ const int element_size = conv_params.element_size / 4;
+
+ c += "__kernel void main_function(\n";
+ c += "$0) {\n";
+ c += " int X = get_global_id(0) * " + std::to_string(block_size.x * element_size) + ";\n";
+ c += " int X_SRC = get_global_id(0) * " + std::to_string(block_size.x) + ";\n";
+ c += " int Y = get_global_id(1) * " + std::to_string(block_size.y) + ";\n";
+ c += " int Z = get_global_id(2) * " + std::to_string(block_size.z) + ";\n";
+ c += " if (X >= args.dst_tensor.Width() || Y >= args.dst_tensor.Height() || "
+ "Z >= args.dst_tensor.Slices()) return;\n";
+ if (conv_params.different_weights_for_height)
+ {
+ c += " __global FLT16* weights_cache = args.weights.GetPtr() + (Z * "
+ "args.src_tensor.Height() + "
+ "Y * " +
+ std::to_string(block_size.z) +
+ ") * "
+ "args.src_tensor.Slices();\n";
+ }
+ else
+ {
+ c += " __global FLT16* weights_cache = args.weights.GetPtr() + Z * "
+ "args.src_tensor.Slices();\n";
+ }
+ for (int z = 0; z < block_size.z; ++z)
+ {
+ const std::string z_s = std::to_string(z);
+ c += " ACCUM_FLT4 bias_val_" + z_s + " = TO_ACCUM_TYPE(args.biases.Read(Z + " + z_s + "));\n";
+ for (int y = 0; y < block_size.y; ++y)
+ {
+ for (int x = 0; x < block_size.x * element_size; ++x)
+ {
+ c += " ACCUM_FLT4 r" + z_s + std::to_string(y) + std::to_string(x) + " = bias_val_" + z_s +
+ ";\n";
+ }
+ }
+ }
+ for (int x = 0; x < block_size.x; ++x)
+ {
+ std::string x_s = std::to_string(x);
+ c += " int xc" + x_s + " = min(X_SRC + " + std::to_string(x) +
+ ", args.src_tensor.Width() - 1);\n";
+ }
+ for (int y = 0; y < block_size.y; ++y)
+ {
+ std::string y_s = std::to_string(y);
+ c += " int yc" + y_s + " = min(Y + " + y_s + ", args.src_tensor.Height() - 1);\n";
+ }
+ for (int y = 0; y < block_size.y; ++y)
+ {
+ std::string y_s = std::to_string(y);
+ for (int x = 0; x < block_size.x; ++x)
+ {
+ std::string x_s = std::to_string(x);
+ std::string i_s = std::to_string(y * block_size.x + x);
+ c += " int src_addr_" + i_s + " = (yc" + y_s + ") * args.src_tensor.Width() + (xc" + x_s +
+ ");\n";
+ }
+ }
+ c += " for (int s = 0; s < args.src_tensor.Slices(); ++s) {\n";
+ for (int y = 0; y < block_size.y; ++y)
+ {
+ std::string y_s = std::to_string(y);
+ for (int x = 0; x < block_size.x; ++x)
+ {
+ std::string x_s = std::to_string(x);
+ std::string i_s = std::to_string(y * block_size.x + x);
+ c += " FLT" + std::to_string(element_size * 4) + " s" + i_s +
+ " = args.src_tensor.Read(src_addr_" + i_s + ");\n";
+ }
+ }
+ c += GetComputationPart(block_size, element_size, op_def.precision);
+ for (int i = 0; i < block_size.x * block_size.y; ++i)
+ {
+ std::string i_s = std::to_string(i);
+ c += " src_addr_" + i_s + " += args.src_tensor.SliceStride();\n";
+ }
+ c += " weights_cache += " + std::to_string(block_size.z) + ";\n";
+ c += " }\n"; // SRC_SLICES
+
+ for (int z = 0; z < block_size.z; ++z)
+ {
+ const std::string z_s = std::to_string(z);
+ if (z != 0)
+ {
+ c += " if (Z + " + z_s + " >= args.dst_tensor.Slices()) return;\n";
+ }
+ for (int y = 0; y < block_size.y; ++y)
+ {
+ const std::string y_s = std::to_string(y);
+ for (int x = 0; x < block_size.x * element_size; ++x)
+ {
+ const std::string x_s = std::to_string(x);
+ c += " if (X + " + x_s + " < args.dst_tensor.Width() && Y + " + y_s +
+ " < args.dst_tensor.Height()) {\n";
+ c += " FLT4 res = TO_FLT4(r" + z_s + y_s + x_s + ");\n";
+ c += " args.dst_tensor.Write(res, X + " + x_s + ", Y + " + y_s + ", Z + " + z_s + ");\n";
+ c += " }\n";
+ }
+ }
+ }
+ c += "}\n";
+ return c;
+}
+
+int3 ConvBuffer1x1::GetGridSize() const
+{
+ const int dst_width_elements =
+ DivideRoundUp(dst_[0]->Width() * dst_[0]->Batch(), (conv_params_.element_size / 4));
+ const int grid_x = DivideRoundUp(dst_width_elements, conv_params_.block_size.x);
+ const int grid_y = DivideRoundUp(dst_[0]->Height(), conv_params_.block_size.y);
+ const int grid_z = DivideRoundUp(dst_[0]->Slices(), conv_params_.block_size.z);
+ return int3(grid_x, grid_y, grid_z);
+}
+
+void ConvBuffer1x1::GetPossibleKernelWorkGroups(TuningType tuning_type,
+ const DeviceInfo &device_info,
+ const KernelInfo &kernel_info,
+ std::vector<int3> *work_groups) const
+{
+ GetPossibleWorkGroupsConv(tuning_type, device_info, kernel_info, grid_size_, work_groups);
+}
+
+bool IsConvBuffer1x1Supported(const OperationDef &definition, const Convolution2DAttributes &attr)
+{
+ auto src_storage_type = definition.src_tensors[0].storage_type;
+ return src_storage_type == TensorStorageType::BUFFER && attr.weights.shape.w == 1 &&
+ attr.weights.shape.h == 1 && attr.dilations.w == 1 && attr.dilations.h == 1 &&
+ attr.strides.w == 1 && attr.strides.h == 1 && attr.padding.prepended.w == 0 &&
+ attr.padding.prepended.h == 0 && attr.padding.appended.w == 0 &&
+ attr.padding.appended.h == 0;
+}
+
+bool IsConvBuffer1x1Supported(const OperationDef &definition, const BHWC &weights_shape,
+ const Convolution2DAttributes &attr)
+{
+ auto src_storage_type = definition.src_tensors[0].storage_type;
+ return src_storage_type == TensorStorageType::BUFFER && weights_shape.w == 1 &&
+ weights_shape.h == 1 && attr.dilations.w == 1 && attr.dilations.h == 1 &&
+ attr.strides.w == 1 && attr.strides.h == 1 && attr.padding.prepended.w == 0 &&
+ attr.padding.prepended.h == 0 && attr.padding.appended.w == 0 &&
+ attr.padding.appended.h == 0;
+}
+
+ConvBuffer1x1 CreateConvBuffer1x1(const DeviceInfo &device_info, const OperationDef &definition,
+ const Convolution2DAttributes &attr, const BHWC *shape)
+{
+ const int dst_depth = DivideRoundUp(attr.weights.shape.o, 4);
+ const int src_depth = DivideRoundUp(attr.weights.shape.i, 4);
+ ConvBuffer1x1::ConvParams conv_params;
+ if (shape)
+ {
+ conv_params = GetBestParams(device_info, definition, *shape, src_depth, dst_depth);
+ }
+ else
+ {
+ conv_params = GetBestParams(device_info, definition, src_depth, dst_depth);
+ }
+ ConvBuffer1x1 result(definition, conv_params);
+ result.UploadData(attr.weights, attr.bias);
+ return result;
+}
+
+ConvBuffer1x1 CreateConvBuffer1x1(const DeviceInfo &device_info, const OperationDef &definition,
+ const FullyConnectedAttributes &attr, const BHWC *shape)
+{
+ const int dst_depth = DivideRoundUp(attr.weights.shape.o, 4);
+ const int src_depth = DivideRoundUp(attr.weights.shape.i, 4);
+ ConvBuffer1x1::ConvParams conv_params;
+ if (shape)
+ {
+ conv_params = GetBestParams(device_info, definition, *shape, src_depth, dst_depth);
+ }
+ else
+ {
+ conv_params = GetBestParams(device_info, definition, src_depth, dst_depth);
+ }
+ conv_params.block_size.x *= conv_params.block_size.y;
+ conv_params.block_size.y = 1;
+ ConvBuffer1x1 result(definition, conv_params);
+ result.UploadData(attr.weights, attr.bias);
+ return result;
+}
+
+ConvBuffer1x1 CreateConvBuffer1x1Wino4x4To6x6(const DeviceInfo &device_info,
+ const OperationDef &definition,
+ const Convolution2DAttributes &attr,
+ const BHWC *shape)
+{
+ const int dst_depth = DivideRoundUp(attr.weights.shape.o, 4);
+ const int src_depth = DivideRoundUp(attr.weights.shape.i, 4);
+ ConvBuffer1x1::ConvParams conv_params;
+ if (shape)
+ {
+ conv_params = GetBestParams(device_info, definition, *shape, src_depth, dst_depth);
+ }
+ else
+ {
+ conv_params = GetBestParams(device_info, definition, src_depth, dst_depth);
+ }
+ conv_params.block_size.x *= conv_params.block_size.y;
+ conv_params.block_size.y = 1;
+ conv_params.different_weights_for_height = true;
+ ConvBuffer1x1 result(definition, conv_params);
+ result.UploadDataForWinograd4x4To6x6(attr.weights);
+ return result;
+}
+
+ConvBuffer1x1 CreateConvBuffer1x1DynamicWeights(const DeviceInfo &device_info,
+ const OperationDef &definition,
+ const Convolution2DAttributes &attr,
+ const BHWC &weights_shape, const BHWC *dst_shape)
+{
+ const int dst_depth = DivideRoundUp(weights_shape.b, 4);
+ const int src_depth = DivideRoundUp(weights_shape.c, 4);
+ ConvBuffer1x1::ConvParams conv_params;
+ if (dst_shape)
+ {
+ conv_params = GetBestParams(device_info, definition, *dst_shape, src_depth, dst_depth);
+ }
+ else
+ {
+ conv_params = GetBestParams(device_info, definition, src_depth, dst_depth);
+ }
+ ConvBuffer1x1 result(definition, conv_params);
+ result.UploadBiases(attr.bias);
+ return result;
+}
+
+} // namespace gpu_cl
+} // namespace backend
+} // namespace onert
diff --git a/runtime/onert/backend/gpu_cl/open_cl/kernels/ConvBuffer1x1.h b/runtime/onert/backend/gpu_cl/open_cl/kernels/ConvBuffer1x1.h
new file mode 100644
index 000000000..0abd6051f
--- /dev/null
+++ b/runtime/onert/backend/gpu_cl/open_cl/kernels/ConvBuffer1x1.h
@@ -0,0 +1,205 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __ONERT_BACKEND_GPU_CL_OPENCL_KERNELS_CONV_BUFFER_1X1_H__
+#define __ONERT_BACKEND_GPU_CL_OPENCL_KERNELS_CONV_BUFFER_1X1_H__
+
+#include "open_cl/Buffer.h"
+#include "open_cl/ClKernel.h"
+#include "open_cl/kernels/ConvCommon.h"
+#include "open_cl/kernels/GpuOperation.h"
+#include "open_cl/kernels/Util.h"
+#include "open_cl/LinearStorage.h"
+#include "open_cl/Precision.h"
+#include "open_cl/InternalTensor.h"
+#include "open_cl/Util.h"
+#include "open_cl/DataType.h"
+#include "open_cl/Operations.h"
+#include "open_cl/Shape.h"
+#include "open_cl/Status.h"
+#include "open_cl/Tensor.h"
+#include "open_cl/Types.h"
+#include "open_cl/WinogradUtil.h"
+
+namespace onert
+{
+namespace backend
+{
+namespace gpu_cl
+{
+
+class ConvBuffer1x1 : public GPUOperation
+{
+public:
+ ConvBuffer1x1() = default;
+
+ // Move only
+ ConvBuffer1x1(ConvBuffer1x1 &&operation);
+ ConvBuffer1x1 &operator=(ConvBuffer1x1 &&operation);
+ ConvBuffer1x1(const ConvBuffer1x1 &) = delete;
+ ConvBuffer1x1 &operator=(const ConvBuffer1x1 &) = delete;
+
+ void GetPossibleKernelWorkGroups(TuningType tuning_type, const DeviceInfo &device_info,
+ const KernelInfo &kernel_info,
+ std::vector<int3> *work_groups) const override;
+ int3 GetGridSize() const override;
+
+ ConvWeightsDescription GetConvWeightsDescription() const
+ {
+ ConvWeightsDescription desc;
+ desc.layout = ConvWeightsLayout::kOHWIOGroupI4O4;
+ desc.output_group_size = conv_params_.block_size.z;
+ return desc;
+ }
+
+ struct ConvParams
+ {
+ int3 block_size = int3(1, 1, 1);
+ int element_size = 4; // can be 4, 8 or 16
+
+    // By default a 2D convolution uses the same weights across the W and H dims, but
+    // in some cases separate weights are needed for the H dimension; the convolution
+    // kernel requires only very small modifications to support this.
+ bool different_weights_for_height = false;
+ };
+
+private:
+ ConvBuffer1x1(const OperationDef &definition, const ConvParams &conv_params);
+ friend ConvBuffer1x1 CreateConvBuffer1x1(const DeviceInfo &device_info,
+ const OperationDef &definition,
+ const Convolution2DAttributes &attr, const BHWC *shape);
+ friend ConvBuffer1x1 CreateConvBuffer1x1(const DeviceInfo &device_info,
+ const OperationDef &definition,
+ const FullyConnectedAttributes &attr, const BHWC *shape);
+ friend ConvBuffer1x1 CreateConvBuffer1x1Wino4x4To6x6(const DeviceInfo &device_info,
+ const OperationDef &definition,
+ const Convolution2DAttributes &attr,
+ const BHWC *shape);
+ friend ConvBuffer1x1 CreateConvBuffer1x1DynamicWeights(const DeviceInfo &device_info,
+ const OperationDef &definition,
+ const Convolution2DAttributes &attr,
+ const BHWC &weights_shape,
+ const BHWC *dst_shape);
+
+ template <DataType T>
+ void UploadData(const InternalTensor<OHWI, T> &weights, const InternalTensor<Linear, T> &biases);
+ template <DataType T> void UploadDataForWinograd4x4To6x6(const InternalTensor<OHWI, T> &weights);
+
+ template <DataType T> void UploadWeights(const InternalTensor<OHWI, T> &weights);
+
+ template <DataType T> void UploadBiases(const InternalTensor<Linear, T> &biases);
+
+ std::string GenerateConvBuffer1x1(const OperationDef &op_def,
+ const ConvBuffer1x1::ConvParams &conv_params, Arguments *args);
+
+ ConvParams conv_params_;
+};
+
+template <DataType T>
+void ConvBuffer1x1::UploadData(const InternalTensor<OHWI, T> &weights,
+ const InternalTensor<Linear, T> &biases)
+{
+ UploadWeights(weights);
+ UploadBiases(biases);
+}
+
+template <DataType T>
+void ConvBuffer1x1::UploadDataForWinograd4x4To6x6(const InternalTensor<OHWI, T> &weights)
+{
+ InternalTensor<OHWI, T> wino_weights;
+ RearrangeWeightsToWinograd4x4To6x6Weights(weights, &wino_weights);
+ UploadWeights(wino_weights);
+ InternalTensor<Linear, DataType::FLOAT32> bias;
+ bias.shape = Linear(weights.shape.o);
+ bias.data.resize(weights.shape.o, 0.0f);
+ UploadBiases(bias);
+}
+
+template <DataType T> void ConvBuffer1x1::UploadWeights(const InternalTensor<OHWI, T> &weights)
+{
+ const int dst_depth = DivideRoundUp(weights.shape.o, 4);
+ const int src_depth = DivideRoundUp(weights.shape.i, 4);
+
+ const bool f32_weights = definition_.precision == CalculationsPrecision::F32;
+ const int float4_size = sizeof(float4);
+ // TODO
+ // f32_weights ? sizeof(float4) : sizeof(half4);
+
+ const int dst_depth_aligned = AlignByN(dst_depth, conv_params_.block_size.z);
+ const int elements_count = weights.shape.h * weights.shape.w * src_depth * dst_depth_aligned * 4;
+
+ BufferDescriptor desc;
+ desc.element_type = f32_weights ? DataType::FLOAT32 : DataType::FLOAT16;
+ desc.element_size = 16;
+ desc.memory_type = MemoryType::GLOBAL;
+ desc.size = float4_size * elements_count;
+ desc.data.resize(desc.size);
+
+ if (f32_weights)
+ {
+ float4 *ptr = reinterpret_cast<float4 *>(desc.data.data());
+ RearrangeWeightsToOHWIOGroupI4O4(weights, conv_params_.block_size.z,
+ absl::MakeSpan(ptr, elements_count));
+ }
+ // else
+ // {
+ // half4 *ptr = reinterpret_cast<half4 *>(desc.data.data());
+ // RearrangeWeightsToOHWIOGroupI4O4(weights, conv_params_.block_size.z,
+ // absl::MakeSpan(ptr, elements_count));
+ // }
+
+ args_.AddObject("weights", absl::make_unique<BufferDescriptor>(std::move(desc)));
+}
+
+template <DataType T> void ConvBuffer1x1::UploadBiases(const InternalTensor<Linear, T> &biases)
+{
+ TensorLinearDescriptor desc;
+ desc.storage_type = LinearStorageType::BUFFER;
+ desc.element_type = definition_.GetDataType();
+ int depth = AlignByN(biases.shape.v, 4 * conv_params_.block_size.z) / 4;
+ desc.UploadLinearData(biases, depth);
+ args_.AddObject("biases", absl::make_unique<TensorLinearDescriptor>(std::move(desc)));
+}
+
+bool IsConvBuffer1x1Supported(const OperationDef &definition, const Convolution2DAttributes &attr);
+
+bool IsConvBuffer1x1Supported(const OperationDef &definition, const BHWC &weights_shape,
+ const Convolution2DAttributes &attr);
+
+ConvBuffer1x1 CreateConvBuffer1x1(const DeviceInfo &device_info, const OperationDef &definition,
+ const Convolution2DAttributes &attr, const BHWC *shape = nullptr);
+
+ConvBuffer1x1 CreateConvBuffer1x1(const DeviceInfo &device_info, const OperationDef &definition,
+ const FullyConnectedAttributes &attr,
+ const BHWC *shape = nullptr);
+
+ConvBuffer1x1 CreateConvBuffer1x1DynamicWeights(const DeviceInfo &device_info,
+ const OperationDef &definition,
+ const Convolution2DAttributes &attr,
+ const BHWC &weights_shape,
+ const BHWC *dst_shape = nullptr);
+
+ConvBuffer1x1 CreateConvBuffer1x1Wino4x4To6x6(const DeviceInfo &device_info,
+ const OperationDef &definition,
+ const Convolution2DAttributes &attr,
+ const BHWC *shape = nullptr);
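+
+// Typical (illustrative) usage; device_info, definition, attr and the optional output
+// shape are assumed to come from the surrounding operation-selection code:
+//   if (IsConvBuffer1x1Supported(definition, attr))
+//   {
+//     ConvBuffer1x1 conv = CreateConvBuffer1x1(device_info, definition, attr, &dst_shape);
+//   }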
+
+} // namespace gpu_cl
+} // namespace backend
+} // namespace onert
+
+#endif // __ONERT_BACKEND_GPU_CL_OPENCL_KERNELS_CONV_BUFFER_1X1_H__
diff --git a/runtime/onert/backend/gpu_cl/open_cl/kernels/ConvCommon.h b/runtime/onert/backend/gpu_cl/open_cl/kernels/ConvCommon.h
new file mode 100644
index 000000000..4700381dc
--- /dev/null
+++ b/runtime/onert/backend/gpu_cl/open_cl/kernels/ConvCommon.h
@@ -0,0 +1,44 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __ONERT_BACKEND_GPU_CL_OPENCL_KERNELS_CONV_COMMON_H__
+#define __ONERT_BACKEND_GPU_CL_OPENCL_KERNELS_CONV_COMMON_H__
+
+namespace onert
+{
+namespace backend
+{
+namespace gpu_cl
+{
+
+enum class ConvWeightsLayout
+{
+ kUnknown,
+ kOHWIOGroupI4O4,
+};
+
+struct ConvWeightsDescription
+{
+ ConvWeightsLayout layout;
+ int output_group_size;
+};
+
+} // namespace gpu_cl
+} // namespace backend
+} // namespace onert
+
+#endif // __ONERT_BACKEND_GPU_CL_OPENCL_KERNELS_CONV_COMMON_H__
diff --git a/runtime/onert/backend/gpu_cl/open_cl/kernels/ConvConstants.cc b/runtime/onert/backend/gpu_cl/open_cl/kernels/ConvConstants.cc
new file mode 100644
index 000000000..0a51bab5c
--- /dev/null
+++ b/runtime/onert/backend/gpu_cl/open_cl/kernels/ConvConstants.cc
@@ -0,0 +1,282 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "open_cl/kernels/ConvConstants.h"
+
+#include <string>
+#include <utility>
+
+#include "open_cl/kernels/Util.h"
+#include "open_cl/kernels/WorkGroupPicking.h"
+#include "open_cl/Precision.h"
+
+namespace onert
+{
+namespace backend
+{
+namespace gpu_cl
+{
+namespace
+{
+// Adreno can provide up to ~3-4KB of constant memory, but in some cases even
+// 3KB can perform very badly.
+int GetAdrenoOptimalMaxConstantSize(int gpu_version)
+{
+ if (gpu_version < 600)
+ {
+ return 256 * 10; // 2.5KB
+ }
+ else
+ {
+ return 256 * 14; // 3.5KB
+ }
+}
+
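+// Returns the constant-memory size limit in bytes; IsConvConstantsSupported() compares
+// it against the packed filter buffer size.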
+int GetOptimalMaxConstantSize(const DeviceInfo &info)
+{
+ if (!info.IsAdreno())
+ {
+    // In general this kernel is not expected to be used on non-Adreno GPUs, since it
+    // is tuned for __constant memory, which gives a big benefit on Adreno.
+ return 1024; // 1KB
+ }
+ else
+ {
+ return GetAdrenoOptimalMaxConstantSize(info.adreno_info.gpu_version);
+ }
+}
+
+std::string GenerateConvolutionConstantCode(const OperationDef &op_def, const OHWI &weights_shape,
+ bool stride_correction, GPUOperation *op)
+{
+ auto src_desc = op_def.src_tensors[0];
+ src_desc.SetTextureAddressMode(TextureAddressMode::ZERO);
+ if (op_def.IsBatchSupported())
+ {
+ src_desc.SetStateVar("BatchedWidth", "true");
+ }
+ op->AddSrcTensor("src_tensor", src_desc);
+
+ auto dst_desc = op_def.dst_tensors[0];
+ if (op_def.IsBatchSupported())
+ {
+ dst_desc.SetStateVar("BatchedWidth", "true");
+ }
+ op->AddDstTensor("dst_tensor", dst_desc);
+
+ std::string c = GetCommonDefines(op_def.precision);
+
+ const int out_z = DivideRoundUp(weights_shape.o, 4);
+ const std::string kOutZ = std::to_string(out_z);
+ const int src_depth = DivideRoundUp(weights_shape.i, 4);
+
+ const auto src_tensor_type = op_def.src_tensors[0].storage_type;
+ const bool manual_clamp = src_tensor_type == TensorStorageType::BUFFER ||
+ src_tensor_type == TensorStorageType::IMAGE_BUFFER;
+
+ switch (op_def.precision)
+ {
+ case CalculationsPrecision::F32:
+ case CalculationsPrecision::F16:
+ c += "#define CONV4(R, SRC, F, i) \\\n";
+ c += " R += SRC.x * F[i + 0]; \\\n";
+ c += " R += SRC.y * F[i + 1]; \\\n";
+ c += " R += SRC.z * F[i + 2]; \\\n";
+ c += " R += SRC.w * F[i + 3]; \n";
+
+ c += "#define CONV3(R, SRC, F, i) \\\n";
+ c += " R += SRC.x * F[i + 0]; \\\n";
+ c += " R += SRC.y * F[i + 1]; \\\n";
+ c += " R += SRC.z * F[i + 2]; \n";
+
+ c += "#define CONV2(R, SRC, F, i) \\\n";
+ c += " R += SRC.x * F[i + 0]; \\\n";
+ c += " R += SRC.y * F[i + 1]; \n";
+
+ c += "#define CONV1(R, SRC, F, i) \\\n";
+ c += " R += SRC * F[i + 0]; \n";
+ break;
+ case CalculationsPrecision::F32_F16:
+ c += "#define CONV4(R, SRC, F, i) \\\n";
+ c += " R += convert_float4(SRC.x * F[i + 0] + SRC.y * F[i + 1]";
+ c += " + SRC.z * F[i + 2] + SRC.w * F[i + 3]);\n";
+
+ c += "#define CONV3(R, SRC, F, i) \\\n";
+ c += " R += convert_float4(SRC.x * F[i + 0] + SRC.y * F[i + 1]";
+ c += " + SRC.z * F[i + 2]);\n";
+
+ c += "#define CONV2(R, SRC, F, i) \\\n";
+ c += " R += convert_float4(SRC.x * F[i + 0] + SRC.y * F[i + 1]);\n";
+
+ c += "#define CONV1(R, SRC, F, i) \\\n";
+ c += " R += convert_float4(SRC * F[i + 0]);\n";
+ break;
+ }
+
+ const std::string postfixes[] = {".x", ".xy", ".xyz", ""};
+
+ c += "__kernel void main_function(\n";
+ c += "$0) {\n";
+ c += " int X = get_global_id(0);\n";
+ c += " int Y = get_global_id(1);\n";
+ c += " if (X >= args.dst_tensor.Width() || Y >= args.dst_tensor.Height()) "
+ "return;\n";
+ if (stride_correction)
+ {
+ c += " int start_x = " +
+ GetXStrideCorrectedV2("X", "args.src_tensor.Batch()", "args.stride_x", "args.padding_x") +
+ ";\n";
+ }
+ else
+ {
+ if (op_def.IsBatchSupported())
+ {
+ c += " int start_x = X * args.stride_x + args.padding_x * "
+ "args.src_tensor.Batch();\n";
+ }
+ else
+ {
+ c += " int start_x = X * args.stride_x + args.padding_x;\n";
+ }
+ }
+ c += " int start_y = Y * args.stride_y + args.padding_y;\n";
+ c += " ACCUM_FLT4 r[" + kOutZ + "];\n";
+ c += " for (int i = 0; i < " + kOutZ + "; ++i) {\n";
+ c += " r[i] = (ACCUM_FLT4)(0.0f, 0.0f, 0.0f, 0.0f);\n";
+ c += " }\n";
+ int filters_counter = 0;
+ for (int s = 0; s < src_depth; ++s)
+ {
+ const int ch_count = std::min(4, weights_shape.i - s * 4);
+ const std::string s_conv = "CONV" + std::to_string(ch_count);
+ const std::string s_count = ch_count == 1 ? "" : std::to_string(ch_count);
+ const std::string s_type = absl::StrCat("FLT", s_count);
+ const std::string s_postfix = postfixes[ch_count - 1];
+ const std::string dilation_x =
+ op_def.IsBatchSupported() ? "args.dilation_x * args.src_tensor.Batch()" : "args.dilation_x";
+ for (int ky = 0; ky < weights_shape.h; ++ky)
+ {
+ std::string s_y = absl::StrCat("(start_y + ", ky, " * args.dilation_y)");
+ if (manual_clamp)
+ {
+ c += " {\n";
+ c += " bool y_out = " + s_y + " < 0 || " + s_y + " >= args.src_tensor.Height();\n";
+ }
+ for (int kx = 0; kx < weights_shape.w; ++kx)
+ {
+ c += " {\n";
+ std::string s_x = absl::StrCat("(start_x + ", kx, " * " + dilation_x + ")");
+ if (manual_clamp)
+ {
+ c += " bool x_out = " + s_x + "< 0 || " + s_x + ">= args.src_tensor.Width();\n";
+ c += " " + s_type + " src = x_out || y_out ?";
+ c += "(" + s_type + ")(0.0) : args.src_tensor.Read(" + s_x + ", " + s_y + ", " +
+ std::to_string(s) + ")" + s_postfix + ";\n";
+ }
+ else
+ {
+ c += " " + s_type + " src = args.src_tensor.Read(" + s_x + ", " + s_y + ", " +
+ std::to_string(s) + ")" + s_postfix + ";\n";
+ }
+ for (int d = 0; d < out_z; ++d)
+ {
+          c += "    " + s_conv + "(r[" + std::to_string(d) + "], src, args.weights.GetPtr(),";
+ c += " " + std::to_string(filters_counter) + ");\n";
+ filters_counter += ch_count;
+ }
+ c += " }\n";
+ }
+ if (manual_clamp)
+ {
+ c += " }\n";
+ }
+ }
+ }
+ for (int i = 0; i < out_z; ++i)
+ {
+ std::string s_i = std::to_string(i);
+ c += " {\n";
+ c += " FLT4 res = TO_FLT4(r[" + s_i + "]) + args.biases.Read(" + s_i + ");\n";
+ c += " args.dst_tensor.Write(res, X, Y, " + s_i + ");\n";
+ c += " }\n";
+ }
+ c += "}\n";
+ return c;
+}
+
+} // namespace
+
+bool IsConvConstantsSupported(const DeviceInfo &device_info, const OperationDef &definition,
+ const Convolution2DAttributes &attr)
+{
+ if (device_info.IsAMD() && definition.precision != CalculationsPrecision::F32 &&
+ definition.src_tensors[0].storage_type != TensorStorageType::BUFFER)
+ {
+    // BUG: some AMD GPUs crash without it
+ return false;
+ }
+
+ const auto &w_shape = attr.weights.shape;
+ const int dst_channels = AlignByN(w_shape.o, 4);
+ const int filters_count = w_shape.i * dst_channels * w_shape.h * w_shape.w;
+ const int float_size = sizeof(float);
+ // TODO F32 and F16
+ // definition.precision == CalculationsPrecision::F32 ? sizeof(float) : sizeof(half);
+ const int filters_buffer_size = filters_count * float_size;
+ const int kConstantMaxSize = GetOptimalMaxConstantSize(device_info);
+ const int flt4_registers = DivideRoundUp(w_shape.o, 4);
+ return filters_buffer_size <= kConstantMaxSize && flt4_registers <= 8;
+}
+
+GPUOperation CreateConvConstants(const DeviceInfo &device_info, const OperationDef &definition,
+ const Convolution2DAttributes &attr)
+{
+ GPUOperation op(definition);
+ UploadWeightsForConvConstants(attr.weights, definition.precision, &op);
+ op.args_.AddInt("stride_x", attr.strides.w);
+ op.args_.AddInt("stride_y", attr.strides.h);
+ op.args_.AddInt("padding_x", -attr.padding.prepended.w);
+ op.args_.AddInt("padding_y", -attr.padding.prepended.h);
+ op.args_.AddInt("dilation_x", attr.dilations.w);
+ op.args_.AddInt("dilation_y", attr.dilations.h);
+ op.tensor_to_grid_ = TensorToGrid::kWBToX_HDToY_ZIs1;
+
+ const bool stride_correction = definition.IsBatchSupported() && attr.strides.w != 1;
+ op.code_ =
+ GenerateConvolutionConstantCode(definition, attr.weights.shape, stride_correction, &op);
+ if (definition.precision == CalculationsPrecision::F16 && device_info.IsAdreno3xx())
+ {
+ op.compiler_options_.push_back(CompilerOptions::ADRENO_FULL_SIMD_LINE);
+ }
+ if (definition.precision != CalculationsPrecision::F32 && device_info.IsPowerVR())
+ {
+    // BUG: some PowerVR GPUs (e.g. GE8320) produce incorrect results without it
+ op.compiler_options_.push_back(CompilerOptions::CL_OPT_DISABLE);
+ }
+
+ TensorLinearDescriptor desc;
+ desc.storage_type = LinearStorageType::BUFFER;
+ desc.element_type = definition.GetDataType();
+ desc.memory_type = MemoryType::CONSTANT;
+ desc.UploadLinearData(attr.bias);
+ op.args_.AddObject("biases", absl::make_unique<TensorLinearDescriptor>(std::move(desc)));
+ return op;
+}
+
+} // namespace gpu_cl
+} // namespace backend
+} // namespace onert
diff --git a/runtime/onert/backend/gpu_cl/open_cl/kernels/ConvConstants.h b/runtime/onert/backend/gpu_cl/open_cl/kernels/ConvConstants.h
new file mode 100644
index 000000000..be6670c53
--- /dev/null
+++ b/runtime/onert/backend/gpu_cl/open_cl/kernels/ConvConstants.h
@@ -0,0 +1,137 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __ONERT_BACKEND_GPU_CL_OPENCL_KERNELS_CONV_CONSTANTS_H__
+#define __ONERT_BACKEND_GPU_CL_OPENCL_KERNELS_CONV_CONSTANTS_H__
+
+#include "open_cl/Buffer.h"
+#include "open_cl/kernels/GpuOperation.h"
+#include "open_cl/LinearStorage.h"
+#include "open_cl/Tensor.h"
+#include "open_cl/Util.h"
+#include "open_cl/DataType.h"
+#include "open_cl/Operations.h"
+#include "open_cl/Shape.h"
+#include "open_cl/Status.h"
+#include "open_cl/Tensor.h"
+#include "open_cl/Types.h"
+
+namespace onert
+{
+namespace backend
+{
+namespace gpu_cl
+{
+
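+// Repacks OHWI weights for the __constant-memory convolution: for every source slice,
+// spatial position and destination group, each written vector holds the (up to 4)
+// output channels of one input channel, zero-padded where O is not a multiple of 4.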
+template <DataType S, typename T>
+void RearrangeWeightsForConvConstants(const InternalTensor<OHWI, S> &weights, absl::Span<T> dst)
+{
+ const int dst_depth = DivideRoundUp(weights.shape.o, 4);
+ const int src_depth = DivideRoundUp(weights.shape.i, 4);
+ const int kernel_x = weights.shape.w;
+ const int kernel_y = weights.shape.h;
+
+ int counter = 0;
+ for (int s = 0; s < src_depth; ++s)
+ {
+ for (int y = 0; y < kernel_y; ++y)
+ {
+ for (int x = 0; x < kernel_x; ++x)
+ {
+ for (int d = 0; d < dst_depth; ++d)
+ {
+ const int channels_count = std::min(4, weights.shape.i - s * 4);
+ T filters[4];
+ for (int i = 0; i < 4; ++i)
+ {
+ for (int j = 0; j < channels_count; ++j)
+ {
+ const int s_ch = s * 4 + j;
+ const int d_ch = d * 4 + i;
+ if (s_ch < weights.shape.i && d_ch < weights.shape.o)
+ {
+ const int f_index = weights.shape.LinearIndex({d_ch, y, x, s_ch});
+ filters[i][j] = weights.data[f_index];
+ }
+ else
+ {
+ filters[i][j] = 0.0f;
+ }
+ }
+ }
+ T filters_new[4];
+ for (int i = 0; i < 4; ++i)
+ {
+ for (int j = 0; j < 4; ++j)
+ {
+ filters_new[i][j] = filters[j][i];
+ }
+ }
+ for (int i = 0; i < channels_count; ++i)
+ {
+ dst[counter++] = filters_new[i];
+ }
+ }
+ }
+ }
+ }
+}
+
+template <DataType T>
+void UploadWeightsForConvConstants(const InternalTensor<OHWI, T> &weights,
+ CalculationsPrecision precision, GPUOperation *op)
+{
+ const int dst_depth = DivideRoundUp(weights.shape.o, 4);
+ const int kernel_x = weights.shape.w;
+ const int kernel_y = weights.shape.h;
+
+ const bool f32_weights = precision == CalculationsPrecision::F32;
+ const int float_size = f32_weights ? 4 : 2;
+ const int float_count = weights.shape.i * dst_depth * 4 * kernel_x * kernel_y;
+
+ BufferDescriptor desc;
+ desc.element_type = f32_weights ? DataType::FLOAT32 : DataType::FLOAT16;
+ desc.element_size = 4;
+ desc.memory_type = MemoryType::CONSTANT;
+ desc.size = float_size * float_count;
+ desc.data.resize(desc.size);
+
+ if (f32_weights)
+ {
+ float4 *ptr = reinterpret_cast<float4 *>(desc.data.data());
+ RearrangeWeightsForConvConstants(weights, absl::MakeSpan(ptr, float_count / 4));
+ }
+ // else
+ // {
+ // half4 *ptr = reinterpret_cast<half4 *>(desc.data.data());
+ // RearrangeWeightsForConvConstants(weights, absl::MakeSpan(ptr, float_count / 4));
+ // }
+
+ op->args_.AddObject("weigths", absl::make_unique<BufferDescriptor>(std::move(desc)));
+}
+
+bool IsConvConstantsSupported(const DeviceInfo &device_info, const OperationDef &definition,
+ const Convolution2DAttributes &attr);
+
+GPUOperation CreateConvConstants(const DeviceInfo &device_info, const OperationDef &definition,
+ const Convolution2DAttributes &attr);
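+
+// Typical (illustrative) usage; device_info, definition and attr are assumed to come
+// from the surrounding operation-selection code:
+//   if (IsConvConstantsSupported(device_info, definition, attr))
+//   {
+//     GPUOperation op = CreateConvConstants(device_info, definition, attr);
+//   }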
+
+} // namespace gpu_cl
+} // namespace backend
+} // namespace onert
+
+#endif // __ONERT_BACKEND_GPU_CL_OPENCL_KERNELS_CONV_CONSTANTS_H__
diff --git a/runtime/onert/backend/gpu_cl/open_cl/kernels/ConvPowervr.cc b/runtime/onert/backend/gpu_cl/open_cl/kernels/ConvPowervr.cc
new file mode 100644
index 000000000..5cb0c2719
--- /dev/null
+++ b/runtime/onert/backend/gpu_cl/open_cl/kernels/ConvPowervr.cc
@@ -0,0 +1,1653 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "open_cl/kernels/ConvPowervr.h"
+
+#include <algorithm>
+#include <string>
+#include <utility>
+
+#include "absl/strings/substitute.h"
+#include "open_cl/kernels/Util.h"
+#include "open_cl/kernels/WorkGroupPicking.h"
+#include "open_cl/Precision.h"
+#include "open_cl/TensorType.h"
+#include "open_cl/DataType.h"
+#include "open_cl/Shape.h"
+#include "open_cl/Status.h"
+
+namespace onert
+{
+namespace backend
+{
+namespace gpu_cl
+{
+namespace
+{
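+// Emits code that spreads a global->local copy of `elements_to_upload` values over
+// `total_work_items` work items. Illustrative trace: with local_ptr_name "weights_cache",
+// global_ptr_name "filters_loc", an empty offset, lid_name "lid", 64 work items and
+// 128 elements, the generated code is:
+//     weights_cache[lid + 0] = filters_loc[lid + 0];
+//     weights_cache[lid + 64] = filters_loc[lid + 64];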
+std::string GenerateUploadByThreads(const std::string &local_ptr_name,
+ const std::string &global_ptr_name,
+ const std::string &global_offset_name,
+ const std::string &lid_name, int total_work_items,
+ int elements_to_upload)
+{
+ std::string c;
+ std::string offset = global_offset_name.empty() ? "" : global_offset_name + " + ";
+ const int groups = elements_to_upload / total_work_items;
+  const int remainder = elements_to_upload % total_work_items;
+ for (int i = 0; i < groups; ++i)
+ {
+ c += " " + local_ptr_name + "[" + lid_name + " + " + std::to_string(total_work_items * i) +
+ "] = " + global_ptr_name + "[" + offset + lid_name + " + " +
+ std::to_string(total_work_items * i) + "];\n";
+ }
+  if (remainder != 0)
+  {
+    c += "  if (" + lid_name + " < " + std::to_string(remainder) + ") {\n";
+ c += " " + local_ptr_name + "[" + lid_name + " + " +
+ std::to_string(total_work_items * groups) + "] = " + global_ptr_name + "[" + offset +
+ lid_name + " + " + std::to_string(total_work_items * groups) + "];\n";
+ c += " }\n";
+ }
+ return c;
+}
+
+std::string GenerateAsyncUpload(const std::string &local_ptr_name,
+ const std::string &global_ptr_name,
+ const std::string &global_offset_name, int elements_to_upload)
+{
+ std::string c;
+ std::string offset = global_offset_name.empty() ? "" : " + " + global_offset_name;
+ c += " async_work_group_copy(" + local_ptr_name + ", " + global_ptr_name + offset + ", " +
+ std::to_string(elements_to_upload) + ", 0);\n";
+ return c;
+}
+
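+// Maps work-item ids to the DST_X / DST_Y (/ DST_Z) / DST_S block coordinates, honoring
+// the work-group launch order. Illustrative trace: with block_size = (2, 2, 1, 2),
+// work_group_launch_order = (0, 1, 2), linear_spatial = false and need_depth = false,
+// the generated code is:
+//     int DST_X = get_global_id(0) * 2;
+//     int DST_Y = get_global_id(1) * 2;
+//     int DST_S = get_global_id(2) * 2;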
+std::string GenerateBlockCoords(const int4 &block_size, const int3 &work_group_launch_order,
+ bool linear_spatial, bool need_depth)
+{
+ std::string c;
+ int3 launch_remap;
+ launch_remap[work_group_launch_order.x] = 0;
+ launch_remap[work_group_launch_order.y] = 1;
+ launch_remap[work_group_launch_order.z] = 2;
+ if (linear_spatial)
+ {
+ if (work_group_launch_order[0] == 0)
+ {
+ c += " int linear_spatial = get_global_id(0);\n";
+ }
+ else
+ {
+ c += " int linear_spatial = get_group_id(" + std::to_string(launch_remap[0]) +
+ ") * get_local_size(0) + get_local_id(0);\n";
+ }
+ if (need_depth)
+ {
+ c += " int DST_X = (linear_spatial % args.task_size_x) * " + std::to_string(block_size.x) +
+ ";\n";
+ c += " linear_spatial = linear_spatial / args.task_size_x;\n";
+ c += " int DST_Y = (linear_spatial % args.task_size_y) * " + std::to_string(block_size.y) +
+ ";\n";
+ c += " int DST_Z = (linear_spatial / args.task_size_y) * " + std::to_string(block_size.z) +
+ ";\n";
+ }
+ else
+ {
+ c += " int DST_Y = (linear_spatial / args.task_size_x) * " + std::to_string(block_size.y) +
+ ";\n";
+ c += " int DST_X = (linear_spatial % args.task_size_x) * " + std::to_string(block_size.x) +
+ ";\n";
+ }
+ if (work_group_launch_order[1] == 1)
+ {
+ c += " int DST_S = get_global_id(1) * " + std::to_string(block_size.w) + ";\n";
+ }
+ else
+ {
+ c += " int DST_S = (get_group_id(" + std::to_string(launch_remap[1]) +
+ ") * get_local_size(1) + get_local_id(1)) * " + std::to_string(block_size.w) + ";\n";
+ }
+ }
+ else
+ {
+ if (work_group_launch_order[0] == 0)
+ {
+ c += " int DST_X = get_global_id(0) * " + std::to_string(block_size.x) + ";\n";
+ }
+ else
+ {
+ c += " int DST_X = (get_group_id(" + std::to_string(launch_remap[0]) +
+ ") * get_local_size(0) + get_local_id(0)) * " + std::to_string(block_size.x) + ";\n";
+ }
+ std::string global_id_1;
+ if (work_group_launch_order[1] == 1)
+ {
+ global_id_1 = "get_global_id(1)";
+ }
+ else
+ {
+ global_id_1 = "(get_group_id(" + std::to_string(launch_remap[1]) +
+ ") * get_local_size(1) + get_local_id(1))";
+ }
+ if (need_depth)
+ {
+ c += " int linear_id_1 = " + global_id_1 + ";\n";
+ c +=
+ " int DST_Z = (linear_id_1 / args.task_size_y) * " + std::to_string(block_size.z) + ";\n";
+ c +=
+ " int DST_Y = (linear_id_1 % args.task_size_y) * " + std::to_string(block_size.y) + ";\n";
+ }
+ else
+ {
+ c += " int DST_Y = " + global_id_1 + " * " + std::to_string(block_size.y) + ";\n";
+ }
+ if (work_group_launch_order[2] == 2)
+ {
+ c += " int DST_S = get_global_id(2) * " + std::to_string(block_size.w) + ";\n";
+ }
+ else
+ {
+ c += " int DST_S = (get_group_id(" + std::to_string(launch_remap[2]) +
+ ") * get_local_size(2) + get_local_id(2)) * " + std::to_string(block_size.w) + ";\n";
+ }
+ }
+
+ return c;
+}
+} // namespace
+
+ConvPowerVR::ConvPowerVR(const OperationDef &definition, const Convolution2DAttributes &attr,
+ const DeviceInfo &device_info, const BHWC *dst_shape)
+ : GPUOperation(definition), stride_(attr.strides.w, attr.strides.h, 1, 1),
+ padding_(-attr.padding.prepended.w, -attr.padding.prepended.h, 0, 0),
+ kernel_size_(attr.weights.shape.w, attr.weights.shape.h, 1, 1),
+ dilation_(attr.dilations.w, attr.dilations.h, 1, 1),
+ conv_params_(GuessBestParams(device_info, definition, attr, dst_shape))
+{
+}
+
+ConvPowerVR::ConvPowerVR(const OperationDef &definition, const Convolution2DAttributes &attr,
+ const BHWC &weights_shape, const DeviceInfo &device_info,
+ const BHWC *dst_shape)
+ : GPUOperation(definition), stride_(attr.strides.w, attr.strides.h, 1, 1),
+ padding_(-attr.padding.prepended.w, -attr.padding.prepended.h, 0, 0),
+ kernel_size_(weights_shape.w, weights_shape.h, 1, 1),
+ dilation_(attr.dilations.w, attr.dilations.h, 1, 1),
+ conv_params_(GuessBestParams(device_info, definition, attr, weights_shape, dst_shape))
+{
+}
+
+ConvPowerVR::ConvPowerVR(const OperationDef &definition, const FullyConnectedAttributes &attr,
+ const DeviceInfo &device_info, const BHWC *dst_shape)
+ : GPUOperation(definition), stride_(1, 1, 1, 1), padding_(0, 0, 0, 0), kernel_size_(1, 1, 1, 1),
+ dilation_(1, 1, 1, 1), conv_params_(GuessBestParams(device_info, definition, attr, dst_shape))
+{
+}
+
+ConvPowerVR::ConvPowerVR(const OperationDef &definition)
+ : GPUOperation(definition), stride_(1, 1, 1, 1), padding_(0, 0, 0, 0), kernel_size_(1, 1, 1, 1),
+ dilation_(1, 1, 1, 1)
+{
+}
+
+ConvPowerVR::ConvPowerVR(ConvPowerVR &&operation)
+ : GPUOperation(std::move(operation)), stride_(operation.stride_), padding_(operation.padding_),
+ kernel_size_(operation.kernel_size_), dilation_(operation.dilation_),
+ conv_params_(operation.conv_params_)
+{
+}
+
+ConvPowerVR::ConvPowerVR(const OperationDef &definition, const Convolution3DAttributes &attr,
+ const DeviceInfo &device_info, const BHWDC *dst_shape)
+ : GPUOperation(definition), stride_(attr.strides.w, attr.strides.h, attr.strides.d, 1),
+ padding_(-attr.padding.prepended.w, -attr.padding.prepended.h, -attr.padding.prepended.d, 0),
+ kernel_size_(attr.weights.shape.w, attr.weights.shape.h, attr.weights.shape.d, 1),
+ dilation_(attr.dilations.w, attr.dilations.h, attr.dilations.d, 1),
+ conv_params_(GuessBestParams(device_info, definition, attr, dst_shape))
+{
+}
+
+ConvPowerVR &ConvPowerVR::operator=(ConvPowerVR &&operation)
+{
+ if (this != &operation)
+ {
+ std::swap(stride_, operation.stride_);
+ std::swap(padding_, operation.padding_);
+ std::swap(kernel_size_, operation.kernel_size_);
+ std::swap(dilation_, operation.dilation_);
+ std::swap(conv_params_, operation.conv_params_);
+ GPUOperation::operator=(std::move(operation));
+ }
+ return *this;
+}
+
+void ConvPowerVR::GenerateCode(const DeviceInfo &device_info)
+{
+ if (conv_params_.linear_spatial)
+ {
+ grid_dimension_ = 2;
+ }
+ const bool stride_correction = definition_.IsBatchSupported() && stride_.x != 1;
+ code_ = GenerateConv(device_info, definition_, stride_correction, conv_params_);
+ if (definition_.precision == CalculationsPrecision::F16 && device_info.IsPowerVR())
+ {
+ compiler_options_.push_back(CompilerOptions::POWERVR_FP16);
+ }
+ if (conv_params_.IsPrivateMemBroadcast() && device_info.IsCL20OrHigher())
+ {
+ compiler_options_.push_back(CompilerOptions::CL_2_0);
+ }
+ bool kernel_is_trivial = conv_params_.x_kernel_is_1 && conv_params_.y_kernel_is_1;
+ if (definition_.src_tensors[0].HasAxis(Axis::DEPTH))
+ {
+ kernel_is_trivial = kernel_is_trivial & conv_params_.z_kernel_is_1;
+ }
+ if (device_info.IsAdreno3xx() && definition_.precision == CalculationsPrecision::F16 &&
+ kernel_is_trivial)
+ {
+ compiler_options_.push_back(CompilerOptions::ADRENO_FULL_SIMD_LINE);
+ }
+}
+
+absl::Status ConvPowerVR::BindArguments(ArgumentsBinder *args)
+{
+ if (!conv_params_.x_kernel_is_1)
+ {
+ RETURN_IF_ERROR(args->SetInt("stride_x", stride_.x));
+ RETURN_IF_ERROR(args->SetInt("padding_x", padding_.x * src_[0]->Batch()));
+ RETURN_IF_ERROR(args->SetInt("kernel_size_x", kernel_size_.x));
+ RETURN_IF_ERROR(args->SetInt("dilation_x", dilation_.x * src_[0]->Batch()));
+ }
+ if (!conv_params_.y_kernel_is_1)
+ {
+ RETURN_IF_ERROR(args->SetInt("stride_y", stride_.y));
+ RETURN_IF_ERROR(args->SetInt("padding_y", padding_.y));
+ RETURN_IF_ERROR(args->SetInt("kernel_size_y", kernel_size_.y));
+ RETURN_IF_ERROR(args->SetInt("dilation_y", dilation_.y));
+ }
+ if (definition_.src_tensors[0].HasAxis(Axis::DEPTH) && !conv_params_.z_kernel_is_1)
+ {
+ RETURN_IF_ERROR(args->SetInt("stride_z", stride_.z));
+ RETURN_IF_ERROR(args->SetInt("padding_z", padding_.z));
+ RETURN_IF_ERROR(args->SetInt("kernel_size_z", kernel_size_.z));
+ RETURN_IF_ERROR(args->SetInt("dilation_z", dilation_.z));
+ }
+ if (conv_params_.linear_spatial)
+ {
+ const int grid_x =
+ DivideRoundUp(dst_[0]->Width() * dst_[0]->Batch(), conv_params_.block_size.x);
+ RETURN_IF_ERROR(args->SetInt("task_size_x", grid_x));
+ }
+ if (definition_.src_tensors[0].HasAxis(Axis::DEPTH))
+ {
+ const int task_size_y = DivideRoundUp(dst_[0]->Height(), conv_params_.block_size.y);
+ RETURN_IF_ERROR(args->SetInt("task_size_y", task_size_y));
+ }
+ return absl::OkStatus();
+}
+
+int3 ConvPowerVR::GetGridSize() const
+{
+ const int task_size_x =
+ DivideRoundUp(dst_[0]->Width() * dst_[0]->Batch(), conv_params_.block_size.x);
+ const int task_size_y = DivideRoundUp(dst_[0]->Height(), conv_params_.block_size.y);
+ const int task_size_z = DivideRoundUp(dst_[0]->Depth(), conv_params_.block_size.z);
+ const int task_size_s = DivideRoundUp(dst_[0]->Slices(), conv_params_.block_size.w);
+ int3 wg;
+
+ if (conv_params_.linear_spatial)
+ {
+ int grid_x = task_size_x * task_size_y;
+ if (definition_.src_tensors[0].HasAxis(Axis::DEPTH))
+ {
+ grid_x *= task_size_z;
+ }
+ return int3(grid_x, task_size_s, 1);
+ }
+ else
+ {
+ int grid_y = task_size_y;
+ if (definition_.src_tensors[0].HasAxis(Axis::DEPTH))
+ {
+ grid_y *= task_size_z;
+ }
+ return int3(task_size_x, grid_y, task_size_s);
+ }
+}
+
+void ConvPowerVR::GetPossibleKernelWorkGroups(TuningType tuning_type, const DeviceInfo &device_info,
+ const KernelInfo &kernel_info,
+ std::vector<int3> *work_groups) const
+{
+ if (conv_params_.weights_upload_type == WeightsUploadType::LOCAL_MEM_ASYNC_SUBGROUP ||
+ conv_params_.weights_upload_type == WeightsUploadType::LOCAL_MEM_BY_THREADS ||
+ conv_params_.fixed_work_group_size)
+ {
+ work_groups->push_back(work_group_size_);
+ return;
+ }
+ GetPossibleWorkGroupsConv(tuning_type, device_info, kernel_info, grid_size_, work_groups);
+}
+
+std::string ConvPowerVR::GenerateConv(const DeviceInfo &device_info, const OperationDef &op_def,
+ bool stride_correction, const ConvParams &conv_params)
+{
+ auto src_desc = op_def.src_tensors[0];
+ src_desc.SetTextureAddressMode(TextureAddressMode::ZERO);
+ if (op_def.IsBatchSupported())
+ {
+ src_desc.SetStateVar("BatchedWidth", "true");
+ }
+ AddSrcTensor("src_tensor", src_desc);
+ if (op_def.src_tensors.size() == 2)
+ {
+ // dynamic weights
+ BufferDescriptor desc;
+ desc.element_type = op_def.src_tensors[1].data_type;
+ desc.element_size = 4;
+ desc.memory_type =
+ conv_params.weights_upload_type == ConvPowerVR::WeightsUploadType::CONSTANT_MEM
+ ? MemoryType::CONSTANT
+ : MemoryType::GLOBAL;
+
+ AddSrcBuffer("weights", desc);
+ }
+
+ const auto &src_def = op_def.src_tensors[0];
+
+ auto generate_id = [&](const std::string &x, const std::string &y, const std::string &z) {
+ std::string id;
+ if (src_def.HasAxis(Axis::WIDTH))
+ {
+ id += "_w" + x;
+ }
+ if (src_def.HasAxis(Axis::HEIGHT))
+ {
+ id += "_h" + y;
+ }
+ if (src_def.HasAxis(Axis::DEPTH))
+ {
+ id += "_d" + z;
+ }
+ return id;
+ };
+
+ auto generate_id_full = [&](const std::string &x, const std::string &y, const std::string &z,
+ const std::string &s) { return generate_id(x, y, z) + "_s" + s; };
+
+ auto generate_check = [&](const std::string &x, const std::string &y, const std::string &z) {
+ std::string check;
+ const std::vector<Axis> axes{Axis::WIDTH, Axis::HEIGHT, Axis::DEPTH};
+ const std::vector<std::string> names{"in_x", "in_y", "in_z"};
+ const std::vector<bool> is_1{conv_params_.x_kernel_is_1, conv_params_.y_kernel_is_1,
+ conv_params_.z_kernel_is_1};
+ const std::vector<std::string> coords{x, y, z};
+ for (size_t i = 0; i < axes.size(); ++i)
+ {
+ const auto &axis = axes[i];
+ if (src_def.HasAxis(axis) && !src_def.SupportsZeroClamp(axis) && !is_1[i])
+ {
+ if (!check.empty())
+ {
+ check += " && ";
+ }
+ check += names[i] + coords[i];
+ }
+ }
+ return check;
+ };
+
+ auto dst_desc = op_def.dst_tensors[0];
+ if (op_def.IsBatchSupported())
+ {
+ dst_desc.SetStateVar("BatchedWidth", "true");
+ }
+ AddDstTensor("dst_tensor", dst_desc);
+
+ if (!conv_params_.x_kernel_is_1)
+ {
+ args_.AddInt("stride_x");
+ args_.AddInt("padding_x");
+ args_.AddInt("kernel_size_x");
+ args_.AddInt("dilation_x");
+ }
+ if (!conv_params_.y_kernel_is_1)
+ {
+ args_.AddInt("stride_y");
+ args_.AddInt("padding_y");
+ args_.AddInt("kernel_size_y");
+ args_.AddInt("dilation_y");
+ }
+ if (src_def.HasAxis(Axis::DEPTH) && !conv_params_.z_kernel_is_1)
+ {
+ args_.AddInt("stride_z");
+ args_.AddInt("padding_z");
+ args_.AddInt("kernel_size_z");
+ args_.AddInt("dilation_z");
+ }
+ if (conv_params_.linear_spatial)
+ {
+ args_.AddInt("task_size_x");
+ }
+ if (src_def.HasAxis(Axis::DEPTH))
+ {
+ args_.AddInt("task_size_y");
+ }
+
+ const bool need_local_mem =
+ conv_params.weights_upload_type == ConvPowerVR::WeightsUploadType::LOCAL_MEM_BY_THREADS ||
+ conv_params.weights_upload_type == ConvPowerVR::WeightsUploadType::LOCAL_MEM_ASYNC_SUBGROUP;
+
+ const int local_mem_size = conv_params.block_size.w * 4 * conv_params.src_depth_loop_size;
+
+ const bool use_simd_broadcast = conv_params.IsPrivateMemBroadcast();
+ const int simd_size = conv_params.simd_size;
+
+ const bool late_oob_check = need_local_mem || use_simd_broadcast;
+
+ const std::string weights_space =
+ conv_params.weights_upload_type == ConvPowerVR::WeightsUploadType::CONSTANT_MEM ? "__constant"
+ : "__global";
+
+ const std::string weights_data_type =
+ conv_params.weights_data_type == DataType::FLOAT32 ? "float4" : "half4";
+
+ const std::string weights_global_ptr = weights_space + " " + weights_data_type + "*";
+
+ std::string c = GetCommonDefines(op_def.precision);
+ if (use_simd_broadcast)
+ {
+ if (device_info.cl_version == OpenCLVersion::CL_2_0)
+ {
+ c += "#pragma OPENCL EXTENSION cl_khr_subgroups : enable\n";
+ }
+ else if (device_info.SupportsExtension("cl_intel_subgroups"))
+ {
+ c += "#pragma OPENCL EXTENSION cl_intel_subgroups : enable\n";
+ }
+ }
+ const int4 block_size = conv_params.block_size;
+ if (conv_params.fixed_work_group_size)
+ {
+ c += "__attribute__((reqd_work_group_size(" + std::to_string(work_group_size_.x) + ", " +
+ std::to_string(work_group_size_.y) + ", " + std::to_string(work_group_size_.z) + ")))\n";
+ }
+ if (use_simd_broadcast && device_info.IsIntel())
+ {
+ c += "__attribute__((intel_reqd_sub_group_size(" + std::to_string(simd_size) + ")))\n";
+ }
+ std::string dst_oob_check;
+ if (src_def.HasAxis(Axis::DEPTH))
+ {
+ if (conv_params.linear_spatial)
+ {
+ dst_oob_check = "DST_Z >= args.dst_tensor.Depth() || DST_S >= "
+ "args.dst_tensor.Slices()";
+ }
+ else
+ {
+ dst_oob_check = "DST_X >= args.dst_tensor.Width() || DST_Z >= "
+ "args.dst_tensor.Depth() || DST_S >= args.dst_tensor.Slices()";
+ }
+ }
+ else
+ {
+ if (conv_params.linear_spatial)
+ {
+ dst_oob_check = "DST_Y >= args.dst_tensor.Height() || DST_S >= "
+ "args.dst_tensor.Slices()";
+ }
+ else
+ {
+ dst_oob_check = "DST_X >= args.dst_tensor.Width() || DST_Y >= "
+ "args.dst_tensor.Height() || DST_S >= args.dst_tensor.Slices()";
+ }
+ }
+ c += "__kernel void main_function(\n";
+ c += "$0) {\n";
+ c += GenerateBlockCoords(conv_params.block_size, work_group_launch_order_,
+ conv_params.linear_spatial, src_def.HasAxis(Axis::DEPTH));
+ if (!late_oob_check)
+ {
+ c += " if (" + dst_oob_check + ") {\n";
+ c += " return;\n";
+ c += " }\n";
+ }
+ if (conv_params.weights_upload_type == ConvPowerVR::WeightsUploadType::LOCAL_MEM_BY_THREADS)
+ {
+ if (conv_params.linear_spatial)
+ {
+ c += " int lid = get_local_id(0);\n";
+ }
+ else
+ {
+ c += " int lid = get_local_id(1) * " + std::to_string(work_group_size_.x) +
+ " + get_local_id(0);\n";
+ }
+ }
+ if (use_simd_broadcast)
+ {
+ c += " int simd_id = get_sub_group_local_id();\n";
+ }
+ for (int s = 0; s < block_size.w; ++s)
+ {
+ const std::string sind = std::to_string(s);
+ for (int z = 0; z < block_size.z; ++z)
+ {
+ const std::string zind = std::to_string(z);
+ for (int y = 0; y < block_size.y; ++y)
+ {
+ const std::string yind = std::to_string(y);
+ for (int x = 0; x < block_size.x; ++x)
+ {
+ const std::string xind = std::to_string(x);
+ c += " ACCUM_FLT4 r" + generate_id_full(xind, yind, zind, sind) +
+ " = (ACCUM_FLT4)(0.0f, 0.0f, 0.0f, 0.0f);\n";
+ }
+ }
+ }
+ }
+ if (!conv_params_.x_kernel_is_1)
+ {
+ for (int x = 0; x < block_size.x; ++x)
+ {
+ const std::string xind = std::to_string(x);
+ const std::string xc = "(DST_X + " + xind + ")";
+ if (stride_correction)
+ {
+ c += " int xc" + xind + " = " +
+ GetXStrideCorrected(xc, "args.src_tensor.Batch()", "args.stride_x", "args.padding_x") +
+ ";\n";
+ }
+ else
+ {
+ c += " int xc" + xind + " = " + xc + " * args.stride_x + args.padding_x;\n";
+ }
+ }
+ }
+ else
+ {
+ for (int x = 0; x < block_size.x; ++x)
+ {
+ const std::string xind = std::to_string(x);
+ c += " int xc" + xind + " = DST_X + " + xind + ";\n";
+ if (!src_def.CanReadOutOfBorder(Axis::WIDTH))
+ {
+ c += " xc" + xind + " = clamp(xc" + xind + ", 0, args.src_tensor.Width() - 1);\n";
+ }
+ }
+ }
+ if (!conv_params_.y_kernel_is_1)
+ {
+ for (int y = 0; y < block_size.y; ++y)
+ {
+ const std::string yind = std::to_string(y);
+ const std::string yc = "(DST_Y + " + yind + ")";
+ c += " int yc" + yind + " = " + yc + " * args.stride_y + args.padding_y;\n";
+ }
+ }
+ else
+ {
+ for (int y = 0; y < block_size.y; ++y)
+ {
+ const std::string yind = std::to_string(y);
+ c += " int yc" + yind + " = DST_Y + " + yind + ";\n";
+ if (!src_def.CanReadOutOfBorder(Axis::HEIGHT))
+ {
+ c += " yc" + yind + " = clamp(yc" + yind + ", 0, args.src_tensor.Height() - 1);\n";
+ }
+ }
+ }
+ if (src_def.HasAxis(Axis::DEPTH))
+ {
+ if (!conv_params_.z_kernel_is_1)
+ {
+ for (int z = 0; z < block_size.z; ++z)
+ {
+ const std::string zind = std::to_string(z);
+ const std::string zc = "(DST_Z + " + zind + ")";
+ c += " int zc" + zind + " = " + zc + " * args.stride_z + args.padding_z;\n";
+ }
+ }
+ else
+ {
+ for (int z = 0; z < block_size.z; ++z)
+ {
+ const std::string zind = std::to_string(z);
+ c += " int zc" + zind + " = DST_Z + " + zind + ";\n";
+ if (!src_def.CanReadOutOfBorder(Axis::DEPTH))
+ {
+ c += " zc" + zind + " = clamp(zc" + zind + ", 0, args.src_tensor.Depth() - 1);\n";
+ }
+ }
+ }
+ }
+ bool trivial_kernel_size = conv_params_.x_kernel_is_1 && conv_params_.y_kernel_is_1;
+ if (src_def.HasAxis(Axis::DEPTH))
+ {
+ trivial_kernel_size = trivial_kernel_size && conv_params_.z_kernel_is_1;
+ }
+ if (need_local_mem)
+ {
+ c += " __local " + weights_data_type + " weights_cache[" + std::to_string(local_mem_size) +
+ "];\n";
+ }
+ else if (conv_params.AreWeightsBuffer())
+ {
+ c += " " + weights_global_ptr + " weights_cache;\n";
+ }
+ else if (!trivial_kernel_size)
+ {
+ c += " int filter_offset = 0;\n";
+ }
+ if (conv_params.AreWeightsBuffer())
+ {
+ if (conv_params.different_weights_for_height)
+ {
+ c += " " + weights_global_ptr +
+ " filters_loc = args.weights.GetPtr() + (DST_S * "
+ "args.src_tensor.Height() + DST_Y * " +
+ std::to_string(block_size.w) + ") * 4 * args.src_tensor.Slices();\n";
+ }
+ else
+ {
+ std::string kernel_spatial_offset = "";
+ if (!conv_params_.x_kernel_is_1)
+ {
+ kernel_spatial_offset += " * args.kernel_size_x";
+ }
+ if (!conv_params_.y_kernel_is_1)
+ {
+ kernel_spatial_offset += " * args.kernel_size_y";
+ }
+ if (src_def.HasAxis(Axis::DEPTH) && !conv_params_.z_kernel_is_1)
+ {
+ kernel_spatial_offset += " * args.kernel_size_z";
+ }
+ c += " " + weights_global_ptr +
+ " filters_loc = args.weights.GetPtr() + DST_S * 4 * "
+ "args.src_tensor.Slices()" +
+ kernel_spatial_offset + ";\n";
+ }
+ }
+ if (src_def.HasAxis(Axis::DEPTH) && !conv_params_.z_kernel_is_1)
+ {
+ c += " for (int kz = 0; kz < args.kernel_size_z; ++kz) {\n";
+ for (int z = 0; z < block_size.z; ++z)
+ {
+ const std::string zck = "zck" + std::to_string(z);
+ c += " int zck" + std::to_string(z) + " = kz * args.dilation_z + zc" + std::to_string(z) +
+ ";\n";
+ if (!src_def.SupportsZeroClamp(Axis::DEPTH))
+ {
+ c += " bool in_z" + std::to_string(z) + " = " + zck + " >= 0 && " + zck +
+ " < args.src_tensor.Depth();\n";
+ if (!src_def.CanReadOutOfBorder(Axis::DEPTH))
+ {
+ c += " " + zck + " = clamp(" + zck + ", 0, args.src_tensor.Depth() - 1);\n";
+ }
+ }
+ }
+ }
+ if (!conv_params_.y_kernel_is_1)
+ {
+ c += " for (int ky = 0; ky < args.kernel_size_y; ++ky) {\n";
+ for (int y = 0; y < block_size.y; ++y)
+ {
+ const std::string yck = "yck" + std::to_string(y);
+ c += " int " + yck + " = ky * args.dilation_y + yc" + std::to_string(y) + ";\n";
+ if (!src_def.SupportsZeroClamp(Axis::HEIGHT))
+ {
+ c += " bool in_y" + std::to_string(y) + " = " + yck + " >= 0 && " + yck +
+ " < args.src_tensor.Height();\n";
+ if (!src_def.CanReadOutOfBorder(Axis::HEIGHT))
+ {
+ c += " " + yck + " = clamp(" + yck + ", 0, args.src_tensor.Height() - 1);\n";
+ }
+ }
+ }
+ }
+ if (!conv_params_.x_kernel_is_1)
+ {
+ c += " for (int kx = 0; kx < args.kernel_size_x; ++kx) {\n";
+ for (int x = 0; x < block_size.x; ++x)
+ {
+ const std::string xck = "xck" + std::to_string(x);
+ c += " int xck" + std::to_string(x) + " = kx * args.dilation_x + xc" + std::to_string(x) +
+ ";\n";
+ if (!src_def.SupportsZeroClamp(Axis::WIDTH))
+ {
+ c += " bool in_x" + std::to_string(x) + " = " + xck + " >= 0 && " + xck +
+ " < args.src_tensor.Width();\n";
+ if (!src_def.CanReadOutOfBorder(Axis::WIDTH))
+ {
+ c += " " + xck + " = clamp(" + xck + ", 0, args.src_tensor.Width() - 1);\n";
+ }
+ }
+ }
+ }
+ const bool need_multiple_slice_strides =
+ src_def.ReturnsZeroForNegOneRead() && !trivial_kernel_size;
+ for (int z = 0; z < block_size.z; ++z)
+ {
+ const std::string zind = std::to_string(z);
+ for (int y = 0; y < block_size.y; ++y)
+ {
+ const std::string yind = std::to_string(y);
+ for (int x = 0; x < block_size.x; ++x)
+ {
+ const std::string xind = std::to_string(x);
+ std::string xc = conv_params.x_kernel_is_1 ? "xc" + xind : "xck" + xind;
+ std::string yc = conv_params.y_kernel_is_1 ? "yc" + yind : "yck" + yind;
+ const std::string id = generate_id(xind, yind, zind);
+ std::string coords = "" + xc + ", " + yc;
+ if (src_def.HasAxis(Axis::DEPTH))
+ {
+ std::string zc = conv_params.z_kernel_is_1 ? "zc" + zind : "zck" + zind;
+ coords += ", " + zc;
+ }
+ if (src_def.IsLinear())
+ {
+ c += " args.src_tensor.GetAddress(addr" + id + ", " + coords + ", 0);\n";
+ if (need_multiple_slice_strides)
+ {
+ const std::string check = generate_check(xind, yind, zind);
+ c += " addr" + id + " = select(-1, addr" + id + ", (" + check + "));\n";
+ c +=
+ " int ds" + id + " = select(0, args.src_tensor.SliceStride(), (" + check + "));\n";
+ }
+ }
+ }
+ }
+ }
+ if (src_def.IsLinear() && !need_multiple_slice_strides)
+ {
+ c += " int ds = args.src_tensor.SliceStride();\n";
+ }
+
+ auto declare_src = [&]() {
+ for (int z = 0; z < block_size.z; ++z)
+ {
+ const std::string zind = std::to_string(z);
+ for (int y = 0; y < block_size.y; ++y)
+ {
+ const std::string yind = std::to_string(y);
+ for (int x = 0; x < block_size.x; ++x)
+ {
+ const std::string xind = std::to_string(x);
+ const std::string id = generate_id(xind, yind, zind);
+ c += " " + weights_data_type + " src" + id + ";\n";
+ }
+ }
+ }
+ };
+ const bool conditional_read = device_info.IsMali();
+ auto read_src = [&]() {
+ const std::string cl_type = ToCLDataType(conv_params.weights_data_type);
+ for (int z = 0; z < block_size.z; ++z)
+ {
+ const std::string zind = std::to_string(z);
+ for (int y = 0; y < block_size.y; ++y)
+ {
+ const std::string yind = std::to_string(y);
+ for (int x = 0; x < block_size.x; ++x)
+ {
+ const std::string xind = std::to_string(x);
+ std::string id = generate_id(xind, yind, zind);
+ const std::string check = generate_check(xind, yind, zind);
+ std::string address;
+ if (src_def.IsLinear())
+ {
+ address = "addr" + id;
+ }
+ else
+ {
+ std::string xc = conv_params.x_kernel_is_1 ? "xc" + xind : "xck" + xind;
+ std::string yc = conv_params.y_kernel_is_1 ? "yc" + yind : "yck" + yind;
+ address = "" + xc + ", " + yc;
+ if (src_def.HasAxis(Axis::DEPTH))
+ {
+ std::string zc = conv_params.z_kernel_is_1 ? "zc" + zind : "zck" + zind;
+ address += ", " + zc;
+ }
+ address += ", s";
+ }
+ if (src_def.ReturnsZeroForNegOneRead())
+ {
+ c += " src" + id + " = args.src_tensor.Read<" + cl_type + ">(" + address + ");\n";
+ const std::string ds = trivial_kernel_size ? "ds" : "ds" + id;
+ c += " " + address + " += " + ds + ";\n";
+ }
+ else
+ {
+ if (!check.empty())
+ {
+ if (conditional_read)
+ {
+ c += " src" + id + " = " + check + " ? args.src_tensor.Read<" + cl_type + ">(" +
+ address + ") : (FLT4)(0.0f);\n";
+ }
+ else
+ {
+ c += " src" + id + " = args.src_tensor.Read<" + cl_type + ">(" + address +
+ ") * (FLT)(" + check + ");\n";
+ }
+ }
+ else
+ {
+ c += " src" + id + " = args.src_tensor.Read<" + cl_type + ">(" + address + ");\n";
+ }
+ if (src_def.IsLinear())
+ {
+ c += " " + address + " += ds;\n";
+ }
+ }
+ }
+ }
+ }
+ };
+ const bool weights_type_as_accum_type = !(op_def.precision == CalculationsPrecision::F32_F16 &&
+ conv_params.weights_data_type == DataType::FLOAT16);
+ auto conv_core = [&](int shared_offset) {
+ const std::string channels[] = {"x", "y", "z", "w"};
+ for (int s = 0; s < block_size.w; ++s)
+ {
+ const std::string sind = std::to_string(s);
+ if (weights_type_as_accum_type)
+ {
+ for (int ch = 0; ch < 4; ++ch)
+ {
+ for (int z = 0; z < block_size.z; ++z)
+ {
+ const std::string zind = std::to_string(z);
+ for (int y = 0; y < block_size.y; ++y)
+ {
+ const std::string yind = std::to_string(y);
+ for (int x = 0; x < block_size.x; ++x)
+ {
+ const std::string xind = std::to_string(x);
+ std::string R = "r" + generate_id_full(xind, yind, zind, sind);
+ std::string S = "src" + generate_id(xind, yind, zind);
+ if (use_simd_broadcast)
+ {
+ int simd_id = (s * 4 + ch + shared_offset) / simd_size;
+ int thread_id = (s * 4 + ch + shared_offset) % simd_size;
+ std::string w_val_x = "sub_group_broadcast(simd_w" + std::to_string(simd_id) +
+ ".x, " + std::to_string(thread_id) + "u)";
+ std::string w_val_y = "sub_group_broadcast(simd_w" + std::to_string(simd_id) +
+ ".y, " + std::to_string(thread_id) + "u)";
+ std::string w_val_z = "sub_group_broadcast(simd_w" + std::to_string(simd_id) +
+ ".z, " + std::to_string(thread_id) + "u)";
+ std::string w_val_w = "sub_group_broadcast(simd_w" + std::to_string(simd_id) +
+ ".w, " + std::to_string(thread_id) + "u)";
+ c += " " + R + ".x += " + w_val_x + " * " + S + "." + channels[ch] + ";\n";
+ c += " " + R + ".y += " + w_val_y + " * " + S + "." + channels[ch] + ";\n";
+ c += " " + R + ".z += " + w_val_z + " * " + S + "." + channels[ch] + ";\n";
+ c += " " + R + ".w += " + w_val_w + " * " + S + "." + channels[ch] + ";\n";
+ }
+ else
+ {
+ const std::string weight_id = std::to_string(s * 4 + ch + shared_offset);
+ std::string w_val;
+ if (conv_params.AreWeightsBuffer())
+ {
+ w_val = "weights_cache[" + weight_id + "]";
+ }
+ else
+ {
+ w_val = "f" + weight_id;
+ }
+ c += " " + R + " += " + w_val + " * " + S + "." + channels[ch] + ";\n";
+ }
+ }
+ }
+ }
+ }
+ }
+ else
+ { // F32_F16 precision and weights type is float16
+ for (int z = 0; z < block_size.z; ++z)
+ {
+ const std::string zind = std::to_string(z);
+ for (int y = 0; y < block_size.y; ++y)
+ {
+ const std::string yind = std::to_string(y);
+ for (int x = 0; x < block_size.x; ++x)
+ {
+ const std::string xind = std::to_string(x);
+ std::string R = "r" + generate_id_full(xind, yind, zind, sind);
+ std::string S = "src" + generate_id(xind, yind, zind);
+ std::vector<std::string> F(4);
+ for (int i = 0; i < 4; ++i)
+ {
+ std::string weight_id = std::to_string(s * 4 + i + shared_offset);
+ if (conv_params.AreWeightsBuffer())
+ {
+ F[i] = "weights_cache[" + weight_id + "]";
+ }
+ else
+ {
+ F[i] = "f" + weight_id;
+ }
+ }
+ c += " " + R + " += convert_float4(" + S + ".x * " + F[0] + " + " + S + ".y * " +
+ F[1] + " + " + S + ".z * " + F[2] + " + " + S + ".w * " + F[3] + ");\n";
+ }
+ }
+ }
+ }
+ }
+ };
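+ // For the weights-buffer path without SIMD broadcast, conv_core emits one
+ // multiply-accumulate per block element and input channel, of the form
+ //   r<id> += weights_cache[<s * 4 + ch + offset>] * src<id>.<x|y|z|w>;
+ // where <id> comes from the generate_id/generate_id_full helpers defined above.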
+
+ c += " int s = 0;\n";
+ c += " do {\n";
+ declare_src();
+ const int total_work_items = work_group_size_.x * work_group_size_.y * work_group_size_.z;
+ if (conv_params.weights_upload_type == ConvPowerVR::WeightsUploadType::LOCAL_MEM_ASYNC_SUBGROUP)
+ {
+ c += GenerateAsyncUpload("weights_cache", "filters_loc",
+ /*global_offset_name*/ "", local_mem_size);
+ }
+ else if (conv_params.weights_upload_type == ConvPowerVR::WeightsUploadType::LOCAL_MEM_BY_THREADS)
+ {
+ c += " barrier(CLK_LOCAL_MEM_FENCE);\n";
+ c +=
+ GenerateUploadByThreads("weights_cache", "filters_loc",
+ /*global_offset_name*/ "", "lid", total_work_items, local_mem_size);
+ }
+ else if (use_simd_broadcast)
+ {
+ int parts = local_mem_size / simd_size;
+ int remainder = local_mem_size % simd_size;
+ for (int i = 0; i < parts; ++i)
+ {
+ c += " FLT4 simd_w" + std::to_string(i) + " = filters_loc[simd_id + " +
+ std::to_string(i * simd_size) + "];\n";
+ }
+ if (remainder)
+ {
+ c += " FLT4 simd_w" + std::to_string(parts) + ";\n";
+ c += " if (simd_id < " + std::to_string(remainder) + ") {\n";
+ c += " simd_w" + std::to_string(parts) + " = filters_loc[simd_id + " +
+ std::to_string(parts * simd_size) + "];\n";
+ c += " }\n";
+ }
+ }
+ else if (conv_params.AreWeightsBuffer())
+ { // GLOBAL_MEM/CONSTANT_MEM
+ c += " weights_cache = filters_loc;\n";
+ }
+ else
+ { // TEXTURES_MEM
+ for (int dst_s = 0; dst_s < block_size.w; ++dst_s)
+ {
+ std::string f_y = trivial_kernel_size ? "s" : "filter_offset";
+ if (conv_params.different_weights_for_height)
+ {
+ f_y = "DST_Y * args.src_tensor.Slices() + s";
+ }
+ c += absl::Substitute(
+ R"( FLT4 f$2 = args.weights0.Read(DST_S + $0, $1);
+ FLT4 f$3 = args.weights1.Read(DST_S + $0, $1);
+ FLT4 f$4 = args.weights2.Read(DST_S + $0, $1);
+ FLT4 f$5 = args.weights3.Read(DST_S + $0, $1);
+)",
+ dst_s, f_y, dst_s * 4 + 0, dst_s * 4 + 1, dst_s * 4 + 2, dst_s * 4 + 3);
+ }
+ if (!trivial_kernel_size)
+ {
+ c += " filter_offset++;\n";
+ }
+ }
+ read_src();
+ c += " s += 1;\n";
+ if (conv_params.weights_upload_type == ConvPowerVR::WeightsUploadType::LOCAL_MEM_BY_THREADS)
+ {
+ c += " barrier(CLK_LOCAL_MEM_FENCE);\n";
+ }
+ conv_core(0);
+ for (int i = 1; i < conv_params.src_depth_loop_size; ++i)
+ {
+ read_src();
+ conv_core(i * block_size.w * 4);
+ c += " s += 1;\n";
+ }
+ if (conv_params.AreWeightsBuffer())
+ {
+ c += " filters_loc += " + std::to_string(local_mem_size) + ";\n";
+ }
+ c += " } while (s < args.src_tensor.Slices());\n";
+ if (!conv_params.x_kernel_is_1)
+ {
+ c += " };\n";
+ }
+ if (!conv_params.y_kernel_is_1)
+ {
+ c += " };\n";
+ }
+ if (src_def.HasAxis(Axis::DEPTH) && !conv_params_.z_kernel_is_1)
+ {
+ c += " };\n";
+ }
+ if (conv_params.AreWeightsBuffer())
+ {
+ if (conv_params.weights_upload_type == ConvPowerVR::WeightsUploadType::LOCAL_MEM_ASYNC_SUBGROUP)
+ {
+ c += GenerateAsyncUpload("weights_cache", "args.biases.GetPtr()", "DST_S", block_size.w);
+ }
+ else if (conv_params.weights_upload_type ==
+ ConvPowerVR::WeightsUploadType::LOCAL_MEM_BY_THREADS)
+ {
+ c += " barrier(CLK_LOCAL_MEM_FENCE);\n";
+ c += GenerateUploadByThreads("weights_cache", "args.biases.GetPtr()", "DST_S", "lid",
+ total_work_items, block_size.w);
+ c += " barrier(CLK_LOCAL_MEM_FENCE);\n";
+ }
+ else
+ {
+ c += " weights_cache = args.biases.GetPtr() + DST_S;\n";
+ }
+ }
+ if (late_oob_check)
+ {
+ c += " if (" + dst_oob_check + ") {\n";
+ c += " return;\n";
+ c += " }\n";
+ }
+
+ auto generate_dst_check = [&](int x, int y, int z) {
+ std::string check;
+ const std::vector<Axis> axes{Axis::WIDTH, Axis::HEIGHT, Axis::DEPTH};
+ const std::vector<std::string> names{"Width()", "Height()", "Depth()"};
+ std::vector<std::string> coords(3);
+ coords[0] = "DST_X + " + std::to_string(x);
+ coords[1] = "DST_Y + " + std::to_string(y);
+ coords[2] = "DST_Z + " + std::to_string(z);
+ const std::vector<int> ids{x, y, z};
+ for (size_t i = 0; i < axes.size(); ++i)
+ {
+ const auto &axis = axes[i];
+ if (src_def.HasAxis(axis) && ids[i] != 0)
+ {
+ if (!check.empty())
+ {
+ check += " && ";
+ }
+ check += coords[i] + " < args.dst_tensor." + names[i];
+ }
+ }
+ return check;
+ };
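+ // Example of a generated destination-bounds check for block element (x=1, y=1, z=0)
+ // on a tensor without a DEPTH axis:
+ //   DST_X + 1 < args.dst_tensor.Width() && DST_Y + 1 < args.dst_tensor.Height()
+ // For (0, 0, 0) the check is empty, since the base coordinate is already covered
+ // by the kernel's out-of-bounds return.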
+
+ for (int s = 0; s < block_size.w; ++s)
+ {
+ const std::string sind = std::to_string(s);
+ c += " if (DST_S + " + sind + " >= args.dst_tensor.Slices()) return;\n";
+ c += " {\n";
+ if (conv_params.AreWeightsBuffer())
+ {
+ c += " FLT4 bias_val = TO_FLT4(weights_cache[" + sind + "]);\n";
+ }
+ else
+ {
+ c += " FLT4 bias_val = args.biases.Read(DST_S + " + sind + ");\n";
+ }
+ for (int z = 0; z < block_size.z; ++z)
+ {
+ const std::string zind = std::to_string(z);
+ for (int y = 0; y < block_size.y; ++y)
+ {
+ const std::string yind = std::to_string(y);
+ for (int x = 0; x < block_size.x; ++x)
+ {
+ const std::string xind = std::to_string(x);
+ const std::string id = generate_id_full(xind, yind, zind, sind);
+ const std::string check = generate_dst_check(x, y, z);
+ std::string coords = "DST_X + " + xind + ", DST_Y + " + yind;
+ if (src_def.HasAxis(Axis::DEPTH))
+ {
+ coords += ", DST_Z + " + zind;
+ }
+ coords += ", DST_S + " + sind;
+ if (!check.empty())
+ {
+ c += " if (" + check + ") {\n";
+ }
+ else
+ {
+ c += " {\n";
+ }
+ c += " FLT4 res = TO_FLT4(r" + id + ") + bias_val;\n";
+ c += " args.dst_tensor.Write(res, " + coords + ");\n";
+ c += " }\n";
+ }
+ }
+ }
+ c += " }\n";
+ }
+ c += "}\n";
+ return c;
+}
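+// Note: the string returned above is the complete OpenCL kernel source: coordinate
+// setup, the do/while loop over source slices with the per-block multiply-accumulate,
+// the bias add, and the guarded writes to dst_tensor.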
+
+ConvPowerVR::ConvParams
+ConvPowerVR::GuessBestParams(const DeviceInfo &device_info, const OperationDef &definition,
+ int src_depth, int dst_depth, bool x_kernel_is_1, bool y_kernel_is_1,
+ bool different_weights_for_height, const BHWC *dst_shape)
+{
+ ConvParams conv_params;
+ conv_params.linear_spatial = false;
+ conv_params.weights_data_type = DeduceDataTypeFromPrecision(definition.precision);
+ conv_params.x_kernel_is_1 = x_kernel_is_1;
+ conv_params.y_kernel_is_1 = y_kernel_is_1;
+ conv_params.different_weights_for_height = different_weights_for_height;
+ if (device_info.IsNvidia())
+ {
+ if (different_weights_for_height)
+ {
+ work_group_size_ = int3(32, 1, 1);
+ work_group_launch_order_ = int3(2, 0, 1);
+ conv_params.fixed_work_group_size = true;
+ }
+ else
+ {
+ conv_params.linear_spatial = true;
+ work_group_size_ = int3(32, 1, 1);
+ work_group_launch_order_ = int3(1, 0, 2);
+ conv_params.fixed_work_group_size = true;
+ }
+ conv_params.block_size = int4(2, 1, 1, 4);
+ conv_params.src_depth_loop_size = 1;
+ conv_params.weights_upload_type = WeightsUploadType::LOCAL_MEM_BY_THREADS;
+ if (dst_depth % 4 == 0 || dst_depth >= 8)
+ {
+ conv_params.block_size.w = 4;
+ }
+ else if (dst_depth % 2 == 0 || dst_depth >= 4)
+ {
+ conv_params.block_size.w = 2;
+ }
+ else
+ {
+ conv_params.block_size.w = dst_depth;
+ }
+ if (dst_shape)
+ {
+ int task_size = dst_shape->w * dst_shape->b * dst_shape->h * dst_depth;
+ float task_size_per_cu = static_cast<float>(task_size) / device_info.compute_units_count;
+ int block_size =
+ conv_params.block_size.x * conv_params.block_size.y * conv_params.block_size.w;
+ float threads_per_cu = task_size_per_cu / block_size;
+ float warps_per_cu = threads_per_cu / 32 /*warp_size*/;
+ if (warps_per_cu < 8.0f)
+ {
+ conv_params.block_size.x = 1;
+ }
+ if (warps_per_cu < 4.0f && conv_params.block_size.w >= 4)
+ {
+ conv_params.block_size.w /= 2;
+ }
+ if (warps_per_cu < 2.0f && conv_params.block_size.w >= 2)
+ {
+ conv_params.block_size.w /= 2;
+ }
+ }
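+ // The block above trades block size for occupancy. For example (illustrative
+ // numbers), dst_shape 1x56x56 with dst_depth = 8 on a device reporting 16
+ // compute units: task_size = 56 * 56 * 1 * 8 = 25088, task_size_per_cu = 1568,
+ // block_size = 2 * 1 * 4 = 8, threads_per_cu = 196, warps_per_cu ~= 6.1, so
+ // block_size.x is reduced to 1 while block_size.w stays 4.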
+ if (src_depth % 2 == 0)
+ {
+ conv_params.src_depth_loop_size = 2;
+ }
+ if (src_depth % 4 == 0 && conv_params.block_size.w <= 2)
+ {
+ conv_params.src_depth_loop_size = 4;
+ }
+ }
+ else if (device_info.IsPowerVR())
+ {
+ if (different_weights_for_height)
+ {
+ work_group_size_ = int3(32, 1, 1);
+ work_group_launch_order_ = int3(2, 0, 1);
+ conv_params.fixed_work_group_size = true;
+ }
+ else
+ {
+ conv_params.linear_spatial = true;
+ work_group_size_ = int3(32, 1, 1);
+ work_group_launch_order_ = int3(1, 0, 2);
+ conv_params.fixed_work_group_size = true;
+ }
+ conv_params.weights_data_type =
+ definition.precision == CalculationsPrecision::F16 ? DataType::FLOAT16 : DataType::FLOAT32;
+ conv_params.block_size = int4(1, 1, 1, 4);
+ conv_params.src_depth_loop_size = 1;
+ conv_params.weights_upload_type = WeightsUploadType::LOCAL_MEM_ASYNC_SUBGROUP;
+ if (dst_depth % 8 == 0 || dst_depth >= 32)
+ {
+ conv_params.block_size.w = 8;
+ }
+ else if (dst_depth % 4 == 0 || dst_depth >= 8)
+ {
+ conv_params.block_size.w = 4;
+ }
+ else if (dst_depth % 2 == 0 || dst_depth >= 4)
+ {
+ conv_params.block_size.w = 2;
+ }
+ else
+ {
+ conv_params.block_size.w = dst_depth;
+ }
+ if (definition.precision == CalculationsPrecision::F16)
+ {
+ conv_params.block_size.w = std::min(4, conv_params.block_size.w);
+ if (src_depth % 2 == 0)
+ {
+ conv_params.src_depth_loop_size = 2;
+ }
+ if (src_depth % 4 == 0 && conv_params.block_size.w <= 2)
+ {
+ conv_params.src_depth_loop_size = 4;
+ }
+ if (conv_params.block_size.w == 1)
+ {
+ if (src_depth % 2 == 0)
+ {
+ conv_params.src_depth_loop_size = 2;
+ }
+ if (src_depth % 4 == 0)
+ {
+ conv_params.src_depth_loop_size = 4;
+ }
+ if (src_depth <= 8)
+ {
+ conv_params.src_depth_loop_size = src_depth;
+ }
+ }
+ conv_params.block_size.x = 2;
+ }
+ }
+ else if (device_info.IsAMD())
+ {
+ if (different_weights_for_height)
+ {
+ work_group_size_ = int3(32, 1, 1);
+ work_group_launch_order_ = int3(2, 0, 1);
+ conv_params.fixed_work_group_size = true;
+ }
+ else
+ {
+ work_group_size_ = int3(8, 4, 1);
+ work_group_launch_order_ = int3(2, 0, 1);
+ conv_params.fixed_work_group_size = true;
+ }
+
+ conv_params.block_size = int4(2, 1, 1, 1);
+ if (x_kernel_is_1 && y_kernel_is_1)
+ {
+ conv_params.block_size.y = 2;
+ }
+ conv_params.src_depth_loop_size = 1;
+ conv_params.weights_upload_type = WeightsUploadType::CONSTANT_MEM;
+ if (dst_depth % 8 == 0 || dst_depth >= 32)
+ {
+ conv_params.block_size.w = 8;
+ }
+ else if (dst_depth % 4 == 0 || dst_depth >= 8)
+ {
+ conv_params.block_size.w = 4;
+ }
+ else if (dst_depth % 2 == 0 || dst_depth >= 4)
+ {
+ conv_params.block_size.w = 2;
+ }
+ else
+ {
+ conv_params.block_size.w = 1;
+ }
+ if (src_depth % 2 == 0 && src_depth >= 16)
+ {
+ conv_params.src_depth_loop_size = 2;
+ }
+ }
+ else if (device_info.IsMali())
+ {
+ int block_size = 2;
+ if (dst_shape)
+ {
+ int task_size = dst_shape->w * dst_shape->b * dst_shape->h * dst_depth;
+ block_size = GetRecommendedBlockSizeForConv(device_info, definition.precision, task_size);
+ }
+ if (!x_kernel_is_1 || !y_kernel_is_1)
+ {
+ block_size = std::min(block_size, 4);
+ }
+ if (block_size == 8)
+ {
+ if (dst_depth == 1 || dst_depth == 3)
+ {
+ conv_params.block_size = int4(2, 2, 1, 1);
+ }
+ else
+ {
+ conv_params.block_size = int4(2, 2, 1, 2);
+ }
+ }
+ else if (block_size == 4)
+ {
+ if (dst_depth == 1 || dst_depth == 3)
+ {
+ conv_params.block_size = int4(2, 2, 1, 1);
+ }
+ else
+ {
+ conv_params.block_size = int4(2, 1, 1, 2);
+ }
+ }
+ else if (block_size == 2)
+ {
+ conv_params.block_size = int4(2, 1, 1, 1);
+ }
+ else
+ {
+ conv_params.block_size = int4(1, 1, 1, 1);
+ }
+ conv_params.src_depth_loop_size = 1;
+ MaliInfo mali_info = device_info.mali_info;
+ if (src_depth % 2 == 0 && block_size <= 2 && !mali_info.IsMidgard())
+ {
+ conv_params.src_depth_loop_size = 2;
+ }
+ if (src_depth % 4 == 0 && block_size == 1 && !mali_info.IsMidgard() &&
+ definition.precision == CalculationsPrecision::F16)
+ {
+ conv_params.src_depth_loop_size = 4;
+ }
+ work_group_size_ = int3(4, 4, 1);
+ work_group_launch_order_ = int3(0, 1, 2);
+ conv_params.fixed_work_group_size = false;
+ conv_params.weights_upload_type = WeightsUploadType::GLOBAL_MEM;
+ }
+ else if (device_info.IsAdreno())
+ {
+ conv_params.block_size = int4(2, 2, 1, 2);
+ if (device_info.IsAdreno3xx())
+ {
+ if (definition.precision == CalculationsPrecision::F16)
+ {
+ conv_params.block_size = int4(2, 2, 1, 2);
+ }
+ else if (definition.precision == CalculationsPrecision::F32_F16)
+ {
+ conv_params.block_size = int4(2, 1, 1, 2);
+ }
+ else
+ { // F32
+ conv_params.block_size = int4(2, 2, 1, 1);
+ }
+ }
+ work_group_size_ = int3(8, 2, 1);
+ work_group_launch_order_ = int3(0, 1, 2);
+ conv_params.fixed_work_group_size = false;
+ conv_params.src_depth_loop_size = 1;
+ if (definition.src_tensors.size() == 2)
+ {
+ // Dynamic weights are supported only with buffers.
+ conv_params.weights_upload_type = WeightsUploadType::GLOBAL_MEM;
+ }
+ else
+ {
+ conv_params.weights_upload_type = WeightsUploadType::TEXTURES_MEM_X4;
+ }
+ }
+ else if (device_info.IsIntel())
+ {
+ if (different_weights_for_height)
+ {
+ work_group_size_ = int3(16, 1, 1);
+ work_group_launch_order_ = int3(0, 1, 2);
+ conv_params.fixed_work_group_size = true;
+ }
+ else
+ {
+ conv_params.linear_spatial = true;
+ work_group_size_ = int3(16, 1, 1);
+ work_group_launch_order_ = int3(0, 1, 2);
+ conv_params.fixed_work_group_size = true;
+ }
+ conv_params.block_size = int4(1, 1, 1, 4);
+ conv_params.src_depth_loop_size = 1;
+ int sub_group_size = 16;
+ const bool supports_subgroups = device_info.SupportsExtension("cl_khr_subgroups") ||
+ device_info.SupportsExtension("cl_intel_subgroups");
+ if (definition.precision != CalculationsPrecision::F32_F16 && supports_subgroups &&
+ device_info.SupportsExtension("cl_intel_required_subgroup_size") &&
+ device_info.SupportsSubGroupWithSize(sub_group_size))
+ {
+ conv_params.weights_upload_type = WeightsUploadType::PRIVATE_MEM_SIMD_BROADCAST;
+ conv_params.simd_size = sub_group_size;
+ }
+ else
+ {
+ conv_params.weights_upload_type = WeightsUploadType::LOCAL_MEM_BY_THREADS;
+ }
+ if (dst_depth % 4 == 0 || dst_depth >= 8)
+ {
+ conv_params.block_size.w = 4;
+ }
+ else if (dst_depth % 2 == 0 || dst_depth >= 4)
+ {
+ conv_params.block_size.w = 2;
+ }
+ else
+ {
+ conv_params.block_size.w = dst_depth;
+ }
+ if (src_depth % 2 == 0)
+ {
+ conv_params.src_depth_loop_size = 2;
+ }
+ if (src_depth % 4 == 0 && conv_params.block_size.w <= 2)
+ {
+ conv_params.src_depth_loop_size = 4;
+ }
+ }
+ else
+ {
+ conv_params.block_size = int4(1, 1, 1, 4);
+ work_group_size_ = int3(8, 2, 1);
+ work_group_launch_order_ = int3(0, 1, 2);
+ conv_params.fixed_work_group_size = false;
+ conv_params.src_depth_loop_size = 1;
+ conv_params.weights_upload_type = WeightsUploadType::GLOBAL_MEM;
+ if (dst_depth % 4 == 0 || dst_depth >= 8)
+ {
+ conv_params.block_size.w = 4;
+ }
+ else if (dst_depth % 2 == 0 || dst_depth >= 4)
+ {
+ conv_params.block_size.w = 2;
+ }
+ else
+ {
+ conv_params.block_size.w = dst_depth;
+ }
+ if (src_depth % 2 == 0)
+ {
+ conv_params.src_depth_loop_size = 2;
+ }
+ if (src_depth % 4 == 0 && conv_params.block_size.w <= 2)
+ {
+ conv_params.src_depth_loop_size = 4;
+ }
+ }
+
+ return conv_params;
+}
+
+ConvPowerVR::ConvParams ConvPowerVR::GuessBestParams(const DeviceInfo &device_info,
+ const OperationDef &definition,
+ const Convolution2DAttributes &attr,
+ const BHWC *dst_shape)
+{
+ const int dst_depth = DivideRoundUp(attr.weights.shape.o, 4);
+ const int src_depth = DivideRoundUp(attr.weights.shape.i, 4);
+ const bool x_kernel_is_1 = attr.weights.shape.w == 1 && attr.strides.w == 1 &&
+ attr.dilations.w == 1 && attr.padding.prepended.w == 0 &&
+ attr.padding.appended.w == 0;
+ const bool y_kernel_is_1 = attr.weights.shape.h == 1 && attr.strides.h == 1 &&
+ attr.dilations.h == 1 && attr.padding.prepended.h == 0 &&
+ attr.padding.appended.h == 0;
+ return GuessBestParams(device_info, definition, src_depth, dst_depth, x_kernel_is_1,
+ y_kernel_is_1, false, dst_shape);
+}
+
+ConvPowerVR::ConvParams ConvPowerVR::GuessBestParams(const DeviceInfo &device_info,
+ const OperationDef &definition,
+ const Convolution3DAttributes &attr,
+ const BHWDC *dst_shape)
+{
+ const int dst_depth = DivideRoundUp(attr.weights.shape.o, 4);
+ const int src_depth = DivideRoundUp(attr.weights.shape.i, 4);
+ const bool x_kernel_is_1 = attr.weights.shape.w == 1 && attr.strides.w == 1 &&
+ attr.dilations.w == 1 && attr.padding.prepended.w == 0 &&
+ attr.padding.appended.w == 0;
+ const bool y_kernel_is_1 = attr.weights.shape.h == 1 && attr.strides.h == 1 &&
+ attr.dilations.h == 1 && attr.padding.prepended.h == 0 &&
+ attr.padding.appended.h == 0;
+ const bool z_kernel_is_1 = attr.weights.shape.d == 1 && attr.strides.d == 1 &&
+ attr.dilations.d == 1 && attr.padding.prepended.d == 0 &&
+ attr.padding.appended.d == 0;
+
+ ConvPowerVR::ConvParams result;
+ BHWC shape;
+ if (dst_shape)
+ {
+ shape.b = dst_shape->b;
+ shape.h = dst_shape->h * dst_shape->d;
+ shape.w = dst_shape->w;
+ shape.c = dst_shape->c;
+ result = GuessBestParams(device_info, definition, src_depth, dst_depth, x_kernel_is_1,
+ y_kernel_is_1, false, &shape);
+ }
+ else
+ {
+ result = GuessBestParams(device_info, definition, src_depth, dst_depth, x_kernel_is_1,
+ y_kernel_is_1, false, nullptr);
+ }
+ result.z_kernel_is_1 = z_kernel_is_1;
+ return result;
+}
+
+ConvPowerVR::ConvParams ConvPowerVR::GuessBestParams(const DeviceInfo &device_info,
+ const OperationDef &definition,
+ const Convolution2DAttributes &attr,
+ const BHWC &weights_shape,
+ const BHWC *dst_shape)
+{
+ const int dst_depth = DivideRoundUp(weights_shape.b, 4);
+ const int src_depth = DivideRoundUp(weights_shape.c, 4);
+ const bool x_kernel_is_1 = weights_shape.w == 1 && attr.strides.w == 1 && attr.dilations.w == 1 &&
+ attr.padding.prepended.w == 0 && attr.padding.appended.w == 0;
+ const bool y_kernel_is_1 = weights_shape.h == 1 && attr.strides.h == 1 && attr.dilations.h == 1 &&
+ attr.padding.prepended.h == 0 && attr.padding.appended.h == 0;
+ return GuessBestParams(device_info, definition, src_depth, dst_depth, x_kernel_is_1,
+ y_kernel_is_1, false, dst_shape);
+}
+
+ConvPowerVR::ConvParams ConvPowerVR::GuessBestParams(const DeviceInfo &device_info,
+ const OperationDef &definition,
+ const FullyConnectedAttributes &attr,
+ const BHWC *dst_shape)
+{
+ const int dst_depth = DivideRoundUp(attr.weights.shape.o, 4);
+ const int src_depth = DivideRoundUp(attr.weights.shape.i, 4);
+ ConvPowerVR::ConvParams params =
+ GuessBestParams(device_info, definition, src_depth, dst_depth, true, true, false, dst_shape);
+ work_group_size_.x *= work_group_size_.y;
+ work_group_size_.y = 1;
+ params.block_size.x *= params.block_size.y;
+ params.block_size.y = 1;
+ return params;
+}
+
+ConvPowerVR::ConvParams ConvPowerVR::GuessBestParamsWinograd(const DeviceInfo &device_info,
+ const OperationDef &definition,
+ const Convolution2DAttributes &attr,
+ const BHWC *dst_shape)
+{
+ const int dst_depth = DivideRoundUp(attr.weights.shape.o, 4);
+ const int src_depth = DivideRoundUp(attr.weights.shape.i, 4);
+ ConvPowerVR::ConvParams params =
+ GuessBestParams(device_info, definition, src_depth, dst_depth, true, true, true, dst_shape);
+ params.block_size.x *= params.block_size.y;
+ params.block_size.y = 1;
+ return params;
+}
+
+ConvPowerVR CreateConvPowerVR(const DeviceInfo &device_info, const OperationDef &definition,
+ const Convolution2DAttributes &attr, const BHWC *dst_shape)
+{
+ ConvPowerVR result(definition, attr, device_info, dst_shape);
+ result.GenerateCode(device_info);
+ result.UploadData(attr.weights, attr.bias);
+ return result;
+}
+
+ConvPowerVR CreateConvPowerVR(const DeviceInfo &device_info, const OperationDef &definition,
+ const FullyConnectedAttributes &attr, const BHWC *dst_shape)
+{
+ ConvPowerVR result(definition, attr, device_info, dst_shape);
+ result.GenerateCode(device_info);
+ result.UploadData(attr.weights, attr.bias);
+ return result;
+}
+
+ConvPowerVR CreateConvPowerVRDynamicWeights(const DeviceInfo &device_info,
+ const OperationDef &definition,
+ const Convolution2DAttributes &attr,
+ const BHWC &weights_shape, const BHWC *dst_shape)
+{
+ ConvPowerVR result(definition, attr, weights_shape, device_info, dst_shape);
+ result.GenerateCode(device_info);
+ result.UploadBias(attr.bias);
+ return result;
+}
+
+ConvPowerVR CreateConvPowerVRWino4x4To6x6(const DeviceInfo &device_info,
+ const OperationDef &definition,
+ const Convolution2DAttributes &attr,
+ const BHWC *dst_shape)
+{
+ ConvPowerVR result(definition);
+ result.conv_params_ = result.GuessBestParamsWinograd(device_info, definition, attr, dst_shape);
+ result.GenerateCode(device_info);
+ result.UploadDataForWinograd4x4To6x6(attr.weights);
+ return result;
+}
+
+ConvPowerVR CreateConvPowerVR3D(const DeviceInfo &device_info, const OperationDef &definition,
+ const Convolution3DAttributes &attr, const BHWDC *dst_shape)
+{
+ ConvPowerVR result(definition, attr, device_info, dst_shape);
+ result.GenerateCode(device_info);
+ result.UploadWeights(attr.weights);
+ result.UploadBias(attr.bias);
+ return result;
+}
+
+} // namespace gpu_cl
+} // namespace backend
+} // namespace onert
diff --git a/runtime/onert/backend/gpu_cl/open_cl/kernels/ConvPowervr.h b/runtime/onert/backend/gpu_cl/open_cl/kernels/ConvPowervr.h
new file mode 100644
index 000000000..f83f05730
--- /dev/null
+++ b/runtime/onert/backend/gpu_cl/open_cl/kernels/ConvPowervr.h
@@ -0,0 +1,413 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __ONERT_BACKEND_GPU_CL_OPENCL_KERNELS_CONV_POWERVR_H__
+#define __ONERT_BACKEND_GPU_CL_OPENCL_KERNELS_CONV_POWERVR_H__
+
+#include <cstring>
+#include <vector>
+
+#include "open_cl/Buffer.h"
+#include "open_cl/ClDevice.h"
+#include "open_cl/kernels/ConvCommon.h"
+#include "open_cl/kernels/GpuOperation.h"
+#include "open_cl/kernels/Util.h"
+#include "open_cl/LinearStorage.h"
+#include "open_cl/Tensor.h"
+#include "open_cl/Texture2d.h"
+#include "open_cl/Util.h"
+#include "open_cl/DataType.h"
+#include "open_cl/Operations.h"
+#include "open_cl/Shape.h"
+#include "open_cl/Status.h"
+#include "open_cl/Tensor.h"
+#include "open_cl/Types.h"
+#include "open_cl/WinogradUtil.h"
+
+namespace onert
+{
+namespace backend
+{
+namespace gpu_cl
+{
+
+class ConvPowerVR : public GPUOperation
+{
+public:
+ ConvPowerVR() = default;
+ void GetPossibleKernelWorkGroups(TuningType tuning_type, const DeviceInfo &device_info,
+ const KernelInfo &kernel_info,
+ std::vector<int3> *work_groups) const override;
+ absl::Status BindArguments(ArgumentsBinder *args) override;
+ int3 GetGridSize() const override;
+
+ ConvWeightsDescription GetConvWeightsDescription() const
+ {
+ ConvWeightsDescription desc;
+ desc.layout = ConvWeightsLayout::kOHWIOGroupI4O4;
+ desc.output_group_size = conv_params_.block_size.w;
+ return desc;
+ }
+
+ // Move only
+ ConvPowerVR(ConvPowerVR &&operation);
+ ConvPowerVR &operator=(ConvPowerVR &&operation);
+ ConvPowerVR(const ConvPowerVR &) = delete;
+ ConvPowerVR &operator=(const ConvPowerVR &) = delete;
+
+private:
+ enum class WeightsUploadType
+ {
+ LOCAL_MEM_ASYNC_SUBGROUP, // we use it for PowerVR with workgroup size = 32
+ LOCAL_MEM_BY_THREADS,
+ GLOBAL_MEM,
+ CONSTANT_MEM,
+ PRIVATE_MEM_SIMD_BROADCAST,
+ TEXTURES_MEM_X4, // 4 textures for weights
+ };
+
+ struct ConvParams
+ {
+ // Usually we use these combinations for CalculationsPrecision:
+ // F32: all F32
+ // F16: all F16
+ // F32_F16: everything except the accumulator is F16, including weights
+ // But on PowerVR we can get better performance in F32_F16 with F32 weights,
+ // so for PowerVR this kernel uses F32 weights in F32_F16 precision mode.
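+ // Summarizing the selection logic in GuessBestParams / DeduceDataTypeFromPrecision:
+ //   F32     -> FLOAT32 weights
+ //   F16     -> FLOAT16 weights
+ //   F32_F16 -> FLOAT16 weights on most GPUs, FLOAT32 weights on PowerVR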
+ DataType weights_data_type; // used for weights and biases
+ int4 block_size; // WHDS
+ bool fixed_work_group_size;
+ bool linear_spatial; // spatial dimensions are Width/Height/Depth
+ bool different_weights_for_height;
+ int src_depth_loop_size;
+ WeightsUploadType weights_upload_type;
+ bool x_kernel_is_1;
+ bool y_kernel_is_1;
+ bool z_kernel_is_1;
+
+ // used only with PRIVATE_MEM_SIMD_BROADCAST
+ int simd_size = 1;
+
+ bool AreWeightsBuffer() const
+ {
+ return weights_upload_type != WeightsUploadType::TEXTURES_MEM_X4;
+ }
+
+ bool IsPrivateMemBroadcast() const
+ {
+ return weights_upload_type == WeightsUploadType::PRIVATE_MEM_SIMD_BROADCAST;
+ }
+ };
+
+ ConvPowerVR(const OperationDef &definition, const Convolution2DAttributes &attr,
+ const DeviceInfo &device_info, const BHWC *dst_shape = nullptr);
+ ConvPowerVR(const OperationDef &definition, const Convolution2DAttributes &attr,
+ const BHWC &weights_shape, const DeviceInfo &device_info,
+ const BHWC *dst_shape = nullptr);
+ ConvPowerVR(const OperationDef &definition, const FullyConnectedAttributes &attr,
+ const DeviceInfo &device_info, const BHWC *dst_shape = nullptr);
+ explicit ConvPowerVR(const OperationDef &definition);
+ ConvPowerVR(const OperationDef &definition, const Convolution3DAttributes &attr,
+ const DeviceInfo &device_info, const BHWDC *dst_shape = nullptr);
+
+ void GenerateCode(const DeviceInfo &device_info);
+
+ template <DataType T>
+ void UploadData(const InternalTensor<OHWI, T> &weights, const InternalTensor<Linear, T> &biases);
+ template <DataType T> void UploadDataForWinograd4x4To6x6(const InternalTensor<OHWI, T> &weights);
+
+ template <DataType T> void UploadWeights(const InternalTensor<OHWI, T> &weights);
+
+ template <DataType T> void UploadWeights(const InternalTensor<OHWDI, T> &weights);
+
+ template <DataType T> void UploadBias(const InternalTensor<Linear, T> &bias);
+
+ friend ConvPowerVR CreateConvPowerVR(const DeviceInfo &device_info,
+ const OperationDef &definition,
+ const Convolution2DAttributes &attr, const BHWC *dst_shape);
+
+ friend ConvPowerVR CreateConvPowerVR(const DeviceInfo &device_info,
+ const OperationDef &definition,
+ const FullyConnectedAttributes &attr, const BHWC *dst_shape);
+
+ friend ConvPowerVR CreateConvPowerVRDynamicWeights(const DeviceInfo &device_info,
+ const OperationDef &definition,
+ const Convolution2DAttributes &attr,
+ const BHWC &weights_shape,
+ const BHWC *dst_shape);
+
+ friend ConvPowerVR CreateConvPowerVRWino4x4To6x6(const DeviceInfo &device_info,
+ const OperationDef &definition,
+ const Convolution2DAttributes &attr,
+ const BHWC *dst_shape);
+
+ friend ConvPowerVR CreateConvPowerVR3D(const DeviceInfo &device_info,
+ const OperationDef &definition,
+ const Convolution3DAttributes &attr,
+ const BHWDC *dst_shape);
+
+ ConvParams GuessBestParams(const DeviceInfo &device_info, const OperationDef &definition,
+ const Convolution2DAttributes &attr, const BHWC *dst_shape = nullptr);
+ ConvParams GuessBestParams(const DeviceInfo &device_info, const OperationDef &definition,
+ const Convolution2DAttributes &attr, const BHWC &weights_shape,
+ const BHWC *dst_shape = nullptr);
+ ConvParams GuessBestParams(const DeviceInfo &device_info, const OperationDef &definition,
+ const FullyConnectedAttributes &attr, const BHWC *dst_shape = nullptr);
+ ConvParams GuessBestParamsWinograd(const DeviceInfo &device_info, const OperationDef &definition,
+ const Convolution2DAttributes &attr,
+ const BHWC *dst_shape = nullptr);
+ ConvParams GuessBestParams(const DeviceInfo &device_info, const OperationDef &definition,
+ const Convolution3DAttributes &attr, const BHWDC *dst_shape = nullptr);
+ ConvParams GuessBestParams(const DeviceInfo &device_info, const OperationDef &definition,
+ int src_depth, int dst_depth, bool x_kernel_is_1, bool y_kernel_is_1,
+ bool different_weights_for_height, const BHWC *dst_shape = nullptr);
+
+ std::string GenerateConv(const DeviceInfo &device_info, const OperationDef &op_def,
+ bool stride_correction, const ConvParams &conv_params);
+
+ int4 stride_;
+ int4 padding_;
+ int4 kernel_size_;
+ int4 dilation_;
+ ConvParams conv_params_;
+};
+
+template <DataType T>
+void ConvPowerVR::UploadData(const InternalTensor<OHWI, T> &weights,
+ const InternalTensor<Linear, T> &biases)
+{
+ UploadWeights(weights);
+ UploadBias(biases);
+}
+
+template <DataType T>
+void ConvPowerVR::UploadDataForWinograd4x4To6x6(const InternalTensor<OHWI, T> &weights)
+{
+ InternalTensor<OHWI, T> wino_weights;
+ RearrangeWeightsToWinograd4x4To6x6Weights(weights, &wino_weights);
+ UploadWeights(wino_weights);
+ InternalTensor<Linear, DataType::FLOAT32> biases;
+ biases.shape = Linear(weights.shape.o);
+ biases.data.resize(weights.shape.o, 0.0f);
+ UploadBias(biases);
+}
+
+template <DataType T> void ConvPowerVR::UploadBias(const InternalTensor<Linear, T> &bias)
+{
+ BufferDescriptor desc;
+ desc.element_type = conv_params_.weights_data_type;
+ desc.element_size = 4;
+ desc.memory_type =
+ conv_params_.weights_upload_type == ConvPowerVR::WeightsUploadType::CONSTANT_MEM
+ ? MemoryType::CONSTANT
+ : MemoryType::GLOBAL;
+ const int float_size = sizeof(float);
+ // TODO: use sizeof(half) once FLOAT16 bias upload is supported, i.e.
+ // conv_params_.weights_data_type == DataType::FLOAT32 ? sizeof(float) : sizeof(half);
+ int aligned_channels = AlignByN(bias.shape.v, 4 * conv_params_.block_size.w);
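+ // For example, 10 bias values with block_size.w == 4 are padded to
+ // AlignByN(10, 16) == 16 elements; the tail is zero-filled below.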
+ desc.size = float_size * aligned_channels;
+ desc.data.resize(desc.size);
+ if (conv_params_.weights_data_type == DataType::FLOAT32)
+ {
+ float *gpu_data = reinterpret_cast<float *>(desc.data.data());
+ for (int i = 0; i < aligned_channels; ++i)
+ {
+ gpu_data[i] = i < bias.shape.v ? bias.data[i] : 0.0f;
+ }
+ }
+ // else
+ // {
+ // half *gpu_data = reinterpret_cast<half *>(desc.data.data());
+ // for (int i = 0; i < aligned_channels; ++i)
+ // {
+ // gpu_data[i] = i < bias.shape.v ? bias.data[i] : 0.0f;
+ // }
+ // }
+ args_.AddObject("biases", absl::make_unique<BufferDescriptor>(std::move(desc)));
+}
+
+template <DataType T> void ConvPowerVR::UploadWeights(const InternalTensor<OHWI, T> &weights)
+{
+ const int dst_slices = AlignByN(DivideRoundUp(weights.shape.o, 4), conv_params_.block_size.w);
+ const int src_slices = DivideRoundUp(weights.shape.i, 4);
+
+ const bool f32_weights = conv_params_.weights_data_type == DataType::FLOAT32;
+ const int float4_size = sizeof(float4);
+ // TODO: use sizeof(half4) once FLOAT16 weights are supported, i.e.
+ // f32_weights ? sizeof(float4) : sizeof(half4);
+
+ const int elements_count = weights.shape.h * weights.shape.w * src_slices * dst_slices * 4;
+
+ std::vector<uint8_t> data(float4_size * elements_count);
+
+ if (f32_weights)
+ {
+ float4 *ptr = reinterpret_cast<float4 *>(data.data());
+ if (conv_params_.AreWeightsBuffer())
+ {
+ RearrangeWeightsToOHWIOGroupI4O4(weights, conv_params_.block_size.w,
+ absl::MakeSpan(ptr, elements_count));
+ }
+ else
+ {
+ RearrangeWeightsToI4HWIOOGroupO4(weights, conv_params_.block_size.w,
+ absl::MakeSpan(ptr, elements_count));
+ }
+ }
+ // else
+ // {
+ // half4 *ptr = reinterpret_cast<half4 *>(data.data());
+ // if (conv_params_.AreWeightsBuffer())
+ // {
+ // RearrangeWeightsToOHWIOGroupI4O4(weights, conv_params_.block_size.w,
+ // absl::MakeSpan(ptr, elements_count));
+ // }
+ // else
+ // {
+ // RearrangeWeightsToI4HWIOOGroupO4(weights, conv_params_.block_size.w,
+ // absl::MakeSpan(ptr, elements_count));
+ // }
+ // }
+ if (conv_params_.AreWeightsBuffer())
+ {
+ BufferDescriptor desc;
+ desc.element_type = conv_params_.weights_data_type;
+ desc.element_size = 4;
+ desc.memory_type =
+ conv_params_.weights_upload_type == ConvPowerVR::WeightsUploadType::CONSTANT_MEM
+ ? MemoryType::CONSTANT
+ : MemoryType::GLOBAL;
+ desc.size = float4_size * elements_count;
+ desc.data = std::move(data);
+ args_.AddObject("weights", absl::make_unique<BufferDescriptor>(std::move(desc)));
+ }
+ else
+ {
+ const int texture_width = dst_slices;
+ const int texture_height = src_slices * weights.shape.h * weights.shape.w;
+ const int sub_size = float4_size * texture_width * texture_height;
+ for (int i = 0; i < 4; ++i)
+ {
+ Texture2DDescriptor desc;
+ desc.element_type = conv_params_.weights_data_type;
+ desc.size = int2(texture_width, texture_height);
+ desc.data.resize(sub_size);
+ std::memcpy(desc.data.data(), data.data() + sub_size * i, sub_size);
+ const std::string name = "weights" + std::to_string(i);
+ args_.AddObject(name, absl::make_unique<Texture2DDescriptor>(std::move(desc)));
+ }
+ }
+}
+
+template <DataType T> void ConvPowerVR::UploadWeights(const InternalTensor<OHWDI, T> &weights)
+{
+ const int block_size = conv_params_.block_size.w;
+ const int dst_slices = AlignByN(DivideRoundUp(weights.shape.o, 4), block_size);
+ const int src_slices = DivideRoundUp(weights.shape.i, 4);
+
+ const int elements_count =
+ weights.shape.d * weights.shape.h * weights.shape.w * src_slices * dst_slices * 4;
+ const bool f32_weights = definition_.precision == CalculationsPrecision::F32;
+
+ const int float4_size = f32_weights ? 16 : 8;
+
+ std::vector<uint8_t> data(float4_size * elements_count);
+
+ if (f32_weights)
+ {
+ float4 *ptr = reinterpret_cast<float4 *>(data.data());
+ if (conv_params_.AreWeightsBuffer())
+ {
+ RearrangeWeightsToODHWIOGroupI4O4(weights, conv_params_.block_size.w,
+ absl::MakeSpan(ptr, elements_count));
+ }
+ else
+ {
+ RearrangeWeightsToI4DHWIOOGroupO4(weights, conv_params_.block_size.w,
+ absl::MakeSpan(ptr, elements_count));
+ }
+ }
+ // else
+ // {
+ // half4 *ptr = reinterpret_cast<half4 *>(data.data());
+ // if (conv_params_.AreWeightsBuffer())
+ // {
+ // RearrangeWeightsToODHWIOGroupI4O4(weights, conv_params_.block_size.w,
+ // absl::MakeSpan(ptr, elements_count));
+ // }
+ // else
+ // {
+ // RearrangeWeightsToI4DHWIOOGroupO4(weights, conv_params_.block_size.w,
+ // absl::MakeSpan(ptr, elements_count));
+ // }
+ // }
+
+ if (conv_params_.AreWeightsBuffer())
+ {
+ BufferDescriptor desc;
+ desc.element_type = f32_weights ? DataType::FLOAT32 : DataType::FLOAT16;
+ desc.element_size = 4;
+ desc.size = float4_size * elements_count;
+ desc.data = std::move(data);
+ args_.AddObject("weights", absl::make_unique<BufferDescriptor>(std::move(desc)));
+ }
+ else
+ {
+ const int texture_width = dst_slices;
+ const int texture_height = src_slices * weights.shape.d * weights.shape.h * weights.shape.w;
+ int sub_size = float4_size * texture_width * texture_height;
+ for (int i = 0; i < 4; ++i)
+ {
+ Texture2DDescriptor desc;
+ desc.element_type = f32_weights ? DataType::FLOAT32 : DataType::FLOAT16;
+ desc.size = int2(texture_width, texture_height);
+ desc.data.resize(sub_size);
+ memcpy(desc.data.data(), data.data() + sub_size * i, sub_size);
+ const std::string name = "weights" + std::to_string(i);
+ args_.AddObject(name, absl::make_unique<Texture2DDescriptor>(std::move(desc)));
+ }
+ }
+}
+
+ConvPowerVR CreateConvPowerVR(const DeviceInfo &device_info, const OperationDef &definition,
+ const Convolution2DAttributes &attr, const BHWC *dst_shape = nullptr);
+
+ConvPowerVR CreateConvPowerVR(const DeviceInfo &device_info, const OperationDef &definition,
+ const FullyConnectedAttributes &attr,
+ const BHWC *dst_shape = nullptr);
+
+ConvPowerVR CreateConvPowerVRDynamicWeights(const DeviceInfo &device_info,
+ const OperationDef &definition,
+ const Convolution2DAttributes &attr,
+ const BHWC &weights_shape,
+ const BHWC *dst_shape = nullptr);
+
+ConvPowerVR CreateConvPowerVRWino4x4To6x6(const DeviceInfo &device_info,
+ const OperationDef &definition,
+ const Convolution2DAttributes &attr,
+ const BHWC *dst_shape = nullptr);
+
+ConvPowerVR CreateConvPowerVR3D(const DeviceInfo &device_info, const OperationDef &definition,
+ const Convolution3DAttributes &attr,
+ const BHWDC *dst_shape = nullptr);
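+// Typical creation flow (illustrative sketch; the op_def/attr objects come from the
+// caller):
+//   ConvPowerVR conv = CreateConvPowerVR(device_info, op_def, attr, &dst_shape);
+// Each factory picks ConvParams via GuessBestParams*, generates the OpenCL source
+// with GenerateCode() and uploads the weights/biases before returning.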
+
+} // namespace gpu_cl
+} // namespace backend
+} // namespace onert
+
+#endif // __ONERT_BACKEND_GPU_CL_OPENCL_KERNELS_CONV_POWERVR_H__
diff --git a/runtime/onert/backend/gpu_cl/open_cl/kernels/ConvWeightsConverter.cc b/runtime/onert/backend/gpu_cl/open_cl/kernels/ConvWeightsConverter.cc
new file mode 100644
index 000000000..95172bd05
--- /dev/null
+++ b/runtime/onert/backend/gpu_cl/open_cl/kernels/ConvWeightsConverter.cc
@@ -0,0 +1,143 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "open_cl/kernels/ConvWeightsConverter.h"
+
+#include <string>
+
+#include "open_cl/kernels/Util.h"
+#include "open_cl/kernels/WorkGroupPicking.h"
+
+namespace onert
+{
+namespace backend
+{
+namespace gpu_cl
+{
+
+ConverterToConvWeights::ConverterToConvWeights(const OperationDef &definition,
+ const ConvWeightsDescription &conv_weights_desc)
+ : GPUOperation(definition), conv_weights_desc_(conv_weights_desc)
+{
+ code_ = GetConverterToConvWeightsCode(definition_, conv_weights_desc_);
+}
+
+ConverterToConvWeights::ConverterToConvWeights(ConverterToConvWeights &&operation)
+ : GPUOperation(std::move(operation)), conv_weights_desc_(operation.conv_weights_desc_)
+{
+}
+
+ConverterToConvWeights &ConverterToConvWeights::operator=(ConverterToConvWeights &&operation)
+{
+ if (this != &operation)
+ {
+ conv_weights_desc_ = operation.conv_weights_desc_;
+ GPUOperation::operator=(std::move(operation));
+ }
+ return *this;
+}
+
+std::string ConverterToConvWeights::GetConverterToConvWeightsCode(
+ const OperationDef &op_def, const ConvWeightsDescription &conv_weights_desc)
+{
+ AddSrcTensor("src_tensor", op_def.src_tensors[0]);
+ AddDstTensor("dst_tensor", op_def.dst_tensors[0]);
+ args_.AddFloat("mask_x");
+ args_.AddFloat("mask_y");
+ args_.AddFloat("mask_z");
+ args_.AddFloat("mask_w");
+
+ std::string c = GetCommonDefines(op_def.precision);
+ c += "__kernel void main_function(\n";
+ c += "$0) {\n";
+ c += " int GROUP_SIZE = " + std::to_string(conv_weights_desc.output_group_size) + ";\n";
+ c += " int O = get_global_id(0) * 4;\n";
+ c += " int I = get_global_id(1);\n";
+ c += " int Z = get_global_id(2);\n";
+ c += " int W = Z % args.src_tensor.Width();\n";
+ c += " int H = Z / args.src_tensor.Width();\n";
+ c += " if (O >= args.src_tensor.Batch() || I >= args.src_tensor.Slices() || "
+ "H >= args.src_tensor.Height()) return;\n";
+ c += " FLT4 v0 = args.src_tensor.Read(W, H, I, O + 0);\n";
+ c += " FLT4 v1 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);\n";
+ c += " FLT4 v2 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);\n";
+ c += " FLT4 v3 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);\n";
+ c += " if (O + 1 < args.src_tensor.Batch()) {\n";
+ c += " v1 = args.src_tensor.Read(W, H, I, O + 1);\n";
+ c += " }\n";
+ c += " if (O + 2 < args.src_tensor.Batch()) {\n";
+ c += " v2 = args.src_tensor.Read(W, H, I, O + 2);\n";
+ c += " }\n";
+ c += " if (O + 3 < args.src_tensor.Batch()) {\n";
+ c += " v3 = args.src_tensor.Read(W, H, I, O + 3);\n";
+ c += " }\n";
+ c += " if (I == args.src_tensor.Slices() - 1) {\n";
+ c += " FLT4 mask = (FLT4)(args.mask_x, args.mask_y, args.mask_z, "
+ "args.mask_w);\n";
+ c += " v0 *= mask;\n";
+ c += " v1 *= mask;\n";
+ c += " v2 *= mask;\n";
+ c += " v3 *= mask;\n";
+ c += " }\n";
+ c += " FLT4 r0 = (FLT4)(v0.x, v1.x, v2.x, v3.x);\n";
+ c += " FLT4 r1 = (FLT4)(v0.y, v1.y, v2.y, v3.y);\n";
+ c += " FLT4 r2 = (FLT4)(v0.z, v1.z, v2.z, v3.z);\n";
+ c += " FLT4 r3 = (FLT4)(v0.w, v1.w, v2.w, v3.w);\n";
+ c += " int d_index = O / (GROUP_SIZE * 4);\n";
+ c += " int k_index = (O % (GROUP_SIZE * 4)) / 4;\n";
+ c += " int dst_offset = (((d_index * args.src_tensor.Height() + H) * "
+ "args.src_tensor.Width() + W) * "
+ "args.src_tensor.Slices() + I) * GROUP_SIZE + "
+ "k_index;\n";
+ c += " int address0 = dst_offset * 4 + 0;\n";
+ c += " int address1 = dst_offset * 4 + 1;\n";
+ c += " int address2 = dst_offset * 4 + 2;\n";
+ c += " int address3 = dst_offset * 4 + 3;\n";
+ c += " args.dst_tensor.WriteLinear(r0, dst_offset * 4 + 0)\n;";
+ c += " args.dst_tensor.WriteLinear(r1, dst_offset * 4 + 1)\n;";
+ c += " args.dst_tensor.WriteLinear(r2, dst_offset * 4 + 2)\n;";
+ c += " args.dst_tensor.WriteLinear(r3, dst_offset * 4 + 3)\n;";
+ c += "}\n";
+ return c;
+}
+
+absl::Status ConverterToConvWeights::BindArguments(ArgumentsBinder *args)
+{
+ float4 mask = GetMaskForLastPlane(src_[0]->Channels());
+ RETURN_IF_ERROR(args->SetFloat("mask_x", mask.x));
+ RETURN_IF_ERROR(args->SetFloat("mask_y", mask.y));
+ RETURN_IF_ERROR(args->SetFloat("mask_z", mask.z));
+ return args->SetFloat("mask_w", mask.w);
+}
+
+int3 ConverterToConvWeights::GetGridSize() const
+{
+ const int grid_x =
+ DivideRoundUp(AlignByN(src_[0]->Batch(), 4 * conv_weights_desc_.output_group_size), 4);
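+ // For example, with 30 output channels (src Batch) and output_group_size == 4:
+ // AlignByN(30, 16) == 32, so grid_x == 8.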
+ const int grid_y = src_[0]->Slices();
+ const int grid_z = src_[0]->Width() * src_[0]->Height();
+ return int3(grid_x, grid_y, grid_z);
+}
+
+ConverterToConvWeights CreateConverterToConvWeights(const OperationDef &definition,
+ const ConvWeightsDescription &conv_weights_desc)
+{
+ return ConverterToConvWeights(definition, conv_weights_desc);
+}
+
+} // namespace gpu_cl
+} // namespace backend
+} // namespace onert
diff --git a/runtime/onert/backend/gpu_cl/open_cl/kernels/ConvWeightsConverter.h b/runtime/onert/backend/gpu_cl/open_cl/kernels/ConvWeightsConverter.h
new file mode 100644
index 000000000..bb68977eb
--- /dev/null
+++ b/runtime/onert/backend/gpu_cl/open_cl/kernels/ConvWeightsConverter.h
@@ -0,0 +1,68 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __ONERT_BACKEND_GPU_CL_OPENCL_KERNELS_CONV_WEIGHTS_CONVERTER_H__
+#define __ONERT_BACKEND_GPU_CL_OPENCL_KERNELS_CONV_WEIGHTS_CONVERTER_H__
+
+#include "open_cl/ClCommandQueue.h"
+#include "open_cl/ClKernel.h"
+#include "open_cl/kernels/ConvCommon.h"
+#include "open_cl/kernels/GpuOperation.h"
+#include "open_cl/Status.h"
+#include "open_cl/Types.h"
+
+namespace onert
+{
+namespace backend
+{
+namespace gpu_cl
+{
+
+class ConverterToConvWeights : public GPUOperation
+{
+public:
+ ConverterToConvWeights(const OperationDef &definition,
+ const ConvWeightsDescription &conv_weights_desc);
+ absl::Status BindArguments(ArgumentsBinder *args) override;
+ int3 GetGridSize() const override;
+
+ // Move only
+ ConverterToConvWeights(ConverterToConvWeights &&operation);
+ ConverterToConvWeights &operator=(ConverterToConvWeights &&operation);
+ ConverterToConvWeights(const ConverterToConvWeights &) = delete;
+ ConverterToConvWeights &operator=(const ConverterToConvWeights &) = delete;
+
+private:
+ std::string GetConverterToConvWeightsCode(const OperationDef &op_def,
+ const ConvWeightsDescription &conv_weights_desc);
+
+ ConvWeightsDescription conv_weights_desc_;
+};
+
+// We expect a src BHWC tensor and assume that B is O, H is H, W is W, C is I.
+// As dst we expect a Tensor with storage type BUFFER and
+// dst.b * dst.h * dst.w * dst.c = AlignByN(src.b, 4) * src.h * src.w *
+// AlignByN(src.c, 4).
+ConverterToConvWeights
+CreateConverterToConvWeights(const OperationDef &definition,
+ const ConvWeightsDescription &conv_weights_desc);
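+// Typically paired with ConvPowerVR::GetConvWeightsDescription(), roughly:
+//   auto converter = CreateConverterToConvWeights(op_def, conv.GetConvWeightsDescription());
+// (sketch only; op_def and conv are assumed to exist in the calling code)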
+
+} // namespace gpu_cl
+} // namespace backend
+} // namespace onert
+
+#endif // __ONERT_BACKEND_GPU_CL_OPENCL_KERNELS_CONV_WEIGHTS_CONVERTER_H__
diff --git a/runtime/onert/backend/gpu_cl/open_cl/kernels/Converter.cc b/runtime/onert/backend/gpu_cl/open_cl/kernels/Converter.cc
new file mode 100644
index 000000000..cc2bc41d4
--- /dev/null
+++ b/runtime/onert/backend/gpu_cl/open_cl/kernels/Converter.cc
@@ -0,0 +1,592 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Converter.h"
+
+#include <algorithm>
+#include <array>
+#include <string>
+
+#include "open_cl/Arguments.h"
+#include "open_cl/ClCommandQueue.h"
+#include "open_cl/ClErrors.h"
+#include "open_cl/kernels/Util.h"
+#include "open_cl/Precision.h"
+#include "open_cl/InternalTensor.h"
+#include "open_cl/TensorType.h"
+#include "open_cl/TensorTypeUtil.h"
+#include "open_cl/Util.h"
+
+namespace onert
+{
+namespace backend
+{
+namespace gpu_cl
+{
+namespace
+{
+
+class OpenClConverterImpl : public TensorObjectConverter
+{
+public:
+ virtual absl::Status Init(const TensorObjectDef &input_def, const TensorObjectDef &output_def,
+ Environment *environment) = 0;
+
+protected:
+ absl::Status DispatchKernel(cl_mem buffer_mem, Tensor *tensor)
+ {
+ kernel_.ResetBindingCounter();
+ RETURN_IF_ERROR(kernel_.SetMemoryAuto(buffer_mem));
+ RETURN_IF_ERROR(args_.SetObjectRef("tensor", tensor));
+ RETURN_IF_ERROR(args_.Bind(kernel_.kernel(), kernel_.GetBindingCounter()));
+ const int3 grid = int3(tensor->Width() * tensor->Batch(), tensor->Height(), tensor->Slices());
+ const int3 work_group_size = {16, 8, 1};
+ const int3 work_groups_count = GetWorkGroupsCount(grid, work_group_size);
+ return queue_->Dispatch(kernel_, work_groups_count, work_group_size);
+ }
+
+ Arguments args_;
+ BHWC shape_;
+ CLKernel kernel_;
+ TensorDescriptor tensor_descriptor_;
+ CLCommandQueue *queue_ = nullptr;
+ const CLContext *context_ = nullptr;
+};
+
+bool IsSupportedDataType(DataType type)
+{
+ return type == DataType::FLOAT16 || type == DataType::FLOAT32;
+}
+
+bool IsBHWCOpenCLBuffer(const ObjectDef &def)
+{
+ return IsSupportedDataType(def.data_type) && def.object_type == ObjectType::OPENCL_BUFFER &&
+ def.data_layout == DataLayout::BHWC;
+}
+
+bool IsOpenCLTensor(const ObjectDef &def)
+{
+ const bool is_buffer_tensor =
+ def.object_type == ObjectType::OPENCL_BUFFER && def.data_layout == DataLayout::DHWC4;
+ const bool is_image2d_tensor =
+ def.object_type == ObjectType::OPENCL_TEXTURE && def.data_layout == DataLayout::HDWC4;
+ const bool is_image2d_array_tensor =
+ def.object_type == ObjectType::OPENCL_TEXTURE && def.data_layout == DataLayout::DHWC4;
+ const bool is_single_image_tensor =
+ def.object_type == ObjectType::OPENCL_TEXTURE && def.data_layout == DataLayout::BHWC;
+ return IsSupportedDataType(def.data_type) && (is_buffer_tensor || is_image2d_tensor ||
+ is_image2d_array_tensor || is_single_image_tensor);
+}
+
+absl::Status GetOpenCLMemory(const TensorObject &obj, cl_mem *memory)
+{
+ auto texture = absl::get_if<OpenClTexture>(&obj);
+ auto buffer = absl::get_if<OpenClBuffer>(&obj);
+ if (texture && texture->memobj)
+ {
+ *memory = texture->memobj;
+ }
+ else if (buffer && buffer->memobj)
+ {
+ *memory = buffer->memobj;
+ }
+ else
+ {
+ return absl::InvalidArgumentError("Missing OpenCL object.");
+ }
+ return absl::OkStatus();
+}
+
+// Implements conversion from OpenCL tensor to another OpenCL tensor.
+class TensorToTensorConverter : public OpenClConverterImpl
+{
+public:
+ static bool IsSupported(const ObjectDef &input, const ObjectDef &output)
+ {
+ return IsOpenCLTensor(input) && IsOpenCLTensor(output);
+ }
+
+ absl::Status Init(const TensorObjectDef &input_def, const TensorObjectDef &output_def,
+ Environment *environment) final
+ {
+ src_tensor_descriptor_.layout = Layout::BHWC;
+ src_tensor_descriptor_.storage_type =
+ ToTensorStorageType(input_def.object_def.object_type, input_def.object_def.data_layout);
+ src_tensor_descriptor_.data_type = input_def.object_def.data_type;
+ args_.AddObjectRef("src_tensor", AccessType::READ,
+ absl::make_unique<TensorDescriptor>(src_tensor_descriptor_));
+
+ dst_tensor_descriptor_.layout = Layout::BHWC;
+ dst_tensor_descriptor_.storage_type =
+ ToTensorStorageType(output_def.object_def.object_type, output_def.object_def.data_layout);
+ dst_tensor_descriptor_.data_type = output_def.object_def.data_type;
+ args_.AddObjectRef("dst_tensor", AccessType::WRITE,
+ absl::make_unique<TensorDescriptor>(dst_tensor_descriptor_));
+
+ const bool need_fp16_support = input_def.object_def.data_type == DataType::FLOAT16 ||
+ output_def.object_def.data_type == DataType::FLOAT16;
+ const std::string out_data_type = ToCLDataType(output_def.object_def.data_type);
+ std::string shader_src;
+ if (need_fp16_support)
+ {
+ shader_src += "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n";
+ }
+ shader_src +=
+ R"(__kernel void tensor_to_tensor($0) {
+ int linear_id = get_global_id(0);
+ int x = linear_id / args.dst_tensor.Batch();
+ int b = linear_id % args.dst_tensor.Batch();
+ int y = get_global_id(1);
+ int d = get_global_id(2);
+ if (x >= args.dst_tensor.Width() || y >= args.dst_tensor.Height() || d >= args.dst_tensor.Slices()) return;
+)";
+ shader_src +=
+ " " + out_data_type + "4 input = args.src_tensor.Read<" + out_data_type + ">(x, y, d, b);\n";
+ shader_src += " args.dst_tensor.Write(input, x, y, d, b);\n}";
+ queue_ = environment->queue();
+ context_ = &environment->context();
+ shape_ = BHWC(input_def.dimensions.b, input_def.dimensions.h, input_def.dimensions.w,
+ input_def.dimensions.c);
+ RETURN_IF_ERROR(args_.TransformToCLCode(environment->device().info_, {}, &shader_src));
+ return environment->program_cache()->GetOrCreateCLKernel(
+ shader_src, "tensor_to_tensor", environment->context(), environment->device(), &kernel_);
+ }
+
+ absl::Status Convert(const TensorObject &input_obj, const TensorObject &output_obj) override
+ {
+ cl_mem in_memory = nullptr;
+ RETURN_IF_ERROR(GetOpenCLMemory(input_obj, &in_memory));
+ cl_mem out_memory = nullptr;
+ RETURN_IF_ERROR(GetOpenCLMemory(output_obj, &out_memory));
+
+ Tensor src_tensor;
+ RETURN_IF_ERROR(
+ CreateSharedTensor(*context_, in_memory, shape_, src_tensor_descriptor_, &src_tensor));
+ Tensor dst_tensor;
+ RETURN_IF_ERROR(
+ CreateSharedTensor(*context_, out_memory, shape_, dst_tensor_descriptor_, &dst_tensor));
+
+ RETURN_IF_ERROR(args_.SetObjectRef("src_tensor", &src_tensor));
+ RETURN_IF_ERROR(args_.SetObjectRef("dst_tensor", &dst_tensor));
+
+ RETURN_IF_ERROR(args_.Bind(kernel_.kernel()));
+ const int3 grid =
+ int3(dst_tensor.Width() * dst_tensor.Batch(), dst_tensor.Height(), dst_tensor.Slices());
+ const int3 work_group_size = {16, 8, 1};
+ const int3 work_groups_count = GetWorkGroupsCount(grid, work_group_size);
+ return queue_->Dispatch(kernel_, work_groups_count, work_group_size);
+ }
+
+private:
+ TensorDescriptor src_tensor_descriptor_;
+ TensorDescriptor dst_tensor_descriptor_;
+};
+
+// Implements conversion from OpenCL-specific tensor layout to BHWC OpenCL
+// buffer.
+class TensorToBHWCBufferConverter : public OpenClConverterImpl
+{
+public:
+ static bool IsSupported(const ObjectDef &input, const ObjectDef &output)
+ {
+ return IsOpenCLTensor(input) && IsBHWCOpenCLBuffer(output);
+ }
+
+ absl::Status Init(const TensorObjectDef &input_def, const TensorObjectDef &output_def,
+ Environment *environment) final
+ {
+ TensorStorageType src_tensor_type =
+ ToTensorStorageType(input_def.object_def.object_type, input_def.object_def.data_layout);
+ tensor_descriptor_.layout = Layout::BHWC;
+ tensor_descriptor_.storage_type = src_tensor_type;
+ tensor_descriptor_.data_type = input_def.object_def.data_type;
+ args_.AddObjectRef("tensor", AccessType::READ,
+ absl::make_unique<TensorDescriptor>(tensor_descriptor_));
+
+ const bool need_fp16_support = input_def.object_def.data_type == DataType::FLOAT16 ||
+ output_def.object_def.data_type == DataType::FLOAT16;
+ std::string shader_src;
+ if (need_fp16_support)
+ {
+ shader_src += "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n";
+ }
+ const std::string out_data_type = ToCLDataType(output_def.object_def.data_type);
+ shader_src += "__kernel void tensor_to_bhwc(";
+ shader_src += "__global " + out_data_type + "* dst, $0) {\n";
+ shader_src += R"( int linear_id = get_global_id(0);
+ int x = linear_id / args.tensor.Batch();
+ int b = linear_id % args.tensor.Batch();
+ int y = get_global_id(1);
+ int d = get_global_id(2);
+ if (x >= args.tensor.Width() || y >= args.tensor.Height() || d >= args.tensor.Slices()) return;
+)";
+ shader_src +=
+ " " + out_data_type + "4 input = args.tensor.Read<" + out_data_type + ">(x, y, d, b);\n";
+ shader_src += R"( int c = d * 4;
+ int index = ((b * args.tensor.Height() + y) * args.tensor.Width() + x) * args.tensor.Channels() + c;
+
+ dst[index] = input.x;
+ if (c + 1 < args.tensor.Channels()) {
+ dst[index + 1] = input.y;
+ }
+ if (c + 2 < args.tensor.Channels()) {
+ dst[index + 2] = input.z;
+ }
+ if (c + 3 < args.tensor.Channels()) {
+ dst[index + 3] = input.w;
+ }
+})";
+ queue_ = environment->queue();
+ context_ = &environment->context();
+ shape_ = BHWC(input_def.dimensions.b, input_def.dimensions.h, input_def.dimensions.w,
+ input_def.dimensions.c);
+ RETURN_IF_ERROR(args_.TransformToCLCode(environment->device().info_, {}, &shader_src));
+ return environment->program_cache()->GetOrCreateCLKernel(
+ shader_src, "tensor_to_bhwc", environment->context(), environment->device(), &kernel_);
+ }
+
+ absl::Status Convert(const TensorObject &input_obj, const TensorObject &output_obj) override
+ {
+ auto output = absl::get_if<OpenClBuffer>(&output_obj);
+ if (!output || !output->memobj)
+ {
+ return absl::InvalidArgumentError("Missing output in tensor_to_bhwc converter");
+ }
+
+ cl_mem in_memory = nullptr;
+ RETURN_IF_ERROR(GetOpenCLMemory(input_obj, &in_memory));
+ Tensor tensor;
+ RETURN_IF_ERROR(CreateSharedTensor(*context_, in_memory, shape_, tensor_descriptor_, &tensor));
+ return DispatchKernel(output->memobj, &tensor);
+ }
+};
+
+// Implements conversion from BHWC OpenCL buffer to OpenCL-specific tensor
+// layout.
+class BHWCBufferToTensorConverter : public OpenClConverterImpl
+{
+public:
+ static bool IsSupported(const ObjectDef &input, const ObjectDef &output)
+ {
+ return IsBHWCOpenCLBuffer(input) && IsOpenCLTensor(output);
+ }
+
+ std::pair<std::string, std::string> GetFromBhwcKernel(const TensorObjectDef &input_def,
+ const TensorObjectDef &) const
+ {
+ return std::make_pair("__global " + ToCLDataType(input_def.object_def.data_type) + "* src",
+ R"(int c = d * 4;
+ int index = ((b * args.tensor.Height() + y) * args.tensor.Width() + x) * args.tensor.Channels() + c;
+ result.x = src[index];
+ result.y = c + 1 < args.tensor.Channels() ? src[index + 1] : 1;
+ result.z = c + 2 < args.tensor.Channels() ? src[index + 2] : 2;
+ result.w = c + 3 < args.tensor.Channels() ? src[index + 3] : 3;
+)");
+ }
+
+ absl::Status Init(const TensorObjectDef &input_def, const TensorObjectDef &output_def,
+ Environment *environment) final
+ {
+ auto params_kernel = GetFromBhwcKernel(input_def, output_def);
+
+ TensorStorageType dst_tensor_type =
+ ToTensorStorageType(output_def.object_def.object_type, output_def.object_def.data_layout);
+ tensor_descriptor_.layout = Layout::BHWC;
+ tensor_descriptor_.storage_type = dst_tensor_type;
+ tensor_descriptor_.data_type = output_def.object_def.data_type;
+ args_.AddObjectRef("tensor", AccessType::WRITE,
+ absl::make_unique<TensorDescriptor>(tensor_descriptor_));
+
+ const bool need_fp16_support = input_def.object_def.data_type == DataType::FLOAT16 ||
+ output_def.object_def.data_type == DataType::FLOAT16;
+ std::string shader_src;
+ if (need_fp16_support)
+ {
+ shader_src += "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n";
+ }
+    const std::string out_data_type = ToCLDataType(output_def.object_def.data_type);
+    shader_src += "__kernel void bhwc_to_tensor(";
+    shader_src += params_kernel.first + ", $0) {\n";
+
+ shader_src += R"( int linear_id = get_global_id(0);
+ int x = linear_id / args.tensor.Batch();
+ int b = linear_id % args.tensor.Batch();
+ int y = get_global_id(1);
+ int d = get_global_id(2);
+
+ if (x >= args.tensor.Width() || y >= args.tensor.Height() || d >= args.tensor.Slices()) return;
+)";
+ shader_src += " " + out_data_type + "4 result;\n";
+ shader_src += R"( int c = d * 4;
+ int index = ((b * args.tensor.Height() + y) * args.tensor.Width() + x) * args.tensor.Channels() + c;
+ result.x = src[index];
+ result.y = c + 1 < args.tensor.Channels() ? src[index + 1] : 1;
+ result.z = c + 2 < args.tensor.Channels() ? src[index + 2] : 2;
+ result.w = c + 3 < args.tensor.Channels() ? src[index + 3] : 3;
+)";
+ shader_src += " args.tensor.Write(result, x, y, d, b);\n}";
+ queue_ = environment->queue();
+ context_ = &environment->context();
+ shape_ = BHWC(output_def.dimensions.b, output_def.dimensions.h, output_def.dimensions.w,
+ output_def.dimensions.c);
+ RETURN_IF_ERROR(args_.TransformToCLCode(environment->device().info_, {}, &shader_src));
+ return environment->program_cache()->GetOrCreateCLKernel(
+ shader_src, "bhwc_to_tensor", environment->context(), environment->device(), &kernel_);
+ }
+
+ absl::Status Convert(const TensorObject &input_obj, const TensorObject &output_obj) override
+ {
+ auto input = absl::get_if<OpenClBuffer>(&input_obj);
+ if (!input || !input->memobj)
+ {
+ return absl::InvalidArgumentError("Missing input in bhwc_to_tensor converter");
+ }
+ cl_mem out_memory = nullptr;
+ RETURN_IF_ERROR(GetOpenCLMemory(output_obj, &out_memory));
+ Tensor tensor;
+ RETURN_IF_ERROR(CreateSharedTensor(*context_, out_memory, shape_, tensor_descriptor_, &tensor));
+ return DispatchKernel(input->memobj, &tensor);
+ }
+};
+
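+// Computes the {width, height, depth} image region used by the image copy and
+// CPU read/write paths, based on the tensor's storage type.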
+std::array<size_t, 3> CalculateTextureRegion(const TensorObjectDef &def)
+{
+ const auto &dims = def.dimensions;
+ std::array<size_t, 3> region = {0, 0, 1};
+ switch (ToTensorStorageType(def.object_def.object_type, def.object_def.data_layout))
+ {
+ case TensorStorageType::SINGLE_TEXTURE_2D:
+ region[0] = static_cast<size_t>(dims.w * dims.b);
+ region[1] = static_cast<size_t>(dims.h);
+ break;
+ case TensorStorageType::TEXTURE_2D:
+ region[0] = static_cast<size_t>(dims.w * dims.b);
+ region[1] = static_cast<size_t>(dims.h * dims.d());
+ break;
+ case TensorStorageType::TEXTURE_ARRAY:
+ region[0] = static_cast<size_t>(dims.w * dims.b);
+ region[1] = static_cast<size_t>(dims.h);
+ region[2] = static_cast<size_t>(dims.d());
+ break;
+ default:
+ break;
+ }
+ return region;
+}
+
+bool IsOpenClTextureOrBuffer(ObjectType type)
+{
+ return type == ObjectType::OPENCL_BUFFER || type == ObjectType::OPENCL_TEXTURE;
+}
+
+// Copies data from one object of the same type and layout to another object.
+class TrivialCopier : public OpenClConverterImpl
+{
+public:
+ static bool IsSupported(const ObjectDef &input, const ObjectDef &output)
+ {
+ return IsOpenClTextureOrBuffer(input.object_type) && input.data_type == output.data_type &&
+ input.object_type == output.object_type && input.data_layout == output.data_layout;
+ }
+
+ absl::Status Init(const TensorObjectDef &input_def, const TensorObjectDef &output_def,
+ Environment *environment) final
+ {
+ shape_ = BHWC(input_def.dimensions.b, input_def.dimensions.h, input_def.dimensions.w,
+ input_def.dimensions.c);
+ data_type_ = input_def.object_def.data_type;
+ queue_ = environment->queue();
+ region_ = CalculateTextureRegion(output_def);
+ return absl::OkStatus();
+ }
+
+ absl::Status Convert(const TensorObject &input_obj, const TensorObject &output_obj) override
+ {
+ auto texture_input = absl::get_if<OpenClTexture>(&input_obj);
+ auto texture_output = absl::get_if<OpenClTexture>(&output_obj);
+ if (texture_input && texture_output)
+ {
+ return Copy(*texture_input, *texture_output);
+ }
+ auto buffer_input = absl::get_if<OpenClBuffer>(&input_obj);
+ auto buffer_output = absl::get_if<OpenClBuffer>(&output_obj);
+ if (buffer_input && buffer_output)
+ {
+ return Copy(*buffer_input, *buffer_output);
+ }
+ return absl::InternalError("Unexpected object");
+ }
+
+ absl::Status Copy(const OpenClBuffer &input, const OpenClBuffer &output)
+ {
+ if (input.memobj == output.memobj)
+ {
+ return absl::OkStatus();
+ }
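+    // The internal buffer layout pads channels to multiples of 4, hence AlignByN(c, 4)
+    // in the copy size.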
+ return GetOpenCLError(clEnqueueCopyBuffer(queue_->queue(), input.memobj, output.memobj, 0, 0,
+ SizeOf(data_type_) * shape_.w * shape_.h *
+ AlignByN(shape_.c, 4) * shape_.b,
+ 0, nullptr, nullptr));
+ }
+
+ absl::Status Copy(const OpenClTexture &input, const OpenClTexture &output)
+ {
+ if (input.memobj == output.memobj)
+ {
+ return absl::OkStatus();
+ }
+ size_t origin[3] = {0, 0, 0};
+ return GetOpenCLError(clEnqueueCopyImage(queue_->queue(), input.memobj, output.memobj, origin,
+ origin, region_.data(), 0, nullptr, nullptr));
+ }
+
+private:
+ DataType data_type_ = DataType::UNKNOWN;
+ std::array<size_t, 3> region_;
+};
+
+// Copies data between CPU memory and an OpenCL buffer or texture.
+class CpuCopier : public OpenClConverterImpl
+{
+public:
+ static bool IsSupported(const ObjectDef &input, const ObjectDef &output)
+ {
+ return input.data_type == output.data_type && input.data_layout == output.data_layout &&
+ ((input.object_type == ObjectType::CPU_MEMORY &&
+ IsOpenClTextureOrBuffer(output.object_type)) ||
+ (output.object_type == ObjectType::CPU_MEMORY &&
+ IsOpenClTextureOrBuffer(input.object_type)));
+ }
+
+ absl::Status Init(const TensorObjectDef &input_def, const TensorObjectDef &output_def,
+ Environment *environment) final
+ {
+ region_ = CalculateTextureRegion(
+ input_def.object_def.object_type == ObjectType::CPU_MEMORY ? output_def : input_def);
+ queue_ = environment->queue();
+ return absl::OkStatus();
+ }
+
+ absl::Status Convert(const TensorObject &input_obj, const TensorObject &output_obj) override
+ {
+ auto cpu_input = absl::get_if<CpuMemory>(&input_obj);
+ auto cpu_output = absl::get_if<CpuMemory>(&output_obj);
+
+ if (cpu_input)
+ {
+ auto texture_output = absl::get_if<OpenClTexture>(&output_obj);
+ if (texture_output)
+ {
+ return queue_->EnqueueWriteImage(texture_output->memobj,
+ int3(region_[0], region_[1], region_[2]), cpu_input->data);
+ }
+ auto buffer_output = absl::get_if<OpenClBuffer>(&output_obj);
+ if (buffer_output)
+ {
+ return queue_->EnqueueWriteBuffer(buffer_output->memobj, cpu_input->size_bytes,
+ cpu_input->data);
+ }
+ }
+ else if (cpu_output)
+ {
+ auto texture_input = absl::get_if<OpenClTexture>(&input_obj);
+ if (texture_input)
+ {
+ return queue_->EnqueueReadImage(texture_input->memobj,
+ int3(region_[0], region_[1], region_[2]), cpu_output->data);
+ }
+ auto buffer_input = absl::get_if<OpenClBuffer>(&input_obj);
+ if (buffer_input)
+ {
+ return queue_->EnqueueReadBuffer(buffer_input->memobj, cpu_output->size_bytes,
+ cpu_output->data);
+ }
+ }
+ return absl::InternalError("Unexpected object");
+ }
+
+private:
+ std::array<size_t, 3> region_;
+};
+
+class OpenClTensorConverterBuilder : public TensorObjectConverterBuilder
+{
+public:
+ explicit OpenClTensorConverterBuilder(Environment *environment) : environment_(environment) {}
+
+ bool IsSupported(const TensorObjectDef &input, const TensorObjectDef &output) const final
+ {
+ const auto &input_def = input.object_def;
+ const auto &output_def = output.object_def;
+ return input.dimensions == output.dimensions &&
+ (TrivialCopier::IsSupported(input_def, output_def) ||
+ TensorToTensorConverter::IsSupported(input_def, output_def) ||
+ CpuCopier::IsSupported(input_def, output_def) ||
+ TensorToBHWCBufferConverter::IsSupported(input_def, output_def) ||
+ BHWCBufferToTensorConverter::IsSupported(input_def, output_def));
+ }
+
+ absl::Status MakeConverter(const TensorObjectDef &input, const TensorObjectDef &output,
+ std::unique_ptr<TensorObjectConverter> *converter) final
+ {
+ std::unique_ptr<OpenClConverterImpl> impl;
+ const auto &input_def = input.object_def;
+ const auto &output_def = output.object_def;
+ if (TrivialCopier::IsSupported(input_def, output_def))
+ {
+ impl = absl::make_unique<TrivialCopier>();
+ }
+ else if (TensorToTensorConverter::IsSupported(input_def, output_def))
+ {
+ impl = absl::make_unique<TensorToTensorConverter>();
+ }
+ else if (CpuCopier::IsSupported(input_def, output_def))
+ {
+ impl = absl::make_unique<CpuCopier>();
+ }
+ else if (TensorToBHWCBufferConverter::IsSupported(input_def, output_def))
+ {
+ impl = absl::make_unique<TensorToBHWCBufferConverter>();
+ }
+ else if (BHWCBufferToTensorConverter::IsSupported(input_def, output_def))
+ {
+ impl = absl::make_unique<BHWCBufferToTensorConverter>();
+ }
+ else
+ {
+ return absl::UnimplementedError("Unsupported conversion");
+ }
+ RETURN_IF_ERROR(impl->Init(input, output, environment_));
+ *converter = std::move(impl);
+ return absl::OkStatus();
+ }
+
+ Environment *environment_;
+};
+
+} // namespace
+
+std::unique_ptr<TensorObjectConverterBuilder> NewConverterBuilder(Environment *environment)
+{
+ return absl::make_unique<OpenClTensorConverterBuilder>(environment);
+}
+
+} // namespace gpu_cl
+} // namespace backend
+} // namespace onert
diff --git a/runtime/onert/backend/gpu_cl/open_cl/kernels/Converter.h b/runtime/onert/backend/gpu_cl/open_cl/kernels/Converter.h
new file mode 100644
index 000000000..d69ec85bb
--- /dev/null
+++ b/runtime/onert/backend/gpu_cl/open_cl/kernels/Converter.h
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __ONERT_BACKEND_GPU_CL_OPENCL_KERNELS_CONVERTER_H__
+#define __ONERT_BACKEND_GPU_CL_OPENCL_KERNELS_CONVERTER_H__
+
+#include <memory>
+
+#include "open_cl/Environment.h"
+#include "open_cl/Spi.h"
+
+namespace onert
+{
+namespace backend
+{
+namespace gpu_cl
+{
+// Creates a builder for converters between BHWC layouts and the internal OpenCL
+// tensor representations. F16 and F32 data types are supported.
+std::unique_ptr<TensorObjectConverterBuilder> NewConverterBuilder(Environment *environment);
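+//
+// A minimal usage sketch (illustration only; it assumes an initialized Environment
+// `env`, endpoint definitions `in_def`/`out_def`, and tensor objects
+// `input_obj`/`output_obj` provided by the caller):
+//
+//   auto builder = NewConverterBuilder(&env);
+//   std::unique_ptr<TensorObjectConverter> converter;
+//   if (builder->IsSupported(in_def, out_def)) {
+//     absl::Status status = builder->MakeConverter(in_def, out_def, &converter);
+//     if (status.ok()) status = converter->Convert(input_obj, output_obj);
+//   }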
+
+} // namespace gpu_cl
+} // namespace backend
+} // namespace onert
+
+#endif // __ONERT_BACKEND_GPU_CL_OPENCL_KERNELS_CONVERTER_H__
diff --git a/runtime/onert/backend/gpu_cl/open_cl/kernels/DepthwiseConv.cc b/runtime/onert/backend/gpu_cl/open_cl/kernels/DepthwiseConv.cc
new file mode 100644
index 000000000..e409fef47
--- /dev/null
+++ b/runtime/onert/backend/gpu_cl/open_cl/kernels/DepthwiseConv.cc
@@ -0,0 +1,382 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "DepthwiseConv.h"
+
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "open_cl/ClDevice.h"
+#include "open_cl/kernels/Util.h"
+#include "open_cl/kernels/WorkGroupPicking.h"
+#include "open_cl/LinearStorage.h"
+
+namespace onert
+{
+namespace backend
+{
+namespace gpu_cl
+{
+namespace
+{
+
+bool IsSpecializedCase(int channel_multiplier)
+{
+ return channel_multiplier == 1 || channel_multiplier == 2 || channel_multiplier == 4;
+}
+
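+// Emits OpenCL code that reads the source value for output slice S and expands it
+// according to the channel multiplier, leaving the result in `src_final`.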
+std::string GetSrcValue(int channel_multiplier, const std::string &coords)
+{
+ std::string c;
+ if (channel_multiplier == 1)
+ {
+ c += " FLT4 src_final = args.src_tensor.Read(" + coords + ", S);\n";
+ }
+ else if (channel_multiplier == 2)
+ {
+ c += " int s_layer = S / 2;\n";
+ c += " FLT4 src = args.src_tensor.Read(" + coords + ", s_layer);\n";
+ c += " FLT2 t0 = S % 2 == 0 ? src.xy : src.zw;\n";
+ c += " FLT4 src_final = (FLT4)(t0.x, t0.x, t0.y, t0.y);\n";
+ }
+ else if (channel_multiplier == 4)
+ {
+ c += " int s_layer = S / 4;\n";
+ c += " FLT4 src = args.src_tensor.Read(" + coords + ", s_layer);\n";
+ c += " FLT t0 = src.x;\n";
+ c += " int reminder = S % 4;\n";
+ c += " if (reminder == 1) t0 = src.y;\n";
+ c += " if (reminder == 2) t0 = src.z;\n";
+ c += " if (reminder == 3) t0 = src.w;\n";
+ c += " FLT4 src_final = (FLT4)(t0, t0, t0, t0);\n";
+ }
+ else
+ {
+ c += " int s_layer = S / args.ch_multiplier;\n";
+ c += " FLT4 src = args.src_tensor.Read(" + coords + ", s_layer);\n";
+ c += " int s_offset = (S % args.ch_multiplier) * 4;\n";
+ c += " FLT4 src_final;\n";
+ c += " FLT temp_arr[4] = {src.x, src.y, src.z, src.w};\n";
+ c += " src_final.x = temp_arr[(s_offset + 0) / args.ch_multiplier];\n";
+ c += " src_final.y = temp_arr[(s_offset + 1) / args.ch_multiplier];\n";
+ c += " src_final.z = temp_arr[(s_offset + 2) / args.ch_multiplier];\n";
+ c += " src_final.w = temp_arr[(s_offset + 3) / args.ch_multiplier];\n";
+ }
+
+ return c;
+}
+
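+// Builds the generic depthwise-convolution kernel source. Buffer and image-buffer
+// sources clamp coordinates manually; texture sources rely on the ZERO address mode.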
+std::string GenerateDepthwiseConvolutionCode(const OperationDef &op_def, bool stride_correction,
+ int channel_multiplier, bool weights_are_buffer,
+ bool dynamic_weights, GPUOperation *op)
+{
+ auto src_desc = op_def.src_tensors[0];
+ src_desc.SetTextureAddressMode(TextureAddressMode::ZERO);
+ if (op_def.IsBatchSupported())
+ {
+ src_desc.SetStateVar("BatchedWidth", "true");
+ }
+ op->AddSrcTensor("src_tensor", src_desc);
+ if (dynamic_weights)
+ {
+ op->AddSrcTensor("weights", op_def.src_tensors[1]);
+ }
+
+ auto dst_desc = op_def.dst_tensors[0];
+ if (op_def.IsBatchSupported())
+ {
+ dst_desc.SetStateVar("BatchedWidth", "true");
+ }
+ op->AddDstTensor("dst_tensor", dst_desc);
+
+ const auto src_tensor_type = op_def.src_tensors[0].storage_type;
+
+ std::string c = GetCommonDefines(op_def.precision);
+
+ const bool manual_clamp = src_tensor_type == TensorStorageType::BUFFER ||
+ src_tensor_type == TensorStorageType::IMAGE_BUFFER;
+
+ c += "__kernel void main_function(\n";
+ c += "$0) {\n";
+ c += " int X = get_global_id(0);\n";
+ if (op_def.dst_tensors[0].HasAxis(Axis::DEPTH))
+ {
+ c += " int linear_id_1 = get_global_id(1);\n";
+ c += " int Y = linear_id_1 / args.dst_tensor.Depth();\n";
+ c += " int Z = linear_id_1 % args.dst_tensor.Depth();\n";
+ }
+ else
+ {
+ c += " int Y = get_global_id(1);\n";
+ }
+ c += " int S = get_global_id(2);\n";
+ c += " if (X >= args.dst_tensor.Width() || Y >= args.dst_tensor.Height() || "
+ "S >= args.dst_tensor.Slices()) { \n";
+ c += " return; \n";
+ c += " } \n";
+ c += " ACCUM_FLT4 r = (ACCUM_FLT4)(0.0f, 0.0f, 0.0f, 0.0f);\n";
+ if (stride_correction)
+ {
+ c += " int x_offseted = " +
+ GetXStrideCorrectedV2("X", "args.src_tensor.Batch()", "args.stride_x", "args.padding_x") +
+ ";\n";
+ }
+ else
+ {
+ if (op_def.IsBatchSupported())
+ {
+ c += " int x_offseted = X * args.stride_x + args.padding_x * "
+ "args.src_tensor.Batch();\n";
+ }
+ else
+ {
+ c += " int x_offseted = X * args.stride_x + args.padding_x;\n";
+ }
+ }
+ c += " int y_offseted = Y * args.stride_y + args.padding_y;\n";
+ if (!dynamic_weights)
+ {
+ std::string weights_offset = "args.kernel_size_x * args.kernel_size_y";
+ if (op_def.dst_tensors[0].HasAxis(Axis::DEPTH))
+ {
+ c += " int z_offseted = Z * args.stride_z + args.padding_z;\n";
+ weights_offset += " * args.kernel_size_z";
+ }
+ if (weights_are_buffer)
+ {
+ c += " int fx_c = S * " + weights_offset + ";\n";
+ }
+ else
+ {
+ c += " int fx_c = 0;\n";
+ }
+ }
+ std::string kernel_size_x = dynamic_weights ? "args.weights.Width()" : "args.kernel_size_x";
+ std::string kernel_size_y = dynamic_weights ? "args.weights.Height()" : "args.kernel_size_y";
+ std::string kernel_size_z = dynamic_weights ? "args.weights.Depth()" : "args.kernel_size_z";
+
+ std::string flat_coords = "x_c, y_c";
+ if (manual_clamp)
+ {
+ std::string check = "!outside_x && !outside_y";
+ if (op_def.dst_tensors[0].HasAxis(Axis::DEPTH))
+ {
+ check += " && !outside_z";
+ flat_coords += ", z_c";
+ c += " for (int kz = 0; kz < " + kernel_size_z + "; ++kz) {\n";
+ c += " int z_c = z_offseted + kz * args.dilation_z;\n";
+ c += " bool outside_z = z_c < 0 || z_c >= args.src_tensor.Depth();\n";
+ }
+ c += " for (int ky = 0; ky < " + kernel_size_y + "; ++ky) {\n";
+ c += " int y_c = y_offseted + ky * args.dilation_y;\n";
+ c += " bool outside_y = y_c < 0 || y_c >= args.src_tensor.Height();\n";
+ c += " for (int kx = 0; kx < " + kernel_size_x + "; ++kx) {\n";
+ const std::string dilation_x =
+ op_def.IsBatchSupported() ? "args.dilation_x * args.src_tensor.Batch()" : "args.dilation_x";
+ c += " int x_c = x_offseted + kx * " + dilation_x + ";\n";
+ c += " bool outside_x = x_c < 0 || x_c >= args.src_tensor.Width();\n";
+ c += " if (" + check + ") {\n";
+ if (dynamic_weights)
+ {
+ c += " FLT4 f = args.weights.Read(kx, ky, S);\n";
+ }
+ else
+ {
+ if (weights_are_buffer)
+ {
+ c += " FLT4 f = args.weights.Read(fx_c);\n";
+ }
+ else
+ {
+ c += " FLT4 f = args.weights.Read(fx_c, S);\n";
+ }
+ }
+ c += GetSrcValue(channel_multiplier, flat_coords);
+ c += " r += TO_ACCUM_TYPE(src_final * f);\n";
+ c += " };\n";
+ if (!dynamic_weights)
+ {
+ c += " fx_c++;\n";
+ }
+ c += " }\n";
+ c += " }\n";
+ if (op_def.dst_tensors[0].HasAxis(Axis::DEPTH))
+ {
+ c += " }\n";
+ }
+ }
+ else
+ { // Texture types with ZERO clamping
+ if (op_def.dst_tensors[0].HasAxis(Axis::DEPTH))
+ {
+ flat_coords += ", z_c";
+ c += " for (int kz = 0; kz < " + kernel_size_z + "; ++kz) {\n";
+ c += " int z_c = z_offseted + kz * args.dilation_z;\n";
+ if (src_tensor_type != TensorStorageType::TEXTURE_3D)
+ { // Only TEXTURE_3D supports clamping
+ // in DEPTH dimension
+ c += " if (z_c < 0 || z_c >= args.src_tensor.Depth()) {\n";
+ c += " fx_c += args.kernel_size_y * args.kernel_size_x;\n";
+ c += " continue;\n";
+ c += " }\n";
+ }
+ }
+ c += " for (int ky = 0; ky < " + kernel_size_y + "; ++ky) {\n";
+ c += " int y_c = y_offseted + ky * args.dilation_y;\n";
+ c += " for (int kx = 0; kx < " + kernel_size_x + "; ++kx) {\n";
+ const std::string dilation_x =
+ op_def.IsBatchSupported() ? "args.dilation_x * args.src_tensor.Batch()" : "args.dilation_x";
+ c += " int x_c = x_offseted + kx * " + dilation_x + ";\n";
+ c += GetSrcValue(channel_multiplier, flat_coords);
+ if (dynamic_weights)
+ {
+ c += " FLT4 f = args.weights.Read(kx, ky, S);\n";
+ }
+ else
+ {
+ if (weights_are_buffer)
+ {
+ c += " FLT4 f = args.weights.Read(fx_c);\n";
+ }
+ else
+ {
+ c += " FLT4 f = args.weights.Read(fx_c, S);\n";
+ }
+ c += " fx_c++;\n";
+ }
+ c += " r += TO_ACCUM_TYPE(src_final * f);\n";
+ c += " }\n";
+ c += " }\n";
+ if (op_def.dst_tensors[0].HasAxis(Axis::DEPTH))
+ {
+ c += " }\n";
+ }
+ }
+ c += " FLT4 res0 = TO_FLT4(r) + args.biases.Read(S);\n";
+ if (op_def.dst_tensors[0].HasAxis(Axis::DEPTH))
+ {
+ c += " args.dst_tensor.Write(res0, X, Y, Z, S);\n";
+ }
+ else
+ {
+ c += " args.dst_tensor.Write(res0, X, Y, S);\n";
+ }
+ c += "}\n";
+
+ return c;
+}
+} // namespace
+
+GPUOperation CreateDepthwiseConvolution2D(const DeviceInfo &device_info,
+ const OperationDef &definition,
+ const DepthwiseConvolution2DAttributes &attr)
+{
+ bool weights_are_buffer = device_info.IsMali();
+ GPUOperation op(definition);
+ op.args_.AddInt("kernel_size_x", attr.weights.shape.w);
+ op.args_.AddInt("stride_x", attr.strides.w);
+ op.args_.AddInt("padding_x", -attr.padding.prepended.w);
+ op.args_.AddInt("dilation_x", attr.dilations.w);
+ op.args_.AddInt("kernel_size_y", attr.weights.shape.h);
+ op.args_.AddInt("stride_y", attr.strides.h);
+ op.args_.AddInt("padding_y", -attr.padding.prepended.h);
+ op.args_.AddInt("dilation_y", attr.dilations.h);
+ if (!IsSpecializedCase(attr.weights.shape.o))
+ {
+ op.args_.AddInt("ch_multiplier", attr.weights.shape.o);
+ }
+ const bool stride_correction = definition.IsBatchSupported() && attr.strides.w != 1;
+ op.code_ = GenerateDepthwiseConvolutionCode(definition, stride_correction, attr.weights.shape.o,
+ weights_are_buffer, false, &op);
+ UploadWeightsForDWConv2D(attr.weights, weights_are_buffer, definition.precision, &op);
+ op.tensor_to_grid_ = TensorToGrid::kWBToX_HDToY_SToZ;
+
+ TensorLinearDescriptor desc;
+ desc.storage_type =
+ weights_are_buffer ? LinearStorageType::BUFFER : LinearStorageType::TEXTURE_2D;
+ desc.element_type = definition.GetDataType();
+ desc.UploadLinearData(attr.bias);
+ op.args_.AddObject("biases", absl::make_unique<TensorLinearDescriptor>(std::move(desc)));
+ return op;
+}
+
+GPUOperation
+CreateDepthwiseConvolution2DDynamicWeights(const DeviceInfo &device_info,
+ const OperationDef &definition,
+ const DepthwiseConvolution2DAttributes &attr)
+{
+ GPUOperation op(definition);
+ op.args_.AddInt("stride_x", attr.strides.w);
+ op.args_.AddInt("padding_x", -attr.padding.prepended.w);
+ op.args_.AddInt("dilation_x", attr.dilations.w);
+ op.args_.AddInt("stride_y", attr.strides.h);
+ op.args_.AddInt("padding_y", -attr.padding.prepended.h);
+ op.args_.AddInt("dilation_y", attr.dilations.h);
+ const bool stride_correction = definition.IsBatchSupported() && attr.strides.w != 1;
+ op.code_ = GenerateDepthwiseConvolutionCode(definition, stride_correction, 1, false, true, &op);
+ op.tensor_to_grid_ = TensorToGrid::kWBToX_HDToY_SToZ;
+
+ TensorLinearDescriptor desc;
+ desc.storage_type =
+ device_info.IsMali() ? LinearStorageType::BUFFER : LinearStorageType::TEXTURE_2D;
+ desc.element_type = definition.GetDataType();
+ desc.UploadLinearData(attr.bias);
+ op.args_.AddObject("biases", absl::make_unique<TensorLinearDescriptor>(std::move(desc)));
+ return op;
+}
+
+GPUOperation CreateDepthwiseConvolution3D(const DeviceInfo &device_info,
+ const OperationDef &definition,
+ const DepthwiseConvolution3DAttributes &attr)
+{
+ bool weights_are_buffer = device_info.IsMali();
+ GPUOperation op(definition);
+ op.args_.AddInt("kernel_size_x", attr.weights.shape.w);
+ op.args_.AddInt("stride_x", attr.strides.w);
+ op.args_.AddInt("padding_x", -attr.padding.prepended.w);
+ op.args_.AddInt("dilation_x", attr.dilations.w);
+ op.args_.AddInt("kernel_size_y", attr.weights.shape.h);
+ op.args_.AddInt("stride_y", attr.strides.h);
+ op.args_.AddInt("padding_y", -attr.padding.prepended.h);
+ op.args_.AddInt("dilation_y", attr.dilations.h);
+ op.args_.AddInt("kernel_size_z", attr.weights.shape.d);
+ op.args_.AddInt("stride_z", attr.strides.d);
+ op.args_.AddInt("padding_z", -attr.padding.prepended.d);
+ op.args_.AddInt("dilation_z", attr.dilations.d);
+ if (!IsSpecializedCase(attr.weights.shape.o))
+ {
+ op.args_.AddInt("ch_multiplier", attr.weights.shape.o);
+ }
+ const bool stride_correction = definition.IsBatchSupported() && attr.strides.w != 1;
+ op.code_ = GenerateDepthwiseConvolutionCode(definition, stride_correction, attr.weights.shape.o,
+ weights_are_buffer, false, &op);
+ UploadWeightsForDWConv3D(attr.weights, weights_are_buffer, definition.precision, &op);
+ op.tensor_to_grid_ = TensorToGrid::kWBToX_HDToY_SToZ;
+
+ TensorLinearDescriptor desc;
+ desc.storage_type =
+ weights_are_buffer ? LinearStorageType::BUFFER : LinearStorageType::TEXTURE_2D;
+ desc.element_type = definition.GetDataType();
+ desc.UploadLinearData(attr.bias);
+ op.args_.AddObject("biases", absl::make_unique<TensorLinearDescriptor>(std::move(desc)));
+ return op;
+}
+
+} // namespace gpu_cl
+} // namespace backend
+} // namespace onert
diff --git a/runtime/onert/backend/gpu_cl/open_cl/kernels/DepthwiseConv.h b/runtime/onert/backend/gpu_cl/open_cl/kernels/DepthwiseConv.h
new file mode 100644
index 000000000..cbadd9fde
--- /dev/null
+++ b/runtime/onert/backend/gpu_cl/open_cl/kernels/DepthwiseConv.h
@@ -0,0 +1,233 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __ONERT_BACKEND_GPU_CL_OPENCL_KERNELS_DEPTHWISE_CONV_H__
+#define __ONERT_BACKEND_GPU_CL_OPENCL_KERNELS_DEPTHWISE_CONV_H__
+
+#include <vector>
+
+#include "open_cl/Buffer.h"
+#include "open_cl/kernels/GpuOperation.h"
+#include "open_cl/LinearStorage.h"
+#include "open_cl/Tensor.h"
+#include "open_cl/Texture2d.h"
+#include "open_cl/Util.h"
+#include "open_cl/DataType.h"
+#include "open_cl/Operations.h"
+#include "open_cl/Shape.h"
+#include "open_cl/Status.h"
+#include "open_cl/Tensor.h"
+#include "open_cl/Types.h"
+
+namespace onert
+{
+namespace backend
+{
+namespace gpu_cl
+{
+
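+// Repacks OHWI depthwise weights into vectors of four consecutive output channels
+// per (slice, y, x) position, zero-padding the last group.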
+template <DataType S, typename T>
+void RearrangeWeightsForDWConv2D(const InternalTensor<OHWI, S> &weights, absl::Span<T> dst)
+{
+ const int dst_channels = weights.shape.i * weights.shape.o;
+ const int dst_depth = DivideRoundUp(dst_channels, 4);
+ const int kernel_x = weights.shape.w;
+ const int kernel_y = weights.shape.h;
+
+ int counter = 0;
+ for (int d = 0; d < dst_depth; ++d)
+ {
+ for (int y = 0; y < kernel_y; ++y)
+ {
+ for (int x = 0; x < kernel_x; ++x)
+ {
+ T filter_val;
+ for (int i = 0; i < 4; ++i)
+ {
+ const int d_ch = d * 4 + i;
+ if (d_ch < dst_channels)
+ {
+ const int f_index =
+ weights.shape.LinearIndex({d_ch % weights.shape.o, y, x, d_ch / weights.shape.o});
+ filter_val[i] = weights.data[f_index];
+ }
+ else
+ {
+ filter_val[i] = 0.0f;
+ }
+ }
+ dst[counter++] = filter_val;
+ }
+ }
+ }
+}
+
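+// Packs the depthwise weights (FP32 only for now) and registers them on the
+// operation as either a buffer or a 2D texture object.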
+template <DataType T>
+void UploadWeightsForDWConv2D(const InternalTensor<OHWI, T> &weights, bool weights_are_buffer,
+ CalculationsPrecision precision, GPUOperation *op)
+{
+ const int dst_channels = weights.shape.i * weights.shape.o;
+ const int dst_slices = DivideRoundUp(dst_channels, 4);
+ const int kernel_x = weights.shape.w;
+ const int kernel_y = weights.shape.h;
+
+ const int elements_count = kernel_x * kernel_y * dst_slices;
+
+ const bool fp32_weights = precision == CalculationsPrecision::F32;
+ const int float4_size = fp32_weights ? 16 : 8;
+
+ std::vector<uint8_t> data(float4_size * elements_count);
+
+ if (fp32_weights)
+ {
+ float4 *ptr = reinterpret_cast<float4 *>(data.data());
+ RearrangeWeightsForDWConv2D(weights, absl::MakeSpan(ptr, elements_count));
+ }
+  // TODO: F16 weights are not supported yet. Once they are, enable the
+  // commented-out half4 path below.
+ // else {
+ // half4* ptr = reinterpret_cast<half4*>(data.data());
+ // RearrangeWeightsForDWConv2D(weights, absl::MakeSpan(ptr, elements_count));
+ // }
+
+ if (weights_are_buffer)
+ {
+ BufferDescriptor desc;
+ desc.element_type = fp32_weights ? DataType::FLOAT32 : DataType::FLOAT16;
+ desc.element_size = 4;
+ desc.size = float4_size * elements_count;
+ desc.data = std::move(data);
+ op->args_.AddObject("weights", absl::make_unique<BufferDescriptor>(desc));
+ }
+ else
+ {
+ Texture2DDescriptor desc;
+ desc.element_type = fp32_weights ? DataType::FLOAT32 : DataType::FLOAT16;
+ desc.size = int2(kernel_x * kernel_y, dst_slices);
+ desc.data = std::move(data);
+ op->args_.AddObject("weights", absl::make_unique<Texture2DDescriptor>(desc));
+ }
+}
+
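+// 3D variant of RearrangeWeightsForDWConv2D; also iterates over the kernel depth.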
+template <DataType S, typename T>
+void RearrangeWeightsForDWConv3D(const InternalTensor<OHWDI, S> &weights, absl::Span<T> dst)
+{
+ const int dst_channels = weights.shape.i * weights.shape.o;
+ const int dst_slices = DivideRoundUp(dst_channels, 4);
+ const int kernel_x = weights.shape.w;
+ const int kernel_y = weights.shape.h;
+ const int kernel_z = weights.shape.d;
+
+ int counter = 0;
+ for (int d = 0; d < dst_slices; ++d)
+ {
+ for (int z = 0; z < kernel_z; ++z)
+ {
+ for (int y = 0; y < kernel_y; ++y)
+ {
+ for (int x = 0; x < kernel_x; ++x)
+ {
+ T filter_val;
+ for (int i = 0; i < 4; ++i)
+ {
+ const int d_ch = d * 4 + i;
+ if (d_ch < dst_channels)
+ {
+ const int f_index = weights.shape.LinearIndex(
+ {d_ch % weights.shape.o, y, x, z, d_ch / weights.shape.o});
+ filter_val[i] = weights.data[f_index];
+ }
+ else
+ {
+ filter_val[i] = 0.0f;
+ }
+ }
+ dst[counter++] = filter_val;
+ }
+ }
+ }
+ }
+}
+
+template <DataType T>
+void UploadWeightsForDWConv3D(const InternalTensor<OHWDI, T> &weights, bool weights_are_buffer,
+ CalculationsPrecision precision, GPUOperation *op)
+{
+ const int dst_channels = weights.shape.i * weights.shape.o;
+ const int dst_slices = DivideRoundUp(dst_channels, 4);
+ const int kernel_x = weights.shape.w;
+ const int kernel_y = weights.shape.h;
+ const int kernel_z = weights.shape.d;
+
+ const int elements_count = kernel_x * kernel_y * kernel_z * dst_slices;
+
+ const bool fp32_weights = precision == CalculationsPrecision::F32;
+ const int float4_size = fp32_weights ? 16 : 8;
+
+ std::vector<uint8_t> data(float4_size * elements_count);
+
+ if (fp32_weights)
+ {
+ float4 *ptr = reinterpret_cast<float4 *>(data.data());
+ RearrangeWeightsForDWConv3D(weights, absl::MakeSpan(ptr, elements_count));
+ }
+  // TODO: F16 weights are not supported yet. Once they are, enable the
+  // commented-out half4 path below.
+ // else {
+ // half4* ptr = reinterpret_cast<half4*>(data.data());
+ // RearrangeWeightsForDWConv3D(weights, absl::MakeSpan(ptr, elements_count));
+ // }
+
+ if (weights_are_buffer)
+ {
+ BufferDescriptor desc;
+ desc.element_type = fp32_weights ? DataType::FLOAT32 : DataType::FLOAT16;
+ desc.element_size = 4;
+ desc.size = float4_size * elements_count;
+ desc.data = std::move(data);
+ op->args_.AddObject("weights", absl::make_unique<BufferDescriptor>(std::move(desc)));
+ }
+ else
+ {
+ Texture2DDescriptor desc;
+ desc.element_type = fp32_weights ? DataType::FLOAT32 : DataType::FLOAT16;
+ desc.size = int2(kernel_x * kernel_y * kernel_z, dst_slices);
+ desc.data = std::move(data);
+ op->args_.AddObject("weights", absl::make_unique<Texture2DDescriptor>(std::move(desc)));
+ }
+}
+
+GPUOperation CreateDepthwiseConvolution2D(const DeviceInfo &device_info,
+ const OperationDef &definition,
+ const DepthwiseConvolution2DAttributes &attr);
+
+GPUOperation
+CreateDepthwiseConvolution2DDynamicWeights(const DeviceInfo &device_info,
+ const OperationDef &definition,
+ const DepthwiseConvolution2DAttributes &attr);
+
+GPUOperation CreateDepthwiseConvolution3D(const DeviceInfo &device_info,
+ const OperationDef &definition,
+ const DepthwiseConvolution3DAttributes &attr);
+
+} // namespace gpu_cl
+} // namespace backend
+} // namespace onert
+
+#endif // __ONERT_BACKEND_GPU_CL_OPENCL_KERNELS_DEPTHWISE_CONV_H__
diff --git a/runtime/onert/backend/gpu_cl/open_cl/kernels/DepthwiseConv3x3.cc b/runtime/onert/backend/gpu_cl/open_cl/kernels/DepthwiseConv3x3.cc
new file mode 100644
index 000000000..89a14f14d
--- /dev/null
+++ b/runtime/onert/backend/gpu_cl/open_cl/kernels/DepthwiseConv3x3.cc
@@ -0,0 +1,358 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "DepthwiseConv3x3.h"
+
+#include <string>
+#include <utility>
+
+#include "open_cl/kernels/Util.h"
+#include "open_cl/kernels/WorkGroupPicking.h"
+#include "open_cl/Precision.h"
+#include "open_cl/Status.h"
+
+namespace onert
+{
+namespace backend
+{
+namespace gpu_cl
+{
+
+DepthwiseConv3x3::DepthwiseConv3x3(const OperationDef &definition, bool weights_are_buffer,
+ bool local_mem_uploads, const DeviceInfo &device_info)
+ : GPUOperation(definition), local_mem_uploads_(local_mem_uploads)
+{
+ work_group_size_ = int3(8, 4, 1);
+ code_ = GenerateDepthwiseConvCode(definition_, weights_are_buffer, local_mem_uploads_);
+
+ if (definition_.precision == CalculationsPrecision::F16 && device_info.IsPowerVR())
+ {
+ compiler_options_.push_back(CompilerOptions::POWERVR_FP16);
+ }
+}
+
+DepthwiseConv3x3::DepthwiseConv3x3(DepthwiseConv3x3 &&operation)
+ : GPUOperation(std::move(operation)), local_mem_uploads_(operation.local_mem_uploads_)
+{
+}
+
+DepthwiseConv3x3 &DepthwiseConv3x3::operator=(DepthwiseConv3x3 &&operation)
+{
+ if (this != &operation)
+ {
+ std::swap(local_mem_uploads_, operation.local_mem_uploads_);
+ GPUOperation::operator=(std::move(operation));
+ }
+ return *this;
+}
+
+std::string DepthwiseConv3x3::GenerateDepthwiseConvCode(const OperationDef &op_def,
+ bool weights_are_buffer,
+ bool local_mem_uploads)
+{
+ auto src_desc = op_def.src_tensors[0];
+ src_desc.SetTextureAddressMode(TextureAddressMode::ZERO);
+ AddSrcTensor("src_tensor", src_desc);
+ AddDstTensor("dst_tensor", op_def.dst_tensors[0]);
+
+ const auto src_tensor_type = op_def.src_tensors[0].storage_type;
+
+ const bool manual_clamp = src_tensor_type == TensorStorageType::BUFFER ||
+ src_tensor_type == TensorStorageType::IMAGE_BUFFER;
+
+ std::string c = GetCommonDefines(op_def.precision);
+ if (local_mem_uploads)
+ {
+ c += "__attribute__((reqd_work_group_size(8, 4, 1)))\n";
+ }
+ c += "__kernel void main_function(\n";
+ c += "$0) {\n";
+ if (op_def.dst_tensors[0].HasAxis(Axis::BATCH))
+ {
+ c += " int linear_id = get_global_id(0);\n";
+ c += " int X = (linear_id / args.dst_tensor.Batch()) * 2;\n";
+ c += " int B = linear_id % args.dst_tensor.Batch();\n";
+ c += " args.dst_tensor.SetBatchRef(B);\n";
+ c += " args.src_tensor.SetBatchRef(B);\n";
+ }
+ else
+ {
+ c += " int X = get_global_id(0) * 2;\n";
+ }
+ c += " int Y = get_global_id(1) * 2;\n";
+ c += " int S = get_global_id(2);\n";
+ c += " ACCUM_FLT4 r0 = (ACCUM_FLT4)(0.0f);\n";
+ c += " ACCUM_FLT4 r1 = (ACCUM_FLT4)(0.0f);\n";
+ c += " ACCUM_FLT4 r2 = (ACCUM_FLT4)(0.0f);\n";
+ c += " ACCUM_FLT4 r3 = (ACCUM_FLT4)(0.0f);\n";
+ if (!local_mem_uploads)
+ {
+ c += " if (X >= args.dst_tensor.Width() || Y >= args.dst_tensor.Height() "
+ "|| S >= args.dst_tensor.Slices()) { \n";
+ c += " return; \n";
+ c += " } \n";
+ }
+ if (local_mem_uploads)
+ {
+ c += " __local FLT4 f[10];\n";
+ c += " event_t e = async_work_group_copy(f, args.weights.GetPtr() + S * "
+ "10, 10, 0);\n";
+ c += " wait_group_events(1, &e);\n";
+ }
+ else if (weights_are_buffer)
+ {
+ c += " __global FLT4* f = args.weights.GetPtr() + S * 10;\n";
+ }
+ c += " FLT4 s0;\n";
+ c += " FLT4 s1;\n";
+ c += " FLT4 s2;\n";
+ c += " FLT4 s3;\n";
+ std::string W[9] = {"f0", "f1", "f2", "f3", "f4", "f5", "f6", "f7", "f8"};
+ std::string bias = "bias";
+ std::string xc[4] = {"X - 1", "X", "X + 1", "X + 2"};
+ std::string yc[4] = {"Y - 1", "Y", "Y + 1", "Y + 2"};
+ if (!weights_are_buffer)
+ {
+ c += " FLT4 f0 = args.weights.Read(0, S);\n";
+ c += " FLT4 f1 = args.weights.Read(1, S);\n";
+ c += " FLT4 f2 = args.weights.Read(2, S);\n";
+ c += " FLT4 f3 = args.weights.Read(3, S);\n";
+ c += " FLT4 f4 = args.weights.Read(4, S);\n";
+ c += " FLT4 f5 = args.weights.Read(5, S);\n";
+ c += " FLT4 f6 = args.weights.Read(6, S);\n";
+ c += " FLT4 f7 = args.weights.Read(7, S);\n";
+ c += " FLT4 f8 = args.weights.Read(8, S);\n";
+ }
+ if (manual_clamp)
+ {
+ c += " int x0 = X - 1;\n";
+ c += " int x1 = X;\n";
+ c += " int x2 = X + 1;\n";
+ c += " int x3 = X + 2;\n";
+ c += " int y0 = Y - 1;\n";
+ c += " int y1 = Y;\n";
+ c += " int y2 = Y + 1;\n";
+ c += " int y3 = Y + 2;\n";
+ c += " bool x0_in = x0 >= 0 && x0 < args.dst_tensor.Width();\n";
+ c += " bool x1_in = x1 >= 0 && x1 < args.dst_tensor.Width();\n";
+ c += " bool x2_in = x2 >= 0 && x2 < args.dst_tensor.Width();\n";
+ c += " bool x3_in = x3 >= 0 && x3 < args.dst_tensor.Width();\n";
+ c += " bool y0_in = y0 >= 0 && y0 < args.dst_tensor.Height();\n";
+ c += " bool y1_in = y1 >= 0 && y1 < args.dst_tensor.Height();\n";
+ c += " bool y2_in = y2 >= 0 && y2 < args.dst_tensor.Height();\n";
+ c += " bool y3_in = y3 >= 0 && y3 < args.dst_tensor.Height();\n";
+ c += " x0 = clamp(x0, 0, args.dst_tensor.Width() - 1);\n";
+ c += " x1 = clamp(x1, 0, args.dst_tensor.Width() - 1);\n";
+ c += " x2 = clamp(x2, 0, args.dst_tensor.Width() - 1);\n";
+ c += " x3 = clamp(x3, 0, args.dst_tensor.Width() - 1);\n";
+ c += " y0 = clamp(y0, 0, args.dst_tensor.Height() - 1);\n";
+ c += " y1 = clamp(y1, 0, args.dst_tensor.Height() - 1);\n";
+ c += " y2 = clamp(y2, 0, args.dst_tensor.Height() - 1);\n";
+ c += " y3 = clamp(y3, 0, args.dst_tensor.Height() - 1);\n";
+ if (src_tensor_type == TensorStorageType::BUFFER)
+ {
+ c += " __global FLT4* src_loc = "
+ "args.src_tensor.GetPtrWithSliceOffset(S);\n";
+ }
+ xc[0] = "x0";
+ xc[1] = "x1";
+ xc[2] = "x2";
+ xc[3] = "x3";
+ yc[0] = "y0";
+ yc[1] = "y1";
+ yc[2] = "y2";
+ yc[3] = "y3";
+ }
+ if (local_mem_uploads || weights_are_buffer)
+ {
+ W[0] = "f[0]";
+ W[1] = "f[1]";
+ W[2] = "f[2]";
+ W[3] = "f[3]";
+ W[4] = "f[4]";
+ W[5] = "f[5]";
+ W[6] = "f[6]";
+ W[7] = "f[7]";
+ W[8] = "f[8]";
+ bias = "f[9]";
+ }
+ auto read_4x_line = [&](int y) {
+ if (src_tensor_type == TensorStorageType::BUFFER)
+ {
+ const std::string y_in = "y" + std::to_string(y) + "_in";
+ c += " s0 = src_loc[args.src_tensor.GetWHOffset(" + xc[0] + ", " + yc[y] +
+ ")] * (FLT)(x0_in && " + y_in + ");\n";
+ c += " s1 = src_loc[args.src_tensor.GetWHOffset(" + xc[1] + ", " + yc[y] +
+ ")] * (FLT)(x1_in && " + y_in + ");\n";
+ c += " s2 = src_loc[args.src_tensor.GetWHOffset(" + xc[2] + ", " + yc[y] +
+ ")] * (FLT)(x2_in && " + y_in + ");\n";
+ c += " s3 = src_loc[args.src_tensor.GetWHOffset(" + xc[3] + ", " + yc[y] +
+ ")] * (FLT)(x3_in && " + y_in + ");\n";
+ }
+ else if (src_tensor_type == TensorStorageType::IMAGE_BUFFER)
+ {
+ const std::string y_in = "y" + std::to_string(y) + "_in";
+ c += " s0 = args.src_tensor.Read(" + xc[0] + ", " + yc[y] + ", S) * (FLT)(x0_in && " +
+ y_in + ");\n";
+ c += " s1 = args.src_tensor.Read(" + xc[1] + ", " + yc[y] + ", S) * (FLT)(x1_in && " +
+ y_in + ");\n";
+ c += " s2 = args.src_tensor.Read(" + xc[2] + ", " + yc[y] + ", S) * (FLT)(x2_in && " +
+ y_in + ");\n";
+ c += " s3 = args.src_tensor.Read(" + xc[3] + ", " + yc[y] + ", S) * (FLT)(x3_in && " +
+ y_in + ");\n";
+ }
+ else
+ {
+ c += " s0 = args.src_tensor.Read(" + xc[0] + ", " + yc[y] + ", S);\n";
+ c += " s1 = args.src_tensor.Read(" + xc[1] + ", " + yc[y] + ", S);\n";
+ c += " s2 = args.src_tensor.Read(" + xc[2] + ", " + yc[y] + ", S);\n";
+ c += " s3 = args.src_tensor.Read(" + xc[3] + ", " + yc[y] + ", S);\n";
+ }
+ };
+ c += " {\n";
+ read_4x_line(0);
+ c += " r0 += TO_ACCUM_TYPE(" + W[0] + " * s0);\n";
+ c += " r0 += TO_ACCUM_TYPE(" + W[1] + " * s1);\n";
+ c += " r1 += TO_ACCUM_TYPE(" + W[0] + " * s1);\n";
+ c += " r0 += TO_ACCUM_TYPE(" + W[2] + " * s2);\n";
+ c += " r1 += TO_ACCUM_TYPE(" + W[1] + " * s2);\n";
+ c += " r1 += TO_ACCUM_TYPE(" + W[2] + " * s3);\n";
+ c += " }\n";
+ c += " {\n";
+ read_4x_line(1);
+ c += " r0 += TO_ACCUM_TYPE(" + W[3] + " * s0);\n";
+ c += " r2 += TO_ACCUM_TYPE(" + W[0] + " * s0);\n";
+ c += " r0 += TO_ACCUM_TYPE(" + W[4] + " * s1);\n";
+ c += " r1 += TO_ACCUM_TYPE(" + W[3] + " * s1);\n";
+ c += " r2 += TO_ACCUM_TYPE(" + W[1] + " * s1);\n";
+ c += " r3 += TO_ACCUM_TYPE(" + W[0] + " * s1);\n";
+ c += " r0 += TO_ACCUM_TYPE(" + W[5] + " * s2);\n";
+ c += " r1 += TO_ACCUM_TYPE(" + W[4] + " * s2);\n";
+ c += " r2 += TO_ACCUM_TYPE(" + W[2] + " * s2);\n";
+ c += " r3 += TO_ACCUM_TYPE(" + W[1] + " * s2);\n";
+ c += " r1 += TO_ACCUM_TYPE(" + W[5] + " * s3);\n";
+ c += " r3 += TO_ACCUM_TYPE(" + W[2] + " * s3);\n";
+ c += " }\n";
+ c += " {\n";
+ read_4x_line(2);
+ c += " r0 += TO_ACCUM_TYPE(" + W[6] + " * s0);\n";
+ c += " r2 += TO_ACCUM_TYPE(" + W[3] + " * s0);\n";
+ c += " r0 += TO_ACCUM_TYPE(" + W[7] + " * s1);\n";
+ c += " r1 += TO_ACCUM_TYPE(" + W[6] + " * s1);\n";
+ c += " r2 += TO_ACCUM_TYPE(" + W[4] + " * s1);\n";
+ c += " r3 += TO_ACCUM_TYPE(" + W[3] + " * s1);\n";
+ c += " r0 += TO_ACCUM_TYPE(" + W[8] + " * s2);\n";
+ c += " r1 += TO_ACCUM_TYPE(" + W[7] + " * s2);\n";
+ c += " r2 += TO_ACCUM_TYPE(" + W[5] + " * s2);\n";
+ c += " r3 += TO_ACCUM_TYPE(" + W[4] + " * s2);\n";
+ c += " r1 += TO_ACCUM_TYPE(" + W[8] + " * s3);\n";
+ c += " r3 += TO_ACCUM_TYPE(" + W[5] + " * s3);\n";
+ c += " }\n";
+ c += " {\n";
+ read_4x_line(3);
+ c += " r2 += TO_ACCUM_TYPE(" + W[6] + " * s0);\n";
+ c += " r2 += TO_ACCUM_TYPE(" + W[7] + " * s1);\n";
+ c += " r3 += TO_ACCUM_TYPE(" + W[6] + " * s1);\n";
+ c += " r2 += TO_ACCUM_TYPE(" + W[8] + " * s2);\n";
+ c += " r3 += TO_ACCUM_TYPE(" + W[7] + " * s2);\n";
+ c += " r3 += TO_ACCUM_TYPE(" + W[8] + " * s3);\n";
+ c += " }\n";
+ if (!weights_are_buffer)
+ {
+ c += " FLT4 bias = args.weights.Read(9, S);\n";
+ }
+ c += " r0 += TO_ACCUM_TYPE(" + bias + ");\n";
+ c += " r1 += TO_ACCUM_TYPE(" + bias + ");\n";
+ c += " r2 += TO_ACCUM_TYPE(" + bias + ");\n";
+ c += " r3 += TO_ACCUM_TYPE(" + bias + ");\n";
+ if (local_mem_uploads)
+ {
+ c += " if (X >= args.dst_tensor.Width() || Y >= args.dst_tensor.Height() "
+ "|| S >= args.dst_tensor.Slices()) { \n";
+ c += " return; \n";
+ c += " } \n";
+ }
+ c += " if(X + 0 < args.dst_tensor.Width() && Y + 0 < "
+ "args.dst_tensor.Height()) {\n";
+ c += " FLT4 result = TO_FLT4(r0);\n";
+ c += " args.dst_tensor.Write(result, X + 0, Y + 0, S)\n";
+ c += " }\n";
+ c += " if(X + 1 < args.dst_tensor.Width() && Y + 0 < "
+ "args.dst_tensor.Height()) {\n";
+ c += " FLT4 result = TO_FLT4(r1);\n";
+ c += " args.dst_tensor.Write(result, X + 1, Y + 0, S)\n";
+ c += " }\n";
+ c += " if(X + 0 < args.dst_tensor.Width() && Y + 1 < "
+ "args.dst_tensor.Height()) {\n";
+ c += " FLT4 result = TO_FLT4(r2);\n";
+ c += " args.dst_tensor.Write(result, X + 0, Y + 1, S)\n";
+ c += " }\n";
+ c += " if(X + 1 < args.dst_tensor.Width() && Y + 1 < "
+ "args.dst_tensor.Height()) {\n";
+ c += " FLT4 result = TO_FLT4(r3);\n";
+ c += " args.dst_tensor.Write(result, X + 1, Y + 1, S)\n";
+ c += " }\n";
+ c += "}\n";
+
+ return c;
+}
+
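+// Each work item computes a 2x2 block of output pixels, so width and height are
+// divided by two when sizing the grid.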
+int3 DepthwiseConv3x3::GetGridSize() const
+{
+ const int grid_x = DivideRoundUp(dst_[0]->Width(), 2) * dst_[0]->Batch();
+ const int grid_y = DivideRoundUp(dst_[0]->Height(), 2);
+ const int grid_z = dst_[0]->Slices();
+ return int3(grid_x, grid_y, grid_z);
+}
+
+void DepthwiseConv3x3::GetPossibleKernelWorkGroups(TuningType tuning_type,
+ const DeviceInfo &device_info,
+ const KernelInfo &kernel_info,
+ std::vector<int3> *work_groups) const
+{
+ if (local_mem_uploads_)
+ {
+ work_groups->push_back(work_group_size_);
+ }
+ else
+ {
+ GetPossibleWorkGroups(tuning_type, device_info, kernel_info, grid_size_, work_groups);
+ }
+}
+
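+// The specialized kernel handles only depth multiplier 1, 3x3 filters, stride 1,
+// dilation 1, and one-pixel padding on every side.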
+bool IsDepthwiseConv3x3Supported(const DepthwiseConvolution2DAttributes &attr)
+{
+ return attr.weights.shape.o == 1 && attr.dilations.w == 1 && attr.dilations.h == 1 &&
+ attr.weights.shape.w == 3 && attr.weights.shape.h == 3 && attr.strides.w == 1 &&
+ attr.strides.h == 1 && attr.padding.prepended.w == 1 && attr.padding.prepended.h == 1 &&
+ attr.padding.appended.w == 1 && attr.padding.appended.h == 1;
+}
+
+DepthwiseConv3x3 CreateDepthwiseConv3x3(const DeviceInfo &device_info,
+ const OperationDef &definition,
+ const DepthwiseConvolution2DAttributes &attr)
+{
+ bool weights_are_buffer = device_info.IsPowerVR() || device_info.IsMali();
+ bool local_mem_uploads = weights_are_buffer && device_info.IsPowerVR();
+ DepthwiseConv3x3 result(definition, weights_are_buffer, local_mem_uploads, device_info);
+ result.UploadWeightsAndBiases(attr.weights, attr.bias, weights_are_buffer);
+ return result;
+}
+
+} // namespace gpu_cl
+} // namespace backend
+} // namespace onert
diff --git a/runtime/onert/backend/gpu_cl/open_cl/kernels/DepthwiseConv3x3.h b/runtime/onert/backend/gpu_cl/open_cl/kernels/DepthwiseConv3x3.h
new file mode 100644
index 000000000..8c571105a
--- /dev/null
+++ b/runtime/onert/backend/gpu_cl/open_cl/kernels/DepthwiseConv3x3.h
@@ -0,0 +1,177 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __ONERT_BACKEND_GPU_CL_OPENCL_KERNELS_DEPTHWISE_CONV_3X3_H__
+#define __ONERT_BACKEND_GPU_CL_OPENCL_KERNELS_DEPTHWISE_CONV_3X3_H__
+
+#include <memory>
+#include <vector>
+
+#include "open_cl/Buffer.h"
+#include "open_cl/kernels/GpuOperation.h"
+#include "open_cl/Tensor.h"
+#include "open_cl/Texture2d.h"
+#include "open_cl/Util.h"
+#include "open_cl/DataType.h"
+#include "open_cl/Operations.h"
+#include "open_cl/Shape.h"
+#include "open_cl/Status.h"
+#include "open_cl/Tensor.h"
+#include "open_cl/Types.h"
+
+namespace onert
+{
+namespace backend
+{
+namespace gpu_cl
+{
+
+class DepthwiseConv3x3 : public GPUOperation
+{
+public:
+ DepthwiseConv3x3() = default;
+ void GetPossibleKernelWorkGroups(TuningType tuning_type, const DeviceInfo &device_info,
+ const KernelInfo &kernel_info,
+ std::vector<int3> *work_groups) const override;
+ int3 GetGridSize() const override;
+
+ // Move only
+ DepthwiseConv3x3(DepthwiseConv3x3 &&operation);
+ DepthwiseConv3x3 &operator=(DepthwiseConv3x3 &&operation);
+ DepthwiseConv3x3(const DepthwiseConv3x3 &) = delete;
+ DepthwiseConv3x3 &operator=(const DepthwiseConv3x3 &) = delete;
+
+private:
+ explicit DepthwiseConv3x3(const OperationDef &definition, bool weights_are_buffer,
+ bool local_mem_uploads, const DeviceInfo &device_info);
+ template <DataType T>
+ void UploadWeightsAndBiases(const InternalTensor<OHWI, T> &weights,
+ const InternalTensor<Linear, T> &biases, bool weights_are_buffer);
+
+ friend DepthwiseConv3x3 CreateDepthwiseConv3x3(const DeviceInfo &device_info,
+ const OperationDef &definition,
+ const DepthwiseConvolution2DAttributes &attr);
+
+ template <DataType S, typename T>
+ void RearrangeWeightsAndBiasesData(const InternalTensor<OHWI, S> &weights,
+ const InternalTensor<Linear, S> &biases, absl::Span<T> dst);
+
+ std::string GenerateDepthwiseConvCode(const OperationDef &op_def, bool weights_are_buffer,
+ bool local_mem_uploads);
+
+ bool local_mem_uploads_;
+};
+
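+// Packs the 3x3 filter taps plus the bias into ten FLT4 values per source slice,
+// stored as a buffer or a 2D texture depending on weights_are_buffer.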
+template <DataType T>
+void DepthwiseConv3x3::UploadWeightsAndBiases(const InternalTensor<OHWI, T> &weights,
+ const InternalTensor<Linear, T> &biases,
+ bool weights_are_buffer)
+{
+ const int src_depth = DivideRoundUp(weights.shape.i, 4);
+ int texture_width = 10; // 3x3 kernel + 1 bias
+ int texture_height = src_depth;
+ const int elements_count = texture_width * texture_height;
+ const bool fp32_weights = definition_.precision == CalculationsPrecision::F32;
+ const int float4_size = fp32_weights ? 16 : 8;
+
+ std::vector<uint8_t> data(float4_size * elements_count);
+ if (fp32_weights)
+ {
+ float4 *ptr = reinterpret_cast<float4 *>(data.data());
+ RearrangeWeightsAndBiasesData(weights, biases, absl::MakeSpan(ptr, elements_count));
+ }
+  // TODO: F16 weights are not supported yet. Once they are, enable the
+  // commented-out half4 path below.
+ // else {
+ // half4* ptr = reinterpret_cast<half4*>(data.data());
+ // RearrangeWeightsAndBiasesData(weights, biases,
+ // absl::MakeSpan(ptr, elements_count));
+ // }
+
+ if (weights_are_buffer)
+ {
+ BufferDescriptor desc;
+ desc.element_type = fp32_weights ? DataType::FLOAT32 : DataType::FLOAT16;
+ desc.element_size = 4;
+ desc.size = float4_size * elements_count;
+ desc.data = std::move(data);
+ args_.AddObject("weights", absl::make_unique<BufferDescriptor>(std::move(desc)));
+ }
+ else
+ {
+ Texture2DDescriptor desc;
+ desc.element_type = fp32_weights ? DataType::FLOAT32 : DataType::FLOAT16;
+ desc.size = int2(texture_width, texture_height);
+ desc.data = std::move(data);
+ args_.AddObject("weights", absl::make_unique<Texture2DDescriptor>(std::move(desc)));
+ }
+}
+
+template <DataType S, typename T>
+void DepthwiseConv3x3::RearrangeWeightsAndBiasesData(const InternalTensor<OHWI, S> &weights,
+ const InternalTensor<Linear, S> &biases,
+ absl::Span<T> dst)
+{
+ const int src_depth = DivideRoundUp(weights.shape.i, 4);
+
+ int counter = 0;
+ for (int s = 0; s < src_depth; ++s)
+ {
+ for (int y = 0; y < 3; ++y)
+ {
+ for (int x = 0; x < 3; ++x)
+ {
+ T filter_val;
+ for (int i = 0; i < 4; ++i)
+ {
+ const int s_ch = s * 4 + i;
+ if (s_ch < weights.shape.i)
+ {
+ const int f_index = weights.shape.LinearIndex({0, y, x, s_ch});
+ filter_val[i] = weights.data[f_index];
+ }
+ else
+ {
+ filter_val[i] = 0.0f;
+ }
+ }
+ dst[counter++] = filter_val;
+ }
+ }
+
+ T bias_val;
+ for (int i = 0; i < 4; ++i)
+ {
+ const int dst_ch = s * 4 + i;
+ bias_val[i] = dst_ch >= biases.shape.v ? 0.0f : biases.data[dst_ch];
+ }
+ dst[counter++] = bias_val;
+ }
+}
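+
+// Layout example (illustrative): for hypothetical OHWI weights of shape
+// 1x3x3x6, src_depth = DivideRoundUp(6, 4) = 2, so the packed data holds
+// 2 * (9 filter float4 values + 1 bias float4) = 20 float4 entries, matching
+// texture_width = 10 and texture_height = 2 above.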
+
+bool IsDepthwiseConv3x3Supported(const DepthwiseConvolution2DAttributes &attr);
+
+DepthwiseConv3x3 CreateDepthwiseConv3x3(const DeviceInfo &device_info,
+ const OperationDef &definition,
+ const DepthwiseConvolution2DAttributes &attr);
+
+} // namespace gpu_cl
+} // namespace backend
+} // namespace onert
+
+#endif // __ONERT_BACKEND_GPU_CL_OPENCL_KERNELS_DEPTHWISE_CONV_3X3_H__
diff --git a/runtime/onert/backend/gpu_cl/open_cl/kernels/GpuOperation.cc b/runtime/onert/backend/gpu_cl/open_cl/kernels/GpuOperation.cc
new file mode 100644
index 000000000..8839d9687
--- /dev/null
+++ b/runtime/onert/backend/gpu_cl/open_cl/kernels/GpuOperation.cc
@@ -0,0 +1,385 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "GpuOperation.h"
+
+#include "Util.h"
+#include "WorkGroupPicking.h"
+#include "open_cl/AccessType.h"
+
+namespace onert
+{
+namespace backend
+{
+namespace gpu_cl
+{
+namespace
+{
+
+std::string GetElementWiseCode(const OperationDef &op_def, bool check_src_slices)
+{
+ std::string c = GetCommonDefines(op_def.precision);
+
+ c += "__kernel void main_function(\n";
+ c += "$0) {\n";
+ c += " int X = get_global_id(0);\n";
+ c += " int Y = get_global_id(1);\n";
+ c += " int Z = get_global_id(2);\n";
+ c += " if (X >= args.dst_tensor.Width() || Y >= args.dst_tensor.Height() || "
+ "Z >= args.dst_tensor.Slices()) return; \n";
+ if (check_src_slices)
+ {
+ c += " FLT4 src = (FLT4)(0.0f);\n";
+ c += " if (Z < args.src_tensor.Slices()) {\n";
+ c += " src = args.src_tensor.Read(X, Y, Z);\n";
+ c += " }\n";
+ }
+ else
+ {
+ c += " FLT4 src = args.src_tensor.Read(X, Y, Z);\n";
+ }
+ c += " args.dst_tensor.Write(src, X, Y, Z);\n";
+ c += "} \n";
+ return c;
+}
+
+int3 GetWorkGroupsCount(int grid_dimension, const int3 &grid_size, const int3 &work_group_size,
+ const int3 &work_group_launch_order)
+{
+ int3 work_groups_count;
+ if (grid_dimension == 1)
+ {
+ work_groups_count.x = DivideRoundUp(grid_size.x, work_group_size.x);
+ work_groups_count.y = 1;
+ work_groups_count.z = 1;
+ }
+ else if (grid_dimension == 2)
+ {
+ int3 wgs;
+ wgs.x = DivideRoundUp(grid_size.x, work_group_size.x);
+ wgs.y = DivideRoundUp(grid_size.y, work_group_size.y);
+ work_groups_count.x = wgs[work_group_launch_order[0]];
+ work_groups_count.y = wgs[work_group_launch_order[1]];
+ work_groups_count.z = 1;
+ }
+ else
+ { // grid_dimension == 3
+ int3 wgs;
+ wgs.x = DivideRoundUp(grid_size.x, work_group_size.x);
+ wgs.y = DivideRoundUp(grid_size.y, work_group_size.y);
+ wgs.z = DivideRoundUp(grid_size.z, work_group_size.z);
+ work_groups_count.x = wgs[work_group_launch_order[0]];
+ work_groups_count.y = wgs[work_group_launch_order[1]];
+ work_groups_count.z = wgs[work_group_launch_order[2]];
+ }
+ return work_groups_count;
+}
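+
+// Example (illustrative): for grid_size = (100, 57, 16), work_group_size =
+// (8, 4, 1) and work_group_launch_order = (0, 1, 2) the result is
+// (DivideRoundUp(100, 8), DivideRoundUp(57, 4), DivideRoundUp(16, 1)) =
+// (13, 15, 16); a different launch order only permutes which count lands in
+// x, y and z.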
+
+} // namespace
+
+DataType OperationDef::GetDataType() const { return DeduceDataTypeFromPrecision(precision); }
+
+DataType OperationDef::GetPrimaryDataType() const { return src_tensors[0].data_type; }
+TensorStorageType OperationDef::GetPrimaryStorageType() const
+{
+ return src_tensors[0].storage_type;
+}
+
+bool OperationDef::IsBatchSupported() const
+{
+ for (const auto &src : src_tensors)
+ {
+ if (HasAxis(src.layout, Axis::BATCH))
+ {
+ return true;
+ }
+ }
+ for (const auto &dst : dst_tensors)
+ {
+ if (HasAxis(dst.layout, Axis::BATCH))
+ {
+ return true;
+ }
+ }
+ return false;
+}
+
+GPUOperation::GPUOperation(const OperationDef &definition) : definition_(definition) {}
+
+void GPUOperation::SetSrc(Tensor *ptr, int index)
+{
+ if (index >= (int)src_.size())
+ {
+ src_.resize(index + 1, nullptr);
+ }
+ src_[index] = ptr;
+}
+
+void GPUOperation::SetDst(Tensor *ptr, int index)
+{
+ if (index >= (int)dst_.size())
+ {
+ dst_.resize(index + 1, nullptr);
+ }
+ dst_[index] = ptr;
+}
+
+GPUOperation::GPUOperation(GPUOperation &&operation)
+ : args_(std::move(operation.args_)), code_(std::move(operation.code_)),
+ work_group_size_(operation.work_group_size_),
+ compiler_options_(std::move(operation.compiler_options_)),
+ tensor_to_grid_(operation.tensor_to_grid_), elementwise_(operation.elementwise_),
+ linkable_(operation.linkable_), check_src_channels_size_(operation.check_src_channels_size_),
+ definition_(std::move(operation.definition_)), src_(std::move(operation.src_)),
+ dst_(std::move(operation.dst_)), kernel_(std::move(operation.kernel_)),
+ grid_dimension_(operation.grid_dimension_),
+ work_group_launch_order_(operation.work_group_launch_order_), grid_size_(operation.grid_size_),
+ src_tensors_names_(std::move(operation.src_tensors_names_)),
+ dst_tensors_names_(std::move(operation.dst_tensors_names_)),
+ work_groups_count_(operation.work_groups_count_), linkable_count_(operation.linkable_count_),
+ elementwise_code_(std::move(operation.elementwise_code_))
+{
+}
+
+GPUOperation &GPUOperation::operator=(GPUOperation &&operation)
+{
+ if (this != &operation)
+ {
+ args_ = std::move(operation.args_);
+ code_ = std::move(operation.code_);
+ std::swap(work_group_size_, operation.work_group_size_);
+ compiler_options_ = std::move(operation.compiler_options_);
+ tensor_to_grid_ = operation.tensor_to_grid_;
+ elementwise_ = operation.elementwise_;
+ linkable_ = operation.linkable_;
+ check_src_channels_size_ = operation.check_src_channels_size_;
+ definition_ = std::move(operation.definition_);
+ src_ = std::move(operation.src_);
+ dst_ = std::move(operation.dst_);
+ kernel_ = std::move(operation.kernel_);
+ std::swap(grid_dimension_, operation.grid_dimension_);
+ std::swap(work_group_launch_order_, operation.work_group_launch_order_);
+ std::swap(grid_size_, operation.grid_size_);
+ src_tensors_names_ = std::move(operation.src_tensors_names_);
+ dst_tensors_names_ = std::move(operation.dst_tensors_names_);
+ std::swap(work_groups_count_, operation.work_groups_count_);
+ std::swap(linkable_count_, operation.linkable_count_);
+ elementwise_code_ = std::move(operation.elementwise_code_);
+ }
+ return *this;
+}
+
+absl::Status GPUOperation::AddOperation(GPUOperation *operation)
+{
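+  // Fuse the attached elementwise op: its arguments and code get a unique
+  // postfix (e.g. "_link1" for the first attached op), its body is appended
+  // to elementwise_code_, and its args and tensor names are merged into this
+  // operation.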
+ linkable_count_ += 1;
+ std::string code = operation->code_;
+ std::string unique_postfix = absl::StrCat("_link", linkable_count_);
+ operation->args_.RenameArgs(unique_postfix, &code);
+ elementwise_code_ += "{\n" + code + "\n}\n";
+ RETURN_IF_ERROR(args_.Merge(std::move(operation->args_), unique_postfix));
+ for (size_t i = 0; i < operation->src_tensors_names_.size(); ++i)
+ {
+ definition_.src_tensors.push_back(operation->definition_.src_tensors[i + 1]);
+ src_tensors_names_.push_back(operation->src_tensors_names_[i] + unique_postfix);
+ }
+ for (size_t i = 0; i < operation->dst_tensors_names_.size(); ++i)
+ {
+ dst_tensors_names_.push_back(operation->dst_tensors_names_[i] + unique_postfix);
+ }
+ return absl::OkStatus();
+}
+
+void GPUOperation::AddSrcTensor(const std::string &tensor_name, const TensorDescriptor &desc)
+{
+ src_tensors_names_.push_back(tensor_name);
+ auto desc_new = std::make_unique<TensorDescriptor>(desc);
+ args_.AddObjectRef(tensor_name, AccessType::READ, std::move(desc_new));
+}
+
+void GPUOperation::AddSrcBuffer(const std::string &buffer_name, const BufferDescriptor &desc)
+{
+ src_tensors_names_.push_back(buffer_name);
+ auto desc_new = std::make_unique<BufferDescriptor>(desc);
+ args_.AddObjectRef(buffer_name, AccessType::READ, std::move(desc_new));
+}
+
+void GPUOperation::AddDstTensor(const std::string &tensor_name, const TensorDescriptor &desc)
+{
+ dst_tensors_names_.push_back(tensor_name);
+ auto desc_new = std::make_unique<TensorDescriptor>(desc);
+ args_.AddObjectRef(tensor_name, AccessType::WRITE, std::move(desc_new));
+}
+
+absl::Status GPUOperation::UpdateParams()
+{
+ for (size_t i = 0; i < src_tensors_names_.size(); ++i)
+ {
+ RETURN_IF_ERROR(args_.SetObjectRef(src_tensors_names_[i], src_[i]));
+ }
+ for (size_t i = 0; i < dst_tensors_names_.size(); ++i)
+ {
+ RETURN_IF_ERROR(args_.SetObjectRef(dst_tensors_names_[i], dst_[i]));
+ }
+ RETURN_IF_ERROR(BindArguments(&args_));
+ grid_size_ = GetGridSize();
+ work_groups_count_ =
+ GetWorkGroupsCount(grid_dimension_, grid_size_, work_group_size_, work_group_launch_order_);
+ return absl::OkStatus();
+}
+
+absl::Status GPUOperation::AssembleCode(const DeviceInfo &device_info, CLContext *context)
+{
+ if (elementwise_)
+ {
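+    // A purely elementwise operation has no host kernel of its own: its code_
+    // (plus any linked ops) is folded into elementwise_code_, and a generic
+    // copy kernel produced by GetElementWiseCode() hosts it around the
+    // dst_tensor write.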
+ auto src_desc = absl::make_unique<TensorDescriptor>(definition_.src_tensors[0]);
+ if (definition_.IsBatchSupported())
+ {
+ src_desc->SetStateVar("BatchedWidth", "true");
+ }
+ src_tensors_names_.insert(src_tensors_names_.begin(), "src_tensor");
+ args_.AddObjectRef("src_tensor", AccessType::READ, std::move(src_desc));
+
+ auto dst_desc = absl::make_unique<TensorDescriptor>(definition_.dst_tensors[0]);
+ if (definition_.IsBatchSupported())
+ {
+ dst_desc->SetStateVar("BatchedWidth", "true");
+ }
+ dst_tensors_names_.insert(dst_tensors_names_.begin(), "dst_tensor");
+ args_.AddObjectRef("dst_tensor", AccessType::WRITE, std::move(dst_desc));
+
+ elementwise_code_ = "{\n" + code_ + "\n}\n" + elementwise_code_;
+ code_ = GetElementWiseCode(definition_, check_src_channels_size_);
+ RETURN_IF_ERROR(args_.AllocateObjects(context));
+ RETURN_IF_ERROR(
+ args_.TransformToCLCode(device_info, {{dst_tensors_names_[0], elementwise_code_}}, &code_));
+ }
+ else
+ {
+ RETURN_IF_ERROR(args_.AllocateObjects(context));
+ RETURN_IF_ERROR(
+ args_.TransformToCLCode(device_info, {{dst_tensors_names_[0], elementwise_code_}}, &code_));
+ }
+ return absl::OkStatus();
+}
+
+absl::Status GPUOperation::Compile(const CreationContext &creation_context)
+{
+ RETURN_IF_ERROR(AssembleCode(creation_context.GetDeviceInfo(), creation_context.context));
+ RETURN_IF_ERROR(creation_context.cache->GetOrCreateCLKernel(
+ code_, "main_function", compiler_options_, *creation_context.context, *creation_context.device,
+ &kernel_));
+ return PostCompileCheck(creation_context.device->info_, kernel_.info_);
+}
+
+absl::Status GPUOperation::CompileDeserialized(const CreationContext &creation_context)
+{
+ return creation_context.cache->GetOrCreateCLKernel(code_, "main_function", compiler_options_,
+ *creation_context.context,
+ *creation_context.device, &kernel_);
+}
+
+void GPUOperation::GetPossibleKernelWorkGroups(TuningType tuning_type,
+ const DeviceInfo &device_info,
+ const KernelInfo &kernel_info,
+ std::vector<int3> *work_groups) const
+{
+ GetPossibleWorkGroups(tuning_type, device_info, kernel_info, grid_size_, work_groups);
+}
+
+absl::Status GPUOperation::Tune(const TuningParameters &params)
+{
+ std::vector<int3> possible_work_groups;
+ GetPossibleKernelWorkGroups(params.tuning_type, *params.info, kernel_.info_,
+ &possible_work_groups);
+ if (possible_work_groups.empty())
+ {
+    return absl::NotFoundError("Can not find work_group size to launch kernel");
+ }
+ if (possible_work_groups.size() == 1)
+ {
+ work_group_size_ = possible_work_groups[0];
+ work_groups_count_ =
+ GetWorkGroupsCount(grid_dimension_, grid_size_, work_group_size_, work_group_launch_order_);
+ return absl::OkStatus();
+ }
+ else
+ {
+ std::vector<int3> work_groups_count(possible_work_groups.size());
+ for (size_t i = 0; i < work_groups_count.size(); ++i)
+ {
+ work_groups_count[i] = GetWorkGroupsCount(grid_dimension_, grid_size_,
+ possible_work_groups[i], work_group_launch_order_);
+ }
+ RETURN_IF_ERROR(args_.Bind(kernel_.kernel()));
+ int best_work_group_index;
+ RETURN_IF_ERROR(params.queue->GetBestWorkGroupIndex(
+ kernel_, *params.info, work_groups_count, possible_work_groups, &best_work_group_index));
+ work_group_size_ = possible_work_groups[best_work_group_index];
+ work_groups_count_ =
+ GetWorkGroupsCount(grid_dimension_, grid_size_, work_group_size_, work_group_launch_order_);
+ return absl::OkStatus();
+ }
+}
+
+int3 GPUOperation::GetGridSize() const
+{
+ if (elementwise_ || tensor_to_grid_ == TensorToGrid::kWBToX_HDToY_SToZ)
+ {
+ const int grid_x = dst_[0]->Width() * dst_[0]->Batch();
+ const int grid_y = dst_[0]->Height() * dst_[0]->Depth();
+ const int grid_z = dst_[0]->Slices();
+ return int3(grid_x, grid_y, grid_z);
+ }
+ if (tensor_to_grid_ == TensorToGrid::kWBToX_HDToY_ZIs1)
+ {
+ const int grid_x = dst_[0]->Width() * dst_[0]->Batch();
+ const int grid_y = dst_[0]->Height() * dst_[0]->Depth();
+ const int grid_z = 1;
+ return int3(grid_x, grid_y, grid_z);
+ }
+ if (tensor_to_grid_ == TensorToGrid::kWBToX_HToY_DToZ)
+ {
+ const int grid_x = dst_[0]->Width() * dst_[0]->Batch();
+ const int grid_y = dst_[0]->Height();
+ const int grid_z = dst_[0]->Depth();
+ return int3(grid_x, grid_y, grid_z);
+ }
+ if (tensor_to_grid_ == TensorToGrid::kBToX_YIs1_ZIs1)
+ {
+ const int grid_x = dst_[0]->Batch();
+ const int grid_y = 1;
+ const int grid_z = 1;
+ return int3(grid_x, grid_y, grid_z);
+ }
+ return grid_size_;
+}
+
+void GPUOperation::AddUniquePostfix(const std::string &unique_postfix)
+{
+ for (uint32_t i = 0; i < src_tensors_names_.size(); ++i)
+ {
+ src_tensors_names_[i] += unique_postfix;
+ }
+ for (uint32_t i = 0; i < dst_tensors_names_.size(); ++i)
+ {
+ dst_tensors_names_[i] += unique_postfix;
+ }
+}
+
+} // namespace gpu_cl
+} // namespace backend
+} // namespace onert
diff --git a/runtime/onert/backend/gpu_cl/open_cl/kernels/GpuOperation.h b/runtime/onert/backend/gpu_cl/open_cl/kernels/GpuOperation.h
new file mode 100644
index 000000000..4f531c629
--- /dev/null
+++ b/runtime/onert/backend/gpu_cl/open_cl/kernels/GpuOperation.h
@@ -0,0 +1,203 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __ONERT_BACKEND_GPU_CL_OPENCL_KERNELS_GPU_OPERATION_H__
+#define __ONERT_BACKEND_GPU_CL_OPENCL_KERNELS_GPU_OPERATION_H__
+
+#include <string>
+#include <vector>
+
+#include "TuningParameters.h"
+
+#include "open_cl/Arguments.h"
+#include "open_cl/Buffer.h"
+#include "open_cl/ClCommandQueue.h"
+#include "open_cl/ClContext.h"
+#include "open_cl/ClDevice.h"
+#include "open_cl/ClKernel.h"
+#include "open_cl/ClProgram.h"
+#include "open_cl/DataType.h"
+#include "open_cl/DeviceInfo.h"
+#include "open_cl/Precision.h"
+#include "open_cl/ProgramCache.h"
+#include "open_cl/Tensor.h"
+#include "open_cl/TensorType.h"
+#include "open_cl/Types.h"
+#include "open_cl/Status.h"
+
+namespace onert
+{
+namespace backend
+{
+namespace gpu_cl
+{
+
+// kCustom: default value
+//   GPUOperation::GetGridSize must be overridden
+// kWBToX_HDToY_SToZ:
+// grid_x = dst_[0]->Width() * dst_[0]->Batch();
+// grid_y = dst_[0]->Height() * dst_[0]->Depth();
+// grid_z = dst_[0]->Slices();
+// kWBToX_HDToY_ZIs1:
+// grid_x = dst_[0]->Width() * dst_[0]->Batch();
+// grid_y = dst_[0]->Height() * dst_[0]->Depth();
+// grid_z = 1;
+// kWBToX_HToY_DToZ:
+// grid_x = dst_[0]->Width() * dst_[0]->Batch();
+// grid_y = dst_[0]->Height();
+// grid_z = dst_[0]->Depth();
+// kBToX_YIs1_ZIs1:
+// grid_x = dst_[0]->Batch();
+// grid_y = 1;
+// grid_z = 1;
+enum class TensorToGrid
+{
+ kCustom,
+ kWBToX_HDToY_SToZ,
+ kWBToX_HDToY_ZIs1,
+ kWBToX_HToY_DToZ,
+ kBToX_YIs1_ZIs1
+};
+
+struct CreationContext
+{
+ const CLDevice *device;
+ CLContext *context;
+ CLCommandQueue *queue;
+ ProgramCache *cache;
+
+ const DeviceInfo &GetDeviceInfo() const { return device->info_; }
+};
+
+struct OperationDef
+{
+ CalculationsPrecision precision;
+ std::vector<TensorDescriptor> src_tensors;
+ std::vector<TensorDescriptor> dst_tensors;
+
+ // returns FLOAT32 for F32 precision and FLOAT16 for F16 precision
+ DataType GetDataType() const;
+  // "Primary" refers to the first src tensor, since the first tensor usually
+  // defines the kernel structure and the types of all other resources (biases, etc.).
+ DataType GetPrimaryDataType() const;
+ TensorStorageType GetPrimaryStorageType() const;
+ bool IsBatchSupported() const;
+};
+
+// GPUOperation represents a GPU implementation of a neural network operation.
+// A GPUOperation can contain other GPU operations when they are elementwise
+// (flag elementwise_); the containing GPUOperation then replaces the whole
+// sequence Op + op0 + op1 + ...
+// Because of this, the usage scenario is:
+//   Create an instance of the root GPUOperation.
+//   Create all GPUOperations that will (probably) be attached to it.
+//   Attach them with AddOperation().
+//   Call Compile() on the root only; calling Compile() on an attached
+//   operation is useless and may be an error.
+// A minimal usage sketch follows the class definition below.
+class GPUOperation
+{
+public:
+ GPUOperation() = default;
+ explicit GPUOperation(const OperationDef &definition);
+ virtual ~GPUOperation() = default;
+ // Move only
+ GPUOperation(GPUOperation &&operation);
+ GPUOperation &operator=(GPUOperation &&operation);
+ GPUOperation(const GPUOperation &) = delete;
+ GPUOperation &operator=(const GPUOperation &) = delete;
+
+ absl::Status AddOperation(GPUOperation *operation);
+
+ void SetSrc(Tensor *ptr, int index = 0);
+ void SetDst(Tensor *ptr, int index = 0);
+
+ // should be called after changes of inputs/outputs.
+ absl::Status UpdateParams();
+
+ absl::Status AddToQueue(CLCommandQueue *queue)
+ {
+ RETURN_IF_ERROR(args_.Bind(kernel_.kernel()));
+ return queue->Dispatch(kernel_, work_groups_count_, work_group_size_);
+ }
+
+ virtual void GetPossibleKernelWorkGroups(TuningType tuning_type, const DeviceInfo &device_info,
+ const KernelInfo &kernel_info,
+ std::vector<int3> *work_groups) const;
+
+ absl::Status Tune(const TuningParameters &params);
+
+ absl::Status AssembleCode(const DeviceInfo &device_info, CLContext *context);
+
+ absl::Status Compile(const CreationContext &creation_context);
+
+ absl::Status CompileDeserialized(const CreationContext &creation_context);
+
+ virtual absl::Status PostCompileCheck(const DeviceInfo &, const KernelInfo &)
+ {
+ return absl::OkStatus();
+ }
+
+ const OperationDef &GetDefinition() const { return definition_; }
+
+ void AddSrcTensor(const std::string &tensor_name, const TensorDescriptor &desc);
+ void AddSrcBuffer(const std::string &buffer_name, const BufferDescriptor &desc);
+ void AddDstTensor(const std::string &tensor_name, const TensorDescriptor &desc);
+
+ bool IsLinkable() const { return elementwise_ && linkable_; }
+
+ // for linking
+ void AddUniquePostfix(const std::string &unique_postfix);
+
+ Arguments args_;
+ std::string code_;
+ int3 work_group_size_ = int3(8, 4, 1);
+ std::vector<CompilerOptions> compiler_options_;
+ // not applicable to elementwise
+ TensorToGrid tensor_to_grid_ = TensorToGrid::kCustom;
+
+ bool elementwise_ = false;
+ // applicable only with elementwise_ = true;
+ bool linkable_ = true; // by default every elementwise is linkable
+ // applicable only with elementwise_ = true;
+ bool check_src_channels_size_ = false;
+
+protected:
+ virtual absl::Status BindArguments(ArgumentsBinder *) { return absl::OkStatus(); }
+ virtual int3 GetGridSize() const;
+
+ // Defines operation calculation precision and format of src/dst tensors.
+ OperationDef definition_;
+ std::vector<Tensor *> src_;
+ std::vector<Tensor *> dst_;
+ CLKernel kernel_;
+ int grid_dimension_ = 3; // can be 1, 2 or 3
+ int3 work_group_launch_order_ = int3(0, 1, 2);
+ int3 grid_size_ = int3(0, 0, 0);
+ std::vector<std::string> src_tensors_names_;
+ std::vector<std::string> dst_tensors_names_;
+
+private:
+ int3 work_groups_count_ = int3(0, 0, 0);
+ int linkable_count_ = 0;
+ std::string elementwise_code_; // temporary, used during op construction
+};
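+
+// A minimal usage sketch of the scenario described above. The names
+// `creation_context`, `def`, `relu_attr`, `src` and `dst` are assumptions for
+// illustration only, and error handling is reduced to RETURN_IF_ERROR:
+//
+//   GPUOperation root = CreateReshape(def);          // root operation
+//   GPUOperation relu = CreateReLU(def, relu_attr);  // elementwise, linkable
+//   RETURN_IF_ERROR(root.AddOperation(&relu));       // attach; do not compile `relu`
+//   RETURN_IF_ERROR(root.Compile(creation_context)); // compile the fused kernel
+//   root.SetSrc(src);
+//   root.SetDst(dst);
+//   RETURN_IF_ERROR(root.UpdateParams());
+//   RETURN_IF_ERROR(root.AddToQueue(creation_context.queue));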
+
+} // namespace gpu_cl
+} // namespace backend
+} // namespace onert
+
+#endif // __ONERT_BACKEND_GPU_CL_OPENCL_KERNELS_GPU_OPERATION_H__
diff --git a/runtime/onert/backend/gpu_cl/open_cl/kernels/Pooling.cc b/runtime/onert/backend/gpu_cl/open_cl/kernels/Pooling.cc
new file mode 100644
index 000000000..ceeab2f39
--- /dev/null
+++ b/runtime/onert/backend/gpu_cl/open_cl/kernels/Pooling.cc
@@ -0,0 +1,400 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Pooling.h"
+
+#include <string>
+
+#include "Util.h"
+#include "open_cl/Status.h"
+
+namespace onert
+{
+namespace backend
+{
+namespace gpu_cl
+{
+namespace
+{
+
+std::string GetAveragePoolingKernelCode(const OperationDef &op_def, bool stride_correction,
+ GPUOperation *op)
+{
+ auto src_desc = op_def.src_tensors[0];
+
+ src_desc.SetTextureAddressMode(TextureAddressMode::ZERO);
+
+ if (op_def.IsBatchSupported())
+ {
+ src_desc.SetStateVar("BatchedWidth", "true");
+ }
+ op->AddSrcTensor("src_tensor", src_desc);
+ auto dst_desc = op_def.dst_tensors[0];
+ if (op_def.IsBatchSupported())
+ {
+ dst_desc.SetStateVar("BatchedWidth", "true");
+ }
+ op->AddDstTensor("dst_tensor", dst_desc);
+
+ std::map<Axis, std::string> axis_to_src_coord = {
+ {Axis::WIDTH, "x_c"}, {Axis::HEIGHT, "y_c"}, {Axis::DEPTH, "d_c"},
+ {Axis::CHANNELS, "Z"}, {Axis::BATCH, "B"},
+ };
+
+ std::map<Axis, std::string> axis_to_dst_coord = {
+ {Axis::WIDTH, "X"}, {Axis::HEIGHT, "Y"}, {Axis::DEPTH, "D"},
+ {Axis::CHANNELS, "Z"}, {Axis::BATCH, "B"},
+ };
+
+ std::vector<std::string> src_coords;
+ std::vector<std::string> dst_coords;
+ for (auto axis : {Axis::WIDTH, Axis::HEIGHT, Axis::DEPTH, Axis::CHANNELS})
+ {
+ if (op_def.dst_tensors[0].HasAxis(axis))
+ {
+ dst_coords.push_back(axis_to_dst_coord[axis]);
+ }
+ if (op_def.src_tensors[0].HasAxis(axis))
+ {
+ src_coords.push_back(axis_to_src_coord[axis]);
+ }
+ }
+ std::string src_coord = src_coords[0];
+ for (size_t i = 1; i < src_coords.size(); ++i)
+ {
+ src_coord += ", " + src_coords[i];
+ }
+ std::string dst_coord = dst_coords[0];
+ for (size_t i = 1; i < dst_coords.size(); ++i)
+ {
+ dst_coord += ", " + dst_coords[i];
+ }
+
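+  // Buffer-backed storages (BUFFER / IMAGE_BUFFER) cannot rely on the ZERO
+  // texture address mode set above, so out-of-window reads are masked
+  // manually in the generated code below.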
+ const bool manual_clamp = op_def.src_tensors[0].storage_type == TensorStorageType::BUFFER ||
+ op_def.src_tensors[0].storage_type == TensorStorageType::IMAGE_BUFFER;
+
+ std::string c = GetCommonDefines(op_def.precision);
+ c += "__kernel void main_function(\n";
+ c += "$0) {\n";
+ c += " int X = get_global_id(0);\n";
+ if (op_def.dst_tensors[0].HasAxis(Axis::DEPTH))
+ {
+ c += " int linear_id_1 = get_global_id(1);\n";
+ c += " int Y = linear_id_1 / args.dst_tensor.Depth();\n";
+ c += " int D = linear_id_1 % args.dst_tensor.Depth();\n";
+ }
+ else
+ {
+ c += " int Y = get_global_id(1);\n";
+ }
+ c += " int Z = get_global_id(2);\n";
+ c += " if (X >= args.dst_tensor.Width() || Y >= args.dst_tensor.Height() || "
+ "Z >= args.dst_tensor.Slices()) { \n";
+ c += " return; \n";
+ c += " } \n";
+ c += " float4 r = (float4)(0.0f);\n";
+ c += " float window_size = 0.0;\n";
+ if (stride_correction)
+ {
+ c += " int xs = " +
+ GetXStrideCorrectedV2("X", "args.src_tensor.Batch()", "args.stride_x", "args.padding_x") +
+ ";\n";
+ }
+ else
+ {
+ if (op_def.IsBatchSupported())
+ {
+ c += " int xs = X * args.stride_x + args.padding_x * "
+ "args.src_tensor.Batch();\n";
+ }
+ else
+ {
+ c += " int xs = X * args.stride_x + args.padding_x;\n";
+ }
+ }
+ c += " int ys = Y * args.stride_y + args.padding_y;\n";
+ if (op_def.dst_tensors[0].HasAxis(Axis::DEPTH))
+ {
+ c += " int ds = D * args.stride_z + args.padding_z;\n";
+ c += " for (int kz = 0; kz < args.kernel_size_z; ++kz) {\n";
+ c += " int d_c = ds + kz;\n";
+ c += " if (d_c < 0 || d_c >= args.src_tensor.Depth()) continue;\n";
+ }
+ c += " for (int ky = 0; ky < args.kernel_size_y; ++ky) {\n";
+ c += " int y_c = ys + ky;\n";
+ c += " bool outside_y = y_c < 0 || y_c >= args.src_tensor.Height();\n";
+ c += " for (int kx = 0; kx < args.kernel_size_x; ++kx) {\n";
+ if (op_def.IsBatchSupported())
+ {
+ c += " int x_c = xs + kx * args.src_tensor.Batch();\n";
+ }
+ else
+ {
+ c += " int x_c = xs + kx;\n";
+ }
+ c += " bool outside = outside_y || x_c < 0 || x_c >= "
+ "args.src_tensor.Width();\n";
+ if (manual_clamp)
+ {
+ c += " r += !outside ? args.src_tensor.Read<float>(" + src_coord +
+ ") : "
+ "(float4)(0.0f);\n";
+ }
+ else
+ {
+ c += " r += args.src_tensor.Read<float>(" + src_coord + ");\n";
+ }
+ c += " window_size += !outside ? 1.0 : 0.0;\n";
+ c += " }\n";
+ c += " }\n";
+ if (op_def.dst_tensors[0].HasAxis(Axis::DEPTH))
+ {
+ c += " } // Depth\n";
+ }
+  // If window_size == 0, the window covered nothing. This is a sign of an
+  // incorrectly constructed operation; NaNs are expected as output.
+ c += " FLT4 result = TO_FLT4(r / window_size);\n";
+ c += " args.dst_tensor.Write(result, " + dst_coord + ");\n";
+ c += "}\n";
+
+ return c;
+}
+
+std::string GetMaxPoolingKernelCode(const OperationDef &op_def, bool stride_correction,
+ bool output_indices, GPUOperation *op)
+{
+ auto src_desc = op_def.src_tensors[0];
+ if (op_def.IsBatchSupported())
+ {
+ src_desc.SetStateVar("BatchedWidth", "true");
+ }
+ op->AddSrcTensor("src_tensor", src_desc);
+ auto dst_desc = op_def.dst_tensors[0];
+ if (op_def.IsBatchSupported())
+ {
+ dst_desc.SetStateVar("BatchedWidth", "true");
+ }
+ op->AddDstTensor("dst_tensor", dst_desc);
+ if (output_indices)
+ {
+ auto dst_ind_desc = op_def.dst_tensors[1];
+ if (op_def.IsBatchSupported())
+ {
+ dst_ind_desc.SetStateVar("BatchedWidth", "true");
+ }
+ op->AddDstTensor("dst_indices", dst_ind_desc);
+ }
+
+ std::map<Axis, std::string> axis_to_src_coord = {
+ {Axis::WIDTH, "x_c"}, {Axis::HEIGHT, "y_c"}, {Axis::DEPTH, "d_c"},
+ {Axis::CHANNELS, "Z"}, {Axis::BATCH, "B"},
+ };
+
+ std::map<Axis, std::string> axis_to_dst_coord = {
+ {Axis::WIDTH, "X"}, {Axis::HEIGHT, "Y"}, {Axis::DEPTH, "D"},
+ {Axis::CHANNELS, "Z"}, {Axis::BATCH, "B"},
+ };
+
+ std::vector<std::string> src_coords;
+ std::vector<std::string> dst_coords;
+ for (auto axis : {Axis::WIDTH, Axis::HEIGHT, Axis::DEPTH, Axis::CHANNELS})
+ {
+ if (op_def.dst_tensors[0].HasAxis(axis))
+ {
+ dst_coords.push_back(axis_to_dst_coord[axis]);
+ }
+ if (op_def.src_tensors[0].HasAxis(axis))
+ {
+ src_coords.push_back(axis_to_src_coord[axis]);
+ }
+ }
+ std::string src_coord = src_coords[0];
+ for (size_t i = 1; i < src_coords.size(); ++i)
+ {
+ src_coord += ", " + src_coords[i];
+ }
+ std::string dst_coord = dst_coords[0];
+ for (size_t i = 1; i < dst_coords.size(); ++i)
+ {
+ dst_coord += ", " + dst_coords[i];
+ }
+
+ std::string c = GetCommonDefines(op_def.precision);
+ c += "__kernel void main_function(\n";
+ c += "$0) {\n";
+ c += " int X = get_global_id(0);\n";
+ if (op_def.dst_tensors[0].HasAxis(Axis::DEPTH))
+ {
+ c += " int linear_id_1 = get_global_id(1);\n";
+ c += " int Y = linear_id_1 / args.dst_tensor.Depth();\n";
+ c += " int D = linear_id_1 % args.dst_tensor.Depth();\n";
+ }
+ else
+ {
+ c += " int Y = get_global_id(1);\n";
+ }
+ c += " int Z = get_global_id(2);\n";
+ c += " if (X >= args.dst_tensor.Width() || Y >= args.dst_tensor.Height() || "
+ "Z >= args.dst_tensor.Slices()) { \n";
+ c += " return; \n";
+ c += " } \n";
+ c += " FLT4 maximum = (FLT4)(-10000.0f);\n";
+ if (output_indices)
+ {
+ c += " FLT4 indexes = (FLT4)(0.0f);\n";
+ }
+ if (stride_correction)
+ {
+ c += " int xs = " +
+ GetXStrideCorrectedV2("X", "args.src_tensor.Batch()", "args.stride_x", "args.padding_x") +
+ ";\n";
+ }
+ else
+ {
+ if (op_def.IsBatchSupported())
+ {
+ c += " int xs = X * args.stride_x + args.padding_x * "
+ "args.src_tensor.Batch();\n";
+ }
+ else
+ {
+ c += " int xs = X * args.stride_x + args.padding_x;\n";
+ }
+ }
+ c += " int ys = Y * args.stride_y + args.padding_y;\n";
+ c += " for (int ky = 0; ky < args.kernel_size_y; ++ky) {\n";
+ c += " int y_c = ys + ky;\n";
+ c += " if (y_c < 0 || y_c >= args.src_tensor.Height()) continue;\n";
+ c += " for (int kx = 0; kx < args.kernel_size_x; ++kx) {\n";
+ if (op_def.IsBatchSupported())
+ {
+ c += " int x_c = xs + kx * args.src_tensor.Batch();\n";
+ }
+ else
+ {
+ c += " int x_c = xs + kx;\n";
+ }
+ c += " if (x_c < 0 || x_c >= args.src_tensor.Width()) continue;\n";
+ if (op_def.dst_tensors[0].HasAxis(Axis::DEPTH))
+ {
+ c += " int ds = D * args.stride_z + args.padding_z;\n";
+ c += " for (int kz = 0; kz < args.kernel_size_z; ++kz) {\n";
+ c += " int d_c = ds + kz;\n";
+ c += " if (d_c < 0 || d_c >= args.src_tensor.Depth()) continue;\n";
+ }
+ c += " FLT4 src = args.src_tensor.Read(" + src_coord + ");\n";
+ if (output_indices)
+ {
+ if (op_def.dst_tensors[0].HasAxis(Axis::DEPTH))
+ {
+ c += " FLT index_counter = (FLT)((ky * args.kernel_size_x + kx) * "
+ "args.kernel_size_z + kz) + (FLT)(0.1f);\n";
+ }
+ else
+ {
+ c += " FLT index_counter = (FLT)(ky * args.kernel_size_x + kx) + "
+ "(FLT)(0.1f);\n";
+ }
+ c += " if (src.x > maximum.x) {\n";
+ c += " indexes.x = index_counter;\n";
+ c += " maximum.x = src.x;\n";
+ c += " }\n";
+ c += " if (src.y > maximum.y) {\n";
+ c += " indexes.y = index_counter;\n";
+ c += " maximum.y = src.y;\n";
+ c += " }\n";
+ c += " if (src.z > maximum.z) {\n";
+ c += " indexes.z = index_counter;\n";
+ c += " maximum.z = src.z;\n";
+ c += " }\n";
+ c += " if (src.w > maximum.w) {\n";
+ c += " indexes.w = index_counter;\n";
+ c += " maximum.w = src.w;\n";
+ c += " }\n";
+ }
+ else
+ {
+ c += " maximum = max(src, maximum);\n";
+ }
+ if (op_def.dst_tensors[0].HasAxis(Axis::DEPTH))
+ {
+ c += " } // Depth\n";
+ }
+ c += " }\n";
+ c += " }\n";
+ c += " args.dst_tensor.Write(maximum, " + dst_coord + ");\n";
+ if (output_indices)
+ {
+ c += " args.dst_indices.Write(indexes, " + dst_coord + ");\n";
+ }
+ c += "}\n";
+
+ return c;
+}
+} // namespace
+
+GPUOperation CreatePooling(const OperationDef &definition, const Pooling2DAttributes &attr)
+{
+ GPUOperation op(definition);
+ op.args_.AddInt("kernel_size_x", attr.kernel.w);
+ op.args_.AddInt("padding_x", -attr.padding.prepended.w);
+ op.args_.AddInt("stride_x", attr.strides.w);
+ op.args_.AddInt("kernel_size_y", attr.kernel.h);
+ op.args_.AddInt("padding_y", -attr.padding.prepended.h);
+ op.args_.AddInt("stride_y", attr.strides.h);
+
+ const bool stride_correction = definition.IsBatchSupported() && attr.strides.w != 1;
+ if (attr.type == PoolingType::AVERAGE)
+ {
+ op.code_ = GetAveragePoolingKernelCode(definition, stride_correction, &op);
+ }
+ else if (attr.type == PoolingType::MAX)
+ {
+ op.code_ = GetMaxPoolingKernelCode(definition, stride_correction, attr.output_indices, &op);
+ }
+ op.tensor_to_grid_ = TensorToGrid::kWBToX_HDToY_SToZ;
+ return op;
+}
+
+GPUOperation CreatePooling(const OperationDef &definition, const Pooling3DAttributes &attr)
+{
+ GPUOperation op(definition);
+ op.args_.AddInt("kernel_size_x", attr.kernel.w);
+ op.args_.AddInt("padding_x", -attr.padding.prepended.w);
+ op.args_.AddInt("stride_x", attr.strides.w);
+ op.args_.AddInt("kernel_size_y", attr.kernel.h);
+ op.args_.AddInt("padding_y", -attr.padding.prepended.h);
+ op.args_.AddInt("stride_y", attr.strides.h);
+ op.args_.AddInt("kernel_size_z", attr.kernel.d);
+ op.args_.AddInt("padding_z", -attr.padding.prepended.d);
+ op.args_.AddInt("stride_z", attr.strides.d);
+ const bool stride_correction = definition.IsBatchSupported() && attr.strides.w != 1;
+ if (attr.type == PoolingType::AVERAGE)
+ {
+ op.code_ = GetAveragePoolingKernelCode(definition, stride_correction, &op);
+ }
+ else if (attr.type == PoolingType::MAX)
+ {
+ op.code_ = GetMaxPoolingKernelCode(definition, stride_correction, attr.output_indices, &op);
+ }
+ op.tensor_to_grid_ = TensorToGrid::kWBToX_HDToY_SToZ;
+ return op;
+}
+
+} // namespace gpu_cl
+} // namespace backend
+} // namespace onert
diff --git a/runtime/onert/backend/gpu_cl/open_cl/kernels/Pooling.h b/runtime/onert/backend/gpu_cl/open_cl/kernels/Pooling.h
new file mode 100644
index 000000000..166d81591
--- /dev/null
+++ b/runtime/onert/backend/gpu_cl/open_cl/kernels/Pooling.h
@@ -0,0 +1,43 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __ONERT_BACKEND_GPU_CL_OPENCL_KERNELS_POOLING_H__
+#define __ONERT_BACKEND_GPU_CL_OPENCL_KERNELS_POOLING_H__
+
+#include "GpuOperation.h"
+
+#include "open_cl/Operations.h"
+#include "open_cl/Precision.h"
+#include "open_cl/ClKernel.h"
+#include "open_cl/Tensor.h"
+
+namespace onert
+{
+namespace backend
+{
+namespace gpu_cl
+{
+
+GPUOperation CreatePooling(const OperationDef &definition, const Pooling2DAttributes &attr);
+
+GPUOperation CreatePooling(const OperationDef &definition, const Pooling3DAttributes &attr);
+
+} // namespace gpu_cl
+} // namespace backend
+} // namespace onert
+
+#endif // __ONERT_BACKEND_GPU_CL_OPENCL_KERNELS_POOLING_H__
diff --git a/runtime/onert/backend/gpu_cl/open_cl/kernels/Relu.cc b/runtime/onert/backend/gpu_cl/open_cl/kernels/Relu.cc
new file mode 100644
index 000000000..37f87e599
--- /dev/null
+++ b/runtime/onert/backend/gpu_cl/open_cl/kernels/Relu.cc
@@ -0,0 +1,80 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Relu.h"
+
+#include <string>
+#include "Util.h"
+#include "GpuOperation.h"
+#include "absl/strings/str_cat.h"
+#include "open_cl/Precision.h"
+
+namespace onert
+{
+namespace backend
+{
+namespace gpu_cl
+{
+
+GPUOperation CreateReLU(const OperationDef &definition, const ReLUAttributes &attr)
+{
+ GPUOperation op(definition);
+ op.elementwise_ = true;
+
+ std::string min_func;
+ if (attr.alpha != 0.0f)
+ {
+ min_func = "min(in_out_value * args.alpha, (FLT)(0.0f))";
+ if (definition.precision == CalculationsPrecision::F32)
+ {
+ op.args_.AddFloat("alpha", attr.alpha);
+ }
+ else
+ {
+#ifdef FIXME_PORTING_HALF_REQIRED
+ op.args_.AddHalf("alpha", half(attr.alpha));
+#endif
+ }
+ }
+ else
+ {
+ min_func = "(FLT)(0.0f)";
+ }
+ if (attr.clip != 0.0f)
+ {
+ if (definition.precision == CalculationsPrecision::F32)
+ {
+ op.args_.AddFloat("clip", attr.clip);
+ }
+ else
+ {
+#ifdef FIXME_PORTING_HALF_REQIRED
+ op.args_.AddHalf("clip", half(attr.clip));
+#endif
+ }
+    op.code_ = absl::StrCat("in_out_value = clamp(in_out_value, ", min_func, ", args.clip);");
+ }
+ else
+ {
+ op.code_ = absl::StrCat("in_out_value = max(in_out_value, ", min_func, ");");
+ }
+ return op;
+}
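+
+// Example (derived from the branches above): with attr.alpha = 0.1f and
+// attr.clip = 6.0f the generated snippet is
+//   in_out_value = clamp(in_out_value, min(in_out_value * args.alpha, (FLT)(0.0f)), args.clip);
+// while attr.alpha == 0.0f and attr.clip == 0.0f reduce it to
+//   in_out_value = max(in_out_value, (FLT)(0.0f));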
+
+} // namespace gpu_cl
+} // namespace backend
+} // namespace onert
diff --git a/runtime/onert/backend/gpu_cl/open_cl/kernels/Relu.h b/runtime/onert/backend/gpu_cl/open_cl/kernels/Relu.h
new file mode 100644
index 000000000..eb6b1ad1d
--- /dev/null
+++ b/runtime/onert/backend/gpu_cl/open_cl/kernels/Relu.h
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __ONERT_BACKEND_GPU_CL_OPEN_CL_KERNELS_RELU_H__
+#define __ONERT_BACKEND_GPU_CL_OPEN_CL_KERNELS_RELU_H__
+
+#include "open_cl/ClKernel.h"
+#include "GpuOperation.h"
+#include "open_cl/Precision.h"
+#include "open_cl/Tensor.h"
+#include "open_cl/Types.h"
+#include "open_cl/Operations.h"
+namespace onert
+{
+namespace backend
+{
+namespace gpu_cl
+{
+
+GPUOperation CreateReLU(const OperationDef &definition, const ReLUAttributes &attr);
+
+} // namespace gpu_cl
+} // namespace backend
+} // namespace onert
+
+#endif // __ONERT_BACKEND_GPU_CL_OPEN_CL_KERNELS_RELU_H__
diff --git a/runtime/onert/backend/gpu_cl/open_cl/kernels/Reshape.cc b/runtime/onert/backend/gpu_cl/open_cl/kernels/Reshape.cc
new file mode 100644
index 000000000..cdd3e8364
--- /dev/null
+++ b/runtime/onert/backend/gpu_cl/open_cl/kernels/Reshape.cc
@@ -0,0 +1,111 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Reshape.h"
+
+#include <string>
+
+#include "Util.h"
+#include "open_cl/Status.h"
+
+namespace onert
+{
+namespace backend
+{
+namespace gpu_cl
+{
+namespace
+{
+std::string GetReshapeCode(const OperationDef &op_def)
+{
+ std::string c = GetCommonDefines(op_def.precision);
+ c += "__kernel void main_function(\n";
+ c += "$0) {\n";
+ if (op_def.dst_tensors[0].HasAxis(Axis::BATCH))
+ {
+ c += " int linear_id = get_global_id(0);\n";
+ c += " int X = linear_id / args.dst_tensor.Batch();\n";
+ c += " int B = linear_id % args.dst_tensor.Batch();\n";
+ c += " args.dst_tensor.SetBatchRef(B);\n";
+ }
+ else
+ {
+ c += " int X = get_global_id(0);\n";
+ }
+ c += " int Y = get_global_id(1);\n";
+ c += " int Z = get_global_id(2);\n";
+ c += " if (X >= args.dst_tensor.Width() || Y >= args.dst_tensor.Height() || "
+ "Z >= args.dst_tensor.Slices()) { \n";
+ c += " return; \n";
+ c += " } \n";
+ c += " FLT temps[4];\n";
+ c += " temps[0] = (FLT)(0.0f);\n";
+ c += " temps[1] = (FLT)(0.0f);\n";
+ c += " temps[2] = (FLT)(0.0f);\n";
+ c += " temps[3] = (FLT)(0.0f);\n";
+ if (op_def.dst_tensors[0].HasAxis(Axis::BATCH))
+ {
+ c += " int base = B;\n";
+ }
+ else
+ {
+ c += " int base = 0;\n";
+ }
+ c += " base = ((base * args.dst_tensor.Height() + Y) * "
+ "args.dst_tensor.Width() + X) * args.dst_tensor.Channels() + Z * 4;\n";
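+  // `base` is the linear BHWC index of the first of the four destination
+  // channels handled by this work item; the loop below maps each one back to
+  // (src_b, src_y, src_x, src_c) by successive div/mod over the source shape.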
+ c += " for (int i = 0; i < 4; ++i) {\n";
+ c += " int dst_channel = Z * 4 + i;\n";
+ c += " if (dst_channel < args.dst_tensor.Channels()) {;\n";
+ c += " int p = base + i;\n";
+ c += " int src_c = p % args.src_tensor.Channels();\n";
+ c += " p = p / args.src_tensor.Channels();\n";
+ c += " int src_x = p % args.src_tensor.Width();\n";
+ c += " p = p / args.src_tensor.Width();\n";
+ c += " int src_y = p % args.src_tensor.Height();\n";
+ if (op_def.src_tensors[0].HasAxis(Axis::BATCH))
+ {
+ c += " int src_b = p / args.src_tensor.Height();\n";
+ c += " args.src_tensor.SetBatchRef(src_b);\n";
+ }
+ c += " int src_z = src_c / 4;\n";
+ c += " int src_sub_ch = src_c % 4;\n";
+ c += " FLT4 t = args.src_tensor.Read(src_x, src_y, src_z);\n";
+ c += " FLT t_ar[4] = {t.x, t.y, t.z, t.w};\n";
+ c += " temps[i] = t_ar[src_sub_ch];\n";
+ c += " }\n";
+ c += " }\n";
+ c += " FLT4 result = (FLT4)(temps[0], temps[1], temps[2], temps[3]);\n";
+ c += " args.dst_tensor.Write(result, X, Y, Z);\n";
+ c += "}\n";
+ return c;
+}
+
+} // namespace
+
+GPUOperation CreateReshape(const OperationDef &definition)
+{
+ GPUOperation op(definition);
+ op.AddSrcTensor("src_tensor", definition.src_tensors[0]);
+ op.AddDstTensor("dst_tensor", definition.dst_tensors[0]);
+ op.code_ = GetReshapeCode(definition);
+ op.tensor_to_grid_ = TensorToGrid::kWBToX_HDToY_SToZ;
+ return op;
+}
+
+} // namespace gpu_cl
+} // namespace backend
+} // namespace onert
diff --git a/runtime/onert/backend/gpu_cl/open_cl/kernels/Reshape.h b/runtime/onert/backend/gpu_cl/open_cl/kernels/Reshape.h
new file mode 100644
index 000000000..4f7c5ea38
--- /dev/null
+++ b/runtime/onert/backend/gpu_cl/open_cl/kernels/Reshape.h
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __ONERT_BACKEND_GPU_CL_OPENCL_KERNELS_RESHAPE_H__
+#define __ONERT_BACKEND_GPU_CL_OPENCL_KERNELS_RESHAPE_H__
+
+#include "GpuOperation.h"
+
+#include "open_cl/Operations.h"
+#include "open_cl/Precision.h"
+#include "open_cl/ClKernel.h"
+#include "open_cl/Tensor.h"
+
+namespace onert
+{
+namespace backend
+{
+namespace gpu_cl
+{
+
+GPUOperation CreateReshape(const OperationDef &definition);
+
+} // namespace gpu_cl
+} // namespace backend
+} // namespace onert
+
+#endif // __ONERT_BACKEND_GPU_CL_OPENCL_KERNELS_RESHAPE_H__
diff --git a/runtime/onert/backend/gpu_cl/open_cl/kernels/Reshapex4.cc b/runtime/onert/backend/gpu_cl/open_cl/kernels/Reshapex4.cc
new file mode 100644
index 000000000..13010e791
--- /dev/null
+++ b/runtime/onert/backend/gpu_cl/open_cl/kernels/Reshapex4.cc
@@ -0,0 +1,96 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Reshape.h"
+
+#include <string>
+
+#include "Util.h"
+#include "open_cl/Status.h"
+
+namespace onert
+{
+namespace backend
+{
+namespace gpu_cl
+{
+namespace
+{
+
+std::string GetReshapeCode(const OperationDef &op_def)
+{
+ std::string c = GetCommonDefines(op_def.precision);
+ c += "__kernel void main_function(\n";
+ c += "$0) {\n";
+ if (op_def.dst_tensors[0].HasAxis(Axis::BATCH))
+ {
+ c += " int linear_id = get_global_id(0);\n";
+ c += " int X = linear_id / args.dst_tensor.Batch();\n";
+ c += " int B = linear_id % args.dst_tensor.Batch();\n";
+ c += " args.dst_tensor.SetBatchRef(B);\n";
+ }
+ else
+ {
+ c += " int X = get_global_id(0);\n";
+ }
+ c += " int Y = get_global_id(1);\n";
+ c += " int Z = get_global_id(2);\n";
+ c += " if (X >= args.dst_tensor.Width() || Y >= args.dst_tensor.Height() || "
+ "Z >= args.dst_tensor.Slices()) { \n";
+ c += " return; \n";
+ c += " } \n";
+ if (op_def.dst_tensors[0].HasAxis(Axis::BATCH))
+ {
+ c += " int dst_bhwc4 = B;\n";
+ }
+ else
+ {
+ c += " int dst_bhwc4 = 0;\n";
+ }
+ c += " dst_bhwc4 = ((dst_bhwc4 * args.dst_tensor.Height() + Y) * "
+ "args.dst_tensor.Width() + X) * args.dst_tensor.Slices() + Z;\n";
+ c += " int src_z = dst_bhwc4 % args.src_tensor.Slices();\n";
+ c += " dst_bhwc4 = dst_bhwc4 / args.src_tensor.Slices();\n";
+ c += " int src_x = dst_bhwc4 % args.src_tensor.Width();\n";
+ c += " dst_bhwc4 = dst_bhwc4 / args.src_tensor.Width();\n";
+ c += " int src_y = dst_bhwc4 % args.src_tensor.Height();\n";
+ if (op_def.src_tensors[0].HasAxis(Axis::BATCH))
+ {
+ c += " int src_b = dst_bhwc4 / args.src_tensor.Height();\n";
+ c += " args.src_tensor.SetBatchRef(src_b);\n";
+ }
+ c += " FLT4 result = args.src_tensor.Read(src_x, src_y, src_z);\n";
+ c += " args.dst_tensor.Write(result, X, Y, Z);\n";
+ c += "}\n";
+ return c;
+}
+
+} // namespace
+
+GPUOperation CreateReshapex4(const OperationDef &definition)
+{
+ GPUOperation op(definition);
+ op.AddSrcTensor("src_tensor", definition.src_tensors[0]);
+ op.AddDstTensor("dst_tensor", definition.dst_tensors[0]);
+ op.code_ = GetReshapeCode(definition);
+ op.tensor_to_grid_ = TensorToGrid::kWBToX_HDToY_SToZ;
+ return op;
+}
+
+} // namespace gpu_cl
+} // namespace backend
+} // namespace onert
diff --git a/runtime/onert/backend/gpu_cl/open_cl/kernels/Reshapex4.h b/runtime/onert/backend/gpu_cl/open_cl/kernels/Reshapex4.h
new file mode 100644
index 000000000..8988e8bd4
--- /dev/null
+++ b/runtime/onert/backend/gpu_cl/open_cl/kernels/Reshapex4.h
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __ONERT_BACKEND_GPU_CL_OPEN_CL_KERNELS_RESHAPEX4_H__
+#define __ONERT_BACKEND_GPU_CL_OPEN_CL_KERNELS_RESHAPEX4_H__
+
+#include "GpuOperation.h"
+
+#include "open_cl/Operations.h"
+#include "open_cl/Precision.h"
+#include "open_cl/ClKernel.h"
+#include "open_cl/Tensor.h"
+
+namespace onert
+{
+namespace backend
+{
+namespace gpu_cl
+{
+
+// More optimized, but requires src_channels % 4 == 0 and dst_channels % 4 == 0.
+GPUOperation CreateReshapex4(const OperationDef &definition);
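+// For example (illustrative): a BHWC 1x4x4x8 -> 1x2x8x8 reshape satisfies the
+// constraint above (8 % 4 == 0 on both sides) and can copy whole FLT4 slices,
+// whereas a reshape producing 6 channels must use the generic CreateReshape.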
+
+} // namespace gpu_cl
+} // namespace backend
+} // namespace onert
+
+#endif // __ONERT_BACKEND_GPU_CL_OPEN_CL_KERNELS_RESHAPEX4_H__
diff --git a/runtime/onert/backend/gpu_cl/open_cl/kernels/Softmax.cc b/runtime/onert/backend/gpu_cl/open_cl/kernels/Softmax.cc
new file mode 100644
index 000000000..4ee164d82
--- /dev/null
+++ b/runtime/onert/backend/gpu_cl/open_cl/kernels/Softmax.cc
@@ -0,0 +1,86 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Softmax.h"
+
+#include <string>
+
+#include "Util.h"
+#include "WorkGroupPicking.h"
+#include "GpuOperation.h"
+#include "open_cl/Status.h"
+
+namespace onert
+{
+namespace backend
+{
+namespace gpu_cl
+{
+
+namespace
+{
+std::string GetSoftmaxKernelCode(const OperationDef &op_def)
+{
+ std::string c = GetCommonDefines(op_def.precision);
+ c += "__kernel void main_function(\n";
+ c += "$0) {\n";
+ c += " int X = get_global_id(0);\n";
+ c += " int Y = get_global_id(1);\n";
+ c += " if (X >= args.dst_tensor.Width() || Y >= args.dst_tensor.Height()) "
+ "return; \n";
+ c += " float sum = 0.0f;\n";
+ c += " for (int d = 0; d < args.dst_tensor.Slices(); ++d) {\n";
+ c += " float4 t = args.src_tensor.Read<float>(X, Y, d);\n";
+ c += " sum += exp(t.x);\n";
+ c += " if (d * 4 + 1 < args.dst_tensor.Channels()) sum += exp(t.y);\n";
+ c += " if (d * 4 + 2 < args.dst_tensor.Channels()) sum += exp(t.z);\n";
+ c += " if (d * 4 + 3 < args.dst_tensor.Channels()) sum += exp(t.w);\n";
+ c += " }\n";
+ c += " for (int d = 0; d < args.dst_tensor.Slices(); ++d) {\n";
+ c += " float4 t = args.src_tensor.Read<float>(X, Y, d);\n";
+ c += " t = exp(t) / sum;\n";
+ c += " FLT4 result = TO_FLT4(t);\n";
+ c += " args.dst_tensor.Write(result, X, Y, d);\n";
+ c += " }\n";
+ c += "}\n";
+ return c;
+}
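+
+// Example (illustrative): with 6 channels the tensor has 2 slices; on the tail
+// slice (d == 1) the sum takes exp(t.x) plus exp(t.y) (since 1 * 4 + 1 < 6),
+// while the padded z/w lanes are excluded by the d * 4 + k < Channels() checks.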
+} // namespace
+
+GPUOperation CreateSoftmax(const OperationDef &definition)
+{
+ GPUOperation op(definition);
+ auto src_desc = definition.src_tensors[0];
+ if (definition.IsBatchSupported())
+ {
+ src_desc.SetStateVar("BatchedWidth", "true");
+ }
+ op.AddSrcTensor("src_tensor", src_desc);
+ auto dst_desc = definition.dst_tensors[0];
+ if (definition.IsBatchSupported())
+ {
+ dst_desc.SetStateVar("BatchedWidth", "true");
+ }
+ op.AddDstTensor("dst_tensor", dst_desc);
+ op.code_ = GetSoftmaxKernelCode(definition);
+ op.tensor_to_grid_ = TensorToGrid::kWBToX_HDToY_ZIs1;
+ return op;
+}
+
+} // namespace gpu_cl
+} // namespace backend
+} // namespace onert
diff --git a/runtime/onert/backend/gpu_cl/open_cl/kernels/Softmax.h b/runtime/onert/backend/gpu_cl/open_cl/kernels/Softmax.h
new file mode 100644
index 000000000..594bab042
--- /dev/null
+++ b/runtime/onert/backend/gpu_cl/open_cl/kernels/Softmax.h
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __ONERT_BACKEND_GPU_CL_OPEN_CL_KERNELS_SOFTMAX_H__
+#define __ONERT_BACKEND_GPU_CL_OPEN_CL_KERNELS_SOFTMAX_H__
+
+#include "open_cl/ClKernel.h"
+#include "GpuOperation.h"
+#include "open_cl/Precision.h"
+#include "open_cl/Tensor.h"
+#include "open_cl/Types.h"
+
+namespace onert
+{
+namespace backend
+{
+namespace gpu_cl
+{
+
+GPUOperation CreateSoftmax(const OperationDef &definition);
+
+} // namespace gpu_cl
+} // namespace backend
+} // namespace onert
+
+#endif // __ONERT_BACKEND_GPU_CL_OPEN_CL_KERNELS_SOFTMAX_H__
diff --git a/runtime/onert/backend/gpu_cl/open_cl/kernels/Softmax1x1.cc b/runtime/onert/backend/gpu_cl/open_cl/kernels/Softmax1x1.cc
new file mode 100644
index 000000000..590952dca
--- /dev/null
+++ b/runtime/onert/backend/gpu_cl/open_cl/kernels/Softmax1x1.cc
@@ -0,0 +1,138 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Softmax1x1.h"
+
+#include <string>
+
+#include "Util.h"
+#include "open_cl/Status.h"
+
+namespace onert
+{
+namespace backend
+{
+namespace gpu_cl
+{
+
+Softmax1x1::Softmax1x1(const OperationDef &definition) : GPUOperation(definition)
+{
+ work_group_size_ = int3(32, 1, 1);
+ code_ = GetSoftmaxKernelCode(definition_);
+}
+
+Softmax1x1::Softmax1x1(Softmax1x1 &&kernel) : GPUOperation(std::move(kernel)) {}
+
+Softmax1x1 &Softmax1x1::operator=(Softmax1x1 &&kernel)
+{
+ if (this != &kernel)
+ {
+ GPUOperation::operator=(std::move(kernel));
+ }
+ return *this;
+}
+
+std::string Softmax1x1::GetSoftmaxKernelCode(const OperationDef &op_def)
+{
+ AddSrcTensor("src_tensor", op_def.src_tensors[0]);
+ AddDstTensor("dst_tensor", op_def.dst_tensors[0]);
+ args_.AddFloat("mask_x");
+ args_.AddFloat("mask_y");
+ args_.AddFloat("mask_z");
+ args_.AddFloat("mask_w");
+ args_.AddInt("slices_x32");
+
+ std::string c = GetCommonDefines(op_def.precision);
+ c += "__kernel void main_function(\n";
+ c += "$0) {\n";
+ if (op_def.IsBatchSupported())
+ {
+ c += " int batch_id = get_global_id(1);\n";
+ c += " if (batch_id >= args.dst_tensor.Batch()) return;\n";
+ c += " args.dst_tensor.SetBatchRef(batch_id);\n";
+ c += " args.src_tensor.SetBatchRef(batch_id);\n";
+ }
+ c += " float4 mask = (float4)(args.mask_x, args.mask_y, args.mask_z, "
+ "args.mask_w);\n";
+ c += " int offset = 0;\n";
+ c += " float sum = 0.0f;\n";
+ c += " int s = 0;\n";
+ c += " int tid = get_local_id(0);\n";
+ c += " do {\n";
+ c += " int z = offset + tid;\n";
+ c += " if (z < args.dst_tensor.Slices()) {\n";
+ c += " float4 mask_temp = z == args.dst_tensor.Slices() - 1 ? mask : "
+ "(float4)(1.0f);\n";
+ c += " float4 src = args.src_tensor.Read<float>(0, 0, z);\n";
+ c += " sum += dot(mask_temp, exp(src));\n";
+ c += " offset += 32;\n";
+ c += " }\n";
+ c += " s++;\n";
+ c += " } while (s < args.slices_x32);\n";
+ c += "\n";
+ c += " __local float4 tmp[8];\n";
+ c += " __local float* tmpx1 = (__local float*)tmp;\n";
+ c += " tmpx1[tid] = sum;\n";
+ c += " barrier(CLK_LOCAL_MEM_FENCE);\n";
+ c += " if (tid == 0) {\n";
+ c += " sum = dot((float4)(1.0f), tmp[0]);\n";
+ c += " sum += dot((float4)(1.0f), tmp[1]);\n";
+ c += " sum += dot((float4)(1.0f), tmp[2]);\n";
+ c += " sum += dot((float4)(1.0f), tmp[3]);\n";
+ c += " sum += dot((float4)(1.0f), tmp[4]);\n";
+ c += " sum += dot((float4)(1.0f), tmp[5]);\n";
+ c += " sum += dot((float4)(1.0f), tmp[6]);\n";
+ c += " sum += dot((float4)(1.0f), tmp[7]);\n";
+ c += " tmpx1[0] = 1.0f / sum;\n";
+ c += " }\n";
+ c += " barrier(CLK_LOCAL_MEM_FENCE);\n";
+ c += " sum = tmpx1[0];\n";
+ c += "\n";
+ c += " offset = 0;\n";
+ c += " s = 0;\n";
+ c += " do {\n";
+ c += " int z = offset + tid;\n";
+ c += " if (z < args.dst_tensor.Slices()) {\n";
+ c += " FLT4 res = TO_FLT4(exp(args.src_tensor.Read<float>(0, 0, "
+ "z))*sum);\n";
+ c += " args.dst_tensor.Write(res, 0, 0, z);\n";
+ c += " offset += 32;\n";
+ c += " }\n";
+ c += " s++;\n";
+ c += " } while (s < args.slices_x32);\n";
+ c += "}\n";
+ return c;
+}
+
+absl::Status Softmax1x1::BindArguments(ArgumentsBinder *args)
+{
+ float4 mask = GetMaskForLastPlane(src_[0]->Channels());
+ RETURN_IF_ERROR(args->SetFloat("mask_x", mask.x));
+ RETURN_IF_ERROR(args->SetFloat("mask_y", mask.y));
+ RETURN_IF_ERROR(args->SetFloat("mask_z", mask.z));
+ RETURN_IF_ERROR(args->SetFloat("mask_w", mask.w));
+ RETURN_IF_ERROR(args->SetInt("slices_x32", DivideRoundUp(src_[0]->Slices(), 32)));
+ return absl::OkStatus();
+}
+
+int3 Softmax1x1::GetGridSize() const { return int3(32, dst_[0]->Batch(), 1); }
+
+Softmax1x1 CreateSoftmax1x1(const OperationDef &definition) { return Softmax1x1(definition); }
+
+} // namespace gpu_cl
+} // namespace backend
+} // namespace onert
diff --git a/runtime/onert/backend/gpu_cl/open_cl/kernels/Softmax1x1.h b/runtime/onert/backend/gpu_cl/open_cl/kernels/Softmax1x1.h
new file mode 100644
index 000000000..da375d457
--- /dev/null
+++ b/runtime/onert/backend/gpu_cl/open_cl/kernels/Softmax1x1.h
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __ONERT_BACKEND_GPU_CL_OPEN_CL_KERNELS_SOFTMAX1X1_H__
+#define __ONERT_BACKEND_GPU_CL_OPEN_CL_KERNELS_SOFTMAX1X1_H__
+
+#include "GpuOperation.h"
+
+#include "open_cl/Precision.h"
+#include "open_cl/ClKernel.h"
+#include "open_cl/Tensor.h"
+
+namespace onert
+{
+namespace backend
+{
+namespace gpu_cl
+{
+
+class Softmax1x1 : public GPUOperation
+{
+public:
+ Softmax1x1() = default;
+ explicit Softmax1x1(const OperationDef &definition);
+
+ absl::Status BindArguments(ArgumentsBinder *args) override;
+ int3 GetGridSize() const override;
+
+ // Move only
+ Softmax1x1(Softmax1x1 &&kernel);
+ Softmax1x1 &operator=(Softmax1x1 &&kernel);
+ Softmax1x1(const Softmax1x1 &) = delete;
+ Softmax1x1 &operator=(const Softmax1x1 &) = delete;
+
+ friend Softmax1x1 CreateSoftmax1x1(const OperationDef &definition);
+
+private:
+ std::string GetSoftmaxKernelCode(const OperationDef &op_def);
+};
+
+Softmax1x1 CreateSoftmax1x1(const OperationDef &definition);
+
+} // namespace gpu_cl
+} // namespace backend
+} // namespace onert
+
+#endif // __ONERT_BACKEND_GPU_CL_OPEN_CL_KERNELS_SOFTMAX1X1_H__
diff --git a/runtime/onert/backend/gpu_cl/open_cl/kernels/TuningParameters.h b/runtime/onert/backend/gpu_cl/open_cl/kernels/TuningParameters.h
new file mode 100644
index 000000000..3d99b4fda
--- /dev/null
+++ b/runtime/onert/backend/gpu_cl/open_cl/kernels/TuningParameters.h
@@ -0,0 +1,48 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __ONERT_BACKEND_GPU_CL_KERNELS_TUNING_PARAMETERS_H__
+#define __ONERT_BACKEND_GPU_CL_KERNELS_TUNING_PARAMETERS_H__
+
+#include "open_cl/ClCommandQueue.h"
+#include "open_cl/DeviceInfo.h"
+
+namespace onert
+{
+namespace backend
+{
+namespace gpu_cl
+{
+
+enum class TuningType
+{
+ EXHAUSTIVE,
+ FAST
+};
+
+struct TuningParameters
+{
+ ProfilingCommandQueue *queue;
+ const DeviceInfo *info;
+ TuningType tuning_type = TuningType::EXHAUSTIVE;
+};
+
+} // namespace gpu_cl
+} // namespace backend
+} // namespace onert
+
+#endif // __ONERT_BACKEND_GPU_CL_KERNELS_TUNING_PARAMETERS_H__
diff --git a/runtime/onert/backend/gpu_cl/open_cl/kernels/Util.cc b/runtime/onert/backend/gpu_cl/open_cl/kernels/Util.cc
new file mode 100644
index 000000000..df42c66e8
--- /dev/null
+++ b/runtime/onert/backend/gpu_cl/open_cl/kernels/Util.cc
@@ -0,0 +1,230 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Util.h"
+
+#include <cfloat>
+#include <cmath>
+#include <string>
+#include <vector>
+
+#include "absl/strings/str_cat.h"
+#include "absl/strings/substitute.h"
+#include "open_cl/Precision.h"
+#include "open_cl/DataType.h"
+
+namespace onert
+{
+namespace backend
+{
+namespace gpu_cl
+{
+
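+// Emits the per-precision type aliases shared by the generated kernels:
+// F32 stores and accumulates in float, F16 stores and accumulates in half,
+// and F32_F16 stores in half (FLT*) while accumulating in float (ACCUM_*).
+// The cl_khr_* pragmas enable 3D image writes and, where needed, fp16.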
+std::string GetCommonDefines(CalculationsPrecision precision)
+{
+ std::string result;
+
+ switch (precision)
+ {
+ case CalculationsPrecision::F32:
+ result += "#pragma OPENCL EXTENSION cl_khr_3d_image_writes : enable\n";
+ result += "#define ACCUM_FLT4 float4\n";
+ result += "#define FLT float\n";
+ result += "#define FLT2 float2\n";
+ result += "#define FLT3 float3\n";
+ result += "#define FLT4 float4\n";
+ result += "#define TO_FLT4 convert_float4\n";
+ result += "#define TO_ACCUM_TYPE convert_float4\n";
+ result += "#define TO_ACCUM_FLT convert_float\n";
+ break;
+ case CalculationsPrecision::F16:
+ result += "#pragma OPENCL EXTENSION cl_khr_3d_image_writes : enable\n";
+ result += "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n";
+ result += "#define ACCUM_FLT4 half4\n";
+ result += "#define FLT half\n";
+ result += "#define FLT2 half2\n";
+ result += "#define FLT3 half3\n";
+ result += "#define FLT4 half4\n";
+ result += "#define TO_FLT4 convert_half4\n";
+ result += "#define TO_ACCUM_TYPE convert_half4\n";
+ result += "#define TO_ACCUM_FLT convert_half\n";
+ break;
+ case CalculationsPrecision::F32_F16:
+ result += "#pragma OPENCL EXTENSION cl_khr_3d_image_writes : enable\n";
+ result += "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n";
+ result += "#define ACCUM_FLT4 float4\n";
+ result += "#define FLT half\n";
+ result += "#define FLT2 half2\n";
+ result += "#define FLT3 half3\n";
+ result += "#define FLT4 half4\n";
+ result += "#define TO_FLT4 convert_half4\n";
+ result += "#define TO_ACCUM_TYPE convert_float4\n";
+ result += "#define TO_ACCUM_FLT convert_float\n";
+ break;
+ }
+ return result;
+}
+
+std::string GetXStrideCorrectedV2(const std::string &src_x, const std::string &batch_size,
+ const std::string &stride_x, const std::string &padding_x)
+{
+ // int p0 = src_x / batch_size;
+ // int b0 = src_x % batch_size;
+ // return (p0 * stride_x + padding_x) * batch_size + b0;
+ return absl::Substitute("(((($0) / $1) * $2 + $3) * $1 + ($0) % $1)", src_x, batch_size, stride_x,
+ padding_x);
+}
+
+float4 GetMaskForLastPlane(int channels)
+{
+ float4 mask = float4(0.0f);
+ const int remainder = channels % 4 == 0 ? 4 : channels % 4;
+ for (int i = 0; i < remainder; ++i)
+ {
+ mask[i] = 1.0f;
+ }
+ return mask;
+}
+
+int3 GetFirstSuitableWorkGroup(const std::vector<int3> &wgs, int max_wg_size)
+{
+ for (const auto &wg : wgs)
+ {
+ const int wg_size = wg.x * wg.y * wg.z;
+ if (wg_size <= max_wg_size)
+ {
+ return wg;
+ }
+ }
+ return {1, 1, 1};
+}
+
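+// Picks a convolution block size (1, 2, 4 or 8) for Mali GPUs by comparing the
+// per-compute-unit workload against per-architecture thresholds; non-Mali
+// devices always get block size 1.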
+int GetRecommendedBlockSizeForConv(const DeviceInfo &device_info, CalculationsPrecision precision,
+ int task_size)
+{
+ const float task_size_per_cu = task_size / static_cast<float>(device_info.compute_units_count);
+ int block_size = 1;
+ float threshold_1 = FLT_MAX;
+ float threshold_2 = FLT_MAX;
+ float threshold_4 = FLT_MAX;
+ if (!device_info.IsMali())
+ {
+ return 1;
+ }
+ MaliInfo mali_info = device_info.mali_info;
+ switch (precision)
+ {
+ case CalculationsPrecision::F16:
+ if (mali_info.IsBifrostGen1())
+ {
+ threshold_1 = 256.0f;
+ threshold_2 = 256.0f * 4.0f;
+ threshold_4 = 256.0f * 8.0f;
+ }
+ else if (mali_info.IsBifrostGen2())
+ {
+ threshold_1 = 256.0f * 2.0f;
+ threshold_2 = 256.0f * 8.0f;
+ threshold_4 = 256.0f * 16.0f;
+ }
+ else if (mali_info.IsBifrostGen3() || mali_info.IsValhall())
+ {
+ threshold_1 = 256.0f;
+ threshold_2 = 256.0f * 6.0f;
+ threshold_4 = 256.0f * 16.0f;
+ }
+ else if (mali_info.IsMidgard())
+ {
+ threshold_1 = 256.0f * 4.0f;
+ threshold_2 = 256.0f * 16.0f;
+ }
+ break;
+ case CalculationsPrecision::F32_F16:
+ if (mali_info.IsBifrostGen1())
+ {
+ threshold_1 = 256.0f;
+ threshold_2 = 256.0f * 3.0f;
+ threshold_4 = 256.0f * 32.0f;
+ }
+ else if (mali_info.IsBifrostGen2())
+ {
+ threshold_1 = 256.0f * 2.0f;
+ threshold_2 = 256.0f * 8.0f;
+ }
+ else if (mali_info.IsBifrostGen3() || mali_info.IsValhall())
+ {
+ threshold_1 = 256.0f;
+ threshold_2 = 256.0f * 8.0f;
+ }
+ else if (mali_info.IsMidgard())
+ {
+ threshold_1 = 256.0f * 4.0f;
+ }
+ break;
+ case CalculationsPrecision::F32:
+ if (mali_info.IsBifrostGen1())
+ {
+ threshold_1 = 256.0f;
+ threshold_2 = 256.0f * 4.0f;
+ }
+ else if (mali_info.IsBifrostGen2())
+ {
+ threshold_1 = 128.0f;
+ threshold_2 = 256.0f * 4.0f;
+ }
+ else if (mali_info.IsBifrostGen3() || mali_info.IsValhall())
+ {
+ threshold_1 = 256.0f;
+ threshold_2 = 256.0f * 12.0f;
+ }
+ else if (mali_info.IsMidgard())
+ {
+ threshold_1 = 256.0f * 16.0f;
+ }
+ break;
+ }
+ if (task_size_per_cu <= threshold_1)
+ {
+ block_size = 1;
+ }
+ else if (task_size_per_cu <= threshold_2)
+ {
+ block_size = 2;
+ }
+ else if (task_size_per_cu <= threshold_4)
+ {
+ block_size = 4;
+ }
+ else
+ {
+ block_size = 8;
+ }
+ return block_size;
+}
+
+int3 GetWorkGroupsCount(const int3 &grid_size, const int3 &work_group_size)
+{
+ int3 work_groups_count;
+ work_groups_count.x = DivideRoundUp(grid_size.x, work_group_size.x);
+ work_groups_count.y = DivideRoundUp(grid_size.y, work_group_size.y);
+ work_groups_count.z = DivideRoundUp(grid_size.z, work_group_size.z);
+ return work_groups_count;
+}
+
+} // namespace gpu_cl
+} // namespace backend
+} // namespace onert
diff --git a/runtime/onert/backend/gpu_cl/open_cl/kernels/Util.h b/runtime/onert/backend/gpu_cl/open_cl/kernels/Util.h
new file mode 100644
index 000000000..8363862c1
--- /dev/null
+++ b/runtime/onert/backend/gpu_cl/open_cl/kernels/Util.h
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __ONERT_BACKEND_GPU_CL_OPENCL_KERNELS_UTIL_H__
+#define __ONERT_BACKEND_GPU_CL_OPENCL_KERNELS_UTIL_H__
+
+#include <string>
+#include <vector>
+
+#include "open_cl/DeviceInfo.h"
+#include "open_cl/Precision.h"
+#include "open_cl/DataType.h"
+#include "open_cl/Shape.h"
+#include "open_cl/Tensor.h"
+#include "open_cl/Types.h"
+#include "open_cl/Util.h"
+
+namespace onert
+{
+namespace backend
+{
+namespace gpu_cl
+{
+
+std::string GetCommonDefines(CalculationsPrecision precision);
+
+// Calculates the correct X coordinate when stride != 1 and batch != 1 for
+// layouts with B after W (for example, HWBC4), where W and B are stored along
+// one axis of the GPU resource.
+std::string GetXStrideCorrected(const std::string &src_x, const std::string &batch_size,
+ const std::string &stride_x, const std::string &padding_x);
+
+// Calculates the correct X coordinate when stride != 1 and batch != 1 for
+// layouts with B after W (for example, HWBC4), where W and B are stored along
+// one axis of the GPU resource.
+std::string GetXStrideCorrectedV2(const std::string &src_x, const std::string &batch_size,
+ const std::string &stride_x, const std::string &padding_x);
+
+// Returns a float4 mask for the last plane (batch of 4 channels).
+// Assumes the plane size is 4. For example, with 7 channels the data is
+// aligned to 8, but the 8th channel stays empty, so the last plane
+// (batch of 4 channels) gets the mask (1, 1, 1, 0).
+float4 GetMaskForLastPlane(int channels);
+
+// Returns the first work group in wgs whose size does not exceed max_wg_size.
+// If no suitable group exists in wgs, returns {1, 1, 1}.
+int3 GetFirstSuitableWorkGroup(const std::vector<int3> &wgs, int max_wg_size);
+
+// task_size is the number of FLT4 elements to be processed.
+int GetRecommendedBlockSizeForConv(const DeviceInfo &device, CalculationsPrecision precision,
+ int task_size);
+
+int3 GetWorkGroupsCount(const int3 &grid_size, const int3 &work_group_size);
+} // namespace gpu_cl
+} // namespace backend
+} // namespace onert
+
+#endif // __ONERT_BACKEND_GPU_CL_OPENCL_KERNELS_UTIL_H__
diff --git a/runtime/onert/backend/gpu_cl/open_cl/kernels/WorkGroupPicking.cc b/runtime/onert/backend/gpu_cl/open_cl/kernels/WorkGroupPicking.cc
new file mode 100644
index 000000000..214fec271
--- /dev/null
+++ b/runtime/onert/backend/gpu_cl/open_cl/kernels/WorkGroupPicking.cc
@@ -0,0 +1,348 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "WorkGroupPicking.h"
+
+#include <algorithm>
+#include <limits>
+#include <set>
+#include <vector>
+
+#include "open_cl/Util.h"
+#include "open_cl/Types.h"
+
+namespace onert
+{
+namespace backend
+{
+namespace gpu_cl
+{
+
+namespace
+{
+
+std::vector<int2> Get2DWorkgroupsEqualTo128()
+{
+ return {{128, 1}, {64, 2}, {32, 4}, {16, 8}, {8, 16}, {4, 32}, {2, 64}, {1, 128}};
+}
+
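+// Enumerates power-of-two (x, y) candidates whose product is a multiple of
+// multiplier, paired with z sizes allowed by z_alignment and filtered by the
+// kernel and device work-group limits.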
+std::vector<int3> GenerateWorkGroupSizesXYMultipleOf(int multiplier, int3 grid,
+ const KernelInfo &kernel_info,
+ const DeviceInfo &device_info,
+ WorkGroupSizeAlignment z_alignment)
+{
+ std::vector<int3> work_groups;
+ work_groups.reserve(32);
+
+ std::vector<int> possible_z_sizes = GetPossibleSizes(grid.z, z_alignment);
+
+ for (int x = 1; x <= kernel_info.max_work_group_size; x *= 2)
+ {
+ for (int y = 1; y <= kernel_info.max_work_group_size; y *= 2)
+ {
+ int work_group_size_xy = x * y;
+ if (work_group_size_xy % multiplier != 0 ||
+ work_group_size_xy > kernel_info.max_work_group_size)
+ {
+ continue;
+ }
+ for (auto z : possible_z_sizes)
+ {
+ if (work_group_size_xy * z > kernel_info.max_work_group_size)
+ {
+ continue;
+ }
+ if (x <= device_info.max_work_group_size_x && y <= device_info.max_work_group_size_y &&
+ z <= device_info.max_work_group_size_z)
+ {
+ work_groups.push_back({x, y, z});
+ }
+ }
+ }
+ }
+ return work_groups;
+}
+
+std::vector<int3> GenerateWorkGroupSizesXMultipleOf(int multiplier, int3 grid,
+ const KernelInfo &kernel_info,
+ const DeviceInfo &device_info,
+ WorkGroupSizeAlignment z_alignment)
+{
+ std::vector<int3> work_groups;
+ work_groups.reserve(32);
+
+ std::vector<int> possible_z_sizes = GetPossibleSizes(grid.z, z_alignment);
+ std::vector<int> possible_y_sizes = GetPossibleSizes(grid.y, WorkGroupSizeAlignment::PRECISE);
+
+ for (int x = multiplier; x <= kernel_info.max_work_group_size && x < grid.x + multiplier;
+ x += multiplier)
+ {
+ for (auto y : possible_y_sizes)
+ {
+ for (auto z : possible_z_sizes)
+ {
+ if (x <= device_info.max_work_group_size_x && y <= device_info.max_work_group_size_y &&
+ z <= device_info.max_work_group_size_z && x * y * z <= kernel_info.max_work_group_size)
+ {
+ work_groups.push_back({x, y, z});
+ }
+ }
+ }
+ }
+ return work_groups;
+}
+
+void GetWorkGroupsAlignedToGrid(const DeviceInfo &device_info, const KernelInfo &kernel_info,
+ const int3 &grid, std::vector<int3> *work_groups)
+{
+ int3 max_wg_size;
+ max_wg_size.x = device_info.max_work_group_size_x;
+ max_wg_size.y = device_info.max_work_group_size_y;
+ max_wg_size.z = device_info.max_work_group_size_z;
+ GenerateWorkGroupSizesAlignedToGrid(grid, max_wg_size, kernel_info.max_work_group_size,
+ work_groups);
+}
+
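+// The penalty measures the padding added when the grid size is not a multiple
+// of the work-group size; the int2 overload counts the total wasted grid cells.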
+int GetPenalty(int grid_size, int group_size)
+{
+ const int remainder = grid_size % group_size;
+ return remainder == 0 ? 0 : group_size - remainder;
+}
+
+int GetPenalty(int2 grid_size, int2 group_size)
+{
+ const int p_x = GetPenalty(grid_size.x, group_size.x);
+ const int p_y = GetPenalty(grid_size.y, group_size.y);
+ return p_x * grid_size.y + p_y * grid_size.x + p_x * p_y;
+}
+
+int GetMaxSizeWithMinPenalty(int size, int max_size)
+{
+ int best_size = 128;
+ int min_penalty = GetPenalty(size, best_size);
+ for (int i = 2; i * 128 <= max_size; ++i)
+ {
+ if (GetPenalty(size, i * 128) == min_penalty)
+ {
+ best_size = i * 128;
+ }
+ }
+ return best_size;
+}
+
+int2 GetMaxSizeWithMinPenalty(int2 size, int max_size)
+{
+ std::vector<int2> base_groups = Get2DWorkgroupsEqualTo128();
+ int min_penalty = std::numeric_limits<int>::max();
+ for (const auto &group : base_groups)
+ {
+ min_penalty = std::min(GetPenalty(size, group), min_penalty);
+ }
+ for (const auto &group : base_groups)
+ {
+ for (int y = 1; y * group.y <= max_size; ++y)
+ {
+ int new_group_y = y * group.y;
+ for (int x = 1; x * group.x <= max_size; ++x)
+ {
+ int new_group_x = x * group.x;
+ if (new_group_x * new_group_y > max_size)
+ {
+ break;
+ }
+ if (GetPenalty(size, int2(new_group_x, new_group_y)) == min_penalty)
+ {
+ return int2(new_group_x, new_group_y);
+ }
+ }
+ }
+ }
+ return int2(0, 0);
+}
+
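+// Returns the largest divider of number that does not exceed max_divider,
+// preferring 8, 4 and 2 when they divide number evenly.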
+int GetBiggestDividerWithPriority(int number, int max_divider)
+{
+ if (number % 8 == 0 && 8 <= max_divider)
+ {
+ return 8;
+ }
+ if (number % 4 == 0 && 4 <= max_divider)
+ {
+ return 4;
+ }
+ if (number % 2 == 0 && 2 <= max_divider)
+ {
+ return 2;
+ }
+ for (int i = max_divider; i != 0; i--)
+ {
+ if (number % i == 0)
+ {
+ return i;
+ }
+ }
+ return 1;
+}
+
+int GetBiggestDivider(int number, int max_divider)
+{
+ for (int i = max_divider; i != 0; i--)
+ {
+ if (number % i == 0)
+ {
+ return i;
+ }
+ }
+ return 1;
+}
+
+} // namespace
+
+int3 GetWorkGroupXY128ConvLinear(const int3 &grid)
+{
+ int grid_z = GetBiggestDividerWithPriority(grid.z, 4);
+ if (grid.x <= 128)
+ {
+ return int3(128, 1, grid_z);
+ }
+ int grid_x = GetMaxSizeWithMinPenalty(grid.x, 512 / grid_z);
+ return {grid_x, 1, grid_z};
+}
+
+int3 GetWorkGroupXY128Conv(const int3 &grid)
+{
+ int grid_z = GetBiggestDividerWithPriority(grid.z, 4);
+ if (grid.x <= 16 && grid.y <= 8)
+ {
+ return int3(16, 8, grid_z);
+ }
+ int2 grid_xy = GetMaxSizeWithMinPenalty(int2(grid.x, grid.y), 512 / grid_z);
+ return int3(grid_xy.x, grid_xy.y, grid_z);
+}
+
+// int3 GetWorkGroupXY128Simple(const int3& grid) { return int3(16, 8, 1); }
+
+int3 GetWorkGroup(const int3 &grid, int max_size)
+{
+ int wg_z = GetBiggestDividerWithPriority(grid.z, 8);
+ int wg_xy_size = max_size / wg_z;
+ int wg_x = std::min(DivideRoundUp(grid.x, 2), wg_xy_size);
+ int wg_y = std::min(wg_xy_size / wg_x, grid.y);
+ return int3(wg_x, wg_y, wg_z);
+}
+
+int3 GetWorkGroupConv(const int3 &grid, int max_size, int max_z_size)
+{
+ int wg_z = GetBiggestDivider(grid.z, max_z_size);
+ int wg_xy_size = std::min(256, max_size) / wg_z;
+ int wg_x = std::min(grid.x, wg_xy_size);
+ int wg_y = std::min(wg_xy_size / wg_x, grid.y);
+ if (wg_y == grid.y && grid.y % 2 == 0)
+ {
+ wg_y = grid.y / 2;
+ }
+ return int3(wg_x, wg_y, wg_z);
+}
+
+void GetPossibleWorkGroupsXYMultipleOf(int multiplier, const DeviceInfo &device_info,
+ const KernelInfo &kernel_info, const int3 &grid,
+ WorkGroupSizeAlignment z_alignment,
+ std::vector<int3> *work_groups)
+{
+ *work_groups =
+ GenerateWorkGroupSizesXYMultipleOf(multiplier, grid, kernel_info, device_info, z_alignment);
+}
+
+void GetPossibleWorkGroupsXMultipleOf(int multiplier, const DeviceInfo &device_info,
+ const KernelInfo &kernel_info, const int3 &grid,
+ WorkGroupSizeAlignment z_alignment,
+ std::vector<int3> *work_groups)
+{
+ *work_groups =
+ GenerateWorkGroupSizesXMultipleOf(multiplier, grid, kernel_info, device_info, z_alignment);
+}
+
+bool XY128RequiresMoreWorkGroupsThenXY128Linear(int width, int height)
+{
+ int planar_work_groups = DivideRoundUp(width * height, 128);
+ auto base_work_groups = Get2DWorkgroupsEqualTo128();
+ bool have_equal_work_groups = false;
+ for (auto &work_group : base_work_groups)
+ {
+ int x_groups = DivideRoundUp(width, work_group.x);
+ int y_groups = DivideRoundUp(height, work_group.y);
+ int xy_groups = x_groups * y_groups;
+ if (xy_groups == planar_work_groups)
+ {
+ have_equal_work_groups = true;
+ break;
+ }
+ }
+ return !have_equal_work_groups;
+}
+
+void GetPossibleWorkGroups(TuningType tuning_type, const DeviceInfo &device_info,
+ const KernelInfo &kernel_info, const int3 &grid,
+ std::vector<int3> *work_groups)
+{
+ switch (tuning_type)
+ {
+ case TuningType::FAST:
+ work_groups->push_back(GetWorkGroup(grid, kernel_info.max_work_group_size));
+ return;
+ case TuningType::EXHAUSTIVE:
+ {
+ GetWorkGroupsAlignedToGrid(device_info, kernel_info, grid, work_groups);
+ return;
+ }
+ default:
+ work_groups->push_back({8, 4, 1});
+ return;
+ }
+}
+
+void GetPossibleWorkGroupsConv(TuningType tuning_type, const DeviceInfo &device_info,
+ const KernelInfo &kernel_info, const int3 &grid,
+ std::vector<int3> *work_groups)
+{
+ switch (tuning_type)
+ {
+ case TuningType::FAST:
+ {
+ int max_z_size = 16;
+ if (device_info.IsAdreno())
+ {
+ max_z_size = device_info.IsAdreno3xx() ? 16 : 64;
+ }
+ max_z_size = std::min(max_z_size, device_info.max_work_group_size_z);
+ work_groups->push_back(GetWorkGroupConv(grid, kernel_info.max_work_group_size, max_z_size));
+ return;
+ }
+ case TuningType::EXHAUSTIVE:
+ {
+ GetWorkGroupsAlignedToGrid(device_info, kernel_info, grid, work_groups);
+ return;
+ }
+ default:
+ work_groups->push_back({8, 4, 1});
+ return;
+ }
+}
+
+} // namespace gpu_cl
+} // namespace backend
+} // namespace onert
diff --git a/runtime/onert/backend/gpu_cl/open_cl/kernels/WorkGroupPicking.h b/runtime/onert/backend/gpu_cl/open_cl/kernels/WorkGroupPicking.h
new file mode 100644
index 000000000..c19890de1
--- /dev/null
+++ b/runtime/onert/backend/gpu_cl/open_cl/kernels/WorkGroupPicking.h
@@ -0,0 +1,67 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __ONERT_BACKEND_GPU_CL_OPENCL_KERNELS_WORK_GROUP_PICKING_H__
+#define __ONERT_BACKEND_GPU_CL_OPENCL_KERNELS_WORK_GROUP_PICKING_H__
+
+#include <vector>
+
+#include "TuningParameters.h"
+
+#include "open_cl/ClKernel.h"
+#include "open_cl/DeviceInfo.h"
+#include "open_cl/Types.h"
+#include "open_cl/WorkgroupSelection.h"
+
+namespace onert
+{
+namespace backend
+{
+namespace gpu_cl
+{
+
+// multiplier must be a power of two
+void GetPossibleWorkGroupsXYMultipleOf(int multiplier, const DeviceInfo &device_info,
+ const KernelInfo &kernel_info, const int3 &grid,
+ WorkGroupSizeAlignment z_alignment,
+ std::vector<int3> *work_groups);
+
+void GetPossibleWorkGroupsXMultipleOf(int multiplier, const DeviceInfo &device_info,
+ const KernelInfo &kernel_info, const int3 &grid,
+ WorkGroupSizeAlignment z_alignment,
+ std::vector<int3> *work_groups);
+
+int3 GetWorkGroupXY128ConvLinear(const int3 &grid);
+
+int3 GetWorkGroupXY128Simple(const int3 &grid);
+int3 GetWorkGroupXY128Conv(const int3 &grid);
+
+bool XY128RequiresMoreWorkGroupsThenXY128Linear(int width, int height);
+
+void GetPossibleWorkGroups(TuningType tuning_type, const DeviceInfo &device_info,
+ const KernelInfo &kernel_info, const int3 &grid,
+ std::vector<int3> *work_groups);
+
+void GetPossibleWorkGroupsConv(TuningType tuning_type, const DeviceInfo &device_info,
+ const KernelInfo &kernel_info, const int3 &grid,
+ std::vector<int3> *work_groups);
+
+} // namespace gpu_cl
+} // namespace backend
+} // namespace onert
+
+#endif // __ONERT_BACKEND_GPU_CL_OPENCL_KERNELS_WORK_GROUP_PICKING_H__
diff --git a/runtime/onert/backend/gpu_cl/open_cl/selectors/ConvolutionSelector.cc b/runtime/onert/backend/gpu_cl/open_cl/selectors/ConvolutionSelector.cc
new file mode 100644
index 000000000..eac6f3270
--- /dev/null
+++ b/runtime/onert/backend/gpu_cl/open_cl/selectors/ConvolutionSelector.cc
@@ -0,0 +1,249 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "ConvolutionSelector.h"
+
+#include "absl/memory/memory.h"
+#include "open_cl/kernels/ConvBuffer1x1.h"
+#include "open_cl/kernels/ConvConstants.h"
+#include "open_cl/kernels/ConvPowervr.h"
+#include "open_cl/kernels/ConvWeightsConverter.h"
+#include "open_cl/kernels/WorkGroupPicking.h"
+#include "open_cl/TensorType.h"
+#include "open_cl/Util.h"
+
+namespace onert
+{
+namespace backend
+{
+namespace gpu_cl
+{
+namespace
+{
+
+std::unique_ptr<GPUOperation> SelectConvolutionAdreno(const Convolution2DAttributes &attr,
+ const BHWC &dst_shape,
+ const DeviceInfo &device_info,
+ const OperationDef &op_def, ModelHints)
+{
+ if (IsConvConstantsSupported(device_info, op_def, attr))
+ {
+ GPUOperation conv = CreateConvConstants(device_info, op_def, attr);
+ return absl::make_unique<GPUOperation>(std::move(conv));
+ }
+ else
+ {
+ ConvPowerVR conv = CreateConvPowerVR(device_info, op_def, attr, &dst_shape);
+ return absl::make_unique<ConvPowerVR>(std::move(conv));
+ }
+}
+
+std::unique_ptr<GPUOperation> SelectConvolutionWinogradAdreno(const Convolution2DAttributes &attr,
+ const BHWC &dst_shape,
+ const DeviceInfo &device_info,
+ const OperationDef &op_def,
+ ModelHints)
+{
+ ConvPowerVR conv = CreateConvPowerVRWino4x4To6x6(device_info, op_def, attr, &dst_shape);
+ return absl::make_unique<ConvPowerVR>(std::move(conv));
+}
+
+std::unique_ptr<GPUOperation>
+SelectConvolutionDynamicWeightsAdreno(const Convolution2DAttributes &attr,
+ const BHWC &weights_shape, const BHWC &dst_shape,
+ const DeviceInfo &device_info, const OperationDef &op_def,
+ ModelHints, ConvWeightsDescription *weights_desc)
+{
+ ConvPowerVR conv =
+ CreateConvPowerVRDynamicWeights(device_info, op_def, attr, weights_shape, &dst_shape);
+ *weights_desc = conv.GetConvWeightsDescription();
+ return absl::make_unique<ConvPowerVR>(std::move(conv));
+}
+
+std::unique_ptr<GPUOperation> SelectConvolutionNVidia(const Convolution2DAttributes &attr,
+ const BHWC &dst_shape,
+ const DeviceInfo &device_info,
+ const OperationDef &op_def)
+{
+ if (IsConvConstantsSupported(device_info, op_def, attr))
+ {
+ GPUOperation conv = CreateConvConstants(device_info, op_def, attr);
+ return absl::make_unique<GPUOperation>(std::move(conv));
+ }
+ else
+ {
+ ConvPowerVR conv = CreateConvPowerVR(device_info, op_def, attr, &dst_shape);
+ return absl::make_unique<ConvPowerVR>(std::move(conv));
+ }
+}
+
+std::unique_ptr<GPUOperation> SelectConvolutionPowerVR(const Convolution2DAttributes &attr,
+ const DeviceInfo &device_info,
+ const OperationDef &op_def)
+{
+ ConvPowerVR conv = CreateConvPowerVR(device_info, op_def, attr);
+ return absl::make_unique<ConvPowerVR>(std::move(conv));
+}
+
+std::unique_ptr<GPUOperation> SelectConvolutionMali(const Convolution2DAttributes &attr,
+ const BHWC &dst_shape,
+ const DeviceInfo &device_info,
+ const OperationDef &op_def)
+{
+ if (op_def.src_tensors[0].storage_type == TensorStorageType::BUFFER &&
+ IsConvBuffer1x1Supported(op_def, attr))
+ {
+ ConvBuffer1x1 conv = CreateConvBuffer1x1(device_info, op_def, attr, &dst_shape);
+ return absl::make_unique<ConvBuffer1x1>(std::move(conv));
+ }
+ else
+ {
+ ConvPowerVR conv = CreateConvPowerVR(device_info, op_def, attr, &dst_shape);
+ return absl::make_unique<ConvPowerVR>(std::move(conv));
+ }
+}
+
+std::unique_ptr<GPUOperation> SelectConvolutionWinogradMali(const Convolution2DAttributes &attr,
+ const BHWC &dst_shape,
+ const DeviceInfo &device_info,
+ const OperationDef &op_def)
+{
+ if (op_def.src_tensors[0].storage_type == TensorStorageType::BUFFER)
+ {
+ ConvBuffer1x1 conv = CreateConvBuffer1x1Wino4x4To6x6(device_info, op_def, attr, &dst_shape);
+ return absl::make_unique<ConvBuffer1x1>(std::move(conv));
+ }
+ else
+ {
+ ConvPowerVR conv = CreateConvPowerVRWino4x4To6x6(device_info, op_def, attr, &dst_shape);
+ return absl::make_unique<ConvPowerVR>(std::move(conv));
+ }
+}
+
+std::unique_ptr<GPUOperation>
+SelectConvolutionDynamicWeightsMali(const Convolution2DAttributes &attr, const BHWC &weights_shape,
+ const BHWC &dst_shape, const DeviceInfo &device_info,
+ const OperationDef &op_def, ModelHints,
+ ConvWeightsDescription *weights_desc)
+{
+ if (op_def.src_tensors[0].storage_type == TensorStorageType::BUFFER &&
+ IsConvBuffer1x1Supported(op_def, weights_shape, attr))
+ {
+ ConvBuffer1x1 conv =
+ CreateConvBuffer1x1DynamicWeights(device_info, op_def, attr, weights_shape, &dst_shape);
+ *weights_desc = conv.GetConvWeightsDescription();
+ return absl::make_unique<ConvBuffer1x1>(std::move(conv));
+ }
+ else
+ {
+ ConvPowerVR conv =
+ CreateConvPowerVRDynamicWeights(device_info, op_def, attr, weights_shape, &dst_shape);
+ *weights_desc = conv.GetConvWeightsDescription();
+ return absl::make_unique<ConvPowerVR>(std::move(conv));
+ }
+}
+
+} // namespace
+
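+// Dispatches convolution selection by GPU vendor; PowerVR, AMD and Intel share
+// the ConvPowerVR path, and unknown vendors fall back to the Adreno path.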
+std::unique_ptr<GPUOperation> SelectConvolution(const Convolution2DAttributes &attr,
+ const BHWC &dst_shape,
+ const DeviceInfo &device_info,
+ const OperationDef &op_def, ModelHints hints)
+{
+ if (device_info.IsAdreno())
+ {
+ return SelectConvolutionAdreno(attr, dst_shape, device_info, op_def, hints);
+ }
+ else if (device_info.IsPowerVR() || device_info.IsAMD() || device_info.IsIntel())
+ {
+ return SelectConvolutionPowerVR(attr, device_info, op_def);
+ }
+ else if (device_info.IsNvidia())
+ {
+ return SelectConvolutionNVidia(attr, dst_shape, device_info, op_def);
+ }
+ else if (device_info.IsMali())
+ {
+ return SelectConvolutionMali(attr, dst_shape, device_info, op_def);
+ }
+ else
+ {
+ return SelectConvolutionAdreno(attr, dst_shape, device_info, op_def, hints);
+ }
+}
+
+std::unique_ptr<GPUOperation> SelectConvolutionForWinograd(const Convolution2DAttributes &attr,
+ const BHWC &dst_shape,
+ const DeviceInfo &device_info,
+ const OperationDef &op_def,
+ ModelHints hints)
+{
+ if (device_info.IsAdreno())
+ {
+ return SelectConvolutionWinogradAdreno(attr, dst_shape, device_info, op_def, hints);
+ }
+ else if (device_info.IsPowerVR() || device_info.IsAMD() || device_info.IsNvidia() ||
+ device_info.IsIntel())
+ {
+ ConvPowerVR conv = CreateConvPowerVRWino4x4To6x6(device_info, op_def, attr, &dst_shape);
+ return absl::make_unique<ConvPowerVR>(std::move(conv));
+ }
+ else if (device_info.IsMali())
+ {
+ return SelectConvolutionWinogradMali(attr, dst_shape, device_info, op_def);
+ }
+ else
+ {
+ return SelectConvolutionWinogradAdreno(attr, dst_shape, device_info, op_def, hints);
+ }
+}
+
+std::unique_ptr<GPUOperation>
+SelectConvolutionWithDynamicWeights(const Convolution2DAttributes &attr, const BHWC &weights_shape,
+ const BHWC &dst_shape, const DeviceInfo &device_info,
+ const OperationDef &op_def, ModelHints hints,
+ ConvWeightsDescription *weights_desc)
+{
+ if (device_info.IsAdreno())
+ {
+ return SelectConvolutionDynamicWeightsAdreno(attr, weights_shape, dst_shape, device_info,
+ op_def, hints, weights_desc);
+ }
+ else if (device_info.IsMali())
+ {
+ return SelectConvolutionDynamicWeightsMali(attr, weights_shape, dst_shape, device_info, op_def,
+ hints, weights_desc);
+ }
+ else
+ {
+ ConvPowerVR conv =
+ CreateConvPowerVRDynamicWeights(device_info, op_def, attr, weights_shape, &dst_shape);
+ *weights_desc = conv.GetConvWeightsDescription();
+ return absl::make_unique<ConvPowerVR>(std::move(conv));
+ }
+}
+
+std::unique_ptr<GPUOperation>
+SelectConverterToConvWeights(const ConvWeightsDescription &weights_desc, const OperationDef &op_def,
+ ModelHints)
+{
+ ConverterToConvWeights converter = ConverterToConvWeights(op_def, weights_desc);
+ return absl::make_unique<ConverterToConvWeights>(std::move(converter));
+}
+
+} // namespace gpu_cl
+} // namespace backend
+} // namespace onert
diff --git a/runtime/onert/backend/gpu_cl/open_cl/selectors/ConvolutionSelector.h b/runtime/onert/backend/gpu_cl/open_cl/selectors/ConvolutionSelector.h
new file mode 100644
index 000000000..d45eea8bd
--- /dev/null
+++ b/runtime/onert/backend/gpu_cl/open_cl/selectors/ConvolutionSelector.h
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __ONERT_BACKEND_GPU_CL_OPENCL_SELECTORS_CONVOLUTION_SELECTOR_H__
+#define __ONERT_BACKEND_GPU_CL_OPENCL_SELECTORS_CONVOLUTION_SELECTOR_H__
+
+#include <memory>
+
+#include "open_cl/kernels/ConvCommon.h"
+#include "open_cl/kernels/GpuOperation.h"
+#include "open_cl/ModelHints.h"
+#include "open_cl/Operations.h"
+#include "open_cl/Shape.h"
+#include "open_cl/Status.h"
+
+namespace onert
+{
+namespace backend
+{
+namespace gpu_cl
+{
+
+std::unique_ptr<GPUOperation> SelectConvolution(const Convolution2DAttributes &attr,
+ const BHWC &dst_shape,
+ const DeviceInfo &device_info,
+ const OperationDef &op_def, ModelHints hints);
+
+std::unique_ptr<GPUOperation> SelectConvolutionForWinograd(const Convolution2DAttributes &attr,
+ const BHWC &dst_shape,
+ const DeviceInfo &device_info,
+ const OperationDef &op_def,
+ ModelHints hints);
+
+std::unique_ptr<GPUOperation>
+SelectConvolutionWithDynamicWeights(const Convolution2DAttributes &attr, const BHWC &weights_shape,
+ const BHWC &dst_shape, const DeviceInfo &device_info,
+ const OperationDef &op_def, ModelHints hints,
+ ConvWeightsDescription *weights_desc);
+
+std::unique_ptr<GPUOperation>
+SelectConverterToConvWeights(const ConvWeightsDescription &weights_desc, const OperationDef &op_def,
+ ModelHints hints);
+
+} // namespace gpu_cl
+} // namespace backend
+} // namespace onert
+
+#endif // __ONERT_BACKEND_GPU_CL_OPENCL_SELECTORS_CONVOLUTION_SELECTOR_H__
diff --git a/runtime/onert/backend/gpu_cl/open_cl/selectors/DwConvolutionSelector.cc b/runtime/onert/backend/gpu_cl/open_cl/selectors/DwConvolutionSelector.cc
new file mode 100644
index 000000000..f07eef689
--- /dev/null
+++ b/runtime/onert/backend/gpu_cl/open_cl/selectors/DwConvolutionSelector.cc
@@ -0,0 +1,107 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "DwConvolutionSelector.h"
+
+#include "absl/memory/memory.h"
+#include "open_cl/ClDevice.h"
+#include "open_cl/kernels/DepthwiseConv.h"
+#include "open_cl/kernels/DepthwiseConv3x3.h"
+#include "open_cl/Precision.h"
+
+namespace onert
+{
+namespace backend
+{
+namespace gpu_cl
+{
+namespace
+{
+
+std::unique_ptr<GPUOperation>
+SelectDWConvolutionAdreno(const DepthwiseConvolution2DAttributes &attr,
+ const DeviceInfo &device_info, const OperationDef &op_def)
+{
+ if (IsDepthwiseConv3x3Supported(attr))
+ {
+ return absl::make_unique<DepthwiseConv3x3>(CreateDepthwiseConv3x3(device_info, op_def, attr));
+ }
+ else
+ {
+ return absl::make_unique<GPUOperation>(CreateDepthwiseConvolution2D(device_info, op_def, attr));
+ }
+}
+
+std::unique_ptr<GPUOperation>
+SelectDWConvolutionPowerVR(const DepthwiseConvolution2DAttributes &attr,
+ const DeviceInfo &device_info, const OperationDef &op_def)
+{
+ if (IsDepthwiseConv3x3Supported(attr))
+ {
+ return absl::make_unique<DepthwiseConv3x3>(CreateDepthwiseConv3x3(device_info, op_def, attr));
+ }
+ else
+ {
+ return absl::make_unique<GPUOperation>(CreateDepthwiseConvolution2D(device_info, op_def, attr));
+ }
+}
+
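+// On Mali the specialized 3x3 depthwise kernel is used only outside Midgard,
+// with texture-backed storage, and at less than full F32 precision; otherwise
+// the generic depthwise convolution kernel is selected.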
+std::unique_ptr<GPUOperation> SelectDWConvolutionMali(const DepthwiseConvolution2DAttributes &attr,
+ const DeviceInfo &device_info,
+ const OperationDef &op_def)
+{
+ const auto storage_type = op_def.src_tensors[0].storage_type;
+ bool buffer_type =
+ storage_type == TensorStorageType::BUFFER || storage_type == TensorStorageType::IMAGE_BUFFER;
+ const MaliInfo mali_info = device_info.mali_info;
+ if (IsDepthwiseConv3x3Supported(attr) && !mali_info.IsMidgard() && !buffer_type &&
+ op_def.precision != CalculationsPrecision::F32)
+ {
+ return absl::make_unique<DepthwiseConv3x3>(CreateDepthwiseConv3x3(device_info, op_def, attr));
+ }
+ else
+ {
+ return absl::make_unique<GPUOperation>(CreateDepthwiseConvolution2D(device_info, op_def, attr));
+ }
+}
+} // namespace
+
+std::unique_ptr<GPUOperation> SelectDWConvolution(const DepthwiseConvolution2DAttributes &attr,
+ const DeviceInfo &device_info,
+ const OperationDef &op_def)
+{
+ if (device_info.IsAdreno())
+ {
+ return SelectDWConvolutionAdreno(attr, device_info, op_def);
+ }
+ else if (device_info.IsPowerVR())
+ {
+ return SelectDWConvolutionPowerVR(attr, device_info, op_def);
+ }
+ else if (device_info.IsMali())
+ {
+ return SelectDWConvolutionMali(attr, device_info, op_def);
+ }
+ else
+ {
+ return SelectDWConvolutionAdreno(attr, device_info, op_def);
+ }
+}
+
+} // namespace gpu_cl
+} // namespace backend
+} // namespace onert
diff --git a/runtime/onert/backend/gpu_cl/open_cl/selectors/DwConvolutionSelector.h b/runtime/onert/backend/gpu_cl/open_cl/selectors/DwConvolutionSelector.h
new file mode 100644
index 000000000..2fa40c5c3
--- /dev/null
+++ b/runtime/onert/backend/gpu_cl/open_cl/selectors/DwConvolutionSelector.h
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __ONERT_BACKEND_GPU_CL_OPENCL_SELECTORS_DW_CONVOLUTION_SELECTOR_H__
+#define __ONERT_BACKEND_GPU_CL_OPENCL_SELECTORS_DW_CONVOLUTION_SELECTOR_H__
+
+#include <memory>
+
+#include "open_cl/kernels/GpuOperation.h"
+#include "open_cl/Operations.h"
+#include "open_cl/Status.h"
+
+namespace onert
+{
+namespace backend
+{
+namespace gpu_cl
+{
+
+std::unique_ptr<GPUOperation> SelectDWConvolution(const DepthwiseConvolution2DAttributes &attr,
+ const DeviceInfo &device_info,
+ const OperationDef &op_def);
+
+} // namespace gpu_cl
+} // namespace backend
+} // namespace onert
+
+#endif // __ONERT_BACKEND_GPU_CL_OPENCL_SELECTORS_DW_CONVOLUTION_SELECTOR_H__
diff --git a/runtime/onert/backend/gpu_cl/open_cl/selectors/SimpleSelectors.cc b/runtime/onert/backend/gpu_cl/open_cl/selectors/SimpleSelectors.cc
new file mode 100644
index 000000000..ac514b26c
--- /dev/null
+++ b/runtime/onert/backend/gpu_cl/open_cl/selectors/SimpleSelectors.cc
@@ -0,0 +1,98 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "SimpleSelectors.h"
+
+#include <memory>
+#include <set>
+
+#include "absl/memory/memory.h"
+#include "open_cl/kernels/Add.h"
+#include "open_cl/kernels/DepthwiseConv.h"
+#include "open_cl/kernels/Pooling.h"
+#include "open_cl/kernels/Relu.h"
+#include "open_cl/kernels/Reshape.h"
+#include "open_cl/kernels/Reshapex4.h"
+#include "open_cl/kernels/Softmax.h"
+#include "open_cl/kernels/Softmax1x1.h"
+
+namespace onert
+{
+namespace backend
+{
+namespace gpu_cl
+{
+
+void SelectAdd(const OperationDef &op_def, const std::vector<int> &channels, int dst_channels,
+ std::unique_ptr<GPUOperation> *ptr)
+{
+ GPUOperation operation = CreateAdd(op_def, channels, dst_channels);
+ *ptr = std::make_unique<GPUOperation>(std::move(operation));
+}
+
+std::unique_ptr<GPUOperation>
+SelectDWConvolutionDynamicWeights(const DepthwiseConvolution2DAttributes &attr,
+ const DeviceInfo &device_info, const OperationDef &op_def)
+{
+ return absl::make_unique<GPUOperation>(
+ CreateDepthwiseConvolution2DDynamicWeights(device_info, op_def, attr));
+}
+
+std::unique_ptr<GPUOperation> SelectPooling(const Pooling2DAttributes &attr,
+ const OperationDef &op_def)
+{
+ GPUOperation operation = CreatePooling(op_def, attr);
+ return absl::make_unique<GPUOperation>(std::move(operation));
+}
+
+std::unique_ptr<GPUOperation> SelectReLU(const ReLUAttributes &attr, const OperationDef &op_def)
+{
+ return absl::make_unique<GPUOperation>(CreateReLU(op_def, attr));
+}
+
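+// Uses the slice-aligned Reshapex4 kernel when both the source and destination
+// channel counts are multiples of 4; otherwise falls back to the generic
+// Reshape kernel.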
+void SelectReshape(int src_channels, int dst_channels, const OperationDef &op_def,
+ std::unique_ptr<GPUOperation> *ptr)
+{
+ if (src_channels % 4 == 0 && dst_channels % 4 == 0)
+ {
+ GPUOperation operation = CreateReshapex4(op_def);
+ *ptr = std::make_unique<GPUOperation>(std::move(operation));
+ }
+ else
+ {
+ GPUOperation operation = CreateReshape(op_def);
+ *ptr = std::make_unique<GPUOperation>(std::move(operation));
+ }
+}
+
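+// Uses the specialized Softmax1x1 kernel when the spatial size is 1x1 and the
+// generic softmax kernel otherwise.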
+void SelectSoftmax(const BHWC &shape, const OperationDef &op_def,
+ std::unique_ptr<GPUOperation> *ptr)
+{
+ if (shape.w == 1 && shape.h == 1)
+ {
+ Softmax1x1 operation = CreateSoftmax1x1(op_def);
+ *ptr = absl::make_unique<Softmax1x1>(std::move(operation));
+ }
+ else
+ {
+ GPUOperation operation = CreateSoftmax(op_def);
+ *ptr = absl::make_unique<GPUOperation>(std::move(operation));
+ }
+}
+
+} // namespace gpu_cl
+} // namespace backend
+} // namespace onert
diff --git a/runtime/onert/backend/gpu_cl/open_cl/selectors/SimpleSelectors.h b/runtime/onert/backend/gpu_cl/open_cl/selectors/SimpleSelectors.h
new file mode 100644
index 000000000..2c5837a1d
--- /dev/null
+++ b/runtime/onert/backend/gpu_cl/open_cl/selectors/SimpleSelectors.h
@@ -0,0 +1,57 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __ONERT_BACKEND_GPU_CL_OPENCL_SELECTORS_SIMPLE_SELECTORS_H__
+#define __ONERT_BACKEND_GPU_CL_OPENCL_SELECTORS_SIMPLE_SELECTORS_H__
+
+#include <memory>
+
+#include "open_cl/ClDevice.h"
+#include "open_cl/kernels/GpuOperation.h"
+#include "open_cl/Operations.h"
+#include "open_cl/Shape.h"
+
+namespace onert
+{
+namespace backend
+{
+namespace gpu_cl
+{
+
+void SelectAdd(const OperationDef &op_def, const std::vector<int> &channels, int dst_channels,
+ std::unique_ptr<GPUOperation> *ptr);
+
+std::unique_ptr<GPUOperation>
+SelectDWConvolutionDynamicWeights(const DepthwiseConvolution2DAttributes &attr,
+ const DeviceInfo &device_info, const OperationDef &op_def);
+
+std::unique_ptr<GPUOperation> SelectPooling(const Pooling2DAttributes &attr,
+ const OperationDef &op_def);
+
+std::unique_ptr<GPUOperation> SelectReLU(const ReLUAttributes &attr, const OperationDef &op_def);
+
+void SelectReshape(int src_channels, int dst_channels, const OperationDef &op_def,
+ std::unique_ptr<GPUOperation> *ptr);
+
+void SelectSoftmax(const BHWC &shape, const OperationDef &op_def,
+ std::unique_ptr<GPUOperation> *ptr);
+
+} // namespace gpu_cl
+} // namespace backend
+} // namespace onert
+
+#endif // __ONERT_BACKEND_GPU_CL_OPENCL_SELECTORS_SIMPLE_SELECTORS_H__
diff --git a/runtime/onert/backend/gpu_cl/operand/CLTensor.cc b/runtime/onert/backend/gpu_cl/operand/CLTensor.cc
new file mode 100644
index 000000000..6dd9bd252
--- /dev/null
+++ b/runtime/onert/backend/gpu_cl/operand/CLTensor.cc
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "CLTensor.h"
+
+#include "open_cl/Buffer.h"
+#include "open_cl/ClContext.h"
+#include "open_cl/Tensor.h"
+#include "open_cl/TensorType.h"
+
+namespace onert
+{
+namespace backend
+{
+namespace gpu_cl
+{
+namespace operand
+{
+
+CLTensor::CLTensor(size_t rank, ir::Shape shape, std::shared_ptr<Environment> environment)
+ : ICLTensor{rank, shape, environment}, _tensor(std::make_shared<Tensor>())
+{
+}
+
+const Tensor *CLTensor::handle() const { return _tensor.get(); }
+
+Tensor *CLTensor::handle() { return _tensor.get(); }
+
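+// Currently a no-op: the provided host pointer is ignored.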
+void CLTensor::setBuffer(void *host_ptr) { (void)host_ptr; }
+
+} // namespace operand
+} // namespace gpu_cl
+} // namespace backend
+} // namespace onert
diff --git a/runtime/onert/backend/gpu_cl/operand/CLTensor.h b/runtime/onert/backend/gpu_cl/operand/CLTensor.h
new file mode 100644
index 000000000..7d2e70a99
--- /dev/null
+++ b/runtime/onert/backend/gpu_cl/operand/CLTensor.h
@@ -0,0 +1,66 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __ONERT_BACKEND_GPU_CL_OPERAND_CL_TENSOR_H__
+#define __ONERT_BACKEND_GPU_CL_OPERAND_CL_TENSOR_H__
+
+#include "ICLTensor.h"
+
+#include "open_cl/Buffer.h"
+#include "open_cl/ClContext.h"
+#include "open_cl/Tensor.h"
+
+namespace onert
+{
+namespace backend
+{
+namespace gpu_cl
+{
+namespace operand
+{
+
+class CLTensor : public ICLTensor
+{
+public:
+ CLTensor() = delete;
+
+public:
+ CLTensor(size_t rank, ir::Shape shape, std::shared_ptr<Environment> environment);
+
+public:
+ const Tensor *handle() const override;
+ Tensor *handle() override;
+
+public:
+ /** Set the given buffer as the buffer of this tensor
+ *
+ * @note Ownership of the memory is not transferred to this object.
+ * Thus, management (allocate/free) should be done by the client.
+ *
+ * @param[in] host_ptr Storage to be used.
+ */
+ void setBuffer(void *host_ptr);
+
+private:
+ std::shared_ptr<Tensor> _tensor;
+};
+
+} // namespace operand
+} // namespace gpu_cl
+} // namespace backend
+} // namespace onert
+
+#endif // __ONERT_BACKEND_GPU_CL_OPERAND_CL_TENSOR_H__
diff --git a/runtime/onert/backend/gpu_cl/operand/ICLTensor.cc b/runtime/onert/backend/gpu_cl/operand/ICLTensor.cc
new file mode 100644
index 000000000..3f070be0c
--- /dev/null
+++ b/runtime/onert/backend/gpu_cl/operand/ICLTensor.cc
@@ -0,0 +1,223 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "ICLTensor.h"
+
+#include "open_cl/Api.h"
+#include "open_cl/Spi.h"
+#include "open_cl/OpenclWrapper.h"
+#include "open_cl/TensorTypeUtil.h"
+#include "open_cl/kernels/Converter.h"
+
+namespace onert
+{
+namespace backend
+{
+namespace gpu_cl
+{
+namespace operand
+{
+
+void ICLTensor::access(const std::function<void(ITensor &tensor)> &fn)
+{
+ if (total_size() == 0)
+ return;
+
+ fn(*this);
+}
+
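+// Uploads host float data (BHWC) in two steps: the CPU memory is first
+// converted into a temporary device object laid out as BHWC, which is then
+// converted into the tensor's own storage type and data type.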
+void ICLTensor::enqueueWriteBuffer(const void *ptr, bool)
+{
+ const float *arr = (float *)ptr;
+ TensorObject input_obj = MakeReadableCpuMemory(absl::MakeSpan(arr, total_size() / 4));
+
+ TensorObject output_obj;
+
+ if (handle()->GetStorageType() == TensorStorageType::BUFFER)
+ {
+ output_obj = OpenClBuffer{handle()->GetMemoryPtr()};
+ }
+ else if (handle()->GetStorageType() == TensorStorageType::IMAGE_BUFFER)
+ {
+ output_obj = OpenClBuffer{handle()->GetMemoryPtrForWriting()};
+ }
+ else
+ {
+ output_obj = OpenClTexture{handle()->GetMemoryPtr()};
+ }
+
+ TensorObjectDef input_def;
+ input_def.dimensions.b = handle()->Batch();
+ input_def.dimensions.h = handle()->Height();
+ input_def.dimensions.w = handle()->Width();
+ input_def.dimensions.c = handle()->Channels();
+ input_def.object_def.data_layout = DataLayout::BHWC;
+ input_def.object_def.data_type = DataType::FLOAT32;
+ input_def.object_def.object_type = ObjectType::CPU_MEMORY;
+ input_def.object_def.user_provided = true;
+
+ TensorObjectDef tmp_def;
+ tmp_def.dimensions.b = handle()->Batch();
+ tmp_def.dimensions.h = handle()->Height();
+ tmp_def.dimensions.w = handle()->Width();
+ tmp_def.dimensions.c = handle()->Channels();
+ tmp_def.object_def.data_layout = DataLayout::BHWC;
+ tmp_def.object_def.data_type = DataType::FLOAT32;
+ tmp_def.object_def.object_type = ToObjectType(handle()->GetStorageType());
+ tmp_def.object_def.user_provided = true;
+
+ auto dims = tmp_def.dimensions;
+ const BHWC shape(dims.b, dims.h, dims.w, dims.c);
+ const TensorDescriptor desc{
+ tmp_def.object_def.data_type,
+ ToTensorStorageType(tmp_def.object_def.object_type, tmp_def.object_def.data_layout),
+ Layout::BHWC};
+ if (!AllocateTensorMemory(_environment->context(), shape, desc, &_cl_memory).ok())
+ {
+ throw std::runtime_error("AllocateTensorMemory error.");
+ }
+ TensorObject tmp_obj;
+ if (tmp_def.object_def.object_type == ObjectType::OPENCL_TEXTURE)
+ {
+ tmp_obj = OpenClTexture{_cl_memory.memory()};
+ }
+ else
+ {
+ tmp_obj = OpenClBuffer{_cl_memory.memory()};
+ }
+
+ TensorObjectDef output_def = input_def;
+ output_def.dimensions.b = handle()->Batch();
+ output_def.dimensions.h = handle()->Height();
+ output_def.dimensions.w = handle()->Width();
+ output_def.dimensions.c = handle()->Channels();
+ output_def.object_def.data_layout = ToDataLayout(handle()->GetStorageType());
+ output_def.object_def.data_type = handle()->GetDataType();
+ output_def.object_def.object_type = ToObjectType(handle()->GetStorageType());
+
+ _converter_builder = NewConverterBuilder(_environment.get());
+ if (!_converter_builder->MakeConverter(input_def, tmp_def, &_converter_cpu).ok())
+ {
+ throw std::runtime_error("MakeConverter<_converter_cpu> error.");
+ }
+ if (!_converter_builder->MakeConverter(tmp_def, output_def, &_converter_bhwc).ok())
+ {
+ throw std::runtime_error("MakeConverter<_converter_bhwc> error.");
+ }
+
+ if (!_converter_cpu->Convert(input_obj, tmp_obj).ok())
+ {
+ throw std::runtime_error("[w] _converter_cpu Convert error.");
+ }
+ if (!_converter_bhwc->Convert(tmp_obj, output_obj).ok())
+ {
+ throw std::runtime_error("[w] _converter_bhwc Convert error.");
+ }
+}
+
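+// Mirror of enqueueWriteBuffer: the tensor is first converted into a temporary
+// BHWC device object, which is then converted out to host CPU memory as float
+// data.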
+void ICLTensor::enqueueReadBuffer(void *ptr, bool)
+{
+ float *arr = (float *)ptr;
+ TensorObject output_obj = MakeCpuMemory(absl::MakeSpan(arr, total_size() / 4));
+
+ TensorObject input_obj;
+
+ if (handle()->GetStorageType() == TensorStorageType::BUFFER)
+ {
+ input_obj = OpenClBuffer{handle()->GetMemoryPtr()};
+ }
+ else if (handle()->GetStorageType() == TensorStorageType::IMAGE_BUFFER)
+ {
+ input_obj = OpenClBuffer{handle()->GetMemoryPtrForWriting()};
+ }
+ else
+ {
+ input_obj = OpenClTexture{handle()->GetMemoryPtr()};
+ }
+
+ TensorObjectDef input_def;
+ input_def.dimensions.b = handle()->Batch();
+ input_def.dimensions.h = handle()->Height();
+ input_def.dimensions.w = handle()->Width();
+ input_def.dimensions.c = handle()->Channels();
+ input_def.object_def.data_layout = ToDataLayout(handle()->GetStorageType());
+ input_def.object_def.data_type = handle()->GetDataType();
+ input_def.object_def.object_type = ToObjectType(handle()->GetStorageType());
+ input_def.object_def.user_provided = false;
+
+ TensorObjectDef tmp_def;
+ tmp_def.dimensions.b = handle()->Batch();
+ tmp_def.dimensions.h = handle()->Height();
+ tmp_def.dimensions.w = handle()->Width();
+ tmp_def.dimensions.c = handle()->Channels();
+ tmp_def.object_def.data_layout = DataLayout::BHWC;
+ tmp_def.object_def.data_type = DataType::FLOAT32;
+ tmp_def.object_def.object_type = ToObjectType(handle()->GetStorageType());
+ tmp_def.object_def.user_provided = true;
+
+ auto dims = tmp_def.dimensions;
+ const BHWC shape(dims.b, dims.h, dims.w, dims.c);
+ const TensorDescriptor desc{
+ tmp_def.object_def.data_type,
+ ToTensorStorageType(tmp_def.object_def.object_type, tmp_def.object_def.data_layout),
+ Layout::BHWC};
+ if (!AllocateTensorMemory(_environment->context(), shape, desc, &_cl_memory).ok())
+ {
+ throw std::runtime_error("AllocateTensorMemory error.");
+ }
+ TensorObject tmp_obj;
+ if (tmp_def.object_def.object_type == ObjectType::OPENCL_TEXTURE)
+ {
+ tmp_obj = OpenClTexture{_cl_memory.memory()};
+ }
+ else
+ {
+ tmp_obj = OpenClBuffer{_cl_memory.memory()};
+ }
+ TensorObjectDef output_def = input_def;
+ output_def.dimensions.b = handle()->Batch();
+ output_def.dimensions.h = handle()->Height();
+ output_def.dimensions.w = handle()->Width();
+ output_def.dimensions.c = handle()->Channels();
+ output_def.object_def.data_layout = DataLayout::BHWC;
+ output_def.object_def.data_type = DataType::FLOAT32;
+ output_def.object_def.object_type = ObjectType::CPU_MEMORY;
+ output_def.object_def.user_provided = true;
+
+ _converter_builder = NewConverterBuilder(_environment.get());
+ if (!_converter_builder->MakeConverter(input_def, tmp_def, &_converter_bhwc).ok())
+ {
+ throw std::runtime_error("MakeConverter<_converter_bhwc> error.");
+ }
+ if (!_converter_builder->MakeConverter(tmp_def, output_def, &_converter_cpu).ok())
+ {
+ throw std::runtime_error("MakeConverter<_converter_cpu> error.");
+ }
+
+ if (!_converter_bhwc->Convert(input_obj, tmp_obj).ok())
+ {
+ throw std::runtime_error("[r] _converter_bhwc Convert error.");
+ }
+ if (!_converter_cpu->Convert(tmp_obj, output_obj).ok())
+ {
+ throw std::runtime_error("[r] _converter_cpu Convert error.");
+ }
+}
+
+} // namespace operand
+} // namespace gpu_cl
+} // namespace backend
+} // namespace onert
diff --git a/runtime/onert/backend/gpu_cl/operand/ICLTensor.h b/runtime/onert/backend/gpu_cl/operand/ICLTensor.h
new file mode 100644
index 000000000..28e905d48
--- /dev/null
+++ b/runtime/onert/backend/gpu_cl/operand/ICLTensor.h
@@ -0,0 +1,106 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __ONERT_BACKEND_GPU_CL_OPERAND_I_CL_TENSOR_H__
+#define __ONERT_BACKEND_GPU_CL_OPERAND_I_CL_TENSOR_H__
+
+#include <backend/ITensor.h>
+
+#include "open_cl/Api.h"
+#include "open_cl/Spi.h"
+#include "open_cl/ClCommandQueue.h"
+#include "open_cl/kernels/Converter.h"
+#include "open_cl/Tensor.h"
+
+namespace onert
+{
+namespace backend
+{
+namespace gpu_cl
+{
+namespace operand
+{
+
+class ICLTensor : public ITensor
+{
+public:
+ ICLTensor() = default;
+ ICLTensor(const ICLTensor &) = delete;
+ ICLTensor &operator=(const ICLTensor &) = delete;
+ ICLTensor(ICLTensor &&) = default;
+ ICLTensor &operator=(ICLTensor &&) = default;
+
+ ICLTensor(size_t rank, ir::Shape shape, std::shared_ptr<Environment> environment)
+ : _rank{rank}, _shape{shape}, _environment(environment)
+ {
+ }
+
+public:
+ uint8_t *buffer() const final { return reinterpret_cast<uint8_t *>(handle()->GetMemoryPtr()); }
+ size_t total_size() const final { return _shape.num_elements() * sizeof(float); }
+ size_t calcOffset(const ir::Coordinates &coords) const final
+ {
+ // NYI
+ (void)coords;
+ return 0;
+ }
+ ir::Layout layout() const final { return ir::Layout::NHWC; }
+ ir::DataType data_type() const final { return ir::DataType::FLOAT32; }
+ float data_scale() const override
+ {
+ throw std::runtime_error("ICLTensor::data_scale() is not supported.");
+ }
+ int32_t data_zero_point() const override
+ {
+ throw std::runtime_error("ICLTensor::data_zero_point() is not supported.");
+ }
+ const std::vector<float> &data_scales() const override
+ {
+ throw std::runtime_error("ICLTensor::data_scales() is not supported.");
+ }
+ const std::vector<int32_t> &data_zero_points() const override
+ {
+ throw std::runtime_error("ICLTensor::data_zero_points() is not supported.");
+ }
+ bool is_dynamic() const override { return false; }
+ ir::Shape getShape() const override { return _shape; }
+ bool has_padding() const override { return false; }
+ void access(const std::function<void(ITensor &tensor)> &fn) final;
+ bool needMemoryMap() const final { return true; }
+ void enqueueWriteBuffer(const void *ptr, bool blocking = true) final;
+ void enqueueReadBuffer(void *ptr, bool blocking = true) final;
+
+public:
+ virtual const Tensor *handle() const = 0;
+ virtual Tensor *handle() = 0;
+
+private:
+protected:
+ size_t _rank; // Actual rank (reflects extended rank)
+ ir::Shape _shape;
+ std::shared_ptr<Environment> _environment;
+ std::unique_ptr<TensorObjectConverterBuilder> _converter_builder;
+ CLMemory _cl_memory;
+ std::unique_ptr<TensorObjectConverter> _converter_cpu;
+ std::unique_ptr<TensorObjectConverter> _converter_bhwc;
+};
+
+} // namespace operand
+} // namespace gpu_cl
+} // namespace backend
+} // namespace onert
+
+#endif // __ONERT_BACKEND_GPU_CL_OPERAND_I_CL_TENSOR_H__
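
For orientation, the enqueueWriteBuffer/enqueueReadBuffer paths above move host data through an intermediate tensor object and a converter pair rather than a plain memcpy. A minimal sketch of the resulting host-side round trip, assuming a concrete ICLTensor-derived object bound to a FLOAT32 tensor and an include path of "operand/ICLTensor.h" (both assumptions, not part of the patch):

#include <vector>

#include "operand/ICLTensor.h"

// Illustrative only: host data round trip through the converter chain sketched above.
void round_trip(onert::backend::gpu_cl::operand::ICLTensor &tensor)
{
  std::vector<float> input(tensor.total_size() / sizeof(float), 0.5f);
  std::vector<float> output(tensor.total_size() / sizeof(float));

  // Host BHWC floats -> temporary tensor object -> backend storage (converters are rebuilt per call)
  tensor.enqueueWriteBuffer(input.data());
  // ... run the kernels that consume/produce this tensor ...
  // Backend storage -> temporary tensor object -> host BHWC floats
  tensor.enqueueReadBuffer(output.data());
}
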
diff --git a/runtime/onert/core/include/compiler/Compiler.h b/runtime/onert/core/include/compiler/Compiler.h
index c2589f6d5..292de4b12 100644
--- a/runtime/onert/core/include/compiler/Compiler.h
+++ b/runtime/onert/core/include/compiler/Compiler.h
@@ -45,6 +45,11 @@ struct ManualSchedulerOptions
std::unordered_map<ir::OperationIndex, std::string> index_to_backend;
};
+struct PartialGraphOptions
+{
+ std::unordered_map<ir::OperationIndex, ir::SubgraphIndex> index_to_graph;
+};
+
struct CompilerOptions
{
// GENERAL OPTIONS
@@ -59,6 +64,7 @@ struct CompilerOptions
bool he_profiling_mode; //< Whether HEScheduler profiling mode ON/OFF
bool disable_compile; //< Run with Interpreter if true, try compilation otherwise
bool fp16_enable; //< Whether fp16 mode ON/OFF
+ PartialGraphOptions partial_graph_options;
util::TracingCtx *tracing_ctx; //< Profiling information
};
@@ -86,6 +92,15 @@ public:
*/
std::shared_ptr<exec::ExecutorMap> compile(void);
+ /**
+ * @brief Do compilation with the options
+ *
+ * @return std::vector<std::shared_ptr<exec::ExecutorMap>> Executors as a result of compilation
+ * for pipeline
+ */
+ std::vector<std::shared_ptr<exec::ExecutorMap>> compile(const char *package_file_path,
+ const char *map_file_path);
+
State state(void) const { return _state; }
CompilerOptions &options() { return _options; }
@@ -95,6 +110,17 @@ public:
*/
void enableToFp16();
+ /**
+ * @brief Set backends from string-encoded mappings from operation index to backend type (cpu,
+ * acl_cl)
+ */
+ void set_backend_from_str(const char *backend_settings);
+
+ /**
+ * @brief Build the partial graphs to compile from the original graph
+ */
+ bool buildPartialGraph(uint32_t num_graphs);
+
private:
void checkProfilerConditions();
std::shared_ptr<ir::Graph> &primary_subgraph() { return _subgraphs->at(ir::SubgraphIndex{0}); }
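
To show how these new entry points fit together, here is a minimal, hypothetical driver sketch (not part of the patch). It assumes a Compiler instance has already been constructed from a loaded nn package; the package path and the backend mapping string are placeholders.

#include "compiler/Compiler.h"

// Illustrative only: drive the new pipeline compilation path.
void compile_for_pipeline(onert::compiler::Compiler &compiler)
{
  // Per-operation backend assignment, encoded as "<operation index>=<backend>;...".
  compiler.set_backend_from_str("0=cpu;1=gpu_cl");

  // One ExecutorMap is returned per partition listed in partition_map.json.
  // Passing nullptr for the map path falls back to "<package>/partition_map.json".
  auto executor_maps = compiler.compile("/path/to/nnpackage", nullptr);
  (void)executor_maps; // each element drives one pipeline stage
}
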
diff --git a/runtime/onert/core/include/compiler/LoweredGraph.h b/runtime/onert/core/include/compiler/LoweredGraph.h
index 925de3485..d3ef6d4af 100644
--- a/runtime/onert/core/include/compiler/LoweredGraph.h
+++ b/runtime/onert/core/include/compiler/LoweredGraph.h
@@ -36,9 +36,13 @@ class LoweredGraph
{
public:
LoweredGraph(const ir::Graph &graph, const compiler::CompilerOptions &options);
+ LoweredGraph(const ir::Graph &parent_graph, const ir::Graph &graph,
+ const compiler::CompilerOptions &options);
ir::Graph &graph() { return _graph; }
const ir::Graph &graph() const { return _graph; }
+ ir::Graph &parent_graph() { return _parent_graph; }
+ const ir::Graph &parent_graph() const { return _parent_graph; }
const compiler::GraphLowerInfo &lower_info() const { return _lower_info_map; }
compiler::GraphLowerInfo &lower_info() { return _lower_info_map; }
std::shared_ptr<ir::OperationIndexMap<int64_t>> indexed_ranks() { return _indexed_ranks; }
@@ -59,6 +63,7 @@ private:
private:
ir::Graph _graph;
+ ir::Graph _parent_graph;
std::shared_ptr<ir::OperationIndexMap<int64_t>> _indexed_ranks;
compiler::GraphLowerInfo _lower_info_map;
ir::OperationIndexMap<bool> _has_dynamic_tensor_map;
diff --git a/runtime/onert/core/include/exec/Execution.h b/runtime/onert/core/include/exec/Execution.h
index d3c5b6dda..b0a5cced3 100644
--- a/runtime/onert/core/include/exec/Execution.h
+++ b/runtime/onert/core/include/exec/Execution.h
@@ -26,6 +26,8 @@
#include "IODescription.h"
#include <thread>
+#include <deque>
+#include <semaphore.h>
namespace onert
{
@@ -53,6 +55,7 @@ public:
*/
const ir::Graph &primary_subgraph() const { return primary_executor()->graph(); }
+ const ir::Graph &primary_parentgraph() const { return primary_executor()->parent_graph(); }
/**
* @brief Change input shape
* @param[in] index Input index
@@ -69,6 +72,7 @@ public:
*/
void setInput(const ir::IOIndex &index, const void *buffer, size_t length,
ir::Layout layout = ir::Layout::NHWC);
+
/**
* @brief Set input data's information, especially to specify unknown dimensions on model
* build time.
@@ -142,6 +146,102 @@ public:
ir::Shape getInputShape(ir::IOIndex ind) const;
ir::Shape getOutputShape(ir::IOIndex ind) const;
+ //
+ // Experimental API
+ //
+
+ // accessor
+ std::vector<
+ std::tuple<std::shared_ptr<onert::exec::Execution>, onert::ir::IOIndex, onert::ir::IOIndex>>
+ getNextExes()
+ {
+ return next_exes;
+ }
+ std::deque<std::pair<IODescription *, uint32_t>> *getAsyncIoDescs() { return &_async_io_descs; }
+ std::deque<std::vector<void *>> *getAsyncResults() { return &_async_results; }
+
+ /**
+ * @brief Push IO information between related executions into next_exes
+ * @param[in] next address of next execution
+ * @param[in] o_index Output index of current execution (it will be the input of next execution)
+ * @param[in] i_index Input index of next execution
+ */
+ void pushNextExe(std::shared_ptr<onert::exec::Execution> next, onert::ir::IOIndex o_index,
+ onert::ir::IOIndex i_index)
+ {
+ next_exes.push_back({next, o_index, i_index});
+ }
+
+ /**
+ * @brief Create a new IODescription instance for the next set of inputs and outputs
+ * @param[in] count Inference count used to match this descriptor between pipelined executions
+ */
+ void createNewAsyncDesc(uint32_t count = 0);
+
+ /**
+ * @brief Set async input data's information
+ * @param[in] index Input index
+ * @param[in] buffer Input data's buffer pointer
+ * @param[in] length Input data's length
+ * @param[in] layout Input data's data format
+ */
+ void executeAsyncInput(const ir::IOIndex &index, const void *buffer, size_t length,
+ ir::Layout layout = ir::Layout::NHWC);
+
+ /**
+ * @brief Set async output data's information
+ * @param[in] index Output index
+ * @param[in] buffer Output data's buffer pointer
+ * @param[in] length Output data's length
+ * @param[in] layout Output data's data format
+ */
+ void executeAsyncOutput(const ir::IOIndex &index, void *buffer, size_t length,
+ ir::Layout layout = ir::Layout::NHWC);
+
+ /**
+ * @brief Async execution
+ * @note It should be called after setting input and output buffer
+ */
+ void AsyncExecute();
+
+ /**
+ * @brief Mark this execution as finished so that pipelined consumers can stop
+ */
+ void setFinish();
+
+ /**
+ * @brief Check if input queue is empty
+ * @return @c true if the queue is empty or its front inputs are not yet all set, otherwise @c false
+ */
+ bool isEmptyQueue();
+
+ /**
+ * @brief Wait on the semaphore to prevent race conditions
+ */
+ void asyncIoDescSemWait();
+
+ /**
+ * @brief Post the semaphore to prevent race conditions
+ */
+ void asyncIoDescSemPost();
+
+ /**
+ * @brief Run inference
+ * @note This function is provided to the consumer thread for pipelining
+ */
+ void runInference();
+
+ /**
+ * @brief Check if stop_wait is true
+ * @return @c true if stop_wait is true, otherwise @c false
+ */
+ bool stopWait(void) const;
+
+ /**
+ * @brief Set stop_wait to terminate consumer thread
+ */
+ void sholudStop();
+
private:
const std::unique_ptr<IExecutor> &primary_executor() const
{
@@ -152,8 +252,15 @@ private:
private:
const std::shared_ptr<ExecutorMap> _executors;
IODescription _io_desc;
+ std::deque<std::pair<IODescription *, uint32_t>> _async_io_descs;
+ sem_t _async_io_descs_sem;
+ std::deque<std::vector<void *>> _async_results;
+ std::vector<
+ std::tuple<std::shared_ptr<onert::exec::Execution>, onert::ir::IOIndex, onert::ir::IOIndex>>
+ next_exes;
std::unique_ptr<std::thread> _exec_thread;
bool finished{false};
+ bool stop_wait{false};
};
} // namespace exec
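
The experimental additions above amount to a producer/consumer pipeline API. The sketch below is illustrative only; the two Execution instances, the frame layout, and the I/O index values are assumptions, and error handling is omitted. It chains two stages, feeds frames into the first, and lets consumer threads drain the queues.

#include <memory>
#include <thread>
#include <vector>

#include "exec/Execution.h"

// Illustrative only: pipeline two executions with the experimental API.
void run_pipeline(std::shared_ptr<onert::exec::Execution> exe1,
                  std::shared_ptr<onert::exec::Execution> exe2,
                  const std::vector<std::vector<float>> &frames)
{
  using onert::ir::IOIndex;

  // Output 0 of the first stage feeds input 0 of the second stage.
  exe1->pushNextExe(exe2, IOIndex{0}, IOIndex{0});

  // Consumer threads loop in runInference() until the finish flag propagates through the stages.
  std::thread t1([&] { exe1->runInference(); });
  std::thread t2([&] { exe2->runInference(); });

  uint32_t inference_cnt = 0;
  for (const auto &frame : frames)
  {
    exe1->asyncIoDescSemWait();
    exe1->createNewAsyncDesc(inference_cnt++);
    exe1->executeAsyncInput(IOIndex{0}, frame.data(), frame.size() * sizeof(float));
    exe1->asyncIoDescSemPost();
  }
  exe1->setFinish();

  t1.join();
  t2.join();
  // Final-stage outputs accumulate in exe2->getAsyncResults(); the caller should eventually free them.
}
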
diff --git a/runtime/onert/core/include/exec/IExecutor.h b/runtime/onert/core/include/exec/IExecutor.h
index 51fc67af4..adc68074f 100644
--- a/runtime/onert/core/include/exec/IExecutor.h
+++ b/runtime/onert/core/include/exec/IExecutor.h
@@ -69,6 +69,13 @@ struct IExecutor
virtual const ir::Graph &graph() = 0;
/**
+ * @brief Returns parent graph object
+ *
+ * @return Graph object
+ */
+ virtual const ir::Graph &parent_graph() = 0;
+
+ /**
* @brief Set an ordering on operations
* @param[in] ranks The table encoding the ordering
*/
diff --git a/runtime/onert/core/include/exec/IODescription.h b/runtime/onert/core/include/exec/IODescription.h
index 43d4015d5..14c5ffc2b 100644
--- a/runtime/onert/core/include/exec/IODescription.h
+++ b/runtime/onert/core/include/exec/IODescription.h
@@ -19,6 +19,7 @@
#include <vector>
#include <unordered_map>
+#include <semaphore.h>
#include "ir/OperandInfo.h"
#include "ir/Index.h"
diff --git a/runtime/onert/core/include/ir/Graph.h b/runtime/onert/core/include/ir/Graph.h
index 5543d9559..7a7688334 100644
--- a/runtime/onert/core/include/ir/Graph.h
+++ b/runtime/onert/core/include/ir/Graph.h
@@ -88,6 +88,15 @@ public:
void removeOperand(const OperandIndex &ind) { _operands.remove(ind); }
void setLayout(Layout layout) { _layout = layout; }
void setSubgraphs(const std::shared_ptr<Subgraphs> &subgs) { _subgraphs = subgs; }
+ void setPartialgraphs(const std::shared_ptr<Subgraphs> &partialgraphs)
+ {
+ _partialgraphs = partialgraphs;
+ }
+ void
+ setTensorName(std::shared_ptr<std::unordered_map<ir::OperandIndex, std::string>> &tensor_names)
+ {
+ _tensor_names = tensor_names;
+ }
private:
bool checkOperandsForOperation(const Operation &operation);
@@ -128,6 +137,29 @@ public:
const std::shared_ptr<Subgraphs> &subgraphs() const { return _subgraphs; }
std::shared_ptr<Subgraphs> &subgraphs() { return _subgraphs; }
Layout layout() const { return _layout; }
+ std::shared_ptr<Subgraphs> &partialgraphs() { return _partialgraphs; }
+ std::shared_ptr<std::unordered_map<ir::OperandIndex, std::string>> &tensor_names()
+ {
+ return _tensor_names;
+ }
+ std::unordered_map<std::string, IOIndex>::iterator _name_to_input_begin()
+ {
+ return _name_to_input.begin();
+ }
+ std::unordered_map<std::string, IOIndex>::iterator _name_to_input_end()
+ {
+ return _name_to_input.end();
+ }
+ std::unordered_map<std::string, IOIndex>::iterator _name_to_output_begin()
+ {
+ return _name_to_output.begin();
+ }
+ std::unordered_map<std::string, IOIndex>::iterator _name_to_output_end()
+ {
+ return _name_to_output.end();
+ }
+ void input_sort() { _inputs.sort(); }
+ void output_sort() { _outputs.sort(); }
// Topological sort
public:
@@ -144,6 +176,10 @@ private:
std::shared_ptr<Subgraphs> _subgraphs;
// TFLite and circle's default layout is NHWC;
Layout _layout{Layout::NHWC};
+
+ // Partial Graphs
+ std::shared_ptr<ir::Subgraphs> _partialgraphs;
+ std::shared_ptr<std::unordered_map<ir::OperandIndex, std::string>> _tensor_names;
};
} // namespace ir
diff --git a/runtime/onert/core/include/ir/OperandIndexSequence.h b/runtime/onert/core/include/ir/OperandIndexSequence.h
index 846c3f950..dd390748b 100644
--- a/runtime/onert/core/include/ir/OperandIndexSequence.h
+++ b/runtime/onert/core/include/ir/OperandIndexSequence.h
@@ -19,6 +19,7 @@
#include <initializer_list>
#include <vector>
+#include <algorithm>
#include "ir/Index.h"
@@ -45,6 +46,12 @@ public:
void append(const OperandIndex &index) { _vec.emplace_back(index); }
void append(const OperandIndexSequence &l) { _vec.insert(_vec.end(), l.begin(), l.end()); }
+ void sort()
+ {
+ std::sort(_vec.begin(), _vec.end(),
+ [](const auto &lhs, const auto &rhs) { return lhs.value() < rhs.value(); });
+ }
+
public:
uint32_t size() const { return static_cast<uint32_t>(_vec.size()); }
const OperandIndex &at(IOIndex set_index) const { return _vec.at(set_index.value()); }
diff --git a/runtime/onert/core/include/ir/operation/ElementwiseBinary.h b/runtime/onert/core/include/ir/operation/ElementwiseBinary.h
index dd07f6058..e265e81ec 100644
--- a/runtime/onert/core/include/ir/operation/ElementwiseBinary.h
+++ b/runtime/onert/core/include/ir/operation/ElementwiseBinary.h
@@ -37,6 +37,7 @@ public:
enum class ElementwiseBinaryType
{
+ FLOOR_DIV,
LOGICAL_AND,
LOGICAL_OR,
MAX,
diff --git a/runtime/onert/core/include/util/Config.lst b/runtime/onert/core/include/util/Config.lst
index d501345c1..89a9a6ac2 100644
--- a/runtime/onert/core/include/util/Config.lst
+++ b/runtime/onert/core/include/util/Config.lst
@@ -20,7 +20,7 @@
// Name | Type | Default
CONFIG(GRAPH_DOT_DUMP , int , "0")
-CONFIG(BACKENDS , std::string , "cpu;acl_cl;acl_neon;ruy;xnnpack;bcq") // FIXME Remove bcq
+CONFIG(BACKENDS , std::string , "cpu;acl_cl;acl_neon;ruy;xnnpack;gpu_cl;bcq") // FIXME Remove bcq
CONFIG(OP_BACKEND_ALLOPS , std::string , "")
CONFIG(OP_BACKEND_MAP , std::string , "")
CONFIG(DISABLE_COMPILE , bool , "0")
diff --git a/runtime/onert/core/src/compiler/Compiler.cc b/runtime/onert/core/src/compiler/Compiler.cc
index 082bdc9d0..93792dd1c 100644
--- a/runtime/onert/core/src/compiler/Compiler.cc
+++ b/runtime/onert/core/src/compiler/Compiler.cc
@@ -38,7 +38,9 @@
#include "util/ConfigSource.h"
#include "util/logging.h"
#include "ir/OperationDumper.h"
+#include "ir/OperationCloner.h"
#include "misc/string_helpers.h"
+#include "json/json.h"
namespace
{
@@ -139,6 +141,26 @@ Compiler::Compiler(const std::shared_ptr<ir::Subgraphs> &subgs, util::TracingCtx
void Compiler::enableToFp16() { _options.fp16_enable = true; }
+void Compiler::set_backend_from_str(const char *backend_settings)
+{
+ // Backend for all
+ auto &ms_options = _options.manual_scheduler_options;
+ auto key_val_list = nnfw::misc::split(backend_settings, ';');
+ for (const auto &key_val_str : key_val_list)
+ {
+ if (key_val_str.empty())
+ {
+ continue;
+ }
+
+ auto key_val = nnfw::misc::split(key_val_str, '=');
+ const auto &key_str = key_val.at(0);
+ const auto &val = key_val.at(1);
+ auto key = static_cast<uint32_t>(std::stoi(key_str));
+ ms_options.index_to_backend.emplace(ir::OperationIndex{key}, val);
+ }
+}
+
void Compiler::checkProfilerConditions()
{
if (!_options.he_scheduler)
@@ -148,6 +170,164 @@ void Compiler::checkProfilerConditions()
throw std::runtime_error("Profiling mode works only with 'Dataflow' executor");
}
+bool Compiler::buildPartialGraph(uint32_t num_graphs)
+{
+ if (_subgraphs->count() > 1)
+ return false;
+
+ auto partialgraphs = std::make_shared<ir::Subgraphs>();
+
+ for (uint32_t idx = 0; idx < num_graphs; idx++)
+ {
+ auto partialgraph = std::make_unique<ir::Graph>();
+ partialgraphs->push(ir::SubgraphIndex{idx}, std::move(partialgraph));
+ }
+ _subgraphs->primary()->setPartialgraphs(partialgraphs);
+
+ auto partial_graph = primary_subgraph()->partialgraphs();
+
+ primary_subgraph()->operands().iterate(
+ [&](const ir::OperandIndex &operand_index, const ir::Operand &operand) {
+ auto use_operations = operand.getUses();
+
+ for (auto use_operation : use_operations)
+ {
+ auto graph_index = _options.partial_graph_options.index_to_graph.find(use_operation);
+ if (graph_index == _options.partial_graph_options.index_to_graph.end())
+ {
+ throw std::runtime_error("Invalid Partition Map");
+ }
+ auto partition = partial_graph->at(graph_index->second);
+
+ if (partition->operands().exist(operand_index))
+ {
+ continue;
+ }
+
+ auto new_operand = std::make_unique<ir::Operand>(operand);
+ new_operand->clearDefUse();
+ auto new_operand_ind = partition->addOperand(operand_index, std::move(new_operand));
+ UNUSED_RELEASE(new_operand_ind);
+ assert(new_operand_ind == operand_index);
+ }
+ });
+
+ primary_subgraph()->operations().iterate(
+ [&](const ir::OperationIndex &operation_index, const ir::Operation &operation) {
+ auto graph_index = _options.partial_graph_options.index_to_graph.find(operation_index);
+ if (graph_index == _options.partial_graph_options.index_to_graph.end())
+ {
+ throw std::runtime_error("Invalid Partition Map");
+ }
+ auto partition = partial_graph->at(graph_index->second);
+
+ auto operand_io = (operation.getInputs() + operation.getOutputs()) | ir::Remove::DUPLICATED |
+ ir::Remove::UNDEFINED;
+ for (auto operand_index : operand_io)
+ {
+ if (partition->operands().exist(operand_index))
+ continue;
+
+ const auto &operand = primary_subgraph()->operands().at(operand_index);
+
+ auto new_operand = std::make_unique<ir::Operand>(operand);
+ new_operand->clearDefUse();
+
+ auto new_operand_index = partition->addOperand(operand_index, std::move(new_operand));
+ UNUSED_RELEASE(new_operand_index);
+ assert(new_operand_index == operand_index);
+ }
+
+ auto new_operation_index = partition->addOperation(operation_index, clone(operation));
+ UNUSED_RELEASE(new_operation_index);
+ assert(new_operation_index == operation_index);
+ });
+
+ for (uint32_t idx = 0; idx < partial_graph->count(); idx++)
+ {
+ auto partition = partial_graph->at(ir::SubgraphIndex{idx});
+
+ partition->operands().iterate([&](const ir::OperandIndex &operand_index,
+ const ir::Operand &operand) {
+ if (primary_subgraph()->getInputs().contains(operand_index) ||
+ (!operand.getDef().valid() && !operand.isConstant()))
+ {
+ partition->addInput(operand_index, primary_subgraph()->tensor_names()->at(operand_index));
+ }
+ if (primary_subgraph()->getOutputs().contains(operand_index) || operand.getUses().size() == 0)
+ {
+ partition->addOutput(operand_index, primary_subgraph()->tensor_names()->at(operand_index));
+ }
+
+ if (primary_subgraph()->operands().at(operand_index).getUses().size() > 1 &&
+ !primary_subgraph()->operands().at(operand_index).isConstant() &&
+ !partition->getInputs().contains(operand_index))
+ {
+ auto use_operations = primary_subgraph()->operands().at(operand_index).getUses();
+ auto iter = use_operations.begin();
+ ir::SubgraphIndex graph_index =
+ _options.partial_graph_options.index_to_graph.find(*iter++)->second;
+ while (iter != use_operations.end())
+ {
+ if (graph_index != _options.partial_graph_options.index_to_graph.find(*iter)->second &&
+ !partition->getOutputs().contains(operand_index))
+ {
+ partition->addOutput(operand_index,
+ primary_subgraph()->tensor_names()->at(operand_index));
+ }
+ iter++;
+ }
+ }
+ });
+
+ partition->verify();
+
+ bool same = true;
+ if (partition->getInputs().size() == primary_subgraph()->getInputs().size())
+ {
+ for (auto iter = partition->getInputs().begin(); iter != partition->getInputs().end(); ++iter)
+ {
+ if (!primary_subgraph()->getInputs().contains(*iter))
+ {
+ same = false;
+ break;
+ }
+ }
+ if (same == true)
+ {
+ partition->getInputs() = primary_subgraph()->getInputs();
+ }
+ else
+ {
+ partition->input_sort();
+ }
+ }
+
+ same = true;
+ if (partition->getOutputs().size() == primary_subgraph()->getOutputs().size())
+ {
+ for (auto iter = partition->getOutputs().begin(); iter != partition->getOutputs().end();
+ ++iter)
+ {
+ if (!primary_subgraph()->getOutputs().contains(*iter))
+ {
+ same = false;
+ break;
+ }
+ }
+ if (same == true)
+ {
+ partition->getOutputs() = primary_subgraph()->getOutputs();
+ }
+ else
+ {
+ partition->output_sort();
+ }
+ }
+ }
+ return true;
+}
+
std::shared_ptr<exec::ExecutorMap> Compiler::compile(void)
{
// Set control flow backend for control flow operators
@@ -300,6 +480,226 @@ std::shared_ptr<exec::ExecutorMap> Compiler::compile(void)
return executors;
}
+std::vector<std::shared_ptr<exec::ExecutorMap>> Compiler::compile(const char *package_file_path,
+ const char *map_file_path)
+{
+ std::vector<std::shared_ptr<exec::ExecutorMap>> executors;
+ auto executor_map = std::make_shared<exec::ExecutorMap>();
+
+ std::string package_path(package_file_path);
+ std::string partition_map_file;
+
+ if (map_file_path)
+ {
+ partition_map_file = map_file_path;
+ }
+ else
+ {
+ partition_map_file = package_path + "/partition_map.json";
+ }
+
+ std::ifstream pmfs(partition_map_file);
+ Json::Value root;
+ pmfs >> root;
+ const Json::Value &map = root["partition_map"];
+ const Json::Value &np = root["num_partitions"];
+
+ uint32_t num_graphs = 1;
+
+ if (pmfs.is_open())
+ {
+ num_graphs = np.asUInt();
+ for (uint32_t i = 0; i < (uint32_t)map.size(); ++i)
+ {
+ _options.partial_graph_options.index_to_graph[ir::OperationIndex{i}] =
+ ir::SubgraphIndex{map[i].asUInt()};
+ }
+ }
+ else
+ {
+ throw std::runtime_error("There is no partition map file");
+ }
+
+ if (!buildPartialGraph(num_graphs))
+ {
+ throw std::runtime_error("It doesn't support in case there are subgraphs");
+ }
+
+ // Set control flow backend for control flow operators
+ {
+ auto &builtin_id = backend::builtin::Config::ID;
+ _options.manual_scheduler_options.opcode_to_backend[ir::OpCode::If] = builtin_id;
+ _options.manual_scheduler_options.opcode_to_backend[ir::OpCode::While] = builtin_id;
+ _options.manual_scheduler_options.opcode_to_backend[ir::OpCode::Permute] = builtin_id;
+ }
+
+ // FIXME This is a workaround for bcq operations, should remove it
+ {
+ _options.manual_scheduler_options.opcode_to_backend[ir::OpCode::BCQFullyConnected] = "bcq";
+ _options.manual_scheduler_options.opcode_to_backend[ir::OpCode::BCQGather] = "bcq";
+ }
+
+ // Tracing is not supported for partial graphs
+ {
+ _options.tracing_ctx = nullptr;
+ }
+
+ {
+ VERBOSE(Compiler) << std::boolalpha << "==== Compiler Options ====" << std::endl;
+ VERBOSE(Compiler) << "backend_list : "
+ << nnfw::misc::join(_options.backend_list.begin(),
+ _options.backend_list.end(), "/")
+ << std::endl;
+ VERBOSE(Compiler) << "trace_filepath : " << _options.trace_filepath << std::endl;
+ VERBOSE(Compiler) << "graph_dump_level : " << _options.graph_dump_level << std::endl;
+ VERBOSE(Compiler) << "executor : " << _options.executor << std::endl;
+ VERBOSE(Compiler) << "manual backend_for_all : "
+ << _options.manual_scheduler_options.backend_for_all << std::endl;
+ VERBOSE(Compiler) << "manual_scheduler_options : "
+ << getOpBackends(_options.manual_scheduler_options.opcode_to_backend)
+ << std::endl;
+ VERBOSE(Compiler) << "he_scheduler : " << _options.he_scheduler << std::endl;
+ VERBOSE(Compiler) << "he_profiling_mode : " << _options.he_profiling_mode << std::endl;
+ VERBOSE(Compiler) << "disable_compile : " << _options.disable_compile << std::endl;
+ VERBOSE(Compiler) << "fp16_enable : " << _options.fp16_enable << std::endl
+ << std::noboolalpha;
+ }
+
+ _subgraphs->iterate([&](const ir::SubgraphIndex &, ir::Graph &subg) {
+ // Mandatory passes
+ auto part = subg.partialgraphs();
+ part->iterate([&](const ir::SubgraphIndex &, ir::Graph &partialgraph) {
+ pass::PassRunner{}
+ .append(std::make_unique<pass::ConstantOutputPass>(partialgraph))
+ .append(std::make_unique<pass::OddOutputPass>(partialgraph))
+ .run();
+
+ // Optimizations
+ pass::PassRunner{}
+ .append(std::make_unique<pass::UnusedOperandEliminationPass>(partialgraph))
+ .run();
+ });
+ });
+
+ /***************************************************
+ * Prepare compilation phase
+ ***************************************************/
+
+ // Compilable check
+ // TODO: Support hybrid execution -
+ // execution between interpreter and compiled executor (including control flow)
+ if (_options.disable_compile)
+ {
+ _subgraphs->iterate([&](const ir::SubgraphIndex &index, ir::Graph &subg) {
+ executor_map->emplace(index, std::make_unique<interp::InterpExecutor>(subg));
+ executors.push_back(executor_map);
+ });
+ _state = State::COMPILED;
+ return executors;
+ }
+
+ // Mode check
+ if (_options.he_profiling_mode)
+ checkProfilerConditions();
+
+ /***************************************************
+ * Backend independent analysis & optimization phase
+ ***************************************************/
+ auto dump_level = static_cast<dumper::dot::DotDumper::Level>(_options.graph_dump_level);
+
+ // Lower: Assign backend
+ std::unordered_map<ir::SubgraphIndex, std::unique_ptr<compiler::LoweredGraph>>
+ lowered_partialgraphs;
+ _subgraphs->iterate([&](const ir::SubgraphIndex &, ir::Graph &subg) {
+ auto part = subg.partialgraphs();
+ part->iterate([&](const ir::SubgraphIndex &pindex, ir::Graph &partialgraph) {
+ onert::dumper::dot::DotDumper dot_dumper_part(partialgraph, dump_level);
+ dot_dumper_part.dump(nnfw::misc::str("before_lower_subg_partialgraph-", pindex.value()));
+
+ // Lower: Assign backend
+ lowered_partialgraphs[pindex] =
+ std::make_unique<compiler::LoweredGraph>(subg, partialgraph, _options);
+ partialgraph.setSubgraphs(nullptr);
+ });
+ });
+
+ for (auto &pair : lowered_partialgraphs)
+ {
+
+ const auto &partialgraph_index = pair.first;
+ auto &lowered_partialgraph = pair.second;
+ onert::dumper::dot::DotDumper dot_dumper_lowered_part(lowered_partialgraph.get(), dump_level);
+ dot_dumper_lowered_part.dump("after_lower_subg_partialgraph-" +
+ std::to_string(partialgraph_index.value()));
+ }
+
+ // Partial Graph shape inference
+ for (auto &pair : lowered_partialgraphs)
+ {
+ const auto &partialgraph_index = pair.first;
+ auto &lowered_partialgraph = pair.second;
+ StaticShapeInferer partial_inferer(partialgraph_index, lowered_partialgraphs);
+ auto ordered_ops = lowered_partialgraph->graph().topolSortOperations();
+ for (auto op_ind : ordered_ops)
+ {
+ const auto &op = lowered_partialgraph->graph().operations().at(op_ind);
+ bool has_dynamic_tensor = partial_inferer.infer(op);
+ lowered_partialgraph->setHasDynamicTensor(op_ind, has_dynamic_tensor);
+ }
+ partial_inferer.dump();
+ }
+
+ // Shape validation
+ // TODO Move shape independent feature check from ShapeValidator to OperationValidator
+ // TODO Move ShapeValidator into shape inference
+ // - Check that input tensor shapes are valid
+ // - Check parameter values whose valid range depends on the input tensor shape
+ // - Output tensor shape validation is not needed because the
+ //   static/dynamic shape inferers already produce valid output shapes
+ for (auto &pair : lowered_partialgraphs)
+ {
+ auto &lowered_partialgraph = pair.second;
+ compiler::ShapeValidator{lowered_partialgraph->graph()}();
+ }
+
+ /*************************************************************
+ * Backend independent analysis & optimization phase finished
+ *************************************************************/
+ std::map<uint32_t, std::unique_ptr<compiler::LoweredGraph>> ordered;
+ for (auto &pair : lowered_partialgraphs)
+ {
+ // const auto &partialgraph_index = pair.first;
+ auto &lowered_partialgraph = pair.second;
+
+ ordered.insert(make_pair(pair.first.value(), std::move(lowered_partialgraph)));
+ }
+
+ for (auto &pair : ordered)
+ {
+ executor_map = std::make_shared<exec::ExecutorMap>();
+ const auto &partialgraph_index = ir::SubgraphIndex(pair.first);
+ auto &lowered_partialgraph = pair.second;
+ auto indexed_ranks = lowered_partialgraph->indexed_ranks();
+ ir::OperationDumper dumper("Executor generation of Subgraph " +
+ std::to_string(partialgraph_index.value()));
+ lowered_partialgraph->graph().operations().iterate(
+ [&](const ir::OperationIndex &, const ir::Operation &op) { op.accept(dumper); });
+ auto executor = std::unique_ptr<exec::IExecutor>{
+ ExecutorFactory::get().create(std::move(lowered_partialgraph), _options, executor_map)};
+ executor->setIndexedRanks(indexed_ranks);
+ executor_map->insert(std::make_pair(ir::SubgraphIndex{0}, std::move(executor)));
+ executors.push_back(executor_map);
+ }
+
+ _subgraphs.reset();
+ /********************************
+ * Code generation phase finished
+ ********************************/
+ _state = State::COMPILED;
+
+ return executors;
+}
+
} // namespace compiler
} // namespace onert
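
For reference, the partition map consumed by the pipeline compile() above needs only the two keys read from the JSON root. A minimal illustrative file, shown here as a raw string (the operation-to-partition assignment is made up), could look like:

// Illustrative only: the shape of partition_map.json read in Compiler::compile().
// "partition_map"[i] is the partial graph (pipeline stage) assigned to operation i,
// and "num_partitions" is the number of partial graphs to build.
const char *example_partition_map = R"json({
  "partition_map" : [0, 0, 0, 1, 1],
  "num_partitions" : 2
})json";
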
diff --git a/runtime/onert/core/src/compiler/LoweredGraph.cc b/runtime/onert/core/src/compiler/LoweredGraph.cc
index b469b991b..3b84d02de 100644
--- a/runtime/onert/core/src/compiler/LoweredGraph.cc
+++ b/runtime/onert/core/src/compiler/LoweredGraph.cc
@@ -117,6 +117,85 @@ LoweredGraph::LoweredGraph(const ir::Graph &graph, const CompilerOptions &option
}
}
+LoweredGraph::LoweredGraph(const ir::Graph &parent_graph, const ir::Graph &graph,
+ const CompilerOptions &options)
+ : _graph{graph}, _parent_graph{parent_graph}
+{
+ // set tracing_ctx for copied graph
+ if (options.tracing_ctx)
+ {
+ auto subgraph_index = options.tracing_ctx->getSubgraphIndex(&graph);
+ options.tracing_ctx->setSubgraphIndex(&_graph, subgraph_index.value());
+ }
+
+ // Build backend contexts
+ auto &backend_manager = BackendManager::get();
+ // Create contexts for other backends
+ for (auto backend_str : options.backend_list)
+ {
+ backend_manager.loadBackend(backend_str);
+ auto backend = backend_manager.get(backend_str);
+
+ // TODO The default backend list contains "cpu", "acl_cl" and "acl_neon", but some of them are
+ // not available on x64 and some other platforms, so skipping an unloadable backend is a
+ // workaround for now. We should change this back (throw if a backend fails to load) later.
+ if (!backend)
+ {
+ VERBOSE(LoweredGraph) << "Cannot load backend - " << backend_str << std::endl;
+ continue;
+ }
+ }
+ if (backend_manager.num_backends() == 0)
+ throw std::runtime_error{"No available backends loaded."};
+
+ // TODO Move "schedule" phase out of here
+ // Schedule
+ std::unique_ptr<BackendResolver> backend_resolver;
+ auto all_backends = backend_manager.getAll();
+ if (options.he_scheduler)
+ {
+ auto scheduler = HEScheduler(all_backends, options);
+ backend_resolver = scheduler.schedule(_graph);
+ _indexed_ranks = scheduler.getIndexedRanks();
+ }
+ else
+ {
+ auto scheduler = ManualScheduler(all_backends, options);
+ backend_resolver = scheduler.schedule(_graph);
+ }
+
+ makeLowerInfo(*backend_resolver);
+ VERBOSE(LoweredGraph) << "dump before mandatory passes" << std::endl;
+ dumper::text::dumpLoweredGraph(*this);
+
+ // Mandatory passes - kind of legalization(?)
+ pass::PassRunner{}
+ .append(std::make_unique<pass::ConstantInsertionPass>(*this))
+ .append(std::make_unique<pass::ConstantLoweringPass>(*this))
+ .append(std::make_unique<pass::PermutationOperationPass>(*this))
+ .append(std::make_unique<pass::PermutationInsertionPass>(*this))
+ .run();
+
+ dumpLowerInfo();
+
+ // Optimization passes (optional)
+ pass::PassRunner{}.append(std::make_unique<pass::PermutationEliminationPass>(*this)).run();
+
+ VERBOSE(LoweredGraph) << "Dump after all the passes" << std::endl;
+ for (auto operand : _graph.getInputs())
+ VERBOSE(LoweredGraph) << "Graph Input : " << operand << std::endl;
+ for (auto operand : _graph.getOutputs())
+ VERBOSE(LoweredGraph) << "Graph Output : " << operand << std::endl;
+ dumper::text::dumpLoweredGraph(*this);
+
+ // Graph verifications
+ {
+ assert(ir::verifier::InputOutputChecker().verify(_graph));
+ assert(ir::verifier::DAGChecker().verify(_graph));
+ assert(ir::verifier::EdgeChecker().verify(_graph));
+ }
+}
+
void LoweredGraph::makeLowerInfo(const compiler::BackendResolver &backend_resolver)
{
_graph.operands().iterate([&](const ir::OperandIndex &index, const ir::Operand &) {
diff --git a/runtime/onert/core/src/exec/Execution.cc b/runtime/onert/core/src/exec/Execution.cc
index 3d88cf5ff..8eff73bac 100644
--- a/runtime/onert/core/src/exec/Execution.cc
+++ b/runtime/onert/core/src/exec/Execution.cc
@@ -30,6 +30,7 @@ Execution::Execution(const std::shared_ptr<ExecutorMap> &executors) : _executors
const auto &primary_subg = primary_subgraph();
_io_desc.inputs.resize(primary_subg.getInputs().size());
_io_desc.outputs.resize(primary_subg.getOutputs().size());
+ sem_init(&_async_io_descs_sem, 0, 1);
}
void Execution::changeInputShape(const ir::IOIndex &index, const ir::Shape &new_shape)
@@ -71,6 +72,82 @@ void Execution::setInput(const ir::IOIndex &index, const void *buffer, size_t le
_io_desc.inputs.at(index.value()) = std::make_unique<InputDesc>(info, buffer, length, layout);
}
+void Execution::createNewAsyncDesc(uint32_t count)
+{
+ IODescription *_async_io_desc = new IODescription;
+ _async_io_desc->inputs.resize(primary_subgraph().getInputs().size());
+ _async_io_desc->outputs.resize(primary_subgraph().getOutputs().size());
+
+ _async_io_descs.push_back({_async_io_desc, count});
+}
+
+void Execution::setFinish() { finished = true; }
+
+bool Execution::isEmptyQueue()
+{
+ asyncIoDescSemWait();
+ bool ret = _async_io_descs.empty();
+ if (!ret)
+ {
+ for (uint32_t idx = 0; idx < _async_io_descs.front().first->inputs.size(); idx++)
+ {
+ if (_async_io_descs.front().first->inputs.at(idx).get() == nullptr)
+ {
+ ret = true;
+ break;
+ }
+ }
+ }
+ asyncIoDescSemPost();
+ return ret;
+}
+
+void Execution::executeAsyncInput(const ir::IOIndex &index, const void *buffer, size_t length,
+ ir::Layout layout)
+{
+ const auto input_index = primary_subgraph().getInputs().at(index);
+ const auto info = primary_subgraph().operands().at(input_index).info();
+ IODescription *_async_io_desc = _async_io_descs.back().first;
+
+ {
+ auto input_shape_sig = _async_io_desc->dynamic_input_shapes.find(index);
+ auto size_required =
+ (input_shape_sig != _async_io_desc->dynamic_input_shapes.end())
+ ? input_shape_sig->second.num_elements() * onert::ir::sizeOfDataType(info.typeInfo().type())
+ : info.total_size();
+
+ if (length < size_required)
+ {
+ throw std::runtime_error{"Too small length"};
+ }
+ }
+ void *_buffer = (void *)malloc(length);
+ if (_buffer == NULL)
+ {
+ throw std::runtime_error{"malloc failed"};
+ }
+ memcpy(_buffer, buffer, length);
+
+ _async_io_desc->inputs.at(index.value()) =
+ std::make_unique<InputDesc>(info, _buffer, length, layout);
+}
+
+void Execution::executeAsyncOutput(const ir::IOIndex &index, void *buffer, size_t length,
+ ir::Layout layout)
+{
+ const auto output_index = primary_subgraph().getOutputs().at(index);
+ const auto info = primary_subgraph().operands().at(output_index).info();
+ IODescription *_async_io_desc = _async_io_descs.front().first;
+
+ if (length < info.total_size())
+ {
+ throw std::runtime_error{"Too small length"};
+ }
+
+ _async_io_desc->outputs.at(index.value()) =
+ std::make_unique<OutputDesc>(info, buffer, length, layout);
+}
+
// TODO Remove default parameter
void Execution::setInput(const ir::IOIndex &index, const ir::TypeInfo &type, const ir::Shape &shape,
const void *buffer, size_t length, ir::Layout layout)
@@ -137,6 +214,18 @@ void Execution::execute()
VERBOSE(Execution) << "Execution finished" << std::endl;
}
+void Execution::AsyncExecute()
+{
+ VERBOSE(Execution) << "Start Async execution" << std::endl;
+ if (_async_io_descs.empty())
+ {
+ VERBOSE(Execution) << "The input is not ready" << std::endl;
+ return;
+ }
+
+ primary_executor()->execute(*_async_io_descs.front().first);
+}
+
void Execution::startExecute()
{
VERBOSE(Execution) << "Create asynchronous execution thread" << std::endl;
@@ -178,5 +267,153 @@ ir::Shape Execution::getOutputShape(ir::IOIndex ind) const
return output_desc->info.shape();
}
+void Execution::asyncIoDescSemWait() { sem_wait(&_async_io_descs_sem); }
+
+void Execution::asyncIoDescSemPost() { sem_post(&_async_io_descs_sem); }
+
+void Execution::runInference()
+{
+ uint32_t inference_cnt;
+ uint32_t output_sz = primary_subgraph().getOutputs().size();
+ while (true)
+ {
+ if (isEmptyQueue())
+ {
+ if (isFinished())
+ {
+ if (!next_exes.empty())
+ {
+ for (uint32_t i = 0; i < next_exes.size(); i++)
+ {
+ std::get<0>(next_exes[i])->setFinish();
+ }
+ }
+ else
+ {
+ sholudStop();
+ }
+ break;
+ }
+ }
+ else
+ {
+ for (uint32_t i = 0; i < output_sz; i++)
+ {
+ auto opidx = primary_subgraph().getOutputs().at(i);
+ auto shape = primary_subgraph().operands().at(opidx).shape();
+ auto dtype = primary_subgraph().operands().at(opidx).typeInfo().type();
+ auto rank = shape.rank();
+ uint32_t tensor_size = 1;
+ for (int32_t j = 0; j < rank; j++)
+ {
+ tensor_size *= shape.dim(j);
+ }
+ if (dtype == onert::ir::DataType::FLOAT32 || dtype == onert::ir::DataType::INT32 ||
+ dtype == onert::ir::DataType::UINT32)
+ tensor_size *= 4;
+ else if (dtype == onert::ir::DataType::INT64)
+ tensor_size *= 8;
+ void *_buffer = (void *)malloc(tensor_size);
+ if (_buffer == NULL)
+ {
+ throw std::runtime_error{"malloc failed"};
+ }
+ executeAsyncOutput(onert::ir::IOIndex(i), _buffer, tensor_size);
+ }
+ AsyncExecute();
+
+ // set inputs of next execution
+ auto _io_desc = getAsyncIoDescs()->front().first;
+ inference_cnt = getAsyncIoDescs()->front().second;
+ getAsyncIoDescs()->pop_front();
+
+ for (uint32_t i = 0; i < next_exes.size(); i++)
+ {
+ auto next_exe = std::get<0>(next_exes[i]);
+ auto o_index = std::get<1>(next_exes[i]);
+ auto i_index = std::get<2>(next_exes[i]);
+
+ next_exe->asyncIoDescSemWait();
+ auto next_io_descs = next_exe->getAsyncIoDescs();
+ bool exist = false;
+ for (auto iter = next_io_descs->begin(); iter != next_io_descs->end(); iter++)
+ {
+ if (inference_cnt == iter->second)
+ {
+ exist = true;
+ }
+ }
+
+ if (!exist)
+ {
+ next_exe->createNewAsyncDesc(inference_cnt);
+ }
+ for (auto iter = next_io_descs->begin(); iter != next_io_descs->end(); iter++)
+ {
+ if (inference_cnt == iter->second)
+ {
+ const auto input_index = next_exe->primary_subgraph().getInputs().at(i_index.value());
+ const auto info = next_exe->primary_subgraph().operands().at(input_index).info();
+
+ size_t length = _io_desc->outputs[o_index.value()]->size;
+ void *_buffer = (void *)malloc(length);
+ if (_buffer == NULL)
+ {
+ throw std::runtime_error{"malloc failed"};
+ }
+ memcpy(_buffer, _io_desc->outputs[o_index.value()]->buffer, length);
+
+ iter->first->inputs.at(i_index.value()) = std::make_unique<onert::exec::InputDesc>(
+ info, _buffer, length, onert::ir::Layout::NHWC);
+ break;
+ }
+ }
+ next_exe->asyncIoDescSemPost();
+ }
+
+ if (next_exes.empty())
+ {
+ std::vector<void *> results;
+ for (uint32_t i = 0; i < _io_desc->outputs.size(); i++)
+ {
+ size_t length = _io_desc->outputs[i]->size;
+ void *_buffer = (void *)malloc(length);
+ if (_buffer == NULL)
+ {
+ throw std::runtime_error{"malloc failed"};
+ }
+ memcpy(_buffer, _io_desc->outputs[i]->buffer, length);
+ results.push_back(_buffer);
+ }
+ _async_results.push_back(results);
+ }
+
+ for (uint32_t i = 0; i < _io_desc->inputs.size(); i++)
+ {
+ auto p = _io_desc->inputs.at(i).release();
+ if (p)
+ {
+ free((void *)p->buffer);
+ delete p;
+ }
+ }
+ for (uint32_t i = 0; i < _io_desc->outputs.size(); i++)
+ {
+ auto p = _io_desc->outputs.at(i).release();
+ if (p)
+ {
+ free(p->buffer);
+ delete p;
+ }
+ }
+ delete _io_desc;
+ }
+ }
+}
+
+bool Execution::stopWait(void) const { return stop_wait; }
+
+void Execution::sholudStop() { stop_wait = true; }
+
} // namespace exec
} // namespace onert
diff --git a/runtime/onert/core/src/exec/ExecutorBase.cc b/runtime/onert/core/src/exec/ExecutorBase.cc
index 3a624adef..efc22cfa5 100644
--- a/runtime/onert/core/src/exec/ExecutorBase.cc
+++ b/runtime/onert/core/src/exec/ExecutorBase.cc
@@ -30,8 +30,8 @@ ExecutorBase::ExecutorBase(std::unique_ptr<compiler::LoweredGraph> &&lowered_gra
backend::BackendContexts &&backend_contexts,
const compiler::TensorRegistries &tensor_regs,
const util::TracingCtx *tracing_ctx)
- : _lowered_graph{std::move(lowered_graph)},
- _backend_contexts{std::move(backend_contexts)}, _graph{_lowered_graph->graph()}, _mutex(),
+ : _lowered_graph{std::move(lowered_graph)}, _backend_contexts{std::move(backend_contexts)},
+ _graph{_lowered_graph->graph()}, _parent_graph{_lowered_graph->parent_graph()}, _mutex(),
_tracing_ctx(tracing_ctx)
{
auto build_tensor_list = [&](const auto &ind_seq, auto &tensors) {
diff --git a/runtime/onert/core/src/exec/ExecutorBase.h b/runtime/onert/core/src/exec/ExecutorBase.h
index 3a124bd5b..c0f609d11 100644
--- a/runtime/onert/core/src/exec/ExecutorBase.h
+++ b/runtime/onert/core/src/exec/ExecutorBase.h
@@ -58,6 +58,8 @@ public:
const ir::Graph &graph() final { return _graph; }
+ const ir::Graph &parent_graph() final { return _parent_graph; }
+
void execute(const IODescription &desc) final;
void execute(const std::vector<backend::IPortableTensor *> &inputs,
@@ -90,6 +92,7 @@ protected:
std::unique_ptr<compiler::LoweredGraph> _lowered_graph;
backend::BackendContexts _backend_contexts;
const ir::Graph &_graph;
+ const ir::Graph &_parent_graph;
std::vector<backend::builtin::IOTensor *> _input_tensors;
std::vector<backend::builtin::IOTensor *> _output_tensors;
std::mutex _mutex;
diff --git a/runtime/onert/core/src/exec/LinearExecutor.cc b/runtime/onert/core/src/exec/LinearExecutor.cc
index 4d10c869b..a64dadcb1 100644
--- a/runtime/onert/core/src/exec/LinearExecutor.cc
+++ b/runtime/onert/core/src/exec/LinearExecutor.cc
@@ -26,30 +26,52 @@ namespace exec
void LinearExecutor::executeImpl()
{
- auto profiling_subg_index = _tracing_ctx->getSubgraphIndex(&_graph);
-
- _subject.notifySubgraphBegin(profiling_subg_index);
- for (auto &&code : _code)
+ if (_tracing_ctx)
{
- const auto backend = code.lower_info->backend();
+ auto profiling_subg_index = _tracing_ctx->getSubgraphIndex(&_graph);
+
+ _subject.notifySubgraphBegin(profiling_subg_index);
+ for (auto &&code : _code)
+ {
+ const auto backend = code.lower_info->backend();
// TODO : Move ruy profiler into ExecutionObserver
#ifdef RUY_PROFILER
- ruy::profiler::ScopeLabel label(code.op->name());
+ ruy::profiler::ScopeLabel label(code.op->name());
#endif
- _subject.notifyJobBegin(this, profiling_subg_index, code.op_ind, backend);
+ _subject.notifyJobBegin(this, profiling_subg_index, code.op_ind, backend);
+
+ auto &fn_seq = code.fn_seq;
+
+ fn_seq->initRunning();
+
+ bool handle_dynamic_tensor =
+ _lowered_graph->getHasDynamicTensor(code.op_ind) || hasDynamicInput();
+ fn_seq->enableDynamicShapeInferer(handle_dynamic_tensor);
+ fn_seq->run();
- auto &fn_seq = code.fn_seq;
+ _subject.notifyJobEnd(this, profiling_subg_index, code.op_ind, backend);
+ }
+ _subject.notifySubgraphEnd(profiling_subg_index);
+ }
+ else
+ {
+ for (auto &&code : _code)
+ {
+// TODO : Move ruy profiler into ExecutionObserver
+#ifdef RUY_PROFILER
+ ruy::profiler::ScopeLabel label(code.op->name());
+#endif
- fn_seq->initRunning();
+ auto &fn_seq = code.fn_seq;
- bool handle_dynamic_tensor =
- _lowered_graph->getHasDynamicTensor(code.op_ind) || hasDynamicInput();
- fn_seq->enableDynamicShapeInferer(handle_dynamic_tensor);
- fn_seq->run();
+ fn_seq->initRunning();
- _subject.notifyJobEnd(this, profiling_subg_index, code.op_ind, backend);
+ bool handle_dynamic_tensor =
+ _lowered_graph->getHasDynamicTensor(code.op_ind) || hasDynamicInput();
+ fn_seq->enableDynamicShapeInferer(handle_dynamic_tensor);
+ fn_seq->run();
+ }
}
- _subject.notifySubgraphEnd(profiling_subg_index);
}
} // namespace exec
diff --git a/runtime/onert/core/src/interp/InterpExecutor.h b/runtime/onert/core/src/interp/InterpExecutor.h
index 6e3a02327..df6153d09 100644
--- a/runtime/onert/core/src/interp/InterpExecutor.h
+++ b/runtime/onert/core/src/interp/InterpExecutor.h
@@ -50,6 +50,11 @@ public:
* @return Graph object
*/
const ir::Graph &graph() final { return _graph; }
+
+ const ir::Graph &parent_graph() final
+ {
+ throw std::runtime_error{"Interpreter does not support this function."};
+ }
void setIndexedRanks(std::shared_ptr<ir::OperationIndexMap<int64_t>>) override{
// Not implemented
};
diff --git a/runtime/onert/core/src/ir/operation/ElementwiseBinary.cc b/runtime/onert/core/src/ir/operation/ElementwiseBinary.cc
index 8dc42903c..155b660dc 100644
--- a/runtime/onert/core/src/ir/operation/ElementwiseBinary.cc
+++ b/runtime/onert/core/src/ir/operation/ElementwiseBinary.cc
@@ -40,6 +40,7 @@ std::string ElementwiseBinary::name() const
{
using ElementwiseBinaryType = onert::ir::operation::ElementwiseBinary::ElementwiseBinaryType;
static const std::unordered_map<ElementwiseBinaryType, std::string> name_map{
+ {ElementwiseBinaryType::FLOOR_DIV, std::string{"FloorDiv"}},
{ElementwiseBinaryType::LOGICAL_AND, std::string{"LogicalAnd"}},
{ElementwiseBinaryType::LOGICAL_OR, std::string{"LogicalOr"}},
{ElementwiseBinaryType::MAX, std::string{"Max"}},
diff --git a/runtime/onert/frontend/base_loader/include/base_loader.h b/runtime/onert/frontend/base_loader/include/base_loader.h
index c096e705d..c444e7365 100644
--- a/runtime/onert/frontend/base_loader/include/base_loader.h
+++ b/runtime/onert/frontend/base_loader/include/base_loader.h
@@ -68,7 +68,8 @@ public:
* @param graph reference on subgraphs
*/
explicit BaseLoader(std::unique_ptr<ir::Subgraphs> &subgs)
- : _base{nullptr}, _pagesize(getpagesize()), _fd(-1), _subgraphs(subgs), _model{nullptr}
+ : _base{nullptr}, _pagesize(getpagesize()), _fd(-1), _subgraphs(subgs), _model{nullptr},
+ _tensor_names(std::make_shared<std::unordered_map<ir::OperandIndex, std::string>>())
{
_use_mmaped_data = util::getConfigBool(util::config::USE_MMAPED_DATA);
}
@@ -181,7 +182,7 @@ protected:
const Model *_model;
// Maps Tensor indices to onert Operands.
std::vector<ir::OperandIndex> _tensor_to_operand;
- std::unordered_map<ir::OperandIndex, std::string> _tensor_names;
+ std::shared_ptr<std::unordered_map<ir::OperandIndex, std::string>> _tensor_names;
// Verifier
std::unique_ptr<Verifier> _verifier;
// Boolean flag to use MMAPED_DATA
@@ -387,7 +388,7 @@ ir::OperandIndex BaseLoader<LoaderDomain>::loadOperand(const Tensor *tensor, ir:
subg.setOperandValue(operand_index, std::move(data_obj));
}
- _tensor_names.emplace(operand_index, tensor->name()->str());
+ _tensor_names->emplace(operand_index, tensor->name()->str());
// Variable
if (tensor->is_variable())
@@ -1492,6 +1493,10 @@ void BaseLoader<LoaderDomain>::loadOperation(const Operator *op, ir::Graph &subg
case BuiltinOperator::BuiltinOperator_UNPACK:
loadUnpack(op, subg);
return;
+ case BuiltinOperator::BuiltinOperator_FLOOR_DIV:
+ loadElementwiseBinary(op, subg,
+ ir::operation::ElementwiseBinary::ElementwiseBinaryType::FLOOR_DIV);
+ return;
case BuiltinOperator::BuiltinOperator_MINIMUM:
loadElementwiseBinary(op, subg, ir::operation::ElementwiseBinary::ElementwiseBinaryType::MIN);
return;
diff --git a/runtime/onert/frontend/circle/src/circle_loader.cc b/runtime/onert/frontend/circle/src/circle_loader.cc
index 652fbc778..4fb0e71d6 100644
--- a/runtime/onert/frontend/circle/src/circle_loader.cc
+++ b/runtime/onert/frontend/circle/src/circle_loader.cc
@@ -106,13 +106,13 @@ private:
for (const std::int32_t input_ind : *circle_subg->inputs())
{
subg->addInput(tensorIdxToOperandIdx(input_ind),
- _tensor_names.at(_tensor_to_operand[input_ind]));
+ _tensor_names->at(_tensor_to_operand[input_ind]));
}
// Set outputs
for (const std::int32_t output_ind : *circle_subg->outputs())
{
subg->addOutput(tensorIdxToOperandIdx(output_ind),
- _tensor_names.at(_tensor_to_operand[output_ind]));
+ _tensor_names->at(_tensor_to_operand[output_ind]));
}
// Create operations
for (const auto *op : *circle_subg->operators())
diff --git a/runtime/onert/frontend/tflite/src/tflite_loader.cc b/runtime/onert/frontend/tflite/src/tflite_loader.cc
index 8669bbb44..a3038b718 100644
--- a/runtime/onert/frontend/tflite/src/tflite_loader.cc
+++ b/runtime/onert/frontend/tflite/src/tflite_loader.cc
@@ -93,13 +93,13 @@ private:
for (const std::int32_t input_ind : *tflite_subg->inputs())
{
subg->addInput(tensorIdxToOperandIdx(input_ind),
- _tensor_names.at(_tensor_to_operand[input_ind]));
+ _tensor_names->at(_tensor_to_operand[input_ind]));
}
// Set outputs
for (const std::int32_t output_ind : *tflite_subg->outputs())
{
subg->addOutput(tensorIdxToOperandIdx(output_ind),
- _tensor_names.at(_tensor_to_operand[output_ind]));
+ _tensor_names->at(_tensor_to_operand[output_ind]));
}
// Create operations
for (const auto *op : *tflite_subg->operators())
@@ -107,6 +107,7 @@ private:
loadOperation(op, *subg);
}
+ subg->setTensorName(_tensor_names);
subg->verify();
return subg;