summaryrefslogtreecommitdiff
path: root/libs
diff options
context:
space:
mode:
author장지섭/On-Device Lab(SR)/Engineer/삼성전자 <jiseob.jang@samsung.com>2019-01-31 09:51:54 +0900
committer오형석/On-Device Lab(SR)/Staff Engineer/삼성전자 <hseok82.oh@samsung.com>2019-01-31 09:51:54 +0900
commitea8f7a43274fa8255816224c5e9c78400ed900f1 (patch)
tree1dd1a8f1c0d51e5b688fbcc9a4cd9a85b24bc5e9 /libs
parente371768a0b988ad3572467c5225acdbb9f9e9df0 (diff)
downloadnnfw-ea8f7a43274fa8255816224c5e9c78400ed900f1.tar.gz
nnfw-ea8f7a43274fa8255816224c5e9c78400ed900f1.tar.bz2
nnfw-ea8f7a43274fa8255816224c5e9c78400ed900f1.zip
Support NHWC to DepthToSpaceKernel (#4363)
* Support NHWC to DepthToSpaceKernel This commit supports NHWC to DepthToSpaceKernel. * Change a preprocessor argument of kernel for DepthToSpace This commit changes a preprocessor argument of kernel for DepthToSpace to prevent misunderstandings. Signed-off-by: jiseob.jang <jiseob.jang@samsung.com>
Diffstat (limited to 'libs')
-rw-r--r--libs/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp3
-rw-r--r--libs/ARMComputeEx/src/core/CL/cl_kernels/depth_to_space.cl68
-rw-r--r--libs/ARMComputeEx/src/core/CL/kernels/CLDepthToSpaceKernel.cpp12
3 files changed, 71 insertions, 12 deletions
diff --git a/libs/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp b/libs/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp
index 4cbcb651f..2ef886e91 100644
--- a/libs/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp
+++ b/libs/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp
@@ -49,7 +49,8 @@ const std::map<std::string, std::string> CLKernelLibraryEx::_kernel_program_map
{"cast_qasymm_out", "cast.cl"},
{"comparison_op", "comparison_op.cl"},
{"comparison_op_qasymm8", "comparison_op_quantized.cl"},
- {"depth_to_space", "depth_to_space.cl"},
+ {"depth_to_space_nchw", "depth_to_space.cl"},
+ {"depth_to_space_nhwc", "depth_to_space.cl"},
{"embedding_lookup", "embedding_lookup.cl"},
{"exp_layer", "exp.cl"},
{"gather", "gather.cl"},
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/depth_to_space.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/depth_to_space.cl
index fef2243e7..08f72665a 100644
--- a/libs/ARMComputeEx/src/core/CL/cl_kernels/depth_to_space.cl
+++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/depth_to_space.cl
@@ -16,11 +16,12 @@
*/
#include "helpers.h"
-#if defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(BLOCK_SIZE)
+#if defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(BLOCK_SIZE) && defined(Z_OUT)
/** Perform space to depth rearrangement of tensor
*
* @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
- * @attention Input tensor depth should be given as a preprocessor argument using -DDEPTH_IN=size. e.g. -DDEPTH_IN=16
+ * @attention Input tensor depth should be given as a preprocessor argument using -DDEPTH_OUT=size. e.g. -DDEPTH_OUT=16
+ * @attention The value of the z-axis of output tensor should be given as a preprocessor argument using -DZ_OUT=size. e.g. -DZ_OUT=16
* @attention block size should be given as a preprocessor argument using -DBLOCK_SIZE=size. e.g. -DBLOCK_SIZE=1
*
* @param[in] input_ptr Pointer to the source image. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
@@ -44,20 +45,20 @@ bytes)
* @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes)
* @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
*/
-__kernel void depth_to_space(
+__kernel void depth_to_space_nchw(
TENSOR4D_DECLARATION(input),
TENSOR4D_DECLARATION(output))
{
Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, 0);
- Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT);
+ Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, Z_OUT);
int out_index[4]={0};
int in_index[4]={0};
out_index[0] = get_global_id(0);//W
out_index[1] = get_global_id(1);//H
- out_index[2] = get_global_id(2) % DEPTH_OUT;//C
- out_index[3] = get_global_id(2) / DEPTH_OUT;//B
+ out_index[2] = get_global_id(2) % Z_OUT;//C
+ out_index[3] = get_global_id(2) / Z_OUT;//B
in_index[0] = out_index[0]/BLOCK_SIZE;
in_index[1] = out_index[1]/BLOCK_SIZE;
@@ -66,4 +67,57 @@ __kernel void depth_to_space(
*((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset(&in, in_index[0], in_index[1], in_index[2],in_index[3]));
}
-#endif // defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(BLOCK_SIZE)
+#endif // defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(BLOCK_SIZE) && defined(Z_OUT)
+
+#if defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(BLOCK_SIZE) && defined(Z_OUT)
+/** Perform space to depth rearrangement of tensor (NHWC)
+ *
+ * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
+ * @attention Output tensor depth should be given as a preprocessor argument using -DDEPTH_OUT=size. e.g. -DDEPTH_OUT=16
+ * @attention The value of the z-axis of output tensor should be given as a preprocessor argument using -DZ_OUT=size. e.g. -DZ_OUT=16
+ * @attention block size should be given as a preprocessor argument using -DBLOCK_SIZE=size. e.g. -DBLOCK_SIZE=1
+ *
+ * @param[in] input_ptr Pointer to the source image. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * @param[in] input_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p inpu
+t_ptr
+ * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in
+bytes)
+ * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void depth_to_space_nhwc(
+ TENSOR4D_DECLARATION(input),
+ TENSOR4D_DECLARATION(output))
+ {
+ Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, 0);
+ Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, Z_OUT);
+
+ int out_index[4]={0};
+ int in_index[4]={0};
+
+ out_index[0] = get_global_id(0);//C
+ out_index[1] = get_global_id(1);//W
+ out_index[2] = get_global_id(2) % Z_OUT;//H
+ out_index[3] = get_global_id(2) / Z_OUT;//B
+
+ in_index[0] = out_index[0] + ((out_index[2] % BLOCK_SIZE) * BLOCK_SIZE + out_index[1] % BLOCK_SIZE) * DEPTH_OUT;
+ in_index[1] = out_index[1]/BLOCK_SIZE;
+ in_index[2] = out_index[2]/BLOCK_SIZE;
+ in_index[3] = out_index[3];
+
+ *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset(&in, in_index[0], in_index[1], in_index[2],in_index[3]));
+ }
+#endif // defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(BLOCK_SIZE) && defined(Z_OUT)
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLDepthToSpaceKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLDepthToSpaceKernel.cpp
index 5604e406e..2a3433c2b 100644
--- a/libs/ARMComputeEx/src/core/CL/kernels/CLDepthToSpaceKernel.cpp
+++ b/libs/ARMComputeEx/src/core/CL/kernels/CLDepthToSpaceKernel.cpp
@@ -64,19 +64,23 @@ CLDepthToSpaceKernel::CLDepthToSpaceKernel() : _input(nullptr), _output(nullptr)
void CLDepthToSpaceKernel::configure(const ICLTensor *input, ICLTensor *output,
const int32_t block_size)
{
-
+ // TODO Add validation of data_layout
_input = input;
_output = output;
// Set kernel build options
+ auto layout_out = output->info()->data_layout();
std::set<std::string> build_opts;
build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
build_opts.emplace("-DBLOCK_SIZE=" + support::cpp11::to_string(block_size));
- build_opts.emplace("-DDEPTH_OUT=" + support::cpp11::to_string(output->info()->dimension(2)));
+ auto index_depth = get_data_layout_dimension_index(layout_out, DataLayoutDimension::CHANNEL);
+ auto depth = output->info()->dimension(index_depth);
+ build_opts.emplace("-DDEPTH_OUT=" + support::cpp11::to_string(depth));
+ build_opts.emplace("-DZ_OUT=" + support::cpp11::to_string(output->info()->tensor_shape().z()));
// Create kernel
- _kernel =
- static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel("depth_to_space", build_opts));
+ _kernel = static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(
+ "depth_to_space_" + lower_string(string_from_data_layout(layout_out)), build_opts));
// Configure kernel window
Window win = calculate_max_window(*output->info(), Steps());