diff options
author | 장지섭/On-Device Lab(SR)/Engineer/삼성전자 <jiseob.jang@samsung.com> | 2019-01-31 09:51:54 +0900 |
---|---|---|
committer | 오형석/On-Device Lab(SR)/Staff Engineer/삼성전자 <hseok82.oh@samsung.com> | 2019-01-31 09:51:54 +0900 |
commit | ea8f7a43274fa8255816224c5e9c78400ed900f1 (patch) | |
tree | 1dd1a8f1c0d51e5b688fbcc9a4cd9a85b24bc5e9 /libs | |
parent | e371768a0b988ad3572467c5225acdbb9f9e9df0 (diff) | |
download | nnfw-ea8f7a43274fa8255816224c5e9c78400ed900f1.tar.gz nnfw-ea8f7a43274fa8255816224c5e9c78400ed900f1.tar.bz2 nnfw-ea8f7a43274fa8255816224c5e9c78400ed900f1.zip |
Support NHWC to DepthToSpaceKernel (#4363)
* Support NHWC to DepthToSpaceKernel
This commit supports NHWC to DepthToSpaceKernel.
* Change a preprocessor argument of kernel for DepthToSpace
This commit changes a preprocessor argument of kernel for DepthToSpace to prevent misunderstandings.
Signed-off-by: jiseob.jang <jiseob.jang@samsung.com>
Diffstat (limited to 'libs')
3 files changed, 71 insertions, 12 deletions
diff --git a/libs/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp b/libs/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp index 4cbcb651f..2ef886e91 100644 --- a/libs/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp +++ b/libs/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp @@ -49,7 +49,8 @@ const std::map<std::string, std::string> CLKernelLibraryEx::_kernel_program_map {"cast_qasymm_out", "cast.cl"}, {"comparison_op", "comparison_op.cl"}, {"comparison_op_qasymm8", "comparison_op_quantized.cl"}, - {"depth_to_space", "depth_to_space.cl"}, + {"depth_to_space_nchw", "depth_to_space.cl"}, + {"depth_to_space_nhwc", "depth_to_space.cl"}, {"embedding_lookup", "embedding_lookup.cl"}, {"exp_layer", "exp.cl"}, {"gather", "gather.cl"}, diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/depth_to_space.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/depth_to_space.cl index fef2243e7..08f72665a 100644 --- a/libs/ARMComputeEx/src/core/CL/cl_kernels/depth_to_space.cl +++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/depth_to_space.cl @@ -16,11 +16,12 @@ */ #include "helpers.h" -#if defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(BLOCK_SIZE) +#if defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(BLOCK_SIZE) && defined(Z_OUT) /** Perform space to depth rearrangement of tensor * * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float - * @attention Input tensor depth should be given as a preprocessor argument using -DDEPTH_IN=size. e.g. -DDEPTH_IN=16 + * @attention Input tensor depth should be given as a preprocessor argument using -DDEPTH_OUT=size. e.g. -DDEPTH_OUT=16 + * @attention The value of the z-axis of output tensor should be given as a preprocessor argument using -DZ_OUT=size. e.g. -DZ_OUT=16 * @attention block size should be given as a preprocessor argument using -DBLOCK_SIZE=size. e.g. -DBLOCK_SIZE=1 * * @param[in] input_ptr Pointer to the source image. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32 @@ -44,20 +45,20 @@ bytes) * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes) * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image */ -__kernel void depth_to_space( +__kernel void depth_to_space_nchw( TENSOR4D_DECLARATION(input), TENSOR4D_DECLARATION(output)) { Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, 0); - Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT); + Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, Z_OUT); int out_index[4]={0}; int in_index[4]={0}; out_index[0] = get_global_id(0);//W out_index[1] = get_global_id(1);//H - out_index[2] = get_global_id(2) % DEPTH_OUT;//C - out_index[3] = get_global_id(2) / DEPTH_OUT;//B + out_index[2] = get_global_id(2) % Z_OUT;//C + out_index[3] = get_global_id(2) / Z_OUT;//B in_index[0] = out_index[0]/BLOCK_SIZE; in_index[1] = out_index[1]/BLOCK_SIZE; @@ -66,4 +67,57 @@ __kernel void depth_to_space( *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset(&in, in_index[0], in_index[1], in_index[2],in_index[3])); } -#endif // defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(BLOCK_SIZE) +#endif // defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(BLOCK_SIZE) && defined(Z_OUT) + +#if defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(BLOCK_SIZE) && defined(Z_OUT) +/** Perform space to depth rearrangement of tensor (NHWC) + * + * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float + * @attention Output tensor depth should be given as a preprocessor argument using -DDEPTH_OUT=size. e.g. -DDEPTH_OUT=16 + * @attention The value of the z-axis of output tensor should be given as a preprocessor argument using -DZ_OUT=size. e.g. -DZ_OUT=16 + * @attention block size should be given as a preprocessor argument using -DBLOCK_SIZE=size. e.g. -DBLOCK_SIZE=1 + * + * @param[in] input_ptr Pointer to the source image. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32 + * @param[in] input_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image + * @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p inpu +t_ptr + * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in +bytes) + * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] output_stride_w Stride of the source tensor in W dimension (in bytes) + * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image + */ +__kernel void depth_to_space_nhwc( + TENSOR4D_DECLARATION(input), + TENSOR4D_DECLARATION(output)) + { + Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, 0); + Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, Z_OUT); + + int out_index[4]={0}; + int in_index[4]={0}; + + out_index[0] = get_global_id(0);//C + out_index[1] = get_global_id(1);//W + out_index[2] = get_global_id(2) % Z_OUT;//H + out_index[3] = get_global_id(2) / Z_OUT;//B + + in_index[0] = out_index[0] + ((out_index[2] % BLOCK_SIZE) * BLOCK_SIZE + out_index[1] % BLOCK_SIZE) * DEPTH_OUT; + in_index[1] = out_index[1]/BLOCK_SIZE; + in_index[2] = out_index[2]/BLOCK_SIZE; + in_index[3] = out_index[3]; + + *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset(&in, in_index[0], in_index[1], in_index[2],in_index[3])); + } +#endif // defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(BLOCK_SIZE) && defined(Z_OUT) diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLDepthToSpaceKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLDepthToSpaceKernel.cpp index 5604e406e..2a3433c2b 100644 --- a/libs/ARMComputeEx/src/core/CL/kernels/CLDepthToSpaceKernel.cpp +++ b/libs/ARMComputeEx/src/core/CL/kernels/CLDepthToSpaceKernel.cpp @@ -64,19 +64,23 @@ CLDepthToSpaceKernel::CLDepthToSpaceKernel() : _input(nullptr), _output(nullptr) void CLDepthToSpaceKernel::configure(const ICLTensor *input, ICLTensor *output, const int32_t block_size) { - + // TODO Add validation of data_layout _input = input; _output = output; // Set kernel build options + auto layout_out = output->info()->data_layout(); std::set<std::string> build_opts; build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())); build_opts.emplace("-DBLOCK_SIZE=" + support::cpp11::to_string(block_size)); - build_opts.emplace("-DDEPTH_OUT=" + support::cpp11::to_string(output->info()->dimension(2))); + auto index_depth = get_data_layout_dimension_index(layout_out, DataLayoutDimension::CHANNEL); + auto depth = output->info()->dimension(index_depth); + build_opts.emplace("-DDEPTH_OUT=" + support::cpp11::to_string(depth)); + build_opts.emplace("-DZ_OUT=" + support::cpp11::to_string(output->info()->tensor_shape().z())); // Create kernel - _kernel = - static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel("depth_to_space", build_opts)); + _kernel = static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel( + "depth_to_space_" + lower_string(string_from_data_layout(layout_out)), build_opts)); // Configure kernel window Window win = calculate_max_window(*output->info(), Steps()); |