Support NHWC to DepthToSpaceKernel (#4363)

* Support NHWC to DepthToSpaceKernel This commit supports NHWC to DepthToSpaceKernel. * Change a preprocessor argument of kernel for DepthToSpace This commit changes a preprocessor argument of kernel for DepthToSpace to prevent misunderstandings. Signed-off-by: jiseob.jang <jiseob.jang@samsung.com>
author: 장지섭/On-Device Lab(SR)/Engineer/삼성전자 <jiseob.jang@samsung.com> 2019-01-31 09:51:54 +0900
committer: 오형석/On-Device Lab(SR)/Staff Engineer/삼성전자 <hseok82.oh@samsung.com> 2019-01-31 09:51:54 +0900
commit: ea8f7a43274fa8255816224c5e9c78400ed900f1 (patch)
tree: 1dd1a8f1c0d51e5b688fbcc9a4cd9a85b24bc5e9 /libs
parent: e371768a0b988ad3572467c5225acdbb9f9e9df0 (diff)
download: nnfw-ea8f7a43274fa8255816224c5e9c78400ed900f1.tar.gz
nnfw-ea8f7a43274fa8255816224c5e9c78400ed900f1.tar.bz2
nnfw-ea8f7a43274fa8255816224c5e9c78400ed900f1.zip
3 files changed, 71 insertions, 12 deletions
diff --git a/libs/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp b/libs/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp
index 4cbcb651f..2ef886e91 100644
--- a/libs/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp
+++ b/libs/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp
@@ -49,7 +49,8 @@ const std::map<std::string, std::string> CLKernelLibraryEx::_kernel_program_map
     {"cast_qasymm_out", "cast.cl"},
     {"comparison_op", "comparison_op.cl"},
     {"comparison_op_qasymm8", "comparison_op_quantized.cl"},
-    {"depth_to_space", "depth_to_space.cl"},
+    {"depth_to_space_nchw", "depth_to_space.cl"},
+    {"depth_to_space_nhwc", "depth_to_space.cl"},
     {"embedding_lookup", "embedding_lookup.cl"},
     {"exp_layer", "exp.cl"},
     {"gather", "gather.cl"},
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/depth_to_space.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/depth_to_space.cl
index fef2243e7..08f72665a 100644
--- a/libs/ARMComputeEx/src/core/CL/cl_kernels/depth_to_space.cl
+++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/depth_to_space.cl
@@ -16,11 +16,12 @@
  */
 #include "helpers.h"
 
-#if defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(BLOCK_SIZE)
+#if defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(BLOCK_SIZE) && defined(Z_OUT)
 /** Perform space to depth rearrangement of tensor
  *
  * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
- * @attention Input tensor depth should be given as a preprocessor argument using -DDEPTH_IN=size. e.g. -DDEPTH_IN=16
+ * @attention Input tensor depth should be given as a preprocessor argument using -DDEPTH_OUT=size. e.g. -DDEPTH_OUT=16
+ * @attention The value of the z-axis of output tensor should be given as a preprocessor argument using -DZ_OUT=size. e.g. -DZ_OUT=16
  * @attention block size should be given as a preprocessor argument using -DBLOCK_SIZE=size. e.g. -DBLOCK_SIZE=1
  *
  * @param[in]  input_ptr                            Pointer to the source image. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
@@ -44,20 +45,20 @@ bytes)
  * @param[in]  output_step_w                        output_stride_w * number of elements along W processed per workitem(in bytes)
  * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination image
  */
-__kernel void depth_to_space(
+__kernel void depth_to_space_nchw(
      TENSOR4D_DECLARATION(input),
      TENSOR4D_DECLARATION(output))
  {
     Tensor4D in  = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, 0);
-    Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT);
+    Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, Z_OUT);
 
     int out_index[4]={0};
     int in_index[4]={0};
 
     out_index[0] = get_global_id(0);//W
     out_index[1] = get_global_id(1);//H
-    out_index[2] = get_global_id(2) % DEPTH_OUT;//C
-    out_index[3] = get_global_id(2) / DEPTH_OUT;//B
+    out_index[2] = get_global_id(2) % Z_OUT;//C
+    out_index[3] = get_global_id(2) / Z_OUT;//B
 
     in_index[0] = out_index[0]/BLOCK_SIZE;
     in_index[1] = out_index[1]/BLOCK_SIZE;
@@ -66,4 +67,57 @@ __kernel void depth_to_space(
 
     *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset(&in, in_index[0], in_index[1], in_index[2],in_index[3]));
  }
-#endif // defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(BLOCK_SIZE)
+#endif // defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(BLOCK_SIZE) && defined(Z_OUT)
+
+#if defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(BLOCK_SIZE) && defined(Z_OUT)
+/** Perform space to depth rearrangement of tensor (NHWC)
+ *
+ * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
+ * @attention Output tensor depth should be given as a preprocessor argument using -DDEPTH_OUT=size. e.g. -DDEPTH_OUT=16
+ * @attention The value of the z-axis of output tensor should be given as a preprocessor argument using -DZ_OUT=size. e.g. -DZ_OUT=16
+ * @attention block size should be given as a preprocessor argument using -DBLOCK_SIZE=size. e.g. -DBLOCK_SIZE=1
+ *
+ * @param[in]  input_ptr                            Pointer to the source image. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * @param[in]  input_stride_x                       Stride of the source image in X dimension (in bytes)
+ * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in  bytes)
+ * @param[in]  input_stride_y                       Stride of the source image in Y dimension (in bytes)
+ * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in  bytes)
+ * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in  bytes)
+ * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source image
+ * @param[out] output_ptr                           Pointer to the destination image. Supported data types: same as @p inpu
+t_ptr
+ * @param[in]  output_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  output_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in
+bytes)
+ * @param[in]  output_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  output_stride_w                      Stride of the source tensor in W dimension (in bytes)
+ * @param[in]  output_step_w                        output_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void depth_to_space_nhwc(
+     TENSOR4D_DECLARATION(input),
+     TENSOR4D_DECLARATION(output))
+ {
+    Tensor4D in  = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, 0);
+    Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, Z_OUT);
+
+    int out_index[4]={0};
+    int in_index[4]={0};
+
+    out_index[0] = get_global_id(0);//C
+    out_index[1] = get_global_id(1);//W
+    out_index[2] = get_global_id(2) % Z_OUT;//H
+    out_index[3] = get_global_id(2) / Z_OUT;//B
+
+    in_index[0] = out_index[0] + ((out_index[2] % BLOCK_SIZE) * BLOCK_SIZE + out_index[1] % BLOCK_SIZE) * DEPTH_OUT;
+    in_index[1] = out_index[1]/BLOCK_SIZE;
+    in_index[2] = out_index[2]/BLOCK_SIZE;
+    in_index[3] = out_index[3];
+
+    *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset(&in, in_index[0], in_index[1], in_index[2],in_index[3]));
+ }
+#endif // defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(BLOCK_SIZE) && defined(Z_OUT)
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLDepthToSpaceKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLDepthToSpaceKernel.cpp
index 5604e406e..2a3433c2b 100644
--- a/libs/ARMComputeEx/src/core/CL/kernels/CLDepthToSpaceKernel.cpp
+++ b/libs/ARMComputeEx/src/core/CL/kernels/CLDepthToSpaceKernel.cpp
@@ -64,19 +64,23 @@ CLDepthToSpaceKernel::CLDepthToSpaceKernel() : _input(nullptr), _output(nullptr)
 void CLDepthToSpaceKernel::configure(const ICLTensor *input, ICLTensor *output,
                                      const int32_t block_size)
 {
-
+  // TODO Add validation of data_layout
   _input = input;
   _output = output;
 
   // Set kernel build options
+  auto layout_out = output->info()->data_layout();
   std::set<std::string> build_opts;
   build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
   build_opts.emplace("-DBLOCK_SIZE=" + support::cpp11::to_string(block_size));
-  build_opts.emplace("-DDEPTH_OUT=" + support::cpp11::to_string(output->info()->dimension(2)));
+  auto index_depth = get_data_layout_dimension_index(layout_out, DataLayoutDimension::CHANNEL);
+  auto depth = output->info()->dimension(index_depth);
+  build_opts.emplace("-DDEPTH_OUT=" + support::cpp11::to_string(depth));
+  build_opts.emplace("-DZ_OUT=" + support::cpp11::to_string(output->info()->tensor_shape().z()));
 
   // Create kernel
-  _kernel =
-      static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel("depth_to_space", build_opts));
+  _kernel = static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(
+      "depth_to_space_" + lower_string(string_from_data_layout(layout_out)), build_opts));
 
   // Configure  kernel window
   Window win = calculate_max_window(*output->info(), Steps());
author	장지섭/On-Device Lab(SR)/Engineer/삼성전자 <jiseob.jang@samsung.com>	2019-01-31 09:51:54 +0900
committer	오형석/On-Device Lab(SR)/Staff Engineer/삼성전자 <hseok82.oh@samsung.com>	2019-01-31 09:51:54 +0900
commit	ea8f7a43274fa8255816224c5e9c78400ed900f1 (patch)
tree	1dd1a8f1c0d51e5b688fbcc9a4cd9a85b24bc5e9 /libs
parent	e371768a0b988ad3572467c5225acdbb9f9e9df0 (diff)
download	nnfw-ea8f7a43274fa8255816224c5e9c78400ed900f1.tar.gz nnfw-ea8f7a43274fa8255816224c5e9c78400ed900f1.tar.bz2 nnfw-ea8f7a43274fa8255816224c5e9c78400ed900f1.zip