summaryrefslogtreecommitdiff
path: root/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/pooling_gpu_fs_bs_yx_bsv4_fsv32.cl
diff options
context:
space:
mode:
Diffstat (limited to 'inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/pooling_gpu_fs_bs_yx_bsv4_fsv32.cl')
-rw-r--r--inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/pooling_gpu_fs_bs_yx_bsv4_fsv32.cl169
1 files changed, 169 insertions, 0 deletions
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/pooling_gpu_fs_bs_yx_bsv4_fsv32.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/pooling_gpu_fs_bs_yx_bsv4_fsv32.cl
new file mode 100644
index 000000000..130cd8cca
--- /dev/null
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/pooling_gpu_fs_bs_yx_bsv4_fsv32.cl
@@ -0,0 +1,169 @@
+// Copyright (c) 2018 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+
+#include "include/include_all.cl"
+
+#if MAX_POOLING
+ #define INIT_VAL CHAR_MIN
+#elif AVG_POOLING
+ #define INIT_VAL 0
+#else
+#error
+#endif
+
+
+inline int FUNC(apply_pooling)(int tmp, int in)
+{
+#if MAX_POOLING
+ return max(tmp, in);
+#elif AVG_POOLING
+ return tmp + in;
+#endif
+}
+
+__attribute__((intel_reqd_sub_group_size(8)))
+KERNEL(pooling_gpu_fs_bs_yx_bsv4_fsv32)(
+ const __global UNIT_TYPE* input,
+ __global UNIT_TYPE* output)
+{
+ const uint x = (uint)get_global_id(0);
+ const uint y = (uint)get_global_id(1);
+ const uint bf = (uint)get_global_id(2);
+ // we process 4 features per workitem that's why we need to divide it
+ const uint aligned32_features = ((INPUT0_FEATURE_NUM + 31) / 32) * 32;
+ const uint f = 4 * (bf % (aligned32_features / 4));
+ const uint b_block = bf / (aligned32_features / 4);
+
+ if (x >= OUTPUT_SIZE_X)
+ {
+ return;
+ }
+
+ const int offset_x = (int)x*STRIDE_SIZE_X - PADDING_SIZE_X;
+ const int offset_y = (int)y*STRIDE_SIZE_Y - PADDING_SIZE_Y;
+
+ int4 result[4];
+ for(uint b = 0; b < 4; b++)
+ {
+ result[b] = INIT_VAL;
+ }
+
+#ifdef CHECK_BOUNDRY
+ if (offset_x + POOL_SIZE_X < 0 || offset_x >= INPUT0_SIZE_X ||
+ offset_y + POOL_SIZE_Y < 0 || offset_y >= INPUT0_SIZE_Y)
+ {
+ return;
+ }
+
+#ifdef DYNAMIC_KERNEL_DIVIDER
+ uint num_elementes = 0;
+#endif
+
+ const uint batch_and_feature_offset = GET_DATA_FS_BS_YX_BSV4_FSV32_INDEX(INPUT0, b_block * 4, f, 0, 0);
+ for(uint j = 0; j < POOL_SIZE_Y; j++)
+ {
+ int input_offset_y = offset_y + j;
+ bool zero_y = input_offset_y >= INPUT0_SIZE_Y || input_offset_y < 0;
+ if(!zero_y)
+ {
+ for(uint i = 0; i < POOL_SIZE_X; i++)
+ {
+ int input_offset_x = offset_x + i;
+ bool zero = input_offset_x >= INPUT0_SIZE_X || input_offset_x < 0;
+ if(!zero)
+ {
+ const uint input_idx = batch_and_feature_offset + input_offset_y*IN_Y_PITCH + input_offset_x*IN_X_PITCH;
+
+ int4 int_data = as_int4(intel_sub_group_block_read4((const __global uint*)(input + input_idx)));
+ for(uint b = 0; b < 4; b++)
+ {
+ char4 input_data = as_char4(int_data[b]);
+ result[b][0] = FUNC_CALL(apply_pooling)(result[b][0], (int)input_data[0]);
+ result[b][1] = FUNC_CALL(apply_pooling)(result[b][1], (int)input_data[1]);
+ result[b][2] = FUNC_CALL(apply_pooling)(result[b][2], (int)input_data[2]);
+ result[b][3] = FUNC_CALL(apply_pooling)(result[b][3], (int)input_data[3]);
+
+ }
+
+#ifdef DYNAMIC_KERNEL_DIVIDER
+ num_elementes++;
+#endif
+ }
+ }
+ }
+ }
+#ifdef DYNAMIC_WITH_PADDING_KERNEL_DIVIDER
+ const int hend = min(offset_y + POOL_SIZE_Y, INPUT0_SIZE_Y + PADDING_SIZE_Y);
+ const int wend = min(offset_x + POOL_SIZE_X, INPUT0_SIZE_X + PADDING_SIZE_X);
+ const uint num_elementes = (hend - offset_y) * (wend - offset_x);
+#endif
+#else
+ uint input_idx = GET_DATA_FS_BS_YX_BSV4_FSV32_INDEX(INPUT0, b_block * 4, f, offset_y, offset_x);
+
+ for(uint j = 0; j < POOL_SIZE_Y; j++)
+ {
+ for(uint i = 0; i < POOL_SIZE_X; i++)
+ {
+ int4 int_data = as_int4(intel_sub_group_block_read4((const __global uint*)(input + input_idx)));
+ for(uint b = 0; b < 4; b++)
+ {
+ char4 input_data = as_char4(int_data[b]);
+ result[b][0] = FUNC_CALL(apply_pooling)(result[b][0], (int)input_data[0]);
+ result[b][1] = FUNC_CALL(apply_pooling)(result[b][1], (int)input_data[1]);
+ result[b][2] = FUNC_CALL(apply_pooling)(result[b][2], (int)input_data[2]);
+ result[b][3] = FUNC_CALL(apply_pooling)(result[b][3], (int)input_data[3]);
+ }
+
+ input_idx += IN_X_PITCH;
+ }
+ input_idx += (IN_Y_PITCH - POOL_SIZE_X*IN_X_PITCH);
+ }
+
+#if defined(DYNAMIC_KERNEL_DIVIDER) || defined(DYNAMIC_WITH_PADDING_KERNEL_DIVIDER)
+ const uint num_elementes = POOL_SIZE_X*POOL_SIZE_Y;
+#endif
+#endif
+
+#if defined AVG_POOLING
+ #if defined(DYNAMIC_KERNEL_DIVIDER) || defined(DYNAMIC_WITH_PADDING_KERNEL_DIVIDER)
+ for(uint b = 0; b < 4; b++)
+ {
+ for(uint i = 0; i < 4; i++)
+ {
+ result[b][i] = convert_int(round(((float)result[b][i] / max(num_elementes, (uint)1)));
+ }
+ }
+ #else
+ for(uint b = 0; b < 4; b++)
+ {
+ for(uint i = 0; i < 4; i++)
+ {
+ result[b][i] = convert_int(round((float)result[b][i] / (int)(POOL_SIZE_Y * POOL_SIZE_X)));
+ }
+ }
+ #endif
+#endif
+
+for(uint b = 0; b < 4; b++)
+{
+ for(uint op = 0; op < 4; op++)
+ {
+ const uint output_pos = GET_DATA_FS_BS_YX_BSV4_FSV32_INDEX(OUTPUT, b_block*4 + b, f+op, y, x);
+ output[output_pos] = ACTIVATION(convert_char(result[b][op]), NL_M ,NL_N);
+ }
+}
+}
+
+#undef INIT_VAL \ No newline at end of file