summaryrefslogtreecommitdiff
path: root/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_mmad.cl
diff options
context:
space:
mode:
Diffstat (limited to 'inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_mmad.cl')
-rw-r--r-- inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_mmad.cl | 134
1 files changed, 134 insertions, 0 deletions
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_mmad.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_mmad.cl
new file mode 100644
index 000000000..78d2474c1
--- /dev/null
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_mmad.cl
@@ -0,0 +1,134 @@
+// Copyright (c) 2016-2017 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "include/common.cl"
+#include "include/activation_functions.cl"
+#include "include/data_types.cl"
+#include "include/fetch.cl"
+#include "include/mmad.cl"
+
+// Number of 32-channel input-feature (IFM) slices needed to cover all input
+// channels: ceil(FILTER_IFM_NUM / 32). The MMAD loop below consumes
+// activations 32 int8 channels at a time.
+#define FILTER_IFM_MMAD_NUM ((FILTER_IFM_NUM + 31) / 32)
+// Number of 8-wide output-feature (OFM) groups: ceil(FILTER_OFM_NUM / 8).
+#define FILTER_OFM_MMAD_NUM ((FILTER_OFM_NUM + 7) / 8)
+// Channel counts rounded up to the slice/group boundaries above.
+#define FILTER_IFM_ALIGNED (FILTER_IFM_MMAD_NUM * 32)
+#define FILTER_OFM_ALIGNED (FILTER_OFM_MMAD_NUM * 8)
+
+// Int8 convolution forward kernel built on Intel MMAD packed dot-product
+// accumulation (MMAD_8 from include/mmad.cl).
+//
+// Each work-item computes exactly one output element (b, f, y, x): it walks
+// the filter window, accumulates 32-channel int8 dot products into a 32-bit
+// accumulator, then applies the optional bias / quantization / calibration
+// terms and the activation before writing the single result.
+//
+// All shape, pitch, stride, padding and dilation parameters, plus the
+// BIAS_TERM / QUANTIZATION_TERM / CALIBRATION_TERM feature switches, are
+// compile-time macros injected by the kernel selector.
+__attribute__((intel_reqd_sub_group_size(SUB_GROUP_SIZE)))
+KERNEL(convolution_MMAD)(
+    __global INPUT0_TYPE* input,
+    __global OUTPUT_TYPE* output,
+    __global FILTER_TYPE* weights,
+#if BIAS_TERM
+    __global BIAS_TYPE* biases,
+#endif
+#if QUANTIZATION_TERM
+    const __global float* quantizations,  // per-output-feature scale (indexed by f below)
+#endif
+#if CALIBRATION_TERM
+    const __global float* calibrations,   // per-output-feature calibration scale (indexed by f)
+#endif
+    uint split_idx)
+{
+    // Output spatial coordinates handled by this work-item.
+    const uint x = get_global_id(0);
+    const uint y = get_global_id(1);
+#if OUTPUT_BATCH_NUM == 1
+    const uint f = get_global_id(2);
+    const uint b = 0;
+#else
+    // Dimension 2 packs (batch, feature), with features padded up to a
+    // multiple of 8 (FILTER_OFM_ALIGNED).
+    const uint f = get_global_id(2) % FILTER_OFM_ALIGNED;
+    const uint b = get_global_id(2) / FILTER_OFM_ALIGNED;
+#endif
+
+#if QUANTIZATION_TERM
+    int dotProd = 0;                    // 32-bit accumulator for int8 products
+#else
+    UNIT_TYPE dotProd = UNIT_VAL_ZERO;
+#endif
+
+    // Top-left corner of this output's receptive field in input coordinates;
+    // may be negative because of padding (handled by the zero_x/zero_y skips).
+    const int input_x = x * STRIDE_SIZE_X - PADDING_SIZE_X;
+    const int input_y = y * STRIDE_SIZE_Y - PADDING_SIZE_Y;
+
+    // Depth-split support: each split reads its own FILTER_IFM_NUM channels.
+    const uint in_split_offset = split_idx * INPUT0_FEATURE_PITCH * FILTER_IFM_NUM;
+
+    // Base of the weight block for this work-group's 8-OFM group.
+    const uint filter_offset = (get_group_id(2) % FILTER_OFM_MMAD_NUM) * FILTER_OFM_BLOCK_PITCH;
+    const uint input_offset = b*INPUT0_BATCH_PITCH + INPUT0_OFFSET + in_split_offset;
+
+    // k iterates over 32-channel IFM slices; j/i walk the filter window.
+    for (uint k = 0; k < FILTER_IFM_MMAD_NUM; ++k)
+    {
+        for (uint j = 0; j < FILTER_SIZE_Y ; ++j)
+        {
+            const int input_offset_y = input_y + j * DILATION_SIZE_Y;
+            const bool zero_y = input_offset_y >= INPUT0_SIZE_Y || input_offset_y < 0;
+
+            // Out-of-bounds rows contribute zero (implicit zero padding),
+            // so the whole row is skipped.
+            if(!zero_y)
+            {
+                for (uint i = 0; i < FILTER_SIZE_X ; ++i)
+                {
+                    const int input_offset_x = input_x + i * DILATION_SIZE_X;
+                    const bool zero_x = input_offset_x >= INPUT0_SIZE_X || input_offset_x < 0;
+
+                    if(!zero_x)
+                    {
+                        // NOTE(review): the k*32 byte step assumes each IFM
+                        // slice's 32 int8 channels are stored contiguously at
+                        // every (x, y) position — confirm against the input
+                        // layout chosen by the kernel selector.
+                        uint input_idx = input_offset + (uint)input_offset_x*INPUT0_X_PITCH + (uint)input_offset_y*INPUT0_Y_PITCH + k*32;
+                        // NOTE(review): the IFM-slice stride is expressed as
+                        // FILTER_Y_PITCH * FILTER_SIZE_Y rather than a
+                        // dedicated pitch macro — assumes slices are laid out
+                        // contiguously after the Y dimension.
+                        uint filter_idx = filter_offset + k*FILTER_Y_PITCH * FILTER_SIZE_Y + j*FILTER_Y_PITCH + i*FILTER_X_PITCH;
+
+                        // Each lane block-reads one uint (4 packed int8
+                        // channels); broadcasting lanes 0..7 gathers the full
+                        // 32-channel slice into every lane (assumes
+                        // SUB_GROUP_SIZE == 8 — TODO confirm).
+                        int input_data = as_int(intel_sub_group_block_read((const __global uint*)(input + input_idx)));
+                        int8 activations; //activations of all lanes
+                        activations.s0 = sub_group_broadcast(input_data, 0);
+                        activations.s1 = sub_group_broadcast(input_data, 1);
+                        activations.s2 = sub_group_broadcast(input_data, 2);
+                        activations.s3 = sub_group_broadcast(input_data, 3);
+                        activations.s4 = sub_group_broadcast(input_data, 4);
+                        activations.s5 = sub_group_broadcast(input_data, 5);
+                        activations.s6 = sub_group_broadcast(input_data, 6);
+                        activations.s7 = sub_group_broadcast(input_data, 7);
+
+                        // 8 uints of weights per lane for this filter tap.
+                        int8 weights_data = as_int8(intel_sub_group_block_read8((const __global uint*)(weights + filter_idx)));
+
+                        // Accumulate 8 packed int8x4 dot products
+                        // (see include/mmad.cl for MMAD_8).
+                        dotProd = MMAD_8(activations, weights_data, dotProd);
+                    }
+                }
+            }
+        }
+    }
+
+#if BIAS_TERM
+#if BIAS_PER_OUTPUT
+    // One bias value per output element.
+    const uint bias_index = GET_DATA_INDEX(BIAS, b, f, y, x);
+#elif BIAS_PER_OFM
+    // One bias value per output feature map.
+    const uint bias_index = f;
+#endif
+#if QUANTIZATION_TERM
+    // Requantize: scale the int32 accumulator by the per-feature quantization
+    // factor and I_QF (presumably the input quantization factor — confirm),
+    // add bias, then rescale by the per-feature calibration (or the global
+    // O_QF output factor) and round.
+#if CALIBRATION_TERM
+    dotProd = (UNIT_TYPE)round(((float)dotProd * quantizations[f] * I_QF + biases[bias_index]) * calibrations[f]);
+#else // CALIBRATION_TERM
+    dotProd = (UNIT_TYPE)round(((float)dotProd * quantizations[f] * I_QF + biases[bias_index]) * O_QF);
+#endif // CALIBRATION_TERM
+#else // QUANTIZATION_TERM
+    dotProd += (UNIT_TYPE)biases[bias_index];
+#endif // QUANTIZATION_TERM
+#endif // BIAS_TERM
+
+    // Mirror the input-side split offset on the output side.
+    const uint out_split_offset = split_idx * OUTPUT_FEATURE_PITCH * OUTPUT_FEATURE_NUM;
+    const uint dst_index = GET_DATA_INDEX(OUTPUT, b, f, y, x) + out_split_offset;
+#if QUANTIZATION_TERM
+    // NOTE(review): convert_char (not convert_char_sat) — values outside the
+    // char range wrap rather than saturate; verify this is intended.
+    output[dst_index] = ACTIVATION(convert_char(dotProd), NL_M, NL_N);
+#else
+    output[dst_index] = ACTIVATION(dotProd, NL_M, NL_N);
+#endif
+}
+
+#undef FILTER_IFM_MMAD_NUM
+#undef FILTER_OFM_MMAD_NUM
+#undef FILTER_IFM_ALIGNED
+#undef FILTER_OFM_ALIGNED \ No newline at end of file