diff options
Diffstat (limited to 'inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_mmad.cl')
-rw-r--r-- | inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_mmad.cl | 134 |
1 files changed, 134 insertions, 0 deletions
// Copyright (c) 2016-2017 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "include/common.cl"
#include "include/activation_functions.cl"
#include "include/data_types.cl"
#include "include/fetch.cl"
#include "include/mmad.cl"

// Number of 32-channel input-feature-map slices (rounded up) — the k-loop
// below iterates once per slice and advances the input pointer by k*32.
#define FILTER_IFM_MMAD_NUM ((FILTER_IFM_NUM + 31) / 32)
// Number of 8-wide output-feature-map groups (rounded up); used to split
// get_global_id(2) into (feature, batch) and to index the weight blocks.
#define FILTER_OFM_MMAD_NUM ((FILTER_OFM_NUM + 7) / 8)
// Feature counts padded to the 32-IFM / 8-OFM tile granularity.
#define FILTER_IFM_ALIGNED (FILTER_IFM_MMAD_NUM * 32)
#define FILTER_OFM_ALIGNED (FILTER_OFM_MMAD_NUM * 8)

// Quantized (int8) direct convolution built on the MMAD_8 primitive from
// include/mmad.cl. One work-item produces one output element at (b, f, y, x);
// the sub-group cooperates via intel_sub_group_block_read / broadcast to
// share input data across lanes.
//
// Parameters (all macro-typed; layouts/pitches come from build-time defines):
//   input         - input tensor; NOTE(review): the k*32 stride and the
//                   block-read below assume 32 input channels are packed
//                   contiguously per spatial position — confirm against the
//                   layout selected by the kernel selector.
//   output        - output tensor, written once per work-item.
//   weights       - filter tensor, read in 8-int blocks per sub-group.
//   biases        - optional, compiled in only when BIAS_TERM is defined.
//   quantizations - optional per-OFM dequantization factors (QUANTIZATION_TERM).
//   calibrations  - optional per-OFM calibration factors (CALIBRATION_TERM).
//   split_idx     - index of the current split (grouped-convolution slice);
//                   offsets both input and output feature ranges.
__attribute__((intel_reqd_sub_group_size(SUB_GROUP_SIZE)))
KERNEL(convolution_MMAD)(
    __global INPUT0_TYPE* input,
    __global OUTPUT_TYPE* output,
    __global FILTER_TYPE* weights,
#if BIAS_TERM
    __global BIAS_TYPE* biases,
#endif
#if QUANTIZATION_TERM
    const __global float* quantizations,
#endif
#if CALIBRATION_TERM
    const __global float* calibrations,
#endif
    uint split_idx)
{
    // Output spatial position handled by this work-item.
    const uint x = get_global_id(0);
    const uint y = get_global_id(1);
#if OUTPUT_BATCH_NUM == 1
    // Single batch: dimension 2 maps directly to the output feature.
    const uint f = get_global_id(2);
    const uint b = 0;
#else
    // Batched: dimension 2 packs (batch, feature) using the aligned OFM count.
    const uint f = get_global_id(2) % FILTER_OFM_ALIGNED;
    const uint b = get_global_id(2) / FILTER_OFM_ALIGNED;
#endif

    // Accumulator: 32-bit int for the quantized path (int8 x int8 products),
    // otherwise the unit type of the non-quantized build.
#if QUANTIZATION_TERM
    int dotProd = 0;
#else
    UNIT_TYPE dotProd = UNIT_VAL_ZERO;
#endif

    // Top-left input coordinate of the receptive field (may be negative
    // inside the padding region; bounds-checked per tap below).
    const int input_x = x * STRIDE_SIZE_X - PADDING_SIZE_X;
    const int input_y = y * STRIDE_SIZE_Y - PADDING_SIZE_Y;

    // Offset to this split's slice of input features.
    const uint in_split_offset = split_idx * INPUT0_FEATURE_PITCH * FILTER_IFM_NUM;

    // Each work-group along dim 2 covers one 8-OFM block of the weights.
    const uint filter_offset = (get_group_id(2) % FILTER_OFM_MMAD_NUM) * FILTER_OFM_BLOCK_PITCH;
    const uint input_offset = b*INPUT0_BATCH_PITCH + INPUT0_OFFSET + in_split_offset;

    // k: 32-channel IFM slice; (j, i): filter tap position.
    for (uint k = 0; k < FILTER_IFM_MMAD_NUM; ++k)
    {
        for (uint j = 0; j < FILTER_SIZE_Y ; ++j)
        {
            const int input_offset_y = input_y + j * DILATION_SIZE_Y;
            const bool zero_y = input_offset_y >= INPUT0_SIZE_Y || input_offset_y < 0;

            // Taps that fall into the zero-padding contribute nothing; skip
            // the whole row (and below, the single column) instead of reading.
            if(!zero_y)
            {
                for (uint i = 0; i < FILTER_SIZE_X ; ++i)
                {
                    const int input_offset_x = input_x + i * DILATION_SIZE_X;
                    const bool zero_x = input_offset_x >= INPUT0_SIZE_X || input_offset_x < 0;

                    if(!zero_x)
                    {
                        // k*32 advances over packed 32-channel slices;
                        // NOTE(review): filter_idx uses FILTER_Y_PITCH both
                        // for the per-slice (k) and per-row (j) strides —
                        // presumably the IFM-slice pitch equals
                        // FILTER_Y_PITCH * FILTER_SIZE_Y in this layout;
                        // confirm against the weight reorder.
                        uint input_idx = input_offset + (uint)input_offset_x*INPUT0_X_PITCH + (uint)input_offset_y*INPUT0_Y_PITCH + k*32;
                        uint filter_idx = filter_offset + k*FILTER_Y_PITCH * FILTER_SIZE_Y + j*FILTER_Y_PITCH + i*FILTER_X_PITCH;

                        // Each lane block-reads 4 packed int8 input values
                        // (one uint); broadcasting all 8 lanes' reads gives
                        // every lane the full 32-channel slice.
                        int input_data = as_int(intel_sub_group_block_read((const __global uint*)(input + input_idx)));
                        int8 activations;  //activations of all lanes
                        activations.s0 = sub_group_broadcast(input_data, 0);
                        activations.s1 = sub_group_broadcast(input_data, 1);
                        activations.s2 = sub_group_broadcast(input_data, 2);
                        activations.s3 = sub_group_broadcast(input_data, 3);
                        activations.s4 = sub_group_broadcast(input_data, 4);
                        activations.s5 = sub_group_broadcast(input_data, 5);
                        activations.s6 = sub_group_broadcast(input_data, 6);
                        activations.s7 = sub_group_broadcast(input_data, 7);

                        // 8 uints of weights per lane: this lane's OFM
                        // weights for the 32 channels of the current slice.
                        int8 weights_data = as_int8(intel_sub_group_block_read8((const __global uint*)(weights + filter_idx)));

                        // 32-channel int8 dot product accumulated into dotProd
                        // (MMAD_8 defined in include/mmad.cl).
                        dotProd = MMAD_8(activations, weights_data, dotProd);
                    }
                }
            }
        }
    }

#if BIAS_TERM
#if BIAS_PER_OUTPUT
    // One bias per output element.
    const uint bias_index = GET_DATA_INDEX(BIAS, b, f, y, x);
#elif BIAS_PER_OFM
    // One bias per output feature map.
    const uint bias_index = f;
#endif
#if QUANTIZATION_TERM
    // Dequantize (per-OFM scale * input quantization factor), add bias, then
    // requantize with either a per-OFM calibration factor or the global O_QF.
#if CALIBRATION_TERM
    dotProd = (UNIT_TYPE)round(((float)dotProd * quantizations[f] * I_QF + biases[bias_index]) * calibrations[f]);
#else  // CALIBRATION_TERM
    dotProd = (UNIT_TYPE)round(((float)dotProd * quantizations[f] * I_QF + biases[bias_index]) * O_QF);
#endif // CALIBRATION_TERM
#else  // QUANTIZATION_TERM
    dotProd += (UNIT_TYPE)biases[bias_index];
#endif // QUANTIZATION_TERM
#endif // BIAS_TERM

    // Write the activated result to this split's slice of the output.
    const uint out_split_offset = split_idx * OUTPUT_FEATURE_PITCH * OUTPUT_FEATURE_NUM;
    const uint dst_index = GET_DATA_INDEX(OUTPUT, b, f, y, x) + out_split_offset;
#if QUANTIZATION_TERM
    // Quantized path narrows to int8 before applying the activation.
    output[dst_index] = ACTIVATION(convert_char(dotProd), NL_M, NL_N);
#else
    output[dst_index] = ACTIVATION(dotProd, NL_M, NL_N);
#endif
}

#undef FILTER_IFM_MMAD_NUM
#undef FILTER_OFM_MMAD_NUM
#undef FILTER_IFM_ALIGNED
#undef FILTER_OFM_ALIGNED