diff options
Diffstat (limited to 'inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_mmad.cl')
-rw-r--r-- | inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_mmad.cl | 134 |
1 files changed, 134 insertions, 0 deletions
// Copyright (c) 2016-2017 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "include/common.cl"
#include "include/activation_functions.cl"
#include "include/data_types.cl"
#include "include/fetch.cl"
#include "include/mmad.cl"

// Number of 32-channel input-feature-map slices (rounded up) — the k-loop
// below iterates once per slice and advances the input pointer by k*32.
#define FILTER_IFM_MMAD_NUM ((FILTER_IFM_NUM + 31) / 32)
// Number of 8-wide output-feature-map groups (rounded up); used to split
// get_global_id(2) into (feature, batch) and to index the weight blocks.
#define FILTER_OFM_MMAD_NUM ((FILTER_OFM_NUM + 7) / 8)
// Feature counts padded to the 32-IFM / 8-OFM tile granularity.
#define FILTER_IFM_ALIGNED (FILTER_IFM_MMAD_NUM * 32)
#define FILTER_OFM_ALIGNED (FILTER_OFM_MMAD_NUM * 8)

// Quantized (int8) direct convolution built on the MMAD_8 primitive from
// include/mmad.cl. One work-item produces one output element at (b, f, y, x);
// the sub-group cooperates via intel_sub_group_block_read / broadcast to
// share input data across lanes.
//
// Parameters (all macro-typed; layouts/pitches come from build-time defines):
//   input         - input tensor; NOTE(review): the k*32 stride and the
//                   block-read below assume 32 input channels are packed
//                   contiguously per spatial position — confirm against the
//                   layout selected by the kernel selector.
//   output        - output tensor, written once per work-item.
//   weights       - filter tensor, read in 8-int blocks per sub-group.
//   biases        - optional, compiled in only when BIAS_TERM is defined.
//   quantizations - optional per-OFM dequantization factors (QUANTIZATION_TERM).
//   calibrations  - optional per-OFM calibration factors (CALIBRATION_TERM).
//   split_idx     - index of the current split (grouped-convolution slice);
//                   offsets both input and output feature ranges.
__attribute__((intel_reqd_sub_group_size(SUB_GROUP_SIZE)))
KERNEL(convolution_MMAD)(
    __global INPUT0_TYPE* input,
    __global OUTPUT_TYPE* output,
    __global FILTER_TYPE* weights,
#if BIAS_TERM
    __global BIAS_TYPE* biases,
#endif
#if QUANTIZATION_TERM
    const __global float* quantizations,
#endif
#if CALIBRATION_TERM
    const __global float* calibrations,
#endif
    uint split_idx)
{
    // Output spatial position handled by this work-item.
    const uint x = get_global_id(0);
    const uint y = get_global_id(1);
#if OUTPUT_BATCH_NUM == 1
    // Single batch: dimension 2 maps directly to the output feature.
    const uint f = get_global_id(2);
    const uint b = 0;
#else
    // Batched: dimension 2 packs (batch, feature) using the aligned OFM count.
    const uint f = get_global_id(2) % FILTER_OFM_ALIGNED;
    const uint b = get_global_id(2) / FILTER_OFM_ALIGNED;
#endif

    // Accumulator: 32-bit int for the quantized path (int8 x int8 products),
    // otherwise the unit type of the non-quantized build.
#if QUANTIZATION_TERM
    int dotProd = 0;
#else
    UNIT_TYPE dotProd = UNIT_VAL_ZERO;
#endif

    // Top-left input coordinate of the receptive field (may be negative
    // inside the padding region; bounds-checked per tap below).
    const int input_x = x * STRIDE_SIZE_X - PADDING_SIZE_X;
    const int input_y = y * STRIDE_SIZE_Y - PADDING_SIZE_Y;

    // Offset to this split's slice of input features.
    const uint in_split_offset = split_idx * INPUT0_FEATURE_PITCH * FILTER_IFM_NUM;

    // Each work-group along dim 2 covers one 8-OFM block of the weights.
    const uint filter_offset = (get_group_id(2) % FILTER_OFM_MMAD_NUM) * FILTER_OFM_BLOCK_PITCH;
    const uint input_offset = b*INPUT0_BATCH_PITCH + INPUT0_OFFSET + in_split_offset;

    // k: 32-channel IFM slice; (j, i): filter tap position.
    for (uint k = 0; k < FILTER_IFM_MMAD_NUM; ++k)
    {
        for (uint j = 0; j < FILTER_SIZE_Y ; ++j)
        {
            const int input_offset_y = input_y + j * DILATION_SIZE_Y;
            const bool zero_y = input_offset_y >= INPUT0_SIZE_Y || input_offset_y < 0;

            // Taps that fall into the zero-padding contribute nothing; skip
            // the whole row (and below, the single column) instead of reading.
            if(!zero_y)
            {
                for (uint i = 0; i < FILTER_SIZE_X ; ++i)
                {
                    const int input_offset_x = input_x + i * DILATION_SIZE_X;
                    const bool zero_x = input_offset_x >= INPUT0_SIZE_X || input_offset_x < 0;

                    if(!zero_x)
                    {
                        // k*32 advances over packed 32-channel slices;
                        // NOTE(review): filter_idx uses FILTER_Y_PITCH both
                        // for the per-slice (k) and per-row (j) strides —
                        // presumably the IFM-slice pitch equals
                        // FILTER_Y_PITCH * FILTER_SIZE_Y in this layout;
                        // confirm against the weight reorder.
                        uint input_idx = input_offset + (uint)input_offset_x*INPUT0_X_PITCH + (uint)input_offset_y*INPUT0_Y_PITCH + k*32;
                        uint filter_idx = filter_offset + k*FILTER_Y_PITCH * FILTER_SIZE_Y + j*FILTER_Y_PITCH + i*FILTER_X_PITCH;

                        // Each lane block-reads 4 packed int8 input values
                        // (one uint); broadcasting all 8 lanes' reads gives
                        // every lane the full 32-channel slice.
                        int input_data = as_int(intel_sub_group_block_read((const __global uint*)(input + input_idx)));
                        int8 activations;  //activations of all lanes
                        activations.s0 = sub_group_broadcast(input_data, 0);
                        activations.s1 = sub_group_broadcast(input_data, 1);
                        activations.s2 = sub_group_broadcast(input_data, 2);
                        activations.s3 = sub_group_broadcast(input_data, 3);
                        activations.s4 = sub_group_broadcast(input_data, 4);
                        activations.s5 = sub_group_broadcast(input_data, 5);
                        activations.s6 = sub_group_broadcast(input_data, 6);
                        activations.s7 = sub_group_broadcast(input_data, 7);

                        // 8 uints of weights per lane: this lane's OFM
                        // weights for the 32 channels of the current slice.
                        int8 weights_data = as_int8(intel_sub_group_block_read8((const __global uint*)(weights + filter_idx)));

                        // 32-channel int8 dot product accumulated into dotProd
                        // (MMAD_8 defined in include/mmad.cl).
                        dotProd = MMAD_8(activations, weights_data, dotProd);
                    }
                }
            }
        }
    }

#if BIAS_TERM
#if BIAS_PER_OUTPUT
    // One bias per output element.
    const uint bias_index = GET_DATA_INDEX(BIAS, b, f, y, x);
#elif BIAS_PER_OFM
    // One bias per output feature map.
    const uint bias_index = f;
#endif
#if QUANTIZATION_TERM
    // Dequantize (per-OFM scale * input quantization factor), add bias, then
    // requantize with either a per-OFM calibration factor or the global O_QF.
#if CALIBRATION_TERM
    dotProd = (UNIT_TYPE)round(((float)dotProd * quantizations[f] * I_QF + biases[bias_index]) * calibrations[f]);
#else  // CALIBRATION_TERM
    dotProd = (UNIT_TYPE)round(((float)dotProd * quantizations[f] * I_QF + biases[bias_index]) * O_QF);
#endif // CALIBRATION_TERM
#else  // QUANTIZATION_TERM
    dotProd += (UNIT_TYPE)biases[bias_index];
#endif // QUANTIZATION_TERM
#endif // BIAS_TERM

    // Write the activated result to this split's slice of the output.
    const uint out_split_offset = split_idx * OUTPUT_FEATURE_PITCH * OUTPUT_FEATURE_NUM;
    const uint dst_index = GET_DATA_INDEX(OUTPUT, b, f, y, x) + out_split_offset;
#if QUANTIZATION_TERM
    // Quantized path narrows to int8 before applying the activation.
    output[dst_index] = ACTIVATION(convert_char(dotProd), NL_M, NL_N);
#else
    output[dst_index] = ACTIVATION(dotProd, NL_M, NL_N);
#endif
}

#undef FILTER_IFM_MMAD_NUM
#undef FILTER_OFM_MMAD_NUM
#undef FILTER_IFM_ALIGNED
#undef FILTER_OFM_ALIGNED