// Copyright (c) 2016-2017 Intel Corporation // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. #include "include/common.cl" #include "include/data_types.cl" #ifdef FORCE_SIMD_16 __attribute__((intel_reqd_sub_group_size(16))) #endif KERNEL (lrn_gpu_across_channel_multiple_features)(const __global INPUT0_TYPE* input, __global OUTPUT_TYPE* output) { #if defined OUTPUT_LAYOUT_BFYX // PERF NOTE: SIMD IS OVER global_id(0) so in SIMD global_id(1) and global_id(2) does not change, so we can use group_id to have SIMD1 instructions const uint x = get_global_id(0); const uint y = get_group_id(1); const uint b_f = get_group_id(2); const uint batch_id = (b_f * OFM_PER_SIMD) / INPUT0_FEATURE_NUM; const uint feature_id = (b_f % (INPUT0_FEATURE_NUM / OFM_PER_SIMD)) * OFM_PER_SIMD; if (x >= INPUT0_SIZE_X) return; #elif defined OUTPUT_LAYOUT_YXFB const uint b_f = get_global_id(0); const uint x = get_group_id(1); const uint y = get_group_id(2); const uint feature_id = (b_f / INPUT0_BATCH_NUM) * OFM_PER_SIMD; const uint batch_id = b_f % INPUT0_BATCH_NUM; #endif uint input_id = INPUT0_OFFSET + batch_id*INPUT0_BATCH_PITCH + feature_id*INPUT0_FEATURE_PITCH + y*INPUT0_Y_PITCH + x*INPUT0_X_PITCH; int input_offset_f = feature_id - PADDING; uint input_idx = input_id - PADDING*INPUT0_FEATURE_PITCH; input_idx = MULTIPLY_OFFSET(UNIT_TYPE, input_idx); UNIT_TYPE vals[OFM_PER_SIMD]; UNIT_TYPE results[OFM_PER_SIMD] = { UNIT_VAL_ZERO }; // prefetch for(uint i = 0; i < OFM_PER_SIMD; i++) { bool zero = input_offset_f < 0 || input_offset_f >= INPUT0_FEATURE_NUM; vals[i] = zero ? UNIT_VAL_ZERO : TO_UNIT_TYPE(ALPHA_VAL_FACTOR_DIV_BY_SIZE) * (*OFFSET_GLOBAL_PTR(UNIT_TYPE, input, input_idx)); input_offset_f++; input_idx += MULTIPLY_OFFSET(UNIT_TYPE, INPUT0_FEATURE_PITCH); } for (uint i = 0; i < LOCAL_SIZE-1; i++) { for(uint j = 0; j < OFM_PER_SIMD; j++) { results[j] = mad(vals[j], vals[j], results[j]); } for(uint j = 0; j < OFM_PER_SIMD-1; j++) { vals[j] = vals[j+1]; } bool zero = input_offset_f < 0 || input_offset_f >= INPUT0_FEATURE_NUM; vals[OFM_PER_SIMD-1] = zero ? UNIT_VAL_ZERO : TO_UNIT_TYPE(ALPHA_VAL_FACTOR_DIV_BY_SIZE) * (*OFFSET_GLOBAL_PTR(UNIT_TYPE, input, input_idx)); input_offset_f++; input_idx += MULTIPLY_OFFSET(UNIT_TYPE, INPUT0_FEATURE_PITCH); } for(uint j = 0; j < OFM_PER_SIMD; j++) { results[j] = mad(vals[j], vals[j], results[j]); } for(uint j = 0; j < OFM_PER_SIMD; j++) { results[j] = mad(results[j], TO_UNIT_TYPE(ALPHA_DIV_BY_SIZE), TO_UNIT_TYPE(K)); results[j] = native_powr(results[j], -TO_UNIT_TYPE(BETA)); } uint output_idx = OUTPUT_OFFSET + batch_id*OUTPUT_BATCH_PITCH + feature_id*OUTPUT_FEATURE_PITCH + y*OUTPUT_Y_PITCH + x*OUTPUT_X_PITCH; for(uint j = 0; j < OFM_PER_SIMD; j++) { output[output_idx] = ACTIVATION(results[j] * input[input_id], NL_M ,NL_N); output_idx += OUTPUT_FEATURE_PITCH; input_id += INPUT0_FEATURE_PITCH; } }