summaryrefslogtreecommitdiff
path: root/libs/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'libs/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp')
-rw-r--r--libs/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp547
1 files changed, 547 insertions, 0 deletions
diff --git a/libs/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp b/libs/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp
new file mode 100644
index 000000000..d535c5da4
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp
@@ -0,0 +1,547 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/CLKernelLibraryEx.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Utils.h"
+
+#include <algorithm>
+#include <fstream>
+#include <iostream>
+#include <utility>
+#include <vector>
+
+using namespace arm_compute;
+
+const std::map<std::string, std::string> CLKernelLibraryEx::_kernel_program_map = {
+ {"absdiff", "absdiff.cl"},
+ {"accumulate", "accumulate.cl"},
+ {"accumulate_squared", "accumulate.cl"},
+ {"accumulate_weighted", "accumulate.cl"},
+ {"activation_layer", "activation_layer.cl"},
+ {"activation_layer_qa8", "activation_layer_qa8.cl"},
+ {"activation_layer_logistic_qa8", "activation_layer_qa8.cl"},
+ {"arithmetic_add", "arithmetic_op.cl"},
+ {"arithmetic_sub", "arithmetic_op.cl"},
+ {"arithmetic_add_qasymm8", "arithmetic_op_quantized.cl"},
+ {"batchnormalization_layer_nchw", "batchnormalization_layer.cl"},
+ {"batchnormalization_layer_nhwc", "batchnormalization_layer.cl"},
+ {"bitwise_or", "bitwise_op.cl"},
+ {"bitwise_and", "bitwise_op.cl"},
+ {"bitwise_xor", "bitwise_op.cl"},
+ {"bitwise_not", "bitwise_op.cl"},
+ {"cast", "cast.cl"},
+ {"cast_qasymm_in", "cast.cl"},
+ {"cast_qasymm_out", "cast.cl"},
+ {"channel_combine_NV", "channel_combine.cl"},
+ {"channel_combine_RGB888", "channel_combine.cl"},
+ {"channel_combine_RGBA8888", "channel_combine.cl"},
+ {"channel_combine_UYVY422", "channel_combine.cl"},
+ {"channel_combine_YUYV422", "channel_combine.cl"},
+ {"channel_shuffle_nchw", "channel_shuffle.cl"},
+ {"channel_extract_NV12", "channel_extract.cl"},
+ {"channel_extract_NV21", "channel_extract.cl"},
+ {"channel_extract_RGB888", "channel_extract.cl"},
+ {"channel_extract_RGBA8888", "channel_extract.cl"},
+ {"channel_extract_UYVY422", "channel_extract.cl"},
+ {"channel_extract_YUYV422", "channel_extract.cl"},
+ {"combine_gradients_L1", "canny.cl"},
+ {"combine_gradients_L2", "canny.cl"},
+ {"concatenate_depth", "concatenate.cl"},
+ {"concatenate_width", "concatenate.cl"},
+ {"convolution_rectangle", "convolution_rectangle.cl"},
+ {"col2im", "col2im.cl"},
+ {"convert_depth_down", "depth_convert.cl"},
+ {"convert_depth_up", "depth_convert.cl"},
+ {"convert_fc_weights", "convert_fc_weights.cl"},
+ {"convolution3x3_static", "convolution3x3.cl"},
+ {"convolution5x5_static", "convolution5x5.cl"},
+ {"convolution7x7_static", "convolution7x7.cl"},
+ {"convolution9x9_static", "convolution9x9.cl"},
+ {"convolution_separable1x5_static", "convolution5x5.cl"},
+ {"convolution_separable5x1_static", "convolution5x5.cl"},
+ {"convolution_separable1x7_static", "convolution7x7.cl"},
+ {"convolution_separable7x1_static", "convolution7x7.cl"},
+ {"convolution_separable1x9_static", "convolution9x9.cl"},
+ {"convolution_separable9x1_static", "convolution9x9.cl"},
+ {"copy_tensor", "copy_tensor.cl"},
+ {"copy_plane", "channel_extract.cl"},
+ {"copy_planes_3p", "channel_combine.cl"},
+ {"copy_to_keypoint", "fast_corners.cl"},
+ {"deconvolution_upsample", "deconvolution_layer.cl"},
+ {"depthwise_convolution_3x3", "depthwise_convolution.cl"},
+ {"depthwise_convolution_3x3_f16", "depthwise_convolution.cl"},
+ {"depthwise_convolution_3x3_quantized_nchw", "depthwise_convolution_quantized.cl"},
+ {"depthwise_convolution_3x3_quantized_nhwc_stride1", "depthwise_convolution_quantized.cl"},
+ {"depthwise_convolution_3x3_quantized_nhwc_stride2", "depthwise_convolution_quantized.cl"},
+ {"depthwise_convolution_3x3_stridex1_stridey1_bifrost_f16", "depthwise_convolution.cl"},
+ {"depthwise_convolution_3x3_stridex2_stridey2_bifrost_f16", "depthwise_convolution.cl"},
+ {"depthwise_convolution_3x3_stridex1_stridey1_bifrost_f32", "depthwise_convolution.cl"},
+ {"depthwise_convolution_3x3_stridex2_stridey2_bifrost_f32", "depthwise_convolution.cl"},
+ {"depthwise_im2col", "depthwise_convolution.cl"},
+ {"depthwise_vector_to_tensor", "depthwise_convolution.cl"},
+ {"depthwise_weights_reshape", "depthwise_convolution.cl"},
+ {"dequantization_layer", "dequantization_layer.cl"},
+ {"derivative", "derivative.cl"},
+ {"dilate", "dilate.cl"},
+ {"direct_convolution1x1", "direct_convolution1x1.cl"},
+ {"direct_convolution1x1_f32_bifrost", "direct_convolution1x1.cl"},
+ {"direct_convolution3x3", "direct_convolution3x3.cl"},
+ {"direct_convolution3x3_f32_bifrost", "direct_convolution3x3.cl"},
+ {"direct_convolution5x5", "direct_convolution5x5.cl"},
+ {"direct_convolution5x5_f32_bifrost", "direct_convolution5x5.cl"},
+ {"direct_convolution_1x1_3x3_5x5_quantized", "direct_convolution_1x1_3x3_5x5_quantized.cl"},
+ {"erode", "erode.cl"},
+ {"fast_corners", "fast_corners.cl"},
+ {"fill_image_borders_constant", "fill_border.cl"},
+ {"fill_image_borders_replicate", "fill_border.cl"},
+ {"finalize", "optical_flow_pyramid_lk.cl"},
+ {"floor_layer", "floor.cl"},
+ {"gather", "gather.cl"},
+ {"gather_1d", "gather.cl"},
+ {"gather_1d_out", "gather.cl"},
+ {"gaussian1x5_sub_x", "gaussian_pyramid.cl"},
+ {"gaussian5x1_sub_y", "gaussian_pyramid.cl"},
+ {"gemm_accumulate_biases", "gemm.cl"},
+ {"gemm_interleave4x4", "gemm.cl"},
+ {"gemm_ma_f16", "gemm.cl"},
+ {"gemm_ma_f32", "gemm.cl"},
+ {"gemm_ma_qs8", "gemm.cl"},
+ {"gemm_ma_qs16", "gemm.cl"},
+ {"gemm_mv", "gemv.cl"},
+ {"gemm_mv_quantized", "gemv.cl"},
+ {"gemm_mm_interleaved_transposed_f16", "gemm.cl"},
+ {"gemm_mm_interleaved_transposed_f16_bifrost", "gemm.cl"},
+ {"gemm_mm_interleaved_transposed_f32", "gemm.cl"},
+ {"gemm_mm_interleaved_transposed_f32_bifrost", "gemm.cl"},
+ {"gemm_mm_interleaved_transposed_qs8", "gemm.cl"},
+ {"gemm_mm_interleaved_transposed_qs16", "gemm.cl"},
+ {"gemm_mm_floating_point", "gemm.cl"},
+ {"gemm_mm_floating_point_f16_bifrost", "gemm.cl"},
+ {"gemm_mm_floating_point_f32_bifrost", "gemm.cl"},
+ {"gemm_mm_floating_point_f32_bifrost_1000", "gemm.cl"},
+ {"gemm_mm_qs8", "gemm.cl"},
+ {"gemm_mm_qs16", "gemm.cl"},
+ {"gemm_lc_vm_f32", "gemm.cl"},
+ {"gemm_transpose1xW", "gemm.cl"},
+ {"gemmlowp_matrix_a_reduction", "gemmlowp.cl"},
+ {"gemmlowp_matrix_b_reduction", "gemmlowp.cl"},
+ {"gemmlowp_mm_bifrost", "gemmlowp.cl"},
+ {"gemmlowp_mm_midgard", "gemmlowp.cl"},
+ {"gemmlowp_mm_interleaved_transposed_bifrost", "gemmlowp.cl"},
+ {"gemmlowp_mm_interleaved_transposed_midgard", "gemmlowp.cl"},
+ {"gemmlowp_offset_contribution", "gemmlowp.cl"},
+ {"gemmlowp_output_stage_quantize_down", "gemmlowp.cl"},
+ {"gemmlowp_output_stage_quantize_down_fixedpoint", "gemmlowp.cl"},
+ {"harris_score_3x3", "harris_corners.cl"},
+ {"harris_score_5x5", "harris_corners.cl"},
+ {"harris_score_7x7", "harris_corners.cl"},
+ {"hist_border_kernel", "histogram.cl"},
+ {"hist_border_kernel_fixed", "histogram.cl"},
+ {"hist_local_kernel", "histogram.cl"},
+ {"hist_local_kernel_fixed", "histogram.cl"},
+ {"hog_block_normalization", "hog.cl"},
+ {"hog_detector", "hog.cl"},
+ {"hog_orientation_binning", "hog.cl"},
+ {"hysteresis", "canny.cl"},
+ {"im2col1x1_stridex1_dchw", "im2col.cl"},
+ {"im2col3x3_dchw", "im2col.cl"},
+ {"im2col5x5_dchw", "im2col.cl"},
+ {"im2col11x11_padx0_pady0_dchw", "im2col.cl"},
+ {"im2col_generic_dchw", "im2col.cl"},
+ {"im2col_generic_padx0_pady0_dchw", "im2col.cl"},
+ {"im2col_reduced_dchw", "im2col.cl"},
+ {"init_level", "optical_flow_pyramid_lk.cl"},
+ {"init_level_max", "optical_flow_pyramid_lk.cl"},
+ {"init_level_max_initial_estimate", "optical_flow_pyramid_lk.cl"},
+ {"integral_horizontal", "integral_image.cl"},
+ {"integral_vertical", "integral_image.cl"},
+ {"IYUV_to_NV12_bt709", "color_convert.cl"},
+ {"IYUV_to_RGB888_bt709", "color_convert.cl"},
+ {"IYUV_to_RGBA8888_bt709", "color_convert.cl"},
+ {"IYUV_to_YUV444_bt709", "color_convert.cl"},
+ {"l2_normalize", "l2_normalize.cl"},
+ {"lktracker_stage0", "optical_flow_pyramid_lk.cl"},
+ {"lktracker_stage1", "optical_flow_pyramid_lk.cl"},
+ {"magnitude_phase", "magnitude_phase.cl"},
+ {"mean_stddev_accumulate", "mean_stddev.cl"},
+ {"minmax", "minmaxloc.cl"},
+ {"minmax_border", "minmaxloc.cl"},
+ {"minmax_layer", "minmax_layer.cl"},
+ {"minmaxloc", "minmaxloc.cl"},
+ {"non_linear_filter_box3x3", "non_linear_filter3x3.cl"},
+ {"non_linear_filter_cross3x3", "non_linear_filter3x3.cl"},
+ {"non_linear_filter_disk3x3", "non_linear_filter3x3.cl"},
+ {"non_linear_filter_box5x5", "non_linear_filter5x5.cl"},
+ {"non_linear_filter_cross5x5", "non_linear_filter5x5.cl"},
+ {"non_linear_filter_disk5x5", "non_linear_filter5x5.cl"},
+ {"non_max_suppression", "nonmax.cl"},
+ {"normalization_layer_cross_map", "normalization_layer.cl"},
+ {"normalization_layer_in_map", "normalization_layer.cl"},
+ {"NV12_to_IYUV_bt709", "color_convert.cl"},
+ {"NV12_to_RGB888_bt709", "color_convert.cl"},
+ {"NV12_to_RGBA8888_bt709", "color_convert.cl"},
+ {"NV12_to_YUV444_bt709", "color_convert.cl"},
+ {"NV21_to_IYUV_bt709", "color_convert.cl"},
+ {"NV21_to_RGB888_bt709", "color_convert.cl"},
+ {"NV21_to_RGBA8888_bt709", "color_convert.cl"},
+ {"NV21_to_YUV444_bt709", "color_convert.cl"},
+ {"output_stage_quantized", "direct_convolution_1x1_3x3_5x5_quantized.cl"},
+ {"permute_201", "permute.cl"},
+ {"permute_120", "permute.cl"},
+ {"permute_3201", "permute.cl"},
+ {"pixelwise_mul_float", "pixelwise_mul_float.cl"},
+ {"pixelwise_mul_int", "pixelwise_mul_int.cl"},
+ {"pixelwise_mul_qasymm8", "pixelwise_mul_quantized.cl"},
+ {"pixelwise_div_float", "pixelwise_div_float.cl"},
+ {"pixelwise_div_int", "pixelwise_div_int.cl"},
+ {"pooling_layer_2", "pooling_layer.cl"},
+ {"pooling_layer_3", "pooling_layer.cl"},
+ {"pooling_layer_optimized_3", "pooling_layer.cl"},
+ {"pooling_layer_7", "pooling_layer.cl"},
+ {"pooling_layer_MxN_nchw", "pooling_layer.cl"},
+ {"pooling_layer_MxN_nhwc", "pooling_layer.cl"},
+ {"pooling_layer_MxN_quantized_nhwc", "pooling_layer_quantized.cl"},
+ {"pooling_layer_MxN_quantized_nchw", "pooling_layer_quantized.cl"},
+ {"quantization_layer", "quantization_layer.cl"},
+ {"reduce_max", "reduce_max.cl"},
+ {"reduction_operation", "reduction_operation.cl"},
+ {"reduction_mean", "reduction_mean.cl"},
+ {"remap_nearest_neighbour", "remap.cl"},
+ {"remap_bilinear", "remap.cl"},
+ {"reshape_layer", "reshape_layer.cl"},
+ {"reshape_to_columns", "convolution_layer.cl"},
+ {"RGB888_to_IYUV_bt709", "color_convert.cl"},
+ {"RGB888_to_NV12_bt709", "color_convert.cl"},
+ {"RGB888_to_RGBA8888_bt709", "color_convert.cl"},
+ {"RGB888_to_YUV444_bt709", "color_convert.cl"},
+ {"RGBA8888_to_IYUV_bt709", "color_convert.cl"},
+ {"RGBA8888_to_NV12_bt709", "color_convert.cl"},
+ {"RGBA8888_to_RGB888_bt709", "color_convert.cl"},
+ {"RGBA8888_to_YUV444_bt709", "color_convert.cl"},
+ {"roi_pooling_layer", "roi_pooling_layer.cl"},
+ {"scale_nearest_neighbour", "scale.cl"},
+ {"scale_bilinear", "scale.cl"},
+ {"scharr3x3", "scharr_filter.cl"},
+ {"sobel3x3", "sobel_filter.cl"},
+ {"sobel_separable5x1", "sobel_filter.cl"},
+ {"sobel_separable1x5", "sobel_filter.cl"},
+ {"sobel_separable7x1", "sobel_filter.cl"},
+ {"sobel_separable1x7", "sobel_filter.cl"},
+ {"softmax_layer_norm", "softmax_layer.cl"},
+ {"softmax_layer_norm_quantized", "softmax_layer_quantized.cl"},
+ {"softmax_layer_max_shift_exp_sum_quantized_serial", "softmax_layer_quantized.cl"},
+ {"softmax_layer_max_shift_exp_sum_quantized_parallel", "softmax_layer_quantized.cl"},
+ {"softmax_layer_max_shift_exp_sum_serial", "softmax_layer.cl"},
+ {"softmax_layer_max_shift_exp_sum_parallel", "softmax_layer.cl"},
+ {"strided_slice", "strided_slice.cl"},
+ {"suppress_non_maximum", "canny.cl"},
+ {"tablelookup_U8", "tablelookup.cl"},
+ {"tablelookup_S16", "tablelookup.cl"},
+ {"threshold_binary", "threshold.cl"},
+ {"threshold_range", "threshold.cl"},
+ {"transpose", "transpose.cl"},
+ {"UYVY422_to_IYUV_bt709", "color_convert.cl"},
+ {"UYVY422_to_NV12_bt709", "color_convert.cl"},
+ {"UYVY422_to_RGB888_bt709", "color_convert.cl"},
+ {"UYVY422_to_RGBA8888_bt709", "color_convert.cl"},
+ {"warp_affine_nearest_neighbour", "warp_affine.cl"},
+ {"warp_affine_bilinear", "warp_affine.cl"},
+ {"warp_perspective_nearest_neighbour", "warp_perspective.cl"},
+ {"warp_perspective_bilinear", "warp_perspective.cl"},
+ {"winograd_filter_transform_2x2_3x3_nchw", "winograd.cl"},
+ {"winograd_filter_transform_4x4_3x3_nchw", "winograd.cl"},
+ {"winograd_filter_transform_4x4_5x5_nchw", "winograd.cl"},
+ {"winograd_input_transform_4x4_5x5_stepz1_nchw", "winograd.cl"},
+ {"winograd_input_transform_2x2_3x3_stepz1_nchw", "winograd.cl"},
+ {"winograd_input_transform_2x2_3x3_stepz2_nchw", "winograd.cl"},
+ {"winograd_input_transform_4x4_3x3_stepz1_nchw", "winograd.cl"},
+ {"winograd_output_transform_2x2_3x3_nchw", "winograd.cl"},
+ {"winograd_output_transform_4x4_3x3_nchw", "winograd.cl"},
+ {"winograd_output_transform_4x4_5x5_nchw", "winograd.cl"},
+ {"YUYV422_to_IYUV_bt709", "color_convert.cl"},
+ {"YUYV422_to_NV12_bt709", "color_convert.cl"},
+ {"YUYV422_to_RGB888_bt709", "color_convert.cl"},
+ {"YUYV422_to_RGBA8888_bt709", "color_convert.cl"},
+ {"topkv2_init", "topkv2.cl"},
+ {"topkv2_find_first_negative", "topkv2.cl"},
+ {"topkv2_reorder_negatives", "topkv2.cl"},
+ {"topkv2_store", "topkv2.cl"},
+ {"radixsort_histogram", "topkv2_radixsort.cl"},
+ {"radixsort_scanhistograms", "topkv2_radixsort.cl"},
+ {"radixsort_pastehistograms", "topkv2_radixsort.cl"},
+ {"radixsort_reorder", "topkv2_radixsort.cl"},
+ {"topkv2_quicksort", "topkv2_quicksort.cl"},
+};
+
+const std::map<std::string, std::string> CLKernelLibraryEx::_program_source_map = {
+#ifdef EMBEDDED_KERNELS
+ {
+ "cast.cl",
+#include "./cl_kernels/cast.clembed"
+ },
+ {
+ "fixed_point.h",
+#include "./cl_kernels/fixed_point.hembed"
+ },
+ {
+ "gather.cl",
+#include "./cl_kernels/gather.clembed"
+ },
+ {
+ "helpers.h",
+#include "./cl_kernels/helpers.hembed"
+ },
+ {
+ "helpers_asymm.h",
+#include "./cl_kernels/helpers_asymm.hembed"
+ },
+ {
+ "pixelwise_div_float.cl",
+#include "./cl_kernels/pixelwise_div_float.clembed"
+ },
+ {
+ "pixelwise_div_int.cl",
+#include "./cl_kernels/pixelwise_div_int.clembed"
+ },
+ {
+ "reduce_max.cl",
+#include "./cl_kernels/reduce_max.clembed"
+ },
+ {
+ "reduction_mean.cl",
+#include "./cl_kernels/reduction_mean.clembed"
+ },
+ {
+ "strided_slice.cl",
+#include "./cl_kernels/strided_slice.clembed"
+ },
+ {
+ "topkv2.cl",
+#include "./cl_kernels/topkv2.clembed"
+ },
+ {
+ "topkv2_radixsort.cl",
+#include "./cl_kernels/topkv2_radixsort.clembed"
+ },
+ {
+ "topkv2_quicksort.cl",
+#include "./cl_kernels/topkv2_quicksort.clembed"
+ },
+#endif /* EMBEDDED_KERNELS */
+};
+
+CLKernelLibraryEx::CLKernelLibraryEx()
+ : _context(), _device(), _kernel_path("."), _programs_map(), _built_programs_map()
+{
+ opencl_is_available(); // Make sure the OpenCL symbols are initialised *before* the
+ // CLKernelLibrary is built
+}
+
+CLKernelLibraryEx &CLKernelLibraryEx::get()
+{
+ static CLKernelLibraryEx _kernel_library;
+ return _kernel_library;
+}
+
+Kernel CLKernelLibraryEx::create_kernel(const std::string &kernel_name,
+ const StringSet &build_options_set) const
+{
+ // Find which program contains the kernel
+ auto kernel_program_it = _kernel_program_map.find(kernel_name);
+
+ if (_kernel_program_map.end() == kernel_program_it)
+ {
+ ARM_COMPUTE_ERROR("Kernel %s not found in the CLKernelLibrary", kernel_name.c_str());
+ }
+ std::string concat_str;
+
+ if (fp16_supported(_device))
+ {
+ concat_str += " -DARM_COMPUTE_OPENCL_FP16_ENABLED=1 ";
+ }
+
+ if (get_cl_version(_device) == CLVersion::CL20)
+ {
+ concat_str += " -cl-std=CL2.0 ";
+ }
+ else if (arm_non_uniform_workgroup_supported(_device))
+ {
+ concat_str += " -cl-arm-non-uniform-work-group-size ";
+ }
+ else
+ {
+ ARM_COMPUTE_ERROR("Non uniform workgroup size is not supported!!");
+ }
+
+ // Check if the program has been built before with same build options.
+ const std::string program_name = kernel_program_it->second;
+ const std::string build_options = stringify_set(build_options_set) + concat_str;
+
+ const std::string built_program_name = program_name + "_" + build_options;
+ auto built_program_it = _built_programs_map.find(built_program_name);
+
+ cl::Program cl_program;
+
+ if (_built_programs_map.end() != built_program_it)
+ {
+ // If program has been built, retrieve to create kernel from it
+ cl_program = built_program_it->second;
+ }
+ else
+ {
+ // Get program
+ Program program = load_program(program_name);
+
+ // Build program
+ cl_program = program.build(build_options);
+
+ // Add built program to internal map
+ _built_programs_map.emplace(built_program_name, cl_program);
+ }
+
+ // Create and return kernel
+ return Kernel(kernel_name, cl_program);
+}
+
+void CLKernelLibraryEx::add_built_program(const std::string &built_program_name,
+ cl::Program program)
+{
+ _built_programs_map.emplace(built_program_name, program);
+}
+
+const Program &CLKernelLibraryEx::load_program(const std::string &program_name) const
+{
+ const auto program_it = _programs_map.find(program_name);
+
+ if (program_it != _programs_map.end())
+ {
+ return program_it->second;
+ }
+
+ Program program;
+
+#ifdef EMBEDDED_KERNELS
+ const auto program_source_it = _program_source_map.find(program_name);
+
+ if (_program_source_map.end() == program_source_it)
+ {
+ ARM_COMPUTE_ERROR("Embedded program for %s does not exist.", program_name.c_str());
+ }
+
+ program = Program(_context, program_name, program_source_it->second);
+#else /* EMBEDDED_KERNELS */
+ // Check for binary
+ std::string source_name = _kernel_path + program_name;
+ std::string binary_name = source_name + "bin";
+
+ if (std::ifstream(binary_name).is_open())
+ {
+ const std::string program_binary = read_file(binary_name, true);
+ program = Program(_context, _device, program_name,
+ std::vector<unsigned char>(program_binary.begin(), program_binary.end()));
+ }
+ else if (std::ifstream(source_name).is_open())
+ {
+ program = Program(_context, program_name, read_file(source_name, false));
+ }
+ else
+ {
+ ARM_COMPUTE_ERROR("Kernel file %s does not exist.", source_name.c_str());
+ }
+#endif /* EMBEDDED_KERNELS */
+
+ // Insert program to program map
+ const auto new_program = _programs_map.emplace(program_name, std::move(program));
+
+ return new_program.first->second;
+}
+
+std::string CLKernelLibraryEx::stringify_set(const StringSet &s) const
+{
+ std::string concat_set;
+
+#ifndef EMBEDDED_KERNELS
+ concat_set += "-I" + _kernel_path + " ";
+#endif /* EMBEDDED_KERNELS */
+
+ // Concatenate set
+ for (const auto &el : s)
+ {
+ concat_set += " " + el;
+ }
+
+ return concat_set;
+}
+
+std::string CLKernelLibraryEx::get_program_source(const std::string &program_name)
+{
+ const auto program_source_it = _program_source_map.find(program_name);
+
+ if (program_source_it == _program_source_map.end())
+ {
+ ARM_COMPUTE_ERROR("Embedded program for %s does not exist.", program_name.c_str());
+ }
+
+ return program_source_it->second;
+}
+
+size_t CLKernelLibraryEx::max_local_workgroup_size(const cl::Kernel &kernel) const
+{
+ size_t result;
+
+ size_t err = kernel.getWorkGroupInfo(_device, CL_KERNEL_WORK_GROUP_SIZE, &result);
+ ARM_COMPUTE_ERROR_ON_MSG(
+ err != 0,
+ "clGetKernelWorkGroupInfo failed to return the maximum workgroup size for the kernel");
+ ARM_COMPUTE_UNUSED(err);
+
+ return result;
+}
+
+cl::NDRange CLKernelLibraryEx::default_ndrange() const
+{
+ cl::Device device = cl::Device::getDefault();
+ GPUTarget _target = get_target_from_device(device);
+ cl::NDRange default_range;
+
+ switch (_target)
+ {
+ case GPUTarget::MIDGARD:
+ case GPUTarget::T600:
+ case GPUTarget::T700:
+ case GPUTarget::T800:
+ default_range = cl::NDRange(128u, 1);
+ break;
+ default:
+ default_range = cl::NullRange;
+ }
+
+ return default_range;
+}
+
+std::string CLKernelLibraryEx::get_device_version() { return _device.getInfo<CL_DEVICE_VERSION>(); }