diff options
Diffstat (limited to 'inference-engine/thirdparty/clDNN/common/boost/1.64.0/include/boost-1_64/boost/compute/algorithm/detail/find_extrema_with_reduce.hpp')
-rw-r--r-- | inference-engine/thirdparty/clDNN/common/boost/1.64.0/include/boost-1_64/boost/compute/algorithm/detail/find_extrema_with_reduce.hpp | 443 |
1 files changed, 0 insertions, 443 deletions
diff --git a/inference-engine/thirdparty/clDNN/common/boost/1.64.0/include/boost-1_64/boost/compute/algorithm/detail/find_extrema_with_reduce.hpp b/inference-engine/thirdparty/clDNN/common/boost/1.64.0/include/boost-1_64/boost/compute/algorithm/detail/find_extrema_with_reduce.hpp deleted file mode 100644 index 8f2a83c38..000000000 --- a/inference-engine/thirdparty/clDNN/common/boost/1.64.0/include/boost-1_64/boost/compute/algorithm/detail/find_extrema_with_reduce.hpp +++ /dev/null @@ -1,443 +0,0 @@ -//---------------------------------------------------------------------------// -// Copyright (c) 2015 Jakub Szuppe <j.szuppe@gmail.com> -// -// Distributed under the Boost Software License, Version 1.0 -// See accompanying file LICENSE_1_0.txt or copy at -// http://www.boost.org/LICENSE_1_0.txt -// -// See http://boostorg.github.com/compute for more information. -//---------------------------------------------------------------------------// - -#ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_FIND_EXTREMA_WITH_REDUCE_HPP -#define BOOST_COMPUTE_ALGORITHM_DETAIL_FIND_EXTREMA_WITH_REDUCE_HPP - -#include <algorithm> - -#include <boost/compute/types.hpp> -#include <boost/compute/command_queue.hpp> -#include <boost/compute/algorithm/copy.hpp> -#include <boost/compute/allocator/pinned_allocator.hpp> -#include <boost/compute/container/vector.hpp> -#include <boost/compute/detail/meta_kernel.hpp> -#include <boost/compute/detail/iterator_range_size.hpp> -#include <boost/compute/detail/parameter_cache.hpp> -#include <boost/compute/memory/local_buffer.hpp> -#include <boost/compute/type_traits/type_name.hpp> -#include <boost/compute/utility/program_cache.hpp> - -namespace boost { -namespace compute { -namespace detail { - -template<class InputIterator> -bool find_extrema_with_reduce_requirements_met(InputIterator first, - InputIterator last, - command_queue &queue) -{ - typedef typename std::iterator_traits<InputIterator>::value_type input_type; - - const device &device = queue.get_device(); - - // device must have dedicated local memory storage - // otherwise reduction would be highly inefficient - if(device.get_info<CL_DEVICE_LOCAL_MEM_TYPE>() != CL_LOCAL) - { - return false; - } - - const size_t max_work_group_size = device.get_info<CL_DEVICE_MAX_WORK_GROUP_SIZE>(); - // local memory size in bytes (per compute unit) - const size_t local_mem_size = device.get_info<CL_DEVICE_LOCAL_MEM_SIZE>(); - - std::string cache_key = std::string("__boost_find_extrema_reduce_") - + type_name<input_type>(); - // load parameters - boost::shared_ptr<parameter_cache> parameters = - detail::parameter_cache::get_global_cache(device); - - // Get preferred work group size - size_t work_group_size = parameters->get(cache_key, "wgsize", 256); - - work_group_size = (std::min)(max_work_group_size, work_group_size); - - // local memory size needed to perform parallel reduction - size_t required_local_mem_size = 0; - // indices size - required_local_mem_size += sizeof(uint_) * work_group_size; - // values size - required_local_mem_size += sizeof(input_type) * work_group_size; - - // at least 4 work groups per compute unit otherwise reduction - // would be highly inefficient - return ((required_local_mem_size * 4) <= local_mem_size); -} - -/// \internal_ -/// Algorithm finds the first extremum in given range, i.e., with the lowest -/// index. -/// -/// If \p use_input_idx is false, it's assumed that input data is ordered by -/// increasing index and \p input_idx is not used in the algorithm. -template<class InputIterator, class ResultIterator, class Compare> -inline void find_extrema_with_reduce(InputIterator input, - vector<uint_>::iterator input_idx, - size_t count, - ResultIterator result, - vector<uint_>::iterator result_idx, - size_t work_groups_no, - size_t work_group_size, - Compare compare, - const bool find_minimum, - const bool use_input_idx, - command_queue &queue) -{ - typedef typename std::iterator_traits<InputIterator>::value_type input_type; - - const context &context = queue.get_context(); - - meta_kernel k("find_extrema_reduce"); - size_t count_arg = k.add_arg<uint_>("count"); - size_t block_arg = k.add_arg<input_type *>(memory_object::local_memory, "block"); - size_t block_idx_arg = k.add_arg<uint_ *>(memory_object::local_memory, "block_idx"); - - k << - // Work item global id - k.decl<const uint_>("gid") << " = get_global_id(0);\n" << - - // Index of element that will be read from input buffer - k.decl<uint_>("idx") << " = gid;\n" << - - k.decl<input_type>("acc") << ";\n" << - k.decl<uint_>("acc_idx") << ";\n" << - "if(gid < count) {\n" << - // Real index of currently best element - "#ifdef BOOST_COMPUTE_USE_INPUT_IDX\n" << - k.var<uint_>("acc_idx") << " = " << input_idx[k.var<uint_>("idx")] << ";\n" << - "#else\n" << - k.var<uint_>("acc_idx") << " = idx;\n" << - "#endif\n" << - - // Init accumulator with first[get_global_id(0)] - "acc = " << input[k.var<uint_>("idx")] << ";\n" << - "idx += get_global_size(0);\n" << - "}\n" << - - k.decl<bool>("compare_result") << ";\n" << - k.decl<bool>("equal") << ";\n\n" << - "while( idx < count ){\n" << - // Next element - k.decl<input_type>("next") << " = " << input[k.var<uint_>("idx")] << ";\n" << - "#ifdef BOOST_COMPUTE_USE_INPUT_IDX\n" << - k.decl<uint_>("next_idx") << " = " << input_idx[k.var<uint_>("idx")] << ";\n" << - "#endif\n" << - - // Comparison between currently best element (acc) and next element - "#ifdef BOOST_COMPUTE_FIND_MAXIMUM\n" << - "compare_result = " << compare(k.var<input_type>("next"), - k.var<input_type>("acc")) << ";\n" << - "# ifdef BOOST_COMPUTE_USE_INPUT_IDX\n" << - "equal = !compare_result && !" << - compare(k.var<input_type>("acc"), - k.var<input_type>("next")) << ";\n" << - "# endif\n" << - "#else\n" << - "compare_result = " << compare(k.var<input_type>("acc"), - k.var<input_type>("next")) << ";\n" << - "# ifdef BOOST_COMPUTE_USE_INPUT_IDX\n" << - "equal = !compare_result && !" << - compare(k.var<input_type>("next"), - k.var<input_type>("acc")) << ";\n" << - "# endif\n" << - "#endif\n" << - - // save the winner - "acc = compare_result ? acc : next;\n" << - "#ifdef BOOST_COMPUTE_USE_INPUT_IDX\n" << - "acc_idx = compare_result ? " << - "acc_idx : " << - "(equal ? min(acc_idx, next_idx) : next_idx);\n" << - "#else\n" << - "acc_idx = compare_result ? acc_idx : idx;\n" << - "#endif\n" << - "idx += get_global_size(0);\n" << - "}\n\n" << - - // Work item local id - k.decl<const uint_>("lid") << " = get_local_id(0);\n" << - "block[lid] = acc;\n" << - "block_idx[lid] = acc_idx;\n" << - "barrier(CLK_LOCAL_MEM_FENCE);\n" << - - k.decl<uint_>("group_offset") << - " = count - (get_local_size(0) * get_group_id(0));\n\n"; - - k << - "#pragma unroll\n" - "for(" << k.decl<uint_>("offset") << " = " << uint_(work_group_size) << " / 2; offset > 0; " << - "offset = offset / 2) {\n" << - "if((lid < offset) && ((lid + offset) < group_offset)) { \n" << - k.decl<input_type>("mine") << " = block[lid];\n" << - k.decl<input_type>("other") << " = block[lid+offset];\n" << - "#ifdef BOOST_COMPUTE_FIND_MAXIMUM\n" << - "compare_result = " << compare(k.var<input_type>("other"), - k.var<input_type>("mine")) << ";\n" << - "equal = !compare_result && !" << - compare(k.var<input_type>("mine"), - k.var<input_type>("other")) << ";\n" << - "#else\n" << - "compare_result = " << compare(k.var<input_type>("mine"), - k.var<input_type>("other")) << ";\n" << - "equal = !compare_result && !" << - compare(k.var<input_type>("other"), - k.var<input_type>("mine")) << ";\n" << - "#endif\n" << - "block[lid] = compare_result ? mine : other;\n" << - k.decl<uint_>("mine_idx") << " = block_idx[lid];\n" << - k.decl<uint_>("other_idx") << " = block_idx[lid+offset];\n" << - "block_idx[lid] = compare_result ? " << - "mine_idx : " << - "(equal ? min(mine_idx, other_idx) : other_idx);\n" << - "}\n" - "barrier(CLK_LOCAL_MEM_FENCE);\n" << - "}\n\n" << - - // write block result to global output - "if(lid == 0){\n" << - result[k.var<uint_>("get_group_id(0)")] << " = block[0];\n" << - result_idx[k.var<uint_>("get_group_id(0)")] << " = block_idx[0];\n" << - "}"; - - std::string options; - if(!find_minimum){ - options = "-DBOOST_COMPUTE_FIND_MAXIMUM"; - } - if(use_input_idx){ - options += " -DBOOST_COMPUTE_USE_INPUT_IDX"; - } - - kernel kernel = k.compile(context, options); - - kernel.set_arg(count_arg, static_cast<uint_>(count)); - kernel.set_arg(block_arg, local_buffer<input_type>(work_group_size)); - kernel.set_arg(block_idx_arg, local_buffer<uint_>(work_group_size)); - - queue.enqueue_1d_range_kernel(kernel, - 0, - work_groups_no * work_group_size, - work_group_size); -} - -template<class InputIterator, class ResultIterator, class Compare> -inline void find_extrema_with_reduce(InputIterator input, - size_t count, - ResultIterator result, - vector<uint_>::iterator result_idx, - size_t work_groups_no, - size_t work_group_size, - Compare compare, - const bool find_minimum, - command_queue &queue) -{ - // dummy will not be used - buffer_iterator<uint_> dummy = result_idx; - return find_extrema_with_reduce( - input, dummy, count, result, result_idx, work_groups_no, - work_group_size, compare, find_minimum, false, queue - ); -} - -template<class InputIterator, class Compare> -InputIterator find_extrema_with_reduce(InputIterator first, - InputIterator last, - Compare compare, - const bool find_minimum, - command_queue &queue) -{ - typedef typename std::iterator_traits<InputIterator>::difference_type difference_type; - typedef typename std::iterator_traits<InputIterator>::value_type input_type; - - const context &context = queue.get_context(); - const device &device = queue.get_device(); - - // Getting information about used queue and device - const size_t compute_units_no = device.get_info<CL_DEVICE_MAX_COMPUTE_UNITS>(); - const size_t max_work_group_size = device.get_info<CL_DEVICE_MAX_WORK_GROUP_SIZE>(); - - const size_t count = detail::iterator_range_size(first, last); - - std::string cache_key = std::string("__boost_find_extrema_with_reduce_") - + type_name<input_type>(); - - // load parameters - boost::shared_ptr<parameter_cache> parameters = - detail::parameter_cache::get_global_cache(device); - - // get preferred work group size and preferred number - // of work groups per compute unit - size_t work_group_size = parameters->get(cache_key, "wgsize", 256); - size_t work_groups_per_cu = parameters->get(cache_key, "wgpcu", 100); - - // calculate work group size and number of work groups - work_group_size = (std::min)(max_work_group_size, work_group_size); - size_t work_groups_no = compute_units_no * work_groups_per_cu; - work_groups_no = (std::min)( - work_groups_no, - static_cast<size_t>(std::ceil(float(count) / work_group_size)) - ); - - // phase I: finding candidates for extremum - - // device buffors for extremum candidates and their indices - // each work-group computes its candidate - vector<input_type> candidates(work_groups_no, context); - vector<uint_> candidates_idx(work_groups_no, context); - - // finding candidates for first extremum and their indices - find_extrema_with_reduce( - first, count, candidates.begin(), candidates_idx.begin(), - work_groups_no, work_group_size, compare, find_minimum, queue - ); - - // phase II: finding extremum from among the candidates - - // zero-copy buffers for final result (value and index) - vector<input_type, ::boost::compute::pinned_allocator<input_type> > - result(1, context); - vector<uint_, ::boost::compute::pinned_allocator<uint_> > - result_idx(1, context); - - // get extremum from among the candidates - find_extrema_with_reduce( - candidates.begin(), candidates_idx.begin(), work_groups_no, result.begin(), - result_idx.begin(), 1, work_group_size, compare, find_minimum, true, queue - ); - - // mapping extremum index to host - uint_* result_idx_host_ptr = - static_cast<uint_*>( - queue.enqueue_map_buffer( - result_idx.get_buffer(), command_queue::map_read, - 0, sizeof(uint_) - ) - ); - - return first + static_cast<difference_type>(*result_idx_host_ptr); -} - -template<class InputIterator> -InputIterator find_extrema_with_reduce(InputIterator first, - InputIterator last, - ::boost::compute::less< - typename std::iterator_traits< - InputIterator - >::value_type - > - compare, - const bool find_minimum, - command_queue &queue) -{ - typedef typename std::iterator_traits<InputIterator>::difference_type difference_type; - typedef typename std::iterator_traits<InputIterator>::value_type input_type; - - const context &context = queue.get_context(); - const device &device = queue.get_device(); - - // Getting information about used queue and device - const size_t compute_units_no = device.get_info<CL_DEVICE_MAX_COMPUTE_UNITS>(); - const size_t max_work_group_size = device.get_info<CL_DEVICE_MAX_WORK_GROUP_SIZE>(); - - const size_t count = detail::iterator_range_size(first, last); - - std::string cache_key = std::string("__boost_find_extrema_with_reduce_") - + type_name<input_type>(); - - // load parameters - boost::shared_ptr<parameter_cache> parameters = - detail::parameter_cache::get_global_cache(device); - - // get preferred work group size and preferred number - // of work groups per compute unit - size_t work_group_size = parameters->get(cache_key, "wgsize", 256); - size_t work_groups_per_cu = parameters->get(cache_key, "wgpcu", 64); - - // calculate work group size and number of work groups - work_group_size = (std::min)(max_work_group_size, work_group_size); - size_t work_groups_no = compute_units_no * work_groups_per_cu; - work_groups_no = (std::min)( - work_groups_no, - static_cast<size_t>(std::ceil(float(count) / work_group_size)) - ); - - // phase I: finding candidates for extremum - - // device buffors for extremum candidates and their indices - // each work-group computes its candidate - // zero-copy buffers are used to eliminate copying data back to host - vector<input_type, ::boost::compute::pinned_allocator<input_type> > - candidates(work_groups_no, context); - vector<uint_, ::boost::compute::pinned_allocator <uint_> > - candidates_idx(work_groups_no, context); - - // finding candidates for first extremum and their indices - find_extrema_with_reduce( - first, count, candidates.begin(), candidates_idx.begin(), - work_groups_no, work_group_size, compare, find_minimum, queue - ); - - // phase II: finding extremum from among the candidates - - // mapping candidates and their indices to host - input_type* candidates_host_ptr = - static_cast<input_type*>( - queue.enqueue_map_buffer( - candidates.get_buffer(), command_queue::map_read, - 0, work_groups_no * sizeof(input_type) - ) - ); - - uint_* candidates_idx_host_ptr = - static_cast<uint_*>( - queue.enqueue_map_buffer( - candidates_idx.get_buffer(), command_queue::map_read, - 0, work_groups_no * sizeof(uint_) - ) - ); - - input_type* i = candidates_host_ptr; - uint_* idx = candidates_idx_host_ptr; - uint_* extremum_idx = idx; - input_type extremum = *candidates_host_ptr; - i++; idx++; - - // find extremum (serial) from among the candidates on host - if(!find_minimum) { - while(idx != (candidates_idx_host_ptr + work_groups_no)) { - input_type next = *i; - bool compare_result = next > extremum; - bool equal = next == extremum; - extremum = compare_result ? next : extremum; - extremum_idx = compare_result ? idx : extremum_idx; - extremum_idx = equal ? ((*extremum_idx < *idx) ? extremum_idx : idx) : extremum_idx; - idx++, i++; - } - } - else { - while(idx != (candidates_idx_host_ptr + work_groups_no)) { - input_type next = *i; - bool compare_result = next < extremum; - bool equal = next == extremum; - extremum = compare_result ? next : extremum; - extremum_idx = compare_result ? idx : extremum_idx; - extremum_idx = equal ? ((*extremum_idx < *idx) ? extremum_idx : idx) : extremum_idx; - idx++, i++; - } - } - - return first + static_cast<difference_type>(*extremum_idx); -} - -} // end detail namespace -} // end compute namespace -} // end boost namespace - -#endif // BOOST_COMPUTE_ALGORITHM_DETAIL_FIND_EXTREMA_WITH_REDUCE_HPP |