diff options
Diffstat (limited to 'inference-engine/thirdparty/clDNN/common/boost/1.64.0/include/boost-1_64/boost/compute/algorithm/detail/reduce_on_gpu.hpp')
-rw-r--r-- | inference-engine/thirdparty/clDNN/common/boost/1.64.0/include/boost-1_64/boost/compute/algorithm/detail/reduce_on_gpu.hpp | 286 |
1 files changed, 0 insertions, 286 deletions
diff --git a/inference-engine/thirdparty/clDNN/common/boost/1.64.0/include/boost-1_64/boost/compute/algorithm/detail/reduce_on_gpu.hpp b/inference-engine/thirdparty/clDNN/common/boost/1.64.0/include/boost-1_64/boost/compute/algorithm/detail/reduce_on_gpu.hpp deleted file mode 100644 index 335fba872..000000000 --- a/inference-engine/thirdparty/clDNN/common/boost/1.64.0/include/boost-1_64/boost/compute/algorithm/detail/reduce_on_gpu.hpp +++ /dev/null @@ -1,286 +0,0 @@ -//---------------------------------------------------------------------------// -// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com> -// -// Distributed under the Boost Software License, Version 1.0 -// See accompanying file LICENSE_1_0.txt or copy at -// http://www.boost.org/LICENSE_1_0.txt -// -// See http://boostorg.github.com/compute for more information. -//---------------------------------------------------------------------------// - -#ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_REDUCE_ON_GPU_HPP -#define BOOST_COMPUTE_ALGORITHM_DETAIL_REDUCE_ON_GPU_HPP - -#include <iterator> - -#include <boost/compute/utility/source.hpp> -#include <boost/compute/program.hpp> -#include <boost/compute/command_queue.hpp> -#include <boost/compute/detail/vendor.hpp> -#include <boost/compute/detail/parameter_cache.hpp> -#include <boost/compute/detail/work_size.hpp> -#include <boost/compute/detail/meta_kernel.hpp> -#include <boost/compute/type_traits/type_name.hpp> -#include <boost/compute/utility/program_cache.hpp> - -namespace boost { -namespace compute { -namespace detail { - -/// \internal -/// body reduction inside a warp -template<typename T,bool isNvidiaDevice> -struct ReduceBody -{ - static std::string body() - { - std::stringstream k; - // local reduction - k << "for(int i = 1; i < TPB; i <<= 1){\n" << - " barrier(CLK_LOCAL_MEM_FENCE);\n" << - " uint mask = (i << 1) - 1;\n" << - " if((lid & mask) == 0){\n" << - " scratch[lid] += scratch[lid+i];\n" << - " }\n" << - "}\n"; - return k.str(); - } -}; - -/// \internal -/// body reduction inside a warp -/// for nvidia device we can use the "unsafe" -/// memory optimisation -template<typename T> -struct ReduceBody<T,true> -{ - static std::string body() - { - std::stringstream k; - // local reduction - // we use TPB to compile only useful instruction - // local reduction when size is greater than warp size - k << "barrier(CLK_LOCAL_MEM_FENCE);\n" << - "if(TPB >= 1024){\n" << - "if(lid < 512) { sum += scratch[lid + 512]; scratch[lid] = sum;} barrier(CLK_LOCAL_MEM_FENCE);}\n" << - "if(TPB >= 512){\n" << - "if(lid < 256) { sum += scratch[lid + 256]; scratch[lid] = sum;} barrier(CLK_LOCAL_MEM_FENCE);}\n" << - "if(TPB >= 256){\n" << - "if(lid < 128) { sum += scratch[lid + 128]; scratch[lid] = sum;} barrier(CLK_LOCAL_MEM_FENCE);}\n" << - "if(TPB >= 128){\n" << - "if(lid < 64) { sum += scratch[lid + 64]; scratch[lid] = sum;} barrier(CLK_LOCAL_MEM_FENCE);} \n" << - - // warp reduction - "if(lid < 32){\n" << - // volatile this way we don't need any barrier - "volatile __local " << type_name<T>() << " *lmem = scratch;\n" << - "if(TPB >= 64) { lmem[lid] = sum = sum + lmem[lid+32];} \n" << - "if(TPB >= 32) { lmem[lid] = sum = sum + lmem[lid+16];} \n" << - "if(TPB >= 16) { lmem[lid] = sum = sum + lmem[lid+ 8];} \n" << - "if(TPB >= 8) { lmem[lid] = sum = sum + lmem[lid+ 4];} \n" << - "if(TPB >= 4) { lmem[lid] = sum = sum + lmem[lid+ 2];} \n" << - "if(TPB >= 2) { lmem[lid] = sum = sum + lmem[lid+ 1];} \n" << - "}\n"; - return k.str(); - } -}; - -template<class InputIterator, class Function> -inline void initial_reduce(InputIterator first, - InputIterator last, - buffer result, - const Function &function, - kernel &reduce_kernel, - const uint_ vpt, - const uint_ tpb, - command_queue &queue) -{ - (void) function; - (void) reduce_kernel; - - typedef typename std::iterator_traits<InputIterator>::value_type Arg; - typedef typename boost::tr1_result_of<Function(Arg, Arg)>::type T; - - size_t count = std::distance(first, last); - detail::meta_kernel k("initial_reduce"); - k.add_set_arg<const uint_>("count", uint_(count)); - size_t output_arg = k.add_arg<T *>(memory_object::global_memory, "output"); - - k << - k.decl<const uint_>("offset") << " = get_group_id(0) * VPT * TPB;\n" << - k.decl<const uint_>("lid") << " = get_local_id(0);\n" << - - "__local " << type_name<T>() << " scratch[TPB];\n" << - - // private reduction - k.decl<T>("sum") << " = 0;\n" << - "for(uint i = 0; i < VPT; i++){\n" << - " if(offset + lid + i*TPB < count){\n" << - " sum = sum + " << first[k.var<uint_>("offset+lid+i*TPB")] << ";\n" << - " }\n" << - "}\n" << - - "scratch[lid] = sum;\n" << - - // local reduction - ReduceBody<T,false>::body() << - - // write sum to output - "if(lid == 0){\n" << - " output[get_group_id(0)] = scratch[0];\n" << - "}\n"; - - const context &context = queue.get_context(); - std::stringstream options; - options << "-DVPT=" << vpt << " -DTPB=" << tpb; - kernel generic_reduce_kernel = k.compile(context, options.str()); - generic_reduce_kernel.set_arg(output_arg, result); - - size_t work_size = calculate_work_size(count, vpt, tpb); - - queue.enqueue_1d_range_kernel(generic_reduce_kernel, 0, work_size, tpb); -} - -template<class T> -inline void initial_reduce(const buffer_iterator<T> &first, - const buffer_iterator<T> &last, - const buffer &result, - const plus<T> &function, - kernel &reduce_kernel, - const uint_ vpt, - const uint_ tpb, - command_queue &queue) -{ - (void) function; - - size_t count = std::distance(first, last); - - reduce_kernel.set_arg(0, first.get_buffer()); - reduce_kernel.set_arg(1, uint_(first.get_index())); - reduce_kernel.set_arg(2, uint_(count)); - reduce_kernel.set_arg(3, result); - reduce_kernel.set_arg(4, uint_(0)); - - size_t work_size = calculate_work_size(count, vpt, tpb); - - queue.enqueue_1d_range_kernel(reduce_kernel, 0, work_size, tpb); -} - -template<class InputIterator, class T, class Function> -inline void reduce_on_gpu(InputIterator first, - InputIterator last, - buffer_iterator<T> result, - Function function, - command_queue &queue) -{ - const device &device = queue.get_device(); - const context &context = queue.get_context(); - - detail::meta_kernel k("reduce"); - k.add_arg<const T*>(memory_object::global_memory, "input"); - k.add_arg<const uint_>("offset"); - k.add_arg<const uint_>("count"); - k.add_arg<T*>(memory_object::global_memory, "output"); - k.add_arg<const uint_>("output_offset"); - - k << - k.decl<const uint_>("block_offset") << " = get_group_id(0) * VPT * TPB;\n" << - "__global const " << type_name<T>() << " *block = input + offset + block_offset;\n" << - k.decl<const uint_>("lid") << " = get_local_id(0);\n" << - - "__local " << type_name<T>() << " scratch[TPB];\n" << - // private reduction - k.decl<T>("sum") << " = 0;\n" << - "for(uint i = 0; i < VPT; i++){\n" << - " if(block_offset + lid + i*TPB < count){\n" << - " sum = sum + block[lid+i*TPB]; \n" << - " }\n" << - "}\n" << - - "scratch[lid] = sum;\n"; - - // discrimination on vendor name - if(is_nvidia_device(device)) - k << ReduceBody<T,true>::body(); - else - k << ReduceBody<T,false>::body(); - - k << - // write sum to output - "if(lid == 0){\n" << - " output[output_offset + get_group_id(0)] = scratch[0];\n" << - "}\n"; - - std::string cache_key = std::string("__boost_reduce_on_gpu_") + type_name<T>(); - - // load parameters - boost::shared_ptr<parameter_cache> parameters = - detail::parameter_cache::get_global_cache(device); - - uint_ vpt = parameters->get(cache_key, "vpt", 8); - uint_ tpb = parameters->get(cache_key, "tpb", 128); - - // reduce program compiler flags - std::stringstream options; - options << "-DT=" << type_name<T>() - << " -DVPT=" << vpt - << " -DTPB=" << tpb; - - // load program - boost::shared_ptr<program_cache> cache = - program_cache::get_global_cache(context); - - program reduce_program = cache->get_or_build( - cache_key, options.str(), k.source(), context - ); - - // create reduce kernel - kernel reduce_kernel(reduce_program, "reduce"); - - size_t count = std::distance(first, last); - - // first pass, reduce from input to ping - buffer ping(context, std::ceil(float(count) / vpt / tpb) * sizeof(T)); - initial_reduce(first, last, ping, function, reduce_kernel, vpt, tpb, queue); - - // update count after initial reduce - count = static_cast<size_t>(std::ceil(float(count) / vpt / tpb)); - - // middle pass(es), reduce between ping and pong - const buffer *input_buffer = &ping; - buffer pong(context, static_cast<size_t>(count / vpt / tpb * sizeof(T))); - const buffer *output_buffer = &pong; - if(count > vpt * tpb){ - while(count > vpt * tpb){ - reduce_kernel.set_arg(0, *input_buffer); - reduce_kernel.set_arg(1, uint_(0)); - reduce_kernel.set_arg(2, uint_(count)); - reduce_kernel.set_arg(3, *output_buffer); - reduce_kernel.set_arg(4, uint_(0)); - - size_t work_size = static_cast<size_t>(std::ceil(float(count) / vpt)); - if(work_size % tpb != 0){ - work_size += tpb - work_size % tpb; - } - queue.enqueue_1d_range_kernel(reduce_kernel, 0, work_size, tpb); - - std::swap(input_buffer, output_buffer); - count = static_cast<size_t>(std::ceil(float(count) / vpt / tpb)); - } - } - - // final pass, reduce from ping/pong to result - reduce_kernel.set_arg(0, *input_buffer); - reduce_kernel.set_arg(1, uint_(0)); - reduce_kernel.set_arg(2, uint_(count)); - reduce_kernel.set_arg(3, result.get_buffer()); - reduce_kernel.set_arg(4, uint_(result.get_index())); - - queue.enqueue_1d_range_kernel(reduce_kernel, 0, tpb, tpb); -} - -} // end detail namespace -} // end compute namespace -} // end boost namespace - -#endif // BOOST_COMPUTE_ALGORITHM_DETAIL_REDUCE_ON_GPU_HPP |