diff options
Diffstat (limited to 'boost/compute/algorithm/detail/scan_on_gpu.hpp')
-rw-r--r-- | boost/compute/algorithm/detail/scan_on_gpu.hpp | 331 |
1 files changed, 331 insertions, 0 deletions
diff --git a/boost/compute/algorithm/detail/scan_on_gpu.hpp b/boost/compute/algorithm/detail/scan_on_gpu.hpp new file mode 100644 index 0000000000..07c6d6d3c0 --- /dev/null +++ b/boost/compute/algorithm/detail/scan_on_gpu.hpp @@ -0,0 +1,331 @@ +//---------------------------------------------------------------------------// +// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com> +// +// Distributed under the Boost Software License, Version 1.0 +// See accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt +// +// See http://boostorg.github.com/compute for more information. +//---------------------------------------------------------------------------// + +#ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_SCAN_ON_GPU_HPP +#define BOOST_COMPUTE_ALGORITHM_DETAIL_SCAN_ON_GPU_HPP + +#include <boost/compute/kernel.hpp> +#include <boost/compute/detail/meta_kernel.hpp> +#include <boost/compute/command_queue.hpp> +#include <boost/compute/algorithm/detail/scan_on_cpu.hpp> +#include <boost/compute/container/vector.hpp> +#include <boost/compute/detail/iterator_range_size.hpp> +#include <boost/compute/memory/local_buffer.hpp> +#include <boost/compute/iterator/buffer_iterator.hpp> + +namespace boost { +namespace compute { +namespace detail { + +template<class InputIterator, class OutputIterator, class BinaryOperator> +class local_scan_kernel : public meta_kernel +{ +public: + local_scan_kernel(InputIterator first, + InputIterator last, + OutputIterator result, + bool exclusive, + BinaryOperator op) + : meta_kernel("local_scan") + { + typedef typename std::iterator_traits<InputIterator>::value_type T; + + (void) last; + + bool checked = true; + + m_block_sums_arg = add_arg<T *>(memory_object::global_memory, "block_sums"); + m_scratch_arg = add_arg<T *>(memory_object::local_memory, "scratch"); + m_block_size_arg = add_arg<const cl_uint>("block_size"); + m_count_arg = add_arg<const cl_uint>("count"); + m_init_value_arg = add_arg<const T>("init"); + + // work-item parameters + *this << + "const uint gid = get_global_id(0);\n" << + "const uint lid = get_local_id(0);\n"; + + // check against data size + if(checked){ + *this << + "if(gid < count){\n"; + } + + // copy values from input to local memory + if(exclusive){ + *this << + decl<const T>("local_init") << "= (gid == 0) ? init : 0;\n" << + "if(lid == 0){ scratch[lid] = local_init; }\n" << + "else { scratch[lid] = " << first[expr<cl_uint>("gid-1")] << "; }\n"; + } + else{ + *this << + "scratch[lid] = " << first[expr<cl_uint>("gid")] << ";\n"; + } + + if(checked){ + *this << + "}\n" + "else {\n" << + " scratch[lid] = 0;\n" << + "}\n"; + } + + // wait for all threads to read from input + *this << + "barrier(CLK_LOCAL_MEM_FENCE);\n"; + + // perform scan + *this << + "for(uint i = 1; i < block_size; i <<= 1){\n" << + " " << decl<const T>("x") << " = lid >= i ? scratch[lid-i] : 0;\n" << + " barrier(CLK_LOCAL_MEM_FENCE);\n" << + " if(lid >= i){\n" << + " scratch[lid] = " << op(var<T>("scratch[lid]"), var<T>("x")) << ";\n" << + " }\n" << + " barrier(CLK_LOCAL_MEM_FENCE);\n" << + "}\n"; + + // copy results to output + if(checked){ + *this << + "if(gid < count){\n"; + } + + *this << + result[expr<cl_uint>("gid")] << " = scratch[lid];\n"; + + if(checked){ + *this << "}\n"; + } + + // store sum for the block + if(exclusive){ + *this << + "if(lid == block_size - 1){\n" << + " block_sums[get_group_id(0)] = " << + op(first[expr<cl_uint>("gid")], var<T>("scratch[lid]")) << + ";\n" << + "}\n"; + } + else { + *this << + "if(lid == block_size - 1){\n" << + " block_sums[get_group_id(0)] = scratch[lid];\n" << + "}\n"; + } + } + + size_t m_block_sums_arg; + size_t m_scratch_arg; + size_t m_block_size_arg; + size_t m_count_arg; + size_t m_init_value_arg; +}; + +template<class T, class BinaryOperator> +class write_scanned_output_kernel : public meta_kernel +{ +public: + write_scanned_output_kernel(BinaryOperator op) + : meta_kernel("write_scanned_output") + { + bool checked = true; + + m_output_arg = add_arg<T *>(memory_object::global_memory, "output"); + m_block_sums_arg = add_arg<const T *>(memory_object::global_memory, "block_sums"); + m_count_arg = add_arg<const cl_uint>("count"); + + // work-item parameters + *this << + "const uint gid = get_global_id(0);\n" << + "const uint block_id = get_group_id(0);\n"; + + // check against data size + if(checked){ + *this << "if(gid < count){\n"; + } + + // write output + *this << + "output[gid] = " << + op(var<T>("block_sums[block_id]"), var<T>("output[gid] ")) << ";\n"; + + if(checked){ + *this << "}\n"; + } + } + + size_t m_output_arg; + size_t m_block_sums_arg; + size_t m_count_arg; +}; + +template<class InputIterator> +inline size_t pick_scan_block_size(InputIterator first, InputIterator last) +{ + size_t count = iterator_range_size(first, last); + + if(count == 0) { return 0; } + else if(count <= 1) { return 1; } + else if(count <= 2) { return 2; } + else if(count <= 4) { return 4; } + else if(count <= 8) { return 8; } + else if(count <= 16) { return 16; } + else if(count <= 32) { return 32; } + else if(count <= 64) { return 64; } + else if(count <= 128) { return 128; } + else { return 256; } +} + +template<class InputIterator, class OutputIterator, class T, class BinaryOperator> +inline OutputIterator scan_impl(InputIterator first, + InputIterator last, + OutputIterator result, + bool exclusive, + T init, + BinaryOperator op, + command_queue &queue) +{ + typedef typename + std::iterator_traits<InputIterator>::value_type + input_type; + typedef typename + std::iterator_traits<InputIterator>::difference_type + difference_type; + typedef typename + std::iterator_traits<OutputIterator>::value_type + output_type; + + const context &context = queue.get_context(); + const size_t count = detail::iterator_range_size(first, last); + + size_t block_size = pick_scan_block_size(first, last); + size_t block_count = count / block_size; + + if(block_count * block_size < count){ + block_count++; + } + + ::boost::compute::vector<input_type> block_sums(block_count, context); + + // zero block sums + input_type zero; + std::memset(&zero, 0, sizeof(input_type)); + ::boost::compute::fill(block_sums.begin(), block_sums.end(), zero, queue); + + // local scan + local_scan_kernel<InputIterator, OutputIterator, BinaryOperator> + local_scan_kernel(first, last, result, exclusive, op); + + ::boost::compute::kernel kernel = local_scan_kernel.compile(context); + kernel.set_arg(local_scan_kernel.m_scratch_arg, local_buffer<input_type>(block_size)); + kernel.set_arg(local_scan_kernel.m_block_sums_arg, block_sums); + kernel.set_arg(local_scan_kernel.m_block_size_arg, static_cast<cl_uint>(block_size)); + kernel.set_arg(local_scan_kernel.m_count_arg, static_cast<cl_uint>(count)); + kernel.set_arg(local_scan_kernel.m_init_value_arg, static_cast<output_type>(init)); + + queue.enqueue_1d_range_kernel(kernel, + 0, + block_count * block_size, + block_size); + + // inclusive scan block sums + if(block_count > 1){ + scan_impl(block_sums.begin(), + block_sums.end(), + block_sums.begin(), + false, + init, + op, + queue + ); + } + + // add block sums to each block + if(block_count > 1){ + write_scanned_output_kernel<input_type, BinaryOperator> + write_output_kernel(op); + kernel = write_output_kernel.compile(context); + kernel.set_arg(write_output_kernel.m_output_arg, result.get_buffer()); + kernel.set_arg(write_output_kernel.m_block_sums_arg, block_sums); + kernel.set_arg(write_output_kernel.m_count_arg, static_cast<cl_uint>(count)); + + queue.enqueue_1d_range_kernel(kernel, + block_size, + block_count * block_size, + block_size); + } + + return result + static_cast<difference_type>(count); +} + +template<class InputIterator, class OutputIterator, class T, class BinaryOperator> +inline OutputIterator dispatch_scan(InputIterator first, + InputIterator last, + OutputIterator result, + bool exclusive, + T init, + BinaryOperator op, + command_queue &queue) +{ + return scan_impl(first, last, result, exclusive, init, op, queue); +} + +template<class InputIterator, class T, class BinaryOperator> +inline InputIterator dispatch_scan(InputIterator first, + InputIterator last, + InputIterator result, + bool exclusive, + T init, + BinaryOperator op, + command_queue &queue) +{ + typedef typename std::iterator_traits<InputIterator>::value_type value_type; + + if(first == result){ + // scan input in-place + const context &context = queue.get_context(); + + // make a temporary copy the input + size_t count = iterator_range_size(first, last); + vector<value_type> tmp(count, context); + copy(first, last, tmp.begin(), queue); + + // scan from temporary values + return scan_impl(tmp.begin(), tmp.end(), first, exclusive, init, op, queue); + } + else { + // scan input to output + return scan_impl(first, last, result, exclusive, init, op, queue); + } +} + +template<class InputIterator, class OutputIterator, class T, class BinaryOperator> +inline OutputIterator scan_on_gpu(InputIterator first, + InputIterator last, + OutputIterator result, + bool exclusive, + T init, + BinaryOperator op, + command_queue &queue) +{ + if(first == last){ + return result; + } + + return dispatch_scan(first, last, result, exclusive, init, op, queue); +} + +} // end detail namespace +} // end compute namespace +} // end boost namespace + +#endif // BOOST_COMPUTE_ALGORITHM_DETAIL_SCAN_ON_GPU_HPP |