diff options
Diffstat (limited to 'boost/compute/algorithm/reduce.hpp')
-rw-r--r-- | boost/compute/algorithm/reduce.hpp | 301 |
1 file changed, 301 insertions, 0 deletions
diff --git a/boost/compute/algorithm/reduce.hpp b/boost/compute/algorithm/reduce.hpp new file mode 100644 index 0000000000..79624a0e50 --- /dev/null +++ b/boost/compute/algorithm/reduce.hpp @@ -0,0 +1,301 @@ +//---------------------------------------------------------------------------// +// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com> +// +// Distributed under the Boost Software License, Version 1.0 +// See accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt +// +// See http://boostorg.github.com/compute for more information. +//---------------------------------------------------------------------------// + +#ifndef BOOST_COMPUTE_ALGORITHM_REDUCE_HPP +#define BOOST_COMPUTE_ALGORITHM_REDUCE_HPP + +#include <iterator> + +#include <boost/compute/system.hpp> +#include <boost/compute/functional.hpp> +#include <boost/compute/detail/meta_kernel.hpp> +#include <boost/compute/command_queue.hpp> +#include <boost/compute/container/array.hpp> +#include <boost/compute/container/vector.hpp> +#include <boost/compute/algorithm/copy_n.hpp> +#include <boost/compute/algorithm/detail/inplace_reduce.hpp> +#include <boost/compute/algorithm/detail/reduce_on_gpu.hpp> +#include <boost/compute/algorithm/detail/serial_reduce.hpp> +#include <boost/compute/detail/iterator_range_size.hpp> +#include <boost/compute/memory/local_buffer.hpp> +#include <boost/compute/type_traits/result_of.hpp> + +namespace boost { +namespace compute { +namespace detail { + +template<class InputIterator, class OutputIterator, class BinaryFunction> +size_t reduce(InputIterator first, + size_t count, + OutputIterator result, + size_t block_size, + BinaryFunction function, + command_queue &queue) +{ + typedef typename + std::iterator_traits<InputIterator>::value_type + input_type; + typedef typename + boost::compute::result_of<BinaryFunction(input_type, input_type)>::type + result_type; + + const context &context = queue.get_context(); + size_t block_count = count / 2 / 
block_size; + size_t total_block_count = + static_cast<size_t>(std::ceil(float(count) / 2.f / float(block_size))); + + if(block_count != 0){ + meta_kernel k("block_reduce"); + size_t output_arg = k.add_arg<result_type *>(memory_object::global_memory, "output"); + size_t block_arg = k.add_arg<input_type *>(memory_object::local_memory, "block"); + + k << + "const uint gid = get_global_id(0);\n" << + "const uint lid = get_local_id(0);\n" << + + // copy values to local memory + "block[lid] = " << + function(first[k.make_var<uint_>("gid*2+0")], + first[k.make_var<uint_>("gid*2+1")]) << ";\n" << + + // perform reduction + "for(uint i = 1; i < " << uint_(block_size) << "; i <<= 1){\n" << + " barrier(CLK_LOCAL_MEM_FENCE);\n" << + " uint mask = (i << 1) - 1;\n" << + " if((lid & mask) == 0){\n" << + " block[lid] = " << + function(k.expr<input_type>("block[lid]"), + k.expr<input_type>("block[lid+i]")) << ";\n" << + " }\n" << + "}\n" << + + // write block result to global output + "if(lid == 0)\n" << + " output[get_group_id(0)] = block[0];\n"; + + kernel kernel = k.compile(context); + kernel.set_arg(output_arg, result.get_buffer()); + kernel.set_arg(block_arg, local_buffer<input_type>(block_size)); + + queue.enqueue_1d_range_kernel(kernel, + 0, + block_count * block_size, + block_size); + } + + // serially reduce any leftovers + if(block_count * block_size * 2 < count){ + size_t last_block_start = block_count * block_size * 2; + + meta_kernel k("extra_serial_reduce"); + size_t count_arg = k.add_arg<uint_>("count"); + size_t offset_arg = k.add_arg<uint_>("offset"); + size_t output_arg = k.add_arg<result_type *>(memory_object::global_memory, "output"); + size_t output_offset_arg = k.add_arg<uint_>("output_offset"); + + k << + k.decl<result_type>("result") << " = \n" << + first[k.expr<uint_>("offset")] << ";\n" << + "for(uint i = offset + 1; i < count; i++)\n" << + " result = " << + function(k.var<result_type>("result"), + first[k.var<uint_>("i")]) << ";\n" << + 
"output[output_offset] = result;\n"; + + kernel kernel = k.compile(context); + kernel.set_arg(count_arg, static_cast<uint_>(count)); + kernel.set_arg(offset_arg, static_cast<uint_>(last_block_start)); + kernel.set_arg(output_arg, result.get_buffer()); + kernel.set_arg(output_offset_arg, static_cast<uint_>(block_count)); + + queue.enqueue_task(kernel); + } + + return total_block_count; +} + +template<class InputIterator, class BinaryFunction> +inline vector< + typename boost::compute::result_of< + BinaryFunction( + typename std::iterator_traits<InputIterator>::value_type, + typename std::iterator_traits<InputIterator>::value_type + ) + >::type +> +block_reduce(InputIterator first, + size_t count, + size_t block_size, + BinaryFunction function, + command_queue &queue) +{ + typedef typename + std::iterator_traits<InputIterator>::value_type + input_type; + typedef typename + boost::compute::result_of<BinaryFunction(input_type, input_type)>::type + result_type; + + const context &context = queue.get_context(); + size_t total_block_count = + static_cast<size_t>(std::ceil(float(count) / 2.f / float(block_size))); + vector<result_type> result_vector(total_block_count, context); + + reduce(first, count, result_vector.begin(), block_size, function, queue); + + return result_vector; +} + +template<class InputIterator, class OutputIterator, class BinaryFunction> +inline void generic_reduce(InputIterator first, + InputIterator last, + OutputIterator result, + BinaryFunction function, + command_queue &queue) +{ + typedef typename + std::iterator_traits<InputIterator>::value_type + input_type; + typedef typename + boost::compute::result_of<BinaryFunction(input_type, input_type)>::type + result_type; + + const device &device = queue.get_device(); + const context &context = queue.get_context(); + + size_t count = detail::iterator_range_size(first, last); + + if(device.type() & device::cpu){ + boost::compute::vector<result_type> value(1, context); + detail::serial_reduce(first, 
last, value.begin(), function, queue); + boost::compute::copy_n(value.begin(), 1, result, queue); + } + else { + size_t block_size = 256; + + // first pass + vector<result_type> results = detail::block_reduce(first, + count, + block_size, + function, + queue); + + if(results.size() > 1){ + detail::inplace_reduce(results.begin(), + results.end(), + function, + queue); + } + + boost::compute::copy_n(results.begin(), 1, result, queue); + } +} + +template<class InputIterator, class OutputIterator, class T> +inline void dispatch_reduce(InputIterator first, + InputIterator last, + OutputIterator result, + const plus<T> &function, + command_queue &queue) +{ + const context &context = queue.get_context(); + const device &device = queue.get_device(); + + // reduce to temporary buffer on device + array<T, 1> tmp(context); + if(device.type() & device::cpu){ + detail::serial_reduce(first, last, tmp.begin(), function, queue); + } + else { + reduce_on_gpu(first, last, tmp.begin(), function, queue); + } + + // copy to result iterator + copy_n(tmp.begin(), 1, result, queue); +} + +template<class InputIterator, class OutputIterator, class BinaryFunction> +inline void dispatch_reduce(InputIterator first, + InputIterator last, + OutputIterator result, + BinaryFunction function, + command_queue &queue) +{ + generic_reduce(first, last, result, function, queue); +} + +} // end detail namespace + +/// Returns the result of applying \p function to the elements in the +/// range [\p first, \p last). +/// +/// If no function is specified, \c plus will be used. +/// +/// \param first first element in the input range +/// \param last last element in the input range +/// \param result iterator pointing to the output +/// \param function binary reduction function +/// \param queue command queue to perform the operation +/// +/// The \c reduce() algorithm assumes that the binary reduction function is +/// associative. 
When used with non-associative functions the result may +/// be non-deterministic and vary in precision. Notably this affects the +/// \c plus<float>() function as floating-point addition is not associative +/// and may produce slightly different results than a serial algorithm. +/// +/// This algorithm supports both host and device iterators for the +/// result argument. This allows for values to be reduced and copied +/// to the host all with a single function call. +/// +/// For example, to calculate the sum of the values in a device vector and +/// copy the result to a value on the host: +/// +/// \snippet test/test_reduce.cpp sum_int +/// +/// Note that while the the \c reduce() algorithm is conceptually identical to +/// the \c accumulate() algorithm, its implementation is substantially more +/// efficient on parallel hardware. For more information, see the documentation +/// on the \c accumulate() algorithm. +/// +/// \see accumulate() +template<class InputIterator, class OutputIterator, class BinaryFunction> +inline void reduce(InputIterator first, + InputIterator last, + OutputIterator result, + BinaryFunction function, + command_queue &queue = system::default_queue()) +{ + if(first == last){ + return; + } + + detail::dispatch_reduce(first, last, result, function, queue); +} + +/// \overload +template<class InputIterator, class OutputIterator> +inline void reduce(InputIterator first, + InputIterator last, + OutputIterator result, + command_queue &queue = system::default_queue()) +{ + typedef typename std::iterator_traits<InputIterator>::value_type T; + + if(first == last){ + return; + } + + detail::dispatch_reduce(first, last, result, plus<T>(), queue); +} + +} // end compute namespace +} // end boost namespace + +#endif // BOOST_COMPUTE_ALGORITHM_REDUCE_HPP |