diff options
Diffstat (limited to 'boost/compute/algorithm/detail/inplace_reduce.hpp')
-rw-r--r-- | boost/compute/algorithm/detail/inplace_reduce.hpp | 136 |
1 files changed, 136 insertions, 0 deletions
diff --git a/boost/compute/algorithm/detail/inplace_reduce.hpp b/boost/compute/algorithm/detail/inplace_reduce.hpp new file mode 100644 index 0000000000..60c61e83fe --- /dev/null +++ b/boost/compute/algorithm/detail/inplace_reduce.hpp @@ -0,0 +1,136 @@ +//---------------------------------------------------------------------------// +// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com> +// +// Distributed under the Boost Software License, Version 1.0 +// See accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt +// +// See http://boostorg.github.com/compute for more information. +//---------------------------------------------------------------------------// + +#ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_INPLACE_REDUCE_HPP +#define BOOST_COMPUTE_ALGORITHM_DETAIL_INPLACE_REDUCE_HPP + +#include <iterator> + +#include <boost/utility/result_of.hpp> + +#include <boost/compute/command_queue.hpp> +#include <boost/compute/container/vector.hpp> +#include <boost/compute/detail/iterator_range_size.hpp> +#include <boost/compute/memory/local_buffer.hpp> + +namespace boost { +namespace compute { +namespace detail { + +template<class Iterator, class BinaryFunction> +inline void inplace_reduce(Iterator first, + Iterator last, + BinaryFunction function, + command_queue &queue) +{ + typedef typename + std::iterator_traits<Iterator>::value_type + value_type; + + size_t input_size = iterator_range_size(first, last); + if(input_size < 2){ + return; + } + + const context &context = queue.get_context(); + + size_t block_size = 64; + size_t values_per_thread = 8; + size_t block_count = input_size / (block_size * values_per_thread); + if(block_count * block_size * values_per_thread != input_size) + block_count++; + + vector<value_type> output(block_count, context); + + meta_kernel k("inplace_reduce"); + size_t input_arg = k.add_arg<value_type *>(memory_object::global_memory, "input"); + size_t input_size_arg = k.add_arg<const uint_>("input_size"); + size_t output_arg = k.add_arg<value_type *>(memory_object::global_memory, "output"); + size_t scratch_arg = k.add_arg<value_type *>(memory_object::local_memory, "scratch"); + k << + "const uint gid = get_global_id(0);\n" << + "const uint lid = get_local_id(0);\n" << + "const uint values_per_thread =\n" + << uint_(values_per_thread) << ";\n" << + + // thread reduce + "const uint index = gid * values_per_thread;\n" << + "if(index < input_size){\n" << + k.decl<value_type>("sum") << " = input[index];\n" << + "for(uint i = 1;\n" << + "i < values_per_thread && (index + i) < input_size;\n" << + "i++){\n" << + " sum = " << + function(k.var<value_type>("sum"), + k.var<value_type>("input[index+i]")) << ";\n" << + "}\n" << + "scratch[lid] = sum;\n" << + "}\n" << + + // local reduce + "for(uint i = 1; i < get_local_size(0); i <<= 1){\n" << + " barrier(CLK_LOCAL_MEM_FENCE);\n" << + " uint mask = (i << 1) - 1;\n" << + " uint next_index = (gid + i) * values_per_thread;\n" + " if((lid & mask) == 0 && next_index < input_size){\n" << + " scratch[lid] = " << + function(k.var<value_type>("scratch[lid]"), + k.var<value_type>("scratch[lid+i]")) << ";\n" << + " }\n" << + "}\n" << + + // write output for block + "if(lid == 0){\n" << + " output[get_group_id(0)] = scratch[0];\n" << + "}\n" + ; + + const buffer *input_buffer = &first.get_buffer(); + const buffer *output_buffer = &output.get_buffer(); + + kernel kernel = k.compile(context); + + while(input_size > 1){ + kernel.set_arg(input_arg, *input_buffer); + kernel.set_arg(input_size_arg, static_cast<uint_>(input_size)); + kernel.set_arg(output_arg, *output_buffer); + kernel.set_arg(scratch_arg, local_buffer<value_type>(block_size)); + + queue.enqueue_1d_range_kernel(kernel, + 0, + block_count * block_size, + block_size); + + input_size = + static_cast<size_t>( + std::ceil(float(input_size) / (block_size * values_per_thread) + ) + ); + + block_count = input_size / (block_size * values_per_thread); + if(block_count * block_size * values_per_thread != input_size) + block_count++; + + std::swap(input_buffer, output_buffer); + } + + if(input_buffer != &first.get_buffer()){ + ::boost::compute::copy(output.begin(), + output.begin() + 1, + first, + queue); + } +} + +} // end detail namespace +} // end compute namespace +} // end boost namespace + +#endif // BOOST_COMPUTE_ALGORITHM_DETAIL_INPLACE_REDUCE_HPP |