diff options
Diffstat (limited to 'boost/compute/algorithm/detail/merge_sort_on_cpu.hpp')
-rw-r--r-- | boost/compute/algorithm/detail/merge_sort_on_cpu.hpp | 366 |
1 files changed, 366 insertions, 0 deletions
diff --git a/boost/compute/algorithm/detail/merge_sort_on_cpu.hpp b/boost/compute/algorithm/detail/merge_sort_on_cpu.hpp new file mode 100644 index 0000000000..f4b53f10ae --- /dev/null +++ b/boost/compute/algorithm/detail/merge_sort_on_cpu.hpp @@ -0,0 +1,366 @@ +//---------------------------------------------------------------------------// +// Copyright (c) 2015 Jakub Szuppe <j.szuppe@gmail.com> +// +// Distributed under the Boost Software License, Version 1.0 +// See accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt +// +// See http://boostorg.github.com/compute for more information. +//---------------------------------------------------------------------------// + +#ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_MERGE_SORT_ON_CPU_HPP +#define BOOST_COMPUTE_ALGORITHM_DETAIL_MERGE_SORT_ON_CPU_HPP + +#include <boost/compute/kernel.hpp> +#include <boost/compute/program.hpp> +#include <boost/compute/command_queue.hpp> +#include <boost/compute/algorithm/detail/merge_with_merge_path.hpp> +#include <boost/compute/container/vector.hpp> +#include <boost/compute/detail/meta_kernel.hpp> +#include <boost/compute/detail/iterator_range_size.hpp> + +namespace boost { +namespace compute { +namespace detail { + +template<class KeyIterator, class ValueIterator, class Compare> +inline void merge_blocks(KeyIterator keys_first, + ValueIterator values_first, + KeyIterator keys_result, + ValueIterator values_result, + Compare compare, + size_t count, + const size_t block_size, + const bool sort_by_key, + command_queue &queue) +{ + (void) values_result; + (void) values_first; + + meta_kernel k("merge_sort_on_cpu_merge_blocks"); + size_t count_arg = k.add_arg<const uint_>("count"); + size_t block_size_arg = k.add_arg<uint_>("block_size"); + + k << + k.decl<uint_>("b1_start") << " = get_global_id(0) * block_size * 2;\n" << + k.decl<uint_>("b1_end") << " = min(count, b1_start + block_size);\n" << + k.decl<uint_>("b2_start") << " = min(count, b1_start + block_size);\n" << + k.decl<uint_>("b2_end") << " = min(count, b2_start + block_size);\n" << + k.decl<uint_>("result_idx") << " = b1_start;\n" << + + // merging block 1 and block 2 (stable) + "while(b1_start < b1_end && b2_start < b2_end){\n" << + " if( " << compare(keys_first[k.var<uint_>("b2_start")], + keys_first[k.var<uint_>("b1_start")]) << "){\n" << + " " << keys_result[k.var<uint_>("result_idx")] << " = " << + keys_first[k.var<uint_>("b2_start")] << ";\n"; + if(sort_by_key){ + k << + " " << values_result[k.var<uint_>("result_idx")] << " = " << + values_first[k.var<uint_>("b2_start")] << ";\n"; + } + k << + " b2_start++;\n" << + " }\n" << + " else {\n" << + " " << keys_result[k.var<uint_>("result_idx")] << " = " << + keys_first[k.var<uint_>("b1_start")] << ";\n"; + if(sort_by_key){ + k << + " " << values_result[k.var<uint_>("result_idx")] << " = " << + values_first[k.var<uint_>("b1_start")] << ";\n"; + } + k << + " b1_start++;\n" << + " }\n" << + " result_idx++;\n" << + "}\n" << + "while(b1_start < b1_end){\n" << + " " << keys_result[k.var<uint_>("result_idx")] << " = " << + keys_first[k.var<uint_>("b1_start")] << ";\n"; + if(sort_by_key){ + k << + " " << values_result[k.var<uint_>("result_idx")] << " = " << + values_first[k.var<uint_>("b1_start")] << ";\n"; + } + k << + " b1_start++;\n" << + " result_idx++;\n" << + "}\n" << + "while(b2_start < b2_end){\n" << + " " << keys_result[k.var<uint_>("result_idx")] << " = " << + keys_first[k.var<uint_>("b2_start")] << ";\n"; + if(sort_by_key){ + k << + " " << values_result[k.var<uint_>("result_idx")] << " = " << + values_first[k.var<uint_>("b2_start")] << ";\n"; + } + k << + " b2_start++;\n" << + " result_idx++;\n" << + "}\n"; + + const context &context = queue.get_context(); + ::boost::compute::kernel kernel = k.compile(context); + kernel.set_arg(count_arg, static_cast<const uint_>(count)); + kernel.set_arg(block_size_arg, static_cast<uint_>(block_size)); + + const size_t global_size = static_cast<size_t>( + std::ceil(float(count) / (2 * block_size)) + ); + queue.enqueue_1d_range_kernel(kernel, 0, global_size, 0); +} + +template<class Iterator, class Compare> +inline void merge_blocks(Iterator first, + Iterator result, + Compare compare, + size_t count, + const size_t block_size, + const bool sort_by_key, + command_queue &queue) +{ + // dummy iterator as it's not sort by key + Iterator dummy; + merge_blocks(first, dummy, result, dummy, compare, count, block_size, false, queue); +} + +template<class Iterator, class Compare> +inline void dispatch_merge_blocks(Iterator first, + Iterator result, + Compare compare, + size_t count, + const size_t block_size, + const size_t input_size_threshold, + const size_t blocks_no_threshold, + command_queue &queue) +{ + const size_t blocks_no = static_cast<size_t>( + std::ceil(float(count) / block_size) + ); + // merge with merge path should used only for the large arrays and at the + // end of merging part when there are only a few big blocks left to be merged + if(blocks_no <= blocks_no_threshold && count >= input_size_threshold){ + Iterator last = first + count; + for(size_t i = 0; i < count; i+= 2*block_size) + { + Iterator first1 = (std::min)(first + i, last); + Iterator last1 = (std::min)(first1 + block_size, last); + Iterator first2 = last1; + Iterator last2 = (std::min)(first2 + block_size, last); + Iterator block_result = (std::min)(result + i, result + count); + merge_with_merge_path(first1, last1, first2, last2, + block_result, compare, queue); + } + } + else { + merge_blocks(first, result, compare, count, block_size, false, queue); + } +} + +template<class KeyIterator, class ValueIterator, class Compare> +inline void block_insertion_sort(KeyIterator keys_first, + ValueIterator values_first, + Compare compare, + const size_t count, + const size_t block_size, + const bool sort_by_key, + command_queue &queue) +{ + (void) values_first; + + typedef typename std::iterator_traits<KeyIterator>::value_type K; + typedef typename std::iterator_traits<ValueIterator>::value_type T; + + meta_kernel k("merge_sort_on_cpu_block_insertion_sort"); + size_t count_arg = k.add_arg<uint_>("count"); + size_t block_size_arg = k.add_arg<uint_>("block_size"); + + k << + k.decl<uint_>("start") << " = get_global_id(0) * block_size;\n" << + k.decl<uint_>("end") << " = min(count, start + block_size);\n" << + + // block insertion sort (stable) + "for(uint i = start+1; i < end; i++){\n" << + " " << k.decl<const K>("key") << " = " << + keys_first[k.var<uint_>("i")] << ";\n"; + if(sort_by_key){ + k << + " " << k.decl<const T>("value") << " = " << + values_first[k.var<uint_>("i")] << ";\n"; + } + k << + " uint pos = i;\n" << + " while(pos > start && " << + compare(k.var<const K>("key"), + keys_first[k.var<uint_>("pos-1")]) << "){\n" << + " " << keys_first[k.var<uint_>("pos")] << " = " << + keys_first[k.var<uint_>("pos-1")] << ";\n"; + if(sort_by_key){ + k << + " " << values_first[k.var<uint_>("pos")] << " = " << + values_first[k.var<uint_>("pos-1")] << ";\n"; + } + k << + " pos--;\n" << + " }\n" << + " " << keys_first[k.var<uint_>("pos")] << " = key;\n"; + if(sort_by_key) { + k << + " " << values_first[k.var<uint_>("pos")] << " = value;\n"; + } + k << + "}\n"; // block insertion sort + + const context &context = queue.get_context(); + ::boost::compute::kernel kernel = k.compile(context); + kernel.set_arg(count_arg, static_cast<uint_>(count)); + kernel.set_arg(block_size_arg, static_cast<uint_>(block_size)); + + const size_t global_size = static_cast<size_t>(std::ceil(float(count) / block_size)); + queue.enqueue_1d_range_kernel(kernel, 0, global_size, 0); +} + +template<class Iterator, class Compare> +inline void block_insertion_sort(Iterator first, + Compare compare, + const size_t count, + const size_t block_size, + command_queue &queue) +{ + // dummy iterator as it's not sort by key + Iterator dummy; + block_insertion_sort(first, dummy, compare, count, block_size, false, queue); +} + +// This sort is stable. +template<class Iterator, class Compare> +inline void merge_sort_on_cpu(Iterator first, + Iterator last, + Compare compare, + command_queue &queue) +{ + typedef typename std::iterator_traits<Iterator>::value_type value_type; + + size_t count = iterator_range_size(first, last); + if(count < 2){ + return; + } + // for small input size only insertion sort is performed + else if(count <= 512){ + block_insertion_sort(first, compare, count, count, queue); + return; + } + + const context &context = queue.get_context(); + const device &device = queue.get_device(); + + // loading parameters + std::string cache_key = + std::string("__boost_merge_sort_on_cpu_") + type_name<value_type>(); + boost::shared_ptr<parameter_cache> parameters = + detail::parameter_cache::get_global_cache(device); + + // When there is merge_with_path_blocks_no_threshold or less blocks left to + // merge AND input size is merge_with_merge_path_input_size_threshold or more + // merge_with_merge_path() algorithm is used to merge sorted blocks; + // otherwise merge_blocks() is used. + const size_t merge_with_path_blocks_no_threshold = + parameters->get(cache_key, "merge_with_merge_path_blocks_no_threshold", 8); + const size_t merge_with_path_input_size_threshold = + parameters->get(cache_key, "merge_with_merge_path_input_size_threshold", 2097152); + + const size_t block_size = + parameters->get(cache_key, "insertion_sort_block_size", 64); + block_insertion_sort(first, compare, count, block_size, queue); + + // temporary buffer for merge result + vector<value_type> temp(count, context); + bool result_in_temporary_buffer = false; + + for(size_t i = block_size; i < count; i *= 2){ + result_in_temporary_buffer = !result_in_temporary_buffer; + if(result_in_temporary_buffer) { + dispatch_merge_blocks(first, temp.begin(), compare, count, i, + merge_with_path_input_size_threshold, + merge_with_path_blocks_no_threshold, + queue); + } else { + dispatch_merge_blocks(temp.begin(), first, compare, count, i, + merge_with_path_input_size_threshold, + merge_with_path_blocks_no_threshold, + queue); + } + } + + if(result_in_temporary_buffer) { + copy(temp.begin(), temp.end(), first, queue); + } +} + +// This sort is stable. +template<class KeyIterator, class ValueIterator, class Compare> +inline void merge_sort_by_key_on_cpu(KeyIterator keys_first, + KeyIterator keys_last, + ValueIterator values_first, + Compare compare, + command_queue &queue) +{ + typedef typename std::iterator_traits<KeyIterator>::value_type key_type; + typedef typename std::iterator_traits<ValueIterator>::value_type value_type; + + size_t count = iterator_range_size(keys_first, keys_last); + if(count < 2){ + return; + } + // for small input size only insertion sort is performed + else if(count <= 512){ + block_insertion_sort(keys_first, values_first, compare, + count, count, true, queue); + return; + } + + const context &context = queue.get_context(); + const device &device = queue.get_device(); + + // loading parameters + std::string cache_key = + std::string("__boost_merge_sort_by_key_on_cpu_") + type_name<value_type>() + + "_with_" + type_name<key_type>(); + boost::shared_ptr<parameter_cache> parameters = + detail::parameter_cache::get_global_cache(device); + + const size_t block_size = + parameters->get(cache_key, "insertion_sort_by_key_block_size", 64); + block_insertion_sort(keys_first, values_first, compare, + count, block_size, true, queue); + + // temporary buffer for merge results + vector<value_type> values_temp(count, context); + vector<key_type> keys_temp(count, context); + bool result_in_temporary_buffer = false; + + for(size_t i = block_size; i < count; i *= 2){ + result_in_temporary_buffer = !result_in_temporary_buffer; + if(result_in_temporary_buffer) { + merge_blocks(keys_first, values_first, + keys_temp.begin(), values_temp.begin(), + compare, count, i, true, queue); + } else { + merge_blocks(keys_temp.begin(), values_temp.begin(), + keys_first, values_first, + compare, count, i, true, queue); + } + } + + if(result_in_temporary_buffer) { + copy(keys_temp.begin(), keys_temp.end(), keys_first, queue); + copy(values_temp.begin(), values_temp.end(), values_first, queue); + } +} + +} // end detail namespace +} // end compute namespace +} // end boost namespace + +#endif // BOOST_COMPUTE_ALGORITHM_DETAIL_MERGE_SORT_ON_CPU_HPP |