summaryrefslogtreecommitdiff
path: root/boost/compute/algorithm
diff options
context:
space:
mode:
Diffstat (limited to 'boost/compute/algorithm')
-rw-r--r--boost/compute/algorithm/accumulate.hpp184
-rw-r--r--boost/compute/algorithm/adjacent_difference.hpp98
-rw-r--r--boost/compute/algorithm/adjacent_find.hpp162
-rw-r--r--boost/compute/algorithm/all_of.hpp36
-rw-r--r--boost/compute/algorithm/any_of.hpp40
-rw-r--r--boost/compute/algorithm/binary_search.hpp37
-rw-r--r--boost/compute/algorithm/copy.hpp362
-rw-r--r--boost/compute/algorithm/copy_if.hpp58
-rw-r--r--boost/compute/algorithm/copy_n.hpp51
-rw-r--r--boost/compute/algorithm/count.hpp55
-rw-r--r--boost/compute/algorithm/count_if.hpp62
-rw-r--r--boost/compute/algorithm/detail/balanced_path.hpp162
-rw-r--r--boost/compute/algorithm/detail/binary_find.hpp133
-rw-r--r--boost/compute/algorithm/detail/compact.hpp77
-rw-r--r--boost/compute/algorithm/detail/copy_on_device.hpp190
-rw-r--r--boost/compute/algorithm/detail/copy_to_device.hpp127
-rw-r--r--boost/compute/algorithm/detail/copy_to_host.hpp137
-rw-r--r--boost/compute/algorithm/detail/count_if_with_ballot.hpp78
-rw-r--r--boost/compute/algorithm/detail/count_if_with_reduce.hpp87
-rw-r--r--boost/compute/algorithm/detail/count_if_with_threads.hpp129
-rw-r--r--boost/compute/algorithm/detail/find_extrema.hpp64
-rw-r--r--boost/compute/algorithm/detail/find_extrema_with_atomics.hpp108
-rw-r--r--boost/compute/algorithm/detail/find_extrema_with_reduce.hpp443
-rw-r--r--boost/compute/algorithm/detail/find_if_with_atomics.hpp212
-rw-r--r--boost/compute/algorithm/detail/inplace_reduce.hpp136
-rw-r--r--boost/compute/algorithm/detail/insertion_sort.hpp165
-rw-r--r--boost/compute/algorithm/detail/merge_path.hpp116
-rw-r--r--boost/compute/algorithm/detail/merge_sort_on_cpu.hpp366
-rw-r--r--boost/compute/algorithm/detail/merge_with_merge_path.hpp203
-rw-r--r--boost/compute/algorithm/detail/radix_sort.hpp415
-rw-r--r--boost/compute/algorithm/detail/random_fill.hpp57
-rw-r--r--boost/compute/algorithm/detail/reduce_by_key.hpp119
-rw-r--r--boost/compute/algorithm/detail/reduce_by_key_with_scan.hpp541
-rw-r--r--boost/compute/algorithm/detail/reduce_on_gpu.hpp286
-rw-r--r--boost/compute/algorithm/detail/scan.hpp45
-rw-r--r--boost/compute/algorithm/detail/scan_on_cpu.hpp103
-rw-r--r--boost/compute/algorithm/detail/scan_on_gpu.hpp331
-rw-r--r--boost/compute/algorithm/detail/search_all.hpp86
-rw-r--r--boost/compute/algorithm/detail/serial_accumulate.hpp56
-rw-r--r--boost/compute/algorithm/detail/serial_count_if.hpp68
-rw-r--r--boost/compute/algorithm/detail/serial_find_extrema.hpp87
-rw-r--r--boost/compute/algorithm/detail/serial_merge.hpp97
-rw-r--r--boost/compute/algorithm/detail/serial_reduce.hpp62
-rw-r--r--boost/compute/algorithm/detail/serial_reduce_by_key.hpp108
-rw-r--r--boost/compute/algorithm/equal.hpp53
-rw-r--r--boost/compute/algorithm/equal_range.hpp42
-rw-r--r--boost/compute/algorithm/exclusive_scan.hpp96
-rw-r--r--boost/compute/algorithm/fill.hpp306
-rw-r--r--boost/compute/algorithm/fill_n.hpp36
-rw-r--r--boost/compute/algorithm/find.hpp57
-rw-r--r--boost/compute/algorithm/find_end.hpp119
-rw-r--r--boost/compute/algorithm/find_if.hpp35
-rw-r--r--boost/compute/algorithm/find_if_not.hpp43
-rw-r--r--boost/compute/algorithm/for_each.hpp65
-rw-r--r--boost/compute/algorithm/for_each_n.hpp35
-rw-r--r--boost/compute/algorithm/gather.hpp84
-rw-r--r--boost/compute/algorithm/generate.hpp49
-rw-r--r--boost/compute/algorithm/generate_n.hpp35
-rw-r--r--boost/compute/algorithm/includes.hpp155
-rw-r--r--boost/compute/algorithm/inclusive_scan.hpp81
-rw-r--r--boost/compute/algorithm/inner_product.hpp93
-rw-r--r--boost/compute/algorithm/inplace_merge.hpp60
-rw-r--r--boost/compute/algorithm/iota.hpp48
-rw-r--r--boost/compute/algorithm/is_partitioned.hpp43
-rw-r--r--boost/compute/algorithm/is_permutation.hpp67
-rw-r--r--boost/compute/algorithm/is_sorted.hpp64
-rw-r--r--boost/compute/algorithm/lexicographical_compare.hpp117
-rw-r--r--boost/compute/algorithm/lower_bound.hpp44
-rw-r--r--boost/compute/algorithm/max_element.hpp74
-rw-r--r--boost/compute/algorithm/merge.hpp105
-rw-r--r--boost/compute/algorithm/min_element.hpp74
-rw-r--r--boost/compute/algorithm/minmax_element.hpp70
-rw-r--r--boost/compute/algorithm/mismatch.hpp89
-rw-r--r--boost/compute/algorithm/next_permutation.hpp170
-rw-r--r--boost/compute/algorithm/none_of.hpp36
-rw-r--r--boost/compute/algorithm/nth_element.hpp87
-rw-r--r--boost/compute/algorithm/partial_sum.hpp37
-rw-r--r--boost/compute/algorithm/partition.hpp39
-rw-r--r--boost/compute/algorithm/partition_copy.hpp63
-rw-r--r--boost/compute/algorithm/partition_point.hpp46
-rw-r--r--boost/compute/algorithm/prev_permutation.hpp170
-rw-r--r--boost/compute/algorithm/random_shuffle.hpp75
-rw-r--r--boost/compute/algorithm/reduce.hpp301
-rw-r--r--boost/compute/algorithm/reduce_by_key.hpp118
-rw-r--r--boost/compute/algorithm/remove.hpp54
-rw-r--r--boost/compute/algorithm/remove_if.hpp47
-rw-r--r--boost/compute/algorithm/replace.hpp90
-rw-r--r--boost/compute/algorithm/replace_copy.hpp62
-rw-r--r--boost/compute/algorithm/reverse.hpp74
-rw-r--r--boost/compute/algorithm/reverse_copy.hpp79
-rw-r--r--boost/compute/algorithm/rotate.hpp54
-rw-r--r--boost/compute/algorithm/rotate_copy.hpp41
-rw-r--r--boost/compute/algorithm/scatter.hpp99
-rw-r--r--boost/compute/algorithm/scatter_if.hpp119
-rw-r--r--boost/compute/algorithm/search.hpp73
-rw-r--r--boost/compute/algorithm/search_n.hpp140
-rw-r--r--boost/compute/algorithm/set_difference.hpp182
-rw-r--r--boost/compute/algorithm/set_intersection.hpp170
-rw-r--r--boost/compute/algorithm/set_symmetric_difference.hpp194
-rw-r--r--boost/compute/algorithm/set_union.hpp195
-rw-r--r--boost/compute/algorithm/sort.hpp194
-rw-r--r--boost/compute/algorithm/sort_by_key.hpp156
-rw-r--r--boost/compute/algorithm/stable_partition.hpp72
-rw-r--r--boost/compute/algorithm/stable_sort.hpp99
-rw-r--r--boost/compute/algorithm/stable_sort_by_key.hpp61
-rw-r--r--boost/compute/algorithm/swap_ranges.hpp44
-rw-r--r--boost/compute/algorithm/transform.hpp76
-rw-r--r--boost/compute/algorithm/transform_if.hpp117
-rw-r--r--boost/compute/algorithm/transform_reduce.hpp89
-rw-r--r--boost/compute/algorithm/unique.hpp66
-rw-r--r--boost/compute/algorithm/unique_copy.hpp164
-rw-r--r--boost/compute/algorithm/upper_bound.hpp43
112 files changed, 12774 insertions, 0 deletions
diff --git a/boost/compute/algorithm/accumulate.hpp b/boost/compute/algorithm/accumulate.hpp
new file mode 100644
index 0000000000..328420a07c
--- /dev/null
+++ b/boost/compute/algorithm/accumulate.hpp
@@ -0,0 +1,184 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#ifndef BOOST_COMPUTE_ALGORITHM_ACCUMULATE_HPP
+#define BOOST_COMPUTE_ALGORITHM_ACCUMULATE_HPP
+
+#include <boost/preprocessor/seq/for_each.hpp>
+
+#include <boost/compute/system.hpp>
+#include <boost/compute/functional.hpp>
+#include <boost/compute/command_queue.hpp>
+#include <boost/compute/algorithm/reduce.hpp>
+#include <boost/compute/algorithm/detail/serial_accumulate.hpp>
+#include <boost/compute/container/array.hpp>
+#include <boost/compute/container/vector.hpp>
+#include <boost/compute/detail/iterator_range_size.hpp>
+
+namespace boost {
+namespace compute {
+namespace detail {
+
+// Accumulates [first, last) into init with a serial device kernel and
+// returns the value to the host. Used when reduce() cannot be substituted.
+template<class InputIterator, class T, class BinaryFunction>
+inline T generic_accumulate(InputIterator first,
+                            InputIterator last,
+                            T init,
+                            BinaryFunction function,
+                            command_queue &queue)
+{
+    const context &ctx = queue.get_context();
+
+    // an empty range folds to the seed value; no device work is enqueued
+    if(iterator_range_size(first, last) == 0){
+        return init;
+    }
+
+    // one-element device array that receives the accumulated value
+    array<T, 1> device_total(ctx);
+    detail::serial_accumulate(
+        first, last, device_total.begin(), init, function, queue
+    );
+
+    // transfer that single value back to the host
+    T host_total;
+    ::boost::compute::copy_n(device_total.begin(), 1, &host_total, queue);
+    return host_total;
+}
+
+// returns true if accumulate() may be carried out with reduce() instead of
+// the serial kernel. reduce() takes no initial value, so the overloads
+// declared below only answer true when \p init equals the value they test
+// for (e.g. zero for plus, one for multiplies on the built-in integer types).
+//
+// this generic overload is the fallback for every other (type, function)
+// combination and always answers false.
+template<class T, class F>
+inline bool can_accumulate_with_reduce(T init, F function)
+{
+ (void) init;
+ (void) function;
+
+ return false;
+}
+
+/// \internal_
+// For one scalar type, declares the two can_accumulate_with_reduce()
+// overloads taking plus<type> and multiplies<type>. They return true when
+// init compares equal to type(0) and type(1) respectively.
+#define BOOST_COMPUTE_DETAIL_DECLARE_CAN_ACCUMULATE_WITH_REDUCE(r, data, type) \
+ inline bool can_accumulate_with_reduce(type init, plus<type>) \
+ { \
+ return init == type(0); \
+ } \
+ inline bool can_accumulate_with_reduce(type init, multiplies<type>) \
+ { \
+ return init == type(1); \
+ }
+
+// Expand the macro once per integer scalar type listed below. No
+// floating-point type appears in the list, so plus<float> and similar
+// fall through to the generic overload.
+BOOST_PP_SEQ_FOR_EACH(
+ BOOST_COMPUTE_DETAIL_DECLARE_CAN_ACCUMULATE_WITH_REDUCE,
+ _,
+ (char_)(uchar_)(short_)(ushort_)(int_)(uint_)(long_)(ulong_)
+)
+
+// min<T>: reduce() may replace accumulate() when init equals the largest
+// value numeric_limits reports for T.
+template<class T>
+inline bool can_accumulate_with_reduce(T init, min<T>)
+{
+    T largest = (std::numeric_limits<T>::max)();
+    return init == largest;
+}
+
+// Yields the lowest finite value of T, i.e. the seed that max() never
+// prefers over an element.
+//
+// numeric_limits<T>::min() is only the lowest value for integer types; for
+// floating-point types it is the smallest *positive* normalized value. The
+// NegateMax specialization therefore returns -max() instead. Types without
+// a numeric_limits specialization keep the primary template and so behave
+// as before.
+template<class T, bool NegateMax>
+struct accumulate_lowest_value
+{
+    static T get()
+    {
+        return (std::numeric_limits<T>::min)();
+    }
+};
+
+template<class T>
+struct accumulate_lowest_value<T, true>
+{
+    static T get()
+    {
+        return -(std::numeric_limits<T>::max)();
+    }
+};
+
+// max<T>: reduce() may replace accumulate() when init equals the lowest
+// finite value of T.
+//
+// NOTE: as with the min<T> overload, an infinity in the input is not
+// accounted for; init is only compared against the finite bound.
+template<class T>
+inline bool can_accumulate_with_reduce(T init, max<T>)
+{
+    return init == accumulate_lowest_value<
+        T,
+        std::numeric_limits<T>::is_specialized &&
+            !std::numeric_limits<T>::is_integer
+    >::get();
+}
+
+#undef BOOST_COMPUTE_DETAIL_DECLARE_CAN_ACCUMULATE_WITH_REDUCE
+
+// Chooses between the parallel reduce() path and the serial accumulate
+// kernel for the given initial value and function.
+template<class InputIterator, class T, class BinaryFunction>
+inline T dispatch_accumulate(InputIterator first,
+                             InputIterator last,
+                             T init,
+                             BinaryFunction function,
+                             command_queue &queue)
+{
+    // nothing to fold: the answer is the seed value itself
+    if(iterator_range_size(first, last) == 0){
+        return init;
+    }
+
+    // general case: serial accumulation that takes init into account
+    if(!can_accumulate_with_reduce(init, function)){
+        return generic_accumulate(first, last, init, function, queue);
+    }
+
+    // init was recognized for this function, so reduce() (which takes no
+    // initial value) can produce the result directly
+    T reduced;
+    reduce(first, last, &reduced, function, queue);
+    return reduced;
+}
+
+} // end detail namespace
+
+/// Returns the result of applying \p function to the elements in the
+/// range [\p first, \p last) and \p init.
+///
+/// If no function is specified, \c plus will be used.
+///
+/// \param first first element in the input range
+/// \param last end of the input range (one past the last element)
+/// \param init initial value; returned unchanged for an empty range
+/// \param function binary reduction function
+/// \param queue command queue to perform the operation
+///
+/// \return the accumulated result value
+///
+/// In specific situations the call to \c accumulate() can be automatically
+/// optimized to a call to the more efficient \c reduce() algorithm. This
+/// occurs when the binary reduction function is recognized (such as the
+/// \c plus<int> function) and \p init is the matching identity value (zero
+/// for \c plus, one for \c multiplies).
+///
+/// Note that because floating-point addition is not associative, calling
+/// \c accumulate() with \c plus<float> results in a less efficient serial
+/// reduction algorithm being executed. If a slight loss in precision is
+/// acceptable, the more efficient parallel \c reduce() algorithm should be
+/// used instead.
+///
+/// For example:
+/// \code
+/// // with vec = boost::compute::vector<int>
+/// accumulate(vec.begin(), vec.end(), 0, plus<int>()); // fast
+/// reduce(vec.begin(), vec.end(), &result, plus<int>()); // fast
+///
+/// // with vec = boost::compute::vector<float>
+/// accumulate(vec.begin(), vec.end(), 0.0f, plus<float>()); // slow
+/// reduce(vec.begin(), vec.end(), &result, plus<float>()); // fast
+/// \endcode
+///
+/// \see reduce()
+template<class InputIterator, class T, class BinaryFunction>
+inline T accumulate(InputIterator first,
+ InputIterator last,
+ T init,
+ BinaryFunction function,
+ command_queue &queue = system::default_queue())
+{
+ return detail::dispatch_accumulate(first, last, init, function, queue);
+}
+
+/// \overload
+///
+/// Accumulates with \c plus instantiated for the value type of
+/// \c InputIterator (not for \c T).
+template<class InputIterator, class T>
+inline T accumulate(InputIterator first,
+                    InputIterator last,
+                    T init,
+                    command_queue &queue = system::default_queue())
+{
+    typedef typename std::iterator_traits<InputIterator>::value_type element_type;
+    typedef plus<element_type> addition;
+
+    return detail::dispatch_accumulate(first, last, init, addition(), queue);
+}
+
+} // end compute namespace
+} // end boost namespace
+
+#endif // BOOST_COMPUTE_ALGORITHM_ACCUMULATE_HPP
diff --git a/boost/compute/algorithm/adjacent_difference.hpp b/boost/compute/algorithm/adjacent_difference.hpp
new file mode 100644
index 0000000000..a8f84e020e
--- /dev/null
+++ b/boost/compute/algorithm/adjacent_difference.hpp
@@ -0,0 +1,98 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013-2014 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#ifndef BOOST_COMPUTE_ALGORITHM_ADJACENT_DIFFERENCE_HPP
+#define BOOST_COMPUTE_ALGORITHM_ADJACENT_DIFFERENCE_HPP
+
+#include <iterator>
+
+#include <boost/compute/system.hpp>
+#include <boost/compute/command_queue.hpp>
+#include <boost/compute/detail/meta_kernel.hpp>
+#include <boost/compute/detail/iterator_range_size.hpp>
+#include <boost/compute/functional/operator.hpp>
+#include <boost/compute/container/vector.hpp>
+
+namespace boost {
+namespace compute {
+
+namespace detail {
+
+// Returns true when the output range starts exactly where the input range
+// starts. Only iterators of the same type can be compared this way.
+template<class Iterator>
+inline bool adjacent_difference_in_place(Iterator first, Iterator result)
+{
+    return first == result;
+}
+
+// Iterators of different types are treated as non-aliasing.
+template<class InputIterator, class OutputIterator>
+inline bool adjacent_difference_in_place(InputIterator, OutputIterator)
+{
+    return false;
+}
+
+// Enqueues the kernel computing result[0] = first[0] and
+// result[i] = op(first[i], first[i-1]) for i > 0. The input and output
+// ranges must not alias: each work-item reads the element before its own.
+template<class InputIterator, class OutputIterator, class BinaryFunction>
+inline OutputIterator
+adjacent_difference_kernel(InputIterator first,
+                           InputIterator last,
+                           OutputIterator result,
+                           BinaryFunction op,
+                           command_queue &queue)
+{
+    size_t count = iterator_range_size(first, last);
+
+    meta_kernel k("adjacent_difference");
+
+    k << "const uint i = get_global_id(0);\n"
+      << "if(i == 0){\n"
+      << " " << result[k.var<uint_>("0")] << " = " << first[k.var<uint_>("0")] << ";\n"
+      << "}\n"
+      << "else {\n"
+      << " " << result[k.var<uint_>("i")] << " = "
+      << op(first[k.var<uint_>("i")], first[k.var<uint_>("i-1")]) << ";\n"
+      << "}\n";
+
+    k.exec_1d(queue, 0, count, 1);
+
+    return result + count;
+}
+
+} // end detail namespace
+
+/// Stores the difference of each pair of consecutive values in the range
+/// [\p first, \p last) to the range beginning at \p result. If \p op is not
+/// provided, \c minus<T> is used.
+///
+/// The operation may be performed in place (\p result equal to \p first);
+/// the input is then copied to a temporary vector first so that no
+/// work-item reads an element another work-item has already overwritten.
+/// Output ranges that overlap the input in any other way are not detected.
+///
+/// \param first first element in the input range
+/// \param last end of the input range (one past the last element)
+/// \param result first element in the output range
+/// \param op binary difference function
+/// \param queue command queue to perform the operation
+///
+/// \return \c OutputIterator to the end of the result range
+///
+/// \see adjacent_find()
+template<class InputIterator, class OutputIterator, class BinaryFunction>
+inline OutputIterator
+adjacent_difference(InputIterator first,
+                    InputIterator last,
+                    OutputIterator result,
+                    BinaryFunction op,
+                    command_queue &queue = system::default_queue())
+{
+    typedef typename std::iterator_traits<InputIterator>::value_type value_type;
+
+    if(first == last){
+        return result;
+    }
+
+    if(detail::adjacent_difference_in_place(first, result)){
+        // in-place: read from a snapshot of the input instead
+        vector<value_type> temp(detail::iterator_range_size(first, last),
+                                queue.get_context());
+        copy(first, last, temp.begin(), queue);
+
+        return detail::adjacent_difference_kernel(
+            temp.begin(), temp.end(), result, op, queue
+        );
+    }
+
+    return detail::adjacent_difference_kernel(first, last, result, op, queue);
+}
+
+/// \overload
+///
+/// Uses \c minus on the input value type. When \p result equals \p first
+/// the input is copied to a temporary vector before differencing.
+template<class InputIterator, class OutputIterator>
+inline OutputIterator
+adjacent_difference(InputIterator first,
+                    InputIterator last,
+                    OutputIterator result,
+                    command_queue &queue = system::default_queue())
+{
+    typedef typename std::iterator_traits<InputIterator>::value_type element_type;
+    typedef ::boost::compute::minus<element_type> subtract;
+
+    // distinct start positions: difference straight from the input
+    if(!(first == result)){
+        return ::boost::compute::adjacent_difference(
+            first, last, result, subtract(), queue
+        );
+    }
+
+    // in-place: take a snapshot of the input and difference from that
+    vector<element_type> snapshot(detail::iterator_range_size(first, last),
+                                  queue.get_context());
+    copy(first, last, snapshot.begin(), queue);
+
+    return ::boost::compute::adjacent_difference(
+        snapshot.begin(), snapshot.end(), result, subtract(), queue
+    );
+}
+
+} // end compute namespace
+} // end boost namespace
+
+#endif // BOOST_COMPUTE_ALGORITHM_ADJACENT_DIFFERENCE_HPP
diff --git a/boost/compute/algorithm/adjacent_find.hpp b/boost/compute/algorithm/adjacent_find.hpp
new file mode 100644
index 0000000000..992a01eddc
--- /dev/null
+++ b/boost/compute/algorithm/adjacent_find.hpp
@@ -0,0 +1,162 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013-2014 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#ifndef BOOST_COMPUTE_ALGORITHM_ADJACENT_FIND_HPP
+#define BOOST_COMPUTE_ALGORITHM_ADJACENT_FIND_HPP
+
+#include <iterator>
+
+#include <boost/compute/command_queue.hpp>
+#include <boost/compute/lambda.hpp>
+#include <boost/compute/system.hpp>
+#include <boost/compute/container/detail/scalar.hpp>
+#include <boost/compute/detail/iterator_range_size.hpp>
+#include <boost/compute/detail/meta_kernel.hpp>
+#include <boost/compute/functional/operator.hpp>
+#include <boost/compute/type_traits/vector_size.hpp>
+
+namespace boost {
+namespace compute {
+namespace detail {
+
+// Scans the range with a single work-item and returns an iterator to the
+// first element for which compare(element, next element) holds, or last
+// when there is none.
+template<class InputIterator, class Compare>
+inline InputIterator
+serial_adjacent_find(InputIterator first,
+                     InputIterator last,
+                     Compare compare,
+                     command_queue &queue)
+{
+    if(first == last){
+        return last;
+    }
+
+    // device-side scalar that receives the index of the match
+    const context &ctx = queue.get_context();
+    detail::scalar<uint_> found_index(ctx);
+
+    detail::meta_kernel mk("serial_adjacent_find");
+
+    size_t size_slot = mk.add_arg<const uint_>("size");
+    size_t output_slot = mk.add_arg<uint_ *>(memory_object::global_memory, "output");
+
+    // result starts at size, so "not found" maps to first + size
+    mk << mk.decl<uint_>("result") << " = size;\n"
+       << "for(uint i = 0; i < size - 1; i++){\n"
+       << " if(" << compare(first[mk.expr<uint_>("i")],
+                            first[mk.expr<uint_>("i+1")]) << "){\n"
+       << " result = i;\n"
+       << " break;\n"
+       << " }\n"
+       << "}\n"
+       << "*output = result;\n";
+
+    const uint_ element_count =
+        static_cast<uint_>(detail::iterator_range_size(first, last));
+    mk.set_arg<const uint_>(size_slot, element_count);
+    mk.set_arg(output_slot, found_index.get_buffer());
+
+    // one work-item walks the whole range
+    mk.exec_1d(queue, 0, 1, 1);
+
+    return first + found_index.read(queue);
+}
+
+// Tests every adjacent pair with one work-item each and keeps the lowest
+// matching index via atomic_min(). Returns an iterator to the first element
+// for which compare(element, next element) holds, or last when none does.
+template<class InputIterator, class Compare>
+inline InputIterator
+adjacent_find_with_atomics(InputIterator first,
+                           InputIterator last,
+                           Compare compare,
+                           command_queue &queue)
+{
+    if(first == last){
+        return last;
+    }
+
+    const context &context = queue.get_context();
+    size_t count = detail::iterator_range_size(first, last);
+
+    // a single element has no neighbour to compare with; returning here also
+    // avoids enqueuing a kernel with a global work size of zero
+    if(count < 2){
+        return last;
+    }
+
+    // initialize output to the last index
+    detail::scalar<uint_> output(context);
+    output.write(static_cast<uint_>(count), queue);
+
+    detail::meta_kernel k("adjacent_find_with_atomics");
+
+    size_t output_arg = k.add_arg<uint_ *>(memory_object::global_memory, "output");
+
+    k << "const uint i = get_global_id(0);\n"
+      << "if(" << compare(first[k.expr<uint_>("i")],
+                          first[k.expr<uint_>("i+1")]) << "){\n"
+      << " atomic_min(output, i);\n"
+      << "}\n";
+
+    k.set_arg(output_arg, output.get_buffer());
+
+    // one work-item per adjacent pair: indices 0 .. count-2
+    k.exec_1d(queue, 0, count - 1, 1);
+
+    return first + output.read(queue);
+}
+
+} // end detail namespace
+
+/// Searches the range [\p first, \p last) for two adjacent elements for
+/// which \p compare returns \c true and returns an iterator pointing to
+/// the first of them. If \p compare is not provided, equality is used.
+///
+/// \param first first element in the range to search
+/// \param last end of the range to search (one past the last element)
+/// \param compare binary comparison function
+/// \param queue command queue to perform the operation
+///
+/// \return \c InputIterator to the first element which matches the
+///         following element. If there is none, returns \c last.
+///
+/// \see find(), adjacent_difference()
+template<class InputIterator, class Compare>
+inline InputIterator
+adjacent_find(InputIterator first,
+              InputIterator last,
+              Compare compare,
+              command_queue &queue = system::default_queue())
+{
+    // ranges shorter than this are scanned by a single work-item
+    const size_t serial_threshold = 32;
+
+    if(detail::iterator_range_size(first, last) >= serial_threshold){
+        return detail::adjacent_find_with_atomics(first, last, compare, queue);
+    }
+
+    return detail::serial_adjacent_find(first, last, compare, queue);
+}
+
+/// \overload
+///
+/// Compares adjacent elements for equality. For vector value types the
+/// comparison is wrapped in \c all().
+template<class InputIterator>
+inline InputIterator
+adjacent_find(InputIterator first,
+              InputIterator last,
+              command_queue &queue = system::default_queue())
+{
+    typedef typename std::iterator_traits<InputIterator>::value_type element_type;
+
+    using ::boost::compute::lambda::_1;
+    using ::boost::compute::lambda::_2;
+    using ::boost::compute::lambda::all;
+
+    const bool is_scalar = (vector_size<element_type>::value == 1);
+
+    if(!is_scalar){
+        // vector element type: wrap the comparison in all()
+        return ::boost::compute::adjacent_find(
+            first, last, all(_1 == _2), queue
+        );
+    }
+
+    return ::boost::compute::adjacent_find(
+        first, last, _1 == _2, queue
+    );
+}
+
+} // end compute namespace
+} // end boost namespace
+
+#endif // BOOST_COMPUTE_ALGORITHM_ADJACENT_FIND_HPP
diff --git a/boost/compute/algorithm/all_of.hpp b/boost/compute/algorithm/all_of.hpp
new file mode 100644
index 0000000000..34d7518f32
--- /dev/null
+++ b/boost/compute/algorithm/all_of.hpp
@@ -0,0 +1,36 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#ifndef BOOST_COMPUTE_ALGORITHM_ALL_OF_HPP
+#define BOOST_COMPUTE_ALGORITHM_ALL_OF_HPP
+
+#include <boost/compute/system.hpp>
+#include <boost/compute/algorithm/find_if_not.hpp>
+
+namespace boost {
+namespace compute {
+
+/// Returns \c true if \p predicate returns \c true for all of the elements in
+/// the range [\p first, \p last).
+///
+/// Implemented with find_if_not(): the result is \c true when no element
+/// failing \p predicate is found, i.e. the search returns \p last.
+///
+/// \param first first element in the range to test
+/// \param last end of the range to test (one past the last element)
+/// \param predicate unary predicate applied to the elements
+/// \param queue command queue to perform the operation
+///
+/// \see any_of(), none_of()
+template<class InputIterator, class UnaryPredicate>
+inline bool all_of(InputIterator first,
+ InputIterator last,
+ UnaryPredicate predicate,
+ command_queue &queue = system::default_queue())
+{
+ return ::boost::compute::find_if_not(first, last, predicate, queue) == last;
+}
+
+} // end compute namespace
+} // end boost namespace
+
+#endif // BOOST_COMPUTE_ALGORITHM_ALL_OF_HPP
diff --git a/boost/compute/algorithm/any_of.hpp b/boost/compute/algorithm/any_of.hpp
new file mode 100644
index 0000000000..b07779597c
--- /dev/null
+++ b/boost/compute/algorithm/any_of.hpp
@@ -0,0 +1,40 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#ifndef BOOST_COMPUTE_ALGORITHM_ANY_OF_HPP
+#define BOOST_COMPUTE_ALGORITHM_ANY_OF_HPP
+
+#include <boost/compute/system.hpp>
+#include <boost/compute/algorithm/find_if.hpp>
+
+namespace boost {
+namespace compute {
+
+/// Returns \c true if \p predicate returns \c true for any of the elements in
+/// the range [\p first, \p last).
+///
+/// Implemented with find_if(): the result is \c true when the search
+/// returns something other than \p last.
+///
+/// For example, to test if a vector contains any negative values:
+///
+/// \snippet test/test_any_all_none_of.cpp any_of
+///
+/// \param first first element in the range to test
+/// \param last end of the range to test (one past the last element)
+/// \param predicate unary predicate applied to the elements
+/// \param queue command queue to perform the operation
+///
+/// \see all_of(), none_of()
+template<class InputIterator, class UnaryPredicate>
+inline bool any_of(InputIterator first,
+ InputIterator last,
+ UnaryPredicate predicate,
+ command_queue &queue = system::default_queue())
+{
+ return ::boost::compute::find_if(first, last, predicate, queue) != last;
+}
+
+} // end compute namespace
+} // end boost namespace
+
+#endif // BOOST_COMPUTE_ALGORITHM_ANY_OF_HPP
diff --git a/boost/compute/algorithm/binary_search.hpp b/boost/compute/algorithm/binary_search.hpp
new file mode 100644
index 0000000000..6e19498790
--- /dev/null
+++ b/boost/compute/algorithm/binary_search.hpp
@@ -0,0 +1,37 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#ifndef BOOST_COMPUTE_ALGORITHM_BINARY_SEARCH_HPP
+#define BOOST_COMPUTE_ALGORITHM_BINARY_SEARCH_HPP
+
+#include <boost/compute/system.hpp>
+#include <boost/compute/command_queue.hpp>
+#include <boost/compute/algorithm/lower_bound.hpp>
+
+namespace boost {
+namespace compute {
+
+/// Returns \c true if \p value is in the sorted range [\p first,
+/// \p last).
+///
+/// The candidate position is located with lower_bound() and the element
+/// found there is read back and compared with \p value.
+template<class InputIterator, class T>
+inline bool binary_search(InputIterator first,
+                          InputIterator last,
+                          const T &value,
+                          command_queue &queue = system::default_queue())
+{
+    InputIterator candidate = lower_bound(first, last, value, queue);
+
+    if(candidate != last){
+        // only read from the device when there is an element to inspect
+        return candidate.read(queue) == value;
+    }
+
+    return false;
+}
+
+} // end compute namespace
+} // end boost namespace
+
+#endif // BOOST_COMPUTE_ALGORITHM_BINARY_SEARCH_HPP
diff --git a/boost/compute/algorithm/copy.hpp b/boost/compute/algorithm/copy.hpp
new file mode 100644
index 0000000000..2a25059bba
--- /dev/null
+++ b/boost/compute/algorithm/copy.hpp
@@ -0,0 +1,362 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#ifndef BOOST_COMPUTE_ALGORITHM_COPY_HPP
+#define BOOST_COMPUTE_ALGORITHM_COPY_HPP
+
+#include <algorithm>
+#include <iterator>
+
+#include <boost/utility/enable_if.hpp>
+
+#include <boost/mpl/and.hpp>
+#include <boost/mpl/not.hpp>
+
+#include <boost/compute/buffer.hpp>
+#include <boost/compute/system.hpp>
+#include <boost/compute/command_queue.hpp>
+#include <boost/compute/algorithm/detail/copy_on_device.hpp>
+#include <boost/compute/algorithm/detail/copy_to_device.hpp>
+#include <boost/compute/algorithm/detail/copy_to_host.hpp>
+#include <boost/compute/async/future.hpp>
+#include <boost/compute/detail/is_contiguous_iterator.hpp>
+#include <boost/compute/detail/iterator_range_size.hpp>
+#include <boost/compute/iterator/buffer_iterator.hpp>
+#include <boost/compute/type_traits/is_device_iterator.hpp>
+
+namespace boost {
+namespace compute {
+namespace detail {
+
+namespace mpl = boost::mpl;
+
+// meta-function returning true if copy() between InputIterator and
+// OutputIterator can be implemented with clEnqueueCopyBuffer().
+//
+// all three conditions must hold: InputIterator is exactly a
+// buffer_iterator over its value_type, OutputIterator is exactly a
+// buffer_iterator over its value_type, and the two value_types are the same.
+template<class InputIterator, class OutputIterator>
+struct can_copy_with_copy_buffer :
+ mpl::and_<
+ boost::is_same<
+ InputIterator,
+ buffer_iterator<typename InputIterator::value_type>
+ >,
+ boost::is_same<
+ OutputIterator,
+ buffer_iterator<typename OutputIterator::value_type>
+ >,
+ boost::is_same<
+ typename InputIterator::value_type,
+ typename OutputIterator::value_type
+ >
+ >::type {};
+
+// host -> device
+//
+// selected when the input is a host iterator and the output is a device
+// iterator.
+template<class InputIterator, class OutputIterator>
+inline OutputIterator
+dispatch_copy(InputIterator first,
+              InputIterator last,
+              OutputIterator result,
+              command_queue &queue,
+              typename boost::enable_if_c<
+                  !is_device_iterator<InputIterator>::value &&
+                  is_device_iterator<OutputIterator>::value
+              >::type* = 0)
+{
+    if(!is_contiguous_iterator<InputIterator>::value){
+        // non-contiguous host input is first gathered into a temporary
+        // std::vector, which is then copied to the device
+        typedef typename std::iterator_traits<InputIterator>::value_type element_type;
+        std::vector<element_type> staging(first, last);
+        return copy_to_device(staging.begin(), staging.end(), result, queue);
+    }
+
+    // contiguous host input can be handed over directly
+    return copy_to_device(first, last, result, queue);
+}
+
+// host -> device (async)
+//
+// selected when the input is a host iterator and the output is a device
+// iterator. unlike the synchronous overload there is no staging through a
+// temporary vector: a non-contiguous host iterator is rejected at compile
+// time by the static assertion below.
+template<class InputIterator, class OutputIterator>
+inline future<OutputIterator>
+dispatch_copy_async(InputIterator first,
+ InputIterator last,
+ OutputIterator result,
+ command_queue &queue,
+ typename boost::enable_if_c<
+ !is_device_iterator<InputIterator>::value &&
+ is_device_iterator<OutputIterator>::value
+ >::type* = 0)
+{
+ BOOST_STATIC_ASSERT_MSG(
+ is_contiguous_iterator<InputIterator>::value,
+ "copy_async() is only supported for contiguous host iterators"
+ );
+
+ return copy_to_device_async(first, last, result, queue);
+}
+
+// device -> host
+//
+// selected when the input is a device iterator and the output is a host
+// iterator.
+template<class InputIterator, class OutputIterator>
+inline OutputIterator
+dispatch_copy(InputIterator first,
+              InputIterator last,
+              OutputIterator result,
+              command_queue &queue,
+              typename boost::enable_if_c<
+                  is_device_iterator<InputIterator>::value &&
+                  !is_device_iterator<OutputIterator>::value
+              >::type* = 0)
+{
+    if(!is_contiguous_iterator<OutputIterator>::value){
+        // for non-contiguous output the values are first read into a
+        // temporary std::vector and then copied on the host from there
+        typedef typename std::iterator_traits<InputIterator>::value_type element_type;
+        std::vector<element_type> staging(iterator_range_size(first, last));
+        copy_to_host(first, last, staging.begin(), queue);
+        return std::copy(staging.begin(), staging.end(), result);
+    }
+
+    // contiguous host output can be written directly
+    return copy_to_host(first, last, result, queue);
+}
+
+// device -> host (async)
+//
+// selected when the input is a device iterator and the output is a host
+// iterator. a non-contiguous host output iterator is rejected at compile
+// time by the static assertion below.
+template<class InputIterator, class OutputIterator>
+inline future<OutputIterator>
+dispatch_copy_async(InputIterator first,
+ InputIterator last,
+ OutputIterator result,
+ command_queue &queue,
+ typename boost::enable_if_c<
+ is_device_iterator<InputIterator>::value &&
+ !is_device_iterator<OutputIterator>::value
+ >::type* = 0)
+{
+ BOOST_STATIC_ASSERT_MSG(
+ is_contiguous_iterator<OutputIterator>::value,
+ "copy_async() is only supported for contiguous host iterators"
+ );
+
+ return copy_to_host_async(first, last, result, queue);
+}
+
+// device -> device
+//
+// selected when both iterators are device iterators but the pair does not
+// satisfy can_copy_with_copy_buffer; the copy is delegated to
+// copy_on_device().
+template<class InputIterator, class OutputIterator>
+inline OutputIterator
+dispatch_copy(InputIterator first,
+ InputIterator last,
+ OutputIterator result,
+ command_queue &queue,
+ typename boost::enable_if<
+ mpl::and_<
+ is_device_iterator<InputIterator>,
+ is_device_iterator<OutputIterator>,
+ mpl::not_<
+ can_copy_with_copy_buffer<
+ InputIterator, OutputIterator
+ >
+ >
+ >
+ >::type* = 0)
+{
+ return copy_on_device(first, last, result, queue);
+}
+
+// device -> device (specialization for buffer iterators)
+//
+// selected when the iterator pair satisfies can_copy_with_copy_buffer; the
+// range is transferred with a single enqueue_copy_buffer() call.
+template<class InputIterator, class OutputIterator>
+inline OutputIterator
+dispatch_copy(InputIterator first,
+              InputIterator last,
+              OutputIterator result,
+              command_queue &queue,
+              typename boost::enable_if<
+                  mpl::and_<
+                      is_device_iterator<InputIterator>,
+                      is_device_iterator<OutputIterator>,
+                      can_copy_with_copy_buffer<
+                          InputIterator, OutputIterator
+                      >
+                  >
+              >::type* = 0)
+{
+    typedef typename std::iterator_traits<InputIterator>::value_type element_type;
+    typedef typename std::iterator_traits<InputIterator>::difference_type distance_type;
+
+    const distance_type element_count = std::distance(first, last);
+    if(element_count < 1){
+        // empty (or reversed) range: nothing is enqueued
+        return result;
+    }
+
+    // iterator indices and the element count are converted to byte units
+    const size_t element_bytes = sizeof(element_type);
+
+    queue.enqueue_copy_buffer(
+        first.get_buffer(),
+        result.get_buffer(),
+        first.get_index() * element_bytes,
+        result.get_index() * element_bytes,
+        static_cast<size_t>(element_count) * element_bytes
+    );
+
+    return result + element_count;
+}
+
+// device -> device (async)
+//
+// selected when both iterators are device iterators but the pair does not
+// satisfy can_copy_with_copy_buffer; the copy is delegated to
+// copy_on_device_async().
+template<class InputIterator, class OutputIterator>
+inline future<OutputIterator>
+dispatch_copy_async(InputIterator first,
+ InputIterator last,
+ OutputIterator result,
+ command_queue &queue,
+ typename boost::enable_if<
+ mpl::and_<
+ is_device_iterator<InputIterator>,
+ is_device_iterator<OutputIterator>,
+ mpl::not_<
+ can_copy_with_copy_buffer<
+ InputIterator, OutputIterator
+ >
+ >
+ >
+ >::type* = 0)
+{
+ return copy_on_device_async(first, last, result, queue);
+}
+
+// device -> device (async, specialization for buffer iterators)
+//
+// selected when the iterator pair satisfies can_copy_with_copy_buffer; the
+// returned future carries the event of the enqueued buffer copy.
+template<class InputIterator, class OutputIterator>
+inline future<OutputIterator>
+dispatch_copy_async(InputIterator first,
+                    InputIterator last,
+                    OutputIterator result,
+                    command_queue &queue,
+                    typename boost::enable_if<
+                        mpl::and_<
+                            is_device_iterator<InputIterator>,
+                            is_device_iterator<OutputIterator>,
+                            can_copy_with_copy_buffer<
+                                InputIterator, OutputIterator
+                            >
+                        >
+                    >::type* = 0)
+{
+    typedef typename std::iterator_traits<InputIterator>::value_type element_type;
+    typedef typename std::iterator_traits<InputIterator>::difference_type distance_type;
+
+    const distance_type element_count = std::distance(first, last);
+    if(element_count < 1){
+        // empty (or reversed) range: nothing is enqueued, so the future is
+        // paired with a default-constructed event
+        return make_future(result, event());
+    }
+
+    // iterator indices and the element count are converted to byte units
+    const size_t element_bytes = sizeof(element_type);
+
+    event copy_done = queue.enqueue_copy_buffer(
+        first.get_buffer(),
+        result.get_buffer(),
+        first.get_index() * element_bytes,
+        result.get_index() * element_bytes,
+        static_cast<size_t>(element_count) * element_bytes
+    );
+
+    return make_future(result + element_count, copy_done);
+}
+
+// host -> host: neither iterator is a device iterator, so the range is
+// copied with std::copy() and the command queue is not used
+template<class InputIterator, class OutputIterator>
+inline OutputIterator
+dispatch_copy(InputIterator first,
+              InputIterator last,
+              OutputIterator result,
+              command_queue &queue,
+              typename boost::enable_if_c<
+                  !is_device_iterator<InputIterator>::value &&
+                  !is_device_iterator<OutputIterator>::value
+              >::type* = 0)
+{
+    // silence the unused-parameter warning
+    static_cast<void>(queue);
+
+    OutputIterator end_of_output = std::copy(first, last, result);
+    return end_of_output;
+}
+
+} // end detail namespace
+
+/// Copies every value of the range [\p first, \p last) into the range
+/// that starts at \p result.
+///
+/// copy() is the generic data-transfer entry point and offers a uniform
+/// interface to the following OpenCL functions:
+///
+/// \li \c clEnqueueReadBuffer()
+/// \li \c clEnqueueWriteBuffer()
+/// \li \c clEnqueueCopyBuffer()
+///
+/// In contrast to those OpenCL functions, copy() also works with
+/// non-contiguous data-structures (e.g. \c std::list<T>) and with
+/// "fancy" iterators (e.g. transform_iterator).
+///
+/// \param first iterator to the first element to copy
+/// \param last iterator marking the end of the range to copy (the range
+///        is half-open, so \p last itself is not copied)
+/// \param result iterator to the first element of the result range
+/// \param queue command queue used to perform the operation
+///
+/// \return \c OutputIterator to the end of the result range
+///
+/// Example: copying an array of \c int values on the host to a vector on
+/// the device:
+/// \code
+/// // array on the host
+/// int data[] = { 1, 2, 3, 4 };
+///
+/// // vector on the device
+/// boost::compute::vector<int> vec(4, context);
+///
+/// // copy values to the device vector
+/// boost::compute::copy(data, data + 4, vec.begin(), queue);
+/// \endcode
+///
+/// Standard containers such as \c std::vector<T> can be used in either
+/// direction:
+/// \code
+/// std::vector<int> host_vector = ...
+/// boost::compute::vector<int> device_vector = ...
+///
+/// // copy from the host to the device
+/// boost::compute::copy(
+///     host_vector.begin(), host_vector.end(), device_vector.begin(), queue
+/// );
+///
+/// // copy from the device to the host
+/// boost::compute::copy(
+///     device_vector.begin(), device_vector.end(), host_vector.begin(), queue
+/// );
+/// \endcode
+///
+/// \see copy_n(), copy_if(), copy_async()
+template<class InputIterator, class OutputIterator>
+inline OutputIterator copy(InputIterator first,
+                           InputIterator last,
+                           OutputIterator result,
+                           command_queue &queue = system::default_queue())
+{
+    // overload resolution inside detail:: picks the transfer strategy
+    OutputIterator end_of_output =
+        detail::dispatch_copy(first, last, result, queue);
+
+    return end_of_output;
+}
+
+/// Asynchronous counterpart of copy(): copies the values of the range
+/// [\p first, \p last) into the range starting at \p result and returns
+/// a future wrapping the output iterator.
+///
+/// \see copy()
+template<class InputIterator, class OutputIterator>
+inline future<OutputIterator>
+copy_async(InputIterator first,
+           InputIterator last,
+           OutputIterator result,
+           command_queue &queue = system::default_queue())
+{
+    future<OutputIterator> pending =
+        detail::dispatch_copy_async(first, last, result, queue);
+
+    return pending;
+}
+
+} // end compute namespace
+} // end boost namespace
+
+#endif // BOOST_COMPUTE_ALGORITHM_COPY_HPP
diff --git a/boost/compute/algorithm/copy_if.hpp b/boost/compute/algorithm/copy_if.hpp
new file mode 100644
index 0000000000..3cd08ef293
--- /dev/null
+++ b/boost/compute/algorithm/copy_if.hpp
@@ -0,0 +1,58 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#ifndef BOOST_COMPUTE_ALGORITHM_COPY_IF_HPP
+#define BOOST_COMPUTE_ALGORITHM_COPY_IF_HPP
+
+#include <boost/compute/algorithm/transform_if.hpp>
+#include <boost/compute/functional/identity.hpp>
+
+namespace boost {
+namespace compute {
+namespace detail {
+
+// variant of copy_if() that writes the indices of the values for which
+// predicate returns true (requested by passing "true" to
+// transform_if_impl()) rather than the values themselves.
+template<class InputIterator, class OutputIterator, class Predicate>
+inline OutputIterator copy_index_if(InputIterator first,
+                                    InputIterator last,
+                                    OutputIterator result,
+                                    Predicate predicate,
+                                    command_queue &queue = system::default_queue())
+{
+    typedef std::iterator_traits<InputIterator> input_traits;
+    typedef typename input_traits::value_type element_type;
+
+    OutputIterator end_of_output = detail::transform_if_impl(
+        first, last, result, identity<element_type>(), predicate, true, queue
+    );
+
+    return end_of_output;
+}
+
+} // end detail namespace
+
+/// Copies to the range beginning at \p result each element of
+/// [\p first, \p last) for which \p predicate returns \c true.
+///
+/// Implemented as transform_if() with the identity function.
+template<class InputIterator, class OutputIterator, class Predicate>
+inline OutputIterator copy_if(InputIterator first,
+                              InputIterator last,
+                              OutputIterator result,
+                              Predicate predicate,
+                              command_queue &queue = system::default_queue())
+{
+    typedef std::iterator_traits<InputIterator> input_traits;
+    typedef typename input_traits::value_type element_type;
+
+    OutputIterator end_of_output = ::boost::compute::transform_if(
+        first, last, result, identity<element_type>(), predicate, queue
+    );
+
+    return end_of_output;
+}
+
+} // end compute namespace
+} // end boost namespace
+
+#endif // BOOST_COMPUTE_ALGORITHM_COPY_IF_HPP
diff --git a/boost/compute/algorithm/copy_n.hpp b/boost/compute/algorithm/copy_n.hpp
new file mode 100644
index 0000000000..f0989edc67
--- /dev/null
+++ b/boost/compute/algorithm/copy_n.hpp
@@ -0,0 +1,51 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#ifndef BOOST_COMPUTE_ALGORITHM_COPY_N_HPP
+#define BOOST_COMPUTE_ALGORITHM_COPY_N_HPP
+
+#include <boost/compute/system.hpp>
+#include <boost/compute/command_queue.hpp>
+#include <boost/compute/algorithm/copy.hpp>
+
+namespace boost {
+namespace compute {
+
+/// Copies \p count elements starting at \p first to the range beginning
+/// at \p result. Equivalent to calling copy() on
+/// [\p first, \p first + \p count).
+///
+/// Example: copying four values from the host to the device:
+/// \code
+/// // values on the host and vector on the device
+/// float values[4] = { 1.f, 2.f, 3.f, 4.f };
+/// boost::compute::vector<float> vec(4, context);
+///
+/// // copy from the host to the device
+/// boost::compute::copy_n(values, 4, vec.begin(), queue);
+/// \endcode
+///
+/// \see copy()
+template<class InputIterator, class Size, class OutputIterator>
+inline OutputIterator copy_n(InputIterator first,
+                             Size count,
+                             OutputIterator result,
+                             command_queue &queue = system::default_queue())
+{
+    typedef std::iterator_traits<InputIterator> input_traits;
+    typedef typename input_traits::difference_type distance_type;
+
+    // end of the input range, count elements past first
+    const InputIterator last = first + static_cast<distance_type>(count);
+
+    return ::boost::compute::copy(first, last, result, queue);
+}
+
+} // end compute namespace
+} // end boost namespace
+
+#endif // BOOST_COMPUTE_ALGORITHM_COPY_N_HPP
diff --git a/boost/compute/algorithm/count.hpp b/boost/compute/algorithm/count.hpp
new file mode 100644
index 0000000000..140d67379f
--- /dev/null
+++ b/boost/compute/algorithm/count.hpp
@@ -0,0 +1,55 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#ifndef BOOST_COMPUTE_ALGORITHM_COUNT_HPP
+#define BOOST_COMPUTE_ALGORITHM_COUNT_HPP
+
+#include <boost/compute/lambda.hpp>
+#include <boost/compute/system.hpp>
+#include <boost/compute/command_queue.hpp>
+#include <boost/compute/algorithm/count_if.hpp>
+#include <boost/compute/type_traits/vector_size.hpp>
+
+namespace boost {
+namespace compute {
+
+/// Counts how many times \p value occurs in the range
+/// [\p first, \p last).
+///
+/// \see count_if()
+template<class InputIterator, class T>
+inline size_t count(InputIterator first,
+                    InputIterator last,
+                    const T &value,
+                    command_queue &queue = system::default_queue())
+{
+    typedef std::iterator_traits<InputIterator> input_traits;
+    typedef typename input_traits::value_type element_type;
+
+    using ::boost::compute::_1;
+    using ::boost::compute::lambda::all;
+
+    const bool is_scalar = (vector_size<element_type>::value == 1);
+
+    if(!is_scalar){
+        // element types with more than one component: the comparison is
+        // wrapped in all()
+        return ::boost::compute::count_if(first, last, all(_1 == value), queue);
+    }
+
+    return ::boost::compute::count_if(first, last, _1 == value, queue);
+}
+
+} // end compute namespace
+} // end boost namespace
+
+#endif // BOOST_COMPUTE_ALGORITHM_COUNT_HPP
diff --git a/boost/compute/algorithm/count_if.hpp b/boost/compute/algorithm/count_if.hpp
new file mode 100644
index 0000000000..c9381ce5d4
--- /dev/null
+++ b/boost/compute/algorithm/count_if.hpp
@@ -0,0 +1,62 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#ifndef BOOST_COMPUTE_ALGORITHM_COUNT_IF_HPP
+#define BOOST_COMPUTE_ALGORITHM_COUNT_IF_HPP
+
+#include <boost/compute/device.hpp>
+#include <boost/compute/system.hpp>
+#include <boost/compute/command_queue.hpp>
+#include <boost/compute/algorithm/detail/count_if_with_ballot.hpp>
+#include <boost/compute/algorithm/detail/count_if_with_reduce.hpp>
+#include <boost/compute/algorithm/detail/count_if_with_threads.hpp>
+#include <boost/compute/algorithm/detail/serial_count_if.hpp>
+#include <boost/compute/detail/iterator_range_size.hpp>
+
+namespace boost {
+namespace compute {
+
+/// Counts the elements of the range [\p first, \p last) for which
+/// \p predicate returns \c true.
+///
+/// The implementation is picked from the queue's device type and the input
+/// size: inputs below a threshold (1024 elements on CPU devices, 32
+/// otherwise) use serial_count_if(); larger inputs use
+/// count_if_with_threads() on CPU devices and count_if_with_reduce() on
+/// all other devices.
+template<class InputIterator, class Predicate>
+inline size_t count_if(InputIterator first,
+                       InputIterator last,
+                       Predicate predicate,
+                       command_queue &queue = system::default_queue())
+{
+    const device &target = queue.get_device();
+
+    const size_t length = detail::iterator_range_size(first, last);
+    if(length == 0){
+        return 0;
+    }
+
+    const bool on_cpu = (target.type() & device::cpu) != 0;
+    const size_t serial_limit = on_cpu ? 1024 : 32;
+
+    if(length < serial_limit){
+        return detail::serial_count_if(first, last, predicate, queue);
+    }
+
+    if(on_cpu){
+        return detail::count_if_with_threads(first, last, predicate, queue);
+    }
+
+    return detail::count_if_with_reduce(first, last, predicate, queue);
+}
+
+} // end compute namespace
+} // end boost namespace
+
+#endif // BOOST_COMPUTE_ALGORITHM_COUNT_IF_HPP
diff --git a/boost/compute/algorithm/detail/balanced_path.hpp b/boost/compute/algorithm/detail/balanced_path.hpp
new file mode 100644
index 0000000000..e5025532d3
--- /dev/null
+++ b/boost/compute/algorithm/detail/balanced_path.hpp
@@ -0,0 +1,162 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2014 Roshan <thisisroshansmail@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_BALANCED_PATH_HPP
+#define BOOST_COMPUTE_ALGORITHM_DETAIL_BALANCED_PATH_HPP
+
+#include <iterator>
+
+#include <boost/compute/algorithm/find_if.hpp>
+#include <boost/compute/container/vector.hpp>
+#include <boost/compute/detail/iterator_range_size.hpp>
+#include <boost/compute/detail/meta_kernel.hpp>
+#include <boost/compute/lambda.hpp>
+#include <boost/compute/system.hpp>
+
+namespace boost {
+namespace compute {
+namespace detail {
+
+///
+/// \brief Balanced Path kernel class
+///
+/// Subclass of meta_kernel to break two sets into tiles according
+/// to their balanced path. Work-item i computes one (a_index, b_index) split for the combined offset (i+1)*tile_size and stores it in result_a[i] / result_b[i].
+/// Usage: call set_range() to generate the kernel source, then exec() to launch one work-item per whole tile of the combined input.
+class balanced_path_kernel : public meta_kernel
+{
+public:
+ unsigned int tile_size; // elements of the combined input per tile; the constructor sets it to 4
+
+ balanced_path_kernel() : meta_kernel("balanced_path")
+ {
+ tile_size = 4;
+ }
+
+ template<class InputIterator1, class InputIterator2,
+ class OutputIterator1, class OutputIterator2,
+ class Compare>
+ void set_range(InputIterator1 first1, // [first1, last1): first input range ("a" in the kernel)
+ InputIterator1 last1,
+ InputIterator2 first2, // [first2, last2): second input range ("b" in the kernel)
+ InputIterator2 last2,
+ OutputIterator1 result_a, // receives a_index for every work-item
+ OutputIterator2 result_b, // receives b_index for every work-item
+ Compare comp) // comparison used for every element test in the kernel
+ {
+ typedef typename std::iterator_traits<InputIterator1>::value_type value_type;
+
+ m_a_count = iterator_range_size(first1, last1); // element count of the first range
+ m_a_count_arg = add_arg<uint_>("a_count"); // argument slot, value supplied in exec()
+
+ m_b_count = iterator_range_size(first2, last2); // element count of the second range
+ m_b_count_arg = add_arg<uint_>("b_count"); // argument slot, value supplied in exec()
+
+ *this <<
+ "uint i = get_global_id(0);\n" <<
+ "uint target = (i+1)*" << tile_size << ";\n" << // combined offset at which this work-item splits the inputs
+ "uint start = max(convert_int(0),convert_int(target)-convert_int(b_count));\n" <<
+ "uint end = min(target,a_count);\n" <<
+ "uint a_index, b_index;\n" <<
+ "while(start<end)\n" << // binary search for a_index inside [start, end)
+ "{\n" <<
+ " a_index = (start + end)/2;\n" <<
+ " b_index = target - a_index - 1;\n" <<
+ " if(!(" << comp(first2[expr<uint_>("b_index")],
+ first1[expr<uint_>("a_index")]) << "))\n" <<
+ " start = a_index + 1;\n" <<
+ " else end = a_index;\n" <<
+ "}\n" <<
+ "a_index = start;\n" <<
+ "b_index = target - start;\n" << // a_index + b_index == target at this point
+ "if(b_index < b_count)\n" << // the adjustment below only runs when b[b_index] exists
+ "{\n" <<
+ " " << decl<const value_type>("x") << " = " <<
+ first2[expr<uint_>("b_index")] << ";\n" << // x: the element of b at the split
+ " uint a_start = 0, a_end = a_index, a_mid;\n" <<
+ " uint b_start = 0, b_end = b_index, b_mid;\n" <<
+ " while(a_start<a_end)\n" << // first position in a[0, a_index) where comp(element, x) is false
+ " {\n" <<
+ " a_mid = (a_start + a_end)/2;\n" <<
+ " if(" << comp(first1[expr<uint_>("a_mid")], expr<value_type>("x")) << ")\n" <<
+ " a_start = a_mid+1;\n" <<
+ " else a_end = a_mid;\n" <<
+ " }\n" <<
+ " while(b_start<b_end)\n" << // first position in b[0, b_index) where comp(element, x) is false
+ " {\n" <<
+ " b_mid = (b_start + b_end)/2;\n" <<
+ " if(" << comp(first2[expr<uint_>("b_mid")], expr<value_type>("x")) << ")\n" <<
+ " b_start = b_mid+1;\n" <<
+ " else b_end = b_mid;\n" <<
+ " }\n" <<
+ " uint a_run = a_index - a_start;\n" << // distance from the position found in a to the split
+ " uint b_run = b_index - b_start;\n" << // distance from the position found in b to the split
+ " uint x_count = a_run + b_run;\n" <<
+ " uint b_advance = max(x_count / 2, x_count - a_run);\n" <<
+ " b_end = min(b_count, b_start + b_advance + 1);\n" <<
+ " uint temp_start = b_index, temp_end = b_end, temp_mid;" << // NOTE(review): this literal has no trailing "\n", unlike its neighbours
+ " while(temp_start < temp_end)\n" << // last position in [b_index, b_end] where comp(x, element) is false
+ " {\n" <<
+ " temp_mid = (temp_start + temp_end + 1)/2;\n" <<
+ " if(" << comp(expr<value_type>("x"), first2[expr<uint_>("temp_mid")]) << ")\n" <<
+ " temp_end = temp_mid-1;\n" <<
+ " else temp_start = temp_mid;\n" <<
+ " }\n" <<
+ " b_run = temp_start - b_start + 1;\n" <<
+ " b_advance = min(b_advance, b_run);\n" <<
+ " uint a_advance = x_count - b_advance;\n" <<
+ " uint star = convert_uint((a_advance == b_advance + 1) " <<
+ "&& (b_advance < b_run));\n" << // star is 0 or 1 and is added to b_index below
+ " a_index = a_start + a_advance;\n" <<
+ " b_index = target - a_index + star;\n" <<
+ "}\n" <<
+ result_a[expr<uint_>("i")] << " = a_index;\n" <<
+ result_b[expr<uint_>("i")] << " = b_index;\n";
+
+ }
+
+ template<class InputIterator1, class InputIterator2,
+ class OutputIterator1, class OutputIterator2>
+ void set_range(InputIterator1 first1, // overload without a comparator
+ InputIterator1 last1,
+ InputIterator2 first2,
+ InputIterator2 last2,
+ OutputIterator1 result_a,
+ OutputIterator2 result_b)
+ {
+ typedef typename std::iterator_traits<InputIterator1>::value_type value_type;
+ ::boost::compute::less<value_type> less_than; // comparator used when the caller supplies none
+ set_range(first1, last1, first2, last2, result_a, result_b, less_than); // forward to the comparator overload
+ }
+
+ event exec(command_queue &queue)
+ {
+ if((m_a_count + m_b_count)/tile_size == 0) { // combined input smaller than one tile: nothing to launch
+ return event();
+ }
+
+ set_arg(m_a_count_arg, uint_(m_a_count)); // counts are passed to the kernel as uint_
+ set_arg(m_b_count_arg, uint_(m_b_count));
+
+ return exec_1d(queue, 0, (m_a_count + m_b_count)/tile_size); // one work-item per whole tile
+ }
+
+private:
+ size_t m_a_count; // size of the first range, assigned in set_range()
+ size_t m_a_count_arg; // index of the "a_count" kernel argument
+ size_t m_b_count; // size of the second range, assigned in set_range()
+ size_t m_b_count_arg; // index of the "b_count" kernel argument
+};
+
+} //end detail namespace
+} //end compute namespace
+} //end boost namespace
+
+#endif // BOOST_COMPUTE_ALGORITHM_DETAIL_BALANCED_PATH_HPP
diff --git a/boost/compute/algorithm/detail/binary_find.hpp b/boost/compute/algorithm/detail/binary_find.hpp
new file mode 100644
index 0000000000..27fa11fbaf
--- /dev/null
+++ b/boost/compute/algorithm/detail/binary_find.hpp
@@ -0,0 +1,133 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2014 Roshan <thisisroshansmail@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_BINARY_FIND_HPP
+#define BOOST_COMPUTE_ALGORITHM_DETAIL_BINARY_FIND_HPP
+
+#include <boost/compute/functional.hpp>
+#include <boost/compute/algorithm/find_if.hpp>
+#include <boost/compute/algorithm/transform.hpp>
+#include <boost/compute/command_queue.hpp>
+#include <boost/compute/detail/parameter_cache.hpp>
+
+namespace boost {
+namespace compute {
+namespace detail{
+
+///
+/// \brief Binary find kernel class
+///
+/// Subclass of meta_kernel to perform single step in binary find: each work-item tests the element at get_global_id(0) * block and, if the predicate holds, lowers the shared "index" value to that position with an atomic min.
+/// The argument slots for "index" and "block" are exposed as public members so the caller can set them on the compiled kernel.
+template<class InputIterator, class UnaryPredicate>
+class binary_find_kernel : public meta_kernel
+{
+public:
+ binary_find_kernel(InputIterator first, // start of the range the kernel samples
+ InputIterator last, // not referenced by the generated kernel source
+ UnaryPredicate predicate) // condition evaluated on every sampled element
+ : meta_kernel("binary_find")
+ {
+ typedef typename std::iterator_traits<InputIterator>::value_type value_type;
+
+ m_index_arg = add_arg<uint_ *>(memory_object::global_memory, "index"); // output: smallest matching position
+ m_block_arg = add_arg<uint_>("block"); // stride between two sampled elements
+
+ atomic_min<uint_> atomic_min_uint;
+
+ *this <<
+ "uint i = get_global_id(0) * block;\n" << // position sampled by this work-item
+ decl<value_type>("value") << "=" << first[var<uint_>("i")] << ";\n" <<
+ "if(" << predicate(var<value_type>("value")) << ") {\n" <<
+ atomic_min_uint(var<uint_ *>("index"), var<uint_>("i")) << ";\n" << // keep the smallest matching position
+ "}\n";
+ }
+
+ size_t m_index_arg; // argument index of "index"
+ size_t m_block_arg; // argument index of "block"
+};
+
+///
+/// \brief Binary find algorithm
+///
+/// Searches the partitioned range [first, last) by repeatedly sampling it on the device and narrowing the search window; once the window holds at most 128 elements it is handed to find_if().
+/// \return The iterator returned by find_if() for the final window. NOTE(review): the kernel keeps the smallest sampled position whose element satisfies \p predicate, so this looks like the first element for which the predicate is true rather than the "end of true values" - confirm against callers.
+///
+/// \param first Iterator pointing to start of range
+/// \param last Iterator pointing to end of range
+/// \param predicate Predicate according to which the range is partitioned
+/// \param queue Queue on which to execute
+///
+template<class InputIterator, class UnaryPredicate>
+inline InputIterator binary_find(InputIterator first,
+ InputIterator last,
+ UnaryPredicate predicate,
+ command_queue &queue = system::default_queue())
+{
+ const device &device = queue.get_device();
+
+ boost::shared_ptr<parameter_cache> parameters =
+ detail::parameter_cache::get_global_cache(device);
+
+ const std::string cache_key = "__boost_binary_find";
+
+ size_t find_if_limit = 128; // windows of this size or smaller go straight to find_if()
+ size_t threads = parameters->get(cache_key, "tpb", 128); // work-items per sampling pass; NOTE(review): (threads - 1) is used as a divisor below - confirm the cache never yields 1
+ size_t count = iterator_range_size(first, last);
+
+ InputIterator search_first = first;
+ InputIterator search_last = last;
+
+ scalar<uint_> index(queue.get_context()); // device-side cell that receives the smallest matching position
+
+ // construct and compile binary_find kernel (built once from the initial window and reused for every pass)
+ binary_find_kernel<InputIterator, UnaryPredicate>
+ binary_find_kernel(search_first, search_last, predicate);
+ ::boost::compute::kernel kernel = binary_find_kernel.compile(queue.get_context());
+
+ // set buffer for index
+ kernel.set_arg(binary_find_kernel.m_index_arg, index.get_buffer());
+
+ while(count > find_if_limit) {
+ index.write(static_cast<uint_>(count), queue); // sentinel: stays equal to count when no sample matches
+
+ // set block and run binary_find kernel (block is the stride between sampled positions)
+ uint_ block = static_cast<uint_>((count - 1)/(threads - 1));
+ kernel.set_arg(binary_find_kernel.m_block_arg, block);
+ queue.enqueue_1d_range_kernel(kernel, 0, threads, 0);
+
+ size_t i = index.read(queue); // smallest matching position, or count if none matched
+
+ if(i == count) { // no sample matched: keep only the tail behind the last sample and stop narrowing
+ search_first = search_last - ((count - 1)%(threads - 1));
+ break;
+ } else { // a sample matched: new window is the one stride ending at that sample
+ search_last = search_first + i;
+ search_first = search_last - ((count - 1)/(threads - 1));
+ }
+
+ // Make sure that first and last stay within the input range (both ends are clamped to [first, last])
+ search_last = (std::min)(search_last, last);
+ search_last = (std::max)(search_last, first);
+
+ search_first = (std::max)(search_first, first);
+ search_first = (std::min)(search_first, last);
+
+ count = iterator_range_size(search_first, search_last); // size of the narrowed window
+ }
+
+ return find_if(search_first, search_last, predicate, queue); // final search over the remaining window
+}
+
+} // end detail namespace
+} // end compute namespace
+} // end boost namespace
+
+#endif // BOOST_COMPUTE_ALGORITHM_DETAIL_BINARY_FIND_HPP
diff --git a/boost/compute/algorithm/detail/compact.hpp b/boost/compute/algorithm/detail/compact.hpp
new file mode 100644
index 0000000000..983352d543
--- /dev/null
+++ b/boost/compute/algorithm/detail/compact.hpp
@@ -0,0 +1,77 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2014 Roshan <thisisroshansmail@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_COMPACT_HPP
+#define BOOST_COMPUTE_ALGORITHM_DETAIL_COMPACT_HPP
+
+#include <iterator>
+
+#include <boost/compute/container/vector.hpp>
+#include <boost/compute/detail/iterator_range_size.hpp>
+#include <boost/compute/detail/meta_kernel.hpp>
+#include <boost/compute/system.hpp>
+
+namespace boost {
+namespace compute {
+namespace detail {
+
+///
+/// \brief Compact kernel class
+///
+/// Subclass of meta_kernel to compact the result of set kernels to
+/// get actual sets.
+///
+/// Work-item i copies start[i*tile_size + k] to result[j] for every j in
+/// [counts_begin[i], counts_begin[i+1]), with k counting up from zero.
+/// Call set_range() to generate the kernel source, then exec() to launch
+/// it; exec() is a no-op until set_range() has seen at least two counts.
+///
+class compact_kernel : public meta_kernel
+{
+public:
+    // number of input slots reserved per work-item
+    unsigned int tile_size;
+
+    // m_count starts at zero so that exec() is well defined even when
+    // set_range() has not been called yet
+    compact_kernel()
+        : meta_kernel("compact"),
+          tile_size(4),
+          m_count(0)
+    {
+    }
+
+    /// Generates the kernel source.
+    ///
+    /// \param start beginning of the tiled input values
+    /// \param counts_begin beginning of the output offsets; entries i and
+    ///        i+1 delimit the output written by work-item i
+    /// \param counts_end end of the output offsets
+    /// \param result beginning of the compacted output
+    template<class InputIterator1, class InputIterator2, class OutputIterator>
+    void set_range(InputIterator1 start,
+                   InputIterator2 counts_begin,
+                   InputIterator2 counts_end,
+                   OutputIterator result)
+    {
+        // N offsets describe N - 1 tiles; guard the subtraction so that an
+        // empty offsets range yields zero work-items instead of wrapping
+        // around to a huge unsigned value
+        const size_t offsets = iterator_range_size(counts_begin, counts_end);
+        m_count = (offsets > 0) ? offsets - 1 : 0;
+
+        *this <<
+            "uint i = get_global_id(0);\n" <<
+            "uint count = i*" << tile_size << ";\n" <<
+            "for(uint j = " << counts_begin[expr<uint_>("i")] << "; j<" <<
+                counts_begin[expr<uint_>("i+1")] << "; j++, count++)\n" <<
+            "{\n" <<
+                result[expr<uint_>("j")] << " = " << start[expr<uint_>("count")]
+                    << ";\n" <<
+            "}\n";
+    }
+
+    /// Launches one work-item per tile and returns the resulting event, or
+    /// a default-constructed event when there is nothing to compact.
+    event exec(command_queue &queue)
+    {
+        if(m_count == 0) {
+            return event();
+        }
+
+        return exec_1d(queue, 0, m_count);
+    }
+
+private:
+    // number of tiles (work-items) to process
+    size_t m_count;
+};
+
+} //end detail namespace
+} //end compute namespace
+} //end boost namespace
+
+#endif // BOOST_COMPUTE_ALGORITHM_DETAIL_COMPACT_HPP
diff --git a/boost/compute/algorithm/detail/copy_on_device.hpp b/boost/compute/algorithm/detail/copy_on_device.hpp
new file mode 100644
index 0000000000..0bcee27ed5
--- /dev/null
+++ b/boost/compute/algorithm/detail/copy_on_device.hpp
@@ -0,0 +1,190 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_COPY_ON_DEVICE_HPP
+#define BOOST_COMPUTE_ALGORITHM_DETAIL_COPY_ON_DEVICE_HPP
+
+#include <iterator>
+
+#include <boost/compute/command_queue.hpp>
+#include <boost/compute/async/future.hpp>
+#include <boost/compute/iterator/buffer_iterator.hpp>
+#include <boost/compute/iterator/discard_iterator.hpp>
+#include <boost/compute/memory/svm_ptr.hpp>
+#include <boost/compute/detail/iterator_range_size.hpp>
+#include <boost/compute/detail/meta_kernel.hpp>
+#include <boost/compute/detail/parameter_cache.hpp>
+#include <boost/compute/detail/work_size.hpp>
+
+namespace boost {
+namespace compute {
+namespace detail {
+
+// Returns the largest of 32, 16, 8, 4, 2 that evenly divides n (so 32 when
+// n is zero), or 1 when none of them does. The device is not consulted.
+inline size_t pick_copy_work_group_size(size_t n, const device &device)
+{
+    static_cast<void>(device);
+
+    for(size_t candidate = 32; candidate > 1; candidate /= 2){
+        if(n % candidate == 0){
+            return candidate;
+        }
+    }
+
+    return 1;
+}
+
+template<class InputIterator, class OutputIterator>
+class copy_kernel : public meta_kernel // meta_kernel that copies an input range to an output range element by element; use set_range() and then exec()
+{
+public:
+ copy_kernel(const device &device)
+ : meta_kernel("copy")
+ {
+ m_count = 0; // exec() is a no-op until set_range() stores a non-zero size
+
+ typedef typename std::iterator_traits<InputIterator>::value_type input_type;
+
+ boost::shared_ptr<parameter_cache> parameters =
+ detail::parameter_cache::get_global_cache(device);
+
+ std::string cache_key =
+ "__boost_copy_kernel_" + boost::lexical_cast<std::string>(sizeof(input_type)); // one cache entry per element size
+
+ m_vpt = parameters->get(cache_key, "vpt", 4); // loop iterations per work-item; 4 is presumably the fallback when the cache has no entry
+ m_tpb = parameters->get(cache_key, "tpb", 128); // work-group size passed to exec_1d(); 128 is presumably the fallback
+ }
+
+ void set_range(InputIterator first,
+ InputIterator last,
+ OutputIterator result)
+ {
+ m_count_arg = add_arg<uint_>("count"); // element count, supplied in exec()
+
+ *this <<
+ "uint index = get_local_id(0) + " <<
+ "(" << m_vpt * m_tpb << " * get_group_id(0));\n" << // each work-group starts m_vpt * m_tpb elements after the previous one
+ "for(uint i = 0; i < " << m_vpt << "; i++){\n" <<
+ " if(index < count){\n" << // bounds check against the element count
+ result[expr<uint_>("index")] << '=' <<
+ first[expr<uint_>("index")] << ";\n" <<
+ " index += " << m_tpb << ";\n" // the advance happens inside the bounds check
+ " }\n"
+ "}\n";
+
+ m_count = detail::iterator_range_size(first, last);
+ }
+
+ event exec(command_queue &queue)
+ {
+ if(m_count == 0){
+ // nothing to do
+ return event();
+ }
+
+ size_t global_work_size = calculate_work_size(m_count, m_vpt, m_tpb);
+
+ set_arg(m_count_arg, uint_(m_count)); // the count is passed to the kernel as uint_
+
+ return exec_1d(queue, 0, global_work_size, m_tpb);
+ }
+
+private:
+ size_t m_count; // number of elements to copy
+ size_t m_count_arg; // index of the "count" kernel argument
+ uint_ m_vpt; // value of the "vpt" tuning parameter
+ uint_ m_tpb; // value of the "tpb" tuning parameter
+};
+
+// Copies [first, last) to result with a copy_kernel enqueued on queue and
+// returns result advanced by the length of the input range. The event
+// returned by the kernel launch is discarded.
+template<class InputIterator, class OutputIterator>
+inline OutputIterator copy_on_device(InputIterator first,
+                                     InputIterator last,
+                                     OutputIterator result,
+                                     command_queue &queue)
+{
+    const device &target = queue.get_device();
+
+    copy_kernel<InputIterator, OutputIterator> copier(target);
+    copier.set_range(first, last, result);
+    copier.exec(queue);
+
+    OutputIterator end_of_output = result + std::distance(first, last);
+    return end_of_output;
+}
+
+// Overload for discard_iterator output: nothing is enqueued, only the
+// output iterator is advanced by the length of the input range.
+template<class InputIterator>
+inline discard_iterator copy_on_device(InputIterator first,
+                                       InputIterator last,
+                                       discard_iterator result,
+                                       command_queue &queue)
+{
+    // the queue is intentionally unused
+    static_cast<void>(queue);
+
+    discard_iterator end_of_output = result + std::distance(first, last);
+    return end_of_output;
+}
+
+// Asynchronous form of copy_on_device(): enqueues a copy_kernel and
+// returns a future pairing the advanced output iterator with the event of
+// the kernel launch.
+template<class InputIterator, class OutputIterator>
+inline future<OutputIterator> copy_on_device_async(InputIterator first,
+                                                   InputIterator last,
+                                                   OutputIterator result,
+                                                   command_queue &queue)
+{
+    const device &target = queue.get_device();
+
+    copy_kernel<InputIterator, OutputIterator> copier(target);
+    copier.set_range(first, last, result);
+
+    event copy_done = copier.exec(queue);
+
+    return make_future(result + std::distance(first, last), copy_done);
+}
+
+#ifdef CL_VERSION_2_0
+// copy_on_device() overload for svm_ptr: the range is transferred with a
+// single enqueue_svm_memcpy() call instead of a kernel
+template<class T>
+inline svm_ptr<T> copy_on_device(svm_ptr<T> first,
+                                 svm_ptr<T> last,
+                                 svm_ptr<T> result,
+                                 command_queue &queue)
+{
+    const size_t length = iterator_range_size(first, last);
+
+    if(length != 0){
+        // size is given in bytes
+        queue.enqueue_svm_memcpy(
+            result.get(), first.get(), length * sizeof(T)
+        );
+
+        return result + length;
+    }
+
+    // empty range: nothing to enqueue
+    return result;
+}
+
+// copy_on_device_async() overload for svm_ptr: the range is transferred
+// with enqueue_svm_memcpy_async() and the returned future carries the
+// event of that command.
+//
+// For an empty range nothing is enqueued and the future holds the
+// unchanged result pointer together with a default-constructed event.
+template<class T>
+inline future<svm_ptr<T> > copy_on_device_async(svm_ptr<T> first,
+                                                svm_ptr<T> last,
+                                                svm_ptr<T> result,
+                                                command_queue &queue)
+{
+    size_t count = iterator_range_size(first, last);
+    if(count == 0){
+        // the return type is a future, so the pointer has to be wrapped
+        // explicitly (same form as the empty-range path of
+        // dispatch_copy_async())
+        return make_future(result, event());
+    }
+
+    // size is given in bytes
+    event event_ = queue.enqueue_svm_memcpy_async(
+        result.get(), first.get(), count * sizeof(T)
+    );
+
+    return make_future(result + count, event_);
+}
+#endif // CL_VERSION_2_0
+
+} // end detail namespace
+} // end compute namespace
+} // end boost namespace
+
+#endif // BOOST_COMPUTE_ALGORITHM_DETAIL_COPY_ON_DEVICE_HPP
diff --git a/boost/compute/algorithm/detail/copy_to_device.hpp b/boost/compute/algorithm/detail/copy_to_device.hpp
new file mode 100644
index 0000000000..90545fb4ed
--- /dev/null
+++ b/boost/compute/algorithm/detail/copy_to_device.hpp
@@ -0,0 +1,127 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_COPY_TO_DEVICE_HPP
+#define BOOST_COMPUTE_ALGORITHM_DETAIL_COPY_TO_DEVICE_HPP
+
+#include <iterator>
+
+#include <boost/utility/addressof.hpp>
+
+#include <boost/compute/command_queue.hpp>
+#include <boost/compute/async/future.hpp>
+#include <boost/compute/iterator/buffer_iterator.hpp>
+#include <boost/compute/memory/svm_ptr.hpp>
+
+namespace boost {
+namespace compute {
+namespace detail {
+
+// Writes the host range [first, last) into the buffer behind result with
+// one enqueue_write_buffer() call whose source is the address of *first,
+// and returns result advanced by the number of elements written.
+template<class HostIterator, class DeviceIterator>
+inline DeviceIterator copy_to_device(HostIterator first,
+                                     HostIterator last,
+                                     DeviceIterator result,
+                                     command_queue &queue)
+{
+    typedef std::iterator_traits<DeviceIterator> device_traits;
+    typedef typename device_traits::value_type element_type;
+    typedef typename device_traits::difference_type distance_type;
+
+    const size_t length = iterator_range_size(first, last);
+    if(length != 0){
+        const size_t start = result.get_index();
+
+        // offset and size are given in bytes
+        queue.enqueue_write_buffer(result.get_buffer(),
+                                   start * sizeof(element_type),
+                                   length * sizeof(element_type),
+                                   ::boost::addressof(*first));
+
+        return result + static_cast<distance_type>(length);
+    }
+
+    // empty range: nothing to write
+    return result;
+}
+
+// Asynchronous form of copy_to_device(): writes the host range
+// [first, last) into the buffer behind result with
+// enqueue_write_buffer_async() and returns a future pairing the advanced
+// device iterator with the event of that command.
+//
+// For an empty range nothing is enqueued and the future holds the
+// unchanged result iterator together with a default-constructed event.
+template<class HostIterator, class DeviceIterator>
+inline future<DeviceIterator> copy_to_device_async(HostIterator first,
+                                                   HostIterator last,
+                                                   DeviceIterator result,
+                                                   command_queue &queue)
+{
+    typedef typename
+        std::iterator_traits<DeviceIterator>::value_type
+        value_type;
+    typedef typename
+        std::iterator_traits<DeviceIterator>::difference_type
+        difference_type;
+
+    size_t count = iterator_range_size(first, last);
+    if(count == 0){
+        // keep the output iterator in the future, as the synchronous
+        // overload and dispatch_copy_async() do for empty ranges, instead
+        // of returning a default-constructed future that drops it
+        return make_future(result, event());
+    }
+
+    size_t offset = result.get_index();
+
+    // offset and size are given in bytes
+    event event_ =
+        queue.enqueue_write_buffer_async(result.get_buffer(),
+                                         offset * sizeof(value_type),
+                                         count * sizeof(value_type),
+                                         ::boost::addressof(*first));
+
+    return make_future(result + static_cast<difference_type>(count), event_);
+}
+
+#ifdef CL_VERSION_2_0
+// copy_to_device() overload for svm_ptr: the host range is transferred
+// with a single enqueue_svm_memcpy() call whose source is the address of
+// *first
+template<class HostIterator, class T>
+inline svm_ptr<T> copy_to_device(HostIterator first,
+                                 HostIterator last,
+                                 svm_ptr<T> result,
+                                 command_queue &queue)
+{
+    const size_t length = iterator_range_size(first, last);
+
+    if(length != 0){
+        // size is given in bytes
+        queue.enqueue_svm_memcpy(
+            result.get(), ::boost::addressof(*first), length * sizeof(T)
+        );
+
+        return result + length;
+    }
+
+    // empty range: nothing to enqueue
+    return result;
+}
+
+// copy_to_device_async() overload for svm_ptr: the host range is
+// transferred with enqueue_svm_memcpy_async() and the returned future
+// carries the event of that command.
+//
+// For an empty range nothing is enqueued and the future holds the
+// unchanged result pointer together with a default-constructed event.
+template<class HostIterator, class T>
+inline future<svm_ptr<T> > copy_to_device_async(HostIterator first,
+                                                HostIterator last,
+                                                svm_ptr<T> result,
+                                                command_queue &queue)
+{
+    size_t count = iterator_range_size(first, last);
+    if(count == 0){
+        // the return type is a future, so the pointer has to be wrapped
+        // explicitly (same form as the empty-range path of
+        // dispatch_copy_async())
+        return make_future(result, event());
+    }
+
+    // size is given in bytes
+    event event_ = queue.enqueue_svm_memcpy_async(
+        result.get(), ::boost::addressof(*first), count * sizeof(T)
+    );
+
+    return make_future(result + count, event_);
+}
+#endif // CL_VERSION_2_0
+
+} // end detail namespace
+} // end compute namespace
+} // end boost namespace
+
+#endif // BOOST_COMPUTE_ALGORITHM_DETAIL_COPY_TO_DEVICE_HPP
diff --git a/boost/compute/algorithm/detail/copy_to_host.hpp b/boost/compute/algorithm/detail/copy_to_host.hpp
new file mode 100644
index 0000000000..b889e0c871
--- /dev/null
+++ b/boost/compute/algorithm/detail/copy_to_host.hpp
@@ -0,0 +1,137 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_COPY_TO_HOST_HPP
+#define BOOST_COMPUTE_ALGORITHM_DETAIL_COPY_TO_HOST_HPP
+
+#include <iterator>
+
+#include <boost/utility/addressof.hpp>
+
+#include <boost/compute/command_queue.hpp>
+#include <boost/compute/async/future.hpp>
+#include <boost/compute/iterator/buffer_iterator.hpp>
+#include <boost/compute/memory/svm_ptr.hpp>
+#include <boost/compute/detail/iterator_plus_distance.hpp>
+
+namespace boost {
+namespace compute {
+namespace detail {
+
+// Reads the device range [first, last) into host memory beginning at result
+// via enqueue_read_buffer() and returns the host position one past the last
+// element written. An empty range returns result without enqueueing anything.
+template<class DeviceIterator, class HostIterator>
+inline HostIterator copy_to_host(DeviceIterator first,
+                                 DeviceIterator last,
+                                 HostIterator result,
+                                 command_queue &queue)
+{
+    typedef typename
+        std::iterator_traits<DeviceIterator>::value_type
+        element_type;
+
+    size_t element_count = iterator_range_size(first, last);
+
+    if(element_count != 0){
+        // position and extent of the range inside the source buffer, in bytes
+        const size_t byte_offset = first.get_index() * sizeof(element_type);
+        const size_t byte_count = element_count * sizeof(element_type);
+
+        queue.enqueue_read_buffer(first.get_buffer(),
+                                  byte_offset,
+                                  byte_count,
+                                  ::boost::addressof(*result));
+
+        return iterator_plus_distance(result, element_count);
+    }
+
+    return result;
+}
+
+// copy_to_host() overload for std::vector<bool> destinations.
+//
+// The device range is first read into a temporary std::vector<uint8_t> and
+// then copied element by element into the std::vector<bool>.
+//
+// NOTE(review): the staging vector holds one uint8_t per device element while
+// the generic copy_to_host() transfers sizeof(value_type) bytes per element;
+// presumably the device value_type is one byte wide here -- confirm against
+// the callers.
+template<class DeviceIterator>
+inline std::vector<bool>::iterator
+copy_to_host(DeviceIterator first,
+             DeviceIterator last,
+             std::vector<bool>::iterator result,
+             command_queue &queue)
+{
+    std::vector<uint8_t> staging(std::distance(first, last));
+
+    // device -> staging bytes
+    copy_to_host(first, last, staging.begin(), queue);
+
+    // staging bytes -> packed bools
+    return std::copy(staging.begin(), staging.end(), result);
+}
+
+// Asynchronous variant of copy_to_host().
+//
+// Enqueues a read of the device range [first, last) into host memory at
+// result and returns a future holding the host position one past the last
+// element together with the read's event. An empty range yields a
+// default-constructed future and enqueues nothing.
+template<class DeviceIterator, class HostIterator>
+inline future<HostIterator> copy_to_host_async(DeviceIterator first,
+                                               DeviceIterator last,
+                                               HostIterator result,
+                                               command_queue &queue)
+{
+    typedef typename
+        std::iterator_traits<DeviceIterator>::value_type
+        element_type;
+
+    size_t element_count = iterator_range_size(first, last);
+    if(element_count == 0){
+        return future<HostIterator>();
+    }
+
+    // position and extent of the range inside the source buffer, in bytes
+    const size_t byte_offset = first.get_index() * sizeof(element_type);
+    const size_t byte_count = element_count * sizeof(element_type);
+
+    event read_event =
+        queue.enqueue_read_buffer_async(first.get_buffer(),
+                                        byte_offset,
+                                        byte_count,
+                                        ::boost::addressof(*result));
+
+    return make_future(iterator_plus_distance(result, element_count), read_event);
+}
+
+#ifdef CL_VERSION_2_0
+// copy_to_host() overload for svm_ptr sources.
+//
+// Copies the SVM range [first, last) into host memory beginning at result
+// using enqueue_svm_memcpy() and returns the host position one past the last
+// element written. Nothing is enqueued for an empty range.
+template<class T, class HostIterator>
+inline HostIterator copy_to_host(svm_ptr<T> first,
+                                 svm_ptr<T> last,
+                                 HostIterator result,
+                                 command_queue &queue)
+{
+    size_t count = iterator_range_size(first, last);
+    if(count == 0){
+        return result;
+    }
+
+    queue.enqueue_svm_memcpy(
+        ::boost::addressof(*result), first.get(), count * sizeof(T)
+    );
+
+    // advance with iterator_plus_distance() like the other copy_to_host()
+    // overloads in this file instead of adding a size_t to the iterator
+    return iterator_plus_distance(result, count);
+}
+
+// Asynchronous copy_to_host() overload for svm_ptr sources.
+//
+// Enqueues an SVM memcpy of the range [first, last) into host memory at
+// result and returns a future holding the host position one past the last
+// element together with the event of the enqueued copy.
+//
+// For an empty range nothing is enqueued and a default-constructed future is
+// returned, matching the buffer-iterator overload of copy_to_host_async().
+template<class T, class HostIterator>
+inline future<HostIterator> copy_to_host_async(svm_ptr<T> first,
+                                               svm_ptr<T> last,
+                                               HostIterator result,
+                                               command_queue &queue)
+{
+    size_t count = iterator_range_size(first, last);
+    if(count == 0){
+        // return the declared future type (the bare HostIterator returned
+        // here before is not a future<HostIterator>)
+        return future<HostIterator>();
+    }
+
+    event event_ = queue.enqueue_svm_memcpy_async(
+        ::boost::addressof(*result), first.get(), count * sizeof(T)
+    );
+
+    return make_future(iterator_plus_distance(result, count), event_);
+}
+#endif // CL_VERSION_2_0
+
+} // end detail namespace
+} // end compute namespace
+} // end boost namespace
+
+#endif // BOOST_COMPUTE_ALGORITHM_DETAIL_COPY_TO_HOST_HPP
diff --git a/boost/compute/algorithm/detail/count_if_with_ballot.hpp b/boost/compute/algorithm/detail/count_if_with_ballot.hpp
new file mode 100644
index 0000000000..584ef37ab9
--- /dev/null
+++ b/boost/compute/algorithm/detail/count_if_with_ballot.hpp
@@ -0,0 +1,78 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_COUNT_IF_WITH_BALLOT_HPP
+#define BOOST_COMPUTE_ALGORITHM_DETAIL_COUNT_IF_WITH_BALLOT_HPP
+
+#include <boost/compute/context.hpp>
+#include <boost/compute/command_queue.hpp>
+#include <boost/compute/container/vector.hpp>
+#include <boost/compute/algorithm/reduce.hpp>
+#include <boost/compute/functional/detail/nvidia_ballot.hpp>
+#include <boost/compute/functional/detail/nvidia_popcount.hpp>
+#include <boost/compute/detail/meta_kernel.hpp>
+
+namespace boost {
+namespace compute {
+namespace detail {
+
+// Counts the elements of [first, last) for which predicate holds.
+//
+// The input is processed in blocks of 32 work-items. Every work-item
+// evaluates the predicate (work-items past the end of the input contribute
+// false), the per-block votes are combined with the nvidia_ballot helper and
+// the first work-item of each block stores the popcount of the ballot in
+// counts[group_id]. The per-block counts are then summed with reduce().
+//
+// Returns 0 for an empty range without launching a kernel.
+template<class InputIterator, class Predicate>
+inline size_t count_if_with_ballot(InputIterator first,
+                                   InputIterator last,
+                                   Predicate predicate,
+                                   command_queue &queue)
+{
+    size_t count = iterator_range_size(first, last);
+
+    // an empty range would otherwise launch a kernel with a global size of
+    // zero and reduce an empty vector into an uninitialized result
+    if(count == 0){
+        return 0;
+    }
+
+    // number of 32-wide blocks needed to cover the input (rounded up)
+    size_t block_size = 32;
+    size_t block_count = count / block_size;
+    if(block_count * block_size != count){
+        block_count++;
+    }
+
+    const ::boost::compute::context &context = queue.get_context();
+
+    // one partial count per block
+    ::boost::compute::vector<uint_> counts(block_count, context);
+
+    ::boost::compute::detail::nvidia_popcount<uint_> popc;
+    ::boost::compute::detail::nvidia_ballot<uint_> ballot;
+
+    meta_kernel k("count_if_with_ballot");
+    k <<
+        "const uint gid = get_global_id(0);\n" <<
+
+        "bool value = false;\n" <<
+        "if(gid < count)\n" <<
+        " value = " << predicate(first[k.var<const uint_>("gid")]) << ";\n" <<
+
+        "uint bits = " << ballot(k.var<const uint_>("value")) << ";\n" <<
+
+        "if(get_local_id(0) == 0)\n" <<
+        counts.begin()[k.var<uint_>("get_group_id(0)") ]
+            << " = " << popc(k.var<uint_>("bits")) << ";\n";
+
+    k.add_set_arg<const uint_>("count", count);
+
+    // global size is padded up to a whole number of blocks
+    k.exec_1d(queue, 0, block_size * block_count, block_size);
+
+    // sum the per-block counts
+    uint_ result = 0;
+    ::boost::compute::reduce(
+        counts.begin(),
+        counts.end(),
+        &result,
+        queue
+    );
+    return result;
+}
+
+} // end detail namespace
+} // end compute namespace
+} // end boost namespace
+
+#endif // BOOST_COMPUTE_ALGORITHM_DETAIL_COUNT_IF_WITH_BALLOT_HPP
diff --git a/boost/compute/algorithm/detail/count_if_with_reduce.hpp b/boost/compute/algorithm/detail/count_if_with_reduce.hpp
new file mode 100644
index 0000000000..f9449f4a41
--- /dev/null
+++ b/boost/compute/algorithm/detail/count_if_with_reduce.hpp
@@ -0,0 +1,87 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_COUNT_IF_WITH_REDUCE_HPP
+#define BOOST_COMPUTE_ALGORITHM_DETAIL_COUNT_IF_WITH_REDUCE_HPP
+
+#include <boost/compute/algorithm/reduce.hpp>
+#include <boost/compute/iterator/transform_iterator.hpp>
+#include <boost/compute/types/fundamental.hpp>
+
+namespace boost {
+namespace compute {
+namespace detail {
+
+// Expression object that stores a predicate together with the argument it
+// was invoked with. Streaming it into a meta_kernel is handled by the
+// operator<< overload taking an invoked_countable_predicate.
+template<class Predicate, class Arg>
+struct invoked_countable_predicate
+{
+    Predicate predicate;
+    Arg arg;
+
+    invoked_countable_predicate(Predicate fn, Arg value)
+        : predicate(fn), arg(value)
+    {
+    }
+};
+
+// Emits the stored predicate invocation wrapped in a conditional expression
+// that evaluates to 1 or 0: "(<predicate(arg)> ? 1 : 0)".
+template<class Predicate, class Arg>
+inline meta_kernel& operator<<(meta_kernel &kernel,
+                               const invoked_countable_predicate<Predicate, Arg> &expr)
+{
+    kernel << "(";
+    kernel << expr.predicate(expr.arg);
+    kernel << " ? 1 : 0)";
+
+    return kernel;
+}
+
+// countable_predicate adapts Predicate for use with reduce(): invoking it
+// produces an invoked_countable_predicate expression instead of the
+// predicate's own result, and its result_type is ulong_ so that the
+// values can be summed.
+template<class Predicate>
+struct countable_predicate
+{
+    typedef ulong_ result_type;
+
+    // the wrapped predicate
+    Predicate m_predicate;
+
+    countable_predicate(Predicate predicate)
+        : m_predicate(predicate)
+    {
+    }
+
+    // pairs the wrapped predicate with arg; nothing is evaluated here
+    template<class Arg>
+    invoked_countable_predicate<Predicate, Arg> operator()(const Arg &arg) const
+    {
+        typedef invoked_countable_predicate<Predicate, Arg> invoked_type;
+
+        return invoked_type(m_predicate, arg);
+    }
+};
+
+// Counts the elements of [first, last) matching predicate by summing, with
+// reduce() and plus<ulong_>, the values produced by a transform iterator
+// that applies a countable_predicate wrapper around predicate.
+template<class InputIterator, class Predicate>
+inline size_t count_if_with_reduce(InputIterator first,
+                                   InputIterator last,
+                                   Predicate predicate,
+                                   command_queue &queue)
+{
+    countable_predicate<Predicate> as_count(predicate);
+
+    // accumulated on the host side as ulong_, narrowed to size_t on return
+    ulong_ total = 0;
+
+    ::boost::compute::reduce(
+        ::boost::compute::make_transform_iterator(first, as_count),
+        ::boost::compute::make_transform_iterator(last, as_count),
+        &total,
+        ::boost::compute::plus<ulong_>(),
+        queue
+    );
+
+    return static_cast<size_t>(total);
+}
+
+} // end detail namespace
+} // end compute namespace
+} // end boost namespace
+
+#endif // BOOST_COMPUTE_ALGORITHM_DETAIL_COUNT_IF_WITH_REDUCE_HPP
diff --git a/boost/compute/algorithm/detail/count_if_with_threads.hpp b/boost/compute/algorithm/detail/count_if_with_threads.hpp
new file mode 100644
index 0000000000..6f282982e0
--- /dev/null
+++ b/boost/compute/algorithm/detail/count_if_with_threads.hpp
@@ -0,0 +1,129 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_COUNT_IF_WITH_THREADS_HPP
+#define BOOST_COMPUTE_ALGORITHM_DETAIL_COUNT_IF_WITH_THREADS_HPP
+
+#include <numeric>
+
+#include <boost/compute/detail/meta_kernel.hpp>
+#include <boost/compute/container/vector.hpp>
+
+namespace boost {
+namespace compute {
+namespace detail {
+
+// Meta-kernel that counts the elements of a range matching a predicate with
+// one work-item per contiguous block of the input. Every work-item writes
+// its partial count to counts[gid]; exec() sums the partial counts on the
+// host.
+//
+// Usage: call set_args() once to generate the kernel source, then exec().
+template<class InputIterator, class Predicate>
+class count_if_with_threads_kernel : meta_kernel
+{
+public:
+    typedef typename
+        std::iterator_traits<InputIterator>::value_type
+        value_type;
+
+    count_if_with_threads_kernel()
+        : meta_kernel("count_if_with_threads"),
+          // give the members defined values until set_args() is called
+          m_size(0),
+          m_size_arg(0),
+          m_counts_arg(0)
+    {
+    }
+
+    // Records the input size, declares the "size" and "counts" kernel
+    // arguments and generates the kernel source for [first, last).
+    void set_args(InputIterator first,
+                  InputIterator last,
+                  Predicate predicate)
+
+    {
+        m_size = detail::iterator_range_size(first, last);
+
+        m_size_arg = add_arg<const ulong_>("size");
+        m_counts_arg = add_arg<ulong_ *>(memory_object::global_memory, "counts");
+
+        // NOTE(review): the kernel keeps its block bounds and counter in
+        // 32-bit uint variables although "size" is a ulong -- confirm that
+        // inputs beyond the uint range are not expected here.
+        *this <<
+            // thread parameters: the last work-item also takes the remainder
+            "const uint gid = get_global_id(0);\n" <<
+            "const uint block_size = size / get_global_size(0);\n" <<
+            "const uint start = block_size * gid;\n" <<
+            "uint end = 0;\n" <<
+            "if(gid == get_global_size(0) - 1)\n" <<
+            " end = size;\n" <<
+            "else\n" <<
+            " end = block_size * gid + block_size;\n" <<
+
+            // count values
+            "uint count = 0;\n" <<
+            "for(uint i = start; i < end; i++){\n" <<
+            decl<const value_type>("value") << "="
+                << first[expr<uint_>("i")] << ";\n" <<
+            if_(predicate(var<const value_type>("value"))) << "{\n" <<
+            "count++;\n" <<
+            "}\n" <<
+            "}\n" <<
+
+            // write count
+            "counts[gid] = count;\n";
+    }
+
+    // Runs the kernel on queue and returns the total number of matches.
+    size_t exec(command_queue &queue)
+    {
+        const device &device = queue.get_device();
+        const context &context = queue.get_context();
+
+        // start with one work-item per compute unit
+        size_t threads = device.compute_units();
+
+        // when that would leave fewer than minimum_block_size elements per
+        // work-item, use ceil(m_size / minimum_block_size) work-items
+        // instead, but never fewer than one
+        const size_t minimum_block_size = 2048;
+        if(m_size / threads < minimum_block_size){
+            threads = m_size / minimum_block_size;
+            if(m_size % minimum_block_size != 0){
+                threads++;
+            }
+            if(threads == 0){
+                threads = 1;
+            }
+        }
+
+        // storage for counts
+        ::boost::compute::vector<ulong_> counts(threads, context);
+
+        // exec kernel
+        set_arg(m_size_arg, static_cast<ulong_>(m_size));
+        set_arg(m_counts_arg, counts.get_buffer());
+        exec_1d(queue, 0, threads, 1);
+
+        // copy counts to the host
+        std::vector<ulong_> host_counts(threads);
+        ::boost::compute::copy(counts.begin(), counts.end(), host_counts.begin(), queue);
+
+        // return sum of counts
+        return std::accumulate(host_counts.begin(), host_counts.end(), size_t(0));
+    }
+
+private:
+    size_t m_size;       // number of input elements, set by set_args()
+    size_t m_size_arg;   // index of the "size" kernel argument
+    size_t m_counts_arg; // index of the "counts" kernel argument
+};
+
+// Counts the values in [first, last) that match the predicate using one
+// thread per block. This is optimized for cpu-type devices with a small
+// number of compute units.
+template<class InputIterator, class Predicate>
+inline size_t count_if_with_threads(InputIterator first,
+                                    InputIterator last,
+                                    Predicate predicate,
+                                    command_queue &queue)
+{
+    typedef count_if_with_threads_kernel<InputIterator, Predicate> kernel_type;
+
+    // generate the kernel for this range, then run it and sum the counts
+    kernel_type counter;
+    counter.set_args(first, last, predicate);
+
+    return counter.exec(queue);
+}
+
+} // end detail namespace
+} // end compute namespace
+} // end boost namespace
+
+#endif // BOOST_COMPUTE_ALGORITHM_DETAIL_COUNT_IF_WITH_THREADS_HPP
diff --git a/boost/compute/algorithm/detail/find_extrema.hpp b/boost/compute/algorithm/detail/find_extrema.hpp
new file mode 100644
index 0000000000..6e756c3904
--- /dev/null
+++ b/boost/compute/algorithm/detail/find_extrema.hpp
@@ -0,0 +1,64 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_FIND_EXTREMA_HPP
+#define BOOST_COMPUTE_ALGORITHM_DETAIL_FIND_EXTREMA_HPP
+
+#include <boost/compute/detail/iterator_range_size.hpp>
+#include <boost/compute/algorithm/detail/find_extrema_with_reduce.hpp>
+#include <boost/compute/algorithm/detail/find_extrema_with_atomics.hpp>
+#include <boost/compute/algorithm/detail/serial_find_extrema.hpp>
+
+namespace boost {
+namespace compute {
+namespace detail {
+
+// Returns an iterator to the extremum of [first, last) under compare,
+// choosing the implementation from the input size and the queue's device.
+// find_minimum selects between searching for the minimum (true) and the
+// maximum (false).
+//
+// Ranges with fewer than two elements return first.
+template<class InputIterator, class Compare>
+inline InputIterator find_extrema(InputIterator first,
+                                  InputIterator last,
+                                  Compare compare,
+                                  const bool find_minimum,
+                                  command_queue &queue)
+{
+    size_t count = iterator_range_size(first, last);
+
+    // handle trivial cases
+    if(count == 0 || count == 1){
+        return first;
+    }
+
+    const device &device = queue.get_device();
+
+    // use serial method for small inputs
+    // and when device is a CPU
+    if(count < 512 || (device.type() & device::cpu)){
+        return serial_find_extrema(first, last, compare, find_minimum, queue);
+    }
+
+    // find_extrema_with_reduce() is used only if requirements are met
+    if(find_extrema_with_reduce_requirements_met(first, last, queue))
+    {
+        return find_extrema_with_reduce(first, last, compare, find_minimum, queue);
+    }
+
+    // use serial method for OpenCL version 1.0 due to
+    // problems with atomic_cmpxchg(); exactly one of the two returns is
+    // compiled so neither is left unreachable
+    #ifndef CL_VERSION_1_1
+    return serial_find_extrema(first, last, compare, find_minimum, queue);
+    #else
+    return find_extrema_with_atomics(first, last, compare, find_minimum, queue);
+    #endif
+}
+
+} // end detail namespace
+} // end compute namespace
+} // end boost namespace
+
+#endif // BOOST_COMPUTE_ALGORITHM_DETAIL_FIND_EXTREMA_HPP
diff --git a/boost/compute/algorithm/detail/find_extrema_with_atomics.hpp b/boost/compute/algorithm/detail/find_extrema_with_atomics.hpp
new file mode 100644
index 0000000000..406d1becb7
--- /dev/null
+++ b/boost/compute/algorithm/detail/find_extrema_with_atomics.hpp
@@ -0,0 +1,108 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_FIND_EXTREMA_WITH_ATOMICS_HPP
+#define BOOST_COMPUTE_ALGORITHM_DETAIL_FIND_EXTREMA_WITH_ATOMICS_HPP
+
+#include <boost/compute/types.hpp>
+#include <boost/compute/command_queue.hpp>
+#include <boost/compute/container/detail/scalar.hpp>
+#include <boost/compute/functional/atomic.hpp>
+#include <boost/compute/detail/meta_kernel.hpp>
+#include <boost/compute/detail/iterator_range_size.hpp>
+
+namespace boost {
+namespace compute {
+namespace detail {
+
+// Finds the extremum of [first, last) with one work-item per element.
+//
+// A single uint index in global memory holds the position of the current
+// best element (initialized to 0). Each work-item compares its own element
+// against the current best and, while its element wins (or ties with a lower
+// index), tries to install its index with atomic_cmpxchg, re-reading the
+// current best whenever the exchange fails.
+//
+// The maximum variant of the loop condition is selected at kernel compile
+// time with -DBOOST_COMPUTE_FIND_MAXIMUM when find_minimum is false.
+template<class InputIterator, class Compare>
+inline InputIterator find_extrema_with_atomics(InputIterator first,
+                                               InputIterator last,
+                                               Compare compare,
+                                               const bool find_minimum,
+                                               command_queue &queue)
+{
+    typedef typename std::iterator_traits<InputIterator>::value_type value_type;
+    typedef typename std::iterator_traits<InputIterator>::difference_type difference_type;
+
+    const context &ctx = queue.get_context();
+
+    meta_kernel k("find_extrema");
+    atomic_cmpxchg<uint_> cmpxchg_uint;
+
+    // prologue: load the current best ("old") and this work-item's ("new")
+    // element
+    k <<
+        "const uint gid = get_global_id(0);\n" <<
+        "uint old_index = *index;\n" <<
+
+        k.decl<value_type>("old") <<
+        " = " << first[k.var<uint_>("old_index")] << ";\n" <<
+        k.decl<value_type>("new") <<
+        " = " << first[k.var<uint_>("gid")] << ";\n" <<
+
+        k.decl<bool>("compare_result") << ";\n";
+
+    // loop header, maximum variant
+    k <<
+        "#ifdef BOOST_COMPUTE_FIND_MAXIMUM\n" <<
+        "while(" <<
+        "(compare_result = " << compare(k.var<value_type>("old"),
+                                        k.var<value_type>("new")) << ")" <<
+        " || (!(compare_result" <<
+        " || " << compare(k.var<value_type>("new"),
+                          k.var<value_type>("old")) << ") "
+        "&& gid < old_index)){\n";
+
+    // loop header, minimum variant; with less (<) as comparison function
+    // the condition reads:
+    //   while(new_value < old_value
+    //         OR (new_value == old_value AND new_index < old_index))
+    k <<
+        "#else\n" <<
+        "while(" <<
+        "(compare_result = " << compare(k.var<value_type>("new"),
+                                        k.var<value_type>("old")) << ")" <<
+        " || (!(compare_result" <<
+        " || " << compare(k.var<value_type>("old"),
+                          k.var<value_type>("new")) << ") "
+        "&& gid < old_index)){\n" <<
+        "#endif\n";
+
+    // loop body: try to install gid, otherwise reload the current best
+    k <<
+        " if(" << cmpxchg_uint(k.var<uint_ *>("index"),
+                               k.var<uint_>("old_index"),
+                               k.var<uint_>("gid")) << " == old_index)\n" <<
+        " break;\n" <<
+        " else\n" <<
+        " old_index = *index;\n" <<
+        "old = " << first[k.var<uint_>("old_index")] << ";\n" <<
+        "}\n";
+
+    const size_t index_arg_index =
+        k.add_arg<uint_ *>(memory_object::global_memory, "index");
+
+    // the maximum search is a compile-time variant of the same source
+    std::string options(find_minimum ? "" : "-DBOOST_COMPUTE_FIND_MAXIMUM");
+    kernel compiled = k.compile(ctx, options);
+
+    // setup index buffer and start from element 0
+    scalar<uint_> index(ctx);
+    compiled.set_arg(index_arg_index, index.get_buffer());
+    index.write(0, queue);
+
+    // run kernel with one work-item per element
+    const size_t element_count = iterator_range_size(first, last);
+    queue.enqueue_1d_range_kernel(compiled, 0, element_count, 0);
+
+    // read index and return iterator
+    return first + static_cast<difference_type>(index.read(queue));
+}
+
+} // end detail namespace
+} // end compute namespace
+} // end boost namespace
+
+#endif // BOOST_COMPUTE_ALGORITHM_DETAIL_FIND_EXTREMA_WITH_ATOMICS_HPP
diff --git a/boost/compute/algorithm/detail/find_extrema_with_reduce.hpp b/boost/compute/algorithm/detail/find_extrema_with_reduce.hpp
new file mode 100644
index 0000000000..1fbb7dee19
--- /dev/null
+++ b/boost/compute/algorithm/detail/find_extrema_with_reduce.hpp
@@ -0,0 +1,443 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2015 Jakub Szuppe <j.szuppe@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_FIND_EXTREMA_WITH_REDUCE_HPP
+#define BOOST_COMPUTE_ALGORITHM_DETAIL_FIND_EXTREMA_WITH_REDUCE_HPP
+
+#include <algorithm>
+
+#include <boost/compute/types.hpp>
+#include <boost/compute/command_queue.hpp>
+#include <boost/compute/algorithm/copy.hpp>
+#include <boost/compute/allocator/pinned_allocator.hpp>
+#include <boost/compute/container/vector.hpp>
+#include <boost/compute/detail/meta_kernel.hpp>
+#include <boost/compute/detail/iterator_range_size.hpp>
+#include <boost/compute/detail/parameter_cache.hpp>
+#include <boost/compute/memory/local_buffer.hpp>
+#include <boost/compute/type_traits/type_name.hpp>
+#include <boost/compute/utility/program_cache.hpp>
+
+namespace boost {
+namespace compute {
+namespace detail {
+
+// Returns true when the queue's device is suitable for
+// find_extrema_with_reduce(): it must report dedicated local memory
+// (CL_LOCAL) and four times the local memory needed by one work-group (one
+// uint_ index plus one input value per work-item) must fit into the device's
+// local memory size.
+//
+// The work-group size is looked up under the same parameter-cache key that
+// find_extrema_with_reduce() uses, so the check is made for the size the
+// algorithm will actually run with.
+//
+// first and last are only used to deduce the input value type.
+template<class InputIterator>
+bool find_extrema_with_reduce_requirements_met(InputIterator first,
+                                               InputIterator last,
+                                               command_queue &queue)
+{
+    typedef typename std::iterator_traits<InputIterator>::value_type input_type;
+
+    const device &device = queue.get_device();
+
+    // device must have dedicated local memory storage
+    // otherwise reduction would be highly inefficient
+    if(device.get_info<CL_DEVICE_LOCAL_MEM_TYPE>() != CL_LOCAL)
+    {
+        return false;
+    }
+
+    const size_t max_work_group_size = device.get_info<CL_DEVICE_MAX_WORK_GROUP_SIZE>();
+    // local memory size in bytes (per compute unit)
+    const size_t local_mem_size = device.get_info<CL_DEVICE_LOCAL_MEM_SIZE>();
+
+    // same key as in find_extrema_with_reduce() ("..._with_reduce_")
+    std::string cache_key = std::string("__boost_find_extrema_with_reduce_")
+        + type_name<input_type>();
+    // load parameters
+    boost::shared_ptr<parameter_cache> parameters =
+        detail::parameter_cache::get_global_cache(device);
+
+    // Get preferred work group size
+    size_t work_group_size = parameters->get(cache_key, "wgsize", 256);
+
+    work_group_size = (std::min)(max_work_group_size, work_group_size);
+
+    // local memory size needed to perform parallel reduction
+    size_t required_local_mem_size = 0;
+    // indices size
+    required_local_mem_size += sizeof(uint_) * work_group_size;
+    // values size
+    required_local_mem_size += sizeof(input_type) * work_group_size;
+
+    // at least 4 work groups per compute unit otherwise reduction
+    // would be highly inefficient
+    return ((required_local_mem_size * 4) <= local_mem_size);
+}
+
+/// \internal_
+/// Algorithm finds the first extremum in given range, i.e., with the lowest
+/// index.
+///
+/// Each work-item first scans the input with a stride of the global size and
+/// keeps its best element and that element's index. The work-items of a
+/// work-group then combine their results in local memory, and the first
+/// work-item of every group writes the group's value to \p result and its
+/// index to \p result_idx (one entry per work-group).
+///
+/// If \p use_input_idx is false, it's assumed that input data is ordered by
+/// increasing index and \p input_idx is not used in the algorithm.
+///
+/// If \p find_minimum is false, the kernel is compiled with
+/// BOOST_COMPUTE_FIND_MAXIMUM defined, which swaps the argument order of
+/// \p compare.
+template<class InputIterator, class ResultIterator, class Compare>
+inline void find_extrema_with_reduce(InputIterator input,
+                                     vector<uint_>::iterator input_idx,
+                                     size_t count,
+                                     ResultIterator result,
+                                     vector<uint_>::iterator result_idx,
+                                     size_t work_groups_no,
+                                     size_t work_group_size,
+                                     Compare compare,
+                                     const bool find_minimum,
+                                     const bool use_input_idx,
+                                     command_queue &queue)
+{
+    typedef typename std::iterator_traits<InputIterator>::value_type input_type;
+
+    const context &context = queue.get_context();
+
+    meta_kernel k("find_extrema_reduce");
+    size_t count_arg = k.add_arg<uint_>("count");
+    size_t block_arg = k.add_arg<input_type *>(memory_object::local_memory, "block");
+    size_t block_idx_arg = k.add_arg<uint_ *>(memory_object::local_memory, "block_idx");
+
+    k <<
+        // Work item global id
+        k.decl<const uint_>("gid") << " = get_global_id(0);\n" <<
+
+        // Index of element that will be read from input buffer
+        k.decl<uint_>("idx") << " = gid;\n" <<
+
+        k.decl<input_type>("acc") << ";\n" <<
+        k.decl<uint_>("acc_idx") << ";\n" <<
+        "if(gid < count) {\n" <<
+        // Real index of currently best element
+        "#ifdef BOOST_COMPUTE_USE_INPUT_IDX\n" <<
+        k.var<uint_>("acc_idx") << " = " << input_idx[k.var<uint_>("idx")] << ";\n" <<
+        "#else\n" <<
+        k.var<uint_>("acc_idx") << " = idx;\n" <<
+        "#endif\n" <<
+
+        // Init accumulator with first[get_global_id(0)]
+        "acc = " << input[k.var<uint_>("idx")] << ";\n" <<
+        "idx += get_global_size(0);\n" <<
+        "}\n" <<
+
+        k.decl<bool>("compare_result") << ";\n" <<
+        k.decl<bool>("equal") << ";\n\n" <<
+        "while( idx < count ){\n" <<
+        // Next element
+        k.decl<input_type>("next") << " = " << input[k.var<uint_>("idx")] << ";\n" <<
+        "#ifdef BOOST_COMPUTE_USE_INPUT_IDX\n" <<
+        // next_idx is an index read from the uint_ index vector, so it is
+        // declared as uint_ (not as the input value type)
+        k.decl<uint_>("next_idx") << " = " << input_idx[k.var<uint_>("idx")] << ";\n" <<
+        "#endif\n" <<
+
+        // Comparison between currently best element (acc) and next element
+        "#ifdef BOOST_COMPUTE_FIND_MAXIMUM\n" <<
+        "compare_result = " << compare(k.var<input_type>("next"),
+                                       k.var<input_type>("acc")) << ";\n" <<
+        "# ifdef BOOST_COMPUTE_USE_INPUT_IDX\n" <<
+        "equal = !compare_result && !" <<
+        compare(k.var<input_type>("acc"),
+                k.var<input_type>("next")) << ";\n" <<
+        "# endif\n" <<
+        "#else\n" <<
+        "compare_result = " << compare(k.var<input_type>("acc"),
+                                       k.var<input_type>("next")) << ";\n" <<
+        "# ifdef BOOST_COMPUTE_USE_INPUT_IDX\n" <<
+        "equal = !compare_result && !" <<
+        compare(k.var<input_type>("next"),
+                k.var<input_type>("acc")) << ";\n" <<
+        "# endif\n" <<
+        "#endif\n" <<
+
+        // save the winner
+        "acc = compare_result ? acc : next;\n" <<
+        "#ifdef BOOST_COMPUTE_USE_INPUT_IDX\n" <<
+        "acc_idx = compare_result ? " <<
+        "acc_idx : " <<
+        "(equal ? min(acc_idx, next_idx) : next_idx);\n" <<
+        "#else\n" <<
+        "acc_idx = compare_result ? acc_idx : idx;\n" <<
+        "#endif\n" <<
+        "idx += get_global_size(0);\n" <<
+        "}\n\n" <<
+
+        // Work item local id
+        k.decl<const uint_>("lid") << " = get_local_id(0);\n" <<
+        "block[lid] = acc;\n" <<
+        "block_idx[lid] = acc_idx;\n" <<
+        "barrier(CLK_LOCAL_MEM_FENCE);\n" <<
+
+        // number of input elements from this group's first work-item onwards
+        k.decl<uint_>("group_offset") <<
+        " = count - (get_local_size(0) * get_group_id(0));\n\n";
+
+    // reduction of the per-work-item results inside the work-group
+    k <<
+        "#pragma unroll\n"
+        "for(" << k.decl<uint_>("offset") << " = " << uint_(work_group_size) << " / 2; offset > 0; " <<
+        "offset = offset / 2) {\n" <<
+        "if((lid < offset) && ((lid + offset) < group_offset)) { \n" <<
+        k.decl<input_type>("mine") << " = block[lid];\n" <<
+        k.decl<input_type>("other") << " = block[lid+offset];\n" <<
+        "#ifdef BOOST_COMPUTE_FIND_MAXIMUM\n" <<
+        "compare_result = " << compare(k.var<input_type>("other"),
+                                       k.var<input_type>("mine")) << ";\n" <<
+        "equal = !compare_result && !" <<
+        compare(k.var<input_type>("mine"),
+                k.var<input_type>("other")) << ";\n" <<
+        "#else\n" <<
+        "compare_result = " << compare(k.var<input_type>("mine"),
+                                       k.var<input_type>("other")) << ";\n" <<
+        "equal = !compare_result && !" <<
+        compare(k.var<input_type>("other"),
+                k.var<input_type>("mine")) << ";\n" <<
+        "#endif\n" <<
+        "block[lid] = compare_result ? mine : other;\n" <<
+        k.decl<uint_>("mine_idx") << " = block_idx[lid];\n" <<
+        k.decl<uint_>("other_idx") << " = block_idx[lid+offset];\n" <<
+        "block_idx[lid] = compare_result ? " <<
+        "mine_idx : " <<
+        "(equal ? min(mine_idx, other_idx) : other_idx);\n" <<
+        "}\n"
+        "barrier(CLK_LOCAL_MEM_FENCE);\n" <<
+        "}\n\n" <<
+
+        // write block result to global output
+        "if(lid == 0){\n" <<
+        result[k.var<uint_>("get_group_id(0)")] << " = block[0];\n" <<
+        result_idx[k.var<uint_>("get_group_id(0)")] << " = block_idx[0];\n" <<
+        "}";
+
+    // variants of the kernel are selected with preprocessor definitions
+    std::string options;
+    if(!find_minimum){
+        options = "-DBOOST_COMPUTE_FIND_MAXIMUM";
+    }
+    if(use_input_idx){
+        options += " -DBOOST_COMPUTE_USE_INPUT_IDX";
+    }
+
+    kernel kernel = k.compile(context, options);
+
+    // local memory: one value and one index per work-item of a group
+    kernel.set_arg(count_arg, static_cast<uint_>(count));
+    kernel.set_arg(block_arg, local_buffer<input_type>(work_group_size));
+    kernel.set_arg(block_idx_arg, local_buffer<uint_>(work_group_size));
+
+    queue.enqueue_1d_range_kernel(kernel,
+                                  0,
+                                  work_groups_no * work_group_size,
+                                  work_group_size);
+}
+
+// Convenience overload for input that carries no separate index vector:
+// forwards to the full find_extrema_with_reduce() with use_input_idx set to
+// false, passing result_idx as a stand-in for the input_idx parameter.
+template<class InputIterator, class ResultIterator, class Compare>
+inline void find_extrema_with_reduce(InputIterator input,
+                                     size_t count,
+                                     ResultIterator result,
+                                     vector<uint_>::iterator result_idx,
+                                     size_t work_groups_no,
+                                     size_t work_group_size,
+                                     Compare compare,
+                                     const bool find_minimum,
+                                     command_queue &queue)
+{
+    // stand-in only; the callee is told not to use input_idx
+    buffer_iterator<uint_> unused_input_idx = result_idx;
+    const bool use_input_idx = false;
+
+    find_extrema_with_reduce(
+        input,
+        unused_input_idx,
+        count,
+        result,
+        result_idx,
+        work_groups_no,
+        work_group_size,
+        compare,
+        find_minimum,
+        use_input_idx,
+        queue
+    );
+}
+
+// Finds the extremum of [first, last) under compare in two reduction phases
+// and returns an iterator to it.
+//
+// Phase I runs the reduction kernel over the input and produces one
+// candidate (value and index) per work-group. Phase II runs the kernel once
+// more with a single work-group over the candidates, using their recorded
+// indices, and leaves the final index in a one-element pinned buffer that is
+// mapped to the host to read the result.
+//
+// The work-group size ("wgsize") and the number of work-groups per compute
+// unit ("wgpcu") are taken from the device's parameter cache.
+template<class InputIterator, class Compare>
+InputIterator find_extrema_with_reduce(InputIterator first,
+                                       InputIterator last,
+                                       Compare compare,
+                                       const bool find_minimum,
+                                       command_queue &queue)
+{
+    typedef typename std::iterator_traits<InputIterator>::difference_type difference_type;
+    typedef typename std::iterator_traits<InputIterator>::value_type input_type;
+
+    const context &context = queue.get_context();
+    const device &device = queue.get_device();
+
+    // Getting information about used queue and device
+    const size_t compute_units_no = device.get_info<CL_DEVICE_MAX_COMPUTE_UNITS>();
+    const size_t max_work_group_size = device.get_info<CL_DEVICE_MAX_WORK_GROUP_SIZE>();
+
+    const size_t count = detail::iterator_range_size(first, last);
+
+    std::string cache_key = std::string("__boost_find_extrema_with_reduce_")
+        + type_name<input_type>();
+
+    // load parameters
+    boost::shared_ptr<parameter_cache> parameters =
+        detail::parameter_cache::get_global_cache(device);
+
+    // get preferred work group size and preferred number
+    // of work groups per compute unit
+    size_t work_group_size = parameters->get(cache_key, "wgsize", 256);
+    size_t work_groups_per_cu = parameters->get(cache_key, "wgpcu", 100);
+
+    // calculate work group size and number of work groups
+    work_group_size = (std::min)(max_work_group_size, work_group_size);
+    size_t work_groups_no = compute_units_no * work_groups_per_cu;
+
+    // never use more work groups than needed to give every element its own
+    // work-item: ceil(count / work_group_size) in integer arithmetic
+    const size_t work_groups_for_input =
+        count / work_group_size + (count % work_group_size != 0 ? 1 : 0);
+    work_groups_no = (std::min)(work_groups_no, work_groups_for_input);
+
+    // phase I: finding candidates for extremum
+
+    // device buffers for extremum candidates and their indices
+    // each work-group computes its candidate
+    vector<input_type> candidates(work_groups_no, context);
+    vector<uint_> candidates_idx(work_groups_no, context);
+
+    // finding candidates for first extremum and their indices
+    find_extrema_with_reduce(
+        first, count, candidates.begin(), candidates_idx.begin(),
+        work_groups_no, work_group_size, compare, find_minimum, queue
+    );
+
+    // phase II: finding extremum from among the candidates
+
+    // zero-copy buffers for final result (value and index)
+    vector<input_type, ::boost::compute::pinned_allocator<input_type> >
+        result(1, context);
+    vector<uint_, ::boost::compute::pinned_allocator<uint_> >
+        result_idx(1, context);
+
+    // get extremum from among the candidates
+    find_extrema_with_reduce(
+        candidates.begin(), candidates_idx.begin(), work_groups_no, result.begin(),
+        result_idx.begin(), 1, work_group_size, compare, find_minimum, true, queue
+    );
+
+    // mapping extremum index to host
+    uint_* result_idx_host_ptr =
+        static_cast<uint_*>(
+            queue.enqueue_map_buffer(
+                result_idx.get_buffer(), command_queue::map_read,
+                0, sizeof(uint_)
+            )
+        );
+
+    // copy the index out and release the mapping before the buffer goes
+    // out of scope
+    const uint_ extremum_index = *result_idx_host_ptr;
+    queue.enqueue_unmap_buffer(result_idx.get_buffer(), result_idx_host_ptr);
+
+    return first + static_cast<difference_type>(extremum_index);
+}
+
+// Overload of find_extrema_with_reduce() for ::boost::compute::less<T>.
+//
+// Phase I is the same device reduction as in the generic overload and
+// produces one candidate (value and index) per work-group, written to
+// pinned buffers. Phase II maps the candidates to the host and scans them
+// serially with the built-in < / > / == operators; among equal candidates
+// the one with the lowest index wins.
+//
+// The work-group size ("wgsize") and the number of work-groups per compute
+// unit ("wgpcu") are taken from the device's parameter cache.
+template<class InputIterator>
+InputIterator find_extrema_with_reduce(InputIterator first,
+                                       InputIterator last,
+                                       ::boost::compute::less<
+                                           typename std::iterator_traits<
+                                               InputIterator
+                                           >::value_type
+                                       >
+                                       compare,
+                                       const bool find_minimum,
+                                       command_queue &queue)
+{
+    typedef typename std::iterator_traits<InputIterator>::difference_type difference_type;
+    typedef typename std::iterator_traits<InputIterator>::value_type input_type;
+
+    const context &context = queue.get_context();
+    const device &device = queue.get_device();
+
+    // Getting information about used queue and device
+    const size_t compute_units_no = device.get_info<CL_DEVICE_MAX_COMPUTE_UNITS>();
+    const size_t max_work_group_size = device.get_info<CL_DEVICE_MAX_WORK_GROUP_SIZE>();
+
+    const size_t count = detail::iterator_range_size(first, last);
+
+    std::string cache_key = std::string("__boost_find_extrema_with_reduce_")
+        + type_name<input_type>();
+
+    // load parameters
+    boost::shared_ptr<parameter_cache> parameters =
+        detail::parameter_cache::get_global_cache(device);
+
+    // get preferred work group size and preferred number
+    // of work groups per compute unit
+    size_t work_group_size = parameters->get(cache_key, "wgsize", 256);
+    size_t work_groups_per_cu = parameters->get(cache_key, "wgpcu", 64);
+
+    // calculate work group size and number of work groups
+    work_group_size = (std::min)(max_work_group_size, work_group_size);
+    size_t work_groups_no = compute_units_no * work_groups_per_cu;
+
+    // never use more work groups than needed to give every element its own
+    // work-item: ceil(count / work_group_size) in integer arithmetic
+    const size_t work_groups_for_input =
+        count / work_group_size + (count % work_group_size != 0 ? 1 : 0);
+    work_groups_no = (std::min)(work_groups_no, work_groups_for_input);
+
+    // phase I: finding candidates for extremum
+
+    // device buffers for extremum candidates and their indices
+    // each work-group computes its candidate
+    // zero-copy buffers are used to eliminate copying data back to host
+    vector<input_type, ::boost::compute::pinned_allocator<input_type> >
+        candidates(work_groups_no, context);
+    vector<uint_, ::boost::compute::pinned_allocator <uint_> >
+        candidates_idx(work_groups_no, context);
+
+    // finding candidates for first extremum and their indices
+    find_extrema_with_reduce(
+        first, count, candidates.begin(), candidates_idx.begin(),
+        work_groups_no, work_group_size, compare, find_minimum, queue
+    );
+
+    // phase II: finding extremum from among the candidates
+
+    // mapping candidates and their indices to host
+    input_type* candidates_host_ptr =
+        static_cast<input_type*>(
+            queue.enqueue_map_buffer(
+                candidates.get_buffer(), command_queue::map_read,
+                0, work_groups_no * sizeof(input_type)
+            )
+        );
+
+    uint_* candidates_idx_host_ptr =
+        static_cast<uint_*>(
+            queue.enqueue_map_buffer(
+                candidates_idx.get_buffer(), command_queue::map_read,
+                0, work_groups_no * sizeof(uint_)
+            )
+        );
+
+    // find extremum (serial) from among the candidates on host, starting
+    // from the first candidate
+    input_type extremum = candidates_host_ptr[0];
+    uint_ extremum_index = candidates_idx_host_ptr[0];
+
+    for(size_t group = 1; group < work_groups_no; group++){
+        const input_type next = candidates_host_ptr[group];
+        const uint_ next_index = candidates_idx_host_ptr[group];
+
+        // strictly better candidate for the requested direction
+        const bool better = find_minimum ? (next < extremum)
+                                         : (next > extremum);
+
+        if(better){
+            extremum = next;
+            extremum_index = next_index;
+        }
+        else if(next == extremum && next_index < extremum_index){
+            // equal value: keep the occurrence with the lowest index
+            extremum_index = next_index;
+        }
+    }
+
+    // the result has been copied out, release both mappings before the
+    // buffers go out of scope
+    queue.enqueue_unmap_buffer(candidates.get_buffer(), candidates_host_ptr);
+    queue.enqueue_unmap_buffer(candidates_idx.get_buffer(), candidates_idx_host_ptr);
+
+    return first + static_cast<difference_type>(extremum_index);
+}
+
+} // end detail namespace
+} // end compute namespace
+} // end boost namespace
+
+#endif // BOOST_COMPUTE_ALGORITHM_DETAIL_FIND_EXTREMA_WITH_REDUCE_HPP
diff --git a/boost/compute/algorithm/detail/find_if_with_atomics.hpp b/boost/compute/algorithm/detail/find_if_with_atomics.hpp
new file mode 100644
index 0000000000..112c34cf00
--- /dev/null
+++ b/boost/compute/algorithm/detail/find_if_with_atomics.hpp
@@ -0,0 +1,212 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_FIND_IF_WITH_ATOMICS_HPP
+#define BOOST_COMPUTE_ALGORITHM_DETAIL_FIND_IF_WITH_ATOMICS_HPP
+
+#include <iterator>
+
+#include <boost/compute/types.hpp>
+#include <boost/compute/functional.hpp>
+#include <boost/compute/command_queue.hpp>
+#include <boost/compute/container/detail/scalar.hpp>
+#include <boost/compute/iterator/buffer_iterator.hpp>
+#include <boost/compute/type_traits/type_name.hpp>
+#include <boost/compute/detail/meta_kernel.hpp>
+#include <boost/compute/detail/iterator_range_size.hpp>
+#include <boost/compute/detail/parameter_cache.hpp>
+
+namespace boost {
+namespace compute {
+namespace detail {
+
+/// Finds the first element of [first, first + count) that satisfies
+/// \p predicate, using one work-item per element ("one value per thread").
+///
+/// A single global index is initialised to \p count; every work-item whose
+/// element matches lowers it with an atomic min on its own position, so the
+/// smallest matching position is what remains. The returned iterator is
+/// \c first plus that index (i.e. \c first + count when nothing matched).
+///
+/// \p last is not read by this function; \p count supplies the length.
+template<class InputIterator, class UnaryPredicate>
+inline InputIterator find_if_with_atomics_one_vpt(InputIterator first,
+                                                  InputIterator last,
+                                                  UnaryPredicate predicate,
+                                                  const size_t count,
+                                                  command_queue &queue)
+{
+    typedef typename std::iterator_traits<InputIterator>::value_type value_type;
+    typedef typename std::iterator_traits<InputIterator>::difference_type difference_type;
+
+    (void) last;
+
+    const context &context = queue.get_context();
+
+    detail::meta_kernel k("find_if");
+    // The result slot is unsigned: it is updated through atomic_min<uint_>
+    // and referenced as a uint_ pointer below, so the kernel argument must be
+    // declared with the same type (as the multiple-vpt kernel does).
+    size_t index_arg = k.add_arg<uint_ *>(memory_object::global_memory, "index");
+    atomic_min<uint_> atomic_min_uint;
+
+    k << k.decl<const uint_>("i") << " = get_global_id(0);\n"
+      << k.decl<const value_type>("value") << "="
+      <<     first[k.var<const uint_>("i")] << ";\n"
+      << "if(" << predicate(k.var<const value_type>("value")) << "){\n"
+      << " " << atomic_min_uint(k.var<uint_ *>("index"), k.var<uint_>("i")) << ";\n"
+      << "}\n";
+
+    kernel kernel = k.compile(context);
+
+    scalar<uint_> index(context);
+    kernel.set_arg(index_arg, index.get_buffer());
+
+    // initialize index to the last iterator's index ("not found")
+    index.write(static_cast<uint_>(count), queue);
+    queue.enqueue_1d_range_kernel(kernel, 0, count, 0);
+
+    // read index and return iterator
+    return first + static_cast<difference_type>(index.read(queue));
+}
+
+/// Finds the first element of [first, first + count) that satisfies
+/// \p predicate with a kernel in which each work-item examines up to
+/// \p vpt ("values per thread") elements.
+///
+/// As in the one-value-per-thread variant, a global index initialised to
+/// \p count is lowered with an atomic min by every work-item that finds a
+/// match; the function returns \c first plus the final index.
+///
+/// Two kernel bodies are generated depending on the device type:
+///  - GPU: a work-group covers \c lsize * vpt consecutive elements and each
+///    work-item steps through them with a stride of \c lsize. The group
+///    returns early when the index read at kernel start already precedes it.
+///  - other devices: each work-item scans \p vpt consecutive elements and
+///    stops as soon as the global index is not greater than its position.
+///
+/// \p vpt must be greater than zero. \p last is not read by this function.
+template<class InputIterator, class UnaryPredicate>
+inline InputIterator find_if_with_atomics_multiple_vpt(InputIterator first,
+                                                       InputIterator last,
+                                                       UnaryPredicate predicate,
+                                                       const size_t count,
+                                                       const size_t vpt,
+                                                       command_queue &queue)
+{
+    typedef typename std::iterator_traits<InputIterator>::value_type value_type;
+    typedef typename std::iterator_traits<InputIterator>::difference_type difference_type;
+
+    (void) last;
+
+    const context &context = queue.get_context();
+    const device &device = queue.get_device();
+
+    detail::meta_kernel k("find_if");
+    size_t index_arg = k.add_arg<uint_ *>(memory_object::global_memory, "index");
+    size_t count_arg = k.add_arg<const uint_>("count");
+    size_t vpt_arg = k.add_arg<const uint_>("vpt");
+    atomic_min<uint_> atomic_min_uint;
+
+    // for GPUs reads from global memory are coalesced
+    if(device.type() & device::gpu) {
+        k <<
+            k.decl<const uint_>("lsize") << " = get_local_size(0);\n" <<
+            k.decl<uint_>("id") << " = get_local_id(0) + get_group_id(0) * lsize * vpt;\n" <<
+            k.decl<const uint_>("end") << " = min(" <<
+                    "id + (lsize *" << k.var<uint_>("vpt") << ")," <<
+                    "count" <<
+            ");\n" <<
+
+            // checking if the index is already found
+            "__local uint local_index;\n" <<
+            "if(get_local_id(0) == 0){\n" <<
+            " local_index = *index;\n " <<
+            "};\n" <<
+            "barrier(CLK_LOCAL_MEM_FENCE);\n" <<
+            "if(local_index < id){\n" <<
+            " return;\n" <<
+            "}\n" <<
+
+            "while(id < end){\n" <<
+            " " << k.decl<const value_type>("value") << " = " <<
+                  first[k.var<const uint_>("id")] << ";\n"
+            " if(" << predicate(k.var<const value_type>("value")) << "){\n" <<
+            " " << atomic_min_uint(k.var<uint_ *>("index"),
+                                   k.var<uint_>("id")) << ";\n" <<
+            " return;\n"
+            " }\n" <<
+            " id+=lsize;\n" <<
+            "}\n";
+    // for CPUs (and other devices) reads are ordered so the big cache is
+    // efficiently used.
+    } else {
+        k <<
+            k.decl<uint_>("id") << " = get_global_id(0) * " << k.var<uint_>("vpt") << ";\n" <<
+            k.decl<const uint_>("end") << " = min(" <<
+                    "id + " << k.var<uint_>("vpt") << "," <<
+                    "count" <<
+            ");\n" <<
+            "while(id < end && (*index) > id){\n" <<
+            " " << k.decl<const value_type>("value") << " = " <<
+                  first[k.var<const uint_>("id")] << ";\n"
+            " if(" << predicate(k.var<const value_type>("value")) << "){\n" <<
+            " " << atomic_min_uint(k.var<uint_ *>("index"),
+                                   k.var<uint_>("id")) << ";\n" <<
+            " return;\n" <<
+            " }\n" <<
+            " id++;\n" <<
+            "}\n";
+    }
+
+    kernel kernel = k.compile(context);
+
+    scalar<uint_> index(context);
+    kernel.set_arg(index_arg, index.get_buffer());
+    kernel.set_arg(count_arg, static_cast<uint_>(count));
+    kernel.set_arg(vpt_arg, static_cast<uint_>(vpt));
+
+    // initialize index to the last iterator's index ("not found")
+    index.write(static_cast<uint_>(count), queue);
+
+    // Number of work-items = ceil(count / vpt), computed with integers.
+    // A float-based ceil cannot represent counts above 2^24 exactly and
+    // could yield one work-item too few, leaving the tail unexamined.
+    const size_t global_wg_size =
+        count / vpt + ((count % vpt != 0) ? 1 : 0);
+    queue.enqueue_1d_range_kernel(kernel, 0, global_wg_size, 0);
+
+    // read index and return iterator
+    return first + static_cast<difference_type>(index.read(queue));
+}
+
+/// Returns an iterator to the first element of [first, last) that satisfies
+/// \p predicate, or \p last when the range is empty.
+///
+/// Dispatches to one of two kernels:
+///  - on GPUs, inputs of at most "one_vpt_threshold" elements (cached
+///    parameter, default 1048576) use the one-value-per-thread kernel;
+///  - otherwise the multiple-values-per-thread kernel is used, with "vpt"
+///    taken from the parameter cache on GPUs (default 32) or chosen so that
+///    the work is split equally between the compute units on other devices.
+template<class InputIterator, class UnaryPredicate>
+inline InputIterator find_if_with_atomics(InputIterator first,
+                                          InputIterator last,
+                                          UnaryPredicate predicate,
+                                          command_queue &queue)
+{
+    typedef typename std::iterator_traits<InputIterator>::value_type value_type;
+
+    size_t count = detail::iterator_range_size(first, last);
+    if(count == 0){
+        return last;
+    }
+
+    const device &device = queue.get_device();
+    const bool on_gpu = (device.type() & device::gpu) != 0;
+
+    // load cached parameters
+    std::string cache_key = std::string("__boost_find_if_with_atomics_")
+        + type_name<value_type>();
+    boost::shared_ptr<parameter_cache> parameters =
+        detail::parameter_cache::get_global_cache(device);
+
+    // for relatively small inputs on GPUs kernel checking one value per thread
+    // (work-item) is more efficient than its multiple values per thread version
+    if(on_gpu){
+        const size_t one_vpt_threshold =
+            parameters->get(cache_key, "one_vpt_threshold", 1048576);
+        if(count <= one_vpt_threshold){
+            return find_if_with_atomics_one_vpt(
+                first, last, predicate, count, queue
+            );
+        }
+    }
+
+    // values per thread
+    size_t vpt;
+    if(on_gpu){
+        // get vpt parameter
+        vpt = parameters->get(cache_key, "vpt", 32);
+    } else {
+        // for CPUs work is split equally between compute units:
+        // vpt = ceil(count / max_compute_units), computed with integers so
+        // no precision is lost for large counts
+        const size_t max_compute_units =
+            device.get_info<CL_DEVICE_MAX_COMPUTE_UNITS>();
+        vpt = count / max_compute_units
+            + ((count % max_compute_units != 0) ? 1 : 0);
+    }
+
+    return find_if_with_atomics_multiple_vpt(
+        first, last, predicate, count, vpt, queue
+    );
+}
+
+} // end detail namespace
+} // end compute namespace
+} // end boost namespace
+
+#endif // BOOST_COMPUTE_ALGORITHM_DETAIL_FIND_IF_WITH_ATOMICS_HPP
diff --git a/boost/compute/algorithm/detail/inplace_reduce.hpp b/boost/compute/algorithm/detail/inplace_reduce.hpp
new file mode 100644
index 0000000000..60c61e83fe
--- /dev/null
+++ b/boost/compute/algorithm/detail/inplace_reduce.hpp
@@ -0,0 +1,136 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_INPLACE_REDUCE_HPP
+#define BOOST_COMPUTE_ALGORITHM_DETAIL_INPLACE_REDUCE_HPP
+
+#include <iterator>
+
+#include <boost/utility/result_of.hpp>
+
+#include <boost/compute/command_queue.hpp>
+#include <boost/compute/container/vector.hpp>
+#include <boost/compute/detail/iterator_range_size.hpp>
+#include <boost/compute/memory/local_buffer.hpp>
+
+namespace boost {
+namespace compute {
+namespace detail {
+
+/// Reduces [first, last) with \p function and leaves the result in \c *first.
+///
+/// The reduction runs in passes. In each pass every work-item folds up to
+/// 8 consecutive values, each work-group of 64 work-items combines its
+/// partial results in local memory, and the group writes one value to the
+/// output buffer. Input and output buffers are swapped between passes (a
+/// temporary vector and the buffer behind \p first), so elements of the
+/// input range other than the first may be overwritten.
+///
+/// Ranges with fewer than two elements are left untouched.
+///
+/// NOTE(review): the kernel indexes the buffer behind \p first from element
+/// 0; confirm callers only pass iterators positioned at the buffer start.
+template<class Iterator, class BinaryFunction>
+inline void inplace_reduce(Iterator first,
+                           Iterator last,
+                           BinaryFunction function,
+                           command_queue &queue)
+{
+    typedef typename
+        std::iterator_traits<Iterator>::value_type
+        value_type;
+
+    size_t input_size = iterator_range_size(first, last);
+    if(input_size < 2){
+        return;
+    }
+
+    const context &context = queue.get_context();
+
+    size_t block_size = 64;
+    size_t values_per_thread = 8;
+    // number of input values consumed by one work-group
+    const size_t values_per_block = block_size * values_per_thread;
+
+    size_t block_count = input_size / values_per_block;
+    if(block_count * values_per_block != input_size)
+        block_count++;
+
+    vector<value_type> output(block_count, context);
+
+    meta_kernel k("inplace_reduce");
+    size_t input_arg = k.add_arg<value_type *>(memory_object::global_memory, "input");
+    size_t input_size_arg = k.add_arg<const uint_>("input_size");
+    size_t output_arg = k.add_arg<value_type *>(memory_object::global_memory, "output");
+    size_t scratch_arg = k.add_arg<value_type *>(memory_object::local_memory, "scratch");
+    k <<
+        "const uint gid = get_global_id(0);\n" <<
+        "const uint lid = get_local_id(0);\n" <<
+        "const uint values_per_thread =\n"
+            << uint_(values_per_thread) << ";\n" <<
+
+        // thread reduce
+        "const uint index = gid * values_per_thread;\n" <<
+        "if(index < input_size){\n" <<
+            k.decl<value_type>("sum") << " = input[index];\n" <<
+            "for(uint i = 1;\n" <<
+                 "i < values_per_thread && (index + i) < input_size;\n" <<
+                 "i++){\n" <<
+            " sum = " <<
+                    function(k.var<value_type>("sum"),
+                             k.var<value_type>("input[index+i]")) << ";\n" <<
+            "}\n" <<
+            "scratch[lid] = sum;\n" <<
+        "}\n" <<
+
+        // local reduce
+        "for(uint i = 1; i < get_local_size(0); i <<= 1){\n" <<
+        " barrier(CLK_LOCAL_MEM_FENCE);\n" <<
+        " uint mask = (i << 1) - 1;\n" <<
+        " uint next_index = (gid + i) * values_per_thread;\n"
+        " if((lid & mask) == 0 && next_index < input_size){\n" <<
+        " scratch[lid] = " <<
+                function(k.var<value_type>("scratch[lid]"),
+                         k.var<value_type>("scratch[lid+i]")) << ";\n" <<
+        " }\n" <<
+        "}\n" <<
+
+        // write output for block
+        "if(lid == 0){\n" <<
+        " output[get_group_id(0)] = scratch[0];\n" <<
+        "}\n"
+        ;
+
+    const buffer *input_buffer = &first.get_buffer();
+    const buffer *output_buffer = &output.get_buffer();
+
+    kernel kernel = k.compile(context);
+
+    while(input_size > 1){
+        kernel.set_arg(input_arg, *input_buffer);
+        kernel.set_arg(input_size_arg, static_cast<uint_>(input_size));
+        kernel.set_arg(output_arg, *output_buffer);
+        kernel.set_arg(scratch_arg, local_buffer<value_type>(block_size));
+
+        queue.enqueue_1d_range_kernel(kernel,
+                                      0,
+                                      block_count * block_size,
+                                      block_size);
+
+        // Each work-group wrote exactly one partial result, so the next pass
+        // has block_count inputs. Using the integer count avoids the loss of
+        // precision a float-based ceil suffers for inputs above 2^24, which
+        // could silently drop the last partial result.
+        input_size = block_count;
+
+        block_count = input_size / values_per_block;
+        if(block_count * values_per_block != input_size)
+            block_count++;
+
+        std::swap(input_buffer, output_buffer);
+    }
+
+    // after the final swap input_buffer names the buffer holding the result;
+    // move it into place if that is the temporary vector
+    if(input_buffer != &first.get_buffer()){
+        ::boost::compute::copy(output.begin(),
+                               output.begin() + 1,
+                               first,
+                               queue);
+    }
+}
+
+} // end detail namespace
+} // end compute namespace
+} // end boost namespace
+
+#endif // BOOST_COMPUTE_ALGORITHM_DETAIL_INPLACE_REDUCE_HPP
diff --git a/boost/compute/algorithm/detail/insertion_sort.hpp b/boost/compute/algorithm/detail/insertion_sort.hpp
new file mode 100644
index 0000000000..4b5b95139a
--- /dev/null
+++ b/boost/compute/algorithm/detail/insertion_sort.hpp
@@ -0,0 +1,165 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_INSERTION_SORT_HPP
+#define BOOST_COMPUTE_ALGORITHM_DETAIL_INSERTION_SORT_HPP
+
+#include <boost/compute/kernel.hpp>
+#include <boost/compute/program.hpp>
+#include <boost/compute/command_queue.hpp>
+#include <boost/compute/detail/meta_kernel.hpp>
+#include <boost/compute/detail/iterator_range_size.hpp>
+#include <boost/compute/memory/local_buffer.hpp>
+
+namespace boost {
+namespace compute {
+namespace detail {
+
+/// Sorts [first, last) according to \p compare with an insertion sort that
+/// is enqueued as a single task.
+///
+/// The kernel copies the range into local memory, sorts it there and writes
+/// the sorted values back through \p first. Ranges with fewer than two
+/// elements are left untouched.
+template<class Iterator, class Compare>
+inline void serial_insertion_sort(Iterator first,
+                                  Iterator last,
+                                  Compare compare,
+                                  command_queue &queue)
+{
+    typedef typename std::iterator_traits<Iterator>::value_type element_type;
+
+    const size_t element_count = iterator_range_size(first, last);
+    if(element_count < 2){
+        return;
+    }
+
+    meta_kernel source("serial_insertion_sort");
+    const size_t scratch_arg =
+        source.add_arg<element_type *>(memory_object::local_memory, "data");
+    const size_t length_arg = source.add_arg<uint_>("n");
+
+    // stage 1: copy data to local memory
+    source <<
+        "for(uint i = 0; i < n; i++){\n" <<
+        " data[i] = " << first[source.var<uint_>("i")] << ";\n" <<
+        "}\n";
+
+    // stage 2: sort data in local memory
+    source <<
+        "for(uint i = 1; i < n; i++){\n" <<
+        " " << source.decl<const element_type>("value") << " = data[i];\n" <<
+        " uint pos = i;\n" <<
+        " while(pos > 0 && " <<
+            compare(source.var<const element_type>("value"),
+                    source.var<const element_type>("data[pos-1]")) << "){\n" <<
+        " data[pos] = data[pos-1];\n" <<
+        " pos--;\n" <<
+        " }\n" <<
+        " data[pos] = value;\n" <<
+        "}\n";
+
+    // stage 3: copy sorted data to output
+    source <<
+        "for(uint i = 0; i < n; i++){\n" <<
+        " " << first[source.var<uint_>("i")] << " = data[i];\n" <<
+        "}\n";
+
+    ::boost::compute::kernel sort_kernel = source.compile(queue.get_context());
+    sort_kernel.set_arg(scratch_arg, local_buffer<element_type>(element_count));
+    sort_kernel.set_arg(length_arg, static_cast<uint_>(element_count));
+
+    queue.enqueue_task(sort_kernel);
+}
+
+/// Convenience overload: sorts [first, last) by forwarding to the
+/// comparator-taking overload with ::boost::compute::less of the
+/// iterator's value type.
+template<class Iterator>
+inline void serial_insertion_sort(Iterator first,
+                                  Iterator last,
+                                  command_queue &queue)
+{
+    typedef typename std::iterator_traits<Iterator>::value_type element_type;
+
+    serial_insertion_sort(
+        first, last, ::boost::compute::less<element_type>(), queue
+    );
+}
+
+/// Sorts the keys in [keys_first, keys_last) according to \p compare with an
+/// insertion sort enqueued as a single task, applying the same moves to the
+/// values starting at \p values_first.
+///
+/// The kernel copies keys and values into local memory, sorts them there
+/// and writes both sequences back. Ranges with fewer than two keys are left
+/// untouched.
+template<class KeyIterator, class ValueIterator, class Compare>
+inline void serial_insertion_sort_by_key(KeyIterator keys_first,
+                                         KeyIterator keys_last,
+                                         ValueIterator values_first,
+                                         Compare compare,
+                                         command_queue &queue)
+{
+    typedef typename std::iterator_traits<KeyIterator>::value_type key_type;
+    typedef typename std::iterator_traits<ValueIterator>::value_type value_type;
+
+    size_t count = iterator_range_size(keys_first, keys_last);
+    if(count < 2){
+        return;
+    }
+
+    meta_kernel k("serial_insertion_sort_by_key");
+    size_t local_keys_arg = k.add_arg<key_type *>(memory_object::local_memory, "keys");
+    size_t local_data_arg = k.add_arg<value_type *>(memory_object::local_memory, "data");
+    size_t count_arg = k.add_arg<uint_>("n");
+
+    k <<
+        // copy data to local memory
+        "for(uint i = 0; i < n; i++){\n" <<
+        " keys[i] = " << keys_first[k.var<uint_>("i")] << ";\n"
+        " data[i] = " << values_first[k.var<uint_>("i")] << ";\n"
+        "}\n"
+
+        // sort data in local memory
+        "for(uint i = 1; i < n; i++){\n" <<
+        " " << k.decl<const key_type>("key") << " = keys[i];\n" <<
+        " " << k.decl<const value_type>("value") << " = data[i];\n" <<
+        " uint pos = i;\n" <<
+        " while(pos > 0 && " <<
+            compare(k.var<const key_type>("key"),
+                    k.var<const key_type>("keys[pos-1]")) << "){\n" <<
+        " keys[pos] = keys[pos-1];\n" <<
+        " data[pos] = data[pos-1];\n" <<
+        " pos--;\n" <<
+        " }\n" <<
+        " keys[pos] = key;\n" <<
+        " data[pos] = value;\n" <<
+        "}\n" <<
+
+        // copy sorted data to output
+        "for(uint i = 0; i < n; i++){\n" <<
+        " " << keys_first[k.var<uint_>("i")] << " = keys[i];\n"
+        " " << values_first[k.var<uint_>("i")] << " = data[i];\n"
+        "}\n";
+
+    const context &context = queue.get_context();
+    ::boost::compute::kernel kernel = k.compile(context);
+    // size the local-memory arguments with local_buffer<>, as
+    // serial_insertion_sort() does, instead of passing a byte count
+    // narrowed to uint_ together with a literal 0 pointer
+    kernel.set_arg(local_keys_arg, local_buffer<key_type>(count));
+    kernel.set_arg(local_data_arg, local_buffer<value_type>(count));
+    kernel.set_arg(count_arg, static_cast<uint_>(count));
+
+    queue.enqueue_task(kernel);
+}
+
+/// Convenience overload: sorts keys (and the values that accompany them) by
+/// forwarding to the comparator-taking overload with boost::compute::less
+/// of the key type.
+template<class KeyIterator, class ValueIterator>
+inline void serial_insertion_sort_by_key(KeyIterator keys_first,
+                                         KeyIterator keys_last,
+                                         ValueIterator values_first,
+                                         command_queue &queue)
+{
+    typedef typename std::iterator_traits<KeyIterator>::value_type key_type;
+
+    boost::compute::less<key_type> ascending;
+    serial_insertion_sort_by_key(
+        keys_first, keys_last, values_first, ascending, queue
+    );
+}
+
+} // end detail namespace
+} // end compute namespace
+} // end boost namespace
+
+#endif // BOOST_COMPUTE_ALGORITHM_DETAIL_INSERTION_SORT_HPP
diff --git a/boost/compute/algorithm/detail/merge_path.hpp b/boost/compute/algorithm/detail/merge_path.hpp
new file mode 100644
index 0000000000..bc2c8fa88c
--- /dev/null
+++ b/boost/compute/algorithm/detail/merge_path.hpp
@@ -0,0 +1,116 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2014 Roshan <thisisroshansmail@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_MERGE_PATH_HPP
+#define BOOST_COMPUTE_ALGORITHM_DETAIL_MERGE_PATH_HPP
+
+#include <iterator>
+
+#include <boost/compute/algorithm/find_if.hpp>
+#include <boost/compute/container/vector.hpp>
+#include <boost/compute/detail/iterator_range_size.hpp>
+#include <boost/compute/detail/meta_kernel.hpp>
+#include <boost/compute/lambda.hpp>
+#include <boost/compute/system.hpp>
+
+namespace boost {
+namespace compute {
+namespace detail {
+
+///
+/// \brief Merge Path kernel class
+///
+/// Subclass of meta_kernel to break two sets into tiles according
+/// to their merge path
+///
+/// Usage: call set_range() once to generate the kernel source, then exec()
+/// to launch it. Work-item \c i binary-searches for the split point at
+/// which the first (i+1)*tile_size merged elements end, and writes the
+/// number of elements taken from the first range to \c result_a[i] and from
+/// the second range to \c result_b[i].
+///
+class merge_path_kernel : public meta_kernel
+{
+public:
+    /// Number of merged elements per tile (defaults to 4).
+    unsigned int tile_size;
+
+    // All members are initialised so that exec() has well-defined behaviour
+    // (it returns an empty event) even if set_range() was never called.
+    merge_path_kernel()
+        : meta_kernel("merge_path"),
+          tile_size(4),
+          m_a_count(0),
+          m_a_count_arg(0),
+          m_b_count(0),
+          m_b_count_arg(0)
+    {
+    }
+
+    /// Records the sizes of [first1, last1) and [first2, last2), registers
+    /// the \c a_count / \c b_count kernel arguments and emits the kernel
+    /// body that partitions the two ranges using \p comp.
+    template<class InputIterator1, class InputIterator2,
+             class OutputIterator1, class OutputIterator2,
+             class Compare>
+    void set_range(InputIterator1 first1,
+                   InputIterator1 last1,
+                   InputIterator2 first2,
+                   InputIterator2 last2,
+                   OutputIterator1 result_a,
+                   OutputIterator2 result_b,
+                   Compare comp)
+    {
+        m_a_count = iterator_range_size(first1, last1);
+        m_a_count_arg = add_arg<uint_>("a_count");
+
+        m_b_count = iterator_range_size(first2, last2);
+        m_b_count_arg = add_arg<uint_>("b_count");
+
+        *this <<
+            "uint i = get_global_id(0);\n" <<
+            "uint target = (i+1)*" << tile_size << ";\n" <<
+            "uint start = max(convert_int(0),convert_int(target)-convert_int(b_count));\n" <<
+            "uint end = min(target,a_count);\n" <<
+            "uint a_index, b_index;\n" <<
+            "while(start<end)\n" <<
+            "{\n" <<
+            " a_index = (start + end)/2;\n" <<
+            " b_index = target - a_index - 1;\n" <<
+            " if(!(" << comp(first2[expr<uint_>("b_index")],
+                              first1[expr<uint_>("a_index")]) << "))\n" <<
+            " start = a_index + 1;\n" <<
+            " else end = a_index;\n" <<
+            "}\n" <<
+            result_a[expr<uint_>("i")] << " = start;\n" <<
+            result_b[expr<uint_>("i")] << " = target - start;\n";
+    }
+
+    /// Same as the comparator-taking overload, using
+    /// ::boost::compute::less of the first range's value type.
+    template<class InputIterator1, class InputIterator2,
+             class OutputIterator1, class OutputIterator2>
+    void set_range(InputIterator1 first1,
+                   InputIterator1 last1,
+                   InputIterator2 first2,
+                   InputIterator2 last2,
+                   OutputIterator1 result_a,
+                   OutputIterator2 result_b)
+    {
+        typedef typename std::iterator_traits<InputIterator1>::value_type value_type;
+        ::boost::compute::less<value_type> less_than;
+        set_range(first1, last1, first2, last2, result_a, result_b, less_than);
+    }
+
+    /// Launches one work-item per complete tile and returns the resulting
+    /// event. When the two ranges together hold less than one tile, nothing
+    /// is enqueued and a default-constructed event is returned.
+    event exec(command_queue &queue)
+    {
+        const size_t tile_count = (m_a_count + m_b_count) / tile_size;
+        if(tile_count == 0) {
+            return event();
+        }
+
+        set_arg(m_a_count_arg, uint_(m_a_count));
+        set_arg(m_b_count_arg, uint_(m_b_count));
+
+        return exec_1d(queue, 0, tile_count);
+    }
+
+private:
+    size_t m_a_count;      // number of elements in the first range
+    size_t m_a_count_arg;  // kernel argument index of "a_count"
+    size_t m_b_count;      // number of elements in the second range
+    size_t m_b_count_arg;  // kernel argument index of "b_count"
+};
+
+} //end detail namespace
+} //end compute namespace
+} //end boost namespace
+
+#endif // BOOST_COMPUTE_ALGORITHM_DETAIL_MERGE_PATH_HPP
diff --git a/boost/compute/algorithm/detail/merge_sort_on_cpu.hpp b/boost/compute/algorithm/detail/merge_sort_on_cpu.hpp
new file mode 100644
index 0000000000..f4b53f10ae
--- /dev/null
+++ b/boost/compute/algorithm/detail/merge_sort_on_cpu.hpp
@@ -0,0 +1,366 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2015 Jakub Szuppe <j.szuppe@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_MERGE_SORT_ON_CPU_HPP
+#define BOOST_COMPUTE_ALGORITHM_DETAIL_MERGE_SORT_ON_CPU_HPP
+
+#include <boost/compute/kernel.hpp>
+#include <boost/compute/program.hpp>
+#include <boost/compute/command_queue.hpp>
+#include <boost/compute/algorithm/detail/merge_with_merge_path.hpp>
+#include <boost/compute/container/vector.hpp>
+#include <boost/compute/detail/meta_kernel.hpp>
+#include <boost/compute/detail/iterator_range_size.hpp>
+
+namespace boost {
+namespace compute {
+namespace detail {
+
+/// Merges adjacent pairs of sorted blocks of \p block_size elements from
+/// \p keys_first into \p keys_result; the merge is stable.
+///
+/// Work-item \c g merges the blocks starting at \c g * block_size * 2 and
+/// \c g * block_size * 2 + block_size, both clamped to \p count. When
+/// \p sort_by_key is true the values at \p values_first are moved to
+/// \p values_result alongside their keys; otherwise the value iterators are
+/// not referenced by the generated kernel.
+///
+/// \p block_size must be greater than zero.
+template<class KeyIterator, class ValueIterator, class Compare>
+inline void merge_blocks(KeyIterator keys_first,
+                         ValueIterator values_first,
+                         KeyIterator keys_result,
+                         ValueIterator values_result,
+                         Compare compare,
+                         size_t count,
+                         const size_t block_size,
+                         const bool sort_by_key,
+                         command_queue &queue)
+{
+    (void) values_result;
+    (void) values_first;
+
+    meta_kernel k("merge_sort_on_cpu_merge_blocks");
+    size_t count_arg = k.add_arg<const uint_>("count");
+    size_t block_size_arg = k.add_arg<uint_>("block_size");
+
+    k <<
+        k.decl<uint_>("b1_start") << " = get_global_id(0) * block_size * 2;\n" <<
+        k.decl<uint_>("b1_end") << " = min(count, b1_start + block_size);\n" <<
+        k.decl<uint_>("b2_start") << " = min(count, b1_start + block_size);\n" <<
+        k.decl<uint_>("b2_end") << " = min(count, b2_start + block_size);\n" <<
+        k.decl<uint_>("result_idx") << " = b1_start;\n" <<
+
+        // merging block 1 and block 2 (stable)
+        "while(b1_start < b1_end && b2_start < b2_end){\n" <<
+        " if( " << compare(keys_first[k.var<uint_>("b2_start")],
+                            keys_first[k.var<uint_>("b1_start")]) << "){\n" <<
+        " " << keys_result[k.var<uint_>("result_idx")] << " = " <<
+                keys_first[k.var<uint_>("b2_start")] << ";\n";
+    if(sort_by_key){
+        k <<
+        " " << values_result[k.var<uint_>("result_idx")] << " = " <<
+                values_first[k.var<uint_>("b2_start")] << ";\n";
+    }
+    k <<
+        " b2_start++;\n" <<
+        " }\n" <<
+        " else {\n" <<
+        " " << keys_result[k.var<uint_>("result_idx")] << " = " <<
+                keys_first[k.var<uint_>("b1_start")] << ";\n";
+    if(sort_by_key){
+        k <<
+        " " << values_result[k.var<uint_>("result_idx")] << " = " <<
+                values_first[k.var<uint_>("b1_start")] << ";\n";
+    }
+    k <<
+        " b1_start++;\n" <<
+        " }\n" <<
+        " result_idx++;\n" <<
+        "}\n" <<
+        // copy whatever remains of block 1
+        "while(b1_start < b1_end){\n" <<
+        " " << keys_result[k.var<uint_>("result_idx")] << " = " <<
+                keys_first[k.var<uint_>("b1_start")] << ";\n";
+    if(sort_by_key){
+        k <<
+        " " << values_result[k.var<uint_>("result_idx")] << " = " <<
+                values_first[k.var<uint_>("b1_start")] << ";\n";
+    }
+    k <<
+        " b1_start++;\n" <<
+        " result_idx++;\n" <<
+        "}\n" <<
+        // copy whatever remains of block 2
+        "while(b2_start < b2_end){\n" <<
+        " " << keys_result[k.var<uint_>("result_idx")] << " = " <<
+                keys_first[k.var<uint_>("b2_start")] << ";\n";
+    if(sort_by_key){
+        k <<
+        " " << values_result[k.var<uint_>("result_idx")] << " = " <<
+                values_first[k.var<uint_>("b2_start")] << ";\n";
+    }
+    k <<
+        " b2_start++;\n" <<
+        " result_idx++;\n" <<
+        "}\n";
+
+    const context &context = queue.get_context();
+    ::boost::compute::kernel kernel = k.compile(context);
+    kernel.set_arg(count_arg, static_cast<const uint_>(count));
+    kernel.set_arg(block_size_arg, static_cast<uint_>(block_size));
+
+    // One work-item per pair of blocks: ceil(count / (2 * block_size)),
+    // computed with integers. A float-based ceil loses precision for counts
+    // above 2^24 and could leave the last pair of blocks unmerged.
+    const size_t pair_size = 2 * block_size;
+    const size_t global_size =
+        count / pair_size + ((count % pair_size != 0) ? 1 : 0);
+    queue.enqueue_1d_range_kernel(kernel, 0, global_size, 0);
+}
+
+/// Keys-only overload: merges adjacent sorted blocks from \p first into
+/// \p result by calling the key/value overload with sort-by-key disabled.
+///
+/// The \p sort_by_key argument is ignored; \c false is always forwarded.
+template<class Iterator, class Compare>
+inline void merge_blocks(Iterator first,
+                         Iterator result,
+                         Compare compare,
+                         size_t count,
+                         const size_t block_size,
+                         const bool sort_by_key,
+                         command_queue &queue)
+{
+    (void) sort_by_key;
+
+    // placeholder for the value iterators, which are not used when the
+    // merge is not by key
+    Iterator unused_values;
+    merge_blocks(first, unused_values,
+                 result, unused_values,
+                 compare, count, block_size, false, queue);
+}
+
+/// Merges adjacent pairs of sorted blocks of \p block_size elements from
+/// \p first into \p result, choosing between two strategies.
+///
+/// When at most \p blocks_no_threshold blocks remain and \p count is at
+/// least \p input_size_threshold, every pair of blocks is merged with
+/// merge_with_merge_path(); otherwise a single merge_blocks() kernel
+/// handles all pairs.
+///
+/// \p block_size must be greater than zero.
+template<class Iterator, class Compare>
+inline void dispatch_merge_blocks(Iterator first,
+                                  Iterator result,
+                                  Compare compare,
+                                  size_t count,
+                                  const size_t block_size,
+                                  const size_t input_size_threshold,
+                                  const size_t blocks_no_threshold,
+                                  command_queue &queue)
+{
+    // ceil(count / block_size) with integer arithmetic (exact for any count)
+    const size_t blocks_no =
+        count / block_size + ((count % block_size != 0) ? 1 : 0);
+
+    // merge with merge path should be used only for large arrays and at the
+    // end of the merging part, when only a few big blocks are left to merge
+    if(blocks_no <= blocks_no_threshold && count >= input_size_threshold){
+        Iterator last = first + count;
+        for(size_t i = 0; i < count; i+= 2*block_size)
+        {
+            // [first1, last1) and [first2, last2) are the two neighbouring
+            // blocks, both clamped to the end of the input
+            Iterator first1 = (std::min)(first + i, last);
+            Iterator last1 = (std::min)(first1 + block_size, last);
+            Iterator first2 = last1;
+            Iterator last2 = (std::min)(first2 + block_size, last);
+            Iterator block_result = (std::min)(result + i, result + count);
+            merge_with_merge_path(first1, last1, first2, last2,
+                                  block_result, compare, queue);
+        }
+    }
+    else {
+        merge_blocks(first, result, compare, count, block_size, false, queue);
+    }
+}
+
+/// Sorts each consecutive block of \p block_size elements of the \p count
+/// keys at \p keys_first with a stable insertion sort, one work-item per
+/// block (the last block is clamped to \p count).
+///
+/// When \p sort_by_key is true the values at \p values_first are moved
+/// together with their keys; otherwise \p values_first is not referenced by
+/// the generated kernel.
+///
+/// \p block_size must be greater than zero.
+template<class KeyIterator, class ValueIterator, class Compare>
+inline void block_insertion_sort(KeyIterator keys_first,
+                                 ValueIterator values_first,
+                                 Compare compare,
+                                 const size_t count,
+                                 const size_t block_size,
+                                 const bool sort_by_key,
+                                 command_queue &queue)
+{
+    (void) values_first;
+
+    typedef typename std::iterator_traits<KeyIterator>::value_type K;
+    typedef typename std::iterator_traits<ValueIterator>::value_type T;
+
+    meta_kernel k("merge_sort_on_cpu_block_insertion_sort");
+    size_t count_arg = k.add_arg<uint_>("count");
+    size_t block_size_arg = k.add_arg<uint_>("block_size");
+
+    k <<
+        k.decl<uint_>("start") << " = get_global_id(0) * block_size;\n" <<
+        k.decl<uint_>("end") << " = min(count, start + block_size);\n" <<
+
+        // block insertion sort (stable)
+        "for(uint i = start+1; i < end; i++){\n" <<
+        " " << k.decl<const K>("key") << " = " <<
+                keys_first[k.var<uint_>("i")] << ";\n";
+    if(sort_by_key){
+        k <<
+        " " << k.decl<const T>("value") << " = " <<
+                values_first[k.var<uint_>("i")] << ";\n";
+    }
+    k <<
+        " uint pos = i;\n" <<
+        " while(pos > start && " <<
+                   compare(k.var<const K>("key"),
+                           keys_first[k.var<uint_>("pos-1")]) << "){\n" <<
+        " " << keys_first[k.var<uint_>("pos")] << " = " <<
+                keys_first[k.var<uint_>("pos-1")] << ";\n";
+    if(sort_by_key){
+        k <<
+        " " << values_first[k.var<uint_>("pos")] << " = " <<
+                values_first[k.var<uint_>("pos-1")] << ";\n";
+    }
+    k <<
+        " pos--;\n" <<
+        " }\n" <<
+        " " << keys_first[k.var<uint_>("pos")] << " = key;\n";
+    if(sort_by_key) {
+        k <<
+        " " << values_first[k.var<uint_>("pos")] << " = value;\n";
+    }
+    k <<
+        "}\n"; // block insertion sort
+
+    const context &context = queue.get_context();
+    ::boost::compute::kernel kernel = k.compile(context);
+    kernel.set_arg(count_arg, static_cast<uint_>(count));
+    kernel.set_arg(block_size_arg, static_cast<uint_>(block_size));
+
+    // One work-item per block: ceil(count / block_size), computed with
+    // integers. A float-based ceil loses precision for counts above 2^24
+    // and could leave the last block unsorted.
+    const size_t global_size =
+        count / block_size + ((count % block_size != 0) ? 1 : 0);
+    queue.enqueue_1d_range_kernel(kernel, 0, global_size, 0);
+}
+
+/// Keys-only overload: insertion-sorts each block of \p block_size elements
+/// by calling the key/value overload with sort-by-key disabled.
+template<class Iterator, class Compare>
+inline void block_insertion_sort(Iterator first,
+                                 Compare compare,
+                                 const size_t count,
+                                 const size_t block_size,
+                                 command_queue &queue)
+{
+    // placeholder for the value iterator, which is not used when the sort
+    // is not by key
+    Iterator unused_values;
+    block_insertion_sort(
+        first, unused_values, compare, count, block_size, false, queue
+    );
+}
+
+/// Sorts [first, last) according to \p compare with a bottom-up merge sort.
+/// This sort is stable.
+///
+/// Ranges of at most 512 elements are sorted as a single insertion-sort
+/// block. Larger ranges are first insertion-sorted in blocks of
+/// "insertion_sort_block_size" elements (cached parameter, default 64);
+/// the sorted blocks are then merged pairwise, doubling the block size on
+/// every pass and alternating between the input range and a temporary
+/// vector as destination.
+template<class Iterator, class Compare>
+inline void merge_sort_on_cpu(Iterator first,
+                              Iterator last,
+                              Compare compare,
+                              command_queue &queue)
+{
+    typedef typename std::iterator_traits<Iterator>::value_type value_type;
+
+    const size_t count = iterator_range_size(first, last);
+
+    // nothing to do for empty and single-element ranges
+    if(count < 2){
+        return;
+    }
+
+    // for small input size only insertion sort is performed
+    if(count <= 512){
+        block_insertion_sort(first, compare, count, count, queue);
+        return;
+    }
+
+    const context &context = queue.get_context();
+    const device &device = queue.get_device();
+
+    // loading parameters
+    const std::string cache_key =
+        std::string("__boost_merge_sort_on_cpu_") + type_name<value_type>();
+    boost::shared_ptr<parameter_cache> parameters =
+        detail::parameter_cache::get_global_cache(device);
+
+    // merge_with_merge_path() is used to merge sorted blocks once no more
+    // than blocks_no_limit blocks are left AND the input holds at least
+    // input_size_limit elements; merge_blocks() is used otherwise.
+    const size_t blocks_no_limit =
+        parameters->get(cache_key, "merge_with_merge_path_blocks_no_threshold", 8);
+    const size_t input_size_limit =
+        parameters->get(cache_key, "merge_with_merge_path_input_size_threshold", 2097152);
+
+    const size_t block_size =
+        parameters->get(cache_key, "insertion_sort_block_size", 64);
+    block_insertion_sort(first, compare, count, block_size, queue);
+
+    // temporary buffer for merge result
+    vector<value_type> temp(count, context);
+
+    // tracks which buffer holds the most recently merged data
+    bool data_in_temp = false;
+
+    for(size_t merged_size = block_size; merged_size < count; merged_size *= 2){
+        if(data_in_temp) {
+            dispatch_merge_blocks(temp.begin(), first, compare, count,
+                                  merged_size,
+                                  input_size_limit,
+                                  blocks_no_limit,
+                                  queue);
+        } else {
+            dispatch_merge_blocks(first, temp.begin(), compare, count,
+                                  merged_size,
+                                  input_size_limit,
+                                  blocks_no_limit,
+                                  queue);
+        }
+        data_in_temp = !data_in_temp;
+    }
+
+    // an odd number of passes leaves the result in the temporary buffer
+    if(data_in_temp) {
+        copy(temp.begin(), temp.end(), first, queue);
+    }
+}
+
+/// Sorts the keys in [keys_first, keys_last) according to \p compare with a
+/// bottom-up merge sort and reorders the values starting at \p values_first
+/// the same way. This sort is stable.
+///
+/// Ranges of at most 512 keys are sorted as a single insertion-sort block.
+/// Larger ranges are insertion-sorted in blocks of
+/// "insertion_sort_by_key_block_size" elements (cached parameter, default
+/// 64) and the blocks are then merged pairwise with merge_blocks(),
+/// alternating between the input ranges and temporary vectors.
+template<class KeyIterator, class ValueIterator, class Compare>
+inline void merge_sort_by_key_on_cpu(KeyIterator keys_first,
+                                     KeyIterator keys_last,
+                                     ValueIterator values_first,
+                                     Compare compare,
+                                     command_queue &queue)
+{
+    typedef typename std::iterator_traits<KeyIterator>::value_type key_type;
+    typedef typename std::iterator_traits<ValueIterator>::value_type value_type;
+
+    const size_t count = iterator_range_size(keys_first, keys_last);
+
+    // nothing to do for empty and single-element ranges
+    if(count < 2){
+        return;
+    }
+
+    // for small input size only insertion sort is performed
+    if(count <= 512){
+        block_insertion_sort(keys_first, values_first, compare,
+                             count, count, true, queue);
+        return;
+    }
+
+    const context &context = queue.get_context();
+    const device &device = queue.get_device();
+
+    // loading parameters
+    const std::string cache_key =
+        std::string("__boost_merge_sort_by_key_on_cpu_") + type_name<value_type>()
+        + "_with_" + type_name<key_type>();
+    boost::shared_ptr<parameter_cache> parameters =
+        detail::parameter_cache::get_global_cache(device);
+
+    const size_t block_size =
+        parameters->get(cache_key, "insertion_sort_by_key_block_size", 64);
+    block_insertion_sort(keys_first, values_first, compare,
+                         count, block_size, true, queue);
+
+    // temporary buffers for merge results
+    vector<value_type> values_temp(count, context);
+    vector<key_type> keys_temp(count, context);
+
+    // tracks which buffers hold the most recently merged data
+    bool data_in_temp = false;
+
+    for(size_t merged_size = block_size; merged_size < count; merged_size *= 2){
+        if(data_in_temp) {
+            merge_blocks(keys_temp.begin(), values_temp.begin(),
+                         keys_first, values_first,
+                         compare, count, merged_size, true, queue);
+        } else {
+            merge_blocks(keys_first, values_first,
+                         keys_temp.begin(), values_temp.begin(),
+                         compare, count, merged_size, true, queue);
+        }
+        data_in_temp = !data_in_temp;
+    }
+
+    // an odd number of passes leaves the result in the temporary buffers
+    if(data_in_temp) {
+        copy(keys_temp.begin(), keys_temp.end(), keys_first, queue);
+        copy(values_temp.begin(), values_temp.end(), values_first, queue);
+    }
+}
+
+} // end detail namespace
+} // end compute namespace
+} // end boost namespace
+
+#endif // BOOST_COMPUTE_ALGORITHM_DETAIL_MERGE_SORT_ON_CPU_HPP
diff --git a/boost/compute/algorithm/detail/merge_with_merge_path.hpp b/boost/compute/algorithm/detail/merge_with_merge_path.hpp
new file mode 100644
index 0000000000..c3cc5e8e9c
--- /dev/null
+++ b/boost/compute/algorithm/detail/merge_with_merge_path.hpp
@@ -0,0 +1,203 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2014 Roshan <thisisroshansmail@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_MERGE_WIH_MERGE_PATH_HPP
+#define BOOST_COMPUTE_ALGORITHM_DETAIL_MERGE_WIH_MERGE_PATH_HPP
+
+#include <iterator>
+
+#include <boost/compute/algorithm/detail/merge_path.hpp>
+#include <boost/compute/algorithm/fill_n.hpp>
+#include <boost/compute/container/vector.hpp>
+#include <boost/compute/detail/iterator_range_size.hpp>
+#include <boost/compute/detail/meta_kernel.hpp>
+#include <boost/compute/system.hpp>
+
+namespace boost {
+namespace compute {
+namespace detail {
+
+///
+/// \brief Serial merge kernel class
+///
+/// Subclass of meta_kernel that generates the serial merge step run after
+/// tiling. Work-item \c i merges the slice [tile_first1[i], tile_first1[i+1])
+/// of the first input with the slice [tile_first2[i], tile_first2[i+1]) of the
+/// second input and writes the merged run to the output starting at index
+/// \c i * tile_size.
+///
+class serial_merge_kernel : meta_kernel
+{
+public:
+    /// Number of output elements per tile. The value is baked into the
+    /// generated source by set_range(), so assign it before calling set_range().
+    unsigned int tile_size;
+
+    /// Creates the kernel with a tile size of 4 and no work scheduled.
+    serial_merge_kernel()
+        : meta_kernel("merge"),
+          tile_size(4),
+          m_count(0) // exec() reads m_count, so it must never be indeterminate
+    {
+    }
+
+    /// Generates the kernel source for merging with comparator \p comp.
+    ///
+    /// \param first1 start of the first sorted input
+    /// \param first2 start of the second sorted input
+    /// \param tile_first1 start of the tile boundaries into the first input
+    /// \param tile_last1 end of the tile boundaries into the first input
+    /// \param tile_first2 start of the tile boundaries into the second input
+    /// \param result start of the output range
+    /// \param comp comparator emitted into the kernel as comp(b, a)
+    template<class InputIterator1, class InputIterator2,
+             class InputIterator3, class InputIterator4,
+             class OutputIterator, class Compare>
+    void set_range(InputIterator1 first1,
+                   InputIterator2 first2,
+                   InputIterator3 tile_first1,
+                   InputIterator3 tile_last1,
+                   InputIterator4 tile_first2,
+                   OutputIterator result,
+                   Compare comp)
+    {
+        // one work-item per pair of adjacent boundaries; an empty boundary
+        // range must yield zero work-items instead of wrapping around
+        const size_t boundary_count =
+            iterator_range_size(tile_first1, tile_last1);
+        m_count = (boundary_count > 0) ? boundary_count - 1 : 0;
+
+        // the element of the first input is taken unless comp(second, first)
+        // holds, so equivalent elements from the first input are written first
+        *this <<
+            "uint i = get_global_id(0);\n" <<
+            "uint start1 = " << tile_first1[expr<uint_>("i")] << ";\n" <<
+            "uint end1 = " << tile_first1[expr<uint_>("i+1")] << ";\n" <<
+            "uint start2 = " << tile_first2[expr<uint_>("i")] << ";\n" <<
+            "uint end2 = " << tile_first2[expr<uint_>("i+1")] << ";\n" <<
+            "uint index = i*" << tile_size << ";\n" <<
+            "while(start1<end1 && start2<end2)\n" <<
+            "{\n" <<
+            " if(!(" << comp(first2[expr<uint_>("start2")],
+                             first1[expr<uint_>("start1")]) << "))\n" <<
+            " {\n" <<
+            result[expr<uint_>("index")] <<
+            " = " << first1[expr<uint_>("start1")] << ";\n" <<
+            " index++;\n" <<
+            " start1++;\n" <<
+            " }\n" <<
+            " else\n" <<
+            " {\n" <<
+            result[expr<uint_>("index")] <<
+            " = " << first2[expr<uint_>("start2")] << ";\n" <<
+            " index++;\n" <<
+            " start2++;\n" <<
+            " }\n" <<
+            "}\n" <<
+            // drain whatever is left of the first slice
+            "while(start1<end1)\n" <<
+            "{\n" <<
+            result[expr<uint_>("index")] <<
+            " = " << first1[expr<uint_>("start1")] << ";\n" <<
+            " index++;\n" <<
+            " start1++;\n" <<
+            "}\n" <<
+            // drain whatever is left of the second slice
+            "while(start2<end2)\n" <<
+            "{\n" <<
+            result[expr<uint_>("index")] <<
+            " = " << first2[expr<uint_>("start2")] << ";\n" <<
+            " index++;\n" <<
+            " start2++;\n" <<
+            "}\n";
+    }
+
+    /// \overload
+    ///
+    /// Uses ::boost::compute::less over the value type of \c InputIterator1
+    /// as the comparator.
+    template<class InputIterator1, class InputIterator2,
+             class InputIterator3, class InputIterator4,
+             class OutputIterator>
+    void set_range(InputIterator1 first1,
+                   InputIterator2 first2,
+                   InputIterator3 tile_first1,
+                   InputIterator3 tile_last1,
+                   InputIterator4 tile_first2,
+                   OutputIterator result)
+    {
+        typedef typename std::iterator_traits<InputIterator1>::value_type value_type;
+        ::boost::compute::less<value_type> less_than;
+        set_range(first1, first2, tile_first1, tile_last1, tile_first2, result, less_than);
+    }
+
+    /// Runs the generated kernel on \p queue through exec_1d(queue, 0, m_count).
+    /// Returns a default-constructed event when there is nothing to merge.
+    event exec(command_queue &queue)
+    {
+        if(m_count == 0) {
+            return event();
+        }
+
+        return exec_1d(queue, 0, m_count);
+    }
+
+private:
+    // number of tiles to merge (one work-item each); set by set_range()
+    size_t m_count;
+};
+
+///
+/// \brief Merge algorithm with merge path
+///
+/// Merges the sorted values in the range [\p first1, \p last1) with
+/// the sorted values in the range [\p first2, \p last2) and stores the
+/// result in the range beginning at \p result. The inputs are first split
+/// into tiles by merge_path_kernel, then each tile is merged by
+/// serial_merge_kernel.
+///
+/// \param first1 Iterator pointing to start of first set
+/// \param last1 Iterator pointing to end of first set
+/// \param first2 Iterator pointing to start of second set
+/// \param last2 Iterator pointing to end of second set
+/// \param result Iterator pointing to start of range in which the result
+/// will be stored
+/// \param comp Comparator which performs less than function
+/// \param queue Queue on which to execute
+///
+/// \return \p result advanced by the combined size of both input ranges
+///
+template<class InputIterator1, class InputIterator2, class OutputIterator, class Compare>
+inline OutputIterator
+merge_with_merge_path(InputIterator1 first1,
+                      InputIterator1 last1,
+                      InputIterator2 first2,
+                      InputIterator2 last2,
+                      OutputIterator result,
+                      Compare comp,
+                      command_queue &queue = system::default_queue())
+{
+    typedef typename
+        std::iterator_traits<OutputIterator>::difference_type offset_type;
+
+    const size_t elements_per_tile = 1024;
+
+    const size_t size1 = iterator_range_size(first1, last1);
+    const size_t size2 = iterator_range_size(first2, last2);
+    const size_t total = size1 + size2;
+
+    // one boundary per tile (rounded up) plus the closing boundary
+    const size_t boundary_count =
+        (total + elements_per_tile - 1) / elements_per_tile + 1;
+
+    vector<uint_> bounds1(boundary_count, queue.get_context());
+    vector<uint_> bounds2(boundary_count, queue.get_context());
+
+    // partition both inputs; the kernel is handed the boundaries after the
+    // first one, which is written separately as 0
+    merge_path_kernel partitioner;
+    partitioner.tile_size = static_cast<unsigned int>(elements_per_tile);
+    partitioner.set_range(first1, last1, first2, last2,
+                          bounds1.begin()+1, bounds2.begin()+1, comp);
+    fill_n(bounds1.begin(), 1, uint_(0), queue);
+    fill_n(bounds2.begin(), 1, uint_(0), queue);
+    partitioner.exec(queue);
+
+    // the closing boundary of each input is its full length
+    fill_n(bounds1.end()-1, 1, static_cast<uint_>(size1), queue);
+    fill_n(bounds2.end()-1, 1, static_cast<uint_>(size2), queue);
+
+    // merge every tile serially
+    serial_merge_kernel merger;
+    merger.tile_size = static_cast<unsigned int>(elements_per_tile);
+    merger.set_range(first1, first2, bounds1.begin(), bounds1.end(),
+                     bounds2.begin(), result, comp);
+
+    merger.exec(queue);
+
+    return result + static_cast<offset_type>(total);
+}
+
+/// \overload
+///
+/// Merges with ::boost::compute::less over the value type of
+/// \c InputIterator1 as the comparator.
+template<class InputIterator1, class InputIterator2, class OutputIterator>
+inline OutputIterator
+merge_with_merge_path(InputIterator1 first1,
+                      InputIterator1 last1,
+                      InputIterator2 first2,
+                      InputIterator2 last2,
+                      OutputIterator result,
+                      command_queue &queue = system::default_queue())
+{
+    typedef typename std::iterator_traits<InputIterator1>::value_type element_type;
+
+    return merge_with_merge_path(first1, last1, first2, last2, result,
+                                 ::boost::compute::less<element_type>(), queue);
+}
+
+} //end detail namespace
+} //end compute namespace
+} //end boost namespace
+
+#endif // BOOST_COMPUTE_ALGORITHM_DETAIL_MERGE_WIH_MERGE_PATH_HPP
diff --git a/boost/compute/algorithm/detail/radix_sort.hpp b/boost/compute/algorithm/detail/radix_sort.hpp
new file mode 100644
index 0000000000..c2ba4ed17c
--- /dev/null
+++ b/boost/compute/algorithm/detail/radix_sort.hpp
@@ -0,0 +1,415 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_RADIX_SORT_HPP
+#define BOOST_COMPUTE_ALGORITHM_DETAIL_RADIX_SORT_HPP
+
+#include <iterator>
+
+#include <boost/assert.hpp>
+#include <boost/type_traits/is_signed.hpp>
+#include <boost/type_traits/is_floating_point.hpp>
+
+#include <boost/compute/kernel.hpp>
+#include <boost/compute/program.hpp>
+#include <boost/compute/command_queue.hpp>
+#include <boost/compute/algorithm/exclusive_scan.hpp>
+#include <boost/compute/container/vector.hpp>
+#include <boost/compute/detail/iterator_range_size.hpp>
+#include <boost/compute/detail/parameter_cache.hpp>
+#include <boost/compute/type_traits/type_name.hpp>
+#include <boost/compute/type_traits/is_fundamental.hpp>
+#include <boost/compute/type_traits/is_vector_type.hpp>
+#include <boost/compute/utility/program_cache.hpp>
+
+namespace boost {
+namespace compute {
+namespace detail {
+
+// meta-function returning true if type T is radix-sortable
+//
+// T qualifies when ::boost::compute::is_fundamental<T> is true and
+// is_vector_type<T> is false; both conditions are combined with
+// boost::mpl::and_ / boost::mpl::not_.
+template<class T>
+struct is_radix_sortable :
+ boost::mpl::and_<
+ typename ::boost::compute::is_fundamental<T>::type,
+ typename boost::mpl::not_<typename is_vector_type<T>::type>::type
+ >
+{
+};
+
+// Maps a key size in bytes (N) to the integer type the radix sort kernels use
+// for keys of that size; radix_sort_impl looks it up with N = sizeof(T).
+//
+// The primary template deliberately defines no nested `type`, so requesting
+// ::type for a size other than 1, 2, 4 or 8 fails to compile.
+// NOTE(review): uchar_/ushort_/uint_/ulong_ are presumably the unsigned scalar
+// types of matching width - confirm against their declarations.
+template<size_t N>
+struct radix_sort_value_type
+{
+};
+
+// 1-byte keys
+template<>
+struct radix_sort_value_type<1>
+{
+ typedef uchar_ type;
+};
+
+// 2-byte keys
+template<>
+struct radix_sort_value_type<2>
+{
+ typedef ushort_ type;
+};
+
+// 4-byte keys
+template<>
+struct radix_sort_value_type<4>
+{
+ typedef uint_ type;
+};
+
+// 8-byte keys
+template<>
+struct radix_sort_value_type<8>
+{
+ typedef ulong_ type;
+};
+
+// Returns the compiler-option fragment that defines the T2_double macro read
+// at the top of radix_sort_source. The generic version defines it as 0.
+template<typename T>
+inline const char* enable_double()
+{
+ return " -DT2_double=0";
+}
+
+// Specialization for double: defines T2_double as 1, which makes
+// radix_sort_source emit the cl_khr_fp64 extension pragma.
+template<>
+inline const char* enable_double<double>()
+{
+ return " -DT2_double=1";
+}
+
+// OpenCL source for the radix sort program (kernels: count, scan, scatter).
+//
+// The source is configured through macros passed as compiler options by
+// radix_sort_impl:
+//   K_BITS            - number of key bits processed per pass
+//   T                 - integer type the keys are read as
+//   BLOCK_SIZE        - size of the local_input array in the scatter kernel
+//   IS_FLOATING_POINT - defined when the keys are floating point values
+//   IS_SIGNED         - defined when the keys are signed
+//   SORT_BY_KEY       - defined when values are moved along with the keys
+//   T2                - value type (only with SORT_BY_KEY)
+//   T2_double         - non-zero when T2 is double (enables cl_khr_fp64)
+const char radix_sort_source[] =
+"#if T2_double\n"
+"#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
+"#endif\n"
+"#define K2_BITS (1 << K_BITS)\n"
+"#define RADIX_MASK ((((T)(1)) << K_BITS) - 1)\n"
+"#define SIGN_BIT ((sizeof(T) * CHAR_BIT) - 1)\n"
+
+// radix(): returns the K_BITS-wide digit of x that starts at bit low_bit.
+// Floating point keys are XOR-ed with all ones when their sign bit is set and
+// with just the sign bit otherwise; signed keys have their sign bit flipped;
+// other keys are used as they are.
+"inline uint radix(const T x, const uint low_bit)\n"
+"{\n"
+"#if defined(IS_FLOATING_POINT)\n"
+" const T mask = -(x >> SIGN_BIT) | (((T)(1)) << SIGN_BIT);\n"
+" return ((x ^ mask) >> low_bit) & RADIX_MASK;\n"
+"#elif defined(IS_SIGNED)\n"
+" return ((x ^ (((T)(1)) << SIGN_BIT)) >> low_bit) & RADIX_MASK;\n"
+"#else\n"
+" return (x >> low_bit) & RADIX_MASK;\n"
+"#endif\n"
+"}\n"
+
+// count kernel: builds one digit histogram per work-group in local_counts and
+// stores it in global_counts at K2_BITS * group id; the last work-group also
+// copies its histogram into global_offsets.
+// NOTE(review): only work-items with lid < K2_BITS zero and write the
+// histogram, so this appears to require a work-group size of at least
+// K2_BITS - confirm.
+"__kernel void count(__global const T *input,\n"
+" const uint input_offset,\n"
+" const uint input_size,\n"
+" __global uint *global_counts,\n"
+" __global uint *global_offsets,\n"
+" __local uint *local_counts,\n"
+" const uint low_bit)\n"
+"{\n"
+ // work-item parameters
+" const uint gid = get_global_id(0);\n"
+" const uint lid = get_local_id(0);\n"
+
+ // zero local counts
+" if(lid < K2_BITS){\n"
+" local_counts[lid] = 0;\n"
+" }\n"
+" barrier(CLK_LOCAL_MEM_FENCE);\n"
+
+ // reduce local counts
+" if(gid < input_size){\n"
+" T value = input[input_offset+gid];\n"
+" uint bucket = radix(value, low_bit);\n"
+" atomic_inc(local_counts + bucket);\n"
+" }\n"
+" barrier(CLK_LOCAL_MEM_FENCE);\n"
+
+ // write block-relative offsets
+" if(lid < K2_BITS){\n"
+" global_counts[K2_BITS*get_group_id(0) + lid] = local_counts[lid];\n"
+
+ // write global offsets
+" if(get_group_id(0) == (get_num_groups(0) - 1)){\n"
+" global_offsets[lid] = local_counts[lid];\n"
+" }\n"
+" }\n"
+"}\n"
+
+// scan kernel: for every bucket adds the last block's entry of block_offsets
+// to the value already stored in global_offsets, then replaces global_offsets
+// with the running sum of those totals over the preceding buckets (an
+// exclusive prefix sum). The loop is written for a single work-item.
+"__kernel void scan(__global const uint *block_offsets,\n"
+" __global uint *global_offsets,\n"
+" const uint block_count)\n"
+"{\n"
+" __global const uint *last_block_offsets =\n"
+" block_offsets + K2_BITS * (block_count - 1);\n"
+
+ // calculate and scan global_offsets
+" uint sum = 0;\n"
+" for(uint i = 0; i < K2_BITS; i++){\n"
+" uint x = global_offsets[i] + last_block_offsets[i];\n"
+" global_offsets[i] = sum;\n"
+" sum += x;\n"
+" }\n"
+"}\n"
+
+// scatter kernel: writes every key (and, with SORT_BY_KEY, its value) to
+// global_offsets[bucket] + counts[group][bucket] + the number of earlier
+// work-items of the same work-group that fall into the same bucket.
+"__kernel void scatter(__global const T *input,\n"
+" const uint input_offset,\n"
+" const uint input_size,\n"
+" const uint low_bit,\n"
+" __global const uint *counts,\n"
+" __global const uint *global_offsets,\n"
+"#ifndef SORT_BY_KEY\n"
+" __global T *output,\n"
+" const uint output_offset)\n"
+"#else\n"
+" __global T *keys_output,\n"
+" const uint keys_output_offset,\n"
+" __global T2 *values_input,\n"
+" const uint values_input_offset,\n"
+" __global T2 *values_output,\n"
+" const uint values_output_offset)\n"
+"#endif\n"
+"{\n"
+ // work-item parameters
+" const uint gid = get_global_id(0);\n"
+" const uint lid = get_local_id(0);\n"
+
+ // copy input to local memory
+" T value;\n"
+" uint bucket;\n"
+" __local uint local_input[BLOCK_SIZE];\n"
+" if(gid < input_size){\n"
+" value = input[input_offset+gid];\n"
+" bucket = radix(value, low_bit);\n"
+" local_input[lid] = bucket;\n"
+" }\n"
+
+ // copy block counts to local memory
+" __local uint local_counts[(1 << K_BITS)];\n"
+" if(lid < K2_BITS){\n"
+" local_counts[lid] = counts[get_group_id(0) * K2_BITS + lid];\n"
+" }\n"
+
+ // wait until local memory is ready
+" barrier(CLK_LOCAL_MEM_FENCE);\n"
+
+ // work-items past the end of the input only helped fill local memory
+" if(gid >= input_size){\n"
+" return;\n"
+" }\n"
+
+ // get global offset
+" uint offset = global_offsets[bucket] + local_counts[bucket];\n"
+
+ // calculate local offset
+" uint local_offset = 0;\n"
+" for(uint i = 0; i < lid; i++){\n"
+" if(local_input[i] == bucket)\n"
+" local_offset++;\n"
+" }\n"
+
+"#ifndef SORT_BY_KEY\n"
+ // write value to output
+" output[output_offset + offset + local_offset] = value;\n"
+"#else\n"
+ // write key and value if doing sort_by_key
+" keys_output[keys_output_offset+offset + local_offset] = value;\n"
+" values_output[values_output_offset+offset + local_offset] =\n"
+" values_input[values_input_offset+gid];\n"
+"#endif\n"
+"}\n";
+
+// Radix-sorts the keys in [first, last) on the device, least significant
+// digit first, k bits per pass.
+//
+// first, last   - key range to sort
+// values_first  - start of the values that accompany the keys; when the
+//                 iterator refers to a valid buffer the values are moved
+//                 together with their keys (sort by key), otherwise only the
+//                 keys are sorted (radix_sort passes a default-constructed
+//                 iterator for that purpose)
+// queue         - command queue used for every kernel launch
+//
+// Each pass runs the count, scan and scatter kernels of radix_sort_source and
+// then swaps the roles of the caller's buffers and the temporary buffers.
+// There is no copy-back after the loop: the sorted data ends up in the
+// caller's buffers because the number of passes,
+// sizeof(sort_type) * CHAR_BIT / k, is even for every accepted k (1, 2, 4)
+// and every supported key size (1, 2, 4, 8 bytes).
+template<class T, class T2>
+inline void radix_sort_impl(const buffer_iterator<T> first,
+                            const buffer_iterator<T> last,
+                            const buffer_iterator<T2> values_first,
+                            command_queue &queue)
+{
+    typedef T value_type;
+    typedef typename radix_sort_value_type<sizeof(T)>::type sort_type;
+
+    // an empty range is already sorted; going on would enqueue kernels over
+    // zero work-items and make the scan kernel index block (block_count - 1)
+    // with block_count == 0
+    const size_t count = detail::iterator_range_size(first, last);
+    if(count == 0){
+        return;
+    }
+
+    const device &device = queue.get_device();
+    const context &context = queue.get_context();
+
+    // if we have a valid values iterator then we are doing a
+    // sort by key and have to set up the values buffer
+    bool sort_by_key = (values_first.get_buffer().get() != 0);
+
+    // load (or create) radix sort program
+    std::string cache_key =
+        std::string("__boost_radix_sort_") + type_name<value_type>();
+
+    if(sort_by_key){
+        cache_key += std::string("_with_") + type_name<T2>();
+    }
+
+    boost::shared_ptr<program_cache> cache =
+        program_cache::get_global_cache(context);
+    boost::shared_ptr<parameter_cache> parameters =
+        detail::parameter_cache::get_global_cache(device);
+
+    // sort parameters: k bits per pass (k2 buckets) and work-group size
+    const uint_ k = parameters->get(cache_key, "k", 4);
+    const uint_ k2 = 1 << k;
+    const uint_ block_size = parameters->get(cache_key, "tpb", 128);
+
+    // sort program compiler options
+    std::stringstream options;
+    options << "-DK_BITS=" << k;
+    options << " -DT=" << type_name<sort_type>();
+    options << " -DBLOCK_SIZE=" << block_size;
+
+    if(boost::is_floating_point<value_type>::value){
+        options << " -DIS_FLOATING_POINT";
+    }
+
+    if(boost::is_signed<value_type>::value){
+        options << " -DIS_SIGNED";
+    }
+
+    if(sort_by_key){
+        options << " -DSORT_BY_KEY";
+        options << " -DT2=" << type_name<T2>();
+        options << enable_double<T2>();
+    }
+
+    // load radix sort program
+    program radix_sort_program = cache->get_or_build(
+        cache_key, options.str(), radix_sort_source, context
+    );
+
+    kernel count_kernel(radix_sort_program, "count");
+    kernel scan_kernel(radix_sort_program, "scan");
+    kernel scatter_kernel(radix_sort_program, "scatter");
+
+    // number of work-groups, rounded up so that every element is covered
+    uint_ block_count = static_cast<uint_>(count / block_size);
+    if(block_count * block_size != count){
+        block_count++;
+    }
+
+    // setup temporary buffers
+    vector<value_type> output(count, context);
+    vector<T2> values_output(sort_by_key ? count : 0, context);
+    vector<uint_> offsets(k2, context);
+    vector<uint_> counts(block_count * k2, context);
+
+    // the input/output roles are swapped after every pass, so both sides are
+    // tracked through pointers and offsets
+    const buffer *input_buffer = &first.get_buffer();
+    uint_ input_offset = static_cast<uint_>(first.get_index());
+    const buffer *output_buffer = &output.get_buffer();
+    uint_ output_offset = 0;
+    const buffer *values_input_buffer = &values_first.get_buffer();
+    uint_ values_input_offset = static_cast<uint_>(values_first.get_index());
+    const buffer *values_output_buffer = &values_output.get_buffer();
+    uint_ values_output_offset = 0;
+
+    for(uint_ i = 0; i < sizeof(sort_type) * CHAR_BIT / k; i++){
+        // write counts
+        count_kernel.set_arg(0, *input_buffer);
+        count_kernel.set_arg(1, input_offset);
+        count_kernel.set_arg(2, static_cast<uint_>(count));
+        count_kernel.set_arg(3, counts);
+        count_kernel.set_arg(4, offsets);
+        count_kernel.set_arg(5, block_size * sizeof(uint_), 0);
+        count_kernel.set_arg(6, i * k);
+        queue.enqueue_1d_range_kernel(count_kernel,
+                                      0,
+                                      block_count * block_size,
+                                      block_size);
+
+        // scan counts: each block's k2 counters are viewed as one vector
+        // element, so the exclusive scan runs across blocks for every bucket
+        if(k == 1){
+            typedef uint2_ counter_type;
+            ::boost::compute::exclusive_scan(
+                make_buffer_iterator<counter_type>(counts.get_buffer(), 0),
+                make_buffer_iterator<counter_type>(counts.get_buffer(), counts.size() / 2),
+                make_buffer_iterator<counter_type>(counts.get_buffer()),
+                queue
+            );
+        }
+        else if(k == 2){
+            typedef uint4_ counter_type;
+            ::boost::compute::exclusive_scan(
+                make_buffer_iterator<counter_type>(counts.get_buffer(), 0),
+                make_buffer_iterator<counter_type>(counts.get_buffer(), counts.size() / 4),
+                make_buffer_iterator<counter_type>(counts.get_buffer()),
+                queue
+            );
+        }
+        else if(k == 4){
+            typedef uint16_ counter_type;
+            ::boost::compute::exclusive_scan(
+                make_buffer_iterator<counter_type>(counts.get_buffer(), 0),
+                make_buffer_iterator<counter_type>(counts.get_buffer(), counts.size() / 16),
+                make_buffer_iterator<counter_type>(counts.get_buffer()),
+                queue
+            );
+        }
+        else {
+            BOOST_ASSERT(false && "unknown k");
+            break;
+        }
+
+        // scan global offsets
+        scan_kernel.set_arg(0, counts);
+        scan_kernel.set_arg(1, offsets);
+        scan_kernel.set_arg(2, block_count);
+        queue.enqueue_task(scan_kernel);
+
+        // scatter values
+        scatter_kernel.set_arg(0, *input_buffer);
+        scatter_kernel.set_arg(1, input_offset);
+        scatter_kernel.set_arg(2, static_cast<uint_>(count));
+        scatter_kernel.set_arg(3, i * k);
+        scatter_kernel.set_arg(4, counts);
+        scatter_kernel.set_arg(5, offsets);
+        scatter_kernel.set_arg(6, *output_buffer);
+        scatter_kernel.set_arg(7, output_offset);
+        if(sort_by_key){
+            scatter_kernel.set_arg(8, *values_input_buffer);
+            scatter_kernel.set_arg(9, values_input_offset);
+            scatter_kernel.set_arg(10, *values_output_buffer);
+            scatter_kernel.set_arg(11, values_output_offset);
+        }
+        queue.enqueue_1d_range_kernel(scatter_kernel,
+                                      0,
+                                      block_count * block_size,
+                                      block_size);
+
+        // swap buffers
+        std::swap(input_buffer, output_buffer);
+        std::swap(values_input_buffer, values_output_buffer);
+        std::swap(input_offset, output_offset);
+        std::swap(values_input_offset, values_output_offset);
+    }
+}
+
+// Radix-sorts the keys in [first, last) without any accompanying values.
+template<class Iterator>
+inline void radix_sort(Iterator first,
+                       Iterator last,
+                       command_queue &queue)
+{
+    // radix_sort_impl treats a value iterator without a valid buffer as a
+    // request to sort the keys only
+    const buffer_iterator<int> no_values = buffer_iterator<int>();
+
+    radix_sort_impl(first, last, no_values, queue);
+}
+
+// Radix-sorts the keys in [keys_first, keys_last) and moves the values that
+// start at values_first along with them, by forwarding all arguments to
+// radix_sort_impl.
+template<class KeyIterator, class ValueIterator>
+inline void radix_sort_by_key(KeyIterator keys_first,
+ KeyIterator keys_last,
+ ValueIterator values_first,
+ command_queue &queue)
+{
+ radix_sort_impl(keys_first, keys_last, values_first, queue);
+}
+
+} // end detail namespace
+} // end compute namespace
+} // end boost namespace
+
+#endif // BOOST_COMPUTE_ALGORITHM_DETAIL_RADIX_SORT_HPP
diff --git a/boost/compute/algorithm/detail/random_fill.hpp b/boost/compute/algorithm/detail/random_fill.hpp
new file mode 100644
index 0000000000..5c3827a9f8
--- /dev/null
+++ b/boost/compute/algorithm/detail/random_fill.hpp
@@ -0,0 +1,57 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_RANDOM_FILL_HPP
+#define BOOST_COMPUTE_ALGORITHM_DETAIL_RANDOM_FILL_HPP
+
+#include <iterator>
+
+#include <boost/compute/command_queue.hpp>
+#include <boost/compute/random/default_random_engine.hpp>
+#include <boost/compute/random/uniform_real_distribution.hpp>
+
+namespace boost {
+namespace compute {
+namespace detail {
+
+// Fills the range [first, last) using the caller-supplied generator g by
+// calling g.fill(first, last, queue).
+template<class OutputIterator, class Generator>
+inline void random_fill(OutputIterator first,
+ OutputIterator last,
+ Generator &g,
+ command_queue &queue)
+{
+ g.fill(first, last, queue);
+}
+
+// Fills the range [first, last) with values produced by a
+// uniform_real_distribution constructed from (lo, hi), drawing from a
+// default_random_engine created on queue.
+template<class OutputIterator>
+inline void
+random_fill(OutputIterator first,
+            OutputIterator last,
+            typename std::iterator_traits<OutputIterator>::value_type lo,
+            typename std::iterator_traits<OutputIterator>::value_type hi,
+            command_queue &queue)
+{
+    typedef typename
+        std::iterator_traits<OutputIterator>::value_type element_type;
+
+    // the engine is created first, then the distribution that consumes it
+    boost::compute::default_random_engine rng(queue);
+    boost::compute::uniform_real_distribution<element_type> distribution(lo, hi);
+
+    distribution.fill(first, last, rng, queue);
+}
+
+} // end detail namespace
+} // end compute namespace
+} // end boost namespace
+
+#endif // BOOST_COMPUTE_ALGORITHM_DETAIL_RANDOM_FILL_HPP
diff --git a/boost/compute/algorithm/detail/reduce_by_key.hpp b/boost/compute/algorithm/detail/reduce_by_key.hpp
new file mode 100644
index 0000000000..65844c9ebf
--- /dev/null
+++ b/boost/compute/algorithm/detail/reduce_by_key.hpp
@@ -0,0 +1,119 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2015 Jakub Szuppe <j.szuppe@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_REDUCE_BY_KEY_HPP
+#define BOOST_COMPUTE_ALGORITHM_DETAIL_REDUCE_BY_KEY_HPP
+
+#include <algorithm>
+#include <iterator>
+
+#include <boost/compute/command_queue.hpp>
+#include <boost/compute/functional.hpp>
+#include <boost/compute/container/vector.hpp>
+#include <boost/compute/detail/iterator_range_size.hpp>
+#include <boost/compute/algorithm/detail/serial_reduce_by_key.hpp>
+#include <boost/compute/algorithm/detail/reduce_by_key_with_scan.hpp>
+#include <boost/compute/type_traits.hpp>
+
+namespace boost {
+namespace compute {
+namespace detail {
+
+// Reduce-by-key path selected by dispatch_reduce_by_key when
+// reduce_by_key_on_gpu_requirements_met() holds. Forwards every argument to
+// detail::reduce_by_key_with_scan and returns its result, which
+// dispatch_reduce_by_key uses as the number of elements written to the output
+// ranges.
+template<class InputKeyIterator, class InputValueIterator,
+ class OutputKeyIterator, class OutputValueIterator,
+ class BinaryFunction, class BinaryPredicate>
+size_t reduce_by_key_on_gpu(InputKeyIterator keys_first,
+ InputKeyIterator keys_last,
+ InputValueIterator values_first,
+ OutputKeyIterator keys_result,
+ OutputValueIterator values_result,
+ BinaryFunction function,
+ BinaryPredicate predicate,
+ command_queue &queue)
+{
+ return detail::reduce_by_key_with_scan(keys_first, keys_last, values_first,
+ keys_result, values_result, function,
+ predicate, queue);
+}
+
+// Returns true when reduce_by_key_on_gpu() should be used for this input:
+// more than 256 keys, a device whose type does not include device::cpu, and
+// reduce_by_key_with_scan_requirements_met() accepting the arguments.
+template<class InputKeyIterator, class InputValueIterator,
+         class OutputKeyIterator, class OutputValueIterator>
+bool reduce_by_key_on_gpu_requirements_met(InputKeyIterator keys_first,
+                                           InputValueIterator values_first,
+                                           OutputKeyIterator keys_result,
+                                           OutputValueIterator values_result,
+                                           const size_t count,
+                                           command_queue &queue)
+{
+    const device &device = queue.get_device();
+
+    // checks are ordered from cheapest to most involved and short-circuit
+    return (count > 256)
+        && !(device.type() & device::cpu)
+        && reduce_by_key_with_scan_requirements_met(keys_first, values_first,
+                                                    keys_result, values_result,
+                                                    count, queue);
+}
+
+// Reduces runs of consecutive keys that \p predicate considers equal,
+// combining their values with \p function, and picks the implementation:
+//  - fewer than two keys: the input is copied to the output unchanged,
+//  - reduce_by_key_on_gpu_requirements_met(): reduce_by_key_on_gpu(),
+//  - otherwise: serial_reduce_by_key().
+//
+// Returns the pair of iterators one past the last key and the last value
+// written to keys_result / values_result.
+template<class InputKeyIterator, class InputValueIterator,
+         class OutputKeyIterator, class OutputValueIterator,
+         class BinaryFunction, class BinaryPredicate>
+inline std::pair<OutputKeyIterator, OutputValueIterator>
+dispatch_reduce_by_key(InputKeyIterator keys_first,
+                       InputKeyIterator keys_last,
+                       InputValueIterator values_first,
+                       OutputKeyIterator keys_result,
+                       OutputValueIterator values_result,
+                       BinaryFunction function,
+                       BinaryPredicate predicate,
+                       command_queue &queue)
+{
+    typedef typename
+        std::iterator_traits<OutputKeyIterator>::difference_type key_difference_type;
+    typedef typename
+        std::iterator_traits<OutputValueIterator>::difference_type value_difference_type;
+    // the pair is constructed directly: calling std::make_pair with explicit
+    // template arguments defeats its argument deduction and, since C++11,
+    // only compiles when every argument is an rvalue
+    typedef std::pair<OutputKeyIterator, OutputValueIterator> result_type;
+
+    const size_t count = detail::iterator_range_size(keys_first, keys_last);
+
+    // number of elements written to each output range
+    size_t result_size = 0;
+    if(count < 2){
+        // nothing to reduce: zero or one key is its own result
+        boost::compute::copy_n(keys_first, count, keys_result, queue);
+        boost::compute::copy_n(values_first, count, values_result, queue);
+        result_size = count;
+    }
+    else if(reduce_by_key_on_gpu_requirements_met(keys_first, values_first, keys_result,
+                                                  values_result, count, queue)){
+        result_size =
+            detail::reduce_by_key_on_gpu(keys_first, keys_last, values_first,
+                                         keys_result, values_result, function,
+                                         predicate, queue);
+    }
+    else {
+        result_size =
+            detail::serial_reduce_by_key(keys_first, keys_last, values_first,
+                                         keys_result, values_result, function,
+                                         predicate, queue);
+    }
+
+    return result_type(
+        keys_result + static_cast<key_difference_type>(result_size),
+        values_result + static_cast<value_difference_type>(result_size)
+    );
+}
+
+} // end detail namespace
+} // end compute namespace
+} // end boost namespace
+
+#endif // BOOST_COMPUTE_ALGORITHM_DETAIL_REDUCE_BY_KEY_HPP
diff --git a/boost/compute/algorithm/detail/reduce_by_key_with_scan.hpp b/boost/compute/algorithm/detail/reduce_by_key_with_scan.hpp
new file mode 100644
index 0000000000..e6852a67eb
--- /dev/null
+++ b/boost/compute/algorithm/detail/reduce_by_key_with_scan.hpp
@@ -0,0 +1,541 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2015 Jakub Szuppe <j.szuppe@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_REDUCE_BY_KEY_WITH_SCAN_HPP
+#define BOOST_COMPUTE_ALGORITHM_DETAIL_REDUCE_BY_KEY_WITH_SCAN_HPP
+
+#include <algorithm>
+#include <iterator>
+
+#include <boost/compute/command_queue.hpp>
+#include <boost/compute/functional.hpp>
+#include <boost/compute/algorithm/inclusive_scan.hpp>
+#include <boost/compute/container/vector.hpp>
+#include <boost/compute/container/detail/scalar.hpp>
+#include <boost/compute/detail/meta_kernel.hpp>
+#include <boost/compute/detail/iterator_range_size.hpp>
+#include <boost/compute/detail/read_write_single_value.hpp>
+#include <boost/compute/type_traits.hpp>
+#include <boost/compute/utility/program_cache.hpp>
+
+namespace boost {
+namespace compute {
+namespace detail {
+
+/// \internal_
+///
+/// Fills \p new_keys_first with unsigned integer keys generated from the
+/// original keys starting at \p keys_first. New keys can be distinguished by
+/// a simple equality comparison.
+///
+/// \param keys_first iterator pointing to the first key
+/// \param number_of_keys number of keys
+/// \param predicate binary predicate for key comparison
+/// \param new_keys_first iterator pointing to the new keys vector
+/// \param preferred_work_group_size preferred work group size
+/// \param queue command queue to perform the operation
+///
+/// Binary function \p predicate must take two keys as arguments and
+/// return true only if they are considered the same.
+///
+/// A 0/1 run-start flag is written per key and then inclusive-scanned, so the
+/// first new key is zero and the last is the number of key runs minus one.
+///
+/// No local memory usage.
+template<class InputKeyIterator, class BinaryPredicate>
+inline void generate_uint_keys(InputKeyIterator keys_first,
+                               size_t number_of_keys,
+                               BinaryPredicate predicate,
+                               vector<uint_>::iterator new_keys_first,
+                               size_t preferred_work_group_size,
+                               command_queue &queue)
+{
+    typedef typename
+        std::iterator_traits<InputKeyIterator>::value_type key_type;
+
+    detail::meta_kernel k("reduce_by_key_new_key_flags");
+    k.add_set_arg<const uint_>("count", uint_(number_of_keys));
+
+    k <<
+        k.decl<const uint_>("gid") << " = get_global_id(0);\n" <<
+        k.decl<uint_>("value") << " = 0;\n" <<
+        "if(gid >= count){\n return;\n}\n" <<
+        "if(gid > 0){ \n" <<
+        k.decl<key_type>("key") << " = " <<
+            keys_first[k.var<const uint_>("gid")] << ";\n" <<
+        k.decl<key_type>("previous_key") << " = " <<
+            keys_first[k.var<const uint_>("gid - 1")] << ";\n" <<
+        " value = " << predicate(k.var<key_type>("previous_key"),
+                                 k.var<key_type>("key")) <<
+            " ? 0 : 1;\n" <<
+        "}\n else {\n" <<
+        " value = 0;\n" <<
+        "}\n" <<
+        new_keys_first[k.var<const uint_>("gid")] << " = value;\n";
+
+    const context &context = queue.get_context();
+    kernel kernel = k.compile(context);
+
+    // Round up with integer arithmetic: float(number_of_keys) is inexact above
+    // 2^24 and could leave the trailing keys without a work item.
+    const size_t work_group_size = preferred_work_group_size;
+    const size_t work_groups_no =
+        (number_of_keys + work_group_size - 1) / work_group_size;
+    queue.enqueue_1d_range_kernel(kernel,
+                                  0,
+                                  work_groups_no * work_group_size,
+                                  work_group_size);
+
+    inclusive_scan(new_keys_first, new_keys_first + number_of_keys,
+                   new_keys_first, queue);
+}
+
+/// \internal_
+/// Calculates the carry-out of each work group: the key held by the group's last
+/// work item paired with the reduction (via \p function) of the values that share
+/// that key inside the group. One work item is launched per key/value pair.
+template<class InputValueIterator, class OutputValueIterator, class BinaryFunction>
+inline void carry_outs(vector<uint_>::iterator keys_first,
+ InputValueIterator values_first,
+ size_t count,
+ vector<uint_>::iterator carry_out_keys_first,
+ OutputValueIterator carry_out_values_first,
+ BinaryFunction function,
+ size_t work_group_size,
+ command_queue &queue)
+{
+ typedef typename
+ std::iterator_traits<OutputValueIterator>::value_type value_out_type;
+
+ detail::meta_kernel k("reduce_by_key_with_scan_carry_outs");
+ k.add_set_arg<const uint_>("count", uint_(count));
+ size_t local_keys_arg = k.add_arg<uint_ *>(memory_object::local_memory, "lkeys"); // bound to a work_group_size-element local buffer below
+ size_t local_vals_arg = k.add_arg<value_out_type *>(memory_object::local_memory, "lvals"); // bound to a work_group_size-element local buffer below
+
+ k <<
+ k.decl<const uint_>("gid") << " = get_global_id(0);\n" <<
+ k.decl<const uint_>("wg_size") << " = get_local_size(0);\n" <<
+ k.decl<const uint_>("lid") << " = get_local_id(0);\n" <<
+ k.decl<const uint_>("group_id") << " = get_group_id(0);\n" <<
+
+ k.decl<uint_>("key") << ";\n" <<
+ k.decl<value_out_type>("value") << ";\n" <<
+ "if(gid < count){\n" << // NOTE(review): work items past count skip this load, so their key/value stay unset while they still take part below - confirm this is harmless
+ k.var<uint_>("key") << " = " <<
+ keys_first[k.var<const uint_>("gid")] << ";\n" <<
+ k.var<value_out_type>("value") << " = " <<
+ values_first[k.var<const uint_>("gid")] << ";\n" <<
+ "lkeys[lid] = key;\n" <<
+ "lvals[lid] = value;\n" <<
+ "}\n" <<
+
+ // Hillis/Steele scan over the local key/value pairs; every work item runs it,
+ // but only the pair held by the last work item of the group is stored afterwards
+ k.decl<value_out_type>("result") << " = value;\n" <<
+ k.decl<uint_>("other_key") << ";\n" <<
+ k.decl<value_out_type>("other_value") << ";\n" <<
+
+ "for(" << k.decl<uint_>("offset") << " = 1; " <<
+ "offset < wg_size; offset *= 2){\n"
+ " barrier(CLK_LOCAL_MEM_FENCE);\n" <<
+ " if(lid >= offset){\n"
+ " other_key = lkeys[lid - offset];\n" <<
+ " if(other_key == key){\n" << // fold in the neighbour only when it carries the same key
+ " other_value = lvals[lid - offset];\n" <<
+ " result = " << function(k.var<value_out_type>("result"),
+ k.var<value_out_type>("other_value")) << ";\n" <<
+ " }\n" <<
+ " }\n" <<
+ " barrier(CLK_LOCAL_MEM_FENCE);\n" << // all reads of this pass finish before lvals is overwritten
+ " lvals[lid] = result;\n" <<
+ "}\n" <<
+
+ // save carry out (one key/value pair per work group, indexed by group id)
+ "if(lid == (wg_size - 1)){\n" <<
+ carry_out_keys_first[k.var<const uint_>("group_id")] << " = key;\n" <<
+ carry_out_values_first[k.var<const uint_>("group_id")] << " = result;\n" <<
+ "}\n";
+
+ size_t work_groups_no = static_cast<size_t>(
+ std::ceil(float(count) / work_group_size) // NOTE(review): float(count) is inexact above 2^24; the caller sizes the carry-out vectors with the same formula, so change both together
+ );
+
+ const context &context = queue.get_context();
+ kernel kernel = k.compile(context);
+ kernel.set_arg(local_keys_arg, local_buffer<uint_>(work_group_size));
+ kernel.set_arg(local_vals_arg, local_buffer<value_out_type>(work_group_size));
+
+ queue.enqueue_1d_range_kernel(kernel,
+ 0,
+ work_groups_no * work_group_size, // global size rounded up to whole work groups
+ work_group_size);
+}
+
+/// \internal_
+/// Calculates carry-ins: an inclusive scan by key (using \p function) over the carry-outs, run in a single work group.
+template<class OutputValueIterator, class BinaryFunction>
+inline void carry_ins(vector<uint_>::iterator carry_out_keys_first,
+ OutputValueIterator carry_out_values_first,
+ OutputValueIterator carry_in_values_first,
+ size_t carry_out_size,
+ BinaryFunction function,
+ size_t work_group_size,
+ command_queue &queue)
+{
+ typedef typename
+ std::iterator_traits<OutputValueIterator>::value_type value_out_type;
+
+ uint_ values_pre_work_item = static_cast<uint_>( // number of consecutive carry-outs each work item scans serially
+ std::ceil(float(carry_out_size) / work_group_size)
+ );
+
+ detail::meta_kernel k("reduce_by_key_with_scan_carry_ins");
+ k.add_set_arg<const uint_>("carry_out_size", uint_(carry_out_size));
+ k.add_set_arg<const uint_>("values_per_work_item", values_pre_work_item);
+ size_t local_keys_arg = k.add_arg<uint_ *>(memory_object::local_memory, "lkeys");
+ size_t local_vals_arg = k.add_arg<value_out_type *>(memory_object::local_memory, "lvals");
+
+ k <<
+ k.decl<uint_>("id") << " = get_global_id(0) * values_per_work_item;\n" << // first carry-out index of this work item's slice
+ k.decl<uint_>("idx") << " = id;\n" <<
+ k.decl<const uint_>("wg_size") << " = get_local_size(0);\n" <<
+ k.decl<const uint_>("lid") << " = get_local_id(0);\n" <<
+ k.decl<const uint_>("group_id") << " = get_group_id(0);\n" <<
+
+ k.decl<uint_>("key") << ";\n" <<
+ k.decl<value_out_type>("value") << ";\n" <<
+ k.decl<uint_>("previous_key") << ";\n" <<
+ k.decl<value_out_type>("result") << ";\n" <<
+
+ "if(id < carry_out_size){\n" << // seed the running pair with the slice's first carry-out; NOTE(review): otherwise previous_key/result stay unset but are still stored to local memory below - confirm
+ k.var<uint_>("previous_key") << " = " <<
+ carry_out_keys_first[k.var<const uint_>("id")] << ";\n" <<
+ k.var<value_out_type>("result") << " = " <<
+ carry_out_values_first[k.var<const uint_>("id")] << ";\n" <<
+ carry_in_values_first[k.var<const uint_>("id")] << " = result;\n" <<
+ "}\n" <<
+
+ k.decl<const uint_>("end") << " = (id + values_per_work_item) <= carry_out_size" << // exclusive end of the slice, clipped to carry_out_size
+ " ? (values_per_work_item + id) : carry_out_size;\n" <<
+
+ "for(idx = idx + 1; idx < end; idx += 1){\n" << // serial inclusive scan by key over the rest of the slice
+ " key = " << carry_out_keys_first[k.var<const uint_>("idx")] << ";\n" <<
+ " value = " << carry_out_values_first[k.var<const uint_>("idx")] << ";\n" <<
+ " if(previous_key == key){\n" <<
+ " result = " << function(k.var<value_out_type>("result"),
+ k.var<value_out_type>("value")) << ";\n" <<
+ " }\n else { \n" <<
+ " result = value;\n"
+ " }\n" <<
+ " " << carry_in_values_first[k.var<const uint_>("idx")] << " = result;\n" <<
+ " previous_key = key;\n"
+ "}\n" <<
+
+ // publish the last key and running result of this slice to local memory
+ "lkeys[lid] = previous_key;\n" <<
+ "lvals[lid] = result;\n" <<
+
+ // Hillis/Steele scan by key across the per-work-item slice totals
+ "for(" << k.decl<uint_>("offset") << " = 1; " <<
+ "offset < wg_size; offset *= 2){\n"
+ " barrier(CLK_LOCAL_MEM_FENCE);\n" <<
+ " if(lid >= offset){\n"
+ " key = lkeys[lid - offset];\n" <<
+ " if(previous_key == key){\n" <<
+ " value = lvals[lid - offset];\n" <<
+ " result = " << function(k.var<value_out_type>("result"),
+ k.var<value_out_type>("value")) << ";\n" <<
+ " }\n" <<
+ " }\n" <<
+ " barrier(CLK_LOCAL_MEM_FENCE);\n" <<
+ " lvals[lid] = result;\n" <<
+ "}\n" <<
+ "barrier(CLK_LOCAL_MEM_FENCE);\n" <<
+
+ "if(lid > 0){\n" <<
+ // load the scanned key-value pair of the previous work item
+ " previous_key = lkeys[lid - 1];\n" <<
+ " result = lvals[lid - 1];\n" <<
+ "}\n" <<
+
+ // combine the previous work item's scanned pair into this slice's carry-ins
+ "for(idx = id; idx < id + values_per_work_item; idx += 1){\n" <<
+ // make sure all carry-ins are saved in global memory
+ " barrier( CLK_GLOBAL_MEM_FENCE );\n" <<
+ " if(lid > 0 && idx < carry_out_size) {\n"
+ " key = " << carry_out_keys_first[k.var<const uint_>("idx")] << ";\n" <<
+ " value = " << carry_in_values_first[k.var<const uint_>("idx")] << ";\n" <<
+ " if(previous_key == key){\n" << // only entries whose key matches the previous slice's last key are updated
+ " value = " << function(k.var<value_out_type>("result"),
+ k.var<value_out_type>("value")) << ";\n" <<
+ " }\n" <<
+ " " << carry_in_values_first[k.var<const uint_>("idx")] << " = value;\n" <<
+ " }\n" <<
+ "}\n";
+
+
+ const context &context = queue.get_context();
+ kernel kernel = k.compile(context);
+ kernel.set_arg(local_keys_arg, local_buffer<uint_>(work_group_size));
+ kernel.set_arg(local_vals_arg, local_buffer<value_out_type>(work_group_size));
+
+ queue.enqueue_1d_range_kernel(kernel,
+ 0,
+ work_group_size, // global size equals local size: exactly one work group
+ work_group_size);
+}
+
+/// \internal_
+/// Perform final reduction by key (\p carry_in_size is currently unused).
+/// Each work item:
+/// 1. Perform local work-group reduction (Hillis/Steele scan)
+/// 2. Add carry-in (if keys are right)
+/// 3. Save reduced value if next key is different than processed one
+template<class InputKeyIterator, class InputValueIterator,
+         class OutputKeyIterator, class OutputValueIterator,
+         class BinaryFunction>
+inline void final_reduction(InputKeyIterator keys_first,
+                            InputValueIterator values_first,
+                            OutputKeyIterator keys_result,
+                            OutputValueIterator values_result,
+                            size_t count,
+                            BinaryFunction function,
+                            vector<uint_>::iterator new_keys_first,
+                            vector<uint_>::iterator carry_in_keys_first,
+                            OutputValueIterator carry_in_values_first,
+                            size_t carry_in_size,
+                            size_t work_group_size,
+                            command_queue &queue)
+{
+    typedef typename
+        std::iterator_traits<OutputValueIterator>::value_type value_out_type;
+
+    detail::meta_kernel k("reduce_by_key_with_scan_final_reduction");
+    k.add_set_arg<const uint_>("count", uint_(count));
+    size_t local_keys_arg = k.add_arg<uint_ *>(memory_object::local_memory, "lkeys");
+    size_t local_vals_arg = k.add_arg<value_out_type *>(memory_object::local_memory, "lvals");
+
+    k <<
+        k.decl<const uint_>("gid") << " = get_global_id(0);\n" <<
+        k.decl<const uint_>("wg_size") << " = get_local_size(0);\n" <<
+        k.decl<const uint_>("lid") << " = get_local_id(0);\n" <<
+        k.decl<const uint_>("group_id") << " = get_group_id(0);\n" <<
+
+        k.decl<uint_>("key") << ";\n" <<
+        k.decl<value_out_type>("value") << ";\n"
+
+        "if(gid < count){\n" <<
+        k.var<uint_>("key") << " = " <<
+            new_keys_first[k.var<const uint_>("gid")] << ";\n" <<
+        k.var<value_out_type>("value") << " = " <<
+            values_first[k.var<const uint_>("gid")] << ";\n" <<
+        "lkeys[lid] = key;\n" <<
+        "lvals[lid] = value;\n" <<
+        "}\n" <<
+
+        // Hillis/Steele scan restricted to neighbours that carry the same key
+        k.decl<value_out_type>("result") << " = value;\n" <<
+        k.decl<uint_>("other_key") << ";\n" <<
+        k.decl<value_out_type>("other_value") << ";\n" <<
+
+        "for(" << k.decl<uint_>("offset") << " = 1; " <<
+        "offset < wg_size ; offset *= 2){\n"
+        " barrier(CLK_LOCAL_MEM_FENCE);\n" <<
+        " if(lid >= offset) {\n" <<
+        " other_key = lkeys[lid - offset];\n" <<
+        " if(other_key == key){\n" <<
+        " other_value = lvals[lid - offset];\n" <<
+        " result = " << function(k.var<value_out_type>("result"),
+                                 k.var<value_out_type>("other_value")) << ";\n" <<
+        " }\n" <<
+        " }\n" <<
+        " barrier(CLK_LOCAL_MEM_FENCE);\n" <<
+        " lvals[lid] = result;\n" <<
+        "}\n" <<
+
+        "if(gid >= count) {\n return;\n};\n" <<
+
+        k.decl<const bool>("save") << " = (gid < (count - 1)) ?"
+        << new_keys_first[k.var<const uint_>("gid + 1")] << " != key" <<
+        ": true;\n" <<
+
+        // Add the carry-in of the previous work group when it ended on the same key
+        k.decl<uint_>("carry_in_key") << ";\n" <<
+        "if(group_id > 0 && save) {\n" <<
+        " carry_in_key = " << carry_in_keys_first[k.var<const uint_>("group_id - 1")] << ";\n" <<
+        " if(key == carry_in_key){\n" <<
+        " other_value = " << carry_in_values_first[k.var<const uint_>("group_id - 1")] << ";\n" <<
+        " result = " << function(k.var<value_out_type>("result"),
+                                 k.var<value_out_type>("other_value")) << ";\n" <<
+        " }\n" <<
+        "}\n" <<
+
+        // Save result only if the next key is different or it's the last element.
+        "if(save){\n" <<
+        keys_result[k.var<uint_>("key")] << " = " << keys_first[k.var<const uint_>("gid")] << ";\n" <<
+        values_result[k.var<uint_>("key")] << " = result;\n" <<
+        "}\n"
+        ;
+
+    // Integer ceiling division: float(count) is inexact above 2^24.
+    const size_t work_groups_no =
+        (count + work_group_size - 1) / work_group_size;
+
+    const context &context = queue.get_context();
+    kernel kernel = k.compile(context);
+    kernel.set_arg(local_keys_arg, local_buffer<uint_>(work_group_size));
+    kernel.set_arg(local_vals_arg, local_buffer<value_out_type>(work_group_size));
+
+    queue.enqueue_1d_range_kernel(kernel,
+                                  0,
+                                  work_groups_no * work_group_size,
+                                  work_group_size);
+}
+
+/// \internal_
+/// Returns the tuned work group size ("wgsize", default 256) capped at the device maximum.
+template<class KeyType, class ValueType>
+inline size_t get_work_group_size(const device& device)
+{
+    std::string cache_key = std::string("__boost_reduce_by_key_with_scan")
+        + "k_" + type_name<KeyType>() + "_v_" + type_name<ValueType>();
+
+    // load parameters
+    boost::shared_ptr<parameter_cache> parameters =
+        detail::parameter_cache::get_global_cache(device);
+    // (std::min): a size above CL_DEVICE_MAX_WORK_GROUP_SIZE cannot be launched
+    return (std::min)(
+        static_cast<size_t>(parameters->get(cache_key, "wgsize", 256)),
+        static_cast<size_t>(device.get_info<CL_DEVICE_MAX_WORK_GROUP_SIZE>())
+    );
+}
+
+/// \internal_
+/// Returns the number of reduced key/value pairs written to the outputs.
+/// 1. For each work group carry-out value is calculated (it's done by key-oriented
+/// Hillis/Steele scan). Carry-out is a pair of the last key processed by work
+/// group and sum of all values under this key in work group.
+/// 2. From every carry-out carry-in is calculated by performing inclusive scan
+/// by key.
+/// 3. Final reduction by key is performed (key-oriented Hillis/Steele scan),
+/// carry-in values are added where needed.
+template<class InputKeyIterator, class InputValueIterator,
+         class OutputKeyIterator, class OutputValueIterator,
+         class BinaryFunction, class BinaryPredicate>
+inline size_t reduce_by_key_with_scan(InputKeyIterator keys_first,
+                                      InputKeyIterator keys_last,
+                                      InputValueIterator values_first,
+                                      OutputKeyIterator keys_result,
+                                      OutputValueIterator values_result,
+                                      BinaryFunction function,
+                                      BinaryPredicate predicate,
+                                      command_queue &queue)
+{
+    typedef typename
+        std::iterator_traits<InputValueIterator>::value_type value_type;
+    typedef typename
+        std::iterator_traits<InputKeyIterator>::value_type key_type;
+    typedef typename
+        std::iterator_traits<OutputValueIterator>::value_type value_out_type;
+
+    const context &context = queue.get_context();
+    size_t count = detail::iterator_range_size(keys_first, keys_last);
+
+    if(count == 0){
+        return size_t(0);
+    }
+
+    // Same <key, value> order as the requirements check: one shared cache entry.
+    const device &device = queue.get_device();
+    size_t work_group_size = get_work_group_size<key_type, value_type>(device);
+    // Replace original key with unsigned integer keys generated based on given
+    // predicate. New key is also an index for keys_result and values_result vectors,
+    // which points to place where reduced value should be saved.
+    vector<uint_> new_keys(count, context);
+    vector<uint_>::iterator new_keys_first = new_keys.begin();
+    generate_uint_keys(keys_first, count, predicate, new_keys_first,
+                       work_group_size, queue);
+
+    // Calculate carry-out and carry-in vectors size. Integer ceiling division
+    // avoids the rounding of float(count) above 2^24.
+    const size_t carry_out_size =
+        (count + work_group_size - 1) / work_group_size;
+    vector<uint_> carry_out_keys(carry_out_size, context);
+    vector<value_out_type> carry_out_values(carry_out_size, context);
+    carry_outs(new_keys_first, values_first, count, carry_out_keys.begin(),
+               carry_out_values.begin(), function, work_group_size, queue);
+
+    vector<value_out_type> carry_in_values(carry_out_size, context);
+    carry_ins(carry_out_keys.begin(), carry_out_values.begin(),
+              carry_in_values.begin(), carry_out_size, function, work_group_size,
+              queue);
+
+    final_reduction(keys_first, values_first, keys_result, values_result,
+                    count, function, new_keys_first, carry_out_keys.begin(),
+                    carry_in_values.begin(), carry_out_size, work_group_size,
+                    queue);
+
+    // the last generated key is the index of the last output slot
+    const size_t result = read_single_value<uint_>(new_keys.get_buffer(),
+                                                   count - 1, queue);
+    return result + 1;
+}
+
+/// \internal_
+/// Tells whether reduce by key with scan can run on the queue's device, i.e.
+/// whether one work group of the preferred size fits into its local memory.
+template<class InputKeyIterator, class InputValueIterator,
+         class OutputKeyIterator, class OutputValueIterator>
+bool reduce_by_key_with_scan_requirements_met(InputKeyIterator keys_first,
+                                              InputValueIterator values_first,
+                                              OutputKeyIterator keys_result,
+                                              OutputValueIterator values_result,
+                                              const size_t count,
+                                              command_queue &queue)
+{
+    typedef typename
+        std::iterator_traits<InputKeyIterator>::value_type key_type;
+    typedef typename
+        std::iterator_traits<InputValueIterator>::value_type value_type;
+    typedef typename
+        std::iterator_traits<OutputValueIterator>::value_type value_out_type;
+
+    // the iterators themselves are not inspected, only their value types
+    (void) keys_first;
+    (void) values_first;
+    (void) keys_result;
+    (void) values_result;
+
+    const device &target = queue.get_device();
+
+    // device must have dedicated local memory storage
+    if(target.get_info<CL_DEVICE_LOCAL_MEM_TYPE>() == CL_LOCAL){
+        // local memory size in bytes (per compute unit)
+        const size_t available_bytes = target.get_info<CL_DEVICE_LOCAL_MEM_SIZE>();
+
+        // preferred work group size
+        const size_t group_size =
+            get_work_group_size<key_type, value_type>(target);
+
+        // every work item needs one uint_ key slot and one reduced value slot
+        const size_t bytes_per_work_item =
+            sizeof(uint_) + sizeof(value_out_type);
+
+        return bytes_per_work_item * group_size <= available_bytes;
+    }
+
+    // no dedicated local memory
+    return false;
+}
+
+} // end detail namespace
+} // end compute namespace
+} // end boost namespace
+
+#endif // BOOST_COMPUTE_ALGORITHM_DETAIL_REDUCE_BY_KEY_WITH_SCAN_HPP
diff --git a/boost/compute/algorithm/detail/reduce_on_gpu.hpp b/boost/compute/algorithm/detail/reduce_on_gpu.hpp
new file mode 100644
index 0000000000..335fba8724
--- /dev/null
+++ b/boost/compute/algorithm/detail/reduce_on_gpu.hpp
@@ -0,0 +1,286 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_REDUCE_ON_GPU_HPP
+#define BOOST_COMPUTE_ALGORITHM_DETAIL_REDUCE_ON_GPU_HPP
+
+#include <iterator>
+
+#include <boost/compute/utility/source.hpp>
+#include <boost/compute/program.hpp>
+#include <boost/compute/command_queue.hpp>
+#include <boost/compute/detail/vendor.hpp>
+#include <boost/compute/detail/parameter_cache.hpp>
+#include <boost/compute/detail/work_size.hpp>
+#include <boost/compute/detail/meta_kernel.hpp>
+#include <boost/compute/type_traits/type_name.hpp>
+#include <boost/compute/utility/program_cache.hpp>
+
+namespace boost {
+namespace compute {
+namespace detail {
+
+/// \internal
+/// Generic kernel source that reduces the local `scratch` array of a work group.
+template<typename T,bool isNvidiaDevice>
+struct ReduceBody
+{
+    static std::string body()
+    {
+        // pass i folds scratch[lid+i] into scratch[lid] where lid is a multiple of 2*i
+        const std::string source =
+            "for(int i = 1; i < TPB; i <<= 1){\n"
+            " barrier(CLK_LOCAL_MEM_FENCE);\n"
+            " uint mask = (i << 1) - 1;\n"
+            " if((lid & mask) == 0){\n"
+            " scratch[lid] += scratch[lid+i];\n"
+            " }\n"
+            "}\n";
+        return source;
+    }
+};
+
+/// \internal
+/// Reduction body specialised for NVIDIA devices: barrier-synchronised
+/// halving steps down to 64 work items, then a final stage without barriers
+/// through a volatile local pointer (the "unsafe" memory optimisation).
+template<typename T>
+struct ReduceBody<T,true>
+{
+    static std::string body()
+    {
+        std::stringstream src;
+
+        // every step is guarded by a test on TPB, so only the steps that
+        // apply to the configured work group size do any work
+        src << "barrier(CLK_LOCAL_MEM_FENCE);\n";
+        src << "if(TPB >= 1024){\n"
+               "if(lid < 512) { sum += scratch[lid + 512]; scratch[lid] = sum;} barrier(CLK_LOCAL_MEM_FENCE);}\n";
+        src << "if(TPB >= 512){\n"
+               "if(lid < 256) { sum += scratch[lid + 256]; scratch[lid] = sum;} barrier(CLK_LOCAL_MEM_FENCE);}\n";
+        src << "if(TPB >= 256){\n"
+               "if(lid < 128) { sum += scratch[lid + 128]; scratch[lid] = sum;} barrier(CLK_LOCAL_MEM_FENCE);}\n";
+        src << "if(TPB >= 128){\n"
+               "if(lid < 64) { sum += scratch[lid + 64]; scratch[lid] = sum;} barrier(CLK_LOCAL_MEM_FENCE);} \n";
+
+        // first 32 work items: volatile accesses take the place of barriers
+        src << "if(lid < 32){\n"
+               "volatile __local " << type_name<T>() << " *lmem = scratch;\n";
+        src << "if(TPB >= 64) { lmem[lid] = sum = sum + lmem[lid+32];} \n"
+               "if(TPB >= 32) { lmem[lid] = sum = sum + lmem[lid+16];} \n"
+               "if(TPB >= 16) { lmem[lid] = sum = sum + lmem[lid+ 8];} \n"
+               "if(TPB >= 8) { lmem[lid] = sum = sum + lmem[lid+ 4];} \n"
+               "if(TPB >= 4) { lmem[lid] = sum = sum + lmem[lid+ 2];} \n"
+               "if(TPB >= 2) { lmem[lid] = sum = sum + lmem[lid+ 1];} \n"
+               "}\n";
+
+        return src.str();
+    }
+};
+
+template<class InputIterator, class Function> // generic first pass: builds its own kernel and writes one partial sum per work group to result
+inline void initial_reduce(InputIterator first,
+ InputIterator last,
+ buffer result,
+ const Function &function,
+ kernel &reduce_kernel,
+ const uint_ vpt,
+ const uint_ tpb,
+ command_queue &queue)
+{
+ (void) function; // not applied: the kernel built below combines elements with '+'
+ (void) reduce_kernel; // not used: this overload compiles its own kernel
+
+ typedef typename std::iterator_traits<InputIterator>::value_type Arg;
+ typedef typename boost::tr1_result_of<Function(Arg, Arg)>::type T; // accumulator type: result type of function(Arg, Arg)
+
+ size_t count = std::distance(first, last);
+ detail::meta_kernel k("initial_reduce");
+ k.add_set_arg<const uint_>("count", uint_(count));
+ size_t output_arg = k.add_arg<T *>(memory_object::global_memory, "output");
+
+ k <<
+ k.decl<const uint_>("offset") << " = get_group_id(0) * VPT * TPB;\n" << // each work group covers VPT*TPB consecutive input elements
+ k.decl<const uint_>("lid") << " = get_local_id(0);\n" <<
+
+ "__local " << type_name<T>() << " scratch[TPB];\n" <<
+
+ // private reduction: each work item adds up to VPT elements spaced TPB apart
+ k.decl<T>("sum") << " = 0;\n" <<
+ "for(uint i = 0; i < VPT; i++){\n" <<
+ " if(offset + lid + i*TPB < count){\n" << // skip indices past the end of the input
+ " sum = sum + " << first[k.var<uint_>("offset+lid+i*TPB")] << ";\n" <<
+ " }\n" <<
+ "}\n" <<
+
+ "scratch[lid] = sum;\n" <<
+
+ // local reduction (always the generic, barrier-based body here)
+ ReduceBody<T,false>::body() <<
+
+ // write the work group's sum to output, indexed by group id
+ "if(lid == 0){\n" <<
+ " output[get_group_id(0)] = scratch[0];\n" <<
+ "}\n";
+
+ const context &context = queue.get_context();
+ std::stringstream options;
+ options << "-DVPT=" << vpt << " -DTPB=" << tpb; // VPT and TPB reach the kernel as preprocessor defines
+ kernel generic_reduce_kernel = k.compile(context, options.str());
+ generic_reduce_kernel.set_arg(output_arg, result);
+
+ size_t work_size = calculate_work_size(count, vpt, tpb);
+
+ queue.enqueue_1d_range_kernel(generic_reduce_kernel, 0, work_size, tpb);
+}
+
+/// First pass for plus<T> over buffer iterators: runs the supplied \p reduce_kernel.
+template<class T>
+inline void initial_reduce(const buffer_iterator<T> &first,
+                           const buffer_iterator<T> &last,
+                           const buffer &result,
+                           const plus<T> &function,
+                           kernel &reduce_kernel,
+                           const uint_ vpt,
+                           const uint_ tpb,
+                           command_queue &queue)
+{
+    (void) function;
+
+    const size_t element_count = std::distance(first, last);
+    const size_t global_size = calculate_work_size(element_count, vpt, tpb);
+
+    // input buffer and start index, element count, output buffer and start index
+    reduce_kernel.set_arg(0, first.get_buffer());
+    reduce_kernel.set_arg(1, uint_(first.get_index()));
+    reduce_kernel.set_arg(2, uint_(element_count));
+    reduce_kernel.set_arg(3, result);
+    reduce_kernel.set_arg(4, uint_(0));
+    queue.enqueue_1d_range_kernel(reduce_kernel, 0, global_size, tpb);
+}
+
+/// \internal_
+/// Sums [first, last) into \p result: a first pass writes one partial sum per work
+/// group, then the cached "reduce" kernel is re-run until a single value is left.
+template<class InputIterator, class T, class Function>
+inline void reduce_on_gpu(InputIterator first,
+                          InputIterator last,
+                          buffer_iterator<T> result,
+                          Function function,
+                          command_queue &queue)
+{
+    const device &device = queue.get_device();
+    const context &context = queue.get_context();
+
+    detail::meta_kernel k("reduce");
+    k.add_arg<const T*>(memory_object::global_memory, "input");
+    k.add_arg<const uint_>("offset");
+    k.add_arg<const uint_>("count");
+    k.add_arg<T*>(memory_object::global_memory, "output");
+    k.add_arg<const uint_>("output_offset");
+
+    k <<
+        k.decl<const uint_>("block_offset") << " = get_group_id(0) * VPT * TPB;\n" <<
+        "__global const " << type_name<T>() << " *block = input + offset + block_offset;\n" <<
+        k.decl<const uint_>("lid") << " = get_local_id(0);\n" <<
+        "__local " << type_name<T>() << " scratch[TPB];\n" <<
+        // private reduction: up to VPT elements per work item, spaced TPB apart
+        k.decl<T>("sum") << " = 0;\n" <<
+        "for(uint i = 0; i < VPT; i++){\n" <<
+        " if(block_offset + lid + i*TPB < count){\n" <<
+        " sum = sum + block[lid+i*TPB]; \n" <<
+        " }\n" <<
+        "}\n" <<
+        "scratch[lid] = sum;\n";
+
+    // discrimination on vendor name
+    if(is_nvidia_device(device))
+        k << ReduceBody<T,true>::body();
+    else
+        k << ReduceBody<T,false>::body();
+
+    k <<
+        // write sum to output
+        "if(lid == 0){\n" <<
+        " output[output_offset + get_group_id(0)] = scratch[0];\n" <<
+        "}\n";
+
+    std::string cache_key = std::string("__boost_reduce_on_gpu_") + type_name<T>();
+
+    // load parameters
+    boost::shared_ptr<parameter_cache> parameters =
+        detail::parameter_cache::get_global_cache(device);
+
+    uint_ vpt = parameters->get(cache_key, "vpt", 8);
+    uint_ tpb = parameters->get(cache_key, "tpb", 128);
+
+    // reduce program compiler flags
+    std::stringstream options;
+    options << "-DT=" << type_name<T>()
+            << " -DVPT=" << vpt
+            << " -DTPB=" << tpb;
+
+    // load program
+    boost::shared_ptr<program_cache> cache =
+        program_cache::get_global_cache(context);
+
+    program reduce_program = cache->get_or_build(
+        cache_key, options.str(), k.source(), context
+    );
+
+    // create reduce kernel
+    kernel reduce_kernel(reduce_program, "reduce");
+
+    size_t count = std::distance(first, last);
+    // first pass, reduce from input to ping (one partial sum per work group)
+    const size_t groups = static_cast<size_t>(std::ceil(float(count) / vpt / tpb));
+    buffer ping(context, groups * sizeof(T));
+    initial_reduce(first, last, ping, function, reduce_kernel, vpt, tpb, queue);
+
+    // update count after initial reduce
+    count = groups;
+
+    // middle pass(es), reduce between ping and pong
+    // pong holds ceil(count / (vpt * tpb)) group outputs; integer division would round down
+    const buffer *input_buffer = &ping;
+    buffer pong(context, static_cast<size_t>(std::ceil(float(count) / vpt / tpb)) * sizeof(T));
+    const buffer *output_buffer = &pong;
+    while(count > vpt * tpb){
+        reduce_kernel.set_arg(0, *input_buffer);
+        reduce_kernel.set_arg(1, uint_(0));
+        reduce_kernel.set_arg(2, uint_(count));
+        reduce_kernel.set_arg(3, *output_buffer);
+        reduce_kernel.set_arg(4, uint_(0));
+
+        size_t work_size = static_cast<size_t>(std::ceil(float(count) / vpt));
+        if(work_size % tpb != 0){
+            work_size += tpb - work_size % tpb;
+        }
+        queue.enqueue_1d_range_kernel(reduce_kernel, 0, work_size, tpb);
+
+        std::swap(input_buffer, output_buffer);
+        count = static_cast<size_t>(std::ceil(float(count) / vpt / tpb));
+    }
+
+    // final pass, reduce from ping/pong to result
+    reduce_kernel.set_arg(0, *input_buffer);
+    reduce_kernel.set_arg(1, uint_(0));
+    reduce_kernel.set_arg(2, uint_(count));
+    reduce_kernel.set_arg(3, result.get_buffer());
+    reduce_kernel.set_arg(4, uint_(result.get_index()));
+
+    queue.enqueue_1d_range_kernel(reduce_kernel, 0, tpb, tpb);
+}
+
+} // end detail namespace
+} // end compute namespace
+} // end boost namespace
+
+#endif // BOOST_COMPUTE_ALGORITHM_DETAIL_REDUCE_ON_GPU_HPP
diff --git a/boost/compute/algorithm/detail/scan.hpp b/boost/compute/algorithm/detail/scan.hpp
new file mode 100644
index 0000000000..154b6001be
--- /dev/null
+++ b/boost/compute/algorithm/detail/scan.hpp
@@ -0,0 +1,45 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_SCAN_HPP
+#define BOOST_COMPUTE_ALGORITHM_DETAIL_SCAN_HPP
+
+#include <boost/compute/device.hpp>
+#include <boost/compute/algorithm/detail/scan_on_cpu.hpp>
+#include <boost/compute/algorithm/detail/scan_on_gpu.hpp>
+
+namespace boost {
+namespace compute {
+namespace detail {
+
+/// Runs scan_on_cpu() when the queue's device reports the cpu type, scan_on_gpu() otherwise.
+template<class InputIterator, class OutputIterator, class T, class BinaryOperator>
+inline OutputIterator scan(InputIterator first,
+                           InputIterator last,
+                           OutputIterator result,
+                           bool exclusive,
+                           T init,
+                           BinaryOperator op,
+                           command_queue &queue)
+{
+    // true when the cpu bit of the device type is set
+    const bool run_on_cpu = (queue.get_device().type() & device::cpu) != 0;
+
+    if(!run_on_cpu){
+        return scan_on_gpu(first, last, result, exclusive, init, op, queue);
+    }
+    return scan_on_cpu(first, last, result, exclusive, init, op, queue);
+}
+
+} // end detail namespace
+} // end compute namespace
+} // end boost namespace
+
+#endif // BOOST_COMPUTE_ALGORITHM_DETAIL_SCAN_HPP
diff --git a/boost/compute/algorithm/detail/scan_on_cpu.hpp b/boost/compute/algorithm/detail/scan_on_cpu.hpp
new file mode 100644
index 0000000000..6611c0ba3e
--- /dev/null
+++ b/boost/compute/algorithm/detail/scan_on_cpu.hpp
@@ -0,0 +1,103 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_SCAN_ON_CPU_HPP
+#define BOOST_COMPUTE_ALGORITHM_DETAIL_SCAN_ON_CPU_HPP
+
+#include <iterator>
+
+#include <boost/compute/device.hpp>
+#include <boost/compute/kernel.hpp>
+#include <boost/compute/command_queue.hpp>
+#include <boost/compute/detail/meta_kernel.hpp>
+#include <boost/compute/detail/iterator_range_size.hpp>
+
+namespace boost {
+namespace compute {
+namespace detail {
+
+/// Serial scan: generates a kernel in which one work-item walks the range
+/// [\p first, \p last) and writes the running result of \p op to \p result.
+///
+/// For an exclusive scan the running value starts at \p init and every output
+/// element is stored before the matching input element is folded in. For an
+/// inclusive scan the running value starts at the first input element and the
+/// generated kernel body does not read the initial value.
+///
+/// Returns \p result advanced by the number of input elements, or \p result
+/// itself when the input range is empty.
+template<class InputIterator, class OutputIterator, class T, class BinaryOperator>
+inline OutputIterator scan_on_cpu(InputIterator first,
+                                  InputIterator last,
+                                  OutputIterator result,
+                                  bool exclusive,
+                                  T init,
+                                  BinaryOperator op,
+                                  command_queue &queue)
+{
+    // nothing to do for an empty range
+    if(first == last){
+        return result;
+    }
+
+    typedef typename
+        std::iterator_traits<InputIterator>::value_type input_type;
+    typedef typename
+        std::iterator_traits<OutputIterator>::value_type output_type;
+
+    const size_t element_count = detail::iterator_range_size(first, last);
+
+    meta_kernel scan_source("scan_on_cpu");
+
+    // kernel arguments; the returned indices are used to bind values below
+    const size_t n_index = scan_source.add_arg<ulong_>("n");
+    const size_t init_index = scan_source.add_arg<output_type>("initial_value");
+
+    // prologue: seed the running sum and pick the first loop index
+    if(exclusive){
+        scan_source <<
+            scan_source.decl<const ulong_>("start_idx") << " = 0;\n" <<
+            scan_source.decl<output_type>("sum") << " = initial_value;\n";
+    }
+    else {
+        scan_source <<
+            scan_source.decl<const ulong_>("start_idx") << " = 1;\n" <<
+            scan_source.decl<output_type>("sum") << " = " << first[0] << ";\n" <<
+            result[0] << " = sum;\n";
+    }
+
+    // loop header and load of the current element
+    scan_source <<
+        "for(ulong i = start_idx; i < n; i++){\n" <<
+        scan_source.decl<const input_type>("x") << " = "
+            << first[scan_source.var<ulong_>("i")] << ";\n";
+
+    if(exclusive){
+        // store the running sum, then fold in the current element
+        scan_source << result[scan_source.var<ulong_>("i")] << " = sum;\n";
+        scan_source << " sum = "
+            << op(scan_source.var<output_type>("sum"),
+                  scan_source.var<output_type>("x"))
+            << ";\n";
+    }
+    else {
+        // fold in the current element, then store the running sum
+        scan_source << " sum = "
+            << op(scan_source.var<output_type>("sum"),
+                  scan_source.var<output_type>("x"))
+            << ";\n";
+        scan_source << result[scan_source.var<ulong_>("i")] << " = sum;\n";
+    }
+
+    scan_source << "}\n";
+
+    // compile the kernel and bind the scalar arguments
+    kernel compiled_scan = scan_source.compile(queue.get_context());
+    compiled_scan.set_arg<ulong_>(n_index, element_count);
+    compiled_scan.set_arg<output_type>(init_index, static_cast<output_type>(init));
+
+    // a single work-item performs the whole scan
+    queue.enqueue_1d_range_kernel(compiled_scan, 0, 1, 1);
+
+    // iterator pointing to the end of the result range
+    return result + element_count;
+}
+
+} // end detail namespace
+} // end compute namespace
+} // end boost namespace
+
+#endif // BOOST_COMPUTE_ALGORITHM_DETAIL_SCAN_ON_CPU_HPP
diff --git a/boost/compute/algorithm/detail/scan_on_gpu.hpp b/boost/compute/algorithm/detail/scan_on_gpu.hpp
new file mode 100644
index 0000000000..07c6d6d3c0
--- /dev/null
+++ b/boost/compute/algorithm/detail/scan_on_gpu.hpp
@@ -0,0 +1,331 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_SCAN_ON_GPU_HPP
+#define BOOST_COMPUTE_ALGORITHM_DETAIL_SCAN_ON_GPU_HPP
+
+#include <boost/compute/kernel.hpp>
+#include <boost/compute/detail/meta_kernel.hpp>
+#include <boost/compute/command_queue.hpp>
+#include <boost/compute/algorithm/detail/scan_on_cpu.hpp>
+#include <boost/compute/container/vector.hpp>
+#include <boost/compute/detail/iterator_range_size.hpp>
+#include <boost/compute/memory/local_buffer.hpp>
+#include <boost/compute/iterator/buffer_iterator.hpp>
+
+namespace boost {
+namespace compute {
+namespace detail {
+
+/// Meta-kernel that scans one block (work-group) of the input in local memory.
+///
+/// Each work-item loads one element into the \c scratch local buffer; for an
+/// exclusive scan the input is shifted right by one position and \c init is
+/// placed in front of the very first element. Work-items whose global id is
+/// not below \c count load zero instead. The block is then scanned in place
+/// with a stride-doubling loop, the per-element results are written to
+/// \p result, and the last work-item of each block stores the block total in
+/// \c block_sums.
+///
+/// The \c m_*_arg members hold the argument indices returned by add_arg() so
+/// that the caller can bind the arguments after compiling the kernel.
+template<class InputIterator, class OutputIterator, class BinaryOperator>
+class local_scan_kernel : public meta_kernel
+{
+public:
+    local_scan_kernel(InputIterator first,
+                      InputIterator last,
+                      OutputIterator result,
+                      bool exclusive,
+                      BinaryOperator op)
+        : meta_kernel("local_scan")
+    {
+        typedef typename std::iterator_traits<InputIterator>::value_type T;
+
+        (void) last;
+
+        m_block_sums_arg = add_arg<T *>(memory_object::global_memory, "block_sums");
+        m_scratch_arg = add_arg<T *>(memory_object::local_memory, "scratch");
+        m_block_size_arg = add_arg<const cl_uint>("block_size");
+        m_count_arg = add_arg<const cl_uint>("count");
+        m_init_value_arg = add_arg<const T>("init");
+
+        // work-item parameters
+        *this <<
+            "const uint gid = get_global_id(0);\n" <<
+            "const uint lid = get_local_id(0);\n";
+
+        // copy values from input to local memory (checked against data size)
+        *this <<
+            "if(gid < count){\n";
+
+        if(exclusive){
+            *this <<
+                decl<const T>("local_init") << "= (gid == 0) ? init : 0;\n" <<
+                "if(lid == 0){ scratch[lid] = local_init; }\n" <<
+                "else { scratch[lid] = " << first[expr<cl_uint>("gid-1")] << "; }\n";
+        }
+        else{
+            *this <<
+                "scratch[lid] = " << first[expr<cl_uint>("gid")] << ";\n";
+        }
+
+        *this <<
+            "}\n"
+            "else {\n" <<
+            " scratch[lid] = 0;\n" <<
+            "}\n";
+
+        // wait for all threads to read from input
+        *this <<
+            "barrier(CLK_LOCAL_MEM_FENCE);\n";
+
+        // perform scan
+        *this <<
+            "for(uint i = 1; i < block_size; i <<= 1){\n" <<
+            " " << decl<const T>("x") << " = lid >= i ? scratch[lid-i] : 0;\n" <<
+            " barrier(CLK_LOCAL_MEM_FENCE);\n" <<
+            " if(lid >= i){\n" <<
+            " scratch[lid] = " << op(var<T>("scratch[lid]"), var<T>("x")) << ";\n" <<
+            " }\n" <<
+            " barrier(CLK_LOCAL_MEM_FENCE);\n" <<
+            "}\n";
+
+        // copy results to output (checked against data size)
+        *this <<
+            "if(gid < count){\n" <<
+            result[expr<cl_uint>("gid")] << " = scratch[lid];\n" <<
+            "}\n";
+
+        // store sum for the block
+        if(exclusive){
+            // The exclusive total needs the block's last input element, which
+            // is read from the input again. That read is guarded with
+            // gid < count so the last work-item of a partially filled block
+            // does not read past the end of the input; the sum of such a
+            // block is left untouched.
+            *this <<
+                "if(lid == block_size - 1 && gid < count){\n" <<
+                " block_sums[get_group_id(0)] = " <<
+                op(first[expr<cl_uint>("gid")], var<T>("scratch[lid]")) <<
+                ";\n" <<
+                "}\n";
+        }
+        else {
+            *this <<
+                "if(lid == block_size - 1){\n" <<
+                " block_sums[get_group_id(0)] = scratch[lid];\n" <<
+                "}\n";
+        }
+    }
+
+    size_t m_block_sums_arg;
+    size_t m_scratch_arg;
+    size_t m_block_size_arg;
+    size_t m_count_arg;
+    size_t m_init_value_arg;
+};
+
+/// Meta-kernel that combines, with \p op, the \c block_sums entry selected by
+/// the work-group id with every \c output element whose global id is below
+/// \c count.
+///
+/// The \c m_*_arg members hold the argument indices returned by add_arg() so
+/// that the caller can bind the arguments after compiling the kernel.
+template<class T, class BinaryOperator>
+class write_scanned_output_kernel : public meta_kernel
+{
+public:
+    write_scanned_output_kernel(BinaryOperator op)
+        : meta_kernel("write_scanned_output")
+    {
+        m_output_arg = add_arg<T *>(memory_object::global_memory, "output");
+        m_block_sums_arg = add_arg<const T *>(memory_object::global_memory, "block_sums");
+        m_count_arg = add_arg<const cl_uint>("count");
+
+        // work-item parameters, bounds check and the guarded update are
+        // emitted as one stream; the generated text is the same as when the
+        // pieces are written separately
+        *this <<
+            "const uint gid = get_global_id(0);\n" <<
+            "const uint block_id = get_group_id(0);\n" <<
+            "if(gid < count){\n" <<
+            "output[gid] = " <<
+            op(var<T>("block_sums[block_id]"), var<T>("output[gid] ")) << ";\n" <<
+            "}\n";
+    }
+
+    size_t m_output_arg;
+    size_t m_block_sums_arg;
+    size_t m_count_arg;
+};
+
+/// Returns the block size to use for scanning [\p first, \p last): zero for
+/// an empty range, otherwise the smallest power of two that is not less than
+/// the element count, capped at 256.
+template<class InputIterator>
+inline size_t pick_scan_block_size(InputIterator first, InputIterator last)
+{
+    const size_t count = iterator_range_size(first, last);
+
+    if(count == 0){
+        return 0;
+    }
+
+    // double the candidate until it covers the range or reaches the cap
+    size_t block_size = 1;
+    while(block_size < count && block_size < 256){
+        block_size <<= 1;
+    }
+
+    return block_size;
+}
+
+/// Block-wise scan of [\p first, \p last) into the range starting at
+/// \p result.
+///
+/// The input is split into blocks of pick_scan_block_size() elements. Each
+/// block is scanned by local_scan_kernel, which also records one sum per
+/// block. When there is more than one block, the block sums are scanned
+/// inclusively by a recursive call and then combined into the output by
+/// write_scanned_output_kernel.
+///
+/// Returns \p result advanced by the number of input elements; an empty
+/// input range returns \p result without enqueuing any work.
+template<class InputIterator, class OutputIterator, class T, class BinaryOperator>
+inline OutputIterator scan_impl(InputIterator first,
+                                InputIterator last,
+                                OutputIterator result,
+                                bool exclusive,
+                                T init,
+                                BinaryOperator op,
+                                command_queue &queue)
+{
+    typedef typename
+        std::iterator_traits<InputIterator>::value_type
+        input_type;
+    typedef typename
+        std::iterator_traits<InputIterator>::difference_type
+        difference_type;
+
+    const context &context = queue.get_context();
+    const size_t count = detail::iterator_range_size(first, last);
+
+    // an empty range has nothing to scan; returning here also keeps the
+    // block-count computation below from dividing by a block size of zero
+    if(count == 0){
+        return result;
+    }
+
+    size_t block_size = pick_scan_block_size(first, last);
+    size_t block_count = count / block_size;
+
+    // round up so that a trailing partial block gets its own work-group
+    if(block_count * block_size < count){
+        block_count++;
+    }
+
+    ::boost::compute::vector<input_type> block_sums(block_count, context);
+
+    // zero block sums
+    input_type zero;
+    std::memset(&zero, 0, sizeof(input_type));
+    ::boost::compute::fill(block_sums.begin(), block_sums.end(), zero, queue);
+
+    // local scan
+    local_scan_kernel<InputIterator, OutputIterator, BinaryOperator>
+        local_scan_kernel(first, last, result, exclusive, op);
+
+    ::boost::compute::kernel kernel = local_scan_kernel.compile(context);
+    kernel.set_arg(local_scan_kernel.m_scratch_arg, local_buffer<input_type>(block_size));
+    kernel.set_arg(local_scan_kernel.m_block_sums_arg, block_sums);
+    kernel.set_arg(local_scan_kernel.m_block_size_arg, static_cast<cl_uint>(block_size));
+    kernel.set_arg(local_scan_kernel.m_count_arg, static_cast<cl_uint>(count));
+
+    // local_scan_kernel declares its "init" argument with the input value
+    // type, so the bound value must have exactly that type; passing a value
+    // of the output type would hand the kernel the wrong size or bit pattern
+    // whenever the two value types differ
+    kernel.set_arg(local_scan_kernel.m_init_value_arg, static_cast<input_type>(init));
+
+    queue.enqueue_1d_range_kernel(kernel,
+                                  0,
+                                  block_count * block_size,
+                                  block_size);
+
+    if(block_count > 1){
+        // inclusive scan block sums (in place)
+        scan_impl(block_sums.begin(),
+                  block_sums.end(),
+                  block_sums.begin(),
+                  false,
+                  init,
+                  op,
+                  queue
+        );
+
+        // add block sums to each block; the launch starts at global offset
+        // block_size, so the first block is left as written by the local scan
+        // NOTE(review): the output is bound as a raw buffer of input_type via
+        // result.get_buffer(); confirm behaviour when result does not point
+        // at the start of its buffer or its value type differs from the input
+        write_scanned_output_kernel<input_type, BinaryOperator>
+            write_output_kernel(op);
+        kernel = write_output_kernel.compile(context);
+        kernel.set_arg(write_output_kernel.m_output_arg, result.get_buffer());
+        kernel.set_arg(write_output_kernel.m_block_sums_arg, block_sums);
+        kernel.set_arg(write_output_kernel.m_count_arg, static_cast<cl_uint>(count));
+
+        queue.enqueue_1d_range_kernel(kernel,
+                                      block_size,
+                                      block_count * block_size,
+                                      block_size);
+    }
+
+    return result + static_cast<difference_type>(count);
+}
+
+// Generic overload, taking separate input and output iterator types: every
+// argument is forwarded unchanged to scan_impl() and its return value is
+// returned. (A second overload taking a single iterator type for all three
+// iterators handles scans whose result iterator equals the input iterator.)
+template<class InputIterator, class OutputIterator, class T, class BinaryOperator>
+inline OutputIterator dispatch_scan(InputIterator first,
+ InputIterator last,
+ OutputIterator result,
+ bool exclusive,
+ T init,
+ BinaryOperator op,
+ command_queue &queue)
+{
+ return scan_impl(first, last, result, exclusive, init, op, queue);
+}
+
+// Overload for scans whose input and output iterators have the same type.
+// When result is the same iterator as first, the input is first copied to a
+// temporary vector and the scan reads from that copy; otherwise the scan runs
+// directly from input to output.
+template<class InputIterator, class T, class BinaryOperator>
+inline InputIterator dispatch_scan(InputIterator first,
+                                   InputIterator last,
+                                   InputIterator result,
+                                   bool exclusive,
+                                   T init,
+                                   BinaryOperator op,
+                                   command_queue &queue)
+{
+    typedef typename std::iterator_traits<InputIterator>::value_type value_type;
+
+    if(!(first == result)){
+        // distinct start positions: scan input to output
+        return scan_impl(first, last, result, exclusive, init, op, queue);
+    }
+
+    // in-place scan: make a temporary copy of the input ...
+    vector<value_type> staged_input(iterator_range_size(first, last),
+                                    queue.get_context());
+    copy(first, last, staged_input.begin(), queue);
+
+    // ... and scan from the temporary values back into the original range
+    return scan_impl(staged_input.begin(), staged_input.end(), first,
+                     exclusive, init, op, queue);
+}
+
+/// Scans [\p first, \p last) into the range starting at \p result by
+/// forwarding to dispatch_scan(). An empty input range returns \p result
+/// immediately without calling dispatch_scan().
+template<class InputIterator, class OutputIterator, class T, class BinaryOperator>
+inline OutputIterator scan_on_gpu(InputIterator first,
+                                  InputIterator last,
+                                  OutputIterator result,
+                                  bool exclusive,
+                                  T init,
+                                  BinaryOperator op,
+                                  command_queue &queue)
+{
+    const bool has_input = !(first == last);
+
+    if(has_input){
+        return dispatch_scan(first, last, result, exclusive, init, op, queue);
+    }
+
+    // empty range: nothing is written
+    return result;
+}
+
+} // end detail namespace
+} // end compute namespace
+} // end boost namespace
+
+#endif // BOOST_COMPUTE_ALGORITHM_DETAIL_SCAN_ON_GPU_HPP
diff --git a/boost/compute/algorithm/detail/search_all.hpp b/boost/compute/algorithm/detail/search_all.hpp
new file mode 100644
index 0000000000..a874bcdebe
--- /dev/null
+++ b/boost/compute/algorithm/detail/search_all.hpp
@@ -0,0 +1,86 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2014 Roshan <thisisroshansmail@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_SEARCH_ALL_HPP
+#define BOOST_COMPUTE_ALGORITHM_DETAIL_SEARCH_ALL_HPP
+
+#include <boost/compute/algorithm/copy.hpp>
+#include <boost/compute/container/vector.hpp>
+#include <boost/compute/detail/iterator_range_size.hpp>
+#include <boost/compute/detail/meta_kernel.hpp>
+#include <boost/compute/lambda.hpp>
+#include <boost/compute/system.hpp>
+
+namespace boost {
+namespace compute {
+namespace detail {
+
+///
+/// \brief Search kernel class
+///
+/// Subclass of meta_kernel which is capable of performing pattern matching.
+///
+/// For every candidate start position in the text, the generated kernel
+/// compares the pattern element by element and writes 1 to the matching
+/// \c result element when the whole pattern matches at that position and 0
+/// otherwise.
+///
+template<class PatternIterator, class TextIterator, class OutputIterator>
+class search_kernel : public meta_kernel
+{
+public:
+    // The counters start at zero so that exec() called before set_range()
+    // sees an empty search instead of reading uninitialised members.
+    search_kernel()
+        : meta_kernel("search"),
+          m_p_count(0),
+          m_p_count_arg(0),
+          m_count(0)
+    {}
+
+    /// Generates the kernel source for searching the pattern
+    /// [\p p_first, \p p_last) in the text [\p t_first, \p t_last) and
+    /// records how many start positions have to be tested. One element of
+    /// \p result is written per tested start position.
+    void set_range(PatternIterator p_first,
+                   PatternIterator p_last,
+                   TextIterator t_first,
+                   TextIterator t_last,
+                   OutputIterator result)
+    {
+        m_p_count = iterator_range_size(p_first, p_last);
+        m_p_count_arg = add_arg<uint_>("p_count");
+
+        // Number of start positions at which the whole pattern still fits.
+        // A pattern longer than the text fits nowhere; handling that case
+        // explicitly keeps the unsigned subtraction from wrapping around to
+        // a huge work size.
+        const size_t text_count = iterator_range_size(t_first, t_last);
+        m_count = (m_p_count > text_count) ? 0 : text_count + 1 - m_p_count;
+
+        *this <<
+            "uint i = get_global_id(0);\n" <<
+            "uint i1 = i;\n" <<
+            "uint j;\n" <<
+            "for(j = 0; j<p_count; j++,i++)\n" <<
+            "{\n" <<
+            " if(" << p_first[expr<uint_>("j")] << " != " <<
+                      t_first[expr<uint_>("i")] << ")\n" <<
+            // on a mismatch j is pushed past p_count so that the loop ends
+            // and the "j == p_count" test below fails
+            " j = p_count + 1;\n" <<
+            "}\n" <<
+            "if(j == p_count)\n" <<
+            result[expr<uint_>("i1")] << " = 1;\n" <<
+            "else\n" <<
+            result[expr<uint_>("i1")] << " = 0;\n";
+    }
+
+    /// Runs the kernel with one work-item per tested start position. Returns
+    /// a default-constructed event without enqueuing anything when there is
+    /// no position to test.
+    event exec(command_queue &queue)
+    {
+        if(m_count == 0) {
+            return event();
+        }
+
+        set_arg(m_p_count_arg, uint_(m_p_count));
+
+        return exec_1d(queue, 0, m_count);
+    }
+
+private:
+    size_t m_p_count;     // number of elements in the pattern
+    size_t m_p_count_arg; // index of the "p_count" kernel argument
+    size_t m_count;       // number of start positions to test
+};
+
+} //end detail namespace
+} //end compute namespace
+} //end boost namespace
+
+#endif // BOOST_COMPUTE_ALGORITHM_DETAIL_SEARCH_ALL_HPP
diff --git a/boost/compute/algorithm/detail/serial_accumulate.hpp b/boost/compute/algorithm/detail/serial_accumulate.hpp
new file mode 100644
index 0000000000..84f9910122
--- /dev/null
+++ b/boost/compute/algorithm/detail/serial_accumulate.hpp
@@ -0,0 +1,56 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_SERIAL_ACCUMULATE_HPP
+#define BOOST_COMPUTE_ALGORITHM_DETAIL_SERIAL_ACCUMULATE_HPP
+
+#include <boost/compute/command_queue.hpp>
+#include <boost/compute/detail/meta_kernel.hpp>
+#include <boost/compute/detail/iterator_range_size.hpp>
+
+namespace boost {
+namespace compute {
+namespace detail {
+
+/// Accumulates the range [\p first, \p last) with a kernel run as a single
+/// task: the running value starts at \p init, is combined with every element
+/// in order using \p function, and is finally stored to the first element of
+/// \p result.
+template<class InputIterator, class OutputIterator, class T, class BinaryFunction>
+inline void serial_accumulate(InputIterator first,
+                              InputIterator last,
+                              OutputIterator result,
+                              T init,
+                              BinaryFunction function,
+                              command_queue &queue)
+{
+    const size_t element_count = detail::iterator_range_size(first, last);
+
+    meta_kernel source("serial_accumulate");
+    const size_t init_index = source.add_arg<T>("init");
+    const size_t count_index = source.add_arg<cl_uint>("count");
+
+    // start from the initial value
+    source << source.decl<T>("result") << " = init;\n";
+
+    // fold every element into the running result
+    source <<
+        "for(uint i = 0; i < count; i++)\n" <<
+        " result = " << function(source.var<T>("result"),
+                                 first[source.var<cl_uint>("i")]) << ";\n";
+
+    // publish the final value
+    source << result[0] << " = result;\n";
+
+    kernel accumulate_kernel = source.compile(queue.get_context());
+    accumulate_kernel.set_arg(init_index, init);
+    accumulate_kernel.set_arg(count_index, static_cast<cl_uint>(element_count));
+
+    queue.enqueue_task(accumulate_kernel);
+}
+
+} // end detail namespace
+} // end compute namespace
+} // end boost namespace
+
+#endif // BOOST_COMPUTE_ALGORITHM_DETAIL_SERIAL_ACCUMULATE_HPP
diff --git a/boost/compute/algorithm/detail/serial_count_if.hpp b/boost/compute/algorithm/detail/serial_count_if.hpp
new file mode 100644
index 0000000000..be6794c426
--- /dev/null
+++ b/boost/compute/algorithm/detail/serial_count_if.hpp
@@ -0,0 +1,68 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_SERIAL_COUNT_IF_HPP
+#define BOOST_COMPUTE_ALGORITHM_DETAIL_SERIAL_COUNT_IF_HPP
+
+#include <iterator>
+
+#include <boost/compute/container/detail/scalar.hpp>
+#include <boost/compute/detail/meta_kernel.hpp>
+#include <boost/compute/detail/iterator_range_size.hpp>
+
+namespace boost {
+namespace compute {
+namespace detail {
+
+// Counts the values in [first, last) for which the predicate holds, using a
+// kernel run as a single task. The count is written to a one-element device
+// buffer and read back before returning.
+template<class InputIterator, class Predicate>
+inline size_t serial_count_if(InputIterator first,
+                              InputIterator last,
+                              Predicate predicate,
+                              command_queue &queue)
+{
+    typedef typename std::iterator_traits<InputIterator>::value_type value_type;
+
+    const context &context = queue.get_context();
+    const size_t element_count = iterator_range_size(first, last);
+
+    meta_kernel source("serial_count_if");
+    source.add_set_arg("size", static_cast<uint_>(element_count));
+    const size_t result_index =
+        source.add_arg<uint_ *>(memory_object::global_memory, "result");
+
+    // counter and loop header
+    source <<
+        "uint count = 0;\n" <<
+        "for(uint i = 0; i < size; i++){\n";
+
+    // load the current value and test it
+    source <<
+        source.decl<const value_type>("value") << "="
+            << first[source.var<uint_>("i")] << ";\n" <<
+        "if(" << predicate(source.var<const value_type>("value")) << "){\n" <<
+        "count++;\n" <<
+        "}\n";
+
+    // close the loop and publish the count
+    source <<
+        "}\n"
+        "*result = count;\n";
+
+    kernel count_kernel = source.compile(context);
+
+    // device-side storage for the count
+    scalar<uint_> device_count(context);
+    count_kernel.set_arg(result_index, device_count.get_buffer());
+
+    // run kernel
+    queue.enqueue_task(count_kernel);
+
+    // read the count back
+    return device_count.read(queue);
+}
+
+} // end detail namespace
+} // end compute namespace
+} // end boost namespace
+
+#endif // BOOST_COMPUTE_ALGORITHM_DETAIL_SERIAL_COUNT_IF_HPP
diff --git a/boost/compute/algorithm/detail/serial_find_extrema.hpp b/boost/compute/algorithm/detail/serial_find_extrema.hpp
new file mode 100644
index 0000000000..8407c88129
--- /dev/null
+++ b/boost/compute/algorithm/detail/serial_find_extrema.hpp
@@ -0,0 +1,87 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_SERIAL_FIND_EXTREMA_HPP
+#define BOOST_COMPUTE_ALGORITHM_DETAIL_SERIAL_FIND_EXTREMA_HPP
+
+#include <boost/compute/command_queue.hpp>
+#include <boost/compute/types/fundamental.hpp>
+#include <boost/compute/detail/meta_kernel.hpp>
+#include <boost/compute/detail/iterator_range_size.hpp>
+#include <boost/compute/container/detail/scalar.hpp>
+
+namespace boost {
+namespace compute {
+namespace detail {
+
+/// Finds an extremum of [\p first, \p last) with a kernel run as a single
+/// task and returns an iterator to it.
+///
+/// The kernel starts with the first element as the current extremum and
+/// replaces it whenever \p compare orders a later candidate before it (when
+/// \p find_minimum is \c true) or orders it before the candidate (when
+/// \p find_minimum is \c false); the latter variant is selected by compiling
+/// with \c BOOST_COMPUTE_FIND_MAXIMUM defined.
+///
+/// An empty range returns \p last without running a kernel.
+template<class InputIterator, class Compare>
+inline InputIterator serial_find_extrema(InputIterator first,
+                                         InputIterator last,
+                                         Compare compare,
+                                         const bool find_minimum,
+                                         command_queue &queue)
+{
+    typedef typename std::iterator_traits<InputIterator>::value_type value_type;
+    typedef typename std::iterator_traits<InputIterator>::difference_type difference_type;
+
+    // the kernel reads the first element unconditionally, so an empty range
+    // must not reach it
+    const size_t count = iterator_range_size(first, last);
+    if(count == 0){
+        return last;
+    }
+
+    const context &context = queue.get_context();
+
+    meta_kernel k("serial_find_extrema");
+
+    k <<
+        k.decl<value_type>("value") << " = " << first[k.expr<uint_>("0")] << ";\n" <<
+        k.decl<uint_>("value_index") << " = 0;\n" <<
+        "for(uint i = 1; i < size; i++){\n" <<
+        " " << k.decl<value_type>("candidate") << "="
+            << first[k.expr<uint_>("i")] << ";\n" <<
+
+        "#ifndef BOOST_COMPUTE_FIND_MAXIMUM\n" <<
+        " if(" << compare(k.var<value_type>("candidate"),
+                          k.var<value_type>("value")) << "){\n" <<
+        "#else\n" <<
+        " if(" << compare(k.var<value_type>("value"),
+                          k.var<value_type>("candidate")) << "){\n" <<
+        "#endif\n" <<
+
+        " value = candidate;\n" <<
+        " value_index = i;\n" <<
+        " }\n" <<
+        "}\n" <<
+        "*index = value_index;\n";
+
+    size_t index_arg_index = k.add_arg<uint_ *>(memory_object::global_memory, "index");
+    size_t size_arg_index = k.add_arg<uint_>("size");
+
+    // the comparison direction is chosen at kernel compile time
+    std::string options;
+    if(!find_minimum){
+        options = "-DBOOST_COMPUTE_FIND_MAXIMUM";
+    }
+    kernel kernel = k.compile(context, options);
+
+    // setup index buffer
+    scalar<uint_> index(context);
+    kernel.set_arg(index_arg_index, index.get_buffer());
+
+    // setup count
+    kernel.set_arg(size_arg_index, static_cast<uint_>(count));
+
+    // run kernel
+    queue.enqueue_task(kernel);
+
+    // read index and return iterator
+    return first + static_cast<difference_type>(index.read(queue));
+}
+
+} // end detail namespace
+} // end compute namespace
+} // end boost namespace
+
+#endif // BOOST_COMPUTE_ALGORITHM_DETAIL_SERIAL_FIND_EXTREMA_HPP
diff --git a/boost/compute/algorithm/detail/serial_merge.hpp b/boost/compute/algorithm/detail/serial_merge.hpp
new file mode 100644
index 0000000000..85e38f704c
--- /dev/null
+++ b/boost/compute/algorithm/detail/serial_merge.hpp
@@ -0,0 +1,97 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#ifndef BOOST_COMPUTE_ALGORITHM_SERIAL_MERGE_HPP
+#define BOOST_COMPUTE_ALGORITHM_SERIAL_MERGE_HPP
+
+#include <iterator>
+
+#include <boost/compute/command_queue.hpp>
+#include <boost/compute/detail/meta_kernel.hpp>
+#include <boost/compute/detail/iterator_range_size.hpp>
+
+namespace boost {
+namespace compute {
+namespace detail {
+
+/// Merges the ranges [\p first1, \p last1) and [\p first2, \p last2) into the
+/// range beginning at \p result with a kernel that performs the whole merge
+/// in one sequential loop.
+///
+/// While both ranges have elements left, the current element of the first
+/// range is emitted when \p comp orders it before the current element of the
+/// second range; otherwise the element of the second range is emitted. The
+/// remainder of whichever range is left over is then copied.
+///
+/// Returns \p result advanced by the combined size of both input ranges.
+template<class InputIterator1,
+         class InputIterator2,
+         class OutputIterator,
+         class Compare>
+inline OutputIterator serial_merge(InputIterator1 first1,
+                                   InputIterator1 last1,
+                                   InputIterator2 first2,
+                                   InputIterator2 last2,
+                                   OutputIterator result,
+                                   Compare comp,
+                                   command_queue &queue)
+{
+    typedef typename
+        std::iterator_traits<InputIterator1>::value_type
+        input_type1;
+    typedef typename
+        std::iterator_traits<InputIterator2>::value_type
+        input_type2;
+    typedef typename
+        std::iterator_traits<OutputIterator>::difference_type
+        result_difference_type;
+
+    std::ptrdiff_t size1 = std::distance(first1, last1);
+    std::ptrdiff_t size2 = std::distance(first2, last2);
+
+    meta_kernel k("serial_merge");
+    k.add_set_arg<uint_>("size1", static_cast<uint_>(size1));
+    k.add_set_arg<uint_>("size2", static_cast<uint_>(size2));
+
+    k <<
+        "uint i = 0;\n" << // index in result range
+        "uint j = 0;\n" << // index in first input range
+        "uint k = 0;\n" << // index in second input range
+
+        // fetch initial values from each range; the fetch is skipped for an
+        // empty range so that it is never dereferenced
+        k.decl<input_type1>("j_value") << ";\n" <<
+        k.decl<input_type2>("k_value") << ";\n" <<
+        "if(size1 > 0){ j_value = " << first1[0] << "; }\n" <<
+        "if(size2 > 0){ k_value = " << first2[0] << "; }\n" <<
+
+        // merge values from both input ranges to the result range; the next
+        // value is only fetched while the index is still inside its range,
+        // which avoids reading one element past the end
+        "while(j < size1 && k < size2){\n" <<
+        "    if(" << comp(k.var<input_type1>("j_value"),
+                          k.var<input_type2>("k_value")) << "){\n" <<
+        "        " << result[k.var<uint_>("i++")] << " = j_value;\n" <<
+        "        if(++j < size1){ j_value = " << first1[k.var<uint_>("j")] << "; }\n" <<
+        "    }\n" <<
+        "    else{\n" <<
+        "        " << result[k.var<uint_>("i++")] << " = k_value;\n" <<
+        "        if(++k < size2){ k_value = " << first2[k.var<uint_>("k")] << "; }\n" <<
+        "    }\n" <<
+        "}\n" <<
+
+        // copy any remaining values from first range
+        "while(j < size1){\n" <<
+        result[k.var<uint_>("i++")] << " = " <<
+        first1[k.var<uint_>("j++")] << ";\n" <<
+        "}\n" <<
+
+        // copy any remaining values from second range
+        "while(k < size2){\n" <<
+        result[k.var<uint_>("i++")] << " = " <<
+        first2[k.var<uint_>("k++")] << ";\n" <<
+        "}\n";
+
+    // run kernel
+    k.exec(queue);
+
+    return result + static_cast<result_difference_type>(size1 + size2);
+}
+
+} // end detail namespace
+} // end compute namespace
+} // end boost namespace
+
+#endif // BOOST_COMPUTE_ALGORITHM_SERIAL_MERGE_HPP
diff --git a/boost/compute/algorithm/detail/serial_reduce.hpp b/boost/compute/algorithm/detail/serial_reduce.hpp
new file mode 100644
index 0000000000..53aaf140fe
--- /dev/null
+++ b/boost/compute/algorithm/detail/serial_reduce.hpp
@@ -0,0 +1,62 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_SERIAL_REDUCE_HPP
+#define BOOST_COMPUTE_ALGORITHM_DETAIL_SERIAL_REDUCE_HPP
+
+#include <boost/compute/command_queue.hpp>
+#include <boost/compute/detail/meta_kernel.hpp>
+#include <boost/compute/detail/iterator_range_size.hpp>
+#include <boost/compute/type_traits/result_of.hpp>
+
+namespace boost {
+namespace compute {
+namespace detail {
+
+/// Reduces the range [\p first, \p last) with a kernel run as a single task:
+/// the running value starts at the first element, is combined with each of
+/// the remaining elements in order using \p function, and is finally stored
+/// to the first element of \p result.
+///
+/// An empty range returns without running a kernel or writing to \p result.
+template<class InputIterator, class OutputIterator, class BinaryFunction>
+inline void serial_reduce(InputIterator first,
+                          InputIterator last,
+                          OutputIterator result,
+                          BinaryFunction function,
+                          command_queue &queue)
+{
+    typedef typename
+        std::iterator_traits<InputIterator>::value_type T;
+    typedef typename
+        ::boost::compute::result_of<BinaryFunction(T, T)>::type result_type;
+
+    const size_t element_count = detail::iterator_range_size(first, last);
+    if(element_count == 0){
+        return;
+    }
+
+    meta_kernel source("serial_reduce");
+    const size_t count_index = source.add_arg<cl_uint>("count");
+
+    // seed the running value with the first element
+    source << source.decl<result_type>("result") << " = " << first[0] << ";\n";
+
+    // fold in the remaining elements
+    source <<
+        "for(uint i = 1; i < count; i++)\n" <<
+        " result = " << function(source.var<T>("result"),
+                                 first[source.var<uint_>("i")]) << ";\n";
+
+    // publish the final value
+    source << result[0] << " = result;\n";
+
+    kernel reduce_kernel = source.compile(queue.get_context());
+    reduce_kernel.set_arg(count_index, static_cast<uint_>(element_count));
+
+    queue.enqueue_task(reduce_kernel);
+}
+
+} // end detail namespace
+} // end compute namespace
+} // end boost namespace
+
+#endif // BOOST_COMPUTE_ALGORITHM_DETAIL_SERIAL_REDUCE_HPP
diff --git a/boost/compute/algorithm/detail/serial_reduce_by_key.hpp b/boost/compute/algorithm/detail/serial_reduce_by_key.hpp
new file mode 100644
index 0000000000..f9bda8e476
--- /dev/null
+++ b/boost/compute/algorithm/detail/serial_reduce_by_key.hpp
@@ -0,0 +1,108 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2015 Jakub Szuppe <j.szuppe@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_SERIAL_REDUCE_BY_KEY_HPP
+#define BOOST_COMPUTE_ALGORITHM_DETAIL_SERIAL_REDUCE_BY_KEY_HPP
+
+#include <iterator>
+
+#include <boost/compute/command_queue.hpp>
+#include <boost/compute/functional.hpp>
+#include <boost/compute/container/vector.hpp>
+#include <boost/compute/container/detail/scalar.hpp>
+#include <boost/compute/detail/meta_kernel.hpp>
+#include <boost/compute/detail/iterator_range_size.hpp>
+#include <boost/compute/type_traits/result_of.hpp>
+
+namespace boost {
+namespace compute {
+namespace detail {
+
+/// Reduces runs of consecutive keys with a kernel run as a single task.
+///
+/// The kernel walks the keys in [\p keys_first, \p keys_last) together with
+/// the values starting at \p values_first. While \p predicate holds for the
+/// previous and the current key, the current value is folded into the running
+/// result with \p function; otherwise the previous key and the running result
+/// are stored to \p keys_result / \p values_result and a new run starts with
+/// the current value.
+///
+/// Returns the number of runs as read back from the device, or the element
+/// count itself (zero) when the key range is empty.
+template<class InputKeyIterator, class InputValueIterator,
+         class OutputKeyIterator, class OutputValueIterator,
+         class BinaryFunction, class BinaryPredicate>
+inline size_t serial_reduce_by_key(InputKeyIterator keys_first,
+                                   InputKeyIterator keys_last,
+                                   InputValueIterator values_first,
+                                   OutputKeyIterator keys_result,
+                                   OutputValueIterator values_result,
+                                   BinaryFunction function,
+                                   BinaryPredicate predicate,
+                                   command_queue &queue)
+{
+    typedef typename
+        std::iterator_traits<InputValueIterator>::value_type value_type;
+    typedef typename
+        std::iterator_traits<InputKeyIterator>::value_type key_type;
+    typedef typename
+        ::boost::compute::result_of<BinaryFunction(value_type, value_type)>::type result_type;
+
+    const context &context = queue.get_context();
+    const size_t element_count = detail::iterator_range_size(keys_first, keys_last);
+    if(element_count < 1){
+        return element_count;
+    }
+
+    meta_kernel source("serial_reduce_by_key");
+    const size_t count_index = source.add_arg<uint_>("count");
+    const size_t result_size_index =
+        source.add_arg<uint_ *>(memory_object::global_memory, "result_size");
+
+    // converts loaded values to the type produced by the reduction function
+    convert<result_type> to_result_type;
+
+    // state: running result, previous key, scratch value/key, run counter
+    source <<
+        source.decl<result_type>("result") <<
+            " = " << to_result_type(values_first[0]) << ";\n" <<
+        source.decl<key_type>("previous_key") << " = " << keys_first[0] << ";\n" <<
+        source.decl<result_type>("value") << ";\n" <<
+        source.decl<key_type>("key") << ";\n" <<
+        source.decl<uint_>("size") << " = 1;\n";
+
+    // the first run is stored up front
+    source <<
+        keys_result[0] << " = previous_key;\n" <<
+        values_result[0] << " = result;\n";
+
+    // walk the remaining elements
+    source <<
+        "for(ulong i = 1; i < count; i++) {\n" <<
+        " value = " << to_result_type(values_first[source.var<uint_>("i")]) << ";\n" <<
+        " key = " << keys_first[source.var<uint_>("i")] << ";\n" <<
+        " if (" << predicate(source.var<key_type>("previous_key"),
+                             source.var<key_type>("key")) << ") {\n";
+
+    // same run: fold the value into the running result
+    source <<
+        " result = " << function(source.var<result_type>("result"),
+                                 source.var<result_type>("value")) << ";\n" <<
+        " }\n " <<
+        " else { \n";
+
+    // new run: flush the finished run and restart from the current value
+    source <<
+        keys_result[source.var<uint_>("size - 1")] << " = previous_key;\n" <<
+        values_result[source.var<uint_>("size - 1")] << " = result;\n" <<
+        " result = value;\n" <<
+        " size++;\n" <<
+        " } \n" <<
+        " previous_key = key;\n" <<
+        "}\n";
+
+    // flush the last run and publish the number of runs
+    source <<
+        keys_result[source.var<uint_>("size - 1")] << " = previous_key;\n" <<
+        values_result[source.var<uint_>("size - 1")] << " = result;\n" <<
+        "*result_size = size;";
+
+    kernel reduce_kernel = source.compile(context);
+
+    scalar<uint_> run_count(context);
+    reduce_kernel.set_arg(result_size_index, run_count.get_buffer());
+    reduce_kernel.set_arg(count_index, static_cast<uint_>(element_count));
+
+    queue.enqueue_task(reduce_kernel);
+
+    return static_cast<size_t>(run_count.read(queue));
+}
+
+} // end detail namespace
+} // end compute namespace
+} // end boost namespace
+
+#endif // BOOST_COMPUTE_ALGORITHM_DETAIL_SERIAL_REDUCE_BY_KEY_HPP
diff --git a/boost/compute/algorithm/equal.hpp b/boost/compute/algorithm/equal.hpp
new file mode 100644
index 0000000000..35d0c5f0ea
--- /dev/null
+++ b/boost/compute/algorithm/equal.hpp
@@ -0,0 +1,53 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#ifndef BOOST_COMPUTE_ALGORITHM_EQUAL_HPP
+#define BOOST_COMPUTE_ALGORITHM_EQUAL_HPP
+
+#include <boost/compute/system.hpp>
+#include <boost/compute/command_queue.hpp>
+#include <boost/compute/algorithm/mismatch.hpp>
+
+namespace boost {
+namespace compute {
+
+/// Checks whether the range [\p first1, \p last1) matches, element for
+/// element, the range that starts at \p first2.
+///
+/// The comparison is delegated to mismatch(): the ranges are reported as
+/// equal when the first iterator of the pair it returns is \p last1.
+template<class InputIterator1, class InputIterator2>
+inline bool equal(InputIterator1 first1,
+                  InputIterator1 last1,
+                  InputIterator2 first2,
+                  command_queue &queue = system::default_queue())
+{
+    const bool reached_end =
+        (::boost::compute::mismatch(first1, last1, first2, queue).first == last1);
+
+    return reached_end;
+}
+
+/// \overload
+///
+/// Variant that also takes the end of the second range: ranges whose
+/// lengths differ are reported as unequal without comparing any element.
+template<class InputIterator1, class InputIterator2>
+inline bool equal(InputIterator1 first1,
+                  InputIterator1 last1,
+                  InputIterator2 first2,
+                  InputIterator2 last2,
+                  command_queue &queue = system::default_queue())
+{
+    const bool same_length =
+        std::distance(first1, last1) == std::distance(first2, last2);
+
+    // the element-wise comparison only runs for ranges of equal length
+    return same_length
+        && ::boost::compute::equal(first1, last1, first2, queue);
+}
+
+} // end compute namespace
+} // end boost namespace
+
+#endif // BOOST_COMPUTE_ALGORITHM_EQUAL_HPP
diff --git a/boost/compute/algorithm/equal_range.hpp b/boost/compute/algorithm/equal_range.hpp
new file mode 100644
index 0000000000..fd82177324
--- /dev/null
+++ b/boost/compute/algorithm/equal_range.hpp
@@ -0,0 +1,42 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#ifndef BOOST_COMPUTE_ALGORITHM_EQUAL_RANGE_HPP
+#define BOOST_COMPUTE_ALGORITHM_EQUAL_RANGE_HPP
+
+#include <utility>
+
+#include <boost/compute/system.hpp>
+#include <boost/compute/command_queue.hpp>
+#include <boost/compute/algorithm/lower_bound.hpp>
+#include <boost/compute/algorithm/upper_bound.hpp>
+
+namespace boost {
+namespace compute {
+
+/// Locates the sub-range of the sorted range [\p first, \p last) whose
+/// elements are equal to \p value.
+///
+/// \return a pair holding the result of lower_bound() as its first member
+///         and the result of upper_bound() as its second member
+template<class InputIterator, class T>
+inline std::pair<InputIterator, InputIterator>
+equal_range(InputIterator first,
+            InputIterator last,
+            const T &value,
+            command_queue &queue = system::default_queue())
+{
+    const InputIterator range_begin =
+        ::boost::compute::lower_bound(first, last, value, queue);
+    const InputIterator range_end =
+        ::boost::compute::upper_bound(first, last, value, queue);
+
+    return std::make_pair(range_begin, range_end);
+}
+
+} // end compute namespace
+} // end boost namespace
+
+#endif // BOOST_COMPUTE_ALGORITHM_EQUAL_RANGE_HPP
diff --git a/boost/compute/algorithm/exclusive_scan.hpp b/boost/compute/algorithm/exclusive_scan.hpp
new file mode 100644
index 0000000000..205d3de658
--- /dev/null
+++ b/boost/compute/algorithm/exclusive_scan.hpp
@@ -0,0 +1,96 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#ifndef BOOST_COMPUTE_ALGORITHM_EXCLUSIVE_SCAN_HPP
+#define BOOST_COMPUTE_ALGORITHM_EXCLUSIVE_SCAN_HPP
+
+#include <boost/compute/functional.hpp>
+#include <boost/compute/system.hpp>
+#include <boost/compute/command_queue.hpp>
+#include <boost/compute/algorithm/detail/scan.hpp>
+
+namespace boost {
+namespace compute {
+
+/// Performs an exclusive scan of the elements in the range [\p first, \p last)
+/// and stores the results in the range beginning at \p result.
+///
+/// Each element in the output is assigned to the sum of all the previous
+/// values in the input.
+///
+/// \param first first element in the range to scan
+/// \param last last element in the range to scan
+/// \param result first element in the result range
+/// \param init value used to initialize the scan sequence
+/// \param binary_op associative binary operator
+/// \param queue command queue to perform the operation
+///
+/// \return \c OutputIterator to the end of the result range
+///
+/// The default operation is to add the elements up.
+///
+/// \snippet test/test_scan.cpp exclusive_scan_int
+///
+/// A different associative operation can be specified as \p binary_op
+/// instead (e.g., multiplication, maximum, minimum). The value used to
+/// initialize the scan sequence can also be specified.
+///
+/// \snippet test/test_scan.cpp exclusive_scan_int_multiplies
+///
+/// \see inclusive_scan()
+template<class InputIterator, class OutputIterator, class T, class BinaryOperator>
+inline OutputIterator
+exclusive_scan(InputIterator first,
+               InputIterator last,
+               OutputIterator result,
+               T init,
+               BinaryOperator binary_op,
+               command_queue &queue = system::default_queue())
+{
+    // flag handed to the shared scan routine; exclusive_scan always passes true
+    const bool exclusive = true;
+
+    return detail::scan(first, last, result, exclusive, init, binary_op, queue);
+}
+
+/// \overload
+///
+/// Uses boost::compute::plus over the value type of \p result as the
+/// binary operation.
+template<class InputIterator, class OutputIterator, class T>
+inline OutputIterator
+exclusive_scan(InputIterator first,
+               InputIterator last,
+               OutputIterator result,
+               T init,
+               command_queue &queue = system::default_queue())
+{
+    typedef typename
+        std::iterator_traits<OutputIterator>::value_type result_value;
+
+    boost::compute::plus<result_value> add;
+
+    return detail::scan(first, last, result, true, init, add, queue);
+}
+
+/// \overload
+///
+/// Uses boost::compute::plus over the value type of \p result as the
+/// binary operation and a zero of that type as the initial value.
+template<class InputIterator, class OutputIterator>
+inline OutputIterator
+exclusive_scan(InputIterator first,
+               InputIterator last,
+               OutputIterator result,
+               command_queue &queue = system::default_queue())
+{
+    typedef typename
+        std::iterator_traits<OutputIterator>::value_type result_value;
+
+    const result_value zero = result_value(0);
+    boost::compute::plus<result_value> add;
+
+    return detail::scan(first, last, result, true, zero, add, queue);
+}
+
+} // end compute namespace
+} // end boost namespace
+
+#endif // BOOST_COMPUTE_ALGORITHM_EXCLUSIVE_SCAN_HPP
diff --git a/boost/compute/algorithm/fill.hpp b/boost/compute/algorithm/fill.hpp
new file mode 100644
index 0000000000..c711f46b94
--- /dev/null
+++ b/boost/compute/algorithm/fill.hpp
@@ -0,0 +1,306 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#ifndef BOOST_COMPUTE_ALGORITHM_FILL_HPP
+#define BOOST_COMPUTE_ALGORITHM_FILL_HPP
+
+#include <iterator>
+
+#include <boost/mpl/int.hpp>
+#include <boost/mpl/vector.hpp>
+#include <boost/mpl/contains.hpp>
+#include <boost/utility/enable_if.hpp>
+
+#include <boost/compute/cl.hpp>
+#include <boost/compute/system.hpp>
+#include <boost/compute/command_queue.hpp>
+#include <boost/compute/algorithm/copy.hpp>
+#include <boost/compute/async/future.hpp>
+#include <boost/compute/iterator/constant_iterator.hpp>
+#include <boost/compute/iterator/discard_iterator.hpp>
+#include <boost/compute/detail/is_buffer_iterator.hpp>
+#include <boost/compute/detail/iterator_range_size.hpp>
+
+namespace boost {
+namespace compute {
+namespace detail {
+
+namespace mpl = boost::mpl;
+
+// Fills [first, first + count) with value by copy()-ing from a range of
+// constant iterators spanning the indices 0 .. count.
+template<class BufferIterator, class T>
+inline void fill_with_copy(BufferIterator first,
+                           size_t count,
+                           const T &value,
+                           command_queue &queue)
+{
+    const constant_iterator<T> source_begin =
+        ::boost::compute::make_constant_iterator(value, 0);
+    const constant_iterator<T> source_end =
+        ::boost::compute::make_constant_iterator(value, count);
+
+    ::boost::compute::copy(source_begin, source_end, first, queue);
+}
+
+// Asynchronous counterpart of fill_with_copy(): fills [first, first + count)
+// with value through copy_async() and returns the future it produces.
+template<class BufferIterator, class T>
+inline future<void> fill_async_with_copy(BufferIterator first,
+                                         size_t count,
+                                         const T &value,
+                                         command_queue &queue)
+{
+    const constant_iterator<T> source_begin =
+        ::boost::compute::make_constant_iterator(value, 0);
+    const constant_iterator<T> source_end =
+        ::boost::compute::make_constant_iterator(value, count);
+
+    return ::boost::compute::copy_async(source_begin, source_end, first, queue);
+}
+
+#if defined(CL_VERSION_1_2)
+
+// meta-function returning true if Iterator points to a range of values
+// that can be filled using clEnqueueFillBuffer(). to meet this criterion
+// it must have a buffer accessible through iter.get_buffer() and the
+// size of its value_type must be in {1, 2, 4, 8, 16, 32, 64, 128}.
+//
+// the result is the conjunction (mpl::and_) of is_buffer_iterator<Iterator>
+// and a membership test of sizeof(value_type) in the list of sizes below.
+template<class Iterator>
+struct is_valid_fill_buffer_iterator :
+ public mpl::and_<
+ is_buffer_iterator<Iterator>,
+ mpl::contains<
+ mpl::vector<
+ mpl::int_<1>,
+ mpl::int_<2>,
+ mpl::int_<4>,
+ mpl::int_<8>,
+ mpl::int_<16>,
+ mpl::int_<32>,
+ mpl::int_<64>,
+ mpl::int_<128>
+ >,
+ mpl::int_<
+ sizeof(typename std::iterator_traits<Iterator>::value_type)
+ >
+ >
+ >::type { };
+
+// discard_iterator is explicitly reported as not fillable, so it is always
+// routed to the dispatch_fill() overloads selected by disable_if.
+template<>
+struct is_valid_fill_buffer_iterator<discard_iterator> : public boost::false_type {};
+
+// Overload for iterators accepted by is_valid_fill_buffer_iterator: fills
+// [first, first + count) with value using buffer commands on the queue,
+// falling back to fill_with_copy() when the device lacks OpenCL 1.2.
+template<class BufferIterator, class T>
+inline void
+dispatch_fill(BufferIterator first,
+              size_t count,
+              const T &value,
+              command_queue &queue,
+              typename boost::enable_if<
+                  is_valid_fill_buffer_iterator<BufferIterator>
+              >::type* = 0)
+{
+    typedef typename std::iterator_traits<BufferIterator>::value_type value_type;
+
+    // empty range: no command is enqueued
+    if(count == 0){
+        return;
+    }
+
+    // enqueue_fill_buffer requires OpenCL 1.2 support on the device
+    if(!queue.check_device_version(1, 2)){
+        fill_with_copy(first, count, value, queue);
+        return;
+    }
+
+    value_type fill_value = static_cast<value_type>(value);
+    const size_t byte_offset =
+        static_cast<size_t>(first.get_index()) * sizeof(value_type);
+
+    if(count != 1){
+        queue.enqueue_fill_buffer(
+            first.get_buffer(),
+            &fill_value,
+            sizeof(value_type),
+            byte_offset,
+            count * sizeof(value_type)
+        );
+        return;
+    }
+
+    // a single value is written with clEnqueueWriteBuffer() directly. this
+    // is potentially more efficient and also works around a bug in the
+    // intel opencl driver.
+    queue.enqueue_write_buffer(
+        first.get_buffer(),
+        byte_offset,
+        sizeof(value_type),
+        &fill_value
+    );
+}
+
+// Asynchronous overload for iterators accepted by
+// is_valid_fill_buffer_iterator: fills [first, first + count) with value
+// using enqueue_fill_buffer() and returns a future wrapping its event.
+//
+// An empty range enqueues nothing and yields a default-constructed future,
+// matching the synchronous dispatch_fill() and the svm_ptr overload.
+// Devices without OpenCL 1.2 fall back to fill_async_with_copy().
+template<class BufferIterator, class T>
+inline future<void>
+dispatch_fill_async(BufferIterator first,
+                    size_t count,
+                    const T &value,
+                    command_queue &queue,
+                    typename boost::enable_if<
+                        is_valid_fill_buffer_iterator<BufferIterator>
+                    >::type* = 0)
+{
+    typedef typename std::iterator_traits<BufferIterator>::value_type value_type;
+
+    if(count == 0){
+        // nothing to do
+        return future<void>();
+    }
+
+    // check if the device supports OpenCL 1.2 (required for enqueue_fill_buffer)
+    if(!queue.check_device_version(1, 2)){
+        return fill_async_with_copy(first, count, value, queue);
+    }
+
+    value_type pattern = static_cast<value_type>(value);
+    size_t offset = static_cast<size_t>(first.get_index());
+
+    event event_ =
+        queue.enqueue_fill_buffer(first.get_buffer(),
+                                  &pattern,
+                                  sizeof(value_type),
+                                  offset * sizeof(value_type),
+                                  count * sizeof(value_type));
+
+    return future<void>(event_);
+}
+
+#ifdef CL_VERSION_2_0
+// specializations for svm_ptr<T>
+// svm_ptr overload: fills count elements starting at first with value
+// through enqueue_svm_fill(); an empty range enqueues nothing.
+template<class T>
+inline void dispatch_fill(svm_ptr<T> first,
+                          size_t count,
+                          const T &value,
+                          command_queue &queue)
+{
+    if(count != 0){
+        queue.enqueue_svm_fill(
+            first.get(), &value, sizeof(T), count * sizeof(T)
+        );
+    }
+}
+
+// svm_ptr overload: enqueues an SVM fill of count elements starting at
+// first and returns a future wrapping the resulting event. An empty range
+// yields a default-constructed future.
+template<class T>
+inline future<void> dispatch_fill_async(svm_ptr<T> first,
+                                        size_t count,
+                                        const T &value,
+                                        command_queue &queue)
+{
+    if(count == 0){
+        return future<void>();
+    }
+
+    const size_t byte_count = count * sizeof(T);
+    event fill_done =
+        queue.enqueue_svm_fill(first.get(), &value, sizeof(T), byte_count);
+
+    return future<void>(fill_done);
+}
+#endif // CL_VERSION_2_0
+
+// default implementations
+// Overload selected when is_valid_fill_buffer_iterator is false for the
+// iterator: the range is filled by copying from constant iterators.
+template<class BufferIterator, class T>
+inline void
+dispatch_fill(BufferIterator first,
+              size_t count,
+              const T &value,
+              command_queue &queue,
+              typename boost::disable_if<
+                  is_valid_fill_buffer_iterator<BufferIterator>
+              >::type* = 0)
+{
+    // same operation as fill_with_copy(), written out in place
+    ::boost::compute::copy(
+        ::boost::compute::make_constant_iterator(value, 0),
+        ::boost::compute::make_constant_iterator(value, count),
+        first,
+        queue
+    );
+}
+
+// Asynchronous overload selected when is_valid_fill_buffer_iterator is
+// false for the iterator: the range is filled with copy_async() from
+// constant iterators.
+template<class BufferIterator, class T>
+inline future<void>
+dispatch_fill_async(BufferIterator first,
+                    size_t count,
+                    const T &value,
+                    command_queue &queue,
+                    typename boost::disable_if<
+                        is_valid_fill_buffer_iterator<BufferIterator>
+                    >::type* = 0)
+{
+    // same operation as fill_async_with_copy(), written out in place
+    return ::boost::compute::copy_async(
+        ::boost::compute::make_constant_iterator(value, 0),
+        ::boost::compute::make_constant_iterator(value, count),
+        first,
+        queue
+    );
+}
+#else
+// Only definition of dispatch_fill() when CL_VERSION_1_2 is not defined:
+// every iterator type is filled through fill_with_copy().
+template<class BufferIterator, class T>
+inline void dispatch_fill(BufferIterator first,
+                          size_t count,
+                          const T &value,
+                          command_queue &queue)
+{
+    fill_with_copy(first, count, value, queue);
+}
+
+// Only definition of dispatch_fill_async() when CL_VERSION_1_2 is not
+// defined: every iterator type is filled through fill_async_with_copy().
+template<class BufferIterator, class T>
+inline future<void> dispatch_fill_async(BufferIterator first,
+                                        size_t count,
+                                        const T &value,
+                                        command_queue &queue)
+{
+    return fill_async_with_copy(first, count, value, queue);
+}
+#endif // !defined(CL_VERSION_1_2)
+
+} // end detail namespace
+
+/// Fills the range [\p first, \p last) with \p value.
+///
+/// \param first first element in the range to fill
+/// \param last last element in the range to fill
+/// \param value value to copy to each element
+/// \param queue command queue to perform the operation
+///
+/// For example, to fill a vector on the device with sevens:
+/// \code
+/// // vector on the device
+/// boost::compute::vector<int> vec(10, context);
+///
+/// // fill vector with sevens
+/// boost::compute::fill(vec.begin(), vec.end(), 7, queue);
+/// \endcode
+///
+/// \see boost::compute::fill_n()
+template<class BufferIterator, class T>
+inline void fill(BufferIterator first,
+                 BufferIterator last,
+                 const T &value,
+                 command_queue &queue = system::default_queue())
+{
+    const size_t element_count = detail::iterator_range_size(first, last);
+
+    // an empty range is left untouched and nothing is dispatched
+    if(element_count != 0){
+        detail::dispatch_fill(first, element_count, value, queue);
+    }
+}
+
+/// Asynchronous variant of fill(): fills the range [\p first, \p last)
+/// with \p value through detail::dispatch_fill_async().
+///
+/// \return the future produced by the dispatched fill, or a
+///         default-constructed future when the range is empty
+template<class BufferIterator, class T>
+inline future<void> fill_async(BufferIterator first,
+                               BufferIterator last,
+                               const T &value,
+                               command_queue &queue = system::default_queue())
+{
+    const size_t element_count = detail::iterator_range_size(first, last);
+
+    if(element_count != 0){
+        return detail::dispatch_fill_async(first, element_count, value, queue);
+    }
+
+    return future<void>();
+}
+
+} // end compute namespace
+} // end boost namespace
+
+#endif // BOOST_COMPUTE_ALGORITHM_FILL_HPP
diff --git a/boost/compute/algorithm/fill_n.hpp b/boost/compute/algorithm/fill_n.hpp
new file mode 100644
index 0000000000..18a8f706a5
--- /dev/null
+++ b/boost/compute/algorithm/fill_n.hpp
@@ -0,0 +1,36 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#ifndef BOOST_COMPUTE_ALGORITHM_FILL_N_HPP
+#define BOOST_COMPUTE_ALGORITHM_FILL_N_HPP
+
+#include <boost/compute/system.hpp>
+#include <boost/compute/command_queue.hpp>
+#include <boost/compute/algorithm/fill.hpp>
+
+namespace boost {
+namespace compute {
+
+/// Assigns \p value to the \p count elements starting at \p first, i.e. to
+/// the range [\p first, \p first + count), by forwarding to fill().
+///
+/// \see fill()
+template<class BufferIterator, class Size, class T>
+inline void fill_n(BufferIterator first,
+                   Size count,
+                   const T &value,
+                   command_queue &queue = system::default_queue())
+{
+    const BufferIterator last = first + count;
+
+    ::boost::compute::fill(first, last, value, queue);
+}
+
+} // end compute namespace
+} // end boost namespace
+
+#endif // BOOST_COMPUTE_ALGORITHM_FILL_N_HPP
diff --git a/boost/compute/algorithm/find.hpp b/boost/compute/algorithm/find.hpp
new file mode 100644
index 0000000000..ef3ebf0c47
--- /dev/null
+++ b/boost/compute/algorithm/find.hpp
@@ -0,0 +1,57 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#ifndef BOOST_COMPUTE_ALGORITHM_FIND_HPP
+#define BOOST_COMPUTE_ALGORITHM_FIND_HPP
+
+#include <boost/compute/lambda.hpp>
+#include <boost/compute/system.hpp>
+#include <boost/compute/command_queue.hpp>
+#include <boost/compute/algorithm/find_if.hpp>
+#include <boost/compute/type_traits/vector_size.hpp>
+
+namespace boost {
+namespace compute {
+
+/// Searches the range [\p first, \p last) for \p value and returns the
+/// iterator produced by find_if() for an equality predicate.
+///
+/// When the element type has a vector size other than 1, the comparison
+/// is wrapped in lambda::all().
+template<class InputIterator, class T>
+inline InputIterator find(InputIterator first,
+                          InputIterator last,
+                          const T &value,
+                          command_queue &queue = system::default_queue())
+{
+    typedef typename std::iterator_traits<InputIterator>::value_type value_type;
+
+    using ::boost::compute::_1;
+    using ::boost::compute::lambda::all;
+
+    // vector-valued elements: wrap the comparison in all()
+    if(vector_size<value_type>::value != 1){
+        return ::boost::compute::find_if(first, last, all(_1 == value), queue);
+    }
+
+    // scalar elements: plain equality
+    return ::boost::compute::find_if(first, last, _1 == value, queue);
+}
+
+} // end compute namespace
+} // end boost namespace
+
+#endif // BOOST_COMPUTE_ALGORITHM_FIND_HPP
diff --git a/boost/compute/algorithm/find_end.hpp b/boost/compute/algorithm/find_end.hpp
new file mode 100644
index 0000000000..5c40055113
--- /dev/null
+++ b/boost/compute/algorithm/find_end.hpp
@@ -0,0 +1,119 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2014 Roshan <thisisroshansmail@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#ifndef BOOST_COMPUTE_ALGORITHM_FIND_END_HPP
+#define BOOST_COMPUTE_ALGORITHM_FIND_END_HPP
+
+#include <boost/compute/algorithm/copy.hpp>
+#include <boost/compute/algorithm/detail/search_all.hpp>
+#include <boost/compute/container/detail/scalar.hpp>
+#include <boost/compute/container/vector.hpp>
+#include <boost/compute/detail/iterator_range_size.hpp>
+#include <boost/compute/detail/meta_kernel.hpp>
+#include <boost/compute/system.hpp>
+
+namespace boost {
+namespace compute {
+namespace detail {
+
+///
+/// \brief Helper function for find_end
+///
+/// Returns an iterator to the last element of [\p first, \p last) for
+/// which \p predicate holds, or \p last when the range is empty or no
+/// element satisfies it.
+///
+/// One work-item is launched per element; each one that satisfies the
+/// predicate folds its own index into a shared value with atomic_max.
+/// That value is initialized to -1, which therefore means "not found".
+///
+template<class InputIterator, class UnaryPredicate>
+inline InputIterator find_end_helper(InputIterator first,
+                                     InputIterator last,
+                                     UnaryPredicate predicate,
+                                     command_queue &queue)
+{
+    typedef typename std::iterator_traits<InputIterator>::value_type value_type;
+
+    const size_t element_count = detail::iterator_range_size(first, last);
+    if(element_count == 0){
+        return last;
+    }
+
+    const context &ctx = queue.get_context();
+
+    // build the kernel source
+    detail::meta_kernel k("find_end");
+    size_t index_arg = k.add_arg<int *>(memory_object::global_memory, "index");
+    atomic_max<int_> atomic_max_int;
+
+    k << k.decl<const int_>("i") << " = get_global_id(0);\n"
+      << k.decl<const value_type>("value") << "="
+      << first[k.var<const int_>("i")] << ";\n"
+      << "if(" << predicate(k.var<const value_type>("value")) << "){\n"
+      << " " << atomic_max_int(k.var<int_ *>("index"), k.var<int_>("i")) << ";\n"
+      << "}\n";
+
+    kernel find_end_kernel = k.compile(ctx);
+
+    // device-side slot receiving the largest matching index
+    scalar<int_> last_match(ctx);
+    find_end_kernel.set_arg(index_arg, last_match.get_buffer());
+    last_match.write(static_cast<int_>(-1), queue);
+
+    queue.enqueue_1d_range_kernel(find_end_kernel, 0, element_count, 0);
+
+    const int found_at = static_cast<int>(last_match.read(queue));
+    if(found_at != -1){
+        return first + found_at;
+    }
+
+    return last;
+}
+
+} // end detail namespace
+
+///
+/// \brief Substring matching algorithm
+///
+/// Searches for the last match of the pattern [p_first, p_last)
+/// in text [t_first, t_last).
+/// \return Iterator pointing to beginning of last occurrence, or \p t_last
+/// if the pattern is empty, is longer than the text, or does not occur
+///
+/// \param t_first Iterator pointing to start of text
+/// \param t_last Iterator pointing to end of text
+/// \param p_first Iterator pointing to start of pattern
+/// \param p_last Iterator pointing to end of pattern
+/// \param queue Queue on which to execute
+///
+template<class TextIterator, class PatternIterator>
+inline TextIterator find_end(TextIterator t_first,
+                             TextIterator t_last,
+                             PatternIterator p_first,
+                             PatternIterator p_last,
+                             command_queue &queue = system::default_queue())
+{
+    const size_t t_count = detail::iterator_range_size(t_first, t_last);
+    const size_t p_count = detail::iterator_range_size(p_first, p_last);
+
+    // no match is possible: an empty pattern yields t_last (as with
+    // std::find_end) and so does a pattern longer than the text. this also
+    // keeps "t_count - p_count" below from wrapping around.
+    if(p_count == 0 || p_count > t_count){
+        return t_last;
+    }
+
+    const context &context = queue.get_context();
+    vector<uint_> matching_indices(t_count, context);
+
+    detail::search_kernel<PatternIterator,
+                          TextIterator,
+                          vector<uint_>::iterator> kernel;
+
+    kernel.set_range(p_first, p_last, t_first, t_last, matching_indices.begin());
+    kernel.exec(queue);
+
+    using boost::compute::_1;
+
+    // a match of p_count elements can only begin at one of the first
+    // t_count - p_count + 1 positions, so only those flags are examined.
+    // the remaining slots cannot describe a genuine match and are skipped
+    // rather than trusting their contents.
+    // NOTE(review): search_kernel is defined elsewhere; confirm which
+    // slots of the output it actually writes.
+    const vector<uint_>::iterator scan_end =
+        matching_indices.begin() + (t_count - p_count + 1);
+
+    vector<uint_>::iterator index =
+        detail::find_end_helper(matching_indices.begin(),
+                                scan_end,
+                                _1 == 1,
+                                queue);
+
+    // find_end_helper returns the end of the scanned range when no flag is set
+    if(index == scan_end){
+        return t_last;
+    }
+
+    return t_first + detail::iterator_range_size(matching_indices.begin(), index);
+}
+
+} //end compute namespace
+} //end boost namespace
+
+#endif // BOOST_COMPUTE_ALGORITHM_FIND_END_HPP
diff --git a/boost/compute/algorithm/find_if.hpp b/boost/compute/algorithm/find_if.hpp
new file mode 100644
index 0000000000..db99cc0396
--- /dev/null
+++ b/boost/compute/algorithm/find_if.hpp
@@ -0,0 +1,35 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#ifndef BOOST_COMPUTE_ALGORITHM_FIND_IF_HPP
+#define BOOST_COMPUTE_ALGORITHM_FIND_IF_HPP
+
+#include <boost/compute/system.hpp>
+#include <boost/compute/command_queue.hpp>
+#include <boost/compute/algorithm/detail/find_if_with_atomics.hpp>
+
+namespace boost {
+namespace compute {
+
+/// Searches the range [\p first, \p last) for the first element for which
+/// \p predicate returns \c true and returns an iterator pointing to it.
+///
+/// The search is carried out by detail::find_if_with_atomics().
+template<class InputIterator, class UnaryPredicate>
+inline InputIterator find_if(InputIterator first,
+                             InputIterator last,
+                             UnaryPredicate predicate,
+                             command_queue &queue = system::default_queue())
+{
+    const InputIterator match =
+        detail::find_if_with_atomics(first, last, predicate, queue);
+
+    return match;
+}
+
+} // end compute namespace
+} // end boost namespace
+
+#endif // BOOST_COMPUTE_ALGORITHM_FIND_IF_HPP
diff --git a/boost/compute/algorithm/find_if_not.hpp b/boost/compute/algorithm/find_if_not.hpp
new file mode 100644
index 0000000000..61de050d31
--- /dev/null
+++ b/boost/compute/algorithm/find_if_not.hpp
@@ -0,0 +1,43 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#ifndef BOOST_COMPUTE_ALGORITHM_FIND_IF_NOT_HPP
+#define BOOST_COMPUTE_ALGORITHM_FIND_IF_NOT_HPP
+
+#include <boost/compute/system.hpp>
+#include <boost/compute/functional.hpp>
+#include <boost/compute/command_queue.hpp>
+#include <boost/compute/algorithm/find_if.hpp>
+
+namespace boost {
+namespace compute {
+
+/// Searches the range [\p first, \p last) for the first element for which
+/// \p predicate returns \c false and returns an iterator pointing to it.
+///
+/// Implemented as find_if() applied to the negated predicate.
+///
+/// \see find_if()
+template<class InputIterator, class UnaryPredicate>
+inline InputIterator find_if_not(InputIterator first,
+                                 InputIterator last,
+                                 UnaryPredicate predicate,
+                                 command_queue &queue = system::default_queue())
+{
+    // not1() wraps the predicate so that its result is inverted
+    return ::boost::compute::find_if(first, last, not1(predicate), queue);
+}
+
+} // end compute namespace
+} // end boost namespace
+
+#endif // BOOST_COMPUTE_ALGORITHM_FIND_IF_NOT_HPP
diff --git a/boost/compute/algorithm/for_each.hpp b/boost/compute/algorithm/for_each.hpp
new file mode 100644
index 0000000000..3ed399e6e9
--- /dev/null
+++ b/boost/compute/algorithm/for_each.hpp
@@ -0,0 +1,65 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#ifndef BOOST_COMPUTE_ALGORITHM_FOR_EACH_HPP
+#define BOOST_COMPUTE_ALGORITHM_FOR_EACH_HPP
+
+#include <boost/compute/system.hpp>
+#include <boost/compute/command_queue.hpp>
+#include <boost/compute/detail/meta_kernel.hpp>
+#include <boost/compute/detail/iterator_range_size.hpp>
+
+namespace boost {
+namespace compute {
+namespace detail {
+
+// Meta-kernel named "for_each" whose source applies a function to the
+// element of the range selected by get_global_id(0).
+template<class InputIterator, class Function>
+struct for_each_kernel : public meta_kernel
+{
+    // Records the size of [first, last) and generates the kernel source.
+    for_each_kernel(InputIterator first, InputIterator last, Function function)
+        : meta_kernel("for_each")
+    {
+        // store range size
+        m_count = detail::iterator_range_size(first, last);
+
+        // setup kernel source
+        *this << function(first[get_global_id(0)]) << ";\n";
+    }
+
+    // Runs the kernel over the stored range on the given queue.
+    void exec(command_queue &queue)
+    {
+        // an empty range has no work-items to launch, so nothing is
+        // enqueued (same guard as gather_kernel::exec)
+        if(m_count == 0){
+            return;
+        }
+
+        exec_1d(queue, 0, m_count);
+    }
+
+    // number of elements in the range given to the constructor
+    size_t m_count;
+};
+
+} // end detail namespace
+
+/// Applies \p function to every element of the range [\p first, \p last)
+/// by building and executing a detail::for_each_kernel on \p queue.
+///
+/// \return \p function
+///
+/// \see transform()
+template<class InputIterator, class UnaryFunction>
+inline UnaryFunction for_each(InputIterator first,
+                              InputIterator last,
+                              UnaryFunction function,
+                              command_queue &queue = system::default_queue())
+{
+    typedef detail::for_each_kernel<InputIterator, UnaryFunction> kernel_type;
+
+    kernel_type apply_kernel(first, last, function);
+    apply_kernel.exec(queue);
+
+    return function;
+}
+
+} // end compute namespace
+} // end boost namespace
+
+#endif // BOOST_COMPUTE_ALGORITHM_FOR_EACH_HPP
diff --git a/boost/compute/algorithm/for_each_n.hpp b/boost/compute/algorithm/for_each_n.hpp
new file mode 100644
index 0000000000..d0be784bf7
--- /dev/null
+++ b/boost/compute/algorithm/for_each_n.hpp
@@ -0,0 +1,35 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013-2014 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#ifndef BOOST_COMPUTE_ALGORITHM_FOR_EACH_N_HPP
+#define BOOST_COMPUTE_ALGORITHM_FOR_EACH_N_HPP
+
+#include <boost/compute/algorithm/for_each.hpp>
+
+namespace boost {
+namespace compute {
+
+/// Applies \p function to the \p count elements starting at \p first, i.e.
+/// to the range [\p first, \p first \c + \p count), by forwarding to
+/// for_each().
+///
+/// \see for_each()
+template<class InputIterator, class Size, class UnaryFunction>
+inline UnaryFunction for_each_n(InputIterator first,
+                                Size count,
+                                UnaryFunction function,
+                                command_queue &queue = system::default_queue())
+{
+    const InputIterator last = first + count;
+
+    return ::boost::compute::for_each(first, last, function, queue);
+}
+
+} // end compute namespace
+} // end boost namespace
+
+#endif // BOOST_COMPUTE_ALGORITHM_FOR_EACH_N_HPP
diff --git a/boost/compute/algorithm/gather.hpp b/boost/compute/algorithm/gather.hpp
new file mode 100644
index 0000000000..b2f725d54e
--- /dev/null
+++ b/boost/compute/algorithm/gather.hpp
@@ -0,0 +1,84 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#ifndef BOOST_COMPUTE_ALGORITHM_GATHER_HPP
+#define BOOST_COMPUTE_ALGORITHM_GATHER_HPP
+
+#include <boost/compute/command_queue.hpp>
+#include <boost/compute/detail/iterator_range_size.hpp>
+#include <boost/compute/detail/meta_kernel.hpp>
+#include <boost/compute/exception.hpp>
+#include <boost/compute/iterator/buffer_iterator.hpp>
+#include <boost/compute/system.hpp>
+#include <boost/compute/type_traits/type_name.hpp>
+
+namespace boost {
+namespace compute {
+namespace detail {
+
+// Meta-kernel named "gather": for each global id i it assigns
+// input[first[i]] to result[i], where first is the start of the map range.
+template<class InputIterator, class MapIterator, class OutputIterator>
+class gather_kernel : public meta_kernel
+{
+public:
+    // The range starts out empty so that exec() is a no-op (and reads no
+    // indeterminate members) until set_range() has been called.
+    gather_kernel() : meta_kernel("gather"), m_count(0), m_offset(0)
+    {}
+
+    // Records the size and start index of the map range [first, last) and
+    // generates the kernel source.
+    void set_range(MapIterator first,
+                   MapIterator last,
+                   InputIterator input,
+                   OutputIterator result)
+    {
+        m_count = iterator_range_size(first, last);
+        m_offset = first.get_index();
+
+        *this <<
+            "const uint i = get_global_id(0);\n" <<
+            result[expr<uint_>("i")] << "=" <<
+                input[first[expr<uint_>("i")]] << ";\n";
+    }
+
+    // Runs the kernel on the queue; returns a default-constructed event
+    // without enqueuing anything when the map range is empty.
+    event exec(command_queue &queue)
+    {
+        if(m_count == 0) {
+            return event();
+        }
+
+        return exec_1d(queue, m_offset, m_count);
+    }
+
+private:
+    size_t m_count;  // number of elements in the map range
+    size_t m_offset; // index of the first map element, passed to exec_1d
+};
+
+} // end detail namespace
+
+/// Reads the indices stored in the range [\p first, \p last) and, for each
+/// of them, copies the corresponding value from the range beginning at
+/// \p input to the range beginning at \p result.
+///
+/// \see scatter()
+template<class InputIterator, class MapIterator, class OutputIterator>
+inline void gather(MapIterator first,
+                   MapIterator last,
+                   InputIterator input,
+                   OutputIterator result,
+                   command_queue &queue = system::default_queue())
+{
+    typedef detail::gather_kernel<
+        InputIterator, MapIterator, OutputIterator
+    > kernel_type;
+
+    kernel_type gather_op;
+    gather_op.set_range(first, last, input, result);
+    gather_op.exec(queue);
+}
+
+} // end compute namespace
+} // end boost namespace
+
+#endif // BOOST_COMPUTE_ALGORITHM_GATHER_HPP
diff --git a/boost/compute/algorithm/generate.hpp b/boost/compute/algorithm/generate.hpp
new file mode 100644
index 0000000000..c70a542683
--- /dev/null
+++ b/boost/compute/algorithm/generate.hpp
@@ -0,0 +1,49 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#ifndef BOOST_COMPUTE_ALGORITHM_GENERATE_HPP
+#define BOOST_COMPUTE_ALGORITHM_GENERATE_HPP
+
+#include <boost/compute/system.hpp>
+#include <boost/compute/command_queue.hpp>
+#include <boost/compute/algorithm/copy.hpp>
+#include <boost/compute/iterator/function_input_iterator.hpp>
+#include <boost/compute/detail/iterator_range_size.hpp>
+
+namespace boost {
+namespace compute {
+
+/// Fills the range [\p first, \p last) with values produced by
+/// \p generator, one call result per element.
+template<class OutputIterator, class Generator>
+inline void generate(OutputIterator first,
+                     OutputIterator last,
+                     Generator generator,
+                     command_queue &queue = system::default_queue())
+{
+    // an empty range needs no work at all
+    if(detail::iterator_range_size(first, last) == 0){
+        return;
+    }
+
+    // Copy from a pair of function input iterators wrapping the generator;
+    // their positions mirror the indices of the output range.
+    ::boost::compute::copy(
+        ::boost::compute::make_function_input_iterator(
+            generator, first.get_index()
+        ),
+        ::boost::compute::make_function_input_iterator(
+            generator, last.get_index()
+        ),
+        first,
+        queue
+    );
+}
+
+} // end compute namespace
+} // end boost namespace
+
+#endif // BOOST_COMPUTE_ALGORITHM_GENERATE_HPP
diff --git a/boost/compute/algorithm/generate_n.hpp b/boost/compute/algorithm/generate_n.hpp
new file mode 100644
index 0000000000..6d8e607b64
--- /dev/null
+++ b/boost/compute/algorithm/generate_n.hpp
@@ -0,0 +1,35 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#ifndef BOOST_COMPUTE_ALGORITHM_GENERATE_N_HPP
+#define BOOST_COMPUTE_ALGORITHM_GENERATE_N_HPP
+
+#include <boost/compute/system.hpp>
+#include <boost/compute/command_queue.hpp>
+#include <boost/compute/algorithm/generate.hpp>
+
+namespace boost {
+namespace compute {
+
+/// Fills the \p count elements starting at \p first, i.e. the range
+/// [\p first, \p first + \p count), with values produced by \p generator.
+template<class OutputIterator, class Size, class Generator>
+inline void generate_n(OutputIterator first,
+                       Size count,
+                       Generator generator,
+                       command_queue &queue = system::default_queue())
+{
+    // forward to the range-based overload
+    const OutputIterator last = first + count;
+
+    ::boost::compute::generate(first, last, generator, queue);
+}
+
+} // end compute namespace
+} // end boost namespace
+
+#endif // BOOST_COMPUTE_ALGORITHM_GENERATE_N_HPP
diff --git a/boost/compute/algorithm/includes.hpp b/boost/compute/algorithm/includes.hpp
new file mode 100644
index 0000000000..c4e7c793e7
--- /dev/null
+++ b/boost/compute/algorithm/includes.hpp
@@ -0,0 +1,155 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2014 Roshan <thisisroshansmail@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#ifndef BOOST_COMPUTE_ALGORITHM_INCLUDES_HPP
+#define BOOST_COMPUTE_ALGORITHM_INCLUDES_HPP
+
+#include <iterator>
+
+#include <boost/compute/algorithm/detail/balanced_path.hpp>
+#include <boost/compute/algorithm/fill_n.hpp>
+#include <boost/compute/algorithm/find.hpp>
+#include <boost/compute/container/vector.hpp>
+#include <boost/compute/detail/iterator_range_size.hpp>
+#include <boost/compute/detail/meta_kernel.hpp>
+#include <boost/compute/detail/read_write_single_value.hpp>
+#include <boost/compute/system.hpp>
+
+namespace boost {
+namespace compute {
+namespace detail {
+
+///
+/// \brief Serial includes kernel class
+///
+/// Subclass of meta_kernel to perform includes operation after tiling.
+/// Work-item \c i walks the i-th tile of both ranges serially and stores 1
+/// in the result if every element of its tile of the second range was
+/// matched in its tile of the first range, 0 otherwise.
+///
+class serial_includes_kernel : meta_kernel
+{
+public:
+
+    // m_count starts at zero so exec() is a well-defined no-op when it is
+    // called before set_range().
+    serial_includes_kernel() : meta_kernel("includes"), m_count(0)
+    {
+
+    }
+
+    /// Appends the kernel source for the per-tile check.
+    ///
+    /// \param first1 start of the first (containing) range
+    /// \param first2 start of the second (contained) range
+    /// \param tile_first1 start of the tile boundaries of the first range
+    /// \param tile_last1 end of the tile boundaries of the first range
+    /// \param tile_first2 start of the tile boundaries of the second range
+    /// \param result start of the per-tile output flags
+    template<class InputIterator1, class InputIterator2,
+             class InputIterator3, class InputIterator4,
+             class OutputIterator>
+    void set_range(InputIterator1 first1,
+                   InputIterator2 first2,
+                   InputIterator3 tile_first1,
+                   InputIterator3 tile_last1,
+                   InputIterator4 tile_first2,
+                   OutputIterator result)
+    {
+        // N boundary entries delimit N-1 tiles; guard the subtraction so an
+        // empty boundary range cannot wrap the unsigned count around.
+        const size_t boundary_count =
+            iterator_range_size(tile_first1, tile_last1);
+        m_count = boundary_count > 0 ? boundary_count - 1 : 0;
+
+        *this <<
+        "uint i = get_global_id(0);\n" <<
+        "uint start1 = " << tile_first1[expr<uint_>("i")] << ";\n" <<
+        "uint end1 = " << tile_first1[expr<uint_>("i+1")] << ";\n" <<
+        "uint start2 = " << tile_first2[expr<uint_>("i")] << ";\n" <<
+        "uint end2 = " << tile_first2[expr<uint_>("i+1")] << ";\n" <<
+        "uint includes = 1;\n" <<
+        "while(start1<end1 && start2<end2)\n" <<
+        "{\n" <<
+        " if(" << first1[expr<uint_>("start1")] << " == " <<
+        first2[expr<uint_>("start2")] << ")\n" <<
+        " {\n" <<
+        " start1++; start2++;\n" <<
+        " }\n" <<
+        " else if(" << first1[expr<uint_>("start1")] << " < " <<
+        first2[expr<uint_>("start2")] << ")\n" <<
+        " start1++;\n" <<
+        " else\n" <<
+        " {\n" <<
+        " includes = 0;\n" <<
+        " break;\n" <<
+        " }\n" <<
+        "}\n" <<
+        // elements left over in the second tile were never matched
+        "if(start2<end2)\n" <<
+        " includes = 0;\n" <<
+        result[expr<uint_>("i")] << " = includes;\n";
+    }
+
+    /// Enqueues one work-item per tile on \p queue. Returns a
+    /// default-constructed event without enqueuing anything when there are
+    /// no tiles.
+    event exec(command_queue &queue)
+    {
+        if(m_count == 0) {
+            return event();
+        }
+
+        return exec_1d(queue, 0, m_count);
+    }
+
+private:
+    size_t m_count; // number of tiles, i.e. work-items to launch
+};
+
+} //end detail namespace
+
+///
+/// \brief Includes algorithm
+///
+/// Finds if the sorted range [first1, last1) includes the sorted
+/// range [first2, last2). In other words, it checks if [first1, last1) is
+/// a superset of [first2, last2).
+///
+/// An empty second range is included in every range; in that case \c true
+/// is returned immediately without enqueuing any work on \p queue.
+///
+/// \return True, if [first1, last1) includes [first2, last2). False otherwise.
+///
+/// \param first1 Iterator pointing to start of first set
+/// \param last1 Iterator pointing to end of first set
+/// \param first2 Iterator pointing to start of second set
+/// \param last2 Iterator pointing to end of second set
+/// \param queue Queue on which to execute
+///
+template<class InputIterator1, class InputIterator2>
+inline bool includes(InputIterator1 first1,
+                     InputIterator1 last1,
+                     InputIterator2 first2,
+                     InputIterator2 last2,
+                     command_queue &queue = system::default_queue())
+{
+    const size_t tile_size = 1024;
+
+    const size_t count1 = detail::iterator_range_size(first1, last1);
+    const size_t count2 = detail::iterator_range_size(first2, last2);
+
+    // Nothing has to be found, so skip the buffer allocations and kernels.
+    if(count2 == 0) {
+        return true;
+    }
+
+    // number of tiles the combined input is split into (rounded up)
+    const size_t tile_count = (count1 + count2 + tile_size - 1) / tile_size;
+
+    // tile boundaries: one leading entry plus one entry per tile
+    vector<uint_> tile_a(tile_count + 1, queue.get_context());
+    vector<uint_> tile_b(tile_count + 1, queue.get_context());
+
+    // Tile the sets
+    detail::balanced_path_kernel tiling_kernel;
+    tiling_kernel.tile_size = static_cast<unsigned int>(tile_size);
+    tiling_kernel.set_range(first1, last1, first2, last2,
+                            tile_a.begin()+1, tile_b.begin()+1);
+    // the first boundary of both ranges is always 0
+    fill_n(tile_a.begin(), 1, uint_(0), queue);
+    fill_n(tile_b.begin(), 1, uint_(0), queue);
+    tiling_kernel.exec(queue);
+
+    // the last boundary of each range is its element count
+    fill_n(tile_a.end()-1, 1, static_cast<uint_>(count1), queue);
+    fill_n(tile_b.end()-1, 1, static_cast<uint_>(count2), queue);
+
+    // one flag per tile, written by the serial kernel
+    vector<uint_> result(tile_count, queue.get_context());
+
+    // Find individually
+    detail::serial_includes_kernel includes_kernel;
+    includes_kernel.set_range(first1, first2, tile_a.begin(), tile_a.end(),
+                              tile_b.begin(), result.begin());
+
+    includes_kernel.exec(queue);
+
+    // included only if no tile reported a 0 flag
+    return find(result.begin(), result.end(), 0, queue) == result.end();
+}
+
+} //end compute namespace
+} //end boost namespace
+
+#endif // BOOST_COMPUTE_ALGORITHM_INCLUDES_HPP
diff --git a/boost/compute/algorithm/inclusive_scan.hpp b/boost/compute/algorithm/inclusive_scan.hpp
new file mode 100644
index 0000000000..9f98beaf7c
--- /dev/null
+++ b/boost/compute/algorithm/inclusive_scan.hpp
@@ -0,0 +1,81 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#ifndef BOOST_COMPUTE_ALGORITHM_INCLUSIVE_SCAN_HPP
+#define BOOST_COMPUTE_ALGORITHM_INCLUSIVE_SCAN_HPP
+
+#include <boost/compute/functional.hpp>
+#include <boost/compute/system.hpp>
+#include <boost/compute/command_queue.hpp>
+#include <boost/compute/algorithm/detail/scan.hpp>
+
+namespace boost {
+namespace compute {
+
+/// Computes an inclusive scan (running reduction) over the range
+/// [\p first, \p last) and writes it to the range beginning at \p result.
+///
+/// Every output element is the combination, using \p binary_op, of the
+/// input element at the same position with all input elements before it.
+///
+/// \param first first element in the range to scan
+/// \param last last element in the range to scan
+/// \param result first element in the result range
+/// \param binary_op associative binary operator
+/// \param queue command queue to perform the operation
+///
+/// \return \c OutputIterator to the end of the result range
+///
+/// When no operator is given the elements are added up:
+///
+/// \snippet test/test_scan.cpp inclusive_scan_int
+///
+/// Any other associative operation (e.g., multiplication, maximum,
+/// minimum) can be passed as \p binary_op instead:
+///
+/// \snippet test/test_scan.cpp inclusive_scan_int_multiplies
+///
+/// \see exclusive_scan()
+template<class InputIterator, class OutputIterator, class BinaryOperator>
+inline OutputIterator
+inclusive_scan(InputIterator first,
+               InputIterator last,
+               OutputIterator result,
+               BinaryOperator binary_op,
+               command_queue &queue = system::default_queue())
+{
+    typedef typename
+        std::iterator_traits<OutputIterator>::value_type result_value_type;
+
+    // `false` is the scan-kind flag and a zero of the output value type is
+    // the initial value, both forwarded as-is to detail::scan()
+    const bool exclusive = false;
+    const result_value_type initial = result_value_type(0);
+
+    return detail::scan(first, last, result, exclusive,
+                        initial, binary_op,
+                        queue);
+}
+
+/// \overload
+///
+/// Scans with \c plus over the value type of \p result.
+template<class InputIterator, class OutputIterator>
+inline OutputIterator
+inclusive_scan(InputIterator first,
+               InputIterator last,
+               OutputIterator result,
+               command_queue &queue = system::default_queue())
+{
+    typedef typename
+        std::iterator_traits<OutputIterator>::value_type result_value_type;
+
+    // same scan as the operator-taking overload, with addition plugged in
+    return ::boost::compute::inclusive_scan(
+        first, last, result,
+        boost::compute::plus<result_value_type>(),
+        queue
+    );
+}
+
+} // end compute namespace
+} // end boost namespace
+
+#endif // BOOST_COMPUTE_ALGORITHM_INCLUSIVE_SCAN_HPP
diff --git a/boost/compute/algorithm/inner_product.hpp b/boost/compute/algorithm/inner_product.hpp
new file mode 100644
index 0000000000..614611f91e
--- /dev/null
+++ b/boost/compute/algorithm/inner_product.hpp
@@ -0,0 +1,93 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#ifndef BOOST_COMPUTE_ALGORITHM_INNER_PRODUCT_HPP
+#define BOOST_COMPUTE_ALGORITHM_INNER_PRODUCT_HPP
+
+#include <boost/compute/system.hpp>
+#include <boost/compute/functional.hpp>
+#include <boost/compute/command_queue.hpp>
+#include <boost/compute/algorithm/accumulate.hpp>
+#include <boost/compute/container/vector.hpp>
+#include <boost/compute/iterator/transform_iterator.hpp>
+#include <boost/compute/iterator/zip_iterator.hpp>
+#include <boost/compute/functional/detail/unpack.hpp>
+
+namespace boost {
+namespace compute {
+
+/// Computes the inner product of the range [\p first1, \p last1) with the
+/// equally long range beginning at \p first2: corresponding elements are
+/// multiplied and the products are accumulated onto \p init.
+template<class InputIterator1, class InputIterator2, class T>
+inline T inner_product(InputIterator1 first1,
+                       InputIterator1 last1,
+                       InputIterator2 first2,
+                       T init,
+                       command_queue &queue = system::default_queue())
+{
+    typedef typename std::iterator_traits<InputIterator1>::value_type input_type;
+
+    // the second range is taken to be as long as the first one
+    const ptrdiff_t length = std::distance(first1, last1);
+    const InputIterator2 last2 = first2 + length;
+
+    // multiplies each (a, b) pair produced by the zip iterator
+    multiplies<input_type> product;
+
+    return ::boost::compute::accumulate(
+        ::boost::compute::make_transform_iterator(
+            ::boost::compute::make_zip_iterator(
+                boost::make_tuple(first1, first2)
+            ),
+            detail::unpack(product)
+        ),
+        ::boost::compute::make_transform_iterator(
+            ::boost::compute::make_zip_iterator(
+                boost::make_tuple(last1, last2)
+            ),
+            detail::unpack(product)
+        ),
+        init,
+        queue
+    );
+}
+
+/// \overload
+///
+/// Combines corresponding elements with \p transform_function and folds the
+/// results onto \p init with \p accumulate_function.
+template<class InputIterator1,
+         class InputIterator2,
+         class T,
+         class BinaryAccumulateFunction,
+         class BinaryTransformFunction>
+inline T inner_product(InputIterator1 first1,
+                       InputIterator1 last1,
+                       InputIterator2 first2,
+                       T init,
+                       BinaryAccumulateFunction accumulate_function,
+                       BinaryTransformFunction transform_function,
+                       command_queue &queue = system::default_queue())
+{
+    typedef typename std::iterator_traits<InputIterator1>::value_type value_type;
+
+    // step 1: element-wise transform into a temporary vector
+    vector<value_type> combined(
+        detail::iterator_range_size(first1, last1), queue.get_context()
+    );
+    transform(
+        first1, last1, first2, combined.begin(), transform_function, queue
+    );
+
+    // step 2: accumulate the transformed values
+    return ::boost::compute::accumulate(
+        combined.begin(), combined.end(), init, accumulate_function, queue
+    );
+}
+
+} // end compute namespace
+} // end boost namespace
+
+#endif // BOOST_COMPUTE_ALGORITHM_INNER_PRODUCT_HPP
diff --git a/boost/compute/algorithm/inplace_merge.hpp b/boost/compute/algorithm/inplace_merge.hpp
new file mode 100644
index 0000000000..3080950df5
--- /dev/null
+++ b/boost/compute/algorithm/inplace_merge.hpp
@@ -0,0 +1,60 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#ifndef BOOST_COMPUTE_ALGORITHM_INPLACE_MERGE_HPP
+#define BOOST_COMPUTE_ALGORITHM_INPLACE_MERGE_HPP
+
+#include <iterator>
+
+#include <boost/compute/system.hpp>
+#include <boost/compute/command_queue.hpp>
+#include <boost/compute/algorithm/merge.hpp>
+#include <boost/compute/container/vector.hpp>
+
+namespace boost {
+namespace compute {
+
+/// Merges the sorted values in the range [\p first, \p middle) with
+/// the sorted values in the range [\p middle, \p last) in-place.
+///
+/// Either half may be empty; the range is then already merged and nothing
+/// is enqueued on \p queue.
+///
+/// \param first first element of the first sorted half
+/// \param middle end of the first half and start of the second half
+/// \param last end of the second sorted half
+/// \param queue command queue to perform the operation
+template<class Iterator>
+inline void inplace_merge(Iterator first,
+                          Iterator middle,
+                          Iterator last,
+                          command_queue &queue = system::default_queue())
+{
+    // the three iterators must be ordered, but may coincide
+    BOOST_ASSERT(!(middle < first) && !(last < middle));
+
+    // with an empty half there is nothing to interleave
+    if(first == middle || middle == last){
+        return;
+    }
+
+    typedef typename std::iterator_traits<Iterator>::value_type T;
+
+    const context &context = queue.get_context();
+
+    ptrdiff_t left_size = std::distance(first, middle);
+    ptrdiff_t right_size = std::distance(middle, last);
+
+    // merge() writes into [first, last), so both halves are copied out
+    // first and merged back from the temporaries
+    vector<T> left(left_size, context);
+    vector<T> right(right_size, context);
+
+    copy(first, middle, left.begin(), queue);
+    copy(middle, last, right.begin(), queue);
+
+    ::boost::compute::merge(
+        left.begin(),
+        left.end(),
+        right.begin(),
+        right.end(),
+        first,
+        queue
+    );
+}
+
+} // end compute namespace
+} // end boost namespace
+
+#endif // BOOST_COMPUTE_ALGORITHM_INPLACE_MERGE_HPP
diff --git a/boost/compute/algorithm/iota.hpp b/boost/compute/algorithm/iota.hpp
new file mode 100644
index 0000000000..084c3d8d97
--- /dev/null
+++ b/boost/compute/algorithm/iota.hpp
@@ -0,0 +1,48 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#ifndef BOOST_COMPUTE_ALGORITHM_IOTA_HPP
+#define BOOST_COMPUTE_ALGORITHM_IOTA_HPP
+
+#include <boost/compute/system.hpp>
+#include <boost/compute/command_queue.hpp>
+#include <boost/compute/algorithm/copy.hpp>
+#include <boost/compute/iterator/counting_iterator.hpp>
+
+namespace boost {
+namespace compute {
+
+/// Fills the range [\p first, \p last) with sequential values starting at
+/// \p value.
+///
+/// For example, the following code:
+/// \snippet test/test_iota.cpp iota
+///
+/// Will fill \c vec with the values (\c 0, \c 1, \c 2, \c ...).
+///
+/// \param first first element in the range to fill
+/// \param last last element in the range to fill
+/// \param value value stored in the first element
+/// \param queue command queue to perform the operation
+template<class BufferIterator, class T>
+inline void iota(BufferIterator first,
+                 BufferIterator last,
+                 const T &value,
+                 command_queue &queue = system::default_queue())
+{
+    T count = static_cast<T>(detail::iterator_range_size(first, last));
+
+    // For T narrower than int, `value + count` is promoted to int; cast it
+    // back so both counting iterators have the same type, as copy() needs.
+    const T end_value = static_cast<T>(value + count);
+
+    copy(
+        ::boost::compute::make_counting_iterator(value),
+        ::boost::compute::make_counting_iterator(end_value),
+        first,
+        queue
+    );
+}
+}
+
+} // end compute namespace
+} // end boost namespace
+
+#endif // BOOST_COMPUTE_ALGORITHM_IOTA_HPP
diff --git a/boost/compute/algorithm/is_partitioned.hpp b/boost/compute/algorithm/is_partitioned.hpp
new file mode 100644
index 0000000000..3916825057
--- /dev/null
+++ b/boost/compute/algorithm/is_partitioned.hpp
@@ -0,0 +1,43 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#ifndef BOOST_COMPUTE_ALGORITHM_IS_PARTITIONED_HPP
+#define BOOST_COMPUTE_ALGORITHM_IS_PARTITIONED_HPP
+
+#include <boost/compute/system.hpp>
+#include <boost/compute/command_queue.hpp>
+#include <boost/compute/algorithm/find_if.hpp>
+#include <boost/compute/algorithm/find_if_not.hpp>
+
+namespace boost {
+namespace compute {
+
+/// Checks whether the range [\p first, \p last) is partitioned by
+/// \p predicate, i.e. no element satisfying \p predicate follows an element
+/// that does not satisfy it. Returns \c true if so.
+template<class InputIterator, class UnaryPredicate>
+inline bool is_partitioned(InputIterator first,
+                           InputIterator last,
+                           UnaryPredicate predicate,
+                           command_queue &queue = system::default_queue())
+{
+    // first element for which the predicate does not hold
+    const InputIterator first_rejected =
+        ::boost::compute::find_if_not(first, last, predicate, queue);
+
+    // from there on no element may satisfy the predicate again
+    const InputIterator later_accepted =
+        ::boost::compute::find_if(first_rejected, last, predicate, queue);
+
+    return later_accepted == last;
+}
+
+} // end compute namespace
+} // end boost namespace
+
+#endif // BOOST_COMPUTE_ALGORITHM_IS_PARTITIONED_HPP
diff --git a/boost/compute/algorithm/is_permutation.hpp b/boost/compute/algorithm/is_permutation.hpp
new file mode 100644
index 0000000000..1e502efb37
--- /dev/null
+++ b/boost/compute/algorithm/is_permutation.hpp
@@ -0,0 +1,67 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2014 Roshan <thisisroshansmail@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#ifndef BOOST_COMPUTE_ALGORITHM_IS_PERMUTATION_HPP
+#define BOOST_COMPUTE_ALGORITHM_IS_PERMUTATION_HPP
+
+#include <iterator>
+
+#include <boost/compute/system.hpp>
+#include <boost/compute/command_queue.hpp>
+#include <boost/compute/container/vector.hpp>
+#include <boost/compute/detail/iterator_range_size.hpp>
+#include <boost/compute/algorithm/equal.hpp>
+#include <boost/compute/algorithm/sort.hpp>
+
+namespace boost {
+namespace compute {
+
+///
+/// \brief Permutation checking algorithm
+///
+/// Tests whether the elements of [first1, last1) can be rearranged to
+/// give the range [first2, last2). Both ranges are copied, the copies are
+/// sorted and then compared for equality; the inputs are left untouched.
+/// \return True, if it can be permuted. False, otherwise.
+///
+/// \param first1 Iterator pointing to start of first range
+/// \param last1 Iterator pointing to end of first range
+/// \param first2 Iterator pointing to start of second range
+/// \param last2 Iterator pointing to end of second range
+/// \param queue Queue on which to execute
+///
+template<class InputIterator1, class InputIterator2>
+inline bool is_permutation(InputIterator1 first1,
+                           InputIterator1 last1,
+                           InputIterator2 first2,
+                           InputIterator2 last2,
+                           command_queue &queue = system::default_queue())
+{
+    typedef typename std::iterator_traits<InputIterator1>::value_type value_type1;
+    typedef typename std::iterator_traits<InputIterator2>::value_type value_type2;
+
+    // ranges of different length can never be permutations of each other
+    if(detail::iterator_range_size(first1, last1) !=
+       detail::iterator_range_size(first2, last2)){
+        return false;
+    }
+
+    // work on sorted copies so the caller's data is not reordered
+    vector<value_type1> sorted1(first1, last1, queue);
+    vector<value_type2> sorted2(first2, last2, queue);
+
+    sort(sorted1.begin(), sorted1.end(), queue);
+    sort(sorted2.begin(), sorted2.end(), queue);
+
+    const bool same_elements =
+        equal(sorted1.begin(), sorted1.end(), sorted2.begin(), queue);
+
+    return same_elements;
+}
+
+} // end compute namespace
+} // end boost namespace
+
+#endif // BOOST_COMPUTE_ALGORITHM_IS_PERMUTATION_HPP
diff --git a/boost/compute/algorithm/is_sorted.hpp b/boost/compute/algorithm/is_sorted.hpp
new file mode 100644
index 0000000000..a605159ac3
--- /dev/null
+++ b/boost/compute/algorithm/is_sorted.hpp
@@ -0,0 +1,64 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013-2014 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#ifndef BOOST_COMPUTE_ALGORITHM_IS_SORTED_HPP
+#define BOOST_COMPUTE_ALGORITHM_IS_SORTED_HPP
+
+#include <boost/compute/command_queue.hpp>
+#include <boost/compute/system.hpp>
+#include <boost/compute/functional/bind.hpp>
+#include <boost/compute/functional/operator.hpp>
+#include <boost/compute/algorithm/adjacent_find.hpp>
+
+namespace boost {
+namespace compute {
+
+/// Checks whether the values in the range [\p first, \p last) are ordered
+/// according to \p compare.
+///
+/// \param first first element in the range to check
+/// \param last last element in the range to check
+/// \param compare comparison function (by default \c less)
+/// \param queue command queue to perform the operation
+///
+/// \return \c true if the range [\p first, \p last) is sorted
+///
+/// \see sort()
+template<class InputIterator, class Compare>
+inline bool is_sorted(InputIterator first,
+                      InputIterator last,
+                      Compare compare,
+                      command_queue &queue = system::default_queue())
+{
+    namespace ph = ::boost::compute::placeholders;
+
+    // Look for a neighbouring pair (a, b) with compare(b, a): such a pair
+    // is out of order. The range is sorted when none exists.
+    const InputIterator out_of_order = ::boost::compute::adjacent_find(
+        first, last, ::boost::compute::bind(compare, ph::_2, ph::_1), queue
+    );
+
+    return out_of_order == last;
+}
+
+/// \overload
+///
+/// Checks the order with \c less over the iterator's value type.
+template<class InputIterator>
+inline bool is_sorted(InputIterator first,
+                      InputIterator last,
+                      command_queue &queue = system::default_queue())
+{
+    typedef typename std::iterator_traits<InputIterator>::value_type value_type;
+
+    ::boost::compute::less<value_type> ascending;
+
+    return ::boost::compute::is_sorted(first, last, ascending, queue);
+}
+
+} // end compute namespace
+} // end boost namespace
+
+#endif // BOOST_COMPUTE_ALGORITHM_IS_SORTED_HPP
diff --git a/boost/compute/algorithm/lexicographical_compare.hpp b/boost/compute/algorithm/lexicographical_compare.hpp
new file mode 100644
index 0000000000..c4f7120807
--- /dev/null
+++ b/boost/compute/algorithm/lexicographical_compare.hpp
@@ -0,0 +1,117 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2014 Mageswaran.D <mageswaran1989@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#include <algorithm>
+#include <iterator>
+#include <utility>
+
+#include <boost/compute/system.hpp>
+#include <boost/compute/context.hpp>
+#include <boost/compute/command_queue.hpp>
+#include <boost/compute/algorithm/any_of.hpp>
+#include <boost/compute/algorithm/mismatch.hpp>
+#include <boost/compute/container/vector.hpp>
+#include <boost/compute/detail/iterator_range_size.hpp>
+#include <boost/compute/detail/read_write_single_value.hpp>
+#include <boost/compute/utility/program_cache.hpp>
+
+namespace boost {
+namespace compute {
+
+namespace detail {
+
+// OpenCL source of the element-wise comparison kernel. T1 and T2 are not
+// defined in this source; they must be supplied as preprocessor definitions
+// when the program is built. Work-item i writes a single flag to
+// result_buf[i], where a flag of 0 stands for "true".
+//
+// NOTE(review): work-items whose index differs from both size1 and size2
+// take the element-comparison branch, including indices beyond the end of
+// the shorter range -- confirm both buffers are large enough for that.
+const char lexicographical_compare_source[] =
+"__kernel void lexicographical_compare(const uint size1,\n"
+" const uint size2,\n"
+" __global const T1 *range1,\n"
+" __global const T2 *range2,\n"
+" __global bool *result_buf)\n"
+"{\n"
+" const uint i = get_global_id(0);\n"
+" if((i != size1) && (i != size2)){\n"
+ // Each work-item compares one pair of elements and stores its own flag:
+ // 0 (true) if range1[i] < range2[i], 1 otherwise.
+" if(range1[i] < range2[i])\n"
+" result_buf[i] = 0;\n"
+" else\n"
+" result_buf[i] = 1;\n"
+" }\n"
+" else\n"
+ // i equals size1 or size2: the flag is 0 (true) only when i == size1 and
+ // i != size2.
+" result_buf[i] = !((i == size1) && (i != size2));\n"
+"}\n";
+
+/// Returns \c true if [\p first1, \p last1) is lexicographically less than
+/// [\p first2, \p last2).
+///
+/// The ranges are compared over their common length: the first position at
+/// which they differ decides the result. If they agree over the whole
+/// common length, the shorter range is the smaller one. Two empty ranges
+/// compare as not-less (\c false).
+///
+/// Positions are matched with the equality used by mismatch(); the two
+/// differing elements are then read back and compared with \c < on the
+/// host.
+///
+/// \param first1 start of the first range
+/// \param last1 end of the first range
+/// \param first2 start of the second range
+/// \param last2 end of the second range
+/// \param queue command queue to perform the operation
+template<class InputIterator1, class InputIterator2>
+inline bool dispatch_lexicographical_compare(InputIterator1 first1,
+                                             InputIterator1 last1,
+                                             InputIterator2 first2,
+                                             InputIterator2 last2,
+                                             command_queue &queue)
+{
+    typedef typename std::iterator_traits<InputIterator1>::value_type value_type1;
+    typedef typename std::iterator_traits<InputIterator2>::value_type value_type2;
+    typedef typename
+        std::iterator_traits<InputIterator1>::difference_type difference_type1;
+
+    const size_t size1 = iterator_range_size(first1, last1);
+    const size_t size2 = iterator_range_size(first2, last2);
+    const size_t common_size = (std::min)(size1, size2);
+
+    if(common_size > 0){
+        // only the part both ranges share may be read element by element
+        const InputIterator1 common_last1 =
+            first1 + static_cast<difference_type1>(common_size);
+
+        const std::pair<InputIterator1, InputIterator2> difference =
+            ::boost::compute::mismatch(first1, common_last1, first2, queue);
+
+        if(difference.first != common_last1){
+            // the first differing pair alone decides the ordering
+            const value_type1 lhs = read_single_value<value_type1>(
+                difference.first.get_buffer(),
+                difference.first.get_index(),
+                queue
+            );
+            const value_type2 rhs = read_single_value<value_type2>(
+                difference.second.get_buffer(),
+                difference.second.get_index(),
+                queue
+            );
+
+            return lhs < rhs;
+        }
+    }
+
+    // one range is a prefix of the other (or both are empty)
+    return size1 < size2;
+}
+
+} // end detail namespace
+
+/// Returns \c true if the range [first1, last1) orders lexicographically
+/// before the range [first2, last2), \c false otherwise.
+template<class InputIterator1, class InputIterator2>
+inline bool lexicographical_compare(InputIterator1 first1,
+                                    InputIterator1 last1,
+                                    InputIterator2 first2,
+                                    InputIterator2 last2,
+                                    command_queue &queue = system::default_queue())
+{
+    // all of the work happens in the detail implementation
+    const bool first_is_less = detail::dispatch_lexicographical_compare(
+        first1, last1, first2, last2, queue
+    );
+
+    return first_is_less;
+}
+
+} // end compute namespace
+} // end boost namespace
diff --git a/boost/compute/algorithm/lower_bound.hpp b/boost/compute/algorithm/lower_bound.hpp
new file mode 100644
index 0000000000..b2011c66ef
--- /dev/null
+++ b/boost/compute/algorithm/lower_bound.hpp
@@ -0,0 +1,44 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#ifndef BOOST_COMPUTE_ALGORITHM_LOWER_BOUND_HPP
+#define BOOST_COMPUTE_ALGORITHM_LOWER_BOUND_HPP
+
+#include <boost/compute/lambda.hpp>
+#include <boost/compute/system.hpp>
+#include <boost/compute/command_queue.hpp>
+#include <boost/compute/algorithm/detail/binary_find.hpp>
+
+namespace boost {
+namespace compute {
+
+/// Finds the first element in the sorted range [\p first, \p last) that
+/// is not less than \p value and returns an iterator pointing to it.
+///
+/// \see upper_bound()
+template<class InputIterator, class T>
+inline InputIterator
+lower_bound(InputIterator first,
+            InputIterator last,
+            const T &value,
+            command_queue &queue = system::default_queue())
+{
+    // "not less than value" expressed as the lambda predicate `_1 >= value`
+    return detail::binary_find(
+        first, last, ::boost::compute::_1 >= value, queue
+    );
+}
+
+} // end compute namespace
+} // end boost namespace
+
+#endif // BOOST_COMPUTE_ALGORITHM_LOWER_BOUND_HPP
diff --git a/boost/compute/algorithm/max_element.hpp b/boost/compute/algorithm/max_element.hpp
new file mode 100644
index 0000000000..55f2f7ffbf
--- /dev/null
+++ b/boost/compute/algorithm/max_element.hpp
@@ -0,0 +1,74 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#ifndef BOOST_COMPUTE_ALGORITHM_MAX_ELEMENT_HPP
+#define BOOST_COMPUTE_ALGORITHM_MAX_ELEMENT_HPP
+
+#include <boost/compute/system.hpp>
+#include <boost/compute/command_queue.hpp>
+#include <boost/compute/functional.hpp>
+#include <boost/compute/algorithm/detail/find_extrema.hpp>
+
+namespace boost {
+namespace compute {
+
+/// Finds the element with the maximum value in the range
+/// [\p first, \p last) and returns an iterator pointing to it.
+///
+/// \param first first element in the input range
+/// \param last last element in the input range
+/// \param compare comparison function object which returns true if the first
+///        argument is less than (i.e. is ordered before) the second.
+/// \param queue command queue to perform the operation
+///
+/// For example, to find \c int2 value with maximum first component in given vector:
+/// \code
+/// // comparison function object
+/// BOOST_COMPUTE_FUNCTION(bool, compare_first, (const int2_ &a, const int2_ &b),
+/// {
+///     return a.x < b.x;
+/// });
+///
+/// // create vector
+/// boost::compute::vector<int2_> data = ...
+///
+/// boost::compute::vector<int2_>::iterator max =
+///     boost::compute::max_element(data.begin(), data.end(), compare_first, queue);
+/// \endcode
+///
+/// \see min_element()
+template<class InputIterator, class Compare>
+inline InputIterator
+max_element(InputIterator first,
+            InputIterator last,
+            Compare compare,
+            command_queue &queue = system::default_queue())
+{
+    // the boolean picks which extremum detail::find_extrema() looks for;
+    // max_element() passes false
+    const bool extremum_flag = false;
+
+    return detail::find_extrema(first, last, compare, extremum_flag, queue);
+}
+
+///\overload
+///
+/// Orders the elements with \c less over the iterator's value type.
+template<class InputIterator>
+inline InputIterator
+max_element(InputIterator first,
+            InputIterator last,
+            command_queue &queue = system::default_queue())
+{
+    typedef typename std::iterator_traits<InputIterator>::value_type value_type;
+
+    ::boost::compute::less<value_type> ascending;
+
+    return ::boost::compute::max_element(first, last, ascending, queue);
+}
+
+} // end compute namespace
+} // end boost namespace
+
+#endif // BOOST_COMPUTE_ALGORITHM_MAX_ELEMENT_HPP
diff --git a/boost/compute/algorithm/merge.hpp b/boost/compute/algorithm/merge.hpp
new file mode 100644
index 0000000000..875a283044
--- /dev/null
+++ b/boost/compute/algorithm/merge.hpp
@@ -0,0 +1,105 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#ifndef BOOST_COMPUTE_ALGORITHM_MERGE_HPP
+#define BOOST_COMPUTE_ALGORITHM_MERGE_HPP
+
+#include <boost/compute/system.hpp>
+#include <boost/compute/command_queue.hpp>
+#include <boost/compute/algorithm/copy.hpp>
+#include <boost/compute/algorithm/detail/merge_with_merge_path.hpp>
+#include <boost/compute/algorithm/detail/serial_merge.hpp>
+#include <boost/compute/detail/iterator_range_size.hpp>
+#include <boost/compute/detail/parameter_cache.hpp>
+
+namespace boost {
+namespace compute {
+
+/// Merges the sorted values in the range [\p first1, \p last1) with the sorted
+/// values in the range [\p first2, \p last2) and stores the result in the range
+/// beginning at \p result. Values are compared using the \p comp function. If
+/// no comparison function is given, \c less is used.
+///
+/// Inputs whose combined size does not exceed the "serial_merge_threshold"
+/// parameter are merged with the serial merge kernel; larger inputs use the
+/// merge path algorithm.
+///
+/// \param first1 first element in the first range to merge
+/// \param last1 end of the first range to merge
+/// \param first2 first element in the second range to merge
+/// \param last2 end of the second range to merge
+/// \param result first element in the result range
+/// \param comp comparison function (by default \c less)
+/// \param queue command queue to perform the operation
+///
+/// \return \c OutputIterator to the end of the result range
+///
+/// \see inplace_merge()
+template<class InputIterator1,
+         class InputIterator2,
+         class OutputIterator,
+         class Compare>
+inline OutputIterator merge(InputIterator1 first1,
+                            InputIterator1 last1,
+                            InputIterator2 first2,
+                            InputIterator2 last2,
+                            OutputIterator result,
+                            Compare comp,
+                            command_queue &queue = system::default_queue())
+{
+    typedef typename std::iterator_traits<InputIterator1>::value_type input1_type;
+    typedef typename std::iterator_traits<InputIterator2>::value_type input2_type;
+    typedef typename std::iterator_traits<OutputIterator>::value_type output_type;
+
+    const device &device = queue.get_device();
+
+    // parameters are looked up in the cache obtained for this device,
+    // under a key built from the three element type names
+    boost::shared_ptr<detail::parameter_cache> parameters =
+        detail::parameter_cache::get_global_cache(device);
+    std::string cache_key =
+        std::string("__boost_merge_") + type_name<input1_type>() + "_"
+            + type_name<input2_type>() + "_" + type_name<output_type>();
+
+    // the fallback threshold is lower on GPU devices
+    size_t default_serial_merge_threshold;
+    if(device.type() & device::gpu) {
+        default_serial_merge_threshold = 2048;
+    }
+    else {
+        default_serial_merge_threshold = 32768;
+    }
+
+    const size_t serial_merge_threshold =
+        parameters->get(cache_key, "serial_merge_threshold",
+                        static_cast<uint_>(default_serial_merge_threshold));
+
+    // total number of elements to merge decides which algorithm runs
+    const size_t total_count =
+        detail::iterator_range_size(first1, last1)
+            + detail::iterator_range_size(first2, last2);
+
+    if(total_count > serial_merge_threshold){
+        return detail::merge_with_merge_path(
+            first1, last1, first2, last2, result, comp, queue
+        );
+    }
+
+    // for small inputs serial merge turns out to outperform
+    // merge with merge path algorithm
+    return detail::serial_merge(first1, last1, first2, last2, result, comp, queue);
+}
+
+/// \overload
+///
+/// Compares values with \c less of the first range's value type.
+template<class InputIterator1, class InputIterator2, class OutputIterator>
+inline OutputIterator merge(InputIterator1 first1,
+                            InputIterator1 last1,
+                            InputIterator2 first2,
+                            InputIterator2 last2,
+                            OutputIterator result,
+                            command_queue &queue = system::default_queue())
+{
+    typedef typename std::iterator_traits<InputIterator1>::value_type value_type;
+
+    // default comparison function
+    less<value_type> default_comp;
+
+    return merge(
+        first1, last1, first2, last2, result, default_comp, queue
+    );
+}
+
+} // end compute namespace
+} // end boost namespace
+
+#endif // BOOST_COMPUTE_ALGORITHM_MERGE_HPP
diff --git a/boost/compute/algorithm/min_element.hpp b/boost/compute/algorithm/min_element.hpp
new file mode 100644
index 0000000000..62744efb98
--- /dev/null
+++ b/boost/compute/algorithm/min_element.hpp
@@ -0,0 +1,74 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#ifndef BOOST_COMPUTE_ALGORITHM_MIN_ELEMENT_HPP
+#define BOOST_COMPUTE_ALGORITHM_MIN_ELEMENT_HPP
+
+#include <boost/compute/system.hpp>
+#include <boost/compute/command_queue.hpp>
+#include <boost/compute/functional.hpp>
+#include <boost/compute/algorithm/detail/find_extrema.hpp>
+
+namespace boost {
+namespace compute {
+
+/// Finds the element with the minimum value in the range
+/// [\p first, \p last) and returns an iterator pointing to it.
+///
+/// \param first first element in the input range
+/// \param last end of the input range
+/// \param compare comparison function object which returns true if the first
+///        argument is less than (i.e. is ordered before) the second.
+/// \param queue command queue to perform the operation
+///
+/// For example, to find the \c int2_ value with the minimum first component
+/// in a given vector:
+/// \code
+/// // comparison function object
+/// BOOST_COMPUTE_FUNCTION(bool, compare_first, (const int2_ &a, const int2_ &b),
+/// {
+///     return a.x < b.x;
+/// });
+///
+/// // create vector
+/// boost::compute::vector<int2_> data = ...
+///
+/// boost::compute::vector<int2_>::iterator min =
+///     boost::compute::min_element(data.begin(), data.end(), compare_first, queue);
+/// \endcode
+///
+/// \see max_element()
+template<class InputIterator, class Compare>
+inline InputIterator
+min_element(InputIterator first,
+            InputIterator last,
+            Compare compare,
+            command_queue &queue = system::default_queue())
+{
+    // find_extrema() serves both min_element() and max_element();
+    // passing true here asks it for the minimum
+    const bool find_minimum = true;
+
+    return detail::find_extrema(first, last, compare, find_minimum, queue);
+}
+
+///\overload
+///
+/// Compares the elements with \c less of the iterator's value type.
+template<class InputIterator>
+inline InputIterator
+min_element(InputIterator first,
+            InputIterator last,
+            command_queue &queue = system::default_queue())
+{
+    typedef typename std::iterator_traits<InputIterator>::value_type value_type;
+
+    // default ordering: less<value_type>
+    ::boost::compute::less<value_type> less_than;
+
+    return ::boost::compute::min_element(first, last, less_than, queue);
+}
+
+} // end compute namespace
+} // end boost namespace
+
+#endif // BOOST_COMPUTE_ALGORITHM_MIN_ELEMENT_HPP
diff --git a/boost/compute/algorithm/minmax_element.hpp b/boost/compute/algorithm/minmax_element.hpp
new file mode 100644
index 0000000000..bf32c3c989
--- /dev/null
+++ b/boost/compute/algorithm/minmax_element.hpp
@@ -0,0 +1,70 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#ifndef BOOST_COMPUTE_ALGORITHM_MINMAX_ELEMENT_HPP
+#define BOOST_COMPUTE_ALGORITHM_MINMAX_ELEMENT_HPP
+
+#include <utility>
+
+#include <boost/compute/system.hpp>
+#include <boost/compute/command_queue.hpp>
+#include <boost/compute/algorithm/max_element.hpp>
+#include <boost/compute/algorithm/min_element.hpp>
+
+namespace boost {
+namespace compute {
+
+/// Locates both the minimum and the maximum element of the range
+/// [\p first, \p last). The returned pair holds the iterator to the
+/// minimum element first and the iterator to the maximum element second.
+/// For an empty range both iterators equal \p first.
+///
+/// \param first first element in the input range
+/// \param last end of the input range
+/// \param compare comparison function object which returns true if the first
+///        argument is less than (i.e. is ordered before) the second.
+/// \param queue command queue to perform the operation
+///
+/// \see max_element(), min_element()
+template<class InputIterator, class Compare>
+inline std::pair<InputIterator, InputIterator>
+minmax_element(InputIterator first,
+               InputIterator last,
+               Compare compare,
+               command_queue &queue = system::default_queue())
+{
+    // nothing to search in an empty range
+    if(first == last){
+        return std::make_pair(first, first);
+    }
+
+    // the two extrema are found by two independent searches
+    InputIterator min_iter = min_element(first, last, compare, queue);
+    InputIterator max_iter = max_element(first, last, compare, queue);
+
+    return std::make_pair(min_iter, max_iter);
+}
+
+///\overload
+///
+/// Uses the default ordering of min_element() and max_element().
+///
+/// \param first first element in the input range
+/// \param last end of the input range
+/// \param queue command queue to perform the operation
+///
+/// \return pair of iterators to the minimum and the maximum element; both
+///         equal \p first when the range is empty
+// Note: this overload takes no comparison object, so it must not declare a
+// Compare template parameter; a parameter that appears in no function
+// argument cannot be deduced and would make the overload uncallable.
+template<class InputIterator>
+inline std::pair<InputIterator, InputIterator>
+minmax_element(InputIterator first,
+               InputIterator last,
+               command_queue &queue = system::default_queue())
+{
+    if(first == last){
+        // empty range
+        return std::make_pair(first, first);
+    }
+
+    return std::make_pair(min_element(first, last, queue),
+                          max_element(first, last, queue));
+}
+
+} // end compute namespace
+} // end boost namespace
+
+#endif // BOOST_COMPUTE_ALGORITHM_MINMAX_ELEMENT_HPP
diff --git a/boost/compute/algorithm/mismatch.hpp b/boost/compute/algorithm/mismatch.hpp
new file mode 100644
index 0000000000..e7db883004
--- /dev/null
+++ b/boost/compute/algorithm/mismatch.hpp
@@ -0,0 +1,89 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#ifndef BOOST_COMPUTE_ALGORITHM_MISMATCH_HPP
+#define BOOST_COMPUTE_ALGORITHM_MISMATCH_HPP
+
+#include <iterator>
+#include <utility>
+
+#include <boost/compute/system.hpp>
+#include <boost/compute/functional.hpp>
+#include <boost/compute/command_queue.hpp>
+#include <boost/compute/algorithm/find.hpp>
+#include <boost/compute/iterator/transform_iterator.hpp>
+#include <boost/compute/iterator/zip_iterator.hpp>
+#include <boost/compute/functional/detail/unpack.hpp>
+
+namespace boost {
+namespace compute {
+
+/// Searches for the first position at which the range [\p first1, \p last1)
+/// and the equally long range starting at \p first2 differ. Elements are
+/// compared with \c equal_to of the first range's value type.
+///
+/// \param first1 first element in the first range
+/// \param last1 end of the first range
+/// \param first2 first element in the second range
+/// \param queue command queue to perform the operation
+///
+/// \return pair of iterators, one into each range, at the same offset
+template<class InputIterator1, class InputIterator2>
+inline std::pair<InputIterator1, InputIterator2>
+mismatch(InputIterator1 first1,
+         InputIterator1 last1,
+         InputIterator2 first2,
+         command_queue &queue = system::default_queue())
+{
+    typedef typename std::iterator_traits<InputIterator1>::value_type value_type;
+
+    // the second range is taken to be exactly as long as the first
+    InputIterator2 last2 = first2 + std::distance(first1, last1);
+
+    // both ranges are zipped together and equal_to is applied to every
+    // pair; find() then searches that sequence for the first false
+    ::boost::compute::equal_to<value_type> equal;
+
+    InputIterator1 mismatch1 =
+        boost::get<0>(
+            ::boost::compute::find(
+                ::boost::compute::make_transform_iterator(
+                    ::boost::compute::make_zip_iterator(
+                        boost::make_tuple(first1, first2)
+                    ),
+                    detail::unpack(equal)
+                ),
+                ::boost::compute::make_transform_iterator(
+                    ::boost::compute::make_zip_iterator(
+                        boost::make_tuple(last1, last2)
+                    ),
+                    detail::unpack(equal)
+                ),
+                false,
+                queue
+            ).base().get_iterator_tuple()
+        );
+
+    // the second iterator sits at the same offset as the first one
+    InputIterator2 mismatch2 = first2 + std::distance(first1, mismatch1);
+
+    return std::make_pair(mismatch1, mismatch2);
+}
+
+/// \overload
+///
+/// Takes an explicit end for the second range. Only as many elements as
+/// the shorter of the two ranges contains are compared.
+template<class InputIterator1, class InputIterator2>
+inline std::pair<InputIterator1, InputIterator2>
+mismatch(InputIterator1 first1,
+         InputIterator1 last1,
+         InputIterator2 first2,
+         InputIterator2 last2,
+         command_queue &queue = system::default_queue())
+{
+    typedef typename
+        std::iterator_traits<InputIterator1>::difference_type difference1_type;
+    typedef typename
+        std::iterator_traits<InputIterator2>::difference_type difference2_type;
+
+    const difference1_type count1 = std::distance(first1, last1);
+    const difference2_type count2 = std::distance(first2, last2);
+
+    // first range is the shorter one: compare all of it
+    if(count1 < count2){
+        return ::boost::compute::mismatch(first1, last1, first2, queue);
+    }
+
+    // otherwise cut the first range down to the length of the second
+    return ::boost::compute::mismatch(
+        first1, first1 + count2, first2, queue
+    );
+}
+
+} // end compute namespace
+} // end boost namespace
+
+#endif // BOOST_COMPUTE_ALGORITHM_MISMATCH_HPP
diff --git a/boost/compute/algorithm/next_permutation.hpp b/boost/compute/algorithm/next_permutation.hpp
new file mode 100644
index 0000000000..e81fbd2ee8
--- /dev/null
+++ b/boost/compute/algorithm/next_permutation.hpp
@@ -0,0 +1,170 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2014 Roshan <thisisroshansmail@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#ifndef BOOST_COMPUTE_ALGORITHM_NEXT_PERMUTATION_HPP
+#define BOOST_COMPUTE_ALGORITHM_NEXT_PERMUTATION_HPP
+
+#include <iterator>
+
+#include <boost/compute/system.hpp>
+#include <boost/compute/command_queue.hpp>
+#include <boost/compute/container/detail/scalar.hpp>
+#include <boost/compute/algorithm/reverse.hpp>
+
+namespace boost {
+namespace compute {
+namespace detail {
+
+///
+/// \brief Helper function for next_permutation
+///
+/// Finds the rightmost element in [\p first, \p last) which is smaller
+/// than its next element.
+///
+/// \param first Iterator pointing to start of range
+/// \param last Iterator pointing to end of range
+/// \param queue Queue on which to execute
+///
+/// \return Iterator to that element, or \p last if the range has fewer
+/// than two elements or no element is smaller than its next element
+///
+template<class InputIterator>
+inline InputIterator next_permutation_helper(InputIterator first,
+ InputIterator last,
+ command_queue &queue)
+{
+ typedef typename std::iterator_traits<InputIterator>::value_type value_type;
+
+ size_t count = detail::iterator_range_size(first, last);
+ if(count == 0 || count == 1){
+ return last;
+ }
+ // the kernel reads elements i and i+1, so it runs over count-1 indices
+ count = count - 1;
+ const context &context = queue.get_context();
+
+ detail::meta_kernel k("next_permutation");
+ size_t index_arg = k.add_arg<int *>(memory_object::global_memory, "index");
+ atomic_max<int_> atomic_max_int;
+
+ // every index i with first[i] < first[i+1] is folded into *index with
+ // an atomic max, leaving the largest such index
+ k << k.decl<const int_>("i") << " = get_global_id(0);\n"
+ << k.decl<const value_type>("cur_value") << "="
+ << first[k.var<const int_>("i")] << ";\n"
+ << k.decl<const value_type>("next_value") << "="
+ << first[k.expr<const int_>("i+1")] << ";\n"
+ << "if(cur_value < next_value){\n"
+ << " " << atomic_max_int(k.var<int_ *>("index"), k.var<int_>("i")) << ";\n"
+ << "}\n";
+
+ kernel kernel = k.compile(context);
+
+ scalar<int_> index(context);
+ kernel.set_arg(index_arg, index.get_buffer());
+
+ // -1 marks "no such element found"
+ index.write(static_cast<int_>(-1), queue);
+
+ queue.enqueue_1d_range_kernel(kernel, 0, count, 0);
+
+ int result = static_cast<int>(index.read(queue));
+ if(result == -1) return last;
+ else return first + result;
+}
+
+///
+/// \brief Helper function for next_permutation
+///
+/// To find the smallest element to the right of the element found above
+/// that is greater than it
+///
+/// \param first Iterator pointing to start of the range to search
+/// \param last Iterator pointing to end of the range to search
+/// \param value Value the result has to be greater than
+/// \param queue Queue on which to execute
+///
+/// \return Iterator at the index computed by the kernel (\p first if no
+/// work-item updates the index), or \p last for an empty range
+///
+/// NOTE(review): the kernel reads \c *index while other work-items may be
+/// updating it; this presumably relies on the searched range being sorted
+/// in descending order, as it is when called from next_permutation() --
+/// confirm before reusing elsewhere.
+///
+template<class InputIterator, class ValueType>
+inline InputIterator np_ceiling(InputIterator first,
+ InputIterator last,
+ ValueType value,
+ command_queue &queue)
+{
+ typedef typename std::iterator_traits<InputIterator>::value_type value_type;
+
+ size_t count = detail::iterator_range_size(first, last);
+ if(count == 0){
+ return last;
+ }
+ const context &context = queue.get_context();
+
+ detail::meta_kernel k("np_ceiling");
+ size_t index_arg = k.add_arg<int *>(memory_object::global_memory, "index");
+ size_t value_arg = k.add_arg<value_type>(memory_object::private_memory, "value");
+ atomic_max<int_> atomic_max_int;
+
+ // an index i is folded into *index (atomic max) when first[i] is greater
+ // than value and not greater than the element currently at *index
+ k << k.decl<const int_>("i") << " = get_global_id(0);\n"
+ << k.decl<const value_type>("cur_value") << "="
+ << first[k.var<const int_>("i")] << ";\n"
+ << "if(cur_value <= " << first[k.expr<int_>("*index")]
+ << " && cur_value > value){\n"
+ << " " << atomic_max_int(k.var<int_ *>("index"), k.var<int_>("i")) << ";\n"
+ << "}\n";
+
+ kernel kernel = k.compile(context);
+
+ scalar<int_> index(context);
+ kernel.set_arg(index_arg, index.get_buffer());
+
+ // the search starts from the first element of the range
+ index.write(static_cast<int_>(0), queue);
+
+ kernel.set_arg(value_arg, value);
+
+ queue.enqueue_1d_range_kernel(kernel, 0, count, 0);
+
+ int result = static_cast<int>(index.read(queue));
+ return first + result;
+}
+
+} // end detail namespace
+
+///
+/// \brief Permutation generating algorithm
+///
+/// Rearranges the range [first, last) into the permutation that follows
+/// the current one in lexicographic order. If there is no such permutation
+/// the range is reversed instead.
+/// \return \c true if the next permutation was generated, \c false if the
+/// range was empty or the last permutation was crossed and the range was
+/// reset
+///
+/// \param first Iterator pointing to start of range
+/// \param last Iterator pointing to end of range
+/// \param queue Queue on which to execute
+///
+template<class InputIterator>
+inline bool next_permutation(InputIterator first,
+                             InputIterator last,
+                             command_queue &queue = system::default_queue())
+{
+    typedef typename std::iterator_traits<InputIterator>::value_type value_type;
+
+    if(first == last){
+        return false;
+    }
+
+    // rightmost element that is smaller than its successor
+    InputIterator pivot =
+        detail::next_permutation_helper(first, last, queue);
+
+    if(pivot == last){
+        // no such element: reverse the whole range and report the reset
+        reverse(first, last, queue);
+        return false;
+    }
+
+    value_type pivot_value = pivot.read(queue);
+
+    // element right of the pivot that takes its place
+    InputIterator successor =
+        detail::np_ceiling(pivot + 1, last, pivot_value, queue);
+
+    value_type successor_value = successor.read(queue);
+
+    // exchange the two elements ...
+    pivot.write(successor_value, queue);
+    successor.write(pivot_value, queue);
+
+    // ... and reverse everything after the pivot position
+    reverse(pivot + 1, last, queue);
+
+    return true;
+}
+
+} // end compute namespace
+} // end boost namespace
+
+#endif // BOOST_COMPUTE_ALGORITHM_NEXT_PERMUTATION_HPP
diff --git a/boost/compute/algorithm/none_of.hpp b/boost/compute/algorithm/none_of.hpp
new file mode 100644
index 0000000000..c25dd12a87
--- /dev/null
+++ b/boost/compute/algorithm/none_of.hpp
@@ -0,0 +1,36 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#ifndef BOOST_COMPUTE_ALGORITHM_NONE_OF_HPP
+#define BOOST_COMPUTE_ALGORITHM_NONE_OF_HPP
+
+#include <boost/compute/system.hpp>
+#include <boost/compute/algorithm/find_if.hpp>
+
+namespace boost {
+namespace compute {
+
+/// Checks that \p predicate does not return \c true for any element in
+/// the range [\p first, \p last). Implemented as a find_if() search that
+/// must come back with \p last.
+///
+/// \see all_of(), any_of()
+template<class InputIterator, class UnaryPredicate>
+inline bool none_of(InputIterator first,
+                    InputIterator last,
+                    UnaryPredicate predicate,
+                    command_queue &queue = system::default_queue())
+{
+    InputIterator match =
+        ::boost::compute::find_if(first, last, predicate, queue);
+
+    return match == last;
+}
+
+} // end compute namespace
+} // end boost namespace
+
+#endif // BOOST_COMPUTE_ALGORITHM_NONE_OF_HPP
diff --git a/boost/compute/algorithm/nth_element.hpp b/boost/compute/algorithm/nth_element.hpp
new file mode 100644
index 0000000000..68f7a3dbc0
--- /dev/null
+++ b/boost/compute/algorithm/nth_element.hpp
@@ -0,0 +1,87 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#ifndef BOOST_COMPUTE_ALGORITHM_NTH_ELEMENT_HPP
+#define BOOST_COMPUTE_ALGORITHM_NTH_ELEMENT_HPP
+
+#include <boost/compute/command_queue.hpp>
+#include <boost/compute/algorithm/fill_n.hpp>
+#include <boost/compute/algorithm/find.hpp>
+#include <boost/compute/algorithm/partition.hpp>
+#include <boost/compute/algorithm/sort.hpp>
+#include <boost/compute/functional/bind.hpp>
+
+namespace boost {
+namespace compute {
+
+/// Rearranges the elements in the range [\p first, \p last) such that
+/// the \p nth element is the one that would be in that position in a
+/// sequence sorted with \p compare. Afterwards no element before \p nth
+/// is ordered after it and no element after \p nth is ordered before it.
+///
+/// \param first first element in the range
+/// \param nth position whose final element is to be determined; nothing
+///        is done if it equals \p last
+/// \param last end of the range
+/// \param compare comparison function object which returns true if the first
+///        argument is ordered before the second
+/// \param queue command queue to perform the operation
+template<class Iterator, class Compare>
+inline void nth_element(Iterator first,
+                        Iterator nth,
+                        Iterator last,
+                        Compare compare,
+                        command_queue &queue = system::default_queue())
+{
+    if(nth == last) return;
+
+    typedef typename std::iterator_traits<Iterator>::value_type value_type;
+
+    using boost::compute::placeholders::_1;
+
+    while(true)
+    {
+        // the value currently stored at nth serves as the pivot
+        value_type value = nth.read(queue);
+
+        // elements ordered before the pivot are moved to [first, new_nth)
+        Iterator new_nth = partition(
+            first, last, ::boost::compute::bind(compare, _1, value), queue
+        );
+
+        // swap one occurrence of the pivot into new_nth, which is its
+        // final position
+        Iterator old_nth = find(new_nth, last, value, queue);
+
+        value_type new_value = new_nth.read(queue);
+
+        fill_n(new_nth, 1, value, queue);
+        fill_n(old_nth, 1, new_value, queue);
+
+        // Done only when the pivot itself landed on nth. Comparing the
+        // value stored at nth against the pivot value is not enough: with
+        // duplicates (e.g. {5, 7, 5} and nth at the last element) nth can
+        // hold a copy of the pivot while larger elements still precede it.
+        if(new_nth == nth) break;
+
+        if(std::distance(first, nth) < std::distance(first, new_nth))
+        {
+            // nth lies left of the pivot
+            last = new_nth;
+        }
+        else
+        {
+            // nth lies right of the pivot; the pivot is already in place,
+            // so it is excluded and the range shrinks every iteration
+            first = new_nth + 1;
+        }
+    }
+}
+
+/// \overload
+///
+/// Orders the elements with \c less of the iterator's value type.
+template<class Iterator>
+inline void nth_element(Iterator first,
+                        Iterator nth,
+                        Iterator last,
+                        command_queue &queue = system::default_queue())
+{
+    typedef typename std::iterator_traits<Iterator>::value_type value_type;
+
+    // nothing to do when nth is the end of the range
+    if(nth == last){
+        return;
+    }
+
+    less<value_type> default_compare;
+
+    nth_element(first, nth, last, default_compare, queue);
+}
+
+} // end compute namespace
+} // end boost namespace
+
+#endif // BOOST_COMPUTE_ALGORITHM_NTH_ELEMENT_HPP
diff --git a/boost/compute/algorithm/partial_sum.hpp b/boost/compute/algorithm/partial_sum.hpp
new file mode 100644
index 0000000000..d440369a5a
--- /dev/null
+++ b/boost/compute/algorithm/partial_sum.hpp
@@ -0,0 +1,37 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#ifndef BOOST_COMPUTE_ALGORITHM_PARTIAL_SUM_HPP
+#define BOOST_COMPUTE_ALGORITHM_PARTIAL_SUM_HPP
+
+#include <boost/compute/system.hpp>
+#include <boost/compute/command_queue.hpp>
+#include <boost/compute/algorithm/inclusive_scan.hpp>
+
+namespace boost {
+namespace compute {
+
+/// Computes the running (cumulative) sum of the elements in the range
+/// [\p first, \p last) and stores the values in the range beginning at
+/// \p result. This is a thin wrapper around inclusive_scan().
+///
+/// \return the iterator returned by inclusive_scan()
+template<class InputIterator, class OutputIterator>
+inline OutputIterator
+partial_sum(InputIterator first,
+            InputIterator last,
+            OutputIterator result,
+            command_queue &queue = system::default_queue())
+{
+    OutputIterator result_end =
+        ::boost::compute::inclusive_scan(first, last, result, queue);
+
+    return result_end;
+}
+
+} // end compute namespace
+} // end boost namespace
+
+#endif // BOOST_COMPUTE_ALGORITHM_PARTIAL_SUM_HPP
diff --git a/boost/compute/algorithm/partition.hpp b/boost/compute/algorithm/partition.hpp
new file mode 100644
index 0000000000..7860350e0d
--- /dev/null
+++ b/boost/compute/algorithm/partition.hpp
@@ -0,0 +1,39 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#ifndef BOOST_COMPUTE_ALGORITHM_PARTITION_HPP
+#define BOOST_COMPUTE_ALGORITHM_PARTITION_HPP
+
+#include <boost/compute/system.hpp>
+#include <boost/compute/command_queue.hpp>
+#include <boost/compute/algorithm/stable_partition.hpp>
+
+namespace boost {
+namespace compute {
+
+///
+/// Reorders the elements in the range [\p first, \p last) according to
+/// \p predicate. Callers must not rely on the order of the elements being
+/// preserved; this implementation forwards to stable_partition().
+///
+/// \return the iterator returned by stable_partition()
+///
+/// \see is_partitioned() and stable_partition()
+///
+template<class Iterator, class UnaryPredicate>
+inline Iterator partition(Iterator first,
+                          Iterator last,
+                          UnaryPredicate predicate,
+                          command_queue &queue = system::default_queue())
+{
+    Iterator partition_end = stable_partition(first, last, predicate, queue);
+
+    return partition_end;
+}
+
+} // end compute namespace
+} // end boost namespace
+
+#endif // BOOST_COMPUTE_ALGORITHM_PARTITION_HPP
diff --git a/boost/compute/algorithm/partition_copy.hpp b/boost/compute/algorithm/partition_copy.hpp
new file mode 100644
index 0000000000..80a2c6475f
--- /dev/null
+++ b/boost/compute/algorithm/partition_copy.hpp
@@ -0,0 +1,63 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#ifndef BOOST_COMPUTE_ALGORITHM_PARTITION_COPY_HPP
+#define BOOST_COMPUTE_ALGORITHM_PARTITION_COPY_HPP
+
+#include <boost/compute/system.hpp>
+#include <boost/compute/functional.hpp>
+#include <boost/compute/command_queue.hpp>
+#include <boost/compute/algorithm/copy_if.hpp>
+
+namespace boost {
+namespace compute {
+
+/// Splits the range [\p first, \p last) into two output ranges: elements
+/// for which \p predicate returns \c true are copied to the range
+/// beginning at \p first_true, and elements for which it returns \c false
+/// are copied to the range beginning at \p first_false. The input is
+/// traversed twice, once by each copy_if() call.
+///
+/// \return pair of iterators to the end of the true and the false output
+///         ranges
+///
+/// \see partition()
+template<class InputIterator,
+         class OutputIterator1,
+         class OutputIterator2,
+         class UnaryPredicate>
+inline std::pair<OutputIterator1, OutputIterator2>
+partition_copy(InputIterator first,
+               InputIterator last,
+               OutputIterator1 first_true,
+               OutputIterator2 first_false,
+               UnaryPredicate predicate,
+               command_queue &queue = system::default_queue())
+{
+    // first pass: elements accepted by the predicate
+    OutputIterator1 true_end = ::boost::compute::copy_if(
+        first, last, first_true, predicate, queue
+    );
+
+    // second pass: elements rejected by the predicate
+    OutputIterator2 false_end = ::boost::compute::copy_if(
+        first, last, first_false, not1(predicate), queue
+    );
+
+    return std::make_pair(true_end, false_end);
+}
+
+} // end compute namespace
+} // end boost namespace
+
+#endif // BOOST_COMPUTE_ALGORITHM_PARTITION_COPY_HPP
diff --git a/boost/compute/algorithm/partition_point.hpp b/boost/compute/algorithm/partition_point.hpp
new file mode 100644
index 0000000000..3cc2bc0ca6
--- /dev/null
+++ b/boost/compute/algorithm/partition_point.hpp
@@ -0,0 +1,46 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2014 Roshan <thisisroshansmail@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#ifndef BOOST_COMPUTE_ALGORITHM_PARTITION_POINT_HPP
+#define BOOST_COMPUTE_ALGORITHM_PARTITION_POINT_HPP
+
+#include <boost/compute/system.hpp>
+#include <boost/compute/command_queue.hpp>
+#include <boost/compute/algorithm/detail/binary_find.hpp>
+
+namespace boost {
+namespace compute {
+
+///
+/// \brief Partition point algorithm
+///
+/// Locates the end of the true values in the partitioned range
+/// [first, last) by running binary_find() with the negated predicate.
+/// \return Iterator pointing to end of true values
+///
+/// \param first Iterator pointing to start of range
+/// \param last Iterator pointing to end of range
+/// \param predicate Unary predicate to be applied on each element
+/// \param queue Queue on which to execute
+///
+/// \see partition() and stable_partition()
+///
+template<class InputIterator, class UnaryPredicate>
+inline InputIterator partition_point(InputIterator first,
+                                     InputIterator last,
+                                     UnaryPredicate predicate,
+                                     command_queue &queue = system::default_queue())
+{
+    // search with the negated predicate
+    InputIterator true_end =
+        detail::binary_find(first, last, not1(predicate), queue);
+
+    return true_end;
+}
+
+} // end compute namespace
+} // end boost namespace
+
+#endif // BOOST_COMPUTE_ALGORITHM_PARTITION_POINT_HPP
diff --git a/boost/compute/algorithm/prev_permutation.hpp b/boost/compute/algorithm/prev_permutation.hpp
new file mode 100644
index 0000000000..03c01bf8f4
--- /dev/null
+++ b/boost/compute/algorithm/prev_permutation.hpp
@@ -0,0 +1,170 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2014 Roshan <thisisroshansmail@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#ifndef BOOST_COMPUTE_ALGORITHM_PREV_PERMUTATION_HPP
+#define BOOST_COMPUTE_ALGORITHM_PREV_PERMUTATION_HPP
+
+#include <iterator>
+
+#include <boost/compute/system.hpp>
+#include <boost/compute/command_queue.hpp>
+#include <boost/compute/container/detail/scalar.hpp>
+#include <boost/compute/algorithm/reverse.hpp>
+
+namespace boost {
+namespace compute {
+namespace detail {
+
+///
+/// \brief Helper function for prev_permutation
+///
+/// Finds the rightmost element in [\p first, \p last) which is greater
+/// than its next element.
+///
+/// \param first Iterator pointing to start of range
+/// \param last Iterator pointing to end of range
+/// \param queue Queue on which to execute
+///
+/// \return Iterator to that element, or \p last if the range has fewer
+/// than two elements or no element is greater than its next element
+///
+template<class InputIterator>
+inline InputIterator prev_permutation_helper(InputIterator first,
+ InputIterator last,
+ command_queue &queue)
+{
+ typedef typename std::iterator_traits<InputIterator>::value_type value_type;
+
+ size_t count = detail::iterator_range_size(first, last);
+ if(count == 0 || count == 1){
+ return last;
+ }
+ // the kernel reads elements i and i+1, so it runs over count-1 indices
+ count = count - 1;
+ const context &context = queue.get_context();
+
+ detail::meta_kernel k("prev_permutation");
+ size_t index_arg = k.add_arg<int *>(memory_object::global_memory, "index");
+ atomic_max<int_> atomic_max_int;
+
+ // every index i with first[i] > first[i+1] is folded into *index with
+ // an atomic max, leaving the largest such index
+ k << k.decl<const int_>("i") << " = get_global_id(0);\n"
+ << k.decl<const value_type>("cur_value") << "="
+ << first[k.var<const int_>("i")] << ";\n"
+ << k.decl<const value_type>("next_value") << "="
+ << first[k.expr<const int_>("i+1")] << ";\n"
+ << "if(cur_value > next_value){\n"
+ << " " << atomic_max_int(k.var<int_ *>("index"), k.var<int_>("i")) << ";\n"
+ << "}\n";
+
+ kernel kernel = k.compile(context);
+
+ scalar<int_> index(context);
+ kernel.set_arg(index_arg, index.get_buffer());
+
+ // -1 marks "no such element found"
+ index.write(static_cast<int_>(-1), queue);
+
+ queue.enqueue_1d_range_kernel(kernel, 0, count, 0);
+
+ int result = static_cast<int>(index.read(queue));
+ if(result == -1) return last;
+ else return first + result;
+}
+
+///
+/// \brief Helper function for prev_permutation
+///
+/// To find the largest element to the right of the element found above
+/// that is smaller than it
+///
+/// \param first Iterator pointing to start of the range to search
+/// \param last Iterator pointing to end of the range to search
+/// \param value Value the result has to be smaller than
+/// \param queue Queue on which to execute
+///
+/// \return Iterator at the index computed by the kernel (\p first if no
+/// work-item updates the index), or \p last for an empty range
+///
+/// NOTE(review): the kernel reads \c *index while other work-items may be
+/// updating it; this presumably relies on the searched range being sorted
+/// in ascending order, as it is when called from prev_permutation() --
+/// confirm before reusing elsewhere.
+///
+template<class InputIterator, class ValueType>
+inline InputIterator pp_floor(InputIterator first,
+ InputIterator last,
+ ValueType value,
+ command_queue &queue)
+{
+ typedef typename std::iterator_traits<InputIterator>::value_type value_type;
+
+ size_t count = detail::iterator_range_size(first, last);
+ if(count == 0){
+ return last;
+ }
+ const context &context = queue.get_context();
+
+ detail::meta_kernel k("pp_floor");
+ size_t index_arg = k.add_arg<int *>(memory_object::global_memory, "index");
+ size_t value_arg = k.add_arg<value_type>(memory_object::private_memory, "value");
+ atomic_max<int_> atomic_max_int;
+
+ // an index i is folded into *index (atomic max) when first[i] is smaller
+ // than value and not smaller than the element currently at *index
+ k << k.decl<const int_>("i") << " = get_global_id(0);\n"
+ << k.decl<const value_type>("cur_value") << "="
+ << first[k.var<const int_>("i")] << ";\n"
+ << "if(cur_value >= " << first[k.expr<int_>("*index")]
+ << " && cur_value < value){\n"
+ << " " << atomic_max_int(k.var<int_ *>("index"), k.var<int_>("i")) << ";\n"
+ << "}\n";
+
+ kernel kernel = k.compile(context);
+
+ scalar<int_> index(context);
+ kernel.set_arg(index_arg, index.get_buffer());
+
+ // the search starts from the first element of the range
+ index.write(static_cast<int_>(0), queue);
+
+ kernel.set_arg(value_arg, value);
+
+ queue.enqueue_1d_range_kernel(kernel, 0, count, 0);
+
+ int result = static_cast<int>(index.read(queue));
+ return first + result;
+}
+
+} // end detail namespace
+
+///
+/// \brief Permutation generating algorithm
+///
+/// Rearranges the range [first, last) into the permutation that precedes
+/// the current one in lexicographic order. If there is no such permutation
+/// the range is reversed instead.
+/// \return \c true if the previous permutation was generated, \c false if
+/// the range was empty or the first permutation was crossed and the range
+/// was reset
+///
+/// \param first Iterator pointing to start of range
+/// \param last Iterator pointing to end of range
+/// \param queue Queue on which to execute
+///
+template<class InputIterator>
+inline bool prev_permutation(InputIterator first,
+                             InputIterator last,
+                             command_queue &queue = system::default_queue())
+{
+    typedef typename std::iterator_traits<InputIterator>::value_type value_type;
+
+    if(first == last){
+        return false;
+    }
+
+    // rightmost element that is greater than its successor
+    InputIterator pivot =
+        detail::prev_permutation_helper(first, last, queue);
+
+    if(pivot == last){
+        // no such element: reverse the whole range and report the reset
+        reverse(first, last, queue);
+        return false;
+    }
+
+    value_type pivot_value = pivot.read(queue);
+
+    // element right of the pivot that takes its place
+    InputIterator floor_element =
+        detail::pp_floor(pivot + 1, last, pivot_value, queue);
+
+    value_type floor_value = floor_element.read(queue);
+
+    // exchange the two elements ...
+    pivot.write(floor_value, queue);
+    floor_element.write(pivot_value, queue);
+
+    // ... and reverse everything after the pivot position
+    reverse(pivot + 1, last, queue);
+
+    return true;
+}
+
+} // end compute namespace
+} // end boost namespace
+
+#endif // BOOST_COMPUTE_ALGORITHM_PREV_PERMUTATION_HPP
diff --git a/boost/compute/algorithm/random_shuffle.hpp b/boost/compute/algorithm/random_shuffle.hpp
new file mode 100644
index 0000000000..7d2d46a133
--- /dev/null
+++ b/boost/compute/algorithm/random_shuffle.hpp
@@ -0,0 +1,75 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#ifndef BOOST_COMPUTE_ALGORITHM_RANDOM_SHUFFLE_HPP
+#define BOOST_COMPUTE_ALGORITHM_RANDOM_SHUFFLE_HPP
+
+#include <vector>
+#include <algorithm>
+
+#include <boost/config.hpp>
+#include <boost/range/algorithm_ext/iota.hpp>
+
+#if !defined(BOOST_NO_CXX11_HDR_RANDOM)
+#include <random>
+#endif
+
+#include <boost/compute/system.hpp>
+#include <boost/compute/functional.hpp>
+#include <boost/compute/command_queue.hpp>
+#include <boost/compute/container/vector.hpp>
+#include <boost/compute/algorithm/scatter.hpp>
+#include <boost/compute/detail/iterator_range_size.hpp>
+
+namespace boost {
+namespace compute {
+
+/// Randomly shuffles the elements in the range [\p first, \p last).
+///
+/// \see scatter()
+template<class Iterator>
+inline void random_shuffle(Iterator first,
+ Iterator last,
+ command_queue &queue = system::default_queue())
+{
+ typedef typename std::iterator_traits<Iterator>::value_type value_type;
+
+ size_t count = detail::iterator_range_size(first, last);
+ if(count == 0){
+ return;
+ }
+
+ // generate shuffled indices on the host
+ std::vector<cl_uint> random_indices(count);
+ boost::iota(random_indices, 0);
+ std::random_shuffle(random_indices.begin(), random_indices.end());
+
+ // copy random indices to the device
+ const context &context = queue.get_context();
+ vector<cl_uint> indices(count, context);
+ ::boost::compute::copy(random_indices.begin(),
+ random_indices.end(),
+ indices.begin(),
+ queue);
+
+ // make a copy of the values on the device
+ vector<value_type> tmp(count, context);
+ ::boost::compute::copy(first,
+ last,
+ tmp.begin(),
+ queue);
+
+ // write values to their new locations
+ ::boost::compute::scatter(tmp.begin(),
+ tmp.end(),
+ indices.begin(),
+ first,
+ queue);
+}
+
+} // end compute namespace
+} // end boost namespace
+
+#endif // BOOST_COMPUTE_ALGORITHM_RANDOM_SHUFFLE_HPP
diff --git a/boost/compute/algorithm/reduce.hpp b/boost/compute/algorithm/reduce.hpp
new file mode 100644
index 0000000000..79624a0e50
--- /dev/null
+++ b/boost/compute/algorithm/reduce.hpp
@@ -0,0 +1,301 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#ifndef BOOST_COMPUTE_ALGORITHM_REDUCE_HPP
+#define BOOST_COMPUTE_ALGORITHM_REDUCE_HPP
+
+#include <iterator>
+
+#include <boost/compute/system.hpp>
+#include <boost/compute/functional.hpp>
+#include <boost/compute/detail/meta_kernel.hpp>
+#include <boost/compute/command_queue.hpp>
+#include <boost/compute/container/array.hpp>
+#include <boost/compute/container/vector.hpp>
+#include <boost/compute/algorithm/copy_n.hpp>
+#include <boost/compute/algorithm/detail/inplace_reduce.hpp>
+#include <boost/compute/algorithm/detail/reduce_on_gpu.hpp>
+#include <boost/compute/algorithm/detail/serial_reduce.hpp>
+#include <boost/compute/detail/iterator_range_size.hpp>
+#include <boost/compute/memory/local_buffer.hpp>
+#include <boost/compute/type_traits/result_of.hpp>
+
+namespace boost {
+namespace compute {
+namespace detail {
+
+/// Runs one block-wise reduction pass over the \p count values starting
+/// at \p first and stores one partial result per block in the buffer
+/// behind \p result.
+///
+/// Every work-item first combines two adjacent inputs, so one work-group
+/// of \p block_size items consumes 2 * \p block_size values. The group
+/// then reduces its values in local memory. Inputs that do not fill a
+/// complete block are folded by a single serial task whose result is
+/// stored directly after the results of the complete blocks.
+///
+/// \param first iterator to the first input value
+/// \param count number of input values
+/// \param result device iterator; partial results are written to its
+///        buffer starting at element 0
+/// \param block_size work-group size of the parallel kernel
+/// \param function binary reduction function
+/// \param queue command queue to perform the operation
+///
+/// \return the number of partial results written, i.e.
+///         ceil(count / (2 * block_size))
+template<class InputIterator, class OutputIterator, class BinaryFunction>
+size_t reduce(InputIterator first,
+              size_t count,
+              OutputIterator result,
+              size_t block_size,
+              BinaryFunction function,
+              command_queue &queue)
+{
+    typedef typename
+        std::iterator_traits<InputIterator>::value_type
+        input_type;
+    typedef typename
+        boost::compute::result_of<BinaryFunction(input_type, input_type)>::type
+        result_type;
+
+    const context &context = queue.get_context();
+
+    // number of complete blocks (each consumes 2 * block_size inputs)
+    size_t block_count = count / 2 / block_size;
+
+    // one extra partial result is produced when inputs are left over.
+    // this is computed with integers on purpose: converting count to float
+    // is inexact above 2^24 and could report one block too few
+    size_t total_block_count = block_count;
+    if(block_count * block_size * 2 < count){
+        total_block_count += 1;
+    }
+
+    if(block_count != 0){
+        meta_kernel k("block_reduce");
+        size_t output_arg = k.add_arg<result_type *>(memory_object::global_memory, "output");
+        size_t block_arg = k.add_arg<input_type *>(memory_object::local_memory, "block");
+
+        k <<
+            "const uint gid = get_global_id(0);\n" <<
+            "const uint lid = get_local_id(0);\n" <<
+
+            // copy values to local memory
+            "block[lid] = " <<
+                function(first[k.make_var<uint_>("gid*2+0")],
+                         first[k.make_var<uint_>("gid*2+1")]) << ";\n" <<
+
+            // perform reduction
+            "for(uint i = 1; i < " << uint_(block_size) << "; i <<= 1){\n" <<
+            " barrier(CLK_LOCAL_MEM_FENCE);\n" <<
+            " uint mask = (i << 1) - 1;\n" <<
+            " if((lid & mask) == 0){\n" <<
+            " block[lid] = " <<
+                function(k.expr<input_type>("block[lid]"),
+                         k.expr<input_type>("block[lid+i]")) << ";\n" <<
+            " }\n" <<
+            "}\n" <<
+
+            // write block result to global output
+            "if(lid == 0)\n" <<
+            " output[get_group_id(0)] = block[0];\n";
+
+        kernel kernel = k.compile(context);
+        kernel.set_arg(output_arg, result.get_buffer());
+        kernel.set_arg(block_arg, local_buffer<input_type>(block_size));
+
+        // one work-item per pair of inputs, one work-group per block
+        queue.enqueue_1d_range_kernel(kernel,
+                                      0,
+                                      block_count * block_size,
+                                      block_size);
+    }
+
+    // serially reduce any leftovers
+    if(block_count * block_size * 2 < count){
+        size_t last_block_start = block_count * block_size * 2;
+
+        meta_kernel k("extra_serial_reduce");
+        size_t count_arg = k.add_arg<uint_>("count");
+        size_t offset_arg = k.add_arg<uint_>("offset");
+        size_t output_arg = k.add_arg<result_type *>(memory_object::global_memory, "output");
+        size_t output_offset_arg = k.add_arg<uint_>("output_offset");
+
+        k <<
+            k.decl<result_type>("result") << " = \n" <<
+                first[k.expr<uint_>("offset")] << ";\n" <<
+            "for(uint i = offset + 1; i < count; i++)\n" <<
+            " result = " <<
+                function(k.var<result_type>("result"),
+                         first[k.var<uint_>("i")]) << ";\n" <<
+            "output[output_offset] = result;\n";
+
+        kernel kernel = k.compile(context);
+        kernel.set_arg(count_arg, static_cast<uint_>(count));
+        kernel.set_arg(offset_arg, static_cast<uint_>(last_block_start));
+        kernel.set_arg(output_arg, result.get_buffer());
+        // the leftover result goes into the slot after the last full block
+        kernel.set_arg(output_offset_arg, static_cast<uint_>(block_count));
+
+        queue.enqueue_task(kernel);
+    }
+
+    return total_block_count;
+}
+
+/// Reduces the \p count values starting at \p first block by block and
+/// returns a device vector holding one partial result per block.
+///
+/// The vector has ceil(count / (2 * block_size)) elements: one for every
+/// complete block of 2 * \p block_size inputs plus one more when inputs
+/// are left over.
+///
+/// \param first iterator to the first input value
+/// \param count number of input values
+/// \param block_size work-group size used for the reduction pass
+/// \param function binary reduction function
+/// \param queue command queue to perform the operation
+template<class InputIterator, class BinaryFunction>
+inline vector<
+    typename boost::compute::result_of<
+        BinaryFunction(
+            typename std::iterator_traits<InputIterator>::value_type,
+            typename std::iterator_traits<InputIterator>::value_type
+        )
+    >::type
+>
+block_reduce(InputIterator first,
+             size_t count,
+             size_t block_size,
+             BinaryFunction function,
+             command_queue &queue)
+{
+    typedef typename
+        std::iterator_traits<InputIterator>::value_type
+        input_type;
+    typedef typename
+        boost::compute::result_of<BinaryFunction(input_type, input_type)>::type
+        result_type;
+
+    const context &context = queue.get_context();
+
+    // size the output with exact integer arithmetic. rounding count to
+    // float is lossy above 2^24 and could allocate one element too few,
+    // letting the leftover result be written past the end of the vector
+    size_t total_block_count = count / 2 / block_size;
+    if(total_block_count * block_size * 2 < count){
+        total_block_count += 1;
+    }
+    vector<result_type> result_vector(total_block_count, context);
+
+    reduce(first, count, result_vector.begin(), block_size, function, queue);
+
+    return result_vector;
+}
+
+// Reduces [first, last) with an arbitrary binary function and copies the
+// single resulting value to result. CPU devices use the serial kernel,
+// all other devices use a block-wise reduction.
+template<class InputIterator, class OutputIterator, class BinaryFunction>
+inline void generic_reduce(InputIterator first,
+                           InputIterator last,
+                           OutputIterator result,
+                           BinaryFunction function,
+                           command_queue &queue)
+{
+    typedef typename
+        std::iterator_traits<InputIterator>::value_type
+        input_type;
+    typedef typename
+        boost::compute::result_of<BinaryFunction(input_type, input_type)>::type
+        result_type;
+
+    const device &device = queue.get_device();
+    const context &context = queue.get_context();
+
+    size_t count = detail::iterator_range_size(first, last);
+
+    if(device.type() & device::cpu){
+        // reduce into a one-element device vector, then hand it over
+        boost::compute::vector<result_type> single(1, context);
+        detail::serial_reduce(first, last, single.begin(), function, queue);
+        boost::compute::copy_n(single.begin(), 1, result, queue);
+        return;
+    }
+
+    size_t block_size = 256;
+
+    // first pass: one partial result per block
+    vector<result_type> partials = detail::block_reduce(first,
+                                                        count,
+                                                        block_size,
+                                                        function,
+                                                        queue);
+
+    // fold the partial results down to a single value if necessary
+    if(partials.size() > 1){
+        detail::inplace_reduce(partials.begin(),
+                               partials.end(),
+                               function,
+                               queue);
+    }
+
+    boost::compute::copy_n(partials.begin(), 1, result, queue);
+}
+
+// Overload selected for plus<T>: reduces into a one-element device array
+// (serially on CPU devices, with reduce_on_gpu() otherwise) and then
+// copies that element to result.
+template<class InputIterator, class OutputIterator, class T>
+inline void dispatch_reduce(InputIterator first,
+                            InputIterator last,
+                            OutputIterator result,
+                            const plus<T> &function,
+                            command_queue &queue)
+{
+    const context &context = queue.get_context();
+    const device &device = queue.get_device();
+
+    // temporary device-side storage for the reduced value
+    array<T, 1> reduced(context);
+
+    const bool on_cpu = (device.type() & device::cpu) != 0;
+    if(!on_cpu){
+        reduce_on_gpu(first, last, reduced.begin(), function, queue);
+    }
+    else {
+        detail::serial_reduce(first, last, reduced.begin(), function, queue);
+    }
+
+    // copy to result iterator
+    copy_n(reduced.begin(), 1, result, queue);
+}
+
+// Fallback overload for every reduction function without a dedicated
+// overload: simply forwards to generic_reduce().
+template<class InputIterator, class OutputIterator, class BinaryFunction>
+inline void
+dispatch_reduce(InputIterator first,
+                InputIterator last,
+                OutputIterator result,
+                BinaryFunction function,
+                command_queue &queue)
+{
+    generic_reduce(first, last, result, function, queue);
+}
+
+} // end detail namespace
+
+/// Returns the result of applying \p function to the elements in the
+/// range [\p first, \p last).
+///
+/// If no function is specified, \c plus will be used.
+///
+/// \param first first element in the input range
+/// \param last last element in the input range
+/// \param result iterator pointing to the output
+/// \param function binary reduction function
+/// \param queue command queue to perform the operation
+///
+/// The \c reduce() algorithm assumes that the binary reduction function is
+/// associative. When used with non-associative functions the result may
+/// be non-deterministic and vary in precision. Notably this affects the
+/// \c plus<float>() function as floating-point addition is not associative
+/// and may produce slightly different results than a serial algorithm.
+///
+/// This algorithm supports both host and device iterators for the
+/// result argument. This allows for values to be reduced and copied
+/// to the host all with a single function call.
+///
+/// For example, to calculate the sum of the values in a device vector and
+/// copy the result to a value on the host:
+///
+/// \snippet test/test_reduce.cpp sum_int
+///
+/// Note that while the \c reduce() algorithm is conceptually identical to
+/// the \c accumulate() algorithm, its implementation is substantially more
+/// efficient on parallel hardware. For more information, see the documentation
+/// on the \c accumulate() algorithm.
+///
+/// \see accumulate()
+template<class InputIterator, class OutputIterator, class BinaryFunction>
+inline void reduce(InputIterator first,
+                   InputIterator last,
+                   OutputIterator result,
+                   BinaryFunction function,
+                   command_queue &queue = system::default_queue())
+{
+    // an empty range leaves the output untouched
+    if(first != last){
+        detail::dispatch_reduce(first, last, result, function, queue);
+    }
+}
+
+/// \overload
+template<class InputIterator, class OutputIterator>
+inline void reduce(InputIterator first,
+                   InputIterator last,
+                   OutputIterator result,
+                   command_queue &queue = system::default_queue())
+{
+    typedef typename std::iterator_traits<InputIterator>::value_type T;
+
+    // no reduction function given: sum the values with plus<T>.
+    // an empty range leaves the output untouched
+    if(first != last){
+        detail::dispatch_reduce(first, last, result, plus<T>(), queue);
+    }
+}
+
+} // end compute namespace
+} // end boost namespace
+
+#endif // BOOST_COMPUTE_ALGORITHM_REDUCE_HPP
diff --git a/boost/compute/algorithm/reduce_by_key.hpp b/boost/compute/algorithm/reduce_by_key.hpp
new file mode 100644
index 0000000000..87c73e887f
--- /dev/null
+++ b/boost/compute/algorithm/reduce_by_key.hpp
@@ -0,0 +1,118 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2015 Jakub Szuppe <j.szuppe@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#ifndef BOOST_COMPUTE_ALGORITHM_REDUCE_BY_KEY_HPP
+#define BOOST_COMPUTE_ALGORITHM_REDUCE_BY_KEY_HPP
+
+#include <iterator>
+#include <utility>
+
+#include <boost/compute/command_queue.hpp>
+#include <boost/compute/device.hpp>
+#include <boost/compute/functional.hpp>
+#include <boost/compute/system.hpp>
+#include <boost/compute/algorithm/detail/reduce_by_key.hpp>
+
+namespace boost {
+namespace compute {
+
+/// The \c reduce_by_key() algorithm performs reduction for each contiguous
+/// subsequence of values determined by equivalent keys.
+///
+/// Returns a pair of iterators at the end of the ranges [\p keys_result, keys_result_last)
+/// and [\p values_result, \p values_result_last).
+///
+/// If no function is specified, \c plus will be used.
+/// If no predicate is specified, \c equal_to will be used.
+///
+/// \param keys_first the first key
+/// \param keys_last the last key
+/// \param values_first the first input value
+/// \param keys_result iterator pointing to the key output
+/// \param values_result iterator pointing to the reduced value output
+/// \param function binary reduction function
+/// \param predicate binary predicate which returns true only if two keys are equal
+/// \param queue command queue to perform the operation
+///
+/// The \c reduce_by_key() algorithm assumes that the binary reduction function
+/// is associative. When used with non-associative functions the result may
+/// be non-deterministic and vary in precision. Notably this affects the
+/// \c plus<float>() function as floating-point addition is not associative
+/// and may produce slightly different results than a serial algorithm.
+///
+/// For example, to calculate the sum of the values for each key:
+///
+/// \snippet test/test_reduce_by_key.cpp reduce_by_key_int
+///
+/// \see reduce()
+template<class InputKeyIterator, class InputValueIterator,
+         class OutputKeyIterator, class OutputValueIterator,
+         class BinaryFunction, class BinaryPredicate>
+inline std::pair<OutputKeyIterator, OutputValueIterator>
+reduce_by_key(InputKeyIterator keys_first,
+              InputKeyIterator keys_last,
+              InputValueIterator values_first,
+              OutputKeyIterator keys_result,
+              OutputValueIterator values_result,
+              BinaryFunction function,
+              BinaryPredicate predicate,
+              command_queue &queue = system::default_queue())
+{
+    // all of the work is delegated to the detail dispatcher; its result
+    // is the pair of end iterators of the two output ranges
+    std::pair<OutputKeyIterator, OutputValueIterator> result_ends =
+        detail::dispatch_reduce_by_key(keys_first, keys_last, values_first,
+                                       keys_result, values_result,
+                                       function, predicate,
+                                       queue);
+    return result_ends;
+}
+
+/// \overload
+template<class InputKeyIterator, class InputValueIterator,
+         class OutputKeyIterator, class OutputValueIterator,
+         class BinaryFunction>
+inline std::pair<OutputKeyIterator, OutputValueIterator>
+reduce_by_key(InputKeyIterator keys_first,
+              InputKeyIterator keys_last,
+              InputValueIterator values_first,
+              OutputKeyIterator keys_result,
+              OutputValueIterator values_result,
+              BinaryFunction function,
+              command_queue &queue = system::default_queue())
+{
+    typedef typename std::iterator_traits<InputKeyIterator>::value_type key_type;
+
+    // no predicate given: keys are compared with equal_to
+    const equal_to<key_type> same_key = equal_to<key_type>();
+
+    return reduce_by_key(keys_first, keys_last, values_first,
+                         keys_result, values_result,
+                         function, same_key,
+                         queue);
+}
+
+/// \overload
+template<class InputKeyIterator, class InputValueIterator,
+         class OutputKeyIterator, class OutputValueIterator>
+inline std::pair<OutputKeyIterator, OutputValueIterator>
+reduce_by_key(InputKeyIterator keys_first,
+              InputKeyIterator keys_last,
+              InputValueIterator values_first,
+              OutputKeyIterator keys_result,
+              OutputValueIterator values_result,
+              command_queue &queue = system::default_queue())
+{
+    typedef typename std::iterator_traits<InputValueIterator>::value_type value_type;
+    typedef typename std::iterator_traits<InputKeyIterator>::value_type key_type;
+
+    // defaults: values are summed with plus, keys compared with equal_to
+    const plus<value_type> add_values = plus<value_type>();
+    const equal_to<key_type> same_key = equal_to<key_type>();
+
+    return reduce_by_key(keys_first, keys_last, values_first,
+                         keys_result, values_result,
+                         add_values, same_key,
+                         queue);
+}
+
+} // end compute namespace
+} // end boost namespace
+
+#endif // BOOST_COMPUTE_ALGORITHM_REDUCE_BY_KEY_HPP
diff --git a/boost/compute/algorithm/remove.hpp b/boost/compute/algorithm/remove.hpp
new file mode 100644
index 0000000000..98feb1f9d8
--- /dev/null
+++ b/boost/compute/algorithm/remove.hpp
@@ -0,0 +1,54 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#ifndef BOOST_COMPUTE_ALGORITHM_REMOVE_HPP
+#define BOOST_COMPUTE_ALGORITHM_REMOVE_HPP
+
+#include <boost/compute/lambda.hpp>
+#include <boost/compute/system.hpp>
+#include <boost/compute/algorithm/remove_if.hpp>
+#include <boost/compute/type_traits/vector_size.hpp>
+
+namespace boost {
+namespace compute {
+
+/// Removes each element equal to \p value in the range [\p first,
+/// \p last).
+///
+/// \see remove_if()
+template<class Iterator, class T>
+inline Iterator remove(Iterator first,
+                       Iterator last,
+                       const T &value,
+                       command_queue &queue = system::default_queue())
+{
+    typedef typename std::iterator_traits<Iterator>::value_type value_type;
+
+    using ::boost::compute::_1;
+    using ::boost::compute::lambda::all;
+
+    // for element types with more than one component the comparison is
+    // wrapped in all() before being used as the removal predicate
+    if(vector_size<value_type>::value != 1){
+        return ::boost::compute::remove_if(first,
+                                           last,
+                                           all(_1 == value),
+                                           queue);
+    }
+
+    // single-component element types compare directly
+    return ::boost::compute::remove_if(first,
+                                       last,
+                                       _1 == value,
+                                       queue);
+}
+
+} // end compute namespace
+} // end boost namespace
+
+#endif // BOOST_COMPUTE_ALGORITHM_REMOVE_HPP
diff --git a/boost/compute/algorithm/remove_if.hpp b/boost/compute/algorithm/remove_if.hpp
new file mode 100644
index 0000000000..5e416bef88
--- /dev/null
+++ b/boost/compute/algorithm/remove_if.hpp
@@ -0,0 +1,47 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#ifndef BOOST_COMPUTE_ALGORITHM_REMOVE_IF_HPP
+#define BOOST_COMPUTE_ALGORITHM_REMOVE_IF_HPP
+
+#include <boost/compute/system.hpp>
+#include <boost/compute/algorithm/copy_if.hpp>
+#include <boost/compute/container/vector.hpp>
+#include <boost/compute/functional/logical.hpp>
+
+namespace boost {
+namespace compute {
+
+/// Removes each element for which \p predicate returns \c true in the
+/// range [\p first, \p last).
+///
+/// \see remove()
+template<class Iterator, class Predicate>
+inline Iterator remove_if(Iterator first,
+                          Iterator last,
+                          Predicate predicate,
+                          command_queue &queue = system::default_queue())
+{
+    typedef typename std::iterator_traits<Iterator>::value_type value_type;
+
+    // snapshot of the input, so that the range can be overwritten in place
+    ::boost::compute::vector<value_type> snapshot(first, last, queue);
+
+    // keep exactly the elements for which the predicate does NOT hold,
+    // writing them back to the start of the original range
+    Iterator kept_end = ::boost::compute::copy_if(snapshot.begin(),
+                                                  snapshot.end(),
+                                                  first,
+                                                  not1(predicate),
+                                                  queue);
+    return kept_end;
+}
+
+} // end compute namespace
+} // end boost namespace
+
+#endif // BOOST_COMPUTE_ALGORITHM_REMOVE_IF_HPP
diff --git a/boost/compute/algorithm/replace.hpp b/boost/compute/algorithm/replace.hpp
new file mode 100644
index 0000000000..fd649a2fad
--- /dev/null
+++ b/boost/compute/algorithm/replace.hpp
@@ -0,0 +1,90 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#ifndef BOOST_COMPUTE_ALGORITHM_REPLACE_HPP
+#define BOOST_COMPUTE_ALGORITHM_REPLACE_HPP
+
+#include <boost/compute/system.hpp>
+#include <boost/compute/command_queue.hpp>
+#include <boost/compute/detail/meta_kernel.hpp>
+#include <boost/compute/detail/iterator_range_size.hpp>
+
+namespace boost {
+namespace compute {
+namespace detail {
+
+// Meta-kernel that overwrites every element equal to "old_value" with
+// "new_value". Intended call sequence (see replace()): set_range(),
+// set_old_value(), set_new_value(), then exec().
+template<class Iterator, class T>
+class replace_kernel : public meta_kernel
+{
+public:
+ // Creates a kernel named "replace" covering an empty range.
+ replace_kernel()
+ : meta_kernel("replace")
+ {
+ m_count = 0;
+ }
+
+ // Records the size of [first, last) and emits the kernel source: each
+ // work-item compares its element with old_value and, on a match, assigns
+ // new_value to it.
+ void set_range(Iterator first, Iterator last)
+ {
+ m_count = detail::iterator_range_size(first, last);
+
+ *this <<
+ "const uint i = get_global_id(0);\n" <<
+ "if(" << first[var<cl_uint>("i")] << " == " << var<T>("old_value") << ")\n" <<
+ " " << first[var<cl_uint>("i")] << '=' << var<T>("new_value") << ";\n";
+ }
+
+ // Hands the value to search for to the kernel under the name "old_value".
+ void set_old_value(const T &old_value)
+ {
+ add_set_arg<T>("old_value", old_value);
+ }
+
+ // Hands the replacement value to the kernel under the name "new_value".
+ void set_new_value(const T &new_value)
+ {
+ add_set_arg<T>("new_value", new_value);
+ }
+
+ // Runs the kernel over the recorded element count; does nothing when the
+ // recorded range is empty.
+ void exec(command_queue &queue)
+ {
+ if(m_count == 0){
+ // nothing to do
+ return;
+ }
+
+ exec_1d(queue, 0, m_count);
+ }
+
+private:
+ // number of elements in the range passed to set_range()
+ size_t m_count;
+};
+
+} // end detail namespace
+
+/// Replaces each instance of \p old_value in the range [\p first,
+/// \p last) with \p new_value.
+template<class Iterator, class T>
+inline void replace(Iterator first,
+                    Iterator last,
+                    const T &old_value,
+                    const T &new_value,
+                    command_queue &queue = system::default_queue())
+{
+    typedef detail::replace_kernel<Iterator, T> kernel_type;
+
+    // configure the kernel: range first, then the two values
+    kernel_type replacer;
+    replacer.set_range(first, last);
+    replacer.set_old_value(old_value);
+    replacer.set_new_value(new_value);
+
+    // run it on the given queue
+    replacer.exec(queue);
+}
+
+} // end compute namespace
+} // end boost namespace
+
+#endif // BOOST_COMPUTE_ALGORITHM_REPLACE_HPP
diff --git a/boost/compute/algorithm/replace_copy.hpp b/boost/compute/algorithm/replace_copy.hpp
new file mode 100644
index 0000000000..7224bd3ae6
--- /dev/null
+++ b/boost/compute/algorithm/replace_copy.hpp
@@ -0,0 +1,62 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#ifndef BOOST_COMPUTE_ALGORITHM_REPLACE_COPY_HPP
+#define BOOST_COMPUTE_ALGORITHM_REPLACE_COPY_HPP
+
+#include <iterator>
+
+#include <boost/compute/system.hpp>
+#include <boost/compute/command_queue.hpp>
+#include <boost/compute/algorithm/copy.hpp>
+#include <boost/compute/algorithm/replace.hpp>
+
+namespace boost {
+namespace compute {
+
+/// Copies the values in the range [\p first, \p last) to the range
+/// beginning at \p result while replacing each instance of \p old_value
+/// with \p new_value.
+///
+/// \see replace()
+template<class InputIterator, class OutputIterator, class T>
+inline OutputIterator
+replace_copy(InputIterator first,
+             InputIterator last,
+             OutputIterator result,
+             const T &old_value,
+             const T &new_value,
+             command_queue &queue = system::default_queue())
+{
+    typedef typename std::iterator_traits<OutputIterator>::difference_type difference_type;
+
+    const difference_type length = std::distance(first, last);
+    if(length == 0){
+        // empty input: nothing is written
+        return result;
+    }
+
+    // end of the output range, which is also the return value
+    OutputIterator result_end = result + length;
+
+    // step 1: plain copy of the input into the output range
+    ::boost::compute::copy(first, last, result, queue);
+
+    // step 2: substitute the values inside the output range
+    ::boost::compute::replace(result,
+                              result_end,
+                              old_value,
+                              new_value,
+                              queue);
+
+    return result_end;
+}
+
+} // end compute namespace
+} // end boost namespace
+
+#endif // BOOST_COMPUTE_ALGORITHM_REPLACE_COPY_HPP
diff --git a/boost/compute/algorithm/reverse.hpp b/boost/compute/algorithm/reverse.hpp
new file mode 100644
index 0000000000..b6a9e8098c
--- /dev/null
+++ b/boost/compute/algorithm/reverse.hpp
@@ -0,0 +1,74 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#ifndef BOOST_COMPUTE_ALGORITHM_REVERSE_HPP
+#define BOOST_COMPUTE_ALGORITHM_REVERSE_HPP
+
+#include <boost/compute/system.hpp>
+#include <boost/compute/command_queue.hpp>
+#include <boost/compute/detail/meta_kernel.hpp>
+#include <boost/compute/detail/iterator_range_size.hpp>
+
+namespace boost {
+namespace compute {
+namespace detail {
+
+// Meta-kernel that reverses a range in place: work-item i swaps element i
+// with element (size - i - 1).
+template<class Iterator>
+struct reverse_kernel : public meta_kernel
+{
+ // Records the size of [first, last), passes it to the kernel as the
+ // "size" argument and emits the swap code.
+ reverse_kernel(Iterator first, Iterator last)
+ : meta_kernel("reverse")
+ {
+ typedef typename std::iterator_traits<Iterator>::value_type value_type;
+
+ // store size of the range
+ m_size = detail::iterator_range_size(first, last);
+ add_set_arg<const cl_uint>("size", static_cast<const cl_uint>(m_size));
+
+ // i counts from the front, j is the mirrored position from the back;
+ // the two elements are exchanged through the temporary "tmp"
+ *this <<
+ decl<cl_uint>("i") << " = get_global_id(0);\n" <<
+ decl<cl_uint>("j") << " = size - get_global_id(0) - 1;\n" <<
+ decl<value_type>("tmp") << "=" << first[var<cl_uint>("i")] << ";\n" <<
+ first[var<cl_uint>("i")] << "=" << first[var<cl_uint>("j")] << ";\n" <<
+ first[var<cl_uint>("j")] << "= tmp;\n";
+ }
+
+ // Runs the kernel with m_size / 2 as the work size: one work-item per
+ // swapped pair. With an odd size the integer division leaves the middle
+ // element without a work-item, so it stays where it is.
+ void exec(command_queue &queue)
+ {
+ exec_1d(queue, 0, m_size / 2);
+ }
+
+ // number of elements in the range given to the constructor
+ size_t m_size;
+};
+
+} // end detail namespace
+
+/// Reverses the elements in the range [\p first, \p last).
+///
+/// \see reverse_copy()
+template<class Iterator>
+inline void reverse(Iterator first,
+                    Iterator last,
+                    command_queue &queue = system::default_queue())
+{
+    const size_t length = detail::iterator_range_size(first, last);
+
+    // ranges with fewer than two elements are already reversed
+    if(length >= 2){
+        detail::reverse_kernel<Iterator> swapper(first, last);
+        swapper.exec(queue);
+    }
+}
+
+} // end compute namespace
+} // end boost namespace
+
+#endif // BOOST_COMPUTE_ALGORITHM_REVERSE_HPP
diff --git a/boost/compute/algorithm/reverse_copy.hpp b/boost/compute/algorithm/reverse_copy.hpp
new file mode 100644
index 0000000000..c839f44651
--- /dev/null
+++ b/boost/compute/algorithm/reverse_copy.hpp
@@ -0,0 +1,79 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#ifndef BOOST_COMPUTE_ALGORITHM_REVERSE_COPY_HPP
+#define BOOST_COMPUTE_ALGORITHM_REVERSE_COPY_HPP
+
+#include <iterator>
+
+#include <boost/compute/system.hpp>
+#include <boost/compute/command_queue.hpp>
+#include <boost/compute/algorithm/copy.hpp>
+#include <boost/compute/algorithm/reverse.hpp>
+
+namespace boost {
+namespace compute {
+namespace detail {
+
+// Meta-kernel that copies a range in reversed order: work-item i reads
+// input element i and writes it to output element (size - i - 1).
+template<class Iterator, class OutputIterator>
+struct reverse_copy_kernel : public meta_kernel
+{
+ // Records the size of [first, last), passes it to the kernel as the
+ // "size" argument and emits the copy code.
+ reverse_copy_kernel(Iterator first, Iterator last, OutputIterator result)
+ : meta_kernel("reverse_copy")
+ {
+ // store size of the range
+ m_size = detail::iterator_range_size(first, last);
+ add_set_arg<const cl_uint>("size", static_cast<const cl_uint>(m_size));
+
+ // i is the source index, j the mirrored destination index
+ *this <<
+ decl<cl_uint>("i") << " = get_global_id(0);\n" <<
+ decl<cl_uint>("j") << " = size - get_global_id(0) - 1;\n" <<
+ result[var<cl_uint>("j")] << "=" << first[var<cl_uint>("i")] << ";\n";
+ }
+
+ // Runs the kernel with m_size as the work size, i.e. one work-item per
+ // element. No check for an empty range is made here.
+ void exec(command_queue &queue)
+ {
+ exec_1d(queue, 0, m_size);
+ }
+
+ // number of elements in the range given to the constructor
+ size_t m_size;
+};
+
+} // end detail namespace
+
+/// Copies the elements in the range [\p first, \p last) in reversed
+/// order to the range beginning at \p result.
+///
+/// \see reverse()
+template<class InputIterator, class OutputIterator>
+inline OutputIterator
+reverse_copy(InputIterator first,
+             InputIterator last,
+             OutputIterator result,
+             command_queue &queue = system::default_queue())
+{
+    typedef typename std::iterator_traits<OutputIterator>::difference_type difference_type;
+
+    difference_type count = std::distance(first, last);
+    if(count == 0){
+        // empty input: nothing to copy. returning here avoids launching
+        // a kernel with a global work size of zero
+        return result;
+    }
+
+    detail::reverse_copy_kernel<InputIterator, OutputIterator>
+        kernel(first, last, result);
+
+    // run kernel
+    kernel.exec(queue);
+
+    // return iterator to the end of result
+    return result + count;
+}
+
+} // end compute namespace
+} // end boost namespace
+
+#endif // BOOST_COMPUTE_ALGORITHM_REVERSE_COPY_HPP
diff --git a/boost/compute/algorithm/rotate.hpp b/boost/compute/algorithm/rotate.hpp
new file mode 100644
index 0000000000..54cb073cc2
--- /dev/null
+++ b/boost/compute/algorithm/rotate.hpp
@@ -0,0 +1,54 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2014 Roshan <thisisroshansmail@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#ifndef BOOST_COMPUTE_ALGORITHM_ROTATE_HPP
+#define BOOST_COMPUTE_ALGORITHM_ROTATE_HPP
+
+#include <boost/compute/system.hpp>
+#include <boost/compute/algorithm/copy.hpp>
+#include <boost/compute/container/vector.hpp>
+
+namespace boost {
+namespace compute {
+
+/// Performs left rotation such that element at \p n_first comes to the
+/// beginning.
+///
+/// \see rotate_copy()
+template<class InputIterator>
+inline void rotate(InputIterator first,
+                   InputIterator n_first,
+                   InputIterator last,
+                   command_queue &queue = system::default_queue())
+{
+    // rotating by nothing or by the whole range changes nothing
+    if (n_first==first || n_first==last)
+    {
+        return;
+    }
+
+    typedef typename std::iterator_traits<InputIterator>::value_type T;
+
+    // head_size: elements in front of n_first, total_size: whole range
+    const size_t head_size = detail::iterator_range_size(first, n_first);
+    const size_t total_size = detail::iterator_range_size(first, last);
+
+    // work from a device-side copy so the range can be overwritten
+    const context &context = queue.get_context();
+    vector<T> scratch(total_size, context);
+    ::boost::compute::copy(first, last, scratch.begin(), queue);
+
+    // [n_first, last) moves to the front ...
+    ::boost::compute::copy(scratch.begin()+head_size, scratch.end(), first, queue);
+    // ... and the former head [first, n_first) fills the back
+    ::boost::compute::copy(scratch.begin(), scratch.begin()+head_size, last-head_size, queue);
+}
+
+} //end compute namespace
+} //end boost namespace
+
+#endif // BOOST_COMPUTE_ALGORITHM_ROTATE_HPP
diff --git a/boost/compute/algorithm/rotate_copy.hpp b/boost/compute/algorithm/rotate_copy.hpp
new file mode 100644
index 0000000000..fa1b44c5e5
--- /dev/null
+++ b/boost/compute/algorithm/rotate_copy.hpp
@@ -0,0 +1,41 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2014 Roshan <thisisroshansmail@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#ifndef BOOST_COMPUTE_ALGORITHM_ROTATE_COPY_HPP
+#define BOOST_COMPUTE_ALGORITHM_ROTATE_COPY_HPP
+
+#include <boost/compute/system.hpp>
+#include <boost/compute/algorithm/copy.hpp>
+
+namespace boost {
+namespace compute {
+
+/// Writes a left-rotated copy of [\p first, \p last) to the range starting
+/// at \p result, such that the element at \p n_first comes first in the
+/// output. The input range itself is not modified by this function.
+///
+/// \see rotate()
+template<class InputIterator, class OutputIterator>
+inline void rotate_copy(InputIterator first,
+                        InputIterator n_first,
+                        InputIterator last,
+                        OutputIterator result,
+                        command_queue &queue = system::default_queue())
+{
+    size_t head_length = detail::iterator_range_size(first, n_first);
+    size_t tail_length = detail::iterator_range_size(n_first, last);
+
+    // position of the pivot, expressed relative to first
+    const InputIterator pivot = first + head_length;
+
+    // tail goes to the start of the output, head follows right after it
+    ::boost::compute::copy(pivot, last, result, queue);
+    ::boost::compute::copy(first, pivot, result + tail_length, queue);
+}
+
+} //end compute namespace
+} //end boost namespace
+
+#endif // BOOST_COMPUTE_ALGORITHM_ROTATE_COPY_HPP
diff --git a/boost/compute/algorithm/scatter.hpp b/boost/compute/algorithm/scatter.hpp
new file mode 100644
index 0000000000..bea4201628
--- /dev/null
+++ b/boost/compute/algorithm/scatter.hpp
@@ -0,0 +1,99 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#ifndef BOOST_COMPUTE_ALGORITHM_SCATTER_HPP
+#define BOOST_COMPUTE_ALGORITHM_SCATTER_HPP
+
+#include <boost/algorithm/string/replace.hpp>
+
+#include <boost/compute/system.hpp>
+#include <boost/compute/exception.hpp>
+#include <boost/compute/command_queue.hpp>
+#include <boost/compute/iterator/buffer_iterator.hpp>
+#include <boost/compute/type_traits/type_name.hpp>
+#include <boost/compute/detail/iterator_range_size.hpp>
+#include <boost/compute/detail/meta_kernel.hpp>
+
+namespace boost {
+namespace compute {
+namespace detail {
+
+/// Meta-kernel that copies each input element to the output position named
+/// by the corresponding entry of an index map.
+///
+/// Usage is two-step: set_range() generates the kernel source, exec()
+/// binds the arguments and enqueues it.
+template<class InputIterator, class MapIterator, class OutputIterator>
+class scatter_kernel : meta_kernel
+{
+public:
+ scatter_kernel() : meta_kernel("scatter")
+ {}
+
+ /// Records the element count of [first, last) and the indices reported
+ /// by first.get_index() and result.get_index(), declares the two uint
+ /// kernel arguments "input_offset" and "output_offset", and emits the
+ /// kernel body.
+ ///
+ /// Both InputIterator and OutputIterator must provide get_index().
+ void set_range(InputIterator first,
+ InputIterator last,
+ MapIterator map,
+ OutputIterator result)
+ {
+ m_count = iterator_range_size(first, last);
+ m_input_offset = first.get_index();
+ m_output_offset = result.get_index();
+
+ m_input_offset_arg = add_arg<uint_>("input_offset");
+ m_output_offset_arg = add_arg<uint_>("output_offset");
+
+ // Per work item i: i1 = map[i] + output_offset, i2 = i + input_offset,
+ // then result[i1] = first[i2].
+ // NOTE(review): if indexing first/result with an expression already
+ // accounts for the iterator's own index, these offsets would be
+ // applied twice for iterators that do not start at 0 - confirm
+ // against the iterator implementation.
+ *this <<
+ "const uint i = get_global_id(0);\n" <<
+ "uint i1 = " << map[expr<uint_>("i")] <<
+ " + output_offset;\n" <<
+ "uint i2 = i + input_offset;\n" <<
+ result[expr<uint_>("i1")] << "=" <<
+ first[expr<uint_>("i2")] << ";\n";
+ }
+
+ /// Binds the offset arguments and enqueues the kernel on \p queue with
+ /// one work item per input element. Returns a default-constructed event
+ /// without enqueuing anything when the input range is empty.
+ event exec(command_queue &queue)
+ {
+ if(m_count == 0) {
+ return event();
+ }
+
+ set_arg(m_input_offset_arg, uint_(m_input_offset));
+ set_arg(m_output_offset_arg, uint_(m_output_offset));
+
+ return exec_1d(queue, 0, m_count);
+ }
+
+private:
+ size_t m_count; // number of elements in [first, last)
+ size_t m_input_offset; // value of first.get_index()
+ size_t m_input_offset_arg; // argument slot returned by add_arg for "input_offset"
+ size_t m_output_offset; // value of result.get_index()
+ size_t m_output_offset_arg; // argument slot returned by add_arg for "output_offset"
+};
+
+} // end detail namespace
+
+/// Scatters the elements of [\p first, \p last) into the range beginning at
+/// \p result; the destination index of each element is taken from the
+/// corresponding entry of the range beginning at \p map.
+///
+/// \see gather()
+template<class InputIterator, class MapIterator, class OutputIterator>
+inline void scatter(InputIterator first,
+                    InputIterator last,
+                    MapIterator map,
+                    OutputIterator result,
+                    command_queue &queue = system::default_queue())
+{
+    typedef detail::scatter_kernel<
+        InputIterator, MapIterator, OutputIterator
+    > kernel_type;
+
+    // generate the kernel for these iterators, then enqueue it
+    kernel_type scatter_op;
+    scatter_op.set_range(first, last, map, result);
+    scatter_op.exec(queue);
+}
+
+} // end compute namespace
+} // end boost namespace
+
+#endif // BOOST_COMPUTE_ALGORITHM_SCATTER_HPP
diff --git a/boost/compute/algorithm/scatter_if.hpp b/boost/compute/algorithm/scatter_if.hpp
new file mode 100644
index 0000000000..159edd8c86
--- /dev/null
+++ b/boost/compute/algorithm/scatter_if.hpp
@@ -0,0 +1,119 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2015 Jakub Pola <jakub.pola@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#ifndef BOOST_COMPUTE_ALGORITHM_SCATTER_IF_HPP
+#define BOOST_COMPUTE_ALGORITHM_SCATTER_IF_HPP
+
+#include <boost/algorithm/string/replace.hpp>
+
+#include <boost/compute/system.hpp>
+#include <boost/compute/exception.hpp>
+#include <boost/compute/command_queue.hpp>
+#include <boost/compute/iterator/buffer_iterator.hpp>
+#include <boost/compute/type_traits/type_name.hpp>
+#include <boost/compute/detail/iterator_range_size.hpp>
+#include <boost/compute/detail/meta_kernel.hpp>
+
+namespace boost {
+namespace compute {
+namespace detail {
+
+/// Meta-kernel that copies an input element to the output position named by
+/// an index map, but only when a predicate applied to the matching stencil
+/// entry holds.
+///
+/// Usage is two-step: set_range() generates the kernel source, exec()
+/// binds the arguments and enqueues it.
+template<class InputIterator, class MapIterator, class StencilIterator, class OutputIterator, class Predicate>
+class scatter_if_kernel : meta_kernel
+{
+public:
+ scatter_if_kernel() : meta_kernel("scatter_if")
+ {}
+
+ /// Records the element count of [first, last) and the indices reported
+ /// by first.get_index() and result.get_index(), declares the two uint
+ /// kernel arguments "input_offset" and "output_offset", and emits the
+ /// kernel body.
+ ///
+ /// Both InputIterator and OutputIterator must provide get_index().
+ void set_range(InputIterator first,
+ InputIterator last,
+ MapIterator map,
+ StencilIterator stencil,
+ OutputIterator result,
+ Predicate predicate)
+ {
+ m_count = iterator_range_size(first, last);
+ m_input_offset = first.get_index();
+ m_output_offset = result.get_index();
+
+ m_input_offset_arg = add_arg<uint_>("input_offset");
+ m_output_offset_arg = add_arg<uint_>("output_offset");
+
+ // Per work item i: i1 = map[i] + output_offset, i2 = i + input_offset,
+ // then, guarded by predicate(stencil[i]), result[i1] = first[i2].
+ // NOTE(review): if indexing first/result with an expression already
+ // accounts for the iterator's own index, these offsets would be
+ // applied twice for iterators that do not start at 0 - confirm
+ // against the iterator implementation.
+ *this <<
+ "const uint i = get_global_id(0);\n" <<
+ "uint i1 = " << map[expr<uint_>("i")] <<
+ " + output_offset;\n" <<
+ "uint i2 = i + input_offset;\n" <<
+ if_(predicate(stencil[expr<uint_>("i")])) << "\n" <<
+ result[expr<uint_>("i1")] << "=" <<
+ first[expr<uint_>("i2")] << ";\n";
+ }
+
+ /// Binds the offset arguments and enqueues the kernel on \p queue with
+ /// one work item per input element. Returns a default-constructed event
+ /// without enqueuing anything when the input range is empty.
+ event exec(command_queue &queue)
+ {
+ if(m_count == 0) {
+ return event();
+ }
+
+ set_arg(m_input_offset_arg, uint_(m_input_offset));
+ set_arg(m_output_offset_arg, uint_(m_output_offset));
+
+ return exec_1d(queue, 0, m_count);
+ }
+
+private:
+ size_t m_count; // number of elements in [first, last)
+ size_t m_input_offset; // value of first.get_index()
+ size_t m_input_offset_arg; // argument slot returned by add_arg for "input_offset"
+ size_t m_output_offset; // value of result.get_index()
+ size_t m_output_offset_arg; // argument slot returned by add_arg for "output_offset"
+};
+
+} // end detail namespace
+
+/// Conditionally scatters the elements of [\p first, \p last) into the range
+/// beginning at \p result. The destination index of each element comes from
+/// the range beginning at \p map, and an element is written only when
+/// \p predicate applied to the corresponding \p stencil entry holds.
+///
+/// \see scatter()
+template<class InputIterator, class MapIterator, class StencilIterator, class OutputIterator,
+         class Predicate>
+inline void scatter_if(InputIterator first,
+                       InputIterator last,
+                       MapIterator map,
+                       StencilIterator stencil,
+                       OutputIterator result,
+                       Predicate predicate,
+                       command_queue &queue = system::default_queue())
+{
+    typedef detail::scatter_if_kernel<
+        InputIterator, MapIterator, StencilIterator, OutputIterator, Predicate
+    > kernel_type;
+
+    // generate the kernel for these iterators, then enqueue it
+    kernel_type scatter_op;
+    scatter_op.set_range(first, last, map, stencil, result, predicate);
+    scatter_op.exec(queue);
+}
+
+/// Overload of scatter_if() without an explicit predicate: the identity
+/// function over the stencil's value type is used, so each stencil entry
+/// itself decides whether the matching element is written.
+template<class InputIterator, class MapIterator, class StencilIterator, class OutputIterator>
+inline void scatter_if(InputIterator first,
+                       InputIterator last,
+                       MapIterator map,
+                       StencilIterator stencil,
+                       OutputIterator result,
+                       command_queue &queue = system::default_queue())
+{
+    typedef typename std::iterator_traits<StencilIterator>::value_type stencil_type;
+
+    // forward to the predicate overload with identity as the test
+    scatter_if(
+        first, last, map, stencil, result, identity<stencil_type>(), queue
+    );
+}
+
+} // end compute namespace
+} // end boost namespace
+
+#endif // BOOST_COMPUTE_ALGORITHM_SCATTER_IF_HPP
diff --git a/boost/compute/algorithm/search.hpp b/boost/compute/algorithm/search.hpp
new file mode 100644
index 0000000000..3d3d035b3c
--- /dev/null
+++ b/boost/compute/algorithm/search.hpp
@@ -0,0 +1,73 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2014 Roshan <thisisroshansmail@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#ifndef BOOST_COMPUTE_ALGORITHM_SEARCH_HPP
+#define BOOST_COMPUTE_ALGORITHM_SEARCH_HPP
+
+#include <boost/compute/algorithm/detail/search_all.hpp>
+#include <boost/compute/algorithm/find.hpp>
+#include <boost/compute/container/vector.hpp>
+#include <boost/compute/detail/iterator_range_size.hpp>
+#include <boost/compute/detail/meta_kernel.hpp>
+#include <boost/compute/system.hpp>
+
+namespace boost {
+namespace compute {
+
+///
+/// \brief Substring matching algorithm
+///
+/// Searches for the first match of the pattern [p_first, p_last)
+/// in text [t_first, t_last).
+/// \return Iterator pointing to beginning of first occurrence, or
+/// \p t_last if the pattern does not occur (including the case where the
+/// pattern is longer than the text)
+///
+/// \param t_first Iterator pointing to start of text
+/// \param t_last Iterator pointing to end of text
+/// \param p_first Iterator pointing to start of pattern
+/// \param p_last Iterator pointing to end of pattern
+/// \param queue Queue on which to execute
+///
+template<class TextIterator, class PatternIterator>
+inline TextIterator search(TextIterator t_first,
+                           TextIterator t_last,
+                           PatternIterator p_first,
+                           PatternIterator p_last,
+                           command_queue &queue = system::default_queue())
+{
+    const size_t text_size = detail::iterator_range_size(t_first, t_last);
+    const size_t pattern_size = detail::iterator_range_size(p_first, p_last);
+
+    // A pattern longer than the text can never match. Returning here also
+    // keeps the unsigned subtraction below from wrapping around, which would
+    // otherwise request an enormous device allocation.
+    if(pattern_size > text_size)
+        return t_last;
+
+    // there is no need to check if pattern starts at last n - 1 indices
+    vector<uint_> matching_indices(
+        text_size - pattern_size + 1,
+        queue.get_context()
+    );
+
+    // search_kernel puts value 1 at every index in vector where pattern starts at
+    detail::search_kernel<PatternIterator,
+                          TextIterator,
+                          vector<uint_>::iterator> kernel;
+
+    kernel.set_range(p_first, p_last, t_first, t_last, matching_indices.begin());
+    kernel.exec(queue);
+
+    // the first flagged index is the first occurrence of the pattern
+    vector<uint_>::iterator index = ::boost::compute::find(
+        matching_indices.begin(), matching_indices.end(), uint_(1), queue
+    );
+
+    // pattern was not found
+    if(index == matching_indices.end())
+        return t_last;
+
+    return t_first + detail::iterator_range_size(matching_indices.begin(), index);
+}
+
+} //end compute namespace
+} //end boost namespace
+
+#endif // BOOST_COMPUTE_ALGORITHM_SEARCH_HPP
diff --git a/boost/compute/algorithm/search_n.hpp b/boost/compute/algorithm/search_n.hpp
new file mode 100644
index 0000000000..9e03111bb0
--- /dev/null
+++ b/boost/compute/algorithm/search_n.hpp
@@ -0,0 +1,140 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2014 Roshan <thisisroshansmail@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_SEARCH_N_HPP
+#define BOOST_COMPUTE_ALGORITHM_DETAIL_SEARCH_N_HPP
+
+#include <iterator>
+
+#include <boost/compute/algorithm/find.hpp>
+#include <boost/compute/container/vector.hpp>
+#include <boost/compute/detail/iterator_range_size.hpp>
+#include <boost/compute/detail/meta_kernel.hpp>
+#include <boost/compute/system.hpp>
+
+namespace boost {
+namespace compute {
+namespace detail {
+
+///
+/// \brief Search kernel class
+///
+/// Subclass of meta_kernel which is capable of performing search_n. For
+/// every candidate start index it writes 1 to the output when the next
+/// \c n text elements all equal the searched value, and 0 otherwise.
+///
+template<class TextIterator, class OutputIterator>
+class search_n_kernel : public meta_kernel
+{
+public:
+    typedef typename std::iterator_traits<TextIterator>::value_type value_type;
+
+    search_n_kernel() : meta_kernel("search_n")
+    {}
+
+    /// Stores the search parameters, declares the kernel arguments "n" and
+    /// "value", computes the number of candidate start indices and emits
+    /// the kernel source.
+    ///
+    /// \param t_first start of the text
+    /// \param t_last end of the text
+    /// \param value value that has to repeat
+    /// \param n required number of consecutive repetitions
+    /// \param result output range receiving one 0/1 flag per start index
+    void set_range(TextIterator t_first,
+                   TextIterator t_last,
+                   value_type value,
+                   size_t n,
+                   OutputIterator result)
+    {
+        m_n = n;
+        m_n_arg = add_arg<uint_>("n");
+
+        m_value = value;
+        m_value_arg = add_arg<value_type>("value");
+
+        // One work item per index at which a run of n elements still fits in
+        // the text. When n exceeds the text length there is no such index;
+        // clamp to 0 so the unsigned subtraction cannot wrap around and
+        // launch a huge, out-of-bounds range.
+        const size_t text_size = iterator_range_size(t_first, t_last);
+        m_count = (m_n <= text_size) ? text_size + 1 - m_n : 0;
+
+        // j ends at exactly n only when all n comparisons succeeded; a
+        // mismatch pushes j past n, which also terminates the loop.
+        *this <<
+            "uint i = get_global_id(0);\n" <<
+            "uint i1 = i;\n" <<
+            "uint j;\n" <<
+            "for(j = 0; j<n; j++,i++)\n" <<
+            "{\n" <<
+            " if(value != " << t_first[expr<uint_>("i")] << ")\n" <<
+            " j = n + 1;\n" <<
+            "}\n" <<
+            "if(j == n)\n" <<
+            result[expr<uint_>("i1")] << " = 1;\n" <<
+            "else\n" <<
+            result[expr<uint_>("i1")] << " = 0;\n";
+    }
+
+    /// Binds "n" and "value" and enqueues the kernel on \p queue. Returns a
+    /// default-constructed event without enqueuing anything when there is
+    /// no candidate start index.
+    event exec(command_queue &queue)
+    {
+        if(m_count == 0) {
+            return event();
+        }
+
+        set_arg(m_n_arg, uint_(m_n));
+        set_arg(m_value_arg, m_value);
+
+        return exec_1d(queue, 0, m_count);
+    }
+
+private:
+    size_t m_n;          // required run length
+    size_t m_n_arg;      // argument slot returned by add_arg for "n"
+    size_t m_count;      // number of candidate start indices (work items)
+    value_type m_value;  // value that has to repeat
+    size_t m_value_arg;  // argument slot returned by add_arg for "value"
+};
+
+} //end detail namespace
+
+///
+/// \brief Consecutive-value search algorithm
+///
+/// Searches for the first occurrence of n consecutive occurrences of
+/// value in text [t_first, t_last).
+/// \return Iterator pointing to beginning of first occurrence, or
+/// \p t_last if there is none (including the case where \p n is larger
+/// than the text)
+///
+/// \param t_first Iterator pointing to start of text
+/// \param t_last Iterator pointing to end of text
+/// \param n Number of times value repeats
+/// \param value Value which repeats
+/// \param queue Queue on which to execute
+///
+template<class TextIterator, class ValueType>
+inline TextIterator search_n(TextIterator t_first,
+                             TextIterator t_last,
+                             size_t n,
+                             ValueType value,
+                             command_queue &queue = system::default_queue())
+{
+    const size_t text_size = detail::iterator_range_size(t_first, t_last);
+
+    // A run longer than the text cannot exist. Returning here also keeps
+    // the unsigned subtraction below from wrapping around, which would
+    // otherwise request an enormous device allocation.
+    if(n > text_size)
+        return t_last;
+
+    // there is no need to check if pattern starts at last n - 1 indices
+    vector<uint_> matching_indices(
+        text_size + 1 - n,
+        queue.get_context()
+    );
+
+    // search_n_kernel puts value 1 at every index in vector where pattern
+    // of n values starts at
+    detail::search_n_kernel<TextIterator,
+                            vector<uint_>::iterator> kernel;
+
+    kernel.set_range(t_first, t_last, value, n, matching_indices.begin());
+    kernel.exec(queue);
+
+    // the first flagged index is the start of the first run
+    vector<uint_>::iterator index = ::boost::compute::find(
+        matching_indices.begin(), matching_indices.end(), uint_(1), queue
+    );
+
+    // pattern was not found
+    if(index == matching_indices.end())
+        return t_last;
+
+    return t_first + detail::iterator_range_size(matching_indices.begin(), index);
+}
+
+} //end compute namespace
+} //end boost namespace
+
+#endif // BOOST_COMPUTE_ALGORITHM_DETAIL_SEARCH_N_HPP
diff --git a/boost/compute/algorithm/set_difference.hpp b/boost/compute/algorithm/set_difference.hpp
new file mode 100644
index 0000000000..17ce7bd3f6
--- /dev/null
+++ b/boost/compute/algorithm/set_difference.hpp
@@ -0,0 +1,182 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2014 Roshan <thisisroshansmail@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#ifndef BOOST_COMPUTE_ALGORITHM_SET_DIFFERENCE_HPP
+#define BOOST_COMPUTE_ALGORITHM_SET_DIFFERENCE_HPP
+
+#include <iterator>
+
+#include <boost/compute/algorithm/detail/compact.hpp>
+#include <boost/compute/algorithm/detail/balanced_path.hpp>
+#include <boost/compute/algorithm/exclusive_scan.hpp>
+#include <boost/compute/algorithm/fill_n.hpp>
+#include <boost/compute/container/vector.hpp>
+#include <boost/compute/detail/iterator_range_size.hpp>
+#include <boost/compute/detail/meta_kernel.hpp>
+#include <boost/compute/system.hpp>
+
+namespace boost {
+namespace compute {
+namespace detail {
+
+///
+/// \brief Serial set difference kernel class
+///
+/// Subclass of meta_kernel to perform serial set difference after tiling.
+/// Each work item handles one tile: it walks its slice of both inputs
+/// sequentially, writes the elements that occur only in the first input
+/// into its own slot of the output, and records how many it wrote.
+///
+class serial_set_difference_kernel : meta_kernel
+{
+public:
+ // Stride between the output slots of consecutive work items; baked into
+ // the kernel source by set_range(), so it must be set before that call.
+ unsigned int tile_size;
+
+ serial_set_difference_kernel() : meta_kernel("set_difference")
+ {
+ tile_size = 4;
+ }
+
+ /// Emits the kernel source.
+ ///
+ /// \param first1 start of the first input
+ /// \param first2 start of the second input
+ /// \param tile_first1 start of the tile boundaries into the first input
+ /// \param tile_last1 end of the tile boundaries into the first input
+ /// \param tile_first2 start of the tile boundaries into the second input
+ /// \param result output; work item i writes from index i*tile_size
+ /// \param counts output; counts[i] is the number of elements written by
+ /// work item i
+ template<class InputIterator1, class InputIterator2,
+ class InputIterator3, class InputIterator4,
+ class OutputIterator1, class OutputIterator2>
+ void set_range(InputIterator1 first1,
+ InputIterator2 first2,
+ InputIterator3 tile_first1,
+ InputIterator3 tile_last1,
+ InputIterator4 tile_first2,
+ OutputIterator1 result,
+ OutputIterator2 counts)
+ {
+ // one work item per pair of adjacent tile boundaries
+ // (assumes the boundary range is not empty)
+ m_count = iterator_range_size(tile_first1, tile_last1) - 1;
+
+ // Merge-style walk over [start1, end1) and [start2, end2): equal
+ // elements are skipped in both inputs, a smaller element of the first
+ // input is emitted, a smaller element of the second input is skipped.
+ // Whatever is left of the first input afterwards is emitted as well.
+ *this <<
+ "uint i = get_global_id(0);\n" <<
+ "uint start1 = " << tile_first1[expr<uint_>("i")] << ";\n" <<
+ "uint end1 = " << tile_first1[expr<uint_>("i+1")] << ";\n" <<
+ "uint start2 = " << tile_first2[expr<uint_>("i")] << ";\n" <<
+ "uint end2 = " << tile_first2[expr<uint_>("i+1")] << ";\n" <<
+ "uint index = i*" << tile_size << ";\n" <<
+ "uint count = 0;\n" <<
+ "while(start1<end1 && start2<end2)\n" <<
+ "{\n" <<
+ " if(" << first1[expr<uint_>("start1")] << " == " <<
+ first2[expr<uint_>("start2")] << ")\n" <<
+ " {\n" <<
+ " start1++; start2++;\n" <<
+ " }\n" <<
+ " else if(" << first1[expr<uint_>("start1")] << " < " <<
+ first2[expr<uint_>("start2")] << ")\n" <<
+ " {\n" <<
+ result[expr<uint_>("index")] <<
+ " = " << first1[expr<uint_>("start1")] << ";\n" <<
+ " index++; count++;\n" <<
+ " start1++;\n" <<
+ " }\n" <<
+ " else\n" <<
+ " {\n" <<
+ " start2++;\n" <<
+ " }\n" <<
+ "}\n" <<
+ "while(start1<end1)\n" <<
+ "{\n" <<
+ result[expr<uint_>("index")] <<
+ " = " << first1[expr<uint_>("start1")] << ";\n" <<
+ " index++; count++;\n" <<
+ " start1++;\n" <<
+ "}\n" <<
+ counts[expr<uint_>("i")] << " = count;\n";
+ }
+
+ /// Enqueues the kernel on \p queue with one work item per tile. Returns
+ /// a default-constructed event without enqueuing anything when there
+ /// are no tiles.
+ event exec(command_queue &queue)
+ {
+ if(m_count == 0) {
+ return event();
+ }
+
+ return exec_1d(queue, 0, m_count);
+ }
+
+private:
+ size_t m_count; // number of tiles, i.e. work items to launch
+};
+
+} //end detail namespace
+
+///
+/// \brief Set difference algorithm
+///
+/// Computes the elements of the sorted range [first1, last1) that are not
+/// present in the sorted range [first2, last2) and writes them to the range
+/// starting at result
+/// \return Iterator pointing to end of difference
+///
+/// \param first1 Iterator pointing to start of first set
+/// \param last1 Iterator pointing to end of first set
+/// \param first2 Iterator pointing to start of second set
+/// \param last2 Iterator pointing to end of second set
+/// \param result Iterator pointing to start of range in which the difference
+/// will be stored
+/// \param queue Queue on which to execute
+///
+template<class InputIterator1, class InputIterator2, class OutputIterator>
+inline OutputIterator set_difference(InputIterator1 first1,
+                                     InputIterator1 last1,
+                                     InputIterator2 first2,
+                                     InputIterator2 last2,
+                                     OutputIterator result,
+                                     command_queue &queue = system::default_queue())
+{
+    typedef typename std::iterator_traits<InputIterator1>::value_type value_type;
+
+    int tile_size = 1024;
+
+    int size_a = detail::iterator_range_size(first1, last1);
+    int size_b = detail::iterator_range_size(first2, last2);
+
+    // number of tile boundaries: one per tile plus the leading one
+    int boundary_count = (size_a+size_b+tile_size-1)/tile_size+1;
+
+    vector<uint_> bounds_a(boundary_count, queue.get_context());
+    vector<uint_> bounds_b(boundary_count, queue.get_context());
+
+    // Step 1: split both inputs into tiles; the first boundary is 0 and the
+    // last one is the size of the respective input
+    detail::balanced_path_kernel partitioner;
+    partitioner.tile_size = tile_size;
+    partitioner.set_range(first1, last1, first2, last2,
+                          bounds_a.begin()+1, bounds_b.begin()+1);
+    fill_n(bounds_a.begin(), 1, 0, queue);
+    fill_n(bounds_b.begin(), 1, 0, queue);
+    partitioner.exec(queue);
+
+    fill_n(bounds_a.end()-1, 1, size_a, queue);
+    fill_n(bounds_b.end()-1, 1, size_b, queue);
+
+    vector<value_type> scratch(size_a+size_b, queue.get_context());
+    vector<uint_> tile_counts(boundary_count, queue.get_context());
+    fill_n(tile_counts.end()-1, 1, 0, queue);
+
+    // Step 2: every tile computes its own difference into scratch
+    detail::serial_set_difference_kernel per_tile_kernel;
+    per_tile_kernel.tile_size = tile_size;
+    per_tile_kernel.set_range(first1, first2, bounds_a.begin(), bounds_a.end(),
+                              bounds_b.begin(), scratch.begin(),
+                              tile_counts.begin());
+
+    per_tile_kernel.exec(queue);
+
+    // Step 3: turn the per-tile counts into output offsets
+    exclusive_scan(tile_counts.begin(), tile_counts.end(),
+                   tile_counts.begin(), queue);
+
+    // Step 4: pack the per-tile results contiguously into result
+    detail::compact_kernel packer;
+    packer.tile_size = tile_size;
+    packer.set_range(scratch.begin(), tile_counts.begin(),
+                     tile_counts.end(), result);
+
+    packer.exec(queue);
+
+    // after the scan the last entry holds the total number of elements
+    return result + (tile_counts.end() - 1).read(queue);
+}
+
+} //end compute namespace
+} //end boost namespace
+
+#endif // BOOST_COMPUTE_ALGORITHM_SET_DIFFERENCE_HPP
diff --git a/boost/compute/algorithm/set_intersection.hpp b/boost/compute/algorithm/set_intersection.hpp
new file mode 100644
index 0000000000..50f291e84a
--- /dev/null
+++ b/boost/compute/algorithm/set_intersection.hpp
@@ -0,0 +1,170 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2014 Roshan <thisisroshansmail@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#ifndef BOOST_COMPUTE_ALGORITHM_SET_INTERSECTION_HPP
+#define BOOST_COMPUTE_ALGORITHM_SET_INTERSECTION_HPP
+
+#include <iterator>
+
+#include <boost/compute/algorithm/detail/compact.hpp>
+#include <boost/compute/algorithm/detail/balanced_path.hpp>
+#include <boost/compute/algorithm/exclusive_scan.hpp>
+#include <boost/compute/algorithm/fill_n.hpp>
+#include <boost/compute/container/vector.hpp>
+#include <boost/compute/detail/iterator_range_size.hpp>
+#include <boost/compute/detail/meta_kernel.hpp>
+#include <boost/compute/system.hpp>
+
+namespace boost {
+namespace compute {
+namespace detail {
+
+///
+/// \brief Serial set intersection kernel class
+///
+/// Subclass of meta_kernel to perform serial set intersection after tiling.
+/// Each work item handles one tile: it walks its slice of both inputs
+/// sequentially, writes the elements found in both into its own slot of
+/// the output, and records how many it wrote.
+///
+class serial_set_intersection_kernel : meta_kernel
+{
+public:
+ // Stride between the output slots of consecutive work items; baked into
+ // the kernel source by set_range(), so it must be set before that call.
+ unsigned int tile_size;
+
+ serial_set_intersection_kernel() : meta_kernel("set_intersection")
+ {
+ tile_size = 4;
+ }
+
+ /// Emits the kernel source.
+ ///
+ /// \param first1 start of the first input
+ /// \param first2 start of the second input
+ /// \param tile_first1 start of the tile boundaries into the first input
+ /// \param tile_last1 end of the tile boundaries into the first input
+ /// \param tile_first2 start of the tile boundaries into the second input
+ /// \param result output; work item i writes from index i*tile_size
+ /// \param counts output; counts[i] is the number of elements written by
+ /// work item i
+ template<class InputIterator1, class InputIterator2,
+ class InputIterator3, class InputIterator4,
+ class OutputIterator1, class OutputIterator2>
+ void set_range(InputIterator1 first1,
+ InputIterator2 first2,
+ InputIterator3 tile_first1,
+ InputIterator3 tile_last1,
+ InputIterator4 tile_first2,
+ OutputIterator1 result,
+ OutputIterator2 counts)
+ {
+ // one work item per pair of adjacent tile boundaries
+ // (assumes the boundary range is not empty)
+ m_count = iterator_range_size(tile_first1, tile_last1) - 1;
+
+ // Merge-style walk over [start1, end1) and [start2, end2): equal
+ // elements are emitted once and both inputs advance; otherwise the
+ // input holding the smaller element advances.
+ *this <<
+ "uint i = get_global_id(0);\n" <<
+ "uint start1 = " << tile_first1[expr<uint_>("i")] << ";\n" <<
+ "uint end1 = " << tile_first1[expr<uint_>("i+1")] << ";\n" <<
+ "uint start2 = " << tile_first2[expr<uint_>("i")] << ";\n" <<
+ "uint end2 = " << tile_first2[expr<uint_>("i+1")] << ";\n" <<
+ "uint index = i*" << tile_size << ";\n" <<
+ "uint count = 0;\n" <<
+ "while(start1<end1 && start2<end2)\n" <<
+ "{\n" <<
+ " if(" << first1[expr<uint_>("start1")] << " == " <<
+ first2[expr<uint_>("start2")] << ")\n" <<
+ " {\n" <<
+ result[expr<uint_>("index")] <<
+ " = " << first1[expr<uint_>("start1")] << ";\n" <<
+ " index++; count++;\n" <<
+ " start1++; start2++;\n" <<
+ " }\n" <<
+ " else if(" << first1[expr<uint_>("start1")] << " < " <<
+ first2[expr<uint_>("start2")] << ")\n" <<
+ " start1++;\n" <<
+ " else start2++;\n" <<
+ "}\n" <<
+ counts[expr<uint_>("i")] << " = count;\n";
+ }
+
+ /// Enqueues the kernel on \p queue with one work item per tile. Returns
+ /// a default-constructed event without enqueuing anything when there
+ /// are no tiles.
+ event exec(command_queue &queue)
+ {
+ if(m_count == 0) {
+ return event();
+ }
+
+ return exec_1d(queue, 0, m_count);
+ }
+
+private:
+ size_t m_count; // number of tiles, i.e. work items to launch
+};
+
+} //end detail namespace
+
+///
+/// \brief Set intersection algorithm
+///
+/// Computes the elements common to the sorted range [first1, last1) and the
+/// sorted range [first2, last2) and writes them to the range starting at
+/// result
+/// \return Iterator pointing to end of intersection
+///
+/// \param first1 Iterator pointing to start of first set
+/// \param last1 Iterator pointing to end of first set
+/// \param first2 Iterator pointing to start of second set
+/// \param last2 Iterator pointing to end of second set
+/// \param result Iterator pointing to start of range in which the intersection
+/// will be stored
+/// \param queue Queue on which to execute
+///
+template<class InputIterator1, class InputIterator2, class OutputIterator>
+inline OutputIterator set_intersection(InputIterator1 first1,
+                                       InputIterator1 last1,
+                                       InputIterator2 first2,
+                                       InputIterator2 last2,
+                                       OutputIterator result,
+                                       command_queue &queue = system::default_queue())
+{
+    typedef typename std::iterator_traits<InputIterator1>::value_type value_type;
+
+    int tile_size = 1024;
+
+    int size_a = detail::iterator_range_size(first1, last1);
+    int size_b = detail::iterator_range_size(first2, last2);
+
+    // number of tile boundaries: one per tile plus the leading one
+    int boundary_count = (size_a+size_b+tile_size-1)/tile_size+1;
+
+    vector<uint_> bounds_a(boundary_count, queue.get_context());
+    vector<uint_> bounds_b(boundary_count, queue.get_context());
+
+    // Step 1: split both inputs into tiles; the first boundary is 0 and the
+    // last one is the size of the respective input
+    detail::balanced_path_kernel partitioner;
+    partitioner.tile_size = tile_size;
+    partitioner.set_range(first1, last1, first2, last2,
+                          bounds_a.begin()+1, bounds_b.begin()+1);
+    fill_n(bounds_a.begin(), 1, 0, queue);
+    fill_n(bounds_b.begin(), 1, 0, queue);
+    partitioner.exec(queue);
+
+    fill_n(bounds_a.end()-1, 1, size_a, queue);
+    fill_n(bounds_b.end()-1, 1, size_b, queue);
+
+    vector<value_type> scratch(size_a+size_b, queue.get_context());
+    vector<uint_> tile_counts(boundary_count, queue.get_context());
+    fill_n(tile_counts.end()-1, 1, 0, queue);
+
+    // Step 2: every tile computes its own intersection into scratch
+    detail::serial_set_intersection_kernel per_tile_kernel;
+    per_tile_kernel.tile_size = tile_size;
+    per_tile_kernel.set_range(first1, first2, bounds_a.begin(), bounds_a.end(),
+                              bounds_b.begin(), scratch.begin(),
+                              tile_counts.begin());
+
+    per_tile_kernel.exec(queue);
+
+    // Step 3: turn the per-tile counts into output offsets
+    exclusive_scan(tile_counts.begin(), tile_counts.end(),
+                   tile_counts.begin(), queue);
+
+    // Step 4: pack the per-tile results contiguously into result
+    detail::compact_kernel packer;
+    packer.tile_size = tile_size;
+    packer.set_range(scratch.begin(), tile_counts.begin(),
+                     tile_counts.end(), result);
+
+    packer.exec(queue);
+
+    // after the scan the last entry holds the total number of elements
+    return result + (tile_counts.end() - 1).read(queue);
+}
+
+} //end compute namespace
+} //end boost namespace
+
+#endif // BOOST_COMPUTE_ALGORITHM_SET_INTERSECTION_HPP
diff --git a/boost/compute/algorithm/set_symmetric_difference.hpp b/boost/compute/algorithm/set_symmetric_difference.hpp
new file mode 100644
index 0000000000..6e60b38511
--- /dev/null
+++ b/boost/compute/algorithm/set_symmetric_difference.hpp
@@ -0,0 +1,194 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2014 Roshan <thisisroshansmail@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#ifndef BOOST_COMPUTE_ALGORITHM_SET_SYMMETRIC_DIFFERENCE_HPP
+#define BOOST_COMPUTE_ALGORITHM_SET_SYMMETRIC_DIFFERENCE_HPP
+
+#include <iterator>
+
+#include <boost/compute/algorithm/detail/compact.hpp>
+#include <boost/compute/algorithm/detail/balanced_path.hpp>
+#include <boost/compute/algorithm/exclusive_scan.hpp>
+#include <boost/compute/algorithm/fill_n.hpp>
+#include <boost/compute/container/vector.hpp>
+#include <boost/compute/detail/iterator_range_size.hpp>
+#include <boost/compute/detail/meta_kernel.hpp>
+#include <boost/compute/system.hpp>
+
+namespace boost {
+namespace compute {
+namespace detail {
+
+///
+/// \brief Serial set symmetric difference kernel class
+///
+/// Subclass of meta_kernel to perform serial set symmetric
+/// difference after tiling. Each work item handles one tile: it walks its
+/// slice of both inputs sequentially, writes the elements that occur in
+/// only one of them into its own slot of the output, and records how many
+/// it wrote.
+///
+class serial_set_symmetric_difference_kernel : meta_kernel
+{
+public:
+ // Stride between the output slots of consecutive work items; baked into
+ // the kernel source by set_range(), so it must be set before that call.
+ unsigned int tile_size;
+
+ serial_set_symmetric_difference_kernel() : meta_kernel("set_symmetric_difference")
+ {
+ tile_size = 4;
+ }
+
+ /// Emits the kernel source.
+ ///
+ /// \param first1 start of the first input
+ /// \param first2 start of the second input
+ /// \param tile_first1 start of the tile boundaries into the first input
+ /// \param tile_last1 end of the tile boundaries into the first input
+ /// \param tile_first2 start of the tile boundaries into the second input
+ /// \param result output; work item i writes from index i*tile_size
+ /// \param counts output; counts[i] is the number of elements written by
+ /// work item i
+ template<class InputIterator1, class InputIterator2,
+ class InputIterator3, class InputIterator4,
+ class OutputIterator1, class OutputIterator2>
+ void set_range(InputIterator1 first1,
+ InputIterator2 first2,
+ InputIterator3 tile_first1,
+ InputIterator3 tile_last1,
+ InputIterator4 tile_first2,
+ OutputIterator1 result,
+ OutputIterator2 counts)
+ {
+ // one work item per pair of adjacent tile boundaries
+ // (assumes the boundary range is not empty)
+ m_count = iterator_range_size(tile_first1, tile_last1) - 1;
+
+ // Merge-style walk over [start1, end1) and [start2, end2): equal
+ // elements are skipped in both inputs, otherwise the smaller element
+ // is emitted and its input advances. Whatever is left of either input
+ // afterwards is emitted as well.
+ *this <<
+ "uint i = get_global_id(0);\n" <<
+ "uint start1 = " << tile_first1[expr<uint_>("i")] << ";\n" <<
+ "uint end1 = " << tile_first1[expr<uint_>("i+1")] << ";\n" <<
+ "uint start2 = " << tile_first2[expr<uint_>("i")] << ";\n" <<
+ "uint end2 = " << tile_first2[expr<uint_>("i+1")] << ";\n" <<
+ "uint index = i*" << tile_size << ";\n" <<
+ "uint count = 0;\n" <<
+ "while(start1<end1 && start2<end2)\n" <<
+ "{\n" <<
+ " if(" << first1[expr<uint_>("start1")] << " == " <<
+ first2[expr<uint_>("start2")] << ")\n" <<
+ " {\n" <<
+ " start1++; start2++;\n" <<
+ " }\n" <<
+ " else if(" << first1[expr<uint_>("start1")] << " < " <<
+ first2[expr<uint_>("start2")] << ")\n" <<
+ " {\n" <<
+ result[expr<uint_>("index")] <<
+ " = " << first1[expr<uint_>("start1")] << ";\n" <<
+ " index++; count++;\n" <<
+ " start1++;\n" <<
+ " }\n" <<
+ " else\n" <<
+ " {\n" <<
+ result[expr<uint_>("index")] <<
+ " = " << first2[expr<uint_>("start2")] << ";\n" <<
+ " index++; count++;\n" <<
+ " start2++;\n" <<
+ " }\n" <<
+ "}\n" <<
+ "while(start1<end1)\n" <<
+ "{\n" <<
+ result[expr<uint_>("index")] <<
+ " = " << first1[expr<uint_>("start1")] << ";\n" <<
+ " index++; count++;\n" <<
+ " start1++;\n" <<
+ "}\n" <<
+ "while(start2<end2)\n" <<
+ "{\n" <<
+ result[expr<uint_>("index")] <<
+ " = " << first2[expr<uint_>("start2")] << ";\n" <<
+ " index++; count++;\n" <<
+ " start2++;\n" <<
+ "}\n" <<
+ counts[expr<uint_>("i")] << " = count;\n";
+ }
+
+ /// Enqueues the kernel on \p queue with one work item per tile. Returns
+ /// a default-constructed event without enqueuing anything when there
+ /// are no tiles.
+ event exec(command_queue &queue)
+ {
+ if(m_count == 0) {
+ return event();
+ }
+
+ return exec_1d(queue, 0, m_count);
+ }
+
+private:
+ size_t m_count; // number of tiles, i.e. work items to launch
+};
+
+} //end detail namespace
+
+///
+/// \brief Set symmetric difference algorithm
+///
+/// Computes the elements that occur in exactly one of the sorted ranges
+/// [first1, last1) and [first2, last2) and writes them to the range
+/// starting at result
+/// \return Iterator pointing to end of symmetric difference
+///
+/// \param first1 Iterator pointing to start of first set
+/// \param last1 Iterator pointing to end of first set
+/// \param first2 Iterator pointing to start of second set
+/// \param last2 Iterator pointing to end of second set
+/// \param result Iterator pointing to start of range in which the symmetric
+/// difference will be stored
+/// \param queue Queue on which to execute
+///
+template<class InputIterator1, class InputIterator2, class OutputIterator>
+inline OutputIterator set_symmetric_difference(InputIterator1 first1,
+                                               InputIterator1 last1,
+                                               InputIterator2 first2,
+                                               InputIterator2 last2,
+                                               OutputIterator result,
+                                               command_queue &queue = system::default_queue())
+{
+    typedef typename std::iterator_traits<InputIterator1>::value_type value_type;
+
+    int tile_size = 1024;
+
+    int size_a = detail::iterator_range_size(first1, last1);
+    int size_b = detail::iterator_range_size(first2, last2);
+
+    // number of tile boundaries: one per tile plus the leading one
+    int boundary_count = (size_a+size_b+tile_size-1)/tile_size+1;
+
+    vector<uint_> bounds_a(boundary_count, queue.get_context());
+    vector<uint_> bounds_b(boundary_count, queue.get_context());
+
+    // Step 1: split both inputs into tiles; the first boundary is 0 and the
+    // last one is the size of the respective input
+    detail::balanced_path_kernel partitioner;
+    partitioner.tile_size = tile_size;
+    partitioner.set_range(first1, last1, first2, last2,
+                          bounds_a.begin()+1, bounds_b.begin()+1);
+    fill_n(bounds_a.begin(), 1, 0, queue);
+    fill_n(bounds_b.begin(), 1, 0, queue);
+    partitioner.exec(queue);
+
+    fill_n(bounds_a.end()-1, 1, size_a, queue);
+    fill_n(bounds_b.end()-1, 1, size_b, queue);
+
+    vector<value_type> scratch(size_a+size_b, queue.get_context());
+    vector<uint_> tile_counts(boundary_count, queue.get_context());
+    fill_n(tile_counts.end()-1, 1, 0, queue);
+
+    // Step 2: every tile computes its own symmetric difference into scratch
+    detail::serial_set_symmetric_difference_kernel per_tile_kernel;
+    per_tile_kernel.tile_size = tile_size;
+    per_tile_kernel.set_range(first1, first2, bounds_a.begin(), bounds_a.end(),
+                              bounds_b.begin(), scratch.begin(),
+                              tile_counts.begin());
+
+    per_tile_kernel.exec(queue);
+
+    // Step 3: turn the per-tile counts into output offsets
+    exclusive_scan(tile_counts.begin(), tile_counts.end(),
+                   tile_counts.begin(), queue);
+
+    // Step 4: pack the per-tile results contiguously into result
+    detail::compact_kernel packer;
+    packer.tile_size = tile_size;
+    packer.set_range(scratch.begin(), tile_counts.begin(),
+                     tile_counts.end(), result);
+
+    packer.exec(queue);
+
+    // after the scan the last entry holds the total number of elements
+    return result + (tile_counts.end() - 1).read(queue);
+}
+
+} //end compute namespace
+} //end boost namespace
+
+#endif // BOOST_COMPUTE_ALGORITHM_SET_SYMMETRIC_DIFFERENCE_HPP
diff --git a/boost/compute/algorithm/set_union.hpp b/boost/compute/algorithm/set_union.hpp
new file mode 100644
index 0000000000..c61f7b29b3
--- /dev/null
+++ b/boost/compute/algorithm/set_union.hpp
@@ -0,0 +1,195 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2014 Roshan <thisisroshansmail@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#ifndef BOOST_COMPUTE_ALGORITHM_SET_UNION_HPP
+#define BOOST_COMPUTE_ALGORITHM_SET_UNION_HPP
+
+#include <iterator>
+
+#include <boost/compute/algorithm/detail/balanced_path.hpp>
+#include <boost/compute/algorithm/detail/compact.hpp>
+#include <boost/compute/algorithm/exclusive_scan.hpp>
+#include <boost/compute/algorithm/fill_n.hpp>
+#include <boost/compute/container/vector.hpp>
+#include <boost/compute/detail/iterator_range_size.hpp>
+#include <boost/compute/detail/meta_kernel.hpp>
+#include <boost/compute/system.hpp>
+
+namespace boost {
+namespace compute {
+namespace detail {
+
+///
+/// \brief Serial set union kernel class
+///
+/// Subclass of meta_kernel to perform serial set union after tiling.
+///
+/// Each work-item handles one tile: it merges the sub-range of the first
+/// set delimited by two consecutive tile boundaries with the matching
+/// sub-range of the second set, writes the merged values to \c result
+/// starting at \c i*tile_size (elements that compare equal are written
+/// once) and stores the number of values written in \c counts[i].
+///
+class serial_set_union_kernel : meta_kernel
+{
+public:
+    /// Number of output slots reserved per tile. It is streamed into the
+    /// kernel source by set_range(), so it has to be assigned before
+    /// set_range() is called.
+    unsigned int tile_size;
+
+    /// Creates the kernel with a default tile size of 4 and no work to do
+    /// until set_range() has been called.
+    serial_set_union_kernel() : meta_kernel("set_union"), m_count(0)
+    {
+        tile_size = 4;
+    }
+
+    /// Builds the kernel source.
+    ///
+    /// \param first1 start of the first sorted set
+    /// \param first2 start of the second sorted set
+    /// \param tile_first1 start of the tile boundaries into the first set
+    /// \param tile_last1 end of the tile boundaries into the first set
+    /// \param tile_first2 start of the tile boundaries into the second set
+    /// \param result start of the per-tile output storage
+    /// \param counts start of the per-tile output counts
+    template<class InputIterator1, class InputIterator2,
+             class InputIterator3, class InputIterator4,
+             class OutputIterator1, class OutputIterator2>
+    void set_range(InputIterator1 first1,
+                   InputIterator2 first2,
+                   InputIterator3 tile_first1,
+                   InputIterator3 tile_last1,
+                   InputIterator4 tile_first2,
+                   OutputIterator1 result,
+                   OutputIterator2 counts)
+    {
+        // N boundaries delimit N-1 tiles; an empty boundary range must not
+        // wrap around to a huge unsigned work size
+        const size_t boundaries = iterator_range_size(tile_first1, tile_last1);
+        m_count = boundaries > 0 ? boundaries - 1 : 0;
+
+        *this <<
+        "uint i = get_global_id(0);\n" <<
+        "uint start1 = " << tile_first1[expr<uint_>("i")] << ";\n" <<
+        "uint end1 = " << tile_first1[expr<uint_>("i+1")] << ";\n" <<
+        "uint start2 = " << tile_first2[expr<uint_>("i")] << ";\n" <<
+        "uint end2 = " << tile_first2[expr<uint_>("i+1")] << ";\n" <<
+        "uint index = i*" << tile_size << ";\n" <<
+        "uint count = 0;\n" <<
+        // merge while both sub-ranges still have elements
+        "while(start1<end1 && start2<end2)\n" <<
+        "{\n" <<
+        " if(" << first1[expr<uint_>("start1")] << " == " <<
+        first2[expr<uint_>("start2")] << ")\n" <<
+        " {\n" <<
+        result[expr<uint_>("index")] <<
+        " = " << first1[expr<uint_>("start1")] << ";\n" <<
+        " index++; count++;\n" <<
+        " start1++; start2++;\n" <<
+        " }\n" <<
+        " else if(" << first1[expr<uint_>("start1")] << " < " <<
+        first2[expr<uint_>("start2")] << ")\n" <<
+        " {\n" <<
+        result[expr<uint_>("index")] <<
+        " = " << first1[expr<uint_>("start1")] << ";\n" <<
+        " index++; count++;\n" <<
+        " start1++;\n" <<
+        " }\n" <<
+        " else\n" <<
+        " {\n" <<
+        result[expr<uint_>("index")] <<
+        " = " << first2[expr<uint_>("start2")] << ";\n" <<
+        " index++; count++;\n" <<
+        " start2++;\n" <<
+        " }\n" <<
+        "}\n" <<
+        // drain whatever is left of the first sub-range
+        "while(start1<end1)\n" <<
+        "{\n" <<
+        result[expr<uint_>("index")] <<
+        " = " << first1[expr<uint_>("start1")] << ";\n" <<
+        " index++; count++;\n" <<
+        " start1++;\n" <<
+        "}\n" <<
+        // drain whatever is left of the second sub-range
+        "while(start2<end2)\n" <<
+        "{\n" <<
+        result[expr<uint_>("index")] <<
+        " = " << first2[expr<uint_>("start2")] << ";\n" <<
+        " index++; count++;\n" <<
+        " start2++;\n" <<
+        "}\n" <<
+        counts[expr<uint_>("i")] << " = count;\n";
+    }
+
+    /// Runs the kernel with one work-item per tile. Returns a default
+    /// constructed event without enqueueing anything when there are no
+    /// tiles.
+    event exec(command_queue &queue)
+    {
+        if(m_count == 0) {
+            return event();
+        }
+
+        return exec_1d(queue, 0, m_count);
+    }
+
+private:
+    // number of tiles, i.e. the global work size used by exec()
+    size_t m_count;
+};
+
+} //end detail namespace
+
+///
+/// \brief Set union algorithm
+///
+/// Computes the union of the sorted range [\p first1, \p last1) and the
+/// sorted range [\p first2, \p last2) and writes it to the range that
+/// starts at \p result.
+/// \return Iterator pointing to end of union
+///
+/// \param first1 Iterator pointing to start of first set
+/// \param last1 Iterator pointing to end of first set
+/// \param first2 Iterator pointing to start of second set
+/// \param last2 Iterator pointing to end of second set
+/// \param result Iterator pointing to start of range in which the union
+/// will be stored
+/// \param queue Queue on which to execute
+///
+template<class InputIterator1, class InputIterator2, class OutputIterator>
+inline OutputIterator set_union(InputIterator1 first1,
+                                InputIterator1 last1,
+                                InputIterator2 first2,
+                                InputIterator2 last2,
+                                OutputIterator result,
+                                command_queue &queue = system::default_queue())
+{
+    typedef typename std::iterator_traits<InputIterator1>::value_type value_type;
+
+    int tile_size = 1024;
+
+    int count1 = detail::iterator_range_size(first1, last1);
+    int count2 = detail::iterator_range_size(first2, last2);
+
+    // number of tile boundaries: one per tile plus a trailing end marker
+    int boundary_count = (count1+count2+tile_size-1)/tile_size+1;
+
+    vector<uint_> bounds1(boundary_count, queue.get_context());
+    vector<uint_> bounds2(boundary_count, queue.get_context());
+
+    // Split both sets into tiles; the first boundary of each set is 0 and
+    // the last one is the size of the set
+    detail::balanced_path_kernel path_kernel;
+    path_kernel.tile_size = tile_size;
+    path_kernel.set_range(first1, last1, first2, last2,
+                          bounds1.begin()+1, bounds2.begin()+1);
+    fill_n(bounds1.begin(), 1, 0, queue);
+    fill_n(bounds2.begin(), 1, 0, queue);
+    path_kernel.exec(queue);
+
+    fill_n(bounds1.end()-1, 1, count1, queue);
+    fill_n(bounds2.end()-1, 1, count2, queue);
+
+    vector<value_type> scratch(count1+count2, queue.get_context());
+    vector<uint_> tile_counts(boundary_count, queue.get_context());
+    fill_n(tile_counts.end()-1, 1, 0, queue);
+
+    // Compute the union of every tile into the scratch storage
+    detail::serial_set_union_kernel tile_union;
+    tile_union.tile_size = tile_size;
+    tile_union.set_range(first1, first2, bounds1.begin(), bounds1.end(),
+                         bounds2.begin(), scratch.begin(), tile_counts.begin());
+
+    tile_union.exec(queue);
+
+    // Turn the per-tile counts into output offsets
+    exclusive_scan(tile_counts.begin(), tile_counts.end(),
+                   tile_counts.begin(), queue);
+
+    // Pack the per-tile results into the output range
+    detail::compact_kernel packer;
+    packer.tile_size = tile_size;
+    packer.set_range(scratch.begin(), tile_counts.begin(),
+                     tile_counts.end(), result);
+
+    packer.exec(queue);
+
+    // after the scan the last entry holds the total number of elements
+    return result + (tile_counts.end() - 1).read(queue);
+}
+
+} //end compute namespace
+} //end boost namespace
+
+#endif // BOOST_COMPUTE_ALGORITHM_SET_UNION_HPP
diff --git a/boost/compute/algorithm/sort.hpp b/boost/compute/algorithm/sort.hpp
new file mode 100644
index 0000000000..b2730b3e2b
--- /dev/null
+++ b/boost/compute/algorithm/sort.hpp
@@ -0,0 +1,194 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#ifndef BOOST_COMPUTE_ALGORITHM_SORT_HPP
+#define BOOST_COMPUTE_ALGORITHM_SORT_HPP
+
+#include <iterator>
+
+#include <boost/utility/enable_if.hpp>
+
+#include <boost/compute/system.hpp>
+#include <boost/compute/command_queue.hpp>
+#include <boost/compute/algorithm/detail/merge_sort_on_cpu.hpp>
+#include <boost/compute/algorithm/detail/radix_sort.hpp>
+#include <boost/compute/algorithm/detail/insertion_sort.hpp>
+#include <boost/compute/algorithm/reverse.hpp>
+#include <boost/compute/container/mapped_view.hpp>
+#include <boost/compute/detail/iterator_range_size.hpp>
+#include <boost/compute/iterator/buffer_iterator.hpp>
+#include <boost/compute/type_traits/is_device_iterator.hpp>
+
+namespace boost {
+namespace compute {
+namespace detail {
+
+// Ascending GPU sort for radix-sortable value types: ranges with fewer
+// than two elements are left alone, ranges of up to 32 elements go to
+// serial_insertion_sort() and everything larger goes to radix_sort().
+template<class T>
+inline void dispatch_gpu_sort(buffer_iterator<T> first,
+                              buffer_iterator<T> last,
+                              less<T>,
+                              command_queue &queue,
+                              typename boost::enable_if_c<
+                                  is_radix_sortable<T>::value
+                              >::type* = 0)
+{
+    const size_t length = detail::iterator_range_size(first, last);
+
+    // nothing to do
+    if(length < 2){
+        return;
+    }
+
+    if(length <= 32){
+        ::boost::compute::detail::serial_insertion_sort(first, last, queue);
+        return;
+    }
+
+    ::boost::compute::detail::radix_sort(first, last, queue);
+}
+
+// Descending GPU sort for radix-sortable value types: ranges with fewer
+// than two elements are left alone, ranges of up to 32 elements go to
+// serial_insertion_sort() with \p compare and everything larger is radix
+// sorted and then reversed.
+template<class T>
+inline void dispatch_gpu_sort(buffer_iterator<T> first,
+                              buffer_iterator<T> last,
+                              greater<T> compare,
+                              command_queue &queue,
+                              typename boost::enable_if_c<
+                                  is_radix_sortable<T>::value
+                              >::type* = 0)
+{
+    const size_t length = detail::iterator_range_size(first, last);
+
+    // nothing to do
+    if(length < 2){
+        return;
+    }
+
+    if(length <= 32){
+        ::boost::compute::detail::serial_insertion_sort(
+            first, last, compare, queue
+        );
+        return;
+    }
+
+    // radix sort in ascending order ...
+    ::boost::compute::detail::radix_sort(first, last, queue);
+
+    // ... then flip the range to get descending order
+    ::boost::compute::reverse(first, last, queue);
+}
+
+// Fallback GPU sort for any other iterator/comparison combination: always
+// uses serial_insertion_sort() with \p compare.
+template<class Iterator, class Compare>
+inline void dispatch_gpu_sort(Iterator first,
+                              Iterator last,
+                              Compare compare,
+                              command_queue &queue)
+{
+    detail::serial_insertion_sort(first, last, compare, queue);
+}
+
+// sort() for device iterators: GPU devices go through dispatch_gpu_sort(),
+// every other device type uses merge_sort_on_cpu()
+template<class Iterator, class Compare>
+inline void dispatch_sort(Iterator first,
+                          Iterator last,
+                          Compare compare,
+                          command_queue &queue,
+                          typename boost::enable_if<
+                              is_device_iterator<Iterator>
+                          >::type* = 0)
+{
+    if(queue.get_device().type() & device::gpu) {
+        dispatch_gpu_sort(first, last, compare, queue);
+    }
+    else {
+        ::boost::compute::detail::merge_sort_on_cpu(
+            first, last, compare, queue
+        );
+    }
+}
+
+// sort() for host iterators: wraps the host memory in a mapped_view, sorts
+// the view with the device overload and maps the view again afterwards
+template<class Iterator, class Compare>
+inline void dispatch_sort(Iterator first,
+                          Iterator last,
+                          Compare compare,
+                          command_queue &queue,
+                          typename boost::disable_if<
+                              is_device_iterator<Iterator>
+                          >::type* = 0)
+{
+    typedef typename std::iterator_traits<Iterator>::value_type element_type;
+
+    const size_t length = static_cast<size_t>(std::distance(first, last));
+
+    // create mapped buffer on top of the host range
+    mapped_view<element_type> host_view(
+        boost::addressof(*first), length, queue.get_context()
+    );
+
+    // sort mapped buffer
+    dispatch_sort(host_view.begin(), host_view.end(), compare, queue);
+
+    // return results to host
+    host_view.map(queue);
+}
+
+} // end detail namespace
+
+/// Sorts the values in the range [\p first, \p last) according to
+/// \p compare.
+///
+/// \param first iterator to the first element of the range to sort
+/// \param last iterator one past the final element of the range to sort
+/// \param compare comparison function (by default \c less)
+/// \param queue command queue to perform the operation
+///
+/// For example, to sort a vector on the device:
+/// \code
+/// // create vector on the device with data
+/// float data[] = { 2.f, 4.f, 1.f, 3.f };
+/// boost::compute::vector<float> vec(data, data + 4, queue);
+///
+/// // sort the vector on the device
+/// boost::compute::sort(vec.begin(), vec.end(), queue);
+/// \endcode
+///
+/// The sort() algorithm can also be directly used with host iterators. This
+/// example will automatically transfer the data to the device, sort it, and
+/// then transfer the data back to the host:
+/// \code
+/// std::vector<int> data = { 9, 3, 2, 5, 1, 4, 6, 7 };
+///
+/// boost::compute::sort(data.begin(), data.end(), queue);
+/// \endcode
+///
+/// \see is_sorted()
+template<class Iterator, class Compare>
+inline void sort(Iterator first,
+                 Iterator last,
+                 Compare compare,
+                 command_queue &queue = system::default_queue())
+{
+    // the host/device decision is made by the dispatch_sort() overloads
+    ::boost::compute::detail::dispatch_sort(
+        first, last, compare, queue
+    );
+}
+
+/// \overload
+///
+/// Sorts with \c less of the iterator's value type.
+template<class Iterator>
+inline void sort(Iterator first,
+                 Iterator last,
+                 command_queue &queue = system::default_queue())
+{
+    typedef typename std::iterator_traits<Iterator>::value_type element_type;
+
+    ::boost::compute::less<element_type> ascending;
+
+    ::boost::compute::sort(first, last, ascending, queue);
+}
+
+} // end compute namespace
+} // end boost namespace
+
+#endif // BOOST_COMPUTE_ALGORITHM_SORT_HPP
diff --git a/boost/compute/algorithm/sort_by_key.hpp b/boost/compute/algorithm/sort_by_key.hpp
new file mode 100644
index 0000000000..0e3dba81eb
--- /dev/null
+++ b/boost/compute/algorithm/sort_by_key.hpp
@@ -0,0 +1,156 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#ifndef BOOST_COMPUTE_ALGORITHM_SORT_BY_KEY_HPP
+#define BOOST_COMPUTE_ALGORITHM_SORT_BY_KEY_HPP
+
+#include <iterator>
+
+#include <boost/utility/enable_if.hpp>
+
+#include <boost/compute/system.hpp>
+#include <boost/compute/command_queue.hpp>
+#include <boost/compute/algorithm/detail/merge_sort_on_cpu.hpp>
+#include <boost/compute/algorithm/detail/insertion_sort.hpp>
+#include <boost/compute/algorithm/detail/radix_sort.hpp>
+#include <boost/compute/algorithm/reverse.hpp>
+#include <boost/compute/detail/iterator_range_size.hpp>
+
+namespace boost {
+namespace compute {
+
+namespace detail {
+
+// Ascending GPU key-value sort for radix-sortable key types: fewer than 32
+// keys go to serial_insertion_sort_by_key(), everything else goes to
+// radix_sort_by_key().
+template<class KeyIterator, class ValueIterator>
+inline void
+dispatch_gpu_sort_by_key(KeyIterator keys_first,
+                         KeyIterator keys_last,
+                         ValueIterator values_first,
+                         less<typename std::iterator_traits<KeyIterator>::value_type> compare,
+                         command_queue &queue,
+                         typename boost::enable_if_c<
+                             is_radix_sortable<
+                                 typename std::iterator_traits<KeyIterator>::value_type
+                             >::value
+                         >::type* = 0)
+{
+    const size_t key_count = detail::iterator_range_size(keys_first, keys_last);
+
+    if(key_count >= 32){
+        detail::radix_sort_by_key(
+            keys_first, keys_last, values_first, queue
+        );
+        return;
+    }
+
+    detail::serial_insertion_sort_by_key(
+        keys_first, keys_last, values_first, compare, queue
+    );
+}
+
+// Descending GPU key-value sort for radix-sortable key types: fewer than
+// 32 keys go to serial_insertion_sort_by_key(), everything else is radix
+// sorted by key and then both keys and values are reversed.
+template<class KeyIterator, class ValueIterator>
+inline void
+dispatch_gpu_sort_by_key(KeyIterator keys_first,
+                         KeyIterator keys_last,
+                         ValueIterator values_first,
+                         greater<typename std::iterator_traits<KeyIterator>::value_type> compare,
+                         command_queue &queue,
+                         typename boost::enable_if_c<
+                             is_radix_sortable<
+                                 typename std::iterator_traits<KeyIterator>::value_type
+                             >::value
+                         >::type* = 0)
+{
+    const size_t key_count = detail::iterator_range_size(keys_first, keys_last);
+
+    if(key_count >= 32){
+        // radix sorts in ascending order
+        detail::radix_sort_by_key(
+            keys_first, keys_last, values_first, queue
+        );
+
+        // Reverse keys, values for descending order
+        ::boost::compute::reverse(keys_first, keys_last, queue);
+        ::boost::compute::reverse(
+            values_first, values_first + key_count, queue
+        );
+        return;
+    }
+
+    detail::serial_insertion_sort_by_key(
+        keys_first, keys_last, values_first, compare, queue
+    );
+}
+
+// Fallback GPU key-value sort for any other comparison: always uses
+// serial_insertion_sort_by_key() with \p compare.
+template<class KeyIterator, class ValueIterator, class Compare>
+inline void dispatch_gpu_sort_by_key(KeyIterator keys_first,
+                                     KeyIterator keys_last,
+                                     ValueIterator values_first,
+                                     Compare compare,
+                                     command_queue &queue)
+{
+    detail::serial_insertion_sort_by_key(keys_first,
+                                         keys_last,
+                                         values_first,
+                                         compare,
+                                         queue);
+}
+
+// Chooses the key-value sort implementation from the device type of
+// \p queue: GPU devices use dispatch_gpu_sort_by_key(), every other device
+// type uses merge_sort_by_key_on_cpu().
+template<class KeyIterator, class ValueIterator, class Compare>
+inline void dispatch_sort_by_key(KeyIterator keys_first,
+                                 KeyIterator keys_last,
+                                 ValueIterator values_first,
+                                 Compare compare,
+                                 command_queue &queue)
+{
+    if(queue.get_device().type() & device::gpu) {
+        dispatch_gpu_sort_by_key(
+            keys_first, keys_last, values_first, compare, queue
+        );
+    }
+    else {
+        ::boost::compute::detail::merge_sort_by_key_on_cpu(
+            keys_first, keys_last, values_first, compare, queue
+        );
+    }
+}
+
+} // end detail namespace
+
+/// Sorts the keys in the range [\p keys_first, \p keys_last) using
+/// \p compare and applies the same reordering to the values in the range
+/// [\p values_first, \p values_first \c + (\p keys_last \c - \p keys_first)).
+///
+/// If no compare function is specified, \c less is used.
+///
+/// \see sort()
+template<class KeyIterator, class ValueIterator, class Compare>
+inline void sort_by_key(KeyIterator keys_first,
+                        KeyIterator keys_last,
+                        ValueIterator values_first,
+                        Compare compare,
+                        command_queue &queue = system::default_queue())
+{
+    // the implementation is picked by detail::dispatch_sort_by_key()
+    ::boost::compute::detail::dispatch_sort_by_key(keys_first,
+                                                   keys_last,
+                                                   values_first,
+                                                   compare,
+                                                   queue);
+}
+
+/// \overload
+///
+/// Sorts with \c less of the key iterator's value type.
+template<class KeyIterator, class ValueIterator>
+inline void sort_by_key(KeyIterator keys_first,
+                        KeyIterator keys_last,
+                        ValueIterator values_first,
+                        command_queue &queue = system::default_queue())
+{
+    typedef typename std::iterator_traits<KeyIterator>::value_type key_value_type;
+
+    less<key_value_type> ascending;
+
+    ::boost::compute::sort_by_key(
+        keys_first, keys_last, values_first, ascending, queue
+    );
+}
+
+} // end compute namespace
+} // end boost namespace
+
+#endif // BOOST_COMPUTE_ALGORITHM_SORT_BY_KEY_HPP
diff --git a/boost/compute/algorithm/stable_partition.hpp b/boost/compute/algorithm/stable_partition.hpp
new file mode 100644
index 0000000000..283b068283
--- /dev/null
+++ b/boost/compute/algorithm/stable_partition.hpp
@@ -0,0 +1,72 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2014 Roshan <thisisroshansmail@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#ifndef BOOST_COMPUTE_ALGORITHM_STABLE_PARTITION_HPP
+#define BOOST_COMPUTE_ALGORITHM_STABLE_PARTITION_HPP
+
+#include <boost/compute/system.hpp>
+#include <boost/compute/context.hpp>
+#include <boost/compute/functional.hpp>
+#include <boost/compute/command_queue.hpp>
+#include <boost/compute/algorithm/copy_if.hpp>
+#include <boost/compute/container/vector.hpp>
+
+namespace boost {
+namespace compute {
+
+///
+/// \brief Partitioning algorithm
+///
+/// Partitions the elements in the range [\p first, \p last) according to
+/// \p predicate. The order of the elements is preserved.
+///
+/// The input is first copied to a temporary device vector; the elements
+/// satisfying \p predicate are then copied back to the front of the range
+/// and the remaining elements directly behind them.
+/// \return Iterator pointing to end of true values
+///
+/// \param first Iterator pointing to start of range
+/// \param last Iterator pointing to end of range
+/// \param predicate Unary predicate to be applied on each element
+/// \param queue Queue on which to execute
+///
+/// \see is_partitioned() and partition()
+///
+template<class Iterator, class UnaryPredicate>
+inline Iterator stable_partition(Iterator first,
+                                 Iterator last,
+                                 UnaryPredicate predicate,
+                                 command_queue &queue = system::default_queue())
+{
+    typedef typename std::iterator_traits<Iterator>::value_type value_type;
+
+    // make temporary copy of the input
+    ::boost::compute::vector<value_type> tmp(first, last, queue);
+
+    // copy true values
+    Iterator last_true =
+        ::boost::compute::copy_if(tmp.begin(),
+                                  tmp.end(),
+                                  first,
+                                  predicate,
+                                  queue);
+
+    // copy false values; the end iterator returned here is not needed, so
+    // it is not stored (avoids an unused-variable warning)
+    ::boost::compute::copy_if(tmp.begin(),
+                              tmp.end(),
+                              last_true,
+                              not1(predicate),
+                              queue);
+
+    // return iterator pointing to the last true value
+    return last_true;
+}
+
+} // end compute namespace
+} // end boost namespace
+
+#endif // BOOST_COMPUTE_ALGORITHM_STABLE_PARTITION_HPP
diff --git a/boost/compute/algorithm/stable_sort.hpp b/boost/compute/algorithm/stable_sort.hpp
new file mode 100644
index 0000000000..cd82a0a606
--- /dev/null
+++ b/boost/compute/algorithm/stable_sort.hpp
@@ -0,0 +1,99 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#ifndef BOOST_COMPUTE_ALGORITHM_STABLE_SORT_HPP
+#define BOOST_COMPUTE_ALGORITHM_STABLE_SORT_HPP
+
+#include <iterator>
+
+#include <boost/compute/system.hpp>
+#include <boost/compute/command_queue.hpp>
+#include <boost/compute/algorithm/detail/merge_sort_on_cpu.hpp>
+#include <boost/compute/algorithm/detail/radix_sort.hpp>
+#include <boost/compute/algorithm/detail/insertion_sort.hpp>
+#include <boost/compute/algorithm/reverse.hpp>
+#include <boost/compute/functional/operator.hpp>
+
+namespace boost {
+namespace compute {
+namespace detail {
+
+// Fallback GPU stable sort for any iterator/comparison combination:
+// always uses serial_insertion_sort() with \p compare.
+template<class Iterator, class Compare>
+inline void dispatch_gpu_stable_sort(Iterator first,
+                                     Iterator last,
+                                     Compare compare,
+                                     command_queue &queue)
+{
+    detail::serial_insertion_sort(first, last, compare, queue);
+}
+
+// Ascending GPU stable sort for radix-sortable value types: delegates to
+// radix_sort().
+template<class T>
+inline typename boost::enable_if_c<is_radix_sortable<T>::value>::type
+dispatch_gpu_stable_sort(buffer_iterator<T> first,
+                         buffer_iterator<T> last,
+                         less<T>,
+                         command_queue &queue)
+{
+    detail::radix_sort(first, last, queue);
+}
+
+// Descending GPU stable sort for radix-sortable value types: radix sorts
+// the range and reverses it afterwards.
+template<class T>
+inline typename boost::enable_if_c<is_radix_sortable<T>::value>::type
+dispatch_gpu_stable_sort(buffer_iterator<T> first,
+                         buffer_iterator<T> last,
+                         greater<T>,
+                         command_queue &queue)
+{
+    // ascending order first ...
+    detail::radix_sort(first, last, queue);
+
+    // ... then flip the range to get descending order
+    ::boost::compute::reverse(first, last, queue);
+}
+
+} // end detail namespace
+
+/// Sorts the values in the range [\p first, \p last) according to
+/// \p compare. The relative order of identical values is preserved.
+///
+/// When \p queue belongs to a GPU device the work is handed to
+/// detail::dispatch_gpu_stable_sort(); for every other device type
+/// detail::merge_sort_on_cpu() is used.
+///
+/// \see sort(), is_sorted()
+template<class Iterator, class Compare>
+inline void stable_sort(Iterator first,
+                        Iterator last,
+                        Compare compare,
+                        command_queue &queue = system::default_queue())
+{
+    if(queue.get_device().type() & device::gpu) {
+        ::boost::compute::detail::dispatch_gpu_stable_sort(
+            first, last, compare, queue
+        );
+
+        // the range is sorted at this point; without returning here the
+        // merge sort below would sort the same range a second time
+        return;
+    }
+    ::boost::compute::detail::merge_sort_on_cpu(first, last, compare, queue);
+}
+
+/// \overload
+///
+/// Sorts with \c less of the iterator's value type.
+template<class Iterator>
+inline void stable_sort(Iterator first,
+                        Iterator last,
+                        command_queue &queue = system::default_queue())
+{
+    typedef typename std::iterator_traits<Iterator>::value_type element_type;
+
+    ::boost::compute::stable_sort(
+        first, last, ::boost::compute::less<element_type>(), queue
+    );
+}
+
+} // end compute namespace
+} // end boost namespace
+
+#endif // BOOST_COMPUTE_ALGORITHM_STABLE_SORT_HPP
diff --git a/boost/compute/algorithm/stable_sort_by_key.hpp b/boost/compute/algorithm/stable_sort_by_key.hpp
new file mode 100644
index 0000000000..8a51372ede
--- /dev/null
+++ b/boost/compute/algorithm/stable_sort_by_key.hpp
@@ -0,0 +1,61 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2016 Jakub Szuppe <j.szuppe@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#ifndef BOOST_COMPUTE_ALGORITHM_STABLE_SORT_BY_KEY_HPP
+#define BOOST_COMPUTE_ALGORITHM_STABLE_SORT_BY_KEY_HPP
+
+#include <iterator>
+
+#include <boost/compute/system.hpp>
+#include <boost/compute/command_queue.hpp>
+#include <boost/compute/algorithm/sort_by_key.hpp>
+#include <boost/compute/detail/iterator_range_size.hpp>
+
+namespace boost {
+namespace compute {
+
+/// Stable key-value sort: orders the keys in the range [\p keys_first,
+/// \p keys_last) using \p compare and applies the same reordering to the
+/// values in the range [\p values_first,
+/// \p values_first \c + (\p keys_last \c - \p keys_first)).
+///
+/// If no compare function is specified, \c less is used.
+///
+/// \see sort()
+template<class KeyIterator, class ValueIterator, class Compare>
+inline void stable_sort_by_key(KeyIterator keys_first,
+                               KeyIterator keys_last,
+                               ValueIterator values_first,
+                               Compare compare,
+                               command_queue &queue = system::default_queue())
+{
+    // sort_by_key is stable
+    ::boost::compute::sort_by_key(keys_first,
+                                  keys_last,
+                                  values_first,
+                                  compare,
+                                  queue);
+}
+
+/// \overload
+///
+/// Sorts with \c less of the key iterator's value type.
+template<class KeyIterator, class ValueIterator>
+inline void stable_sort_by_key(KeyIterator keys_first,
+                               KeyIterator keys_last,
+                               ValueIterator values_first,
+                               command_queue &queue = system::default_queue())
+{
+    typedef typename std::iterator_traits<KeyIterator>::value_type key_value_type;
+
+    less<key_value_type> ascending;
+
+    ::boost::compute::stable_sort_by_key(
+        keys_first, keys_last, values_first, ascending, queue
+    );
+}
+
+} // end compute namespace
+} // end boost namespace
+
+#endif // BOOST_COMPUTE_ALGORITHM_STABLE_SORT_BY_KEY_HPP
diff --git a/boost/compute/algorithm/swap_ranges.hpp b/boost/compute/algorithm/swap_ranges.hpp
new file mode 100644
index 0000000000..6ff3e14f6a
--- /dev/null
+++ b/boost/compute/algorithm/swap_ranges.hpp
@@ -0,0 +1,44 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#ifndef BOOST_COMPUTE_ALGORITHM_SWAP_RANGES_HPP
+#define BOOST_COMPUTE_ALGORITHM_SWAP_RANGES_HPP
+
+#include <boost/compute/system.hpp>
+#include <boost/compute/command_queue.hpp>
+#include <boost/compute/algorithm/copy.hpp>
+#include <boost/compute/container/vector.hpp>
+
+namespace boost {
+namespace compute {
+
+/// Swaps the elements in the range [\p first1, \p last1) with the
+/// elements in the range beginning at \p first2.
+///
+/// Returns the end of the second range, i.e. \p first2 advanced by the
+/// length of the first range.
+template<class Iterator1, class Iterator2>
+inline Iterator2 swap_ranges(Iterator1 first1,
+                             Iterator1 last1,
+                             Iterator2 first2,
+                             command_queue &queue = system::default_queue())
+{
+    typedef typename std::iterator_traits<Iterator1>::value_type value_type;
+
+    Iterator2 last2 = first2 + std::distance(first1, last1);
+
+    // keep a copy of the first range so it can be overwritten
+    ::boost::compute::vector<value_type> saved(first1, last1, queue);
+
+    // second range -> first range
+    ::boost::compute::copy(first2, last2, first1, queue);
+
+    // saved first range -> second range
+    ::boost::compute::copy(saved.begin(), saved.end(), first2, queue);
+
+    return last2;
+}
+
+} // end compute namespace
+} // end boost namespace
+
+#endif // BOOST_COMPUTE_ALGORITHM_SWAP_RANGES_HPP
diff --git a/boost/compute/algorithm/transform.hpp b/boost/compute/algorithm/transform.hpp
new file mode 100644
index 0000000000..022a4988bd
--- /dev/null
+++ b/boost/compute/algorithm/transform.hpp
@@ -0,0 +1,76 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#ifndef BOOST_COMPUTE_ALGORITHM_TRANSFORM_HPP
+#define BOOST_COMPUTE_ALGORITHM_TRANSFORM_HPP
+
+#include <boost/compute/system.hpp>
+#include <boost/compute/command_queue.hpp>
+#include <boost/compute/algorithm/copy.hpp>
+#include <boost/compute/iterator/transform_iterator.hpp>
+#include <boost/compute/iterator/zip_iterator.hpp>
+#include <boost/compute/functional/detail/unpack.hpp>
+
+namespace boost {
+namespace compute {
+
+/// Applies \p op to every element in the range [\p first, \p last) and
+/// stores the results in the range beginning at \p result.
+///
+/// For example, to calculate the absolute value for each element in a vector:
+///
+/// \snippet test/test_transform.cpp transform_abs
+///
+/// \see copy()
+template<class InputIterator, class OutputIterator, class UnaryOperator>
+inline OutputIterator transform(InputIterator first,
+                                InputIterator last,
+                                OutputIterator result,
+                                UnaryOperator op,
+                                command_queue &queue = system::default_queue())
+{
+    // implemented as a copy() from a pair of transform iterators
+    return copy(::boost::compute::make_transform_iterator(first, op),
+                ::boost::compute::make_transform_iterator(last, op),
+                result,
+                queue);
+}
+
+/// \overload
+///
+/// Binary version: applies \p op to pairs of elements taken from the range
+/// [\p first1, \p last1) and the equally long range beginning at
+/// \p first2.
+template<class InputIterator1,
+         class InputIterator2,
+         class OutputIterator,
+         class BinaryOperator>
+inline OutputIterator transform(InputIterator1 first1,
+                                InputIterator1 last1,
+                                InputIterator2 first2,
+                                OutputIterator result,
+                                BinaryOperator op,
+                                command_queue &queue = system::default_queue())
+{
+    typedef typename std::iterator_traits<InputIterator1>::difference_type difference_type;
+
+    // the second range has the same length as the first one
+    difference_type length = std::distance(first1, last1);
+
+    // zip both inputs and run the unary transform with the unpacked operator
+    return transform(make_zip_iterator(boost::make_tuple(first1, first2)),
+                     make_zip_iterator(boost::make_tuple(last1, first2 + length)),
+                     result,
+                     detail::unpack(op),
+                     queue);
+}
+
+} // end compute namespace
+} // end boost namespace
+
+#endif // BOOST_COMPUTE_ALGORITHM_TRANSFORM_HPP
diff --git a/boost/compute/algorithm/transform_if.hpp b/boost/compute/algorithm/transform_if.hpp
new file mode 100644
index 0000000000..0eb0fd434e
--- /dev/null
+++ b/boost/compute/algorithm/transform_if.hpp
@@ -0,0 +1,117 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013-2015 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#ifndef BOOST_COMPUTE_ALGORITHM_TRANSFORM_IF_HPP
+#define BOOST_COMPUTE_ALGORITHM_TRANSFORM_IF_HPP
+
+#include <boost/compute/cl.hpp>
+#include <boost/compute/system.hpp>
+#include <boost/compute/command_queue.hpp>
+#include <boost/compute/algorithm/count.hpp>
+#include <boost/compute/algorithm/count_if.hpp>
+#include <boost/compute/algorithm/exclusive_scan.hpp>
+#include <boost/compute/container/vector.hpp>
+#include <boost/compute/detail/meta_kernel.hpp>
+#include <boost/compute/detail/iterator_range_size.hpp>
+#include <boost/compute/iterator/discard_iterator.hpp>
+
+namespace boost {
+namespace compute {
+namespace detail {
+
+// Writes, for every element of [first, last) that satisfies \p predicate,
+// either function(element) or (when \p copyIndex is set) the element's
+// index to consecutive positions starting at \p result. Returns \p result
+// advanced by the number of elements written.
+template<class InputIterator, class OutputIterator, class UnaryFunction, class Predicate>
+inline OutputIterator transform_if_impl(InputIterator first,
+                                        InputIterator last,
+                                        OutputIterator result,
+                                        UnaryFunction function,
+                                        Predicate predicate,
+                                        bool copyIndex,
+                                        command_queue &queue)
+{
+    typedef typename std::iterator_traits<OutputIterator>::difference_type difference_type;
+
+    const size_t input_size = detail::iterator_range_size(first, last);
+    if(input_size == 0){
+        return result;
+    }
+
+    const context &ctx = queue.get_context();
+
+    // one slot per input element: first a 0/1 flag, later the output index
+    ::boost::compute::vector<cl_uint> slots(input_size, ctx);
+
+    // flag the elements accepted by the predicate
+    ::boost::compute::detail::meta_kernel flag_kernel("transform_if_write_counts");
+    flag_kernel << slots.begin()[flag_kernel.get_global_id(0)] << " = "
+                << predicate(first[flag_kernel.get_global_id(0)]) << " ? 1 : 0;\n";
+    flag_kernel.exec_1d(queue, 0, input_size);
+
+    // number of accepted elements
+    const size_t accepted =
+        ::boost::compute::count(slots.begin(), slots.end(), 1, queue);
+
+    // turn the flags into destination indices
+    ::boost::compute::exclusive_scan(
+        slots.begin(), slots.end(), slots.begin(), queue
+    );
+
+    // write the accepted elements to their destination
+    ::boost::compute::detail::meta_kernel write_kernel("transform_if_do_copy");
+    write_kernel << "if(" << predicate(first[write_kernel.get_global_id(0)]) << ")" <<
+        " " << result[slots.begin()[write_kernel.get_global_id(0)]] << "=";
+
+    if(copyIndex){
+        write_kernel << write_kernel.get_global_id(0) << ";\n";
+    }
+    else {
+        write_kernel << function(first[write_kernel.get_global_id(0)]) << ";\n";
+    }
+
+    write_kernel.exec_1d(queue, 0, input_size);
+
+    return result + static_cast<difference_type>(accepted);
+}
+
+// Overload for discard_iterator output: nothing is written, so only the
+// number of elements satisfying \p predicate is computed and used to
+// advance \p result.
+template<class InputIterator, class UnaryFunction, class Predicate>
+inline discard_iterator transform_if_impl(InputIterator first,
+                                          InputIterator last,
+                                          discard_iterator result,
+                                          UnaryFunction function,
+                                          Predicate predicate,
+                                          bool copyIndex,
+                                          command_queue &queue)
+{
+    // neither argument is needed when the output is discarded
+    static_cast<void>(function);
+    static_cast<void>(copyIndex);
+
+    return result + count_if(first, last, predicate, queue);
+}
+
+} // end detail namespace
+
+/// Applies \p function to each element in the range [\p first, \p last)
+/// for which \p predicate returns \c true and stores the results in the
+/// range beginning at \p result.
+template<class InputIterator, class OutputIterator, class UnaryFunction, class Predicate>
+inline OutputIterator transform_if(InputIterator first,
+                                   InputIterator last,
+                                   OutputIterator result,
+                                   UnaryFunction function,
+                                   Predicate predicate,
+                                   command_queue &queue = system::default_queue())
+{
+    // store transformed values, not element indices
+    const bool copy_index = false;
+
+    return detail::transform_if_impl(
+        first, last, result, function, predicate, copy_index, queue
+    );
+}
+
+} // end compute namespace
+} // end boost namespace
+
+#endif // BOOST_COMPUTE_ALGORITHM_TRANSFORM_IF_HPP
diff --git a/boost/compute/algorithm/transform_reduce.hpp b/boost/compute/algorithm/transform_reduce.hpp
new file mode 100644
index 0000000000..fbeee5a691
--- /dev/null
+++ b/boost/compute/algorithm/transform_reduce.hpp
@@ -0,0 +1,89 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#ifndef BOOST_COMPUTE_ALGORITHM_TRANSFORM_REDUCE_HPP
+#define BOOST_COMPUTE_ALGORITHM_TRANSFORM_REDUCE_HPP
+
+#include <boost/compute/system.hpp>
+#include <boost/compute/algorithm/reduce.hpp>
+#include <boost/compute/iterator/transform_iterator.hpp>
+#include <boost/compute/iterator/zip_iterator.hpp>
+#include <boost/compute/functional/detail/unpack.hpp>
+#include <boost/compute/detail/iterator_range_size.hpp>
+
+namespace boost {
+namespace compute {
+
+/// Applies the unary \p transform_function to each value in the range
+/// [\p first, \p last) and combines the transformed values with
+/// \p reduce_function, passing \p result on to reduce() as the output.
+///
+/// For example, to calculate the sum of the absolute values of a vector
+/// of integers:
+///
+/// \snippet test/test_transform_reduce.cpp sum_abs_int
+///
+/// \see reduce(), inner_product()
+template<class InputIterator,
+         class OutputIterator,
+         class UnaryTransformFunction,
+         class BinaryReduceFunction>
+inline void transform_reduce(InputIterator first,
+                             InputIterator last,
+                             OutputIterator result,
+                             UnaryTransformFunction transform_function,
+                             BinaryReduceFunction reduce_function,
+                             command_queue &queue = system::default_queue())
+{
+    typedef transform_iterator<InputIterator, UnaryTransformFunction> xform_iterator;
+
+    const xform_iterator xform_first =
+        ::boost::compute::make_transform_iterator(first, transform_function);
+    const xform_iterator xform_last =
+        ::boost::compute::make_transform_iterator(last, transform_function);
+    ::boost::compute::reduce(xform_first, xform_last, result, reduce_function, queue);
+}
+
+/// \overload
+///
+/// Pairs the elements of [\p first1, \p last1) with the equally long range
+/// beginning at \p first2 and hands each pair to the binary \p transform_function.
+template<class InputIterator1,
+         class InputIterator2,
+         class OutputIterator,
+         class BinaryTransformFunction,
+         class BinaryReduceFunction>
+inline void transform_reduce(InputIterator1 first1,
+                             InputIterator1 last1,
+                             InputIterator2 first2,
+                             OutputIterator result,
+                             BinaryTransformFunction transform_function,
+                             BinaryReduceFunction reduce_function,
+                             command_queue &queue = system::default_queue())
+{
+    // the second range has no explicit end: derive it from the first range's length
+    const typename std::iterator_traits<InputIterator1>::difference_type length =
+        std::distance(first1, last1);
+    const InputIterator2 last2 = first2 + length;
+
+    ::boost::compute::transform_reduce(
+        ::boost::compute::make_zip_iterator(boost::make_tuple(first1, first2)),
+        ::boost::compute::make_zip_iterator(boost::make_tuple(last1, last2)),
+        result,
+        detail::unpack(transform_function),
+        reduce_function,
+        queue
+    );
+}
+
+} // end compute namespace
+} // end boost namespace
+
+#endif // BOOST_COMPUTE_ALGORITHM_TRANSFORM_REDUCE_HPP
diff --git a/boost/compute/algorithm/unique.hpp b/boost/compute/algorithm/unique.hpp
new file mode 100644
index 0000000000..faa36bad9d
--- /dev/null
+++ b/boost/compute/algorithm/unique.hpp
@@ -0,0 +1,66 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2014 Roshan <thisisroshansmail@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#ifndef BOOST_COMPUTE_ALGORITHM_UNIQUE_HPP
+#define BOOST_COMPUTE_ALGORITHM_UNIQUE_HPP
+
+#include <boost/compute/system.hpp>
+#include <boost/compute/command_queue.hpp>
+#include <boost/compute/algorithm/unique_copy.hpp>
+#include <boost/compute/container/vector.hpp>
+#include <boost/compute/functional/operator.hpp>
+
+namespace boost {
+namespace compute {
+
+/// Collapses every run of consecutive elements that are equal according to
+/// \p op in the range [\p first, \p last) down to a single element. The
+/// overload without \p op compares elements with the equality operator.
+///
+/// \param first iterator to the beginning of the range to process
+/// \param last iterator to the end of the range to process
+/// \param op binary predicate deciding whether two neighbours are duplicates
+/// \param queue command queue on which the operation is performed
+///
+/// \return \c InputIterator to the new logical end of the range
+///
+/// \see unique_copy()
+template<class InputIterator, class BinaryPredicate>
+inline InputIterator unique(InputIterator first,
+                            InputIterator last,
+                            BinaryPredicate op,
+                            command_queue &queue = system::default_queue())
+{
+    // duplicate the input into a temporary vector so that unique_copy() can
+    // read from the duplicate while writing its output back starting at first
+    typedef typename std::iterator_traits<InputIterator>::value_type element_type;
+    vector<element_type> snapshot(first, last, queue);
+
+    return ::boost::compute::unique_copy(
+        snapshot.begin(), snapshot.end(), first, op, queue);
+}
+
+/// \overload
+template<class InputIterator>
+inline InputIterator unique(InputIterator first,
+                            InputIterator last,
+                            command_queue &queue = system::default_queue())
+{
+    // default predicate: equal_to on the iterator's value type
+    typedef ::boost::compute::equal_to<
+        typename std::iterator_traits<InputIterator>::value_type> equality;
+
+    return ::boost::compute::unique(first, last, equality(), queue);
+}
+
+} // end compute namespace
+} // end boost namespace
+
+#endif // BOOST_COMPUTE_ALGORITHM_UNIQUE_HPP
diff --git a/boost/compute/algorithm/unique_copy.hpp b/boost/compute/algorithm/unique_copy.hpp
new file mode 100644
index 0000000000..2ce60a9359
--- /dev/null
+++ b/boost/compute/algorithm/unique_copy.hpp
@@ -0,0 +1,164 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2014 Roshan <thisisroshansmail@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#ifndef BOOST_COMPUTE_ALGORITHM_UNIQUE_COPY_HPP
+#define BOOST_COMPUTE_ALGORITHM_UNIQUE_COPY_HPP
+
+#include <boost/compute/command_queue.hpp>
+#include <boost/compute/lambda.hpp>
+#include <boost/compute/system.hpp>
+#include <boost/compute/algorithm/copy_if.hpp>
+#include <boost/compute/algorithm/transform.hpp>
+#include <boost/compute/algorithm/gather.hpp>
+#include <boost/compute/container/vector.hpp>
+#include <boost/compute/detail/iterator_range_size.hpp>
+#include <boost/compute/detail/meta_kernel.hpp>
+#include <boost/compute/functional/operator.hpp>
+
+namespace boost {
+namespace compute {
+namespace detail {
+// Sequential unique_copy(): one kernel loop scans the input and counts the elements written.
+template<class InputIterator, class OutputIterator, class BinaryPredicate>
+inline OutputIterator serial_unique_copy(InputIterator first,
+ InputIterator last,
+ OutputIterator result,
+ BinaryPredicate op,
+ command_queue &queue)
+{
+ if(first == last){
+ return result;
+ }
+ // element type used for the kernel's "current" and "next" variables
+ typedef typename std::iterator_traits<InputIterator>::value_type value_type;
+
+ const context &context = queue.get_context();
+
+ size_t count = detail::iterator_range_size(first, last);
+
+ detail::meta_kernel k("serial_unique_copy");
+ // one-element vector that receives the kernel's unique_count output
+ vector<uint_> unique_count_vector(1, context);
+ // kernel arguments: input length and a global-memory pointer for the count
+ size_t size_arg = k.add_arg<const uint_>("size");
+ size_t unique_count_arg = k.add_arg<uint_ *>(memory_object::global_memory, "unique_count");
+ // kernel body: write element 0, then write every element for which op(current, next) is false
+ k << k.decl<uint_>("index") << " = 0;\n"
+ << k.decl<value_type>("current") << " = " << first[k.var<uint_>("0")] << ";\n"
+ << result[k.var<uint_>("0")] << " = current;\n"
+ << "for(uint i = 1; i < size; i++){\n"
+ << " " << k.decl<value_type>("next") << " = " << first[k.var<uint_>("i")] << ";\n"
+ << " if(!" << op(k.var<value_type>("current"), k.var<value_type>("next")) << "){\n"
+ << " " << result[k.var<uint_>("++index")] << " = next;\n"
+ << " " << "current = next;\n"
+ << " }\n"
+ << "}\n"
+ << "*unique_count = index + 1;\n";
+ // bind the argument values
+ k.set_arg<const uint_>(size_arg, count);
+ k.set_arg(unique_count_arg, unique_count_vector.get_buffer());
+ // NOTE(review): exec_1d(queue, 0, 1, 1) presumably launches a single work-item -- confirm
+ k.exec_1d(queue, 0, 1, 1);
+ // read the count back into a host variable
+ uint_ unique_count;
+ copy_n(unique_count_vector.begin(), 1, &unique_count, queue);
+ // end of output = result advanced by the number of elements written
+ return result + unique_count;
+}
+
+// Flag-and-gather implementation of unique_copy(): marks the first element
+// of every run of neighbours that are duplicates according to op, collects
+// the indices of the marked elements and gathers the corresponding values
+// into the output. Returns the end of the written output range.
+template<class InputIterator, class OutputIterator, class BinaryPredicate>
+inline OutputIterator unique_copy(InputIterator first,
+                                  InputIterator last,
+                                  OutputIterator result,
+                                  BinaryPredicate op,
+                                  command_queue &queue)
+{
+    // nothing to do for an empty range
+    if(first == last){
+        return result;
+    }
+
+    const context &ctx = queue.get_context();
+    const size_t length = detail::iterator_range_size(first, last);
+
+    // keep[i] is set for i >= 1 when op does not hold between element i - 1
+    // and element i, i.e. when element i starts a new run
+    vector<uint_> keep(length, ctx);
+    transform(first, last - 1, first + 1, keep.begin() + 1, not2(op), queue);
+
+    // element 0 has no predecessor and is therefore always kept
+    fill_n(keep.begin(), 1, 1, queue);
+
+    // collect the input positions of all kept elements
+    vector<uint_> positions(length, ctx);
+    const vector<uint_>::iterator positions_end = detail::copy_index_if(
+        keep.begin(), keep.end(), positions.begin(), lambda::_1 == 1, queue
+    );
+
+    // fetch the kept values from the input through the collected positions
+    gather(positions.begin(), positions_end, first, result, queue);
+
+    // the output holds exactly as many elements as positions were collected
+    return result + std::distance(positions.begin(), positions_end);
+}
+
+} // end detail namespace
+
+/// Copies the range [\p first, \p last) to the range beginning at \p result,
+/// keeping only one element of every run of consecutive elements that are
+/// duplicates according to \p op. Without \p op the equality operator is used.
+///
+/// \param first iterator to the beginning of the input range
+/// \param last iterator to the end of the input range
+/// \param result iterator to the beginning of the result range
+/// \param op binary predicate deciding whether two neighbours are duplicates
+/// \param queue command queue on which the operation is performed
+///
+/// \return \c OutputIterator to the end of the result range
+///
+/// \see unique()
+template<class InputIterator, class OutputIterator, class BinaryPredicate>
+inline OutputIterator unique_copy(InputIterator first,
+                                  InputIterator last,
+                                  OutputIterator result,
+                                  BinaryPredicate op,
+                                  command_queue &queue = system::default_queue())
+{
+    // ranges below this length use the serial kernel instead of the general path
+    const size_t serial_threshold = 32;
+    const size_t length = detail::iterator_range_size(first, last);
+
+    return (length < serial_threshold)
+        ? detail::serial_unique_copy(first, last, result, op, queue)
+        : detail::unique_copy(first, last, result, op, queue);
+}
+
+/// \overload
+template<class InputIterator, class OutputIterator>
+inline OutputIterator unique_copy(InputIterator first,
+                                  InputIterator last,
+                                  OutputIterator result,
+                                  command_queue &queue = system::default_queue())
+{
+    // default predicate: equal_to on the input iterator's value type
+    typedef ::boost::compute::equal_to<
+        typename std::iterator_traits<InputIterator>::value_type> equality;
+
+    return ::boost::compute::unique_copy(first, last, result, equality(), queue);
+}
+
+} // end compute namespace
+} // end boost namespace
+
+#endif // BOOST_COMPUTE_ALGORITHM_UNIQUE_COPY_HPP
diff --git a/boost/compute/algorithm/upper_bound.hpp b/boost/compute/algorithm/upper_bound.hpp
new file mode 100644
index 0000000000..a5a82d301c
--- /dev/null
+++ b/boost/compute/algorithm/upper_bound.hpp
@@ -0,0 +1,43 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#ifndef BOOST_COMPUTE_ALGORITHM_UPPER_BOUND_HPP
+#define BOOST_COMPUTE_ALGORITHM_UPPER_BOUND_HPP
+
+#include <boost/compute/lambda.hpp>
+#include <boost/compute/system.hpp>
+#include <boost/compute/command_queue.hpp>
+#include <boost/compute/algorithm/detail/binary_find.hpp>
+
+namespace boost {
+namespace compute {
+
+/// Returns an iterator pointing to the first element in the sorted range
+/// [\p first, \p last) that is greater than \p value, i.e. the first
+/// element that is not less than or equal to it. The position is located
+/// with detail::binary_find() using the predicate <tt>_1 > value</tt>.
+template<class InputIterator, class T>
+inline InputIterator
+upper_bound(InputIterator first,
+            InputIterator last,
+            const T &value,
+            command_queue &queue = system::default_queue())
+{
+    // _1 stands for the element under test in the predicate expression
+    using ::boost::compute::_1;
+
+    // the search result is handed back to the caller unchanged
+    return detail::binary_find(first, last, _1 > value, queue);
+}
+
+} // end compute namespace
+} // end boost namespace
+
+#endif // BOOST_COMPUTE_ALGORITHM_UPPER_BOUND_HPP