diff options
Diffstat (limited to 'boost/compute/algorithm')
112 files changed, 12774 insertions, 0 deletions
diff --git a/boost/compute/algorithm/accumulate.hpp b/boost/compute/algorithm/accumulate.hpp new file mode 100644 index 0000000000..328420a07c --- /dev/null +++ b/boost/compute/algorithm/accumulate.hpp @@ -0,0 +1,184 @@ +//---------------------------------------------------------------------------// +// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com> +// +// Distributed under the Boost Software License, Version 1.0 +// See accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt +// +// See http://boostorg.github.com/compute for more information. +//---------------------------------------------------------------------------// + +#ifndef BOOST_COMPUTE_ALGORITHM_ACCUMULATE_HPP +#define BOOST_COMPUTE_ALGORITHM_ACCUMULATE_HPP + +#include <boost/preprocessor/seq/for_each.hpp> + +#include <boost/compute/system.hpp> +#include <boost/compute/functional.hpp> +#include <boost/compute/command_queue.hpp> +#include <boost/compute/algorithm/reduce.hpp> +#include <boost/compute/algorithm/detail/serial_accumulate.hpp> +#include <boost/compute/container/array.hpp> +#include <boost/compute/container/vector.hpp> +#include <boost/compute/detail/iterator_range_size.hpp> + +namespace boost { +namespace compute { +namespace detail { + +template<class InputIterator, class T, class BinaryFunction> +inline T generic_accumulate(InputIterator first, + InputIterator last, + T init, + BinaryFunction function, + command_queue &queue) +{ + const context &context = queue.get_context(); + + size_t size = iterator_range_size(first, last); + if(size == 0){ + return init; + } + + // accumulate on device + array<T, 1> device_result(context); + detail::serial_accumulate( + first, last, device_result.begin(), init, function, queue + ); + + // copy result to host + T result; + ::boost::compute::copy_n(device_result.begin(), 1, &result, queue); + return result; +} + +// returns true if we can use reduce() instead of accumulate() when +// accumulate() this is true when the 
function is commutative (such as +// addition of integers) and the initial value is the identity value +// for the operation (zero for addition, one for multiplication). +template<class T, class F> +inline bool can_accumulate_with_reduce(T init, F function) +{ + (void) init; + (void) function; + + return false; +} + +/// \internal_ +#define BOOST_COMPUTE_DETAIL_DECLARE_CAN_ACCUMULATE_WITH_REDUCE(r, data, type) \ + inline bool can_accumulate_with_reduce(type init, plus<type>) \ + { \ + return init == type(0); \ + } \ + inline bool can_accumulate_with_reduce(type init, multiplies<type>) \ + { \ + return init == type(1); \ + } + +BOOST_PP_SEQ_FOR_EACH( + BOOST_COMPUTE_DETAIL_DECLARE_CAN_ACCUMULATE_WITH_REDUCE, + _, + (char_)(uchar_)(short_)(ushort_)(int_)(uint_)(long_)(ulong_) +) + +template<class T> +inline bool can_accumulate_with_reduce(T init, min<T>) +{ + return init == (std::numeric_limits<T>::max)(); +} + +template<class T> +inline bool can_accumulate_with_reduce(T init, max<T>) +{ + return init == (std::numeric_limits<T>::min)(); +} + +#undef BOOST_COMPUTE_DETAIL_DECLARE_CAN_ACCUMULATE_WITH_REDUCE + +template<class InputIterator, class T, class BinaryFunction> +inline T dispatch_accumulate(InputIterator first, + InputIterator last, + T init, + BinaryFunction function, + command_queue &queue) +{ + size_t size = iterator_range_size(first, last); + if(size == 0){ + return init; + } + + if(can_accumulate_with_reduce(init, function)){ + T result; + reduce(first, last, &result, function, queue); + return result; + } + else { + return generic_accumulate(first, last, init, function, queue); + } +} + +} // end detail namespace + +/// Returns the result of applying \p function to the elements in the +/// range [\p first, \p last) and \p init. +/// +/// If no function is specified, \c plus will be used. 
+/// +/// \param first first element in the input range +/// \param last last element in the input range +/// \param init initial value +/// \param function binary reduction function +/// \param queue command queue to perform the operation +/// +/// \return the accumulated result value +/// +/// In specific situations the call to \c accumulate() can be automatically +/// optimized to a call to the more efficient \c reduce() algorithm. This +/// occurs when the binary reduction function is recognized as associative +/// (such as the \c plus<int> function). +/// +/// Note that because floating-point addition is not associative, calling +/// \c accumulate() with \c plus<float> results in a less efficient serial +/// reduction algorithm being executed. If a slight loss in precision is +/// acceptable, the more efficient parallel \c reduce() algorithm should be +/// used instead. +/// +/// For example: +/// \code +/// // with vec = boost::compute::vector<int> +/// accumulate(vec.begin(), vec.end(), 0, plus<int>()); // fast +/// reduce(vec.begin(), vec.end(), &result, plus<int>()); // fast +/// +/// // with vec = boost::compute::vector<float> +/// accumulate(vec.begin(), vec.end(), 0, plus<float>()); // slow +/// reduce(vec.begin(), vec.end(), &result, plus<float>()); // fast +/// \endcode +/// +/// \see reduce() +template<class InputIterator, class T, class BinaryFunction> +inline T accumulate(InputIterator first, + InputIterator last, + T init, + BinaryFunction function, + command_queue &queue = system::default_queue()) +{ + return detail::dispatch_accumulate(first, last, init, function, queue); +} + +/// \overload +template<class InputIterator, class T> +inline T accumulate(InputIterator first, + InputIterator last, + T init, + command_queue &queue = system::default_queue()) +{ + typedef typename std::iterator_traits<InputIterator>::value_type IT; + + return detail::dispatch_accumulate(first, last, init, plus<IT>(), queue); +} + +} // end compute namespace +} // end 
boost namespace + +#endif // BOOST_COMPUTE_ALGORITHM_ACCUMULATE_HPP diff --git a/boost/compute/algorithm/adjacent_difference.hpp b/boost/compute/algorithm/adjacent_difference.hpp new file mode 100644 index 0000000000..a8f84e020e --- /dev/null +++ b/boost/compute/algorithm/adjacent_difference.hpp @@ -0,0 +1,98 @@ +//---------------------------------------------------------------------------// +// Copyright (c) 2013-2014 Kyle Lutz <kyle.r.lutz@gmail.com> +// +// Distributed under the Boost Software License, Version 1.0 +// See accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt +// +// See http://boostorg.github.com/compute for more information. +//---------------------------------------------------------------------------// + +#ifndef BOOST_COMPUTE_ALGORITHM_ADJACENT_DIFFERENCE_HPP +#define BOOST_COMPUTE_ALGORITHM_ADJACENT_DIFFERENCE_HPP + +#include <iterator> + +#include <boost/compute/system.hpp> +#include <boost/compute/command_queue.hpp> +#include <boost/compute/detail/meta_kernel.hpp> +#include <boost/compute/detail/iterator_range_size.hpp> +#include <boost/compute/functional/operator.hpp> +#include <boost/compute/container/vector.hpp> + +namespace boost { +namespace compute { + +/// Stores the difference of each pair of consecutive values in the range +/// [\p first, \p last) to the range beginning at \p result. If \p op is not +/// provided, \c minus<T> is used. 
+/// +/// \param first first element in the input range +/// \param last last element in the input range +/// \param result first element in the output range +/// \param op binary difference function +/// \param queue command queue to perform the operation +/// +/// \return \c OutputIterator to the end of the result range +/// +/// \see adjacent_find() +template<class InputIterator, class OutputIterator, class BinaryFunction> +inline OutputIterator +adjacent_difference(InputIterator first, + InputIterator last, + OutputIterator result, + BinaryFunction op, + command_queue &queue = system::default_queue()) +{ + if(first == last){ + return result; + } + + size_t count = detail::iterator_range_size(first, last); + + detail::meta_kernel k("adjacent_difference"); + + k << "const uint i = get_global_id(0);\n" + << "if(i == 0){\n" + << " " << result[k.var<uint_>("0")] << " = " << first[k.var<uint_>("0")] << ";\n" + << "}\n" + << "else {\n" + << " " << result[k.var<uint_>("i")] << " = " + << op(first[k.var<uint_>("i")], first[k.var<uint_>("i-1")]) << ";\n" + << "}\n"; + + k.exec_1d(queue, 0, count, 1); + + return result + count; +} + +/// \overload +template<class InputIterator, class OutputIterator> +inline OutputIterator +adjacent_difference(InputIterator first, + InputIterator last, + OutputIterator result, + command_queue &queue = system::default_queue()) +{ + typedef typename std::iterator_traits<InputIterator>::value_type value_type; + + if (first == result) { + vector<value_type> temp(detail::iterator_range_size(first, last), + queue.get_context()); + copy(first, last, temp.begin(), queue); + + return ::boost::compute::adjacent_difference( + temp.begin(), temp.end(), result, ::boost::compute::minus<value_type>(), queue + ); + } + else { + return ::boost::compute::adjacent_difference( + first, last, result, ::boost::compute::minus<value_type>(), queue + ); + } +} + +} // end compute namespace +} // end boost namespace + +#endif // 
BOOST_COMPUTE_ALGORITHM_ADJACENT_DIFFERENCE_HPP diff --git a/boost/compute/algorithm/adjacent_find.hpp b/boost/compute/algorithm/adjacent_find.hpp new file mode 100644 index 0000000000..992a01eddc --- /dev/null +++ b/boost/compute/algorithm/adjacent_find.hpp @@ -0,0 +1,162 @@ +//---------------------------------------------------------------------------// +// Copyright (c) 2013-2014 Kyle Lutz <kyle.r.lutz@gmail.com> +// +// Distributed under the Boost Software License, Version 1.0 +// See accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt +// +// See http://boostorg.github.com/compute for more information. +//---------------------------------------------------------------------------// + +#ifndef BOOST_COMPUTE_ALGORITHM_ADJACENT_FIND_HPP +#define BOOST_COMPUTE_ALGORITHM_ADJACENT_FIND_HPP + +#include <iterator> + +#include <boost/compute/command_queue.hpp> +#include <boost/compute/lambda.hpp> +#include <boost/compute/system.hpp> +#include <boost/compute/container/detail/scalar.hpp> +#include <boost/compute/detail/iterator_range_size.hpp> +#include <boost/compute/detail/meta_kernel.hpp> +#include <boost/compute/functional/operator.hpp> +#include <boost/compute/type_traits/vector_size.hpp> + +namespace boost { +namespace compute { +namespace detail { + +template<class InputIterator, class Compare> +inline InputIterator +serial_adjacent_find(InputIterator first, + InputIterator last, + Compare compare, + command_queue &queue) +{ + if(first == last){ + return last; + } + + const context &context = queue.get_context(); + + detail::scalar<uint_> output(context); + + detail::meta_kernel k("serial_adjacent_find"); + + size_t size_arg = k.add_arg<const uint_>("size"); + size_t output_arg = k.add_arg<uint_ *>(memory_object::global_memory, "output"); + + k << k.decl<uint_>("result") << " = size;\n" + << "for(uint i = 0; i < size - 1; i++){\n" + << " if(" << compare(first[k.expr<uint_>("i")], + first[k.expr<uint_>("i+1")]) << "){\n" + << " 
result = i;\n" + << " break;\n" + << " }\n" + << "}\n" + << "*output = result;\n"; + + k.set_arg<const uint_>( + size_arg, static_cast<uint_>(detail::iterator_range_size(first, last)) + ); + k.set_arg(output_arg, output.get_buffer()); + + k.exec_1d(queue, 0, 1, 1); + + return first + output.read(queue); +} + +template<class InputIterator, class Compare> +inline InputIterator +adjacent_find_with_atomics(InputIterator first, + InputIterator last, + Compare compare, + command_queue &queue) +{ + if(first == last){ + return last; + } + + const context &context = queue.get_context(); + size_t count = detail::iterator_range_size(first, last); + + // initialize output to the last index + detail::scalar<uint_> output(context); + output.write(static_cast<uint_>(count), queue); + + detail::meta_kernel k("adjacent_find_with_atomics"); + + size_t output_arg = k.add_arg<uint_ *>(memory_object::global_memory, "output"); + + k << "const uint i = get_global_id(0);\n" + << "if(" << compare(first[k.expr<uint_>("i")], + first[k.expr<uint_>("i+1")]) << "){\n" + << " atomic_min(output, i);\n" + << "}\n"; + + k.set_arg(output_arg, output.get_buffer()); + + k.exec_1d(queue, 0, count - 1, 1); + + return first + output.read(queue); +} + +} // end detail namespace + +/// Searches the range [\p first, \p last) for two identical adjacent +/// elements and returns an iterator pointing to the first. +/// +/// \param first first element in the range to search +/// \param last last element in the range to search +/// \param compare binary comparison function +/// \param queue command queue to perform the operation +/// +/// \return \c InputIteratorm to the first element which compares equal +/// to the following element. If none are equal, returns \c last. 
+/// +/// \see find(), adjacent_difference() +template<class InputIterator, class Compare> +inline InputIterator +adjacent_find(InputIterator first, + InputIterator last, + Compare compare, + command_queue &queue = system::default_queue()) +{ + size_t count = detail::iterator_range_size(first, last); + if(count < 32){ + return detail::serial_adjacent_find(first, last, compare, queue); + } + else { + return detail::adjacent_find_with_atomics(first, last, compare, queue); + } +} + +/// \overload +template<class InputIterator> +inline InputIterator +adjacent_find(InputIterator first, + InputIterator last, + command_queue &queue = system::default_queue()) +{ + typedef typename std::iterator_traits<InputIterator>::value_type value_type; + + using ::boost::compute::lambda::_1; + using ::boost::compute::lambda::_2; + using ::boost::compute::lambda::all; + + if(vector_size<value_type>::value == 1){ + return ::boost::compute::adjacent_find( + first, last, _1 == _2, queue + ); + } + else { + return ::boost::compute::adjacent_find( + first, last, all(_1 == _2), queue + ); + } +} + +} // end compute namespace +} // end boost namespace + +#endif // BOOST_COMPUTE_ALGORITHM_ADJACENT_FIND_HPP diff --git a/boost/compute/algorithm/all_of.hpp b/boost/compute/algorithm/all_of.hpp new file mode 100644 index 0000000000..34d7518f32 --- /dev/null +++ b/boost/compute/algorithm/all_of.hpp @@ -0,0 +1,36 @@ +//---------------------------------------------------------------------------// +// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com> +// +// Distributed under the Boost Software License, Version 1.0 +// See accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt +// +// See http://boostorg.github.com/compute for more information. 
+//---------------------------------------------------------------------------// + +#ifndef BOOST_COMPUTE_ALGORITHM_ALL_OF_HPP +#define BOOST_COMPUTE_ALGORITHM_ALL_OF_HPP + +#include <boost/compute/system.hpp> +#include <boost/compute/algorithm/find_if_not.hpp> + +namespace boost { +namespace compute { + +/// Returns \c true if \p predicate returns \c true for all of the elements in +/// the range [\p first, \p last). +/// +/// \see any_of(), none_of() +template<class InputIterator, class UnaryPredicate> +inline bool all_of(InputIterator first, + InputIterator last, + UnaryPredicate predicate, + command_queue &queue = system::default_queue()) +{ + return ::boost::compute::find_if_not(first, last, predicate, queue) == last; +} + +} // end compute namespace +} // end boost namespace + +#endif // BOOST_COMPUTE_ALGORITHM_ALL_OF_HPP diff --git a/boost/compute/algorithm/any_of.hpp b/boost/compute/algorithm/any_of.hpp new file mode 100644 index 0000000000..b07779597c --- /dev/null +++ b/boost/compute/algorithm/any_of.hpp @@ -0,0 +1,40 @@ +//---------------------------------------------------------------------------// +// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com> +// +// Distributed under the Boost Software License, Version 1.0 +// See accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt +// +// See http://boostorg.github.com/compute for more information. +//---------------------------------------------------------------------------// + +#ifndef BOOST_COMPUTE_ALGORITHM_ANY_OF_HPP +#define BOOST_COMPUTE_ALGORITHM_ANY_OF_HPP + +#include <boost/compute/system.hpp> +#include <boost/compute/algorithm/find_if.hpp> + +namespace boost { +namespace compute { + +/// Returns \c true if \p predicate returns \c true for any of the elements in +/// the range [\p first, \p last). 
+/// +/// For example, to test if a vector contains any negative values: +/// +/// \snippet test/test_any_all_none_of.cpp any_of +/// +/// \see all_of(), none_of() +template<class InputIterator, class UnaryPredicate> +inline bool any_of(InputIterator first, + InputIterator last, + UnaryPredicate predicate, + command_queue &queue = system::default_queue()) +{ + return ::boost::compute::find_if(first, last, predicate, queue) != last; +} + +} // end compute namespace +} // end boost namespace + +#endif // BOOST_COMPUTE_ALGORITHM_ANY_OF_HPP diff --git a/boost/compute/algorithm/binary_search.hpp b/boost/compute/algorithm/binary_search.hpp new file mode 100644 index 0000000000..6e19498790 --- /dev/null +++ b/boost/compute/algorithm/binary_search.hpp @@ -0,0 +1,37 @@ +//---------------------------------------------------------------------------// +// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com> +// +// Distributed under the Boost Software License, Version 1.0 +// See accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt +// +// See http://boostorg.github.com/compute for more information. +//---------------------------------------------------------------------------// + +#ifndef BOOST_COMPUTE_ALGORITHM_BINARY_SEARCH_HPP +#define BOOST_COMPUTE_ALGORITHM_BINARY_SEARCH_HPP + +#include <boost/compute/system.hpp> +#include <boost/compute/command_queue.hpp> +#include <boost/compute/algorithm/lower_bound.hpp> + +namespace boost { +namespace compute { + +/// Returns \c true if \p value is in the sorted range [\p first, +/// \p last). 
+template<class InputIterator, class T> +inline bool binary_search(InputIterator first, + InputIterator last, + const T &value, + command_queue &queue = system::default_queue()) +{ + InputIterator position = lower_bound(first, last, value, queue); + + return position != last && position.read(queue) == value; +} + +} // end compute namespace +} // end boost namespace + +#endif // BOOST_COMPUTE_ALGORITHM_BINARY_SEARCH_HPP diff --git a/boost/compute/algorithm/copy.hpp b/boost/compute/algorithm/copy.hpp new file mode 100644 index 0000000000..2a25059bba --- /dev/null +++ b/boost/compute/algorithm/copy.hpp @@ -0,0 +1,362 @@ +//---------------------------------------------------------------------------// +// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com> +// +// Distributed under the Boost Software License, Version 1.0 +// See accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt +// +// See http://boostorg.github.com/compute for more information. +//---------------------------------------------------------------------------// + +#ifndef BOOST_COMPUTE_ALGORITHM_COPY_HPP +#define BOOST_COMPUTE_ALGORITHM_COPY_HPP + +#include <algorithm> +#include <iterator> + +#include <boost/utility/enable_if.hpp> + +#include <boost/mpl/and.hpp> +#include <boost/mpl/not.hpp> + +#include <boost/compute/buffer.hpp> +#include <boost/compute/system.hpp> +#include <boost/compute/command_queue.hpp> +#include <boost/compute/algorithm/detail/copy_on_device.hpp> +#include <boost/compute/algorithm/detail/copy_to_device.hpp> +#include <boost/compute/algorithm/detail/copy_to_host.hpp> +#include <boost/compute/async/future.hpp> +#include <boost/compute/detail/is_contiguous_iterator.hpp> +#include <boost/compute/detail/iterator_range_size.hpp> +#include <boost/compute/iterator/buffer_iterator.hpp> +#include <boost/compute/type_traits/is_device_iterator.hpp> + +namespace boost { +namespace compute { +namespace detail { + +namespace mpl = boost::mpl; + +// 
meta-function returning true if copy() between InputIterator and +// OutputIterator can be implemented with clEnqueueCopyBuffer(). +template<class InputIterator, class OutputIterator> +struct can_copy_with_copy_buffer : + mpl::and_< + boost::is_same< + InputIterator, + buffer_iterator<typename InputIterator::value_type> + >, + boost::is_same< + OutputIterator, + buffer_iterator<typename OutputIterator::value_type> + >, + boost::is_same< + typename InputIterator::value_type, + typename OutputIterator::value_type + > + >::type {}; + +// host -> device +template<class InputIterator, class OutputIterator> +inline OutputIterator +dispatch_copy(InputIterator first, + InputIterator last, + OutputIterator result, + command_queue &queue, + typename boost::enable_if_c< + !is_device_iterator<InputIterator>::value && + is_device_iterator<OutputIterator>::value + >::type* = 0) +{ + if(is_contiguous_iterator<InputIterator>::value){ + return copy_to_device(first, last, result, queue); + } + else { + // for non-contiguous input we first copy the values to + // a temporary std::vector and then copy from there + typedef typename std::iterator_traits<InputIterator>::value_type T; + std::vector<T> vector(first, last); + return copy_to_device(vector.begin(), vector.end(), result, queue); + } +} + +// host -> device (async) +template<class InputIterator, class OutputIterator> +inline future<OutputIterator> +dispatch_copy_async(InputIterator first, + InputIterator last, + OutputIterator result, + command_queue &queue, + typename boost::enable_if_c< + !is_device_iterator<InputIterator>::value && + is_device_iterator<OutputIterator>::value + >::type* = 0) +{ + BOOST_STATIC_ASSERT_MSG( + is_contiguous_iterator<InputIterator>::value, + "copy_async() is only supported for contiguous host iterators" + ); + + return copy_to_device_async(first, last, result, queue); +} + +// device -> host +template<class InputIterator, class OutputIterator> +inline OutputIterator +dispatch_copy(InputIterator 
first, + InputIterator last, + OutputIterator result, + command_queue &queue, + typename boost::enable_if_c< + is_device_iterator<InputIterator>::value && + !is_device_iterator<OutputIterator>::value + >::type* = 0) +{ + if(is_contiguous_iterator<OutputIterator>::value){ + return copy_to_host(first, last, result, queue); + } + else { + // for non-contiguous input we first copy the values to + // a temporary std::vector and then copy from there + typedef typename std::iterator_traits<InputIterator>::value_type T; + std::vector<T> vector(iterator_range_size(first, last)); + copy_to_host(first, last, vector.begin(), queue); + return std::copy(vector.begin(), vector.end(), result); + } +} + +// device -> host (async) +template<class InputIterator, class OutputIterator> +inline future<OutputIterator> +dispatch_copy_async(InputIterator first, + InputIterator last, + OutputIterator result, + command_queue &queue, + typename boost::enable_if_c< + is_device_iterator<InputIterator>::value && + !is_device_iterator<OutputIterator>::value + >::type* = 0) +{ + BOOST_STATIC_ASSERT_MSG( + is_contiguous_iterator<OutputIterator>::value, + "copy_async() is only supported for contiguous host iterators" + ); + + return copy_to_host_async(first, last, result, queue); +} + +// device -> device +template<class InputIterator, class OutputIterator> +inline OutputIterator +dispatch_copy(InputIterator first, + InputIterator last, + OutputIterator result, + command_queue &queue, + typename boost::enable_if< + mpl::and_< + is_device_iterator<InputIterator>, + is_device_iterator<OutputIterator>, + mpl::not_< + can_copy_with_copy_buffer< + InputIterator, OutputIterator + > + > + > + >::type* = 0) +{ + return copy_on_device(first, last, result, queue); +} + +// device -> device (specialization for buffer iterators) +template<class InputIterator, class OutputIterator> +inline OutputIterator +dispatch_copy(InputIterator first, + InputIterator last, + OutputIterator result, + command_queue &queue, + 
typename boost::enable_if< + mpl::and_< + is_device_iterator<InputIterator>, + is_device_iterator<OutputIterator>, + can_copy_with_copy_buffer< + InputIterator, OutputIterator + > + > + >::type* = 0) +{ + typedef typename std::iterator_traits<InputIterator>::value_type value_type; + typedef typename std::iterator_traits<InputIterator>::difference_type difference_type; + + difference_type n = std::distance(first, last); + if(n < 1){ + // nothing to copy + return result; + } + + queue.enqueue_copy_buffer(first.get_buffer(), + result.get_buffer(), + first.get_index() * sizeof(value_type), + result.get_index() * sizeof(value_type), + static_cast<size_t>(n) * sizeof(value_type)); + return result + n; +} + +// device -> device (async) +template<class InputIterator, class OutputIterator> +inline future<OutputIterator> +dispatch_copy_async(InputIterator first, + InputIterator last, + OutputIterator result, + command_queue &queue, + typename boost::enable_if< + mpl::and_< + is_device_iterator<InputIterator>, + is_device_iterator<OutputIterator>, + mpl::not_< + can_copy_with_copy_buffer< + InputIterator, OutputIterator + > + > + > + >::type* = 0) +{ + return copy_on_device_async(first, last, result, queue); +} + +// device -> device (async, specialization for buffer iterators) +template<class InputIterator, class OutputIterator> +inline future<OutputIterator> +dispatch_copy_async(InputIterator first, + InputIterator last, + OutputIterator result, + command_queue &queue, + typename boost::enable_if< + mpl::and_< + is_device_iterator<InputIterator>, + is_device_iterator<OutputIterator>, + can_copy_with_copy_buffer< + InputIterator, OutputIterator + > + > + >::type* = 0) +{ + typedef typename std::iterator_traits<InputIterator>::value_type value_type; + typedef typename std::iterator_traits<InputIterator>::difference_type difference_type; + + difference_type n = std::distance(first, last); + if(n < 1){ + // nothing to copy + return make_future(result, event()); + } + + event 
event_ = + queue.enqueue_copy_buffer( + first.get_buffer(), + result.get_buffer(), + first.get_index() * sizeof(value_type), + result.get_index() * sizeof(value_type), + static_cast<size_t>(n) * sizeof(value_type) + ); + + return make_future(result + n, event_); +} + +// host -> host +template<class InputIterator, class OutputIterator> +inline OutputIterator +dispatch_copy(InputIterator first, + InputIterator last, + OutputIterator result, + command_queue &queue, + typename boost::enable_if_c< + !is_device_iterator<InputIterator>::value && + !is_device_iterator<OutputIterator>::value + >::type* = 0) +{ + (void) queue; + + return std::copy(first, last, result); +} + +} // end detail namespace + +/// Copies the values in the range [\p first, \p last) to the range +/// beginning at \p result. +/// +/// The generic copy() function can be used for a variety of data +/// transfer tasks and provides a standard interface to the following +/// OpenCL functions: +/// +/// \li \c clEnqueueReadBuffer() +/// \li \c clEnqueueWriteBuffer() +/// \li \c clEnqueueCopyBuffer() +/// +/// Unlike the aforementioned OpenCL functions, copy() will also work +/// with non-contiguous data-structures (e.g. \c std::list<T>) as +/// well as with "fancy" iterators (e.g. transform_iterator). 
+/// +/// \param first first element in the range to copy +/// \param last last element in the range to copy +/// \param result first element in the result range +/// \param queue command queue to perform the operation +/// +/// \return \c OutputIterator to the end of the result range +/// +/// For example, to copy an array of \c int values on the host to a vector on +/// the device: +/// \code +/// // array on the host +/// int data[] = { 1, 2, 3, 4 }; +/// +/// // vector on the device +/// boost::compute::vector<int> vec(4, context); +/// +/// // copy values to the device vector +/// boost::compute::copy(data, data + 4, vec.begin(), queue); +/// \endcode +/// +/// The copy algorithm can also be used with standard containers such as +/// \c std::vector<T>: +/// \code +/// std::vector<int> host_vector = ... +/// boost::compute::vector<int> device_vector = ... +/// +/// // copy from the host to the device +/// boost::compute::copy( +/// host_vector.begin(), host_vector.end(), device_vector.begin(), queue +/// ); +/// +/// // copy from the device to the host +/// boost::compute::copy( +/// device_vector.begin(), device_vector.end(), host_vector.begin(), queue +/// ); +/// \endcode +/// +/// \see copy_n(), copy_if(), copy_async() +template<class InputIterator, class OutputIterator> +inline OutputIterator copy(InputIterator first, + InputIterator last, + OutputIterator result, + command_queue &queue = system::default_queue()) +{ + return detail::dispatch_copy(first, last, result, queue); +} + +/// Copies the values in the range [\p first, \p last) to the range +/// beginning at \p result. The copy is performed asynchronously. 
+/// +/// \see copy() +template<class InputIterator, class OutputIterator> +inline future<OutputIterator> +copy_async(InputIterator first, + InputIterator last, + OutputIterator result, + command_queue &queue = system::default_queue()) +{ + return detail::dispatch_copy_async(first, last, result, queue); +} + +} // end compute namespace +} // end boost namespace + +#endif // BOOST_COMPUTE_ALGORITHM_COPY_HPP diff --git a/boost/compute/algorithm/copy_if.hpp b/boost/compute/algorithm/copy_if.hpp new file mode 100644 index 0000000000..3cd08ef293 --- /dev/null +++ b/boost/compute/algorithm/copy_if.hpp @@ -0,0 +1,58 @@ +//---------------------------------------------------------------------------// +// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com> +// +// Distributed under the Boost Software License, Version 1.0 +// See accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt +// +// See http://boostorg.github.com/compute for more information. +//---------------------------------------------------------------------------// + +#ifndef BOOST_COMPUTE_ALGORITHM_COPY_IF_HPP +#define BOOST_COMPUTE_ALGORITHM_COPY_IF_HPP + +#include <boost/compute/algorithm/transform_if.hpp> +#include <boost/compute/functional/identity.hpp> + +namespace boost { +namespace compute { +namespace detail { + +// like the copy_if() algorithm but writes the indices of the values for which +// predicate returns true. 
+template<class InputIterator, class OutputIterator, class Predicate> +inline OutputIterator copy_index_if(InputIterator first, + InputIterator last, + OutputIterator result, + Predicate predicate, + command_queue &queue = system::default_queue()) +{ + typedef typename std::iterator_traits<InputIterator>::value_type T; + + return detail::transform_if_impl( + first, last, result, identity<T>(), predicate, true, queue + ); +} + +} // end detail namespace + +/// Copies each element in the range [\p first, \p last) for which +/// \p predicate returns \c true to the range beginning at \p result. +template<class InputIterator, class OutputIterator, class Predicate> +inline OutputIterator copy_if(InputIterator first, + InputIterator last, + OutputIterator result, + Predicate predicate, + command_queue &queue = system::default_queue()) +{ + typedef typename std::iterator_traits<InputIterator>::value_type T; + + return ::boost::compute::transform_if( + first, last, result, identity<T>(), predicate, queue + ); +} + +} // end compute namespace +} // end boost namespace + +#endif // BOOST_COMPUTE_ALGORITHM_COPY_IF_HPP diff --git a/boost/compute/algorithm/copy_n.hpp b/boost/compute/algorithm/copy_n.hpp new file mode 100644 index 0000000000..f0989edc67 --- /dev/null +++ b/boost/compute/algorithm/copy_n.hpp @@ -0,0 +1,51 @@ +//---------------------------------------------------------------------------// +// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com> +// +// Distributed under the Boost Software License, Version 1.0 +// See accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt +// +// See http://boostorg.github.com/compute for more information. 
+//---------------------------------------------------------------------------// + +#ifndef BOOST_COMPUTE_ALGORITHM_COPY_N_HPP +#define BOOST_COMPUTE_ALGORITHM_COPY_N_HPP + +#include <boost/compute/system.hpp> +#include <boost/compute/command_queue.hpp> +#include <boost/compute/algorithm/copy.hpp> + +namespace boost { +namespace compute { + +/// Copies \p count elements from \p first to \p result. +/// +/// For example, to copy four values from the host to the device: +/// \code +/// // values on the host and vector on the device +/// float values[4] = { 1.f, 2.f, 3.f, 4.f }; +/// boost::compute::vector<float> vec(4, context); +/// +/// // copy from the host to the device +/// boost::compute::copy_n(values, 4, vec.begin(), queue); +/// \endcode +/// +/// \see copy() +template<class InputIterator, class Size, class OutputIterator> +inline OutputIterator copy_n(InputIterator first, + Size count, + OutputIterator result, + command_queue &queue = system::default_queue()) +{ + typedef typename std::iterator_traits<InputIterator>::difference_type difference_type; + + return ::boost::compute::copy(first, + first + static_cast<difference_type>(count), + result, + queue); +} + +} // end compute namespace +} // end boost namespace + +#endif // BOOST_COMPUTE_ALGORITHM_COPY_N_HPP diff --git a/boost/compute/algorithm/count.hpp b/boost/compute/algorithm/count.hpp new file mode 100644 index 0000000000..140d67379f --- /dev/null +++ b/boost/compute/algorithm/count.hpp @@ -0,0 +1,55 @@ +//---------------------------------------------------------------------------// +// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com> +// +// Distributed under the Boost Software License, Version 1.0 +// See accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt +// +// See http://boostorg.github.com/compute for more information. 
+//---------------------------------------------------------------------------// + +#ifndef BOOST_COMPUTE_ALGORITHM_COUNT_HPP +#define BOOST_COMPUTE_ALGORITHM_COUNT_HPP + +#include <boost/compute/lambda.hpp> +#include <boost/compute/system.hpp> +#include <boost/compute/command_queue.hpp> +#include <boost/compute/algorithm/count_if.hpp> +#include <boost/compute/type_traits/vector_size.hpp> + +namespace boost { +namespace compute { + +/// Returns the number of occurrences of \p value in the range +/// [\p first, \p last). +/// +/// \see count_if() +template<class InputIterator, class T> +inline size_t count(InputIterator first, + InputIterator last, + const T &value, + command_queue &queue = system::default_queue()) +{ + typedef typename std::iterator_traits<InputIterator>::value_type value_type; + + using ::boost::compute::_1; + using ::boost::compute::lambda::all; + + if(vector_size<value_type>::value == 1){ + return ::boost::compute::count_if(first, + last, + _1 == value, + queue); + } + else { + return ::boost::compute::count_if(first, + last, + all(_1 == value), + queue); + } +} + +} // end compute namespace +} // end boost namespace + +#endif // BOOST_COMPUTE_ALGORITHM_COUNT_HPP diff --git a/boost/compute/algorithm/count_if.hpp b/boost/compute/algorithm/count_if.hpp new file mode 100644 index 0000000000..c9381ce5d4 --- /dev/null +++ b/boost/compute/algorithm/count_if.hpp @@ -0,0 +1,62 @@ +//---------------------------------------------------------------------------// +// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com> +// +// Distributed under the Boost Software License, Version 1.0 +// See accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt +// +// See http://boostorg.github.com/compute for more information. 
+//---------------------------------------------------------------------------// + +#ifndef BOOST_COMPUTE_ALGORITHM_COUNT_IF_HPP +#define BOOST_COMPUTE_ALGORITHM_COUNT_IF_HPP + +#include <boost/compute/device.hpp> +#include <boost/compute/system.hpp> +#include <boost/compute/command_queue.hpp> +#include <boost/compute/algorithm/detail/count_if_with_ballot.hpp> +#include <boost/compute/algorithm/detail/count_if_with_reduce.hpp> +#include <boost/compute/algorithm/detail/count_if_with_threads.hpp> +#include <boost/compute/algorithm/detail/serial_count_if.hpp> +#include <boost/compute/detail/iterator_range_size.hpp> + +namespace boost { +namespace compute { + +/// Returns the number of elements in the range [\p first, \p last) +/// for which \p predicate returns \c true. +template<class InputIterator, class Predicate> +inline size_t count_if(InputIterator first, + InputIterator last, + Predicate predicate, + command_queue &queue = system::default_queue()) +{ + const device &device = queue.get_device(); + + size_t input_size = detail::iterator_range_size(first, last); + if(input_size == 0){ + return 0; + } + + if(device.type() & device::cpu){ + if(input_size < 1024){ + return detail::serial_count_if(first, last, predicate, queue); + } + else { + return detail::count_if_with_threads(first, last, predicate, queue); + } + } + else { + if(input_size < 32){ + return detail::serial_count_if(first, last, predicate, queue); + } + else { + return detail::count_if_with_reduce(first, last, predicate, queue); + } + } +} + +} // end compute namespace +} // end boost namespace + +#endif // BOOST_COMPUTE_ALGORITHM_COUNT_IF_HPP diff --git a/boost/compute/algorithm/detail/balanced_path.hpp b/boost/compute/algorithm/detail/balanced_path.hpp new file mode 100644 index 0000000000..e5025532d3 --- /dev/null +++ b/boost/compute/algorithm/detail/balanced_path.hpp @@ -0,0 +1,162 @@ +//---------------------------------------------------------------------------// +// Copyright (c) 2014 Roshan 
// <thisisroshansmail@gmail.com>
//
// Distributed under the Boost Software License, Version 1.0
// See accompanying file LICENSE_1_0.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt
//
// See http://boostorg.github.com/compute for more information.
//---------------------------------------------------------------------------//

#ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_BALANCED_PATH_HPP
#define BOOST_COMPUTE_ALGORITHM_DETAIL_BALANCED_PATH_HPP

#include <iterator>

#include <boost/compute/algorithm/find_if.hpp>
#include <boost/compute/container/vector.hpp>
#include <boost/compute/detail/iterator_range_size.hpp>
#include <boost/compute/detail/meta_kernel.hpp>
#include <boost/compute/lambda.hpp>
#include <boost/compute/system.hpp>

namespace boost {
namespace compute {
namespace detail {

///
/// \brief Balanced Path kernel class
///
/// Subclass of meta_kernel to break two sets into tiles according
/// to their balanced path.
///
/// Usage, as far as this class shows: call set_range() once to generate the
/// kernel source and record the input sizes, then exec() to launch it.
/// Work-item \c i writes one (a_index, b_index) pair to result_a[i] and
/// result_b[i].
///
class balanced_path_kernel : public meta_kernel
{
public:
    // Number of combined elements per tile. It is baked into the generated
    // source by set_range() and used by exec() to size the launch, so any
    // change must happen before set_range() is called.
    unsigned int tile_size;

    balanced_path_kernel() : meta_kernel("balanced_path")
    {
        tile_size = 4;
    }

    /// Generates the kernel source for the ranges [\p first1, \p last1) and
    /// [\p first2, \p last2), ordered by \p comp, writing the per-tile split
    /// positions to \p result_a and \p result_b.
    ///
    /// Only the sizes of the two ranges are stored here; they are bound to
    /// the "a_count" / "b_count" kernel arguments later, in exec().
    template<class InputIterator1, class InputIterator2,
             class OutputIterator1, class OutputIterator2,
             class Compare>
    void set_range(InputIterator1 first1,
                   InputIterator1 last1,
                   InputIterator2 first2,
                   InputIterator2 last2,
                   OutputIterator1 result_a,
                   OutputIterator2 result_b,
                   Compare comp)
    {
        typedef typename std::iterator_traits<InputIterator1>::value_type value_type;

        m_a_count = iterator_range_size(first1, last1);
        m_a_count_arg = add_arg<uint_>("a_count");

        m_b_count = iterator_range_size(first2, last2);
        m_b_count_arg = add_arg<uint_>("b_count");

        // Generated kernel, in three stages:
        //  1. binary search along the diagonal "a_index + b_index == target"
        //     (target = (i+1)*tile_size) for the split where comp(b, a)
        //     first holds;
        //  2. if the split leaves elements of the second range, take
        //     x = second[b_index] and binary search both ranges for the
        //     start of the run of elements not ordered before x;
        //  3. redistribute that run between the two ranges (a_advance /
        //     b_advance) and set "star" when the first range takes exactly
        //     one more element than the second.
        *this <<
            "uint i = get_global_id(0);\n" <<
            "uint target = (i+1)*" << tile_size << ";\n" <<
            "uint start = max(convert_int(0),convert_int(target)-convert_int(b_count));\n" <<
            "uint end = min(target,a_count);\n" <<
            "uint a_index, b_index;\n" <<
            "while(start<end)\n" <<
            "{\n" <<
            " a_index = (start + end)/2;\n" <<
            " b_index = target - a_index - 1;\n" <<
            " if(!(" << comp(first2[expr<uint_>("b_index")],
                             first1[expr<uint_>("a_index")]) << "))\n" <<
            " start = a_index + 1;\n" <<
            " else end = a_index;\n" <<
            "}\n" <<
            "a_index = start;\n" <<
            "b_index = target - start;\n" <<
            "if(b_index < b_count)\n" <<
            "{\n" <<
            " " << decl<const value_type>("x") << " = " <<
                   first2[expr<uint_>("b_index")] << ";\n" <<
            " uint a_start = 0, a_end = a_index, a_mid;\n" <<
            " uint b_start = 0, b_end = b_index, b_mid;\n" <<
            " while(a_start<a_end)\n" <<
            " {\n" <<
            " a_mid = (a_start + a_end)/2;\n" <<
            " if(" << comp(first1[expr<uint_>("a_mid")], expr<value_type>("x")) << ")\n" <<
            " a_start = a_mid+1;\n" <<
            " else a_end = a_mid;\n" <<
            " }\n" <<
            " while(b_start<b_end)\n" <<
            " {\n" <<
            " b_mid = (b_start + b_end)/2;\n" <<
            " if(" << comp(first2[expr<uint_>("b_mid")], expr<value_type>("x")) << ")\n" <<
            " b_start = b_mid+1;\n" <<
            " else b_end = b_mid;\n" <<
            " }\n" <<
            " uint a_run = a_index - a_start;\n" <<
            " uint b_run = b_index - b_start;\n" <<
            " uint x_count = a_run + b_run;\n" <<
            " uint b_advance = max(x_count / 2, x_count - a_run);\n" <<
            " b_end = min(b_count, b_start + b_advance + 1);\n" <<
            // NOTE(review): this literal has no trailing "\n", unlike its
            // neighbours; that only affects the layout of the generated source.
            " uint temp_start = b_index, temp_end = b_end, temp_mid;" <<
            " while(temp_start < temp_end)\n" <<
            " {\n" <<
            " temp_mid = (temp_start + temp_end + 1)/2;\n" <<
            " if(" << comp(expr<value_type>("x"), first2[expr<uint_>("temp_mid")]) << ")\n" <<
            " temp_end = temp_mid-1;\n" <<
            " else temp_start = temp_mid;\n" <<
            " }\n" <<
            " b_run = temp_start - b_start + 1;\n" <<
            " b_advance = min(b_advance, b_run);\n" <<
            " uint a_advance = x_count - b_advance;\n" <<
            " uint star = convert_uint((a_advance == b_advance + 1) " <<
            "&& (b_advance < b_run));\n" <<
            " a_index = a_start + a_advance;\n" <<
            " b_index = target - a_index + star;\n" <<
            "}\n" <<
            result_a[expr<uint_>("i")] << " = a_index;\n" <<
            result_b[expr<uint_>("i")] << " = b_index;\n";

    }

    /// Overload that orders elements with ::boost::compute::less on the
    /// value type of the first range.
    template<class InputIterator1, class InputIterator2,
             class OutputIterator1, class OutputIterator2>
    void set_range(InputIterator1 first1,
                   InputIterator1 last1,
                   InputIterator2 first2,
                   InputIterator2 last2,
                   OutputIterator1 result_a,
                   OutputIterator2 result_b)
    {
        typedef typename std::iterator_traits<InputIterator1>::value_type value_type;
        ::boost::compute::less<value_type> less_than;
        set_range(first1, last1, first2, last2, result_a, result_b, less_than);
    }

    /// Binds the two size arguments and launches one work-item per complete
    /// tile, i.e. (a_count + b_count) / tile_size of them. Returns a
    /// default-constructed event, without launching, when that number is zero.
    ///
    /// NOTE(review): reads members that only set_range() assigns, so
    /// set_range() must have been called first.
    event exec(command_queue &queue)
    {
        if((m_a_count + m_b_count)/tile_size == 0) {
            return event();
        }

        set_arg(m_a_count_arg, uint_(m_a_count));
        set_arg(m_b_count_arg, uint_(m_b_count));

        return exec_1d(queue, 0, (m_a_count + m_b_count)/tile_size);
    }

private:
    size_t m_a_count;     // size of the first input range
    size_t m_a_count_arg; // kernel argument index of "a_count"
    size_t m_b_count;     // size of the second input range
    size_t m_b_count_arg; // kernel argument index of "b_count"
};

} //end detail namespace
} //end compute namespace
} //end boost namespace

#endif // BOOST_COMPUTE_ALGORITHM_DETAIL_BALANCED_PATH_HPP

// diff --git a/boost/compute/algorithm/detail/binary_find.hpp b/boost/compute/algorithm/detail/binary_find.hpp new file mode 100644 index 0000000000..27fa11fbaf --- /dev/null +++ b/boost/compute/algorithm/detail/binary_find.hpp @@ -0,0 +1,133 @@
//---------------------------------------------------------------------------//
// Copyright (c) 2014 Roshan <thisisroshansmail@gmail.com>
//
// Distributed under the Boost Software License, Version 1.0
// See accompanying file LICENSE_1_0.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt
//
// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------// + +#ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_BINARY_FIND_HPP +#define BOOST_COMPUTE_ALGORITHM_DETAIL_BINARY_FIND_HPP + +#include <boost/compute/functional.hpp> +#include <boost/compute/algorithm/find_if.hpp> +#include <boost/compute/algorithm/transform.hpp> +#include <boost/compute/command_queue.hpp> +#include <boost/compute/detail/parameter_cache.hpp> + +namespace boost { +namespace compute { +namespace detail{ + +/// +/// \brief Binary find kernel class +/// +/// Subclass of meta_kernel to perform single step in binary find. +/// +template<class InputIterator, class UnaryPredicate> +class binary_find_kernel : public meta_kernel +{ +public: + binary_find_kernel(InputIterator first, + InputIterator last, + UnaryPredicate predicate) + : meta_kernel("binary_find") + { + typedef typename std::iterator_traits<InputIterator>::value_type value_type; + + m_index_arg = add_arg<uint_ *>(memory_object::global_memory, "index"); + m_block_arg = add_arg<uint_>("block"); + + atomic_min<uint_> atomic_min_uint; + + *this << + "uint i = get_global_id(0) * block;\n" << + decl<value_type>("value") << "=" << first[var<uint_>("i")] << ";\n" << + "if(" << predicate(var<value_type>("value")) << ") {\n" << + atomic_min_uint(var<uint_ *>("index"), var<uint_>("i")) << ";\n" << + "}\n"; + } + + size_t m_index_arg; + size_t m_block_arg; +}; + +/// +/// \brief Binary find algorithm +/// +/// Finds the end of true values in the partitioned range [first, last). 
+/// \return Iterator pointing to end of true values +/// +/// \param first Iterator pointing to start of range +/// \param last Iterator pointing to end of range +/// \param predicate Predicate according to which the range is partitioned +/// \param queue Queue on which to execute +/// +template<class InputIterator, class UnaryPredicate> +inline InputIterator binary_find(InputIterator first, + InputIterator last, + UnaryPredicate predicate, + command_queue &queue = system::default_queue()) +{ + const device &device = queue.get_device(); + + boost::shared_ptr<parameter_cache> parameters = + detail::parameter_cache::get_global_cache(device); + + const std::string cache_key = "__boost_binary_find"; + + size_t find_if_limit = 128; + size_t threads = parameters->get(cache_key, "tpb", 128); + size_t count = iterator_range_size(first, last); + + InputIterator search_first = first; + InputIterator search_last = last; + + scalar<uint_> index(queue.get_context()); + + // construct and compile binary_find kernel + binary_find_kernel<InputIterator, UnaryPredicate> + binary_find_kernel(search_first, search_last, predicate); + ::boost::compute::kernel kernel = binary_find_kernel.compile(queue.get_context()); + + // set buffer for index + kernel.set_arg(binary_find_kernel.m_index_arg, index.get_buffer()); + + while(count > find_if_limit) { + index.write(static_cast<uint_>(count), queue); + + // set block and run binary_find kernel + uint_ block = static_cast<uint_>((count - 1)/(threads - 1)); + kernel.set_arg(binary_find_kernel.m_block_arg, block); + queue.enqueue_1d_range_kernel(kernel, 0, threads, 0); + + size_t i = index.read(queue); + + if(i == count) { + search_first = search_last - ((count - 1)%(threads - 1)); + break; + } else { + search_last = search_first + i; + search_first = search_last - ((count - 1)/(threads - 1)); + } + + // Make sure that first and last stay within the input range + search_last = (std::min)(search_last, last); + search_last = 
(std::max)(search_last, first); + + search_first = (std::max)(search_first, first); + search_first = (std::min)(search_first, last); + + count = iterator_range_size(search_first, search_last); + } + + return find_if(search_first, search_last, predicate, queue); +} + +} // end detail namespace +} // end compute namespace +} // end boost namespace + +#endif // BOOST_COMPUTE_ALGORITHM_DETAIL_BINARY_FIND_HPP diff --git a/boost/compute/algorithm/detail/compact.hpp b/boost/compute/algorithm/detail/compact.hpp new file mode 100644 index 0000000000..983352d543 --- /dev/null +++ b/boost/compute/algorithm/detail/compact.hpp @@ -0,0 +1,77 @@ +//---------------------------------------------------------------------------// +// Copyright (c) 2014 Roshan <thisisroshansmail@gmail.com> +// +// Distributed under the Boost Software License, Version 1.0 +// See accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt +// +// See http://boostorg.github.com/compute for more information. 
+//---------------------------------------------------------------------------// + +#ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_COMPACT_HPP +#define BOOST_COMPUTE_ALGORITHM_DETAIL_COMPACT_HPP + +#include <iterator> + +#include <boost/compute/container/vector.hpp> +#include <boost/compute/detail/iterator_range_size.hpp> +#include <boost/compute/detail/meta_kernel.hpp> +#include <boost/compute/system.hpp> + +namespace boost { +namespace compute { +namespace detail { + +/// +/// \brief Compact kernel class +/// +/// Subclass of meta_kernel to compact the result of set kernels to +/// get actual sets +/// +class compact_kernel : public meta_kernel +{ +public: + unsigned int tile_size; + + compact_kernel() : meta_kernel("compact") + { + tile_size = 4; + } + + template<class InputIterator1, class InputIterator2, class OutputIterator> + void set_range(InputIterator1 start, + InputIterator2 counts_begin, + InputIterator2 counts_end, + OutputIterator result) + { + m_count = iterator_range_size(counts_begin, counts_end) - 1; + + *this << + "uint i = get_global_id(0);\n" << + "uint count = i*" << tile_size << ";\n" << + "for(uint j = " << counts_begin[expr<uint_>("i")] << "; j<" << + counts_begin[expr<uint_>("i+1")] << "; j++, count++)\n" << + "{\n" << + result[expr<uint_>("j")] << " = " << start[expr<uint_>("count")] + << ";\n" << + "}\n"; + } + + event exec(command_queue &queue) + { + if(m_count == 0) { + return event(); + } + + return exec_1d(queue, 0, m_count); + } + +private: + size_t m_count; +}; + +} //end detail namespace +} //end compute namespace +} //end boost namespace + +#endif // BOOST_COMPUTE_ALGORITHM_DETAIL_COMPACT_HPP diff --git a/boost/compute/algorithm/detail/copy_on_device.hpp b/boost/compute/algorithm/detail/copy_on_device.hpp new file mode 100644 index 0000000000..0bcee27ed5 --- /dev/null +++ b/boost/compute/algorithm/detail/copy_on_device.hpp @@ -0,0 +1,190 @@ +//---------------------------------------------------------------------------// +// Copyright (c) 
2013 Kyle Lutz <kyle.r.lutz@gmail.com> +// +// Distributed under the Boost Software License, Version 1.0 +// See accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt +// +// See http://boostorg.github.com/compute for more information. +//---------------------------------------------------------------------------// + +#ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_COPY_ON_DEVICE_HPP +#define BOOST_COMPUTE_ALGORITHM_DETAIL_COPY_ON_DEVICE_HPP + +#include <iterator> + +#include <boost/compute/command_queue.hpp> +#include <boost/compute/async/future.hpp> +#include <boost/compute/iterator/buffer_iterator.hpp> +#include <boost/compute/iterator/discard_iterator.hpp> +#include <boost/compute/memory/svm_ptr.hpp> +#include <boost/compute/detail/iterator_range_size.hpp> +#include <boost/compute/detail/meta_kernel.hpp> +#include <boost/compute/detail/parameter_cache.hpp> +#include <boost/compute/detail/work_size.hpp> + +namespace boost { +namespace compute { +namespace detail { + +inline size_t pick_copy_work_group_size(size_t n, const device &device) +{ + (void) device; + + if(n % 32 == 0) return 32; + else if(n % 16 == 0) return 16; + else if(n % 8 == 0) return 8; + else if(n % 4 == 0) return 4; + else if(n % 2 == 0) return 2; + else return 1; +} + +template<class InputIterator, class OutputIterator> +class copy_kernel : public meta_kernel +{ +public: + copy_kernel(const device &device) + : meta_kernel("copy") + { + m_count = 0; + + typedef typename std::iterator_traits<InputIterator>::value_type input_type; + + boost::shared_ptr<parameter_cache> parameters = + detail::parameter_cache::get_global_cache(device); + + std::string cache_key = + "__boost_copy_kernel_" + boost::lexical_cast<std::string>(sizeof(input_type)); + + m_vpt = parameters->get(cache_key, "vpt", 4); + m_tpb = parameters->get(cache_key, "tpb", 128); + } + + void set_range(InputIterator first, + InputIterator last, + OutputIterator result) + { + m_count_arg = add_arg<uint_>("count"); + 
+ *this << + "uint index = get_local_id(0) + " << + "(" << m_vpt * m_tpb << " * get_group_id(0));\n" << + "for(uint i = 0; i < " << m_vpt << "; i++){\n" << + " if(index < count){\n" << + result[expr<uint_>("index")] << '=' << + first[expr<uint_>("index")] << ";\n" << + " index += " << m_tpb << ";\n" + " }\n" + "}\n"; + + m_count = detail::iterator_range_size(first, last); + } + + event exec(command_queue &queue) + { + if(m_count == 0){ + // nothing to do + return event(); + } + + size_t global_work_size = calculate_work_size(m_count, m_vpt, m_tpb); + + set_arg(m_count_arg, uint_(m_count)); + + return exec_1d(queue, 0, global_work_size, m_tpb); + } + +private: + size_t m_count; + size_t m_count_arg; + uint_ m_vpt; + uint_ m_tpb; +}; + +template<class InputIterator, class OutputIterator> +inline OutputIterator copy_on_device(InputIterator first, + InputIterator last, + OutputIterator result, + command_queue &queue) +{ + const device &device = queue.get_device(); + + copy_kernel<InputIterator, OutputIterator> kernel(device); + + kernel.set_range(first, last, result); + kernel.exec(queue); + + return result + std::distance(first, last); +} + +template<class InputIterator> +inline discard_iterator copy_on_device(InputIterator first, + InputIterator last, + discard_iterator result, + command_queue &queue) +{ + (void) queue; + + return result + std::distance(first, last); +} + +template<class InputIterator, class OutputIterator> +inline future<OutputIterator> copy_on_device_async(InputIterator first, + InputIterator last, + OutputIterator result, + command_queue &queue) +{ + const device &device = queue.get_device(); + + copy_kernel<InputIterator, OutputIterator> kernel(device); + + kernel.set_range(first, last, result); + event event_ = kernel.exec(queue); + + return make_future(result + std::distance(first, last), event_); +} + +#ifdef CL_VERSION_2_0 +// copy_on_device() specialization for svm_ptr +template<class T> +inline svm_ptr<T> copy_on_device(svm_ptr<T> first, + 
svm_ptr<T> last, + svm_ptr<T> result, + command_queue &queue) +{ + size_t count = iterator_range_size(first, last); + if(count == 0){ + return result; + } + + queue.enqueue_svm_memcpy( + result.get(), first.get(), count * sizeof(T) + ); + + return result + count; +} + +template<class T> +inline future<svm_ptr<T> > copy_on_device_async(svm_ptr<T> first, + svm_ptr<T> last, + svm_ptr<T> result, + command_queue &queue) +{ + size_t count = iterator_range_size(first, last); + if(count == 0){ + return result; + } + + event event_ = queue.enqueue_svm_memcpy_async( + result.get(), first.get(), count * sizeof(T) + ); + + return make_future(result + count, event_); +} +#endif // CL_VERSION_2_0 + +} // end detail namespace +} // end compute namespace +} // end boost namespace + +#endif // BOOST_COMPUTE_ALGORITHM_DETAIL_COPY_ON_DEVICE_HPP diff --git a/boost/compute/algorithm/detail/copy_to_device.hpp b/boost/compute/algorithm/detail/copy_to_device.hpp new file mode 100644 index 0000000000..90545fb4ed --- /dev/null +++ b/boost/compute/algorithm/detail/copy_to_device.hpp @@ -0,0 +1,127 @@ +//---------------------------------------------------------------------------// +// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com> +// +// Distributed under the Boost Software License, Version 1.0 +// See accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt +// +// See http://boostorg.github.com/compute for more information. 
+//---------------------------------------------------------------------------// + +#ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_COPY_TO_DEVICE_HPP +#define BOOST_COMPUTE_ALGORITHM_DETAIL_COPY_TO_DEVICE_HPP + +#include <iterator> + +#include <boost/utility/addressof.hpp> + +#include <boost/compute/command_queue.hpp> +#include <boost/compute/async/future.hpp> +#include <boost/compute/iterator/buffer_iterator.hpp> +#include <boost/compute/memory/svm_ptr.hpp> + +namespace boost { +namespace compute { +namespace detail { + +template<class HostIterator, class DeviceIterator> +inline DeviceIterator copy_to_device(HostIterator first, + HostIterator last, + DeviceIterator result, + command_queue &queue) +{ + typedef typename + std::iterator_traits<DeviceIterator>::value_type + value_type; + typedef typename + std::iterator_traits<DeviceIterator>::difference_type + difference_type; + + size_t count = iterator_range_size(first, last); + if(count == 0){ + return result; + } + + size_t offset = result.get_index(); + + queue.enqueue_write_buffer(result.get_buffer(), + offset * sizeof(value_type), + count * sizeof(value_type), + ::boost::addressof(*first)); + + return result + static_cast<difference_type>(count); +} + +template<class HostIterator, class DeviceIterator> +inline future<DeviceIterator> copy_to_device_async(HostIterator first, + HostIterator last, + DeviceIterator result, + command_queue &queue) +{ + typedef typename + std::iterator_traits<DeviceIterator>::value_type + value_type; + typedef typename + std::iterator_traits<DeviceIterator>::difference_type + difference_type; + + size_t count = iterator_range_size(first, last); + if(count == 0){ + return future<DeviceIterator>(); + } + + size_t offset = result.get_index(); + + event event_ = + queue.enqueue_write_buffer_async(result.get_buffer(), + offset * sizeof(value_type), + count * sizeof(value_type), + ::boost::addressof(*first)); + + return make_future(result + static_cast<difference_type>(count), event_); +} + 
+#ifdef CL_VERSION_2_0 +// copy_to_device() specialization for svm_ptr +template<class HostIterator, class T> +inline svm_ptr<T> copy_to_device(HostIterator first, + HostIterator last, + svm_ptr<T> result, + command_queue &queue) +{ + size_t count = iterator_range_size(first, last); + if(count == 0){ + return result; + } + + queue.enqueue_svm_memcpy( + result.get(), ::boost::addressof(*first), count * sizeof(T) + ); + + return result + count; +} + +template<class HostIterator, class T> +inline future<svm_ptr<T> > copy_to_device_async(HostIterator first, + HostIterator last, + svm_ptr<T> result, + command_queue &queue) +{ + size_t count = iterator_range_size(first, last); + if(count == 0){ + return result; + } + + event event_ = queue.enqueue_svm_memcpy_async( + result.get(), ::boost::addressof(*first), count * sizeof(T) + ); + + return make_future(result + count, event_); +} +#endif // CL_VERSION_2_0 + +} // end detail namespace +} // end compute namespace +} // end boost namespace + +#endif // BOOST_COMPUTE_ALGORITHM_DETAIL_COPY_TO_DEVICE_HPP diff --git a/boost/compute/algorithm/detail/copy_to_host.hpp b/boost/compute/algorithm/detail/copy_to_host.hpp new file mode 100644 index 0000000000..b889e0c871 --- /dev/null +++ b/boost/compute/algorithm/detail/copy_to_host.hpp @@ -0,0 +1,137 @@ +//---------------------------------------------------------------------------// +// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com> +// +// Distributed under the Boost Software License, Version 1.0 +// See accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt +// +// See http://boostorg.github.com/compute for more information. 
+//---------------------------------------------------------------------------// + +#ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_COPY_TO_HOST_HPP +#define BOOST_COMPUTE_ALGORITHM_DETAIL_COPY_TO_HOST_HPP + +#include <iterator> + +#include <boost/utility/addressof.hpp> + +#include <boost/compute/command_queue.hpp> +#include <boost/compute/async/future.hpp> +#include <boost/compute/iterator/buffer_iterator.hpp> +#include <boost/compute/memory/svm_ptr.hpp> +#include <boost/compute/detail/iterator_plus_distance.hpp> + +namespace boost { +namespace compute { +namespace detail { + +template<class DeviceIterator, class HostIterator> +inline HostIterator copy_to_host(DeviceIterator first, + DeviceIterator last, + HostIterator result, + command_queue &queue) +{ + typedef typename + std::iterator_traits<DeviceIterator>::value_type + value_type; + + size_t count = iterator_range_size(first, last); + if(count == 0){ + return result; + } + + const buffer &buffer = first.get_buffer(); + size_t offset = first.get_index(); + + queue.enqueue_read_buffer(buffer, + offset * sizeof(value_type), + count * sizeof(value_type), + ::boost::addressof(*result)); + + return iterator_plus_distance(result, count); +} + +// copy_to_host() specialization for std::vector<bool> +template<class DeviceIterator> +inline std::vector<bool>::iterator +copy_to_host(DeviceIterator first, + DeviceIterator last, + std::vector<bool>::iterator result, + command_queue &queue) +{ + std::vector<uint8_t> temp(std::distance(first, last)); + copy_to_host(first, last, temp.begin(), queue); + return std::copy(temp.begin(), temp.end(), result); +} + +template<class DeviceIterator, class HostIterator> +inline future<HostIterator> copy_to_host_async(DeviceIterator first, + DeviceIterator last, + HostIterator result, + command_queue &queue) +{ + typedef typename + std::iterator_traits<DeviceIterator>::value_type + value_type; + + size_t count = iterator_range_size(first, last); + if(count == 0){ + return future<HostIterator>(); 
+ } + + const buffer &buffer = first.get_buffer(); + size_t offset = first.get_index(); + + event event_ = + queue.enqueue_read_buffer_async(buffer, + offset * sizeof(value_type), + count * sizeof(value_type), + ::boost::addressof(*result)); + + return make_future(iterator_plus_distance(result, count), event_); +} + +#ifdef CL_VERSION_2_0 +// copy_to_host() specialization for svm_ptr +template<class T, class HostIterator> +inline HostIterator copy_to_host(svm_ptr<T> first, + svm_ptr<T> last, + HostIterator result, + command_queue &queue) +{ + size_t count = iterator_range_size(first, last); + if(count == 0){ + return result; + } + + queue.enqueue_svm_memcpy( + ::boost::addressof(*result), first.get(), count * sizeof(T) + ); + + return result + count; +} + +template<class T, class HostIterator> +inline future<HostIterator> copy_to_host_async(svm_ptr<T> first, + svm_ptr<T> last, + HostIterator result, + command_queue &queue) +{ + size_t count = iterator_range_size(first, last); + if(count == 0){ + return result; + } + + event event_ = queue.enqueue_svm_memcpy_async( + ::boost::addressof(*result), first.get(), count * sizeof(T) + ); + + return make_future(iterator_plus_distance(result, count), event_); +} +#endif // CL_VERSION_2_0 + +} // end detail namespace +} // end compute namespace +} // end boost namespace + +#endif // BOOST_COMPUTE_ALGORITHM_DETAIL_COPY_TO_HOST_HPP diff --git a/boost/compute/algorithm/detail/count_if_with_ballot.hpp b/boost/compute/algorithm/detail/count_if_with_ballot.hpp new file mode 100644 index 0000000000..584ef37ab9 --- /dev/null +++ b/boost/compute/algorithm/detail/count_if_with_ballot.hpp @@ -0,0 +1,78 @@ +//---------------------------------------------------------------------------// +// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com> +// +// Distributed under the Boost Software License, Version 1.0 +// See accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt +// +// See 
http://boostorg.github.com/compute for more information. +//---------------------------------------------------------------------------// + +#ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_COUNT_IF_WITH_BALLOT_HPP +#define BOOST_COMPUTE_ALGORITHM_DETAIL_COUNT_IF_WITH_BALLOT_HPP + +#include <boost/compute/context.hpp> +#include <boost/compute/command_queue.hpp> +#include <boost/compute/container/vector.hpp> +#include <boost/compute/algorithm/reduce.hpp> +#include <boost/compute/functional/detail/nvidia_ballot.hpp> +#include <boost/compute/functional/detail/nvidia_popcount.hpp> +#include <boost/compute/detail/meta_kernel.hpp> + +namespace boost { +namespace compute { +namespace detail { + +template<class InputIterator, class Predicate> +inline size_t count_if_with_ballot(InputIterator first, + InputIterator last, + Predicate predicate, + command_queue &queue) +{ + size_t count = iterator_range_size(first, last); + size_t block_size = 32; + size_t block_count = count / block_size; + if(block_count * block_size != count){ + block_count++; + } + + const ::boost::compute::context &context = queue.get_context(); + + ::boost::compute::vector<uint_> counts(block_count, context); + + ::boost::compute::detail::nvidia_popcount<uint_> popc; + ::boost::compute::detail::nvidia_ballot<uint_> ballot; + + meta_kernel k("count_if_with_ballot"); + k << + "const uint gid = get_global_id(0);\n" << + + "bool value = false;\n" << + "if(gid < count)\n" << + " value = " << predicate(first[k.var<const uint_>("gid")]) << ";\n" << + + "uint bits = " << ballot(k.var<const uint_>("value")) << ";\n" << + + "if(get_local_id(0) == 0)\n" << + counts.begin()[k.var<uint_>("get_group_id(0)") ] + << " = " << popc(k.var<uint_>("bits")) << ";\n"; + + k.add_set_arg<const uint_>("count", count); + + k.exec_1d(queue, 0, block_size * block_count, block_size); + + uint_ result; + ::boost::compute::reduce( + counts.begin(), + counts.end(), + &result, + queue + ); + return result; +} + +} // end detail namespace +} // 
end compute namespace +} // end boost namespace + +#endif // BOOST_COMPUTE_ALGORITHM_DETAIL_COUNT_IF_WITH_BALLOT_HPP diff --git a/boost/compute/algorithm/detail/count_if_with_reduce.hpp b/boost/compute/algorithm/detail/count_if_with_reduce.hpp new file mode 100644 index 0000000000..f9449f4a41 --- /dev/null +++ b/boost/compute/algorithm/detail/count_if_with_reduce.hpp @@ -0,0 +1,87 @@ +//---------------------------------------------------------------------------// +// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com> +// +// Distributed under the Boost Software License, Version 1.0 +// See accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt +// +// See http://boostorg.github.com/compute for more information. +//---------------------------------------------------------------------------// + +#ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_COUNT_IF_WITH_REDUCE_HPP +#define BOOST_COMPUTE_ALGORITHM_DETAIL_COUNT_IF_WITH_REDUCE_HPP + +#include <boost/compute/algorithm/reduce.hpp> +#include <boost/compute/iterator/transform_iterator.hpp> +#include <boost/compute/types/fundamental.hpp> + +namespace boost { +namespace compute { +namespace detail { + +template<class Predicate, class Arg> +struct invoked_countable_predicate +{ + invoked_countable_predicate(Predicate p, Arg a) + : predicate(p), arg(a) + { + } + + Predicate predicate; + Arg arg; +}; + +template<class Predicate, class Arg> +inline meta_kernel& operator<<(meta_kernel &kernel, + const invoked_countable_predicate<Predicate, Arg> &expr) +{ + return kernel << "(" << expr.predicate(expr.arg) << " ? 
1 : 0)"; +} + +// the countable_predicate wraps Predicate and converts its result from +// bool to ulong so that it can be used with reduce() +template<class Predicate> +struct countable_predicate +{ + typedef ulong_ result_type; + + countable_predicate(Predicate predicate) + : m_predicate(predicate) + { + } + + template<class Arg> + invoked_countable_predicate<Predicate, Arg> operator()(const Arg &arg) const + { + return invoked_countable_predicate<Predicate, Arg>(m_predicate, arg); + } + + Predicate m_predicate; +}; + +// counts the number of elements matching predicate using reduce() +template<class InputIterator, class Predicate> +inline size_t count_if_with_reduce(InputIterator first, + InputIterator last, + Predicate predicate, + command_queue &queue) +{ + countable_predicate<Predicate> reduce_predicate(predicate); + + ulong_ count = 0; + ::boost::compute::reduce( + ::boost::compute::make_transform_iterator(first, reduce_predicate), + ::boost::compute::make_transform_iterator(last, reduce_predicate), + &count, + ::boost::compute::plus<ulong_>(), + queue + ); + + return static_cast<size_t>(count); +} + +} // end detail namespace +} // end compute namespace +} // end boost namespace + +#endif // BOOST_COMPUTE_ALGORITHM_DETAIL_COUNT_IF_WITH_REDUCE_HPP diff --git a/boost/compute/algorithm/detail/count_if_with_threads.hpp b/boost/compute/algorithm/detail/count_if_with_threads.hpp new file mode 100644 index 0000000000..6f282982e0 --- /dev/null +++ b/boost/compute/algorithm/detail/count_if_with_threads.hpp @@ -0,0 +1,129 @@ +//---------------------------------------------------------------------------// +// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com> +// +// Distributed under the Boost Software License, Version 1.0 +// See accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt +// +// See http://boostorg.github.com/compute for more information. 
+//---------------------------------------------------------------------------// + +#ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_COUNT_IF_WITH_THREADS_HPP +#define BOOST_COMPUTE_ALGORITHM_DETAIL_COUNT_IF_WITH_THREADS_HPP + +#include <numeric> + +#include <boost/compute/detail/meta_kernel.hpp> +#include <boost/compute/container/vector.hpp> + +namespace boost { +namespace compute { +namespace detail { + +template<class InputIterator, class Predicate> +class count_if_with_threads_kernel : meta_kernel +{ +public: + typedef typename + std::iterator_traits<InputIterator>::value_type + value_type; + + count_if_with_threads_kernel() + : meta_kernel("count_if_with_threads") + { + } + + void set_args(InputIterator first, + InputIterator last, + Predicate predicate) + + { + typedef typename std::iterator_traits<InputIterator>::value_type T; + + m_size = detail::iterator_range_size(first, last); + + m_size_arg = add_arg<const ulong_>("size"); + m_counts_arg = add_arg<ulong_ *>(memory_object::global_memory, "counts"); + + *this << + // thread parameters + "const uint gid = get_global_id(0);\n" << + "const uint block_size = size / get_global_size(0);\n" << + "const uint start = block_size * gid;\n" << + "uint end = 0;\n" << + "if(gid == get_global_size(0) - 1)\n" << + " end = size;\n" << + "else\n" << + " end = block_size * gid + block_size;\n" << + + // count values + "uint count = 0;\n" << + "for(uint i = start; i < end; i++){\n" << + decl<const T>("value") << "=" + << first[expr<uint_>("i")] << ";\n" << + if_(predicate(var<const T>("value"))) << "{\n" << + "count++;\n" << + "}\n" << + "}\n" << + + // write count + "counts[gid] = count;\n"; + } + + size_t exec(command_queue &queue) + { + const device &device = queue.get_device(); + const context &context = queue.get_context(); + + size_t threads = device.compute_units(); + + const size_t minimum_block_size = 2048; + if(m_size / threads < minimum_block_size){ + threads = static_cast<size_t>( + (std::max)( + std::ceil(float(m_size) / 
minimum_block_size), + 1.0f + ) + ); + } + + // storage for counts + ::boost::compute::vector<ulong_> counts(threads, context); + + // exec kernel + set_arg(m_size_arg, static_cast<ulong_>(m_size)); + set_arg(m_counts_arg, counts.get_buffer()); + exec_1d(queue, 0, threads, 1); + + // copy counts to the host + std::vector<ulong_> host_counts(threads); + ::boost::compute::copy(counts.begin(), counts.end(), host_counts.begin(), queue); + + // return sum of counts + return std::accumulate(host_counts.begin(), host_counts.end(), size_t(0)); + } + +private: + size_t m_size; + size_t m_size_arg; + size_t m_counts_arg; +}; + +// counts values that match the predicate using one thread per block. this is +// optimized for cpu-type devices with a small number of compute units. +template<class InputIterator, class Predicate> +inline size_t count_if_with_threads(InputIterator first, + InputIterator last, + Predicate predicate, + command_queue &queue) +{ + count_if_with_threads_kernel<InputIterator, Predicate> kernel; + kernel.set_args(first, last, predicate); + return kernel.exec(queue); +} + +} // end detail namespace +} // end compute namespace +} // end boost namespace + +#endif // BOOST_COMPUTE_ALGORITHM_DETAIL_COUNT_IF_WITH_THREADS_HPP diff --git a/boost/compute/algorithm/detail/find_extrema.hpp b/boost/compute/algorithm/detail/find_extrema.hpp new file mode 100644 index 0000000000..6e756c3904 --- /dev/null +++ b/boost/compute/algorithm/detail/find_extrema.hpp @@ -0,0 +1,64 @@ +//---------------------------------------------------------------------------// +// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com> +// +// Distributed under the Boost Software License, Version 1.0 +// See accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt +// +// See http://boostorg.github.com/compute for more information. 
+//---------------------------------------------------------------------------// + +#ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_FIND_EXTREMA_HPP +#define BOOST_COMPUTE_ALGORITHM_DETAIL_FIND_EXTREMA_HPP + +#include <boost/compute/detail/iterator_range_size.hpp> +#include <boost/compute/algorithm/detail/find_extrema_with_reduce.hpp> +#include <boost/compute/algorithm/detail/find_extrema_with_atomics.hpp> +#include <boost/compute/algorithm/detail/serial_find_extrema.hpp> + +namespace boost { +namespace compute { +namespace detail { + +template<class InputIterator, class Compare> +inline InputIterator find_extrema(InputIterator first, + InputIterator last, + Compare compare, + const bool find_minimum, + command_queue &queue) +{ + size_t count = iterator_range_size(first, last); + + // handle trivial cases + if(count == 0 || count == 1){ + return first; + } + + const device &device = queue.get_device(); + + // use serial method for small inputs + // and when device is a CPU + if(count < 512 || (device.type() & device::cpu)){ + return serial_find_extrema(first, last, compare, find_minimum, queue); + } + + // find_extrema_with_reduce() is used only if requirements are met + if(find_extrema_with_reduce_requirements_met(first, last, queue)) + { + return find_extrema_with_reduce(first, last, compare, find_minimum, queue); + } + + // use serial method for OpenCL version 1.0 due to + // problems with atomic_cmpxchg() + #ifndef CL_VERSION_1_1 + return serial_find_extrema(first, last, compare, find_minimum, queue); + #endif + + return find_extrema_with_atomics(first, last, compare, find_minimum, queue); +} + +} // end detail namespace +} // end compute namespace +} // end boost namespace + +#endif // BOOST_COMPUTE_ALGORITHM_DETAIL_FIND_EXTREMA_HPP diff --git a/boost/compute/algorithm/detail/find_extrema_with_atomics.hpp b/boost/compute/algorithm/detail/find_extrema_with_atomics.hpp new file mode 100644 index 0000000000..406d1becb7 --- /dev/null +++ 
b/boost/compute/algorithm/detail/find_extrema_with_atomics.hpp @@ -0,0 +1,108 @@
//---------------------------------------------------------------------------//
// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
//
// Distributed under the Boost Software License, Version 1.0
// See accompanying file LICENSE_1_0.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt
//
// See http://boostorg.github.com/compute for more information.
//---------------------------------------------------------------------------//

#ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_FIND_EXTREMA_WITH_ATOMICS_HPP
#define BOOST_COMPUTE_ALGORITHM_DETAIL_FIND_EXTREMA_WITH_ATOMICS_HPP

#include <boost/compute/types.hpp>
#include <boost/compute/command_queue.hpp>
#include <boost/compute/container/detail/scalar.hpp>
#include <boost/compute/functional/atomic.hpp>
#include <boost/compute/detail/meta_kernel.hpp>
#include <boost/compute/detail/iterator_range_size.hpp>

namespace boost {
namespace compute {
namespace detail {

/// \internal_
/// Finds the extremum of [\p first, \p last) with one work-item per element
/// and a single index shared by all work-items in global memory.
///
/// The shared index starts at 0. Every work-item compares its own element
/// ("new") with the element at the currently stored index ("old"). While
/// its element wins (or is equivalent but has a lower index) it tries to
/// store its global id with atomic_cmpxchg, re-reading the index and the
/// "old" value whenever another work-item changed the index in between.
///
/// \p find_minimum selects the comparison direction: when it is false the
/// kernel is compiled with -DBOOST_COMPUTE_FIND_MAXIMUM.
///
/// Returns \p first advanced by the index read back from the device.
template<class InputIterator, class Compare>
inline InputIterator find_extrema_with_atomics(InputIterator first,
                                               InputIterator last,
                                               Compare compare,
                                               const bool find_minimum,
                                               command_queue &queue)
{
    typedef typename std::iterator_traits<InputIterator>::value_type value_type;
    typedef typename std::iterator_traits<InputIterator>::difference_type difference_type;

    const context &context = queue.get_context();

    meta_kernel k("find_extrema");
    atomic_cmpxchg<uint_> atomic_cmpxchg_uint;

    k <<
        "const uint gid = get_global_id(0);\n" <<
        "uint old_index = *index;\n" <<

        // "old" is the current best candidate, "new" is this work-item's element
        k.decl<value_type>("old") <<
            " = " << first[k.var<uint_>("old_index")] << ";\n" <<
        k.decl<value_type>("new") <<
            " = " << first[k.var<uint_>("gid")] << ";\n" <<

        k.decl<bool>("compare_result") << ";\n" <<
        "#ifdef BOOST_COMPUTE_FIND_MAXIMUM\n" <<
        // maximum case: same condition as below with the arguments of the
        // comparison function swapped
        "while(" <<
            "(compare_result = " << compare(k.var<value_type>("old"),
                                            k.var<value_type>("new")) << ")" <<
            " || (!(compare_result" <<
                    " || " << compare(k.var<value_type>("new"),
                                      k.var<value_type>("old")) << ") "
                "&& gid < old_index)){\n" <<
        "#else\n" <<
        // while condition explained for minimum case with less (<)
        // as comparison function:
        // while(new_value < old_value
        //       OR (new_value == old_value AND new_index < old_index))
        "while(" <<
            "(compare_result = " << compare(k.var<value_type>("new"),
                                            k.var<value_type>("old")) << ")" <<
            " || (!(compare_result" <<
                    " || " << compare(k.var<value_type>("old"),
                                      k.var<value_type>("new")) << ") "
                "&& gid < old_index)){\n" <<
        "#endif\n" <<

        // try to publish gid; stop on success, otherwise reload the
        // candidate that another work-item stored and test again
        " if(" << atomic_cmpxchg_uint(k.var<uint_ *>("index"),
                                      k.var<uint_>("old_index"),
                                      k.var<uint_>("gid")) << " == old_index)\n" <<
        " break;\n" <<
        " else\n" <<
        " old_index = *index;\n" <<
        "old = " << first[k.var<uint_>("old_index")] << ";\n" <<
        "}\n";

    size_t index_arg_index = k.add_arg<uint_ *>(memory_object::global_memory, "index");

    // the comparison direction is selected at kernel compile time
    std::string options;
    if(!find_minimum){
        options = "-DBOOST_COMPUTE_FIND_MAXIMUM";
    }
    kernel kernel = k.compile(context, options);

    // setup index buffer
    scalar<uint_> index(context);
    kernel.set_arg(index_arg_index, index.get_buffer());

    // initialize index
    index.write(0, queue);

    // run kernel
    size_t count = iterator_range_size(first, last);
    queue.enqueue_1d_range_kernel(kernel, 0, count, 0);

    // read index and return iterator
    return first + static_cast<difference_type>(index.read(queue));
}

} // end detail namespace
} // end compute namespace
} // end boost namespace

#endif // BOOST_COMPUTE_ALGORITHM_DETAIL_FIND_EXTREMA_WITH_ATOMICS_HPP
diff --git a/boost/compute/algorithm/detail/find_extrema_with_reduce.hpp b/boost/compute/algorithm/detail/find_extrema_with_reduce.hpp new file mode 100644 index 0000000000..1fbb7dee19 --- /dev/null +++ b/boost/compute/algorithm/detail/find_extrema_with_reduce.hpp @@ -0,0 +1,443 @@
+//---------------------------------------------------------------------------// +// Copyright (c) 2015 Jakub Szuppe <j.szuppe@gmail.com> +// +// Distributed under the Boost Software License, Version 1.0 +// See accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt +// +// See http://boostorg.github.com/compute for more information. +//---------------------------------------------------------------------------// + +#ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_FIND_EXTREMA_WITH_REDUCE_HPP +#define BOOST_COMPUTE_ALGORITHM_DETAIL_FIND_EXTREMA_WITH_REDUCE_HPP + +#include <algorithm> + +#include <boost/compute/types.hpp> +#include <boost/compute/command_queue.hpp> +#include <boost/compute/algorithm/copy.hpp> +#include <boost/compute/allocator/pinned_allocator.hpp> +#include <boost/compute/container/vector.hpp> +#include <boost/compute/detail/meta_kernel.hpp> +#include <boost/compute/detail/iterator_range_size.hpp> +#include <boost/compute/detail/parameter_cache.hpp> +#include <boost/compute/memory/local_buffer.hpp> +#include <boost/compute/type_traits/type_name.hpp> +#include <boost/compute/utility/program_cache.hpp> + +namespace boost { +namespace compute { +namespace detail { + +template<class InputIterator> +bool find_extrema_with_reduce_requirements_met(InputIterator first, + InputIterator last, + command_queue &queue) +{ + typedef typename std::iterator_traits<InputIterator>::value_type input_type; + + const device &device = queue.get_device(); + + // device must have dedicated local memory storage + // otherwise reduction would be highly inefficient + if(device.get_info<CL_DEVICE_LOCAL_MEM_TYPE>() != CL_LOCAL) + { + return false; + } + + const size_t max_work_group_size = device.get_info<CL_DEVICE_MAX_WORK_GROUP_SIZE>(); + // local memory size in bytes (per compute unit) + const size_t local_mem_size = device.get_info<CL_DEVICE_LOCAL_MEM_SIZE>(); + + std::string cache_key = std::string("__boost_find_extrema_reduce_") + + 
type_name<input_type>(); + // load parameters + boost::shared_ptr<parameter_cache> parameters = + detail::parameter_cache::get_global_cache(device); + + // Get preferred work group size + size_t work_group_size = parameters->get(cache_key, "wgsize", 256); + + work_group_size = (std::min)(max_work_group_size, work_group_size); + + // local memory size needed to perform parallel reduction + size_t required_local_mem_size = 0; + // indices size + required_local_mem_size += sizeof(uint_) * work_group_size; + // values size + required_local_mem_size += sizeof(input_type) * work_group_size; + + // at least 4 work groups per compute unit otherwise reduction + // would be highly inefficient + return ((required_local_mem_size * 4) <= local_mem_size); +} + +/// \internal_ +/// Algorithm finds the first extremum in given range, i.e., with the lowest +/// index. +/// +/// If \p use_input_idx is false, it's assumed that input data is ordered by +/// increasing index and \p input_idx is not used in the algorithm. 
+template<class InputIterator, class ResultIterator, class Compare> +inline void find_extrema_with_reduce(InputIterator input, + vector<uint_>::iterator input_idx, + size_t count, + ResultIterator result, + vector<uint_>::iterator result_idx, + size_t work_groups_no, + size_t work_group_size, + Compare compare, + const bool find_minimum, + const bool use_input_idx, + command_queue &queue) +{ + typedef typename std::iterator_traits<InputIterator>::value_type input_type; + + const context &context = queue.get_context(); + + meta_kernel k("find_extrema_reduce"); + size_t count_arg = k.add_arg<uint_>("count"); + size_t block_arg = k.add_arg<input_type *>(memory_object::local_memory, "block"); + size_t block_idx_arg = k.add_arg<uint_ *>(memory_object::local_memory, "block_idx"); + + k << + // Work item global id + k.decl<const uint_>("gid") << " = get_global_id(0);\n" << + + // Index of element that will be read from input buffer + k.decl<uint_>("idx") << " = gid;\n" << + + k.decl<input_type>("acc") << ";\n" << + k.decl<uint_>("acc_idx") << ";\n" << + "if(gid < count) {\n" << + // Real index of currently best element + "#ifdef BOOST_COMPUTE_USE_INPUT_IDX\n" << + k.var<uint_>("acc_idx") << " = " << input_idx[k.var<uint_>("idx")] << ";\n" << + "#else\n" << + k.var<uint_>("acc_idx") << " = idx;\n" << + "#endif\n" << + + // Init accumulator with first[get_global_id(0)] + "acc = " << input[k.var<uint_>("idx")] << ";\n" << + "idx += get_global_size(0);\n" << + "}\n" << + + k.decl<bool>("compare_result") << ";\n" << + k.decl<bool>("equal") << ";\n\n" << + "while( idx < count ){\n" << + // Next element + k.decl<input_type>("next") << " = " << input[k.var<uint_>("idx")] << ";\n" << + "#ifdef BOOST_COMPUTE_USE_INPUT_IDX\n" << + k.decl<input_type>("next_idx") << " = " << input_idx[k.var<uint_>("idx")] << ";\n" << + "#endif\n" << + + // Comparison between currently best element (acc) and next element + "#ifdef BOOST_COMPUTE_FIND_MAXIMUM\n" << + "compare_result = " << 
compare(k.var<input_type>("next"), + k.var<input_type>("acc")) << ";\n" << + "# ifdef BOOST_COMPUTE_USE_INPUT_IDX\n" << + "equal = !compare_result && !" << + compare(k.var<input_type>("acc"), + k.var<input_type>("next")) << ";\n" << + "# endif\n" << + "#else\n" << + "compare_result = " << compare(k.var<input_type>("acc"), + k.var<input_type>("next")) << ";\n" << + "# ifdef BOOST_COMPUTE_USE_INPUT_IDX\n" << + "equal = !compare_result && !" << + compare(k.var<input_type>("next"), + k.var<input_type>("acc")) << ";\n" << + "# endif\n" << + "#endif\n" << + + // save the winner + "acc = compare_result ? acc : next;\n" << + "#ifdef BOOST_COMPUTE_USE_INPUT_IDX\n" << + "acc_idx = compare_result ? " << + "acc_idx : " << + "(equal ? min(acc_idx, next_idx) : next_idx);\n" << + "#else\n" << + "acc_idx = compare_result ? acc_idx : idx;\n" << + "#endif\n" << + "idx += get_global_size(0);\n" << + "}\n\n" << + + // Work item local id + k.decl<const uint_>("lid") << " = get_local_id(0);\n" << + "block[lid] = acc;\n" << + "block_idx[lid] = acc_idx;\n" << + "barrier(CLK_LOCAL_MEM_FENCE);\n" << + + k.decl<uint_>("group_offset") << + " = count - (get_local_size(0) * get_group_id(0));\n\n"; + + k << + "#pragma unroll\n" + "for(" << k.decl<uint_>("offset") << " = " << uint_(work_group_size) << " / 2; offset > 0; " << + "offset = offset / 2) {\n" << + "if((lid < offset) && ((lid + offset) < group_offset)) { \n" << + k.decl<input_type>("mine") << " = block[lid];\n" << + k.decl<input_type>("other") << " = block[lid+offset];\n" << + "#ifdef BOOST_COMPUTE_FIND_MAXIMUM\n" << + "compare_result = " << compare(k.var<input_type>("other"), + k.var<input_type>("mine")) << ";\n" << + "equal = !compare_result && !" << + compare(k.var<input_type>("mine"), + k.var<input_type>("other")) << ";\n" << + "#else\n" << + "compare_result = " << compare(k.var<input_type>("mine"), + k.var<input_type>("other")) << ";\n" << + "equal = !compare_result && !" 
<< + compare(k.var<input_type>("other"), + k.var<input_type>("mine")) << ";\n" << + "#endif\n" << + "block[lid] = compare_result ? mine : other;\n" << + k.decl<uint_>("mine_idx") << " = block_idx[lid];\n" << + k.decl<uint_>("other_idx") << " = block_idx[lid+offset];\n" << + "block_idx[lid] = compare_result ? " << + "mine_idx : " << + "(equal ? min(mine_idx, other_idx) : other_idx);\n" << + "}\n" + "barrier(CLK_LOCAL_MEM_FENCE);\n" << + "}\n\n" << + + // write block result to global output + "if(lid == 0){\n" << + result[k.var<uint_>("get_group_id(0)")] << " = block[0];\n" << + result_idx[k.var<uint_>("get_group_id(0)")] << " = block_idx[0];\n" << + "}"; + + std::string options; + if(!find_minimum){ + options = "-DBOOST_COMPUTE_FIND_MAXIMUM"; + } + if(use_input_idx){ + options += " -DBOOST_COMPUTE_USE_INPUT_IDX"; + } + + kernel kernel = k.compile(context, options); + + kernel.set_arg(count_arg, static_cast<uint_>(count)); + kernel.set_arg(block_arg, local_buffer<input_type>(work_group_size)); + kernel.set_arg(block_idx_arg, local_buffer<uint_>(work_group_size)); + + queue.enqueue_1d_range_kernel(kernel, + 0, + work_groups_no * work_group_size, + work_group_size); +} + +template<class InputIterator, class ResultIterator, class Compare> +inline void find_extrema_with_reduce(InputIterator input, + size_t count, + ResultIterator result, + vector<uint_>::iterator result_idx, + size_t work_groups_no, + size_t work_group_size, + Compare compare, + const bool find_minimum, + command_queue &queue) +{ + // dummy will not be used + buffer_iterator<uint_> dummy = result_idx; + return find_extrema_with_reduce( + input, dummy, count, result, result_idx, work_groups_no, + work_group_size, compare, find_minimum, false, queue + ); +} + +template<class InputIterator, class Compare> +InputIterator find_extrema_with_reduce(InputIterator first, + InputIterator last, + Compare compare, + const bool find_minimum, + command_queue &queue) +{ + typedef typename 
std::iterator_traits<InputIterator>::difference_type difference_type; + typedef typename std::iterator_traits<InputIterator>::value_type input_type; + + const context &context = queue.get_context(); + const device &device = queue.get_device(); + + // Getting information about used queue and device + const size_t compute_units_no = device.get_info<CL_DEVICE_MAX_COMPUTE_UNITS>(); + const size_t max_work_group_size = device.get_info<CL_DEVICE_MAX_WORK_GROUP_SIZE>(); + + const size_t count = detail::iterator_range_size(first, last); + + std::string cache_key = std::string("__boost_find_extrema_with_reduce_") + + type_name<input_type>(); + + // load parameters + boost::shared_ptr<parameter_cache> parameters = + detail::parameter_cache::get_global_cache(device); + + // get preferred work group size and preferred number + // of work groups per compute unit + size_t work_group_size = parameters->get(cache_key, "wgsize", 256); + size_t work_groups_per_cu = parameters->get(cache_key, "wgpcu", 100); + + // calculate work group size and number of work groups + work_group_size = (std::min)(max_work_group_size, work_group_size); + size_t work_groups_no = compute_units_no * work_groups_per_cu; + work_groups_no = (std::min)( + work_groups_no, + static_cast<size_t>(std::ceil(float(count) / work_group_size)) + ); + + // phase I: finding candidates for extremum + + // device buffors for extremum candidates and their indices + // each work-group computes its candidate + vector<input_type> candidates(work_groups_no, context); + vector<uint_> candidates_idx(work_groups_no, context); + + // finding candidates for first extremum and their indices + find_extrema_with_reduce( + first, count, candidates.begin(), candidates_idx.begin(), + work_groups_no, work_group_size, compare, find_minimum, queue + ); + + // phase II: finding extremum from among the candidates + + // zero-copy buffers for final result (value and index) + vector<input_type, ::boost::compute::pinned_allocator<input_type> > + 
result(1, context); + vector<uint_, ::boost::compute::pinned_allocator<uint_> > + result_idx(1, context); + + // get extremum from among the candidates + find_extrema_with_reduce( + candidates.begin(), candidates_idx.begin(), work_groups_no, result.begin(), + result_idx.begin(), 1, work_group_size, compare, find_minimum, true, queue + ); + + // mapping extremum index to host + uint_* result_idx_host_ptr = + static_cast<uint_*>( + queue.enqueue_map_buffer( + result_idx.get_buffer(), command_queue::map_read, + 0, sizeof(uint_) + ) + ); + + return first + static_cast<difference_type>(*result_idx_host_ptr); +} + +template<class InputIterator> +InputIterator find_extrema_with_reduce(InputIterator first, + InputIterator last, + ::boost::compute::less< + typename std::iterator_traits< + InputIterator + >::value_type + > + compare, + const bool find_minimum, + command_queue &queue) +{ + typedef typename std::iterator_traits<InputIterator>::difference_type difference_type; + typedef typename std::iterator_traits<InputIterator>::value_type input_type; + + const context &context = queue.get_context(); + const device &device = queue.get_device(); + + // Getting information about used queue and device + const size_t compute_units_no = device.get_info<CL_DEVICE_MAX_COMPUTE_UNITS>(); + const size_t max_work_group_size = device.get_info<CL_DEVICE_MAX_WORK_GROUP_SIZE>(); + + const size_t count = detail::iterator_range_size(first, last); + + std::string cache_key = std::string("__boost_find_extrema_with_reduce_") + + type_name<input_type>(); + + // load parameters + boost::shared_ptr<parameter_cache> parameters = + detail::parameter_cache::get_global_cache(device); + + // get preferred work group size and preferred number + // of work groups per compute unit + size_t work_group_size = parameters->get(cache_key, "wgsize", 256); + size_t work_groups_per_cu = parameters->get(cache_key, "wgpcu", 64); + + // calculate work group size and number of work groups + work_group_size = 
(std::min)(max_work_group_size, work_group_size); + size_t work_groups_no = compute_units_no * work_groups_per_cu; + work_groups_no = (std::min)( + work_groups_no, + static_cast<size_t>(std::ceil(float(count) / work_group_size)) + ); + + // phase I: finding candidates for extremum + + // device buffors for extremum candidates and their indices + // each work-group computes its candidate + // zero-copy buffers are used to eliminate copying data back to host + vector<input_type, ::boost::compute::pinned_allocator<input_type> > + candidates(work_groups_no, context); + vector<uint_, ::boost::compute::pinned_allocator <uint_> > + candidates_idx(work_groups_no, context); + + // finding candidates for first extremum and their indices + find_extrema_with_reduce( + first, count, candidates.begin(), candidates_idx.begin(), + work_groups_no, work_group_size, compare, find_minimum, queue + ); + + // phase II: finding extremum from among the candidates + + // mapping candidates and their indices to host + input_type* candidates_host_ptr = + static_cast<input_type*>( + queue.enqueue_map_buffer( + candidates.get_buffer(), command_queue::map_read, + 0, work_groups_no * sizeof(input_type) + ) + ); + + uint_* candidates_idx_host_ptr = + static_cast<uint_*>( + queue.enqueue_map_buffer( + candidates_idx.get_buffer(), command_queue::map_read, + 0, work_groups_no * sizeof(uint_) + ) + ); + + input_type* i = candidates_host_ptr; + uint_* idx = candidates_idx_host_ptr; + uint_* extremum_idx = idx; + input_type extremum = *candidates_host_ptr; + i++; idx++; + + // find extremum (serial) from among the candidates on host + if(!find_minimum) { + while(idx != (candidates_idx_host_ptr + work_groups_no)) { + input_type next = *i; + bool compare_result = next > extremum; + bool equal = next == extremum; + extremum = compare_result ? next : extremum; + extremum_idx = compare_result ? idx : extremum_idx; + extremum_idx = equal ? ((*extremum_idx < *idx) ? 
extremum_idx : idx) : extremum_idx; + idx++, i++; + } + } + else { + while(idx != (candidates_idx_host_ptr + work_groups_no)) { + input_type next = *i; + bool compare_result = next < extremum; + bool equal = next == extremum; + extremum = compare_result ? next : extremum; + extremum_idx = compare_result ? idx : extremum_idx; + extremum_idx = equal ? ((*extremum_idx < *idx) ? extremum_idx : idx) : extremum_idx; + idx++, i++; + } + } + + return first + static_cast<difference_type>(*extremum_idx); +} + +} // end detail namespace +} // end compute namespace +} // end boost namespace + +#endif // BOOST_COMPUTE_ALGORITHM_DETAIL_FIND_EXTREMA_WITH_REDUCE_HPP diff --git a/boost/compute/algorithm/detail/find_if_with_atomics.hpp b/boost/compute/algorithm/detail/find_if_with_atomics.hpp new file mode 100644 index 0000000000..112c34cf00 --- /dev/null +++ b/boost/compute/algorithm/detail/find_if_with_atomics.hpp @@ -0,0 +1,212 @@ +//---------------------------------------------------------------------------// +// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com> +// +// Distributed under the Boost Software License, Version 1.0 +// See accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt +// +// See http://boostorg.github.com/compute for more information. 
+//---------------------------------------------------------------------------// + +#ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_FIND_IF_WITH_ATOMICS_HPP +#define BOOST_COMPUTE_ALGORITHM_DETAIL_FIND_IF_WITH_ATOMICS_HPP + +#include <iterator> + +#include <boost/compute/types.hpp> +#include <boost/compute/functional.hpp> +#include <boost/compute/command_queue.hpp> +#include <boost/compute/container/detail/scalar.hpp> +#include <boost/compute/iterator/buffer_iterator.hpp> +#include <boost/compute/type_traits/type_name.hpp> +#include <boost/compute/detail/meta_kernel.hpp> +#include <boost/compute/detail/iterator_range_size.hpp> +#include <boost/compute/detail/parameter_cache.hpp> + +namespace boost { +namespace compute { +namespace detail { + +template<class InputIterator, class UnaryPredicate> +inline InputIterator find_if_with_atomics_one_vpt(InputIterator first, + InputIterator last, + UnaryPredicate predicate, + const size_t count, + command_queue &queue) +{ + typedef typename std::iterator_traits<InputIterator>::value_type value_type; + typedef typename std::iterator_traits<InputIterator>::difference_type difference_type; + + const context &context = queue.get_context(); + + detail::meta_kernel k("find_if"); + size_t index_arg = k.add_arg<int *>(memory_object::global_memory, "index"); + atomic_min<uint_> atomic_min_uint; + + k << k.decl<const uint_>("i") << " = get_global_id(0);\n" + << k.decl<const value_type>("value") << "=" + << first[k.var<const uint_>("i")] << ";\n" + << "if(" << predicate(k.var<const value_type>("value")) << "){\n" + << " " << atomic_min_uint(k.var<uint_ *>("index"), k.var<uint_>("i")) << ";\n" + << "}\n"; + + kernel kernel = k.compile(context); + + scalar<uint_> index(context); + kernel.set_arg(index_arg, index.get_buffer()); + + // initialize index to the last iterator's index + index.write(static_cast<uint_>(count), queue); + queue.enqueue_1d_range_kernel(kernel, 0, count, 0); + + // read index and return iterator + return first + 
static_cast<difference_type>(index.read(queue)); +} + +template<class InputIterator, class UnaryPredicate> +inline InputIterator find_if_with_atomics_multiple_vpt(InputIterator first, + InputIterator last, + UnaryPredicate predicate, + const size_t count, + const size_t vpt, + command_queue &queue) +{ + typedef typename std::iterator_traits<InputIterator>::value_type value_type; + typedef typename std::iterator_traits<InputIterator>::difference_type difference_type; + + const context &context = queue.get_context(); + const device &device = queue.get_device(); + + detail::meta_kernel k("find_if"); + size_t index_arg = k.add_arg<uint_ *>(memory_object::global_memory, "index"); + size_t count_arg = k.add_arg<const uint_>("count"); + size_t vpt_arg = k.add_arg<const uint_>("vpt"); + atomic_min<uint_> atomic_min_uint; + + // for GPUs reads from global memory are coalesced + if(device.type() & device::gpu) { + k << + k.decl<const uint_>("lsize") << " = get_local_size(0);\n" << + k.decl<uint_>("id") << " = get_local_id(0) + get_group_id(0) * lsize * vpt;\n" << + k.decl<const uint_>("end") << " = min(" << + "id + (lsize *" << k.var<uint_>("vpt") << ")," << + "count" << + ");\n" << + + // checking if the index is already found + "__local uint local_index;\n" << + "if(get_local_id(0) == 0){\n" << + " local_index = *index;\n " << + "};\n" << + "barrier(CLK_LOCAL_MEM_FENCE);\n" << + "if(local_index < id){\n" << + " return;\n" << + "}\n" << + + "while(id < end){\n" << + " " << k.decl<const value_type>("value") << " = " << + first[k.var<const uint_>("id")] << ";\n" + " if(" << predicate(k.var<const value_type>("value")) << "){\n" << + " " << atomic_min_uint(k.var<uint_ *>("index"), + k.var<uint_>("id")) << ";\n" << + " return;\n" + " }\n" << + " id+=lsize;\n" << + "}\n"; + // for CPUs (and other devices) reads are ordered so the big cache is + // efficiently used. 
+ } else { + k << + k.decl<uint_>("id") << " = get_global_id(0) * " << k.var<uint_>("vpt") << ";\n" << + k.decl<const uint_>("end") << " = min(" << + "id + " << k.var<uint_>("vpt") << "," << + "count" << + ");\n" << + "while(id < end && (*index) > id){\n" << + " " << k.decl<const value_type>("value") << " = " << + first[k.var<const uint_>("id")] << ";\n" + " if(" << predicate(k.var<const value_type>("value")) << "){\n" << + " " << atomic_min_uint(k.var<uint_ *>("index"), + k.var<uint_>("id")) << ";\n" << + " return;\n" << + " }\n" << + " id++;\n" << + "}\n"; + } + + kernel kernel = k.compile(context); + + scalar<uint_> index(context); + kernel.set_arg(index_arg, index.get_buffer()); + kernel.set_arg(count_arg, static_cast<uint_>(count)); + kernel.set_arg(vpt_arg, static_cast<uint_>(vpt)); + + // initialize index to the last iterator's index + index.write(static_cast<uint_>(count), queue); + + const size_t global_wg_size = static_cast<size_t>( + std::ceil(float(count) / vpt) + ); + queue.enqueue_1d_range_kernel(kernel, 0, global_wg_size, 0); + + // read index and return iterator + return first + static_cast<difference_type>(index.read(queue)); +} + +template<class InputIterator, class UnaryPredicate> +inline InputIterator find_if_with_atomics(InputIterator first, + InputIterator last, + UnaryPredicate predicate, + command_queue &queue) +{ + typedef typename std::iterator_traits<InputIterator>::value_type value_type; + + size_t count = detail::iterator_range_size(first, last); + if(count == 0){ + return last; + } + + const device &device = queue.get_device(); + + // load cached parameters + std::string cache_key = std::string("__boost_find_if_with_atomics_") + + type_name<value_type>(); + boost::shared_ptr<parameter_cache> parameters = + detail::parameter_cache::get_global_cache(device); + + // for relatively small inputs on GPUs kernel checking one value per thread + // (work-item) is more efficient than its multiple values per thread version + if(device.type() & 
device::gpu){ + const size_t one_vpt_threshold = + parameters->get(cache_key, "one_vpt_threshold", 1048576); + if(count <= one_vpt_threshold){ + return find_if_with_atomics_one_vpt( + first, last, predicate, count, queue + ); + } + } + + // values per thread + size_t vpt; + if(device.type() & device::gpu){ + // get vpt parameter + vpt = parameters->get(cache_key, "vpt", 32); + } else { + // for CPUs work is split equally between compute units + const size_t max_compute_units = + device.get_info<CL_DEVICE_MAX_COMPUTE_UNITS>(); + vpt = static_cast<size_t>( + std::ceil(float(count) / max_compute_units) + ); + } + + return find_if_with_atomics_multiple_vpt( + first, last, predicate, count, vpt, queue + ); +} + +} // end detail namespace +} // end compute namespace +} // end boost namespace + +#endif // BOOST_COMPUTE_ALGORITHM_DETAIL_FIND_IF_WITH_ATOMICS_HPP diff --git a/boost/compute/algorithm/detail/inplace_reduce.hpp b/boost/compute/algorithm/detail/inplace_reduce.hpp new file mode 100644 index 0000000000..60c61e83fe --- /dev/null +++ b/boost/compute/algorithm/detail/inplace_reduce.hpp @@ -0,0 +1,136 @@ +//---------------------------------------------------------------------------// +// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com> +// +// Distributed under the Boost Software License, Version 1.0 +// See accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt +// +// See http://boostorg.github.com/compute for more information. 
+//---------------------------------------------------------------------------// + +#ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_INPLACE_REDUCE_HPP +#define BOOST_COMPUTE_ALGORITHM_DETAIL_INPLACE_REDUCE_HPP + +#include <iterator> + +#include <boost/utility/result_of.hpp> + +#include <boost/compute/command_queue.hpp> +#include <boost/compute/container/vector.hpp> +#include <boost/compute/detail/iterator_range_size.hpp> +#include <boost/compute/memory/local_buffer.hpp> + +namespace boost { +namespace compute { +namespace detail { + +template<class Iterator, class BinaryFunction> +inline void inplace_reduce(Iterator first, + Iterator last, + BinaryFunction function, + command_queue &queue) +{ + typedef typename + std::iterator_traits<Iterator>::value_type + value_type; + + size_t input_size = iterator_range_size(first, last); + if(input_size < 2){ + return; + } + + const context &context = queue.get_context(); + + size_t block_size = 64; + size_t values_per_thread = 8; + size_t block_count = input_size / (block_size * values_per_thread); + if(block_count * block_size * values_per_thread != input_size) + block_count++; + + vector<value_type> output(block_count, context); + + meta_kernel k("inplace_reduce"); + size_t input_arg = k.add_arg<value_type *>(memory_object::global_memory, "input"); + size_t input_size_arg = k.add_arg<const uint_>("input_size"); + size_t output_arg = k.add_arg<value_type *>(memory_object::global_memory, "output"); + size_t scratch_arg = k.add_arg<value_type *>(memory_object::local_memory, "scratch"); + k << + "const uint gid = get_global_id(0);\n" << + "const uint lid = get_local_id(0);\n" << + "const uint values_per_thread =\n" + << uint_(values_per_thread) << ";\n" << + + // thread reduce + "const uint index = gid * values_per_thread;\n" << + "if(index < input_size){\n" << + k.decl<value_type>("sum") << " = input[index];\n" << + "for(uint i = 1;\n" << + "i < values_per_thread && (index + i) < input_size;\n" << + "i++){\n" << + " sum = " << + 
function(k.var<value_type>("sum"), + k.var<value_type>("input[index+i]")) << ";\n" << + "}\n" << + "scratch[lid] = sum;\n" << + "}\n" << + + // local reduce + "for(uint i = 1; i < get_local_size(0); i <<= 1){\n" << + " barrier(CLK_LOCAL_MEM_FENCE);\n" << + " uint mask = (i << 1) - 1;\n" << + " uint next_index = (gid + i) * values_per_thread;\n" + " if((lid & mask) == 0 && next_index < input_size){\n" << + " scratch[lid] = " << + function(k.var<value_type>("scratch[lid]"), + k.var<value_type>("scratch[lid+i]")) << ";\n" << + " }\n" << + "}\n" << + + // write output for block + "if(lid == 0){\n" << + " output[get_group_id(0)] = scratch[0];\n" << + "}\n" + ; + + const buffer *input_buffer = &first.get_buffer(); + const buffer *output_buffer = &output.get_buffer(); + + kernel kernel = k.compile(context); + + while(input_size > 1){ + kernel.set_arg(input_arg, *input_buffer); + kernel.set_arg(input_size_arg, static_cast<uint_>(input_size)); + kernel.set_arg(output_arg, *output_buffer); + kernel.set_arg(scratch_arg, local_buffer<value_type>(block_size)); + + queue.enqueue_1d_range_kernel(kernel, + 0, + block_count * block_size, + block_size); + + input_size = + static_cast<size_t>( + std::ceil(float(input_size) / (block_size * values_per_thread) + ) + ); + + block_count = input_size / (block_size * values_per_thread); + if(block_count * block_size * values_per_thread != input_size) + block_count++; + + std::swap(input_buffer, output_buffer); + } + + if(input_buffer != &first.get_buffer()){ + ::boost::compute::copy(output.begin(), + output.begin() + 1, + first, + queue); + } +} + +} // end detail namespace +} // end compute namespace +} // end boost namespace + +#endif // BOOST_COMPUTE_ALGORITHM_DETAIL_INPLACE_REDUCE_HPP diff --git a/boost/compute/algorithm/detail/insertion_sort.hpp b/boost/compute/algorithm/detail/insertion_sort.hpp new file mode 100644 index 0000000000..4b5b95139a --- /dev/null +++ b/boost/compute/algorithm/detail/insertion_sort.hpp @@ -0,0 +1,165 @@ 
+//---------------------------------------------------------------------------// +// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com> +// +// Distributed under the Boost Software License, Version 1.0 +// See accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt +// +// See http://boostorg.github.com/compute for more information. +//---------------------------------------------------------------------------// + +#ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_INSERTION_SORT_HPP +#define BOOST_COMPUTE_ALGORITHM_DETAIL_INSERTION_SORT_HPP + +#include <boost/compute/kernel.hpp> +#include <boost/compute/program.hpp> +#include <boost/compute/command_queue.hpp> +#include <boost/compute/detail/meta_kernel.hpp> +#include <boost/compute/detail/iterator_range_size.hpp> +#include <boost/compute/memory/local_buffer.hpp> + +namespace boost { +namespace compute { +namespace detail { + +template<class Iterator, class Compare> +inline void serial_insertion_sort(Iterator first, + Iterator last, + Compare compare, + command_queue &queue) +{ + typedef typename std::iterator_traits<Iterator>::value_type T; + + size_t count = iterator_range_size(first, last); + if(count < 2){ + return; + } + + meta_kernel k("serial_insertion_sort"); + size_t local_data_arg = k.add_arg<T *>(memory_object::local_memory, "data"); + size_t count_arg = k.add_arg<uint_>("n"); + + k << + // copy data to local memory + "for(uint i = 0; i < n; i++){\n" << + " data[i] = " << first[k.var<uint_>("i")] << ";\n" + "}\n" + + // sort data in local memory + "for(uint i = 1; i < n; i++){\n" << + " " << k.decl<const T>("value") << " = data[i];\n" << + " uint pos = i;\n" << + " while(pos > 0 && " << + compare(k.var<const T>("value"), + k.var<const T>("data[pos-1]")) << "){\n" << + " data[pos] = data[pos-1];\n" << + " pos--;\n" << + " }\n" << + " data[pos] = value;\n" << + "}\n" << + + // copy sorted data to output + "for(uint i = 0; i < n; i++){\n" << + " " << first[k.var<uint_>("i")] << " = 
data[i];\n" + "}\n"; + + const context &context = queue.get_context(); + ::boost::compute::kernel kernel = k.compile(context); + kernel.set_arg(local_data_arg, local_buffer<T>(count)); + kernel.set_arg(count_arg, static_cast<uint_>(count)); + + queue.enqueue_task(kernel); +} + +template<class Iterator> +inline void serial_insertion_sort(Iterator first, + Iterator last, + command_queue &queue) +{ + typedef typename std::iterator_traits<Iterator>::value_type T; + + ::boost::compute::less<T> less; + + return serial_insertion_sort(first, last, less, queue); +} + +template<class KeyIterator, class ValueIterator, class Compare> +inline void serial_insertion_sort_by_key(KeyIterator keys_first, + KeyIterator keys_last, + ValueIterator values_first, + Compare compare, + command_queue &queue) +{ + typedef typename std::iterator_traits<KeyIterator>::value_type key_type; + typedef typename std::iterator_traits<ValueIterator>::value_type value_type; + + size_t count = iterator_range_size(keys_first, keys_last); + if(count < 2){ + return; + } + + meta_kernel k("serial_insertion_sort_by_key"); + size_t local_keys_arg = k.add_arg<key_type *>(memory_object::local_memory, "keys"); + size_t local_data_arg = k.add_arg<value_type *>(memory_object::local_memory, "data"); + size_t count_arg = k.add_arg<uint_>("n"); + + k << + // copy data to local memory + "for(uint i = 0; i < n; i++){\n" << + " keys[i] = " << keys_first[k.var<uint_>("i")] << ";\n" + " data[i] = " << values_first[k.var<uint_>("i")] << ";\n" + "}\n" + + // sort data in local memory + "for(uint i = 1; i < n; i++){\n" << + " " << k.decl<const key_type>("key") << " = keys[i];\n" << + " " << k.decl<const value_type>("value") << " = data[i];\n" << + " uint pos = i;\n" << + " while(pos > 0 && " << + compare(k.var<const key_type>("key"), + k.var<const key_type>("keys[pos-1]")) << "){\n" << + " keys[pos] = keys[pos-1];\n" << + " data[pos] = data[pos-1];\n" << + " pos--;\n" << + " }\n" << + " keys[pos] = key;\n" << + " data[pos] = 
value;\n" << + "}\n" << + + // copy sorted data to output + "for(uint i = 0; i < n; i++){\n" << + " " << keys_first[k.var<uint_>("i")] << " = keys[i];\n" + " " << values_first[k.var<uint_>("i")] << " = data[i];\n" + "}\n"; + + const context &context = queue.get_context(); + ::boost::compute::kernel kernel = k.compile(context); + kernel.set_arg(local_keys_arg, static_cast<uint_>(count * sizeof(key_type)), 0); + kernel.set_arg(local_data_arg, static_cast<uint_>(count * sizeof(value_type)), 0); + kernel.set_arg(count_arg, static_cast<uint_>(count)); + + queue.enqueue_task(kernel); +} + +template<class KeyIterator, class ValueIterator> +inline void serial_insertion_sort_by_key(KeyIterator keys_first, + KeyIterator keys_last, + ValueIterator values_first, + command_queue &queue) +{ + typedef typename std::iterator_traits<KeyIterator>::value_type key_type; + + serial_insertion_sort_by_key( + keys_first, + keys_last, + values_first, + boost::compute::less<key_type>(), + queue + ); +} + +} // end detail namespace +} // end compute namespace +} // end boost namespace + +#endif // BOOST_COMPUTE_ALGORITHM_DETAIL_INSERTION_SORT_HPP diff --git a/boost/compute/algorithm/detail/merge_path.hpp b/boost/compute/algorithm/detail/merge_path.hpp new file mode 100644 index 0000000000..bc2c8fa88c --- /dev/null +++ b/boost/compute/algorithm/detail/merge_path.hpp @@ -0,0 +1,116 @@ +//---------------------------------------------------------------------------// +// Copyright (c) 2014 Roshan <thisisroshansmail@gmail.com> +// +// Distributed under the Boost Software License, Version 1.0 +// See accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt +// +// See http://boostorg.github.com/compute for more information. 
+//---------------------------------------------------------------------------// + +#ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_MERGE_PATH_HPP +#define BOOST_COMPUTE_ALGORITHM_DETAIL_MERGE_PATH_HPP + +#include <iterator> + +#include <boost/compute/algorithm/find_if.hpp> +#include <boost/compute/container/vector.hpp> +#include <boost/compute/detail/iterator_range_size.hpp> +#include <boost/compute/detail/meta_kernel.hpp> +#include <boost/compute/lambda.hpp> +#include <boost/compute/system.hpp> + +namespace boost { +namespace compute { +namespace detail { + +/// +/// \brief Merge Path kernel class +/// +/// Subclass of meta_kernel to break two sets into tiles according +/// to their merge path +/// +class merge_path_kernel : public meta_kernel +{ +public: + unsigned int tile_size; + + merge_path_kernel() : meta_kernel("merge_path") + { + tile_size = 4; + } + + template<class InputIterator1, class InputIterator2, + class OutputIterator1, class OutputIterator2, + class Compare> + void set_range(InputIterator1 first1, + InputIterator1 last1, + InputIterator2 first2, + InputIterator2 last2, + OutputIterator1 result_a, + OutputIterator2 result_b, + Compare comp) + { + m_a_count = iterator_range_size(first1, last1); + m_a_count_arg = add_arg<uint_>("a_count"); + + m_b_count = iterator_range_size(first2, last2); + m_b_count_arg = add_arg<uint_>("b_count"); + + *this << + "uint i = get_global_id(0);\n" << + "uint target = (i+1)*" << tile_size << ";\n" << + "uint start = max(convert_int(0),convert_int(target)-convert_int(b_count));\n" << + "uint end = min(target,a_count);\n" << + "uint a_index, b_index;\n" << + "while(start<end)\n" << + "{\n" << + " a_index = (start + end)/2;\n" << + " b_index = target - a_index - 1;\n" << + " if(!(" << comp(first2[expr<uint_>("b_index")], + first1[expr<uint_>("a_index")]) << "))\n" << + " start = a_index + 1;\n" << + " else end = a_index;\n" << + "}\n" << + result_a[expr<uint_>("i")] << " = start;\n" << + result_b[expr<uint_>("i")] << " = target 
- start;\n"; + } + + template<class InputIterator1, class InputIterator2, + class OutputIterator1, class OutputIterator2> + void set_range(InputIterator1 first1, + InputIterator1 last1, + InputIterator2 first2, + InputIterator2 last2, + OutputIterator1 result_a, + OutputIterator2 result_b) + { + typedef typename std::iterator_traits<InputIterator1>::value_type value_type; + ::boost::compute::less<value_type> less_than; + set_range(first1, last1, first2, last2, result_a, result_b, less_than); + } + + event exec(command_queue &queue) + { + if((m_a_count + m_b_count)/tile_size == 0) { + return event(); + } + + set_arg(m_a_count_arg, uint_(m_a_count)); + set_arg(m_b_count_arg, uint_(m_b_count)); + + return exec_1d(queue, 0, (m_a_count + m_b_count)/tile_size); + } + +private: + size_t m_a_count; + size_t m_a_count_arg; + size_t m_b_count; + size_t m_b_count_arg; +}; + +} //end detail namespace +} //end compute namespace +} //end boost namespace + +#endif // BOOST_COMPUTE_ALGORITHM_DETAIL_MERGE_PATH_HPP diff --git a/boost/compute/algorithm/detail/merge_sort_on_cpu.hpp b/boost/compute/algorithm/detail/merge_sort_on_cpu.hpp new file mode 100644 index 0000000000..f4b53f10ae --- /dev/null +++ b/boost/compute/algorithm/detail/merge_sort_on_cpu.hpp @@ -0,0 +1,366 @@ +//---------------------------------------------------------------------------// +// Copyright (c) 2015 Jakub Szuppe <j.szuppe@gmail.com> +// +// Distributed under the Boost Software License, Version 1.0 +// See accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt +// +// See http://boostorg.github.com/compute for more information. 
+//---------------------------------------------------------------------------// + +#ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_MERGE_SORT_ON_CPU_HPP +#define BOOST_COMPUTE_ALGORITHM_DETAIL_MERGE_SORT_ON_CPU_HPP + +#include <boost/compute/kernel.hpp> +#include <boost/compute/program.hpp> +#include <boost/compute/command_queue.hpp> +#include <boost/compute/algorithm/detail/merge_with_merge_path.hpp> +#include <boost/compute/container/vector.hpp> +#include <boost/compute/detail/meta_kernel.hpp> +#include <boost/compute/detail/iterator_range_size.hpp> + +namespace boost { +namespace compute { +namespace detail { + +template<class KeyIterator, class ValueIterator, class Compare> +inline void merge_blocks(KeyIterator keys_first, + ValueIterator values_first, + KeyIterator keys_result, + ValueIterator values_result, + Compare compare, + size_t count, + const size_t block_size, + const bool sort_by_key, + command_queue &queue) +{ + (void) values_result; + (void) values_first; + + meta_kernel k("merge_sort_on_cpu_merge_blocks"); + size_t count_arg = k.add_arg<const uint_>("count"); + size_t block_size_arg = k.add_arg<uint_>("block_size"); + + k << + k.decl<uint_>("b1_start") << " = get_global_id(0) * block_size * 2;\n" << + k.decl<uint_>("b1_end") << " = min(count, b1_start + block_size);\n" << + k.decl<uint_>("b2_start") << " = min(count, b1_start + block_size);\n" << + k.decl<uint_>("b2_end") << " = min(count, b2_start + block_size);\n" << + k.decl<uint_>("result_idx") << " = b1_start;\n" << + + // merging block 1 and block 2 (stable) + "while(b1_start < b1_end && b2_start < b2_end){\n" << + " if( " << compare(keys_first[k.var<uint_>("b2_start")], + keys_first[k.var<uint_>("b1_start")]) << "){\n" << + " " << keys_result[k.var<uint_>("result_idx")] << " = " << + keys_first[k.var<uint_>("b2_start")] << ";\n"; + if(sort_by_key){ + k << + " " << values_result[k.var<uint_>("result_idx")] << " = " << + values_first[k.var<uint_>("b2_start")] << ";\n"; + } + k << + " b2_start++;\n" 
<< + " }\n" << + " else {\n" << + " " << keys_result[k.var<uint_>("result_idx")] << " = " << + keys_first[k.var<uint_>("b1_start")] << ";\n"; + if(sort_by_key){ + k << + " " << values_result[k.var<uint_>("result_idx")] << " = " << + values_first[k.var<uint_>("b1_start")] << ";\n"; + } + k << + " b1_start++;\n" << + " }\n" << + " result_idx++;\n" << + "}\n" << + "while(b1_start < b1_end){\n" << + " " << keys_result[k.var<uint_>("result_idx")] << " = " << + keys_first[k.var<uint_>("b1_start")] << ";\n"; + if(sort_by_key){ + k << + " " << values_result[k.var<uint_>("result_idx")] << " = " << + values_first[k.var<uint_>("b1_start")] << ";\n"; + } + k << + " b1_start++;\n" << + " result_idx++;\n" << + "}\n" << + "while(b2_start < b2_end){\n" << + " " << keys_result[k.var<uint_>("result_idx")] << " = " << + keys_first[k.var<uint_>("b2_start")] << ";\n"; + if(sort_by_key){ + k << + " " << values_result[k.var<uint_>("result_idx")] << " = " << + values_first[k.var<uint_>("b2_start")] << ";\n"; + } + k << + " b2_start++;\n" << + " result_idx++;\n" << + "}\n"; + + const context &context = queue.get_context(); + ::boost::compute::kernel kernel = k.compile(context); + kernel.set_arg(count_arg, static_cast<const uint_>(count)); + kernel.set_arg(block_size_arg, static_cast<uint_>(block_size)); + + const size_t global_size = static_cast<size_t>( + std::ceil(float(count) / (2 * block_size)) + ); + queue.enqueue_1d_range_kernel(kernel, 0, global_size, 0); +} + +template<class Iterator, class Compare> +inline void merge_blocks(Iterator first, + Iterator result, + Compare compare, + size_t count, + const size_t block_size, + const bool sort_by_key, + command_queue &queue) +{ + // dummy iterator as it's not sort by key + Iterator dummy; + merge_blocks(first, dummy, result, dummy, compare, count, block_size, false, queue); +} + +template<class Iterator, class Compare> +inline void dispatch_merge_blocks(Iterator first, + Iterator result, + Compare compare, + size_t count, + const size_t 
block_size, + const size_t input_size_threshold, + const size_t blocks_no_threshold, + command_queue &queue) +{ + const size_t blocks_no = static_cast<size_t>( + std::ceil(float(count) / block_size) + ); + // merge with merge path should used only for the large arrays and at the + // end of merging part when there are only a few big blocks left to be merged + if(blocks_no <= blocks_no_threshold && count >= input_size_threshold){ + Iterator last = first + count; + for(size_t i = 0; i < count; i+= 2*block_size) + { + Iterator first1 = (std::min)(first + i, last); + Iterator last1 = (std::min)(first1 + block_size, last); + Iterator first2 = last1; + Iterator last2 = (std::min)(first2 + block_size, last); + Iterator block_result = (std::min)(result + i, result + count); + merge_with_merge_path(first1, last1, first2, last2, + block_result, compare, queue); + } + } + else { + merge_blocks(first, result, compare, count, block_size, false, queue); + } +} + +template<class KeyIterator, class ValueIterator, class Compare> +inline void block_insertion_sort(KeyIterator keys_first, + ValueIterator values_first, + Compare compare, + const size_t count, + const size_t block_size, + const bool sort_by_key, + command_queue &queue) +{ + (void) values_first; + + typedef typename std::iterator_traits<KeyIterator>::value_type K; + typedef typename std::iterator_traits<ValueIterator>::value_type T; + + meta_kernel k("merge_sort_on_cpu_block_insertion_sort"); + size_t count_arg = k.add_arg<uint_>("count"); + size_t block_size_arg = k.add_arg<uint_>("block_size"); + + k << + k.decl<uint_>("start") << " = get_global_id(0) * block_size;\n" << + k.decl<uint_>("end") << " = min(count, start + block_size);\n" << + + // block insertion sort (stable) + "for(uint i = start+1; i < end; i++){\n" << + " " << k.decl<const K>("key") << " = " << + keys_first[k.var<uint_>("i")] << ";\n"; + if(sort_by_key){ + k << + " " << k.decl<const T>("value") << " = " << + values_first[k.var<uint_>("i")] << ";\n"; + 
} + k << + " uint pos = i;\n" << + " while(pos > start && " << + compare(k.var<const K>("key"), + keys_first[k.var<uint_>("pos-1")]) << "){\n" << + " " << keys_first[k.var<uint_>("pos")] << " = " << + keys_first[k.var<uint_>("pos-1")] << ";\n"; + if(sort_by_key){ + k << + " " << values_first[k.var<uint_>("pos")] << " = " << + values_first[k.var<uint_>("pos-1")] << ";\n"; + } + k << + " pos--;\n" << + " }\n" << + " " << keys_first[k.var<uint_>("pos")] << " = key;\n"; + if(sort_by_key) { + k << + " " << values_first[k.var<uint_>("pos")] << " = value;\n"; + } + k << + "}\n"; // block insertion sort + + const context &context = queue.get_context(); + ::boost::compute::kernel kernel = k.compile(context); + kernel.set_arg(count_arg, static_cast<uint_>(count)); + kernel.set_arg(block_size_arg, static_cast<uint_>(block_size)); + + const size_t global_size = static_cast<size_t>(std::ceil(float(count) / block_size)); + queue.enqueue_1d_range_kernel(kernel, 0, global_size, 0); +} + +template<class Iterator, class Compare> +inline void block_insertion_sort(Iterator first, + Compare compare, + const size_t count, + const size_t block_size, + command_queue &queue) +{ + // dummy iterator as it's not sort by key + Iterator dummy; + block_insertion_sort(first, dummy, compare, count, block_size, false, queue); +} + +// This sort is stable. 
+template<class Iterator, class Compare> +inline void merge_sort_on_cpu(Iterator first, + Iterator last, + Compare compare, + command_queue &queue) +{ + typedef typename std::iterator_traits<Iterator>::value_type value_type; + + size_t count = iterator_range_size(first, last); + if(count < 2){ + return; + } + // for small input size only insertion sort is performed + else if(count <= 512){ + block_insertion_sort(first, compare, count, count, queue); + return; + } + + const context &context = queue.get_context(); + const device &device = queue.get_device(); + + // loading parameters + std::string cache_key = + std::string("__boost_merge_sort_on_cpu_") + type_name<value_type>(); + boost::shared_ptr<parameter_cache> parameters = + detail::parameter_cache::get_global_cache(device); + + // When there is merge_with_path_blocks_no_threshold or less blocks left to + // merge AND input size is merge_with_merge_path_input_size_threshold or more + // merge_with_merge_path() algorithm is used to merge sorted blocks; + // otherwise merge_blocks() is used. 
+ const size_t merge_with_path_blocks_no_threshold = + parameters->get(cache_key, "merge_with_merge_path_blocks_no_threshold", 8); + const size_t merge_with_path_input_size_threshold = + parameters->get(cache_key, "merge_with_merge_path_input_size_threshold", 2097152); + + const size_t block_size = + parameters->get(cache_key, "insertion_sort_block_size", 64); + block_insertion_sort(first, compare, count, block_size, queue); + + // temporary buffer for merge result + vector<value_type> temp(count, context); + bool result_in_temporary_buffer = false; + + for(size_t i = block_size; i < count; i *= 2){ + result_in_temporary_buffer = !result_in_temporary_buffer; + if(result_in_temporary_buffer) { + dispatch_merge_blocks(first, temp.begin(), compare, count, i, + merge_with_path_input_size_threshold, + merge_with_path_blocks_no_threshold, + queue); + } else { + dispatch_merge_blocks(temp.begin(), first, compare, count, i, + merge_with_path_input_size_threshold, + merge_with_path_blocks_no_threshold, + queue); + } + } + + if(result_in_temporary_buffer) { + copy(temp.begin(), temp.end(), first, queue); + } +} + +// This sort is stable. 
+template<class KeyIterator, class ValueIterator, class Compare> +inline void merge_sort_by_key_on_cpu(KeyIterator keys_first, + KeyIterator keys_last, + ValueIterator values_first, + Compare compare, + command_queue &queue) +{ + typedef typename std::iterator_traits<KeyIterator>::value_type key_type; + typedef typename std::iterator_traits<ValueIterator>::value_type value_type; + + size_t count = iterator_range_size(keys_first, keys_last); + if(count < 2){ + return; + } + // for small input size only insertion sort is performed + else if(count <= 512){ + block_insertion_sort(keys_first, values_first, compare, + count, count, true, queue); + return; + } + + const context &context = queue.get_context(); + const device &device = queue.get_device(); + + // loading parameters + std::string cache_key = + std::string("__boost_merge_sort_by_key_on_cpu_") + type_name<value_type>() + + "_with_" + type_name<key_type>(); + boost::shared_ptr<parameter_cache> parameters = + detail::parameter_cache::get_global_cache(device); + + const size_t block_size = + parameters->get(cache_key, "insertion_sort_by_key_block_size", 64); + block_insertion_sort(keys_first, values_first, compare, + count, block_size, true, queue); + + // temporary buffer for merge results + vector<value_type> values_temp(count, context); + vector<key_type> keys_temp(count, context); + bool result_in_temporary_buffer = false; + + for(size_t i = block_size; i < count; i *= 2){ + result_in_temporary_buffer = !result_in_temporary_buffer; + if(result_in_temporary_buffer) { + merge_blocks(keys_first, values_first, + keys_temp.begin(), values_temp.begin(), + compare, count, i, true, queue); + } else { + merge_blocks(keys_temp.begin(), values_temp.begin(), + keys_first, values_first, + compare, count, i, true, queue); + } + } + + if(result_in_temporary_buffer) { + copy(keys_temp.begin(), keys_temp.end(), keys_first, queue); + copy(values_temp.begin(), values_temp.end(), values_first, queue); + } +} + +} // end detail 
namespace +} // end compute namespace +} // end boost namespace + +#endif // BOOST_COMPUTE_ALGORITHM_DETAIL_MERGE_SORT_ON_CPU_HPP diff --git a/boost/compute/algorithm/detail/merge_with_merge_path.hpp b/boost/compute/algorithm/detail/merge_with_merge_path.hpp new file mode 100644 index 0000000000..c3cc5e8e9c --- /dev/null +++ b/boost/compute/algorithm/detail/merge_with_merge_path.hpp @@ -0,0 +1,203 @@ +//---------------------------------------------------------------------------// +// Copyright (c) 2014 Roshan <thisisroshansmail@gmail.com> +// +// Distributed under the Boost Software License, Version 1.0 +// See accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt +// +// See http://boostorg.github.com/compute for more information. +//---------------------------------------------------------------------------// + +#ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_MERGE_WIH_MERGE_PATH_HPP +#define BOOST_COMPUTE_ALGORITHM_DETAIL_MERGE_WIH_MERGE_PATH_HPP + +#include <iterator> + +#include <boost/compute/algorithm/detail/merge_path.hpp> +#include <boost/compute/algorithm/fill_n.hpp> +#include <boost/compute/container/vector.hpp> +#include <boost/compute/detail/iterator_range_size.hpp> +#include <boost/compute/detail/meta_kernel.hpp> +#include <boost/compute/system.hpp> + +namespace boost { +namespace compute { +namespace detail { + +/// +/// \brief Serial merge kernel class +/// +/// Subclass of meta_kernel to perform serial merge after tiling +/// +class serial_merge_kernel : meta_kernel +{ +public: + unsigned int tile_size; + + serial_merge_kernel() : meta_kernel("merge") + { + tile_size = 4; + } + + template<class InputIterator1, class InputIterator2, + class InputIterator3, class InputIterator4, + class OutputIterator, class Compare> + void set_range(InputIterator1 first1, + InputIterator2 first2, + InputIterator3 tile_first1, + InputIterator3 tile_last1, + InputIterator4 tile_first2, + OutputIterator result, + Compare comp) + { + m_count = 
iterator_range_size(tile_first1, tile_last1) - 1; + + *this << + "uint i = get_global_id(0);\n" << + "uint start1 = " << tile_first1[expr<uint_>("i")] << ";\n" << + "uint end1 = " << tile_first1[expr<uint_>("i+1")] << ";\n" << + "uint start2 = " << tile_first2[expr<uint_>("i")] << ";\n" << + "uint end2 = " << tile_first2[expr<uint_>("i+1")] << ";\n" << + "uint index = i*" << tile_size << ";\n" << + "while(start1<end1 && start2<end2)\n" << + "{\n" << + " if(!(" << comp(first2[expr<uint_>("start2")], + first1[expr<uint_>("start1")]) << "))\n" << + " {\n" << + result[expr<uint_>("index")] << + " = " << first1[expr<uint_>("start1")] << ";\n" << + " index++;\n" << + " start1++;\n" << + " }\n" << + " else\n" << + " {\n" << + result[expr<uint_>("index")] << + " = " << first2[expr<uint_>("start2")] << ";\n" << + " index++;\n" << + " start2++;\n" << + " }\n" << + "}\n" << + "while(start1<end1)\n" << + "{\n" << + result[expr<uint_>("index")] << + " = " << first1[expr<uint_>("start1")] << ";\n" << + " index++;\n" << + " start1++;\n" << + "}\n" << + "while(start2<end2)\n" << + "{\n" << + result[expr<uint_>("index")] << + " = " << first2[expr<uint_>("start2")] << ";\n" << + " index++;\n" << + " start2++;\n" << + "}\n"; + } + + template<class InputIterator1, class InputIterator2, + class InputIterator3, class InputIterator4, + class OutputIterator> + void set_range(InputIterator1 first1, + InputIterator2 first2, + InputIterator3 tile_first1, + InputIterator3 tile_last1, + InputIterator4 tile_first2, + OutputIterator result) + { + typedef typename std::iterator_traits<InputIterator1>::value_type value_type; + ::boost::compute::less<value_type> less_than; + set_range(first1, first2, tile_first1, tile_last1, tile_first2, result, less_than); + } + + event exec(command_queue &queue) + { + if(m_count == 0) { + return event(); + } + + return exec_1d(queue, 0, m_count); + } + +private: + size_t m_count; +}; + +/// +/// \brief Merge algorithm with merge path +/// +/// Merges the sorted 
values in the range [\p first1, \p last1) with +/// the sorted values in the range [\p first2, last2) and stores the +/// result in the range beginning at \p result +/// +/// \param first1 Iterator pointing to start of first set +/// \param last1 Iterator pointing to end of first set +/// \param first2 Iterator pointing to start of second set +/// \param last2 Iterator pointing to end of second set +/// \param result Iterator pointing to start of range in which the result +/// will be stored +/// \param comp Comparator which performs less than function +/// \param queue Queue on which to execute +/// +template<class InputIterator1, class InputIterator2, class OutputIterator, class Compare> +inline OutputIterator +merge_with_merge_path(InputIterator1 first1, + InputIterator1 last1, + InputIterator2 first2, + InputIterator2 last2, + OutputIterator result, + Compare comp, + command_queue &queue = system::default_queue()) +{ + typedef typename + std::iterator_traits<OutputIterator>::difference_type result_difference_type; + + size_t tile_size = 1024; + + size_t count1 = iterator_range_size(first1, last1); + size_t count2 = iterator_range_size(first2, last2); + + vector<uint_> tile_a((count1+count2+tile_size-1)/tile_size+1, queue.get_context()); + vector<uint_> tile_b((count1+count2+tile_size-1)/tile_size+1, queue.get_context()); + + // Tile the sets + merge_path_kernel tiling_kernel; + tiling_kernel.tile_size = static_cast<unsigned int>(tile_size); + tiling_kernel.set_range(first1, last1, first2, last2, + tile_a.begin()+1, tile_b.begin()+1, comp); + fill_n(tile_a.begin(), 1, uint_(0), queue); + fill_n(tile_b.begin(), 1, uint_(0), queue); + tiling_kernel.exec(queue); + + fill_n(tile_a.end()-1, 1, static_cast<uint_>(count1), queue); + fill_n(tile_b.end()-1, 1, static_cast<uint_>(count2), queue); + + // Merge + serial_merge_kernel merge_kernel; + merge_kernel.tile_size = static_cast<unsigned int>(tile_size); + merge_kernel.set_range(first1, first2, tile_a.begin(), 
tile_a.end(), + tile_b.begin(), result, comp); + + merge_kernel.exec(queue); + + return result + static_cast<result_difference_type>(count1 + count2); +} + +/// \overload +template<class InputIterator1, class InputIterator2, class OutputIterator> +inline OutputIterator +merge_with_merge_path(InputIterator1 first1, + InputIterator1 last1, + InputIterator2 first2, + InputIterator2 last2, + OutputIterator result, + command_queue &queue = system::default_queue()) +{ + typedef typename std::iterator_traits<InputIterator1>::value_type value_type; + ::boost::compute::less<value_type> less_than; + return merge_with_merge_path(first1, last1, first2, last2, result, less_than, queue); +} + +} //end detail namespace +} //end compute namespace +} //end boost namespace + +#endif // BOOST_COMPUTE_ALGORITHM_DETAIL_MERGE_WIH_MERGE_PATH_HPP diff --git a/boost/compute/algorithm/detail/radix_sort.hpp b/boost/compute/algorithm/detail/radix_sort.hpp new file mode 100644 index 0000000000..c2ba4ed17c --- /dev/null +++ b/boost/compute/algorithm/detail/radix_sort.hpp @@ -0,0 +1,415 @@ +//---------------------------------------------------------------------------// +// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com> +// +// Distributed under the Boost Software License, Version 1.0 +// See accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt +// +// See http://boostorg.github.com/compute for more information. 
+//---------------------------------------------------------------------------// + +#ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_RADIX_SORT_HPP +#define BOOST_COMPUTE_ALGORITHM_DETAIL_RADIX_SORT_HPP + +#include <iterator> + +#include <boost/assert.hpp> +#include <boost/type_traits/is_signed.hpp> +#include <boost/type_traits/is_floating_point.hpp> + +#include <boost/compute/kernel.hpp> +#include <boost/compute/program.hpp> +#include <boost/compute/command_queue.hpp> +#include <boost/compute/algorithm/exclusive_scan.hpp> +#include <boost/compute/container/vector.hpp> +#include <boost/compute/detail/iterator_range_size.hpp> +#include <boost/compute/detail/parameter_cache.hpp> +#include <boost/compute/type_traits/type_name.hpp> +#include <boost/compute/type_traits/is_fundamental.hpp> +#include <boost/compute/type_traits/is_vector_type.hpp> +#include <boost/compute/utility/program_cache.hpp> + +namespace boost { +namespace compute { +namespace detail { + +// meta-function returning true if type T is radix-sortable +template<class T> +struct is_radix_sortable : + boost::mpl::and_< + typename ::boost::compute::is_fundamental<T>::type, + typename boost::mpl::not_<typename is_vector_type<T>::type>::type + > +{ +}; + +template<size_t N> +struct radix_sort_value_type +{ +}; + +template<> +struct radix_sort_value_type<1> +{ + typedef uchar_ type; +}; + +template<> +struct radix_sort_value_type<2> +{ + typedef ushort_ type; +}; + +template<> +struct radix_sort_value_type<4> +{ + typedef uint_ type; +}; + +template<> +struct radix_sort_value_type<8> +{ + typedef ulong_ type; +}; + +template<typename T> +inline const char* enable_double() +{ + return " -DT2_double=0"; +} + +template<> +inline const char* enable_double<double>() +{ + return " -DT2_double=1"; +} + +const char radix_sort_source[] = +"#if T2_double\n" +"#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n" +"#endif\n" +"#define K2_BITS (1 << K_BITS)\n" +"#define RADIX_MASK ((((T)(1)) << K_BITS) - 1)\n" +"#define SIGN_BIT 
((sizeof(T) * CHAR_BIT) - 1)\n" + +"inline uint radix(const T x, const uint low_bit)\n" +"{\n" +"#if defined(IS_FLOATING_POINT)\n" +" const T mask = -(x >> SIGN_BIT) | (((T)(1)) << SIGN_BIT);\n" +" return ((x ^ mask) >> low_bit) & RADIX_MASK;\n" +"#elif defined(IS_SIGNED)\n" +" return ((x ^ (((T)(1)) << SIGN_BIT)) >> low_bit) & RADIX_MASK;\n" +"#else\n" +" return (x >> low_bit) & RADIX_MASK;\n" +"#endif\n" +"}\n" + +"__kernel void count(__global const T *input,\n" +" const uint input_offset,\n" +" const uint input_size,\n" +" __global uint *global_counts,\n" +" __global uint *global_offsets,\n" +" __local uint *local_counts,\n" +" const uint low_bit)\n" +"{\n" + // work-item parameters +" const uint gid = get_global_id(0);\n" +" const uint lid = get_local_id(0);\n" + + // zero local counts +" if(lid < K2_BITS){\n" +" local_counts[lid] = 0;\n" +" }\n" +" barrier(CLK_LOCAL_MEM_FENCE);\n" + + // reduce local counts +" if(gid < input_size){\n" +" T value = input[input_offset+gid];\n" +" uint bucket = radix(value, low_bit);\n" +" atomic_inc(local_counts + bucket);\n" +" }\n" +" barrier(CLK_LOCAL_MEM_FENCE);\n" + + // write block-relative offsets +" if(lid < K2_BITS){\n" +" global_counts[K2_BITS*get_group_id(0) + lid] = local_counts[lid];\n" + + // write global offsets +" if(get_group_id(0) == (get_num_groups(0) - 1)){\n" +" global_offsets[lid] = local_counts[lid];\n" +" }\n" +" }\n" +"}\n" + +"__kernel void scan(__global const uint *block_offsets,\n" +" __global uint *global_offsets,\n" +" const uint block_count)\n" +"{\n" +" __global const uint *last_block_offsets =\n" +" block_offsets + K2_BITS * (block_count - 1);\n" + + // calculate and scan global_offsets +" uint sum = 0;\n" +" for(uint i = 0; i < K2_BITS; i++){\n" +" uint x = global_offsets[i] + last_block_offsets[i];\n" +" global_offsets[i] = sum;\n" +" sum += x;\n" +" }\n" +"}\n" + +"__kernel void scatter(__global const T *input,\n" +" const uint input_offset,\n" +" const uint input_size,\n" +" const uint 
low_bit,\n" +" __global const uint *counts,\n" +" __global const uint *global_offsets,\n" +"#ifndef SORT_BY_KEY\n" +" __global T *output,\n" +" const uint output_offset)\n" +"#else\n" +" __global T *keys_output,\n" +" const uint keys_output_offset,\n" +" __global T2 *values_input,\n" +" const uint values_input_offset,\n" +" __global T2 *values_output,\n" +" const uint values_output_offset)\n" +"#endif\n" +"{\n" + // work-item parameters +" const uint gid = get_global_id(0);\n" +" const uint lid = get_local_id(0);\n" + + // copy input to local memory +" T value;\n" +" uint bucket;\n" +" __local uint local_input[BLOCK_SIZE];\n" +" if(gid < input_size){\n" +" value = input[input_offset+gid];\n" +" bucket = radix(value, low_bit);\n" +" local_input[lid] = bucket;\n" +" }\n" + + // copy block counts to local memory +" __local uint local_counts[(1 << K_BITS)];\n" +" if(lid < K2_BITS){\n" +" local_counts[lid] = counts[get_group_id(0) * K2_BITS + lid];\n" +" }\n" + + // wait until local memory is ready +" barrier(CLK_LOCAL_MEM_FENCE);\n" + +" if(gid >= input_size){\n" +" return;\n" +" }\n" + + // get global offset +" uint offset = global_offsets[bucket] + local_counts[bucket];\n" + + // calculate local offset +" uint local_offset = 0;\n" +" for(uint i = 0; i < lid; i++){\n" +" if(local_input[i] == bucket)\n" +" local_offset++;\n" +" }\n" + +"#ifndef SORT_BY_KEY\n" + // write value to output +" output[output_offset + offset + local_offset] = value;\n" +"#else\n" + // write key and value if doing sort_by_key +" keys_output[keys_output_offset+offset + local_offset] = value;\n" +" values_output[values_output_offset+offset + local_offset] =\n" +" values_input[values_input_offset+gid];\n" +"#endif\n" +"}\n"; + +template<class T, class T2> +inline void radix_sort_impl(const buffer_iterator<T> first, + const buffer_iterator<T> last, + const buffer_iterator<T2> values_first, + command_queue &queue) +{ + + typedef T value_type; + typedef typename 
radix_sort_value_type<sizeof(T)>::type sort_type; + + const device &device = queue.get_device(); + const context &context = queue.get_context(); + + + // if we have a valid values iterator then we are doing a + // sort by key and have to set up the values buffer + bool sort_by_key = (values_first.get_buffer().get() != 0); + + // load (or create) radix sort program + std::string cache_key = + std::string("__boost_radix_sort_") + type_name<value_type>(); + + if(sort_by_key){ + cache_key += std::string("_with_") + type_name<T2>(); + } + + boost::shared_ptr<program_cache> cache = + program_cache::get_global_cache(context); + boost::shared_ptr<parameter_cache> parameters = + detail::parameter_cache::get_global_cache(device); + + // sort parameters + const uint_ k = parameters->get(cache_key, "k", 4); + const uint_ k2 = 1 << k; + const uint_ block_size = parameters->get(cache_key, "tpb", 128); + + // sort program compiler options + std::stringstream options; + options << "-DK_BITS=" << k; + options << " -DT=" << type_name<sort_type>(); + options << " -DBLOCK_SIZE=" << block_size; + + if(boost::is_floating_point<value_type>::value){ + options << " -DIS_FLOATING_POINT"; + } + + if(boost::is_signed<value_type>::value){ + options << " -DIS_SIGNED"; + } + + if(sort_by_key){ + options << " -DSORT_BY_KEY"; + options << " -DT2=" << type_name<T2>(); + options << enable_double<T2>(); + } + + // load radix sort program + program radix_sort_program = cache->get_or_build( + cache_key, options.str(), radix_sort_source, context + ); + + kernel count_kernel(radix_sort_program, "count"); + kernel scan_kernel(radix_sort_program, "scan"); + kernel scatter_kernel(radix_sort_program, "scatter"); + + size_t count = detail::iterator_range_size(first, last); + + uint_ block_count = static_cast<uint_>(count / block_size); + if(block_count * block_size != count){ + block_count++; + } + + // setup temporary buffers + vector<value_type> output(count, context); + vector<T2> values_output(sort_by_key 
? count : 0, context); + vector<uint_> offsets(k2, context); + vector<uint_> counts(block_count * k2, context); + + const buffer *input_buffer = &first.get_buffer(); + uint_ input_offset = static_cast<uint_>(first.get_index()); + const buffer *output_buffer = &output.get_buffer(); + uint_ output_offset = 0; + const buffer *values_input_buffer = &values_first.get_buffer(); + uint_ values_input_offset = static_cast<uint_>(values_first.get_index()); + const buffer *values_output_buffer = &values_output.get_buffer(); + uint_ values_output_offset = 0; + + for(uint_ i = 0; i < sizeof(sort_type) * CHAR_BIT / k; i++){ + // write counts + count_kernel.set_arg(0, *input_buffer); + count_kernel.set_arg(1, input_offset); + count_kernel.set_arg(2, static_cast<uint_>(count)); + count_kernel.set_arg(3, counts); + count_kernel.set_arg(4, offsets); + count_kernel.set_arg(5, block_size * sizeof(uint_), 0); + count_kernel.set_arg(6, i * k); + queue.enqueue_1d_range_kernel(count_kernel, + 0, + block_count * block_size, + block_size); + + // scan counts + if(k == 1){ + typedef uint2_ counter_type; + ::boost::compute::exclusive_scan( + make_buffer_iterator<counter_type>(counts.get_buffer(), 0), + make_buffer_iterator<counter_type>(counts.get_buffer(), counts.size() / 2), + make_buffer_iterator<counter_type>(counts.get_buffer()), + queue + ); + } + else if(k == 2){ + typedef uint4_ counter_type; + ::boost::compute::exclusive_scan( + make_buffer_iterator<counter_type>(counts.get_buffer(), 0), + make_buffer_iterator<counter_type>(counts.get_buffer(), counts.size() / 4), + make_buffer_iterator<counter_type>(counts.get_buffer()), + queue + ); + } + else if(k == 4){ + typedef uint16_ counter_type; + ::boost::compute::exclusive_scan( + make_buffer_iterator<counter_type>(counts.get_buffer(), 0), + make_buffer_iterator<counter_type>(counts.get_buffer(), counts.size() / 16), + make_buffer_iterator<counter_type>(counts.get_buffer()), + queue + ); + } + else { + BOOST_ASSERT(false && "unknown k"); 
+ break; + } + + // scan global offsets + scan_kernel.set_arg(0, counts); + scan_kernel.set_arg(1, offsets); + scan_kernel.set_arg(2, block_count); + queue.enqueue_task(scan_kernel); + + // scatter values + scatter_kernel.set_arg(0, *input_buffer); + scatter_kernel.set_arg(1, input_offset); + scatter_kernel.set_arg(2, static_cast<uint_>(count)); + scatter_kernel.set_arg(3, i * k); + scatter_kernel.set_arg(4, counts); + scatter_kernel.set_arg(5, offsets); + scatter_kernel.set_arg(6, *output_buffer); + scatter_kernel.set_arg(7, output_offset); + if(sort_by_key){ + scatter_kernel.set_arg(8, *values_input_buffer); + scatter_kernel.set_arg(9, values_input_offset); + scatter_kernel.set_arg(10, *values_output_buffer); + scatter_kernel.set_arg(11, values_output_offset); + } + queue.enqueue_1d_range_kernel(scatter_kernel, + 0, + block_count * block_size, + block_size); + + // swap buffers + std::swap(input_buffer, output_buffer); + std::swap(values_input_buffer, values_output_buffer); + std::swap(input_offset, output_offset); + std::swap(values_input_offset, values_output_offset); + } +} + +template<class Iterator> +inline void radix_sort(Iterator first, + Iterator last, + command_queue &queue) +{ + radix_sort_impl(first, last, buffer_iterator<int>(), queue); +} + +template<class KeyIterator, class ValueIterator> +inline void radix_sort_by_key(KeyIterator keys_first, + KeyIterator keys_last, + ValueIterator values_first, + command_queue &queue) +{ + radix_sort_impl(keys_first, keys_last, values_first, queue); +} + +} // end detail namespace +} // end compute namespace +} // end boost namespace + +#endif // BOOST_COMPUTE_ALGORITHM_DETAIL_RADIX_SORT_HPP diff --git a/boost/compute/algorithm/detail/random_fill.hpp b/boost/compute/algorithm/detail/random_fill.hpp new file mode 100644 index 0000000000..5c3827a9f8 --- /dev/null +++ b/boost/compute/algorithm/detail/random_fill.hpp @@ -0,0 +1,57 @@ +//---------------------------------------------------------------------------// +// 
Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com> +// +// Distributed under the Boost Software License, Version 1.0 +// See accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt +// +// See http://boostorg.github.com/compute for more information. +//---------------------------------------------------------------------------// + +#ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_RANDOM_FILL_HPP +#define BOOST_COMPUTE_ALGORITHM_DETAIL_RANDOM_FILL_HPP + +#include <iterator> + +#include <boost/compute/command_queue.hpp> +#include <boost/compute/random/default_random_engine.hpp> +#include <boost/compute/random/uniform_real_distribution.hpp> + +namespace boost { +namespace compute { +namespace detail { + +template<class OutputIterator, class Generator> +inline void random_fill(OutputIterator first, + OutputIterator last, + Generator &g, + command_queue &queue) +{ + g.fill(first, last, queue); +} + +template<class OutputIterator> +inline void +random_fill(OutputIterator first, + OutputIterator last, + typename std::iterator_traits<OutputIterator>::value_type lo, + typename std::iterator_traits<OutputIterator>::value_type hi, + command_queue &queue) +{ + typedef typename + std::iterator_traits<OutputIterator>::value_type value_type; + typedef typename + boost::compute::default_random_engine engine_type; + typedef typename + boost::compute::uniform_real_distribution<value_type> distribution_type; + + engine_type engine(queue); + distribution_type generator(lo, hi); + generator.fill(first, last, engine, queue); +} + +} // end detail namespace +} // end compute namespace +} // end boost namespace + +#endif // BOOST_COMPUTE_ALGORITHM_DETAIL_RANDOM_FILL_HPP diff --git a/boost/compute/algorithm/detail/reduce_by_key.hpp b/boost/compute/algorithm/detail/reduce_by_key.hpp new file mode 100644 index 0000000000..65844c9ebf --- /dev/null +++ b/boost/compute/algorithm/detail/reduce_by_key.hpp @@ -0,0 +1,119 @@ 
+//---------------------------------------------------------------------------// +// Copyright (c) 2015 Jakub Szuppe <j.szuppe@gmail.com> +// +// Distributed under the Boost Software License, Version 1.0 +// See accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt +// +// See http://boostorg.github.com/compute for more information. +//---------------------------------------------------------------------------// + +#ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_REDUCE_BY_KEY_HPP +#define BOOST_COMPUTE_ALGORITHM_DETAIL_REDUCE_BY_KEY_HPP + +#include <algorithm> +#include <iterator> + +#include <boost/compute/command_queue.hpp> +#include <boost/compute/functional.hpp> +#include <boost/compute/container/vector.hpp> +#include <boost/compute/detail/iterator_range_size.hpp> +#include <boost/compute/algorithm/detail/serial_reduce_by_key.hpp> +#include <boost/compute/algorithm/detail/reduce_by_key_with_scan.hpp> +#include <boost/compute/type_traits.hpp> + +namespace boost { +namespace compute { +namespace detail { + +template<class InputKeyIterator, class InputValueIterator, + class OutputKeyIterator, class OutputValueIterator, + class BinaryFunction, class BinaryPredicate> +size_t reduce_by_key_on_gpu(InputKeyIterator keys_first, + InputKeyIterator keys_last, + InputValueIterator values_first, + OutputKeyIterator keys_result, + OutputValueIterator values_result, + BinaryFunction function, + BinaryPredicate predicate, + command_queue &queue) +{ + return detail::reduce_by_key_with_scan(keys_first, keys_last, values_first, + keys_result, values_result, function, + predicate, queue); +} + +template<class InputKeyIterator, class InputValueIterator, + class OutputKeyIterator, class OutputValueIterator> +bool reduce_by_key_on_gpu_requirements_met(InputKeyIterator keys_first, + InputValueIterator values_first, + OutputKeyIterator keys_result, + OutputValueIterator values_result, + const size_t count, + command_queue &queue) +{ + const device &device = 
queue.get_device(); + return (count > 256) + && !(device.type() & device::cpu) + && reduce_by_key_with_scan_requirements_met(keys_first, values_first, + keys_result,values_result, + count, queue); + return true; +} + +template<class InputKeyIterator, class InputValueIterator, + class OutputKeyIterator, class OutputValueIterator, + class BinaryFunction, class BinaryPredicate> +inline std::pair<OutputKeyIterator, OutputValueIterator> +dispatch_reduce_by_key(InputKeyIterator keys_first, + InputKeyIterator keys_last, + InputValueIterator values_first, + OutputKeyIterator keys_result, + OutputValueIterator values_result, + BinaryFunction function, + BinaryPredicate predicate, + command_queue &queue) +{ + typedef typename + std::iterator_traits<OutputKeyIterator>::difference_type key_difference_type; + typedef typename + std::iterator_traits<OutputValueIterator>::difference_type value_difference_type; + + const size_t count = detail::iterator_range_size(keys_first, keys_last); + if (count < 2) { + boost::compute::copy_n(keys_first, count, keys_result, queue); + boost::compute::copy_n(values_first, count, values_result, queue); + return + std::make_pair<OutputKeyIterator, OutputValueIterator>( + keys_result + static_cast<key_difference_type>(count), + values_result + static_cast<value_difference_type>(count) + ); + } + + size_t result_size = 0; + if(reduce_by_key_on_gpu_requirements_met(keys_first, values_first, keys_result, + values_result, count, queue)){ + result_size = + detail::reduce_by_key_on_gpu(keys_first, keys_last, values_first, + keys_result, values_result, function, + predicate, queue); + } + else { + result_size = + detail::serial_reduce_by_key(keys_first, keys_last, values_first, + keys_result, values_result, function, + predicate, queue); + } + + return + std::make_pair<OutputKeyIterator, OutputValueIterator>( + keys_result + static_cast<key_difference_type>(result_size), + values_result + static_cast<value_difference_type>(result_size) + ); +} + +} // end 
detail namespace +} // end compute namespace +} // end boost namespace + +#endif // BOOST_COMPUTE_ALGORITHM_DETAIL_REDUCE_BY_KEY_HPP diff --git a/boost/compute/algorithm/detail/reduce_by_key_with_scan.hpp b/boost/compute/algorithm/detail/reduce_by_key_with_scan.hpp new file mode 100644 index 0000000000..e6852a67eb --- /dev/null +++ b/boost/compute/algorithm/detail/reduce_by_key_with_scan.hpp @@ -0,0 +1,541 @@ +//---------------------------------------------------------------------------// +// Copyright (c) 2015 Jakub Szuppe <j.szuppe@gmail.com> +// +// Distributed under the Boost Software License, Version 1.0 +// See accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt +// +// See http://boostorg.github.com/compute for more information. +//---------------------------------------------------------------------------// + +#ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_REDUCE_BY_KEY_WITH_SCAN_HPP +#define BOOST_COMPUTE_ALGORITHM_DETAIL_REDUCE_BY_KEY_WITH_SCAN_HPP + +#include <algorithm> +#include <iterator> + +#include <boost/compute/command_queue.hpp> +#include <boost/compute/functional.hpp> +#include <boost/compute/algorithm/inclusive_scan.hpp> +#include <boost/compute/container/vector.hpp> +#include <boost/compute/container/detail/scalar.hpp> +#include <boost/compute/detail/meta_kernel.hpp> +#include <boost/compute/detail/iterator_range_size.hpp> +#include <boost/compute/detail/read_write_single_value.hpp> +#include <boost/compute/type_traits.hpp> +#include <boost/compute/utility/program_cache.hpp> + +namespace boost { +namespace compute { +namespace detail { + +/// \internal_ +/// +/// Fills \p new_keys_first with unsigned integer keys generated from vector +/// of original keys \p keys_first. New keys can be distinguish by simple equality +/// predicate. 
+/// +/// \param keys_first iterator pointing to the first key +/// \param number_of_keys number of keys +/// \param predicate binary predicate for key comparison +/// \param new_keys_first iterator pointing to the new keys vector +/// \param preferred_work_group_size preferred work group size +/// \param queue command queue to perform the operation +/// +/// Binary function \p predicate must take two keys as arguments and +/// return true only if they are considered the same. +/// +/// The first new key equals zero and the last equals number of unique keys +/// minus one. +/// +/// No local memory usage. +template<class InputKeyIterator, class BinaryPredicate> +inline void generate_uint_keys(InputKeyIterator keys_first, + size_t number_of_keys, + BinaryPredicate predicate, + vector<uint_>::iterator new_keys_first, + size_t preferred_work_group_size, + command_queue &queue) +{ + typedef typename + std::iterator_traits<InputKeyIterator>::value_type key_type; + + detail::meta_kernel k("reduce_by_key_new_key_flags"); + k.add_set_arg<const uint_>("count", uint_(number_of_keys)); + + k << + k.decl<const uint_>("gid") << " = get_global_id(0);\n" << + k.decl<uint_>("value") << " = 0;\n" << + "if(gid >= count){\n return;\n}\n" << + "if(gid > 0){ \n" << + k.decl<key_type>("key") << " = " << + keys_first[k.var<const uint_>("gid")] << ";\n" << + k.decl<key_type>("previous_key") << " = " << + keys_first[k.var<const uint_>("gid - 1")] << ";\n" << + " value = " << predicate(k.var<key_type>("previous_key"), + k.var<key_type>("key")) << + " ? 
0 : 1;\n" << + "}\n else {\n" << + " value = 0;\n" << + "}\n" << + new_keys_first[k.var<const uint_>("gid")] << " = value;\n"; + + const context &context = queue.get_context(); + kernel kernel = k.compile(context); + + size_t work_group_size = preferred_work_group_size; + size_t work_groups_no = static_cast<size_t>( + std::ceil(float(number_of_keys) / work_group_size) + ); + + queue.enqueue_1d_range_kernel(kernel, + 0, + work_groups_no * work_group_size, + work_group_size); + + inclusive_scan(new_keys_first, new_keys_first + number_of_keys, + new_keys_first, queue); +} + +/// \internal_ +/// Calculate carry-out for each work group. +/// Carry-out is a pair of the last key processed by a work group and sum of all +/// values under this key in this work group. +template<class InputValueIterator, class OutputValueIterator, class BinaryFunction> +inline void carry_outs(vector<uint_>::iterator keys_first, + InputValueIterator values_first, + size_t count, + vector<uint_>::iterator carry_out_keys_first, + OutputValueIterator carry_out_values_first, + BinaryFunction function, + size_t work_group_size, + command_queue &queue) +{ + typedef typename + std::iterator_traits<OutputValueIterator>::value_type value_out_type; + + detail::meta_kernel k("reduce_by_key_with_scan_carry_outs"); + k.add_set_arg<const uint_>("count", uint_(count)); + size_t local_keys_arg = k.add_arg<uint_ *>(memory_object::local_memory, "lkeys"); + size_t local_vals_arg = k.add_arg<value_out_type *>(memory_object::local_memory, "lvals"); + + k << + k.decl<const uint_>("gid") << " = get_global_id(0);\n" << + k.decl<const uint_>("wg_size") << " = get_local_size(0);\n" << + k.decl<const uint_>("lid") << " = get_local_id(0);\n" << + k.decl<const uint_>("group_id") << " = get_group_id(0);\n" << + + k.decl<uint_>("key") << ";\n" << + k.decl<value_out_type>("value") << ";\n" << + "if(gid < count){\n" << + k.var<uint_>("key") << " = " << + keys_first[k.var<const uint_>("gid")] << ";\n" << + 
k.var<value_out_type>("value") << " = " << + values_first[k.var<const uint_>("gid")] << ";\n" << + "lkeys[lid] = key;\n" << + "lvals[lid] = value;\n" << + "}\n" << + + // Calculate carry out for each work group by performing Hillis/Steele scan + // where only last element (key-value pair) is saved + k.decl<value_out_type>("result") << " = value;\n" << + k.decl<uint_>("other_key") << ";\n" << + k.decl<value_out_type>("other_value") << ";\n" << + + "for(" << k.decl<uint_>("offset") << " = 1; " << + "offset < wg_size; offset *= 2){\n" + " barrier(CLK_LOCAL_MEM_FENCE);\n" << + " if(lid >= offset){\n" + " other_key = lkeys[lid - offset];\n" << + " if(other_key == key){\n" << + " other_value = lvals[lid - offset];\n" << + " result = " << function(k.var<value_out_type>("result"), + k.var<value_out_type>("other_value")) << ";\n" << + " }\n" << + " }\n" << + " barrier(CLK_LOCAL_MEM_FENCE);\n" << + " lvals[lid] = result;\n" << + "}\n" << + + // save carry out + "if(lid == (wg_size - 1)){\n" << + carry_out_keys_first[k.var<const uint_>("group_id")] << " = key;\n" << + carry_out_values_first[k.var<const uint_>("group_id")] << " = result;\n" << + "}\n"; + + size_t work_groups_no = static_cast<size_t>( + std::ceil(float(count) / work_group_size) + ); + + const context &context = queue.get_context(); + kernel kernel = k.compile(context); + kernel.set_arg(local_keys_arg, local_buffer<uint_>(work_group_size)); + kernel.set_arg(local_vals_arg, local_buffer<value_out_type>(work_group_size)); + + queue.enqueue_1d_range_kernel(kernel, + 0, + work_groups_no * work_group_size, + work_group_size); +} + +/// \internal_ +/// Calculate carry-in by performing inclusive scan by key on carry-outs vector. 
+template<class OutputValueIterator, class BinaryFunction> +inline void carry_ins(vector<uint_>::iterator carry_out_keys_first, + OutputValueIterator carry_out_values_first, + OutputValueIterator carry_in_values_first, + size_t carry_out_size, + BinaryFunction function, + size_t work_group_size, + command_queue &queue) +{ + typedef typename + std::iterator_traits<OutputValueIterator>::value_type value_out_type; + + uint_ values_pre_work_item = static_cast<uint_>( + std::ceil(float(carry_out_size) / work_group_size) + ); + + detail::meta_kernel k("reduce_by_key_with_scan_carry_ins"); + k.add_set_arg<const uint_>("carry_out_size", uint_(carry_out_size)); + k.add_set_arg<const uint_>("values_per_work_item", values_pre_work_item); + size_t local_keys_arg = k.add_arg<uint_ *>(memory_object::local_memory, "lkeys"); + size_t local_vals_arg = k.add_arg<value_out_type *>(memory_object::local_memory, "lvals"); + + k << + k.decl<uint_>("id") << " = get_global_id(0) * values_per_work_item;\n" << + k.decl<uint_>("idx") << " = id;\n" << + k.decl<const uint_>("wg_size") << " = get_local_size(0);\n" << + k.decl<const uint_>("lid") << " = get_local_id(0);\n" << + k.decl<const uint_>("group_id") << " = get_group_id(0);\n" << + + k.decl<uint_>("key") << ";\n" << + k.decl<value_out_type>("value") << ";\n" << + k.decl<uint_>("previous_key") << ";\n" << + k.decl<value_out_type>("result") << ";\n" << + + "if(id < carry_out_size){\n" << + k.var<uint_>("previous_key") << " = " << + carry_out_keys_first[k.var<const uint_>("id")] << ";\n" << + k.var<value_out_type>("result") << " = " << + carry_out_values_first[k.var<const uint_>("id")] << ";\n" << + carry_in_values_first[k.var<const uint_>("id")] << " = result;\n" << + "}\n" << + + k.decl<const uint_>("end") << " = (id + values_per_work_item) <= carry_out_size" << + " ? 
(values_per_work_item + id) : carry_out_size;\n" << + + "for(idx = idx + 1; idx < end; idx += 1){\n" << + " key = " << carry_out_keys_first[k.var<const uint_>("idx")] << ";\n" << + " value = " << carry_out_values_first[k.var<const uint_>("idx")] << ";\n" << + " if(previous_key == key){\n" << + " result = " << function(k.var<value_out_type>("result"), + k.var<value_out_type>("value")) << ";\n" << + " }\n else { \n" << + " result = value;\n" + " }\n" << + " " << carry_in_values_first[k.var<const uint_>("idx")] << " = result;\n" << + " previous_key = key;\n" + "}\n" << + + // save the last key and result to local memory + "lkeys[lid] = previous_key;\n" << + "lvals[lid] = result;\n" << + + // Hillis/Steele scan + "for(" << k.decl<uint_>("offset") << " = 1; " << + "offset < wg_size; offset *= 2){\n" + " barrier(CLK_LOCAL_MEM_FENCE);\n" << + " if(lid >= offset){\n" + " key = lkeys[lid - offset];\n" << + " if(previous_key == key){\n" << + " value = lvals[lid - offset];\n" << + " result = " << function(k.var<value_out_type>("result"), + k.var<value_out_type>("value")) << ";\n" << + " }\n" << + " }\n" << + " barrier(CLK_LOCAL_MEM_FENCE);\n" << + " lvals[lid] = result;\n" << + "}\n" << + "barrier(CLK_LOCAL_MEM_FENCE);\n" << + + "if(lid > 0){\n" << + // load key-value reduced by previous work item + " previous_key = lkeys[lid - 1];\n" << + " result = lvals[lid - 1];\n" << + "}\n" << + + // add key-value reduced by previous work item + "for(idx = id; idx < id + values_per_work_item; idx += 1){\n" << + // make sure all carry-ins are saved in global memory + " barrier( CLK_GLOBAL_MEM_FENCE );\n" << + " if(lid > 0 && idx < carry_out_size) {\n" + " key = " << carry_out_keys_first[k.var<const uint_>("idx")] << ";\n" << + " value = " << carry_in_values_first[k.var<const uint_>("idx")] << ";\n" << + " if(previous_key == key){\n" << + " value = " << function(k.var<value_out_type>("result"), + k.var<value_out_type>("value")) << ";\n" << + " }\n" << + " " << 
carry_in_values_first[k.var<const uint_>("idx")] << " = value;\n" << + " }\n" << + "}\n"; + + + const context &context = queue.get_context(); + kernel kernel = k.compile(context); + kernel.set_arg(local_keys_arg, local_buffer<uint_>(work_group_size)); + kernel.set_arg(local_vals_arg, local_buffer<value_out_type>(work_group_size)); + + queue.enqueue_1d_range_kernel(kernel, + 0, + work_group_size, + work_group_size); +} + +/// \internal_ +/// +/// Perform final reduction by key. Each work item: +/// 1. Perform local work-group reduction (Hillis/Steele scan) +/// 2. Add carry-in (if keys are right) +/// 3. Save reduced value if next key is different than processed one +template<class InputKeyIterator, class InputValueIterator, + class OutputKeyIterator, class OutputValueIterator, + class BinaryFunction> +inline void final_reduction(InputKeyIterator keys_first, + InputValueIterator values_first, + OutputKeyIterator keys_result, + OutputValueIterator values_result, + size_t count, + BinaryFunction function, + vector<uint_>::iterator new_keys_first, + vector<uint_>::iterator carry_in_keys_first, + OutputValueIterator carry_in_values_first, + size_t carry_in_size, + size_t work_group_size, + command_queue &queue) +{ + typedef typename + std::iterator_traits<OutputValueIterator>::value_type value_out_type; + + detail::meta_kernel k("reduce_by_key_with_scan_final_reduction"); + k.add_set_arg<const uint_>("count", uint_(count)); + size_t local_keys_arg = k.add_arg<uint_ *>(memory_object::local_memory, "lkeys"); + size_t local_vals_arg = k.add_arg<value_out_type *>(memory_object::local_memory, "lvals"); + + k << + k.decl<const uint_>("gid") << " = get_global_id(0);\n" << + k.decl<const uint_>("wg_size") << " = get_local_size(0);\n" << + k.decl<const uint_>("lid") << " = get_local_id(0);\n" << + k.decl<const uint_>("group_id") << " = get_group_id(0);\n" << + + k.decl<uint_>("key") << ";\n" << + k.decl<value_out_type>("value") << ";\n" + + "if(gid < count){\n" << + 
k.var<uint_>("key") << " = " << + new_keys_first[k.var<const uint_>("gid")] << ";\n" << + k.var<value_out_type>("value") << " = " << + values_first[k.var<const uint_>("gid")] << ";\n" << + "lkeys[lid] = key;\n" << + "lvals[lid] = value;\n" << + "}\n" << + + // Hillis/Steele scan + k.decl<value_out_type>("result") << " = value;\n" << + k.decl<uint_>("other_key") << ";\n" << + k.decl<value_out_type>("other_value") << ";\n" << + + "for(" << k.decl<uint_>("offset") << " = 1; " << + "offset < wg_size ; offset *= 2){\n" + " barrier(CLK_LOCAL_MEM_FENCE);\n" << + " if(lid >= offset) {\n" << + " other_key = lkeys[lid - offset];\n" << + " if(other_key == key){\n" << + " other_value = lvals[lid - offset];\n" << + " result = " << function(k.var<value_out_type>("result"), + k.var<value_out_type>("other_value")) << ";\n" << + " }\n" << + " }\n" << + " barrier(CLK_LOCAL_MEM_FENCE);\n" << + " lvals[lid] = result;\n" << + "}\n" << + + "if(gid >= count) {\n return;\n};\n" << + + k.decl<const bool>("save") << " = (gid < (count - 1)) ?" + << new_keys_first[k.var<const uint_>("gid + 1")] << " != key" << + ": true;\n" << + + // Add carry in + k.decl<uint_>("carry_in_key") << ";\n" << + "if(group_id > 0 && save) {\n" << + " carry_in_key = " << carry_in_keys_first[k.var<const uint_>("group_id - 1")] << ";\n" << + " if(key == carry_in_key){\n" << + " other_value = " << carry_in_values_first[k.var<const uint_>("group_id - 1")] << ";\n" << + " result = " << function(k.var<value_out_type>("result"), + k.var<value_out_type>("other_value")) << ";\n" << + " }\n" << + "}\n" << + + // Save result only if the next key is different or it's the last element. 
+ "if(save){\n" << + keys_result[k.var<uint_>("key")] << " = " << keys_first[k.var<const uint_>("gid")] << ";\n" << + values_result[k.var<uint_>("key")] << " = result;\n" << + "}\n" + ; + + size_t work_groups_no = static_cast<size_t>( + std::ceil(float(count) / work_group_size) + ); + + const context &context = queue.get_context(); + kernel kernel = k.compile(context); + kernel.set_arg(local_keys_arg, local_buffer<uint_>(work_group_size)); + kernel.set_arg(local_vals_arg, local_buffer<value_out_type>(work_group_size)); + + queue.enqueue_1d_range_kernel(kernel, + 0, + work_groups_no * work_group_size, + work_group_size); +} + +/// \internal_ +/// Returns preferred work group size for reduce by key with scan algorithm. +template<class KeyType, class ValueType> +inline size_t get_work_group_size(const device& device) +{ + std::string cache_key = std::string("__boost_reduce_by_key_with_scan") + + "k_" + type_name<KeyType>() + "_v_" + type_name<ValueType>(); + + // load parameters + boost::shared_ptr<parameter_cache> parameters = + detail::parameter_cache::get_global_cache(device); + + return (std::max)( + static_cast<size_t>(parameters->get(cache_key, "wgsize", 256)), + static_cast<size_t>(device.get_info<CL_DEVICE_MAX_WORK_GROUP_SIZE>()) + ); +} + +/// \internal_ +/// +/// 1. For each work group carry-out value is calculated (it's done by key-oriented +/// Hillis/Steele scan). Carry-out is a pair of the last key processed by work +/// group and sum of all values under this key in work group. +/// 2. From every carry-out carry-in is calculated by performing inclusive scan +/// by key. +/// 3. Final reduction by key is performed (key-oriented Hillis/Steele scan), +/// carry-in values are added where needed. 
+template<class InputKeyIterator, class InputValueIterator, + class OutputKeyIterator, class OutputValueIterator, + class BinaryFunction, class BinaryPredicate> +inline size_t reduce_by_key_with_scan(InputKeyIterator keys_first, + InputKeyIterator keys_last, + InputValueIterator values_first, + OutputKeyIterator keys_result, + OutputValueIterator values_result, + BinaryFunction function, + BinaryPredicate predicate, + command_queue &queue) +{ + typedef typename + std::iterator_traits<InputValueIterator>::value_type value_type; + typedef typename + std::iterator_traits<InputKeyIterator>::value_type key_type; + typedef typename + std::iterator_traits<OutputValueIterator>::value_type value_out_type; + + const context &context = queue.get_context(); + size_t count = detail::iterator_range_size(keys_first, keys_last); + + if(count == 0){ + return size_t(0); + } + + const device &device = queue.get_device(); + size_t work_group_size = get_work_group_size<value_type, key_type>(device); + + // Replace original key with unsigned integer keys generated based on given + // predicate. New key is also an index for keys_result and values_result vectors, + // which points to place where reduced value should be saved. 
+ vector<uint_> new_keys(count, context); + vector<uint_>::iterator new_keys_first = new_keys.begin(); + generate_uint_keys(keys_first, count, predicate, new_keys_first, + work_group_size, queue); + + // Calculate carry-out and carry-in vectors size + const size_t carry_out_size = static_cast<size_t>( + std::ceil(float(count) / work_group_size) + ); + vector<uint_> carry_out_keys(carry_out_size, context); + vector<value_out_type> carry_out_values(carry_out_size, context); + carry_outs(new_keys_first, values_first, count, carry_out_keys.begin(), + carry_out_values.begin(), function, work_group_size, queue); + + vector<value_out_type> carry_in_values(carry_out_size, context); + carry_ins(carry_out_keys.begin(), carry_out_values.begin(), + carry_in_values.begin(), carry_out_size, function, work_group_size, + queue); + + final_reduction(keys_first, values_first, keys_result, values_result, + count, function, new_keys_first, carry_out_keys.begin(), + carry_in_values.begin(), carry_out_size, work_group_size, + queue); + + const size_t result = read_single_value<uint_>(new_keys.get_buffer(), + count - 1, queue); + return result + 1; +} + +/// \internal_ +/// Return true if requirements for running reduce by key with scan on given +/// device are met (at least one work group of preferred size can be run). 
+template<class InputKeyIterator, class InputValueIterator, + class OutputKeyIterator, class OutputValueIterator> +bool reduce_by_key_with_scan_requirements_met(InputKeyIterator keys_first, + InputValueIterator values_first, + OutputKeyIterator keys_result, + OutputValueIterator values_result, + const size_t count, + command_queue &queue) +{ + typedef typename + std::iterator_traits<InputValueIterator>::value_type value_type; + typedef typename + std::iterator_traits<InputKeyIterator>::value_type key_type; + typedef typename + std::iterator_traits<OutputValueIterator>::value_type value_out_type; + + (void) keys_first; + (void) values_first; + (void) keys_result; + (void) values_result; + + const device &device = queue.get_device(); + // device must have dedicated local memory storage + if(device.get_info<CL_DEVICE_LOCAL_MEM_TYPE>() != CL_LOCAL) + { + return false; + } + + // local memory size in bytes (per compute unit) + const size_t local_mem_size = device.get_info<CL_DEVICE_LOCAL_MEM_SIZE>(); + + // preferred work group size + size_t work_group_size = get_work_group_size<key_type, value_type>(device); + + // local memory size needed to perform parallel reduction + size_t required_local_mem_size = 0; + // keys size + required_local_mem_size += sizeof(uint_) * work_group_size; + // reduced values size + required_local_mem_size += sizeof(value_out_type) * work_group_size; + + return (required_local_mem_size <= local_mem_size); +} + +} // end detail namespace +} // end compute namespace +} // end boost namespace + +#endif // BOOST_COMPUTE_ALGORITHM_DETAIL_REDUCE_BY_KEY_WITH_SCAN_HPP diff --git a/boost/compute/algorithm/detail/reduce_on_gpu.hpp b/boost/compute/algorithm/detail/reduce_on_gpu.hpp new file mode 100644 index 0000000000..335fba8724 --- /dev/null +++ b/boost/compute/algorithm/detail/reduce_on_gpu.hpp @@ -0,0 +1,286 @@ +//---------------------------------------------------------------------------// +// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com> 
+// +// Distributed under the Boost Software License, Version 1.0 +// See accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt +// +// See http://boostorg.github.com/compute for more information. +//---------------------------------------------------------------------------// + +#ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_REDUCE_ON_GPU_HPP +#define BOOST_COMPUTE_ALGORITHM_DETAIL_REDUCE_ON_GPU_HPP + +#include <iterator> + +#include <boost/compute/utility/source.hpp> +#include <boost/compute/program.hpp> +#include <boost/compute/command_queue.hpp> +#include <boost/compute/detail/vendor.hpp> +#include <boost/compute/detail/parameter_cache.hpp> +#include <boost/compute/detail/work_size.hpp> +#include <boost/compute/detail/meta_kernel.hpp> +#include <boost/compute/type_traits/type_name.hpp> +#include <boost/compute/utility/program_cache.hpp> + +namespace boost { +namespace compute { +namespace detail { + +/// \internal +/// body reduction inside a warp +template<typename T,bool isNvidiaDevice> +struct ReduceBody +{ + static std::string body() + { + std::stringstream k; + // local reduction + k << "for(int i = 1; i < TPB; i <<= 1){\n" << + " barrier(CLK_LOCAL_MEM_FENCE);\n" << + " uint mask = (i << 1) - 1;\n" << + " if((lid & mask) == 0){\n" << + " scratch[lid] += scratch[lid+i];\n" << + " }\n" << + "}\n"; + return k.str(); + } +}; + +/// \internal +/// body reduction inside a warp +/// for nvidia device we can use the "unsafe" +/// memory optimisation +template<typename T> +struct ReduceBody<T,true> +{ + static std::string body() + { + std::stringstream k; + // local reduction + // we use TPB to compile only useful instruction + // local reduction when size is greater than warp size + k << "barrier(CLK_LOCAL_MEM_FENCE);\n" << + "if(TPB >= 1024){\n" << + "if(lid < 512) { sum += scratch[lid + 512]; scratch[lid] = sum;} barrier(CLK_LOCAL_MEM_FENCE);}\n" << + "if(TPB >= 512){\n" << + "if(lid < 256) { sum += scratch[lid + 256]; scratch[lid] = 
sum;} barrier(CLK_LOCAL_MEM_FENCE);}\n" << + "if(TPB >= 256){\n" << + "if(lid < 128) { sum += scratch[lid + 128]; scratch[lid] = sum;} barrier(CLK_LOCAL_MEM_FENCE);}\n" << + "if(TPB >= 128){\n" << + "if(lid < 64) { sum += scratch[lid + 64]; scratch[lid] = sum;} barrier(CLK_LOCAL_MEM_FENCE);} \n" << + + // warp reduction + "if(lid < 32){\n" << + // volatile this way we don't need any barrier + "volatile __local " << type_name<T>() << " *lmem = scratch;\n" << + "if(TPB >= 64) { lmem[lid] = sum = sum + lmem[lid+32];} \n" << + "if(TPB >= 32) { lmem[lid] = sum = sum + lmem[lid+16];} \n" << + "if(TPB >= 16) { lmem[lid] = sum = sum + lmem[lid+ 8];} \n" << + "if(TPB >= 8) { lmem[lid] = sum = sum + lmem[lid+ 4];} \n" << + "if(TPB >= 4) { lmem[lid] = sum = sum + lmem[lid+ 2];} \n" << + "if(TPB >= 2) { lmem[lid] = sum = sum + lmem[lid+ 1];} \n" << + "}\n"; + return k.str(); + } +}; + +template<class InputIterator, class Function> +inline void initial_reduce(InputIterator first, + InputIterator last, + buffer result, + const Function &function, + kernel &reduce_kernel, + const uint_ vpt, + const uint_ tpb, + command_queue &queue) +{ + (void) function; + (void) reduce_kernel; + + typedef typename std::iterator_traits<InputIterator>::value_type Arg; + typedef typename boost::tr1_result_of<Function(Arg, Arg)>::type T; + + size_t count = std::distance(first, last); + detail::meta_kernel k("initial_reduce"); + k.add_set_arg<const uint_>("count", uint_(count)); + size_t output_arg = k.add_arg<T *>(memory_object::global_memory, "output"); + + k << + k.decl<const uint_>("offset") << " = get_group_id(0) * VPT * TPB;\n" << + k.decl<const uint_>("lid") << " = get_local_id(0);\n" << + + "__local " << type_name<T>() << " scratch[TPB];\n" << + + // private reduction + k.decl<T>("sum") << " = 0;\n" << + "for(uint i = 0; i < VPT; i++){\n" << + " if(offset + lid + i*TPB < count){\n" << + " sum = sum + " << first[k.var<uint_>("offset+lid+i*TPB")] << ";\n" << + " }\n" << + "}\n" << + + 
"scratch[lid] = sum;\n" << + + // local reduction + ReduceBody<T,false>::body() << + + // write sum to output + "if(lid == 0){\n" << + " output[get_group_id(0)] = scratch[0];\n" << + "}\n"; + + const context &context = queue.get_context(); + std::stringstream options; + options << "-DVPT=" << vpt << " -DTPB=" << tpb; + kernel generic_reduce_kernel = k.compile(context, options.str()); + generic_reduce_kernel.set_arg(output_arg, result); + + size_t work_size = calculate_work_size(count, vpt, tpb); + + queue.enqueue_1d_range_kernel(generic_reduce_kernel, 0, work_size, tpb); +} + +template<class T> +inline void initial_reduce(const buffer_iterator<T> &first, + const buffer_iterator<T> &last, + const buffer &result, + const plus<T> &function, + kernel &reduce_kernel, + const uint_ vpt, + const uint_ tpb, + command_queue &queue) +{ + (void) function; + + size_t count = std::distance(first, last); + + reduce_kernel.set_arg(0, first.get_buffer()); + reduce_kernel.set_arg(1, uint_(first.get_index())); + reduce_kernel.set_arg(2, uint_(count)); + reduce_kernel.set_arg(3, result); + reduce_kernel.set_arg(4, uint_(0)); + + size_t work_size = calculate_work_size(count, vpt, tpb); + + queue.enqueue_1d_range_kernel(reduce_kernel, 0, work_size, tpb); +} + +template<class InputIterator, class T, class Function> +inline void reduce_on_gpu(InputIterator first, + InputIterator last, + buffer_iterator<T> result, + Function function, + command_queue &queue) +{ + const device &device = queue.get_device(); + const context &context = queue.get_context(); + + detail::meta_kernel k("reduce"); + k.add_arg<const T*>(memory_object::global_memory, "input"); + k.add_arg<const uint_>("offset"); + k.add_arg<const uint_>("count"); + k.add_arg<T*>(memory_object::global_memory, "output"); + k.add_arg<const uint_>("output_offset"); + + k << + k.decl<const uint_>("block_offset") << " = get_group_id(0) * VPT * TPB;\n" << + "__global const " << type_name<T>() << " *block = input + offset + block_offset;\n" 
<< + k.decl<const uint_>("lid") << " = get_local_id(0);\n" << + + "__local " << type_name<T>() << " scratch[TPB];\n" << + // private reduction + k.decl<T>("sum") << " = 0;\n" << + "for(uint i = 0; i < VPT; i++){\n" << + " if(block_offset + lid + i*TPB < count){\n" << + " sum = sum + block[lid+i*TPB]; \n" << + " }\n" << + "}\n" << + + "scratch[lid] = sum;\n"; + + // discrimination on vendor name + if(is_nvidia_device(device)) + k << ReduceBody<T,true>::body(); + else + k << ReduceBody<T,false>::body(); + + k << + // write sum to output + "if(lid == 0){\n" << + " output[output_offset + get_group_id(0)] = scratch[0];\n" << + "}\n"; + + std::string cache_key = std::string("__boost_reduce_on_gpu_") + type_name<T>(); + + // load parameters + boost::shared_ptr<parameter_cache> parameters = + detail::parameter_cache::get_global_cache(device); + + uint_ vpt = parameters->get(cache_key, "vpt", 8); + uint_ tpb = parameters->get(cache_key, "tpb", 128); + + // reduce program compiler flags + std::stringstream options; + options << "-DT=" << type_name<T>() + << " -DVPT=" << vpt + << " -DTPB=" << tpb; + + // load program + boost::shared_ptr<program_cache> cache = + program_cache::get_global_cache(context); + + program reduce_program = cache->get_or_build( + cache_key, options.str(), k.source(), context + ); + + // create reduce kernel + kernel reduce_kernel(reduce_program, "reduce"); + + size_t count = std::distance(first, last); + + // first pass, reduce from input to ping + buffer ping(context, std::ceil(float(count) / vpt / tpb) * sizeof(T)); + initial_reduce(first, last, ping, function, reduce_kernel, vpt, tpb, queue); + + // update count after initial reduce + count = static_cast<size_t>(std::ceil(float(count) / vpt / tpb)); + + // middle pass(es), reduce between ping and pong + const buffer *input_buffer = &ping; + buffer pong(context, static_cast<size_t>(count / vpt / tpb * sizeof(T))); + const buffer *output_buffer = &pong; + if(count > vpt * tpb){ + while(count > vpt * 
tpb){ + reduce_kernel.set_arg(0, *input_buffer); + reduce_kernel.set_arg(1, uint_(0)); + reduce_kernel.set_arg(2, uint_(count)); + reduce_kernel.set_arg(3, *output_buffer); + reduce_kernel.set_arg(4, uint_(0)); + + size_t work_size = static_cast<size_t>(std::ceil(float(count) / vpt)); + if(work_size % tpb != 0){ + work_size += tpb - work_size % tpb; + } + queue.enqueue_1d_range_kernel(reduce_kernel, 0, work_size, tpb); + + std::swap(input_buffer, output_buffer); + count = static_cast<size_t>(std::ceil(float(count) / vpt / tpb)); + } + } + + // final pass, reduce from ping/pong to result + reduce_kernel.set_arg(0, *input_buffer); + reduce_kernel.set_arg(1, uint_(0)); + reduce_kernel.set_arg(2, uint_(count)); + reduce_kernel.set_arg(3, result.get_buffer()); + reduce_kernel.set_arg(4, uint_(result.get_index())); + + queue.enqueue_1d_range_kernel(reduce_kernel, 0, tpb, tpb); +} + +} // end detail namespace +} // end compute namespace +} // end boost namespace + +#endif // BOOST_COMPUTE_ALGORITHM_DETAIL_REDUCE_ON_GPU_HPP diff --git a/boost/compute/algorithm/detail/scan.hpp b/boost/compute/algorithm/detail/scan.hpp new file mode 100644 index 0000000000..154b6001be --- /dev/null +++ b/boost/compute/algorithm/detail/scan.hpp @@ -0,0 +1,45 @@ +//---------------------------------------------------------------------------// +// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com> +// +// Distributed under the Boost Software License, Version 1.0 +// See accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt +// +// See http://boostorg.github.com/compute for more information. 
+//---------------------------------------------------------------------------// + +#ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_SCAN_HPP +#define BOOST_COMPUTE_ALGORITHM_DETAIL_SCAN_HPP + +#include <boost/compute/device.hpp> +#include <boost/compute/algorithm/detail/scan_on_cpu.hpp> +#include <boost/compute/algorithm/detail/scan_on_gpu.hpp> + +namespace boost { +namespace compute { +namespace detail { + +template<class InputIterator, class OutputIterator, class T, class BinaryOperator> +inline OutputIterator scan(InputIterator first, + InputIterator last, + OutputIterator result, + bool exclusive, + T init, + BinaryOperator op, + command_queue &queue) +{ + const device &device = queue.get_device(); + + if(device.type() & device::cpu){ + return scan_on_cpu(first, last, result, exclusive, init, op, queue); + } + else { + return scan_on_gpu(first, last, result, exclusive, init, op, queue); + } +} + +} // end detail namespace +} // end compute namespace +} // end boost namespace + +#endif // BOOST_COMPUTE_ALGORITHM_DETAIL_SCAN_HPP diff --git a/boost/compute/algorithm/detail/scan_on_cpu.hpp b/boost/compute/algorithm/detail/scan_on_cpu.hpp new file mode 100644 index 0000000000..6611c0ba3e --- /dev/null +++ b/boost/compute/algorithm/detail/scan_on_cpu.hpp @@ -0,0 +1,103 @@ +//---------------------------------------------------------------------------// +// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com> +// +// Distributed under the Boost Software License, Version 1.0 +// See accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt +// +// See http://boostorg.github.com/compute for more information. 
+//---------------------------------------------------------------------------// + +#ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_SCAN_ON_CPU_HPP +#define BOOST_COMPUTE_ALGORITHM_DETAIL_SCAN_ON_CPU_HPP + +#include <iterator> + +#include <boost/compute/device.hpp> +#include <boost/compute/kernel.hpp> +#include <boost/compute/command_queue.hpp> +#include <boost/compute/detail/meta_kernel.hpp> +#include <boost/compute/detail/iterator_range_size.hpp> + +namespace boost { +namespace compute { +namespace detail { + +template<class InputIterator, class OutputIterator, class T, class BinaryOperator> +inline OutputIterator scan_on_cpu(InputIterator first, + InputIterator last, + OutputIterator result, + bool exclusive, + T init, + BinaryOperator op, + command_queue &queue) +{ + if(first == last){ + return result; + } + + typedef typename + std::iterator_traits<InputIterator>::value_type input_type; + typedef typename + std::iterator_traits<OutputIterator>::value_type output_type; + + const context &context = queue.get_context(); + + // create scan kernel + meta_kernel k("scan_on_cpu"); + + // Arguments + size_t n_arg = k.add_arg<ulong_>("n"); + size_t init_arg = k.add_arg<output_type>("initial_value"); + + if(!exclusive){ + k << + k.decl<const ulong_>("start_idx") << " = 1;\n" << + k.decl<output_type>("sum") << " = " << first[0] << ";\n" << + result[0] << " = sum;\n"; + } + else { + k << + k.decl<const ulong_>("start_idx") << " = 0;\n" << + k.decl<output_type>("sum") << " = initial_value;\n"; + } + + k << + "for(ulong i = start_idx; i < n; i++){\n" << + k.decl<const input_type>("x") << " = " + << first[k.var<ulong_>("i")] << ";\n"; + + if(exclusive){ + k << result[k.var<ulong_>("i")] << " = sum;\n"; + } + + k << " sum = " + << op(k.var<output_type>("sum"), k.var<output_type>("x")) + << ";\n"; + + if(!exclusive){ + k << result[k.var<ulong_>("i")] << " = sum;\n"; + } + + k << "}\n"; + + // compile scan kernel + kernel scan_kernel = k.compile(context); + + // setup kernel arguments + 
size_t n = detail::iterator_range_size(first, last); + scan_kernel.set_arg<ulong_>(n_arg, n); + scan_kernel.set_arg<output_type>(init_arg, static_cast<output_type>(init)); + + // execute the kernel + queue.enqueue_1d_range_kernel(scan_kernel, 0, 1, 1); + + // return iterator pointing to the end of the result range + return result + n; +} + +} // end detail namespace +} // end compute namespace +} // end boost namespace + +#endif // BOOST_COMPUTE_ALGORITHM_DETAIL_SCAN_ON_CPU_HPP diff --git a/boost/compute/algorithm/detail/scan_on_gpu.hpp b/boost/compute/algorithm/detail/scan_on_gpu.hpp new file mode 100644 index 0000000000..07c6d6d3c0 --- /dev/null +++ b/boost/compute/algorithm/detail/scan_on_gpu.hpp @@ -0,0 +1,331 @@ +//---------------------------------------------------------------------------// +// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com> +// +// Distributed under the Boost Software License, Version 1.0 +// See accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt +// +// See http://boostorg.github.com/compute for more information. 
+//---------------------------------------------------------------------------// + +#ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_SCAN_ON_GPU_HPP +#define BOOST_COMPUTE_ALGORITHM_DETAIL_SCAN_ON_GPU_HPP + +#include <boost/compute/kernel.hpp> +#include <boost/compute/detail/meta_kernel.hpp> +#include <boost/compute/command_queue.hpp> +#include <boost/compute/algorithm/detail/scan_on_cpu.hpp> +#include <boost/compute/container/vector.hpp> +#include <boost/compute/detail/iterator_range_size.hpp> +#include <boost/compute/memory/local_buffer.hpp> +#include <boost/compute/iterator/buffer_iterator.hpp> + +namespace boost { +namespace compute { +namespace detail { + +template<class InputIterator, class OutputIterator, class BinaryOperator> +class local_scan_kernel : public meta_kernel +{ +public: + local_scan_kernel(InputIterator first, + InputIterator last, + OutputIterator result, + bool exclusive, + BinaryOperator op) + : meta_kernel("local_scan") + { + typedef typename std::iterator_traits<InputIterator>::value_type T; + + (void) last; + + bool checked = true; + + m_block_sums_arg = add_arg<T *>(memory_object::global_memory, "block_sums"); + m_scratch_arg = add_arg<T *>(memory_object::local_memory, "scratch"); + m_block_size_arg = add_arg<const cl_uint>("block_size"); + m_count_arg = add_arg<const cl_uint>("count"); + m_init_value_arg = add_arg<const T>("init"); + + // work-item parameters + *this << + "const uint gid = get_global_id(0);\n" << + "const uint lid = get_local_id(0);\n"; + + // check against data size + if(checked){ + *this << + "if(gid < count){\n"; + } + + // copy values from input to local memory + if(exclusive){ + *this << + decl<const T>("local_init") << "= (gid == 0) ? 
init : 0;\n" << + "if(lid == 0){ scratch[lid] = local_init; }\n" << + "else { scratch[lid] = " << first[expr<cl_uint>("gid-1")] << "; }\n"; + } + else{ + *this << + "scratch[lid] = " << first[expr<cl_uint>("gid")] << ";\n"; + } + + if(checked){ + *this << + "}\n" + "else {\n" << + " scratch[lid] = 0;\n" << + "}\n"; + } + + // wait for all threads to read from input + *this << + "barrier(CLK_LOCAL_MEM_FENCE);\n"; + + // perform scan + *this << + "for(uint i = 1; i < block_size; i <<= 1){\n" << + " " << decl<const T>("x") << " = lid >= i ? scratch[lid-i] : 0;\n" << + " barrier(CLK_LOCAL_MEM_FENCE);\n" << + " if(lid >= i){\n" << + " scratch[lid] = " << op(var<T>("scratch[lid]"), var<T>("x")) << ";\n" << + " }\n" << + " barrier(CLK_LOCAL_MEM_FENCE);\n" << + "}\n"; + + // copy results to output + if(checked){ + *this << + "if(gid < count){\n"; + } + + *this << + result[expr<cl_uint>("gid")] << " = scratch[lid];\n"; + + if(checked){ + *this << "}\n"; + } + + // store sum for the block + if(exclusive){ + *this << + "if(lid == block_size - 1){\n" << + " block_sums[get_group_id(0)] = " << + op(first[expr<cl_uint>("gid")], var<T>("scratch[lid]")) << + ";\n" << + "}\n"; + } + else { + *this << + "if(lid == block_size - 1){\n" << + " block_sums[get_group_id(0)] = scratch[lid];\n" << + "}\n"; + } + } + + size_t m_block_sums_arg; + size_t m_scratch_arg; + size_t m_block_size_arg; + size_t m_count_arg; + size_t m_init_value_arg; +}; + +template<class T, class BinaryOperator> +class write_scanned_output_kernel : public meta_kernel +{ +public: + write_scanned_output_kernel(BinaryOperator op) + : meta_kernel("write_scanned_output") + { + bool checked = true; + + m_output_arg = add_arg<T *>(memory_object::global_memory, "output"); + m_block_sums_arg = add_arg<const T *>(memory_object::global_memory, "block_sums"); + m_count_arg = add_arg<const cl_uint>("count"); + + // work-item parameters + *this << + "const uint gid = get_global_id(0);\n" << + "const uint block_id = 
get_group_id(0);\n"; + + // check against data size + if(checked){ + *this << "if(gid < count){\n"; + } + + // write output + *this << + "output[gid] = " << + op(var<T>("block_sums[block_id]"), var<T>("output[gid] ")) << ";\n"; + + if(checked){ + *this << "}\n"; + } + } + + size_t m_output_arg; + size_t m_block_sums_arg; + size_t m_count_arg; +}; + +template<class InputIterator> +inline size_t pick_scan_block_size(InputIterator first, InputIterator last) +{ + size_t count = iterator_range_size(first, last); + + if(count == 0) { return 0; } + else if(count <= 1) { return 1; } + else if(count <= 2) { return 2; } + else if(count <= 4) { return 4; } + else if(count <= 8) { return 8; } + else if(count <= 16) { return 16; } + else if(count <= 32) { return 32; } + else if(count <= 64) { return 64; } + else if(count <= 128) { return 128; } + else { return 256; } +} + +template<class InputIterator, class OutputIterator, class T, class BinaryOperator> +inline OutputIterator scan_impl(InputIterator first, + InputIterator last, + OutputIterator result, + bool exclusive, + T init, + BinaryOperator op, + command_queue &queue) +{ + typedef typename + std::iterator_traits<InputIterator>::value_type + input_type; + typedef typename + std::iterator_traits<InputIterator>::difference_type + difference_type; + typedef typename + std::iterator_traits<OutputIterator>::value_type + output_type; + + const context &context = queue.get_context(); + const size_t count = detail::iterator_range_size(first, last); + + size_t block_size = pick_scan_block_size(first, last); + size_t block_count = count / block_size; + + if(block_count * block_size < count){ + block_count++; + } + + ::boost::compute::vector<input_type> block_sums(block_count, context); + + // zero block sums + input_type zero; + std::memset(&zero, 0, sizeof(input_type)); + ::boost::compute::fill(block_sums.begin(), block_sums.end(), zero, queue); + + // local scan + local_scan_kernel<InputIterator, OutputIterator, BinaryOperator> + 
local_scan_kernel(first, last, result, exclusive, op); + + ::boost::compute::kernel kernel = local_scan_kernel.compile(context); + kernel.set_arg(local_scan_kernel.m_scratch_arg, local_buffer<input_type>(block_size)); + kernel.set_arg(local_scan_kernel.m_block_sums_arg, block_sums); + kernel.set_arg(local_scan_kernel.m_block_size_arg, static_cast<cl_uint>(block_size)); + kernel.set_arg(local_scan_kernel.m_count_arg, static_cast<cl_uint>(count)); + kernel.set_arg(local_scan_kernel.m_init_value_arg, static_cast<output_type>(init)); + + queue.enqueue_1d_range_kernel(kernel, + 0, + block_count * block_size, + block_size); + + // inclusive scan block sums + if(block_count > 1){ + scan_impl(block_sums.begin(), + block_sums.end(), + block_sums.begin(), + false, + init, + op, + queue + ); + } + + // add block sums to each block + if(block_count > 1){ + write_scanned_output_kernel<input_type, BinaryOperator> + write_output_kernel(op); + kernel = write_output_kernel.compile(context); + kernel.set_arg(write_output_kernel.m_output_arg, result.get_buffer()); + kernel.set_arg(write_output_kernel.m_block_sums_arg, block_sums); + kernel.set_arg(write_output_kernel.m_count_arg, static_cast<cl_uint>(count)); + + queue.enqueue_1d_range_kernel(kernel, + block_size, + block_count * block_size, + block_size); + } + + return result + static_cast<difference_type>(count); +} + +template<class InputIterator, class OutputIterator, class T, class BinaryOperator> +inline OutputIterator dispatch_scan(InputIterator first, + InputIterator last, + OutputIterator result, + bool exclusive, + T init, + BinaryOperator op, + command_queue &queue) +{ + return scan_impl(first, last, result, exclusive, init, op, queue); +} + +template<class InputIterator, class T, class BinaryOperator> +inline InputIterator dispatch_scan(InputIterator first, + InputIterator last, + InputIterator result, + bool exclusive, + T init, + BinaryOperator op, + command_queue &queue) +{ + typedef typename 
std::iterator_traits<InputIterator>::value_type value_type; + + if(first == result){ + // scan input in-place + const context &context = queue.get_context(); + + // make a temporary copy the input + size_t count = iterator_range_size(first, last); + vector<value_type> tmp(count, context); + copy(first, last, tmp.begin(), queue); + + // scan from temporary values + return scan_impl(tmp.begin(), tmp.end(), first, exclusive, init, op, queue); + } + else { + // scan input to output + return scan_impl(first, last, result, exclusive, init, op, queue); + } +} + +template<class InputIterator, class OutputIterator, class T, class BinaryOperator> +inline OutputIterator scan_on_gpu(InputIterator first, + InputIterator last, + OutputIterator result, + bool exclusive, + T init, + BinaryOperator op, + command_queue &queue) +{ + if(first == last){ + return result; + } + + return dispatch_scan(first, last, result, exclusive, init, op, queue); +} + +} // end detail namespace +} // end compute namespace +} // end boost namespace + +#endif // BOOST_COMPUTE_ALGORITHM_DETAIL_SCAN_ON_GPU_HPP diff --git a/boost/compute/algorithm/detail/search_all.hpp b/boost/compute/algorithm/detail/search_all.hpp new file mode 100644 index 0000000000..a874bcdebe --- /dev/null +++ b/boost/compute/algorithm/detail/search_all.hpp @@ -0,0 +1,86 @@ +//---------------------------------------------------------------------------// +// Copyright (c) 2014 Roshan <thisisroshansmail@gmail.com> +// +// Distributed under the Boost Software License, Version 1.0 +// See accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt +// +// See http://boostorg.github.com/compute for more information. 
+//---------------------------------------------------------------------------// + +#ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_SEARCH_ALL_HPP +#define BOOST_COMPUTE_ALGORITHM_DETAIL_SEARCH_ALL_HPP + +#include <boost/compute/algorithm/copy.hpp> +#include <boost/compute/container/vector.hpp> +#include <boost/compute/detail/iterator_range_size.hpp> +#include <boost/compute/detail/meta_kernel.hpp> +#include <boost/compute/lambda.hpp> +#include <boost/compute/system.hpp> + +namespace boost { +namespace compute { +namespace detail { + +/// +/// \brief Search kernel class +/// +/// Subclass of meta_kernel which is capable of performing pattern matching +/// +template<class PatternIterator, class TextIterator, class OutputIterator> +class search_kernel : public meta_kernel +{ +public: + search_kernel() : meta_kernel("search") + {} + + void set_range(PatternIterator p_first, + PatternIterator p_last, + TextIterator t_first, + TextIterator t_last, + OutputIterator result) + { + m_p_count = iterator_range_size(p_first, p_last); + m_p_count_arg = add_arg<uint_>("p_count"); + + m_count = iterator_range_size(t_first, t_last); + m_count = m_count + 1 - m_p_count; + + *this << + "uint i = get_global_id(0);\n" << + "uint i1 = i;\n" << + "uint j;\n" << + "for(j = 0; j<p_count; j++,i++)\n" << + "{\n" << + " if(" << p_first[expr<uint_>("j")] << " != " << + t_first[expr<uint_>("i")] << ")\n" << + " j = p_count + 1;\n" << + "}\n" << + "if(j == p_count)\n" << + result[expr<uint_>("i1")] << " = 1;\n" << + "else\n" << + result[expr<uint_>("i1")] << " = 0;\n"; + } + + event exec(command_queue &queue) + { + if(m_count == 0) { + return event(); + } + + set_arg(m_p_count_arg, uint_(m_p_count)); + + return exec_1d(queue, 0, m_count); + } + +private: + size_t m_p_count; + size_t m_p_count_arg; + size_t m_count; +}; + +} //end detail namespace +} //end compute namespace +} //end boost namespace + +#endif // BOOST_COMPUTE_ALGORITHM_DETAIL_SEARCH_ALL_HPP diff --git 
a/boost/compute/algorithm/detail/serial_accumulate.hpp b/boost/compute/algorithm/detail/serial_accumulate.hpp new file mode 100644 index 0000000000..84f9910122 --- /dev/null +++ b/boost/compute/algorithm/detail/serial_accumulate.hpp @@ -0,0 +1,56 @@ +//---------------------------------------------------------------------------// +// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com> +// +// Distributed under the Boost Software License, Version 1.0 +// See accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt +// +// See http://boostorg.github.com/compute for more information. +//---------------------------------------------------------------------------// + +#ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_SERIAL_ACCUMULATE_HPP +#define BOOST_COMPUTE_ALGORITHM_DETAIL_SERIAL_ACCUMULATE_HPP + +#include <boost/compute/command_queue.hpp> +#include <boost/compute/detail/meta_kernel.hpp> +#include <boost/compute/detail/iterator_range_size.hpp> + +namespace boost { +namespace compute { +namespace detail { + +template<class InputIterator, class OutputIterator, class T, class BinaryFunction> +inline void serial_accumulate(InputIterator first, + InputIterator last, + OutputIterator result, + T init, + BinaryFunction function, + command_queue &queue) +{ + const context &context = queue.get_context(); + size_t count = detail::iterator_range_size(first, last); + + meta_kernel k("serial_accumulate"); + size_t init_arg = k.add_arg<T>("init"); + size_t count_arg = k.add_arg<cl_uint>("count"); + + k << + k.decl<T>("result") << " = init;\n" << + "for(uint i = 0; i < count; i++)\n" << + " result = " << function(k.var<T>("result"), + first[k.var<cl_uint>("i")]) << ";\n" << + result[0] << " = result;\n"; + + kernel kernel = k.compile(context); + + kernel.set_arg(init_arg, init); + kernel.set_arg(count_arg, static_cast<cl_uint>(count)); + + queue.enqueue_task(kernel); +} + +} // end detail namespace +} // end compute namespace +} // end boost namespace + +#endif 
// BOOST_COMPUTE_ALGORITHM_DETAIL_SERIAL_ACCUMULATE_HPP diff --git a/boost/compute/algorithm/detail/serial_count_if.hpp b/boost/compute/algorithm/detail/serial_count_if.hpp new file mode 100644 index 0000000000..be6794c426 --- /dev/null +++ b/boost/compute/algorithm/detail/serial_count_if.hpp @@ -0,0 +1,68 @@ +//---------------------------------------------------------------------------// +// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com> +// +// Distributed under the Boost Software License, Version 1.0 +// See accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt +// +// See http://boostorg.github.com/compute for more information. +//---------------------------------------------------------------------------// + +#ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_SERIAL_COUNT_IF_HPP +#define BOOST_COMPUTE_ALGORITHM_DETAIL_SERIAL_COUNT_IF_HPP + +#include <iterator> + +#include <boost/compute/container/detail/scalar.hpp> +#include <boost/compute/detail/meta_kernel.hpp> +#include <boost/compute/detail/iterator_range_size.hpp> + +namespace boost { +namespace compute { +namespace detail { + +// counts values that match the predicate using a single thread +template<class InputIterator, class Predicate> +inline size_t serial_count_if(InputIterator first, + InputIterator last, + Predicate predicate, + command_queue &queue) +{ + typedef typename std::iterator_traits<InputIterator>::value_type value_type; + + const context &context = queue.get_context(); + size_t size = iterator_range_size(first, last); + + meta_kernel k("serial_count_if"); + k.add_set_arg("size", static_cast<uint_>(size)); + size_t result_arg = k.add_arg<uint_ *>(memory_object::global_memory, "result"); + + k << + "uint count = 0;\n" << + "for(uint i = 0; i < size; i++){\n" << + k.decl<const value_type>("value") << "=" + << first[k.var<uint_>("i")] << ";\n" << + "if(" << predicate(k.var<const value_type>("value")) << "){\n" << + "count++;\n" << + "}\n" + "}\n" + "*result = 
count;\n"; + + kernel kernel = k.compile(context); + + // setup result buffer + scalar<uint_> result(context); + kernel.set_arg(result_arg, result.get_buffer()); + + // run kernel + queue.enqueue_task(kernel); + + // read index + return result.read(queue); +} + +} // end detail namespace +} // end compute namespace +} // end boost namespace + +#endif // BOOST_COMPUTE_ALGORITHM_DETAIL_SERIAL_COUNT_IF_HPP diff --git a/boost/compute/algorithm/detail/serial_find_extrema.hpp b/boost/compute/algorithm/detail/serial_find_extrema.hpp new file mode 100644 index 0000000000..8407c88129 --- /dev/null +++ b/boost/compute/algorithm/detail/serial_find_extrema.hpp @@ -0,0 +1,87 @@ +//---------------------------------------------------------------------------// +// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com> +// +// Distributed under the Boost Software License, Version 1.0 +// See accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt +// +// See http://boostorg.github.com/compute for more information. 
+//---------------------------------------------------------------------------// + +#ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_SERIAL_FIND_EXTREMA_HPP +#define BOOST_COMPUTE_ALGORITHM_DETAIL_SERIAL_FIND_EXTREMA_HPP + +#include <boost/compute/command_queue.hpp> +#include <boost/compute/types/fundamental.hpp> +#include <boost/compute/detail/meta_kernel.hpp> +#include <boost/compute/detail/iterator_range_size.hpp> +#include <boost/compute/container/detail/scalar.hpp> + +namespace boost { +namespace compute { +namespace detail { + +template<class InputIterator, class Compare> +inline InputIterator serial_find_extrema(InputIterator first, + InputIterator last, + Compare compare, + const bool find_minimum, + command_queue &queue) +{ + typedef typename std::iterator_traits<InputIterator>::value_type value_type; + typedef typename std::iterator_traits<InputIterator>::difference_type difference_type; + + const context &context = queue.get_context(); + + meta_kernel k("serial_find_extrema"); + + k << + k.decl<value_type>("value") << " = " << first[k.expr<uint_>("0")] << ";\n" << + k.decl<uint_>("value_index") << " = 0;\n" << + "for(uint i = 1; i < size; i++){\n" << + " " << k.decl<value_type>("candidate") << "=" + << first[k.expr<uint_>("i")] << ";\n" << + + "#ifndef BOOST_COMPUTE_FIND_MAXIMUM\n" << + " if(" << compare(k.var<value_type>("candidate"), + k.var<value_type>("value")) << "){\n" << + "#else\n" << + " if(" << compare(k.var<value_type>("value"), + k.var<value_type>("candidate")) << "){\n" << + "#endif\n" << + + " value = candidate;\n" << + " value_index = i;\n" << + " }\n" << + "}\n" << + "*index = value_index;\n"; + + size_t index_arg_index = k.add_arg<uint_ *>(memory_object::global_memory, "index"); + size_t size_arg_index = k.add_arg<uint_>("size"); + + std::string options; + if(!find_minimum){ + options = "-DBOOST_COMPUTE_FIND_MAXIMUM"; + } + kernel kernel = k.compile(context, options); + + // setup index buffer + scalar<uint_> index(context); + 
kernel.set_arg(index_arg_index, index.get_buffer()); + + // setup count + size_t count = iterator_range_size(first, last); + kernel.set_arg(size_arg_index, static_cast<uint_>(count)); + + // run kernel + queue.enqueue_task(kernel); + + // read index and return iterator + return first + static_cast<difference_type>(index.read(queue)); +} + +} // end detail namespace +} // end compute namespace +} // end boost namespace + +#endif // BOOST_COMPUTE_ALGORITHM_DETAIL_SERIAL_FIND_EXTREMA_HPP diff --git a/boost/compute/algorithm/detail/serial_merge.hpp b/boost/compute/algorithm/detail/serial_merge.hpp new file mode 100644 index 0000000000..85e38f704c --- /dev/null +++ b/boost/compute/algorithm/detail/serial_merge.hpp @@ -0,0 +1,97 @@ +//---------------------------------------------------------------------------// +// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com> +// +// Distributed under the Boost Software License, Version 1.0 +// See accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt +// +// See http://boostorg.github.com/compute for more information. 
+//---------------------------------------------------------------------------// + +#ifndef BOOST_COMPUTE_ALGORITHM_SERIAL_MERGE_HPP +#define BOOST_COMPUTE_ALGORITHM_SERIAL_MERGE_HPP + +#include <iterator> + +#include <boost/compute/command_queue.hpp> +#include <boost/compute/detail/meta_kernel.hpp> +#include <boost/compute/detail/iterator_range_size.hpp> + +namespace boost { +namespace compute { +namespace detail { + +template<class InputIterator1, + class InputIterator2, + class OutputIterator, + class Compare> +inline OutputIterator serial_merge(InputIterator1 first1, + InputIterator1 last1, + InputIterator2 first2, + InputIterator2 last2, + OutputIterator result, + Compare comp, + command_queue &queue) +{ + typedef typename + std::iterator_traits<InputIterator1>::value_type + input_type1; + typedef typename + std::iterator_traits<InputIterator2>::value_type + input_type2; + typedef typename + std::iterator_traits<OutputIterator>::difference_type + result_difference_type; + + std::ptrdiff_t size1 = std::distance(first1, last1); + std::ptrdiff_t size2 = std::distance(first2, last2); + + meta_kernel k("serial_merge"); + k.add_set_arg<uint_>("size1", static_cast<uint_>(size1)); + k.add_set_arg<uint_>("size2", static_cast<uint_>(size2)); + + k << + "uint i = 0;\n" << // index in result range + "uint j = 0;\n" << // index in first input range + "uint k = 0;\n" << // index in second input range + + // fetch initial values from each range + k.decl<input_type1>("j_value") << " = " << first1[0] << ";\n" << + k.decl<input_type2>("k_value") << " = " << first2[0] << ";\n" << + + // merge values from both input ranges to the result range + "while(j < size1 && k < size2){\n" << + " if(" << comp(k.var<input_type1>("j_value"), + k.var<input_type2>("k_value")) << "){\n" << + " " << result[k.var<uint_>("i++")] << " = j_value;\n" << + " j_value = " << first1[k.var<uint_>("++j")] << ";\n" << + " }\n" << + " else{\n" + " " << result[k.var<uint_>("i++")] << " = k_value;\n" + " 
k_value = " << first2[k.var<uint_>("++k")] << ";\n" << + " }\n" + "}\n" + + // copy any remaining values from first range + "while(j < size1){\n" << + result[k.var<uint_>("i++")] << " = " << + first1[k.var<uint_>("j++")] << ";\n" << + "}\n" + + // copy any remaining values from second range + "while(k < size2){\n" << + result[k.var<uint_>("i++")] << " = " << + first2[k.var<uint_>("k++")] << ";\n" << + "}\n"; + + // run kernel + k.exec(queue); + + return result + static_cast<result_difference_type>(size1 + size2); +} + +} // end detail namespace +} // end compute namespace +} // end boost namespace + +#endif // BOOST_COMPUTE_ALGORITHM_SERIAL_MERGE_HPP diff --git a/boost/compute/algorithm/detail/serial_reduce.hpp b/boost/compute/algorithm/detail/serial_reduce.hpp new file mode 100644 index 0000000000..53aaf140fe --- /dev/null +++ b/boost/compute/algorithm/detail/serial_reduce.hpp @@ -0,0 +1,62 @@ +//---------------------------------------------------------------------------// +// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com> +// +// Distributed under the Boost Software License, Version 1.0 +// See accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt +// +// See http://boostorg.github.com/compute for more information. 
+//---------------------------------------------------------------------------// + +#ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_SERIAL_REDUCE_HPP +#define BOOST_COMPUTE_ALGORITHM_DETAIL_SERIAL_REDUCE_HPP + +#include <boost/compute/command_queue.hpp> +#include <boost/compute/detail/meta_kernel.hpp> +#include <boost/compute/detail/iterator_range_size.hpp> +#include <boost/compute/type_traits/result_of.hpp> + +namespace boost { +namespace compute { +namespace detail { + +template<class InputIterator, class OutputIterator, class BinaryFunction> +inline void serial_reduce(InputIterator first, + InputIterator last, + OutputIterator result, + BinaryFunction function, + command_queue &queue) +{ + typedef typename + std::iterator_traits<InputIterator>::value_type T; + typedef typename + ::boost::compute::result_of<BinaryFunction(T, T)>::type result_type; + + const context &context = queue.get_context(); + size_t count = detail::iterator_range_size(first, last); + if(count == 0){ + return; + } + + meta_kernel k("serial_reduce"); + size_t count_arg = k.add_arg<cl_uint>("count"); + + k << + k.decl<result_type>("result") << " = " << first[0] << ";\n" << + "for(uint i = 1; i < count; i++)\n" << + " result = " << function(k.var<T>("result"), + first[k.var<uint_>("i")]) << ";\n" << + result[0] << " = result;\n"; + + kernel kernel = k.compile(context); + + kernel.set_arg(count_arg, static_cast<uint_>(count)); + + queue.enqueue_task(kernel); +} + +} // end detail namespace +} // end compute namespace +} // end boost namespace + +#endif // BOOST_COMPUTE_ALGORITHM_DETAIL_SERIAL_REDUCE_HPP diff --git a/boost/compute/algorithm/detail/serial_reduce_by_key.hpp b/boost/compute/algorithm/detail/serial_reduce_by_key.hpp new file mode 100644 index 0000000000..f9bda8e476 --- /dev/null +++ b/boost/compute/algorithm/detail/serial_reduce_by_key.hpp @@ -0,0 +1,108 @@ +//---------------------------------------------------------------------------// +// Copyright (c) 2015 Jakub Szuppe <j.szuppe@gmail.com> 
+// +// Distributed under the Boost Software License, Version 1.0 +// See accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt +// +// See http://boostorg.github.com/compute for more information. +//---------------------------------------------------------------------------// + +#ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_SERIAL_REDUCE_BY_KEY_HPP +#define BOOST_COMPUTE_ALGORITHM_DETAIL_SERIAL_REDUCE_BY_KEY_HPP + +#include <iterator> + +#include <boost/compute/command_queue.hpp> +#include <boost/compute/functional.hpp> +#include <boost/compute/container/vector.hpp> +#include <boost/compute/container/detail/scalar.hpp> +#include <boost/compute/detail/meta_kernel.hpp> +#include <boost/compute/detail/iterator_range_size.hpp> +#include <boost/compute/type_traits/result_of.hpp> + +namespace boost { +namespace compute { +namespace detail { + +template<class InputKeyIterator, class InputValueIterator, + class OutputKeyIterator, class OutputValueIterator, + class BinaryFunction, class BinaryPredicate> +inline size_t serial_reduce_by_key(InputKeyIterator keys_first, + InputKeyIterator keys_last, + InputValueIterator values_first, + OutputKeyIterator keys_result, + OutputValueIterator values_result, + BinaryFunction function, + BinaryPredicate predicate, + command_queue &queue) +{ + typedef typename + std::iterator_traits<InputValueIterator>::value_type value_type; + typedef typename + std::iterator_traits<InputKeyIterator>::value_type key_type; + typedef typename + ::boost::compute::result_of<BinaryFunction(value_type, value_type)>::type result_type; + + const context &context = queue.get_context(); + size_t count = detail::iterator_range_size(keys_first, keys_last); + if(count < 1){ + return count; + } + + meta_kernel k("serial_reduce_by_key"); + size_t count_arg = k.add_arg<uint_>("count"); + size_t result_size_arg = k.add_arg<uint_ *>(memory_object::global_memory, + "result_size"); + + convert<result_type> to_result_type; + + k << + 
k.decl<result_type>("result") << + " = " << to_result_type(values_first[0]) << ";\n" << + k.decl<key_type>("previous_key") << " = " << keys_first[0] << ";\n" << + k.decl<result_type>("value") << ";\n" << + k.decl<key_type>("key") << ";\n" << + + k.decl<uint_>("size") << " = 1;\n" << + + keys_result[0] << " = previous_key;\n" << + values_result[0] << " = result;\n" << + + "for(ulong i = 1; i < count; i++) {\n" << + " value = " << to_result_type(values_first[k.var<uint_>("i")]) << ";\n" << + " key = " << keys_first[k.var<uint_>("i")] << ";\n" << + " if (" << predicate(k.var<key_type>("previous_key"), + k.var<key_type>("key")) << ") {\n" << + + " result = " << function(k.var<result_type>("result"), + k.var<result_type>("value")) << ";\n" << + " }\n " << + " else { \n" << + keys_result[k.var<uint_>("size - 1")] << " = previous_key;\n" << + values_result[k.var<uint_>("size - 1")] << " = result;\n" << + " result = value;\n" << + " size++;\n" << + " } \n" << + " previous_key = key;\n" << + "}\n" << + keys_result[k.var<uint_>("size - 1")] << " = previous_key;\n" << + values_result[k.var<uint_>("size - 1")] << " = result;\n" << + "*result_size = size;"; + + kernel kernel = k.compile(context); + + scalar<uint_> result_size(context); + kernel.set_arg(result_size_arg, result_size.get_buffer()); + kernel.set_arg(count_arg, static_cast<uint_>(count)); + + queue.enqueue_task(kernel); + + return static_cast<size_t>(result_size.read(queue)); +} + +} // end detail namespace +} // end compute namespace +} // end boost namespace + +#endif // BOOST_COMPUTE_ALGORITHM_DETAIL_SERIAL_REDUCE_BY_KEY_HPP diff --git a/boost/compute/algorithm/equal.hpp b/boost/compute/algorithm/equal.hpp new file mode 100644 index 0000000000..35d0c5f0ea --- /dev/null +++ b/boost/compute/algorithm/equal.hpp @@ -0,0 +1,53 @@ +//---------------------------------------------------------------------------// +// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com> +// +// Distributed under the Boost Software License, 
Version 1.0 +// See accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt +// +// See http://boostorg.github.com/compute for more information. +//---------------------------------------------------------------------------// + +#ifndef BOOST_COMPUTE_ALGORITHM_EQUAL_HPP +#define BOOST_COMPUTE_ALGORITHM_EQUAL_HPP + +#include <boost/compute/system.hpp> +#include <boost/compute/command_queue.hpp> +#include <boost/compute/algorithm/mismatch.hpp> + +namespace boost { +namespace compute { + +/// Returns \c true if the range [\p first1, \p last1) and the range +/// beginning at \p first2 are equal. +template<class InputIterator1, class InputIterator2> +inline bool equal(InputIterator1 first1, + InputIterator1 last1, + InputIterator2 first2, + command_queue &queue = system::default_queue()) +{ + return ::boost::compute::mismatch(first1, + last1, + first2, + queue).first == last1; +} + +/// \overload +template<class InputIterator1, class InputIterator2> +inline bool equal(InputIterator1 first1, + InputIterator1 last1, + InputIterator2 first2, + InputIterator2 last2, + command_queue &queue = system::default_queue()) +{ + if(std::distance(first1, last1) != std::distance(first2, last2)){ + return false; + } + + return ::boost::compute::equal(first1, last1, first2, queue); +} + +} // end compute namespace +} // end boost namespace + +#endif // BOOST_COMPUTE_ALGORITHM_EQUAL_HPP diff --git a/boost/compute/algorithm/equal_range.hpp b/boost/compute/algorithm/equal_range.hpp new file mode 100644 index 0000000000..fd82177324 --- /dev/null +++ b/boost/compute/algorithm/equal_range.hpp @@ -0,0 +1,42 @@ +//---------------------------------------------------------------------------// +// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com> +// +// Distributed under the Boost Software License, Version 1.0 +// See accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt +// +// See http://boostorg.github.com/compute for more 
information. +//---------------------------------------------------------------------------// + +#ifndef BOOST_COMPUTE_ALGORITHM_EQUAL_RANGE_HPP +#define BOOST_COMPUTE_ALGORITHM_EQUAL_RANGE_HPP + +#include <utility> + +#include <boost/compute/system.hpp> +#include <boost/compute/command_queue.hpp> +#include <boost/compute/algorithm/lower_bound.hpp> +#include <boost/compute/algorithm/upper_bound.hpp> + +namespace boost { +namespace compute { + +/// Returns a pair of iterators containing the range of values equal +/// to \p value in the sorted range [\p first, \p last). +template<class InputIterator, class T> +inline std::pair<InputIterator, InputIterator> +equal_range(InputIterator first, + InputIterator last, + const T &value, + command_queue &queue = system::default_queue()) +{ + return std::make_pair( + ::boost::compute::lower_bound(first, last, value, queue), + ::boost::compute::upper_bound(first, last, value, queue) + ); +} + +} // end compute namespace +} // end boost namespace + +#endif // BOOST_COMPUTE_ALGORITHM_EQUAL_RANGE_HPP diff --git a/boost/compute/algorithm/exclusive_scan.hpp b/boost/compute/algorithm/exclusive_scan.hpp new file mode 100644 index 0000000000..205d3de658 --- /dev/null +++ b/boost/compute/algorithm/exclusive_scan.hpp @@ -0,0 +1,96 @@ +//---------------------------------------------------------------------------// +// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com> +// +// Distributed under the Boost Software License, Version 1.0 +// See accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt +// +// See http://boostorg.github.com/compute for more information. 
+//---------------------------------------------------------------------------// + +#ifndef BOOST_COMPUTE_ALGORITHM_EXCLUSIVE_SCAN_HPP +#define BOOST_COMPUTE_ALGORITHM_EXCLUSIVE_SCAN_HPP + +#include <boost/compute/functional.hpp> +#include <boost/compute/system.hpp> +#include <boost/compute/command_queue.hpp> +#include <boost/compute/algorithm/detail/scan.hpp> + +namespace boost { +namespace compute { + +/// Performs an exclusive scan of the elements in the range [\p first, \p last) +/// and stores the results in the range beginning at \p result. +/// +/// Each element in the output is assigned to the sum of all the previous +/// values in the input. +/// +/// \param first first element in the range to scan +/// \param last last element in the range to scan +/// \param result first element in the result range +/// \param init value used to initialize the scan sequence +/// \param binary_op associative binary operator +/// \param queue command queue to perform the operation +/// +/// \return \c OutputIterator to the end of the result range +/// +/// The default operation is to add the elements up. +/// +/// \snippet test/test_scan.cpp exclusive_scan_int +/// +/// But different associative operation can be specified as \p binary_op +/// instead (e.g., multiplication, maximum, minimum). Also value used to +/// initialized the scan sequence can be specified. 
+/// +/// \snippet test/test_scan.cpp exclusive_scan_int_multiplies +/// +/// \see inclusive_scan() +template<class InputIterator, class OutputIterator, class T, class BinaryOperator> +inline OutputIterator +exclusive_scan(InputIterator first, + InputIterator last, + OutputIterator result, + T init, + BinaryOperator binary_op, + command_queue &queue = system::default_queue()) +{ + return detail::scan(first, last, result, true, init, binary_op, queue); +} + +/// \overload +template<class InputIterator, class OutputIterator, class T> +inline OutputIterator +exclusive_scan(InputIterator first, + InputIterator last, + OutputIterator result, + T init, + command_queue &queue = system::default_queue()) +{ + typedef typename + std::iterator_traits<OutputIterator>::value_type output_type; + + return detail::scan(first, last, result, true, + init, boost::compute::plus<output_type>(), + queue); +} + +/// \overload +template<class InputIterator, class OutputIterator> +inline OutputIterator +exclusive_scan(InputIterator first, + InputIterator last, + OutputIterator result, + command_queue &queue = system::default_queue()) +{ + typedef typename + std::iterator_traits<OutputIterator>::value_type output_type; + + return detail::scan(first, last, result, true, + output_type(0), boost::compute::plus<output_type>(), + queue); +} + +} // end compute namespace +} // end boost namespace + +#endif // BOOST_COMPUTE_ALGORITHM_EXCLUSIVE_SCAN_HPP diff --git a/boost/compute/algorithm/fill.hpp b/boost/compute/algorithm/fill.hpp new file mode 100644 index 0000000000..c711f46b94 --- /dev/null +++ b/boost/compute/algorithm/fill.hpp @@ -0,0 +1,306 @@ +//---------------------------------------------------------------------------// +// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com> +// +// Distributed under the Boost Software License, Version 1.0 +// See accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt +// +// See http://boostorg.github.com/compute for more 
information. +//---------------------------------------------------------------------------// + +#ifndef BOOST_COMPUTE_ALGORITHM_FILL_HPP +#define BOOST_COMPUTE_ALGORITHM_FILL_HPP + +#include <iterator> + +#include <boost/mpl/int.hpp> +#include <boost/mpl/vector.hpp> +#include <boost/mpl/contains.hpp> +#include <boost/utility/enable_if.hpp> + +#include <boost/compute/cl.hpp> +#include <boost/compute/system.hpp> +#include <boost/compute/command_queue.hpp> +#include <boost/compute/algorithm/copy.hpp> +#include <boost/compute/async/future.hpp> +#include <boost/compute/iterator/constant_iterator.hpp> +#include <boost/compute/iterator/discard_iterator.hpp> +#include <boost/compute/detail/is_buffer_iterator.hpp> +#include <boost/compute/detail/iterator_range_size.hpp> + +namespace boost { +namespace compute { +namespace detail { + +namespace mpl = boost::mpl; + +// fills the range [first, first + count) with value using copy() +template<class BufferIterator, class T> +inline void fill_with_copy(BufferIterator first, + size_t count, + const T &value, + command_queue &queue) +{ + ::boost::compute::copy( + ::boost::compute::make_constant_iterator(value, 0), + ::boost::compute::make_constant_iterator(value, count), + first, + queue + ); +} + +// fills the range [first, first + count) with value using copy_async() +template<class BufferIterator, class T> +inline future<void> fill_async_with_copy(BufferIterator first, + size_t count, + const T &value, + command_queue &queue) +{ + return ::boost::compute::copy_async( + ::boost::compute::make_constant_iterator(value, 0), + ::boost::compute::make_constant_iterator(value, count), + first, + queue + ); +} + +#if defined(CL_VERSION_1_2) + +// meta-function returing true if Iterator points to a range of values +// that can be filled using clEnqueueFillBuffer(). to meet this criteria +// it must have a buffer accessible through iter.get_buffer() and the +// size of its value_type must by in {1, 2, 4, 8, 16, 32, 64, 128}. 
+template<class Iterator> +struct is_valid_fill_buffer_iterator : + public mpl::and_< + is_buffer_iterator<Iterator>, + mpl::contains< + mpl::vector< + mpl::int_<1>, + mpl::int_<2>, + mpl::int_<4>, + mpl::int_<8>, + mpl::int_<16>, + mpl::int_<32>, + mpl::int_<64>, + mpl::int_<128> + >, + mpl::int_< + sizeof(typename std::iterator_traits<Iterator>::value_type) + > + > + >::type { }; + +template<> +struct is_valid_fill_buffer_iterator<discard_iterator> : public boost::false_type {}; + +// specialization which uses clEnqueueFillBuffer for buffer iterators +template<class BufferIterator, class T> +inline void +dispatch_fill(BufferIterator first, + size_t count, + const T &value, + command_queue &queue, + typename boost::enable_if< + is_valid_fill_buffer_iterator<BufferIterator> + >::type* = 0) +{ + typedef typename std::iterator_traits<BufferIterator>::value_type value_type; + + if(count == 0){ + // nothing to do + return; + } + + // check if the device supports OpenCL 1.2 (required for enqueue_fill_buffer) + if(!queue.check_device_version(1, 2)){ + return fill_with_copy(first, count, value, queue); + } + + value_type pattern = static_cast<value_type>(value); + size_t offset = static_cast<size_t>(first.get_index()); + + if(count == 1){ + // use clEnqueueWriteBuffer() directly when writing a single value + // to the device buffer. this is potentially more efficient and also + // works around a bug in the intel opencl driver. 
+ queue.enqueue_write_buffer( + first.get_buffer(), + offset * sizeof(value_type), + sizeof(value_type), + &pattern + ); + } + else { + queue.enqueue_fill_buffer( + first.get_buffer(), + &pattern, + sizeof(value_type), + offset * sizeof(value_type), + count * sizeof(value_type) + ); + } +} + +template<class BufferIterator, class T> +inline future<void> +dispatch_fill_async(BufferIterator first, + size_t count, + const T &value, + command_queue &queue, + typename boost::enable_if< + is_valid_fill_buffer_iterator<BufferIterator> + >::type* = 0) +{ + typedef typename std::iterator_traits<BufferIterator>::value_type value_type; + + // check if the device supports OpenCL 1.2 (required for enqueue_fill_buffer) + if(!queue.check_device_version(1, 2)){ + return fill_async_with_copy(first, count, value, queue); + } + + value_type pattern = static_cast<value_type>(value); + size_t offset = static_cast<size_t>(first.get_index()); + + event event_ = + queue.enqueue_fill_buffer(first.get_buffer(), + &pattern, + sizeof(value_type), + offset * sizeof(value_type), + count * sizeof(value_type)); + + return future<void>(event_); +} + +#ifdef CL_VERSION_2_0 +// specializations for svm_ptr<T> +template<class T> +inline void dispatch_fill(svm_ptr<T> first, + size_t count, + const T &value, + command_queue &queue) +{ + if(count == 0){ + return; + } + + queue.enqueue_svm_fill( + first.get(), &value, sizeof(T), count * sizeof(T) + ); +} + +template<class T> +inline future<void> dispatch_fill_async(svm_ptr<T> first, + size_t count, + const T &value, + command_queue &queue) +{ + if(count == 0){ + return future<void>(); + } + + event event_ = queue.enqueue_svm_fill( + first.get(), &value, sizeof(T), count * sizeof(T) + ); + + return future<void>(event_); +} +#endif // CL_VERSION_2_0 + +// default implementations +template<class BufferIterator, class T> +inline void +dispatch_fill(BufferIterator first, + size_t count, + const T &value, + command_queue &queue, + typename boost::disable_if< + 
is_valid_fill_buffer_iterator<BufferIterator> + >::type* = 0) +{ + fill_with_copy(first, count, value, queue); +} + +template<class BufferIterator, class T> +inline future<void> +dispatch_fill_async(BufferIterator first, + size_t count, + const T &value, + command_queue &queue, + typename boost::disable_if< + is_valid_fill_buffer_iterator<BufferIterator> + >::type* = 0) +{ + return fill_async_with_copy(first, count, value, queue); +} +#else +template<class BufferIterator, class T> +inline void dispatch_fill(BufferIterator first, + size_t count, + const T &value, + command_queue &queue) +{ + fill_with_copy(first, count, value, queue); +} + +template<class BufferIterator, class T> +inline future<void> dispatch_fill_async(BufferIterator first, + size_t count, + const T &value, + command_queue &queue) +{ + return fill_async_with_copy(first, count, value, queue); +} +#endif // !defined(CL_VERSION_1_2) + +} // end detail namespace + +/// Fills the range [\p first, \p last) with \p value. +/// +/// \param first first element in the range to fill +/// \param last last element in the range to fill +/// \param value value to copy to each element +/// \param queue command queue to perform the operation +/// +/// For example, to fill a vector on the device with sevens: +/// \code +/// // vector on the device +/// boost::compute::vector<int> vec(10, context); +/// +/// // fill vector with sevens +/// boost::compute::fill(vec.begin(), vec.end(), 7, queue); +/// \endcode +/// +/// \see boost::compute::fill_n() +template<class BufferIterator, class T> +inline void fill(BufferIterator first, + BufferIterator last, + const T &value, + command_queue &queue = system::default_queue()) +{ + size_t count = detail::iterator_range_size(first, last); + if(count == 0){ + return; + } + + detail::dispatch_fill(first, count, value, queue); +} + +template<class BufferIterator, class T> +inline future<void> fill_async(BufferIterator first, + BufferIterator last, + const T &value, + command_queue 
&queue = system::default_queue()) +{ + size_t count = detail::iterator_range_size(first, last); + if(count == 0){ + return future<void>(); + } + + return detail::dispatch_fill_async(first, count, value, queue); +} + +} // end compute namespace +} // end boost namespace + +#endif // BOOST_COMPUTE_ALGORITHM_FILL_HPP diff --git a/boost/compute/algorithm/fill_n.hpp b/boost/compute/algorithm/fill_n.hpp new file mode 100644 index 0000000000..18a8f706a5 --- /dev/null +++ b/boost/compute/algorithm/fill_n.hpp @@ -0,0 +1,36 @@ +//---------------------------------------------------------------------------// +// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com> +// +// Distributed under the Boost Software License, Version 1.0 +// See accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt +// +// See http://boostorg.github.com/compute for more information. +//---------------------------------------------------------------------------// + +#ifndef BOOST_COMPUTE_ALGORITHM_FILL_N_HPP +#define BOOST_COMPUTE_ALGORITHM_FILL_N_HPP + +#include <boost/compute/system.hpp> +#include <boost/compute/command_queue.hpp> +#include <boost/compute/algorithm/fill.hpp> + +namespace boost { +namespace compute { + +/// Fills the range [\p first, \p first + count) with \p value. 
+/// +/// \see fill() +template<class BufferIterator, class Size, class T> +inline void fill_n(BufferIterator first, + Size count, + const T &value, + command_queue &queue = system::default_queue()) +{ + ::boost::compute::fill(first, first + count, value, queue); +} + +} // end compute namespace +} // end boost namespace + +#endif // BOOST_COMPUTE_ALGORITHM_FILL_N_HPP diff --git a/boost/compute/algorithm/find.hpp b/boost/compute/algorithm/find.hpp new file mode 100644 index 0000000000..ef3ebf0c47 --- /dev/null +++ b/boost/compute/algorithm/find.hpp @@ -0,0 +1,57 @@ +//---------------------------------------------------------------------------// +// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com> +// +// Distributed under the Boost Software License, Version 1.0 +// See accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt +// +// See http://boostorg.github.com/compute for more information. +//---------------------------------------------------------------------------// + +#ifndef BOOST_COMPUTE_ALGORITHM_FIND_HPP +#define BOOST_COMPUTE_ALGORITHM_FIND_HPP + +#include <boost/compute/lambda.hpp> +#include <boost/compute/system.hpp> +#include <boost/compute/command_queue.hpp> +#include <boost/compute/algorithm/find_if.hpp> +#include <boost/compute/type_traits/vector_size.hpp> + +namespace boost { +namespace compute { + +/// Returns an iterator pointing to the first element in the range +/// [\p first, \p last) that equals \p value. 
+template<class InputIterator, class T> +inline InputIterator find(InputIterator first, + InputIterator last, + const T &value, + command_queue &queue = system::default_queue()) +{ + typedef typename std::iterator_traits<InputIterator>::value_type value_type; + + using ::boost::compute::_1; + using ::boost::compute::lambda::all; + + if(vector_size<value_type>::value == 1){ + return ::boost::compute::find_if( + first, + last, + _1 == value, + queue + ); + } + else { + return ::boost::compute::find_if( + first, + last, + all(_1 == value), + queue + ); + } +} + +} // end compute namespace +} // end boost namespace + +#endif // BOOST_COMPUTE_ALGORITHM_FIND_HPP diff --git a/boost/compute/algorithm/find_end.hpp b/boost/compute/algorithm/find_end.hpp new file mode 100644 index 0000000000..5c40055113 --- /dev/null +++ b/boost/compute/algorithm/find_end.hpp @@ -0,0 +1,119 @@ +//---------------------------------------------------------------------------// +// Copyright (c) 2014 Roshan <thisisroshansmail@gmail.com> +// +// Distributed under the Boost Software License, Version 1.0 +// See accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt +// +// See http://boostorg.github.com/compute for more information. 
+//---------------------------------------------------------------------------// + +#ifndef BOOST_COMPUTE_ALGORITHM_FIND_END_HPP +#define BOOST_COMPUTE_ALGORITHM_FIND_END_HPP + +#include <boost/compute/algorithm/copy.hpp> +#include <boost/compute/algorithm/detail/search_all.hpp> +#include <boost/compute/container/detail/scalar.hpp> +#include <boost/compute/container/vector.hpp> +#include <boost/compute/detail/iterator_range_size.hpp> +#include <boost/compute/detail/meta_kernel.hpp> +#include <boost/compute/system.hpp> + +namespace boost { +namespace compute { +namespace detail { + +/// +/// \brief Helper function for find_end +/// +/// Basically a copy of find_if which returns last occurence +/// instead of first occurence +/// +template<class InputIterator, class UnaryPredicate> +inline InputIterator find_end_helper(InputIterator first, + InputIterator last, + UnaryPredicate predicate, + command_queue &queue) +{ + typedef typename std::iterator_traits<InputIterator>::value_type value_type; + + size_t count = detail::iterator_range_size(first, last); + if(count == 0){ + return last; + } + + const context &context = queue.get_context(); + + detail::meta_kernel k("find_end"); + size_t index_arg = k.add_arg<int *>(memory_object::global_memory, "index"); + atomic_max<int_> atomic_max_int; + + k << k.decl<const int_>("i") << " = get_global_id(0);\n" + << k.decl<const value_type>("value") << "=" + << first[k.var<const int_>("i")] << ";\n" + << "if(" << predicate(k.var<const value_type>("value")) << "){\n" + << " " << atomic_max_int(k.var<int_ *>("index"), k.var<int_>("i")) << ";\n" + << "}\n"; + + kernel kernel = k.compile(context); + + scalar<int_> index(context); + kernel.set_arg(index_arg, index.get_buffer()); + + index.write(static_cast<int_>(-1), queue); + + queue.enqueue_1d_range_kernel(kernel, 0, count, 0); + + int result = static_cast<int>(index.read(queue)); + if(result == -1) return last; + else return first + result; +} + +} // end detail namespace + +/// +/// 
\brief Substring matching algorithm +/// +/// Searches for the last match of the pattern [p_first, p_last) +/// in text [t_first, t_last). +/// \return Iterator pointing to beginning of last occurence +/// +/// \param t_first Iterator pointing to start of text +/// \param t_last Iterator pointing to end of text +/// \param p_first Iterator pointing to start of pattern +/// \param p_last Iterator pointing to end of pattern +/// \param queue Queue on which to execute +/// +template<class TextIterator, class PatternIterator> +inline TextIterator find_end(TextIterator t_first, + TextIterator t_last, + PatternIterator p_first, + PatternIterator p_last, + command_queue &queue = system::default_queue()) +{ + const context &context = queue.get_context(); + vector<uint_> matching_indices(detail::iterator_range_size(t_first, t_last), + context); + + detail::search_kernel<PatternIterator, + TextIterator, + vector<uint_>::iterator> kernel; + + kernel.set_range(p_first, p_last, t_first, t_last, matching_indices.begin()); + kernel.exec(queue); + + using boost::compute::_1; + + vector<uint_>::iterator index = + detail::find_end_helper(matching_indices.begin(), + matching_indices.end(), + _1 == 1, + queue); + + return t_first + detail::iterator_range_size(matching_indices.begin(), index); +} + +} //end compute namespace +} //end boost namespace + +#endif // BOOST_COMPUTE_ALGORITHM_FIND_END_HPP diff --git a/boost/compute/algorithm/find_if.hpp b/boost/compute/algorithm/find_if.hpp new file mode 100644 index 0000000000..db99cc0396 --- /dev/null +++ b/boost/compute/algorithm/find_if.hpp @@ -0,0 +1,35 @@ +//---------------------------------------------------------------------------// +// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com> +// +// Distributed under the Boost Software License, Version 1.0 +// See accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt +// +// See http://boostorg.github.com/compute for more information. 
+//---------------------------------------------------------------------------// + +#ifndef BOOST_COMPUTE_ALGORITHM_FIND_IF_HPP +#define BOOST_COMPUTE_ALGORITHM_FIND_IF_HPP + +#include <boost/compute/system.hpp> +#include <boost/compute/command_queue.hpp> +#include <boost/compute/algorithm/detail/find_if_with_atomics.hpp> + +namespace boost { +namespace compute { + +/// Returns an iterator pointing to the first element in the range +/// [\p first, \p last) for which \p predicate returns \c true. +template<class InputIterator, class UnaryPredicate> +inline InputIterator find_if(InputIterator first, + InputIterator last, + UnaryPredicate predicate, + command_queue &queue = system::default_queue()) +{ + return detail::find_if_with_atomics(first, last, predicate, queue); +} + +} // end compute namespace +} // end boost namespace + +#endif // BOOST_COMPUTE_ALGORITHM_FIND_IF_HPP diff --git a/boost/compute/algorithm/find_if_not.hpp b/boost/compute/algorithm/find_if_not.hpp new file mode 100644 index 0000000000..61de050d31 --- /dev/null +++ b/boost/compute/algorithm/find_if_not.hpp @@ -0,0 +1,43 @@ +//---------------------------------------------------------------------------// +// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com> +// +// Distributed under the Boost Software License, Version 1.0 +// See accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt +// +// See http://boostorg.github.com/compute for more information. 
+//---------------------------------------------------------------------------// + +#ifndef BOOST_COMPUTE_ALGORITHM_FIND_IF_NOT_HPP +#define BOOST_COMPUTE_ALGORITHM_FIND_IF_NOT_HPP + +#include <boost/compute/system.hpp> +#include <boost/compute/functional.hpp> +#include <boost/compute/command_queue.hpp> +#include <boost/compute/algorithm/find_if.hpp> + +namespace boost { +namespace compute { + +/// Returns an iterator pointing to the first element in the range +/// [\p first, \p last) for which \p predicate returns \c false. +/// +/// \see find_if() +template<class InputIterator, class UnaryPredicate> +inline InputIterator find_if_not(InputIterator first, + InputIterator last, + UnaryPredicate predicate, + command_queue &queue = system::default_queue()) +{ + return ::boost::compute::find_if( + first, + last, + not1(predicate), + queue + ); +} + +} // end compute namespace +} // end boost namespace + +#endif // BOOST_COMPUTE_ALGORITHM_FIND_IF_NOT_HPP diff --git a/boost/compute/algorithm/for_each.hpp b/boost/compute/algorithm/for_each.hpp new file mode 100644 index 0000000000..3ed399e6e9 --- /dev/null +++ b/boost/compute/algorithm/for_each.hpp @@ -0,0 +1,65 @@ +//---------------------------------------------------------------------------// +// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com> +// +// Distributed under the Boost Software License, Version 1.0 +// See accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt +// +// See http://boostorg.github.com/compute for more information. 
+//---------------------------------------------------------------------------// + +#ifndef BOOST_COMPUTE_ALGORITHM_FOR_EACH_HPP +#define BOOST_COMPUTE_ALGORITHM_FOR_EACH_HPP + +#include <boost/compute/system.hpp> +#include <boost/compute/command_queue.hpp> +#include <boost/compute/detail/meta_kernel.hpp> +#include <boost/compute/detail/iterator_range_size.hpp> + +namespace boost { +namespace compute { +namespace detail { + +template<class InputIterator, class Function> +struct for_each_kernel : public meta_kernel +{ + for_each_kernel(InputIterator first, InputIterator last, Function function) + : meta_kernel("for_each") + { + // store range size + m_count = detail::iterator_range_size(first, last); + + // setup kernel source + *this << function(first[get_global_id(0)]) << ";\n"; + } + + void exec(command_queue &queue) + { + exec_1d(queue, 0, m_count); + } + + size_t m_count; +}; + +} // end detail namespace + +/// Calls \p function on each element in the range [\p first, \p last). +/// +/// \see transform() +template<class InputIterator, class UnaryFunction> +inline UnaryFunction for_each(InputIterator first, + InputIterator last, + UnaryFunction function, + command_queue &queue = system::default_queue()) +{ + detail::for_each_kernel<InputIterator, UnaryFunction> kernel(first, last, function); + + kernel.exec(queue); + + return function; +} + +} // end compute namespace +} // end boost namespace + +#endif // BOOST_COMPUTE_ALGORITHM_FOR_EACH_HPP diff --git a/boost/compute/algorithm/for_each_n.hpp b/boost/compute/algorithm/for_each_n.hpp new file mode 100644 index 0000000000..d0be784bf7 --- /dev/null +++ b/boost/compute/algorithm/for_each_n.hpp @@ -0,0 +1,35 @@ +//---------------------------------------------------------------------------// +// Copyright (c) 2013-2014 Kyle Lutz <kyle.r.lutz@gmail.com> +// +// Distributed under the Boost Software License, Version 1.0 +// See accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt +// +// 
See http://boostorg.github.com/compute for more information. +//---------------------------------------------------------------------------// + +#ifndef BOOST_COMPUTE_ALGORITHM_FOR_EACH_N_HPP +#define BOOST_COMPUTE_ALGORITHM_FOR_EACH_N_HPP + +#include <boost/compute/algorithm/for_each.hpp> + +namespace boost { +namespace compute { + +/// Calls \p function on each element in the range [\p first, \p first +/// \c + \p count). +/// +/// \see for_each() +template<class InputIterator, class Size, class UnaryFunction> +inline UnaryFunction for_each_n(InputIterator first, + Size count, + UnaryFunction function, + command_queue &queue = system::default_queue()) +{ + return ::boost::compute::for_each(first, first + count, function, queue); +} + +} // end compute namespace +} // end boost namespace + +#endif // BOOST_COMPUTE_ALGORITHM_FOR_EACH_N_HPP diff --git a/boost/compute/algorithm/gather.hpp b/boost/compute/algorithm/gather.hpp new file mode 100644 index 0000000000..b2f725d54e --- /dev/null +++ b/boost/compute/algorithm/gather.hpp @@ -0,0 +1,84 @@ +//---------------------------------------------------------------------------// +// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com> +// +// Distributed under the Boost Software License, Version 1.0 +// See accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt +// +// See http://boostorg.github.com/compute for more information. 
+//---------------------------------------------------------------------------// + +#ifndef BOOST_COMPUTE_ALGORITHM_GATHER_HPP +#define BOOST_COMPUTE_ALGORITHM_GATHER_HPP + +#include <boost/compute/command_queue.hpp> +#include <boost/compute/detail/iterator_range_size.hpp> +#include <boost/compute/detail/meta_kernel.hpp> +#include <boost/compute/exception.hpp> +#include <boost/compute/iterator/buffer_iterator.hpp> +#include <boost/compute/system.hpp> +#include <boost/compute/type_traits/type_name.hpp> + +namespace boost { +namespace compute { +namespace detail { + +template<class InputIterator, class MapIterator, class OutputIterator> +class gather_kernel : public meta_kernel +{ +public: + gather_kernel() : meta_kernel("gather") + {} + + void set_range(MapIterator first, + MapIterator last, + InputIterator input, + OutputIterator result) + { + m_count = iterator_range_size(first, last); + m_offset = first.get_index(); + + *this << + "const uint i = get_global_id(0);\n" << + result[expr<uint_>("i")] << "=" << + input[first[expr<uint_>("i")]] << ";\n"; + } + + event exec(command_queue &queue) + { + if(m_count == 0) { + return event(); + } + + return exec_1d(queue, m_offset, m_count); + } + +private: + size_t m_count; + size_t m_offset; +}; + +} // end detail namespace + +/// Copies the elements using the indices from the range [\p first, \p last) +/// to the range beginning at \p result using the input values from the range +/// beginning at \p input. 
+/// +/// \see scatter() +template<class InputIterator, class MapIterator, class OutputIterator> +inline void gather(MapIterator first, + MapIterator last, + InputIterator input, + OutputIterator result, + command_queue &queue = system::default_queue()) +{ + detail::gather_kernel<InputIterator, MapIterator, OutputIterator> kernel; + + kernel.set_range(first, last, input, result); + kernel.exec(queue); +} + +} // end compute namespace +} // end boost namespace + +#endif // BOOST_COMPUTE_ALGORITHM_GATHER_HPP diff --git a/boost/compute/algorithm/generate.hpp b/boost/compute/algorithm/generate.hpp new file mode 100644 index 0000000000..c70a542683 --- /dev/null +++ b/boost/compute/algorithm/generate.hpp @@ -0,0 +1,49 @@ +//---------------------------------------------------------------------------// +// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com> +// +// Distributed under the Boost Software License, Version 1.0 +// See accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt +// +// See http://boostorg.github.com/compute for more information. +//---------------------------------------------------------------------------// + +#ifndef BOOST_COMPUTE_ALGORITHM_GENERATE_HPP +#define BOOST_COMPUTE_ALGORITHM_GENERATE_HPP + +#include <boost/compute/system.hpp> +#include <boost/compute/command_queue.hpp> +#include <boost/compute/algorithm/copy.hpp> +#include <boost/compute/iterator/function_input_iterator.hpp> +#include <boost/compute/detail/iterator_range_size.hpp> + +namespace boost { +namespace compute { + +/// Stores the result of \p generator for each element in the range +/// [\p first, \p last). 
+template<class OutputIterator, class Generator> +inline void generate(OutputIterator first, + OutputIterator last, + Generator generator, + command_queue &queue = system::default_queue()) +{ + size_t count = detail::iterator_range_size(first, last); + if(count == 0){ + return; + } + + ::boost::compute::copy( + ::boost::compute::make_function_input_iterator(generator, + first.get_index()), + ::boost::compute::make_function_input_iterator(generator, + last.get_index()), + first, + queue + ); +} + +} // end compute namespace +} // end boost namespace + +#endif // BOOST_COMPUTE_ALGORITHM_GENERATE_HPP diff --git a/boost/compute/algorithm/generate_n.hpp b/boost/compute/algorithm/generate_n.hpp new file mode 100644 index 0000000000..6d8e607b64 --- /dev/null +++ b/boost/compute/algorithm/generate_n.hpp @@ -0,0 +1,35 @@ +//---------------------------------------------------------------------------// +// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com> +// +// Distributed under the Boost Software License, Version 1.0 +// See accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt +// +// See http://boostorg.github.com/compute for more information. +//---------------------------------------------------------------------------// + +#ifndef BOOST_COMPUTE_ALGORITHM_GENERATE_N_HPP +#define BOOST_COMPUTE_ALGORITHM_GENERATE_N_HPP + +#include <boost/compute/system.hpp> +#include <boost/compute/command_queue.hpp> +#include <boost/compute/algorithm/generate.hpp> + +namespace boost { +namespace compute { + +/// Stores the result of \p generator for each element in the range +/// [\p first, \p first + \p count). 
+template<class OutputIterator, class Size, class Generator> +inline void generate_n(OutputIterator first, + Size count, + Generator generator, + command_queue &queue = system::default_queue()) +{ + ::boost::compute::generate(first, first + count, generator, queue); +} + +} // end compute namespace +} // end boost namespace + +#endif // BOOST_COMPUTE_ALGORITHM_GENERATE_N_HPP diff --git a/boost/compute/algorithm/includes.hpp b/boost/compute/algorithm/includes.hpp new file mode 100644 index 0000000000..c4e7c793e7 --- /dev/null +++ b/boost/compute/algorithm/includes.hpp @@ -0,0 +1,155 @@ +//---------------------------------------------------------------------------// +// Copyright (c) 2014 Roshan <thisisroshansmail@gmail.com> +// +// Distributed under the Boost Software License, Version 1.0 +// See accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt +// +// See http://boostorg.github.com/compute for more information. +//---------------------------------------------------------------------------// + +#ifndef BOOST_COMPUTE_ALGORITHM_INCLUDES_HPP +#define BOOST_COMPUTE_ALGORITHM_INCLUDES_HPP + +#include <iterator> + +#include <boost/compute/algorithm/detail/balanced_path.hpp> +#include <boost/compute/algorithm/fill_n.hpp> +#include <boost/compute/algorithm/find.hpp> +#include <boost/compute/container/vector.hpp> +#include <boost/compute/detail/iterator_range_size.hpp> +#include <boost/compute/detail/meta_kernel.hpp> +#include <boost/compute/detail/read_write_single_value.hpp> +#include <boost/compute/system.hpp> + +namespace boost { +namespace compute { +namespace detail { + +/// +/// \brief Serial includes kernel class +/// +/// Subclass of meta_kernel to perform includes operation after tiling +/// +class serial_includes_kernel : meta_kernel +{ +public: + + serial_includes_kernel() : meta_kernel("includes") + { + + } + + template<class InputIterator1, class InputIterator2, + class InputIterator3, class InputIterator4, + class 
OutputIterator> + void set_range(InputIterator1 first1, + InputIterator2 first2, + InputIterator3 tile_first1, + InputIterator3 tile_last1, + InputIterator4 tile_first2, + OutputIterator result) + { + m_count = iterator_range_size(tile_first1, tile_last1) - 1; + + *this << + "uint i = get_global_id(0);\n" << + "uint start1 = " << tile_first1[expr<uint_>("i")] << ";\n" << + "uint end1 = " << tile_first1[expr<uint_>("i+1")] << ";\n" << + "uint start2 = " << tile_first2[expr<uint_>("i")] << ";\n" << + "uint end2 = " << tile_first2[expr<uint_>("i+1")] << ";\n" << + "uint includes = 1;\n" << + "while(start1<end1 && start2<end2)\n" << + "{\n" << + " if(" << first1[expr<uint_>("start1")] << " == " << + first2[expr<uint_>("start2")] << ")\n" << + " {\n" << + " start1++; start2++;\n" << + " }\n" << + " else if(" << first1[expr<uint_>("start1")] << " < " << + first2[expr<uint_>("start2")] << ")\n" << + " start1++;\n" << + " else\n" << + " {\n" << + " includes = 0;\n" << + " break;\n" << + " }\n" << + "}\n" << + "if(start2<end2)\n" << + " includes = 0;\n" << + result[expr<uint_>("i")] << " = includes;\n"; + } + + event exec(command_queue &queue) + { + if(m_count == 0) { + return event(); + } + + return exec_1d(queue, 0, m_count); + } + +private: + size_t m_count; +}; + +} //end detail namespace + +/// +/// \brief Includes algorithm +/// +/// Finds if the sorted range [first1, last1) includes the sorted +/// range [first2, last2). In other words, it checks if [first1, last1) is +/// a superset of [first2, last2). +/// +/// \return True, if [first1, last1) includes [first2, last2). False otherwise. 
+/// +/// \param first1 Iterator pointing to start of first set +/// \param last1 Iterator pointing to end of first set +/// \param first2 Iterator pointing to start of second set +/// \param last2 Iterator pointing to end of second set +/// \param queue Queue on which to execute +/// +template<class InputIterator1, class InputIterator2> +inline bool includes(InputIterator1 first1, + InputIterator1 last1, + InputIterator2 first2, + InputIterator2 last2, + command_queue &queue = system::default_queue()) +{ + size_t tile_size = 1024; + + size_t count1 = detail::iterator_range_size(first1, last1); + size_t count2 = detail::iterator_range_size(first2, last2); + + vector<uint_> tile_a((count1+count2+tile_size-1)/tile_size+1, queue.get_context()); + vector<uint_> tile_b((count1+count2+tile_size-1)/tile_size+1, queue.get_context()); + + // Tile the sets + detail::balanced_path_kernel tiling_kernel; + tiling_kernel.tile_size = static_cast<unsigned int>(tile_size); + tiling_kernel.set_range(first1, last1, first2, last2, + tile_a.begin()+1, tile_b.begin()+1); + fill_n(tile_a.begin(), 1, uint_(0), queue); + fill_n(tile_b.begin(), 1, uint_(0), queue); + tiling_kernel.exec(queue); + + fill_n(tile_a.end()-1, 1, static_cast<uint_>(count1), queue); + fill_n(tile_b.end()-1, 1, static_cast<uint_>(count2), queue); + + vector<uint_> result((count1+count2+tile_size-1)/tile_size, queue.get_context()); + + // Find individually + detail::serial_includes_kernel includes_kernel; + includes_kernel.set_range(first1, first2, tile_a.begin(), tile_a.end(), + tile_b.begin(), result.begin()); + + includes_kernel.exec(queue); + + return find(result.begin(), result.end(), 0, queue) == result.end(); +} + +} //end compute namespace +} //end boost namespace + +#endif // BOOST_COMPUTE_ALGORITHM_SET_UNION_HPP diff --git a/boost/compute/algorithm/inclusive_scan.hpp b/boost/compute/algorithm/inclusive_scan.hpp new file mode 100644 index 0000000000..9f98beaf7c --- /dev/null +++ 
b/boost/compute/algorithm/inclusive_scan.hpp @@ -0,0 +1,81 @@ +//---------------------------------------------------------------------------// +// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com> +// +// Distributed under the Boost Software License, Version 1.0 +// See accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt +// +// See http://boostorg.github.com/compute for more information. +//---------------------------------------------------------------------------// + +#ifndef BOOST_COMPUTE_ALGORITHM_INCLUSIVE_SCAN_HPP +#define BOOST_COMPUTE_ALGORITHM_INCLUSIVE_SCAN_HPP + +#include <boost/compute/functional.hpp> +#include <boost/compute/system.hpp> +#include <boost/compute/command_queue.hpp> +#include <boost/compute/algorithm/detail/scan.hpp> + +namespace boost { +namespace compute { + +/// Performs an inclusive scan of the elements in the range [\p first, \p last) +/// and stores the results in the range beginning at \p result. +/// +/// Each element in the output is assigned to the sum of the current value in +/// the input with the sum of every previous value in the input. +/// +/// \param first first element in the range to scan +/// \param last last element in the range to scan +/// \param result first element in the result range +/// \param binary_op associative binary operator +/// \param queue command queue to perform the operation +/// +/// \return \c OutputIterator to the end of the result range +/// +/// The default operation is to add the elements up. +/// +/// \snippet test/test_scan.cpp inclusive_scan_int +/// +/// But different associative operation can be specified as \p binary_op +/// instead (e.g., multiplication, maximum, minimum). 
+/// +/// \snippet test/test_scan.cpp inclusive_scan_int_multiplies +/// +/// \see exclusive_scan() +template<class InputIterator, class OutputIterator, class BinaryOperator> +inline OutputIterator +inclusive_scan(InputIterator first, + InputIterator last, + OutputIterator result, + BinaryOperator binary_op, + command_queue &queue = system::default_queue()) +{ + typedef typename + std::iterator_traits<OutputIterator>::value_type output_type; + + return detail::scan(first, last, result, false, + output_type(0), binary_op, + queue); +} + +/// \overload +template<class InputIterator, class OutputIterator> +inline OutputIterator +inclusive_scan(InputIterator first, + InputIterator last, + OutputIterator result, + command_queue &queue = system::default_queue()) +{ + typedef typename + std::iterator_traits<OutputIterator>::value_type output_type; + + return detail::scan(first, last, result, false, + output_type(0), boost::compute::plus<output_type>(), + queue); +} + +} // end compute namespace +} // end boost namespace + +#endif // BOOST_COMPUTE_ALGORITHM_INCLUSIVE_SCAN_HPP diff --git a/boost/compute/algorithm/inner_product.hpp b/boost/compute/algorithm/inner_product.hpp new file mode 100644 index 0000000000..614611f91e --- /dev/null +++ b/boost/compute/algorithm/inner_product.hpp @@ -0,0 +1,93 @@ +//---------------------------------------------------------------------------// +// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com> +// +// Distributed under the Boost Software License, Version 1.0 +// See accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt +// +// See http://boostorg.github.com/compute for more information. 
+//---------------------------------------------------------------------------// + +#ifndef BOOST_COMPUTE_ALGORITHM_INNER_PRODUCT_HPP +#define BOOST_COMPUTE_ALGORITHM_INNER_PRODUCT_HPP + +#include <boost/compute/system.hpp> +#include <boost/compute/functional.hpp> +#include <boost/compute/command_queue.hpp> +#include <boost/compute/algorithm/accumulate.hpp> +#include <boost/compute/container/vector.hpp> +#include <boost/compute/iterator/transform_iterator.hpp> +#include <boost/compute/iterator/zip_iterator.hpp> +#include <boost/compute/functional/detail/unpack.hpp> + +namespace boost { +namespace compute { + +/// Returns the inner product of the elements in the range +/// [\p first1, \p last1) with the elements in the range beginning +/// at \p first2. +template<class InputIterator1, class InputIterator2, class T> +inline T inner_product(InputIterator1 first1, + InputIterator1 last1, + InputIterator2 first2, + T init, + command_queue &queue = system::default_queue()) +{ + typedef typename std::iterator_traits<InputIterator1>::value_type input_type; + + ptrdiff_t n = std::distance(first1, last1); + + return ::boost::compute::accumulate( + ::boost::compute::make_transform_iterator( + ::boost::compute::make_zip_iterator( + boost::make_tuple(first1, first2) + ), + detail::unpack(multiplies<input_type>()) + ), + ::boost::compute::make_transform_iterator( + ::boost::compute::make_zip_iterator( + boost::make_tuple(last1, first2 + n) + ), + detail::unpack(multiplies<input_type>()) + ), + init, + queue + ); +} + +/// \overload +template<class InputIterator1, + class InputIterator2, + class T, + class BinaryAccumulateFunction, + class BinaryTransformFunction> +inline T inner_product(InputIterator1 first1, + InputIterator1 last1, + InputIterator2 first2, + T init, + BinaryAccumulateFunction accumulate_function, + BinaryTransformFunction transform_function, + command_queue &queue = system::default_queue()) +{ + typedef typename std::iterator_traits<InputIterator1>::value_type 
value_type; + + size_t count = detail::iterator_range_size(first1, last1); + vector<value_type> result(count, queue.get_context()); + transform(first1, + last1, + first2, + result.begin(), + transform_function, + queue); + + return ::boost::compute::accumulate(result.begin(), + result.end(), + init, + accumulate_function, + queue); +} + +} // end compute namespace +} // end boost namespace + +#endif // BOOST_COMPUTE_ALGORITHM_INNER_PRODUCT_HPP diff --git a/boost/compute/algorithm/inplace_merge.hpp b/boost/compute/algorithm/inplace_merge.hpp new file mode 100644 index 0000000000..3080950df5 --- /dev/null +++ b/boost/compute/algorithm/inplace_merge.hpp @@ -0,0 +1,60 @@ +//---------------------------------------------------------------------------// +// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com> +// +// Distributed under the Boost Software License, Version 1.0 +// See accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt +// +// See http://boostorg.github.com/compute for more information. +//---------------------------------------------------------------------------// + +#ifndef BOOST_COMPUTE_ALGORITHM_INPLACE_MERGE_HPP +#define BOOST_COMPUTE_ALGORITHM_INPLACE_MERGE_HPP + +#include <iterator> + +#include <boost/compute/system.hpp> +#include <boost/compute/command_queue.hpp> +#include <boost/compute/algorithm/merge.hpp> +#include <boost/compute/container/vector.hpp> + +namespace boost { +namespace compute { + +/// Merges the sorted values in the range [\p first, \p middle) with +/// the sorted values in the range [\p middle, \p last) in-place. 
+template<class Iterator> +inline void inplace_merge(Iterator first, + Iterator middle, + Iterator last, + command_queue &queue = system::default_queue()) +{ + BOOST_ASSERT(first < middle && middle < last); + + typedef typename std::iterator_traits<Iterator>::value_type T; + + const context &context = queue.get_context(); + + ptrdiff_t left_size = std::distance(first, middle); + ptrdiff_t right_size = std::distance(middle, last); + + vector<T> left(left_size, context); + vector<T> right(right_size, context); + + copy(first, middle, left.begin(), queue); + copy(middle, last, right.begin(), queue); + + ::boost::compute::merge( + left.begin(), + left.end(), + right.begin(), + right.end(), + first, + queue + ); +} + +} // end compute namespace +} // end boost namespace + +#endif // BOOST_COMPUTE_ALGORITHM_INPLACE_MERGE_HPP diff --git a/boost/compute/algorithm/iota.hpp b/boost/compute/algorithm/iota.hpp new file mode 100644 index 0000000000..084c3d8d97 --- /dev/null +++ b/boost/compute/algorithm/iota.hpp @@ -0,0 +1,48 @@ +//---------------------------------------------------------------------------// +// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com> +// +// Distributed under the Boost Software License, Version 1.0 +// See accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt +// +// See http://boostorg.github.com/compute for more information. +//---------------------------------------------------------------------------// + +#ifndef BOOST_COMPUTE_ALGORITHM_IOTA_HPP +#define BOOST_COMPUTE_ALGORITHM_IOTA_HPP + +#include <boost/compute/system.hpp> +#include <boost/compute/command_queue.hpp> +#include <boost/compute/algorithm/copy.hpp> +#include <boost/compute/iterator/counting_iterator.hpp> + +namespace boost { +namespace compute { + +/// Fills the range [\p first, \p last) with sequential values starting at +/// \p value. 
+/// +/// For example, the following code: +/// \snippet test/test_iota.cpp iota +/// +/// Will fill \c vec with the values (\c 0, \c 1, \c 2, \c ...). +template<class BufferIterator, class T> +inline void iota(BufferIterator first, + BufferIterator last, + const T &value, + command_queue &queue = system::default_queue()) +{ + T count = static_cast<T>(detail::iterator_range_size(first, last)); + + copy( + ::boost::compute::make_counting_iterator(value), + ::boost::compute::make_counting_iterator(value + count), + first, + queue + ); +} + +} // end compute namespace +} // end boost namespace + +#endif // BOOST_COMPUTE_ALGORITHM_IOTA_HPP diff --git a/boost/compute/algorithm/is_partitioned.hpp b/boost/compute/algorithm/is_partitioned.hpp new file mode 100644 index 0000000000..3916825057 --- /dev/null +++ b/boost/compute/algorithm/is_partitioned.hpp @@ -0,0 +1,43 @@ +//---------------------------------------------------------------------------// +// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com> +// +// Distributed under the Boost Software License, Version 1.0 +// See accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt +// +// See http://boostorg.github.com/compute for more information. +//---------------------------------------------------------------------------// + +#ifndef BOOST_COMPUTE_ALGORITHM_IS_PARTITIONED_HPP +#define BOOST_COMPUTE_ALGORITHM_IS_PARTITIONED_HPP + +#include <boost/compute/system.hpp> +#include <boost/compute/command_queue.hpp> +#include <boost/compute/algorithm/find_if.hpp> +#include <boost/compute/algorithm/find_if_not.hpp> + +namespace boost { +namespace compute { + +/// Returns \c true if the values in the range [\p first, \p last) +/// are partitioned according to \p predicate. 
+template<class InputIterator, class UnaryPredicate> +inline bool is_partitioned(InputIterator first, + InputIterator last, + UnaryPredicate predicate, + command_queue &queue = system::default_queue()) +{ + return ::boost::compute::find_if( + ::boost::compute::find_if_not(first, + last, + predicate, + queue), + last, + predicate, + queue) == last; +} + +} // end compute namespace +} // end boost namespace + +#endif // BOOST_COMPUTE_ALGORITHM_PARTITION_HPP diff --git a/boost/compute/algorithm/is_permutation.hpp b/boost/compute/algorithm/is_permutation.hpp new file mode 100644 index 0000000000..1e502efb37 --- /dev/null +++ b/boost/compute/algorithm/is_permutation.hpp @@ -0,0 +1,67 @@ +//---------------------------------------------------------------------------// +// Copyright (c) 2014 Roshan <thisisroshansmail@gmail.com> +// +// Distributed under the Boost Software License, Version 1.0 +// See accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt +// +// See http://boostorg.github.com/compute for more information. +//---------------------------------------------------------------------------// + +#ifndef BOOST_COMPUTE_ALGORITHM_IS_PERMUTATION_HPP +#define BOOST_COMPUTE_ALGORITHM_IS_PERMUTATION_HPP + +#include <iterator> + +#include <boost/compute/system.hpp> +#include <boost/compute/command_queue.hpp> +#include <boost/compute/container/vector.hpp> +#include <boost/compute/detail/iterator_range_size.hpp> +#include <boost/compute/algorithm/equal.hpp> +#include <boost/compute/algorithm/sort.hpp> + +namespace boost { +namespace compute { + +/// +/// \brief Permutation checking algorithm +/// +/// Checks if the range [first1, last1) can be permuted into the +/// range [first2, last2) +/// \return True, if it can be permuted. False, otherwise. 
+/// +/// \param first1 Iterator pointing to start of first range +/// \param last1 Iterator pointing to end of first range +/// \param first2 Iterator pointing to start of second range +/// \param last2 Iterator pointing to end of second range +/// \param queue Queue on which to execute +/// +template<class InputIterator1, class InputIterator2> +inline bool is_permutation(InputIterator1 first1, + InputIterator1 last1, + InputIterator2 first2, + InputIterator2 last2, + command_queue &queue = system::default_queue()) +{ + typedef typename std::iterator_traits<InputIterator1>::value_type value_type1; + typedef typename std::iterator_traits<InputIterator2>::value_type value_type2; + + size_t count1 = detail::iterator_range_size(first1, last1); + size_t count2 = detail::iterator_range_size(first2, last2); + + if(count1 != count2) return false; + + vector<value_type1> temp1(first1, last1, queue); + vector<value_type2> temp2(first2, last2, queue); + + sort(temp1.begin(), temp1.end(), queue); + sort(temp2.begin(), temp2.end(), queue); + + return equal(temp1.begin(), temp1.end(), + temp2.begin(), queue); +} + +} // end compute namespace +} // end boost namespace + +#endif // BOOST_COMPUTE_ALGORITHM_IS_PERMUTATION_HPP diff --git a/boost/compute/algorithm/is_sorted.hpp b/boost/compute/algorithm/is_sorted.hpp new file mode 100644 index 0000000000..a605159ac3 --- /dev/null +++ b/boost/compute/algorithm/is_sorted.hpp @@ -0,0 +1,64 @@ +//---------------------------------------------------------------------------// +// Copyright (c) 2013-2014 Kyle Lutz <kyle.r.lutz@gmail.com> +// +// Distributed under the Boost Software License, Version 1.0 +// See accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt +// +// See http://boostorg.github.com/compute for more information. 
+//---------------------------------------------------------------------------// + +#ifndef BOOST_COMPUTE_ALGORITHM_IS_SORTED_HPP +#define BOOST_COMPUTE_ALGORITHM_IS_SORTED_HPP + +#include <boost/compute/command_queue.hpp> +#include <boost/compute/system.hpp> +#include <boost/compute/functional/bind.hpp> +#include <boost/compute/functional/operator.hpp> +#include <boost/compute/algorithm/adjacent_find.hpp> + +namespace boost { +namespace compute { + +/// Returns \c true if the values in the range [\p first, \p last) +/// are in sorted order. +/// +/// \param first first element in the range to check +/// \param last last element in the range to check +/// \param compare comparison function (by default \c less) +/// \param queue command queue to perform the operation +/// +/// \return \c true if the range [\p first, \p last) is sorted +/// +/// \see sort() +template<class InputIterator, class Compare> +inline bool is_sorted(InputIterator first, + InputIterator last, + Compare compare, + command_queue &queue = system::default_queue()) +{ + using ::boost::compute::placeholders::_1; + using ::boost::compute::placeholders::_2; + + return ::boost::compute::adjacent_find( + first, last, ::boost::compute::bind(compare, _2, _1), queue + ) == last; +} + +/// \overload +template<class InputIterator> +inline bool is_sorted(InputIterator first, + InputIterator last, + command_queue &queue = system::default_queue()) +{ + typedef typename std::iterator_traits<InputIterator>::value_type value_type; + + return ::boost::compute::is_sorted( + first, last, ::boost::compute::less<value_type>(), queue + ); +} + +} // end compute namespace +} // end boost namespace + +#endif // BOOST_COMPUTE_ALGORITHM_IS_SORTED_HPP diff --git a/boost/compute/algorithm/lexicographical_compare.hpp b/boost/compute/algorithm/lexicographical_compare.hpp new file mode 100644 index 0000000000..c4f7120807 --- /dev/null +++ b/boost/compute/algorithm/lexicographical_compare.hpp @@ -0,0 +1,117 @@ 
+//---------------------------------------------------------------------------// +// Copyright (c) 2014 Mageswaran.D <mageswaran1989@gmail.com> +// +// Distributed under the Boost Software License, Version 1.0 +// See accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt +// +// See http://boostorg.github.com/compute for more information. +//---------------------------------------------------------------------------// + +#include <boost/compute/system.hpp> +#include <boost/compute/context.hpp> +#include <boost/compute/command_queue.hpp> +#include <boost/compute/algorithm/any_of.hpp> +#include <boost/compute/container/vector.hpp> +#include <boost/compute/utility/program_cache.hpp> + +namespace boost { +namespace compute { + +namespace detail { + +const char lexicographical_compare_source[] = +"__kernel void lexicographical_compare(const uint size1,\n" +" const uint size2,\n" +" __global const T1 *range1,\n" +" __global const T2 *range2,\n" +" __global bool *result_buf)\n" +"{\n" +" const uint i = get_global_id(0);\n" +" if((i != size1) && (i != size2)){\n" + //Individual elements are compared and results are stored in parallel. 
+ //0 is true +" if(range1[i] < range2[i])\n" +" result_buf[i] = 0;\n" +" else\n" +" result_buf[i] = 1;\n" +" }\n" +" else\n" +" result_buf[i] = !((i == size1) && (i != size2));\n" +"}\n"; + +template<class InputIterator1, class InputIterator2> +inline bool dispatch_lexicographical_compare(InputIterator1 first1, + InputIterator1 last1, + InputIterator2 first2, + InputIterator2 last2, + command_queue &queue) +{ + const boost::compute::context &context = queue.get_context(); + + boost::shared_ptr<program_cache> cache = + program_cache::get_global_cache(context); + + size_t iterator_size1 = iterator_range_size(first1, last1); + size_t iterator_size2 = iterator_range_size(first2, last2); + size_t max_size = (std::max)(iterator_size1, iterator_size2); + + if(max_size == 0){ + return false; + } + + boost::compute::vector<bool> result_vector(max_size, context); + + + typedef typename std::iterator_traits<InputIterator1>::value_type value_type1; + typedef typename std::iterator_traits<InputIterator2>::value_type value_type2; + + // load (or create) lexicographical compare program + std::string cache_key = + std::string("__boost_lexicographical_compare") + + type_name<value_type1>() + type_name<value_type2>(); + + std::stringstream options; + options << " -DT1=" << type_name<value_type1>(); + options << " -DT2=" << type_name<value_type2>(); + + program lexicographical_compare_program = cache->get_or_build( + cache_key, options.str(), lexicographical_compare_source, context + ); + + kernel lexicographical_compare_kernel(lexicographical_compare_program, + "lexicographical_compare"); + + lexicographical_compare_kernel.set_arg<uint_>(0, iterator_size1); + lexicographical_compare_kernel.set_arg<uint_>(1, iterator_size2); + lexicographical_compare_kernel.set_arg(2, first1.get_buffer()); + lexicographical_compare_kernel.set_arg(3, first2.get_buffer()); + lexicographical_compare_kernel.set_arg(4, result_vector.get_buffer()); + + 
queue.enqueue_1d_range_kernel(lexicographical_compare_kernel, + 0, + max_size, + 0); + + return boost::compute::any_of(result_vector.begin(), + result_vector.end(), + _1 == 0, + queue); +} + +} // end detail namespace + +/// Checks if the first range [first1, last1) is lexicographically +/// less than the second range [first2, last2). +template<class InputIterator1, class InputIterator2> +inline bool lexicographical_compare(InputIterator1 first1, + InputIterator1 last1, + InputIterator2 first2, + InputIterator2 last2, + command_queue &queue = system::default_queue()) +{ + return detail::dispatch_lexicographical_compare(first1, last1, first2, last2, queue); +} + +} // end compute namespace +} // end boost namespac diff --git a/boost/compute/algorithm/lower_bound.hpp b/boost/compute/algorithm/lower_bound.hpp new file mode 100644 index 0000000000..b2011c66ef --- /dev/null +++ b/boost/compute/algorithm/lower_bound.hpp @@ -0,0 +1,44 @@ +//---------------------------------------------------------------------------// +// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com> +// +// Distributed under the Boost Software License, Version 1.0 +// See accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt +// +// See http://boostorg.github.com/compute for more information. +//---------------------------------------------------------------------------// + +#ifndef BOOST_COMPUTE_ALGORITHM_LOWER_BOUND_HPP +#define BOOST_COMPUTE_ALGORITHM_LOWER_BOUND_HPP + +#include <boost/compute/lambda.hpp> +#include <boost/compute/system.hpp> +#include <boost/compute/command_queue.hpp> +#include <boost/compute/algorithm/detail/binary_find.hpp> + +namespace boost { +namespace compute { + +/// Returns an iterator pointing to the first element in the sorted +/// range [\p first, \p last) that is not less than \p value. 
+/// +/// \see upper_bound() +template<class InputIterator, class T> +inline InputIterator +lower_bound(InputIterator first, + InputIterator last, + const T &value, + command_queue &queue = system::default_queue()) +{ + using ::boost::compute::_1; + + InputIterator position = + detail::binary_find(first, last, _1 >= value, queue); + + return position; +} + +} // end compute namespace +} // end boost namespace + +#endif // BOOST_COMPUTE_ALGORITHM_LOWER_BOUND_HPP diff --git a/boost/compute/algorithm/max_element.hpp b/boost/compute/algorithm/max_element.hpp new file mode 100644 index 0000000000..55f2f7ffbf --- /dev/null +++ b/boost/compute/algorithm/max_element.hpp @@ -0,0 +1,74 @@ +//---------------------------------------------------------------------------// +// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com> +// +// Distributed under the Boost Software License, Version 1.0 +// See accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt +// +// See http://boostorg.github.com/compute for more information. +//---------------------------------------------------------------------------// + +#ifndef BOOST_COMPUTE_ALGORITHM_MAX_ELEMENT_HPP +#define BOOST_COMPUTE_ALGORITHM_MAX_ELEMENT_HPP + +#include <boost/compute/system.hpp> +#include <boost/compute/command_queue.hpp> +#include <boost/compute/functional.hpp> +#include <boost/compute/algorithm/detail/find_extrema.hpp> + +namespace boost { +namespace compute { + +/// Returns an iterator pointing to the element in the range +/// [\p first, \p last) with the maximum value. +/// +/// \param first first element in the input range +/// \param last last element in the input range +/// \param compare comparison function object which returns true if the first +/// argument is less than (i.e. is ordered before) the second. 
+/// \param queue command queue to perform the operation +/// +/// For example, to find \c int2 value with maximum first component in given vector: +/// \code +/// // comparison function object +/// BOOST_COMPUTE_FUNCTION(bool, compare_first, (const int2_ &a, const int2_ &b), +/// { +/// return a.x < b.x; +/// }); +/// +/// // create vector +/// boost::compute::vector<uint2_> data = ... +/// +/// boost::compute::vector<uint2_>::iterator max = +/// boost::compute::max_element(data.begin(), data.end(), compare_first, queue); +/// \endcode +/// +/// \see min_element() +template<class InputIterator, class Compare> +inline InputIterator +max_element(InputIterator first, + InputIterator last, + Compare compare, + command_queue &queue = system::default_queue()) +{ + return detail::find_extrema(first, last, compare, false, queue); +} + +///\overload +template<class InputIterator> +inline InputIterator +max_element(InputIterator first, + InputIterator last, + command_queue &queue = system::default_queue()) +{ + typedef typename std::iterator_traits<InputIterator>::value_type value_type; + + return ::boost::compute::max_element( + first, last, ::boost::compute::less<value_type>(), queue + ); +} + +} // end compute namespace +} // end boost namespace + +#endif // BOOST_COMPUTE_ALGORITHM_MAX_ELEMENT_HPP diff --git a/boost/compute/algorithm/merge.hpp b/boost/compute/algorithm/merge.hpp new file mode 100644 index 0000000000..875a283044 --- /dev/null +++ b/boost/compute/algorithm/merge.hpp @@ -0,0 +1,105 @@ +//---------------------------------------------------------------------------// +// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com> +// +// Distributed under the Boost Software License, Version 1.0 +// See accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt +// +// See http://boostorg.github.com/compute for more information. 
+//---------------------------------------------------------------------------// + +#ifndef BOOST_COMPUTE_ALGORITHM_MERGE_HPP +#define BOOST_COMPUTE_ALGORITHM_MERGE_HPP + +#include <boost/compute/system.hpp> +#include <boost/compute/command_queue.hpp> +#include <boost/compute/algorithm/copy.hpp> +#include <boost/compute/algorithm/detail/merge_with_merge_path.hpp> +#include <boost/compute/algorithm/detail/serial_merge.hpp> +#include <boost/compute/detail/iterator_range_size.hpp> +#include <boost/compute/detail/parameter_cache.hpp> + +namespace boost { +namespace compute { + +/// Merges the sorted values in the range [\p first1, \p last1) with the sorted +/// values in the range [\p first2, last2) and stores the result in the range +/// beginning at \p result. Values are compared using the \p comp function. If +/// no comparision function is given, \c less is used. +/// +/// \param first1 first element in the first range to merge +/// \param last1 last element in the first range to merge +/// \param first2 first element in the second range to merge +/// \param last2 last element in the second range to merge +/// \param result first element in the result range +/// \param comp comparison function (by default \c less) +/// \param queue command queue to perform the operation +/// +/// \return \c OutputIterator to the end of the result range +/// +/// \see inplace_merge() +template<class InputIterator1, + class InputIterator2, + class OutputIterator, + class Compare> +inline OutputIterator merge(InputIterator1 first1, + InputIterator1 last1, + InputIterator2 first2, + InputIterator2 last2, + OutputIterator result, + Compare comp, + command_queue &queue = system::default_queue()) +{ + typedef typename std::iterator_traits<InputIterator1>::value_type input1_type; + typedef typename std::iterator_traits<InputIterator2>::value_type input2_type; + typedef typename std::iterator_traits<OutputIterator>::value_type output_type; + + const device &device = queue.get_device(); + + 
std::string cache_key = + std::string("__boost_merge_") + type_name<input1_type>() + "_" + + type_name<input2_type>() + "_" + type_name<output_type>(); + boost::shared_ptr<detail::parameter_cache> parameters = + detail::parameter_cache::get_global_cache(device); + + // default serial merge threshold depends on device type + size_t default_serial_merge_threshold = 32768; + if(device.type() & device::gpu) { + default_serial_merge_threshold = 2048; + } + + // loading serial merge threshold parameter + const size_t serial_merge_threshold = + parameters->get(cache_key, "serial_merge_threshold", + static_cast<uint_>(default_serial_merge_threshold)); + + // choosing merge algorithm + const size_t total_count = + detail::iterator_range_size(first1, last1) + + detail::iterator_range_size(first2, last2); + // for small inputs serial merge turns out to outperform + // merge with merge path algorithm + if(total_count <= serial_merge_threshold){ + return detail::serial_merge(first1, last1, first2, last2, result, comp, queue); + } + return detail::merge_with_merge_path(first1, last1, first2, last2, result, comp, queue); +} + +/// \overload +template<class InputIterator1, class InputIterator2, class OutputIterator> +inline OutputIterator merge(InputIterator1 first1, + InputIterator1 last1, + InputIterator2 first2, + InputIterator2 last2, + OutputIterator result, + command_queue &queue = system::default_queue()) +{ + typedef typename std::iterator_traits<InputIterator1>::value_type value_type; + less<value_type> less_than; + return merge(first1, last1, first2, last2, result, less_than, queue); +} + +} // end compute namespace +} // end boost namespace + +#endif // BOOST_COMPUTE_ALGORITHM_MERGE_HPP diff --git a/boost/compute/algorithm/min_element.hpp b/boost/compute/algorithm/min_element.hpp new file mode 100644 index 0000000000..62744efb98 --- /dev/null +++ b/boost/compute/algorithm/min_element.hpp @@ -0,0 +1,74 @@ 
+//---------------------------------------------------------------------------// +// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com> +// +// Distributed under the Boost Software License, Version 1.0 +// See accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt +// +// See http://boostorg.github.com/compute for more information. +//---------------------------------------------------------------------------// + +#ifndef BOOST_COMPUTE_ALGORITHM_MIN_ELEMENT_HPP +#define BOOST_COMPUTE_ALGORITHM_MIN_ELEMENT_HPP + +#include <boost/compute/system.hpp> +#include <boost/compute/command_queue.hpp> +#include <boost/compute/functional.hpp> +#include <boost/compute/algorithm/detail/find_extrema.hpp> + +namespace boost { +namespace compute { + +/// Returns an iterator pointing to the element in range +/// [\p first, \p last) with the minimum value. +/// +/// \param first first element in the input range +/// \param last last element in the input range +/// \param compare comparison function object which returns true if the first +/// argument is less than (i.e. is ordered before) the second. +/// \param queue command queue to perform the operation +/// +/// For example, to find \c int2 value with minimum first component in given vector: +/// \code +/// // comparison function object +/// BOOST_COMPUTE_FUNCTION(bool, compare_first, (const int2_ &a, const int2_ &b), +/// { +/// return a.x < b.x; +/// }); +/// +/// // create vector +/// boost::compute::vector<uint2_> data = ... 
+/// +/// boost::compute::vector<uint2_>::iterator min = +/// boost::compute::min_element(data.begin(), data.end(), compare_first, queue); +/// \endcode +/// +/// \see max_element() +template<class InputIterator, class Compare> +inline InputIterator +min_element(InputIterator first, + InputIterator last, + Compare compare, + command_queue &queue = system::default_queue()) +{ + return detail::find_extrema(first, last, compare, true, queue); +} + +///\overload +template<class InputIterator> +inline InputIterator +min_element(InputIterator first, + InputIterator last, + command_queue &queue = system::default_queue()) +{ + typedef typename std::iterator_traits<InputIterator>::value_type value_type; + + return ::boost::compute::min_element( + first, last, ::boost::compute::less<value_type>(), queue + ); +} + +} // end compute namespace +} // end boost namespace + +#endif // BOOST_COMPUTE_ALGORITHM_MIN_ELEMENT_HPP diff --git a/boost/compute/algorithm/minmax_element.hpp b/boost/compute/algorithm/minmax_element.hpp new file mode 100644 index 0000000000..bf32c3c989 --- /dev/null +++ b/boost/compute/algorithm/minmax_element.hpp @@ -0,0 +1,70 @@ +//---------------------------------------------------------------------------// +// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com> +// +// Distributed under the Boost Software License, Version 1.0 +// See accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt +// +// See http://boostorg.github.com/compute for more information. 
+//---------------------------------------------------------------------------// + +#ifndef BOOST_COMPUTE_ALGORITHM_MINMAX_ELEMENT_HPP +#define BOOST_COMPUTE_ALGORITHM_MINMAX_ELEMENT_HPP + +#include <utility> + +#include <boost/compute/system.hpp> +#include <boost/compute/command_queue.hpp> +#include <boost/compute/algorithm/max_element.hpp> +#include <boost/compute/algorithm/min_element.hpp> + +namespace boost { +namespace compute { + +/// Returns a pair of iterators with the first pointing to the minimum +/// element and the second pointing to the maximum element in the range +/// [\p first, \p last). +/// +/// \param first first element in the input range +/// \param last last element in the input range +/// \param compare comparison function object which returns true if the first +/// argument is less than (i.e. is ordered before) the second. +/// \param queue command queue to perform the operation +/// +/// \see max_element(), min_element() +template<class InputIterator, class Compare> +inline std::pair<InputIterator, InputIterator> +minmax_element(InputIterator first, + InputIterator last, + Compare compare, + command_queue &queue = system::default_queue()) +{ + if(first == last){ + // empty range + return std::make_pair(first, first); + } + + return std::make_pair(min_element(first, last, compare, queue), + max_element(first, last, compare, queue)); +} + +///\overload +template<class InputIterator, class Compare> +inline std::pair<InputIterator, InputIterator> +minmax_element(InputIterator first, + InputIterator last, + command_queue &queue = system::default_queue()) +{ + if(first == last){ + // empty range + return std::make_pair(first, first); + } + + return std::make_pair(min_element(first, last, queue), + max_element(first, last, queue)); +} + +} // end compute namespace +} // end boost namespace + +#endif // BOOST_COMPUTE_ALGORITHM_MINMAX_ELEMENT_HPP diff --git a/boost/compute/algorithm/mismatch.hpp b/boost/compute/algorithm/mismatch.hpp new file mode 
100644 index 0000000000..e7db883004 --- /dev/null +++ b/boost/compute/algorithm/mismatch.hpp @@ -0,0 +1,89 @@ +//---------------------------------------------------------------------------// +// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com> +// +// Distributed under the Boost Software License, Version 1.0 +// See accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt +// +// See http://boostorg.github.com/compute for more information. +//---------------------------------------------------------------------------// + +#ifndef BOOST_COMPUTE_ALGORITHM_MISMATCH_HPP +#define BOOST_COMPUTE_ALGORITHM_MISMATCH_HPP + +#include <iterator> +#include <utility> + +#include <boost/compute/system.hpp> +#include <boost/compute/functional.hpp> +#include <boost/compute/command_queue.hpp> +#include <boost/compute/algorithm/find.hpp> +#include <boost/compute/iterator/transform_iterator.hpp> +#include <boost/compute/iterator/zip_iterator.hpp> +#include <boost/compute/functional/detail/unpack.hpp> + +namespace boost { +namespace compute { + +/// Returns a pair of iterators pointing to the first position where the +/// range [\p first1, \p last1) and the range starting at \p first2 +/// differ. 
+template<class InputIterator1, class InputIterator2> +inline std::pair<InputIterator1, InputIterator2> +mismatch(InputIterator1 first1, + InputIterator1 last1, + InputIterator2 first2, + command_queue &queue = system::default_queue()) +{ + typedef typename std::iterator_traits<InputIterator1>::value_type value_type; + + ::boost::compute::equal_to<value_type> op; + + InputIterator2 last2 = first2 + std::distance(first1, last1); + + InputIterator1 iter = + boost::get<0>( + ::boost::compute::find( + ::boost::compute::make_transform_iterator( + ::boost::compute::make_zip_iterator( + boost::make_tuple(first1, first2) + ), + detail::unpack(op) + ), + ::boost::compute::make_transform_iterator( + ::boost::compute::make_zip_iterator( + boost::make_tuple(last1, last2) + ), + detail::unpack(op) + ), + false, + queue + ).base().get_iterator_tuple() + ); + + return std::make_pair(iter, first2 + std::distance(first1, iter)); +} + +/// \overload +template<class InputIterator1, class InputIterator2> +inline std::pair<InputIterator1, InputIterator2> +mismatch(InputIterator1 first1, + InputIterator1 last1, + InputIterator2 first2, + InputIterator2 last2, + command_queue &queue = system::default_queue()) +{ + if(std::distance(first1, last1) < std::distance(first2, last2)){ + return ::boost::compute::mismatch(first1, last1, first2, queue); + } + else { + return ::boost::compute::mismatch( + first1, first1 + std::distance(first2, last2), first2, queue + ); + } +} + +} // end compute namespace +} // end boost namespace + +#endif // BOOST_COMPUTE_ALGORITHM_MISMATCH_HPP diff --git a/boost/compute/algorithm/next_permutation.hpp b/boost/compute/algorithm/next_permutation.hpp new file mode 100644 index 0000000000..e81fbd2ee8 --- /dev/null +++ b/boost/compute/algorithm/next_permutation.hpp @@ -0,0 +1,170 @@ +//---------------------------------------------------------------------------// +// Copyright (c) 2014 Roshan <thisisroshansmail@gmail.com> +// +// Distributed under the Boost Software 
License, Version 1.0 +// See accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt +// +// See http://boostorg.github.com/compute for more information. +//---------------------------------------------------------------------------// + +#ifndef BOOST_COMPUTE_ALGORITHM_NEXT_PERMUTATION_HPP +#define BOOST_COMPUTE_ALGORITHM_NEXT_PERMUTATION_HPP + +#include <iterator> + +#include <boost/compute/system.hpp> +#include <boost/compute/command_queue.hpp> +#include <boost/compute/container/detail/scalar.hpp> +#include <boost/compute/algorithm/reverse.hpp> + +namespace boost { +namespace compute { +namespace detail { + +/// +/// \brief Helper function for next_permutation +/// +/// To find rightmost element which is smaller +/// than its next element +/// +template<class InputIterator> +inline InputIterator next_permutation_helper(InputIterator first, + InputIterator last, + command_queue &queue) +{ + typedef typename std::iterator_traits<InputIterator>::value_type value_type; + + size_t count = detail::iterator_range_size(first, last); + if(count == 0 || count == 1){ + return last; + } + count = count - 1; + const context &context = queue.get_context(); + + detail::meta_kernel k("next_permutation"); + size_t index_arg = k.add_arg<int *>(memory_object::global_memory, "index"); + atomic_max<int_> atomic_max_int; + + k << k.decl<const int_>("i") << " = get_global_id(0);\n" + << k.decl<const value_type>("cur_value") << "=" + << first[k.var<const int_>("i")] << ";\n" + << k.decl<const value_type>("next_value") << "=" + << first[k.expr<const int_>("i+1")] << ";\n" + << "if(cur_value < next_value){\n" + << " " << atomic_max_int(k.var<int_ *>("index"), k.var<int_>("i")) << ";\n" + << "}\n"; + + kernel kernel = k.compile(context); + + scalar<int_> index(context); + kernel.set_arg(index_arg, index.get_buffer()); + + index.write(static_cast<int_>(-1), queue); + + queue.enqueue_1d_range_kernel(kernel, 0, count, 0); + + int result = 
static_cast<int>(index.read(queue)); + if(result == -1) return last; + else return first + result; +} + +/// +/// \brief Helper function for next_permutation +/// +/// To find the smallest element to the right of the element found above +/// that is greater than it +/// +template<class InputIterator, class ValueType> +inline InputIterator np_ceiling(InputIterator first, + InputIterator last, + ValueType value, + command_queue &queue) +{ + typedef typename std::iterator_traits<InputIterator>::value_type value_type; + + size_t count = detail::iterator_range_size(first, last); + if(count == 0){ + return last; + } + const context &context = queue.get_context(); + + detail::meta_kernel k("np_ceiling"); + size_t index_arg = k.add_arg<int *>(memory_object::global_memory, "index"); + size_t value_arg = k.add_arg<value_type>(memory_object::private_memory, "value"); + atomic_max<int_> atomic_max_int; + + k << k.decl<const int_>("i") << " = get_global_id(0);\n" + << k.decl<const value_type>("cur_value") << "=" + << first[k.var<const int_>("i")] << ";\n" + << "if(cur_value <= " << first[k.expr<int_>("*index")] + << " && cur_value > value){\n" + << " " << atomic_max_int(k.var<int_ *>("index"), k.var<int_>("i")) << ";\n" + << "}\n"; + + kernel kernel = k.compile(context); + + scalar<int_> index(context); + kernel.set_arg(index_arg, index.get_buffer()); + + index.write(static_cast<int_>(0), queue); + + kernel.set_arg(value_arg, value); + + queue.enqueue_1d_range_kernel(kernel, 0, count, 0); + + int result = static_cast<int>(index.read(queue)); + return first + result; +} + +} // end detail namespace + +/// +/// \brief Permutation generating algorithm +/// +/// Transforms the range [first, last) into the next permutation from the +/// set of all permutations arranged in lexicographic order +/// \return Boolean value signifying if the last permutation was crossed +/// and the range was reset +/// +/// \param first Iterator pointing to start of range +/// \param last Iterator 
pointing to end of range +/// \param queue Queue on which to execute +/// +template<class InputIterator> +inline bool next_permutation(InputIterator first, + InputIterator last, + command_queue &queue = system::default_queue()) +{ + typedef typename std::iterator_traits<InputIterator>::value_type value_type; + + if(first == last) return false; + + InputIterator first_element = + detail::next_permutation_helper(first, last, queue); + + if(first_element == last) + { + reverse(first, last, queue); + return false; + } + + value_type first_value = first_element.read(queue); + + InputIterator ceiling_element = + detail::np_ceiling(first_element + 1, last, first_value, queue); + + value_type ceiling_value = ceiling_element.read(queue); + + first_element.write(ceiling_value, queue); + ceiling_element.write(first_value, queue); + + reverse(first_element + 1, last, queue); + + return true; +} + +} // end compute namespace +} // end boost namespace + +#endif // BOOST_COMPUTE_ALGORITHM_NEXT_PERMUTATION_HPP diff --git a/boost/compute/algorithm/none_of.hpp b/boost/compute/algorithm/none_of.hpp new file mode 100644 index 0000000000..c25dd12a87 --- /dev/null +++ b/boost/compute/algorithm/none_of.hpp @@ -0,0 +1,36 @@ +//---------------------------------------------------------------------------// +// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com> +// +// Distributed under the Boost Software License, Version 1.0 +// See accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt +// +// See http://boostorg.github.com/compute for more information. 
+//---------------------------------------------------------------------------// + +#ifndef BOOST_COMPUTE_ALGORITHM_NONE_OF_HPP +#define BOOST_COMPUTE_ALGORITHM_NONE_OF_HPP + +#include <boost/compute/system.hpp> +#include <boost/compute/algorithm/find_if.hpp> + +namespace boost { +namespace compute { + +/// Returns \c true if \p predicate returns \c true for none of the elements in +/// the range [\p first, \p last). +/// +/// \see all_of(), any_of() +template<class InputIterator, class UnaryPredicate> +inline bool none_of(InputIterator first, + InputIterator last, + UnaryPredicate predicate, + command_queue &queue = system::default_queue()) +{ + return ::boost::compute::find_if(first, last, predicate, queue) == last; +} + +} // end compute namespace +} // end boost namespace + +#endif // BOOST_COMPUTE_ALGORITHM_NONE_OF_HPP diff --git a/boost/compute/algorithm/nth_element.hpp b/boost/compute/algorithm/nth_element.hpp new file mode 100644 index 0000000000..68f7a3dbc0 --- /dev/null +++ b/boost/compute/algorithm/nth_element.hpp @@ -0,0 +1,87 @@ +//---------------------------------------------------------------------------// +// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com> +// +// Distributed under the Boost Software License, Version 1.0 +// See accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt +// +// See http://boostorg.github.com/compute for more information. 
+//---------------------------------------------------------------------------// + +#ifndef BOOST_COMPUTE_ALGORITHM_NTH_ELEMENT_HPP +#define BOOST_COMPUTE_ALGORITHM_NTH_ELEMENT_HPP + +#include <boost/compute/command_queue.hpp> +#include <boost/compute/algorithm/fill_n.hpp> +#include <boost/compute/algorithm/find.hpp> +#include <boost/compute/algorithm/partition.hpp> +#include <boost/compute/algorithm/sort.hpp> +#include <boost/compute/functional/bind.hpp> + +namespace boost { +namespace compute { + +/// Rearranges the elements in the range [\p first, \p last) such that +/// the \p nth element would be in that position in a sorted sequence. +template<class Iterator, class Compare> +inline void nth_element(Iterator first, + Iterator nth, + Iterator last, + Compare compare, + command_queue &queue = system::default_queue()) +{ + if(nth == last) return; + + typedef typename std::iterator_traits<Iterator>::value_type value_type; + + while(1) + { + value_type value = nth.read(queue); + + using boost::compute::placeholders::_1; + Iterator new_nth = partition( + first, last, ::boost::compute::bind(compare, _1, value), queue + ); + + Iterator old_nth = find(new_nth, last, value, queue); + + value_type new_value = new_nth.read(queue); + + fill_n(new_nth, 1, value, queue); + fill_n(old_nth, 1, new_value, queue); + + new_value = nth.read(queue); + + if(value == new_value) break; + + if(std::distance(first, nth) < std::distance(first, new_nth)) + { + last = new_nth; + } + else + { + first = new_nth; + } + } +} + +/// \overload +template<class Iterator> +inline void nth_element(Iterator first, + Iterator nth, + Iterator last, + command_queue &queue = system::default_queue()) +{ + if(nth == last) return; + + typedef typename std::iterator_traits<Iterator>::value_type value_type; + + less<value_type> less_than; + + return nth_element(first, nth, last, less_than, queue); +} + +} // end compute namespace +} // end boost namespace + +#endif // BOOST_COMPUTE_ALGORITHM_NTH_ELEMENT_HPP 
diff --git a/boost/compute/algorithm/partial_sum.hpp b/boost/compute/algorithm/partial_sum.hpp new file mode 100644 index 0000000000..d440369a5a --- /dev/null +++ b/boost/compute/algorithm/partial_sum.hpp @@ -0,0 +1,37 @@ +//---------------------------------------------------------------------------// +// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com> +// +// Distributed under the Boost Software License, Version 1.0 +// See accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt +// +// See http://boostorg.github.com/compute for more information. +//---------------------------------------------------------------------------// + +#ifndef BOOST_COMPUTE_ALGORITHM_PARTIAL_SUM_HPP +#define BOOST_COMPUTE_ALGORITHM_PARTIAL_SUM_HPP + +#include <boost/compute/system.hpp> +#include <boost/compute/command_queue.hpp> +#include <boost/compute/algorithm/inclusive_scan.hpp> + +namespace boost { +namespace compute { + +/// Calculates the cumulative sum of the elements in the range [\p first, +/// \p last) and writes the resulting values to the range beginning at +/// \p result. 
+template<class InputIterator, class OutputIterator> +inline OutputIterator +partial_sum(InputIterator first, + InputIterator last, + OutputIterator result, + command_queue &queue = system::default_queue()) +{ + return ::boost::compute::inclusive_scan(first, last, result, queue); +} + +} // end compute namespace +} // end boost namespace + +#endif // BOOST_COMPUTE_ALGORITHM_PARTIAL_SUM_HPP diff --git a/boost/compute/algorithm/partition.hpp b/boost/compute/algorithm/partition.hpp new file mode 100644 index 0000000000..7860350e0d --- /dev/null +++ b/boost/compute/algorithm/partition.hpp @@ -0,0 +1,39 @@ +//---------------------------------------------------------------------------// +// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com> +// +// Distributed under the Boost Software License, Version 1.0 +// See accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt +// +// See http://boostorg.github.com/compute for more information. +//---------------------------------------------------------------------------// + +#ifndef BOOST_COMPUTE_ALGORITHM_PARTITION_HPP +#define BOOST_COMPUTE_ALGORITHM_PARTITION_HPP + +#include <boost/compute/system.hpp> +#include <boost/compute/command_queue.hpp> +#include <boost/compute/algorithm/stable_partition.hpp> + +namespace boost { +namespace compute { + +/// +/// Partitions the elements in the range [\p first, \p last) according to +/// \p predicate. Order of the elements need not be preserved. 
+/// +/// \see is_partitioned() and stable_partition() +/// +template<class Iterator, class UnaryPredicate> +inline Iterator partition(Iterator first, + Iterator last, + UnaryPredicate predicate, + command_queue &queue = system::default_queue()) +{ + return stable_partition(first, last, predicate, queue); +} + +} // end compute namespace +} // end boost namespace + +#endif // BOOST_COMPUTE_ALGORITHM_PARTITION_HPP diff --git a/boost/compute/algorithm/partition_copy.hpp b/boost/compute/algorithm/partition_copy.hpp new file mode 100644 index 0000000000..80a2c6475f --- /dev/null +++ b/boost/compute/algorithm/partition_copy.hpp @@ -0,0 +1,63 @@ +//---------------------------------------------------------------------------// +// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com> +// +// Distributed under the Boost Software License, Version 1.0 +// See accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt +// +// See http://boostorg.github.com/compute for more information. +//---------------------------------------------------------------------------// + +#ifndef BOOST_COMPUTE_ALGORITHM_PARTITION_COPY_HPP +#define BOOST_COMPUTE_ALGORITHM_PARTITION_COPY_HPP + +#include <boost/compute/system.hpp> +#include <boost/compute/functional.hpp> +#include <boost/compute/command_queue.hpp> +#include <boost/compute/algorithm/copy_if.hpp> + +namespace boost { +namespace compute { + +/// Copies all of the elements in the range [\p first, \p last) for which +/// \p predicate returns \c true to the range beginning at \p first_true +/// and all of the elements for which \p predicate returns \c false to +/// the range beginning at \p first_false. 
+/// +/// \see partition() +template<class InputIterator, + class OutputIterator1, + class OutputIterator2, + class UnaryPredicate> +inline std::pair<OutputIterator1, OutputIterator2> +partition_copy(InputIterator first, + InputIterator last, + OutputIterator1 first_true, + OutputIterator2 first_false, + UnaryPredicate predicate, + command_queue &queue = system::default_queue()) +{ + // copy true values + OutputIterator1 last_true = + ::boost::compute::copy_if(first, + last, + first_true, + predicate, + queue); + + // copy false values + OutputIterator2 last_false = + ::boost::compute::copy_if(first, + last, + first_false, + not1(predicate), + queue); + + // return iterators to the end of the true and the false ranges + return std::make_pair(last_true, last_false); +} + +} // end compute namespace +} // end boost namespace + +#endif // BOOST_COMPUTE_ALGORITHM_PARTITION_COPY_HPP diff --git a/boost/compute/algorithm/partition_point.hpp b/boost/compute/algorithm/partition_point.hpp new file mode 100644 index 0000000000..3cc2bc0ca6 --- /dev/null +++ b/boost/compute/algorithm/partition_point.hpp @@ -0,0 +1,46 @@ +//---------------------------------------------------------------------------// +// Copyright (c) 2014 Roshan <thisisroshansmail@gmail.com> +// +// Distributed under the Boost Software License, Version 1.0 +// See accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt +// +// See http://boostorg.github.com/compute for more information. 
+//---------------------------------------------------------------------------// + +#ifndef BOOST_COMPUTE_ALGORITHM_PARTITION_POINT_HPP +#define BOOST_COMPUTE_ALGORITHM_PARTITION_POINT_HPP + +#include <boost/compute/system.hpp> +#include <boost/compute/command_queue.hpp> +#include <boost/compute/algorithm/detail/binary_find.hpp> + +namespace boost { +namespace compute { + +/// +/// \brief Partition point algorithm +/// +/// Finds the end of true values in the partitioned range [first, last) +/// \return Iterator pointing to end of true values +/// +/// \param first Iterator pointing to start of range +/// \param last Iterator pointing to end of range +/// \param predicate Unary predicate to be applied on each element +/// \param queue Queue on which to execute +/// +/// \see partition() and stable_partition() +/// +template<class InputIterator, class UnaryPredicate> +inline InputIterator partition_point(InputIterator first, + InputIterator last, + UnaryPredicate predicate, + command_queue &queue = system::default_queue()) +{ + return detail::binary_find(first, last, not1(predicate), queue); +} + +} // end compute namespace +} // end boost namespace + +#endif // BOOST_COMPUTE_ALGORITHM_PARTITION_POINT_HPP diff --git a/boost/compute/algorithm/prev_permutation.hpp b/boost/compute/algorithm/prev_permutation.hpp new file mode 100644 index 0000000000..03c01bf8f4 --- /dev/null +++ b/boost/compute/algorithm/prev_permutation.hpp @@ -0,0 +1,170 @@ +//---------------------------------------------------------------------------// +// Copyright (c) 2014 Roshan <thisisroshansmail@gmail.com> +// +// Distributed under the Boost Software License, Version 1.0 +// See accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt +// +// See http://boostorg.github.com/compute for more information. 
+//---------------------------------------------------------------------------// + +#ifndef BOOST_COMPUTE_ALGORITHM_PREV_PERMUTATION_HPP +#define BOOST_COMPUTE_ALGORITHM_PREV_PERMUTATION_HPP + +#include <iterator> + +#include <boost/compute/system.hpp> +#include <boost/compute/command_queue.hpp> +#include <boost/compute/container/detail/scalar.hpp> +#include <boost/compute/algorithm/reverse.hpp> + +namespace boost { +namespace compute { +namespace detail { + +/// +/// \brief Helper function for prev_permutation +/// +/// To find rightmost element which is greater +/// than its next element +/// +template<class InputIterator> +inline InputIterator prev_permutation_helper(InputIterator first, + InputIterator last, + command_queue &queue) +{ + typedef typename std::iterator_traits<InputIterator>::value_type value_type; + + size_t count = detail::iterator_range_size(first, last); + if(count == 0 || count == 1){ + return last; + } + count = count - 1; + const context &context = queue.get_context(); + + detail::meta_kernel k("prev_permutation"); + size_t index_arg = k.add_arg<int *>(memory_object::global_memory, "index"); + atomic_max<int_> atomic_max_int; + + k << k.decl<const int_>("i") << " = get_global_id(0);\n" + << k.decl<const value_type>("cur_value") << "=" + << first[k.var<const int_>("i")] << ";\n" + << k.decl<const value_type>("next_value") << "=" + << first[k.expr<const int_>("i+1")] << ";\n" + << "if(cur_value > next_value){\n" + << " " << atomic_max_int(k.var<int_ *>("index"), k.var<int_>("i")) << ";\n" + << "}\n"; + + kernel kernel = k.compile(context); + + scalar<int_> index(context); + kernel.set_arg(index_arg, index.get_buffer()); + + index.write(static_cast<int_>(-1), queue); + + queue.enqueue_1d_range_kernel(kernel, 0, count, 0); + + int result = static_cast<int>(index.read(queue)); + if(result == -1) return last; + else return first + result; +} + +/// +/// \brief Helper function for prev_permutation +/// +/// To find the largest element to the right 
of the element found above +/// that is smaller than it +/// +template<class InputIterator, class ValueType> +inline InputIterator pp_floor(InputIterator first, + InputIterator last, + ValueType value, + command_queue &queue) +{ + typedef typename std::iterator_traits<InputIterator>::value_type value_type; + + size_t count = detail::iterator_range_size(first, last); + if(count == 0){ + return last; + } + const context &context = queue.get_context(); + + detail::meta_kernel k("pp_floor"); + size_t index_arg = k.add_arg<int *>(memory_object::global_memory, "index"); + size_t value_arg = k.add_arg<value_type>(memory_object::private_memory, "value"); + atomic_max<int_> atomic_max_int; + + k << k.decl<const int_>("i") << " = get_global_id(0);\n" + << k.decl<const value_type>("cur_value") << "=" + << first[k.var<const int_>("i")] << ";\n" + << "if(cur_value >= " << first[k.expr<int_>("*index")] + << " && cur_value < value){\n" + << " " << atomic_max_int(k.var<int_ *>("index"), k.var<int_>("i")) << ";\n" + << "}\n"; + + kernel kernel = k.compile(context); + + scalar<int_> index(context); + kernel.set_arg(index_arg, index.get_buffer()); + + index.write(static_cast<int_>(0), queue); + + kernel.set_arg(value_arg, value); + + queue.enqueue_1d_range_kernel(kernel, 0, count, 0); + + int result = static_cast<int>(index.read(queue)); + return first + result; +} + +} // end detail namespace + +/// +/// \brief Permutation generating algorithm +/// +/// Transforms the range [first, last) into the previous permutation from +/// the set of all permutations arranged in lexicographic order +/// \return Boolean value signifying if the first permutation was crossed +/// and the range was reset +/// +/// \param first Iterator pointing to start of range +/// \param last Iterator pointing to end of range +/// \param queue Queue on which to execute +/// +template<class InputIterator> +inline bool prev_permutation(InputIterator first, + InputIterator last, + command_queue &queue = 
system::default_queue()) +{ + typedef typename std::iterator_traits<InputIterator>::value_type value_type; + + if(first == last) return false; + + InputIterator first_element = + detail::prev_permutation_helper(first, last, queue); + + if(first_element == last) + { + reverse(first, last, queue); + return false; + } + + value_type first_value = first_element.read(queue); + + InputIterator ceiling_element = + detail::pp_floor(first_element + 1, last, first_value, queue); + + value_type ceiling_value = ceiling_element.read(queue); + + first_element.write(ceiling_value, queue); + ceiling_element.write(first_value, queue); + + reverse(first_element + 1, last, queue); + + return true; +} + +} // end compute namespace +} // end boost namespace + +#endif // BOOST_COMPUTE_ALGORITHM_PREV_PERMUTATION_HPP diff --git a/boost/compute/algorithm/random_shuffle.hpp b/boost/compute/algorithm/random_shuffle.hpp new file mode 100644 index 0000000000..7d2d46a133 --- /dev/null +++ b/boost/compute/algorithm/random_shuffle.hpp @@ -0,0 +1,75 @@ +//---------------------------------------------------------------------------// +// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com> +// +// Distributed under the Boost Software License, Version 1.0 +// See accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt +// +// See http://boostorg.github.com/compute for more information. 
+//---------------------------------------------------------------------------// + +#ifndef BOOST_COMPUTE_ALGORITHM_RANDOM_SHUFFLE_HPP +#define BOOST_COMPUTE_ALGORITHM_RANDOM_SHUFFLE_HPP + +#include <vector> +#include <algorithm> + +#include <boost/range/algorithm_ext/iota.hpp> + +#include <boost/compute/system.hpp> +#include <boost/compute/functional.hpp> +#include <boost/compute/command_queue.hpp> +#include <boost/compute/container/vector.hpp> +#include <boost/compute/algorithm/scatter.hpp> +#include <boost/compute/detail/iterator_range_size.hpp> + +namespace boost { +namespace compute { + +/// Randomly shuffles the elements in the range [\p first, \p last). +/// +/// \see scatter() +template<class Iterator> +inline void random_shuffle(Iterator first, + Iterator last, + command_queue &queue = system::default_queue()) +{ + typedef typename std::iterator_traits<Iterator>::value_type value_type; + + size_t count = detail::iterator_range_size(first, last); + if(count == 0){ + return; + } + + // generate shuffled indices on the host + std::vector<cl_uint> random_indices(count); + boost::iota(random_indices, 0); + std::random_shuffle(random_indices.begin(), random_indices.end()); + + // copy random indices to the device + const context &context = queue.get_context(); + vector<cl_uint> indices(count, context); + ::boost::compute::copy(random_indices.begin(), + random_indices.end(), + indices.begin(), + queue); + + // make a copy of the values on the device + vector<value_type> tmp(count, context); + ::boost::compute::copy(first, + last, + tmp.begin(), + queue); + + // write values to their new locations + ::boost::compute::scatter(tmp.begin(), + tmp.end(), + indices.begin(), + first, + queue); +} + +} // end compute namespace +} // end boost namespace + +#endif // BOOST_COMPUTE_ALGORITHM_RANDOM_SHUFFLE_HPP diff --git a/boost/compute/algorithm/reduce.hpp b/boost/compute/algorithm/reduce.hpp new file mode 100644 index 0000000000..79624a0e50 --- /dev/null +++ 
b/boost/compute/algorithm/reduce.hpp @@ -0,0 +1,301 @@ +//---------------------------------------------------------------------------// +// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com> +// +// Distributed under the Boost Software License, Version 1.0 +// See accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt +// +// See http://boostorg.github.com/compute for more information. +//---------------------------------------------------------------------------// + +#ifndef BOOST_COMPUTE_ALGORITHM_REDUCE_HPP +#define BOOST_COMPUTE_ALGORITHM_REDUCE_HPP + +#include <iterator> + +#include <boost/compute/system.hpp> +#include <boost/compute/functional.hpp> +#include <boost/compute/detail/meta_kernel.hpp> +#include <boost/compute/command_queue.hpp> +#include <boost/compute/container/array.hpp> +#include <boost/compute/container/vector.hpp> +#include <boost/compute/algorithm/copy_n.hpp> +#include <boost/compute/algorithm/detail/inplace_reduce.hpp> +#include <boost/compute/algorithm/detail/reduce_on_gpu.hpp> +#include <boost/compute/algorithm/detail/serial_reduce.hpp> +#include <boost/compute/detail/iterator_range_size.hpp> +#include <boost/compute/memory/local_buffer.hpp> +#include <boost/compute/type_traits/result_of.hpp> + +namespace boost { +namespace compute { +namespace detail { + +template<class InputIterator, class OutputIterator, class BinaryFunction> +size_t reduce(InputIterator first, + size_t count, + OutputIterator result, + size_t block_size, + BinaryFunction function, + command_queue &queue) +{ + typedef typename + std::iterator_traits<InputIterator>::value_type + input_type; + typedef typename + boost::compute::result_of<BinaryFunction(input_type, input_type)>::type + result_type; + + const context &context = queue.get_context(); + size_t block_count = count / 2 / block_size; + size_t total_block_count = + static_cast<size_t>(std::ceil(float(count) / 2.f / float(block_size))); + + if(block_count != 0){ + meta_kernel 
k("block_reduce"); + size_t output_arg = k.add_arg<result_type *>(memory_object::global_memory, "output"); + size_t block_arg = k.add_arg<input_type *>(memory_object::local_memory, "block"); + + k << + "const uint gid = get_global_id(0);\n" << + "const uint lid = get_local_id(0);\n" << + + // copy values to local memory + "block[lid] = " << + function(first[k.make_var<uint_>("gid*2+0")], + first[k.make_var<uint_>("gid*2+1")]) << ";\n" << + + // perform reduction + "for(uint i = 1; i < " << uint_(block_size) << "; i <<= 1){\n" << + " barrier(CLK_LOCAL_MEM_FENCE);\n" << + " uint mask = (i << 1) - 1;\n" << + " if((lid & mask) == 0){\n" << + " block[lid] = " << + function(k.expr<input_type>("block[lid]"), + k.expr<input_type>("block[lid+i]")) << ";\n" << + " }\n" << + "}\n" << + + // write block result to global output + "if(lid == 0)\n" << + " output[get_group_id(0)] = block[0];\n"; + + kernel kernel = k.compile(context); + kernel.set_arg(output_arg, result.get_buffer()); + kernel.set_arg(block_arg, local_buffer<input_type>(block_size)); + + queue.enqueue_1d_range_kernel(kernel, + 0, + block_count * block_size, + block_size); + } + + // serially reduce any leftovers + if(block_count * block_size * 2 < count){ + size_t last_block_start = block_count * block_size * 2; + + meta_kernel k("extra_serial_reduce"); + size_t count_arg = k.add_arg<uint_>("count"); + size_t offset_arg = k.add_arg<uint_>("offset"); + size_t output_arg = k.add_arg<result_type *>(memory_object::global_memory, "output"); + size_t output_offset_arg = k.add_arg<uint_>("output_offset"); + + k << + k.decl<result_type>("result") << " = \n" << + first[k.expr<uint_>("offset")] << ";\n" << + "for(uint i = offset + 1; i < count; i++)\n" << + " result = " << + function(k.var<result_type>("result"), + first[k.var<uint_>("i")]) << ";\n" << + "output[output_offset] = result;\n"; + + kernel kernel = k.compile(context); + kernel.set_arg(count_arg, static_cast<uint_>(count)); + kernel.set_arg(offset_arg, 
static_cast<uint_>(last_block_start)); + kernel.set_arg(output_arg, result.get_buffer()); + kernel.set_arg(output_offset_arg, static_cast<uint_>(block_count)); + + queue.enqueue_task(kernel); + } + + return total_block_count; +} + +template<class InputIterator, class BinaryFunction> +inline vector< + typename boost::compute::result_of< + BinaryFunction( + typename std::iterator_traits<InputIterator>::value_type, + typename std::iterator_traits<InputIterator>::value_type + ) + >::type +> +block_reduce(InputIterator first, + size_t count, + size_t block_size, + BinaryFunction function, + command_queue &queue) +{ + typedef typename + std::iterator_traits<InputIterator>::value_type + input_type; + typedef typename + boost::compute::result_of<BinaryFunction(input_type, input_type)>::type + result_type; + + const context &context = queue.get_context(); + size_t total_block_count = + static_cast<size_t>(std::ceil(float(count) / 2.f / float(block_size))); + vector<result_type> result_vector(total_block_count, context); + + reduce(first, count, result_vector.begin(), block_size, function, queue); + + return result_vector; +} + +template<class InputIterator, class OutputIterator, class BinaryFunction> +inline void generic_reduce(InputIterator first, + InputIterator last, + OutputIterator result, + BinaryFunction function, + command_queue &queue) +{ + typedef typename + std::iterator_traits<InputIterator>::value_type + input_type; + typedef typename + boost::compute::result_of<BinaryFunction(input_type, input_type)>::type + result_type; + + const device &device = queue.get_device(); + const context &context = queue.get_context(); + + size_t count = detail::iterator_range_size(first, last); + + if(device.type() & device::cpu){ + boost::compute::vector<result_type> value(1, context); + detail::serial_reduce(first, last, value.begin(), function, queue); + boost::compute::copy_n(value.begin(), 1, result, queue); + } + else { + size_t block_size = 256; + + // first pass + 
vector<result_type> results = detail::block_reduce(first, + count, + block_size, + function, + queue); + + if(results.size() > 1){ + detail::inplace_reduce(results.begin(), + results.end(), + function, + queue); + } + + boost::compute::copy_n(results.begin(), 1, result, queue); + } +} + +template<class InputIterator, class OutputIterator, class T> +inline void dispatch_reduce(InputIterator first, + InputIterator last, + OutputIterator result, + const plus<T> &function, + command_queue &queue) +{ + const context &context = queue.get_context(); + const device &device = queue.get_device(); + + // reduce to temporary buffer on device + array<T, 1> tmp(context); + if(device.type() & device::cpu){ + detail::serial_reduce(first, last, tmp.begin(), function, queue); + } + else { + reduce_on_gpu(first, last, tmp.begin(), function, queue); + } + + // copy to result iterator + copy_n(tmp.begin(), 1, result, queue); +} + +template<class InputIterator, class OutputIterator, class BinaryFunction> +inline void dispatch_reduce(InputIterator first, + InputIterator last, + OutputIterator result, + BinaryFunction function, + command_queue &queue) +{ + generic_reduce(first, last, result, function, queue); +} + +} // end detail namespace + +/// Returns the result of applying \p function to the elements in the +/// range [\p first, \p last). +/// +/// If no function is specified, \c plus will be used. +/// +/// \param first first element in the input range +/// \param last last element in the input range +/// \param result iterator pointing to the output +/// \param function binary reduction function +/// \param queue command queue to perform the operation +/// +/// The \c reduce() algorithm assumes that the binary reduction function is +/// associative. When used with non-associative functions the result may +/// be non-deterministic and vary in precision. 
Notably this affects the +/// \c plus<float>() function as floating-point addition is not associative +/// and may produce slightly different results than a serial algorithm. +/// +/// This algorithm supports both host and device iterators for the +/// result argument. This allows for values to be reduced and copied +/// to the host all with a single function call. +/// +/// For example, to calculate the sum of the values in a device vector and +/// copy the result to a value on the host: +/// +/// \snippet test/test_reduce.cpp sum_int +/// +/// Note that while the the \c reduce() algorithm is conceptually identical to +/// the \c accumulate() algorithm, its implementation is substantially more +/// efficient on parallel hardware. For more information, see the documentation +/// on the \c accumulate() algorithm. +/// +/// \see accumulate() +template<class InputIterator, class OutputIterator, class BinaryFunction> +inline void reduce(InputIterator first, + InputIterator last, + OutputIterator result, + BinaryFunction function, + command_queue &queue = system::default_queue()) +{ + if(first == last){ + return; + } + + detail::dispatch_reduce(first, last, result, function, queue); +} + +/// \overload +template<class InputIterator, class OutputIterator> +inline void reduce(InputIterator first, + InputIterator last, + OutputIterator result, + command_queue &queue = system::default_queue()) +{ + typedef typename std::iterator_traits<InputIterator>::value_type T; + + if(first == last){ + return; + } + + detail::dispatch_reduce(first, last, result, plus<T>(), queue); +} + +} // end compute namespace +} // end boost namespace + +#endif // BOOST_COMPUTE_ALGORITHM_REDUCE_HPP diff --git a/boost/compute/algorithm/reduce_by_key.hpp b/boost/compute/algorithm/reduce_by_key.hpp new file mode 100644 index 0000000000..87c73e887f --- /dev/null +++ b/boost/compute/algorithm/reduce_by_key.hpp @@ -0,0 +1,118 @@ 
+//---------------------------------------------------------------------------// +// Copyright (c) 2015 Jakub Szuppe <j.szuppe@gmail.com> +// +// Distributed under the Boost Software License, Version 1.0 +// See accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt +// +// See http://boostorg.github.com/compute for more information. +//---------------------------------------------------------------------------// + +#ifndef BOOST_COMPUTE_ALGORITHM_REDUCE_BY_KEY_HPP +#define BOOST_COMPUTE_ALGORITHM_REDUCE_BY_KEY_HPP + +#include <iterator> +#include <utility> + +#include <boost/compute/command_queue.hpp> +#include <boost/compute/device.hpp> +#include <boost/compute/functional.hpp> +#include <boost/compute/system.hpp> +#include <boost/compute/algorithm/detail/reduce_by_key.hpp> + +namespace boost { +namespace compute { + +/// The \c reduce_by_key() algorithm performs reduction for each contiguous +/// subsequence of values determined by equivalent keys. +/// +/// Returns a pair of iterators at the end of the ranges [\p keys_result, \p keys_result_last) +/// and [\p values_result, \p values_result_last). +/// +/// If no function is specified, \c plus will be used. +/// If no predicate is specified, \c equal_to will be used. +/// +/// \param keys_first the first key +/// \param keys_last the last key +/// \param values_first the first input value +/// \param keys_result iterator pointing to the key output +/// \param values_result iterator pointing to the reduced value output +/// \param function binary reduction function +/// \param predicate binary predicate which returns true only if two keys are equal +/// \param queue command queue to perform the operation +/// +/// The \c reduce_by_key() algorithm assumes that the binary reduction function +/// is associative. When used with non-associative functions the result may +/// be non-deterministic and vary in precision. 
Notably this affects the +/// \c plus<float>() function as floating-point addition is not associative +/// and may produce slightly different results than a serial algorithm. +/// +/// For example, to calculate the sum of the values for each key: +/// +/// \snippet test/test_reduce_by_key.cpp reduce_by_key_int +/// +/// \see reduce() +template<class InputKeyIterator, class InputValueIterator, + class OutputKeyIterator, class OutputValueIterator, + class BinaryFunction, class BinaryPredicate> +inline std::pair<OutputKeyIterator, OutputValueIterator> +reduce_by_key(InputKeyIterator keys_first, + InputKeyIterator keys_last, + InputValueIterator values_first, + OutputKeyIterator keys_result, + OutputValueIterator values_result, + BinaryFunction function, + BinaryPredicate predicate, + command_queue &queue = system::default_queue()) +{ + return detail::dispatch_reduce_by_key(keys_first, keys_last, values_first, + keys_result, values_result, + function, predicate, + queue); +} + +/// \overload +template<class InputKeyIterator, class InputValueIterator, + class OutputKeyIterator, class OutputValueIterator, + class BinaryFunction> +inline std::pair<OutputKeyIterator, OutputValueIterator> +reduce_by_key(InputKeyIterator keys_first, + InputKeyIterator keys_last, + InputValueIterator values_first, + OutputKeyIterator keys_result, + OutputValueIterator values_result, + BinaryFunction function, + command_queue &queue = system::default_queue()) +{ + typedef typename std::iterator_traits<InputKeyIterator>::value_type key_type; + + return reduce_by_key(keys_first, keys_last, values_first, + keys_result, values_result, + function, equal_to<key_type>(), + queue); +} + +/// \overload +template<class InputKeyIterator, class InputValueIterator, + class OutputKeyIterator, class OutputValueIterator> +inline std::pair<OutputKeyIterator, OutputValueIterator> +reduce_by_key(InputKeyIterator keys_first, + InputKeyIterator keys_last, + InputValueIterator values_first, + OutputKeyIterator 
keys_result, + OutputValueIterator values_result, + command_queue &queue = system::default_queue()) +{ + typedef typename std::iterator_traits<InputKeyIterator>::value_type key_type; + typedef typename std::iterator_traits<InputValueIterator>::value_type value_type; + + return reduce_by_key(keys_first, keys_last, values_first, + keys_result, values_result, + plus<value_type>(), equal_to<key_type>(), + queue); +} + +} // end compute namespace +} // end boost namespace + +#endif // BOOST_COMPUTE_ALGORITHM_REDUCE_BY_KEY_HPP diff --git a/boost/compute/algorithm/remove.hpp b/boost/compute/algorithm/remove.hpp new file mode 100644 index 0000000000..98feb1f9d8 --- /dev/null +++ b/boost/compute/algorithm/remove.hpp @@ -0,0 +1,54 @@ +//---------------------------------------------------------------------------// +// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com> +// +// Distributed under the Boost Software License, Version 1.0 +// See accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt +// +// See http://boostorg.github.com/compute for more information. +//---------------------------------------------------------------------------// + +#ifndef BOOST_COMPUTE_ALGORITHM_REMOVE_HPP +#define BOOST_COMPUTE_ALGORITHM_REMOVE_HPP + +#include <boost/compute/lambda.hpp> +#include <boost/compute/system.hpp> +#include <boost/compute/algorithm/remove_if.hpp> +#include <boost/compute/type_traits/vector_size.hpp> + +namespace boost { +namespace compute { + +/// Removes each element equal to \p value in the range [\p first, +/// \p last). 
+/// +/// \see remove_if() +template<class Iterator, class T> +inline Iterator remove(Iterator first, + Iterator last, + const T &value, + command_queue &queue = system::default_queue()) +{ + typedef typename std::iterator_traits<Iterator>::value_type value_type; + + using ::boost::compute::_1; + using ::boost::compute::lambda::all; + + if(vector_size<value_type>::value == 1){ + return ::boost::compute::remove_if(first, + last, + _1 == value, + queue); + } + else { + return ::boost::compute::remove_if(first, + last, + all(_1 == value), + queue); + } +} + +} // end compute namespace +} // end boost namespace + +#endif // BOOST_COMPUTE_ALGORITHM_REMOVE_HPP diff --git a/boost/compute/algorithm/remove_if.hpp b/boost/compute/algorithm/remove_if.hpp new file mode 100644 index 0000000000..5e416bef88 --- /dev/null +++ b/boost/compute/algorithm/remove_if.hpp @@ -0,0 +1,47 @@ +//---------------------------------------------------------------------------// +// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com> +// +// Distributed under the Boost Software License, Version 1.0 +// See accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt +// +// See http://boostorg.github.com/compute for more information. +//---------------------------------------------------------------------------// + +#ifndef BOOST_COMPUTE_ALGORITHM_REMOVE_IF_HPP +#define BOOST_COMPUTE_ALGORITHM_REMOVE_IF_HPP + +#include <boost/compute/system.hpp> +#include <boost/compute/algorithm/copy_if.hpp> +#include <boost/compute/container/vector.hpp> +#include <boost/compute/functional/logical.hpp> + +namespace boost { +namespace compute { + +/// Removes each element for which \p predicate returns \c true in the +/// range [\p first, \p last). 
+/// +/// \see remove() +template<class Iterator, class Predicate> +inline Iterator remove_if(Iterator first, + Iterator last, + Predicate predicate, + command_queue &queue = system::default_queue()) +{ + typedef typename std::iterator_traits<Iterator>::value_type value_type; + + // temporary storage for the input data + ::boost::compute::vector<value_type> tmp(first, last, queue); + + return ::boost::compute::copy_if(tmp.begin(), + tmp.end(), + first, + not1(predicate), + queue); +} + +} // end compute namespace +} // end boost namespace + +#endif // BOOST_COMPUTE_ALGORITHM_REMOVE_IF_HPP diff --git a/boost/compute/algorithm/replace.hpp b/boost/compute/algorithm/replace.hpp new file mode 100644 index 0000000000..fd649a2fad --- /dev/null +++ b/boost/compute/algorithm/replace.hpp @@ -0,0 +1,90 @@ +//---------------------------------------------------------------------------// +// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com> +// +// Distributed under the Boost Software License, Version 1.0 +// See accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt +// +// See http://boostorg.github.com/compute for more information. 
+//---------------------------------------------------------------------------// + +#ifndef BOOST_COMPUTE_ALGORITHM_REPLACE_HPP +#define BOOST_COMPUTE_ALGORITHM_REPLACE_HPP + +#include <boost/compute/system.hpp> +#include <boost/compute/command_queue.hpp> +#include <boost/compute/detail/meta_kernel.hpp> +#include <boost/compute/detail/iterator_range_size.hpp> + +namespace boost { +namespace compute { +namespace detail { + +template<class Iterator, class T> +class replace_kernel : public meta_kernel +{ +public: + replace_kernel() + : meta_kernel("replace") + { + m_count = 0; + } + + void set_range(Iterator first, Iterator last) + { + m_count = detail::iterator_range_size(first, last); + + *this << + "const uint i = get_global_id(0);\n" << + "if(" << first[var<cl_uint>("i")] << " == " << var<T>("old_value") << ")\n" << + " " << first[var<cl_uint>("i")] << '=' << var<T>("new_value") << ";\n"; + } + + void set_old_value(const T &old_value) + { + add_set_arg<T>("old_value", old_value); + } + + void set_new_value(const T &new_value) + { + add_set_arg<T>("new_value", new_value); + } + + void exec(command_queue &queue) + { + if(m_count == 0){ + // nothing to do + return; + } + + exec_1d(queue, 0, m_count); + } + +private: + size_t m_count; +}; + +} // end detail namespace + +/// Replaces each instance of \p old_value in the range [\p first, +/// \p last) with \p new_value. 
+template<class Iterator, class T> +inline void replace(Iterator first, + Iterator last, + const T &old_value, + const T &new_value, + command_queue &queue = system::default_queue()) +{ + detail::replace_kernel<Iterator, T> kernel; + + kernel.set_range(first, last); + kernel.set_old_value(old_value); + kernel.set_new_value(new_value); + + kernel.exec(queue); +} + +} // end compute namespace +} // end boost namespace + +#endif // BOOST_COMPUTE_ALGORITHM_REPLACE_HPP diff --git a/boost/compute/algorithm/replace_copy.hpp b/boost/compute/algorithm/replace_copy.hpp new file mode 100644 index 0000000000..7224bd3ae6 --- /dev/null +++ b/boost/compute/algorithm/replace_copy.hpp @@ -0,0 +1,62 @@ +//---------------------------------------------------------------------------// +// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com> +// +// Distributed under the Boost Software License, Version 1.0 +// See accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt +// +// See http://boostorg.github.com/compute for more information. +//---------------------------------------------------------------------------// + +#ifndef BOOST_COMPUTE_ALGORITHM_REPLACE_COPY_HPP +#define BOOST_COMPUTE_ALGORITHM_REPLACE_COPY_HPP + +#include <iterator> + +#include <boost/compute/system.hpp> +#include <boost/compute/command_queue.hpp> +#include <boost/compute/algorithm/copy.hpp> +#include <boost/compute/algorithm/replace.hpp> + +namespace boost { +namespace compute { + +/// Copies the value in the range [\p first, \p last) to the range +/// beginning at \p result while replacing each instance of \p old_value +/// with \p new_value. 
+/// +/// \see replace() +template<class InputIterator, class OutputIterator, class T> +inline OutputIterator +replace_copy(InputIterator first, + InputIterator last, + OutputIterator result, + const T &old_value, + const T &new_value, + command_queue &queue = system::default_queue()) +{ + typedef typename std::iterator_traits<OutputIterator>::difference_type difference_type; + + difference_type count = std::distance(first, last); + if(count == 0){ + return result; + } + + // copy data to result + ::boost::compute::copy(first, last, result, queue); + + // replace in result + ::boost::compute::replace(result, + result + count, + old_value, + new_value, + queue); + + // return iterator to the end of result + return result + count; +} + +} // end compute namespace +} // end boost namespace + +#endif // BOOST_COMPUTE_ALGORITHM_REPLACE_COPY_HPP diff --git a/boost/compute/algorithm/reverse.hpp b/boost/compute/algorithm/reverse.hpp new file mode 100644 index 0000000000..b6a9e8098c --- /dev/null +++ b/boost/compute/algorithm/reverse.hpp @@ -0,0 +1,74 @@ +//---------------------------------------------------------------------------// +// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com> +// +// Distributed under the Boost Software License, Version 1.0 +// See accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt +// +// See http://boostorg.github.com/compute for more information. 
+//---------------------------------------------------------------------------// + +#ifndef BOOST_COMPUTE_ALGORITHM_REVERSE_HPP +#define BOOST_COMPUTE_ALGORITHM_REVERSE_HPP + +#include <boost/compute/system.hpp> +#include <boost/compute/command_queue.hpp> +#include <boost/compute/detail/meta_kernel.hpp> +#include <boost/compute/detail/iterator_range_size.hpp> + +namespace boost { +namespace compute { +namespace detail { + +template<class Iterator> +struct reverse_kernel : public meta_kernel +{ + reverse_kernel(Iterator first, Iterator last) + : meta_kernel("reverse") + { + typedef typename std::iterator_traits<Iterator>::value_type value_type; + + // store size of the range + m_size = detail::iterator_range_size(first, last); + add_set_arg<const cl_uint>("size", static_cast<const cl_uint>(m_size)); + + *this << + decl<cl_uint>("i") << " = get_global_id(0);\n" << + decl<cl_uint>("j") << " = size - get_global_id(0) - 1;\n" << + decl<value_type>("tmp") << "=" << first[var<cl_uint>("i")] << ";\n" << + first[var<cl_uint>("i")] << "=" << first[var<cl_uint>("j")] << ";\n" << + first[var<cl_uint>("j")] << "= tmp;\n"; + } + + void exec(command_queue &queue) + { + exec_1d(queue, 0, m_size / 2); + } + + size_t m_size; +}; + +} // end detail namespace + +/// Reverses the elements in the range [\p first, \p last). 
+/// +/// \see reverse_copy() +template<class Iterator> +inline void reverse(Iterator first, + Iterator last, + command_queue &queue = system::default_queue()) +{ + size_t count = detail::iterator_range_size(first, last); + if(count < 2){ + return; + } + + detail::reverse_kernel<Iterator> kernel(first, last); + + kernel.exec(queue); +} + +} // end compute namespace +} // end boost namespace + +#endif // BOOST_COMPUTE_ALGORITHM_REVERSE_HPP diff --git a/boost/compute/algorithm/reverse_copy.hpp b/boost/compute/algorithm/reverse_copy.hpp new file mode 100644 index 0000000000..c839f44651 --- /dev/null +++ b/boost/compute/algorithm/reverse_copy.hpp @@ -0,0 +1,79 @@ +//---------------------------------------------------------------------------// +// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com> +// +// Distributed under the Boost Software License, Version 1.0 +// See accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt +// +// See http://boostorg.github.com/compute for more information. 
+//---------------------------------------------------------------------------// + +#ifndef BOOST_COMPUTE_ALGORITHM_REVERSE_COPY_HPP +#define BOOST_COMPUTE_ALGORITHM_REVERSE_COPY_HPP + +#include <iterator> + +#include <boost/compute/system.hpp> +#include <boost/compute/command_queue.hpp> +#include <boost/compute/algorithm/copy.hpp> +#include <boost/compute/algorithm/reverse.hpp> + +namespace boost { +namespace compute { +namespace detail { + +template<class Iterator, class OutputIterator> +struct reverse_copy_kernel : public meta_kernel +{ + reverse_copy_kernel(Iterator first, Iterator last, OutputIterator result) + : meta_kernel("reverse_copy") + { + // store size of the range + m_size = detail::iterator_range_size(first, last); + add_set_arg<const cl_uint>("size", static_cast<const cl_uint>(m_size)); + + *this << + decl<cl_uint>("i") << " = get_global_id(0);\n" << + decl<cl_uint>("j") << " = size - get_global_id(0) - 1;\n" << + result[var<cl_uint>("j")] << "=" << first[var<cl_uint>("i")] << ";\n"; + } + + void exec(command_queue &queue) + { + exec_1d(queue, 0, m_size); + } + + size_t m_size; +}; + +} // end detail namespace + +/// Copies the elements in the range [\p first, \p last) in reversed +/// order to the range beginning at \p result. 
+/// +/// \see reverse() +template<class InputIterator, class OutputIterator> +inline OutputIterator +reverse_copy(InputIterator first, + InputIterator last, + OutputIterator result, + command_queue &queue = system::default_queue()) +{ + typedef typename std::iterator_traits<OutputIterator>::difference_type difference_type; + + difference_type count = std::distance(first, last); + + detail::reverse_copy_kernel<InputIterator, OutputIterator> + kernel(first, last, result); + + // run kernel + kernel.exec(queue); + + // return iterator to the end of result + return result + count; +} + +} // end compute namespace +} // end boost namespace + +#endif // BOOST_COMPUTE_ALGORITHM_REVERSE_COPY_HPP diff --git a/boost/compute/algorithm/rotate.hpp b/boost/compute/algorithm/rotate.hpp new file mode 100644 index 0000000000..54cb073cc2 --- /dev/null +++ b/boost/compute/algorithm/rotate.hpp @@ -0,0 +1,54 @@ +//---------------------------------------------------------------------------// +// Copyright (c) 2014 Roshan <thisisroshansmail@gmail.com> +// +// Distributed under the Boost Software License, Version 1.0 +// See accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt +// +// See http://boostorg.github.com/compute for more information. +//---------------------------------------------------------------------------// + +#ifndef BOOST_COMPUTE_ALGORITHM_ROTATE_HPP +#define BOOST_COMPUTE_ALGORITHM_ROTATE_HPP + +#include <boost/compute/system.hpp> +#include <boost/compute/algorithm/copy.hpp> +#include <boost/compute/container/vector.hpp> + +namespace boost { +namespace compute { + +/// Performs left rotation such that element at \p n_first comes to the +/// beginning. 
+/// +/// \see rotate_copy() +template<class InputIterator> +inline void rotate(InputIterator first, + InputIterator n_first, + InputIterator last, + command_queue &queue = system::default_queue()) +{ + //Handle trivial cases + if (n_first==first || n_first==last) + { + return; + } + + //Handle others + typedef typename std::iterator_traits<InputIterator>::value_type T; + + size_t count = detail::iterator_range_size(first, n_first); + size_t count2 = detail::iterator_range_size(first, last); + + const context &context = queue.get_context(); + vector<T> temp(count2, context); + ::boost::compute::copy(first, last, temp.begin(), queue); + + ::boost::compute::copy(temp.begin()+count, temp.end(), first, queue); + ::boost::compute::copy(temp.begin(), temp.begin()+count, last-count, queue); +} + +} //end compute namespace +} //end boost namespace + +#endif // BOOST_COMPUTE_ALGORITHM_ROTATE_HPP diff --git a/boost/compute/algorithm/rotate_copy.hpp b/boost/compute/algorithm/rotate_copy.hpp new file mode 100644 index 0000000000..fa1b44c5e5 --- /dev/null +++ b/boost/compute/algorithm/rotate_copy.hpp @@ -0,0 +1,41 @@ +//---------------------------------------------------------------------------// +// Copyright (c) 2014 Roshan <thisisroshansmail@gmail.com> +// +// Distributed under the Boost Software License, Version 1.0 +// See accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt +// +// See http://boostorg.github.com/compute for more information. +//---------------------------------------------------------------------------// + +#ifndef BOOST_COMPUTE_ALGORITHM_ROTATE_COPY_HPP +#define BOOST_COMPUTE_ALGORITHM_ROTATE_COPY_HPP + +#include <boost/compute/system.hpp> +#include <boost/compute/algorithm/copy.hpp> + +namespace boost { +namespace compute { + +/// Performs left rotation such that element at n_first comes to the +/// beginning and the output is stored in range starting at result. 
+/// +/// \see rotate() +template<class InputIterator, class OutputIterator> +inline void rotate_copy(InputIterator first, + InputIterator n_first, + InputIterator last, + OutputIterator result, + command_queue &queue = system::default_queue()) +{ + size_t count = detail::iterator_range_size(first, n_first); + size_t count2 = detail::iterator_range_size(n_first, last); + + ::boost::compute::copy(first+count, last, result, queue); + ::boost::compute::copy(first, first+count, result+count2, queue); +} + +} //end compute namespace +} //end boost namespace + +#endif // BOOST_COMPUTE_ALGORITHM_ROTATE_COPY_HPP diff --git a/boost/compute/algorithm/scatter.hpp b/boost/compute/algorithm/scatter.hpp new file mode 100644 index 0000000000..bea4201628 --- /dev/null +++ b/boost/compute/algorithm/scatter.hpp @@ -0,0 +1,99 @@ +//---------------------------------------------------------------------------// +// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com> +// +// Distributed under the Boost Software License, Version 1.0 +// See accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt +// +// See http://boostorg.github.com/compute for more information. 
+//---------------------------------------------------------------------------// + +#ifndef BOOST_COMPUTE_ALGORITHM_SCATTER_HPP +#define BOOST_COMPUTE_ALGORITHM_SCATTER_HPP + +#include <boost/algorithm/string/replace.hpp> + +#include <boost/compute/system.hpp> +#include <boost/compute/exception.hpp> +#include <boost/compute/command_queue.hpp> +#include <boost/compute/iterator/buffer_iterator.hpp> +#include <boost/compute/type_traits/type_name.hpp> +#include <boost/compute/detail/iterator_range_size.hpp> +#include <boost/compute/detail/meta_kernel.hpp> + +namespace boost { +namespace compute { +namespace detail { + +template<class InputIterator, class MapIterator, class OutputIterator> +class scatter_kernel : meta_kernel +{ +public: + scatter_kernel() : meta_kernel("scatter") + {} + + void set_range(InputIterator first, + InputIterator last, + MapIterator map, + OutputIterator result) + { + m_count = iterator_range_size(first, last); + m_input_offset = first.get_index(); + m_output_offset = result.get_index(); + + m_input_offset_arg = add_arg<uint_>("input_offset"); + m_output_offset_arg = add_arg<uint_>("output_offset"); + + *this << + "const uint i = get_global_id(0);\n" << + "uint i1 = " << map[expr<uint_>("i")] << + " + output_offset;\n" << + "uint i2 = i + input_offset;\n" << + result[expr<uint_>("i1")] << "=" << + first[expr<uint_>("i2")] << ";\n"; + } + + event exec(command_queue &queue) + { + if(m_count == 0) { + return event(); + } + + set_arg(m_input_offset_arg, uint_(m_input_offset)); + set_arg(m_output_offset_arg, uint_(m_output_offset)); + + return exec_1d(queue, 0, m_count); + } + +private: + size_t m_count; + size_t m_input_offset; + size_t m_input_offset_arg; + size_t m_output_offset; + size_t m_output_offset_arg; +}; + +} // end detail namespace + +/// Copies the elements from the range [\p first, \p last) to the range +/// beginning at \p result using the output indices from the range beginning +/// at \p map. 
+/// +/// \see gather() +template<class InputIterator, class MapIterator, class OutputIterator> +inline void scatter(InputIterator first, + InputIterator last, + MapIterator map, + OutputIterator result, + command_queue &queue = system::default_queue()) +{ + detail::scatter_kernel<InputIterator, MapIterator, OutputIterator> kernel; + + kernel.set_range(first, last, map, result); + kernel.exec(queue); +} + +} // end compute namespace +} // end boost namespace + +#endif // BOOST_COMPUTE_ALGORITHM_SCATTER_HPP diff --git a/boost/compute/algorithm/scatter_if.hpp b/boost/compute/algorithm/scatter_if.hpp new file mode 100644 index 0000000000..159edd8c86 --- /dev/null +++ b/boost/compute/algorithm/scatter_if.hpp @@ -0,0 +1,119 @@ +//---------------------------------------------------------------------------// +// Copyright (c) 2015 Jakub Pola <jakub.pola@gmail.com> +// +// Distributed under the Boost Software License, Version 1.0 +// See accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt +// +// See http://boostorg.github.com/compute for more information. 
+//---------------------------------------------------------------------------// + +#ifndef BOOST_COMPUTE_ALGORITHM_SCATTER_IF_HPP +#define BOOST_COMPUTE_ALGORITHM_SCATTER_IF_HPP + +#include <boost/algorithm/string/replace.hpp> + +#include <boost/compute/system.hpp> +#include <boost/compute/exception.hpp> +#include <boost/compute/command_queue.hpp> +#include <boost/compute/iterator/buffer_iterator.hpp> +#include <boost/compute/type_traits/type_name.hpp> +#include <boost/compute/detail/iterator_range_size.hpp> +#include <boost/compute/detail/meta_kernel.hpp> + +namespace boost { +namespace compute { +namespace detail { + +template<class InputIterator, class MapIterator, class StencilIterator, class OutputIterator, class Predicate> +class scatter_if_kernel : meta_kernel +{ +public: + scatter_if_kernel() : meta_kernel("scatter_if") + {} + + void set_range(InputIterator first, + InputIterator last, + MapIterator map, + StencilIterator stencil, + OutputIterator result, + Predicate predicate) + { + m_count = iterator_range_size(first, last); + m_input_offset = first.get_index(); + m_output_offset = result.get_index(); + + m_input_offset_arg = add_arg<uint_>("input_offset"); + m_output_offset_arg = add_arg<uint_>("output_offset"); + + *this << + "const uint i = get_global_id(0);\n" << + "uint i1 = " << map[expr<uint_>("i")] << + " + output_offset;\n" << + "uint i2 = i + input_offset;\n" << + if_(predicate(stencil[expr<uint_>("i")])) << "\n" << + result[expr<uint_>("i1")] << "=" << + first[expr<uint_>("i2")] << ";\n"; + } + + event exec(command_queue &queue) + { + if(m_count == 0) { + return event(); + } + + set_arg(m_input_offset_arg, uint_(m_input_offset)); + set_arg(m_output_offset_arg, uint_(m_output_offset)); + + return exec_1d(queue, 0, m_count); + } + +private: + size_t m_count; + size_t m_input_offset; + size_t m_input_offset_arg; + size_t m_output_offset; + size_t m_output_offset_arg; +}; + +} // end detail namespace + +/// Copies the elements from the range [\p 
first, \p last) to the range +/// beginning at \p result using the output indices from the range beginning +/// at \p map if stencil is resolved to true. By default the predicate is +/// an identity +/// +/// +template<class InputIterator, class MapIterator, class StencilIterator, class OutputIterator, + class Predicate> +inline void scatter_if(InputIterator first, + InputIterator last, + MapIterator map, + StencilIterator stencil, + OutputIterator result, + Predicate predicate, + command_queue &queue = system::default_queue()) +{ + detail::scatter_if_kernel<InputIterator, MapIterator, StencilIterator, OutputIterator, Predicate> kernel; + + kernel.set_range(first, last, map, stencil, result, predicate); + kernel.exec(queue); +} + +template<class InputIterator, class MapIterator, class StencilIterator, class OutputIterator> +inline void scatter_if(InputIterator first, + InputIterator last, + MapIterator map, + StencilIterator stencil, + OutputIterator result, + command_queue &queue = system::default_queue()) +{ + typedef typename std::iterator_traits<StencilIterator>::value_type T; + + scatter_if(first, last, map, stencil, result, identity<T>(), queue); +} + +} // end compute namespace +} // end boost namespace + +#endif // BOOST_COMPUTE_ALGORITHM_SCATTER_IF_HPP diff --git a/boost/compute/algorithm/search.hpp b/boost/compute/algorithm/search.hpp new file mode 100644 index 0000000000..3d3d035b3c --- /dev/null +++ b/boost/compute/algorithm/search.hpp @@ -0,0 +1,73 @@ +//---------------------------------------------------------------------------// +// Copyright (c) 2014 Roshan <thisisroshansmail@gmail.com> +// +// Distributed under the Boost Software License, Version 1.0 +// See accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt +// +// See http://boostorg.github.com/compute for more information. 
+//---------------------------------------------------------------------------// + +#ifndef BOOST_COMPUTE_ALGORITHM_SEARCH_HPP +#define BOOST_COMPUTE_ALGORITHM_SEARCH_HPP + +#include <boost/compute/algorithm/detail/search_all.hpp> +#include <boost/compute/algorithm/find.hpp> +#include <boost/compute/container/vector.hpp> +#include <boost/compute/detail/iterator_range_size.hpp> +#include <boost/compute/detail/meta_kernel.hpp> +#include <boost/compute/system.hpp> + +namespace boost { +namespace compute { + +/// +/// \brief Substring matching algorithm +/// +/// Searches for the first match of the pattern [p_first, p_last) +/// in text [t_first, t_last). +/// \return Iterator pointing to beginning of first occurrence +/// +/// \param t_first Iterator pointing to start of text +/// \param t_last Iterator pointing to end of text +/// \param p_first Iterator pointing to start of pattern +/// \param p_last Iterator pointing to end of pattern +/// \param queue Queue on which to execute +/// +template<class TextIterator, class PatternIterator> +inline TextIterator search(TextIterator t_first, + TextIterator t_last, + PatternIterator p_first, + PatternIterator p_last, + command_queue &queue = system::default_queue()) +{ + // there is no need to check if pattern starts at last n - 1 indices + vector<uint_> matching_indices( + detail::iterator_range_size(t_first, t_last) + - detail::iterator_range_size(p_first, p_last) + 1, + queue.get_context() + ); + + // search_kernel puts value 1 at every index in vector where pattern starts at + detail::search_kernel<PatternIterator, + TextIterator, + vector<uint_>::iterator> kernel; + + kernel.set_range(p_first, p_last, t_first, t_last, matching_indices.begin()); + kernel.exec(queue); + + vector<uint_>::iterator index = ::boost::compute::find( + matching_indices.begin(), matching_indices.end(), uint_(1), queue + ); + + // pattern was not found + if(index == matching_indices.end()) + return t_last; + + return t_first + 
detail::iterator_range_size(matching_indices.begin(), index); +} + +} //end compute namespace +} //end boost namespace + +#endif // BOOST_COMPUTE_ALGORITHM_SEARCH_HPP diff --git a/boost/compute/algorithm/search_n.hpp b/boost/compute/algorithm/search_n.hpp new file mode 100644 index 0000000000..9e03111bb0 --- /dev/null +++ b/boost/compute/algorithm/search_n.hpp @@ -0,0 +1,140 @@ +//---------------------------------------------------------------------------// +// Copyright (c) 2014 Roshan <thisisroshansmail@gmail.com> +// +// Distributed under the Boost Software License, Version 1.0 +// See accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt +// +// See http://boostorg.github.com/compute for more information. +//---------------------------------------------------------------------------// + +#ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_SEARCH_N_HPP +#define BOOST_COMPUTE_ALGORITHM_DETAIL_SEARCH_N_HPP + +#include <iterator> + +#include <boost/compute/algorithm/find.hpp> +#include <boost/compute/container/vector.hpp> +#include <boost/compute/detail/iterator_range_size.hpp> +#include <boost/compute/detail/meta_kernel.hpp> +#include <boost/compute/system.hpp> + +namespace boost { +namespace compute { +namespace detail { + +/// +/// \brief Search kernel class +/// +/// Subclass of meta_kernel which is capable of performing search_n +/// +template<class TextIterator, class OutputIterator> +class search_n_kernel : public meta_kernel +{ +public: + typedef typename std::iterator_traits<TextIterator>::value_type value_type; + + search_n_kernel() : meta_kernel("search_n") + {} + + void set_range(TextIterator t_first, + TextIterator t_last, + value_type value, + size_t n, + OutputIterator result) + { + m_n = n; + m_n_arg = add_arg<uint_>("n"); + + m_value = value; + m_value_arg = add_arg<value_type>("value"); + + m_count = iterator_range_size(t_first, t_last); + m_count = m_count + 1 - m_n; + + *this << + "uint i = get_global_id(0);\n" << + "uint i1 = 
i;\n" << + "uint j;\n" << + "for(j = 0; j<n; j++,i++)\n" << + "{\n" << + " if(value != " << t_first[expr<uint_>("i")] << ")\n" << + " j = n + 1;\n" << + "}\n" << + "if(j == n)\n" << + result[expr<uint_>("i1")] << " = 1;\n" << + "else\n" << + result[expr<uint_>("i1")] << " = 0;\n"; + } + + event exec(command_queue &queue) + { + if(m_count == 0) { + return event(); + } + + set_arg(m_n_arg, uint_(m_n)); + set_arg(m_value_arg, m_value); + + return exec_1d(queue, 0, m_count); + } + +private: + size_t m_n; + size_t m_n_arg; + size_t m_count; + value_type m_value; + size_t m_value_arg; +}; + +} //end detail namespace + +/// +/// \brief Substring matching algorithm +/// +/// Searches for the first occurrence of n consecutive occurrences of +/// value in text [t_first, t_last). +/// \return Iterator pointing to beginning of first occurrence +/// +/// \param t_first Iterator pointing to start of text +/// \param t_last Iterator pointing to end of text +/// \param n Number of times value repeats +/// \param value Value which repeats +/// \param queue Queue on which to execute +/// +template<class TextIterator, class ValueType> +inline TextIterator search_n(TextIterator t_first, + TextIterator t_last, + size_t n, + ValueType value, + command_queue &queue = system::default_queue()) +{ + // there is no need to check if pattern starts at last n - 1 indices + vector<uint_> matching_indices( + detail::iterator_range_size(t_first, t_last) + 1 - n, + queue.get_context() + ); + + // search_n_kernel puts value 1 at every index in vector where pattern + // of n values starts at + detail::search_n_kernel<TextIterator, + vector<uint_>::iterator> kernel; + + kernel.set_range(t_first, t_last, value, n, matching_indices.begin()); + kernel.exec(queue); + + vector<uint_>::iterator index = ::boost::compute::find( + matching_indices.begin(), matching_indices.end(), uint_(1), queue + ); + + // pattern was not found + if(index == matching_indices.end()) + return t_last; + + return t_first + 
detail::iterator_range_size(matching_indices.begin(), index); +} + +} //end compute namespace +} //end boost namespace + +#endif // BOOST_COMPUTE_ALGORITHM_DETAIL_SEARCH_N_HPP diff --git a/boost/compute/algorithm/set_difference.hpp b/boost/compute/algorithm/set_difference.hpp new file mode 100644 index 0000000000..17ce7bd3f6 --- /dev/null +++ b/boost/compute/algorithm/set_difference.hpp @@ -0,0 +1,182 @@ +//---------------------------------------------------------------------------// +// Copyright (c) 2014 Roshan <thisisroshansmail@gmail.com> +// +// Distributed under the Boost Software License, Version 1.0 +// See accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt +// +// See http://boostorg.github.com/compute for more information. +//---------------------------------------------------------------------------// + +#ifndef BOOST_COMPUTE_ALGORITHM_SET_DIFFERENCE_HPP +#define BOOST_COMPUTE_ALGORITHM_SET_DIFFERENCE_HPP + +#include <iterator> + +#include <boost/compute/algorithm/detail/compact.hpp> +#include <boost/compute/algorithm/detail/balanced_path.hpp> +#include <boost/compute/algorithm/exclusive_scan.hpp> +#include <boost/compute/algorithm/fill_n.hpp> +#include <boost/compute/container/vector.hpp> +#include <boost/compute/detail/iterator_range_size.hpp> +#include <boost/compute/detail/meta_kernel.hpp> +#include <boost/compute/system.hpp> + +namespace boost { +namespace compute { +namespace detail { + +/// +/// \brief Serial set difference kernel class +/// +/// Subclass of meta_kernel to perform serial set difference after tiling +/// +class serial_set_difference_kernel : meta_kernel +{ +public: + unsigned int tile_size; + + serial_set_difference_kernel() : meta_kernel("set_difference") + { + tile_size = 4; + } + + template<class InputIterator1, class InputIterator2, + class InputIterator3, class InputIterator4, + class OutputIterator1, class OutputIterator2> + void set_range(InputIterator1 first1, + InputIterator2 first2, + 
InputIterator3 tile_first1, + InputIterator3 tile_last1, + InputIterator4 tile_first2, + OutputIterator1 result, + OutputIterator2 counts) + { + m_count = iterator_range_size(tile_first1, tile_last1) - 1; + + *this << + "uint i = get_global_id(0);\n" << + "uint start1 = " << tile_first1[expr<uint_>("i")] << ";\n" << + "uint end1 = " << tile_first1[expr<uint_>("i+1")] << ";\n" << + "uint start2 = " << tile_first2[expr<uint_>("i")] << ";\n" << + "uint end2 = " << tile_first2[expr<uint_>("i+1")] << ";\n" << + "uint index = i*" << tile_size << ";\n" << + "uint count = 0;\n" << + "while(start1<end1 && start2<end2)\n" << + "{\n" << + " if(" << first1[expr<uint_>("start1")] << " == " << + first2[expr<uint_>("start2")] << ")\n" << + " {\n" << + " start1++; start2++;\n" << + " }\n" << + " else if(" << first1[expr<uint_>("start1")] << " < " << + first2[expr<uint_>("start2")] << ")\n" << + " {\n" << + result[expr<uint_>("index")] << + " = " << first1[expr<uint_>("start1")] << ";\n" << + " index++; count++;\n" << + " start1++;\n" << + " }\n" << + " else\n" << + " {\n" << + " start2++;\n" << + " }\n" << + "}\n" << + "while(start1<end1)\n" << + "{\n" << + result[expr<uint_>("index")] << + " = " << first1[expr<uint_>("start1")] << ";\n" << + " index++; count++;\n" << + " start1++;\n" << + "}\n" << + counts[expr<uint_>("i")] << " = count;\n"; + } + + event exec(command_queue &queue) + { + if(m_count == 0) { + return event(); + } + + return exec_1d(queue, 0, m_count); + } + +private: + size_t m_count; +}; + +} //end detail namespace + +/// +/// \brief Set difference algorithm +/// +/// Finds the difference of the sorted range [first2, last2) from the sorted +/// range [first1, last1) and stores it in range starting at result +/// \return Iterator pointing to end of difference +/// +/// \param first1 Iterator pointing to start of first set +/// \param last1 Iterator pointing to end of first set +/// \param first2 Iterator pointing to start of second set +/// \param last2 Iterator 
pointing to end of second set +/// \param result Iterator pointing to start of range in which the difference +/// will be stored +/// \param queue Queue on which to execute +/// +template<class InputIterator1, class InputIterator2, class OutputIterator> +inline OutputIterator set_difference(InputIterator1 first1, + InputIterator1 last1, + InputIterator2 first2, + InputIterator2 last2, + OutputIterator result, + command_queue &queue = system::default_queue()) +{ + typedef typename std::iterator_traits<InputIterator1>::value_type value_type; + + int tile_size = 1024; + + int count1 = detail::iterator_range_size(first1, last1); + int count2 = detail::iterator_range_size(first2, last2); + + vector<uint_> tile_a((count1+count2+tile_size-1)/tile_size+1, queue.get_context()); + vector<uint_> tile_b((count1+count2+tile_size-1)/tile_size+1, queue.get_context()); + + // Tile the sets + detail::balanced_path_kernel tiling_kernel; + tiling_kernel.tile_size = tile_size; + tiling_kernel.set_range(first1, last1, first2, last2, + tile_a.begin()+1, tile_b.begin()+1); + fill_n(tile_a.begin(), 1, 0, queue); + fill_n(tile_b.begin(), 1, 0, queue); + tiling_kernel.exec(queue); + + fill_n(tile_a.end()-1, 1, count1, queue); + fill_n(tile_b.end()-1, 1, count2, queue); + + vector<value_type> temp_result(count1+count2, queue.get_context()); + vector<uint_> counts((count1+count2+tile_size-1)/tile_size + 1, queue.get_context()); + fill_n(counts.end()-1, 1, 0, queue); + + // Find individual differences + detail::serial_set_difference_kernel difference_kernel; + difference_kernel.tile_size = tile_size; + difference_kernel.set_range(first1, first2, tile_a.begin(), tile_a.end(), + tile_b.begin(), temp_result.begin(), counts.begin()); + + difference_kernel.exec(queue); + + exclusive_scan(counts.begin(), counts.end(), counts.begin(), queue); + + // Compact the results + detail::compact_kernel compact_kernel; + compact_kernel.tile_size = tile_size; + compact_kernel.set_range(temp_result.begin(), 
counts.begin(), counts.end(), result); + + compact_kernel.exec(queue); + + return result + (counts.end() - 1).read(queue); +} + +} //end compute namespace +} //end boost namespace + +#endif // BOOST_COMPUTE_ALGORITHM_SET_DIFFERENCE_HPP diff --git a/boost/compute/algorithm/set_intersection.hpp b/boost/compute/algorithm/set_intersection.hpp new file mode 100644 index 0000000000..50f291e84a --- /dev/null +++ b/boost/compute/algorithm/set_intersection.hpp @@ -0,0 +1,170 @@ +//---------------------------------------------------------------------------// +// Copyright (c) 2014 Roshan <thisisroshansmail@gmail.com> +// +// Distributed under the Boost Software License, Version 1.0 +// See accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt +// +// See http://boostorg.github.com/compute for more information. +//---------------------------------------------------------------------------// + +#ifndef BOOST_COMPUTE_ALGORITHM_SET_INTERSECTION_HPP +#define BOOST_COMPUTE_ALGORITHM_SET_INTERSECTION_HPP + +#include <iterator> + +#include <boost/compute/algorithm/detail/compact.hpp> +#include <boost/compute/algorithm/detail/balanced_path.hpp> +#include <boost/compute/algorithm/exclusive_scan.hpp> +#include <boost/compute/algorithm/fill_n.hpp> +#include <boost/compute/container/vector.hpp> +#include <boost/compute/detail/iterator_range_size.hpp> +#include <boost/compute/detail/meta_kernel.hpp> +#include <boost/compute/system.hpp> + +namespace boost { +namespace compute { +namespace detail { + +/// +/// \brief Serial set intersection kernel class +/// +/// Subclass of meta_kernel to perform serial set intersection after tiling +/// +class serial_set_intersection_kernel : meta_kernel +{ +public: + unsigned int tile_size; + + serial_set_intersection_kernel() : meta_kernel("set_intersection") + { + tile_size = 4; + } + + template<class InputIterator1, class InputIterator2, + class InputIterator3, class InputIterator4, + class OutputIterator1, class 
OutputIterator2> + void set_range(InputIterator1 first1, + InputIterator2 first2, + InputIterator3 tile_first1, + InputIterator3 tile_last1, + InputIterator4 tile_first2, + OutputIterator1 result, + OutputIterator2 counts) + { + m_count = iterator_range_size(tile_first1, tile_last1) - 1; + + *this << + "uint i = get_global_id(0);\n" << + "uint start1 = " << tile_first1[expr<uint_>("i")] << ";\n" << + "uint end1 = " << tile_first1[expr<uint_>("i+1")] << ";\n" << + "uint start2 = " << tile_first2[expr<uint_>("i")] << ";\n" << + "uint end2 = " << tile_first2[expr<uint_>("i+1")] << ";\n" << + "uint index = i*" << tile_size << ";\n" << + "uint count = 0;\n" << + "while(start1<end1 && start2<end2)\n" << + "{\n" << + " if(" << first1[expr<uint_>("start1")] << " == " << + first2[expr<uint_>("start2")] << ")\n" << + " {\n" << + result[expr<uint_>("index")] << + " = " << first1[expr<uint_>("start1")] << ";\n" << + " index++; count++;\n" << + " start1++; start2++;\n" << + " }\n" << + " else if(" << first1[expr<uint_>("start1")] << " < " << + first2[expr<uint_>("start2")] << ")\n" << + " start1++;\n" << + " else start2++;\n" << + "}\n" << + counts[expr<uint_>("i")] << " = count;\n"; + } + + event exec(command_queue &queue) + { + if(m_count == 0) { + return event(); + } + + return exec_1d(queue, 0, m_count); + } + +private: + size_t m_count; +}; + +} //end detail namespace + +/// +/// \brief Set intersection algorithm +/// +/// Finds the intersection of the sorted range [first1, last1) with the sorted +/// range [first2, last2) and stores it in range starting at result +/// \return Iterator pointing to end of intersection +/// +/// \param first1 Iterator pointing to start of first set +/// \param last1 Iterator pointing to end of first set +/// \param first2 Iterator pointing to start of second set +/// \param last2 Iterator pointing to end of second set +/// \param result Iterator pointing to start of range in which the intersection +/// will be stored +/// \param queue Queue 
on which to execute +/// +template<class InputIterator1, class InputIterator2, class OutputIterator> +inline OutputIterator set_intersection(InputIterator1 first1, + InputIterator1 last1, + InputIterator2 first2, + InputIterator2 last2, + OutputIterator result, + command_queue &queue = system::default_queue()) +{ + typedef typename std::iterator_traits<InputIterator1>::value_type value_type; + + int tile_size = 1024; + + int count1 = detail::iterator_range_size(first1, last1); + int count2 = detail::iterator_range_size(first2, last2); + + vector<uint_> tile_a((count1+count2+tile_size-1)/tile_size+1, queue.get_context()); + vector<uint_> tile_b((count1+count2+tile_size-1)/tile_size+1, queue.get_context()); + + // Tile the sets + detail::balanced_path_kernel tiling_kernel; + tiling_kernel.tile_size = tile_size; + tiling_kernel.set_range(first1, last1, first2, last2, + tile_a.begin()+1, tile_b.begin()+1); + fill_n(tile_a.begin(), 1, 0, queue); + fill_n(tile_b.begin(), 1, 0, queue); + tiling_kernel.exec(queue); + + fill_n(tile_a.end()-1, 1, count1, queue); + fill_n(tile_b.end()-1, 1, count2, queue); + + vector<value_type> temp_result(count1+count2, queue.get_context()); + vector<uint_> counts((count1+count2+tile_size-1)/tile_size + 1, queue.get_context()); + fill_n(counts.end()-1, 1, 0, queue); + + // Find individual intersections + detail::serial_set_intersection_kernel intersection_kernel; + intersection_kernel.tile_size = tile_size; + intersection_kernel.set_range(first1, first2, tile_a.begin(), tile_a.end(), + tile_b.begin(), temp_result.begin(), counts.begin()); + + intersection_kernel.exec(queue); + + exclusive_scan(counts.begin(), counts.end(), counts.begin(), queue); + + // Compact the results + detail::compact_kernel compact_kernel; + compact_kernel.tile_size = tile_size; + compact_kernel.set_range(temp_result.begin(), counts.begin(), counts.end(), result); + + compact_kernel.exec(queue); + + return result + (counts.end() - 1).read(queue); +} + +} //end 
compute namespace +} //end boost namespace + +#endif // BOOST_COMPUTE_ALGORITHM_SET_INTERSECTION_HPP diff --git a/boost/compute/algorithm/set_symmetric_difference.hpp b/boost/compute/algorithm/set_symmetric_difference.hpp new file mode 100644 index 0000000000..6e60b38511 --- /dev/null +++ b/boost/compute/algorithm/set_symmetric_difference.hpp @@ -0,0 +1,194 @@ +//---------------------------------------------------------------------------// +// Copyright (c) 2014 Roshan <thisisroshansmail@gmail.com> +// +// Distributed under the Boost Software License, Version 1.0 +// See accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt +// +// See http://boostorg.github.com/compute for more information. +//---------------------------------------------------------------------------// + +#ifndef BOOST_COMPUTE_ALGORITHM_SET_SYMMETRIC_DIFFERENCE_HPP +#define BOOST_COMPUTE_ALGORITHM_SET_SYMMETRIC_DIFFERENCE_HPP + +#include <iterator> + +#include <boost/compute/algorithm/detail/compact.hpp> +#include <boost/compute/algorithm/detail/balanced_path.hpp> +#include <boost/compute/algorithm/exclusive_scan.hpp> +#include <boost/compute/algorithm/fill_n.hpp> +#include <boost/compute/container/vector.hpp> +#include <boost/compute/detail/iterator_range_size.hpp> +#include <boost/compute/detail/meta_kernel.hpp> +#include <boost/compute/system.hpp> + +namespace boost { +namespace compute { +namespace detail { + +/// +/// \brief Serial set symmetric difference kernel class +/// +/// Subclass of meta_kernel to perform serial set symmetric +/// difference after tiling +/// +class serial_set_symmetric_difference_kernel : meta_kernel +{ +public: + unsigned int tile_size; + + serial_set_symmetric_difference_kernel() : meta_kernel("set_symmetric_difference") + { + tile_size = 4; + } + + template<class InputIterator1, class InputIterator2, + class InputIterator3, class InputIterator4, + class OutputIterator1, class OutputIterator2> + void set_range(InputIterator1 first1, 
+ InputIterator2 first2, + InputIterator3 tile_first1, + InputIterator3 tile_last1, + InputIterator4 tile_first2, + OutputIterator1 result, + OutputIterator2 counts) + { + m_count = iterator_range_size(tile_first1, tile_last1) - 1; + + *this << + "uint i = get_global_id(0);\n" << + "uint start1 = " << tile_first1[expr<uint_>("i")] << ";\n" << + "uint end1 = " << tile_first1[expr<uint_>("i+1")] << ";\n" << + "uint start2 = " << tile_first2[expr<uint_>("i")] << ";\n" << + "uint end2 = " << tile_first2[expr<uint_>("i+1")] << ";\n" << + "uint index = i*" << tile_size << ";\n" << + "uint count = 0;\n" << + "while(start1<end1 && start2<end2)\n" << + "{\n" << + " if(" << first1[expr<uint_>("start1")] << " == " << + first2[expr<uint_>("start2")] << ")\n" << + " {\n" << + " start1++; start2++;\n" << + " }\n" << + " else if(" << first1[expr<uint_>("start1")] << " < " << + first2[expr<uint_>("start2")] << ")\n" << + " {\n" << + result[expr<uint_>("index")] << + " = " << first1[expr<uint_>("start1")] << ";\n" << + " index++; count++;\n" << + " start1++;\n" << + " }\n" << + " else\n" << + " {\n" << + result[expr<uint_>("index")] << + " = " << first2[expr<uint_>("start2")] << ";\n" << + " index++; count++;\n" << + " start2++;\n" << + " }\n" << + "}\n" << + "while(start1<end1)\n" << + "{\n" << + result[expr<uint_>("index")] << + " = " << first1[expr<uint_>("start1")] << ";\n" << + " index++; count++;\n" << + " start1++;\n" << + "}\n" << + "while(start2<end2)\n" << + "{\n" << + result[expr<uint_>("index")] << + " = " << first2[expr<uint_>("start2")] << ";\n" << + " index++; count++;\n" << + " start2++;\n" << + "}\n" << + counts[expr<uint_>("i")] << " = count;\n"; + } + + event exec(command_queue &queue) + { + if(m_count == 0) { + return event(); + } + + return exec_1d(queue, 0, m_count); + } + +private: + size_t m_count; +}; + +} //end detail namespace + +/// +/// \brief Set symmetric difference algorithm +/// +/// Finds the symmetric difference of the sorted range [first2, last2) 
from +/// the sorted range [first1, last1) and stores it in range starting at result +/// \return Iterator pointing to end of symmetric difference +/// +/// \param first1 Iterator pointing to start of first set +/// \param last1 Iterator pointing to end of first set +/// \param first2 Iterator pointing to start of second set +/// \param last2 Iterator pointing to end of second set +/// \param result Iterator pointing to start of range in which the symmetric +/// difference will be stored +/// \param queue Queue on which to execute +/// +template<class InputIterator1, class InputIterator2, class OutputIterator> +inline OutputIterator set_symmetric_difference(InputIterator1 first1, + InputIterator1 last1, + InputIterator2 first2, + InputIterator2 last2, + OutputIterator result, + command_queue &queue = system::default_queue()) +{ + typedef typename std::iterator_traits<InputIterator1>::value_type value_type; + + int tile_size = 1024; + + int count1 = detail::iterator_range_size(first1, last1); + int count2 = detail::iterator_range_size(first2, last2); + + vector<uint_> tile_a((count1+count2+tile_size-1)/tile_size+1, queue.get_context()); + vector<uint_> tile_b((count1+count2+tile_size-1)/tile_size+1, queue.get_context()); + + // Tile the sets + detail::balanced_path_kernel tiling_kernel; + tiling_kernel.tile_size = tile_size; + tiling_kernel.set_range(first1, last1, first2, last2, + tile_a.begin()+1, tile_b.begin()+1); + fill_n(tile_a.begin(), 1, 0, queue); + fill_n(tile_b.begin(), 1, 0, queue); + tiling_kernel.exec(queue); + + fill_n(tile_a.end()-1, 1, count1, queue); + fill_n(tile_b.end()-1, 1, count2, queue); + + vector<value_type> temp_result(count1+count2, queue.get_context()); + vector<uint_> counts((count1+count2+tile_size-1)/tile_size + 1, queue.get_context()); + fill_n(counts.end()-1, 1, 0, queue); + + // Find individual symmetric differences + detail::serial_set_symmetric_difference_kernel symmetric_difference_kernel; + symmetric_difference_kernel.tile_size 
= tile_size; + symmetric_difference_kernel.set_range(first1, first2, tile_a.begin(), + tile_a.end(), tile_b.begin(), + temp_result.begin(), counts.begin()); + + symmetric_difference_kernel.exec(queue); + + exclusive_scan(counts.begin(), counts.end(), counts.begin(), queue); + + // Compact the results + detail::compact_kernel compact_kernel; + compact_kernel.tile_size = tile_size; + compact_kernel.set_range(temp_result.begin(), counts.begin(), counts.end(), result); + + compact_kernel.exec(queue); + + return result + (counts.end() - 1).read(queue); +} + +} //end compute namespace +} //end boost namespace + +#endif // BOOST_COMPUTE_ALGORITHM_SET_SYMMETRIC_DIFFERENCE_HPP diff --git a/boost/compute/algorithm/set_union.hpp b/boost/compute/algorithm/set_union.hpp new file mode 100644 index 0000000000..c61f7b29b3 --- /dev/null +++ b/boost/compute/algorithm/set_union.hpp @@ -0,0 +1,195 @@ +//---------------------------------------------------------------------------// +// Copyright (c) 2014 Roshan <thisisroshansmail@gmail.com> +// +// Distributed under the Boost Software License, Version 1.0 +// See accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt +// +// See http://boostorg.github.com/compute for more information. 
+//---------------------------------------------------------------------------// + +#ifndef BOOST_COMPUTE_ALGORITHM_SET_UNION_HPP +#define BOOST_COMPUTE_ALGORITHM_SET_UNION_HPP + +#include <iterator> + +#include <boost/compute/algorithm/detail/balanced_path.hpp> +#include <boost/compute/algorithm/detail/compact.hpp> +#include <boost/compute/algorithm/exclusive_scan.hpp> +#include <boost/compute/algorithm/fill_n.hpp> +#include <boost/compute/container/vector.hpp> +#include <boost/compute/detail/iterator_range_size.hpp> +#include <boost/compute/detail/meta_kernel.hpp> +#include <boost/compute/system.hpp> + +namespace boost { +namespace compute { +namespace detail { + +/// +/// \brief Serial set union kernel class +/// +/// Subclass of meta_kernel to perform serial set union after tiling +/// +class serial_set_union_kernel : meta_kernel +{ +public: + unsigned int tile_size; + + serial_set_union_kernel() : meta_kernel("set_union") + { + tile_size = 4; + } + + template<class InputIterator1, class InputIterator2, + class InputIterator3, class InputIterator4, + class OutputIterator1, class OutputIterator2> + void set_range(InputIterator1 first1, + InputIterator2 first2, + InputIterator3 tile_first1, + InputIterator3 tile_last1, + InputIterator4 tile_first2, + OutputIterator1 result, + OutputIterator2 counts) + { + m_count = iterator_range_size(tile_first1, tile_last1) - 1; + + *this << + "uint i = get_global_id(0);\n" << + "uint start1 = " << tile_first1[expr<uint_>("i")] << ";\n" << + "uint end1 = " << tile_first1[expr<uint_>("i+1")] << ";\n" << + "uint start2 = " << tile_first2[expr<uint_>("i")] << ";\n" << + "uint end2 = " << tile_first2[expr<uint_>("i+1")] << ";\n" << + "uint index = i*" << tile_size << ";\n" << + "uint count = 0;\n" << + "while(start1<end1 && start2<end2)\n" << + "{\n" << + " if(" << first1[expr<uint_>("start1")] << " == " << + first2[expr<uint_>("start2")] << ")\n" << + " {\n" << + result[expr<uint_>("index")] << + " = " << 
first1[expr<uint_>("start1")] << ";\n" << + " index++; count++;\n" << + " start1++; start2++;\n" << + " }\n" << + " else if(" << first1[expr<uint_>("start1")] << " < " << + first2[expr<uint_>("start2")] << ")\n" << + " {\n" << + result[expr<uint_>("index")] << + " = " << first1[expr<uint_>("start1")] << ";\n" << + " index++; count++;\n" << + " start1++;\n" << + " }\n" << + " else\n" << + " {\n" << + result[expr<uint_>("index")] << + " = " << first2[expr<uint_>("start2")] << ";\n" << + " index++; count++;\n" << + " start2++;\n" << + " }\n" << + "}\n" << + "while(start1<end1)\n" << + "{\n" << + result[expr<uint_>("index")] << + " = " << first1[expr<uint_>("start1")] << ";\n" << + " index++; count++;\n" << + " start1++;\n" << + "}\n" << + "while(start2<end2)\n" << + "{\n" << + result[expr<uint_>("index")] << + " = " << first2[expr<uint_>("start2")] << ";\n" << + " index++; count++;\n" << + " start2++;\n" << + "}\n" << + counts[expr<uint_>("i")] << " = count;\n"; + } + + event exec(command_queue &queue) + { + if(m_count == 0) { + return event(); + } + + return exec_1d(queue, 0, m_count); + } + +private: + size_t m_count; +}; + +} //end detail namespace + +/// +/// \brief Set union algorithm +/// +/// Finds the union of the sorted range [first1, last1) with the sorted +/// range [first2, last2) and stores it in range starting at result +/// \return Iterator pointing to end of union +/// +/// \param first1 Iterator pointing to start of first set +/// \param last1 Iterator pointing to end of first set +/// \param first2 Iterator pointing to start of second set +/// \param last2 Iterator pointing to end of second set +/// \param result Iterator pointing to start of range in which the union +/// will be stored +/// \param queue Queue on which to execute +/// +template<class InputIterator1, class InputIterator2, class OutputIterator> +inline OutputIterator set_union(InputIterator1 first1, + InputIterator1 last1, + InputIterator2 first2, + InputIterator2 last2, + 
OutputIterator result, + command_queue &queue = system::default_queue()) +{ + typedef typename std::iterator_traits<InputIterator1>::value_type value_type; + + int tile_size = 1024; + + int count1 = detail::iterator_range_size(first1, last1); + int count2 = detail::iterator_range_size(first2, last2); + + vector<uint_> tile_a((count1+count2+tile_size-1)/tile_size+1, queue.get_context()); + vector<uint_> tile_b((count1+count2+tile_size-1)/tile_size+1, queue.get_context()); + + // Tile the sets + detail::balanced_path_kernel tiling_kernel; + tiling_kernel.tile_size = tile_size; + tiling_kernel.set_range(first1, last1, first2, last2, + tile_a.begin()+1, tile_b.begin()+1); + fill_n(tile_a.begin(), 1, 0, queue); + fill_n(tile_b.begin(), 1, 0, queue); + tiling_kernel.exec(queue); + + fill_n(tile_a.end()-1, 1, count1, queue); + fill_n(tile_b.end()-1, 1, count2, queue); + + vector<value_type> temp_result(count1+count2, queue.get_context()); + vector<uint_> counts((count1+count2+tile_size-1)/tile_size + 1, queue.get_context()); + fill_n(counts.end()-1, 1, 0, queue); + + // Find individual unions + detail::serial_set_union_kernel union_kernel; + union_kernel.tile_size = tile_size; + union_kernel.set_range(first1, first2, tile_a.begin(), tile_a.end(), + tile_b.begin(), temp_result.begin(), counts.begin()); + + union_kernel.exec(queue); + + exclusive_scan(counts.begin(), counts.end(), counts.begin(), queue); + + // Compact the results + detail::compact_kernel compact_kernel; + compact_kernel.tile_size = tile_size; + compact_kernel.set_range(temp_result.begin(), counts.begin(), counts.end(), result); + + compact_kernel.exec(queue); + + return result + (counts.end() - 1).read(queue); +} + +} //end compute namespace +} //end boost namespace + +#endif // BOOST_COMPUTE_ALGORITHM_SET_UNION_HPP diff --git a/boost/compute/algorithm/sort.hpp b/boost/compute/algorithm/sort.hpp new file mode 100644 index 0000000000..b2730b3e2b --- /dev/null +++ b/boost/compute/algorithm/sort.hpp @@ -0,0 
+1,194 @@ +//---------------------------------------------------------------------------// +// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com> +// +// Distributed under the Boost Software License, Version 1.0 +// See accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt +// +// See http://boostorg.github.com/compute for more information. +//---------------------------------------------------------------------------// + +#ifndef BOOST_COMPUTE_ALGORITHM_SORT_HPP +#define BOOST_COMPUTE_ALGORITHM_SORT_HPP + +#include <iterator> + +#include <boost/utility/enable_if.hpp> + +#include <boost/compute/system.hpp> +#include <boost/compute/command_queue.hpp> +#include <boost/compute/algorithm/detail/merge_sort_on_cpu.hpp> +#include <boost/compute/algorithm/detail/radix_sort.hpp> +#include <boost/compute/algorithm/detail/insertion_sort.hpp> +#include <boost/compute/algorithm/reverse.hpp> +#include <boost/compute/container/mapped_view.hpp> +#include <boost/compute/detail/iterator_range_size.hpp> +#include <boost/compute/iterator/buffer_iterator.hpp> +#include <boost/compute/type_traits/is_device_iterator.hpp> + +namespace boost { +namespace compute { +namespace detail { + +template<class T> +inline void dispatch_gpu_sort(buffer_iterator<T> first, + buffer_iterator<T> last, + less<T>, + command_queue &queue, + typename boost::enable_if_c< + is_radix_sortable<T>::value + >::type* = 0) +{ + size_t count = detail::iterator_range_size(first, last); + + if(count < 2){ + // nothing to do + return; + } + else if(count <= 32){ + ::boost::compute::detail::serial_insertion_sort(first, last, queue); + } + else { + ::boost::compute::detail::radix_sort(first, last, queue); + } +} + +template<class T> +inline void dispatch_gpu_sort(buffer_iterator<T> first, + buffer_iterator<T> last, + greater<T> compare, + command_queue &queue, + typename boost::enable_if_c< + is_radix_sortable<T>::value + >::type* = 0) +{ + size_t count = 
detail::iterator_range_size(first, last); + + if(count < 2){ + // nothing to do + return; + } + else if(count <= 32){ + ::boost::compute::detail::serial_insertion_sort( + first, last, compare, queue + ); + } + else { + // radix sort in ascending order + ::boost::compute::detail::radix_sort(first, last, queue); + + // reverse range to descending order + ::boost::compute::reverse(first, last, queue); + } +} + +template<class Iterator, class Compare> +inline void dispatch_gpu_sort(Iterator first, + Iterator last, + Compare compare, + command_queue &queue) +{ + ::boost::compute::detail::serial_insertion_sort( + first, last, compare, queue + ); +} + +// sort() for device iterators +template<class Iterator, class Compare> +inline void dispatch_sort(Iterator first, + Iterator last, + Compare compare, + command_queue &queue, + typename boost::enable_if< + is_device_iterator<Iterator> + >::type* = 0) +{ + if(queue.get_device().type() & device::gpu) { + dispatch_gpu_sort(first, last, compare, queue); + return; + } + ::boost::compute::detail::merge_sort_on_cpu(first, last, compare, queue); +} + +// sort() for host iterators +template<class Iterator, class Compare> +inline void dispatch_sort(Iterator first, + Iterator last, + Compare compare, + command_queue &queue, + typename boost::disable_if< + is_device_iterator<Iterator> + >::type* = 0) +{ + typedef typename std::iterator_traits<Iterator>::value_type T; + + size_t size = static_cast<size_t>(std::distance(first, last)); + + // create mapped buffer + mapped_view<T> view( + boost::addressof(*first), size, queue.get_context() + ); + + // sort mapped buffer + dispatch_sort(view.begin(), view.end(), compare, queue); + + // return results to host + view.map(queue); +} + +} // end detail namespace + +/// Sorts the values in the range [\p first, \p last) according to +/// \p compare. 
+/// +/// \param first first element in the range to sort +/// \param last last element in the range to sort +/// \param compare comparison function (by default \c less) +/// \param queue command queue to perform the operation +/// +/// For example, to sort a vector on the device: +/// \code +/// // create vector on the device with data +/// float data[] = { 2.f, 4.f, 1.f, 3.f }; +/// boost::compute::vector<float> vec(data, data + 4, queue); +/// +/// // sort the vector on the device +/// boost::compute::sort(vec.begin(), vec.end(), queue); +/// \endcode +/// +/// The sort() algorithm can also be directly used with host iterators. This +/// example will automatically transfer the data to the device, sort it, and +/// then transfer the data back to the host: +/// \code +/// std::vector<int> data = { 9, 3, 2, 5, 1, 4, 6, 7 }; +/// +/// boost::compute::sort(data.begin(), data.end(), queue); +/// \endcode +/// +/// \see is_sorted() +template<class Iterator, class Compare> +inline void sort(Iterator first, + Iterator last, + Compare compare, + command_queue &queue = system::default_queue()) +{ + ::boost::compute::detail::dispatch_sort(first, last, compare, queue); +} + +/// \overload +template<class Iterator> +inline void sort(Iterator first, + Iterator last, + command_queue &queue = system::default_queue()) +{ + typedef typename std::iterator_traits<Iterator>::value_type value_type; + + ::boost::compute::sort( + first, last, ::boost::compute::less<value_type>(), queue + ); +} + +} // end compute namespace +} // end boost namespace + +#endif // BOOST_COMPUTE_ALGORITHM_SORT_HPP diff --git a/boost/compute/algorithm/sort_by_key.hpp b/boost/compute/algorithm/sort_by_key.hpp new file mode 100644 index 0000000000..0e3dba81eb --- /dev/null +++ b/boost/compute/algorithm/sort_by_key.hpp @@ -0,0 +1,156 @@ +//---------------------------------------------------------------------------// +// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com> +// +// Distributed under the Boost 
Software License, Version 1.0 +// See accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt +// +// See http://boostorg.github.com/compute for more information. +//---------------------------------------------------------------------------// + +#ifndef BOOST_COMPUTE_ALGORITHM_SORT_BY_KEY_HPP +#define BOOST_COMPUTE_ALGORITHM_SORT_BY_KEY_HPP + +#include <iterator> + +#include <boost/utility/enable_if.hpp> + +#include <boost/compute/system.hpp> +#include <boost/compute/command_queue.hpp> +#include <boost/compute/algorithm/detail/merge_sort_on_cpu.hpp> +#include <boost/compute/algorithm/detail/insertion_sort.hpp> +#include <boost/compute/algorithm/detail/radix_sort.hpp> +#include <boost/compute/algorithm/reverse.hpp> +#include <boost/compute/detail/iterator_range_size.hpp> + +namespace boost { +namespace compute { + +namespace detail { + +template<class KeyIterator, class ValueIterator> +inline void +dispatch_gpu_sort_by_key(KeyIterator keys_first, + KeyIterator keys_last, + ValueIterator values_first, + less<typename std::iterator_traits<KeyIterator>::value_type> compare, + command_queue &queue, + typename boost::enable_if_c< + is_radix_sortable< + typename std::iterator_traits<KeyIterator>::value_type + >::value + >::type* = 0) +{ + size_t count = detail::iterator_range_size(keys_first, keys_last); + + if(count < 32){ + detail::serial_insertion_sort_by_key( + keys_first, keys_last, values_first, compare, queue + ); + } + else { + detail::radix_sort_by_key( + keys_first, keys_last, values_first, queue + ); + } +} + +template<class KeyIterator, class ValueIterator> +inline void +dispatch_gpu_sort_by_key(KeyIterator keys_first, + KeyIterator keys_last, + ValueIterator values_first, + greater<typename std::iterator_traits<KeyIterator>::value_type> compare, + command_queue &queue, + typename boost::enable_if_c< + is_radix_sortable< + typename std::iterator_traits<KeyIterator>::value_type + >::value + >::type* = 0) +{ + size_t count = 
detail::iterator_range_size(keys_first, keys_last); + + if(count < 32){ + detail::serial_insertion_sort_by_key( + keys_first, keys_last, values_first, compare, queue + ); + } + else { + // radix sorts in ascending order + detail::radix_sort_by_key( + keys_first, keys_last, values_first, queue + ); + + // Reverse keys, values for descending order + ::boost::compute::reverse(keys_first, keys_last, queue); + ::boost::compute::reverse(values_first, values_first + count, queue); + } +} + +template<class KeyIterator, class ValueIterator, class Compare> +inline void dispatch_gpu_sort_by_key(KeyIterator keys_first, + KeyIterator keys_last, + ValueIterator values_first, + Compare compare, + command_queue &queue) +{ + detail::serial_insertion_sort_by_key( + keys_first, keys_last, values_first, compare, queue + ); +} + +template<class KeyIterator, class ValueIterator, class Compare> +inline void dispatch_sort_by_key(KeyIterator keys_first, + KeyIterator keys_last, + ValueIterator values_first, + Compare compare, + command_queue &queue) +{ + if(queue.get_device().type() & device::gpu) { + dispatch_gpu_sort_by_key(keys_first, keys_last, values_first, compare, queue); + return; + } + ::boost::compute::detail::merge_sort_by_key_on_cpu( + keys_first, keys_last, values_first, compare, queue + ); +} + +} // end detail namespace + +/// Performs a key-value sort using the keys in the range [\p keys_first, +/// \p keys_last) on the values in the range [\p values_first, +/// \p values_first \c + (\p keys_last \c - \p keys_first)) using \p compare. +/// +/// If no compare function is specified, \c less is used. 
+/// +/// \see sort() +template<class KeyIterator, class ValueIterator, class Compare> +inline void sort_by_key(KeyIterator keys_first, + KeyIterator keys_last, + ValueIterator values_first, + Compare compare, + command_queue &queue = system::default_queue()) +{ + ::boost::compute::detail::dispatch_sort_by_key( + keys_first, keys_last, values_first, compare, queue + ); +} + +/// \overload +template<class KeyIterator, class ValueIterator> +inline void sort_by_key(KeyIterator keys_first, + KeyIterator keys_last, + ValueIterator values_first, + command_queue &queue = system::default_queue()) +{ + typedef typename std::iterator_traits<KeyIterator>::value_type key_type; + + ::boost::compute::sort_by_key( + keys_first, keys_last, values_first, less<key_type>(), queue + ); +} + +} // end compute namespace +} // end boost namespace + +#endif // BOOST_COMPUTE_ALGORITHM_SORT_BY_KEY_HPP diff --git a/boost/compute/algorithm/stable_partition.hpp b/boost/compute/algorithm/stable_partition.hpp new file mode 100644 index 0000000000..283b068283 --- /dev/null +++ b/boost/compute/algorithm/stable_partition.hpp @@ -0,0 +1,72 @@ +//---------------------------------------------------------------------------// +// Copyright (c) 2014 Roshan <thisisroshansmail@gmail.com> +// +// Distributed under the Boost Software License, Version 1.0 +// See accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt +// +// See http://boostorg.github.com/compute for more information. 
+//---------------------------------------------------------------------------// + +#ifndef BOOST_COMPUTE_ALGORITHM_STABLE_PARTITION_HPP +#define BOOST_COMPUTE_ALGORITHM_STABLE_PARTITION_HPP + +#include <boost/compute/system.hpp> +#include <boost/compute/context.hpp> +#include <boost/compute/functional.hpp> +#include <boost/compute/command_queue.hpp> +#include <boost/compute/algorithm/copy_if.hpp> +#include <boost/compute/container/vector.hpp> + +namespace boost { +namespace compute { + +/// +/// \brief Partitioning algorithm +/// +/// Partitions the elements in the range [\p first, \p last) according to +/// \p predicate. The order of the elements is preserved. +/// \return Iterator pointing to end of true values +/// +/// \param first Iterator pointing to start of range +/// \param last Iterator pointing to end of range +/// \param predicate Unary predicate to be applied on each element +/// \param queue Queue on which to execute +/// +/// \see is_partitioned() and partition() +/// +template<class Iterator, class UnaryPredicate> +inline Iterator stable_partition(Iterator first, + Iterator last, + UnaryPredicate predicate, + command_queue &queue = system::default_queue()) +{ + typedef typename std::iterator_traits<Iterator>::value_type value_type; + + // make temporary copy of the input + ::boost::compute::vector<value_type> tmp(first, last, queue); + + // copy true values + Iterator last_true = + ::boost::compute::copy_if(tmp.begin(), + tmp.end(), + first, + predicate, + queue); + + // copy false values + Iterator last_false = + ::boost::compute::copy_if(tmp.begin(), + tmp.end(), + last_true, + not1(predicate), + queue); + + // return iterator pointing to the last true value + return last_true; +} + +} // end compute namespace +} // end boost namespace + +#endif // BOOST_COMPUTE_ALGORITHM_STABLE_PARTITION_HPP diff --git a/boost/compute/algorithm/stable_sort.hpp b/boost/compute/algorithm/stable_sort.hpp new file mode 100644 index 0000000000..cd82a0a606 --- 
/dev/null +++ b/boost/compute/algorithm/stable_sort.hpp @@ -0,0 +1,99 @@ +//---------------------------------------------------------------------------// +// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com> +// +// Distributed under the Boost Software License, Version 1.0 +// See accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt +// +// See http://boostorg.github.com/compute for more information. +//---------------------------------------------------------------------------// + +#ifndef BOOST_COMPUTE_ALGORITHM_STABLE_SORT_HPP +#define BOOST_COMPUTE_ALGORITHM_STABLE_SORT_HPP + +#include <iterator> + +#include <boost/compute/system.hpp> +#include <boost/compute/command_queue.hpp> +#include <boost/compute/algorithm/detail/merge_sort_on_cpu.hpp> +#include <boost/compute/algorithm/detail/radix_sort.hpp> +#include <boost/compute/algorithm/detail/insertion_sort.hpp> +#include <boost/compute/algorithm/reverse.hpp> +#include <boost/compute/functional/operator.hpp> + +namespace boost { +namespace compute { +namespace detail { + +template<class Iterator, class Compare> +inline void dispatch_gpu_stable_sort(Iterator first, + Iterator last, + Compare compare, + command_queue &queue) +{ + ::boost::compute::detail::serial_insertion_sort( + first, last, compare, queue + ); +} + +template<class T> +inline typename boost::enable_if_c<is_radix_sortable<T>::value>::type +dispatch_gpu_stable_sort(buffer_iterator<T> first, + buffer_iterator<T> last, + less<T>, + command_queue &queue) +{ + ::boost::compute::detail::radix_sort(first, last, queue); +} + +template<class T> +inline typename boost::enable_if_c<is_radix_sortable<T>::value>::type +dispatch_gpu_stable_sort(buffer_iterator<T> first, + buffer_iterator<T> last, + greater<T>, + command_queue &queue) +{ + // radix sort in ascending order + ::boost::compute::detail::radix_sort(first, last, queue); + + // reverse range to descending order + ::boost::compute::reverse(first, last, queue); +} + +} 
// end detail namespace + +/// Sorts the values in the range [\p first, \p last) according to +/// \p compare. The relative order of identical values is preserved. +/// +/// \see sort(), is_sorted() +template<class Iterator, class Compare> +inline void stable_sort(Iterator first, + Iterator last, + Compare compare, + command_queue &queue = system::default_queue()) +{ + if(queue.get_device().type() & device::gpu) { + ::boost::compute::detail::dispatch_gpu_stable_sort( + first, last, compare, queue + ); + } + ::boost::compute::detail::merge_sort_on_cpu(first, last, compare, queue); +} + +/// \overload +template<class Iterator> +inline void stable_sort(Iterator first, + Iterator last, + command_queue &queue = system::default_queue()) +{ + typedef typename std::iterator_traits<Iterator>::value_type value_type; + + ::boost::compute::less<value_type> less; + + ::boost::compute::stable_sort(first, last, less, queue); +} + +} // end compute namespace +} // end boost namespace + +#endif // BOOST_COMPUTE_ALGORITHM_STABLE_SORT_HPP diff --git a/boost/compute/algorithm/stable_sort_by_key.hpp b/boost/compute/algorithm/stable_sort_by_key.hpp new file mode 100644 index 0000000000..8a51372ede --- /dev/null +++ b/boost/compute/algorithm/stable_sort_by_key.hpp @@ -0,0 +1,61 @@ +//---------------------------------------------------------------------------// +// Copyright (c) 2016 Jakub Szuppe <j.szuppe@gmail.com> +// +// Distributed under the Boost Software License, Version 1.0 +// See accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt +// +// See http://boostorg.github.com/compute for more information. 
+//---------------------------------------------------------------------------// + +#ifndef BOOST_COMPUTE_ALGORITHM_STABLE_SORT_BY_KEY_HPP +#define BOOST_COMPUTE_ALGORITHM_STABLE_SORT_BY_KEY_HPP + +#include <iterator> + +#include <boost/compute/system.hpp> +#include <boost/compute/command_queue.hpp> +#include <boost/compute/algorithm/sort_by_key.hpp> +#include <boost/compute/detail/iterator_range_size.hpp> + +namespace boost { +namespace compute { + +/// Performs a key-value stable sort using the keys in the range [\p keys_first, +/// \p keys_last) on the values in the range [\p values_first, +/// \p values_first \c + (\p keys_last \c - \p keys_first)) using \p compare. +/// +/// If no compare function is specified, \c less is used. +/// +/// \see sort() +template<class KeyIterator, class ValueIterator, class Compare> +inline void stable_sort_by_key(KeyIterator keys_first, + KeyIterator keys_last, + ValueIterator values_first, + Compare compare, + command_queue &queue = system::default_queue()) +{ + // sort_by_key is stable + ::boost::compute::sort_by_key( + keys_first, keys_last, values_first, compare, queue + ); +} + +/// \overload +template<class KeyIterator, class ValueIterator> +inline void stable_sort_by_key(KeyIterator keys_first, + KeyIterator keys_last, + ValueIterator values_first, + command_queue &queue = system::default_queue()) +{ + typedef typename std::iterator_traits<KeyIterator>::value_type key_type; + + ::boost::compute::stable_sort_by_key( + keys_first, keys_last, values_first, less<key_type>(), queue + ); +} + +} // end compute namespace +} // end boost namespace + +#endif // BOOST_COMPUTE_ALGORITHM_STABLE_SORT_BY_KEY_HPP diff --git a/boost/compute/algorithm/swap_ranges.hpp b/boost/compute/algorithm/swap_ranges.hpp new file mode 100644 index 0000000000..6ff3e14f6a --- /dev/null +++ b/boost/compute/algorithm/swap_ranges.hpp @@ -0,0 +1,44 @@ +//---------------------------------------------------------------------------// +// Copyright (c) 2013 
Kyle Lutz <kyle.r.lutz@gmail.com> +// +// Distributed under the Boost Software License, Version 1.0 +// See accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt +// +// See http://boostorg.github.com/compute for more information. +//---------------------------------------------------------------------------// + +#ifndef BOOST_COMPUTE_ALGORITHM_SWAP_RANGES_HPP +#define BOOST_COMPUTE_ALGORITHM_SWAP_RANGES_HPP + +#include <boost/compute/system.hpp> +#include <boost/compute/command_queue.hpp> +#include <boost/compute/algorithm/copy.hpp> +#include <boost/compute/container/vector.hpp> + +namespace boost { +namespace compute { + +/// Swaps the elements in the range [\p first1, \p last1) with the +/// elements in the range beginning at \p first2. +template<class Iterator1, class Iterator2> +inline Iterator2 swap_ranges(Iterator1 first1, + Iterator1 last1, + Iterator2 first2, + command_queue &queue = system::default_queue()) +{ + typedef typename std::iterator_traits<Iterator1>::value_type value_type; + + Iterator2 last2 = first2 + std::distance(first1, last1); + + ::boost::compute::vector<value_type> tmp(first1, last1, queue); + ::boost::compute::copy(first2, last2, first1, queue); + ::boost::compute::copy(tmp.begin(), tmp.end(), first2, queue); + + return last2; +} + +} // end compute namespace +} // end boost namespace + +#endif // BOOST_COMPUTE_ALGORITHM_SWAP_RANGES_HPP diff --git a/boost/compute/algorithm/transform.hpp b/boost/compute/algorithm/transform.hpp new file mode 100644 index 0000000000..022a4988bd --- /dev/null +++ b/boost/compute/algorithm/transform.hpp @@ -0,0 +1,76 @@ +//---------------------------------------------------------------------------// +// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com> +// +// Distributed under the Boost Software License, Version 1.0 +// See accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt +// +// See http://boostorg.github.com/compute for more 
information. +//---------------------------------------------------------------------------// + +#ifndef BOOST_COMPUTE_ALGORITHM_TRANSFORM_HPP +#define BOOST_COMPUTE_ALGORITHM_TRANSFORM_HPP + +#include <boost/compute/system.hpp> +#include <boost/compute/command_queue.hpp> +#include <boost/compute/algorithm/copy.hpp> +#include <boost/compute/iterator/transform_iterator.hpp> +#include <boost/compute/iterator/zip_iterator.hpp> +#include <boost/compute/functional/detail/unpack.hpp> + +namespace boost { +namespace compute { + +/// Transforms the elements in the range [\p first, \p last) using +/// \p transform and stores the results in the range beginning at +/// \p result. +/// +/// For example, to calculate the absolute value for each element in a vector: +/// +/// \snippet test/test_transform.cpp transform_abs +/// +/// \see copy() +template<class InputIterator, class OutputIterator, class UnaryOperator> +inline OutputIterator transform(InputIterator first, + InputIterator last, + OutputIterator result, + UnaryOperator op, + command_queue &queue = system::default_queue()) +{ + return copy( + ::boost::compute::make_transform_iterator(first, op), + ::boost::compute::make_transform_iterator(last, op), + result, + queue + ); +} + +/// \overload +template<class InputIterator1, + class InputIterator2, + class OutputIterator, + class BinaryOperator> +inline OutputIterator transform(InputIterator1 first1, + InputIterator1 last1, + InputIterator2 first2, + OutputIterator result, + BinaryOperator op, + command_queue &queue = system::default_queue()) +{ + typedef typename std::iterator_traits<InputIterator1>::difference_type difference_type; + + difference_type n = std::distance(first1, last1); + + return transform( + make_zip_iterator(boost::make_tuple(first1, first2)), + make_zip_iterator(boost::make_tuple(last1, first2 + n)), + result, + detail::unpack(op), + queue + ); +} + +} // end compute namespace +} // end boost namespace + +#endif // 
BOOST_COMPUTE_ALGORITHM_TRANSFORM_HPP diff --git a/boost/compute/algorithm/transform_if.hpp b/boost/compute/algorithm/transform_if.hpp new file mode 100644 index 0000000000..0eb0fd434e --- /dev/null +++ b/boost/compute/algorithm/transform_if.hpp @@ -0,0 +1,117 @@ +//---------------------------------------------------------------------------// +// Copyright (c) 2013-2015 Kyle Lutz <kyle.r.lutz@gmail.com> +// +// Distributed under the Boost Software License, Version 1.0 +// See accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt +// +// See http://boostorg.github.com/compute for more information. +//---------------------------------------------------------------------------// + +#ifndef BOOST_COMPUTE_ALGORITHM_TRANSFORM_IF_HPP +#define BOOST_COMPUTE_ALGORITHM_TRANSFORM_IF_HPP + +#include <boost/compute/cl.hpp> +#include <boost/compute/system.hpp> +#include <boost/compute/command_queue.hpp> +#include <boost/compute/algorithm/count.hpp> +#include <boost/compute/algorithm/count_if.hpp> +#include <boost/compute/algorithm/exclusive_scan.hpp> +#include <boost/compute/container/vector.hpp> +#include <boost/compute/detail/meta_kernel.hpp> +#include <boost/compute/detail/iterator_range_size.hpp> +#include <boost/compute/iterator/discard_iterator.hpp> + +namespace boost { +namespace compute { +namespace detail { + +template<class InputIterator, class OutputIterator, class UnaryFunction, class Predicate> +inline OutputIterator transform_if_impl(InputIterator first, + InputIterator last, + OutputIterator result, + UnaryFunction function, + Predicate predicate, + bool copyIndex, + command_queue &queue) +{ + typedef typename std::iterator_traits<OutputIterator>::difference_type difference_type; + + size_t count = detail::iterator_range_size(first, last); + if(count == 0){ + return result; + } + + const context &context = queue.get_context(); + + // storage for destination indices + ::boost::compute::vector<cl_uint> indices(count, context); + + // 
write counts + ::boost::compute::detail::meta_kernel k1("transform_if_write_counts"); + k1 << indices.begin()[k1.get_global_id(0)] << " = " + << predicate(first[k1.get_global_id(0)]) << " ? 1 : 0;\n"; + k1.exec_1d(queue, 0, count); + + // count number of elements to be copied + size_t copied_element_count = + ::boost::compute::count(indices.begin(), indices.end(), 1, queue); + + // scan indices + ::boost::compute::exclusive_scan( + indices.begin(), indices.end(), indices.begin(), queue + ); + + // copy values + ::boost::compute::detail::meta_kernel k2("transform_if_do_copy"); + k2 << "if(" << predicate(first[k2.get_global_id(0)]) << ")" << + " " << result[indices.begin()[k2.get_global_id(0)]] << "="; + + if(copyIndex){ + k2 << k2.get_global_id(0) << ";\n"; + } + else { + k2 << function(first[k2.get_global_id(0)]) << ";\n"; + } + + k2.exec_1d(queue, 0, count); + + return result + static_cast<difference_type>(copied_element_count); +} + +template<class InputIterator, class UnaryFunction, class Predicate> +inline discard_iterator transform_if_impl(InputIterator first, + InputIterator last, + discard_iterator result, + UnaryFunction function, + Predicate predicate, + bool copyIndex, + command_queue &queue) +{ + (void) function; + (void) copyIndex; + + return result + count_if(first, last, predicate, queue); +} + +} // end detail namespace + +/// Copies each element in the range [\p first, \p last) for which +/// \p predicate returns \c true to the range beginning at \p result. 
+template<class InputIterator, class OutputIterator, class UnaryFunction, class Predicate> +inline OutputIterator transform_if(InputIterator first, + InputIterator last, + OutputIterator result, + UnaryFunction function, + Predicate predicate, + command_queue &queue = system::default_queue()) +{ + return detail::transform_if_impl( + first, last, result, function, predicate, false, queue + ); +} + +} // end compute namespace +} // end boost namespace + +#endif // BOOST_COMPUTE_ALGORITHM_TRANSFORM_IF_HPP diff --git a/boost/compute/algorithm/transform_reduce.hpp b/boost/compute/algorithm/transform_reduce.hpp new file mode 100644 index 0000000000..fbeee5a691 --- /dev/null +++ b/boost/compute/algorithm/transform_reduce.hpp @@ -0,0 +1,89 @@ +//---------------------------------------------------------------------------// +// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com> +// +// Distributed under the Boost Software License, Version 1.0 +// See accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt +// +// See http://boostorg.github.com/compute for more information. +//---------------------------------------------------------------------------// + +#ifndef BOOST_COMPUTE_ALGORITHM_TRANSFORM_REDUCE_HPP +#define BOOST_COMPUTE_ALGORITHM_TRANSFORM_REDUCE_HPP + +#include <boost/compute/system.hpp> +#include <boost/compute/algorithm/reduce.hpp> +#include <boost/compute/iterator/transform_iterator.hpp> +#include <boost/compute/iterator/zip_iterator.hpp> +#include <boost/compute/functional/detail/unpack.hpp> +#include <boost/compute/detail/iterator_range_size.hpp> + +namespace boost { +namespace compute { + +/// Transforms each value in the range [\p first, \p last) with the unary +/// \p transform_function and then reduces each transformed value with +/// \p reduce_function. 
+/// +/// For example, to calculate the sum of the absolute values of a vector +/// of integers: +/// +/// \snippet test/test_transform_reduce.cpp sum_abs_int +/// +/// \see reduce(), inner_product() +template<class InputIterator, + class OutputIterator, + class UnaryTransformFunction, + class BinaryReduceFunction> +inline void transform_reduce(InputIterator first, + InputIterator last, + OutputIterator result, + UnaryTransformFunction transform_function, + BinaryReduceFunction reduce_function, + command_queue &queue = system::default_queue()) +{ + ::boost::compute::reduce( + ::boost::compute::make_transform_iterator(first, transform_function), + ::boost::compute::make_transform_iterator(last, transform_function), + result, + reduce_function, + queue + ); +} + +/// \overload +template<class InputIterator1, + class InputIterator2, + class OutputIterator, + class BinaryTransformFunction, + class BinaryReduceFunction> +inline void transform_reduce(InputIterator1 first1, + InputIterator1 last1, + InputIterator2 first2, + OutputIterator result, + BinaryTransformFunction transform_function, + BinaryReduceFunction reduce_function, + command_queue &queue = system::default_queue()) +{ + typedef typename std::iterator_traits<InputIterator1>::difference_type difference_type; + + difference_type n = std::distance(first1, last1); + + ::boost::compute::transform_reduce( + ::boost::compute::make_zip_iterator( + boost::make_tuple(first1, first2) + ), + ::boost::compute::make_zip_iterator( + boost::make_tuple(last1, first2 + n) + ), + result, + detail::unpack(transform_function), + reduce_function, + queue + ); +} + +} // end compute namespace +} // end boost namespace + +#endif // BOOST_COMPUTE_ALGORITHM_TRANSFORM_REDUCE_HPP diff --git a/boost/compute/algorithm/unique.hpp b/boost/compute/algorithm/unique.hpp new file mode 100644 index 0000000000..faa36bad9d --- /dev/null +++ b/boost/compute/algorithm/unique.hpp @@ -0,0 +1,66 @@ 
+//---------------------------------------------------------------------------// +// Copyright (c) 2014 Roshan <thisisroshansmail@gmail.com> +// +// Distributed under the Boost Software License, Version 1.0 +// See accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt +// +// See http://boostorg.github.com/compute for more information. +//---------------------------------------------------------------------------// + +#ifndef BOOST_COMPUTE_ALGORITHM_UNIQUE_HPP +#define BOOST_COMPUTE_ALGORITHM_UNIQUE_HPP + +#include <boost/compute/system.hpp> +#include <boost/compute/command_queue.hpp> +#include <boost/compute/algorithm/unique_copy.hpp> +#include <boost/compute/container/vector.hpp> +#include <boost/compute/functional/operator.hpp> + +namespace boost { +namespace compute { + +/// Removes all consecutive duplicate elements (determined by \p op) from the +/// range [first, last). If \p op is not provided, the equality operator is +/// used. +/// +/// \param first first element in the input range +/// \param last last element in the input range +/// \param op binary operator used to check for uniqueness +/// \param queue command queue to perform the operation +/// +/// \return \c InputIterator to the new logical end of the range +/// +/// \see unique_copy() +template<class InputIterator, class BinaryPredicate> +inline InputIterator unique(InputIterator first, + InputIterator last, + BinaryPredicate op, + command_queue &queue = system::default_queue()) +{ + typedef typename std::iterator_traits<InputIterator>::value_type value_type; + + vector<value_type> temp(first, last, queue); + + return ::boost::compute::unique_copy( + temp.begin(), temp.end(), first, op, queue + ); +} + +/// \overload +template<class InputIterator> +inline InputIterator unique(InputIterator first, + InputIterator last, + command_queue &queue = system::default_queue()) +{ + typedef typename std::iterator_traits<InputIterator>::value_type value_type; + + return 
::boost::compute::unique( + first, last, ::boost::compute::equal_to<value_type>(), queue + ); +} + +} // end compute namespace +} // end boost namespace + +#endif // BOOST_COMPUTE_ALGORITHM_UNIQUE_HPP diff --git a/boost/compute/algorithm/unique_copy.hpp b/boost/compute/algorithm/unique_copy.hpp new file mode 100644 index 0000000000..2ce60a9359 --- /dev/null +++ b/boost/compute/algorithm/unique_copy.hpp @@ -0,0 +1,164 @@ +//---------------------------------------------------------------------------// +// Copyright (c) 2014 Roshan <thisisroshansmail@gmail.com> +// +// Distributed under the Boost Software License, Version 1.0 +// See accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt +// +// See http://boostorg.github.com/compute for more information. +//---------------------------------------------------------------------------// + +#ifndef BOOST_COMPUTE_ALGORITHM_UNIQUE_COPY_HPP +#define BOOST_COMPUTE_ALGORITHM_UNIQUE_COPY_HPP + +#include <boost/compute/command_queue.hpp> +#include <boost/compute/lambda.hpp> +#include <boost/compute/system.hpp> +#include <boost/compute/algorithm/copy_if.hpp> +#include <boost/compute/algorithm/transform.hpp> +#include <boost/compute/algorithm/gather.hpp> +#include <boost/compute/container/vector.hpp> +#include <boost/compute/detail/iterator_range_size.hpp> +#include <boost/compute/detail/meta_kernel.hpp> +#include <boost/compute/functional/operator.hpp> + +namespace boost { +namespace compute { +namespace detail { + +template<class InputIterator, class OutputIterator, class BinaryPredicate> +inline OutputIterator serial_unique_copy(InputIterator first, + InputIterator last, + OutputIterator result, + BinaryPredicate op, + command_queue &queue) +{ + if(first == last){ + return result; + } + + typedef typename std::iterator_traits<InputIterator>::value_type value_type; + + const context &context = queue.get_context(); + + size_t count = detail::iterator_range_size(first, last); + + detail::meta_kernel 
k("serial_unique_copy"); + + vector<uint_> unique_count_vector(1, context); + + size_t size_arg = k.add_arg<const uint_>("size"); + size_t unique_count_arg = k.add_arg<uint_ *>(memory_object::global_memory, "unique_count"); + + k << k.decl<uint_>("index") << " = 0;\n" + << k.decl<value_type>("current") << " = " << first[k.var<uint_>("0")] << ";\n" + << result[k.var<uint_>("0")] << " = current;\n" + << "for(uint i = 1; i < size; i++){\n" + << " " << k.decl<value_type>("next") << " = " << first[k.var<uint_>("i")] << ";\n" + << " if(!" << op(k.var<value_type>("current"), k.var<value_type>("next")) << "){\n" + << " " << result[k.var<uint_>("++index")] << " = next;\n" + << " " << "current = next;\n" + << " }\n" + << "}\n" + << "*unique_count = index + 1;\n"; + + k.set_arg<const uint_>(size_arg, count); + k.set_arg(unique_count_arg, unique_count_vector.get_buffer()); + + k.exec_1d(queue, 0, 1, 1); + + uint_ unique_count; + copy_n(unique_count_vector.begin(), 1, &unique_count, queue); + + return result + unique_count; +} + +template<class InputIterator, class OutputIterator, class BinaryPredicate> +inline OutputIterator unique_copy(InputIterator first, + InputIterator last, + OutputIterator result, + BinaryPredicate op, + command_queue &queue) +{ + if(first == last){ + return result; + } + + const context &context = queue.get_context(); + size_t count = detail::iterator_range_size(first, last); + + // flags marking unique elements + vector<uint_> flags(count, context); + + // find each unique element and mark it with a one + transform( + first, last - 1, first + 1, flags.begin() + 1, not2(op), queue + ); + + // first element is always unique + fill_n(flags.begin(), 1, 1, queue); + + // storage for desination indices + vector<uint_> indices(count, context); + + // copy indices for each unique element + vector<uint_>::iterator last_index = detail::copy_index_if( + flags.begin(), flags.end(), indices.begin(), lambda::_1 == 1, queue + ); + + // copy unique values from input 
to output using the computed indices + gather(indices.begin(), last_index, first, result, queue); + + // return an iterator to the end of the unique output range + return result + std::distance(indices.begin(), last_index); +} + +} // end detail namespace + +/// Makes a copy of the range [first, last) and removes all consecutive +/// duplicate elements (determined by \p op) from the copy. If \p op is not +/// provided, the equality operator is used. +/// +/// \param first first element in the input range +/// \param last last element in the input range +/// \param result first element in the result range +/// \param op binary operator used to check for uniqueness +/// \param queue command queue to perform the operation +/// +/// \return \c OutputIterator to the end of the result range +/// +/// \see unique() +template<class InputIterator, class OutputIterator, class BinaryPredicate> +inline OutputIterator unique_copy(InputIterator first, + InputIterator last, + OutputIterator result, + BinaryPredicate op, + command_queue &queue = system::default_queue()) +{ + size_t count = detail::iterator_range_size(first, last); + if(count < 32){ + return detail::serial_unique_copy(first, last, result, op, queue); + } + else { + return detail::unique_copy(first, last, result, op, queue); + } +} + +/// \overload +template<class InputIterator, class OutputIterator> +inline OutputIterator unique_copy(InputIterator first, + InputIterator last, + OutputIterator result, + command_queue &queue = system::default_queue()) +{ + typedef typename std::iterator_traits<InputIterator>::value_type value_type; + + return ::boost::compute::unique_copy( + first, last, result, ::boost::compute::equal_to<value_type>(), queue + ); +} + +} // end compute namespace +} // end boost namespace + +#endif // BOOST_COMPUTE_ALGORITHM_UNIQUE_COPY_HPP diff --git a/boost/compute/algorithm/upper_bound.hpp b/boost/compute/algorithm/upper_bound.hpp new file mode 100644 index 0000000000..a5a82d301c --- /dev/null +++ 
b/boost/compute/algorithm/upper_bound.hpp @@ -0,0 +1,43 @@ +//---------------------------------------------------------------------------// +// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com> +// +// Distributed under the Boost Software License, Version 1.0 +// See accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt +// +// See http://boostorg.github.com/compute for more information. +//---------------------------------------------------------------------------// + +#ifndef BOOST_COMPUTE_ALGORITHM_UPPER_BOUND_HPP +#define BOOST_COMPUTE_ALGORITHM_UPPER_BOUND_HPP + +#include <boost/compute/lambda.hpp> +#include <boost/compute/system.hpp> +#include <boost/compute/command_queue.hpp> +#include <boost/compute/algorithm/detail/binary_find.hpp> + +namespace boost { +namespace compute { + +/// Returns an iterator pointing to the first element in the sorted +/// range [\p first, \p last) that is not less than or equal to +/// \p value. +template<class InputIterator, class T> +inline InputIterator +upper_bound(InputIterator first, + InputIterator last, + const T &value, + command_queue &queue = system::default_queue()) +{ + using ::boost::compute::_1; + + InputIterator position = + detail::binary_find(first, last, _1 > value, queue); + + return position; +} + +} // end compute namespace +} // end boost namespace + +#endif // BOOST_COMPUTE_ALGORITHM_UPPER_BOUND_HPP |