Diffstat (limited to 'boost/compute')
-rw-r--r--  boost/compute/algorithm/adjacent_difference.hpp |  76
-rw-r--r--  boost/compute/algorithm/copy.hpp | 584
-rw-r--r--  boost/compute/algorithm/detail/copy_on_device.hpp | 154
-rw-r--r--  boost/compute/algorithm/detail/copy_to_device.hpp |  68
-rw-r--r--  boost/compute/algorithm/detail/copy_to_host.hpp |  83
-rw-r--r--  boost/compute/algorithm/detail/find_extrema.hpp |  12
-rw-r--r--  boost/compute/algorithm/detail/find_extrema_on_cpu.hpp | 138
-rw-r--r--  boost/compute/algorithm/detail/find_extrema_with_reduce.hpp |   2
-rw-r--r--  boost/compute/algorithm/detail/merge_sort_on_gpu.hpp | 590
-rw-r--r--  boost/compute/algorithm/detail/radix_sort.hpp |  50
-rw-r--r--  boost/compute/algorithm/detail/reduce_on_cpu.hpp | 110
-rw-r--r--  boost/compute/algorithm/detail/scan_on_cpu.hpp | 160
-rw-r--r--  boost/compute/algorithm/detail/scan_on_gpu.hpp |   1
-rw-r--r--  boost/compute/algorithm/detail/search_all.hpp |   2
-rw-r--r--  boost/compute/algorithm/detail/serial_scan.hpp | 103
-rw-r--r--  boost/compute/algorithm/find_end.hpp |  33
-rw-r--r--  boost/compute/algorithm/gather.hpp |   4
-rw-r--r--  boost/compute/algorithm/minmax_element.hpp |   2
-rw-r--r--  boost/compute/algorithm/reduce.hpp |  14
-rw-r--r--  boost/compute/algorithm/sort.hpp |  27
-rw-r--r--  boost/compute/algorithm/sort_by_key.hpp |  23
-rw-r--r--  boost/compute/algorithm/stable_sort.hpp |  24
-rw-r--r--  boost/compute/algorithm/stable_sort_by_key.hpp | 102
-rw-r--r--  boost/compute/algorithm/transform.hpp |   2
-rw-r--r--  boost/compute/config.hpp |  14
-rw-r--r--  boost/compute/container/array.hpp |  48
-rw-r--r--  boost/compute/container/vector.hpp |  24
-rw-r--r--  boost/compute/context.hpp |   2
-rw-r--r--  boost/compute/detail/literal.hpp |  10
-rw-r--r--  boost/compute/detail/meta_kernel.hpp |  89
-rw-r--r--  boost/compute/detail/parameter_cache.hpp |   2
-rw-r--r--  boost/compute/detail/vendor.hpp |  12
-rw-r--r--  boost/compute/exception/context_error.hpp |   2
-rw-r--r--  boost/compute/function.hpp |   2
-rw-r--r--  boost/compute/interop/opengl/acquire.hpp |   5
-rw-r--r--  boost/compute/interop/opengl/context.hpp |   2
-rw-r--r--  boost/compute/kernel.hpp |  38
-rw-r--r--  boost/compute/memory/svm_ptr.hpp | 103
-rw-r--r--  boost/compute/random/bernoulli_distribution.hpp |   8
-rw-r--r--  boost/compute/random/discrete_distribution.hpp |  91
-rw-r--r--  boost/compute/random/normal_distribution.hpp |  24
-rw-r--r--  boost/compute/random/uniform_int_distribution.hpp |   8
-rw-r--r--  boost/compute/random/uniform_real_distribution.hpp |  17
-rw-r--r--  boost/compute/svm.hpp |  14
-rw-r--r--  boost/compute/system.hpp |  14
-rw-r--r--  boost/compute/utility/wait_list.hpp |  16
46 files changed, 2584 insertions, 325 deletions
diff --git a/boost/compute/algorithm/adjacent_difference.hpp b/boost/compute/algorithm/adjacent_difference.hpp
index a8f84e020e..ef13970754 100644
--- a/boost/compute/algorithm/adjacent_difference.hpp
+++ b/boost/compute/algorithm/adjacent_difference.hpp
@@ -23,33 +23,17 @@
namespace boost {
namespace compute {
-/// Stores the difference of each pair of consecutive values in the range
-/// [\p first, \p last) to the range beginning at \p result. If \p op is not
-/// provided, \c minus<T> is used.
-///
-/// \param first first element in the input range
-/// \param last last element in the input range
-/// \param result first element in the output range
-/// \param op binary difference function
-/// \param queue command queue to perform the operation
-///
-/// \return \c OutputIterator to the end of the result range
-///
-/// \see adjacent_find()
+namespace detail {
+
template<class InputIterator, class OutputIterator, class BinaryFunction>
inline OutputIterator
-adjacent_difference(InputIterator first,
- InputIterator last,
- OutputIterator result,
- BinaryFunction op,
- command_queue &queue = system::default_queue())
+dispatch_adjacent_difference(InputIterator first,
+ InputIterator last,
+ OutputIterator result,
+ BinaryFunction op,
+ command_queue &queue = system::default_queue())
{
- if(first == last){
- return result;
- }
-
size_t count = detail::iterator_range_size(first, last);
-
detail::meta_kernel k("adjacent_difference");
k << "const uint i = get_global_id(0);\n"
@@ -66,32 +50,66 @@ adjacent_difference(InputIterator first,
return result + count;
}
-/// \overload
-template<class InputIterator, class OutputIterator>
+} // end detail namespace
+
+/// Stores the difference of each pair of consecutive values in the range
+/// [\p first, \p last) to the range beginning at \p result. If \p op is not
+/// provided, \c minus<T> is used.
+///
+/// \param first first element in the input range
+/// \param last last element in the input range
+/// \param result first element in the output range
+/// \param op binary difference function
+/// \param queue command queue to perform the operation
+///
+/// \return \c OutputIterator to the end of the result range
+///
+/// \see adjacent_find()
+template<class InputIterator, class OutputIterator, class BinaryFunction>
inline OutputIterator
adjacent_difference(InputIterator first,
InputIterator last,
OutputIterator result,
+ BinaryFunction op,
command_queue &queue = system::default_queue())
{
typedef typename std::iterator_traits<InputIterator>::value_type value_type;
+ if(first == last) {
+ return result;
+ }
+
if (first == result) {
vector<value_type> temp(detail::iterator_range_size(first, last),
queue.get_context());
copy(first, last, temp.begin(), queue);
- return ::boost::compute::adjacent_difference(
- temp.begin(), temp.end(), result, ::boost::compute::minus<value_type>(), queue
+ return ::boost::compute::detail::dispatch_adjacent_difference(
+ temp.begin(), temp.end(), result, op, queue
);
}
else {
- return ::boost::compute::adjacent_difference(
- first, last, result, ::boost::compute::minus<value_type>(), queue
+ return ::boost::compute::detail::dispatch_adjacent_difference(
+ first, last, result, op, queue
);
}
}
+/// \overload
+template<class InputIterator, class OutputIterator>
+inline OutputIterator
+adjacent_difference(InputIterator first,
+ InputIterator last,
+ OutputIterator result,
+ command_queue &queue = system::default_queue())
+{
+ typedef typename std::iterator_traits<InputIterator>::value_type value_type;
+
+ return ::boost::compute::adjacent_difference(
+ first, last, result, ::boost::compute::minus<value_type>(), queue
+ );
+}
+
} // end compute namespace
} // end boost namespace
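
For illustration, a minimal host-side sketch (not part of the patch) of the
public adjacent_difference() API after this refactoring; it assumes a default
OpenCL device is available:

#include <boost/compute/algorithm/adjacent_difference.hpp>
#include <boost/compute/container/vector.hpp>
#include <boost/compute/core.hpp>

int main()
{
    namespace compute = boost::compute;

    compute::device dev = compute::system::default_device();
    compute::context ctx(dev);
    compute::command_queue queue(ctx, dev);

    int data[] = { 1, 4, 9, 16, 25 };
    compute::vector<int> input(data, data + 5, queue);
    compute::vector<int> output(5, ctx);

    // without an explicit op, minus<int> is used:
    // output holds { 1, 3, 5, 7, 9 }
    compute::adjacent_difference(
        input.begin(), input.end(), output.begin(), queue
    );

    return 0;
}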
diff --git a/boost/compute/algorithm/copy.hpp b/boost/compute/algorithm/copy.hpp
index 2a25059bba..7779277b82 100644
--- a/boost/compute/algorithm/copy.hpp
+++ b/boost/compute/algorithm/copy.hpp
@@ -18,6 +18,7 @@
#include <boost/mpl/and.hpp>
#include <boost/mpl/not.hpp>
+#include <boost/mpl/or.hpp>
#include <boost/compute/buffer.hpp>
#include <boost/compute/system.hpp>
@@ -26,9 +27,13 @@
#include <boost/compute/algorithm/detail/copy_to_device.hpp>
#include <boost/compute/algorithm/detail/copy_to_host.hpp>
#include <boost/compute/async/future.hpp>
+#include <boost/compute/container/mapped_view.hpp>
+#include <boost/compute/detail/device_ptr.hpp>
#include <boost/compute/detail/is_contiguous_iterator.hpp>
#include <boost/compute/detail/iterator_range_size.hpp>
+#include <boost/compute/detail/parameter_cache.hpp>
#include <boost/compute/iterator/buffer_iterator.hpp>
+#include <boost/compute/type_traits/type_name.hpp>
#include <boost/compute/type_traits/is_device_iterator.hpp>
namespace boost {
@@ -42,13 +47,25 @@ namespace mpl = boost::mpl;
template<class InputIterator, class OutputIterator>
struct can_copy_with_copy_buffer :
mpl::and_<
- boost::is_same<
- InputIterator,
- buffer_iterator<typename InputIterator::value_type>
+ mpl::or_<
+ boost::is_same<
+ InputIterator,
+ buffer_iterator<typename InputIterator::value_type>
+ >,
+ boost::is_same<
+ InputIterator,
+ detail::device_ptr<typename InputIterator::value_type>
+ >
>,
- boost::is_same<
- OutputIterator,
- buffer_iterator<typename OutputIterator::value_type>
+ mpl::or_<
+ boost::is_same<
+ OutputIterator,
+ buffer_iterator<typename OutputIterator::value_type>
+ >,
+ boost::is_same<
+ OutputIterator,
+ detail::device_ptr<typename OutputIterator::value_type>
+ >
>,
boost::is_same<
typename InputIterator::value_type,
@@ -56,40 +73,72 @@ struct can_copy_with_copy_buffer :
>
>::type {};
-// host -> device
+// meta-function returning true if value_types of HostIterator and
+// DeviceIterator are same
+template<class HostIterator, class DeviceIterator>
+struct is_same_value_type :
+ boost::is_same<
+ typename boost::remove_cv<
+ typename std::iterator_traits<HostIterator>::value_type
+ >::type,
+ typename boost::remove_cv<
+ typename DeviceIterator::value_type
+ >::type
+ >::type {};
+
+// meta-function returning true if value_type of HostIterator is bool
+template<class HostIterator>
+struct is_bool_value_type :
+ boost::is_same<
+ typename boost::remove_cv<
+ typename std::iterator_traits<HostIterator>::value_type
+ >::type,
+ bool
+ >::type {};
+
+// host -> device (async)
template<class InputIterator, class OutputIterator>
-inline OutputIterator
-dispatch_copy(InputIterator first,
- InputIterator last,
- OutputIterator result,
- command_queue &queue,
- typename boost::enable_if_c<
- !is_device_iterator<InputIterator>::value &&
- is_device_iterator<OutputIterator>::value
- >::type* = 0)
+inline future<OutputIterator>
+dispatch_copy_async(InputIterator first,
+ InputIterator last,
+ OutputIterator result,
+ command_queue &queue,
+ typename boost::enable_if<
+ mpl::and_<
+ mpl::not_<
+ is_device_iterator<InputIterator>
+ >,
+ is_device_iterator<OutputIterator>,
+ is_same_value_type<InputIterator, OutputIterator>
+ >
+ >::type* = 0)
{
- if(is_contiguous_iterator<InputIterator>::value){
- return copy_to_device(first, last, result, queue);
- }
- else {
- // for non-contiguous input we first copy the values to
- // a temporary std::vector and then copy from there
- typedef typename std::iterator_traits<InputIterator>::value_type T;
- std::vector<T> vector(first, last);
- return copy_to_device(vector.begin(), vector.end(), result, queue);
- }
+ BOOST_STATIC_ASSERT_MSG(
+ is_contiguous_iterator<InputIterator>::value,
+ "copy_async() is only supported for contiguous host iterators"
+ );
+
+ return copy_to_device_async(first, last, result, queue);
}
// host -> device (async)
+// Type mismatch between InputIterator and OutputIterator value_types
template<class InputIterator, class OutputIterator>
inline future<OutputIterator>
dispatch_copy_async(InputIterator first,
InputIterator last,
OutputIterator result,
command_queue &queue,
- typename boost::enable_if_c<
- !is_device_iterator<InputIterator>::value &&
- is_device_iterator<OutputIterator>::value
+ typename boost::enable_if<
+ mpl::and_<
+ mpl::not_<
+ is_device_iterator<InputIterator>
+ >,
+ is_device_iterator<OutputIterator>,
+ mpl::not_<
+ is_same_value_type<InputIterator, OutputIterator>
+ >
+ >
>::type* = 0)
{
BOOST_STATIC_ASSERT_MSG(
@@ -97,32 +146,211 @@ dispatch_copy_async(InputIterator first,
"copy_async() is only supported for contiguous host iterators"
);
- return copy_to_device_async(first, last, result, queue);
+ typedef typename std::iterator_traits<InputIterator>::value_type input_type;
+
+ const context &context = queue.get_context();
+ size_t count = iterator_range_size(first, last);
+
+ if(count < size_t(1)) {
+ return future<OutputIterator>();
+ }
+
+ // map [first; last) to device and run copy kernel
+ // on device for copying & casting
+ ::boost::compute::mapped_view<input_type> mapped_host(
+ // make sure it's a pointer to constant data
+ // to force read only mapping
+ const_cast<const input_type*>(
+ ::boost::addressof(*first)
+ ),
+ count,
+ context
+ );
+ return copy_on_device_async(
+ mapped_host.begin(), mapped_host.end(), result, queue
+ );
}
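
The converting path above relies on mapped_view, which wraps existing host
memory in a zero-copy device buffer. A minimal sketch (illustrative; the
function name is hypothetical):

#include <cstddef>
#include <boost/compute/container/mapped_view.hpp>
#include <boost/compute/core.hpp>

void wrap_host_array(boost::compute::command_queue &queue,
                     const float *host_data, std::size_t count)
{
    // no copy happens here; the view aliases the host memory
    boost::compute::mapped_view<float> view(
        host_data, count, queue.get_context()
    );

    // view.begin()/view.end() are device iterators, usable e.g. with
    // copy_on_device_async() as in the dispatch function above
}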
-// device -> host
+// host -> device
+// InputIterator is a contiguous iterator
template<class InputIterator, class OutputIterator>
inline OutputIterator
dispatch_copy(InputIterator first,
InputIterator last,
OutputIterator result,
command_queue &queue,
- typename boost::enable_if_c<
- is_device_iterator<InputIterator>::value &&
- !is_device_iterator<OutputIterator>::value
+ typename boost::enable_if<
+ mpl::and_<
+ mpl::not_<
+ is_device_iterator<InputIterator>
+ >,
+ is_device_iterator<OutputIterator>,
+ is_same_value_type<InputIterator, OutputIterator>,
+ is_contiguous_iterator<InputIterator>
+ >
+ >::type* = 0)
+{
+ return copy_to_device(first, last, result, queue);
+}
+
+// host -> device
+// Type mismatch between InputIterator and OutputIterator value_types
+// InputIterator is a contiguous iterator
+template<class InputIterator, class OutputIterator>
+inline OutputIterator
+dispatch_copy(InputIterator first,
+ InputIterator last,
+ OutputIterator result,
+ command_queue &queue,
+ typename boost::enable_if<
+ mpl::and_<
+ mpl::not_<
+ is_device_iterator<InputIterator>
+ >,
+ is_device_iterator<OutputIterator>,
+ mpl::not_<
+ is_same_value_type<InputIterator, OutputIterator>
+ >,
+ is_contiguous_iterator<InputIterator>
+ >
>::type* = 0)
{
- if(is_contiguous_iterator<OutputIterator>::value){
- return copy_to_host(first, last, result, queue);
+ typedef typename OutputIterator::value_type output_type;
+ typedef typename std::iterator_traits<InputIterator>::value_type input_type;
+
+ const device &device = queue.get_device();
+
+ // loading parameters
+ std::string cache_key =
+ std::string("__boost_compute_copy_to_device_")
+ + type_name<input_type>() + "_" + type_name<output_type>();
+ boost::shared_ptr<parameter_cache> parameters =
+ detail::parameter_cache::get_global_cache(device);
+
+ size_t map_copy_threshold;
+ size_t direct_copy_threshold;
+
+ // calculate default values of thresholds
+ if (device.type() & device::gpu) {
+ // GPUs
+ map_copy_threshold = 524288; // 0.5 MB
+ direct_copy_threshold = 52428800; // 50 MB
}
else {
- // for non-contiguous input we first copy the values to
- // a temporary std::vector and then copy from there
- typedef typename std::iterator_traits<InputIterator>::value_type T;
- std::vector<T> vector(iterator_range_size(first, last));
- copy_to_host(first, last, vector.begin(), queue);
- return std::copy(vector.begin(), vector.end(), result);
+ // CPUs and other devices
+ map_copy_threshold = 134217728; // 128 MB
+ direct_copy_threshold = 0; // it's never efficient for CPUs
}
+
+ // load thresholds
+ map_copy_threshold =
+ parameters->get(
+ cache_key, "map_copy_threshold", map_copy_threshold
+ );
+ direct_copy_threshold =
+ parameters->get(
+ cache_key, "direct_copy_threshold", direct_copy_threshold
+ );
+
+ // select copy method based on thresholds & input_size_bytes
+ size_t count = iterator_range_size(first, last);
+ size_t input_size_bytes = count * sizeof(input_type);
+
+ // [0; map_copy_threshold) -> copy_to_device_map()
+ if(input_size_bytes < map_copy_threshold) {
+ return copy_to_device_map(first, last, result, queue);
+ }
+ // [map_copy_threshold; direct_copy_threshold) -> convert [first; last)
+ // on host and then perform copy_to_device()
+ else if(input_size_bytes < direct_copy_threshold) {
+ std::vector<output_type> vector(first, last);
+ return copy_to_device(vector.begin(), vector.end(), result, queue);
+ }
+
+ // [direct_copy_threshold; inf) -> map [first; last) to device and
+ // run copy kernel on device for copying & casting
+
+ // Perform async copy to device, wait for it to finish and
+ // return the result.
+ // At this point we are sure that count >= 1 (first != last), so the
+ // event returned by dispatch_copy_async() must be valid.
+ return dispatch_copy_async(first, last, result, queue).get();
+}
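+
+// The thresholds above can be overridden per device through the global
+// parameter cache. A hedged sketch (this is a detail:: API; the key and
+// type names below are examples only):
+//
+//   #include <boost/compute/core.hpp>
+//   #include <boost/compute/detail/parameter_cache.hpp>
+//
+//   void tune_copy_thresholds(const boost::compute::device &device)
+//   {
+//       using boost::compute::detail::parameter_cache;
+//
+//       boost::shared_ptr<parameter_cache> params =
+//           parameter_cache::get_global_cache(device);
+//
+//       // the key must match the one built in dispatch_copy():
+//       // "__boost_compute_copy_to_device_" + input type + "_" + output type
+//       params->set("__boost_compute_copy_to_device_float_double",
+//                   "map_copy_threshold", 1048576);     // 1 MB
+//       params->set("__boost_compute_copy_to_device_float_double",
+//                   "direct_copy_threshold", 16777216); // 16 MB
+//   }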
+
+// host -> device
+// InputIterator is NOT a contiguous iterator
+template<class InputIterator, class OutputIterator>
+inline OutputIterator
+dispatch_copy(InputIterator first,
+ InputIterator last,
+ OutputIterator result,
+ command_queue &queue,
+ typename boost::enable_if<
+ mpl::and_<
+ mpl::not_<
+ is_device_iterator<InputIterator>
+ >,
+ is_device_iterator<OutputIterator>,
+ mpl::not_<
+ is_contiguous_iterator<InputIterator>
+ >
+ >
+ >::type* = 0)
+{
+ typedef typename OutputIterator::value_type output_type;
+ typedef typename std::iterator_traits<InputIterator>::value_type input_type;
+
+ const device &device = queue.get_device();
+
+ // loading parameters
+ std::string cache_key =
+ std::string("__boost_compute_copy_to_device_")
+ + type_name<input_type>() + "_" + type_name<output_type>();
+ boost::shared_ptr<parameter_cache> parameters =
+ detail::parameter_cache::get_global_cache(device);
+
+ size_t map_copy_threshold;
+ size_t direct_copy_threshold;
+
+ // calculate default values of thresholds
+ if (device.type() & device::gpu) {
+ // GPUs
+ map_copy_threshold = 524288; // 0.5 MB
+ direct_copy_threshold = 52428800; // 50 MB
+ }
+ else {
+ // CPUs and other devices
+ map_copy_threshold = 134217728; // 128 MB
+ direct_copy_threshold = 0; // it's never efficient for CPUs
+ }
+
+ // load thresholds
+ map_copy_threshold =
+ parameters->get(
+ cache_key, "map_copy_threshold", map_copy_threshold
+ );
+ direct_copy_threshold =
+ parameters->get(
+ cache_key, "direct_copy_threshold", direct_copy_threshold
+ );
+
+ // select copy method based on thresholds & input_size_bytes
+ size_t input_size = iterator_range_size(first, last);
+ size_t input_size_bytes = input_size * sizeof(input_type);
+
+ // [0; map_copy_threshold) -> copy_to_device_map()
+ //
+ // if direct_copy_threshold is not greater than map_copy_threshold,
+ // copy_to_device_map() is used for every input
+ if(input_size_bytes < map_copy_threshold
+ || direct_copy_threshold <= map_copy_threshold) {
+ return copy_to_device_map(first, last, result, queue);
+ }
+ // [map_copy_threshold; inf) -> convert [first; last)
+ // on host and then perform copy_to_device()
+ std::vector<output_type> vector(first, last);
+ return copy_to_device(vector.begin(), vector.end(), result, queue);
}
// device -> host (async)
@@ -132,9 +360,14 @@ dispatch_copy_async(InputIterator first,
InputIterator last,
OutputIterator result,
command_queue &queue,
- typename boost::enable_if_c<
- is_device_iterator<InputIterator>::value &&
- !is_device_iterator<OutputIterator>::value
+ typename boost::enable_if<
+ mpl::and_<
+ is_device_iterator<InputIterator>,
+ mpl::not_<
+ is_device_iterator<OutputIterator>
+ >,
+ is_same_value_type<OutputIterator, InputIterator>
+ >
>::type* = 0)
{
BOOST_STATIC_ASSERT_MSG(
@@ -145,6 +378,267 @@ dispatch_copy_async(InputIterator first,
return copy_to_host_async(first, last, result, queue);
}
+// device -> host (async)
+// Type mismatch between InputIterator and OutputIterator value_types
+template<class InputIterator, class OutputIterator>
+inline future<OutputIterator>
+dispatch_copy_async(InputIterator first,
+ InputIterator last,
+ OutputIterator result,
+ command_queue &queue,
+ typename boost::enable_if<
+ mpl::and_<
+ is_device_iterator<InputIterator>,
+ mpl::not_<
+ is_device_iterator<OutputIterator>
+ >,
+ mpl::not_<
+ is_same_value_type<OutputIterator, InputIterator>
+ >
+ >
+ >::type* = 0)
+{
+ BOOST_STATIC_ASSERT_MSG(
+ is_contiguous_iterator<OutputIterator>::value,
+ "copy_async() is only supported for contiguous host iterators"
+ );
+
+ typedef typename std::iterator_traits<OutputIterator>::value_type output_type;
+ const context &context = queue.get_context();
+ size_t count = iterator_range_size(first, last);
+
+ if(count < size_t(1)) {
+ return future<OutputIterator>();
+ }
+
+ // map host memory to device
+ buffer mapped_host(
+ context,
+ count * sizeof(output_type),
+ buffer::write_only | buffer::use_host_ptr,
+ static_cast<void*>(
+ ::boost::addressof(*result)
+ )
+ );
+ // copy async on device
+ ::boost::compute::future<buffer_iterator<output_type> > future =
+ copy_on_device_async(
+ first,
+ last,
+ make_buffer_iterator<output_type>(mapped_host),
+ queue
+ );
+ // update host memory asynchronously by mapping and unmapping memory
+ event map_event;
+ void* ptr = queue.enqueue_map_buffer_async(
+ mapped_host,
+ CL_MAP_READ,
+ 0,
+ count * sizeof(output_type),
+ map_event,
+ future.get_event()
+ );
+ event unmap_event =
+ queue.enqueue_unmap_buffer(mapped_host, ptr, map_event);
+ return make_future(result + count, unmap_event);
+}
+
+// device -> host
+// OutputIterator is a contiguous iterator
+template<class InputIterator, class OutputIterator>
+inline OutputIterator
+dispatch_copy(InputIterator first,
+ InputIterator last,
+ OutputIterator result,
+ command_queue &queue,
+ typename boost::enable_if<
+ mpl::and_<
+ is_device_iterator<InputIterator>,
+ mpl::not_<
+ is_device_iterator<OutputIterator>
+ >,
+ is_same_value_type<OutputIterator, InputIterator>,
+ is_contiguous_iterator<OutputIterator>,
+ mpl::not_<
+ is_bool_value_type<OutputIterator>
+ >
+ >
+ >::type* = 0)
+{
+ return copy_to_host(first, last, result, queue);
+}
+
+// device -> host
+// Type mismatch between InputIterator and OutputIterator value_types
+// OutputIterator is NOT a contiguous iterator or value_type of OutputIterator
+// is a boolean type.
+template<class InputIterator, class OutputIterator>
+inline OutputIterator
+dispatch_copy(InputIterator first,
+ InputIterator last,
+ OutputIterator result,
+ command_queue &queue,
+ typename boost::enable_if<
+ mpl::and_<
+ is_device_iterator<InputIterator>,
+ mpl::not_<
+ is_device_iterator<OutputIterator>
+ >,
+ mpl::or_<
+ mpl::not_<
+ is_contiguous_iterator<OutputIterator>
+ >,
+ is_bool_value_type<OutputIterator>
+ >
+ >
+ >::type* = 0)
+{
+ typedef typename std::iterator_traits<OutputIterator>::value_type output_type;
+ typedef typename InputIterator::value_type input_type;
+
+ const device &device = queue.get_device();
+
+ // loading parameters
+ std::string cache_key =
+ std::string("__boost_compute_copy_to_host_")
+ + type_name<input_type>() + "_" + type_name<output_type>();
+ boost::shared_ptr<parameter_cache> parameters =
+ detail::parameter_cache::get_global_cache(device);
+
+ size_t map_copy_threshold;
+ size_t direct_copy_threshold;
+
+ // calculate default values of thresholds
+ if (device.type() & device::gpu) {
+ // GPUs
+ map_copy_threshold = 33554432; // 32 MB
+ direct_copy_threshold = 0; // it's never efficient for GPUs
+ }
+ else {
+ // CPUs and other devices
+ map_copy_threshold = 134217728; // 128 MB
+ direct_copy_threshold = 0; // it's never efficient for CPUs
+ }
+
+ // load thresholds
+ map_copy_threshold =
+ parameters->get(
+ cache_key, "map_copy_threshold", map_copy_threshold
+ );
+ direct_copy_threshold =
+ parameters->get(
+ cache_key, "direct_copy_threshold", direct_copy_threshold
+ );
+
+ // select copy method based on thresholds & input_size_bytes
+ size_t count = iterator_range_size(first, last);
+ size_t input_size_bytes = count * sizeof(input_type);
+
+ // [0; map_copy_threshold) -> copy_to_host_map()
+ //
+ // if direct_copy_threshold is not greater than map_copy_threshold,
+ // copy_to_host_map() is used for every input
+ if(input_size_bytes < map_copy_threshold
+ || direct_copy_threshold <= map_copy_threshold) {
+ return copy_to_host_map(first, last, result, queue);
+ }
+ // [map_copy_threshold; inf) -> copy [first;last) to temporary vector
+ // then copy (and convert) to result using std::copy()
+ std::vector<input_type> vector(count);
+ copy_to_host(first, last, vector.begin(), queue);
+ return std::copy(vector.begin(), vector.end(), result);
+}
+
+// device -> host
+// Type mismatch between InputIterator and OutputIterator value_types
+// OutputIterator is a contiguous iterator
+// value_type of OutputIterator is NOT a boolean type
+template<class InputIterator, class OutputIterator>
+inline OutputIterator
+dispatch_copy(InputIterator first,
+ InputIterator last,
+ OutputIterator result,
+ command_queue &queue,
+ typename boost::enable_if<
+ mpl::and_<
+ is_device_iterator<InputIterator>,
+ mpl::not_<
+ is_device_iterator<OutputIterator>
+ >,
+ mpl::not_<
+ is_same_value_type<OutputIterator, InputIterator>
+ >,
+ is_contiguous_iterator<OutputIterator>,
+ mpl::not_<
+ is_bool_value_type<OutputIterator>
+ >
+ >
+ >::type* = 0)
+{
+ typedef typename std::iterator_traits<OutputIterator>::value_type output_type;
+ typedef typename InputIterator::value_type input_type;
+
+ const device &device = queue.get_device();
+
+ // loading parameters
+ std::string cache_key =
+ std::string("__boost_compute_copy_to_host_")
+ + type_name<input_type>() + "_" + type_name<output_type>();
+ boost::shared_ptr<parameter_cache> parameters =
+ detail::parameter_cache::get_global_cache(device);
+
+ size_t map_copy_threshold;
+ size_t direct_copy_threshold;
+
+ // calculate default values of thresholds
+ if (device.type() & device::gpu) {
+ // GPUs
+ map_copy_threshold = 524288; // 0.5 MB
+ direct_copy_threshold = 52428800; // 50 MB
+ }
+ else {
+ // CPUs and other devices
+ map_copy_threshold = 134217728; // 128 MB
+ direct_copy_threshold = 0; // it's never efficient for CPUs
+ }
+
+ // load thresholds
+ map_copy_threshold =
+ parameters->get(
+ cache_key, "map_copy_threshold", map_copy_threshold
+ );
+ direct_copy_threshold =
+ parameters->get(
+ cache_key, "direct_copy_threshold", direct_copy_threshold
+ );
+
+ // select copy method based on thresholds & input_size_bytes
+ size_t count = iterator_range_size(first, last);
+ size_t input_size_bytes = count * sizeof(input_type);
+
+ // [0; map_copy_threshold) -> copy_to_host_map()
+ if(input_size_bytes < map_copy_threshold) {
+ return copy_to_host_map(first, last, result, queue);
+ }
+ // [map_copy_threshold; direct_copy_threshold) -> copy [first;last) to
+ // temporary vector then copy (and convert) to result using std::copy()
+ else if(input_size_bytes < direct_copy_threshold) {
+ std::vector<input_type> vector(count);
+ copy_to_host(first, last, vector.begin(), queue);
+ return std::copy(vector.begin(), vector.end(), result);
+ }
+
+ // [direct_copy_threshold; inf) -> map [result; result + count) to
+ // device and run the copy kernel on the device for copying & casting
+
+ // Perform async copy to host, wait for it to finish and
+ // return the result.
+ // At this point we are sure that count >= 1 (first != last), so the
+ // event returned by dispatch_copy_async() must be valid.
+ return dispatch_copy_async(first, last, result, queue).get();
+}
+
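
Taken together, these overloads let the public copy() convert between
mismatched value_types in both directions. A minimal illustrative sketch:

#include <vector>

#include <boost/compute/algorithm/copy.hpp>
#include <boost/compute/container/vector.hpp>
#include <boost/compute/core.hpp>

void copy_with_conversion(boost::compute::command_queue &queue)
{
    boost::compute::vector<float> device_vec(1024, queue.get_context());
    std::vector<double> host_vec(1024);

    // the float -> double mismatch selects one of the converting
    // device -> host paths introduced above
    boost::compute::copy(
        device_vec.begin(), device_vec.end(), host_vec.begin(), queue
    );
}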
// device -> device
template<class InputIterator, class OutputIterator>
inline OutputIterator
diff --git a/boost/compute/algorithm/detail/copy_on_device.hpp b/boost/compute/algorithm/detail/copy_on_device.hpp
index 0bcee27ed5..8738c8c0b4 100644
--- a/boost/compute/algorithm/detail/copy_on_device.hpp
+++ b/boost/compute/algorithm/detail/copy_on_device.hpp
@@ -22,84 +22,96 @@
#include <boost/compute/detail/meta_kernel.hpp>
#include <boost/compute/detail/parameter_cache.hpp>
#include <boost/compute/detail/work_size.hpp>
+#include <boost/compute/detail/vendor.hpp>
namespace boost {
namespace compute {
namespace detail {
-inline size_t pick_copy_work_group_size(size_t n, const device &device)
+template<class InputIterator, class OutputIterator>
+inline event copy_on_device_cpu(InputIterator first,
+ OutputIterator result,
+ size_t count,
+ command_queue &queue)
{
- (void) device;
-
- if(n % 32 == 0) return 32;
- else if(n % 16 == 0) return 16;
- else if(n % 8 == 0) return 8;
- else if(n % 4 == 0) return 4;
- else if(n % 2 == 0) return 2;
- else return 1;
+ meta_kernel k("copy");
+ const device& device = queue.get_device();
+
+ k <<
+ "uint block = " <<
+ "(uint)ceil(((float)count)/get_global_size(0));\n" <<
+ "uint index = get_global_id(0) * block;\n" <<
+ "uint end = min(count, index + block);\n" <<
+ "while(index < end){\n" <<
+ result[k.var<uint_>("index")] << '=' <<
+ first[k.var<uint_>("index")] << ";\n" <<
+ "index++;\n" <<
+ "}\n";
+
+ k.add_set_arg<const uint_>("count", static_cast<uint_>(count));
+
+ size_t global_work_size = device.compute_units();
+ if(count <= 1024) global_work_size = 1;
+ return k.exec_1d(queue, 0, global_work_size);
}
template<class InputIterator, class OutputIterator>
-class copy_kernel : public meta_kernel
+inline event copy_on_device_gpu(InputIterator first,
+ OutputIterator result,
+ size_t count,
+ command_queue &queue)
{
-public:
- copy_kernel(const device &device)
- : meta_kernel("copy")
- {
- m_count = 0;
-
- typedef typename std::iterator_traits<InputIterator>::value_type input_type;
-
- boost::shared_ptr<parameter_cache> parameters =
- detail::parameter_cache::get_global_cache(device);
-
- std::string cache_key =
- "__boost_copy_kernel_" + boost::lexical_cast<std::string>(sizeof(input_type));
+ typedef typename std::iterator_traits<InputIterator>::value_type input_type;
+
+ const device& device = queue.get_device();
+ boost::shared_ptr<parameter_cache> parameters =
+ detail::parameter_cache::get_global_cache(device);
+ std::string cache_key =
+ "__boost_copy_kernel_" + boost::lexical_cast<std::string>(sizeof(input_type));
+
+ uint_ vpt = parameters->get(cache_key, "vpt", 4);
+ uint_ tpb = parameters->get(cache_key, "tpb", 128);
+
+ meta_kernel k("copy");
+ k <<
+ "uint index = get_local_id(0) + " <<
+ "(" << vpt * tpb << " * get_group_id(0));\n" <<
+ "for(uint i = 0; i < " << vpt << "; i++){\n" <<
+ " if(index < count){\n" <<
+ result[k.var<uint_>("index")] << '=' <<
+ first[k.var<uint_>("index")] << ";\n" <<
+ " index += " << tpb << ";\n"
+ " }\n"
+ "}\n";
+
+ k.add_set_arg<const uint_>("count", static_cast<uint_>(count));
+ size_t global_work_size = calculate_work_size(count, vpt, tpb);
+ return k.exec_1d(queue, 0, global_work_size, tpb);
+}
- m_vpt = parameters->get(cache_key, "vpt", 4);
- m_tpb = parameters->get(cache_key, "tpb", 128);
- }
+template<class InputIterator, class OutputIterator>
+inline event dispatch_copy_on_device(InputIterator first,
+ InputIterator last,
+ OutputIterator result,
+ command_queue &queue)
+{
+ const size_t count = detail::iterator_range_size(first, last);
- void set_range(InputIterator first,
- InputIterator last,
- OutputIterator result)
- {
- m_count_arg = add_arg<uint_>("count");
-
- *this <<
- "uint index = get_local_id(0) + " <<
- "(" << m_vpt * m_tpb << " * get_group_id(0));\n" <<
- "for(uint i = 0; i < " << m_vpt << "; i++){\n" <<
- " if(index < count){\n" <<
- result[expr<uint_>("index")] << '=' <<
- first[expr<uint_>("index")] << ";\n" <<
- " index += " << m_tpb << ";\n"
- " }\n"
- "}\n";
-
- m_count = detail::iterator_range_size(first, last);
+ if(count == 0){
+ // nothing to do
+ return event();
}
- event exec(command_queue &queue)
+ const device& device = queue.get_device();
+ // copy_on_device_cpu() does not work for CPUs on the Apple platform
+ // due to a bug in its OpenCL compiler.
+ // See https://github.com/boostorg/compute/pull/626
+ if((device.type() & device::cpu) && !is_apple_platform_device(device))
{
- if(m_count == 0){
- // nothing to do
- return event();
- }
-
- size_t global_work_size = calculate_work_size(m_count, m_vpt, m_tpb);
-
- set_arg(m_count_arg, uint_(m_count));
-
- return exec_1d(queue, 0, global_work_size, m_tpb);
+ return copy_on_device_cpu(first, result, count, queue);
}
-
-private:
- size_t m_count;
- size_t m_count_arg;
- uint_ m_vpt;
- uint_ m_tpb;
-};
+ return copy_on_device_gpu(first, result, count, queue);
+}
template<class InputIterator, class OutputIterator>
inline OutputIterator copy_on_device(InputIterator first,
@@ -107,13 +119,7 @@ inline OutputIterator copy_on_device(InputIterator first,
OutputIterator result,
command_queue &queue)
{
- const device &device = queue.get_device();
-
- copy_kernel<InputIterator, OutputIterator> kernel(device);
-
- kernel.set_range(first, last, result);
- kernel.exec(queue);
-
+ dispatch_copy_on_device(first, last, result, queue);
return result + std::distance(first, last);
}
@@ -134,13 +140,7 @@ inline future<OutputIterator> copy_on_device_async(InputIterator first,
OutputIterator result,
command_queue &queue)
{
- const device &device = queue.get_device();
-
- copy_kernel<InputIterator, OutputIterator> kernel(device);
-
- kernel.set_range(first, last, result);
- event event_ = kernel.exec(queue);
-
+ event event_ = dispatch_copy_on_device(first, last, result, queue);
return make_future(result + std::distance(first, last), event_);
}
@@ -172,7 +172,7 @@ inline future<svm_ptr<T> > copy_on_device_async(svm_ptr<T> first,
{
size_t count = iterator_range_size(first, last);
if(count == 0){
- return result;
+ return future<svm_ptr<T> >();
}
event event_ = queue.enqueue_svm_memcpy_async(
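
For reference, a host-side model (illustrative) of the partitioning used by
copy_on_device_cpu(): each of global_size work items copies one contiguous
block of ceil(count / global_size) elements:

#include <algorithm>
#include <cstddef>

void copy_block(const int *in, int *out, std::size_t count,
                std::size_t global_id, std::size_t global_size)
{
    std::size_t block = (count + global_size - 1) / global_size; // ceil
    std::size_t index = global_id * block;
    std::size_t end = std::min(count, index + block);
    while(index < end){
        out[index] = in[index];
        index++;
    }
}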
diff --git a/boost/compute/algorithm/detail/copy_to_device.hpp b/boost/compute/algorithm/detail/copy_to_device.hpp
index 90545fb4ed..bce5975f53 100644
--- a/boost/compute/algorithm/detail/copy_to_device.hpp
+++ b/boost/compute/algorithm/detail/copy_to_device.hpp
@@ -53,6 +53,49 @@ inline DeviceIterator copy_to_device(HostIterator first,
}
template<class HostIterator, class DeviceIterator>
+inline DeviceIterator copy_to_device_map(HostIterator first,
+ HostIterator last,
+ DeviceIterator result,
+ command_queue &queue)
+{
+ typedef typename
+ std::iterator_traits<DeviceIterator>::value_type
+ value_type;
+ typedef typename
+ std::iterator_traits<DeviceIterator>::difference_type
+ difference_type;
+
+ size_t count = iterator_range_size(first, last);
+ if(count == 0){
+ return result;
+ }
+
+ size_t offset = result.get_index();
+
+ // map result buffer to host
+ value_type *pointer = static_cast<value_type*>(
+ queue.enqueue_map_buffer(
+ result.get_buffer(),
+ CL_MAP_WRITE,
+ offset * sizeof(value_type),
+ count * sizeof(value_type)
+ )
+ );
+
+ // copy [first; last) to result buffer
+ std::copy(first, last, pointer);
+
+ // unmap result buffer
+ boost::compute::event unmap_event = queue.enqueue_unmap_buffer(
+ result.get_buffer(),
+ static_cast<void*>(pointer)
+ );
+ unmap_event.wait();
+
+ return result + static_cast<difference_type>(count);
+}
+
+template<class HostIterator, class DeviceIterator>
inline future<DeviceIterator> copy_to_device_async(HostIterator first,
HostIterator last,
DeviceIterator result,
@@ -109,7 +152,7 @@ inline future<svm_ptr<T> > copy_to_device_async(HostIterator first,
{
size_t count = iterator_range_size(first, last);
if(count == 0){
- return result;
+ return future<svm_ptr<T> >();
}
event event_ = queue.enqueue_svm_memcpy_async(
@@ -118,6 +161,29 @@ inline future<svm_ptr<T> > copy_to_device_async(HostIterator first,
return make_future(result + count, event_);
}
+
+template<class HostIterator, class T>
+inline svm_ptr<T> copy_to_device_map(HostIterator first,
+ HostIterator last,
+ svm_ptr<T> result,
+ command_queue &queue)
+{
+ size_t count = iterator_range_size(first, last);
+ if(count == 0){
+ return result;
+ }
+
+ // map
+ queue.enqueue_svm_map(result.get(), count * sizeof(T), CL_MAP_WRITE);
+
+ // copy [first; last) to result buffer
+ std::copy(first, last, static_cast<T*>(result.get()));
+
+ // unmap result
+ queue.enqueue_svm_unmap(result.get()).wait();
+
+ return result + count;
+}
#endif // CL_VERSION_2_0
} // end detail namespace
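
The new copy_to_device_map() boils down to a map/std::copy/unmap sequence on
the public command_queue API. A minimal sketch (illustrative; float is chosen
arbitrarily):

#include <algorithm>
#include <cstddef>

#include <boost/compute/core.hpp>

void write_through_mapping(boost::compute::command_queue &queue,
                           boost::compute::buffer &buf,
                           const float *host, std::size_t count)
{
    // map the buffer into the host address space for writing
    float *ptr = static_cast<float*>(
        queue.enqueue_map_buffer(
            buf, CL_MAP_WRITE, 0, count * sizeof(float)
        )
    );

    std::copy(host, host + count, ptr);

    // unmapping flushes the data back to the device
    queue.enqueue_unmap_buffer(buf, ptr).wait();
}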
diff --git a/boost/compute/algorithm/detail/copy_to_host.hpp b/boost/compute/algorithm/detail/copy_to_host.hpp
index b889e0c871..d770a996ef 100644
--- a/boost/compute/algorithm/detail/copy_to_host.hpp
+++ b/boost/compute/algorithm/detail/copy_to_host.hpp
@@ -51,17 +51,51 @@ inline HostIterator copy_to_host(DeviceIterator first,
return iterator_plus_distance(result, count);
}
-// copy_to_host() specialization for std::vector<bool>
-template<class DeviceIterator>
-inline std::vector<bool>::iterator
-copy_to_host(DeviceIterator first,
- DeviceIterator last,
- std::vector<bool>::iterator result,
- command_queue &queue)
+template<class DeviceIterator, class HostIterator>
+inline HostIterator copy_to_host_map(DeviceIterator first,
+ DeviceIterator last,
+ HostIterator result,
+ command_queue &queue)
{
- std::vector<uint8_t> temp(std::distance(first, last));
- copy_to_host(first, last, temp.begin(), queue);
- return std::copy(temp.begin(), temp.end(), result);
+ typedef typename
+ std::iterator_traits<DeviceIterator>::value_type
+ value_type;
+ typedef typename
+ std::iterator_traits<DeviceIterator>::difference_type
+ difference_type;
+
+ size_t count = iterator_range_size(first, last);
+ if(count == 0){
+ return result;
+ }
+
+ size_t offset = first.get_index();
+
+ // map [first; last) buffer to host
+ value_type *pointer = static_cast<value_type*>(
+ queue.enqueue_map_buffer(
+ first.get_buffer(),
+ CL_MAP_READ,
+ offset * sizeof(value_type),
+ count * sizeof(value_type)
+ )
+ );
+
+ // copy the mapped [first; last) range to result
+ std::copy(
+ pointer,
+ pointer + static_cast<difference_type>(count),
+ result
+ );
+
+ // unmap [first; last)
+ boost::compute::event unmap_event = queue.enqueue_unmap_buffer(
+ first.get_buffer(),
+ static_cast<void*>(pointer)
+ );
+ unmap_event.wait();
+
+ return iterator_plus_distance(result, count);
}
template<class DeviceIterator, class HostIterator>
@@ -119,7 +153,7 @@ inline future<HostIterator> copy_to_host_async(svm_ptr<T> first,
{
size_t count = iterator_range_size(first, last);
if(count == 0){
- return result;
+ return future<HostIterator>();
}
event event_ = queue.enqueue_svm_memcpy_async(
@@ -128,6 +162,33 @@ inline future<HostIterator> copy_to_host_async(svm_ptr<T> first,
return make_future(iterator_plus_distance(result, count), event_);
}
+
+template<class T, class HostIterator>
+inline HostIterator copy_to_host_map(svm_ptr<T> first,
+ svm_ptr<T> last,
+ HostIterator result,
+ command_queue &queue)
+{
+ size_t count = iterator_range_size(first, last);
+ if(count == 0){
+ return result;
+ }
+
+ // map
+ queue.enqueue_svm_map(first.get(), count * sizeof(T), CL_MAP_READ);
+
+ // copy [first; last) to result
+ std::copy(
+ static_cast<T*>(first.get()),
+ static_cast<T*>(last.get()),
+ result
+ );
+
+ // unmap [first; last)
+ queue.enqueue_svm_unmap(first.get()).wait();
+
+ return iterator_plus_distance(result, count);
+}
#endif // CL_VERSION_2_0
} // end detail namespace
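
The SVM variants follow the same pattern: coarse-grained SVM memory must be
mapped before host access and unmapped afterwards. A sketch (illustrative,
OpenCL 2.0 only):

#include <algorithm>
#include <cstddef>

#include <boost/compute/core.hpp>
#include <boost/compute/memory/svm_ptr.hpp>

#ifdef CL_VERSION_2_0
void read_svm(boost::compute::command_queue &queue,
              boost::compute::svm_ptr<float> src,
              float *dst, std::size_t count)
{
    queue.enqueue_svm_map(src.get(), count * sizeof(float), CL_MAP_READ);

    std::copy(static_cast<float*>(src.get()),
              static_cast<float*>(src.get()) + count,
              dst);

    queue.enqueue_svm_unmap(src.get()).wait();
}
#endif // CL_VERSION_2_0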
diff --git a/boost/compute/algorithm/detail/find_extrema.hpp b/boost/compute/algorithm/detail/find_extrema.hpp
index 6e756c3904..eef2e36c3c 100644
--- a/boost/compute/algorithm/detail/find_extrema.hpp
+++ b/boost/compute/algorithm/detail/find_extrema.hpp
@@ -12,6 +12,7 @@
#define BOOST_COMPUTE_ALGORITHM_DETAIL_FIND_EXTREMA_HPP
#include <boost/compute/detail/iterator_range_size.hpp>
+#include <boost/compute/algorithm/detail/find_extrema_on_cpu.hpp>
#include <boost/compute/algorithm/detail/find_extrema_with_reduce.hpp>
#include <boost/compute/algorithm/detail/find_extrema_with_atomics.hpp>
#include <boost/compute/algorithm/detail/serial_find_extrema.hpp>
@@ -36,12 +37,17 @@ inline InputIterator find_extrema(InputIterator first,
const device &device = queue.get_device();
+ // CPU
+ if(device.type() & device::cpu) {
+ return find_extrema_on_cpu(first, last, compare, find_minimum, queue);
+ }
+
+ // GPU
// use serial method for small inputs
- // and when device is a CPU
- if(count < 512 || (device.type() & device::cpu)){
+ if(count < 512)
+ {
return serial_find_extrema(first, last, compare, find_minimum, queue);
}
-
// find_extrema_with_reduce() is used only if requirements are met
if(find_extrema_with_reduce_requirements_met(first, last, queue))
{
diff --git a/boost/compute/algorithm/detail/find_extrema_on_cpu.hpp b/boost/compute/algorithm/detail/find_extrema_on_cpu.hpp
new file mode 100644
index 0000000000..9363ad5837
--- /dev/null
+++ b/boost/compute/algorithm/detail/find_extrema_on_cpu.hpp
@@ -0,0 +1,138 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2016 Jakub Szuppe <j.szuppe@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_FIND_EXTREMA_ON_CPU_HPP
+#define BOOST_COMPUTE_ALGORITHM_DETAIL_FIND_EXTREMA_ON_CPU_HPP
+
+#include <algorithm>
+
+#include <boost/compute/algorithm/detail/find_extrema_with_reduce.hpp>
+#include <boost/compute/algorithm/detail/find_extrema_with_atomics.hpp>
+#include <boost/compute/algorithm/detail/serial_find_extrema.hpp>
+#include <boost/compute/detail/iterator_range_size.hpp>
+#include <boost/compute/iterator/buffer_iterator.hpp>
+
+namespace boost {
+namespace compute {
+namespace detail {
+
+template<class InputIterator, class Compare>
+inline InputIterator find_extrema_on_cpu(InputIterator first,
+ InputIterator last,
+ Compare compare,
+ const bool find_minimum,
+ command_queue &queue)
+{
+ typedef typename std::iterator_traits<InputIterator>::value_type input_type;
+ typedef typename std::iterator_traits<InputIterator>::difference_type difference_type;
+ size_t count = iterator_range_size(first, last);
+
+ const device &device = queue.get_device();
+ const uint_ compute_units = queue.get_device().compute_units();
+
+ boost::shared_ptr<parameter_cache> parameters =
+ detail::parameter_cache::get_global_cache(device);
+ std::string cache_key =
+ "__boost_find_extrema_cpu_"
+ + boost::lexical_cast<std::string>(sizeof(input_type));
+
+ // for inputs smaller than serial_find_extrema_threshold
+ // serial_find_extrema algorithm is used
+ uint_ serial_find_extrema_threshold = parameters->get(
+ cache_key,
+ "serial_find_extrema_threshold",
+ 16384 * sizeof(input_type)
+ );
+ serial_find_extrema_threshold =
+ (std::max)(serial_find_extrema_threshold, uint_(2 * compute_units));
+
+ const context &context = queue.get_context();
+ if(count < serial_find_extrema_threshold) {
+ return serial_find_extrema(first, last, compare, find_minimum, queue);
+ }
+
+ meta_kernel k("find_extrema_on_cpu");
+ buffer output(context, sizeof(input_type) * compute_units);
+ buffer output_idx(
+ context, sizeof(uint_) * compute_units,
+ buffer::read_write | buffer::alloc_host_ptr
+ );
+
+ size_t count_arg = k.add_arg<uint_>("count");
+ size_t output_arg =
+ k.add_arg<input_type *>(memory_object::global_memory, "output");
+ size_t output_idx_arg =
+ k.add_arg<uint_ *>(memory_object::global_memory, "output_idx");
+
+ k <<
+ "uint block = " <<
+ "(uint)ceil(((float)count)/get_global_size(0));\n" <<
+ "uint index = get_global_id(0) * block;\n" <<
+ "uint end = min(count, index + block);\n" <<
+
+ "uint value_index = index;\n" <<
+ k.decl<input_type>("value") << " = " << first[k.var<uint_>("index")] << ";\n" <<
+
+ "index++;\n" <<
+ "while(index < end){\n" <<
+ k.decl<input_type>("candidate") <<
+ " = " << first[k.var<uint_>("index")] << ";\n" <<
+ "#ifndef BOOST_COMPUTE_FIND_MAXIMUM\n" <<
+ "bool compare = " << compare(k.var<input_type>("candidate"),
+ k.var<input_type>("value")) << ";\n" <<
+ "#else\n" <<
+ "bool compare = " << compare(k.var<input_type>("value"),
+ k.var<input_type>("candidate")) << ";\n" <<
+ "#endif\n" <<
+ "value = compare ? candidate : value;\n" <<
+ "value_index = compare ? index : value_index;\n" <<
+ "index++;\n" <<
+ "}\n" <<
+ "output[get_global_id(0)] = value;\n" <<
+ "output_idx[get_global_id(0)] = value_index;\n";
+
+ size_t global_work_size = compute_units;
+ std::string options;
+ if(!find_minimum){
+ options = "-DBOOST_COMPUTE_FIND_MAXIMUM";
+ }
+ kernel kernel = k.compile(context, options);
+
+ kernel.set_arg(count_arg, static_cast<uint_>(count));
+ kernel.set_arg(output_arg, output);
+ kernel.set_arg(output_idx_arg, output_idx);
+ queue.enqueue_1d_range_kernel(kernel, 0, global_work_size, 0);
+
+ buffer_iterator<input_type> result = serial_find_extrema(
+ make_buffer_iterator<input_type>(output),
+ make_buffer_iterator<input_type>(output, global_work_size),
+ compare,
+ find_minimum,
+ queue
+ );
+
+ uint_* output_idx_host_ptr =
+ static_cast<uint_*>(
+ queue.enqueue_map_buffer(
+ output_idx, command_queue::map_read,
+ 0, global_work_size * sizeof(uint_)
+ )
+ );
+
+ difference_type extremum_idx =
+ static_cast<difference_type>(*(output_idx_host_ptr + result.get_index()));
+ return first + extremum_idx;
+}
+
+} // end detail namespace
+} // end compute namespace
+} // end boost namespace
+
+#endif // BOOST_COMPUTE_ALGORITHM_DETAIL_FIND_EXTREMA_ON_CPU_HPP
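
A host-side model (illustrative) of the two-stage scheme above: each compute
unit finds a local extremum over its contiguous block, then a serial pass
picks the global winner among the per-unit candidates:

#include <algorithm>
#include <cstddef>
#include <vector>

// assumes 0 < units <= in.size()
std::size_t min_index_two_stage(const std::vector<float> &in,
                                std::size_t units)
{
    const std::size_t count = in.size();
    const std::size_t block = (count + units - 1) / units; // ceil

    std::vector<float> best(units);
    std::vector<std::size_t> best_idx(units);

    // stage 1: per-work-item local minimum (the kernel's job)
    for(std::size_t u = 0; u < units; u++){
        std::size_t index = u * block;
        std::size_t end = std::min(count, index + block);
        if(index >= end){
            // empty trailing block: repeat element 0 as its candidate
            best[u] = in[0];
            best_idx[u] = 0;
            continue;
        }
        float value = in[index];
        std::size_t value_index = index;
        for(index++; index < end; index++){
            if(in[index] < value){
                value = in[index];
                value_index = index;
            }
        }
        best[u] = value;
        best_idx[u] = value_index;
    }

    // stage 2: serial reduction over the per-unit candidates
    std::size_t winner = 0;
    for(std::size_t u = 1; u < units; u++){
        if(best[u] < best[winner]) winner = u;
    }
    return best_idx[winner];
}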
diff --git a/boost/compute/algorithm/detail/find_extrema_with_reduce.hpp b/boost/compute/algorithm/detail/find_extrema_with_reduce.hpp
index 1fbb7dee19..8f2a83c38b 100644
--- a/boost/compute/algorithm/detail/find_extrema_with_reduce.hpp
+++ b/boost/compute/algorithm/detail/find_extrema_with_reduce.hpp
@@ -128,7 +128,7 @@ inline void find_extrema_with_reduce(InputIterator input,
// Next element
k.decl<input_type>("next") << " = " << input[k.var<uint_>("idx")] << ";\n" <<
"#ifdef BOOST_COMPUTE_USE_INPUT_IDX\n" <<
- k.decl<input_type>("next_idx") << " = " << input_idx[k.var<uint_>("idx")] << ";\n" <<
+ k.decl<uint_>("next_idx") << " = " << input_idx[k.var<uint_>("idx")] << ";\n" <<
"#endif\n" <<
// Comparison between currently best element (acc) and next element
diff --git a/boost/compute/algorithm/detail/merge_sort_on_gpu.hpp b/boost/compute/algorithm/detail/merge_sort_on_gpu.hpp
new file mode 100644
index 0000000000..e62c6beb8d
--- /dev/null
+++ b/boost/compute/algorithm/detail/merge_sort_on_gpu.hpp
@@ -0,0 +1,590 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2016 Jakub Szuppe <j.szuppe@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_MERGE_SORT_ON_GPU_HPP_
+#define BOOST_COMPUTE_ALGORITHM_DETAIL_MERGE_SORT_ON_GPU_HPP_
+
+#include <algorithm>
+
+#include <boost/compute/kernel.hpp>
+#include <boost/compute/program.hpp>
+#include <boost/compute/command_queue.hpp>
+#include <boost/compute/container/vector.hpp>
+#include <boost/compute/memory/local_buffer.hpp>
+#include <boost/compute/detail/meta_kernel.hpp>
+#include <boost/compute/detail/iterator_range_size.hpp>
+
+namespace boost {
+namespace compute {
+namespace detail {
+
+template<class KeyType, class ValueType>
+inline size_t pick_bitonic_block_sort_block_size(size_t proposed_wg,
+ size_t lmem_size,
+ bool sort_by_key)
+{
+ size_t n = proposed_wg;
+
+ size_t lmem_required = n * sizeof(KeyType);
+ if(sort_by_key) {
+ lmem_required += n * sizeof(ValueType);
+ }
+
+ // try to force at least 4 work-groups of >64 elements
+ // for better occupancy
+ while(lmem_size < (lmem_required * 4) && (n > 64)) {
+ n /= 2;
+ lmem_required = n * sizeof(KeyType);
+ }
+ while(lmem_size < lmem_required && (n != 1)) {
+ n /= 2;
+ if(n < 1) n = 1;
+ lmem_required = n * sizeof(KeyType);
+ }
+
+ if(n < 2) { return 1; }
+ else if(n < 4) { return 2; }
+ else if(n < 8) { return 4; }
+ else if(n < 16) { return 8; }
+ else if(n < 32) { return 16; }
+ else if(n < 64) { return 32; }
+ else if(n < 128) { return 64; }
+ else if(n < 256) { return 128; }
+ else { return 256; }
+}
+
+
+/// Performs bitonic block sort according to \p compare.
+///
+/// Since bitonic sort can only be performed when the input size is equal to
+/// 2^n, and here the input size is the block size (\p work_group_size), we
+/// would have to require \p count to be an exact multiple of the block size.
+/// That would not be great.
+/// Instead, the bitonic sort kernel is merged with odd-even merge sort, so if
+/// the last block is not equal to 2^n (where n is some natural number) the
+/// odd-even sort is performed for that block. That way bitonic_block_sort()
+/// works for input of any size. The block size (\p work_group_size) still has
+/// to be equal to 2^n.
+///
+/// This is NOT stable.
+///
+/// \param keys_first first key element in the range to sort
+/// \param values_first first value element in the range to sort
+/// \param compare comparison function for keys
+/// \param count number of elements in the range; count > 0
+/// \param work_group_size size of the work group, also the block size; must be
+/// equal to 2^n where n is a natural number
+/// \param queue command queue to perform the operation
+template<class KeyIterator, class ValueIterator, class Compare>
+inline size_t bitonic_block_sort(KeyIterator keys_first,
+ ValueIterator values_first,
+ Compare compare,
+ const size_t count,
+ const bool sort_by_key,
+ command_queue &queue)
+{
+ typedef typename std::iterator_traits<KeyIterator>::value_type key_type;
+
+ meta_kernel k("bitonic_block_sort");
+ size_t count_arg = k.add_arg<const uint_>("count");
+
+ size_t local_keys_arg = k.add_arg<key_type *>(memory_object::local_memory, "lkeys");
+ size_t local_vals_arg = 0;
+ if(sort_by_key) {
+ local_vals_arg = k.add_arg<uchar_ *>(memory_object::local_memory, "lidx");
+ }
+
+ k <<
+ // Work item global and local ids
+ k.decl<const uint_>("gid") << " = get_global_id(0);\n" <<
+ k.decl<const uint_>("lid") << " = get_local_id(0);\n";
+
+ // declare my_key and my_value
+ k <<
+ k.decl<key_type>("my_key") << ";\n";
+ // Instead of copying values (my_value) into local memory along with the
+ // keys, we save the local index (uchar) and copy my_value to its final
+ // position at the end. This saves local memory.
+ if(sort_by_key)
+ {
+ k <<
+ k.decl<uchar_>("my_index") << " = (uchar)(lid);\n";
+ }
+
+ // load key
+ k <<
+ "if(gid < count) {\n" <<
+ k.var<key_type>("my_key") << " = " <<
+ keys_first[k.var<const uint_>("gid")] << ";\n" <<
+ "}\n";
+
+ // load key and index to local memory
+ k <<
+ "lkeys[lid] = my_key;\n";
+ if(sort_by_key)
+ {
+ k <<
+ "lidx[lid] = my_index;\n";
+ }
+ k <<
+ k.decl<const uint_>("offset") << " = get_group_id(0) * get_local_size(0);\n" <<
+ k.decl<const uint_>("n") << " = min((uint)(get_local_size(0)),(count - offset));\n";
+
+ // When the work-group size is a power of 2 a bitonic sorter can be used;
+ // otherwise, a slower odd-even sort is used.
+
+ k <<
+ // check if n is power of 2
+ "if(((n != 0) && ((n & (~n + 1)) == n))) {\n";
+
+ // bitonic sort, not stable
+ k <<
+ // wait for keys and vals to be stored in local memory
+ "barrier(CLK_LOCAL_MEM_FENCE);\n" <<
+
+ "#pragma unroll\n" <<
+ "for(" <<
+ k.decl<uint_>("length") << " = 1; " <<
+ "length < n; " <<
+ "length <<= 1" <<
+ ") {\n" <<
+ // direction of sort: false -> asc, true -> desc
+ k.decl<bool>("direction") << "= ((lid & (length<<1)) != 0);\n" <<
+ "for(" <<
+ k.decl<uint_>("k") << " = length; " <<
+ "k > 0; " <<
+ "k >>= 1" <<
+ ") {\n" <<
+
+ // sibling to compare with my key
+ k.decl<uint_>("sibling_idx") << " = lid ^ k;\n" <<
+ k.decl<key_type>("sibling_key") << " = lkeys[sibling_idx];\n" <<
+ k.decl<bool>("compare") << " = " <<
+ compare(k.var<key_type>("sibling_key"),
+ k.var<key_type>("my_key")) << ";\n" <<
+ k.decl<bool>("swap") <<
+ " = compare ^ (sibling_idx < lid) ^ direction;\n" <<
+ "my_key = swap ? sibling_key : my_key;\n";
+ if(sort_by_key)
+ {
+ k <<
+ "my_index = swap ? lidx[sibling_idx] : my_index;\n";
+ }
+ k <<
+ "barrier(CLK_LOCAL_MEM_FENCE);\n" <<
+ "lkeys[lid] = my_key;\n";
+ if(sort_by_key)
+ {
+ k <<
+ "lidx[lid] = my_index;\n";
+ }
+ k <<
+ "barrier(CLK_LOCAL_MEM_FENCE);\n" <<
+ "}\n" <<
+ "}\n";
+
+ // end of bitonic sort
+
+ // odd-even sort, not stable
+ k <<
+ "}\n" <<
+ "else { \n";
+
+ k <<
+ k.decl<bool>("lid_is_even") << " = (lid%2) == 0;\n" <<
+ k.decl<uint_>("oddsibling_idx") << " = " <<
+ "(lid_is_even) ? max(lid,(uint)(1)) - 1 : min(lid+1,n-1);\n" <<
+ k.decl<uint_>("evensibling_idx") << " = " <<
+ "(lid_is_even) ? min(lid+1,n-1) : max(lid,(uint)(1)) - 1;\n" <<
+
+ // wait for keys and vals to be stored in local memory
+ "barrier(CLK_LOCAL_MEM_FENCE);\n" <<
+
+ "#pragma unroll\n" <<
+ "for(" <<
+ k.decl<uint_>("i") << " = 0; " <<
+ "i < n; " <<
+ "i++" <<
+ ") {\n" <<
+ k.decl<uint_>("sibling_idx") <<
+ " = i%2 == 0 ? evensibling_idx : oddsibling_idx;\n" <<
+ k.decl<key_type>("sibling_key") << " = lkeys[sibling_idx];\n" <<
+ k.decl<bool>("compare") << " = " <<
+ compare(k.var<key_type>("sibling_key"),
+ k.var<key_type>("my_key")) << ";\n" <<
+ k.decl<bool>("swap") <<
+ " = compare ^ (sibling_idx < lid);\n" <<
+ "my_key = swap ? sibling_key : my_key;\n";
+ if(sort_by_key)
+ {
+ k <<
+ "my_index = swap ? lidx[sibling_idx] : my_index;\n";
+ }
+ k <<
+ "barrier(CLK_LOCAL_MEM_FENCE);\n" <<
+ "lkeys[lid] = my_key;\n";
+ if(sort_by_key)
+ {
+ k <<
+ "lidx[lid] = my_index;\n";
+ }
+ k <<
+ "barrier(CLK_LOCAL_MEM_FENCE);\n"
+ "}\n" << // for
+
+ "}\n"; // else
+ // end of odd-even sort
+
+ // save key and value
+ k <<
+ "if(gid < count) {\n" <<
+ keys_first[k.var<const uint_>("gid")] << " = " <<
+ k.var<key_type>("my_key") << ";\n";
+ if(sort_by_key)
+ {
+ k << values_first[k.var<const uint_>("gid")] << " = " <<
+ values_first[k.var<const uint_>("offset + my_index")] << ";\n";
+ }
+ k <<
+ // end if
+ "}\n";
+
+ const context &context = queue.get_context();
+ const device &device = queue.get_device();
+ ::boost::compute::kernel kernel = k.compile(context);
+
+ const size_t work_group_size =
+ pick_bitonic_block_sort_block_size<key_type, uchar_>(
+ kernel.get_work_group_info<size_t>(
+ device, CL_KERNEL_WORK_GROUP_SIZE
+ ),
+ device.get_info<size_t>(CL_DEVICE_LOCAL_MEM_SIZE),
+ sort_by_key
+ );
+
+ const size_t global_size =
+ work_group_size * static_cast<size_t>(
+ std::ceil(float(count) / work_group_size)
+ );
+
+ kernel.set_arg(count_arg, static_cast<uint_>(count));
+ kernel.set_arg(local_keys_arg, local_buffer<key_type>(work_group_size));
+ if(sort_by_key) {
+ kernel.set_arg(local_vals_arg, local_buffer<uchar_>(work_group_size));
+ }
+
+ queue.enqueue_1d_range_kernel(kernel, 0, global_size, work_group_size);
+ // return size of the block
+ return work_group_size;
+}
+
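
A host-side model (illustrative) of the in-kernel bitonic loop above, for a
power-of-two n; like the kernel version, it is not stable:

#include <cstddef>
#include <utility>
#include <vector>

void bitonic_sort_pow2(std::vector<int> &a)
{
    const std::size_t n = a.size(); // must be a power of two

    for(std::size_t length = 1; length < n; length <<= 1){
        for(std::size_t k = length; k > 0; k >>= 1){
            for(std::size_t lid = 0; lid < n; lid++){
                std::size_t sibling_idx = lid ^ k;
                if(sibling_idx <= lid) continue; // handle each pair once

                // direction of sort: false -> asc, true -> desc
                bool direction = (lid & (length << 1)) != 0;
                if((a[lid] > a[sibling_idx]) != direction){
                    std::swap(a[lid], a[sibling_idx]);
                }
            }
        }
    }
}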
+template<class KeyIterator, class ValueIterator, class Compare>
+inline size_t block_sort(KeyIterator keys_first,
+ ValueIterator values_first,
+ Compare compare,
+ const size_t count,
+ const bool sort_by_key,
+ const bool stable,
+ command_queue &queue)
+{
+ if(stable) {
+ // TODO: Implement stable block sort (stable odd-even merge sort)
+ return size_t(1);
+ }
+ return bitonic_block_sort(
+ keys_first, values_first,
+ compare, count,
+ sort_by_key, queue
+ );
+}
+
+/// space: O(n + m); n - number of keys, m - number of values
+template<class KeyIterator, class ValueIterator, class Compare>
+inline void merge_blocks_on_gpu(KeyIterator keys_first,
+ ValueIterator values_first,
+ KeyIterator out_keys_first,
+ ValueIterator out_values_first,
+ Compare compare,
+ const size_t count,
+ const size_t block_size,
+ const bool sort_by_key,
+ command_queue &queue)
+{
+ typedef typename std::iterator_traits<KeyIterator>::value_type key_type;
+ typedef typename std::iterator_traits<ValueIterator>::value_type value_type;
+
+ meta_kernel k("merge_blocks");
+ size_t count_arg = k.add_arg<const uint_>("count");
+ size_t block_size_arg = k.add_arg<const uint_>("block_size");
+
+ k <<
+ // get global id
+ k.decl<const uint_>("gid") << " = get_global_id(0);\n" <<
+ "if(gid >= count) {\n" <<
+ "return;\n" <<
+ "}\n" <<
+
+ k.decl<const key_type>("my_key") << " = " <<
+ keys_first[k.var<const uint_>("gid")] << ";\n";
+
+ if(sort_by_key) {
+ k <<
+ k.decl<const value_type>("my_value") << " = " <<
+ values_first[k.var<const uint_>("gid")] << ";\n";
+ }
+
+ k <<
+ // get my block idx
+ k.decl<const uint_>("my_block_idx") << " = gid / block_size;\n" <<
+ k.decl<const bool>("my_block_idx_is_odd") << " = " <<
+ "my_block_idx & 0x1;\n" <<
+
+ k.decl<const uint_>("other_block_idx") << " = " <<
+ // if(my_block_idx is odd) {} else {}
+ "my_block_idx_is_odd ? my_block_idx - 1 : my_block_idx + 1;\n" <<
+
+ // get ranges of my block and the other block
+ // [my_block_start; my_block_end)
+ // [other_block_start; other_block_end)
+ k.decl<const uint_>("my_block_start") << " = " <<
+ "min(my_block_idx * block_size, count);\n" << // including
+ k.decl<const uint_>("my_block_end") << " = " <<
+ "min((my_block_idx + 1) * block_size, count);\n" << // excluding
+
+ k.decl<const uint_>("other_block_start") << " = " <<
+ "min(other_block_idx * block_size, count);\n" << // including
+ k.decl<const uint_>("other_block_end") << " = " <<
+ "min((other_block_idx + 1) * block_size, count);\n" << // excluding
+
+ // other block is empty, nothing to merge here
+ "if(other_block_start == count){\n" <<
+ out_keys_first[k.var<uint_>("gid")] << " = my_key;\n";
+ if(sort_by_key) {
+ k <<
+ out_values_first[k.var<uint_>("gid")] << " = my_value;\n";
+ }
+
+ k <<
+ "return;\n" <<
+ "}\n" <<
+
+ // lower bound
+ // left_idx - lower bound
+ k.decl<uint_>("left_idx") << " = other_block_start;\n" <<
+ k.decl<uint_>("right_idx") << " = other_block_end;\n" <<
+ "while(left_idx < right_idx) {\n" <<
+ k.decl<uint_>("mid_idx") << " = (left_idx + right_idx) / 2;\n" <<
+ k.decl<key_type>("mid_key") << " = " <<
+ keys_first[k.var<const uint_>("mid_idx")] << ";\n" <<
+ k.decl<bool>("smaller") << " = " <<
+ compare(k.var<key_type>("mid_key"),
+ k.var<key_type>("my_key")) << ";\n" <<
+ "left_idx = smaller ? mid_idx + 1 : left_idx;\n" <<
+ "right_idx = smaller ? right_idx : mid_idx;\n" <<
+ "}\n" <<
+        // left_idx is the found position (lower bound) in the other block
+
+        // if my_block_idx is odd we need the upper bound instead
+ "right_idx = other_block_end;\n" <<
+ "if(my_block_idx_is_odd && left_idx != right_idx) {\n" <<
+ k.decl<key_type>("upper_key") << " = " <<
+ keys_first[k.var<const uint_>("left_idx")] << ";\n" <<
+ "while(" <<
+ "!(" << compare(k.var<key_type>("upper_key"),
+ k.var<key_type>("my_key")) <<
+ ") && " <<
+ "!(" << compare(k.var<key_type>("my_key"),
+ k.var<key_type>("upper_key")) <<
+ ") && " <<
+ "left_idx < right_idx" <<
+ ")" <<
+ "{\n" <<
+ k.decl<uint_>("mid_idx") << " = (left_idx + right_idx) / 2;\n" <<
+ k.decl<key_type>("mid_key") << " = " <<
+ keys_first[k.var<const uint_>("mid_idx")] << ";\n" <<
+ k.decl<bool>("equal") << " = " <<
+ "!(" << compare(k.var<key_type>("mid_key"),
+ k.var<key_type>("my_key")) <<
+ ") && " <<
+ "!(" << compare(k.var<key_type>("my_key"),
+ k.var<key_type>("mid_key")) <<
+ ");\n" <<
+ "left_idx = equal ? mid_idx + 1 : left_idx + 1;\n" <<
+ "right_idx = equal ? right_idx : mid_idx;\n" <<
+ "upper_key = equal ? upper_key : " <<
+ keys_first[k.var<const uint_>("left_idx")] << ";\n" <<
+ "}\n" <<
+ "}\n" <<
+
+ k.decl<uint_>("offset") << " = 0;\n" <<
+ "offset += gid - my_block_start;\n" <<
+ "offset += left_idx - other_block_start;\n" <<
+ "offset += min(my_block_start, other_block_start);\n" <<
+ out_keys_first[k.var<uint_>("offset")] << " = my_key;\n";
+ if(sort_by_key) {
+ k <<
+ out_values_first[k.var<uint_>("offset")] << " = my_value;\n";
+ }
+
+ const context &context = queue.get_context();
+ ::boost::compute::kernel kernel = k.compile(context);
+
+ const size_t work_group_size = (std::min)(
+ size_t(256),
+ kernel.get_work_group_info<size_t>(
+ queue.get_device(), CL_KERNEL_WORK_GROUP_SIZE
+ )
+ );
+ const size_t global_size =
+ work_group_size * static_cast<size_t>(
+ std::ceil(float(count) / work_group_size)
+ );
+
+ kernel.set_arg(count_arg, static_cast<uint_>(count));
+ kernel.set_arg(block_size_arg, static_cast<uint_>(block_size));
+ queue.enqueue_1d_range_kernel(kernel, 0, global_size, work_group_size);
+}
+
+template<class KeyIterator, class ValueIterator, class Compare>
+inline void merge_sort_by_key_on_gpu(KeyIterator keys_first,
+ KeyIterator keys_last,
+ ValueIterator values_first,
+ Compare compare,
+ bool stable,
+ command_queue &queue)
+{
+ typedef typename std::iterator_traits<KeyIterator>::value_type key_type;
+ typedef typename std::iterator_traits<ValueIterator>::value_type value_type;
+
+ size_t count = iterator_range_size(keys_first, keys_last);
+ if(count < 2){
+ return;
+ }
+
+ size_t block_size =
+ block_sort(
+ keys_first, values_first,
+ compare, count,
+ true /* sort_by_key */, stable /* stable */,
+ queue
+ );
+
+    // for small inputs only the block sort is performed
+ if(count <= block_size) {
+ return;
+ }
+
+ const context &context = queue.get_context();
+
+ bool result_in_temporary_buffer = false;
+ ::boost::compute::vector<key_type> temp_keys(count, context);
+ ::boost::compute::vector<value_type> temp_values(count, context);
+
+ for(; block_size < count; block_size *= 2) {
+ result_in_temporary_buffer = !result_in_temporary_buffer;
+ if(result_in_temporary_buffer) {
+ merge_blocks_on_gpu(keys_first, values_first,
+ temp_keys.begin(), temp_values.begin(),
+ compare, count, block_size,
+ true /* sort_by_key */, queue);
+ } else {
+ merge_blocks_on_gpu(temp_keys.begin(), temp_values.begin(),
+ keys_first, values_first,
+ compare, count, block_size,
+ true /* sort_by_key */, queue);
+ }
+ }
+
+ if(result_in_temporary_buffer) {
+ copy_async(temp_keys.begin(), temp_keys.end(), keys_first, queue);
+ copy_async(temp_values.begin(), temp_values.end(), values_first, queue);
+ }
+}
+
+template<class Iterator, class Compare>
+inline void merge_sort_on_gpu(Iterator first,
+ Iterator last,
+ Compare compare,
+ bool stable,
+ command_queue &queue)
+{
+ typedef typename std::iterator_traits<Iterator>::value_type key_type;
+
+ size_t count = iterator_range_size(first, last);
+ if(count < 2){
+ return;
+ }
+
+ Iterator dummy;
+ size_t block_size =
+ block_sort(
+ first, dummy,
+ compare, count,
+ false /* sort_by_key */, stable /* stable */,
+ queue
+ );
+
+    // for small inputs only the block sort is performed
+ if(count <= block_size) {
+ return;
+ }
+
+ const context &context = queue.get_context();
+
+ bool result_in_temporary_buffer = false;
+ ::boost::compute::vector<key_type> temp_keys(count, context);
+
+ for(; block_size < count; block_size *= 2) {
+ result_in_temporary_buffer = !result_in_temporary_buffer;
+ if(result_in_temporary_buffer) {
+ merge_blocks_on_gpu(first, dummy, temp_keys.begin(), dummy,
+ compare, count, block_size,
+ false /* sort_by_key */, queue);
+ } else {
+ merge_blocks_on_gpu(temp_keys.begin(), dummy, first, dummy,
+ compare, count, block_size,
+ false /* sort_by_key */, queue);
+ }
+ }
+
+ if(result_in_temporary_buffer) {
+ copy_async(temp_keys.begin(), temp_keys.end(), first, queue);
+ }
+}
+
+template<class KeyIterator, class ValueIterator, class Compare>
+inline void merge_sort_by_key_on_gpu(KeyIterator keys_first,
+ KeyIterator keys_last,
+ ValueIterator values_first,
+ Compare compare,
+ command_queue &queue)
+{
+ merge_sort_by_key_on_gpu(
+ keys_first, keys_last, values_first,
+ compare, false /* not stable */, queue
+ );
+}
+
+template<class Iterator, class Compare>
+inline void merge_sort_on_gpu(Iterator first,
+ Iterator last,
+ Compare compare,
+ command_queue &queue)
+{
+ merge_sort_on_gpu(
+ first, last, compare, false /* not stable */, queue
+ );
+}
+
+} // end detail namespace
+} // end compute namespace
+} // end boost namespace
+
+#endif /* BOOST_COMPUTE_ALGORITHM_DETAIL_MERGE_SORT_ON_GPU_HPP_ */
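Annotation (not part of the patch): the offset arithmetic at the end of merge_blocks_on_gpu can be sanity-checked on the host. The sketch below mirrors the kernel's rule -- position within the own block, plus the lower bound (even blocks) or upper bound (odd blocks) in the neighbouring block, plus the start of the merged pair -- on made-up sample data. Taking the upper bound in odd blocks is what keeps equal keys from the two blocks out of the same output slot.

    #include <algorithm>
    #include <cassert>
    #include <cstddef>
    #include <vector>

    int main()
    {
        // two sorted neighbouring blocks of size 4 (block 0 even, block 1 odd)
        const int data[] = {1, 3, 5, 7, 2, 3, 6, 8};
        std::vector<int> keys(data, data + 8), out(8);
        const std::size_t block_size = 4;

        for(std::size_t gid = 0; gid < keys.size(); gid++){
            const std::size_t my_block = gid / block_size;
            const bool odd = (my_block & 0x1) != 0;
            const std::size_t other = odd ? my_block - 1 : my_block + 1;

            // even blocks take the lower bound, odd blocks the upper bound,
            // so equal keys from the two blocks get distinct output slots
            const int *o_first = &keys[other * block_size];
            const int *o_last  = o_first + block_size;
            const std::size_t rank = odd
                ? std::upper_bound(o_first, o_last, keys[gid]) - o_first
                : std::lower_bound(o_first, o_last, keys[gid]) - o_first;

            const std::size_t offset =
                (gid - my_block * block_size)               // rank in own block
                + rank                                      // rank in other block
                + (std::min)(my_block, other) * block_size; // start of the pair
            out[offset] = keys[gid];
        }

        for(std::size_t i = 1; i < out.size(); i++){
            assert(out[i - 1] <= out[i]); // merged pair is sorted
        }
    }
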
diff --git a/boost/compute/algorithm/detail/radix_sort.hpp b/boost/compute/algorithm/detail/radix_sort.hpp
index c2ba4ed17c..8e6d5f9c0a 100644
--- a/boost/compute/algorithm/detail/radix_sort.hpp
+++ b/boost/compute/algorithm/detail/radix_sort.hpp
@@ -92,6 +92,8 @@ const char radix_sort_source[] =
"#define RADIX_MASK ((((T)(1)) << K_BITS) - 1)\n"
"#define SIGN_BIT ((sizeof(T) * CHAR_BIT) - 1)\n"
+"#if defined(ASC)\n" // asc order
+
"inline uint radix(const T x, const uint low_bit)\n"
"{\n"
"#if defined(IS_FLOATING_POINT)\n"
@@ -104,6 +106,25 @@ const char radix_sort_source[] =
"#endif\n"
"}\n"
+"#else\n" // desc order
+
+// For signed types we simply negate x; for unsigned types we subtract
+// x from the maximum value of its type ((T)(-1) is the maximum value
+// of type T when T is unsigned).
+"inline uint radix(const T x, const uint low_bit)\n"
+"{\n"
+"#if defined(IS_FLOATING_POINT)\n"
+" const T mask = -(x >> SIGN_BIT) | (((T)(1)) << SIGN_BIT);\n"
+" return (((-x) ^ mask) >> low_bit) & RADIX_MASK;\n"
+"#elif defined(IS_SIGNED)\n"
+" return (((-x) ^ (((T)(1)) << SIGN_BIT)) >> low_bit) & RADIX_MASK;\n"
+"#else\n"
+" return (((T)(-1) - x) >> low_bit) & RADIX_MASK;\n"
+"#endif\n"
+"}\n"
+
+"#endif\n" // #if defined(ASC)
+
"__kernel void count(__global const T *input,\n"
" const uint input_offset,\n"
" const uint input_size,\n"
@@ -227,6 +248,7 @@ template<class T, class T2>
inline void radix_sort_impl(const buffer_iterator<T> first,
const buffer_iterator<T> last,
const buffer_iterator<T2> values_first,
+ const bool ascending,
command_queue &queue)
{
@@ -279,6 +301,10 @@ inline void radix_sort_impl(const buffer_iterator<T> first,
options << enable_double<T2>();
}
+ if(ascending){
+ options << " -DASC";
+ }
+
// load radix sort program
program radix_sort_program = cache->get_or_build(
cache_key, options.str(), radix_sort_source, context
@@ -396,18 +422,38 @@ inline void radix_sort(Iterator first,
Iterator last,
command_queue &queue)
{
- radix_sort_impl(first, last, buffer_iterator<int>(), queue);
+ radix_sort_impl(first, last, buffer_iterator<int>(), true, queue);
+}
+
+template<class KeyIterator, class ValueIterator>
+inline void radix_sort_by_key(KeyIterator keys_first,
+ KeyIterator keys_last,
+ ValueIterator values_first,
+ command_queue &queue)
+{
+ radix_sort_impl(keys_first, keys_last, values_first, true, queue);
+}
+
+template<class Iterator>
+inline void radix_sort(Iterator first,
+ Iterator last,
+ const bool ascending,
+ command_queue &queue)
+{
+ radix_sort_impl(first, last, buffer_iterator<int>(), ascending, queue);
}
template<class KeyIterator, class ValueIterator>
inline void radix_sort_by_key(KeyIterator keys_first,
KeyIterator keys_last,
ValueIterator values_first,
+ const bool ascending,
command_queue &queue)
{
- radix_sort_impl(keys_first, keys_last, values_first, queue);
+ radix_sort_impl(keys_first, keys_last, values_first, ascending, queue);
}
+
} // end detail namespace
} // end compute namespace
} // end boost namespace
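Annotation (not part of the patch): the new descending radix() variants work by transforming each key so that the transformed keys compare in reverse order; a minimal host-side check of the unsigned case:

    #include <cassert>

    int main()
    {
        typedef unsigned int T;
        const T a = 3, b = 7;       // a < b
        const T ra = (T)(-1) - a;   // transformed keys...
        const T rb = (T)(-1) - b;
        assert(ra > rb);            // ...compare in reversed order
        // applied digit-by-digit inside radix(), this makes the counting
        // sort emit descending output and removes the old reverse pass
    }
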
diff --git a/boost/compute/algorithm/detail/reduce_on_cpu.hpp b/boost/compute/algorithm/detail/reduce_on_cpu.hpp
new file mode 100644
index 0000000000..490d7d538e
--- /dev/null
+++ b/boost/compute/algorithm/detail/reduce_on_cpu.hpp
@@ -0,0 +1,110 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2016 Jakub Szuppe <j.szuppe@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_REDUCE_ON_CPU_HPP
+#define BOOST_COMPUTE_ALGORITHM_DETAIL_REDUCE_ON_CPU_HPP
+
+#include <algorithm>
+
+#include <boost/compute/buffer.hpp>
+#include <boost/compute/command_queue.hpp>
+#include <boost/compute/detail/meta_kernel.hpp>
+#include <boost/compute/detail/iterator_range_size.hpp>
+#include <boost/compute/detail/parameter_cache.hpp>
+#include <boost/compute/iterator/buffer_iterator.hpp>
+#include <boost/compute/type_traits/result_of.hpp>
+#include <boost/compute/algorithm/detail/serial_reduce.hpp>
+
+namespace boost {
+namespace compute {
+namespace detail {
+
+template<class InputIterator, class OutputIterator, class BinaryFunction>
+inline void reduce_on_cpu(InputIterator first,
+ InputIterator last,
+ OutputIterator result,
+ BinaryFunction function,
+ command_queue &queue)
+{
+ typedef typename
+ std::iterator_traits<InputIterator>::value_type T;
+ typedef typename
+ ::boost::compute::result_of<BinaryFunction(T, T)>::type result_type;
+
+ const device &device = queue.get_device();
+    const uint_ compute_units = device.compute_units();
+
+ boost::shared_ptr<parameter_cache> parameters =
+ detail::parameter_cache::get_global_cache(device);
+
+ std::string cache_key =
+ "__boost_reduce_cpu_" + boost::lexical_cast<std::string>(sizeof(T));
+
+    // for inputs smaller than serial_reduce_threshold
+    // the serial_reduce algorithm is used
+ uint_ serial_reduce_threshold =
+ parameters->get(cache_key, "serial_reduce_threshold", 16384 * sizeof(T));
+ serial_reduce_threshold =
+ (std::max)(serial_reduce_threshold, uint_(compute_units));
+
+ const context &context = queue.get_context();
+ size_t count = detail::iterator_range_size(first, last);
+ if(count == 0){
+ return;
+ }
+ else if(count < serial_reduce_threshold) {
+ return serial_reduce(first, last, result, function, queue);
+ }
+
+ meta_kernel k("reduce_on_cpu");
+ buffer output(context, sizeof(result_type) * compute_units);
+
+ size_t count_arg = k.add_arg<uint_>("count");
+ size_t output_arg =
+ k.add_arg<result_type *>(memory_object::global_memory, "output");
+
+ k <<
+ "uint block = " <<
+ "(uint)ceil(((float)count)/get_global_size(0));\n" <<
+ "uint index = get_global_id(0) * block;\n" <<
+ "uint end = min(count, index + block);\n" <<
+
+ k.decl<result_type>("result") << " = " << first[k.var<uint_>("index")] << ";\n" <<
+ "index++;\n" <<
+ "while(index < end){\n" <<
+ "result = " << function(k.var<T>("result"),
+ first[k.var<uint_>("index")]) << ";\n" <<
+ "index++;\n" <<
+ "}\n" <<
+ "output[get_global_id(0)] = result;\n";
+
+ size_t global_work_size = compute_units;
+ kernel kernel = k.compile(context);
+
+ // reduction to global_work_size elements
+ kernel.set_arg(count_arg, static_cast<uint_>(count));
+ kernel.set_arg(output_arg, output);
+ queue.enqueue_1d_range_kernel(kernel, 0, global_work_size, 0);
+
+ // final reduction
+ reduce_on_cpu(
+ make_buffer_iterator<result_type>(output),
+ make_buffer_iterator<result_type>(output, global_work_size),
+ result,
+ function,
+ queue
+ );
+}
+
+} // end detail namespace
+} // end compute namespace
+} // end boost namespace
+
+#endif // BOOST_COMPUTE_ALGORITHM_DETAIL_REDUCE_ON_CPU_HPP
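Annotation (not part of the patch): on CPU devices boost::compute::reduce() now ends up in this blocked reduction -- one work-item per compute unit, then a recursive pass over the partial results. A usage sketch, assuming the default device is a CPU:

    #include <boost/compute/algorithm/fill.hpp>
    #include <boost/compute/algorithm/reduce.hpp>
    #include <boost/compute/container/vector.hpp>
    #include <boost/compute/core.hpp>

    namespace compute = boost::compute;

    int main()
    {
        compute::device cpu = compute::system::default_device(); // assume a CPU device
        compute::context ctx(cpu);
        compute::command_queue queue(ctx, cpu);

        compute::vector<int> v(100000, ctx);
        compute::fill(v.begin(), v.end(), 1, queue);

        int sum = 0;
        compute::reduce(v.begin(), v.end(), &sum, queue); // defaults to plus<int>
        // sum == 100000
    }
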
diff --git a/boost/compute/algorithm/detail/scan_on_cpu.hpp b/boost/compute/algorithm/detail/scan_on_cpu.hpp
index 6611c0ba3e..d81117c65f 100644
--- a/boost/compute/algorithm/detail/scan_on_cpu.hpp
+++ b/boost/compute/algorithm/detail/scan_on_cpu.hpp
@@ -1,5 +1,5 @@
//---------------------------------------------------------------------------//
-// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
+// Copyright (c) 2016 Jakub Szuppe <j.szuppe@gmail.com>
//
// Distributed under the Boost Software License, Version 1.0
// See accompanying file LICENSE_1_0.txt or copy at
@@ -16,8 +16,10 @@
#include <boost/compute/device.hpp>
#include <boost/compute/kernel.hpp>
#include <boost/compute/command_queue.hpp>
+#include <boost/compute/algorithm/detail/serial_scan.hpp>
#include <boost/compute/detail/meta_kernel.hpp>
#include <boost/compute/detail/iterator_range_size.hpp>
+#include <boost/compute/detail/parameter_cache.hpp>
namespace boost {
namespace compute {
@@ -32,68 +34,170 @@ inline OutputIterator scan_on_cpu(InputIterator first,
BinaryOperator op,
command_queue &queue)
{
- if(first == last){
- return result;
- }
-
typedef typename
std::iterator_traits<InputIterator>::value_type input_type;
typedef typename
std::iterator_traits<OutputIterator>::value_type output_type;
const context &context = queue.get_context();
+ const device &device = queue.get_device();
+    const size_t compute_units = device.compute_units();
+
+ boost::shared_ptr<parameter_cache> parameters =
+ detail::parameter_cache::get_global_cache(device);
+
+ std::string cache_key =
+ "__boost_scan_cpu_" + boost::lexical_cast<std::string>(sizeof(T));
+
+    // for inputs smaller than serial_scan_threshold
+    // the serial_scan algorithm is used
+ uint_ serial_scan_threshold =
+ parameters->get(cache_key, "serial_scan_threshold", 16384 * sizeof(T));
+ serial_scan_threshold =
+ (std::max)(serial_scan_threshold, uint_(compute_units));
+
+ size_t count = detail::iterator_range_size(first, last);
+ if(count == 0){
+ return result;
+ }
+ else if(count < serial_scan_threshold) {
+ return serial_scan(first, last, result, exclusive, init, op, queue);
+ }
+
+    buffer block_partial_sums(context, sizeof(output_type) * compute_units);
// create scan kernel
- meta_kernel k("scan_on_cpu");
+ meta_kernel k("scan_on_cpu_block_scan");
// Arguments
- size_t n_arg = k.add_arg<ulong_>("n");
+ size_t count_arg = k.add_arg<uint_>("count");
size_t init_arg = k.add_arg<output_type>("initial_value");
+ size_t block_partial_sums_arg =
+ k.add_arg<output_type *>(memory_object::global_memory, "block_partial_sums");
+
+ k <<
+ "uint block = " <<
+ "(uint)ceil(((float)count)/(get_global_size(0) + 1));\n" <<
+ "uint index = get_global_id(0) * block;\n" <<
+ "uint end = min(count, index + block);\n";
if(!exclusive){
k <<
- k.decl<const ulong_>("start_idx") << " = 1;\n" <<
- k.decl<output_type>("sum") << " = " << first[0] << ";\n" <<
- result[0] << " = sum;\n";
+ k.decl<output_type>("sum") << " = " <<
+ first[k.var<uint_>("index")] << ";\n" <<
+ result[k.var<uint_>("index")] << " = sum;\n" <<
+ "index++;\n";
}
else {
k <<
- k.decl<const ulong_>("start_idx") << " = 0;\n" <<
- k.decl<output_type>("sum") << " = initial_value;\n";
+ k.decl<output_type>("sum") << ";\n" <<
+ "if(index == 0){\n" <<
+ "sum = initial_value;\n" <<
+ "}\n" <<
+ "else {\n" <<
+ "sum = " << first[k.var<uint_>("index")] << ";\n" <<
+ "index++;\n" <<
+ "}\n";
}
k <<
- "for(ulong i = start_idx; i < n; i++){\n" <<
- k.decl<const input_type>("x") << " = "
- << first[k.var<ulong_>("i")] << ";\n";
+ "while(index < end){\n" <<
+ // load next value
+ k.decl<const input_type>("value") << " = "
+ << first[k.var<uint_>("index")] << ";\n";
if(exclusive){
- k << result[k.var<ulong_>("i")] << " = sum;\n";
+ k <<
+ "if(get_global_id(0) == 0){\n" <<
+ result[k.var<uint_>("index")] << " = sum;\n" <<
+ "}\n";
}
-
- k << " sum = "
- << op(k.var<output_type>("sum"), k.var<output_type>("x"))
- << ";\n";
+ k <<
+ "sum = " << op(k.var<output_type>("sum"),
+ k.var<output_type>("value")) << ";\n";
if(!exclusive){
- k << result[k.var<ulong_>("i")] << " = sum;\n";
+ k <<
+ "if(get_global_id(0) == 0){\n" <<
+ result[k.var<uint_>("index")] << " = sum;\n" <<
+ "}\n";
+ }
+
+ k <<
+ "index++;\n" <<
+ "}\n" << // end while
+ "block_partial_sums[get_global_id(0)] = sum;\n";
+
+ // compile scan kernel
+ kernel block_scan_kernel = k.compile(context);
+
+ // setup kernel arguments
+ block_scan_kernel.set_arg(count_arg, static_cast<uint_>(count));
+ block_scan_kernel.set_arg(init_arg, static_cast<output_type>(init));
+ block_scan_kernel.set_arg(block_partial_sums_arg, block_partial_sums);
+
+ // execute the kernel
+ size_t global_work_size = compute_units;
+ queue.enqueue_1d_range_kernel(block_scan_kernel, 0, global_work_size, 0);
+
+ // scan is done
+ if(compute_units < 2) {
+ return result + count;
+ }
+
+ // final scan kernel
+ meta_kernel l("scan_on_cpu_final_scan");
+
+ // Arguments
+ count_arg = l.add_arg<uint_>("count");
+ block_partial_sums_arg =
+ l.add_arg<output_type *>(memory_object::global_memory, "block_partial_sums");
+
+ l <<
+ "uint block = " <<
+ "(uint)ceil(((float)count)/(get_global_size(0) + 1));\n" <<
+ "uint index = block + get_global_id(0) * block;\n" <<
+ "uint end = min(count, index + block);\n" <<
+
+ k.decl<output_type>("sum") << " = block_partial_sums[0];\n" <<
+ "for(uint i = 0; i < get_global_id(0); i++) {\n" <<
+ "sum = " << op(k.var<output_type>("sum"),
+ k.var<output_type>("block_partial_sums[i + 1]")) << ";\n" <<
+ "}\n" <<
+
+ "while(index < end){\n";
+ if(exclusive){
+ l <<
+ l.decl<output_type>("value") << " = "
+                << first[l.var<uint_>("index")] << ";\n" <<
+            result[l.var<uint_>("index")] << " = sum;\n" <<
+            "sum = " << op(l.var<output_type>("sum"),
+                           l.var<output_type>("value")) << ";\n";
+ }
+ else {
+ l <<
+ "sum = " << op(k.var<output_type>("sum"),
+ first[k.var<uint_>("index")]) << ";\n" <<
+ result[k.var<uint_>("index")] << " = sum;\n";
}
+ l <<
+ "index++;\n" <<
+ "}\n";
- k << "}\n";
// compile scan kernel
- kernel scan_kernel = k.compile(context);
+ kernel final_scan_kernel = l.compile(context);
// setup kernel arguments
- size_t n = detail::iterator_range_size(first, last);
- scan_kernel.set_arg<ulong_>(n_arg, n);
- scan_kernel.set_arg<output_type>(init_arg, static_cast<output_type>(init));
+ final_scan_kernel.set_arg(count_arg, static_cast<uint_>(count));
+ final_scan_kernel.set_arg(block_partial_sums_arg, block_partial_sums);
// execute the kernel
- queue.enqueue_1d_range_kernel(scan_kernel, 0, 1, 1);
+ global_work_size = compute_units;
+ queue.enqueue_1d_range_kernel(final_scan_kernel, 0, global_work_size, 0);
// return iterator pointing to the end of the result range
- return result + n;
+ return result + count;
}
} // end detail namespace
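Annotation (not part of the patch): a plain C++ model of the two-kernel scheme above, with made-up sizes -- pass 1 scans one block per worker and records its partial sum, pass 2 rescans blocks 1..N seeded with the combined partials (shown for the inclusive case):

    #include <algorithm>
    #include <cassert>
    #include <cstddef>
    #include <vector>

    int main()
    {
        std::vector<int> in(10), out(10);
        for(std::size_t i = 0; i < in.size(); i++) in[i] = int(i);

        const std::size_t workers = 3; // stand-in for the device's compute units
        const std::size_t n = in.size();
        const std::size_t block = (n + workers) / (workers + 1); // ceil(n / (workers + 1))
        std::vector<int> partial(workers);

        // pass 1 (scan_on_cpu_block_scan): each worker scans one block;
        // only worker 0 writes results, all record their block's sum
        for(std::size_t g = 0; g < workers; g++){
            std::size_t i = g * block, end = (std::min)(n, i + block);
            int sum = in[i++];
            if(g == 0) out[i - 1] = sum;
            while(i < end){
                sum += in[i];
                if(g == 0) out[i] = sum;
                i++;
            }
            partial[g] = sum;
        }

        // pass 2 (scan_on_cpu_final_scan): workers rescan blocks 1..workers,
        // seeded with the combined partial sums of all preceding blocks
        for(std::size_t g = 0; g < workers; g++){
            std::size_t i = block + g * block, end = (std::min)(n, i + block);
            int sum = partial[0];
            for(std::size_t j = 0; j < g; j++) sum += partial[j + 1];
            while(i < end){
                sum += in[i];
                out[i] = sum;
                i++;
            }
        }

        assert(out[0] == in[0]);
        for(std::size_t i = 1; i < n; i++) assert(out[i] == out[i - 1] + in[i]);
    }
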
diff --git a/boost/compute/algorithm/detail/scan_on_gpu.hpp b/boost/compute/algorithm/detail/scan_on_gpu.hpp
index 07c6d6d3c0..dcaff2cdf3 100644
--- a/boost/compute/algorithm/detail/scan_on_gpu.hpp
+++ b/boost/compute/algorithm/detail/scan_on_gpu.hpp
@@ -14,7 +14,6 @@
#include <boost/compute/kernel.hpp>
#include <boost/compute/detail/meta_kernel.hpp>
#include <boost/compute/command_queue.hpp>
-#include <boost/compute/algorithm/detail/scan_on_cpu.hpp>
#include <boost/compute/container/vector.hpp>
#include <boost/compute/detail/iterator_range_size.hpp>
#include <boost/compute/memory/local_buffer.hpp>
diff --git a/boost/compute/algorithm/detail/search_all.hpp b/boost/compute/algorithm/detail/search_all.hpp
index a874bcdebe..b091ec7de5 100644
--- a/boost/compute/algorithm/detail/search_all.hpp
+++ b/boost/compute/algorithm/detail/search_all.hpp
@@ -48,7 +48,7 @@ public:
*this <<
"uint i = get_global_id(0);\n" <<
- "uint i1 = i;\n" <<
+ "const uint i1 = i;\n" <<
"uint j;\n" <<
"for(j = 0; j<p_count; j++,i++)\n" <<
"{\n" <<
diff --git a/boost/compute/algorithm/detail/serial_scan.hpp b/boost/compute/algorithm/detail/serial_scan.hpp
new file mode 100644
index 0000000000..7590fd94fb
--- /dev/null
+++ b/boost/compute/algorithm/detail/serial_scan.hpp
@@ -0,0 +1,103 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_SERIAL_SCAN_HPP
+#define BOOST_COMPUTE_ALGORITHM_DETAIL_SERIAL_SCAN_HPP
+
+#include <iterator>
+
+#include <boost/compute/device.hpp>
+#include <boost/compute/kernel.hpp>
+#include <boost/compute/command_queue.hpp>
+#include <boost/compute/detail/meta_kernel.hpp>
+#include <boost/compute/detail/iterator_range_size.hpp>
+
+namespace boost {
+namespace compute {
+namespace detail {
+
+template<class InputIterator, class OutputIterator, class T, class BinaryOperator>
+inline OutputIterator serial_scan(InputIterator first,
+ InputIterator last,
+ OutputIterator result,
+ bool exclusive,
+ T init,
+ BinaryOperator op,
+ command_queue &queue)
+{
+ if(first == last){
+ return result;
+ }
+
+ typedef typename
+ std::iterator_traits<InputIterator>::value_type input_type;
+ typedef typename
+ std::iterator_traits<OutputIterator>::value_type output_type;
+
+ const context &context = queue.get_context();
+
+ // create scan kernel
+ meta_kernel k("serial_scan");
+
+ // Arguments
+ size_t n_arg = k.add_arg<ulong_>("n");
+ size_t init_arg = k.add_arg<output_type>("initial_value");
+
+ if(!exclusive){
+ k <<
+ k.decl<const ulong_>("start_idx") << " = 1;\n" <<
+ k.decl<output_type>("sum") << " = " << first[0] << ";\n" <<
+ result[0] << " = sum;\n";
+ }
+ else {
+ k <<
+ k.decl<const ulong_>("start_idx") << " = 0;\n" <<
+ k.decl<output_type>("sum") << " = initial_value;\n";
+ }
+
+ k <<
+ "for(ulong i = start_idx; i < n; i++){\n" <<
+ k.decl<const input_type>("x") << " = "
+ << first[k.var<ulong_>("i")] << ";\n";
+
+ if(exclusive){
+ k << result[k.var<ulong_>("i")] << " = sum;\n";
+ }
+
+ k << " sum = "
+ << op(k.var<output_type>("sum"), k.var<output_type>("x"))
+ << ";\n";
+
+ if(!exclusive){
+ k << result[k.var<ulong_>("i")] << " = sum;\n";
+ }
+
+ k << "}\n";
+
+ // compile scan kernel
+ kernel scan_kernel = k.compile(context);
+
+ // setup kernel arguments
+ size_t n = detail::iterator_range_size(first, last);
+ scan_kernel.set_arg<ulong_>(n_arg, n);
+ scan_kernel.set_arg<output_type>(init_arg, static_cast<output_type>(init));
+
+ // execute the kernel
+ queue.enqueue_1d_range_kernel(scan_kernel, 0, 1, 1);
+
+ // return iterator pointing to the end of the result range
+ return result + n;
+}
+
+} // end detail namespace
+} // end compute namespace
+} // end boost namespace
+
+#endif // BOOST_COMPUTE_ALGORITHM_DETAIL_SERIAL_SCAN_HPP
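Annotation (not part of the patch): serial_scan is the single-work-item fallback behind both scan flavours; for reference, a usage sketch of the two behaviours it implements:

    #include <boost/compute/algorithm/exclusive_scan.hpp>
    #include <boost/compute/algorithm/inclusive_scan.hpp>
    #include <boost/compute/container/vector.hpp>
    #include <boost/compute/core.hpp>

    namespace compute = boost::compute;

    int main()
    {
        compute::command_queue queue = compute::system::default_queue();

        int data[] = {1, 2, 3, 4};
        compute::vector<int> v(data, data + 4, queue);
        compute::vector<int> r(4, queue.get_context());

        compute::inclusive_scan(v.begin(), v.end(), r.begin(), queue);
        // r = {1, 3, 6, 10} -- each element includes itself

        compute::exclusive_scan(v.begin(), v.end(), r.begin(), queue);
        // r = {0, 1, 3, 6}  -- shifted by one, seeded with init (default 0)
    }
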
diff --git a/boost/compute/algorithm/find_end.hpp b/boost/compute/algorithm/find_end.hpp
index 5c40055113..265a1da542 100644
--- a/boost/compute/algorithm/find_end.hpp
+++ b/boost/compute/algorithm/find_end.hpp
@@ -36,6 +36,7 @@ inline InputIterator find_end_helper(InputIterator first,
command_queue &queue)
{
typedef typename std::iterator_traits<InputIterator>::value_type value_type;
+ typedef typename std::iterator_traits<InputIterator>::difference_type difference_type;
size_t count = detail::iterator_range_size(first, last);
if(count == 0){
@@ -65,8 +66,13 @@ inline InputIterator find_end_helper(InputIterator first,
queue.enqueue_1d_range_kernel(kernel, 0, count, 0);
int result = static_cast<int>(index.read(queue));
- if(result == -1) return last;
- else return first + result;
+
+ if(result == -1){
+ return last;
+ }
+ else {
+ return first + static_cast<difference_type>(result);
+ }
}
} // end detail namespace
@@ -92,8 +98,13 @@ inline TextIterator find_end(TextIterator t_first,
command_queue &queue = system::default_queue())
{
const context &context = queue.get_context();
- vector<uint_> matching_indices(detail::iterator_range_size(t_first, t_last),
- context);
+
+    // no need to check whether the pattern starts within the last
+    // (pattern size - 1) positions, where it could not fully fit
+ vector<uint_> matching_indices(
+ detail::iterator_range_size(t_first, t_last)
+ + 1 - detail::iterator_range_size(p_first, p_last),
+ context
+ );
detail::search_kernel<PatternIterator,
TextIterator,
@@ -105,10 +116,16 @@ inline TextIterator find_end(TextIterator t_first,
using boost::compute::_1;
vector<uint_>::iterator index =
- detail::find_end_helper(matching_indices.begin(),
- matching_indices.end(),
- _1 == 1,
- queue);
+ detail::find_end_helper(
+ matching_indices.begin(),
+ matching_indices.end(),
+ _1 == 1,
+ queue
+ );
+
+ // pattern was not found
+ if(index == matching_indices.end())
+ return t_last;
return t_first + detail::iterator_range_size(matching_indices.begin(), index);
}
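Annotation (not part of the patch): a usage sketch of find_end with the new early return -- it yields an iterator to the start of the last match, or t_last when the pattern never occurs:

    #include <boost/compute/algorithm/find_end.hpp>
    #include <boost/compute/container/vector.hpp>
    #include <boost/compute/core.hpp>

    namespace compute = boost::compute;

    int main()
    {
        compute::command_queue queue = compute::system::default_queue();

        int text[] = {1, 2, 3, 1, 2, 3, 4};
        int pat[]  = {1, 2, 3};
        compute::vector<int> t(text, text + 7, queue);
        compute::vector<int> p(pat, pat + 3, queue);

        compute::vector<int>::iterator it =
            compute::find_end(t.begin(), t.end(), p.begin(), p.end(), queue);
        // it - t.begin() == 3 (the last occurrence); it == t.end() when absent
    }
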
diff --git a/boost/compute/algorithm/gather.hpp b/boost/compute/algorithm/gather.hpp
index b2f725d54e..24c5c727ae 100644
--- a/boost/compute/algorithm/gather.hpp
+++ b/boost/compute/algorithm/gather.hpp
@@ -36,7 +36,6 @@ public:
OutputIterator result)
{
m_count = iterator_range_size(first, last);
- m_offset = first.get_index();
*this <<
"const uint i = get_global_id(0);\n" <<
@@ -50,12 +49,11 @@ public:
return event();
}
- return exec_1d(queue, m_offset, m_count);
+ return exec_1d(queue, 0, m_count);
}
private:
size_t m_count;
- size_t m_offset;
};
} // end detail namespace
diff --git a/boost/compute/algorithm/minmax_element.hpp b/boost/compute/algorithm/minmax_element.hpp
index bf32c3c989..3f44c09eaf 100644
--- a/boost/compute/algorithm/minmax_element.hpp
+++ b/boost/compute/algorithm/minmax_element.hpp
@@ -49,7 +49,7 @@ minmax_element(InputIterator first,
}
///\overload
-template<class InputIterator, class Compare>
+template<class InputIterator>
inline std::pair<InputIterator, InputIterator>
minmax_element(InputIterator first,
InputIterator last,
diff --git a/boost/compute/algorithm/reduce.hpp b/boost/compute/algorithm/reduce.hpp
index 79624a0e50..19d070019f 100644
--- a/boost/compute/algorithm/reduce.hpp
+++ b/boost/compute/algorithm/reduce.hpp
@@ -22,7 +22,7 @@
#include <boost/compute/algorithm/copy_n.hpp>
#include <boost/compute/algorithm/detail/inplace_reduce.hpp>
#include <boost/compute/algorithm/detail/reduce_on_gpu.hpp>
-#include <boost/compute/algorithm/detail/serial_reduce.hpp>
+#include <boost/compute/algorithm/detail/reduce_on_cpu.hpp>
#include <boost/compute/detail/iterator_range_size.hpp>
#include <boost/compute/memory/local_buffer.hpp>
#include <boost/compute/type_traits/result_of.hpp>
@@ -173,8 +173,8 @@ inline void generic_reduce(InputIterator first,
size_t count = detail::iterator_range_size(first, last);
if(device.type() & device::cpu){
- boost::compute::vector<result_type> value(1, context);
- detail::serial_reduce(first, last, value.begin(), function, queue);
+ array<result_type, 1> value(context);
+ detail::reduce_on_cpu(first, last, value.begin(), function, queue);
boost::compute::copy_n(value.begin(), 1, result, queue);
}
else {
@@ -209,16 +209,16 @@ inline void dispatch_reduce(InputIterator first,
const device &device = queue.get_device();
// reduce to temporary buffer on device
- array<T, 1> tmp(context);
+ array<T, 1> value(context);
if(device.type() & device::cpu){
- detail::serial_reduce(first, last, tmp.begin(), function, queue);
+ detail::reduce_on_cpu(first, last, value.begin(), function, queue);
}
else {
- reduce_on_gpu(first, last, tmp.begin(), function, queue);
+ reduce_on_gpu(first, last, value.begin(), function, queue);
}
// copy to result iterator
- copy_n(tmp.begin(), 1, result, queue);
+ copy_n(value.begin(), 1, result, queue);
}
template<class InputIterator, class OutputIterator, class BinaryFunction>
diff --git a/boost/compute/algorithm/sort.hpp b/boost/compute/algorithm/sort.hpp
index b2730b3e2b..7e0a583e3e 100644
--- a/boost/compute/algorithm/sort.hpp
+++ b/boost/compute/algorithm/sort.hpp
@@ -18,6 +18,7 @@
#include <boost/compute/system.hpp>
#include <boost/compute/command_queue.hpp>
#include <boost/compute/algorithm/detail/merge_sort_on_cpu.hpp>
+#include <boost/compute/algorithm/detail/merge_sort_on_gpu.hpp>
#include <boost/compute/algorithm/detail/radix_sort.hpp>
#include <boost/compute/algorithm/detail/insertion_sort.hpp>
#include <boost/compute/algorithm/reverse.hpp>
@@ -74,11 +75,8 @@ inline void dispatch_gpu_sort(buffer_iterator<T> first,
);
}
else {
- // radix sort in ascending order
- ::boost::compute::detail::radix_sort(first, last, queue);
-
- // reverse range to descending order
- ::boost::compute::reverse(first, last, queue);
+ // radix sorts in descending order
+ ::boost::compute::detail::radix_sort(first, last, false, queue);
}
}
@@ -88,9 +86,22 @@ inline void dispatch_gpu_sort(Iterator first,
Compare compare,
command_queue &queue)
{
- ::boost::compute::detail::serial_insertion_sort(
- first, last, compare, queue
- );
+ size_t count = detail::iterator_range_size(first, last);
+
+ if(count < 2){
+ // nothing to do
+ return;
+ }
+ else if(count <= 32){
+ ::boost::compute::detail::serial_insertion_sort(
+ first, last, compare, queue
+ );
+ }
+ else {
+ ::boost::compute::detail::merge_sort_on_gpu(
+ first, last, compare, queue
+ );
+ }
}
// sort() for device iterators
diff --git a/boost/compute/algorithm/sort_by_key.hpp b/boost/compute/algorithm/sort_by_key.hpp
index 0e3dba81eb..c39bcf9890 100644
--- a/boost/compute/algorithm/sort_by_key.hpp
+++ b/boost/compute/algorithm/sort_by_key.hpp
@@ -18,6 +18,7 @@
#include <boost/compute/system.hpp>
#include <boost/compute/command_queue.hpp>
#include <boost/compute/algorithm/detail/merge_sort_on_cpu.hpp>
+#include <boost/compute/algorithm/detail/merge_sort_on_gpu.hpp>
#include <boost/compute/algorithm/detail/insertion_sort.hpp>
#include <boost/compute/algorithm/detail/radix_sort.hpp>
#include <boost/compute/algorithm/reverse.hpp>
@@ -76,14 +77,10 @@ dispatch_gpu_sort_by_key(KeyIterator keys_first,
);
}
else {
- // radix sorts in ascending order
+ // radix sorts in descending order
detail::radix_sort_by_key(
- keys_first, keys_last, values_first, queue
+ keys_first, keys_last, values_first, false, queue
);
-
- // Reverse keys, values for descending order
- ::boost::compute::reverse(keys_first, keys_last, queue);
- ::boost::compute::reverse(values_first, values_first + count, queue);
}
}
@@ -94,9 +91,17 @@ inline void dispatch_gpu_sort_by_key(KeyIterator keys_first,
Compare compare,
command_queue &queue)
{
- detail::serial_insertion_sort_by_key(
- keys_first, keys_last, values_first, compare, queue
- );
+ size_t count = detail::iterator_range_size(keys_first, keys_last);
+
+ if(count < 32){
+ detail::serial_insertion_sort_by_key(
+ keys_first, keys_last, values_first, compare, queue
+ );
+ } else {
+ detail::merge_sort_by_key_on_gpu(
+ keys_first, keys_last, values_first, compare, queue
+ );
+ }
}
template<class KeyIterator, class ValueIterator, class Compare>
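Annotation (not part of the patch): with greater<T> as the comparator, keys are now radix-sorted directly in descending order, with no reverse pass over keys and values afterwards. A usage sketch with made-up sizes and fill values:

    #include <boost/compute/algorithm/fill.hpp>
    #include <boost/compute/algorithm/iota.hpp>
    #include <boost/compute/algorithm/sort_by_key.hpp>
    #include <boost/compute/container/vector.hpp>
    #include <boost/compute/core.hpp>
    #include <boost/compute/functional.hpp>

    namespace compute = boost::compute;

    int main()
    {
        compute::command_queue queue = compute::system::default_queue();
        compute::context ctx = queue.get_context();

        compute::vector<int>   keys(1000, ctx);
        compute::vector<float> vals(1000, ctx);
        compute::iota(keys.begin(), keys.end(), 0, queue); // 0, 1, 2, ...
        compute::fill(vals.begin(), vals.end(), 1.0f, queue);

        // keys end up 999, 998, ...; each value follows its key
        compute::sort_by_key(keys.begin(), keys.end(), vals.begin(),
                             compute::greater<int>(), queue);
    }
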
diff --git a/boost/compute/algorithm/stable_sort.hpp b/boost/compute/algorithm/stable_sort.hpp
index cd82a0a606..381fc81bc0 100644
--- a/boost/compute/algorithm/stable_sort.hpp
+++ b/boost/compute/algorithm/stable_sort.hpp
@@ -16,10 +16,12 @@
#include <boost/compute/system.hpp>
#include <boost/compute/command_queue.hpp>
#include <boost/compute/algorithm/detail/merge_sort_on_cpu.hpp>
+#include <boost/compute/algorithm/detail/merge_sort_on_gpu.hpp>
#include <boost/compute/algorithm/detail/radix_sort.hpp>
#include <boost/compute/algorithm/detail/insertion_sort.hpp>
#include <boost/compute/algorithm/reverse.hpp>
#include <boost/compute/functional/operator.hpp>
+#include <boost/compute/detail/iterator_range_size.hpp>
namespace boost {
namespace compute {
@@ -31,9 +33,17 @@ inline void dispatch_gpu_stable_sort(Iterator first,
Compare compare,
command_queue &queue)
{
- ::boost::compute::detail::serial_insertion_sort(
- first, last, compare, queue
- );
+ size_t count = detail::iterator_range_size(first, last);
+
+ if(count < 32){
+ detail::serial_insertion_sort(
+ first, last, compare, queue
+ );
+ } else {
+ detail::merge_sort_on_gpu(
+ first, last, compare, true /* stable */, queue
+ );
+ }
}
template<class T>
@@ -53,11 +63,8 @@ dispatch_gpu_stable_sort(buffer_iterator<T> first,
greater<T>,
command_queue &queue)
{
- // radix sort in ascending order
- ::boost::compute::detail::radix_sort(first, last, queue);
-
- // reverse range to descending order
- ::boost::compute::reverse(first, last, queue);
+ // radix sorts in descending order
+ ::boost::compute::detail::radix_sort(first, last, false, queue);
}
} // end detail namespace
@@ -76,6 +83,7 @@ inline void stable_sort(Iterator first,
::boost::compute::detail::dispatch_gpu_stable_sort(
first, last, compare, queue
);
+ return;
}
::boost::compute::detail::merge_sort_on_cpu(first, last, compare, queue);
}
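Annotation (not part of the patch): on GPUs a user-defined comparator now reaches the stable merge sort once the range holds 32 or more elements. A sketch with a hypothetical abs_less comparator (a tiny input is used only so the result is readable):

    #include <boost/compute/algorithm/stable_sort.hpp>
    #include <boost/compute/container/vector.hpp>
    #include <boost/compute/core.hpp>
    #include <boost/compute/function.hpp>

    namespace compute = boost::compute;

    int main()
    {
        compute::command_queue queue = compute::system::default_queue();

        int data[] = {-3, 1, -1, 3, -2, 2};
        compute::vector<int> v(data, data + 6, queue);

        // hypothetical comparator: order by absolute value; elements of
        // equal magnitude keep their input order because the sort is stable
        BOOST_COMPUTE_FUNCTION(bool, abs_less, (int a, int b),
        {
            return abs(a) < abs(b);
        });

        compute::stable_sort(v.begin(), v.end(), abs_less, queue);
        // v = {1, -1, -2, 2, -3, 3}
    }
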
diff --git a/boost/compute/algorithm/stable_sort_by_key.hpp b/boost/compute/algorithm/stable_sort_by_key.hpp
index 8a51372ede..878f999f44 100644
--- a/boost/compute/algorithm/stable_sort_by_key.hpp
+++ b/boost/compute/algorithm/stable_sort_by_key.hpp
@@ -21,6 +21,105 @@
namespace boost {
namespace compute {
+namespace detail {
+
+template<class KeyIterator, class ValueIterator>
+inline void
+dispatch_gpu_ssort_by_key(KeyIterator keys_first,
+ KeyIterator keys_last,
+ ValueIterator values_first,
+ less<typename std::iterator_traits<KeyIterator>::value_type> compare,
+ command_queue &queue,
+ typename boost::enable_if_c<
+ is_radix_sortable<
+ typename std::iterator_traits<KeyIterator>::value_type
+ >::value
+ >::type* = 0)
+{
+ size_t count = detail::iterator_range_size(keys_first, keys_last);
+
+ if(count < 32){
+ detail::serial_insertion_sort_by_key(
+ keys_first, keys_last, values_first, compare, queue
+ );
+ }
+ else {
+ detail::radix_sort_by_key(
+ keys_first, keys_last, values_first, queue
+ );
+ }
+}
+
+template<class KeyIterator, class ValueIterator>
+inline void
+dispatch_gpu_ssort_by_key(KeyIterator keys_first,
+ KeyIterator keys_last,
+ ValueIterator values_first,
+ greater<typename std::iterator_traits<KeyIterator>::value_type> compare,
+ command_queue &queue,
+ typename boost::enable_if_c<
+ is_radix_sortable<
+ typename std::iterator_traits<KeyIterator>::value_type
+ >::value
+ >::type* = 0)
+{
+ size_t count = detail::iterator_range_size(keys_first, keys_last);
+
+ if(count < 32){
+ detail::serial_insertion_sort_by_key(
+ keys_first, keys_last, values_first, compare, queue
+ );
+ }
+ else {
+ // radix sorts in descending order
+ detail::radix_sort_by_key(
+ keys_first, keys_last, values_first, false, queue
+ );
+ }
+}
+
+template<class KeyIterator, class ValueIterator, class Compare>
+inline void dispatch_gpu_ssort_by_key(KeyIterator keys_first,
+ KeyIterator keys_last,
+ ValueIterator values_first,
+ Compare compare,
+ command_queue &queue)
+{
+ size_t count = detail::iterator_range_size(keys_first, keys_last);
+
+ if(count < 32){
+ detail::serial_insertion_sort_by_key(
+ keys_first, keys_last, values_first,
+ compare, queue
+ );
+ } else {
+ detail::merge_sort_by_key_on_gpu(
+ keys_first, keys_last, values_first,
+ compare, true /* stable */, queue
+ );
+ }
+}
+
+template<class KeyIterator, class ValueIterator, class Compare>
+inline void dispatch_ssort_by_key(KeyIterator keys_first,
+ KeyIterator keys_last,
+ ValueIterator values_first,
+ Compare compare,
+ command_queue &queue)
+{
+ if(queue.get_device().type() & device::gpu) {
+ dispatch_gpu_ssort_by_key(
+ keys_first, keys_last, values_first, compare, queue
+ );
+ return;
+ }
+ ::boost::compute::detail::merge_sort_by_key_on_cpu(
+ keys_first, keys_last, values_first, compare, queue
+ );
+}
+
+} // end detail namespace
+
/// Performs a key-value stable sort using the keys in the range [\p keys_first,
/// \p keys_last) on the values in the range [\p values_first,
/// \p values_first \c + (\p keys_last \c - \p keys_first)) using \p compare.
@@ -35,8 +134,7 @@ inline void stable_sort_by_key(KeyIterator keys_first,
Compare compare,
command_queue &queue = system::default_queue())
{
- // sort_by_key is stable
- ::boost::compute::sort_by_key(
+ ::boost::compute::detail::dispatch_ssort_by_key(
keys_first, keys_last, values_first, compare, queue
);
}
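Annotation (not part of the patch): stable_sort_by_key now routes through the dispatchers above rather than plain sort_by_key, whose GPU merge path is the non-stable variant. A usage sketch:

    #include <boost/compute/algorithm/stable_sort_by_key.hpp>
    #include <boost/compute/container/vector.hpp>
    #include <boost/compute/core.hpp>

    namespace compute = boost::compute;

    int main()
    {
        compute::command_queue queue = compute::system::default_queue();

        int   k[] = {2, 1, 2, 1};
        float v[] = {0.f, 1.f, 2.f, 3.f};
        compute::vector<int>   keys(k, k + 4, queue);
        compute::vector<float> vals(v, v + 4, queue);

        compute::stable_sort_by_key(keys.begin(), keys.end(), vals.begin(), queue);
        // keys = {1, 1, 2, 2}; vals = {1.f, 3.f, 0.f, 2.f} -- ties keep order
    }
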
diff --git a/boost/compute/algorithm/transform.hpp b/boost/compute/algorithm/transform.hpp
index 022a4988bd..68750a6523 100644
--- a/boost/compute/algorithm/transform.hpp
+++ b/boost/compute/algorithm/transform.hpp
@@ -22,7 +22,7 @@ namespace boost {
namespace compute {
/// Transforms the elements in the range [\p first, \p last) using
-/// \p transform and stores the results in the range beginning at
+/// operator \p op and stores the results in the range beginning at
/// \p result.
///
/// For example, to calculate the absolute value for each element in a vector:
diff --git a/boost/compute/config.hpp b/boost/compute/config.hpp
index 77d0d7b9df..e501ed28c9 100644
--- a/boost/compute/config.hpp
+++ b/boost/compute/config.hpp
@@ -16,14 +16,13 @@
#include <boost/compute/cl.hpp>
// check for minimum required boost version
-#if BOOST_VERSION < 104800
-#error Boost.Compute requires Boost version 1.48 or later
+#if BOOST_VERSION < 105400
+#error Boost.Compute requires Boost version 1.54 or later
#endif
// the BOOST_COMPUTE_NO_VARIADIC_TEMPLATES macro is defined
// if the compiler does not *fully* support variadic templates
#if defined(BOOST_NO_CXX11_VARIADIC_TEMPLATES) || \
- defined(BOOST_NO_VARIADIC_TEMPLATES) || \
(defined(__GNUC__) && !defined(__clang__) && \
__GNUC__ == 4 && __GNUC_MINOR__ <= 6)
#define BOOST_COMPUTE_NO_VARIADIC_TEMPLATES
@@ -32,7 +31,6 @@
// the BOOST_COMPUTE_NO_STD_TUPLE macro is defined if the
// compiler/stdlib does not support std::tuple
#if defined(BOOST_NO_CXX11_HDR_TUPLE) || \
- defined(BOOST_NO_0X_HDR_TUPLE) || \
defined(BOOST_COMPUTE_NO_VARIADIC_TEMPLATES)
#define BOOST_COMPUTE_NO_STD_TUPLE
#endif // BOOST_NO_CXX11_HDR_TUPLE
@@ -53,17 +51,15 @@
#endif
#if !defined(BOOST_COMPUTE_DOXYGEN_INVOKED) && \
- (defined(BOOST_NO_CXX11_RVALUE_REFERENCES) || defined(BOOST_NO_RVALUE_REFERENCES))
+ defined(BOOST_NO_CXX11_RVALUE_REFERENCES)
# define BOOST_COMPUTE_NO_RVALUE_REFERENCES
#endif // BOOST_NO_CXX11_RVALUE_REFERENCES
-#if defined(BOOST_NO_CXX11_HDR_INITIALIZER_LIST) || \
- defined(BOOST_NO_0X_HDR_INITIALIZER_LIST)
+#if defined(BOOST_NO_CXX11_HDR_INITIALIZER_LIST)
# define BOOST_COMPUTE_NO_HDR_INITIALIZER_LIST
#endif // BOOST_NO_CXX11_HDR_INITIALIZER_LIST
-#if defined(BOOST_NO_CXX11_HDR_CHRONO) || \
- defined(BOOST_NO_0X_HDR_CHRONO)
+#if defined(BOOST_NO_CXX11_HDR_CHRONO)
# define BOOST_COMPUTE_NO_HDR_CHRONO
#endif // BOOST_NO_CXX11_HDR_CHRONO
diff --git a/boost/compute/container/array.hpp b/boost/compute/container/array.hpp
index 919be6eeac..2b504be4a8 100644
--- a/boost/compute/container/array.hpp
+++ b/boost/compute/container/array.hpp
@@ -73,20 +73,33 @@ public:
array(const array<T, N> &other)
: m_buffer(other.m_buffer.get_context(), sizeof(T) * N)
{
- boost::compute::copy(other.begin(), other.end(), begin());
+ command_queue queue = default_queue();
+ boost::compute::copy(other.begin(), other.end(), begin(), queue);
+ queue.finish();
}
array(const boost::array<T, N> &array,
const context &context = system::default_context())
: m_buffer(context, sizeof(T) * N)
{
- boost::compute::copy(array.begin(), array.end(), begin());
+ command_queue queue = default_queue();
+ boost::compute::copy(array.begin(), array.end(), begin(), queue);
+ queue.finish();
+ }
+
+ array(const array<T, N> &other,
+ const command_queue &queue)
+ : m_buffer(other.m_buffer.get_context(), sizeof(T) * N)
+ {
+ boost::compute::copy(other.begin(), other.end(), begin(), queue);
}
array<T, N>& operator=(const array<T, N> &other)
{
if(this != &other){
- boost::compute::copy(other.begin(), other.end(), begin());
+ command_queue queue = default_queue();
+ boost::compute::copy(other.begin(), other.end(), begin(), queue);
+ queue.finish();
}
return *this;
@@ -94,7 +107,9 @@ public:
array<T, N>& operator=(const boost::array<T, N> &array)
{
- boost::compute::copy(array.begin(), array.end(), begin());
+ command_queue queue = default_queue();
+ boost::compute::copy(array.begin(), array.end(), begin(), queue);
+ queue.finish();
return *this;
}
@@ -226,14 +241,28 @@ public:
return *(end() - static_cast<difference_type>(1));
}
+ void fill(const value_type &value, const command_queue &queue)
+ {
+ ::boost::compute::fill(begin(), end(), value, queue);
+ }
+
+ void swap(array<T, N> &other, const command_queue &queue)
+ {
+ ::boost::compute::swap_ranges(begin(), end(), other.begin(), queue);
+ }
+
void fill(const value_type &value)
{
- ::boost::compute::fill(begin(), end(), value);
+ command_queue queue = default_queue();
+ ::boost::compute::fill(begin(), end(), value, queue);
+ queue.finish();
}
void swap(array<T, N> &other)
{
- ::boost::compute::swap_ranges(begin(), end(), other.begin());
+ command_queue queue = default_queue();
+ ::boost::compute::swap_ranges(begin(), end(), other.begin(), queue);
+ queue.finish();
}
const buffer& get_buffer() const
@@ -243,6 +272,13 @@ public:
private:
buffer m_buffer;
+
+ command_queue default_queue() const
+ {
+ const context &context = m_buffer.get_context();
+ command_queue queue(context, context.get_device());
+ return queue;
+ }
};
namespace detail {
diff --git a/boost/compute/container/vector.hpp b/boost/compute/container/vector.hpp
index 47d649ad99..aa0eb98ce9 100644
--- a/boost/compute/container/vector.hpp
+++ b/boost/compute/container/vector.hpp
@@ -196,8 +196,15 @@ public:
m_data = m_allocator.allocate((std::max)(m_size, _minimum_capacity()));
if(!other.empty()){
- ::boost::compute::copy(other.begin(), other.end(), begin(), queue);
- queue.finish();
+ if(other.get_buffer().get_context() != queue.get_context()){
+ command_queue other_queue = other.default_queue();
+ ::boost::compute::copy(other.begin(), other.end(), begin(), other_queue);
+ other_queue.finish();
+ }
+ else {
+ ::boost::compute::copy(other.begin(), other.end(), begin(), queue);
+ queue.finish();
+ }
}
}
@@ -253,6 +260,17 @@ public:
}
template<class OtherAlloc>
+ vector& operator=(const vector<T, OtherAlloc> &other)
+ {
+ command_queue queue = default_queue();
+ resize(other.size(), queue);
+ ::boost::compute::copy(other.begin(), other.end(), begin(), queue);
+ queue.finish();
+
+ return *this;
+ }
+
+ template<class OtherAlloc>
vector& operator=(const std::vector<T, OtherAlloc> &vector)
{
command_queue queue = default_queue();
@@ -371,7 +389,7 @@ public:
/// Resizes the vector to \p size.
void resize(size_type size, command_queue &queue)
{
- if(size < capacity()){
+ if(size <= capacity()){
m_size = size;
}
else {
diff --git a/boost/compute/context.hpp b/boost/compute/context.hpp
index 5db39e9d83..61e84e9767 100644
--- a/boost/compute/context.hpp
+++ b/boost/compute/context.hpp
@@ -204,7 +204,7 @@ public:
typename detail::get_object_info_type<context, Enum>::type
get_info() const;
- /// Returns \c true if the context is the same at \p other.
+ /// Returns \c true if the context is the same as \p other.
bool operator==(const context &other) const
{
return m_context == other.m_context;
diff --git a/boost/compute/detail/literal.hpp b/boost/compute/detail/literal.hpp
index 0d23b1d4d2..874830ee49 100644
--- a/boost/compute/detail/literal.hpp
+++ b/boost/compute/detail/literal.hpp
@@ -27,7 +27,15 @@ template<class T>
std::string make_literal(T x)
{
std::stringstream s;
- s << std::setprecision(std::numeric_limits<T>::digits10)
+ s << std::setprecision(
+#ifndef BOOST_NO_CXX11_NUMERIC_LIMITS
+ std::numeric_limits<T>::max_digits10
+#else
+            // We don't have max_digits10, so add 3 extra digits (this is what is
+            // required for float, and is one more than required for double).
+ 3 + std::numeric_limits<T>::digits10
+#endif
+ )
<< std::scientific
<< x;
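Annotation (not part of the patch): the reason for max_digits10 -- digits10 significant digits can serialize a float to a string that parses back to a different value, so a literal embedded in generated kernel source could drift from its host value. A standalone check:

    #include <cassert>
    #include <iomanip>
    #include <limits>
    #include <sstream>

    int main()
    {
        const float x = 1.0f + std::numeric_limits<float>::epsilon();

        std::stringstream lossy, exact;
        lossy << std::scientific
              << std::setprecision(std::numeric_limits<float>::digits10)     // 6
              << x;
        exact << std::scientific
              << std::setprecision(std::numeric_limits<float>::max_digits10) // 9
              << x;

        float y, z;
        lossy >> y;
        exact >> z;
        assert(y != x); // 6 fraction digits collapse x back to 1.0f
        assert(z == x); // max_digits10 guarantees the round trip
    }
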
diff --git a/boost/compute/detail/meta_kernel.hpp b/boost/compute/detail/meta_kernel.hpp
index 7be778b025..5e6d6e0337 100644
--- a/boost/compute/detail/meta_kernel.hpp
+++ b/boost/compute/detail/meta_kernel.hpp
@@ -34,6 +34,7 @@
#include <boost/compute/image/image2d.hpp>
#include <boost/compute/image/image_sampler.hpp>
#include <boost/compute/memory_object.hpp>
+#include <boost/compute/memory/svm_ptr.hpp>
#include <boost/compute/detail/device_ptr.hpp>
#include <boost/compute/detail/sha1.hpp>
#include <boost/compute/utility/program_cache.hpp>
@@ -203,6 +204,28 @@ struct meta_kernel_buffer_info
size_t index;
};
+struct meta_kernel_svm_info
+{
+ template <class T>
+ meta_kernel_svm_info(const svm_ptr<T> ptr,
+ const std::string &id,
+ memory_object::address_space addr_space,
+ size_t i)
+ : ptr(ptr.get()),
+ identifier(id),
+ address_space(addr_space),
+ index(i)
+    {
+    }
+
+ void* ptr;
+ std::string identifier;
+ memory_object::address_space address_space;
+ size_t index;
+};
+
class meta_kernel;
template<class Type>
@@ -280,12 +303,14 @@ public:
meta_kernel(const meta_kernel &other)
{
m_source.str(other.m_source.str());
+ m_options = other.m_options;
}
meta_kernel& operator=(const meta_kernel &other)
{
if(this != &other){
m_source.str(other.m_source.str());
+ m_options = other.m_options;
}
return *this;
@@ -342,9 +367,11 @@ public:
boost::shared_ptr<program_cache> cache =
program_cache::get_global_cache(context);
+ std::string compile_options = m_options + options;
+
// load (or build) program from cache
::boost::compute::program program =
- cache->get_or_build(cache_key, options, source, context);
+ cache->get_or_build(cache_key, compile_options, source, context);
// create kernel
::boost::compute::kernel kernel = program.create_kernel(name());
@@ -365,6 +392,13 @@ public:
kernel.set_arg(bi.index, bi.m_mem);
}
+ // bind svm args
+ for(size_t i = 0; i < m_stored_svm_ptrs.size(); i++){
+ const detail::meta_kernel_svm_info &spi = m_stored_svm_ptrs[i];
+
+ kernel.set_arg_svm_ptr(spi.index, spi.ptr);
+ }
+
return kernel;
}
@@ -689,6 +723,45 @@ public:
return identifier;
}
+ template<class T>
+ std::string get_svm_identifier(const svm_ptr<T> &svm_ptr,
+ const memory_object::address_space address_space =
+ memory_object::global_memory)
+ {
+ BOOST_ASSERT(
+ (address_space == memory_object::global_memory)
+ || (address_space == memory_object::constant_memory)
+ );
+
+ // check if we've already seen this pointer
+ for(size_t i = 0; i < m_stored_svm_ptrs.size(); i++){
+ const detail::meta_kernel_svm_info &spi = m_stored_svm_ptrs[i];
+
+ if(spi.ptr == svm_ptr.get() &&
+ spi.address_space == address_space){
+ return spi.identifier;
+ }
+ }
+
+ // create a new binding
+ std::string identifier =
+ "_svm_ptr" + lexical_cast<std::string>(m_stored_svm_ptrs.size());
+ size_t index = add_arg<T *>(address_space, identifier);
+
+ if(m_stored_svm_ptrs.empty()) {
+ m_options += std::string(" -cl-std=CL2.0");
+ }
+
+ // store new svm pointer info
+ m_stored_svm_ptrs.push_back(
+ detail::meta_kernel_svm_info(
+ svm_ptr, identifier, address_space, index
+ )
+ );
+
+ return identifier;
+ }
+
std::string get_image_identifier(const char *qualifiers, const image2d &image)
{
size_t index = add_arg_with_qualifiers<image2d>(qualifiers, "image");
@@ -880,8 +953,10 @@ private:
std::set<std::string> m_external_function_names;
std::vector<std::string> m_args;
std::string m_pragmas;
+ std::string m_options;
std::vector<detail::meta_kernel_stored_arg> m_stored_args;
std::vector<detail::meta_kernel_buffer_info> m_stored_buffers;
+ std::vector<detail::meta_kernel_svm_info> m_stored_svm_ptrs;
};
template<class ResultType, class ArgTuple>
@@ -960,6 +1035,18 @@ inline meta_kernel& operator<<(meta_kernel &kernel,
}
}
+// SVM requires OpenCL 2.0
+#if defined(CL_VERSION_2_0) || defined(BOOST_COMPUTE_DOXYGEN_INVOKED)
+template<class T, class IndexExpr>
+inline meta_kernel& operator<<(meta_kernel &kernel,
+ const svm_ptr_index_expr<T, IndexExpr> &expr)
+{
+ return kernel <<
+ kernel.get_svm_identifier<T>(expr.m_svm_ptr) <<
+ '[' << expr.m_expr << ']';
+}
+#endif
+
template<class Predicate, class Arg>
inline meta_kernel& operator<<(meta_kernel &kernel,
const invoked_unary_negate_function<Predicate,
diff --git a/boost/compute/detail/parameter_cache.hpp b/boost/compute/detail/parameter_cache.hpp
index 2a856311e1..0a16cd9b0e 100644
--- a/boost/compute/detail/parameter_cache.hpp
+++ b/boost/compute/detail/parameter_cache.hpp
@@ -161,7 +161,7 @@ private:
try {
read_json(m_file_name, pt);
}
- catch(boost::property_tree::json_parser::json_parser_error &e){
+ catch(boost::property_tree::json_parser::json_parser_error&){
// no saved cache file, ignore
return;
}
diff --git a/boost/compute/detail/vendor.hpp b/boost/compute/detail/vendor.hpp
index 0aa9c9c0d4..e2359cb466 100644
--- a/boost/compute/detail/vendor.hpp
+++ b/boost/compute/detail/vendor.hpp
@@ -31,6 +31,18 @@ inline bool is_amd_device(const device &device)
return device.platform().vendor() == "Advanced Micro Devices, Inc.";
}
+// returns true if the platform is the Apple OpenCL platform
+inline bool is_apple_platform(const platform &platform)
+{
+ return platform.name() == "Apple";
+}
+
+// returns true if the device is from the Apple OpenCL platform
+inline bool is_apple_platform_device(const device &device)
+{
+ return is_apple_platform(device.platform());
+}
+
} // end detail namespace
} // end compute namespace
} // end boost namespace
diff --git a/boost/compute/exception/context_error.hpp b/boost/compute/exception/context_error.hpp
index eeb387d884..ec8dc7c30f 100644
--- a/boost/compute/exception/context_error.hpp
+++ b/boost/compute/exception/context_error.hpp
@@ -70,7 +70,7 @@ public:
}
/// Returns the size of the private info memory block.
- const size_t get_private_info_size() const throw()
+ size_t get_private_info_size() const throw()
{
return m_private_info_size;
}
diff --git a/boost/compute/function.hpp b/boost/compute/function.hpp
index e83f16808a..b0b893e948 100644
--- a/boost/compute/function.hpp
+++ b/boost/compute/function.hpp
@@ -215,7 +215,7 @@ public:
{
BOOST_STATIC_ASSERT_MSG(
arity == 3,
- "Non-ternary function invoked with two arguments"
+ "Non-ternary function invoked with three arguments"
);
return detail::invoked_function<result_type, boost::tuple<Arg1, Arg2, Arg3> >(
diff --git a/boost/compute/interop/opengl/acquire.hpp b/boost/compute/interop/opengl/acquire.hpp
index 10af4338fb..b9259e0d88 100644
--- a/boost/compute/interop/opengl/acquire.hpp
+++ b/boost/compute/interop/opengl/acquire.hpp
@@ -14,6 +14,7 @@
#include <boost/compute/command_queue.hpp>
#include <boost/compute/interop/opengl/cl_gl.hpp>
#include <boost/compute/interop/opengl/opengl_buffer.hpp>
+#include <boost/compute/types/fundamental.hpp>
#include <boost/compute/utility/wait_list.hpp>
namespace boost {
@@ -22,7 +23,7 @@ namespace compute {
/// Enqueues a command to acquire the specified OpenGL memory objects.
///
/// \see_opencl_ref{clEnqueueAcquireGLObjects}
-inline event opengl_enqueue_acquire_gl_objects(size_t num_objects,
+inline event opengl_enqueue_acquire_gl_objects(const uint_ num_objects,
const cl_mem *mem_objects,
command_queue &queue,
const wait_list &events = wait_list())
@@ -47,7 +48,7 @@ inline event opengl_enqueue_acquire_gl_objects(size_t num_objects,
/// Enqueues a command to release the specified OpenGL memory objects.
///
/// \see_opencl_ref{clEnqueueReleaseGLObjects}
-inline event opengl_enqueue_release_gl_objects(size_t num_objects,
+inline event opengl_enqueue_release_gl_objects(const uint_ num_objects,
const cl_mem *mem_objects,
command_queue &queue,
const wait_list &events = wait_list())
diff --git a/boost/compute/interop/opengl/context.hpp b/boost/compute/interop/opengl/context.hpp
index 754dca2236..c35fedddfa 100644
--- a/boost/compute/interop/opengl/context.hpp
+++ b/boost/compute/interop/opengl/context.hpp
@@ -78,7 +78,7 @@ inline context opengl_create_shared_context()
// load clGetGLContextInfoKHR() extension function
GetGLContextInfoKHRFunction GetGLContextInfoKHR =
reinterpret_cast<GetGLContextInfoKHRFunction>(
- reinterpret_cast<unsigned long>(
+ reinterpret_cast<size_t>(
platform.get_extension_function_address("clGetGLContextInfoKHR")
)
);
diff --git a/boost/compute/kernel.hpp b/boost/compute/kernel.hpp
index 9494e46de2..72f21a0378 100644
--- a/boost/compute/kernel.hpp
+++ b/boost/compute/kernel.hpp
@@ -22,7 +22,6 @@
#include <boost/compute/type_traits/is_fundamental.hpp>
#include <boost/compute/detail/get_object_info.hpp>
#include <boost/compute/detail/assert_cl_success.hpp>
-#include <boost/compute/memory/svm_ptr.hpp>
namespace boost {
namespace compute {
@@ -189,8 +188,15 @@ public:
template<class T>
T get_arg_info(size_t index, cl_kernel_arg_info info) const
{
- return detail::get_object_info<T>(clGetKernelArgInfo, m_kernel, info, index);
+ return detail::get_object_info<T>(
+ clGetKernelArgInfo, m_kernel, info, static_cast<cl_uint>(index)
+ );
}
+
+ /// \overload
+ template<int Enum>
+ typename detail::get_object_info_type<kernel, Enum>::type
+ get_arg_info(size_t index) const;
#endif // CL_VERSION_1_2
/// Returns work-group information for the kernel with \p device.
@@ -258,15 +264,16 @@ public:
}
/// \internal_
- template<class T>
- void set_arg(size_t index, const svm_ptr<T> ptr)
+ void set_arg_svm_ptr(size_t index, void* ptr)
{
#ifdef CL_VERSION_2_0
- cl_int ret = clSetKernelArgSVMPointer(m_kernel, index, ptr.get());
+ cl_int ret = clSetKernelArgSVMPointer(m_kernel, static_cast<cl_uint>(index), ptr);
if(ret != CL_SUCCESS){
BOOST_THROW_EXCEPTION(opencl_error(ret));
}
#else
+ (void) index;
+ (void) ptr;
BOOST_THROW_EXCEPTION(opencl_error(CL_INVALID_ARG_VALUE));
#endif
}
@@ -324,7 +331,7 @@ public:
}
private:
- #ifndef BOOST_NO_VARIADIC_TEMPLATES
+ #ifndef BOOST_COMPUTE_NO_VARIADIC_TEMPLATES
/// \internal_
template<size_t N>
void _set_args()
@@ -338,7 +345,7 @@ private:
set_arg(N, arg);
_set_args<N+1>(rest...);
}
- #endif // BOOST_NO_VARIADIC_TEMPLATES
+ #endif // BOOST_COMPUTE_NO_VARIADIC_TEMPLATES
private:
cl_kernel m_kernel;
@@ -364,6 +371,23 @@ BOOST_COMPUTE_DETAIL_DEFINE_GET_INFO_SPECIALIZATIONS(kernel,
)
#endif // CL_VERSION_1_2
+/// \internal_ define get_arg_info() specializations for kernel
+#ifdef CL_VERSION_1_2
+#define BOOST_COMPUTE_DETAIL_DEFINE_KERNEL_GET_ARG_INFO_SPECIALIZATION(result_type, value) \
+ namespace detail { \
+ template<> struct get_object_info_type<kernel, value> { typedef result_type type; }; \
+ } \
+ template<> inline result_type kernel::get_arg_info<value>(size_t index) const { \
+ return get_arg_info<result_type>(index, value); \
+ }
+
+BOOST_COMPUTE_DETAIL_DEFINE_KERNEL_GET_ARG_INFO_SPECIALIZATION(cl_kernel_arg_address_qualifier, CL_KERNEL_ARG_ADDRESS_QUALIFIER)
+BOOST_COMPUTE_DETAIL_DEFINE_KERNEL_GET_ARG_INFO_SPECIALIZATION(cl_kernel_arg_access_qualifier, CL_KERNEL_ARG_ACCESS_QUALIFIER)
+BOOST_COMPUTE_DETAIL_DEFINE_KERNEL_GET_ARG_INFO_SPECIALIZATION(std::string, CL_KERNEL_ARG_TYPE_NAME)
+BOOST_COMPUTE_DETAIL_DEFINE_KERNEL_GET_ARG_INFO_SPECIALIZATION(cl_kernel_arg_type_qualifier, CL_KERNEL_ARG_TYPE_QUALIFIER)
+BOOST_COMPUTE_DETAIL_DEFINE_KERNEL_GET_ARG_INFO_SPECIALIZATION(std::string, CL_KERNEL_ARG_NAME)
+#endif // CL_VERSION_1_2
+
namespace detail {
// set_kernel_arg implementation for built-in types
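Annotation (not part of the patch): the new get_arg_info<Enum>() overload mirrors the existing get_info<Enum>() pattern. A sketch, assuming an OpenCL 1.2 platform and a program built with -cl-kernel-arg-info:

    #include <string>

    #include <boost/compute/core.hpp>

    namespace compute = boost::compute;

    int main()
    {
        compute::context ctx = compute::system::default_context();

        const char source[] =
            "__kernel void square(__global int *data)\n"
            "{\n"
            "    const uint i = get_global_id(0);\n"
            "    data[i] = data[i] * data[i];\n"
            "}\n";

        compute::program prog =
            compute::program::build_with_source(source, ctx, "-cl-kernel-arg-info");
        compute::kernel square = prog.create_kernel("square");

        std::string type_name = square.get_arg_info<CL_KERNEL_ARG_TYPE_NAME>(0); // "int*"
        std::string arg_name  = square.get_arg_info<CL_KERNEL_ARG_NAME>(0);      // "data"
    }
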
diff --git a/boost/compute/memory/svm_ptr.hpp b/boost/compute/memory/svm_ptr.hpp
index 2dbcb8f635..0c9d88035c 100644
--- a/boost/compute/memory/svm_ptr.hpp
+++ b/boost/compute/memory/svm_ptr.hpp
@@ -11,12 +11,67 @@
#ifndef BOOST_COMPUTE_MEMORY_SVM_PTR_HPP
#define BOOST_COMPUTE_MEMORY_SVM_PTR_HPP
+#include <boost/type_traits.hpp>
+#include <boost/static_assert.hpp>
+#include <boost/assert.hpp>
+
#include <boost/compute/cl.hpp>
+#include <boost/compute/kernel.hpp>
+#include <boost/compute/context.hpp>
+#include <boost/compute/command_queue.hpp>
#include <boost/compute/type_traits/is_device_iterator.hpp>
namespace boost {
namespace compute {
+// forward declaration for svm_ptr<T>
+template<class T>
+class svm_ptr;
+
+// svm functions require OpenCL 2.0
+#if defined(CL_VERSION_2_0) || defined(BOOST_COMPUTE_DOXYGEN_INVOKED)
+namespace detail {
+
+template<class T, class IndexExpr>
+struct svm_ptr_index_expr
+{
+ typedef T result_type;
+
+ svm_ptr_index_expr(const svm_ptr<T> &svm_ptr,
+ const IndexExpr &expr)
+ : m_svm_ptr(svm_ptr),
+ m_expr(expr)
+ {
+ }
+
+ operator T() const
+ {
+ BOOST_STATIC_ASSERT_MSG(boost::is_integral<IndexExpr>::value,
+ "Index expression must be integral");
+
+ BOOST_ASSERT(m_svm_ptr.get());
+
+ const context &context = m_svm_ptr.get_context();
+ const device &device = context.get_device();
+ command_queue queue(context, device);
+
+ T value;
+ T* ptr =
+ static_cast<T*>(m_svm_ptr.get()) + static_cast<std::ptrdiff_t>(m_expr);
+ queue.enqueue_svm_map(static_cast<void*>(ptr), sizeof(T), CL_MAP_READ);
+ value = *(ptr);
+ queue.enqueue_svm_unmap(static_cast<void*>(ptr)).wait();
+
+ return value;
+ }
+
+ const svm_ptr<T> &m_svm_ptr;
+ IndexExpr m_expr;
+};
+
+} // end detail namespace
+#endif
+
template<class T>
class svm_ptr
{
@@ -32,19 +87,22 @@ public:
{
}
- explicit svm_ptr(void *ptr)
- : m_ptr(static_cast<T*>(ptr))
+ svm_ptr(void *ptr, const context &context)
+ : m_ptr(static_cast<T*>(ptr)),
+ m_context(context)
{
}
svm_ptr(const svm_ptr<T> &other)
- : m_ptr(other.m_ptr)
+ : m_ptr(other.m_ptr),
+ m_context(other.m_context)
{
}
- svm_ptr& operator=(const svm_ptr<T> &other)
+ svm_ptr<T>& operator=(const svm_ptr<T> &other)
{
m_ptr = other.m_ptr;
+ m_context = other.m_context;
return *this;
}
@@ -59,18 +117,53 @@ public:
svm_ptr<T> operator+(difference_type n)
{
- return svm_ptr<T>(m_ptr + n);
+ return svm_ptr<T>(m_ptr + n, m_context);
}
difference_type operator-(svm_ptr<T> other)
{
+ BOOST_ASSERT(other.m_context == m_context);
return m_ptr - other.m_ptr;
}
+ const context& get_context() const
+ {
+ return m_context;
+ }
+
+ // svm functions require OpenCL 2.0
+ #if defined(CL_VERSION_2_0) || defined(BOOST_COMPUTE_DOXYGEN_INVOKED)
+ /// \internal_
+ template<class Expr>
+ detail::svm_ptr_index_expr<T, Expr>
+ operator[](const Expr &expr) const
+ {
+ BOOST_ASSERT(m_ptr);
+
+ return detail::svm_ptr_index_expr<T, Expr>(*this,
+ expr);
+ }
+ #endif
+
private:
T *m_ptr;
+ context m_context;
};
+namespace detail {
+
+/// \internal_
+template<class T>
+struct set_kernel_arg<svm_ptr<T> >
+{
+ void operator()(kernel &kernel_, size_t index, const svm_ptr<T> &ptr)
+ {
+ kernel_.set_arg_svm_ptr(index, ptr.get());
+ }
+};
+
+} // end detail namespace
+
/// \internal_ (is_device_iterator specialization for svm_ptr)
template<class T>
struct is_device_iterator<svm_ptr<T> > : boost::true_type {};
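With the context now stored in the pointer, svm_ptr<T> can offer the operator[] shown above, which maps the element on the host, reads it, and unmaps it again. A small sketch, assuming a device with OpenCL 2.0 coarse-grained buffer SVM; each access maps and unmaps the pointer, so this is a convenience, not a bulk-transfer path:

    #include <boost/compute/svm.hpp>
    #include <boost/compute/system.hpp>

    namespace compute = boost::compute;

    int main()
    {
        compute::context ctx = compute::system::default_context();

        // allocate shared virtual memory for 16 ints
        compute::svm_ptr<int> ptr = compute::svm_alloc<int>(ctx, 16);

        // ... fill the memory with a kernel or enqueue_svm_memcpy() ...

        // each read goes through svm_ptr_index_expr<T, IndexExpr>:
        // enqueue_svm_map(), dereference, enqueue_svm_unmap()
        int first = ptr[0];
        (void) first;

        compute::svm_free(ctx, ptr);
        return 0;
    }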
diff --git a/boost/compute/random/bernoulli_distribution.hpp b/boost/compute/random/bernoulli_distribution.hpp
index edd1125090..50bf27f591 100644
--- a/boost/compute/random/bernoulli_distribution.hpp
+++ b/boost/compute/random/bernoulli_distribution.hpp
@@ -11,6 +11,9 @@
#ifndef BOOST_COMPUTE_RANDOM_BERNOULLI_DISTRIBUTION_HPP
#define BOOST_COMPUTE_RANDOM_BERNOULLI_DISTRIBUTION_HPP
+#include <boost/assert.hpp>
+#include <boost/type_traits.hpp>
+
#include <boost/compute/command_queue.hpp>
#include <boost/compute/function.hpp>
#include <boost/compute/types/fundamental.hpp>
@@ -84,6 +87,11 @@ public:
private:
RealType m_p;
+
+ BOOST_STATIC_ASSERT_MSG(
+ boost::is_floating_point<RealType>::value,
+ "Template argument must be a floating point type"
+ );
};
} // end compute namespace
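The new static assertion (the same pattern is applied to discrete_distribution, normal_distribution, and the two uniform distributions below) turns a misuse that previously surfaced as an opaque kernel-source error into a clear compile-time diagnostic. A compile-time sketch:

    #include <boost/compute/random/bernoulli_distribution.hpp>

    namespace compute = boost::compute;

    // fine: RealType is a floating point type
    compute::bernoulli_distribution<float> ok(0.25f);

    // now rejected at compile time with
    // "Template argument must be a floating point type":
    // compute::bernoulli_distribution<int> bad;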
diff --git a/boost/compute/random/discrete_distribution.hpp b/boost/compute/random/discrete_distribution.hpp
index 3707928f98..86249538ac 100644
--- a/boost/compute/random/discrete_distribution.hpp
+++ b/boost/compute/random/discrete_distribution.hpp
@@ -11,6 +11,12 @@
#ifndef BOOST_COMPUTE_RANDOM_DISCRETE_DISTRIBUTION_HPP
#define BOOST_COMPUTE_RANDOM_DISCRETE_DISTRIBUTION_HPP
+#include <numeric>
+
+#include <boost/config.hpp>
+#include <boost/type_traits.hpp>
+#include <boost/static_assert.hpp>
+
#include <boost/compute/command_queue.hpp>
#include <boost/compute/function.hpp>
#include <boost/compute/algorithm/accumulate.hpp>
@@ -38,23 +44,42 @@ class discrete_distribution
public:
typedef IntType result_type;
+ /// Creates a new discrete distribution with a single weight p = { 1 }.
+ /// This distribution produces only zeroes.
+ discrete_distribution()
+ : m_probabilities(1, double(1)),
+ m_scanned_probabilities(1, double(1))
+ {
+
+ }
+
/// Creates a new discrete distribution with weights given by
- /// the range [\p first, \p last)
+ /// the range [\p first, \p last).
template<class InputIterator>
discrete_distribution(InputIterator first, InputIterator last)
- : m_n(std::distance(first, last)),
- m_probabilities(std::distance(first, last))
+ : m_probabilities(first, last),
+ m_scanned_probabilities(std::distance(first, last))
{
- double sum = 0;
-
- for(InputIterator iter = first; iter!=last; iter++)
- {
- sum += *iter;
+ if(first != last) {
+ // after this, m_scanned_probabilities.back() is the sum of all
+ // weights from the range [first, last)
+ std::partial_sum(first, last, m_scanned_probabilities.begin());
+
+ std::vector<double>::iterator i = m_probabilities.begin();
+ std::vector<double>::iterator j = m_scanned_probabilities.begin();
+ for(; i != m_probabilities.end(); ++i, ++j)
+ {
+ // dividing each weight by the sum of all weights to
+ // get the probabilities
+ *i = *i / m_scanned_probabilities.back();
+ // dividing each partial sum of weights by the sum of
+ // all weights to get partial sums of probabilities
+ *j = *j / m_scanned_probabilities.back();
+ }
}
-
- for(size_t i=0; i<m_n; i++)
- {
- m_probabilities[i] = m_probabilities[i-1] + first[i]/sum;
+ else {
+ m_probabilities.push_back(double(1));
+ m_scanned_probabilities.push_back(double(1));
}
}
@@ -63,19 +88,31 @@ public:
{
}
- /// Returns the value of n
- result_type n() const
- {
- return m_n;
- }
-
/// Returns the probabilities
::std::vector<double> probabilities() const
{
return m_probabilities;
}
- /// Generates uniformily distributed integers and stores
+ /// Returns the minimum potentially generated value.
+ result_type min BOOST_PREVENT_MACRO_SUBSTITUTION () const
+ {
+ return result_type(0);
+ }
+
+ /// Returns the maximum potentially generated value.
+ result_type max BOOST_PREVENT_MACRO_SUBSTITUTION () const
+ {
+ size_t type_max = static_cast<size_t>(
+ (std::numeric_limits<result_type>::max)()
+ );
+ if(m_probabilities.size() - 1 > type_max) {
+ return (std::numeric_limits<result_type>::max)();
+ }
+ return static_cast<result_type>(m_probabilities.size() - 1);
+ }
+
+ /// Generates uniformly distributed integers and stores
/// them to the range [\p first, \p last).
template<class OutputIterator, class Generator>
void generate(OutputIterator first,
@@ -83,32 +120,38 @@ public:
Generator &generator,
command_queue &queue)
{
- std::string source = "inline uint scale_random(uint x)\n";
+ std::string source = "inline IntType scale_random(uint x)\n";
source = source +
"{\n" +
"float rno = convert_float(x) / UINT_MAX;\n";
- for(size_t i=0; i<m_n; i++)
+ for(size_t i = 0; i < m_scanned_probabilities.size() - 1; i++)
{
source = source +
- "if(rno <= " + detail::make_literal<float>(m_probabilities[i]) + ")\n" +
+ "if(rno <= " + detail::make_literal<float>(m_scanned_probabilities[i]) + ")\n" +
" return " + detail::make_literal(i) + ";\n";
}
source = source +
- "return " + detail::make_literal(m_n - 1) + ";\n" +
+ "return " + detail::make_literal(m_scanned_probabilities.size() - 1) + ";\n" +
"}\n";
BOOST_COMPUTE_FUNCTION(IntType, scale_random, (const uint_ x), {});
scale_random.set_source(source);
+ scale_random.define("IntType", type_name<IntType>());
generator.generate(first, last, scale_random, queue);
}
private:
- size_t m_n;
::std::vector<double> m_probabilities;
+ ::std::vector<double> m_scanned_probabilities;
+
+ BOOST_STATIC_ASSERT_MSG(
+ boost::is_integral<IntType>::value,
+ "Template argument must be integral"
+ );
};
} // end compute namespace
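The rework above replaces the old partial-sum loop (note the out-of-bounds m_probabilities[i-1] read at i = 0 in the removed lines) with std::partial_sum over a separate m_scanned_probabilities vector, and adds a default constructor plus min()/max(). A usage sketch, with the engine and sizes chosen only for illustration:

    #include <boost/compute/command_queue.hpp>
    #include <boost/compute/container/vector.hpp>
    #include <boost/compute/random/default_random_engine.hpp>
    #include <boost/compute/random/discrete_distribution.hpp>
    #include <boost/compute/system.hpp>

    namespace compute = boost::compute;

    int main()
    {
        compute::device dev = compute::system::default_device();
        compute::context ctx(dev);
        compute::command_queue queue(ctx, dev);

        // weights 1:2:1 are normalized to probabilities {0.25, 0.5, 0.25}
        int weights[] = { 1, 2, 1 };
        compute::discrete_distribution<compute::uint_> dist(weights, weights + 3);

        compute::default_random_engine engine(queue);
        compute::vector<compute::uint_> vec(128, ctx);

        // generated values lie in [(dist.min)(), (dist.max)()] == [0, 2]
        dist.generate(vec.begin(), vec.end(), engine, queue);
        return 0;
    }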
diff --git a/boost/compute/random/normal_distribution.hpp b/boost/compute/random/normal_distribution.hpp
index d025faeb2e..4693e4fffe 100644
--- a/boost/compute/random/normal_distribution.hpp
+++ b/boost/compute/random/normal_distribution.hpp
@@ -13,6 +13,9 @@
#include <limits>
+#include <boost/assert.hpp>
+#include <boost/type_traits.hpp>
+
#include <boost/compute/command_queue.hpp>
#include <boost/compute/function.hpp>
#include <boost/compute/types/fundamental.hpp>
@@ -90,11 +93,19 @@ public:
BOOST_COMPUTE_FUNCTION(RealType2, box_muller, (const uint2_ x),
{
- const RealType x1 = x.x / (RealType) (UINT_MAX - 1);
- const RealType x2 = x.y / (RealType) (UINT_MAX - 1);
+ const RealType one = 1;
+ const RealType two = 2;
+
+ // Use nextafter to push values down into the [0, 1) range; without this,
+ // floating-point rounding can yield x1 = 1, which would mean taking the log
+ // of 0 and produce negative infinity; nudging values off 1 towards 0 ensures this cannot happen.
+ const RealType x1 = nextafter(x.x / (RealType) UINT_MAX, (RealType) 0);
+ const RealType x2 = x.y / (RealType) UINT_MAX;
- const RealType z1 = sqrt(-2.f * log2(x1)) * cos(2.f * M_PI_F * x2);
- const RealType z2 = sqrt(-2.f * log2(x1)) * sin(2.f * M_PI_F * x2);
+ const RealType rho = sqrt(-two * log(one-x1));
+
+ const RealType z1 = rho * cos(two * M_PI_F * x2);
+ const RealType z2 = rho * sin(two * M_PI_F * x2);
return (RealType2)(MEAN, MEAN) + (RealType2)(z1, z2) * (RealType2)(STDDEV, STDDEV);
});
@@ -116,6 +127,11 @@ public:
private:
RealType m_mean;
RealType m_stddev;
+
+ BOOST_STATIC_ASSERT_MSG(
+ boost::is_floating_point<RealType>::value,
+ "Template argument must be a floating point type"
+ );
};
} // end compute namespace
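The corrected kernel now implements the textbook Box-Muller transform, z1 = sqrt(-2 ln(1 - x1)) * cos(2 pi x2), using the natural logarithm instead of log2 and guarding the log argument with nextafter. A host-side reference of the same transform, useful for spot-checking device results (a sketch, not library code; u1 and u2 are uniform variates in [0, 1)):

    #include <cmath>
    #include <utility>

    // reference Box-Muller: maps two uniform variates in [0, 1) to two
    // independent normal variates with the given mean and standard deviation
    std::pair<double, double> box_muller_reference(double u1, double u2,
                                                   double mean, double stddev)
    {
        const double two_pi = 2.0 * 3.14159265358979323846;

        // natural log, not log2; log(1 - u1) stays finite because u1 < 1
        const double rho = std::sqrt(-2.0 * std::log(1.0 - u1));

        const double z1 = rho * std::cos(two_pi * u2);
        const double z2 = rho * std::sin(two_pi * u2);

        return std::make_pair(mean + z1 * stddev, mean + z2 * stddev);
    }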
diff --git a/boost/compute/random/uniform_int_distribution.hpp b/boost/compute/random/uniform_int_distribution.hpp
index 92e8b3305f..20448afec0 100644
--- a/boost/compute/random/uniform_int_distribution.hpp
+++ b/boost/compute/random/uniform_int_distribution.hpp
@@ -13,6 +13,9 @@
#include <limits>
+#include <boost/type_traits.hpp>
+#include <boost/static_assert.hpp>
+
#include <boost/compute/command_queue.hpp>
#include <boost/compute/container/vector.hpp>
#include <boost/compute/function.hpp>
@@ -103,6 +106,11 @@ public:
private:
IntType m_a;
IntType m_b;
+
+ BOOST_STATIC_ASSERT_MSG(
+ boost::is_integral<IntType>::value,
+ "Template argument must be integral"
+ );
};
} // end compute namespace
diff --git a/boost/compute/random/uniform_real_distribution.hpp b/boost/compute/random/uniform_real_distribution.hpp
index 231b0dba01..75d9659d6a 100644
--- a/boost/compute/random/uniform_real_distribution.hpp
+++ b/boost/compute/random/uniform_real_distribution.hpp
@@ -11,6 +11,9 @@
#ifndef BOOST_COMPUTE_RANDOM_UNIFORM_REAL_DISTRIBUTION_HPP
#define BOOST_COMPUTE_RANDOM_UNIFORM_REAL_DISTRIBUTION_HPP
+#include <boost/assert.hpp>
+#include <boost/type_traits.hpp>
+
#include <boost/compute/command_queue.hpp>
#include <boost/compute/function.hpp>
#include <boost/compute/detail/literal.hpp>
@@ -20,7 +23,7 @@ namespace boost {
namespace compute {
/// \class uniform_real_distribution
-/// \brief Produces uniformily distributed random floating-point numbers.
+/// \brief Produces uniformly distributed random floating-point numbers.
///
/// The following example shows how to setup a uniform real distribution to
/// produce random \c float values between \c 1 and \c 100.
@@ -36,10 +39,12 @@ public:
/// Creates a new uniform distribution producing numbers in the range
/// [\p a, \p b).
+ /// Requires \p a < \p b.
uniform_real_distribution(RealType a = 0.f, RealType b = 1.f)
: m_a(a),
m_b(b)
{
+ BOOST_ASSERT(a < b);
}
/// Destroys the uniform_real_distribution object.
@@ -59,7 +64,7 @@ public:
return m_b;
}
- /// Generates uniformily distributed floating-point numbers and stores
+ /// Generates uniformly distributed floating-point numbers and stores
/// them to the range [\p first, \p last).
template<class OutputIterator, class Generator>
void generate(OutputIterator first,
@@ -69,7 +74,7 @@ public:
{
BOOST_COMPUTE_FUNCTION(RealType, scale_random, (const uint_ x),
{
- return LO + (convert_RealType(x) / MAX_RANDOM) * (HI - LO);
+ return nextafter(LO + (convert_RealType(x) / MAX_RANDOM) * (HI - LO), (RealType) LO);
});
scale_random.define("LO", detail::make_literal(m_a));
@@ -78,6 +83,7 @@ public:
scale_random.define(
"convert_RealType", std::string("convert_") + type_name<RealType>()
);
+ scale_random.define("RealType", type_name<RealType>());
generator.generate(
first, last, scale_random, queue
@@ -97,6 +103,11 @@ public:
private:
RealType m_a;
RealType m_b;
+
+ BOOST_STATIC_ASSERT_MSG(
+ boost::is_floating_point<RealType>::value,
+ "Template argument must be a floating point type"
+ );
};
} // end compute namespace
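Besides the spelling fixes, the nextafter() wrapper above makes the documented half-open range real: a result that rounds up to exactly HI is nudged back towards LO, so b itself is never produced. A usage sketch mirroring the class documentation; engine and sizes are illustrative:

    #include <boost/compute/command_queue.hpp>
    #include <boost/compute/container/vector.hpp>
    #include <boost/compute/random/default_random_engine.hpp>
    #include <boost/compute/random/uniform_real_distribution.hpp>
    #include <boost/compute/system.hpp>

    namespace compute = boost::compute;

    int main()
    {
        compute::device dev = compute::system::default_device();
        compute::context ctx(dev);
        compute::command_queue queue(ctx, dev);

        compute::default_random_engine engine(queue);
        compute::vector<float> vec(1024, ctx);

        // every generated value x now satisfies 1.0f <= x < 100.0f
        compute::uniform_real_distribution<float> dist(1.0f, 100.0f);
        dist.generate(vec.begin(), vec.end(), engine, queue);
        return 0;
    }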
diff --git a/boost/compute/svm.hpp b/boost/compute/svm.hpp
index d03c8d9079..4bc3a74237 100644
--- a/boost/compute/svm.hpp
+++ b/boost/compute/svm.hpp
@@ -15,7 +15,7 @@
#include <boost/compute/context.hpp>
#include <boost/compute/memory/svm_ptr.hpp>
-// svm functions require opencl 2.0
+// svm functions require OpenCL 2.0
#if defined(CL_VERSION_2_0) || defined(BOOST_COMPUTE_DOXYGEN_INVOKED)
namespace boost {
@@ -34,7 +34,10 @@ inline svm_ptr<T> svm_alloc(const context &context,
cl_svm_mem_flags flags = CL_MEM_READ_WRITE,
unsigned int alignment = 0)
{
- svm_ptr<T> ptr(clSVMAlloc(context.get(), flags, size * sizeof(T), alignment));
+ svm_ptr<T> ptr(
+ clSVMAlloc(context.get(), flags, size * sizeof(T), alignment),
+ context
+ );
if(!ptr.get()){
BOOST_THROW_EXCEPTION(opencl_error(CL_MEM_OBJECT_ALLOCATION_FAILURE));
}
@@ -49,6 +52,13 @@ inline svm_ptr<T> svm_alloc(const context &context,
///
/// \see svm_alloc(), command_queue::enqueue_svm_free()
template<class T>
+inline void svm_free(svm_ptr<T> ptr)
+{
+ clSVMFree(ptr.get_context(), ptr.get());
+}
+
+/// \overload
+template<class T>
inline void svm_free(const context &context, svm_ptr<T> ptr)
{
clSVMFree(context.get(), ptr.get());
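Because svm_ptr now carries its context, the new overload can free an allocation without the caller re-supplying it. A sketch, assuming an OpenCL 2.0 context:

    #include <boost/compute/svm.hpp>
    #include <boost/compute/system.hpp>

    namespace compute = boost::compute;

    int main()
    {
        compute::context ctx = compute::system::default_context();

        compute::svm_ptr<float> ptr = compute::svm_alloc<float>(ctx, 64);

        compute::svm_free(ptr);         // context is taken from ptr.get_context()
        // compute::svm_free(ctx, ptr); // the original two-argument form still works
        return 0;
    }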
diff --git a/boost/compute/system.hpp b/boost/compute/system.hpp
index f205fece7d..e6ed353d68 100644
--- a/boost/compute/system.hpp
+++ b/boost/compute/system.hpp
@@ -184,14 +184,16 @@ public:
cl_uint count = 0;
clGetPlatformIDs(0, 0, &count);
- std::vector<cl_platform_id> platform_ids(count);
- clGetPlatformIDs(count, &platform_ids[0], 0);
-
std::vector<platform> platforms;
- for(size_t i = 0; i < platform_ids.size(); i++){
- platforms.push_back(platform(platform_ids[i]));
- }
+ if(count > 0)
+ {
+ std::vector<cl_platform_id> platform_ids(count);
+ clGetPlatformIDs(count, &platform_ids[0], 0);
+ for(size_t i = 0; i < platform_ids.size(); i++){
+ platforms.push_back(platform(platform_ids[i]));
+ }
+ }
return platforms;
}
diff --git a/boost/compute/utility/wait_list.hpp b/boost/compute/utility/wait_list.hpp
index 9a7e74bac0..8b81924d63 100644
--- a/boost/compute/utility/wait_list.hpp
+++ b/boost/compute/utility/wait_list.hpp
@@ -13,6 +13,12 @@
#include <vector>
+#include <boost/compute/config.hpp>
+
+#ifndef BOOST_COMPUTE_NO_HDR_INITIALIZER_LIST
+#include <initializer_list>
+#endif
+
#include <boost/compute/event.hpp>
namespace boost {
@@ -27,7 +33,7 @@ template<class T> class future;
/// specify dependencies for OpenCL operations or to wait on the host until
/// all of the events have completed.
///
-/// This class also provides convenience fnuctions for interacting with
+/// This class also provides convenience functions for interacting with
/// OpenCL APIs which typically accept event dependencies as a \c cl_event*
/// pointer and a \c cl_uint size. For example:
/// \code
@@ -60,6 +66,14 @@ public:
{
}
+ #ifndef BOOST_COMPUTE_NO_HDR_INITIALIZER_LIST
+ /// Creates a wait-list from \p events.
+ wait_list(std::initializer_list<event> events)
+ : m_events(events)
+ {
+ }
+ #endif // BOOST_COMPUTE_NO_HDR_INITIALIZER_LIST
+
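The initializer-list constructor makes ad-hoc dependency lists considerably terser when the header is available (i.e. when BOOST_COMPUTE_NO_HDR_INITIALIZER_LIST is not defined). A sketch; the events are assumed to come from earlier enqueued operations:

    #include <boost/compute/event.hpp>
    #include <boost/compute/utility/wait_list.hpp>

    namespace compute = boost::compute;

    void wait_for_both(const compute::event &e1, const compute::event &e2)
    {
        // previously this took a default construction plus insert() calls
        compute::wait_list events = { e1, e2 };

        // block the host until both events have completed
        events.wait();
    }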
/// Copies the events in the wait-list from \p other.
wait_list& operator=(const wait_list &other)
{