Diffstat (limited to 'boost/compute')
46 files changed, 2584 insertions, 325 deletions
diff --git a/boost/compute/algorithm/adjacent_difference.hpp b/boost/compute/algorithm/adjacent_difference.hpp index a8f84e020e..ef13970754 100644 --- a/boost/compute/algorithm/adjacent_difference.hpp +++ b/boost/compute/algorithm/adjacent_difference.hpp @@ -23,33 +23,17 @@ namespace boost { namespace compute { -/// Stores the difference of each pair of consecutive values in the range -/// [\p first, \p last) to the range beginning at \p result. If \p op is not -/// provided, \c minus<T> is used. -/// -/// \param first first element in the input range -/// \param last last element in the input range -/// \param result first element in the output range -/// \param op binary difference function -/// \param queue command queue to perform the operation -/// -/// \return \c OutputIterator to the end of the result range -/// -/// \see adjacent_find() +namespace detail { + template<class InputIterator, class OutputIterator, class BinaryFunction> inline OutputIterator -adjacent_difference(InputIterator first, - InputIterator last, - OutputIterator result, - BinaryFunction op, - command_queue &queue = system::default_queue()) +dispatch_adjacent_difference(InputIterator first, + InputIterator last, + OutputIterator result, + BinaryFunction op, + command_queue &queue = system::default_queue()) { - if(first == last){ - return result; - } - size_t count = detail::iterator_range_size(first, last); - detail::meta_kernel k("adjacent_difference"); k << "const uint i = get_global_id(0);\n" @@ -66,32 +50,66 @@ adjacent_difference(InputIterator first, return result + count; } -/// \overload -template<class InputIterator, class OutputIterator> +} // end detail namespace + +/// Stores the difference of each pair of consecutive values in the range +/// [\p first, \p last) to the range beginning at \p result. If \p op is not +/// provided, \c minus<T> is used. 
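Note on this hunk: both public overloads now funnel into detail::dispatch_adjacent_difference(), and the in-place case (first == result) is handled once through a temporary vector before dispatch. A minimal usage sketch of the public entry points, assuming a default device and queue (error handling omitted):

    #include <boost/compute/system.hpp>
    #include <boost/compute/algorithm/adjacent_difference.hpp>
    #include <boost/compute/container/vector.hpp>

    int main()
    {
        namespace compute = boost::compute;
        compute::command_queue queue = compute::system::default_queue();

        int host[] = { 1, 4, 9, 16, 25 };
        compute::vector<int> input(host, host + 5, queue);
        compute::vector<int> output(5, queue.get_context());

        // result[0] = input[0]; result[i] = input[i] - input[i-1]
        // (minus<int> is the default op)
        compute::adjacent_difference(
            input.begin(), input.end(), output.begin(), queue
        );

        // in-place call: first == result takes the temporary-vector path
        compute::adjacent_difference(
            input.begin(), input.end(), input.begin(), queue
        );
        return 0;
    }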
+/// +/// \param first first element in the input range +/// \param last last element in the input range +/// \param result first element in the output range +/// \param op binary difference function +/// \param queue command queue to perform the operation +/// +/// \return \c OutputIterator to the end of the result range +/// +/// \see adjacent_find() +template<class InputIterator, class OutputIterator, class BinaryFunction> inline OutputIterator adjacent_difference(InputIterator first, InputIterator last, OutputIterator result, + BinaryFunction op, command_queue &queue = system::default_queue()) { typedef typename std::iterator_traits<InputIterator>::value_type value_type; + if(first == last) { + return result; + } + if (first == result) { vector<value_type> temp(detail::iterator_range_size(first, last), queue.get_context()); copy(first, last, temp.begin(), queue); - return ::boost::compute::adjacent_difference( - temp.begin(), temp.end(), result, ::boost::compute::minus<value_type>(), queue + return ::boost::compute::detail::dispatch_adjacent_difference( + temp.begin(), temp.end(), result, op, queue ); } else { - return ::boost::compute::adjacent_difference( - first, last, result, ::boost::compute::minus<value_type>(), queue + return ::boost::compute::detail::dispatch_adjacent_difference( + first, last, result, op, queue ); } } +/// \overload +template<class InputIterator, class OutputIterator> +inline OutputIterator +adjacent_difference(InputIterator first, + InputIterator last, + OutputIterator result, + command_queue &queue = system::default_queue()) +{ + typedef typename std::iterator_traits<InputIterator>::value_type value_type; + + return ::boost::compute::adjacent_difference( + first, last, result, ::boost::compute::minus<value_type>(), queue + ); +} + } // end compute namespace } // end boost namespace diff --git a/boost/compute/algorithm/copy.hpp b/boost/compute/algorithm/copy.hpp index 2a25059bba..7779277b82 100644 --- a/boost/compute/algorithm/copy.hpp +++ b/boost/compute/algorithm/copy.hpp @@ -18,6 +18,7 @@ #include <boost/mpl/and.hpp> #include <boost/mpl/not.hpp> +#include <boost/mpl/or.hpp> #include <boost/compute/buffer.hpp> #include <boost/compute/system.hpp> @@ -26,9 +27,13 @@ #include <boost/compute/algorithm/detail/copy_to_device.hpp> #include <boost/compute/algorithm/detail/copy_to_host.hpp> #include <boost/compute/async/future.hpp> +#include <boost/compute/container/mapped_view.hpp> +#include <boost/compute/detail/device_ptr.hpp> #include <boost/compute/detail/is_contiguous_iterator.hpp> #include <boost/compute/detail/iterator_range_size.hpp> +#include <boost/compute/detail/parameter_cache.hpp> #include <boost/compute/iterator/buffer_iterator.hpp> +#include <boost/compute/type_traits/type_name.hpp> #include <boost/compute/type_traits/is_device_iterator.hpp> namespace boost { @@ -42,13 +47,25 @@ namespace mpl = boost::mpl; template<class InputIterator, class OutputIterator> struct can_copy_with_copy_buffer : mpl::and_< - boost::is_same< - InputIterator, - buffer_iterator<typename InputIterator::value_type> + mpl::or_< + boost::is_same< + InputIterator, + buffer_iterator<typename InputIterator::value_type> + >, + boost::is_same< + InputIterator, + detail::device_ptr<typename InputIterator::value_type> + > >, - boost::is_same< - OutputIterator, - buffer_iterator<typename OutputIterator::value_type> + mpl::or_< + boost::is_same< + OutputIterator, + buffer_iterator<typename OutputIterator::value_type> + >, + boost::is_same< + OutputIterator, + detail::device_ptr<typename 
OutputIterator::value_type> + > >, boost::is_same< typename InputIterator::value_type, @@ -56,40 +73,72 @@ struct can_copy_with_copy_buffer : > >::type {}; -// host -> device +// meta-function returning true if value_types of HostIterator and +// DeviceIterator are same +template<class HostIterator, class DeviceIterator> +struct is_same_value_type : + boost::is_same< + typename boost::remove_cv< + typename std::iterator_traits<HostIterator>::value_type + >::type, + typename boost::remove_cv< + typename DeviceIterator::value_type + >::type + >::type {}; + +// meta-function returning true if value_type of HostIterator is bool +template<class HostIterator> +struct is_bool_value_type : + boost::is_same< + typename boost::remove_cv< + typename std::iterator_traits<HostIterator>::value_type + >::type, + bool + >::type {}; + +// host -> device (async) template<class InputIterator, class OutputIterator> -inline OutputIterator -dispatch_copy(InputIterator first, - InputIterator last, - OutputIterator result, - command_queue &queue, - typename boost::enable_if_c< - !is_device_iterator<InputIterator>::value && - is_device_iterator<OutputIterator>::value - >::type* = 0) +inline future<OutputIterator> +dispatch_copy_async(InputIterator first, + InputIterator last, + OutputIterator result, + command_queue &queue, + typename boost::enable_if< + mpl::and_< + mpl::not_< + is_device_iterator<InputIterator> + >, + is_device_iterator<OutputIterator>, + is_same_value_type<InputIterator, OutputIterator> + > + >::type* = 0) { - if(is_contiguous_iterator<InputIterator>::value){ - return copy_to_device(first, last, result, queue); - } - else { - // for non-contiguous input we first copy the values to - // a temporary std::vector and then copy from there - typedef typename std::iterator_traits<InputIterator>::value_type T; - std::vector<T> vector(first, last); - return copy_to_device(vector.begin(), vector.end(), result, queue); - } + BOOST_STATIC_ASSERT_MSG( + is_contiguous_iterator<InputIterator>::value, + "copy_async() is only supported for contiguous host iterators" + ); + + return copy_to_device_async(first, last, result, queue); } // host -> device (async) +// Type mismatch between InputIterator and OutputIterator value_types template<class InputIterator, class OutputIterator> inline future<OutputIterator> dispatch_copy_async(InputIterator first, InputIterator last, OutputIterator result, command_queue &queue, - typename boost::enable_if_c< - !is_device_iterator<InputIterator>::value && - is_device_iterator<OutputIterator>::value + typename boost::enable_if< + mpl::and_< + mpl::not_< + is_device_iterator<InputIterator> + >, + is_device_iterator<OutputIterator>, + mpl::not_< + is_same_value_type<InputIterator, OutputIterator> + > + > >::type* = 0) { BOOST_STATIC_ASSERT_MSG( @@ -97,32 +146,211 @@ dispatch_copy_async(InputIterator first, "copy_async() is only supported for contiguous host iterators" ); - return copy_to_device_async(first, last, result, queue); + typedef typename std::iterator_traits<InputIterator>::value_type input_type; + + const context &context = queue.get_context(); + size_t count = iterator_range_size(first, last); + + if(count < size_t(1)) { + return future<OutputIterator>(); + } + + // map [first; last) to device and run copy kernel + // on device for copying & casting + ::boost::compute::mapped_view<input_type> mapped_host( + // make sure it's a pointer to constant data + // to force read only mapping + const_cast<const input_type*>( + ::boost::addressof(*first) + ), + count, + context + 
); + return copy_on_device_async( + mapped_host.begin(), mapped_host.end(), result, queue + ); } -// device -> host +// host -> device +// InputIterator is a contiguous iterator template<class InputIterator, class OutputIterator> inline OutputIterator dispatch_copy(InputIterator first, InputIterator last, OutputIterator result, command_queue &queue, - typename boost::enable_if_c< - is_device_iterator<InputIterator>::value && - !is_device_iterator<OutputIterator>::value + typename boost::enable_if< + mpl::and_< + mpl::not_< + is_device_iterator<InputIterator> + >, + is_device_iterator<OutputIterator>, + is_same_value_type<InputIterator, OutputIterator>, + is_contiguous_iterator<InputIterator> + > + >::type* = 0) +{ + return copy_to_device(first, last, result, queue); +} + +// host -> device +// Type mismatch between InputIterator and OutputIterator value_types +// InputIterator is a contiguous iterator +template<class InputIterator, class OutputIterator> +inline OutputIterator +dispatch_copy(InputIterator first, + InputIterator last, + OutputIterator result, + command_queue &queue, + typename boost::enable_if< + mpl::and_< + mpl::not_< + is_device_iterator<InputIterator> + >, + is_device_iterator<OutputIterator>, + mpl::not_< + is_same_value_type<InputIterator, OutputIterator> + >, + is_contiguous_iterator<InputIterator> + > >::type* = 0) { - if(is_contiguous_iterator<OutputIterator>::value){ - return copy_to_host(first, last, result, queue); + typedef typename OutputIterator::value_type output_type; + typedef typename std::iterator_traits<InputIterator>::value_type input_type; + + const device &device = queue.get_device(); + + // loading parameters + std::string cache_key = + std::string("__boost_compute_copy_to_device_") + + type_name<input_type>() + "_" + type_name<output_type>(); + boost::shared_ptr<parameter_cache> parameters = + detail::parameter_cache::get_global_cache(device); + + size_t map_copy_threshold; + size_t direct_copy_threshold; + + // calculate default values of thresholds + if (device.type() & device::gpu) { + // GPUs + map_copy_threshold = 524288; // 0.5 MB + direct_copy_threshold = 52428800; // 50 MB } else { - // for non-contiguous input we first copy the values to - // a temporary std::vector and then copy from there - typedef typename std::iterator_traits<InputIterator>::value_type T; - std::vector<T> vector(iterator_range_size(first, last)); - copy_to_host(first, last, vector.begin(), queue); - return std::copy(vector.begin(), vector.end(), result); + // CPUs and other devices + map_copy_threshold = 134217728; // 128 MB + direct_copy_threshold = 0; // it's never efficient for CPUs } + + // load thresholds + map_copy_threshold = + parameters->get( + cache_key, "map_copy_threshold", map_copy_threshold + ); + direct_copy_threshold = + parameters->get( + cache_key, "direct_copy_threshold", direct_copy_threshold + ); + + // select copy method based on thresholds & input_size_bytes + size_t count = iterator_range_size(first, last); + size_t input_size_bytes = count * sizeof(input_type); + + // [0; map_copy_threshold) -> copy_to_device_map() + if(input_size_bytes < map_copy_threshold) { + return copy_to_device_map(first, last, result, queue); + } + // [map_copy_threshold; direct_copy_threshold) -> convert [first; last) + // on host and then perform copy_to_device() + else if(input_size_bytes < direct_copy_threshold) { + std::vector<output_type> vector(first, last); + return copy_to_device(vector.begin(), vector.end(), result, queue); + } + + // [direct_copy_threshold; 
inf) -> map [first; last) to device and + // run copy kernel on device for copying & casting + // At this point we are sure that count > 1 (first != last). + + // Perform async copy to device, wait for it to be finished and + // return the result. + // At this point we are sure that count > 1 (first != last), so event + // returned by dispatch_copy_async() must be valid. + return dispatch_copy_async(first, last, result, queue).get(); +} + +// host -> device +// InputIterator is NOT a contiguous iterator +template<class InputIterator, class OutputIterator> +inline OutputIterator +dispatch_copy(InputIterator first, + InputIterator last, + OutputIterator result, + command_queue &queue, + typename boost::enable_if< + mpl::and_< + mpl::not_< + is_device_iterator<InputIterator> + >, + is_device_iterator<OutputIterator>, + mpl::not_< + is_contiguous_iterator<InputIterator> + > + > + >::type* = 0) +{ + typedef typename OutputIterator::value_type output_type; + typedef typename std::iterator_traits<InputIterator>::value_type input_type; + + const device &device = queue.get_device(); + + // loading parameters + std::string cache_key = + std::string("__boost_compute_copy_to_device_") + + type_name<input_type>() + "_" + type_name<output_type>(); + boost::shared_ptr<parameter_cache> parameters = + detail::parameter_cache::get_global_cache(device); + + size_t map_copy_threshold; + size_t direct_copy_threshold; + + // calculate default values of thresholds + if (device.type() & device::gpu) { + // GPUs + map_copy_threshold = 524288; // 0.5 MB + direct_copy_threshold = 52428800; // 50 MB + } + else { + // CPUs and other devices + map_copy_threshold = 134217728; // 128 MB + direct_copy_threshold = 0; // it's never efficient for CPUs + } + + // load thresholds + map_copy_threshold = + parameters->get( + cache_key, "map_copy_threshold", map_copy_threshold + ); + direct_copy_threshold = + parameters->get( + cache_key, "direct_copy_threshold", direct_copy_threshold + ); + + // select copy method based on thresholds & input_size_bytes + size_t input_size = iterator_range_size(first, last); + size_t input_size_bytes = input_size * sizeof(input_type); + + // [0; map_copy_threshold) -> copy_to_device_map() + // + // if direct_copy_threshold is less than map_copy_threshold + // copy_to_device_map() is used for every input + if(input_size_bytes < map_copy_threshold + || direct_copy_threshold <= map_copy_threshold) { + return copy_to_device_map(first, last, result, queue); + } + // [map_copy_threshold; inf) -> convert [first; last) + // on host and then perform copy_to_device() + std::vector<output_type> vector(first, last); + return copy_to_device(vector.begin(), vector.end(), result, queue); } // device -> host (async) @@ -132,9 +360,14 @@ dispatch_copy_async(InputIterator first, InputIterator last, OutputIterator result, command_queue &queue, - typename boost::enable_if_c< - is_device_iterator<InputIterator>::value && - !is_device_iterator<OutputIterator>::value + typename boost::enable_if< + mpl::and_< + is_device_iterator<InputIterator>, + mpl::not_< + is_device_iterator<OutputIterator> + >, + is_same_value_type<OutputIterator, InputIterator> + > >::type* = 0) { BOOST_STATIC_ASSERT_MSG( @@ -145,6 +378,267 @@ dispatch_copy_async(InputIterator first, return copy_to_host_async(first, last, result, queue); } +// device -> host (async) +// Type mismatch between InputIterator and OutputIterator value_types +template<class InputIterator, class OutputIterator> +inline future<OutputIterator> 
+dispatch_copy_async(InputIterator first, + InputIterator last, + OutputIterator result, + command_queue &queue, + typename boost::enable_if< + mpl::and_< + is_device_iterator<InputIterator>, + mpl::not_< + is_device_iterator<OutputIterator> + >, + mpl::not_< + is_same_value_type<OutputIterator, InputIterator> + > + > + >::type* = 0) +{ + BOOST_STATIC_ASSERT_MSG( + is_contiguous_iterator<OutputIterator>::value, + "copy_async() is only supported for contiguous host iterators" + ); + + typedef typename std::iterator_traits<OutputIterator>::value_type output_type; + const context &context = queue.get_context(); + size_t count = iterator_range_size(first, last); + + if(count < size_t(1)) { + return future<OutputIterator>(); + } + + // map host memory to device + buffer mapped_host( + context, + count * sizeof(output_type), + buffer::write_only | buffer::use_host_ptr, + static_cast<void*>( + ::boost::addressof(*result) + ) + ); + // copy async on device + ::boost::compute::future<buffer_iterator<output_type> > future = + copy_on_device_async( + first, + last, + make_buffer_iterator<output_type>(mapped_host), + queue + ); + // update host memory asynchronously by maping and unmaping memory + event map_event; + void* ptr = queue.enqueue_map_buffer_async( + mapped_host, + CL_MAP_READ, + 0, + count * sizeof(output_type), + map_event, + future.get_event() + ); + event unmap_event = + queue.enqueue_unmap_buffer(mapped_host, ptr, map_event); + return make_future(result + count, unmap_event); +} + +// device -> host +// OutputIterator is a contiguous iterator +template<class InputIterator, class OutputIterator> +inline OutputIterator +dispatch_copy(InputIterator first, + InputIterator last, + OutputIterator result, + command_queue &queue, + typename boost::enable_if< + mpl::and_< + is_device_iterator<InputIterator>, + mpl::not_< + is_device_iterator<OutputIterator> + >, + is_same_value_type<OutputIterator, InputIterator>, + is_contiguous_iterator<OutputIterator>, + mpl::not_< + is_bool_value_type<OutputIterator> + > + > + >::type* = 0) +{ + return copy_to_host(first, last, result, queue); +} + +// device -> host +// Type mismatch between InputIterator and OutputIterator value_types +// OutputIterator is NOT a contiguous iterator or value_type of OutputIterator +// is a boolean type. 
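The dedicated boolean branch below exists because std::vector<bool> is bit-packed: dereferencing its iterator yields a proxy object, so there is no contiguous bool* that ::boost::addressof(*result) could produce for mapping. Ordinary calls reach this dispatch; a small sketch, assuming a default queue:

    #include <vector>
    #include <boost/compute/system.hpp>
    #include <boost/compute/algorithm/copy.hpp>
    #include <boost/compute/container/vector.hpp>

    int main()
    {
        namespace compute = boost::compute;
        compute::command_queue queue = compute::system::default_queue();

        int host[] = { 0, 1, 0, 2 };
        compute::vector<int> flags(host, host + 4, queue);

        // std::vector<bool> has no contiguous storage to map, so this
        // copy routes through the bool-aware overload below: it either
        // maps the device buffer and std::copy()s through the proxy
        // iterator, or stages through a temporary host vector -- never
        // a raw memcpy into &*result.
        std::vector<bool> result(4);
        compute::copy(flags.begin(), flags.end(), result.begin(), queue);
        return 0;
    }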
+template<class InputIterator, class OutputIterator> +inline OutputIterator +dispatch_copy(InputIterator first, + InputIterator last, + OutputIterator result, + command_queue &queue, + typename boost::enable_if< + mpl::and_< + is_device_iterator<InputIterator>, + mpl::not_< + is_device_iterator<OutputIterator> + >, + mpl::or_< + mpl::not_< + is_contiguous_iterator<OutputIterator> + >, + is_bool_value_type<OutputIterator> + > + > + >::type* = 0) +{ + typedef typename std::iterator_traits<OutputIterator>::value_type output_type; + typedef typename InputIterator::value_type input_type; + + const device &device = queue.get_device(); + + // loading parameters + std::string cache_key = + std::string("__boost_compute_copy_to_host_") + + type_name<input_type>() + "_" + type_name<output_type>(); + boost::shared_ptr<parameter_cache> parameters = + detail::parameter_cache::get_global_cache(device); + + size_t map_copy_threshold; + size_t direct_copy_threshold; + + // calculate default values of thresholds + if (device.type() & device::gpu) { + // GPUs + map_copy_threshold = 33554432; // 30 MB + direct_copy_threshold = 0; // it's never efficient for GPUs + } + else { + // CPUs and other devices + map_copy_threshold = 134217728; // 128 MB + direct_copy_threshold = 0; // it's never efficient for CPUs + } + + // load thresholds + map_copy_threshold = + parameters->get( + cache_key, "map_copy_threshold", map_copy_threshold + ); + direct_copy_threshold = + parameters->get( + cache_key, "direct_copy_threshold", direct_copy_threshold + ); + + // select copy method based on thresholds & input_size_bytes + size_t count = iterator_range_size(first, last); + size_t input_size_bytes = count * sizeof(input_type); + + // [0; map_copy_threshold) -> copy_to_host_map() + // + // if direct_copy_threshold is less than map_copy_threshold + // copy_to_host_map() is used for every input + if(input_size_bytes < map_copy_threshold + || direct_copy_threshold <= map_copy_threshold) { + return copy_to_host_map(first, last, result, queue); + } + // [map_copy_threshold; inf) -> copy [first;last) to temporary vector + // then copy (and convert) to result using std::copy() + std::vector<input_type> vector(count); + copy_to_host(first, last, vector.begin(), queue); + return std::copy(vector.begin(), vector.end(), result); +} + +// device -> host +// Type mismatch between InputIterator and OutputIterator value_types +// OutputIterator is a contiguous iterator +// value_type of OutputIterator is NOT a boolean type +template<class InputIterator, class OutputIterator> +inline OutputIterator +dispatch_copy(InputIterator first, + InputIterator last, + OutputIterator result, + command_queue &queue, + typename boost::enable_if< + mpl::and_< + is_device_iterator<InputIterator>, + mpl::not_< + is_device_iterator<OutputIterator> + >, + mpl::not_< + is_same_value_type<OutputIterator, InputIterator> + >, + is_contiguous_iterator<OutputIterator>, + mpl::not_< + is_bool_value_type<OutputIterator> + > + > + >::type* = 0) +{ + typedef typename std::iterator_traits<OutputIterator>::value_type output_type; + typedef typename InputIterator::value_type input_type; + + const device &device = queue.get_device(); + + // loading parameters + std::string cache_key = + std::string("__boost_compute_copy_to_host_") + + type_name<input_type>() + "_" + type_name<output_type>(); + boost::shared_ptr<parameter_cache> parameters = + detail::parameter_cache::get_global_cache(device); + + size_t map_copy_threshold; + size_t direct_copy_threshold; + + // calculate 
default values of thresholds + if (device.type() & device::gpu) { + // GPUs + map_copy_threshold = 524288; // 0.5 MB + direct_copy_threshold = 52428800; // 50 MB + } + else { + // CPUs and other devices + map_copy_threshold = 134217728; // 128 MB + direct_copy_threshold = 0; // it's never efficient for CPUs + } + + // load thresholds + map_copy_threshold = + parameters->get( + cache_key, "map_copy_threshold", map_copy_threshold + ); + direct_copy_threshold = + parameters->get( + cache_key, "direct_copy_threshold", direct_copy_threshold + ); + + // select copy method based on thresholds & input_size_bytes + size_t count = iterator_range_size(first, last); + size_t input_size_bytes = count * sizeof(input_type); + + // [0; map_copy_threshold) -> copy_to_host_map() + if(input_size_bytes < map_copy_threshold) { + return copy_to_host_map(first, last, result, queue); + } + // [map_copy_threshold; direct_copy_threshold) -> copy [first;last) to + // temporary vector then copy (and convert) to result using std::copy() + else if(input_size_bytes < direct_copy_threshold) { + std::vector<input_type> vector(count); + copy_to_host(first, last, vector.begin(), queue); + return std::copy(vector.begin(), vector.end(), result); + } + + // [direct_copy_threshold; inf) -> map [result; result + input_size) to + // device and run copy kernel on device for copying & casting + // map host memory to device. + + // Perform async copy to host, wait for it to be finished and + // return the result. + // At this point we are sure that count > 1 (first != last), so event + // returned by dispatch_copy_async() must be valid. + return dispatch_copy_async(first, last, result, queue).get(); +} + // device -> device template<class InputIterator, class OutputIterator> inline OutputIterator diff --git a/boost/compute/algorithm/detail/copy_on_device.hpp b/boost/compute/algorithm/detail/copy_on_device.hpp index 0bcee27ed5..8738c8c0b4 100644 --- a/boost/compute/algorithm/detail/copy_on_device.hpp +++ b/boost/compute/algorithm/detail/copy_on_device.hpp @@ -22,84 +22,96 @@ #include <boost/compute/detail/meta_kernel.hpp> #include <boost/compute/detail/parameter_cache.hpp> #include <boost/compute/detail/work_size.hpp> +#include <boost/compute/detail/vendor.hpp> namespace boost { namespace compute { namespace detail { -inline size_t pick_copy_work_group_size(size_t n, const device &device) +template<class InputIterator, class OutputIterator> +inline event copy_on_device_cpu(InputIterator first, + OutputIterator result, + size_t count, + command_queue &queue) { - (void) device; - - if(n % 32 == 0) return 32; - else if(n % 16 == 0) return 16; - else if(n % 8 == 0) return 8; - else if(n % 4 == 0) return 4; - else if(n % 2 == 0) return 2; - else return 1; + meta_kernel k("copy"); + const device& device = queue.get_device(); + + k << + "uint block = " << + "(uint)ceil(((float)count)/get_global_size(0));\n" << + "uint index = get_global_id(0) * block;\n" << + "uint end = min(count, index + block);\n" << + "while(index < end){\n" << + result[k.var<uint_>("index")] << '=' << + first[k.var<uint_>("index")] << ";\n" << + "index++;\n" << + "}\n"; + + k.add_set_arg<const uint_>("count", static_cast<uint_>(count)); + + size_t global_work_size = device.compute_units(); + if(count <= 1024) global_work_size = 1; + return k.exec_1d(queue, 0, global_work_size); } template<class InputIterator, class OutputIterator> -class copy_kernel : public meta_kernel +inline event copy_on_device_gpu(InputIterator first, + OutputIterator result, + size_t count, + 
command_queue &queue) { -public: - copy_kernel(const device &device) - : meta_kernel("copy") - { - m_count = 0; - - typedef typename std::iterator_traits<InputIterator>::value_type input_type; - - boost::shared_ptr<parameter_cache> parameters = - detail::parameter_cache::get_global_cache(device); - - std::string cache_key = - "__boost_copy_kernel_" + boost::lexical_cast<std::string>(sizeof(input_type)); + typedef typename std::iterator_traits<InputIterator>::value_type input_type; + + const device& device = queue.get_device(); + boost::shared_ptr<parameter_cache> parameters = + detail::parameter_cache::get_global_cache(device); + std::string cache_key = + "__boost_copy_kernel_" + boost::lexical_cast<std::string>(sizeof(input_type)); + + uint_ vpt = parameters->get(cache_key, "vpt", 4); + uint_ tpb = parameters->get(cache_key, "tpb", 128); + + meta_kernel k("copy"); + k << + "uint index = get_local_id(0) + " << + "(" << vpt * tpb << " * get_group_id(0));\n" << + "for(uint i = 0; i < " << vpt << "; i++){\n" << + " if(index < count){\n" << + result[k.var<uint_>("index")] << '=' << + first[k.var<uint_>("index")] << ";\n" << + " index += " << tpb << ";\n" + " }\n" + "}\n"; + + k.add_set_arg<const uint_>("count", static_cast<uint_>(count)); + size_t global_work_size = calculate_work_size(count, vpt, tpb); + return k.exec_1d(queue, 0, global_work_size, tpb); +} - m_vpt = parameters->get(cache_key, "vpt", 4); - m_tpb = parameters->get(cache_key, "tpb", 128); - } +template<class InputIterator, class OutputIterator> +inline event dispatch_copy_on_device(InputIterator first, + InputIterator last, + OutputIterator result, + command_queue &queue) +{ + const size_t count = detail::iterator_range_size(first, last); - void set_range(InputIterator first, - InputIterator last, - OutputIterator result) - { - m_count_arg = add_arg<uint_>("count"); - - *this << - "uint index = get_local_id(0) + " << - "(" << m_vpt * m_tpb << " * get_group_id(0));\n" << - "for(uint i = 0; i < " << m_vpt << "; i++){\n" << - " if(index < count){\n" << - result[expr<uint_>("index")] << '=' << - first[expr<uint_>("index")] << ";\n" << - " index += " << m_tpb << ";\n" - " }\n" - "}\n"; - - m_count = detail::iterator_range_size(first, last); + if(count == 0){ + // nothing to do + return event(); } - event exec(command_queue &queue) + const device& device = queue.get_device(); + // copy_on_device_cpu() does not work for CPU on Apple platform + // due to bug in its compiler. 
+ // See https://github.com/boostorg/compute/pull/626 + if((device.type() & device::cpu) && !is_apple_platform_device(device)) { - if(m_count == 0){ - // nothing to do - return event(); - } - - size_t global_work_size = calculate_work_size(m_count, m_vpt, m_tpb); - - set_arg(m_count_arg, uint_(m_count)); - - return exec_1d(queue, 0, global_work_size, m_tpb); + return copy_on_device_cpu(first, result, count, queue); } - -private: - size_t m_count; - size_t m_count_arg; - uint_ m_vpt; - uint_ m_tpb; -}; + return copy_on_device_gpu(first, result, count, queue); +} template<class InputIterator, class OutputIterator> inline OutputIterator copy_on_device(InputIterator first, @@ -107,13 +119,7 @@ inline OutputIterator copy_on_device(InputIterator first, OutputIterator result, command_queue &queue) { - const device &device = queue.get_device(); - - copy_kernel<InputIterator, OutputIterator> kernel(device); - - kernel.set_range(first, last, result); - kernel.exec(queue); - + dispatch_copy_on_device(first, last, result, queue); return result + std::distance(first, last); } @@ -134,13 +140,7 @@ inline future<OutputIterator> copy_on_device_async(InputIterator first, OutputIterator result, command_queue &queue) { - const device &device = queue.get_device(); - - copy_kernel<InputIterator, OutputIterator> kernel(device); - - kernel.set_range(first, last, result); - event event_ = kernel.exec(queue); - + event event_ = dispatch_copy_on_device(first, last, result, queue); return make_future(result + std::distance(first, last), event_); } @@ -172,7 +172,7 @@ inline future<svm_ptr<T> > copy_on_device_async(svm_ptr<T> first, { size_t count = iterator_range_size(first, last); if(count == 0){ - return result; + return future<svm_ptr<T> >(); } event event_ = queue.enqueue_svm_memcpy_async( diff --git a/boost/compute/algorithm/detail/copy_to_device.hpp b/boost/compute/algorithm/detail/copy_to_device.hpp index 90545fb4ed..bce5975f53 100644 --- a/boost/compute/algorithm/detail/copy_to_device.hpp +++ b/boost/compute/algorithm/detail/copy_to_device.hpp @@ -53,6 +53,49 @@ inline DeviceIterator copy_to_device(HostIterator first, } template<class HostIterator, class DeviceIterator> +inline DeviceIterator copy_to_device_map(HostIterator first, + HostIterator last, + DeviceIterator result, + command_queue &queue) +{ + typedef typename + std::iterator_traits<DeviceIterator>::value_type + value_type; + typedef typename + std::iterator_traits<DeviceIterator>::difference_type + difference_type; + + size_t count = iterator_range_size(first, last); + if(count == 0){ + return result; + } + + size_t offset = result.get_index(); + + // map result buffer to host + value_type *pointer = static_cast<value_type*>( + queue.enqueue_map_buffer( + result.get_buffer(), + CL_MAP_WRITE, + offset * sizeof(value_type), + count * sizeof(value_type) + ) + ); + + // copy [first; last) to result buffer + std::copy(first, last, pointer); + + // unmap result buffer + boost::compute::event unmap_event = queue.enqueue_unmap_buffer( + result.get_buffer(), + static_cast<void*>(pointer) + ); + unmap_event.wait(); + + return result + static_cast<difference_type>(count); +} + +template<class HostIterator, class DeviceIterator> inline future<DeviceIterator> copy_to_device_async(HostIterator first, HostIterator last, DeviceIterator result, @@ -109,7 +152,7 @@ inline future<svm_ptr<T> > copy_to_device_async(HostIterator first, { size_t count = iterator_range_size(first, last); if(count == 0){ - return result; + return future<svm_ptr<T> >(); } event event_ = 
queue.enqueue_svm_memcpy_async( @@ -118,6 +161,29 @@ inline future<svm_ptr<T> > copy_to_device_async(HostIterator first, return make_future(result + count, event_); } + +template<class HostIterator, class T> +inline svm_ptr<T> copy_to_device_map(HostIterator first, + HostIterator last, + svm_ptr<T> result, + command_queue &queue) +{ + size_t count = iterator_range_size(first, last); + if(count == 0){ + return result; + } + + // map + queue.enqueue_svm_map(result.get(), count * sizeof(T), CL_MAP_WRITE); + + // copy [first; last) to result buffer + std::copy(first, last, static_cast<T*>(result.get())); + + // unmap result + queue.enqueue_svm_unmap(result.get()).wait(); + + return result + count; +} #endif // CL_VERSION_2_0 } // end detail namespace diff --git a/boost/compute/algorithm/detail/copy_to_host.hpp b/boost/compute/algorithm/detail/copy_to_host.hpp index b889e0c871..d770a996ef 100644 --- a/boost/compute/algorithm/detail/copy_to_host.hpp +++ b/boost/compute/algorithm/detail/copy_to_host.hpp @@ -51,17 +51,51 @@ inline HostIterator copy_to_host(DeviceIterator first, return iterator_plus_distance(result, count); } -// copy_to_host() specialization for std::vector<bool> -template<class DeviceIterator> -inline std::vector<bool>::iterator -copy_to_host(DeviceIterator first, - DeviceIterator last, - std::vector<bool>::iterator result, - command_queue &queue) +template<class DeviceIterator, class HostIterator> +inline HostIterator copy_to_host_map(DeviceIterator first, + DeviceIterator last, + HostIterator result, + command_queue &queue) { - std::vector<uint8_t> temp(std::distance(first, last)); - copy_to_host(first, last, temp.begin(), queue); - return std::copy(temp.begin(), temp.end(), result); + typedef typename + std::iterator_traits<DeviceIterator>::value_type + value_type; + typedef typename + std::iterator_traits<DeviceIterator>::difference_type + difference_type; + + size_t count = iterator_range_size(first, last); + if(count == 0){ + return result; + } + + size_t offset = first.get_index(); + + // map [first; last) buffer to host + value_type *pointer = static_cast<value_type*>( + queue.enqueue_map_buffer( + first.get_buffer(), + CL_MAP_READ, + offset * sizeof(value_type), + count * sizeof(value_type) + ) + ); + + // copy [first; last) to result buffer + std::copy( + pointer, + pointer + static_cast<difference_type>(count), + result + ); + + // unmap [first; last) + boost::compute::event unmap_event = queue.enqueue_unmap_buffer( + first.get_buffer(), + static_cast<void*>(pointer) + ); + unmap_event.wait(); + + return iterator_plus_distance(result, count); } template<class DeviceIterator, class HostIterator> @@ -119,7 +153,7 @@ inline future<HostIterator> copy_to_host_async(svm_ptr<T> first, { size_t count = iterator_range_size(first, last); if(count == 0){ - return result; + return future<HostIterator>(); } event event_ = queue.enqueue_svm_memcpy_async( @@ -128,6 +162,33 @@ inline future<HostIterator> copy_to_host_async(svm_ptr<T> first, return make_future(iterator_plus_distance(result, count), event_); } + +template<class T, class HostIterator> +inline HostIterator copy_to_host_map(svm_ptr<T> first, + svm_ptr<T> last, + HostIterator result, + command_queue &queue) +{ + size_t count = iterator_range_size(first, last); + if(count == 0){ + return result; + } + + // map + queue.enqueue_svm_map(first.get(), count * sizeof(T), CL_MAP_READ); + + // copy [first; last) to result + std::copy( + static_cast<T*>(first.get()), + static_cast<T*>(last.get()), + result + ); + + // unmap [first; 
last) + queue.enqueue_svm_unmap(first.get()).wait(); + + return iterator_plus_distance(result, count); +} #endif // CL_VERSION_2_0 } // end detail namespace diff --git a/boost/compute/algorithm/detail/find_extrema.hpp b/boost/compute/algorithm/detail/find_extrema.hpp index 6e756c3904..eef2e36c3c 100644 --- a/boost/compute/algorithm/detail/find_extrema.hpp +++ b/boost/compute/algorithm/detail/find_extrema.hpp @@ -12,6 +12,7 @@ #define BOOST_COMPUTE_ALGORITHM_DETAIL_FIND_EXTREMA_HPP #include <boost/compute/detail/iterator_range_size.hpp> +#include <boost/compute/algorithm/detail/find_extrema_on_cpu.hpp> #include <boost/compute/algorithm/detail/find_extrema_with_reduce.hpp> #include <boost/compute/algorithm/detail/find_extrema_with_atomics.hpp> #include <boost/compute/algorithm/detail/serial_find_extrema.hpp> @@ -36,12 +37,17 @@ inline InputIterator find_extrema(InputIterator first, const device &device = queue.get_device(); + // CPU + if(device.type() & device::cpu) { + return find_extrema_on_cpu(first, last, compare, find_minimum, queue); + } + + // GPU // use serial method for small inputs - // and when device is a CPU - if(count < 512 || (device.type() & device::cpu)){ + if(count < 512) + { return serial_find_extrema(first, last, compare, find_minimum, queue); } - // find_extrema_with_reduce() is used only if requirements are met if(find_extrema_with_reduce_requirements_met(first, last, queue)) { diff --git a/boost/compute/algorithm/detail/find_extrema_on_cpu.hpp b/boost/compute/algorithm/detail/find_extrema_on_cpu.hpp new file mode 100644 index 0000000000..9363ad5837 --- /dev/null +++ b/boost/compute/algorithm/detail/find_extrema_on_cpu.hpp @@ -0,0 +1,138 @@ +//---------------------------------------------------------------------------// +// Copyright (c) 2016 Jakub Szuppe <j.szuppe@gmail.com> +// +// Distributed under the Boost Software License, Version 1.0 +// See accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt +// +// See http://boostorg.github.com/compute for more information. 
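find_extrema() is the engine behind the public min_element()/max_element() algorithms, so the new CPU branch above is reached by ordinary calls such as this sketch (default queue assumed; on a CPU device the dispatch now selects find_extrema_on_cpu(), with serial_find_extrema() as the small-input fallback):

    #include <boost/compute/system.hpp>
    #include <boost/compute/algorithm/max_element.hpp>
    #include <boost/compute/container/vector.hpp>

    int main()
    {
        namespace compute = boost::compute;
        compute::command_queue queue = compute::system::default_queue();

        float host[] = { 3.f, 9.f, 1.f, 4.f };
        compute::vector<float> vec(host, host + 4, queue);

        // dispatches through detail::find_extrema(); on a CPU device
        // this now uses the find_extrema_on_cpu() path
        compute::vector<float>::iterator max_it =
            compute::max_element(vec.begin(), vec.end(), queue);
        (void) max_it;
        return 0;
    }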
+//---------------------------------------------------------------------------// + +#ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_FIND_EXTREMA_ON_CPU_HPP +#define BOOST_COMPUTE_ALGORITHM_DETAIL_FIND_EXTREMA_ON_CPU_HPP + +#include <algorithm> + +#include <boost/compute/algorithm/detail/find_extrema_with_reduce.hpp> +#include <boost/compute/algorithm/detail/find_extrema_with_atomics.hpp> +#include <boost/compute/algorithm/detail/serial_find_extrema.hpp> +#include <boost/compute/detail/iterator_range_size.hpp> +#include <boost/compute/iterator/buffer_iterator.hpp> + +namespace boost { +namespace compute { +namespace detail { + +template<class InputIterator, class Compare> +inline InputIterator find_extrema_on_cpu(InputIterator first, + InputIterator last, + Compare compare, + const bool find_minimum, + command_queue &queue) +{ + typedef typename std::iterator_traits<InputIterator>::value_type input_type; + typedef typename std::iterator_traits<InputIterator>::difference_type difference_type; + size_t count = iterator_range_size(first, last); + + const device &device = queue.get_device(); + const uint_ compute_units = queue.get_device().compute_units(); + + boost::shared_ptr<parameter_cache> parameters = + detail::parameter_cache::get_global_cache(device); + std::string cache_key = + "__boost_find_extrema_cpu_" + + boost::lexical_cast<std::string>(sizeof(input_type)); + + // for inputs smaller than serial_find_extrema_threshold + // serial_find_extrema algorithm is used + uint_ serial_find_extrema_threshold = parameters->get( + cache_key, + "serial_find_extrema_threshold", + 16384 * sizeof(input_type) + ); + serial_find_extrema_threshold = + (std::max)(serial_find_extrema_threshold, uint_(2 * compute_units)); + + const context &context = queue.get_context(); + if(count < serial_find_extrema_threshold) { + return serial_find_extrema(first, last, compare, find_minimum, queue); + } + + meta_kernel k("find_extrema_on_cpu"); + buffer output(context, sizeof(input_type) * compute_units); + buffer output_idx( + context, sizeof(uint_) * compute_units, + buffer::read_write | buffer::alloc_host_ptr + ); + + size_t count_arg = k.add_arg<uint_>("count"); + size_t output_arg = + k.add_arg<input_type *>(memory_object::global_memory, "output"); + size_t output_idx_arg = + k.add_arg<uint_ *>(memory_object::global_memory, "output_idx"); + + k << + "uint block = " << + "(uint)ceil(((float)count)/get_global_size(0));\n" << + "uint index = get_global_id(0) * block;\n" << + "uint end = min(count, index + block);\n" << + + "uint value_index = index;\n" << + k.decl<input_type>("value") << " = " << first[k.var<uint_>("index")] << ";\n" << + + "index++;\n" << + "while(index < end){\n" << + k.decl<input_type>("candidate") << + " = " << first[k.var<uint_>("index")] << ";\n" << + "#ifndef BOOST_COMPUTE_FIND_MAXIMUM\n" << + "bool compare = " << compare(k.var<input_type>("candidate"), + k.var<input_type>("value")) << ";\n" << + "#else\n" << + "bool compare = " << compare(k.var<input_type>("value"), + k.var<input_type>("candidate")) << ";\n" << + "#endif\n" << + "value = compare ? candidate : value;\n" << + "value_index = compare ? 
index : value_index;\n" << + "index++;\n" << + "}\n" << + "output[get_global_id(0)] = value;\n" << + "output_idx[get_global_id(0)] = value_index;\n"; + + size_t global_work_size = compute_units; + std::string options; + if(!find_minimum){ + options = "-DBOOST_COMPUTE_FIND_MAXIMUM"; + } + kernel kernel = k.compile(context, options); + + kernel.set_arg(count_arg, static_cast<uint_>(count)); + kernel.set_arg(output_arg, output); + kernel.set_arg(output_idx_arg, output_idx); + queue.enqueue_1d_range_kernel(kernel, 0, global_work_size, 0); + + buffer_iterator<input_type> result = serial_find_extrema( + make_buffer_iterator<input_type>(output), + make_buffer_iterator<input_type>(output, global_work_size), + compare, + find_minimum, + queue + ); + + uint_* output_idx_host_ptr = + static_cast<uint_*>( + queue.enqueue_map_buffer( + output_idx, command_queue::map_read, + 0, global_work_size * sizeof(uint_) + ) + ); + + difference_type extremum_idx = + static_cast<difference_type>(*(output_idx_host_ptr + result.get_index())); + return first + extremum_idx; +} + +} // end detail namespace +} // end compute namespace +} // end boost namespace + +#endif // BOOST_COMPUTE_ALGORITHM_DETAIL_FIND_EXTREMA_ON_CPU_HPP diff --git a/boost/compute/algorithm/detail/find_extrema_with_reduce.hpp b/boost/compute/algorithm/detail/find_extrema_with_reduce.hpp index 1fbb7dee19..8f2a83c38b 100644 --- a/boost/compute/algorithm/detail/find_extrema_with_reduce.hpp +++ b/boost/compute/algorithm/detail/find_extrema_with_reduce.hpp @@ -128,7 +128,7 @@ inline void find_extrema_with_reduce(InputIterator input, // Next element k.decl<input_type>("next") << " = " << input[k.var<uint_>("idx")] << ";\n" << "#ifdef BOOST_COMPUTE_USE_INPUT_IDX\n" << - k.decl<input_type>("next_idx") << " = " << input_idx[k.var<uint_>("idx")] << ";\n" << + k.decl<uint_>("next_idx") << " = " << input_idx[k.var<uint_>("idx")] << ";\n" << "#endif\n" << // Comparison between currently best element (acc) and next element diff --git a/boost/compute/algorithm/detail/merge_sort_on_gpu.hpp b/boost/compute/algorithm/detail/merge_sort_on_gpu.hpp new file mode 100644 index 0000000000..e62c6beb8d --- /dev/null +++ b/boost/compute/algorithm/detail/merge_sort_on_gpu.hpp @@ -0,0 +1,590 @@ +//---------------------------------------------------------------------------// +// Copyright (c) 2016 Jakub Szuppe <j.szuppe@gmail.com> +// +// Distributed under the Boost Software License, Version 1.0 +// See accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt +// +// See http://boostorg.github.com/compute for more information. 
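The find_extrema_on_cpu() kernel above is a two-stage reduction: each compute unit scans one contiguous block and emits a single (value, index) candidate, then serial_find_extrema() reduces the per-unit candidates and the mapped output_idx buffer recovers the original index. A host-side model of that shape for the minimum case (plain C++, illustrative only; find_min_index_two_stage and p are names invented here, with p playing the role of compute_units):

    #include <algorithm>
    #include <cstddef>
    #include <utility>
    #include <vector>

    // stage 1: one (value, index) candidate per "compute unit";
    // stage 2: serial reduction over the candidates
    std::size_t find_min_index_two_stage(const std::vector<int> &in,
                                         std::size_t p)
    {
        const std::size_t n = in.size();            // assumes n >= 1, p >= 1
        const std::size_t block = (n + p - 1) / p;  // ceil(n / p)
        std::vector<std::pair<int, std::size_t> > candidates;
        for(std::size_t u = 0; u < p && u * block < n; u++) {
            std::size_t index = u * block;          // u ~ get_global_id(0)
            const std::size_t end = (std::min)(n, index + block);
            std::size_t best = index;
            for(++index; index < end; index++) {
                if(in[index] < in[best]) { best = index; }
            }
            candidates.push_back(std::make_pair(in[best], best));
        }
        std::size_t winner = 0;                     // serial_find_extrema stage
        for(std::size_t c = 1; c < candidates.size(); c++) {
            if(candidates[c].first < candidates[winner].first) { winner = c; }
        }
        return candidates[winner].second;
    }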
+//---------------------------------------------------------------------------// + +#ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_MERGE_SORT_ON_GPU_HPP_ +#define BOOST_COMPUTE_ALGORITHM_DETAIL_MERGE_SORT_ON_GPU_HPP_ + +#include <algorithm> + +#include <boost/compute/kernel.hpp> +#include <boost/compute/program.hpp> +#include <boost/compute/command_queue.hpp> +#include <boost/compute/container/vector.hpp> +#include <boost/compute/memory/local_buffer.hpp> +#include <boost/compute/detail/meta_kernel.hpp> +#include <boost/compute/detail/iterator_range_size.hpp> + +namespace boost { +namespace compute { +namespace detail { + +template<class KeyType, class ValueType> +inline size_t pick_bitonic_block_sort_block_size(size_t proposed_wg, + size_t lmem_size, + bool sort_by_key) +{ + size_t n = proposed_wg; + + size_t lmem_required = n * sizeof(KeyType); + if(sort_by_key) { + lmem_required += n * sizeof(ValueType); + } + + // try to force at least 4 work-groups of >64 elements + // for better occupancy + while(lmem_size < (lmem_required * 4) && (n > 64)) { + n /= 2; + lmem_required = n * sizeof(KeyType); + } + while(lmem_size < lmem_required && (n != 1)) { + n /= 2; + if(n < 1) n = 1; + lmem_required = n * sizeof(KeyType); + } + + if(n < 2) { return 1; } + else if(n < 4) { return 2; } + else if(n < 8) { return 4; } + else if(n < 16) { return 8; } + else if(n < 32) { return 16; } + else if(n < 64) { return 32; } + else if(n < 128) { return 64; } + else if(n < 256) { return 128; } + else { return 256; } +} + + +/// Performs bitonic block sort according to \p compare. +/// +/// Since bitonic sort can be only performed when input size is equal to 2^n, +/// in this case input size is block size (\p work_group_size), we would have +/// to require \p count be a exact multiple of block size. That would not be +/// great. +/// Instead, bitonic sort kernel is merged with odd-even merge sort so if the +/// last block is not equal to 2^n (where n is some natural number) the odd-even +/// sort is performed for that block. That way bitonic_block_sort() works for +/// input of any size. Block size (\p work_group_size) still have to be equal +/// to 2^n. +/// +/// This is NOT stable. 
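The kernel's direction/sibling logic is easier to follow on the host. A plain C++ sketch of the same in-block bitonic network (power-of-two size assumed, ascending result; this mirrors the kernel's length/k loops and its swap = compare ^ (sibling < lid) ^ direction rule, and is not Boost.Compute API):

    #include <algorithm>
    #include <cstddef>
    #include <vector>

    // host-side model of the kernel's bitonic loops for one block
    void bitonic_block_sort_host(std::vector<int> &keys)
    {
        const std::size_t n = keys.size(); // must be a power of two
        for(std::size_t length = 1; length < n; length <<= 1) {
            for(std::size_t k = length; k > 0; k >>= 1) {
                for(std::size_t lid = 0; lid < n; lid++) {
                    // direction of sort: false -> asc, true -> desc
                    const bool direction = (lid & (length << 1)) != 0;
                    const std::size_t sibling = lid ^ k; // compare partner
                    if(sibling > lid) { // handle each pair once on the host
                        const bool compare = keys[sibling] < keys[lid];
                        if(compare != direction) {
                            std::swap(keys[lid], keys[sibling]);
                        }
                    }
                }
            }
        }
    }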
+/// +/// \param keys_first first key element in the range to sort +/// \param values_first first value element in the range to sort +/// \param compare comparison function for keys +/// \param count number of elements in the range; count > 0 +/// \param work_group_size size of the work group, also the block size; must be +/// equal to n^2 where n is natural number +/// \param queue command queue to perform the operation +template<class KeyIterator, class ValueIterator, class Compare> +inline size_t bitonic_block_sort(KeyIterator keys_first, + ValueIterator values_first, + Compare compare, + const size_t count, + const bool sort_by_key, + command_queue &queue) +{ + typedef typename std::iterator_traits<KeyIterator>::value_type key_type; + + meta_kernel k("bitonic_block_sort"); + size_t count_arg = k.add_arg<const uint_>("count"); + + size_t local_keys_arg = k.add_arg<key_type *>(memory_object::local_memory, "lkeys"); + size_t local_vals_arg = 0; + if(sort_by_key) { + local_vals_arg = k.add_arg<uchar_ *>(memory_object::local_memory, "lidx"); + } + + k << + // Work item global and local ids + k.decl<const uint_>("gid") << " = get_global_id(0);\n" << + k.decl<const uint_>("lid") << " = get_local_id(0);\n"; + + // declare my_key and my_value + k << + k.decl<key_type>("my_key") << ";\n"; + // Instead of copying values (my_value) in local memory with keys + // we save local index (uchar) and copy my_value at the end at + // final index. This saves local memory. + if(sort_by_key) + { + k << + k.decl<uchar_>("my_index") << " = (uchar)(lid);\n"; + } + + // load key + k << + "if(gid < count) {\n" << + k.var<key_type>("my_key") << " = " << + keys_first[k.var<const uint_>("gid")] << ";\n" << + "}\n"; + + // load key and index to local memory + k << + "lkeys[lid] = my_key;\n"; + if(sort_by_key) + { + k << + "lidx[lid] = my_index;\n"; + } + k << + k.decl<const uint_>("offset") << " = get_group_id(0) * get_local_size(0);\n" << + k.decl<const uint_>("n") << " = min((uint)(get_local_size(0)),(count - offset));\n"; + + // When work group size is a power of 2 bitonic sorter can be used; + // otherwise, slower odd-even sort is used. + + k << + // check if n is power of 2 + "if(((n != 0) && ((n & (~n + 1)) == n))) {\n"; + + // bitonic sort, not stable + k << + // wait for keys and vals to be stored in local memory + "barrier(CLK_LOCAL_MEM_FENCE);\n" << + + "#pragma unroll\n" << + "for(" << + k.decl<uint_>("length") << " = 1; " << + "length < n; " << + "length <<= 1" << + ") {\n" << + // direction of sort: false -> asc, true -> desc + k.decl<bool>("direction") << "= ((lid & (length<<1)) != 0);\n" << + "for(" << + k.decl<uint_>("k") << " = length; " << + "k > 0; " << + "k >>= 1" << + ") {\n" << + + // sibling to compare with my key + k.decl<uint_>("sibling_idx") << " = lid ^ k;\n" << + k.decl<key_type>("sibling_key") << " = lkeys[sibling_idx];\n" << + k.decl<bool>("compare") << " = " << + compare(k.var<key_type>("sibling_key"), + k.var<key_type>("my_key")) << ";\n" << + k.decl<bool>("swap") << + " = compare ^ (sibling_idx < lid) ^ direction;\n" << + "my_key = swap ? sibling_key : my_key;\n"; + if(sort_by_key) + { + k << + "my_index = swap ? 
lidx[sibling_idx] : my_index;\n"; + } + k << + "barrier(CLK_LOCAL_MEM_FENCE);\n" << + "lkeys[lid] = my_key;\n"; + if(sort_by_key) + { + k << + "lidx[lid] = my_index;\n"; + } + k << + "barrier(CLK_LOCAL_MEM_FENCE);\n" << + "}\n" << + "}\n"; + + // end of bitonic sort + + // odd-even sort, not stable + k << + "}\n" << + "else { \n"; + + k << + k.decl<bool>("lid_is_even") << " = (lid%2) == 0;\n" << + k.decl<uint_>("oddsibling_idx") << " = " << + "(lid_is_even) ? max(lid,(uint)(1)) - 1 : min(lid+1,n-1);\n" << + k.decl<uint_>("evensibling_idx") << " = " << + "(lid_is_even) ? min(lid+1,n-1) : max(lid,(uint)(1)) - 1;\n" << + + // wait for keys and vals to be stored in local memory + "barrier(CLK_LOCAL_MEM_FENCE);\n" << + + "#pragma unroll\n" << + "for(" << + k.decl<uint_>("i") << " = 0; " << + "i < n; " << + "i++" << + ") {\n" << + k.decl<uint_>("sibling_idx") << + " = i%2 == 0 ? evensibling_idx : oddsibling_idx;\n" << + k.decl<key_type>("sibling_key") << " = lkeys[sibling_idx];\n" << + k.decl<bool>("compare") << " = " << + compare(k.var<key_type>("sibling_key"), + k.var<key_type>("my_key")) << ";\n" << + k.decl<bool>("swap") << + " = compare ^ (sibling_idx < lid);\n" << + "my_key = swap ? sibling_key : my_key;\n"; + if(sort_by_key) + { + k << + "my_index = swap ? lidx[sibling_idx] : my_index;\n"; + } + k << + "barrier(CLK_LOCAL_MEM_FENCE);\n" << + "lkeys[lid] = my_key;\n"; + if(sort_by_key) + { + k << + "lidx[lid] = my_index;\n"; + } + k << + "barrier(CLK_LOCAL_MEM_FENCE);\n" + "}\n" << // for + + "}\n"; // else + // end of odd-even sort + + // save key and value + k << + "if(gid < count) {\n" << + keys_first[k.var<const uint_>("gid")] << " = " << + k.var<key_type>("my_key") << ";\n"; + if(sort_by_key) + { + k << values_first[k.var<const uint_>("gid")] << " = " << + values_first[k.var<const uint_>("offset + my_index")] << ";\n"; + } + k << + // end if + "}\n"; + + const context &context = queue.get_context(); + const device &device = queue.get_device(); + ::boost::compute::kernel kernel = k.compile(context); + + const size_t work_group_size = + pick_bitonic_block_sort_block_size<key_type, uchar_>( + kernel.get_work_group_info<size_t>( + device, CL_KERNEL_WORK_GROUP_SIZE + ), + device.get_info<size_t>(CL_DEVICE_LOCAL_MEM_SIZE), + sort_by_key + ); + + const size_t global_size = + work_group_size * static_cast<size_t>( + std::ceil(float(count) / work_group_size) + ); + + kernel.set_arg(count_arg, static_cast<uint_>(count)); + kernel.set_arg(local_keys_arg, local_buffer<key_type>(work_group_size)); + if(sort_by_key) { + kernel.set_arg(local_vals_arg, local_buffer<uchar_>(work_group_size)); + } + + queue.enqueue_1d_range_kernel(kernel, 0, global_size, work_group_size); + // return size of the block + return work_group_size; +} + +template<class KeyIterator, class ValueIterator, class Compare> +inline size_t block_sort(KeyIterator keys_first, + ValueIterator values_first, + Compare compare, + const size_t count, + const bool sort_by_key, + const bool stable, + command_queue &queue) +{ + if(stable) { + // TODO: Implement stable block sort (stable odd-even merge sort) + return size_t(1); + } + return bitonic_block_sort( + keys_first, values_first, + compare, count, + sort_by_key, queue + ); +} + +/// space: O(n + m); n - number of keys, m - number of values +template<class KeyIterator, class ValueIterator, class Compare> +inline void merge_blocks_on_gpu(KeyIterator keys_first, + ValueIterator values_first, + KeyIterator out_keys_first, + ValueIterator out_values_first, + Compare compare, + const size_t 
count, + const size_t block_size, + const bool sort_by_key, + command_queue &queue) +{ + typedef typename std::iterator_traits<KeyIterator>::value_type key_type; + typedef typename std::iterator_traits<ValueIterator>::value_type value_type; + + meta_kernel k("merge_blocks"); + size_t count_arg = k.add_arg<const uint_>("count"); + size_t block_size_arg = k.add_arg<const uint_>("block_size"); + + k << + // get global id + k.decl<const uint_>("gid") << " = get_global_id(0);\n" << + "if(gid >= count) {\n" << + "return;\n" << + "}\n" << + + k.decl<const key_type>("my_key") << " = " << + keys_first[k.var<const uint_>("gid")] << ";\n"; + + if(sort_by_key) { + k << + k.decl<const value_type>("my_value") << " = " << + values_first[k.var<const uint_>("gid")] << ";\n"; + } + + k << + // get my block idx + k.decl<const uint_>("my_block_idx") << " = gid / block_size;\n" << + k.decl<const bool>("my_block_idx_is_odd") << " = " << + "my_block_idx & 0x1;\n" << + + k.decl<const uint_>("other_block_idx") << " = " << + // if(my_block_idx is odd) {} else {} + "my_block_idx_is_odd ? my_block_idx - 1 : my_block_idx + 1;\n" << + + // get ranges of my block and the other block + // [my_block_start; my_block_end) + // [other_block_start; other_block_end) + k.decl<const uint_>("my_block_start") << " = " << + "min(my_block_idx * block_size, count);\n" << // including + k.decl<const uint_>("my_block_end") << " = " << + "min((my_block_idx + 1) * block_size, count);\n" << // excluding + + k.decl<const uint_>("other_block_start") << " = " << + "min(other_block_idx * block_size, count);\n" << // including + k.decl<const uint_>("other_block_end") << " = " << + "min((other_block_idx + 1) * block_size, count);\n" << // excluding + + // other block is empty, nothing to merge here + "if(other_block_start == count){\n" << + out_keys_first[k.var<uint_>("gid")] << " = my_key;\n"; + if(sort_by_key) { + k << + out_values_first[k.var<uint_>("gid")] << " = my_value;\n"; + } + + k << + "return;\n" << + "}\n" << + + // lower bound + // left_idx - lower bound + k.decl<uint_>("left_idx") << " = other_block_start;\n" << + k.decl<uint_>("right_idx") << " = other_block_end;\n" << + "while(left_idx < right_idx) {\n" << + k.decl<uint_>("mid_idx") << " = (left_idx + right_idx) / 2;\n" << + k.decl<key_type>("mid_key") << " = " << + keys_first[k.var<const uint_>("mid_idx")] << ";\n" << + k.decl<bool>("smaller") << " = " << + compare(k.var<key_type>("mid_key"), + k.var<key_type>("my_key")) << ";\n" << + "left_idx = smaller ? mid_idx + 1 : left_idx;\n" << + "right_idx = smaller ? 
right_idx : mid_idx;\n" << + "}\n" << + // left_idx is found position in other block + + // if my_block is odd we need to get the upper bound + "right_idx = other_block_end;\n" << + "if(my_block_idx_is_odd && left_idx != right_idx) {\n" << + k.decl<key_type>("upper_key") << " = " << + keys_first[k.var<const uint_>("left_idx")] << ";\n" << + "while(" << + "!(" << compare(k.var<key_type>("upper_key"), + k.var<key_type>("my_key")) << + ") && " << + "!(" << compare(k.var<key_type>("my_key"), + k.var<key_type>("upper_key")) << + ") && " << + "left_idx < right_idx" << + ")" << + "{\n" << + k.decl<uint_>("mid_idx") << " = (left_idx + right_idx) / 2;\n" << + k.decl<key_type>("mid_key") << " = " << + keys_first[k.var<const uint_>("mid_idx")] << ";\n" << + k.decl<bool>("equal") << " = " << + "!(" << compare(k.var<key_type>("mid_key"), + k.var<key_type>("my_key")) << + ") && " << + "!(" << compare(k.var<key_type>("my_key"), + k.var<key_type>("mid_key")) << + ");\n" << + "left_idx = equal ? mid_idx + 1 : left_idx + 1;\n" << + "right_idx = equal ? right_idx : mid_idx;\n" << + "upper_key = equal ? upper_key : " << + keys_first[k.var<const uint_>("left_idx")] << ";\n" << + "}\n" << + "}\n" << + + k.decl<uint_>("offset") << " = 0;\n" << + "offset += gid - my_block_start;\n" << + "offset += left_idx - other_block_start;\n" << + "offset += min(my_block_start, other_block_start);\n" << + out_keys_first[k.var<uint_>("offset")] << " = my_key;\n"; + if(sort_by_key) { + k << + out_values_first[k.var<uint_>("offset")] << " = my_value;\n"; + } + + const context &context = queue.get_context(); + ::boost::compute::kernel kernel = k.compile(context); + + const size_t work_group_size = (std::min)( + size_t(256), + kernel.get_work_group_info<size_t>( + queue.get_device(), CL_KERNEL_WORK_GROUP_SIZE + ) + ); + const size_t global_size = + work_group_size * static_cast<size_t>( + std::ceil(float(count) / work_group_size) + ); + + kernel.set_arg(count_arg, static_cast<uint_>(count)); + kernel.set_arg(block_size_arg, static_cast<uint_>(block_size)); + queue.enqueue_1d_range_kernel(kernel, 0, global_size, work_group_size); +} + +template<class KeyIterator, class ValueIterator, class Compare> +inline void merge_sort_by_key_on_gpu(KeyIterator keys_first, + KeyIterator keys_last, + ValueIterator values_first, + Compare compare, + bool stable, + command_queue &queue) +{ + typedef typename std::iterator_traits<KeyIterator>::value_type key_type; + typedef typename std::iterator_traits<ValueIterator>::value_type value_type; + + size_t count = iterator_range_size(keys_first, keys_last); + if(count < 2){ + return; + } + + size_t block_size = + block_sort( + keys_first, values_first, + compare, count, + true /* sort_by_key */, stable /* stable */, + queue + ); + + // for small input size only block sort is performed + if(count <= block_size) { + return; + } + + const context &context = queue.get_context(); + + bool result_in_temporary_buffer = false; + ::boost::compute::vector<key_type> temp_keys(count, context); + ::boost::compute::vector<value_type> temp_values(count, context); + + for(; block_size < count; block_size *= 2) { + result_in_temporary_buffer = !result_in_temporary_buffer; + if(result_in_temporary_buffer) { + merge_blocks_on_gpu(keys_first, values_first, + temp_keys.begin(), temp_values.begin(), + compare, count, block_size, + true /* sort_by_key */, queue); + } else { + merge_blocks_on_gpu(temp_keys.begin(), temp_values.begin(), + keys_first, values_first, + compare, count, block_size, + true /* sort_by_key */, queue); 
+
+template<class Iterator, class Compare>
+inline void merge_sort_on_gpu(Iterator first,
+                              Iterator last,
+                              Compare compare,
+                              bool stable,
+                              command_queue &queue)
+{
+    typedef typename std::iterator_traits<Iterator>::value_type key_type;
+
+    size_t count = iterator_range_size(first, last);
+    if(count < 2){
+        return;
+    }
+
+    Iterator dummy;
+    size_t block_size =
+        block_sort(
+            first, dummy,
+            compare, count,
+            false /* sort_by_key */, stable /* stable */,
+            queue
+        );
+
+    // for small inputs only the block sort is performed
+    if(count <= block_size) {
+        return;
+    }
+
+    const context &context = queue.get_context();
+
+    bool result_in_temporary_buffer = false;
+    ::boost::compute::vector<key_type> temp_keys(count, context);
+
+    for(; block_size < count; block_size *= 2) {
+        result_in_temporary_buffer = !result_in_temporary_buffer;
+        if(result_in_temporary_buffer) {
+            merge_blocks_on_gpu(first, dummy, temp_keys.begin(), dummy,
+                                compare, count, block_size,
+                                false /* sort_by_key */, queue);
+        } else {
+            merge_blocks_on_gpu(temp_keys.begin(), dummy, first, dummy,
+                                compare, count, block_size,
+                                false /* sort_by_key */, queue);
+        }
+    }
+
+    if(result_in_temporary_buffer) {
+        copy_async(temp_keys.begin(), temp_keys.end(), first, queue);
+    }
+}
+
+template<class KeyIterator, class ValueIterator, class Compare>
+inline void merge_sort_by_key_on_gpu(KeyIterator keys_first,
+                                     KeyIterator keys_last,
+                                     ValueIterator values_first,
+                                     Compare compare,
+                                     command_queue &queue)
+{
+    merge_sort_by_key_on_gpu(
+        keys_first, keys_last, values_first,
+        compare, false /* not stable */, queue
+    );
+}
+
+template<class Iterator, class Compare>
+inline void merge_sort_on_gpu(Iterator first,
+                              Iterator last,
+                              Compare compare,
+                              command_queue &queue)
+{
+    merge_sort_on_gpu(
+        first, last, compare, false /* not stable */, queue
+    );
+}
+
+} // end detail namespace
+} // end compute namespace
+} // end boost namespace
+
+#endif /* BOOST_COMPUTE_ALGORITHM_DETAIL_MERGE_SORT_ON_GPU_HPP_ */
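As the sort.hpp, sort_by_key.hpp and stable_sort.hpp hunks further below show, these helpers become the GPU path for ranges longer than 32 elements whenever a custom comparator rules out radix sort. A hedged sketch of how that path is reached through the public API (the comparator and element type are made up for illustration):

    #include <boost/compute/algorithm/sort.hpp>
    #include <boost/compute/command_queue.hpp>
    #include <boost/compute/container/vector.hpp>
    #include <boost/compute/function.hpp>

    namespace compute = boost::compute;

    // A custom comparator takes the generic dispatch path, which now calls
    // detail::merge_sort_on_gpu() instead of serial insertion sort.
    BOOST_COMPUTE_FUNCTION(bool, abs_less, (float a, float b),
    {
        return fabs(a) < fabs(b);
    });

    void sort_by_magnitude(compute::vector<float> &vec,
                           compute::command_queue &queue)
    {
        compute::sort(vec.begin(), vec.end(), abs_less, queue);
    }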
diff --git a/boost/compute/algorithm/detail/radix_sort.hpp b/boost/compute/algorithm/detail/radix_sort.hpp
index c2ba4ed17c..8e6d5f9c0a 100644
--- a/boost/compute/algorithm/detail/radix_sort.hpp
+++ b/boost/compute/algorithm/detail/radix_sort.hpp
@@ -92,6 +92,8 @@ const char radix_sort_source[] =
 "#define RADIX_MASK ((((T)(1)) << K_BITS) - 1)\n"
 "#define SIGN_BIT ((sizeof(T) * CHAR_BIT) - 1)\n"
 
+"#if defined(ASC)\n" // asc order
+
 "inline uint radix(const T x, const uint low_bit)\n"
 "{\n"
 "#if defined(IS_FLOATING_POINT)\n"
@@ -104,6 +106,25 @@ const char radix_sort_source[] =
 "#endif\n"
 "}\n"
 
+"#else\n" // desc order
+
+// For signed types we simply negate x; for unsigned types we subtract
+// x from the maximum value of its type ((T)(-1) is the maximum value
+// of type T when T is an unsigned type).
+"inline uint radix(const T x, const uint low_bit)\n"
+"{\n"
+"#if defined(IS_FLOATING_POINT)\n"
+"    const T mask = -(x >> SIGN_BIT) | (((T)(1)) << SIGN_BIT);\n"
+"    return (((-x) ^ mask) >> low_bit) & RADIX_MASK;\n"
+"#elif defined(IS_SIGNED)\n"
+"    return (((-x) ^ (((T)(1)) << SIGN_BIT)) >> low_bit) & RADIX_MASK;\n"
+"#else\n"
+"    return (((T)(-1) - x) >> low_bit) & RADIX_MASK;\n"
+"#endif\n"
+"}\n"
+
+"#endif\n" // #if defined(ASC)
+
 "__kernel void count(__global const T *input,\n"
 "                    const uint input_offset,\n"
 "                    const uint input_size,\n"
@@ -227,6 +248,7 @@ template<class T, class T2>
 inline void radix_sort_impl(const buffer_iterator<T> first,
                             const buffer_iterator<T> last,
                             const buffer_iterator<T2> values_first,
+                            const bool ascending,
                             command_queue &queue)
 {
@@ -279,6 +301,10 @@ inline void radix_sort_impl(const buffer_iterator<T> first,
         options << enable_double<T2>();
     }
 
+    if(ascending){
+        options << " -DASC";
+    }
+
     // load radix sort program
     program radix_sort_program = cache->get_or_build(
         cache_key, options.str(), radix_sort_source, context
@@ -396,18 +422,38 @@ inline void radix_sort(Iterator first,
                        Iterator last,
                        command_queue &queue)
 {
-    radix_sort_impl(first, last, buffer_iterator<int>(), queue);
+    radix_sort_impl(first, last, buffer_iterator<int>(), true, queue);
+}
+
+template<class KeyIterator, class ValueIterator>
+inline void radix_sort_by_key(KeyIterator keys_first,
+                              KeyIterator keys_last,
+                              ValueIterator values_first,
+                              command_queue &queue)
+{
+    radix_sort_impl(keys_first, keys_last, values_first, true, queue);
+}
+
+template<class Iterator>
+inline void radix_sort(Iterator first,
+                       Iterator last,
+                       const bool ascending,
+                       command_queue &queue)
+{
+    radix_sort_impl(first, last, buffer_iterator<int>(), ascending, queue);
 }
 
 template<class KeyIterator, class ValueIterator>
 inline void radix_sort_by_key(KeyIterator keys_first,
                               KeyIterator keys_last,
                               ValueIterator values_first,
+                              const bool ascending,
                               command_queue &queue)
 {
-    radix_sort_impl(keys_first, keys_last, values_first, queue);
+    radix_sort_impl(keys_first, keys_last, values_first, ascending, queue);
 }
 
+
 } // end detail namespace
 } // end compute namespace
 } // end boost namespace
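With the ASC switch in place, a descending sort now compiles the radix kernels without -DASC and finishes in a single pass; the sort.hpp and stable_sort.hpp hunks below drop their sort-then-reverse workaround accordingly. A small sketch under those assumptions:

    #include <boost/compute/algorithm/sort.hpp>
    #include <boost/compute/command_queue.hpp>
    #include <boost/compute/container/vector.hpp>
    #include <boost/compute/functional.hpp>

    namespace compute = boost::compute;

    // Descending sort of a radix-sortable key type; on GPU devices this now
    // maps to detail::radix_sort(first, last, /* ascending */ false, queue)
    // rather than an ascending radix sort followed by reverse().
    void sort_descending(compute::vector<int> &keys,
                         compute::command_queue &queue)
    {
        compute::sort(keys.begin(), keys.end(), compute::greater<int>(), queue);
    }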
diff --git a/boost/compute/algorithm/detail/reduce_on_cpu.hpp b/boost/compute/algorithm/detail/reduce_on_cpu.hpp
new file mode 100644
index 0000000000..490d7d538e
--- /dev/null
+++ b/boost/compute/algorithm/detail/reduce_on_cpu.hpp
@@ -0,0 +1,110 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2016 Jakub Szuppe <j.szuppe@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_REDUCE_ON_CPU_HPP
+#define BOOST_COMPUTE_ALGORITHM_DETAIL_REDUCE_ON_CPU_HPP
+
+#include <algorithm>
+
+#include <boost/compute/buffer.hpp>
+#include <boost/compute/command_queue.hpp>
+#include <boost/compute/detail/meta_kernel.hpp>
+#include <boost/compute/detail/iterator_range_size.hpp>
+#include <boost/compute/detail/parameter_cache.hpp>
+#include <boost/compute/iterator/buffer_iterator.hpp>
+#include <boost/compute/type_traits/result_of.hpp>
+#include <boost/compute/algorithm/detail/serial_reduce.hpp>
+
+namespace boost {
+namespace compute {
+namespace detail {
+
+template<class InputIterator, class OutputIterator, class BinaryFunction>
+inline void reduce_on_cpu(InputIterator first,
+                          InputIterator last,
+                          OutputIterator result,
+                          BinaryFunction function,
+                          command_queue &queue)
+{
+    typedef typename
+        std::iterator_traits<InputIterator>::value_type T;
+    typedef typename
+        ::boost::compute::result_of<BinaryFunction(T, T)>::type result_type;
+
+    const device &device = queue.get_device();
+    const uint_ compute_units = queue.get_device().compute_units();
+
+    boost::shared_ptr<parameter_cache> parameters =
+        detail::parameter_cache::get_global_cache(device);
+
+    std::string cache_key =
+        "__boost_reduce_cpu_" + boost::lexical_cast<std::string>(sizeof(T));
+
+    // for inputs smaller than serial_reduce_threshold the
+    // serial_reduce algorithm is used
+    uint_ serial_reduce_threshold =
+        parameters->get(cache_key, "serial_reduce_threshold", 16384 * sizeof(T));
+    serial_reduce_threshold =
+        (std::max)(serial_reduce_threshold, uint_(compute_units));
+
+    const context &context = queue.get_context();
+    size_t count = detail::iterator_range_size(first, last);
+    if(count == 0){
+        return;
+    }
+    else if(count < serial_reduce_threshold) {
+        return serial_reduce(first, last, result, function, queue);
+    }
+
+    meta_kernel k("reduce_on_cpu");
+    buffer output(context, sizeof(result_type) * compute_units);
+
+    size_t count_arg = k.add_arg<uint_>("count");
+    size_t output_arg =
+        k.add_arg<result_type *>(memory_object::global_memory, "output");
+
+    k <<
+        "uint block = " <<
+            "(uint)ceil(((float)count)/get_global_size(0));\n" <<
+        "uint index = get_global_id(0) * block;\n" <<
+        "uint end = min(count, index + block);\n" <<
+
+        k.decl<result_type>("result") << " = " << first[k.var<uint_>("index")] << ";\n" <<
+        "index++;\n" <<
+        "while(index < end){\n" <<
+            "result = " << function(k.var<T>("result"),
+                                    first[k.var<uint_>("index")]) << ";\n" <<
+            "index++;\n" <<
+        "}\n" <<
+        "output[get_global_id(0)] = result;\n";
+
+    size_t global_work_size = compute_units;
+    kernel kernel = k.compile(context);
+
+    // reduction to global_work_size elements
+    kernel.set_arg(count_arg, static_cast<uint_>(count));
+    kernel.set_arg(output_arg, output);
+    queue.enqueue_1d_range_kernel(kernel, 0, global_work_size, 0);
+
+    // final reduction
+    reduce_on_cpu(
+        make_buffer_iterator<result_type>(output),
+        make_buffer_iterator<result_type>(output, global_work_size),
+        result,
+        function,
+        queue
+    );
+}
+
+} // end detail namespace
+} // end compute namespace
+} // end boost namespace
+
+#endif // BOOST_COMPUTE_ALGORITHM_DETAIL_REDUCE_ON_CPU_HPP
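reduce_on_cpu() makes one contiguous block per compute unit, reduces each block in a single work-item, and then calls itself on the per-block partials, which fall under serial_reduce_threshold and finish with serial_reduce(). For example, with count = 1,000,000 on an 8-compute-unit CPU each work-item reduces 125,000 elements and the recursive call combines the 8 partial results serially. Through the public API this path is taken automatically on CPU devices; a hedged usage sketch:

    #include <boost/compute/algorithm/reduce.hpp>
    #include <boost/compute/command_queue.hpp>
    #include <boost/compute/container/vector.hpp>
    #include <boost/compute/functional.hpp>

    namespace compute = boost::compute;

    // On a CPU device reduce() now dispatches to detail::reduce_on_cpu()
    // (see the reduce.hpp hunk further below).
    int sum_on_device(const compute::vector<int> &vec,
                      compute::command_queue &queue)
    {
        int sum = 0;
        compute::reduce(vec.begin(), vec.end(), &sum, compute::plus<int>(), queue);
        return sum;
    }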
diff --git a/boost/compute/algorithm/detail/scan_on_cpu.hpp b/boost/compute/algorithm/detail/scan_on_cpu.hpp
index 6611c0ba3e..d81117c65f 100644
--- a/boost/compute/algorithm/detail/scan_on_cpu.hpp
+++ b/boost/compute/algorithm/detail/scan_on_cpu.hpp
@@ -1,5 +1,5 @@
 //---------------------------------------------------------------------------//
-// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
+// Copyright (c) 2016 Jakub Szuppe <j.szuppe@gmail.com>
 //
 // Distributed under the Boost Software License, Version 1.0
 // See accompanying file LICENSE_1_0.txt or copy at
@@ -16,8 +16,10 @@
 #include <boost/compute/device.hpp>
 #include <boost/compute/kernel.hpp>
 #include <boost/compute/command_queue.hpp>
+#include <boost/compute/algorithm/detail/serial_scan.hpp>
 #include <boost/compute/detail/meta_kernel.hpp>
 #include <boost/compute/detail/iterator_range_size.hpp>
+#include <boost/compute/detail/parameter_cache.hpp>
 
 namespace boost {
 namespace compute {
@@ -32,68 +34,170 @@ inline OutputIterator scan_on_cpu(InputIterator first,
                                   BinaryOperator op,
                                   command_queue &queue)
 {
-    if(first == last){
-        return result;
-    }
-
     typedef typename
         std::iterator_traits<InputIterator>::value_type input_type;
     typedef typename
         std::iterator_traits<OutputIterator>::value_type output_type;
 
     const context &context = queue.get_context();
+    const device &device = queue.get_device();
+    const size_t compute_units = queue.get_device().compute_units();
+
+    boost::shared_ptr<parameter_cache> parameters =
+        detail::parameter_cache::get_global_cache(device);
+
+    std::string cache_key =
+        "__boost_scan_cpu_" + boost::lexical_cast<std::string>(sizeof(T));
+
+    // for inputs smaller than serial_scan_threshold the
+    // serial_scan algorithm is used
+    uint_ serial_scan_threshold =
+        parameters->get(cache_key, "serial_scan_threshold", 16384 * sizeof(T));
+    serial_scan_threshold =
+        (std::max)(serial_scan_threshold, uint_(compute_units));
+
+    size_t count = detail::iterator_range_size(first, last);
+    if(count == 0){
+        return result;
+    }
+    else if(count < serial_scan_threshold) {
+        return serial_scan(first, last, result, exclusive, init, op, queue);
+    }
+
+    buffer block_partial_sums(context, sizeof(output_type) * compute_units);
 
     // create scan kernel
-    meta_kernel k("scan_on_cpu");
+    meta_kernel k("scan_on_cpu_block_scan");
 
     // Arguments
-    size_t n_arg = k.add_arg<ulong_>("n");
+    size_t count_arg = k.add_arg<uint_>("count");
     size_t init_arg = k.add_arg<output_type>("initial_value");
+    size_t block_partial_sums_arg =
+        k.add_arg<output_type *>(memory_object::global_memory, "block_partial_sums");
+
+    k <<
+        "uint block = " <<
+            "(uint)ceil(((float)count)/(get_global_size(0) + 1));\n" <<
+        "uint index = get_global_id(0) * block;\n" <<
+        "uint end = min(count, index + block);\n";
 
     if(!exclusive){
         k <<
-            k.decl<const ulong_>("start_idx") << " = 1;\n" <<
-            k.decl<output_type>("sum") << " = " << first[0] << ";\n" <<
-            result[0] << " = sum;\n";
+            k.decl<output_type>("sum") << " = " <<
+                first[k.var<uint_>("index")] << ";\n" <<
+            result[k.var<uint_>("index")] << " = sum;\n" <<
+            "index++;\n";
     }
     else {
         k <<
-            k.decl<const ulong_>("start_idx") << " = 0;\n" <<
-            k.decl<output_type>("sum") << " = initial_value;\n";
+            k.decl<output_type>("sum") << ";\n" <<
+            "if(index == 0){\n" <<
+                "sum = initial_value;\n" <<
+            "}\n" <<
+            "else {\n" <<
+                "sum = " << first[k.var<uint_>("index")] << ";\n" <<
+                "index++;\n" <<
+            "}\n";
    }
 
     k <<
-        "for(ulong i = start_idx; i < n; i++){\n" <<
-        k.decl<const input_type>("x") << " = "
-            << first[k.var<ulong_>("i")] << ";\n";
+        "while(index < end){\n" <<
+        // load next value
+        k.decl<const input_type>("value") << " = "
+            << first[k.var<uint_>("index")] << ";\n";
 
     if(exclusive){
-        k << result[k.var<ulong_>("i")] << " = sum;\n";
+        k <<
+            "if(get_global_id(0) == 0){\n" <<
+            result[k.var<uint_>("index")] << " = sum;\n" <<
+            "}\n";
     }
-
-    k << " sum = "
-      << op(k.var<output_type>("sum"), k.var<output_type>("x"))
-      << ";\n";
+    k <<
+        "sum = " << op(k.var<output_type>("sum"),
+                       k.var<output_type>("value")) << ";\n";
 
     if(!exclusive){
-        k << result[k.var<ulong_>("i")] << " = sum;\n";
+        k <<
+            "if(get_global_id(0) == 0){\n" <<
+            result[k.var<uint_>("index")] << " = sum;\n" <<
+            "}\n";
+    }
+
+    k <<
+        "index++;\n" <<
+        "}\n" << // end while
+        "block_partial_sums[get_global_id(0)] = sum;\n";
+
+    // compile scan kernel
+    kernel block_scan_kernel = k.compile(context);
+
+    // setup kernel arguments
+    block_scan_kernel.set_arg(count_arg, static_cast<uint_>(count));
+    block_scan_kernel.set_arg(init_arg, static_cast<output_type>(init));
+    block_scan_kernel.set_arg(block_partial_sums_arg, block_partial_sums);
+
+    // execute the kernel
+    size_t global_work_size = compute_units;
+    queue.enqueue_1d_range_kernel(block_scan_kernel, 0, global_work_size, 0);
+
+    // with a single compute unit the block scan already produced the final scan
+    if(compute_units < 2) {
+        return result + count;
+    }
+
+    // final scan kernel
+    meta_kernel l("scan_on_cpu_final_scan");
+
+    // Arguments
+    count_arg = l.add_arg<uint_>("count");
+    block_partial_sums_arg =
+        l.add_arg<output_type *>(memory_object::global_memory, "block_partial_sums");
+
+    l <<
+        "uint block = " <<
+            "(uint)ceil(((float)count)/(get_global_size(0) + 1));\n" <<
+        "uint index = block + get_global_id(0) * block;\n" <<
+        "uint end = min(count, index + block);\n" <<
+
+        l.decl<output_type>("sum") << " = block_partial_sums[0];\n" <<
+        "for(uint i = 0; i < get_global_id(0); i++) {\n" <<
+            "sum = " << op(l.var<output_type>("sum"),
+                           l.var<output_type>("block_partial_sums[i + 1]")) << ";\n" <<
+        "}\n" <<
+
+        "while(index < end){\n";
+    if(exclusive){
+        l <<
+            l.decl<output_type>("value") << " = "
+                << first[l.var<uint_>("index")] << ";\n" <<
+            result[l.var<uint_>("index")] << " = sum;\n" <<
+            "sum = " << op(l.var<output_type>("sum"),
+                           l.var<output_type>("value")) << ";\n";
+    }
+    else {
+        l <<
+            "sum = " << op(l.var<output_type>("sum"),
+                           first[l.var<uint_>("index")]) << ";\n" <<
+            result[l.var<uint_>("index")] << " = sum;\n";
     }
+    l <<
+        "index++;\n" <<
+        "}\n";
 
-    k << "}\n";
-
     // compile scan kernel
-    kernel scan_kernel = k.compile(context);
+    kernel final_scan_kernel = l.compile(context);
 
     // setup kernel arguments
-    size_t n = detail::iterator_range_size(first, last);
-    scan_kernel.set_arg<ulong_>(n_arg, n);
-    scan_kernel.set_arg<output_type>(init_arg, static_cast<output_type>(init));
+    final_scan_kernel.set_arg(count_arg, static_cast<uint_>(count));
+    final_scan_kernel.set_arg(block_partial_sums_arg, block_partial_sums);
 
     // execute the kernel
-    queue.enqueue_1d_range_kernel(scan_kernel, 0, 1, 1);
+    global_work_size = compute_units;
+    queue.enqueue_1d_range_kernel(final_scan_kernel, 0, global_work_size, 0);
 
     // return iterator pointing to the end of the result range
-    return result + n;
+    return result + count;
 }
 
 } // end detail namespace
diff --git a/boost/compute/algorithm/detail/scan_on_gpu.hpp b/boost/compute/algorithm/detail/scan_on_gpu.hpp
index 07c6d6d3c0..dcaff2cdf3 100644
--- a/boost/compute/algorithm/detail/scan_on_gpu.hpp
+++ b/boost/compute/algorithm/detail/scan_on_gpu.hpp
@@ -14,7 +14,6 @@
 #include <boost/compute/kernel.hpp>
 #include <boost/compute/detail/meta_kernel.hpp>
 #include <boost/compute/command_queue.hpp>
-#include <boost/compute/algorithm/detail/scan_on_cpu.hpp>
 #include <boost/compute/container/vector.hpp>
 #include <boost/compute/detail/iterator_range_size.hpp>
 #include
<boost/compute/memory/local_buffer.hpp> diff --git a/boost/compute/algorithm/detail/search_all.hpp b/boost/compute/algorithm/detail/search_all.hpp index a874bcdebe..b091ec7de5 100644 --- a/boost/compute/algorithm/detail/search_all.hpp +++ b/boost/compute/algorithm/detail/search_all.hpp @@ -48,7 +48,7 @@ public: *this << "uint i = get_global_id(0);\n" << - "uint i1 = i;\n" << + "const uint i1 = i;\n" << "uint j;\n" << "for(j = 0; j<p_count; j++,i++)\n" << "{\n" << diff --git a/boost/compute/algorithm/detail/serial_scan.hpp b/boost/compute/algorithm/detail/serial_scan.hpp new file mode 100644 index 0000000000..7590fd94fb --- /dev/null +++ b/boost/compute/algorithm/detail/serial_scan.hpp @@ -0,0 +1,103 @@ +//---------------------------------------------------------------------------// +// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com> +// +// Distributed under the Boost Software License, Version 1.0 +// See accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt +// +// See http://boostorg.github.com/compute for more information. +//---------------------------------------------------------------------------// + +#ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_SERIAL_SCAN_HPP +#define BOOST_COMPUTE_ALGORITHM_DETAIL_SERIAL_SCAN_HPP + +#include <iterator> + +#include <boost/compute/device.hpp> +#include <boost/compute/kernel.hpp> +#include <boost/compute/command_queue.hpp> +#include <boost/compute/detail/meta_kernel.hpp> +#include <boost/compute/detail/iterator_range_size.hpp> + +namespace boost { +namespace compute { +namespace detail { + +template<class InputIterator, class OutputIterator, class T, class BinaryOperator> +inline OutputIterator serial_scan(InputIterator first, + InputIterator last, + OutputIterator result, + bool exclusive, + T init, + BinaryOperator op, + command_queue &queue) +{ + if(first == last){ + return result; + } + + typedef typename + std::iterator_traits<InputIterator>::value_type input_type; + typedef typename + std::iterator_traits<OutputIterator>::value_type output_type; + + const context &context = queue.get_context(); + + // create scan kernel + meta_kernel k("serial_scan"); + + // Arguments + size_t n_arg = k.add_arg<ulong_>("n"); + size_t init_arg = k.add_arg<output_type>("initial_value"); + + if(!exclusive){ + k << + k.decl<const ulong_>("start_idx") << " = 1;\n" << + k.decl<output_type>("sum") << " = " << first[0] << ";\n" << + result[0] << " = sum;\n"; + } + else { + k << + k.decl<const ulong_>("start_idx") << " = 0;\n" << + k.decl<output_type>("sum") << " = initial_value;\n"; + } + + k << + "for(ulong i = start_idx; i < n; i++){\n" << + k.decl<const input_type>("x") << " = " + << first[k.var<ulong_>("i")] << ";\n"; + + if(exclusive){ + k << result[k.var<ulong_>("i")] << " = sum;\n"; + } + + k << " sum = " + << op(k.var<output_type>("sum"), k.var<output_type>("x")) + << ";\n"; + + if(!exclusive){ + k << result[k.var<ulong_>("i")] << " = sum;\n"; + } + + k << "}\n"; + + // compile scan kernel + kernel scan_kernel = k.compile(context); + + // setup kernel arguments + size_t n = detail::iterator_range_size(first, last); + scan_kernel.set_arg<ulong_>(n_arg, n); + scan_kernel.set_arg<output_type>(init_arg, static_cast<output_type>(init)); + + // execute the kernel + queue.enqueue_1d_range_kernel(scan_kernel, 0, 1, 1); + + // return iterator pointing to the end of the result range + return result + n; +} + +} // end detail namespace +} // end compute namespace +} // end boost namespace + +#endif // 
BOOST_COMPUTE_ALGORITHM_DETAIL_SERIAL_SCAN_HPP diff --git a/boost/compute/algorithm/find_end.hpp b/boost/compute/algorithm/find_end.hpp index 5c40055113..265a1da542 100644 --- a/boost/compute/algorithm/find_end.hpp +++ b/boost/compute/algorithm/find_end.hpp @@ -36,6 +36,7 @@ inline InputIterator find_end_helper(InputIterator first, command_queue &queue) { typedef typename std::iterator_traits<InputIterator>::value_type value_type; + typedef typename std::iterator_traits<InputIterator>::difference_type difference_type; size_t count = detail::iterator_range_size(first, last); if(count == 0){ @@ -65,8 +66,13 @@ inline InputIterator find_end_helper(InputIterator first, queue.enqueue_1d_range_kernel(kernel, 0, count, 0); int result = static_cast<int>(index.read(queue)); - if(result == -1) return last; - else return first + result; + + if(result == -1){ + return last; + } + else { + return first + static_cast<difference_type>(result); + } } } // end detail namespace @@ -92,8 +98,13 @@ inline TextIterator find_end(TextIterator t_first, command_queue &queue = system::default_queue()) { const context &context = queue.get_context(); - vector<uint_> matching_indices(detail::iterator_range_size(t_first, t_last), - context); + + // there is no need to check if pattern starts at last n - 1 indices + vector<uint_> matching_indices( + detail::iterator_range_size(t_first, t_last) + + 1 - detail::iterator_range_size(p_first, p_last), + context + ); detail::search_kernel<PatternIterator, TextIterator, @@ -105,10 +116,16 @@ inline TextIterator find_end(TextIterator t_first, using boost::compute::_1; vector<uint_>::iterator index = - detail::find_end_helper(matching_indices.begin(), - matching_indices.end(), - _1 == 1, - queue); + detail::find_end_helper( + matching_indices.begin(), + matching_indices.end(), + _1 == 1, + queue + ); + + // pattern was not found + if(index == matching_indices.end()) + return t_last; return t_first + detail::iterator_range_size(matching_indices.begin(), index); } diff --git a/boost/compute/algorithm/gather.hpp b/boost/compute/algorithm/gather.hpp index b2f725d54e..24c5c727ae 100644 --- a/boost/compute/algorithm/gather.hpp +++ b/boost/compute/algorithm/gather.hpp @@ -36,7 +36,6 @@ public: OutputIterator result) { m_count = iterator_range_size(first, last); - m_offset = first.get_index(); *this << "const uint i = get_global_id(0);\n" << @@ -50,12 +49,11 @@ public: return event(); } - return exec_1d(queue, m_offset, m_count); + return exec_1d(queue, 0, m_count); } private: size_t m_count; - size_t m_offset; }; } // end detail namespace diff --git a/boost/compute/algorithm/minmax_element.hpp b/boost/compute/algorithm/minmax_element.hpp index bf32c3c989..3f44c09eaf 100644 --- a/boost/compute/algorithm/minmax_element.hpp +++ b/boost/compute/algorithm/minmax_element.hpp @@ -49,7 +49,7 @@ minmax_element(InputIterator first, } ///\overload -template<class InputIterator, class Compare> +template<class InputIterator> inline std::pair<InputIterator, InputIterator> minmax_element(InputIterator first, InputIterator last, diff --git a/boost/compute/algorithm/reduce.hpp b/boost/compute/algorithm/reduce.hpp index 79624a0e50..19d070019f 100644 --- a/boost/compute/algorithm/reduce.hpp +++ b/boost/compute/algorithm/reduce.hpp @@ -22,7 +22,7 @@ #include <boost/compute/algorithm/copy_n.hpp> #include <boost/compute/algorithm/detail/inplace_reduce.hpp> #include <boost/compute/algorithm/detail/reduce_on_gpu.hpp> -#include <boost/compute/algorithm/detail/serial_reduce.hpp> +#include 
<boost/compute/algorithm/detail/reduce_on_cpu.hpp> #include <boost/compute/detail/iterator_range_size.hpp> #include <boost/compute/memory/local_buffer.hpp> #include <boost/compute/type_traits/result_of.hpp> @@ -173,8 +173,8 @@ inline void generic_reduce(InputIterator first, size_t count = detail::iterator_range_size(first, last); if(device.type() & device::cpu){ - boost::compute::vector<result_type> value(1, context); - detail::serial_reduce(first, last, value.begin(), function, queue); + array<result_type, 1> value(context); + detail::reduce_on_cpu(first, last, value.begin(), function, queue); boost::compute::copy_n(value.begin(), 1, result, queue); } else { @@ -209,16 +209,16 @@ inline void dispatch_reduce(InputIterator first, const device &device = queue.get_device(); // reduce to temporary buffer on device - array<T, 1> tmp(context); + array<T, 1> value(context); if(device.type() & device::cpu){ - detail::serial_reduce(first, last, tmp.begin(), function, queue); + detail::reduce_on_cpu(first, last, value.begin(), function, queue); } else { - reduce_on_gpu(first, last, tmp.begin(), function, queue); + reduce_on_gpu(first, last, value.begin(), function, queue); } // copy to result iterator - copy_n(tmp.begin(), 1, result, queue); + copy_n(value.begin(), 1, result, queue); } template<class InputIterator, class OutputIterator, class BinaryFunction> diff --git a/boost/compute/algorithm/sort.hpp b/boost/compute/algorithm/sort.hpp index b2730b3e2b..7e0a583e3e 100644 --- a/boost/compute/algorithm/sort.hpp +++ b/boost/compute/algorithm/sort.hpp @@ -18,6 +18,7 @@ #include <boost/compute/system.hpp> #include <boost/compute/command_queue.hpp> #include <boost/compute/algorithm/detail/merge_sort_on_cpu.hpp> +#include <boost/compute/algorithm/detail/merge_sort_on_gpu.hpp> #include <boost/compute/algorithm/detail/radix_sort.hpp> #include <boost/compute/algorithm/detail/insertion_sort.hpp> #include <boost/compute/algorithm/reverse.hpp> @@ -74,11 +75,8 @@ inline void dispatch_gpu_sort(buffer_iterator<T> first, ); } else { - // radix sort in ascending order - ::boost::compute::detail::radix_sort(first, last, queue); - - // reverse range to descending order - ::boost::compute::reverse(first, last, queue); + // radix sorts in descending order + ::boost::compute::detail::radix_sort(first, last, false, queue); } } @@ -88,9 +86,22 @@ inline void dispatch_gpu_sort(Iterator first, Compare compare, command_queue &queue) { - ::boost::compute::detail::serial_insertion_sort( - first, last, compare, queue - ); + size_t count = detail::iterator_range_size(first, last); + + if(count < 2){ + // nothing to do + return; + } + else if(count <= 32){ + ::boost::compute::detail::serial_insertion_sort( + first, last, compare, queue + ); + } + else { + ::boost::compute::detail::merge_sort_on_gpu( + first, last, compare, queue + ); + } } // sort() for device iterators diff --git a/boost/compute/algorithm/sort_by_key.hpp b/boost/compute/algorithm/sort_by_key.hpp index 0e3dba81eb..c39bcf9890 100644 --- a/boost/compute/algorithm/sort_by_key.hpp +++ b/boost/compute/algorithm/sort_by_key.hpp @@ -18,6 +18,7 @@ #include <boost/compute/system.hpp> #include <boost/compute/command_queue.hpp> #include <boost/compute/algorithm/detail/merge_sort_on_cpu.hpp> +#include <boost/compute/algorithm/detail/merge_sort_on_gpu.hpp> #include <boost/compute/algorithm/detail/insertion_sort.hpp> #include <boost/compute/algorithm/detail/radix_sort.hpp> #include <boost/compute/algorithm/reverse.hpp> @@ -76,14 +77,10 @@ dispatch_gpu_sort_by_key(KeyIterator 
keys_first, ); } else { - // radix sorts in ascending order + // radix sorts in descending order detail::radix_sort_by_key( - keys_first, keys_last, values_first, queue + keys_first, keys_last, values_first, false, queue ); - - // Reverse keys, values for descending order - ::boost::compute::reverse(keys_first, keys_last, queue); - ::boost::compute::reverse(values_first, values_first + count, queue); } } @@ -94,9 +91,17 @@ inline void dispatch_gpu_sort_by_key(KeyIterator keys_first, Compare compare, command_queue &queue) { - detail::serial_insertion_sort_by_key( - keys_first, keys_last, values_first, compare, queue - ); + size_t count = detail::iterator_range_size(keys_first, keys_last); + + if(count < 32){ + detail::serial_insertion_sort_by_key( + keys_first, keys_last, values_first, compare, queue + ); + } else { + detail::merge_sort_by_key_on_gpu( + keys_first, keys_last, values_first, compare, queue + ); + } } template<class KeyIterator, class ValueIterator, class Compare> diff --git a/boost/compute/algorithm/stable_sort.hpp b/boost/compute/algorithm/stable_sort.hpp index cd82a0a606..381fc81bc0 100644 --- a/boost/compute/algorithm/stable_sort.hpp +++ b/boost/compute/algorithm/stable_sort.hpp @@ -16,10 +16,12 @@ #include <boost/compute/system.hpp> #include <boost/compute/command_queue.hpp> #include <boost/compute/algorithm/detail/merge_sort_on_cpu.hpp> +#include <boost/compute/algorithm/detail/merge_sort_on_gpu.hpp> #include <boost/compute/algorithm/detail/radix_sort.hpp> #include <boost/compute/algorithm/detail/insertion_sort.hpp> #include <boost/compute/algorithm/reverse.hpp> #include <boost/compute/functional/operator.hpp> +#include <boost/compute/detail/iterator_range_size.hpp> namespace boost { namespace compute { @@ -31,9 +33,17 @@ inline void dispatch_gpu_stable_sort(Iterator first, Compare compare, command_queue &queue) { - ::boost::compute::detail::serial_insertion_sort( - first, last, compare, queue - ); + size_t count = detail::iterator_range_size(first, last); + + if(count < 32){ + detail::serial_insertion_sort( + first, last, compare, queue + ); + } else { + detail::merge_sort_on_gpu( + first, last, compare, true /* stable */, queue + ); + } } template<class T> @@ -53,11 +63,8 @@ dispatch_gpu_stable_sort(buffer_iterator<T> first, greater<T>, command_queue &queue) { - // radix sort in ascending order - ::boost::compute::detail::radix_sort(first, last, queue); - - // reverse range to descending order - ::boost::compute::reverse(first, last, queue); + // radix sorts in descending order + ::boost::compute::detail::radix_sort(first, last, false, queue); } } // end detail namespace @@ -76,6 +83,7 @@ inline void stable_sort(Iterator first, ::boost::compute::detail::dispatch_gpu_stable_sort( first, last, compare, queue ); + return; } ::boost::compute::detail::merge_sort_on_cpu(first, last, compare, queue); } diff --git a/boost/compute/algorithm/stable_sort_by_key.hpp b/boost/compute/algorithm/stable_sort_by_key.hpp index 8a51372ede..878f999f44 100644 --- a/boost/compute/algorithm/stable_sort_by_key.hpp +++ b/boost/compute/algorithm/stable_sort_by_key.hpp @@ -21,6 +21,105 @@ namespace boost { namespace compute { +namespace detail { + +template<class KeyIterator, class ValueIterator> +inline void +dispatch_gpu_ssort_by_key(KeyIterator keys_first, + KeyIterator keys_last, + ValueIterator values_first, + less<typename std::iterator_traits<KeyIterator>::value_type> compare, + command_queue &queue, + typename boost::enable_if_c< + is_radix_sortable< + typename 
std::iterator_traits<KeyIterator>::value_type + >::value + >::type* = 0) +{ + size_t count = detail::iterator_range_size(keys_first, keys_last); + + if(count < 32){ + detail::serial_insertion_sort_by_key( + keys_first, keys_last, values_first, compare, queue + ); + } + else { + detail::radix_sort_by_key( + keys_first, keys_last, values_first, queue + ); + } +} + +template<class KeyIterator, class ValueIterator> +inline void +dispatch_gpu_ssort_by_key(KeyIterator keys_first, + KeyIterator keys_last, + ValueIterator values_first, + greater<typename std::iterator_traits<KeyIterator>::value_type> compare, + command_queue &queue, + typename boost::enable_if_c< + is_radix_sortable< + typename std::iterator_traits<KeyIterator>::value_type + >::value + >::type* = 0) +{ + size_t count = detail::iterator_range_size(keys_first, keys_last); + + if(count < 32){ + detail::serial_insertion_sort_by_key( + keys_first, keys_last, values_first, compare, queue + ); + } + else { + // radix sorts in descending order + detail::radix_sort_by_key( + keys_first, keys_last, values_first, false, queue + ); + } +} + +template<class KeyIterator, class ValueIterator, class Compare> +inline void dispatch_gpu_ssort_by_key(KeyIterator keys_first, + KeyIterator keys_last, + ValueIterator values_first, + Compare compare, + command_queue &queue) +{ + size_t count = detail::iterator_range_size(keys_first, keys_last); + + if(count < 32){ + detail::serial_insertion_sort_by_key( + keys_first, keys_last, values_first, + compare, queue + ); + } else { + detail::merge_sort_by_key_on_gpu( + keys_first, keys_last, values_first, + compare, true /* stable */, queue + ); + } +} + +template<class KeyIterator, class ValueIterator, class Compare> +inline void dispatch_ssort_by_key(KeyIterator keys_first, + KeyIterator keys_last, + ValueIterator values_first, + Compare compare, + command_queue &queue) +{ + if(queue.get_device().type() & device::gpu) { + dispatch_gpu_ssort_by_key( + keys_first, keys_last, values_first, compare, queue + ); + return; + } + ::boost::compute::detail::merge_sort_by_key_on_cpu( + keys_first, keys_last, values_first, compare, queue + ); +} + +} // end detail namespace + /// Performs a key-value stable sort using the keys in the range [\p keys_first, /// \p keys_last) on the values in the range [\p values_first, /// \p values_first \c + (\p keys_last \c - \p keys_first)) using \p compare. @@ -35,8 +134,7 @@ inline void stable_sort_by_key(KeyIterator keys_first, Compare compare, command_queue &queue = system::default_queue()) { - // sort_by_key is stable - ::boost::compute::sort_by_key( + ::boost::compute::detail::dispatch_ssort_by_key( keys_first, keys_last, values_first, compare, queue ); } diff --git a/boost/compute/algorithm/transform.hpp b/boost/compute/algorithm/transform.hpp index 022a4988bd..68750a6523 100644 --- a/boost/compute/algorithm/transform.hpp +++ b/boost/compute/algorithm/transform.hpp @@ -22,7 +22,7 @@ namespace boost { namespace compute { /// Transforms the elements in the range [\p first, \p last) using -/// \p transform and stores the results in the range beginning at +/// operator \p op and stores the results in the range beginning at /// \p result. 
/// /// For example, to calculate the absolute value for each element in a vector: diff --git a/boost/compute/config.hpp b/boost/compute/config.hpp index 77d0d7b9df..e501ed28c9 100644 --- a/boost/compute/config.hpp +++ b/boost/compute/config.hpp @@ -16,14 +16,13 @@ #include <boost/compute/cl.hpp> // check for minimum required boost version -#if BOOST_VERSION < 104800 -#error Boost.Compute requires Boost version 1.48 or later +#if BOOST_VERSION < 105400 +#error Boost.Compute requires Boost version 1.54 or later #endif // the BOOST_COMPUTE_NO_VARIADIC_TEMPLATES macro is defined // if the compiler does not *fully* support variadic templates #if defined(BOOST_NO_CXX11_VARIADIC_TEMPLATES) || \ - defined(BOOST_NO_VARIADIC_TEMPLATES) || \ (defined(__GNUC__) && !defined(__clang__) && \ __GNUC__ == 4 && __GNUC_MINOR__ <= 6) #define BOOST_COMPUTE_NO_VARIADIC_TEMPLATES @@ -32,7 +31,6 @@ // the BOOST_COMPUTE_NO_STD_TUPLE macro is defined if the // compiler/stdlib does not support std::tuple #if defined(BOOST_NO_CXX11_HDR_TUPLE) || \ - defined(BOOST_NO_0X_HDR_TUPLE) || \ defined(BOOST_COMPUTE_NO_VARIADIC_TEMPLATES) #define BOOST_COMPUTE_NO_STD_TUPLE #endif // BOOST_NO_CXX11_HDR_TUPLE @@ -53,17 +51,15 @@ #endif #if !defined(BOOST_COMPUTE_DOXYGEN_INVOKED) && \ - (defined(BOOST_NO_CXX11_RVALUE_REFERENCES) || defined(BOOST_NO_RVALUE_REFERENCES)) + defined(BOOST_NO_CXX11_RVALUE_REFERENCES) # define BOOST_COMPUTE_NO_RVALUE_REFERENCES #endif // BOOST_NO_CXX11_RVALUE_REFERENCES -#if defined(BOOST_NO_CXX11_HDR_INITIALIZER_LIST) || \ - defined(BOOST_NO_0X_HDR_INITIALIZER_LIST) +#if defined(BOOST_NO_CXX11_HDR_INITIALIZER_LIST) # define BOOST_COMPUTE_NO_HDR_INITIALIZER_LIST #endif // BOOST_NO_CXX11_HDR_INITIALIZER_LIST -#if defined(BOOST_NO_CXX11_HDR_CHRONO) || \ - defined(BOOST_NO_0X_HDR_CHRONO) +#if defined(BOOST_NO_CXX11_HDR_CHRONO) # define BOOST_COMPUTE_NO_HDR_CHRONO #endif // BOOST_NO_CXX11_HDR_CHRONO diff --git a/boost/compute/container/array.hpp b/boost/compute/container/array.hpp index 919be6eeac..2b504be4a8 100644 --- a/boost/compute/container/array.hpp +++ b/boost/compute/container/array.hpp @@ -73,20 +73,33 @@ public: array(const array<T, N> &other) : m_buffer(other.m_buffer.get_context(), sizeof(T) * N) { - boost::compute::copy(other.begin(), other.end(), begin()); + command_queue queue = default_queue(); + boost::compute::copy(other.begin(), other.end(), begin(), queue); + queue.finish(); } array(const boost::array<T, N> &array, const context &context = system::default_context()) : m_buffer(context, sizeof(T) * N) { - boost::compute::copy(array.begin(), array.end(), begin()); + command_queue queue = default_queue(); + boost::compute::copy(array.begin(), array.end(), begin(), queue); + queue.finish(); + } + + array(const array<T, N> &other, + const command_queue &queue) + : m_buffer(other.m_buffer.get_context(), sizeof(T) * N) + { + boost::compute::copy(other.begin(), other.end(), begin(), queue); } array<T, N>& operator=(const array<T, N> &other) { if(this != &other){ - boost::compute::copy(other.begin(), other.end(), begin()); + command_queue queue = default_queue(); + boost::compute::copy(other.begin(), other.end(), begin(), queue); + queue.finish(); } return *this; @@ -94,7 +107,9 @@ public: array<T, N>& operator=(const boost::array<T, N> &array) { - boost::compute::copy(array.begin(), array.end(), begin()); + command_queue queue = default_queue(); + boost::compute::copy(array.begin(), array.end(), begin(), queue); + queue.finish(); return *this; } @@ -226,14 +241,28 @@ public: return *(end() - 
static_cast<difference_type>(1)); } + void fill(const value_type &value, const command_queue &queue) + { + ::boost::compute::fill(begin(), end(), value, queue); + } + + void swap(array<T, N> &other, const command_queue &queue) + { + ::boost::compute::swap_ranges(begin(), end(), other.begin(), queue); + } + void fill(const value_type &value) { - ::boost::compute::fill(begin(), end(), value); + command_queue queue = default_queue(); + ::boost::compute::fill(begin(), end(), value, queue); + queue.finish(); } void swap(array<T, N> &other) { - ::boost::compute::swap_ranges(begin(), end(), other.begin()); + command_queue queue = default_queue(); + ::boost::compute::swap_ranges(begin(), end(), other.begin(), queue); + queue.finish(); } const buffer& get_buffer() const @@ -243,6 +272,13 @@ public: private: buffer m_buffer; + + command_queue default_queue() const + { + const context &context = m_buffer.get_context(); + command_queue queue(context, context.get_device()); + return queue; + } }; namespace detail { diff --git a/boost/compute/container/vector.hpp b/boost/compute/container/vector.hpp index 47d649ad99..aa0eb98ce9 100644 --- a/boost/compute/container/vector.hpp +++ b/boost/compute/container/vector.hpp @@ -196,8 +196,15 @@ public: m_data = m_allocator.allocate((std::max)(m_size, _minimum_capacity())); if(!other.empty()){ - ::boost::compute::copy(other.begin(), other.end(), begin(), queue); - queue.finish(); + if(other.get_buffer().get_context() != queue.get_context()){ + command_queue other_queue = other.default_queue(); + ::boost::compute::copy(other.begin(), other.end(), begin(), other_queue); + other_queue.finish(); + } + else { + ::boost::compute::copy(other.begin(), other.end(), begin(), queue); + queue.finish(); + } } } @@ -253,6 +260,17 @@ public: } template<class OtherAlloc> + vector& operator=(const vector<T, OtherAlloc> &other) + { + command_queue queue = default_queue(); + resize(other.size(), queue); + ::boost::compute::copy(other.begin(), other.end(), begin(), queue); + queue.finish(); + + return *this; + } + + template<class OtherAlloc> vector& operator=(const std::vector<T, OtherAlloc> &vector) { command_queue queue = default_queue(); @@ -371,7 +389,7 @@ public: /// Resizes the vector to \p size. void resize(size_type size, command_queue &queue) { - if(size < capacity()){ + if(size <= capacity()){ m_size = size; } else { diff --git a/boost/compute/context.hpp b/boost/compute/context.hpp index 5db39e9d83..61e84e9767 100644 --- a/boost/compute/context.hpp +++ b/boost/compute/context.hpp @@ -204,7 +204,7 @@ public: typename detail::get_object_info_type<context, Enum>::type get_info() const; - /// Returns \c true if the context is the same at \p other. + /// Returns \c true if the context is the same as \p other. bool operator==(const context &other) const { return m_context == other.m_context; diff --git a/boost/compute/detail/literal.hpp b/boost/compute/detail/literal.hpp index 0d23b1d4d2..874830ee49 100644 --- a/boost/compute/detail/literal.hpp +++ b/boost/compute/detail/literal.hpp @@ -27,7 +27,15 @@ template<class T> std::string make_literal(T x) { std::stringstream s; - s << std::setprecision(std::numeric_limits<T>::digits10) + s << std::setprecision( +#ifndef BOOST_NO_CXX11_NUMERIC_LIMITS + std::numeric_limits<T>::max_digits10 +#else + // We don't have max_digits10, so add 3 other digits (this is what is required for + // float, and is one more than required for double). 
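+    // (float: digits10 == 6 and 6 + 3 == 9 == max_digits10;
+    //  double: digits10 == 15 and 15 + 3 == 18, while max_digits10 == 17)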
+ 3 + std::numeric_limits<T>::digits10 +#endif + ) << std::scientific << x; diff --git a/boost/compute/detail/meta_kernel.hpp b/boost/compute/detail/meta_kernel.hpp index 7be778b025..5e6d6e0337 100644 --- a/boost/compute/detail/meta_kernel.hpp +++ b/boost/compute/detail/meta_kernel.hpp @@ -34,6 +34,7 @@ #include <boost/compute/image/image2d.hpp> #include <boost/compute/image/image_sampler.hpp> #include <boost/compute/memory_object.hpp> +#include <boost/compute/memory/svm_ptr.hpp> #include <boost/compute/detail/device_ptr.hpp> #include <boost/compute/detail/sha1.hpp> #include <boost/compute/utility/program_cache.hpp> @@ -203,6 +204,28 @@ struct meta_kernel_buffer_info size_t index; }; +struct meta_kernel_svm_info +{ + template <class T> + meta_kernel_svm_info(const svm_ptr<T> ptr, + const std::string &id, + memory_object::address_space addr_space, + size_t i) + : ptr(ptr.get()), + identifier(id), + address_space(addr_space), + index(i) + { + + } + + void* ptr; + std::string identifier; + memory_object::address_space address_space; + size_t index; +}; + + class meta_kernel; template<class Type> @@ -280,12 +303,14 @@ public: meta_kernel(const meta_kernel &other) { m_source.str(other.m_source.str()); + m_options = other.m_options; } meta_kernel& operator=(const meta_kernel &other) { if(this != &other){ m_source.str(other.m_source.str()); + m_options = other.m_options; } return *this; @@ -342,9 +367,11 @@ public: boost::shared_ptr<program_cache> cache = program_cache::get_global_cache(context); + std::string compile_options = m_options + options; + // load (or build) program from cache ::boost::compute::program program = - cache->get_or_build(cache_key, options, source, context); + cache->get_or_build(cache_key, compile_options, source, context); // create kernel ::boost::compute::kernel kernel = program.create_kernel(name()); @@ -365,6 +392,13 @@ public: kernel.set_arg(bi.index, bi.m_mem); } + // bind svm args + for(size_t i = 0; i < m_stored_svm_ptrs.size(); i++){ + const detail::meta_kernel_svm_info &spi = m_stored_svm_ptrs[i]; + + kernel.set_arg_svm_ptr(spi.index, spi.ptr); + } + return kernel; } @@ -689,6 +723,45 @@ public: return identifier; } + template<class T> + std::string get_svm_identifier(const svm_ptr<T> &svm_ptr, + const memory_object::address_space address_space = + memory_object::global_memory) + { + BOOST_ASSERT( + (address_space == memory_object::global_memory) + || (address_space == memory_object::constant_memory) + ); + + // check if we've already seen this pointer + for(size_t i = 0; i < m_stored_svm_ptrs.size(); i++){ + const detail::meta_kernel_svm_info &spi = m_stored_svm_ptrs[i]; + + if(spi.ptr == svm_ptr.get() && + spi.address_space == address_space){ + return spi.identifier; + } + } + + // create a new binding + std::string identifier = + "_svm_ptr" + lexical_cast<std::string>(m_stored_svm_ptrs.size()); + size_t index = add_arg<T *>(address_space, identifier); + + if(m_stored_svm_ptrs.empty()) { + m_options += std::string(" -cl-std=CL2.0"); + } + + // store new svm pointer info + m_stored_svm_ptrs.push_back( + detail::meta_kernel_svm_info( + svm_ptr, identifier, address_space, index + ) + ); + + return identifier; + } + std::string get_image_identifier(const char *qualifiers, const image2d &image) { size_t index = add_arg_with_qualifiers<image2d>(qualifiers, "image"); @@ -880,8 +953,10 @@ private: std::set<std::string> m_external_function_names; std::vector<std::string> m_args; std::string m_pragmas; + std::string m_options; 
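+    // m_options collects extra compile options (e.g. "-cl-std=CL2.0", added
+    // when SVM pointers are bound) and is prepended to the user-supplied
+    // options in compile()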
std::vector<detail::meta_kernel_stored_arg> m_stored_args; std::vector<detail::meta_kernel_buffer_info> m_stored_buffers; + std::vector<detail::meta_kernel_svm_info> m_stored_svm_ptrs; }; template<class ResultType, class ArgTuple> @@ -960,6 +1035,18 @@ inline meta_kernel& operator<<(meta_kernel &kernel, } } +// SVM requires OpenCL 2.0 +#if defined(CL_VERSION_2_0) || defined(BOOST_COMPUTE_DOXYGEN_INVOKED) +template<class T, class IndexExpr> +inline meta_kernel& operator<<(meta_kernel &kernel, + const svm_ptr_index_expr<T, IndexExpr> &expr) +{ + return kernel << + kernel.get_svm_identifier<T>(expr.m_svm_ptr) << + '[' << expr.m_expr << ']'; +} +#endif + template<class Predicate, class Arg> inline meta_kernel& operator<<(meta_kernel &kernel, const invoked_unary_negate_function<Predicate, diff --git a/boost/compute/detail/parameter_cache.hpp b/boost/compute/detail/parameter_cache.hpp index 2a856311e1..0a16cd9b0e 100644 --- a/boost/compute/detail/parameter_cache.hpp +++ b/boost/compute/detail/parameter_cache.hpp @@ -161,7 +161,7 @@ private: try { read_json(m_file_name, pt); } - catch(boost::property_tree::json_parser::json_parser_error &e){ + catch(boost::property_tree::json_parser::json_parser_error&){ // no saved cache file, ignore return; } diff --git a/boost/compute/detail/vendor.hpp b/boost/compute/detail/vendor.hpp index 0aa9c9c0d4..e2359cb466 100644 --- a/boost/compute/detail/vendor.hpp +++ b/boost/compute/detail/vendor.hpp @@ -31,6 +31,18 @@ inline bool is_amd_device(const device &device) return device.platform().vendor() == "Advanced Micro Devices, Inc."; } +// returns true if the platform is Apple OpenCL platform +inline bool is_apple_platform(const platform &platform) +{ + return platform.name() == "Apple"; +} + +// returns true if the device is from Apple OpenCL Platform +inline bool is_apple_platform_device(const device &device) +{ + return is_apple_platform(device.platform()); +} + } // end detail namespace } // end compute namespace } // end boost namespace diff --git a/boost/compute/exception/context_error.hpp b/boost/compute/exception/context_error.hpp index eeb387d884..ec8dc7c30f 100644 --- a/boost/compute/exception/context_error.hpp +++ b/boost/compute/exception/context_error.hpp @@ -70,7 +70,7 @@ public: } /// Returns the size of the private info memory block. - const size_t get_private_info_size() const throw() + size_t get_private_info_size() const throw() { return m_private_info_size; } diff --git a/boost/compute/function.hpp b/boost/compute/function.hpp index e83f16808a..b0b893e948 100644 --- a/boost/compute/function.hpp +++ b/boost/compute/function.hpp @@ -215,7 +215,7 @@ public: { BOOST_STATIC_ASSERT_MSG( arity == 3, - "Non-ternary function invoked with two arguments" + "Non-ternary function invoked with three arguments" ); return detail::invoked_function<result_type, boost::tuple<Arg1, Arg2, Arg3> >( diff --git a/boost/compute/interop/opengl/acquire.hpp b/boost/compute/interop/opengl/acquire.hpp index 10af4338fb..b9259e0d88 100644 --- a/boost/compute/interop/opengl/acquire.hpp +++ b/boost/compute/interop/opengl/acquire.hpp @@ -14,6 +14,7 @@ #include <boost/compute/command_queue.hpp> #include <boost/compute/interop/opengl/cl_gl.hpp> #include <boost/compute/interop/opengl/opengl_buffer.hpp> +#include <boost/compute/types/fundamental.hpp> #include <boost/compute/utility/wait_list.hpp> namespace boost { @@ -22,7 +23,7 @@ namespace compute { /// Enqueues a command to acquire the specified OpenGL memory objects. 
/// /// \see_opencl_ref{clEnqueueAcquireGLObjects} -inline event opengl_enqueue_acquire_gl_objects(size_t num_objects, +inline event opengl_enqueue_acquire_gl_objects(const uint_ num_objects, const cl_mem *mem_objects, command_queue &queue, const wait_list &events = wait_list()) @@ -47,7 +48,7 @@ inline event opengl_enqueue_acquire_gl_objects(size_t num_objects, /// Enqueues a command to release the specified OpenGL memory objects. /// /// \see_opencl_ref{clEnqueueReleaseGLObjects} -inline event opengl_enqueue_release_gl_objects(size_t num_objects, +inline event opengl_enqueue_release_gl_objects(const uint_ num_objects, const cl_mem *mem_objects, command_queue &queue, const wait_list &events = wait_list()) diff --git a/boost/compute/interop/opengl/context.hpp b/boost/compute/interop/opengl/context.hpp index 754dca2236..c35fedddfa 100644 --- a/boost/compute/interop/opengl/context.hpp +++ b/boost/compute/interop/opengl/context.hpp @@ -78,7 +78,7 @@ inline context opengl_create_shared_context() // load clGetGLContextInfoKHR() extension function GetGLContextInfoKHRFunction GetGLContextInfoKHR = reinterpret_cast<GetGLContextInfoKHRFunction>( - reinterpret_cast<unsigned long>( + reinterpret_cast<size_t>( platform.get_extension_function_address("clGetGLContextInfoKHR") ) ); diff --git a/boost/compute/kernel.hpp b/boost/compute/kernel.hpp index 9494e46de2..72f21a0378 100644 --- a/boost/compute/kernel.hpp +++ b/boost/compute/kernel.hpp @@ -22,7 +22,6 @@ #include <boost/compute/type_traits/is_fundamental.hpp> #include <boost/compute/detail/get_object_info.hpp> #include <boost/compute/detail/assert_cl_success.hpp> -#include <boost/compute/memory/svm_ptr.hpp> namespace boost { namespace compute { @@ -189,8 +188,15 @@ public: template<class T> T get_arg_info(size_t index, cl_kernel_arg_info info) const { - return detail::get_object_info<T>(clGetKernelArgInfo, m_kernel, info, index); + return detail::get_object_info<T>( + clGetKernelArgInfo, m_kernel, info, static_cast<cl_uint>(index) + ); } + + /// \overload + template<int Enum> + typename detail::get_object_info_type<kernel, Enum>::type + get_arg_info(size_t index) const; #endif // CL_VERSION_1_2 /// Returns work-group information for the kernel with \p device. 
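The new get_arg_info<Enum>() overload (specialized near the end of this file's diff) gives typed access to clGetKernelArgInfo. A hedged sketch; argument info requires OpenCL 1.2 and a program built with the "-cl-kernel-arg-info" option:

    #ifdef CL_VERSION_1_2
    #include <string>
    #include <boost/compute/kernel.hpp>

    // Returns the declared name of the kernel's first argument; an
    // opencl_error should be thrown if the program was not compiled
    // with "-cl-kernel-arg-info".
    std::string first_arg_name(const boost::compute::kernel &k)
    {
        return k.get_arg_info<CL_KERNEL_ARG_NAME>(0);
    }
    #endif // CL_VERSION_1_2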
@@ -258,15 +264,16 @@ public: } /// \internal_ - template<class T> - void set_arg(size_t index, const svm_ptr<T> ptr) + void set_arg_svm_ptr(size_t index, void* ptr) { #ifdef CL_VERSION_2_0 - cl_int ret = clSetKernelArgSVMPointer(m_kernel, index, ptr.get()); + cl_int ret = clSetKernelArgSVMPointer(m_kernel, static_cast<cl_uint>(index), ptr); if(ret != CL_SUCCESS){ BOOST_THROW_EXCEPTION(opencl_error(ret)); } #else + (void) index; + (void) ptr; BOOST_THROW_EXCEPTION(opencl_error(CL_INVALID_ARG_VALUE)); #endif } @@ -324,7 +331,7 @@ public: } private: - #ifndef BOOST_NO_VARIADIC_TEMPLATES + #ifndef BOOST_COMPUTE_NO_VARIADIC_TEMPLATES /// \internal_ template<size_t N> void _set_args() @@ -338,7 +345,7 @@ private: set_arg(N, arg); _set_args<N+1>(rest...); } - #endif // BOOST_NO_VARIADIC_TEMPLATES + #endif // BOOST_COMPUTE_NO_VARIADIC_TEMPLATES private: cl_kernel m_kernel; @@ -364,6 +371,23 @@ BOOST_COMPUTE_DETAIL_DEFINE_GET_INFO_SPECIALIZATIONS(kernel, ) #endif // CL_VERSION_1_2 +/// \internal_ define get_arg_info() specializations for kernel +#ifdef CL_VERSION_1_2 +#define BOOST_COMPUTE_DETAIL_DEFINE_KERNEL_GET_ARG_INFO_SPECIALIZATION(result_type, value) \ + namespace detail { \ + template<> struct get_object_info_type<kernel, value> { typedef result_type type; }; \ + } \ + template<> inline result_type kernel::get_arg_info<value>(size_t index) const { \ + return get_arg_info<result_type>(index, value); \ + } + +BOOST_COMPUTE_DETAIL_DEFINE_KERNEL_GET_ARG_INFO_SPECIALIZATION(cl_kernel_arg_address_qualifier, CL_KERNEL_ARG_ADDRESS_QUALIFIER) +BOOST_COMPUTE_DETAIL_DEFINE_KERNEL_GET_ARG_INFO_SPECIALIZATION(cl_kernel_arg_access_qualifier, CL_KERNEL_ARG_ACCESS_QUALIFIER) +BOOST_COMPUTE_DETAIL_DEFINE_KERNEL_GET_ARG_INFO_SPECIALIZATION(std::string, CL_KERNEL_ARG_TYPE_NAME) +BOOST_COMPUTE_DETAIL_DEFINE_KERNEL_GET_ARG_INFO_SPECIALIZATION(cl_kernel_arg_type_qualifier, CL_KERNEL_ARG_TYPE_QUALIFIER) +BOOST_COMPUTE_DETAIL_DEFINE_KERNEL_GET_ARG_INFO_SPECIALIZATION(std::string, CL_KERNEL_ARG_NAME) +#endif // CL_VERSION_1_2 + namespace detail { // set_kernel_arg implementation for built-in types diff --git a/boost/compute/memory/svm_ptr.hpp b/boost/compute/memory/svm_ptr.hpp index 2dbcb8f635..0c9d88035c 100644 --- a/boost/compute/memory/svm_ptr.hpp +++ b/boost/compute/memory/svm_ptr.hpp @@ -11,12 +11,67 @@ #ifndef BOOST_COMPUTE_MEMORY_SVM_PTR_HPP #define BOOST_COMPUTE_MEMORY_SVM_PTR_HPP +#include <boost/type_traits.hpp> +#include <boost/static_assert.hpp> +#include <boost/assert.hpp> + #include <boost/compute/cl.hpp> +#include <boost/compute/kernel.hpp> +#include <boost/compute/context.hpp> +#include <boost/compute/command_queue.hpp> #include <boost/compute/type_traits/is_device_iterator.hpp> namespace boost { namespace compute { +// forward declaration for svm_ptr<T> +template<class T> +class svm_ptr; + +// svm functions require OpenCL 2.0 +#if defined(CL_VERSION_2_0) || defined(BOOST_COMPUTE_DOXYGEN_INVOKED) +namespace detail { + +template<class T, class IndexExpr> +struct svm_ptr_index_expr +{ + typedef T result_type; + + svm_ptr_index_expr(const svm_ptr<T> &svm_ptr, + const IndexExpr &expr) + : m_svm_ptr(svm_ptr), + m_expr(expr) + { + } + + operator T() const + { + BOOST_STATIC_ASSERT_MSG(boost::is_integral<IndexExpr>::value, + "Index expression must be integral"); + + BOOST_ASSERT(m_svm_ptr.get()); + + const context &context = m_svm_ptr.get_context(); + const device &device = context.get_device(); + command_queue queue(context, device); + + T value; + T* ptr = + static_cast<T*>(m_svm_ptr.get()) + 
static_cast<std::ptrdiff_t>(m_expr);
+        queue.enqueue_svm_map(static_cast<void*>(ptr), sizeof(T), CL_MAP_READ);
+        value = *(ptr);
+        queue.enqueue_svm_unmap(static_cast<void*>(ptr)).wait();
+
+        return value;
+    }
+
+    const svm_ptr<T> &m_svm_ptr;
+    IndexExpr m_expr;
+};
+
+} // end detail namespace
+#endif
+
 template<class T>
 class svm_ptr
 {
@@ -32,19 +87,22 @@ public:
     {
     }
 
-    explicit svm_ptr(void *ptr)
-        : m_ptr(static_cast<T*>(ptr))
+    svm_ptr(void *ptr, const context &context)
+        : m_ptr(static_cast<T*>(ptr)),
+          m_context(context)
     {
     }
 
     svm_ptr(const svm_ptr<T> &other)
-        : m_ptr(other.m_ptr)
+        : m_ptr(other.m_ptr),
+          m_context(other.m_context)
     {
     }
 
-    svm_ptr& operator=(const svm_ptr<T> &other)
+    svm_ptr<T>& operator=(const svm_ptr<T> &other)
     {
         m_ptr = other.m_ptr;
+        m_context = other.m_context;
         return *this;
     }
 
@@ -59,18 +117,53 @@ public:
 
     svm_ptr<T> operator+(difference_type n)
     {
-        return svm_ptr<T>(m_ptr + n);
+        return svm_ptr<T>(m_ptr + n, m_context);
     }
 
     difference_type operator-(svm_ptr<T> other)
     {
+        BOOST_ASSERT(other.m_context == m_context);
         return m_ptr - other.m_ptr;
     }
 
+    const context& get_context() const
+    {
+        return m_context;
+    }
+
+    // svm functions require OpenCL 2.0
+    #if defined(CL_VERSION_2_0) || defined(BOOST_COMPUTE_DOXYGEN_INVOKED)
+    /// \internal_
+    template<class Expr>
+    detail::svm_ptr_index_expr<T, Expr>
+    operator[](const Expr &expr) const
+    {
+        BOOST_ASSERT(m_ptr);
+
+        return detail::svm_ptr_index_expr<T, Expr>(*this,
+                                                   expr);
+    }
+    #endif
+
 private:
     T *m_ptr;
+    context m_context;
 };
 
+namespace detail {
+
+/// \internal_
+template<class T>
+struct set_kernel_arg<svm_ptr<T> >
+{
+    void operator()(kernel &kernel_, size_t index, const svm_ptr<T> &ptr)
+    {
+        kernel_.set_arg_svm_ptr(index, ptr.get());
+    }
+};
+
+} // end detail namespace
+
 /// \internal_ (is_device_iterator specialization for svm_ptr)
 template<class T>
 struct is_device_iterator<svm_ptr<T> > : boost::true_type {};
diff --git a/boost/compute/random/bernoulli_distribution.hpp b/boost/compute/random/bernoulli_distribution.hpp
index edd1125090..50bf27f591 100644
--- a/boost/compute/random/bernoulli_distribution.hpp
+++ b/boost/compute/random/bernoulli_distribution.hpp
@@ -11,6 +11,9 @@
 #ifndef BOOST_COMPUTE_RANDOM_BERNOULLI_DISTRIBUTION_HPP
 #define BOOST_COMPUTE_RANDOM_BERNOULLI_DISTRIBUTION_HPP
 
+#include <boost/assert.hpp>
+#include <boost/type_traits.hpp>
+
 #include <boost/compute/command_queue.hpp>
 #include <boost/compute/function.hpp>
 #include <boost/compute/types/fundamental.hpp>
@@ -84,6 +87,11 @@ public:
 
 private:
     RealType m_p;
+
+    BOOST_STATIC_ASSERT_MSG(
+        boost::is_floating_point<RealType>::value,
+        "Template argument must be a floating point type"
+    );
 };
 
 } // end compute namespace
diff --git a/boost/compute/random/discrete_distribution.hpp b/boost/compute/random/discrete_distribution.hpp
index 3707928f98..86249538ac 100644
--- a/boost/compute/random/discrete_distribution.hpp
+++ b/boost/compute/random/discrete_distribution.hpp
@@ -11,6 +11,12 @@
 #ifndef BOOST_COMPUTE_RANDOM_DISCRETE_DISTRIBUTION_HPP
 #define BOOST_COMPUTE_RANDOM_DISCRETE_DISTRIBUTION_HPP
 
+#include <numeric>
+
+#include <boost/config.hpp>
+#include <boost/type_traits.hpp>
+#include <boost/static_assert.hpp>
+
 #include <boost/compute/command_queue.hpp>
 #include <boost/compute/function.hpp>
 #include <boost/compute/algorithm/accumulate.hpp>
@@ -38,23 +44,42 @@ class discrete_distribution
 public:
     typedef IntType result_type;
 
+    /// Creates a new discrete distribution with a single weight p = { 1 }.
+    /// This distribution produces only zeroes.
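+    /// This matches the behavior of std::discrete_distribution's default
+    /// constructor.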
diff --git a/boost/compute/random/bernoulli_distribution.hpp b/boost/compute/random/bernoulli_distribution.hpp
index edd1125090..50bf27f591 100644
--- a/boost/compute/random/bernoulli_distribution.hpp
+++ b/boost/compute/random/bernoulli_distribution.hpp
@@ -11,6 +11,9 @@
 #ifndef BOOST_COMPUTE_RANDOM_BERNOULLI_DISTRIBUTION_HPP
 #define BOOST_COMPUTE_RANDOM_BERNOULLI_DISTRIBUTION_HPP
 
+#include <boost/assert.hpp>
+#include <boost/type_traits.hpp>
+
 #include <boost/compute/command_queue.hpp>
 #include <boost/compute/function.hpp>
 #include <boost/compute/types/fundamental.hpp>
@@ -84,6 +87,11 @@ public:
 
 private:
     RealType m_p;
+
+    BOOST_STATIC_ASSERT_MSG(
+        boost::is_floating_point<RealType>::value,
+        "Template argument must be a floating point type"
+    );
 };
 
 } // end compute namespace
diff --git a/boost/compute/random/discrete_distribution.hpp b/boost/compute/random/discrete_distribution.hpp
index 3707928f98..86249538ac 100644
--- a/boost/compute/random/discrete_distribution.hpp
+++ b/boost/compute/random/discrete_distribution.hpp
@@ -11,6 +11,12 @@
 #ifndef BOOST_COMPUTE_RANDOM_DISCRETE_DISTRIBUTION_HPP
 #define BOOST_COMPUTE_RANDOM_DISCRETE_DISTRIBUTION_HPP
 
+#include <numeric>
+
+#include <boost/config.hpp>
+#include <boost/type_traits.hpp>
+#include <boost/static_assert.hpp>
+
 #include <boost/compute/command_queue.hpp>
 #include <boost/compute/function.hpp>
 #include <boost/compute/algorithm/accumulate.hpp>
@@ -38,23 +44,42 @@ class discrete_distribution
 public:
     typedef IntType result_type;
 
+    /// Creates a new discrete distribution with a single weight p = { 1 }.
+    /// This distribution produces only zeroes.
+    discrete_distribution()
+        : m_probabilities(1, double(1)),
+          m_scanned_probabilities(1, double(1))
+    {
+
+    }
+
     /// Creates a new discrete distribution with weights given by
-    /// the range [\p first, \p last)
+    /// the range [\p first, \p last).
     template<class InputIterator>
     discrete_distribution(InputIterator first, InputIterator last)
-        : m_n(std::distance(first, last)),
-          m_probabilities(std::distance(first, last))
+        : m_probabilities(first, last),
+          m_scanned_probabilities(std::distance(first, last))
     {
-        double sum = 0;
-
-        for(InputIterator iter = first; iter!=last; iter++)
-        {
-            sum += *iter;
+        if(first != last) {
+            // after this m_scanned_probabilities.back() is the sum of all
+            // weights from the range [first, last)
+            std::partial_sum(first, last, m_scanned_probabilities.begin());
+
+            std::vector<double>::iterator i = m_probabilities.begin();
+            std::vector<double>::iterator j = m_scanned_probabilities.begin();
+            for(; i != m_probabilities.end(); ++i, ++j)
+            {
+                // dividing each weight by the sum of all weights to
+                // get probabilities
+                *i = *i / m_scanned_probabilities.back();
+                // dividing each partial sum of weights by the sum of
+                // all weights to get partial sums of probabilities
+                *j = *j / m_scanned_probabilities.back();
+            }
         }
-
-        for(size_t i=0; i<m_n; i++)
-        {
-            m_probabilities[i] = m_probabilities[i-1] + first[i]/sum;
+        else {
+            m_probabilities.push_back(double(1));
+            m_scanned_probabilities.push_back(double(1));
         }
     }
 
@@ -63,19 +88,31 @@ public:
     {
     }
 
-    /// Returns the value of n
-    result_type n() const
-    {
-        return m_n;
-    }
-
     /// Returns the probabilities
     ::std::vector<double> probabilities() const
     {
        return m_probabilities;
     }
 
-    /// Generates uniformily distributed integers and stores
+    /// Returns the minimum potentially generated value.
+    result_type min BOOST_PREVENT_MACRO_SUBSTITUTION () const
+    {
+        return result_type(0);
+    }
+
+    /// Returns the maximum potentially generated value.
+    result_type max BOOST_PREVENT_MACRO_SUBSTITUTION () const
+    {
+        size_t type_max = static_cast<size_t>(
+            (std::numeric_limits<result_type>::max)()
+        );
+        if(m_probabilities.size() - 1 > type_max) {
+            return (std::numeric_limits<result_type>::max)();
+        }
+        return static_cast<result_type>(m_probabilities.size() - 1);
+    }
+
+    /// Generates uniformly distributed integers and stores
     /// them to the range [\p first, \p last).
     template<class OutputIterator, class Generator>
     void generate(OutputIterator first,
@@ -83,32 +120,38 @@ public:
                   Generator &generator,
                   command_queue &queue)
     {
-        std::string source = "inline uint scale_random(uint x)\n";
+        std::string source = "inline IntType scale_random(uint x)\n";
 
         source = source +
             "{\n" +
            "float rno = convert_float(x) / UINT_MAX;\n";
-        for(size_t i=0; i<m_n; i++)
+        for(size_t i = 0; i < m_scanned_probabilities.size() - 1; i++)
         {
             source = source +
-                "if(rno <= " + detail::make_literal<float>(m_probabilities[i]) + ")\n" +
+                "if(rno <= " + detail::make_literal<float>(m_scanned_probabilities[i]) + ")\n" +
                 "    return " + detail::make_literal(i) + ";\n";
         }
 
         source = source +
-            "return " + detail::make_literal(m_n - 1) + ";\n" +
+            "return " + detail::make_literal(m_scanned_probabilities.size() - 1) + ";\n" +
             "}\n";
 
         BOOST_COMPUTE_FUNCTION(IntType, scale_random, (const uint_ x), {});
 
         scale_random.set_source(source);
+        scale_random.define("IntType", type_name<IntType>());
 
         generator.generate(first, last, scale_random, queue);
     }
 
 private:
-    size_t m_n;
     ::std::vector<double> m_probabilities;
+    ::std::vector<double> m_scanned_probabilities;
+
+    BOOST_STATIC_ASSERT_MSG(
+        boost::is_integral<IntType>::value,
+        "Template argument must be integral"
+    );
 };
 
 } // end compute namespace
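With the rewritten constructor, the weights are normalized once on the host and the partial sums drive the comparison chain generated in scale_random(). A short sketch of typical usage (names and values are illustrative):

```cpp
#include <boost/compute/core.hpp>
#include <boost/compute/container/vector.hpp>
#include <boost/compute/random/default_random_engine.hpp>
#include <boost/compute/random/discrete_distribution.hpp>
#include <boost/compute/types/fundamental.hpp>

namespace compute = boost::compute;

int main()
{
    compute::device device = compute::system::default_device();
    compute::context context(device);
    compute::command_queue queue(context, device);

    // weights 1:2:1 are normalized to probabilities { 0.25, 0.5, 0.25 }
    int weights[] = { 1, 2, 1 };

    compute::default_random_engine engine(queue);
    compute::discrete_distribution<compute::uint_> distribution(
        weights, weights + 3
    );

    // fill a device vector with values drawn from { 0, 1, 2 }
    compute::vector<compute::uint_> vec(100, context);
    distribution.generate(vec.begin(), vec.end(), engine, queue);

    return 0;
}
```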
diff --git a/boost/compute/random/normal_distribution.hpp b/boost/compute/random/normal_distribution.hpp
index d025faeb2e..4693e4fffe 100644
--- a/boost/compute/random/normal_distribution.hpp
+++ b/boost/compute/random/normal_distribution.hpp
@@ -13,6 +13,9 @@
 
 #include <limits>
 
+#include <boost/assert.hpp>
+#include <boost/type_traits.hpp>
+
 #include <boost/compute/command_queue.hpp>
 #include <boost/compute/function.hpp>
 #include <boost/compute/types/fundamental.hpp>
@@ -90,11 +93,19 @@ public:
 
         BOOST_COMPUTE_FUNCTION(RealType2, box_muller, (const uint2_ x),
         {
-            const RealType x1 = x.x / (RealType) (UINT_MAX - 1);
-            const RealType x2 = x.y / (RealType) (UINT_MAX - 1);
+            const RealType one = 1;
+            const RealType two = 2;
+
+            // Use nextafter to push values down into the [0,1) range; without it,
+            // floating-point rounding can make x1 exactly 1, which would mean taking
+            // the log of 0 below and producing a negative infinity.
+            const RealType x1 = nextafter(x.x / (RealType) UINT_MAX, (RealType) 0);
+            const RealType x2 = x.y / (RealType) UINT_MAX;
 
-            const RealType z1 = sqrt(-2.f * log2(x1)) * cos(2.f * M_PI_F * x2);
-            const RealType z2 = sqrt(-2.f * log2(x1)) * sin(2.f * M_PI_F * x2);
+            const RealType rho = sqrt(-two * log(one-x1));
+
+            const RealType z1 = rho * cos(two * M_PI_F * x2);
+            const RealType z2 = rho * sin(two * M_PI_F * x2);
 
             return (RealType2)(MEAN, MEAN) + (RealType2)(z1, z2) * (RealType2)(STDDEV, STDDEV);
         });
@@ -116,6 +127,11 @@ public:
 private:
     RealType m_mean;
     RealType m_stddev;
+
+    BOOST_STATIC_ASSERT_MSG(
+        boost::is_floating_point<RealType>::value,
+        "Template argument must be a floating point type"
+    );
 };
 
 } // end compute namespace
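The corrected Box-Muller transform consumes two uniform 32-bit values per pair of outputs, and the nextafter() clamp keeps log(one-x1) finite. Typical usage might look like this (a sketch; names and parameters are illustrative):

```cpp
#include <boost/compute/core.hpp>
#include <boost/compute/container/vector.hpp>
#include <boost/compute/random/default_random_engine.hpp>
#include <boost/compute/random/normal_distribution.hpp>

namespace compute = boost::compute;

int main()
{
    compute::device device = compute::system::default_device();
    compute::context context(device);
    compute::command_queue queue(context, device);

    compute::default_random_engine engine(queue);

    // mean 0, standard deviation 1
    compute::normal_distribution<float> distribution(0.0f, 1.0f);

    // each pair of generated values comes from one Box-Muller transform
    compute::vector<float> vec(1000, context);
    distribution.generate(vec.begin(), vec.end(), engine, queue);

    return 0;
}
```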
diff --git a/boost/compute/random/uniform_int_distribution.hpp b/boost/compute/random/uniform_int_distribution.hpp
index 92e8b3305f..20448afec0 100644
--- a/boost/compute/random/uniform_int_distribution.hpp
+++ b/boost/compute/random/uniform_int_distribution.hpp
@@ -13,6 +13,9 @@
 
 #include <limits>
 
+#include <boost/type_traits.hpp>
+#include <boost/static_assert.hpp>
+
 #include <boost/compute/command_queue.hpp>
 #include <boost/compute/container/vector.hpp>
 #include <boost/compute/function.hpp>
@@ -103,6 +106,11 @@ public:
 private:
     IntType m_a;
     IntType m_b;
+
+    BOOST_STATIC_ASSERT_MSG(
+        boost::is_integral<IntType>::value,
+        "Template argument must be integral"
+    );
 };
 
 } // end compute namespace
diff --git a/boost/compute/random/uniform_real_distribution.hpp b/boost/compute/random/uniform_real_distribution.hpp
index 231b0dba01..75d9659d6a 100644
--- a/boost/compute/random/uniform_real_distribution.hpp
+++ b/boost/compute/random/uniform_real_distribution.hpp
@@ -11,6 +11,9 @@
 #ifndef BOOST_COMPUTE_RANDOM_UNIFORM_REAL_DISTRIBUTION_HPP
 #define BOOST_COMPUTE_RANDOM_UNIFORM_REAL_DISTRIBUTION_HPP
 
+#include <boost/assert.hpp>
+#include <boost/type_traits.hpp>
+
 #include <boost/compute/command_queue.hpp>
 #include <boost/compute/function.hpp>
 #include <boost/compute/detail/literal.hpp>
@@ -20,7 +23,7 @@ namespace boost {
 namespace compute {
 
 /// \class uniform_real_distribution
-/// \brief Produces uniformily distributed random floating-point numbers.
+/// \brief Produces uniformly distributed random floating-point numbers.
 ///
 /// The following example shows how to setup a uniform real distribution to
 /// produce random \c float values between \c 1 and \c 100.
@@ -36,10 +39,12 @@ public:
 
     /// Creates a new uniform distribution producing numbers in the range
     /// [\p a, \p b).
+    /// Requires a < b
     uniform_real_distribution(RealType a = 0.f, RealType b = 1.f)
         : m_a(a),
           m_b(b)
     {
+        BOOST_ASSERT(a < b);
     }
 
     /// Destroys the uniform_real_distribution object.
@@ -59,7 +64,7 @@ public:
         return m_b;
     }
 
-    /// Generates uniformily distributed floating-point numbers and stores
+    /// Generates uniformly distributed floating-point numbers and stores
     /// them to the range [\p first, \p last).
     template<class OutputIterator, class Generator>
     void generate(OutputIterator first,
@@ -69,7 +74,7 @@ public:
     {
         BOOST_COMPUTE_FUNCTION(RealType, scale_random, (const uint_ x),
         {
-            return LO + (convert_RealType(x) / MAX_RANDOM) * (HI - LO);
+            return nextafter(LO + (convert_RealType(x) / MAX_RANDOM) * (HI - LO), (RealType) LO);
        });
 
         scale_random.define("LO", detail::make_literal(m_a));
@@ -78,6 +83,7 @@ public:
         scale_random.define(
             "convert_RealType", std::string("convert_") + type_name<RealType>()
         );
+        scale_random.define("RealType", type_name<RealType>());
 
         generator.generate(
             first, last, scale_random, queue
@@ -97,6 +103,11 @@ public:
 private:
     RealType m_a;
     RealType m_b;
+
+    BOOST_STATIC_ASSERT_MSG(
+        boost::is_floating_point<RealType>::value,
+        "Template argument must be a floating point type"
+    );
 };
 
 } // end compute namespace
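The nextafter() call in scale_random() nudges each result toward LO, which keeps the upper bound exclusive even when rounding would otherwise produce exactly HI. Usage follows the pattern described in the class documentation (a sketch):

```cpp
#include <boost/compute/core.hpp>
#include <boost/compute/container/vector.hpp>
#include <boost/compute/random/default_random_engine.hpp>
#include <boost/compute/random/uniform_real_distribution.hpp>

namespace compute = boost::compute;

int main()
{
    compute::device device = compute::system::default_device();
    compute::context context(device);
    compute::command_queue queue(context, device);

    compute::default_random_engine engine(queue);

    // produces values in [1, 100); the constructor now asserts a < b
    compute::uniform_real_distribution<float> distribution(1.0f, 100.0f);

    compute::vector<float> vec(1000, context);
    distribution.generate(vec.begin(), vec.end(), engine, queue);

    return 0;
}
```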
diff --git a/boost/compute/svm.hpp b/boost/compute/svm.hpp
index d03c8d9079..4bc3a74237 100644
--- a/boost/compute/svm.hpp
+++ b/boost/compute/svm.hpp
@@ -15,7 +15,7 @@
 #include <boost/compute/context.hpp>
 #include <boost/compute/memory/svm_ptr.hpp>
 
-// svm functions require opencl 2.0
+// svm functions require OpenCL 2.0
 #if defined(CL_VERSION_2_0) || defined(BOOST_COMPUTE_DOXYGEN_INVOKED)
 
 namespace boost {
@@ -34,7 +34,10 @@ inline svm_ptr<T> svm_alloc(const context &context,
                             cl_svm_mem_flags flags = CL_MEM_READ_WRITE,
                             unsigned int alignment = 0)
 {
-    svm_ptr<T> ptr(clSVMAlloc(context.get(), flags, size * sizeof(T), alignment));
+    svm_ptr<T> ptr(
+        clSVMAlloc(context.get(), flags, size * sizeof(T), alignment),
+        context
+    );
     if(!ptr.get()){
         BOOST_THROW_EXCEPTION(opencl_error(CL_MEM_OBJECT_ALLOCATION_FAILURE));
     }
@@ -49,6 +52,13 @@ inline svm_ptr<T> svm_alloc(const context &context,
 ///
 /// \see svm_alloc(), command_queue::enqueue_svm_free()
 template<class T>
+inline void svm_free(svm_ptr<T> ptr)
+{
+    clSVMFree(ptr.get_context(), ptr.get());
+}
+
+/// \overload
+template<class T>
 inline void svm_free(const context &context, svm_ptr<T> ptr)
 {
     clSVMFree(context.get(), ptr.get());
diff --git a/boost/compute/system.hpp b/boost/compute/system.hpp
index f205fece7d..e6ed353d68 100644
--- a/boost/compute/system.hpp
+++ b/boost/compute/system.hpp
@@ -184,14 +184,16 @@ public:
         cl_uint count = 0;
         clGetPlatformIDs(0, 0, &count);
 
-        std::vector<cl_platform_id> platform_ids(count);
-        clGetPlatformIDs(count, &platform_ids[0], 0);
-
         std::vector<platform> platforms;
-        for(size_t i = 0; i < platform_ids.size(); i++){
-            platforms.push_back(platform(platform_ids[i]));
-        }
+        if(count > 0)
+        {
+            std::vector<cl_platform_id> platform_ids(count);
+            clGetPlatformIDs(count, &platform_ids[0], 0);
 
+            for(size_t i = 0; i < platform_ids.size(); i++){
+                platforms.push_back(platform(platform_ids[i]));
+            }
+        }
         return platforms;
     }
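The guard around the second clGetPlatformIDs() call means system::platforms() now returns an empty vector on hosts with no OpenCL platform installed, instead of indexing into an empty buffer. Callers can simply iterate over the result (a sketch):

```cpp
#include <iostream>
#include <vector>
#include <boost/compute/core.hpp>

namespace compute = boost::compute;

int main()
{
    // safe even when no OpenCL platform is installed: the vector is
    // empty and the loop body never runs
    std::vector<compute::platform> platforms = compute::system::platforms();

    for(size_t i = 0; i < platforms.size(); i++){
        std::cout << platforms[i].name() << std::endl;
    }

    return 0;
}
```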
diff --git a/boost/compute/utility/wait_list.hpp b/boost/compute/utility/wait_list.hpp
index 9a7e74bac0..8b81924d63 100644
--- a/boost/compute/utility/wait_list.hpp
+++ b/boost/compute/utility/wait_list.hpp
@@ -13,6 +13,12 @@
 
 #include <vector>
 
+#include <boost/compute/config.hpp>
+
+#ifndef BOOST_COMPUTE_NO_HDR_INITIALIZER_LIST
+#include <initializer_list>
+#endif
+
 #include <boost/compute/event.hpp>
 
 namespace boost {
@@ -27,7 +33,7 @@ template<class T> class future;
 /// specify dependencies for OpenCL operations or to wait on the host until
 /// all of the events have completed.
 ///
-/// This class also provides convenience fnuctions for interacting with
+/// This class also provides convenience functions for interacting with
 /// OpenCL APIs which typically accept event dependencies as a \c cl_event*
 /// pointer and a \c cl_uint size. For example:
 /// \code
@@ -60,6 +66,14 @@ public:
     {
     }
 
+    #ifndef BOOST_COMPUTE_NO_HDR_INITIALIZER_LIST
+    /// Creates a wait-list from \p events
+    wait_list(std::initializer_list<event> events)
+        : m_events(events)
+    {
+    }
+    #endif // BOOST_COMPUTE_NO_HDR_INITIALIZER_LIST
+
     /// Copies the events in the wait-list from \p other.
     wait_list& operator=(const wait_list &other)
     {
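With the initializer-list constructor, a wait-list can be assembled inline at the call site when compiling as C++11 or later. A brief sketch (evt1 and evt2 stand for events returned by earlier enqueue calls):

```cpp
#include <boost/compute/core.hpp>
#include <boost/compute/utility/wait_list.hpp>

namespace compute = boost::compute;

// block the host until both events have completed
void wait_for_both(compute::event evt1, compute::event evt2)
{
    compute::wait_list events = { evt1, evt2 };
    events.wait();
}
```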