Diffstat (limited to 'boost/compute')
46 files changed, 2584 insertions, 325 deletions
diff --git a/boost/compute/algorithm/adjacent_difference.hpp b/boost/compute/algorithm/adjacent_difference.hpp index a8f84e020e..ef13970754 100644 --- a/boost/compute/algorithm/adjacent_difference.hpp +++ b/boost/compute/algorithm/adjacent_difference.hpp @@ -23,33 +23,17 @@ namespace boost { namespace compute { -/// Stores the difference of each pair of consecutive values in the range -/// [\p first, \p last) to the range beginning at \p result. If \p op is not -/// provided, \c minus<T> is used. -/// -/// \param first first element in the input range -/// \param last last element in the input range -/// \param result first element in the output range -/// \param op binary difference function -/// \param queue command queue to perform the operation -/// -/// \return \c OutputIterator to the end of the result range -/// -/// \see adjacent_find() +namespace detail { + template<class InputIterator, class OutputIterator, class BinaryFunction> inline OutputIterator -adjacent_difference(InputIterator first, - InputIterator last, - OutputIterator result, - BinaryFunction op, - command_queue &queue = system::default_queue()) +dispatch_adjacent_difference(InputIterator first, + InputIterator last, + OutputIterator result, + BinaryFunction op, + command_queue &queue = system::default_queue()) { - if(first == last){ - return result; - } - size_t count = detail::iterator_range_size(first, last); - detail::meta_kernel k("adjacent_difference"); k << "const uint i = get_global_id(0);\n" @@ -66,32 +50,66 @@ adjacent_difference(InputIterator first, return result + count; } -/// \overload -template<class InputIterator, class OutputIterator> +} // end detail namespace + +/// Stores the difference of each pair of consecutive values in the range +/// [\p first, \p last) to the range beginning at \p result. If \p op is not +/// provided, \c minus<T> is used. 
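Note on this hunk: both public overloads now funnel into detail::dispatch_adjacent_difference(), and the in-place case (first == result) is handled once through a temporary vector before dispatch. A minimal usage sketch of the public entry points, assuming a default device and queue (error handling omitted):

    #include <boost/compute/system.hpp>
    #include <boost/compute/algorithm/adjacent_difference.hpp>
    #include <boost/compute/container/vector.hpp>

    int main()
    {
        namespace compute = boost::compute;
        compute::command_queue queue = compute::system::default_queue();

        int host[] = { 1, 4, 9, 16, 25 };
        compute::vector<int> input(host, host + 5, queue);
        compute::vector<int> output(5, queue.get_context());

        // result[0] = input[0]; result[i] = input[i] - input[i-1]
        // (minus<int> is the default op)
        compute::adjacent_difference(
            input.begin(), input.end(), output.begin(), queue
        );

        // in-place call: first == result takes the temporary-vector path
        compute::adjacent_difference(
            input.begin(), input.end(), input.begin(), queue
        );
        return 0;
    }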
+/// +/// \param first first element in the input range +/// \param last last element in the input range +/// \param result first element in the output range +/// \param op binary difference function +/// \param queue command queue to perform the operation +/// +/// \return \c OutputIterator to the end of the result range +/// +/// \see adjacent_find() +template<class InputIterator, class OutputIterator, class BinaryFunction> inline OutputIterator adjacent_difference(InputIterator first, InputIterator last, OutputIterator result, + BinaryFunction op, command_queue &queue = system::default_queue()) { typedef typename std::iterator_traits<InputIterator>::value_type value_type; + if(first == last) { + return result; + } + if (first == result) { vector<value_type> temp(detail::iterator_range_size(first, last), queue.get_context()); copy(first, last, temp.begin(), queue); - return ::boost::compute::adjacent_difference( - temp.begin(), temp.end(), result, ::boost::compute::minus<value_type>(), queue + return ::boost::compute::detail::dispatch_adjacent_difference( + temp.begin(), temp.end(), result, op, queue ); } else { - return ::boost::compute::adjacent_difference( - first, last, result, ::boost::compute::minus<value_type>(), queue + return ::boost::compute::detail::dispatch_adjacent_difference( + first, last, result, op, queue ); } } +/// \overload +template<class InputIterator, class OutputIterator> +inline OutputIterator +adjacent_difference(InputIterator first, + InputIterator last, + OutputIterator result, + command_queue &queue = system::default_queue()) +{ + typedef typename std::iterator_traits<InputIterator>::value_type value_type; + + return ::boost::compute::adjacent_difference( + first, last, result, ::boost::compute::minus<value_type>(), queue + ); +} + } // end compute namespace } // end boost namespace diff --git a/boost/compute/algorithm/copy.hpp b/boost/compute/algorithm/copy.hpp index 2a25059bba..7779277b82 100644 --- a/boost/compute/algorithm/copy.hpp +++ b/boost/compute/algorithm/copy.hpp @@ -18,6 +18,7 @@ #include <boost/mpl/and.hpp> #include <boost/mpl/not.hpp> +#include <boost/mpl/or.hpp> #include <boost/compute/buffer.hpp> #include <boost/compute/system.hpp> @@ -26,9 +27,13 @@ #include <boost/compute/algorithm/detail/copy_to_device.hpp> #include <boost/compute/algorithm/detail/copy_to_host.hpp> #include <boost/compute/async/future.hpp> +#include <boost/compute/container/mapped_view.hpp> +#include <boost/compute/detail/device_ptr.hpp> #include <boost/compute/detail/is_contiguous_iterator.hpp> #include <boost/compute/detail/iterator_range_size.hpp> +#include <boost/compute/detail/parameter_cache.hpp> #include <boost/compute/iterator/buffer_iterator.hpp> +#include <boost/compute/type_traits/type_name.hpp> #include <boost/compute/type_traits/is_device_iterator.hpp> namespace boost { @@ -42,13 +47,25 @@ namespace mpl = boost::mpl; template<class InputIterator, class OutputIterator> struct can_copy_with_copy_buffer : mpl::and_< - boost::is_same< - InputIterator, - buffer_iterator<typename InputIterator::value_type> + mpl::or_< + boost::is_same< + InputIterator, + buffer_iterator<typename InputIterator::value_type> + >, + boost::is_same< + InputIterator, + detail::device_ptr<typename InputIterator::value_type> + > >, - boost::is_same< - OutputIterator, - buffer_iterator<typename OutputIterator::value_type> + mpl::or_< + boost::is_same< + OutputIterator, + buffer_iterator<typename OutputIterator::value_type> + >, + boost::is_same< + OutputIterator, + detail::device_ptr<typename 
OutputIterator::value_type> + > >, boost::is_same< typename InputIterator::value_type, @@ -56,40 +73,72 @@ struct can_copy_with_copy_buffer : > >::type {}; -// host -> device +// meta-function returning true if value_types of HostIterator and +// DeviceIterator are same +template<class HostIterator, class DeviceIterator> +struct is_same_value_type : + boost::is_same< + typename boost::remove_cv< + typename std::iterator_traits<HostIterator>::value_type + >::type, + typename boost::remove_cv< + typename DeviceIterator::value_type + >::type + >::type {}; + +// meta-function returning true if value_type of HostIterator is bool +template<class HostIterator> +struct is_bool_value_type : + boost::is_same< + typename boost::remove_cv< + typename std::iterator_traits<HostIterator>::value_type + >::type, + bool + >::type {}; + +// host -> device (async) template<class InputIterator, class OutputIterator> -inline OutputIterator -dispatch_copy(InputIterator first, - InputIterator last, - OutputIterator result, - command_queue &queue, - typename boost::enable_if_c< - !is_device_iterator<InputIterator>::value && - is_device_iterator<OutputIterator>::value - >::type* = 0) +inline future<OutputIterator> +dispatch_copy_async(InputIterator first, + InputIterator last, + OutputIterator result, + command_queue &queue, + typename boost::enable_if< + mpl::and_< + mpl::not_< + is_device_iterator<InputIterator> + >, + is_device_iterator<OutputIterator>, + is_same_value_type<InputIterator, OutputIterator> + > + >::type* = 0) { - if(is_contiguous_iterator<InputIterator>::value){ - return copy_to_device(first, last, result, queue); - } - else { - // for non-contiguous input we first copy the values to - // a temporary std::vector and then copy from there - typedef typename std::iterator_traits<InputIterator>::value_type T; - std::vector<T> vector(first, last); - return copy_to_device(vector.begin(), vector.end(), result, queue); - } + BOOST_STATIC_ASSERT_MSG( + is_contiguous_iterator<InputIterator>::value, + "copy_async() is only supported for contiguous host iterators" + ); + + return copy_to_device_async(first, last, result, queue); } // host -> device (async) +// Type mismatch between InputIterator and OutputIterator value_types template<class InputIterator, class OutputIterator> inline future<OutputIterator> dispatch_copy_async(InputIterator first, InputIterator last, OutputIterator result, command_queue &queue, - typename boost::enable_if_c< - !is_device_iterator<InputIterator>::value && - is_device_iterator<OutputIterator>::value + typename boost::enable_if< + mpl::and_< + mpl::not_< + is_device_iterator<InputIterator> + >, + is_device_iterator<OutputIterator>, + mpl::not_< + is_same_value_type<InputIterator, OutputIterator> + > + > >::type* = 0) { BOOST_STATIC_ASSERT_MSG( @@ -97,32 +146,211 @@ dispatch_copy_async(InputIterator first, "copy_async() is only supported for contiguous host iterators" ); - return copy_to_device_async(first, last, result, queue); + typedef typename std::iterator_traits<InputIterator>::value_type input_type; + + const context &context = queue.get_context(); + size_t count = iterator_range_size(first, last); + + if(count < size_t(1)) { + return future<OutputIterator>(); + } + + // map [first; last) to device and run copy kernel + // on device for copying & casting + ::boost::compute::mapped_view<input_type> mapped_host( + // make sure it's a pointer to constant data + // to force read only mapping + const_cast<const input_type*>( + ::boost::addressof(*first) + ), + count, + context + 
); + return copy_on_device_async( + mapped_host.begin(), mapped_host.end(), result, queue + ); } -// device -> host +// host -> device +// InputIterator is a contiguous iterator template<class InputIterator, class OutputIterator> inline OutputIterator dispatch_copy(InputIterator first, InputIterator last, OutputIterator result, command_queue &queue, - typename boost::enable_if_c< - is_device_iterator<InputIterator>::value && - !is_device_iterator<OutputIterator>::value + typename boost::enable_if< + mpl::and_< + mpl::not_< + is_device_iterator<InputIterator> + >, + is_device_iterator<OutputIterator>, + is_same_value_type<InputIterator, OutputIterator>, + is_contiguous_iterator<InputIterator> + > + >::type* = 0) +{ + return copy_to_device(first, last, result, queue); +} + +// host -> device +// Type mismatch between InputIterator and OutputIterator value_types +// InputIterator is a contiguous iterator +template<class InputIterator, class OutputIterator> +inline OutputIterator +dispatch_copy(InputIterator first, + InputIterator last, + OutputIterator result, + command_queue &queue, + typename boost::enable_if< + mpl::and_< + mpl::not_< + is_device_iterator<InputIterator> + >, + is_device_iterator<OutputIterator>, + mpl::not_< + is_same_value_type<InputIterator, OutputIterator> + >, + is_contiguous_iterator<InputIterator> + > >::type* = 0) { - if(is_contiguous_iterator<OutputIterator>::value){ - return copy_to_host(first, last, result, queue); + typedef typename OutputIterator::value_type output_type; + typedef typename std::iterator_traits<InputIterator>::value_type input_type; + + const device &device = queue.get_device(); + + // loading parameters + std::string cache_key = + std::string("__boost_compute_copy_to_device_") + + type_name<input_type>() + "_" + type_name<output_type>(); + boost::shared_ptr<parameter_cache> parameters = + detail::parameter_cache::get_global_cache(device); + + size_t map_copy_threshold; + size_t direct_copy_threshold; + + // calculate default values of thresholds + if (device.type() & device::gpu) { + // GPUs + map_copy_threshold = 524288; // 0.5 MB + direct_copy_threshold = 52428800; // 50 MB } else { - // for non-contiguous input we first copy the values to - // a temporary std::vector and then copy from there - typedef typename std::iterator_traits<InputIterator>::value_type T; - std::vector<T> vector(iterator_range_size(first, last)); - copy_to_host(first, last, vector.begin(), queue); - return std::copy(vector.begin(), vector.end(), result); + // CPUs and other devices + map_copy_threshold = 134217728; // 128 MB + direct_copy_threshold = 0; // it's never efficient for CPUs } + + // load thresholds + map_copy_threshold = + parameters->get( + cache_key, "map_copy_threshold", map_copy_threshold + ); + direct_copy_threshold = + parameters->get( + cache_key, "direct_copy_threshold", direct_copy_threshold + ); + + // select copy method based on thresholds & input_size_bytes + size_t count = iterator_range_size(first, last); + size_t input_size_bytes = count * sizeof(input_type); + + // [0; map_copy_threshold) -> copy_to_device_map() + if(input_size_bytes < map_copy_threshold) { + return copy_to_device_map(first, last, result, queue); + } + // [map_copy_threshold; direct_copy_threshold) -> convert [first; last) + // on host and then perform copy_to_device() + else if(input_size_bytes < direct_copy_threshold) { + std::vector<output_type> vector(first, last); + return copy_to_device(vector.begin(), vector.end(), result, queue); + } + + // [direct_copy_threshold; 
inf) -> map [first; last) to device and + // run copy kernel on device for copying & casting + // At this point we are sure that count > 1 (first != last). + + // Perform async copy to device, wait for it to be finished and + // return the result. + // At this point we are sure that count > 1 (first != last), so event + // returned by dispatch_copy_async() must be valid. + return dispatch_copy_async(first, last, result, queue).get(); +} + +// host -> device +// InputIterator is NOT a contiguous iterator +template<class InputIterator, class OutputIterator> +inline OutputIterator +dispatch_copy(InputIterator first, + InputIterator last, + OutputIterator result, + command_queue &queue, + typename boost::enable_if< + mpl::and_< + mpl::not_< + is_device_iterator<InputIterator> + >, + is_device_iterator<OutputIterator>, + mpl::not_< + is_contiguous_iterator<InputIterator> + > + > + >::type* = 0) +{ + typedef typename OutputIterator::value_type output_type; + typedef typename std::iterator_traits<InputIterator>::value_type input_type; + + const device &device = queue.get_device(); + + // loading parameters + std::string cache_key = + std::string("__boost_compute_copy_to_device_") + + type_name<input_type>() + "_" + type_name<output_type>(); + boost::shared_ptr<parameter_cache> parameters = + detail::parameter_cache::get_global_cache(device); + + size_t map_copy_threshold; + size_t direct_copy_threshold; + + // calculate default values of thresholds + if (device.type() & device::gpu) { + // GPUs + map_copy_threshold = 524288; // 0.5 MB + direct_copy_threshold = 52428800; // 50 MB + } + else { + // CPUs and other devices + map_copy_threshold = 134217728; // 128 MB + direct_copy_threshold = 0; // it's never efficient for CPUs + } + + // load thresholds + map_copy_threshold = + parameters->get( + cache_key, "map_copy_threshold", map_copy_threshold + ); + direct_copy_threshold = + parameters->get( + cache_key, "direct_copy_threshold", direct_copy_threshold + ); + + // select copy method based on thresholds & input_size_bytes + size_t input_size = iterator_range_size(first, last); + size_t input_size_bytes = input_size * sizeof(input_type); + + // [0; map_copy_threshold) -> copy_to_device_map() + // + // if direct_copy_threshold is less than map_copy_threshold + // copy_to_device_map() is used for every input + if(input_size_bytes < map_copy_threshold + || direct_copy_threshold <= map_copy_threshold) { + return copy_to_device_map(first, last, result, queue); + } + // [map_copy_threshold; inf) -> convert [first; last) + // on host and then perform copy_to_device() + std::vector<output_type> vector(first, last); + return copy_to_device(vector.begin(), vector.end(), result, queue); } // device -> host (async) @@ -132,9 +360,14 @@ dispatch_copy_async(InputIterator first, InputIterator last, OutputIterator result, command_queue &queue, - typename boost::enable_if_c< - is_device_iterator<InputIterator>::value && - !is_device_iterator<OutputIterator>::value + typename boost::enable_if< + mpl::and_< + is_device_iterator<InputIterator>, + mpl::not_< + is_device_iterator<OutputIterator> + >, + is_same_value_type<OutputIterator, InputIterator> + > >::type* = 0) { BOOST_STATIC_ASSERT_MSG( @@ -145,6 +378,267 @@ dispatch_copy_async(InputIterator first, return copy_to_host_async(first, last, result, queue); } +// device -> host (async) +// Type mismatch between InputIterator and OutputIterator value_types +template<class InputIterator, class OutputIterator> +inline future<OutputIterator> 
+dispatch_copy_async(InputIterator first, + InputIterator last, + OutputIterator result, + command_queue &queue, + typename boost::enable_if< + mpl::and_< + is_device_iterator<InputIterator>, + mpl::not_< + is_device_iterator<OutputIterator> + >, + mpl::not_< + is_same_value_type<OutputIterator, InputIterator> + > + > + >::type* = 0) +{ + BOOST_STATIC_ASSERT_MSG( + is_contiguous_iterator<OutputIterator>::value, + "copy_async() is only supported for contiguous host iterators" + ); + + typedef typename std::iterator_traits<OutputIterator>::value_type output_type; + const context &context = queue.get_context(); + size_t count = iterator_range_size(first, last); + + if(count < size_t(1)) { + return future<OutputIterator>(); + } + + // map host memory to device + buffer mapped_host( + context, + count * sizeof(output_type), + buffer::write_only | buffer::use_host_ptr, + static_cast<void*>( + ::boost::addressof(*result) + ) + ); + // copy async on device + ::boost::compute::future<buffer_iterator<output_type> > future = + copy_on_device_async( + first, + last, + make_buffer_iterator<output_type>(mapped_host), + queue + ); + // update host memory asynchronously by maping and unmaping memory + event map_event; + void* ptr = queue.enqueue_map_buffer_async( + mapped_host, + CL_MAP_READ, + 0, + count * sizeof(output_type), + map_event, + future.get_event() + ); + event unmap_event = + queue.enqueue_unmap_buffer(mapped_host, ptr, map_event); + return make_future(result + count, unmap_event); +} + +// device -> host +// OutputIterator is a contiguous iterator +template<class InputIterator, class OutputIterator> +inline OutputIterator +dispatch_copy(InputIterator first, + InputIterator last, + OutputIterator result, + command_queue &queue, + typename boost::enable_if< + mpl::and_< + is_device_iterator<InputIterator>, + mpl::not_< + is_device_iterator<OutputIterator> + >, + is_same_value_type<OutputIterator, InputIterator>, + is_contiguous_iterator<OutputIterator>, + mpl::not_< + is_bool_value_type<OutputIterator> + > + > + >::type* = 0) +{ + return copy_to_host(first, last, result, queue); +} + +// device -> host +// Type mismatch between InputIterator and OutputIterator value_types +// OutputIterator is NOT a contiguous iterator or value_type of OutputIterator +// is a boolean type. 
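The dedicated boolean branch below exists because std::vector<bool> is bit-packed: dereferencing its iterator yields a proxy object, so there is no contiguous bool* that ::boost::addressof(*result) could produce for mapping. Ordinary calls reach this dispatch; a small sketch, assuming a default queue:

    #include <vector>
    #include <boost/compute/system.hpp>
    #include <boost/compute/algorithm/copy.hpp>
    #include <boost/compute/container/vector.hpp>

    int main()
    {
        namespace compute = boost::compute;
        compute::command_queue queue = compute::system::default_queue();

        int host[] = { 0, 1, 0, 2 };
        compute::vector<int> flags(host, host + 4, queue);

        // std::vector<bool> has no contiguous storage to map, so this
        // copy routes through the bool-aware overload below: it either
        // maps the device buffer and std::copy()s through the proxy
        // iterator, or stages through a temporary host vector -- never
        // a raw memcpy into &*result.
        std::vector<bool> result(4);
        compute::copy(flags.begin(), flags.end(), result.begin(), queue);
        return 0;
    }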
+template<class InputIterator, class OutputIterator> +inline OutputIterator +dispatch_copy(InputIterator first, + InputIterator last, + OutputIterator result, + command_queue &queue, + typename boost::enable_if< + mpl::and_< + is_device_iterator<InputIterator>, + mpl::not_< + is_device_iterator<OutputIterator> + >, + mpl::or_< + mpl::not_< + is_contiguous_iterator<OutputIterator> + >, + is_bool_value_type<OutputIterator> + > + > + >::type* = 0) +{ + typedef typename std::iterator_traits<OutputIterator>::value_type output_type; + typedef typename InputIterator::value_type input_type; + + const device &device = queue.get_device(); + + // loading parameters + std::string cache_key = + std::string("__boost_compute_copy_to_host_") + + type_name<input_type>() + "_" + type_name<output_type>(); + boost::shared_ptr<parameter_cache> parameters = + detail::parameter_cache::get_global_cache(device); + + size_t map_copy_threshold; + size_t direct_copy_threshold; + + // calculate default values of thresholds + if (device.type() & device::gpu) { + // GPUs + map_copy_threshold = 33554432; // 30 MB + direct_copy_threshold = 0; // it's never efficient for GPUs + } + else { + // CPUs and other devices + map_copy_threshold = 134217728; // 128 MB + direct_copy_threshold = 0; // it's never efficient for CPUs + } + + // load thresholds + map_copy_threshold = + parameters->get( + cache_key, "map_copy_threshold", map_copy_threshold + ); + direct_copy_threshold = + parameters->get( + cache_key, "direct_copy_threshold", direct_copy_threshold + ); + + // select copy method based on thresholds & input_size_bytes + size_t count = iterator_range_size(first, last); + size_t input_size_bytes = count * sizeof(input_type); + + // [0; map_copy_threshold) -> copy_to_host_map() + // + // if direct_copy_threshold is less than map_copy_threshold + // copy_to_host_map() is used for every input + if(input_size_bytes < map_copy_threshold + || direct_copy_threshold <= map_copy_threshold) { + return copy_to_host_map(first, last, result, queue); + } + // [map_copy_threshold; inf) -> copy [first;last) to temporary vector + // then copy (and convert) to result using std::copy() + std::vector<input_type> vector(count); + copy_to_host(first, last, vector.begin(), queue); + return std::copy(vector.begin(), vector.end(), result); +} + +// device -> host +// Type mismatch between InputIterator and OutputIterator value_types +// OutputIterator is a contiguous iterator +// value_type of OutputIterator is NOT a boolean type +template<class InputIterator, class OutputIterator> +inline OutputIterator +dispatch_copy(InputIterator first, + InputIterator last, + OutputIterator result, + command_queue &queue, + typename boost::enable_if< + mpl::and_< + is_device_iterator<InputIterator>, + mpl::not_< + is_device_iterator<OutputIterator> + >, + mpl::not_< + is_same_value_type<OutputIterator, InputIterator> + >, + is_contiguous_iterator<OutputIterator>, + mpl::not_< + is_bool_value_type<OutputIterator> + > + > + >::type* = 0) +{ + typedef typename std::iterator_traits<OutputIterator>::value_type output_type; + typedef typename InputIterator::value_type input_type; + + const device &device = queue.get_device(); + + // loading parameters + std::string cache_key = + std::string("__boost_compute_copy_to_host_") + + type_name<input_type>() + "_" + type_name<output_type>(); + boost::shared_ptr<parameter_cache> parameters = + detail::parameter_cache::get_global_cache(device); + + size_t map_copy_threshold; + size_t direct_copy_threshold; + + // calculate 
default values of thresholds + if (device.type() & device::gpu) { + // GPUs + map_copy_threshold = 524288; // 0.5 MB + direct_copy_threshold = 52428800; // 50 MB + } + else { + // CPUs and other devices + map_copy_threshold = 134217728; // 128 MB + direct_copy_threshold = 0; // it's never efficient for CPUs + } + + // load thresholds + map_copy_threshold = + parameters->get( + cache_key, "map_copy_threshold", map_copy_threshold + ); + direct_copy_threshold = + parameters->get( + cache_key, "direct_copy_threshold", direct_copy_threshold + ); + + // select copy method based on thresholds & input_size_bytes + size_t count = iterator_range_size(first, last); + size_t input_size_bytes = count * sizeof(input_type); + + // [0; map_copy_threshold) -> copy_to_host_map() + if(input_size_bytes < map_copy_threshold) { + return copy_to_host_map(first, last, result, queue); + } + // [map_copy_threshold; direct_copy_threshold) -> copy [first;last) to + // temporary vector then copy (and convert) to result using std::copy() + else if(input_size_bytes < direct_copy_threshold) { + std::vector<input_type> vector(count); + copy_to_host(first, last, vector.begin(), queue); + return std::copy(vector.begin(), vector.end(), result); + } + + // [direct_copy_threshold; inf) -> map [result; result + input_size) to + // device and run copy kernel on device for copying & casting + // map host memory to device. + + // Perform async copy to host, wait for it to be finished and + // return the result. + // At this point we are sure that count > 1 (first != last), so event + // returned by dispatch_copy_async() must be valid. + return dispatch_copy_async(first, last, result, queue).get(); +} + // device -> device template<class InputIterator, class OutputIterator> inline OutputIterator diff --git a/boost/compute/algorithm/detail/copy_on_device.hpp b/boost/compute/algorithm/detail/copy_on_device.hpp index 0bcee27ed5..8738c8c0b4 100644 --- a/boost/compute/algorithm/detail/copy_on_device.hpp +++ b/boost/compute/algorithm/detail/copy_on_device.hpp @@ -22,84 +22,96 @@ #include <boost/compute/detail/meta_kernel.hpp> #include <boost/compute/detail/parameter_cache.hpp> #include <boost/compute/detail/work_size.hpp> +#include <boost/compute/detail/vendor.hpp> namespace boost { namespace compute { namespace detail { -inline size_t pick_copy_work_group_size(size_t n, const device &device) +template<class InputIterator, class OutputIterator> +inline event copy_on_device_cpu(InputIterator first, + OutputIterator result, + size_t count, + command_queue &queue) { - (void) device; - - if(n % 32 == 0) return 32; - else if(n % 16 == 0) return 16; - else if(n % 8 == 0) return 8; - else if(n % 4 == 0) return 4; - else if(n % 2 == 0) return 2; - else return 1; + meta_kernel k("copy"); + const device& device = queue.get_device(); + + k << + "uint block = " << + "(uint)ceil(((float)count)/get_global_size(0));\n" << + "uint index = get_global_id(0) * block;\n" << + "uint end = min(count, index + block);\n" << + "while(index < end){\n" << + result[k.var<uint_>("index")] << '=' << + first[k.var<uint_>("index")] << ";\n" << + "index++;\n" << + "}\n"; + + k.add_set_arg<const uint_>("count", static_cast<uint_>(count)); + + size_t global_work_size = device.compute_units(); + if(count <= 1024) global_work_size = 1; + return k.exec_1d(queue, 0, global_work_size); } template<class InputIterator, class OutputIterator> -class copy_kernel : public meta_kernel +inline event copy_on_device_gpu(InputIterator first, + OutputIterator result, + size_t count, + 
command_queue &queue) { -public: - copy_kernel(const device &device) - : meta_kernel("copy") - { - m_count = 0; - - typedef typename std::iterator_traits<InputIterator>::value_type input_type; - - boost::shared_ptr<parameter_cache> parameters = - detail::parameter_cache::get_global_cache(device); - - std::string cache_key = - "__boost_copy_kernel_" + boost::lexical_cast<std::string>(sizeof(input_type)); + typedef typename std::iterator_traits<InputIterator>::value_type input_type; + + const device& device = queue.get_device(); + boost::shared_ptr<parameter_cache> parameters = + detail::parameter_cache::get_global_cache(device); + std::string cache_key = + "__boost_copy_kernel_" + boost::lexical_cast<std::string>(sizeof(input_type)); + + uint_ vpt = parameters->get(cache_key, "vpt", 4); + uint_ tpb = parameters->get(cache_key, "tpb", 128); + + meta_kernel k("copy"); + k << + "uint index = get_local_id(0) + " << + "(" << vpt * tpb << " * get_group_id(0));\n" << + "for(uint i = 0; i < " << vpt << "; i++){\n" << + " if(index < count){\n" << + result[k.var<uint_>("index")] << '=' << + first[k.var<uint_>("index")] << ";\n" << + " index += " << tpb << ";\n" + " }\n" + "}\n"; + + k.add_set_arg<const uint_>("count", static_cast<uint_>(count)); + size_t global_work_size = calculate_work_size(count, vpt, tpb); + return k.exec_1d(queue, 0, global_work_size, tpb); +} - m_vpt = parameters->get(cache_key, "vpt", 4); - m_tpb = parameters->get(cache_key, "tpb", 128); - } +template<class InputIterator, class OutputIterator> +inline event dispatch_copy_on_device(InputIterator first, + InputIterator last, + OutputIterator result, + command_queue &queue) +{ + const size_t count = detail::iterator_range_size(first, last); - void set_range(InputIterator first, - InputIterator last, - OutputIterator result) - { - m_count_arg = add_arg<uint_>("count"); - - *this << - "uint index = get_local_id(0) + " << - "(" << m_vpt * m_tpb << " * get_group_id(0));\n" << - "for(uint i = 0; i < " << m_vpt << "; i++){\n" << - " if(index < count){\n" << - result[expr<uint_>("index")] << '=' << - first[expr<uint_>("index")] << ";\n" << - " index += " << m_tpb << ";\n" - " }\n" - "}\n"; - - m_count = detail::iterator_range_size(first, last); + if(count == 0){ + // nothing to do + return event(); } - event exec(command_queue &queue) + const device& device = queue.get_device(); + // copy_on_device_cpu() does not work for CPU on Apple platform + // due to bug in its compiler. 
+ // See https://github.com/boostorg/compute/pull/626 + if((device.type() & device::cpu) && !is_apple_platform_device(device)) { - if(m_count == 0){ - // nothing to do - return event(); - } - - size_t global_work_size = calculate_work_size(m_count, m_vpt, m_tpb); - - set_arg(m_count_arg, uint_(m_count)); - - return exec_1d(queue, 0, global_work_size, m_tpb); + return copy_on_device_cpu(first, result, count, queue); } - -private: - size_t m_count; - size_t m_count_arg; - uint_ m_vpt; - uint_ m_tpb; -}; + return copy_on_device_gpu(first, result, count, queue); +} template<class InputIterator, class OutputIterator> inline OutputIterator copy_on_device(InputIterator first, @@ -107,13 +119,7 @@ inline OutputIterator copy_on_device(InputIterator first, OutputIterator result, command_queue &queue) { - const device &device = queue.get_device(); - - copy_kernel<InputIterator, OutputIterator> kernel(device); - - kernel.set_range(first, last, result); - kernel.exec(queue); - + dispatch_copy_on_device(first, last, result, queue); return result + std::distance(first, last); } @@ -134,13 +140,7 @@ inline future<OutputIterator> copy_on_device_async(InputIterator first, OutputIterator result, command_queue &queue) { - const device &device = queue.get_device(); - - copy_kernel<InputIterator, OutputIterator> kernel(device); - - kernel.set_range(first, last, result); - event event_ = kernel.exec(queue); - + event event_ = dispatch_copy_on_device(first, last, result, queue); return make_future(result + std::distance(first, last), event_); } @@ -172,7 +172,7 @@ inline future<svm_ptr<T> > copy_on_device_async(svm_ptr<T> first, { size_t count = iterator_range_size(first, last); if(count == 0){ - return result; + return future<svm_ptr<T> >(); } event event_ = queue.enqueue_svm_memcpy_async( diff --git a/boost/compute/algorithm/detail/copy_to_device.hpp b/boost/compute/algorithm/detail/copy_to_device.hpp index 90545fb4ed..bce5975f53 100644 --- a/boost/compute/algorithm/detail/copy_to_device.hpp +++ b/boost/compute/algorithm/detail/copy_to_device.hpp @@ -53,6 +53,49 @@ inline DeviceIterator copy_to_device(HostIterator first, } template<class HostIterator, class DeviceIterator> +inline DeviceIterator copy_to_device_map(HostIterator first, + HostIterator last, + DeviceIterator result, + command_queue &queue) +{ + typedef typename + std::iterator_traits<DeviceIterator>::value_type + value_type; + typedef typename + std::iterator_traits<DeviceIterator>::difference_type + difference_type; + + size_t count = iterator_range_size(first, last); + if(count == 0){ + return result; + } + + size_t offset = result.get_index(); + + // map result buffer to host + value_type *pointer = static_cast<value_type*>( + queue.enqueue_map_buffer( + result.get_buffer(), + CL_MAP_WRITE, + offset * sizeof(value_type), + count * sizeof(value_type) + ) + ); + + // copy [first; last) to result buffer + std::copy(first, last, pointer); + + // unmap result buffer + boost::compute::event unmap_event = queue.enqueue_unmap_buffer( + result.get_buffer(), + static_cast<void*>(pointer) + ); + unmap_event.wait(); + + return result + static_cast<difference_type>(count); +} + +template<class HostIterator, class DeviceIterator> inline future<DeviceIterator> copy_to_device_async(HostIterator first, HostIterator last, DeviceIterator result, @@ -109,7 +152,7 @@ inline future<svm_ptr<T> > copy_to_device_async(HostIterator first, { size_t count = iterator_range_size(first, last); if(count == 0){ - return result; + return future<svm_ptr<T> >(); } event event_ = 
queue.enqueue_svm_memcpy_async( @@ -118,6 +161,29 @@ inline future<svm_ptr<T> > copy_to_device_async(HostIterator first, return make_future(result + count, event_); } + +template<class HostIterator, class T> +inline svm_ptr<T> copy_to_device_map(HostIterator first, + HostIterator last, + svm_ptr<T> result, + command_queue &queue) +{ + size_t count = iterator_range_size(first, last); + if(count == 0){ + return result; + } + + // map + queue.enqueue_svm_map(result.get(), count * sizeof(T), CL_MAP_WRITE); + + // copy [first; last) to result buffer + std::copy(first, last, static_cast<T*>(result.get())); + + // unmap result + queue.enqueue_svm_unmap(result.get()).wait(); + + return result + count; +} #endif // CL_VERSION_2_0 } // end detail namespace diff --git a/boost/compute/algorithm/detail/copy_to_host.hpp b/boost/compute/algorithm/detail/copy_to_host.hpp index b889e0c871..d770a996ef 100644 --- a/boost/compute/algorithm/detail/copy_to_host.hpp +++ b/boost/compute/algorithm/detail/copy_to_host.hpp @@ -51,17 +51,51 @@ inline HostIterator copy_to_host(DeviceIterator first, return iterator_plus_distance(result, count); } -// copy_to_host() specialization for std::vector<bool> -template<class DeviceIterator> -inline std::vector<bool>::iterator -copy_to_host(DeviceIterator first, - DeviceIterator last, - std::vector<bool>::iterator result, - command_queue &queue) +template<class DeviceIterator, class HostIterator> +inline HostIterator copy_to_host_map(DeviceIterator first, + DeviceIterator last, + HostIterator result, + command_queue &queue) { - std::vector<uint8_t> temp(std::distance(first, last)); - copy_to_host(first, last, temp.begin(), queue); - return std::copy(temp.begin(), temp.end(), result); + typedef typename + std::iterator_traits<DeviceIterator>::value_type + value_type; + typedef typename + std::iterator_traits<DeviceIterator>::difference_type + difference_type; + + size_t count = iterator_range_size(first, last); + if(count == 0){ + return result; + } + + size_t offset = first.get_index(); + + // map [first; last) buffer to host + value_type *pointer = static_cast<value_type*>( + queue.enqueue_map_buffer( + first.get_buffer(), + CL_MAP_READ, + offset * sizeof(value_type), + count * sizeof(value_type) + ) + ); + + // copy [first; last) to result buffer + std::copy( + pointer, + pointer + static_cast<difference_type>(count), + result + ); + + // unmap [first; last) + boost::compute::event unmap_event = queue.enqueue_unmap_buffer( + first.get_buffer(), + static_cast<void*>(pointer) + ); + unmap_event.wait(); + + return iterator_plus_distance(result, count); } template<class DeviceIterator, class HostIterator> @@ -119,7 +153,7 @@ inline future<HostIterator> copy_to_host_async(svm_ptr<T> first, { size_t count = iterator_range_size(first, last); if(count == 0){ - return result; + return future<HostIterator>(); } event event_ = queue.enqueue_svm_memcpy_async( @@ -128,6 +162,33 @@ inline future<HostIterator> copy_to_host_async(svm_ptr<T> first, return make_future(iterator_plus_distance(result, count), event_); } + +template<class T, class HostIterator> +inline HostIterator copy_to_host_map(svm_ptr<T> first, + svm_ptr<T> last, + HostIterator result, + command_queue &queue) +{ + size_t count = iterator_range_size(first, last); + if(count == 0){ + return result; + } + + // map + queue.enqueue_svm_map(first.get(), count * sizeof(T), CL_MAP_READ); + + // copy [first; last) to result + std::copy( + static_cast<T*>(first.get()), + static_cast<T*>(last.get()), + result + ); + + // unmap [first; 
last) + queue.enqueue_svm_unmap(first.get()).wait(); + + return iterator_plus_distance(result, count); +} #endif // CL_VERSION_2_0 } // end detail namespace diff --git a/boost/compute/algorithm/detail/find_extrema.hpp b/boost/compute/algorithm/detail/find_extrema.hpp index 6e756c3904..eef2e36c3c 100644 --- a/boost/compute/algorithm/detail/find_extrema.hpp +++ b/boost/compute/algorithm/detail/find_extrema.hpp @@ -12,6 +12,7 @@ #define BOOST_COMPUTE_ALGORITHM_DETAIL_FIND_EXTREMA_HPP #include <boost/compute/detail/iterator_range_size.hpp> +#include <boost/compute/algorithm/detail/find_extrema_on_cpu.hpp> #include <boost/compute/algorithm/detail/find_extrema_with_reduce.hpp> #include <boost/compute/algorithm/detail/find_extrema_with_atomics.hpp> #include <boost/compute/algorithm/detail/serial_find_extrema.hpp> @@ -36,12 +37,17 @@ inline InputIterator find_extrema(InputIterator first, const device &device = queue.get_device(); + // CPU + if(device.type() & device::cpu) { + return find_extrema_on_cpu(first, last, compare, find_minimum, queue); + } + + // GPU // use serial method for small inputs - // and when device is a CPU - if(count < 512 || (device.type() & device::cpu)){ + if(count < 512) + { return serial_find_extrema(first, last, compare, find_minimum, queue); } - // find_extrema_with_reduce() is used only if requirements are met if(find_extrema_with_reduce_requirements_met(first, last, queue)) { diff --git a/boost/compute/algorithm/detail/find_extrema_on_cpu.hpp b/boost/compute/algorithm/detail/find_extrema_on_cpu.hpp new file mode 100644 index 0000000000..9363ad5837 --- /dev/null +++ b/boost/compute/algorithm/detail/find_extrema_on_cpu.hpp @@ -0,0 +1,138 @@ +//---------------------------------------------------------------------------// +// Copyright (c) 2016 Jakub Szuppe <j.szuppe@gmail.com> +// +// Distributed under the Boost Software License, Version 1.0 +// See accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt +// +// See http://boostorg.github.com/compute for more information. 
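find_extrema() is the engine behind the public min_element()/max_element() algorithms, so the new CPU branch above is reached by ordinary calls such as this sketch (default queue assumed; on a CPU device the dispatch now selects find_extrema_on_cpu(), with serial_find_extrema() as the small-input fallback):

    #include <boost/compute/system.hpp>
    #include <boost/compute/algorithm/max_element.hpp>
    #include <boost/compute/container/vector.hpp>

    int main()
    {
        namespace compute = boost::compute;
        compute::command_queue queue = compute::system::default_queue();

        float host[] = { 3.f, 9.f, 1.f, 4.f };
        compute::vector<float> vec(host, host + 4, queue);

        // dispatches through detail::find_extrema(); on a CPU device
        // this now uses the find_extrema_on_cpu() path
        compute::vector<float>::iterator max_it =
            compute::max_element(vec.begin(), vec.end(), queue);
        (void) max_it;
        return 0;
    }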
+//---------------------------------------------------------------------------// + +#ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_FIND_EXTREMA_ON_CPU_HPP +#define BOOST_COMPUTE_ALGORITHM_DETAIL_FIND_EXTREMA_ON_CPU_HPP + +#include <algorithm> + +#include <boost/compute/algorithm/detail/find_extrema_with_reduce.hpp> +#include <boost/compute/algorithm/detail/find_extrema_with_atomics.hpp> +#include <boost/compute/algorithm/detail/serial_find_extrema.hpp> +#include <boost/compute/detail/iterator_range_size.hpp> +#include <boost/compute/iterator/buffer_iterator.hpp> + +namespace boost { +namespace compute { +namespace detail { + +template<class InputIterator, class Compare> +inline InputIterator find_extrema_on_cpu(InputIterator first, + InputIterator last, + Compare compare, + const bool find_minimum, + command_queue &queue) +{ + typedef typename std::iterator_traits<InputIterator>::value_type input_type; + typedef typename std::iterator_traits<InputIterator>::difference_type difference_type; + size_t count = iterator_range_size(first, last); + + const device &device = queue.get_device(); + const uint_ compute_units = queue.get_device().compute_units(); + + boost::shared_ptr<parameter_cache> parameters = + detail::parameter_cache::get_global_cache(device); + std::string cache_key = + "__boost_find_extrema_cpu_" + + boost::lexical_cast<std::string>(sizeof(input_type)); + + // for inputs smaller than serial_find_extrema_threshold + // serial_find_extrema algorithm is used + uint_ serial_find_extrema_threshold = parameters->get( + cache_key, + "serial_find_extrema_threshold", + 16384 * sizeof(input_type) + ); + serial_find_extrema_threshold = + (std::max)(serial_find_extrema_threshold, uint_(2 * compute_units)); + + const context &context = queue.get_context(); + if(count < serial_find_extrema_threshold) { + return serial_find_extrema(first, last, compare, find_minimum, queue); + } + + meta_kernel k("find_extrema_on_cpu"); + buffer output(context, sizeof(input_type) * compute_units); + buffer output_idx( + context, sizeof(uint_) * compute_units, + buffer::read_write | buffer::alloc_host_ptr + ); + + size_t count_arg = k.add_arg<uint_>("count"); + size_t output_arg = + k.add_arg<input_type *>(memory_object::global_memory, "output"); + size_t output_idx_arg = + k.add_arg<uint_ *>(memory_object::global_memory, "output_idx"); + + k << + "uint block = " << + "(uint)ceil(((float)count)/get_global_size(0));\n" << + "uint index = get_global_id(0) * block;\n" << + "uint end = min(count, index + block);\n" << + + "uint value_index = index;\n" << + k.decl<input_type>("value") << " = " << first[k.var<uint_>("index")] << ";\n" << + + "index++;\n" << + "while(index < end){\n" << + k.decl<input_type>("candidate") << + " = " << first[k.var<uint_>("index")] << ";\n" << + "#ifndef BOOST_COMPUTE_FIND_MAXIMUM\n" << + "bool compare = " << compare(k.var<input_type>("candidate"), + k.var<input_type>("value")) << ";\n" << + "#else\n" << + "bool compare = " << compare(k.var<input_type>("value"), + k.var<input_type>("candidate")) << ";\n" << + "#endif\n" << + "value = compare ? candidate : value;\n" << + "value_index = compare ? 
index : value_index;\n" << + "index++;\n" << + "}\n" << + "output[get_global_id(0)] = value;\n" << + "output_idx[get_global_id(0)] = value_index;\n"; + + size_t global_work_size = compute_units; + std::string options; + if(!find_minimum){ + options = "-DBOOST_COMPUTE_FIND_MAXIMUM"; + } + kernel kernel = k.compile(context, options); + + kernel.set_arg(count_arg, static_cast<uint_>(count)); + kernel.set_arg(output_arg, output); + kernel.set_arg(output_idx_arg, output_idx); + queue.enqueue_1d_range_kernel(kernel, 0, global_work_size, 0); + + buffer_iterator<input_type> result = serial_find_extrema( + make_buffer_iterator<input_type>(output), + make_buffer_iterator<input_type>(output, global_work_size), + compare, + find_minimum, + queue + ); + + uint_* output_idx_host_ptr = + static_cast<uint_*>( + queue.enqueue_map_buffer( + output_idx, command_queue::map_read, + 0, global_work_size * sizeof(uint_) + ) + ); + + difference_type extremum_idx = + static_cast<difference_type>(*(output_idx_host_ptr + result.get_index())); + return first + extremum_idx; +} + +} // end detail namespace +} // end compute namespace +} // end boost namespace + +#endif // BOOST_COMPUTE_ALGORITHM_DETAIL_FIND_EXTREMA_ON_CPU_HPP diff --git a/boost/compute/algorithm/detail/find_extrema_with_reduce.hpp b/boost/compute/algorithm/detail/find_extrema_with_reduce.hpp index 1fbb7dee19..8f2a83c38b 100644 --- a/boost/compute/algorithm/detail/find_extrema_with_reduce.hpp +++ b/boost/compute/algorithm/detail/find_extrema_with_reduce.hpp @@ -128,7 +128,7 @@ inline void find_extrema_with_reduce(InputIterator input, // Next element k.decl<input_type>("next") << " = " << input[k.var<uint_>("idx")] << ";\n" << "#ifdef BOOST_COMPUTE_USE_INPUT_IDX\n" << - k.decl<input_type>("next_idx") << " = " << input_idx[k.var<uint_>("idx")] << ";\n" << + k.decl<uint_>("next_idx") << " = " << input_idx[k.var<uint_>("idx")] << ";\n" << "#endif\n" << // Comparison between currently best element (acc) and next element diff --git a/boost/compute/algorithm/detail/merge_sort_on_gpu.hpp b/boost/compute/algorithm/detail/merge_sort_on_gpu.hpp new file mode 100644 index 0000000000..e62c6beb8d --- /dev/null +++ b/boost/compute/algorithm/detail/merge_sort_on_gpu.hpp @@ -0,0 +1,590 @@ +//---------------------------------------------------------------------------// +// Copyright (c) 2016 Jakub Szuppe <j.szuppe@gmail.com> +// +// Distributed under the Boost Software License, Version 1.0 +// See accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt +// +// See http://boostorg.github.com/compute for more information. 
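The find_extrema_on_cpu() kernel above is a two-stage reduction: each compute unit scans one contiguous block and emits a single (value, index) candidate, then serial_find_extrema() reduces the per-unit candidates and the mapped output_idx buffer recovers the original index. A host-side model of that shape for the minimum case (plain C++, illustrative only; find_min_index_two_stage and p are names invented here, with p playing the role of compute_units):

    #include <algorithm>
    #include <cstddef>
    #include <utility>
    #include <vector>

    // stage 1: one (value, index) candidate per "compute unit";
    // stage 2: serial reduction over the candidates
    std::size_t find_min_index_two_stage(const std::vector<int> &in,
                                         std::size_t p)
    {
        const std::size_t n = in.size();            // assumes n >= 1, p >= 1
        const std::size_t block = (n + p - 1) / p;  // ceil(n / p)
        std::vector<std::pair<int, std::size_t> > candidates;
        for(std::size_t u = 0; u < p && u * block < n; u++) {
            std::size_t index = u * block;          // u ~ get_global_id(0)
            const std::size_t end = (std::min)(n, index + block);
            std::size_t best = index;
            for(++index; index < end; index++) {
                if(in[index] < in[best]) { best = index; }
            }
            candidates.push_back(std::make_pair(in[best], best));
        }
        std::size_t winner = 0;                     // serial_find_extrema stage
        for(std::size_t c = 1; c < candidates.size(); c++) {
            if(candidates[c].first < candidates[winner].first) { winner = c; }
        }
        return candidates[winner].second;
    }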
+//---------------------------------------------------------------------------// + +#ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_MERGE_SORT_ON_GPU_HPP_ +#define BOOST_COMPUTE_ALGORITHM_DETAIL_MERGE_SORT_ON_GPU_HPP_ + +#include <algorithm> + +#include <boost/compute/kernel.hpp> +#include <boost/compute/program.hpp> +#include <boost/compute/command_queue.hpp> +#include <boost/compute/container/vector.hpp> +#include <boost/compute/memory/local_buffer.hpp> +#include <boost/compute/detail/meta_kernel.hpp> +#include <boost/compute/detail/iterator_range_size.hpp> + +namespace boost { +namespace compute { +namespace detail { + +template<class KeyType, class ValueType> +inline size_t pick_bitonic_block_sort_block_size(size_t proposed_wg, + size_t lmem_size, + bool sort_by_key) +{ + size_t n = proposed_wg; + + size_t lmem_required = n * sizeof(KeyType); + if(sort_by_key) { + lmem_required += n * sizeof(ValueType); + } + + // try to force at least 4 work-groups of >64 elements + // for better occupancy + while(lmem_size < (lmem_required * 4) && (n > 64)) { + n /= 2; + lmem_required = n * sizeof(KeyType); + } + while(lmem_size < lmem_required && (n != 1)) { + n /= 2; + if(n < 1) n = 1; + lmem_required = n * sizeof(KeyType); + } + + if(n < 2) { return 1; } + else if(n < 4) { return 2; } + else if(n < 8) { return 4; } + else if(n < 16) { return 8; } + else if(n < 32) { return 16; } + else if(n < 64) { return 32; } + else if(n < 128) { return 64; } + else if(n < 256) { return 128; } + else { return 256; } +} + + +/// Performs bitonic block sort according to \p compare. +/// +/// Since bitonic sort can be only performed when input size is equal to 2^n, +/// in this case input size is block size (\p work_group_size), we would have +/// to require \p count be a exact multiple of block size. That would not be +/// great. +/// Instead, bitonic sort kernel is merged with odd-even merge sort so if the +/// last block is not equal to 2^n (where n is some natural number) the odd-even +/// sort is performed for that block. That way bitonic_block_sort() works for +/// input of any size. Block size (\p work_group_size) still have to be equal +/// to 2^n. +/// +/// This is NOT stable. 
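The kernel's direction/sibling logic is easier to follow on the host. A plain C++ sketch of the same in-block bitonic network (power-of-two size assumed, ascending result; this mirrors the kernel's length/k loops and its swap = compare ^ (sibling < lid) ^ direction rule, and is not Boost.Compute API):

    #include <algorithm>
    #include <cstddef>
    #include <vector>

    // host-side model of the kernel's bitonic loops for one block
    void bitonic_block_sort_host(std::vector<int> &keys)
    {
        const std::size_t n = keys.size(); // must be a power of two
        for(std::size_t length = 1; length < n; length <<= 1) {
            for(std::size_t k = length; k > 0; k >>= 1) {
                for(std::size_t lid = 0; lid < n; lid++) {
                    // direction of sort: false -> asc, true -> desc
                    const bool direction = (lid & (length << 1)) != 0;
                    const std::size_t sibling = lid ^ k; // compare partner
                    if(sibling > lid) { // handle each pair once on the host
                        const bool compare = keys[sibling] < keys[lid];
                        if(compare != direction) {
                            std::swap(keys[lid], keys[sibling]);
                        }
                    }
                }
            }
        }
    }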
+/// +/// \param keys_first first key element in the range to sort +/// \param values_first first value element in the range to sort +/// \param compare comparison function for keys +/// \param count number of elements in the range; count > 0 +/// \param work_group_size size of the work group, also the block size; must be +/// equal to n^2 where n is natural number +/// \param queue command queue to perform the operation +template<class KeyIterator, class ValueIterator, class Compare> +inline size_t bitonic_block_sort(KeyIterator keys_first, + ValueIterator values_first, + Compare compare, + const size_t count, + const bool sort_by_key, + command_queue &queue) +{ + typedef typename std::iterator_traits<KeyIterator>::value_type key_type; + + meta_kernel k("bitonic_block_sort"); + size_t count_arg = k.add_arg<const uint_>("count"); + + size_t local_keys_arg = k.add_arg<key_type *>(memory_object::local_memory, "lkeys"); + size_t local_vals_arg = 0; + if(sort_by_key) { + local_vals_arg = k.add_arg<uchar_ *>(memory_object::local_memory, "lidx"); + } + + k << + // Work item global and local ids + k.decl<const uint_>("gid") << " = get_global_id(0);\n" << + k.decl<const uint_>("lid") << " = get_local_id(0);\n"; + + // declare my_key and my_value + k << + k.decl<key_type>("my_key") << ";\n"; + // Instead of copying values (my_value) in local memory with keys + // we save local index (uchar) and copy my_value at the end at + // final index. This saves local memory. + if(sort_by_key) + { + k << + k.decl<uchar_>("my_index") << " = (uchar)(lid);\n"; + } + + // load key + k << + "if(gid < count) {\n" << + k.var<key_type>("my_key") << " = " << + keys_first[k.var<const uint_>("gid")] << ";\n" << + "}\n"; + + // load key and index to local memory + k << + "lkeys[lid] = my_key;\n"; + if(sort_by_key) + { + k << + "lidx[lid] = my_index;\n"; + } + k << + k.decl<const uint_>("offset") << " = get_group_id(0) * get_local_size(0);\n" << + k.decl<const uint_>("n") << " = min((uint)(get_local_size(0)),(count - offset));\n"; + + // When work group size is a power of 2 bitonic sorter can be used; + // otherwise, slower odd-even sort is used. + + k << + // check if n is power of 2 + "if(((n != 0) && ((n & (~n + 1)) == n))) {\n"; + + // bitonic sort, not stable + k << + // wait for keys and vals to be stored in local memory + "barrier(CLK_LOCAL_MEM_FENCE);\n" << + + "#pragma unroll\n" << + "for(" << + k.decl<uint_>("length") << " = 1; " << + "length < n; " << + "length <<= 1" << + ") {\n" << + // direction of sort: false -> asc, true -> desc + k.decl<bool>("direction") << "= ((lid & (length<<1)) != 0);\n" << + "for(" << + k.decl<uint_>("k") << " = length; " << + "k > 0; " << + "k >>= 1" << + ") {\n" << + + // sibling to compare with my key + k.decl<uint_>("sibling_idx") << " = lid ^ k;\n" << + k.decl<key_type>("sibling_key") << " = lkeys[sibling_idx];\n" << + k.decl<bool>("compare") << " = " << + compare(k.var<key_type>("sibling_key"), + k.var<key_type>("my_key")) << ";\n" << + k.decl<bool>("swap") << + " = compare ^ (sibling_idx < lid) ^ direction;\n" << + "my_key = swap ? sibling_key : my_key;\n"; + if(sort_by_key) + { + k << + "my_index = swap ? 
lidx[sibling_idx] : my_index;\n"; + } + k << + "barrier(CLK_LOCAL_MEM_FENCE);\n" << + "lkeys[lid] = my_key;\n"; + if(sort_by_key) + { + k << + "lidx[lid] = my_index;\n"; + } + k << + "barrier(CLK_LOCAL_MEM_FENCE);\n" << + "}\n" << + "}\n"; + + // end of bitonic sort + + // odd-even sort, not stable + k << + "}\n" << + "else { \n"; + + k << + k.decl<bool>("lid_is_even") << " = (lid%2) == 0;\n" << + k.decl<uint_>("oddsibling_idx") << " = " << + "(lid_is_even) ? max(lid,(uint)(1)) - 1 : min(lid+1,n-1);\n" << + k.decl<uint_>("evensibling_idx") << " = " << + "(lid_is_even) ? min(lid+1,n-1) : max(lid,(uint)(1)) - 1;\n" << + + // wait for keys and vals to be stored in local memory + "barrier(CLK_LOCAL_MEM_FENCE);\n" << + + "#pragma unroll\n" << + "for(" << + k.decl<uint_>("i") << " = 0; " << + "i < n; " << + "i++" << + ") {\n" << + k.decl<uint_>("sibling_idx") << + " = i%2 == 0 ? evensibling_idx : oddsibling_idx;\n" << + k.decl<key_type>("sibling_key") << " = lkeys[sibling_idx];\n" << + k.decl<bool>("compare") << " = " << + compare(k.var<key_type>("sibling_key"), + k.var<key_type>("my_key")) << ";\n" << + k.decl<bool>("swap") << + " = compare ^ (sibling_idx < lid);\n" << + "my_key = swap ? sibling_key : my_key;\n"; + if(sort_by_key) + { + k << + "my_index = swap ? lidx[sibling_idx] : my_index;\n"; + } + k << + "barrier(CLK_LOCAL_MEM_FENCE);\n" << + "lkeys[lid] = my_key;\n"; + if(sort_by_key) + { + k << + "lidx[lid] = my_index;\n"; + } + k << + "barrier(CLK_LOCAL_MEM_FENCE);\n" + "}\n" << // for + + "}\n"; // else + // end of odd-even sort + + // save key and value + k << + "if(gid < count) {\n" << + keys_first[k.var<const uint_>("gid")] << " = " << + k.var<key_type>("my_key") << ";\n"; + if(sort_by_key) + { + k << values_first[k.var<const uint_>("gid")] << " = " << + values_first[k.var<const uint_>("offset + my_index")] << ";\n"; + } + k << + // end if + "}\n"; + + const context &context = queue.get_context(); + const device &device = queue.get_device(); + ::boost::compute::kernel kernel = k.compile(context); + + const size_t work_group_size = + pick_bitonic_block_sort_block_size<key_type, uchar_>( + kernel.get_work_group_info<size_t>( + device, CL_KERNEL_WORK_GROUP_SIZE + ), + device.get_info<size_t>(CL_DEVICE_LOCAL_MEM_SIZE), + sort_by_key + ); + + const size_t global_size = + work_group_size * static_cast<size_t>( + std::ceil(float(count) / work_group_size) + ); + + kernel.set_arg(count_arg, static_cast<uint_>(count)); + kernel.set_arg(local_keys_arg, local_buffer<key_type>(work_group_size)); + if(sort_by_key) { + kernel.set_arg(local_vals_arg, local_buffer<uchar_>(work_group_size)); + } + + queue.enqueue_1d_range_kernel(kernel, 0, global_size, work_group_size); + // return size of the block + return work_group_size; +} + +template<class KeyIterator, class ValueIterator, class Compare> +inline size_t block_sort(KeyIterator keys_first, + ValueIterator values_first, + Compare compare, + const size_t count, + const bool sort_by_key, + const bool stable, + command_queue &queue) +{ + if(stable) { + // TODO: Implement stable block sort (stable odd-even merge sort) + return size_t(1); + } + return bitonic_block_sort( + keys_first, values_first, + compare, count, + sort_by_key, queue + ); +} + +/// space: O(n + m); n - number of keys, m - number of values +template<class KeyIterator, class ValueIterator, class Compare> +inline void merge_blocks_on_gpu(KeyIterator keys_first, + ValueIterator values_first, + KeyIterator out_keys_first, + ValueIterator out_values_first, + Compare compare, + const size_t 
count, + const size_t block_size, + const bool sort_by_key, + command_queue &queue) +{ + typedef typename std::iterator_traits<KeyIterator>::value_type key_type; + typedef typename std::iterator_traits<ValueIterator>::value_type value_type; + + meta_kernel k("merge_blocks"); + size_t count_arg = k.add_arg<const uint_>("count"); + size_t block_size_arg = k.add_arg<const uint_>("block_size"); + + k << + // get global id + k.decl<const uint_>("gid") << " = get_global_id(0);\n" << + "if(gid >= count) {\n" << + "return;\n" << + "}\n" << + + k.decl<const key_type>("my_key") << " = " << + keys_first[k.var<const uint_>("gid")] << ";\n"; + + if(sort_by_key) { + k << + k.decl<const value_type>("my_value") << " = " << + values_first[k.var<const uint_>("gid")] << ";\n"; + } + + k << + // get my block idx + k.decl<const uint_>("my_block_idx") << " = gid / block_size;\n" << + k.decl<const bool>("my_block_idx_is_odd") << " = " << + "my_block_idx & 0x1;\n" << + + k.decl<const uint_>("other_block_idx") << " = " << + // if(my_block_idx is odd) {} else {} + "my_block_idx_is_odd ? my_block_idx - 1 : my_block_idx + 1;\n" << + + // get ranges of my block and the other block + // [my_block_start; my_block_end) + // [other_block_start; other_block_end) + k.decl<const uint_>("my_block_start") << " = " << + "min(my_block_idx * block_size, count);\n" << // including + k.decl<const uint_>("my_block_end") << " = " << + "min((my_block_idx + 1) * block_size, count);\n" << // excluding + + k.decl<const uint_>("other_block_start") << " = " << + "min(other_block_idx * block_size, count);\n" << // including + k.decl<const uint_>("other_block_end") << " = " << + "min((other_block_idx + 1) * block_size, count);\n" << // excluding + + // other block is empty, nothing to merge here + "if(other_block_start == count){\n" << + out_keys_first[k.var<uint_>("gid")] << " = my_key;\n"; + if(sort_by_key) { + k << + out_values_first[k.var<uint_>("gid")] << " = my_value;\n"; + } + + k << + "return;\n" << + "}\n" << + + // lower bound + // left_idx - lower bound + k.decl<uint_>("left_idx") << " = other_block_start;\n" << + k.decl<uint_>("right_idx") << " = other_block_end;\n" << + "while(left_idx < right_idx) {\n" << + k.decl<uint_>("mid_idx") << " = (left_idx + right_idx) / 2;\n" << + k.decl<key_type>("mid_key") << " = " << + keys_first[k.var<const uint_>("mid_idx")] << ";\n" << + k.decl<bool>("smaller") << " = " << + compare(k.var<key_type>("mid_key"), + k.var<key_type>("my_key")) << ";\n" << + "left_idx = smaller ? mid_idx + 1 : left_idx;\n" << + "right_idx = smaller ? 
right_idx : mid_idx;\n" << + "}\n" << + // left_idx is found position in other block + + // if my_block is odd we need to get the upper bound + "right_idx = other_block_end;\n" << + "if(my_block_idx_is_odd && left_idx != right_idx) {\n" << + k.decl<key_type>("upper_key") << " = " << + keys_first[k.var<const uint_>("left_idx")] << ";\n" << + "while(" << + "!(" << compare(k.var<key_type>("upper_key"), + k.var<key_type>("my_key")) << + ") && " << + "!(" << compare(k.var<key_type>("my_key"), + k.var<key_type>("upper_key")) << + ") && " << + "left_idx < right_idx" << + ")" << + "{\n" << + k.decl<uint_>("mid_idx") << " = (left_idx + right_idx) / 2;\n" << + k.decl<key_type>("mid_key") << " = " << + keys_first[k.var<const uint_>("mid_idx")] << ";\n" << + k.decl<bool>("equal") << " = " << + "!(" << compare(k.var<key_type>("mid_key"), + k.var<key_type>("my_key")) << + ") && " << + "!(" << compare(k.var<key_type>("my_key"), + k.var<key_type>("mid_key")) << + ");\n" << + "left_idx = equal ? mid_idx + 1 : left_idx + 1;\n" << + "right_idx = equal ? right_idx : mid_idx;\n" << + "upper_key = equal ? upper_key : " << + keys_first[k.var<const uint_>("left_idx")] << ";\n" << + "}\n" << + "}\n" << + + k.decl<uint_>("offset") << " = 0;\n" << + "offset += gid - my_block_start;\n" << + "offset += left_idx - other_block_start;\n" << + "offset += min(my_block_start, other_block_start);\n" << + out_keys_first[k.var<uint_>("offset")] << " = my_key;\n"; + if(sort_by_key) { + k << + out_values_first[k.var<uint_>("offset")] << " = my_value;\n"; + } + + const context &context = queue.get_context(); + ::boost::compute::kernel kernel = k.compile(context); + + const size_t work_group_size = (std::min)( + size_t(256), + kernel.get_work_group_info<size_t>( + queue.get_device(), CL_KERNEL_WORK_GROUP_SIZE + ) + ); + const size_t global_size = + work_group_size * static_cast<size_t>( + std::ceil(float(count) / work_group_size) + ); + + kernel.set_arg(count_arg, static_cast<uint_>(count)); + kernel.set_arg(block_size_arg, static_cast<uint_>(block_size)); + queue.enqueue_1d_range_kernel(kernel, 0, global_size, work_group_size); +} + +template<class KeyIterator, class ValueIterator, class Compare> +inline void merge_sort_by_key_on_gpu(KeyIterator keys_first, + KeyIterator keys_last, + ValueIterator values_first, + Compare compare, + bool stable, + command_queue &queue) +{ + typedef typename std::iterator_traits<KeyIterator>::value_type key_type; + typedef typename std::iterator_traits<ValueIterator>::value_type value_type; + + size_t count = iterator_range_size(keys_first, keys_last); + if(count < 2){ + return; + } + + size_t block_size = + block_sort( + keys_first, values_first, + compare, count, + true /* sort_by_key */, stable /* stable */, + queue + ); + + // for small input size only block sort is performed + if(count <= block_size) { + return; + } + + const context &context = queue.get_context(); + + bool result_in_temporary_buffer = false; + ::boost::compute::vector<key_type> temp_keys(count, context); + ::boost::compute::vector<value_type> temp_values(count, context); + + for(; block_size < count; block_size *= 2) { + result_in_temporary_buffer = !result_in_temporary_buffer; + if(result_in_temporary_buffer) { + merge_blocks_on_gpu(keys_first, values_first, + temp_keys.begin(), temp_values.begin(), + compare, count, block_size, + true /* sort_by_key */, queue); + } else { + merge_blocks_on_gpu(temp_keys.begin(), temp_values.begin(), + keys_first, values_first, + compare, count, block_size, + true /* sort_by_key */, queue); 
+
+template<class Iterator, class Compare>
+inline void merge_sort_on_gpu(Iterator first,
+                              Iterator last,
+                              Compare compare,
+                              bool stable,
+                              command_queue &queue)
+{
+    typedef typename std::iterator_traits<Iterator>::value_type key_type;
+
+    size_t count = iterator_range_size(first, last);
+    if(count < 2){
+        return;
+    }
+
+    Iterator dummy;
+    size_t block_size =
+        block_sort(
+            first, dummy,
+            compare, count,
+            false /* sort_by_key */, stable /* stable */,
+            queue
+        );
+
+    // for small inputs only the block sort is performed
+    if(count <= block_size) {
+        return;
+    }
+
+    const context &context = queue.get_context();
+
+    bool result_in_temporary_buffer = false;
+    ::boost::compute::vector<key_type> temp_keys(count, context);
+
+    for(; block_size < count; block_size *= 2) {
+        result_in_temporary_buffer = !result_in_temporary_buffer;
+        if(result_in_temporary_buffer) {
+            merge_blocks_on_gpu(first, dummy, temp_keys.begin(), dummy,
+                                compare, count, block_size,
+                                false /* sort_by_key */, queue);
+        } else {
+            merge_blocks_on_gpu(temp_keys.begin(), dummy, first, dummy,
+                                compare, count, block_size,
+                                false /* sort_by_key */, queue);
+        }
+    }
+
+    if(result_in_temporary_buffer) {
+        copy_async(temp_keys.begin(), temp_keys.end(), first, queue);
+    }
+}
+
+template<class KeyIterator, class ValueIterator, class Compare>
+inline void merge_sort_by_key_on_gpu(KeyIterator keys_first,
+                                     KeyIterator keys_last,
+                                     ValueIterator values_first,
+                                     Compare compare,
+                                     command_queue &queue)
+{
+    merge_sort_by_key_on_gpu(
+        keys_first, keys_last, values_first,
+        compare, false /* not stable */, queue
+    );
+}
+
+template<class Iterator, class Compare>
+inline void merge_sort_on_gpu(Iterator first,
+                              Iterator last,
+                              Compare compare,
+                              command_queue &queue)
+{
+    merge_sort_on_gpu(
+        first, last, compare, false /* not stable */, queue
+    );
+}
+
+} // end detail namespace
+} // end compute namespace
+} // end boost namespace
+
+#endif /* BOOST_COMPUTE_ALGORITHM_DETAIL_MERGE_SORT_ON_GPU_HPP_ */
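As the sort.hpp, sort_by_key.hpp and stable_sort.hpp hunks further below show, these helpers become the GPU path for ranges longer than 32 elements whenever a custom comparator rules out radix sort. A hedged sketch of how that path is reached through the public API (the comparator and element type are made up for illustration):

    #include <boost/compute/algorithm/sort.hpp>
    #include <boost/compute/command_queue.hpp>
    #include <boost/compute/container/vector.hpp>
    #include <boost/compute/function.hpp>

    namespace compute = boost::compute;

    // A custom comparator takes the generic dispatch path, which now calls
    // detail::merge_sort_on_gpu() instead of serial insertion sort.
    BOOST_COMPUTE_FUNCTION(bool, abs_less, (float a, float b),
    {
        return fabs(a) < fabs(b);
    });

    void sort_by_magnitude(compute::vector<float> &vec,
                           compute::command_queue &queue)
    {
        compute::sort(vec.begin(), vec.end(), abs_less, queue);
    }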
diff --git a/boost/compute/algorithm/detail/radix_sort.hpp b/boost/compute/algorithm/detail/radix_sort.hpp
index c2ba4ed17c..8e6d5f9c0a 100644
--- a/boost/compute/algorithm/detail/radix_sort.hpp
+++ b/boost/compute/algorithm/detail/radix_sort.hpp
@@ -92,6 +92,8 @@ const char radix_sort_source[] =
 "#define RADIX_MASK ((((T)(1)) << K_BITS) - 1)\n"
 "#define SIGN_BIT ((sizeof(T) * CHAR_BIT) - 1)\n"
 
+"#if defined(ASC)\n" // asc order
+
 "inline uint radix(const T x, const uint low_bit)\n"
 "{\n"
 "#if defined(IS_FLOATING_POINT)\n"
@@ -104,6 +106,25 @@ const char radix_sort_source[] =
 "#endif\n"
 "}\n"
 
+"#else\n" // desc order
+
+// For signed types we simply negate x; for unsigned types we subtract
+// x from the maximum value of its type ((T)(-1) is the maximum value
+// of type T when T is an unsigned type).
+"inline uint radix(const T x, const uint low_bit)\n"
+"{\n"
+"#if defined(IS_FLOATING_POINT)\n"
+"    const T mask = -(x >> SIGN_BIT) | (((T)(1)) << SIGN_BIT);\n"
+"    return (((-x) ^ mask) >> low_bit) & RADIX_MASK;\n"
+"#elif defined(IS_SIGNED)\n"
+"    return (((-x) ^ (((T)(1)) << SIGN_BIT)) >> low_bit) & RADIX_MASK;\n"
+"#else\n"
+"    return (((T)(-1) - x) >> low_bit) & RADIX_MASK;\n"
+"#endif\n"
+"}\n"
+
+"#endif\n" // #if defined(ASC)
+
 "__kernel void count(__global const T *input,\n"
 "                    const uint input_offset,\n"
 "                    const uint input_size,\n"
@@ -227,6 +248,7 @@ template<class T, class T2>
 inline void radix_sort_impl(const buffer_iterator<T> first,
                             const buffer_iterator<T> last,
                             const buffer_iterator<T2> values_first,
+                            const bool ascending,
                             command_queue &queue)
 {
@@ -279,6 +301,10 @@ inline void radix_sort_impl(const buffer_iterator<T> first,
         options << enable_double<T2>();
     }
 
+    if(ascending){
+        options << " -DASC";
+    }
+
     // load radix sort program
     program radix_sort_program = cache->get_or_build(
         cache_key, options.str(), radix_sort_source, context
@@ -396,18 +422,38 @@ inline void radix_sort(Iterator first,
                        Iterator last,
                        command_queue &queue)
 {
-    radix_sort_impl(first, last, buffer_iterator<int>(), queue);
+    radix_sort_impl(first, last, buffer_iterator<int>(), true, queue);
+}
+
+template<class KeyIterator, class ValueIterator>
+inline void radix_sort_by_key(KeyIterator keys_first,
+                              KeyIterator keys_last,
+                              ValueIterator values_first,
+                              command_queue &queue)
+{
+    radix_sort_impl(keys_first, keys_last, values_first, true, queue);
+}
+
+template<class Iterator>
+inline void radix_sort(Iterator first,
+                       Iterator last,
+                       const bool ascending,
+                       command_queue &queue)
+{
+    radix_sort_impl(first, last, buffer_iterator<int>(), ascending, queue);
 }
 
 template<class KeyIterator, class ValueIterator>
 inline void radix_sort_by_key(KeyIterator keys_first,
                               KeyIterator keys_last,
                               ValueIterator values_first,
+                              const bool ascending,
                               command_queue &queue)
 {
-    radix_sort_impl(keys_first, keys_last, values_first, queue);
+    radix_sort_impl(keys_first, keys_last, values_first, ascending, queue);
 }
 
+
 } // end detail namespace
 } // end compute namespace
 } // end boost namespace
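With the ASC switch in place, a descending sort now compiles the radix kernels without -DASC and finishes in a single pass; the sort.hpp and stable_sort.hpp hunks below drop their sort-then-reverse workaround accordingly. A small sketch under those assumptions:

    #include <boost/compute/algorithm/sort.hpp>
    #include <boost/compute/command_queue.hpp>
    #include <boost/compute/container/vector.hpp>
    #include <boost/compute/functional.hpp>

    namespace compute = boost::compute;

    // Descending sort of a radix-sortable key type; on GPU devices this now
    // maps to detail::radix_sort(first, last, /* ascending */ false, queue)
    // rather than an ascending radix sort followed by reverse().
    void sort_descending(compute::vector<int> &keys,
                         compute::command_queue &queue)
    {
        compute::sort(keys.begin(), keys.end(), compute::greater<int>(), queue);
    }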
diff --git a/boost/compute/algorithm/detail/reduce_on_cpu.hpp b/boost/compute/algorithm/detail/reduce_on_cpu.hpp
new file mode 100644
index 0000000000..490d7d538e
--- /dev/null
+++ b/boost/compute/algorithm/detail/reduce_on_cpu.hpp
@@ -0,0 +1,110 @@
+//---------------------------------------------------------------------------//
+// Copyright (c) 2016 Jakub Szuppe <j.szuppe@gmail.com>
+//
+// Distributed under the Boost Software License, Version 1.0
+// See accompanying file LICENSE_1_0.txt or copy at
+// http://www.boost.org/LICENSE_1_0.txt
+//
+// See http://boostorg.github.com/compute for more information.
+//---------------------------------------------------------------------------//
+
+#ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_REDUCE_ON_CPU_HPP
+#define BOOST_COMPUTE_ALGORITHM_DETAIL_REDUCE_ON_CPU_HPP
+
+#include <algorithm>
+
+#include <boost/compute/buffer.hpp>
+#include <boost/compute/command_queue.hpp>
+#include <boost/compute/detail/meta_kernel.hpp>
+#include <boost/compute/detail/iterator_range_size.hpp>
+#include <boost/compute/detail/parameter_cache.hpp>
+#include <boost/compute/iterator/buffer_iterator.hpp>
+#include <boost/compute/type_traits/result_of.hpp>
+#include <boost/compute/algorithm/detail/serial_reduce.hpp>
+
+namespace boost {
+namespace compute {
+namespace detail {
+
+template<class InputIterator, class OutputIterator, class BinaryFunction>
+inline void reduce_on_cpu(InputIterator first,
+                          InputIterator last,
+                          OutputIterator result,
+                          BinaryFunction function,
+                          command_queue &queue)
+{
+    typedef typename
+        std::iterator_traits<InputIterator>::value_type T;
+    typedef typename
+        ::boost::compute::result_of<BinaryFunction(T, T)>::type result_type;
+
+    const device &device = queue.get_device();
+    const uint_ compute_units = queue.get_device().compute_units();
+
+    boost::shared_ptr<parameter_cache> parameters =
+        detail::parameter_cache::get_global_cache(device);
+
+    std::string cache_key =
+        "__boost_reduce_cpu_" + boost::lexical_cast<std::string>(sizeof(T));
+
+    // for inputs smaller than serial_reduce_threshold the
+    // serial_reduce algorithm is used
+    uint_ serial_reduce_threshold =
+        parameters->get(cache_key, "serial_reduce_threshold", 16384 * sizeof(T));
+    serial_reduce_threshold =
+        (std::max)(serial_reduce_threshold, uint_(compute_units));
+
+    const context &context = queue.get_context();
+    size_t count = detail::iterator_range_size(first, last);
+    if(count == 0){
+        return;
+    }
+    else if(count < serial_reduce_threshold) {
+        return serial_reduce(first, last, result, function, queue);
+    }
+
+    meta_kernel k("reduce_on_cpu");
+    buffer output(context, sizeof(result_type) * compute_units);
+
+    size_t count_arg = k.add_arg<uint_>("count");
+    size_t output_arg =
+        k.add_arg<result_type *>(memory_object::global_memory, "output");
+
+    k <<
+        "uint block = " <<
+            "(uint)ceil(((float)count)/get_global_size(0));\n" <<
+        "uint index = get_global_id(0) * block;\n" <<
+        "uint end = min(count, index + block);\n" <<
+
+        k.decl<result_type>("result") << " = " << first[k.var<uint_>("index")] << ";\n" <<
+        "index++;\n" <<
+        "while(index < end){\n" <<
+            "result = " << function(k.var<T>("result"),
+                                    first[k.var<uint_>("index")]) << ";\n" <<
+            "index++;\n" <<
+        "}\n" <<
+        "output[get_global_id(0)] = result;\n";
+
+    size_t global_work_size = compute_units;
+    kernel kernel = k.compile(context);
+
+    // reduction to global_work_size elements
+    kernel.set_arg(count_arg, static_cast<uint_>(count));
+    kernel.set_arg(output_arg, output);
+    queue.enqueue_1d_range_kernel(kernel, 0, global_work_size, 0);
+
+    // final reduction
+    reduce_on_cpu(
+        make_buffer_iterator<result_type>(output),
+        make_buffer_iterator<result_type>(output, global_work_size),
+        result,
+        function,
+        queue
+    );
+}
+
+} // end detail namespace
+} // end compute namespace
+} // end boost namespace
+
+#endif // BOOST_COMPUTE_ALGORITHM_DETAIL_REDUCE_ON_CPU_HPP
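reduce_on_cpu() makes one contiguous block per compute unit, reduces each block in a single work-item, and then calls itself on the per-block partials, which fall under serial_reduce_threshold and finish with serial_reduce(). For example, with count = 1,000,000 on an 8-compute-unit CPU each work-item reduces 125,000 elements and the recursive call combines the 8 partial results serially. Through the public API this path is taken automatically on CPU devices; a hedged usage sketch:

    #include <boost/compute/algorithm/reduce.hpp>
    #include <boost/compute/command_queue.hpp>
    #include <boost/compute/container/vector.hpp>
    #include <boost/compute/functional.hpp>

    namespace compute = boost::compute;

    // On a CPU device reduce() now dispatches to detail::reduce_on_cpu()
    // (see the reduce.hpp hunk further below).
    int sum_on_device(const compute::vector<int> &vec,
                      compute::command_queue &queue)
    {
        int sum = 0;
        compute::reduce(vec.begin(), vec.end(), &sum, compute::plus<int>(), queue);
        return sum;
    }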
diff --git a/boost/compute/algorithm/detail/scan_on_cpu.hpp b/boost/compute/algorithm/detail/scan_on_cpu.hpp
index 6611c0ba3e..d81117c65f 100644
--- a/boost/compute/algorithm/detail/scan_on_cpu.hpp
+++ b/boost/compute/algorithm/detail/scan_on_cpu.hpp
@@ -1,5 +1,5 @@
 //---------------------------------------------------------------------------//
-// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
+// Copyright (c) 2016 Jakub Szuppe <j.szuppe@gmail.com>
 //
 // Distributed under the Boost Software License, Version 1.0
 // See accompanying file LICENSE_1_0.txt or copy at
@@ -16,8 +16,10 @@
 #include <boost/compute/device.hpp>
 #include <boost/compute/kernel.hpp>
 #include <boost/compute/command_queue.hpp>
+#include <boost/compute/algorithm/detail/serial_scan.hpp>
 #include <boost/compute/detail/meta_kernel.hpp>
 #include <boost/compute/detail/iterator_range_size.hpp>
+#include <boost/compute/detail/parameter_cache.hpp>
 
 namespace boost {
 namespace compute {
@@ -32,68 +34,170 @@ inline OutputIterator scan_on_cpu(InputIterator first,
                                   BinaryOperator op,
                                   command_queue &queue)
 {
-    if(first == last){
-        return result;
-    }
-
     typedef typename
         std::iterator_traits<InputIterator>::value_type input_type;
     typedef typename
         std::iterator_traits<OutputIterator>::value_type output_type;
 
     const context &context = queue.get_context();
+    const device &device = queue.get_device();
+    const size_t compute_units = queue.get_device().compute_units();
+
+    boost::shared_ptr<parameter_cache> parameters =
+        detail::parameter_cache::get_global_cache(device);
+
+    std::string cache_key =
+        "__boost_scan_cpu_" + boost::lexical_cast<std::string>(sizeof(T));
+
+    // for inputs smaller than serial_scan_threshold the
+    // serial_scan algorithm is used
+    uint_ serial_scan_threshold =
+        parameters->get(cache_key, "serial_scan_threshold", 16384 * sizeof(T));
+    serial_scan_threshold =
+        (std::max)(serial_scan_threshold, uint_(compute_units));
+
+    size_t count = detail::iterator_range_size(first, last);
+    if(count == 0){
+        return result;
+    }
+    else if(count < serial_scan_threshold) {
+        return serial_scan(first, last, result, exclusive, init, op, queue);
+    }
+
+    buffer block_partial_sums(context, sizeof(output_type) * compute_units);
 
     // create scan kernel
-    meta_kernel k("scan_on_cpu");
+    meta_kernel k("scan_on_cpu_block_scan");
 
     // Arguments
-    size_t n_arg = k.add_arg<ulong_>("n");
+    size_t count_arg = k.add_arg<uint_>("count");
     size_t init_arg = k.add_arg<output_type>("initial_value");
+    size_t block_partial_sums_arg =
+        k.add_arg<output_type *>(memory_object::global_memory, "block_partial_sums");
+
+    k <<
+        "uint block = " <<
+            "(uint)ceil(((float)count)/(get_global_size(0) + 1));\n" <<
+        "uint index = get_global_id(0) * block;\n" <<
+        "uint end = min(count, index + block);\n";
 
     if(!exclusive){
         k <<
-            k.decl<const ulong_>("start_idx") << " = 1;\n" <<
-            k.decl<output_type>("sum") << " = " << first[0] << ";\n" <<
-            result[0] << " = sum;\n";
+            k.decl<output_type>("sum") << " = " <<
+                first[k.var<uint_>("index")] << ";\n" <<
+            result[k.var<uint_>("index")] << " = sum;\n" <<
+            "index++;\n";
     }
     else {
         k <<
-            k.decl<const ulong_>("start_idx") << " = 0;\n" <<
-            k.decl<output_type>("sum") << " = initial_value;\n";
+            k.decl<output_type>("sum") << ";\n" <<
+            "if(index == 0){\n" <<
+                "sum = initial_value;\n" <<
+            "}\n" <<
+            "else {\n" <<
+                "sum = " << first[k.var<uint_>("index")] << ";\n" <<
+                "index++;\n" <<
+            "}\n";
    }
 
     k <<
-        "for(ulong i = start_idx; i < n; i++){\n" <<
-        k.decl<const input_type>("x") << " = "
-            << first[k.var<ulong_>("i")] << ";\n";
+        "while(index < end){\n" <<
+        // load next value
+        k.decl<const input_type>("value") << " = "
+            << first[k.var<uint_>("index")] << ";\n";
 
     if(exclusive){
-        k << result[k.var<ulong_>("i")] << " = sum;\n";
+        k <<
+            "if(get_global_id(0) == 0){\n" <<
+            result[k.var<uint_>("index")] << " = sum;\n" <<
+            "}\n";
     }
-
-    k << " sum = "
-      << op(k.var<output_type>("sum"), k.var<output_type>("x"))
-      << ";\n";
+    k <<
+        "sum = " << op(k.var<output_type>("sum"),
+                       k.var<output_type>("value")) << ";\n";
 
     if(!exclusive){
-        k << result[k.var<ulong_>("i")] << " = sum;\n";
+        k <<
+            "if(get_global_id(0) == 0){\n" <<
+            result[k.var<uint_>("index")] << " = sum;\n" <<
+            "}\n";
+    }
+
+    k <<
+        "index++;\n" <<
+        "}\n" << // end while
+        "block_partial_sums[get_global_id(0)] = sum;\n";
+
+    // compile scan kernel
+    kernel block_scan_kernel = k.compile(context);
+
+    // setup kernel arguments
+    block_scan_kernel.set_arg(count_arg, static_cast<uint_>(count));
+    block_scan_kernel.set_arg(init_arg, static_cast<output_type>(init));
+    block_scan_kernel.set_arg(block_partial_sums_arg, block_partial_sums);
+
+    // execute the kernel
+    size_t global_work_size = compute_units;
+    queue.enqueue_1d_range_kernel(block_scan_kernel, 0, global_work_size, 0);
+
+    // with a single compute unit the block scan already produced the final scan
+    if(compute_units < 2) {
+        return result + count;
+    }
+
+    // final scan kernel
+    meta_kernel l("scan_on_cpu_final_scan");
+
+    // Arguments
+    count_arg = l.add_arg<uint_>("count");
+    block_partial_sums_arg =
+        l.add_arg<output_type *>(memory_object::global_memory, "block_partial_sums");
+
+    l <<
+        "uint block = " <<
+            "(uint)ceil(((float)count)/(get_global_size(0) + 1));\n" <<
+        "uint index = block + get_global_id(0) * block;\n" <<
+        "uint end = min(count, index + block);\n" <<
+
+        l.decl<output_type>("sum") << " = block_partial_sums[0];\n" <<
+        "for(uint i = 0; i < get_global_id(0); i++) {\n" <<
+            "sum = " << op(l.var<output_type>("sum"),
+                           l.var<output_type>("block_partial_sums[i + 1]")) << ";\n" <<
+        "}\n" <<
+
+        "while(index < end){\n";
+    if(exclusive){
+        l <<
+            l.decl<output_type>("value") << " = "
+                << first[l.var<uint_>("index")] << ";\n" <<
+            result[l.var<uint_>("index")] << " = sum;\n" <<
+            "sum = " << op(l.var<output_type>("sum"),
+                           l.var<output_type>("value")) << ";\n";
+    }
+    else {
+        l <<
+            "sum = " << op(l.var<output_type>("sum"),
+                           first[l.var<uint_>("index")]) << ";\n" <<
+            result[l.var<uint_>("index")] << " = sum;\n";
     }
+    l <<
+        "index++;\n" <<
+        "}\n";
 
-    k << "}\n";
-
     // compile scan kernel
-    kernel scan_kernel = k.compile(context);
+    kernel final_scan_kernel = l.compile(context);
 
     // setup kernel arguments
-    size_t n = detail::iterator_range_size(first, last);
-    scan_kernel.set_arg<ulong_>(n_arg, n);
-    scan_kernel.set_arg<output_type>(init_arg, static_cast<output_type>(init));
+    final_scan_kernel.set_arg(count_arg, static_cast<uint_>(count));
+    final_scan_kernel.set_arg(block_partial_sums_arg, block_partial_sums);
 
     // execute the kernel
-    queue.enqueue_1d_range_kernel(scan_kernel, 0, 1, 1);
+    global_work_size = compute_units;
+    queue.enqueue_1d_range_kernel(final_scan_kernel, 0, global_work_size, 0);
 
     // return iterator pointing to the end of the result range
-    return result + n;
+    return result + count;
 }
 
 } // end detail namespace
diff --git a/boost/compute/algorithm/detail/scan_on_gpu.hpp b/boost/compute/algorithm/detail/scan_on_gpu.hpp
index 07c6d6d3c0..dcaff2cdf3 100644
--- a/boost/compute/algorithm/detail/scan_on_gpu.hpp
+++ b/boost/compute/algorithm/detail/scan_on_gpu.hpp
@@ -14,7 +14,6 @@
 #include <boost/compute/kernel.hpp>
 #include <boost/compute/detail/meta_kernel.hpp>
 #include <boost/compute/command_queue.hpp>
-#include <boost/compute/algorithm/detail/scan_on_cpu.hpp>
 #include <boost/compute/container/vector.hpp>
 #include <boost/compute/detail/iterator_range_size.hpp>
 #include
<boost/compute/memory/local_buffer.hpp> diff --git a/boost/compute/algorithm/detail/search_all.hpp b/boost/compute/algorithm/detail/search_all.hpp index a874bcdebe..b091ec7de5 100644 --- a/boost/compute/algorithm/detail/search_all.hpp +++ b/boost/compute/algorithm/detail/search_all.hpp @@ -48,7 +48,7 @@ public: *this << "uint i = get_global_id(0);\n" << - "uint i1 = i;\n" << + "const uint i1 = i;\n" << "uint j;\n" << "for(j = 0; j<p_count; j++,i++)\n" << "{\n" << diff --git a/boost/compute/algorithm/detail/serial_scan.hpp b/boost/compute/algorithm/detail/serial_scan.hpp new file mode 100644 index 0000000000..7590fd94fb --- /dev/null +++ b/boost/compute/algorithm/detail/serial_scan.hpp @@ -0,0 +1,103 @@ +//---------------------------------------------------------------------------// +// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com> +// +// Distributed under the Boost Software License, Version 1.0 +// See accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt +// +// See http://boostorg.github.com/compute for more information. +//---------------------------------------------------------------------------// + +#ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_SERIAL_SCAN_HPP +#define BOOST_COMPUTE_ALGORITHM_DETAIL_SERIAL_SCAN_HPP + +#include <iterator> + +#include <boost/compute/device.hpp> +#include <boost/compute/kernel.hpp> +#include <boost/compute/command_queue.hpp> +#include <boost/compute/detail/meta_kernel.hpp> +#include <boost/compute/detail/iterator_range_size.hpp> + +namespace boost { +namespace compute { +namespace detail { + +template<class InputIterator, class OutputIterator, class T, class BinaryOperator> +inline OutputIterator serial_scan(InputIterator first, + InputIterator last, + OutputIterator result, + bool exclusive, + T init, + BinaryOperator op, + command_queue &queue) +{ + if(first == last){ + return result; + } + + typedef typename + std::iterator_traits<InputIterator>::value_type input_type; + typedef typename + std::iterator_traits<OutputIterator>::value_type output_type; + + const context &context = queue.get_context(); + + // create scan kernel + meta_kernel k("serial_scan"); + + // Arguments + size_t n_arg = k.add_arg<ulong_>("n"); + size_t init_arg = k.add_arg<output_type>("initial_value"); + + if(!exclusive){ + k << + k.decl<const ulong_>("start_idx") << " = 1;\n" << + k.decl<output_type>("sum") << " = " << first[0] << ";\n" << + result[0] << " = sum;\n"; + } + else { + k << + k.decl<const ulong_>("start_idx") << " = 0;\n" << + k.decl<output_type>("sum") << " = initial_value;\n"; + } + + k << + "for(ulong i = start_idx; i < n; i++){\n" << + k.decl<const input_type>("x") << " = " + << first[k.var<ulong_>("i")] << ";\n"; + + if(exclusive){ + k << result[k.var<ulong_>("i")] << " = sum;\n"; + } + + k << " sum = " + << op(k.var<output_type>("sum"), k.var<output_type>("x")) + << ";\n"; + + if(!exclusive){ + k << result[k.var<ulong_>("i")] << " = sum;\n"; + } + + k << "}\n"; + + // compile scan kernel + kernel scan_kernel = k.compile(context); + + // setup kernel arguments + size_t n = detail::iterator_range_size(first, last); + scan_kernel.set_arg<ulong_>(n_arg, n); + scan_kernel.set_arg<output_type>(init_arg, static_cast<output_type>(init)); + + // execute the kernel + queue.enqueue_1d_range_kernel(scan_kernel, 0, 1, 1); + + // return iterator pointing to the end of the result range + return result + n; +} + +} // end detail namespace +} // end compute namespace +} // end boost namespace + +#endif // 
BOOST_COMPUTE_ALGORITHM_DETAIL_SERIAL_SCAN_HPP diff --git a/boost/compute/algorithm/find_end.hpp b/boost/compute/algorithm/find_end.hpp index 5c40055113..265a1da542 100644 --- a/boost/compute/algorithm/find_end.hpp +++ b/boost/compute/algorithm/find_end.hpp @@ -36,6 +36,7 @@ inline InputIterator find_end_helper(InputIterator first, command_queue &queue) { typedef typename std::iterator_traits<InputIterator>::value_type value_type; + typedef typename std::iterator_traits<InputIterator>::difference_type difference_type; size_t count = detail::iterator_range_size(first, last); if(count == 0){ @@ -65,8 +66,13 @@ inline InputIterator find_end_helper(InputIterator first, queue.enqueue_1d_range_kernel(kernel, 0, count, 0); int result = static_cast<int>(index.read(queue)); - if(result == -1) return last; - else return first + result; + + if(result == -1){ + return last; + } + else { + return first + static_cast<difference_type>(result); + } } } // end detail namespace @@ -92,8 +98,13 @@ inline TextIterator find_end(TextIterator t_first, command_queue &queue = system::default_queue()) { const context &context = queue.get_context(); - vector<uint_> matching_indices(detail::iterator_range_size(t_first, t_last), - context); + + // there is no need to check if pattern starts at last n - 1 indices + vector<uint_> matching_indices( + detail::iterator_range_size(t_first, t_last) + + 1 - detail::iterator_range_size(p_first, p_last), + context + ); detail::search_kernel<PatternIterator, TextIterator, @@ -105,10 +116,16 @@ inline TextIterator find_end(TextIterator t_first, using boost::compute::_1; vector<uint_>::iterator index = - detail::find_end_helper(matching_indices.begin(), - matching_indices.end(), - _1 == 1, - queue); + detail::find_end_helper( + matching_indices.begin(), + matching_indices.end(), + _1 == 1, + queue + ); + + // pattern was not found + if(index == matching_indices.end()) + return t_last; return t_first + detail::iterator_range_size(matching_indices.begin(), index); } diff --git a/boost/compute/algorithm/gather.hpp b/boost/compute/algorithm/gather.hpp index b2f725d54e..24c5c727ae 100644 --- a/boost/compute/algorithm/gather.hpp +++ b/boost/compute/algorithm/gather.hpp @@ -36,7 +36,6 @@ public: OutputIterator result) { m_count = iterator_range_size(first, last); - m_offset = first.get_index(); *this << "const uint i = get_global_id(0);\n" << @@ -50,12 +49,11 @@ public: return event(); } - return exec_1d(queue, m_offset, m_count); + return exec_1d(queue, 0, m_count); } private: size_t m_count; - size_t m_offset; }; } // end detail namespace diff --git a/boost/compute/algorithm/minmax_element.hpp b/boost/compute/algorithm/minmax_element.hpp index bf32c3c989..3f44c09eaf 100644 --- a/boost/compute/algorithm/minmax_element.hpp +++ b/boost/compute/algorithm/minmax_element.hpp @@ -49,7 +49,7 @@ minmax_element(InputIterator first, } ///\overload -template<class InputIterator, class Compare> +template<class InputIterator> inline std::pair<InputIterator, InputIterator> minmax_element(InputIterator first, InputIterator last, diff --git a/boost/compute/algorithm/reduce.hpp b/boost/compute/algorithm/reduce.hpp index 79624a0e50..19d070019f 100644 --- a/boost/compute/algorithm/reduce.hpp +++ b/boost/compute/algorithm/reduce.hpp @@ -22,7 +22,7 @@ #include <boost/compute/algorithm/copy_n.hpp> #include <boost/compute/algorithm/detail/inplace_reduce.hpp> #include <boost/compute/algorithm/detail/reduce_on_gpu.hpp> -#include <boost/compute/algorithm/detail/serial_reduce.hpp> +#include 
<boost/compute/algorithm/detail/reduce_on_cpu.hpp> #include <boost/compute/detail/iterator_range_size.hpp> #include <boost/compute/memory/local_buffer.hpp> #include <boost/compute/type_traits/result_of.hpp> @@ -173,8 +173,8 @@ inline void generic_reduce(InputIterator first, size_t count = detail::iterator_range_size(first, last); if(device.type() & device::cpu){ - boost::compute::vector<result_type> value(1, context); - detail::serial_reduce(first, last, value.begin(), function, queue); + array<result_type, 1> value(context); + detail::reduce_on_cpu(first, last, value.begin(), function, queue); boost::compute::copy_n(value.begin(), 1, result, queue); } else { @@ -209,16 +209,16 @@ inline void dispatch_reduce(InputIterator first, const device &device = queue.get_device(); // reduce to temporary buffer on device - array<T, 1> tmp(context); + array<T, 1> value(context); if(device.type() & device::cpu){ - detail::serial_reduce(first, last, tmp.begin(), function, queue); + detail::reduce_on_cpu(first, last, value.begin(), function, queue); } else { - reduce_on_gpu(first, last, tmp.begin(), function, queue); + reduce_on_gpu(first, last, value.begin(), function, queue); } // copy to result iterator - copy_n(tmp.begin(), 1, result, queue); + copy_n(value.begin(), 1, result, queue); } template<class InputIterator, class OutputIterator, class BinaryFunction> diff --git a/boost/compute/algorithm/sort.hpp b/boost/compute/algorithm/sort.hpp index b2730b3e2b..7e0a583e3e 100644 --- a/boost/compute/algorithm/sort.hpp +++ b/boost/compute/algorithm/sort.hpp @@ -18,6 +18,7 @@ #include <boost/compute/system.hpp> #include <boost/compute/command_queue.hpp> #include <boost/compute/algorithm/detail/merge_sort_on_cpu.hpp> +#include <boost/compute/algorithm/detail/merge_sort_on_gpu.hpp> #include <boost/compute/algorithm/detail/radix_sort.hpp> #include <boost/compute/algorithm/detail/insertion_sort.hpp> #include <boost/compute/algorithm/reverse.hpp> @@ -74,11 +75,8 @@ inline void dispatch_gpu_sort(buffer_iterator<T> first, ); } else { - // radix sort in ascending order - ::boost::compute::detail::radix_sort(first, last, queue); - - // reverse range to descending order - ::boost::compute::reverse(first, last, queue); + // radix sorts in descending order + ::boost::compute::detail::radix_sort(first, last, false, queue); } } @@ -88,9 +86,22 @@ inline void dispatch_gpu_sort(Iterator first, Compare compare, command_queue &queue) { - ::boost::compute::detail::serial_insertion_sort( - first, last, compare, queue - ); + size_t count = detail::iterator_range_size(first, last); + + if(count < 2){ + // nothing to do + return; + } + else if(count <= 32){ + ::boost::compute::detail::serial_insertion_sort( + first, last, compare, queue + ); + } + else { + ::boost::compute::detail::merge_sort_on_gpu( + first, last, compare, queue + ); + } } // sort() for device iterators diff --git a/boost/compute/algorithm/sort_by_key.hpp b/boost/compute/algorithm/sort_by_key.hpp index 0e3dba81eb..c39bcf9890 100644 --- a/boost/compute/algorithm/sort_by_key.hpp +++ b/boost/compute/algorithm/sort_by_key.hpp @@ -18,6 +18,7 @@ #include <boost/compute/system.hpp> #include <boost/compute/command_queue.hpp> #include <boost/compute/algorithm/detail/merge_sort_on_cpu.hpp> +#include <boost/compute/algorithm/detail/merge_sort_on_gpu.hpp> #include <boost/compute/algorithm/detail/insertion_sort.hpp> #include <boost/compute/algorithm/detail/radix_sort.hpp> #include <boost/compute/algorithm/reverse.hpp> @@ -76,14 +77,10 @@ dispatch_gpu_sort_by_key(KeyIterator 
keys_first, ); } else { - // radix sorts in ascending order + // radix sorts in descending order detail::radix_sort_by_key( - keys_first, keys_last, values_first, queue + keys_first, keys_last, values_first, false, queue ); - - // Reverse keys, values for descending order - ::boost::compute::reverse(keys_first, keys_last, queue); - ::boost::compute::reverse(values_first, values_first + count, queue); } } @@ -94,9 +91,17 @@ inline void dispatch_gpu_sort_by_key(KeyIterator keys_first, Compare compare, command_queue &queue) { - detail::serial_insertion_sort_by_key( - keys_first, keys_last, values_first, compare, queue - ); + size_t count = detail::iterator_range_size(keys_first, keys_last); + + if(count < 32){ + detail::serial_insertion_sort_by_key( + keys_first, keys_last, values_first, compare, queue + ); + } else { + detail::merge_sort_by_key_on_gpu( + keys_first, keys_last, values_first, compare, queue + ); + } } template<class KeyIterator, class ValueIterator, class Compare> diff --git a/boost/compute/algorithm/stable_sort.hpp b/boost/compute/algorithm/stable_sort.hpp index cd82a0a606..381fc81bc0 100644 --- a/boost/compute/algorithm/stable_sort.hpp +++ b/boost/compute/algorithm/stable_sort.hpp @@ -16,10 +16,12 @@ #include <boost/compute/system.hpp> #include <boost/compute/command_queue.hpp> #include <boost/compute/algorithm/detail/merge_sort_on_cpu.hpp> +#include <boost/compute/algorithm/detail/merge_sort_on_gpu.hpp> #include <boost/compute/algorithm/detail/radix_sort.hpp> #include <boost/compute/algorithm/detail/insertion_sort.hpp> #include <boost/compute/algorithm/reverse.hpp> #include <boost/compute/functional/operator.hpp> +#include <boost/compute/detail/iterator_range_size.hpp> namespace boost { namespace compute { @@ -31,9 +33,17 @@ inline void dispatch_gpu_stable_sort(Iterator first, Compare compare, command_queue &queue) { - ::boost::compute::detail::serial_insertion_sort( - first, last, compare, queue - ); + size_t count = detail::iterator_range_size(first, last); + + if(count < 32){ + detail::serial_insertion_sort( + first, last, compare, queue + ); + } else { + detail::merge_sort_on_gpu( + first, last, compare, true /* stable */, queue + ); + } } template<class T> @@ -53,11 +63,8 @@ dispatch_gpu_stable_sort(buffer_iterator<T> first, greater<T>, command_queue &queue) { - // radix sort in ascending order - ::boost::compute::detail::radix_sort(first, last, queue); - - // reverse range to descending order - ::boost::compute::reverse(first, last, queue); + // radix sorts in descending order + ::boost::compute::detail::radix_sort(first, last, false, queue); } } // end detail namespace @@ -76,6 +83,7 @@ inline void stable_sort(Iterator first, ::boost::compute::detail::dispatch_gpu_stable_sort( first, last, compare, queue ); + return; } ::boost::compute::detail::merge_sort_on_cpu(first, last, compare, queue); } diff --git a/boost/compute/algorithm/stable_sort_by_key.hpp b/boost/compute/algorithm/stable_sort_by_key.hpp index 8a51372ede..878f999f44 100644 --- a/boost/compute/algorithm/stable_sort_by_key.hpp +++ b/boost/compute/algorithm/stable_sort_by_key.hpp @@ -21,6 +21,105 @@ namespace boost { namespace compute { +namespace detail { + +template<class KeyIterator, class ValueIterator> +inline void +dispatch_gpu_ssort_by_key(KeyIterator keys_first, + KeyIterator keys_last, + ValueIterator values_first, + less<typename std::iterator_traits<KeyIterator>::value_type> compare, + command_queue &queue, + typename boost::enable_if_c< + is_radix_sortable< + typename 
std::iterator_traits<KeyIterator>::value_type + >::value + >::type* = 0) +{ + size_t count = detail::iterator_range_size(keys_first, keys_last); + + if(count < 32){ + detail::serial_insertion_sort_by_key( + keys_first, keys_last, values_first, compare, queue + ); + } + else { + detail::radix_sort_by_key( + keys_first, keys_last, values_first, queue + ); + } +} + +template<class KeyIterator, class ValueIterator> +inline void +dispatch_gpu_ssort_by_key(KeyIterator keys_first, + KeyIterator keys_last, + ValueIterator values_first, + greater<typename std::iterator_traits<KeyIterator>::value_type> compare, + command_queue &queue, + typename boost::enable_if_c< + is_radix_sortable< + typename std::iterator_traits<KeyIterator>::value_type + >::value + >::type* = 0) +{ + size_t count = detail::iterator_range_size(keys_first, keys_last); + + if(count < 32){ + detail::serial_insertion_sort_by_key( + keys_first, keys_last, values_first, compare, queue + ); + } + else { + // radix sorts in descending order + detail::radix_sort_by_key( + keys_first, keys_last, values_first, false, queue + ); + } +} + +template<class KeyIterator, class ValueIterator, class Compare> +inline void dispatch_gpu_ssort_by_key(KeyIterator keys_first, + KeyIterator keys_last, + ValueIterator values_first, + Compare compare, + command_queue &queue) +{ + size_t count = detail::iterator_range_size(keys_first, keys_last); + + if(count < 32){ + detail::serial_insertion_sort_by_key( + keys_first, keys_last, values_first, + compare, queue + ); + } else { + detail::merge_sort_by_key_on_gpu( + keys_first, keys_last, values_first, + compare, true /* stable */, queue + ); + } +} + +template<class KeyIterator, class ValueIterator, class Compare> +inline void dispatch_ssort_by_key(KeyIterator keys_first, + KeyIterator keys_last, + ValueIterator values_first, + Compare compare, + command_queue &queue) +{ + if(queue.get_device().type() & device::gpu) { + dispatch_gpu_ssort_by_key( + keys_first, keys_last, values_first, compare, queue + ); + return; + } + ::boost::compute::detail::merge_sort_by_key_on_cpu( + keys_first, keys_last, values_first, compare, queue + ); +} + +} // end detail namespace + /// Performs a key-value stable sort using the keys in the range [\p keys_first, /// \p keys_last) on the values in the range [\p values_first, /// \p values_first \c + (\p keys_last \c - \p keys_first)) using \p compare. @@ -35,8 +134,7 @@ inline void stable_sort_by_key(KeyIterator keys_first, Compare compare, command_queue &queue = system::default_queue()) { - // sort_by_key is stable - ::boost::compute::sort_by_key( + ::boost::compute::detail::dispatch_ssort_by_key( keys_first, keys_last, values_first, compare, queue ); } diff --git a/boost/compute/algorithm/transform.hpp b/boost/compute/algorithm/transform.hpp index 022a4988bd..68750a6523 100644 --- a/boost/compute/algorithm/transform.hpp +++ b/boost/compute/algorithm/transform.hpp @@ -22,7 +22,7 @@ namespace boost { namespace compute { /// Transforms the elements in the range [\p first, \p last) using -/// \p transform and stores the results in the range beginning at +/// operator \p op and stores the results in the range beginning at /// \p result. 
/// /// For example, to calculate the absolute value for each element in a vector: diff --git a/boost/compute/config.hpp b/boost/compute/config.hpp index 77d0d7b9df..e501ed28c9 100644 --- a/boost/compute/config.hpp +++ b/boost/compute/config.hpp @@ -16,14 +16,13 @@ #include <boost/compute/cl.hpp> // check for minimum required boost version -#if BOOST_VERSION < 104800 -#error Boost.Compute requires Boost version 1.48 or later +#if BOOST_VERSION < 105400 +#error Boost.Compute requires Boost version 1.54 or later #endif // the BOOST_COMPUTE_NO_VARIADIC_TEMPLATES macro is defined // if the compiler does not *fully* support variadic templates #if defined(BOOST_NO_CXX11_VARIADIC_TEMPLATES) || \ - defined(BOOST_NO_VARIADIC_TEMPLATES) || \ (defined(__GNUC__) && !defined(__clang__) && \ __GNUC__ == 4 && __GNUC_MINOR__ <= 6) #define BOOST_COMPUTE_NO_VARIADIC_TEMPLATES @@ -32,7 +31,6 @@ // the BOOST_COMPUTE_NO_STD_TUPLE macro is defined if the // compiler/stdlib does not support std::tuple #if defined(BOOST_NO_CXX11_HDR_TUPLE) || \ - defined(BOOST_NO_0X_HDR_TUPLE) || \ defined(BOOST_COMPUTE_NO_VARIADIC_TEMPLATES) #define BOOST_COMPUTE_NO_STD_TUPLE #endif // BOOST_NO_CXX11_HDR_TUPLE @@ -53,17 +51,15 @@ #endif #if !defined(BOOST_COMPUTE_DOXYGEN_INVOKED) && \ - (defined(BOOST_NO_CXX11_RVALUE_REFERENCES) || defined(BOOST_NO_RVALUE_REFERENCES)) + defined(BOOST_NO_CXX11_RVALUE_REFERENCES) # define BOOST_COMPUTE_NO_RVALUE_REFERENCES #endif // BOOST_NO_CXX11_RVALUE_REFERENCES -#if defined(BOOST_NO_CXX11_HDR_INITIALIZER_LIST) || \ - defined(BOOST_NO_0X_HDR_INITIALIZER_LIST) +#if defined(BOOST_NO_CXX11_HDR_INITIALIZER_LIST) # define BOOST_COMPUTE_NO_HDR_INITIALIZER_LIST #endif // BOOST_NO_CXX11_HDR_INITIALIZER_LIST -#if defined(BOOST_NO_CXX11_HDR_CHRONO) || \ - defined(BOOST_NO_0X_HDR_CHRONO) +#if defined(BOOST_NO_CXX11_HDR_CHRONO) # define BOOST_COMPUTE_NO_HDR_CHRONO #endif // BOOST_NO_CXX11_HDR_CHRONO diff --git a/boost/compute/container/array.hpp b/boost/compute/container/array.hpp index 919be6eeac..2b504be4a8 100644 --- a/boost/compute/container/array.hpp +++ b/boost/compute/container/array.hpp @@ -73,20 +73,33 @@ public: array(const array<T, N> &other) : m_buffer(other.m_buffer.get_context(), sizeof(T) * N) { - boost::compute::copy(other.begin(), other.end(), begin()); + command_queue queue = default_queue(); + boost::compute::copy(other.begin(), other.end(), begin(), queue); + queue.finish(); } array(const boost::array<T, N> &array, const context &context = system::default_context()) : m_buffer(context, sizeof(T) * N) { - boost::compute::copy(array.begin(), array.end(), begin()); + command_queue queue = default_queue(); + boost::compute::copy(array.begin(), array.end(), begin(), queue); + queue.finish(); + } + + array(const array<T, N> &other, + const command_queue &queue) + : m_buffer(other.m_buffer.get_context(), sizeof(T) * N) + { + boost::compute::copy(other.begin(), other.end(), begin(), queue); } array<T, N>& operator=(const array<T, N> &other) { if(this != &other){ - boost::compute::copy(other.begin(), other.end(), begin()); + command_queue queue = default_queue(); + boost::compute::copy(other.begin(), other.end(), begin(), queue); + queue.finish(); } return *this; @@ -94,7 +107,9 @@ public: array<T, N>& operator=(const boost::array<T, N> &array) { - boost::compute::copy(array.begin(), array.end(), begin()); + command_queue queue = default_queue(); + boost::compute::copy(array.begin(), array.end(), begin(), queue); + queue.finish(); return *this; } @@ -226,14 +241,28 @@ public: return *(end() - 
static_cast<difference_type>(1)); } + void fill(const value_type &value, const command_queue &queue) + { + ::boost::compute::fill(begin(), end(), value, queue); + } + + void swap(array<T, N> &other, const command_queue &queue) + { + ::boost::compute::swap_ranges(begin(), end(), other.begin(), queue); + } + void fill(const value_type &value) { - ::boost::compute::fill(begin(), end(), value); + command_queue queue = default_queue(); + ::boost::compute::fill(begin(), end(), value, queue); + queue.finish(); } void swap(array<T, N> &other) { - ::boost::compute::swap_ranges(begin(), end(), other.begin()); + command_queue queue = default_queue(); + ::boost::compute::swap_ranges(begin(), end(), other.begin(), queue); + queue.finish(); } const buffer& get_buffer() const @@ -243,6 +272,13 @@ public: private: buffer m_buffer; + + command_queue default_queue() const + { + const context &context = m_buffer.get_context(); + command_queue queue(context, context.get_device()); + return queue; + } }; namespace detail { diff --git a/boost/compute/container/vector.hpp b/boost/compute/container/vector.hpp index 47d649ad99..aa0eb98ce9 100644 --- a/boost/compute/container/vector.hpp +++ b/boost/compute/container/vector.hpp @@ -196,8 +196,15 @@ public: m_data = m_allocator.allocate((std::max)(m_size, _minimum_capacity())); if(!other.empty()){ - ::boost::compute::copy(other.begin(), other.end(), begin(), queue); - queue.finish(); + if(other.get_buffer().get_context() != queue.get_context()){ + command_queue other_queue = other.default_queue(); + ::boost::compute::copy(other.begin(), other.end(), begin(), other_queue); + other_queue.finish(); + } + else { + ::boost::compute::copy(other.begin(), other.end(), begin(), queue); + queue.finish(); + } } } @@ -253,6 +260,17 @@ public: } template<class OtherAlloc> + vector& operator=(const vector<T, OtherAlloc> &other) + { + command_queue queue = default_queue(); + resize(other.size(), queue); + ::boost::compute::copy(other.begin(), other.end(), begin(), queue); + queue.finish(); + + return *this; + } + + template<class OtherAlloc> vector& operator=(const std::vector<T, OtherAlloc> &vector) { command_queue queue = default_queue(); @@ -371,7 +389,7 @@ public: /// Resizes the vector to \p size. void resize(size_type size, command_queue &queue) { - if(size < capacity()){ + if(size <= capacity()){ m_size = size; } else { diff --git a/boost/compute/context.hpp b/boost/compute/context.hpp index 5db39e9d83..61e84e9767 100644 --- a/boost/compute/context.hpp +++ b/boost/compute/context.hpp @@ -204,7 +204,7 @@ public: typename detail::get_object_info_type<context, Enum>::type get_info() const; - /// Returns \c true if the context is the same at \p other. + /// Returns \c true if the context is the same as \p other. bool operator==(const context &other) const { return m_context == other.m_context; diff --git a/boost/compute/detail/literal.hpp b/boost/compute/detail/literal.hpp index 0d23b1d4d2..874830ee49 100644 --- a/boost/compute/detail/literal.hpp +++ b/boost/compute/detail/literal.hpp @@ -27,7 +27,15 @@ template<class T> std::string make_literal(T x) { std::stringstream s; - s << std::setprecision(std::numeric_limits<T>::digits10) + s << std::setprecision( +#ifndef BOOST_NO_CXX11_NUMERIC_LIMITS + std::numeric_limits<T>::max_digits10 +#else + // We don't have max_digits10, so add 3 other digits (this is what is required for + // float, and is one more than required for double). 
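+    // (float: digits10 == 6 and 6 + 3 == 9 == max_digits10;
+    //  double: digits10 == 15 and 15 + 3 == 18, while max_digits10 == 17)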
+ 3 + std::numeric_limits<T>::digits10 +#endif + ) << std::scientific << x; diff --git a/boost/compute/detail/meta_kernel.hpp b/boost/compute/detail/meta_kernel.hpp index 7be778b025..5e6d6e0337 100644 --- a/boost/compute/detail/meta_kernel.hpp +++ b/boost/compute/detail/meta_kernel.hpp @@ -34,6 +34,7 @@ #include <boost/compute/image/image2d.hpp> #include <boost/compute/image/image_sampler.hpp> #include <boost/compute/memory_object.hpp> +#include <boost/compute/memory/svm_ptr.hpp> #include <boost/compute/detail/device_ptr.hpp> #include <boost/compute/detail/sha1.hpp> #include <boost/compute/utility/program_cache.hpp> @@ -203,6 +204,28 @@ struct meta_kernel_buffer_info size_t index; }; +struct meta_kernel_svm_info +{ + template <class T> + meta_kernel_svm_info(const svm_ptr<T> ptr, + const std::string &id, + memory_object::address_space addr_space, + size_t i) + : ptr(ptr.get()), + identifier(id), + address_space(addr_space), + index(i) + { + + } + + void* ptr; + std::string identifier; + memory_object::address_space address_space; + size_t index; +}; + + class meta_kernel; template<class Type> @@ -280,12 +303,14 @@ public: meta_kernel(const meta_kernel &other) { m_source.str(other.m_source.str()); + m_options = other.m_options; } meta_kernel& operator=(const meta_kernel &other) { if(this != &other){ m_source.str(other.m_source.str()); + m_options = other.m_options; } return *this; @@ -342,9 +367,11 @@ public: boost::shared_ptr<program_cache> cache = program_cache::get_global_cache(context); + std::string compile_options = m_options + options; + // load (or build) program from cache ::boost::compute::program program = - cache->get_or_build(cache_key, options, source, context); + cache->get_or_build(cache_key, compile_options, source, context); // create kernel ::boost::compute::kernel kernel = program.create_kernel(name()); @@ -365,6 +392,13 @@ public: kernel.set_arg(bi.index, bi.m_mem); } + // bind svm args + for(size_t i = 0; i < m_stored_svm_ptrs.size(); i++){ + const detail::meta_kernel_svm_info &spi = m_stored_svm_ptrs[i]; + + kernel.set_arg_svm_ptr(spi.index, spi.ptr); + } + return kernel; } @@ -689,6 +723,45 @@ public: return identifier; } + template<class T> + std::string get_svm_identifier(const svm_ptr<T> &svm_ptr, + const memory_object::address_space address_space = + memory_object::global_memory) + { + BOOST_ASSERT( + (address_space == memory_object::global_memory) + || (address_space == memory_object::constant_memory) + ); + + // check if we've already seen this pointer + for(size_t i = 0; i < m_stored_svm_ptrs.size(); i++){ + const detail::meta_kernel_svm_info &spi = m_stored_svm_ptrs[i]; + + if(spi.ptr == svm_ptr.get() && + spi.address_space == address_space){ + return spi.identifier; + } + } + + // create a new binding + std::string identifier = + "_svm_ptr" + lexical_cast<std::string>(m_stored_svm_ptrs.size()); + size_t index = add_arg<T *>(address_space, identifier); + + if(m_stored_svm_ptrs.empty()) { + m_options += std::string(" -cl-std=CL2.0"); + } + + // store new svm pointer info + m_stored_svm_ptrs.push_back( + detail::meta_kernel_svm_info( + svm_ptr, identifier, address_space, index + ) + ); + + return identifier; + } + std::string get_image_identifier(const char *qualifiers, const image2d &image) { size_t index = add_arg_with_qualifiers<image2d>(qualifiers, "image"); @@ -880,8 +953,10 @@ private: std::set<std::string> m_external_function_names; std::vector<std::string> m_args; std::string m_pragmas; + std::string m_options; 
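+    // m_options collects extra compile options (e.g. "-cl-std=CL2.0", added
+    // when SVM pointers are bound) and is prepended to the user-supplied
+    // options in compile()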
std::vector<detail::meta_kernel_stored_arg> m_stored_args; std::vector<detail::meta_kernel_buffer_info> m_stored_buffers; + std::vector<detail::meta_kernel_svm_info> m_stored_svm_ptrs; }; template<class ResultType, class ArgTuple> @@ -960,6 +1035,18 @@ inline meta_kernel& operator<<(meta_kernel &kernel, } } +// SVM requires OpenCL 2.0 +#if defined(CL_VERSION_2_0) || defined(BOOST_COMPUTE_DOXYGEN_INVOKED) +template<class T, class IndexExpr> +inline meta_kernel& operator<<(meta_kernel &kernel, + const svm_ptr_index_expr<T, IndexExpr> &expr) +{ + return kernel << + kernel.get_svm_identifier<T>(expr.m_svm_ptr) << + '[' << expr.m_expr << ']'; +} +#endif + template<class Predicate, class Arg> inline meta_kernel& operator<<(meta_kernel &kernel, const invoked_unary_negate_function<Predicate, diff --git a/boost/compute/detail/parameter_cache.hpp b/boost/compute/detail/parameter_cache.hpp index 2a856311e1..0a16cd9b0e 100644 --- a/boost/compute/detail/parameter_cache.hpp +++ b/boost/compute/detail/parameter_cache.hpp @@ -161,7 +161,7 @@ private: try { read_json(m_file_name, pt); } - catch(boost::property_tree::json_parser::json_parser_error &e){ + catch(boost::property_tree::json_parser::json_parser_error&){ // no saved cache file, ignore return; } diff --git a/boost/compute/detail/vendor.hpp b/boost/compute/detail/vendor.hpp index 0aa9c9c0d4..e2359cb466 100644 --- a/boost/compute/detail/vendor.hpp +++ b/boost/compute/detail/vendor.hpp @@ -31,6 +31,18 @@ inline bool is_amd_device(const device &device) return device.platform().vendor() == "Advanced Micro Devices, Inc."; } +// returns true if the platform is Apple OpenCL platform +inline bool is_apple_platform(const platform &platform) +{ + return platform.name() == "Apple"; +} + +// returns true if the device is from Apple OpenCL Platform +inline bool is_apple_platform_device(const device &device) +{ + return is_apple_platform(device.platform()); +} + } // end detail namespace } // end compute namespace } // end boost namespace diff --git a/boost/compute/exception/context_error.hpp b/boost/compute/exception/context_error.hpp index eeb387d884..ec8dc7c30f 100644 --- a/boost/compute/exception/context_error.hpp +++ b/boost/compute/exception/context_error.hpp @@ -70,7 +70,7 @@ public: } /// Returns the size of the private info memory block. - const size_t get_private_info_size() const throw() + size_t get_private_info_size() const throw() { return m_private_info_size; } diff --git a/boost/compute/function.hpp b/boost/compute/function.hpp index e83f16808a..b0b893e948 100644 --- a/boost/compute/function.hpp +++ b/boost/compute/function.hpp @@ -215,7 +215,7 @@ public: { BOOST_STATIC_ASSERT_MSG( arity == 3, - "Non-ternary function invoked with two arguments" + "Non-ternary function invoked with three arguments" ); return detail::invoked_function<result_type, boost::tuple<Arg1, Arg2, Arg3> >( diff --git a/boost/compute/interop/opengl/acquire.hpp b/boost/compute/interop/opengl/acquire.hpp index 10af4338fb..b9259e0d88 100644 --- a/boost/compute/interop/opengl/acquire.hpp +++ b/boost/compute/interop/opengl/acquire.hpp @@ -14,6 +14,7 @@ #include <boost/compute/command_queue.hpp> #include <boost/compute/interop/opengl/cl_gl.hpp> #include <boost/compute/interop/opengl/opengl_buffer.hpp> +#include <boost/compute/types/fundamental.hpp> #include <boost/compute/utility/wait_list.hpp> namespace boost { @@ -22,7 +23,7 @@ namespace compute { /// Enqueues a command to acquire the specified OpenGL memory objects. 
/// /// \see_opencl_ref{clEnqueueAcquireGLObjects} -inline event opengl_enqueue_acquire_gl_objects(size_t num_objects, +inline event opengl_enqueue_acquire_gl_objects(const uint_ num_objects, const cl_mem *mem_objects, command_queue &queue, const wait_list &events = wait_list()) @@ -47,7 +48,7 @@ inline event opengl_enqueue_acquire_gl_objects(size_t num_objects, /// Enqueues a command to release the specified OpenGL memory objects. /// /// \see_opencl_ref{clEnqueueReleaseGLObjects} -inline event opengl_enqueue_release_gl_objects(size_t num_objects, +inline event opengl_enqueue_release_gl_objects(const uint_ num_objects, const cl_mem *mem_objects, command_queue &queue, const wait_list &events = wait_list()) diff --git a/boost/compute/interop/opengl/context.hpp b/boost/compute/interop/opengl/context.hpp index 754dca2236..c35fedddfa 100644 --- a/boost/compute/interop/opengl/context.hpp +++ b/boost/compute/interop/opengl/context.hpp @@ -78,7 +78,7 @@ inline context opengl_create_shared_context() // load clGetGLContextInfoKHR() extension function GetGLContextInfoKHRFunction GetGLContextInfoKHR = reinterpret_cast<GetGLContextInfoKHRFunction>( - reinterpret_cast<unsigned long>( + reinterpret_cast<size_t>( platform.get_extension_function_address("clGetGLContextInfoKHR") ) ); diff --git a/boost/compute/kernel.hpp b/boost/compute/kernel.hpp index 9494e46de2..72f21a0378 100644 --- a/boost/compute/kernel.hpp +++ b/boost/compute/kernel.hpp @@ -22,7 +22,6 @@ #include <boost/compute/type_traits/is_fundamental.hpp> #include <boost/compute/detail/get_object_info.hpp> #include <boost/compute/detail/assert_cl_success.hpp> -#include <boost/compute/memory/svm_ptr.hpp> namespace boost { namespace compute { @@ -189,8 +188,15 @@ public: template<class T> T get_arg_info(size_t index, cl_kernel_arg_info info) const { - return detail::get_object_info<T>(clGetKernelArgInfo, m_kernel, info, index); + return detail::get_object_info<T>( + clGetKernelArgInfo, m_kernel, info, static_cast<cl_uint>(index) + ); } + + /// \overload + template<int Enum> + typename detail::get_object_info_type<kernel, Enum>::type + get_arg_info(size_t index) const; #endif // CL_VERSION_1_2 /// Returns work-group information for the kernel with \p device. 
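The new get_arg_info<Enum>() overload (specialized near the end of this file's diff) gives typed access to clGetKernelArgInfo. A hedged sketch; argument info requires OpenCL 1.2 and a program built with the "-cl-kernel-arg-info" option:

    #ifdef CL_VERSION_1_2
    #include <string>
    #include <boost/compute/kernel.hpp>

    // Returns the declared name of the kernel's first argument; an
    // opencl_error should be thrown if the program was not compiled
    // with "-cl-kernel-arg-info".
    std::string first_arg_name(const boost::compute::kernel &k)
    {
        return k.get_arg_info<CL_KERNEL_ARG_NAME>(0);
    }
    #endif // CL_VERSION_1_2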
@@ -258,15 +264,16 @@ public: } /// \internal_ - template<class T> - void set_arg(size_t index, const svm_ptr<T> ptr) + void set_arg_svm_ptr(size_t index, void* ptr) { #ifdef CL_VERSION_2_0 - cl_int ret = clSetKernelArgSVMPointer(m_kernel, index, ptr.get()); + cl_int ret = clSetKernelArgSVMPointer(m_kernel, static_cast<cl_uint>(index), ptr); if(ret != CL_SUCCESS){ BOOST_THROW_EXCEPTION(opencl_error(ret)); } #else + (void) index; + (void) ptr; BOOST_THROW_EXCEPTION(opencl_error(CL_INVALID_ARG_VALUE)); #endif } @@ -324,7 +331,7 @@ public: } private: - #ifndef BOOST_NO_VARIADIC_TEMPLATES + #ifndef BOOST_COMPUTE_NO_VARIADIC_TEMPLATES /// \internal_ template<size_t N> void _set_args() @@ -338,7 +345,7 @@ private: set_arg(N, arg); _set_args<N+1>(rest...); } - #endif // BOOST_NO_VARIADIC_TEMPLATES + #endif // BOOST_COMPUTE_NO_VARIADIC_TEMPLATES private: cl_kernel m_kernel; @@ -364,6 +371,23 @@ BOOST_COMPUTE_DETAIL_DEFINE_GET_INFO_SPECIALIZATIONS(kernel, ) #endif // CL_VERSION_1_2 +/// \internal_ define get_arg_info() specializations for kernel +#ifdef CL_VERSION_1_2 +#define BOOST_COMPUTE_DETAIL_DEFINE_KERNEL_GET_ARG_INFO_SPECIALIZATION(result_type, value) \ + namespace detail { \ + template<> struct get_object_info_type<kernel, value> { typedef result_type type; }; \ + } \ + template<> inline result_type kernel::get_arg_info<value>(size_t index) const { \ + return get_arg_info<result_type>(index, value); \ + } + +BOOST_COMPUTE_DETAIL_DEFINE_KERNEL_GET_ARG_INFO_SPECIALIZATION(cl_kernel_arg_address_qualifier, CL_KERNEL_ARG_ADDRESS_QUALIFIER) +BOOST_COMPUTE_DETAIL_DEFINE_KERNEL_GET_ARG_INFO_SPECIALIZATION(cl_kernel_arg_access_qualifier, CL_KERNEL_ARG_ACCESS_QUALIFIER) +BOOST_COMPUTE_DETAIL_DEFINE_KERNEL_GET_ARG_INFO_SPECIALIZATION(std::string, CL_KERNEL_ARG_TYPE_NAME) +BOOST_COMPUTE_DETAIL_DEFINE_KERNEL_GET_ARG_INFO_SPECIALIZATION(cl_kernel_arg_type_qualifier, CL_KERNEL_ARG_TYPE_QUALIFIER) +BOOST_COMPUTE_DETAIL_DEFINE_KERNEL_GET_ARG_INFO_SPECIALIZATION(std::string, CL_KERNEL_ARG_NAME) +#endif // CL_VERSION_1_2 + namespace detail { // set_kernel_arg implementation for built-in types diff --git a/boost/compute/memory/svm_ptr.hpp b/boost/compute/memory/svm_ptr.hpp index 2dbcb8f635..0c9d88035c 100644 --- a/boost/compute/memory/svm_ptr.hpp +++ b/boost/compute/memory/svm_ptr.hpp @@ -11,12 +11,67 @@ #ifndef BOOST_COMPUTE_MEMORY_SVM_PTR_HPP #define BOOST_COMPUTE_MEMORY_SVM_PTR_HPP +#include <boost/type_traits.hpp> +#include <boost/static_assert.hpp> +#include <boost/assert.hpp> + #include <boost/compute/cl.hpp> +#include <boost/compute/kernel.hpp> +#include <boost/compute/context.hpp> +#include <boost/compute/command_queue.hpp> #include <boost/compute/type_traits/is_device_iterator.hpp> namespace boost { namespace compute { +// forward declaration for svm_ptr<T> +template<class T> +class svm_ptr; + +// svm functions require OpenCL 2.0 +#if defined(CL_VERSION_2_0) || defined(BOOST_COMPUTE_DOXYGEN_INVOKED) +namespace detail { + +template<class T, class IndexExpr> +struct svm_ptr_index_expr +{ + typedef T result_type; + + svm_ptr_index_expr(const svm_ptr<T> &svm_ptr, + const IndexExpr &expr) + : m_svm_ptr(svm_ptr), + m_expr(expr) + { + } + + operator T() const + { + BOOST_STATIC_ASSERT_MSG(boost::is_integral<IndexExpr>::value, + "Index expression must be integral"); + + BOOST_ASSERT(m_svm_ptr.get()); + + const context &context = m_svm_ptr.get_context(); + const device &device = context.get_device(); + command_queue queue(context, device); + + T value; + T* ptr = + static_cast<T*>(m_svm_ptr.get()) + 
static_cast<std::ptrdiff_t>(m_expr);
+        queue.enqueue_svm_map(static_cast<void*>(ptr), sizeof(T), CL_MAP_READ);
+        value = *(ptr);
+        queue.enqueue_svm_unmap(static_cast<void*>(ptr)).wait();
+
+        return value;
+    }
+
+    const svm_ptr<T> &m_svm_ptr;
+    IndexExpr m_expr;
+};
+
+} // end detail namespace
+#endif
+
 template<class T>
 class svm_ptr
 {
@@ -32,19 +87,22 @@ public:
     {
     }
 
-    explicit svm_ptr(void *ptr)
-        : m_ptr(static_cast<T*>(ptr))
+    svm_ptr(void *ptr, const context &context)
+        : m_ptr(static_cast<T*>(ptr)),
+          m_context(context)
     {
     }
 
     svm_ptr(const svm_ptr<T> &other)
-        : m_ptr(other.m_ptr)
+        : m_ptr(other.m_ptr),
+          m_context(other.m_context)
     {
     }
 
-    svm_ptr& operator=(const svm_ptr<T> &other)
+    svm_ptr<T>& operator=(const svm_ptr<T> &other)
     {
         m_ptr = other.m_ptr;
+        m_context = other.m_context;
         return *this;
     }
 
@@ -59,18 +117,53 @@ public:
 
     svm_ptr<T> operator+(difference_type n)
     {
-        return svm_ptr<T>(m_ptr + n);
+        return svm_ptr<T>(m_ptr + n, m_context);
     }
 
     difference_type operator-(svm_ptr<T> other)
     {
+        BOOST_ASSERT(other.m_context == m_context);
         return m_ptr - other.m_ptr;
     }
 
+    const context& get_context() const
+    {
+        return m_context;
+    }
+
+    // svm functions require OpenCL 2.0
+    #if defined(CL_VERSION_2_0) || defined(BOOST_COMPUTE_DOXYGEN_INVOKED)
+    /// \internal_
+    template<class Expr>
+    detail::svm_ptr_index_expr<T, Expr>
+    operator[](const Expr &expr) const
+    {
+        BOOST_ASSERT(m_ptr);
+
+        return detail::svm_ptr_index_expr<T, Expr>(*this,
+                                                   expr);
+    }
+    #endif
+
 private:
     T *m_ptr;
+    context m_context;
 };
 
+namespace detail {
+
+/// \internal_
+template<class T>
+struct set_kernel_arg<svm_ptr<T> >
+{
+    void operator()(kernel &kernel_, size_t index, const svm_ptr<T> &ptr)
+    {
+        kernel_.set_arg_svm_ptr(index, ptr.get());
+    }
+};
+
+} // end detail namespace
+
 /// \internal_ (is_device_iterator specialization for svm_ptr)
 template<class T>
 struct is_device_iterator<svm_ptr<T> > : boost::true_type {};
diff --git a/boost/compute/random/bernoulli_distribution.hpp b/boost/compute/random/bernoulli_distribution.hpp
index edd1125090..50bf27f591 100644
--- a/boost/compute/random/bernoulli_distribution.hpp
+++ b/boost/compute/random/bernoulli_distribution.hpp
@@ -11,6 +11,9 @@
 #ifndef BOOST_COMPUTE_RANDOM_BERNOULLI_DISTRIBUTION_HPP
 #define BOOST_COMPUTE_RANDOM_BERNOULLI_DISTRIBUTION_HPP
 
+#include <boost/assert.hpp>
+#include <boost/type_traits.hpp>
+
 #include <boost/compute/command_queue.hpp>
 #include <boost/compute/function.hpp>
 #include <boost/compute/types/fundamental.hpp>
@@ -84,6 +87,11 @@ public:
 
 private:
     RealType m_p;
+
+    BOOST_STATIC_ASSERT_MSG(
+        boost::is_floating_point<RealType>::value,
+        "Template argument must be a floating point type"
+    );
 };
 
 } // end compute namespace
diff --git a/boost/compute/random/discrete_distribution.hpp b/boost/compute/random/discrete_distribution.hpp
index 3707928f98..86249538ac 100644
--- a/boost/compute/random/discrete_distribution.hpp
+++ b/boost/compute/random/discrete_distribution.hpp
@@ -11,6 +11,12 @@
 #ifndef BOOST_COMPUTE_RANDOM_DISCRETE_DISTRIBUTION_HPP
 #define BOOST_COMPUTE_RANDOM_DISCRETE_DISTRIBUTION_HPP
 
+#include <numeric>
+
+#include <boost/config.hpp>
+#include <boost/type_traits.hpp>
+#include <boost/static_assert.hpp>
+
 #include <boost/compute/command_queue.hpp>
 #include <boost/compute/function.hpp>
 #include <boost/compute/algorithm/accumulate.hpp>
@@ -38,23 +44,42 @@ class discrete_distribution
 public:
     typedef IntType result_type;
 
+    /// Creates a new discrete distribution with a single weight p = { 1 }.
+    /// This distribution produces only zeroes.
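+    /// This matches the behavior of std::discrete_distribution's default
+    /// constructor.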
diff --git a/boost/compute/random/bernoulli_distribution.hpp b/boost/compute/random/bernoulli_distribution.hpp
index edd1125090..50bf27f591 100644
--- a/boost/compute/random/bernoulli_distribution.hpp
+++ b/boost/compute/random/bernoulli_distribution.hpp
@@ -11,6 +11,9 @@
 #ifndef BOOST_COMPUTE_RANDOM_BERNOULLI_DISTRIBUTION_HPP
 #define BOOST_COMPUTE_RANDOM_BERNOULLI_DISTRIBUTION_HPP
 
+#include <boost/assert.hpp>
+#include <boost/type_traits.hpp>
+
 #include <boost/compute/command_queue.hpp>
 #include <boost/compute/function.hpp>
 #include <boost/compute/types/fundamental.hpp>
@@ -84,6 +87,11 @@ public:
 
 private:
     RealType m_p;
+
+    BOOST_STATIC_ASSERT_MSG(
+        boost::is_floating_point<RealType>::value,
+        "Template argument must be a floating point type"
+    );
 };
 
 } // end compute namespace
diff --git a/boost/compute/random/discrete_distribution.hpp b/boost/compute/random/discrete_distribution.hpp
index 3707928f98..86249538ac 100644
--- a/boost/compute/random/discrete_distribution.hpp
+++ b/boost/compute/random/discrete_distribution.hpp
@@ -11,6 +11,12 @@
 #ifndef BOOST_COMPUTE_RANDOM_DISCRETE_DISTRIBUTION_HPP
 #define BOOST_COMPUTE_RANDOM_DISCRETE_DISTRIBUTION_HPP
 
+#include <numeric>
+
+#include <boost/config.hpp>
+#include <boost/type_traits.hpp>
+#include <boost/static_assert.hpp>
+
 #include <boost/compute/command_queue.hpp>
 #include <boost/compute/function.hpp>
 #include <boost/compute/algorithm/accumulate.hpp>
@@ -38,23 +44,42 @@ class discrete_distribution
 public:
     typedef IntType result_type;
 
+    /// Creates a new discrete distribution with a single weight p = { 1 }.
+    /// This distribution produces only zeroes.
+    discrete_distribution()
+        : m_probabilities(1, double(1)),
+          m_scanned_probabilities(1, double(1))
+    {
+
+    }
+
     /// Creates a new discrete distribution with weights given by
-    /// the range [\p first, \p last)
+    /// the range [\p first, \p last).
     template<class InputIterator>
     discrete_distribution(InputIterator first, InputIterator last)
-        : m_n(std::distance(first, last)),
-          m_probabilities(std::distance(first, last))
+        : m_probabilities(first, last),
+          m_scanned_probabilities(std::distance(first, last))
     {
-        double sum = 0;
-
-        for(InputIterator iter = first; iter!=last; iter++)
-        {
-            sum += *iter;
+        if(first != last) {
+            // after this m_scanned_probabilities.back() is the sum of all
+            // weights from the range [first, last)
+            std::partial_sum(first, last, m_scanned_probabilities.begin());
+
+            std::vector<double>::iterator i = m_probabilities.begin();
+            std::vector<double>::iterator j = m_scanned_probabilities.begin();
+            for(; i != m_probabilities.end(); ++i, ++j)
+            {
+                // dividing each weight by the sum of all weights to
+                // get probabilities
+                *i = *i / m_scanned_probabilities.back();
+                // dividing each partial sum of weights by the sum of
+                // all weights to get partial sums of probabilities
+                *j = *j / m_scanned_probabilities.back();
+            }
         }
-
-        for(size_t i=0; i<m_n; i++)
-        {
-            m_probabilities[i] = m_probabilities[i-1] + first[i]/sum;
+        else {
+            m_probabilities.push_back(double(1));
+            m_scanned_probabilities.push_back(double(1));
         }
     }
 
@@ -63,19 +88,31 @@ public:
     {
     }
 
-    /// Returns the value of n
-    result_type n() const
-    {
-        return m_n;
-    }
-
     /// Returns the probabilities
     ::std::vector<double> probabilities() const
     {
        return m_probabilities;
     }
 
-    /// Generates uniformily distributed integers and stores
+    /// Returns the minimum potentially generated value.
+    result_type min BOOST_PREVENT_MACRO_SUBSTITUTION () const
+    {
+        return result_type(0);
+    }
+
+    /// Returns the maximum potentially generated value.
+    result_type max BOOST_PREVENT_MACRO_SUBSTITUTION () const
+    {
+        size_t type_max = static_cast<size_t>(
+            (std::numeric_limits<result_type>::max)()
+        );
+        if(m_probabilities.size() - 1 > type_max) {
+            return (std::numeric_limits<result_type>::max)();
+        }
+        return static_cast<result_type>(m_probabilities.size() - 1);
+    }
+
+    /// Generates uniformly distributed integers and stores
     /// them to the range [\p first, \p last).
     template<class OutputIterator, class Generator>
     void generate(OutputIterator first,
@@ -83,32 +120,38 @@ public:
                   Generator &generator,
                   command_queue &queue)
     {
-        std::string source = "inline uint scale_random(uint x)\n";
+        std::string source = "inline IntType scale_random(uint x)\n";
 
         source = source +
             "{\n" +
            "float rno = convert_float(x) / UINT_MAX;\n";
-        for(size_t i=0; i<m_n; i++)
+        for(size_t i = 0; i < m_scanned_probabilities.size() - 1; i++)
         {
             source = source +
-                "if(rno <= " + detail::make_literal<float>(m_probabilities[i]) + ")\n" +
+                "if(rno <= " + detail::make_literal<float>(m_scanned_probabilities[i]) + ")\n" +
                 "    return " + detail::make_literal(i) + ";\n";
         }
 
         source = source +
-            "return " + detail::make_literal(m_n - 1) + ";\n" +
+            "return " + detail::make_literal(m_scanned_probabilities.size() - 1) + ";\n" +
             "}\n";
 
         BOOST_COMPUTE_FUNCTION(IntType, scale_random, (const uint_ x), {});
 
         scale_random.set_source(source);
+        scale_random.define("IntType", type_name<IntType>());
 
         generator.generate(first, last, scale_random, queue);
     }
 
 private:
-    size_t m_n;
     ::std::vector<double> m_probabilities;
+    ::std::vector<double> m_scanned_probabilities;
+
+    BOOST_STATIC_ASSERT_MSG(
+        boost::is_integral<IntType>::value,
+        "Template argument must be integral"
+    );
 };
 
 } // end compute namespace
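With the rewritten constructor, the weights are normalized once on the host and the partial sums drive the comparison chain generated in scale_random(). A short sketch of typical usage (names and values are illustrative):

```cpp
#include <boost/compute/core.hpp>
#include <boost/compute/container/vector.hpp>
#include <boost/compute/random/default_random_engine.hpp>
#include <boost/compute/random/discrete_distribution.hpp>
#include <boost/compute/types/fundamental.hpp>

namespace compute = boost::compute;

int main()
{
    compute::device device = compute::system::default_device();
    compute::context context(device);
    compute::command_queue queue(context, device);

    // weights 1:2:1 are normalized to probabilities { 0.25, 0.5, 0.25 }
    int weights[] = { 1, 2, 1 };

    compute::default_random_engine engine(queue);
    compute::discrete_distribution<compute::uint_> distribution(
        weights, weights + 3
    );

    // fill a device vector with values drawn from { 0, 1, 2 }
    compute::vector<compute::uint_> vec(100, context);
    distribution.generate(vec.begin(), vec.end(), engine, queue);

    return 0;
}
```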
diff --git a/boost/compute/random/normal_distribution.hpp b/boost/compute/random/normal_distribution.hpp
index d025faeb2e..4693e4fffe 100644
--- a/boost/compute/random/normal_distribution.hpp
+++ b/boost/compute/random/normal_distribution.hpp
@@ -13,6 +13,9 @@
 
 #include <limits>
 
+#include <boost/assert.hpp>
+#include <boost/type_traits.hpp>
+
 #include <boost/compute/command_queue.hpp>
 #include <boost/compute/function.hpp>
 #include <boost/compute/types/fundamental.hpp>
@@ -90,11 +93,19 @@ public:
 
         BOOST_COMPUTE_FUNCTION(RealType2, box_muller, (const uint2_ x),
         {
-            const RealType x1 = x.x / (RealType) (UINT_MAX - 1);
-            const RealType x2 = x.y / (RealType) (UINT_MAX - 1);
+            const RealType one = 1;
+            const RealType two = 2;
+
+            // Use nextafter to push values down into the [0,1) range; without it,
+            // floating-point rounding can make x1 exactly 1, which would mean taking
+            // the log of 0 below and producing a negative infinity.
+            const RealType x1 = nextafter(x.x / (RealType) UINT_MAX, (RealType) 0);
+            const RealType x2 = x.y / (RealType) UINT_MAX;
 
-            const RealType z1 = sqrt(-2.f * log2(x1)) * cos(2.f * M_PI_F * x2);
-            const RealType z2 = sqrt(-2.f * log2(x1)) * sin(2.f * M_PI_F * x2);
+            const RealType rho = sqrt(-two * log(one-x1));
+
+            const RealType z1 = rho * cos(two * M_PI_F * x2);
+            const RealType z2 = rho * sin(two * M_PI_F * x2);
 
             return (RealType2)(MEAN, MEAN) + (RealType2)(z1, z2) * (RealType2)(STDDEV, STDDEV);
         });
@@ -116,6 +127,11 @@ public:
 private:
     RealType m_mean;
     RealType m_stddev;
+
+    BOOST_STATIC_ASSERT_MSG(
+        boost::is_floating_point<RealType>::value,
+        "Template argument must be a floating point type"
+    );
 };
 
 } // end compute namespace
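The corrected Box-Muller transform consumes two uniform 32-bit values per pair of outputs, and the nextafter() clamp keeps log(one-x1) finite. Typical usage might look like this (a sketch; names and parameters are illustrative):

```cpp
#include <boost/compute/core.hpp>
#include <boost/compute/container/vector.hpp>
#include <boost/compute/random/default_random_engine.hpp>
#include <boost/compute/random/normal_distribution.hpp>

namespace compute = boost::compute;

int main()
{
    compute::device device = compute::system::default_device();
    compute::context context(device);
    compute::command_queue queue(context, device);

    compute::default_random_engine engine(queue);

    // mean 0, standard deviation 1
    compute::normal_distribution<float> distribution(0.0f, 1.0f);

    // each pair of generated values comes from one Box-Muller transform
    compute::vector<float> vec(1000, context);
    distribution.generate(vec.begin(), vec.end(), engine, queue);

    return 0;
}
```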
diff --git a/boost/compute/random/uniform_int_distribution.hpp b/boost/compute/random/uniform_int_distribution.hpp
index 92e8b3305f..20448afec0 100644
--- a/boost/compute/random/uniform_int_distribution.hpp
+++ b/boost/compute/random/uniform_int_distribution.hpp
@@ -13,6 +13,9 @@
 
 #include <limits>
 
+#include <boost/type_traits.hpp>
+#include <boost/static_assert.hpp>
+
 #include <boost/compute/command_queue.hpp>
 #include <boost/compute/container/vector.hpp>
 #include <boost/compute/function.hpp>
@@ -103,6 +106,11 @@ public:
 private:
     IntType m_a;
     IntType m_b;
+
+    BOOST_STATIC_ASSERT_MSG(
+        boost::is_integral<IntType>::value,
+        "Template argument must be integral"
+    );
 };
 
 } // end compute namespace
diff --git a/boost/compute/random/uniform_real_distribution.hpp b/boost/compute/random/uniform_real_distribution.hpp
index 231b0dba01..75d9659d6a 100644
--- a/boost/compute/random/uniform_real_distribution.hpp
+++ b/boost/compute/random/uniform_real_distribution.hpp
@@ -11,6 +11,9 @@
 #ifndef BOOST_COMPUTE_RANDOM_UNIFORM_REAL_DISTRIBUTION_HPP
 #define BOOST_COMPUTE_RANDOM_UNIFORM_REAL_DISTRIBUTION_HPP
 
+#include <boost/assert.hpp>
+#include <boost/type_traits.hpp>
+
 #include <boost/compute/command_queue.hpp>
 #include <boost/compute/function.hpp>
 #include <boost/compute/detail/literal.hpp>
@@ -20,7 +23,7 @@ namespace boost {
 namespace compute {
 
 /// \class uniform_real_distribution
-/// \brief Produces uniformily distributed random floating-point numbers.
+/// \brief Produces uniformly distributed random floating-point numbers.
 ///
 /// The following example shows how to setup a uniform real distribution to
 /// produce random \c float values between \c 1 and \c 100.
@@ -36,10 +39,12 @@ public:
 
     /// Creates a new uniform distribution producing numbers in the range
     /// [\p a, \p b).
+    /// Requires a < b
     uniform_real_distribution(RealType a = 0.f, RealType b = 1.f)
         : m_a(a),
           m_b(b)
     {
+        BOOST_ASSERT(a < b);
     }
 
     /// Destroys the uniform_real_distribution object.
@@ -59,7 +64,7 @@ public:
         return m_b;
     }
 
-    /// Generates uniformily distributed floating-point numbers and stores
+    /// Generates uniformly distributed floating-point numbers and stores
     /// them to the range [\p first, \p last).
     template<class OutputIterator, class Generator>
     void generate(OutputIterator first,
@@ -69,7 +74,7 @@ public:
     {
         BOOST_COMPUTE_FUNCTION(RealType, scale_random, (const uint_ x),
         {
-            return LO + (convert_RealType(x) / MAX_RANDOM) * (HI - LO);
+            return nextafter(LO + (convert_RealType(x) / MAX_RANDOM) * (HI - LO), (RealType) LO);
        });
 
         scale_random.define("LO", detail::make_literal(m_a));
@@ -78,6 +83,7 @@ public:
         scale_random.define(
             "convert_RealType", std::string("convert_") + type_name<RealType>()
         );
+        scale_random.define("RealType", type_name<RealType>());
 
         generator.generate(
             first, last, scale_random, queue
@@ -97,6 +103,11 @@ public:
 private:
     RealType m_a;
     RealType m_b;
+
+    BOOST_STATIC_ASSERT_MSG(
+        boost::is_floating_point<RealType>::value,
+        "Template argument must be a floating point type"
+    );
 };
 
 } // end compute namespace
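The nextafter() call in scale_random() nudges each result toward LO, which keeps the upper bound exclusive even when rounding would otherwise produce exactly HI. Usage follows the pattern described in the class documentation (a sketch):

```cpp
#include <boost/compute/core.hpp>
#include <boost/compute/container/vector.hpp>
#include <boost/compute/random/default_random_engine.hpp>
#include <boost/compute/random/uniform_real_distribution.hpp>

namespace compute = boost::compute;

int main()
{
    compute::device device = compute::system::default_device();
    compute::context context(device);
    compute::command_queue queue(context, device);

    compute::default_random_engine engine(queue);

    // produces values in [1, 100); the constructor now asserts a < b
    compute::uniform_real_distribution<float> distribution(1.0f, 100.0f);

    compute::vector<float> vec(1000, context);
    distribution.generate(vec.begin(), vec.end(), engine, queue);

    return 0;
}
```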
diff --git a/boost/compute/svm.hpp b/boost/compute/svm.hpp
index d03c8d9079..4bc3a74237 100644
--- a/boost/compute/svm.hpp
+++ b/boost/compute/svm.hpp
@@ -15,7 +15,7 @@
 #include <boost/compute/context.hpp>
 #include <boost/compute/memory/svm_ptr.hpp>
 
-// svm functions require opencl 2.0
+// svm functions require OpenCL 2.0
 #if defined(CL_VERSION_2_0) || defined(BOOST_COMPUTE_DOXYGEN_INVOKED)
 
 namespace boost {
@@ -34,7 +34,10 @@ inline svm_ptr<T> svm_alloc(const context &context,
                             cl_svm_mem_flags flags = CL_MEM_READ_WRITE,
                             unsigned int alignment = 0)
 {
-    svm_ptr<T> ptr(clSVMAlloc(context.get(), flags, size * sizeof(T), alignment));
+    svm_ptr<T> ptr(
+        clSVMAlloc(context.get(), flags, size * sizeof(T), alignment),
+        context
+    );
     if(!ptr.get()){
         BOOST_THROW_EXCEPTION(opencl_error(CL_MEM_OBJECT_ALLOCATION_FAILURE));
     }
@@ -49,6 +52,13 @@ inline svm_ptr<T> svm_alloc(const context &context,
 ///
 /// \see svm_alloc(), command_queue::enqueue_svm_free()
 template<class T>
+inline void svm_free(svm_ptr<T> ptr)
+{
+    clSVMFree(ptr.get_context(), ptr.get());
+}
+
+/// \overload
+template<class T>
 inline void svm_free(const context &context, svm_ptr<T> ptr)
 {
     clSVMFree(context.get(), ptr.get());
diff --git a/boost/compute/system.hpp b/boost/compute/system.hpp
index f205fece7d..e6ed353d68 100644
--- a/boost/compute/system.hpp
+++ b/boost/compute/system.hpp
@@ -184,14 +184,16 @@ public:
         cl_uint count = 0;
         clGetPlatformIDs(0, 0, &count);
 
-        std::vector<cl_platform_id> platform_ids(count);
-        clGetPlatformIDs(count, &platform_ids[0], 0);
-
         std::vector<platform> platforms;
-        for(size_t i = 0; i < platform_ids.size(); i++){
-            platforms.push_back(platform(platform_ids[i]));
-        }
+        if(count > 0)
+        {
+            std::vector<cl_platform_id> platform_ids(count);
+            clGetPlatformIDs(count, &platform_ids[0], 0);
 
+            for(size_t i = 0; i < platform_ids.size(); i++){
+                platforms.push_back(platform(platform_ids[i]));
+            }
+        }
         return platforms;
     }
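The guard around the second clGetPlatformIDs() call means system::platforms() now returns an empty vector on hosts with no OpenCL platform installed, instead of indexing into an empty buffer. Callers can simply iterate over the result (a sketch):

```cpp
#include <iostream>
#include <vector>
#include <boost/compute/core.hpp>

namespace compute = boost::compute;

int main()
{
    // safe even when no OpenCL platform is installed: the vector is
    // empty and the loop body never runs
    std::vector<compute::platform> platforms = compute::system::platforms();

    for(size_t i = 0; i < platforms.size(); i++){
        std::cout << platforms[i].name() << std::endl;
    }

    return 0;
}
```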
diff --git a/boost/compute/utility/wait_list.hpp b/boost/compute/utility/wait_list.hpp
index 9a7e74bac0..8b81924d63 100644
--- a/boost/compute/utility/wait_list.hpp
+++ b/boost/compute/utility/wait_list.hpp
@@ -13,6 +13,12 @@
 
 #include <vector>
 
+#include <boost/compute/config.hpp>
+
+#ifndef BOOST_COMPUTE_NO_HDR_INITIALIZER_LIST
+#include <initializer_list>
+#endif
+
 #include <boost/compute/event.hpp>
 
 namespace boost {
@@ -27,7 +33,7 @@ template<class T> class future;
 /// specify dependencies for OpenCL operations or to wait on the host until
 /// all of the events have completed.
 ///
-/// This class also provides convenience fnuctions for interacting with
+/// This class also provides convenience functions for interacting with
 /// OpenCL APIs which typically accept event dependencies as a \c cl_event*
 /// pointer and a \c cl_uint size. For example:
 /// \code
@@ -60,6 +66,14 @@ public:
     {
     }
 
+    #ifndef BOOST_COMPUTE_NO_HDR_INITIALIZER_LIST
+    /// Creates a wait-list from \p events
+    wait_list(std::initializer_list<event> events)
+        : m_events(events)
+    {
+    }
+    #endif // BOOST_COMPUTE_NO_HDR_INITIALIZER_LIST
+
     /// Copies the events in the wait-list from \p other.
     wait_list& operator=(const wait_list &other)
     {
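With the initializer-list constructor, a wait-list can be assembled inline at the call site when compiling as C++11 or later. A brief sketch (evt1 and evt2 stand for events returned by earlier enqueue calls):

```cpp
#include <boost/compute/core.hpp>
#include <boost/compute/utility/wait_list.hpp>

namespace compute = boost::compute;

// block the host until both events have completed
void wait_for_both(compute::event evt1, compute::event evt2)
{
    compute::wait_list events = { evt1, evt2 };
    events.wait();
}
```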