path: root/inference-engine/thirdparty/clDNN/common/boost/1.64.0/include/boost-1_64/boost/compute/algorithm/detail/find_if_with_atomics.hpp
//---------------------------------------------------------------------------//
// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
//
// Distributed under the Boost Software License, Version 1.0
// See accompanying file LICENSE_1_0.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt
//
// See http://boostorg.github.com/compute for more information.
//---------------------------------------------------------------------------//

#ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_FIND_IF_WITH_ATOMICS_HPP
#define BOOST_COMPUTE_ALGORITHM_DETAIL_FIND_IF_WITH_ATOMICS_HPP
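
// Overview: each work-item evaluates the predicate on one or more input
// values and, on a match, lowers a single global result index with
// atomic_min(). The smallest matching position therefore wins regardless of
// the order in which work-items finish; when nothing matches, the index keeps
// its initial value of count and the search returns last.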

#include <cmath>
#include <iterator>

#include <boost/compute/types.hpp>
#include <boost/compute/functional.hpp>
#include <boost/compute/command_queue.hpp>
#include <boost/compute/container/detail/scalar.hpp>
#include <boost/compute/iterator/buffer_iterator.hpp>
#include <boost/compute/type_traits/type_name.hpp>
#include <boost/compute/detail/meta_kernel.hpp>
#include <boost/compute/detail/iterator_range_size.hpp>
#include <boost/compute/detail/parameter_cache.hpp>

namespace boost {
namespace compute {
namespace detail {

template<class InputIterator, class UnaryPredicate>
inline InputIterator find_if_with_atomics_one_vpt(InputIterator first,
                                                  InputIterator last,
                                                  UnaryPredicate predicate,
                                                  const size_t count,
                                                  command_queue &queue)
{
    typedef typename std::iterator_traits<InputIterator>::value_type value_type;
    typedef typename std::iterator_traits<InputIterator>::difference_type difference_type;

    const context &context = queue.get_context();

    detail::meta_kernel k("find_if");
    // the index buffer holds a uint_, so declare the kernel argument to match
    size_t index_arg = k.add_arg<uint_ *>(memory_object::global_memory, "index");
    atomic_min<uint_> atomic_min_uint;

    k << k.decl<const uint_>("i") << " = get_global_id(0);\n"
      << k.decl<const value_type>("value") << "="
      <<     first[k.var<const uint_>("i")] << ";\n"
      << "if(" << predicate(k.var<const value_type>("value")) << "){\n"
      << "    " << atomic_min_uint(k.var<uint_ *>("index"), k.var<uint_>("i")) << ";\n"
      << "}\n";

    kernel kernel = k.compile(context);

    scalar<uint_> index(context);
    kernel.set_arg(index_arg, index.get_buffer());

    // initialize index to the last iterator's index
    index.write(static_cast<uint_>(count), queue);
    queue.enqueue_1d_range_kernel(kernel, 0, count, 0);

    // read index and return iterator
    return first + static_cast<difference_type>(index.read(queue));
}
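
// For reference, the OpenCL source assembled above comes out roughly like the
// following (illustrative only, assuming a buffer_iterator<int> input and the
// predicate _1 == 42; the buffer argument name is whatever meta_kernel emits):
//
//     __kernel void find_if(__global uint *index, __global int *_buf)
//     {
//         const uint i = get_global_id(0);
//         const int value = _buf[i];
//         if(value == 42){
//             atomic_min(index, i);
//         }
//     }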

template<class InputIterator, class UnaryPredicate>
inline InputIterator find_if_with_atomics_multiple_vpt(InputIterator first,
                                                       InputIterator last,
                                                       UnaryPredicate predicate,
                                                       const size_t count,
                                                       const size_t vpt,
                                                       command_queue &queue)
{
    typedef typename std::iterator_traits<InputIterator>::value_type value_type;
    typedef typename std::iterator_traits<InputIterator>::difference_type difference_type;

    const context &context = queue.get_context();
    const device &device = queue.get_device();

    detail::meta_kernel k("find_if");
    size_t index_arg = k.add_arg<uint_ *>(memory_object::global_memory, "index");
    size_t count_arg = k.add_arg<const uint_>("count");
    size_t vpt_arg = k.add_arg<const uint_>("vpt");
    atomic_min<uint_> atomic_min_uint;

    // for GPUs each work-item strides through its range by the local size, so
    // in every iteration neighbouring work-items read neighbouring elements
    // and the reads from global memory are coalesced
    if(device.type() & device::gpu) {
        k <<
            k.decl<const uint_>("lsize") << " = get_local_size(0);\n" <<
            k.decl<uint_>("id") << " = get_local_id(0) + get_group_id(0) * lsize * vpt;\n" <<
            k.decl<const uint_>("end") << " = min(" <<
                    "id + (lsize *" << k.var<uint_>("vpt") << ")," <<
                    "count" <<
            ");\n" <<

            // early exit: if a match has already been recorded at an index
            // below this work-item's starting position, nothing in this range
            // can lower it any further
            "__local uint local_index;\n" <<
            "if(get_local_id(0) == 0){\n" <<
            "    local_index = *index;\n" <<
            "}\n" <<
            "barrier(CLK_LOCAL_MEM_FENCE);\n" <<
            "if(local_index < id){\n" <<
            "    return;\n" <<
            "}\n" <<

            "while(id < end){\n" <<
            "    " << k.decl<const value_type>("value") << " = " <<
                      first[k.var<const uint_>("id")] << ";\n"
            "    if(" << predicate(k.var<const value_type>("value")) << "){\n" <<
            "        " << atomic_min_uint(k.var<uint_ *>("index"),
                                          k.var<uint_>("id")) << ";\n" <<
            "        return;\n"
            "    }\n" <<
            "    id+=lsize;\n" <<
            "}\n";
    // for CPUs (and other devices) each work-item scans a contiguous block in
    // order, so the large cache is used efficiently.
    } else {
        k <<
            k.decl<uint_>("id") << " = get_global_id(0) * " << k.var<uint_>("vpt") << ";\n" <<
            k.decl<const uint_>("end") << " = min(" <<
                    "id + " << k.var<uint_>("vpt") << "," <<
                    "count" <<
            ");\n" <<
            "while(id < end && (*index) > id){\n" <<
            "    " << k.decl<const value_type>("value") << " = " <<
                      first[k.var<const uint_>("id")] << ";\n"
            "    if(" << predicate(k.var<const value_type>("value")) << "){\n" <<
            "        " << atomic_min_uint(k.var<uint_ *>("index"),
                                          k.var<uint_>("id")) << ";\n" <<
            "        return;\n" <<
            "    }\n" <<
            "    id++;\n" <<
            "}\n";
    }
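
    // For the GPU branch the assembled source comes out roughly like this
    // (illustrative only, assuming a buffer_iterator<int> input and the
    // predicate _1 == 42; argument names and ordering are whatever
    // meta_kernel actually emits):
    //
    //     __kernel void find_if(__global uint *index, const uint count,
    //                           const uint vpt, __global int *_buf)
    //     {
    //         const uint lsize = get_local_size(0);
    //         uint id = get_local_id(0) + get_group_id(0) * lsize * vpt;
    //         const uint end = min(id + (lsize * vpt), count);
    //
    //         __local uint local_index;
    //         if(get_local_id(0) == 0){
    //             local_index = *index;
    //         }
    //         barrier(CLK_LOCAL_MEM_FENCE);
    //         if(local_index < id){
    //             return;
    //         }
    //
    //         while(id < end){
    //             const int value = _buf[id];
    //             if(value == 42){
    //                 atomic_min(index, id);
    //                 return;
    //             }
    //             id += lsize;
    //         }
    //     }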

    kernel kernel = k.compile(context);

    scalar<uint_> index(context);
    kernel.set_arg(index_arg, index.get_buffer());
    kernel.set_arg(count_arg, static_cast<uint_>(count));
    kernel.set_arg(vpt_arg, static_cast<uint_>(vpt));

    // initialize index to the last iterator's index
    index.write(static_cast<uint_>(count), queue);

    const size_t global_wg_size = static_cast<size_t>(
        std::ceil(float(count) / vpt)
    );
    queue.enqueue_1d_range_kernel(kernel, 0, global_wg_size, 0);

    // read index and return iterator
    return first + static_cast<difference_type>(index.read(queue));
}

template<class InputIterator, class UnaryPredicate>
inline InputIterator find_if_with_atomics(InputIterator first,
                                          InputIterator last,
                                          UnaryPredicate predicate,
                                          command_queue &queue)
{
    typedef typename std::iterator_traits<InputIterator>::value_type value_type;

    size_t count = detail::iterator_range_size(first, last);
    if(count == 0){
        return last;
    }

    const device &device = queue.get_device();

    // load cached parameters
    std::string cache_key = std::string("__boost_find_if_with_atomics_")
        + type_name<value_type>();
    boost::shared_ptr<parameter_cache> parameters =
        detail::parameter_cache::get_global_cache(device);

    // for relatively small inputs on GPUs a kernel that checks one value per
    // thread (work-item) is more efficient than the multiple-values-per-thread
    // version
    if(device.type() & device::gpu){
        const size_t one_vpt_threshold =
            parameters->get(cache_key, "one_vpt_threshold", 1048576);
        if(count <= one_vpt_threshold){
            return find_if_with_atomics_one_vpt(
                first, last, predicate, count, queue
            );
        }
    }

    // values per thread
    size_t vpt;
    if(device.type() & device::gpu){
        // get vpt parameter
        vpt = parameters->get(cache_key, "vpt", 32);
    } else {
        // for CPUs work is split equally between compute units
        const size_t max_compute_units =
            device.get_info<CL_DEVICE_MAX_COMPUTE_UNITS>();
        vpt = static_cast<size_t>(
            std::ceil(float(count) / max_compute_units)
        );
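        // e.g. with count == 1,000,000 and 8 compute units this gives
        // vpt == 125,000; the global size computed later, ceil(count / vpt),
        // is then 8, i.e. one work-item per compute unit, each scanning a
        // contiguous block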
    }

    return find_if_with_atomics_multiple_vpt(
        first, last, predicate, count, vpt, queue
    );
}
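
// Usage sketch: the public boost::compute::find_if() forwards to
// find_if_with_atomics(). Assuming an existing compute::vector<int> vec and a
// command_queue queue, a call looks roughly like:
//
//     using boost::compute::lambda::_1;
//     buffer_iterator<int> it =
//         find_if_with_atomics(vec.begin(), vec.end(), _1 == 42, queue);
//     // it equals vec.end() when no element satisfies the predicate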

} // end detail namespace
} // end compute namespace
} // end boost namespace

#endif // BOOST_COMPUTE_ALGORITHM_DETAIL_FIND_IF_WITH_ATOMICS_HPP