/*
* Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
* Copyright (c) 2017 ARM Limited.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// reference:
// https://code.google.com/archive/p/ocl-radix-sort/source/default/source
// OpenCL kernel sources for the CLRadixSort class
// the #include does not exist in OpenCL
// Copyright Philippe Helluy, Université de Strasbourg, France, 2011, helluy@math.unistra.fr
// licensed under the GNU Lesser General Public License see http://www.gnu.org/copyleft/lesser.html
// if you find this software usefull you can cite the following work in your reports or articles:
// Philippe HELLUY, A portable implementation of the radix sort algorithm in OpenCL, 2011.
// http://hal.archives-ouvertes.fr/hal-00596730
// Reference for floating point radix sort:
// http://www.codercorner.com/RadixSortRevisited.htm
// compute the histogram for each radix and each virtual processor for the pass
// Per-pass radix histogram: every virtual processor counts how many of its
// keys fall into each of the _RADIX buckets for the digit selected by `pass`,
// then publishes its counts into the global histogram layout used by the scan.
__kernel void radixsort_histogram(__global float *in_key_buf, __global int *d_Histograms,
                                  const int pass, __local int *loc_histo, const int n)
{
    const int it = get_local_id(0);   // local processor index
    const int ig = get_global_id(0);  // global processor index
    const int gr = get_group_id(0);   // work-group index
    const int groups = get_num_groups(0);
    const int items = get_local_size(0);

    // Clear this work-group's local histogram (one row per radix value,
    // one column per work item).
    for (int radix = 0; radix < _RADIX; radix++)
    {
        loc_histo[radix * items + it] = 0;
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    // Each work item analyzes its own sub-list of the keys.
    const int sublist_len = n / groups / items;
    const int sublist_start = ig * sublist_len;

    for (int j = 0; j < sublist_len; j++)
    {
        // The key index depends on whether the list was transposed first.
#ifdef TRANSPOSE
        const int k = groups * items * j + ig;
#else
        const int k = sublist_start + j;
#endif
        // Reinterpret the float key's bit pattern as an unsigned integer so
        // the radix digit can be extracted (floating-point radix technique;
        // assumes the host has made key bit patterns monotonic — see the
        // codercorner reference at the top of this file).
        const unsigned int key = *((__global unsigned int *)(in_key_buf + k));
        // Digit of this pass, in the range 0 .. _RADIX-1.
        const int shortkey = ((key >> (pass * _BITS)) & (_RADIX - 1));
        loc_histo[shortkey * items + it]++;
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    // Copy the local histogram into the global layout expected by the
    // scan kernel: radix-major across groups, item-minor.
    for (int radix = 0; radix < _RADIX; radix++)
    {
        d_Histograms[items * (radix * groups + gr) + it] = loc_histo[radix * items + it];
    }
    barrier(CLK_GLOBAL_MEM_FENCE);
}
// initial transpose of the list for improving
// coalescent memory access
// Tiled transpose of the key vector (and, with PERMUT, the permutation
// vector) so that subsequent passes enjoy coalesced memory access.
__kernel void transpose(const __global int *invect, __global int *outvect, const int nbcol,
                        const int nbrow, const __global int *inperm, __global int *outperm,
                        __local int *blockmat, __local int *blockperm, const int tilesize)
{
    const int row0 = get_global_id(0) * tilesize; // first row handled by this item
    const int col = get_global_id(1);             // global column index
    const int lcol = get_local_id(1);             // column index inside the tile

    // Stage a tilesize x tilesize tile of the input into local memory.
    for (int r = 0; r < tilesize; r++)
    {
        const int src = (row0 + r) * nbcol + col; // position in the source matrix
        blockmat[r * tilesize + lcol] = invect[src];
#ifdef PERMUT
        blockperm[r * tilesize + lcol] = inperm[src];
#endif
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    // First row of the destination tile in the transposed matrix.
    const int trow0 = get_group_id(1) * tilesize;

    // Write the tile back with row/column swapped.
    for (int r = 0; r < tilesize; r++)
    {
        const int dst = (trow0 + r) * nbrow + row0 + lcol; // position in the transpose
        outvect[dst] = blockmat[lcol * tilesize + r];
#ifdef PERMUT
        outperm[dst] = blockperm[lcol * tilesize + r];
#endif
    }
}
// each virtual processor reorders its data using the scanned histogram
// Scatter pass: each virtual processor moves its keys to their final
// position for this digit, using the scanned (prefix-summed) histogram in
// local memory as a per-(digit, item) running write cursor.
__kernel void radixsort_reorder(__global float *in_key, __global float *out_key,
                                __global int *d_Histograms, const int pass,
                                __global int *indices_in, __global int *indices_out,
                                __local int *loc_histo, const int n)
{
    const int it = get_local_id(0);
    const int ig = get_global_id(0);
    const int gr = get_group_id(0);
    const int groups = get_num_groups(0);
    const int items = get_local_size(0);
    const int sublist_len = n / groups / items;
    const int sublist_start = ig * sublist_len;

    // Cache this work-group's slice of the scanned histogram.
    for (int radix = 0; radix < _RADIX; radix++)
    {
        loc_histo[radix * items + it] = d_Histograms[items * (radix * groups + gr) + it];
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    for (int j = 0; j < sublist_len; j++)
    {
#ifdef TRANSPOSE
        const int k = groups * items * j + ig;
#else
        const int k = sublist_start + j;
#endif
        const float org_value = in_key[k];
        // Bit pattern of the float key, used only to extract the radix digit.
        const unsigned int key = *(__global unsigned int *)(in_key + k);
        const int shortkey = ((key >> (pass * _BITS)) & (_RADIX - 1));

        // Next free slot for this digit, as tracked by the local histogram.
        const int cursor = loc_histo[shortkey * items + it];
#ifdef TRANSPOSE
        // Map the logical output position back through the transposition.
        const int ignew = cursor / sublist_len;
        const int jnew = cursor % sublist_len;
        const int dest = jnew * (groups * items) + ignew;
#else
        const int dest = cursor;
#endif
        // Write the original float value, not its integer bit pattern
        // (writing the raw bits here was the historical "killing line").
        out_key[dest] = org_value;
#ifdef PERMUT
        indices_out[dest] = indices_in[k];
#endif
        // Advance the write cursor for this digit.
        loc_histo[shortkey * items + it] = cursor + 1;
    }
}
// perform a parallel prefix sum (a scan) on the local histograms
// (see Blelloch 1990) each workitem worries about two memories
// see also http://http.developer.nvidia.com/GPUGems3/gpugems3_ch39.html
// Work-group-local exclusive prefix sum (Blelloch 1990) over 2*local_size
// histogram entries. Each work item owns two adjacent elements. The total
// of each group is emitted to globsum[group] so the host can scan the group
// totals and add them back via radixsort_pastehistograms, completing the
// global exclusive scan.
// NOTE(review): assumes 2*local_size is a power of two and that histo has
// at least 2*global_size entries — confirm against the host launch code.
__kernel void radixsort_scanhistograms(__global int *histo, __local int *temp,
__global int *globsum)
{
int it = get_local_id(0);
int ig = get_global_id(0);
int decale = 1; // stride between the two elements combined at each level
int n = get_local_size(0) * 2; // number of elements scanned by this group
int gr = get_group_id(0);
// load input into local memory: each work item stages its two elements
// up sweep phase
temp[2 * it] = histo[2 * ig];
temp[2 * it + 1] = histo[2 * ig + 1];
// parallel prefix sum (algorithm of Blelloch 1990)
// Up-sweep (reduce): build partial sums bottom-up; after this loop
// temp[n-1] holds the total and decale has grown to n.
for (int d = n >> 1; d > 0; d >>= 1)
{
barrier(CLK_LOCAL_MEM_FENCE);
if (it < d)
{
int ai = decale * (2 * it + 1) - 1;
int bi = decale * (2 * it + 2) - 1;
temp[bi] += temp[ai];
}
decale *= 2;
}
// store the last element in the global sum vector
// (maybe used in the next step for constructing the global scan)
// clear the last element
// (clearing makes the scan exclusive: element i receives the sum of
// elements 0..i-1)
if (it == 0)
{
globsum[gr] = temp[n - 1];
temp[n - 1] = 0;
}
// down sweep phase
// Down-sweep: push partial sums back down the implicit tree, swapping
// and accumulating, to turn the reduction tree into an exclusive scan.
for (int d = 1; d < n; d *= 2)
{
decale >>= 1;
barrier(CLK_LOCAL_MEM_FENCE);
if (it < d)
{
int ai = decale * (2 * it + 1) - 1;
int bi = decale * (2 * it + 2) - 1;
int t = temp[ai];
temp[ai] = temp[bi];
temp[bi] += t;
}
}
barrier(CLK_LOCAL_MEM_FENCE);
// write results to device memory
histo[2 * ig] = temp[2 * it];
histo[2 * ig + 1] = temp[2 * it + 1];
barrier(CLK_GLOBAL_MEM_FENCE);
}
// use the global sum for updating the local histograms
// each work item updates two values
// Paste step: add each work-group's scanned global offset (produced by
// scanning the globsum vector) back onto its locally scanned histogram
// entries, completing the global exclusive prefix sum. Each work item
// patches the same two elements it scanned earlier.
__kernel void radixsort_pastehistograms(__global int *histo, __global int *globsum)
{
    const int ig = get_global_id(0);
    // Offset of this group's block within the global scan.
    const int offset = globsum[get_group_id(0)];
    // write results to device memory
    histo[2 * ig] += offset;
    histo[2 * ig + 1] += offset;
    barrier(CLK_GLOBAL_MEM_FENCE);
}