/*
* Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
* Copyright (c) 2017 ARM Limited.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// reference:
// https://code.google.com/archive/p/ocl-radix-sort/source/default/source
// OpenCL kernel sources for the CLRadixSort class
// the #include does not exist in OpenCL
// Copyright Philippe Helluy, Université de Strasbourg, France, 2011, helluy@math.unistra.fr
// licensed under the GNU Lesser General Public License see http://www.gnu.org/copyleft/lesser.html
// if you find this software usefull you can cite the following work in your reports or articles:
// Philippe HELLUY, A portable implementation of the radix sort algorithm in OpenCL, 2011.
// http://hal.archives-ouvertes.fr/hal-00596730
// Reference for floating point radix sort:
// http://www.codercorner.com/RadixSortRevisited.htm
// compute the histogram for each radix and each virtual processor for the pass
// Per-pass radix histogram: every virtual processor counts how many of its
// keys fall into each of the _RADIX buckets for the digit selected by `pass`,
// then publishes its counts into the global histogram layout used by the scan.
__kernel void radixsort_histogram(__global float *in_key_buf, __global int *d_Histograms,
                                  const int pass, __local int *loc_histo, const int n)
{
    const int it = get_local_id(0);   // local processor index
    const int ig = get_global_id(0);  // global processor index
    const int gr = get_group_id(0);   // work-group index
    const int groups = get_num_groups(0);
    const int items = get_local_size(0);

    // Clear this work-group's local histogram (one row per radix value,
    // one column per work item).
    for (int radix = 0; radix < _RADIX; radix++)
    {
        loc_histo[radix * items + it] = 0;
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    // Each work item analyzes its own sub-list of the keys.
    const int sublist_len = n / groups / items;
    const int sublist_start = ig * sublist_len;

    for (int j = 0; j < sublist_len; j++)
    {
        // The key index depends on whether the list was transposed first.
#ifdef TRANSPOSE
        const int k = groups * items * j + ig;
#else
        const int k = sublist_start + j;
#endif
        // Reinterpret the float key's bit pattern as an unsigned integer so
        // the radix digit can be extracted (floating-point radix technique;
        // assumes the host has made key bit patterns monotonic — see the
        // codercorner reference at the top of this file).
        const unsigned int key = *((__global unsigned int *)(in_key_buf + k));
        // Digit of this pass, in the range 0 .. _RADIX-1.
        const int shortkey = ((key >> (pass * _BITS)) & (_RADIX - 1));
        loc_histo[shortkey * items + it]++;
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    // Copy the local histogram into the global layout expected by the
    // scan kernel: radix-major across groups, item-minor.
    for (int radix = 0; radix < _RADIX; radix++)
    {
        d_Histograms[items * (radix * groups + gr) + it] = loc_histo[radix * items + it];
    }
    barrier(CLK_GLOBAL_MEM_FENCE);
}
// initial transpose of the list for improving
// coalescent memory access
// Tiled transpose of the key vector (and, with PERMUT, the permutation
// vector) so that subsequent passes enjoy coalesced memory access.
__kernel void transpose(const __global int *invect, __global int *outvect, const int nbcol,
                        const int nbrow, const __global int *inperm, __global int *outperm,
                        __local int *blockmat, __local int *blockperm, const int tilesize)
{
    const int row0 = get_global_id(0) * tilesize; // first row handled by this item
    const int col = get_global_id(1);             // global column index
    const int lcol = get_local_id(1);             // column index inside the tile

    // Stage a tilesize x tilesize tile of the input into local memory.
    for (int r = 0; r < tilesize; r++)
    {
        const int src = (row0 + r) * nbcol + col; // position in the source matrix
        blockmat[r * tilesize + lcol] = invect[src];
#ifdef PERMUT
        blockperm[r * tilesize + lcol] = inperm[src];
#endif
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    // First row of the destination tile in the transposed matrix.
    const int trow0 = get_group_id(1) * tilesize;

    // Write the tile back with row/column swapped.
    for (int r = 0; r < tilesize; r++)
    {
        const int dst = (trow0 + r) * nbrow + row0 + lcol; // position in the transpose
        outvect[dst] = blockmat[lcol * tilesize + r];
#ifdef PERMUT
        outperm[dst] = blockperm[lcol * tilesize + r];
#endif
    }
}
// each virtual processor reorders its data using the scanned histogram
// Scatter pass: each virtual processor moves its keys to their final
// position for this digit, using the scanned (prefix-summed) histogram in
// local memory as a per-(digit, item) running write cursor.
__kernel void radixsort_reorder(__global float *in_key, __global float *out_key,
                                __global int *d_Histograms, const int pass,
                                __global int *indices_in, __global int *indices_out,
                                __local int *loc_histo, const int n)
{
    const int it = get_local_id(0);
    const int ig = get_global_id(0);
    const int gr = get_group_id(0);
    const int groups = get_num_groups(0);
    const int items = get_local_size(0);
    const int sublist_len = n / groups / items;
    const int sublist_start = ig * sublist_len;

    // Cache this work-group's slice of the scanned histogram.
    for (int radix = 0; radix < _RADIX; radix++)
    {
        loc_histo[radix * items + it] = d_Histograms[items * (radix * groups + gr) + it];
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    for (int j = 0; j < sublist_len; j++)
    {
#ifdef TRANSPOSE
        const int k = groups * items * j + ig;
#else
        const int k = sublist_start + j;
#endif
        const float org_value = in_key[k];
        // Bit pattern of the float key, used only to extract the radix digit.
        const unsigned int key = *(__global unsigned int *)(in_key + k);
        const int shortkey = ((key >> (pass * _BITS)) & (_RADIX - 1));

        // Next free slot for this digit, as tracked by the local histogram.
        const int cursor = loc_histo[shortkey * items + it];
#ifdef TRANSPOSE
        // Map the logical output position back through the transposition.
        const int ignew = cursor / sublist_len;
        const int jnew = cursor % sublist_len;
        const int dest = jnew * (groups * items) + ignew;
#else
        const int dest = cursor;
#endif
        // Write the original float value, not its integer bit pattern
        // (writing the raw bits here was the historical "killing line").
        out_key[dest] = org_value;
#ifdef PERMUT
        indices_out[dest] = indices_in[k];
#endif
        // Advance the write cursor for this digit.
        loc_histo[shortkey * items + it] = cursor + 1;
    }
}
// perform a parallel prefix sum (a scan) on the local histograms
// (see Blelloch 1990) each workitem worries about two memories
// see also http://http.developer.nvidia.com/GPUGems3/gpugems3_ch39.html
// Work-group-local exclusive prefix sum (Blelloch 1990) over 2*local_size
// histogram entries. Each work item owns two adjacent elements. The total
// of each group is emitted to globsum[group] so the host can scan the group
// totals and add them back via radixsort_pastehistograms, completing the
// global exclusive scan.
// NOTE(review): assumes 2*local_size is a power of two and that histo has
// at least 2*global_size entries — confirm against the host launch code.
__kernel void radixsort_scanhistograms(__global int *histo, __local int *temp,
__global int *globsum)
{
int it = get_local_id(0);
int ig = get_global_id(0);
int decale = 1; // stride between the two elements combined at each level
int n = get_local_size(0) * 2; // number of elements scanned by this group
int gr = get_group_id(0);
// load input into local memory: each work item stages its two elements
// up sweep phase
temp[2 * it] = histo[2 * ig];
temp[2 * it + 1] = histo[2 * ig + 1];
// parallel prefix sum (algorithm of Blelloch 1990)
// Up-sweep (reduce): build partial sums bottom-up; after this loop
// temp[n-1] holds the total and decale has grown to n.
for (int d = n >> 1; d > 0; d >>= 1)
{
barrier(CLK_LOCAL_MEM_FENCE);
if (it < d)
{
int ai = decale * (2 * it + 1) - 1;
int bi = decale * (2 * it + 2) - 1;
temp[bi] += temp[ai];
}
decale *= 2;
}
// store the last element in the global sum vector
// (maybe used in the next step for constructing the global scan)
// clear the last element
// (clearing makes the scan exclusive: element i receives the sum of
// elements 0..i-1)
if (it == 0)
{
globsum[gr] = temp[n - 1];
temp[n - 1] = 0;
}
// down sweep phase
// Down-sweep: push partial sums back down the implicit tree, swapping
// and accumulating, to turn the reduction tree into an exclusive scan.
for (int d = 1; d < n; d *= 2)
{
decale >>= 1;
barrier(CLK_LOCAL_MEM_FENCE);
if (it < d)
{
int ai = decale * (2 * it + 1) - 1;
int bi = decale * (2 * it + 2) - 1;
int t = temp[ai];
temp[ai] = temp[bi];
temp[bi] += t;
}
}
barrier(CLK_LOCAL_MEM_FENCE);
// write results to device memory
histo[2 * ig] = temp[2 * it];
histo[2 * ig + 1] = temp[2 * it + 1];
barrier(CLK_GLOBAL_MEM_FENCE);
}
// use the global sum for updating the local histograms
// each work item updates two values
// Paste step: add each work-group's scanned global offset (produced by
// scanning the globsum vector) back onto its locally scanned histogram
// entries, completing the global exclusive prefix sum. Each work item
// patches the same two elements it scanned earlier.
__kernel void radixsort_pastehistograms(__global int *histo, __global int *globsum)
{
    const int ig = get_global_id(0);
    // Offset of this group's block within the global scan.
    const int offset = globsum[get_group_id(0)];
    // write results to device memory
    histo[2 * ig] += offset;
    histo[2 * ig + 1] += offset;
    barrier(CLK_GLOBAL_MEM_FENCE);
}