summaryrefslogtreecommitdiff
path: root/runtimes/libs/ARMComputeEx/src/core/CL/cl_kernels/topkv2_radixsort.cl
diff options
context:
space:
mode:
Diffstat (limited to 'runtimes/libs/ARMComputeEx/src/core/CL/cl_kernels/topkv2_radixsort.cl')
-rw-r--r--runtimes/libs/ARMComputeEx/src/core/CL/cl_kernels/topkv2_radixsort.cl269
1 files changed, 269 insertions, 0 deletions
diff --git a/runtimes/libs/ARMComputeEx/src/core/CL/cl_kernels/topkv2_radixsort.cl b/runtimes/libs/ARMComputeEx/src/core/CL/cl_kernels/topkv2_radixsort.cl
new file mode 100644
index 000000000..f6830d229
--- /dev/null
+++ b/runtimes/libs/ARMComputeEx/src/core/CL/cl_kernels/topkv2_radixsort.cl
@@ -0,0 +1,269 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// reference:
+// https://code.google.com/archive/p/ocl-radix-sort/source/default/source
+// OpenCL kernel sources for the CLRadixSort class
+// the #include does not exist in OpenCL
+// Copyright Philippe Helluy, Université de Strasbourg, France, 2011, helluy@math.unistra.fr
+// licensed under the GNU Lesser General Public License see http://www.gnu.org/copyleft/lesser.html
+// if you find this software usefull you can cite the following work in your reports or articles:
+// Philippe HELLUY, A portable implementation of the radix sort algorithm in OpenCL, 2011.
+// http://hal.archives-ouvertes.fr/hal-00596730
+
+// Reference for floating point radix sort:
+// http://www.codercorner.com/RadixSortRevisited.htm
+
+// compute the histogram for each radix and each virtual processor for the pass
+__kernel void radixsort_histogram(__global float *in_key_buf, __global int *d_Histograms,
+ const int pass, __local int *loc_histo, const int n)
+{
+ int it = get_local_id(0); // i local number of the processor
+ int ig = get_global_id(0); // global number = i + g I
+
+ int gr = get_group_id(0); // g group number
+
+ int groups = get_num_groups(0);
+ int items = get_local_size(0);
+
+ // set the local histograms to zero
+ for (int ir = 0; ir < _RADIX; ir++)
+ {
+ loc_histo[ir * items + it] = 0;
+ }
+
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ // range of keys that are analyzed by the work item
+ int size = n / groups / items; // size of the sub-list
+ int start = ig * size; // beginning of the sub-list
+
+ unsigned int key;
+ int shortkey, k;
+
+ // compute the index
+ // the computation depends on the transposition
+ for (int j = 0; j < size; j++)
+ {
+#ifdef TRANSPOSE
+ k = groups * items * j + ig;
+#else
+ k = j + start;
+#endif
+
+ key = *((__global unsigned int *)(in_key_buf + k));
+
+ // extract the group of _BITS bits of the pass
+ // the result is in the range 0.._RADIX-1
+ shortkey = ((key >> (pass * _BITS)) & (_RADIX - 1));
+
+ // increment the local histogram
+ loc_histo[shortkey * items + it]++;
+ }
+
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ // copy the local histogram to the global one
+ for (int ir = 0; ir < _RADIX; ir++)
+ {
+ d_Histograms[items * (ir * groups + gr) + it] = loc_histo[ir * items + it];
+ }
+
+ barrier(CLK_GLOBAL_MEM_FENCE);
+}
+
+// initial transpose of the list for improving
+// coalescent memory access
+__kernel void transpose(const __global int *invect, __global int *outvect, const int nbcol,
+ const int nbrow, const __global int *inperm, __global int *outperm,
+ __local int *blockmat, __local int *blockperm, const int tilesize)
+{
+
+ int i0 = get_global_id(0) * tilesize; // first row index
+ int j = get_global_id(1); // column index
+
+ int jloc = get_local_id(1); // local column index
+
+ // fill the cache
+ for (int iloc = 0; iloc < tilesize; iloc++)
+ {
+ int k = (i0 + iloc) * nbcol + j; // position in the matrix
+ blockmat[iloc * tilesize + jloc] = invect[k];
+#ifdef PERMUT
+ blockperm[iloc * tilesize + jloc] = inperm[k];
+#endif
+ }
+
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ // first row index in the transpose
+ int j0 = get_group_id(1) * tilesize;
+
+ // put the cache at the good place
+ for (int iloc = 0; iloc < tilesize; iloc++)
+ {
+ int kt = (j0 + iloc) * nbrow + i0 + jloc; // position in the transpose
+ outvect[kt] = blockmat[jloc * tilesize + iloc];
+#ifdef PERMUT
+ outperm[kt] = blockperm[jloc * tilesize + iloc];
+#endif
+ }
+}
+
+// each virtual processor reorders its data using the scanned histogram
+__kernel void radixsort_reorder(__global float *in_key, __global float *out_key,
+ __global int *d_Histograms, const int pass,
+ __global int *indices_in, __global int *indices_out,
+ __local int *loc_histo, const int n)
+{
+
+ int it = get_local_id(0);
+ int ig = get_global_id(0);
+
+ int gr = get_group_id(0);
+ int groups = get_num_groups(0);
+ int items = get_local_size(0);
+
+ int start = ig * (n / groups / items);
+ int size = n / groups / items;
+
+ // take the histogram in the cache
+ for (int ir = 0; ir < _RADIX; ir++)
+ {
+ loc_histo[ir * items + it] = d_Histograms[items * (ir * groups + gr) + it];
+ }
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ int newpos, shortkey, k, newpost;
+ unsigned int key;
+
+ for (int j = 0; j < size; j++)
+ {
+#ifdef TRANSPOSE
+ k = groups * items * j + ig;
+#else
+ k = j + start;
+#endif
+ float org_value = in_key[k];
+ key = *(__global unsigned int *)(in_key + k);
+ shortkey = ((key >> (pass * _BITS)) & (_RADIX - 1));
+
+ newpos = loc_histo[shortkey * items + it];
+
+#ifdef TRANSPOSE
+ int ignew, jnew;
+ ignew = newpos / (n / groups / items);
+ jnew = newpos % (n / groups / items);
+ newpost = jnew * (groups * items) + ignew;
+#else
+ newpost = newpos;
+#endif
+
+ // d_outKeys[newpost]= key; // killing line !!!
+ out_key[newpost] = org_value;
+
+#ifdef PERMUT
+ indices_out[newpost] = indices_in[k];
+#endif
+
+ newpos++;
+ loc_histo[shortkey * items + it] = newpos;
+ }
+}
+
+// perform a parallel prefix sum (a scan) on the local histograms
+// (see Blelloch 1990) each workitem worries about two memories
+// see also http://http.developer.nvidia.com/GPUGems3/gpugems3_ch39.html
+__kernel void radixsort_scanhistograms(__global int *histo, __local int *temp,
+ __global int *globsum)
+{
+ int it = get_local_id(0);
+ int ig = get_global_id(0);
+ int decale = 1;
+ int n = get_local_size(0) * 2;
+ int gr = get_group_id(0);
+
+ // load input into local memory
+ // up sweep phase
+ temp[2 * it] = histo[2 * ig];
+ temp[2 * it + 1] = histo[2 * ig + 1];
+
+ // parallel prefix sum (algorithm of Blelloch 1990)
+ for (int d = n >> 1; d > 0; d >>= 1)
+ {
+ barrier(CLK_LOCAL_MEM_FENCE);
+ if (it < d)
+ {
+ int ai = decale * (2 * it + 1) - 1;
+ int bi = decale * (2 * it + 2) - 1;
+ temp[bi] += temp[ai];
+ }
+ decale *= 2;
+ }
+
+ // store the last element in the global sum vector
+ // (maybe used in the next step for constructing the global scan)
+ // clear the last element
+ if (it == 0)
+ {
+ globsum[gr] = temp[n - 1];
+ temp[n - 1] = 0;
+ }
+
+ // down sweep phase
+ for (int d = 1; d < n; d *= 2)
+ {
+ decale >>= 1;
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ if (it < d)
+ {
+ int ai = decale * (2 * it + 1) - 1;
+ int bi = decale * (2 * it + 2) - 1;
+
+ int t = temp[ai];
+ temp[ai] = temp[bi];
+ temp[bi] += t;
+ }
+ }
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ // write results to device memory
+
+ histo[2 * ig] = temp[2 * it];
+ histo[2 * ig + 1] = temp[2 * it + 1];
+
+ barrier(CLK_GLOBAL_MEM_FENCE);
+}
+
+// use the global sum for updating the local histograms
+// each work item updates two values
+__kernel void radixsort_pastehistograms(__global int *histo, __global int *globsum)
+{
+ int ig = get_global_id(0);
+ int gr = get_group_id(0);
+
+ int s;
+
+ s = globsum[gr];
+
+ // write results to device memory
+ histo[2 * ig] += s;
+ histo[2 * ig + 1] += s;
+
+ barrier(CLK_GLOBAL_MEM_FENCE);
+}