summaryrefslogtreecommitdiff
path: root/libs/ARMComputeEx/src/core/CL/cl_kernels/topkv2_radixsort.cl
diff options
context:
space:
mode:
Diffstat (limited to 'libs/ARMComputeEx/src/core/CL/cl_kernels/topkv2_radixsort.cl')
-rw-r--r--libs/ARMComputeEx/src/core/CL/cl_kernels/topkv2_radixsort.cl271
1 files changed, 0 insertions, 271 deletions
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/topkv2_radixsort.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/topkv2_radixsort.cl
deleted file mode 100644
index c2c2d89a4..000000000
--- a/libs/ARMComputeEx/src/core/CL/cl_kernels/topkv2_radixsort.cl
+++ /dev/null
@@ -1,271 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2017 ARM Limited.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-// reference:
-// https://code.google.com/archive/p/ocl-radix-sort/source/default/source
-// OpenCL kernel sources for the CLRadixSort class
-// the #include does not exist in OpenCL
-// Copyright Philippe Helluy, Université de Strasbourg, France, 2011, helluy@math.unistra.fr
-// licensed under the GNU Lesser General Public License see http://www.gnu.org/copyleft/lesser.html
-// if you find this software usefull you can cite the following work in your reports or articles:
-// Philippe HELLUY, A portable implementation of the radix sort algorithm in OpenCL, 2011.
-// http://hal.archives-ouvertes.fr/hal-00596730
-
-// Reference for floating point radix sort:
-// http://www.codercorner.com/RadixSortRevisited.htm
-
-// compute the histogram for each radix and each virtual processor for the pass
-__kernel void radixsort_histogram(__global float* in_key_buf,
- __global int* d_Histograms,
- const int pass,
- __local int* loc_histo,
- const int n)
-{
- int it = get_local_id(0); // i local number of the processor
- int ig = get_global_id(0); // global number = i + g I
-
- int gr = get_group_id(0); // g group number
-
- int groups = get_num_groups(0);
- int items = get_local_size(0);
-
- // set the local histograms to zero
- for(int ir=0;ir<_RADIX;ir++){
- loc_histo[ir * items + it] = 0;
- }
-
- barrier(CLK_LOCAL_MEM_FENCE);
-
- // range of keys that are analyzed by the work item
- int size= n/groups/items; // size of the sub-list
- int start= ig * size; // beginning of the sub-list
-
- unsigned int key;
- int shortkey,k;
-
- // compute the index
- // the computation depends on the transposition
- for(int j = 0; j < size ; j++) {
-#ifdef TRANSPOSE
- k= groups * items * j + ig;
-#else
- k=j+start;
-#endif
-
- key = *((__global unsigned int*)(in_key_buf + k));
-
- // extract the group of _BITS bits of the pass
- // the result is in the range 0.._RADIX-1
- shortkey=(( key >> (pass * _BITS)) & (_RADIX-1));
-
- // increment the local histogram
- loc_histo[shortkey * items + it ]++;
- }
-
- barrier(CLK_LOCAL_MEM_FENCE);
-
- // copy the local histogram to the global one
- for(int ir=0;ir<_RADIX;ir++) {
- d_Histograms[items * (ir * groups + gr) + it] = loc_histo[ir * items + it];
- }
-
- barrier(CLK_GLOBAL_MEM_FENCE);
-}
-
-// initial transpose of the list for improving
-// coalescent memory access
-__kernel void transpose(const __global int* invect,
- __global int* outvect,
- const int nbcol,
- const int nbrow,
- const __global int* inperm,
- __global int* outperm,
- __local int* blockmat,
- __local int* blockperm,
- const int tilesize){
-
- int i0 = get_global_id(0)*tilesize; // first row index
- int j = get_global_id(1); // column index
-
- int jloc = get_local_id(1); // local column index
-
- // fill the cache
- for(int iloc=0;iloc<tilesize;iloc++){
- int k=(i0+iloc)*nbcol+j; // position in the matrix
- blockmat[iloc*tilesize+jloc]=invect[k];
-#ifdef PERMUT
- blockperm[iloc*tilesize+jloc]=inperm[k];
-#endif
- }
-
- barrier(CLK_LOCAL_MEM_FENCE);
-
- // first row index in the transpose
- int j0=get_group_id(1)*tilesize;
-
- // put the cache at the good place
- for(int iloc=0;iloc<tilesize;iloc++){
- int kt=(j0+iloc)*nbrow+i0+jloc; // position in the transpose
- outvect[kt]=blockmat[jloc*tilesize+iloc];
-#ifdef PERMUT
- outperm[kt]=blockperm[jloc*tilesize+iloc];
-#endif
- }
-
-}
-
-// each virtual processor reorders its data using the scanned histogram
-__kernel void radixsort_reorder(__global float* in_key,
- __global float* out_key,
- __global int* d_Histograms,
- const int pass,
- __global int* indices_in,
- __global int* indices_out,
- __local int* loc_histo,
- const int n){
-
- int it = get_local_id(0);
- int ig = get_global_id(0);
-
- int gr = get_group_id(0);
- int groups=get_num_groups(0);
- int items=get_local_size(0);
-
- int start= ig *(n/groups/items);
- int size= n/groups/items;
-
- // take the histogram in the cache
- for(int ir=0;ir<_RADIX;ir++){
- loc_histo[ir * items + it]=
- d_Histograms[items * (ir * groups + gr) + it];
- }
- barrier(CLK_LOCAL_MEM_FENCE);
-
- int newpos,shortkey,k,newpost;
- unsigned int key;
-
- for(int j= 0; j< size;j++){
-#ifdef TRANSPOSE
- k= groups * items * j + ig;
-#else
- k=j+start;
-#endif
- float org_value = in_key[k];
- key = *(__global unsigned int*)(in_key + k);
- shortkey=((key >> (pass * _BITS)) & (_RADIX-1));
-
- newpos=loc_histo[shortkey * items + it];
-
-#ifdef TRANSPOSE
- int ignew,jnew;
- ignew= newpos/(n/groups/items);
- jnew = newpos%(n/groups/items);
- newpost = jnew * (groups*items) + ignew;
-#else
- newpost=newpos;
-#endif
-
- //d_outKeys[newpost]= key; // killing line !!!
- out_key[newpost] = org_value;
-
-#ifdef PERMUT
- indices_out[newpost] = indices_in[k];
-#endif
-
- newpos++;
- loc_histo[shortkey * items + it]=newpos;
- }
-}
-
-// perform a parallel prefix sum (a scan) on the local histograms
-// (see Blelloch 1990) each workitem worries about two memories
-// see also http://http.developer.nvidia.com/GPUGems3/gpugems3_ch39.html
-__kernel void radixsort_scanhistograms(__global int* histo, __local int* temp, __global int* globsum)
-{
- int it = get_local_id(0);
- int ig = get_global_id(0);
- int decale = 1;
- int n=get_local_size(0) * 2 ;
- int gr=get_group_id(0);
-
- // load input into local memory
- // up sweep phase
- temp[2*it] = histo[2*ig];
- temp[2*it+1] = histo[2*ig+1];
-
- // parallel prefix sum (algorithm of Blelloch 1990)
- for (int d = n>>1; d > 0; d >>= 1){
- barrier(CLK_LOCAL_MEM_FENCE);
- if (it < d){
- int ai = decale*(2*it+1)-1;
- int bi = decale*(2*it+2)-1;
- temp[bi] += temp[ai];
- }
- decale *= 2;
- }
-
- // store the last element in the global sum vector
- // (maybe used in the next step for constructing the global scan)
- // clear the last element
- if (it == 0) {
- globsum[gr]=temp[n-1];
- temp[n - 1] = 0;
- }
-
- // down sweep phase
- for (int d = 1; d < n; d *= 2){
- decale >>= 1;
- barrier(CLK_LOCAL_MEM_FENCE);
-
- if (it < d){
- int ai = decale*(2*it+1)-1;
- int bi = decale*(2*it+2)-1;
-
- int t = temp[ai];
- temp[ai] = temp[bi];
- temp[bi] += t;
- }
-
- }
- barrier(CLK_LOCAL_MEM_FENCE);
-
- // write results to device memory
-
- histo[2*ig] = temp[2*it];
- histo[2*ig+1] = temp[2*it+1];
-
- barrier(CLK_GLOBAL_MEM_FENCE);
-
-}
-
-// use the global sum for updating the local histograms
-// each work item updates two values
-__kernel void radixsort_pastehistograms( __global int* histo,__global int* globsum)
-{
- int ig = get_global_id(0);
- int gr=get_group_id(0);
-
- int s;
-
- s=globsum[gr];
-
- // write results to device memory
- histo[2*ig] += s;
- histo[2*ig+1] += s;
-
- barrier(CLK_GLOBAL_MEM_FENCE);
-}