diff options
Diffstat (limited to 'libs/ARMComputeEx/src/core/CL/cl_kernels/topkv2_radixsort.cl')
-rw-r--r-- | libs/ARMComputeEx/src/core/CL/cl_kernels/topkv2_radixsort.cl | 271 |
1 files changed, 0 insertions, 271 deletions
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/topkv2_radixsort.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/topkv2_radixsort.cl deleted file mode 100644 index c2c2d89a4..000000000 --- a/libs/ARMComputeEx/src/core/CL/cl_kernels/topkv2_radixsort.cl +++ /dev/null @@ -1,271 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * Copyright (c) 2017 ARM Limited. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -// reference: -// https://code.google.com/archive/p/ocl-radix-sort/source/default/source -// OpenCL kernel sources for the CLRadixSort class -// the #include does not exist in OpenCL -// Copyright Philippe Helluy, Université de Strasbourg, France, 2011, helluy@math.unistra.fr -// licensed under the GNU Lesser General Public License see http://www.gnu.org/copyleft/lesser.html -// if you find this software usefull you can cite the following work in your reports or articles: -// Philippe HELLUY, A portable implementation of the radix sort algorithm in OpenCL, 2011. -// http://hal.archives-ouvertes.fr/hal-00596730 - -// Reference for floating point radix sort: -// http://www.codercorner.com/RadixSortRevisited.htm - -// compute the histogram for each radix and each virtual processor for the pass -__kernel void radixsort_histogram(__global float* in_key_buf, - __global int* d_Histograms, - const int pass, - __local int* loc_histo, - const int n) -{ - int it = get_local_id(0); // i local number of the processor - int ig = get_global_id(0); // global number = i + g I - - int gr = get_group_id(0); // g group number - - int groups = get_num_groups(0); - int items = get_local_size(0); - - // set the local histograms to zero - for(int ir=0;ir<_RADIX;ir++){ - loc_histo[ir * items + it] = 0; - } - - barrier(CLK_LOCAL_MEM_FENCE); - - // range of keys that are analyzed by the work item - int size= n/groups/items; // size of the sub-list - int start= ig * size; // beginning of the sub-list - - unsigned int key; - int shortkey,k; - - // compute the index - // the computation depends on the transposition - for(int j = 0; j < size ; j++) { -#ifdef TRANSPOSE - k= groups * items * j + ig; -#else - k=j+start; -#endif - - key = *((__global unsigned int*)(in_key_buf + k)); - - // extract the group of _BITS bits of the pass - // the result is in the range 0.._RADIX-1 - shortkey=(( key >> (pass * _BITS)) & (_RADIX-1)); - - // increment the local histogram - loc_histo[shortkey * items + it ]++; - } - - barrier(CLK_LOCAL_MEM_FENCE); - - // copy the local histogram to the global one - for(int ir=0;ir<_RADIX;ir++) { - d_Histograms[items * (ir * groups + gr) + it] = loc_histo[ir * items + it]; - } - - barrier(CLK_GLOBAL_MEM_FENCE); -} - -// initial transpose of the list for improving -// coalescent memory access -__kernel void transpose(const __global int* invect, - __global int* outvect, - const int nbcol, - const int nbrow, - const __global int* inperm, - __global int* outperm, - __local int* blockmat, - __local int* blockperm, - const int tilesize){ - - int i0 = get_global_id(0)*tilesize; // first row index - int j = get_global_id(1); // column index - - int jloc = get_local_id(1); // local column index - - // fill the cache - for(int iloc=0;iloc<tilesize;iloc++){ - int k=(i0+iloc)*nbcol+j; // position in the matrix - blockmat[iloc*tilesize+jloc]=invect[k]; -#ifdef PERMUT - blockperm[iloc*tilesize+jloc]=inperm[k]; -#endif - } - - barrier(CLK_LOCAL_MEM_FENCE); - - // first row index in the transpose - int j0=get_group_id(1)*tilesize; - - // put the cache at the good place - for(int iloc=0;iloc<tilesize;iloc++){ - int kt=(j0+iloc)*nbrow+i0+jloc; // position in the transpose - outvect[kt]=blockmat[jloc*tilesize+iloc]; -#ifdef PERMUT - outperm[kt]=blockperm[jloc*tilesize+iloc]; -#endif - } - -} - -// each virtual processor reorders its data using the scanned histogram -__kernel void radixsort_reorder(__global float* in_key, - __global float* out_key, - __global int* d_Histograms, - const int pass, - __global int* indices_in, - __global int* indices_out, - __local int* loc_histo, - const int n){ - - int it = get_local_id(0); - int ig = get_global_id(0); - - int gr = get_group_id(0); - int groups=get_num_groups(0); - int items=get_local_size(0); - - int start= ig *(n/groups/items); - int size= n/groups/items; - - // take the histogram in the cache - for(int ir=0;ir<_RADIX;ir++){ - loc_histo[ir * items + it]= - d_Histograms[items * (ir * groups + gr) + it]; - } - barrier(CLK_LOCAL_MEM_FENCE); - - int newpos,shortkey,k,newpost; - unsigned int key; - - for(int j= 0; j< size;j++){ -#ifdef TRANSPOSE - k= groups * items * j + ig; -#else - k=j+start; -#endif - float org_value = in_key[k]; - key = *(__global unsigned int*)(in_key + k); - shortkey=((key >> (pass * _BITS)) & (_RADIX-1)); - - newpos=loc_histo[shortkey * items + it]; - -#ifdef TRANSPOSE - int ignew,jnew; - ignew= newpos/(n/groups/items); - jnew = newpos%(n/groups/items); - newpost = jnew * (groups*items) + ignew; -#else - newpost=newpos; -#endif - - //d_outKeys[newpost]= key; // killing line !!! - out_key[newpost] = org_value; - -#ifdef PERMUT - indices_out[newpost] = indices_in[k]; -#endif - - newpos++; - loc_histo[shortkey * items + it]=newpos; - } -} - -// perform a parallel prefix sum (a scan) on the local histograms -// (see Blelloch 1990) each workitem worries about two memories -// see also http://http.developer.nvidia.com/GPUGems3/gpugems3_ch39.html -__kernel void radixsort_scanhistograms(__global int* histo, __local int* temp, __global int* globsum) -{ - int it = get_local_id(0); - int ig = get_global_id(0); - int decale = 1; - int n=get_local_size(0) * 2 ; - int gr=get_group_id(0); - - // load input into local memory - // up sweep phase - temp[2*it] = histo[2*ig]; - temp[2*it+1] = histo[2*ig+1]; - - // parallel prefix sum (algorithm of Blelloch 1990) - for (int d = n>>1; d > 0; d >>= 1){ - barrier(CLK_LOCAL_MEM_FENCE); - if (it < d){ - int ai = decale*(2*it+1)-1; - int bi = decale*(2*it+2)-1; - temp[bi] += temp[ai]; - } - decale *= 2; - } - - // store the last element in the global sum vector - // (maybe used in the next step for constructing the global scan) - // clear the last element - if (it == 0) { - globsum[gr]=temp[n-1]; - temp[n - 1] = 0; - } - - // down sweep phase - for (int d = 1; d < n; d *= 2){ - decale >>= 1; - barrier(CLK_LOCAL_MEM_FENCE); - - if (it < d){ - int ai = decale*(2*it+1)-1; - int bi = decale*(2*it+2)-1; - - int t = temp[ai]; - temp[ai] = temp[bi]; - temp[bi] += t; - } - - } - barrier(CLK_LOCAL_MEM_FENCE); - - // write results to device memory - - histo[2*ig] = temp[2*it]; - histo[2*ig+1] = temp[2*it+1]; - - barrier(CLK_GLOBAL_MEM_FENCE); - -} - -// use the global sum for updating the local histograms -// each work item updates two values -__kernel void radixsort_pastehistograms( __global int* histo,__global int* globsum) -{ - int ig = get_global_id(0); - int gr=get_group_id(0); - - int s; - - s=globsum[gr]; - - // write results to device memory - histo[2*ig] += s; - histo[2*ig+1] += s; - - barrier(CLK_GLOBAL_MEM_FENCE); -} |