summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--runtimes/libs/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp8
-rw-r--r--runtimes/libs/ARMComputeEx/src/core/CL/cl_kernels/topkv2_radixsort.cl269
2 files changed, 0 insertions, 277 deletions
diff --git a/runtimes/libs/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp b/runtimes/libs/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp
index 808125645..6525fb62e 100644
--- a/runtimes/libs/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp
+++ b/runtimes/libs/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp
@@ -65,10 +65,6 @@ const std::map<std::string, std::string> CLKernelLibraryEx::_kernel_program_map
{"topkv2_find_first_negative", "topkv2.cl"},
{"topkv2_reorder_negatives", "topkv2.cl"},
{"topkv2_store", "topkv2.cl"},
- {"radixsort_histogram", "topkv2_radixsort.cl"},
- {"radixsort_scanhistograms", "topkv2_radixsort.cl"},
- {"radixsort_pastehistograms", "topkv2_radixsort.cl"},
- {"radixsort_reorder", "topkv2_radixsort.cl"},
{"topkv2_quicksort", "topkv2_quicksort.cl"},
{"space_to_batch_4d_nchw", "space_to_batch.cl"},
{"space_to_batch_4d_nhwc", "space_to_batch.cl"},
@@ -143,10 +139,6 @@ const std::map<std::string, std::string> CLKernelLibraryEx::_program_source_map
#include "./cl_kernels/topkv2.clembed"
},
{
- "topkv2_radixsort.cl",
-#include "./cl_kernels/topkv2_radixsort.clembed"
- },
- {
"topkv2_quicksort.cl",
#include "./cl_kernels/topkv2_quicksort.clembed"
},
diff --git a/runtimes/libs/ARMComputeEx/src/core/CL/cl_kernels/topkv2_radixsort.cl b/runtimes/libs/ARMComputeEx/src/core/CL/cl_kernels/topkv2_radixsort.cl
deleted file mode 100644
index f6830d229..000000000
--- a/runtimes/libs/ARMComputeEx/src/core/CL/cl_kernels/topkv2_radixsort.cl
+++ /dev/null
@@ -1,269 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- * Copyright (c) 2017 ARM Limited.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-// reference:
-// https://code.google.com/archive/p/ocl-radix-sort/source/default/source
-// OpenCL kernel sources for the CLRadixSort class
-// the #include does not exist in OpenCL
-// Copyright Philippe Helluy, Université de Strasbourg, France, 2011, helluy@math.unistra.fr
-// licensed under the GNU Lesser General Public License see http://www.gnu.org/copyleft/lesser.html
-// if you find this software usefull you can cite the following work in your reports or articles:
-// Philippe HELLUY, A portable implementation of the radix sort algorithm in OpenCL, 2011.
-// http://hal.archives-ouvertes.fr/hal-00596730
-
-// Reference for floating point radix sort:
-// http://www.codercorner.com/RadixSortRevisited.htm
-
-// compute the histogram for each radix and each virtual processor for the pass
-__kernel void radixsort_histogram(__global float *in_key_buf, __global int *d_Histograms,
- const int pass, __local int *loc_histo, const int n)
-{
- int it = get_local_id(0); // i local number of the processor
- int ig = get_global_id(0); // global number = i + g I
-
- int gr = get_group_id(0); // g group number
-
- int groups = get_num_groups(0);
- int items = get_local_size(0);
-
- // set the local histograms to zero
- for (int ir = 0; ir < _RADIX; ir++)
- {
- loc_histo[ir * items + it] = 0;
- }
-
- barrier(CLK_LOCAL_MEM_FENCE);
-
- // range of keys that are analyzed by the work item
- int size = n / groups / items; // size of the sub-list
- int start = ig * size; // beginning of the sub-list
-
- unsigned int key;
- int shortkey, k;
-
- // compute the index
- // the computation depends on the transposition
- for (int j = 0; j < size; j++)
- {
-#ifdef TRANSPOSE
- k = groups * items * j + ig;
-#else
- k = j + start;
-#endif
-
- key = *((__global unsigned int *)(in_key_buf + k));
-
- // extract the group of _BITS bits of the pass
- // the result is in the range 0.._RADIX-1
- shortkey = ((key >> (pass * _BITS)) & (_RADIX - 1));
-
- // increment the local histogram
- loc_histo[shortkey * items + it]++;
- }
-
- barrier(CLK_LOCAL_MEM_FENCE);
-
- // copy the local histogram to the global one
- for (int ir = 0; ir < _RADIX; ir++)
- {
- d_Histograms[items * (ir * groups + gr) + it] = loc_histo[ir * items + it];
- }
-
- barrier(CLK_GLOBAL_MEM_FENCE);
-}
-
-// initial transpose of the list for improving
-// coalescent memory access
-__kernel void transpose(const __global int *invect, __global int *outvect, const int nbcol,
- const int nbrow, const __global int *inperm, __global int *outperm,
- __local int *blockmat, __local int *blockperm, const int tilesize)
-{
-
- int i0 = get_global_id(0) * tilesize; // first row index
- int j = get_global_id(1); // column index
-
- int jloc = get_local_id(1); // local column index
-
- // fill the cache
- for (int iloc = 0; iloc < tilesize; iloc++)
- {
- int k = (i0 + iloc) * nbcol + j; // position in the matrix
- blockmat[iloc * tilesize + jloc] = invect[k];
-#ifdef PERMUT
- blockperm[iloc * tilesize + jloc] = inperm[k];
-#endif
- }
-
- barrier(CLK_LOCAL_MEM_FENCE);
-
- // first row index in the transpose
- int j0 = get_group_id(1) * tilesize;
-
- // put the cache at the good place
- for (int iloc = 0; iloc < tilesize; iloc++)
- {
- int kt = (j0 + iloc) * nbrow + i0 + jloc; // position in the transpose
- outvect[kt] = blockmat[jloc * tilesize + iloc];
-#ifdef PERMUT
- outperm[kt] = blockperm[jloc * tilesize + iloc];
-#endif
- }
-}
-
-// each virtual processor reorders its data using the scanned histogram
-__kernel void radixsort_reorder(__global float *in_key, __global float *out_key,
- __global int *d_Histograms, const int pass,
- __global int *indices_in, __global int *indices_out,
- __local int *loc_histo, const int n)
-{
-
- int it = get_local_id(0);
- int ig = get_global_id(0);
-
- int gr = get_group_id(0);
- int groups = get_num_groups(0);
- int items = get_local_size(0);
-
- int start = ig * (n / groups / items);
- int size = n / groups / items;
-
- // take the histogram in the cache
- for (int ir = 0; ir < _RADIX; ir++)
- {
- loc_histo[ir * items + it] = d_Histograms[items * (ir * groups + gr) + it];
- }
- barrier(CLK_LOCAL_MEM_FENCE);
-
- int newpos, shortkey, k, newpost;
- unsigned int key;
-
- for (int j = 0; j < size; j++)
- {
-#ifdef TRANSPOSE
- k = groups * items * j + ig;
-#else
- k = j + start;
-#endif
- float org_value = in_key[k];
- key = *(__global unsigned int *)(in_key + k);
- shortkey = ((key >> (pass * _BITS)) & (_RADIX - 1));
-
- newpos = loc_histo[shortkey * items + it];
-
-#ifdef TRANSPOSE
- int ignew, jnew;
- ignew = newpos / (n / groups / items);
- jnew = newpos % (n / groups / items);
- newpost = jnew * (groups * items) + ignew;
-#else
- newpost = newpos;
-#endif
-
- // d_outKeys[newpost]= key; // killing line !!!
- out_key[newpost] = org_value;
-
-#ifdef PERMUT
- indices_out[newpost] = indices_in[k];
-#endif
-
- newpos++;
- loc_histo[shortkey * items + it] = newpos;
- }
-}
-
-// perform a parallel prefix sum (a scan) on the local histograms
-// (see Blelloch 1990) each workitem worries about two memories
-// see also http://http.developer.nvidia.com/GPUGems3/gpugems3_ch39.html
-__kernel void radixsort_scanhistograms(__global int *histo, __local int *temp,
- __global int *globsum)
-{
- int it = get_local_id(0);
- int ig = get_global_id(0);
- int decale = 1;
- int n = get_local_size(0) * 2;
- int gr = get_group_id(0);
-
- // load input into local memory
- // up sweep phase
- temp[2 * it] = histo[2 * ig];
- temp[2 * it + 1] = histo[2 * ig + 1];
-
- // parallel prefix sum (algorithm of Blelloch 1990)
- for (int d = n >> 1; d > 0; d >>= 1)
- {
- barrier(CLK_LOCAL_MEM_FENCE);
- if (it < d)
- {
- int ai = decale * (2 * it + 1) - 1;
- int bi = decale * (2 * it + 2) - 1;
- temp[bi] += temp[ai];
- }
- decale *= 2;
- }
-
- // store the last element in the global sum vector
- // (maybe used in the next step for constructing the global scan)
- // clear the last element
- if (it == 0)
- {
- globsum[gr] = temp[n - 1];
- temp[n - 1] = 0;
- }
-
- // down sweep phase
- for (int d = 1; d < n; d *= 2)
- {
- decale >>= 1;
- barrier(CLK_LOCAL_MEM_FENCE);
-
- if (it < d)
- {
- int ai = decale * (2 * it + 1) - 1;
- int bi = decale * (2 * it + 2) - 1;
-
- int t = temp[ai];
- temp[ai] = temp[bi];
- temp[bi] += t;
- }
- }
- barrier(CLK_LOCAL_MEM_FENCE);
-
- // write results to device memory
-
- histo[2 * ig] = temp[2 * it];
- histo[2 * ig + 1] = temp[2 * it + 1];
-
- barrier(CLK_GLOBAL_MEM_FENCE);
-}
-
-// use the global sum for updating the local histograms
-// each work item updates two values
-__kernel void radixsort_pastehistograms(__global int *histo, __global int *globsum)
-{
- int ig = get_global_id(0);
- int gr = get_group_id(0);
-
- int s;
-
- s = globsum[gr];
-
- // write results to device memory
- histo[2 * ig] += s;
- histo[2 * ig + 1] += s;
-
- barrier(CLK_GLOBAL_MEM_FENCE);
-}