summaryrefslogtreecommitdiff
path: root/runtimes/libs/ARMComputeEx/src/core/CL/cl_kernels/gather_ex.cl
blob: 09f776156d4ee333f8c2a090177e529d565220cf (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
/*
 * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
 * Copyright (c) 2017 ARM Limited.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include "helpers.h"

#if defined(DATA_TYPE) && defined(AXIS) && defined(INDICES_DIM)

/** Performs the Gather operation along the chosen axis
 * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g.
 *       -DDATA_TYPE=short
 * @note Axis should be given as a preprocessor argument using -DAXIS=axis. e.g. -DAXIS=1
 * @attention Output tensor depth should be given as a preprocessor argument using
 *            -DOUTPUT_DIM_Z=size. e.g. -DOUTPUT_DIM_Z=16
 * @attention Input tensor depth should be given as a preprocessor argument using
 *            -DINPUT_DIM_Z=size. e.g. -DINPUT_DIM_Z=16
 *
 * @param[in]  input_ptr                             Pointer to the source tensor. Supported data
 *                                                   types: U8/S8/U16/S16/U32/S32/F16/F32
 * @param[in]  input_stride_x                        Stride of the source tensor in X dimension (in
 *                                                   bytes)
 * @param[in]  input_step_x                          input_stride_x * number of elements along X
 *                                                   processed per work item (in bytes)
 * @param[in]  input_stride_y                        Stride of the source tensor in Y dimension (in
 *                                                   bytes)
 * @param[in]  input_step_y                          input_stride_y * number of elements along Y
 *                                                   processed per work item (in bytes)
 * @param[in]  input_stride_z                        Stride of the source tensor in Y dimension (in
 *                                                   bytes)
 * @param[in]  input_step_z                          input_stride_z * number of elements along Z
 *                                                   processed per work item (in bytes)
 * @param[in]  input_stride_w                        Stride of the source tensor in Z dimension (in
 *                                                   bytes)
 * @param[in]  input_step_w                          input_stride_w * number of elements along W
 *                                                   processed per work item (in bytes)
 * @param[in]  input_offset_first_element_in_bytes   Offset of the first element in the source
 *                                                   tensor
 * @param[in]  indices_ptr                           Pointer to the source tensor. Supported data
 *                                                   types: S32
 * @param[in]  indices_stride_x                      Stride of the source tensor in X dimension (in
 *                                                   bytes)
 * @param[in]  indices_step_x                        indices_stride_x * number of elements along X
 *                                                   processed per workitem(in  bytes)
 * @param[in]  indices_stride_y                      Stride of the source tensor in Y dimension (in
 *                                                   bytes)
 * @param[in]  indices_step_y                        indices_stride_y * number of elements along Y
 *                                                   processed per workitem(in  bytes)
 * @param[in]  indices_stride_z                      Stride of the source tensor in Z dimension (in
 *                                                   bytes)
 * @param[in]  indices_step_z                        indices_stride_z * number of elements along Z
 *                                                   processed per workitem(in  bytes)
 * @param[in]  indices_offset_first_element_in_bytes The offset of the first element in the
 *                                                   destination tensor
 * @param[out] output_ptr                            Pointer to the destination tensor. Supported
 *                                                   data types: same as @p input_ptr
 * @param[in]  output_stride_x                       Stride of the destination tensor in X dimension
 *                                                   (in bytes)
 * @param[in]  output_step_x                         output_stride_x * number of elements along X
 *                                                   processed per work item (in bytes)
 * @param[in]  output_stride_y                       Stride of the destination tensor in Y dimension
 *                                                   (in bytes)
 * @param[in]  output_step_y                         output_stride_y * number of elements along Y
 *                                                   processed per work item (in bytes)
 * @param[in]  output_stride_z                       Stride of the destination tensor in Z dimension
 *                                                   (in bytes)
 * @param[in]  output_step_z                         output_stride_z * number of elements along Z
 *                                                   processed per work item (in bytes)
 * @param[in]  output_stride_w                       Stride of the destination tensor in W dimension
 *                                                   (in bytes)
 * @param[in]  output_step_w                         output_stride_w * number of elements along W
 *                                                   processed per work item (in bytes)
 * @param[in]  output_offset_first_element_in_bytes  Offset of the first element in the destination
 *                                                   tensor
 */
__kernel void gather_ex(TENSOR4D_DECLARATION(input), TENSOR3D_DECLARATION(indices),
                        TENSOR4D_DECLARATION(output))
{
  const int px = get_global_id(0);
  const int py = get_global_id(1);
  const int pz = get_global_id(2) % OUTPUT_DIM_Z;
  const int pw = get_global_id(2) / OUTPUT_DIM_Z;

  const Tensor4D input = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, INPUT_DIM_Z);
  const Tensor3D indices = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(indices);
  Tensor4D output = CONVERT_TO_TENSOR4D_STRUCT(output, OUTPUT_DIM_Z);

#if AXIS == 0
#if INDICES_DIM == 1
  const uint index = *(__global const uint *)tensor3D_offset(&indices, px, 0, 0);
  __global const uchar *input_addr = tensor4D_offset(&input, index, py, pz, pw);
#elif INDICES_DIM == 2
  const uint index = *(__global const uint *)tensor3D_offset(&indices, px, py, 0);
  __global const uchar *input_addr = tensor4D_offset(&input, index, pz, pw, 0);
#elif INDICES_DIM == 3
  const uint index = *(__global const uint *)tensor3D_offset(&indices, px, py, pz);
  __global const uchar *input_addr = tensor4D_offset(&input, index, pw, 0, 0);
#endif
#elif AXIS == 1
#if INDICES_DIM == 1
  const uint index = *(__global const uint *)tensor3D_offset(&indices, py, 0, 0);
  __global const uchar *input_addr = tensor4D_offset(&input, px, index, pz, pw);
#elif INDICES_DIM == 2
  const uint index = *(__global const uint *)tensor3D_offset(&indices, py, pz, 0);
  __global const uchar *input_addr = tensor4D_offset(&input, px, index, pw, 0);
#elif INDICES_DIM == 3
  const uint index = *(__global const uint *)tensor3D_offset(&indices, py, pz, pw);
  __global const uchar *input_addr = tensor4D_offset(&input, px, index, 0, 0);
#endif
#elif AXIS == 2
#if INDICES_DIM == 1
  const uint index = *(__global const uint *)tensor3D_offset(&indices, pz, 0, 0);
  __global const uchar *input_addr = tensor4D_offset(&input, px, py, index, pw);
#elif INDICES_DIM == 2
  const uint index = *(__global const uint *)tensor3D_offset(&indices, pz, pw, 0);
  __global const uchar *input_addr = tensor4D_offset(&input, px, py, index, 0);
#endif
#elif AXIS == 3
#if INDICES_DIM == 1
  const uint index = *(__global const uint *)tensor3D_offset(&indices, pw, 0, 0);
  __global const uchar *input_addr = tensor4D_offset(&input, px, py, pz, index);
#endif
#endif // AXIS

  *(__global DATA_TYPE *)output.ptr = *((__global const DATA_TYPE *)input_addr);
}

#endif // defined(DATA_TYPE) && defined(AXIS) && defined(INDICES_DIM)