1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
|
/*
* Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
* Copyright (c) 2017 ARM Limited.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "helpers.h"
#if defined(DATA_TYPE) && defined(AXIS) && defined(INDICES_DIM)
/** Performs the Gather operation along the chosen axis
* @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g.
* -DDATA_TYPE=short
* @note Axis should be given as a preprocessor argument using -DAXIS=axis. e.g. -DAXIS=1
* @attention Output tensor depth should be given as a preprocessor argument using
* -DOUTPUT_DIM_Z=size. e.g. -DOUTPUT_DIM_Z=16
* @attention Input tensor depth should be given as a preprocessor argument using
* -DINPUT_DIM_Z=size. e.g. -DINPUT_DIM_Z=16
*
* @param[in] input_ptr Pointer to the source tensor. Supported data
* types: U8/S8/U16/S16/U32/S32/F16/F32
* @param[in] input_stride_x Stride of the source tensor in X dimension (in
* bytes)
* @param[in] input_step_x input_stride_x * number of elements along X
* processed per work item (in bytes)
* @param[in] input_stride_y Stride of the source tensor in Y dimension (in
* bytes)
* @param[in] input_step_y input_stride_y * number of elements along Y
* processed per work item (in bytes)
* @param[in] input_stride_z Stride of the source tensor in Y dimension (in
* bytes)
* @param[in] input_step_z input_stride_z * number of elements along Z
* processed per work item (in bytes)
* @param[in] input_stride_w Stride of the source tensor in Z dimension (in
* bytes)
* @param[in] input_step_w input_stride_w * number of elements along W
* processed per work item (in bytes)
* @param[in] input_offset_first_element_in_bytes Offset of the first element in the source
* tensor
* @param[in] indices_ptr Pointer to the source tensor. Supported data
* types: S32
* @param[in] indices_stride_x Stride of the source tensor in X dimension (in
* bytes)
* @param[in] indices_step_x indices_stride_x * number of elements along X
* processed per workitem(in bytes)
* @param[in] indices_stride_y Stride of the source tensor in Y dimension (in
* bytes)
* @param[in] indices_step_y indices_stride_y * number of elements along Y
* processed per workitem(in bytes)
* @param[in] indices_stride_z Stride of the source tensor in Z dimension (in
* bytes)
* @param[in] indices_step_z indices_stride_z * number of elements along Z
* processed per workitem(in bytes)
* @param[in] indices_offset_first_element_in_bytes The offset of the first element in the
* destination tensor
* @param[out] output_ptr Pointer to the destination tensor. Supported
* data types: same as @p input_ptr
* @param[in] output_stride_x Stride of the destination tensor in X dimension
* (in bytes)
* @param[in] output_step_x output_stride_x * number of elements along X
* processed per work item (in bytes)
* @param[in] output_stride_y Stride of the destination tensor in Y dimension
* (in bytes)
* @param[in] output_step_y output_stride_y * number of elements along Y
* processed per work item (in bytes)
* @param[in] output_stride_z Stride of the destination tensor in Z dimension
* (in bytes)
* @param[in] output_step_z output_stride_z * number of elements along Z
* processed per work item (in bytes)
* @param[in] output_stride_w Stride of the destination tensor in W dimension
* (in bytes)
* @param[in] output_step_w output_stride_w * number of elements along W
* processed per work item (in bytes)
* @param[in] output_offset_first_element_in_bytes Offset of the first element in the destination
* tensor
*/
__kernel void gather_ex(TENSOR4D_DECLARATION(input), TENSOR3D_DECLARATION(indices),
TENSOR4D_DECLARATION(output))
{
const int px = get_global_id(0);
const int py = get_global_id(1);
const int pz = get_global_id(2) % OUTPUT_DIM_Z;
const int pw = get_global_id(2) / OUTPUT_DIM_Z;
const Tensor4D input = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, INPUT_DIM_Z);
const Tensor3D indices = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(indices);
Tensor4D output = CONVERT_TO_TENSOR4D_STRUCT(output, OUTPUT_DIM_Z);
#if AXIS == 0
#if INDICES_DIM == 1
const uint index = *(__global const uint *)tensor3D_offset(&indices, px, 0, 0);
__global const uchar *input_addr = tensor4D_offset(&input, index, py, pz, pw);
#elif INDICES_DIM == 2
const uint index = *(__global const uint *)tensor3D_offset(&indices, px, py, 0);
__global const uchar *input_addr = tensor4D_offset(&input, index, pz, pw, 0);
#elif INDICES_DIM == 3
const uint index = *(__global const uint *)tensor3D_offset(&indices, px, py, pz);
__global const uchar *input_addr = tensor4D_offset(&input, index, pw, 0, 0);
#endif
#elif AXIS == 1
#if INDICES_DIM == 1
const uint index = *(__global const uint *)tensor3D_offset(&indices, py, 0, 0);
__global const uchar *input_addr = tensor4D_offset(&input, px, index, pz, pw);
#elif INDICES_DIM == 2
const uint index = *(__global const uint *)tensor3D_offset(&indices, py, pz, 0);
__global const uchar *input_addr = tensor4D_offset(&input, px, index, pw, 0);
#elif INDICES_DIM == 3
const uint index = *(__global const uint *)tensor3D_offset(&indices, py, pz, pw);
__global const uchar *input_addr = tensor4D_offset(&input, px, index, 0, 0);
#endif
#elif AXIS == 2
#if INDICES_DIM == 1
const uint index = *(__global const uint *)tensor3D_offset(&indices, pz, 0, 0);
__global const uchar *input_addr = tensor4D_offset(&input, px, py, index, pw);
#elif INDICES_DIM == 2
const uint index = *(__global const uint *)tensor3D_offset(&indices, pz, pw, 0);
__global const uchar *input_addr = tensor4D_offset(&input, px, py, index, 0);
#endif
#elif AXIS == 3
#if INDICES_DIM == 1
const uint index = *(__global const uint *)tensor3D_offset(&indices, pw, 0, 0);
__global const uchar *input_addr = tensor4D_offset(&input, px, py, pz, index);
#endif
#endif // AXIS
*(__global DATA_TYPE *)output.ptr = *((__global const DATA_TYPE *)input_addr);
}
#endif // defined(DATA_TYPE) && defined(AXIS) && defined(INDICES_DIM)
|