Compute Library  18.05
hog.cl
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2017-2018 ARM Limited.
3  *
4  * SPDX-License-Identifier: MIT
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to
8  * deal in the Software without restriction, including without limitation the
9  * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10  * sell copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in all
14  * copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22  * SOFTWARE.
23  */
24 #include "helpers.h"
25 #include "types.h"
26 
27 #if defined(CELL_WIDTH) && defined(CELL_HEIGHT) && defined(NUM_BINS) && defined(PHASE_SCALE)
28 
62 {
63  float bins[NUM_BINS] = { 0 };
64 
65  // Compute address for the magnitude and phase images
66  Image mag = CONVERT_TO_IMAGE_STRUCT(mag);
68 
69  __global uchar *mag_row_ptr = mag.ptr;
70  __global uchar *phase_row_ptr = phase.ptr;
71 
72  for(int yc = 0; yc < CELL_HEIGHT; ++yc)
73  {
74  int xc = 0;
75  for(; xc <= (CELL_WIDTH - 4); xc += 4)
76  {
77  // Load magnitude and phase values
78  const float4 mag_f32 = convert_float4(vload4(0, (__global short *)mag_row_ptr + xc));
79  float4 phase_f32 = convert_float4(vload4(0, phase_row_ptr + xc));
80 
81  // Scale phase: phase * scale + 0.5f
82  phase_f32 = (float4)0.5f + phase_f32 * (float4)PHASE_SCALE;
83 
84  // Compute histogram index.
85  int4 hidx_s32 = convert_int4(phase_f32);
86 
87  // Compute magnitude weights (w0 and w1)
88  const float4 hidx_f32 = convert_float4(hidx_s32);
89 
90  // w1 = phase_f32 - hidx_s32
91  const float4 w1_f32 = phase_f32 - hidx_f32;
92 
93  // w0 = 1.0 - w1
94  const float4 w0_f32 = (float4)1.0f - w1_f32;
95 
96  // Calculate the weights for splitting vote
97  const float4 mag_w0_f32 = mag_f32 * w0_f32;
98  const float4 mag_w1_f32 = mag_f32 * w1_f32;
99 
100  // Weighted vote between 2 bins
101 
102  // Check if the histogram index is equal to NUM_BINS. If so, replace the index with 0
103  hidx_s32 = select(hidx_s32, (int4)0, hidx_s32 == (int4)(NUM_BINS));
104 
105  // Bin 0
106  bins[hidx_s32.s0] += mag_w0_f32.s0;
107  bins[hidx_s32.s1] += mag_w0_f32.s1;
108  bins[hidx_s32.s2] += mag_w0_f32.s2;
109  bins[hidx_s32.s3] += mag_w0_f32.s3;
110 
111  hidx_s32 += (int4)1;
112 
113  // Check if the histogram index is equal to NUM_BINS. If so, replace the index with 0
114  hidx_s32 = select(hidx_s32, (int4)0, hidx_s32 == (int4)(NUM_BINS));
115 
116  // Bin1
117  bins[hidx_s32.s0] += mag_w1_f32.s0;
118  bins[hidx_s32.s1] += mag_w1_f32.s1;
119  bins[hidx_s32.s2] += mag_w1_f32.s2;
120  bins[hidx_s32.s3] += mag_w1_f32.s3;
121  }
122 
123  // Left over computation
124  for(; xc < CELL_WIDTH; xc++)
125  {
126  const float mag_value = *((__global short *)mag_row_ptr + xc);
127  const float phase_value = *(phase_row_ptr + xc) * (float)PHASE_SCALE + 0.5f;
128  const float w1 = phase_value - floor(phase_value);
129 
130  // The quantised phase is the histogram index [0, NUM_BINS - 1]
131  // Check limit of histogram index. If hidx == NUM_BINS, hidx = 0
132  const uint hidx = (uint)(phase_value) % NUM_BINS;
133 
134  // Weighted vote between 2 bins
135  bins[hidx] += mag_value * (1.0f - w1);
136  bins[(hidx + 1) % NUM_BINS] += mag_value * w1;
137  }
138 
139  // Point to the next row of magnitude and phase images
140  mag_row_ptr += mag_stride_y;
141  phase_row_ptr += phase_stride_y;
142  }
143 
144  // Compute address for the destination image
146 
147  // Store the local HOG in the global memory
148  int xc = 0;
149  for(; xc <= (NUM_BINS - 4); xc += 4)
150  {
151  float4 values = vload4(0, bins + xc);
152 
153  vstore4(values, 0, ((__global float *)dst.ptr) + xc);
154  }
155 
156  // Left over stores
157  for(; xc < NUM_BINS; ++xc)
158  {
159  ((__global float *)dst.ptr)[xc] = bins[xc];
160  }
161 }
162 #endif /* CELL_WIDTH and CELL_HEIGHT and NUM_BINS and PHASE_SCALE */
163 
164 #if defined(NUM_CELLS_PER_BLOCK_HEIGHT) && defined(NUM_BINS_PER_BLOCK_X) && defined(NUM_BINS_PER_BLOCK) && defined(HOG_NORM_TYPE) && defined(L2_HYST_THRESHOLD)
165 
166 #ifndef L2_NORM
167 #error The value of enum class HOGNormType::L2_NORM has not be passed to the OpenCL kernel
168 #endif /* not L2_NORM */
169 
170 #ifndef L2HYS_NORM
171 #error The value of enum class HOGNormType::L2HYS_NORM has not be passed to the OpenCL kernel
172 #endif /* not L2HYS_NORM */
173 
174 #ifndef L1_NORM
175 #error The value of enum class HOGNormType::L1_NORM has not be passed to the OpenCL kernel
176 #endif /* not L1_NORM */
177 
207  IMAGE_DECLARATION(dst))
208 {
209  float sum = 0.0f;
210  float4 sum_f32 = (float4)(0.0f);
211 
212  // Compute address for the source and destination tensor
214  Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
215 
216  for(size_t yc = 0; yc < NUM_CELLS_PER_BLOCK_HEIGHT; ++yc)
217  {
218  const __global float *hist_ptr = (__global float *)(src.ptr + yc * src_stride_y);
219 
220  int xc = 0;
221  for(; xc <= (NUM_BINS_PER_BLOCK_X - 16); xc += 16)
222  {
223  const float4 val0 = vload4(0, hist_ptr + xc + 0);
224  const float4 val1 = vload4(0, hist_ptr + xc + 4);
225  const float4 val2 = vload4(0, hist_ptr + xc + 8);
226  const float4 val3 = vload4(0, hist_ptr + xc + 12);
227 
228 #if(HOG_NORM_TYPE == L2_NORM) || (HOG_NORM_TYPE == L2HYS_NORM)
229  // Compute val^2 for L2_NORM or L2HYS_NORM
230  sum_f32 += val0 * val0;
231  sum_f32 += val1 * val1;
232  sum_f32 += val2 * val2;
233  sum_f32 += val3 * val3;
234 #else /* (HOG_NORM_TYPE == L2_NORM) || (HOG_NORM_TYPE == L2HYS_NORM) */
235  // Compute |val| for L1_NORM
236  sum_f32 += fabs(val0);
237  sum_f32 += fabs(val1);
238  sum_f32 += fabs(val2);
239  sum_f32 += fabs(val3);
240 #endif /* (HOG_NORM_TYPE == L2_NORM) || (HOG_NORM_TYPE == L2HYS_NORM) */
241 
242  // Store linearly the input values un-normalized in the output image. These values will be reused for the normalization.
243  // This approach will help us to be cache friendly in the next for loop where the normalization will be done because all the values
244  // will be accessed consecutively
245  vstore4(val0, 0, ((__global float *)dst.ptr) + xc + 0 + yc * NUM_BINS_PER_BLOCK_X);
246  vstore4(val1, 0, ((__global float *)dst.ptr) + xc + 4 + yc * NUM_BINS_PER_BLOCK_X);
247  vstore4(val2, 0, ((__global float *)dst.ptr) + xc + 8 + yc * NUM_BINS_PER_BLOCK_X);
248  vstore4(val3, 0, ((__global float *)dst.ptr) + xc + 12 + yc * NUM_BINS_PER_BLOCK_X);
249  }
250 
251  // Compute left over
252  for(; xc < NUM_BINS_PER_BLOCK_X; ++xc)
253  {
254  const float val = hist_ptr[xc];
255 
256 #if(HOG_NORM_TYPE == L2_NORM) || (HOG_NORM_TYPE == L2HYS_NORM)
257  sum += val * val;
258 #else /* (HOG_NORM_TYPE == L2_NORM) || (HOG_NORM_TYPE == L2HYS_NORM) */
259  sum += fabs(val);
260 #endif /* (HOG_NORM_TYPE == L2_NORM) || (HOG_NORM_TYPE == L2HYS_NORM) */
261 
262  ((__global float *)dst.ptr)[xc + 0 + yc * NUM_BINS_PER_BLOCK_X] = val;
263  }
264  }
265 
266  sum += dot(sum_f32, (float4)1.0f);
267 
268  float scale = 1.0f / (sqrt(sum) + NUM_BINS_PER_BLOCK * 0.1f);
269 
270 #if(HOG_NORM_TYPE == L2HYS_NORM)
271  // Reset sum
272  sum_f32 = (float4)0.0f;
273  sum = 0.0f;
274 
275  int k = 0;
276  for(; k <= NUM_BINS_PER_BLOCK - 16; k += 16)
277  {
278  float4 val0 = vload4(0, ((__global float *)dst.ptr) + k + 0);
279  float4 val1 = vload4(0, ((__global float *)dst.ptr) + k + 4);
280  float4 val2 = vload4(0, ((__global float *)dst.ptr) + k + 8);
281  float4 val3 = vload4(0, ((__global float *)dst.ptr) + k + 12);
282 
283  // Scale val
284  val0 = val0 * (float4)scale;
285  val1 = val1 * (float4)scale;
286  val2 = val2 * (float4)scale;
287  val3 = val3 * (float4)scale;
288 
289  // Clip val if over _threshold_l2hys
290  val0 = fmin(val0, (float4)L2_HYST_THRESHOLD);
291  val1 = fmin(val1, (float4)L2_HYST_THRESHOLD);
292  val2 = fmin(val2, (float4)L2_HYST_THRESHOLD);
293  val3 = fmin(val3, (float4)L2_HYST_THRESHOLD);
294 
295  // Compute val^2
296  sum_f32 += val0 * val0;
297  sum_f32 += val1 * val1;
298  sum_f32 += val2 * val2;
299  sum_f32 += val3 * val3;
300 
301  vstore4(val0, 0, ((__global float *)dst.ptr) + k + 0);
302  vstore4(val1, 0, ((__global float *)dst.ptr) + k + 4);
303  vstore4(val2, 0, ((__global float *)dst.ptr) + k + 8);
304  vstore4(val3, 0, ((__global float *)dst.ptr) + k + 12);
305  }
306 
307  // Compute left over
308  for(; k < NUM_BINS_PER_BLOCK; ++k)
309  {
310  float val = ((__global float *)dst.ptr)[k] * scale;
311 
312  // Clip scaled input_value if over L2_HYST_THRESHOLD
313  val = fmin(val, (float)L2_HYST_THRESHOLD);
314 
315  sum += val * val;
316 
317  ((__global float *)dst.ptr)[k] = val;
318  }
319 
320  sum += dot(sum_f32, (float4)1.0f);
321 
322  // We use the same constants of OpenCV
323  scale = 1.0f / (sqrt(sum) + 1e-3f);
324 
325 #endif /* (HOG_NORM_TYPE == L2HYS_NORM) */
326 
327  int i = 0;
328  for(; i <= (NUM_BINS_PER_BLOCK - 16); i += 16)
329  {
330  float4 val0 = vload4(0, ((__global float *)dst.ptr) + i + 0);
331  float4 val1 = vload4(0, ((__global float *)dst.ptr) + i + 4);
332  float4 val2 = vload4(0, ((__global float *)dst.ptr) + i + 8);
333  float4 val3 = vload4(0, ((__global float *)dst.ptr) + i + 12);
334 
335  // Multiply val by the normalization scale factor
336  val0 = val0 * (float4)scale;
337  val1 = val1 * (float4)scale;
338  val2 = val2 * (float4)scale;
339  val3 = val3 * (float4)scale;
340 
341  vstore4(val0, 0, ((__global float *)dst.ptr) + i + 0);
342  vstore4(val1, 0, ((__global float *)dst.ptr) + i + 4);
343  vstore4(val2, 0, ((__global float *)dst.ptr) + i + 8);
344  vstore4(val3, 0, ((__global float *)dst.ptr) + i + 12);
345  }
346 
347  for(; i < NUM_BINS_PER_BLOCK; ++i)
348  {
349  ((__global float *)dst.ptr)[i] *= scale;
350  }
351 }
352 #endif /* NUM_CELLS_PER_BLOCK_HEIGHT and NUM_BINS_PER_BLOCK_X and NUM_BINS_PER_BLOCK and HOG_NORM_TYPE and L2_HYST_THRESHOLD */
353 
354 #if defined(NUM_BLOCKS_PER_DESCRIPTOR_Y) && defined(NUM_BINS_PER_DESCRIPTOR_X) && defined(THRESHOLD) && defined(MAX_NUM_DETECTION_WINDOWS) && defined(IDX_CLASS) && defined(DETECTION_WINDOW_STRIDE_WIDTH) && defined(DETECTION_WINDOW_STRIDE_HEIGHT) && defined(DETECTION_WINDOW_WIDTH) && defined(DETECTION_WINDOW_HEIGHT)
355 
382 __kernel void hog_detector(IMAGE_DECLARATION(src),
383  __global float *hog_descriptor,
384  __global DetectionWindow *dst,
385  __global uint *num_detection_windows)
386 {
387  // Check if the DetectionWindow array is full
388  if(*num_detection_windows >= MAX_NUM_DETECTION_WINDOWS)
389  {
390  return;
391  }
392 
393  Image src = CONVERT_TO_IMAGE_STRUCT(src);
394 
395  const int src_step_y_f32 = src_stride_y / sizeof(float);
396 
397  // Init score_f32 with 0
398  float4 score_f32 = (float4)0.0f;
399 
400  // Init score with 0
401  float score = 0.0f;
402 
403  __global float *src_row_ptr = (__global float *)src.ptr;
404 
405  // Compute Linear SVM
406  for(int yb = 0; yb < NUM_BLOCKS_PER_DESCRIPTOR_Y; ++yb, src_row_ptr += src_step_y_f32)
407  {
408  int xb = 0;
409 
410  const int offset_y = yb * NUM_BINS_PER_DESCRIPTOR_X;
411 
412  for(; xb < (int)NUM_BINS_PER_DESCRIPTOR_X - 8; xb += 8)
413  {
414  // Load descriptor values
415  float4 a0_f32 = vload4(0, src_row_ptr + xb + 0);
416  float4 a1_f32 = vload4(0, src_row_ptr + xb + 4);
417 
418  float4 b0_f32 = vload4(0, hog_descriptor + xb + 0 + offset_y);
419  float4 b1_f32 = vload4(0, hog_descriptor + xb + 4 + offset_y);
420 
421  // Multiply accumulate
422  score_f32 += a0_f32 * b0_f32;
423  score_f32 += a1_f32 * b1_f32;
424  }
425 
426  for(; xb < NUM_BINS_PER_DESCRIPTOR_X; ++xb)
427  {
428  const float a = src_row_ptr[xb];
429  const float b = hog_descriptor[xb + offset_y];
430 
431  score += a * b;
432  }
433  }
434 
435  score += dot(score_f32, (float4)1.0f);
436 
437  // Add the bias. The bias is located at the position (descriptor_size() - 1)
438  // (descriptor_size - 1) = NUM_BINS_PER_DESCRIPTOR_X * NUM_BLOCKS_PER_DESCRIPTOR_Y
439  score += hog_descriptor[NUM_BINS_PER_DESCRIPTOR_X * NUM_BLOCKS_PER_DESCRIPTOR_Y];
440 
441  if(score > (float)THRESHOLD)
442  {
443  int id = atomic_inc(num_detection_windows);
444  if(id < MAX_NUM_DETECTION_WINDOWS)
445  {
446  dst[id].x = get_global_id(0) * DETECTION_WINDOW_STRIDE_WIDTH;
447  dst[id].y = get_global_id(1) * DETECTION_WINDOW_STRIDE_HEIGHT;
448  dst[id].width = DETECTION_WINDOW_WIDTH;
449  dst[id].height = DETECTION_WINDOW_HEIGHT;
450  dst[id].idx_class = IDX_CLASS;
451  dst[id].score = score;
452  }
453  }
454 }
455 #endif /* NUM_BLOCKS_PER_DESCRIPTOR_Y && NUM_BINS_PER_DESCRIPTOR_X && THRESHOLD && MAX_NUM_DETECTION_WINDOWS && IDX_CLASS &&
456  * DETECTION_WINDOW_STRIDE_WIDTH && DETECTION_WINDOW_STRIDE_HEIGHT && DETECTION_WINDOW_WIDTH && DETECTION_WINDOW_HEIGHT */
Detection window struct.
Definition: types.h:47
void hog_orientation_binning(const SimpleTensor< T > &mag, const SimpleTensor< U > &phase, SimpleTensor< V > &hog_space, const HOGInfo &hog_info)
DATA_TYPE sum(__global const DATA_TYPE *input)
Calculate sum of a vector.
SimpleTensor< uint8_t > phase(const SimpleTensor< T > &gx, const SimpleTensor< T > &gy, PhaseType phase_type)
Definition: Phase.cpp:35
#define IMAGE_DECLARATION(name)
Definition: helpers.h:68
SimpleTensor< T > hog_descriptor(const SimpleTensor< U > &src, BorderMode border_mode, U constant_border_value, const HOGInfo &hog_info)
#define CONVERT_TO_IMAGE_STRUCT(name)
Definition: helpers.h:104
Structure to hold Image information.
Definition: helpers.h:142
__global uchar * ptr
Pointer to the starting postion of the buffer.
Definition: helpers.h:144
std::vector< DetectionWindow > hog_detector(const SimpleTensor< T > &src, const std::vector< T > &descriptor, unsigned int max_num_detection_windows, const HOGInfo &hog_info, const Size2D &detection_window_stride, float threshold, uint16_t idx_class)
Definition: HOGDetector.cpp:48
convolution configure & src
void hog_block_normalization(SimpleTensor< T > &desc, const SimpleTensor< T > &hog_space, const HOGInfo &hog_info)