27 #if defined(CELL_WIDTH) && defined(CELL_HEIGHT) && defined(NUM_BINS) && defined(PHASE_SCALE) 63 float bins[NUM_BINS] = { 0 };
69 __global uchar *mag_row_ptr = mag.
ptr;
70 __global uchar *phase_row_ptr = phase.
ptr;
72 for(
int yc = 0; yc < CELL_HEIGHT; ++yc)
75 for(; xc <= (CELL_WIDTH - 4); xc += 4)
78 const float4 mag_f32 = convert_float4(vload4(0, (__global
short *)mag_row_ptr + xc));
79 float4 phase_f32 = convert_float4(vload4(0, phase_row_ptr + xc));
82 phase_f32 = (float4)0.5f + phase_f32 * (float4)PHASE_SCALE;
85 int4 hidx_s32 = convert_int4(phase_f32);
88 const float4 hidx_f32 = convert_float4(hidx_s32);
91 const float4 w1_f32 = phase_f32 - hidx_f32;
94 const float4 w0_f32 = (float4)1.0f - w1_f32;
97 const float4 mag_w0_f32 = mag_f32 * w0_f32;
98 const float4 mag_w1_f32 = mag_f32 * w1_f32;
103 hidx_s32 = select(hidx_s32, (int4)0, hidx_s32 == (int4)(NUM_BINS));
106 bins[hidx_s32.s0] += mag_w0_f32.s0;
107 bins[hidx_s32.s1] += mag_w0_f32.s1;
108 bins[hidx_s32.s2] += mag_w0_f32.s2;
109 bins[hidx_s32.s3] += mag_w0_f32.s3;
114 hidx_s32 = select(hidx_s32, (int4)0, hidx_s32 == (int4)(NUM_BINS));
117 bins[hidx_s32.s0] += mag_w1_f32.s0;
118 bins[hidx_s32.s1] += mag_w1_f32.s1;
119 bins[hidx_s32.s2] += mag_w1_f32.s2;
120 bins[hidx_s32.s3] += mag_w1_f32.s3;
124 for(; xc < CELL_WIDTH; xc++)
126 const float mag_value = *((__global
short *)mag_row_ptr + xc);
127 const float phase_value = *(phase_row_ptr + xc) * (
float)PHASE_SCALE + 0.5f;
128 const float w1 = phase_value - floor(phase_value);
132 const uint hidx = (uint)(phase_value) % NUM_BINS;
135 bins[hidx] += mag_value * (1.0f - w1);
136 bins[(hidx + 1) % NUM_BINS] += mag_value * w1;
140 mag_row_ptr += mag_stride_y;
141 phase_row_ptr += phase_stride_y;
149 for(; xc <= (NUM_BINS - 4); xc += 4)
151 float4 values = vload4(0, bins + xc);
153 vstore4(values, 0, ((__global
float *)dst.
ptr) + xc);
157 for(; xc < NUM_BINS; ++xc)
159 ((__global
float *)dst.
ptr)[xc] = bins[xc];
164 #if defined(NUM_CELLS_PER_BLOCK_HEIGHT) && defined(NUM_BINS_PER_BLOCK_X) && defined(NUM_BINS_PER_BLOCK) && defined(HOG_NORM_TYPE) && defined(L2_HYST_THRESHOLD) 167 #error The value of enum class HOGNormType::L2_NORM has not be passed to the OpenCL kernel 171 #error The value of enum class HOGNormType::L2HYS_NORM has not be passed to the OpenCL kernel 175 #error The value of enum class HOGNormType::L1_NORM has not be passed to the OpenCL kernel 210 float4 sum_f32 = (float4)(0.0f);
216 for(
size_t yc = 0; yc < NUM_CELLS_PER_BLOCK_HEIGHT; ++yc)
218 const __global
float *hist_ptr = (__global
float *)(src.
ptr + yc * src_stride_y);
221 for(; xc <= (NUM_BINS_PER_BLOCK_X - 16); xc += 16)
223 const float4 val0 = vload4(0, hist_ptr + xc + 0);
224 const float4 val1 = vload4(0, hist_ptr + xc + 4);
225 const float4 val2 = vload4(0, hist_ptr + xc + 8);
226 const float4 val3 = vload4(0, hist_ptr + xc + 12);
228 #if(HOG_NORM_TYPE == L2_NORM) || (HOG_NORM_TYPE == L2HYS_NORM) 230 sum_f32 += val0 * val0;
231 sum_f32 += val1 * val1;
232 sum_f32 += val2 * val2;
233 sum_f32 += val3 * val3;
236 sum_f32 += fabs(val0);
237 sum_f32 += fabs(val1);
238 sum_f32 += fabs(val2);
239 sum_f32 += fabs(val3);
245 vstore4(val0, 0, ((__global
float *)dst.
ptr) + xc + 0 + yc * NUM_BINS_PER_BLOCK_X);
246 vstore4(val1, 0, ((__global
float *)dst.
ptr) + xc + 4 + yc * NUM_BINS_PER_BLOCK_X);
247 vstore4(val2, 0, ((__global
float *)dst.
ptr) + xc + 8 + yc * NUM_BINS_PER_BLOCK_X);
248 vstore4(val3, 0, ((__global
float *)dst.
ptr) + xc + 12 + yc * NUM_BINS_PER_BLOCK_X);
252 for(; xc < NUM_BINS_PER_BLOCK_X; ++xc)
254 const float val = hist_ptr[xc];
256 #if(HOG_NORM_TYPE == L2_NORM) || (HOG_NORM_TYPE == L2HYS_NORM) 262 ((__global
float *)dst.
ptr)[xc + 0 + yc * NUM_BINS_PER_BLOCK_X] = val;
266 sum += dot(sum_f32, (float4)1.0f);
268 float scale = 1.0f / (sqrt(sum) + NUM_BINS_PER_BLOCK * 0.1f);
270 #if(HOG_NORM_TYPE == L2HYS_NORM) 272 sum_f32 = (float4)0.0f;
276 for(; k <= NUM_BINS_PER_BLOCK - 16; k += 16)
278 float4 val0 = vload4(0, ((__global
float *)dst.
ptr) + k + 0);
279 float4 val1 = vload4(0, ((__global
float *)dst.
ptr) + k + 4);
280 float4 val2 = vload4(0, ((__global
float *)dst.
ptr) + k + 8);
281 float4 val3 = vload4(0, ((__global
float *)dst.
ptr) + k + 12);
284 val0 = val0 * (float4)scale;
285 val1 = val1 * (float4)scale;
286 val2 = val2 * (float4)scale;
287 val3 = val3 * (float4)scale;
290 val0 = fmin(val0, (float4)L2_HYST_THRESHOLD);
291 val1 = fmin(val1, (float4)L2_HYST_THRESHOLD);
292 val2 = fmin(val2, (float4)L2_HYST_THRESHOLD);
293 val3 = fmin(val3, (float4)L2_HYST_THRESHOLD);
296 sum_f32 += val0 * val0;
297 sum_f32 += val1 * val1;
298 sum_f32 += val2 * val2;
299 sum_f32 += val3 * val3;
301 vstore4(val0, 0, ((__global
float *)dst.
ptr) + k + 0);
302 vstore4(val1, 0, ((__global
float *)dst.
ptr) + k + 4);
303 vstore4(val2, 0, ((__global
float *)dst.
ptr) + k + 8);
304 vstore4(val3, 0, ((__global
float *)dst.
ptr) + k + 12);
308 for(; k < NUM_BINS_PER_BLOCK; ++k)
310 float val = ((__global
float *)dst.
ptr)[k] *
scale;
313 val = fmin(val, (
float)L2_HYST_THRESHOLD);
317 ((__global
float *)dst.
ptr)[k] = val;
320 sum += dot(sum_f32, (float4)1.0f);
323 scale = 1.0f / (sqrt(sum) + 1e-3f);
328 for(; i <= (NUM_BINS_PER_BLOCK - 16); i += 16)
330 float4 val0 = vload4(0, ((__global
float *)dst.
ptr) + i + 0);
331 float4 val1 = vload4(0, ((__global
float *)dst.
ptr) + i + 4);
332 float4 val2 = vload4(0, ((__global
float *)dst.
ptr) + i + 8);
333 float4 val3 = vload4(0, ((__global
float *)dst.
ptr) + i + 12);
336 val0 = val0 * (float4)scale;
337 val1 = val1 * (float4)scale;
338 val2 = val2 * (float4)scale;
339 val3 = val3 * (float4)scale;
341 vstore4(val0, 0, ((__global
float *)dst.
ptr) + i + 0);
342 vstore4(val1, 0, ((__global
float *)dst.
ptr) + i + 4);
343 vstore4(val2, 0, ((__global
float *)dst.
ptr) + i + 8);
344 vstore4(val3, 0, ((__global
float *)dst.
ptr) + i + 12);
347 for(; i < NUM_BINS_PER_BLOCK; ++i)
349 ((__global
float *)dst.
ptr)[i] *=
scale;
354 #if defined(NUM_BLOCKS_PER_DESCRIPTOR_Y) && defined(NUM_BINS_PER_DESCRIPTOR_X) && defined(THRESHOLD) && defined(MAX_NUM_DETECTION_WINDOWS) && defined(IDX_CLASS) && defined(DETECTION_WINDOW_STRIDE_WIDTH) && defined(DETECTION_WINDOW_STRIDE_HEIGHT) && defined(DETECTION_WINDOW_WIDTH) && defined(DETECTION_WINDOW_HEIGHT) 385 __global uint *num_detection_windows)
388 if(*num_detection_windows >= MAX_NUM_DETECTION_WINDOWS)
395 const int src_step_y_f32 = src_stride_y /
sizeof(float);
398 float4 score_f32 = (float4)0.0f;
403 __global
float *src_row_ptr = (__global
float *)src.
ptr;
406 for(
int yb = 0; yb < NUM_BLOCKS_PER_DESCRIPTOR_Y; ++yb, src_row_ptr += src_step_y_f32)
410 const int offset_y = yb * NUM_BINS_PER_DESCRIPTOR_X;
412 for(; xb < (int)NUM_BINS_PER_DESCRIPTOR_X - 8; xb += 8)
415 float4 a0_f32 = vload4(0, src_row_ptr + xb + 0);
416 float4 a1_f32 = vload4(0, src_row_ptr + xb + 4);
418 float4 b0_f32 = vload4(0, hog_descriptor + xb + 0 + offset_y);
419 float4 b1_f32 = vload4(0, hog_descriptor + xb + 4 + offset_y);
422 score_f32 += a0_f32 * b0_f32;
423 score_f32 += a1_f32 * b1_f32;
426 for(; xb < NUM_BINS_PER_DESCRIPTOR_X; ++xb)
428 const float a = src_row_ptr[xb];
429 const float b = hog_descriptor[xb + offset_y];
435 score += dot(score_f32, (float4)1.0f);
439 score += hog_descriptor[NUM_BINS_PER_DESCRIPTOR_X * NUM_BLOCKS_PER_DESCRIPTOR_Y];
441 if(score > (
float)THRESHOLD)
443 int id = atomic_inc(num_detection_windows);
444 if(
id < MAX_NUM_DETECTION_WINDOWS)
446 dst[id].x = get_global_id(0) * DETECTION_WINDOW_STRIDE_WIDTH;
447 dst[id].y = get_global_id(1) * DETECTION_WINDOW_STRIDE_HEIGHT;
448 dst[id].width = DETECTION_WINDOW_WIDTH;
449 dst[id].height = DETECTION_WINDOW_HEIGHT;
450 dst[id].idx_class = IDX_CLASS;
451 dst[id].score = score;
void hog_orientation_binning(const SimpleTensor< T > &mag, const SimpleTensor< U > &phase, SimpleTensor< V > &hog_space, const HOGInfo &hog_info)
DATA_TYPE sum(__global const DATA_TYPE *input)
Calculate sum of a vector.
SimpleTensor< uint8_t > phase(const SimpleTensor< T > &gx, const SimpleTensor< T > &gy, PhaseType phase_type)
#define IMAGE_DECLARATION(name)
SimpleTensor< T > hog_descriptor(const SimpleTensor< U > &src, BorderMode border_mode, U constant_border_value, const HOGInfo &hog_info)
#define CONVERT_TO_IMAGE_STRUCT(name)
Structure to hold Image information.
__global uchar * ptr
Pointer to the starting postion of the buffer.
std::vector< DetectionWindow > hog_detector(const SimpleTensor< T > &src, const std::vector< T > &descriptor, unsigned int max_num_detection_windows, const HOGInfo &hog_info, const Size2D &detection_window_stride, float threshold, uint16_t idx_class)
convolution configure & src
void hog_block_normalization(SimpleTensor< T > &desc, const SimpleTensor< T > &hog_space, const HOGInfo &hog_info)