Compute Library  18.05
softmax_layer.cl File Reference
#include "helpers.h"

Go to the source code of this file.

Macros

#define MAX_OP(x, y, type, size)   max((x), (y))
 
#define ADD_OP(x, y, type, size)   ((x) + (y))
 
#define SUB_OP(x, y, type, size)   ((x) - (y))
 
#define MUL_OP(x, y, type, size)   ((x) * (y))
 
#define DIV_OP(x, y, type, size)   ((x) / (y))
 
#define EXP_OP(x, type, size)   exp((x))
 
#define MINVAL   -FLT_MAX
 
#define SELECT_DATA_TYPE   int
 
#define GRID_SIZE   1
 
#define VECTOR_SIZE   16
 
#define LOG_VECTOR_SIZE   4
 

Functions

__kernel void softmax_layer_norm (__global uchar *src_ptr, uint src_stride_x, uint src_step_x, uint src_stride_y, uint src_step_y, uint src_stride_z, uint src_step_z, uint src_offset_first_element_in_bytes, __global uchar *sum_ptr, uint sum_stride_x, uint sum_step_x, uint sum_stride_y, uint sum_step_y, uint sum_stride_z, uint sum_step_z, uint sum_offset_first_element_in_bytes, __global uchar *dst_ptr, uint dst_stride_x, uint dst_step_x, uint dst_stride_y, uint dst_step_y, uint dst_stride_z, uint dst_step_z, uint dst_offset_first_element_in_bytes)
 Divides all the values of the input tensor by the sum calculated from softmax_layer_shift_exp_sum kernel. More...
 
__kernel void softmax_layer_max_shift_exp_sum_serial (__global uchar *src_ptr, uint src_stride_x, uint src_step_x, uint src_stride_y, uint src_step_y, uint src_stride_z, uint src_step_z, uint src_offset_first_element_in_bytes, __global uchar *maxo_ptr, uint maxo_stride_x, uint maxo_step_x, uint maxo_stride_y, uint maxo_step_y, uint maxo_stride_z, uint maxo_step_z, uint maxo_offset_first_element_in_bytes, __global uchar *dst_ptr, uint dst_stride_x, uint dst_step_x, uint dst_stride_y, uint dst_step_y, uint dst_stride_z, uint dst_step_z, uint dst_offset_first_element_in_bytes, __global uchar *sum_ptr, uint sum_stride_x, uint sum_step_x, uint sum_stride_y, uint sum_step_y, uint sum_stride_z, uint sum_step_z, uint sum_offset_first_element_in_bytes, uint width)
 Identifies the maximum value across the 1st dimension and shifts the values of the input tensor by this maximum value, then gets the exponent of each element as sums all elements across each row. More...
 
__kernel void softmax_layer_max_shift_exp_sum_parallel (__global uchar *src_ptr, uint src_stride_x, uint src_step_x, uint src_stride_y, uint src_step_y, uint src_stride_z, uint src_step_z, uint src_offset_first_element_in_bytes, __global uchar *maxo_ptr, uint maxo_stride_x, uint maxo_step_x, uint maxo_stride_y, uint maxo_step_y, uint maxo_stride_z, uint maxo_step_z, uint maxo_offset_first_element_in_bytes, __global uchar *dst_ptr, uint dst_stride_x, uint dst_step_x, uint dst_stride_y, uint dst_step_y, uint dst_stride_z, uint dst_step_z, uint dst_offset_first_element_in_bytes, __global uchar *sum_ptr, uint sum_stride_x, uint sum_step_x, uint sum_stride_y, uint sum_step_y, uint sum_stride_z, uint sum_step_z, uint sum_offset_first_element_in_bytes, uint width)
 Identifies the maximum value across the 1st dimension and shifts the values of the input tensor by this maximum value, then gets the exponent of each element as sums all elements across each row. More...
 

Variables

__constant DATA_TYPE16 type_min_ = ( DATA_TYPE16 )( -FLT_MAX )
 
__constant uint16 idx__ = (uint16)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
 
__constant DATA_TYPE16 type_min = ( DATA_TYPE16 )( -FLT_MAX )
 
__constant uint16 idx16 = (uint16)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
 
__constant uint4 idx4 = (uint4)(0, 1, 2, 3)
 

Macro Definition Documentation

#define ADD_OP (   x,
  y,
  type,
  size 
)    ((x) + (y))
#define DIV_OP (   x,
  y,
  type,
  size 
)    ((x) / (y))

Definition at line 47 of file softmax_layer.cl.

Referenced by softmax_layer_norm().

#define EXP_OP (   x,
  type,
  size 
)    exp((x))
#define GRID_SIZE   1

Definition at line 62 of file softmax_layer.cl.

Referenced by softmax_layer_max_shift_exp_sum_parallel().

#define LOG_VECTOR_SIZE   4

Definition at line 80 of file softmax_layer.cl.

Referenced by softmax_layer_max_shift_exp_sum_serial().

#define MAX_OP (   x,
  y,
  type,
  size 
)    max((x), (y))
#define MINVAL   -FLT_MAX

Definition at line 54 of file softmax_layer.cl.

#define MUL_OP (   x,
  y,
  type,
  size 
)    ((x) * (y))
#define SELECT_DATA_TYPE   int
#define SUB_OP (   x,
  y,
  type,
  size 
)    ((x) - (y))
#define VECTOR_SIZE   16

Definition at line 79 of file softmax_layer.cl.

Referenced by softmax_layer_max_shift_exp_sum_serial().

Function Documentation

__kernel void softmax_layer_max_shift_exp_sum_parallel ( __global uchar *  src_ptr,
uint  src_stride_x,
uint  src_step_x,
uint  src_stride_y,
uint  src_step_y,
uint  src_stride_z,
uint  src_step_z,
uint  src_offset_first_element_in_bytes,
__global uchar *  maxo_ptr,
uint  maxo_stride_x,
uint  maxo_step_x,
uint  maxo_stride_y,
uint  maxo_step_y,
uint  maxo_stride_z,
uint  maxo_step_z,
uint  maxo_offset_first_element_in_bytes,
__global uchar *  dst_ptr,
uint  dst_stride_x,
uint  dst_step_x,
uint  dst_stride_y,
uint  dst_step_y,
uint  dst_stride_z,
uint  dst_step_z,
uint  dst_offset_first_element_in_bytes,
__global uchar *  sum_ptr,
uint  sum_stride_x,
uint  sum_step_x,
uint  sum_stride_y,
uint  sum_step_y,
uint  sum_stride_z,
uint  sum_step_z,
uint  sum_offset_first_element_in_bytes,
uint  width 
)

Identifies the maximum value across the 1st dimension and shifts the values of the input tensor by this maximum value, then gets the exponent of each element as sums all elements across each row.

Note
Datatype must be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
Fixed point position must be given as a preprocessor argument using -DFIXED_POINT_POSITION=pos. e.g. DFIXED_POINT_POSITION=4
In case the input is not a multiple of VECTOR_SIZE (2,4,8,16) -DNON_MULTIPLE_OF_VECTOR_SIZE must be passed.
Beta can be optionally passed at compile time using -DBETA (by default, it is 1.0).
Parameters
[in]src_ptrPointer to the source tensor slice. Supported data types: QS8/QS16/F16/F32
[in]src_stride_xStride of the source tensor in X dimension (in bytes)
[in]src_step_xsrc_stride_x * number of elements along X processed per workitem(in bytes)
[in]src_stride_yStride of the source tensor in Y dimension (in bytes)
[in]src_step_ysrc_stride_y * number of elements along Y processed per workitem(in bytes)
[in]src_stride_zStride of the source tensor in Z dimension (in bytes)
[in]src_step_zsrc_stride_z * number of elements along Z processed per workitem(in bytes)
[in]src_offset_first_element_in_bytesThe offset of the first element in the source tensor
[in]maxo_ptrPointer to the max values tensor slice. Supported data types: same as src_ptr
[in]maxo_stride_xStride of the max values tensor in X dimension (in bytes)
[in]maxo_step_xmax_stride_x * number of elements along X processed per workitem(in bytes)
[in]maxo_stride_yStride of the max values tensor in Y dimension (in bytes)
[in]maxo_step_ymax_stride_y * number of elements along Y processed per workitem(in bytes)
[in]maxo_stride_zStride of the max values tensor in Z dimension (in bytes)
[in]maxo_step_zmax_stride_z * number of elements along Z processed per workitem(in bytes)
[in]maxo_offset_first_element_in_bytesThe offset of the first element in the max values tensor
[out]dst_ptrPointer to the destination tensor slice. Supported data types: same as src_ptr
[in]dst_stride_xStride of the destination tensor in X dimension (in bytes)
[in]dst_step_xdst_stride_x * number of elements along X processed per workitem(in bytes)
[in]dst_stride_yStride of the destination tensor in Y dimension (in bytes)
[in]dst_step_ydst_stride_y * number of elements along Y processed per workitem(in bytes)
[in]dst_stride_zStride of the destination tensor in Z dimension (in bytes)
[in]dst_step_zdst_stride_z * number of elements along Z processed per workitem(in bytes)
[in]dst_offset_first_element_in_bytesThe offset of the first element in the destination tensor
[out]sum_ptrPointer to the sum values tensor slice. Supported data types: same as src_ptr
[in]sum_stride_xStride of the sum values tensor in X dimension (in bytes)
[in]sum_step_xsum_stride_x * number of elements along X processed per workitem(in bytes)
[in]sum_stride_yStride of the sum values tensor in Y dimension (in bytes)
[in]sum_step_ysum_stride_z * number of elements along Z processed per workitem(in bytes)
[in]sum_stride_zStride of the sum values tensor in Z dimension (in bytes)
[in]sum_step_zsum_stride_z * number of elements along Z processed per workitem(in bytes)
[in]sum_offset_first_element_in_bytesThe offset of the first element in the sum values tensor
[in]widthInput image width

Definition at line 328 of file softmax_layer.cl.

References ADD_OP, arm_compute::test::validation::beta, CONVERT, CONVERT_TENSOR3D_TO_IMAGE_STRUCT, arm_compute::test::validation::dst, EXP_OP, GRID_SIZE, idx4, MAX_OP, MUL_OP, offset(), Image::ptr, SELECT_DATA_TYPE, arm_compute::test::validation::src, SUB_OP, sum(), type_min_, VEC_DATA_TYPE, VLOAD, and VSTORE.

334 {
339 
340  const uint lid = get_local_id(0);
341 
342 #ifdef BETA
343  // Initialize beta
345  beta = (VEC_DATA_TYPE(DATA_TYPE, 4))BETA;
346 #endif /* BETA */
347 
348  // Define one temporary vector per work-item.
349  __local VEC_DATA_TYPE(DATA_TYPE, 4) tmp_local[GRID_SIZE];
350  __local DATA_TYPE max_local;
351 
352  __constant VEC_DATA_TYPE(DATA_TYPE, 4) type_min4 = (VEC_DATA_TYPE(DATA_TYPE, 4))(MINVAL);
353  VEC_DATA_TYPE(DATA_TYPE, 4)
354  max_val_vec = (VEC_DATA_TYPE(DATA_TYPE, 4))type_min4;
355  // Number of elements per work-item.
356  const uint row = width / GRID_SIZE;
357  // Number of iterations per work-item.
358  const uint width_ = row >> 2;
359  // Calculate max of row
360  uint i = 0;
361  for(; i < width_; i++)
362  {
363  VEC_DATA_TYPE(DATA_TYPE, 4)
364  data_max = VLOAD(4)(0, (__global DATA_TYPE *)offset(&src, i * GRID_SIZE * 4, 0));
365  max_val_vec = MAX_OP(data_max, max_val_vec, DATA_TYPE, 4);
366  }
367 #ifdef NON_MULTIPLE_OF_GRID_SIZE
368  // How many work-items needed to complete the computation.
369  int boundary_workitems = (width % (GRID_SIZE * 4)) / 4;
370  if(lid < boundary_workitems)
371  {
372  VEC_DATA_TYPE(DATA_TYPE, 4)
373  data_max = VLOAD(4)(0, (__global DATA_TYPE *)offset(&src, i * GRID_SIZE * 4, 0));
374  max_val_vec = MAX_OP(data_max, max_val_vec, DATA_TYPE, 4);
375  }
376 #ifdef NON_MULTIPLE_OF_VECTOR_SIZE
377  if(boundary_workitems == 0)
378  {
379  boundary_workitems = GRID_SIZE;
380  i--;
381  }
382  if(lid == (boundary_workitems - 1))
383  {
384  // Handle non multiple of 4
385  VEC_DATA_TYPE(DATA_TYPE, 4)
386  data_max = VLOAD(4)(0, (__global DATA_TYPE *)offset(&src, (GRID_SIZE * i * 4) + 4, 0));
388  widx = CONVERT(((uint4)(GRID_SIZE * i * 4) + boundary_workitems * 4 + idx4) < width, VEC_DATA_TYPE(SELECT_DATA_TYPE, 4));
389  max_val_vec = MAX_OP(max_val_vec, select(type_min_, data_max, widx), DATA_TYPE, 4);
390  }
391 #endif /* NON_MULTIPLE_OF_VECTOR_SIZE */
392 #endif /* NON_MULTIPLE_OF_GRID_SIZE */
393  tmp_local[lid] = max_val_vec;
394 
395  barrier(CLK_LOCAL_MEM_FENCE);
396 
397  if(GRID_SIZE >= 256)
398  {
399  if(lid < 128)
400  {
401  tmp_local[lid] = MAX_OP(tmp_local[lid + 128], tmp_local[lid], DATA_TYPE, 4);
402  }
403  barrier(CLK_LOCAL_MEM_FENCE);
404  }
405  if(GRID_SIZE >= 128)
406  {
407  if(lid < 64)
408  {
409  tmp_local[lid] = MAX_OP(tmp_local[lid + 64], tmp_local[lid], DATA_TYPE, 4);
410  }
411  barrier(CLK_LOCAL_MEM_FENCE);
412  }
413  if(GRID_SIZE >= 64)
414  {
415  if(lid < 32)
416  {
417  tmp_local[lid] = MAX_OP(tmp_local[lid + 32], tmp_local[lid], DATA_TYPE, 4);
418  }
419  barrier(CLK_LOCAL_MEM_FENCE);
420  }
421  if(GRID_SIZE >= 32)
422  {
423  if(lid < 16)
424  {
425  tmp_local[lid] = MAX_OP(tmp_local[lid + 16], tmp_local[lid], DATA_TYPE, 4);
426  }
427  barrier(CLK_LOCAL_MEM_FENCE);
428  }
429  if(GRID_SIZE >= 16)
430  {
431  if(lid < 8)
432  {
433  tmp_local[lid] = MAX_OP(tmp_local[lid + 8], tmp_local[lid], DATA_TYPE, 4);
434  }
435  barrier(CLK_LOCAL_MEM_FENCE);
436  }
437  if(GRID_SIZE >= 8)
438  {
439  if(lid < 4)
440  {
441  tmp_local[lid] = MAX_OP(tmp_local[lid + 4], tmp_local[lid], DATA_TYPE, 4);
442  }
443  barrier(CLK_LOCAL_MEM_FENCE);
444  }
445  if(GRID_SIZE >= 4)
446  {
447  if(lid < 2)
448  {
449  tmp_local[lid] = MAX_OP(tmp_local[lid + 2], tmp_local[lid], DATA_TYPE, 4);
450  }
451  barrier(CLK_LOCAL_MEM_FENCE);
452  }
453  if(lid == 0)
454  {
455  max_val_vec = MAX_OP(tmp_local[lid + 1], tmp_local[lid], DATA_TYPE, 4);
456  max_val_vec.s01 = MAX_OP(max_val_vec.s01, max_val_vec.s23, DATA_TYPE, 2);
457  max_val_vec.s0 = MAX_OP(max_val_vec.s0, max_val_vec.s1, DATA_TYPE, 1);
458  max_local = max_val_vec.s0;
459  }
460  barrier(CLK_LOCAL_MEM_FENCE);
461 
462  /* Second section */
463 
464  // Set sum vector
465  VEC_DATA_TYPE(DATA_TYPE, 4)
466  sum1D = 0;
467  DATA_TYPE max_val = max_local;
468 
469  // Shift values, exp and sum
470  for(i = 0; i < width_; i++)
471  {
472  VEC_DATA_TYPE(DATA_TYPE, 4)
473  data = VLOAD(4)(0, (__global DATA_TYPE *)offset(&src, i * GRID_SIZE * 4, 0));
474  data = SUB_OP(data, max_val, DATA_TYPE, 4);
475 #ifdef BETA
476  data = MUL_OP(data, beta, DATA_TYPE, 4);
477 #endif /* BETA */
478  data = EXP_OP(data, DATA_TYPE, 4);
479  VSTORE(4)
480  (data, 0, (__global DATA_TYPE *)offset(&dst, i * GRID_SIZE * 4, 0));
481  sum1D = ADD_OP(sum1D, data, DATA_TYPE, 4);
482  }
483 #ifdef NON_MULTIPLE_OF_GRID_SIZE
484  boundary_workitems = (width % (GRID_SIZE * 4)) / 4;
485  if(lid < boundary_workitems)
486  {
487  VEC_DATA_TYPE(DATA_TYPE, 4)
488  data = VLOAD(4)(0, (__global DATA_TYPE *)offset(&src, i * GRID_SIZE * 4, 0));
489  data = SUB_OP(data, max_val, DATA_TYPE, 4);
490 #ifdef BETA
491  data = MUL_OP(data, beta, DATA_TYPE, 4);
492 #endif /* BETA */
493  data = EXP_OP(data, DATA_TYPE, 4);
494  VSTORE(4)
495  (data, 0, (__global DATA_TYPE *)offset(&dst, i * GRID_SIZE * 4, 0));
496  sum1D = ADD_OP(sum1D, data, DATA_TYPE, 4);
497  }
498 #ifdef NON_MULTIPLE_OF_VECTOR_SIZE
499  if(boundary_workitems == 0)
500  {
501  boundary_workitems = GRID_SIZE;
502  i--;
503  }
504  if(lid == (boundary_workitems - 1))
505  {
506  // Handle non multiple of vector size ((GRID_SIZE * i * 4) + 4, 0); move 4 float positions ahead, *4 is due to the stride
507  VEC_DATA_TYPE(DATA_TYPE, 4)
508  data = VLOAD(4)(0, (__global DATA_TYPE *)offset(&src, (GRID_SIZE * i * 4) + 4, 0));
509  data = SUB_OP(data, max_val, DATA_TYPE, 4);
510 #ifdef BETA
511  data = MUL_OP(data, beta, DATA_TYPE, 4);
512 #endif /* BETA */
513  data = EXP_OP(data, DATA_TYPE, 4);
514  VEC_DATA_TYPE(SELECT_DATA_TYPE, 4)
515  widx = CONVERT(((uint4)(GRID_SIZE * i * 4) + boundary_workitems * 4 + idx4) < width, VEC_DATA_TYPE(SELECT_DATA_TYPE, 4));
516  data = select(0, data, widx);
517  VSTORE(4)
518  (data, 0, (__global DATA_TYPE *)offset(&dst, (GRID_SIZE * i * 4) + 4, 0));
519  sum1D = ADD_OP(sum1D, data, DATA_TYPE, 4);
520  }
521 #endif /* NON_MULTIPLE_OF_VECTOR_SIZE */
522 #endif /* NON_MULTIPLE_OF_GRID_SIZE */
523  tmp_local[lid] = sum1D;
524 
525  barrier(CLK_LOCAL_MEM_FENCE);
526 
527  if(GRID_SIZE >= 256)
528  {
529  if(lid < 128)
530  {
531  tmp_local[lid] = ADD_OP(tmp_local[lid + 128], tmp_local[lid], DATA_TYPE, 4);
532  }
533  barrier(CLK_LOCAL_MEM_FENCE);
534  }
535  if(GRID_SIZE >= 128)
536  {
537  if(lid < 64)
538  {
539  tmp_local[lid] = ADD_OP(tmp_local[lid + 64], tmp_local[lid], DATA_TYPE, 4);
540  }
541  barrier(CLK_LOCAL_MEM_FENCE);
542  }
543  if(GRID_SIZE >= 64)
544  {
545  if(lid < 32)
546  {
547  tmp_local[lid] = ADD_OP(tmp_local[lid + 32], tmp_local[lid], DATA_TYPE, 4);
548  }
549  barrier(CLK_LOCAL_MEM_FENCE);
550  }
551  if(GRID_SIZE >= 32)
552  {
553  if(lid < 16)
554  {
555  tmp_local[lid] = ADD_OP(tmp_local[lid + 16], tmp_local[lid], DATA_TYPE, 4);
556  }
557  barrier(CLK_LOCAL_MEM_FENCE);
558  }
559  if(GRID_SIZE >= 16)
560  {
561  if(lid < 8)
562  {
563  tmp_local[lid] = ADD_OP(tmp_local[lid + 8], tmp_local[lid], DATA_TYPE, 4);
564  }
565  barrier(CLK_LOCAL_MEM_FENCE);
566  }
567  if(GRID_SIZE >= 8)
568  {
569  if(lid < 4)
570  {
571  tmp_local[lid] = ADD_OP(tmp_local[lid + 4], tmp_local[lid], DATA_TYPE, 4);
572  }
573  barrier(CLK_LOCAL_MEM_FENCE);
574  }
575  if(GRID_SIZE >= 4)
576  {
577  if(lid < 2)
578  {
579  tmp_local[lid] = ADD_OP(tmp_local[lid + 2], tmp_local[lid], DATA_TYPE, 4);
580  }
581  barrier(CLK_LOCAL_MEM_FENCE);
582  }
583  if(lid == 0)
584  {
585  sum1D = ADD_OP(tmp_local[lid + 1], tmp_local[lid], DATA_TYPE, 4);
586  // Perform max reduction
587  sum1D.s01 = ADD_OP(sum1D.s01, sum1D.s23, DATA_TYPE, 2);
588  sum1D.s0 = ADD_OP(sum1D.s0, sum1D.s1, DATA_TYPE, 1);
589  *((__global DATA_TYPE *)sum.ptr) = sum1D.s0;
590  }
591 }
#define CONVERT(x, type)
Definition: fixed_point.h:98
#define VLOAD(size)
Definition: helpers.h:42
#define EXP_OP(x, type, size)
#define DATA_TYPE
#define MAX_OP(x, y, type, size)
DATA_TYPE sum(__global const DATA_TYPE *input)
Calculate sum of a vector.
__constant DATA_TYPE16 type_min_
#define ADD_OP(x, y, type, size)
__global uchar * offset(const Image *img, int x, int y)
Get the pointer position of a Image.
Definition: helpers.h:303
#define VSTORE(size)
Definition: helpers.h:45
#define SELECT_DATA_TYPE
__constant uint4 idx4
#define MUL_OP(x, y, type, size)
Structure to hold Image information.
Definition: helpers.h:142
__global uchar * ptr
Pointer to the starting postion of the buffer.
Definition: helpers.h:144
#define SUB_OP(x, y, type, size)
#define VEC_DATA_TYPE(type, size)
Definition: fixed_point.h:93
#define MINVAL
#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name)
Definition: helpers.h:116
#define GRID_SIZE
convolution configure & src
__kernel void softmax_layer_max_shift_exp_sum_serial ( __global uchar *  src_ptr,
uint  src_stride_x,
uint  src_step_x,
uint  src_stride_y,
uint  src_step_y,
uint  src_stride_z,
uint  src_step_z,
uint  src_offset_first_element_in_bytes,
__global uchar *  maxo_ptr,
uint  maxo_stride_x,
uint  maxo_step_x,
uint  maxo_stride_y,
uint  maxo_step_y,
uint  maxo_stride_z,
uint  maxo_step_z,
uint  maxo_offset_first_element_in_bytes,
__global uchar *  dst_ptr,
uint  dst_stride_x,
uint  dst_step_x,
uint  dst_stride_y,
uint  dst_step_y,
uint  dst_stride_z,
uint  dst_step_z,
uint  dst_offset_first_element_in_bytes,
__global uchar *  sum_ptr,
uint  sum_stride_x,
uint  sum_step_x,
uint  sum_stride_y,
uint  sum_step_y,
uint  sum_stride_z,
uint  sum_step_z,
uint  sum_offset_first_element_in_bytes,
uint  width 
)

Identifies the maximum value across the 1st dimension and shifts the values of the input tensor by this maximum value, then gets the exponent of each element as sums all elements across each row.

Note
Datatype must be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
Fixed point position must be given as a preprocessor argument using -DFIXED_POINT_POSITION=pos. e.g. DFIXED_POINT_POSITION=4
In case the input is not a multiple of VECTOR_SIZE (2,4,8,16) -DNON_MULTIPLE_OF_VECTOR_SIZE must be passed.
Beta can be optionally passed at compile time using -DBETA (by default, it is 1.0).
Parameters
[in]src_ptrPointer to the source tensor slice. Supported data types: QS8/QS16/F16/F32
[in]src_stride_xStride of the source tensor in X dimension (in bytes)
[in]src_step_xsrc_stride_x * number of elements along X processed per workitem(in bytes)
[in]src_stride_yStride of the source tensor in Y dimension (in bytes)
[in]src_step_ysrc_stride_y * number of elements along Y processed per workitem(in bytes)
[in]src_stride_zStride of the source tensor in Z dimension (in bytes)
[in]src_step_zsrc_stride_z * number of elements along Z processed per workitem(in bytes)
[in]src_offset_first_element_in_bytesThe offset of the first element in the source tensor
[in]maxo_ptrPointer to the max values tensor slice. Supported data types: same as src_ptr
[in]maxo_stride_xStride of the max values tensor in X dimension (in bytes)
[in]maxo_step_xmax_stride_x * number of elements along X processed per workitem(in bytes)
[in]maxo_stride_yStride of the max values tensor in Y dimension (in bytes)
[in]maxo_step_ymax_stride_y * number of elements along Y processed per workitem(in bytes)
[in]maxo_stride_zStride of the max values tensor in Z dimension (in bytes)
[in]maxo_step_zmax_stride_z * number of elements along Z processed per workitem(in bytes)
[in]maxo_offset_first_element_in_bytesThe offset of the first element in the max values tensor
[out]dst_ptrPointer to the destination tensor slice. Supported data types: same as src_ptr
[in]dst_stride_xStride of the destination tensor in X dimension (in bytes)
[in]dst_step_xdst_stride_x * number of elements along X processed per workitem(in bytes)
[in]dst_stride_yStride of the destination tensor in Y dimension (in bytes)
[in]dst_step_ydst_stride_y * number of elements along Y processed per workitem(in bytes)
[in]dst_stride_zStride of the destination tensor in Z dimension (in bytes)
[in]dst_step_zdst_stride_z * number of elements along Z processed per workitem(in bytes)
[in]dst_offset_first_element_in_bytesThe offset of the first element in the destination tensor
[out]sum_ptrPointer to the sum values tensor slice. Supported data types: same as src_ptr
[in]sum_stride_xStride of the sum values tensor in X dimension (in bytes)
[in]sum_step_xsum_stride_x * number of elements along X processed per workitem(in bytes)
[in]sum_stride_yStride of the sum values tensor in Y dimension (in bytes)
[in]sum_step_ysum_stride_z * number of elements along Z processed per workitem(in bytes)
[in]sum_stride_zStride of the sum values tensor in Z dimension (in bytes)
[in]sum_step_zsum_stride_z * number of elements along Z processed per workitem(in bytes)
[in]sum_offset_first_element_in_bytesThe offset of the first element in the sum values tensor
[in]widthInput image width

Definition at line 178 of file softmax_layer.cl.

References ADD_OP, arm_compute::test::validation::beta, CL_VEC_DATA_TYPE, CONVERT, CONVERT_TENSOR3D_TO_IMAGE_STRUCT, arm_compute::test::validation::dst, EXP_OP, EXPAND, idx__, LOG_VECTOR_SIZE, MAX_OP, MUL_OP, offset(), Image::ptr, SELECT_DATA_TYPE, arm_compute::test::validation::src, SUB_OP, sum(), type_min_, VEC_DATA_TYPE, VECTOR_SIZE, VLOAD, and VSTORE.

184 {
189 
190 #ifdef BETA
191  // Initialize beta
194 #endif /* BETA */
195 
196  // Initialize local maximum
197  VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
198  max_val_vec = (VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE))type_min_;
199 
200  // Calculate max of row
201  const uint width_ = width >> LOG_VECTOR_SIZE;
202  for(uint i = 0; i < width_; i++)
203  {
204  VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
205  data_max = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)offset(&src, i << LOG_VECTOR_SIZE, 0));
206  max_val_vec = MAX_OP(data_max, max_val_vec, DATA_TYPE, VECTOR_SIZE);
207  }
208 
209 #ifdef NON_MULTIPLE_OF_VECTOR_SIZE
210  VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
211  data_max = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)offset(&src, width_ << LOG_VECTOR_SIZE, 0));
212  VEC_DATA_TYPE(SELECT_DATA_TYPE, VECTOR_SIZE)
213  widx = CONVERT((EXPAND((CL_VEC_DATA_TYPE(uint, VECTOR_SIZE)))(width_ << LOG_VECTOR_SIZE) + idx__) < width, VEC_DATA_TYPE(SELECT_DATA_TYPE, VECTOR_SIZE));
214  max_val_vec = MAX_OP(max_val_vec, select(type_min_, data_max, widx), DATA_TYPE, VECTOR_SIZE);
215 #endif /* NON_MULTIPLE_OF_VECTOR_SIZE */
216 
217  // Perform max reduction
218 #if VECTOR_SIZE == 16
219  max_val_vec.s01234567 = MAX_OP(max_val_vec.s01234567, max_val_vec.s89ABCDEF, DATA_TYPE, 8);
220 #endif /* VECTOR SIZE 16 END */
221 #if VECTOR_SIZE >= 8
222  max_val_vec.s0123 = MAX_OP(max_val_vec.s0123, max_val_vec.s4567, DATA_TYPE, 4);
223 #endif /* VECTOR SIZE 8 END */
224 #if VECTOR_SIZE >= 4
225  max_val_vec.s01 = MAX_OP(max_val_vec.s01, max_val_vec.s23, DATA_TYPE, 2);
226 #endif /* VECTOR SIZE 4 END */
227  max_val_vec.s0 = MAX_OP(max_val_vec.s0, max_val_vec.s1, DATA_TYPE, 1);
228  // Store result
229  *((__global DATA_TYPE *)maxo.ptr) = max_val_vec.s0;
230 
231  /* Second section */
232 
233  // Load max value of 1D logits vector (row)
234  DATA_TYPE max_val = *((__global DATA_TYPE *)offset(&maxo, 0, 0));
235 
236  // Set sum vector
237  VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
238  sum1D = 0;
239 
240  // Shift values, exp and sum
241  for(uint i = 0; i < width_; i++)
242  {
243  VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
244  data = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)offset(&src, i << LOG_VECTOR_SIZE, 0));
245  data = SUB_OP(data, max_val, DATA_TYPE, VECTOR_SIZE);
246 #ifdef BETA
247  data = MUL_OP(data, beta, DATA_TYPE, VECTOR_SIZE);
248 #endif /* BETA */
249  data = EXP_OP(data, DATA_TYPE, VECTOR_SIZE);
250  VSTORE(VECTOR_SIZE)
251  (data, 0, (__global DATA_TYPE *)offset(&dst, i << LOG_VECTOR_SIZE, 0));
252  sum1D = ADD_OP(sum1D, data, DATA_TYPE, VECTOR_SIZE);
253  }
254 
255 #ifdef NON_MULTIPLE_OF_VECTOR_SIZE
256  VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
257  data = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)offset(&src, width_ << LOG_VECTOR_SIZE, 0));
258  data = SUB_OP(data, max_val, DATA_TYPE, VECTOR_SIZE);
259 #ifdef BETA
260  data = MUL_OP(data, beta, DATA_TYPE, VECTOR_SIZE);
261 #endif /* BETA */
262  data = EXP_OP(data, DATA_TYPE, VECTOR_SIZE);
263  widx = CONVERT((EXPAND((CL_VEC_DATA_TYPE(uint, VECTOR_SIZE)))(width_ << LOG_VECTOR_SIZE) + idx__) < width, VEC_DATA_TYPE(SELECT_DATA_TYPE, VECTOR_SIZE));
264  data = select(0, data, widx);
265  VSTORE(VECTOR_SIZE)
266  (data, 0, (__global DATA_TYPE *)offset(&dst, width_ << LOG_VECTOR_SIZE, 0));
267  sum1D = ADD_OP(sum1D, data, DATA_TYPE, VECTOR_SIZE);
268 #endif /* NON_MULTIPLE_OF_VECTOR_SIZE */
269 
270  // Perform sum reduction
271 #if VECTOR_SIZE == 16
272  sum1D.s01234567 = ADD_OP(sum1D.s01234567, sum1D.s89ABCDEF, DATA_TYPE, 8);
273 #endif /* VECTOR SIZE 16 END */
274 #if VECTOR_SIZE >= 8
275  sum1D.s0123 = ADD_OP(sum1D.s0123, sum1D.s4567, DATA_TYPE, 4);
276 #endif /* VECTOR SIZE 8 END */
277 #if VECTOR_SIZE >= 4
278  sum1D.s01 = ADD_OP(sum1D.s01, sum1D.s23, DATA_TYPE, 2);
279 #endif /* VECTOR SIZE 4 END */
280  sum1D.s0 = ADD_OP(sum1D.s0, sum1D.s1, DATA_TYPE, 1);
281 
282  // Calculate and store result
283  *((__global DATA_TYPE *)sum.ptr) = sum1D.s0;
284 }
__constant uint16 idx__
#define CONVERT(x, type)
Definition: fixed_point.h:98
#define VLOAD(size)
Definition: helpers.h:42
#define EXP_OP(x, type, size)
#define DATA_TYPE
#define LOG_VECTOR_SIZE
#define MAX_OP(x, y, type, size)
DATA_TYPE sum(__global const DATA_TYPE *input)
Calculate sum of a vector.
__constant DATA_TYPE16 type_min_
#define ADD_OP(x, y, type, size)
__global uchar * offset(const Image *img, int x, int y)
Get the pointer position of a Image.
Definition: helpers.h:303
#define VSTORE(size)
Definition: helpers.h:45
#define VECTOR_SIZE
#define SELECT_DATA_TYPE
#define MUL_OP(x, y, type, size)
Structure to hold Image information.
Definition: helpers.h:142
__global uchar * ptr
Pointer to the starting postion of the buffer.
Definition: helpers.h:144
#define CL_VEC_DATA_TYPE(type, size)
Definition: helpers.h:51
#define SUB_OP(x, y, type, size)
#define VEC_DATA_TYPE(type, size)
Definition: fixed_point.h:93
#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name)
Definition: helpers.h:116
#define EXPAND(x)
Definition: helpers.h:37
convolution configure & src
__kernel void softmax_layer_norm ( __global uchar *  src_ptr,
uint  src_stride_x,
uint  src_step_x,
uint  src_stride_y,
uint  src_step_y,
uint  src_stride_z,
uint  src_step_z,
uint  src_offset_first_element_in_bytes,
__global uchar *  sum_ptr,
uint  sum_stride_x,
uint  sum_step_x,
uint  sum_stride_y,
uint  sum_step_y,
uint  sum_stride_z,
uint  sum_step_z,
uint  sum_offset_first_element_in_bytes,
__global uchar *  dst_ptr,
uint  dst_stride_x,
uint  dst_step_x,
uint  dst_stride_y,
uint  dst_step_y,
uint  dst_stride_z,
uint  dst_step_z,
uint  dst_offset_first_element_in_bytes 
)

Divides all the values of the input tensor by the sum calculated from softmax_layer_shift_exp_sum kernel.

Note
Datatype must be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
Fixed point position must be given as a preprocessor argument using -DFIXED_POINT_POSITION=pos. e.g. DFIXED_POINT_POSITION=4
Parameters
[in]src_ptrPointer to the source tensor slice. Supported data types: QS8/QS16/F16/F32
[in]src_stride_xStride of the source tensor in X dimension (in bytes)
[in]src_step_xsrc_stride_x * number of elements along X processed per workitem(in bytes)
[in]src_stride_yStride of the source tensor in Y dimension (in bytes)
[in]src_step_ysrc_stride_y * number of elements along Y processed per workitem(in bytes)
[in]src_stride_zStride of the source tensor in Z dimension (in bytes)
[in]src_step_zsrc_stride_z * number of elements along Z processed per workitem(in bytes)
[in]src_offset_first_element_in_bytesThe offset of the first element in the source tensor
[in]sum_ptrPointer to the sum values tensor slice. Supported data types: same as src_ptr
[in]sum_stride_xStride of the sum values tensor in X dimension (in bytes)
[in]sum_step_xsum_stride_x * number of elements along X processed per workitem(in bytes)
[in]sum_stride_yStride of the sum values tensor in Y dimension (in bytes)
[in]sum_step_ysum_stride_y * number of elements along Y processed per workitem(in bytes)
[in]sum_stride_zStride of the sum values tensor in Z dimension (in bytes)
[in]sum_step_zsum_stride_z * number of elements along Z processed per workitem(in bytes)
[in]sum_offset_first_element_in_bytesThe offset of the first element in the sum values tensor
[out]dst_ptrPointer to the destination tensor slice. Supported data types: same as src_ptr
[in]dst_stride_xStride of the destination tensor in X dimension (in bytes)
[in]dst_step_xdst_stride_x * number of elements along X processed per workitem(in bytes)
[in]dst_stride_yStride of the destination tensor in Y dimension (in bytes)
[in]dst_step_ydst_stride_y * number of elements along Y processed per workitem(in bytes)
[in]dst_stride_zStride of the destination tensor in Z dimension (in bytes)
[in]dst_step_zdst_stride_z * number of elements along Z processed per workitem(in bytes)
[in]dst_offset_first_element_in_bytesThe offset of the first element in the destination tensor

Definition at line 120 of file softmax_layer.cl.

References CONVERT_TENSOR3D_TO_IMAGE_STRUCT, CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP, DIV_OP, offset(), and VEC_DATA_TYPE.

124 {
128 
129  // Load max value of 1D logits vector (row)
130  DATA_TYPE sum_val = *((__global DATA_TYPE *)offset(&sum, 0, get_global_id(1)));
132  data = vload16(0, (__global DATA_TYPE *)offset(&src, 0, 0));
133  vstore16(DIV_OP(data, sum_val, DATA_TYPE, 16), 0, (__global DATA_TYPE *)offset(&dst, 0, 0));
134 }
#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name)
Definition: helpers.h:113
#define DATA_TYPE
DATA_TYPE sum(__global const DATA_TYPE *input)
Calculate sum of a vector.
#define DIV_OP(x, y, type, size)
__global uchar * offset(const Image *img, int x, int y)
Get the pointer position of a Image.
Definition: helpers.h:303
Structure to hold Image information.
Definition: helpers.h:142
#define VEC_DATA_TYPE(type, size)
Definition: fixed_point.h:93
#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name)
Definition: helpers.h:116
convolution configure & src

Variable Documentation

__constant uint16 idx16 = (uint16)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)

Definition at line 87 of file softmax_layer.cl.

__constant uint4 idx4 = (uint4)(0, 1, 2, 3)

Definition at line 88 of file softmax_layer.cl.

Referenced by softmax_layer_max_shift_exp_sum_parallel().

__constant uint16 idx__ = (uint16)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)

Definition at line 82 of file softmax_layer.cl.

Referenced by softmax_layer_max_shift_exp_sum_serial().

__constant DATA_TYPE16 type_min = ( DATA_TYPE16 )( -FLT_MAX )

Definition at line 86 of file softmax_layer.cl.

__constant DATA_TYPE16 type_min_ = ( DATA_TYPE16 )( -FLT_MAX )