author    | Anthony Barbier <Anthony.barbier@arm.com> | 2017-12-14 23:48:46 +0000
committer | Anthony Barbier <anthony.barbier@arm.com> | 2018-01-24 10:01:21 +0000
commit    | 8140e1e155d3430992fa46e04ef8938ff09ffd2d (patch)
tree      | 9bcf86d01635bfc73e8debd1bda75e6f75b8b406 /arm_compute
parent    | 8a3da6f91f90c566b844d568f4ec43b946915af8 (diff)
download  | armcl-8140e1e155d3430992fa46e04ef8938ff09ffd2d.tar.gz
          | armcl-8140e1e155d3430992fa46e04ef8938ff09ffd2d.tar.bz2
          | armcl-8140e1e155d3430992fa46e04ef8938ff09ffd2d.zip
arm_compute v17.12
Diffstat (limited to 'arm_compute')
327 files changed, 18642 insertions, 1411 deletions
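
Note on the static validate() entry points introduced in this release: many of the CL kernels touched in the diff below (CLActivationLayerKernel, CLArithmeticAdditionKernel, CLArithmeticSubtractionKernel, CLBatchNormalizationLayerKernel, CLDirectConvolutionLayerKernel, the new GEMMLowp kernels) gain a static validate() function that checks, from tensor metadata alone, whether a given set of arguments would lead to a valid configuration. The following minimal sketch shows how such a check might be used before configure(); the kernel and function signatures are taken from the diff, but the helper name, the RELU choice, and the way the returned Status is inspected are illustrative assumptions rather than part of the patch.

#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/CL/kernels/CLActivationLayerKernel.h"
#include "arm_compute/core/Error.h"
#include "arm_compute/core/Types.h"

using namespace arm_compute;

// Hypothetical helper: check an activation configuration before creating the kernel.
// 'input' and 'output' are assumed to be already-initialised CL tensors.
void configure_activation_checked(ICLTensor *input, ICLTensor *output)
{
    const ActivationLayerInfo act_info(ActivationLayerInfo::ActivationFunction::RELU);

    // New in v17.12: query validity from the tensor infos only, no allocation needed.
    const Status status = CLActivationLayerKernel::validate(input->info(), output->info(), act_info);
    if(status.error_code() != ErrorCode::OK)
    {
        // Report the problem (e.g. via status.error_description()) instead of configuring.
        return;
    }

    CLActivationLayerKernel kernel;
    kernel.configure(input, output, act_info);
}
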
diff --git a/arm_compute/core/CL/CLHelpers.h b/arm_compute/core/CL/CLHelpers.h index 1a4476e30..365ecb06c 100644 --- a/arm_compute/core/CL/CLHelpers.h +++ b/arm_compute/core/CL/CLHelpers.h @@ -43,7 +43,7 @@ struct enable_bitwise_ops<arm_compute::GPUTarget> }; /** Max vector width of an OpenCL vector */ -static constexpr const unsigned int max_cl_vector_width = 16; +static constexpr unsigned int max_cl_vector_width = 16; /** Translates a tensor data type to the appropriate OpenCL type. * @@ -126,6 +126,13 @@ GPUTarget get_arch_from_target(GPUTarget target); * @return the highest OpenCL version supported */ CLVersion get_cl_version(const cl::Device &device); +/** Helper function to check whether the cl_khr_fp16 extension is supported + * + * @param[in] device A CL device + * + * @return True if the extension is supported + */ +bool fp16_support(const cl::Device &device); /** Helper function to check whether the arm_non_uniform_work_group_size extension is supported * * @param[in] device A CL device diff --git a/arm_compute/core/CL/CLKernelLibrary.h b/arm_compute/core/CL/CLKernelLibrary.h index fc131cdcf..25c7f75ba 100644 --- a/arm_compute/core/CL/CLKernelLibrary.h +++ b/arm_compute/core/CL/CLKernelLibrary.h @@ -33,6 +33,52 @@ namespace arm_compute { +/** Build options */ +class CLBuildOptions +{ + using StringSet = std::set<std::string>; + +public: + /** Default constructor. */ + CLBuildOptions(); + /** Adds option to the existing build option list + * + * @param[in] option Option to add + */ + void add_option(std::string option); + /** Adds option if a given condition is true; + * + * @param[in] cond Condition to check + * @param[in] option Option to add if condition is true + */ + void add_option_if(bool cond, std::string option); + /** Adds first option if condition is true else the second one + * + * @param[in] cond Condition to check + * @param[in] option_true Option to add if condition is true + * @param[in] option_false Option to add if condition is false + */ + void add_option_if_else(bool cond, std::string option_true, std::string option_false); + /** Appends given build options to the current's objects options. + * + * @param[in] options Build options to append + */ + void add_options(const StringSet &options); + /** Appends given build options to the current's objects options if a given condition is true. + * + * @param[in] cond Condition to check + * @param[in] options Option to add if condition is true + */ + void add_options_if(bool cond, const StringSet &options); + /** Gets the current options list set + * + * @return Build options set + */ + const StringSet &options() const; + +private: + StringSet _build_opts; /**< Build options set */ +}; /** Program class */ class Program { @@ -181,8 +227,8 @@ public: return _kernel_path; }; /** Gets the source of the selected program - * - * @param[in] program_name Program name. + * + * @param[in] program_name Program name. */ std::string get_program_source(const std::string &program_name); /** Sets the CL context used to create programs. 
diff --git a/arm_compute/core/CL/CLKernels.h b/arm_compute/core/CL/CLKernels.h index 8da0cecad..9da0e5ab3 100644 --- a/arm_compute/core/CL/CLKernels.h +++ b/arm_compute/core/CL/CLKernels.h @@ -42,9 +42,9 @@ #include "arm_compute/core/CL/kernels/CLCol2ImKernel.h" #include "arm_compute/core/CL/kernels/CLColorConvertKernel.h" #include "arm_compute/core/CL/kernels/CLConvolutionKernel.h" -#include "arm_compute/core/CL/kernels/CLDepthConcatenateKernel.h" -#include "arm_compute/core/CL/kernels/CLDepthConvertKernel.h" -#include "arm_compute/core/CL/kernels/CLDepthwiseConvolution3x3Kernel.h" +#include "arm_compute/core/CL/kernels/CLDepthConcatenateLayerKernel.h" +#include "arm_compute/core/CL/kernels/CLDepthConvertLayerKernel.h" +#include "arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayer3x3Kernel.h" #include "arm_compute/core/CL/kernels/CLDepthwiseIm2ColKernel.h" #include "arm_compute/core/CL/kernels/CLDepthwiseVectorToTensorKernel.h" #include "arm_compute/core/CL/kernels/CLDepthwiseWeightsReshapeKernel.h" @@ -58,6 +58,10 @@ #include "arm_compute/core/CL/kernels/CLFloorKernel.h" #include "arm_compute/core/CL/kernels/CLGEMMInterleave4x4Kernel.h" #include "arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.h" +#include "arm_compute/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h" +#include "arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h" +#include "arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.h" +#include "arm_compute/core/CL/kernels/CLGEMMLowpReductionKernel.h" #include "arm_compute/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.h" #include "arm_compute/core/CL/kernels/CLGEMMMatrixAdditionKernel.h" #include "arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h" @@ -72,7 +76,7 @@ #include "arm_compute/core/CL/kernels/CLHistogramKernel.h" #include "arm_compute/core/CL/kernels/CLIm2ColKernel.h" #include "arm_compute/core/CL/kernels/CLIntegralImageKernel.h" -#include "arm_compute/core/CL/kernels/CLL2NormalizeKernel.h" +#include "arm_compute/core/CL/kernels/CLL2NormalizeLayerKernel.h" #include "arm_compute/core/CL/kernels/CLLKTrackerKernel.h" #include "arm_compute/core/CL/kernels/CLLocallyConnectedMatrixMultiplyKernel.h" #include "arm_compute/core/CL/kernels/CLMagnitudePhaseKernel.h" diff --git a/arm_compute/core/CL/ICLKernel.h b/arm_compute/core/CL/ICLKernel.h index 9119940bc..a1bc3eb8d 100644 --- a/arm_compute/core/CL/ICLKernel.h +++ b/arm_compute/core/CL/ICLKernel.h @@ -180,6 +180,13 @@ public: * @return The maximum workgroup size value. */ size_t get_max_workgroup_size(); + /** Get the global work size given an execution window + * + * @param[in] window Execution window + * + * @return Global work size of the given execution window + */ + static cl::NDRange gws_from_window(const Window &window); private: /** Add the passed array's parameters to the object's kernel's arguments starting from the index idx. diff --git a/arm_compute/core/CL/ICLMultiHOG.h b/arm_compute/core/CL/ICLMultiHOG.h index 9f3c77523..90082a611 100644 --- a/arm_compute/core/CL/ICLMultiHOG.h +++ b/arm_compute/core/CL/ICLMultiHOG.h @@ -35,14 +35,14 @@ class ICLMultiHOG : public IMultiHOG public: /** Return a pointer to the requested OpenCL HOG model * - * @param[in] index The index of the wanted OpenCL HOG model. + * @param[in] index The index of the wanted OpenCL HOG model. 
* * @return A pointer pointed to the HOG model */ virtual ICLHOG *cl_model(size_t index) = 0; /** Return a constant pointer to the requested OpenCL HOG model * - * @param[in] index The index of the wanted OpenCL HOG model. + * @param[in] index The index of the wanted OpenCL HOG model. * * @return A constant pointer pointed to the OpenCL HOG model */ diff --git a/arm_compute/core/CL/ICLMultiImage.h b/arm_compute/core/CL/ICLMultiImage.h index e8705b182..774175607 100644 --- a/arm_compute/core/CL/ICLMultiImage.h +++ b/arm_compute/core/CL/ICLMultiImage.h @@ -37,14 +37,14 @@ class ICLMultiImage : public IMultiImage public: /** Return a pointer to the requested OpenCL plane of the image. * - * @param[in] index The index of the wanted planed. + * @param[in] index The index of the wanted planed. * * @return A pointer pointed to the OpenCL plane */ virtual ICLImage *cl_plane(unsigned int index) = 0; /** Return a constant pointer to the requested OpenCL plane of the image. * - * @param[in] index The index of the wanted planed. + * @param[in] index The index of the wanted planed. * * @return A constant pointer pointed to the OpenCL plane */ diff --git a/arm_compute/core/CL/OpenCL.h b/arm_compute/core/CL/OpenCL.h index 6780e23c2..8a2d30bb8 100644 --- a/arm_compute/core/CL/OpenCL.h +++ b/arm_compute/core/CL/OpenCL.h @@ -54,69 +54,42 @@ public: bool load(const std::string &library); bool load_default(); - using clBuildProgram_func = cl_int (*)(cl_program, cl_uint, const cl_device_id *, const char *, void (*pfn_notify)(cl_program, void *), void *); - using clEnqueueNDRangeKernel_func = cl_int (*)(cl_command_queue, cl_kernel, cl_uint, const size_t *, const size_t *, const size_t *, cl_uint, const cl_event *, cl_event *); - using clSetKernelArg_func = cl_int (*)(cl_kernel, cl_uint, size_t, const void *); - using clRetainMemObject_func = cl_int (*)(cl_mem); - using clReleaseMemObject_func = cl_int (*)(cl_mem); - using clEnqueueUnmapMemObject_func = cl_int (*)(cl_command_queue, cl_mem, void *, cl_uint, const cl_event *, cl_event *); - using clRetainCommandQueue_func = cl_int (*)(cl_command_queue command_queue); - using clReleaseContext_func = cl_int (*)(cl_context); - using clReleaseEvent_func = cl_int (*)(cl_event); - using clEnqueueWriteBuffer_func = cl_int (*)(cl_command_queue, cl_mem, cl_bool, size_t, size_t, const void *, cl_uint, const cl_event *, cl_event *); - using clEnqueueReadBuffer_func = cl_int (*)(cl_command_queue, cl_mem, cl_bool, size_t, size_t, void *, cl_uint, const cl_event *, cl_event *); - using clGetProgramBuildInfo_func = cl_int (*)(cl_program, cl_device_id, cl_program_build_info, size_t, void *, size_t *); - using clRetainProgram_func = cl_int (*)(cl_program program); - using clEnqueueMapBuffer_func = void *(*)(cl_command_queue, cl_mem, cl_bool, cl_map_flags, size_t, size_t, cl_uint, const cl_event *, cl_event *, cl_int *); - using clReleaseCommandQueue_func = cl_int (*)(cl_command_queue); - using clCreateProgramWithBinary_func = cl_program (*)(cl_context, cl_uint, const cl_device_id *, const size_t *, const unsigned char **, cl_int *, cl_int *); - using clRetainContext_func = cl_int (*)(cl_context context); - using clReleaseProgram_func = cl_int (*)(cl_program program); - using clFlush_func = cl_int (*)(cl_command_queue command_queue); - using clFinish_func = cl_int (*)(cl_command_queue command_queue); - using clGetProgramInfo_func = cl_int (*)(cl_program, cl_program_info, size_t, void *, size_t *); - using clCreateKernel_func = cl_kernel (*)(cl_program, const char *, cl_int *); - 
using clRetainKernel_func = cl_int (*)(cl_kernel kernel); - using clCreateBuffer_func = cl_mem (*)(cl_context, cl_mem_flags, size_t, void *, cl_int *); - using clCreateProgramWithSource_func = cl_program (*)(cl_context, cl_uint, const char **, const size_t *, cl_int *); - using clReleaseKernel_func = cl_int (*)(cl_kernel kernel); - using clGetDeviceInfo_func = cl_int (*)(cl_device_id, cl_device_info, size_t, void *, size_t *); - using clGetDeviceIDs_func = cl_int (*)(cl_platform_id, cl_device_type, cl_uint, cl_device_id *, cl_uint *); - using clRetainEvent_func = cl_int (*)(cl_event); - using clGetPlatformIDs_func = cl_int (*)(cl_uint, cl_platform_id *, cl_uint *); - using clGetKernelWorkGroupInfo_func = cl_int (*)(cl_kernel, cl_device_id, cl_kernel_work_group_info, size_t, void *, size_t *); +#define DECLARE_FUNCTION_PTR(func_name) \ + std::function<decltype(func_name)> func_name##_ptr = nullptr - clBuildProgram_func clBuildProgram = nullptr; - clEnqueueNDRangeKernel_func clEnqueueNDRangeKernel = nullptr; - clSetKernelArg_func clSetKernelArg = nullptr; - clReleaseKernel_func clReleaseKernel = nullptr; - clCreateProgramWithSource_func clCreateProgramWithSource = nullptr; - clCreateBuffer_func clCreateBuffer = nullptr; - clRetainKernel_func clRetainKernel = nullptr; - clCreateKernel_func clCreateKernel = nullptr; - clGetProgramInfo_func clGetProgramInfo = nullptr; - clFlush_func clFlush = nullptr; - clFinish_func clFinish = nullptr; - clReleaseProgram_func clReleaseProgram = nullptr; - clRetainContext_func clRetainContext = nullptr; - clCreateProgramWithBinary_func clCreateProgramWithBinary = nullptr; - clReleaseCommandQueue_func clReleaseCommandQueue = nullptr; - clEnqueueMapBuffer_func clEnqueueMapBuffer = nullptr; - clRetainProgram_func clRetainProgram = nullptr; - clGetProgramBuildInfo_func clGetProgramBuildInfo = nullptr; - clEnqueueReadBuffer_func clEnqueueReadBuffer = nullptr; - clEnqueueWriteBuffer_func clEnqueueWriteBuffer = nullptr; - clReleaseEvent_func clReleaseEvent = nullptr; - clReleaseContext_func clReleaseContext = nullptr; - clRetainCommandQueue_func clRetainCommandQueue = nullptr; - clEnqueueUnmapMemObject_func clEnqueueUnmapMemObject = nullptr; - clRetainMemObject_func clRetainMemObject = nullptr; - clReleaseMemObject_func clReleaseMemObject = nullptr; - clGetDeviceInfo_func clGetDeviceInfo = nullptr; - clGetDeviceIDs_func clGetDeviceIDs = nullptr; - clRetainEvent_func clRetainEvent = nullptr; - clGetPlatformIDs_func clGetPlatformIDs = nullptr; - clGetKernelWorkGroupInfo_func clGetKernelWorkGroupInfo = nullptr; + DECLARE_FUNCTION_PTR(clBuildProgram); + DECLARE_FUNCTION_PTR(clEnqueueNDRangeKernel); + DECLARE_FUNCTION_PTR(clSetKernelArg); + DECLARE_FUNCTION_PTR(clReleaseKernel); + DECLARE_FUNCTION_PTR(clCreateProgramWithSource); + DECLARE_FUNCTION_PTR(clCreateBuffer); + DECLARE_FUNCTION_PTR(clRetainKernel); + DECLARE_FUNCTION_PTR(clCreateKernel); + DECLARE_FUNCTION_PTR(clGetProgramInfo); + DECLARE_FUNCTION_PTR(clFlush); + DECLARE_FUNCTION_PTR(clFinish); + DECLARE_FUNCTION_PTR(clReleaseProgram); + DECLARE_FUNCTION_PTR(clRetainContext); + DECLARE_FUNCTION_PTR(clCreateProgramWithBinary); + DECLARE_FUNCTION_PTR(clReleaseCommandQueue); + DECLARE_FUNCTION_PTR(clEnqueueMapBuffer); + DECLARE_FUNCTION_PTR(clRetainProgram); + DECLARE_FUNCTION_PTR(clGetProgramBuildInfo); + DECLARE_FUNCTION_PTR(clEnqueueReadBuffer); + DECLARE_FUNCTION_PTR(clEnqueueWriteBuffer); + DECLARE_FUNCTION_PTR(clReleaseEvent); + DECLARE_FUNCTION_PTR(clReleaseContext); + 
DECLARE_FUNCTION_PTR(clRetainCommandQueue); + DECLARE_FUNCTION_PTR(clEnqueueUnmapMemObject); + DECLARE_FUNCTION_PTR(clRetainMemObject); + DECLARE_FUNCTION_PTR(clReleaseMemObject); + DECLARE_FUNCTION_PTR(clGetDeviceInfo); + DECLARE_FUNCTION_PTR(clGetDeviceIDs); + DECLARE_FUNCTION_PTR(clRetainEvent); + DECLARE_FUNCTION_PTR(clGetPlatformIDs); + DECLARE_FUNCTION_PTR(clGetKernelWorkGroupInfo); + +#undef DECLARE_FUNCTION_PTR private: std::pair<bool, bool> _loaded{ false, false }; diff --git a/arm_compute/core/CL/kernels/CLActivationLayerKernel.h b/arm_compute/core/CL/kernels/CLActivationLayerKernel.h index dab133f05..5b6c44cdd 100644 --- a/arm_compute/core/CL/kernels/CLActivationLayerKernel.h +++ b/arm_compute/core/CL/kernels/CLActivationLayerKernel.h @@ -56,6 +56,16 @@ public: * @param[in] act_info Activation layer information. */ void configure(ICLTensor *input, ICLTensor *output, ActivationLayerInfo act_info); + /** Static function to check if given info will lead to a valid configuration of @ref CLActivationLayerKernel + * + * @param[in] input Source tensor info. In case of @p output tensor info = nullptr, this tensor will store the result + * of the activation function. Data types supported: QS8/QS16/F16/F32. + * @param[in] output Destination tensor info. Data type supported: same as @p input + * @param[in] act_info Activation layer information. + * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *output, const ActivationLayerInfo &act_info); // Inherited methods overridden: void run(const Window &window, cl::CommandQueue &queue) override; diff --git a/arm_compute/core/CL/kernels/CLArithmeticAdditionKernel.h b/arm_compute/core/CL/kernels/CLArithmeticAdditionKernel.h index 0895fe3f7..96b8dc8d4 100644 --- a/arm_compute/core/CL/kernels/CLArithmeticAdditionKernel.h +++ b/arm_compute/core/CL/kernels/CLArithmeticAdditionKernel.h @@ -59,6 +59,16 @@ public: * @param[in] policy Policy to use to handle overflow. */ void configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, ConvertPolicy policy); + /** Static function to check if given info will lead to a valid configuration of @ref CLArithmeticAdditionKernel + * + * @param[in] input1 First tensor input info. Data types supported: U8/QS8/QS16/S16/F16/F32. + * @param[in] input2 Second tensor input info. Data types supported: U8/QS8 (only if @p input1 is QS8), QS16 (only if @p input1 is QS16), S16/F16/F32. + * @param[in] output Output tensor info. Data types supported: U8 (Only if both inputs are U8), QS8 (only if both inputs are QS8), QS16 (only if both inputs are QS16), S16/F16/F32. + * @param[in] policy Policy to use to handle overflow. + * + * @return a status + */ + static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy); // Inherited methods overridden: void run(const Window &window, cl::CommandQueue &queue) override; diff --git a/arm_compute/core/CL/kernels/CLArithmeticSubtractionKernel.h b/arm_compute/core/CL/kernels/CLArithmeticSubtractionKernel.h index d7755d5e3..c5f862a61 100644 --- a/arm_compute/core/CL/kernels/CLArithmeticSubtractionKernel.h +++ b/arm_compute/core/CL/kernels/CLArithmeticSubtractionKernel.h @@ -61,6 +61,16 @@ public: * @param[in] policy Policy to use to handle overflow. 
*/ void configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, ConvertPolicy policy); + /** Static function to check if given info will lead to a valid configuration of @ref CLArithmeticSubtractionKernel + * + * @param[in] input1 First tensor input info. Data types supported: U8/QS8/QS16/S16/F16/F32. + * @param[in] input2 Second tensor input info. Data types supported: U8/QS8 (only if @p input1 is QS8), QS16 (only if @p input1 is QS16), S16/F16/F32. + * @param[in] output Output tensor info. Data types supported: U8 (Only if both inputs are U8), QS8 (only if both inputs are QS8), QS16 (only if both inputs are QS16), S16/F16/F32. + * @param[in] policy Policy to use to handle overflow. + * + * @return a status + */ + static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy); // Inherited methods overridden: void run(const Window &window, cl::CommandQueue &queue) override; diff --git a/arm_compute/core/CL/kernels/CLBatchNormalizationLayerKernel.h b/arm_compute/core/CL/kernels/CLBatchNormalizationLayerKernel.h index 26825efba..8643d83bc 100644 --- a/arm_compute/core/CL/kernels/CLBatchNormalizationLayerKernel.h +++ b/arm_compute/core/CL/kernels/CLBatchNormalizationLayerKernel.h @@ -55,14 +55,32 @@ public: * @param[in, out] input Source tensor. In case of @p output tensor = nullptr, this tensor will store the result. * 3 lower dimensions represent a single input with dimensions [width, height, FM]. * The rest are optional and used for representing batches. Data types supported: QS8/QS16/F16/F32. + * @param[out] output Destination tensor. Output will have the same number of dimensions as input. Data type supported: same as @p input * @param[in] mean Mean values tensor. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input * @param[in] var Variance values tensor. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input - * @param[in] gamma Gamma values tensor. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input * @param[in] beta Beta values tensor. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input + * @param[in] gamma Gamma values tensor. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input * @param[in] epsilon Small value to avoid division with zero. - * @param[out] output Destination tensor. Output will have the same number of dimensions as input. Data type supported: same as @p input */ void configure(ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *var, const ICLTensor *beta, const ICLTensor *gamma, float epsilon); + /** Static function to check if given info will lead to a valid configuration of @ref CLBatchNormalizationLayerKernel + * + * @param[in] input Source tensor info. In case of @p output tensor info = nullptr, this tensor will store the result. + * 3 lower dimensions represent a single input with dimensions [width, height, FM]. + * The rest are optional and used for representing batches. Data types supported: QS8/QS16/F16/F32. + * @param[in] output Destination tensor info. Output will have the same number of dimensions as input. Data type supported: same as @p input + * @param[in] mean Mean values tensor info. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input + * @param[in] var Variance values tensor info. 
1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input + * @param[in] beta Beta values tensor info. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input + * @param[in] gamma Gamma values tensor info. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input + * @param[in] epsilon Small value to avoid division with zero. + * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *output, + const ITensorInfo *mean, const ITensorInfo *var, + const ITensorInfo *beta, const ITensorInfo *gamma, + float epsilon); // Inherited methods overridden: void run(const Window &window, cl::CommandQueue &queue) override; diff --git a/arm_compute/core/CL/kernels/CLChannelExtractKernel.h b/arm_compute/core/CL/kernels/CLChannelExtractKernel.h index 404b2d144..96ce44220 100644 --- a/arm_compute/core/CL/kernels/CLChannelExtractKernel.h +++ b/arm_compute/core/CL/kernels/CLChannelExtractKernel.h @@ -53,14 +53,14 @@ public: ~CLChannelExtractKernel() = default; /** Set the input and output of the kernel * - * @param[in] input Source tensor. + * @param[in] input Source tensor. Formats supported: RGB888/RGBA8888/YUYV422/UYVY422 * @param[in] channel Channel to extract. * @param[out] output Destination tensor. Must be of U8 format. */ void configure(const ICLTensor *input, Channel channel, ICLTensor *output); /** Set the input and output of the kernel * - * @param[in] input Multi-planar source image. + * @param[in] input Multi-planar source image. Formats supported: NV12/NV21/IYUV/YUV444 * @param[in] channel Channel to extract. * @param[out] output Single-planar 2D destination image. Must be of U8 format. */ diff --git a/arm_compute/core/CL/kernels/CLCol2ImKernel.h b/arm_compute/core/CL/kernels/CLCol2ImKernel.h index 807748cfd..bd86da1b5 100644 --- a/arm_compute/core/CL/kernels/CLCol2ImKernel.h +++ b/arm_compute/core/CL/kernels/CLCol2ImKernel.h @@ -66,7 +66,7 @@ public: /** Set the input and output of the kernel. * - * @param[in] input The input tensor to convert. Data types supported: QS8/QS16/F16/F32 + * @param[in] input The input tensor to convert. Data types supported: QS8/QS16/QASYMM8/F16/F32 * @param[out] output The output tensor. 3 lower dimensions represent a single output [width, height, OFM], * while the rest represent batch of outputs. Data types supported: Same as @p input * @param[in] convolved_dims Output convolved dimensions. diff --git a/arm_compute/core/CL/kernels/CLColorConvertKernel.h b/arm_compute/core/CL/kernels/CLColorConvertKernel.h index 23f1c56c6..edd05ef00 100644 --- a/arm_compute/core/CL/kernels/CLColorConvertKernel.h +++ b/arm_compute/core/CL/kernels/CLColorConvertKernel.h @@ -53,26 +53,27 @@ public: /** Set the input and output of the kernel * - * @param[in] input Source tensor - * @param[out] output Destination tensor + * @param[in] input Source tensor. Formats supported: RGBA8888/UYVY422/YUYV422/RGB888 + * @param[out] output Destination tensor. Formats supported: RGB888 (if the formats of @p input are RGBA8888/UYVY422/YUYV422), + * RGBA8888 (if the formats of @p input are UYVY422/YUYV422/RGB888/) */ void configure(const ICLTensor *input, ICLTensor *output); /** Set the input and output of the kernel * - * @param[in] input multi-planar source image - * @param[out] output single-planar destination image + * @param[in] input Multi-planar source image. Formats supported: NV12/NV21/IYUV + * @param[out] output Single-planar destination image. 
Formats supported: RGB888/RGBA8888 */ void configure(const ICLMultiImage *input, ICLImage *output); /** Set the input and output of the kernel * - * @param[in] input single-planar source image - * @param[out] output multi-planar destination image + * @param[in] input Single-planar source image. Formats supported: RGB888/RGBA8888/UYVY422/YUYV422 + * @param[out] output Multi-planar destination image. Formats supported: NV12/IYUV/YUV444 (if the formats of @p input are RGB888/RGB8888) */ void configure(const ICLImage *input, ICLMultiImage *output); /** Set the input and output of the kernel * - * @param[in] input multi-planar source image - * @param[out] output multi-planar destination image + * @param[in] input Multi-planar source image. Formats supported: NV12/NV21/IYUV + * @param[out] output Multi-planar destination image. Formats supported: YUV444/IYUV (if the formats of @p input are NV12/NV21)/NV12 (if the format of @p input is IYUV) */ void configure(const ICLMultiImage *input, ICLMultiImage *output); diff --git a/arm_compute/core/CL/kernels/CLDepthConcatenateKernel.h b/arm_compute/core/CL/kernels/CLDepthConcatenateLayerKernel.h index 2833d8ec2..467bdfab3 100644 --- a/arm_compute/core/CL/kernels/CLDepthConcatenateKernel.h +++ b/arm_compute/core/CL/kernels/CLDepthConcatenateLayerKernel.h @@ -35,21 +35,21 @@ class ICLTensor; /** Interface for the depth concatenate kernel. * The input tensor will be concatenated into the output tensor. */ -class CLDepthConcatenateKernel : public ICLKernel +class CLDepthConcatenateLayerKernel : public ICLKernel { public: /** Default constructor */ - CLDepthConcatenateKernel(); + CLDepthConcatenateLayerKernel(); /** Prevent instances of this class from being copied (As this class contains pointers) */ - CLDepthConcatenateKernel(const CLDepthConcatenateKernel &) = delete; + CLDepthConcatenateLayerKernel(const CLDepthConcatenateLayerKernel &) = delete; /** Prevent instances of this class from being copied (As this class contains pointers) */ - CLDepthConcatenateKernel &operator=(const CLDepthConcatenateKernel &) = delete; + CLDepthConcatenateLayerKernel &operator=(const CLDepthConcatenateLayerKernel &) = delete; /** Allow instances of this class to be moved */ - CLDepthConcatenateKernel(CLDepthConcatenateKernel &&) = default; + CLDepthConcatenateLayerKernel(CLDepthConcatenateLayerKernel &&) = default; /** Allow instances of this class to be moved */ - CLDepthConcatenateKernel &operator=(CLDepthConcatenateKernel &&) = default; + CLDepthConcatenateLayerKernel &operator=(CLDepthConcatenateLayerKernel &&) = default; /** Default destructor */ - ~CLDepthConcatenateKernel() = default; + ~CLDepthConcatenateLayerKernel() = default; /** Initialise the kernel's inputs and output * * @param[in] input Input tensor. Data types supported: QS8/QS16/F16/F32. diff --git a/arm_compute/core/CL/kernels/CLDepthConvertKernel.h b/arm_compute/core/CL/kernels/CLDepthConvertLayerKernel.h index da70bff0f..3a6310d69 100644 --- a/arm_compute/core/CL/kernels/CLDepthConvertKernel.h +++ b/arm_compute/core/CL/kernels/CLDepthConvertLayerKernel.h @@ -36,7 +36,7 @@ class ICLTensor; /** Interface for the depth conversion kernel. * */ -class CLDepthConvertKernel : public ICLSimple2DKernel +class CLDepthConvertLayerKernel : public ICLSimple2DKernel { public: /** Set the input and output of the kernel. 
diff --git a/arm_compute/core/CL/kernels/CLDepthwiseConvolution3x3Kernel.h b/arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayer3x3Kernel.h index 4e69f551b..eb62465f8 100644 --- a/arm_compute/core/CL/kernels/CLDepthwiseConvolution3x3Kernel.h +++ b/arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayer3x3Kernel.h @@ -32,27 +32,29 @@ class ICLTensor; /** Interface for the kernel to run a 3x3 depthwise convolution on a tensor. */ -class CLDepthwiseConvolution3x3Kernel : public ICLKernel +class CLDepthwiseConvolutionLayer3x3Kernel : public ICLKernel { public: /** Default constructor */ - CLDepthwiseConvolution3x3Kernel(); + CLDepthwiseConvolutionLayer3x3Kernel(); /** Prevent instances of this class from being copied (As this class contains pointers) */ - CLDepthwiseConvolution3x3Kernel(const CLDepthwiseConvolution3x3Kernel &) = delete; + CLDepthwiseConvolutionLayer3x3Kernel(const CLDepthwiseConvolutionLayer3x3Kernel &) = delete; /** Prevent instances of this class from being copied (As this class contains pointers) */ - CLDepthwiseConvolution3x3Kernel &operator=(const CLDepthwiseConvolution3x3Kernel &) = delete; + CLDepthwiseConvolutionLayer3x3Kernel &operator=(const CLDepthwiseConvolutionLayer3x3Kernel &) = delete; /** Default Move Constructor. */ - CLDepthwiseConvolution3x3Kernel(CLDepthwiseConvolution3x3Kernel &&) = default; + CLDepthwiseConvolutionLayer3x3Kernel(CLDepthwiseConvolutionLayer3x3Kernel &&) = default; /** Default move assignment operator. */ - CLDepthwiseConvolution3x3Kernel &operator=(CLDepthwiseConvolution3x3Kernel &&) = default; + CLDepthwiseConvolutionLayer3x3Kernel &operator=(CLDepthwiseConvolutionLayer3x3Kernel &&) = default; /** Initialize the function's source, destination, conv and border_size. * - * @param[in] input Source tensor. DataType supported: F32. + * @param[in] input Source tensor. DataType supported: QASYMM8/F32. + * @param[in] weights Weights tensor. A 3D tensor with dimensions [3, 3, IFM]. Data type supported: Same as @p input. + * @param[in] biases (Optional) Biases tensor. A 1D tensor with dimensions [IFM]. Must be nullptr if not needed. + * Data type supported: Same as @p input. * @param[out] output Destination tensor. Data type supported: Same as @p input. - * @param[in] weights Weights tensor. These are 3D tensors with dimensions [3, 3, IFM]. Data type supported: Same as @p input. * @param[in] conv_info Padding and stride information to use for the convolution. 
*/ - void configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *weights, const PadStrideInfo &conv_info); + void configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info); // Inherited methods overridden: void run(const Window &window, cl::CommandQueue &queue) override; @@ -63,10 +65,11 @@ private: const ICLTensor *_input; ICLTensor *_output; const ICLTensor *_weights; + const ICLTensor *_biases; unsigned int _conv_stride_x; unsigned int _conv_stride_y; - unsigned int _conv_pad_x; - unsigned int _conv_pad_y; + unsigned int _conv_pad_left; + unsigned int _conv_pad_top; }; } // namespace arm_compute #endif /*__ARM_COMPUTE_CLDEPTHWISECONVOLUTIONKERNEL3x3_H__ */ diff --git a/arm_compute/core/CL/kernels/CLDepthwiseIm2ColKernel.h b/arm_compute/core/CL/kernels/CLDepthwiseIm2ColKernel.h index ae56adfa3..7e786e8df 100644 --- a/arm_compute/core/CL/kernels/CLDepthwiseIm2ColKernel.h +++ b/arm_compute/core/CL/kernels/CLDepthwiseIm2ColKernel.h @@ -56,8 +56,9 @@ public: * while every dimension above 3 represents a batch. Data types supported: Same as @p input * @param[in] kernel_dims The kernel dimensions (width and height). * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. + * @param[in] has_bias Boolean that specifies if the depthwise convolution has bias. */ - void configure(const ICLTensor *input, ICLTensor *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info); + void configure(const ICLTensor *input, ICLTensor *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias = false); // Inherited methods overridden: void run(const Window &window, cl::CommandQueue &queue) override; diff --git a/arm_compute/core/CL/kernels/CLDepthwiseWeightsReshapeKernel.h b/arm_compute/core/CL/kernels/CLDepthwiseWeightsReshapeKernel.h index d493d9f05..7989257d3 100644 --- a/arm_compute/core/CL/kernels/CLDepthwiseWeightsReshapeKernel.h +++ b/arm_compute/core/CL/kernels/CLDepthwiseWeightsReshapeKernel.h @@ -52,14 +52,16 @@ public: * * @param[in] input The input tensor to convert. 3 lower dimensions represent a single input [width, height, IFM]. Data type supported: F32. * @param[out] output The output tensor. Data type supported: same as @p input. + * @param[in] biases (Optional) The input biases to add. Shape [IFM]. Data type supported: same as @p input. */ - void configure(const ICLTensor *input, ICLTensor *output); + void configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *biases = nullptr); // Inherited methods overridden: void run(const Window &window, cl::CommandQueue &queue) override; private: const ICLTensor *_input; + const ICLTensor *_biases; ICLTensor *_output; }; } // arm_compute diff --git a/arm_compute/core/CL/kernels/CLDirectConvolutionLayerKernel.h b/arm_compute/core/CL/kernels/CLDirectConvolutionLayerKernel.h index d876143a3..d47b7da21 100644 --- a/arm_compute/core/CL/kernels/CLDirectConvolutionLayerKernel.h +++ b/arm_compute/core/CL/kernels/CLDirectConvolutionLayerKernel.h @@ -56,22 +56,37 @@ public: * 5x5 convolution with stride_x = 1/2, stride_y = 1/2 * * @param[in] input The input tensor to convolve. 3 lower dimensions represent a single input [width, height, IFM], - * while every optional dimension from 4 and above represent a batch of inputs. Data types supported: QS8/QS16/F16/F32. + * while every optional dimension from 4 and above represent a batch of inputs. Data types supported: QASYMM8/QS8/QS16/F16/F32. 
* @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. * The 3rd dimension must be the same as the input's volume 3rd dimension. * Data type supported:Same as @p input. - * @param[in] biases Biases tensor. Biases are 1D tensor with dimension [OFM]. Data type supported: Same as @p input. + * @param[in] biases Biases tensor. Biases are 1D tensor with dimension [OFM]. + * Data type supported: Should match @p input data type, except for input of QASYMM8 type where biases should be of S32 type * @param[out] output Output tensor. * The 3rd dimensions must be equal to the 4th dimension of the @p kernels tensor. Data types supported: Same as @p input. * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. */ void configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info); - - // Inherited methods overridden: - BorderSize border_size() const override; + /** Static function to check if given info will lead to a valid configuration of @ref CLDirectConvolutionLayerKernel + * + * @param[in] input The input tensor to convolve. 3 lower dimensions represent a single input [width, height, IFM], + * while every optional dimension from 4 and above represent a batch of inputs. Data types supported: QS8/QASYMM8/QS16/F16/F32. + * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. + * The 3rd dimension must be the same as the input's volume 3rd dimension. + * Data type supported:Same as @p input. + * @param[in] biases Biases tensor. Biases are 1D tensor with dimension [OFM]. Data type supported: Same as @p input. + * @param[in] output Output tensor. + * The 3rd dimensions must be equal to the 4th dimension of the @p kernels tensor. Data types supported: Same as @p input. + * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. + * @param[in] target Target GPU architecture. + * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, const GPUTarget target); // Inherited methods overridden: void run(const Window &window, cl::CommandQueue &queue) override; + BorderSize border_size() const override; private: const ICLTensor *_input; @@ -79,8 +94,6 @@ private: const ICLTensor *_weights; ICLTensor *_output; BorderSize _border_size; - int _conv_pad_x; - int _conv_pad_y; int _conv_stride_x; int _conv_stride_y; }; diff --git a/arm_compute/core/CL/kernels/CLGEMMInterleave4x4Kernel.h b/arm_compute/core/CL/kernels/CLGEMMInterleave4x4Kernel.h index 8e0c1836a..c87fb2cd6 100644 --- a/arm_compute/core/CL/kernels/CLGEMMInterleave4x4Kernel.h +++ b/arm_compute/core/CL/kernels/CLGEMMInterleave4x4Kernel.h @@ -64,7 +64,7 @@ public: CLGEMMInterleave4x4Kernel &operator=(CLGEMMInterleave4x4Kernel &&) = default; /** Initialise the kernel's input and output. * - * @param[in] input Input tensor. Data types supported: U8/S8/QS8/U16/S16/QS16/F16/U32/S32/F32 + * @param[in] input Input tensor. Data types supported: U8/S8/QS8/QASYMM8/U16/S16/QS16/F16/U32/S32/F32 * @param[out] output Output tensor. 
Data type supported: same as @p input */ void configure(const ICLTensor *input, ICLTensor *output); diff --git a/arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.h b/arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.h index 05956aeeb..b60b80618 100644 --- a/arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.h +++ b/arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.h @@ -30,15 +30,15 @@ namespace arm_compute { class ICLTensor; -/** OpenCL kernel to compute low precision matrix multiplication kernel +/** OpenCL kernel to multiply matrices * + * @note @ref CLGEMMLowpMatrixMultiplyKernel low precision matrix product kernel * This kernel performs the following computation: - * -# Convert a values from uint8 to int32 and add a_offset to each of them. - * -# Convert b values from uint8 to int32 and add b_offset to each of them. - * -# Compute the int32 matrix product of the resulting a * b. - * -# Add output_offset to each entry of the result. - * -# Multiply each entry of the result and round to the nearest integer - * -# Clamp the resulting int32 values to the [0..255] range and cast to uint8. + * + * -# Convert a values from int8 to int32 + * -# Convert b values from int8 to int32 + * -# Compute the int32 matrix product of the resulting a * b and store the result as int32 + * */ class CLGEMMLowpMatrixMultiplyKernel : public ICLKernel { @@ -55,19 +55,12 @@ public: CLGEMMLowpMatrixMultiplyKernel &operator=(CLGEMMLowpMatrixMultiplyKernel &&) = default; /** Initialise the kernel's input and output. * - * The input matrices @p input0 and @p input1 must be the output of the kernels: @ref CLGEMMInterleave4x4Kernel and @ref CLGEMMTranspose1xWKernel. - * These two kernels change the layout of the original matrices to be more cache-friendly. - * - * @param[in] input0 Input tensor containing the interleaved Matrix A. Data types supported: U8 - * @param[in] input1 Input tensor containing the transposed Matrix B. Data types supported: same as @p input0 - * @param[out] output Output tensor to store the result of matrix multiplication, Data types supported: same as @p input0 - * @param[in] a_offset Offset to be added to each element of the matrix A. - * @param[in] b_offset Offset to be added to each element of the matrix B. - * @param[in] output_offset Offset to be added to each element of the output matrix - * @param[in] output_mult_int Offset to be added to each element of the output matrix - * @param[in] shift Number of bits to shift right the result. + * @param[in] input0 Input tensor containing the interleaved Matrix A. Data type supported: QASYMM8 + * @param[in] input1 Input tensor containing the transposed1xW Matrix B. Data type supported: same as @p input0 + * @param[out] output Output tensor to store the result of matrix multiplication. 
Data type supported: S32 + * @param[in] is_interleaved_transposed (Optional) True if input0 and input1 have been reshaped respectively using @ref CLGEMMInterleave4x4Kernel and @ref CLGEMMTranspose1xWKernel */ - void configure(const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, int32_t a_offset, int32_t b_offset, int32_t output_offset, int32_t output_mult_int, int32_t shift); + void configure(const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, bool is_interleaved_transposed = true); // Inherited methods overridden: void run(const Window &window, cl::CommandQueue &queue) override; diff --git a/arm_compute/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h b/arm_compute/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h new file mode 100644 index 000000000..5f2e02568 --- /dev/null +++ b/arm_compute/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h @@ -0,0 +1,82 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_CLGEMMLOWPOFFSETCONTRIBUTIONKERNEL_H__ +#define __ARM_COMPUTE_CLGEMMLOWPOFFSETCONTRIBUTIONKERNEL_H__ + +#include "arm_compute/core/CL/ICLKernel.h" + +namespace arm_compute +{ +class ICLTensor; + +/** OpenCL kernel used to add the offset contribution after @ref CLGEMMLowpMatrixMultiplyKernel. The computation is performed in-place + * + * This kernel takes a final int32 accumulator value (the output of @ref CLGEMMLowpMatrixMultiplyKernel), + * and adds to it the offset contribution of matrix A and matrix B in-place. 
+ * + * The final result is: + * + * mm_result[i][k] = mm_result[i][k] + + * (vector_sum_col[k] * a_offset) + + * (vector_sum_row[i] * b_offset) + + * (a_offset * b_offset * k) + * + */ +class CLGEMMLowpOffsetContributionKernel : public ICLKernel +{ +public: + /** Constructor */ + CLGEMMLowpOffsetContributionKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers)*/ + CLGEMMLowpOffsetContributionKernel(const CLGEMMLowpOffsetContributionKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers)*/ + CLGEMMLowpOffsetContributionKernel &operator=(const CLGEMMLowpOffsetContributionKernel &) = delete; + /** Allow instances of this class to be moved */ + CLGEMMLowpOffsetContributionKernel(CLGEMMLowpOffsetContributionKernel &&) = default; + /** Allow instances of this class to be moved */ + CLGEMMLowpOffsetContributionKernel &operator=(CLGEMMLowpOffsetContributionKernel &&) = default; + /** Initialise the kernel's input and output. + * + * @param[in, out] mm_result Input tensor containing the result of @ref CLGEMMLowpMatrixMultiplyKernel. Data type supported: S32 + * @param[in] vector_sum_col Input row-vector of sums of all the entries in each column of matrix B. + * Note: vector_sum_col can be a nullptr in case a_offset = 0. Data type supported: same as @p mm_result + * @param[in] vector_sum_row Input row-vector of sums of all the entries in each row of matrix A. + * Note: vector_sum_row can be a nullptr in case b_offset = 0. Data type supported: same as @p mm_result + * @param[in] k Number of matrix A columns or Matrix B rows + * @param[in] a_offset Offset to be added to each element of the matrix A. + * @param[in] b_offset Offset to be added to each element of the matrix B. + */ + void configure(ICLTensor *mm_result, const ICLTensor *vector_sum_col, const ICLTensor *vector_sum_row, int32_t k, int32_t a_offset, int32_t b_offset); + + // Inherited methods overridden: + void run(const Window &window, cl::CommandQueue &queue) override; + +private: + const ICLTensor *_vector_sum_col; + const ICLTensor *_vector_sum_row; + ICLTensor *_mm_result; +}; +} // namespace arm_compute + +#endif /* __ARM_COMPUTE_CLGEMMLOWPOFFSETCONTRIBUTIONKERNEL_H__ */ diff --git a/arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h b/arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h new file mode 100644 index 000000000..49e19e3c6 --- /dev/null +++ b/arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h @@ -0,0 +1,96 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_CLGEMMLOWPQUANTIZEDOWNINT32TOUINT8SCALEBYFIXEDPOINTKERNEL_H__ +#define __ARM_COMPUTE_CLGEMMLOWPQUANTIZEDOWNINT32TOUINT8SCALEBYFIXEDPOINTKERNEL_H__ + +#include "arm_compute/core/CL/ICLKernel.h" + +namespace arm_compute +{ +class ICLTensor; + +/** OpenCL kernel used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8 + * + * This kernel takes a final int32 accumulator value (the output of @ref CLGEMMLowpMatrixMultiplyKernel), and processes it to obtain the final QASYMM8 value. + * The following computations will be performed by the kernel: + * + * -# Compute fixed point multiplication between each entry of input by result_fixedpoint_multiplier + * -# Add bias to final result if bias tensor is not a nullptr + * -# Round to nearest division by a power-of-two using result_shift + * -# Add offset to each result + * -# Clamp the value between the specified min and max bounds + * -# Clamp the resulting int32 values to the [0..255] range and cast to QASYMM8. + */ +class CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel : public ICLKernel +{ +public: + /** Constructor */ + CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers)*/ + CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel(const CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers)*/ + CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel &operator=(const CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel &) = delete; + /** Allow instances of this class to be moved */ + CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel(CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel &&) = default; + /** Allow instances of this class to be moved */ + CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel &operator=(CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel &&) = default; + /** Initialise the kernel's input and output. + * + * @param[in] input Input tensor. Data type supported: S32 + * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required. + * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input. + * @param[out] output Output tensor. 
Data type supported: Data type supported: QASYMM8 + * @param[in] result_fixedpoint_multiplier Fixed point value to be multiplied to each element of the input matrix when once the result_offset has been add + * @param[in] result_shift Integer value used to round to nearest division by a power-of-two the result after the fixed point multiplication + * @param[in] result_offset_after_shift Offset to be applied to result before converting it back to QASYMM8 + * @param[in] min (Optional) Min value used to saturate down the output result before converting back to QASYMM8 + * @param[in] max (Optional) Max value used to saturate up the output result before converting back to QASYMM8, + * Along with @p min, this value can be used to implement "rectified linear unit" activation functions + */ + void configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift, int min = 0, int max = 0); + /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel + * + * @param[in] input Input tensor. Data type supported: S32 + * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required. + * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input. + * @param[in] output Output tensor. Data type supported: Data type supported: QASYMM8 + * @param[in] min (Optional) Min value used to saturate down the output result before converting back to QASYMM8 + * @param[in] max (Optional) Max value used to saturate up the output result before converting back to QASYMM8, + * Along with @p min, this value can be used to implement "rectified linear unit" activation functions + * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min = 0, int max = 0); + + // Inherited methods overridden: + void run(const Window &window, cl::CommandQueue &queue) override; + +private: + const ICLTensor *_input; + const ICLTensor *_bias; + ICLTensor *_output; +}; +} // namespace arm_compute + +#endif /* __ARM_COMPUTE_CLGEMMLOWPQUANTIZEDOWNINT32TOUINT8SCALEBYFIXEDPOINTKERNEL_H__ */ diff --git a/arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.h b/arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.h new file mode 100644 index 000000000..87b70efdf --- /dev/null +++ b/arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.h @@ -0,0 +1,97 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_CLGEMMLOWPQUANTIZEDOWNINT32TOUINT8SCALEKERNEL_H__ +#define __ARM_COMPUTE_CLGEMMLOWPQUANTIZEDOWNINT32TOUINT8SCALEKERNEL_H__ + +#include "arm_compute/core/CL/ICLKernel.h" + +namespace arm_compute +{ +class ICLTensor; + +/** OpenCL kernel used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8 + * + * This kernel takes a final int32 accumulator value (the output of @ref CLGEMMLowpMatrixMultiplyKernel), and processes it to obtain the final QASYMM8 value. + * The following computations will be performed by the kernel: + * + * -# Add offset terms to final result + * -# Multiply each entry of result by result_mult_int + * -# Add bias to final result if bias tensor is not a nullptr + * -# Shift the int32 accumulator by result_shift + * -# Clamp the value between the specified min and max bounds + * -# Clamp the resulting int32 values to the [0..255] range and cast to QASYMM8. + * + */ +class CLGEMMLowpQuantizeDownInt32ToUint8ScaleKernel : public ICLKernel +{ +public: + /** Constructor */ + CLGEMMLowpQuantizeDownInt32ToUint8ScaleKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers)*/ + CLGEMMLowpQuantizeDownInt32ToUint8ScaleKernel(const CLGEMMLowpQuantizeDownInt32ToUint8ScaleKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers)*/ + CLGEMMLowpQuantizeDownInt32ToUint8ScaleKernel &operator=(const CLGEMMLowpQuantizeDownInt32ToUint8ScaleKernel &) = delete; + /** Allow instances of this class to be moved */ + CLGEMMLowpQuantizeDownInt32ToUint8ScaleKernel(CLGEMMLowpQuantizeDownInt32ToUint8ScaleKernel &&) = default; + /** Allow instances of this class to be moved */ + CLGEMMLowpQuantizeDownInt32ToUint8ScaleKernel &operator=(CLGEMMLowpQuantizeDownInt32ToUint8ScaleKernel &&) = default; + /** Initialise the kernel's input and output. + * + * @param[in] input Input tensor. Data type supported: S32 + * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required. + * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input. + * @param[out] output Output tensor. Data type supported: Data type supported: QASYMM8 + * @param[in] result_offset Offset to be added to each element of the input matrix + * @param[in] result_mult_int Value to be multiplied to each element of the input matrix when once the result_offset has been add + * @param[in] result_shift Number of bits to shift right the result before converting back to QASYMM8 + * @param[in] min (Optional) Min value used to saturate down the output result before converting back to QASYMM8 + * @param[in] max (Optional) Max value used to saturate up the output result before converting back to QASYMM8, + * Along with @p min, this value can be used to implement "rectified linear unit" activation functions + */ + void configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, int result_offset, int result_mult_int, int result_shift, int min = 0, int max = 0); + /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMLowpQuantizeDownInt32ToUint8ScaleKernel + * + * @param[in] input Input tensor. 
Data type supported: S32 + * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required. + * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input. + * @param[in] output Output tensor. Data type supported: Data type supported: QASYMM8 + * @param[in] min (Optional) Min value used to saturate down the output result before converting back to QASYMM8 + * @param[in] max (Optional) Max value used to saturate up the output result before converting back to QASYMM8, + * Along with @p min, this value can be used to implement "rectified linear unit" activation functions + * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min = 0, int max = 0); + + // Inherited methods overridden: + void run(const Window &window, cl::CommandQueue &queue) override; + +private: + const ICLTensor *_input; + const ICLTensor *_bias; + ICLTensor *_output; +}; +} // namespace arm_compute + +#endif /* __ARM_COMPUTE_CLGEMMLOWPQUANTIZEDOWNINT32TOUINT8SCALEKERNEL_H__ */
\ No newline at end of file diff --git a/arm_compute/core/CL/kernels/CLGEMMLowpReductionKernel.h b/arm_compute/core/CL/kernels/CLGEMMLowpReductionKernel.h new file mode 100644 index 000000000..aa0583fe8 --- /dev/null +++ b/arm_compute/core/CL/kernels/CLGEMMLowpReductionKernel.h @@ -0,0 +1,99 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_CLGEMMLOWREDUCTIONKERNEL_H__ +#define __ARM_COMPUTE_CLGEMMLOWREDUCTIONKERNEL_H__ + +#include "arm_compute/core/CL/ICLKernel.h" + +namespace arm_compute +{ +class ICLTensor; + +/** Common interface for all OpenCL reduction kernels */ +class ICLGEMMLowpReductionKernel : public ICLKernel +{ +public: + /** Constructor */ + ICLGEMMLowpReductionKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers)*/ + ICLGEMMLowpReductionKernel(const ICLGEMMLowpReductionKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers)*/ + ICLGEMMLowpReductionKernel &operator=(const ICLGEMMLowpReductionKernel &) = delete; + /** Allow instances of this class to be moved */ + ICLGEMMLowpReductionKernel(ICLGEMMLowpReductionKernel &&) = default; + /** Allow instances of this class to be moved */ + ICLGEMMLowpReductionKernel &operator=(ICLGEMMLowpReductionKernel &&) = default; + + /** Initialise the kernel's input and output. + * + * @param[in] input Input tensor. Data type supported: S8 + * @param[out] output Output row-vector of sums of all the entries in each row/col of input tensor. Data type supported: S32 + */ + virtual void configure(const ICLTensor *input, ICLTensor *output) = 0; + +protected: + const ICLTensor *_input; + ICLTensor *_output; +}; + +/** OpenCL kernel used to compute the row-vectors of sums of all the entries in each row of Matrix A. + * + * @note This stage is needed to handle the offset of matrix product + * https://github.com/google/gemmlowp/blob/master/doc/low-precision.md + */ +class CLGEMMLowpMatrixAReductionKernel : public ICLGEMMLowpReductionKernel +{ +public: + /** Initialise the kernel's input and output. + * + * @param[in] mtx_a Input tensor. Data type supported: QASYMM8 + * @param[out] vector_sum_row Output row-vector of sums of all the entries in each row of mtx_a. 
Data type supported: S32 + */ + void configure(const ICLTensor *mtx_a, ICLTensor *vector_sum_row) override; + + // Inherited methods overridden: + void run(const Window &window, cl::CommandQueue &queue) override; +}; + +/** OpenCL kernel used to compute the row-vectors of sums of all the entries in each column of Matrix B. + * + * @note This stage is needed to handle the offset of matrix product + * https://github.com/google/gemmlowp/blob/master/doc/low-precision.md + */ +class CLGEMMLowpMatrixBReductionKernel : public ICLGEMMLowpReductionKernel +{ +public: + /** Initialise the kernel's input and output. + * + * @param[in] mtx_b Input tensor. Data type supported: QASYMM8 + * @param[out] vector_sum_col Output row-vector of sums of all the entries in each column of mtx_b. Data type supported: S32 + */ + void configure(const ICLTensor *mtx_b, ICLTensor *vector_sum_col) override; + + // Inherited methods overridden: + void run(const Window &window, cl::CommandQueue &queue) override; +}; +} // namespace arm_compute + +#endif /* __ARM_COMPUTE_CLGEMMLOWREDUCTIONKERNEL_H__ */ diff --git a/arm_compute/core/CL/kernels/CLGEMMTranspose1xWKernel.h b/arm_compute/core/CL/kernels/CLGEMMTranspose1xWKernel.h index 50bc64c2c..8a3772046 100644 --- a/arm_compute/core/CL/kernels/CLGEMMTranspose1xWKernel.h +++ b/arm_compute/core/CL/kernels/CLGEMMTranspose1xWKernel.h @@ -70,7 +70,7 @@ class CLGEMMTranspose1xWKernel : public ICLSimple2DKernel public: /** Initialise the kernel's input and output. * - * @param[in] input Input tensor. Data types supported: U8/S8/QS8/U16/S16/QS16/F16/U32/S32/F32 + * @param[in] input Input tensor. Data types supported: U8/S8/QS8/QASYMM8/U16/S16/QS16/F16/U32/S32/F32 * @param[out] output Output tensor. Data type supported: same as @p input */ void configure(const ICLTensor *input, ICLTensor *output); diff --git a/arm_compute/core/CL/kernels/CLIm2ColKernel.h b/arm_compute/core/CL/kernels/CLIm2ColKernel.h index eed683b4c..1d8b5500c 100644 --- a/arm_compute/core/CL/kernels/CLIm2ColKernel.h +++ b/arm_compute/core/CL/kernels/CLIm2ColKernel.h @@ -69,7 +69,7 @@ public: /** Set the input and output of the kernel. * * @param[in] input The input tensor to convert. 3 lower dimensions represent a single input [width, height, IFM], - * while every optional dimension from 4 and above represent a batch of inputs. Data types supported: QS8/QS16/F16/F32 + * while every optional dimension from 4 and above represent a batch of inputs. Data types supported: QS8/QS16/QASYMM8/F16/F32 * @param[out] output The output tensor. First 2 lower dimensions represent a transform of each 3D input, * while every dimension above represents a batch. Data types supported: Same as @p input * @param[in] kernel_dims The kernel dimensions (width and height).
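The CLIm2ColKernel hunk above only widens the supported data types, but the transform itself is worth recalling: im2col lays each receptive field of the input out contiguously (the "transform of each 3D input" mentioned in the output parameter), so that the convolution can then be computed as a matrix multiplication. The sketch below is a simplified, hypothetical CPU-side illustration under assumptions (single image, no padding, channel-major layout); it is not the library's implementation.

```cpp
#include <cstddef>
#include <vector>

// Hypothetical, simplified im2col sketch (not part of the patch): every kernel_w x kernel_h x ifm
// receptive field of the input becomes one row of the output matrix. Padding, batching and the
// layouts handled by CLIm2ColKernel are deliberately omitted.
std::vector<float> im2col(const std::vector<float> &in, size_t width, size_t height, size_t ifm,
                          size_t kernel_w, size_t kernel_h, size_t stride)
{
    const size_t out_w = (width - kernel_w) / stride + 1;
    const size_t out_h = (height - kernel_h) / stride + 1;
    std::vector<float> out;
    out.reserve(out_w * out_h * kernel_w * kernel_h * ifm);

    for(size_t y = 0; y < out_h; ++y)
    {
        for(size_t x = 0; x < out_w; ++x)
        {
            // One row per output position: the flattened receptive field.
            for(size_t c = 0; c < ifm; ++c)
            {
                for(size_t ky = 0; ky < kernel_h; ++ky)
                {
                    for(size_t kx = 0; kx < kernel_w; ++kx)
                    {
                        const size_t ix = x * stride + kx;
                        const size_t iy = y * stride + ky;
                        out.push_back(in[(c * height + iy) * width + ix]);
                    }
                }
            }
        }
    }
    return out;
}
```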
diff --git a/arm_compute/core/CL/kernels/CLL2NormalizeKernel.h b/arm_compute/core/CL/kernels/CLL2NormalizeLayerKernel.h index 2056b4e61..f7d717119 100644 --- a/arm_compute/core/CL/kernels/CLL2NormalizeKernel.h +++ b/arm_compute/core/CL/kernels/CLL2NormalizeLayerKernel.h @@ -32,21 +32,21 @@ namespace arm_compute class ICLTensor; /** Interface for the reduction operation kernel */ -class CLL2NormalizeKernel : public ICLKernel +class CLL2NormalizeLayerKernel : public ICLKernel { public: /** Default constructor */ - CLL2NormalizeKernel(); + CLL2NormalizeLayerKernel(); /** Prevent instances of this class from being copied (As this class contains pointers) */ - CLL2NormalizeKernel(const CLL2NormalizeKernel &) = delete; + CLL2NormalizeLayerKernel(const CLL2NormalizeLayerKernel &) = delete; /** Prevent instances of this class from being copied (As this class contains pointers) */ - CLL2NormalizeKernel &operator=(const CLL2NormalizeKernel &) = delete; + CLL2NormalizeLayerKernel &operator=(const CLL2NormalizeLayerKernel &) = delete; /** Allow instances of this class to be moved */ - CLL2NormalizeKernel(CLL2NormalizeKernel &&) = default; + CLL2NormalizeLayerKernel(CLL2NormalizeLayerKernel &&) = default; /** Allow instances of this class to be moved */ - CLL2NormalizeKernel &operator=(CLL2NormalizeKernel &&) = default; + CLL2NormalizeLayerKernel &operator=(CLL2NormalizeLayerKernel &&) = default; /** Default destructor */ - ~CLL2NormalizeKernel() = default; + ~CLL2NormalizeLayerKernel() = default; /** Set the input and output tensors. * diff --git a/arm_compute/core/CL/kernels/CLNormalizationLayerKernel.h b/arm_compute/core/CL/kernels/CLNormalizationLayerKernel.h index f74f7514e..d931152cb 100644 --- a/arm_compute/core/CL/kernels/CLNormalizationLayerKernel.h +++ b/arm_compute/core/CL/kernels/CLNormalizationLayerKernel.h @@ -45,7 +45,6 @@ public: CLNormalizationLayerKernel(CLNormalizationLayerKernel &&) = default; /** Default move assignment operator. */ CLNormalizationLayerKernel &operator=(CLNormalizationLayerKernel &&) = default; - /** Set the input and output tensors. * * @param[in] input Source tensor. 3 lower dims represent a single input with dimensions [width, height, IFM], @@ -54,6 +53,16 @@ public: * @param[in] norm_info Normalization layer information like the normalization type, normalization size and other parameters. */ void configure(const ICLTensor *input, ICLTensor *output, NormalizationLayerInfo norm_info); + /** Static function to check if given info will lead to a valid configuration of @ref CLNormalizationLayerKernel + * + * @param[in] input Source tensor. 3 lower dims represent a single input with dimensions [width, height, IFM], + * and an optional 4th dimension for batch of inputs. Data types supported: QS8/QS16/F16/F32. + * @param[in] output Destination tensor. Output will have the same number of dimensions as input. Data types supported: same as @p input. + * @param[in] norm_info Normalization layer information like the normalization type, normalization size and other parameters. 
+ * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *output, NormalizationLayerInfo norm_info); // Inherited methods overridden: void run(const Window &window, cl::CommandQueue &queue) override; diff --git a/arm_compute/core/CL/kernels/CLPixelWiseMultiplicationKernel.h b/arm_compute/core/CL/kernels/CLPixelWiseMultiplicationKernel.h index 309a202df..6746a49dd 100644 --- a/arm_compute/core/CL/kernels/CLPixelWiseMultiplicationKernel.h +++ b/arm_compute/core/CL/kernels/CLPixelWiseMultiplicationKernel.h @@ -59,6 +59,20 @@ public: */ void configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy); + /** Static function to check if given info will lead to a valid configuration of @ref CLPixelWiseMultiplicationKernel + * + * @param[in] input1 An input tensor info. Data types supported: U8/QS8/QS16/S16/F16/F32. + * @param[in] input2 An input tensor info. Data types supported: same as @p input1. + * @param[in] output The output tensor info, Data types supported: same as @p input1. Note: U8 (QS8, QS16) requires both inputs to be U8 (QS8, QS16). + * @param[in] scale Scale to apply after multiplication. + * Scale must be positive and its value must be either 1/255 or 1/2^n where n is between 0 and 15. For QS8 and QS16 scale must be 1. + * @param[in] overflow_policy Overflow policy. Supported overflow policies: Wrap, Saturate + * @param[in] rounding_policy Rounding policy. Supported rounding modes: to zero, to nearest even. + * + * @return a status + */ + static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, float scale, + ConvertPolicy overflow_policy, RoundingPolicy rounding_policy); // Inherited methods overridden: void run(const Window &window, cl::CommandQueue &queue) override; diff --git a/arm_compute/core/CL/kernels/CLPoolingLayerKernel.h b/arm_compute/core/CL/kernels/CLPoolingLayerKernel.h index 9251a8ed9..e9ce28b3f 100644 --- a/arm_compute/core/CL/kernels/CLPoolingLayerKernel.h +++ b/arm_compute/core/CL/kernels/CLPoolingLayerKernel.h @@ -26,6 +26,8 @@ #include "arm_compute/core/CL/ICLKernel.h" +#include "arm_compute/core/Error.h" + namespace arm_compute { class ICLTensor; @@ -51,11 +53,20 @@ public: * * @note QS8 and QS16 are supported only for pool sizes 3, 5 and 7 * - * @param[in] input Source tensor. Data types supported: QS8/QS16/F16/F32. + * @param[in] input Source tensor. Data types supported: QS8/QASYMM8/QS16/F16/F32. * @param[out] output Destination tensor. Data types supported: Same as @p input. * @param[in] pool_info Contains pooling operation information described in @ref PoolingLayerInfo. */ void configure(const ICLTensor *input, ICLTensor *output, const PoolingLayerInfo &pool_info); + /** Static function to check if given info will lead to a valid configuration of @ref CLPoolingLayerKernel + * + * @param[in] input Source tensor info. Data types supported: QS8/QASYMM8/QS16/F16/F32. + * @param[in] output Destination tensor info. Data types supported: Same as @p input. + * @param[in] pool_info Contains pooling operation information described in @ref PoolingLayerInfo. 
+ * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info); // Inherited methods overridden: void run(const Window &window, cl::CommandQueue &queue) override; diff --git a/arm_compute/core/CL/kernels/CLReshapeLayerKernel.h b/arm_compute/core/CL/kernels/CLReshapeLayerKernel.h index d8ccfa88c..044b5e700 100644 --- a/arm_compute/core/CL/kernels/CLReshapeLayerKernel.h +++ b/arm_compute/core/CL/kernels/CLReshapeLayerKernel.h @@ -49,7 +49,7 @@ public: ~CLReshapeLayerKernel() = default; /** Set the input and output of the kernel * - * @param[in] input Source tensor. Data type supported: U8/S8/QS8/U16/S16/QS16/U32/S32/F16/F32 + * @param[in] input Source tensor. Data type supported: U8/S8/QS8/QASYMM8/U16/S16/QS16/U32/S32/F16/F32 * @param[out] output Destination tensor. Data type supported: Same as @p input */ void configure(const ICLTensor *input, ICLTensor *output); diff --git a/arm_compute/core/CL/kernels/CLScaleKernel.h b/arm_compute/core/CL/kernels/CLScaleKernel.h index db0587d6a..3bca6efd0 100644 --- a/arm_compute/core/CL/kernels/CLScaleKernel.h +++ b/arm_compute/core/CL/kernels/CLScaleKernel.h @@ -42,8 +42,9 @@ public: * All but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only performed within the XY-plane. * @param[in] policy Interpolation type to use * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant. + * @param[in] sampling_policy (Optional) Sampling policy used by the interpolation. Defaults to @ref SamplingPolicy::CENTER */ - void configure(const ICLTensor *input, ICLTensor *output, InterpolationPolicy policy, bool border_undefined); + void configure(const ICLTensor *input, ICLTensor *output, InterpolationPolicy policy, bool border_undefined, SamplingPolicy sampling_policy = SamplingPolicy::CENTER); // Inherited methods overridden: BorderSize border_size() const override; diff --git a/arm_compute/core/CL/kernels/CLSoftmaxLayerKernel.h b/arm_compute/core/CL/kernels/CLSoftmaxLayerKernel.h index 1e641b48d..c072d2a6d 100644 --- a/arm_compute/core/CL/kernels/CLSoftmaxLayerKernel.h +++ b/arm_compute/core/CL/kernels/CLSoftmaxLayerKernel.h @@ -26,6 +26,8 @@ #include "arm_compute/core/CL/ICLSimple3DKernel.h" +#include <tuple> + namespace arm_compute { class ICLTensor; @@ -36,13 +38,21 @@ class CLLogits1DMaxKernel : public ICLSimple3DKernel public: /** Set the input and output tensors. * - * @param[in] input Source tensor. Data types supported: QS8/QS16/F16/F32 + * @param[in] input Source tensor. Data types supported: QS8/QASYMM8/QS16/F16/F32 * @param[out] output Destination tensor. Data types supported: same as @p input */ void configure(const ICLTensor *input, ICLTensor *output); + /** Static function to check if given info will lead to a valid configuration of @ref CLLogits1DMaxKernel + * + * @param[in] input Source tensor. Data types supported: QS8/QASYMM8/QS16/F16/F32 + * @param[in] output Destination tensor. 
Data types supported: same as @p input + * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *output); }; -/** Interface for shifting the logits values around the max value and exponentiating the result */ +/** Interface for shifting, exponentiating and summing the logits */ class CLLogits1DShiftExpSumKernel : public ICLKernel { public: @@ -58,12 +68,23 @@ public: CLLogits1DShiftExpSumKernel &operator=(CLLogits1DShiftExpSumKernel &&) = default; /** Set the input and output tensors. * - * @param[in] input Source tensor. Data types supported: QS8/QS16/F16/F32 + * @param[in] input Source tensor. Data types supported: QS8/QASYMM8/QS16/F16/F32 * @param[in] max Max values tensor. Data types supported: same as @p input - * @param[out] output Destination tensor. Data types supported: same as @p input - * @param[out] sum Sum of 1D logits tensor. Data types supported: same as @p input + * @param[out] output Destination tensor. Data types supported: S32 for QASYMM8 @p input, or same as @p input + * @param[out] sum Sum of 1D logits tensor. Data types supported: S32 for QASYMM8 @p input, or same as @p input + * @param[in] beta (Optional) A scaling factor for the exponent. Defaults to 1.0 */ - void configure(const ICLTensor *input, const ICLTensor *max, ICLTensor *output, ICLTensor *sum); + void configure(const ICLTensor *input, const ICLTensor *max, ICLTensor *output, ICLTensor *sum, float beta = 1.0f); + /** Static function to check if given info will lead to a valid configuration of @ref CLLogits1DShiftExpSumKernel + * + * @param[in] input Source tensor. Data types supported: QS8/QASYMM8/QS16/F16/F32 + * @param[in] max Max values tensor. Data types supported: same as @p input + * @param[in] output Destination tensor. Data types supported: S32 for QASYMM8 @p input, or same as @p input + * @param[in] sum Sum of 1D logits tensor. Data types supported: S32 for QASYMM8 @p input, or same as @p input + * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *max, const ITensorInfo *output, const ITensorInfo *sum); // Inherited methods overridden: void run(const Window &window, cl::CommandQueue &queue) override; @@ -75,6 +96,68 @@ private: ICLTensor *_sum; }; +/** Interface for max, shifting, exponentiating and summing the logits */ +class CLLogits1DMaxShiftExpSumKernel : public ICLKernel +{ +public: + using ParallelReductionInfo = std::tuple<bool, unsigned int>; + +public: + /** Default constructor */ + CLLogits1DMaxShiftExpSumKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLLogits1DMaxShiftExpSumKernel(const CLLogits1DMaxShiftExpSumKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLLogits1DMaxShiftExpSumKernel &operator=(const CLLogits1DMaxShiftExpSumKernel &) = delete; + /** Allow instances of this class to be moved */ + CLLogits1DMaxShiftExpSumKernel(CLLogits1DMaxShiftExpSumKernel &&) = default; + /** Allow instances of this class to be moved */ + CLLogits1DMaxShiftExpSumKernel &operator=(CLLogits1DMaxShiftExpSumKernel &&) = default; + /** Set the input and output tensors. + * + * @param[in] input Source tensor. Data types supported: QS8/QS16/F16/F32 + * @param[in,out] max Max values tensor. Data types supported: same as @p input + * @param[out] output Destination tensor. Data types supported: same as @p input + * @param[out] sum Sum of 1D logits tensor. 
Data types supported: same as @p input + * @param[in] beta (Optional) A scaling factor for the exponent. Defaults to 1.f + */ + void configure(const ICLTensor *input, ICLTensor *max, ICLTensor *output, ICLTensor *sum, float beta = 1.0f); + /** Static function to check if given info will lead to a valid configuration of @ref CLLogits1DMaxShiftExpSumKernel + * + * @param[in] input Source tensor. Data types supported: QS8/QS16/F16/F32 + * @param[in] max Max values tensor. Data types supported: same as @p input + * @param[in] output Destination tensor. Data types supported: same as @p input + * @param[in] sum Sum of 1D logits tensor. Data types supported: same as @p input + * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *max, const ITensorInfo *output, const ITensorInfo *sum); + /** Checks if the given size is eligible for parallel reduction + * + * @note Serial reduction is launched for width < (_grid_size * _serial_vector_size). + * @note Parallel reduction is launched for width >= (_grid_size * _serial_vector_size) and vector_size is forced to 4. + * + * @param[in] size Size to check + * + * @return A two-element tuple where the first element is a boolean specifying whether a parallel reduction will be run, + * while the second element is the vector size of the execution. + */ + static ParallelReductionInfo is_parallel_reduction(size_t size); + + // Inherited methods overridden: + void run(const Window &window, cl::CommandQueue &queue) override; + +private: + const ICLTensor *_input; + ICLTensor *_max; + ICLTensor *_output; + ICLTensor *_sum; + +private: + static const unsigned int _grid_size; + static const unsigned int _serial_vector_size; + static const unsigned int _parallel_vector_size; +}; /** Interface for calculating the final step of the Softmax Layer where each logit value is multiplied by the inverse of the sum of the logits. */ class CLLogits1DNormKernel : public ICLKernel { @@ -91,11 +174,21 @@ public: CLLogits1DNormKernel &operator=(CLLogits1DNormKernel &&) = default; /** Set the input and output tensors. * - * @param[in] input Source tensor. Data types supported: QS8/QS16/F16/F32 + * @param[in] input Source tensor. Data types supported: QS8/QS16/S32/F16/F32 * @param[in] sum Sum tensor. Dimensions should be dim(input)-1. Data types supported: same as @p input - * @param[out] output Destination tensor. Data types supported: same as @p input + * @param[out] output Destination tensor. Data types supported: QASYMM8 for S32 @p input, or same as @p input + * @param[in] beta (Optional) A scaling factor for the exponent. (Default = 1.0) + */ + void configure(const ICLTensor *input, const ICLTensor *sum, ICLTensor *output, float beta = 1.0f); + /** Static function to check if given info will lead to a valid configuration of @ref CLLogits1DNormKernel + * + * @param[in] input Source tensor. Data types supported: QS8/QS16/S32/F16/F32 + * @param[in] sum Sum tensor. Dimensions should be dim(input)-1. Data types supported: same as @p input + * @param[in] output Destination tensor.
Data types supported: QASYMM8 for S32 @p input, or same as @p input + * + * @return a status */ - void configure(const ICLTensor *input, const ICLTensor *sum, ICLTensor *output); + static Status validate(const ITensorInfo *input, const ITensorInfo *sum, const ITensorInfo *output); // Inherited methods overridden: void run(const Window &window, cl::CommandQueue &queue) override; diff --git a/arm_compute/core/CL/kernels/CLTransposeKernel.h b/arm_compute/core/CL/kernels/CLTransposeKernel.h index faccf5e37..2e1b481d3 100644 --- a/arm_compute/core/CL/kernels/CLTransposeKernel.h +++ b/arm_compute/core/CL/kernels/CLTransposeKernel.h @@ -40,10 +40,18 @@ class CLTransposeKernel : public ICLSimple2DKernel public: /** Initialise the kernel's input and output. * - * @param[in] input Input tensor. Data types supported: U8/S8/QS8/U16/S16/QS16/F16/U32/S32/F32 + * @param[in] input Input tensor. Data types supported: U8/S8/QS8/QASYMM8/U16/S16/QS16/F16/U32/S32/F32 * @param[out] output Output tensor. Data type supported: Same as @p input */ void configure(const ICLTensor *input, ICLTensor *output); + /** Static function to check if given info will lead to a valid configuration of @ref CLTransposeKernel + * + * @param[in] input Input tensor. Data types supported: U8/S8/QS8/QASYMM8/U16/S16/QS16/F16/U32/S32/F32 + * @param[in] output Output tensor. Data type supported: Same as @p input + * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *output); }; } // namespace arm_compute #endif /* __ARM_COMPUTE_CLTRANSPOSEKERNEL_H__ */ diff --git a/arm_compute/core/CL/kernels/CLWeightsReshapeKernel.h b/arm_compute/core/CL/kernels/CLWeightsReshapeKernel.h index 07c7c772c..6c84ded49 100644 --- a/arm_compute/core/CL/kernels/CLWeightsReshapeKernel.h +++ b/arm_compute/core/CL/kernels/CLWeightsReshapeKernel.h @@ -47,9 +47,10 @@ public: /** Set the input and output of the kernel. * * @param[in] input The input tensor to convert. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM] if shared, - * and 5D tensor with dimensions [kernel_x, kernel_y, IFM, OFM, num_patches] if unshared. Data types supported: QS8/QS16/F16/F32 + * and 5D tensor with dimensions [kernel_x, kernel_y, IFM, OFM, num_patches] if unshared. Data types supported: QS8/QS16/QASYMM8/F16/F32 * @param[in] biases The shared biases tensor to append. Bias is 1D tensor with dimensions [OFM] if shared and 2D tensor with * dimensions [OFM, num_patches] if unshared. Data types supported: Same as @p input + * @warning Appending biases to weights reshaped matrix is not supported for quantized asymmetric types. * @param[out] output The output tensor. Should be a 2D Tensor. 
Data types supported: Same as @p input */ void configure(const ICLTensor *input, const ICLTensor *biases, ICLTensor *output); diff --git a/arm_compute/core/CPP/CPPKernels.h b/arm_compute/core/CPP/CPPKernels.h index 1eabfa943..f55f41b0e 100644 --- a/arm_compute/core/CPP/CPPKernels.h +++ b/arm_compute/core/CPP/CPPKernels.h @@ -27,6 +27,7 @@ /* Header regrouping all the CPP kernels */ #include "arm_compute/core/CPP/kernels/CPPCornerCandidatesKernel.h" #include "arm_compute/core/CPP/kernels/CPPDetectionWindowNonMaximaSuppressionKernel.h" +#include "arm_compute/core/CPP/kernels/CPPPermuteKernel.h" #include "arm_compute/core/CPP/kernels/CPPSortEuclideanDistanceKernel.h" #endif /* __ARM_COMPUTE_CPPKERNELS_H__ */ diff --git a/arm_compute/core/CPP/kernels/CPPPermuteKernel.h b/arm_compute/core/CPP/kernels/CPPPermuteKernel.h new file mode 100644 index 000000000..0e7c93877 --- /dev/null +++ b/arm_compute/core/CPP/kernels/CPPPermuteKernel.h @@ -0,0 +1,93 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_CPPPERMUTEKERNEL_H__ +#define __ARM_COMPUTE_CPPPERMUTEKERNEL_H__ + +#include "arm_compute/core/CPP/ICPPKernel.h" + +namespace arm_compute +{ +class ITensor; + +/** CPP kernel to perform tensor permutation. + * + * Permutes given a permutation vector + */ +class CPPPermuteKernel : public ICPPKernel +{ +public: + /** Default constructor */ + CPPPermuteKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CPPPermuteKernel(const CPPPermuteKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CPPPermuteKernel &operator=(const CPPPermuteKernel &) = delete; + /** Allow instances of this class to be moved */ + CPPPermuteKernel(CPPPermuteKernel &&) = default; + /** Allow instances of this class to be moved */ + CPPPermuteKernel &operator=(CPPPermuteKernel &&) = default; + /** Default destructor */ + ~CPPPermuteKernel() = default; + + /** Set the input and output of the kernel. + * + * @param[in] input The input tensor to permute. Data types supported: U8/S8/QS8/QASYMM8/U16/S16/QS16/F16/U32/S32/F32 + * @param[out] output The output tensor. 
Data types supported: Same as @p input + * @param[in] perm Permutation vector + */ + void configure(const ITensor *input, ITensor *output, const PermutationVector &perm); + /** Static function to check if given info will lead to a valid configuration of @ref CPPPermuteKernel + * + * @param[in] input The input tensor to permute. Data types supported: U8/S8/QS8/QASYMM8/U16/S16/QS16/F16/U32/S32/F32 + * @param[in] output The output tensor. Data types supported: Same as @p input + * @param[in] perm Permutation vector + * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *output, const PermutationVector &perm); + + // Inherited methods overridden: + void run(const Window &window, const ThreadInfo &info) override; + +private: + /** Template function to run the permute + * + * @param[in] window Region on which to execute the kernel. (Must be a valid region of the window returned by window()). + */ + template <typename T> + void run_permute(const Window &window); + + /** Common signature for all the specialised permute functions + * + * @param[in] window Region on which to execute the kernel. + */ + using PermuteFunctionPtr = void (CPPPermuteKernel::*)(const Window &window); + + PermuteFunctionPtr _func; + const ITensor *_input; + ITensor *_output; + PermutationVector _perm; +}; +} // namespace arm_compute +#endif /*__ARM_COMPUTE_CPPPERMUTEKERNEL_H__ */ diff --git a/arm_compute/core/Dimensions.h b/arm_compute/core/Dimensions.h index 96dd3711c..ae8d6c350 100644 --- a/arm_compute/core/Dimensions.h +++ b/arm_compute/core/Dimensions.h @@ -100,7 +100,20 @@ public: * * @return The size of the requested dimension. */ - T operator[](size_t dimension) const + const T &operator[](size_t dimension) const + { + ARM_COMPUTE_ERROR_ON(dimension >= num_max_dimensions); + return _id[dimension]; + } + /** Generic accessor to get the size of any dimension + * + * @note Precondition: dimension < Dimensions::num_max_dimensions + * + * @param[in] dimension Dimension of the wanted size + * + * @return The size of the requested dimension. + */ + T &operator[](size_t dimension) { ARM_COMPUTE_ERROR_ON(dimension >= num_max_dimensions); return _id[dimension]; @@ -119,8 +132,8 @@ public: /** Collapse dimensions. * - * @param[in] first Dimensions into which the following @p n are collapsed. * @param[in] n Number of dimensions to collapse into @p first. + * @param[in] first Dimensions into which the following @p n are collapsed. */ void collapse(size_t n, size_t first = 0) { @@ -141,6 +154,17 @@ public: std::fill(_id.begin() + _num_dimensions, _id.end(), 0); } + /** Collapse dimensions starting from a given point + * + * @param[in] start Starting point of collapsing dimensions + */ + void collapse_from(size_t start) + { + ARM_COMPUTE_ERROR_ON(start > num_dimensions()); + + collapse(num_dimensions() - start, start); + } + /** Returns a read/write iterator that points to the first element in the dimension array. 
*/ typename std::array<T, num_max_dimensions>::iterator begin() { @@ -179,5 +203,16 @@ protected: std::array<T, num_max_dimensions> _id; size_t _num_dimensions{ 0 }; }; + +template <typename T> +inline bool operator==(const Dimensions<T> &lhs, const Dimensions<T> &rhs) +{ + return ((lhs.num_dimensions() == rhs.num_dimensions()) && std::equal(lhs.cbegin(), lhs.cend(), rhs.cbegin())); +} +template <typename T> +inline bool operator!=(const Dimensions<T> &lhs, const Dimensions<T> &rhs) +{ + return !(lhs == rhs); +} } #endif /*__ARM_COMPUTE_DIMENSIONS_H__*/ diff --git a/arm_compute/core/Error.h b/arm_compute/core/Error.h index c4c452bac..97dbba3fa 100644 --- a/arm_compute/core/Error.h +++ b/arm_compute/core/Error.h @@ -24,55 +24,236 @@ #ifndef __ARM_COMPUTE_ERROR_H__ #define __ARM_COMPUTE_ERROR_H__ -/** Print the given message then throw an std::runtime_error. +#include <stdarg.h> +#include <string> + +namespace arm_compute +{ +enum class ErrorCode +{ + OK, /**< No error */ + RUNTIME_ERROR /**< Generic runtime error */ +}; + +/** Status class */ +class Status +{ +public: + /** Default Constructor **/ + Status() + : _code(ErrorCode::OK), _error_description(" ") + { + } + /** Default Constructor + * + * @param error_status Error status. + * @param error_description (Optional) Error description if error_status is not valid. + */ + explicit Status(ErrorCode error_status, std::string error_description = " ") + : _code(error_status), _error_description(error_description) + { + } + /** Allow instances of this class to be copy constructed */ + Status(const Status &) = default; + /** Allow instances of this class to be move constructed */ + Status(Status &&) = default; + /** Allow instances of this class to be copy assigned */ + Status &operator=(const Status &) = default; + /** Allow instances of this class to be move assigned */ + Status &operator=(Status &&) = default; + /** Explicit bool conversion operator + * + * @return True if there is no error else false + */ + explicit operator bool() const noexcept + { + return _code == ErrorCode::OK; + } + /** Gets error code + * + * @return Error code. + */ + ErrorCode error_code() const + { + return _code; + } + /** Gets error description if any + * + * @return Error description. + */ + std::string error_description() const + { + return _error_description; + } + /** Throws a runtime exception in case it contains a valid error status */ + void throw_if_error() + { + if(!bool(*this)) + { + internal_throw_on_error(); + } + } + +private: + /** Internal throwing function */ + [[noreturn]] void internal_throw_on_error(); + +private: + ErrorCode _code; + std::string _error_description; +}; + +/** Creates an error containing the error message from variable argument list * - * @param[in] ... Message to display before aborting. + * @param[in] error_code Error code + * @param[in] function Function in which the error occurred. + * @param[in] file Name of the file where the error occurred. + * @param[in] line Line on which the error occurred. + * @param[in] msg Message to display before aborting. + * @param[in] args Variable argument list of the message. + * + * @return status containing the error */ -#define ARM_COMPUTE_ERROR(...) ::arm_compute::error(__func__, __FILE__, __LINE__, __VA_ARGS__) // NOLINT - -/** Print the given message then throw an std::runtime_error. 
+Status create_error_va_list(ErrorCode error_code, const char *function, const char *file, const int line, const char *msg, va_list args); +/** Creates an error containing the error message * - * @param[in] func Function in which the error occurred. - * @param[in] file File in which the error occurred. - * @param[in] line Line in which the error occurred. - * @param[in] ... Message to display before aborting. + * @param[in] error_code Error code + * @param[in] function Function in which the error occurred. + * @param[in] file Name of the file where the error occurred. + * @param[in] line Line on which the error occurred. + * @param[in] msg Message to display before aborting. + * @param[in] ... Variable number of arguments of the message. + * + * @return status containing the error */ -#define ARM_COMPUTE_ERROR_LOC(func, file, line, ...) ::arm_compute::error(func, file, line, __VA_ARGS__) // NOLINT - +Status create_error(ErrorCode error_code, const char *function, const char *file, const int line, const char *msg, ...); +/** Print an error message then throw an std::runtime_error + * + * @param[in] function Function in which the error occurred. + * @param[in] file Name of the file where the error occurred. + * @param[in] line Line on which the error occurred. + * @param[in] msg Message to display before aborting. + * @param[in] ... Variable number of arguments of the message. + */ +[[noreturn]] void error(const char *function, const char *file, const int line, const char *msg, ...); +} /** To avoid unused variables warnings * * This is useful if for example a variable is only used * in debug builds and generates a warning in release builds. * - * @param[in] var Variable which is unused + * @param[in] var Variable which is unused. */ #define ARM_COMPUTE_UNUSED(var) (void)(var) -#ifdef ARM_COMPUTE_DEBUG_ENABLED -/** Print the given message +/** Creates an error with a given message + * + * @param[in] error_code Error code. + * @param[in] ... Message to encapsulate. + */ +#define ARM_COMPUTE_CREATE_ERROR(error_code, ...) ::arm_compute::create_error(error_code, __func__, __FILE__, __LINE__, __VA_ARGS__) // NOLINT + +/** Creates an error on location with a given message * - * @param[in] ... Message to display + * @param[in] error_code Error code. + * @param[in] func Function in which the error occurred. + * @param[in] file File in which the error occurred. + * @param[in] line Line in which the error occurred. + * @param[in] ... Message to display before aborting. */ -#define ARM_COMPUTE_INFO(...) ::arm_compute::debug(__func__, __FILE__, __LINE__, __VA_ARGS__) // NOLINT -/** If the condition is true, the given message is printed +#define ARM_COMPUTE_CREATE_ERROR_LOC(error_code, func, file, line, ...) ::arm_compute::create_error(error_code, func, file, line, __VA_ARGS__) // NOLINT + +/** Checks if a status contains an error and returns it + * + * @param[in] status Status value to check + */ +#define ARM_COMPUTE_RETURN_ON_ERROR(status) \ + do \ + { \ + if(!bool(status)) \ + { \ + return status; \ + } \ + } while(false) + +/** Checks if an error value is valid if not throws an exception with the error + * + * @param[in] error Error value to check. + */ +#define ARM_COMPUTE_THROW_ON_ERROR(error) \ + error.throw_if_error(); + +/** If the condition is true, an error is returned * * @param[in] cond Condition to evaluate. - * @param[in] ... Message to print if cond is false. + * @param[in] ... Error description message */ -#define ARM_COMPUTE_INFO_ON_MSG(cond, ...) 
\ - do \ - { \ - if(cond) \ - { \ - ARM_COMPUTE_INFO(__VA_ARGS__); \ - } \ - } while(0) -#else /* ARM_COMPUTE_DEBUG_ENABLED */ -#define ARM_COMPUTE_INFO_ON_MSG(cond, ...) -#define ARM_COMPUTE_INFO(...) -#endif /* ARM_COMPUTE_DEBUG_ENABLED */ +#define ARM_COMPUTE_RETURN_ERROR_ON_MSG(cond, ...) \ + do \ + { \ + if(cond) \ + { \ + return ARM_COMPUTE_CREATE_ERROR(arm_compute::ErrorCode::RUNTIME_ERROR, __VA_ARGS__); \ + } \ + } while(false) + +/** If the condition is true, an error is thrown + * + * @param[in] cond Condition to evaluate. + * @param[in] func Function in which the error occurred. + * @param[in] file File in which the error occurred. + * @param[in] line Line in which the error occurred. + * @param[in] ... Error description message. + */ +#define ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG(cond, func, file, line, ...) \ + do \ + { \ + if(cond) \ + { \ + return ARM_COMPUTE_CREATE_ERROR_LOC(arm_compute::ErrorCode::RUNTIME_ERROR, func, file, line, __VA_ARGS__); \ + } \ + } while(false) + +/** If the condition is true, an error is returned + * + * @param[in] cond Condition to evaluate + */ +#define ARM_COMPUTE_RETURN_ERROR_ON(cond) \ + ARM_COMPUTE_RETURN_ERROR_ON_MSG(cond, #cond) + +/** If the condition is true, an error is returned + * + * @param[in] cond Condition to evaluate. + * @param[in] func Function in which the error occurred. + * @param[in] file File in which the error occurred. + * @param[in] line Line in which the error occurred. + */ +#define ARM_COMPUTE_RETURN_ERROR_ON_LOC(cond, func, file, line) \ + ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG(cond, func, file, line, #cond) + +/** Print the given message then throw an std::runtime_error. + * + * @param[in] ... Message to display before aborting. + */ +#define ARM_COMPUTE_ERROR(...) ::arm_compute::error(__func__, __FILE__, __LINE__, __VA_ARGS__) // NOLINT + +/** Print the given message then throw an std::runtime_error. + * + * @param[in] func Function in which the error occurred. + * @param[in] file File in which the error occurred. + * @param[in] line Line in which the error occurred. + * @param[in] ... Message to display before aborting. + */ +#define ARM_COMPUTE_ERROR_LOC(func, file, line, ...) ::arm_compute::error(func, file, line, __VA_ARGS__) // NOLINT #ifdef ARM_COMPUTE_ASSERTS_ENABLED +/** Checks if a status value is valid if not throws an exception with the error + * + * @param[in] status Status value to check. + */ +#define ARM_COMPUTE_ERROR_THROW_ON(status) \ + status.throw_if_error() + /** If the condition is true, the given message is printed and an exception is thrown * * @param[in] cond Condition to evaluate. @@ -112,6 +293,7 @@ */ #define ARM_COMPUTE_CONST_ON_ERROR(cond, val, msg) (cond) ? throw std::logic_error(msg) : val; #else /* ARM_COMPUTE_ASSERTS_ENABLED */ +#define ARM_COMPUTE_ERROR_THROW_ON(status) #define ARM_COMPUTE_ERROR_ON_MSG(cond, ...) #define ARM_COMPUTE_ERROR_ON_LOC_MSG(cond, func, file, line, ...) #define ARM_COMPUTE_CONST_ON_ERROR(cond, val, msg) val @@ -119,14 +301,14 @@ /** If the condition is true then an error message is printed and an exception thrown * - * @param[in] cond Condition to evaluate + * @param[in] cond Condition to evaluate. */ #define ARM_COMPUTE_ERROR_ON(cond) \ ARM_COMPUTE_ERROR_ON_MSG(cond, #cond) /** If the condition is true then an error message is printed and an exception thrown * - * @param[in] cond Condition to evaluate + * @param[in] cond Condition to evaluate. * @param[in] func Function in which the error occurred. * @param[in] file File in which the error occurred. 
* @param[in] line Line in which the error occurred. @@ -134,27 +316,4 @@ #define ARM_COMPUTE_ERROR_ON_LOC(cond, func, file, line) \ ARM_COMPUTE_ERROR_ON_LOC_MSG(cond, func, file, line, #cond) -namespace arm_compute -{ -/** Print an error message then throw an std::runtime_error - * - * @param[in] function Function in which the error occurred. - * @param[in] file Name of the file where the error occurred. - * @param[in] line Line on which the error occurred. - * @param[in] msg Message to display before aborting. - * @param[in] ... Variable number of arguments of the message. - */ -[[noreturn]] void error(const char *function, const char *file, const int line, const char *msg, ...); - -/** Print a debug message - * - * @param[in] function Function in which the error occurred. - * @param[in] file Name of the file where the error occurred. - * @param[in] line Line on which the error occurred. - * @param[in] msg Message to display before aborting. - * @param[in] ... Variable number of arguments of the message. - */ -void debug(const char *function, const char *file, const int line, const char *msg, ...); -} - #endif /* __ARM_COMPUTE_ERROR_H__ */ diff --git a/arm_compute/core/FixedPoint.h b/arm_compute/core/FixedPoint.h index 82c2d3347..6e00500b1 100644 --- a/arm_compute/core/FixedPoint.h +++ b/arm_compute/core/FixedPoint.h @@ -225,96 +225,96 @@ qint16_t sqmull_qs8(qint8_t a, qint8_t b, int fixed_point_position); qint32_t sqmull_qs16(qint16_t a, qint16_t b, int fixed_point_position); /** 16 bit fixed point scalar saturating multiply -* -* @param[in] a First 16 bit fixed point input -* @param[in] b Second 16 bit fixed point input -* @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number -* -* @return The result of the 16 bit fixed point multiplication. The result is saturated in case of overflow -*/ + * + * @param[in] a First 16 bit fixed point input + * @param[in] b Second 16 bit fixed point input + * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number + * + * @return The result of the 16 bit fixed point multiplication. The result is saturated in case of overflow + */ qint16_t sqmul_qs16(qint16_t a, qint16_t b, int fixed_point_position); /** 8 bit fixed point scalar inverse square root -* -* @param[in] a 8 bit fixed point input -* @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number -* -* @return The result of the 8 bit fixed point inverse square root. -*/ + * + * @param[in] a 8 bit fixed point input + * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number + * + * @return The result of the 8 bit fixed point inverse square root. + */ qint8_t sinvsqrt_qs8(qint8_t a, int fixed_point_position); /** 16 bit fixed point scalar inverse square root -* -* @param[in] a 16 bit fixed point input -* @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number -* -* @return The result of the 16 bit fixed point inverse square root. -*/ + * + * @param[in] a 16 bit fixed point input + * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number + * + * @return The result of the 16 bit fixed point inverse square root. 
+ */ qint16_t sinvsqrt_qs16(qint16_t a, int fixed_point_position); /** 8 bit fixed point scalar division -* -* @param[in] a First 8 bit fixed point input -* @param[in] b Second 8 bit fixed point input -* @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number -* -* @return The result of the 8 bit fixed point division. -*/ + * + * @param[in] a First 8 bit fixed point input + * @param[in] b Second 8 bit fixed point input + * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number + * + * @return The result of the 8 bit fixed point division. + */ qint8_t sdiv_qs8(qint8_t a, qint8_t b, int fixed_point_position); /** 16 bit fixed point scalar division -* -* @param[in] a First 16 bit fixed point input -* @param[in] b Second 16 bit fixed point input -* @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number -* -* @return The result of the 16 bit fixed point division. -*/ + * + * @param[in] a First 16 bit fixed point input + * @param[in] b Second 16 bit fixed point input + * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number + * + * @return The result of the 16 bit fixed point division. + */ qint16_t sdiv_qs16(qint16_t a, qint16_t b, int fixed_point_position); /** 8 bit fixed point scalar exponential -* -* @param[in] a 8 bit fixed point input -* @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number -* -* @return The result of the 8 bit fixed point exponential. -*/ + * + * @param[in] a 8 bit fixed point input + * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number + * + * @return The result of the 8 bit fixed point exponential. + */ qint8_t sqexp_qs8(qint8_t a, int fixed_point_position); /** 16 bit fixed point scalar exponential -* -* @param[in] a 16 bit fixed point input -* @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number -* -* @return The result of the 16 bit fixed point exponential. -*/ + * + * @param[in] a 16 bit fixed point input + * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number + * + * @return The result of the 16 bit fixed point exponential. + */ qint16_t sqexp_qs16(qint16_t a, int fixed_point_position); /** 16 bit fixed point scalar exponential -* -* @param[in] a 16 bit fixed point input -* @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number -* -* @return The result of the 16 bit fixed point exponential. -*/ + * + * @param[in] a 16 bit fixed point input + * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number + * + * @return The result of the 16 bit fixed point exponential. + */ qint16_t sexp_qs16(qint16_t a, int fixed_point_position); /** 8 bit fixed point scalar logarithm -* -* @param[in] a 8 bit fixed point input -* @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number -* -* @return The result of the 8 bit fixed point logarithm. 
-*/ + * + * @param[in] a 8 bit fixed point input + * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number + * + * @return The result of the 8 bit fixed point logarithm. + */ qint8_t slog_qs8(qint8_t a, int fixed_point_position); /** 16 bit fixed point scalar logarithm -* -* @param[in] a 16 bit fixed point input -* @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number -* -* @return The result of the 16 bit fixed point logarithm. -*/ + * + * @param[in] a 16 bit fixed point input + * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number + * + * @return The result of the 16 bit fixed point logarithm. + */ qint16_t slog_qs16(qint16_t a, int fixed_point_position); /** Convert an 8 bit fixed point to float diff --git a/arm_compute/core/GLES_COMPUTE/GCHelpers.h b/arm_compute/core/GLES_COMPUTE/GCHelpers.h new file mode 100644 index 000000000..475554f2b --- /dev/null +++ b/arm_compute/core/GLES_COMPUTE/GCHelpers.h @@ -0,0 +1,64 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_GCHELPERS_H__ +#define __ARM_COMPUTE_GCHELPERS_H__ + +#include "arm_compute/core/GLES_COMPUTE/OpenGLES.h" +#include "arm_compute/core/Helpers.h" +#include "support/ToolchainSupport.h" + +#include <string> + +namespace arm_compute +{ +/** Helper function to create and return a unique_ptr pointed to a GLES kernel object + * It also calls the kernel's configuration. + * + * @param[in] args All the arguments that need pass to kernel's configuration. + * + * @return A unique pointer pointed to a GLES kernel object + */ +template <typename Kernel, typename... T> +std::unique_ptr<Kernel> create_configure_kernel(T &&... 
args) +{ + std::unique_ptr<Kernel> k = arm_compute::support::cpp14::make_unique<Kernel>(); + k->configure(std::forward<T>(args)...); + return k; +} + +/** Helper function to create and return a unique_ptr pointed to a GLES kernel object + * + * @return A unique pointer pointed to a GLES kernel object + */ +template <typename Kernel> +std::unique_ptr<Kernel> create_kernel() +{ + std::unique_ptr<Kernel> k = arm_compute::support::cpp14::make_unique<Kernel>(); + return k; +} + +/** Max vector width of an GLES vector */ +static constexpr unsigned int max_gc_vector_width = 16; +} +#endif /* __ARM_COMPUTE_GCHELPERS_H__ */ diff --git a/arm_compute/core/GLES_COMPUTE/GCKernelLibrary.h b/arm_compute/core/GLES_COMPUTE/GCKernelLibrary.h new file mode 100644 index 000000000..082732904 --- /dev/null +++ b/arm_compute/core/GLES_COMPUTE/GCKernelLibrary.h @@ -0,0 +1,307 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_GCKERNELLIBRARY_H__ +#define __ARM_COMPUTE_GCKERNELLIBRARY_H__ + +#include "arm_compute/core/GLES_COMPUTE/OpenGLES.h" +#include "arm_compute/core/Utils.h" + +#include <map> +#include <set> +#include <string> +#include <utility> +#include <vector> + +namespace arm_compute +{ +/** GCProgram class */ +class GCProgram +{ +public: + /** Default constructor. */ + GCProgram(); + /** Construct program from source file. + * + * @param[in] name Program name. + * @param[in] source Program source. + */ + GCProgram(std::string name, std::string source); + /** Default Copy Constructor. */ + GCProgram(const GCProgram &) = default; + /** Default Move Constructor. */ + GCProgram(GCProgram &&) = default; + /** Default copy assignment operator. */ + GCProgram &operator=(const GCProgram &) = default; + /** Default move assignment operator. */ + GCProgram &operator=(GCProgram &&) = default; + /** Returns program name. + * + * @return Program's name. + */ + std::string name() const + { + return _name; + } + /** Link program. + * + * @param[in] shader Shader used to link program. + * + * @return linked program id . + */ + GLuint link_program(GLuint shader); + /** Compile shader. + * + * @param[in] build_options Shader build options. + * + * @return GLES shader object. + */ + GLuint compile_shader(const std::string &build_options); + +private: + std::string _name; /**< Program name. */ + std::string _source; /**< Source code for the program. 
*/ +}; + +/** GCKernel class */ +class GCKernel +{ +public: + /** Default Constructor. */ + GCKernel(); + /** Default Copy Constructor. */ + GCKernel(const GCKernel &) = default; + /** Default Move Constructor. */ + GCKernel(GCKernel &&) = default; + /** Default copy assignment operator. */ + GCKernel &operator=(const GCKernel &) = default; + /** Default move assignment operator. */ + GCKernel &operator=(GCKernel &&) = default; + /** Constructor. + * + * @param[in] name Kernel name. + * @param[in] program Built program. + */ + GCKernel(std::string name, GLuint program); + /** Destructor. + */ + ~GCKernel(); + /** Returns kernel name. + * + * @return Kernel's name. + */ + std::string name() const + { + return _name; + } + /** Get program id. + * + * @return program id. + */ + GLuint get_program() const + { + return _program; + } + /** Use current program. + * + * @return program id. + */ + void use(); + /** Unuse current program. + * + * @return program id. + */ + void unuse(); + /** Set argument value at index of shader params. + * + * @param[in] idx Index in shader params. + * @param[in] value Argument value to be set. + */ + template <class T> + void set_argument(unsigned int idx, T value) + { + if(idx >= _shader_arguments.size()) + { + _shader_arguments.resize(idx + 1, 0); + } + + unsigned int *p = reinterpret_cast<unsigned int *>(&value); + _shader_arguments[idx] = *p; + } + /** Clear shader arguments. + * + */ + void clear_arguments() + { + _shader_arguments.clear(); + } + /** Set shader params binding point. + * + * @param[in] binding Shader params binding point. + */ + void set_shader_params_binding_point(unsigned int binding) + { + _shader_params_binding_point = binding; + } + /** Update shader params. + * + */ + void update_shader_params(); + /** Clean up program and ubo. + * + */ + void cleanup(); + +private: + std::string _name; /**< Kernel name */ + GLuint _program; /**< Linked program id */ + std::vector<unsigned int> _shader_arguments; /**< Store all the values of the shader arguments */ + GLuint _shader_params_ubo_name; /**< Uniform buffer object name for shader parameters */ + GLuint _shader_params_binding_point; /**< The binding point of the uniform block for shader parameters */ + GLuint _shader_params_index; /**< The index of the uniform block */ + GLint _shader_params_size; /**< The uniform block data size in the shader */ + static constexpr const char *_shader_params_name = "shader_params"; /**< The uniform block name in the shader */ +}; + +/** GCKernelLibrary class */ +class GCKernelLibrary +{ + using StringSet = std::set<std::string>; + +private: + /** Default Constructor. */ + GCKernelLibrary(); + +public: + /** Prevent instances of this class from being copied. */ + GCKernelLibrary(const GCKernelLibrary &) = delete; + /** Prevent instances of this class from being copied. */ + const GCKernelLibrary &operator=(const GCKernelLibrary &) = delete; + /** Default Destructor. */ + ~GCKernelLibrary(); + + static GCKernelLibrary &get(); + /** Initialises the kernel library. + * + * @param[in] shader_path (Optional) Path of the directory from which shader sources are loaded. + * @param[in] dpy (Optional) EGLdisplay set by external application. + * @param[in] ctx (Optional) EGLContext set by external application. 
+ */ + void init(std::string shader_path = "./", EGLDisplay dpy = EGL_NO_DISPLAY, EGLContext ctx = EGL_NO_CONTEXT) + { + _shader_path = std::move(shader_path); + + _display = dpy; + _context = ctx; + + if(_display == EGL_NO_DISPLAY || _context == EGL_NO_CONTEXT) + { + setup_context(); + + _own_context = true; + } + + eglMakeCurrent(_display, EGL_NO_SURFACE, EGL_NO_SURFACE, _context); + setup_dummy_fbo(); + } + + /** Sets the path that the shaders reside in. + * + * @param[in] shader_path Path of the shader. + */ + void set_shader_path(const std::string &shader_path) + { + _shader_path = shader_path; + }; + /** Sets display and context to create kernel. + * + * @param[in] dpy EGLdisplay set by external application. + * @param[in] ctx EGLContext set by external application. + */ + void set_context(EGLDisplay dpy, EGLContext ctx) + { + _display = dpy; + _context = ctx; + + eglMakeCurrent(dpy, EGL_NO_SURFACE, EGL_NO_SURFACE, ctx); + setup_dummy_fbo(); + }; + /** Creates a kernel from the kernel library. + * + * @param[in] shader_name Shader name. + * @param[in] build_options_set Shader build options as a set. + * + * @return The created kernel. + */ + GCKernel create_kernel(const std::string &shader_name, const StringSet &build_options_set = {}) const; + /** Serializes and saves programs to a binary. + * + */ + void save_binary(); + /** Load serialized binary with all the programs. + * + */ + void load_binary(); + /** Setup a dummy fbo to workaround an issue on Galaxy S8. + * + */ + void setup_dummy_fbo(); + +private: + /** Preprocess GLES shader + * + * @param[in] shader_source Source code of the shader to preprocess. + * + * @return Preprocessed GLES shader object. + */ + const std::string preprocess_shader(const std::string &shader_source) const; + /** Load program and its dependencies. + * + * @param[in] program_name Name of the program to load. + */ + const GCProgram &load_program(const std::string &program_name) const; + /** Concatenates contents of a set into a single string. + * + * @param[in] s Input set to concatenate. + * + * @return Concatenated string. + */ + std::string stringify_set(const StringSet &s) const; + /** Set up EGL context. + */ + void setup_context(); + + EGLDisplay _display; /**< Underlying EGL Display. */ + EGLContext _context; /**< Underlying EGL Context. */ + GLuint _frame_buffer; /**< Dummy fbo */ + GLuint _tex_rt; /**< Dummy texture for render target */ + bool _own_context; /**< Self created context or not. */ + std::string _shader_path; /**< Path to the shaders folder. */ + mutable std::map<std::string, const GCProgram> _programs_map; /**< Map with all already loaded program data. */ + mutable std::map<std::string, const GCKernel> _built_programs_map; /**< Map with all already built program data. */ + static const std::map<std::string, std::string> _shader_program_map; /**< Map that associates kernel names with programs. */ + static const std::map<std::string, std::string> _program_source_map; /**< Contains sources for all programs. + Used for compile-time shader inclusion. */ +}; +} +#endif /* __ARM_COMPUTE_GCKERNELLIBRARY_H__ */ diff --git a/arm_compute/core/GLES_COMPUTE/GCKernels.h b/arm_compute/core/GLES_COMPUTE/GCKernels.h new file mode 100644 index 000000000..417c98af6 --- /dev/null +++ b/arm_compute/core/GLES_COMPUTE/GCKernels.h @@ -0,0 +1,48 @@ +/* + * Copyright (c) 2017 ARM Limited. 
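As a rough usage sketch of the GCKernelLibrary shown above (not part of this patch; the shader path, shader name and defines are hypothetical placeholders), a client initialises the singleton once and then requests kernels by name:

    #include "arm_compute/core/GLES_COMPUTE/GCKernelLibrary.h"

    #include <set>
    #include <string>

    void init_and_build()
    {
        // With no EGL display/context supplied, the library creates and owns its own context.
        arm_compute::GCKernelLibrary::get().init("./cs_shaders/");

        // Hypothetical shader name and defines, passed as the build-option set.
        std::set<std::string> build_opts{ "#define DATA_TYPE_FP16" };
        arm_compute::GCKernel kernel = arm_compute::GCKernelLibrary::get().create_kernel("direct_convolution3x3", build_opts);
    }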
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_GCKERNELS_H__ +#define __ARM_COMPUTE_GCKERNELS_H__ + +/* Header regrouping all the GLES compute kernels */ +#include "arm_compute/core/GLES_COMPUTE/kernels/GCAbsoluteDifferenceKernel.h" +#include "arm_compute/core/GLES_COMPUTE/kernels/GCActivationLayerKernel.h" +#include "arm_compute/core/GLES_COMPUTE/kernels/GCBatchNormalizationLayerKernel.h" +#include "arm_compute/core/GLES_COMPUTE/kernels/GCCol2ImKernel.h" +#include "arm_compute/core/GLES_COMPUTE/kernels/GCDepthConcatenateLayerKernel.h" +#include "arm_compute/core/GLES_COMPUTE/kernels/GCDirectConvolutionLayerKernel.h" +#include "arm_compute/core/GLES_COMPUTE/kernels/GCDropoutLayerKernel.h" +#include "arm_compute/core/GLES_COMPUTE/kernels/GCFillBorderKernel.h" +#include "arm_compute/core/GLES_COMPUTE/kernels/GCGEMMInterleave4x4Kernel.h" +#include "arm_compute/core/GLES_COMPUTE/kernels/GCGEMMMatrixAccumulateBiasesKernel.h" +#include "arm_compute/core/GLES_COMPUTE/kernels/GCGEMMMatrixAdditionKernel.h" +#include "arm_compute/core/GLES_COMPUTE/kernels/GCGEMMMatrixMultiplyKernel.h" +#include "arm_compute/core/GLES_COMPUTE/kernels/GCGEMMTranspose1xWKernel.h" +#include "arm_compute/core/GLES_COMPUTE/kernels/GCIm2ColKernel.h" +#include "arm_compute/core/GLES_COMPUTE/kernels/GCNormalizationLayerKernel.h" +#include "arm_compute/core/GLES_COMPUTE/kernels/GCPixelWiseMultiplicationKernel.h" +#include "arm_compute/core/GLES_COMPUTE/kernels/GCPoolingLayerKernel.h" +#include "arm_compute/core/GLES_COMPUTE/kernels/GCSoftmaxLayerKernel.h" +#include "arm_compute/core/GLES_COMPUTE/kernels/GCTransposeKernel.h" + +#endif /* __ARM_COMPUTE_GCKERNELS_H__ */ diff --git a/arm_compute/core/GLES_COMPUTE/IGCKernel.h b/arm_compute/core/GLES_COMPUTE/IGCKernel.h new file mode 100644 index 000000000..11b2b17e5 --- /dev/null +++ b/arm_compute/core/GLES_COMPUTE/IGCKernel.h @@ -0,0 +1,179 @@ +/* + * Copyright (c) 2017 ARM Limited. 
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_IGCKERNEL_H__
+#define __ARM_COMPUTE_IGCKERNEL_H__
+
+#include "arm_compute/core/GLES_COMPUTE/GCKernelLibrary.h"
+#include "arm_compute/core/GLES_COMPUTE/OpenGLES.h"
+
+#include "arm_compute/core/IKernel.h"
+
+namespace arm_compute
+{
+class IGCTensor;
+class Window;
+
+/** Common interface for all the GLES kernels */
+class IGCKernel : public IKernel
+{
+public:
+    /** Constructor */
+    IGCKernel();
+    /** Returns a reference to the GLES kernel of this object.
+     *
+     * @return A reference to the GLES kernel of this object.
+     */
+    GCKernel &kernel();
+
+    class BufferParam
+    {
+    public:
+        /** Tensor's binding point in this kernel. */
+        unsigned int binding_point = 0;
+        /** The base 2 logarithm of the SSBO buffer data type size (number of bits to be shifted for offset calculation) */
+        unsigned int buffer_data_type_shift = 0;
+
+        /** Constructor
+         *
+         * @param[in] binding Tensor's binding point.
+         * @param[in] shift Number of bits to be shifted for offset calculation
+         */
+        BufferParam(const unsigned int binding, const unsigned int shift)
+            : binding_point(binding), buffer_data_type_shift(shift)
+        {
+        }
+    };
+
+    /** Add the passed 1D tensor's parameters to the object's kernel's arguments starting from the index idx.
+     *
+     * @param[in] idx Index at which to start adding the tensor's arguments. Input and output tensors have separate indices: with multiple indices they start from 1, with a single index it must be set to 0.
+     * @param[in] tensor Tensor to set as an argument of the object's kernel.
+     * @param[in] binding_point Tensor's binding point in this kernel.
+     * @param[in] window Window the kernel will be executed on.
+     */
+    void add_1D_tensor_argument(unsigned int &idx, const IGCTensor *tensor, const unsigned int binding_point, const Window &window);
+
+    /** Add the passed 1D tensor's parameters to the object's kernel's arguments starting from the index idx.
+     *
+     * @param[in] idx Index at which to start adding the tensor's arguments. Input and output tensors have separate indices: with multiple indices they start from 1, with a single index it must be set to 0.
+     * @param[in] tensor Tensor to set as an argument of the object's kernel.
+     * @param[in] param Additional parameter for GLES SSBO buffer.
+     * @param[in] window Window the kernel will be executed on.
+     */
+    void add_1D_tensor_argument(unsigned int &idx, const IGCTensor *tensor, const BufferParam &param, const Window &window);
+
+    /** Add the passed 2D tensor's parameters to the object's kernel's arguments starting from the index idx.
+     *
+     * @param[in] idx Index at which to start adding the tensor's arguments. Input and output tensors have separate indices: with multiple indices they start from 1, with a single index it must be set to 0.
+     * @param[in] tensor Tensor to set as an argument of the object's kernel.
+     * @param[in] binding_point Tensor's binding point in this kernel.
+     * @param[in] window Window the kernel will be executed on.
+     */
+    void add_2D_tensor_argument(unsigned int &idx, const IGCTensor *tensor, const unsigned int binding_point, const Window &window);
+
+    /** Add the passed 2D tensor's parameters to the object's kernel's arguments starting from the index idx.
+     *
+     * @param[in] idx Index at which to start adding the tensor's arguments. Input and output tensors have separate indices: with multiple indices they start from 1, with a single index it must be set to 0.
+     * @param[in] tensor Tensor to set as an argument of the object's kernel.
+     * @param[in] param Additional parameter for GLES SSBO buffer.
+     * @param[in] window Window the kernel will be executed on.
+     */
+    void add_2D_tensor_argument(unsigned int &idx, const IGCTensor *tensor, const BufferParam &param, const Window &window);
+
+    /** Add the passed 3D tensor's parameters to the object's kernel's arguments starting from the index idx.
+     *
+     * @param[in] idx Index at which to start adding the tensor's arguments. Input and output tensors have separate indices: with multiple indices they start from 1, with a single index it must be set to 0.
+     * @param[in] tensor Tensor to set as an argument of the object's kernel.
+     * @param[in] binding_point Tensor's binding point in this kernel.
+     * @param[in] window Window the kernel will be executed on.
+     */
+    void add_3D_tensor_argument(unsigned int &idx, const IGCTensor *tensor, const unsigned int binding_point, const Window &window);
+
+    /** Add the passed 3D tensor's parameters to the object's kernel's arguments starting from the index idx.
+     *
+     * @param[in] idx Index at which to start adding the tensor's arguments. Input and output tensors have separate indices: with multiple indices they start from 1, with a single index it must be set to 0.
+     * @param[in] tensor Tensor to set as an argument of the object's kernel.
+     * @param[in] param Additional parameter for GLES SSBO buffer.
+     * @param[in] window Window the kernel will be executed on.
+     */
+    void add_3D_tensor_argument(unsigned int &idx, const IGCTensor *tensor, const BufferParam &param, const Window &window);
+
+    /** Returns the number of arguments enqueued per 1D tensor object.
+     *
+     * @return The number of arguments enqueued per 1D tensor object.
+     */
+    unsigned int num_arguments_per_1D_tensor() const;
+    /** Returns the number of arguments enqueued per 2D tensor object.
+     *
+     * @return The number of arguments enqueued per 2D tensor object.
+     */
+    unsigned int num_arguments_per_2D_tensor() const;
+    /** Returns the number of arguments enqueued per 3D tensor object.
+     *
+     * @return The number of arguments enqueued per 3D tensor object.
+     */
+    unsigned int num_arguments_per_3D_tensor() const;
+    /** Enqueue the OpenGL ES shader to process the given window
+     *
+     * @param[in] window Region on which to execute the kernel. (Must be a valid region of the window returned by window()).
+     */
+    virtual void run(const Window &window) = 0;
+
+private:
+    /** Add the passed tensor's parameters to the object's kernel's arguments starting from the index idx.
+     *
+     * @param[in] idx Index at which to start adding the tensor's arguments. Input and output tensors have separate indices: with multiple indices they start from 1, with a single index it must be set to 0.
+     * @param[in] tensor Tensor to set as an argument of the object's kernel.
+     * @param[in] param Additional parameter for GLES SSBO buffer.
+     * @param[in] window Window the kernel will be executed on.
+     */
+    template <unsigned int dimension_size>
+    void add_tensor_argument(unsigned int &idx, const IGCTensor *tensor, const BufferParam &param, const Window &window);
+
+    /** Returns the number of arguments enqueued per tensor object.
+     *
+     * @return The number of arguments enqueued per tensor object.
+     */
+    template <unsigned int dimension_size>
+    unsigned int num_arguments_per_tensor() const;
+
+protected:
+    GCKernel _kernel; /**< GLES kernel to run */
+};
+
+/** Add the kernel to the command queue with the given window.
+ *
+ * @note Depending on the size of the window, this might translate into several jobs being enqueued.
+ *
+ * @note If kernel->kernel() is empty then the function will return without adding anything to the queue.
+ *
+ * @param[in] kernel Kernel to enqueue
+ * @param[in] window Window the kernel has to process.
+ * @param[in] lws Local workgroup size requested, by default (1, 1, 1)
+ *
+ * @note If any dimension of the lws is greater than the global workgroup size then no lws will be passed.
+ */
+void enqueue(IGCKernel &kernel, const Window &window, const gles::NDRange &lws = gles::NDRange(1U, 1U, 1U));
+}
+#endif /*__ARM_COMPUTE_IGCKERNEL_H__ */
diff --git a/arm_compute/core/GLES_COMPUTE/IGCSimple2DKernel.h b/arm_compute/core/GLES_COMPUTE/IGCSimple2DKernel.h
new file mode 100644
index 000000000..413e86a2b
--- /dev/null
+++ b/arm_compute/core/GLES_COMPUTE/IGCSimple2DKernel.h
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_IGCSIMPLE2DKERNEL_H__
+#define __ARM_COMPUTE_IGCSIMPLE2DKERNEL_H__
+
+#include "arm_compute/core/GLES_COMPUTE/IGCSimpleKernel.h"
+
+namespace arm_compute
+{
+class IGCTensor;
+
+/** Interface for simple OpenGL ES kernels having 1 tensor input and 1 tensor output.
This interface can be used when the work-item processes a 2D tile */ +class IGCSimple2DKernel : public IGCSimpleKernel +{ +public: + // Inherited methods overridden: + void run(const Window &window) override; +}; +} +#endif /*__ARM_COMPUTE_IGCSIMPLE2DKERNEL_H__ */ diff --git a/arm_compute/core/GLES_COMPUTE/IGCSimple3DKernel.h b/arm_compute/core/GLES_COMPUTE/IGCSimple3DKernel.h new file mode 100644 index 000000000..622e53c38 --- /dev/null +++ b/arm_compute/core/GLES_COMPUTE/IGCSimple3DKernel.h @@ -0,0 +1,43 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_IGCSIMPLE3DKERNEL_H__ +#define __ARM_COMPUTE_IGCSIMPLE3DKERNEL_H__ + +#include "arm_compute/core/GLES_COMPUTE/IGCSimple2DKernel.h" + +namespace arm_compute +{ +class IGCTensor; + +/** Interface for simple GLES kernels having 1 tensor input and 1 tensor output. + * Both input tensor and output tensor must have at least 3 dimensions. + */ +class IGCSimple3DKernel : public IGCSimple2DKernel +{ +public: + // Inherited methods overridden: + void run(const Window &window) override; +}; +} +#endif /*__ARM_COMPUTE_IGCSIMPLE3DKERNEL_H__ */ diff --git a/arm_compute/core/GLES_COMPUTE/IGCSimpleKernel.h b/arm_compute/core/GLES_COMPUTE/IGCSimpleKernel.h new file mode 100644 index 000000000..a23c4e774 --- /dev/null +++ b/arm_compute/core/GLES_COMPUTE/IGCSimpleKernel.h @@ -0,0 +1,66 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_IGCSIMPLEKERNEL_H__ +#define __ARM_COMPUTE_IGCSIMPLEKERNEL_H__ + +#include "arm_compute/core/GLES_COMPUTE/IGCKernel.h" +#include "arm_compute/core/GLES_COMPUTE/IGCTensor.h" +#include "arm_compute/core/Helpers.h" + +namespace arm_compute +{ +/** Interface for simple OpenGL ES kernels having 1 tensor input and 1 tensor output */ +class IGCSimpleKernel : public IGCKernel +{ +public: + /** Constructor. */ + IGCSimpleKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers). */ + IGCSimpleKernel(const IGCSimpleKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers). */ + IGCSimpleKernel &operator=(const IGCSimpleKernel &) = delete; + /** Allow instances of this class to be moved. */ + IGCSimpleKernel(IGCSimpleKernel &&) = default; + /** Allow instances of this class to be moved. */ + IGCSimpleKernel &operator=(IGCSimpleKernel &&) = default; + /** Default destructor */ + ~IGCSimpleKernel() = default; + + /** Configure the kernel + * + * @param[in] input Source tensor. + * @param[out] output Destination tensor. + * @param[in] num_elems_processed_per_iteration Number of processed elements per iteration. + * @param[in] border_undefined (Optional) True if the border mode is undefined. False if it's replicate or constant. + * @param[in] border_size (Optional) Size of the border. + */ + void configure(const IGCTensor *input, IGCTensor *output, unsigned int num_elems_processed_per_iteration, bool border_undefined = false, const BorderSize &border_size = BorderSize()); + +protected: + const IGCTensor *_input; + IGCTensor *_output; +}; +} + +#endif /*__ARM_COMPUTE_IGCSIMPLEKERNEL_H__ */ diff --git a/arm_compute/core/GLES_COMPUTE/IGCTensor.h b/arm_compute/core/GLES_COMPUTE/IGCTensor.h new file mode 100644 index 000000000..ab4e57e0c --- /dev/null +++ b/arm_compute/core/GLES_COMPUTE/IGCTensor.h @@ -0,0 +1,99 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
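As a rough sketch of how the simple-kernel interfaces above are meant to be used (not code from this commit; the kernel class, shader name, binding points and element count are hypothetical), a derived kernel forwards to IGCSimpleKernel::configure() to set up its window, and its run() binds the two SSBOs and enqueues one job per window slice:

    // Hypothetical kernel in namespace arm_compute, derived from IGCSimple2DKernel.
    void MyCopyKernel::configure(const IGCTensor *input, IGCTensor *output)
    {
        _kernel = GCKernelLibrary::get().create_kernel("my_copy_shader"); // hypothetical shader
        constexpr unsigned int num_elems_processed_per_iteration = 4;     // example value
        IGCSimpleKernel::configure(input, output, num_elems_processed_per_iteration);
    }

    // A run() override along these lines (IGCSimple2DKernel already provides its own run()):
    void MyCopyKernel::run(const Window &window)
    {
        Window slice = window.first_slice_window_2D();
        _kernel.use();
        do
        {
            unsigned int idx = 0; // single index shared by input and output
            add_2D_tensor_argument(idx, _input, 1, slice);  // binding point 1 (example)
            add_2D_tensor_argument(idx, _output, 2, slice); // binding point 2 (example)
            _kernel.update_shader_params();
            enqueue(*this, slice);
        }
        while(window.slide_window_slice_2D(slice));
        _kernel.unuse();
    }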
+ */ +#ifndef __ARM_COMPUTE_IGCTENSOR_H__ +#define __ARM_COMPUTE_IGCTENSOR_H__ + +#include "arm_compute/core/GLES_COMPUTE/OpenGLES.h" +#include "arm_compute/core/ITensor.h" + +#include <cstdint> + +namespace arm_compute +{ +/** Interface for GLES Compute tensor */ +class IGCTensor : public ITensor +{ +public: + /** Default constructor. */ + IGCTensor(); + + /** Prevent instances of this class from being copied (As this class contains pointers). */ + IGCTensor(const IGCTensor &) = delete; + + /** Prevent instances of this class from being copy assigned (As this class contains pointers). */ + IGCTensor &operator=(const IGCTensor &) = delete; + + /** Allow instances of this class to be moved */ + IGCTensor(IGCTensor &&) = default; + + /** Allow instances of this class to be moved */ + IGCTensor &operator=(IGCTensor &&) = default; + + /** Virtual destructor */ + virtual ~IGCTensor() = default; + + /** Map on an allocated buffer. + * + * @param[in] blocking (Optional) If true, then the mapping will be ready to use by the time + * this method returns, else it is the caller's responsibility + * to flush the queue and wait for the mapping operation to have completed before using the returned mapping pointer. + */ + void map(bool blocking = true); + /** Unmap an allocated and mapped buffer. + */ + void unmap(); + /** Clear the contents of the tensor synchronously. + */ + void clear(); + + // Inherited methods overridden: + uint8_t *buffer() const override; + /** Interface to be implemented by the child class to return the tensor's gles compute buffer id. + * + * @return A SSBO buffer id. + */ + virtual GLuint gc_buffer() const = 0; + +protected: + /** Method to be implemented by the child class to map the SSBO. + * + * @param[in] blocking If true, then the mapping will be ready to use by the time + * this method returns, else it is the caller's responsibility + * to flush the queue and wait for the mapping operation to have completed before using the returned mapping pointer. + */ + virtual uint8_t *do_map(bool blocking) = 0; + /** Method to be implemented by the child class to unmap the SSBO. + * + * @note This method simply enqueues the unmap operation, it is the caller's responsibility to flush the queue and make sure the unmap is finished before + * the memory is accessed by the device. + */ + virtual void do_unmap() = 0; + +private: + uint8_t *_mapping; +}; + +using IGCImage = IGCTensor; +} +#endif /*__ARM_COMPUTE_IGCTENSOR_H__ */ diff --git a/arm_compute/core/GLES_COMPUTE/OpenGLES.h b/arm_compute/core/GLES_COMPUTE/OpenGLES.h new file mode 100644 index 000000000..e12398294 --- /dev/null +++ b/arm_compute/core/GLES_COMPUTE/OpenGLES.h @@ -0,0 +1,165 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
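A hedged sketch of the map/unmap contract documented above, assuming tensor is some concrete, already-allocated IGCTensor implementation:

    tensor.map(true);                    // blocking map: the pointer is usable on return
    uint8_t *host_ptr = tensor.buffer(); // CPU-visible view of the SSBO
    // ... read or fill the tensor data on the host ...
    tensor.unmap();                      // host_ptr must not be dereferenced after this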
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_OPENGLES_H__ +#define __ARM_COMPUTE_OPENGLES_H__ + +#include "arm_compute/core/Log.h" + +#include <EGL/egl.h> +#include <EGL/eglext.h> +#include <EGL/eglplatform.h> +#include <GLES3/gl31.h> +#include <GLES3/gl3ext.h> +#include <cstddef> + +#ifdef ARM_COMPUTE_DEBUG_ENABLED +#define ARM_COMPUTE_GL_CHECK(x) \ + x; \ + { \ + GLenum error = glGetError(); \ + if(error != GL_NO_ERROR) \ + ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("glGetError() = %i (0x%.8x)\n", error, error); \ + } +#else /* ARM_COMPUTE_DEBUG_ENABLED */ +#define ARM_COMPUTE_GL_CHECK(x) x +#endif /* ARM_COMPUTE_DEBUG_ENABLED */ + +namespace arm_compute +{ +namespace gles +{ +/** Class interface for specifying NDRange values. */ +class NDRange +{ +private: + size_t _sizes[3]; + size_t _dimensions; + +public: + /** Default constructor - resulting range has zero dimensions. */ + NDRange() + : _dimensions(0) + { + _sizes[0] = 0; + _sizes[1] = 0; + _sizes[2] = 0; + } + + /** Constructs one-dimensional range. + * + * @param[in] size0 Size of the first dimension. + */ + NDRange(size_t size0) + : _dimensions(1) + { + _sizes[0] = size0; + _sizes[1] = 1; + _sizes[2] = 1; + } + + /** Constructs two-dimensional range. + * + * @param[in] size0 Size of the first dimension. + * @param[in] size1 Size of the second dimension. + */ + NDRange(size_t size0, size_t size1) + : _dimensions(2) + { + _sizes[0] = size0; + _sizes[1] = size1; + _sizes[2] = 1; + } + + /** Constructs three-dimensional range. + * + * @param[in] size0 Size of the first dimension. + * @param[in] size1 Size of the second dimension. + * @param[in] size2 Size of the third dimension. + */ + NDRange(size_t size0, size_t size1, size_t size2) + : _dimensions(3) + { + _sizes[0] = size0; + _sizes[1] = size1; + _sizes[2] = size2; + } + + /** Conversion operator to const size_t *. + * + * @returns A pointer to the size of the first dimension. + */ + operator const size_t *() const + { + return _sizes; + } + + /** Queries the number of dimensions in the range. + * + * @returns The number of dimensions. + */ + size_t dimensions() const + { + return _dimensions; + } + + /** Returns the size of the object in bytes based on the runtime number of dimensions + * + * @returns The size of the object in bytes. + */ + size_t size() const + { + return _dimensions * sizeof(size_t); + } + + /** Returns the sizes array for each dimensions. + * + * @returns The sizes array + */ + size_t *get() + { + return _sizes; + } + + /** Returns the sizes array for each dimensions. + * + * @returns The sizes array + */ + const size_t *get() const + { + return _sizes; + } +}; + +static const NDRange NullRange; +static const NDRange Range_128_1 = NDRange(128, 1); +} // namespace gles + +/** Check if the OpenGL ES 3.1 API is available at runtime. + * + * @returns true if the OpenGL ES 3.1 API is available. 
+ */ +bool opengles31_is_available(); +} // namespace arm_compute + +#endif /* __ARM_COMPUTE_OPENGLES_H__ */ diff --git a/arm_compute/core/GLES_COMPUTE/kernels/GCAbsoluteDifferenceKernel.h b/arm_compute/core/GLES_COMPUTE/kernels/GCAbsoluteDifferenceKernel.h new file mode 100644 index 000000000..71f7b3770 --- /dev/null +++ b/arm_compute/core/GLES_COMPUTE/kernels/GCAbsoluteDifferenceKernel.h @@ -0,0 +1,71 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_GCABSOLUTEDIFFERENCEKERNEL_H__ +#define __ARM_COMPUTE_GCABSOLUTEDIFFERENCEKERNEL_H__ + +#include "arm_compute/core/GLES_COMPUTE/IGCKernel.h" + +namespace arm_compute +{ +class IGCTensor; + +/** Interface for the absolute difference kernel. + * + * Absolute difference is computed by: + * @f[ output(x,y) = | input1(x,y) - input2(x,y) | @f] + */ +class GCAbsoluteDifferenceKernel : public IGCKernel +{ +public: + /** Default constructor. */ + GCAbsoluteDifferenceKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers). */ + GCAbsoluteDifferenceKernel(const GCAbsoluteDifferenceKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers). */ + GCAbsoluteDifferenceKernel &operator=(const GCAbsoluteDifferenceKernel &) = delete; + /** Allow instances of this class to be moved. */ + GCAbsoluteDifferenceKernel(GCAbsoluteDifferenceKernel &&) = default; + /** Allow instances of this class to be moved. */ + GCAbsoluteDifferenceKernel &operator=(GCAbsoluteDifferenceKernel &&) = default; + /** Default destructor */ + ~GCAbsoluteDifferenceKernel() = default; + + /** Set the inputs and output images. + * + * @param[in] input1 Source tensor. Data types supported: U8 + * @param[in] input2 Source tensor. Data types supported: U8 + * @param[out] output Destination tensor. Data types supported: U8 + */ + void configure(const IGCTensor *input1, const IGCTensor *input2, IGCTensor *output); + + // Inherited methods overridden: + void run(const Window &window) override; + +private: + const IGCTensor *_input1; /**< Source tensor 1. */ + const IGCTensor *_input2; /**< Source tensor 2. */ + IGCTensor *_output; /**< Destination tensor. 
*/ +}; +} +#endif /* __ARM_COMPUTE_GCABSOLUTEDIFFERENCEKERNEL_H__ */ diff --git a/arm_compute/core/GLES_COMPUTE/kernels/GCActivationLayerKernel.h b/arm_compute/core/GLES_COMPUTE/kernels/GCActivationLayerKernel.h new file mode 100644 index 000000000..fc1d52f45 --- /dev/null +++ b/arm_compute/core/GLES_COMPUTE/kernels/GCActivationLayerKernel.h @@ -0,0 +1,68 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_GCACTIVATIONLAYERKERNEL_H__ +#define __ARM_COMPUTE_GCACTIVATIONLAYERKERNEL_H__ + +#include "arm_compute/core/GLES_COMPUTE/IGCKernel.h" + +namespace arm_compute +{ +class IGCTensor; + +/** Interface for the activation layer kernel. */ +class GCActivationLayerKernel : public IGCKernel +{ +public: + /** Default constructor */ + GCActivationLayerKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + GCActivationLayerKernel(const GCActivationLayerKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + GCActivationLayerKernel &operator=(const GCActivationLayerKernel &) = delete; + /** Allow instances of this class to be moved */ + GCActivationLayerKernel(GCActivationLayerKernel &&) = default; + /** Allow instances of this class to be moved */ + GCActivationLayerKernel &operator=(GCActivationLayerKernel &&) = default; + /** Default destructor */ + ~GCActivationLayerKernel() = default; + /** Set the input and output tensor. + * + * @note If the output tensor is a nullptr, the activation function will be performed in-place + * + * @param[in, out] input Source tensor. In case of @p output tensor = nullptr, this tensor will store the result + * of the activation function. Data types supported: F16/F32. + * @param[out] output Destination tensor. Data type should match the input data type. + * @param[in] act_info Activation layer information. 
+ */ + void configure(IGCTensor *input, IGCTensor *output, ActivationLayerInfo act_info); + + // Inherited methods overridden: + void run(const Window &window) override; + +private: + IGCTensor *_input; + IGCTensor *_output; +}; +} +#endif /*__ARM_COMPUTE_GCACTIVATIONLAYERKERNEL_H__ */ diff --git a/arm_compute/core/GLES_COMPUTE/kernels/GCBatchNormalizationLayerKernel.h b/arm_compute/core/GLES_COMPUTE/kernels/GCBatchNormalizationLayerKernel.h new file mode 100644 index 000000000..2bbd6a83f --- /dev/null +++ b/arm_compute/core/GLES_COMPUTE/kernels/GCBatchNormalizationLayerKernel.h @@ -0,0 +1,77 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_GCBATCHNORMALIZATIONLAYERKERNEL_H__ +#define __ARM_COMPUTE_GCBATCHNORMALIZATIONLAYERKERNEL_H__ + +#include "arm_compute/core/GLES_COMPUTE/IGCKernel.h" + +namespace arm_compute +{ +class IGCTensor; + +/** Interface for the BatchNormalization layer kernel. + */ +class GCBatchNormalizationLayerKernel : public IGCKernel +{ +public: + /** Constructor */ + GCBatchNormalizationLayerKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + GCBatchNormalizationLayerKernel(const GCBatchNormalizationLayerKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + GCBatchNormalizationLayerKernel &operator=(const GCBatchNormalizationLayerKernel &) = delete; + /** Default Move Constructor. */ + GCBatchNormalizationLayerKernel(GCBatchNormalizationLayerKernel &&) = default; + /** Default move assignment operator. */ + GCBatchNormalizationLayerKernel &operator=(GCBatchNormalizationLayerKernel &&) = default; + /** Default destructor */ + ~GCBatchNormalizationLayerKernel() = default; + + /** Set the input and output tensors. + * + * @param[in] input Source tensor. 3 lower dimensions represent a single input with dimensions [width, height, FM]. + * The rest are optional and used for representing batches. Data types supported: F16/F32. + * @param[out] output Destination tensor. Output will have the same number of dimensions as input. Data type supported: same as @p input + * @param[in] mean Mean values tensor. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input + * @param[in] var Variance values tensor. 1 dimension with size equal to the feature maps [FM]. 
Data types supported: Same as @p input + * @param[in] beta Beta values tensor. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input + * @param[in] gamma Gamma values tensor. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input + * @param[in] epsilon Small value to avoid division with zero. + */ + void configure(const IGCTensor *input, IGCTensor *output, const IGCTensor *mean, const IGCTensor *var, const IGCTensor *beta, const IGCTensor *gamma, float epsilon); + + // Inherited methods overridden: + void run(const Window &window) override; + +private: + const IGCTensor *_input; + IGCTensor *_output; + const IGCTensor *_mean; + const IGCTensor *_var; + const IGCTensor *_beta; + const IGCTensor *_gamma; + float _epsilon; +}; +} +#endif /*__ARM_COMPUTE_GCBATCHNORMALIZATIONLAYERKERNEL_H__ */ diff --git a/arm_compute/core/GLES_COMPUTE/kernels/GCCol2ImKernel.h b/arm_compute/core/GLES_COMPUTE/kernels/GCCol2ImKernel.h new file mode 100644 index 000000000..257ab0eca --- /dev/null +++ b/arm_compute/core/GLES_COMPUTE/kernels/GCCol2ImKernel.h @@ -0,0 +1,92 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef __ARM_COMPUTE_GCCOL2IMKERNEL_H__ +#define __ARM_COMPUTE_GCCOL2IMKERNEL_H__ + +#include "arm_compute/core/GLES_COMPUTE/IGCKernel.h" + +namespace arm_compute +{ +class IGCTensor; + +/** Interface for the col2im reshaping kernel. + * + * Rearranges each matrix column into image blocks. It's the inverse operation of @ref GCIm2ColKernel. 
+ * + * For example, a vector of 9 elements can be reshaped to a block(image) of 3x3: + * + * @f[ + * \left( \begin{array}{ccccccccc} + * a0 & a1 & a2 & a3 & a4 & a5 & a6 & a7 & a8 \\ + * \end{array} \right) + * \rightarrow + * \left( \begin{array}{ccc} + * a0 & a1 & a2 \\ + * a3 & a4 & a5 \\ + * a6 & a7 & a8 \\ + * \end{array} \right) + * @f] + */ +class GCCol2ImKernel : public IGCKernel +{ +public: + /** Default constructor */ + GCCol2ImKernel(); + + /** Prevent instances of this class from being copied (As this class contains pointers) */ + GCCol2ImKernel(const GCCol2ImKernel &) = delete; + + /** Prevent instances of this class from being copied (As this class contains pointers) */ + GCCol2ImKernel &operator=(const GCCol2ImKernel &) = delete; + + /** Allow instances of this class to be moved */ + GCCol2ImKernel(GCCol2ImKernel &&) = default; + + /** Allow instances of this class to be moved */ + GCCol2ImKernel &operator=(GCCol2ImKernel &&) = default; + + /** Default destructor */ + ~GCCol2ImKernel() = default; + + /** Set the input and output of the kernel. + * + * @param[in] input The input tensor to convert. Data types supported: F32 + * @param[out] output The output tensor. 3 lower dimensions represent a single output [width, height, OFM], + * while the rest represent batch of outputs. Data types supported: Same as @p input + * @param[in] convolved_dims Output convolved dimensions. + */ + void configure(const IGCTensor *input, IGCTensor *output, std::pair<unsigned int, unsigned int> convolved_dims); + + // Inherited methods overridden: + void run(const Window &window) override; + +private: + const IGCTensor *_input; + IGCTensor *_output; + std::pair<unsigned int, unsigned int> _convolved_dims; +}; +} + +#endif /*__ARM_COMPUTE_GCCOL2IMKERNEL_H__ */ diff --git a/arm_compute/core/GLES_COMPUTE/kernels/GCDepthConcatenateLayerKernel.h b/arm_compute/core/GLES_COMPUTE/kernels/GCDepthConcatenateLayerKernel.h new file mode 100644 index 000000000..ce220cc56 --- /dev/null +++ b/arm_compute/core/GLES_COMPUTE/kernels/GCDepthConcatenateLayerKernel.h @@ -0,0 +1,76 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
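As a concrete, hypothetical shape example of the col2im reshaping above: for a convolved output of 3x3, convolved_dims is {3, 3} and each input column becomes one 3x3 plane of the output.

    // Sketch only: col_matrix and output are assumed to be allocated GC tensors
    // whose shapes are consistent with a 3x3 convolved output.
    GCCol2ImKernel col2im;
    col2im.configure(&col_matrix, &output, std::make_pair(3U, 3U)); // {convolved width, convolved height}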
+ */ + +#ifndef __ARM_COMPUTE_GCDEPTHCONCATENATEKERNEL_H__ +#define __ARM_COMPUTE_GCDEPTHCONCATENATEKERNEL_H__ + +#include "arm_compute/core/GLES_COMPUTE/IGCKernel.h" +#include "arm_compute/core/Types.h" + +namespace arm_compute +{ +class IGCTensor; + +/** Interface for the depth concatenate kernel. + * The input tensor will be concatenated into the output tensor. + */ +class GCDepthConcatenateLayerKernel : public IGCKernel +{ +public: + /** Default constructor */ + GCDepthConcatenateLayerKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + GCDepthConcatenateLayerKernel(const GCDepthConcatenateLayerKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + GCDepthConcatenateLayerKernel &operator=(const GCDepthConcatenateLayerKernel &) = delete; + /** Allow instances of this class to be moved */ + GCDepthConcatenateLayerKernel(GCDepthConcatenateLayerKernel &&) = default; + /** Allow instances of this class to be moved */ + GCDepthConcatenateLayerKernel &operator=(GCDepthConcatenateLayerKernel &&) = default; + /** Default destructor */ + ~GCDepthConcatenateLayerKernel() = default; + /** Initialise the kernel's inputs and output + * + * @param[in] input Input tensor. Data types supported: F16/F32. + * @param[in] depth_offset The offset on the Z axis. + * @param[in,out] output Output tensor. Data types supported: Same as @p input. + * + * @note: The output tensor's low two dimensions can't be smaller than the input one's. + * @note: The gaps between the two lowest dimensions of input and output need to be divisible by 2. + * + */ + void configure(const IGCTensor *input, unsigned int depth_offset, IGCTensor *output); + + // Inherited methods overridden: + void run(const Window &window) override; + BorderSize border_size() const override; + +private: + const IGCTensor *_input; + IGCTensor *_output; + int _top_bottom; + int _left_right; +}; +} +#endif /* __ARM_COMPUTE_GCDEPTHCONCATENATEKERNEL_H__ */ diff --git a/arm_compute/core/GLES_COMPUTE/kernels/GCDirectConvolutionLayerKernel.h b/arm_compute/core/GLES_COMPUTE/kernels/GCDirectConvolutionLayerKernel.h new file mode 100644 index 000000000..415b781bc --- /dev/null +++ b/arm_compute/core/GLES_COMPUTE/kernels/GCDirectConvolutionLayerKernel.h @@ -0,0 +1,87 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
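A hedged sketch of how depth_offset above is typically driven: one kernel per input tensor, each writing into the shared output at an accumulated Z offset (inputs, output and concat_kernels are placeholder names):

    unsigned int depth_offset = 0;
    for(IGCTensor *in : inputs)
    {
        auto k = arm_compute::support::cpp14::make_unique<GCDepthConcatenateLayerKernel>();
        k->configure(in, depth_offset, output);
        depth_offset += in->info()->dimension(2); // advance along the Z (depth) axis
        concat_kernels.emplace_back(std::move(k));
    }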
+ */ +#ifndef __ARM_COMPUTE_GCDIRECTCONVOLUTIONLAYERKERNEL_H__ +#define __ARM_COMPUTE_GCDIRECTCONVOLUTIONLAYERKERNEL_H__ + +#include "arm_compute/core/GLES_COMPUTE/IGCKernel.h" +#include "arm_compute/core/GLES_COMPUTE/OpenGLES.h" + +namespace arm_compute +{ +class IGCTensor; + +/** Interface for the direct convolution kernel. + */ +template <unsigned int kernel_size> +class GCDirectConvolutionLayerKernel : public IGCKernel +{ +public: + /** Default constructor */ + GCDirectConvolutionLayerKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + GCDirectConvolutionLayerKernel(const GCDirectConvolutionLayerKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + GCDirectConvolutionLayerKernel &operator=(const GCDirectConvolutionLayerKernel &) = delete; + /** Allow instances of this class to be moved */ + GCDirectConvolutionLayerKernel(GCDirectConvolutionLayerKernel &&) = default; + /** Allow instances of this class to be moved */ + GCDirectConvolutionLayerKernel &operator=(GCDirectConvolutionLayerKernel &&) = default; + /** Default destructor */ + ~GCDirectConvolutionLayerKernel() = default; + /** Set the input and output of the kernel. + * + * @param[in] input The input tensor to convert. 3 lower dimensions represent a single input [width, height, IFM], + * while every optional dimension from 4 and above represent a batch of inputs. Data types supported: F16/F32 + * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported:Same as @p input. + * @param[in] bias Biases tensor. Shared bias supported. Biases are 1D tensor with dimensions [OFM]. Data type supported:Same as @p input. + * @param[out] output The output tensor. First 2 lower dimensions represent a transform of each 3D input, + * while every dimension above represents a batch. Data types supported: Same as @p input + * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. + */ + void configure(const IGCTensor *input, const IGCTensor *weights, const IGCTensor *bias, IGCTensor *output, const PadStrideInfo &conv_info); + + // Inherited methods overridden: + BorderSize border_size() const override; + + // Inherited methods overridden: + void run(const Window &window) override; + +private: + const IGCTensor *_input; + const IGCTensor *_bias; + const IGCTensor *_weights; + IGCTensor *_output; + BorderSize _border_size; + int _conv_stride_x; + int _conv_stride_y; + int _conv_pad_x; + int _conv_pad_y; + gles::NDRange _lws; +}; + +using GCDirectConvolutionLayer1x1Kernel = GCDirectConvolutionLayerKernel<1>; +using GCDirectConvolutionLayer3x3Kernel = GCDirectConvolutionLayerKernel<3>; +using GCDirectConvolutionLayer5x5Kernel = GCDirectConvolutionLayerKernel<5>; +} +#endif /*__ARM_COMPUTE_GCDIRECTCONVOLUTIONLAYERKERNEL_H__ */ diff --git a/arm_compute/core/GLES_COMPUTE/kernels/GCDropoutLayerKernel.h b/arm_compute/core/GLES_COMPUTE/kernels/GCDropoutLayerKernel.h new file mode 100644 index 000000000..9f04411d9 --- /dev/null +++ b/arm_compute/core/GLES_COMPUTE/kernels/GCDropoutLayerKernel.h @@ -0,0 +1,79 @@ +/* + * Copyright (c) 2017 ARM Limited. 
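Since the kernel size is a compile-time template parameter, callers pick one of the three aliases above at configure time. A rough sketch with illustrative stride and padding values:

    // 3x3 direct convolution, stride 1, 1-pixel padding on both sides.
    GCDirectConvolutionLayer3x3Kernel conv3x3;
    conv3x3.configure(&input, &weights, &biases, &output, PadStrideInfo(1, 1, 1, 1));

    // border_size() is non-zero here, so the input border would normally be filled first,
    // e.g. with GCFillBorderKernel, before this kernel is run.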
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef __ARM_COMPUTE_GCDROPOUTLAYERKERNEL_H__ +#define __ARM_COMPUTE_GCDROPOUTLAYERKERNEL_H__ + +#include "arm_compute/core/GLES_COMPUTE/IGCKernel.h" + +namespace arm_compute +{ +class IGCTensor; + +/** Interface for the dropout layer kernel. + * + * Dropout is used to improve over-fit on neural networks. + * + */ +class GCDropoutLayerKernel : public IGCKernel +{ +public: + /** Default constructor */ + GCDropoutLayerKernel(); + + /** Prevent instances of this class from being copied (As this class contains pointers) */ + GCDropoutLayerKernel(const GCDropoutLayerKernel &) = delete; + + /** Prevent instances of this class from being copied (As this class contains pointers) */ + GCDropoutLayerKernel &operator=(const GCDropoutLayerKernel &) = delete; + + /** Allow instances of this class to be moved */ + GCDropoutLayerKernel(GCDropoutLayerKernel &&) = default; + + /** Allow instances of this class to be moved */ + GCDropoutLayerKernel &operator=(GCDropoutLayerKernel &&) = default; + + /** Set the input and output of the kernel. + * + * @param[in] input The input tensor for this op. Data types supported: F16/F32 + * @param[out] mask The mask tensor. Data types supported: Same as @p input + * @param[out] output The output tensor. Data types supported: Same as @p input + * @param[in] ratio Dropout ratio + * @param[in] forward Forward or backward propagation + * + */ + void configure(const IGCTensor *input, IGCTensor *mask, IGCTensor *output, float ratio, bool forward); + + // Inherited methods overridden: + void run(const Window &window) override; + +private: + const IGCTensor *_input; + IGCTensor *_mask; + IGCTensor *_output; + unsigned int _num_elems_processed_per_iteration; +}; +} + +#endif /*__ARM_COMPUTE_GCDROPOUTLAYERKERNEL_H__ */ diff --git a/arm_compute/core/GLES_COMPUTE/kernels/GCFillBorderKernel.h b/arm_compute/core/GLES_COMPUTE/kernels/GCFillBorderKernel.h new file mode 100644 index 000000000..acb8aa67d --- /dev/null +++ b/arm_compute/core/GLES_COMPUTE/kernels/GCFillBorderKernel.h @@ -0,0 +1,77 @@ +/* + * Copyright (c) 2017 ARM Limited. 
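A hedged sketch of a forward-pass use of the dropout kernel above (the ratio and tensors are placeholders):

    GCDropoutLayerKernel dropout;
    dropout.configure(&input, &mask, &output, 0.5f, true); // 50% drop ratio, forward pass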
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_GCFILLBORDERKERNEL_H__ +#define __ARM_COMPUTE_GCFILLBORDERKERNEL_H__ + +#include "arm_compute/core/GLES_COMPUTE/IGCKernel.h" +#include "arm_compute/core/PixelValue.h" +#include "arm_compute/core/Types.h" + +namespace arm_compute +{ +class IGCTensor; + +/** Interface for filling the border of a kernel */ +class GCFillBorderKernel : public IGCKernel +{ +public: + /** Default constructor */ + GCFillBorderKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + GCFillBorderKernel(const GCFillBorderKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + GCFillBorderKernel &operator=(const GCFillBorderKernel &) = delete; + /** Allow instances of this class to be moved */ + GCFillBorderKernel(GCFillBorderKernel &&) = default; + /** Allow instances of this class to be moved */ + GCFillBorderKernel &operator=(GCFillBorderKernel &&) = default; + /** Default destructor */ + ~GCFillBorderKernel() = default; + + /** Initialise the kernel's input, output and border mode. + * + * @param[in,out] tensor Tensor to process Data types supported: F16/F32. + * @param[in] border_size Size of the border to fill in elements. + * @param[in] border_mode Border mode to use for the convolution. + * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT. + */ + void configure(const IGCTensor *tensor, BorderSize border_size, BorderMode border_mode, const PixelValue &constant_border_value = PixelValue()); + + /** Function to set the constant value on fill border kernel depending on type. + * + * @param[in] idx Index of the kernel argument to set. + * @param[in] constant_border_value Constant value to use for borders if border_mode is set to CONSTANT. 
+ */ + template <class T> + void set_constant_border(unsigned int idx, const PixelValue &constant_border_value); + + // Inherited methods overridden: + void run(const Window &window) override; + bool is_parallelisable() const override; + +private: + const IGCTensor *_tensor; +}; +} +#endif /*__ARM_COMPUTE_GCFILLBORDERKERNEL_H__ */ diff --git a/arm_compute/core/GLES_COMPUTE/kernels/GCGEMMInterleave4x4Kernel.h b/arm_compute/core/GLES_COMPUTE/kernels/GCGEMMInterleave4x4Kernel.h new file mode 100644 index 000000000..b2369a6ad --- /dev/null +++ b/arm_compute/core/GLES_COMPUTE/kernels/GCGEMMInterleave4x4Kernel.h @@ -0,0 +1,80 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_GCGEMMINTERLEAVE4X4KERNEL_H__ +#define __ARM_COMPUTE_GCGEMMINTERLEAVE4X4KERNEL_H__ + +#include "arm_compute/core/GLES_COMPUTE/IGCKernel.h" + +namespace arm_compute +{ +class IGCTensor; + +/** OpenGL ES kernel which interleaves the elements of a matrix A in chunk of 4x4 + * + * This function puts the values in a 4x4 block of Matrix A on the same row (Interleaved values) + * + * @f[ + * \left( \begin{array}{cccc} + * a00 & a01 & a02 & a03 \\ + * a10 & a11 & a12 & a13 \\ + * a20 & a21 & a22 & a23 \\ + * a30 & a31 & a32 & a33 \\ + * \end{array} \right) + * \rightarrow + * \left( \begin{array}{ccccccccccccccccc} + * a00 & a10 & a20 & a30 & a01 & a11 & a21 & a31 & a02 & a12 & a22 & a32 & a03 & a13 & a23 & a33 \\ + * \end{array} \right) + * @f] + * + * After this operation, the output matrix will have the following shape: [ height * 4, ceil(width / 4.0f) ] + */ +class GCGEMMInterleave4x4Kernel : public IGCKernel +{ +public: + /** Default constructor */ + GCGEMMInterleave4x4Kernel(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + GCGEMMInterleave4x4Kernel(const GCGEMMInterleave4x4Kernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + GCGEMMInterleave4x4Kernel &operator=(const GCGEMMInterleave4x4Kernel &) = delete; + /** Allow instances of this class to be moved */ + GCGEMMInterleave4x4Kernel(GCGEMMInterleave4x4Kernel &&) = default; + /** Allow instances of this class to be moved */ + GCGEMMInterleave4x4Kernel &operator=(GCGEMMInterleave4x4Kernel &&) = default; + /** Initialise the kernel's input and output. + * + * @param[in] input Input tensor. 
Data types supported: F32 + * @param[out] output Output tensor. Data type supported: same as @p input + */ + void configure(const IGCTensor *input, IGCTensor *output); + + // Inherited methods overridden + void run(const Window &window) override; + +private: + const IGCTensor *_input; + IGCTensor *_output; +}; +} +#endif /* __ARM_COMPUTE_GCGEMMINTERLEAVE4X4KERNEL_H__ */ diff --git a/arm_compute/core/GLES_COMPUTE/kernels/GCGEMMMatrixAccumulateBiasesKernel.h b/arm_compute/core/GLES_COMPUTE/kernels/GCGEMMMatrixAccumulateBiasesKernel.h new file mode 100644 index 000000000..77a52b2aa --- /dev/null +++ b/arm_compute/core/GLES_COMPUTE/kernels/GCGEMMMatrixAccumulateBiasesKernel.h @@ -0,0 +1,64 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_GCGEMMMATRIXACCUMULATEBIASESKERNEL_H__ +#define __ARM_COMPUTE_GCGEMMMATRIXACCUMULATEBIASESKERNEL_H__ + +#include "arm_compute/core/GLES_COMPUTE/IGCKernel.h" + +namespace arm_compute +{ +/** Interface to add a bias to each row of the input tensor + * + */ +class GCGEMMMatrixAccumulateBiasesKernel : public IGCKernel +{ +public: + /** Default constructor */ + GCGEMMMatrixAccumulateBiasesKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + GCGEMMMatrixAccumulateBiasesKernel(const GCGEMMMatrixAccumulateBiasesKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + GCGEMMMatrixAccumulateBiasesKernel &operator=(const GCGEMMMatrixAccumulateBiasesKernel &) = delete; + /** Allow instances of this class to be moved */ + GCGEMMMatrixAccumulateBiasesKernel(GCGEMMMatrixAccumulateBiasesKernel &&) = default; + /** Allow instances of this class to be moved */ + GCGEMMMatrixAccumulateBiasesKernel &operator=(GCGEMMMatrixAccumulateBiasesKernel &&) = default; + /** Set the accumulate buffer and the biases of the kernel. + * + * @param[in, out] accum The accumulate tensor to convert. Data types supported: F16/F32 + * @param[in] biases The shared biases tensor to append. It must be 1D tensor. 
Data types supported: Same as @p input + */ + void configure(IGCTensor *accum, const IGCTensor *biases); + + // Inherited methods overridden: + void run(const Window &window) override; + +private: + IGCTensor *_accum; + const IGCTensor *_biases; + gles::NDRange _lws; +}; +} + +#endif /*__ARM_COMPUTE_GCGEMMMATRIXACCUMULATEBIASESKERNEL_H__ */ diff --git a/arm_compute/core/GLES_COMPUTE/kernels/GCGEMMMatrixAdditionKernel.h b/arm_compute/core/GLES_COMPUTE/kernels/GCGEMMMatrixAdditionKernel.h new file mode 100644 index 000000000..02abb8da7 --- /dev/null +++ b/arm_compute/core/GLES_COMPUTE/kernels/GCGEMMMatrixAdditionKernel.h @@ -0,0 +1,70 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_GCGEMMMATRIXADDITIONKERNEL_H__ +#define __ARM_COMPUTE_GCGEMMMATRIXADDITIONKERNEL_H__ + +#include "arm_compute/core/GLES_COMPUTE/IGCKernel.h" + +namespace arm_compute +{ +class IGCTensor; + +/** OpenGL ES kernel to perform the in-place matrix addition between 2 matrices, taking into account that the second matrix might be weighted by a scalar value beta. + * The matrices must have the same dimensions + * + * @note This kernel is computed if and only if beta != 0.0. + */ +class GCGEMMMatrixAdditionKernel : public IGCKernel +{ +public: + /** Default constructor */ + GCGEMMMatrixAdditionKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + GCGEMMMatrixAdditionKernel(const GCGEMMMatrixAdditionKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + GCGEMMMatrixAdditionKernel &operator=(const GCGEMMMatrixAdditionKernel &) = delete; + /** Allow instances of this class to be moved */ + GCGEMMMatrixAdditionKernel(GCGEMMMatrixAdditionKernel &&) = default; + /** Allow instances of this class to be moved */ + GCGEMMMatrixAdditionKernel &operator=(GCGEMMMatrixAdditionKernel &&) = default; + /** Initialise the kernel's input, output and beta value + * + * @note The input and output tensors must have the same dimensions + * + * @param[in] input Input tensor (Matrix C). Data types supported: F32 + * @param[in, out] output Output tensor. If this kernel is used to finalize the GEMM result (alpha * AB + beta * C), output must contain the result obtained by @ref GCGEMMMatrixMultiplyKernel. 
Data type supported: same as @p input + * @param[in] beta Weight of matrix C + */ + void configure(const IGCTensor *input, IGCTensor *output, float beta); + + // Inherited methods overridden: + void run(const Window &window) override; + +private: + const IGCTensor *_input; + IGCTensor *_output; +}; +} + +#endif /* __ARM_COMPUTE_GCGEMMMATRIXADDITIONKERNEL_H__ */ diff --git a/arm_compute/core/GLES_COMPUTE/kernels/GCGEMMMatrixMultiplyKernel.h b/arm_compute/core/GLES_COMPUTE/kernels/GCGEMMMatrixMultiplyKernel.h new file mode 100644 index 000000000..3a0b22f14 --- /dev/null +++ b/arm_compute/core/GLES_COMPUTE/kernels/GCGEMMMatrixMultiplyKernel.h @@ -0,0 +1,79 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_GCGEMMMATRIXMULTIPLYKERNEL_H__ +#define __ARM_COMPUTE_GCGEMMMATRIXMULTIPLYKERNEL_H__ + +#include "arm_compute/core/GLES_COMPUTE/IGCKernel.h" + +namespace arm_compute +{ +class IGCTensor; + +/** GLES Compute kernel to multiply two input matrices "A" and "B" or to multiply a vector "A" by a matrix "B". All elements of the output matrix/vector will be multiplied by alpha + * + * @note If the output tensor is a matrix, the implementation assumes that the input tensors @p input0 and @p input1 are both matrices and reshaped respectively with @ref GCGEMMInterleave4x4Kernel" and @ref GCGEMMTranspose1xWKernel + * @note If the output tensor is a vector and the data type is F32, the implementation assumes that the first input tensor @p input0 is a vector and the second input tensor @p input1 a matrix. 
The implementation also assumes that both tensors have not been reshaped + * + * @attention The second input tensor must have at least 2 dimensions (matrix) + * + */ +class GCGEMMMatrixMultiplyKernel : public IGCKernel +{ +public: + /** Default constructor */ + GCGEMMMatrixMultiplyKernel(); + + /** Prevent instances of this class from being copied (As this class contains pointers) */ + GCGEMMMatrixMultiplyKernel(const GCGEMMMatrixMultiplyKernel &) = delete; + + /** Prevent instances of this class from being copied (As this class contains pointers) */ + GCGEMMMatrixMultiplyKernel &operator=(const GCGEMMMatrixMultiplyKernel &) = delete; + + /** Allow instances of this class to be moved */ + GCGEMMMatrixMultiplyKernel(GCGEMMMatrixMultiplyKernel &&) = default; + + /** Allow instances of this class to be moved */ + GCGEMMMatrixMultiplyKernel &operator=(GCGEMMMatrixMultiplyKernel &&) = default; + + /** Initialise the kernel's input, output and alpha + * + * @param[in] input0 Input tensor containing the interleaved Matrix A or the vector A. Data types supported: F16/F32 + * @param[in] input1 Input tensor containing the transposed Matrix B if the first input tensor A is not a vector. + * If the output tensor is a vector, input1 must contain the matrix B not reshaped. Data type supported: same as @p input0 + * @param[out] output Output tensor to store the result of matrix multiplication. Data type supported: same as @p input0 + * @param[in] alpha Weight of the matrix product + * @param[in] is_interleaved_transposed (Optional) True if input0 and input1 have been reshaped respectively using @ref GCGEMMInterleave4x4Kernel and @ref GCGEMMTranspose1xWKernel + */ + void configure(const IGCTensor *input0, const IGCTensor *input1, IGCTensor *output, float alpha, bool is_interleaved_transposed = true); + + // Inherited methods overridden: + void run(const Window &window) override; + +private: + const IGCTensor *_input0; + const IGCTensor *_input1; + IGCTensor *_output; +}; +} +#endif /* __ARM_COMPUTE_GCGEMMMATRIXMULTIPLYKERNEL_H__ */ diff --git a/arm_compute/core/GLES_COMPUTE/kernels/GCGEMMTranspose1xWKernel.h b/arm_compute/core/GLES_COMPUTE/kernels/GCGEMMTranspose1xWKernel.h new file mode 100644 index 000000000..4223556ac --- /dev/null +++ b/arm_compute/core/GLES_COMPUTE/kernels/GCGEMMTranspose1xWKernel.h @@ -0,0 +1,67 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
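Illustrative aside, not part of this patch: the interleave, transpose and multiply kernels above only change how A and B are laid out for the GLES shader; the arithmetic they ultimately implement is the plain product alpha * A * B (with GCGEMMMatrixAdditionKernel subsequently adding beta * C when needed). A minimal row-major reference with hypothetical names:

#include <cstddef>
#include <vector>

// Naive reference GEMM: C = alpha * A * B, with A of size MxK and B of size KxN,
// both stored row-major. The GC kernels operate on reshaped (interleaved /
// 1xW-transposed) copies of A and B, but the arithmetic is the same.
void gemm_reference_sketch(const std::vector<float> &a, const std::vector<float> &b,
                           std::vector<float> &c, std::size_t m, std::size_t n, std::size_t k, float alpha)
{
    c.assign(m * n, 0.0f);
    for(std::size_t row = 0; row < m; ++row)
    {
        for(std::size_t col = 0; col < n; ++col)
        {
            float acc = 0.0f;
            for(std::size_t i = 0; i < k; ++i)
            {
                acc += a[row * k + i] * b[i * n + col];
            }
            c[row * n + col] = alpha * acc;
        }
    }
}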
+ */ +#ifndef __ARM_COMPUTE_GCGEMMTRANSPOSE1XWKERNEL_H__ +#define __ARM_COMPUTE_GCGEMMTRANSPOSE1XWKERNEL_H__ + +#include "arm_compute/core/GLES_COMPUTE/IGCSimple2DKernel.h" + +namespace arm_compute +{ +class IGCTensor; + +/** OpenGLES kernel which transposes the elements of a matrix in chunks of 1xW, where W is equal to (16 / element size of the tensor) + * + * Following an example of how the transposition1xW works when the input data type is F32 + * + * @f[ + * \left( \begin{array}{cccc} + * a00 & a01 & a02 & a03 \\ + * a10 & a11 & a12 & a13 \\ + * a20 & a21 & a22 & a23 \\ + * a30 & a31 & a32 & a33 \\ + * \end{array} \right) + * \rightarrow + * \left( \begin{array}{ccccccccccccccccc} + * a00 & a01 & a02 & a03 & a10 & a11 & a12 & a13 & a20 & a21 & a22 & a23 & a30 & a31 & a32 & a33 \\ + * \end{array} \right) + * @f] + * + * @note The output matrix will have the following shape: [ height * W, ceil(width / W) ], where W = (16 / element size of the tensor) + * + */ +class GCGEMMTranspose1xWKernel : public IGCSimple2DKernel +{ +public: + /** Initialise the kernel's input and output. + * + * @param[in] input Input tensor. Data types supported: F32 + * @param[out] output Output tensor. Data type supported: same as @p input + */ + void configure(const IGCTensor *input, IGCTensor *output); + + // Inherited methods overridden: + void run(const Window &window) override; +}; +} +#endif /* __ARM_COMPUTE_GCGEMMTRANSPOSE1XWKERNEL_H__ */ diff --git a/arm_compute/core/GLES_COMPUTE/kernels/GCIm2ColKernel.h b/arm_compute/core/GLES_COMPUTE/kernels/GCIm2ColKernel.h new file mode 100644 index 000000000..e1b35607f --- /dev/null +++ b/arm_compute/core/GLES_COMPUTE/kernels/GCIm2ColKernel.h @@ -0,0 +1,109 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_GCIM2COLKERNEL_H__ +#define __ARM_COMPUTE_GCIM2COLKERNEL_H__ + +#include "arm_compute/core/GLES_COMPUTE/IGCKernel.h" + +namespace arm_compute +{ +class IGCTensor; + +/** Interface for the im2col reshape kernel. + * + * Rearranges image blocks into columns. It is used to strip out each convolution block to a single column. + * It is used to transform a convolution to a plain matrix multiplication. 
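Illustrative aside on the 1xW transposition described for GCGEMMTranspose1xWKernel above, not part of this patch: a standalone sketch of the reshaping, assuming row-major storage and W = 16 / element size (so W = 4 for F32), producing the [ height * W, ceil(width / W) ] output shape from the class comment. Names are hypothetical.

#include <cstddef>
#include <vector>

// 1xW transposition: W consecutive elements of an input row form one chunk; chunk
// b of input row y is written to output row b, at columns [y * W, y * W + W).
// For F32, W = 16 / sizeof(float) = 4, matching the worked example above.
std::vector<float> transpose_1xw_sketch(const std::vector<float> &in,
                                        std::size_t width, std::size_t height, std::size_t w_chunk)
{
    const std::size_t out_width  = height * w_chunk;
    const std::size_t out_height = (width + w_chunk - 1) / w_chunk;
    std::vector<float> out(out_width * out_height, 0.0f); // padded with zeros if width % W != 0
    for(std::size_t y = 0; y < height; ++y)
    {
        for(std::size_t x = 0; x < width; ++x)
        {
            const std::size_t block = x / w_chunk; // which 1xW chunk of the row
            const std::size_t lane  = x % w_chunk; // position inside the chunk
            out[block * out_width + y * w_chunk + lane] = in[y * width + x];
        }
    }
    return out;
}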
+ * + * For example taking into account the image below and assuming 3x3 image blocks with stride of 1 we have: + * @f[ + * \left( \begin{array}{cccc} + * a00 & a01 & a02 & a03 \\ + * a10 & a11 & a12 & a13 \\ + * a20 & a21 & a22 & a23 \\ + * a30 & a31 & a32 & a33 \\ + * \end{array} \right) + * = + * \left( \begin{array}{ccccccccc} + * a00 & a01 & a02 & a10 & a11 & a12 & a20 & a21 & a22 \\ + * a01 & a02 & a03 & a11 & a12 & a13 & a21 & a22 & a23 \\ + * a10 & a11 & a12 & a20 & a21 & a22 & a30 & a31 & a32 \\ + * a11 & a12 & a13 & a21 & a22 & a23 & a31 & a32 & a33 \\ + * \end{array} \right) + * @f] + */ +class GCIm2ColKernel : public IGCKernel +{ +public: + /** Default constructor */ + GCIm2ColKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + GCIm2ColKernel(const GCIm2ColKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + GCIm2ColKernel &operator=(const GCIm2ColKernel &) = delete; + /** Allow instances of this class to be moved */ + GCIm2ColKernel(GCIm2ColKernel &&) = default; + /** Allow instances of this class to be moved */ + GCIm2ColKernel &operator=(GCIm2ColKernel &&) = default; + /** Set the input and output of the kernel. + * + * @param[in] input The input tensor to convert. 3 lower dimensions represent a single input [width, height, IFM], + * while every optional dimension from 4 and above represent a batch of inputs. Data types supported: F32 + * @param[out] output The output tensor. First 2 lower dimensions represent a transform of each 3D input, + * while every dimension above represents a batch. Data types supported: Same as @p input + * @param[in] kernel_dims The kernel dimensions (width and height). + * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. + * @param[in] has_bias In case biases are provided expands the matrix with 1. + */ + void configure(const IGCTensor *input, IGCTensor *output, std::pair<unsigned int, unsigned int> kernel_dims, const PadStrideInfo &conv_info, bool has_bias); + + // Inherited methods overridden: + void run(const Window &window) override; + +private: + /** Run the reshape kernel optimised for the special case (stride is 1, padding is 0 and kernel's low 3 dimensions are same as input) + * + * @param[in] window Region on which to execute the kernel. (Must be a valid region of the window returned by window()). + * @param[in,out] queue Command queue on which to enqueue the kernel. + */ + void run_reduced(const Window &window); + /** run the generic convolution layer input reshape kernel + * + * @param[in] window Region on which to execute the kernel. (Must be a valid region of the window returned by window()). + * @param[in,out] queue Command queue on which to enqueue the kernel. 
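Illustrative aside, not part of this patch: a standalone reference of the im2col rearrangement documented above, restricted for brevity to a single input channel and zero padding; has_bias appends a trailing 1 to each patch so the bias can be folded into the following matrix multiplication. Names are hypothetical.

#include <cstddef>
#include <vector>

// Reference im2col for one channel, no padding: following the worked example above,
// every kernel-sized patch of the image becomes one row of the produced matrix.
std::vector<float> im2col_sketch(const std::vector<float> &image, std::size_t width, std::size_t height,
                                 std::size_t kernel_w, std::size_t kernel_h,
                                 std::size_t stride_x, std::size_t stride_y, bool has_bias)
{
    const std::size_t out_w    = (width - kernel_w) / stride_x + 1;
    const std::size_t out_h    = (height - kernel_h) / stride_y + 1;
    const std::size_t row_size = kernel_w * kernel_h + (has_bias ? 1 : 0);
    std::vector<float> columns;
    columns.reserve(out_w * out_h * row_size);
    for(std::size_t oy = 0; oy < out_h; ++oy)
    {
        for(std::size_t ox = 0; ox < out_w; ++ox)
        {
            for(std::size_t ky = 0; ky < kernel_h; ++ky)
            {
                for(std::size_t kx = 0; kx < kernel_w; ++kx)
                {
                    columns.push_back(image[(oy * stride_y + ky) * width + (ox * stride_x + kx)]);
                }
            }
            if(has_bias)
            {
                columns.push_back(1.0f); // bias column
            }
        }
    }
    return columns;
}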
+ */ + void run_generic(const Window &window); + + /** Common signature for the kernel to run */ + using Im2ColFunction = void (GCIm2ColKernel::*)(const Window &); + +private: + const IGCTensor *_input; + IGCTensor *_output; + std::pair<unsigned int, unsigned int> _convolved_dims; + unsigned int _num_elems_processed_per_iteration; + Im2ColFunction _run_func; +}; +} + +#endif /*__ARM_COMPUTE_GCIM2COLKERNEL_H__ */ diff --git a/arm_compute/core/GLES_COMPUTE/kernels/GCNormalizationLayerKernel.h b/arm_compute/core/GLES_COMPUTE/kernels/GCNormalizationLayerKernel.h new file mode 100644 index 000000000..e8bc7ad2b --- /dev/null +++ b/arm_compute/core/GLES_COMPUTE/kernels/GCNormalizationLayerKernel.h @@ -0,0 +1,72 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_GCNORMALIZATIONLAYERKERNEL_H__ +#define __ARM_COMPUTE_GCNORMALIZATIONLAYERKERNEL_H__ + +#include "arm_compute/core/GLES_COMPUTE/IGCKernel.h" + +namespace arm_compute +{ +class IGCTensor; + +/** Interface for the normalization layer kernel. + */ +class GCNormalizationLayerKernel : public IGCKernel +{ +public: + /** Constructor */ + GCNormalizationLayerKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + GCNormalizationLayerKernel(const GCNormalizationLayerKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + GCNormalizationLayerKernel &operator=(const GCNormalizationLayerKernel &) = delete; + /** Default Move Constructor. */ + GCNormalizationLayerKernel(GCNormalizationLayerKernel &&) = default; + /** Default move assignment operator. */ + GCNormalizationLayerKernel &operator=(GCNormalizationLayerKernel &&) = default; + /** Default destrutor */ + ~GCNormalizationLayerKernel() = default; + /** Set the input and output tensors. + * + * @param[in] input Source tensor. 3 lower dims represent a single input with dimensions [width, height, IFM], + * and an optional 4th dimension for batch of inputs. Data types supported: F32. + * @param[in] squared_input Source with each element has been squared. 3 lower dims represent a single input with dimensions [width, height, IFM], + * Data types should match the input type. + * @param[out] output Destination tensor. Output will have the same number of dimensions as input. Data types should match the input type. 
+ * @param[in] norm_info Normalization layer information like the normalization type, normalization size and other parameters. + */ + void configure(const IGCTensor *input, const IGCTensor *squared_input, IGCTensor *output, NormalizationLayerInfo norm_info); + + // Inherited methods overridden: + void run(const Window &window) override; + BorderSize border_size() const override; + +private: + const IGCTensor *_input; + const IGCTensor *_squared_input; + IGCTensor *_output; + BorderSize _border_size; +}; +} +#endif /*__ARM_COMPUTE_GCNORMALIZATIONLAYERKERNEL_H__ */ diff --git a/arm_compute/core/GLES_COMPUTE/kernels/GCPixelWiseMultiplicationKernel.h b/arm_compute/core/GLES_COMPUTE/kernels/GCPixelWiseMultiplicationKernel.h new file mode 100644 index 000000000..3b01b4ad4 --- /dev/null +++ b/arm_compute/core/GLES_COMPUTE/kernels/GCPixelWiseMultiplicationKernel.h @@ -0,0 +1,70 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_GCPIXELWISEMULTIPLICATIONKERNEL_H__ +#define __ARM_COMPUTE_GCPIXELWISEMULTIPLICATIONKERNEL_H__ + +#include "arm_compute/core/GLES_COMPUTE/IGCKernel.h" +#include "arm_compute/core/Types.h" + +namespace arm_compute +{ +class IGCTensor; + +/** Interface for the pixelwise multiplication kernel. + * + */ +class GCPixelWiseMultiplicationKernel : public IGCKernel +{ +public: + /** Default constructor.*/ + GCPixelWiseMultiplicationKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers). */ + GCPixelWiseMultiplicationKernel(const GCPixelWiseMultiplicationKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers). */ + GCPixelWiseMultiplicationKernel &operator=(const GCPixelWiseMultiplicationKernel &) = delete; + /** Allow instances of this class to be moved */ + GCPixelWiseMultiplicationKernel(GCPixelWiseMultiplicationKernel &&) = default; + /** Allow instances of this class to be moved */ + GCPixelWiseMultiplicationKernel &operator=(GCPixelWiseMultiplicationKernel &&) = default; + /** Initialise the kernel's input, output and border mode. + * + * @param[in] input1 An input tensor. Data types supported: F32. + * @param[in] input2 An input tensor. Data types supported: same as @p input1. + * @param[out] output The output tensor, Data types supported: same as @p input1. + * @param[in] scale Scale to apply after multiplication. 
+ * Scale must be positive and its value must be either 1/255 or 1/2^n where n is between 0 and 15. + */ + void configure(const IGCTensor *input1, const IGCTensor *input2, IGCTensor *output, float scale); + + // Inherited methods overridden: + void run(const Window &window) override; + +private: + const IGCTensor *_input1; + const IGCTensor *_input2; + IGCTensor *_output; +}; +} + +#endif /*__ARM_COMPUTE_GCPIXELWISEMULTIPLICATIONKERNEL_H__ */ diff --git a/arm_compute/core/GLES_COMPUTE/kernels/GCPoolingLayerKernel.h b/arm_compute/core/GLES_COMPUTE/kernels/GCPoolingLayerKernel.h new file mode 100644 index 000000000..d4921c209 --- /dev/null +++ b/arm_compute/core/GLES_COMPUTE/kernels/GCPoolingLayerKernel.h @@ -0,0 +1,70 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_GCPOOLINGLAYERKERNEL_H__ +#define __ARM_COMPUTE_GCPOOLINGLAYERKERNEL_H__ + +#include "arm_compute/core/GLES_COMPUTE/IGCKernel.h" + +namespace arm_compute +{ +class IGCTensor; + +/** Interface for the pooling layer kernel */ +class GCPoolingLayerKernel : public IGCKernel +{ +public: + /** Default constructor */ + GCPoolingLayerKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + GCPoolingLayerKernel(const GCPoolingLayerKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + GCPoolingLayerKernel &operator=(const GCPoolingLayerKernel &) = delete; + /** Allow instances of this class to be moved */ + GCPoolingLayerKernel(GCPoolingLayerKernel &&) = default; + /** Allow instances of this class to be moved */ + GCPoolingLayerKernel &operator=(GCPoolingLayerKernel &&) = default; + /** Default destructor */ + ~GCPoolingLayerKernel() = default; + + /** Set the input and output tensors. + * + * @param[in] input Source tensor. Data types supported: F16/F32. + * @param[out] output Destination tensor. Data types supported: Same as @p input. + * @param[in] pool_info Contains pooling operation information described in @ref PoolingLayerInfo. 
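Illustrative aside, not part of this patch: a standalone sketch of the pooling traversal that PoolingLayerInfo parameterises for the kernel above (pool size and stride; padding and average pooling are omitted for brevity). Names are hypothetical.

#include <algorithm>
#include <cstddef>
#include <limits>
#include <vector>

// Reference 2D max pooling over a single feature map, no padding: each output
// element is the maximum of a pool_size x pool_size window moved with the given
// stride. Average pooling follows the same traversal with a sum and a division.
std::vector<float> max_pool_sketch(const std::vector<float> &in, std::size_t width, std::size_t height,
                                   std::size_t pool_size, std::size_t stride)
{
    const std::size_t out_w = (width - pool_size) / stride + 1;
    const std::size_t out_h = (height - pool_size) / stride + 1;
    std::vector<float> out(out_w * out_h, 0.0f);
    for(std::size_t oy = 0; oy < out_h; ++oy)
    {
        for(std::size_t ox = 0; ox < out_w; ++ox)
        {
            float max_value = std::numeric_limits<float>::lowest();
            for(std::size_t ky = 0; ky < pool_size; ++ky)
            {
                for(std::size_t kx = 0; kx < pool_size; ++kx)
                {
                    max_value = std::max(max_value, in[(oy * stride + ky) * width + (ox * stride + kx)]);
                }
            }
            out[oy * out_w + ox] = max_value;
        }
    }
    return out;
}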
+ */ + void configure(const IGCTensor *input, IGCTensor *output, const PoolingLayerInfo &pool_info); + + // Inherited methods overridden: + void run(const Window &window) override; + BorderSize border_size() const override; + +private: + const IGCTensor *_input; + IGCTensor *_output; + PoolingLayerInfo _pool_info; + BorderSize _border_size; + unsigned int _num_elems_processed_per_iteration; +}; +} // namespace arm_compute +#endif /*__ARM_COMPUTE_GCPOOLINGLAYERKERNEL_H__ */ diff --git a/arm_compute/core/GLES_COMPUTE/kernels/GCSoftmaxLayerKernel.h b/arm_compute/core/GLES_COMPUTE/kernels/GCSoftmaxLayerKernel.h new file mode 100644 index 000000000..483e19b21 --- /dev/null +++ b/arm_compute/core/GLES_COMPUTE/kernels/GCSoftmaxLayerKernel.h @@ -0,0 +1,109 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_GCSOFTMAXLAYERKERNEL_H__ +#define __ARM_COMPUTE_GCSOFTMAXLAYERKERNEL_H__ + +#include "arm_compute/core/GLES_COMPUTE/IGCSimple3DKernel.h" + +namespace arm_compute +{ +class IGCTensor; + +/** Interface for the identifying the max value of 1D Logits */ +class GCLogits1DMaxKernel : public IGCSimple3DKernel +{ +public: + /** Set the input and output tensors. + * + * @param[in] input Source tensor. Data types supported: F16/F32 + * @param[out] output Destination tensor. Data types supported: same as @p input + */ + void configure(const IGCTensor *input, IGCTensor *output); +}; + +/** Interface for shifting the logits values around the max value and exponentiating the result */ +class GCLogits1DShiftExpSumKernel : public IGCKernel +{ +public: + /** Default constructor */ + GCLogits1DShiftExpSumKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + GCLogits1DShiftExpSumKernel(const GCLogits1DShiftExpSumKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + GCLogits1DShiftExpSumKernel &operator=(const GCLogits1DShiftExpSumKernel &) = delete; + /** Allow instances of this class to be moved */ + GCLogits1DShiftExpSumKernel(GCLogits1DShiftExpSumKernel &&) = default; + /** Allow instances of this class to be moved */ + GCLogits1DShiftExpSumKernel &operator=(GCLogits1DShiftExpSumKernel &&) = default; + /** Set the input and output tensors. + * + * @param[in] input Source tensor. Data types supported: F16/F32 + * @param[in] max Max values tensor. 
Data types supported: same as @p input + * @param[out] output Destination tensor. Data types supported: same as @p input + * @param[out] sum Sum of 1D logits tensor. Data types supported: same as @p input + */ + void configure(const IGCTensor *input, const IGCTensor *max, IGCTensor *output, IGCTensor *sum); + + // Inherited methods overridden: + void run(const Window &window) override; + +private: + const IGCTensor *_input; + const IGCTensor *_max; + IGCTensor *_output; + IGCTensor *_sum; +}; + +/** Interface for calculating the final step of the Softmax Layer where each logit value is multiplied by the inverse of the sum of the logits. */ +class GCLogits1DNormKernel : public IGCKernel +{ +public: + /** Default constructor */ + GCLogits1DNormKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + GCLogits1DNormKernel(const GCLogits1DNormKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + GCLogits1DNormKernel &operator=(const GCLogits1DNormKernel &) = delete; + /** Allow instances of this class to be moved */ + GCLogits1DNormKernel(GCLogits1DNormKernel &&) = default; + /** Allow instances of this class to be moved */ + GCLogits1DNormKernel &operator=(GCLogits1DNormKernel &&) = default; + /** Set the input and output tensors. + * + * @param[in] input Source tensor. Data types supported: F16/F32 + * @param[in] sum Sum tensor. Dimensions should be dim(input)-1. Data types supported: same as @p input + * @param[out] output Destination tensor. Data types supported: same as @p input + */ + void configure(const IGCTensor *input, const IGCTensor *sum, IGCTensor *output); + + // Inherited methods overridden: + void run(const Window &window) override; + +private: + const IGCTensor *_input; + const IGCTensor *_sum; + IGCTensor *_output; +}; +} // namespace arm_compute +#endif /*__ARM_COMPUTE_GCSOFTMAXLAYERKERNEL_H__ */ diff --git a/arm_compute/core/GLES_COMPUTE/kernels/GCTransposeKernel.h b/arm_compute/core/GLES_COMPUTE/kernels/GCTransposeKernel.h new file mode 100644 index 000000000..c628a0058 --- /dev/null +++ b/arm_compute/core/GLES_COMPUTE/kernels/GCTransposeKernel.h @@ -0,0 +1,52 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
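Illustrative aside, not part of this patch: the three softmax kernels above split the computation into a row maximum, shifted exponentials with their sum, and a final normalisation. A standalone sketch of the same numerically stable sequence for a single row, with hypothetical names:

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <vector>

// Numerically stable softmax over one non-empty row, mirroring the three-kernel
// split above: 1) find the row maximum, 2) exponentiate the shifted values and
// accumulate their sum, 3) normalise by the sum.
std::vector<float> softmax_row_sketch(const std::vector<float> &logits)
{
    const float max_value = *std::max_element(logits.begin(), logits.end());
    std::vector<float> result(logits.size());
    float sum = 0.0f;
    for(std::size_t i = 0; i < logits.size(); ++i)
    {
        result[i] = std::exp(logits[i] - max_value); // shift by the max for stability
        sum += result[i];
    }
    for(float &value : result)
    {
        value /= sum;
    }
    return result;
}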
+ */ +#ifndef __ARM_COMPUTE_GCTRANSPOSEKERNEL_H__ +#define __ARM_COMPUTE_GCTRANSPOSEKERNEL_H__ + +#include "arm_compute/core/GLES_COMPUTE/IGCSimple2DKernel.h" + +namespace arm_compute +{ +class IGCTensor; + +/** OpenGL ES kernel which transposes the elements of a matrix. + * + * [width, height, batch] -> [height, width, batch] + * + */ +class GCTransposeKernel : public IGCSimple2DKernel +{ +public: + /** Initialise the kernel's input and output. + * + * @param[in] input Input tensor. Data types supported: F16/F32 + * @param[out] output Output tensor. Data type supported: Same as @p input + */ + void configure(const IGCTensor *input, IGCTensor *output); + + // Inherited methods overridden: + void run(const Window &window) override; +}; +} +#endif /* __ARM_COMPUTE_GCTRANSPOSEKERNEL_H__ */ diff --git a/arm_compute/core/Helpers.h b/arm_compute/core/Helpers.h index 6e4d98718..c02f14aec 100644 --- a/arm_compute/core/Helpers.h +++ b/arm_compute/core/Helpers.h @@ -33,6 +33,7 @@ #include "arm_compute/core/TensorShape.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Window.h" +#include "arm_compute/core/utils/misc/utility.h" #include <array> #include <cstddef> @@ -116,6 +117,57 @@ inline T delta_bilinear_c1(const T *pixel_ptr, size_t stride, float dx, float dy return static_cast<T>(a00 * w1 + a01 * w2 + a10 * w3 + a11 * w4); } +/** Computes linear interpolation using the pointer to the top pixel and the pixel's distance between + * the real coordinates and the smallest following integer coordinates. Input must be in single channel format. + * + * @param[in] pixel_ptr Pointer to the top pixel value of a single channel input. + * @param[in] stride Stride to access the bottom pixel value + * @param[in] dy Pixel's distance between the Y real coordinate and the smallest Y following integer + * + * @note dy must be in the range [0, 1.0] + * + * @return The linear interpolated pixel value + */ +template <typename T> +inline T delta_linear_c1_y(const T *pixel_ptr, size_t stride, float dy) +{ + ARM_COMPUTE_ERROR_ON(pixel_ptr == nullptr); + + const float dy1 = 1.0f - dy; + + const T a00 = *pixel_ptr; + const T a10 = *(pixel_ptr + stride); + + const float w1 = dy1; + const float w3 = dy; + + return static_cast<T>(a00 * w1 + a10 * w3); +} +/** Computes linear interpolation using the pointer to the left pixel and the pixel's distance between + * the real coordinates and the smallest following integer coordinates. Input must be in single channel format. + * + * @param[in] pixel_ptr Pointer to the left pixel value of a single channel input. + * @param[in] dx Pixel's distance between the X real coordinate and the smallest X following integer + * + * @note dx must be in the range [0, 1.0] + * + * @return The linear interpolated pixel value + */ +template <typename T> +inline T delta_linear_c1_x(const T *pixel_ptr, float dx) +{ + ARM_COMPUTE_ERROR_ON(pixel_ptr == nullptr); + + const T a00 = *pixel_ptr; + const T a01 = *(pixel_ptr + 1); + + const float dx1 = 1.0f - dx; + + const float w1 = dx1; + const float w2 = dx; + + return static_cast<T>(a00 * w1 + a01 * w2); +} /** Return the pixel at (x,y) using bilinear interpolation. 
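Illustrative aside, not part of this patch: the new delta_linear_c1_x / delta_linear_c1_y helpers above are the one-dimensional special cases of the bilinear blend. A scalar sketch, assuming the usual bilinear weights (the full weight computation of delta_bilinear_c1 is not restated in this excerpt):

// Scalar form of the 2x2 bilinear blend, assuming weights
// w1 = (1 - dx) * (1 - dy), w2 = dx * (1 - dy), w3 = (1 - dx) * dy, w4 = dx * dy.
// With dx == 0 it reduces to delta_linear_c1_y, with dy == 0 to delta_linear_c1_x.
float bilinear_sketch(float a00, float a01, float a10, float a11, float dx, float dy)
{
    const float dx1 = 1.0f - dx;
    const float dy1 = 1.0f - dy;
    return a00 * dx1 * dy1 + a01 * dx * dy1 + a10 * dx1 * dy + a11 * dx * dy;
}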
* * @warning Only works if the iterator was created with an IImage @@ -168,6 +220,18 @@ inline uint8_t pixel_bilinear_c1_clamp(const T *first_pixel_ptr, size_t stride, const float dx = x - xi; const float dy = y - yi; + if(dx == 0.0f) + { + if(dy == 0.0f) + { + return static_cast<T>(first_pixel_ptr[static_cast<int32_t>(xi) + static_cast<int32_t>(yi) * stride]); + } + return delta_linear_c1_y(first_pixel_ptr + static_cast<int32_t>(xi) + static_cast<int32_t>(yi) * stride, stride, dy); + } + if(dy == 0.0f) + { + return delta_linear_c1_x(first_pixel_ptr + static_cast<int32_t>(xi) + static_cast<int32_t>(yi) * stride, dx); + } return delta_bilinear_c1(first_pixel_ptr + static_cast<int32_t>(xi) + static_cast<int32_t>(yi) * stride, stride, dx, dy); } @@ -459,6 +523,23 @@ inline Strides compute_strides(const ITensorInfo &info) return compute_strides(info, info.element_size()); } +/** Permutes given Dimensions according to a permutation vector + * + * @warning Validity of permutation is not checked + * + * @param[in, out] dimensions Dimensions to permute + * @param[in] perm Permutation vector + */ +template <typename T> +inline void permute(Dimensions<T> &dimensions, const PermutationVector &perm) +{ + auto copy_dimensions = utility::make_array<Dimensions<T>::num_max_dimensions>(dimensions.begin(), dimensions.end()); + for(unsigned int i = 0; i < perm.num_dimensions(); ++i) + { + dimensions[i] = copy_dimensions[perm[i]]; + } +} + /* Auto initialize the tensor info (shape, number of channels, data type and fixed point position) if the current assignment is empty. * * @param[in,out] info Tensor info used to check and assign. @@ -466,10 +547,24 @@ inline Strides compute_strides(const ITensorInfo &info) * @param[in] num_channels New number of channels. * @param[in] data_type New data type * @param[in] fixed_point_position New fixed point position + * @param[in] quantization_info (Optional) New quantization info + * + * @return True if the tensor info has been initialized + */ +bool auto_init_if_empty(ITensorInfo &info, + const TensorShape &shape, + int num_channels, DataType data_type, + int fixed_point_position, + QuantizationInfo quantization_info = QuantizationInfo()); + +/** Auto initialize the tensor info using another tensor info. + * + * @param info_sink Tensor info used to check and assign + * @param info_source Tensor info used to assign * * @return True if the tensor info has been initialized */ -bool auto_init_if_empty(ITensorInfo &info, const TensorShape &shape, int num_channels, DataType data_type, int fixed_point_position); +bool auto_init_if_empty(ITensorInfo &info_sink, const ITensorInfo &info_source); /* Set the shape to the specified value if the current assignment is empty. * @@ -509,6 +604,17 @@ bool set_data_type_if_unknown(ITensorInfo &info, DataType data_type); * @return True if the fixed point position has been changed. */ bool set_fixed_point_position_if_zero(ITensorInfo &info, int fixed_point_position); + +/* Set the quantization info to the specified value if + * the current quantization info is empty and the data type of asymmetric quantized type + * + * @param[in,out] info Tensor info used to check and assign. + * @param[in] quantization_info Quantization info + * + * @return True if the quantization info has been changed. + */ +bool set_quantization_info_if_empty(ITensorInfo &info, QuantizationInfo quantization_info); + /** Helper function to calculate the Valid Region for Scale. * * @param[in] src_info Input tensor info used to check. 
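Illustrative aside, not part of this patch: the new permute() helper above applies dimensions[i] = copy_dimensions[perm[i]]. The same index mapping on a plain std::array, with hypothetical names:

#include <array>
#include <cstddef>

// out[i] = in[perm[i]]; e.g. perm = {2, 0, 1} applied to {W, H, C} yields {C, W, H}.
// Like the helper above, the validity of perm is not checked.
template <std::size_t N>
std::array<std::size_t, N> permute_sketch(const std::array<std::size_t, N> &dims,
                                          const std::array<std::size_t, N> &perm)
{
    std::array<std::size_t, N> out{};
    for(std::size_t i = 0; i < N; ++i)
    {
        out[i] = dims[perm[i]];
    }
    return out;
}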
@@ -520,6 +626,7 @@ bool set_fixed_point_position_if_zero(ITensorInfo &info, int fixed_point_positio * @return The corrispondent valid region */ ValidRegion calculate_valid_region_scale(const ITensorInfo &src_info, const TensorShape &dst_shape, InterpolationPolicy policy, BorderSize border_size, bool border_undefined); + /** Convert a linear index into n-dimensional coordinates. * * @param[in] shape Shape of the n-dimensional tensor. @@ -528,6 +635,7 @@ ValidRegion calculate_valid_region_scale(const ITensorInfo &src_info, const Tens * @return n-dimensional coordinates. */ inline Coordinates index2coords(const TensorShape &shape, int index); + /** Convert n-dimensional coordinates into a linear index. * * @param[in] shape Shape of the n-dimensional tensor. diff --git a/arm_compute/core/Helpers.inl b/arm_compute/core/Helpers.inl index de6c85ec7..367269281 100644 --- a/arm_compute/core/Helpers.inl +++ b/arm_compute/core/Helpers.inl @@ -197,7 +197,12 @@ inline void Iterator::reset(const size_t dimension) } } -inline bool auto_init_if_empty(ITensorInfo &info, const TensorShape &shape, int num_channels, DataType data_type, int fixed_point_position) +inline bool auto_init_if_empty(ITensorInfo &info, + const TensorShape &shape, + int num_channels, + DataType data_type, + int fixed_point_position, + QuantizationInfo quantization_info) { if(info.tensor_shape().total_size() == 0) { @@ -205,6 +210,22 @@ inline bool auto_init_if_empty(ITensorInfo &info, const TensorShape &shape, int info.set_num_channels(num_channels); info.set_tensor_shape(shape); info.set_fixed_point_position(fixed_point_position); + info.set_quantization_info(quantization_info); + return true; + } + + return false; +} + +inline bool auto_init_if_empty(ITensorInfo &info_sink, const ITensorInfo &info_source) +{ + if(info_sink.tensor_shape().total_size() == 0) + { + info_sink.set_data_type(info_source.data_type()); + info_sink.set_num_channels(info_source.num_channels()); + info_sink.set_tensor_shape(info_source.tensor_shape()); + info_sink.set_fixed_point_position(info_source.fixed_point_position()); + info_sink.set_quantization_info(info_source.quantization_info()); return true; } @@ -255,6 +276,17 @@ inline bool set_fixed_point_position_if_zero(ITensorInfo &info, int fixed_point_ return false; } +inline bool set_quantization_info_if_empty(ITensorInfo &info, QuantizationInfo quantization_info) +{ + if(info.quantization_info().empty() && (is_data_type_quantized_asymmetric(info.data_type()))) + { + info.set_quantization_info(quantization_info); + return true; + } + + return false; +} + inline ValidRegion calculate_valid_region_scale(const ITensorInfo &src_info, const TensorShape &dst_shape, InterpolationPolicy policy, BorderSize border_size, bool border_undefined) { const auto wr = static_cast<float>(dst_shape[0]) / static_cast<float>(src_info.tensor_shape()[0]); diff --git a/arm_compute/core/IArray.h b/arm_compute/core/IArray.h index 960e18f3d..bc01df981 100644 --- a/arm_compute/core/IArray.h +++ b/arm_compute/core/IArray.h @@ -124,7 +124,7 @@ public: /** Resizes the array to contain "num" elements. If "num" is smaller than the maximum array size, the content is reduced to its first "num" elements, * "num" elements can't be bigger than the maximum number of values which can be stored in this array. 
* - * @param[in] num The new array size in number of elements + * @param[in] num The new array size in number of elements */ void resize(size_t num) { diff --git a/arm_compute/core/IMultiHOG.h b/arm_compute/core/IMultiHOG.h index e91da7539..5e9ee3a4f 100644 --- a/arm_compute/core/IMultiHOG.h +++ b/arm_compute/core/IMultiHOG.h @@ -43,14 +43,14 @@ public: virtual size_t num_models() const = 0; /** Return a pointer to the requested HOG model * - * @param[in] index The index of the wanted HOG model. + * @param[in] index The index of the wanted HOG model. * * @return A pointer pointed to the HOG model */ virtual IHOG *model(size_t index) = 0; /** Return a const pointer to the requested HOG model * - * @param[in] index The index of the wanted HOG model. + * @param[in] index The index of the wanted HOG model. * * @return A const pointer pointed to the HOG model */ diff --git a/arm_compute/core/IMultiImage.h b/arm_compute/core/IMultiImage.h index 6ed3c785c..0d11c2c6b 100644 --- a/arm_compute/core/IMultiImage.h +++ b/arm_compute/core/IMultiImage.h @@ -43,14 +43,14 @@ public: virtual const MultiImageInfo *info() const = 0; /** Return a pointer to the requested plane of the image. * - * @param[in] index The index of the wanted planed. + * @param[in] index The index of the wanted planed. * * @return A pointer pointed to the plane */ virtual IImage *plane(unsigned int index) = 0; /** Return a constant pointer to the requested plane of the image. * - * @param[in] index The index of the wanted planed. + * @param[in] index The index of the wanted planed. * * @return A constant pointer pointed to the plane */ diff --git a/arm_compute/core/ITensorInfo.h b/arm_compute/core/ITensorInfo.h index bb3ac6e35..9a67712f3 100644 --- a/arm_compute/core/ITensorInfo.h +++ b/arm_compute/core/ITensorInfo.h @@ -29,13 +29,14 @@ #include "arm_compute/core/TensorShape.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Utils.h" +#include "arm_compute/core/utils/misc/ICloneable.h" #include <cstddef> namespace arm_compute { /** Store the tensor's metadata */ -class ITensorInfo +class ITensorInfo : public misc::ICloneable<ITensorInfo> { public: /** Default virtual destructor */ @@ -45,15 +46,19 @@ public: * @warning This resets the format to UNKNOWN. * * @param[in] data_type The new data type. + * + * @return Reference to this ITensorInfo object */ - virtual void set_data_type(DataType data_type) = 0; + virtual ITensorInfo &set_data_type(DataType data_type) = 0; /** Set the number of channels to the specified value. * * @warning This resets the format to UNKNOWN. * * @param[in] num_channels New number of channels. + * + * @return Reference to this ITensorInfo object */ - virtual void set_num_channels(int num_channels) = 0; + virtual ITensorInfo &set_num_channels(int num_channels) = 0; /** Set the format of an already initialized tensor. * * @note If the data type has already been configured (i.e. not UNKNOWN) it @@ -61,23 +66,41 @@ public: * be based on the format. * * @param[in] format Single-plane format of the tensor. + * + * @return Reference to this ITensorInfo object */ - virtual void set_format(Format format) = 0; + virtual ITensorInfo &set_format(Format format) = 0; /** Set the shape of an already initialized tensor. * * @warning Changing the shape requires to recompute the strides and is * therefore only possible if the tensor hasn't been allocated yet. * * @param[in] shape New tensor shape. 
+ * + * @return Reference to this ITensorInfo object */ - virtual void set_tensor_shape(TensorShape shape) = 0; + virtual ITensorInfo &set_tensor_shape(TensorShape shape) = 0; /** Set the fixed point position to the specified value * * @warning The fixed point position must be set once the data type has been configured * * @param[in] fixed_point_position The new fixed point position + * + * @return Reference to this ITensorInfo object */ - virtual void set_fixed_point_position(int fixed_point_position) = 0; + virtual ITensorInfo &set_fixed_point_position(int fixed_point_position) = 0; + /** Set the quantization settings (scale and offset) of the tensor. + * + * @param[in] quantization_info QuantizationInfo containing the scale and offset + * + * @return Reference to this ITensorInfo object + */ + virtual ITensorInfo &set_quantization_info(QuantizationInfo quantization_info) = 0; + /** Resets the padding settings of the tensor. + * + * @return Reference to this ITensorInfo object + */ + virtual ITensorInfo &reset_padding() = 0; /** Update the offset to the first element and the strides to automatically computed values. * * @note The padding used by this method is really conservative so that the tensor can be used for most functions. @@ -178,8 +201,10 @@ public: /** Set the flag whether the tensor size can be changed. * * @param[in] is_resizable Flag that marks the tensor if it can be changed or not. + * + * @return Reference to this ITensorInfo object */ - virtual void set_is_resizable(bool is_resizable) = 0; + virtual ITensorInfo &set_is_resizable(bool is_resizable) = 0; /** Valid region of the tensor. All elements in the valid region have defined values, i.e. are not undefined. * * @return The valid region. @@ -190,6 +215,12 @@ public: * @param[in] valid_region Valid region to set. */ virtual void set_valid_region(ValidRegion valid_region) = 0; + + /** Get the quantization settings (scale and offset) of the tensor. + * + * @return A QuantizationInfo containing the scale and offset. + */ + virtual QuantizationInfo quantization_info() const = 0; }; } #endif /*__ARM_COMPUTE_TENSORINFO_H__ */ diff --git a/arm_compute/core/Log.h b/arm_compute/core/Log.h new file mode 100644 index 000000000..70e7c5111 --- /dev/null +++ b/arm_compute/core/Log.h @@ -0,0 +1,113 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
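Illustrative aside, not part of this patch: because the ITensorInfo setters above now return a reference, configuration calls can be chained. A hypothetical sketch, assuming the concrete arm_compute::TensorInfo implementation, DataType::QASYMM8 and a QuantizationInfo(scale, offset) constructor from the wider library, none of which is shown in this diff:

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/TensorShape.h"
#include "arm_compute/core/Types.h"

// Hypothetical chained configuration enabled by the reference-returning setters.
void describe_tensor_sketch()
{
    arm_compute::TensorInfo info;
    info.set_data_type(arm_compute::DataType::QASYMM8)
        .set_num_channels(1)
        .set_tensor_shape(arm_compute::TensorShape(224U, 224U, 3U))
        .set_quantization_info(arm_compute::QuantizationInfo(0.05f, 128));
}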
+ */ +#ifndef __ARM_COMPUTE_LOG_H__ +#define __ARM_COMPUTE_LOG_H__ + +#include "arm_compute/core/utils/logging/Macros.h" + +/** Create a default core logger + * + * @note It will eventually create all default loggers in don't exist + */ +#define ARM_COMPUTE_CREATE_DEFAULT_CORE_LOGGER() \ + do \ + { \ + if(arm_compute::logging::LoggerRegistry::get().logger("CORE") == nullptr) \ + { \ + arm_compute::logging::LoggerRegistry::get().create_reserved_loggers(); \ + } \ + } while(false) + +/** Log a message to the core system logger + * + * @param[in] log_level Logging level + * @param[in] msg Message to log + */ +#define ARM_COMPUTE_LOG_MSG_CORE(log_level, msg) \ + do \ + { \ + ARM_COMPUTE_CREATE_DEFAULT_CORE_LOGGER(); \ + ARM_COMPUTE_LOG_MSG("CORE", log_level, msg); \ + } while(false) + +/** Log a message with format to the core system logger + * + * @param[in] log_level Logging level + * @param[in] fmt String format (printf style) + * @param[in] ... Message arguments + */ +#define ARM_COMPUTE_LOG_MSG_WITH_FORMAT_CORE(log_level, fmt, ...) \ + do \ + { \ + ARM_COMPUTE_CREATE_DEFAULT_CORE_LOGGER(); \ + ARM_COMPUTE_LOG_MSG_WITH_FORMAT("CORE", log_level, fmt, __VA_ARGS__); \ + } while(false) + +/** Log a stream to the core system logger + * + * @param[in] log_level Logging level + * @param[in] ss Stream to log + */ +#define ARM_COMPUTE_LOG_STREAM_CORE(log_level, ss) \ + do \ + { \ + ARM_COMPUTE_CREATE_DEFAULT_CORE_LOGGER(); \ + ARM_COMPUTE_LOG_STREAM("CORE", log_level, ss); \ + } while(false) + +/** Log information level message to the core system logger + * + * @param[in] msg Stream to log + */ +#define ARM_COMPUTE_LOG_INFO_MSG_CORE(msg) \ + do \ + { \ + ARM_COMPUTE_CREATE_DEFAULT_CORE_LOGGER(); \ + ARM_COMPUTE_LOG_MSG_CORE(arm_compute::logging::LogLevel::INFO, msg); \ + } while(false) + +/** Log information level formatted message to the core system logger + * + * @param[in] fmt String format (printf style) + * @param[in] ... Message arguments + */ +#define ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE(fmt, ...) \ + do \ + { \ + ARM_COMPUTE_CREATE_DEFAULT_CORE_LOGGER(); \ + ARM_COMPUTE_LOG_MSG_WITH_FORMAT_CORE(arm_compute::logging::LogLevel::INFO, fmt, __VA_ARGS__); \ + } while(false) + +/** Log information level stream to the core system logger + * + * @param[in] ss Message to log + */ +#define ARM_COMPUTE_LOG_INFO_STREAM_CORE(ss) \ + do \ + { \ + ARM_COMPUTE_CREATE_DEFAULT_CORE_LOGGER(); \ + ARM_COMPUTE_LOG_STREAM_CORE(arm_compute::logging::LogLevel::INFO, ss); \ + } while(false) + +#endif /* __ARM_COMPUTE_LOGGING_MACROS_H__ */ diff --git a/arm_compute/core/NEON/NEAsymm.h b/arm_compute/core/NEON/NEAsymm.h new file mode 100644 index 000000000..f0d7439d4 --- /dev/null +++ b/arm_compute/core/NEON/NEAsymm.h @@ -0,0 +1,61 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
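Illustrative aside, not part of this patch: a hypothetical use of the core logging macros defined in Log.h above; the CORE logger is created on first use by ARM_COMPUTE_CREATE_DEFAULT_CORE_LOGGER.

#include "arm_compute/core/Log.h"

// Hypothetical call site using the information-level core logging macros.
void log_configuration_sketch(int width, int height)
{
    ARM_COMPUTE_LOG_INFO_MSG_CORE("Configuring GLES kernel");
    ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Tensor dimensions: %dx%d", width, height);
}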
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_NEASYMM_H__ +#define __ARM_COMPUTE_NEASYMM_H__ + +#include <arm_neon.h> + +namespace arm_compute +{ +using qasymm8x8_t = uint8x8_t; /**< 8 bit quantized asymmetric vector with 8 elements */ +using qasymm8x8x2_t = uint8x8x2_t; /**< 8 bit quantized asymmetric vector with 16 elements */ +using qasymm8x8x3_t = uint8x8x3_t; /**< 8 bit quantized asymmetric vector with 24 elements */ +using qasymm8x8x4_t = uint8x8x4_t; /**< 8 bit quantized asymmetric vector with 32 elements */ +using qasymm8x16_t = uint8x16_t; /**< 8 bit quantized asymmetric vector with 16 elements */ + +/** Round to the nearest division by a power-of-two using exponent + * + * @note This function calculates the following expression: (x + 2^n -1 ) / 2^n where n = exponent + * + * @param[in] x Vector of 4 elements + * @param[in] exponent Integer value used to round to nearest division by a power-of-two + * + * @return the nearest division by a power-of-two using exponent + */ +int32x4_t rounding_divide_by_pow2(int32x4_t x, int exponent); + +/** Perform a multiply-accumulate on all 16 components of a QASYMM8 vector + * + * vd*vs + vo + * + * @param[in] vd Input vector value in QASYMM8 format + * @param[in] vs Vector multiplier in F32 format. The multiplier value must be duplicated across all four lanes. + * @param[in] vo Vector addend in F32 format. The addend value must be duplicated across all four lanes. + * + * @return A 16-component vector in QASYMM8 format, saturated to fit + */ +uint8x16_t vmlaq_qasymm8(qasymm8x16_t vd, float32x4_t vs, float32x4_t vo); +} // namespace arm_compute +#include "arm_compute/core/NEON/NEAsymm.inl" +#endif // __ARM_COMPUTE_NEASYMM_H__ diff --git a/arm_compute/core/NEON/NEAsymm.inl b/arm_compute/core/NEON/NEAsymm.inl new file mode 100644 index 000000000..ce999a541 --- /dev/null +++ b/arm_compute/core/NEON/NEAsymm.inl @@ -0,0 +1,67 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +namespace arm_compute +{ +inline int32x4_t rounding_divide_by_pow2(int32x4_t x, int exponent) +{ + const int32x4_t shift_vec = vdupq_n_s32(-exponent); + const int32x4_t fixup = vshrq_n_s32(vandq_s32(x, shift_vec), 31); + const int32x4_t fixed_up_x = vqaddq_s32(x, fixup); + return vrshlq_s32(fixed_up_x, shift_vec); +} + +inline qasymm8x16_t vmlaq_qasymm8(qasymm8x16_t vd, float32x4_t vs, float32x4_t vo) +{ + // Convert uint8 vectors to uint16 vectors + const uint8x8_t vd_low = vget_low_u8(vd); + const uint8x8_t vd_high = vget_high_u8(vd); + uint16x8_t vd_low_u16x8 = vmovl_u8(vd_low); + uint16x8_t vd_high_u16x8 = vmovl_u8(vd_high); + // Convert uint16 vectors to uint32 vectors + uint32x4_t A_u32x4 = vmovl_u16(vget_low_u16(vd_low_u16x8)); + uint32x4_t B_u32x4 = vmovl_u16(vget_high_u16(vd_low_u16x8)); + uint32x4_t C_u32x4 = vmovl_u16(vget_low_u16(vd_high_u16x8)); + uint32x4_t D_u32x4 = vmovl_u16(vget_high_u16(vd_high_u16x8)); + // Convert uint32 vectors to float32 vectors + float32x4_t A_f32x4 = vcvtq_f32_u32(A_u32x4); + float32x4_t B_f32x4 = vcvtq_f32_u32(B_u32x4); + float32x4_t C_f32x4 = vcvtq_f32_u32(C_u32x4); + float32x4_t D_f32x4 = vcvtq_f32_u32(D_u32x4); + // vd = vd*vs + vo + A_f32x4 = vmlaq_f32(vo, A_f32x4, vs); + B_f32x4 = vmlaq_f32(vo, B_f32x4, vs); + C_f32x4 = vmlaq_f32(vo, C_f32x4, vs); + D_f32x4 = vmlaq_f32(vo, D_f32x4, vs); + // Convert float32 vectors to uint32 vectors + A_u32x4 = vcvtq_u32_f32(A_f32x4); + B_u32x4 = vcvtq_u32_f32(B_f32x4); + C_u32x4 = vcvtq_u32_f32(C_f32x4); + D_u32x4 = vcvtq_u32_f32(D_f32x4); + // Convert uint32 vectors to uint16 vectors (with saturation) + vd_low_u16x8 = vcombine_u16(vqmovn_u32(A_u32x4), vqmovn_u32(B_u32x4)); + vd_high_u16x8 = vcombine_u16(vqmovn_u32(C_u32x4), vqmovn_u32(D_u32x4)); + // convert uint16 vectors to uint8 vectors (with saturation) + return vcombine_u8(vqmovn_u16(vd_low_u16x8), vqmovn_u16(vd_high_u16x8)); +} +} // namespace arm_compute diff --git a/arm_compute/core/NEON/NEFixedPoint.h b/arm_compute/core/NEON/NEFixedPoint.h index f8579e08b..5719b6361 100644 --- a/arm_compute/core/NEON/NEFixedPoint.h +++ b/arm_compute/core/NEON/NEFixedPoint.h @@ -179,19 +179,19 @@ void vst1_qs16(qint16_t *addr, qint16x4_t b); void vst1q_qs8(qint8_t *addr, qint8x16_t b); /** Store a single 16 bit fixed point vector to memory (8 elements) -* -* @param[in] addr Memory address where the 16 bit fixed point vector should be stored -* @param[in] b 16 bit fixed point vector to store -* -*/ + * + * @param[in] addr Memory address where the 16 bit fixed point vector should be stored + * @param[in] b 16 bit fixed point vector to store + * + */ void vst1q_qs16(qint16_t *addr, qint16x8_t b); /** Store two 16 bit fixed point vector to memory (8x2 elements) -* -* @param[in] addr Memory address where the 16 bit fixed point vectors should be stored -* @param[in] b 16 bit fixed point vectors to store -* -*/ + * + * @param[in] addr Memory address where the 16 bit fixed point vectors should be stored + * @param[in] b 16 bit fixed point vectors to store + * + */ void vst2q_qs16(qint16_t *addr, qint16x8x2_t b); /** 16 bit fixed point vector saturating narrow (8 elements) diff --git a/arm_compute/core/NEON/NEKernels.h b/arm_compute/core/NEON/NEKernels.h index bbb440f59..6c31fa4fb 100644 --- a/arm_compute/core/NEON/NEKernels.h +++ b/arm_compute/core/NEON/NEKernels.h @@ -43,8 +43,13 @@ #include "arm_compute/core/NEON/kernels/NEColorConvertKernel.h" #include "arm_compute/core/NEON/kernels/NEConvolutionKernel.h" #include 
"arm_compute/core/NEON/kernels/NECumulativeDistributionKernel.h" -#include "arm_compute/core/NEON/kernels/NEDepthConcatenateKernel.h" -#include "arm_compute/core/NEON/kernels/NEDepthConvertKernel.h" +#include "arm_compute/core/NEON/kernels/NEDeconvolutionLayerUpsampleKernel.h" +#include "arm_compute/core/NEON/kernels/NEDepthConcatenateLayerKernel.h" +#include "arm_compute/core/NEON/kernels/NEDepthConvertLayerKernel.h" +#include "arm_compute/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.h" +#include "arm_compute/core/NEON/kernels/NEDepthwiseIm2ColKernel.h" +#include "arm_compute/core/NEON/kernels/NEDepthwiseVectorToTensorKernel.h" +#include "arm_compute/core/NEON/kernels/NEDepthwiseWeightsReshapeKernel.h" #include "arm_compute/core/NEON/kernels/NEDequantizationLayerKernel.h" #include "arm_compute/core/NEON/kernels/NEDerivativeKernel.h" #include "arm_compute/core/NEON/kernels/NEDilateKernel.h" @@ -58,10 +63,16 @@ #include "arm_compute/core/NEON/kernels/NEFloorKernel.h" #include "arm_compute/core/NEON/kernels/NEGEMMAssemblyBaseKernel.h" #include "arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h" +#include "arm_compute/core/NEON/kernels/NEGEMMInterleaveBlockedKernel.h" #include "arm_compute/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h" +#include "arm_compute/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.h" +#include "arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h" +#include "arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.h" +#include "arm_compute/core/NEON/kernels/NEGEMMLowpReductionKernel.h" #include "arm_compute/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.h" #include "arm_compute/core/NEON/kernels/NEGEMMMatrixAdditionKernel.h" #include "arm_compute/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.h" +#include "arm_compute/core/NEON/kernels/NEGEMMMatrixVectorMultiplyKernel.h" #include "arm_compute/core/NEON/kernels/NEGEMMTranspose1xWKernel.h" #include "arm_compute/core/NEON/kernels/NEGaussian3x3Kernel.h" #include "arm_compute/core/NEON/kernels/NEGaussian5x5Kernel.h" @@ -72,7 +83,7 @@ #include "arm_compute/core/NEON/kernels/NEHistogramKernel.h" #include "arm_compute/core/NEON/kernels/NEIm2ColKernel.h" #include "arm_compute/core/NEON/kernels/NEIntegralImageKernel.h" -#include "arm_compute/core/NEON/kernels/NEL2NormalizeKernel.h" +#include "arm_compute/core/NEON/kernels/NEL2NormalizeLayerKernel.h" #include "arm_compute/core/NEON/kernels/NELKTrackerKernel.h" #include "arm_compute/core/NEON/kernels/NELocallyConnectedMatrixMultiplyKernel.h" #include "arm_compute/core/NEON/kernels/NEMagnitudePhaseKernel.h" @@ -101,7 +112,12 @@ #include "arm_compute/core/NEON/kernels/NETransposeKernel.h" #include "arm_compute/core/NEON/kernels/NEWarpKernel.h" #include "arm_compute/core/NEON/kernels/NEWeightsReshapeKernel.h" +#include "arm_compute/core/NEON/kernels/NEWinogradLayerKernel.h" #include "arm_compute/core/NEON/kernels/arm32/NEGEMMAArch32Kernel.h" #include "arm_compute/core/NEON/kernels/arm64/NEGEMMAArch64Kernel.h" +#include "arm_compute/core/NEON/kernels/arm64/NEGEMMLowpAArch64A53Kernel.h" +#include "arm_compute/core/NEON/kernels/arm64/NEGEMMLowpAArch64Kernel.h" +#include "arm_compute/core/NEON/kernels/arm64/NEGEMMLowpAArch64V8P4Kernel.h" +#include "arm_compute/core/NEON/kernels/arm64/NEHGEMMAArch64FP16Kernel.h" #endif /* __ARM_COMPUTE_NEKERNELS_H__ */ diff --git a/arm_compute/core/NEON/NEMath.h b/arm_compute/core/NEON/NEMath.h index 4c4085e54..5c60d73de 100644 --- a/arm_compute/core/NEON/NEMath.h +++ 
b/arm_compute/core/NEON/NEMath.h @@ -116,7 +116,7 @@ float32x4_t vtanhq_f32(float32x4_t val); */ float32x4_t vpowq_f32(float32x4_t val, float32x4_t n); -#ifdef ARM_COMPUTE_ENABLE_FP16 +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC /** Calculate hyperbolic tangent. * * tanh(x) = (e^2x - 1)/(e^2x + 1) @@ -179,7 +179,7 @@ float16x8_t vexpq_f16(float16x8_t x); * @return The calculated power. */ float16x8_t vpowq_f16(float16x8_t val, float16x8_t n); -#endif /* ARM_COMPUTE_ENABLE_FP16 */ +#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ } // namespace arm_compute #include "arm_compute/core/NEON/NEMath.inl" #endif /* __ARM_COMPUTE_NEMATH_H__ */ diff --git a/arm_compute/core/NEON/NEMath.inl b/arm_compute/core/NEON/NEMath.inl index ebfc52d9a..50f217c1f 100644 --- a/arm_compute/core/NEON/NEMath.inl +++ b/arm_compute/core/NEON/NEMath.inl @@ -168,7 +168,7 @@ inline float32x4_t vpowq_f32(float32x4_t val, float32x4_t n) { return vexpq_f32(vmulq_f32(n, vlogq_f32(val))); } -#ifdef ARM_COMPUTE_ENABLE_FP16 +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC /* Exponent polynomial coefficients */ const std::array<float16x8_t, 8> exp_tab_f16 = { @@ -301,5 +301,5 @@ inline float16x8_t vpowq_f16(float16x8_t val, float16x8_t n) { return vexpq_f16(vmulq_f16(n, vlogq_f16(val))); } -#endif /* ARM_COMPUTE_ENABLE_FP16 */ +#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ } // namespace arm_compute diff --git a/arm_compute/core/NEON/kernels/NEAccumulateKernel.h b/arm_compute/core/NEON/kernels/NEAccumulateKernel.h index ad8b02fbc..fa8a3be92 100644 --- a/arm_compute/core/NEON/kernels/NEAccumulateKernel.h +++ b/arm_compute/core/NEON/kernels/NEAccumulateKernel.h @@ -80,7 +80,7 @@ protected: float _alpha; }; -#ifdef ARM_COMPUTE_ENABLE_FP16 +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC /** Interface for the accumulate weighted kernel using F16 */ class NEAccumulateWeightedFP16Kernel : public NEAccumulateWeightedKernel { @@ -88,9 +88,9 @@ public: // Inherited methods overridden: void run(const Window &window, const ThreadInfo &info) override; }; -#else /* ARM_COMPUTE_ENABLE_FP16 */ +#else /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ using NEAccumulateWeightedFP16Kernel = NEAccumulateWeightedKernel; -#endif /* ARM_COMPUTE_ENABLE_FP16 */ +#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ /** Interface for the accumulate squared kernel * diff --git a/arm_compute/core/NEON/kernels/NEActivationLayerKernel.h b/arm_compute/core/NEON/kernels/NEActivationLayerKernel.h index 08fb3f915..1edda843d 100644 --- a/arm_compute/core/NEON/kernels/NEActivationLayerKernel.h +++ b/arm_compute/core/NEON/kernels/NEActivationLayerKernel.h @@ -26,10 +26,11 @@ #include "arm_compute/core/FixedPoint.h" #include "arm_compute/core/NEON/INEKernel.h" +#include "arm_compute/core/QAsymm8.h" -#ifdef ARM_COMPUTE_ENABLE_FP16 +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC #include <arm_fp16.h> -#endif /* ARM_COMPUTE_ENABLE_FP16 */ +#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ namespace arm_compute { @@ -59,6 +60,16 @@ public: * @param[in] activation_info Activation layer information. */ void configure(ITensor *input, ITensor *output, ActivationLayerInfo activation_info); + /** Static function to check if given info will lead to a valid configuration of @ref NEActivationLayerKernel + * + * @param[in] input Source tensor info. In case of @p output tensor info = nullptr, this tensor will store the result + * of the activation function. Data types supported: QS8/QS16/F16/F32. + * @param[in] output Destination tensor info. 
Data type supported: same as @p input + * @param[in] act_info Activation layer information. + * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *output, const ActivationLayerInfo &act_info); // Inherited methods overridden: void run(const Window &window, const ThreadInfo &info) override; @@ -72,27 +83,33 @@ private: using ActivationFunctionExecutorPtr = void (NEActivationLayerKernel::*)(const Window &window); /** Function to apply an activation function on a tensor. * - * @param[in] window Region on which to execute the kernel + * @param[in] window Region on which to execute the kernel */ template <ActivationLayerInfo::ActivationFunction F, typename T> typename std::enable_if<std::is_same<T, float>::value, void>::type activation(const Window &window); -#ifdef ARM_COMPUTE_ENABLE_FP16 +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC /** Function to apply an activation function on a tensor. * - * @param[in] window Region on which to execute the kernel + * @param[in] window Region on which to execute the kernel */ template <ActivationLayerInfo::ActivationFunction F, typename T> typename std::enable_if<std::is_same<T, float16_t>::value, void>::type activation(const Window &window); -#endif /* ARM_COMPUTE_ENABLE_FP16 */ +#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ /** Function to apply an activation function on a tensor. * - * @param[in] window Region on which to execute the kernel + * @param[in] window Region on which to execute the kernel */ template <ActivationLayerInfo::ActivationFunction F, typename T> typename std::enable_if<std::is_same<T, qint8_t>::value, void>::type activation(const Window &window); /** Function to apply an activation function on a tensor. * - * @param[in] window Region on which to execute the kernel + * @param[in] window Region on which to execute the kernel + */ + template <ActivationLayerInfo::ActivationFunction F, typename T> + typename std::enable_if<std::is_same<T, qasymm8_t>::value, void>::type activation(const Window &window); + /** Function to apply an activation function on a tensor. + * + * @param[in] window Region on which to execute the kernel */ template <ActivationLayerInfo::ActivationFunction F, typename T> typename std::enable_if<std::is_same<T, qint16_t>::value, void>::type activation(const Window &window); diff --git a/arm_compute/core/NEON/kernels/NEArithmeticAdditionKernel.h b/arm_compute/core/NEON/kernels/NEArithmeticAdditionKernel.h index edb738163..b830e022d 100644 --- a/arm_compute/core/NEON/kernels/NEArithmeticAdditionKernel.h +++ b/arm_compute/core/NEON/kernels/NEArithmeticAdditionKernel.h @@ -68,6 +68,16 @@ public: * @param[in] policy Overflow policy. */ void configure(const ITensor *input1, const ITensor *input2, ITensor *output, ConvertPolicy policy); + /** Static function to check if given info will lead to a valid configuration of @ref NEArithmeticAdditionKernel + * + * @param[in] input1 An input tensor. Data types supported: U8/QS8/QS16/S16/F16/F32 + * @param[in] input2 An input tensor. Data types supported: U8/QS8/QS16/S16/F16/F32 + * @param[in] output The output tensor. Data types supported: U8/QS8/QS16/S16/F16/F32. + * @param[in] policy Overflow policy. 
+ * + * @return a status + */ + static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy); // Inherited methods overridden: void run(const Window &window, const ThreadInfo &info) override; diff --git a/arm_compute/core/NEON/kernels/NEArithmeticSubtractionKernel.h b/arm_compute/core/NEON/kernels/NEArithmeticSubtractionKernel.h index d6a219ffd..af81d396b 100644 --- a/arm_compute/core/NEON/kernels/NEArithmeticSubtractionKernel.h +++ b/arm_compute/core/NEON/kernels/NEArithmeticSubtractionKernel.h @@ -68,6 +68,16 @@ public: * @param[in] policy Overflow policy. */ void configure(const ITensor *input1, const ITensor *input2, ITensor *output, ConvertPolicy policy); + /** Static function to check if given info will lead to a valid configuration of @ref NEArithmeticSubtractionKernel + * + * @param[in] input1 First tensor input. Data types supported: U8/QS8/QS16/S16/F16/F32 + * @param[in] input2 Second tensor input. Data types supported: U8/QS8/QS16/S16/F16/F32 + * @param[in] output Output tensor. Data types supported: U8/QS8/QS16/S16/F16/F32 + * @param[in] policy Policy to use to handle overflow. + * + * @return a status + */ + static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy); // Inherited methods overridden: void run(const Window &window, const ThreadInfo &info) override; diff --git a/arm_compute/core/NEON/kernels/NEBatchNormalizationLayerKernel.h b/arm_compute/core/NEON/kernels/NEBatchNormalizationLayerKernel.h index 04c4c9ebb..f3c5574e7 100644 --- a/arm_compute/core/NEON/kernels/NEBatchNormalizationLayerKernel.h +++ b/arm_compute/core/NEON/kernels/NEBatchNormalizationLayerKernel.h @@ -54,14 +54,32 @@ public: * @param[in, out] input Source tensor. In case of @p output tensor = nullptr, this tensor will store the result. * 3 lower dimensions represent a single input with dimensions [width, height, FM]. * The rest are optional and used for representing batches. Data types supported: QS8/QS16/F16/F32. + * @param[out] output Destination tensor. Output will have the same number of dimensions as input. Data type supported: same as @p input * @param[in] mean Mean values tensor. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input * @param[in] var Variance values tensor. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input - * @param[in] gamma Gamma values tensor. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input * @param[in] beta Beta values tensor. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input + * @param[in] gamma Gamma values tensor. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input * @param[in] epsilon Small value to avoid division with zero. - * @param[out] output Destination tensor. Output will have the same number of dimensions as input. Data type supported: same as @p input */ void configure(ITensor *input, ITensor *output, const ITensor *mean, const ITensor *var, const ITensor *beta, const ITensor *gamma, float epsilon); + /** Static function to check if given info will lead to a valid configuration of @ref NEBatchNormalizationLayerKernel + * + * @param[in] input Source tensor info. In case of @p output tensor = nullptr, this tensor will store the result. + * 3 lower dimensions represent a single input with dimensions [width, height, FM]. 
+ * The rest are optional and used for representing batches. Data types supported: QS8/QS16/F16/F32. + * @param[in] output Destination tensor info. Output will have the same number of dimensions as input. Data type supported: same as @p input + * @param[in] mean Mean values tensor info. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input + * @param[in] var Variance values tensor info. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input + * @param[in] beta Beta values tensor info. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input + * @param[in] gamma Gamma values tensor info. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input + * @param[in] epsilon Small value to avoid division with zero. + * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *output, + const ITensorInfo *mean, const ITensorInfo *var, + const ITensorInfo *beta, const ITensorInfo *gamma, + float epsilon); // Inherited methods overridden: void run(const Window &window, const ThreadInfo &info) override; diff --git a/arm_compute/core/NEON/kernels/NEBox3x3Kernel.h b/arm_compute/core/NEON/kernels/NEBox3x3Kernel.h index 1366adad3..29248f653 100644 --- a/arm_compute/core/NEON/kernels/NEBox3x3Kernel.h +++ b/arm_compute/core/NEON/kernels/NEBox3x3Kernel.h @@ -46,7 +46,7 @@ public: BorderSize border_size() const override; }; -#ifdef ARM_COMPUTE_ENABLE_FP16 +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC /** NEON kernel to perform a Box 3x3 filter using F16 simd */ class NEBox3x3FP16Kernel : public NEBox3x3Kernel @@ -55,8 +55,8 @@ public: // Inherited methods overridden: void run(const Window &window, const ThreadInfo &info) override; }; -#else /* ARM_COMPUTE_ENABLE_FP16 */ +#else /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ using NEBox3x3FP16Kernel = NEBox3x3Kernel; -#endif /* ARM_COMPUTE_ENABLE_FP16 */ +#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ } // namespace arm_compute #endif /*__ARM_COMPUTE_NEBOX3x3KERNEL_H__ */ diff --git a/arm_compute/core/NEON/kernels/NECannyEdgeKernel.h b/arm_compute/core/NEON/kernels/NECannyEdgeKernel.h index 37d86685d..a57c3894b 100644 --- a/arm_compute/core/NEON/kernels/NECannyEdgeKernel.h +++ b/arm_compute/core/NEON/kernels/NECannyEdgeKernel.h @@ -81,7 +81,7 @@ protected: ITensor *_phase; /**< Destination tensor - Quantized phase */ }; -#ifdef ARM_COMPUTE_ENABLE_FP16 +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC /** NEON kernel to perform Gradient computation */ class NEGradientFP16Kernel : public NEGradientKernel @@ -90,9 +90,9 @@ public: // Inherited methods overriden: void configure(const ITensor *gx, const ITensor *gy, ITensor *magnitude, ITensor *phase, int32_t norm_type) override; }; -#else /* ARM_COMPUTE_ENABLE_FP16 */ +#else /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ using NEGradientFP16Kernel = NEGradientKernel; -#endif /* ARM_COMPUTE_ENABLE_FP16 */ +#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ /** NEON kernel to perform Non-Maxima suppression for Canny Edge. 
* diff --git a/arm_compute/core/NEON/kernels/NECol2ImKernel.h b/arm_compute/core/NEON/kernels/NECol2ImKernel.h index d537d49c5..243cc77a4 100644 --- a/arm_compute/core/NEON/kernels/NECol2ImKernel.h +++ b/arm_compute/core/NEON/kernels/NECol2ImKernel.h @@ -26,6 +26,8 @@ #include "arm_compute/core/NEON/INEKernel.h" +#include "arm_compute/core/Size2D.h" + namespace arm_compute { class ITensor; @@ -66,12 +68,22 @@ public: /** Set the input and output of the kernel. * - * @param[in] input The input tensor to convert. Data types supported: U8/S8/QS8/U16/S16/QS16/F16/U32/S32/F32 + * @param[in] input The input tensor to convert. Data types supported: U8/S8/QS8/QASYMM8/U16/S16/QS16/F16/U32/S32/F32 * @param[out] output The output tensor. 3 lower dimensions represent a single output [width, height, OFM], * while the rest represent batch of outputs. Data types supported: Same as @p input * @param[in] convolved_dims Output convolved dimensions. */ - void configure(const ITensor *input, ITensor *output, std::pair<unsigned int, unsigned int> convolved_dims); + void configure(const ITensor *input, ITensor *output, const Size2D &convolved_dims); + /** Static function to check if given info will lead to a valid configuration of @ref NECol2ImKernel + * + * @param[in] input The input tensor to convert. Data types supported: U8/S8/QS8/QASYMM8/U16/S16/QS16/F16/U32/S32/F32 + * @param[in] output The output tensor. 3 lower dimensions represent a single output [width, height, OFM], + * while the rest represent batch of outputs. Data types supported: Same as @p input + * @param[in] convolved_dims Output convolved dimensions. + * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *output, const Size2D &convolved_dims); // Inherited methods overridden: void run(const Window &window, const ThreadInfo &info) override; @@ -93,7 +105,7 @@ private: Col2ImFunctionPtr _func; const ITensor *_input; ITensor *_output; - std::pair<unsigned int, unsigned int> _convolved_dims; + Size2D _convolved_dims; }; } // namespace arm_compute #endif /*__ARM_COMPUTE_NECOL2IMKERNEL_H__ */ diff --git a/arm_compute/core/NEON/kernels/NEDeconvolutionLayerUpsampleKernel.h b/arm_compute/core/NEON/kernels/NEDeconvolutionLayerUpsampleKernel.h new file mode 100644 index 000000000..707564683 --- /dev/null +++ b/arm_compute/core/NEON/kernels/NEDeconvolutionLayerUpsampleKernel.h @@ -0,0 +1,72 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_NEDECONVOLUTIONLAYERKERNEL_H__ +#define __ARM_COMPUTE_NEDECONVOLUTIONLAYERKERNEL_H__ + +#include "arm_compute/core/NEON/INEKernel.h" +#include "arm_compute/core/Types.h" + +namespace arm_compute +{ +class ITensor; + +/** NEON kernel to perform scaling on a tensor */ +class NEDeconvolutionLayerUpsampleKernel : public INEKernel +{ +public: + /** Default constructor */ + NEDeconvolutionLayerUpsampleKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEDeconvolutionLayerUpsampleKernel(const NEDeconvolutionLayerUpsampleKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEDeconvolutionLayerUpsampleKernel &operator=(const NEDeconvolutionLayerUpsampleKernel &) = delete; + /** Allow instances of this class to be moved */ + NEDeconvolutionLayerUpsampleKernel(NEDeconvolutionLayerUpsampleKernel &&) = default; + /** Allow instances of this class to be moved */ + NEDeconvolutionLayerUpsampleKernel &operator=(NEDeconvolutionLayerUpsampleKernel &&) = default; + /** Default destructor */ + ~NEDeconvolutionLayerUpsampleKernel() = default; + + /** Initialise the kernel's inputs, output and interpolation policy + * + * @param[in] input Source tensor. Data types supported: F32. + * @param[in] offsets Offset to access the pixel with NEAREST interpolation or the top-left pixel with BILINEAR interpolation in the input tensor. Data type supported: S32. + * @param[out] output Destination tensor. Data types supported: F32. All but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only performed within the XY-plane. + */ + void configure(const ITensor *input, const ITensor *offsets, ITensor *output); + + // Inherited methods overridden: + void run(const Window &window, const ThreadInfo &info) override; + BorderSize border_size() const override; + +private: + /** Function to perform scale using nearest interpolation on the given window */ + void scale_nearest(const Window &window); + + const ITensor *_offsets; + const ITensor *_input; + ITensor *_output; +}; +} // arm_compute +#endif /*__ARM_COMPUTE_NEDECONVOLUTIONLAYERKERNEL_H__ */ diff --git a/arm_compute/core/NEON/kernels/NEDepthConcatenateKernel.h b/arm_compute/core/NEON/kernels/NEDepthConcatenateLayerKernel.h index 784dfc3f5..6029873f2 100644 --- a/arm_compute/core/NEON/kernels/NEDepthConcatenateKernel.h +++ b/arm_compute/core/NEON/kernels/NEDepthConcatenateLayerKernel.h @@ -34,21 +34,21 @@ class ITensor; /** Interface for the depth concatenate kernel. * The input tensor will be concatenated into the output tensor. 
*/ -class NEDepthConcatenateKernel : public INEKernel +class NEDepthConcatenateLayerKernel : public INEKernel { public: /** Default constructor */ - NEDepthConcatenateKernel(); + NEDepthConcatenateLayerKernel(); /** Prevent instances of this class from being copied (As this class contains pointers) */ - NEDepthConcatenateKernel(const NEDepthConcatenateKernel &) = delete; + NEDepthConcatenateLayerKernel(const NEDepthConcatenateLayerKernel &) = delete; /** Prevent instances of this class from being copied (As this class contains pointers) */ - NEDepthConcatenateKernel &operator=(const NEDepthConcatenateKernel &) = delete; + NEDepthConcatenateLayerKernel &operator=(const NEDepthConcatenateLayerKernel &) = delete; /** Allow instances of this class to be moved */ - NEDepthConcatenateKernel(NEDepthConcatenateKernel &&) = default; + NEDepthConcatenateLayerKernel(NEDepthConcatenateLayerKernel &&) = default; /** Allow instances of this class to be moved */ - NEDepthConcatenateKernel &operator=(NEDepthConcatenateKernel &&) = default; + NEDepthConcatenateLayerKernel &operator=(NEDepthConcatenateLayerKernel &&) = default; /** Default destructor */ - ~NEDepthConcatenateKernel() = default; + ~NEDepthConcatenateLayerKernel() = default; /** Initialise the kernel's inputs and output * * @param[in] input Input tensor. Data types supported: QS8/QS16/F16/F32. diff --git a/arm_compute/core/NEON/kernels/NEDepthConvertKernel.h b/arm_compute/core/NEON/kernels/NEDepthConvertLayerKernel.h index 332406f23..af51ded87 100644 --- a/arm_compute/core/NEON/kernels/NEDepthConvertKernel.h +++ b/arm_compute/core/NEON/kernels/NEDepthConvertLayerKernel.h @@ -34,19 +34,19 @@ namespace arm_compute class ITensor; /** Depth conversion kernel */ -class NEDepthConvertKernel : public INEKernel +class NEDepthConvertLayerKernel : public INEKernel { public: /** Default constructor*/ - NEDepthConvertKernel(); + NEDepthConvertLayerKernel(); /** Prevent instances of this class from being copied (As this class contains pointers) */ - NEDepthConvertKernel(const NEDepthConvertKernel &) = delete; + NEDepthConvertLayerKernel(const NEDepthConvertLayerKernel &) = delete; /** Default move constructor */ - NEDepthConvertKernel(NEDepthConvertKernel &&) = default; + NEDepthConvertLayerKernel(NEDepthConvertLayerKernel &&) = default; /** Prevent instances of this class from being copied (As this class contains pointers) */ - NEDepthConvertKernel &operator=(const NEDepthConvertKernel &) = delete; + NEDepthConvertLayerKernel &operator=(const NEDepthConvertLayerKernel &) = delete; /** Default move assignment operator */ - NEDepthConvertKernel &operator=(NEDepthConvertKernel &&) = default; + NEDepthConvertLayerKernel &operator=(NEDepthConvertLayerKernel &&) = default; /** Set the input and output of the kernel * * Valid conversions Input -> Output : diff --git a/arm_compute/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.h b/arm_compute/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.h new file mode 100644 index 000000000..b8f01cb63 --- /dev/null +++ b/arm_compute/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.h @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2017 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_NEDEPTHWISECONVOLUTIONKERNEL3x3_H__ +#define __ARM_COMPUTE_NEDEPTHWISECONVOLUTIONKERNEL3x3_H__ + +#include "arm_compute/core/NEON/INEKernel.h" + +namespace arm_compute +{ +class ITensor; + +/** Interface for the kernel to run a 3x3 depthwise convolution on a tensor. + */ +class NEDepthwiseConvolutionLayer3x3Kernel : public INEKernel +{ +public: + /** Default constructor */ + NEDepthwiseConvolutionLayer3x3Kernel(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEDepthwiseConvolutionLayer3x3Kernel(const NEDepthwiseConvolutionLayer3x3Kernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEDepthwiseConvolutionLayer3x3Kernel &operator=(const NEDepthwiseConvolutionLayer3x3Kernel &) = delete; + /** Default Move Constructor. */ + NEDepthwiseConvolutionLayer3x3Kernel(NEDepthwiseConvolutionLayer3x3Kernel &&) = default; + /** Default move assignment operator. */ + NEDepthwiseConvolutionLayer3x3Kernel &operator=(NEDepthwiseConvolutionLayer3x3Kernel &&) = default; + /** Initialize the function's source, destination, conv and border_size. + * + * @param[in] input Source tensor. DataType supported: F32. + * @param[in] weights Weights tensor. This is a 3D tensor with dimensions [3, 3, IFM]. Data type supported: Same as @p input. + * @param[out] output Destination tensor. Data type supported: Same as @p input. + * @param[in] conv_info Padding and stride information to use for the convolution. + */ + void configure(const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info); + + // Inherited methods overridden: + void run(const Window &window, const ThreadInfo &info) override; + BorderSize border_size() const override; + +private: + BorderSize _border_size; + const ITensor *_input; + ITensor *_output; + const ITensor *_weights; + PadStrideInfo _conv_info; +}; +} // namespace arm_compute +#endif /* __ARM_COMPUTE_NEDEPTHWISECONVOLUTIONKERNEL3x3_H__ */
\ No newline at end of file diff --git a/arm_compute/core/NEON/kernels/NEDepthwiseIm2ColKernel.h b/arm_compute/core/NEON/kernels/NEDepthwiseIm2ColKernel.h new file mode 100644 index 000000000..fde474d1f --- /dev/null +++ b/arm_compute/core/NEON/kernels/NEDepthwiseIm2ColKernel.h @@ -0,0 +1,74 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_NEDEPTHWISEIM2COLKERNEL_H__ +#define __ARM_COMPUTE_NEDEPTHWISEIM2COLKERNEL_H__ + +#include "arm_compute/core/NEON/INEKernel.h" +#include "arm_compute/core/Size2D.h" + +namespace arm_compute +{ +class ITensor; + +/** Interface for the depthwise im2col reshape kernel. + * This kernel reshape the input low 3 dimensions to a new 3D shape where the output's first dimension is + * the linear patch size (FILTER_WIDTH * FILTER_HEIGHT) and second dimension is number of patches in per image and third dimension unchanged . + **/ +class NEDepthwiseIm2ColKernel : public INEKernel +{ +public: + /** Default constructor */ + NEDepthwiseIm2ColKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEDepthwiseIm2ColKernel(const NEDepthwiseIm2ColKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEDepthwiseIm2ColKernel &operator=(const NEDepthwiseIm2ColKernel &) = delete; + /** Allow instances of this class to be moved */ + NEDepthwiseIm2ColKernel(NEDepthwiseIm2ColKernel &&) = default; + /** Allow instances of this class to be moved */ + NEDepthwiseIm2ColKernel &operator=(NEDepthwiseIm2ColKernel &&) = default; + /** Set the input and output of the kernel. + * + * @param[in] input The input tensor to convert. 3 lower dimensions represent a single input [width, height, IFM], + * while every optional dimension from 4 and above represent a batch of inputs. Data types supported: F32 + * @param[out] output The output tensor. First 3 lower dimensions represent a transform of each 3D input, + * while every dimension above 3 represents a batch. Data types supported: Same as @p input + * @param[in] kernel_dims The kernel dimensions (width and height). + * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. + * @param[in] has_bias Boolean that specifies if the depthwise convolution has bias. 
+ */ + void configure(const ITensor *input, ITensor *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias = false); + + // Inherited methods overridden: + void run(const Window &window, const ThreadInfo &info) override; + +private: + const ITensor *_input; + ITensor *_output; + Size2D _kernel_dims; + PadStrideInfo _conv_info; + bool _has_bias; +}; +} // arm_compute +#endif /*__ARM_COMPUTE_NEDEPTHWISEIM2COLKERNEL_H__ */ diff --git a/arm_compute/core/NEON/kernels/NEDepthwiseVectorToTensorKernel.h b/arm_compute/core/NEON/kernels/NEDepthwiseVectorToTensorKernel.h new file mode 100644 index 000000000..8b33fae6f --- /dev/null +++ b/arm_compute/core/NEON/kernels/NEDepthwiseVectorToTensorKernel.h @@ -0,0 +1,71 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_NEDEPTHWISEVECTORTOTENSORKERNEL_H__ +#define __ARM_COMPUTE_NEDEPTHWISEVECTORTOTENSORKERNEL_H__ + +#include "arm_compute/core/NEON/INEKernel.h" + +namespace arm_compute +{ +class ITensor; + +/** Interface for the depthwise vector to tensor kernel. + * + * This kernel takes the 1D tensor that's been produced by the MatrixVectorMultiply + * kernel and reshapes it to given width and height (previously calculated, based + * on input/weights dimensions and convolution strides and padding). + * + **/ +class NEDepthwiseVectorToTensorKernel : public INEKernel +{ +public: + /** Default constructor */ + NEDepthwiseVectorToTensorKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEDepthwiseVectorToTensorKernel(const NEDepthwiseVectorToTensorKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEDepthwiseVectorToTensorKernel &operator=(const NEDepthwiseVectorToTensorKernel &) = delete; + /** Allow instances of this class to be moved */ + NEDepthwiseVectorToTensorKernel(NEDepthwiseVectorToTensorKernel &&) = default; + /** Allow instances of this class to be moved */ + NEDepthwiseVectorToTensorKernel &operator=(NEDepthwiseVectorToTensorKernel &&) = default; + /** Set the input and output of the kernel. + * + * @param[in] input The input vector to convert. Data type supported: F32. + * @param[out] output The output tensor. 3 lower dimensions represent a single input [width, height, IFM]. Data type supported: same as @p input. + * @param[in] conv_w The converted tensor's width. 
+ * @param[in] conv_h The converted tensor's height. + */ + void configure(const ITensor *input, ITensor *output, size_t conv_w, size_t conv_h); + + // Inherited methods overridden: + void run(const Window &window, const ThreadInfo &info) override; + +private: + const ITensor *_input; + ITensor *_output; + std::pair<size_t, size_t> _conv_dims; +}; +} // arm_compute +#endif /*__ARM_COMPUTE_NEDEPTHWISEVECTORTOTENSORKERNEL_H__ */ diff --git a/arm_compute/core/NEON/kernels/NEDepthwiseWeightsReshapeKernel.h b/arm_compute/core/NEON/kernels/NEDepthwiseWeightsReshapeKernel.h new file mode 100644 index 000000000..2e986117d --- /dev/null +++ b/arm_compute/core/NEON/kernels/NEDepthwiseWeightsReshapeKernel.h @@ -0,0 +1,67 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_NEDEPTHWISEWEIGHTSRESHAPEKERNEL_H__ +#define __ARM_COMPUTE_NEDEPTHWISEWEIGHTSRESHAPEKERNEL_H__ + +#include "arm_compute/core/NEON/INEKernel.h" + +namespace arm_compute +{ +class ITensor; + +/** Interface for the depthwise weights reshape kernel. + * This kernel reshape original weights' low 2D dimensions into a single col and + * have the second dimension as the original depth size. + **/ +class NEDepthwiseWeightsReshapeKernel : public INEKernel +{ +public: + /** Default constructor */ + NEDepthwiseWeightsReshapeKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEDepthwiseWeightsReshapeKernel(const NEDepthwiseWeightsReshapeKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEDepthwiseWeightsReshapeKernel &operator=(const NEDepthwiseWeightsReshapeKernel &) = delete; + /** Allow instances of this class to be moved */ + NEDepthwiseWeightsReshapeKernel(NEDepthwiseWeightsReshapeKernel &&) = default; + /** Allow instances of this class to be moved */ + NEDepthwiseWeightsReshapeKernel &operator=(NEDepthwiseWeightsReshapeKernel &&) = default; + /** Set the input and output of the kernel. + * + * @param[in] input The input tensor to convert. 3 lower dimensions represent a single input [width, height, IFM]. Data type supported: F32. + * @param[out] output The output tensor. Data type supported: same as @p input. + * @param[in] biases (Optional) The input biases to add. Shape [IFM]. Data type supported: same as @p input. 
+ */ + void configure(const ITensor *input, ITensor *output, const ITensor *biases); + + // Inherited methods overridden: + void run(const Window &window, const ThreadInfo &info) override; + +private: + const ITensor *_input; + ITensor *_output; + const ITensor *_biases; +}; +} // arm_compute +#endif /*__ARM_COMPUTE_NEDEPTHWISEWEIGHTSRESHAPEKERNEL_H__ */ diff --git a/arm_compute/core/NEON/kernels/NEDerivativeKernel.h b/arm_compute/core/NEON/kernels/NEDerivativeKernel.h index 7613b586d..5d46516f6 100644 --- a/arm_compute/core/NEON/kernels/NEDerivativeKernel.h +++ b/arm_compute/core/NEON/kernels/NEDerivativeKernel.h @@ -64,17 +64,17 @@ public: private: /** Function to perform derivative along the X direction on the given window * - * @param[in] window Region on which to execute the kernel + * @param[in] window Region on which to execute the kernel */ void derivative_x(const Window &window); /** Function to perform derivative along the Y direction on the given window * - * @param[in] window Region on which to execute the kernel + * @param[in] window Region on which to execute the kernel */ void derivative_y(const Window &window); /** Function to perform derivative along the X and Y direction on the given window * - * @param[in] window Region on which to execute the kernel + * @param[in] window Region on which to execute the kernel */ void derivative_xy(const Window &window); /** Common signature for all the specialised derivative functions diff --git a/arm_compute/core/NEON/kernels/NEDirectConvolutionLayerBiasAccumulateKernel.h b/arm_compute/core/NEON/kernels/NEDirectConvolutionLayerBiasAccumulateKernel.h index 14c8e9c7e..05ade1c5d 100644 --- a/arm_compute/core/NEON/kernels/NEDirectConvolutionLayerBiasAccumulateKernel.h +++ b/arm_compute/core/NEON/kernels/NEDirectConvolutionLayerBiasAccumulateKernel.h @@ -57,6 +57,16 @@ public: * Data type supported: Same as @p input */ void configure(ITensor *input, const ITensor *bias, ITensor *output = nullptr); + /** Static function to check if given info will lead to a valid configuration of @ref NEDirectConvolutionLayerBiasAccumulateKernel + * + * @param[in] input Input to add the bias to. If @p output is not specified then accumulation is done in-place. + * Data type supported: QS8/QS16/F16/F32 + * @param[in] bias The shared bias tensor to add. It must be 1D Tensor. Data type supported: Same as @p input + * @param[in] output (Optional) If the output tensor is specified the accumulation is done out-of-place. (Defaults to nullptr) + * Data type supported: Same as @p input + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output = nullptr); // Inherited methods overridden: void run(const Window &window, const ThreadInfo &info) override; diff --git a/arm_compute/core/NEON/kernels/NEDirectConvolutionLayerKernel.h b/arm_compute/core/NEON/kernels/NEDirectConvolutionLayerKernel.h index 370ddca48..4529120f0 100644 --- a/arm_compute/core/NEON/kernels/NEDirectConvolutionLayerKernel.h +++ b/arm_compute/core/NEON/kernels/NEDirectConvolutionLayerKernel.h @@ -62,6 +62,20 @@ public: * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. */ void configure(const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info); + /** Static function to check if given info will lead to a valid configuration of @ref NEDirectConvolutionLayerKernel + * + * @param[in] input The input tensor to convolve. 
3 lower dimensions represent a single input [width, height, IFM], + * while every optional dimension from 4 and above represent a batch of inputs. Data types supported: QS8/QS16/F16/F32. + * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. + * The 3rd dimension must be the same as the input's volume 3rd dimension. + * Data type supported:Same as @p input. + * @param[in] output Output tensor. + * The 3rd dimensions must be equal to the 4th dimension of the @p kernels tensor. Data types supported: Same as @p input. + * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. + * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *output, const PadStrideInfo &conv_info); // Inherited methods overridden: void run(const Window &window, const ThreadInfo &info) override; diff --git a/arm_compute/core/NEON/kernels/NEGEMMAssemblyBaseKernel.h b/arm_compute/core/NEON/kernels/NEGEMMAssemblyBaseKernel.h index e298bfdeb..9e0fe8059 100644 --- a/arm_compute/core/NEON/kernels/NEGEMMAssemblyBaseKernel.h +++ b/arm_compute/core/NEON/kernels/NEGEMMAssemblyBaseKernel.h @@ -30,7 +30,7 @@ namespace arm_compute { class ITensor; -/** AssemblyBase/armv7a NEON kernel to multiply two input matrices "A" and "B". */ +/** Base class for GEMM NEON kernels implemented in Assembly. */ class NEGEMMAssemblyBaseKernel : public INEKernel { public: diff --git a/arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h b/arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h index 1c0d85c27..fd93def0c 100644 --- a/arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h +++ b/arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h @@ -56,10 +56,18 @@ public: NEGEMMInterleave4x4Kernel(); /** Initialise the kernel's input and output. * - * @param[in] input Input tensor. Data types supported: U8/S8/QS8/QS16/U16/S16/F16/U32/S32/F32 + * @param[in] input Input tensor. Data types supported: U8/S8/QS8/QASYMM8/QS16/U16/S16/F16/U32/S32/F32 * @param[out] output Output tensor which stores the interleaved matrix. Data type supported: same as @p input. */ void configure(const ITensor *input, ITensor *output); + /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMInterleave4x4Kernel + * + * @param[in] input Input tensor info. Data types supported: U8/S8/QS8/QASYMM8/QS16/U16/S16/F16/U32/S32/F32 + * @param[in] output Output tensor info which stores the interleaved matrix. Data type supported: same as @p input. + * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *output); // Inherited methods overridden: void run(const Window &window, const ThreadInfo &info) override; @@ -67,7 +75,7 @@ public: private: /** Common signature for all the transpose functions * - * @param[in] input An input tensor. Data types supported: U8/S8/QS8/U16/S16/QS16/F16/U32/S32/F32 + * @param[in] input An input tensor. Data types supported: U8/S8/QS8/QASYMM8/U16/S16/QS16/F16/U32/S32/F32 * @param[out] output The output tensor. Data type supported: same as @p input * @param[in] window Region on which to execute the kernel. 
*/ diff --git a/arm_compute/core/NEON/kernels/NEGEMMInterleaveBlockedKernel.h b/arm_compute/core/NEON/kernels/NEGEMMInterleaveBlockedKernel.h new file mode 100644 index 000000000..b9bb18d2b --- /dev/null +++ b/arm_compute/core/NEON/kernels/NEGEMMInterleaveBlockedKernel.h @@ -0,0 +1,75 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_NEGEMMINTERLEAVEBLOCKEDKERNEL_H__ +#define __ARM_COMPUTE_NEGEMMINTERLEAVEBLOCKEDKERNEL_H__ + +#include "arm_compute/core/NEON/INESimpleKernel.h" + +namespace arm_compute +{ +class ITensor; + +/** NEON kernel to interleave the elements of a matrix + * + * Interleave_Blocked copies a block of values at a time instead of just one. The main use of this is the gemmlowp with the "dot product" + * instruction, where each operation consumes 4 values, so we need to copy blocks of 4 values. + * + */ +class NEGEMMInterleaveBlockedKernel : public INESimpleKernel +{ +public: + /* Constructor */ + NEGEMMInterleaveBlockedKernel(); + /** Initialise the kernel's input and output. + * + * @param[in] input Input tensor. Data types supported: U8 + * @param[out] output Output tensor which stores the interleaved matrix. Data type supported: same as @p input. + * @param[in] block_height The height of the blocks to be interleaved. + * @param[in] block_width The width of the blocks to be interleaved. + * @param[in] transpose True if transpose operation must be performed, false otherwise. + */ + void configure(const ITensor *input, ITensor *output, unsigned int block_height, unsigned int block_width, bool transpose); + /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMInterleaveBlockedKernel + * + * @param[in] input Input tensor. Data types supported: U8 + * @param[in] output Output tensor which stores the interleaved matrix. Data type supported: same as @p input. + * @param[in] block_height The height of the blocks to be interleaved. + * @param[in] block_width The width of the blocks to be interleaved. + * @param[in] transpose True if transpose operation must be performed, false otherwise. 
+ * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int block_height, unsigned int block_width, bool transpose); + + // Inherited methods overridden: + void run(const Window &window, const ThreadInfo &info) override; + +private: + unsigned int _block_height; + unsigned int _block_width; + bool _transpose; +}; + +} // namespace arm_compute +#endif /*__ARM_COMPUTE_NEGEMMINTERLEAVEBLOCKEDKERNEL_H__*/ diff --git a/arm_compute/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h b/arm_compute/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h index f526d213c..7435994b8 100644 --- a/arm_compute/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h +++ b/arm_compute/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h @@ -35,12 +35,9 @@ class ITensor; * @note @ref NEGEMMLowpMatrixMultiplyKernel low precision matrix product kernel * This kernel performs the following computation: * - * -# Convert a values from uint8 to int32 and add a_offset to each of them. - * -# Convert b values from uint8 to int32 and add b_offset to each of them. - * -# Compute the int32 matrix product of the resulting a * b. - * -# Add output_offset to each entry of the result. - * -# Multiply each entry of the result and round to the nearest integer - * -# Clamp the resulting int32 values to the [0..255] range and cast to uint8. + * -# Convert a values from int8 to int32 + * -# Convert b values from int8 to int32 + * -# Compute the int32 matrix product of the resulting a * b and store the result as int32 * */ class NEGEMMLowpMatrixMultiplyKernel : public INEKernel @@ -61,16 +58,21 @@ public: * The input matrices @p input0 and @p input1 must be the output of the kernels: @ref NEGEMMInterleave4x4Kernel and @ref NEGEMMTranspose1xWKernel. These two * kernels change the layout of the original matrices to be more cache-friendly. * - * @param[in] input0 Input tensor containing the interleaved Matrix A. Data type supported: U8 - * @param[in] input1 Input tensor containing the transposed Matrix B. Data type supported: same as @p input0 - * @param[out] output Output tensor to store the result of matrix multiplication, Data type supported: same as @p input0 - * @param[in] a_offset Offset to be added to each element of the matrix A. - * @param[in] b_offset Offset to be added to each element of the matrix B. - * @param[in] output_offset Offset to be added to each element of the output matrix - * @param[in] output_mult_int Value to be multipied to each entry of the result. - * @param[in] shift Number of bits to shift right the result. + * @param[in] input0 Input tensor containing the interleaved Matrix A. Data type supported: QASYMM8 + * @param[in] input1 Input tensor containing the transposed1xW Matrix B. Data type supported: same as @p input0 + * @param[out] output Output tensor to store the result of matrix multiplication. Data type supported: S32 */ - void configure(const ITensor *input0, const ITensor *input1, ITensor *output, int32_t a_offset, int32_t b_offset, int32_t output_offset, int32_t output_mult_int, int32_t shift); + void configure(const ITensor *input0, const ITensor *input1, ITensor *output); + /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMLowpMatrixMultiplyKernel + * + * @param[in] input0 Input tensor info containing the interleaved Matrix A. Data type supported: QASYMM8 + * @param[in] input1 Input tensor info containing the transposed Matrix B. 
Data type supported: same as @p input0 + * @param[in] output Output tensor info to store the result of matrix multiplication. Data type supported: S32 + * + * @return a status + */ + static Status validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output); + // Inherited methods overridden: void run(const Window &window, const ThreadInfo &info) override; @@ -78,11 +80,7 @@ private: const ITensor *_input0; const ITensor *_input1; ITensor *_output; - int32_t _a_offset; - int32_t _b_offset; - int32_t _output_offset; - int32_t _output_mult_int; - int32_t _shift; + bool _slide_matrix_b; }; } // namespace arm_compute -#endif /*__ARM_COMPUTE_NEGEMMLOWPMATRIXMULTIPLYKERNEL_H__*/
\ No newline at end of file +#endif /*__ARM_COMPUTE_NEGEMMLOWPMATRIXMULTIPLYKERNEL_H__*/ diff --git a/arm_compute/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.h b/arm_compute/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.h new file mode 100644 index 000000000..531968304 --- /dev/null +++ b/arm_compute/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.h @@ -0,0 +1,99 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_NEGEMMLOWPOFFSETCONTRIBUTIONKERNEL_H__ +#define __ARM_COMPUTE_NEGEMMLOWPOFFSETCONTRIBUTIONKERNEL_H__ + +#include "arm_compute/core/NEON/INEKernel.h" + +namespace arm_compute +{ +class ITensor; + +/** NEON kernel used to add the offset contribution after @ref NEGEMMLowpMatrixMultiplyKernel. The computation is performed in-place + * + * This kernel takes a final int32 accumulator value (the output of @ref NEGEMMLowpMatrixMultiplyKernel), + * and adds to it the offset contribution of matrix A and matrix B in-place. + * + * The final result is: + * + * mm_result[i][k] = mm_result[i][k] + + * (vector_sum_col[k] * a_offset) + + * (vector_sum_row[i] * b_offset) + + * (a_offset * b_offset * k) + * + */ +class NEGEMMLowpOffsetContributionKernel : public INEKernel +{ +public: + /** Constructor */ + NEGEMMLowpOffsetContributionKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers)*/ + NEGEMMLowpOffsetContributionKernel(const NEGEMMLowpOffsetContributionKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers)*/ + NEGEMMLowpOffsetContributionKernel &operator=(const NEGEMMLowpOffsetContributionKernel &) = delete; + /** Allow instances of this class to be moved */ + NEGEMMLowpOffsetContributionKernel(NEGEMMLowpOffsetContributionKernel &&) = default; + /** Allow instances of this class to be moved */ + NEGEMMLowpOffsetContributionKernel &operator=(NEGEMMLowpOffsetContributionKernel &&) = default; + /** Initialise the kernel's input and output. + * + * @param[in, out] mm_result Input tensor containing the result of @ref NEGEMMLowpMatrixMultiplyKernel. Data type supported: S32 + * @param[in] vector_sum_col Input row-vector of sums of all the entries in each column of matrix B. + * Note: vector_sum_col can be a nullptr in case a_offset = 0. 
Data type supported: same as @p mm_result + * @param[in] vector_sum_row Input row-vector of sums of all the entries in each row of matrix A. + * Note: vector_sum_row can be a nullptr in case b_offset = 0. Data type supported: same as @p mm_result + * @param[in] k Number of matrix A columns or Matrix B rows + * @param[in] a_offset Offset to be added to each element of the matrix A. + * @param[in] b_offset Offset to be added to each element of the matrix B. + */ + void configure(ITensor *mm_result, const ITensor *vector_sum_col, const ITensor *vector_sum_row, int32_t k, int32_t a_offset, int32_t b_offset); + /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMLowpOffsetContributionKernel + * + * @param[in] mm_result Input tensor containing the result of @ref NEGEMMLowpMatrixMultiplyKernel. Data type supported: S32 + * @param[in] vector_sum_col Input row-vector of sums of all the entries in each column of matrix B. + * Note: vector_sum_col can be a nullptr in case a_offset = 0. Data type supported: same as @p mm_result + * @param[in] vector_sum_row Input row-vector of sums of all the entries in each row of matrix A. + * Note: vector_sum_row can be a nullptr in case b_offset = 0. Data type supported: same as @p mm_result + * @param[in] a_offset Offset to be added to each element of the matrix A. + * @param[in] b_offset Offset to be added to each element of the matrix B. + * + * @return a status + */ + static Status validate(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, int32_t a_offset, int32_t b_offset); + + // Inherited methods overridden: + void run(const Window &window, const ThreadInfo &info) override; + +private: + const ITensor *_vector_sum_col; + const ITensor *_vector_sum_row; + ITensor *_mm_result; + int32_t _a_offset; + int32_t _b_offset; + int32_t _k_offset; + bool _slide_vector_sum_col; +}; +} // namespace arm_compute + +#endif /* __ARM_COMPUTE_NEGEMMLOWPOFFSETCONTRIBUTIONKERNEL_H__ */ diff --git a/arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h b/arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h new file mode 100644 index 000000000..b1dd1fb2d --- /dev/null +++ b/arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h @@ -0,0 +1,116 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_NEGEMMLOWPQUANTIZEDOWNINT32TOUINT8SCALEBYFIXEDPOINTKERNEL_H__ +#define __ARM_COMPUTE_NEGEMMLOWPQUANTIZEDOWNINT32TOUINT8SCALEBYFIXEDPOINTKERNEL_H__ + +#include "arm_compute/core/NEON/INEKernel.h" + +namespace arm_compute +{ +class ITensor; + +/** NEON kernel used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8 + * + * This kernel takes a final int32 accumulator value (the output of @ref NEGEMMLowpMatrixMultiplyKernel), and processes it to obtain the final QASYMM8 value. + * The following computations will be performed by the kernel: + * + * -# Compute fixed point multiplication between each entry of input by result_fixedpoint_multiplier + * -# Add bias to final result if bias tensor is not a nullptr + * -# Round to nearest division by a power-of-two using result_shift + * -# Add offset to each result + * -# Clamp the value between the specified min and max bounds + * -# Clamp the resulting int32 values to the [0..255] range and cast to QASYMM8. + * + */ +class NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel : public INEKernel +{ +public: + /** Constructor */ + NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers)*/ + NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel(const NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers)*/ + NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel &operator=(const NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel &) = delete; + /** Allow instances of this class to be moved */ + NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel(NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel &&) = default; + /** Allow instances of this class to be moved */ + NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel &operator=(NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel &&) = default; + /** Initialise the kernel's input and output. + * + * @param[in] input Input tensor. Data type supported: S32 + * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required. + * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input. + * @param[out] output Output tensor. 
Data type supported: QASYMM8 + * @param[in] result_fixedpoint_multiplier Fixed point value to be multiplied to each element of the input matrix once the result_offset has been added + * @param[in] result_shift Integer value used to round to nearest division by a power-of-two the result after the fixed point multiplication + * @param[in] result_offset_after_shift Offset to be applied to result before converting it back to QASYMM8 + * @param[in] min (Optional) Min value used to saturate down the output result before converting back to QASYMM8 + * @param[in] max (Optional) Max value used to saturate up the output result before converting back to QASYMM8. + * Along with @p min, this value can be used to implement "rectified linear unit" activation functions + */ + void configure(const ITensor *input, const ITensor *bias, ITensor *output, int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift, int min = 0, int max = 0); + /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel + * + * @param[in] input Input tensor. Data type supported: S32 + * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required. + * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input. + * @param[in] output Output tensor. Data type supported: QASYMM8 + * @param[in] min (Optional) Min value used to saturate down the output result before converting back to QASYMM8 + * @param[in] max (Optional) Max value used to saturate up the output result before converting back to QASYMM8. + * Along with @p min, this value can be used to implement "rectified linear unit" activation functions + * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min = 0, int max = 0); + + // Inherited methods overridden: + void run(const Window &window, const ThreadInfo &info) override; + +private: + /** Template function to run the NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel + * + * @param[in] window Region on which to execute the kernel. (Must be a valid region of the window returned by window()). + */ + template <bool is_bounded_relu> + void run(const Window &window); + + /** Common signature for all the specialised NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel functions + * + * @param[in] window Region on which to execute the kernel. + */ + using QuantizeDownFunctionPtr = void (NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::*)(const Window &window); + + QuantizeDownFunctionPtr _func; + const ITensor *_input; + const ITensor *_bias; + ITensor *_output; + int _result_fixedpoint_multiplier; + int _result_shift; + int _result_offset_after_shift; + int _min; + int _max; +}; +} // namespace arm_compute + +#endif /* __ARM_COMPUTE_NEGEMMLOWPQUANTIZEDOWNINT32TOUINT8SCALEBYFIXEDPOINTKERNEL_H__ */ diff --git a/arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.h b/arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.h new file mode 100644 index 000000000..10b333032 --- /dev/null +++ b/arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.h @@ -0,0 +1,116 @@ +/* + * Copyright (c) 2017 ARM Limited.
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_NEGEMMLOWPQUANTIZEDOWNINT32TOUINT8SCALEKERNEL_H__ +#define __ARM_COMPUTE_NEGEMMLOWPQUANTIZEDOWNINT32TOUINT8SCALEKERNEL_H__ + +#include "arm_compute/core/NEON/INEKernel.h" + +namespace arm_compute +{ +class ITensor; + +/** NEON kernel used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8 + * + * This kernel takes a final int32 accumulator value (the output of @ref NEGEMMLowpMatrixMultiplyKernel), and processes it to obtain the final QASYMM8 value. + * The following computations will be performed by the kernel: + * + * -# Add offset terms to final result + * -# Multiply each entry of result by result_mult_int + * -# Add bias to final result if bias tensor is not a nullptr + * -# Shift the int32 accumulator by result_shift + * -# Clamp the value between the specified min and max bounds + * -# Clamp the resulting int32 values to the [0..255] range and cast to QASYMM8. + * + */ +class NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel : public INEKernel +{ +public: + /** Constructor */ + NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers)*/ + NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel(const NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers)*/ + NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel &operator=(const NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel &) = delete; + /** Allow instances of this class to be moved */ + NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel(NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel &&) = default; + /** Allow instances of this class to be moved */ + NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel &operator=(NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel &&) = default; + /** Initialise the kernel's input and output. + * + * @param[in] input Input tensor. Data type supported: S32 + * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required. + * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input. + * @param[out] output Output tensor. 
Data type supported: QASYMM8 + * @param[in] result_offset Offset to be added to each element of the input matrix + * @param[in] result_mult_int Value to be multiplied to each element of the input matrix once the result_offset has been added + * @param[in] result_shift Number of bits to shift right the result before converting back to QASYMM8 + * @param[in] min (Optional) Min value used to saturate down the output result before converting back to QASYMM8 + * @param[in] max (Optional) Max value used to saturate up the output result before converting back to QASYMM8. + * Along with @p min, this value can be used to implement "rectified linear unit" activation functions + */ + void configure(const ITensor *input, const ITensor *bias, ITensor *output, int result_offset, int result_mult_int, int result_shift, int min = 0, int max = 0); + /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel + * + * @param[in] input Input tensor. Data type supported: S32 + * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required. + * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input. + * @param[in] output Output tensor. Data type supported: QASYMM8 + * @param[in] min (Optional) Min value used to saturate down the output result before converting back to QASYMM8 + * @param[in] max (Optional) Max value used to saturate up the output result before converting back to QASYMM8. + * Along with @p min, this value can be used to implement "rectified linear unit" activation functions + * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min = 0, int max = 0); + + // Inherited methods overridden: + void run(const Window &window, const ThreadInfo &info) override; + +private: + /** Template function to run the NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel + * + * @param[in] window Region on which to execute the kernel. (Must be a valid region of the window returned by window()). + */ + template <bool is_bounded_relu> + void run(const Window &window); + + /** Common signature for all the specialised NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel functions + * + * @param[in] window Region on which to execute the kernel. + */ + using QuantizeDownFunctionPtr = void (NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel::*)(const Window &window); + + QuantizeDownFunctionPtr _func; + const ITensor *_input; + const ITensor *_bias; + ITensor *_output; + int _result_offset; + int _result_mult_int; + int _result_shift; + int _min; + int _max; +}; +} // namespace arm_compute + +#endif /* __ARM_COMPUTE_NEGEMMLOWPQUANTIZEDOWNINT32TOUINT8SCALEKERNEL_H__ */ diff --git a/arm_compute/core/NEON/kernels/NEGEMMLowpReductionKernel.h b/arm_compute/core/NEON/kernels/NEGEMMLowpReductionKernel.h new file mode 100644 index 000000000..38c353e29 --- /dev/null +++ b/arm_compute/core/NEON/kernels/NEGEMMLowpReductionKernel.h @@ -0,0 +1,127 @@ +/* + * Copyright (c) 2017 ARM Limited.
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_NEGEMMLOWREDUCTIONKERNEL_H__ +#define __ARM_COMPUTE_NEGEMMLOWREDUCTIONKERNEL_H__ + +#include "arm_compute/core/NEON/INEKernel.h" + +namespace arm_compute +{ +class ITensor; + +/** Common interface for all NEON reduction kernels */ +class INEGEMMLowpReductionKernel : public INEKernel +{ +public: + /** Constructor */ + INEGEMMLowpReductionKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers)*/ + INEGEMMLowpReductionKernel(const INEGEMMLowpReductionKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers)*/ + INEGEMMLowpReductionKernel &operator=(const INEGEMMLowpReductionKernel &) = delete; + /** Allow instances of this class to be moved */ + INEGEMMLowpReductionKernel(INEGEMMLowpReductionKernel &&) = default; + /** Allow instances of this class to be moved */ + INEGEMMLowpReductionKernel &operator=(INEGEMMLowpReductionKernel &&) = default; + + /** Initialise the kernel's input and output. + * + * @param[in] input Input tensor. Data type supported: QASYMM8 + * @param[out] output Output row-vector of sums of all the entries in each row/col of input tensor. Data type supported: S32 + * @param[in] k Number of matrix A columns (or matrix B rows) + * @param[in] is_reshaped True if the input tensor has been reshaped + */ + virtual void configure(const ITensor *input, ITensor *output, int32_t k, bool is_reshaped) = 0; + +protected: + const ITensor *_input; + ITensor *_output; + int32_t _k; + bool _is_reshaped; +}; + +/** NEON kernel used to compute the row-vectors of sums of all the entries in each row of Matrix A. + * + * @note This stage is needed to handle the offset of matrix product + * https://github.com/google/gemmlowp/blob/master/doc/low-precision.md + */ +class NEGEMMLowpMatrixAReductionKernel : public INEGEMMLowpReductionKernel +{ +public: + /** Initialise the kernel's input and output. + * + * @param[in] mtx_a Input tensor. Data type supported: QASYMM8 + * @param[out] vector_sum_row Output row-vector of sums of all the entries in each row of mtx_a. 
Data type supported: S32 + * @param[in] num_mtx_a_cols Number of matrix A columns + * @param[in] is_interleaved4x4 True if the matrix A has been interleaved4x4 + */ + void configure(const ITensor *mtx_a, ITensor *vector_sum_row, int32_t num_mtx_a_cols, bool is_interleaved4x4) override; + /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMLowpMatrixAReductionKernel + * + * @param[in] mtx_a Input tensor. Data type supported: QASYMM8 + * @param[in] vector_sum_row Output row-vector of sums of all the entries in each row of mtx_a. Data type supported: S32 + * @param[in] num_mtx_a_cols Number of matrix A columns + * @param[in] is_interleaved4x4 True if the matrix A has been interleaved4x4 + * + * @return a status + */ + static Status validate(const ITensorInfo *mtx_a, const ITensorInfo *vector_sum_row, int32_t num_mtx_a_cols, bool is_interleaved4x4); + + // Inherited methods overridden: + void run(const Window &window, const ThreadInfo &info) override; +}; + +/** NEON kernel used to compute the row-vectors of sums of all the entries in each column of Matrix B. + * + * @note This stage is needed to handle the offset of matrix product + * https://github.com/google/gemmlowp/blob/master/doc/low-precision.md + */ +class NEGEMMLowpMatrixBReductionKernel : public INEGEMMLowpReductionKernel +{ +public: + /** Initialise the kernel's input and output. + * + * @param[in] mtx_b Input tensor. Data type supported: QASYMM8 + * @param[out] vector_sum_col Output row-vector of sums of all the entries in each column of mtx_b. Data type supported: S32 + * @param[in] num_mtx_b_rows Number of matrix B rows + * @param[in] is_transposed1xW True if the input tensor is transposed 1xW + */ + void configure(const ITensor *mtx_b, ITensor *vector_sum_col, int32_t num_mtx_b_rows, bool is_transposed1xW) override; + /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMLowpMatrixBReductionKernel + * + * @param[in] mtx_b Input tensor. Data type supported: QASYMM8 + * @param[in] vector_sum_col Output row-vector of sums of all the entries in each column of mtx_b. Data type supported: S32 + * @param[in] num_mtx_b_rows Number of matrix B rows + * @param[in] is_transposed1xW True if the input tensor is transposed 1xW + * + * @return a status + */ + static Status validate(const ITensorInfo *mtx_b, const ITensorInfo *vector_sum_col, int32_t num_mtx_b_rows, bool is_transposed1xW); + + // Inherited methods overridden: + void run(const Window &window, const ThreadInfo &info) override; +}; +} // namespace arm_compute + +#endif /* __ARM_COMPUTE_NEGEMMLOWREDUCTIONKERNEL_H__ */ diff --git a/arm_compute/core/NEON/kernels/NEGEMMMatrixVectorMultiplyKernel.h b/arm_compute/core/NEON/kernels/NEGEMMMatrixVectorMultiplyKernel.h new file mode 100644 index 000000000..d844af5d5 --- /dev/null +++ b/arm_compute/core/NEON/kernels/NEGEMMMatrixVectorMultiplyKernel.h @@ -0,0 +1,63 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited.
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_NEGEMMMATRIXVECTORMULTIPLYKERNEL_H_ +#define __ARM_COMPUTE_NEGEMMMATRIXVECTORMULTIPLYKERNEL_H_ + +#include "arm_compute/core/NEON/INESimpleKernel.h" + +namespace arm_compute +{ +class ITensor; + +class NEGEMMMatrixVectorMultiplyKernel : public INESimpleKernel +{ +public: + /** Default constructor */ + NEGEMMMatrixVectorMultiplyKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEGEMMMatrixVectorMultiplyKernel(const NEGEMMMatrixVectorMultiplyKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEGEMMMatrixVectorMultiplyKernel &operator=(const NEGEMMMatrixVectorMultiplyKernel &) = delete; + /** Allow instances of this class to be moved */ + NEGEMMMatrixVectorMultiplyKernel(NEGEMMMatrixVectorMultiplyKernel &&) = default; + /** Allow instances of this class to be moved */ + NEGEMMMatrixVectorMultiplyKernel &operator=(NEGEMMMatrixVectorMultiplyKernel &&) = default; + /** Initialise the kernel's input and output. + * + * @param[in] input0 First Input tensor. Data types supported: F16/F32 + * @param[in] input1 Second Input tensor. Data types supported: same as @p input0. + * @param[out] output Output tensor which stores the result of the matrix-vector multiplication. Data type supported: same as @p input0. + */ + void configure(const ITensor *input0, const ITensor *input1, ITensor *output); + + // Inherited methods overridden: + void run(const Window &window, const ThreadInfo &info) override; + +private: + const ITensor *_input0; + const ITensor *_input1; + ITensor *_output; +}; +} // namespace arm_compute +#endif /*__ARM_COMPUTE_NEGEMMMATRIXVECTORMULTIPLYKERNEL_H_*/ diff --git a/arm_compute/core/NEON/kernels/NEGEMMTranspose1xWKernel.h b/arm_compute/core/NEON/kernels/NEGEMMTranspose1xWKernel.h index 4d0bb2a48..e8ee2a7d2 100644 --- a/arm_compute/core/NEON/kernels/NEGEMMTranspose1xWKernel.h +++ b/arm_compute/core/NEON/kernels/NEGEMMTranspose1xWKernel.h @@ -70,10 +70,18 @@ class NEGEMMTranspose1xWKernel : public INESimpleKernel public: /** Initialise the kernel's input and output. * - * @param[in] input Input tensor. Data types supported: U8/S8/QS8/U16/S16/QS16/F16/U32/S32/F32 + * @param[in] input Input tensor. Data types supported: U8/S8/QS8/QASYMM8/U16/S16/QS16/F16/U32/S32/F32 * @param[out] output Output tensor. Data type supported: same as @p input.
*/ void configure(const ITensor *input, ITensor *output); + /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMTranspose1xWKernel + * + * @param[in] input Input tensor info. Data types supported: U8/S8/QS8/QASYMM8/U16/S16/QS16/F16/U32/S32/F32 + * @param[in] output Output tensor info. Data type supported: same as @p input. + * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *output); // Inherited methods overridden: void run(const Window &window, const ThreadInfo &info) override; diff --git a/arm_compute/core/NEON/kernels/NEGaussianPyramidKernel.h b/arm_compute/core/NEON/kernels/NEGaussianPyramidKernel.h index 31779b520..d28501107 100644 --- a/arm_compute/core/NEON/kernels/NEGaussianPyramidKernel.h +++ b/arm_compute/core/NEON/kernels/NEGaussianPyramidKernel.h @@ -49,19 +49,17 @@ public: /** Initialise the kernel's source, destination and border mode. * - * @param[in] input Source tensor. Data type supported: U8. - * @param[out] output Destination tensor. Data type supported: S16. - * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant. + * @param[in] input Source tensor. Data type supported: U8. + * @param[out] output Destination tensor. Data type supported: S16. */ - void configure(const ITensor *input, ITensor *output, bool border_undefined); + void configure(const ITensor *input, ITensor *output); // Inherited methods overridden: void run(const Window &window, const ThreadInfo &info) override; BorderSize border_size() const override; private: - BorderSize _border_size; - int _l2_load_offset; + int _l2_load_offset; }; /** NEON kernel to perform a GaussianPyramid (vertical pass) */ @@ -83,11 +81,10 @@ public: /** Initialise the kernel's source, destination and border mode. * - * @param[in] input Source tensor. Data type supported: S16. - * @param[out] output Destination tensor. Data type supported: U8. - * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant. + * @param[in] input Source tensor. Data type supported: S16. + * @param[out] output Destination tensor. Data type supported: U8. 
*/ - void configure(const ITensor *input, ITensor *output, bool border_undefined); + void configure(const ITensor *input, ITensor *output); // Inherited methods overridden: void run(const Window &window, const ThreadInfo &info) override; diff --git a/arm_compute/core/NEON/kernels/NEHarrisCornersKernel.h b/arm_compute/core/NEON/kernels/NEHarrisCornersKernel.h index 2aef420e4..c3c37e4d2 100644 --- a/arm_compute/core/NEON/kernels/NEHarrisCornersKernel.h +++ b/arm_compute/core/NEON/kernels/NEHarrisCornersKernel.h @@ -99,7 +99,7 @@ private: HarrisScoreFunction *_func; }; -#ifdef ARM_COMPUTE_ENABLE_FP16 +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC /** Interface for the accumulate Weighted kernel using F16 */ template <int32_t block_size> class NEHarrisScoreFP16Kernel : public INEHarrisScoreKernel @@ -118,9 +118,9 @@ private: /** Harris Score function to use for the particular image types passed to configure() */ HarrisScoreFunction *_func; }; -#else /* ARM_COMPUTE_ENABLE_FP16 */ +#else /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ template <int32_t block_size> using NEHarrisScoreFP16Kernel = NEHarrisScoreKernel<block_size>; -#endif /* ARM_COMPUTE_ENABLE_FP16 */ +#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ } // namespace arm_compute #endif /* __ARM_COMPUTE_NEHARRISCORNERSKERNEL_H__ */ diff --git a/arm_compute/core/NEON/kernels/NEHistogramKernel.h b/arm_compute/core/NEON/kernels/NEHistogramKernel.h index 0fa911dbf..672472e08 100644 --- a/arm_compute/core/NEON/kernels/NEHistogramKernel.h +++ b/arm_compute/core/NEON/kernels/NEHistogramKernel.h @@ -82,28 +82,28 @@ public: private: /** Function to merge multiple partial histograms. * - * @param[out] global_hist Pointer to the final histogram. - * @param[in] local_hist Pointer to the partial histograms. - * @param[in] bins Number of bins. + * @param[out] global_hist Pointer to the final histogram. + * @param[in] local_hist Pointer to the partial histograms. + * @param[in] bins Number of bins. */ void merge_histogram(uint32_t *global_hist, const uint32_t *local_hist, size_t bins); /** Function to merge multiple minimum values of partial histograms. * - * @param[out] global_min Pointer to the global min value. - * @param[in] local_min Local min value. + * @param[out] global_min Pointer to the global min value. + * @param[in] local_min Local min value. */ void merge_min(uint8_t *global_min, const uint8_t &local_min); /** Function to perform histogram on the given window - * - * @param[in] win Region on which to execute the kernel - * @param[in] info Info about the executing thread + * + * @param[in] win Region on which to execute the kernel + * @param[in] info Info about the executing thread */ void histogram_U8(Window win, const ThreadInfo &info); /** Function to perform histogram on the given window where histogram is * of fixed size 256 without ranges and offsets. * - * @param[in] win Region on which to execute the kernel - * @param[in] info Info about the executing thread + * @param[in] win Region on which to execute the kernel + * @param[in] info Info about the executing thread */ void histogram_fixed_U8(Window win, const ThreadInfo &info); /** Pre-calculate the pixel windowing for every possible pixel diff --git a/arm_compute/core/NEON/kernels/NEIm2ColKernel.h b/arm_compute/core/NEON/kernels/NEIm2ColKernel.h index 1a0735ea8..bc12b22e5 100644 --- a/arm_compute/core/NEON/kernels/NEIm2ColKernel.h +++ b/arm_compute/core/NEON/kernels/NEIm2ColKernel.h @@ -73,13 +73,27 @@ public: /** Set the input and output of the kernel. 
* * @param[in] input The input tensor to convert. 3 lower dimensions represent a single input [width, height, IFM], - * while every optional dimension from 4 and above represent a batch of inputs. Data types supported: QS8/QS16/F16/F32 + * while every optional dimension from 4 and above represent a batch of inputs. Data types supported: QS8/QS16/QASYMM8/F16/F32 + * Note: QASYMM8 works only for has_bias = false * @param[out] output The output tensor. Data types supported: Same as @p input * @param[in] kernel_dims The kernel dimensions (width and height). * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. * @param[in] has_bias In case biases are provided expands the matrix with 1. */ void configure(const ITensor *input, ITensor *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias); + /** Static function to check if given info will lead to a valid configuration of @ref NEIm2ColKernel + * + * @param[in] input The input tensor to convert. 3 lower dimensions represent a single input [width, height, IFM], + * while every optional dimension from 4 and above represent a batch of inputs. Data types supported: QS8/QS16/QASYMM8/F16/F32 + * Note: QASYMM8 works only for has_bias = false + * @param[in] output The output tensor. Data types supported: Same as @p input + * @param[in] kernel_dims The kernel dimensions (width and height). + * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. + * @param[in] has_bias In case biases are provided expands the matrix with 1. + * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias); // Inherited methods overridden: void run(const Window &window, const ThreadInfo &info) override; diff --git a/arm_compute/core/NEON/kernels/NEL2NormalizeKernel.h b/arm_compute/core/NEON/kernels/NEL2NormalizeLayerKernel.h index fbbe4bee9..7aa5116b6 100644 --- a/arm_compute/core/NEON/kernels/NEL2NormalizeKernel.h +++ b/arm_compute/core/NEON/kernels/NEL2NormalizeLayerKernel.h @@ -31,21 +31,21 @@ namespace arm_compute class ITensor; /** Interface for performing a L2 normalize on a given axis given the square sum of it in this axis */ -class NEL2NormalizeKernel : public INEKernel +class NEL2NormalizeLayerKernel : public INEKernel { public: /** Default constructor */ - NEL2NormalizeKernel(); + NEL2NormalizeLayerKernel(); /** Prevent instances of this class from being copied (As this class contains pointers) */ - NEL2NormalizeKernel(const NEL2NormalizeKernel &) = delete; + NEL2NormalizeLayerKernel(const NEL2NormalizeLayerKernel &) = delete; /** Prevent instances of this class from being copied (As this class contains pointers) */ - NEL2NormalizeKernel &operator=(const NEL2NormalizeKernel &) = delete; + NEL2NormalizeLayerKernel &operator=(const NEL2NormalizeLayerKernel &) = delete; /** Allow instances of this class to be moved */ - NEL2NormalizeKernel(NEL2NormalizeKernel &&) = default; + NEL2NormalizeLayerKernel(NEL2NormalizeLayerKernel &&) = default; /** Allow instances of this class to be moved */ - NEL2NormalizeKernel &operator=(NEL2NormalizeKernel &&) = default; + NEL2NormalizeLayerKernel &operator=(NEL2NormalizeLayerKernel &&) = default; /** Default destructor */ - ~NEL2NormalizeKernel() = default; + ~NEL2NormalizeLayerKernel() = default; /** Set the input and output tensors. * * @param[in] input Source tensor. Data types supported: F32. 
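As a point of reference for the NEIm2ColKernel documentation above: the following is a minimal scalar sketch of the im2col idea for a single [width, height, IFM] input, with one row per output location and a trailing 1 appended when has_bias is true. The CHW element ordering, the function name and the zero-filled padding handling are assumptions made for illustration only and do not reflect the library's actual kernel implementation or memory layout.

#include <cstddef>
#include <vector>

// Illustrative scalar im2col (hypothetical helper, not an arm_compute API):
// each output row holds one kernel_w * kernel_h * ifm convolution patch,
// plus a trailing 1 when has_bias is true, so a following GEMM against the
// reshaped weights computes the convolution.
std::vector<float> im2col_sketch(const std::vector<float> &in,
                                 int width, int height, int ifm,
                                 int kernel_w, int kernel_h,
                                 int stride_x, int stride_y,
                                 int pad_x, int pad_y,
                                 bool has_bias)
{
    const int out_w   = (width + 2 * pad_x - kernel_w) / stride_x + 1;
    const int out_h   = (height + 2 * pad_y - kernel_h) / stride_y + 1;
    const int row_len = kernel_w * kernel_h * ifm + (has_bias ? 1 : 0);

    std::vector<float> out(static_cast<std::size_t>(out_w) * out_h * row_len, 0.f);

    for (int oy = 0; oy < out_h; ++oy)
    {
        for (int ox = 0; ox < out_w; ++ox)
        {
            float *row = &out[(static_cast<std::size_t>(oy) * out_w + ox) * row_len];
            int    idx = 0;
            for (int c = 0; c < ifm; ++c)
            {
                for (int ky = 0; ky < kernel_h; ++ky)
                {
                    for (int kx = 0; kx < kernel_w; ++kx)
                    {
                        const int x = ox * stride_x - pad_x + kx;
                        const int y = oy * stride_y - pad_y + ky;
                        // Elements that fall in the padding region stay zero.
                        row[idx++] = (x >= 0 && x < width && y >= 0 && y < height)
                                         ? in[(static_cast<std::size_t>(c) * height + y) * width + x]
                                         : 0.f;
                    }
                }
            }
            if (has_bias)
            {
                row[idx] = 1.f; // corresponds to "expands the matrix with 1" in the comments above
            }
        }
    }
    return out;
}

Each such row can then be multiplied against the correspondingly reshaped weight matrix with a GEMM (the role played by the GEMM kernels elsewhere in this patch), producing one output element per row; this is the usual motivation for an im2col stage in a convolution layer.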
diff --git a/arm_compute/core/NEON/kernels/NEMagnitudePhaseKernel.h b/arm_compute/core/NEON/kernels/NEMagnitudePhaseKernel.h index b853d2245..76c616360 100644 --- a/arm_compute/core/NEON/kernels/NEMagnitudePhaseKernel.h +++ b/arm_compute/core/NEON/kernels/NEMagnitudePhaseKernel.h @@ -66,17 +66,17 @@ public: private: /** Function to perform magnitude on the given window * - * @param[in] window Region on which to execute the kernel + * @param[in] window Region on which to execute the kernel */ void magnitude(const Window &window); /** Function to perform phase on the given window * - * @param[in] window Region on which to execute the kernel + * @param[in] window Region on which to execute the kernel */ void phase(const Window &window); /** Function to perform magnitude and phase on the given window * - * @param[in] window Region on which to execute the kernel + * @param[in] window Region on which to execute the kernel */ void magnitude_phase(const Window &window); @@ -94,7 +94,7 @@ private: ITensor *_phase; /**< Output - Phase */ }; -#ifdef ARM_COMPUTE_ENABLE_FP16 +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC /** Template interface for the kernel to compute magnitude and phase */ template <MagnitudeType mag_type, PhaseType phase_type> class NEMagnitudePhaseFP16Kernel : public INEKernel @@ -130,17 +130,17 @@ public: private: /** Function to perform magnitude on the given window * - * @param[in] window Region on which to execute the kernel + * @param[in] window Region on which to execute the kernel */ void magnitude(const Window &window); /** Function to perform phase on the given window * - * @param[in] window Region on which to execute the kernel + * @param[in] window Region on which to execute the kernel */ void phase(const Window &window); /** Function to perform magnitude and phase on the given window * - * @param[in] window Region on which to execute the kernel + * @param[in] window Region on which to execute the kernel */ void magnitude_phase(const Window &window); @@ -156,9 +156,9 @@ private: ITensor *_magnitude; /**< Output - Magnitude */ ITensor *_phase; /**< Output - Phase */ }; -#else /* ARM_COMPUTE_ENABLE_FP16 */ +#else /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ template <MagnitudeType mag_type, PhaseType phase_type> using NEMagnitudePhaseFP16Kernel = NEMagnitudePhaseKernel<mag_type, phase_type>; -#endif /* ARM_COMPUTE_ENABLE_FP16 */ +#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ } // namespace arm_compute #endif /* __ARM_COMPUTE_NEMAGNITUDEPHASEKERNEL_H__ */ diff --git a/arm_compute/core/NEON/kernels/NENonMaximaSuppression3x3Kernel.h b/arm_compute/core/NEON/kernels/NENonMaximaSuppression3x3Kernel.h index 3bce1a99f..da8aecff5 100644 --- a/arm_compute/core/NEON/kernels/NENonMaximaSuppression3x3Kernel.h +++ b/arm_compute/core/NEON/kernels/NENonMaximaSuppression3x3Kernel.h @@ -78,7 +78,7 @@ protected: ITensor *_output; /**< Destination tensor */ }; -#ifdef ARM_COMPUTE_ENABLE_FP16 +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC /** NEON kernel to perform Non-Maxima suppression 3x3 with intermediate results in F16 if the input data type is F32 */ class NENonMaximaSuppression3x3FP16Kernel : public NENonMaximaSuppression3x3Kernel @@ -92,8 +92,8 @@ public: */ void configure(const ITensor *input, ITensor *output, bool border_undefined); }; -#else /* ARM_COMPUTE_ENABLE_FP16 */ +#else /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ using NENonMaximaSuppression3x3FP16Kernel = NENonMaximaSuppression3x3Kernel; -#endif /* ARM_COMPUTE_ENABLE_FP16 */ +#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ } 
// namespace arm_compute #endif /* _ARM_COMPUTE_NENONMAXIMASUPPRESSION3x3KERNEL_H__ */ diff --git a/arm_compute/core/NEON/kernels/NENormalizationLayerKernel.h b/arm_compute/core/NEON/kernels/NENormalizationLayerKernel.h index 40fae3520..405daf106 100644 --- a/arm_compute/core/NEON/kernels/NENormalizationLayerKernel.h +++ b/arm_compute/core/NEON/kernels/NENormalizationLayerKernel.h @@ -57,6 +57,18 @@ public: * @param[in] norm_info Normalization layer information like the normalization type, normalization size and other parameters. */ void configure(const ITensor *input, const ITensor *input_squared, ITensor *output, NormalizationLayerInfo norm_info); + /** Static function to check if given info will lead to a valid configuration of @ref NENormalizationLayerKernel + * + * @param[in] input Source tensor. 3 lower dims represent a single input with dimensions [width, height, IFM], + * and an optional 4th dimension for batch of inputs. Data types supported: QS8/QS16/FP16/F32. + * @param[in] input_squared Source with each element has been squared. 3 lower dims represent a single input with dimensions [width, height, IFM], + * Data type supported: same as @p input + * @param[in] output Destination tensor. Output will have the same number of dimensions as input. Data type supported: same as @p input + * @param[in] norm_info Normalization layer information like the normalization type, normalization size and other parameters. + * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *input_squared, const ITensorInfo *output, NormalizationLayerInfo norm_info); // Inherited methods overridden: void run(const Window &window, const ThreadInfo &info) override; diff --git a/arm_compute/core/NEON/kernels/NEPixelWiseMultiplicationKernel.h b/arm_compute/core/NEON/kernels/NEPixelWiseMultiplicationKernel.h index baa4112ca..10f990e7e 100644 --- a/arm_compute/core/NEON/kernels/NEPixelWiseMultiplicationKernel.h +++ b/arm_compute/core/NEON/kernels/NEPixelWiseMultiplicationKernel.h @@ -62,6 +62,23 @@ public: * @param[in] rounding_policy Rounding policy. */ void configure(const ITensor *input1, const ITensor *input2, ITensor *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy); + /** Static function to check if given info will lead to a valid configuration of @ref NEPixelWiseMultiplicationKernel + * + * @note For @p scale equal to 1/255 only round to nearest even (implemented as round half up) is supported. + * For all other scale values only round to zero (implemented as round towards minus infinity) is supported. + * For QS8/QS16 scale = 1 is the only supported value. + * + * @param[in] input1 An input tensor. Data types supported: U8/QS8/QS16/S16/F16/F32 + * @param[in] input2 An input tensor. Data types supported: U8, QS8 (only if @p input1 is QS8), QS16 (only if @p input1 is QS16), S16/F16 (only if @p input1 is F16), F32 (only if @p input1 is F32). + * @param[in] output The output tensor. Data types supported: U8 (Only if both inputs are U8), QS8 (only if both inputs are QS8), QS16 (only if both inputs are QS16), S16/F16 (only if @p input1 is F16), F32 (only if both inputs are F32). + * @param[in] scale Scale to apply after multiplication. + * Scale must be positive and its value must be either 1/255 or 1/2^n where n is between 0 and 15. + * @param[in] overflow_policy Overflow policy. + * @param[in] rounding_policy Rounding policy. 
+ * + * @return a status + */ + static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy); // Inherited methods overridden: void run(const Window &window, const ThreadInfo &info) override; diff --git a/arm_compute/core/NEON/kernels/NEPoolingLayerKernel.h b/arm_compute/core/NEON/kernels/NEPoolingLayerKernel.h index 9d7c75179..87d14e5f9 100644 --- a/arm_compute/core/NEON/kernels/NEPoolingLayerKernel.h +++ b/arm_compute/core/NEON/kernels/NEPoolingLayerKernel.h @@ -55,6 +55,17 @@ public: * @param[in] pool_info Contains pooling operation information described in @ref PoolingLayerInfo. */ void configure(const ITensor *input, ITensor *output, const PoolingLayerInfo &pool_info); + /** Static function to check if given info will lead to a valid configuration of @ref NEPoolingLayerKernel + * + * @note QS8, QS16 and F16 are supported for pool sizes 2 and 3 only + * + * @param[in] input Source tensor. Data types supported: QS8/QS16/F16/F32. + * @param[in] output Destination tensor. Data types supported: Same as @p input. + * @param[in] pool_info Contains pooling operation information described in @ref PoolingLayerInfo. + * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info); // Inherited methods overridden: void run(const Window &window, const ThreadInfo &info) override; @@ -66,14 +77,14 @@ private: * @param[in] window_input Input region on which to execute the kernel. * @param[in] window Output region on which to execute the kernel. */ - template <PoolingType pooling_type> + template <PoolingType pooling_type, bool exclude_padding = false> void pooling2_f32(const Window &window_input, const Window &window); /** Function to perform 2x2 pooling for float16_t. * * @param[in] window_input Input region on which to execute the kernel. * @param[in] window Output region on which to execute the kernel. */ - template <PoolingType pooling_type> + template <PoolingType pooling_type, bool exclude_padding = false> void pooling2_f16(const Window &window_input, const Window &window); /** Function to perform 2x2 pooling for 8bit fixed point. @@ -95,14 +106,14 @@ private: * @param[in] window_input Input region on which to execute the kernel. * @param[in] window Output region on which to execute the kernel. */ - template <PoolingType pooling_type> + template <PoolingType pooling_type, bool exclude_padding = false> void pooling3_f32(const Window &window_input, const Window &window); /** Function to perform 3x3 pooling. * * @param[in] window_input Input region on which to execute the kernel. * @param[in] window Output region on which to execute the kernel. */ - template <PoolingType pooling_type> + template <PoolingType pooling_type, bool exclude_padding = false> void pooling3_f16(const Window &window_input, const Window &window); /** Function to perform 3x3 pooling for 8bit fixed point. * @@ -123,14 +134,14 @@ private: * @param[in] window_input Input region on which to execute the kernel. * @param[in] window Output region on which to execute the kernel. */ - template <PoolingType pooling_type> + template <PoolingType pooling_type, bool exclude_padding = false> void pooling7_f32(const Window &window_input, const Window &window); /** Function to perform NxN pooling. * * @param[in] window_input Input region on which to execute the kernel. * @param[in] window Output region on which to execute the kernel. 
*/ - template <PoolingType pooling_type> + template <PoolingType pooling_type, bool exclude_padding = false> void poolingN_f32(const Window &window_input, const Window &window); /** Common signature for all the specialised Pooling functions * @@ -144,7 +155,7 @@ private: const ITensor *_input; ITensor *_output; PoolingLayerInfo _pool_info; - int _num_elems_processed_per_iteration; + unsigned int _num_elems_processed_per_iteration; BorderSize _border_size; }; } // namespace arm_compute diff --git a/arm_compute/core/NEON/kernels/NERemapKernel.h b/arm_compute/core/NEON/kernels/NERemapKernel.h index 5806275ce..7aa5de7a3 100644 --- a/arm_compute/core/NEON/kernels/NERemapKernel.h +++ b/arm_compute/core/NEON/kernels/NERemapKernel.h @@ -60,6 +60,7 @@ public: // Inherited methods overridden: void run(const Window &window, const ThreadInfo &info) override; + BorderSize border_size() const override; private: /** function to perform nearest interpolation on the given window */ diff --git a/arm_compute/core/NEON/kernels/NEScaleKernel.h b/arm_compute/core/NEON/kernels/NEScaleKernel.h index 5ec585484..ac154d445 100644 --- a/arm_compute/core/NEON/kernels/NEScaleKernel.h +++ b/arm_compute/core/NEON/kernels/NEScaleKernel.h @@ -59,8 +59,10 @@ public: * @param[out] output Destination tensor. Data types supported: Same as @p input. All but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only performed within the XY-plane. * @param[in] policy Interpolation type to use * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant. + * @param[in] sampling_policy (Optional) Sampling policy used by the interpolation. Defaults to @ref SamplingPolicy::CENTER */ - void configure(const ITensor *input, const ITensor *dx, const ITensor *dy, const ITensor *offsets, ITensor *output, InterpolationPolicy policy, bool border_undefined); + void configure(const ITensor *input, const ITensor *dx, const ITensor *dy, const ITensor *offsets, ITensor *output, InterpolationPolicy policy, bool border_undefined, + SamplingPolicy sampling_policy = SamplingPolicy::CENTER); // Inherited methods overridden: void run(const Window &window, const ThreadInfo &info) override; diff --git a/arm_compute/core/NEON/kernels/NESoftmaxLayerKernel.h b/arm_compute/core/NEON/kernels/NESoftmaxLayerKernel.h index cce21569d..0fecfac15 100644 --- a/arm_compute/core/NEON/kernels/NESoftmaxLayerKernel.h +++ b/arm_compute/core/NEON/kernels/NESoftmaxLayerKernel.h @@ -43,6 +43,14 @@ public: * @param[out] output Destination tensor. Data types supported: same as @p input */ void configure(const ITensor *input, ITensor *output); + /** Static function to check if given info will lead to a valid configuration of @ref NELogits1DMaxKernel + * + * @param[in] input Source tensor. Data types supported: QS8/QS16/F16/F32 + * @param[in] output Destination tensor. Data types supported: same as @p input + * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *output); // Inherited methods overridden: void run(const Window &window, const ThreadInfo &info) override; @@ -78,14 +86,26 @@ public: * @param[in] max Max values tensor. Data types supported: same as @p input. * @param[out] output Destination tensor. Data types supported: same as @p input. * @param[out] sum Sum of 1D logits tensor. Data types supported: same as @p input. + * @param[in] beta (Optional) A scaling factor for the exponent. QS8/QS16 only support a beta value of 1. 
*/ - void configure(const ITensor *input, const ITensor *max, ITensor *output, ITensor *sum); + void configure(const ITensor *input, const ITensor *max, ITensor *output, ITensor *sum, float beta = 1.0f); + /** Static function to check if given info will lead to a valid configuration of @ref NELogits1DShiftExpSumKernel + * + * @param[in] input Source tensor. Data types supported: QS8/QS16/F16/F32 + * @param[in] max Max values tensor. Data types supported: same as @p input + * @param[in] output Destination tensor. Data types supported: same as @p input. + * @param[in] sum Sum of 1D logits tensor. Data types supported: same as @p input. + * @param[in] beta (Optional) A scaling factor for the exponent. QS8/QS16 only support a beta value of 1. + * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *max, const ITensorInfo *output, const ITensorInfo *sum, float beta = 1.0f); // Inherited methods overridden: void run(const Window &window, const ThreadInfo &info) override; private: - using Logits1DShiftExpSumFunction = void(const ITensor *in, const ITensor *max, ITensor *out, ITensor *sum, const Window &window); + using Logits1DShiftExpSumFunction = void(const ITensor *in, const ITensor *max, ITensor *out, ITensor *sum, const Window &window, float beta); private: Logits1DShiftExpSumFunction *_func; @@ -93,6 +113,7 @@ private: const ITensor *_max; ITensor *_output; ITensor *_sum; + float _beta; }; /** Interface for calculating the final step of the Softmax Layer where each logit value is multiplied by the inverse of the sum of the logits. */ @@ -118,6 +139,15 @@ public: * @param[out] output Destination tensor. Data types supported: same as @p input. */ void configure(const ITensor *input, const ITensor *sum, ITensor *output); + /** Static function to check if given info will lead to a valid configuration of @ref NELogits1DNormKernel + * + * @param[in] input Source tensor. Data types supported: QS8/QS16/S32/F16/F32 + * @param[in] sum Sum tensor. The number of dimensions should be dim(input)-1. Data types supported: same as @p input. + * @param[in] output Destination tensor. Data types supported: same as @p input. + * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *sum, const ITensorInfo *output); // Inherited methods overridden: void run(const Window &window, const ThreadInfo &info) override; diff --git a/arm_compute/core/NEON/kernels/NETransposeKernel.h b/arm_compute/core/NEON/kernels/NETransposeKernel.h index 71bd27437..855d270e4 100644 --- a/arm_compute/core/NEON/kernels/NETransposeKernel.h +++ b/arm_compute/core/NEON/kernels/NETransposeKernel.h @@ -53,10 +53,18 @@ public: /** Initialise the kernel's input and output. * - * @param[in] input Input tensor. Data types supported: U8/S8/QS8/U16/S16/QS16/F16/U32/S32/F32 + * @param[in] input Input tensor. Data types supported: U8/S8/QS8/QASYMM8/U16/S16/QS16/F16/U32/S32/F32 * @param[out] output Output tensor. Data type supported: Same as @p input */ void configure(const ITensor *input, ITensor *output); + /** Static function to check if given info will lead to a valid configuration of @ref NETransposeKernel + * + * @param[in] input Input tensor. Data types supported: U8/S8/QS8/QASYMM8/U16/S16/QS16/F16/U32/S32/F32 + * @param[in] output Output tensor. 
Data type supported: Same as @p input + * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *output); // Inherited methods overridden: void run(const Window &window, const ThreadInfo &info) override; diff --git a/arm_compute/core/NEON/kernels/NEWarpKernel.h b/arm_compute/core/NEON/kernels/NEWarpKernel.h index 3a1cab158..d7cb82f27 100644 --- a/arm_compute/core/NEON/kernels/NEWarpKernel.h +++ b/arm_compute/core/NEON/kernels/NEWarpKernel.h @@ -66,17 +66,17 @@ public: protected: /** function to perform warp affine or warp perspective on the given window when border mode == UNDEFINED * - * @param[in] window Region on which to execute the kernel + * @param[in] window Region on which to execute the kernel */ virtual void warp_undefined(const Window &window) = 0; /** function to perform warp affine or warp perspective on the given window when border mode == CONSTANT * - * @param[in] window Region on which to execute the kernel + * @param[in] window Region on which to execute the kernel */ virtual void warp_constant(const Window &window) = 0; /** function to perform warp affine or warp perspective on the given window when border mode == REPLICATE * - * @param[in] window Region on which to execute the kernel + * @param[in] window Region on which to execute the kernel */ virtual void warp_replicate(const Window &window) = 0; /** Common signature for all the specialised warp functions diff --git a/arm_compute/core/NEON/kernels/NEWinogradLayerKernel.h b/arm_compute/core/NEON/kernels/NEWinogradLayerKernel.h new file mode 100644 index 000000000..c1343044a --- /dev/null +++ b/arm_compute/core/NEON/kernels/NEWinogradLayerKernel.h @@ -0,0 +1,97 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef __ARM_COMPUTE_NEGEMMWINOGRADLAYERKERNEL_H__ +#define __ARM_COMPUTE_NEGEMMWINOGRADLAYERKERNEL_H__ + +#include "arm_compute/core/NEON/INEKernel.h" +#include "arm_compute/core/NEON/kernels/winograd/tensor.hpp" + +namespace arm_compute +{ +class ITensor; +class NEWinogradLayerKernel; +class Winograd3x3F32 +{ +public: + friend class NEWinogradLayerKernel; + Winograd3x3F32(const KernelShape &kernel_shape, const Tensor4DShape input_shape, const PaddingType padding_type, void *kernel_storage); + ~Winograd3x3F32(); + std::pair<void *, void *> get_nhwc_ptrs(const Tensor4DShape &input_shape, const PaddingType padding_type, void *working_space); + void transform_weights(const void *const kernel, void *transform_working_space); + void reshape_input(const Tensor4DShape &input_shape, const PaddingType padding_type, const void *const input, void *working_space); + void reshape_output(const Tensor4DShape &input_shape, const PaddingType padding_type, void *const output); + void nchw2nhwc(const Tensor4DShape &input_shape, const PaddingType padding_type, void *working_space, const void *const input); + void nhwc2nchw(const Tensor4DShape &input_shape, const PaddingType padding_type, void *working_space, void *const output); + +private: + class Private; + std::unique_ptr<Private> _pimpl; +}; + +class NEWinogradLayerKernel : public INEKernel +{ +public: + /** Constructor */ + NEWinogradLayerKernel(); + + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEWinogradLayerKernel(const NEWinogradLayerKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEWinogradLayerKernel &operator=(const NEWinogradLayerKernel &) = delete; + /** Allow instances of this class to be moved */ + NEWinogradLayerKernel(NEWinogradLayerKernel &&) = default; + /** Allow instances of this class to be moved */ + NEWinogradLayerKernel &operator=(NEWinogradLayerKernel &&) = default; + + virtual ~NEWinogradLayerKernel() = default; + + /** Initialise the kernel + * + * @param[in,out] output Output tensor to store the result of matrix multiplication. Data type supported: F32. + * @param[in] convolver A pointer to the Winograd convolver; this object must have been configured and be ready to execute 16 GEMMs. + */ + void configure(ITensor *output, Winograd3x3F32 *convolver); + + // Inherited methods overridden: + void run(const Window &window, const ThreadInfo &info) override; + + /* Get the memory required to instantiate a new Winograd operator. + */ + static size_t get_kernel_storage_size(const KernelShape &shape); + + /* Get the memory required to apply a Winograd operator to some input. + */ + static size_t get_working_space_size(const Tensor4DShape &input_shape, const KernelShape &k_shape, const PaddingType padding); + + /* Get the memory required to transform the kernel. + */ + static size_t get_kernel_transform_working_size(const KernelShape &shape); + +protected: + Winograd3x3F32 *_convolver; + ITensor *_output; +}; + +} // namespace arm_compute +#endif /*__ARM_COMPUTE_NEGEMMWINOGRADLAYERKERNEL_H__*/ diff --git a/arm_compute/core/NEON/kernels/arm64/NEGEMMLowpAArch64A53Kernel.h b/arm_compute/core/NEON/kernels/arm64/NEGEMMLowpAArch64A53Kernel.h new file mode 100644 index 000000000..33cd2d42d --- /dev/null +++ b/arm_compute/core/NEON/kernels/arm64/NEGEMMLowpAArch64A53Kernel.h @@ -0,0 +1,55 @@ +/* + * Copyright (c) 2017 ARM Limited.
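The Winograd header above only declares the building blocks; the call order below is an assumed orchestration sketch (the helper run_winograd_sketch is hypothetical, not part of the library) showing how the static size queries, the Winograd3x3F32 convolver and the kernel's configure() are expected to fit together:

#include <cstdint>
#include <vector>
#include "arm_compute/core/NEON/kernels/NEWinogradLayerKernel.h"

// Assumed usage sketch; shapes, weights and tensors are prepared by the caller.
void run_winograd_sketch(const KernelShape &kshape, const Tensor4DShape &ishape, PaddingType pad,
                         const void *weights, const void *input, void *output,
                         arm_compute::ITensor *gemm_output)
{
    using namespace arm_compute;

    // Query the storage requirements before constructing the operator.
    std::vector<std::uint8_t> kernel_storage(NEWinogradLayerKernel::get_kernel_storage_size(kshape));
    std::vector<std::uint8_t> working_space(NEWinogradLayerKernel::get_working_space_size(ishape, kshape, pad));
    std::vector<std::uint8_t> transform_space(NEWinogradLayerKernel::get_kernel_transform_working_size(kshape));

    Winograd3x3F32 convolver(kshape, ishape, pad, kernel_storage.data());
    convolver.transform_weights(weights, transform_space.data());      // once per weight set
    convolver.reshape_input(ishape, pad, input, working_space.data()); // per inference

    NEWinogradLayerKernel kernel;
    kernel.configure(gemm_output, &convolver); // runs the 16 batched GEMMs when scheduled
    // ... schedule the kernel, e.g. through the NEON scheduler ...
    convolver.reshape_output(ishape, pad, output);                     // per inference
}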
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_NEGEMMLOWPAARCH64A53KERNEL_H__ +#define __ARM_COMPUTE_NEGEMMLOWPAARCH64A53KERNEL_H__ + +#include "arm_compute/core/NEON/kernels/NEGEMMAssemblyBaseKernel.h" + +// Enable only if compiled for AArch64-V8A targets +#ifdef ARM_COMPUTE_AARCH64_V8A + +namespace arm_compute +{ +class ITensor; + +/** AArch64 NEON kernel to multiply two input matrices "A" and "B". */ +class NEGEMMLowpAArch64A53Kernel : public NEGEMMAssemblyBaseKernel +{ +public: + /** Default constructor */ + NEGEMMLowpAArch64A53Kernel(); + // Inherited methods overridden: + void run(const Window &window, const ThreadInfo &info) override; + +protected: + void internal_configure(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool transform_0, bool transform_1) override; + +private: + using NEGEMMLowpAArch64A53 = void(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool transform_0, bool transform_1, const Window &window, + const ThreadInfo &info); + NEGEMMLowpAArch64A53 *_func; +}; +} // namespace arm_compute +#endif /* ARM_COMPUTE_AARCH64_V8A */ +#endif /*__ARM_COMPUTE_NEGEMMLOWPAARCH64A53KERNEL_H__*/ diff --git a/arm_compute/core/NEON/kernels/arm64/NEGEMMLowpAArch64Kernel.h b/arm_compute/core/NEON/kernels/arm64/NEGEMMLowpAArch64Kernel.h new file mode 100644 index 000000000..a93df033d --- /dev/null +++ b/arm_compute/core/NEON/kernels/arm64/NEGEMMLowpAArch64Kernel.h @@ -0,0 +1,56 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_NEGEMMLOWPAARCH64KERNEL_H__ +#define __ARM_COMPUTE_NEGEMMLOWPAARCH64KERNEL_H__ + +#include "arm_compute/core/NEON/kernels/NEGEMMAssemblyBaseKernel.h" + +// Enable only if compiled for AArch64-V8A targets +#ifdef ARM_COMPUTE_AARCH64_V8A + +namespace arm_compute +{ +class ITensor; + +/** AArch64 NEON kernel to multiply two input matrices "A" and "B". */ +class NEGEMMLowpAArch64Kernel : public NEGEMMAssemblyBaseKernel +{ +public: + /** Default constructor */ + NEGEMMLowpAArch64Kernel(); + + // Inherited methods overridden: + void run(const Window &window, const ThreadInfo &info) override; + +protected: + void internal_configure(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool transform_0, bool transform_1) override; + +private: + using NEGEMMLowpAArch64 = void(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool transform_0, bool transform_1, const Window &window, + const ThreadInfo &info); + NEGEMMLowpAArch64 *_func; +}; +} // namespace arm_compute +#endif /* ARM_COMPUTE_AARCH64_V8A */ +#endif /*__ARM_COMPUTE_NEGEMMLOWPAARCH64KERNEL_H__*/ diff --git a/arm_compute/core/NEON/kernels/arm64/NEGEMMLowpAArch64V8P4Kernel.h b/arm_compute/core/NEON/kernels/arm64/NEGEMMLowpAArch64V8P4Kernel.h new file mode 100644 index 000000000..b03e5fa1a --- /dev/null +++ b/arm_compute/core/NEON/kernels/arm64/NEGEMMLowpAArch64V8P4Kernel.h @@ -0,0 +1,58 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_NEGEMMLOWPAARCH64V8P4KERNEL_H__ +#define __ARM_COMPUTE_NEGEMMLOWPAARCH64V8P4KERNEL_H__ + +#include "arm_compute/core/NEON/kernels/NEGEMMAssemblyBaseKernel.h" + +// Enable only if compiled for AArch64-V8.2-A targets +#ifdef ARM_COMPUTE_AARCH64_V8_2 + +namespace arm_compute +{ +class ITensor; + +/** AArch64 NEON kernel to multiply two input matrices "A" and "B". 
*/ +class NEGEMMLowpAArch64V8P4Kernel : public NEGEMMAssemblyBaseKernel +{ +public: + // Inherited methods overridden: + void run(const Window &window, const ThreadInfo &info) override; + /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMAssemblyBaseKernel + * + * The computed function is C = a * AxB + b * C. + * + * @param[in] input0 Input tensor info containing the Matrix A. Data types supported: QASYMM8 + * @param[in] input1 Input tensor info containing the Matrix B. Data types supported: same as @p input0 + * @param[in] output Output tensor info to store the result of matrix multiplication. + * If @p beta is not zero the values are multiplied by @p beta before the result is accumulated. Otherwise the values are overwritten by the result. Data types supported: S32 + */ + static Status validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output); + +protected: + void internal_configure(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool transform_0, bool transform_1) override; +}; +} // namespace arm_compute +#endif /* ARM_COMPUTE_AARCH64_V8_2 */ +#endif /*__ARM_COMPUTE_NEGEMMLOWPAARCH64V8P4KERNEL_H__*/ diff --git a/arm_compute/core/NEON/kernels/arm64/NEHGEMMAArch64FP16Kernel.h b/arm_compute/core/NEON/kernels/arm64/NEHGEMMAArch64FP16Kernel.h new file mode 100644 index 000000000..9480a6a5d --- /dev/null +++ b/arm_compute/core/NEON/kernels/arm64/NEHGEMMAArch64FP16Kernel.h @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_NEHGEMMAARCH64FP16KERNEL_H__ +#define __ARM_COMPUTE_NEHGEMMAARCH64FP16KERNEL_H__ + +#include "arm_compute/core/NEON/kernels/NEGEMMAssemblyBaseKernel.h" + +namespace arm_compute +{ +class ITensor; + +/** AArch64 NEON kernel to multiply two input matrices "A" and "B". 
*/ +class NEHGEMMAArch64FP16Kernel : public NEGEMMAssemblyBaseKernel +{ +public: + // Inherited methods overridden: + void run(const Window &window, const ThreadInfo &info) override; + +protected: + void internal_configure(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool transform_0, bool transform_1) override; +}; +} // namespace arm_compute +#endif /*__ARM_COMPUTE_NEHGEMMAARCH64FP16KERNEL_H__*/ diff --git a/arm_compute/core/NEON/kernels/assembly/gemm_common.hpp b/arm_compute/core/NEON/kernels/assembly/gemm_common.hpp index 00974436f..ef89e3aac 100644 --- a/arm_compute/core/NEON/kernels/assembly/gemm_common.hpp +++ b/arm_compute/core/NEON/kernels/assembly/gemm_common.hpp @@ -28,6 +28,6 @@ template<typename To, typename Tr> class GemmCommon { public: virtual size_t get_working_size() const = 0; - virtual void execute(const To *, const int, const To *, const int, Tr *, const int, const Tr, const Tr, void *working_space = NULL) const = 0; + virtual void execute(const To *, const int, const To *, const int, Tr *, const int, const Tr, const Tr, void *working_space) const = 0; virtual ~GemmCommon() { } }; diff --git a/arm_compute/core/NEON/kernels/assembly/gemm_interleaved.hpp b/arm_compute/core/NEON/kernels/assembly/gemm_interleaved.hpp index f7d3a94fa..659ef837f 100644 --- a/arm_compute/core/NEON/kernels/assembly/gemm_interleaved.hpp +++ b/arm_compute/core/NEON/kernels/assembly/gemm_interleaved.hpp @@ -24,6 +24,7 @@ #pragma once #include <stdio.h> +#include <cassert> #include "gemm_common.hpp" #include "profiler.hpp" @@ -114,12 +115,13 @@ public: // Work out the rounded size of M - needed for some buffers. Mround = (M + (strat.out_height - 1)) / strat.out_height; Mround *= strat.out_height; + } // Actually execute the GEMM. void execute(const To *A, const int lda, const To *B, const int ldb, Tr *C, const int ldc, const Tr alpha, const Tr beta, void *working_space) const override { + assert(working_space); profiler prof; - int8_t *working_space_bytes = reinterpret_cast<int8_t *>(working_space); intptr_t working_space_int = reinterpret_cast<intptr_t>(working_space_bytes); size_t diff = 0; @@ -140,7 +142,7 @@ public: int kern_k = ((kmax - k0) + (strat.k_unroll - 1)) / strat.k_unroll; kern_k *= strat.k_unroll; - prof(PROFILE_PREPA, [&](void) { + prof(PROFILE_PREPA, (M * (kmax-k0) * sizeof(Toi)), [&](void) { if (trA ^ strategy::A_transpose) { Transform<strategy::A_interleave, strategy::A_block, true>(a_panel, A, lda, 0, M, k0, kmax); } else { @@ -154,7 +156,7 @@ public: int bblocks = (xmax - x0 + strat.out_width - 1) / strat.out_width; - prof(PROFILE_PREPB, [&](void) { + prof(PROFILE_PREPB, (xmax-x0) * (kmax-k0) * sizeof(Toi), [&](void) { if (trB ^ strategy::B_transpose) { Transform<strategy::B_interleave, strategy::B_block, true>(b_panel, B, ldb, x0, xmax, k0, kmax); } else { @@ -166,8 +168,8 @@ public: unsigned int ymax = y + strat.out_height; if (ymax > M) ymax = M; - prof(PROFILE_KERNEL, [&](void) { strat.kernel(a_panel + (y * kern_k), b_panel, c_panel, 1, bblocks, kern_k); }); - prof(PROFILE_MERGE, [&](void) { MergeResults<strategy::out_width, strategy::out_height>(C, c_panel, ldc, y, ymax, x0, xmax, alpha, (k0==0 ? 
beta : static_cast<Tr>(1))); }); + prof(PROFILE_KERNEL, (strat.out_height * bblocks * strat.out_width * kern_k), [&](void) { strat.kernel(a_panel + (y * kern_k), b_panel, c_panel, 1, bblocks, kern_k); }); + prof(PROFILE_MERGE, (strat.out_height * bblocks * strat.out_width * sizeof(Tr)), [&](void) { MergeResults<strategy::out_width, strategy::out_height>(C, c_panel, ldc, y, ymax, x0, xmax, alpha, (k0==0 ? beta : static_cast<Tr>(1))); }); } } } diff --git a/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s16_12x8.hpp b/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s16_12x8.hpp new file mode 100644 index 000000000..f7659b9a6 --- /dev/null +++ b/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s16_12x8.hpp @@ -0,0 +1,68 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#pragma once + +#ifdef __aarch64__ + +// Actual kernel implementations +#include "a64_gemm_s16_12x8/generic.hpp" + +// 12x8 SGEMM "strategy" class. +// +// This describes the characteristics of a family of kernels, in terms of +// the required interleave properties and the output block size. +// +// All kernels in the family must share these characteristics. The actual +// kernel to be used can be chosen at runtime, based on the CPU_type +// structure. +class gemm_s16_12x8 { +public: + typedef int16_t operand_type; + typedef int32_t result_type; + + typedef void (*kern_type)(const int16_t *, const int16_t *, int32_t *, int, int, int); + + /* Describes the data layout for A input */ + static const int A_interleave = 8; + static const int A_block = 1; + static const int A_transpose = 0; + + /* Same for B input */ + static const int B_interleave = 12; + static const int B_block = 1; + static const int B_transpose = 1; + + /* Kernel blocking parameters */ + static const int out_width = 12; + static const int out_height = 8; + static const int k_unroll = 1; + + kern_type kernel = nullptr; + + gemm_s16_12x8(const CPUInfo *ci) { + kernel = a64_gemm_s16_asimd_12x8; + } +}; + +#endif // __aarch64__ diff --git a/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s16_12x8/generic.hpp b/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s16_12x8/generic.hpp new file mode 100644 index 000000000..10259b2fd --- /dev/null +++ b/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s16_12x8/generic.hpp @@ -0,0 +1,313 @@ +/* + * Copyright (c) 2017 ARM Limited. 
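The constants in the strategy class above are exactly what gemm_interleaved.hpp consumes: M is rounded up to a multiple of out_height (Mround), each K slice is rounded up to a multiple of k_unroll (kern_k), and each X slice of B is split into ceil(width / out_width) blocks (bblocks). A small standalone sketch of that rounding arithmetic, using arbitrary example sizes:

#include <cstdio>

// Round-up helper mirroring the blocking arithmetic in gemm_interleaved.hpp.
constexpr int round_up(int v, int multiple) { return ((v + multiple - 1) / multiple) * multiple; }

int main()
{
    // Blocking constants from the gemm_s16_12x8 strategy above.
    const int out_width = 12, out_height = 8, k_unroll = 1;
    const int M = 50, N = 70, K = 33; // arbitrary example problem size

    const int Mround  = round_up(M, out_height);          // 56 rows of padded output buffer
    const int kern_k  = round_up(K, k_unroll);            // 33 (already a multiple of 1)
    const int bblocks = (N + out_width - 1) / out_width;  // 6 column blocks of B

    std::printf("Mround=%d kern_k=%d bblocks=%d\n", Mround, kern_k, bblocks);
    return 0;
}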
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#pragma once +#include <arm_neon.h> + +inline void a64_gemm_s16_asimd_12x8(const int16_t *Apanel, const int16_t *Bpanel, int32_t *Cpanel, int ablocks, int bblocks, int K) +{ + const int16_t *a_ptr = Apanel; + int32_t *c_ptr = Cpanel; + for (int yb = 0; yb < ablocks; yb++) + { + const int16_t *a_ptr0 = a_ptr; + const int16_t *b_ptr = Bpanel; + + for (int xb = 0; xb < bblocks; xb++) + { + a_ptr = a_ptr0; + const bool odd_k = K & 0x1; + int k = (K+1)/2 - 1; + + register int16x8_t aa asm("v0"); + register int16x8_t ab asm("v1"); + register int16x8_t b0 asm("v2"); + register int16x8_t b1 asm("v3"); + register int16x8_t b2 asm("v4"); + + __asm __volatile ( + "ldr %d[aa], [%x[a_ptr]]\n" // Load A[A].lower + "movi v5.4s, #0\n" + "ldr x20, [%x[a_ptr], #0x08]\n" // Load A[A].upper + "movi v6.4s, #0\n" + "ldr %d[b0], [%x[b_ptr]]\n" // Load B[0].lower + "ins %[aa].d[1], x20\n" // Merge A[A].lower and upper + "movi v7.4s, #0\n" + ASM_PREFETCH("[%[a_ptr], #64]") + "movi v8.4s, #0\n" + "ldr x20, [%x[b_ptr], #0x08]\n" // Load B[0].upper + "movi v9.4s, #0\n" + ASM_PREFETCH("[%[b_ptr], #64]") + "movi v10.4s, #0\n" + "ldr %d[b1], [%x[b_ptr], #0x10]\n" // Load B[1].lower + "ins %[b0].d[1], x20\n" // Merge B[0].lower and upper + "movi v11.4s, #0\n" + ASM_PREFETCH("[%[a_ptr], #96]") + "movi v12.4s, #0\n" + "movi v13.4s, #0\n" + ASM_PREFETCH("[%[b_ptr], #96]") + "movi v14.4s, #0\n" + "movi v15.4s, #0\n" + ASM_PREFETCH("[%[a_ptr], #128]") + "movi v16.4s, #0\n" + "movi v17.4s, #0\n" + ASM_PREFETCH("[%[b_ptr], #128]") + "movi v18.4s, #0\n" + "movi v19.4s, #0\n" + ASM_PREFETCH("[%[a_ptr], #160]") + "movi v20.4s, #0\n" + "movi v21.4s, #0\n" + ASM_PREFETCH("[%[b_ptr], #160]") + "movi v22.4s, #0\n" + "movi v23.4s, #0\n" + ASM_PREFETCH("[%[a_ptr], #192]") + "movi v24.4s, #0\n" + "add %x[a_ptr], %x[a_ptr], #0x10\n" + "movi v25.4s, #0\n" + ASM_PREFETCH("[%[b_ptr], #192]") + "movi v26.4s, #0\n" + "add %x[b_ptr], %x[b_ptr], #0x18\n" + "movi v27.4s, #0\n" + "movi v28.4s, #0\n" + + "cbz %x[k], 2f\n" // Skip the loop if doing zero iterations. 
+ + "1:\n" // Main loop + // First unroll + "smlal v5.4s, %[b0].4h, %[aa].h[0]\n" + "ldr x20, [%x[b_ptr]]\n" // Load B[1].upper + "smlal v6.4s, %[b0].4h, %[aa].h[1]\n" + "smlal v7.4s, %[b0].4h, %[aa].h[2]\n" + "ldr %d[ab], [%x[a_ptr]]\n" // Load A[B].lower + "ins %[b1].d[1], x20\n" // Merge B[1].lower and .upper + "smlal v8.4s, %[b0].4h, %[aa].h[3]\n" + "smlal v9.4s, %[b0].4h, %[aa].h[4]\n" + "ldr x20, [%x[a_ptr], #0x8]\n" // Load A[B].upper + "smlal v10.4s, %[b0].4h, %[aa].h[5]\n" + "smlal v11.4s, %[b0].4h, %[aa].h[6]\n" + "ldr %d[b2], [%x[b_ptr], #0x8]\n" // Load B[2].lower + "ins %[ab].d[1], x20\n" // Merge A[B].lower and .upper + "smlal v12.4s, %[b0].4h, %[aa].h[7]\n" + "smlal2 v13.4s, %[b0].8h, %[aa].h[0]\n" + "ldr x20, [%x[b_ptr], #0x10]\n" // Load B[2].upper + "smlal2 v14.4s, %[b0].8h, %[aa].h[1]\n" + "smlal2 v15.4s, %[b0].8h, %[aa].h[2]\n" + "smlal2 v16.4s, %[b0].8h, %[aa].h[3]\n" + "smlal2 v17.4s, %[b0].8h, %[aa].h[4]\n" + "smlal2 v18.4s, %[b0].8h, %[aa].h[5]\n" + "smlal2 v19.4s, %[b0].8h, %[aa].h[6]\n" + "smlal2 v20.4s, %[b0].8h, %[aa].h[7]\n" + "ldr %d[b0], [%x[b_ptr], #0x18]\n" // Load B[0].lower + "ins %[b2].d[1], x20\n" // Merge B[2].lower and .upper + "smlal v21.4s, %[b1].4h, %[aa].h[0]\n" + "smlal v22.4s, %[b1].4h, %[aa].h[1]\n" + "ldr x20, [%x[b_ptr], #0x20]\n" // Load B[0].upper + "smlal v23.4s, %[b1].4h, %[aa].h[2]\n" + "smlal v24.4s, %[b1].4h, %[aa].h[3]\n" + "smlal v25.4s, %[b1].4h, %[aa].h[4]\n" + "smlal v26.4s, %[b1].4h, %[aa].h[5]\n" + "smlal v27.4s, %[b1].4h, %[aa].h[6]\n" + "smlal v28.4s, %[b1].4h, %[aa].h[7]\n" + + // Second unroll + "smlal2 v5.4s, %[b1].8h, %[ab].h[0]\n" + "ldr %d[aa], [%x[a_ptr], #0x10]\n" // Load A[A].lower + "ins %[b0].d[1], x20\n" // Merge B[0].lower and .upper + "smlal2 v6.4s, %[b1].8h, %[ab].h[1]\n" + "smlal2 v7.4s, %[b1].8h, %[ab].h[2]\n" + "ldr x20, [%x[a_ptr], #0x18]\n" // Load A[A].upper + "smlal2 v8.4s, %[b1].8h, %[ab].h[3]\n" + "smlal2 v9.4s, %[b1].8h, %[ab].h[4]\n" + "smlal2 v10.4s, %[b1].8h, %[ab].h[5]\n" + "smlal2 v11.4s, %[b1].8h, %[ab].h[6]\n" + "add %x[a_ptr], %x[a_ptr], #0x20\n" + "smlal2 v12.4s, %[b1].8h, %[ab].h[7]\n" + "smlal v13.4s, %[b2].4h, %[ab].h[0]\n" + ASM_PREFETCH("[%[b_ptr], #320]") + "smlal v14.4s, %[b2].4h, %[ab].h[1]\n" + "smlal v15.4s, %[b2].4h, %[ab].h[2]\n" + ASM_PREFETCH("[%[a_ptr], #320]") + "smlal v16.4s, %[b2].4h, %[ab].h[3]\n" + "smlal v17.4s, %[b2].4h, %[ab].h[4]\n" + ASM_PREFETCH("[%[b_ptr], #448]") + "smlal v18.4s, %[b2].4h, %[ab].h[5]\n" + "smlal v19.4s, %[b2].4h, %[ab].h[6]\n" + "smlal v20.4s, %[b2].4h, %[ab].h[7]\n" + "smlal2 v21.4s, %[b2].8h, %[ab].h[0]\n" + "smlal2 v22.4s, %[b2].8h, %[ab].h[1]\n" + "subs %x[k], %x[k], #0x1\n" + "smlal2 v23.4s, %[b2].8h, %[ab].h[2]\n" + "smlal2 v24.4s, %[b2].8h, %[ab].h[3]\n" + "ldr %d[b1], [%x[b_ptr], #0x28]\n" // Load B[1].lower + "ins %[aa].d[1], x20\n" // Merge A[A].lower and .upper + "smlal2 v25.4s, %[b2].8h, %[ab].h[4]\n" + "smlal2 v26.4s, %[b2].8h, %[ab].h[5]\n" + "add %x[b_ptr], %x[b_ptr], #0x30\n" + "smlal2 v27.4s, %[b2].8h, %[ab].h[6]\n" + "smlal2 v28.4s, %[b2].8h, %[ab].h[7]\n" + "bne 1b\n" + + "2:\n" // Even tail + "cbnz %x[odd_k], 3f\n" + + "smlal v5.4s, %[b0].4h, %[aa].h[0]\n" + "ldr x20, [%x[b_ptr]]\n" // Load B[1].upper + "smlal v6.4s, %[b0].4h, %[aa].h[1]\n" + "smlal v7.4s, %[b0].4h, %[aa].h[2]\n" + "ldr %d[ab], [%x[a_ptr]]\n" // Load A[B].lower + "ins %[b1].d[1], x20\n" // Merge B[1].lower and .upper + "smlal v8.4s, %[b0].4h, %[aa].h[3]\n" + "smlal v9.4s, %[b0].4h, %[aa].h[4]\n" + "ldr x20, [%x[a_ptr], #0x8]\n" // Load A[B].upper + "smlal v10.4s, 
%[b0].4h, %[aa].h[5]\n" + "smlal v11.4s, %[b0].4h, %[aa].h[6]\n" + "ldr %d[b2], [%x[b_ptr], #0x8]\n" // Load B[2].lower + "ins %[ab].d[1], x20\n" // Merge A[B].lower and .upper + "smlal v12.4s, %[b0].4h, %[aa].h[7]\n" + "smlal2 v13.4s, %[b0].8h, %[aa].h[0]\n" + "ldr x20, [%x[b_ptr], #0x10]\n" // Load B[2].upper + "smlal2 v14.4s, %[b0].8h, %[aa].h[1]\n" + "smlal2 v15.4s, %[b0].8h, %[aa].h[2]\n" + "smlal2 v16.4s, %[b0].8h, %[aa].h[3]\n" + "add %[a_ptr], %[a_ptr], #0x10\n" + "smlal2 v17.4s, %[b0].8h, %[aa].h[4]\n" + "add %[b_ptr], %[b_ptr], #0x18\n" + "smlal2 v18.4s, %[b0].8h, %[aa].h[5]\n" + "smlal2 v19.4s, %[b0].8h, %[aa].h[6]\n" + "smlal2 v20.4s, %[b0].8h, %[aa].h[7]\n" + "ins %[b2].d[1], x20\n" // Merge B[2].lower and .upper + "smlal v21.4s, %[b1].4h, %[aa].h[0]\n" + "smlal v22.4s, %[b1].4h, %[aa].h[1]\n" + "smlal v23.4s, %[b1].4h, %[aa].h[2]\n" + "smlal v24.4s, %[b1].4h, %[aa].h[3]\n" + "smlal v25.4s, %[b1].4h, %[aa].h[4]\n" + "smlal v26.4s, %[b1].4h, %[aa].h[5]\n" + "smlal v27.4s, %[b1].4h, %[aa].h[6]\n" + "smlal v28.4s, %[b1].4h, %[aa].h[7]\n" + + "smlal2 v5.4s, %[b1].8h, %[ab].h[0]\n" + "smlal v13.4s, %[b2].4h, %[ab].h[0]\n" + "smlal2 v21.4s, %[b2].8h, %[ab].h[0]\n" + "smlal2 v6.4s, %[b1].8h, %[ab].h[1]\n" + "smlal v14.4s, %[b2].4h, %[ab].h[1]\n" + "str q5, [%x[c_ptr]]\n" + "smlal2 v22.4s, %[b2].8h, %[ab].h[1]\n" + "str q13, [%x[c_ptr], #0x10]\n" + "smlal2 v7.4s, %[b1].8h, %[ab].h[2]\n" + "str q21, [%x[c_ptr], #0x20]\n" + "smlal v15.4s, %[b2].4h, %[ab].h[2]\n" + "str q6, [%x[c_ptr], #0x30]\n" + "smlal2 v23.4s, %[b2].8h, %[ab].h[2]\n" + "str q14, [%x[c_ptr], #0x40]\n" + "smlal2 v8.4s, %[b1].8h, %[ab].h[3]\n" + "str q22, [%x[c_ptr], #0x50]\n" + "smlal v16.4s, %[b2].4h, %[ab].h[3]\n" + "str q7, [%x[c_ptr], #0x60]\n" + "smlal2 v24.4s, %[b2].8h, %[ab].h[3]\n" + "str q15, [%x[c_ptr], #0x70]\n" + "smlal2 v9.4s, %[b1].8h, %[ab].h[4]\n" + "str q23, [%x[c_ptr], #0x80]\n" + "smlal v17.4s, %[b2].4h, %[ab].h[4]\n" + "str q8, [%x[c_ptr], #0x90]\n" + "smlal2 v25.4s, %[b2].8h, %[ab].h[4]\n" + "str q16, [%x[c_ptr], #0xa0]\n" + "smlal2 v10.4s, %[b1].8h, %[ab].h[5]\n" + "str q24, [%x[c_ptr], #0xb0]\n" + "smlal v18.4s, %[b2].4h, %[ab].h[5]\n" + "str q9, [%x[c_ptr], #0xc0]\n" + "smlal2 v26.4s, %[b2].8h, %[ab].h[5]\n" + "str q17, [%x[c_ptr], #0xd0]\n" + "smlal2 v11.4s, %[b1].8h, %[ab].h[6]\n" + "str q25, [%x[c_ptr], #0xe0]\n" + "smlal v19.4s, %[b2].4h, %[ab].h[6]\n" + "str q10, [%x[c_ptr], #0xf0]\n" + "smlal2 v27.4s, %[b2].8h, %[ab].h[6]\n" + "str q18, [%x[c_ptr], #0x100]\n" + "smlal2 v12.4s, %[b1].8h, %[ab].h[7]\n" + "str q26, [%x[c_ptr], #0x110]\n" + "smlal v20.4s, %[b2].4h, %[ab].h[7]\n" + "str q11, [%x[c_ptr], #0x120]\n" + "smlal2 v28.4s, %[b2].8h, %[ab].h[7]\n" + "str q19, [%x[c_ptr], #0x130]\n" + "b 4f\n" // Complete write out + + "3:\n" // Odd tail + "smlal v5.4s, %[b0].4h, %[aa].h[0]\n" + "smlal2 v13.4s, %[b0].8h, %[aa].h[0]\n" + "smlal v21.4s, %[b1].4h, %[aa].h[0]\n" + "smlal v6.4s, %[b0].4h, %[aa].h[1]\n" + "smlal2 v14.4s, %[b0].8h, %[aa].h[1]\n" + "smlal v22.4s, %[b1].4h, %[aa].h[1]\n" + "str q5, [%x[c_ptr]]\n" + "smlal v7.4s, %[b0].4h, %[aa].h[2]\n" + "str q13, [%x[c_ptr], #0x10]\n" + "smlal2 v15.4s, %[b0].8h, %[aa].h[2]\n" + "str q21, [%x[c_ptr], #0x20]\n" + "smlal v23.4s, %[b1].4h, %[aa].h[2]\n" + "str q6, [%x[c_ptr], #0x30]\n" + "smlal v8.4s, %[b0].4h, %[aa].h[3]\n" + "str q14, [%x[c_ptr], #0x40]\n" + "smlal2 v16.4s, %[b0].8h, %[aa].h[3]\n" + "str q22, [%x[c_ptr], #0x50]\n" + "smlal v24.4s, %[b1].4h, %[aa].h[3]\n" + "str q7, [%x[c_ptr], #0x60]\n" + "smlal v9.4s, %[b0].4h, %[aa].h[4]\n" + 
"str q15, [%x[c_ptr], #0x70]\n" + "smlal2 v17.4s, %[b0].8h, %[aa].h[4]\n" + "str q23, [%x[c_ptr], #0x80]\n" + "smlal v25.4s, %[b1].4h, %[aa].h[4]\n" + "str q8, [%x[c_ptr], #0x90]\n" + "smlal v10.4s, %[b0].4h, %[aa].h[5]\n" + "str q16, [%x[c_ptr], #0xa0]\n" + "smlal2 v18.4s, %[b0].8h, %[aa].h[5]\n" + "str q24, [%x[c_ptr], #0xb0]\n" + "smlal v26.4s, %[b1].4h, %[aa].h[5]\n" + "str q9, [%x[c_ptr], #0xc0]\n" + "smlal v11.4s, %[b0].4h, %[aa].h[6]\n" + "str q17, [%x[c_ptr], #0xd0]\n" + "smlal2 v19.4s, %[b0].8h, %[aa].h[6]\n" + "str q25, [%x[c_ptr], #0xe0]\n" + "smlal v27.4s, %[b1].4h, %[aa].h[6]\n" + "str q10, [%x[c_ptr], #0xf0]\n" + "smlal v12.4s, %[b0].4h, %[aa].h[7]\n" + "str q18, [%x[c_ptr], #0x100]\n" + "smlal2 v20.4s, %[b0].8h, %[aa].h[7]\n" + "str q26, [%x[c_ptr], #0x110]\n" + "smlal v28.4s, %[b1].4h, %[aa].h[7]\n" + "str q11, [%x[c_ptr], #0x120]\n" + + "4:\n" // End of function + "str q19, [%x[c_ptr], #0x130]\n" + "str q27, [%x[c_ptr], #0x140]\n" + "str q12, [%x[c_ptr], #0x150]\n" + "str q20, [%x[c_ptr], #0x160]\n" + "str q28, [%x[c_ptr], #0x170]\n" + "add %x[c_ptr], %x[c_ptr], #0x180\n" + : [a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [c_ptr] "+r" (c_ptr), [k] "+r" (k), + [aa] "+w" (aa), [ab] "+w" (ab), [b0] "+w" (b0), [b1] "+w" (b1), [b2] "+w" (b2) + : [odd_k] "r" (odd_k) + : "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "x20", "cc" + ); + } + } +} diff --git a/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s8_12x8.hpp b/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s8_12x8.hpp new file mode 100644 index 000000000..88cbb361b --- /dev/null +++ b/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s8_12x8.hpp @@ -0,0 +1,61 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#pragma once + +#ifdef __aarch64__ + +// Load the actual kernel +#include "a64_gemm_s8_12x8/generic.hpp" + +class gemm_s8_12x8 { +public: + typedef int8_t operand_type; + typedef int32_t result_type; + + typedef void (*kern_type)(const int8_t *, const int8_t *, int32_t *, int, int, int); + + /* Describes the data layout for A input */ + static const int A_interleave = 8; + static const int A_block = 4; + static const bool A_transpose = false; + + /* Same for B input */ + static const int B_interleave = 12; + static const int B_block = 4; + static const bool B_transpose = true; + + /* Kernel blocking parameters */ + static const int out_width = 12; + static const int out_height = 8; + static const int k_unroll = 4; + + kern_type kernel = nullptr; + + gemm_s8_12x8(const CPUInfo *ci) { + kernel = a64_gemm_s8_12x8; + } +}; + +#endif // __aarch64__ + diff --git a/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s8_12x8/a55r1.hpp b/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s8_12x8/a55r1.hpp new file mode 100644 index 000000000..4ac2ba423 --- /dev/null +++ b/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s8_12x8/a55r1.hpp @@ -0,0 +1,398 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#pragma once + +#ifdef __aarch64__ + +#include <arm_neon.h> +#include "dot_toolchain_support.h" +#include <cassert> + +void a64_gemm_s8_12x8_a55r1(const int8_t *Apanel, const int8_t *Bpanel, int32_t *Cpanel, int ablocks, int bblocks, int K) { + assert(Apanel); + assert(Bpanel); + assert(Cpanel); + const int8_t *a_ptr = Apanel; + int32_t *c_ptr = Cpanel; + // We divide K by 4 because the sdot instruction processes 4 elements at a time. + const int W = K/4; + // Fix up for odd lengths - set a flag if K is odd, but make. + // sure we round up the iteration count. + const int oddk = (W & 1); + const int init_value_k = ((W+1)/2) - 1; + for (int yb=0; yb<ablocks; yb++) { + const int8_t *a_ptr0 = a_ptr; + const int8_t *b_ptr = Bpanel; + for (int xb=0; xb<bblocks; xb++) { + a_ptr = a_ptr0; + int k = init_value_k; + register int32x4_t a0 asm("v0"); + register int32x4_t a1 asm("v1"); + register int32x4_t b0 asm("v2"); + register int32x4_t b1 asm("v3"); + register int32x4_t b2 asm("v4"); + register int32x4_t a0a asm("v5"); + register int32x4_t a1a asm("v6"); + __asm __volatile ( + _DECLARE_SDOT + // Initialize result registers, load initial operands, prime prefetches. 
+ "movi v8.4s, #0x0\n" + "ldp %q[a0], %q[a1], [%[a_ptr]]\n" + "movi v9.4s, #0x0\n" + "ldp %q[b0], %q[b1], [%[b_ptr]]\n" + "movi v10.4s, #0x0\n" + "movi v11.4s, #0x0\n" + "movi v12.4s, #0x0\n" + ASM_PREFETCH("[%[b_ptr], #64]") + "movi v13.4s, #0x0\n" + ASM_PREFETCH("[%[a_ptr], #64]") + "movi v14.4s, #0x0\n" + ASM_PREFETCH("[%[b_ptr], #128]") + "movi v15.4s, #0x0\n" + ASM_PREFETCH("[%[a_ptr], #128]") + "movi v16.4s, #0x0\n" + ASM_PREFETCH("[%[b_ptr], #192]") + "movi v17.4s, #0x0\n" + ASM_PREFETCH("[%[b_ptr], #256]") + "movi v18.4s, #0x0\n" + ASM_PREFETCH("[%[a_ptr], #192]") + "movi v19.4s, #0x0\n" + ASM_PREFETCH("[%[b_ptr], #320]") + "movi v20.4s, #0x0\n" + ASM_PREFETCH("[%[a_ptr], #256]") + "movi v21.4s, #0x0\n" + ASM_PREFETCH("[%[b_ptr], #384]") + "movi v22.4s, #0x0\n" + "movi v23.4s, #0x0\n" + "movi v24.4s, #0x0\n" + "movi v25.4s, #0x0\n" + "movi v26.4s, #0x0\n" + "movi v27.4s, #0x0\n" + "movi v28.4s, #0x0\n" + "movi v29.4s, #0x0\n" + "movi v30.4s, #0x0\n" + "movi v31.4s, #0x0\n" + + // Skip loop if we are doing zero iterations of it. + "cbz %w[k], 4f\n" + + + // Loop proper + "1:\n" + "sdot v8.4s , %[b0].16b, %[a0].4b[0]\n" + "ldr %d[b2], [%[b_ptr], #32]\n" + + "sdot v9.4s , %[b0].16b, %[a0].4b[1]\n" + "ldr x20, [%[b_ptr], #40]\n" + "sdot v10.4s, %[b0].16b, %[a0].4b[2]\n" + "sdot v11.4s, %[b0].16b, %[a0].4b[3]\n" + "ldr %d[a0a], [%[a_ptr], #32]\n" + + "sdot v12.4s, %[b0].16b, %[a1].4b[0]\n" + "ins %[b2].d[1], x20\n" + "sdot v13.4s, %[b0].16b, %[a1].4b[1]\n" + "ldr x20, [%[a_ptr], #40]\n" + "sdot v14.4s, %[b0].16b, %[a1].4b[2]\n" + "sdot v15.4s, %[b0].16b, %[a1].4b[3]\n" + "ldr %d[a1a], [%[a_ptr], #48]\n" + + + "sdot v16.4s, %[b1].16b, %[a0].4b[0]\n" + "ins %[a0a].d[1], x20\n" + ASM_PREFETCH("[%[a_ptr], #320]") + "sdot v17.4s, %[b1].16b, %[a0].4b[1]\n" + "ldr x20, [%[a_ptr], #56]\n" + "sdot v18.4s, %[b1].16b, %[a0].4b[2]\n" + "sdot v19.4s, %[b1].16b, %[a0].4b[3]\n" + "ldr %d[b0], [%[b_ptr], #48]\n" + + "sdot v20.4s, %[b1].16b, %[a1].4b[0]\n" + "ins %[a1a].d[1], x20\n" + ASM_PREFETCH("[%[b_ptr], #448]") + "sdot v21.4s, %[b1].16b, %[a1].4b[1]\n" + "ldr x20, [%[b_ptr], #56]\n" + "sdot v22.4s, %[b1].16b, %[a1].4b[2]\n" + "sdot v23.4s, %[b1].16b, %[a1].4b[3]\n" + "ldr %d[b1], [%[b_ptr], #64]\n" + + "sdot v24.4s, %[b2].16b, %[a0].4b[0]\n" + "ins %[b0].d[1], x20\n" + "sdot v25.4s, %[b2].16b, %[a0].4b[1]\n" + "ldr x20, [%[b_ptr], #72]\n" + "sdot v26.4s, %[b2].16b, %[a0].4b[2]\n" + "sdot v27.4s, %[b2].16b, %[a0].4b[3]\n" + + "sdot v28.4s, %[b2].16b, %[a1].4b[0]\n" + "sdot v29.4s, %[b2].16b, %[a1].4b[1]\n" + "sdot v30.4s, %[b2].16b, %[a1].4b[2]\n" + "sdot v31.4s, %[b2].16b, %[a1].4b[3]\n" + + "ldr %d[b2], [%[b_ptr], #80]\n" + + "sdot v8.4s , %[b0].16b, %[a0a].4b[0]\n" + "ins %[b1].d[1], x20\n" + "sdot v9.4s , %[b0].16b, %[a0a].4b[1]\n" + "ldr x20, [%[b_ptr], #88]\n" + "sdot v10.4s, %[b0].16b, %[a0a].4b[2]\n" + "sdot v11.4s, %[b0].16b, %[a0a].4b[3]\n" + "ldr %d[a0], [%[a_ptr], #64]\n" + + "sdot v12.4s, %[b0].16b, %[a1a].4b[0]\n" + "ins %[b2].d[1], x20\n" + "sdot v13.4s, %[b0].16b, %[a1a].4b[1]\n" + "ldr x20, [%[a_ptr], #72]\n" + "sdot v14.4s, %[b0].16b, %[a1a].4b[2]\n" + "sdot v15.4s, %[b0].16b, %[a1a].4b[3]\n" + "ldr %d[a1], [%[a_ptr], #80]\n" + + "sdot v16.4s, %[b1].16b, %[a0a].4b[0]\n" + "ins %[a0].d[1], x20\n" + ASM_PREFETCH("[%[b_ptr], #512]") + "sdot v17.4s, %[b1].16b, %[a0a].4b[1]\n" + "ldr x20, [%[a_ptr], #88]\n" + "sdot v18.4s, %[b1].16b, %[a0a].4b[2]\n" + "sdot v19.4s, %[b1].16b, %[a0a].4b[3]\n" + "ldr %d[b0], [%[b_ptr], #96]\n" + + "sdot v20.4s, %[b1].16b, %[a1a].4b[0]\n" + "ins 
%[a1].d[1], x20\n" + "sdot v21.4s, %[b1].16b, %[a1a].4b[1]\n" + "ldr x20, [%[b_ptr], #104]\n" + "sdot v22.4s, %[b1].16b, %[a1a].4b[2]\n" + "sdot v23.4s, %[b1].16b, %[a1a].4b[3]\n" + "ldr %d[b1], [%[b_ptr], #112]\n" + + "sdot v24.4s, %[b2].16b, %[a0a].4b[0]\n" + "ins %[b0].d[1], x20\n" + "sdot v25.4s, %[b2].16b, %[a0a].4b[1]\n" + "ldr x20, [%[b_ptr], #120]\n" + "add %[a_ptr], %[a_ptr], #64\n" + "sdot v26.4s, %[b2].16b, %[a0a].4b[2]\n" + "sdot v27.4s, %[b2].16b, %[a0a].4b[3]\n" + + "sdot v28.4s, %[b2].16b, %[a1a].4b[0]\n" + "sdot v29.4s, %[b2].16b, %[a1a].4b[1]\n" + "add %[b_ptr], %[b_ptr], #96\n" + "sdot v30.4s, %[b2].16b, %[a1a].4b[2]\n" + "subs %w[k], %w[k], #1\n" + "ins %[b1].d[1], x20\n" + "sdot v31.4s, %[b2].16b, %[a1a].4b[3]\n" + "ldr %d[b2], [%[b_ptr], #32]\n" + "bne 1b\n" + + // Target to use when K is 1 or 2 (i.e. zero iterations of main loop) + "4:\n" + + // Branch to alternative tail for odd K + "cbnz %w[oddk], 2f\n" + + // Detached final iteration (even K) + "sdot v8.4s , %[b0].16b, %[a0].4b[0]\n" + "sdot v9.4s , %[b0].16b, %[a0].4b[1]\n" + "ldr x20, [%[b_ptr], #40]\n" + "sdot v10.4s, %[b0].16b, %[a0].4b[2]\n" + "sdot v11.4s, %[b0].16b, %[a0].4b[3]\n" + "ldr %d[a0a], [%[a_ptr], #32]\n" + + "sdot v12.4s, %[b0].16b, %[a1].4b[0]\n" + "ldr %d[b2], [%[b_ptr], #32]\n" + "sdot v13.4s, %[b0].16b, %[a1].4b[1]\n" + "ldr x20, [%[a_ptr], #40]\n" + "sdot v14.4s, %[b0].16b, %[a1].4b[2]\n" + "sdot v15.4s, %[b0].16b, %[a1].4b[3]\n" + "ldr %d[a1a], [%[a_ptr], #48]\n" + + + "sdot v16.4s, %[b1].16b, %[a0].4b[0]\n" + "ins %[a0a].d[1], x20\n" + "sdot v17.4s, %[b1].16b, %[a0].4b[1]\n" + "ldr x20, [%[a_ptr], #56]\n" + "sdot v18.4s, %[b1].16b, %[a0].4b[2]\n" + "sdot v19.4s, %[b1].16b, %[a0].4b[3]\n" + "ldr %d[b0], [%[b_ptr], #48]\n" + + "sdot v20.4s, %[b1].16b, %[a1].4b[0]\n" + "ins %[a1a].d[1], x20\n" + "sdot v21.4s, %[b1].16b, %[a1].4b[1]\n" + "ldr x20, [%[b_ptr], #56]\n" + "sdot v22.4s, %[b1].16b, %[a1].4b[2]\n" + "sdot v23.4s, %[b1].16b, %[a1].4b[3]\n" + + "sdot v24.4s, %[b2].16b, %[a0].4b[0]\n" + "sdot v25.4s, %[b2].16b, %[a0].4b[1]\n" + "add %[a_ptr], %[a_ptr], #64\n" + "sdot v26.4s, %[b2].16b, %[a0].4b[2]\n" + "sdot v27.4s, %[b2].16b, %[a0].4b[3]\n" + "ldr %d[b1], [%[b_ptr], #64]\n" + + "sdot v28.4s, %[b2].16b, %[a1].4b[0]\n" + "ins %[b0].d[1], x20\n" + "sdot v29.4s, %[b2].16b, %[a1].4b[1]\n" + "ldr x20, [%[b_ptr], #72]\n" + "sdot v30.4s, %[b2].16b, %[a1].4b[2]\n" + "sdot v31.4s, %[b2].16b, %[a1].4b[3]\n" + "ldr %d[b2], [%[b_ptr], #80]\n" + + "sdot v8.4s , %[b0].16b, %[a0a].4b[0]\n" + "ins %[b1].d[1], x20\n" + "sdot v9.4s , %[b0].16b, %[a0a].4b[1]\n" + "ldr x20, [%[b_ptr], #88]\n" + "sdot v10.4s, %[b0].16b, %[a0a].4b[2]\n" + "ins %[b2].d[1], x20\n" + + "sdot v16.4s, %[b1].16b, %[a0a].4b[0]\n" + "sdot v24.4s, %[b2].16b, %[a0a].4b[0]\n" + "add %[b_ptr], %[b_ptr], #96\n" + "str q8, [%[c_ptr], #0]\n" + "str q16, [%[c_ptr], #16]\n" + "str q24, [%[c_ptr], #32]\n" + "sdot v17.4s, %[b1].16b, %[a0a].4b[1]\n" + + "sdot v25.4s, %[b2].16b, %[a0a].4b[1]\n" + "str q9, [%[c_ptr], #48]\n" + "str q17, [%[c_ptr], #64]\n" + "str q25, [%[c_ptr], #80]\n" + "sdot v18.4s, %[b1].16b, %[a0a].4b[2]\n" + "sdot v26.4s, %[b2].16b, %[a0a].4b[2]\n" + "str q10, [%[c_ptr], #96]\n" + "str q18, [%[c_ptr], #112]\n" + "str q26, [%[c_ptr], #128]\n" + "sdot v11.4s, %[b0].16b, %[a0a].4b[3]\n" + "sdot v19.4s, %[b1].16b, %[a0a].4b[3]\n" + "sdot v27.4s, %[b2].16b, %[a0a].4b[3]\n" + "str q11, [%[c_ptr], #144]\n" + "str q19, [%[c_ptr], #160]\n" + "str q27, [%[c_ptr], #176]\n" + "sdot v12.4s, %[b0].16b, %[a1a].4b[0]\n" + "sdot v20.4s, 
%[b1].16b, %[a1a].4b[0]\n" + "sdot v28.4s, %[b2].16b, %[a1a].4b[0]\n" + "str q12, [%[c_ptr], #192]\n" + "str q20, [%[c_ptr], #208]\n" + "str q28, [%[c_ptr], #224]\n" + "sdot v13.4s, %[b0].16b, %[a1a].4b[1]\n" + "sdot v21.4s, %[b1].16b, %[a1a].4b[1]\n" + "sdot v29.4s, %[b2].16b, %[a1a].4b[1]\n" + "str q13, [%[c_ptr], #240]\n" + "str q21, [%[c_ptr], #256]\n" + "str q29, [%[c_ptr], #272]\n" + "sdot v14.4s, %[b0].16b, %[a1a].4b[2]\n" + "sdot v22.4s, %[b1].16b, %[a1a].4b[2]\n" + "sdot v30.4s, %[b2].16b, %[a1a].4b[2]\n" + "str q14, [%[c_ptr], #288]\n" + "str q22, [%[c_ptr], #304]\n" + "str q30, [%[c_ptr], #320]\n" + "sdot v15.4s, %[b0].16b, %[a1a].4b[3]\n" + "sdot v23.4s, %[b1].16b, %[a1a].4b[3]\n" + "sdot v31.4s, %[b2].16b, %[a1a].4b[3]\n" + "str q15, [%[c_ptr], #336]\n" + + "b 3f\n" + + // Detached final iteration (odd K) + "2:\n" + "sdot v8.4s , %[b0].16b, %[a0].4b[0]\n" + "ldr %d[b2], [%[b_ptr], #32]\n" + "ldr x20, [%[b_ptr], #40]\n" + + "sdot v16.4s, %[b1].16b, %[a0].4b[0]\n" + "sdot v9.4s , %[b0].16b, %[a0].4b[1]\n" + "str q8, [%[c_ptr], #0]\n" + "sdot v17.4s, %[b1].16b, %[a0].4b[1]\n" + "str q16, [%[c_ptr], #16]\n" + "ins %[b2].d[1], x20\n" + "sdot v24.4s, %[b2].16b, %[a0].4b[0]\n" + "add %[b_ptr], %[b_ptr], #48\n" + "add %[a_ptr], %[a_ptr], #32\n" + "str q24, [%[c_ptr], #32]\n" + "sdot v25.4s, %[b2].16b, %[a0].4b[1]\n" + "str q9, [%[c_ptr], #48]\n" + + "sdot v10.4s, %[b0].16b, %[a0].4b[2]\n" + "str q17, [%[c_ptr], #64]\n" + "sdot v18.4s, %[b1].16b, %[a0].4b[2]\n" + "str q25, [%[c_ptr], #80]\n" + "sdot v26.4s, %[b2].16b, %[a0].4b[2]\n" + "str q10, [%[c_ptr], #96]\n" + + "sdot v11.4s, %[b0].16b, %[a0].4b[3]\n" + "str q18, [%[c_ptr], #112]\n" + "sdot v19.4s, %[b1].16b, %[a0].4b[3]\n" + "str q26, [%[c_ptr], #128]\n" + "sdot v27.4s, %[b2].16b, %[a0].4b[3]\n" + "str q11, [%[c_ptr], #144]\n" + + "sdot v12.4s, %[b0].16b, %[a1].4b[0]\n" + "str q19, [%[c_ptr], #160]\n" + "sdot v20.4s, %[b1].16b, %[a1].4b[0]\n" + "str q27, [%[c_ptr], #176]\n" + "sdot v28.4s, %[b2].16b, %[a1].4b[0]\n" + "str q12, [%[c_ptr], #192]\n" + + "sdot v13.4s, %[b0].16b, %[a1].4b[1]\n" + "str q20, [%[c_ptr], #208]\n" + "sdot v21.4s, %[b1].16b, %[a1].4b[1]\n" + "str q28, [%[c_ptr], #224]\n" + "sdot v29.4s, %[b2].16b, %[a1].4b[1]\n" + "str q13, [%[c_ptr], #240]\n" + + "sdot v14.4s, %[b0].16b, %[a1].4b[2]\n" + "str q21, [%[c_ptr], #256]\n" + "sdot v22.4s, %[b1].16b, %[a1].4b[2]\n" + "str q29, [%[c_ptr], #272]\n" + "sdot v30.4s, %[b2].16b, %[a1].4b[2]\n" + "str q14, [%[c_ptr], #288]\n" + + "sdot v15.4s, %[b0].16b, %[a1].4b[3]\n" + "str q22, [%[c_ptr], #304]\n" + "sdot v23.4s, %[b1].16b, %[a1].4b[3]\n" + "str q30, [%[c_ptr], #320]\n" + "sdot v31.4s, %[b2].16b, %[a1].4b[3]\n" + "str q15, [%[c_ptr], #336]\n" + + + // Common tail + "3:\n" + "str q23, [%[c_ptr], #352]\n" + "str q31, [%[c_ptr], #368]\n" + "add %[c_ptr], %[c_ptr], #384\n" + + + + ".purgem sdot\n" + : + [a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [c_ptr] "+r" (c_ptr), + [a0] "+w" (a0), [a1] "+w" (a1), [a0a] "+w" (a0a), [a1a] "+w" (a1a), + [b0] "+w" (b0), [b1] "+w" (b1), [b2] "+w" (b2), [k] "+r" (k) + : [oddk] "r" (oddk) + : "x20", "x21", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", + "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc", "memory" + ); + + + } + } +} + +#endif + diff --git a/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s8_12x8/dot_toolchain_support.h b/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s8_12x8/dot_toolchain_support.h new file mode 100644 index 
000000000..1d6fd1623 --- /dev/null +++ b/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s8_12x8/dot_toolchain_support.h @@ -0,0 +1,66 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +// Define a macro to assemble the UDOT instruction (in the absence of toolchain support) +#define _DECLARE_SDOT ".altmacro\n"\ + ".macro sdot opd:req, opn:req, opm:req\n"\ + "local vd, vn, vm, h, l\n"\ + ".irp reg,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31\n"\ + ".ifeqs \"\\opd\",\"v\\reg\\.4s\"\n"\ + ".set vd,\\reg\n"\ + ".endif\n"\ + ".ifeqs \"\\opn\",\"v\\reg\\.16b\"\n"\ + ".set vn,\\reg\n"\ + ".endif\n"\ + ".irp idx,0,1,2,3\n"\ + ".ifeqs \"\\opm\",\"v\\reg\\.4b[\\idx\\]\"\n"\ + ".set vm,\\reg\n"\ + ".set h,\\idx / 2\n"\ + ".set l,\\idx %% 2\n"\ + ".endif\n"\ + ".endr\n"\ + ".endr\n"\ + ".ifndef vd\n"\ + ".error \"Bad operand \\opd\"\n"\ + ".exitm\n"\ + ".endif\n"\ + ".ifndef vn\n"\ + ".error \"Bad operand \\opn\"\n"\ + ".exitm\n"\ + ".endif\n"\ + ".ifndef vm\n"\ + ".error \"Bad operand \\opm\"\n"\ + ".exitm\n"\ + ".endif\n"\ + ".ifndef h\n"\ + ".error \"Bad operand \\opm\"\n"\ + ".exitm\n"\ + ".endif\n"\ + ".ifndef l\n"\ + ".error \"Bad operand \\opm\"\n"\ + ".exitm\n"\ + ".endif\n"\ + ".int 0x4f80e000 | vd | (vn << 5) | (vm << 16) | (l << 21) | (h << 11)\n"\ + ".endm\n"\ + diff --git a/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s8_12x8/generic.hpp b/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s8_12x8/generic.hpp new file mode 100644 index 000000000..bfad0373b --- /dev/null +++ b/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s8_12x8/generic.hpp @@ -0,0 +1,363 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
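The _DECLARE_SDOT macro above exists because assemblers that predate the v8.2 dot-product extension cannot assemble the instruction themselves: it pattern-matches the register operands and emits the raw opcode word (0x4f80e000 plus the register and index fields) of the signed by-element dot product, sdot. Semantically, each sdot accumulates four int8 products into every 32-bit lane of the destination; a scalar model of one such instruction, for reference only (the helper name is made up here):

#include <cstdint>

// Scalar model of "sdot vd.4s, vn.16b, vm.4b[idx]": every 32-bit lane of the
// accumulator gains the dot product of four signed bytes of vn with the
// selected group of four signed bytes of vm.
void sdot_by_element_ref(std::int32_t acc[4], const std::int8_t vn[16], const std::int8_t vm[16], int idx)
{
    for(int lane = 0; lane < 4; ++lane)
    {
        std::int32_t sum = 0;
        for(int b = 0; b < 4; ++b)
        {
            sum += static_cast<std::int32_t>(vn[4 * lane + b]) * static_cast<std::int32_t>(vm[4 * idx + b]);
        }
        acc[lane] += sum;
    }
}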
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#pragma once + +#ifdef __aarch64__ + +#include <arm_neon.h> +#include "dot_toolchain_support.h" +#include <cassert> + + +inline void a64_gemm_s8_12x8(const int8_t *Apanel, const int8_t *Bpanel, int32_t *Cpanel, int ablocks, int bblocks, int K) { + assert(Apanel); + assert(Bpanel); + assert(Cpanel); + K/=4; + const long int row_jump=0; + const long int block_jump=0; + const int32_t *a_ptr = reinterpret_cast<const int32_t*>(Apanel); + int32_t *c_ptr = reinterpret_cast<int32_t*>(Cpanel); + for (int yb=0; yb<ablocks; yb++) { + const int32_t *a_ptr0 = a_ptr; + const int32_t *b_ptr = reinterpret_cast<const int32_t*>(Bpanel); + for (int xb=0; xb<bblocks; xb++) { + a_ptr = a_ptr0; + // Fix up for odd lengths - set a flag if K is odd, but make + // sure we round up the iteration count. + int oddk = (K & 1); + int k = ((K+1)/2) - 1; + register int32x4_t a0 asm("v0"); + register int32x4_t a1 asm("v1"); + register int32x4_t b0 asm("v2"); + register int32x4_t b1 asm("v3"); + register int32x4_t b2 asm("v4"); + register int32x4_t a0a asm("v5"); + register int32x4_t a1a asm("v6"); + __asm __volatile ( + // Initialize result registers, load initial operands, prime prefetches. + "movi v8.4s, #0x0\n" + "ldr %q[a0], [%[a_ptr]]\n" + "movi v9.4s, #0x0\n" + "ldr %q[b0], [%[b_ptr]]\n" + "movi v10.4s, #0x0\n" + "ldr %q[a1], [%[a_ptr], #16]\n" + "movi v11.4s, #0x0\n" + "ldr %q[b1], [%[b_ptr], #16]\n" + "movi v12.4s, #0x0\n" + ASM_PREFETCH("[%[b_ptr], #64]") + "movi v13.4s, #0x0\n" + ASM_PREFETCH("[%[a_ptr], #64]") + "movi v14.4s, #0x0\n" + ASM_PREFETCH("[%[b_ptr], #128]") + "movi v15.4s, #0x0\n" + ASM_PREFETCH("[%[a_ptr], #128]") + "movi v16.4s, #0x0\n" + ASM_PREFETCH("[%[b_ptr], #192]") + "movi v17.4s, #0x0\n" + ASM_PREFETCH("[%[b_ptr], #256]") + "movi v18.4s, #0x0\n" + ASM_PREFETCH("[%[a_ptr], #192]") + "movi v19.4s, #0x0\n" + ASM_PREFETCH("[%[b_ptr], #320]") + "movi v20.4s, #0x0\n" + ASM_PREFETCH("[%[a_ptr], #256]") + "movi v21.4s, #0x0\n" + ASM_PREFETCH("[%[b_ptr], #384]") + "movi v22.4s, #0x0\n" + "movi v23.4s, #0x0\n" + "movi v24.4s, #0x0\n" + "movi v25.4s, #0x0\n" + "movi v26.4s, #0x0\n" + "movi v27.4s, #0x0\n" + "movi v28.4s, #0x0\n" + "movi v29.4s, #0x0\n" + "movi v30.4s, #0x0\n" + "movi v31.4s, #0x0\n" + + // Skip loop if we are doing zero iterations of it. 
+ "cbz %w[k], 4f\n" + + _DECLARE_SDOT + + // Loop proper + "1:\n" + "sdot v8.4s , %[b0].16b, %[a0].4b[0]\n" + "sdot v9.4s , %[b0].16b, %[a0].4b[1]\n" + + "ldr %q[b2], [%[b_ptr], #32]\n" + "sdot v10.4s, %[b0].16b, %[a0].4b[2]\n" + "add %[b_ptr], %[b_ptr], %[row_jump]\n" + "sdot v11.4s, %[b0].16b, %[a0].4b[3]\n" + "ldr %q[a0a], [%[a_ptr], #32]\n" + "sdot v12.4s, %[b0].16b, %[a1].4b[0]\n" + "sdot v13.4s, %[b0].16b, %[a1].4b[1]\n" + "ldr %q[a1a], [%[a_ptr], #48]\n" + "sdot v14.4s, %[b0].16b, %[a1].4b[2]\n" + "sdot v15.4s, %[b0].16b, %[a1].4b[3]\n" + "ldr %q[b0], [%[b_ptr], #48]\n" + + "sdot v16.4s, %[b1].16b, %[a0].4b[0]\n" + "sdot v17.4s, %[b1].16b, %[a0].4b[1]\n" + ASM_PREFETCH("[%[a_ptr], #320]") + "sdot v18.4s, %[b1].16b, %[a0].4b[2]\n" + "sdot v19.4s, %[b1].16b, %[a0].4b[3]\n" + "sdot v20.4s, %[b1].16b, %[a1].4b[0]\n" + "sdot v21.4s, %[b1].16b, %[a1].4b[1]\n" + "sdot v22.4s, %[b1].16b, %[a1].4b[2]\n" + "sdot v23.4s, %[b1].16b, %[a1].4b[3]\n" + "ldr %q[b1], [%[b_ptr], #64]\n" + + "sdot v24.4s, %[b2].16b, %[a0].4b[0]\n" + "sdot v25.4s, %[b2].16b, %[a0].4b[1]\n" + ASM_PREFETCH("[%[b_ptr], #448]") + "sdot v26.4s, %[b2].16b, %[a0].4b[2]\n" + "sdot v27.4s, %[b2].16b, %[a0].4b[3]\n" + "sdot v28.4s, %[b2].16b, %[a1].4b[0]\n" + "sdot v29.4s, %[b2].16b, %[a1].4b[1]\n" + "sdot v30.4s, %[b2].16b, %[a1].4b[2]\n" + "sdot v31.4s, %[b2].16b, %[a1].4b[3]\n" + "ldr %q[b2], [%[b_ptr], #80]\n" + + "sdot v8.4s , %[b0].16b, %[a0a].4b[0]\n" + "sdot v9.4s , %[b0].16b, %[a0a].4b[1]\n" + "ldr %q[a0], [%[a_ptr], #64]\n" + "sdot v10.4s, %[b0].16b, %[a0a].4b[2]\n" + "add %[b_ptr], %[b_ptr], %[row_jump]\n" + "sdot v11.4s, %[b0].16b, %[a0a].4b[3]\n" + "sdot v12.4s, %[b0].16b, %[a1a].4b[0]\n" + "ldr %q[a1], [%[a_ptr], #80]\n" + "sdot v13.4s, %[b0].16b, %[a1a].4b[1]\n" + "sdot v14.4s, %[b0].16b, %[a1a].4b[2]\n" + "sdot v15.4s, %[b0].16b, %[a1a].4b[3]\n" + "ldr %q[b0], [%[b_ptr], #96]\n" + + "sdot v16.4s, %[b1].16b, %[a0a].4b[0]\n" + "sdot v17.4s, %[b1].16b, %[a0a].4b[1]\n" + ASM_PREFETCH("[%[b_ptr], #512]") + "sdot v18.4s, %[b1].16b, %[a0a].4b[2]\n" + "sdot v19.4s, %[b1].16b, %[a0a].4b[3]\n" + "sdot v20.4s, %[b1].16b, %[a1a].4b[0]\n" + "sdot v21.4s, %[b1].16b, %[a1a].4b[1]\n" + "sdot v22.4s, %[b1].16b, %[a1a].4b[2]\n" + "sdot v23.4s, %[b1].16b, %[a1a].4b[3]\n" + "ldr %q[b1], [%[b_ptr], #112]\n" + + "sdot v24.4s, %[b2].16b, %[a0a].4b[0]\n" + "sdot v25.4s, %[b2].16b, %[a0a].4b[1]\n" + "add %[a_ptr], %[a_ptr], #64\n" + "sdot v26.4s, %[b2].16b, %[a0a].4b[2]\n" + "sdot v27.4s, %[b2].16b, %[a0a].4b[3]\n" + "add %[b_ptr], %[b_ptr], #96\n" + "sdot v28.4s, %[b2].16b, %[a1a].4b[0]\n" + "sdot v29.4s, %[b2].16b, %[a1a].4b[1]\n" + "subs %w[k], %w[k], #1\n" + "sdot v30.4s, %[b2].16b, %[a1a].4b[2]\n" + "sdot v31.4s, %[b2].16b, %[a1a].4b[3]\n" + "bne 1b\n" + + // Target to use when K is 1 or 2 (i.e. 
zero iterations of main loop) + "4:\n" + + // Branch to alternative tail for odd K + "cbnz %w[oddk], 2f\n" + + // Detached final iteration (even K) + "sdot v8.4s , %[b0].16b, %[a0].4b[0]\n" + "sdot v9.4s , %[b0].16b, %[a0].4b[1]\n" + "ldr %q[b2], [%[b_ptr], #32]\n" + "sdot v10.4s, %[b0].16b, %[a0].4b[2]\n" + "add %[b_ptr], %[b_ptr], %[row_jump]\n" + "sdot v11.4s, %[b0].16b, %[a0].4b[3]\n" + "ldr %q[a0a], [%[a_ptr], #32]\n" + "sdot v12.4s, %[b0].16b, %[a1].4b[0]\n" + "sdot v13.4s, %[b0].16b, %[a1].4b[1]\n" + "ldr %q[a1a], [%[a_ptr], #48]\n" + "sdot v14.4s, %[b0].16b, %[a1].4b[2]\n" + "sdot v15.4s, %[b0].16b, %[a1].4b[3]\n" + "ldr %q[b0], [%[b_ptr], #48]\n" + + "sdot v16.4s, %[b1].16b, %[a0].4b[0]\n" + "sdot v17.4s, %[b1].16b, %[a0].4b[1]\n" + "sdot v18.4s, %[b1].16b, %[a0].4b[2]\n" + "sdot v19.4s, %[b1].16b, %[a0].4b[3]\n" + "sdot v20.4s, %[b1].16b, %[a1].4b[0]\n" + "sdot v21.4s, %[b1].16b, %[a1].4b[1]\n" + "sdot v22.4s, %[b1].16b, %[a1].4b[2]\n" + "sdot v23.4s, %[b1].16b, %[a1].4b[3]\n" + "ldr %q[b1], [%[b_ptr], #64]\n" + + "sdot v24.4s, %[b2].16b, %[a0].4b[0]\n" + "sdot v25.4s, %[b2].16b, %[a0].4b[1]\n" + "add %[a_ptr], %[a_ptr], #64\n" + "sdot v26.4s, %[b2].16b, %[a0].4b[2]\n" + "sdot v27.4s, %[b2].16b, %[a0].4b[3]\n" + "sdot v28.4s, %[b2].16b, %[a1].4b[0]\n" + "sdot v29.4s, %[b2].16b, %[a1].4b[1]\n" + "sdot v30.4s, %[b2].16b, %[a1].4b[2]\n" + "sdot v31.4s, %[b2].16b, %[a1].4b[3]\n" + "ldr %q[b2], [%[b_ptr], #80]\n" + + "sdot v8.4s , %[b0].16b, %[a0a].4b[0]\n" + + "add %[b_ptr], %[b_ptr], %[block_jump]\n" + "sdot v16.4s, %[b1].16b, %[a0a].4b[0]\n" + "add %[b_ptr], %[b_ptr], #96\n" + "sdot v9.4s , %[b0].16b, %[a0a].4b[1]\n" + "add %[b_ptr], %[b_ptr], %[row_jump]\n" + "str q8, [%[c_ptr], #0]\n" + "sdot v17.4s, %[b1].16b, %[a0a].4b[1]\n" + "str q16, [%[c_ptr], #16]\n" + "sdot v24.4s, %[b2].16b, %[a0a].4b[0]\n" + "str q24, [%[c_ptr], #32]\n" + + "sdot v25.4s, %[b2].16b, %[a0a].4b[1]\n" + "str q9, [%[c_ptr], #48]\n" + "sdot v10.4s, %[b0].16b, %[a0a].4b[2]\n" + "str q17, [%[c_ptr], #64]\n" + "sdot v18.4s, %[b1].16b, %[a0a].4b[2]\n" + "str q25, [%[c_ptr], #80]\n" + "sdot v26.4s, %[b2].16b, %[a0a].4b[2]\n" + "str q10, [%[c_ptr], #96]\n" + + "sdot v11.4s, %[b0].16b, %[a0a].4b[3]\n" + "str q18, [%[c_ptr], #112]\n" + "sdot v19.4s, %[b1].16b, %[a0a].4b[3]\n" + "str q26, [%[c_ptr], #128]\n" + "sdot v27.4s, %[b2].16b, %[a0a].4b[3]\n" + "str q11, [%[c_ptr], #144]\n" + + "sdot v12.4s, %[b0].16b, %[a1a].4b[0]\n" + "str q19, [%[c_ptr], #160]\n" + "sdot v20.4s, %[b1].16b, %[a1a].4b[0]\n" + "str q27, [%[c_ptr], #176]\n" + "sdot v28.4s, %[b2].16b, %[a1a].4b[0]\n" + "str q12, [%[c_ptr], #192]\n" + + "sdot v13.4s, %[b0].16b, %[a1a].4b[1]\n" + "str q20, [%[c_ptr], #208]\n" + "sdot v21.4s, %[b1].16b, %[a1a].4b[1]\n" + "str q28, [%[c_ptr], #224]\n" + "sdot v29.4s, %[b2].16b, %[a1a].4b[1]\n" + "str q13, [%[c_ptr], #240]\n" + + "sdot v14.4s, %[b0].16b, %[a1a].4b[2]\n" + "str q21, [%[c_ptr], #256]\n" + "sdot v22.4s, %[b1].16b, %[a1a].4b[2]\n" + "str q29, [%[c_ptr], #272]\n" + "sdot v30.4s, %[b2].16b, %[a1a].4b[2]\n" + "str q14, [%[c_ptr], #288]\n" + + "sdot v15.4s, %[b0].16b, %[a1a].4b[3]\n" + "str q22, [%[c_ptr], #304]\n" + "sdot v23.4s, %[b1].16b, %[a1a].4b[3]\n" + "str q30, [%[c_ptr], #320]\n" + "sdot v31.4s, %[b2].16b, %[a1a].4b[3]\n" + "str q15, [%[c_ptr], #336]\n" + + "b 3f\n" + + // Detached final iteration (odd K) + "2:\n" + "sdot v8.4s , %[b0].16b, %[a0].4b[0]\n" + "ldr %q[b2], [%[b_ptr], #32]\n" + "sdot v16.4s, %[b1].16b, %[a0].4b[0]\n" + "add %[b_ptr], %[b_ptr], %[row_jump]\n" + "sdot v9.4s , %[b0].16b, 
%[a0].4b[1]\n" + "str q8, [%[c_ptr], #0]\n" + "sdot v17.4s, %[b1].16b, %[a0].4b[1]\n" + "str q16, [%[c_ptr], #16]\n" + "sdot v24.4s, %[b2].16b, %[a0].4b[0]\n" + "add %[b_ptr], %[b_ptr], #48\n" + "add %[a_ptr], %[a_ptr], #32\n" + "str q24, [%[c_ptr], #32]\n" + "sdot v25.4s, %[b2].16b, %[a0].4b[1]\n" + "str q9, [%[c_ptr], #48]\n" + + "sdot v10.4s, %[b0].16b, %[a0].4b[2]\n" + "str q17, [%[c_ptr], #64]\n" + "sdot v18.4s, %[b1].16b, %[a0].4b[2]\n" + "str q25, [%[c_ptr], #80]\n" + "sdot v26.4s, %[b2].16b, %[a0].4b[2]\n" + "str q10, [%[c_ptr], #96]\n" + + "sdot v11.4s, %[b0].16b, %[a0].4b[3]\n" + "str q18, [%[c_ptr], #112]\n" + "sdot v19.4s, %[b1].16b, %[a0].4b[3]\n" + "str q26, [%[c_ptr], #128]\n" + "sdot v27.4s, %[b2].16b, %[a0].4b[3]\n" + "str q11, [%[c_ptr], #144]\n" + + "sdot v12.4s, %[b0].16b, %[a1].4b[0]\n" + "str q19, [%[c_ptr], #160]\n" + "sdot v20.4s, %[b1].16b, %[a1].4b[0]\n" + "str q27, [%[c_ptr], #176]\n" + "sdot v28.4s, %[b2].16b, %[a1].4b[0]\n" + "str q12, [%[c_ptr], #192]\n" + + "sdot v13.4s, %[b0].16b, %[a1].4b[1]\n" + "str q20, [%[c_ptr], #208]\n" + "sdot v21.4s, %[b1].16b, %[a1].4b[1]\n" + "str q28, [%[c_ptr], #224]\n" + "sdot v29.4s, %[b2].16b, %[a1].4b[1]\n" + "str q13, [%[c_ptr], #240]\n" + + "sdot v14.4s, %[b0].16b, %[a1].4b[2]\n" + "str q21, [%[c_ptr], #256]\n" + "sdot v22.4s, %[b1].16b, %[a1].4b[2]\n" + "str q29, [%[c_ptr], #272]\n" + "sdot v30.4s, %[b2].16b, %[a1].4b[2]\n" + "str q14, [%[c_ptr], #288]\n" + + "sdot v15.4s, %[b0].16b, %[a1].4b[3]\n" + "str q22, [%[c_ptr], #304]\n" + "sdot v23.4s, %[b1].16b, %[a1].4b[3]\n" + "str q30, [%[c_ptr], #320]\n" + "sdot v31.4s, %[b2].16b, %[a1].4b[3]\n" + "str q15, [%[c_ptr], #336]\n" + + + // Common tail + "3:\n" + "str q23, [%[c_ptr], #352]\n" + "str q31, [%[c_ptr], #368]\n" + "add %[c_ptr], %[c_ptr], #384\n" + + ".purgem sdot\n" + : + [a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [c_ptr] "+r" (c_ptr), + [a0] "+w" (a0), [a1] "+w" (a1), [a0a] "+w" (a0a), [a1a] "+w" (a1a), + [b0] "+w" (b0), [b1] "+w" (b1), [b2] "+w" (b2), [k] "+r" (k) + : [oddk] "r" (oddk), [row_jump] "r" (row_jump), [block_jump] "r" (block_jump) + : "x20", "x21", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", + "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc" + ); + } + } + + +} + + +#endif diff --git a/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s8_4x4.hpp b/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s8_4x4.hpp new file mode 100644 index 000000000..1588f049f --- /dev/null +++ b/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s8_4x4.hpp @@ -0,0 +1,61 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#pragma once + +#ifdef __aarch64__ + +// Load the actual kernel +#include "a64_gemm_s8_4x4/generic.hpp" + +class gemm_s8_4x4 { +public: + typedef int8_t operand_type; + typedef int32_t result_type; + + typedef void (*kern_type)(const int8_t *, const int8_t *, int32_t *, int, int, int); + + /* Describes the data layout for A input */ + static const int A_interleave = 4; + static const int A_block = 16; + static const bool A_transpose = false; + + /* Same for B input */ + static const int B_interleave = 4; + static const int B_block = 16; + static const bool B_transpose = true; + + /* Kernel blocking parameters */ + static const int out_width = 4; + static const int out_height = 4; + static const int k_unroll = 16; + + kern_type kernel = nullptr; + + gemm_s8_4x4(const CPUInfo *ci) { + kernel = a64_gemm_s8_4x4; + } +}; + +#endif // __aarch64__ + diff --git a/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s8_4x4/generic.hpp b/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s8_4x4/generic.hpp new file mode 100644 index 000000000..0ec435b33 --- /dev/null +++ b/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_s8_4x4/generic.hpp @@ -0,0 +1,465 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#pragma once + +#ifdef __aarch64__ + +#include <arm_neon.h> + +inline void a64_gemm_s8_4x4(const int8_t *Apanel, const int8_t *Bpanel, int32_t *Cpanel, int ablocks, int bblocks, int K) { + const int8_t *a_ptr = Apanel; + int32_t *c_ptr = Cpanel; + K /= 16; + int oddk = (K & 1); + + for (int yb=0; yb<ablocks; yb++) { + const int8_t *a_ptr0 = a_ptr; + const int8_t *b_ptr = Bpanel; + + for (int xb=0; xb<bblocks; xb++) { + a_ptr = a_ptr0; + + int k = ((K+1)/2)-1; + + register int8x16_t b0 asm("v4"); + register int8x16_t b1 asm("v5"); + register int8x16_t b2 asm("v6"); + register int8x16_t b3 asm("v7"); + register int8x16_t b0a asm("v8"); + register int8x16_t b1a asm("v9"); + register int8x16_t b2a asm("v10"); + register int8x16_t b3a asm("v11"); + + __asm __volatile ( + "movi v16.4s, #0x0\n" + "ldr q0, [%[a_ptr]]\n" + "movi v17.4s, #0x0\n" + "ldr %q[b0], [%[b_ptr]]\n" + "movi v18.4s, #0x0\n" + "ldr %q[b1], [%[b_ptr], #16]\n" + "movi v19.4s, #0x0\n" + "ldr %q[b2], [%[b_ptr], #32]\n" + "movi v20.4s, #0x0\n" + "ldr %q[b3], [%[b_ptr], #48]\n" + "movi v21.4s, #0x0\n" + "ldr q1, [%[a_ptr], #16]\n" + "movi v22.4s, #0x0\n" + "ldr q2, [%[a_ptr], #32]\n" + "movi v23.4s, #0x0\n" + "ldr q3, [%[a_ptr], #48]\n" + "movi v24.4s, #0x0\n" + ASM_PREFETCH("[%[b_ptr], #64]") + "movi v25.4s, #0x0\n" + ASM_PREFETCH("[%[a_ptr], #64]") + "movi v26.4s, #0x0\n" + ASM_PREFETCH("[%[b_ptr], #128]") + "movi v27.4s, #0x0\n" + ASM_PREFETCH("[%[a_ptr], #128]") + "movi v28.4s, #0x0\n" + ASM_PREFETCH("[%[b_ptr], #192]") + "movi v29.4s, #0x0\n" + ASM_PREFETCH("[%[a_ptr], #192]") + "movi v30.4s, #0x0\n" + ASM_PREFETCH("[%[b_ptr], #256]") + "movi v31.4s, #0x0\n" + ASM_PREFETCH("[%[a_ptr], #256]") + + // Loop structure optimized for A57 (after r0). + + // Unavoidably, the multiply will "dribble" if + // dual issued with an add. + + // Minimize the effect of this by making sure + // there are 2 adds to run under the dribbled + // multiply. + + // Pipeline in blocks of 8 multiplies - combine + // this iteration's multiplies with adds from + // the previous iteration. + + // So the first block doesn't have any adds to + // do - but because all the adds are at the + // start of the block it's only the first couple + // of multiplies that need to be pulled out. + + // Start of unroll 0 (first iteration) + "smull v12.8h, v0.8b, %[b0].8b\n" + "smull v13.8h, v0.8b, %[b1].8b\n" + + // Skip loop if we are doing zero iterations of it. 
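+ // Note: each row block widens the 8-bit operands into 16-bit partial products
+ // in v12..v15 (smull/smlal2); the following block then folds them pairwise into
+ // the 32-bit accumulators v16..v31 with sadalp.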
+ "cbz %w[k], 4f\n" + + // Unroll 0 continuation (branch target) + "1:\n" + "smull v14.8h, v0.8b, %[b2].8b\n" + "subs %w[k], %w[k], #1\n" + "smull v15.8h, v0.8b, %[b3].8b\n" + "ldr %q[b0a], [%[b_ptr], #64]\n" + "smlal2 v12.8h, v0.16b, %[b0].16b\n" + "smlal2 v13.8h, v0.16b, %[b1].16b\n" + "ldr %q[b1a], [%[b_ptr], #80]\n" + "smlal2 v14.8h, v0.16b, %[b2].16b\n" + "smlal2 v15.8h, v0.16b, %[b3].16b\n" + "ldr q0, [%[a_ptr], #64]\n" + + "sadalp v16.4s, v12.8h\n" + "smull v12.8h, v1.8b, %[b0].8b\n" + "sadalp v17.4s, v13.8h\n" + "sadalp v18.4s, v14.8h\n" + "smull v13.8h, v1.8b, %[b1].8b\n" + "sadalp v19.4s, v15.8h\n" + "smull v14.8h, v1.8b, %[b2].8b\n" + "ldr %q[b2a], [%[b_ptr], #96]\n" + "smull v15.8h, v1.8b, %[b3].8b\n" + "smlal2 v12.8h, v1.16b, %[b0].16b\n" + "ldr %q[b3a], [%[b_ptr], #112]\n" + "smlal2 v13.8h, v1.16b, %[b1].16b\n" + "add %[b_ptr], %[b_ptr], #128\n" + "smlal2 v14.8h, v1.16b, %[b2].16b\n" + "smlal2 v15.8h, v1.16b, %[b3].16b\n" + "ldr q1, [%[a_ptr], #80]\n" + + "sadalp v20.4s, v12.8h\n" + "smull v12.8h, v2.8b, %[b0].8b\n" + "sadalp v21.4s, v13.8h\n" + "sadalp v22.4s, v14.8h\n" + "smull v13.8h, v2.8b, %[b1].8b\n" + "sadalp v23.4s, v15.8h\n" + "smull v14.8h, v2.8b, %[b2].8b\n" + "smull v15.8h, v2.8b, %[b3].8b\n" + "smlal2 v12.8h, v2.16b, %[b0].16b\n" + ASM_PREFETCH("[%[b_ptr], #192]") + "smlal2 v13.8h, v2.16b, %[b1].16b\n" + "smlal2 v14.8h, v2.16b, %[b2].16b\n" + ASM_PREFETCH("[%[a_ptr], #320]") + "smlal2 v15.8h, v2.16b, %[b3].16b\n" + "ldr q2, [%[a_ptr], #96]\n" + + "sadalp v24.4s, v12.8h\n" + "smull v12.8h, v3.8b, %[b0].8b\n" + "sadalp v25.4s, v13.8h\n" + "sadalp v26.4s, v14.8h\n" + "smull v13.8h, v3.8b, %[b1].8b\n" + "sadalp v27.4s, v15.8h\n" + "smull v14.8h, v3.8b, %[b2].8b\n" + "smull v15.8h, v3.8b, %[b3].8b\n" + "smlal2 v12.8h, v3.16b, %[b0].16b\n" + "ldr %q[b0], [%[b_ptr], #0]\n" + "smlal2 v13.8h, v3.16b, %[b1].16b\n" + "smlal2 v14.8h, v3.16b, %[b2].16b\n" + "smlal2 v15.8h, v3.16b, %[b3].16b\n" + "ldr q3, [%[a_ptr], #112]\n" + + // Unroll 1 + "sadalp v28.4s, v12.8h\n" + "smull v12.8h, v0.8b, %[b0a].8b\n" + "sadalp v29.4s, v13.8h\n" + "sadalp v30.4s, v14.8h\n" + "smull v13.8h, v0.8b, %[b1a].8b\n" + "sadalp v31.4s, v15.8h\n" + "smull v14.8h, v0.8b, %[b2a].8b\n" + "smull v15.8h, v0.8b, %[b3a].8b\n" + "ldr %q[b1], [%[b_ptr], #16]\n" + "smlal2 v12.8h, v0.16b, %[b0a].16b\n" + "smlal2 v13.8h, v0.16b, %[b1a].16b\n" + "ldr %q[b2], [%[b_ptr], #32]\n" + "smlal2 v14.8h, v0.16b, %[b2a].16b\n" + "smlal2 v15.8h, v0.16b, %[b3a].16b\n" + "ldr q0, [%[a_ptr], #128]\n" + + "sadalp v16.4s, v12.8h\n" + "smull v12.8h, v1.8b, %[b0a].8b\n" + "sadalp v17.4s, v13.8h\n" + "sadalp v18.4s, v14.8h\n" + "smull v13.8h, v1.8b, %[b1a].8b\n" + "sadalp v19.4s, v15.8h\n" + "add %[a_ptr], %[a_ptr], #128\n" + "smull v14.8h, v1.8b, %[b2a].8b\n" + "smull v15.8h, v1.8b, %[b3a].8b\n" + "ldr %q[b3], [%[b_ptr], #48]\n" + "smlal2 v12.8h, v1.16b, %[b0a].16b\n" + "smlal2 v13.8h, v1.16b, %[b1a].16b\n" + "smlal2 v14.8h, v1.16b, %[b2a].16b\n" + "smlal2 v15.8h, v1.16b, %[b3a].16b\n" + "ldr q1, [%[a_ptr], #16]\n" + + "sadalp v20.4s, v12.8h\n" + "smull v12.8h, v2.8b, %[b0a].8b\n" + "sadalp v21.4s, v13.8h\n" + "sadalp v22.4s, v14.8h\n" + "smull v13.8h, v2.8b, %[b1a].8b\n" + "sadalp v23.4s, v15.8h\n" + "smull v14.8h, v2.8b, %[b2a].8b\n" + "smull v15.8h, v2.8b, %[b3a].8b\n" + "smlal2 v12.8h, v2.16b, %[b0a].16b\n" + ASM_PREFETCH("[%[b_ptr], #256]") + "smlal2 v13.8h, v2.16b, %[b1a].16b\n" + "smlal2 v14.8h, v2.16b, %[b2a].16b\n" + ASM_PREFETCH("[%[a_ptr], #256]") + "smlal2 v15.8h, v2.16b, %[b3a].16b\n" + "ldr q2, [%[a_ptr], #32]\n" + + 
"sadalp v24.4s, v12.8h\n" + "smull v12.8h, v3.8b, %[b0a].8b\n" + "sadalp v25.4s, v13.8h\n" + "sadalp v26.4s, v14.8h\n" + "smull v13.8h, v3.8b, %[b1a].8b\n" + "sadalp v27.4s, v15.8h\n" + "smull v14.8h, v3.8b, %[b2a].8b\n" + "smull v15.8h, v3.8b, %[b3a].8b\n" + "smlal2 v12.8h, v3.16b, %[b0a].16b\n" + "smlal2 v13.8h, v3.16b, %[b1a].16b\n" + "smlal2 v14.8h, v3.16b, %[b2a].16b\n" + "smlal2 v15.8h, v3.16b, %[b3a].16b\n" + "ldr q3, [%[a_ptr], #48]\n" + + // Start of unroll 0 for next iteration. + "sadalp v28.4s, v12.8h\n" + "smull v12.8h, v0.8b, %[b0].8b\n" + "sadalp v29.4s, v13.8h\n" + "sadalp v30.4s, v14.8h\n" + "smull v13.8h, v0.8b, %[b1].8b\n" + "sadalp v31.4s, v15.8h\n" + "bne 1b\n" + + // Target to use when K=1 or 2 (i.e. zero iterations of main loop) + "4:\n" + + // Branch to alternative tail for odd K + "cbnz %w[oddk], 2f\n" + + // Detached final iteration (even K) + "smull v14.8h, v0.8b, %[b2].8b\n" + "smull v15.8h, v0.8b, %[b3].8b\n" + "ldr %q[b0a], [%[b_ptr], #64]\n" + "smlal2 v12.8h, v0.16b, %[b0].16b\n" + "smlal2 v13.8h, v0.16b, %[b1].16b\n" + "ldr %q[b1a], [%[b_ptr], #80]\n" + "smlal2 v14.8h, v0.16b, %[b2].16b\n" + "smlal2 v15.8h, v0.16b, %[b3].16b\n" + "ldr q0, [%[a_ptr], #64]\n" + + "sadalp v16.4s, v12.8h\n" + "smull v12.8h, v1.8b, %[b0].8b\n" + "sadalp v17.4s, v13.8h\n" + "sadalp v18.4s, v14.8h\n" + "smull v13.8h, v1.8b, %[b1].8b\n" + "sadalp v19.4s, v15.8h\n" + "smull v14.8h, v1.8b, %[b2].8b\n" + "ldr %q[b2a], [%[b_ptr], #96]\n" + "smull v15.8h, v1.8b, %[b3].8b\n" + "smlal2 v12.8h, v1.16b, %[b0].16b\n" + "ldr %q[b3a], [%[b_ptr], #112]\n" + "smlal2 v13.8h, v1.16b, %[b1].16b\n" + "add %[b_ptr], %[b_ptr], #128\n" + "smlal2 v14.8h, v1.16b, %[b2].16b\n" + "smlal2 v15.8h, v1.16b, %[b3].16b\n" + "ldr q1, [%[a_ptr], #80]\n" + + "sadalp v20.4s, v12.8h\n" + "smull v12.8h, v2.8b, %[b0].8b\n" + "sadalp v21.4s, v13.8h\n" + "sadalp v22.4s, v14.8h\n" + "smull v13.8h, v2.8b, %[b1].8b\n" + "sadalp v23.4s, v15.8h\n" + "smull v14.8h, v2.8b, %[b2].8b\n" + "smull v15.8h, v2.8b, %[b3].8b\n" + "smlal2 v12.8h, v2.16b, %[b0].16b\n" + "smlal2 v13.8h, v2.16b, %[b1].16b\n" + "smlal2 v14.8h, v2.16b, %[b2].16b\n" + "smlal2 v15.8h, v2.16b, %[b3].16b\n" + "ldr q2, [%[a_ptr], #96]\n" + + "sadalp v24.4s, v12.8h\n" + "smull v12.8h, v3.8b, %[b0].8b\n" + "sadalp v25.4s, v13.8h\n" + "sadalp v26.4s, v14.8h\n" + "smull v13.8h, v3.8b, %[b1].8b\n" + "sadalp v27.4s, v15.8h\n" + "smull v14.8h, v3.8b, %[b2].8b\n" + "smull v15.8h, v3.8b, %[b3].8b\n" + "smlal2 v12.8h, v3.16b, %[b0].16b\n" + "smlal2 v13.8h, v3.16b, %[b1].16b\n" + "smlal2 v14.8h, v3.16b, %[b2].16b\n" + "smlal2 v15.8h, v3.16b, %[b3].16b\n" + "ldr q3, [%[a_ptr], #112]\n" + + // Unroll 1 + "sadalp v28.4s, v12.8h\n" + "smull v12.8h, v0.8b, %[b0a].8b\n" + "sadalp v29.4s, v13.8h\n" + "sadalp v30.4s, v14.8h\n" + "smull v13.8h, v0.8b, %[b1a].8b\n" + "sadalp v31.4s, v15.8h\n" + "smull v14.8h, v0.8b, %[b2a].8b\n" + "add %[a_ptr], %[a_ptr], #128\n" + "smull v15.8h, v0.8b, %[b3a].8b\n" + "smlal2 v12.8h, v0.16b, %[b0a].16b\n" + "smlal2 v13.8h, v0.16b, %[b1a].16b\n" + "smlal2 v14.8h, v0.16b, %[b2a].16b\n" + "smlal2 v15.8h, v0.16b, %[b3a].16b\n" + + "sadalp v16.4s, v12.8h\n" + "smull v12.8h, v1.8b, %[b0a].8b\n" + "sadalp v17.4s, v13.8h\n" + "sadalp v18.4s, v14.8h\n" + "smull v13.8h, v1.8b, %[b1a].8b\n" + "sadalp v19.4s, v15.8h\n" + "smull v14.8h, v1.8b, %[b2a].8b\n" + "smull v15.8h, v1.8b, %[b3a].8b\n" + "smlal2 v12.8h, v1.16b, %[b0a].16b\n" + "addp v16.4s, v16.4s, v17.4s\n" + "smlal2 v13.8h, v1.16b, %[b1a].16b\n" + "addp v17.4s, v18.4s, v19.4s\n" + "smlal2 v14.8h, v1.16b, 
%[b2a].16b\n" + "smlal2 v15.8h, v1.16b, %[b3a].16b\n" + + "sadalp v20.4s, v12.8h\n" + "smull v12.8h, v2.8b, %[b0a].8b\n" + "sadalp v21.4s, v13.8h\n" + "sadalp v22.4s, v14.8h\n" + "smull v13.8h, v2.8b, %[b1a].8b\n" + "sadalp v23.4s, v15.8h\n" + "addp v16.4s, v16.4s, v17.4s\n" + "smull v14.8h, v2.8b, %[b2a].8b\n" + "addp v18.4s, v20.4s, v21.4s\n" + "addp v19.4s, v22.4s, v23.4s\n" + "smull v15.8h, v2.8b, %[b3a].8b\n" + "smlal2 v12.8h, v2.16b, %[b0a].16b\n" + "str q16, [%[c_ptr]]\n" + "smlal2 v13.8h, v2.16b, %[b1a].16b\n" + "smlal2 v14.8h, v2.16b, %[b2a].16b\n" + "smlal2 v15.8h, v2.16b, %[b3a].16b\n" + + "sadalp v24.4s, v12.8h\n" + "smull v12.8h, v3.8b, %[b0a].8b\n" + "sadalp v25.4s, v13.8h\n" + "sadalp v26.4s, v14.8h\n" + "smull v13.8h, v3.8b, %[b1a].8b\n" + "sadalp v27.4s, v15.8h\n" + "addp v17.4s, v18.4s, v19.4s\n" + "smull v14.8h, v3.8b, %[b2a].8b\n" + "addp v20.4s, v24.4s, v25.4s\n" + "addp v21.4s, v26.4s, v27.4s\n" + "smull v15.8h, v3.8b, %[b3a].8b\n" + "smlal2 v12.8h, v3.16b, %[b0a].16b\n" + "str q17, [%[c_ptr], #16]\n" + "smlal2 v13.8h, v3.16b, %[b1a].16b\n" + "smlal2 v14.8h, v3.16b, %[b2a].16b\n" + "addp v18.4s, v20.4s, v21.4s\n" + "smlal2 v15.8h, v3.16b, %[b3a].16b\n" + "b 3f\n" + + // Detached final iteration (odd K) + "2:\n" + "smull v14.8h, v0.8b, %[b2].8b\n" + "add %[a_ptr], %[a_ptr], #64\n" + "smull v15.8h, v0.8b, %[b3].8b\n" + "add %[b_ptr], %[b_ptr], #64\n" + "smlal2 v12.8h, v0.16b, %[b0].16b\n" + "smlal2 v13.8h, v0.16b, %[b1].16b\n" + "smlal2 v14.8h, v0.16b, %[b2].16b\n" + "smlal2 v15.8h, v0.16b, %[b3].16b\n" + + "sadalp v16.4s, v12.8h\n" + "smull v12.8h, v1.8b, %[b0].8b\n" + "sadalp v17.4s, v13.8h\n" + "sadalp v18.4s, v14.8h\n" + "smull v13.8h, v1.8b, %[b1].8b\n" + "sadalp v19.4s, v15.8h\n" + "smull v14.8h, v1.8b, %[b2].8b\n" + "smull v15.8h, v1.8b, %[b3].8b\n" + "smlal2 v12.8h, v1.16b, %[b0].16b\n" + "addp v16.4s, v16.4s, v17.4s\n" + "smlal2 v13.8h, v1.16b, %[b1].16b\n" + "addp v17.4s, v18.4s, v19.4s\n" + "smlal2 v14.8h, v1.16b, %[b2].16b\n" + "smlal2 v15.8h, v1.16b, %[b3].16b\n" + + "sadalp v20.4s, v12.8h\n" + "smull v12.8h, v2.8b, %[b0].8b\n" + "sadalp v21.4s, v13.8h\n" + "sadalp v22.4s, v14.8h\n" + "smull v13.8h, v2.8b, %[b1].8b\n" + "sadalp v23.4s, v15.8h\n" + "addp v16.4s, v16.4s, v17.4s\n" + "smull v14.8h, v2.8b, %[b2].8b\n" + "addp v18.4s, v20.4s, v21.4s\n" + "addp v19.4s, v22.4s, v23.4s\n" + "smull v15.8h, v2.8b, %[b3].8b\n" + "smlal2 v12.8h, v2.16b, %[b0].16b\n" + "str q16, [%[c_ptr]]\n" + "smlal2 v13.8h, v2.16b, %[b1].16b\n" + "smlal2 v14.8h, v2.16b, %[b2].16b\n" + "smlal2 v15.8h, v2.16b, %[b3].16b\n" + + "sadalp v24.4s, v12.8h\n" + "smull v12.8h, v3.8b, %[b0].8b\n" + "sadalp v25.4s, v13.8h\n" + "sadalp v26.4s, v14.8h\n" + "smull v13.8h, v3.8b, %[b1].8b\n" + "sadalp v27.4s, v15.8h\n" + "addp v17.4s, v18.4s, v19.4s\n" + "smull v14.8h, v3.8b, %[b2].8b\n" + "addp v20.4s, v24.4s, v25.4s\n" + "addp v21.4s, v26.4s, v27.4s\n" + "smull v15.8h, v3.8b, %[b3].8b\n" + "smlal2 v12.8h, v3.16b, %[b0].16b\n" + "str q17, [%[c_ptr], #16]\n" + "smlal2 v13.8h, v3.16b, %[b1].16b\n" + "smlal2 v14.8h, v3.16b, %[b2].16b\n" + "addp v18.4s, v20.4s, v21.4s\n" + "smlal2 v15.8h, v3.16b, %[b3].16b\n" + + "3:\n" + + // Final additions + "sadalp v28.4s, v12.8h\n" + "str q18, [%[c_ptr], #32]\n" + "sadalp v29.4s, v13.8h\n" + "sadalp v30.4s, v14.8h\n" + "sadalp v31.4s, v15.8h\n" + + // Horizontal reduction, phase 1 + "addp v22.4s, v28.4s, v29.4s\n" + "addp v23.4s, v30.4s, v31.4s\n" + + // Horizontal reduction, phase 2 + "addp v19.4s, v22.4s, v23.4s\n" + "str q19, [%[c_ptr], #48]\n" + "add %[c_ptr], 
%[c_ptr], #64\n" + + : + [a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [c_ptr] "+r" (c_ptr), + [b0] "+w" (b0), [b1] "+w" (b1), [b2] "+w" (b2), [b3] "+w" (b3), + [b0a] "+w" (b0a), [b1a] "+w" (b1a), [b2a] "+w" (b2a), [b3a] "+w" (b3a), + [k] "+r" (k) + : [oddk] "r" (oddk) + : "x20", "x21", "v0","v1","v2","v3","v12","v13","v14","v15","v16","v17","v18","v19", + "v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31", "cc"); + } + } +} + +#endif // __aarch64__ diff --git a/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u16_12x8.hpp b/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u16_12x8.hpp new file mode 100644 index 000000000..7eb8b2dac --- /dev/null +++ b/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u16_12x8.hpp @@ -0,0 +1,68 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#pragma once + +#ifdef __aarch64__ + +// Actual kernel implementations +#include "a64_gemm_u16_12x8/generic.hpp" + +// 12x8 SGEMM "strategy" class. +// +// This describes the characteristics of a family of kernels, in terms of +// the required interleave properties and the output block size. +// +// All kernels in the family must share these characteristics. The actual +// kernel to be used can be chosen at runtime, based on the CPU_type +// structure. +class gemm_u16_12x8 { +public: + typedef uint16_t operand_type; + typedef uint32_t result_type; + + typedef void (*kern_type)(const uint16_t *, const uint16_t *, uint32_t *, int, int, int); + + /* Describes the data layout for A input */ + static const int A_interleave = 8; + static const int A_block = 1; + static const int A_transpose = 0; + + /* Same for B input */ + static const int B_interleave = 12; + static const int B_block = 1; + static const int B_transpose = 1; + + /* Kernel blocking parameters */ + static const int out_width = 12; + static const int out_height = 8; + static const int k_unroll = 1; + + kern_type kernel = nullptr; + + gemm_u16_12x8(const CPUInfo *ci) { + kernel = a64_gemm_u16_asimd_12x8; + } +}; + +#endif // __aarch64__ diff --git a/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u16_12x8/generic.hpp b/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u16_12x8/generic.hpp new file mode 100644 index 000000000..b3f310ce6 --- /dev/null +++ b/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u16_12x8/generic.hpp @@ -0,0 +1,314 @@ +/* + * Copyright (c) 2017 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#pragma once +#include <arm_neon.h> + +inline void a64_gemm_u16_asimd_12x8(const uint16_t *Apanel, const uint16_t *Bpanel, uint32_t *Cpanel, int ablocks, int bblocks, int K) +{ + const uint16_t *a_ptr = Apanel; + uint32_t *c_ptr = Cpanel; + + for (int yb = 0; yb < ablocks; yb++) + { + const uint16_t *a_ptr0 = a_ptr; + const uint16_t *b_ptr = Bpanel; + + for (int xb = 0; xb < bblocks; xb++) + { + a_ptr = a_ptr0; + const bool odd_k = K & 0x1; + int k = (K+1)/2 - 1; + + register uint16x8_t aa asm("v0"); + register uint16x8_t ab asm("v1"); + register uint16x8_t b0 asm("v2"); + register uint16x8_t b1 asm("v3"); + register uint16x8_t b2 asm("v4"); + + __asm __volatile ( + "ldr %d[aa], [%x[a_ptr]]\n" // Load A[A].lower + "movi v5.4s, #0\n" + "ldr x20, [%x[a_ptr], #0x08]\n" // Load A[A].upper + "movi v6.4s, #0\n" + "ldr %d[b0], [%x[b_ptr]]\n" // Load B[0].lower + "ins %[aa].d[1], x20\n" // Merge A[A].lower and upper + "movi v7.4s, #0\n" + ASM_PREFETCH("[%[a_ptr], #64]") + "movi v8.4s, #0\n" + "ldr x20, [%x[b_ptr], #0x08]\n" // Load B[0].upper + "movi v9.4s, #0\n" + ASM_PREFETCH("[%[b_ptr], #64]") + "movi v10.4s, #0\n" + "ldr %d[b1], [%x[b_ptr], #0x10]\n" // Load B[1].lower + "ins %[b0].d[1], x20\n" // Merge B[0].lower and upper + "movi v11.4s, #0\n" + ASM_PREFETCH("[%[a_ptr], #96]") + "movi v12.4s, #0\n" + "movi v13.4s, #0\n" + ASM_PREFETCH("[%[b_ptr], #96]") + "movi v14.4s, #0\n" + "movi v15.4s, #0\n" + ASM_PREFETCH("[%[a_ptr], #128]") + "movi v16.4s, #0\n" + "movi v17.4s, #0\n" + ASM_PREFETCH("[%[b_ptr], #128]") + "movi v18.4s, #0\n" + "movi v19.4s, #0\n" + ASM_PREFETCH("[%[a_ptr], #160]") + "movi v20.4s, #0\n" + "movi v21.4s, #0\n" + ASM_PREFETCH("[%[b_ptr], #160]") + "movi v22.4s, #0\n" + "movi v23.4s, #0\n" + ASM_PREFETCH("[%[a_ptr], #192]") + "movi v24.4s, #0\n" + "add %x[a_ptr], %x[a_ptr], #0x10\n" + "movi v25.4s, #0\n" + ASM_PREFETCH("[%[b_ptr], #192]") + "movi v26.4s, #0\n" + "add %x[b_ptr], %x[b_ptr], #0x18\n" + "movi v27.4s, #0\n" + "movi v28.4s, #0\n" + + "cbz %x[k], 2f\n" // Skip the loop if doing zero iterations. 
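+ // Main loop: two K steps per pass. Operands are loaded as 64-bit halves
+ // (ldr d / ldr x20 then ins) so the loads interleave with the umlal/umlal2
+ // accumulates.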
+ + "1:\n" // Main loop + // First unroll + "umlal v5.4s, %[b0].4h, %[aa].h[0]\n" + "ldr x20, [%x[b_ptr]]\n" // Load B[1].upper + "umlal v6.4s, %[b0].4h, %[aa].h[1]\n" + "umlal v7.4s, %[b0].4h, %[aa].h[2]\n" + "ldr %d[ab], [%x[a_ptr]]\n" // Load A[B].lower + "ins %[b1].d[1], x20\n" // Merge B[1].lower and .upper + "umlal v8.4s, %[b0].4h, %[aa].h[3]\n" + "umlal v9.4s, %[b0].4h, %[aa].h[4]\n" + "ldr x20, [%x[a_ptr], #0x8]\n" // Load A[B].upper + "umlal v10.4s, %[b0].4h, %[aa].h[5]\n" + "umlal v11.4s, %[b0].4h, %[aa].h[6]\n" + "ldr %d[b2], [%x[b_ptr], #0x8]\n" // Load B[2].lower + "ins %[ab].d[1], x20\n" // Merge A[B].lower and .upper + "umlal v12.4s, %[b0].4h, %[aa].h[7]\n" + "umlal2 v13.4s, %[b0].8h, %[aa].h[0]\n" + "ldr x20, [%x[b_ptr], #0x10]\n" // Load B[2].upper + "umlal2 v14.4s, %[b0].8h, %[aa].h[1]\n" + "umlal2 v15.4s, %[b0].8h, %[aa].h[2]\n" + "umlal2 v16.4s, %[b0].8h, %[aa].h[3]\n" + "umlal2 v17.4s, %[b0].8h, %[aa].h[4]\n" + "umlal2 v18.4s, %[b0].8h, %[aa].h[5]\n" + "umlal2 v19.4s, %[b0].8h, %[aa].h[6]\n" + "umlal2 v20.4s, %[b0].8h, %[aa].h[7]\n" + "ldr %d[b0], [%x[b_ptr], #0x18]\n" // Load B[0].lower + "ins %[b2].d[1], x20\n" // Merge B[2].lower and .upper + "umlal v21.4s, %[b1].4h, %[aa].h[0]\n" + "umlal v22.4s, %[b1].4h, %[aa].h[1]\n" + "ldr x20, [%x[b_ptr], #0x20]\n" // Load B[0].upper + "umlal v23.4s, %[b1].4h, %[aa].h[2]\n" + "umlal v24.4s, %[b1].4h, %[aa].h[3]\n" + "umlal v25.4s, %[b1].4h, %[aa].h[4]\n" + "umlal v26.4s, %[b1].4h, %[aa].h[5]\n" + "umlal v27.4s, %[b1].4h, %[aa].h[6]\n" + "umlal v28.4s, %[b1].4h, %[aa].h[7]\n" + + // Second unroll + "umlal2 v5.4s, %[b1].8h, %[ab].h[0]\n" + "ldr %d[aa], [%x[a_ptr], #0x10]\n" // Load A[A].lower + "ins %[b0].d[1], x20\n" // Merge B[0].lower and .upper + "umlal2 v6.4s, %[b1].8h, %[ab].h[1]\n" + "umlal2 v7.4s, %[b1].8h, %[ab].h[2]\n" + "ldr x20, [%x[a_ptr], #0x18]\n" // Load A[A].upper + "umlal2 v8.4s, %[b1].8h, %[ab].h[3]\n" + "umlal2 v9.4s, %[b1].8h, %[ab].h[4]\n" + "umlal2 v10.4s, %[b1].8h, %[ab].h[5]\n" + "umlal2 v11.4s, %[b1].8h, %[ab].h[6]\n" + "add %x[a_ptr], %x[a_ptr], #0x20\n" + "umlal2 v12.4s, %[b1].8h, %[ab].h[7]\n" + "umlal v13.4s, %[b2].4h, %[ab].h[0]\n" + ASM_PREFETCH("[%[b_ptr], #320]") + "umlal v14.4s, %[b2].4h, %[ab].h[1]\n" + "umlal v15.4s, %[b2].4h, %[ab].h[2]\n" + ASM_PREFETCH("[%[a_ptr], #320]") + "umlal v16.4s, %[b2].4h, %[ab].h[3]\n" + "umlal v17.4s, %[b2].4h, %[ab].h[4]\n" + ASM_PREFETCH("[%[b_ptr], #448]") + "umlal v18.4s, %[b2].4h, %[ab].h[5]\n" + "umlal v19.4s, %[b2].4h, %[ab].h[6]\n" + "umlal v20.4s, %[b2].4h, %[ab].h[7]\n" + "umlal2 v21.4s, %[b2].8h, %[ab].h[0]\n" + "umlal2 v22.4s, %[b2].8h, %[ab].h[1]\n" + "subs %x[k], %x[k], #0x1\n" + "umlal2 v23.4s, %[b2].8h, %[ab].h[2]\n" + "umlal2 v24.4s, %[b2].8h, %[ab].h[3]\n" + "ldr %d[b1], [%x[b_ptr], #0x28]\n" // Load B[1].lower + "ins %[aa].d[1], x20\n" // Merge A[A].lower and .upper + "umlal2 v25.4s, %[b2].8h, %[ab].h[4]\n" + "umlal2 v26.4s, %[b2].8h, %[ab].h[5]\n" + "add %x[b_ptr], %x[b_ptr], #0x30\n" + "umlal2 v27.4s, %[b2].8h, %[ab].h[6]\n" + "umlal2 v28.4s, %[b2].8h, %[ab].h[7]\n" + "bne 1b\n" + + "2:\n" // Even tail + "cbnz %x[odd_k], 3f\n" + + "umlal v5.4s, %[b0].4h, %[aa].h[0]\n" + "ldr x20, [%x[b_ptr]]\n" // Load B[1].upper + "umlal v6.4s, %[b0].4h, %[aa].h[1]\n" + "umlal v7.4s, %[b0].4h, %[aa].h[2]\n" + "ldr %d[ab], [%x[a_ptr]]\n" // Load A[B].lower + "ins %[b1].d[1], x20\n" // Merge B[1].lower and .upper + "umlal v8.4s, %[b0].4h, %[aa].h[3]\n" + "umlal v9.4s, %[b0].4h, %[aa].h[4]\n" + "ldr x20, [%x[a_ptr], #0x8]\n" // Load A[B].upper + "umlal v10.4s, 
%[b0].4h, %[aa].h[5]\n" + "umlal v11.4s, %[b0].4h, %[aa].h[6]\n" + "ldr %d[b2], [%x[b_ptr], #0x8]\n" // Load B[2].lower + "ins %[ab].d[1], x20\n" // Merge A[B].lower and .upper + "umlal v12.4s, %[b0].4h, %[aa].h[7]\n" + "umlal2 v13.4s, %[b0].8h, %[aa].h[0]\n" + "ldr x20, [%x[b_ptr], #0x10]\n" // Load B[2].upper + "umlal2 v14.4s, %[b0].8h, %[aa].h[1]\n" + "umlal2 v15.4s, %[b0].8h, %[aa].h[2]\n" + "umlal2 v16.4s, %[b0].8h, %[aa].h[3]\n" + "add %[a_ptr], %[a_ptr], #0x10\n" + "umlal2 v17.4s, %[b0].8h, %[aa].h[4]\n" + "add %[b_ptr], %[b_ptr], #0x18\n" + "umlal2 v18.4s, %[b0].8h, %[aa].h[5]\n" + "umlal2 v19.4s, %[b0].8h, %[aa].h[6]\n" + "umlal2 v20.4s, %[b0].8h, %[aa].h[7]\n" + "ins %[b2].d[1], x20\n" // Merge B[2].lower and .upper + "umlal v21.4s, %[b1].4h, %[aa].h[0]\n" + "umlal v22.4s, %[b1].4h, %[aa].h[1]\n" + "umlal v23.4s, %[b1].4h, %[aa].h[2]\n" + "umlal v24.4s, %[b1].4h, %[aa].h[3]\n" + "umlal v25.4s, %[b1].4h, %[aa].h[4]\n" + "umlal v26.4s, %[b1].4h, %[aa].h[5]\n" + "umlal v27.4s, %[b1].4h, %[aa].h[6]\n" + "umlal v28.4s, %[b1].4h, %[aa].h[7]\n" + + "umlal2 v5.4s, %[b1].8h, %[ab].h[0]\n" + "umlal v13.4s, %[b2].4h, %[ab].h[0]\n" + "umlal2 v21.4s, %[b2].8h, %[ab].h[0]\n" + "umlal2 v6.4s, %[b1].8h, %[ab].h[1]\n" + "umlal v14.4s, %[b2].4h, %[ab].h[1]\n" + "str q5, [%x[c_ptr]]\n" + "umlal2 v22.4s, %[b2].8h, %[ab].h[1]\n" + "str q13, [%x[c_ptr], #0x10]\n" + "umlal2 v7.4s, %[b1].8h, %[ab].h[2]\n" + "str q21, [%x[c_ptr], #0x20]\n" + "umlal v15.4s, %[b2].4h, %[ab].h[2]\n" + "str q6, [%x[c_ptr], #0x30]\n" + "umlal2 v23.4s, %[b2].8h, %[ab].h[2]\n" + "str q14, [%x[c_ptr], #0x40]\n" + "umlal2 v8.4s, %[b1].8h, %[ab].h[3]\n" + "str q22, [%x[c_ptr], #0x50]\n" + "umlal v16.4s, %[b2].4h, %[ab].h[3]\n" + "str q7, [%x[c_ptr], #0x60]\n" + "umlal2 v24.4s, %[b2].8h, %[ab].h[3]\n" + "str q15, [%x[c_ptr], #0x70]\n" + "umlal2 v9.4s, %[b1].8h, %[ab].h[4]\n" + "str q23, [%x[c_ptr], #0x80]\n" + "umlal v17.4s, %[b2].4h, %[ab].h[4]\n" + "str q8, [%x[c_ptr], #0x90]\n" + "umlal2 v25.4s, %[b2].8h, %[ab].h[4]\n" + "str q16, [%x[c_ptr], #0xa0]\n" + "umlal2 v10.4s, %[b1].8h, %[ab].h[5]\n" + "str q24, [%x[c_ptr], #0xb0]\n" + "umlal v18.4s, %[b2].4h, %[ab].h[5]\n" + "str q9, [%x[c_ptr], #0xc0]\n" + "umlal2 v26.4s, %[b2].8h, %[ab].h[5]\n" + "str q17, [%x[c_ptr], #0xd0]\n" + "umlal2 v11.4s, %[b1].8h, %[ab].h[6]\n" + "str q25, [%x[c_ptr], #0xe0]\n" + "umlal v19.4s, %[b2].4h, %[ab].h[6]\n" + "str q10, [%x[c_ptr], #0xf0]\n" + "umlal2 v27.4s, %[b2].8h, %[ab].h[6]\n" + "str q18, [%x[c_ptr], #0x100]\n" + "umlal2 v12.4s, %[b1].8h, %[ab].h[7]\n" + "str q26, [%x[c_ptr], #0x110]\n" + "umlal v20.4s, %[b2].4h, %[ab].h[7]\n" + "str q11, [%x[c_ptr], #0x120]\n" + "umlal2 v28.4s, %[b2].8h, %[ab].h[7]\n" + "str q19, [%x[c_ptr], #0x130]\n" + "b 4f\n" // Complete write out + + "3:\n" // Odd tail + "umlal v5.4s, %[b0].4h, %[aa].h[0]\n" + "umlal2 v13.4s, %[b0].8h, %[aa].h[0]\n" + "umlal v21.4s, %[b1].4h, %[aa].h[0]\n" + "umlal v6.4s, %[b0].4h, %[aa].h[1]\n" + "umlal2 v14.4s, %[b0].8h, %[aa].h[1]\n" + "umlal v22.4s, %[b1].4h, %[aa].h[1]\n" + "str q5, [%x[c_ptr]]\n" + "umlal v7.4s, %[b0].4h, %[aa].h[2]\n" + "str q13, [%x[c_ptr], #0x10]\n" + "umlal2 v15.4s, %[b0].8h, %[aa].h[2]\n" + "str q21, [%x[c_ptr], #0x20]\n" + "umlal v23.4s, %[b1].4h, %[aa].h[2]\n" + "str q6, [%x[c_ptr], #0x30]\n" + "umlal v8.4s, %[b0].4h, %[aa].h[3]\n" + "str q14, [%x[c_ptr], #0x40]\n" + "umlal2 v16.4s, %[b0].8h, %[aa].h[3]\n" + "str q22, [%x[c_ptr], #0x50]\n" + "umlal v24.4s, %[b1].4h, %[aa].h[3]\n" + "str q7, [%x[c_ptr], #0x60]\n" + "umlal v9.4s, %[b0].4h, %[aa].h[4]\n" + 
"str q15, [%x[c_ptr], #0x70]\n" + "umlal2 v17.4s, %[b0].8h, %[aa].h[4]\n" + "str q23, [%x[c_ptr], #0x80]\n" + "umlal v25.4s, %[b1].4h, %[aa].h[4]\n" + "str q8, [%x[c_ptr], #0x90]\n" + "umlal v10.4s, %[b0].4h, %[aa].h[5]\n" + "str q16, [%x[c_ptr], #0xa0]\n" + "umlal2 v18.4s, %[b0].8h, %[aa].h[5]\n" + "str q24, [%x[c_ptr], #0xb0]\n" + "umlal v26.4s, %[b1].4h, %[aa].h[5]\n" + "str q9, [%x[c_ptr], #0xc0]\n" + "umlal v11.4s, %[b0].4h, %[aa].h[6]\n" + "str q17, [%x[c_ptr], #0xd0]\n" + "umlal2 v19.4s, %[b0].8h, %[aa].h[6]\n" + "str q25, [%x[c_ptr], #0xe0]\n" + "umlal v27.4s, %[b1].4h, %[aa].h[6]\n" + "str q10, [%x[c_ptr], #0xf0]\n" + "umlal v12.4s, %[b0].4h, %[aa].h[7]\n" + "str q18, [%x[c_ptr], #0x100]\n" + "umlal2 v20.4s, %[b0].8h, %[aa].h[7]\n" + "str q26, [%x[c_ptr], #0x110]\n" + "umlal v28.4s, %[b1].4h, %[aa].h[7]\n" + "str q11, [%x[c_ptr], #0x120]\n" + + "4:\n" // End of function + "str q19, [%x[c_ptr], #0x130]\n" + "str q27, [%x[c_ptr], #0x140]\n" + "str q12, [%x[c_ptr], #0x150]\n" + "str q20, [%x[c_ptr], #0x160]\n" + "str q28, [%x[c_ptr], #0x170]\n" + "add %x[c_ptr], %x[c_ptr], #0x180\n" + : [a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [c_ptr] "+r" (c_ptr), [k] "+r" (k), + [aa] "+w" (aa), [ab] "+w" (ab), [b0] "+w" (b0), [b1] "+w" (b1), [b2] "+w" (b2) + : [odd_k] "r" (odd_k) + : "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "x20", "cc" + ); + } + } +} diff --git a/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u8_12x8.hpp b/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u8_12x8.hpp new file mode 100644 index 000000000..62cd747d7 --- /dev/null +++ b/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u8_12x8.hpp @@ -0,0 +1,65 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#pragma once + +#ifdef __aarch64__ + +// Load the actual kernel +#include "a64_gemm_u8_12x8/generic.hpp" +#include "a64_gemm_u8_12x8/a55r1.hpp" + +class gemm_u8_12x8 { +public: + typedef uint8_t operand_type; + typedef uint32_t result_type; + + typedef void (*kern_type)(const uint8_t *, const uint8_t *, uint32_t *, int, int, int); + + /* Describes the data layout for A input */ + static const int A_interleave = 8; + static const int A_block = 4; + static const bool A_transpose = false; + + /* Same for B input */ + static const int B_interleave = 12; + static const int B_block = 4; + static const bool B_transpose = true; + + /* Kernel blocking parameters */ + static const int out_width = 12; + static const int out_height = 8; + static const int k_unroll = 4; + + kern_type kernel = nullptr; + + gemm_u8_12x8(const CPUInfo *ci) { + kernel = a64_gemm_u8_12x8; + if (ci->CPU == CPUTarget::A55_DOT) { + kernel = a64_gemm_u8_12x8_a55r1; + } + } +}; + +#endif // __aarch64__ + diff --git a/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u8_12x8/a55r1.hpp b/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u8_12x8/a55r1.hpp new file mode 100644 index 000000000..c7c2acbb4 --- /dev/null +++ b/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u8_12x8/a55r1.hpp @@ -0,0 +1,396 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#pragma once + +#ifdef __aarch64__ + +#include <arm_neon.h> +#include "dot_toolchain_support.h" +#include <cassert> + +inline void a64_gemm_u8_12x8_a55r1(const uint8_t *Apanel, const uint8_t *Bpanel, uint32_t *Cpanel, int ablocks, int bblocks, int K) { + assert(Apanel); + assert(Bpanel); + assert(Cpanel); + const uint8_t *a_ptr = Apanel; + uint32_t *c_ptr = Cpanel; + // We divide K by 4 because the udot instruction processes 4 elements at a time. + const int W = K/4; + // Fix up for odd lengths - set a flag if K is odd, but make + // sure we round up the iteration count. 
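+ // For example (illustrative values): K=24 gives W=6, oddk=0 and
+ // init_value_k=2, so the main loop runs twice (two K/4 steps per pass) and
+ // the detached even tail handles the remaining two.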
+ const int oddk = (W & 1); + const int init_value_k = ((W+1)/2) - 1; + for (int yb=0; yb<ablocks; yb++) { + const uint8_t *a_ptr0 = a_ptr; + const uint8_t *b_ptr = Bpanel; + for (int xb=0; xb<bblocks; xb++) { + a_ptr = a_ptr0; + int k = init_value_k; + register int32x4_t a0 asm("v0"); + register int32x4_t a1 asm("v1"); + register int32x4_t b0 asm("v2"); + register int32x4_t b1 asm("v3"); + register int32x4_t b2 asm("v4"); + register int32x4_t a0a asm("v5"); + register int32x4_t a1a asm("v6"); + __asm __volatile ( + _DECLARE_UDOT + // Initialize result registers, load initial operands, prime prefetches. + "movi v8.4s, #0x0\n" + "ldp %q[a0], %q[a1], [%[a_ptr]]\n" + "movi v9.4s, #0x0\n" + "ldp %q[b0], %q[b1], [%[b_ptr]]\n" + "movi v10.4s, #0x0\n" + "movi v11.4s, #0x0\n" + "movi v12.4s, #0x0\n" + ASM_PREFETCH("[%[b_ptr], #64]") + "movi v13.4s, #0x0\n" + ASM_PREFETCH("[%[a_ptr], #64]") + "movi v14.4s, #0x0\n" + ASM_PREFETCH("[%[b_ptr], #128]") + "movi v15.4s, #0x0\n" + ASM_PREFETCH("[%[a_ptr], #128]") + "movi v16.4s, #0x0\n" + ASM_PREFETCH("[%[b_ptr], #192]") + "movi v17.4s, #0x0\n" + ASM_PREFETCH("[%[b_ptr], #256]") + "movi v18.4s, #0x0\n" + ASM_PREFETCH("[%[a_ptr], #192]") + "movi v19.4s, #0x0\n" + ASM_PREFETCH("[%[b_ptr], #320]") + "movi v20.4s, #0x0\n" + ASM_PREFETCH("[%[a_ptr], #256]") + "movi v21.4s, #0x0\n" + ASM_PREFETCH("[%[b_ptr], #384]") + "movi v22.4s, #0x0\n" + "movi v23.4s, #0x0\n" + "movi v24.4s, #0x0\n" + "movi v25.4s, #0x0\n" + "movi v26.4s, #0x0\n" + "movi v27.4s, #0x0\n" + "movi v28.4s, #0x0\n" + "movi v29.4s, #0x0\n" + "movi v30.4s, #0x0\n" + "movi v31.4s, #0x0\n" + + // Skip loop if we are doing zero iterations of it. + "cbz %w[k], 4f\n" + + + // Loop proper + "1:\n" + "udot v8.4s , %[b0].16b, %[a0].4b[0]\n" + "ldr %d[b2], [%[b_ptr], #32]\n" + + "udot v9.4s , %[b0].16b, %[a0].4b[1]\n" + "ldr x20, [%[b_ptr], #40]\n" + "udot v10.4s, %[b0].16b, %[a0].4b[2]\n" + "udot v11.4s, %[b0].16b, %[a0].4b[3]\n" + "ldr %d[a0a], [%[a_ptr], #32]\n" + + "udot v12.4s, %[b0].16b, %[a1].4b[0]\n" + "ins %[b2].d[1], x20\n" + "udot v13.4s, %[b0].16b, %[a1].4b[1]\n" + "ldr x20, [%[a_ptr], #40]\n" + "udot v14.4s, %[b0].16b, %[a1].4b[2]\n" + "udot v15.4s, %[b0].16b, %[a1].4b[3]\n" + "ldr %d[a1a], [%[a_ptr], #48]\n" + + + "udot v16.4s, %[b1].16b, %[a0].4b[0]\n" + "ins %[a0a].d[1], x20\n" + ASM_PREFETCH("[%[a_ptr], #320]") + "udot v17.4s, %[b1].16b, %[a0].4b[1]\n" + "ldr x20, [%[a_ptr], #56]\n" + "udot v18.4s, %[b1].16b, %[a0].4b[2]\n" + "udot v19.4s, %[b1].16b, %[a0].4b[3]\n" + "ldr %d[b0], [%[b_ptr], #48]\n" + + "udot v20.4s, %[b1].16b, %[a1].4b[0]\n" + "ins %[a1a].d[1], x20\n" + ASM_PREFETCH("[%[b_ptr], #448]") + "udot v21.4s, %[b1].16b, %[a1].4b[1]\n" + "ldr x20, [%[b_ptr], #56]\n" + "udot v22.4s, %[b1].16b, %[a1].4b[2]\n" + "udot v23.4s, %[b1].16b, %[a1].4b[3]\n" + "ldr %d[b1], [%[b_ptr], #64]\n" + + "udot v24.4s, %[b2].16b, %[a0].4b[0]\n" + "ins %[b0].d[1], x20\n" + "udot v25.4s, %[b2].16b, %[a0].4b[1]\n" + "ldr x20, [%[b_ptr], #72]\n" + "udot v26.4s, %[b2].16b, %[a0].4b[2]\n" + "udot v27.4s, %[b2].16b, %[a0].4b[3]\n" + + "udot v28.4s, %[b2].16b, %[a1].4b[0]\n" + "udot v29.4s, %[b2].16b, %[a1].4b[1]\n" + "udot v30.4s, %[b2].16b, %[a1].4b[2]\n" + "udot v31.4s, %[b2].16b, %[a1].4b[3]\n" + + "ldr %d[b2], [%[b_ptr], #80]\n" + + "udot v8.4s , %[b0].16b, %[a0a].4b[0]\n" + "ins %[b1].d[1], x20\n" + "udot v9.4s , %[b0].16b, %[a0a].4b[1]\n" + "ldr x20, [%[b_ptr], #88]\n" + "udot v10.4s, %[b0].16b, %[a0a].4b[2]\n" + "udot v11.4s, %[b0].16b, %[a0a].4b[3]\n" + "ldr %d[a0], [%[a_ptr], #64]\n" + + "udot 
v12.4s, %[b0].16b, %[a1a].4b[0]\n" + "ins %[b2].d[1], x20\n" + "udot v13.4s, %[b0].16b, %[a1a].4b[1]\n" + "ldr x20, [%[a_ptr], #72]\n" + "udot v14.4s, %[b0].16b, %[a1a].4b[2]\n" + "udot v15.4s, %[b0].16b, %[a1a].4b[3]\n" + "ldr %d[a1], [%[a_ptr], #80]\n" + + "udot v16.4s, %[b1].16b, %[a0a].4b[0]\n" + "ins %[a0].d[1], x20\n" + ASM_PREFETCH("[%[b_ptr], #512]") + "udot v17.4s, %[b1].16b, %[a0a].4b[1]\n" + "ldr x20, [%[a_ptr], #88]\n" + "udot v18.4s, %[b1].16b, %[a0a].4b[2]\n" + "udot v19.4s, %[b1].16b, %[a0a].4b[3]\n" + "ldr %d[b0], [%[b_ptr], #96]\n" + + "udot v20.4s, %[b1].16b, %[a1a].4b[0]\n" + "ins %[a1].d[1], x20\n" + "udot v21.4s, %[b1].16b, %[a1a].4b[1]\n" + "ldr x20, [%[b_ptr], #104]\n" + "udot v22.4s, %[b1].16b, %[a1a].4b[2]\n" + "udot v23.4s, %[b1].16b, %[a1a].4b[3]\n" + "ldr %d[b1], [%[b_ptr], #112]\n" + + "udot v24.4s, %[b2].16b, %[a0a].4b[0]\n" + "ins %[b0].d[1], x20\n" + "udot v25.4s, %[b2].16b, %[a0a].4b[1]\n" + "ldr x20, [%[b_ptr], #120]\n" + "add %[a_ptr], %[a_ptr], #64\n" + "udot v26.4s, %[b2].16b, %[a0a].4b[2]\n" + "udot v27.4s, %[b2].16b, %[a0a].4b[3]\n" + + "udot v28.4s, %[b2].16b, %[a1a].4b[0]\n" + "udot v29.4s, %[b2].16b, %[a1a].4b[1]\n" + "add %[b_ptr], %[b_ptr], #96\n" + "udot v30.4s, %[b2].16b, %[a1a].4b[2]\n" + "subs %w[k], %w[k], #1\n" + "ins %[b1].d[1], x20\n" + "udot v31.4s, %[b2].16b, %[a1a].4b[3]\n" + "ldr %d[b2], [%[b_ptr], #32]\n" + "bne 1b\n" + + // Target to use when K is 1 or 2 (i.e. zero iterations of main loop) + "4:\n" + + // Branch to alternative tail for odd K + "cbnz %w[oddk], 2f\n" + "ldr %d[b2], [%[b_ptr], #32]\n" + + // Detached final iteration (even K) + "udot v8.4s , %[b0].16b, %[a0].4b[0]\n" + "udot v9.4s , %[b0].16b, %[a0].4b[1]\n" + "ldr x20, [%[b_ptr], #40]\n" + "udot v10.4s, %[b0].16b, %[a0].4b[2]\n" + "udot v11.4s, %[b0].16b, %[a0].4b[3]\n" + "ldr %d[a0a], [%[a_ptr], #32]\n" + + "udot v12.4s, %[b0].16b, %[a1].4b[0]\n" + "ins %[b2].d[1], x20\n" + + "udot v13.4s, %[b0].16b, %[a1].4b[1]\n" + "ldr x20, [%[a_ptr], #40]\n" + "udot v14.4s, %[b0].16b, %[a1].4b[2]\n" + "udot v15.4s, %[b0].16b, %[a1].4b[3]\n" + "ldr %d[a1a], [%[a_ptr], #48]\n" + + "udot v16.4s, %[b1].16b, %[a0].4b[0]\n" + "ins %[a0a].d[1], x20\n" + "udot v17.4s, %[b1].16b, %[a0].4b[1]\n" + "ldr x20, [%[a_ptr], #56]\n" + "udot v18.4s, %[b1].16b, %[a0].4b[2]\n" + "udot v19.4s, %[b1].16b, %[a0].4b[3]\n" + "ldr %d[b0], [%[b_ptr], #48]\n" + + "udot v20.4s, %[b1].16b, %[a1].4b[0]\n" + "ins %[a1a].d[1], x20\n" + "udot v21.4s, %[b1].16b, %[a1].4b[1]\n" + "ldr x20, [%[b_ptr], #56]\n" + "udot v22.4s, %[b1].16b, %[a1].4b[2]\n" + "udot v23.4s, %[b1].16b, %[a1].4b[3]\n" + + "udot v24.4s, %[b2].16b, %[a0].4b[0]\n" + "udot v25.4s, %[b2].16b, %[a0].4b[1]\n" + "add %[a_ptr], %[a_ptr], #64\n" + "udot v26.4s, %[b2].16b, %[a0].4b[2]\n" + "udot v27.4s, %[b2].16b, %[a0].4b[3]\n" + "ldr %d[b1], [%[b_ptr], #64]\n" + + "udot v28.4s, %[b2].16b, %[a1].4b[0]\n" + "ins %[b0].d[1], x20\n" + "udot v29.4s, %[b2].16b, %[a1].4b[1]\n" + "ldr x20, [%[b_ptr], #72]\n" + "udot v30.4s, %[b2].16b, %[a1].4b[2]\n" + "udot v31.4s, %[b2].16b, %[a1].4b[3]\n" + "ldr %d[b2], [%[b_ptr], #80]\n" + + "udot v8.4s , %[b0].16b, %[a0a].4b[0]\n" + "ins %[b1].d[1], x20\n" + "udot v9.4s , %[b0].16b, %[a0a].4b[1]\n" + "ldr x20, [%[b_ptr], #88]\n" + "udot v10.4s, %[b0].16b, %[a0a].4b[2]\n" + "ins %[b2].d[1], x20\n" + + "udot v16.4s, %[b1].16b, %[a0a].4b[0]\n" + "udot v24.4s, %[b2].16b, %[a0a].4b[0]\n" + "add %[b_ptr], %[b_ptr], #96\n" + "str q8, [%[c_ptr], #0]\n" + "str q16, [%[c_ptr], #16]\n" + "str q24, [%[c_ptr], #32]\n" + "udot v17.4s, 
%[b1].16b, %[a0a].4b[1]\n" + + "udot v25.4s, %[b2].16b, %[a0a].4b[1]\n" + "str q9, [%[c_ptr], #48]\n" + "str q17, [%[c_ptr], #64]\n" + "str q25, [%[c_ptr], #80]\n" + "udot v18.4s, %[b1].16b, %[a0a].4b[2]\n" + "udot v26.4s, %[b2].16b, %[a0a].4b[2]\n" + "str q10, [%[c_ptr], #96]\n" + "str q18, [%[c_ptr], #112]\n" + "str q26, [%[c_ptr], #128]\n" + "udot v11.4s, %[b0].16b, %[a0a].4b[3]\n" + "udot v19.4s, %[b1].16b, %[a0a].4b[3]\n" + "udot v27.4s, %[b2].16b, %[a0a].4b[3]\n" + "str q11, [%[c_ptr], #144]\n" + "str q19, [%[c_ptr], #160]\n" + "str q27, [%[c_ptr], #176]\n" + "udot v12.4s, %[b0].16b, %[a1a].4b[0]\n" + "udot v20.4s, %[b1].16b, %[a1a].4b[0]\n" + "udot v28.4s, %[b2].16b, %[a1a].4b[0]\n" + "str q12, [%[c_ptr], #192]\n" + "str q20, [%[c_ptr], #208]\n" + "str q28, [%[c_ptr], #224]\n" + "udot v13.4s, %[b0].16b, %[a1a].4b[1]\n" + "udot v21.4s, %[b1].16b, %[a1a].4b[1]\n" + "udot v29.4s, %[b2].16b, %[a1a].4b[1]\n" + "str q13, [%[c_ptr], #240]\n" + "str q21, [%[c_ptr], #256]\n" + "str q29, [%[c_ptr], #272]\n" + "udot v14.4s, %[b0].16b, %[a1a].4b[2]\n" + "udot v22.4s, %[b1].16b, %[a1a].4b[2]\n" + "udot v30.4s, %[b2].16b, %[a1a].4b[2]\n" + "str q14, [%[c_ptr], #288]\n" + "str q22, [%[c_ptr], #304]\n" + "str q30, [%[c_ptr], #320]\n" + "udot v15.4s, %[b0].16b, %[a1a].4b[3]\n" + "udot v23.4s, %[b1].16b, %[a1a].4b[3]\n" + "udot v31.4s, %[b2].16b, %[a1a].4b[3]\n" + "str q15, [%[c_ptr], #336]\n" + + "b 3f\n" + + // Detached final iteration (odd K) + "2:\n" + "udot v8.4s , %[b0].16b, %[a0].4b[0]\n" + "ldr %d[b2], [%[b_ptr], #32]\n" + "ldr x20, [%[b_ptr], #40]\n" + + "udot v16.4s, %[b1].16b, %[a0].4b[0]\n" + "udot v9.4s , %[b0].16b, %[a0].4b[1]\n" + "str q8, [%[c_ptr], #0]\n" + "udot v17.4s, %[b1].16b, %[a0].4b[1]\n" + "str q16, [%[c_ptr], #16]\n" + "ins %[b2].d[1], x20\n" + "udot v24.4s, %[b2].16b, %[a0].4b[0]\n" + "add %[b_ptr], %[b_ptr], #48\n" + "add %[a_ptr], %[a_ptr], #32\n" + "str q24, [%[c_ptr], #32]\n" + "udot v25.4s, %[b2].16b, %[a0].4b[1]\n" + "str q9, [%[c_ptr], #48]\n" + + "udot v10.4s, %[b0].16b, %[a0].4b[2]\n" + "str q17, [%[c_ptr], #64]\n" + "udot v18.4s, %[b1].16b, %[a0].4b[2]\n" + "str q25, [%[c_ptr], #80]\n" + "udot v26.4s, %[b2].16b, %[a0].4b[2]\n" + "str q10, [%[c_ptr], #96]\n" + + "udot v11.4s, %[b0].16b, %[a0].4b[3]\n" + "str q18, [%[c_ptr], #112]\n" + "udot v19.4s, %[b1].16b, %[a0].4b[3]\n" + "str q26, [%[c_ptr], #128]\n" + "udot v27.4s, %[b2].16b, %[a0].4b[3]\n" + "str q11, [%[c_ptr], #144]\n" + + "udot v12.4s, %[b0].16b, %[a1].4b[0]\n" + "str q19, [%[c_ptr], #160]\n" + "udot v20.4s, %[b1].16b, %[a1].4b[0]\n" + "str q27, [%[c_ptr], #176]\n" + "udot v28.4s, %[b2].16b, %[a1].4b[0]\n" + "str q12, [%[c_ptr], #192]\n" + + "udot v13.4s, %[b0].16b, %[a1].4b[1]\n" + "str q20, [%[c_ptr], #208]\n" + "udot v21.4s, %[b1].16b, %[a1].4b[1]\n" + "str q28, [%[c_ptr], #224]\n" + "udot v29.4s, %[b2].16b, %[a1].4b[1]\n" + "str q13, [%[c_ptr], #240]\n" + + "udot v14.4s, %[b0].16b, %[a1].4b[2]\n" + "str q21, [%[c_ptr], #256]\n" + "udot v22.4s, %[b1].16b, %[a1].4b[2]\n" + "str q29, [%[c_ptr], #272]\n" + "udot v30.4s, %[b2].16b, %[a1].4b[2]\n" + "str q14, [%[c_ptr], #288]\n" + + "udot v15.4s, %[b0].16b, %[a1].4b[3]\n" + "str q22, [%[c_ptr], #304]\n" + "udot v23.4s, %[b1].16b, %[a1].4b[3]\n" + "str q30, [%[c_ptr], #320]\n" + "udot v31.4s, %[b2].16b, %[a1].4b[3]\n" + "str q15, [%[c_ptr], #336]\n" + + + // Common tail + "3:\n" + "str q23, [%[c_ptr], #352]\n" + "str q31, [%[c_ptr], #368]\n" + "add %[c_ptr], %[c_ptr], #384\n" + + + + ".purgem udot\n" + : + [a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [c_ptr] 
"+r" (c_ptr), + [a0] "+w" (a0), [a1] "+w" (a1), [a0a] "+w" (a0a), [a1a] "+w" (a1a), + [b0] "+w" (b0), [b1] "+w" (b1), [b2] "+w" (b2), [k] "+r" (k) + : [oddk] "r" (oddk) + : "x20", "x21", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", + "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc", "memory" + ); + } + } +} +#endif + diff --git a/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u8_12x8/dot_toolchain_support.h b/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u8_12x8/dot_toolchain_support.h new file mode 100644 index 000000000..718232fb0 --- /dev/null +++ b/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u8_12x8/dot_toolchain_support.h @@ -0,0 +1,66 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +// Define a macro to assemble the UDOT instruction (in the absence of toolchain support) +#define _DECLARE_UDOT ".altmacro\n"\ + ".macro udot opd:req, opn:req, opm:req\n"\ + "local vd, vn, vm, h, l\n"\ + ".irp reg,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31\n"\ + ".ifeqs \"\\opd\",\"v\\reg\\.4s\"\n"\ + ".set vd,\\reg\n"\ + ".endif\n"\ + ".ifeqs \"\\opn\",\"v\\reg\\.16b\"\n"\ + ".set vn,\\reg\n"\ + ".endif\n"\ + ".irp idx,0,1,2,3\n"\ + ".ifeqs \"\\opm\",\"v\\reg\\.4b[\\idx\\]\"\n"\ + ".set vm,\\reg\n"\ + ".set h,\\idx / 2\n"\ + ".set l,\\idx %% 2\n"\ + ".endif\n"\ + ".endr\n"\ + ".endr\n"\ + ".ifndef vd\n"\ + ".error \"Bad operand \\opd\"\n"\ + ".exitm\n"\ + ".endif\n"\ + ".ifndef vn\n"\ + ".error \"Bad operand \\opn\"\n"\ + ".exitm\n"\ + ".endif\n"\ + ".ifndef vm\n"\ + ".error \"Bad operand \\opm\"\n"\ + ".exitm\n"\ + ".endif\n"\ + ".ifndef h\n"\ + ".error \"Bad operand \\opm\"\n"\ + ".exitm\n"\ + ".endif\n"\ + ".ifndef l\n"\ + ".error \"Bad operand \\opm\"\n"\ + ".exitm\n"\ + ".endif\n"\ + ".int 0x6f80e000 | vd | (vn << 5) | (vm << 16) | (l << 21) | (h << 11)\n"\ + ".endm\n"\ + diff --git a/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u8_12x8/generic.hpp b/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u8_12x8/generic.hpp new file mode 100644 index 000000000..3531eb6d2 --- /dev/null +++ b/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u8_12x8/generic.hpp @@ -0,0 +1,354 @@ +/* + * Copyright (c) 2017 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#pragma once + +#ifdef __aarch64__ + +#include <arm_neon.h> +#include "dot_toolchain_support.h" +#include <cassert> + +inline void a64_gemm_u8_12x8(const uint8_t *Apanel, const uint8_t *Bpanel, uint32_t *Cpanel, int ablocks, int bblocks, int K) { + assert(Apanel); + assert(Bpanel); + assert(Cpanel); + const uint8_t *a_ptr = Apanel; + uint32_t *c_ptr = Cpanel; + // We divide K by 4 because the udot instruction processes 4 elements at a time. + const int W = K/4; + // Fix up for odd lengths - set a flag if K is odd, but make + // sure we round up the iteration count. + const int oddk = (W & 1); + const int init_value_k = ((W+1)/2) - 1; + for (int yb=0; yb<ablocks; yb++) { + const uint8_t *a_ptr0 = a_ptr; + const uint8_t *b_ptr = Bpanel; + for (int xb=0; xb<bblocks; xb++) { + a_ptr = a_ptr0; + int k = init_value_k; + register uint32x4_t a0 asm("v0"); + register uint32x4_t a1 asm("v1"); + register uint32x4_t b0 asm("v2"); + register uint32x4_t b1 asm("v3"); + register uint32x4_t b2 asm("v4"); + register uint32x4_t a0a asm("v5"); + register uint32x4_t a1a asm("v6"); + __asm __volatile ( + _DECLARE_UDOT + // Initialize result registers, load initial operands, prime prefetches. + "movi v8.4s, #0x0\n" + "ldr %q[a0], [%[a_ptr]]\n" + "movi v9.4s, #0x0\n" + "ldr %q[b0], [%[b_ptr]]\n" + "movi v10.4s, #0x0\n" + "ldr %q[a1], [%[a_ptr], #16]\n" + "movi v11.4s, #0x0\n" + "ldr %q[b1], [%[b_ptr], #16]\n" + "movi v12.4s, #0x0\n" + ASM_PREFETCH("[%[b_ptr], #64]") + "movi v13.4s, #0x0\n" + ASM_PREFETCH("[%[a_ptr], #64]") + "movi v14.4s, #0x0\n" + ASM_PREFETCH("[%[b_ptr], #128]") + "movi v15.4s, #0x0\n" + ASM_PREFETCH("[%[a_ptr], #128]") + "movi v16.4s, #0x0\n" + ASM_PREFETCH("[%[b_ptr], #192]") + "movi v17.4s, #0x0\n" + ASM_PREFETCH("[%[b_ptr], #256]") + "movi v18.4s, #0x0\n" + ASM_PREFETCH("[%[a_ptr], #192]") + "movi v19.4s, #0x0\n" + ASM_PREFETCH("[%[b_ptr], #320]") + "movi v20.4s, #0x0\n" + ASM_PREFETCH("[%[a_ptr], #256]") + "movi v21.4s, #0x0\n" + ASM_PREFETCH("[%[b_ptr], #384]") + "movi v22.4s, #0x0\n" + "movi v23.4s, #0x0\n" + "movi v24.4s, #0x0\n" + "movi v25.4s, #0x0\n" + "movi v26.4s, #0x0\n" + "movi v27.4s, #0x0\n" + "movi v28.4s, #0x0\n" + "movi v29.4s, #0x0\n" + "movi v30.4s, #0x0\n" + "movi v31.4s, #0x0\n" + + // Skip loop if we are doing zero iterations of it. 
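+ // (v8..v31 above form the 8x12 output tile: 24 vectors x 4 lanes of uint32
+ // accumulators.)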
+ "cbz %w[k], 4f\n" + + // Loop proper + "1:\n" + "udot v8.4s , %[b0].16b, %[a0].4b[0]\n" + "udot v9.4s , %[b0].16b, %[a0].4b[1]\n" + + "ldr %q[b2], [%[b_ptr], #32]\n" + "udot v10.4s, %[b0].16b, %[a0].4b[2]\n" + "udot v11.4s, %[b0].16b, %[a0].4b[3]\n" + "ldr %q[a0a], [%[a_ptr], #32]\n" + "udot v12.4s, %[b0].16b, %[a1].4b[0]\n" + "udot v13.4s, %[b0].16b, %[a1].4b[1]\n" + "ldr %q[a1a], [%[a_ptr], #48]\n" + "udot v14.4s, %[b0].16b, %[a1].4b[2]\n" + "udot v15.4s, %[b0].16b, %[a1].4b[3]\n" + "ldr %q[b0], [%[b_ptr], #48]\n" + + "udot v16.4s, %[b1].16b, %[a0].4b[0]\n" + "udot v17.4s, %[b1].16b, %[a0].4b[1]\n" + ASM_PREFETCH("[%[a_ptr], #320]") + "udot v18.4s, %[b1].16b, %[a0].4b[2]\n" + "udot v19.4s, %[b1].16b, %[a0].4b[3]\n" + "udot v20.4s, %[b1].16b, %[a1].4b[0]\n" + "udot v21.4s, %[b1].16b, %[a1].4b[1]\n" + "udot v22.4s, %[b1].16b, %[a1].4b[2]\n" + "udot v23.4s, %[b1].16b, %[a1].4b[3]\n" + "ldr %q[b1], [%[b_ptr], #64]\n" + + "udot v24.4s, %[b2].16b, %[a0].4b[0]\n" + "udot v25.4s, %[b2].16b, %[a0].4b[1]\n" + ASM_PREFETCH("[%[b_ptr], #448]") + "udot v26.4s, %[b2].16b, %[a0].4b[2]\n" + "udot v27.4s, %[b2].16b, %[a0].4b[3]\n" + "udot v28.4s, %[b2].16b, %[a1].4b[0]\n" + "udot v29.4s, %[b2].16b, %[a1].4b[1]\n" + "udot v30.4s, %[b2].16b, %[a1].4b[2]\n" + "udot v31.4s, %[b2].16b, %[a1].4b[3]\n" + "ldr %q[b2], [%[b_ptr], #80]\n" + + "udot v8.4s , %[b0].16b, %[a0a].4b[0]\n" + "udot v9.4s , %[b0].16b, %[a0a].4b[1]\n" + "ldr %q[a0], [%[a_ptr], #64]\n" + "udot v10.4s, %[b0].16b, %[a0a].4b[2]\n" + "udot v11.4s, %[b0].16b, %[a0a].4b[3]\n" + "udot v12.4s, %[b0].16b, %[a1a].4b[0]\n" + "ldr %q[a1], [%[a_ptr], #80]\n" + "udot v13.4s, %[b0].16b, %[a1a].4b[1]\n" + "udot v14.4s, %[b0].16b, %[a1a].4b[2]\n" + "udot v15.4s, %[b0].16b, %[a1a].4b[3]\n" + "ldr %q[b0], [%[b_ptr], #96]\n" + + "udot v16.4s, %[b1].16b, %[a0a].4b[0]\n" + "udot v17.4s, %[b1].16b, %[a0a].4b[1]\n" + ASM_PREFETCH("[%[b_ptr], #512]") + "udot v18.4s, %[b1].16b, %[a0a].4b[2]\n" + "udot v19.4s, %[b1].16b, %[a0a].4b[3]\n" + "udot v20.4s, %[b1].16b, %[a1a].4b[0]\n" + "udot v21.4s, %[b1].16b, %[a1a].4b[1]\n" + "udot v22.4s, %[b1].16b, %[a1a].4b[2]\n" + "udot v23.4s, %[b1].16b, %[a1a].4b[3]\n" + "ldr %q[b1], [%[b_ptr], #112]\n" + + "udot v24.4s, %[b2].16b, %[a0a].4b[0]\n" + "udot v25.4s, %[b2].16b, %[a0a].4b[1]\n" + "add %[a_ptr], %[a_ptr], #64\n" + "udot v26.4s, %[b2].16b, %[a0a].4b[2]\n" + "udot v27.4s, %[b2].16b, %[a0a].4b[3]\n" + "add %[b_ptr], %[b_ptr], #96\n" + "udot v28.4s, %[b2].16b, %[a1a].4b[0]\n" + "udot v29.4s, %[b2].16b, %[a1a].4b[1]\n" + "subs %w[k], %w[k], #1\n" + "udot v30.4s, %[b2].16b, %[a1a].4b[2]\n" + "udot v31.4s, %[b2].16b, %[a1a].4b[3]\n" + "bne 1b\n" + + // Target to use when K is 1 or 2 (i.e. 
zero iterations of main loop) + "4:\n" + + // Branch to alternative tail for odd K + "cbnz %w[oddk], 2f\n" + + // Detached final iteration (even K) + "udot v8.4s , %[b0].16b, %[a0].4b[0]\n" + "udot v9.4s , %[b0].16b, %[a0].4b[1]\n" + "ldr %q[b2], [%[b_ptr], #32]\n" + "udot v10.4s, %[b0].16b, %[a0].4b[2]\n" + "udot v11.4s, %[b0].16b, %[a0].4b[3]\n" + "ldr %q[a0a], [%[a_ptr], #32]\n" + "udot v12.4s, %[b0].16b, %[a1].4b[0]\n" + "udot v13.4s, %[b0].16b, %[a1].4b[1]\n" + "ldr %q[a1a], [%[a_ptr], #48]\n" + "udot v14.4s, %[b0].16b, %[a1].4b[2]\n" + "udot v15.4s, %[b0].16b, %[a1].4b[3]\n" + "ldr %q[b0], [%[b_ptr], #48]\n" + + "udot v16.4s, %[b1].16b, %[a0].4b[0]\n" + "udot v17.4s, %[b1].16b, %[a0].4b[1]\n" + "udot v18.4s, %[b1].16b, %[a0].4b[2]\n" + "udot v19.4s, %[b1].16b, %[a0].4b[3]\n" + "udot v20.4s, %[b1].16b, %[a1].4b[0]\n" + "udot v21.4s, %[b1].16b, %[a1].4b[1]\n" + "udot v22.4s, %[b1].16b, %[a1].4b[2]\n" + "udot v23.4s, %[b1].16b, %[a1].4b[3]\n" + "ldr %q[b1], [%[b_ptr], #64]\n" + + "udot v24.4s, %[b2].16b, %[a0].4b[0]\n" + "udot v25.4s, %[b2].16b, %[a0].4b[1]\n" + "add %[a_ptr], %[a_ptr], #64\n" + "udot v26.4s, %[b2].16b, %[a0].4b[2]\n" + "udot v27.4s, %[b2].16b, %[a0].4b[3]\n" + "udot v28.4s, %[b2].16b, %[a1].4b[0]\n" + "udot v29.4s, %[b2].16b, %[a1].4b[1]\n" + "udot v30.4s, %[b2].16b, %[a1].4b[2]\n" + "udot v31.4s, %[b2].16b, %[a1].4b[3]\n" + "ldr %q[b2], [%[b_ptr], #80]\n" + + "udot v8.4s , %[b0].16b, %[a0a].4b[0]\n" + + "udot v16.4s, %[b1].16b, %[a0a].4b[0]\n" + "add %[b_ptr], %[b_ptr], #96\n" + "udot v9.4s , %[b0].16b, %[a0a].4b[1]\n" + "str q8, [%[c_ptr], #0]\n" + "udot v17.4s, %[b1].16b, %[a0a].4b[1]\n" + "str q16, [%[c_ptr], #16]\n" + "udot v24.4s, %[b2].16b, %[a0a].4b[0]\n" + "str q24, [%[c_ptr], #32]\n" + + "udot v25.4s, %[b2].16b, %[a0a].4b[1]\n" + "str q9, [%[c_ptr], #48]\n" + "udot v10.4s, %[b0].16b, %[a0a].4b[2]\n" + "str q17, [%[c_ptr], #64]\n" + "udot v18.4s, %[b1].16b, %[a0a].4b[2]\n" + "str q25, [%[c_ptr], #80]\n" + "udot v26.4s, %[b2].16b, %[a0a].4b[2]\n" + "str q10, [%[c_ptr], #96]\n" + + "udot v11.4s, %[b0].16b, %[a0a].4b[3]\n" + "str q18, [%[c_ptr], #112]\n" + "udot v19.4s, %[b1].16b, %[a0a].4b[3]\n" + "str q26, [%[c_ptr], #128]\n" + "udot v27.4s, %[b2].16b, %[a0a].4b[3]\n" + "str q11, [%[c_ptr], #144]\n" + + "udot v12.4s, %[b0].16b, %[a1a].4b[0]\n" + "str q19, [%[c_ptr], #160]\n" + "udot v20.4s, %[b1].16b, %[a1a].4b[0]\n" + "str q27, [%[c_ptr], #176]\n" + "udot v28.4s, %[b2].16b, %[a1a].4b[0]\n" + "str q12, [%[c_ptr], #192]\n" + + "udot v13.4s, %[b0].16b, %[a1a].4b[1]\n" + "str q20, [%[c_ptr], #208]\n" + "udot v21.4s, %[b1].16b, %[a1a].4b[1]\n" + "str q28, [%[c_ptr], #224]\n" + "udot v29.4s, %[b2].16b, %[a1a].4b[1]\n" + "str q13, [%[c_ptr], #240]\n" + + "udot v14.4s, %[b0].16b, %[a1a].4b[2]\n" + "str q21, [%[c_ptr], #256]\n" + "udot v22.4s, %[b1].16b, %[a1a].4b[2]\n" + "str q29, [%[c_ptr], #272]\n" + "udot v30.4s, %[b2].16b, %[a1a].4b[2]\n" + "str q14, [%[c_ptr], #288]\n" + + "udot v15.4s, %[b0].16b, %[a1a].4b[3]\n" + "str q22, [%[c_ptr], #304]\n" + "udot v23.4s, %[b1].16b, %[a1a].4b[3]\n" + "str q30, [%[c_ptr], #320]\n" + "udot v31.4s, %[b2].16b, %[a1a].4b[3]\n" + "str q15, [%[c_ptr], #336]\n" + + "b 3f\n" + + // Detached final iteration (odd K) + "2:\n" + "udot v8.4s , %[b0].16b, %[a0].4b[0]\n" + "ldr %q[b2], [%[b_ptr], #32]\n" + "udot v16.4s, %[b1].16b, %[a0].4b[0]\n" + "udot v9.4s , %[b0].16b, %[a0].4b[1]\n" + "str q8, [%[c_ptr], #0]\n" + "udot v17.4s, %[b1].16b, %[a0].4b[1]\n" + "str q16, [%[c_ptr], #16]\n" + "udot v24.4s, %[b2].16b, %[a0].4b[0]\n" + "add 
%[b_ptr], %[b_ptr], #48\n" + "add %[a_ptr], %[a_ptr], #32\n" + "str q24, [%[c_ptr], #32]\n" + "udot v25.4s, %[b2].16b, %[a0].4b[1]\n" + "str q9, [%[c_ptr], #48]\n" + + "udot v10.4s, %[b0].16b, %[a0].4b[2]\n" + "str q17, [%[c_ptr], #64]\n" + "udot v18.4s, %[b1].16b, %[a0].4b[2]\n" + "str q25, [%[c_ptr], #80]\n" + "udot v26.4s, %[b2].16b, %[a0].4b[2]\n" + "str q10, [%[c_ptr], #96]\n" + + "udot v11.4s, %[b0].16b, %[a0].4b[3]\n" + "str q18, [%[c_ptr], #112]\n" + "udot v19.4s, %[b1].16b, %[a0].4b[3]\n" + "str q26, [%[c_ptr], #128]\n" + "udot v27.4s, %[b2].16b, %[a0].4b[3]\n" + "str q11, [%[c_ptr], #144]\n" + + "udot v12.4s, %[b0].16b, %[a1].4b[0]\n" + "str q19, [%[c_ptr], #160]\n" + "udot v20.4s, %[b1].16b, %[a1].4b[0]\n" + "str q27, [%[c_ptr], #176]\n" + "udot v28.4s, %[b2].16b, %[a1].4b[0]\n" + "str q12, [%[c_ptr], #192]\n" + + "udot v13.4s, %[b0].16b, %[a1].4b[1]\n" + "str q20, [%[c_ptr], #208]\n" + "udot v21.4s, %[b1].16b, %[a1].4b[1]\n" + "str q28, [%[c_ptr], #224]\n" + "udot v29.4s, %[b2].16b, %[a1].4b[1]\n" + "str q13, [%[c_ptr], #240]\n" + + "udot v14.4s, %[b0].16b, %[a1].4b[2]\n" + "str q21, [%[c_ptr], #256]\n" + "udot v22.4s, %[b1].16b, %[a1].4b[2]\n" + "str q29, [%[c_ptr], #272]\n" + "udot v30.4s, %[b2].16b, %[a1].4b[2]\n" + "str q14, [%[c_ptr], #288]\n" + + "udot v15.4s, %[b0].16b, %[a1].4b[3]\n" + "str q22, [%[c_ptr], #304]\n" + "udot v23.4s, %[b1].16b, %[a1].4b[3]\n" + "str q30, [%[c_ptr], #320]\n" + "udot v31.4s, %[b2].16b, %[a1].4b[3]\n" + "str q15, [%[c_ptr], #336]\n" + + + // Common tail + "3:\n" + "str q23, [%[c_ptr], #352]\n" + "str q31, [%[c_ptr], #368]\n" + "add %[c_ptr], %[c_ptr], #384\n" + + ".purgem udot\n" + : + [a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [c_ptr] "+r" (c_ptr), + [a0] "+w" (a0), [a1] "+w" (a1), [a0a] "+w" (a0a), [a1a] "+w" (a1a), + [b0] "+w" (b0), [b1] "+w" (b1), [b2] "+w" (b2), [k] "+r" (k) + : [oddk] "r" (oddk) + : "x20", "x21", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", + "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc" + ); + + } + } + + +} +#endif diff --git a/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u8_4x4.hpp b/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u8_4x4.hpp new file mode 100644 index 000000000..3561bfec9 --- /dev/null +++ b/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u8_4x4.hpp @@ -0,0 +1,61 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#pragma once + +#ifdef __aarch64__ + +// Load the actual kernel +#include "a64_gemm_u8_4x4/generic.hpp" + +class gemm_u8_4x4 { +public: + typedef uint8_t operand_type; + typedef uint32_t result_type; + + typedef void (*kern_type)(const uint8_t *, const uint8_t *, uint32_t *, int, int, int); + + /* Describes the data layout for A input */ + static const int A_interleave = 4; + static const int A_block = 16; + static const bool A_transpose = false; + + /* Same for B input */ + static const int B_interleave = 4; + static const int B_block = 16; + static const bool B_transpose = true; + + /* Kernel blocking parameters */ + static const int out_width = 4; + static const int out_height = 4; + static const int k_unroll = 16; + + kern_type kernel = nullptr; + + gemm_u8_4x4(const CPUInfo *ci) { + kernel = a64_gemm_u8_4x4; + } +}; + +#endif // __aarch64__ + diff --git a/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u8_4x4/generic.hpp b/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u8_4x4/generic.hpp new file mode 100644 index 000000000..e48c373f2 --- /dev/null +++ b/arm_compute/core/NEON/kernels/assembly/kernels/a64_gemm_u8_4x4/generic.hpp @@ -0,0 +1,465 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#pragma once + +#ifdef __aarch64__ + +#include <arm_neon.h> + +inline void a64_gemm_u8_4x4(const uint8_t *Apanel, const uint8_t *Bpanel, uint32_t *Cpanel, int ablocks, int bblocks, int K) { + const uint8_t *a_ptr = Apanel; + uint32_t *c_ptr = Cpanel; + K /= 16; + int oddk = (K & 1); + + for (int yb=0; yb<ablocks; yb++) { + const uint8_t *a_ptr0 = a_ptr; + const uint8_t *b_ptr = Bpanel; + + for (int xb=0; xb<bblocks; xb++) { + a_ptr = a_ptr0; + + int k = ((K+1)/2)-1; + + register uint8x16_t b0 asm("v4"); + register uint8x16_t b1 asm("v5"); + register uint8x16_t b2 asm("v6"); + register uint8x16_t b3 asm("v7"); + register uint8x16_t b0a asm("v8"); + register uint8x16_t b1a asm("v9"); + register uint8x16_t b2a asm("v10"); + register uint8x16_t b3a asm("v11"); + + __asm __volatile ( + "movi v16.4s, #0x0\n" + "ldr q0, [%[a_ptr]]\n" + "movi v17.4s, #0x0\n" + "ldr %q[b0], [%[b_ptr]]\n" + "movi v18.4s, #0x0\n" + "ldr %q[b1], [%[b_ptr], #16]\n" + "movi v19.4s, #0x0\n" + "ldr %q[b2], [%[b_ptr], #32]\n" + "movi v20.4s, #0x0\n" + "ldr %q[b3], [%[b_ptr], #48]\n" + "movi v21.4s, #0x0\n" + "ldr q1, [%[a_ptr], #16]\n" + "movi v22.4s, #0x0\n" + "ldr q2, [%[a_ptr], #32]\n" + "movi v23.4s, #0x0\n" + "ldr q3, [%[a_ptr], #48]\n" + "movi v24.4s, #0x0\n" + ASM_PREFETCH("[%[b_ptr], #64]") + "movi v25.4s, #0x0\n" + ASM_PREFETCH("[%[a_ptr], #64]") + "movi v26.4s, #0x0\n" + ASM_PREFETCH("[%[b_ptr], #128]") + "movi v27.4s, #0x0\n" + ASM_PREFETCH("[%[a_ptr], #128]") + "movi v28.4s, #0x0\n" + ASM_PREFETCH("[%[b_ptr], #192]") + "movi v29.4s, #0x0\n" + ASM_PREFETCH("[%[a_ptr], #192]") + "movi v30.4s, #0x0\n" + ASM_PREFETCH("[%[b_ptr], #256]") + "movi v31.4s, #0x0\n" + ASM_PREFETCH("[%[a_ptr], #256]") + + // Loop structure optimized for A57 (after r0). + + // Unavoidably, the multiply will "dribble" if + // dual issued with an add. + + // Minimize the effect of this by making sure + // there are 2 adds to run under the dribbled + // multiply. + + // Pipeline in blocks of 8 multiplies - combine + // this iteration's multiplies with adds from + // the previous iteration. + + // So the first block doesn't have any adds to + // do - but because all the adds are at the + // start of the block it's only the first couple + // of multiplies that need to be pulled out. + + // Start of unroll 0 (first iteration) + "umull v12.8h, v0.8b, %[b0].8b\n" + "umull v13.8h, v0.8b, %[b1].8b\n" + + // Skip loop if we are doing zero iterations of it. 
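+            // k was set to ((K+1)/2)-1 with K already divided by 16, so it is zero when
+            // only one or two 16-deep blocks remain.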
+ "cbz %w[k], 4f\n" + + // Unroll 0 continuation (branch target) + "1:\n" + "umull v14.8h, v0.8b, %[b2].8b\n" + "subs %w[k], %w[k], #1\n" + "umull v15.8h, v0.8b, %[b3].8b\n" + "ldr %q[b0a], [%[b_ptr], #64]\n" + "umlal2 v12.8h, v0.16b, %[b0].16b\n" + "umlal2 v13.8h, v0.16b, %[b1].16b\n" + "ldr %q[b1a], [%[b_ptr], #80]\n" + "umlal2 v14.8h, v0.16b, %[b2].16b\n" + "umlal2 v15.8h, v0.16b, %[b3].16b\n" + "ldr q0, [%[a_ptr], #64]\n" + + "uadalp v16.4s, v12.8h\n" + "umull v12.8h, v1.8b, %[b0].8b\n" + "uadalp v17.4s, v13.8h\n" + "uadalp v18.4s, v14.8h\n" + "umull v13.8h, v1.8b, %[b1].8b\n" + "uadalp v19.4s, v15.8h\n" + "umull v14.8h, v1.8b, %[b2].8b\n" + "ldr %q[b2a], [%[b_ptr], #96]\n" + "umull v15.8h, v1.8b, %[b3].8b\n" + "umlal2 v12.8h, v1.16b, %[b0].16b\n" + "ldr %q[b3a], [%[b_ptr], #112]\n" + "umlal2 v13.8h, v1.16b, %[b1].16b\n" + "add %[b_ptr], %[b_ptr], #128\n" + "umlal2 v14.8h, v1.16b, %[b2].16b\n" + "umlal2 v15.8h, v1.16b, %[b3].16b\n" + "ldr q1, [%[a_ptr], #80]\n" + + "uadalp v20.4s, v12.8h\n" + "umull v12.8h, v2.8b, %[b0].8b\n" + "uadalp v21.4s, v13.8h\n" + "uadalp v22.4s, v14.8h\n" + "umull v13.8h, v2.8b, %[b1].8b\n" + "uadalp v23.4s, v15.8h\n" + "umull v14.8h, v2.8b, %[b2].8b\n" + "umull v15.8h, v2.8b, %[b3].8b\n" + "umlal2 v12.8h, v2.16b, %[b0].16b\n" + ASM_PREFETCH("[%[b_ptr], #192]") + "umlal2 v13.8h, v2.16b, %[b1].16b\n" + "umlal2 v14.8h, v2.16b, %[b2].16b\n" + ASM_PREFETCH("[%[a_ptr], #320]") + "umlal2 v15.8h, v2.16b, %[b3].16b\n" + "ldr q2, [%[a_ptr], #96]\n" + + "uadalp v24.4s, v12.8h\n" + "umull v12.8h, v3.8b, %[b0].8b\n" + "uadalp v25.4s, v13.8h\n" + "uadalp v26.4s, v14.8h\n" + "umull v13.8h, v3.8b, %[b1].8b\n" + "uadalp v27.4s, v15.8h\n" + "umull v14.8h, v3.8b, %[b2].8b\n" + "umull v15.8h, v3.8b, %[b3].8b\n" + "umlal2 v12.8h, v3.16b, %[b0].16b\n" + "ldr %q[b0], [%[b_ptr], #0]\n" + "umlal2 v13.8h, v3.16b, %[b1].16b\n" + "umlal2 v14.8h, v3.16b, %[b2].16b\n" + "umlal2 v15.8h, v3.16b, %[b3].16b\n" + "ldr q3, [%[a_ptr], #112]\n" + + // Unroll 1 + "uadalp v28.4s, v12.8h\n" + "umull v12.8h, v0.8b, %[b0a].8b\n" + "uadalp v29.4s, v13.8h\n" + "uadalp v30.4s, v14.8h\n" + "umull v13.8h, v0.8b, %[b1a].8b\n" + "uadalp v31.4s, v15.8h\n" + "umull v14.8h, v0.8b, %[b2a].8b\n" + "umull v15.8h, v0.8b, %[b3a].8b\n" + "ldr %q[b1], [%[b_ptr], #16]\n" + "umlal2 v12.8h, v0.16b, %[b0a].16b\n" + "umlal2 v13.8h, v0.16b, %[b1a].16b\n" + "ldr %q[b2], [%[b_ptr], #32]\n" + "umlal2 v14.8h, v0.16b, %[b2a].16b\n" + "umlal2 v15.8h, v0.16b, %[b3a].16b\n" + "ldr q0, [%[a_ptr], #128]\n" + + "uadalp v16.4s, v12.8h\n" + "umull v12.8h, v1.8b, %[b0a].8b\n" + "uadalp v17.4s, v13.8h\n" + "uadalp v18.4s, v14.8h\n" + "umull v13.8h, v1.8b, %[b1a].8b\n" + "uadalp v19.4s, v15.8h\n" + "add %[a_ptr], %[a_ptr], #128\n" + "umull v14.8h, v1.8b, %[b2a].8b\n" + "umull v15.8h, v1.8b, %[b3a].8b\n" + "ldr %q[b3], [%[b_ptr], #48]\n" + "umlal2 v12.8h, v1.16b, %[b0a].16b\n" + "umlal2 v13.8h, v1.16b, %[b1a].16b\n" + "umlal2 v14.8h, v1.16b, %[b2a].16b\n" + "umlal2 v15.8h, v1.16b, %[b3a].16b\n" + "ldr q1, [%[a_ptr], #16]\n" + + "uadalp v20.4s, v12.8h\n" + "umull v12.8h, v2.8b, %[b0a].8b\n" + "uadalp v21.4s, v13.8h\n" + "uadalp v22.4s, v14.8h\n" + "umull v13.8h, v2.8b, %[b1a].8b\n" + "uadalp v23.4s, v15.8h\n" + "umull v14.8h, v2.8b, %[b2a].8b\n" + "umull v15.8h, v2.8b, %[b3a].8b\n" + "umlal2 v12.8h, v2.16b, %[b0a].16b\n" + ASM_PREFETCH("[%[b_ptr], #256]") + "umlal2 v13.8h, v2.16b, %[b1a].16b\n" + "umlal2 v14.8h, v2.16b, %[b2a].16b\n" + ASM_PREFETCH("[%[a_ptr], #256]") + "umlal2 v15.8h, v2.16b, %[b3a].16b\n" + "ldr q2, [%[a_ptr], #32]\n" + + 
"uadalp v24.4s, v12.8h\n" + "umull v12.8h, v3.8b, %[b0a].8b\n" + "uadalp v25.4s, v13.8h\n" + "uadalp v26.4s, v14.8h\n" + "umull v13.8h, v3.8b, %[b1a].8b\n" + "uadalp v27.4s, v15.8h\n" + "umull v14.8h, v3.8b, %[b2a].8b\n" + "umull v15.8h, v3.8b, %[b3a].8b\n" + "umlal2 v12.8h, v3.16b, %[b0a].16b\n" + "umlal2 v13.8h, v3.16b, %[b1a].16b\n" + "umlal2 v14.8h, v3.16b, %[b2a].16b\n" + "umlal2 v15.8h, v3.16b, %[b3a].16b\n" + "ldr q3, [%[a_ptr], #48]\n" + + // Start of unroll 0 for next iteration. + "uadalp v28.4s, v12.8h\n" + "umull v12.8h, v0.8b, %[b0].8b\n" + "uadalp v29.4s, v13.8h\n" + "uadalp v30.4s, v14.8h\n" + "umull v13.8h, v0.8b, %[b1].8b\n" + "uadalp v31.4s, v15.8h\n" + "bne 1b\n" + + // Target to use when K=1 or 2 (i.e. zero iterations of main loop) + "4:\n" + + // Branch to alternative tail for odd K + "cbnz %w[oddk], 2f\n" + + // Detached final iteration (even K) + "umull v14.8h, v0.8b, %[b2].8b\n" + "umull v15.8h, v0.8b, %[b3].8b\n" + "ldr %q[b0a], [%[b_ptr], #64]\n" + "umlal2 v12.8h, v0.16b, %[b0].16b\n" + "umlal2 v13.8h, v0.16b, %[b1].16b\n" + "ldr %q[b1a], [%[b_ptr], #80]\n" + "umlal2 v14.8h, v0.16b, %[b2].16b\n" + "umlal2 v15.8h, v0.16b, %[b3].16b\n" + "ldr q0, [%[a_ptr], #64]\n" + + "uadalp v16.4s, v12.8h\n" + "umull v12.8h, v1.8b, %[b0].8b\n" + "uadalp v17.4s, v13.8h\n" + "uadalp v18.4s, v14.8h\n" + "umull v13.8h, v1.8b, %[b1].8b\n" + "uadalp v19.4s, v15.8h\n" + "umull v14.8h, v1.8b, %[b2].8b\n" + "ldr %q[b2a], [%[b_ptr], #96]\n" + "umull v15.8h, v1.8b, %[b3].8b\n" + "umlal2 v12.8h, v1.16b, %[b0].16b\n" + "ldr %q[b3a], [%[b_ptr], #112]\n" + "umlal2 v13.8h, v1.16b, %[b1].16b\n" + "add %[b_ptr], %[b_ptr], #128\n" + "umlal2 v14.8h, v1.16b, %[b2].16b\n" + "umlal2 v15.8h, v1.16b, %[b3].16b\n" + "ldr q1, [%[a_ptr], #80]\n" + + "uadalp v20.4s, v12.8h\n" + "umull v12.8h, v2.8b, %[b0].8b\n" + "uadalp v21.4s, v13.8h\n" + "uadalp v22.4s, v14.8h\n" + "umull v13.8h, v2.8b, %[b1].8b\n" + "uadalp v23.4s, v15.8h\n" + "umull v14.8h, v2.8b, %[b2].8b\n" + "umull v15.8h, v2.8b, %[b3].8b\n" + "umlal2 v12.8h, v2.16b, %[b0].16b\n" + "umlal2 v13.8h, v2.16b, %[b1].16b\n" + "umlal2 v14.8h, v2.16b, %[b2].16b\n" + "umlal2 v15.8h, v2.16b, %[b3].16b\n" + "ldr q2, [%[a_ptr], #96]\n" + + "uadalp v24.4s, v12.8h\n" + "umull v12.8h, v3.8b, %[b0].8b\n" + "uadalp v25.4s, v13.8h\n" + "uadalp v26.4s, v14.8h\n" + "umull v13.8h, v3.8b, %[b1].8b\n" + "uadalp v27.4s, v15.8h\n" + "umull v14.8h, v3.8b, %[b2].8b\n" + "umull v15.8h, v3.8b, %[b3].8b\n" + "umlal2 v12.8h, v3.16b, %[b0].16b\n" + "umlal2 v13.8h, v3.16b, %[b1].16b\n" + "umlal2 v14.8h, v3.16b, %[b2].16b\n" + "umlal2 v15.8h, v3.16b, %[b3].16b\n" + "ldr q3, [%[a_ptr], #112]\n" + + // Unroll 1 + "uadalp v28.4s, v12.8h\n" + "umull v12.8h, v0.8b, %[b0a].8b\n" + "uadalp v29.4s, v13.8h\n" + "uadalp v30.4s, v14.8h\n" + "umull v13.8h, v0.8b, %[b1a].8b\n" + "uadalp v31.4s, v15.8h\n" + "umull v14.8h, v0.8b, %[b2a].8b\n" + "add %[a_ptr], %[a_ptr], #128\n" + "umull v15.8h, v0.8b, %[b3a].8b\n" + "umlal2 v12.8h, v0.16b, %[b0a].16b\n" + "umlal2 v13.8h, v0.16b, %[b1a].16b\n" + "umlal2 v14.8h, v0.16b, %[b2a].16b\n" + "umlal2 v15.8h, v0.16b, %[b3a].16b\n" + + "uadalp v16.4s, v12.8h\n" + "umull v12.8h, v1.8b, %[b0a].8b\n" + "uadalp v17.4s, v13.8h\n" + "uadalp v18.4s, v14.8h\n" + "umull v13.8h, v1.8b, %[b1a].8b\n" + "uadalp v19.4s, v15.8h\n" + "umull v14.8h, v1.8b, %[b2a].8b\n" + "umull v15.8h, v1.8b, %[b3a].8b\n" + "umlal2 v12.8h, v1.16b, %[b0a].16b\n" + "addp v16.4s, v16.4s, v17.4s\n" + "umlal2 v13.8h, v1.16b, %[b1a].16b\n" + "addp v17.4s, v18.4s, v19.4s\n" + "umlal2 v14.8h, v1.16b, 
%[b2a].16b\n" + "umlal2 v15.8h, v1.16b, %[b3a].16b\n" + + "uadalp v20.4s, v12.8h\n" + "umull v12.8h, v2.8b, %[b0a].8b\n" + "uadalp v21.4s, v13.8h\n" + "uadalp v22.4s, v14.8h\n" + "umull v13.8h, v2.8b, %[b1a].8b\n" + "uadalp v23.4s, v15.8h\n" + "addp v16.4s, v16.4s, v17.4s\n" + "umull v14.8h, v2.8b, %[b2a].8b\n" + "addp v18.4s, v20.4s, v21.4s\n" + "addp v19.4s, v22.4s, v23.4s\n" + "umull v15.8h, v2.8b, %[b3a].8b\n" + "umlal2 v12.8h, v2.16b, %[b0a].16b\n" + "str q16, [%[c_ptr]]\n" + "umlal2 v13.8h, v2.16b, %[b1a].16b\n" + "umlal2 v14.8h, v2.16b, %[b2a].16b\n" + "umlal2 v15.8h, v2.16b, %[b3a].16b\n" + + "uadalp v24.4s, v12.8h\n" + "umull v12.8h, v3.8b, %[b0a].8b\n" + "uadalp v25.4s, v13.8h\n" + "uadalp v26.4s, v14.8h\n" + "umull v13.8h, v3.8b, %[b1a].8b\n" + "uadalp v27.4s, v15.8h\n" + "addp v17.4s, v18.4s, v19.4s\n" + "umull v14.8h, v3.8b, %[b2a].8b\n" + "addp v20.4s, v24.4s, v25.4s\n" + "addp v21.4s, v26.4s, v27.4s\n" + "umull v15.8h, v3.8b, %[b3a].8b\n" + "umlal2 v12.8h, v3.16b, %[b0a].16b\n" + "str q17, [%[c_ptr], #16]\n" + "umlal2 v13.8h, v3.16b, %[b1a].16b\n" + "umlal2 v14.8h, v3.16b, %[b2a].16b\n" + "addp v18.4s, v20.4s, v21.4s\n" + "umlal2 v15.8h, v3.16b, %[b3a].16b\n" + "b 3f\n" + + // Detached final iteration (odd K) + "2:\n" + "umull v14.8h, v0.8b, %[b2].8b\n" + "add %[a_ptr], %[a_ptr], #64\n" + "umull v15.8h, v0.8b, %[b3].8b\n" + "add %[b_ptr], %[b_ptr], #64\n" + "umlal2 v12.8h, v0.16b, %[b0].16b\n" + "umlal2 v13.8h, v0.16b, %[b1].16b\n" + "umlal2 v14.8h, v0.16b, %[b2].16b\n" + "umlal2 v15.8h, v0.16b, %[b3].16b\n" + + "uadalp v16.4s, v12.8h\n" + "umull v12.8h, v1.8b, %[b0].8b\n" + "uadalp v17.4s, v13.8h\n" + "uadalp v18.4s, v14.8h\n" + "umull v13.8h, v1.8b, %[b1].8b\n" + "uadalp v19.4s, v15.8h\n" + "umull v14.8h, v1.8b, %[b2].8b\n" + "umull v15.8h, v1.8b, %[b3].8b\n" + "umlal2 v12.8h, v1.16b, %[b0].16b\n" + "addp v16.4s, v16.4s, v17.4s\n" + "umlal2 v13.8h, v1.16b, %[b1].16b\n" + "addp v17.4s, v18.4s, v19.4s\n" + "umlal2 v14.8h, v1.16b, %[b2].16b\n" + "umlal2 v15.8h, v1.16b, %[b3].16b\n" + + "uadalp v20.4s, v12.8h\n" + "umull v12.8h, v2.8b, %[b0].8b\n" + "uadalp v21.4s, v13.8h\n" + "uadalp v22.4s, v14.8h\n" + "umull v13.8h, v2.8b, %[b1].8b\n" + "uadalp v23.4s, v15.8h\n" + "addp v16.4s, v16.4s, v17.4s\n" + "umull v14.8h, v2.8b, %[b2].8b\n" + "addp v18.4s, v20.4s, v21.4s\n" + "addp v19.4s, v22.4s, v23.4s\n" + "umull v15.8h, v2.8b, %[b3].8b\n" + "umlal2 v12.8h, v2.16b, %[b0].16b\n" + "str q16, [%[c_ptr]]\n" + "umlal2 v13.8h, v2.16b, %[b1].16b\n" + "umlal2 v14.8h, v2.16b, %[b2].16b\n" + "umlal2 v15.8h, v2.16b, %[b3].16b\n" + + "uadalp v24.4s, v12.8h\n" + "umull v12.8h, v3.8b, %[b0].8b\n" + "uadalp v25.4s, v13.8h\n" + "uadalp v26.4s, v14.8h\n" + "umull v13.8h, v3.8b, %[b1].8b\n" + "uadalp v27.4s, v15.8h\n" + "addp v17.4s, v18.4s, v19.4s\n" + "umull v14.8h, v3.8b, %[b2].8b\n" + "addp v20.4s, v24.4s, v25.4s\n" + "addp v21.4s, v26.4s, v27.4s\n" + "umull v15.8h, v3.8b, %[b3].8b\n" + "umlal2 v12.8h, v3.16b, %[b0].16b\n" + "str q17, [%[c_ptr], #16]\n" + "umlal2 v13.8h, v3.16b, %[b1].16b\n" + "umlal2 v14.8h, v3.16b, %[b2].16b\n" + "addp v18.4s, v20.4s, v21.4s\n" + "umlal2 v15.8h, v3.16b, %[b3].16b\n" + + "3:\n" + + // Final additions + "uadalp v28.4s, v12.8h\n" + "str q18, [%[c_ptr], #32]\n" + "uadalp v29.4s, v13.8h\n" + "uadalp v30.4s, v14.8h\n" + "uadalp v31.4s, v15.8h\n" + + // Horizontal reduction, phase 1 + "addp v22.4s, v28.4s, v29.4s\n" + "addp v23.4s, v30.4s, v31.4s\n" + + // Horizontal reduction, phase 2 + "addp v19.4s, v22.4s, v23.4s\n" + "str q19, [%[c_ptr], #48]\n" + "add %[c_ptr], 
%[c_ptr], #64\n" + + : + [a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [c_ptr] "+r" (c_ptr), + [b0] "+w" (b0), [b1] "+w" (b1), [b2] "+w" (b2), [b3] "+w" (b3), + [b0a] "+w" (b0a), [b1a] "+w" (b1a), [b2a] "+w" (b2a), [b3a] "+w" (b3a), + [k] "+r" (k) + : [oddk] "r" (oddk) + : "x20", "x21", "v0","v1","v2","v3","v12","v13","v14","v15","v16","v17","v18","v19", + "v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31", "cc"); + } + } +} + +#endif // __aarch64__ diff --git a/arm_compute/core/NEON/kernels/assembly/kernels/a64_hgemm_24x8.hpp b/arm_compute/core/NEON/kernels/assembly/kernels/a64_hgemm_24x8.hpp new file mode 100644 index 000000000..ba6d2989c --- /dev/null +++ b/arm_compute/core/NEON/kernels/assembly/kernels/a64_hgemm_24x8.hpp @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#pragma once + +#if defined(__aarch64__) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + +// Get the components we need to implement SGEMM. +// Can select appropriate components dependent on AArch32 vs. AArch64 etc. at build time. +#include "a64_hgemm_24x8/generic.hpp" + +// 24x8 HGEMM "strategy" class. Describes the kernel properties. +// +// The generic "gemm_opt" function will instantiate one of these (allowing +// the constructor to pick a kernel implementation). +class hgemm_24x8 { +public: + typedef __fp16 operand_type; + typedef __fp16 result_type; + + typedef void (*kern_type)(const __fp16 *, const __fp16 *, __fp16 *, int, int, int); + + static const int A_block = 1; + static const int A_interleave = 8; + static const bool A_transpose = false; + + static const int B_block = 1; + static const int B_interleave = 24; + static const bool B_transpose = true; + + static const int out_width = 24; + static const int out_height = 8; + static const int k_unroll = 1; + + kern_type kernel = nullptr; + + hgemm_24x8(const struct CPUInfo *ci) { + kernel = a64_hgemm_asimd_24x8; + } +}; + +#endif // __aarch64__ and FP16_VECTOR_ARITHMETIC diff --git a/arm_compute/core/NEON/kernels/assembly/kernels/a64_hgemm_24x8/generic.hpp b/arm_compute/core/NEON/kernels/assembly/kernels/a64_hgemm_24x8/generic.hpp new file mode 100644 index 000000000..03e2bb95a --- /dev/null +++ b/arm_compute/core/NEON/kernels/assembly/kernels/a64_hgemm_24x8/generic.hpp @@ -0,0 +1,337 @@ +/* + * Copyright (c) 2017 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#pragma once + +#include <arm_neon.h> + +// Kernel implementation. +// +// Assume that "Apanel" points to a chunk of A blocks (each size 8xK) in read-order. +// Assume that "Bpanel" points to a chunk of B blocks (each size 12xK) in read-order. +// Assume that "Cpanel" points to a chunk of C output blocks (each size +// 12x8), the chunks being arranged in a row major fashion. +// +// Note that the intent of this is that either ablocks or bblocks will be 1 +// - this construction allows the output loop to proceed in either order. + +inline void a64_hgemm_asimd_24x8(const __fp16 *Apanel, const __fp16 *Bpanel, __fp16 *Cpanel, int ablocks, int bblocks, int K) { + const __fp16 *a_ptr = Apanel; + __fp16 *c_ptr = Cpanel; + for (int yb=0; yb<ablocks; yb++) { + const __fp16 *a_ptr0 = a_ptr; + const __fp16 *b_ptr = Bpanel; + + for (int xb=0; xb<bblocks; xb++) { + a_ptr = a_ptr0; + // Fix up for odd lengths - set a flag if K is odd, but make + // sure we round up the iteration count. + int oddk = (K & 1); + int k = ((K+1)/2) - 1; + register float16x8_t a0 asm("v0"); + register float16x8_t a0a asm("v1"); + register float16x8_t b0 asm("v2"); + register float16x8_t b1 asm("v3"); + register float16x8_t b2 asm("v4"); + register float16x8_t b0a asm("v5"); + register float16x8_t b1a asm("v6"); + register float16x8_t b2a asm("v7"); + + __asm __volatile ( + // Initialize result registers, load initial operands, prime prefetches. 
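+            // v8-v31 hold the 24x8 half-precision accumulator tile: each B block is 24xK
+            // (B_interleave = 24) and each C block is 24x8. In effect each block computes
+            //     for (int i = 0; i < 8; i++)
+            //         for (int j = 0; j < 24; j++)
+            //             for (int kk = 0; kk < K; kk++)
+            //                 Cpanel[i*24 + j] += Apanel[kk*8 + i] * Bpanel[kk*24 + j];
+            // with the loads and multiplies below software-pipelined two K steps at a time.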
+ "movi v8.8h, #0x0\n" + "ldr %q[a0], [%[a_ptr]]\n" + "movi v9.8h, #0x0\n" + "ldr %q[b0], [%[b_ptr]]\n" + "movi v10.8h, #0x0\n" + "ldr %q[b1], [%[b_ptr], #16]\n" + "movi v11.8h, #0x0\n" + "ldr %q[b2], [%[b_ptr], #32]\n" + "movi v12.8h, #0x0\n" + "ldr %q[b0a], [%[b_ptr], #48]\n" + "movi v13.8h, #0x0\n" + "ldr %q[b1a], [%[b_ptr], #64]\n" + "movi v14.8h, #0x0\n" + ASM_PREFETCH("[%[b_ptr], #64]") + "movi v15.8h, #0x0\n" + ASM_PREFETCH("[%[b_ptr], #128]") + "movi v16.8h, #0x0\n" + ASM_PREFETCH("[%[a_ptr], #64]") + "movi v17.8h, #0x0\n" + ASM_PREFETCH("[%[b_ptr], #192]") + "movi v18.8h, #0x0\n" + ASM_PREFETCH("[%[b_ptr], #256]") + "movi v19.8h, #0x0\n" + ASM_PREFETCH("[%[b_ptr], #320]") + "movi v20.8h, #0x0\n" + "movi v21.8h, #0x0\n" + "movi v22.8h, #0x0\n" + "movi v23.8h, #0x0\n" + "movi v24.8h, #0x0\n" + "movi v25.8h, #0x0\n" + "movi v26.8h, #0x0\n" + "movi v27.8h, #0x0\n" + "movi v28.8h, #0x0\n" + "movi v29.8h, #0x0\n" + "movi v30.8h, #0x0\n" + "movi v31.8h, #0x0\n" + + // Skip loop if we are doing zero iterations of it. + "cbz %w[k], 4f\n" + + "1:\n" + "fmla v8.8h , %[b0].8h, %[a0].h[0]\n" + "fmla v9.8h , %[b0].8h, %[a0].h[1]\n" + "ldr %q[a0a], [%[a_ptr], #16]\n" + "fmla v10.8h, %[b0].8h, %[a0].h[2]\n" + "fmla v11.8h, %[b0].8h, %[a0].h[3]\n" + "ldr %q[b2a], [%[b_ptr], #80]\n" + "fmla v12.8h, %[b0].8h, %[a0].h[4]\n" + "fmla v13.8h, %[b0].8h, %[a0].h[5]\n" + "fmla v14.8h, %[b0].8h, %[a0].h[6]\n" + "fmla v15.8h, %[b0].8h, %[a0].h[7]\n" + "ldr %q[b0], [%[b_ptr], #96]\n" + + "fmla v16.8h, %[b1].8h, %[a0].h[0]\n" + "fmla v17.8h, %[b1].8h, %[a0].h[1]\n" + ASM_PREFETCH("[%[a_ptr], #128]") + "fmla v18.8h, %[b1].8h, %[a0].h[2]\n" + "fmla v19.8h, %[b1].8h, %[a0].h[3]\n" + "add %[b_ptr], %[b_ptr], #96\n" + "fmla v20.8h, %[b1].8h, %[a0].h[4]\n" + "fmla v21.8h, %[b1].8h, %[a0].h[5]\n" + "fmla v22.8h, %[b1].8h, %[a0].h[6]\n" + "fmla v23.8h, %[b1].8h, %[a0].h[7]\n" + "ldr %q[b1], [%[b_ptr], #16]\n" + + "fmla v24.8h, %[b2].8h, %[a0].h[0]\n" + "fmla v25.8h, %[b2].8h, %[a0].h[1]\n" + ASM_PREFETCH("[%[b_ptr], #288]") + "fmla v26.8h, %[b2].8h, %[a0].h[2]\n" + "fmla v27.8h, %[b2].8h, %[a0].h[3]\n" + "fmla v28.8h, %[b2].8h, %[a0].h[4]\n" + "fmla v29.8h, %[b2].8h, %[a0].h[5]\n" + "fmla v30.8h, %[b2].8h, %[a0].h[6]\n" + "fmla v31.8h, %[b2].8h, %[a0].h[7]\n" + "ldr %q[a0], [%[a_ptr], #32]\n" + + "fmla v8.8h , %[b0a].8h, %[a0a].h[0]\n" + "fmla v9.8h , %[b0a].8h, %[a0a].h[1]\n" + "ldr %q[b2], [%[b_ptr], #32]\n" + "fmla v10.8h, %[b0a].8h, %[a0a].h[2]\n" + "fmla v11.8h, %[b0a].8h, %[a0a].h[3]\n" + "fmla v12.8h, %[b0a].8h, %[a0a].h[4]\n" + "fmla v13.8h, %[b0a].8h, %[a0a].h[5]\n" + "fmla v14.8h, %[b0a].8h, %[a0a].h[6]\n" + "fmla v15.8h, %[b0a].8h, %[a0a].h[7]\n" + "ldr %q[b0a], [%[b_ptr], #48]\n" + + "fmla v16.8h, %[b1a].8h, %[a0a].h[0]\n" + "fmla v17.8h, %[b1a].8h, %[a0a].h[1]\n" + ASM_PREFETCH("[%[b_ptr], #352]") + "fmla v18.8h, %[b1a].8h, %[a0a].h[2]\n" + "fmla v19.8h, %[b1a].8h, %[a0a].h[3]\n" + "fmla v20.8h, %[b1a].8h, %[a0a].h[4]\n" + "fmla v21.8h, %[b1a].8h, %[a0a].h[5]\n" + "fmla v22.8h, %[b1a].8h, %[a0a].h[6]\n" + "fmla v23.8h, %[b1a].8h, %[a0a].h[7]\n" + "ldr %q[b1a], [%[b_ptr], #64]\n" + + "fmla v24.8h, %[b2a].8h, %[a0a].h[0]\n" + "fmla v25.8h, %[b2a].8h, %[a0a].h[1]\n" + "add %[a_ptr], %[a_ptr], #32\n" + "fmla v26.8h, %[b2a].8h, %[a0a].h[2]\n" + "fmla v27.8h, %[b2a].8h, %[a0a].h[3]\n" + "fmla v28.8h, %[b2a].8h, %[a0a].h[4]\n" + "fmla v29.8h, %[b2a].8h, %[a0a].h[5]\n" + "subs %w[k], %w[k], #1\n" + "fmla v30.8h, %[b2a].8h, %[a0a].h[6]\n" + "fmla v31.8h, %[b2a].8h, %[a0a].h[7]\n" + + "bne 1b\n" + "4:\n" + + // 
Jump to odd tail if necessary. + "cbnz %w[oddk], 2f\n" + + // Even tail. + "fmla v8.8h , %[b0].8h, %[a0].h[0]\n" + "fmla v9.8h , %[b0].8h, %[a0].h[1]\n" + "ldr %q[a0a], [%[a_ptr], #16]\n" + "fmla v10.8h, %[b0].8h, %[a0].h[2]\n" + "fmla v11.8h, %[b0].8h, %[a0].h[3]\n" + "ldr %q[b2a], [%[b_ptr], #80]\n" + "fmla v12.8h, %[b0].8h, %[a0].h[4]\n" + "fmla v13.8h, %[b0].8h, %[a0].h[5]\n" + "fmla v14.8h, %[b0].8h, %[a0].h[6]\n" + "fmla v15.8h, %[b0].8h, %[a0].h[7]\n" + + "fmla v16.8h, %[b1].8h, %[a0].h[0]\n" + "fmla v17.8h, %[b1].8h, %[a0].h[1]\n" + "add %[b_ptr], %[b_ptr], #96\n" + "fmla v18.8h, %[b1].8h, %[a0].h[2]\n" + "fmla v19.8h, %[b1].8h, %[a0].h[3]\n" + "fmla v20.8h, %[b1].8h, %[a0].h[4]\n" + "fmla v21.8h, %[b1].8h, %[a0].h[5]\n" + "add %[a_ptr], %[a_ptr], #32\n" + "fmla v22.8h, %[b1].8h, %[a0].h[6]\n" + "fmla v23.8h, %[b1].8h, %[a0].h[7]\n" + + "fmla v24.8h, %[b2].8h, %[a0].h[0]\n" + "fmla v25.8h, %[b2].8h, %[a0].h[1]\n" + "fmla v26.8h, %[b2].8h, %[a0].h[2]\n" + "fmla v27.8h, %[b2].8h, %[a0].h[3]\n" + "fmla v28.8h, %[b2].8h, %[a0].h[4]\n" + "fmla v29.8h, %[b2].8h, %[a0].h[5]\n" + "fmla v30.8h, %[b2].8h, %[a0].h[6]\n" + "fmla v31.8h, %[b2].8h, %[a0].h[7]\n" + + "fmla v8.8h , %[b0a].8h, %[a0a].h[0]\n" + "fmla v16.8h, %[b1a].8h, %[a0a].h[0]\n" + "str q8, [%[c_ptr]]\n" + "fmla v24.8h, %[b2a].8h, %[a0a].h[0]\n" + "str q16, [%[c_ptr], #16]\n" + + "fmla v9.8h , %[b0a].8h, %[a0a].h[1]\n" + "str q24, [%[c_ptr], #32]\n" + "fmla v17.8h, %[b1a].8h, %[a0a].h[1]\n" + "str q9, [%[c_ptr], #48]\n" + "fmla v25.8h, %[b2a].8h, %[a0a].h[1]\n" + "str q17, [%[c_ptr], #64]\n" + + "fmla v10.8h, %[b0a].8h, %[a0a].h[2]\n" + "str q25, [%[c_ptr], #80]\n" + "fmla v18.8h, %[b1a].8h, %[a0a].h[2]\n" + "str q10, [%[c_ptr], #96]\n" + "fmla v26.8h, %[b2a].8h, %[a0a].h[2]\n" + "str q18, [%[c_ptr], #112]\n" + + "fmla v11.8h, %[b0a].8h, %[a0a].h[3]\n" + "str q26, [%[c_ptr], #128]\n" + "fmla v19.8h, %[b1a].8h, %[a0a].h[3]\n" + "str q11, [%[c_ptr], #144]\n" + "fmla v27.8h, %[b2a].8h, %[a0a].h[3]\n" + "str q19, [%[c_ptr], #160]\n" + + "fmla v12.8h, %[b0a].8h, %[a0a].h[4]\n" + "str q27, [%[c_ptr], #176]\n" + "fmla v20.8h, %[b1a].8h, %[a0a].h[4]\n" + "str q12, [%[c_ptr], #192]\n" + "fmla v28.8h, %[b2a].8h, %[a0a].h[4]\n" + "str q20, [%[c_ptr], #208]\n" + + "fmla v13.8h, %[b0a].8h, %[a0a].h[5]\n" + "str q28, [%[c_ptr], #224]\n" + "fmla v21.8h, %[b1a].8h, %[a0a].h[5]\n" + "str q13, [%[c_ptr], #240]\n" + "fmla v29.8h, %[b2a].8h, %[a0a].h[5]\n" + "str q21, [%[c_ptr], #256]\n" + + "fmla v14.8h, %[b0a].8h, %[a0a].h[6]\n" + "str q29, [%[c_ptr], #272]\n" + "fmla v22.8h, %[b1a].8h, %[a0a].h[6]\n" + "str q14, [%[c_ptr], #288]\n" + "fmla v30.8h, %[b2a].8h, %[a0a].h[6]\n" + "str q22, [%[c_ptr], #304]\n" + + "fmla v15.8h, %[b0a].8h, %[a0a].h[7]\n" + "str q30, [%[c_ptr], #320]\n" + "fmla v23.8h, %[b1a].8h, %[a0a].h[7]\n" + "str q15, [%[c_ptr], #336]\n" + "fmla v31.8h, %[b2a].8h, %[a0a].h[7]\n" + "b 3f\n" + + // Odd tail + "2:\n" + "fmla v8.8h , %[b0].8h, %[a0].h[0]\n" + "add %[b_ptr], %[b_ptr], #48\n" + "fmla v16.8h, %[b1].8h, %[a0].h[0]\n" + "add %[a_ptr], %[a_ptr], #16\n" + "str q8, [%[c_ptr]]\n" + "fmla v24.8h, %[b2].8h, %[a0].h[0]\n" + "str q16, [%[c_ptr], #16]\n" + + "fmla v9.8h , %[b0].8h, %[a0].h[1]\n" + "str q24, [%[c_ptr], #32]\n" + "fmla v17.8h, %[b1].8h, %[a0].h[1]\n" + "str q9, [%[c_ptr], #48]\n" + "fmla v25.8h, %[b2].8h, %[a0].h[1]\n" + "str q17, [%[c_ptr], #64]\n" + + "fmla v10.8h, %[b0].8h, %[a0].h[2]\n" + "str q25, [%[c_ptr], #80]\n" + "fmla v18.8h, %[b1].8h, %[a0].h[2]\n" + "str q10, [%[c_ptr], #96]\n" + "fmla v26.8h, %[b2].8h, 
%[a0].h[2]\n" + "str q18, [%[c_ptr], #112]\n" + + "fmla v11.8h, %[b0].8h, %[a0].h[3]\n" + "str q26, [%[c_ptr], #128]\n" + "fmla v19.8h, %[b1].8h, %[a0].h[3]\n" + "str q11, [%[c_ptr], #144]\n" + "fmla v27.8h, %[b2].8h, %[a0].h[3]\n" + "str q19, [%[c_ptr], #160]\n" + + "fmla v12.8h, %[b0].8h, %[a0].h[4]\n" + "str q27, [%[c_ptr], #176]\n" + "fmla v20.8h, %[b1].8h, %[a0].h[4]\n" + "str q12, [%[c_ptr], #192]\n" + "fmla v28.8h, %[b2].8h, %[a0].h[4]\n" + "str q20, [%[c_ptr], #208]\n" + + "fmla v13.8h, %[b0].8h, %[a0].h[5]\n" + "str q28, [%[c_ptr], #224]\n" + "fmla v21.8h, %[b1].8h, %[a0].h[5]\n" + "str q13, [%[c_ptr], #240]\n" + "fmla v29.8h, %[b2].8h, %[a0].h[5]\n" + "str q21, [%[c_ptr], #256]\n" + + "fmla v14.8h, %[b0].8h, %[a0].h[6]\n" + "str q29, [%[c_ptr], #272]\n" + "fmla v22.8h, %[b1].8h, %[a0].h[6]\n" + "str q14, [%[c_ptr], #288]\n" + "fmla v30.8h, %[b2].8h, %[a0].h[6]\n" + "str q22, [%[c_ptr], #304]\n" + + "fmla v15.8h, %[b0].8h, %[a0].h[7]\n" + "str q30, [%[c_ptr], #320]\n" + "fmla v23.8h, %[b1].8h, %[a0].h[7]\n" + "str q15, [%[c_ptr], #336]\n" + "fmla v31.8h, %[b2].8h, %[a0].h[7]\n" + + "3:\n" + "str q23, [%[c_ptr], #352]\n" + "str q31, [%[c_ptr], #368]\n" + "add %[c_ptr], %[c_ptr], #384\n" + : + [a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [c_ptr] "+r" (c_ptr), + [a0] "+w" (a0), [a0a] "+w" (a0a), + [b0] "+w" (b0), [b1] "+w" (b1), [b2] "+w" (b2), [k] "+r" (k), + [b0a] "+w" (b0a), [b1a] "+w" (b1a), [b2a] "+w" (b2a) + : [oddk] "r" (oddk) + : "x20", "x21", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", + "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc", "memory" + ); + } + } +} diff --git a/arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemm_12x8.hpp b/arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemm_12x8.hpp index e229e215e..603ad8dc0 100644 --- a/arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemm_12x8.hpp +++ b/arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemm_12x8.hpp @@ -28,6 +28,9 @@ // Actual kernel implementations #include "a64_sgemm_12x8/generic.hpp" #include "a64_sgemm_12x8/a53.hpp" +#include "a64_sgemm_12x8/a55.hpp" +#include "a64_sgemm_12x8/a55r1.hpp" + // 12x8 SGEMM "strategy" class. // @@ -66,6 +69,12 @@ public: if (ci->CPU == CPUTarget::A53) { kernel = a64_sgemm_asimd_12x8_a53; } + else if (ci->CPU == CPUTarget::A55) { + kernel = a64_sgemm_asimd_12x8_a55; + } + else if (ci->CPU == CPUTarget::A55_DOT) { + kernel = a64_sgemm_asimd_12x8_a55r1; + } } }; diff --git a/arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemm_12x8/a53.hpp b/arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemm_12x8/a53.hpp index e58ce6682..1c9b4b38f 100644 --- a/arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemm_12x8/a53.hpp +++ b/arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemm_12x8/a53.hpp @@ -206,7 +206,7 @@ inline void a64_sgemm_asimd_12x8_a53(const float *Apanel, const float *Bpanel, f // Branch here if K=1 or 2. Do the right thing for odd/even at the end. "4:\n" - "cbnz %[oddk], 2f\n" + "cbnz %w[oddk], 2f\n" // Detached final iteration. 
(even K) "ldr %d[b2], [%[b_ptr], #32]\n" @@ -360,8 +360,9 @@ inline void a64_sgemm_asimd_12x8_a53(const float *Apanel, const float *Bpanel, f [b0] "+w" (b0), [b1] "+w" (b1), [b2] "+w" (b2), [k] "+r" (k) : [oddk] "r" (oddk) : "x20", "x21", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", - "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31" + "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc" ); } } } + diff --git a/arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemm_12x8/a55.hpp b/arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemm_12x8/a55.hpp new file mode 100644 index 000000000..85d8a502f --- /dev/null +++ b/arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemm_12x8/a55.hpp @@ -0,0 +1,368 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#pragma once + +inline void a64_sgemm_asimd_12x8_a55(const float *Apanel, const float *Bpanel, float *Cpanel, int ablocks, int bblocks, int K) { + const float *a_ptr = Apanel; + float *c_ptr = Cpanel; + + for (int yb=0; yb<ablocks; yb++) { + const float *a_ptr0 = a_ptr; + const float *b_ptr = Bpanel; + + for (int xb=0; xb<bblocks; xb++) { + a_ptr = a_ptr0; + // Fix up for odd lengths - set a flag if K is odd, but make + // sure we round up the iteration count. + int oddk = (K & 1); + int k = ((K+1)/2) - 1; + + register float32x4_t a0 asm("v0"); + register float32x4_t a1 asm("v1"); + register float32x4_t b0 asm("v2"); + register float32x4_t b1 asm("v3"); + register float32x4_t b2 asm("v4"); + register float32x4_t a0a asm("v5"); + register float32x4_t a1a asm("v6"); + + __asm __volatile ( + // Initialize result registers, load initial operands, prime prefetches. 
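+            // v8-v31 form the 12x8 single-precision accumulator tile; a0/a1 carry the 8 A
+            // values and b0-b2 the 12 B values consumed in each K step.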
+ "movi v8.4s, #0x0\n" + "ldr %q[a0], [%[a_ptr]]\n" + "movi v9.4s, #0x0\n" + "ldr %q[b0], [%[b_ptr]]\n" + "movi v10.4s, #0x0\n" + "ldr %q[a1], [%[a_ptr], #16]\n" + "movi v11.4s, #0x0\n" + "ldr %q[b1], [%[b_ptr], #16]\n" + "movi v12.4s, #0x0\n" + ASM_PREFETCH("[%[b_ptr], #64]") + "movi v13.4s, #0x0\n" + ASM_PREFETCH("[%[a_ptr], #64]") + "movi v14.4s, #0x0\n" + ASM_PREFETCH("[%[b_ptr], #128]") + "movi v15.4s, #0x0\n" + ASM_PREFETCH("[%[a_ptr], #128]") + "movi v16.4s, #0x0\n" + ASM_PREFETCH("[%[b_ptr], #192]") + "movi v17.4s, #0x0\n" + ASM_PREFETCH("[%[b_ptr], #256]") + "movi v18.4s, #0x0\n" + ASM_PREFETCH("[%[a_ptr], #192]") + "movi v19.4s, #0x0\n" + ASM_PREFETCH("[%[b_ptr], #320]") + "movi v20.4s, #0x0\n" + ASM_PREFETCH("[%[a_ptr], #256]") + "movi v21.4s, #0x0\n" + ASM_PREFETCH("[%[b_ptr], #384]") + "movi v22.4s, #0x0\n" + "movi v23.4s, #0x0\n" + "movi v24.4s, #0x0\n" + "movi v25.4s, #0x0\n" + "movi v26.4s, #0x0\n" + "movi v27.4s, #0x0\n" + "movi v28.4s, #0x0\n" + "movi v29.4s, #0x0\n" + "movi v30.4s, #0x0\n" + "movi v31.4s, #0x0\n" + + // Skip loop if we are doing zero iterations of it. + "cbz %w[k], 4f\n" + + "1:\n" + // Unroll 0 + "ldr %d[b2], [%[b_ptr], #32]\n" + "fmla v8.4s , %[b0].4s, %[a0].s[0]\n" + + "fmla v9.4s , %[b0].4s, %[a0].s[1]\n" + "ldr x20, [%[b_ptr], #40]\n" + "fmla v10.4s, %[b0].4s, %[a0].s[2]\n" + "fmla v11.4s, %[b0].4s, %[a0].s[3]\n" + "subs %w[k], %w[k], #1\n" + + + "ldr %d[a0a], [%[a_ptr], #32]\n" + "ins %[b2].d[1], x20\n" + + "fmla v12.4s, %[b0].4s, %[a1].s[0]\n" + "fmla v13.4s, %[b0].4s, %[a1].s[1]\n" + "ldr x20, [%[a_ptr], #40]\n" + "fmla v14.4s, %[b0].4s, %[a1].s[2]\n" + "fmla v15.4s, %[b0].4s, %[a1].s[3]\n" + + "ldr %d[a1a], [%[a_ptr], #48]\n" + "ins %[a0a].d[1], x20\n" + + "fmla v16.4s, %[b1].4s, %[a0].s[0]\n" + "fmla v17.4s, %[b1].4s, %[a0].s[1]\n" + "ldr x20, [%[a_ptr], #56]\n" + "fmla v18.4s, %[b1].4s, %[a0].s[2]\n" + "fmla v19.4s, %[b1].4s, %[a0].s[3]\n" + + "ldr %d[b0], [%[b_ptr], #48]\n" + "ins %[a1a].d[1], x20\n" + ASM_PREFETCH("[%[a_ptr], #320]") + "fmla v20.4s, %[b1].4s, %[a1].s[0]\n" + "fmla v21.4s, %[b1].4s, %[a1].s[1]\n" + "ldr x20, [%[b_ptr], #56]\n" + "fmla v22.4s, %[b1].4s, %[a1].s[2]\n" + "fmla v23.4s, %[b1].4s, %[a1].s[3]\n" + + "ldr %d[b1], [%[b_ptr], #64]\n" + "ins %[b0].d[1], x20\n" + + "fmla v24.4s, %[b2].4s, %[a0].s[0]\n" + "fmla v25.4s, %[b2].4s, %[a0].s[1]\n" + "ldr x20, [%[b_ptr], #72]\n" + "fmla v26.4s, %[b2].4s, %[a0].s[2]\n" + "fmla v27.4s, %[b2].4s, %[a0].s[3]\n" + ASM_PREFETCH("[%[b_ptr], #448]") + + + "fmla v28.4s, %[b2].4s, %[a1].s[0]\n" + "fmla v29.4s, %[b2].4s, %[a1].s[1]\n" + ASM_PREFETCH("[%[b_ptr], #512]") + "fmla v30.4s, %[b2].4s, %[a1].s[2]\n" + "fmla v31.4s, %[b2].4s, %[a1].s[3]\n" + + // Unroll 1 + "ldr %d[b2], [%[b_ptr], #80]\n" + "ins %[b1].d[1], x20\n" + + "fmla v8.4s , %[b0].4s, %[a0a].s[0]\n" + "fmla v9.4s , %[b0].4s, %[a0a].s[1]\n" + "ldr x20, [%[b_ptr], #88]\n" + "fmla v10.4s, %[b0].4s, %[a0a].s[2]\n" + "fmla v11.4s, %[b0].4s, %[a0a].s[3]\n" + + "ldr %d[a0], [%[a_ptr], #64]\n" + "ins %[b2].d[1], x20\n" + + "fmla v12.4s, %[b0].4s, %[a1a].s[0]\n" + "fmla v13.4s, %[b0].4s, %[a1a].s[1]\n" + "ldr x20, [%[a_ptr], #72]\n" + "fmla v14.4s, %[b0].4s, %[a1a].s[2]\n" + "fmla v15.4s, %[b0].4s, %[a1a].s[3]\n" + + "ldr %d[a1], [%[a_ptr], #80]\n" + "ins %[a0].d[1], x20\n" + + "fmla v16.4s, %[b1].4s, %[a0a].s[0]\n" + "fmla v17.4s, %[b1].4s, %[a0a].s[1]\n" + "ldr x20, [%[a_ptr], #88]\n" + "fmla v18.4s, %[b1].4s, %[a0a].s[2]\n" + "fmla v19.4s, %[b1].4s, %[a0a].s[3]\n" + + + "ldr %d[b0], [%[b_ptr], #96]\n" + "ins %[a1].d[1], x20\n" + 
+ "fmla v20.4s, %[b1].4s, %[a1a].s[0]\n" + "fmla v21.4s, %[b1].4s, %[a1a].s[1]\n" + "ldr x20, [%[b_ptr], #104]\n" + "fmla v22.4s, %[b1].4s, %[a1a].s[2]\n" + "fmla v23.4s, %[b1].4s, %[a1a].s[3]\n" + + "ldr %d[b1], [%[b_ptr], #112]\n" + "ins %[b0].d[1], x20\n" + + "fmla v24.4s, %[b2].4s, %[a0a].s[0]\n" + "fmla v25.4s, %[b2].4s, %[a0a].s[1]\n" + "ldr x20, [%[b_ptr], #120]\n" + "fmla v26.4s, %[b2].4s, %[a0a].s[2]\n" + "fmla v27.4s, %[b2].4s, %[a0a].s[3]\n" + "add %[a_ptr], %[a_ptr], #64\n" + + "fmla v28.4s, %[b2].4s, %[a1a].s[0]\n" + "fmla v29.4s, %[b2].4s, %[a1a].s[1]\n" + "fmla v30.4s, %[b2].4s, %[a1a].s[2]\n" + "add %[b_ptr], %[b_ptr], #96\n" + "fmla v31.4s, %[b2].4s, %[a1a].s[3]\n" + + + "ldr %d[b2], [%[b_ptr], #32]\n" + "ins %[b1].d[1], x20\n" + + + "bne 1b\n" + + // Branch here if K=1 or 2. Do the right thing for odd/even at the end. + "4:\n" + "cbnz %w[oddk], 2f\n" + "fmla v8.4s , %[b0].4s, %[a0].s[0]\n" + + // Detached final iteration. (even K) + "ldr x20, [%[b_ptr], #40]\n" + "fmla v9.4s , %[b0].4s, %[a0].s[1]\n" + "subs %w[k], %w[k], #1\n" + "fmla v10.4s, %[b0].4s, %[a0].s[2]\n" + "fmla v11.4s, %[b0].4s, %[a0].s[3]\n" + + "ldr %d[a0a], [%[a_ptr], #32]\n" + "ins %[b2].d[1], x20\n" + + "fmla v12.4s, %[b0].4s, %[a1].s[0]\n" + "fmla v13.4s, %[b0].4s, %[a1].s[1]\n" + "ldr x20, [%[a_ptr], #40]\n" + "fmla v14.4s, %[b0].4s, %[a1].s[2]\n" + "fmla v15.4s, %[b0].4s, %[a1].s[3]\n" + + "ldr %d[a1a], [%[a_ptr], #48]\n" + "ins %[a0a].d[1], x20\n" + + "fmla v16.4s, %[b1].4s, %[a0].s[0]\n" + "fmla v17.4s, %[b1].4s, %[a0].s[1]\n" + "ldr x20, [%[a_ptr], #56]\n" + "fmla v18.4s, %[b1].4s, %[a0].s[2]\n" + "fmla v19.4s, %[b1].4s, %[a0].s[3]\n" + + "ldr %d[b0], [%[b_ptr], #48]\n" + "ins %[a1a].d[1], x20\n" + + "fmla v20.4s, %[b1].4s, %[a1].s[0]\n" + "fmla v21.4s, %[b1].4s, %[a1].s[1]\n" + "ldr x20, [%[b_ptr], #56]\n" + "fmla v22.4s, %[b1].4s, %[a1].s[2]\n" + "fmla v23.4s, %[b1].4s, %[a1].s[3]\n" + + "fmla v24.4s, %[b2].4s, %[a0].s[0]\n" + "fmla v25.4s, %[b2].4s, %[a0].s[1]\n" + "fmla v26.4s, %[b2].4s, %[a0].s[2]\n" + "fmla v27.4s, %[b2].4s, %[a0].s[3]\n" + + "ldr %d[b1], [%[b_ptr], #64]\n" + "ins %[b0].d[1], x20\n" + + "fmla v28.4s, %[b2].4s, %[a1].s[0]\n" + "fmla v29.4s, %[b2].4s, %[a1].s[1]\n" + "ldr x20, [%[b_ptr], #72]\n" + "fmla v30.4s, %[b2].4s, %[a1].s[2]\n" + "fmla v31.4s, %[b2].4s, %[a1].s[3]\n" + + "ldr %d[b2], [%[b_ptr], #80]\n" + "ins %[b1].d[1], x20\n" + + "fmla v8.4s , %[b0].4s, %[a0a].s[0]\n" + "fmla v9.4s , %[b0].4s, %[a0a].s[1]\n" + "ldr x20, [%[b_ptr], #88]\n" + "fmla v10.4s, %[b0].4s, %[a0a].s[2]\n" + + "ins %[b2].d[1], x20\n" + "fmla v11.4s, %[b0].4s, %[a0a].s[3]\n" + "fmla v12.4s, %[b0].4s, %[a1a].s[0]\n" + "fmla v13.4s, %[b0].4s, %[a1a].s[1]\n" + "fmla v14.4s, %[b0].4s, %[a1a].s[2]\n" + "fmla v15.4s, %[b0].4s, %[a1a].s[3]\n" + "fmla v16.4s, %[b1].4s, %[a0a].s[0]\n" + "fmla v17.4s, %[b1].4s, %[a0a].s[1]\n" + "fmla v18.4s, %[b1].4s, %[a0a].s[2]\n" + "fmla v19.4s, %[b1].4s, %[a0a].s[3]\n" + "fmla v20.4s, %[b1].4s, %[a1a].s[0]\n" + "fmla v21.4s, %[b1].4s, %[a1a].s[1]\n" + "fmla v22.4s, %[b1].4s, %[a1a].s[2]\n" + "fmla v23.4s, %[b1].4s, %[a1a].s[3]\n" + "fmla v24.4s, %[b2].4s, %[a0a].s[0]\n" + "fmla v25.4s, %[b2].4s, %[a0a].s[1]\n" + "fmla v26.4s, %[b2].4s, %[a0a].s[2]\n" + "fmla v27.4s, %[b2].4s, %[a0a].s[3]\n" + "fmla v28.4s, %[b2].4s, %[a1a].s[0]\n" + "fmla v29.4s, %[b2].4s, %[a1a].s[1]\n" + "add %[a_ptr], %[a_ptr], #64\n" + "fmla v30.4s, %[b2].4s, %[a1a].s[2]\n" + "add %[b_ptr], %[b_ptr], #96\n" + "fmla v31.4s, %[b2].4s, %[a1a].s[3]\n" + "b 3f\n" + + // Detached final iteration. 
(odd K) + "2:\n" + + "ldr %d[b2], [%[b_ptr], #32]\n" + "fmla v8.4s , %[b0].4s, %[a0].s[0]\n" + + "fmla v9.4s , %[b0].4s, %[a0].s[1]\n" + "ldr x20, [%[b_ptr], #40]\n" + "fmla v10.4s, %[b0].4s, %[a0].s[2]\n" + "fmla v11.4s, %[b0].4s, %[a0].s[3]\n" + "ins %[b2].d[1], x20\n" + "fmla v12.4s, %[b0].4s, %[a1].s[0]\n" + "fmla v13.4s, %[b0].4s, %[a1].s[1]\n" + "fmla v14.4s, %[b0].4s, %[a1].s[2]\n" + "fmla v15.4s, %[b0].4s, %[a1].s[3]\n" + "fmla v16.4s, %[b1].4s, %[a0].s[0]\n" + "fmla v17.4s, %[b1].4s, %[a0].s[1]\n" + "fmla v18.4s, %[b1].4s, %[a0].s[2]\n" + "fmla v19.4s, %[b1].4s, %[a0].s[3]\n" + "fmla v20.4s, %[b1].4s, %[a1].s[0]\n" + "fmla v21.4s, %[b1].4s, %[a1].s[1]\n" + "fmla v22.4s, %[b1].4s, %[a1].s[2]\n" + "fmla v23.4s, %[b1].4s, %[a1].s[3]\n" + "fmla v24.4s, %[b2].4s, %[a0].s[0]\n" + "fmla v25.4s, %[b2].4s, %[a0].s[1]\n" + "fmla v26.4s, %[b2].4s, %[a0].s[2]\n" + "fmla v27.4s, %[b2].4s, %[a0].s[3]\n" + "fmla v28.4s, %[b2].4s, %[a1].s[0]\n" + "fmla v29.4s, %[b2].4s, %[a1].s[1]\n" + "add %[a_ptr], %[a_ptr], #32\n" + "fmla v30.4s, %[b2].4s, %[a1].s[2]\n" + "add %[b_ptr], %[b_ptr], #48\n" + "fmla v31.4s, %[b2].4s, %[a1].s[3]\n" + + // Common tail + "3:\n" + "str q8, [%[c_ptr]]\n" + "str q16, [%[c_ptr], #16]\n" + "str q24, [%[c_ptr], #32]\n" + "str q9, [%[c_ptr], #48]\n" + "str q17, [%[c_ptr], #64]\n" + "str q25, [%[c_ptr], #80]\n" + "str q10, [%[c_ptr], #96]\n" + "str q18, [%[c_ptr], #112]\n" + "str q26, [%[c_ptr], #128]\n" + "str q11, [%[c_ptr], #144]\n" + "str q19, [%[c_ptr], #160]\n" + "str q27, [%[c_ptr], #176]\n" + "str q12, [%[c_ptr], #192]\n" + "str q20, [%[c_ptr], #208]\n" + "str q28, [%[c_ptr], #224]\n" + "str q13, [%[c_ptr], #240]\n" + "str q21, [%[c_ptr], #256]\n" + "str q29, [%[c_ptr], #272]\n" + "str q14, [%[c_ptr], #288]\n" + "str q22, [%[c_ptr], #304]\n" + "str q30, [%[c_ptr], #320]\n" + "str q15, [%[c_ptr], #336]\n" + "str q23, [%[c_ptr], #352]\n" + "str q31, [%[c_ptr], #368]\n" + "add %[c_ptr], %[c_ptr], #384\n" + : + [a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [c_ptr] "+r" (c_ptr), + [a0] "+w" (a0), [a1] "+w" (a1), [a0a] "+w" (a0a), [a1a] "+w" (a1a), + [b0] "+w" (b0), [b1] "+w" (b1), [b2] "+w" (b2), [k] "+r" (k) + : [oddk] "r" (oddk) + : "x20", "x21", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", + "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc" + ); + } + } +} diff --git a/arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemm_12x8/a55r1.hpp b/arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemm_12x8/a55r1.hpp new file mode 100644 index 000000000..295308053 --- /dev/null +++ b/arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemm_12x8/a55r1.hpp @@ -0,0 +1,360 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#pragma once + +inline void a64_sgemm_asimd_12x8_a55r1(const float *Apanel, const float *Bpanel, float *Cpanel, int ablocks, int bblocks, int K) { + const float *a_ptr = Apanel; + float *c_ptr = Cpanel; + for (int yb=0; yb<ablocks; yb++) { + const float *a_ptr0 = a_ptr; + const float *b_ptr = Bpanel; + for (int xb=0; xb<bblocks; xb++) { + a_ptr = a_ptr0; + // Fix up for odd lengths - set a flag if K is odd, but make + // sure we round up the iteration count. + int oddk = (K & 1); + int k = ((K+1)/2) - 1; + + register float32x4_t a0 asm("v0"); + register float32x4_t a1 asm("v1"); + register float32x4_t b0 asm("v2"); + register float32x4_t b1 asm("v3"); + register float32x4_t b2 asm("v4"); + register float32x4_t a0a asm("v5"); + register float32x4_t a1a asm("v6"); + + __asm __volatile ( + // Initialize result registers, load initial operands, prime prefetches. + "ldp %q[a0], %q[a1], [%[a_ptr]]\n" + ASM_PREFETCH("[%[a_ptr], #64]") + + ASM_PREFETCH("[%[a_ptr], #128]") + ASM_PREFETCH("[%[a_ptr], #192]") + "ldp %q[b0], %q[b1], [%[b_ptr]]\n" + ASM_PREFETCH("[%[b_ptr], #64]") + + ASM_PREFETCH("[%[b_ptr], #128]") + ASM_PREFETCH("[%[b_ptr], #192]") + ASM_PREFETCH("[%[b_ptr], #256]") + + ASM_PREFETCH("[%[a_ptr], #256]") + ASM_PREFETCH("[%[a_ptr], #320]") + ASM_PREFETCH("[%[a_ptr], #384]") + + ASM_PREFETCH("[%[b_ptr], #320]") + ASM_PREFETCH("[%[b_ptr], #384]") + ASM_PREFETCH("[%[b_ptr], #448]") + ASM_PREFETCH("[%[b_ptr], #512]") + + "movi v8.4s, #0x0\n" + "movi v9.4s, #0x0\n" + "movi v10.4s, #0x0\n" + "movi v11.4s, #0x0\n" + "movi v12.4s, #0x0\n" + "movi v13.4s, #0x0\n" + "movi v14.4s, #0x0\n" + "movi v15.4s, #0x0\n" + "movi v16.4s, #0x0\n" + "movi v17.4s, #0x0\n" + + "movi v18.4s, #0x0\n" + "movi v19.4s, #0x0\n" + "movi v20.4s, #0x0\n" + "movi v21.4s, #0x0\n" + "movi v22.4s, #0x0\n" + "movi v23.4s, #0x0\n" + "movi v24.4s, #0x0\n" + "movi v25.4s, #0x0\n" + "movi v26.4s, #0x0\n" + "movi v27.4s, #0x0\n" + "movi v28.4s, #0x0\n" + "movi v29.4s, #0x0\n" + "movi v30.4s, #0x0\n" + "movi v31.4s, #0x0\n" + + // Skip loop if we are doing zero iterations of it. 
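+            // k was set to ((K+1)/2)-1, so it is zero for K of 1 or 2 and execution falls
+            // straight through to the detached tail.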
+ "cbz %w[k], 4f\n" + + "1:\n" + // Unroll 0 + "fmla v8.4s , %[b0].4s, %[a0].s[0]\n" + "ldr %d[b2], [%[b_ptr], #32]\n" + + "fmla v9.4s , %[b0].4s, %[a0].s[1]\n" + "ldr x20, [%[b_ptr], #40]\n" + "fmla v10.4s, %[b0].4s, %[a0].s[2]\n" + "subs %w[k], %w[k], #1\n" + "fmla v11.4s, %[b0].4s, %[a0].s[3]\n" + "ldr %d[a0a], [%[a_ptr], #32]\n" + + "fmla v12.4s, %[b0].4s, %[a1].s[0]\n" + "ins %[b2].d[1], x20\n" + "fmla v13.4s, %[b0].4s, %[a1].s[1]\n" + "ldr x20, [%[a_ptr], #40]\n" + "fmla v14.4s, %[b0].4s, %[a1].s[2]\n" + "fmla v15.4s, %[b0].4s, %[a1].s[3]\n" + "ldr %d[a1a], [%[a_ptr], #48]\n" + + "fmla v16.4s, %[b1].4s, %[a0].s[0]\n" + "ins %[a0a].d[1], x20\n" + "fmla v17.4s, %[b1].4s, %[a0].s[1]\n" + "ldr x20, [%[a_ptr], #56]\n" + "fmla v18.4s, %[b1].4s, %[a0].s[2]\n" + "fmla v19.4s, %[b1].4s, %[a0].s[3]\n" + "ldr %d[b0], [%[b_ptr], #48]\n" + + "fmla v20.4s, %[b1].4s, %[a1].s[0]\n" + "ins %[a1a].d[1], x20\n" + "fmla v21.4s, %[b1].4s, %[a1].s[1]\n" + "ldr x20, [%[b_ptr], #56]\n" + "fmla v22.4s, %[b1].4s, %[a1].s[2]\n" + "fmla v23.4s, %[b1].4s, %[a1].s[3]\n" + "ldr %d[b1], [%[b_ptr], #64]\n" + + "fmla v24.4s, %[b2].4s, %[a0].s[0]\n" + "ins %[b0].d[1], x20\n" + "fmla v25.4s, %[b2].4s, %[a0].s[1]\n" + "ldr x20, [%[b_ptr], #72]\n" + "fmla v26.4s, %[b2].4s, %[a0].s[2]\n" + "fmla v27.4s, %[b2].4s, %[a0].s[3]\n" + ASM_PREFETCH("[%[a_ptr], #448]") + + "fmla v28.4s, %[b2].4s, %[a1].s[0]\n" + "fmla v29.4s, %[b2].4s, %[a1].s[1]\n" + ASM_PREFETCH("[%[b_ptr], #576]") + "fmla v30.4s, %[b2].4s, %[a1].s[2]\n" + "fmla v31.4s, %[b2].4s, %[a1].s[3]\n" + + // Unroll 1 + "ldr %d[b2], [%[b_ptr], #80]\n" + + "fmla v8.4s , %[b0].4s, %[a0a].s[0]\n" + "ins %[b1].d[1], x20\n" + "fmla v9.4s , %[b0].4s, %[a0a].s[1]\n" + "ldr x20, [%[b_ptr], #88]\n" + "fmla v10.4s, %[b0].4s, %[a0a].s[2]\n" + "fmla v11.4s, %[b0].4s, %[a0a].s[3]\n" + "ldr %d[a0], [%[a_ptr], #64]\n" + + "fmla v12.4s, %[b0].4s, %[a1a].s[0]\n" + "ins %[b2].d[1], x20\n" + "fmla v13.4s, %[b0].4s, %[a1a].s[1]\n" + "ldr x20, [%[a_ptr], #72]\n" + "fmla v14.4s, %[b0].4s, %[a1a].s[2]\n" + "fmla v15.4s, %[b0].4s, %[a1a].s[3]\n" + "ldr %d[a1], [%[a_ptr], #80]\n" + + "fmla v16.4s, %[b1].4s, %[a0a].s[0]\n" + "ins %[a0].d[1], x20\n" + "fmla v17.4s, %[b1].4s, %[a0a].s[1]\n" + "ldr x20, [%[a_ptr], #88]\n" + "fmla v18.4s, %[b1].4s, %[a0a].s[2]\n" + "fmla v19.4s, %[b1].4s, %[a0a].s[3]\n" + "ldr %d[b0], [%[b_ptr], #96]\n" + + "fmla v20.4s, %[b1].4s, %[a1a].s[0]\n" + "ins %[a1].d[1], x20\n" + "fmla v21.4s, %[b1].4s, %[a1a].s[1]\n" + "ldr x20, [%[b_ptr], #104]\n" + "fmla v22.4s, %[b1].4s, %[a1a].s[2]\n" + "fmla v23.4s, %[b1].4s, %[a1a].s[3]\n" + "ldr %d[b1], [%[b_ptr], #112]\n" + + "fmla v24.4s, %[b2].4s, %[a0a].s[0]\n" + "ins %[b0].d[1], x20\n" + "fmla v25.4s, %[b2].4s, %[a0a].s[1]\n" + "ldr x20, [%[b_ptr], #120]\n" + "fmla v26.4s, %[b2].4s, %[a0a].s[2]\n" + + "fmla v27.4s, %[b2].4s, %[a0a].s[3]\n" + "add %[a_ptr], %[a_ptr], #64\n" + + "fmla v28.4s, %[b2].4s, %[a1a].s[0]\n" + ASM_PREFETCH("[%[b_ptr], #640]") + "fmla v29.4s, %[b2].4s, %[a1a].s[1]\n" + "add %[b_ptr], %[b_ptr], #96\n" + "fmla v30.4s, %[b2].4s, %[a1a].s[2]\n" + "ins %[b1].d[1], x20\n" + "fmla v31.4s, %[b2].4s, %[a1a].s[3]\n" + "ldr %d[b2], [%[b_ptr], #32]\n" + + + "bne 1b\n" + + // Branch here if K=1 or 2. Do the right thing for odd/even at the end. + "4:\n" + "cbnz %w[oddk], 2f\n" + "fmla v8.4s , %[b0].4s, %[a0].s[0]\n" + + // Detached final iteration. 
(even K) + "fmla v9.4s , %[b0].4s, %[a0].s[1]\n" + "ldr x20, [%[b_ptr], #40]\n" + "fmla v10.4s, %[b0].4s, %[a0].s[2]\n" + "subs %w[k], %w[k], #1\n" + "fmla v11.4s, %[b0].4s, %[a0].s[3]\n" + "ldr %d[a0a], [%[a_ptr], #32]\n" + + "fmla v12.4s, %[b0].4s, %[a1].s[0]\n" + "ins %[b2].d[1], x20\n" + "fmla v13.4s, %[b0].4s, %[a1].s[1]\n" + "ldr x20, [%[a_ptr], #40]\n" + "fmla v14.4s, %[b0].4s, %[a1].s[2]\n" + + "fmla v15.4s, %[b0].4s, %[a1].s[3]\n" + "ldr %d[a1a], [%[a_ptr], #48]\n" + + "fmla v16.4s, %[b1].4s, %[a0].s[0]\n" + "ins %[a0a].d[1], x20\n" + "fmla v17.4s, %[b1].4s, %[a0].s[1]\n" + "ldr x20, [%[a_ptr], #56]\n" + "fmla v18.4s, %[b1].4s, %[a0].s[2]\n" + + "fmla v19.4s, %[b1].4s, %[a0].s[3]\n" + "ldr %d[b0], [%[b_ptr], #48]\n" + + "fmla v20.4s, %[b1].4s, %[a1].s[0]\n" + "ins %[a1a].d[1], x20\n" + "fmla v21.4s, %[b1].4s, %[a1].s[1]\n" + "ldr x20, [%[b_ptr], #56]\n" + "fmla v22.4s, %[b1].4s, %[a1].s[2]\n" + "fmla v23.4s, %[b1].4s, %[a1].s[3]\n" + + "fmla v24.4s, %[b2].4s, %[a0].s[0]\n" + "fmla v25.4s, %[b2].4s, %[a0].s[1]\n" + "fmla v26.4s, %[b2].4s, %[a0].s[2]\n" + "fmla v27.4s, %[b2].4s, %[a0].s[3]\n" + "ldr %d[b1], [%[b_ptr], #64]\n" + + "fmla v28.4s, %[b2].4s, %[a1].s[0]\n" + "ins %[b0].d[1], x20\n" + "fmla v29.4s, %[b2].4s, %[a1].s[1]\n" + "ldr x20, [%[b_ptr], #72]\n" + "fmla v30.4s, %[b2].4s, %[a1].s[2]\n" + + "fmla v31.4s, %[b2].4s, %[a1].s[3]\n" + "ldr %d[b2], [%[b_ptr], #80]\n" + + "fmla v8.4s , %[b0].4s, %[a0a].s[0]\n" + "ins %[b1].d[1], x20\n" + "fmla v9.4s , %[b0].4s, %[a0a].s[1]\n" + "ldr x20, [%[b_ptr], #88]\n" + "fmla v10.4s, %[b0].4s, %[a0a].s[2]\n" + "ins %[b2].d[1], x20\n" + + "fmla v11.4s, %[b0].4s, %[a0a].s[3]\n" + "fmla v12.4s, %[b0].4s, %[a1a].s[0]\n" + "fmla v13.4s, %[b0].4s, %[a1a].s[1]\n" + "fmla v14.4s, %[b0].4s, %[a1a].s[2]\n" + "fmla v15.4s, %[b0].4s, %[a1a].s[3]\n" + "fmla v16.4s, %[b1].4s, %[a0a].s[0]\n" + "fmla v17.4s, %[b1].4s, %[a0a].s[1]\n" + "fmla v18.4s, %[b1].4s, %[a0a].s[2]\n" + "fmla v19.4s, %[b1].4s, %[a0a].s[3]\n" + "fmla v20.4s, %[b1].4s, %[a1a].s[0]\n" + "fmla v21.4s, %[b1].4s, %[a1a].s[1]\n" + "fmla v22.4s, %[b1].4s, %[a1a].s[2]\n" + "fmla v23.4s, %[b1].4s, %[a1a].s[3]\n" + "fmla v24.4s, %[b2].4s, %[a0a].s[0]\n" + "fmla v25.4s, %[b2].4s, %[a0a].s[1]\n" + "fmla v26.4s, %[b2].4s, %[a0a].s[2]\n" + "fmla v27.4s, %[b2].4s, %[a0a].s[3]\n" + "fmla v28.4s, %[b2].4s, %[a1a].s[0]\n" + "fmla v29.4s, %[b2].4s, %[a1a].s[1]\n" + "add %[a_ptr], %[a_ptr], #64\n" + "fmla v30.4s, %[b2].4s, %[a1a].s[2]\n" + "add %[b_ptr], %[b_ptr], #96\n" + "fmla v31.4s, %[b2].4s, %[a1a].s[3]\n" + "b 3f\n" + + // Detached final iteration. 
(odd K) + "2:\n" + + "fmla v8.4s , %[b0].4s, %[a0].s[0]\n" + "ldr %d[b2], [%[b_ptr], #32]\n" + "fmla v9.4s , %[b0].4s, %[a0].s[1]\n" + "ldr x20, [%[b_ptr], #40]\n" + "fmla v10.4s, %[b0].4s, %[a0].s[2]\n" + "fmla v11.4s, %[b0].4s, %[a0].s[3]\n" + "fmla v12.4s, %[b0].4s, %[a1].s[0]\n" + "ins %[b2].d[1], x20\n" + "fmla v13.4s, %[b0].4s, %[a1].s[1]\n" + "fmla v14.4s, %[b0].4s, %[a1].s[2]\n" + "fmla v15.4s, %[b0].4s, %[a1].s[3]\n" + "fmla v16.4s, %[b1].4s, %[a0].s[0]\n" + "fmla v17.4s, %[b1].4s, %[a0].s[1]\n" + "fmla v18.4s, %[b1].4s, %[a0].s[2]\n" + "fmla v19.4s, %[b1].4s, %[a0].s[3]\n" + "fmla v20.4s, %[b1].4s, %[a1].s[0]\n" + "fmla v21.4s, %[b1].4s, %[a1].s[1]\n" + "fmla v22.4s, %[b1].4s, %[a1].s[2]\n" + "fmla v23.4s, %[b1].4s, %[a1].s[3]\n" + "fmla v24.4s, %[b2].4s, %[a0].s[0]\n" + "fmla v25.4s, %[b2].4s, %[a0].s[1]\n" + "fmla v26.4s, %[b2].4s, %[a0].s[2]\n" + "fmla v27.4s, %[b2].4s, %[a0].s[3]\n" + "fmla v28.4s, %[b2].4s, %[a1].s[0]\n" + "fmla v29.4s, %[b2].4s, %[a1].s[1]\n" + "add %[a_ptr], %[a_ptr], #32\n" + "fmla v30.4s, %[b2].4s, %[a1].s[2]\n" + "add %[b_ptr], %[b_ptr], #48\n" + "fmla v31.4s, %[b2].4s, %[a1].s[3]\n" + + // Common tail + "3:\n" + "str q8, [%[c_ptr]]\n" + "str q16, [%[c_ptr], #16]\n" + "str q24, [%[c_ptr], #32]\n" + "str q9, [%[c_ptr], #48]\n" + "str q17, [%[c_ptr], #64]\n" + "str q25, [%[c_ptr], #80]\n" + "str q10, [%[c_ptr], #96]\n" + "str q18, [%[c_ptr], #112]\n" + "str q26, [%[c_ptr], #128]\n" + "str q11, [%[c_ptr], #144]\n" + "str q19, [%[c_ptr], #160]\n" + "str q27, [%[c_ptr], #176]\n" + "str q12, [%[c_ptr], #192]\n" + "str q20, [%[c_ptr], #208]\n" + "str q28, [%[c_ptr], #224]\n" + "str q13, [%[c_ptr], #240]\n" + "str q21, [%[c_ptr], #256]\n" + "str q29, [%[c_ptr], #272]\n" + "str q14, [%[c_ptr], #288]\n" + "str q22, [%[c_ptr], #304]\n" + "str q30, [%[c_ptr], #320]\n" + "str q15, [%[c_ptr], #336]\n" + "str q23, [%[c_ptr], #352]\n" + "str q31, [%[c_ptr], #368]\n" + "add %[c_ptr], %[c_ptr], #384\n" + : + [a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [c_ptr] "+r" (c_ptr), + [a0] "+w" (a0), [a1] "+w" (a1), [a0a] "+w" (a0a), [a1a] "+w" (a1a), + [b0] "+w" (b0), [b1] "+w" (b1), [b2] "+w" (b2), [k] "+r" (k) + : [oddk] "r" (oddk) + : "x20", "x21", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", + "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc" + ); + } + } +} diff --git a/arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemm_12x8/generic.hpp b/arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemm_12x8/generic.hpp index 082c20064..c4a5875a3 100644 --- a/arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemm_12x8/generic.hpp +++ b/arm_compute/core/NEON/kernels/assembly/kernels/a64_sgemm_12x8/generic.hpp @@ -181,7 +181,7 @@ inline void a64_sgemm_asimd_12x8_jumps(const float *Apanel, const float *Bpanel, "4:\n" // Branch to alternative tail for odd K - "cbnz %[oddk], 2f\n" + "cbnz %w[oddk], 2f\n" // Detached final iteration (even K) "fmla v8.4s , %[b0].4s, %[a0].s[0]\n" @@ -347,7 +347,7 @@ inline void a64_sgemm_asimd_12x8_jumps(const float *Apanel, const float *Bpanel, [b0] "+w" (b0), [b1] "+w" (b1), [b2] "+w" (b2), [k] "+r" (k) : [oddk] "r" (oddk), [row_jump] "r" (row_jump), [block_jump] "r" (block_jump) : "x20", "x21", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", - "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31" + "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc" ); } } diff --git 
a/arm_compute/core/NEON/kernels/assembly/merges/a64_merge_float_12x8.hpp b/arm_compute/core/NEON/kernels/assembly/merges/a64_merge_float_12x8.hpp index f2c5fd86b..e8edddb4f 100644 --- a/arm_compute/core/NEON/kernels/assembly/merges/a64_merge_float_12x8.hpp +++ b/arm_compute/core/NEON/kernels/assembly/merges/a64_merge_float_12x8.hpp @@ -226,7 +226,7 @@ inline void MergeResults<12, 8>(float *out, const float *in, const int ldout, co [outptr4] "+r" (outptr4), [outptr5] "+r" (outptr5), [outptr6] "+r" (outptr6), [outptr7] "+r" (outptr7), [inptr] "+r" (inptr) : [av] "w" (av), [bv] "w" (bv) - : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q16", "q17", "q18", "q19", "q20", "q21" + : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v16", "v17", "v18", "v19", "v20", "v21" ); } } diff --git a/arm_compute/core/NEON/kernels/assembly/profiler.hpp b/arm_compute/core/NEON/kernels/assembly/profiler.hpp index d2f8ba923..f7a1d1c70 100644 --- a/arm_compute/core/NEON/kernels/assembly/profiler.hpp +++ b/arm_compute/core/NEON/kernels/assembly/profiler.hpp @@ -31,6 +31,7 @@ class profiler { private: static const int maxevents = 10000; unsigned long times[maxevents]; + unsigned long units[maxevents]; int events[maxevents]; int currentevent; int countfd; @@ -45,35 +46,38 @@ public: close(countfd); int tots[5]; unsigned long counts[5]; + unsigned long tunits[5]; const char * descs[] = { "Prepare A", "Prepare B", "Kernel", "Merge" }; for (int i=1; i<5; i++) { tots[i] = 0; counts[i] = 0; + tunits[i] = 0; } printf("Profiled events:\n"); for (int i=0; i<currentevent; i++) { - printf("%10s: %ld\n", descs[events[i]-1], times[i]); tots[events[i]]++; counts[events[i]] += times[i]; + tunits[events[i]] += units[i]; } - printf("%20s %9s %9s %9s\n", "", "Events", "Total", "Average"); + printf("%20s %9s %9s %9s %12s %9s\n", "", "Events", "Total", "Average", "Bytes/MACs", "Per cycle"); for (int i=1; i<5; i++) { - printf("%20s: %9d %9ld %9ld\n",descs[i-1],tots[i],counts[i],counts[i]/tots[i]); + printf("%20s: %9d %9ld %9ld %12lu %9.2f\n",descs[i-1],tots[i],counts[i],counts[i]/tots[i],tunits[i],(float)tunits[i]/counts[i]); } } template <typename T> - void operator() (int i, T func) { + void operator() (int i, unsigned long u, T func) { if (currentevent==maxevents) { func(); } else { + events[currentevent] = i; + units[currentevent] = u; start_counter(countfd); func(); long long cycs = stop_counter(countfd); - events[currentevent] = i; times[currentevent++] = cycs; } } @@ -84,7 +88,7 @@ public: class profiler { public: template <typename T> - void operator() (int i, T func) { + void operator() (int i, unsigned long u, T func) { func(); } }; @@ -95,3 +99,5 @@ public: #define PROFILE_PREPB 2 #define PROFILE_KERNEL 3 #define PROFILE_MERGE 4 + + diff --git a/arm_compute/core/NEON/kernels/assembly/transforms/a64_interleave_8way_16bit.hpp b/arm_compute/core/NEON/kernels/assembly/transforms/a64_interleave_8way_16bit.hpp new file mode 100644 index 000000000..0c23cebe6 --- /dev/null +++ b/arm_compute/core/NEON/kernels/assembly/transforms/a64_interleave_8way_16bit.hpp @@ -0,0 +1,165 @@ +/* + * Copyright (c) 2017 ARM Limited. 
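Looking back at the profiler change above: operator() now takes a unit count (bytes moved or multiply-accumulates performed) with each timed event, and the summary gains "Bytes/MACs" and "Per cycle" columns. A hypothetical call site, where only the profiler interface and the PROFILE_* constants come from the header and everything else is a made-up placeholder:

    // Hypothetical usage of the extended profiler interface (illustrative only;
    // run_sgemm_kernel and the panel pointers are placeholders, not library API).
    void profile_kernel_example(const float *a_panel, const float *b_panel, float *c_panel,
                                unsigned long M, unsigned long N, unsigned long K)
    {
        profiler prof;
        prof(PROFILE_KERNEL, M * N * K, [&]() {
            run_sgemm_kernel(a_panel, b_panel, c_panel);   // placeholder for the real kernel
        });
        // With cycle profiling built in, the summary now also reports the accumulated
        // unit count and units per cycle for each event class.
    }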
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#pragma once + +#ifdef __aarch64__ + +#include <arm_neon.h> +#include "../asmlib.hpp" + + +template<> +template<typename T> +void TransformImpl<8, 1, false, 2, 2>::Transform(T *out, const T *in, int ldin, int y0, int ymax, int k0, int kmax) { + uint16_t *outptr = (uint16_t *)out; + const uint16_t *inptr = (const uint16_t *)in; + + uint16_t zerobuff[24]; + + for (int y=y0; y<ymax; y+=8) { + const uint16_t *inptr0 = inptr + y * ldin + k0; + const uint16_t *inptr1 = inptr0 + ldin; + const uint16_t *inptr2 = inptr1 + ldin; + const uint16_t *inptr3 = inptr2 + ldin; + const uint16_t *inptr4 = inptr3 + ldin; + const uint16_t *inptr5 = inptr4 + ldin; + const uint16_t *inptr6 = inptr5 + ldin; + const uint16_t *inptr7 = inptr6 + ldin; + + prefetch_2x(inptr0); + prefetch_2x(inptr1); + prefetch_2x(inptr2); + prefetch_2x(inptr3); + prefetch_2x(inptr4); + prefetch_2x(inptr5); + prefetch_2x(inptr6); + prefetch_2x(inptr7); + + int x=(kmax-k0); + for (;x>7;x-=8) { + /* Cope with ragged cases by copying from a buffer of zeroes instead */ + if ((y + 7) >= ymax) { + switch ((y + 7) - ymax) { + /* Everything falls through in here */ + case 6: + inptr1 = zerobuff; + case 5: + inptr2 = zerobuff; + case 4: + inptr3 = zerobuff; + case 3: + inptr4 = zerobuff; + case 2: + inptr5 = zerobuff; + case 1: + inptr6 = zerobuff; + case 0: + inptr7 = zerobuff; + default: + break; + } + } + + int skippf = (x & 31); + __asm __volatile ( + // Load up 8 elements (1 vector) from each of 8 sources. + "CBNZ %w[skippf], 1f\n" + ASM_PREFETCH("[%[inptr0], #128]") + ASM_PREFETCH("[%[inptr1], #128]") + ASM_PREFETCH("[%[inptr2], #128]") + ASM_PREFETCH("[%[inptr3], #128]") + "1:\n" + + "LDR q0, [%[inptr0]], #16\n" // q0=A0A1A2A3A4A5A6A7 + "LDR q4, [%[inptr4]], #16\n" // q8=E0E1E2E3E4E5E6E7 + "LDR q2, [%[inptr2]], #16\n" // q4=C0C1C2C3... + "LDR q6, [%[inptr6]], #16\n" + "ZIP1 v8.8h, v0.8h, v4.8h\n" // q8=A0E0A1E1A2E2A3E3 + "ZIP2 v16.8h, v0.8h, v4.8h\n" // q16=A4E4A5E5A6E6A7E7 + "ZIP1 v9.8h, v2.8h, v6.8h\n" // q9=C0G0C1G1C2G2C3G3 + "ZIP2 v17.8h, v2.8h, v6.8h\n" // q17=C4G4C5G5C6G6C7G7 + "LDR q1, [%[inptr1]], #16\n" // q1=B0B1B2B3B4B5B6B7 + "LDR q5, [%[inptr5]], #16\n" + "LDR q3, [%[inptr3]], #16\n" // q3=D0D1D2D3.... 
+ "LDR q7, [%[inptr7]], #16\n" + "ZIP1 v10.8h, v1.8h, v5.8h\n" // q18=B0F0B1F1B2F2B3F3 + "ZIP2 v18.8h, v1.8h, v5.8h\n" // q18=B4F4B5F5B6F6B7F7 + "ZIP1 v11.8h, v3.8h, v7.8h\n" // q19=D0H0D1H1D2H2D3H3 + "ZIP2 v19.8h, v3.8h, v7.8h\n" // q19=D4H4D5H5D6H6D7H7 + + "ZIP1 v12.8h, v8.8h, v9.8h\n" // q20=A0C0E0G0A1C1E1G1 + "ZIP2 v20.8h, v8.8h, v9.8h\n" + "ZIP1 v13.8h, v10.8h, v11.8h\n" // q21=B0D0F0H0B1I1F1H1 + "ZIP2 v21.8h, v10.8h, v11.8h\n" + + "CBNZ %w[skippf], 2f\n" + ASM_PREFETCH("[%[inptr4], #112]") + ASM_PREFETCH("[%[inptr5], #112]") + ASM_PREFETCH("[%[inptr6], #112]") + ASM_PREFETCH("[%[inptr7], #112]") + "2:\n" + + "ZIP1 v22.8h, v16.8h, v17.8h\n" + "ZIP2 v30.8h, v16.8h, v17.8h\n" + "ZIP1 v23.8h, v18.8h, v19.8h\n" + "ZIP2 v31.8h, v18.8h, v19.8h\n" + + "ZIP1 v14.8h, v12.8h, v13.8h\n" // q22=A0B0C0D0E0F0G0H0 + "ZIP2 v15.8h, v12.8h, v13.8h\n" // q23=A1B1C1D1E1F1G1H1 + "STP q14, q15, [%[outptr]], #32\n" // Write back first two elements + + "ZIP1 v0.8h, v20.8h, v21.8h\n" + "ZIP2 v1.8h, v20.8h, v21.8h\n" + "STP q0, q1, [%[outptr]], #32\n" // Write back next two elements + + "ZIP1 v2.8h, v22.8h, v23.8h\n" + "ZIP2 v3.8h, v22.8h, v23.8h\n" + "STP q2, q3, [%[outptr]], #32\n" // Write back next two elements + + "ZIP1 v4.8h, v30.8h, v31.8h\n" + "ZIP2 v5.8h, v30.8h, v31.8h\n" + "STP q4, q5, [%[outptr]], #32\n" // Write back last two elements + : [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3), + [inptr4] "+r" (inptr4), [inptr5] "+r" (inptr5), [inptr6] "+r" (inptr6), [inptr7] "+r" (inptr7), [outptr] "+r" (outptr) + : [skippf] "r" (skippf) + : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", + "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", + "v25", "v26", "v27", "v28", "v29", "v30", "v31" + ); + } + + for (;x>0;x--) { + *outptr++ = *inptr0++; + *outptr++ = *inptr1++; + *outptr++ = *inptr2++; + *outptr++ = *inptr3++; + *outptr++ = *inptr4++; + *outptr++ = *inptr5++; + *outptr++ = *inptr6++; + *outptr++ = *inptr7++; + } + } +} + +#endif // __aarch64__ diff --git a/arm_compute/core/NEON/kernels/assembly/transforms/a64_transpose_interleave_24way_16bit.hpp b/arm_compute/core/NEON/kernels/assembly/transforms/a64_transpose_interleave_24way_16bit.hpp new file mode 100644 index 000000000..e440e3288 --- /dev/null +++ b/arm_compute/core/NEON/kernels/assembly/transforms/a64_transpose_interleave_24way_16bit.hpp @@ -0,0 +1,130 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#pragma once + +#ifdef __aarch64__ + +#include "transpose_interleave_common.hpp" + +// Generic unblocked transposed 12x32-bit sized specialisation +template <> +template <typename T> +inline void TransformImpl<12, 1, true, 4, 4>::Transform( + T* out, const T* const in, const int stride, + const int x0, const int xmax, const int k0, const int kmax +) { + // Redirect to a 24 x uint16_t specialisation + TransformImpl<24, 1, true, 2, 2>::Transform( + reinterpret_cast<uint16_t *>(out), + reinterpret_cast<const uint16_t * const>(in), + stride*2, x0*2, xmax*2, k0, kmax + ); +} + +// Generic 24x16-bit sized specialisation +template <> +template <typename T> +inline void TransformImpl<24, 1, true, 2, 2>::Transform( + T* out, const T* const in, const int stride, + const int x0, const int xmax, const int k0, const int kmax +) { + // Redirect to a uint16_t specialisation + Transform( + reinterpret_cast<uint16_t *>(out), + reinterpret_cast<const uint16_t * const>(in), + stride, x0, xmax, k0, kmax + ); +} + +// Specialised 24 x uint16_t version +template <> +inline void TransposeInterleaveCommon<24, uint16_t, uint16_t>::moveblock_1x1(const uint16_t *&in0, uint16_t *out) { + __asm __volatile ( + "LDP q0, q1, [%[in0]], #32\n" + "STP q0, q1, [%[out]]\n" + ASM_PREFETCH("[%[in0], #192]") + "LDR q2, [%[in0]], #16\n" + "STR q2, [%[out], #32]\n" + : [in0] "+r" (in0), [out] "+r" (out) + : + : "v0", "v1", "v2", "memory" + ); +} + +template <> +inline void TransposeInterleaveCommon<24, uint16_t, uint16_t>::moveblock_1x2(const uint16_t *&in0, const uint16_t *&in1,uint16_t *out) { + __asm __volatile ( + "LDP q0, q1, [%[in0]], #32\n" + "STP q0, q1, [%[out]]\n" + ASM_PREFETCH("[%[in0], #192]") + "LDR q2, [%[in0]], #16\n" + "LDP q3, q4, [%[in1]], #32\n" + "STP q2, q3, [%[out], #32]\n" + ASM_PREFETCH("[%[in1], #192]") + "LDR q5, [%[in1]], #16\n" + "STP q4, q5, [%[out], #64]\n" + : [in0] "+r" (in0), [in1] "+r" (in1), [out] "+r" (out) + : + : "v0", "v1", "v2", "v3", "v4", "v5", "memory" + ); +} + +template <> +inline void TransposeInterleaveCommon<24, uint16_t, uint16_t>::moveblock_1x4(const uint16_t *&in0, const uint16_t *&in1, const uint16_t *&in2, const uint16_t *&in3, uint16_t *out) { + __asm __volatile ( + "LDP q0, q1, [%[in0]], #32\n" + "STP q0, q1, [%[out]]\n" + "LDR q2, [%[in0]], #16\n" + ASM_PREFETCH("[%[in0], #192]") + "LDP q3, q4, [%[in1]], #32\n" + "STP q2, q3, [%[out], #32]\n" + "LDR q5, [%[in1]], #16\n" + ASM_PREFETCH("[%[in1], #192]") + "STP q4, q5, [%[out], #64]\n" + "LDP q6, q7, [%[in2]], #32\n" + "STP q6, q7, [%[out], #96]\n" + "LDR q8, [%[in2]], #16\n" + ASM_PREFETCH("[%[in2], #192]") + "LDP q9, q10, [%[in3]], #32\n" + "STP q8, q9, [%[out], #128]\n" + "LDR q11, [%[in3]], #16\n" + "STP q10, q11, [%[out], #160]\n" + ASM_PREFETCH("[%[in3], #192]") + + : [in0] "+r" (in0), [in1] "+r" (in1), [in2] "+r" (in2), [in3] "+r" (in3), [out] "+r" (out) + : + : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "memory" + ); +} + +template <> +template <> +inline void TransformImpl<24, 1, true, 2, 2>::Transform( + uint16_t* out, const uint16_t* const in, const int stride, + const int x0, const int xmax, const int k0, const int kmax +) { + TransposeInterleaveCommon<24, uint16_t, uint16_t>::Transform(out, in, stride, x0, 
xmax, k0, kmax); +} + +#endif // __arch64__ diff --git a/arm_compute/core/NEON/kernels/assembly/transforms/list.hpp b/arm_compute/core/NEON/kernels/assembly/transforms/list.hpp index 13e1b5468..8a2213f7f 100644 --- a/arm_compute/core/NEON/kernels/assembly/transforms/list.hpp +++ b/arm_compute/core/NEON/kernels/assembly/transforms/list.hpp @@ -23,10 +23,10 @@ */ #include "a32_interleave_6way_32bit.hpp" #include "a32_transpose_interleave_8way_32bit.hpp" -//#include "a64_interleave_8way_16bit.hpp" +#include "a64_interleave_8way_16bit.hpp" #include "a64_interleave_8way_32bit.hpp" //#include "a64_interleave_8way_half_to_float.hpp" //#include "a64_transpose_interleave_12way_16bit.hpp" //#include "a64_transpose_interleave_12way_half_to_float.hpp" -//#include "a64_transpose_interleave_24way_16bit.hpp" +#include "a64_transpose_interleave_24way_16bit.hpp" #include "transpose_interleave_common.hpp" diff --git a/arm_compute/core/NEON/kernels/convolution/NEDirectConvolution3x3.h b/arm_compute/core/NEON/kernels/convolution/NEDirectConvolution3x3.h new file mode 100644 index 000000000..7f39e5ee8 --- /dev/null +++ b/arm_compute/core/NEON/kernels/convolution/NEDirectConvolution3x3.h @@ -0,0 +1,172 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifndef __ARM_COMPUTE_NECONVOLUTIONKERNEL3x3_H__ +#define __ARM_COMPUTE_NECONVOLUTIONKERNEL3x3_H__ + +#include <arm_neon.h> + +namespace arm_compute +{ +namespace detail +{ +inline float32x4x3_t load_matrix_row(const float *ptr) +{ + const float32x4x3_t r = + { + { + vld1q_dup_f32(ptr), + vld1q_dup_f32(1 + ptr), + vld1q_dup_f32(2 + ptr) + } + }; + return r; +} + +template <unsigned int stridex> +float32x4x2_t convolve_3x3(const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2, int fixed_point_position); + +template <> +inline float32x4x2_t convolve_3x3<1>(const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2, int fixed_point_position) +{ + ARM_COMPUTE_UNUSED(fixed_point_position); + + const float32x4x3_t vtop = + { + { + vld1q_f32(in_top), + vld1q_f32(in_top + 4), + vld1q_f32(in_top + 8) + } + }; + const float32x4x3_t vmid = + { + { + vld1q_f32(in_mid), + vld1q_f32(in_mid + 4), + vld1q_f32(in_mid + 8) + } + }; + const float32x4x3_t vlow = + { + { + vld1q_f32(in_low), + vld1q_f32(in_low + 4), + vld1q_f32(in_low + 8) + } + }; + float32x4x2_t out = + { + { + vmulq_f32(vtop.val[0], m0.val[0]), + vmulq_f32(vtop.val[1], m0.val[0]) + } + }; + out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vtop.val[0], vtop.val[1], 1), m0.val[1]); + out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vtop.val[0], vtop.val[1], 2), m0.val[2]); + + out.val[0] = vmlaq_f32(out.val[0], vmid.val[0], m1.val[0]); + out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vmid.val[0], vmid.val[1], 1), m1.val[1]); + out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vmid.val[0], vmid.val[1], 2), m1.val[2]); + + out.val[0] = vmlaq_f32(out.val[0], vlow.val[0], m2.val[0]); + out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vlow.val[0], vlow.val[1], 1), m2.val[1]); + out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vlow.val[0], vlow.val[1], 2), m2.val[2]); + + out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vtop.val[1], vtop.val[2], 1), m0.val[1]); + out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vtop.val[1], vtop.val[2], 2), m0.val[2]); + + out.val[1] = vmlaq_f32(out.val[1], vmid.val[1], m1.val[0]); + out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vmid.val[1], vmid.val[2], 1), m1.val[1]); + out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vmid.val[1], vmid.val[2], 2), m1.val[2]); + + out.val[1] = vmlaq_f32(out.val[1], vlow.val[1], m2.val[0]); + out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vlow.val[1], vlow.val[2], 1), m2.val[1]); + out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vlow.val[1], vlow.val[2], 2), m2.val[2]); + return out; +} + +template <> +inline float32x4x2_t convolve_3x3<2>(const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2, int fixed_point_position) +{ + float32x4x2_t out = convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2, fixed_point_position); + out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[0], 2), out.val[0], 1); + out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[1], 0), out.val[0], 2); + out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[1], 2), out.val[0], 3); + return out; +} + +template <> +inline float32x4x2_t convolve_3x3<3>(const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2, int fixed_point_position) +{ + float32x4x2_t out = convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2, fixed_point_position); + 
out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[0], 3), out.val[0], 1); + return out; +} + +template <unsigned int stridex> +void store_results(float *buffer, const float32x4x2_t &values); + +template <> +void store_results<1>(float *buffer, const float32x4x2_t &values) +{ + vst1q_f32(buffer, values.val[0]); + vst1q_f32(buffer + 4, values.val[1]); +} + +template <> +void store_results<2>(float *buffer, const float32x4x2_t &values) +{ + vst1q_f32(buffer, values.val[0]); +} + +template <> +void store_results<3>(float *buffer, const float32x4x2_t &values) +{ + vst1_f32(buffer, vget_low_f32(values.val[0])); +} + +template <unsigned int stridex> +int get_input_num_elems_processed(unsigned int num_elems_written_per_iteration); + +template <> +int get_input_num_elems_processed<1>(unsigned int num_elems_written_per_iteration) +{ + return num_elems_written_per_iteration; +} + +template <> +int get_input_num_elems_processed<2>(unsigned int num_elems_written_per_iteration) +{ + return num_elems_written_per_iteration << 1; +} + +template <> +int get_input_num_elems_processed<3>(unsigned int num_elems_written_per_iteration) +{ + return num_elems_written_per_iteration * 3; +} +} +} // namespace arm_compute +#endif /* __ARM_COMPUTE_NECONVOLUTIONKERNEL3x3_H__ */
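A minimal sketch of how the helpers in this header compose for a stride-1 float path; only the helper signatures come from the header above, while the traversal, padding assumptions and names are illustrative:

    // Illustrative only: one horizontal sweep of a stride-1 3x3 float convolution
    // built from the helpers above. "weights" is assumed to hold the 3x3 kernel
    // row-major (9 contiguous floats); each step reads 12 consecutive inputs per
    // row, so the caller must allow 4 elements of over-read at the right edge.
    #include <arm_neon.h>

    void convolve_row_stride1(const float *top, const float *mid, const float *low,
                              const float *weights, float *out, int num_steps)
    {
        using namespace arm_compute::detail;
        const float32x4x3_t m0 = load_matrix_row(weights);      // kernel row 0
        const float32x4x3_t m1 = load_matrix_row(weights + 3);  // kernel row 1
        const float32x4x3_t m2 = load_matrix_row(weights + 6);  // kernel row 2
        for (int s = 0; s < num_steps; ++s)
        {
            // 8 outputs per step; fixed_point_position is ignored for float.
            const float32x4x2_t res = convolve_3x3<1>(top, mid, low, m0, m1, m2, 0);
            store_results<1>(out, res);
            const int step = get_input_num_elems_processed<1>(8);  // 8 inputs per row
            top += step; mid += step; low += step;
            out += 8;
        }
    }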
\ No newline at end of file diff --git a/arm_compute/core/NEON/kernels/convolution/NEDirectConvolutionDetail.h b/arm_compute/core/NEON/kernels/convolution/NEDirectConvolutionDetail.h new file mode 100644 index 000000000..c35855861 --- /dev/null +++ b/arm_compute/core/NEON/kernels/convolution/NEDirectConvolutionDetail.h @@ -0,0 +1,503 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef __ARM_COMPUTE_NEDIRECTCONVOLUTIONDETAIL_H__ +#define __ARM_COMPUTE_NEDIRECTCONVOLUTIONDETAIL_H__ + +#include "arm_compute/core/AccessWindowStatic.h" +#include "arm_compute/core/NEON/NEFixedPoint.h" + +#include <arm_neon.h> + +namespace arm_compute +{ +namespace detail +{ +/** Loads a 3x3 matrix as a row (float). + * + * @param[in] ptr Pointer to a float 3x3 matrix. + * + * @return The loaded matrix. + */ +inline float32x4x3_t load_matrix_row(const float *ptr) +{ + const float32x4x3_t r = + { + { + vld1q_dup_f32(ptr), + vld1q_dup_f32(1 + ptr), + vld1q_dup_f32(2 + ptr) + } + }; + return r; +} + +/** Loads a 3x3 matrix as a row (qint8_t). + * + * @param[in] ptr Pointer to a qint8 3x3 matrix. + * + * @return The loaded matrix. + */ +inline qint8x8x3_t load_matrix_row(const qint8_t *ptr) +{ + /* ptr is a pointer to a row in a 3x3 matrix, the function returns 3 vectors holding exactly the same value in all lanes: + r.val[0] contains the first element, r.val[1] the second element and r.val[2] the third element (in all lanes) */ + const qint8x8x3_t r = + { + { + vld1_dup_qs8(ptr), + vld1_dup_qs8(1 + ptr), + vld1_dup_qs8(2 + ptr) + } + }; + return r; +} + +/** Perform a convolve3x3 on float32. + * + * @param[in] in_top Pointer to the first row of the input. + * @param[in] in_mid Pointer to the second row of the input. + * @param[in] in_low Pointer to the third row of the input. + * @param[in] m0 First row of the filter. + * @param[in] m1 Second row of the filter. + * @param[in] m2 Third row of the filter. + * @param[in] fixed_point_position (Optional) Fixed point position. 
+ * + */ +template <unsigned int stridex> +float32x4x2_t convolve_3x3(const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2, int fixed_point_position); + +template <> +inline float32x4x2_t convolve_3x3<1>(const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2, int fixed_point_position) +{ + ARM_COMPUTE_UNUSED(fixed_point_position); + + const float32x4x3_t vtop = + { + { + vld1q_f32(in_top), + vld1q_f32(in_top + 4), + vld1q_f32(in_top + 8) + } + }; + const float32x4x3_t vmid = + { + { + vld1q_f32(in_mid), + vld1q_f32(in_mid + 4), + vld1q_f32(in_mid + 8) + } + }; + const float32x4x3_t vlow = + { + { + vld1q_f32(in_low), + vld1q_f32(in_low + 4), + vld1q_f32(in_low + 8) + } + }; + float32x4x2_t out = + { + { + vmulq_f32(vtop.val[0], m0.val[0]), + vmulq_f32(vtop.val[1], m0.val[0]) + } + }; + out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vtop.val[0], vtop.val[1], 1), m0.val[1]); + out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vtop.val[0], vtop.val[1], 2), m0.val[2]); + + out.val[0] = vmlaq_f32(out.val[0], vmid.val[0], m1.val[0]); + out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vmid.val[0], vmid.val[1], 1), m1.val[1]); + out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vmid.val[0], vmid.val[1], 2), m1.val[2]); + + out.val[0] = vmlaq_f32(out.val[0], vlow.val[0], m2.val[0]); + out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vlow.val[0], vlow.val[1], 1), m2.val[1]); + out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vlow.val[0], vlow.val[1], 2), m2.val[2]); + + out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vtop.val[1], vtop.val[2], 1), m0.val[1]); + out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vtop.val[1], vtop.val[2], 2), m0.val[2]); + + out.val[1] = vmlaq_f32(out.val[1], vmid.val[1], m1.val[0]); + out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vmid.val[1], vmid.val[2], 1), m1.val[1]); + out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vmid.val[1], vmid.val[2], 2), m1.val[2]); + + out.val[1] = vmlaq_f32(out.val[1], vlow.val[1], m2.val[0]); + out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vlow.val[1], vlow.val[2], 1), m2.val[1]); + out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vlow.val[1], vlow.val[2], 2), m2.val[2]); + return out; +} + +template <> +inline float32x4x2_t convolve_3x3<2>(const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2, int fixed_point_position) +{ + float32x4x2_t out = convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2, fixed_point_position); + out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[0], 2), out.val[0], 1); + out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[1], 0), out.val[0], 2); + out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[1], 2), out.val[0], 3); + return out; +} + +template <> +inline float32x4x2_t convolve_3x3<3>(const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2, int fixed_point_position) +{ + float32x4x2_t out = convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2, fixed_point_position); + out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[0], 3), out.val[0], 1); + return out; +} + +/** Perform a convolve3x3 on qint16. + * + * @param[in] in_top Pointer to the first row of the input. + * @param[in] in_mid Pointer to the second row of the input. + * @param[in] in_low Pointer to the third row of the input. + * @param[in] m0 First row of the filter. 
+ * @param[in] m1 Second row of the filter. + * @param[in] m2 Third row of the filter. + * @param[in] fixed_point_position (Optional) Fixed point position. + * + */ +template <unsigned int stridex> +qint16x8x2_t convolve_3x3(const qint8_t *in_top, const qint8_t *in_mid, const qint8_t *in_low, const qint8x8x3_t &m0, const qint8x8x3_t &m1, const qint8x8x3_t &m2, int fixed_point_position); + +template <> +inline qint16x8x2_t convolve_3x3<1>(const qint8_t *in_top, const qint8_t *in_mid, const qint8_t *in_low, const qint8x8x3_t &m0, const qint8x8x3_t &m1, const qint8x8x3_t &m2, int fixed_point_position) +{ + ARM_COMPUTE_UNUSED(fixed_point_position); + + const qint8x8x3_t vtop = + { + { + vld1_qs8(in_top), + vld1_qs8(in_top + 8), + vld1_qs8(in_top + 16) + } + }; + const qint8x8x3_t vmid = + { + { + vld1_qs8(in_mid), + vld1_qs8(in_mid + 8), + vld1_qs8(in_mid + 16) + } + }; + const qint8x8x3_t vlow = + { + { + vld1_qs8(in_low), + vld1_qs8(in_low + 8), + vld1_qs8(in_low + 16) + } + }; + qint16x8x2_t out = + { + { + vmull_qs8(vtop.val[0], m0.val[0], fixed_point_position), + vmull_qs8(vtop.val[1], m0.val[0], fixed_point_position) + } + }; + out.val[0] = vqmlal_qs8(out.val[0], vext_s8(vtop.val[0], vtop.val[1], 1), m0.val[1], fixed_point_position); + out.val[0] = vqmlal_qs8(out.val[0], vext_s8(vtop.val[0], vtop.val[1], 2), m0.val[2], fixed_point_position); + out.val[0] = vqmlal_qs8(out.val[0], vmid.val[0], m1.val[0], fixed_point_position); + out.val[0] = vqmlal_qs8(out.val[0], vext_s8(vmid.val[0], vmid.val[1], 1), m1.val[1], fixed_point_position); + out.val[0] = vqmlal_qs8(out.val[0], vext_s8(vmid.val[0], vmid.val[1], 2), m1.val[2], fixed_point_position); + out.val[0] = vqmlal_qs8(out.val[0], vlow.val[0], m2.val[0], fixed_point_position); + out.val[0] = vqmlal_qs8(out.val[0], vext_s8(vlow.val[0], vlow.val[1], 1), m2.val[1], fixed_point_position); + out.val[0] = vqmlal_qs8(out.val[0], vext_s8(vlow.val[0], vlow.val[1], 2), m2.val[2], fixed_point_position); + out.val[1] = vqmlal_qs8(out.val[1], vext_s8(vtop.val[1], vtop.val[2], 1), m0.val[1], fixed_point_position); + out.val[1] = vqmlal_qs8(out.val[1], vext_s8(vtop.val[1], vtop.val[2], 2), m0.val[2], fixed_point_position); + out.val[1] = vqmlal_qs8(out.val[1], vmid.val[1], m1.val[0], fixed_point_position); + out.val[1] = vqmlal_qs8(out.val[1], vext_s8(vmid.val[1], vmid.val[2], 1), m1.val[1], fixed_point_position); + out.val[1] = vqmlal_qs8(out.val[1], vext_s8(vmid.val[1], vmid.val[2], 2), m1.val[2], fixed_point_position); + out.val[1] = vqmlal_qs8(out.val[1], vlow.val[1], m2.val[0], fixed_point_position); + out.val[1] = vqmlal_qs8(out.val[1], vext_s8(vlow.val[1], vlow.val[2], 1), m2.val[1], fixed_point_position); + out.val[1] = vqmlal_qs8(out.val[1], vext_s8(vlow.val[1], vlow.val[2], 2), m2.val[2], fixed_point_position); + return out; +} + +template <> +inline qint16x8x2_t convolve_3x3<2>(const qint8_t *in_top, const qint8_t *in_mid, const qint8_t *in_low, const qint8x8x3_t &m0, const qint8x8x3_t &m1, const qint8x8x3_t &m2, int fixed_point_position) +{ + qint16x8x2_t out = convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2, fixed_point_position); + out.val[0] = vsetq_lane_s16(vgetq_lane_s16(out.val[0], 2), out.val[0], 1); + out.val[0] = vsetq_lane_s16(vgetq_lane_s16(out.val[0], 4), out.val[0], 2); + out.val[0] = vsetq_lane_s16(vgetq_lane_s16(out.val[0], 6), out.val[0], 3); + out.val[0] = vsetq_lane_s16(vgetq_lane_s16(out.val[1], 0), out.val[0], 4); + out.val[0] = vsetq_lane_s16(vgetq_lane_s16(out.val[1], 2), out.val[0], 5); + out.val[0] = 
vsetq_lane_s16(vgetq_lane_s16(out.val[1], 4), out.val[0], 6); + out.val[0] = vsetq_lane_s16(vgetq_lane_s16(out.val[1], 6), out.val[0], 7); + return out; +} + +template <> +inline qint16x8x2_t convolve_3x3<3>(const qint8_t *in_top, const qint8_t *in_mid, const qint8_t *in_low, const qint8x8x3_t &m0, const qint8x8x3_t &m1, const qint8x8x3_t &m2, int fixed_point_position) +{ + qint16x8x2_t out = convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2, fixed_point_position); + out.val[0] = vsetq_lane_s16(vgetq_lane_s16(out.val[0], 3), out.val[0], 1); + out.val[0] = vsetq_lane_s16(vgetq_lane_s16(out.val[0], 6), out.val[0], 2); + out.val[0] = vsetq_lane_s16(vgetq_lane_s16(out.val[1], 1), out.val[0], 3); + return out; +} + +/** Stores a float32x4x2_t array into a memory location. + * + * @param[in] buffer Pointer to the memory location where the values will be stored. + * @param[in] values Values that will be stored. + * + */ +template <unsigned int stridex> +void store_results(float *buffer, const float32x4x2_t &values); + +template <> +inline void store_results<1>(float *buffer, const float32x4x2_t &values) +{ + vst1q_f32(buffer, values.val[0]); + vst1q_f32(buffer + 4, values.val[1]); +} + +template <> +inline void store_results<2>(float *buffer, const float32x4x2_t &values) +{ + vst1q_f32(buffer, values.val[0]); +} + +template <> +inline void store_results<3>(float *buffer, const float32x4x2_t &values) +{ + vst1_f32(buffer, vget_low_f32(values.val[0])); +} + +/** Stores a qint16_t array into a memory location. + * + * @param[in] buffer Pointer to the memory location where the values will be stored. + * @param[in] values Values that will be stored. + * + */ +template <unsigned int stridex> +void store_results(qint16_t *buffer, const qint16x8x2_t &values); + +template <> +inline void store_results<1>(qint16_t *buffer, const qint16x8x2_t &values) +{ + vst1q_qs16(buffer, values.val[0]); + vst1q_qs16(buffer + 8, values.val[1]); +} + +template <> +inline void store_results<2>(qint16_t *buffer, const qint16x8x2_t &values) +{ + vst1q_qs16(buffer, values.val[0]); +} + +template <> +inline void store_results<3>(qint16_t *buffer, const qint16x8x2_t &values) +{ + vst1_qs16(buffer, vget_low_s16(values.val[0])); +} + +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +/** Loads a 3x3 matrix as a row (float16_t). + * + * @param[in] ptr Pointer to a float 3x3 matrix. + * + * @return The loaded matrix. + */ +inline float16x8x3_t load_matrix_row(const float16_t *ptr) +{ + /* ptr is a pointer to a row in a 3x3 matrix, the function returns 3 vectors holding exactly the same value in all lanes: + r.val[0] contains the first element, r.val[1] the second element and r.val[2] the third element (in all lanes) */ + const float16x8x3_t r = + { + { + vld1q_dup_f16(ptr), + vld1q_dup_f16(1 + ptr), + vld1q_dup_f16(2 + ptr) + } + }; + return r; +} + +/** Perform a convolve3x3 on float16. + * + * @param[in] in_top Pointer to the first row of the input. + * @param[in] in_mid Pointer to the second row of the input. + * @param[in] in_low Pointer to the third row of the input. + * @param[in] m0 First row of the filter. + * @param[in] m1 Second row of the filter. + * @param[in] m2 Third row of the filter. + * @param[in] fixed_point_position (Optional) Fixed point position. 
+ * + */ +template <unsigned int stridex> +float16x8x2_t convolve_3x3(const float16_t *in_top, const float16_t *in_mid, const float16_t *in_low, const float16x8x3_t &m0, const float16x8x3_t &m1, const float16x8x3_t &m2, + int fixed_point_position); + +template <> +inline float16x8x2_t convolve_3x3<1>(const float16_t *in_top, const float16_t *in_mid, const float16_t *in_low, const float16x8x3_t &m0, const float16x8x3_t &m1, const float16x8x3_t &m2, + int fixed_point_position) +{ + ARM_COMPUTE_UNUSED(fixed_point_position); + + const float16x8x3_t vtop = + { + { + vld1q_f16(in_top), + vld1q_f16(in_top + 8), + vld1q_f16(in_top + 16) + } + }; + const float16x8x3_t vmid = + { + { + vld1q_f16(in_mid), + vld1q_f16(in_mid + 8), + vld1q_f16(in_mid + 16) + } + }; + const float16x8x3_t vlow = + { + { + vld1q_f16(in_low), + vld1q_f16(in_low + 8), + vld1q_f16(in_low + 16) + } + }; + float16x8x2_t out = + { + { + vmulq_f16(vtop.val[0], m0.val[0]), + vmulq_f16(vtop.val[1], m0.val[0]) + } + }; + out.val[0] = vaddq_f16(out.val[0], vmulq_f16(vextq_f16(vtop.val[0], vtop.val[1], 1), m0.val[1])); + out.val[0] = vaddq_f16(out.val[0], vmulq_f16(vextq_f16(vtop.val[0], vtop.val[1], 2), m0.val[2])); + out.val[0] = vaddq_f16(out.val[0], vmulq_f16(vmid.val[0], m1.val[0])); + out.val[0] = vaddq_f16(out.val[0], vmulq_f16(vextq_f16(vmid.val[0], vmid.val[1], 1), m1.val[1])); + out.val[0] = vaddq_f16(out.val[0], vmulq_f16(vextq_f16(vmid.val[0], vmid.val[1], 2), m1.val[2])); + out.val[0] = vaddq_f16(out.val[0], vmulq_f16(vlow.val[0], m2.val[0])); + out.val[0] = vaddq_f16(out.val[0], vmulq_f16(vextq_f16(vlow.val[0], vlow.val[1], 1), m2.val[1])); + out.val[0] = vaddq_f16(out.val[0], vmulq_f16(vextq_f16(vlow.val[0], vlow.val[1], 2), m2.val[2])); + out.val[1] = vaddq_f16(out.val[1], vmulq_f16(vextq_f16(vtop.val[1], vtop.val[2], 1), m0.val[1])); + out.val[1] = vaddq_f16(out.val[1], vmulq_f16(vextq_f16(vtop.val[1], vtop.val[2], 2), m0.val[2])); + out.val[1] = vaddq_f16(out.val[1], vmulq_f16(vmid.val[1], m1.val[0])); + out.val[1] = vaddq_f16(out.val[1], vmulq_f16(vextq_f16(vmid.val[1], vmid.val[2], 1), m1.val[1])); + out.val[1] = vaddq_f16(out.val[1], vmulq_f16(vextq_f16(vmid.val[1], vmid.val[2], 2), m1.val[2])); + out.val[1] = vaddq_f16(out.val[1], vmulq_f16(vlow.val[1], m2.val[0])); + out.val[1] = vaddq_f16(out.val[1], vmulq_f16(vextq_f16(vlow.val[1], vlow.val[2], 1), m2.val[1])); + out.val[1] = vaddq_f16(out.val[1], vmulq_f16(vextq_f16(vlow.val[1], vlow.val[2], 2), m2.val[2])); + return out; +} + +template <> +inline float16x8x2_t convolve_3x3<2>(const float16_t *in_top, const float16_t *in_mid, const float16_t *in_low, const float16x8x3_t &m0, const float16x8x3_t &m1, const float16x8x3_t &m2, + int fixed_point_position) +{ + float16x8x2_t out = convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2, fixed_point_position); + out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[0], 2), out.val[0], 1); + out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[1], 0), out.val[0], 2); + out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[1], 2), out.val[0], 3); + return out; +} + +template <> +inline float16x8x2_t convolve_3x3<3>(const float16_t *in_top, const float16_t *in_mid, const float16_t *in_low, const float16x8x3_t &m0, const float16x8x3_t &m1, const float16x8x3_t &m2, + int fixed_point_position) +{ + float16x8x2_t out = convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2, fixed_point_position); + out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[0], 3), out.val[0], 1); + return out; +} + +/** Stores a float16x8x2_t array into a memory 
location. + * + * @param[in] buffer Pointer to the memory location where the values will be stored. + * @param[in] values Values that will be stored. + * + */ +template <unsigned int stridex> +void store_results(float16_t *buffer, const float16x8x2_t &values); + +template <> +inline void store_results<1>(float16_t *buffer, const float16x8x2_t &values) +{ + vst1q_f16(buffer, values.val[0]); + vst1q_f16(buffer + 8, values.val[1]); +} + +template <> +inline void store_results<2>(float16_t *buffer, const float16x8x2_t &values) +{ + vst1q_f16(buffer, values.val[0]); +} + +template <> +inline void store_results<3>(float16_t *buffer, const float16x8x2_t &values) +{ + vst1_f16(buffer, vget_low_f16(values.val[0])); +} +#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ + +/** Get the number of elements processed on 3x3 convolution. + * + * @param[in] num_elems_written_per_iteration Number of elements written per iteration on 3x3 convolution. + * + * @return The number of elements processed. + */ +template <unsigned int stridex> +int get_input_num_elems_processed(unsigned int num_elems_written_per_iteration); + +template <> +inline int get_input_num_elems_processed<1>(unsigned int num_elems_written_per_iteration) +{ + return num_elems_written_per_iteration; +} + +template <> +inline int get_input_num_elems_processed<2>(unsigned int num_elems_written_per_iteration) +{ + return num_elems_written_per_iteration << 1; +} + +template <> +inline int get_input_num_elems_processed<3>(unsigned int num_elems_written_per_iteration) +{ + return num_elems_written_per_iteration * 3; +} +inline int get_input_num_elems_processed(unsigned int num_elems_written_per_iteration, unsigned int stridex) +{ + switch(stridex) + { + case 1: + return get_input_num_elems_processed<1>(num_elems_written_per_iteration); + case 2: + return get_input_num_elems_processed<2>(num_elems_written_per_iteration); + case 3: + return get_input_num_elems_processed<3>(num_elems_written_per_iteration); + default: + ARM_COMPUTE_ERROR("stridex not supported"); + return 0; + } +} +} +} // namespace arm_compute +#endif /* __ARM_COMPUTE_NEDIRECTCONVOLUTIONDETAIL_H__ */ diff --git a/arm_compute/core/NEON/kernels/winograd/alloc.hpp b/arm_compute/core/NEON/kernels/winograd/alloc.hpp new file mode 100644 index 000000000..ef6f2b511 --- /dev/null +++ b/arm_compute/core/NEON/kernels/winograd/alloc.hpp @@ -0,0 +1,30 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#pragma once + +#ifdef ALLOC_ALIGN +#define ALLOCATE(x) aligned_alloc(ALLOC_ALIGN, x) +#else +#define ALLOCATE(x) malloc(x) +#endif diff --git a/arm_compute/core/NEON/kernels/winograd/tensor.hpp b/arm_compute/core/NEON/kernels/winograd/tensor.hpp new file mode 100644 index 000000000..70ef65d2a --- /dev/null +++ b/arm_compute/core/NEON/kernels/winograd/tensor.hpp @@ -0,0 +1,210 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
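One caution on the ALLOCATE macro introduced just above: when ALLOC_ALIGN is defined it expands to aligned_alloc, which traditionally requires the requested size to be an integral multiple of the alignment, while the Tensor4D allocation further down passes the raw tensor size. A hypothetical, more defensive variant would round the size up first:

    /* Hypothetical variant only - rounds the size up so aligned_alloc's
     * size-is-a-multiple-of-alignment precondition always holds. */
    #ifdef ALLOC_ALIGN
    #define ALLOCATE(x) aligned_alloc(ALLOC_ALIGN, ((size_t)(x) + ALLOC_ALIGN - 1) / ALLOC_ALIGN * ALLOC_ALIGN)
    #else
    #define ALLOCATE(x) malloc(x)
    #endif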
+ */ + +#pragma once +#include <cstdio> +#include <cstdlib> +#include <random> + +#include "alloc.hpp" + +/*****************************************************************************/ +/* Padding definitions */ +enum PaddingType { + PADDING_SAME, PADDING_VALID +}; + +/*****************************************************************************/ +/* Shape of a kernel */ +struct KernelShape { + int n_output_channels, n_rows, n_cols, n_input_channels; + + int size(void) const { + return n_output_channels * n_rows * n_cols * n_input_channels; + } +}; + +struct Tensor4DShape { + int n_batches, + n_rows, + n_cols, + n_channels; + + int size() const { + return n_batches * n_rows * n_cols * n_channels; + } + + bool TestEq(const Tensor4DShape& other) const { + return (n_batches == other.n_batches && + n_rows == other.n_rows && + n_cols == other.n_cols && + n_channels == other.n_channels); + } +}; + +template <typename ShapeT, typename T> +class Tensor4D final { + public: + Tensor4D(ShapeT shape) : + _shape(shape), + _data(reinterpret_cast<T*>(ALLOCATE(size_bytes()))) { + Clear(); + } + + ~Tensor4D() { + free(_data); + } + + T* ptr() const { + return _data; + } + + const ShapeT& shape() const { + return _shape; + } + + size_t size_bytes() const { + return _shape.size() * sizeof(T); + } + + bool TestEq(Tensor4D<ShapeT, T>& other) const; + T& element(int, int, int, int) const; + void Print() const; + + void Clear() { + Fill(static_cast<T>(0)); + } + + void Fill(T val) { + for (int i = 0; i < _shape.size(); i++) + _data[i] = val; + } + + void TestPattern() { + for (int i = 0; i < _shape.size(); i++) + _data[i] = static_cast<T>(i); + } + + void Rand(const int seed=2311) { + std::mt19937 gen(seed); + std::uniform_int_distribution<> dis(-50, +50); + + for (int i = 0; i < _shape.size(); i++) { + _data[i] = static_cast<T>(dis(gen)); + } + } + Tensor4D(const Tensor4D &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + Tensor4D &operator=(const Tensor4D &) = delete; + /** Allow instances of this class to be moved */ + Tensor4D(Tensor4D &&) = default; + /** Allow instances of this class to be moved */ + Tensor4D &operator=(Tensor4D &&) = default; + + + private: + const ShapeT _shape; + T* const _data; +}; + + +template <> +inline float& Tensor4D<Tensor4DShape, float>::element(int n, int i, int j, int c) const { + int index = ((n*_shape.n_rows + i)*_shape.n_cols + j)*_shape.n_channels + c; + return _data[index]; +} + + +template <> +inline float& Tensor4D<KernelShape, float>::element(int oc, int i, int j, int ic) const { + int index = ((i*_shape.n_cols + j)*_shape.n_input_channels + ic)*_shape.n_output_channels + oc; + return _data[index]; +} + +template <> +inline bool Tensor4D<Tensor4DShape, float>::TestEq(Tensor4D<Tensor4DShape, float>& other) const { + // Test equivalence, printing errors + // First test the shapes are the same + if (!_shape.TestEq(other.shape())) { + printf("Tensors have different shapes.\n"); + return false; + } else { + int incorrects = 0; + + for (int n = 0; n < _shape.n_batches; n++) { + for (int i = 0; i < _shape.n_rows; i++) { + for (int j = 0; j < _shape.n_cols; j++) { + for (int c = 0; c < _shape.n_channels; c++) { + // Check elements for equivalence + const auto a = this->element(n, i, j, c); + const auto b = other.element(n, i, j, c); + + if (a != b) { + printf("Difference at element {%d, %d, %d, %d}: %.3f != %.3f\n", n, i, j, c, a, b); + + if (++incorrects > 100) { + printf("More than 100 incorrect values, stopping 
test.\n"); + return false; + } + } + } + } + } + } + + return incorrects == 0; + } +} + + +template <> +inline void Tensor4D<Tensor4DShape, float>::Print() const { + for (int n = 0; n < _shape.n_batches; n++) { + for (int c = 0; c < _shape.n_channels; c++) { + for (int i = 0; i < _shape.n_rows; i++) { + for (int j = 0; j < _shape.n_cols; j++) { + printf("%5.2f ", element(n, i, j, c)); + } + printf("\n"); + } + printf("\n"); + } + } +} + + +template <> +inline void Tensor4D<KernelShape, float>::Print() const { + for (int oc = 0; oc < _shape.n_output_channels; oc++) { + for (int ic = 0; ic < _shape.n_input_channels; ic++) { + for (int i = 0; i < _shape.n_rows; i++) { + for (int j = 0; j < _shape.n_cols; j++) { + printf("%5.2f ", element(oc, i, j, ic)); + } + printf("\n"); + } + printf("\n"); + } + } +} diff --git a/arm_compute/core/QAsymm8.h b/arm_compute/core/QAsymm8.h new file mode 100644 index 000000000..2fa402980 --- /dev/null +++ b/arm_compute/core/QAsymm8.h @@ -0,0 +1,33 @@ +/* + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_QASYMM8_H__ +#define __ARM_COMPUTE_QASYMM8_H__ + +#include "arm_compute/core/Rounding.h" +#include <cstdint> + +namespace arm_compute +{ +using qasymm8_t = uint8_t; /**< 8 bit quantized asymmetric scalar value */ +} +#include "arm_compute/core/QAsymm8.inl" +#endif /* __ARM_COMPUTE_QASYMM8_H__ */ diff --git a/arm_compute/core/QAsymm8.inl b/arm_compute/core/QAsymm8.inl new file mode 100644 index 000000000..611d68eb2 --- /dev/null +++ b/arm_compute/core/QAsymm8.inl @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include <cmath> +#include <limits> + +namespace arm_compute +{ +inline qasymm8_t sqcvt_qasymm8_f32(float value, float scale, int offset, RoundingPolicy rounding_policy = RoundingPolicy::TO_NEAREST_UP) +{ + int quantized = arm_compute::round(value / scale, rounding_policy) + offset; + quantized = std::max(0, std::min(quantized, 255)); + return quantized; +} + +inline float scvt_f32_qasymm8(qasymm8_t value, float scale, int offset) +{ + float dequantized = (static_cast<int>(value) - offset) * scale; + return dequantized; +} +} diff --git a/arm_compute/core/Rounding.h b/arm_compute/core/Rounding.h new file mode 100644 index 000000000..f95058c56 --- /dev/null +++ b/arm_compute/core/Rounding.h @@ -0,0 +1,46 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_ROUNDING_H__ +#define __ARM_COMPUTE_ROUNDING_H__ + +namespace arm_compute +{ +/** Rounding method */ +enum class RoundingPolicy +{ + TO_ZERO, /**< Truncates the least significand values that are lost in operations. */ + TO_NEAREST_UP, /**< Rounds to nearest value; half rounds away from zero */ + TO_NEAREST_EVEN, /**< Rounds to nearest value; half rounds to nearest even */ +}; + +/** Return a rounded value of x. Rounding is done according to the rounding_policy. + * + * @param[in] x Float value to be rounded. + * @param[in] rounding_policy Policy determining how rounding is done. + * + * @return Rounded value of the argument x. 
+ */ +int round(float x, RoundingPolicy rounding_policy); +} +#endif /*__ARM_COMPUTE_ROUNDING_H__ */ diff --git a/arm_compute/core/Strides.h b/arm_compute/core/Strides.h index 329fafb5f..105fdfde4 100644 --- a/arm_compute/core/Strides.h +++ b/arm_compute/core/Strides.h @@ -26,7 +26,6 @@ #include "arm_compute/core/Dimensions.h" #include "arm_compute/core/Error.h" -#include "arm_compute/core/Types.h" #include <algorithm> #include <array> @@ -58,5 +57,5 @@ public: /** Default destructor */ ~Strides() = default; }; -} +} // namespace arm_compute #endif /*__ARM_COMPUTE_STRIDES_H__*/ diff --git a/arm_compute/core/SubTensorInfo.h b/arm_compute/core/SubTensorInfo.h index 54fb66a57..7c464c0b1 100644 --- a/arm_compute/core/SubTensorInfo.h +++ b/arm_compute/core/SubTensorInfo.h @@ -34,6 +34,7 @@ #include "arm_compute/core/Validate.h" #include <cstddef> +#include <memory> namespace arm_compute { @@ -50,7 +51,7 @@ public: * X and Y dimensions must match the parent's ones. * @param[in] coords Coordinates of starting element inside parent tensor. */ - SubTensorInfo(ITensorInfo *parent, const TensorShape &tensor_shape, const Coordinates &coords); + SubTensorInfo(ITensorInfo *parent, TensorShape tensor_shape, Coordinates coords); /** Default destructor */ ~SubTensorInfo() = default; /** Allow instances of this class to be copy constructed */ @@ -61,29 +62,54 @@ public: SubTensorInfo(SubTensorInfo &&) = default; /** Allow instances of this class to be moved */ SubTensorInfo &operator=(SubTensorInfo &&) = default; + /** Returns the coordinates of the sub-tensor inside the parent tensor + * + * @return Sub-tensor coordinates + */ + Coordinates coords() const + { + return _coords; + } // Inherited methods overridden: - void set_data_type(DataType data_type) override + std::unique_ptr<ITensorInfo> clone() const override; + ITensorInfo &set_data_type(DataType data_type) override { ARM_COMPUTE_ERROR_ON(_parent == nullptr); _parent->set_data_type(data_type); + return *this; }; - void set_num_channels(int num_channels) override + ITensorInfo &set_num_channels(int num_channels) override { ARM_COMPUTE_ERROR_ON(_parent == nullptr); _parent->set_num_channels(num_channels); + return *this; }; - void set_format(Format format) override + ITensorInfo &set_format(Format format) override { ARM_COMPUTE_ERROR_ON(_parent == nullptr); _parent->set_format(format); + return *this; }; - void set_fixed_point_position(int fixed_point_position) override + ITensorInfo &set_fixed_point_position(int fixed_point_position) override { ARM_COMPUTE_ERROR_ON(_parent == nullptr); _parent->set_fixed_point_position(fixed_point_position); + return *this; }; - void set_tensor_shape(TensorShape shape) override; + ITensorInfo &set_tensor_shape(TensorShape shape) override; + ITensorInfo &set_quantization_info(QuantizationInfo quantization_info) override + { + ARM_COMPUTE_ERROR_ON(_parent == nullptr); + _parent->set_quantization_info(quantization_info); + return *this; + } + ITensorInfo &reset_padding() override + { + ARM_COMPUTE_ERROR_ON(_parent == nullptr); + _parent->reset_padding(); + return *this; + } bool auto_padding() override { ARM_COMPUTE_ERROR_ON(_parent == nullptr); @@ -159,10 +185,11 @@ public: ARM_COMPUTE_ERROR_ON(_parent == nullptr); return _parent->is_resizable(); } - void set_is_resizable(bool is_resizable) override + ITensorInfo &set_is_resizable(bool is_resizable) override { ARM_COMPUTE_ERROR_ON(_parent == nullptr); _parent->set_is_resizable(is_resizable); + return *this; } ValidRegion valid_region() const override { @@ -171,9 
+198,18 @@ public: void set_valid_region(ValidRegion valid_region) override { ARM_COMPUTE_ERROR_ON(_parent == nullptr); - ARM_COMPUTE_ERROR_ON_INVALID_SUBTENSOR_VALID_REGION(_parent->valid_region(), valid_region); + // Check if subtensor is valid if parent is configured + if(_parent->tensor_shape().total_size() != 0) + { + ARM_COMPUTE_ERROR_ON_INVALID_SUBTENSOR_VALID_REGION(_parent->valid_region(), valid_region); + } _valid_region = std::move(valid_region); } + QuantizationInfo quantization_info() const override + { + ARM_COMPUTE_ERROR_ON(_parent == nullptr); + return _parent->quantization_info(); + } private: ITensorInfo *_parent; diff --git a/arm_compute/core/TensorInfo.h b/arm_compute/core/TensorInfo.h index 35b9ccb9f..80ef7f8d5 100644 --- a/arm_compute/core/TensorInfo.h +++ b/arm_compute/core/TensorInfo.h @@ -26,6 +26,7 @@ #include "arm_compute/core/ITensorInfo.h" +#include "ITensorInfo.h" #include "arm_compute/core/Coordinates.h" #include "arm_compute/core/Strides.h" #include "arm_compute/core/TensorShape.h" @@ -33,6 +34,7 @@ #include "arm_compute/core/Utils.h" #include <cstddef> +#include <memory> namespace arm_compute { @@ -97,6 +99,16 @@ public: * @param[in] fixed_point_position (Optional) Fixed point position that expresses the number of bits for the fractional part of the number when the tensor's data type is QS8 or QS16. */ TensorInfo(const TensorShape &tensor_shape, size_t num_channels, DataType data_type, int fixed_point_position = 0); + + /** Constructor + * + * @param[in] tensor_shape It specifies the size for each dimension of the tensor in number of elements. + * @param[in] num_channels It indicates the number of channels for each tensor element + * @param[in] data_type Data type to use for each tensor element + * @param[in] quantization_info The quantization settings for the tensor data. + */ + TensorInfo(const TensorShape &tensor_shape, size_t num_channels, DataType data_type, QuantizationInfo quantization_info); + /** Constructor * * @param[in] hog_info HOG's metadata used to allocate normalized HOG space @@ -147,6 +159,7 @@ public: * @param[in] fixed_point_position (Optional) Fixed point position that expresses the number of bits for the fractional part of the number when the tensor's data type is QS8 or QS16. */ void init(const TensorShape &tensor_shape, size_t num_channels, DataType data_type, int fixed_point_position = 0); + /** Initialize the metadata structure with the given parameters * * @param[in] tensor_shape Size for each dimension of the tensor in number of elements. 
@@ -200,12 +213,15 @@ public: size_t init_auto_padding(const HOGInfo &hog_info, unsigned int width, unsigned int height); // Inherited methods overridden: - void set_data_type(DataType data_type) override; - void set_num_channels(int num_channels) override; - void set_format(Format format) override; - void set_tensor_shape(TensorShape shape) override; - void set_fixed_point_position(int fixed_point_position) override; - bool auto_padding() override; + std::unique_ptr<ITensorInfo> clone() const override; + ITensorInfo &set_data_type(DataType data_type) override; + ITensorInfo &set_num_channels(int num_channels) override; + ITensorInfo &set_format(Format format) override; + ITensorInfo &set_tensor_shape(TensorShape shape) override; + ITensorInfo &set_fixed_point_position(int fixed_point_position) override; + ITensorInfo &set_quantization_info(QuantizationInfo quantization_info) override; + ITensorInfo &reset_padding() override; + bool auto_padding() override; bool extend_padding(const PaddingSize &padding) override; size_t dimension(size_t index) const override { @@ -264,9 +280,10 @@ public: { return _is_resizable; } - void set_is_resizable(bool is_resizable) override + ITensorInfo &set_is_resizable(bool is_resizable) override { _is_resizable = is_resizable; + return *this; } ValidRegion valid_region() const override { @@ -276,6 +293,10 @@ public: { _valid_region = std::move(valid_region); } + QuantizationInfo quantization_info() const override + { + return _quantization_info; + } private: /** Calculates strides, offset and total size resulting from the specified padding around the XY plane. @@ -284,17 +305,18 @@ private: */ std::tuple<Strides, size_t, size_t> calculate_padding_requirements(const PaddingSize &padding); - size_t _total_size; - int _fixed_point_position; - size_t _offset_first_element_in_bytes; - Strides _strides_in_bytes; - size_t _num_channels; - TensorShape _tensor_shape; - DataType _data_type; - Format _format; - bool _is_resizable; - ValidRegion _valid_region; - PaddingSize _padding; + size_t _total_size; + int _fixed_point_position; + size_t _offset_first_element_in_bytes; + Strides _strides_in_bytes; + size_t _num_channels; + TensorShape _tensor_shape; + DataType _data_type; + Format _format; + bool _is_resizable; + ValidRegion _valid_region; + PaddingSize _padding; + QuantizationInfo _quantization_info; }; } #endif /*__ARM_COMPUTE_TENSORINFO_H__ */ diff --git a/arm_compute/core/TensorShape.h b/arm_compute/core/TensorShape.h index 3b395e74c..ad102607e 100644 --- a/arm_compute/core/TensorShape.h +++ b/arm_compute/core/TensorShape.h @@ -117,8 +117,8 @@ public: /** Collapse the first n dimensions. * + * @param[in] n Number of dimensions to collapse into @p first * @param[in] first Dimensions into which the following @p n are collapsed. - * @param[in] n Number of dimensions to collapse into @p first. 
*/ void collapse(size_t n, size_t first = 0) { diff --git a/arm_compute/core/Types.h b/arm_compute/core/Types.h index f9766b39b..538449b40 100644 --- a/arm_compute/core/Types.h +++ b/arm_compute/core/Types.h @@ -25,9 +25,13 @@ #define __ARM_COMPUTE_TYPES_H__ #include "arm_compute/core/Coordinates.h" +#include "arm_compute/core/QAsymm8.h" +#include "arm_compute/core/Rounding.h" +#include "arm_compute/core/Strides.h" #include "arm_compute/core/TensorShape.h" #include "support/Half.h" +#include <cmath> #include <cstddef> #include <cstdint> #include <string> @@ -38,26 +42,29 @@ namespace arm_compute /** 16-bit floating point type */ using half = half_float::half; +/** Permutation vector */ +using PermutationVector = Strides; + /** Image colour formats */ enum class Format { - UNKNOWN, /** Unknown image format */ - U8, /** 1 channel, 1 U8 per channel */ - S16, /** 1 channel, 1 S16 per channel */ - U16, /** 1 channel, 1 U16 per channel */ - S32, /** 1 channel, 1 S32 per channel */ - U32, /** 1 channel, 1 U32 per channel */ - F16, /** 1 channel, 1 F16 per channel */ - F32, /** 1 channel, 1 F32 per channel */ - UV88, /** 2 channel, 1 U8 per channel */ - RGB888, /** 3 channels, 1 U8 per channel */ - RGBA8888, /** 4 channels, 1 U8 per channel */ - YUV444, /** A 3 plane of 8 bit 4:4:4 sampled Y, U, V planes */ - YUYV422, /** A single plane of 32-bit macro pixel of Y0, U0, Y1, V0 bytes */ - NV12, /** A 2 plane YUV format of Luma (Y) and interleaved UV data at 4:2:0 sampling */ - NV21, /** A 2 plane YUV format of Luma (Y) and interleaved VU data at 4:2:0 sampling */ - IYUV, /** A 3 plane of 8-bit 4:2:0 sampled Y, U, V planes */ - UYVY422 /** A single plane of 32-bit macro pixel of U0, Y0, V0, Y1 byte */ + UNKNOWN, /**< Unknown image format */ + U8, /**< 1 channel, 1 U8 per channel */ + S16, /**< 1 channel, 1 S16 per channel */ + U16, /**< 1 channel, 1 U16 per channel */ + S32, /**< 1 channel, 1 S32 per channel */ + U32, /**< 1 channel, 1 U32 per channel */ + F16, /**< 1 channel, 1 F16 per channel */ + F32, /**< 1 channel, 1 F32 per channel */ + UV88, /**< 2 channel, 1 U8 per channel */ + RGB888, /**< 3 channels, 1 U8 per channel */ + RGBA8888, /**< 4 channels, 1 U8 per channel */ + YUV444, /**< A 3 plane of 8 bit 4:4:4 sampled Y, U, V planes */ + YUYV422, /**< A single plane of 32-bit macro pixel of Y0, U0, Y1, V0 bytes */ + NV12, /**< A 2 plane YUV format of Luma (Y) and interleaved UV data at 4:2:0 sampling */ + NV21, /**< A 2 plane YUV format of Luma (Y) and interleaved VU data at 4:2:0 sampling */ + IYUV, /**< A 3 plane of 8-bit 4:2:0 sampled Y, U, V planes */ + UYVY422 /**< A single plane of 32-bit macro pixel of U0, Y0, V0, Y1 byte */ }; /** Available data types */ @@ -67,6 +74,7 @@ enum class DataType U8, S8, QS8, + QASYMM8, U16, S16, QS16, @@ -81,6 +89,13 @@ enum class DataType SIZET }; +/** Available Sampling Policies */ +enum class SamplingPolicy +{ + CENTER, /**< Samples are taken at pixel center */ + TOP_LEFT /**< Samples are taken at pixel top left corner */ +}; + /** Constant value of the border pixels when using BorderMode::CONSTANT */ constexpr uint8_t CONSTANT_BORDER_VALUE = 199; @@ -90,6 +105,53 @@ constexpr float SCALE_PYRAMID_HALF = 0.5f; /* Constant value used to indicate a ORB scaled pyramid */ constexpr float SCALE_PYRAMID_ORB = 8.408964152537146130583778358414e-01; +/** Quantization settings (used for QASYMM8 data type) */ +struct QuantizationInfo +{ + QuantizationInfo() + : scale(0.0f), offset(0) + { + } + + QuantizationInfo(float scale, int offset) + : scale(scale), 
offset(offset) + { + } + + bool operator==(const QuantizationInfo &other) + { + return scale == other.scale && offset == other.offset; + } + + bool operator!=(const QuantizationInfo &other) + { + return !(*this == other); + } + + float scale; /**< scale */ + int offset; /**< offset */ + + /** Quantizes a value using the scale/offset in this QuantizationInfo */ + qasymm8_t quantize(float value, RoundingPolicy rounding_policy) const + { + ARM_COMPUTE_ERROR_ON_MSG(scale == 0, "QuantizationInfo::quantize: scale == 0"); + return sqcvt_qasymm8_f32(value, scale, offset, rounding_policy); + } + + /** Dequantizes a value using the scale/offset in this QuantizationInfo */ + float dequantize(qasymm8_t value) const + { + ARM_COMPUTE_ERROR_ON_MSG(scale == 0, "QuantizationInfo::dequantize: scale == 0"); + return scvt_f32_qasymm8(value, scale, offset); + } + + /** Indicates whether this QuantizationInfo has valid settings or not */ + bool empty() const + { + return scale == 0; + } +}; + struct ValidRegion { ValidRegion() @@ -234,14 +296,6 @@ enum class ThresholdType RANGE /**< Threshold with two values*/ }; -/** Rounding method */ -enum class RoundingPolicy -{ - TO_ZERO, /**< Truncates the least significand values that are lost in operations. */ - TO_NEAREST_UP, /**< Rounds to nearest value; half rounds up */ - TO_NEAREST_EVEN /**< Rounds to nearest value; half rounds to nearest even */ -}; - /** Termination criteria */ enum class Termination { @@ -418,7 +472,32 @@ public: unsigned int pad_x = 0, unsigned int pad_y = 0, DimensionRoundingType round = DimensionRoundingType::FLOOR) : _stride(std::make_pair(stride_x, stride_y)), - _pad(std::make_pair(pad_x, pad_y)), + _pad_left(pad_x), + _pad_top(pad_y), + _pad_right(pad_x), + _pad_bottom(pad_y), + _round_type(round) + { + } + /** Constructor + * + * @param[in] stride_x Stride, in elements, across x. + * @param[in] stride_y Stride, in elements, across y. + * @param[in] pad_left Padding across x on the left, in elements. + * @param[in] pad_top Padding across y on the top, in elements. + * @param[in] pad_right Padding across x on the right, in elements. + * @param[in] pad_bottom Padding across y on the bottom, in elements. + * @param[in] round Dimensions rounding. 
+ */ + PadStrideInfo(unsigned int stride_x, unsigned int stride_y, + unsigned int pad_left, unsigned int pad_right, + unsigned int pad_top, unsigned int pad_bottom, + DimensionRoundingType round) + : _stride(std::make_pair(stride_x, stride_y)), + _pad_left(pad_left), + _pad_top(pad_top), + _pad_right(pad_right), + _pad_bottom(pad_bottom), _round_type(round) { } @@ -428,16 +507,45 @@ public: } std::pair<unsigned int, unsigned int> pad() const { - return _pad; + //this accessor should be used only when padding is symmetric + ARM_COMPUTE_ERROR_ON(_pad_left != _pad_right || _pad_top != _pad_bottom); + return std::make_pair(_pad_left, _pad_top); + } + + unsigned int pad_left() const + { + return _pad_left; + } + unsigned int pad_right() const + { + return _pad_right; + } + unsigned int pad_top() const + { + return _pad_top; } + unsigned int pad_bottom() const + { + return _pad_bottom; + } + DimensionRoundingType round() const { return _round_type; } + bool has_padding() const + { + return (_pad_left != 0 || _pad_top != 0 || _pad_right != 0 || _pad_bottom != 0); + } + private: std::pair<unsigned int, unsigned int> _stride; - std::pair<unsigned int, unsigned int> _pad; + unsigned int _pad_left; + unsigned int _pad_top; + unsigned int _pad_right; + unsigned int _pad_bottom; + DimensionRoundingType _round_type; }; @@ -445,14 +553,35 @@ private: class PoolingLayerInfo { public: + /** Default Constructor */ + PoolingLayerInfo() + : _pool_type(PoolingType::MAX), _pool_size(0), _pad_stride_info(PadStrideInfo()), _exclude_padding(false), _is_global_pooling(false) + { + } /** Default Constructor * - * @param[in] pool_type Pooling type @ref PoolingType. Defaults to @ref PoolingType::MAX - * @param[in] pool_size (Optional) Pooling size, in elements, across x and y. Defaults to 2. + * @param[in] pool_type Pooling type @ref PoolingType. + * @param[in] pool_size Pooling size, in elements, across x and y. * @param[in] pad_stride_info (Optional) Padding and stride information @ref PadStrideInfo + * @param[in] exclude_padding (Optional) Strategy when accounting padding in calculations. + * True will exclude padding while false will not (Used in AVG/L2 pooling to determine the pooling area). + * Defaults to false; + */ + explicit PoolingLayerInfo(PoolingType pool_type, + unsigned int pool_size, + PadStrideInfo pad_stride_info = PadStrideInfo(), + bool exclude_padding = false) + : _pool_type(pool_type), _pool_size(pool_size), _pad_stride_info(pad_stride_info), _exclude_padding(exclude_padding), _is_global_pooling(false) + { + } + /** Default Constructor + * + * @note This constructor is used for global pooling + * + * @param[in] pool_type Pooling type @ref PoolingType. 
*/ - PoolingLayerInfo(PoolingType pool_type = PoolingType::MAX, unsigned int pool_size = 2, PadStrideInfo pad_stride_info = PadStrideInfo()) - : _pool_type(pool_type), _pool_size(pool_size), _pad_stride_info(pad_stride_info) + explicit PoolingLayerInfo(PoolingType pool_type) + : _pool_type(pool_type), _pool_size(0), _pad_stride_info(PadStrideInfo(1, 1, 0, 0)), _exclude_padding(false), _is_global_pooling(true) { } PoolingType pool_type() const @@ -467,11 +596,21 @@ public: { return _pad_stride_info; } + bool exclude_padding() const + { + return _exclude_padding; + } + bool is_global_pooling() const + { + return _is_global_pooling; + } private: PoolingType _pool_type; unsigned int _pool_size; PadStrideInfo _pad_stride_info; + bool _exclude_padding; + bool _is_global_pooling; }; /** ROI Pooling Layer Information class */ @@ -565,12 +704,14 @@ public: * * @param[in] type The normalization type. Can be @ref NormType::IN_MAP_1D, @ref NormType::IN_MAP_2D or @ref NORM_TYPE::CROSS_MAP * @param[in] norm_size The normalization size is the number of elements to normalize across. Defaults to 5. - * @param[in] alpha Alpha parameter used by normalization equation. Defaults to 0.0001. - * @param[in] beta Beta parameter used by normalization equation. Defaults to 0.5. - * @param[in] kappa Kappa parameter used by [Krichevksy 2012] Across Channel Local Brightness Normalization equation. + * @param[in] alpha (Optional) Alpha parameter used by normalization equation. Defaults to 0.0001. + * @param[in] beta (Optional) Beta parameter used by normalization equation. Defaults to 0.5. + * @param[in] kappa (Optional) Kappa parameter used by [Krichevksy 2012] Across Channel Local Brightness Normalization equation. + * @param[in] is_scaled (Optional) Boolean that specifies if alpha will be scaled by the normalization size or not. + * Should be false to follow [Krichevksy 2012]. */ - NormalizationLayerInfo(NormType type, uint32_t norm_size = 5, float alpha = 0.0001f, float beta = 0.5f, float kappa = 1.f) - : _type(type), _norm_size(norm_size), _alpha(alpha), _beta(beta), _kappa(kappa) + NormalizationLayerInfo(NormType type, uint32_t norm_size = 5, float alpha = 0.0001f, float beta = 0.5f, float kappa = 1.f, bool is_scaled = true) + : _type(type), _norm_size(norm_size), _alpha(alpha), _beta(beta), _kappa(kappa), _is_scaled(is_scaled) { } NormType type() const @@ -593,17 +734,25 @@ public: { return _kappa; } - /** Return the scaling factor of the normalization function. If kappa is not - * 1 then [Krichevksy 2012] normalization scaling is specified. Scaling - * factor takes into account the total number of elements used for the - * normalization, so in case of 2 dimensions this is _norm_size^2. + bool is_cross_map() const + { + return _type == NormType::CROSS_MAP; + } + bool is_in_map() const + { + return !is_cross_map(); + } + /** Return the scaling factor of the normalization function. + * + * If is_scaled is set to false then [Krichevksy 2012] normalization scaling is performed, + * where alpha is returned plainly, else alpha is scaled by the total number of elements used for the normalization. * * @return The normalization scaling factor. */ float scale_coeff() const { const uint32_t size = (_type == NormType::IN_MAP_2D) ? _norm_size * _norm_size : _norm_size; - return (_kappa == 1.f) ? (_alpha / size) : _alpha; + return (_is_scaled) ? (_alpha / size) : _alpha; } private: @@ -612,6 +761,7 @@ private: float _alpha; float _beta; float _kappa; + bool _is_scaled; }; /** Convolution Layer Weights Information class. 
This class stores the necessary information to compute convolution layer when the weights are already reshaped */ @@ -666,6 +816,58 @@ private: const unsigned int _num_kernels; }; +/** GEMM Information class. This class stores the necessary information to compute GEMM functions */ +class GEMMInfo +{ +public: + /** Default constructor */ + GEMMInfo() + : _is_a_reshaped(false), _is_b_reshaped(false), _reshape_b_only_on_first_run(false) + { + } + /** Constructor + * + * @param[in] is_a_reshaped True if the matrix A has been reshaped + * @param[in] is_b_reshaped True if the matrix B has been reshaped + * @param[in] reshape_b_only_on_first_run Reshape matrix B only for the first run + */ + GEMMInfo(bool is_a_reshaped, bool is_b_reshaped, bool reshape_b_only_on_first_run) + : _is_a_reshaped(is_a_reshaped), _is_b_reshaped(is_b_reshaped), _reshape_b_only_on_first_run(reshape_b_only_on_first_run) + { + } + /** Flag which specifies if the matrix A has been reshaped + * + * @return True if the matrix A has been reshaped + */ + bool is_a_reshaped() const + { + return _is_a_reshaped; + }; + /** Flag which specifies if the matrix B has been reshaped + * + * @return True if the matrix B has been reshaped + */ + bool is_b_reshaped() const + { + return _is_b_reshaped; + }; + /** Flag which specifies if the reshape of matrix B should executed only for the first + * + * @note This flag could be set to TRUE when GEMM is used to accelerate convolution layer + * + * @return True if the reshaped of matrix B happens only for the first run + */ + bool reshape_b_only_on_first_run() const + { + return _reshape_b_only_on_first_run; + }; + +private: + const bool _is_a_reshaped; + const bool _is_b_reshaped; + const bool _reshape_b_only_on_first_run; +}; + /** IO formatting information class*/ struct IOFormatInfo { diff --git a/arm_compute/core/Utils.h b/arm_compute/core/Utils.h index 06d674644..f78add13f 100644 --- a/arm_compute/core/Utils.h +++ b/arm_compute/core/Utils.h @@ -25,6 +25,7 @@ #define __ARM_COMPUTE_UTILS_H__ #include "arm_compute/core/Error.h" +#include "arm_compute/core/Rounding.h" #include "arm_compute/core/Types.h" #include <algorithm> @@ -92,6 +93,7 @@ inline size_t data_size_from_type(DataType data_type) case DataType::U8: case DataType::S8: case DataType::QS8: + case DataType::QASYMM8: return 1; case DataType::U16: case DataType::S16: @@ -166,6 +168,7 @@ inline size_t element_size_from_data_type(DataType dt) case DataType::S8: case DataType::U8: case DataType::QS8: + case DataType::QASYMM8: return 1; case DataType::U16: case DataType::S16: @@ -344,15 +347,52 @@ inline size_t num_channels_from_format(Format format) } } +/** Return the promoted data type of a given data type. + * + * @note If promoted data type is not supported an error will be thrown + * + * @param[in] dt Data type to get the promoted type of. 
+ * + * @return Promoted data type + */ +inline DataType get_promoted_data_type(DataType dt) +{ + switch(dt) + { + case DataType::U8: + return DataType::U16; + case DataType::S8: + return DataType::S16; + case DataType::QS8: + return DataType::QS16; + case DataType::U16: + return DataType::U32; + case DataType::S16: + return DataType::S32; + case DataType::QS16: + return DataType::QS32; + case DataType::QASYMM8: + case DataType::F16: + case DataType::U32: + case DataType::S32: + case DataType::F32: + case DataType::QS32: + ARM_COMPUTE_ERROR("Unsupported data type promotions!"); + default: + ARM_COMPUTE_ERROR("Undefined data type!"); + } + return DataType::UNKNOWN; +} + /** Separate a 2D convolution into two 1D convolutions -* -* @param[in] conv 2D convolution -* @param[out] conv_col 1D vertical convolution -* @param[out] conv_row 1D horizontal convolution -* @param[in] size Size of the 2D convolution -* -* @return true if the separation was successful -*/ + * + * @param[in] conv 2D convolution + * @param[out] conv_col 1D vertical convolution + * @param[out] conv_row 1D horizontal convolution + * @param[in] size Size of the 2D convolution + * + * @return true if the separation was successful + */ inline bool separate_matrix(const int16_t *conv, int16_t *conv_col, int16_t *conv_row, uint8_t size) { int32_t min_col = -1; @@ -562,6 +602,38 @@ inline DataType data_type_for_convolution_matrix(const int16_t *conv, size_t siz } } +/** Returns expected shape for the deconvolution output tensor. + * + * @param[in] out_dims widht and height of the output tensor, these values can be obtained with the function deconvolution_output_dimensions. + * @param[in] input Shape of the input tensor. + * @param[in] weights Shape of the weights tensor. + * + * @return Deconvolution output tensor shape. + */ +TensorShape deconvolution_output_shape(const std::pair<unsigned int, unsigned int> &out_dims, TensorShape input, TensorShape weights); + +/** Returns expected width and height of the deconvolution's output tensor. + * + * @param[in] in_width Width of input tensor (Number of columns) + * @param[in] in_height Height of input tensor (Number of rows) + * @param[in] kernel_width Kernel width. + * @param[in] kernel_height Kernel height. + * @param[in] padx X axis padding. + * @param[in] pady Y axis padding. + * @param[in] ax The number of zeros added to right edge of the input. + * @param[in] ay The number of zeros added to top edge of the input. + * @param[in] upscalex How much to scale the X axis. + * @param[in] upscaley How much to scale the Y axis. + * @param[in] round Rounding policy to be used when computing the output's dimensions. + * + * @return A pair with the new width in the first position and the new height in the second. + */ + +const std::pair<unsigned int, unsigned int> deconvolution_output_dimensions(unsigned int in_width, unsigned int in_height, + unsigned int kernel_width, unsigned int kernel_height, + unsigned int padx, unsigned int pady, unsigned int ax, unsigned int ay, + float upscalex, float upscaley, DimensionRoundingType round); + /** Returns expected width and height of output scaled tensor depending on dimensions rounding mode. * * @param[in] width Width of input tensor (Number of columns) @@ -674,6 +746,28 @@ inline bool is_data_type_float(DataType dt) } } +/** Check if a given data type is of quantized type + * + * @note Quantized is considered a super-set of fixed-point and asymmetric data types. + * + * @param[in] dt Input data type. 
+ * + * @return True if data type is of quantized type, else false. + */ +inline bool is_data_type_quantized(DataType dt) +{ + switch(dt) + { + case DataType::QS8: + case DataType::QASYMM8: + case DataType::QS16: + case DataType::QS32: + return true; + default: + return false; + } +} + /** Check if a given data type is of fixed point type * * @param[in] dt Input data type. @@ -693,6 +787,23 @@ inline bool is_data_type_fixed_point(DataType dt) } } +/** Check if a given data type is of asymmetric quantized type + * + * @param[in] dt Input data type. + * + * @return True if data type is of symmetric quantized type, else false. + */ +inline bool is_data_type_quantized_asymmetric(DataType dt) +{ + switch(dt) + { + case DataType::QASYMM8: + return true; + default: + return false; + } +} + /** Create a string with the float in full precision. * * @param val Floating point value @@ -727,7 +838,16 @@ void print_consecutive_elements_impl(std::ostream &s, const T *ptr, unsigned int { s.width(stream_width); } - s << std::right << static_cast<print_type>(ptr[i]) << element_delim; + + if(std::is_same<typename std::decay<T>::type, half>::value) + { + // We use T instead of print_type here is because the std::is_floating_point<half> returns false and then the print_type becomes int. + s << std::right << static_cast<T>(ptr[i]) << element_delim; + } + else + { + s << std::right << static_cast<print_type>(ptr[i]) << element_delim; + } } } @@ -749,7 +869,17 @@ int max_consecutive_elements_display_width_impl(std::ostream &s, const T *ptr, u { std::stringstream ss; ss.copyfmt(s); - ss << static_cast<print_type>(ptr[i]); + + if(std::is_same<typename std::decay<T>::type, half>::value) + { + // We use T instead of print_type here is because the std::is_floating_point<half> returns false and then the print_type becomes int. + ss << static_cast<T>(ptr[i]); + } + else + { + ss << static_cast<print_type>(ptr[i]); + } + max_width = std::max<int>(max_width, ss.str().size()); } return max_width; diff --git a/arm_compute/core/Validate.h b/arm_compute/core/Validate.h index 34da339f0..4ef0e1143 100644 --- a/arm_compute/core/Validate.h +++ b/arm_compute/core/Validate.h @@ -64,9 +64,9 @@ inline bool have_different_dimensions(const Dimensions<T> &dim1, const Dimension /** Functor to compare two @ref Dimensions objects and throw an error on mismatch. * * @param[in] dim Object to compare against. - * @param[in] function Function in which the error occured. - * @param[in] file File in which the error occured. - * @param[in] line Line in which the error occured. + * @param[in] function Function in which the error occurred. + * @param[in] file File in which the error occurred. + * @param[in] line Line in which the error occurred. */ template <typename T> class compare_dimension @@ -81,10 +81,11 @@ public: * * @param[in] dim To be compared object. */ - void operator()(const Dimensions<T> &dim) + arm_compute::Status operator()(const Dimensions<T> &dim) { - ARM_COMPUTE_ERROR_ON_LOC_MSG(have_different_dimensions(_dim, dim, 0), _function, _file, _line, - "Objects have different dimensions"); + ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG(have_different_dimensions(_dim, dim, 0), _function, _file, _line, + "Objects have different dimensions"); + return arm_compute::Status{}; } private: @@ -93,264 +94,462 @@ private: const char *const _file; const int _line; }; + +template <typename F> +inline arm_compute::Status for_each_error(F &&) +{ + return arm_compute::Status{}; +} + +template <typename F, typename T, typename... 
Ts> +inline arm_compute::Status for_each_error(F &&func, T &&arg, Ts &&... args) +{ + ARM_COMPUTE_RETURN_ON_ERROR(func(arg)); + ARM_COMPUTE_RETURN_ON_ERROR(for_each_error(func, args...)); + return arm_compute::Status{}; +} + +template <typename T> +struct get_tensor_info_t; +template <> +struct get_tensor_info_t<ITensorInfo *> +{ + ITensorInfo *operator()(const ITensor *tensor) + { + return tensor->info(); + } +}; } // namespace detail -/** Throw an error if one of the pointers is a nullptr. + +/** Create an error if one of the pointers is a nullptr. + * + * @param[in] function Function in which the error occurred. + * @param[in] file Name of the file where the error occurred. + * @param[in] line Line on which the error occurred. + * @param[in] pointers Pointers to check against nullptr. * - * @param[in] function Function in which the error occurred. - * @param[in] file Name of the file where the error occurred. - * @param[in] line Line on which the error occurred. - * @param[in] pointers Pointers to check against nullptr. + * @return Status */ template <typename... Ts> -void error_on_nullptr(const char *function, const char *file, const int line, Ts &&... pointers) +inline arm_compute::Status error_on_nullptr(const char *function, const char *file, const int line, Ts &&... pointers) { - auto is_nullptr = [&](const void *ptr) + const std::array<const void *, sizeof...(Ts)> pointers_array{ { std::forward<Ts>(pointers)... } }; + bool has_nullptr = std::any_of(pointers_array.begin(), pointers_array.end(), [&](const void *ptr) { - ARM_COMPUTE_ERROR_ON_LOC(ptr == nullptr, function, file, line); - }; - - for_each(is_nullptr, std::forward<Ts>(pointers)...); + return (ptr == nullptr); + }); + ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG(has_nullptr, function, file, line, "Nullptr object!"); + return arm_compute::Status{}; } -#define ARM_COMPUTE_ERROR_ON_NULLPTR(...) ::arm_compute::error_on_nullptr(__func__, __FILE__, __LINE__, __VA_ARGS__) +#define ARM_COMPUTE_ERROR_ON_NULLPTR(...) \ + ARM_COMPUTE_ERROR_THROW_ON(::arm_compute::error_on_nullptr(__func__, __FILE__, __LINE__, __VA_ARGS__)) +#define ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(...) \ + ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_nullptr(__func__, __FILE__, __LINE__, __VA_ARGS__)) -/** Throw an error if the passed window is invalid. +/** Return an error if the passed window is invalid. * * The subwindow is invalid if: * - It is not a valid window. * - Its dimensions don't match the full window's ones * - The step for each of its dimension is not identical to the corresponding one of the full window. * - * @param[in] function Function in which the error occurred. - * @param[in] file Name of the file where the error occurred. - * @param[in] line Line on which the error occurred. - * @param[in] full Full size window - * @param[in] win Window to validate. + * @param[in] function Function in which the error occurred. + * @param[in] file Name of the file where the error occurred. + * @param[in] line Line on which the error occurred. + * @param[in] full Full size window + * @param[in] win Window to validate. 
+ * + * @return Status */ -void error_on_mismatching_windows(const char *function, const char *file, const int line, - const Window &full, const Window &win); -#define ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(f, w) ::arm_compute::error_on_mismatching_windows(__func__, __FILE__, __LINE__, f, w) +arm_compute::Status error_on_mismatching_windows(const char *function, const char *file, const int line, + const Window &full, const Window &win); +#define ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(f, w) \ + ARM_COMPUTE_ERROR_THROW_ON(::arm_compute::error_on_mismatching_windows(__func__, __FILE__, __LINE__, f, w)) +#define ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_WINDOWS(f, w) \ + ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_mismatching_windows(__func__, __FILE__, __LINE__, f, w)) -/** Throw an error if the passed subwindow is invalid. +/** Return an error if the passed subwindow is invalid. * * The subwindow is invalid if: * - It is not a valid window. * - It is not fully contained inside the full window * - The step for each of its dimension is not identical to the corresponding one of the full window. * - * @param[in] function Function in which the error occurred. - * @param[in] file Name of the file where the error occurred. - * @param[in] line Line on which the error occurred. - * @param[in] full Full size window - * @param[in] sub Sub-window to validate. + * @param[in] function Function in which the error occurred. + * @param[in] file Name of the file where the error occurred. + * @param[in] line Line on which the error occurred. + * @param[in] full Full size window + * @param[in] sub Sub-window to validate. + * + * @return Status */ -void error_on_invalid_subwindow(const char *function, const char *file, const int line, - const Window &full, const Window &sub); -#define ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(f, s) ::arm_compute::error_on_invalid_subwindow(__func__, __FILE__, __LINE__, f, s) +arm_compute::Status error_on_invalid_subwindow(const char *function, const char *file, const int line, + const Window &full, const Window &sub); +#define ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(f, s) \ + ARM_COMPUTE_ERROR_THROW_ON(::arm_compute::error_on_invalid_subwindow(__func__, __FILE__, __LINE__, f, s)) +#define ARM_COMPUTE_RETURN_ERROR_ON_INVALID_SUBWINDOW(f, s) \ + ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_invalid_subwindow(__func__, __FILE__, __LINE__, f, s)) -/** Throw an error if the window can't be collapsed at the given dimension. +/** Return an error if the window can't be collapsed at the given dimension. * * The window cannot be collapsed if the given dimension not equal to the full window's dimension or not start from 0. * - * @param[in] function Function in which the error occurred. - * @param[in] file Name of the file where the error occurred. - * @param[in] line Line on which the error occurred. - * @param[in] full Full size window - * @param[in] window Window to be collapsed. - * @param[in] dim Dimension need to be checked. + * @param[in] function Function in which the error occurred. + * @param[in] file Name of the file where the error occurred. + * @param[in] line Line on which the error occurred. + * @param[in] full Full size window + * @param[in] window Window to be collapsed. + * @param[in] dim Dimension need to be checked. 
+ * + * @return Status */ -void error_on_window_not_collapsable_at_dimension(const char *function, const char *file, const int line, - const Window &full, const Window &window, const int dim); -#define ARM_COMPUTE_ERROR_ON_WINDOW_NOT_COLLAPSABLE_AT_DIMENSION(f, w, d) ::arm_compute::error_on_window_not_collapsable_at_dimension(__func__, __FILE__, __LINE__, f, w, d) +arm_compute::Status error_on_window_not_collapsable_at_dimension(const char *function, const char *file, const int line, + const Window &full, const Window &window, const int dim); +#define ARM_COMPUTE_ERROR_ON_WINDOW_NOT_COLLAPSABLE_AT_DIMENSION(f, w, d) \ + ARM_COMPUTE_ERROR_THROW_ON(::arm_compute::error_on_window_not_collapsable_at_dimension(__func__, __FILE__, __LINE__, f, w, d)) +#define ARM_COMPUTE_RETURN_ERROR_ON_WINDOW_NOT_COLLAPSABLE_AT_DIMENSION(f, w, d) \ + ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_window_not_collapsable_at_dimension(__func__, __FILE__, __LINE__, f, w, d)) -/** Throw an error if the passed coordinates have too many dimensions. +/** Return an error if the passed coordinates have too many dimensions. * * The coordinates have too many dimensions if any of the dimensions greater or equal to max_dim is different from 0. * - * @param[in] function Function in which the error occurred. - * @param[in] file Name of the file where the error occurred. - * @param[in] line Line on which the error occurred. - * @param[in] pos Coordinates to validate - * @param[in] max_dim Maximum number of dimensions allowed. + * @param[in] function Function in which the error occurred. + * @param[in] file Name of the file where the error occurred. + * @param[in] line Line on which the error occurred. + * @param[in] pos Coordinates to validate + * @param[in] max_dim Maximum number of dimensions allowed. + * + * @return Status */ -void error_on_coordinates_dimensions_gte(const char *function, const char *file, const int line, - const Coordinates &pos, unsigned int max_dim); -#define ARM_COMPUTE_ERROR_ON_COORDINATES_DIMENSIONS_GTE(p, md) ::arm_compute::error_on_coordinates_dimensions_gte(__func__, __FILE__, __LINE__, p, md) +arm_compute::Status error_on_coordinates_dimensions_gte(const char *function, const char *file, const int line, + const Coordinates &pos, unsigned int max_dim); +#define ARM_COMPUTE_ERROR_ON_COORDINATES_DIMENSIONS_GTE(p, md) \ + ARM_COMPUTE_ERROR_THROW_ON(::arm_compute::error_on_coordinates_dimensions_gte(__func__, __FILE__, __LINE__, p, md)) +#define ARM_COMPUTE_RETURN_ERROR_ON_COORDINATES_DIMENSIONS_GTE(p, md) \ + ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_coordinates_dimensions_gte(__func__, __FILE__, __LINE__, p, md)) -/** Throw an error if the passed window has too many dimensions. +/** Return an error if the passed window has too many dimensions. * * The window has too many dimensions if any of the dimension greater or equal to max_dim is different from 0. * - * @param[in] function Function in which the error occurred. - * @param[in] file Name of the file where the error occurred. - * @param[in] line Line on which the error occurred. - * @param[in] win Window to validate - * @param[in] max_dim Maximum number of dimensions allowed. - */ -void error_on_window_dimensions_gte(const char *function, const char *file, const int line, - const Window &win, unsigned int max_dim); -#define ARM_COMPUTE_ERROR_ON_WINDOW_DIMENSIONS_GTE(w, md) ::arm_compute::error_on_window_dimensions_gte(__func__, __FILE__, __LINE__, w, md) - -/** Throw an error if the passed dimension objects differ. 
+ * @param[in] function Function in which the error occurred. + * @param[in] file Name of the file where the error occurred. + * @param[in] line Line on which the error occurred. + * @param[in] win Window to validate + * @param[in] max_dim Maximum number of dimensions allowed. * - * @param[in] function Function in which the error occurred. - * @param[in] file Name of the file where the error occurred. - * @param[in] line Line on which the error occurred. - * @param[in] dim1 The first object to be compared. - * @param[in] dim2 The second object to be compared. - * @param[in] dims (Optional) Further allowed objects. + * @return Status + */ +arm_compute::Status error_on_window_dimensions_gte(const char *function, const char *file, const int line, + const Window &win, unsigned int max_dim); +#define ARM_COMPUTE_ERROR_ON_WINDOW_DIMENSIONS_GTE(w, md) \ + ARM_COMPUTE_ERROR_THROW_ON(::arm_compute::error_on_window_dimensions_gte(__func__, __FILE__, __LINE__, w, md)) +#define ARM_COMPUTE_RETURN_ERROR_ON_WINDOW_DIMENSIONS_GTE(w, md) \ + ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_window_dimensions_gte(__func__, __FILE__, __LINE__, w, md)) + +/** Return an error if the passed dimension objects differ. + * + * @param[in] function Function in which the error occurred. + * @param[in] file Name of the file where the error occurred. + * @param[in] line Line on which the error occurred. + * @param[in] dim1 The first object to be compared. + * @param[in] dim2 The second object to be compared. + * @param[in] dims (Optional) Further allowed objects. + * + * @return Status */ template <typename T, typename... Ts> -void error_on_mismatching_dimensions(const char *function, const char *file, int line, - const Dimensions<T> &dim1, const Dimensions<T> &dim2, Ts &&... dims) +arm_compute::Status error_on_mismatching_dimensions(const char *function, const char *file, int line, + const Dimensions<T> &dim1, const Dimensions<T> &dim2, Ts &&... dims) { - ARM_COMPUTE_UNUSED(function); - ARM_COMPUTE_UNUSED(file); - ARM_COMPUTE_UNUSED(line); - - for_each(detail::compare_dimension<T>(dim1, function, file, line), dim2, std::forward<Ts>(dims)...); + ARM_COMPUTE_RETURN_ON_ERROR(detail::for_each_error(detail::compare_dimension<T>(dim1, function, file, line), dim2, std::forward<Ts>(dims)...)); + return arm_compute::Status{}; } -#define ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(...) ::arm_compute::error_on_mismatching_dimensions(__func__, __FILE__, __LINE__, __VA_ARGS__) +#define ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(...) \ + ARM_COMPUTE_ERROR_THROW_ON(::arm_compute::error_on_mismatching_dimensions(__func__, __FILE__, __LINE__, __VA_ARGS__)) +#define ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(...) \ + ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_mismatching_dimensions(__func__, __FILE__, __LINE__, __VA_ARGS__)) -/** Throw an error if the passed two tensors have different shapes from the given dimension +/** Return an error if the passed two tensor infos have different shapes from the given dimension * - * @param[in] function Function in which the error occurred. - * @param[in] file Name of the file where the error occurred. - * @param[in] line Line on which the error occurred. - * @param[in] tensor_1 The first tensor to be compared. - * @param[in] tensor_2 The second tensor to be compared. - * @param[in] tensors (Optional) Further allowed tensors. + * @param[in] function Function in which the error occurred. + * @param[in] file Name of the file where the error occurred. 
+ * @param[in] line Line on which the error occurred. + * @param[in] tensor_info_1 The first tensor info to be compared. + * @param[in] tensor_info_2 The second tensor info to be compared. + * @param[in] tensor_infos (Optional) Further allowed tensor infos. + * + * @return Status */ template <typename... Ts> -void error_on_mismatching_shapes(const char *function, const char *file, const int line, - const ITensor *tensor_1, const ITensor *tensor_2, Ts... tensors) +inline arm_compute::Status error_on_mismatching_shapes(const char *function, const char *file, const int line, + const ITensorInfo *tensor_info_1, const ITensorInfo *tensor_info_2, Ts... tensor_infos) { - error_on_mismatching_shapes(function, file, line, 0U, tensor_1, tensor_2, std::forward<Ts>(tensors)...); + return error_on_mismatching_shapes(function, file, line, 0U, tensor_info_1, tensor_info_2, std::forward<Ts>(tensor_infos)...); } - -/** Throw an error if the passed two tensors have different shapes from the given dimension +/** Return an error if the passed two tensors have different shapes from the given dimension * - * @param[in] function Function in which the error occurred. - * @param[in] file Name of the file where the error occurred. - * @param[in] line Line on which the error occurred. - * @param[in] upper_dim The dimension from which to check. - * @param[in] tensor_1 The first tensor to be compared. - * @param[in] tensor_2 The second tensor to be compared. - * @param[in] tensors (Optional) Further allowed tensors. + * @param[in] function Function in which the error occurred. + * @param[in] file Name of the file where the error occurred. + * @param[in] line Line on which the error occurred. + * @param[in] tensor_1 The first tensor to be compared. + * @param[in] tensor_2 The second tensor to be compared. + * @param[in] tensors (Optional) Further allowed tensors. + * + * @return Status */ template <typename... Ts> -void error_on_mismatching_shapes(const char *function, const char *file, const int line, - unsigned int upper_dim, const ITensor *tensor_1, const ITensor *tensor_2, Ts... tensors) +inline arm_compute::Status error_on_mismatching_shapes(const char *function, const char *file, const int line, + const ITensor *tensor_1, const ITensor *tensor_2, Ts... tensors) { - ARM_COMPUTE_UNUSED(function); - ARM_COMPUTE_UNUSED(file); - ARM_COMPUTE_UNUSED(line); - - const std::array < const ITensor *, 2 + sizeof...(Ts) > tensors_array{ { tensor_1, tensor_2, std::forward<Ts>(tensors)... } }; - ARM_COMPUTE_UNUSED(tensors_array); - - ARM_COMPUTE_ERROR_ON_LOC(tensors_array.cbegin() == nullptr, function, file, line); + return error_on_mismatching_shapes(function, file, line, 0U, tensor_1, tensor_2, std::forward<Ts>(tensors)...); +} +/** Return an error if the passed two tensors have different shapes from the given dimension + * + * @param[in] function Function in which the error occurred. + * @param[in] file Name of the file where the error occurred. + * @param[in] line Line on which the error occurred. + * @param[in] upper_dim The dimension from which to check. + * @param[in] tensor_info_1 The first tensor info to be compared. + * @param[in] tensor_info_2 The second tensor info to be compared. + * @param[in] tensor_infos (Optional) Further allowed tensor infos. + * + * @return Status + */ +template <typename... Ts> +inline arm_compute::Status error_on_mismatching_shapes(const char *function, const char *file, const int line, + unsigned int upper_dim, const ITensorInfo *tensor_info_1, const ITensorInfo *tensor_info_2, Ts... 
tensor_infos) +{ + ARM_COMPUTE_RETURN_ERROR_ON_LOC(tensor_info_1 == nullptr, function, file, line); + ARM_COMPUTE_RETURN_ERROR_ON_LOC(tensor_info_2 == nullptr, function, file, line); + ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_nullptr(function, file, line, std::forward<Ts>(tensor_infos)...)); - ARM_COMPUTE_ERROR_ON_LOC_MSG(std::any_of(std::next(tensors_array.cbegin()), tensors_array.cend(), [&](const ITensor * tensor) + const std::array < const ITensorInfo *, 2 + sizeof...(Ts) > tensors_info_array{ { tensor_info_1, tensor_info_2, std::forward<Ts>(tensor_infos)... } }; + ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG(std::any_of(std::next(tensors_info_array.cbegin()), tensors_info_array.cend(), [&](const ITensorInfo * tensor_info) { - ARM_COMPUTE_ERROR_ON_LOC(tensor == nullptr, function, file, line); - return detail::have_different_dimensions((*tensors_array.cbegin())->info()->tensor_shape(), tensor->info()->tensor_shape(), upper_dim); + return detail::have_different_dimensions((*tensors_info_array.cbegin())->tensor_shape(), tensor_info->tensor_shape(), upper_dim); }), function, file, line, "Tensors have different shapes"); + return arm_compute::Status{}; } -#define ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(...) ::arm_compute::error_on_mismatching_shapes(__func__, __FILE__, __LINE__, __VA_ARGS__) - -/** Throw an error if the passed two tensors have different data types +/** Return an error if the passed two tensors have different shapes from the given dimension * - * @param[in] function Function in which the error occurred. - * @param[in] file Name of the file where the error occurred. - * @param[in] line Line on which the error occurred. - * @param[in] tensor The first tensor to be compared. - * @param[in] tensors (Optional) Further allowed tensors. + * @param[in] function Function in which the error occurred. + * @param[in] file Name of the file where the error occurred. + * @param[in] line Line on which the error occurred. + * @param[in] upper_dim The dimension from which to check. + * @param[in] tensor_1 The first tensor to be compared. + * @param[in] tensor_2 The second tensor to be compared. + * @param[in] tensors (Optional) Further allowed tensors. + * + * @return Status */ template <typename... Ts> -void error_on_mismatching_data_types(const char *function, const char *file, const int line, - const ITensor *tensor, Ts... tensors) +inline arm_compute::Status error_on_mismatching_shapes(const char *function, const char *file, const int line, + unsigned int upper_dim, const ITensor *tensor_1, const ITensor *tensor_2, Ts... tensors) { - ARM_COMPUTE_UNUSED(function); - ARM_COMPUTE_UNUSED(file); - ARM_COMPUTE_UNUSED(line); - ARM_COMPUTE_UNUSED(tensor); - - ARM_COMPUTE_ERROR_ON_LOC(tensor == nullptr, function, file, line); - - DataType &&tensor_data_type = tensor->info()->data_type(); - ARM_COMPUTE_UNUSED(tensor_data_type); + ARM_COMPUTE_RETURN_ERROR_ON_LOC(tensor_1 == nullptr, function, file, line); + ARM_COMPUTE_RETURN_ERROR_ON_LOC(tensor_2 == nullptr, function, file, line); + ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_nullptr(function, file, line, std::forward<Ts>(tensors)...)); + ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_mismatching_shapes(function, file, line, upper_dim, tensor_1->info(), tensor_2->info(), + detail::get_tensor_info_t<ITensorInfo *>()(tensors)...)); + return arm_compute::Status{}; +} +#define ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(...) 
\ + ARM_COMPUTE_ERROR_THROW_ON(::arm_compute::error_on_mismatching_shapes(__func__, __FILE__, __LINE__, __VA_ARGS__)) +#define ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(...) \ + ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_mismatching_shapes(__func__, __FILE__, __LINE__, __VA_ARGS__)) - const std::array<const ITensor *, sizeof...(Ts)> tensors_array{ { std::forward<Ts>(tensors)... } }; - ARM_COMPUTE_UNUSED(tensors_array); +/** Return an error if the passed two tensor infos have different data types + * + * @param[in] function Function in which the error occurred. + * @param[in] file Name of the file where the error occurred. + * @param[in] line Line on which the error occurred. + * @param[in] tensor_info The first tensor info to be compared. + * @param[in] tensor_infos (Optional) Further allowed tensor infos. + * + * @return Status + */ +template <typename... Ts> +inline arm_compute::Status error_on_mismatching_data_types(const char *function, const char *file, const int line, + const ITensorInfo *tensor_info, Ts... tensor_infos) +{ + ARM_COMPUTE_RETURN_ERROR_ON_LOC(tensor_info == nullptr, function, file, line); + ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_nullptr(function, file, line, std::forward<Ts>(tensor_infos)...)); - ARM_COMPUTE_ERROR_ON_LOC_MSG(std::any_of(tensors_array.begin(), tensors_array.end(), [&](const ITensor * tensor_obj) + DataType &&tensor_data_type = tensor_info->data_type(); + const std::array<const ITensorInfo *, sizeof...(Ts)> tensors_infos_array{ { std::forward<Ts>(tensor_infos)... } }; + ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG(std::any_of(tensors_infos_array.begin(), tensors_infos_array.end(), [&](const ITensorInfo * tensor_info_obj) { - ARM_COMPUTE_ERROR_ON_LOC(tensor_obj == nullptr, function, file, line); - return tensor_obj->info()->data_type() != tensor_data_type; + return tensor_info_obj->data_type() != tensor_data_type; }), function, file, line, "Tensors have different data types"); + return arm_compute::Status{}; } +/** Return an error if the passed two tensors have different data types + * + * @param[in] function Function in which the error occurred. + * @param[in] file Name of the file where the error occurred. + * @param[in] line Line on which the error occurred. + * @param[in] tensor The first tensor to be compared. + * @param[in] tensors (Optional) Further allowed tensors. + * + * @return Status + */ +template <typename... Ts> +inline arm_compute::Status error_on_mismatching_data_types(const char *function, const char *file, const int line, + const ITensor *tensor, Ts... tensors) +{ + ARM_COMPUTE_RETURN_ERROR_ON_LOC(tensor == nullptr, function, file, line); + ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_nullptr(function, file, line, std::forward<Ts>(tensors)...)); + ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_mismatching_data_types(function, file, line, tensor->info(), + detail::get_tensor_info_t<ITensorInfo *>()(tensors)...)); + return arm_compute::Status{}; +} +#define ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(...) \ + ARM_COMPUTE_ERROR_THROW_ON(::arm_compute::error_on_mismatching_data_types(__func__, __FILE__, __LINE__, __VA_ARGS__)) +#define ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(...) \ + ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_mismatching_data_types(__func__, __FILE__, __LINE__, __VA_ARGS__)) -#define ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(...) 
::arm_compute::error_on_mismatching_data_types(__func__, __FILE__, __LINE__, __VA_ARGS__) - -/** Throw an error if the passed tensors have different fixed point data types or different fixed point positions +/** Return an error if the passed tensor infos have different fixed point data types or different fixed point positions * * @note: If the first tensor doesn't have fixed point data type, the function returns without throwing an error * - * @param[in] function Function in which the error occurred. - * @param[in] file Name of the file where the error occurred. - * @param[in] line Line on which the error occurred. - * @param[in] tensor_1 The first tensor to be compared. - * @param[in] tensor_2 The second tensor to be compared. - * @param[in] tensors (Optional) Further allowed tensors. + * @param[in] function Function in which the error occurred. + * @param[in] file Name of the file where the error occurred. + * @param[in] line Line on which the error occurred. + * @param[in] tensor_info_1 The first tensor info to be compared. + * @param[in] tensor_info_2 The second tensor info to be compared. + * @param[in] tensor_infos (Optional) Further allowed tensor infos. + * + * @return Status */ template <typename... Ts> -void error_on_mismatching_fixed_point(const char *function, const char *file, const int line, - const ITensor *tensor_1, const ITensor *tensor_2, Ts... tensors) +inline arm_compute::Status error_on_mismatching_fixed_point(const char *function, const char *file, const int line, + const ITensorInfo *tensor_info_1, const ITensorInfo *tensor_info_2, Ts... tensor_infos) { - ARM_COMPUTE_UNUSED(function); - ARM_COMPUTE_UNUSED(file); - ARM_COMPUTE_UNUSED(line); - ARM_COMPUTE_UNUSED(tensor_1); - ARM_COMPUTE_UNUSED(tensor_2); - - DataType &&first_data_type = tensor_1->info()->data_type(); - const int first_fixed_point_position = tensor_1->info()->fixed_point_position(); - ARM_COMPUTE_UNUSED(first_data_type); - ARM_COMPUTE_UNUSED(first_fixed_point_position); - - if((first_data_type != DataType::QS8) && (first_data_type != DataType::QS16)) + DataType &&first_data_type = tensor_info_1->data_type(); + const int first_fixed_point_position = tensor_info_1->fixed_point_position(); + + if(!is_data_type_fixed_point(first_data_type)) { - return; + return arm_compute::Status{}; } - const std::array < const ITensor *, 1 + sizeof...(Ts) > tensors_array{ { tensor_2, std::forward<Ts>(tensors)... } }; - ARM_COMPUTE_UNUSED(tensors_array); - - ARM_COMPUTE_ERROR_ON_LOC_MSG(std::any_of(tensors_array.begin(), tensors_array.end(), [&](const ITensor * tensor) + const std::array < const ITensorInfo *, 1 + sizeof...(Ts) > tensor_infos_array{ { tensor_info_2, std::forward<Ts>(tensor_infos)... 
} }; + ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG(std::any_of(tensor_infos_array.begin(), tensor_infos_array.end(), [&](const ITensorInfo * tensor_info) { - return tensor->info()->data_type() != first_data_type; + return tensor_info->data_type() != first_data_type; }), function, file, line, "Tensors have different fixed point data types"); - - ARM_COMPUTE_ERROR_ON_LOC_MSG(std::any_of(tensors_array.begin(), tensors_array.end(), [&](const ITensor * tensor) + ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG(std::any_of(tensor_infos_array.begin(), tensor_infos_array.end(), [&](const ITensorInfo * tensor_info) { - return tensor->info()->fixed_point_position() != first_fixed_point_position; + return tensor_info->fixed_point_position() != first_fixed_point_position; }), function, file, line, "Tensors have different fixed point positions"); + + return arm_compute::Status{}; } +/** Return an error if the passed tensor have different fixed point data types or different fixed point positions + * + * @note: If the first tensor doesn't have fixed point data type, the function returns without throwing an error + * + * @param[in] function Function in which the error occurred. + * @param[in] file Name of the file where the error occurred. + * @param[in] line Line on which the error occurred. + * @param[in] tensor_1 The first tensor to be compared. + * @param[in] tensor_2 The second tensor to be compared. + * @param[in] tensors (Optional) Further allowed tensors. + * + * @return Status + */ +template <typename... Ts> +inline arm_compute::Status error_on_mismatching_fixed_point(const char *function, const char *file, const int line, + const ITensor *tensor_1, const ITensor *tensor_2, Ts... tensors) +{ + ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_mismatching_fixed_point(function, file, line, tensor_1->info(), tensor_2->info(), + detail::get_tensor_info_t<ITensorInfo *>()(tensors)...)); + return arm_compute::Status{}; +} +#define ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(...) \ + ARM_COMPUTE_ERROR_THROW_ON(::arm_compute::error_on_mismatching_fixed_point(__func__, __FILE__, __LINE__, __VA_ARGS__)) +#define ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(...) \ + ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_mismatching_fixed_point(__func__, __FILE__, __LINE__, __VA_ARGS__)) + +/** Return an error if the passed tensor infos have different asymmetric quantized data types or different quantization info + * + * @note: If the first tensor info doesn't have asymmetric quantized data type, the function returns without throwing an error + * + * @param[in] function Function in which the error occurred. + * @param[in] file Name of the file where the error occurred. + * @param[in] line Line on which the error occurred. + * @param[in] tensor_info_1 The first tensor info to be compared. + * @param[in] tensor_info_2 The second tensor info to be compared. + * @param[in] tensor_infos (Optional) Further allowed tensor infos. + * + * @return Status + */ +template <typename... Ts> +inline arm_compute::Status error_on_mismatching_quantization_info(const char *function, const char *file, const int line, + const ITensorInfo *tensor_info_1, const ITensorInfo *tensor_info_2, Ts... tensor_infos) +{ + DataType &&first_data_type = tensor_info_1->data_type(); + const QuantizationInfo first_quantization_info = tensor_info_1->quantization_info(); + + if(!is_data_type_quantized_asymmetric(first_data_type)) + { + return arm_compute::Status{}; + } -#define ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(...) 
::arm_compute::error_on_mismatching_fixed_point(__func__, __FILE__, __LINE__, __VA_ARGS__) + const std::array < const ITensorInfo *, 1 + sizeof...(Ts) > tensor_infos_array{ { tensor_info_2, std::forward<Ts>(tensor_infos)... } }; + ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG(std::any_of(tensor_infos_array.begin(), tensor_infos_array.end(), [&](const ITensorInfo * tensor_info) + { + return tensor_info->data_type() != first_data_type; + }), + function, file, line, "Tensors have different asymmetric quantized data types"); + ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG(std::any_of(tensor_infos_array.begin(), tensor_infos_array.end(), [&](const ITensorInfo * tensor_info) + { + return tensor_info->quantization_info() != first_quantization_info; + }), + function, file, line, "Tensors have different quantization information"); + + return arm_compute::Status{}; +} +/** Return an error if the passed tensor have different asymmetric quantized data types or different quantization info + * + * @note: If the first tensor doesn't have asymmetric quantized data type, the function returns without throwing an error + * + * @param[in] function Function in which the error occurred. + * @param[in] file Name of the file where the error occurred. + * @param[in] line Line on which the error occurred. + * @param[in] tensor_1 The first tensor to be compared. + * @param[in] tensor_2 The second tensor to be compared. + * @param[in] tensors (Optional) Further allowed tensors. + * + * @return Status + */ +template <typename... Ts> +inline arm_compute::Status error_on_mismatching_quantization_info(const char *function, const char *file, const int line, + const ITensor *tensor_1, const ITensor *tensor_2, Ts... tensors) +{ + ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_mismatching_quantization_info(function, file, line, tensor_1->info(), tensor_2->info(), + detail::get_tensor_info_t<ITensorInfo *>()(tensors)...)); + return arm_compute::Status{}; +} +#define ARM_COMPUTE_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(...) \ + ARM_COMPUTE_ERROR_THROW_ON(::arm_compute::error_on_mismatching_quantization_info(__func__, __FILE__, __LINE__, __VA_ARGS__)) +#define ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(...) \ + ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_mismatching_quantization_info(__func__, __FILE__, __LINE__, __VA_ARGS__)) /** Throw an error if the format of the passed tensor/multi-image does not match any of the formats provided. * - * @param[in] function Function in which the error occurred. - * @param[in] file Name of the file where the error occurred. - * @param[in] line Line on which the error occurred. - * @param[in] object Tensor/multi-image to validate. - * @param[in] format First format allowed. - * @param[in] formats (Optional) Further allowed formats. + * @param[in] function Function in which the error occurred. + * @param[in] file Name of the file where the error occurred. + * @param[in] line Line on which the error occurred. + * @param[in] object Tensor/multi-image to validate. + * @param[in] format First format allowed. + * @param[in] formats (Optional) Further allowed formats. */ template <typename T, typename F, typename... Fs> void error_on_format_not_in(const char *function, const char *file, const int line, @@ -374,109 +573,168 @@ void error_on_format_not_in(const char *function, const char *file, const int li } #define ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(t, ...) 
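A similar sketch for fixed-point and asymmetric-quantized inputs; because both checks return success when the first tensor info is neither fixed-point nor quantized, they can be applied unconditionally (the function and tensor names are illustrative):

    // Hypothetical validation for an operator that also accepts quantized tensors.
    arm_compute::Status validate_quantized_pair(const arm_compute::ITensorInfo *input, const arm_compute::ITensorInfo *output)
    {
        // No-ops for float tensors; otherwise enforce matching fixed-point format/position
        // and matching QuantizationInfo respectively.
        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
        return arm_compute::Status{};
    }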
::arm_compute::error_on_format_not_in(__func__, __FILE__, __LINE__, t, __VA_ARGS__) -/** Throw an error if the data type of the passed tensor does not match any of the data types provided. +/** Return an error if the data type of the passed tensor info does not match any of the data types provided. + * + * @param[in] function Function in which the error occurred. + * @param[in] file Name of the file where the error occurred. + * @param[in] line Line on which the error occurred. + * @param[in] tensor_info Tensor info to validate. + * @param[in] dt First data type allowed. + * @param[in] dts (Optional) Further allowed data types. * - * @param[in] function Function in which the error occurred. - * @param[in] file Name of the file where the error occurred. - * @param[in] line Line on which the error occurred. - * @param[in] tensor Tensor to validate. - * @param[in] dt First data type allowed. - * @param[in] dts (Optional) Further allowed data types. + * @return Status */ template <typename T, typename... Ts> -void error_on_data_type_not_in(const char *function, const char *file, const int line, - const ITensor *tensor, T &&dt, Ts &&... dts) +inline arm_compute::Status error_on_data_type_not_in(const char *function, const char *file, const int line, + const ITensorInfo *tensor_info, T &&dt, Ts &&... dts) { - ARM_COMPUTE_ERROR_ON_LOC(tensor == nullptr, function, file, line); + ARM_COMPUTE_RETURN_ERROR_ON_LOC(tensor_info == nullptr, function, file, line); - const DataType &tensor_dt = tensor->info()->data_type(); //NOLINT - ARM_COMPUTE_UNUSED(tensor_dt); - - ARM_COMPUTE_ERROR_ON_LOC(tensor_dt == DataType::UNKNOWN, function, file, line); + const DataType &tensor_dt = tensor_info->data_type(); //NOLINT + ARM_COMPUTE_RETURN_ERROR_ON_LOC(tensor_dt == DataType::UNKNOWN, function, file, line); const std::array<T, sizeof...(Ts)> dts_array{ { std::forward<Ts>(dts)... } }; - ARM_COMPUTE_UNUSED(dts_array); - - ARM_COMPUTE_ERROR_ON_LOC_MSG(tensor_dt != dt && std::none_of(dts_array.begin(), dts_array.end(), [&](const T & d) + ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG(tensor_dt != dt && std::none_of(dts_array.begin(), dts_array.end(), [&](const T & d) { return d == tensor_dt; }), function, file, line, "ITensor data type %s not supported by this kernel", string_from_data_type(tensor_dt).c_str()); + return arm_compute::Status{}; } -#define ARM_COMPUTE_ERROR_ON_DATA_TYPE_NOT_IN(t, ...) ::arm_compute::error_on_data_type_not_in(__func__, __FILE__, __LINE__, t, __VA_ARGS__) - -/** Throw an error if the data type or the number of channels of the passed tensor does not match any of the data types and number of channels provided. +/** Return an error if the data type of the passed tensor does not match any of the data types provided. + * + * @param[in] function Function in which the error occurred. + * @param[in] file Name of the file where the error occurred. + * @param[in] line Line on which the error occurred. + * @param[in] tensor Tensor to validate. + * @param[in] dt First data type allowed. + * @param[in] dts (Optional) Further allowed data types. * - * @param[in] function Function in which the error occurred. - * @param[in] file Name of the file where the error occurred. - * @param[in] line Line on which the error occurred. - * @param[in] tensor Tensor to validate. - * @param[in] num_channels Number of channels to check - * @param[in] dt First data type allowed. - * @param[in] dts (Optional) Further allowed data types. + * @return Status */ template <typename T, typename... 
Ts> -void error_on_data_type_channel_not_in(const char *function, const char *file, const int line, - const ITensor *tensor, size_t num_channels, T &&dt, Ts &&... dts) +inline arm_compute::Status error_on_data_type_not_in(const char *function, const char *file, const int line, + const ITensor *tensor, T &&dt, Ts &&... dts) { - error_on_data_type_not_in(function, file, line, tensor, std::forward<T>(dt), std::forward<Ts>(dts)...); - - const size_t tensor_nc = tensor->info()->num_channels(); - ARM_COMPUTE_UNUSED(tensor_nc); - - ARM_COMPUTE_ERROR_ON_LOC_MSG(tensor_nc != num_channels, function, file, line, "Number of channels %d. Required number of channels %d", tensor_nc, num_channels); + ARM_COMPUTE_RETURN_ERROR_ON_LOC(tensor == nullptr, function, file, line); + ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_data_type_not_in(function, file, line, tensor->info(), std::forward<T>(dt), std::forward<Ts>(dts)...)); + return arm_compute::Status{}; } -#define ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(t, c, ...) ::arm_compute::error_on_data_type_channel_not_in(__func__, __FILE__, __LINE__, t, c, __VA_ARGS__) +#define ARM_COMPUTE_ERROR_ON_DATA_TYPE_NOT_IN(t, ...) \ + ARM_COMPUTE_ERROR_THROW_ON(::arm_compute::error_on_data_type_not_in(__func__, __FILE__, __LINE__, t, __VA_ARGS__)) +#define ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(t, ...) \ + ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_data_type_not_in(__func__, __FILE__, __LINE__, t, __VA_ARGS__)) -/** Throw an error if the tensor is not 2D. +/** Return an error if the data type or the number of channels of the passed tensor info does not match any of the data types and number of channels provided. + * + * @param[in] function Function in which the error occurred. + * @param[in] file Name of the file where the error occurred. + * @param[in] line Line on which the error occurred. + * @param[in] tensor_info Tensor info to validate. + * @param[in] num_channels Number of channels to check + * @param[in] dt First data type allowed. + * @param[in] dts (Optional) Further allowed data types. * - * @param[in] function Function in which the error occurred. - * @param[in] file Name of the file where the error occurred. - * @param[in] line Line on which the error occurred. - * @param[in] tensor Tensor to validate. + * @return Status */ -void error_on_tensor_not_2d(const char *function, const char *file, const int line, - const ITensor *tensor); -#define ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(t) ::arm_compute::error_on_tensor_not_2d(__func__, __FILE__, __LINE__, t) +template <typename T, typename... Ts> +inline arm_compute::Status error_on_data_type_channel_not_in(const char *function, const char *file, const int line, + const ITensorInfo *tensor_info, size_t num_channels, T &&dt, Ts &&... dts) +{ + ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_data_type_not_in(function, file, line, tensor_info, std::forward<T>(dt), std::forward<Ts>(dts)...)); + const size_t tensor_nc = tensor_info->num_channels(); + ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG(tensor_nc != num_channels, function, file, line, "Number of channels %d. Required number of channels %d", tensor_nc, num_channels); + return arm_compute::Status{}; +} +/** Return an error if the data type or the number of channels of the passed tensor does not match any of the data types and number of channels provided. + * + * @param[in] function Function in which the error occurred. + * @param[in] file Name of the file where the error occurred. + * @param[in] line Line on which the error occurred. 
+ * @param[in] tensor Tensor to validate. + * @param[in] num_channels Number of channels to check + * @param[in] dt First data type allowed. + * @param[in] dts (Optional) Further allowed data types. + * + * @return Status + */ +template <typename T, typename... Ts> +inline arm_compute::Status error_on_data_type_channel_not_in(const char *function, const char *file, const int line, + const ITensor *tensor, size_t num_channels, T &&dt, Ts &&... dts) +{ + ARM_COMPUTE_RETURN_ERROR_ON_LOC(tensor == nullptr, function, file, line); + ARM_COMPUTE_RETURN_ON_ERROR(error_on_data_type_channel_not_in(function, file, line, tensor->info(), num_channels, std::forward<T>(dt), std::forward<Ts>(dts)...)); + return arm_compute::Status{}; +} +#define ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(t, c, ...) \ + ARM_COMPUTE_ERROR_THROW_ON(::arm_compute::error_on_data_type_channel_not_in(__func__, __FILE__, __LINE__, t, c, __VA_ARGS__)) +#define ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(t, c, ...) \ + ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_data_type_channel_not_in(__func__, __FILE__, __LINE__, t, c, __VA_ARGS__)) -/** Throw an error if the channel is not in channels. +/** Return an error if the tensor is not 2D. + * + * @param[in] function Function in which the error occurred. + * @param[in] file Name of the file where the error occurred. + * @param[in] line Line on which the error occurred. + * @param[in] tensor Tensor to validate. * - * @param[in] function Function in which the error occurred. - * @param[in] file Name of the file where the error occurred. - * @param[in] line Line on which the error occurred. - * @param[in] cn Input channel - * @param[in] channel First channel allowed. - * @param[in] channels (Optional) Further allowed channels. + * @return Status + */ +arm_compute::Status error_on_tensor_not_2d(const char *function, const char *file, const int line, + const ITensor *tensor); +#define ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(t) \ + ARM_COMPUTE_ERROR_THROW_ON(::arm_compute::error_on_tensor_not_2d(__func__, __FILE__, __LINE__, t)) +#define ARM_COMPUTE_RETURN_ERROR_ON_TENSOR_NOT_2D(t) \ + ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_tensor_not_2d(__func__, __FILE__, __LINE__, t)) + +/** Return an error if the channel is not in channels. + * + * @param[in] function Function in which the error occurred. + * @param[in] file Name of the file where the error occurred. + * @param[in] line Line on which the error occurred. + * @param[in] cn Input channel + * @param[in] channel First channel allowed. + * @param[in] channels (Optional) Further allowed channels. + * + * @return Status */ template <typename T, typename... Ts> -void error_on_channel_not_in(const char *function, const char *file, const int line, - T cn, T &&channel, Ts &&... channels) +inline arm_compute::Status error_on_channel_not_in(const char *function, const char *file, const int line, + T cn, T &&channel, Ts &&... channels) { - ARM_COMPUTE_ERROR_ON_LOC(cn == Channel::UNKNOWN, function, file, line); + ARM_COMPUTE_RETURN_ERROR_ON_LOC(cn == Channel::UNKNOWN, function, file, line); const std::array<T, sizeof...(Ts)> channels_array{ { std::forward<Ts>(channels)... 
} }; - ARM_COMPUTE_UNUSED(channels_array); - ARM_COMPUTE_ERROR_ON_LOC(channel != cn && std::none_of(channels_array.begin(), channels_array.end(), [&](const T & f) + ARM_COMPUTE_RETURN_ERROR_ON_LOC(channel != cn && std::none_of(channels_array.begin(), channels_array.end(), [&](const T & f) { return f == cn; }), function, file, line); + return arm_compute::Status{}; } -#define ARM_COMPUTE_ERROR_ON_CHANNEL_NOT_IN(c, ...) ::arm_compute::error_on_channel_not_in(__func__, __FILE__, __LINE__, c, __VA_ARGS__) +#define ARM_COMPUTE_ERROR_ON_CHANNEL_NOT_IN(c, ...) \ + ARM_COMPUTE_ERROR_THROW_ON(::arm_compute::error_on_channel_not_in(__func__, __FILE__, __LINE__, c, __VA_ARGS__)) +#define ARM_COMPUTE_RETURN_ERROR_ON_CHANNEL_NOT_IN(c, ...) \ + ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_channel_not_in(__func__, __FILE__, __LINE__, c, __VA_ARGS__)) -/** Throw an error if the channel is not in format. +/** Return an error if the channel is not in format. + * + * @param[in] function Function in which the error occurred. + * @param[in] file Name of the file where the error occurred. + * @param[in] line Line on which the error occurred. + * @param[in] fmt Input channel + * @param[in] cn First channel allowed. * - * @param[in] function Function in which the error occurred. - * @param[in] file Name of the file where the error occurred. - * @param[in] line Line on which the error occurred. - * @param[in] fmt Input channel - * @param[in] cn First channel allowed. + * @return Status */ -void error_on_channel_not_in_known_format(const char *function, const char *file, const int line, - Format fmt, Channel cn); -#define ARM_COMPUTE_ERROR_ON_CHANNEL_NOT_IN_KNOWN_FORMAT(f, c) ::arm_compute::error_on_channel_not_in_known_format(__func__, __FILE__, __LINE__, f, c) +arm_compute::Status error_on_channel_not_in_known_format(const char *function, const char *file, const int line, + Format fmt, Channel cn); +#define ARM_COMPUTE_ERROR_ON_CHANNEL_NOT_IN_KNOWN_FORMAT(f, c) \ + ARM_COMPUTE_ERROR_THROW_ON(::arm_compute::error_on_channel_not_in_known_format(__func__, __FILE__, __LINE__, f, c)) +#define ARM_COMPUTE_RETURN_ERROR_ON_CHANNEL_NOT_IN_KNOWN_FORMAT(f, c) \ + ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_channel_not_in_known_format(__func__, __FILE__, __LINE__, f, c)) -/** Throw an error if the @ref IMultiHOG container is invalid +/** Return an error if the @ref IMultiHOG container is invalid * * An @ref IMultiHOG container is invalid if: * @@ -484,27 +742,35 @@ void error_on_channel_not_in_known_format(const char *function, const char *file * -# it doesn't contain models * -# it doesn't have the HOG data objects with the same phase_type, normalization_type and l2_hyst_threshold (if normalization_type == L2HYS_NORM) * - * @param[in] function Function in which the error occurred. - * @param[in] file Name of the file where the error occurred. - * @param[in] line Line on which the error occurred. - * @param[in] multi_hog IMultiHOG container to validate - */ -void error_on_invalid_multi_hog(const char *function, const char *file, const int line, - const IMultiHOG *multi_hog); -#define ARM_COMPUTE_ERROR_ON_INVALID_MULTI_HOG(m) ::arm_compute::error_on_invalid_multi_hog(__func__, __FILE__, __LINE__, m) - -/** Throw an error if the kernel is not configured. + * @param[in] function Function in which the error occurred. + * @param[in] file Name of the file where the error occurred. + * @param[in] line Line on which the error occurred. 
+ * @param[in] multi_hog IMultiHOG container to validate * - * @param[in] function Function in which the error occurred. - * @param[in] file Name of the file where the error occurred. - * @param[in] line Line on which the error occurred. - * @param[in] kernel Kernel to validate. + * @return Status */ -void error_on_unconfigured_kernel(const char *function, const char *file, const int line, - const IKernel *kernel); -#define ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(k) ::arm_compute::error_on_unconfigured_kernel(__func__, __FILE__, __LINE__, k) +arm_compute::Status error_on_invalid_multi_hog(const char *function, const char *file, const int line, + const IMultiHOG *multi_hog); +#define ARM_COMPUTE_ERROR_ON_INVALID_MULTI_HOG(m) \ + ARM_COMPUTE_ERROR_THROW_ON(::arm_compute::error_on_invalid_multi_hog(__func__, __FILE__, __LINE__, m)) +#define ARM_COMPUTE_RETURN_ERROR_ON_INVALID_MULTI_HOG(m) \ + ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_invalid_multi_hog(__func__, __FILE__, __LINE__, m)) + +/** Return an error if the kernel is not configured. + * + * @param[in] function Function in which the error occurred. + * @param[in] file Name of the file where the error occurred. + * @param[in] line Line on which the error occurred. + * @param[in] kernel Kernel to validate. + */ +arm_compute::Status error_on_unconfigured_kernel(const char *function, const char *file, const int line, + const IKernel *kernel); +#define ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(k) \ + ARM_COMPUTE_ERROR_THROW_ON(::arm_compute::error_on_unconfigured_kernel(__func__, __FILE__, __LINE__, k)) +#define ARM_COMPUTE_RETURN_ERROR_ON_UNCONFIGURED_KERNEL(k) \ + ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_unconfigured_kernel(__func__, __FILE__, __LINE__, k)) -/** Throw an error if if the coordinates and shape of the subtensor are within the parent tensor. +/** Return an error if if the coordinates and shape of the subtensor are within the parent tensor. * * @param[in] function Function in which the error occurred. * @param[in] file Name of the file where the error occurred. @@ -512,68 +778,122 @@ void error_on_unconfigured_kernel(const char *function, const char *file, const * @param[in] parent_shape Parent tensor shape * @param[in] coords Coordinates inside the parent tensor where the first element of the subtensor is * @param[in] shape Shape of the subtensor + * + * @return Status */ -void error_on_invalid_subtensor(const char *function, const char *file, const int line, - const TensorShape &parent_shape, const Coordinates &coords, const TensorShape &shape); -#define ARM_COMPUTE_ERROR_ON_INVALID_SUBTENSOR(p, c, s) ::arm_compute::error_on_invalid_subtensor(__func__, __FILE__, __LINE__, p, c, s) +arm_compute::Status error_on_invalid_subtensor(const char *function, const char *file, const int line, + const TensorShape &parent_shape, const Coordinates &coords, const TensorShape &shape); +#define ARM_COMPUTE_ERROR_ON_INVALID_SUBTENSOR(p, c, s) \ + ARM_COMPUTE_ERROR_THROW_ON(::arm_compute::error_on_invalid_subtensor(__func__, __FILE__, __LINE__, p, c, s)) +#define ARM_COMPUTE_RETURN_ERROR_ON_INVALID_SUBTENSOR(p, c, s) \ + ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_invalid_subtensor(__func__, __FILE__, __LINE__, p, c, s)) -/** Throw an error if the valid region of a subtensor is not inside the valid region of the parent tensor. +/** Return an error if the valid region of a subtensor is not inside the valid region of the parent tensor. * * @param[in] function Function in which the error occurred. 
* @param[in] file Name of the file where the error occurred. * @param[in] line Line on which the error occurred. * @param[in] parent_valid_region Parent valid region. * @param[in] valid_region Valid region of subtensor. - */ -void error_on_invalid_subtensor_valid_region(const char *function, const char *file, const int line, - const ValidRegion &parent_valid_region, const ValidRegion &valid_region); -#define ARM_COMPUTE_ERROR_ON_INVALID_SUBTENSOR_VALID_REGION(pv, sv) ::arm_compute::error_on_invalid_subtensor_valid_region(__func__, __FILE__, __LINE__, pv, sv) - -/** Throw an error if the input fixed-point positions are different. * - * @param[in] function Function in which the error occurred. - * @param[in] file Name of the file where the error occurred. - * @param[in] line Line on which the error occurred. - * @param[in] tensor_1 The first tensor to be compared. - * @param[in] tensor_2 The second tensor to be compared. - * @param[in] tensors (Optional) Further allowed tensors. + * @return Status + */ +arm_compute::Status error_on_invalid_subtensor_valid_region(const char *function, const char *file, const int line, + const ValidRegion &parent_valid_region, const ValidRegion &valid_region); +#define ARM_COMPUTE_ERROR_ON_INVALID_SUBTENSOR_VALID_REGION(pv, sv) \ + ARM_COMPUTE_ERROR_THROW_ON(::arm_compute::error_on_invalid_subtensor_valid_region(__func__, __FILE__, __LINE__, pv, sv)) +#define ARM_COMPUTE_RETURN_ERROR_ON_INVALID_SUBTENSOR_VALID_REGION(pv, sv) \ + ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_invalid_subtensor_valid_region(__func__, __FILE__, __LINE__, pv, sv)) + +/** Return an error if the input fixed-point positions are different. + * + * @param[in] function Function in which the error occurred. + * @param[in] file Name of the file where the error occurred. + * @param[in] line Line on which the error occurred. + * @param[in] tensor_info_1 The first tensor info to be compared. + * @param[in] tensor_info_2 The second tensor info to be compared. + * @param[in] tensor_infos (Optional) Further allowed tensor infos. + * + * @return Status */ template <typename... Ts> -void error_on_mismatching_fixed_point_position(const char *function, const char *file, const int line, - const ITensor *tensor_1, const ITensor *tensor_2, Ts... tensors) +inline arm_compute::Status error_on_mismatching_fixed_point_position(const char *function, const char *file, const int line, + const ITensorInfo *tensor_info_1, const ITensorInfo *tensor_info_2, Ts... tensor_infos) { - const std::array < const ITensor *, 1 + sizeof...(Ts) > tensors_array{ { tensor_2, std::forward<Ts>(tensors)... } }; - ARM_COMPUTE_UNUSED(tensors_array); - - ARM_COMPUTE_ERROR_ON_LOC_MSG(std::any_of(tensors_array.begin(), tensors_array.end(), [&](const ITensor * tensor) + const std::array < const ITensorInfo *, 1 + sizeof...(Ts) > tensor_info_array{ { tensor_info_2, std::forward<Ts>(tensor_infos)... } }; + ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG(std::any_of(tensor_info_array.begin(), tensor_info_array.end(), [&](const ITensorInfo * tensor_info) { - return tensor->info()->fixed_point_position() != tensor_1->info()->fixed_point_position(); + return tensor_info->fixed_point_position() != tensor_info_1->fixed_point_position(); }), function, file, line, "Tensors have different fixed-point positions"); + return arm_compute::Status{}; } -#define ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(...) 
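As a further hypothetical sketch, the per-tensor whitelist checks compose the same way; the single-channel F16/F32 restriction below is purely an example, not a real kernel's requirement:

    // Illustrative only: restrict a hypothetical kernel to 1-channel F16/F32 input
    // and require the output to share the input's data type.
    arm_compute::Status validate_float_input(const arm_compute::ITensorInfo *input, const arm_compute::ITensorInfo *output)
    {
        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, arm_compute::DataType::F16, arm_compute::DataType::F32);
        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
        return arm_compute::Status{};
    }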
::arm_compute::error_on_mismatching_fixed_point_position(__func__, __FILE__, __LINE__, __VA_ARGS__) - -/** Throw an error if the fixed-point value is not representable in the specified Q format. +/** Return an error if the input fixed-point positions are different. * - * @param[in] function Function in which the error occurred. - * @param[in] file Name of the file where the error occurred. - * @param[in] line Line on which the error occurred. - * @param[in] value The floating point value to be checked. - * @param[in] tensor Input tensor that has information on data type and fixed-point position. + * @param[in] function Function in which the error occurred. + * @param[in] file Name of the file where the error occurred. + * @param[in] line Line on which the error occurred. + * @param[in] tensor_1 The first tensor to be compared. + * @param[in] tensor_2 The second tensor to be compared. + * @param[in] tensors (Optional) Further allowed tensors. + * + * @return Status */ template <typename... Ts> -void error_on_value_not_representable_in_fixed_point(const char *function, const char *file, int line, - float value, const ITensor *tensor) +inline arm_compute::Status error_on_mismatching_fixed_point_position(const char *function, const char *file, const int line, + const ITensor *tensor_1, const ITensor *tensor_2, Ts... tensors) { - const int fixed_point_position = tensor->info()->fixed_point_position(); - const DataType dt = tensor->info()->data_type(); + ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_mismatching_fixed_point_position(function, file, line, tensor_1->info(), tensor_2->info(), + detail::get_tensor_info_t<ITensorInfo *>()(tensors)...)); + return arm_compute::Status{}; +} +#define ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(...) \ + ARM_COMPUTE_ERROR_THROW_ON(::arm_compute::error_on_mismatching_fixed_point_position(__func__, __FILE__, __LINE__, __VA_ARGS__)) +#define ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(...) \ + ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_mismatching_fixed_point_position(__func__, __FILE__, __LINE__, __VA_ARGS__)) + +/** Return an error if the fixed-point value is not representable in the specified Q format. + * + * @param[in] function Function in which the error occurred. + * @param[in] file Name of the file where the error occurred. + * @param[in] line Line on which the error occurred. + * @param[in] value The floating point value to be checked. + * @param[in] tensor_info Input tensor info that has information on data type and fixed-point position. 
+ * + * @return Status + */ +inline arm_compute::Status error_on_value_not_representable_in_fixed_point(const char *function, const char *file, int line, + float value, const ITensorInfo *tensor_info) +{ + const int fixed_point_position = tensor_info->fixed_point_position(); + const DataType dt = tensor_info->data_type(); const unsigned int q_max_range = 0xFFFFFFFFu >> (((sizeof(unsigned int) - element_size_from_data_type(dt)) * 8) + 1); const float max_range = q_max_range / (static_cast<float>(1 << fixed_point_position)); - ARM_COMPUTE_UNUSED(max_range); - ARM_COMPUTE_ERROR_ON_LOC_MSG(value > max_range, function, file, line, - "Value %f is not representable in %s with fixed-point position %d", value, string_from_data_type(dt).c_str(), fixed_point_position); + ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG(value > max_range, function, file, line, + "Value %f is not representable in %s with fixed-point position %d", value, string_from_data_type(dt).c_str(), fixed_point_position); + return arm_compute::Status{}; +} +/** Return an error an error if the fixed-point value is not representable in the specified Q format. + * + * @param[in] function Function in which the error occurred. + * @param[in] file Name of the file where the error occurred. + * @param[in] line Line on which the error occurred. + * @param[in] value The floating point value to be checked. + * @param[in] tensor Input tensor that has information on data type and fixed-point position. + * + * @return Status + */ +inline arm_compute::Status error_on_value_not_representable_in_fixed_point(const char *function, const char *file, int line, + float value, const ITensor *tensor) +{ + ARM_COMPUTE_RETURN_ERROR_ON_LOC(tensor == nullptr, function, file, line); + ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_value_not_representable_in_fixed_point(function, file, line, value, tensor->info())); + return arm_compute::Status{}; } -#define ARM_COMPUTE_ERROR_ON_VALUE_NOT_REPRESENTABLE_IN_FIXED_POINT(...) ::arm_compute::error_on_value_not_representable_in_fixed_point(__func__, __FILE__, __LINE__, __VA_ARGS__) +#define ARM_COMPUTE_ERROR_ON_VALUE_NOT_REPRESENTABLE_IN_FIXED_POINT(...) \ + ARM_COMPUTE_ERROR_THROW_ON(::arm_compute::error_on_value_not_representable_in_fixed_point(__func__, __FILE__, __LINE__, __VA_ARGS__)) +#define ARM_COMPUTE_RETURN_ERROR_ON_VALUE_NOT_REPRESENTABLE_IN_FIXED_POINT(...) \ + ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_value_not_representable_in_fixed_point(__func__, __FILE__, __LINE__, __VA_ARGS__)) } #endif /* __ARM_COMPUTE_VALIDATE_H__*/ diff --git a/arm_compute/core/Logger.h b/arm_compute/core/utils/io/FileHandler.h index 0848479d3..d915dbe28 100644 --- a/arm_compute/core/Logger.h +++ b/arm_compute/core/utils/io/FileHandler.h @@ -21,51 +21,56 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ +#ifndef __ARM_COMPUTE_IO_FILE_HANDLER_H__ +#define __ARM_COMPUTE_IO_FILE_HANDLER_H__ -#ifndef __ARM_COMPUTE_LOGGER_H__ -#define __ARM_COMPUTE_LOGGER_H__ - -#include <iostream> -#include <memory> - -#ifdef ARM_COMPUTE_DEBUG_ENABLED -#define ARM_COMPUTE_LOG(x) (arm_compute::Logger::get().log_info() << x) -#else /* ARM_COMPUTE_DEBUG_ENABLED */ -#define ARM_COMPUTE_LOG(...) 
-#endif /* ARM_COMPUTE_DEBUG_ENABLED */ +#include <fstream> +#include <string> namespace arm_compute { -/**< Verbosity of the logger */ -enum class LoggerVerbosity +namespace io { - NONE, /**< No info */ - INFO /**< Log info */ -}; - -/** Logger singleton class */ -class Logger +/** File Handling interface */ +class FileHandler { public: - static Logger &get(); - void set_logger(std::ostream &ostream, LoggerVerbosity verbosity); - std::ostream &log_info(); - -private: - /** Default constructor */ - Logger(); + /** Default Constructor */ + FileHandler(); + /** Default Destructor */ + ~FileHandler(); /** Allow instances of this class to be moved */ - Logger(Logger &&) = default; + FileHandler(FileHandler &&) = default; /** Prevent instances of this class from being copied (As this class contains pointers) */ - Logger(const Logger &) = delete; + FileHandler(const FileHandler &) = delete; /** Prevent instances of this class from being copied (As this class contains pointers) */ - Logger &operator=(const Logger &) = delete; + FileHandler &operator=(const FileHandler &) = delete; /** Allow instances of this class to be moved */ - Logger &operator=(Logger &&) = default; + FileHandler &operator=(FileHandler &&) = default; + /** Opens file + * + * @param[in] filename File name + * @param[in] mode File open mode + */ + void open(const std::string &filename, std::ios_base::openmode mode); + /** Closes file */ + void close(); + /** Returns the file stream + * + * @return File stream + */ + std::fstream &stream(); + /** Returns filename of the handled file + * + * @return File filename + */ + std::string filename() const; - std::ostream *_ostream; - std::ostream _nullstream; - LoggerVerbosity _verbosity; +private: + std::fstream _filestream; + std::string _filename; + std::ios_base::openmode _mode; }; -} // arm_compute -#endif /* __ARM_COMPUTE_LOGGER_H__ */
\ No newline at end of file +} // namespace io +} // namespace arm_compute +#endif /* __ARM_COMPUTE_IO_FILE_HANDLER_H__ */ diff --git a/arm_compute/core/utils/logging/FilePrinter.h b/arm_compute/core/utils/logging/FilePrinter.h new file mode 100644 index 000000000..e2ae95208 --- /dev/null +++ b/arm_compute/core/utils/logging/FilePrinter.h @@ -0,0 +1,54 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_LOGGING_FILE_PRINTER_H__ +#define __ARM_COMPUTE_LOGGING_FILE_PRINTER_H__ + +#include "arm_compute/core/utils/logging/IPrinter.h" + +#include "arm_compute/core/utils/io/FileHandler.h" + +namespace arm_compute +{ +namespace logging +{ +/** File Printer */ +class FilePrinter final : public Printer +{ +public: + /** Default Constructor + * + * @param[in] filename File name + */ + FilePrinter(const std::string &filename); + +private: + // Inherited methods overridden: + void print_internal(const std::string &msg) override; + +private: + io::FileHandler _handler; +}; +} // namespace logging +} // namespace arm_compute +#endif /* __ARM_COMPUTE_LOGGING_FILE_PRINTER_H__ */ diff --git a/arm_compute/core/utils/logging/Helpers.h b/arm_compute/core/utils/logging/Helpers.h new file mode 100644 index 000000000..4bc54e80d --- /dev/null +++ b/arm_compute/core/utils/logging/Helpers.h @@ -0,0 +1,77 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
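For illustration, the new io::FileHandler and the FilePrinter built on top of it might be used as follows; the file names are arbitrary examples and the open mode is just one plausible choice:

    #include "arm_compute/core/utils/io/FileHandler.h"
    #include "arm_compute/core/utils/logging/FilePrinter.h"

    void file_output_example()
    {
        // Raw file handling: open, write through the wrapped std::fstream, close.
        arm_compute::io::FileHandler handler;
        handler.open("example.txt", std::ios::out | std::ios::trunc);
        handler.stream() << "plain text line" << std::endl;
        handler.close();

        // FilePrinter owns its own FileHandler and exposes the generic Printer::print() entry point.
        arm_compute::logging::FilePrinter printer("example.log");
        printer.print("message routed through a logging printer");
    }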
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_LOGGING_HELPERS_H__ +#define __ARM_COMPUTE_LOGGING_HELPERS_H__ + +#include "arm_compute/core/utils/logging/Types.h" +#include "support/ToolchainSupport.h" + +#include <cstddef> +#include <cstdio> +#include <memory> +#include <sstream> +#include <string> + +namespace arm_compute +{ +namespace logging +{ +/** Create a string given a format + * + * @param[in] fmt String format + * @param[in] args Arguments + * + * @return The formatted string + */ +template <typename... Ts> +inline std::string string_with_format(const std::string &fmt, Ts &&... args) +{ + size_t size = support::cpp11::snprintf(nullptr, 0, fmt.c_str(), args...) + 1; + auto char_str = support::cpp14::make_unique<char[]>(size); + support::cpp11::snprintf(char_str.get(), size, fmt.c_str(), args...); + return std::string(char_str.get(), char_str.get() + size - 1); +} +/** Wraps a value with angles and returns the string + * + * @param[in] val Value to wrap + * + * @return Wrapped string + */ +template <typename T> +inline std::string angle_wrap_value(const T &val) +{ + std::ostringstream ss; + ss << "[" << val << "]"; + return ss.str(); +} +/** Translates a given log level to a string. + * + * @param[in] log_level @ref LogLevel to be translated to string. + * + * @return The string describing the logging level. + */ +const std::string &string_from_log_level(LogLevel log_level); +} // namespace logging +} // namespace arm_compute +#endif /* __ARM_COMPUTE_LOGGING_HELPERS_H__ */ diff --git a/arm_compute/core/utils/logging/IPrinter.h b/arm_compute/core/utils/logging/IPrinter.h new file mode 100644 index 000000000..6b410d4d1 --- /dev/null +++ b/arm_compute/core/utils/logging/IPrinter.h @@ -0,0 +1,74 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
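A small sketch of the formatting helpers above; the message text and values are arbitrary:

    #include "arm_compute/core/utils/logging/Helpers.h"

    #include <string>

    std::string helpers_example()
    {
        // printf-style formatting into a std::string.
        const std::string msg = arm_compute::logging::string_with_format("Tensor %s has %d dimensions", "input", 4);
        // Wraps any streamable value in square brackets, e.g. "[42]".
        const std::string tag = arm_compute::logging::angle_wrap_value(42);
        return tag + " " + msg;
    }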
+ */ +#ifndef __ARM_COMPUTE_LOGGING_PRINTER_H__ +#define __ARM_COMPUTE_LOGGING_PRINTER_H__ + +#include "support/Mutex.h" + +namespace arm_compute +{ +namespace logging +{ +/** Base printer class to be inherited by other printer classes */ +class Printer +{ +public: + /** Default Constructor */ + Printer() + : _mtx() + { + } + /** Prevent instances of this class from being copied */ + Printer(const Printer &) = delete; + /** Prevent instances of this class from being copied */ + Printer &operator=(const Printer &) = delete; + /** Prevent instances of this class from being moved */ + Printer(Printer &&) = delete; + /** Prevent instances of this class from being moved */ + Printer &operator=(Printer &&) = delete; + /** Defaults Destructor */ + virtual ~Printer() = default; + /** Print message + * + * @param[in] msg Message to print + */ + inline void print(const std::string &msg) + { + std::lock_guard<arm_compute::Mutex> lock(_mtx); + print_internal(msg); + } + +private: + /** Interface to be implemented by the child to print a message + * + * @param[in] msg Message to print + */ + virtual void print_internal(const std::string &msg) = 0; + +private: + arm_compute::Mutex _mtx; +}; +} // namespace logging +} // namespace arm_compute +#endif /* __ARM_COMPUTE_LOGGING_PRINTER_H__ */ diff --git a/arm_compute/core/utils/logging/LogMsgDecorators.h b/arm_compute/core/utils/logging/LogMsgDecorators.h new file mode 100644 index 000000000..0ffb438be --- /dev/null +++ b/arm_compute/core/utils/logging/LogMsgDecorators.h @@ -0,0 +1,139 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
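As an illustration of the Printer interface, a user-defined sink only needs to override print_internal(); the std::cerr printer below is hypothetical and not part of the library:

    #include "arm_compute/core/utils/logging/IPrinter.h"

    #include <iostream>
    #include <string>

    // Hypothetical printer that forwards every message to std::cerr.
    class StderrPrinter final : public arm_compute::logging::Printer
    {
    private:
        // Printer::print() already serialises callers with a mutex, so the override
        // only has to perform the actual output.
        void print_internal(const std::string &msg) override
        {
            std::cerr << msg << std::endl;
        }
    };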
+ */ +#ifndef __ARM_COMPUTE_LOGGING_LOG_MSG_DECORATORS_H__ +#define __ARM_COMPUTE_LOGGING_LOG_MSG_DECORATORS_H__ + +#include "arm_compute/core/utils/logging/Helpers.h" +#include "arm_compute/core/utils/logging/Types.h" + +#include <chrono> +#include <ctime> +#include <string> +#include <thread> + +namespace arm_compute +{ +namespace logging +{ +/** Log message decorator interface */ +class IDecorator +{ +public: + /** Default Destructor */ + virtual ~IDecorator() = default; + /** Decorates log message + * + * @param[in] log_msg Log message to decorate + */ + virtual void decorate(LogMsg &log_msg) = 0; +}; + +/** String Decorator + * + * Appends a user defined string in the log message + */ +class StringDecorator : public IDecorator +{ +public: + /** Defaults constructor + * + * @param str Sting to append + */ + StringDecorator(const std::string &str) + : _str(str) + { + _str = angle_wrap_value(str); + } + + // Inherited methods overridden: + void decorate(LogMsg &log_msg) override + { + log_msg.raw_ += _str; + } + +private: + std::string _str; +}; + +/** Date Decorator + * + * Appends the date and time in the log message + */ +class DateDecorator : public IDecorator +{ +public: + // Inherited methods overridden: + void decorate(LogMsg &log_msg) override + { + log_msg.raw_ += angle_wrap_value(get_time()); + } + +private: + /** Gets current system local time + * + * @return Local time + */ + std::string get_time() + { + auto now = std::chrono::system_clock::now(); + auto time = std::chrono::system_clock::to_time_t(now); + + char buf[100] = { 0 }; + std::strftime(buf, sizeof(buf), "%d-%m-%Y %I:%M:%S", std::localtime(&time)); + return buf; + } +}; + +/** Thread ID Decorator + * + * Appends the thread ID in the log message + */ +class ThreadIdDecorator : public IDecorator +{ +public: + // Inherited methods overridden: + void decorate(LogMsg &log_msg) override + { +#ifndef NO_MULTI_THREADING + log_msg.raw_ += angle_wrap_value(std::this_thread::get_id()); +#endif /* NO_MULTI_THREADING */ + } +}; + +/** Log Level Decorator + * + * Appends the logging level in the log message + */ +class LogLevelDecorator : public IDecorator +{ +public: + // Inherited methods overridden: + void decorate(LogMsg &log_msg) override + { + log_msg.raw_ += angle_wrap_value(string_from_log_level(log_msg.log_level_)); + } +}; +} // namespace logging +} // namespace arm_compute +#endif /* __ARM_COMPUTE_LOGGING_LOG_MSG_DECORATORS_H__ */ diff --git a/arm_compute/core/utils/logging/Logger.h b/arm_compute/core/utils/logging/Logger.h new file mode 100644 index 000000000..eb9bdd2e3 --- /dev/null +++ b/arm_compute/core/utils/logging/Logger.h @@ -0,0 +1,176 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
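For illustration, a custom decorator follows the same pattern as the built-in ones: implement decorate() and append to the raw message. The fixed "GEMM" tag below is an arbitrary example:

    #include "arm_compute/core/utils/logging/LogMsgDecorators.h"

    // Hypothetical decorator that tags every message with a fixed subsystem name,
    // reusing the angle_wrap_value() helper the built-in decorators rely on.
    class SubsystemDecorator final : public arm_compute::logging::IDecorator
    {
    public:
        void decorate(arm_compute::logging::LogMsg &log_msg) override
        {
            log_msg.raw_ += arm_compute::logging::angle_wrap_value("GEMM");
        }
    };

Such a decorator would be attached to a logger with Logger::add_decorator(), introduced below.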
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_LOGGING_LOGGER_H__ +#define __ARM_COMPUTE_LOGGING_LOGGER_H__ + +#include "arm_compute/core/utils/logging/Helpers.h" +#include "arm_compute/core/utils/logging/IPrinter.h" +#include "arm_compute/core/utils/logging/LogMsgDecorators.h" +#include "arm_compute/core/utils/logging/Types.h" + +#include <memory> +#include <sstream> +#include <string> +#include <vector> + +namespace arm_compute +{ +namespace logging +{ +/** Logger class */ +class Logger +{ +public: + /** Default Constructor + * + * @param[in] name Name of the logger + * @param[in] log_level Logger log level + * @param[in] printer Printer to push the messages + */ + Logger(std::string name, LogLevel log_level, std::shared_ptr<Printer> printer); + /** Default Constructor + * + * @param[in] name Name of the logger + * @param[in] log_level Logger log level + * @param[in] printers Printers to push the messages + */ + Logger(std::string name, LogLevel log_level, std::vector<std::shared_ptr<Printer>> printers = {}); + /** Default Constructor + * + * @param[in] name Name of the logger + * @param[in] log_level Logger log level + * @param[in] printers Printers to push the messages + * @param[in] decorators Message decorators, which append information in the logged message + */ + Logger(std::string name, + LogLevel log_level, + std::vector<std::shared_ptr<Printer>> printers, + std::vector<std::unique_ptr<IDecorator>> decorators); + /** Allow instances of this class to be moved */ + Logger(Logger &&) = default; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + Logger(const Logger &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + Logger &operator=(const Logger &) = delete; + /** Allow instances of this class to be moved */ + Logger &operator=(Logger &&) = default; + /** Logs a message + * + * @param[in] log_level Log level of the message + * @param[in] msg Message to log + */ + void log(LogLevel log_level, const std::string &msg); + /** Logs a formatted message + * + * @param[in] log_level Log level of the message + * @param[in] fmt Message format + * @param[in] args Message arguments + */ + template <typename... Ts> + void log(LogLevel log_level, const std::string &fmt, Ts &&... 
args); + /** Sets log level of the logger + * + * @warning Not thread-safe + * + * @param[in] log_level Log level to set + */ + void set_log_level(LogLevel log_level); + /** Returns logger's log level + * + * @return Logger's log level + */ + LogLevel log_level() const; + /** Returns logger's name + * + * @return Logger's name + */ + std::string name() const; + /** Adds a printer to the logger + * + * @warning Not thread-safe + * + * @param[in] printer + */ + void add_printer(std::shared_ptr<Printer> printer); + /** Adds a log message decorator to the logger + * + * @warning Not thread-safe + * + * @param[in] decorator + */ + void add_decorator(std::unique_ptr<IDecorator> decorator); + +private: + /** Set default message decorators */ + void set_default_decorators(); + /** Checks if a message should be logged depending + * on the message log level and the loggers one + * + * @param[in] log_level Log level + * + * @return True if message should be logged else false + */ + bool is_loggable(LogLevel log_level); + /** Decorate log message + * + * @param[in] Log message to decorate + */ + void decorate_log_msg(LogMsg &msg); + /** Creates final log message by creating the prefix + * + * @param[in] str Log message + * @param[in] log_level Message's log level + * + * @return Final log message to print + */ + std::string create_log_msg(const std::string &str, LogLevel log_level); + /** Prints the message to all the printers + * + * @param[in] msg Message to print + */ + void print_all(const std::string &msg); + +private: + std::string _name; + LogLevel _log_level; + std::vector<std::shared_ptr<Printer>> _printers; + std::vector<std::unique_ptr<IDecorator>> _decorators; +}; + +template <typename... Ts> +inline void Logger::log(LogLevel log_level, const std::string &fmt, Ts &&... args) +{ + // Return if message shouldn't be logged + // i.e. if log level does not match the logger's + if(!is_loggable(log_level)) + { + return; + } + + // Print message to all printers + print_all(create_log_msg(string_with_format(fmt, args...), log_level)); +} +} // namespace logging +} // namespace arm_compute +#endif /* __ARM_COMPUTE_LOGGING_LOGGER_H__ */ diff --git a/arm_compute/core/utils/logging/LoggerRegistry.h b/arm_compute/core/utils/logging/LoggerRegistry.h new file mode 100644 index 000000000..d3c691139 --- /dev/null +++ b/arm_compute/core/utils/logging/LoggerRegistry.h @@ -0,0 +1,90 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
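A brief, illustrative sketch of direct Logger use; the logger name, level and messages are arbitrary:

    #include "arm_compute/core/utils/logging/Logger.h"
    #include "arm_compute/core/utils/logging/StdPrinter.h"

    #include <memory>

    void logger_example()
    {
        // One logger at INFO level, pushing its messages to stdout through a StdPrinter.
        arm_compute::logging::Logger logger("EXAMPLE", arm_compute::logging::LogLevel::INFO,
                                            std::make_shared<arm_compute::logging::StdPrinter>());

        logger.log(arm_compute::logging::LogLevel::INFO, "configure() finished");
        // The variadic overload formats the message before printing it.
        logger.log(arm_compute::logging::LogLevel::INFO, "ran %d iterations", 10);
    }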
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_LOGGING_LOGGER_REGISTRY_H__ +#define __ARM_COMPUTE_LOGGING_LOGGER_REGISTRY_H__ + +#include "arm_compute/core/utils/logging/Logger.h" +#include "arm_compute/core/utils/logging/Printers.h" +#include "arm_compute/core/utils/logging/Types.h" +#include "support/Mutex.h" + +#include <memory> +#include <set> +#include <unordered_map> + +namespace arm_compute +{ +namespace logging +{ +/** Registry class holding all the instantiated loggers */ +class LoggerRegistry final +{ +public: + /** Gets registry instance + * + * @return Logger registry instance + */ + static LoggerRegistry &get(); + /** Creates a logger + * + * @note Some names are reserved e.g. [CORE, RUNTIME, GRAPH] + * + * @param[in] name Logger's name + * @param[in] log_level Logger's log level. Defaults to @ref LogLevel::INFO + * @param[in] printers Printers to attach to the system loggers. Defaults with a @ref StdPrinter. + */ + void create_logger(const std::string &name, LogLevel log_level = LogLevel::INFO, + std::vector<std::shared_ptr<Printer>> printers = { std::make_shared<StdPrinter>() }); + /** Remove a logger + * + * @param name Logger's name + */ + void remove_logger(const std::string &name); + /** Returns a logger instance + * + * @param[in] name Logger to return + * + * @return Logger + */ + std::shared_ptr<Logger> logger(const std::string &name); + /** Creates reserved library loggers + * + * @param[in] log_level (Optional) Logger's log level. Defaults to @ref LogLevel::INFO + * @param[in] printers (Optional) Printers to attach to the system loggers. Defaults with a @ref StdPrinter. + */ + void create_reserved_loggers(LogLevel log_level = LogLevel::INFO, + std::vector<std::shared_ptr<Printer>> printers = { std::make_shared<StdPrinter>() }); + +private: + /** Default constructor */ + LoggerRegistry(); + +private: + arm_compute::Mutex _mtx; + std::unordered_map<std::string, std::shared_ptr<Logger>> _loggers; + static std::set<std::string> _reserved_loggers; +}; +} // namespace logging +} // namespace arm_compute +#endif /* __ARM_COMPUTE_LOGGING_LOGGER_REGISTRY_H__ */ diff --git a/arm_compute/core/utils/logging/Macros.h b/arm_compute/core/utils/logging/Macros.h new file mode 100644 index 000000000..bc121e25e --- /dev/null +++ b/arm_compute/core/utils/logging/Macros.h @@ -0,0 +1,71 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
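An illustrative sketch of registry-based setup; the custom logger name and log file are arbitrary, and the reserved CORE/RUNTIME/GRAPH loggers come from create_reserved_loggers():

    #include "arm_compute/core/utils/logging/LoggerRegistry.h"

    #include <memory>

    void registry_example()
    {
        auto &registry = arm_compute::logging::LoggerRegistry::get();

        // Reserved library loggers with the default StdPrinter sink.
        registry.create_reserved_loggers();

        // A user logger that writes to a file instead.
        registry.create_logger("EXAMPLE", arm_compute::logging::LogLevel::INFO,
                               { std::make_shared<arm_compute::logging::FilePrinter>("acl.log") });

        if(auto logger = registry.logger("EXAMPLE"))
        {
            logger->log(arm_compute::logging::LogLevel::INFO, "logger registry ready");
        }
    }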
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_LOGGING_MACROS_H__ +#define __ARM_COMPUTE_LOGGING_MACROS_H__ + +#include "arm_compute/core/utils/logging/LoggerRegistry.h" + +#include <sstream> + +#ifdef ARM_COMPUTE_LOGGING_ENABLED + +#define ARM_COMPUTE_LOG_MSG(logger_name, log_level, msg) \ + do \ + { \ + auto __logger = arm_compute::logging::LoggerRegistry::get().logger(logger_name); \ + if(__logger != nullptr) \ + { \ + __logger->log(log_level, msg); \ + } \ + } while(false) + +#define ARM_COMPUTE_LOG_MSG_WITH_FORMAT(logger_name, log_level, fmt, ...) \ + do \ + { \ + auto __logger = arm_compute::logging::LoggerRegistry::get().logger(logger_name); \ + if(__logger != nullptr) \ + { \ + __logger->log(log_level, fmt, __VA_ARGS__); \ + } \ + } while(false) + +#define ARM_COMPUTE_LOG_STREAM(logger_name, log_level, stream) \ + do \ + { \ + auto __logger = arm_compute::logging::LoggerRegistry::get().logger(logger_name); \ + if(__logger != nullptr) \ + { \ + __logger->log(log_level, static_cast<std::ostringstream &>(std::ostringstream() << stream).str()); \ + } \ + } while(false) + +#else /* ARM_COMPUTE_LOGGING_ENABLED */ + +#define ARM_COMPUTE_LOG_MSG(logger_name, log_level, msg) +#define ARM_COMPUTE_LOG_MSG_WITH_FORMAT(logger_name, log_level, fmt, ...) +#define ARM_COMPUTE_LOG_STREAM(logger_name, log_level, stream) + +#endif /* ARM_COMPUTE_LOGGING_ENABLED */ + +#endif /* __ARM_COMPUTE_LOGGING_MACROS_H__ */ diff --git a/arm_compute/core/utils/logging/Printers.h b/arm_compute/core/utils/logging/Printers.h new file mode 100644 index 000000000..7e5eef6a0 --- /dev/null +++ b/arm_compute/core/utils/logging/Printers.h @@ -0,0 +1,31 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
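Usage sketch (illustrative client code, not part of this patch): the registry hands out loggers by name, the Logger controls level and printers, and the macros above fetch a named logger and become no-ops when it was never created. Note the macros expand to nothing unless ARM_COMPUTE_LOGGING_ENABLED is defined.

    #include "arm_compute/core/utils/logging/LoggerRegistry.h"
    #include "arm_compute/core/utils/logging/Macros.h"

    using namespace arm_compute::logging;

    void setup_logging()
    {
        // Create the reserved CORE/RUNTIME/GRAPH loggers, each with a StdPrinter
        LoggerRegistry::get().create_reserved_loggers(LogLevel::INFO);

        // Create an application logger and tighten its level afterwards
        LoggerRegistry::get().create_logger("APP", LogLevel::VERBOSE);
        LoggerRegistry::get().logger("APP")->set_log_level(LogLevel::WARN);

        // Messages below the logger's level are filtered out by is_loggable()
        ARM_COMPUTE_LOG_MSG("CORE", LogLevel::INFO, "Core logger up");
        ARM_COMPUTE_LOG_MSG_WITH_FORMAT("CORE", LogLevel::INFO, "Max vector width: %d", 16);
        ARM_COMPUTE_LOG_STREAM("APP", LogLevel::WARN, "Falling back to level " << 2);
    }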
+ */ +#ifndef __ARM_COMPUTE_LOGGING_PRINTERS_H__ +#define __ARM_COMPUTE_LOGGING_PRINTERS_H__ + +#include "arm_compute/core/utils/logging/FilePrinter.h" +#include "arm_compute/core/utils/logging/IPrinter.h" +#include "arm_compute/core/utils/logging/StdPrinter.h" + +#endif /* __ARM_COMPUTE_LOGGING_PRINTERS_H__ */ diff --git a/arm_compute/core/utils/logging/StdPrinter.h b/arm_compute/core/utils/logging/StdPrinter.h new file mode 100644 index 000000000..0b41b2602 --- /dev/null +++ b/arm_compute/core/utils/logging/StdPrinter.h @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_LOGGING_STD_PRINTER_H__ +#define __ARM_COMPUTE_LOGGING_STD_PRINTER_H__ + +#include "arm_compute/core/utils/logging/IPrinter.h" + +#include <iostream> + +namespace arm_compute +{ +namespace logging +{ +/** Std Printer */ +class StdPrinter final : public Printer +{ +private: + // Inherited methods overridden: + void print_internal(const std::string &msg) override + { + std::cout << msg << std::endl; + } +}; +} // namespace logging +} // namespace arm_compute +#endif /* __ARM_COMPUTE_LOGGING_STD_PRINTER_H__ */ diff --git a/arm_compute/core/utils/logging/Types.h b/arm_compute/core/utils/logging/Types.h new file mode 100644 index 000000000..171270d4e --- /dev/null +++ b/arm_compute/core/utils/logging/Types.h @@ -0,0 +1,58 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_LOGGING_TYPES_H__ +#define __ARM_COMPUTE_LOGGING_TYPES_H__ + +#include <string> + +namespace arm_compute +{ +namespace logging +{ +/** Logging level enumeration */ +enum class LogLevel : unsigned int +{ + VERBOSE, /**< All logging messages */ + INFO, /**< Information log level */ + WARN, /**< Warning log level */ + OFF /**< No logging */ +}; + +struct LogMsg +{ + LogMsg() + : raw_(), log_level_(LogLevel::OFF) + { + } + LogMsg(std::string msg, LogLevel log_level = LogLevel::OFF) + : raw_(msg), log_level_(log_level) + { + } + + std::string raw_; + LogLevel log_level_; +}; +} // namespace logging +} // namespace arm_compute +#endif /* __ARM_COMPUTE_TYPES_H__ */ diff --git a/arm_compute/core/utils/misc/ICloneable.h b/arm_compute/core/utils/misc/ICloneable.h new file mode 100644 index 000000000..5852f14f7 --- /dev/null +++ b/arm_compute/core/utils/misc/ICloneable.h @@ -0,0 +1,48 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_MISC_ICLONEABLE_H__ +#define __ARM_COMPUTE_MISC_ICLONEABLE_H__ + +#include <memory> + +namespace arm_compute +{ +namespace misc +{ +/** Clonable Interface */ +template <class T> +class ICloneable +{ +public: + /** Default virtual desctructor */ + virtual ~ICloneable() = default; + /** Provide a clone of the current object of class T + * + * @return Clone object of class T + */ + virtual std::unique_ptr<T> clone() const = 0; +}; +} // namespace misc +} // namespace arm_compute +#endif /* __ARM_COMPUTE_MISC_ICLONEABLE_H__ */ diff --git a/arm_compute/core/utils/misc/utility.h b/arm_compute/core/utils/misc/utility.h new file mode 100644 index 000000000..898d0cdea --- /dev/null +++ b/arm_compute/core/utils/misc/utility.h @@ -0,0 +1,72 @@ +/* + * Copyright (c) 2017 ARM Limited. 
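ICloneable is a small interface templated on the concrete type, for classes that need to hand out polymorphic copies. A minimal sketch of an implementer (MyInfo is a hypothetical class, not part of the patch):

    #include "arm_compute/core/utils/misc/ICloneable.h"
    #include "support/ToolchainSupport.h"

    class MyInfo : public arm_compute::misc::ICloneable<MyInfo>
    {
    public:
        explicit MyInfo(int value) : _value(value) {}
        // Returns an independent copy of this object, as required by the interface
        std::unique_ptr<MyInfo> clone() const override
        {
            return arm_compute::support::cpp14::make_unique<MyInfo>(*this);
        }
    private:
        int _value;
    };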
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_MISC_UTILITY_H__ +#define __ARM_COMPUTE_MISC_UTILITY_H__ + +#include <array> + +namespace arm_compute +{ +namespace utility +{ +/** @cond */ +template <std::size_t...> +struct index_sequence +{ +}; + +template <std::size_t N, std::size_t... S> +struct index_sequence_generator : index_sequence_generator < N - 1, N - 1, S... > +{ +}; + +template <std::size_t... S> +struct index_sequence_generator<0u, S...> : index_sequence<S...> +{ + using type = index_sequence<S...>; +}; + +template <std::size_t N> +using index_sequence_t = typename index_sequence_generator<N>::type; +/** @endcond */ + +namespace detail +{ +template <std::size_t... S, + typename Iterator, + typename T = std::array<typename std::iterator_traits<Iterator>::value_type, sizeof...(S)>> +T make_array(Iterator first, index_sequence<S...>) +{ + return T{ { first[S]... } }; +} +} // namespace detail + +template <std::size_t N, typename Iterator> +std::array<typename std::iterator_traits<Iterator>::value_type, N> make_array(Iterator first, Iterator last) +{ + return detail::make_array(first, index_sequence_t<N> {}); +} +} // namespace misc +} // namespace arm_compute +#endif /* __ARM_COMPUTE_MISC_UTILITY_H__ */ diff --git a/arm_compute/core/utils/quantization/AsymmHelpers.h b/arm_compute/core/utils/quantization/AsymmHelpers.h new file mode 100644 index 000000000..6fd1d8001 --- /dev/null +++ b/arm_compute/core/utils/quantization/AsymmHelpers.h @@ -0,0 +1,53 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_QUANTIZATION_ASYMM_HELPERS_H__ +#define __ARM_COMPUTE_QUANTIZATION_ASYMM_HELPERS_H__ + +#include "arm_compute/core/Error.h" + +namespace arm_compute +{ +namespace quantization +{ +/** Calculate quantized representation of multiplier with value less than one. + * + * @param[in] multiplier Real multiplier. + * @param[out] quant_multiplier Integer multiplier. + * @param[out] right_shift Right bit shift. + * + * @return a status + */ +arm_compute::Status calculate_quantized_multiplier_less_than_one(double multiplier, int *quant_multiplier, int *right_shift); +/** Calculate quantized representation of multiplier having value greater than one. + * + * @param[in] multiplier Real multiplier. + * @param[out] quantized_multiplier Integer multiplier. + * @param[out] left_shift Left bit shift. + * + * @return a status + */ +arm_compute::Status calculate_quantized_multiplier_greater_than_one(double multiplier, int *quantized_multiplier, int *left_shift); +} // namespace quantization +} // namespace arm_compute +#endif /* __ARM_COMPUTE_IO_FILE_HANDLER_H__ */ diff --git a/arm_compute/graph/CL/CLMap.h b/arm_compute/graph/CL/CLMap.h index a205ebcad..732a1df77 100644 --- a/arm_compute/graph/CL/CLMap.h +++ b/arm_compute/graph/CL/CLMap.h @@ -29,11 +29,11 @@ namespace arm_compute { -class CLTensor; +class ICLTensor; namespace graph { -class Tensor; +class ITensorObject; /** OpenCL map function */ class CLMap : public arm_compute::IFunction { @@ -43,7 +43,7 @@ public: * @param[in] tensor Tensor to map * @param[in] blocking Flag to specify if the map should be blocking or not (defaults to false) */ - CLMap(Tensor *tensor, bool blocking = false); + CLMap(ITensorObject *tensor, bool blocking = false); /** Prevent instances from being copy constructed */ CLMap(const CLMap &) = delete; /** Prevent instances from being copy assigned */ @@ -57,8 +57,8 @@ public: void run() override; private: - arm_compute::CLTensor *_tensor; /**< Tensor */ - bool _blocking; /**< Blocking flag */ + arm_compute::ICLTensor *_tensor; /**< Tensor */ + bool _blocking; /**< Blocking flag */ }; } // namespace graph } // namespace arm_compute diff --git a/arm_compute/graph/CL/CLUnmap.h b/arm_compute/graph/CL/CLUnmap.h index a72706353..17745c436 100644 --- a/arm_compute/graph/CL/CLUnmap.h +++ b/arm_compute/graph/CL/CLUnmap.h @@ -29,11 +29,11 @@ namespace arm_compute { -class CLTensor; +class ICLTensor; namespace graph { -class Tensor; +class ITensorObject; /** OpenCL un-map function */ class CLUnmap : public arm_compute::IFunction { @@ -42,7 +42,7 @@ public: * * @param[in] tensor Tensor to un-map */ - CLUnmap(Tensor *tensor); + CLUnmap(ITensorObject *tensor); /** Prevent instances from being copy constructed */ CLUnmap(const CLUnmap &) = delete; /** Prevent instances from being copy assigned */ @@ -56,7 +56,7 @@ public: void run() override; private: - arm_compute::CLTensor *_tensor; /**< Tensor */ + arm_compute::ICLTensor *_tensor; /**< Tensor */ }; } // namespace graph } // namespace arm_compute diff --git a/arm_compute/graph/Error.h b/arm_compute/graph/Error.h new file mode 100644 index 000000000..0c8ed266c --- /dev/null +++ b/arm_compute/graph/Error.h @@ -0,0 +1,64 @@ +/* + * Copyright (c) 2017 ARM Limited. 
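Two small usage sketches for the helpers above (illustrative, not part of the patch). utility::make_array copies the first N elements reachable from an iterator into a std::array; as written above, the end iterator is accepted but never inspected.

    #include "arm_compute/core/utils/misc/utility.h"
    #include <vector>

    std::vector<float> v{ 1.f, 2.f, 3.f, 4.f };
    // std::array<float, 3> holding v[0], v[1], v[2]; 'v.end()' is unused by the implementation
    auto head = arm_compute::utility::make_array<3>(v.begin(), v.end());

The AsymmHelpers functions produce the integer multiplier/shift pair used by quantized (QASYMM8) kernels. Interpreting the pair as a Q0.31 value scaled by a power of two follows the usual gemmlowp-style convention and is an assumption here, not something the header states:

    #include "arm_compute/core/utils/quantization/AsymmHelpers.h"

    double real_multiplier  = 0.2345; // e.g. input_scale * weights_scale / output_scale
    int    quant_multiplier = 0;
    int    right_shift      = 0;
    arm_compute::Status status = arm_compute::quantization::calculate_quantized_multiplier_less_than_one(
        real_multiplier, &quant_multiplier, &right_shift);
    // On success: real_multiplier is approximately (quant_multiplier / 2^31) / 2^right_shift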
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_GRAPH_ERROR_H__ +#define __ARM_COMPUTE_GRAPH_ERROR_H__ + +#include "arm_compute/graph/ITensorObject.h" + +namespace arm_compute +{ +namespace graph +{ +/** Evaluate if a tensor object is null. If the condition is true then an error message is printed and an exception thrown + * + * @param[in] function Function in which the error occurred. + * @param[in] file Name of the file where the error occurred. + * @param[in] line Line on which the error occurred. + * @param[in] tensor_object Tensor object to evaluate + * @param[in] tensor_objects (Optional) Further allowed tensor objects. + */ +template <typename... Ts> +void error_on_unallocated_tensor_object(const char *function, const char *file, int line, + const ITensorObject *tensor_object, Ts... tensor_objects) +{ + ARM_COMPUTE_UNUSED(function); + ARM_COMPUTE_UNUSED(file); + ARM_COMPUTE_UNUSED(line); + ARM_COMPUTE_UNUSED(tensor_object); + + ARM_COMPUTE_ERROR_ON_LOC(tensor_object == nullptr || tensor_object->tensor() == nullptr, function, file, line); + + const std::array<const ITensorObject *, sizeof...(Ts)> tensor_objects_array{ { std::forward<Ts>(tensor_objects)... } }; + ARM_COMPUTE_UNUSED(tensor_objects_array); + + ARM_COMPUTE_ERROR_ON_LOC(std::any_of(tensor_objects_array.begin(), tensor_objects_array.end(), [&](const ITensorObject * tensor_obj) + { + return (tensor_obj == nullptr || tensor_object->tensor() == nullptr); + }), + function, file, line); +} +#define ARM_COMPUTE_ERROR_ON_UNALLOCATED_TENSOR_OBJECT(...) 
::arm_compute::graph::error_on_unallocated_tensor_object(__func__, __FILE__, __LINE__, __VA_ARGS__) +} // namespace graph +} // namespace arm_compute +#endif /* __ARM_COMPUTE_GRAPH_ERROR_H__ */ diff --git a/arm_compute/graph/Graph.h b/arm_compute/graph/Graph.h index 9d06f44be..ab1d8b886 100644 --- a/arm_compute/graph/Graph.h +++ b/arm_compute/graph/Graph.h @@ -25,6 +25,8 @@ #define __ARM_COMPUTE_GRAPH_GRAPH_H__ #include "arm_compute/graph/INode.h" +#include "arm_compute/graph/ITensorObject.h" +#include "arm_compute/graph/SubTensor.h" #include "arm_compute/graph/Tensor.h" #include "arm_compute/graph/Types.h" #include "support/ToolchainSupport.h" @@ -64,7 +66,10 @@ public: * * @param[in] tensor Tensor to add */ - void add_tensor(std::unique_ptr<Tensor> tensor); + void add_tensor_object(std::unique_ptr<ITensorObject> tensor); + /** Finalizes the current node's configuration + */ + static bool opencl_is_available(); /** Manually sets the output of the current node * * @param[in] tmp Output info to set @@ -98,6 +103,14 @@ Graph &operator<<(Graph &graph, TensorInfo &&info); * @return Updated graph */ Graph &operator<<(Graph &graph, Tensor &&tensor); +/** Overloaded stream operator to add a sub-tensor to the graph + * + * @param[in, out] graph Graph to add the tensor + * @param[in] sub_tensor Sub-tensor to be added + * + * @return Updated graph + */ +Graph &operator<<(Graph &graph, SubTensor &&sub_tensor); /** Overloaded stream operator to provide a target hint to the graph * * @param[in, out] graph Graph to provide the hint to diff --git a/arm_compute/graph/INode.h b/arm_compute/graph/INode.h index 1b22bdf63..56b50b942 100644 --- a/arm_compute/graph/INode.h +++ b/arm_compute/graph/INode.h @@ -25,6 +25,7 @@ #define __ARM_COMPUTE_GRAPH_INODE_H__ #include "arm_compute/graph/GraphContext.h" +#include "arm_compute/graph/ITensorObject.h" #include "arm_compute/graph/Types.h" #include "arm_compute/runtime/IFunction.h" @@ -46,7 +47,7 @@ public: * @param[in] input Input tensor of the node * @param[in] output Output tensor of the node */ - virtual std::unique_ptr<arm_compute::IFunction> instantiate_node(GraphContext &ctx, ITensor *input, ITensor *output) = 0; + virtual std::unique_ptr<arm_compute::IFunction> instantiate_node(GraphContext &ctx, ITensorObject *input, ITensorObject *output) = 0; /** Override the existing target hint * * @note If the input is DONT_CARE then the method has to pick a technology, diff --git a/arm_compute/graph/IOperation.h b/arm_compute/graph/IOperation.h new file mode 100644 index 000000000..a9fa4f83c --- /dev/null +++ b/arm_compute/graph/IOperation.h @@ -0,0 +1,70 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_GRAPH_IOPERATION_H__ +#define __ARM_COMPUTE_GRAPH_IOPERATION_H__ + +#include "arm_compute/graph/NodeContext.h" +#include "arm_compute/graph/Types.h" +#include "arm_compute/runtime/IFunction.h" + +#include <memory> + +namespace arm_compute +{ +namespace graph +{ +/** Operation functor interface */ +class IOperation +{ +public: + /** Virtual Destructor */ + virtual ~IOperation() = default; + /** Interface to be implemented that configures an operation + * + * @param[in] ctx Node parameters to be used by the operation + */ + virtual std::unique_ptr<arm_compute::IFunction> configure(NodeContext &ctx) = 0; + /** Interface to be implemented that returns the target of the operation + * + * @return Target of the operation + */ + virtual TargetHint target() const = 0; +}; + +#define REGISTER_SIMPLE_OPERATION(NAME, TARGET, OP) \ + class NAME : public IOperation \ + { \ + public: \ + std::unique_ptr<arm_compute::IFunction> configure(NodeContext &ctx) final; \ + TargetHint target() const final \ + { \ + return TargetHint::TARGET; \ + } \ + }; \ + static detail::OperationRegistrar<NAME> NAME##_registrar(OP); \ + std::unique_ptr<arm_compute::IFunction> NAME::configure(NodeContext &ctx) + +} // namespace graph +} // namespace arm_compute +#endif /* __ARM_COMPUTE_GRAPH_IOPERATION_H__ */ diff --git a/arm_compute/graph/ITensorObject.h b/arm_compute/graph/ITensorObject.h new file mode 100644 index 000000000..a922dd53f --- /dev/null +++ b/arm_compute/graph/ITensorObject.h @@ -0,0 +1,76 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_GRAPH_ITENSOROBJECT_H__ +#define __ARM_COMPUTE_GRAPH_ITENSOROBJECT_H__ + +#include "arm_compute/graph/ITensorAccessor.h" +#include "arm_compute/graph/Types.h" +#include "support/ToolchainSupport.h" + +#include <memory> + +namespace arm_compute +{ +namespace graph +{ +/** Tensor object interface */ +class ITensorObject +{ +public: + /** Default Destructor */ + virtual ~ITensorObject() = default; + /** Calls accessor on tensor + * + * @return True if succeeds else false + */ + virtual bool call_accessor() = 0; + /** Checks if tensor has an accessor set. 
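Two sketches of how these pieces fit together (illustrative, not part of the patch). First, the unallocated-tensor guard from Error.h applied to the ITensorObject pointers a node receives:

    #include "arm_compute/graph/Error.h"

    void validate_io(arm_compute::graph::ITensorObject *input, arm_compute::graph::ITensorObject *output)
    {
        // Triggers the library's error path if either object is nullptr or wraps no backend tensor
        ARM_COMPUTE_ERROR_ON_UNALLOCATED_TENSOR_OBJECT(input, output);
    }

Second, REGISTER_SIMPLE_OPERATION expands to an IOperation subclass plus a static OperationRegistrar, so a backend only writes the configure() body. The operation name below is made up, the NodeContext accessors are declared in the headers that follow, and the body is close to, but not guaranteed to match, what the library's own NEON backends do:

    #include "arm_compute/graph/IOperation.h"
    #include "arm_compute/graph/OperationRegistrar.h"
    #include "arm_compute/runtime/NEON/functions/NEActivationLayer.h"
    #include "support/ToolchainSupport.h"

    using namespace arm_compute::graph;

    REGISTER_SIMPLE_OPERATION(NEActivationLayerOperation, NEON, OperationType::ActivationLayer)
    {
        auto      *in       = ctx.input(0);
        auto      *out      = ctx.output(0);
        const auto act_info = ctx.parameter<arm_compute::ActivationLayerInfo>("ActivationLayerInfo");

        // Build and configure the backend function, then hand it back to the graph
        auto activation = arm_compute::support::cpp14::make_unique<arm_compute::NEActivationLayer>();
        activation->configure(in, out, act_info);
        return std::move(activation);
    }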
+ * + * @return True if an accessor has been set else false + */ + virtual bool has_accessor() const = 0; + /** Sets target of the tensor + * + * @param[in] target Target where the tensor should be pinned in + * + * @return Backend tensor + */ + virtual ITensor *set_target(TargetHint target) = 0; + /** Returns a pointer to the internal tensor + * + * @return Tensor + */ + virtual ITensor *tensor() = 0; + virtual const ITensor *tensor() const = 0; + /** Return the target that this tensor is pinned on + * + * @return Target of the tensor + */ + virtual TargetHint target() const = 0; + /** Allocates the tensor */ + virtual void allocate() = 0; +}; +} // namespace graph +} // namespace arm_compute +#endif /* __ARM_COMPUTE_GRAPH_ITENSOROBJECT_H__ */ diff --git a/arm_compute/graph/NodeContext.h b/arm_compute/graph/NodeContext.h new file mode 100644 index 000000000..bc90f217a --- /dev/null +++ b/arm_compute/graph/NodeContext.h @@ -0,0 +1,142 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_GRAPH_NODE_CONTEXT_H__ +#define __ARM_COMPUTE_GRAPH_NODE_CONTEXT_H__ + +#include "arm_compute/core/Error.h" +#include "arm_compute/graph/NodeParameter.h" +#include "arm_compute/graph/Types.h" +#include "support/ToolchainSupport.h" + +#include <map> +#include <memory> +#include <string> + +namespace arm_compute +{ +namespace graph +{ +/** Node Context class + * + * Node context class is used to hold all the parameters required by a node to execute + */ +class NodeContext +{ +public: + /** Default Constructor + * + * @param[in] operation Name of the operation + */ + NodeContext(OperationType operation) + : _operation(operation), _target(TargetHint::DONT_CARE), _inputs(), _outputs(), _parameters() {}; + /** Sets the execution target of the node + * + * @param[in] target Execution target of the node + */ + void set_target(TargetHint target); + /** Adds an input tensor to the context + * + * @param[in] input Input to add + */ + void add_input(arm_compute::ITensor *input); + /** Adds and output to the context + * + * @param[in] output Output to add + */ + void add_output(arm_compute::ITensor *output); + /** Adds a parameter to the context + * + * @param[in] name Parameter name + * @param[in] parameter Parameter to add + */ + template <typename T> + void add_parameter(std::string name, T parameter); + /** Returns the operation of this node. 
+ * + * @return The operation type + */ + OperationType operation() const; + /** Returns the execution target of this node + * + * @return The execution target + */ + TargetHint target() const; + /** Returns input tensor of a given index + * + * @param[in] idx Index of the input tensor + * + * @return A pointer the requested input tensor else nullptr + */ + arm_compute::ITensor *input(size_t idx) const; + /** Returns output tensor of a given index + * + * @param[in] idx Index of the output tensor + * + * @return A pointer the requested output tensor else nullptr + */ + arm_compute::ITensor *output(size_t idx) const; + /** Returns the parameter with the given name + * + * @param[in] name Parameter name + * + * @return The requested parameter else an empty object + */ + template <typename T> + T parameter(std::string name) const; + /** Returns number of inputs + * + * @return Number of inputs + */ + size_t num_inputs() const; + /** Returns number of output + * + * @return Number of outputs + */ + size_t num_outputs() const; + +private: + OperationType _operation; + TargetHint _target; + std::vector<arm_compute::ITensor *> _inputs; + std::vector<arm_compute::ITensor *> _outputs; + std::map<std::string, std::unique_ptr<NodeParameterBase>> _parameters; +}; + +template <typename T> +inline void NodeContext::add_parameter(std::string name, T parameter) +{ + ARM_COMPUTE_ERROR_ON_MSG(_parameters.find(name) != _parameters.end(), "Parameter already exists!"); + _parameters[name] = support::cpp14::make_unique<NodeParameter<T>>(name, parameter); +} + +template <typename T> +inline T NodeContext::parameter(std::string name) const +{ + auto it = _parameters.find(name); + ARM_COMPUTE_ERROR_ON(it == _parameters.end()); + return static_cast<NodeParameter<T> *>(it->second.get())->value(); +} +} // namespace graph +} // namespace arm_compute +#endif /* __ARM_COMPUTE_GRAPH_NODE_CONTEXT_H__ */ diff --git a/arm_compute/graph/NodeParameter.h b/arm_compute/graph/NodeParameter.h new file mode 100644 index 000000000..9d3823d54 --- /dev/null +++ b/arm_compute/graph/NodeParameter.h @@ -0,0 +1,74 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
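NodeContext is the type-erased parameter bag that an IOperation receives. A sketch of populating and reading one (the tensor pointers and the parameter key are placeholders chosen for illustration, not part of the patch):

    #include "arm_compute/graph/NodeContext.h"

    using namespace arm_compute;
    using namespace arm_compute::graph;

    void build_context(ITensor *in, ITensor *out)
    {
        NodeContext ctx(OperationType::ActivationLayer);
        ctx.set_target(TargetHint::NEON);
        ctx.add_input(in);
        ctx.add_output(out);
        ctx.add_parameter<ActivationLayerInfo>("ActivationLayerInfo",
                                               ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU));

        // Typically done later, inside the selected operation's configure():
        const auto act_info = ctx.parameter<ActivationLayerInfo>("ActivationLayerInfo");
        ARM_COMPUTE_UNUSED(act_info);
    }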
+ */ +#ifndef __ARM_COMPUTE_GRAPH_NODE_PARAMETER_H__ +#define __ARM_COMPUTE_GRAPH_NODE_PARAMETER_H__ + +#include <ostream> +#include <string> + +namespace arm_compute +{ +namespace graph +{ +/**Node Parameter Empty base class */ +class NodeParameterBase +{ +}; + +/** Template parameter implementation */ +template <typename T> +class NodeParameter : public NodeParameterBase +{ +public: + /** Default Constructor + * + * @param[in] name Paremeter name + * @param[in] val Parameter value + */ + NodeParameter(std::string name, T val) + : _name(name), _val(val) {}; + /** Returns parameter's name + * + * @return the name of the parameter + */ + std::string name() const + { + return _name; + } + /** Returns parameter's value + * + * @return the value of the parameter + */ + T value() + { + return _val; + } + +private: + std::string _name; + T _val; +}; +} // namespace graph +} // namespace arm_compute +#endif /* __ARM_COMPUTE_GRAPH_NODE_PARAMETER_H__ */ diff --git a/arm_compute/graph/Nodes.h b/arm_compute/graph/Nodes.h index 548deabeb..0282e1d2a 100644 --- a/arm_compute/graph/Nodes.h +++ b/arm_compute/graph/Nodes.h @@ -26,12 +26,19 @@ #include "arm_compute/graph/nodes/ActivationLayer.h" #include "arm_compute/graph/nodes/BatchNormalizationLayer.h" +#include "arm_compute/graph/nodes/BranchLayer.h" #include "arm_compute/graph/nodes/ConvolutionLayer.h" +#include "arm_compute/graph/nodes/DepthConvertLayer.h" +#include "arm_compute/graph/nodes/DepthwiseConvolutionLayer.h" +#include "arm_compute/graph/nodes/DequantizationLayer.h" +#include "arm_compute/graph/nodes/FlattenLayer.h" #include "arm_compute/graph/nodes/FloorLayer.h" #include "arm_compute/graph/nodes/FullyConnectedLayer.h" #include "arm_compute/graph/nodes/L2NormalizeLayer.h" #include "arm_compute/graph/nodes/NormalizationLayer.h" #include "arm_compute/graph/nodes/PoolingLayer.h" +#include "arm_compute/graph/nodes/QuantizationLayer.h" +#include "arm_compute/graph/nodes/ReshapeLayer.h" #include "arm_compute/graph/nodes/SoftmaxLayer.h" #endif /* __ARM_COMPUTE_GRAPH_NODES_H__ */ diff --git a/arm_compute/graph/OperationRegistrar.h b/arm_compute/graph/OperationRegistrar.h new file mode 100644 index 000000000..ee171c351 --- /dev/null +++ b/arm_compute/graph/OperationRegistrar.h @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
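NodeParameter is the typed wrapper that NodeContext stores behind NodeParameterBase; on its own it is just a named value (illustrative, not part of the patch):

    #include "arm_compute/core/Types.h"
    #include "arm_compute/graph/NodeParameter.h"

    arm_compute::graph::NodeParameter<arm_compute::PadStrideInfo> conv_param("ConvolutionInfo", arm_compute::PadStrideInfo(2, 2, 0, 0));
    // conv_param.name() == "ConvolutionInfo"; conv_param.value() returns the stored PadStrideInfo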
+ */ +#ifndef ARM_COMPUTE_GRAPH_OPERATION_REGISTRAR +#define ARM_COMPUTE_GRAPH_OPERATION_REGISTRAR + +#include "arm_compute/graph/OperationRegistry.h" +#include "arm_compute/graph/Types.h" + +#include <string> +#include <utility> + +namespace arm_compute +{ +namespace graph +{ +namespace detail +{ +/** Helper class to statically register an operation */ +template <typename T> +class OperationRegistrar final +{ +public: + /** Constructor; registers the operation functor T for the given operation type + * + * @param[in] operation Operation type + */ + OperationRegistrar(OperationType operation); +}; + +template <typename T> +inline OperationRegistrar<T>::OperationRegistrar(OperationType operation) +{ + OperationRegistry::get().add_operation<T>(std::move(operation)); +} +} // namespace detail +} // namespace graph +} // namespace arm_compute +#endif /* ARM_COMPUTE_GRAPH_OPERATION_REGISTRAR */
\ No newline at end of file diff --git a/arm_compute/graph/OperationRegistry.h b/arm_compute/graph/OperationRegistry.h new file mode 100644 index 000000000..ae68bf45a --- /dev/null +++ b/arm_compute/graph/OperationRegistry.h @@ -0,0 +1,86 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_GRAPH_OPERATION_REGISTRY_H__ +#define __ARM_COMPUTE_GRAPH_OPERATION_REGISTRY_H__ + +#include "arm_compute/graph/IOperation.h" +#include "arm_compute/graph/Types.h" +#include "support/ToolchainSupport.h" + +#include <map> +#include <memory> +#include <string> + +namespace arm_compute +{ +namespace graph +{ +/** Registry holding all the supported operations */ +class OperationRegistry +{ +public: + /** Gets operation registry instance + * + * @return Operation registry instance + */ + static OperationRegistry &get(); + /** Finds an operation in the registry + * + * @param[in] operation Type of the operation to find + * @param[in] target Target of the operation + * + * @return Pointer to the operation functor if found, else nullptr + */ + IOperation *find_operation(OperationType operation, TargetHint target); + /** Checks if an operation for a given target exists + * + * @param[in] operation Operation type + * @param[in] target Execution target + * + * @return True if exists else false + */ + bool contains(OperationType operation, TargetHint target) const; + /** Registers an operation to the registry + * + * @param operation Operation to register + */ + template <typename T> + void add_operation(OperationType operation); + +private: + /** Default Constructor */ + OperationRegistry(); + +private: + std::map<OperationType, std::vector<std::unique_ptr<IOperation>>> _registered_ops; +}; + +template <typename T> +inline void OperationRegistry::add_operation(OperationType operation) +{ + _registered_ops[operation].emplace_back(support::cpp14::make_unique<T>()); +} +} // namespace graph +} // namespace arm_compute +#endif /* __ARM_COMPUTE_GRAPH_OPERATION_REGISTRY_H__ */ diff --git a/arm_compute/graph/SubGraph.h b/arm_compute/graph/SubGraph.h new file mode 100644 index 000000000..d768bf911 --- /dev/null +++ b/arm_compute/graph/SubGraph.h @@ -0,0 +1,94 @@ +/* + * Copyright (c) 2017 ARM Limited. 
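At graph-build time the registry is the dispatch point: given a NodeContext, a node checks whether a functor is registered for its (operation, target) pair and lets it build the backend function. A sketch (illustrative, not part of the patch):

    #include "arm_compute/graph/OperationRegistry.h"

    using namespace arm_compute::graph;

    std::unique_ptr<arm_compute::IFunction> dispatch(NodeContext &ctx)
    {
        auto &registry = OperationRegistry::get();
        if(registry.contains(ctx.operation(), ctx.target()))
        {
            IOperation *op = registry.find_operation(ctx.operation(), ctx.target());
            return op->configure(ctx);
        }
        return nullptr; // no functor registered for this operation on this backend
    }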
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_GRAPH_SUBGRAPH_H__ +#define __ARM_COMPUTE_GRAPH_SUBGRAPH_H__ + +#include "arm_compute/graph/Graph.h" +#include "arm_compute/graph/INode.h" +#include "arm_compute/graph/ITensorObject.h" +#include "arm_compute/graph/SubTensor.h" +#include "arm_compute/graph/Tensor.h" +#include "arm_compute/graph/Types.h" +#include "arm_compute/runtime/IFunction.h" + +#include <memory> + +namespace arm_compute +{ +namespace graph +{ +/** SubGraph class */ +class SubGraph +{ +public: + /** Constructor */ + SubGraph(); + /** Adds a node to the graph + * + * @param[in] node Node to add + */ + void add_node(std::unique_ptr<INode> node); + /** Adds a tensor to the graph + * + * @param[in] tensor Tensor to add + */ + void add_tensor_object(std::unique_ptr<ITensorObject> tensor); + /** Constructs a graph from a subgraph + * + * @param[in] hint Execution target hint + * @param[in] input Input to the graph + * @param[in] output Output to the graph + * + * @return A graph + */ + std::unique_ptr<Graph> construct(TargetHint hint, std::unique_ptr<ITensorObject> input, std::unique_ptr<ITensorObject> output); + /** Checks if the subgraph has an input + * + * @return True if the sub-graph has an input else false + */ + bool has_input() const; + /** Checks if the subgraph has an output + * + * @return True if the sub-graph has an output else false + */ + bool has_output() const; + +private: + std::vector<std::unique_ptr<INode>> _nodes; + std::unique_ptr<ITensorObject> _input; + std::unique_ptr<ITensorObject> _output; +}; + +SubGraph &operator<<(SubGraph &graph, Tensor &&tensor); +SubGraph &operator<<(SubGraph &graph, SubTensor &&sub_tensor); + +template <typename Node> +SubGraph &operator<<(SubGraph &sub_graph, Node node) +{ + sub_graph.add_node(arm_compute::support::cpp14::make_unique<Node>(std::move(node))); + return sub_graph; +} +} // namespace graph +} // namespace arm_compute +#endif /* __ARM_COMPUTE_GRAPH_INODE_H__ */ diff --git a/arm_compute/graph/SubTensor.h b/arm_compute/graph/SubTensor.h index ace93d20a..72aa78927 100644 --- a/arm_compute/graph/SubTensor.h +++ b/arm_compute/graph/SubTensor.h @@ -25,6 +25,7 @@ #define __ARM_COMPUTE_GRAPH_SUBTENSOR_H__ #include "arm_compute/graph/ITensorAccessor.h" +#include "arm_compute/graph/ITensorObject.h" #include "arm_compute/graph/Tensor.h" #include "arm_compute/graph/Types.h" #include "support/ToolchainSupport.h" @@ -36,7 +37,7 @@ namespace 
arm_compute namespace graph { /** SubTensor class */ -class SubTensor final +class SubTensor final : public ITensorObject { public: /** Default Constructor */ @@ -55,7 +56,7 @@ public: * @param[in] coords Starting coordinates of the sub-tensor in the parent tensor * @param[in] target Execution target */ - SubTensor(ITensor *parent, TensorShape tensor_shape, Coordinates coords, TargetHint target); + SubTensor(arm_compute::ITensor *parent, TensorShape tensor_shape, Coordinates coords, TargetHint target); /** Prevent instances of this class from being copied (As this class contains pointers) */ SubTensor(const SubTensor &) = delete; /** Prevent instances of this class from being copied (As this class contains pointers) */ @@ -67,37 +68,25 @@ public: /** Default Destructor */ ~SubTensor() = default; - /** Sets the given TensorInfo to the tensor - * - * @param[in] info TensorInfo to set - */ - void set_info(SubTensorInfo &&info); - /** Returns tensor's TensorInfo - * - * @return TensorInfo of the tensor - */ - const SubTensorInfo &info() const; - /** Returns a pointer to the internal tensor - * - * @return Tensor - */ - ITensor *tensor(); - /** Return the target that this tensor is pinned on - * - * @return Target of the tensor - */ - TargetHint target() const; + // Inherited methods overriden: + bool call_accessor() override; + bool has_accessor() const override; + arm_compute::ITensor *set_target(TargetHint target) override; + arm_compute::ITensor *tensor() override; + const arm_compute::ITensor *tensor() const override; + TargetHint target() const override; + void allocate() override; private: /** Instantiates a sub-tensor */ void instantiate_subtensor(); private: - TargetHint _target; /**< Target that this tensor is pinned on */ - Coordinates _coords; /**< SubTensor Coordinates */ - SubTensorInfo _info; /**< SubTensor metadata */ - ITensor *_parent; /**< Parent tensor */ - std::unique_ptr<ITensor> _subtensor; /**< SubTensor */ + TargetHint _target; /**< Target that this tensor is pinned on */ + TensorShape _tensor_shape; /**< SubTensor shape */ + Coordinates _coords; /**< SubTensor Coordinates */ + arm_compute::ITensor *_parent; /**< Parent tensor */ + std::unique_ptr<arm_compute::ITensor> _subtensor; /**< SubTensor */ }; } // namespace graph } // namespace arm_compute diff --git a/arm_compute/graph/Tensor.h b/arm_compute/graph/Tensor.h index dbe2ba595..e5821dc81 100644 --- a/arm_compute/graph/Tensor.h +++ b/arm_compute/graph/Tensor.h @@ -25,6 +25,7 @@ #define __ARM_COMPUTE_GRAPH_TENSOR_H__ #include "arm_compute/graph/ITensorAccessor.h" +#include "arm_compute/graph/ITensorObject.h" #include "arm_compute/graph/Types.h" #include "support/ToolchainSupport.h" @@ -35,7 +36,7 @@ namespace arm_compute namespace graph { /** Tensor class */ -class Tensor +class Tensor final : public ITensorObject { public: /** Constructor @@ -94,43 +95,28 @@ public: * @param[in] info TensorInfo to set */ void set_info(TensorInfo &&info); - /** Calls accessor on tensor - * - * @return True if succeeds else false - */ - bool call_accessor(); - /** Sets target of the tensor - * - * @param[in] target Target where the tensor should be pinned in - * - * @return - */ - ITensor *set_target(TargetHint target); /** Returns tensor's TensorInfo * * @return TensorInfo of the tensor */ const TensorInfo &info() const; - /** Returns a pointer to the internal tensor - * - * @return Tensor - */ - ITensor *tensor(); /** Allocates and fills the tensor if needed */ void allocate_and_fill_if_needed(); - /** Allocates the tensor */ - void 
allocate(); - /** Return the target that this tensor is pinned on - * - * @return Target of the tensor - */ - TargetHint target() const; + + // Inherited methods overriden: + bool call_accessor() override; + bool has_accessor() const override; + arm_compute::ITensor *set_target(TargetHint target) override; + arm_compute::ITensor *tensor() override; + const arm_compute::ITensor *tensor() const override; + TargetHint target() const override; + void allocate() override; private: - TargetHint _target; /**< Target that this tensor is pinned on */ - TensorInfo _info; /**< Tensor metadata */ - std::unique_ptr<ITensorAccessor> _accessor; /**< Tensor Accessor */ - std::unique_ptr<ITensor> _tensor; /**< Tensor */ + TargetHint _target; /**< Target that this tensor is pinned on */ + TensorInfo _info; /**< Tensor metadata */ + std::unique_ptr<ITensorAccessor> _accessor; /**< Tensor Accessor */ + std::unique_ptr<arm_compute::ITensor> _tensor; /**< Tensor */ }; } // namespace graph } // namespace arm_compute diff --git a/arm_compute/graph/Types.h b/arm_compute/graph/Types.h index e48ff84ab..f8d20615d 100644 --- a/arm_compute/graph/Types.h +++ b/arm_compute/graph/Types.h @@ -27,24 +27,50 @@ #include "arm_compute/core/ITensor.h" #include "arm_compute/core/SubTensorInfo.h" #include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/utils/logging/Macros.h" + +/** Create a default core logger + * + * @note It will eventually create all default loggers in don't exist + */ +#define ARM_COMPUTE_CREATE_DEFAULT_GRAPH_LOGGER() \ + do \ + { \ + if(arm_compute::logging::LoggerRegistry::get().logger("GRAPH") == nullptr) \ + { \ + arm_compute::logging::LoggerRegistry::get().create_reserved_loggers(); \ + } \ + } while(false) + +#define ARM_COMPUTE_LOG_GRAPH(log_level, x) \ + ARM_COMPUTE_CREATE_DEFAULT_GRAPH_LOGGER(); \ + ARM_COMPUTE_LOG_STREAM("GRAPH", log_level, x) + +#define ARM_COMPUTE_LOG_GRAPH_INFO(x) \ + ARM_COMPUTE_CREATE_DEFAULT_GRAPH_LOGGER(); \ + ARM_COMPUTE_LOG_STREAM("GRAPH", arm_compute::logging::LogLevel::INFO, x) namespace arm_compute { namespace graph { -using arm_compute::ITensor; -using arm_compute::TensorInfo; -using arm_compute::SubTensorInfo; -using arm_compute::DataType; -using arm_compute::Coordinates; -using arm_compute::TensorShape; -using arm_compute::PadStrideInfo; -using arm_compute::WeightsInfo; using arm_compute::ActivationLayerInfo; +using arm_compute::Coordinates; +using arm_compute::DataType; +using arm_compute::DimensionRoundingType; +using arm_compute::ITensorInfo; using arm_compute::NormType; using arm_compute::NormalizationLayerInfo; +using arm_compute::PadStrideInfo; using arm_compute::PoolingLayerInfo; using arm_compute::PoolingType; +using arm_compute::SubTensorInfo; +using arm_compute::TensorInfo; +using arm_compute::TensorShape; +using arm_compute::WeightsInfo; + +using arm_compute::logging::LogLevel; +using arm_compute::ConvertPolicy; /**< Execution hint to the graph executor */ enum class TargetHint @@ -54,12 +80,38 @@ enum class TargetHint NEON /**< Run node on a NEON capable device */ }; -/**< Convolution method hint to the graph executor */ +/** Convolution method hint to the graph executor */ enum class ConvolutionMethodHint { GEMM, /**< Convolution using GEMM */ DIRECT /**< Direct convolution */ }; + +/** Supported layer operations */ +enum class OperationType +{ + ActivationLayer, + BatchNormalizationLayer, + ConvolutionLayer, + DepthConvertLayer, + DepthwiseConvolutionLayer, + DequantizationLayer, + FlattenLayer, + FloorLayer, + FullyConnectedLayer, + 
L2NormalizeLayer, + NormalizationLayer, + PoolingLayer, + QuantizationLayer, + ReshapeLayer, + SoftmaxLayer +}; + +/** Branch layer merging method */ +enum class BranchMergeMethod +{ + DEPTH_CONCATENATE /**< Concatenate across depth */ +}; } // namespace graph } // namespace arm_compute #endif /*__ARM_COMPUTE_GRAPH_TYPES_H__*/ diff --git a/arm_compute/graph/nodes/ActivationLayer.h b/arm_compute/graph/nodes/ActivationLayer.h index efe8112e7..bc619a8df 100644 --- a/arm_compute/graph/nodes/ActivationLayer.h +++ b/arm_compute/graph/nodes/ActivationLayer.h @@ -26,7 +26,7 @@ #include "arm_compute/graph/GraphContext.h" #include "arm_compute/graph/INode.h" -#include "arm_compute/graph/Tensor.h" +#include "arm_compute/graph/ITensorObject.h" #include "arm_compute/graph/Types.h" namespace arm_compute @@ -44,7 +44,7 @@ public: ActivationLayer(const ActivationLayerInfo activation_info); // Inherited methods overriden: - std::unique_ptr<arm_compute::IFunction> instantiate_node(GraphContext &ctx, ITensor *input, ITensor *output) override; + std::unique_ptr<arm_compute::IFunction> instantiate_node(GraphContext &ctx, ITensorObject *input, ITensorObject *output) override; private: const ActivationLayerInfo _activation_info; /**< Activation layer info */ diff --git a/arm_compute/graph/nodes/BatchNormalizationLayer.h b/arm_compute/graph/nodes/BatchNormalizationLayer.h index f01cac236..df7b1d19a 100644 --- a/arm_compute/graph/nodes/BatchNormalizationLayer.h +++ b/arm_compute/graph/nodes/BatchNormalizationLayer.h @@ -26,6 +26,7 @@ #include "arm_compute/graph/GraphContext.h" #include "arm_compute/graph/INode.h" +#include "arm_compute/graph/ITensorObject.h" #include "arm_compute/graph/Tensor.h" #include "arm_compute/graph/Types.h" @@ -52,7 +53,7 @@ public: } // Inherited methods overriden: - std::unique_ptr<arm_compute::IFunction> instantiate_node(GraphContext &ctx, ITensor *input, ITensor *output) override; + std::unique_ptr<arm_compute::IFunction> instantiate_node(GraphContext &ctx, ITensorObject *input, ITensorObject *output) override; private: Tensor _mean; diff --git a/arm_compute/graph/nodes/BranchLayer.h b/arm_compute/graph/nodes/BranchLayer.h new file mode 100644 index 000000000..c71899f4f --- /dev/null +++ b/arm_compute/graph/nodes/BranchLayer.h @@ -0,0 +1,86 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
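With the templated stream operator declared for SubGraph earlier in this patch, nodes such as the reworked ActivationLayer can be chained directly, and the GRAPH logging macros from graph/Types.h give cheap tracing while doing so. A sketch (illustrative, not part of the patch):

    #include "arm_compute/graph/SubGraph.h"
    #include "arm_compute/graph/nodes/ActivationLayer.h"

    using namespace arm_compute;
    using namespace arm_compute::graph;

    void build_branch(SubGraph &branch)
    {
        branch << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU));
        ARM_COMPUTE_LOG_GRAPH_INFO("Added ReLU node to sub-graph" << std::endl);
    }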
+ */ +#ifndef __ARM_COMPUTE_GRAPH_BRANCH_LAYER_H__ +#define __ARM_COMPUTE_GRAPH_BRANCH_LAYER_H__ + +#include "arm_compute/graph/GraphContext.h" +#include "arm_compute/graph/INode.h" +#include "arm_compute/graph/ITensorObject.h" +#include "arm_compute/graph/SubGraph.h" +#include "arm_compute/graph/SubTensor.h" +#include "arm_compute/graph/Types.h" + +#include "arm_compute/core/Helpers.h" + +#include <vector> + +namespace arm_compute +{ +namespace graph +{ +/** Branch Layer node */ +class BranchLayer final : public INode +{ +public: + /** Default Constructor + * + * @param[in] merge_method Branch merging method + * @param[in] sub_graph1 First graph branch + * @param[in] sub_graph2 Second graph branch + * @param[in] rest_sub_graphs Rest sub-graph branches + */ + template <typename... Ts> + BranchLayer(BranchMergeMethod merge_method, SubGraph &&sub_graph1, SubGraph &&sub_graph2, Ts &&... rest_sub_graphs) + : _branch_merge_method(merge_method), _sub_graphs() + { + _sub_graphs.push_back(arm_compute::support::cpp14::make_unique<SubGraph>(std::move(sub_graph1))); + _sub_graphs.push_back(arm_compute::support::cpp14::make_unique<SubGraph>(std::move(sub_graph2))); + + for_each([&](SubGraph & sub_graph) + { + _sub_graphs.push_back(arm_compute::support::cpp14::make_unique<SubGraph>(std::move(sub_graph))); + }, + std::move(rest_sub_graphs)...); + } + /** Default Constructor + * + * @param[in] sub_graph Sub graph + */ + template <typename... Ts> + BranchLayer(SubGraph &&sub_graph) + : _branch_merge_method(BranchMergeMethod::DEPTH_CONCATENATE), _sub_graphs() + { + _sub_graphs.push_back(arm_compute::support::cpp14::make_unique<SubGraph>(std::move(sub_graph))); + } + + // Inherited methods overriden: + std::unique_ptr<arm_compute::IFunction> instantiate_node(GraphContext &ctx, ITensorObject *input, ITensorObject *output) override; + +private: + BranchMergeMethod _branch_merge_method; + std::vector<std::unique_ptr<SubGraph>> _sub_graphs; +}; +} // namespace graph +} // namespace arm_compute +#endif /* __ARM_COMPUTE_GRAPH_BRANCH_LAYER_H__ */ diff --git a/arm_compute/graph/nodes/ConvolutionLayer.h b/arm_compute/graph/nodes/ConvolutionLayer.h index 04ba3dd6b..0905524de 100644 --- a/arm_compute/graph/nodes/ConvolutionLayer.h +++ b/arm_compute/graph/nodes/ConvolutionLayer.h @@ -26,6 +26,7 @@ #include "arm_compute/graph/GraphContext.h" #include "arm_compute/graph/INode.h" +#include "arm_compute/graph/ITensorObject.h" #include "arm_compute/graph/SubTensor.h" #include "arm_compute/graph/Tensor.h" #include "arm_compute/graph/Types.h" @@ -77,7 +78,7 @@ public: } // Inherited methods overriden: - std::unique_ptr<arm_compute::IFunction> instantiate_node(GraphContext &ctx, ITensor *input, ITensor *output) override; + std::unique_ptr<arm_compute::IFunction> instantiate_node(GraphContext &ctx, ITensorObject *input, ITensorObject *output) override; private: /** Instantiates a non-grouped convolution diff --git a/arm_compute/graph/nodes/DepthConvertLayer.h b/arm_compute/graph/nodes/DepthConvertLayer.h new file mode 100644 index 000000000..03bf9b7ed --- /dev/null +++ b/arm_compute/graph/nodes/DepthConvertLayer.h @@ -0,0 +1,58 @@ +/* + * Copyright (c) 2017 ARM Limited. 
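BranchLayer bundles two or more SubGraphs and merges their outputs, currently only by depth concatenation. A sketch of an inception-style split (illustrative, not part of the patch: the activation nodes stand in for real branch contents, and streaming the BranchLayer into a Graph assumes the existing node stream operator declared in Graph.h):

    #include "arm_compute/graph/Graph.h"
    #include "arm_compute/graph/SubGraph.h"
    #include "arm_compute/graph/nodes/ActivationLayer.h"
    #include "arm_compute/graph/nodes/BranchLayer.h"

    using namespace arm_compute;
    using namespace arm_compute::graph;

    void add_branches(Graph &graph)
    {
        SubGraph branch_a;
        branch_a << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU));

        SubGraph branch_b;
        branch_b << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::ABS));

        // The outputs of the two branches are concatenated along the depth dimension
        graph << BranchLayer(BranchMergeMethod::DEPTH_CONCATENATE, std::move(branch_a), std::move(branch_b));
    }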
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_GRAPH_DEPTHCONVERT_LAYER_H__ +#define __ARM_COMPUTE_GRAPH_DEPTHCONVERT_LAYER_H__ + +#include "arm_compute/graph/GraphContext.h" +#include "arm_compute/graph/INode.h" +#include "arm_compute/graph/ITensorObject.h" +#include "arm_compute/graph/Types.h" + +namespace arm_compute +{ +namespace graph +{ +/** DepthConvertLayer layer node */ +class DepthConvertLayer final : public INode +{ +public: + /** Default constructor + * + * @param[in] policy Convertion policy + * @param[in] shift Shift value + * @param[in] output_datatype Output datatype + */ + DepthConvertLayer(const ConvertPolicy policy, uint32_t shift, DataType output_datatype); + + // Inherited methods overriden: + std::unique_ptr<arm_compute::IFunction> instantiate_node(GraphContext &ctx, ITensorObject *input, ITensorObject *output) override; + +private: + const ConvertPolicy _policy; + uint32_t _shift; + DataType _output_datatype; +}; +} // namespace graph +} // namespace arm_compute +#endif /* __ARM_COMPUTE_GRAPH_DEPTHCONVERT_LAYER_H__ */ diff --git a/arm_compute/graph/nodes/DepthwiseConvolutionLayer.h b/arm_compute/graph/nodes/DepthwiseConvolutionLayer.h new file mode 100644 index 000000000..8b7e3b829 --- /dev/null +++ b/arm_compute/graph/nodes/DepthwiseConvolutionLayer.h @@ -0,0 +1,73 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_GRAPH_DEPTHWISE_CONVOLUTION_LAYER_H__ +#define __ARM_COMPUTE_GRAPH_DEPTHWISE_CONVOLUTION_LAYER_H__ + +#include "arm_compute/graph/GraphContext.h" +#include "arm_compute/graph/INode.h" +#include "arm_compute/graph/ITensorObject.h" +#include "arm_compute/graph/SubTensor.h" +#include "arm_compute/graph/Tensor.h" +#include "arm_compute/graph/Types.h" +#include "arm_compute/runtime/IFunction.h" + +#include <memory> + +namespace arm_compute +{ +namespace graph +{ +/** Convolution layer node */ +class DepthwiseConvolutionLayer final : public INode +{ +public: + /** Default constructor + * + * @param[in] conv_width Convolution width + * @param[in] conv_height Convolution height + * @param[in] weights Weights values tensor + * @param[in] biases Biases values tensor + * @param[in] conv_info Convolution info + * @param[in] opt3x3 (Optional) If true executes DepthwiseConvolutionLayer3x3 + */ + template <typename AccessorType> + DepthwiseConvolutionLayer(unsigned int conv_width, unsigned int conv_height, AccessorType &&weights, AccessorType &&biases, const PadStrideInfo conv_info, bool opt3x3 = true) + : _conv_width(conv_width), _conv_height(conv_height), _weights(std::move(weights)), _biases(std::move(biases)), _conv_info(conv_info), _opt3x3(opt3x3) + { + } + + // Inherited methods overriden: + std::unique_ptr<arm_compute::IFunction> instantiate_node(GraphContext &ctx, ITensorObject *input, ITensorObject *output) override; + +private: + unsigned int _conv_width; + unsigned int _conv_height; + Tensor _weights; + Tensor _biases; + const PadStrideInfo _conv_info; + bool _opt3x3; +}; +} // namespace graph +} // namespace arm_compute +#endif /* __ARM_COMPUTE_GRAPH_DEPTHWISE_CONVOLUTION_LAYER_H__ */ diff --git a/arm_compute/graph/nodes/DequantizationLayer.h b/arm_compute/graph/nodes/DequantizationLayer.h new file mode 100644 index 000000000..f9b7e8af8 --- /dev/null +++ b/arm_compute/graph/nodes/DequantizationLayer.h @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
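A sketch of instantiating the new depthwise node in a graph description; WeightsAccessor is a hypothetical stand-in for any accessor type that can populate the weights and biases tensors:

    // 3x3 depthwise convolution, stride 1, padding 1; opt3x3 selects the optimised 3x3 kernel.
    DepthwiseConvolutionLayer dwc(3U, 3U,
                                  WeightsAccessor("dwc_weights.npy"), WeightsAccessor("dwc_biases.npy"),
                                  PadStrideInfo(1, 1, 1, 1),
                                  true /* opt3x3 */);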
+ */ +#ifndef __ARM_COMPUTE_GRAPH_DEQUANTIZATION_LAYER_H__ +#define __ARM_COMPUTE_GRAPH_DEQUANTIZATION_LAYER_H__ + +#include "arm_compute/graph/GraphContext.h" +#include "arm_compute/graph/INode.h" +#include "arm_compute/graph/ITensorObject.h" +#include "arm_compute/graph/Tensor.h" +#include "arm_compute/graph/Types.h" + +namespace arm_compute +{ +namespace graph +{ +/** DequantizationLayer layer node */ +class DequantizationLayer final : public INode +{ +public: + /** Default constructor + * + * @param[in] min_max Min max value tensor + */ + template <typename AccessorType> + DequantizationLayer(AccessorType &&min_max) + : _min_max(std::move(min_max)) + { + } + + // Inherited methods overriden: + std::unique_ptr<arm_compute::IFunction> instantiate_node(GraphContext &ctx, ITensorObject *input, ITensorObject *output) override; + +private: + Tensor _min_max; +}; +} // namespace graph +} // namespace arm_compute +#endif /* __ARM_COMPUTE_GRAPH_DEQUANTIZATION_LAYER_H__ */ diff --git a/arm_compute/graph/nodes/FlattenLayer.h b/arm_compute/graph/nodes/FlattenLayer.h new file mode 100644 index 000000000..c5f51a2b3 --- /dev/null +++ b/arm_compute/graph/nodes/FlattenLayer.h @@ -0,0 +1,45 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
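A sketch for the dequantization node, assuming a hypothetical MinMaxAccessor that supplies the two-element min/max tensor the constructor expects:

    // The accessor fills the 1D {min, max} tensor used to rescale the quantized input.
    DequantizationLayer dequant(MinMaxAccessor());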
+ */ +#ifndef __ARM_COMPUTE_GRAPH_FLATTEN_LAYER_H__ +#define __ARM_COMPUTE_GRAPH_FLATTEN_LAYER_H__ + +#include "arm_compute/graph/GraphContext.h" +#include "arm_compute/graph/INode.h" +#include "arm_compute/graph/ITensorObject.h" +#include "arm_compute/graph/Types.h" + +namespace arm_compute +{ +namespace graph +{ +/** Flatten layer node */ +class FlattenLayer final : public INode +{ +public: + // Inherited methods overriden: + std::unique_ptr<arm_compute::IFunction> instantiate_node(GraphContext &ctx, ITensorObject *input, ITensorObject *output) override; +}; +} // namespace graph +} // namespace arm_compute +#endif /* __ARM_COMPUTE_GRAPH_FLATTEN_LAYER_H__ */ diff --git a/arm_compute/graph/nodes/FloorLayer.h b/arm_compute/graph/nodes/FloorLayer.h index 40fde3b79..146e2c16d 100644 --- a/arm_compute/graph/nodes/FloorLayer.h +++ b/arm_compute/graph/nodes/FloorLayer.h @@ -26,18 +26,18 @@ #include "arm_compute/graph/GraphContext.h" #include "arm_compute/graph/INode.h" -#include "arm_compute/graph/Tensor.h" +#include "arm_compute/graph/ITensorObject.h" #include "arm_compute/graph/Types.h" namespace arm_compute { namespace graph { /** Floor layer node */ -class FloorLayer : public INode +class FloorLayer final : public INode { public: // Inherited methods overriden: - std::unique_ptr<arm_compute::IFunction> instantiate_node(GraphContext &ctx, ITensor *input, ITensor *output) override; + std::unique_ptr<arm_compute::IFunction> instantiate_node(GraphContext &ctx, ITensorObject *input, ITensorObject *output) override; }; } // namespace graph diff --git a/arm_compute/graph/nodes/FullyConnectedLayer.h b/arm_compute/graph/nodes/FullyConnectedLayer.h index d31e06045..270676a6b 100644 --- a/arm_compute/graph/nodes/FullyConnectedLayer.h +++ b/arm_compute/graph/nodes/FullyConnectedLayer.h @@ -26,6 +26,7 @@ #include "arm_compute/graph/GraphContext.h" #include "arm_compute/graph/INode.h" +#include "arm_compute/graph/ITensorObject.h" #include "arm_compute/graph/Tensor.h" #include "arm_compute/graph/Types.h" @@ -50,7 +51,7 @@ public: } // Inherited methods overriden: - std::unique_ptr<arm_compute::IFunction> instantiate_node(GraphContext &ctx, ITensor *input, ITensor *output) override; + std::unique_ptr<arm_compute::IFunction> instantiate_node(GraphContext &ctx, ITensorObject *input, ITensorObject *output) override; // Inherited methods overriden: private: diff --git a/arm_compute/graph/nodes/L2NormalizeLayer.h b/arm_compute/graph/nodes/L2NormalizeLayer.h index ab333a221..a423306bd 100644 --- a/arm_compute/graph/nodes/L2NormalizeLayer.h +++ b/arm_compute/graph/nodes/L2NormalizeLayer.h @@ -26,14 +26,14 @@ #include "arm_compute/graph/GraphContext.h" #include "arm_compute/graph/INode.h" -#include "arm_compute/graph/Tensor.h" +#include "arm_compute/graph/ITensorObject.h" #include "arm_compute/graph/Types.h" namespace arm_compute { namespace graph { -/** L2Normalize layer node */ +/** L2NormalizeLayer layer node */ class L2NormalizeLayer final : public INode { public: @@ -42,13 +42,10 @@ public: * @param[in] axis Dimension along which to reduce. * @param[in] epsilon Lower bound value for the normalization. 
*/ - explicit L2NormalizeLayer(unsigned int axis, float epsilon) - : _axis(axis), _epsilon(epsilon) - { - } + explicit L2NormalizeLayer(unsigned int axis, float epsilon); // Inherited methods overriden: - std::unique_ptr<arm_compute::IFunction> instantiate_node(GraphContext &ctx, ITensor *input, ITensor *output) override; + std::unique_ptr<arm_compute::IFunction> instantiate_node(GraphContext &ctx, ITensorObject *input, ITensorObject *output) override; private: unsigned int _axis; diff --git a/arm_compute/graph/nodes/NormalizationLayer.h b/arm_compute/graph/nodes/NormalizationLayer.h index 02efd1cbe..e1c45094d 100644 --- a/arm_compute/graph/nodes/NormalizationLayer.h +++ b/arm_compute/graph/nodes/NormalizationLayer.h @@ -26,6 +26,7 @@ #include "arm_compute/graph/GraphContext.h" #include "arm_compute/graph/INode.h" +#include "arm_compute/graph/ITensorObject.h" #include "arm_compute/graph/Types.h" namespace arm_compute @@ -43,7 +44,7 @@ public: explicit NormalizationLayer(const NormalizationLayerInfo norm_info); // Inherited methods overriden: - std::unique_ptr<arm_compute::IFunction> instantiate_node(GraphContext &ctx, ITensor *input, ITensor *output) override; + std::unique_ptr<arm_compute::IFunction> instantiate_node(GraphContext &ctx, ITensorObject *input, ITensorObject *output) override; private: const NormalizationLayerInfo _norm_info; /**< Normalization layer information */ diff --git a/arm_compute/graph/nodes/PoolingLayer.h b/arm_compute/graph/nodes/PoolingLayer.h index 87b15d06c..5c45bc04e 100644 --- a/arm_compute/graph/nodes/PoolingLayer.h +++ b/arm_compute/graph/nodes/PoolingLayer.h @@ -26,7 +26,7 @@ #include "arm_compute/graph/GraphContext.h" #include "arm_compute/graph/INode.h" -#include "arm_compute/graph/Tensor.h" +#include "arm_compute/graph/ITensorObject.h" #include "arm_compute/graph/Types.h" namespace arm_compute @@ -44,7 +44,7 @@ public: PoolingLayer(const PoolingLayerInfo pool_info); // Inherited methods overriden: - std::unique_ptr<arm_compute::IFunction> instantiate_node(GraphContext &ctx, ITensor *input, ITensor *output) override; + std::unique_ptr<arm_compute::IFunction> instantiate_node(GraphContext &ctx, ITensorObject *input, ITensorObject *output) override; private: const PoolingLayerInfo _pool_info; /**< Pooling layer information */ diff --git a/arm_compute/graph/nodes/QuantizationLayer.h b/arm_compute/graph/nodes/QuantizationLayer.h new file mode 100644 index 000000000..a3ef02530 --- /dev/null +++ b/arm_compute/graph/nodes/QuantizationLayer.h @@ -0,0 +1,45 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_GRAPH_QUANTIZATION_LAYER_H__ +#define __ARM_COMPUTE_GRAPH_QUANTIZATION_LAYER_H__ + +#include "arm_compute/graph/GraphContext.h" +#include "arm_compute/graph/INode.h" +#include "arm_compute/graph/ITensorObject.h" +#include "arm_compute/graph/Types.h" + +namespace arm_compute +{ +namespace graph +{ +/** Quantization layer node */ +class QuantizationLayer final : public INode +{ +public: + // Inherited methods overriden: + std::unique_ptr<arm_compute::IFunction> instantiate_node(GraphContext &ctx, ITensorObject *input, ITensorObject *output) override; +}; +} // namespace graph +} // namespace arm_compute +#endif /* __ARM_COMPUTE_GRAPH_QUANTIZATION_LAYER_H__ */ diff --git a/arm_compute/graph/nodes/ReshapeLayer.h b/arm_compute/graph/nodes/ReshapeLayer.h new file mode 100644 index 000000000..b727d33a2 --- /dev/null +++ b/arm_compute/graph/nodes/ReshapeLayer.h @@ -0,0 +1,54 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
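The node constructors keep their existing parameter lists (L2NormalizeLayer's is merely moved out of line), so declarations such as the following remain valid; the values are illustrative:

    L2NormalizeLayer l2_norm(0 /* axis */, 1e-6f /* epsilon */);
    PoolingLayer     pool(PoolingLayerInfo(PoolingType::AVG, 3, PadStrideInfo(1, 1, 1, 1)));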
+ */ +#ifndef __ARM_COMPUTE_GRAPH_RESHAPE_LAYER_H__ +#define __ARM_COMPUTE_GRAPH_RESHAPE_LAYER_H__ + +#include "arm_compute/graph/GraphContext.h" +#include "arm_compute/graph/INode.h" +#include "arm_compute/graph/ITensorObject.h" +#include "arm_compute/graph/Types.h" + +namespace arm_compute +{ +namespace graph +{ +/** Reshape layer node */ +class ReshapeLayer final : public INode +{ +public: + /** Default constructor + * + * @param[in] shape Output shape + */ + ReshapeLayer(const TensorShape shape); + + // Inherited methods overriden: + std::unique_ptr<arm_compute::IFunction> instantiate_node(GraphContext &ctx, ITensorObject *input, ITensorObject *output) override; + +private: + TensorShape _shape; +}; +} // namespace graph +} // namespace arm_compute +#endif /* __ARM_COMPUTE_GRAPH_RESHAPE_LAYER_H__ */ diff --git a/arm_compute/graph/nodes/SoftmaxLayer.h b/arm_compute/graph/nodes/SoftmaxLayer.h index 2e1bd98c8..b5d1bc53f 100644 --- a/arm_compute/graph/nodes/SoftmaxLayer.h +++ b/arm_compute/graph/nodes/SoftmaxLayer.h @@ -26,20 +26,19 @@ #include "arm_compute/graph/GraphContext.h" #include "arm_compute/graph/INode.h" -#include "arm_compute/graph/Tensor.h" +#include "arm_compute/graph/ITensorObject.h" #include "arm_compute/graph/Types.h" namespace arm_compute { namespace graph { /** Softmax layer node */ -class SoftmaxLayer : public INode +class SoftmaxLayer final : public INode { public: // Inherited methods overriden: - std::unique_ptr<arm_compute::IFunction> instantiate_node(GraphContext &ctx, ITensor *input, ITensor *output) override; + std::unique_ptr<arm_compute::IFunction> instantiate_node(GraphContext &ctx, ITensorObject *input, ITensorObject *output) override; }; - } // namespace graph } // namespace arm_compute #endif /* __ARM_COMPUTE_GRAPH_SOFTMAX_LAYER_H__ */ diff --git a/arm_compute/runtime/BlobLifetimeManager.h b/arm_compute/runtime/BlobLifetimeManager.h index ec43f47fe..edf4d4342 100644 --- a/arm_compute/runtime/BlobLifetimeManager.h +++ b/arm_compute/runtime/BlobLifetimeManager.h @@ -24,21 +24,20 @@ #ifndef __ARM_COMPUTE_BLOBLIFETIMEMANAGER_H__ #define __ARM_COMPUTE_BLOBLIFETIMEMANAGER_H__ -#include "arm_compute/runtime/ILifetimeManager.h" +#include "arm_compute/runtime/ISimpleLifetimeManager.h" -#include "arm_compute/runtime/IMemoryGroup.h" +#include "arm_compute/runtime/IMemoryPool.h" #include "arm_compute/runtime/Types.h" #include <cstddef> -#include <map> +#include <memory> #include <vector> namespace arm_compute { -class IMemoryGroup; - -/** Class that tracks the lifetime of registered tensors and calculates the systems memory requirements in terms of blobs */ -class BlobLifetimeManager : public ILifetimeManager +/** Concrete class that tracks the lifetime of registered tensors and + * calculates the systems memory requirements in terms of blobs */ +class BlobLifetimeManager : public ISimpleLifetimeManager { public: /** Constructor */ @@ -53,35 +52,15 @@ public: BlobLifetimeManager &operator=(BlobLifetimeManager &&) = default; // Inherited methods overridden: - void register_group(IMemoryGroup *group) override; - void start_lifetime(void *obj) override; - void end_lifetime(void *obj, void **handle, size_t size) override; std::unique_ptr<IMemoryPool> create_pool(IAllocator *allocator) override; - bool are_all_finalized() const override; MappingType mapping_type() const override; private: - /** Update blobs and mappings */ - void update_blobs_and_mappings(); + // Inherited methods overridden: + void update_blobs_and_mappings() override; private: - /** Element struct */ - 
struct Element - { - Element(void *id_ = nullptr, void **handle_ = nullptr, size_t size_ = 0, bool status_ = false) - : id(id_), handle(handle_), size(size_), status(status_) - { - } - void *id; /**< Element id */ - void **handle; /**< Element's memory handle */ - size_t size; /**< Element's size */ - bool status; /**< Lifetime status */ - }; - - IMemoryGroup *_active_group; /**< Active group */ - std::vector<Element> _active_elements; /**< A map that contains the active elements */ - std::map<IMemoryGroup *, std::vector<Element>> _finalized_groups; /**< A map that contains the finalized groups */ - std::vector<size_t> _blobs; + std::vector<size_t> _blobs; /**< Memory blobs' sizes */ }; -} // arm_compute +} // namespace arm_compute #endif /* __ARM_COMPUTE_BLOBLIFETIMEMANAGER_H__ */ diff --git a/arm_compute/runtime/BlobMemoryPool.h b/arm_compute/runtime/BlobMemoryPool.h index f703bf0b8..25bfd539f 100644 --- a/arm_compute/runtime/BlobMemoryPool.h +++ b/arm_compute/runtime/BlobMemoryPool.h @@ -79,5 +79,5 @@ private: std::vector<void *> _blobs; /**< Vector holding all the memory blobs */ std::vector<size_t> _blob_sizes; /**< Sizes of each blob */ }; -} // arm_compute +} // namespace arm_compute #endif /* __ARM_COMPUTE_BLOBMEMORYPOOL_H__ */ diff --git a/arm_compute/runtime/CL/CLFunctions.h b/arm_compute/runtime/CL/CLFunctions.h index 360372d19..f6ecef7a5 100644 --- a/arm_compute/runtime/CL/CLFunctions.h +++ b/arm_compute/runtime/CL/CLFunctions.h @@ -42,9 +42,9 @@ #include "arm_compute/runtime/CL/functions/CLColorConvert.h" #include "arm_compute/runtime/CL/functions/CLConvolution.h" #include "arm_compute/runtime/CL/functions/CLConvolutionLayer.h" -#include "arm_compute/runtime/CL/functions/CLDepthConcatenate.h" -#include "arm_compute/runtime/CL/functions/CLDepthConvert.h" -#include "arm_compute/runtime/CL/functions/CLDepthwiseConvolution.h" +#include "arm_compute/runtime/CL/functions/CLDepthConcatenateLayer.h" +#include "arm_compute/runtime/CL/functions/CLDepthConvertLayer.h" +#include "arm_compute/runtime/CL/functions/CLDepthwiseConvolutionLayer.h" #include "arm_compute/runtime/CL/functions/CLDepthwiseSeparableConvolutionLayer.h" #include "arm_compute/runtime/CL/functions/CLDequantizationLayer.h" #include "arm_compute/runtime/CL/functions/CLDerivative.h" @@ -59,7 +59,8 @@ #include "arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h" #include "arm_compute/runtime/CL/functions/CLGEMM.h" #include "arm_compute/runtime/CL/functions/CLGEMMInterleave4x4.h" -#include "arm_compute/runtime/CL/functions/CLGEMMLowp.h" +#include "arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h" +#include "arm_compute/runtime/CL/functions/CLGEMMLowpOutputStage.h" #include "arm_compute/runtime/CL/functions/CLGEMMTranspose1xW.h" #include "arm_compute/runtime/CL/functions/CLGaussian3x3.h" #include "arm_compute/runtime/CL/functions/CLGaussian5x5.h" @@ -71,7 +72,7 @@ #include "arm_compute/runtime/CL/functions/CLHarrisCorners.h" #include "arm_compute/runtime/CL/functions/CLHistogram.h" #include "arm_compute/runtime/CL/functions/CLIntegralImage.h" -#include "arm_compute/runtime/CL/functions/CLL2Normalize.h" +#include "arm_compute/runtime/CL/functions/CLL2NormalizeLayer.h" #include "arm_compute/runtime/CL/functions/CLLaplacianPyramid.h" #include "arm_compute/runtime/CL/functions/CLLaplacianReconstruct.h" #include "arm_compute/runtime/CL/functions/CLLocallyConnectedLayer.h" diff --git a/arm_compute/runtime/CL/CLMultiImage.h b/arm_compute/runtime/CL/CLMultiImage.h index f70929db0..2c2b4709b 100644 --- 
a/arm_compute/runtime/CL/CLMultiImage.h +++ b/arm_compute/runtime/CL/CLMultiImage.h @@ -44,18 +44,18 @@ public: CLMultiImage(); /** Init the multi-planar image * - * @param[in] width Width of the whole image - * @param[in] height Heigth of the whole image - * @param[in] format Format of the whole image + * @param[in] width Width of the whole image + * @param[in] height Heigth of the whole image + * @param[in] format Format of the whole image */ void init(unsigned int width, unsigned int height, Format format); /** Init the multi-planar image * * @note Uses conservative padding strategy which fits all kernels. * - * @param[in] width Width of the whole image - * @param[in] height Height of the whole image - * @param[in] format Format of the whole image + * @param[in] width Width of the whole image + * @param[in] height Height of the whole image + * @param[in] format Format of the whole image */ void init_auto_padding(unsigned int width, unsigned int height, Format format); /** Allocated a previously initialised multi image @@ -73,10 +73,10 @@ public: private: /** Init the multi-planar image * - * @param[in] width Width of the whole image - * @param[in] height Height of the whole image - * @param[in] format Format of the whole image - * @param[in] auto_padding Specifies whether the image uses auto padding + * @param[in] width Width of the whole image + * @param[in] height Height of the whole image + * @param[in] format Format of the whole image + * @param[in] auto_padding Specifies whether the image uses auto padding */ void internal_init(unsigned int width, unsigned int height, Format format, bool auto_padding); diff --git a/arm_compute/runtime/CL/functions/CLActivationLayer.h b/arm_compute/runtime/CL/functions/CLActivationLayer.h index a1aeb193d..5b99abc5f 100644 --- a/arm_compute/runtime/CL/functions/CLActivationLayer.h +++ b/arm_compute/runtime/CL/functions/CLActivationLayer.h @@ -49,6 +49,16 @@ public: * @param[in] act_info Activation layer parameters. */ void configure(ICLTensor *input, ICLTensor *output, ActivationLayerInfo act_info); + /** Static function to check if given info will lead to a valid configuration of @ref CLActivationLayer + * + * @param[in] input Source tensor info. In case of @p output tensor info = nullptr, this tensor will store the result + * of the activation function. Data types supported: QS8/QS16/F16/F32. + * @param[in] output Destination tensor info. Data type supported: same as @p input + * @param[in] act_info Activation layer information. + * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *output, const ActivationLayerInfo &act_info); }; } #endif /* __ARM_COMPUTE_CLACTIVATIONLAYER_H__ */ diff --git a/arm_compute/runtime/CL/functions/CLArithmeticAddition.h b/arm_compute/runtime/CL/functions/CLArithmeticAddition.h index f888256b3..1ef3e274c 100644 --- a/arm_compute/runtime/CL/functions/CLArithmeticAddition.h +++ b/arm_compute/runtime/CL/functions/CLArithmeticAddition.h @@ -47,6 +47,16 @@ public: * @param[in] policy Policy to use to handle overflow. */ void configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, ConvertPolicy policy); + /** Static function to check if given info will lead to a valid configuration of @ref CLArithmeticAddition + * + * @param[in] input1 First tensor input info. Data types supported: U8/QS8/QS16/S16/F16/F32. + * @param[in] input2 Second tensor input info. Data types supported: U8/QS8 (only if @p input1 is QS8), QS16 (only if @p input1 is QS16), S16/F16/F32. 
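A sketch of the validate-before-configure pattern enabled by the new static check; the tensor shapes are illustrative:

    CLTensor src, dst;
    src.allocator()->init(TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32));
    dst.allocator()->init(TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32));
    ActivationLayerInfo act(ActivationLayerInfo::ActivationFunction::RELU);

    // The static check can reject an invalid configuration before any kernel is built.
    const Status status = CLActivationLayer::validate(src.info(), dst.info(), act);
    if(bool(status)) // Status converts to true when the configuration is valid
    {
        CLActivationLayer act_layer;
        act_layer.configure(&src, &dst, act);
    }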
+ * @param[in] output Output tensor info. Data types supported: U8 (Only if both inputs are U8), QS8 (only if both inputs are QS8), QS16 (only if both inputs are QS16), S16/F16/F32. + * @param[in] policy Policy to use to handle overflow. + * + * @return a status + */ + static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy); }; } #endif /* __ARM_COMPUTE_CLARITHMETICADDITION_H__ */ diff --git a/arm_compute/runtime/CL/functions/CLArithmeticSubtraction.h b/arm_compute/runtime/CL/functions/CLArithmeticSubtraction.h index eedeaa8d0..0d3f5bce6 100644 --- a/arm_compute/runtime/CL/functions/CLArithmeticSubtraction.h +++ b/arm_compute/runtime/CL/functions/CLArithmeticSubtraction.h @@ -48,6 +48,16 @@ public: * @param[in] policy Policy to use to handle overflow. */ void configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, ConvertPolicy policy); + /** Static function to check if given info will lead to a valid configuration of @ref CLArithmeticSubtraction + * + * @param[in] input1 First tensor input info. Data types supported: U8/QS8/QS16/S16/F16/F32. + * @param[in] input2 Second tensor input info. Data types supported: U8/QS8 (only if @p input1 is QS8), QS16 (only if @p input1 is QS16), S16/F16/F32. + * @param[in] output Output tensor info. Data types supported: U8 (Only if both inputs are U8), QS8 (only if both inputs are QS8), QS16 (only if both inputs are QS16), S16/F16/F32. + * @param[in] policy Policy to use to handle overflow. + * + * @return a status + */ + static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy); }; } #endif /* __ARM_COMPUTE_CLARITHMETICSUBTRACTION_H__ */ diff --git a/arm_compute/runtime/CL/functions/CLBatchNormalizationLayer.h b/arm_compute/runtime/CL/functions/CLBatchNormalizationLayer.h index ffb66bee6..127de1055 100644 --- a/arm_compute/runtime/CL/functions/CLBatchNormalizationLayer.h +++ b/arm_compute/runtime/CL/functions/CLBatchNormalizationLayer.h @@ -51,14 +51,32 @@ public: * @param[in, out] input Source tensor. In case of @p output tensor = nullptr, this tensor will store the result. * 3 lower dimensions represent a single input with dimensions [width, height, FM]. * The rest are optional and used for representing batches. Data types supported: QS8/QS16/F16/F32. + * @param[out] output Destination tensor. Output will have the same number of dimensions as input. Data type supported: same as @p input * @param[in] mean Mean values tensor. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input * @param[in] var Variance values tensor. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input - * @param[in] gamma Gamma values tensor. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input * @param[in] beta Beta values tensor. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input + * @param[in] gamma Gamma values tensor. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input * @param[in] epsilon Small value to avoid division with zero. - * @param[out] output Destination tensor. Output will have the same number of dimensions as input. 
Data type supported: same as @p input */ void configure(ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *var, const ICLTensor *beta, const ICLTensor *gamma, float epsilon); + /** Static function to check if given info will lead to a valid configuration of @ref CLBatchNormalizationLayer + * + * @param[in] input Source tensor info. In case of @p output tensor info = nullptr, this tensor will store the result. + * 3 lower dimensions represent a single input with dimensions [width, height, FM]. + * The rest are optional and used for representing batches. Data types supported: QS8/QS16/F16/F32. + * @param[in] output Destination tensor info. Output will have the same number of dimensions as input. Data type supported: same as @p input + * @param[in] mean Mean values tensor info. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input + * @param[in] var Variance values tensor info. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input + * @param[in] beta Beta values tensor info. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input + * @param[in] gamma Gamma values tensor info. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input + * @param[in] epsilon Small value to avoid division with zero. + * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *output, + const ITensorInfo *mean, const ITensorInfo *var, + const ITensorInfo *beta, const ITensorInfo *gamma, + float epsilon); // Inherited methods overridden: void run() override; diff --git a/arm_compute/runtime/CL/functions/CLChannelExtract.h b/arm_compute/runtime/CL/functions/CLChannelExtract.h index 175337462..9182feffc 100644 --- a/arm_compute/runtime/CL/functions/CLChannelExtract.h +++ b/arm_compute/runtime/CL/functions/CLChannelExtract.h @@ -39,14 +39,14 @@ class CLChannelExtract : public ICLSimpleFunction public: /** Initialize the function's source, destination * - * @param[in] input The input tensor to extract the channel from. Formats supported: Any single planar. + * @param[in] input The input tensor to extract the channel from. Formats supported: RGB888/RGBA8888/YUYV422/UYVY422 * @param[in] channel The channel to extract. * @param[out] output The extracted channel. Must be of U8 format. */ void configure(const ICLTensor *input, Channel channel, ICLTensor *output); /** Initialize the function's source, destination * - * @param[in] input The multi-planar input image to extract channel from. + * @param[in] input The multi-planar input image to extract channel from. Formats supported: NV12/NV21/IYUV/YUV444 * @param[in] channel The channel to extract. * @param[out] output The extracted 2D channel. Must be of U8 format. */ diff --git a/arm_compute/runtime/CL/functions/CLColorConvert.h b/arm_compute/runtime/CL/functions/CLColorConvert.h index 12457a0cf..dd7de4547 100644 --- a/arm_compute/runtime/CL/functions/CLColorConvert.h +++ b/arm_compute/runtime/CL/functions/CLColorConvert.h @@ -41,26 +41,27 @@ class CLColorConvert : public ICLSimpleFunction public: /** Initialize the function's source, destination * - * @param[in] input The input single-planar tensor from which to convert - * @param[in] output The converted single-planar output tensor + * @param[in] input Source tensor. Formats supported: RGBA8888/UYVY422/YUYV422/RGB888 + * @param[out] output Destination tensor. 
Formats supported: RGB888 (if the formats of @p input are RGBA8888/UYVY422/YUYV422), + * RGBA8888 (if the formats of @p input are UYVY422/YUYV422/RGB888/) */ void configure(const ICLTensor *input, ICLTensor *output); /** Initialize the function's source, destination * - * @param[in] input The multi-planar input image from which to convert - * @param[in] output The converted single-planar output image + * @param[in] input Multi-planar source image. Formats supported: NV12/NV21/IYUV + * @param[out] output Single-planar destination image. Formats supported: RGB888/RGBA8888 */ void configure(const ICLMultiImage *input, ICLImage *output); /** Initialize the function's source, destination * - * @param[in] input The single-planar input image from which to convert - * @param[in] output The converted multi-planar output image + * @param[in] input Single-planar source image. Formats supported: RGB888/RGBA8888/UYVY422/YUYV422 + * @param[out] output Multi-planar destination image. Formats supported: NV12/IYUV/YUV444 (if the formats of @p input are RGB888/RGB8888) */ void configure(const ICLImage *input, ICLMultiImage *output); /** Initialize the function's source, destination * - * @param[in] input The multi-planar input image from which to convert - * @param[in] output The converted multi-planar output image + * @param[in] input Multi-planar source image. Formats supported: NV12/NV21/IYUV + * @param[out] output Multi-planar destination image. Formats supported: YUV444/IYUV (if the formats of @p input are NV12/NV21)/NV12 (if the format of @p input is IYUV) */ void configure(const ICLMultiImage *input, ICLMultiImage *output); }; diff --git a/arm_compute/runtime/CL/functions/CLConvolutionLayer.h b/arm_compute/runtime/CL/functions/CLConvolutionLayer.h index cd1ea70a2..a8a04a0bb 100644 --- a/arm_compute/runtime/CL/functions/CLConvolutionLayer.h +++ b/arm_compute/runtime/CL/functions/CLConvolutionLayer.h @@ -36,6 +36,8 @@ #include "arm_compute/core/Types.h" #include "arm_compute/runtime/CL/CLMemoryGroup.h" #include "arm_compute/runtime/CL/CLTensor.h" +#include "arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h" +#include "arm_compute/runtime/CL/functions/CLGEMMLowpOutputStage.h" #include "arm_compute/runtime/IMemoryManager.h" #include <memory> @@ -55,7 +57,8 @@ public: CLConvolutionLayerReshapeWeights(std::shared_ptr<IMemoryManager> memory_manager = nullptr); /** Set the input and output tensors. * - * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported: QS8/QS16/F16/F32. + * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. + * Data type supported: QS8/QASYMM8/QS16/F16/F32. * @param[in] biases Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p weights. * @param[out] output Destination tensor. Data types supported: Same as @p weights. * @param[in] transpose1xW True if the weights are to undergo a 1xW transposition after reshaping (in case of GEMM operation), false otherwise. 
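A sketch of the multi-planar colour conversion documented above, with illustrative dimensions:

    // Convert an NV12 camera frame into a single-planar RGB888 image.
    CLMultiImage nv12;
    nv12.init(640U, 480U, Format::NV12);
    nv12.allocate();

    CLImage rgb;
    rgb.allocator()->init(TensorInfo(640U, 480U, Format::RGB888));
    rgb.allocator()->allocate();

    CLColorConvert convert;
    convert.configure(&nv12, &rgb);
    convert.run();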
@@ -79,7 +82,8 @@ private: * -# @ref CLGEMMTranspose1xWKernel (executed only once for each configuration) * -# @ref CLIm2ColKernel * -# @ref CLGEMMInterleave4x4Kernel - * -# @ref CLGEMMMatrixMultiplyKernel + * -# @ref CLGEMMMatrixMultiplyKernel or @ref CLGEMMLowpMatrixMultiplyCore (if quantized asymmetric) + * -# @ref CLGEMMLowpQuantizeDownInt32ToUint8Scale (if quantized asymmetric) * -# @ref CLCol2ImKernel */ class CLConvolutionLayer : public IFunction @@ -91,14 +95,15 @@ public: * * @param[in] input Source tensor. 3 lower dimensions represent a single input [width, height, IFM], * while every optional dimension from 4 and above represent a batch of inputs. - * Data types supported: QS8/QS16/F16/F32. - * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported:Same as @p input. - * @param[in] biases Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. Data type supported:Same as @p input. + * Data types supported: QS8/QASYMM8/QS16/F16/F32. + * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported: Same as @p input. + * @param[in] biases Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. + * Data type supported: Should match @p input data type, except for input of QASYMM8 type where biases should be of S32 type. * @param[out] output Destination tensor. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs. * Data types supported: Same as @p input. * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. - * @param[in] weights_info Specifies if the weights tensor has been reshaped with NEWeightsReshapeKernel. If this is not part of the fully connected layer the weights - * tensor has also been transposed with NEGEMMTranspose1xWKernel. Data type supported: Same as @p input. + * @param[in] weights_info Specifies if the weights tensor has been reshaped with CLWeightsReshapeKernel. If this is not part of the fully connected layer the weights + * tensor has also been transposed with CLGEMMTranspose1xWKernel. Data type supported: Same as @p input. */ void configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info = WeightsInfo()); @@ -106,20 +111,37 @@ public: void run() override; private: - CLMemoryGroup _memory_group; - CLConvolutionLayerReshapeWeights _reshape_weights; - CLIm2ColKernel _input_im2col_kernel; - CLGEMMInterleave4x4Kernel _input_interleave_kernel; - CLGEMMMatrixMultiplyKernel _mm_kernel; - CLCol2ImKernel _output_col2im_kernel; - CLTensor _input_im2col_reshaped; - CLTensor _input_interleaved_reshaped; - CLTensor _weights_reshaped; - CLTensor _weights_transposed; - CLTensor _gemm_output; - bool _has_bias; - bool _is_fully_connected_convolution; - bool _are_weights_reshaped; + /** Configures the appropriate matrix multiply routine + * + * @param input Input tensor. Data types supported: QS8/QASYMM8/QS16/F16/F32. + * @param weights Weights tensor. Data type supported: Same as @p input. + * @param output Output tensor. Data types supported: Same as @p input, + * except for input of QASYMM8 type where output should be of S32 type. 
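A sketch of a quantized (QASYMM8) convolution configuration; the shapes and quantization parameters are illustrative, and the biases are declared as S32 as required above:

    CLTensor src, weights, biases, dst;
    src.allocator()->init(TensorInfo(TensorShape(28U, 28U, 32U), 1, DataType::QASYMM8, QuantizationInfo(0.05f, 10)));
    weights.allocator()->init(TensorInfo(TensorShape(3U, 3U, 32U, 64U), 1, DataType::QASYMM8, QuantizationInfo(0.02f, 5)));
    biases.allocator()->init(TensorInfo(TensorShape(64U), 1, DataType::S32)); // S32 biases for QASYMM8 inputs
    dst.allocator()->init(TensorInfo(TensorShape(28U, 28U, 64U), 1, DataType::QASYMM8, QuantizationInfo(0.1f, 3)));

    CLConvolutionLayer conv;
    conv.configure(&src, &weights, &biases, &dst, PadStrideInfo(1, 1, 1, 1));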
+ * @param is_interleaved_transposed Flag that signals if matrix is interleaved transposed + */ + void configure_mm(const ICLTensor *input, const ICLTensor *weights, ICLTensor *output, bool is_interleaved_transposed = true); + +private: + CLMemoryGroup _memory_group; + CLConvolutionLayerReshapeWeights _reshape_weights; + CLIm2ColKernel _input_im2col_kernel; + CLGEMMInterleave4x4Kernel _input_interleave_kernel; + CLGEMMMatrixMultiplyKernel _mm_kernel; + CLGEMMLowpMatrixMultiplyCore _mm_gemmlowp; + CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint _gemmlowp_output_stage; + CLCol2ImKernel _output_col2im_kernel; + + CLTensor _input_im2col_reshaped; + CLTensor _input_interleaved_reshaped; + CLTensor _weights_reshaped; + CLTensor _weights_transposed; + CLTensor _gemm_output; + CLTensor _tmp_output; + + bool _append_bias; + bool _is_fully_connected_convolution; + bool _are_weights_reshaped; + bool _is_quantized; }; } #endif /* __ARM_COMPUTE_CLCONVOLUTIONLAYER_H__ */ diff --git a/arm_compute/runtime/CL/functions/CLDepthConcatenate.h b/arm_compute/runtime/CL/functions/CLDepthConcatenateLayer.h index 77997f6bd..00b3b66c9 100644 --- a/arm_compute/runtime/CL/functions/CLDepthConcatenate.h +++ b/arm_compute/runtime/CL/functions/CLDepthConcatenateLayer.h @@ -29,7 +29,7 @@ #include "arm_compute/core/Window.h" #include "arm_compute/runtime/IFunction.h" -#include "arm_compute/core/CL/kernels/CLDepthConcatenateKernel.h" +#include "arm_compute/core/CL/kernels/CLDepthConcatenateLayerKernel.h" #include "arm_compute/core/CL/kernels/CLFillBorderKernel.h" #include <memory> @@ -42,14 +42,14 @@ class ICLTensor; /** Basic function to execute concatenate tensors along z axis. This function calls the following kernels: * * -# @ref CLFillBorderKernel (executed if input's lowest two dimensions are smaller than respective output's dimensions) - * -# @ref CLDepthConcatenateKernel + * -# @ref CLDepthConcatenateLayerKernel * */ -class CLDepthConcatenate : public IFunction +class CLDepthConcatenateLayer : public IFunction { public: /** Default constructor */ - CLDepthConcatenate(); + CLDepthConcatenateLayer(); /** Initialise the kernel's inputs vector and output. * * @param[in,out] inputs_vector The vectors containing all the tensors to concatenate. Data types supported: QS8/QS16/F16/F32. @@ -61,10 +61,10 @@ public: void run() override; private: - std::vector<ICLTensor *> _inputs_vector; - std::unique_ptr<CLDepthConcatenateKernel[]> _concat_kernels_vector; - std::unique_ptr<CLFillBorderKernel[]> _border_handlers_vector; - unsigned int _num_inputs; + std::vector<ICLTensor *> _inputs_vector; + std::unique_ptr<CLDepthConcatenateLayerKernel[]> _concat_kernels_vector; + std::unique_ptr<CLFillBorderKernel[]> _border_handlers_vector; + unsigned int _num_inputs; }; } #endif /* __ARM_COMPUTE_CLDEPTHCONCATENATE_H__ */ diff --git a/arm_compute/runtime/CL/functions/CLDepthConvert.h b/arm_compute/runtime/CL/functions/CLDepthConvertLayer.h index 9a4c63dd6..c84dc1550 100644 --- a/arm_compute/runtime/CL/functions/CLDepthConvert.h +++ b/arm_compute/runtime/CL/functions/CLDepthConvertLayer.h @@ -33,8 +33,8 @@ namespace arm_compute { class ICLTensor; -/** Basic function to run @ref CLDepthConvertKernel. */ -class CLDepthConvert : public ICLSimpleFunction +/** Basic function to run @ref CLDepthConvertLayerKernel. 
*/ +class CLDepthConvertLayer : public ICLSimpleFunction { public: /** Initialize the function's source, destination diff --git a/arm_compute/runtime/CL/functions/CLDepthwiseConvolution.h b/arm_compute/runtime/CL/functions/CLDepthwiseConvolutionLayer.h index 53bc079cb..f7899415d 100644 --- a/arm_compute/runtime/CL/functions/CLDepthwiseConvolution.h +++ b/arm_compute/runtime/CL/functions/CLDepthwiseConvolutionLayer.h @@ -24,7 +24,7 @@ #ifndef __ARM_COMPUTE_CLDEPTHWISECONVOLUTION_H__ #define __ARM_COMPUTE_CLDEPTHWISECONVOLUTION_H__ -#include "arm_compute/core/CL/kernels/CLDepthwiseConvolution3x3Kernel.h" +#include "arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayer3x3Kernel.h" #include "arm_compute/core/CL/kernels/CLDepthwiseIm2ColKernel.h" #include "arm_compute/core/CL/kernels/CLDepthwiseVectorToTensorKernel.h" #include "arm_compute/core/CL/kernels/CLDepthwiseWeightsReshapeKernel.h" @@ -40,30 +40,32 @@ class ICLTensor; /** Basic function to execute a depthwise convolution for kernel size 3x3xC. This function calls the following OpenCL kernels: * - * -# @ref CLDepthwiseConvolution3x3Kernel + * -# @ref CLDepthwiseConvolutionLayer3x3Kernel * -# @ref CLFillBorderKernel (if pad_x or pad_y > 0) * */ -class CLDepthwiseConvolution3x3 : public IFunction +class CLDepthwiseConvolutionLayer3x3 : public IFunction { public: /** Default constructor */ - CLDepthwiseConvolution3x3(); + CLDepthwiseConvolutionLayer3x3(); /** Initialize the function's source, destination, conv and border_size. * - * @param[in, out] input Source tensor. Data type supported: F32. (Written to only for border filling). + * @param[in, out] input Source tensor. Data type supported: QASYMM8/F32. (Written to only for border filling). + * @param[in] weights Weights tensor. A 3D tensor with shape [3, 3, IFM]. Data type supported: Same as @p input. + * @param[in] biases (Optional) Biases tensor. A 1D tensor with shape [IFM]. Must be nullptr if not needed. + * Data type supported: Same as @p input. * @param[out] output Destination tensor. Data type supported: same as @p input. - * @param[in] weights Weights tensor. These are 3D tensors with shape [3, 3, IFM]. Data type supported: Same as @p input. * @param[in] conv_info Padding and stride information to use for the convolution. */ - void configure(ICLTensor *input, ICLTensor *output, const ICLTensor *weights, const PadStrideInfo &conv_info); + void configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info); // Inherited methods overriden: void run() override; private: - CLDepthwiseConvolution3x3Kernel _kernel; - CLFillBorderKernel _border_handler; + CLDepthwiseConvolutionLayer3x3Kernel _kernel; + CLFillBorderKernel _border_handler; }; /** Basic function to execute a generic depthwise convolution. This function calls the following OpenCL kernels: @@ -74,19 +76,21 @@ private: * -# @ref CLFillBorderKernel (if pad_x or pad_y > 0) * */ -class CLDepthwiseConvolution : public IFunction +class CLDepthwiseConvolutionLayer : public IFunction { public: /** Default constructor */ - CLDepthwiseConvolution(); + CLDepthwiseConvolutionLayer(); /** Initialize the function's source, destination, weights and convolution information. * * @param[in, out] input Source tensor. Data type supported: F32. (Written to only for border filling). - * @param[out] output Destination tensor. Data type supported: same as @p input. * @param[in] weights Weights tensor. These are 3D tensors with shape [kernel_x, kernel_y, IFM]. 
Data type supported: Same as @p input. + * @param[in] biases (Optional) Biases tensor. A 1D tensor with shape [IFM]. Must be nullptr if not needed. + * Data type supported: Same as @p input. + * @param[out] output Destination tensor. Data type supported: same as @p input. * @param[in] conv_info Padding and stride information to use for the convolution. */ - void configure(ICLTensor *input, ICLTensor *output, const ICLTensor *weights, const PadStrideInfo &conv_info); + void configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info); // Inherited methods overriden: void run() override; diff --git a/arm_compute/runtime/CL/functions/CLDepthwiseSeparableConvolutionLayer.h b/arm_compute/runtime/CL/functions/CLDepthwiseSeparableConvolutionLayer.h index 7dabed181..27cee5ed3 100644 --- a/arm_compute/runtime/CL/functions/CLDepthwiseSeparableConvolutionLayer.h +++ b/arm_compute/runtime/CL/functions/CLDepthwiseSeparableConvolutionLayer.h @@ -27,7 +27,7 @@ #include "arm_compute/core/Types.h" #include "arm_compute/runtime/CL/CLTensor.h" #include "arm_compute/runtime/CL/ICLSimpleFunction.h" -#include "arm_compute/runtime/CL/functions/CLDepthwiseConvolution.h" +#include "arm_compute/runtime/CL/functions/CLDepthwiseConvolutionLayer.h" #include "arm_compute/runtime/CL/functions/CLDirectConvolutionLayer.h" #include "arm_compute/runtime/IFunction.h" @@ -39,7 +39,7 @@ class ICLTensor; /** Basic function to execute depthwise convolution. This function calls the following OpenCL kernels and function: * - * -# @ref CLDepthwiseConvolution + * -# @ref CLDepthwiseConvolutionLayer * -# @ref CLDirectConvolutionLayer * */ @@ -53,24 +53,27 @@ public: * @param[in] input Source tensor. 3 lower dimensions represent a single input [width, height, IFM], * while every optional dimension from 4 and above represent a batch of inputs. Data types supported: F32. * @param[in] depthwise_weights Depthwise convolution weights tensor. These are 3D tensors with dimensions [kernel_x, kernel_y, IFM]. Data type supported: Same as @p input. + * @param[in] depthwise_biases (Optional) Biases tensor.Biases are 1D tensor with dimensions [IFM]. Must be nullptr if not needed. + * Data type supported: Same as @p weights. * @param[out] depthwise_out Depthwise destination tensor. * @param[in] pointwise_weights Pointwise convolution weights tensor. These are 4D tensors with dimensions [1, 1, IFM, OFM]. Data type supported: Same as @p input. - * @param[in] biases Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p weights. + * @param[in] pointwise_biases (Optional) Biases tensor. Biases are 1D tensor with dimensions [OFM]. Must be nullptr if not needed. + * Data type supported: Same as @p weights. * @param[out] output Destination tensor. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs. * Data types supported: Same as @p input. * @param[in] depthwise_conv_info Contains padding and stride information described in @ref PadStrideInfo for depthwise convolution. * @param[in] pointwise_conv_info Contains padding and stride information described in @ref PadStrideInfo for pointwise convolution. 
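A sketch of the updated depthwise call, reflecting the new argument order and the optional biases; tensor names are illustrative:

    CLDepthwiseConvolutionLayer3x3 dwc;
    // Weights and the (optional) biases now precede the output tensor; pass nullptr when biases are not needed.
    dwc.configure(&src, &weights, nullptr /* biases */, &dst, PadStrideInfo(1, 1, 1, 1));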
*/ - void configure(ICLTensor *input, const ICLTensor *depthwise_weights, ICLTensor *depthwise_out, const ICLTensor *pointwise_weights, const ICLTensor *biases, ICLTensor *output, - const PadStrideInfo &depthwise_conv_info, - const PadStrideInfo &pointwise_conv_info); + void configure(ICLTensor *input, const ICLTensor *depthwise_weights, const ICLTensor *depthwise_biases, ICLTensor *depthwise_out, + const ICLTensor *pointwise_weights, const ICLTensor *pointwise_biases, ICLTensor *output, + const PadStrideInfo &depthwise_conv_info, const PadStrideInfo &pointwise_conv_info); // Inherited methods overriden: void run() override; private: - CLDepthwiseConvolution _depthwise_conv; - CLDirectConvolutionLayer _pointwise_conv; + CLDepthwiseConvolutionLayer _depthwise_conv; + CLDirectConvolutionLayer _pointwise_conv; }; } #endif /*__ARM_COMPUTE_CL_DEPTHWISE_SEPARABLE_CONVOLUTION_H__ */ diff --git a/arm_compute/runtime/CL/functions/CLDilate.h b/arm_compute/runtime/CL/functions/CLDilate.h index 8534139c8..e4173ac51 100644 --- a/arm_compute/runtime/CL/functions/CLDilate.h +++ b/arm_compute/runtime/CL/functions/CLDilate.h @@ -49,7 +49,7 @@ public: * @param[in] border_mode Border mode to use for the convolution. * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT. */ - void configure(ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value); + void configure(ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value = 0); }; } #endif /*__ARM_COMPUTE_CLDILATE_H__ */ diff --git a/arm_compute/runtime/CL/functions/CLDirectConvolutionLayer.h b/arm_compute/runtime/CL/functions/CLDirectConvolutionLayer.h index 4c85277c0..f31a45be9 100644 --- a/arm_compute/runtime/CL/functions/CLDirectConvolutionLayer.h +++ b/arm_compute/runtime/CL/functions/CLDirectConvolutionLayer.h @@ -45,14 +45,29 @@ public: * * @param[in] input Source tensor. 3 lower dimensions represent a single input [width, height, IFM], * while every optional dimension from 4 and above represent a batch of inputs. - * Data types supported: QS8/QS16/F16/F32. + * Data types supported: QASYMM8/QS8/QS16/F16/F32. * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported:Same as @p input. - * @param[in] biases Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. Data type supported:Same as @p input. + * @param[in] biases Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. + * Data type supported: Should match @p input data type, except for input of QASYMM8 type where biases should be of S32 type. * @param[out] output Destination tensor. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs. * Data types supported: Same as @p input. * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. */ void configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info); + /** Static function to check if given info will lead to a valid configuration of @ref CLDirectConvolutionLayer + * + * @param[in] input Source tensor. 3 lower dimensions represent a single input [width, height, IFM], + * while every optional dimension from 4 and above represent a batch of inputs. + * Data types supported: QS8/QASYMM8/QS16/F16/F32. + * @param[in] weights Weights tensor. 
Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported:Same as @p input. + * @param[in] biases Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. Data type supported:Same as @p input. + * @param[in] output Destination tensor. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs. + * Data types supported: Same as @p input. + * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. + * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info); // Inherited methods overridden: void run() override; diff --git a/arm_compute/runtime/CL/functions/CLErode.h b/arm_compute/runtime/CL/functions/CLErode.h index cd2f5516e..a929cc9ba 100644 --- a/arm_compute/runtime/CL/functions/CLErode.h +++ b/arm_compute/runtime/CL/functions/CLErode.h @@ -49,7 +49,7 @@ public: * @param[in] border_mode Border mode to use for the convolution. * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT. */ - void configure(ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value); + void configure(ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value = 0); }; } #endif /*__ARM_COMPUTE_CLERODE_H__ */ diff --git a/arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h b/arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h index f71e2a33f..2cac06c1c 100644 --- a/arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h +++ b/arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h @@ -32,6 +32,8 @@ #include "arm_compute/core/CL/kernels/CLTransposeKernel.h" #include "arm_compute/runtime/CL/CLMemoryGroup.h" #include "arm_compute/runtime/CL/CLTensor.h" +#include "arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h" +#include "arm_compute/runtime/CL/functions/CLGEMMLowpOutputStage.h" namespace arm_compute { @@ -46,7 +48,7 @@ class CLFullyConnectedLayerReshapeWeights : public ICLSimpleFunction public: /** Set the input and output tensors. * - * @param[in] input Weights tensor. The weights must be 2 dimensional. Data types supported: QS8/QS16/F16/F32. + * @param[in] input Weights tensor. The weights must be 2 dimensional. Data types supported: QS8/QASYMM8/QS16/F16/F32. * @param[out] output Destination tensor which stores the transposed input tensor. Data type supported: Same as @p input. */ void configure(const ICLTensor *input, ICLTensor *output); @@ -56,8 +58,8 @@ public: * * -# @ref CLIm2ColKernel (called when the input comes from a convolutional layer) * -# @ref CLFullyConnectedLayerReshapeWeights (if @p are_weights_reshaped is set to false and transpose_weights is set to true ) (called once) - * -# @ref CLGEMMMatrixMultiplyKernel - * -# @ref CLGEMMMatrixAccumulateBiasesKernel (if @p biases is not equal to nullptr) + * -# @ref CLGEMMMatrixMultiplyKernel or @ref CLGEMMLowpMatrixMultiplyCore (if quantized asymmetric) + * -# @ref CLGEMMMatrixAccumulateBiasesKernel or @ref CLGEMMLowpQuantizeDownInt32ToUint8Scale (if quantized asymmetric) (if @p biases is not equal to nullptr) * * @note The fully connected layer accepts "weights" tensors only with 2 dimensions. 
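A sketch of how a shared memory manager built on the refactored lifetime manager might be handed to these functions; the pool and on-demand manager types come from the runtime headers, and the wiring shown is a minimal assumption rather than a complete setup:

    auto lifetime_mgr = std::make_shared<BlobLifetimeManager>();
    auto pool_mgr     = std::make_shared<PoolManager>();
    auto memory_mgr   = std::make_shared<MemoryManagerOnDemand>(lifetime_mgr, pool_mgr);

    // Functions constructed with the same memory manager can share memory pools.
    CLFullyConnectedLayer fc(memory_mgr);
    CLConvolutionLayer    conv(memory_mgr);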
*/ @@ -68,7 +70,7 @@ public: CLFullyConnectedLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr); /** Set the input and output tensors. * - * @param[in] input Source tensor. Data type supported: QS8/QS16/F16/F32. + * @param[in] input Source tensor. Data type supported: QS8/QASYMM8/QS16/F16/F32. * @param[in] weights Weights tensor. The weights must be 2 dimensional. Data type supported: Same as @p input * @param[in] biases Bias tensor. It can be nullptr. Data type supported:Same as @p input. * @param[out] output Destination tensor. Data type supported: Same as @p input. @@ -83,17 +85,22 @@ public: private: void configure_fc_fc(const ICLTensor *input, const ICLTensor *weights, ICLTensor *output); void configure_conv_fc(const ICLTensor *input, const ICLTensor *weights, ICLTensor *output); + void configure_mm(const ICLTensor *input, const ICLTensor *weights, ICLTensor *output, bool is_interleaved_transposed = true); - CLMemoryGroup _memory_group; - CLIm2ColKernel _im2col_kernel; - CLFullyConnectedLayerReshapeWeights _reshape_weights_kernel; - CLGEMMMatrixMultiplyKernel _mm_kernel; - CLGEMMMatrixAccumulateBiasesKernel _accumulate_biases_kernel; - CLTensor _im2col_output; - CLTensor _reshape_weights_output; - bool _are_weights_reshaped; - bool _is_fc_after_conv; - bool _accumulate_biases; + CLMemoryGroup _memory_group; + CLIm2ColKernel _im2col_kernel; + CLFullyConnectedLayerReshapeWeights _reshape_weights_kernel; + CLGEMMMatrixMultiplyKernel _mm_kernel; + CLGEMMLowpMatrixMultiplyCore _mm_gemmlowp; + CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint _gemmlowp_output_stage; + CLGEMMMatrixAccumulateBiasesKernel _accumulate_biases_kernel; + CLTensor _im2col_output; + CLTensor _gemmlowp_output; + CLTensor _reshape_weights_output; + bool _are_weights_reshaped; + bool _is_fc_after_conv; + bool _accumulate_biases; + bool _is_quantized; }; } #endif /* __ARM_COMPUTE_CLFULLYCONNECTEDLAYER_H__ */ diff --git a/arm_compute/runtime/CL/functions/CLGEMMInterleave4x4.h b/arm_compute/runtime/CL/functions/CLGEMMInterleave4x4.h index 8c755aeab..ae05b0fd9 100644 --- a/arm_compute/runtime/CL/functions/CLGEMMInterleave4x4.h +++ b/arm_compute/runtime/CL/functions/CLGEMMInterleave4x4.h @@ -40,11 +40,11 @@ class CLGEMMInterleave4x4 : public ICLSimpleFunction public: /** Initialise the kernel's inputs, output * - * @param[in] input First input tensor. Data types supported: U8/S8/QS8/U16/S16/F16/U32/S32/F32 + * @param[in] input First input tensor. Data types supported: U8/S8/QS8/QASYMM8/QS16/U16/S16/F16/U32/S32/F32 * @param[out] output Output tensor. Data type supported: same as @p input */ void configure(const ICLTensor *input, ICLTensor *output); }; } -#endif /* __ARM_COMPUTE_CLGEMMINTERLEAVE4X4_H__ */
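The quantized path added to CLFullyConnectedLayer above can be exercised roughly as follows. This is only a sketch against the headers in this patch: the shapes are placeholder values, the bias is left as nullptr, the trailing configure() parameters are assumed to keep their defaults, and the QuantizationInfo (scale/offset) that real QASYMM8 tensors carry is omitted for brevity. With a QASYMM8 input the function routes the matrix multiplication through CLGEMMLowpMatrixMultiplyCore and requantizes the S32 result with CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint, as listed in the private members above.

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h"

using namespace arm_compute;

void example_quantized_fully_connected()
{
    // Create the OpenCL context and queue used by all CL functions.
    CLScheduler::get().default_init();

    // Hypothetical sizes: 128 input activations, 64 output neurons.
    CLTensor input, weights, output;
    input.allocator()->init(TensorInfo(TensorShape(128U), 1, DataType::QASYMM8));
    weights.allocator()->init(TensorInfo(TensorShape(128U, 64U), 1, DataType::QASYMM8));
    output.allocator()->init(TensorInfo(TensorShape(64U), 1, DataType::QASYMM8));
    // NOTE: real QASYMM8 tensors also need quantization (scale/offset) metadata
    // attached to their TensorInfo; it is left out of this sketch.

    // Bias is optional, so nullptr is passed here to keep the sketch short.
    CLFullyConnectedLayer fc;
    fc.configure(&input, &weights, nullptr, &output);

    input.allocator()->allocate();
    weights.allocator()->allocate();
    output.allocator()->allocate();

    // ... fill input and weights (e.g. via map()/unmap()) ...

    fc.run();
    CLScheduler::get().sync();
}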
\ No newline at end of file +#endif /* __ARM_COMPUTE_CLGEMMINTERLEAVE4X4_H__ */ diff --git a/arm_compute/runtime/CL/functions/CLGEMMLowp.h b/arm_compute/runtime/CL/functions/CLGEMMLowp.h deleted file mode 100644 index 613fcaa7e..000000000 --- a/arm_compute/runtime/CL/functions/CLGEMMLowp.h +++ /dev/null @@ -1,89 +0,0 @@ -/* - * Copyright (c) 2016, 2017 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef __ARM_COMPUTE_CLGEMMLOWP_H__ -#define __ARM_COMPUTE_CLGEMMLOWP_H__ - -#include "arm_compute/core/CL/ICLKernel.h" -#include "arm_compute/core/CL/kernels/CLGEMMInterleave4x4Kernel.h" -#include "arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.h" -#include "arm_compute/core/CL/kernels/CLGEMMTranspose1xWKernel.h" -#include "arm_compute/runtime/CL/CLMemoryGroup.h" -#include "arm_compute/runtime/CL/CLTensor.h" -#include "arm_compute/runtime/IFunction.h" -#include "arm_compute/runtime/IMemoryManager.h" - -#include <memory> - -namespace arm_compute -{ -class ICLTensor; - -/** Basic function to execute GEMMLowp on OpenCL. This function calls the following OpenCL kernels: -* -* -# @ref CLGEMMInterleave4x4Kernel -* -# @ref CLGEMMTranspose1xWKernel -* -# @ref CLGEMMLowpMatrixMultiplyKernel -* -*/ -class CLGEMMLowp : public IFunction -{ -public: - /** Constructor */ - CLGEMMLowp(std::shared_ptr<IMemoryManager> memory_manager = nullptr); - /** Initialise the kernel's inputs, output - * - * @note GEMM_LOWP: low precision matrix multiply kernel - * This kernel performs the following computation: - * - * -# Convert a values from uint8 to int32 and add a_offset to each of them. - * -# Convert b values from uint8 to int32 and add b_offset to each of them. - * -# Compute the int32 matrix product of the resulting a * b. - * -# Add output_offset to each entry of the result. - * -# Multiply each entry of the result and round to the nearest integer - * -# Clamp the resulting int32 values to the [0..255] range and cast to uint8. - * - * @param[in] a First input tensor (Matrix A). Data types supported: U8. - * @param[in] b Second input tensor (Matrix B). Data types supported: same as @p a. - * @param[out] output Output tensor. Data types supported: same as @p a. - * @param[in] a_offset Offset to be added to each element of the matrix A. - * @param[in] b_offset Offset to be added to each element of the matrix B. 
- * @param[in] output_offset Offset to be added to each element of the output matrix - * @param[in] output_mult_int Multiplied with each element of the output matrix - * @param[in] shift Number of bits to shift right the result. - */ - void configure(const ICLTensor *a, const ICLTensor *b, ICLTensor *output, int32_t a_offset, int32_t b_offset, int32_t output_offset, int32_t output_mult_int, int32_t shift); - - // Inherited methods overridden: - void run() override; - -private: - CLMemoryGroup _memory_group; - CLGEMMInterleave4x4Kernel _interleave_kernel; - CLGEMMTranspose1xWKernel _transpose_kernel; - CLGEMMLowpMatrixMultiplyKernel _mm_kernel; - CLTensor _tmp_a; - CLTensor _tmp_b; -}; -} -#endif /*__ARM_COMPUTE_CLGEMMLOWP_H__ */ diff --git a/arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h b/arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h new file mode 100644 index 000000000..e31614454 --- /dev/null +++ b/arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h @@ -0,0 +1,95 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_CLGEMMLOWPMATRIXMULTIPLYCORE_H__ +#define __ARM_COMPUTE_CLGEMMLOWPMATRIXMULTIPLYCORE_H__ + +#include "arm_compute/core/CL/kernels/CLGEMMInterleave4x4Kernel.h" +#include "arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.h" +#include "arm_compute/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h" +#include "arm_compute/core/CL/kernels/CLGEMMLowpReductionKernel.h" +#include "arm_compute/core/CL/kernels/CLGEMMTranspose1xWKernel.h" +#include "arm_compute/runtime/CL/CLMemoryGroup.h" +#include "arm_compute/runtime/CL/CLTensor.h" +#include "arm_compute/runtime/IFunction.h" + +namespace arm_compute +{ +class IMemoryManager; +class ICLTensor; + +/** Basic function to execute GEMMLowpMatrixMultiplyCore on OpenCL. 
This function calls the following OpenCL kernels: + * + * -# @ref CLGEMMInterleave4x4Kernel (if the output tensor is a matrix) + * -# @ref CLGEMMTranspose1xWKernel (if the output tensor is a matrix) + * -# @ref CLGEMMLowpMatrixMultiplyKernel + * -# @ref CLGEMMLowpMatrixAReductionKernel (if the offset of matrix B is not 0) + * -# @ref CLGEMMLowpMatrixBReductionKernel (if the offset of matrix A is not 0) + * -# @ref CLGEMMLowpOffsetContributionKernel + * +*/ +class CLGEMMLowpMatrixMultiplyCore : public IFunction +{ +public: + /** Constructor */ + CLGEMMLowpMatrixMultiplyCore(std::shared_ptr<IMemoryManager> memory_manager = nullptr); + /** Initialise the kernel's inputs, output + * + * @note GEMM_LOWP: low precision GEMM kernel + * This kernel performs the following computations: + * + * -# Convert a values from QASYMM8 to int32 and add a_offset to each of them. + * -# Convert b values from QASYMM8 to int32 and add b_offset to each of them. + * -# Compute the matrix product of the resulting a * b in int32. + * + * @param[in] a First input tensor (Matrix A). Data type supported: QASYMM8. + * @param[in] b Second input tensor (Matrix B). Data type supported: same as @p a + * @param[out] output Output tensor. Data type supported: S32 + * @param[in] gemm_info (Optional) Specifies if the matrix A and/or matrix B have been reshaped and + * if the reshape of matrix B should be executed only for the first run + */ + void configure(const ICLTensor *a, const ICLTensor *b, ICLTensor *output, const GEMMInfo &gemm_info = GEMMInfo()); + + // Inherited methods overridden: + void run() override; + +private: + CLMemoryGroup _memory_group; + CLGEMMLowpMatrixMultiplyKernel _mm_kernel; + CLGEMMInterleave4x4Kernel _mtx_a_reshape_kernel; + CLGEMMTranspose1xWKernel _mtx_b_reshape_kernel; + CLGEMMLowpMatrixAReductionKernel _mtx_a_reduction_kernel; + CLGEMMLowpMatrixBReductionKernel _mtx_b_reduction_kernel; + CLGEMMLowpOffsetContributionKernel _offset_contribution_kernel; + CLTensor _vector_sum_col; + CLTensor _vector_sum_row; + CLTensor _tmp_a; + CLTensor _tmp_b; + int32_t _a_offset; + int32_t _b_offset; + bool _is_interleaved_transposed; + bool _is_first_run; + bool _reshape_b_only_on_first_run; +}; +} +#endif /*__ARM_COMPUTE_CLGEMMLOWPMATRIXMULTIPLYCORE_H__ */ diff --git a/arm_compute/runtime/CL/functions/CLGEMMLowpOutputStage.h b/arm_compute/runtime/CL/functions/CLGEMMLowpOutputStage.h new file mode 100644 index 000000000..7446ff4b3 --- /dev/null +++ b/arm_compute/runtime/CL/functions/CLGEMMLowpOutputStage.h @@ -0,0 +1,151 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_CLGEMMLOWPOUTPUTSTAGE_H__ +#define __ARM_COMPUTE_CLGEMMLOWPOUTPUTSTAGE_H__ + +#include "arm_compute/runtime/CL/ICLSimpleFunction.h" + +/** This file contains all available output stages for GEMMLowp on OpenCL. + * + * In gemmlowp, the "output stage" is the process that takes a final int32 accumulator value (the output of @ref CLGEMMLowpMatrixMultiplyCore), + * and processes it to obtain the final ASYMM8 value. + * + * More information about the GEMMLowp output stage can be found at https://github.com/google/gemmlowp/blob/master/doc/output.md + */ + +namespace arm_compute +{ +class ITensor; + +/** Basic function to execute CLGEMMLowpQuantizeDownInt32ToUint8Scale on OpenCL. + * + * CLGEMMLowpQuantizeDownInt32ToUint8Scale depends on 3 parameters: result_offset, result_mult_int, result_shift + * The final result is: + * + * ((input[i][k] + result_offset) * result_mult_int) >> result_shift + * + * In case the bias tensor is provided, the final result is: + * + * ((input[i][k] + bias[k] + result_offset) * result_mult_int) >> result_shift + * + * This function calls the following OpenCL kernels: + * + * -# @ref CLGEMMLowpQuantizeDownInt32ToUint8ScaleKernel + * + * @note The function also accepts 2 optional input arguments (min and max) which can be used to implement "rectified linear unit" activation functions + * after the result is shifted right by result_shift +*/ +class CLGEMMLowpQuantizeDownInt32ToUint8Scale : public ICLSimpleFunction +{ +public: + /** Initialise the kernel's inputs, output + * + * @param[in] input Input tensor. It is the output of @ref CLGEMMLowpMatrixMultiplyCore function. Data type supported: S32 + * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the addition of biases is not required. + * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input. + * @param[out] output Output tensor. Data type supported: QASYMM8 + * @param[in] result_offset Offset to be added to each element of the input matrix + * @param[in] result_mult_int Value to be multiplied with each element of the input matrix once the result_offset has been added + * @param[in] result_shift Number of bits to shift right the result before converting back to QASYMM8 + * @param[in] min (Optional) Min value used to saturate down the output result before converting back to QASYMM8 + * @param[in] max (Optional) Max value used to saturate up the output result before converting back to QASYMM8. + * Along with @p min, this value can be used to implement "rectified linear unit" activation functions + */ + void configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, int result_offset, int result_mult_int, int result_shift, int min = 0, int max = 0); + /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMLowpQuantizeDownInt32ToUint8Scale + * + * @param[in] input Input tensor. It is the output of @ref CLGEMMLowpMatrixMultiplyCore function. Data type supported: S32 + * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the addition of biases is not required. + * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input. 
+ * @param[in] output Output tensor. Data type supported: QASYMM8 + * @param[in] min (Optional) Min value used to saturate down the output result before converting back to QASYMM8 + * @param[in] max (Optional) Max value used to saturate up the output result before converting back to QASYMM8. + * Along with @p min, this value can be used to implement "rectified linear unit" activation functions + * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min = 0, int max = 0); +}; + +/** Basic function to execute CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint on OpenCL. + * + * CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint depends on 3 parameters: + * + * result_fixedpoint_multiplier, result_shift, result_offset_after_shift + * + * The final result is: + * + * (FixedPointMul(input[i][k], result_fixedpoint_multiplier) >> result_shift) + result_offset_after_shift + * + * where FixedPointMul(x, y) is the nearest integer to the following + * mathematical expression, evaluated without overflow or intermediate rounding: + * + * (x * y) / 2^31 + * + * For more information: https://github.com/google/gemmlowp/blob/master/public/output_stages.h#L68 + * + * In case the bias tensor is provided, the final result is: + * + * ((FixedPointMul(input[i][k] + bias[k], result_fixedpoint_multiplier)) >> result_shift) + result_offset_after_shift + * + * This function calls the following OpenCL kernels: + * + * -# @ref CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel + * + * @note The function also accepts 2 optional input arguments (min and max) which can be used to implement "rectified linear unit" activation functions + * after the result is shifted right by result_shift +*/ +class CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint : public ICLSimpleFunction +{ +public: + /** Initialise the kernel's inputs, output + * + * @param[in] input Input tensor. Data type supported: S32 + * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required. + * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input. + * @param[out] output Output tensor. Data type supported: QASYMM8 + * @param[in] result_fixedpoint_multiplier Fixed point value to be multiplied with each element of the input matrix once the result_offset has been added + * @param[in] result_shift Number of bits to shift right the result after the fixed point multiplication + * @param[in] result_offset_after_shift Offset to be applied to the result before converting it back to QASYMM8 + * @param[in] min (Optional) Min value used to saturate down the output result before converting back to QASYMM8 + * @param[in] max (Optional) Max value used to saturate up the output result before converting back to QASYMM8. + * Along with @p min, this value can be used to implement "rectified linear unit" activation functions + */ + void configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift, int min = 0, int max = 0); + /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint + * + * @param[in] input Input tensor. It is the output of @ref CLGEMMLowpMatrixMultiplyCore function. Data type supported: S32 + * @param[in] bias Biases tensor. 
Only shared biases supported and it can be a nullptr if the addition of biases is not required. + * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input. + * @param[in] output Output tensor. Data type supported: QASYMM8 + * @param[in] min (Optional) Min value used to saturate down the output result before converting back to QASYMM8 + * @param[in] max (Optional) Max value used to saturate up the output result before converting back to QASYMM8. + * Along with @p min, this value can be used to implement "rectified linear unit" activation functions + * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min = 0, int max = 0); +}; +} +#endif /*__ARM_COMPUTE_CLGEMMLOWPOUTPUTSTAGE_H__ */
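Taken together, CLGEMMLowpMatrixMultiplyCore and the output stages above form a two-step pipeline: the core produces raw S32 accumulators and an output stage requantizes them to QASYMM8. The sketch below chains the two using only the configure() signatures declared in this patch; the matrix sizes and the multiplier/shift/offset constants are placeholder values (in practice they are derived from the tensors' quantization scales), the bias is omitted, and no quantization metadata is attached, so the A/B offsets are effectively zero in this example.

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h"
#include "arm_compute/runtime/CL/functions/CLGEMMLowpOutputStage.h"

using namespace arm_compute;

void example_gemmlowp_pipeline()
{
    CLScheduler::get().default_init();

    // A: MxK, B: KxN, accumulators and final output: MxN (placeholder sizes).
    const unsigned int M = 4, N = 8, K = 16;
    CLTensor a, b, acc_s32, dst_qasymm8;
    a.allocator()->init(TensorInfo(TensorShape(K, M), 1, DataType::QASYMM8));
    b.allocator()->init(TensorInfo(TensorShape(N, K), 1, DataType::QASYMM8));
    acc_s32.allocator()->init(TensorInfo(TensorShape(N, M), 1, DataType::S32));
    dst_qasymm8.allocator()->init(TensorInfo(TensorShape(N, M), 1, DataType::QASYMM8));

    // Stage 1: QASYMM8 x QASYMM8 -> S32 accumulators.
    CLGEMMLowpMatrixMultiplyCore mm_core;
    mm_core.configure(&a, &b, &acc_s32);

    // Stage 2: requantize S32 -> QASYMM8 following
    //   (FixedPointMul(acc, multiplier) >> shift) + offset_after_shift,
    // then clamp to [0, 255]. The constants below are made-up placeholders.
    CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint output_stage;
    output_stage.configure(&acc_s32, nullptr, &dst_qasymm8,
                           1073741824, // result_fixedpoint_multiplier (0.5 in Q0.31)
                           5,          // result_shift
                           10);        // result_offset_after_shift

    a.allocator()->allocate();
    b.allocator()->allocate();
    acc_s32.allocator()->allocate();
    dst_qasymm8.allocator()->allocate();

    // ... fill a and b (e.g. via map()/unmap()) ...

    mm_core.run();
    output_stage.run();
    CLScheduler::get().sync();
}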
\ No newline at end of file diff --git a/arm_compute/runtime/CL/functions/CLGEMMTranspose1xW.h b/arm_compute/runtime/CL/functions/CLGEMMTranspose1xW.h index 866c17b51..ae56548c2 100644 --- a/arm_compute/runtime/CL/functions/CLGEMMTranspose1xW.h +++ b/arm_compute/runtime/CL/functions/CLGEMMTranspose1xW.h @@ -38,7 +38,7 @@ class CLGEMMTranspose1xW : public ICLSimpleFunction public: /** Initialise the kernel's inputs, output * - * @param[in] input First input tensor. Data type supported: U8/S8/QS8/U16/S16/F16/U32/S32/F32/ + * @param[in] input First input tensor. Data type supported: U8/S8/QS8/QASYMM8/QS16/U16/S16/F16/U32/S32/F32 * @param[out] output Output tensor. Data type supported: same as @p input */ void configure(const ICLTensor *input, ICLTensor *output); diff --git a/arm_compute/runtime/CL/functions/CLHOGDetector.h b/arm_compute/runtime/CL/functions/CLHOGDetector.h index 0b4fad776..158783693 100644 --- a/arm_compute/runtime/CL/functions/CLHOGDetector.h +++ b/arm_compute/runtime/CL/functions/CLHOGDetector.h @@ -55,7 +55,7 @@ public: * * @attention The function does not reset the number of values in @ref IDetectionWindowArray so it is caller's responsibility to clear it. * - * @param[in] input Input tensor. It is the output of @ref NEHOGDescriptor. Data type supported: F32 + * @param[in] input Input tensor. It is the output of @ref CLHOGDescriptor. Data type supported: F32 * @param[in] hog HOG data-object that describes the HOG descriptor * @param[out] detection_windows Array of @ref DetectionWindow used to store the detected objects * @param[in] detection_window_stride Distance in pixels between 2 consecutive detection windows in x and y directions. diff --git a/arm_compute/runtime/CL/functions/CLIntegralImage.h b/arm_compute/runtime/CL/functions/CLIntegralImage.h index 25fc549b2..71f6897d1 100644 --- a/arm_compute/runtime/CL/functions/CLIntegralImage.h +++ b/arm_compute/runtime/CL/functions/CLIntegralImage.h @@ -43,10 +43,10 @@ public: /** Default Constructor. */ CLIntegralImage(); /** Initialise the function's source, destinations and border mode. - * - * @param[in] input Source tensor. Data types supported: U8. - * @param[out] output Destination tensor, Data types supported: U32. - */ + * + * @param[in] input Source tensor. Data types supported: U8. + * @param[out] output Destination tensor, Data types supported: U32. + */ void configure(const ICLTensor *input, ICLTensor *output); // Inherited methods overridden: diff --git a/arm_compute/runtime/CL/functions/CLL2Normalize.h b/arm_compute/runtime/CL/functions/CLL2NormalizeLayer.h index 20af54eda..8aea7a641 100644 --- a/arm_compute/runtime/CL/functions/CLL2Normalize.h +++ b/arm_compute/runtime/CL/functions/CLL2NormalizeLayer.h @@ -24,7 +24,7 @@ #ifndef __ARM_COMPUTE_CLL2NORMALIZE_H__ #define __ARM_COMPUTE_CLL2NORMALIZE_H__ -#include "arm_compute/core/CL/kernels/CLL2NormalizeKernel.h" +#include "arm_compute/core/CL/kernels/CLL2NormalizeLayerKernel.h" #include "arm_compute/core/Types.h" #include "arm_compute/runtime/CL/CLMemoryGroup.h" #include "arm_compute/runtime/CL/CLTensor.h" @@ -41,11 +41,11 @@ class ICLTensor; /** Perform reduction operation. */ -class CLL2Normalize : public IFunction +class CLL2NormalizeLayer : public IFunction { public: /** Constructor */ - CLL2Normalize(std::shared_ptr<IMemoryManager> memory_manager = nullptr); + CLL2NormalizeLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr); /** Set the input and output tensors. 
* @@ -60,10 +60,10 @@ public: void run() override; private: - CLMemoryGroup _memory_group; - CLReductionOperation _reduce_func; - CLL2NormalizeKernel _normalize_kernel; - CLTensor _sumsq; + CLMemoryGroup _memory_group; + CLReductionOperation _reduce_func; + CLL2NormalizeLayerKernel _normalize_kernel; + CLTensor _sumsq; }; } #endif /*__ARM_COMPUTE_CLL2NORMALIZE_H__ */ diff --git a/arm_compute/runtime/CL/functions/CLLaplacianPyramid.h b/arm_compute/runtime/CL/functions/CLLaplacianPyramid.h index 0c6708aa7..585a013e3 100644 --- a/arm_compute/runtime/CL/functions/CLLaplacianPyramid.h +++ b/arm_compute/runtime/CL/functions/CLLaplacianPyramid.h @@ -27,7 +27,7 @@ #include "arm_compute/core/Types.h" #include "arm_compute/runtime/CL/CLPyramid.h" #include "arm_compute/runtime/CL/functions/CLArithmeticSubtraction.h" -#include "arm_compute/runtime/CL/functions/CLDepthConvert.h" +#include "arm_compute/runtime/CL/functions/CLDepthConvertLayer.h" #include "arm_compute/runtime/CL/functions/CLGaussian5x5.h" #include "arm_compute/runtime/CL/functions/CLGaussianPyramid.h" #include "arm_compute/runtime/IFunction.h" @@ -77,7 +77,7 @@ private: CLGaussianPyramidHalf _gaussian_pyr_function; std::unique_ptr<CLGaussian5x5[]> _convf; std::unique_ptr<CLArithmeticSubtraction[]> _subf; - CLDepthConvert _depth_function; + CLDepthConvertLayer _depth_function; CLPyramid _gauss_pyr; CLPyramid _conv_pyr; }; diff --git a/arm_compute/runtime/CL/functions/CLLaplacianReconstruct.h b/arm_compute/runtime/CL/functions/CLLaplacianReconstruct.h index 4bc7eb65c..4a676c85a 100644 --- a/arm_compute/runtime/CL/functions/CLLaplacianReconstruct.h +++ b/arm_compute/runtime/CL/functions/CLLaplacianReconstruct.h @@ -27,7 +27,7 @@ #include "arm_compute/core/Types.h" #include "arm_compute/runtime/CL/CLPyramid.h" #include "arm_compute/runtime/CL/functions/CLArithmeticAddition.h" -#include "arm_compute/runtime/CL/functions/CLDepthConvert.h" +#include "arm_compute/runtime/CL/functions/CLDepthConvertLayer.h" #include "arm_compute/runtime/CL/functions/CLScale.h" #include "arm_compute/runtime/IFunction.h" @@ -43,7 +43,7 @@ using ICLImage = ICLTensor; * * -# @ref CLArithmeticAddition * -# @ref CLScale - * -# @ref CLDepthConvert + * -# @ref CLDepthConvertLayer * * This function reconstructs the original image from a Laplacian Image Pyramid. * @@ -85,7 +85,7 @@ private: CLPyramid _tmp_pyr; std::unique_ptr<CLArithmeticAddition[]> _addf; std::unique_ptr<CLScale[]> _scalef; - CLDepthConvert _depthf; + CLDepthConvertLayer _depthf; }; } #endif /*__ARM_COMPUTE_CLLAPLACIANRECONSTRUCT_H__ */ diff --git a/arm_compute/runtime/CL/functions/CLMagnitude.h b/arm_compute/runtime/CL/functions/CLMagnitude.h index dc5f9139b..f9c7e5c14 100644 --- a/arm_compute/runtime/CL/functions/CLMagnitude.h +++ b/arm_compute/runtime/CL/functions/CLMagnitude.h @@ -41,8 +41,9 @@ public: * @param[in] input2 Second tensor input. Data types supported: S16. * @param[out] output Output tensor. Data types supported: S16. * @param[in] mag_type (Optional) Magnitude calculation type. Default: L2NORM. + * @param[in] use_fp16 (Optional) If true the FP16 kernels will be used. If false F32 kernels are used. 
*/ - void configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, MagnitudeType mag_type = MagnitudeType::L2NORM); + void configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, MagnitudeType mag_type = MagnitudeType::L2NORM, bool use_fp16 = false); }; } #endif /*__ARM_COMPUTE_CLMAGNITUDE_H__ */ diff --git a/arm_compute/runtime/CL/functions/CLNormalizationLayer.h b/arm_compute/runtime/CL/functions/CLNormalizationLayer.h index 0818cec2e..93925778d 100644 --- a/arm_compute/runtime/CL/functions/CLNormalizationLayer.h +++ b/arm_compute/runtime/CL/functions/CLNormalizationLayer.h @@ -37,7 +37,7 @@ namespace arm_compute { class ICLTensor; -/** Basic function to simulate a normalization layer. This function calls the following CL kernels: +/** Basic function to compute a normalization layer. This function calls the following CL kernels: * * -# @ref CLFillBorderKernel * -# @ref CLNormalizationLayerKernel @@ -51,11 +51,21 @@ public: /** Set the input and output tensors. * * @param[in, out] input Source tensor. 3 lower dims represent a single input with dimensions [width, height, IFM], - * and an optional 4th dimension for batch of inputs. Data types supported: F16/F32 (Written to by the border handler) + * and an optional 4th dimension for batch of inputs. Data types supported: QS8/QS16/F16/F32 (Written to by the border handler) * @param[out] output Destination tensor. Dimensions, data type and number of channels must match the input ones. * @param[in] norm_info Normalization layer information like the normalization type, normalization size and other parameters. */ - void configure(ICLTensor *input, ICLTensor *output, NormalizationLayerInfo norm_info); + void configure(ICLTensor *input, ICLTensor *output, const NormalizationLayerInfo &norm_info); + /** Static function to check if given info will lead to a valid configuration of @ref CLNormalizationLayer + * + * @param[in] input Source tensor. 3 lower dims represent a single input with dimensions [width, height, IFM], + * and an optional 4th dimension for batch of inputs. Data types supported: QS8/QS16/F16/F32 + * @param[in] output Destination tensor. Dimensions, data type and number of channels must match the input ones. + * @param[in] norm_info Normalization layer information like the normalization type, normalization size and other parameters. + * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *output, const NormalizationLayerInfo &norm_info); // Inherited methods overridden: void run() override; diff --git a/arm_compute/runtime/CL/functions/CLPixelWiseMultiplication.h b/arm_compute/runtime/CL/functions/CLPixelWiseMultiplication.h index 71754fc3f..d57bfda2c 100644 --- a/arm_compute/runtime/CL/functions/CLPixelWiseMultiplication.h +++ b/arm_compute/runtime/CL/functions/CLPixelWiseMultiplication.h @@ -37,15 +37,30 @@ class CLPixelWiseMultiplication : public ICLSimpleFunction public: /** Initialise the kernel's inputs, output and convertion policy. * - * @param[in] input1 First tensor input. Data types supported: U8, S16, F16 or F32. - * @param[in] input2 Second tensor input. Data types supported: U8, S16, F16 or F32. - * @param[out] output Output tensor. Data types supported: U8(Only if both inputs are U8), S16, F16 or F32. - * @param[in] scale Scale to apply after multiplication. Must be positive. + * @param[in] input1 An input tensor. Data types supported: U8/QS8/QS16/S16/F16/F32. + * @param[in] input2 An input tensor. 
Data types supported: same as @p input1. + * @param[out] output The output tensor. Data types supported: same as @p input1. Note: U8 (QS8, QS16) requires both inputs to be U8 (QS8, QS16). + * @param[in] scale Scale to apply after multiplication. + * Scale must be positive and its value must be either 1/255 or 1/2^n where n is between 0 and 15. For QS8 and QS16 scale must be 1. * @param[in] overflow_policy Overflow policy. Supported overflow policies: Wrap, Saturate * @param[in] rounding_policy Rounding policy. Supported rounding modes: to zero, to nearest even. */ void configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy); + /** Static function to check if given info will lead to a valid configuration of @ref CLPixelWiseMultiplication + * + * @param[in] input1 An input tensor info. Data types supported: U8/QS8/QS16/S16/F16/F32. + * @param[in] input2 An input tensor info. Data types supported: same as @p input1. + * @param[in] output The output tensor info. Data types supported: same as @p input1. Note: U8 (QS8, QS16) requires both inputs to be U8 (QS8, QS16). + * @param[in] scale Scale to apply after multiplication. + * Scale must be positive and its value must be either 1/255 or 1/2^n where n is between 0 and 15. For QS8 and QS16 scale must be 1. + * @param[in] overflow_policy Overflow policy. Supported overflow policies: Wrap, Saturate + * @param[in] rounding_policy Rounding policy. Supported rounding modes: to zero, to nearest even. + * + * @return a status + */ + static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, float scale, + ConvertPolicy overflow_policy, RoundingPolicy rounding_policy); }; } #endif /*__ARM_COMPUTE_CLPIXELWISEMULTIPLICATION_H__ */ diff --git a/arm_compute/runtime/CL/functions/CLPoolingLayer.h b/arm_compute/runtime/CL/functions/CLPoolingLayer.h index 80233d400..a8bdabad9 100644 --- a/arm_compute/runtime/CL/functions/CLPoolingLayer.h +++ b/arm_compute/runtime/CL/functions/CLPoolingLayer.h @@ -26,6 +26,7 @@ #include "arm_compute/runtime/CL/ICLSimpleFunction.h" +#include "arm_compute/core/Error.h" #include "arm_compute/core/Types.h" namespace arm_compute @@ -42,11 +43,20 @@ class CLPoolingLayer : public ICLSimpleFunction public: /** Set the input and output tensors. * - * @param[in,out] input Source tensor. (Written to only when padding != 0) Data types supported: QS8/QS16/F16/F32. + * @param[in,out] input Source tensor. (Written to only when padding != 0) Data types supported: QS8/QASYMM8/QS16/F16/F32. * @param[out] output Destination tensor. Data types supported: Same as @p input. * @param[in] pool_info Contains pooling operation information described in @ref PoolingLayerInfo. */ void configure(ICLTensor *input, ICLTensor *output, const PoolingLayerInfo &pool_info); + /** Static function to check if given info will lead to a valid configuration of @ref CLPoolingLayer + * + * @param[in] input Source tensor info. Data types supported: QS8/QASYMM8/QS16/F16/F32. + * @param[in] output Destination tensor info. Data types supported: Same as @p input. + * @param[in] pool_info Contains pooling operation information described in @ref PoolingLayerInfo. 
+ * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info); }; } // namespace arm_compute #endif /* __ARM_COMPUTE_CLPOOLINGLAYER_H__ */ diff --git a/arm_compute/runtime/CL/functions/CLReshapeLayer.h b/arm_compute/runtime/CL/functions/CLReshapeLayer.h index c82e646e9..411e75129 100644 --- a/arm_compute/runtime/CL/functions/CLReshapeLayer.h +++ b/arm_compute/runtime/CL/functions/CLReshapeLayer.h @@ -36,7 +36,7 @@ class CLReshapeLayer : public ICLSimpleFunction public: /** Initialise the kernel's inputs and outputs * - * @param[in] input First tensor input. Data type supported: U8/S8/QS8/U16/S16/QS16/U32/S32/F16/F32 + * @param[in] input First tensor input. Data type supported: U8/S8/QS8/QASYMM8/U16/S16/QS16/U32/S32/F16/F32 * @param[out] output Output tensor. Data type supported: Same as @p input */ void configure(const ICLTensor *input, ICLTensor *output); diff --git a/arm_compute/runtime/CL/functions/CLScale.h b/arm_compute/runtime/CL/functions/CLScale.h index db491c1a4..68d64a9e2 100644 --- a/arm_compute/runtime/CL/functions/CLScale.h +++ b/arm_compute/runtime/CL/functions/CLScale.h @@ -45,8 +45,10 @@ public: * @param[in] policy The interpolation type. * @param[in] border_mode Strategy to use for borders. * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT. + * @param[in] sampling_policy (Optional) Sampling policy used by the interpolation. Defaults to @ref SamplingPolicy::CENTER */ - void configure(ICLTensor *input, ICLTensor *output, InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value = PixelValue()); + void configure(ICLTensor *input, ICLTensor *output, InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value = PixelValue(), + SamplingPolicy sampling_policy = SamplingPolicy::CENTER); }; } #endif /*__ARM_COMPUTE_CLSCALE_H__ */ diff --git a/arm_compute/runtime/CL/functions/CLSoftmaxLayer.h b/arm_compute/runtime/CL/functions/CLSoftmaxLayer.h index 70a265c1a..5430f9c10 100644 --- a/arm_compute/runtime/CL/functions/CLSoftmaxLayer.h +++ b/arm_compute/runtime/CL/functions/CLSoftmaxLayer.h @@ -39,7 +39,7 @@ class ICLTensor; /** Basic function to compute a SoftmaxLayer. * * Softmax is calculated by : - * @f[ out = exp(x - max(x)) / sum(exp(x - max(x))) @f] + * @f[ out = exp((x - max(x)) * beta) / sum(exp((x - max(x)) * beta)) @f] * * This function runs the following kernels: * -# @ref CLLogits1DMaxKernel @@ -53,22 +53,33 @@ public: CLSoftmaxLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr); /** Set the input and output tensors. * - * @param[in] input Source tensor. Data types supported: QS8/QS16/F16/F32 + * @param[in] input Source tensor. Data types supported: QS8/QASYMM8/QS16/F16/F32 * @param[out] output Destination tensor. Data types supported: same as @p input + * @param[in] beta (Optional) A scaling factor for the exponent. Defaults to 1.f */ - void configure(const ICLTensor *input, ICLTensor *output); + void configure(const ICLTensor *input, ICLTensor *output, float beta = 1.0f); + /** Static function to check if given info will lead to a valid configuration of @ref CLSoftmaxLayer + * + * @param[in] input Source tensor. Data types supported: QS8/QASYMM8/QS16/F16/F32 + * @param[in] output Destination tensor. 
Data types supported: same as @p input + * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *output); // Inherited methods overridden: void run() override; private: - CLMemoryGroup _memory_group; - CLLogits1DMaxKernel _max_kernel; - CLLogits1DShiftExpSumKernel _shift_exp_sum_kernel; - CLLogits1DNormKernel _norm_kernel; - CLTensor _max; - CLTensor _sum; - CLTensor _tmp; + CLMemoryGroup _memory_group; + CLLogits1DMaxKernel _max_kernel; + CLLogits1DShiftExpSumKernel _shift_exp_sum_kernel; + CLLogits1DMaxShiftExpSumKernel _max_shift_exp_sum_kernel; + CLLogits1DNormKernel _norm_kernel; + CLTensor _max; + CLTensor _sum; + CLTensor _tmp; + bool _run_legacy_path; }; } #endif /* __ARM_COMPUTE_CLSOFTMAXLAYER_H__ */ diff --git a/arm_compute/runtime/CL/functions/CLTranspose.h b/arm_compute/runtime/CL/functions/CLTranspose.h index 9ac5458a9..89a2022e7 100644 --- a/arm_compute/runtime/CL/functions/CLTranspose.h +++ b/arm_compute/runtime/CL/functions/CLTranspose.h @@ -40,10 +40,18 @@ class CLTranspose : public ICLSimpleFunction public: /** Initialise the kernel's inputs and output * - * @param[in] input Input tensor. Data types supported: U8/S8/QS8/U16/S16/F16/U32/S32/F32 + * @param[in] input Input tensor. Data types supported: U8/S8/QS8/QASYMM8/U16/S16/F16/U32/S32/F32 * @param[out] output Output tensor. Data type supported: Same as @p input */ void configure(const ICLTensor *input, ICLTensor *output); + /** Static function to check if given info will lead to a valid configuration of @ref CLTranspose + * + * @param[in] input The input tensor. Data types supported: U8/S8/QS8/QASYMM8/U16/S16/F16/U32/S32/F32 + * @param[in] output The output tensor. Data types supported: Same as @p input + * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *output); }; } diff --git a/arm_compute/runtime/CPP/CPPFunctions.h b/arm_compute/runtime/CPP/CPPFunctions.h new file mode 100644 index 000000000..1f01ffac8 --- /dev/null +++ b/arm_compute/runtime/CPP/CPPFunctions.h @@ -0,0 +1,30 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef __ARM_COMPUTE_CPPFUNCTIONS_H__ +#define __ARM_COMPUTE_CPPFUNCTIONS_H__ + +/* Header regrouping all the CPP functions */ +#include "arm_compute/runtime/CPP/functions/CPPPermute.h" + +#endif /* __ARM_COMPUTE_CPPFUNCTIONS_H__ */ diff --git a/arm_compute/runtime/CPP/ICPPSimpleFunction.h b/arm_compute/runtime/CPP/ICPPSimpleFunction.h new file mode 100644 index 000000000..d1bd23258 --- /dev/null +++ b/arm_compute/runtime/CPP/ICPPSimpleFunction.h @@ -0,0 +1,48 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_ICPPSIMPLEFUNCTION_H__ +#define __ARM_COMPUTE_ICPPSIMPLEFUNCTION_H__ + +#include "arm_compute/core/CPP/ICPPKernel.h" +#include "arm_compute/runtime/IFunction.h" + +#include <memory> + +namespace arm_compute +{ +/** Basic interface for functions which have a single CPP kernel */ +class ICPPSimpleFunction : public IFunction +{ +public: + /** Constructor */ + ICPPSimpleFunction(); + + // Inherited methods overridden: + void run() override final; + +protected: + std::unique_ptr<ICPPKernel> _kernel; /**< Kernel to run */ +}; +} // namespace arm_compute +#endif /*__ARM_COMPUTE_ICPPSIMPLEFUNCTION_H__ */ diff --git a/arm_compute/runtime/CPP/functions/CPPPermute.h b/arm_compute/runtime/CPP/functions/CPPPermute.h new file mode 100644 index 000000000..0094576da --- /dev/null +++ b/arm_compute/runtime/CPP/functions/CPPPermute.h @@ -0,0 +1,57 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_CPPPERMUTE_H__ +#define __ARM_COMPUTE_CPPPERMUTE_H__ + +#include "arm_compute/runtime/CPP/ICPPSimpleFunction.h" + +#include "arm_compute/core/Types.h" + +namespace arm_compute +{ +class ITensor; + +/** Basic function to run @ref CPPPermuteKernel */ +class CPPPermute : public ICPPSimpleFunction +{ +public: + /** Configure the permute CPP kernel + * + * @param[in] input The input tensor to permute. Data types supported: U8/S8/QS8/QASYMM8/U16/S16/QS16/F16/U32/S32/F32 + * @param[out] output The output tensor. Data types supported: Same as @p input + * @param[in] perm Permutation vector + */ + void configure(const ITensor *input, ITensor *output, const PermutationVector &perm); + /** Static function to check if given info will lead to a valid configuration of @ref CPPPermute + * + * @param[in] input The input tensor to permute. Data types supported: U8/S8/QS8/QASYMM8/U16/S16/QS16/F16/U32/S32/F32 + * @param[in] output The output tensor. Data types supported: Same as @p input + * @param[in] perm Permutation vector + * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *output, const PermutationVector &perm); +}; +} +#endif /* __ARM_COMPUTE_CPPPERMUTE_H__ */ diff --git a/arm_compute/runtime/GLES_COMPUTE/GCFunctions.h b/arm_compute/runtime/GLES_COMPUTE/GCFunctions.h new file mode 100644 index 000000000..e76d4efb2 --- /dev/null +++ b/arm_compute/runtime/GLES_COMPUTE/GCFunctions.h @@ -0,0 +1,45 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef __ARM_COMPUTE_GCFUNCTIONS_H__ +#define __ARM_COMPUTE_GCFUNCTIONS_H__ + +/* Header regrouping all the GLES compute functions */ +#include "arm_compute/runtime/GLES_COMPUTE/functions/GCAbsoluteDifference.h" +#include "arm_compute/runtime/GLES_COMPUTE/functions/GCActivationLayer.h" +#include "arm_compute/runtime/GLES_COMPUTE/functions/GCBatchNormalizationLayer.h" +#include "arm_compute/runtime/GLES_COMPUTE/functions/GCDepthConcatenateLayer.h" +#include "arm_compute/runtime/GLES_COMPUTE/functions/GCDirectConvolutionLayer.h" +#include "arm_compute/runtime/GLES_COMPUTE/functions/GCDropoutLayer.h" +#include "arm_compute/runtime/GLES_COMPUTE/functions/GCFillBorder.h" +#include "arm_compute/runtime/GLES_COMPUTE/functions/GCFullyConnectedLayer.h" +#include "arm_compute/runtime/GLES_COMPUTE/functions/GCGEMM.h" +#include "arm_compute/runtime/GLES_COMPUTE/functions/GCGEMMInterleave4x4.h" +#include "arm_compute/runtime/GLES_COMPUTE/functions/GCGEMMTranspose1xW.h" +#include "arm_compute/runtime/GLES_COMPUTE/functions/GCNormalizationLayer.h" +#include "arm_compute/runtime/GLES_COMPUTE/functions/GCPixelWiseMultiplication.h" +#include "arm_compute/runtime/GLES_COMPUTE/functions/GCPoolingLayer.h" +#include "arm_compute/runtime/GLES_COMPUTE/functions/GCSoftmaxLayer.h" +#include "arm_compute/runtime/GLES_COMPUTE/functions/GCTranspose.h" + +#endif /* __ARM_COMPUTE_GCFUNCTIONS_H__ */ diff --git a/arm_compute/runtime/GLES_COMPUTE/GCScheduler.h b/arm_compute/runtime/GLES_COMPUTE/GCScheduler.h new file mode 100644 index 000000000..817f8b54b --- /dev/null +++ b/arm_compute/runtime/GLES_COMPUTE/GCScheduler.h @@ -0,0 +1,73 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef __ARM_COMPUTE_GCSCHEDULER_H__ +#define __ARM_COMPUTE_GCSCHEDULER_H__ + +#include "arm_compute/core/GLES_COMPUTE/IGCKernel.h" +#include "arm_compute/core/Types.h" + +namespace arm_compute +{ +class IGCKernel; + +/** Provides global access to a OpenGL ES context and command queue. */ +class GCScheduler +{ +private: + /** Constructor */ + GCScheduler(); + +public: + /** Access the scheduler singleton. + * + * @return The scheduler + */ + static GCScheduler &get(); + + /** Initialises the context and command queue used by the scheduler to default values + * and sets a default device and kernel path for the @ref GCKernelLibrary. + */ + void default_init(); + + /** Schedule the execution of the passed kernel if possible. 
+ * + * @param[in] kernel Kernel to execute. + * @param[in] flush (Optional) Specifies if the command queue will be flushed after running the kernel. + */ + void enqueue(IGCKernel &kernel, bool flush = true); + + /** Initialises the display and context to be used by the scheduler. + * + * @param[in] dpy The EGL display connection + * @param[in] ctx The EGL rendering context + */ + void init(EGLDisplay dpy, EGLContext ctx); + + /** Blocks until all commands in the associated command queue have finished. */ + void sync(); +}; +} + +#endif /* __ARM_COMPUTE_GCSCHEDULER_H__ */ diff --git a/arm_compute/runtime/GLES_COMPUTE/GCTensor.h b/arm_compute/runtime/GLES_COMPUTE/GCTensor.h new file mode 100644 index 000000000..3e51f9908 --- /dev/null +++ b/arm_compute/runtime/GLES_COMPUTE/GCTensor.h @@ -0,0 +1,100 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef __ARM_COMPUTE_GCTENSOR_H__ +#define __ARM_COMPUTE_GCTENSOR_H__ + +#include "arm_compute/core/GLES_COMPUTE/IGCTensor.h" +#include "arm_compute/runtime/GLES_COMPUTE/GCTensorAllocator.h" + +namespace arm_compute +{ +class ITensorAllocator; +class ITensorInfo; + +/** Interface for OpenGL ES tensor */ +class GCTensor : public IGCTensor +{ +public: + /** Default constructor */ + GCTensor(); + + /** Prevent instances of this class from being copied (As this class contains pointers). */ + GCTensor(const GCTensor &) = delete; + + /** Prevent instances of this class from being copy assigned (As this class contains pointers). */ + GCTensor &operator=(const GCTensor &) = delete; + + /** Allow instances of this class to be moved */ + GCTensor(GCTensor &&) = default; + + /** Allow instances of this class to be moved */ + GCTensor &operator=(GCTensor &&) = default; + + /** Virtual destructor */ + virtual ~GCTensor() = default; + + /** Return a pointer to the tensor's allocator + * + * @return A pointer to the tensor's allocator + */ + ITensorAllocator *allocator(); + + /** Enqueue a map operation of the allocated buffer on the given queue. + * + * @param[in] blocking (Optional) If true, then the mapping will be ready to use by the time + * this method returns, else it is the caller's responsibility + * to flush the queue and wait for the mapping operation to have completed before using the returned mapping pointer. + * + * @return The mapping address. 
+ */ + void map(bool blocking = true); + + /** Enqueue an unmap operation of the allocated and mapped buffer on the given queue. + * + * @note This method simply enqueues the unmap operation, it is the caller's responsibility to flush the queue and make sure the unmap is finished before + * the memory is accessed by the device. + * + */ + void unmap(); + + // Inherited methods overridden: + TensorInfo *info() const override; + TensorInfo *info() override; + uint8_t *buffer() const override; + GLuint gc_buffer() const override; + +protected: + // Inherited methods overridden: + uint8_t *do_map(bool blocking) override; + void do_unmap() override; + +private: + mutable GCTensorAllocator _allocator; +}; + +using GCImage = GCTensor; +} + +#endif /*__ARM_COMPUTE_GCTENSOR_H__ */ diff --git a/arm_compute/runtime/GLES_COMPUTE/GCTensorAllocator.h b/arm_compute/runtime/GLES_COMPUTE/GCTensorAllocator.h new file mode 100644 index 000000000..ce52cbbbd --- /dev/null +++ b/arm_compute/runtime/GLES_COMPUTE/GCTensorAllocator.h @@ -0,0 +1,128 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef __ARM_COMPUTE_GCTENSORALLOCATOR_H__ +#define __ARM_COMPUTE_GCTENSORALLOCATOR_H__ + +#include "arm_compute/core/GLES_COMPUTE/OpenGLES.h" +#include "arm_compute/runtime/ITensorAllocator.h" + +#include <memory> + +namespace arm_compute +{ +/** Basic implementation of a GLES memory tensor allocator. */ +class GCTensorAllocator : public ITensorAllocator +{ +public: + /** Default constructor. */ + GCTensorAllocator(); + + /** Prevent instances of this class from being copied (As this class contains pointers). */ + GCTensorAllocator(const GCTensorAllocator &) = delete; + + /** Prevent instances of this class from being copy assigned (As this class contains pointers). */ + GCTensorAllocator &operator=(const GCTensorAllocator &) = delete; + + /** Allow instances of this class to be moved */ + GCTensorAllocator(GCTensorAllocator &&) = default; + + /** Allow instances of this class to be moved */ + GCTensorAllocator &operator=(GCTensorAllocator &&) = default; + + /** Default destructor */ + ~GCTensorAllocator() = default; + + /** Interface to be implemented by the child class to return the pointer to the mapped data. 
*/ + uint8_t *data(); + + /** Get the OpenGL ES buffer object name + * + * @return The buffer object name + */ + GLuint get_gl_ssbo_name() const; + + /** Enqueue a map operation of the allocated buffer on the given queue. + * + * @param[in] blocking If true, then the mapping will be ready to use by the time + * this method returns, else it is the caller's responsibility + * to flush the queue and wait for the mapping operation to have completed before using the returned mapping pointer. + * + * @return The mapping address. + */ + uint8_t *map(bool blocking); + + /** Enqueue an unmap operation of the allocated buffer on the given queue. + * + * @note This method simply enqueue the unmap operation, it is the caller's responsibility to flush the queue and make sure the unmap is finished before + * the memory is accessed by the device. + * + */ + void unmap(); + + /** Allocate size specified by TensorInfo of GLES memory. + * + * @note: The tensor must not already be allocated when calling this function. + * + */ + void allocate() override; + + /** Free allocated GLES memory. + * + * @note The tensor must have been allocated when calling this function. + * + */ + void free() override; + +protected: + /** Call map() on the SSBO. + * + * @return A pointer to the beginning of the tensor's allocation. + */ + uint8_t *lock() override; + + /** Call unmap() on the SSBO. */ + void unlock() override; + +private: + class GLBufferWrapper + { + public: + GLBufferWrapper() + : _ssbo_name(0) + { + ARM_COMPUTE_GL_CHECK(glGenBuffers(1, &_ssbo_name)); + } + ~GLBufferWrapper() + { + ARM_COMPUTE_GL_CHECK(glDeleteBuffers(1, &_ssbo_name)); + } + GLuint _ssbo_name; + }; + std::unique_ptr<GLBufferWrapper> _gl_buffer; + uint8_t *_mapping; +}; +} + +#endif /* __ARM_COMPUTE_GCTENSORALLOCATOR_H__ */ diff --git a/arm_compute/runtime/GLES_COMPUTE/IGCSimpleFunction.h b/arm_compute/runtime/GLES_COMPUTE/IGCSimpleFunction.h new file mode 100644 index 000000000..15bbfffe9 --- /dev/null +++ b/arm_compute/runtime/GLES_COMPUTE/IGCSimpleFunction.h @@ -0,0 +1,50 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef __ARM_COMPUTE_IGCSIMPLEFUNCTION_H__ +#define __ARM_COMPUTE_IGCSIMPLEFUNCTION_H__ + +#include "arm_compute/core/GLES_COMPUTE/IGCKernel.h" +#include "arm_compute/core/GLES_COMPUTE/kernels/GCFillBorderKernel.h" +#include "arm_compute/runtime/IFunction.h" + +#include <memory> + +namespace arm_compute +{ +/** Basic interface for functions which have a single OpenGL ES kernel */ +class IGCSimpleFunction : public IFunction +{ +public: + /** Default constructor */ + IGCSimpleFunction(); + + // Inherited methods overridden: + void run() override final; + +protected: + std::unique_ptr<IGCKernel> _kernel; /**< Kernel to run */ + GCFillBorderKernel _border_handler; /**< Kernel to handle borders */ +}; +} +#endif /*__ARM_COMPUTE_IGCSIMPLEFUNCTION_H__ */ diff --git a/arm_compute/runtime/GLES_COMPUTE/functions/GCAbsoluteDifference.h b/arm_compute/runtime/GLES_COMPUTE/functions/GCAbsoluteDifference.h new file mode 100644 index 000000000..0d4a354e2 --- /dev/null +++ b/arm_compute/runtime/GLES_COMPUTE/functions/GCAbsoluteDifference.h @@ -0,0 +1,52 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef __ARM_COMPUTE_GCABSOLUTEDIFFERENCE_H__ +#define __ARM_COMPUTE_GCABSOLUTEDIFFERENCE_H__ + +#include "arm_compute/runtime/GLES_COMPUTE/IGCSimpleFunction.h" + +namespace arm_compute +{ +class ITensor; + +/** Basic function to run @ref GCAbsoluteDifferenceKernel + * + * @note The tensor data types for the inputs must be U8. + * @note The function calculates the absolute difference also when the 2 inputs have different tensor data types. + */ +class GCAbsoluteDifference : public IGCSimpleFunction +{ +public: + /** Initialize the function + * + * @param[in] input1 First input tensor. Data types supported: U8 + * @param[in] input2 Second input tensor. Data types supported: U8 + * @param[out] output Output tensor. Data types supported: U8 + */ + void configure(const IGCTensor *input1, const IGCTensor *input2, IGCTensor *output); +}; +} + +#endif /* __ARM_COMPUTE_GCABSOLUTEDIFFERENCE_H__ */ diff --git a/arm_compute/runtime/GLES_COMPUTE/functions/GCActivationLayer.h b/arm_compute/runtime/GLES_COMPUTE/functions/GCActivationLayer.h new file mode 100644 index 000000000..b43456b2c --- /dev/null +++ b/arm_compute/runtime/GLES_COMPUTE/functions/GCActivationLayer.h @@ -0,0 +1,53 @@ +/* + * Copyright (c) 2017 ARM Limited. 
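A possible use of GCAbsoluteDifference as declared above; the three U8 tensors are assumed to have been initialised and allocated elsewhere:

    #include "arm_compute/runtime/GLES_COMPUTE/functions/GCAbsoluteDifference.h"

    void absolute_difference(arm_compute::IGCTensor &in1, arm_compute::IGCTensor &in2, arm_compute::IGCTensor &out)
    {
        arm_compute::GCAbsoluteDifference absdiff;
        absdiff.configure(&in1, &in2, &out); // builds and validates the GLES kernel
        absdiff.run();                       // enqueues GCAbsoluteDifferenceKernel
    }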
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_GCACTIVATIONLAYER_H__ +#define __ARM_COMPUTE_GCACTIVATIONLAYER_H__ + +#include "arm_compute/core/Types.h" +#include "arm_compute/runtime/GLES_COMPUTE/IGCSimpleFunction.h" + +namespace arm_compute +{ +class IGCTensor; + +/** Basic function to run @ref GCActivationLayerKernel + * + * @note The function simulates an activation layer with the specified activation function. + */ +class GCActivationLayer : public IGCSimpleFunction +{ +public: + /** Set the input and output tensor. + * + * @note If the output tensor is a nullptr, the activation function will be performed in-place + * + * @param[in, out] input Source tensor. In case of @p output tensor = nullptr, this tensor will store the result + * of the activation function. Data types supported: F16/F32. + * @param[out] output Destination tensor. Data type supported: same as @p input + * @param[in] act_info Activation layer parameters. + */ + void configure(IGCTensor *input, IGCTensor *output, ActivationLayerInfo act_info); +}; +} +#endif /* __ARM_COMPUTE_GCACTIVATIONLAYER_H__ */ diff --git a/arm_compute/runtime/GLES_COMPUTE/functions/GCBatchNormalizationLayer.h b/arm_compute/runtime/GLES_COMPUTE/functions/GCBatchNormalizationLayer.h new file mode 100644 index 000000000..9d81b9a7f --- /dev/null +++ b/arm_compute/runtime/GLES_COMPUTE/functions/GCBatchNormalizationLayer.h @@ -0,0 +1,67 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
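For instance, the in-place path described for GCActivationLayer (passing nullptr as the output) might be exercised as follows; the tensor and the choice of ReLU are assumptions of this sketch:

    #include "arm_compute/runtime/GLES_COMPUTE/functions/GCActivationLayer.h"

    void relu_in_place(arm_compute::IGCTensor &tensor)
    {
        arm_compute::GCActivationLayer act;
        // nullptr output: the activation result is written back into `tensor`.
        act.configure(&tensor, nullptr,
                      arm_compute::ActivationLayerInfo(arm_compute::ActivationLayerInfo::ActivationFunction::RELU));
        act.run();
    }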
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_GCBATCHNORMALIZATIONLAYER_H__ +#define __ARM_COMPUTE_GCBATCHNORMALIZATIONLAYER_H__ + +#include "arm_compute/runtime/IFunction.h" + +#include "arm_compute/core/GLES_COMPUTE/kernels/GCBatchNormalizationLayerKernel.h" +#include "arm_compute/core/Types.h" + +namespace arm_compute +{ +class IGCTensor; + +/** Basic function to run @ref GCBatchNormalizationLayerKernel and simulate a batch normalization layer. + * + * Batch normalization is calculated by: + * @f[ out_i = \gamma * (\frac{in_i - \mu_{B}}{\sqrt{\sigma^2_{B} + \epsilon}}) + \beta \equiv BN_{\gamma,\beta}(in_i) @f] + * + */ +class GCBatchNormalizationLayer : public IFunction +{ +public: + /** Default constructor */ + GCBatchNormalizationLayer(); + /** Set the input and output tensors. + * + * @param[in] input Source tensor. 3 lower dimensions represent a single input with dimensions [width, height, FM]. + * The rest are optional and used for representing batches. Data types supported: F16/F32. + * @param[out] output Destination tensor. Output will have the same number of dimensions as input. Data type supported: same as @p input + * @param[in] mean Mean values tensor. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input + * @param[in] var Variance values tensor. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input + * @param[in] beta Beta values tensor. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input + * @param[in] gamma Gamma values tensor. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input + * @param[in] epsilon Small value to avoid division with zero. + */ + void configure(const IGCTensor *input, IGCTensor *output, const IGCTensor *mean, const IGCTensor *var, const IGCTensor *beta, const IGCTensor *gamma, float epsilon); + + // Inherited methods overridden: + void run() override; + +private: + GCBatchNormalizationLayerKernel _norm_kernel; /**< BatchNormalization layer kernel to run */ +}; +} +#endif /* __ARM_COMPUTE_GCBATCHNORMALIZATIONLAYER_H__ */ diff --git a/arm_compute/runtime/GLES_COMPUTE/functions/GCDepthConcatenateLayer.h b/arm_compute/runtime/GLES_COMPUTE/functions/GCDepthConcatenateLayer.h new file mode 100644 index 000000000..1151399f9 --- /dev/null +++ b/arm_compute/runtime/GLES_COMPUTE/functions/GCDepthConcatenateLayer.h @@ -0,0 +1,67 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
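Relating the GCBatchNormalizationLayer parameters above to the formula: mean and var supply the per-feature-map mu and sigma^2, beta and gamma the learned shift and scale, and epsilon the stabiliser. A hedged sketch, with all tensors assumed pre-allocated and the epsilon value chosen purely for illustration:

    #include "arm_compute/runtime/GLES_COMPUTE/functions/GCBatchNormalizationLayer.h"

    void batch_normalize(arm_compute::IGCTensor &in, arm_compute::IGCTensor &out,
                         arm_compute::IGCTensor &mean, arm_compute::IGCTensor &var,
                         arm_compute::IGCTensor &beta, arm_compute::IGCTensor &gamma)
    {
        arm_compute::GCBatchNormalizationLayer bn;
        bn.configure(&in, &out, &mean, &var, &beta, &gamma, 0.001f); // epsilon = 1e-3 for illustration
        bn.run();
    }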
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_GCDEPTHCONCATENATE_H__ +#define __ARM_COMPUTE_GCDEPTHCONCATENATE_H__ + +#include "arm_compute/core/GLES_COMPUTE/OpenGLES.h" +#include "arm_compute/core/GLES_COMPUTE/kernels/GCDepthConcatenateLayerKernel.h" +#include "arm_compute/core/GLES_COMPUTE/kernels/GCFillBorderKernel.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/runtime/IFunction.h" + +#include <memory> +#include <vector> + +namespace arm_compute +{ +class IGCTensor; + +/** Basic function to execute concatenate tensors along z axis. This function calls the following kernels: + * + * -# @ref GCFillBorderKernel (executed if input's lowest two dimensions are smaller than respective output's dimensions) + * -# @ref GCDepthConcatenateLayerKernel + * + */ +class GCDepthConcatenateLayer : public IFunction +{ +public: + /** Default constructor */ + GCDepthConcatenateLayer(); + /** Initialise the kernel's inputs vector and output. + * + * @param[in,out] inputs_vector The vectors containing all the tensors to concatenate. Data types supported: F16/F32. + * @param[out] output Output tensor. Data types supported: Same as @p input. + */ + void configure(std::vector<IGCTensor *> inputs_vector, IGCTensor *output); + + // Inherited methods overridden: + void run() override; + +private: + std::unique_ptr<GCDepthConcatenateLayerKernel[]> _concat_kernels_vector; + std::unique_ptr<GCFillBorderKernel[]> _border_handlers_vector; + unsigned int _num_inputs; +}; +} +#endif /* __ARM_COMPUTE_GCDEPTHCONCATENATE_H__ */ diff --git a/arm_compute/runtime/GLES_COMPUTE/functions/GCDirectConvolutionLayer.h b/arm_compute/runtime/GLES_COMPUTE/functions/GCDirectConvolutionLayer.h new file mode 100644 index 000000000..5472bdb9e --- /dev/null +++ b/arm_compute/runtime/GLES_COMPUTE/functions/GCDirectConvolutionLayer.h @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
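The inputs_vector interface of GCDepthConcatenateLayer above could be driven like this (a sketch; the two inputs and the output are assumed to be pre-allocated tensors of the same data type, with the output deep enough to hold both):

    #include "arm_compute/runtime/GLES_COMPUTE/functions/GCDepthConcatenateLayer.h"

    #include <vector>

    void concatenate_depth(arm_compute::IGCTensor &a, arm_compute::IGCTensor &b, arm_compute::IGCTensor &out)
    {
        std::vector<arm_compute::IGCTensor *> inputs{ &a, &b };

        arm_compute::GCDepthConcatenateLayer concat;
        concat.configure(inputs, &out); // border-handling kernels are added internally when needed
        concat.run();
    }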
+ */ +#ifndef __ARM_COMPUTE_GCDIRECTCONVOLUTIONLAYER_H__ +#define __ARM_COMPUTE_GCDIRECTCONVOLUTIONLAYER_H__ + +#include "arm_compute/core/GLES_COMPUTE/kernels/GCDirectConvolutionLayerKernel.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/runtime/GLES_COMPUTE/IGCSimpleFunction.h" + +#include <memory> + +namespace arm_compute +{ +class IGCTensor; + +/** Basic function to execute direct convolution function: + * + * @note Supported kernel size: 1x1, 3x3, and 5x5 + * @note This OpenGL ES implementation works with stride_x = 1 and 2 + */ +class GCDirectConvolutionLayer : public IGCSimpleFunction +{ +public: + /** Set the input and output tensors. + * + * @param[in] input Source tensor. 3 lower dimensions represent a single input [width, height, IFM], + * while every optional dimension from 4 and above represent a batch of inputs. + * Data types supported: F16/F32. + * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported:Same as @p input. + * @param[in] biases Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. Data type supported:Same as @p input. + * @param[out] output Destination tensor. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs. + * Data types supported: Same as @p input. + * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. + */ + void configure(const IGCTensor *input, const IGCTensor *weights, const IGCTensor *biases, IGCTensor *output, const PadStrideInfo &conv_info); +}; +} +#endif /* __ARM_COMPUTE_GCDIRECTCONVOLUTIONLAYER_H__ */ diff --git a/arm_compute/runtime/GLES_COMPUTE/functions/GCDropoutLayer.h b/arm_compute/runtime/GLES_COMPUTE/functions/GCDropoutLayer.h new file mode 100644 index 000000000..c51d2c161 --- /dev/null +++ b/arm_compute/runtime/GLES_COMPUTE/functions/GCDropoutLayer.h @@ -0,0 +1,63 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef __ARM_COMPUTE_GCDROPOUTLAYER_H__ +#define __ARM_COMPUTE_GCDROPOUTLAYER_H__ + +#include "arm_compute/core/GLES_COMPUTE/kernels/GCDropoutLayerKernel.h" +#include "arm_compute/runtime/IFunction.h" + +namespace arm_compute +{ +class IGCTensor; +/** Basic function to do dropout op. 
This function calls the following kernels: + * + * -# @ref GCDropoutLayerKernel + */ +class GCDropoutLayer : public IFunction +{ +public: + /** Constructor */ + GCDropoutLayer(); + + /** Set the input and output tensors. + * + * @param[in] input Source tensor. Data type supported: F16/F32. + * @param[out] mask Destination tensor. Data type supported: Same as @p input. + * @param[out] output Destination tensor. Data type supported: Same as @p input. + * @param[in] ratio Dropout ratio + * @param[in] forward Forward or backward propagation + * + */ + void configure(const IGCTensor *input, IGCTensor *mask, IGCTensor *output, float ratio, bool forward); + + //Inherited methods override + void run() override; + +private: + GCDropoutLayerKernel _dropout_kernel; +}; +} + +#endif /* __ARM_COMPUTE_GCDROPOUTLAYER_H__ */ diff --git a/arm_compute/runtime/GLES_COMPUTE/functions/GCFillBorder.h b/arm_compute/runtime/GLES_COMPUTE/functions/GCFillBorder.h new file mode 100644 index 000000000..a04e4002f --- /dev/null +++ b/arm_compute/runtime/GLES_COMPUTE/functions/GCFillBorder.h @@ -0,0 +1,52 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef __ARM_COMPUTE_GCFILLBORDER_H__ +#define __ARM_COMPUTE_GCFILLBORDER_H__ + +#include "arm_compute/core/PixelValue.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/runtime/GLES_COMPUTE/IGCSimpleFunction.h" + +namespace arm_compute +{ +class ITensor; + +/** Basic function to run @ref GCFillBorderKernel */ +class GCFillBorder : public IGCSimpleFunction +{ +public: + /** Initialize the function + * + * @param[in,out] tensor Source tensor. Data types supported: F16/F32 + * @param[in] border_width The border width + * @param[in] border_mode Strategy to use for borders. + * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT. + */ + void configure(IGCTensor *tensor, unsigned int border_width, BorderMode border_mode, + const PixelValue &constant_border_value = PixelValue()); +}; +} + +#endif /*__ARM_COMPUTE_FILLBORDER_H__ */ diff --git a/arm_compute/runtime/GLES_COMPUTE/functions/GCFullyConnectedLayer.h b/arm_compute/runtime/GLES_COMPUTE/functions/GCFullyConnectedLayer.h new file mode 100644 index 000000000..1ae5837de --- /dev/null +++ b/arm_compute/runtime/GLES_COMPUTE/functions/GCFullyConnectedLayer.h @@ -0,0 +1,96 @@ +/* + * Copyright (c) 2017 ARM Limited. 
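One way the dropout interface above might be used for a forward pass; the drop ratio and the pre-allocated input, mask and output tensors are assumptions of this sketch:

    #include "arm_compute/runtime/GLES_COMPUTE/functions/GCDropoutLayer.h"

    void dropout_forward(arm_compute::IGCTensor &in, arm_compute::IGCTensor &mask, arm_compute::IGCTensor &out)
    {
        arm_compute::GCDropoutLayer dropout;
        dropout.configure(&in, &mask, &out, 0.5f /* ratio */, true /* forward */);
        dropout.run(); // runs GCDropoutLayerKernel
    }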
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_GCFULLYCONNECTEDLAYER_H__ +#define __ARM_COMPUTE_GCFULLYCONNECTEDLAYER_H__ + +#include "arm_compute/core/GLES_COMPUTE/kernels/GCGEMMMatrixAccumulateBiasesKernel.h" +#include "arm_compute/core/GLES_COMPUTE/kernels/GCGEMMMatrixMultiplyKernel.h" +#include "arm_compute/core/GLES_COMPUTE/kernels/GCIm2ColKernel.h" +#include "arm_compute/core/GLES_COMPUTE/kernels/GCTransposeKernel.h" +#include "arm_compute/runtime/GLES_COMPUTE/GCTensor.h" +#include "arm_compute/runtime/GLES_COMPUTE/IGCSimpleFunction.h" + +namespace arm_compute +{ +/** Basic function to reshape the weights of Fully Connected layer with OpenGL ES. This function calls the following kernels: + * + * -# @ref GCTransposeKernel + * + * @note The fully connected layer accepts "weights" tensors only with 2 dimensions. + */ +class GCFullyConnectedLayerReshapeWeights : public IGCSimpleFunction +{ +public: + /** Set the input and output tensors. + * + * @param[in] input Weights tensor. The weights must be 2 dimensional. Data types supported: F16/F32. + * @param[out] output Destination tensor which stores the transposed input tensor. Data type supported: Same as @p input. + */ + void configure(const IGCTensor *input, IGCTensor *output); +}; + +/** Basic function to compute a Fully Connected layer on OpenGL ES. This function calls the following OpenGL ES kernels: + * + * -# @ref GCIm2ColKernel (called when the input comes from a convolutional layer) + * -# @ref GCFullyConnectedLayerReshapeWeights (if @p are_weights_reshaped is set to false and transpose_weights is set to true ) (called once) + * -# @ref GCGEMMMatrixMultiplyKernel + * -# @ref GCGEMMMatrixAccumulateBiasesKernel (if @p biases is not equal to nullptr) + * + * @note The fully connected layer accepts "weights" tensors only with 2 dimensions. + */ +class GCFullyConnectedLayer : public IFunction +{ +public: + /** Constructor */ + GCFullyConnectedLayer(); + /** Set the input and output tensors. + * + * @param[in] input Source tensor. Data type supported: F16/F32. + * @param[in] weights Weights tensor. The weights must be 2 dimensional. Data type supported: Same as @p input + * @param[in] biases Bias tensor. It can be nullptr. Data type supported:Same as @p input. + * @param[out] output Destination tensor. Data type supported: Same as @p input. + * @param[in] transpose_weights (Optional) Transpose weights if true. Defaults to true. 
+ * @param[in] are_weights_reshaped (Optional) Reshape the weights tensor if false. Defaults to false. + */ + void configure(const IGCTensor *input, const IGCTensor *weights, const IGCTensor *biases, IGCTensor *output, bool transpose_weights = true, bool are_weights_reshaped = false); + + //Inherited methods override + void run() override; + +private: + void configure_fc_fc(const IGCTensor *input, const IGCTensor *weights, IGCTensor *output); + void configure_conv_fc(const IGCTensor *input, const IGCTensor *weights, IGCTensor *output); + + GCIm2ColKernel _im2col_kernel; + GCFullyConnectedLayerReshapeWeights _reshape_weights_kernel; + GCGEMMMatrixMultiplyKernel _mm_kernel; + GCGEMMMatrixAccumulateBiasesKernel _accumulate_biases_kernel; + GCTensor _im2col_output; + GCTensor _reshape_weights_output; + bool _are_weights_reshaped; + bool _is_fc_after_conv; + bool _accumulate_biases; +}; +} +#endif /* __ARM_COMPUTE_GCFULLYCONNECTEDLAYER_H__ */ diff --git a/arm_compute/runtime/GLES_COMPUTE/functions/GCGEMM.h b/arm_compute/runtime/GLES_COMPUTE/functions/GCGEMM.h new file mode 100644 index 000000000..f2484cd80 --- /dev/null +++ b/arm_compute/runtime/GLES_COMPUTE/functions/GCGEMM.h @@ -0,0 +1,85 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef __ARM_COMPUTE_GCGEMM_H__ +#define __ARM_COMPUTE_GCGEMM_H__ + +#include "arm_compute/core/GLES_COMPUTE/kernels/GCGEMMInterleave4x4Kernel.h" +#include "arm_compute/core/GLES_COMPUTE/kernels/GCGEMMMatrixAdditionKernel.h" +#include "arm_compute/core/GLES_COMPUTE/kernels/GCGEMMMatrixMultiplyKernel.h" +#include "arm_compute/core/GLES_COMPUTE/kernels/GCGEMMTranspose1xWKernel.h" +#include "arm_compute/runtime/GLES_COMPUTE/GCTensor.h" +#include "arm_compute/runtime/IFunction.h" + +namespace arm_compute +{ +class IGCTensor; + +/** Basic function to execute GEMM on OpenGLES Compute. This function calls the following kernels: + * + * -# @ref GCGEMMInterleave4x4Kernel (if the output tensor is a matrix) + * -# @ref GCGEMMTranspose1xWKernel (if the output tensor is a matrix) + * -# @ref GCGEMMMatrixMultiplyKernel + * -# @ref GCGEMMMatrixAdditionKernel (if c != nullptr and beta != 0.0) + * + */ +class GCGEMM : public IFunction +{ +public: + /** Default constructor. */ + GCGEMM(); + + /** Initialise the kernel's inputs and output + * + * @note GEMM: General Matrix Multiply - [alpha * A * B + beta * C]. + * + * @note All tensors must have the same data type. 
+ * + * @note Whilst the first input tensor can be a vector, the second input tensor must be at least a matrix + * + * @param[in] a First input tensor (Matrix or Vector A). Data types supported: F32 + * @param[in] b Second input tensor (Matrix B). Data type supported: same as @p a. + * @param[in] c Third input tensor (Matrix C). It can be a nullptr if just the multiplication between @p a and @p b is needed. Data type supported: same as @p a. + * @param[out] output Output tensor. Data type supported: same as @p a + * @param[in] alpha Weight of the matrix product + * @param[in] beta Weight of matrix C + */ + void configure(const IGCTensor *a, const IGCTensor *b, const IGCTensor *c, IGCTensor *output, float alpha, float beta); + + // Inherited methods overridden: + void run() override; + +private: + GCGEMMInterleave4x4Kernel _interleave_kernel; + GCGEMMTranspose1xWKernel _transpose_kernel; + GCGEMMMatrixMultiplyKernel _mm_kernel; + GCGEMMMatrixAdditionKernel _ma_kernel; + GCTensor _tmp_a; + GCTensor _tmp_b; + bool _is_interleaved_transposed; + bool _run_addition; +}; +} + +#endif /* __ARM_COMPUTE_GCGEMM_H__ */ diff --git a/arm_compute/runtime/GLES_COMPUTE/functions/GCGEMMInterleave4x4.h b/arm_compute/runtime/GLES_COMPUTE/functions/GCGEMMInterleave4x4.h new file mode 100644 index 000000000..48fa7ed50 --- /dev/null +++ b/arm_compute/runtime/GLES_COMPUTE/functions/GCGEMMInterleave4x4.h @@ -0,0 +1,50 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_GCGEMMINTERLEAVE4X4_H__ +#define __ARM_COMPUTE_GCGEMMINTERLEAVE4X4_H__ + +#include "arm_compute/runtime/GLES_COMPUTE/IGCSimpleFunction.h" + +namespace arm_compute +{ +class ITensor; + +/** Basic function to execute GCGEMMInterleave4x4Kernel. This function calls the following OpenGL ES kernel: + * + * -# @ref GCGEMMInterleave4x4Kernel + * + */ +class GCGEMMInterleave4x4 : public IGCSimpleFunction +{ +public: + /** Initialise the kernel's inputs, output + * + * @param[in] input First input tensor. Data types supported: F32 + * @param[out] output Output tensor. 
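To make the GCGEMM contract above concrete: with alpha = 1 and beta = 1 the call computes output = A * B + C. A sketch under the assumption that the four F32 tensors are already allocated with compatible shapes:

    #include "arm_compute/runtime/GLES_COMPUTE/functions/GCGEMM.h"

    void gemm_accumulate(arm_compute::IGCTensor &a, arm_compute::IGCTensor &b,
                         arm_compute::IGCTensor &c, arm_compute::IGCTensor &out)
    {
        arm_compute::GCGEMM gemm;
        gemm.configure(&a, &b, &c, &out, 1.0f /* alpha */, 1.0f /* beta */);
        gemm.run(); // interleave/transpose kernels run first when the shapes require it
    }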
Data type supported: same as @p input + */ + void configure(const IGCTensor *input, IGCTensor *output); +}; +} + +#endif /* __ARM_COMPUTE_GCGEMMINTERLEAVE4X4_H__ */ diff --git a/arm_compute/runtime/GLES_COMPUTE/functions/GCGEMMTranspose1xW.h b/arm_compute/runtime/GLES_COMPUTE/functions/GCGEMMTranspose1xW.h new file mode 100644 index 000000000..24af2193c --- /dev/null +++ b/arm_compute/runtime/GLES_COMPUTE/functions/GCGEMMTranspose1xW.h @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_GCGEMMTRANSPOSE1XW_H__ +#define __ARM_COMPUTE_GCGEMMTRANSPOSE1XW_H__ + +#include "arm_compute/runtime/GLES_COMPUTE/IGCSimpleFunction.h" + +namespace arm_compute +{ +/** Basic function to execute GCGEMMTranspose1xWKernel. This function calls the following OpenGLES kernels: + * + * -# @ref GCGEMMTranspose1xWKernel + * + */ +class GCGEMMTranspose1xW : public IGCSimpleFunction +{ +public: + /** Initialise the kernel's inputs, output + * + * @param[in] input First input tensor. Data type supported: F32 + * @param[out] output Output tensor. Data type supported: same as @p input + */ + void configure(const IGCTensor *input, IGCTensor *output); +}; +} +#endif /*__ARM_COMPUTE_GCGEMMTRANSPOSE1XW_H__ */ diff --git a/arm_compute/runtime/GLES_COMPUTE/functions/GCNormalizationLayer.h b/arm_compute/runtime/GLES_COMPUTE/functions/GCNormalizationLayer.h new file mode 100644 index 000000000..d080a2f7b --- /dev/null +++ b/arm_compute/runtime/GLES_COMPUTE/functions/GCNormalizationLayer.h @@ -0,0 +1,71 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_GCNORMALIZATIONLAYER_H__ +#define __ARM_COMPUTE_GCNORMALIZATIONLAYER_H__ + +#include "arm_compute/runtime/IFunction.h" + +#include "arm_compute/core/GLES_COMPUTE/kernels/GCFillBorderKernel.h" +#include "arm_compute/core/GLES_COMPUTE/kernels/GCNormalizationLayerKernel.h" +#include "arm_compute/core/GLES_COMPUTE/kernels/GCPixelWiseMultiplicationKernel.h" +#include "arm_compute/runtime/GLES_COMPUTE/GCTensor.h" + +#include "arm_compute/core/Types.h" + +namespace arm_compute +{ +class IGCTensor; + +/** Basic function to compute a normalization layer. This function calls the following OpenGL ES kernels: + * + * -# @ref GCPixelWiseMultiplicationKernel + * -# @ref GCFillBorderKernel + * -# @ref GCNormalizationLayerKernel + * + */ +class GCNormalizationLayer : public IFunction +{ +public: + /** Default constructor */ + GCNormalizationLayer(); + /** Set the input and output tensors. + * + * @param[in] input Source tensor. 3 lower dims represent a single input with dimensions [width, height, IFM], + * and an optional 4th dimension for batch of inputs. Data types supported: F32. Number of channels must be 1. + * @param[out] output Destination tensor. Dimensions, data type and number of channels must match the input ones. + * @param[in] norm_info Normalization layer information like the normalization type, normalization size and other parameters. + */ + void configure(const IGCTensor *input, IGCTensor *output, const NormalizationLayerInfo &norm_info); + + // Inherited methods overridden: + void run() override; + +private: + GCTensor _squared_input; /**< The intermediate buffer which stores results of squaring input*/ + GCNormalizationLayerKernel _norm_kernel; /**< Normalization layer kernel to run */ + GCPixelWiseMultiplicationKernel _multiply_kernel; /**< Pixel multiplication kernel to run */ + GCFillBorderKernel _border_handler; /**< Kernel to handle borders */ +}; +} +#endif /* __ARM_COMPUTE_GCNORMALIZATIONLAYER_H__ */ diff --git a/arm_compute/runtime/GLES_COMPUTE/functions/GCPixelWiseMultiplication.h b/arm_compute/runtime/GLES_COMPUTE/functions/GCPixelWiseMultiplication.h new file mode 100644 index 000000000..e6239edc2 --- /dev/null +++ b/arm_compute/runtime/GLES_COMPUTE/functions/GCPixelWiseMultiplication.h @@ -0,0 +1,48 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_GCPIXELWISEMULTIPLICATION_H__ +#define __ARM_COMPUTE_GCPIXELWISEMULTIPLICATION_H__ + +#include "arm_compute/core/Types.h" +#include "arm_compute/runtime/GLES_COMPUTE/IGCSimpleFunction.h" + +namespace arm_compute +{ +class IGCTensor; + +/** Basic function to run @ref GCPixelWiseMultiplicationKernel. */ +class GCPixelWiseMultiplication : public IGCSimpleFunction +{ +public: + /** Initialise the kernel's inputs, output and scale. + * + * @param[in] input1 First tensor input. Data types supported: F32. + * @param[in] input2 Second tensor input. Data types supported: Same as @p input1. + * @param[out] output Output tensor. Data types supported: Same as @p input1. + * @param[in] scale Scale to apply after multiplication. Must be a positive value. + */ + void configure(const IGCTensor *input1, const IGCTensor *input2, IGCTensor *output, float scale); +}; +} +#endif /*__ARM_COMPUTE_GCPIXELWISEMULTIPLICATION_H__ */ diff --git a/arm_compute/runtime/GLES_COMPUTE/functions/GCPoolingLayer.h b/arm_compute/runtime/GLES_COMPUTE/functions/GCPoolingLayer.h new file mode 100644 index 000000000..cce44d0c3 --- /dev/null +++ b/arm_compute/runtime/GLES_COMPUTE/functions/GCPoolingLayer.h @@ -0,0 +1,53 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_GCPOOLINGLAYER_H__ +#define __ARM_COMPUTE_GCPOOLINGLAYER_H__ + +#include "arm_compute/core/Types.h" +#include "arm_compute/runtime/GLES_COMPUTE/IGCSimpleFunction.h" + +#include <memory> + +namespace arm_compute +{ +class IGCTensor; + +/** Basic function to simulate a pooling layer with the specified pooling operation. This function calls the following OpenGL ES kernels: + * + * -# @ref GCFillBorderKernel (executed if padding size is different from zero) + * -# @ref GCPoolingLayerKernel + */ +class GCPoolingLayer : public IGCSimpleFunction +{ +public: + /** Set the input and output tensors. + * + * @param[in,out] input Source tensor. (Written to only when padding != 0) Data types supported: F16/F32. + * @param[out] output Destination tensor. Data types supported: Same as @p input.
+ * @param[in] pool_info Contains pooling operation information described in @ref PoolingLayerInfo. + */ + void configure(IGCTensor *input, IGCTensor *output, const PoolingLayerInfo &pool_info); +}; +} // namespace arm_compute +#endif /* __ARM_COMPUTE_GCPOOLINGLAYER_H__ */ diff --git a/arm_compute/runtime/GLES_COMPUTE/functions/GCSoftmaxLayer.h b/arm_compute/runtime/GLES_COMPUTE/functions/GCSoftmaxLayer.h new file mode 100644 index 000000000..e7f8d5053 --- /dev/null +++ b/arm_compute/runtime/GLES_COMPUTE/functions/GCSoftmaxLayer.h @@ -0,0 +1,70 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_GCSOFTMAXLAYER_H__ +#define __ARM_COMPUTE_GCSOFTMAXLAYER_H__ + +#include "arm_compute/core/GLES_COMPUTE/kernels/GCSoftmaxLayerKernel.h" +#include "arm_compute/runtime/GLES_COMPUTE/GCTensor.h" +#include "arm_compute/runtime/IFunction.h" + +namespace arm_compute +{ +class IGCTensor; + +/** Basic function to compute a SoftmaxLayer. + * + * Softmax is calculated by : + * @f[ out = exp(x - max(x)) / sum(exp(x - max(x))) @f] + * + * This function runs the following kernels: + * -# @ref GCLogits1DMaxKernel + * -# @ref GCLogits1DShiftExpSumKernel + * -# @ref GCLogits1DNormKernel + */ +class GCSoftmaxLayer : public IFunction +{ +public: + /** Constructor */ + GCSoftmaxLayer(); + /** Set the input and output tensors. + * + * @param[in] input Source tensor. Data types supported: F16/F32 + * @param[out] output Destination tensor. Data types supported: same as @p input + * @param[in] beta (Optional) A scaling factor for the exponent. Only beta = 1 is supported. + */ + void configure(const IGCTensor *input, IGCTensor *output, float beta = 1.0f); + + // Inherited methods overridden: + void run() override; + +private: + GCLogits1DMaxKernel _max_kernel; + GCLogits1DShiftExpSumKernel _shift_exp_sum_kernel; + GCLogits1DNormKernel _norm_kernel; + GCTensor _max; + GCTensor _sum; + GCTensor _tmp; +}; +} +#endif /* __ARM_COMPUTE_GCSOFTMAXLAYER_H__ */ diff --git a/arm_compute/runtime/GLES_COMPUTE/functions/GCTranspose.h b/arm_compute/runtime/GLES_COMPUTE/functions/GCTranspose.h new file mode 100644 index 000000000..23324343f --- /dev/null +++ b/arm_compute/runtime/GLES_COMPUTE/functions/GCTranspose.h @@ -0,0 +1,50 @@ +/* + * Copyright (c) 2017 ARM Limited. 
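Given the softmax definition above, usage reduces to a single configure/run pair. A sketch with the default beta, assuming pre-allocated F16/F32 tensors:

    #include "arm_compute/runtime/GLES_COMPUTE/functions/GCSoftmaxLayer.h"

    void softmax(arm_compute::IGCTensor &logits, arm_compute::IGCTensor &probabilities)
    {
        arm_compute::GCSoftmaxLayer sm;
        sm.configure(&logits, &probabilities); // beta defaults to 1.0f, the only supported value
        sm.run();                              // max, shift/exp/sum, then normalisation kernels
    }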
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_GCTRANSPOSE_H__ +#define __ARM_COMPUTE_GCTRANSPOSE_H__ + +#include "arm_compute/runtime/GLES_COMPUTE/IGCSimpleFunction.h" + +namespace arm_compute +{ +class IGCTensor; + +/** Basic function to transpose a matrix on OpenGL ES. This function calls the following OpenGL ES kernel: + * + * -# @ref GCTransposeKernel + * + */ +class GCTranspose : public IGCSimpleFunction +{ +public: + /** Initialise the kernel's inputs and output + * + * @param[in] input Input tensor. Data types supported: F16/F32 + * @param[out] output Output tensor. Data type supported: Same as @p input + */ + void configure(const IGCTensor *input, IGCTensor *output); +}; +} + +#endif /* __ARM_COMPUTE_GCTRANSPOSE_H__ */ diff --git a/arm_compute/runtime/ILifetimeManager.h b/arm_compute/runtime/ILifetimeManager.h index 4f9af6f53..6f2c68d37 100644 --- a/arm_compute/runtime/ILifetimeManager.h +++ b/arm_compute/runtime/ILifetimeManager.h @@ -28,7 +28,7 @@ #include "arm_compute/runtime/Types.h" #include <cstddef> -#include <vector> +#include <memory> namespace arm_compute { @@ -58,6 +58,11 @@ public: * @param[in] size Size of the given object at given time */ virtual void end_lifetime(void *obj, void **handle, size_t size) = 0; + /** Checks if the lifetime of the registered object is complete + * + * @return True if all object lifetimes are finalized else false. + */ + virtual bool are_all_finalized() const = 0; /** Creates a memory pool depending on the memory requirements * * @param allocator Allocator to use @@ -65,16 +70,11 @@ public: * @return A memory pool */ virtual std::unique_ptr<IMemoryPool> create_pool(IAllocator *allocator) = 0; - /** Checks if the lifetime of the registered object is complete - * - * @return True if all object lifetimes are finalized else false. - */ - virtual bool are_all_finalized() const = 0; /** Returns the type of mappings that the lifetime manager returns * * @return Mapping type of the lifetime manager */ virtual MappingType mapping_type() const = 0; }; -} // arm_compute +} // namespace arm_compute #endif /* __ARM_COMPUTE_ILIFETIMEMANAGER_H__ */ diff --git a/arm_compute/runtime/ISimpleLifetimeManager.h b/arm_compute/runtime/ISimpleLifetimeManager.h new file mode 100644 index 000000000..792ab0b55 --- /dev/null +++ b/arm_compute/runtime/ISimpleLifetimeManager.h @@ -0,0 +1,85 @@ +/* + * Copyright (c) 2017 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_ISIMPLELIFETIMEMANAGER_H__ +#define __ARM_COMPUTE_ISIMPLELIFETIMEMANAGER_H__ + +#include "arm_compute/runtime/ILifetimeManager.h" + +#include "arm_compute/runtime/IMemoryPool.h" +#include "arm_compute/runtime/Types.h" + +#include <cstddef> +#include <map> +#include <vector> + +namespace arm_compute +{ +class IAllocator; +class IMemoryGroup; + +/** Abstract class of the simple lifetime manager interface */ +class ISimpleLifetimeManager : public ILifetimeManager +{ +public: + /** Constructor */ + ISimpleLifetimeManager(); + /** Prevent instances of this class from being copy constructed */ + ISimpleLifetimeManager(const ISimpleLifetimeManager &) = delete; + /** Prevent instances of this class from being copy assigned */ + ISimpleLifetimeManager &operator=(const ISimpleLifetimeManager &) = delete; + /** Allow instances of this class to be move constructed */ + ISimpleLifetimeManager(ISimpleLifetimeManager &&) = default; + /** Allow instances of this class to be moved */ + ISimpleLifetimeManager &operator=(ISimpleLifetimeManager &&) = default; + + // Inherited methods overridden: + void register_group(IMemoryGroup *group) override; + void start_lifetime(void *obj) override; + void end_lifetime(void *obj, void **handle, size_t size) override; + bool are_all_finalized() const override; + +protected: + /** Update blobs and mappings */ + virtual void update_blobs_and_mappings() = 0; + +protected: + /** Element struct */ + struct Element + { + Element(void *id_ = nullptr, void **handle_ = nullptr, size_t size_ = 0, bool status_ = false) + : id(id_), handle(handle_), size(size_), status(status_) + { + } + void *id; /**< Element id */ + void **handle; /**< Element's memory handle */ + size_t size; /**< Element's size */ + bool status; /**< Lifetime status */ + }; + + IMemoryGroup *_active_group; /**< Active group */ + std::vector<Element> _active_elements; /**< A vector that contains the active elements */ + std::map<IMemoryGroup *, std::vector<Element>> _finalized_groups; /**< A map that contains the finalized groups */ +}; +} // namespace arm_compute +#endif /* __ARM_COMPUTE_ISIMPLELIFETIMEMANAGER_H__ */ diff --git a/arm_compute/runtime/Memory.h b/arm_compute/runtime/Memory.h new file mode 100644 index 000000000..98bbb7023 --- /dev/null +++ b/arm_compute/runtime/Memory.h @@ -0,0 +1,83 @@ +/* + * Copyright (c) 2017 ARM Limited.
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_MEMORY_H__ +#define __ARM_COMPUTE_MEMORY_H__ + +#include <cstddef> +#include <memory> + +namespace arm_compute +{ +/** CPU implementation of memory object */ +class Memory +{ +public: + /** Default Constructor */ + Memory(); + /** Default Constructor + * + * @note Ownership of the memory is transferred to this object + * + * @param[in] memory Memory to be imported + */ + Memory(std::shared_ptr<uint8_t> memory); + /** Default Constructor + * + * @note Ownership of the memory is not transferred to this object. + * Thus management (allocate/free) should be done by the client. + * + * @param[in] memory Memory to be imported + */ + Memory(uint8_t *memory); + /** Allow instances of this class to be copied */ + Memory(const Memory &) = default; + /** Allow instances of this class to be copy assigned */ + Memory &operator=(const Memory &) = default; + /** Allow instances of this class to be moved */ + Memory(Memory &&) noexcept = default; + /** Allow instances of this class to be move assigned */ + Memory &operator=(Memory &&) noexcept = default; + + /** Returns the pointer to the allocated data. + * + * @return Pointer to the allocated data + */ + uint8_t *buffer(); + /** Returns the pointer to the allocated data. 
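The two importing constructors of Memory above differ only in ownership; the following sketch shows both paths (buffer size and lifetime handling are illustrative):

    #include "arm_compute/runtime/Memory.h"

    #include <cstdint>
    #include <memory>

    void import_memory_examples()
    {
        // Ownership transferred: the Memory object keeps the allocation alive.
        std::shared_ptr<uint8_t> owned(new uint8_t[1024], [](uint8_t *p) { delete[] p; });
        arm_compute::Memory owning(owned);

        // Ownership retained by the caller: the raw buffer must outlive the Memory object.
        static uint8_t scratch[1024];
        arm_compute::Memory imported(scratch);

        uint8_t *ptr = imported.buffer(); // access the underlying allocation
        (void)ptr;
    }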
+ * + * @return Pointer to the allocated data + */ + uint8_t *buffer() const; + /** Handle of internal memory + * + * @return Handle of memory + */ + uint8_t **handle(); + +private: + uint8_t *_memory; + std::shared_ptr<uint8_t> _memory_owned; +}; +} +#endif /* __ARM_COMPUTE_MEMORY_H__ */ diff --git a/arm_compute/runtime/MemoryGroupBase.h b/arm_compute/runtime/MemoryGroupBase.h index ab8acb349..19e983492 100644 --- a/arm_compute/runtime/MemoryGroupBase.h +++ b/arm_compute/runtime/MemoryGroupBase.h @@ -26,6 +26,7 @@ #include "arm_compute/runtime/IMemoryGroup.h" +#include "arm_compute/core/Error.h" #include "arm_compute/runtime/IMemoryManager.h" #include "arm_compute/runtime/IMemoryPool.h" diff --git a/arm_compute/runtime/MultiImage.h b/arm_compute/runtime/MultiImage.h index 917e586ef..30fa9b025 100644 --- a/arm_compute/runtime/MultiImage.h +++ b/arm_compute/runtime/MultiImage.h @@ -45,18 +45,18 @@ public: MultiImage(); /** Allocate the multi-planar image * - * @param[in] width Width of the whole image - * @param[in] height Height of the whole image - * @param[in] format Format of the whole image + * @param[in] width Width of the whole image + * @param[in] height Height of the whole image + * @param[in] format Format of the whole image */ void init(unsigned int width, unsigned int height, Format format); /** Allocate the multi-planar image * * @note Uses conservative padding strategy which fits all kernels. * - * @param[in] width Width of the whole image - * @param[in] height Height of the whole image - * @param[in] format Format of the whole image + * @param[in] width Width of the whole image + * @param[in] height Height of the whole image + * @param[in] format Format of the whole image */ void init_auto_padding(unsigned int width, unsigned int height, Format format); /** Allocated a previously initialised multi image @@ -67,10 +67,10 @@ public: void allocate(); /** Create a subimage from an existing MultiImage. * - * @param[in] image Image to use backing memory from - * @param[in] coords Starting coordinates of the new image. Should be within the parent image sizes - * @param[in] width The width of the subimage - * @param[in] height The height of the subimage + * @param[in] image Image to use backing memory from + * @param[in] coords Starting coordinates of the new image. 
Should be within the parent image sizes + * @param[in] width The width of the subimage + * @param[in] height The height of the subimage */ void create_subimage(MultiImage *image, const Coordinates &coords, unsigned int width, unsigned int height); @@ -82,10 +82,10 @@ public: private: /** Init the multi-planar image * - * @param[in] width Width of the whole image - * @param[in] height Height of the whole image - * @param[in] format Format of the whole image - * @param[in] auto_padding Specifies whether the image uses auto padding + * @param[in] width Width of the whole image + * @param[in] height Height of the whole image + * @param[in] format Format of the whole image + * @param[in] auto_padding Specifies whether the image uses auto padding */ void internal_init(unsigned int width, unsigned int height, Format format, bool auto_padding); diff --git a/arm_compute/runtime/NEON/NEFunctions.h b/arm_compute/runtime/NEON/NEFunctions.h index 40bff978a..08852cf36 100644 --- a/arm_compute/runtime/NEON/NEFunctions.h +++ b/arm_compute/runtime/NEON/NEFunctions.h @@ -39,11 +39,16 @@ #include "arm_compute/runtime/NEON/functions/NECannyEdge.h" #include "arm_compute/runtime/NEON/functions/NEChannelCombine.h" #include "arm_compute/runtime/NEON/functions/NEChannelExtract.h" +#include "arm_compute/runtime/NEON/functions/NECol2Im.h" #include "arm_compute/runtime/NEON/functions/NEColorConvert.h" #include "arm_compute/runtime/NEON/functions/NEConvolution.h" #include "arm_compute/runtime/NEON/functions/NEConvolutionLayer.h" -#include "arm_compute/runtime/NEON/functions/NEDepthConcatenate.h" -#include "arm_compute/runtime/NEON/functions/NEDepthConvert.h" +#include "arm_compute/runtime/NEON/functions/NEDeconvolutionLayer.h" +#include "arm_compute/runtime/NEON/functions/NEDeconvolutionLayerUpsample.h" +#include "arm_compute/runtime/NEON/functions/NEDepthConcatenateLayer.h" +#include "arm_compute/runtime/NEON/functions/NEDepthConvertLayer.h" +#include "arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h" +#include "arm_compute/runtime/NEON/functions/NEDepthwiseSeparableConvolutionLayer.h" #include "arm_compute/runtime/NEON/functions/NEDequantizationLayer.h" #include "arm_compute/runtime/NEON/functions/NEDerivative.h" #include "arm_compute/runtime/NEON/functions/NEDilate.h" @@ -57,7 +62,9 @@ #include "arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h" #include "arm_compute/runtime/NEON/functions/NEGEMM.h" #include "arm_compute/runtime/NEON/functions/NEGEMMInterleave4x4.h" -#include "arm_compute/runtime/NEON/functions/NEGEMMLowp.h" +#include "arm_compute/runtime/NEON/functions/NEGEMMLowpAssemblyMatrixMultiplyCore.h" +#include "arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h" +#include "arm_compute/runtime/NEON/functions/NEGEMMLowpOutputStage.h" #include "arm_compute/runtime/NEON/functions/NEGEMMTranspose1xW.h" #include "arm_compute/runtime/NEON/functions/NEGaussian3x3.h" #include "arm_compute/runtime/NEON/functions/NEGaussian5x5.h" @@ -68,8 +75,9 @@ #include "arm_compute/runtime/NEON/functions/NEHOGMultiDetection.h" #include "arm_compute/runtime/NEON/functions/NEHarrisCorners.h" #include "arm_compute/runtime/NEON/functions/NEHistogram.h" +#include "arm_compute/runtime/NEON/functions/NEIm2Col.h" #include "arm_compute/runtime/NEON/functions/NEIntegralImage.h" -#include "arm_compute/runtime/NEON/functions/NEL2Normalize.h" +#include "arm_compute/runtime/NEON/functions/NEL2NormalizeLayer.h" #include "arm_compute/runtime/NEON/functions/NELaplacianPyramid.h" #include 
"arm_compute/runtime/NEON/functions/NELaplacianReconstruct.h" #include "arm_compute/runtime/NEON/functions/NELocallyConnectedLayer.h" @@ -100,5 +108,6 @@ #include "arm_compute/runtime/NEON/functions/NETranspose.h" #include "arm_compute/runtime/NEON/functions/NEWarpAffine.h" #include "arm_compute/runtime/NEON/functions/NEWarpPerspective.h" +#include "arm_compute/runtime/NEON/functions/NEWinogradLayer.h" #endif /* __ARM_COMPUTE_NEFUNCTIONS_H__ */ diff --git a/arm_compute/runtime/NEON/functions/NEActivationLayer.h b/arm_compute/runtime/NEON/functions/NEActivationLayer.h index f3cd30591..007c53a0a 100644 --- a/arm_compute/runtime/NEON/functions/NEActivationLayer.h +++ b/arm_compute/runtime/NEON/functions/NEActivationLayer.h @@ -49,6 +49,16 @@ public: * @param[in] activation_info Activation layer parameters. */ void configure(ITensor *input, ITensor *output, ActivationLayerInfo activation_info); + /** Static function to check if given info will lead to a valid configuration of @ref NEActivationLayer + * + * @param[in] input Source tensor info. In case of @p output tensor info = nullptr, this tensor will store the result + * of the activation function. Data types supported: QS8/QS16/F16/F32. + * @param[in] output Destination tensor info. Data type supported: same as @p input + * @param[in] act_info Activation layer information. + * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *output, const ActivationLayerInfo &act_info); }; } #endif /* __ARM_COMPUTE_NEACTIVATIONLAYER_H__ */ diff --git a/arm_compute/runtime/NEON/functions/NEArithmeticAddition.h b/arm_compute/runtime/NEON/functions/NEArithmeticAddition.h index 3d1862389..371807393 100644 --- a/arm_compute/runtime/NEON/functions/NEArithmeticAddition.h +++ b/arm_compute/runtime/NEON/functions/NEArithmeticAddition.h @@ -43,6 +43,16 @@ public: * @param[in] policy Policy to use to handle overflow. */ void configure(const ITensor *input1, const ITensor *input2, ITensor *output, ConvertPolicy policy); + /** Static function to check if given info will lead to a valid configuration of @ref NEArithmeticAddition + * + * @param[in] input1 First tensor input. Data types supported: U8/QS8/QS16/S16/F16/F32 + * @param[in] input2 Second tensor input. Data types supported: U8/QS8/QS16/S16/F16/F32 + * @param[in] output Output tensor. Data types supported: U8/QS8/QS16/S16/F16/F32 + * @param[in] policy Policy to use to handle overflow. + * + * @return a status + */ + static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy); }; } #endif /*__ARM_COMPUTE_NEARITHMETICADDITION_H__ */ diff --git a/arm_compute/runtime/NEON/functions/NEArithmeticSubtraction.h b/arm_compute/runtime/NEON/functions/NEArithmeticSubtraction.h index b59cca98a..751ed1adf 100644 --- a/arm_compute/runtime/NEON/functions/NEArithmeticSubtraction.h +++ b/arm_compute/runtime/NEON/functions/NEArithmeticSubtraction.h @@ -43,6 +43,16 @@ public: * @param[in] policy Policy to use to handle overflow. */ void configure(const ITensor *input1, const ITensor *input2, ITensor *output, ConvertPolicy policy); + /** Static function to check if given info will lead to a valid configuration of @ref NEArithmeticSubtraction + * + * @param[in] input1 First tensor input. Data types supported: U8/QS8/QS16/S16/F16/F32 + * @param[in] input2 Second tensor input. Data types supported: U8/QS8/QS16/S16/F16/F32 + * @param[in] output Output tensor. 
Data types supported: U8/QS8/QS16/S16/F16/F32 + * @param[in] policy Policy to use to handle overflow. + * + * @return a status + */ + static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy); }; } #endif /* __ARM_COMPUTE_NEARITHMETICSUBTRACTION_H__ */ diff --git a/arm_compute/runtime/NEON/functions/NEBatchNormalizationLayer.h b/arm_compute/runtime/NEON/functions/NEBatchNormalizationLayer.h index 041b9e729..1933468af 100644 --- a/arm_compute/runtime/NEON/functions/NEBatchNormalizationLayer.h +++ b/arm_compute/runtime/NEON/functions/NEBatchNormalizationLayer.h @@ -50,14 +50,32 @@ public: * @param[in, out] input Source tensor. In case of @p output tensor = nullptr, this tensor will store the result. * 3 lower dimensions represent a single input with dimensions [width, height, FM]. * The rest are optional and used for representing batches. Data types supported: QS8/QS16/F16/F32. + * @param[out] output Destination tensor. Output will have the same number of dimensions as input. Data type supported: same as @p input * @param[in] mean Mean values tensor. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input * @param[in] var Variance values tensor. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input - * @param[in] gamma Gamma values tensor. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input * @param[in] beta Beta values tensor. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input + * @param[in] gamma Gamma values tensor. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input * @param[in] epsilon Small value to avoid division with zero. - * @param[out] output Destination tensor. Output will have the same number of dimensions as input. Data type supported: same as @p input */ void configure(ITensor *input, ITensor *output, const ITensor *mean, const ITensor *var, const ITensor *beta, const ITensor *gamma, float epsilon); + /** Static function to check if given info will lead to a valid configuration of @ref NEBatchNormalizationLayer + * + * @param[in] input Source tensor info. In case of @p output tensor = nullptr, this tensor will store the result. + * 3 lower dimensions represent a single input with dimensions [width, height, FM]. + * The rest are optional and used for representing batches. Data types supported: QS8/QS16/F16/F32. + * @param[in] output Destination tensor info. Output will have the same number of dimensions as input. Data type supported: same as @p input + * @param[in] mean Mean values tensor info. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input + * @param[in] var Variance values tensor info. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input + * @param[in] beta Beta values tensor info. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input + * @param[in] gamma Gamma values tensor info. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input + * @param[in] epsilon Small value to avoid division with zero. 
+ * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *output, + const ITensorInfo *mean, const ITensorInfo *var, + const ITensorInfo *beta, const ITensorInfo *gamma, + float epsilon); // Inherited methods overridden: void run() override; diff --git a/arm_compute/runtime/NEON/functions/NEChannelExtract.h b/arm_compute/runtime/NEON/functions/NEChannelExtract.h index 5e46eef3a..1620d3ad1 100644 --- a/arm_compute/runtime/NEON/functions/NEChannelExtract.h +++ b/arm_compute/runtime/NEON/functions/NEChannelExtract.h @@ -39,14 +39,14 @@ class NEChannelExtract : public INESimpleFunction public: /** Initialize the function's source, destination * - * @param[in] input The input tensor to extract the channel from. Formats supported: Any single planar. + * @param[in] input The input tensor to extract the channel from. Formats supported: Formats supported: RGB888/RGBA8888/YUYV422/UYVY422 * @param[in] channel The channel to extract. * @param[out] output The extracted channel. Format supported: U8 */ void configure(const ITensor *input, Channel channel, ITensor *output); /** Initialize the function's source, destination * - * @param[in] input The multi-planar input image to extract channel from. + * @param[in] input The multi-planar input image to extract channel from. Formats supported: NV12/NV21/IYUV/YUV444 * @param[in] channel The channel to extract. * @param[out] output The extracted channel. Format supported: U8 */ diff --git a/arm_compute/runtime/NEON/functions/NECol2Im.h b/arm_compute/runtime/NEON/functions/NECol2Im.h new file mode 100644 index 000000000..9b05bd451 --- /dev/null +++ b/arm_compute/runtime/NEON/functions/NECol2Im.h @@ -0,0 +1,60 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_NECOL2IM_H__ +#define __ARM_COMPUTE_NECOL2IM_H__ + +#include "arm_compute/runtime/NEON/INESimpleFunction.h" + +#include "arm_compute/core/Size2D.h" +#include "arm_compute/core/Types.h" + +namespace arm_compute +{ +class ITensor; + +/** Basic function to run @ref NECol2Im */ +class NECol2Im : public INESimpleFunction +{ +public: + /** Configure the col2im NEON kernel + * + * @param[in] input The input tensor to convert. Data types supported: U8/S8/QS8/QASYMM8/U16/S16/QS16/F16/U32/S32/F32 + * @param[out] output The output tensor. 
3 lower dimensions represent a single output [width, height, OFM], + * while the rest represent batch of outputs. Data types supported: Same as @p input + * @param[in] convolved_dims Output convolved dimensions. + */ + void configure(const ITensor *input, ITensor *output, const Size2D &convolved_dims); + /** Static function to check if given info will lead to a valid configuration of @ref NECol2Im + * + * @param[in] input The input tensor to convert. Data types supported: U8/S8/QS8/QASYMM8/U16/S16/QS16/F16/U32/S32/F32 + * @param[in] output The output tensor. 3 lower dimensions represent a single output [width, height, OFM], + * while the rest represent batch of outputs. Data types supported: Same as @p input + * @param[in] convolved_dims Output convolved dimensions. + * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *output, const Size2D &convolved_dims); +}; +} +#endif /* __ARM_COMPUTE_NECOL2IM_H__ */ diff --git a/arm_compute/runtime/NEON/functions/NEColorConvert.h b/arm_compute/runtime/NEON/functions/NEColorConvert.h index 2997778ed..ab0bf1460 100644 --- a/arm_compute/runtime/NEON/functions/NEColorConvert.h +++ b/arm_compute/runtime/NEON/functions/NEColorConvert.h @@ -38,26 +38,27 @@ class NEColorConvert : public INESimpleFunction public: /** Initialize the function's source, destination * - * @param[in] input The input single-planar tensor from which to convert - * @param[in] output The converted single-planar output tensor + * @param[in] input Source tensor. Formats supported: RGBA8888/UYVY422/YUYV422/RGB888 + * @param[out] output Destination tensor. Formats supported: RGB888 (if the formats of @p input are RGBA8888/UYVY422/YUYV422), + * RGBA8888 (if the formats of @p input are UYVY422/YUYV422/RGB888/) */ void configure(const ITensor *input, ITensor *output); /** Initialize the function's source, destination * - * @param[in] input The multi-planar input image from which to convert - * @param[in] output The converted single-planar output image + * @param[in] input Multi-planar source image. Formats supported: NV12/NV21/IYUV + * @param[out] output Single-planar destination image. Formats supported: RGB888/RGBA8888 */ void configure(const IMultiImage *input, IImage *output); /** Initialize the function's source, destination * - * @param[in] input The single-planar input image from which to convert - * @param[in] output The converted multi-planar output image + * @param[in] input Single-planar source image. Formats supported: RGB888/RGBA8888/UYVY422/YUYV422 + * @param[out] output Multi-planar destination image. Formats supported: NV12/IYUV/YUV444 (if the formats of @p input are RGB888/RGB8888) */ void configure(const IImage *input, IMultiImage *output); /** Initialize the function's source, destination * - * @param[in] input The multi-planar input image from which to convert - * @param[in] output The converted multi-planar output image + * @param[in] input Multi-planar source image. Formats supported: NV12/NV21/IYUV + * @param[out] output Multi-planar destination image. 
Formats supported: YUV444/IYUV (if the formats of @p input are NV12/NV21)/NV12 (if the format of @p input is IYUV) */ void configure(const IMultiImage *input, IMultiImage *output); }; diff --git a/arm_compute/runtime/NEON/functions/NEDeconvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEDeconvolutionLayer.h new file mode 100644 index 000000000..8757bc63a --- /dev/null +++ b/arm_compute/runtime/NEON/functions/NEDeconvolutionLayer.h @@ -0,0 +1,96 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_NEDECONVOLUTIONLAYER_H__ +#define __ARM_COMPUTE_NEDECONVOLUTIONLAYER_H__ + +#include "arm_compute/runtime/NEON/functions/NEDeconvolutionLayerUpsample.h" +#include "arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h" + +#include "arm_compute/core/Types.h" +#include "arm_compute/runtime/IFunction.h" +#include "arm_compute/runtime/IMemoryManager.h" +#include "arm_compute/runtime/MemoryGroup.h" +#include "arm_compute/runtime/Tensor.h" + +#include <memory> + +namespace arm_compute +{ +/** Function to run the deconvolution layer. + * + * The operation is similar to convolution but it's implemented by up-sampling the inputs with zeros insertions between the inputs and convolving + * the kernels on the up-sampled result. + * + * Before the Deconvolution is done, up-scaling the first 2D with zeros is performed. The relation between input to + * output is as follows: + * width_output = round((width_input − 1) ∗ upscale_x − 2 ∗ padding_x + kernel_x + a_x ) + * height_output = round((height_input − 1) ∗ upscale_y − 2 ∗ padding_y + kernel_y + a_y ) + * + * where + * width is the size of the first input dimension. + * height is the size of the second input dimension. + * width_output is the size of the first output dimension. + * height_output is the size of the second output dimension. + * kernel_x and kernel_y are the convolution sizes in x and y. + * ax and ay the number of zeros added to the top and right edges of the input. + * upscale_x and upscale_y how much to scale the X and Y axis. + * + * This function calls the following NEON kernels: + * + * -# @ref NEDeconvolutionLayerUpsampleKernel + * -# @ref NEDirectConvolutionLayer + * + */ +class NEDeconvolutionLayer : public IFunction +{ +public: + /** Constructor */ + NEDeconvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr); + /** Set the input, weights, biases and output tensors. 
+ * + * @param[in,out] input Input tensor. 3 lower dimensions represent a single input, and an optional 4th dimension for batch of inputs. Data types supported: F32. + * @param[in] weights The 4d weights with dimensions [width, height, OFM, IFM]. Data type supported: Same as @p input. + * @param[in] bias Optional, ignored if NULL. The biases have one dimension. Data type supported: Same as @p input. + * @param[out] output Output tensor. The output has the same number of dimensions as the @p input. + * @param[in] info Contains padding and policies to be used in the deconvolution, this is decribed in @ref PadStrideInfo. + * @param[in] ax The number of zeros added to right edge of the input. + * @param[in] ay The number of zeros added to top edge of the input. + * @param[in] upscalex How much to scale the X axis. + * @param[in] upscaley How much to scale the Y axis. + * + */ + void configure(ITensor *input, const ITensor *weights, const ITensor *bias, ITensor *output, const PadStrideInfo &info, + unsigned int ax, unsigned int ay, float upscalex, float upscaley); + + // Inherited methods overridden: + void run() override; + +private: + MemoryGroup _memory_group; + NEDeconvolutionLayerUpsample _scale_f; + NEDirectConvolutionLayer _conv_f; + Tensor _scaled_output; +}; +} // arm_compute +#endif /* __ARM_COMPUTE_NEDECONVOLUTIONLAYER_H__ */ diff --git a/arm_compute/runtime/NEON/functions/NEDeconvolutionLayerUpsample.h b/arm_compute/runtime/NEON/functions/NEDeconvolutionLayerUpsample.h new file mode 100644 index 000000000..d2ac12a58 --- /dev/null +++ b/arm_compute/runtime/NEON/functions/NEDeconvolutionLayerUpsample.h @@ -0,0 +1,72 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef __ARM_COMPUTE_NEDECONVOLUTIONUPSAMPLE_H__ +#define __ARM_COMPUTE_NEDECONVOLUTIONUPSAMPLE_H__ + +#include "arm_compute/core/NEON/kernels/NEDeconvolutionLayerUpsampleKernel.h" +#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/runtime/IFunction.h" +#include "arm_compute/runtime/IMemoryManager.h" +#include "arm_compute/runtime/MemoryGroup.h" +#include "arm_compute/runtime/Tensor.h" + +#include <cstdint> +#include <memory> + +namespace arm_compute +{ +class ITensor; + +/** Basic function to run @ref NEDeconvolutionLayerUpsampleKernel */ +class NEDeconvolutionLayerUpsample : public IFunction +{ +public: + /** Constructor + * + * Initialize NEDeconvolutionLayerUpsample + */ + NEDeconvolutionLayerUpsample(std::shared_ptr<IMemoryManager> memory_manager = nullptr); + /** Initialize the function's source, destination, interpolation type and border_mode. + * + * @param[in, out] input Source tensor. Data type supported: F32. + * @param[out] output Destination tensor. Data type supported: F32. + * @param[in] a Top and right inner border sizes. These rows and columns will be filled with zero. + * @param[in] iz The number of zeros to be inserted between each input sample + * @param[in] info Contains padding and policies to be used in the deconvolution, this is decribed in @ref PadStrideInfo. + */ + void configure(ITensor *input, ITensor *output, const std::pair<unsigned int, unsigned int> &a, + const std::pair<unsigned int, unsigned int> &iz, const PadStrideInfo &info); + + // Inherited methods overridden: + void run() override; + +private: + MemoryGroup _memory_group; + Tensor _offsets; + NEFillBorderKernel _border_handler; + NEDeconvolutionLayerUpsampleKernel _upsample; +}; +} // arm_compute +#endif /*__ARM_COMPUTE_NEDECONVOLUTIONUPSAMPLE_H__ */ diff --git a/arm_compute/runtime/NEON/functions/NEDepthConcatenate.h b/arm_compute/runtime/NEON/functions/NEDepthConcatenateLayer.h index cc6509957..5b63b7063 100644 --- a/arm_compute/runtime/NEON/functions/NEDepthConcatenate.h +++ b/arm_compute/runtime/NEON/functions/NEDepthConcatenateLayer.h @@ -26,7 +26,7 @@ #include "arm_compute/runtime/IFunction.h" -#include "arm_compute/core/NEON/kernels/NEDepthConcatenateKernel.h" +#include "arm_compute/core/NEON/kernels/NEDepthConcatenateLayerKernel.h" #include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h" #include <memory> @@ -39,14 +39,14 @@ class ITensor; /** Basic function to execute concatenate tensors along z axis. This function calls the following kernels: * * -# @ref NEFillBorderKernel (executed if input's lowest two dimensions are smaller than respective output's dimensions) - * -# @ref NEDepthConcatenateKernel + * -# @ref NEDepthConcatenateLayerKernel * */ -class NEDepthConcatenate : public IFunction +class NEDepthConcatenateLayer : public IFunction { public: /** Default constructor */ - NEDepthConcatenate(); + NEDepthConcatenateLayer(); /** Initialise the kernel's inputs vector and output. * * @param[in,out] inputs_vector The vectors containing all the tensors to concatenate. Data types supported: QS8/QS16/F16/F32. 
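The renamed NEDepthConcatenateLayer above concatenates its inputs along the depth (Z) axis, configuring a border-fill kernel and a concatenate kernel per input. A minimal usage sketch follows; it assumes a configure(std::vector<ITensor *>, ITensor *) overload as declared in this header (the declaration itself falls outside the hunks shown), and the tensor names and shapes are illustrative only, not part of this patch.

#include "arm_compute/runtime/NEON/functions/NEDepthConcatenateLayer.h"
#include "arm_compute/runtime/Tensor.h"

#include <vector>

using namespace arm_compute;

// Concatenate two already-initialised and allocated F32 tensors along depth.
// in0: [W, H, C0], in1: [W, H, C1], out: [W, H, C0 + C1]
void concatenate_depth(Tensor &in0, Tensor &in1, Tensor &out)
{
    std::vector<ITensor *> inputs_vector = { &in0, &in1 };

    NEDepthConcatenateLayer concat;
    concat.configure(inputs_vector, &out); // one border-fill + one concatenate kernel per input
    concat.run();                          // executes on the NEON backend via the default scheduler
}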
@@ -58,10 +58,10 @@ public: void run() override; private: - std::vector<ITensor *> _inputs_vector; - std::unique_ptr<NEDepthConcatenateKernel[]> _concat_kernels_vector; - std::unique_ptr<NEFillBorderKernel[]> _border_handlers_vector; - unsigned int _num_inputs; + std::vector<ITensor *> _inputs_vector; + std::unique_ptr<NEDepthConcatenateLayerKernel[]> _concat_kernels_vector; + std::unique_ptr<NEFillBorderKernel[]> _border_handlers_vector; + unsigned int _num_inputs; }; } #endif /* __ARM_COMPUTE_NEDEPTHCONCATENATE_H__ */ diff --git a/arm_compute/runtime/NEON/functions/NEDepthConvert.h b/arm_compute/runtime/NEON/functions/NEDepthConvertLayer.h index 37f7293fb..b235e87b4 100644 --- a/arm_compute/runtime/NEON/functions/NEDepthConvert.h +++ b/arm_compute/runtime/NEON/functions/NEDepthConvertLayer.h @@ -33,16 +33,16 @@ namespace arm_compute { class ITensor; -/**Basic function to run @ref NEDepthConvertKernel */ -class NEDepthConvert : public INESimpleFunction +/**Basic function to run @ref NEDepthConvertLayerKernel */ +class NEDepthConvertLayer : public INESimpleFunction { public: /* Contructor */ - NEDepthConvert() = default; + NEDepthConvertLayer() = default; /** Prevent instances of this class from being copied (As this class contains pointers)*/ - NEDepthConvert(const NEDepthConvert &) = delete; + NEDepthConvertLayer(const NEDepthConvertLayer &) = delete; /** Prevent instances of this class from being copied (As this class contains pointers)*/ - const NEDepthConvert &operator=(const NEDepthConvert &) = delete; + const NEDepthConvertLayer &operator=(const NEDepthConvertLayer &) = delete; /** Initialize the function's source, destination * * Valid conversions Input -> Output : diff --git a/arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h new file mode 100644 index 000000000..659594fe1 --- /dev/null +++ b/arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h @@ -0,0 +1,113 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef __ARM_COMPUTE_NEDEPTHWISECONVOLUTION_H__ +#define __ARM_COMPUTE_NEDEPTHWISECONVOLUTION_H__ + +#include "arm_compute/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.h" +#include "arm_compute/core/NEON/kernels/NEDepthwiseIm2ColKernel.h" +#include "arm_compute/core/NEON/kernels/NEDepthwiseVectorToTensorKernel.h" +#include "arm_compute/core/NEON/kernels/NEDepthwiseWeightsReshapeKernel.h" +#include "arm_compute/core/NEON/kernels/NEDirectConvolutionLayerBiasAccumulateKernel.h" +#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h" +#include "arm_compute/core/NEON/kernels/NEGEMMMatrixVectorMultiplyKernel.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/runtime/IFunction.h" +#include "arm_compute/runtime/IMemoryManager.h" +#include "arm_compute/runtime/MemoryGroup.h" +#include "arm_compute/runtime/Tensor.h" + +namespace arm_compute +{ +class ITensor; + +/** Basic function to execute a depthwise convolution for kernel size 3x3xC. This function calls the following NEON kernels: + * + * -# @ref NEDepthwiseConvolutionLayer3x3Kernel + * -# @ref NEFillBorderKernel (if pad_x or pad_y > 0) + * + */ +class NEDepthwiseConvolutionLayer3x3 : public IFunction +{ +public: + /** Default constructor */ + NEDepthwiseConvolutionLayer3x3(); + /** Initialize the function's source, destination, kernels and border_size. + * + * @param[in, out] input Source tensor. Data type supported: F32. (Written to only for border filling). + * @param[in] weights Weights tensor. These are 3D tensors with shape [3, 3, IFM]. Data type supported: Same as @p input. + * @param[in] biases (Optional) Biases tensor. A 1D tensor with shape [IFM]. Must be nullptr if not needed. + * Data type supported: Same as @p input. + * @param[out] output Destination tensor. Data type supported: same as @p input. + * @param[in] conv_info Padding and stride information to use for the convolution. + */ + void configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info); + + // Inherited methods overridden: + void run() override; + +private: + NEDepthwiseConvolutionLayer3x3Kernel _kernel; + NEDirectConvolutionLayerBiasAccumulateKernel _bias_kernel; + NEFillBorderKernel _border_handler; + bool _has_bias; +}; + +/** Basic function to execute a generic depthwise convolution. This function calls the following NEON kernels: + * + * -# @ref NEDepthwiseIm2ColKernel + * -# @ref NEDepthwiseWeightsReshapeKernel + * -# @ref NEGEMMMatrixVectorMultiplyKernel + * -# @ref NEFillBorderKernel (if pad_x or pad_y > 0) + * + */ +class NEDepthwiseConvolutionLayer : public IFunction +{ +public: + /** Default constructor */ + NEDepthwiseConvolutionLayer(); + /** Initialize the function's source, destination, weights and convolution information. + * + * @param[in, out] input Source tensor. Data type supported: F32. (Written to only for border filling). + * @param[out] output Destination tensor. Data type supported: same as @p input. + * @param[in] weights Weights tensor. These are 3D tensors with shape [kernel_x, kernel_y, IFM]. Data type supported: Same as @p input. + * @param[in] biases (Optional) Biases tensor. A 1D tensor with shape [IFM]. Must be nullptr if not needed. + * Data type supported: Same as @p input. + * @param[in] conv_info Padding and stride information to use for the convolution. 
+ */ + void configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info); + + // Inherited methods overridden: + void run() override; + +private: + NEDepthwiseIm2ColKernel _im2col_kernel; + NEDepthwiseWeightsReshapeKernel _weights_reshape_kernel; + NEGEMMMatrixVectorMultiplyKernel _v2mm_kernel; + NEDepthwiseVectorToTensorKernel _vector_to_tensor_kernel; + Tensor _input_reshaped; + Tensor _weights_reshaped; + Tensor _v2mm_output; +}; +} +#endif /* __ARM_COMPUTE_NEDEPTHWISECONVOLUTION_H__ */
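For orientation, here is a minimal usage sketch of the new generic NEDepthwiseConvolutionLayer declared above, using the configure() signature shown in this header. The tensor shapes, the padding values and the allocation boilerplate are illustrative assumptions by the editor, not part of the patch or the library's documentation.

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

int main()
{
    // 16x16 input with 8 channels, 3x3 depthwise weights, one bias per channel.
    Tensor input, weights, biases, output;
    input.allocator()->init(TensorInfo(TensorShape(16U, 16U, 8U), 1, DataType::F32));
    weights.allocator()->init(TensorInfo(TensorShape(3U, 3U, 8U), 1, DataType::F32));
    biases.allocator()->init(TensorInfo(TensorShape(8U), 1, DataType::F32));
    output.allocator()->init(TensorInfo(TensorShape(16U, 16U, 8U), 1, DataType::F32));

    // Stride 1 with padding 1 keeps the spatial size unchanged for a 3x3 kernel.
    NEDepthwiseConvolutionLayer dwconv;
    dwconv.configure(&input, &weights, &biases, &output, PadStrideInfo(1, 1, 1, 1));

    // Allocate backing memory after configuration; fill the tensors with data before running.
    input.allocator()->allocate();
    weights.allocator()->allocate();
    biases.allocator()->allocate();
    output.allocator()->allocate();

    dwconv.run(); // im2col -> reshaped-weights matrix-vector multiply -> vector-to-tensor
    return 0;
}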
\ No newline at end of file diff --git a/arm_compute/runtime/NEON/functions/NEDepthwiseSeparableConvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEDepthwiseSeparableConvolutionLayer.h new file mode 100644 index 000000000..0562c0751 --- /dev/null +++ b/arm_compute/runtime/NEON/functions/NEDepthwiseSeparableConvolutionLayer.h @@ -0,0 +1,79 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_NEON_DEPTHWISE_SEPARABLE_CONVOLUTION_H__ +#define __ARM_COMPUTE_NEON_DEPTHWISE_SEPARABLE_CONVOLUTION_H__ + +#include "arm_compute/core/Types.h" +#include "arm_compute/runtime/IFunction.h" +#include "arm_compute/runtime/NEON/INESimpleFunction.h" +#include "arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h" +#include "arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h" +#include "arm_compute/runtime/Tensor.h" + +#include <cstdint> + +namespace arm_compute +{ +class ITensor; + +/** Basic function to execute depthwise convolution. This function calls the following NEON kernels and function: + * + * -# @ref NEDepthwiseConvolutionLayer + * -# @ref NEDirectConvolutionLayer + * + */ +class NEDepthwiseSeparableConvolutionLayer : public IFunction +{ +public: + /** Default constructor */ + NEDepthwiseSeparableConvolutionLayer(); + /** Set the input and output tensors. + * + * @param[in] input Source tensor. 3 lower dimensions represent a single input [width, height, IFM], + * while every optional dimension from 4 and above represent a batch of inputs. Data types supported: F32. + * @param[in] depthwise_weights Depthwise convolution weights tensor. These are 3D tensors with dimensions [kernel_x, kernel_y, IFM]. Data type supported: Same as @p input. + * @param[in] depthwise_biases (Optional) Biases tensor.Biases are 1D tensor with dimensions [IFM]. Must be nullptr if not needed. + * Data type supported: Same as @p weights. + * @param[out] depthwise_out Depthwise destination tensor. + * @param[in] pointwise_weights Pointwise convolution weights tensor. These are 4D tensors with dimensions [1, 1, IFM, OFM]. Data type supported: Same as @p input. + * @param[in] pointwise_biases (Optional) Biases tensor. Biases are 1D tensor with dimensions [OFM]. Must be nullptr if not needed. + * Data type supported: Same as @p weights. + * @param[out] output Destination tensor. 
3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs. + * Data types supported: Same as @p input. + * @param[in] depthwise_conv_info Contains padding and stride information described in @ref PadStrideInfo for depthwise convolution. + * @param[in] pointwise_conv_info Contains padding and stride information described in @ref PadStrideInfo for pointwise convolution. + */ + void configure(ITensor *input, const ITensor *depthwise_weights, const ITensor *depthwise_biases, ITensor *depthwise_out, + const ITensor *pointwise_weights, const ITensor *pointwise_biases, ITensor *output, + const PadStrideInfo &depthwise_conv_info, const PadStrideInfo &pointwise_conv_info); + + // Inherited methods overriden: + void run() override; + +private: + NEDepthwiseConvolutionLayer _depthwise_conv; + NEDirectConvolutionLayer _pointwise_conv; +}; +} +#endif /*__ARM_COMPUTE_NEON_DEPTHWISE_SEPARABLE_CONVOLUTION_H__ */ diff --git a/arm_compute/runtime/NEON/functions/NEDilate.h b/arm_compute/runtime/NEON/functions/NEDilate.h index 17bdb3363..3b795f57e 100644 --- a/arm_compute/runtime/NEON/functions/NEDilate.h +++ b/arm_compute/runtime/NEON/functions/NEDilate.h @@ -49,7 +49,7 @@ public: * @param[in] border_mode Border mode to use for the convolution. * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT. */ - void configure(ITensor *input, ITensor *output, BorderMode border_mode, uint8_t constant_border_value); + void configure(ITensor *input, ITensor *output, BorderMode border_mode, uint8_t constant_border_value = 0); }; } #endif /*__ARM_COMPUTE_NEDILATE_H__ */ diff --git a/arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h index daaf18f29..09a54968b 100644 --- a/arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h +++ b/arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h @@ -51,23 +51,43 @@ public: /** Constructor */ NEDirectConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr); /** Set the input, weights, biases and output tensors. - * - * @note: DirectConvolution only works in the following configurations: - * 1x1 convolution with stride_x = 1/2/3, stride_y = 1/2/3 data type = QS8/QS16/F16/F32 - * 3x3 convolution with stride_x = 1/2/3, stride_y = 1/2/3 data type = QS8/F16/F32 - * 5x5 convolution with stride_x = 1/2/3, stride_y = 1/2/3 data type = F32 - * - * @param[in, out] input Input tensor. Data types supported: QS8/QS16/F16/F32. - * @param[in] weights Set of kernels to convolve the input volume. - * Supported sizes: 1x1, 3x3 and 5x5. - * The 3rd dimension must be the same as the input's volume 3rd dimension. - * Data type supported: Same as @p input. - * @param[in] bias Set of biases. Data type supported: Same as @p input. - * @param[out] output Output tensor. - * The 3rd dimensions must be equal to the 4th dimension of the @p kernels tensor. Data types supported: Same as @p input. - * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. - */ + * + * @note: DirectConvolution only works in the following configurations: + * 1x1 convolution with stride_x = 1/2/3, stride_y = 1/2/3 data type = QS8/QS16/F16/F32 + * 3x3 convolution with stride_x = 1/2/3, stride_y = 1/2/3 data type = QS8/F16/F32 + * 5x5 convolution with stride_x = 1/2/3, stride_y = 1/2/3 data type = F32 + * + * @param[in, out] input Input tensor. Data types supported: QS8/QS16/F16/F32. 
+ * @param[in] weights Set of kernels to convolve the input volume. + * Supported sizes: 1x1, 3x3 and 5x5. + * The 3rd dimension must be the same as the input's volume 3rd dimension. + * Data type supported: Same as @p input. + * @param[in] bias Set of biases. Can be nullptr. Data type supported: Same as @p input. + * @param[out] output Output tensor. + * The 3rd dimensions must be equal to the 4th dimension of the @p kernels tensor. Data types supported: Same as @p input. + * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. + */ void configure(ITensor *input, const ITensor *weights, const ITensor *bias, ITensor *output, const PadStrideInfo &conv_info); + /** Static function to check if given info will lead to a valid configuration of @ref NEDirectConvolutionLayer + * + * @note: DirectConvolution only works in the following configurations: + * 1x1 convolution with stride_x = 1/2/3, stride_y = 1/2/3 data type = QS8/QS16/F16/F32 + * 3x3 convolution with stride_x = 1/2/3, stride_y = 1/2/3 data type = QS8/F16/F32 + * 5x5 convolution with stride_x = 1/2/3, stride_y = 1/2/3 data type = F32 + * + * @param[in] input Input tensor. Data types supported: QS8/QS16/F16/F32. + * @param[in] weights Set of kernels to convolve the input volume. + * Supported sizes: 1x1, 3x3 and 5x5. + * The 3rd dimension must be the same as the input's volume 3rd dimension. + * Data type supported: Same as @p input. + * @param[in] bias Set of biases. Can be nullptr. Data type supported: Same as @p input. + * @param[in] output Output tensor. + * The 3rd dimensions must be equal to the 4th dimension of the @p kernels tensor. Data types supported: Same as @p input. + * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. + * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *output, const PadStrideInfo &conv_info); // Inherited methods overridden: void run() override; @@ -78,6 +98,7 @@ private: NEDirectConvolutionLayerKernel _conv_kernel; NEFillBorderKernel _input_border_handler; Tensor _accumulator; + bool _has_bias; }; } #endif /* __ARM_COMPUTE_NEDIRECTCONVOLUTIONLAYER_H__ */ diff --git a/arm_compute/runtime/NEON/functions/NEErode.h b/arm_compute/runtime/NEON/functions/NEErode.h index 940ae1847..739e981a9 100644 --- a/arm_compute/runtime/NEON/functions/NEErode.h +++ b/arm_compute/runtime/NEON/functions/NEErode.h @@ -49,7 +49,7 @@ public: * @param[in] border_mode Border mode to use for the convolution. * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT. */ - void configure(ITensor *input, ITensor *output, BorderMode border_mode, uint8_t constant_border_value); + void configure(ITensor *input, ITensor *output, BorderMode border_mode, uint8_t constant_border_value = 0); }; } #endif /*__ARM_COMPUTE_NEERODE_H__ */ diff --git a/arm_compute/runtime/NEON/functions/NEGEMMLowp.h b/arm_compute/runtime/NEON/functions/NEGEMMLowp.h deleted file mode 100644 index 0b0a7742f..000000000 --- a/arm_compute/runtime/NEON/functions/NEGEMMLowp.h +++ /dev/null @@ -1,90 +0,0 @@ -/* - * Copyright (c) 2017 ARM Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef __ARM_COMPUTE_NEGEMMLOWP_H__ -#define __ARM_COMPUTE_NEGEMMLOWP_H__ - -#include "arm_compute/core/NEON/INEKernel.h" -#include "arm_compute/runtime/IFunction.h" -#include "arm_compute/runtime/Tensor.h" - -#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h" -#include "arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h" -#include "arm_compute/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h" -#include "arm_compute/core/NEON/kernels/NEGEMMTranspose1xWKernel.h" -#include "arm_compute/runtime/IMemoryManager.h" -#include "arm_compute/runtime/MemoryGroup.h" - -#include <memory> - -namespace arm_compute -{ -class ITensor; - -/** Basic function to execute GEMMLowp on NEON. This function calls the following NEON kernels: -* -* -# @ref NEGEMMInterleave4x4Kernel -* -# @ref NEGEMMTranspose1xWKernel -* -# @ref NEGEMMLowpMatrixMultiplyKernel -* -*/ -class NEGEMMLowp : public IFunction -{ -public: - /** Constructor */ - NEGEMMLowp(std::shared_ptr<IMemoryManager> memory_manager = nullptr); - /** Initialise the kernel's inputs, output - * - * @note GEMM_LOWP: low precision GEMM kernel - * This kernel performs the following computation: - * - * -# Convert a values from uint8 to int32 and add a_offset to each of them. - * -# Convert b values from uint8 to int32 and add b_offset to each of them. - * -# Compute the int32 matrix product of the resulting a * b. - * -# Add output_offset to each entry of the result. - * -# Multiply each entry of the result and round to the nearest integer - * -# Clamp the resulting int32 values to the [0..255] range and cast to uint8. - * - * @param[in] a First input tensor (Matrix A). Data type supported: U8. - * @param[in] b Second input tensor (Matrix B). Data type supported: same as @p a - * @param[out] output Output tensor. Data type supported: same as @p a. - * @param[in] a_offset Offset to be added to each element of the matrix A. - * @param[in] b_offset Offset to be added to each element of the matrix B. - * @param[in] output_offset Offset to be added to each element of the output matrix - * @param[in] output_mult_int Value to be multiplied to each element of the output matrix - * @param[in] shift Number of bits to shift right the result. 
- */ - void configure(const ITensor *a, const ITensor *b, ITensor *output, int32_t a_offset, int32_t b_offset, int32_t output_offset, int32_t output_mult_int, int32_t shift); - // Inherited methods overridden: - void run() override; - -private: - MemoryGroup _memory_group; - NEGEMMInterleave4x4Kernel _interleave_kernel; - NEGEMMTranspose1xWKernel _transpose_kernel; - NEGEMMLowpMatrixMultiplyKernel _mm_kernel; - Tensor _tmp_a; - Tensor _tmp_b; -}; -} -#endif /*__ARM_COMPUTE_NEGEMMLOWP_H__ */ diff --git a/arm_compute/runtime/NEON/functions/NEGEMMLowpAssemblyMatrixMultiplyCore.h b/arm_compute/runtime/NEON/functions/NEGEMMLowpAssemblyMatrixMultiplyCore.h new file mode 100644 index 000000000..3d213a766 --- /dev/null +++ b/arm_compute/runtime/NEON/functions/NEGEMMLowpAssemblyMatrixMultiplyCore.h @@ -0,0 +1,69 @@ + +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_NEGEMMLOWPASSEMBLYMATRIXMULTIPLYCORE_H__ +#define __ARM_COMPUTE_NEGEMMLOWPASSEMBLYMATRIXMULTIPLYCORE_H__ + +#include "arm_compute/core/NEON/INEKernel.h" +#include "arm_compute/runtime/IFunction.h" +#include "arm_compute/runtime/IMemoryManager.h" +#include "arm_compute/runtime/MemoryGroup.h" +#include "arm_compute/runtime/Tensor.h" + +#include <memory> + +namespace arm_compute +{ +class ITensor; + +/** Basic function to execute matrix multiply assembly kernels. + * +*/ +class NEGEMMLowpAssemblyMatrixMultiplyCore : public IFunction +{ +public: + /** Constructor */ + NEGEMMLowpAssemblyMatrixMultiplyCore(std::shared_ptr<IMemoryManager> memory_manager = nullptr); + /** Initialise the kernel's inputs, output + * + * @param[in] a First input tensor (Matrix A). Data type supported: U8, S8. + * @param[in] b Second input tensor (Matrix B). Data type supported: same as @p a + * @param[out] output Output tensor. 
Data type supported: S32 + */ + void configure(const ITensor *a, const ITensor *b, ITensor *output); + + // Inherited methods overridden: + void run() override; + +private: + MemoryGroup _memory_group; + std::unique_ptr<INEKernel> _mm_kernel; + std::unique_ptr<INEKernel> _mtx_a_reshape_kernel; + std::unique_ptr<INEKernel> _mtx_b_reshape_kernel; + Tensor _tmp_a; + Tensor _tmp_b; + Tensor _workspace; +}; +} +#endif /*__ARM_COMPUTE_NEGEMMLOWPASSEMBLYMATRIXMULTIPLYCORE_H__ */ diff --git a/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h b/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h new file mode 100644 index 000000000..46e6b494f --- /dev/null +++ b/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h @@ -0,0 +1,110 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_NEGEMMLOWPMATRIXMULTIPLYCORE_H__ +#define __ARM_COMPUTE_NEGEMMLOWPMATRIXMULTIPLYCORE_H__ + +#include "arm_compute/core/NEON/INEKernel.h" +#include "arm_compute/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.h" +#include "arm_compute/core/NEON/kernels/NEGEMMLowpReductionKernel.h" +#include "arm_compute/runtime/IFunction.h" +#include "arm_compute/runtime/IMemoryManager.h" +#include "arm_compute/runtime/MemoryGroup.h" +#include "arm_compute/runtime/Tensor.h" + +#include <memory> + +namespace arm_compute +{ +class ITensor; + +/** Basic function to execute GEMMLowpMatrixMultiplyCore on NEON. This function calls the following NEON kernels if the DOT product instruction is not available: + * + * -# @ref NEGEMMInterleave4x4Kernel + * -# @ref NEGEMMTranspose1xWKernel + * -# @ref NEGEMMLowpMatrixMultiplyKernel + * -# @ref NEGEMMLowpOffsetContributionKernel + * + * otherwise if the DOT product instruction is available: + * + * -# @ref NEGEMMInterleaveBlockedKernel + * -# @ref NEGEMMLowpAArch64V8P4Kernel + * -# @ref NEGEMMLowpOffsetContributionKernel + * +*/ +class NEGEMMLowpMatrixMultiplyCore : public IFunction +{ +public: + /** Constructor */ + NEGEMMLowpMatrixMultiplyCore(std::shared_ptr<IMemoryManager> memory_manager = nullptr); + /** Initialise the kernel's inputs, output + * + * @note GEMM_LOWP: low precision GEMM kernel + * This kernel performs the following computations: + * + * -# Convert a values from QASYMM8 to int32 and add a_offset to each of them. + * -# Convert b values from QASYMM8 to int32 and add b_offset to each of them.
+ * -# Compute the matrix product of the resulting a * b in int32. + * + * @param[in] a First input tensor (Matrix A). Data type supported: QASYMM8. + * @param[in] b Second input tensor (Matrix B). Data type supported: same as @p a + * @param[out] output Output tensor. Data type supported: S32 + * @param[in] gemm_info (Optional) Specifies if the matrix A and/or matrix B have been reshaped and + * if the reshape of matrix B should be executed only for the first run + */ + void configure(const ITensor *a, const ITensor *b, ITensor *output, const GEMMInfo &gemm_info = GEMMInfo()); + /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMLowpMatrixMultiplyCore + * + * @param[in] a First input tensor (Matrix A). Data type supported: QASYMM8. + * @param[in] b Second input tensor (Matrix B). Data type supported: same as @p a + * @param[out] output Output tensor. Data type supported: S32 + * @param[in] gemm_info (Optional) Specifies if the matrix A and/or matrix B have been reshaped and + * if the reshape of matrix B should be executed only for the first run + * + * @return a status + */ + static Status validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *output, const GEMMInfo &gemm_info = GEMMInfo()); + + // Inherited methods overridden: + void run() override; + +private: + MemoryGroup _memory_group; + std::unique_ptr<INEKernel> _mm_kernel; + std::unique_ptr<INEKernel> _mtx_a_reshape_kernel; + std::unique_ptr<INEKernel> _mtx_b_reshape_kernel; + NEGEMMLowpMatrixAReductionKernel _mtx_a_reduction_kernel; + NEGEMMLowpMatrixBReductionKernel _mtx_b_reduction_kernel; + NEGEMMLowpOffsetContributionKernel _offset_contribution_kernel; + Tensor _vector_sum_col; + Tensor _vector_sum_row; + Tensor _tmp_a; + Tensor _tmp_b; + Tensor _workspace; + int32_t _a_offset; + int32_t _b_offset; + bool _run_vector_matrix_multiplication; + bool _dot_product_path; +}; +} +#endif /*__ARM_COMPUTE_NEGEMMLOWPMATRIXMULTIPLYCORE_H__ */ diff --git a/arm_compute/runtime/NEON/functions/NEGEMMLowpOutputStage.h b/arm_compute/runtime/NEON/functions/NEGEMMLowpOutputStage.h new file mode 100644 index 000000000..7da0d2359 --- /dev/null +++ b/arm_compute/runtime/NEON/functions/NEGEMMLowpOutputStage.h @@ -0,0 +1,151 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE.
+ */ +#ifndef __ARM_COMPUTE_NEGEMMLOWPOUTPUTSTAGE_H__ +#define __ARM_COMPUTE_NEGEMMLOWPOUTPUTSTAGE_H__ + +#include "arm_compute/runtime/NEON/INESimpleFunction.h" + +/** This file contains all available output stages for GEMMLowp on NEON. + * + * In gemmlowp, the "output stage" is the process that takes a final int32 accumulator value (the output of @ref NEGEMMLowpMatrixMultiplyCore), + * and processes it to obtain the final ASYMM8 value. + * + * More information about the GEMMLowp output stage can be found at https://github.com/google/gemmlowp/blob/master/doc/output.md + */ + +namespace arm_compute +{ +class ITensor; + +/** Basic function to execute NEGEMMLowpQuantizeDownInt32ToUint8Scale on NEON. + * + * NEGEMMLowpQuantizeDownInt32ToUint8Scale depends on 3 parameters: result_offset, result_mult_int, result_shift + * The final result is: + * + * ((input[i][k] + result_offset) * result_mult_int) >> result_shift + * + * In case the bias tensor is provided, the final result is: + * + * ((input[i][k] + bias[k] + result_offset) * result_mult_int) >> result_shift + * + * This function calls the following NEON kernels: + * + * -# @ref NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel + * + * @note The function also accepts two optional input arguments (min and max) which can be used to implement "rectified linear unit" activation functions + * after the result is shifted right by result_shift +*/ +class NEGEMMLowpQuantizeDownInt32ToUint8Scale : public INESimpleFunction +{ +public: + /** Initialise the kernel's inputs, output + * + * @param[in] input Input tensor. It is the output of @ref NEGEMMLowpMatrixMultiplyCore function. Data type supported: S32 + * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the addition of biases is not required. + * Biases are a 1D tensor with dimensions [OFM]. Data type supported: Same as @p input. + * @param[out] output Output tensor. Data type supported: QASYMM8 + * @param[in] result_offset Offset to be added to each element of the input matrix + * @param[in] result_mult_int Value to be multiplied with each element of the input matrix once the result_offset has been added + * @param[in] result_shift Number of bits to shift right the result before converting back to QASYMM8 + * @param[in] min (Optional) Min value used to saturate down the output result before converting back to QASYMM8 + * @param[in] max (Optional) Max value used to saturate up the output result before converting back to QASYMM8, + * Along with @p min, this value can be used to implement "rectified linear unit" activation functions + */ + void configure(const ITensor *input, const ITensor *bias, ITensor *output, int result_offset, int result_mult_int, int result_shift, int min = 0, int max = 0); + /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMLowpQuantizeDownInt32ToUint8Scale + * + * @param[in] input Input tensor. It is the output of @ref NEGEMMLowpMatrixMultiplyCore function. Data type supported: S32 + * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the addition of biases is not required. + * Biases are a 1D tensor with dimensions [OFM]. Data type supported: Same as @p input. + * @param[in] output Output tensor.
Data type supported: QASYMM8 + * @param[in] min (Optional) Min value used to saturate down the output result before converting back to QASYMM8 + * @param[in] max (Optional) Max value used to saturate up the output result before converting back to QASYMM8, + * Along with @p min, this value can be used to implement "rectified linear unit" activation functions + * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min = 0, int max = 0); +}; + +/** Basic function to execute NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint on NEON. + * + * NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint depends on 3 parameters: + * + * result_fixedpoint_multiplier, result_shift, result_offset_after_shift + * + * The final result is: + * + * (FixedPointMul(input[i][k], result_fixedpoint_multiplier) >> result_shift) + result_offset_after_shift + * + * where FixedPointMul(x, y) is the nearest integer to the following + * mathematical expression, evaluated without overflow or intermediate rounding: + * + * (x * y) / 2^31 + * + * For more information: https://github.com/google/gemmlowp/blob/master/public/output_stages.h#L68 + * + * In case the bias tensor is provided, the final result is: + * + * ((FixedPointMul(input[i][k] + bias[k], result_fixedpoint_multiplier)) >> result_shift) + result_offset_after_shift + * + * This function calls the following NEON kernels: + * + * -# @ref NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel + * + * @note The function also accepts two optional input arguments (min and max) which can be used to implement "rectified linear unit" activation functions + * after the result is shifted right by result_shift +*/ +class NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint : public INESimpleFunction +{ +public: + /** Initialise the kernel's inputs, output + * + * @param[in] input Input tensor. Data type supported: S32 + * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required. + * Biases are a 1D tensor with dimensions [OFM]. Data type supported: Same as @p input. + * @param[out] output Output tensor. Data type supported: QASYMM8 + * @param[in] result_fixedpoint_multiplier Fixed point value to be multiplied with each element of the input matrix once the result_offset has been added + * @param[in] result_shift Number of bits to shift right the result after the fixed point multiplication + * @param[in] result_offset_after_shift Offset to be applied to the result before converting it back to QASYMM8 + * @param[in] min (Optional) Min value used to saturate down the output result before converting back to QASYMM8 + * @param[in] max (Optional) Max value used to saturate up the output result before converting back to QASYMM8, + * Along with @p min, this value can be used to implement "rectified linear unit" activation functions + */ + void configure(const ITensor *input, const ITensor *bias, ITensor *output, int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift, int min = 0, int max = 0); + /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint + * + * @param[in] input Input tensor. It is the output of @ref NEGEMMLowpMatrixMultiplyCore function. Data type supported: S32 + * @param[in] bias Biases tensor.
Only shared biases supported and it can be a nullptr if the addition of biases is not required. + * Biases are a 1D tensor with dimensions [OFM]. Data type supported: Same as @p input. + * @param[in] output Output tensor. Data type supported: QASYMM8 + * @param[in] min (Optional) Min value used to saturate down the output result before converting back to QASYMM8 + * @param[in] max (Optional) Max value used to saturate up the output result before converting back to QASYMM8, + * Along with @p min, this value can be used to implement "rectified linear unit" activation functions + * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min = 0, int max = 0); +}; +} +#endif /*__ARM_COMPUTE_NEGEMMLOWPOUTPUTSTAGE_H__ */
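A minimal usage sketch of the low-precision GEMM pipeline declared in the two headers above: NEGEMMLowpMatrixMultiplyCore produces the int32 accumulators and NEGEMMLowpQuantizeDownInt32ToUint8Scale requantizes them to QASYMM8 with the ((input + result_offset) * result_mult_int) >> result_shift rule documented above. Only the configure() signatures are taken from these headers; the tensor shapes and requantization constants are illustrative assumptions, and real code would also attach quantization info (scale and offset) to the QASYMM8 tensors.

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h"
#include "arm_compute/runtime/NEON/functions/NEGEMMLowpOutputStage.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

void gemmlowp_pipeline_sketch()
{
    // Illustrative sizes: A is 4x16, B is 16x8, so the result is 4x8.
    Tensor a, b, acc, dst;
    a.allocator()->init(TensorInfo(TensorShape(16U, 4U), 1, DataType::QASYMM8));
    b.allocator()->init(TensorInfo(TensorShape(8U, 16U), 1, DataType::QASYMM8));
    acc.allocator()->init(TensorInfo(TensorShape(8U, 4U), 1, DataType::S32));
    dst.allocator()->init(TensorInfo(TensorShape(8U, 4U), 1, DataType::QASYMM8));

    // Core multiplication: QASYMM8 x QASYMM8 -> S32 accumulators.
    NEGEMMLowpMatrixMultiplyCore mm_core;
    mm_core.configure(&a, &b, &acc);

    // Output stage: ((acc + result_offset) * result_mult_int) >> result_shift, clamped to QASYMM8.
    // The three constants are arbitrary example values.
    NEGEMMLowpQuantizeDownInt32ToUint8Scale output_stage;
    output_stage.configure(&acc, nullptr, &dst, /*result_offset=*/-50, /*result_mult_int=*/2, /*result_shift=*/8);

    a.allocator()->allocate();
    b.allocator()->allocate();
    acc.allocator()->allocate();
    dst.allocator()->allocate();

    // ... fill a and b with quantized input data ...

    mm_core.run();
    output_stage.run();
}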
\ No newline at end of file diff --git a/arm_compute/runtime/NEON/functions/NEGaussianPyramid.h b/arm_compute/runtime/NEON/functions/NEGaussianPyramid.h index b4ed56a0c..dbe0ecdf6 100644 --- a/arm_compute/runtime/NEON/functions/NEGaussianPyramid.h +++ b/arm_compute/runtime/NEON/functions/NEGaussianPyramid.h @@ -91,7 +91,8 @@ public: void run() override; private: - std::unique_ptr<NEFillBorderKernel[]> _border_handler; + std::unique_ptr<NEFillBorderKernel[]> _horizontal_border_handler; + std::unique_ptr<NEFillBorderKernel[]> _vertical_border_handler; std::unique_ptr<NEGaussianPyramidHorKernel[]> _horizontal_reduction; std::unique_ptr<NEGaussianPyramidVertKernel[]> _vertical_reduction; }; diff --git a/arm_compute/runtime/NEON/functions/NEIm2Col.h b/arm_compute/runtime/NEON/functions/NEIm2Col.h new file mode 100644 index 000000000..cb08f5cd0 --- /dev/null +++ b/arm_compute/runtime/NEON/functions/NEIm2Col.h @@ -0,0 +1,66 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_NEIM2COL_H__ +#define __ARM_COMPUTE_NEIM2COL_H__ + +#include "arm_compute/runtime/NEON/INESimpleFunction.h" + +#include "arm_compute/core/Size2D.h" +#include "arm_compute/core/Types.h" + +namespace arm_compute +{ +class ITensor; + +/** Basic function to run @ref NEIm2ColKernel */ +class NEIm2Col : public INESimpleFunction +{ +public: + /** Configure the im2col NEON kernel + * + * @param[in] input The input tensor to convert. 3 lower dimensions represent a single input [width, height, IFM], + * while every optional dimension from 4 and above represent a batch of inputs. Data types supported: QS8/QS16/QASYMM8/F16/F32 + * Note: QASYMM8 works only for has_bias = false + * @param[out] output The output tensor. Data types supported: Same as @p input + * @param[in] kernel_dims The kernel dimensions (width and height). + * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. + * @param[in] has_bias In case biases are provided expands the matrix with 1. + */ + void configure(const ITensor *input, ITensor *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias); + /** Static function to check if given info will lead to a valid configuration of @ref NEIm2Col + * + * @param[in] input The input tensor to convert. 
3 lower dimensions represent a single input [width, height, IFM], + * while every optional dimension from 4 and above represent a batch of inputs. Data types supported: QS8/QS16/QASYMM8/F16/F32 + * Note: QASYMM8 works only for has_bias = false + * @param[in] output The output tensor. Data types supported: Same as @p input + * @param[in] kernel_dims The kernel dimensions (width and height). + * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. + * @param[in] has_bias In case biases are provided expands the matrix with 1. + * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias); +}; +} +#endif /* __ARM_COMPUTE_NEIM2COL_H__ */ diff --git a/arm_compute/runtime/NEON/functions/NEIntegralImage.h b/arm_compute/runtime/NEON/functions/NEIntegralImage.h index 6d7dd697e..1ac501c99 100644 --- a/arm_compute/runtime/NEON/functions/NEIntegralImage.h +++ b/arm_compute/runtime/NEON/functions/NEIntegralImage.h @@ -35,10 +35,10 @@ class NEIntegralImage : public INESimpleFunction { public: /** Initialise the function's source, destinations and border mode. - * - * @param[in] input Source tensor. Data type supported: U8. - * @param[out] output Destination tensor. Data type supported: U32. - */ + * + * @param[in] input Source tensor. Data type supported: U8. + * @param[out] output Destination tensor. Data type supported: U32. + */ void configure(const ITensor *input, ITensor *output); }; } diff --git a/arm_compute/runtime/NEON/functions/NEL2Normalize.h b/arm_compute/runtime/NEON/functions/NEL2NormalizeLayer.h index 95d5186c1..100e23940 100644 --- a/arm_compute/runtime/NEON/functions/NEL2Normalize.h +++ b/arm_compute/runtime/NEON/functions/NEL2NormalizeLayer.h @@ -24,7 +24,7 @@ #ifndef __ARM_COMPUTE_NEL2NORMALIZE_H__ #define __ARM_COMPUTE_NEL2NORMALIZE_H__ -#include "arm_compute/core/NEON/kernels/NEL2NormalizeKernel.h" +#include "arm_compute/core/NEON/kernels/NEL2NormalizeLayerKernel.h" #include "arm_compute/runtime/IFunction.h" #include "arm_compute/runtime/IMemoryManager.h" #include "arm_compute/runtime/MemoryGroup.h" @@ -41,13 +41,13 @@ class ITensor; * * This function runs the following kernels: * -# @ref NEReductionOperation - * -# @ref NEL2NormalizeKernel + * -# @ref NEL2NormalizeLayerKernel */ -class NEL2Normalize : public IFunction +class NEL2NormalizeLayer : public IFunction { public: /** Constructor */ - NEL2Normalize(std::shared_ptr<IMemoryManager> memory_manager = nullptr); + NEL2NormalizeLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr); /** Set the input and output tensors. * * @param[in, out] input Source tensor. Data types supported: F32. 
(Written to only for border_size != 0) @@ -61,10 +61,10 @@ public: void run() override; private: - MemoryGroup _memory_group; - NEReductionOperation _reduce_func; - NEL2NormalizeKernel _normalize_kernel; - Tensor _sumsq; + MemoryGroup _memory_group; + NEReductionOperation _reduce_func; + NEL2NormalizeLayerKernel _normalize_kernel; + Tensor _sumsq; }; } #endif /* __ARM_COMPUTE_NEL2NORMALIZE_H__ */ diff --git a/arm_compute/runtime/NEON/functions/NELaplacianPyramid.h b/arm_compute/runtime/NEON/functions/NELaplacianPyramid.h index 991ae7c29..baa4b7b1a 100644 --- a/arm_compute/runtime/NEON/functions/NELaplacianPyramid.h +++ b/arm_compute/runtime/NEON/functions/NELaplacianPyramid.h @@ -27,7 +27,7 @@ #include "arm_compute/core/Types.h" #include "arm_compute/runtime/IFunction.h" #include "arm_compute/runtime/NEON/functions/NEArithmeticSubtraction.h" -#include "arm_compute/runtime/NEON/functions/NEDepthConvert.h" +#include "arm_compute/runtime/NEON/functions/NEDepthConvertLayer.h" #include "arm_compute/runtime/NEON/functions/NEGaussian5x5.h" #include "arm_compute/runtime/NEON/functions/NEGaussianPyramid.h" #include "arm_compute/runtime/Pyramid.h" @@ -79,7 +79,7 @@ private: std::unique_ptr<NEArithmeticSubtraction[]> _subf; Pyramid _gauss_pyr; Pyramid _conv_pyr; - NEDepthConvert _depth_function; + NEDepthConvertLayer _depth_function; }; } #endif /*__ARM_COMPUTE_NELAPLACIANPYRAMID_H__ */ diff --git a/arm_compute/runtime/NEON/functions/NELaplacianReconstruct.h b/arm_compute/runtime/NEON/functions/NELaplacianReconstruct.h index 413973349..3d423607a 100644 --- a/arm_compute/runtime/NEON/functions/NELaplacianReconstruct.h +++ b/arm_compute/runtime/NEON/functions/NELaplacianReconstruct.h @@ -27,7 +27,7 @@ #include "arm_compute/core/Types.h" #include "arm_compute/runtime/IFunction.h" #include "arm_compute/runtime/NEON/functions/NEArithmeticAddition.h" -#include "arm_compute/runtime/NEON/functions/NEDepthConvert.h" +#include "arm_compute/runtime/NEON/functions/NEDepthConvertLayer.h" #include "arm_compute/runtime/NEON/functions/NEScale.h" #include "arm_compute/runtime/Pyramid.h" @@ -43,7 +43,7 @@ using IImage = ITensor; * * -# @ref NEArithmeticAddition * -# @ref NEScale - * -# @ref NEDepthConvert + * -# @ref NEDepthConvertLayer * * This function reconstructs the original image from a Laplacian Image Pyramid. * @@ -85,7 +85,7 @@ private: Pyramid _tmp_pyr; std::unique_ptr<NEArithmeticAddition[]> _addf; std::unique_ptr<NEScale[]> _scalef; - NEDepthConvert _depthf; + NEDepthConvertLayer _depthf; }; } #endif /*__ARM_COMPUTE_NELAPLACIANRECONSTRUCT_H__ */ diff --git a/arm_compute/runtime/NEON/functions/NEMagnitude.h b/arm_compute/runtime/NEON/functions/NEMagnitude.h index 6c1f988ef..5bc3faf66 100644 --- a/arm_compute/runtime/NEON/functions/NEMagnitude.h +++ b/arm_compute/runtime/NEON/functions/NEMagnitude.h @@ -39,9 +39,10 @@ public: * @param[in] input1 First tensor input. Data type supported: S16. * @param[in] input2 Second tensor input. Data type supported: S16. * @param[out] output Output tensor. Data type supported: S16. + * @param[in] mag_type (Optional) Magnitude calculation type. Default: L2NORM. * @param[in] use_fp16 (Optional) If true the FP16 kernels will be used. If false F32 kernels are used. 
*/ - void configure(const ITensor *input1, const ITensor *input2, ITensor *output, bool use_fp16 = false); + void configure(const ITensor *input1, const ITensor *input2, ITensor *output, MagnitudeType mag_type = MagnitudeType::L2NORM, bool use_fp16 = false); }; } #endif /*__ARM_COMPUTE_NEMAGNITUDE_H__ */ diff --git a/arm_compute/runtime/NEON/functions/NENormalizationLayer.h b/arm_compute/runtime/NEON/functions/NENormalizationLayer.h index 1c95c5bc4..4b5ad2870 100644 --- a/arm_compute/runtime/NEON/functions/NENormalizationLayer.h +++ b/arm_compute/runtime/NEON/functions/NENormalizationLayer.h @@ -40,7 +40,7 @@ namespace arm_compute { class ITensor; -/** Basic function to simulate a normalization layer. This function calls the following NEON kernels: +/** Basic function to compute a normalization layer. This function calls the following NEON kernels: * * -# @ref NEPixelWiseMultiplicationKernel * -# @ref NEFillBorderKernel @@ -55,11 +55,21 @@ public: /** Set the input and output tensors. * * @param[in] input Source tensor. 3 lower dims represent a single input with dimensions [width, height, IFM], - * and an optional 4th dimension for batch of inputs. Data type supported: QS8/F16/F32 + * and an optional 4th dimension for batch of inputs. Data type supported: QS8/QS16/F16/F32 * @param[out] output Destination with the same dimensions, data type and number of channels of @p input * @param[in] norm_info Normalization layer information like the normalization type, normalization size and other parameters. */ - void configure(const ITensor *input, ITensor *output, NormalizationLayerInfo norm_info); + void configure(const ITensor *input, ITensor *output, const NormalizationLayerInfo &norm_info); + /** Static function to check if given info will lead to a valid configuration of @ref NENormalizationLayer + * + * @param[in] input Source tensor. 3 lower dims represent a single input with dimensions [width, height, IFM], + * and an optional 4th dimension for batch of inputs. Data type supported: QS8/QS16/F16/F32 + * @param[in] output Destination with the same dimensions, data type and number of channels of @p input + * @param[in] norm_info Normalization layer information like the normalization type, normalization size and other parameters. + * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *output, const NormalizationLayerInfo &norm_info); // Inherited methods overridden: void run() override; diff --git a/arm_compute/runtime/NEON/functions/NEPhase.h b/arm_compute/runtime/NEON/functions/NEPhase.h index 985ba84c4..cd62cf98e 100644 --- a/arm_compute/runtime/NEON/functions/NEPhase.h +++ b/arm_compute/runtime/NEON/functions/NEPhase.h @@ -36,11 +36,12 @@ class NEPhase : public INESimpleFunction public: /** Initialise the kernel's inputs, output. * - * @param[in] input1 First tensor input. Data type supported: S16. - * @param[in] input2 Second tensor input. Data type supported: S16. - * @param[out] output Output tensor. Data type supported: U8. + * @param[in] input1 First tensor input. Data type supported: S16. + * @param[in] input2 Second tensor input. Data type supported: S16. + * @param[out] output Output tensor. Data type supported: U8. + * @param[in] phase_type (Optional) Phase calculation type. Default: SIGNED. 
*/ - void configure(const ITensor *input1, const ITensor *input2, ITensor *output); + void configure(const ITensor *input1, const ITensor *input2, ITensor *output, PhaseType phase_type = PhaseType::SIGNED); }; } #endif /*__ARM_COMPUTE_NEPHASE_H__ */ diff --git a/arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h b/arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h index de7a797cd..7d22500c5 100644 --- a/arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h +++ b/arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h @@ -45,6 +45,18 @@ public: * @param[in] rounding_policy Rounding policy. */ void configure(const ITensor *input1, const ITensor *input2, ITensor *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy); + /** Static function to check if given info will lead to a valid configuration of @ref NEPixelWiseMultiplication + * + * @param[in] input1 First tensor info input. Data types supported: U8/QS8/S16/F32. + * @param[in] input2 Second tensor info input. Data types supported: U8/QS8/S16/F32. + * @param[in] output Output tensor info. Data types supported: U8/QS8/S16/F32. + * @param[in] scale Scale to apply after multiplication. Must be positive. + * @param[in] overflow_policy Overflow policy. + * @param[in] rounding_policy Rounding policy. + * + * @return a status + */ + static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy); }; } #endif /*__ARM_COMPUTE_NEPIXELWISEMULTIPLICATION_H__ */ diff --git a/arm_compute/runtime/NEON/functions/NEPoolingLayer.h b/arm_compute/runtime/NEON/functions/NEPoolingLayer.h index 7b038aaa5..0f8abb587 100644 --- a/arm_compute/runtime/NEON/functions/NEPoolingLayer.h +++ b/arm_compute/runtime/NEON/functions/NEPoolingLayer.h @@ -53,6 +53,17 @@ public: * @param[in] pool_info Contains pooling operation information described in @ref PoolingLayerInfo. */ void configure(ITensor *input, ITensor *output, const PoolingLayerInfo &pool_info); + /** Static function to check if given info will lead to a valid configuration of @ref NEPoolingLayer + * + * @note QS8, QS16 and F16 are supported for pool sizes 2 and 3 only + * + * @param[in] input Source tensor. (Written to only when padding != 0) Data types supported: QS8/QS16/F16/F32. + * @param[in] output Destination tensor. Data types supported: Same as @p input. + * @param[in] pool_info Contains pooling operation information described in @ref PoolingLayerInfo. + * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info); // Inherited methods overridden: void run() override; diff --git a/arm_compute/runtime/NEON/functions/NEROIPoolingLayer.h b/arm_compute/runtime/NEON/functions/NEROIPoolingLayer.h index 5adc1110d..69a90dd89 100644 --- a/arm_compute/runtime/NEON/functions/NEROIPoolingLayer.h +++ b/arm_compute/runtime/NEON/functions/NEROIPoolingLayer.h @@ -35,7 +35,7 @@ class ITensor; /** Basic function to run @ref NEROIPoolingLayerKernel. 
* - * This function calls the following OpenCL kernels: + * This function calls the following NEON kernels: * -# @ref NEROIPoolingLayerKernel * */ diff --git a/arm_compute/runtime/NEON/functions/NEScale.h b/arm_compute/runtime/NEON/functions/NEScale.h index 7297880a7..1d96db3ff 100644 --- a/arm_compute/runtime/NEON/functions/NEScale.h +++ b/arm_compute/runtime/NEON/functions/NEScale.h @@ -52,8 +52,10 @@ public: * @param[in] policy The interpolation type. * @param[in] border_mode Strategy to use for borders. * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT. + * @param[in] sampling_policy (Optional) Sampling policy used by the interpolation. Defaults to @ref SamplingPolicy::CENTER */ - void configure(ITensor *input, ITensor *output, InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value = PixelValue()); + void configure(ITensor *input, ITensor *output, InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value = PixelValue(), + SamplingPolicy sampling_policy = SamplingPolicy::CENTER); // Inherited methods overridden: void run() override; diff --git a/arm_compute/runtime/NEON/functions/NESoftmaxLayer.h b/arm_compute/runtime/NEON/functions/NESoftmaxLayer.h index a265f7004..5043f79c2 100644 --- a/arm_compute/runtime/NEON/functions/NESoftmaxLayer.h +++ b/arm_compute/runtime/NEON/functions/NESoftmaxLayer.h @@ -53,8 +53,18 @@ public: * * @param[in] input Source tensor. Data types supported: QS8/QS16/F16/F32. * @param[out] output Destination tensor. Data types supported: same as @p input. + * @param[in] beta (Optional) A scaling factor for the exponent. QS8/QS16 only support a beta value of 1. */ - void configure(ITensor *input, ITensor *output); + void configure(ITensor *input, ITensor *output, float beta = 1.0f); + /** Static function to check if given info will lead to a valid configuration of @ref NESoftmaxLayer + * + * @param[in] input Source tensor. Data types supported: QS8/QS16/F16/F32 + * @param[in] output Destination tensor. Data types supported: same as @p input + * @param[in] beta (Optional) A scaling factor for the exponent. QS8/QS16 only support a beta value of 1. + * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *output, float beta = 1.0f); // Inherited methods overridden: void run() override; diff --git a/arm_compute/runtime/NEON/functions/NETranspose.h b/arm_compute/runtime/NEON/functions/NETranspose.h index 4b606e728..6d1e10708 100644 --- a/arm_compute/runtime/NEON/functions/NETranspose.h +++ b/arm_compute/runtime/NEON/functions/NETranspose.h @@ -41,10 +41,18 @@ class NETranspose : public INESimpleFunction public: /** Initialise the kernel's inputs and output * - * @param[in] input Input tensor. Data types supported: U8/S8/QS8/U16/S16/F16/U32/S32/F32 + * @param[in] input Input tensor. Data types supported: U8/S8/QS8/QASYMM8/U16/S16/F16/U32/S32/F32 * @param[out] output Output tensor. Data type supported: Same as @p input */ void configure(const ITensor *input, ITensor *output); + /** Static function to check if given info will lead to a valid configuration of @ref NETranspose + * + * @param[in] input The input tensor. Data types supported: U8/S8/QS8/QASYMM8/U16/S16/F16/U32/S32/F32 + * @param[in] output The output tensor. 
Data types supported: Same as @p input + * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *output); }; } diff --git a/arm_compute/runtime/NEON/functions/NEWinogradLayer.h b/arm_compute/runtime/NEON/functions/NEWinogradLayer.h new file mode 100644 index 000000000..77707060e --- /dev/null +++ b/arm_compute/runtime/NEON/functions/NEWinogradLayer.h @@ -0,0 +1,82 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_NEWINOGRADLAYER_H__ +#define __ARM_COMPUTE_NEWINOGRADLAYER_H__ + +#include "arm_compute/runtime/IFunction.h" + +#include "arm_compute/core/NEON/kernels/NEWinogradLayerKernel.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/runtime/MemoryGroup.h" +#include "arm_compute/runtime/Tensor.h" + +#include <memory> + +namespace arm_compute +{ +class ITensor; +/** Basic function to simulate a convolution layer. This function calls the following NEON kernels: + */ +class NEWinogradLayer : public IFunction +{ +public: + /** Constructor */ + NEWinogradLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr); + + /** Set the input and output tensors. + * + * @param[in] input Source tensor. 3 lower dimensions represent a single input [width, height, IFM], + * while every optional dimension from 4 and above represent a batch of inputs. + * Data types supported: F32. + * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported: Same as @p input. + * Currently only 3x3 kernels are supported. + * @param[in] biases Not supported, biases will be ignored. + * @param[out] output Destination tensor. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs. + * Data types supported: Same as @p input. + * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. Currently only unit strides are supported. 
+ */ + void configure(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info); + + // Inherited methods overridden: + void run() override; + + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEWinogradLayer(const NEWinogradLayer &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEWinogradLayer &operator=(const NEWinogradLayer &) = delete; + +private: + MemoryGroup _memory_group; + NEWinogradLayerKernel _winograd_kernel; + Tensor _weights_workspace; + Tensor _workspace; + Tensor _kernel_storage; + const ITensor *_input; + const ITensor *_weights; + ITensor *_output; + bool _reshaped_kernel; + std::unique_ptr<Winograd3x3F32> _conv; +}; +} +#endif /* __ARM_COMPUTE_NEWINOGRADLAYER_H__ */ diff --git a/arm_compute/runtime/OffsetLifetimeManager.h b/arm_compute/runtime/OffsetLifetimeManager.h new file mode 100644 index 000000000..e39d6a0d6 --- /dev/null +++ b/arm_compute/runtime/OffsetLifetimeManager.h @@ -0,0 +1,67 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef __ARM_COMPUTE_OFFSETLIFETIMEMANAGER_H__ +#define __ARM_COMPUTE_OFFSETLIFETIMEMANAGER_H__ + +#include "arm_compute/runtime/ISimpleLifetimeManager.h" + +#include "arm_compute/runtime/Types.h" + +#include <cstddef> +#include <map> +#include <vector> + +namespace arm_compute +{ +class IMemoryPool; + +/** Concrete class that tracks the lifetime of registered tensors and + * calculates the system's memory requirements in terms of a single blob and a list of offsets */ +class OffsetLifetimeManager : public ISimpleLifetimeManager +{ +public: + /** Constructor */ + OffsetLifetimeManager(); + /** Prevent instances of this class from being copy constructed */ + OffsetLifetimeManager(const OffsetLifetimeManager &) = delete; + /** Prevent instances of this class from being copied */ + OffsetLifetimeManager &operator=(const OffsetLifetimeManager &) = delete; + /** Allow instances of this class to be move constructed */ + OffsetLifetimeManager(OffsetLifetimeManager &&) = default; + /** Allow instances of this class to be moved */ + OffsetLifetimeManager &operator=(OffsetLifetimeManager &&) = default; + + // Inherited methods overridden: + std::unique_ptr<IMemoryPool> create_pool(IAllocator *allocator) override; + MappingType mapping_type() const override; + +private: + // Inherited methods overridden: + void update_blobs_and_mappings() override; + +private: + size_t _blob; /**< Memory blob size */ +}; +} // namespace arm_compute +#endif /* __ARM_COMPUTE_OFFSETLIFETIMEMANAGER_H__ */ diff --git a/arm_compute/runtime/OffsetMemoryPool.h b/arm_compute/runtime/OffsetMemoryPool.h new file mode 100644 index 000000000..9685fd131 --- /dev/null +++ b/arm_compute/runtime/OffsetMemoryPool.h @@ -0,0 +1,72 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE.
+ */ +#ifndef __ARM_COMPUTE_OFFSETMEMORYPOOL_H__ +#define __ARM_COMPUTE_OFFSETMEMORYPOOL_H__ + +#include "arm_compute/runtime/IMemoryPool.h" + +#include "arm_compute/runtime/Types.h" + +#include <cstddef> + +namespace arm_compute +{ +class IAllocator; + +/** Offset based memory pool */ +class OffsetMemoryPool : public IMemoryPool +{ +public: + /** Default Constructor + * + * @note allocator should outlive the memory pool + * + * @param[in] allocator Backing memory allocator + * @param[in] blob_size Size of the memory to be allocated + */ + OffsetMemoryPool(IAllocator *allocator, size_t blob_size); + /** Default Destructor */ + ~OffsetMemoryPool(); + /** Prevent instances of this class from being copy constructed */ + OffsetMemoryPool(const OffsetMemoryPool &) = delete; + /** Prevent instances of this class from being copy assigned */ + OffsetMemoryPool &operator=(const OffsetMemoryPool &) = delete; + /** Allow instances of this class to be move constructed */ + OffsetMemoryPool(OffsetMemoryPool &&) = default; + /** Allow instances of this class to be move assigned */ + OffsetMemoryPool &operator=(OffsetMemoryPool &&) = default; + + // Inherited methods overridden: + void acquire(MemoryMappings &handles) override; + void release(MemoryMappings &handles) override; + MappingType mapping_type() const override; + std::unique_ptr<IMemoryPool> duplicate() override; + +private: + IAllocator *_allocator; /**< Allocator to use for internal allocation */ + void *_blob; /**< Memory blob */ + size_t _blob_size; /**< Size of the allocated memory blob */ +}; +} // namespace arm_compute +#endif /* __ARM_COMPUTE_OFFSETMEMORYPOOL_H__ */ diff --git a/arm_compute/runtime/TensorAllocator.h b/arm_compute/runtime/TensorAllocator.h index 40704c0a1..9af100c12 100644 --- a/arm_compute/runtime/TensorAllocator.h +++ b/arm_compute/runtime/TensorAllocator.h @@ -25,6 +25,7 @@ #define __ARM_COMPUTE_TENSORALLOCATOR_H__ #include "arm_compute/runtime/ITensorAllocator.h" +#include "arm_compute/runtime/Memory.h" #include <cstdint> #include <memory> @@ -86,6 +87,19 @@ public: * */ void free() override; + /** Import an existing memory as a tensor's backing memory + * + * @warning If the tensor is flagged to be managed by a memory manager, + * this call will lead to an error. + * @warning Ownership of memory depends on the way the @ref Memory object was constructed + * @note Calling free on a tensor with imported memory will just clear + * the internal pointer value. + * + * @param[in] memory Memory to import + * + * @return error status + */ + arm_compute::Status import_memory(Memory memory); /** Associates the tensor with a memory group + * + * @param[in] associated_memory_group Memory group to associate the tensor with @@ -104,7 +118,7 @@ protected: private: MemoryGroup *_associated_memory_group; /**< Registered memory manager */ - uint8_t *_buffer; /**< CPU memory allocation. */ + Memory _memory; /**< CPU memory */ Tensor *_owner; /**< Owner of the allocator */ }; }
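The new TensorAllocator::import_memory() entry point at the end of this diff can be exercised roughly as in the sketch below. Only the import_memory(Memory) declaration comes from this diff; in particular the Memory constructor taking a raw pointer and a size is an assumption about the new Memory.h header, and the helper function name is hypothetical. The external buffer must outlive the tensor, since free() on imported memory only clears the internal pointer.

#include "arm_compute/core/Error.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/Memory.h"
#include "arm_compute/runtime/Tensor.h"

#include <cstdint>
#include <vector>

using namespace arm_compute;

void import_memory_sketch()
{
    // Externally owned buffer; it must stay alive for as long as the tensor uses it.
    std::vector<uint8_t> external_buffer(64 * 64 * sizeof(float));

    Tensor tensor;
    tensor.allocator()->init(TensorInfo(TensorShape(64U, 64U), 1, DataType::F32));

    // Wrap the raw pointer in a Memory object (assumed constructor: pointer + size)
    // and hand it to the allocator instead of calling allocate().
    const Status status = tensor.allocator()->import_memory(Memory(external_buffer.data(), external_buffer.size()));
    if(status.error_code() != ErrorCode::OK)
    {
        // Importing fails, for example, when the tensor is managed by a memory manager.
        return;
    }

    // ... use the tensor with NEON functions ...

    // free() only clears the internal pointer; the external buffer is untouched.
    tensor.allocator()->free();
}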