diff options
Diffstat (limited to 'libs')
142 files changed, 9842 insertions, 6896 deletions
diff --git a/libs/.FORMATCHECKED b/libs/.FORMATCHECKED new file mode 100644 index 000000000..e69de29bb --- /dev/null +++ b/libs/.FORMATCHECKED diff --git a/libs/ARMComputeEx/CMakeLists.txt b/libs/ARMComputeEx/CMakeLists.txt new file mode 100644 index 000000000..2483fb55d --- /dev/null +++ b/libs/ARMComputeEx/CMakeLists.txt @@ -0,0 +1,21 @@ +if("${TARGET_ARCH}" STREQUAL "x86_64") + return() +endif() + +nnfw_find_package(ARMCompute REQUIRED) + +set(ACL_EX_BASE ${CMAKE_SOURCE_DIR}/libs/ARMComputeEx) + +file(GLOB_RECURSE ACL_EX_SRCS "${ACL_EX_BASE}/*.cpp") + +# generate embeded cl_kernel +execute_process ( + WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}/libs/ARMComputeEx" + COMMAND bash -c "python resolve_includes.py" +) + +add_library(arm_compute_ex SHARED ${ACL_EX_SRCS}) +set_target_properties(arm_compute_ex PROPERTIES COMPILE_FLAGS "-DEMBEDDED_KERNELS=1") +target_include_directories(arm_compute_ex PUBLIC ${CMAKE_SOURCE_DIR}/libs/ARMComputeEx) +target_link_libraries(arm_compute_ex arm_compute_core) +install(TARGETS arm_compute_ex DESTINATION lib) diff --git a/libs/ARMComputeEx/arm_compute/core/CL/CLKernelLibraryEx.h b/libs/ARMComputeEx/arm_compute/core/CL/CLKernelLibraryEx.h new file mode 100644 index 000000000..026487077 --- /dev/null +++ b/libs/ARMComputeEx/arm_compute/core/CL/CLKernelLibraryEx.h @@ -0,0 +1,189 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef __ARM_COMPUTE_CLKERNELLIBRARY_EX_H__ +#define __ARM_COMPUTE_CLKERNELLIBRARY_EX_H__ + +#include "arm_compute/core/CL/OpenCL.h" + +#include <map> +#include <set> +#include <string> +#include <utility> + +namespace arm_compute +{ + +/** CLKernelLibrary class */ +class CLKernelLibraryEx +{ + using StringSet = std::set<std::string>; + +private: + /** Default Constructor. */ + CLKernelLibraryEx(); + +public: + /** Prevent instances of this class from being copied */ + CLKernelLibraryEx(const CLKernelLibraryEx &) = delete; + /** Prevent instances of this class from being copied */ + const CLKernelLibraryEx &operator=(const CLKernelLibraryEx &) = delete; + /** Access the KernelLibrary singleton. + * @return The KernelLibrary instance. + */ + static CLKernelLibraryEx &get(); + /** Initialises the kernel library. + * + * @param[in] kernel_path (Optional) Path of the directory from which kernel sources are loaded. + * @param[in] context (Optional) CL context used to create programs. + * @param[in] device (Optional) CL device for which the programs are created. + */ + void init(std::string kernel_path = ".", cl::Context context = cl::Context::getDefault(), + cl::Device device = cl::Device::getDefault()) + { + _kernel_path = std::move(kernel_path); + _context = std::move(context); + _device = std::move(device); + } + /** Sets the path that the kernels reside in. + * + * @param[in] kernel_path Path of the kernel. + */ + void set_kernel_path(const std::string &kernel_path) { _kernel_path = kernel_path; }; + /** Gets the path that the kernels reside in. + */ + std::string get_kernel_path() { return _kernel_path; }; + /** Gets the source of the selected program. + * + * @param[in] program_name Program name. + * + * @return Source of the selected program. 
+ */ + std::string get_program_source(const std::string &program_name); + /** Sets the CL context used to create programs. + * + * @note Setting the context also resets the device to the + * first one available in the new context. + * + * @param[in] context A CL context. + */ + void set_context(cl::Context context) + { + _context = std::move(context); + if (_context.get() == nullptr) + { + _device = cl::Device(); + } + else + { + const auto cl_devices = _context.getInfo<CL_CONTEXT_DEVICES>(); + + if (cl_devices.empty()) + { + _device = cl::Device(); + } + else + { + _device = cl_devices[0]; + } + } + } + + /** Accessor for the associated CL context. + * + * @return A CL context. + */ + cl::Context &context() { return _context; } + + /** Sets the CL device for which the programs are created. + * + * @param[in] device A CL device. + */ + void set_device(cl::Device device) { _device = std::move(device); } + + /** Return the device version + * + * @return The content of CL_DEVICE_VERSION + */ + std::string get_device_version(); + /** Creates a kernel from the kernel library. + * + * @param[in] kernel_name Kernel name. + * @param[in] build_options_set Kernel build options as a set. + * + * @return The created kernel. + */ + Kernel create_kernel(const std::string &kernel_name, + const StringSet &build_options_set = {}) const; + /** Find the maximum number of local work items in a workgroup can be supported for the kernel. + * + */ + size_t max_local_workgroup_size(const cl::Kernel &kernel) const; + /** Return the default NDRange for the device. 
+ * + */ + cl::NDRange default_ndrange() const; + + /** Clear the library's cache of binary programs + */ + void clear_programs_cache() + { + _programs_map.clear(); + _built_programs_map.clear(); + } + + /** Access the cache of built OpenCL programs */ + const std::map<std::string, cl::Program> &get_built_programs() const + { + return _built_programs_map; + } + + /** Add a new built program to the cache + * + * @param[in] built_program_name Name of the program + * @param[in] program Built program to add to the cache + */ + void add_built_program(const std::string &built_program_name, cl::Program program); + +private: + /** Load program and its dependencies. + * + * @param[in] program_name Name of the program to load. + */ + const Program &load_program(const std::string &program_name) const; + /** Concatenates contents of a set into a single string. + * + * @param[in] s Input set to concatenate. + * + * @return Concatenated string. + */ + std::string stringify_set(const StringSet &s) const; + + cl::Context _context; /**< Underlying CL context. */ + cl::Device _device; /**< Underlying CL device. */ + std::string _kernel_path; /**< Path to the kernels folder. */ + mutable std::map<std::string, const Program> + _programs_map; /**< Map with all already loaded program data. */ + mutable std::map<std::string, cl::Program> + _built_programs_map; /**< Map with all already built program data. */ + static const std::map<std::string, std::string> + _kernel_program_map; /**< Map that associates kernel names with programs. */ + static const std::map<std::string, std::string> + _program_source_map; /**< Contains sources for all programs. + Used for compile-time kernel inclusion. 
>*/ +}; +} +#endif /* __ARM_COMPUTE_CLKERNELLIBRARY_EX_H__ */ diff --git a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLCastKernel.h b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLCastKernel.h new file mode 100644 index 000000000..6bd33bf8f --- /dev/null +++ b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLCastKernel.h @@ -0,0 +1,57 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef __ARM_COMPUTE_CLCASTKERNEL_H__ +#define __ARM_COMPUTE_CLCASTKERNEL_H__ + +#include "arm_compute/core/CL/ICLKernel.h" + +namespace arm_compute +{ +class ICLTensor; + +/** OpenCL kernel to perform a cast operation */ +class CLCastKernel : public ICLKernel +{ +public: + /** Default constructor */ + CLCastKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLCastKernel(const CLCastKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLCastKernel &operator=(const CLCastKernel &) = delete; + /** Allow instances of this class to be moved */ + CLCastKernel(CLCastKernel &&) = default; + /** Allow instances of this class to be moved */ + CLCastKernel &operator=(CLCastKernel &&) = default; + /** Default destructor */ + ~CLCastKernel() = default; + /** Initialise the kernel's input and output. + * + * @param[in] input Input tensor. 
Data types supported: U8/QASYMM8/S16/S32/F16/F32. + * @param[in] output Output tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32. + */ + void configure(const ICLTensor *input, ICLTensor *output); + + // Inherited methods overridden: + void run(const Window &window, cl::CommandQueue &queue) override; + +private: + const ICLTensor *_input; /**< Source tensor */ + ICLTensor *_output; /**< Destination tensor */ +}; +} // namespace arm_compute +#endif /* __ARM_COMPUTE_CLCASTKERNEL_H__ */ diff --git a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLGatherKernel.h b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLGatherKernel.h new file mode 100644 index 000000000..a51441aca --- /dev/null +++ b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLGatherKernel.h @@ -0,0 +1,71 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef __ARM_COMPUTE_CLGATHERKERNEL_H__ +#define __ARM_COMPUTE_CLGATHERKERNEL_H__ + +#include "arm_compute/core/CL/ICLKernel.h" +#include "arm_compute/core/Types.h" + +namespace arm_compute +{ +class ICLTensor; + +/** Interface for the gather kernel. + * + */ +class CLGatherKernel : public ICLKernel +{ +public: + /** Default constructor.*/ + CLGatherKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers). 
*/ + CLGatherKernel(const CLGatherKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers). */ + CLGatherKernel &operator=(const CLGatherKernel &) = delete; + /** Allow instances of this class to be moved */ + CLGatherKernel(CLGatherKernel &&) = default; + /** Allow instances of this class to be moved */ + CLGatherKernel &operator=(CLGatherKernel &&) = default; + /** Initialise the kernel's input, output and border mode. + * + * @param[in] input1 An input tensor. Data types supported: U8/S32/F32. + * @param[in] input2 An input tensor. Data types supported: S32. + * @param[out] output The output tensor, Data types supported: same as @p input1. + */ + void configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output); + /** Static function to check if given info will lead to a valid configuration of @ref + * CLGatherKernel + * + * @param[in] input1 An input tensor. Data types supported: U8/S32/F32. + * @param[in] input2 An input tensor. Data types supported: S32. + * @param[out] output The output tensor, Data types supported: same as @p input1. + * + * @return a status + */ + static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, + const ITensorInfo *output); + + // Inherited methods overridden: + void run(const Window &window, cl::CommandQueue &queue) override; + +private: + const ICLTensor *_input1; + const ICLTensor *_input2; + ICLTensor *_output; +}; +} // namespace arm_compute +#endif /*__ARM_COMPUTE_CLGATHERKERNEL_H__ */ diff --git a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLPixelWiseDivisionKernel.h b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLPixelWiseDivisionKernel.h new file mode 100644 index 000000000..cd2b255bc --- /dev/null +++ b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLPixelWiseDivisionKernel.h @@ -0,0 +1,87 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef __ARM_COMPUTE_CLPIXELWISEDIVISIONKERNEL_H__ +#define __ARM_COMPUTE_CLPIXELWISEDIVISIONKERNEL_H__ + +#include "arm_compute/core/CL/ICLKernel.h" +#include "arm_compute/core/Types.h" + +namespace arm_compute +{ +class ICLTensor; + +/** Interface for the pixelwise division kernel. + * + */ +class CLPixelWiseDivisionKernel : public ICLKernel +{ +public: + /** Default constructor.*/ + CLPixelWiseDivisionKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers). */ + CLPixelWiseDivisionKernel(const CLPixelWiseDivisionKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers). */ + CLPixelWiseDivisionKernel &operator=(const CLPixelWiseDivisionKernel &) = delete; + /** Allow instances of this class to be moved */ + CLPixelWiseDivisionKernel(CLPixelWiseDivisionKernel &&) = default; + /** Allow instances of this class to be moved */ + CLPixelWiseDivisionKernel &operator=(CLPixelWiseDivisionKernel &&) = default; + /** Initialise the kernel's input, output and border mode. + * + * @param[in] input1 An input tensor. Data types supported: U8/QS8/QS16/S16/F16/F32. + * @param[in] input2 An input tensor. Data types supported: same as @p input1. + * @param[out] output The output tensor, Data types supported: same as @p input1. Note: + * U8 (QS8, QS16) requires both inputs to be U8 (QS8, QS16). 
+ * @param[in] scale Scale to apply after division. + * Scale must be positive and its value must be either 1/255 or 1/2^n + * where n is between 0 and 15. For QS8 and QS16 scale must be 1. + * @param[in] overflow_policy Overflow policy. Supported overflow policies: Wrap, Saturate + * @param[in] rounding_policy Rounding policy. Supported rounding modes: to zero, to nearest + * even. + */ + void configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, float scale, + ConvertPolicy overflow_policy, RoundingPolicy rounding_policy); + /** Static function to check if given info will lead to a valid configuration of @ref + * CLPixelWiseDivisionKernel + * + * @param[in] input1 An input tensor info. Data types supported: U8/QS8/QS16/S16/F16/F32. + * @param[in] input2 An input tensor info. Data types supported: same as @p input1. + * @param[in] output The output tensor info, Data types supported: same as @p input1. + * Note: U8 (QS8, QS16) requires both inputs to be U8 (QS8, QS16). + * @param[in] scale Scale to apply after division. + * Scale must be positive and its value must be either 1/255 or 1/2^n + * where n is between 0 and 15. For QS8 and QS16 scale must be 1. + * @param[in] overflow_policy Overflow policy. Supported overflow policies: Wrap, Saturate + * @param[in] rounding_policy Rounding policy. Supported rounding modes: to zero, to nearest even. 
+ * + * @return a status + */ + static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, + const ITensorInfo *output, float scale, ConvertPolicy overflow_policy, + RoundingPolicy rounding_policy); + + // Inherited methods overridden: + void run(const Window &window, cl::CommandQueue &queue) override; + BorderSize border_size() const override; + +private: + const ICLTensor *_input1; + const ICLTensor *_input2; + ICLTensor *_output; +}; +} // namespace arm_compute +#endif /*__ARM_COMPUTE_CLPIXELWISEDIVISIONKERNEL_H__ */ diff --git a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLReduceMaxKernel.h b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLReduceMaxKernel.h new file mode 100644 index 000000000..a7d96cc5c --- /dev/null +++ b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLReduceMaxKernel.h @@ -0,0 +1,73 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef __ARM_COMPUTE_CLREDUCEMAXKERNEL_H__ +#define __ARM_COMPUTE_CLREDUCEMAXKERNEL_H__ + +#include "arm_compute/core/CL/ICLKernel.h" +#include "arm_compute/core/Types.h" + +namespace arm_compute +{ +class ICLTensor; + +/** Interface for the pixelwise division kernel. 
+ * + */ +class CLReduceMaxKernel : public ICLKernel +{ +public: + /** Default constructor.*/ + CLReduceMaxKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers). */ + CLReduceMaxKernel(const CLReduceMaxKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers). */ + CLReduceMaxKernel &operator=(const CLReduceMaxKernel &) = delete; + /** Allow instances of this class to be moved */ + CLReduceMaxKernel(CLReduceMaxKernel &&) = default; + /** Allow instances of this class to be moved */ + CLReduceMaxKernel &operator=(CLReduceMaxKernel &&) = default; + /** Initialise the kernel's input, output and border mode. + * + * @param[in] input An input tensor. Data types supported: U8/QS8/QS16/S16/F16/F32. + * @param[in] axis Axis to reduce + * @param[out] output The output tensor, Data types supported: same as @p input1. Note: + * U8 (QS8, QS16) requires both inputs to be U8 (QS8, QS16). + */ + void configure(const ICLTensor *input, int32_t axis, ICLTensor *output); + /** Static function to check if given info will lead to a valid configuration of @ref + * CLReduceMaxKernel + * + * @param[in] input An input tensor info. Data types supported: U8/QS8/QS16/S16/F16/F32. + * @param[in] axis Axis to reduce + * @param[in] output The output tensor info, Data types supported: same as @p input1. + * Note: U8 (QS8, QS16) requires both inputs to be U8 (QS8, QS16). 
+ * + * @return a status + */ + static Status validate(const ITensorInfo *input, int32_t axis, const ITensorInfo *output); + + // Inherited methods overridden: + void run(const Window &window, cl::CommandQueue &queue) override; + void run_on_cpu(cl::CommandQueue &queue); + +private: + const ICLTensor *_input; + ICLTensor *_output; + int32_t _axis; +}; +} // namespace arm_compute +#endif /*__ARM_COMPUTE_CLREDUCEMAXKERNEL_H__ */ diff --git a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLReductionMeanKernel.h b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLReductionMeanKernel.h new file mode 100644 index 000000000..de9df3381 --- /dev/null +++ b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLReductionMeanKernel.h @@ -0,0 +1,78 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#ifndef __ARM_COMPUTE_CLREDUCTIONMEANKERNEL_H__ +#define __ARM_COMPUTE_CLREDUCTIONMEANKERNEL_H__ + +#include "arm_compute/core/CL/ICLKernel.h" +#include "arm_compute/core/Types.h" + +namespace arm_compute +{ +class ICLTensor; + +/** Interface for the reduction operation kernel */ +class CLReductionMeanKernel : public ICLKernel +{ +public: + /** Default constructor */ + CLReductionMeanKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLReductionMeanKernel(const CLReductionMeanKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLReductionMeanKernel &operator=(const CLReductionMeanKernel &) = delete; + /** Allow instances of this class to be moved */ + CLReductionMeanKernel(CLReductionMeanKernel &&) = default; + /** Allow instances of this class to be moved */ + CLReductionMeanKernel &operator=(CLReductionMeanKernel &&) = default; + /** Default destructor */ + ~CLReductionMeanKernel() = default; + + /** Set the input and output tensors. + * + * @param[in] input Source tensor. Data types supported: F32. Data layouts supported: NCHW. + * @param[out] output Destination tensor. Data types and data layouts supported: Same as @p input. + * Output will have the same number of dimensions as input. + * @param[in] axis Axis along which to reduce. Supported reduction axis : 0, 1 + */ + void configure(const ICLTensor *input, ICLTensor *output, std::vector<uint32_t> axis); + + /** Static function to check if given info will lead to a valid configuration of @ref + * CLReductionMeanKernel. + * + * @param[in] input Source tensor info. Data types supported: F32. Data layouts supported: NCHW. + * @param[in] output Destination tensor info. Data types and data layouts supported: Same as @p + * input. + * Output will have the same number of dimensions as input. + * @param[in] axis Axis along which to reduce. 
Supported reduction axis : 0, 1 + * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *output, + std::vector<uint32_t> axis); + + // Inherited methods overridden: + void run(const Window &window, cl::CommandQueue &queue) override; + BorderSize border_size() const override; + +private: + const ICLTensor *_input; + ICLTensor *_output; + std::vector<uint32_t> _reduction_axis; + BorderSize _border_size; +}; +} // namespace arm_compute +#endif /*__ARM_COMPUTE_CLREDUCTIONMEANKERNEL_H__ */ diff --git a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLStridedSliceKernel.h b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLStridedSliceKernel.h new file mode 100644 index 000000000..248ae6635 --- /dev/null +++ b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLStridedSliceKernel.h @@ -0,0 +1,106 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#ifndef __ARM_COMPUTE_CLSTRIDEDSLICEKERNEL_H__ +#define __ARM_COMPUTE_CLSTRIDEDSLICEKERNEL_H__ + +#include "arm_compute/core/CL/ICLKernel.h" +#include "arm_compute/core/Types.h" + +namespace arm_compute +{ +class ICLTensor; + +/** Interface for the kernel to extract a strided slice of a tensor */ +class CLStridedSliceKernel : public ICLKernel +{ +public: + /** Default constructor */ + CLStridedSliceKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLStridedSliceKernel(const CLStridedSliceKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLStridedSliceKernel &operator=(const CLStridedSliceKernel &) = delete; + /** Allow instances of this class to be moved */ + CLStridedSliceKernel(CLStridedSliceKernel &&) = default; + /** Allow instances of this class to be moved */ + CLStridedSliceKernel &operator=(CLStridedSliceKernel &&) = default; + /** Default destructor */ + ~CLStridedSliceKernel() = default; + /** Set the input and output of the kernel + * + * @param[in] input Source tensor. Data type supported: + * U8/S8/QS8/QASYMM8/U16/S16/QS16/U32/S32/F16/F32 + * @param[out] output Destination tensor. Data type supported: Same as @p input + * @param[in] beginData The begin tensor. Data types supported: S32. + * The number of dimensions must be 1. + * The length must be the same as the number of dimensions of input. + * @param[in] endData The end tensor. Data types supported: S32. + * The number of dimensions must be 1. + * The length must be the same as the number of dimensions of input. + * @param[in] strideData The stride tensor. Data types supported: S32. + * The number of dimensions must be 1. + * The length must be the same as the number of dimensions of input. + * @param[in] beginMask Mask for begin + * @param[in] endMask Mask for end + * @param[in] shrinkAxisMask Mask for shrink axis. 
+ * + */ + void configure(const ICLTensor *input, ICLTensor *output, ICLTensor *beginData, + ICLTensor *endData, ICLTensor *stridesData, int32_t beginMask, int32_t endMask, + int32_t shrinkAxisMask); + + /** Static function to check if given info will lead to a valid configuration of @ref + * CLStridedSliceKernel + * + * @param[in] input The input tensor info. Data types supported: + * U8/S8/QS8/QASYMM8/U16/S16/QS16/U32/S32/F16/F32 + * @param[in] output The output tensor info, Data types supported: same as @p input1. + * @param[in] begin The begin tensor info. Data types supported: S32. + * The number of dimensions must be 1. + * The length must be the same as the number of dimensions of input. + * @param[in] end The end tensor info. Data types supported: S32. + * The number of dimensions must be 1. + * The length must be the same as the number of dimensions of input. + * @param[in] stride The stride tensor info. Data types supported: S32. + * The number of dimensions must be 1. + * The length must be the same as the number of dimensions of input. + * @param[in] beginMask Mask for begin + * @param[in] endMask Mask for end + * @param[in] shrinkAxisMask Mask for shrink axis. 
+ * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *output, + const ITensorInfo *begin, const ITensorInfo *end, + const ITensorInfo *stride, int32_t beginMask, int32_t endMask, + int32_t shrinkAxisMask); + + // Inherited methods overridden: + void run(const Window &window, cl::CommandQueue &queue) override; + +private: + const ICLTensor *_input; /** Source tensor */ + ICLTensor *_output; /** Destination tensor */ + ICLTensor *_beginData; /** Start indices of input tensor */ + ICLTensor *_endData; /** Stop indices of input tensor */ + ICLTensor *_stridesData; /** Strides tensor */ + int32_t _beginMask; /** Begin mask */ + int32_t _endMask; /** End mask */ + int32_t _shrinkAxisMask; /** Shrink axis mask */ +}; +} // namespace arm_compute +#endif /*__ARM_COMPUTE_CLSTRIDEDSLICEKERNEL_H__ */ diff --git a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLTopKV2Kernel.h b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLTopKV2Kernel.h new file mode 100644 index 000000000..5c567f38e --- /dev/null +++ b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLTopKV2Kernel.h @@ -0,0 +1,301 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#ifndef __ARM_COMPUTE_CLTOPKV2KERNEL_H__ +#define __ARM_COMPUTE_CLTOPKV2KERNEL_H__ + +#include "arm_compute/core/CL/ICLArray.h" +#include "arm_compute/core/CL/ICLKernel.h" + +#include <array> + +// these parameters can be changed +#define _ITEMS 16 // number of items in a group +#define _GROUPS 4 // the number of virtual processors is _ITEMS * _GROUPS +#define _HISTOSPLIT (_ITEMS * _GROUPS / 2) // number of splits of the histogram +#define PERMUT // store the final permutation +//////////////////////////////////////////////////////// + +namespace arm_compute +{ +class ICLTensor; + +class CLTopKV2Single : public ICLKernel +{ +public: + /** Constructor */ + CLTopKV2Single(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLTopKV2Single(const CLTopKV2Single &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLTopKV2Single &operator=(const CLTopKV2Single &) = delete; + /** Allow instances of this class to be moved */ + CLTopKV2Single(CLTopKV2Single &&) = default; + /** Allow instances of this class to be moved */ + CLTopKV2Single &operator=(CLTopKV2Single &&) = default; + + void configure(ICLTensor *input, ICLTensor *topk_values, ICLTensor *topk_indices, + cl::Buffer *indices, cl::Buffer *temp_stack, int k, int n); + + // Inherited methods overridden: + void run(const Window &window, cl::CommandQueue &queue) override; + +private: + ICLTensor *_input; + ICLTensor *_topk_values; + ICLTensor *_topk_indices; +}; + +class CLTopKV2Init : public ICLKernel +{ +public: + /** Constructor */ + CLTopKV2Init(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLTopKV2Init(const CLTopKV2Init &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLTopKV2Init &operator=(const CLTopKV2Init &) = delete; + /** Allow instances of this class to be moved */ + 
CLTopKV2Init(CLTopKV2Init &&) = default; + /** Allow instances of this class to be moved */ + CLTopKV2Init &operator=(CLTopKV2Init &&) = default; + + void configure(ICLTensor *input, cl::Buffer *in_key_buf, cl::Buffer *in_ind_buf, int n); + + // Inherited methods overridden: + void run(const Window &window, cl::CommandQueue &queue) override; + +private: + ICLTensor *_input; +}; + +class CLRadixSortHistogram : public ICLKernel +{ +public: + /** Constructor */ + CLRadixSortHistogram(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLRadixSortHistogram(const CLRadixSortHistogram &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLRadixSortHistogram &operator=(const CLRadixSortHistogram &) = delete; + /** Allow instances of this class to be moved */ + CLRadixSortHistogram(CLRadixSortHistogram &&) = default; + /** Allow instances of this class to be moved */ + CLRadixSortHistogram &operator=(CLRadixSortHistogram &&) = default; + + void configure(cl::Buffer *hist_buf, int bits, int n); + + void setPass(int pass, cl::Buffer *in_key_buf) + { + _pass = pass; + _in_key_buf = in_key_buf; + } + + // Inherited methods overridden: + void run(const Window &window, cl::CommandQueue &queue) override; + +private: + int _pass; + cl::Buffer *_in_key_buf; +}; + +class CLRadixSortScanHistogram : public ICLKernel +{ +public: + /** Constructor */ + CLRadixSortScanHistogram(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLRadixSortScanHistogram(const CLRadixSortScanHistogram &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLRadixSortScanHistogram &operator=(const CLRadixSortScanHistogram &) = delete; + /** Allow instances of this class to be moved */ + CLRadixSortScanHistogram(CLRadixSortScanHistogram &&) = default; + /** Allow instances of this class to be moved */ + 
CLRadixSortScanHistogram &operator=(CLRadixSortScanHistogram &&) = default; + + void configure(cl::Buffer *hist_buf, cl::Buffer *glob_sum_buf, int bits); + + // Inherited methods overridden: + void run(const Window &window, cl::CommandQueue &queue) override; +}; + +class CLRadixSortGlobalScanHistogram : public ICLKernel +{ +public: + /** Constructor */ + CLRadixSortGlobalScanHistogram(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLRadixSortGlobalScanHistogram(const CLRadixSortGlobalScanHistogram &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLRadixSortGlobalScanHistogram &operator=(const CLRadixSortGlobalScanHistogram &) = delete; + /** Allow instances of this class to be moved */ + CLRadixSortGlobalScanHistogram(CLRadixSortGlobalScanHistogram &&) = default; + /** Allow instances of this class to be moved */ + CLRadixSortGlobalScanHistogram &operator=(CLRadixSortGlobalScanHistogram &&) = default; + + void configure(cl::Buffer *glob_sum_buf, cl::Buffer *temp_buf, int bits); + + // Inherited methods overridden: + void run(const Window &window, cl::CommandQueue &queue) override; +}; + +class CLRadixSortPasteHistogram : public ICLKernel +{ +public: + /** Constructor */ + CLRadixSortPasteHistogram(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLRadixSortPasteHistogram(const CLRadixSortPasteHistogram &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLRadixSortPasteHistogram &operator=(const CLRadixSortPasteHistogram &) = delete; + /** Allow instances of this class to be moved */ + CLRadixSortPasteHistogram(CLRadixSortPasteHistogram &&) = default; + /** Allow instances of this class to be moved */ + CLRadixSortPasteHistogram &operator=(CLRadixSortPasteHistogram &&) = default; + + void configure(cl::Buffer *hist_buf, cl::Buffer *glob_sum_buf, int 
bits); + + // Inherited methods overridden: + void run(const Window &window, cl::CommandQueue &queue) override; +}; + +class CLRadixSortReorder : public ICLKernel +{ +public: + /** Constructor */ + CLRadixSortReorder(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLRadixSortReorder(const CLRadixSortReorder &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLRadixSortReorder &operator=(const CLRadixSortReorder &) = delete; + /** Allow instances of this class to be moved */ + CLRadixSortReorder(CLRadixSortReorder &&) = default; + /** Allow instances of this class to be moved */ + CLRadixSortReorder &operator=(CLRadixSortReorder &&) = default; + + void configure(cl::Buffer *hist_buf, int bits, int n); + + void setPass(int pass, cl::Buffer *in_key_buf, cl::Buffer *out_key_buf, cl::Buffer *in_ind_buf, + cl::Buffer *out_ind_buf) + { + _pass = pass; + _in_key_buf = in_key_buf; + _out_key_buf = out_key_buf; + _in_ind_buf = in_ind_buf; + _out_ind_buf = out_ind_buf; + } + // Inherited methods overridden: + void run(const Window &window, cl::CommandQueue &queue) override; + +private: + int _pass; + cl::Buffer *_in_key_buf; + cl::Buffer *_out_key_buf; + cl::Buffer *_in_ind_buf; + cl::Buffer *_out_ind_buf; +}; + +class CLTopKV2FindFirstNegative : public ICLKernel +{ +public: + /** Constructor */ + CLTopKV2FindFirstNegative(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLTopKV2FindFirstNegative(const CLTopKV2FindFirstNegative &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLTopKV2FindFirstNegative &operator=(const CLTopKV2FindFirstNegative &) = delete; + /** Allow instances of this class to be moved */ + CLTopKV2FindFirstNegative(CLTopKV2FindFirstNegative &&) = default; + /** Allow instances of this class to be moved */ + CLTopKV2FindFirstNegative 
&operator=(CLTopKV2FindFirstNegative &&) = default; + + void configure(cl::Buffer *first_negative_idx_buf, int n); + + void setOutputBuffer(cl::Buffer *out_key_buf) { _out_key_buf = out_key_buf; } + + // Inherited methods overridden: + void run(const Window &window, cl::CommandQueue &queue) override; + +private: + cl::Buffer *_out_key_buf; +}; + +class CLTopKV2ReorderNegatives : public ICLKernel +{ +public: + /** Constructor */ + CLTopKV2ReorderNegatives(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLTopKV2ReorderNegatives(const CLTopKV2ReorderNegatives &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLTopKV2ReorderNegatives &operator=(const CLTopKV2ReorderNegatives &) = delete; + /** Allow instances of this class to be moved */ + CLTopKV2ReorderNegatives(CLTopKV2ReorderNegatives &&) = default; + /** Allow instances of this class to be moved */ + CLTopKV2ReorderNegatives &operator=(CLTopKV2ReorderNegatives &&) = default; + + void configure(cl::Buffer *first_negative_idx_buf, int n); + + void setBuffers(cl::Buffer *in_key_buf, cl::Buffer *out_key_buf, cl::Buffer *in_ind_buf, + cl::Buffer *out_ind_buf) + { + _in_key_buf = in_key_buf; + _out_key_buf = out_key_buf; + _in_ind_buf = in_ind_buf; + _out_ind_buf = out_ind_buf; + } + + // Inherited methods overridden: + void run(const Window &window, cl::CommandQueue &queue) override; + +private: + cl::Buffer *_in_key_buf; + cl::Buffer *_out_key_buf; + cl::Buffer *_in_ind_buf; + cl::Buffer *_out_ind_buf; +}; + +class CLTopKV2Store : public ICLKernel +{ +public: + /** Constructor */ + CLTopKV2Store(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLTopKV2Store(const CLTopKV2Store &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLTopKV2Store &operator=(const CLTopKV2Store &) = delete; + /** Allow instances of 
this class to be moved */ + CLTopKV2Store(CLTopKV2Store &&) = default; + /** Allow instances of this class to be moved */ + CLTopKV2Store &operator=(CLTopKV2Store &&) = default; + + void configure(ICLTensor *values, ICLTensor *indices, int k, int n); + + void setOutputBuffers(cl::Buffer *out_key_buf, cl::Buffer *out_ind_buf); + + // Inherited methods overridden: + void run(const Window &window, cl::CommandQueue &queue) override; + +private: + ICLTensor *_values; + ICLTensor *_indices; + cl::Buffer *_out_key_buf; + cl::Buffer *_out_ind_buf; +}; + +} // namespace arm_compute + +#endif // __ARM_COMPUTE_CLTOPKV2KERNEL_H__ diff --git a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLCast.h b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLCast.h new file mode 100644 index 000000000..63050067d --- /dev/null +++ b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLCast.h @@ -0,0 +1,45 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef __ARM_COMPUTE_CLCAST_H__ +#define __ARM_COMPUTE_CLCAST_H__ + +#include "arm_compute/core/Types.h" +#include "arm_compute/runtime/CL/ICLSimpleFunction.h" + +namespace arm_compute +{ +class ICLTensor; + +/** Basic function to run @ref CLCastKernel + * + * @note The tensor data type for the inputs must be U8/QASYMM8/S16/S32/F16/F32. 
+ * @note The function converts the input tensor to the tensor of the output tensor's type. + */ +class CLCast : public ICLSimpleFunction +{ +public: + /** Initialise the kernel's input and output. + * + * @param[in, out] input Input tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32. + * The input tensor is [in, out] because its TensorInfo might be modified + * inside the kernel. + * @param[out] output Output tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32. + */ + void configure(ICLTensor *input, ICLTensor *output); +}; +} +#endif /* __ARM_COMPUTE_CLCAST_H__ */ diff --git a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLGather.h b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLGather.h new file mode 100644 index 000000000..3ae7afe14 --- /dev/null +++ b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLGather.h @@ -0,0 +1,49 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef __ARM_COMPUTE_CLGATHER_H__ +#define __ARM_COMPUTE_CLGATHER_H__ + +#include "arm_compute/core/Types.h" +#include "arm_compute/runtime/CL/ICLSimpleFunction.h" + +namespace arm_compute +{ +class ICLTensor; + +/** Basic function to run @ref CLGatherKernel. */ +class CLGather : public ICLSimpleFunction +{ +public: + /** Initialise the kernel's inputs, output and convertion policy. + * + * @param[in] input1 An input tensor. 
Data types supported: U8/S32/F32. + * @param[in] input2 An indexes tensor. Data types supported: S32. + * @param[out] output The output tensor, Data types supported: same as @p input1. + */ + void configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output); + /** Static function to check if given info will lead to a valid configuration of @ref CLGather + * + * @param[in] input1 An input tensor. Data types supported: U8/S32/F32. + * @param[in] input2 An indexes tensor. Data types supported: S32. + * @param[out] output The output tensor, Data types supported: same as @p input1. + * @return a status + */ + static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, + const ITensorInfo *output); +}; +} +#endif /*__ARM_COMPUTE_CLGATHER_H__ */ diff --git a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLPixelWiseDivision.h b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLPixelWiseDivision.h new file mode 100644 index 000000000..c1383e21f --- /dev/null +++ b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLPixelWiseDivision.h @@ -0,0 +1,72 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#ifndef __ARM_COMPUTE_CLPIXELWISEDIVISION_H__ +#define __ARM_COMPUTE_CLPIXELWISEDIVISION_H__ + +#include "arm_compute/core/Types.h" +#include "arm_compute/runtime/CL/ICLSimpleFunction.h" + +namespace arm_compute +{ +class ICLTensor; + +/** Basic function to run @ref CLPixelWiseDivisionKernel. */ +class CLPixelWiseDivision : public ICLSimpleFunction +{ +public: + /** Initialise the kernel's inputs, output and convertion policy. + * + * @param[in, out] input1 An input tensor. Data types supported: U8/QS8/QS16/S16/F16/F32. + * The input tensor is [in, out] because its TensorInfo might be + * modified inside the kernel in case of broadcasting of dimension 0. + * @param[in, out] input2 An input tensor. Data types supported: same as @p input1. + * The input tensor is [in, out] because its TensorInfo might be + * modified inside the kernel in case of broadcasting of dimension 0. + * @param[out] output The output tensor, Data types supported: same as @p input1. + * Note: U8 (QS8, QS16) requires both inputs to be U8 (QS8, QS16). + * @param[in] scale Scale to apply after multiplication. + * Scale must be positive and its value must be either 1/255 or + * 1/2^n where n is between 0 and 15. For QS8 and QS16 scale must be 1. + * @param[in] overflow_policy Overflow policy. Supported overflow policies: Wrap, Saturate + * @param[in] rounding_policy Rounding policy. Supported rounding modes: to zero, to nearest + * even. + */ + void configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, float scale = 1.f, + ConvertPolicy overflow_policy = ConvertPolicy::WRAP, + RoundingPolicy rounding_policy = RoundingPolicy::TO_ZERO); + /** Static function to check if given info will lead to a valid configuration of @ref + * CLPixelWiseDivision + * + * @param[in] input1 An input tensor info. Data types supported: U8/QS8/QS16/S16/F16/F32. + * @param[in] input2 An input tensor info. Data types supported: same as @p input1. 
+ * @param[in] output The output tensor info, Data types supported: same as @p input1. + * Note: U8 (QS8, QS16) requires both inputs to be U8 (QS8, QS16). + * @param[in] scale Scale to apply after multiplication. + * Scale must be positive and its value must be either 1/255 or 1/2^n + * where n is between 0 and 15. For QS8 and QS16 scale must be 1. + * @param[in] overflow_policy Overflow policy. Supported overflow policies: Wrap, Saturate + * @param[in] rounding_policy Rounding policy. Supported rounding modes: to zero, to nearest even. + * + * @return a status + */ + static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, + const ITensorInfo *output, float scale = 1.f, + ConvertPolicy overflow_policy = ConvertPolicy::WRAP, + RoundingPolicy rounding_policy = RoundingPolicy::TO_ZERO); +}; +} +#endif /*__ARM_COMPUTE_CLPIXELWISEDIVISION_H__ */ diff --git a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLReduceMax.h b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLReduceMax.h new file mode 100644 index 000000000..14b473f33 --- /dev/null +++ b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLReduceMax.h @@ -0,0 +1,81 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2017 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#ifndef __ARM_COMPUTE_CLREDUCE_MAX_H__ +#define __ARM_COMPUTE_CLREDUCE_MAX_H__ + +#include "arm_compute/runtime/CL/CLArray.h" +#include "arm_compute/runtime/IFunction.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/CL/ICLKernel.h" + +namespace arm_compute +{ +class ICLTensor; + +/** Basic function to execute TopK operation. This function calls the following OpenCL kernels: + * + * -# @ref CLTopKV2Kernel + */ +class CLReduceMax : public IFunction +{ +public: + /** Constructor */ + CLReduceMax(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLReduceMax(const CLReduceMax &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLReduceMax &operator=(const CLReduceMax &) = delete; + /** Allow instances of this class to be moved */ + CLReduceMax(CLReduceMax &&) = default; + /** Allow instances of this class to be moved */ + CLReduceMax &operator=(CLReduceMax &&) = default; + /** Initialise the kernel's inputs and outputs. + * + * @note When locations of min and max occurrences are requested, the reported number of locations + * is limited to the given array size. + * + * @param[in] input Input image. Data types supported: F32 + * @param[in] axis Axis to reduce. Data type supported: S32 + * @param[out] output indices related to top k values. Data types supported: F32. + */ + void configure(ICLTensor *input, int32_t axis, ICLTensor *output); + /** Static function to check if given info will lead to a valid configuration of @ref + * CLPixelWiseDivision + * + * @param[in] input Input image. Data types supported: F32 + * @param[in] axis Axis to reduce. Data type supported: S32 + * @param[out] output indices related to top k values. Data types supported: F32. 
* + * + * @return a status + */ + static Status validate(const ITensorInfo *input, int32_t axis, const ITensorInfo *output); + + // Inherited methods overridden: + void run() override; + +private: + void run_on_cpu(); + + int32_t _axis; + + ICLTensor *_input; + ICLTensor *_output; + + std::unique_ptr<ICLKernel> _kernel; +}; +} +#endif /*__ARM_COMPUTE_CLREDUCE_MAX_H__ */ diff --git a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLReductionMean.h b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLReductionMean.h new file mode 100644 index 000000000..2081518c1 --- /dev/null +++ b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLReductionMean.h @@ -0,0 +1,73 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __ARM_COMPUTE_CLREDUCTIONMEAN_H__ +#define __ARM_COMPUTE_CLREDUCTIONMEAN_H__ + +#include "arm_compute/core/CL/kernels/CLFillBorderKernel.h" +#include "arm_compute/core/CL/kernels/CLReductionMeanKernel.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/runtime/CL/CLTensor.h" +#include "arm_compute/runtime/IFunction.h" + +#include <cstdint> +#include <memory> +#include <vector> + +namespace arm_compute +{ +class ICLTensor; + +/** Perform reduction operation. + */ +class CLReductionMean : public IFunction +{ +public: + /** Default Constructor. 
+ */ + CLReductionMean(); + + /** Set the input and output tensors. + * + * @param[in] input Source tensor. Data types supported: F32. Data layouts supported: NCHW. + * @param[out] output Destination tensor. Data types and data layouts supported: Same as @p input. + * @param[in] axis Axis along which to reduce. Supported reduction axis : 0,1 + */ + void configure(ICLTensor *input, ICLTensor *output, std::vector<uint32_t> axis); + + /** Static function to check if given info will lead to a valid configuration of @ref + * CLReductionMean. + * + * @param[in] input Source tensor info. Data types supported: F32. Data layouts supported: NCHW. + * @param[in] output Destination tensor info. Data types and data layouts supported: Same as @p + * input. + * @param[in] axis Axis along which to reduce. Supported reduction axis : 0,1 + * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *output, + std::vector<uint32_t> axis); + + // Inherited methods overridden: + void run() override; + +private: + CLReductionMeanKernel _reduction_mean_kernel; + CLFillBorderKernel _fill_border_kernel; +}; +} +#endif /*__ARM_COMPUTE_CLREDUCTIONMEAN_H__ */ diff --git a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLStridedSlice.h b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLStridedSlice.h new file mode 100644 index 000000000..f223a79be --- /dev/null +++ b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLStridedSlice.h @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2017 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef __ARM_COMPUTE_CLSTRIDEDSLICE_H__ +#define __ARM_COMPUTE_CLSTRIDEDSLICE_H__ + +#include "arm_compute/runtime/IFunction.h" +#include "arm_compute/runtime/CL/ICLSimpleFunction.h" + +namespace arm_compute +{ +class ICLTensor; + +/** Basic function to run @ref CLStridedSliceKernel */ +class CLStridedSlice : public ICLSimpleFunction +{ +public: + /** Initialise the kernel's inputs and outputs + * + * @param[in] input First tensor input. Data type supported: + * U8/S8/QS8/QASYMM8/U16/S16/QS16/U32/S32/F16/F32 + * @param[out] output Output tensor. Data type supported: Same as @p input + */ + void configure(const ICLTensor *input, ICLTensor *output, ICLTensor *beginData, + ICLTensor *endData, ICLTensor *stridesData, int32_t beginMask, int32_t endMask, + int32_t shrinkAxisMask); +}; + +class CLStridedSliceCPU : public IFunction +{ +public: + /** Initialise inputs and outputs + * + * @param[in] input First tensor input. + * @param[out] output Output tensor. 
+ */ + void configure(ICLTensor *input, ICLTensor *output, ICLTensor *beginData, ICLTensor *endData, + ICLTensor *stridesData, int32_t beginMask, int32_t endMask, + int32_t shrinkAxisMask); + + void run() override; + +private: + void run_on_cpu(); + + ICLTensor *_input; + ICLTensor *_output; + ICLTensor *_beginData; + ICLTensor *_endData; + ICLTensor *_stridesData; + int32_t _beginMask; + int32_t _endMask; + int32_t _shrinkAxisMask; +}; +} +#endif /*__ARM_COMPUTE_CLSTRIDEDSLICE_H__ */ diff --git a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLTopKV2.h b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLTopKV2.h new file mode 100644 index 000000000..06cd1ee9b --- /dev/null +++ b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLTopKV2.h @@ -0,0 +1,109 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef __ARM_COMPUTE_CLTOPK_V2_H__ +#define __ARM_COMPUTE_CLTOPK_V2_H__ + +#include "arm_compute/core/CL/kernels/CLTopKV2Kernel.h" + +#include "arm_compute/runtime/CL/CLArray.h" +#include "arm_compute/runtime/IFunction.h" + +namespace arm_compute +{ +class ICLTensor; + +/** Basic function to execute TopK operation. 
This function calls the following OpenCL kernels: + * + * -# @ref CLTopKV2Kernel + */ +class CLTopKV2 : public IFunction +{ +public: + /** Constructor */ + CLTopKV2(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLTopKV2(const CLTopKV2 &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLTopKV2 &operator=(const CLTopKV2 &) = delete; + /** Allow instances of this class to be moved */ + CLTopKV2(CLTopKV2 &&) = default; + /** Allow instances of this class to be moved */ + CLTopKV2 &operator=(CLTopKV2 &&) = default; + /** Initialise the kernel's inputs and outputs. + * + * @note When locations of min and max occurrences are requested, the reported number of locations + * is limited to the given array size. + * + * @param[in] input Input image. Data types supported: U8/S16/F32. + * @param[in] k The value of `k`. + * @param[out] values Top k values. Data types supported: S32 if input type is U8/S16, F32 if + * input type is F32. + * @param[out] indices indices related to top k values. Data types supported: S32 if input type + * is U8/S16, F32 if input type is F32. 
+ */ + void configure(ICLTensor *input, int k, ICLTensor *values, ICLTensor *indices, + int total_bits = 32, int bits = 4); + + // Inherited methods overridden: + void run() override; + +private: + void run_on_cpu(); + void run_on_gpu(); + void run_on_gpu_single_quicksort(); + + uint32_t _k; + uint32_t _total_bits; + uint32_t _bits; + uint32_t _radix; + uint32_t _hist_buf_size; + uint32_t _glob_sum_buf_size; + uint32_t _n; + + ICLTensor *_input; + ICLTensor *_values; + ICLTensor *_indices; + + cl::Buffer _qs_idx_buf; + cl::Buffer _qs_temp_buf; + cl::Buffer _hist_buf; + cl::Buffer _glob_sum_buf; + cl::Buffer _temp_buf; + cl::Buffer _first_negative_idx_buf; + cl::Buffer _in_key_buf; + cl::Buffer _out_key_buf; + cl::Buffer _in_ind_buf; + cl::Buffer _out_ind_buf; + + cl::Buffer *_p_in_key_buf; + cl::Buffer *_p_out_key_buf; + cl::Buffer *_p_in_ind_buf; + cl::Buffer *_p_out_ind_buf; + + CLTopKV2Single _qs_kernel; + CLTopKV2Init _init_kernel; + CLRadixSortHistogram _hist_kernel; + CLRadixSortScanHistogram _scan_hist_kernel; + CLRadixSortGlobalScanHistogram _glob_scan_hist_kernel; + CLRadixSortPasteHistogram _paste_hist_kernel; + CLRadixSortReorder _reorder_kernel; + CLTopKV2FindFirstNegative _find_first_negative_kernel; + CLTopKV2ReorderNegatives _reorder_negatives_kernel; + CLTopKV2Store _store_kernel; +}; +} +#endif // __ARM_COMPUTE_CLTOPK_V2_H__ diff --git a/libs/ARMComputeEx/resolve_includes.py b/libs/ARMComputeEx/resolve_includes.py new file mode 100644 index 000000000..b3e252892 --- /dev/null +++ b/libs/ARMComputeEx/resolve_includes.py @@ -0,0 +1,102 @@ +# Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved +# Copyright (c) 2016, 2017 ARM Limited. 
+# +# SPDX-License-Identifier: MIT +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to +# deal in the Software without restriction, including without limitation the +# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +# sell copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+import collections +import os.path +import re +import subprocess +import glob + + +def resolve_includes(target, source): + # File collection + FileEntry = collections.namedtuple('FileEntry', 'target_name file_contents') + + # Include pattern + pattern = re.compile("#include \"(.*)\"") + + # Get file contents + files = [] + for i in range(len(source)): + src = source[i] + dst = target[i] + f = open(src) + cts = f.read() + f.close() + contents = cts.splitlines() + entry = FileEntry(target_name=dst, file_contents=contents) + files.append((os.path.basename(src), entry)) + + # Create dictionary of tupled list + files_dict = dict(files) + + # Check for includes (can only be files in the same folder) + final_files = [] + for file in files: + done = False + tmp_file = file[1].file_contents + print(file[1].target_name) + while not done: + file_count = 0 + updated_file = [] + for line in tmp_file: + found = pattern.search(line) + if found: + include_file = found.group(1) + data = files_dict[include_file].file_contents + updated_file.extend(data) + else: + updated_file.append(line) + file_count += 1 + + # Check if all include are replaced. 
+ if file_count == len(tmp_file): + done = True + + # Update temp file + tmp_file = updated_file + + # Append and prepend string literal identifiers and add expanded file to final list + tmp_file.insert(0, "R\"(\n") + tmp_file.append("\n)\"") + entry = FileEntry(target_name=file[1].target_name, file_contents=tmp_file) + final_files.append((file[0], entry)) + + # Write output files + for file in final_files: + with open(file[1].target_name, 'w+') as out_file: + out_file.write("\n".join(file[1].file_contents)) + + +# Generate embed files +cl_files = glob.glob('src/core/CL/cl_kernels/*.cl') +cl_files += glob.glob('src/core/CL/cl_kernels/*.h') + +# DEBUG: print cl files +print("cl_files:") +print(cl_files) + +embed_files = [f + "embed" for f in cl_files] +print("embed_files:") +print(embed_files) + +resolve_includes(embed_files, cl_files) diff --git a/libs/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp b/libs/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp new file mode 100644 index 000000000..d535c5da4 --- /dev/null +++ b/libs/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp @@ -0,0 +1,547 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/CLKernelLibraryEx.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Utils.h" + +#include <algorithm> +#include <fstream> +#include <iostream> +#include <utility> +#include <vector> + +using namespace arm_compute; + +const std::map<std::string, std::string> CLKernelLibraryEx::_kernel_program_map = { + {"absdiff", "absdiff.cl"}, + {"accumulate", "accumulate.cl"}, + {"accumulate_squared", "accumulate.cl"}, + {"accumulate_weighted", "accumulate.cl"}, + {"activation_layer", "activation_layer.cl"}, + {"activation_layer_qa8", "activation_layer_qa8.cl"}, + {"activation_layer_logistic_qa8", "activation_layer_qa8.cl"}, + {"arithmetic_add", "arithmetic_op.cl"}, + {"arithmetic_sub", "arithmetic_op.cl"}, + {"arithmetic_add_qasymm8", "arithmetic_op_quantized.cl"}, + {"batchnormalization_layer_nchw", "batchnormalization_layer.cl"}, + {"batchnormalization_layer_nhwc", "batchnormalization_layer.cl"}, + {"bitwise_or", "bitwise_op.cl"}, + {"bitwise_and", "bitwise_op.cl"}, + {"bitwise_xor", "bitwise_op.cl"}, + {"bitwise_not", "bitwise_op.cl"}, + {"cast", "cast.cl"}, + {"cast_qasymm_in", "cast.cl"}, + {"cast_qasymm_out", "cast.cl"}, + {"channel_combine_NV", "channel_combine.cl"}, + {"channel_combine_RGB888", "channel_combine.cl"}, + {"channel_combine_RGBA8888", "channel_combine.cl"}, + {"channel_combine_UYVY422", "channel_combine.cl"}, + 
{"channel_combine_YUYV422", "channel_combine.cl"}, + {"channel_shuffle_nchw", "channel_shuffle.cl"}, + {"channel_extract_NV12", "channel_extract.cl"}, + {"channel_extract_NV21", "channel_extract.cl"}, + {"channel_extract_RGB888", "channel_extract.cl"}, + {"channel_extract_RGBA8888", "channel_extract.cl"}, + {"channel_extract_UYVY422", "channel_extract.cl"}, + {"channel_extract_YUYV422", "channel_extract.cl"}, + {"combine_gradients_L1", "canny.cl"}, + {"combine_gradients_L2", "canny.cl"}, + {"concatenate_depth", "concatenate.cl"}, + {"concatenate_width", "concatenate.cl"}, + {"convolution_rectangle", "convolution_rectangle.cl"}, + {"col2im", "col2im.cl"}, + {"convert_depth_down", "depth_convert.cl"}, + {"convert_depth_up", "depth_convert.cl"}, + {"convert_fc_weights", "convert_fc_weights.cl"}, + {"convolution3x3_static", "convolution3x3.cl"}, + {"convolution5x5_static", "convolution5x5.cl"}, + {"convolution7x7_static", "convolution7x7.cl"}, + {"convolution9x9_static", "convolution9x9.cl"}, + {"convolution_separable1x5_static", "convolution5x5.cl"}, + {"convolution_separable5x1_static", "convolution5x5.cl"}, + {"convolution_separable1x7_static", "convolution7x7.cl"}, + {"convolution_separable7x1_static", "convolution7x7.cl"}, + {"convolution_separable1x9_static", "convolution9x9.cl"}, + {"convolution_separable9x1_static", "convolution9x9.cl"}, + {"copy_tensor", "copy_tensor.cl"}, + {"copy_plane", "channel_extract.cl"}, + {"copy_planes_3p", "channel_combine.cl"}, + {"copy_to_keypoint", "fast_corners.cl"}, + {"deconvolution_upsample", "deconvolution_layer.cl"}, + {"depthwise_convolution_3x3", "depthwise_convolution.cl"}, + {"depthwise_convolution_3x3_f16", "depthwise_convolution.cl"}, + {"depthwise_convolution_3x3_quantized_nchw", "depthwise_convolution_quantized.cl"}, + {"depthwise_convolution_3x3_quantized_nhwc_stride1", "depthwise_convolution_quantized.cl"}, + {"depthwise_convolution_3x3_quantized_nhwc_stride2", "depthwise_convolution_quantized.cl"}, + 
{"depthwise_convolution_3x3_stridex1_stridey1_bifrost_f16", "depthwise_convolution.cl"}, + {"depthwise_convolution_3x3_stridex2_stridey2_bifrost_f16", "depthwise_convolution.cl"}, + {"depthwise_convolution_3x3_stridex1_stridey1_bifrost_f32", "depthwise_convolution.cl"}, + {"depthwise_convolution_3x3_stridex2_stridey2_bifrost_f32", "depthwise_convolution.cl"}, + {"depthwise_im2col", "depthwise_convolution.cl"}, + {"depthwise_vector_to_tensor", "depthwise_convolution.cl"}, + {"depthwise_weights_reshape", "depthwise_convolution.cl"}, + {"dequantization_layer", "dequantization_layer.cl"}, + {"derivative", "derivative.cl"}, + {"dilate", "dilate.cl"}, + {"direct_convolution1x1", "direct_convolution1x1.cl"}, + {"direct_convolution1x1_f32_bifrost", "direct_convolution1x1.cl"}, + {"direct_convolution3x3", "direct_convolution3x3.cl"}, + {"direct_convolution3x3_f32_bifrost", "direct_convolution3x3.cl"}, + {"direct_convolution5x5", "direct_convolution5x5.cl"}, + {"direct_convolution5x5_f32_bifrost", "direct_convolution5x5.cl"}, + {"direct_convolution_1x1_3x3_5x5_quantized", "direct_convolution_1x1_3x3_5x5_quantized.cl"}, + {"erode", "erode.cl"}, + {"fast_corners", "fast_corners.cl"}, + {"fill_image_borders_constant", "fill_border.cl"}, + {"fill_image_borders_replicate", "fill_border.cl"}, + {"finalize", "optical_flow_pyramid_lk.cl"}, + {"floor_layer", "floor.cl"}, + {"gather", "gather.cl"}, + {"gather_1d", "gather.cl"}, + {"gather_1d_out", "gather.cl"}, + {"gaussian1x5_sub_x", "gaussian_pyramid.cl"}, + {"gaussian5x1_sub_y", "gaussian_pyramid.cl"}, + {"gemm_accumulate_biases", "gemm.cl"}, + {"gemm_interleave4x4", "gemm.cl"}, + {"gemm_ma_f16", "gemm.cl"}, + {"gemm_ma_f32", "gemm.cl"}, + {"gemm_ma_qs8", "gemm.cl"}, + {"gemm_ma_qs16", "gemm.cl"}, + {"gemm_mv", "gemv.cl"}, + {"gemm_mv_quantized", "gemv.cl"}, + {"gemm_mm_interleaved_transposed_f16", "gemm.cl"}, + {"gemm_mm_interleaved_transposed_f16_bifrost", "gemm.cl"}, + {"gemm_mm_interleaved_transposed_f32", "gemm.cl"}, + 
{"gemm_mm_interleaved_transposed_f32_bifrost", "gemm.cl"}, + {"gemm_mm_interleaved_transposed_qs8", "gemm.cl"}, + {"gemm_mm_interleaved_transposed_qs16", "gemm.cl"}, + {"gemm_mm_floating_point", "gemm.cl"}, + {"gemm_mm_floating_point_f16_bifrost", "gemm.cl"}, + {"gemm_mm_floating_point_f32_bifrost", "gemm.cl"}, + {"gemm_mm_floating_point_f32_bifrost_1000", "gemm.cl"}, + {"gemm_mm_qs8", "gemm.cl"}, + {"gemm_mm_qs16", "gemm.cl"}, + {"gemm_lc_vm_f32", "gemm.cl"}, + {"gemm_transpose1xW", "gemm.cl"}, + {"gemmlowp_matrix_a_reduction", "gemmlowp.cl"}, + {"gemmlowp_matrix_b_reduction", "gemmlowp.cl"}, + {"gemmlowp_mm_bifrost", "gemmlowp.cl"}, + {"gemmlowp_mm_midgard", "gemmlowp.cl"}, + {"gemmlowp_mm_interleaved_transposed_bifrost", "gemmlowp.cl"}, + {"gemmlowp_mm_interleaved_transposed_midgard", "gemmlowp.cl"}, + {"gemmlowp_offset_contribution", "gemmlowp.cl"}, + {"gemmlowp_output_stage_quantize_down", "gemmlowp.cl"}, + {"gemmlowp_output_stage_quantize_down_fixedpoint", "gemmlowp.cl"}, + {"harris_score_3x3", "harris_corners.cl"}, + {"harris_score_5x5", "harris_corners.cl"}, + {"harris_score_7x7", "harris_corners.cl"}, + {"hist_border_kernel", "histogram.cl"}, + {"hist_border_kernel_fixed", "histogram.cl"}, + {"hist_local_kernel", "histogram.cl"}, + {"hist_local_kernel_fixed", "histogram.cl"}, + {"hog_block_normalization", "hog.cl"}, + {"hog_detector", "hog.cl"}, + {"hog_orientation_binning", "hog.cl"}, + {"hysteresis", "canny.cl"}, + {"im2col1x1_stridex1_dchw", "im2col.cl"}, + {"im2col3x3_dchw", "im2col.cl"}, + {"im2col5x5_dchw", "im2col.cl"}, + {"im2col11x11_padx0_pady0_dchw", "im2col.cl"}, + {"im2col_generic_dchw", "im2col.cl"}, + {"im2col_generic_padx0_pady0_dchw", "im2col.cl"}, + {"im2col_reduced_dchw", "im2col.cl"}, + {"init_level", "optical_flow_pyramid_lk.cl"}, + {"init_level_max", "optical_flow_pyramid_lk.cl"}, + {"init_level_max_initial_estimate", "optical_flow_pyramid_lk.cl"}, + {"integral_horizontal", "integral_image.cl"}, + {"integral_vertical", 
"integral_image.cl"}, + {"IYUV_to_NV12_bt709", "color_convert.cl"}, + {"IYUV_to_RGB888_bt709", "color_convert.cl"}, + {"IYUV_to_RGBA8888_bt709", "color_convert.cl"}, + {"IYUV_to_YUV444_bt709", "color_convert.cl"}, + {"l2_normalize", "l2_normalize.cl"}, + {"lktracker_stage0", "optical_flow_pyramid_lk.cl"}, + {"lktracker_stage1", "optical_flow_pyramid_lk.cl"}, + {"magnitude_phase", "magnitude_phase.cl"}, + {"mean_stddev_accumulate", "mean_stddev.cl"}, + {"minmax", "minmaxloc.cl"}, + {"minmax_border", "minmaxloc.cl"}, + {"minmax_layer", "minmax_layer.cl"}, + {"minmaxloc", "minmaxloc.cl"}, + {"non_linear_filter_box3x3", "non_linear_filter3x3.cl"}, + {"non_linear_filter_cross3x3", "non_linear_filter3x3.cl"}, + {"non_linear_filter_disk3x3", "non_linear_filter3x3.cl"}, + {"non_linear_filter_box5x5", "non_linear_filter5x5.cl"}, + {"non_linear_filter_cross5x5", "non_linear_filter5x5.cl"}, + {"non_linear_filter_disk5x5", "non_linear_filter5x5.cl"}, + {"non_max_suppression", "nonmax.cl"}, + {"normalization_layer_cross_map", "normalization_layer.cl"}, + {"normalization_layer_in_map", "normalization_layer.cl"}, + {"NV12_to_IYUV_bt709", "color_convert.cl"}, + {"NV12_to_RGB888_bt709", "color_convert.cl"}, + {"NV12_to_RGBA8888_bt709", "color_convert.cl"}, + {"NV12_to_YUV444_bt709", "color_convert.cl"}, + {"NV21_to_IYUV_bt709", "color_convert.cl"}, + {"NV21_to_RGB888_bt709", "color_convert.cl"}, + {"NV21_to_RGBA8888_bt709", "color_convert.cl"}, + {"NV21_to_YUV444_bt709", "color_convert.cl"}, + {"output_stage_quantized", "direct_convolution_1x1_3x3_5x5_quantized.cl"}, + {"permute_201", "permute.cl"}, + {"permute_120", "permute.cl"}, + {"permute_3201", "permute.cl"}, + {"pixelwise_mul_float", "pixelwise_mul_float.cl"}, + {"pixelwise_mul_int", "pixelwise_mul_int.cl"}, + {"pixelwise_mul_qasymm8", "pixelwise_mul_quantized.cl"}, + {"pixelwise_div_float", "pixelwise_div_float.cl"}, + {"pixelwise_div_int", "pixelwise_div_int.cl"}, + {"pooling_layer_2", "pooling_layer.cl"}, + 
{"pooling_layer_3", "pooling_layer.cl"}, + {"pooling_layer_optimized_3", "pooling_layer.cl"}, + {"pooling_layer_7", "pooling_layer.cl"}, + {"pooling_layer_MxN_nchw", "pooling_layer.cl"}, + {"pooling_layer_MxN_nhwc", "pooling_layer.cl"}, + {"pooling_layer_MxN_quantized_nhwc", "pooling_layer_quantized.cl"}, + {"pooling_layer_MxN_quantized_nchw", "pooling_layer_quantized.cl"}, + {"quantization_layer", "quantization_layer.cl"}, + {"reduce_max", "reduce_max.cl"}, + {"reduction_operation", "reduction_operation.cl"}, + {"reduction_mean", "reduction_mean.cl"}, + {"remap_nearest_neighbour", "remap.cl"}, + {"remap_bilinear", "remap.cl"}, + {"reshape_layer", "reshape_layer.cl"}, + {"reshape_to_columns", "convolution_layer.cl"}, + {"RGB888_to_IYUV_bt709", "color_convert.cl"}, + {"RGB888_to_NV12_bt709", "color_convert.cl"}, + {"RGB888_to_RGBA8888_bt709", "color_convert.cl"}, + {"RGB888_to_YUV444_bt709", "color_convert.cl"}, + {"RGBA8888_to_IYUV_bt709", "color_convert.cl"}, + {"RGBA8888_to_NV12_bt709", "color_convert.cl"}, + {"RGBA8888_to_RGB888_bt709", "color_convert.cl"}, + {"RGBA8888_to_YUV444_bt709", "color_convert.cl"}, + {"roi_pooling_layer", "roi_pooling_layer.cl"}, + {"scale_nearest_neighbour", "scale.cl"}, + {"scale_bilinear", "scale.cl"}, + {"scharr3x3", "scharr_filter.cl"}, + {"sobel3x3", "sobel_filter.cl"}, + {"sobel_separable5x1", "sobel_filter.cl"}, + {"sobel_separable1x5", "sobel_filter.cl"}, + {"sobel_separable7x1", "sobel_filter.cl"}, + {"sobel_separable1x7", "sobel_filter.cl"}, + {"softmax_layer_norm", "softmax_layer.cl"}, + {"softmax_layer_norm_quantized", "softmax_layer_quantized.cl"}, + {"softmax_layer_max_shift_exp_sum_quantized_serial", "softmax_layer_quantized.cl"}, + {"softmax_layer_max_shift_exp_sum_quantized_parallel", "softmax_layer_quantized.cl"}, + {"softmax_layer_max_shift_exp_sum_serial", "softmax_layer.cl"}, + {"softmax_layer_max_shift_exp_sum_parallel", "softmax_layer.cl"}, + {"strided_slice", "strided_slice.cl"}, + {"suppress_non_maximum", 
"canny.cl"}, + {"tablelookup_U8", "tablelookup.cl"}, + {"tablelookup_S16", "tablelookup.cl"}, + {"threshold_binary", "threshold.cl"}, + {"threshold_range", "threshold.cl"}, + {"transpose", "transpose.cl"}, + {"UYVY422_to_IYUV_bt709", "color_convert.cl"}, + {"UYVY422_to_NV12_bt709", "color_convert.cl"}, + {"UYVY422_to_RGB888_bt709", "color_convert.cl"}, + {"UYVY422_to_RGBA8888_bt709", "color_convert.cl"}, + {"warp_affine_nearest_neighbour", "warp_affine.cl"}, + {"warp_affine_bilinear", "warp_affine.cl"}, + {"warp_perspective_nearest_neighbour", "warp_perspective.cl"}, + {"warp_perspective_bilinear", "warp_perspective.cl"}, + {"winograd_filter_transform_2x2_3x3_nchw", "winograd.cl"}, + {"winograd_filter_transform_4x4_3x3_nchw", "winograd.cl"}, + {"winograd_filter_transform_4x4_5x5_nchw", "winograd.cl"}, + {"winograd_input_transform_4x4_5x5_stepz1_nchw", "winograd.cl"}, + {"winograd_input_transform_2x2_3x3_stepz1_nchw", "winograd.cl"}, + {"winograd_input_transform_2x2_3x3_stepz2_nchw", "winograd.cl"}, + {"winograd_input_transform_4x4_3x3_stepz1_nchw", "winograd.cl"}, + {"winograd_output_transform_2x2_3x3_nchw", "winograd.cl"}, + {"winograd_output_transform_4x4_3x3_nchw", "winograd.cl"}, + {"winograd_output_transform_4x4_5x5_nchw", "winograd.cl"}, + {"YUYV422_to_IYUV_bt709", "color_convert.cl"}, + {"YUYV422_to_NV12_bt709", "color_convert.cl"}, + {"YUYV422_to_RGB888_bt709", "color_convert.cl"}, + {"YUYV422_to_RGBA8888_bt709", "color_convert.cl"}, + {"topkv2_init", "topkv2.cl"}, + {"topkv2_find_first_negative", "topkv2.cl"}, + {"topkv2_reorder_negatives", "topkv2.cl"}, + {"topkv2_store", "topkv2.cl"}, + {"radixsort_histogram", "topkv2_radixsort.cl"}, + {"radixsort_scanhistograms", "topkv2_radixsort.cl"}, + {"radixsort_pastehistograms", "topkv2_radixsort.cl"}, + {"radixsort_reorder", "topkv2_radixsort.cl"}, + {"topkv2_quicksort", "topkv2_quicksort.cl"}, +}; + +const std::map<std::string, std::string> CLKernelLibraryEx::_program_source_map = { +#ifdef EMBEDDED_KERNELS + { 
+ "cast.cl", +#include "./cl_kernels/cast.clembed" + }, + { + "fixed_point.h", +#include "./cl_kernels/fixed_point.hembed" + }, + { + "gather.cl", +#include "./cl_kernels/gather.clembed" + }, + { + "helpers.h", +#include "./cl_kernels/helpers.hembed" + }, + { + "helpers_asymm.h", +#include "./cl_kernels/helpers_asymm.hembed" + }, + { + "pixelwise_div_float.cl", +#include "./cl_kernels/pixelwise_div_float.clembed" + }, + { + "pixelwise_div_int.cl", +#include "./cl_kernels/pixelwise_div_int.clembed" + }, + { + "reduce_max.cl", +#include "./cl_kernels/reduce_max.clembed" + }, + { + "reduction_mean.cl", +#include "./cl_kernels/reduction_mean.clembed" + }, + { + "strided_slice.cl", +#include "./cl_kernels/strided_slice.clembed" + }, + { + "topkv2.cl", +#include "./cl_kernels/topkv2.clembed" + }, + { + "topkv2_radixsort.cl", +#include "./cl_kernels/topkv2_radixsort.clembed" + }, + { + "topkv2_quicksort.cl", +#include "./cl_kernels/topkv2_quicksort.clembed" + }, +#endif /* EMBEDDED_KERNELS */ +}; + +CLKernelLibraryEx::CLKernelLibraryEx() + : _context(), _device(), _kernel_path("."), _programs_map(), _built_programs_map() +{ + opencl_is_available(); // Make sure the OpenCL symbols are initialised *before* the + // CLKernelLibrary is built +} + +CLKernelLibraryEx &CLKernelLibraryEx::get() +{ + static CLKernelLibraryEx _kernel_library; + return _kernel_library; +} + +Kernel CLKernelLibraryEx::create_kernel(const std::string &kernel_name, + const StringSet &build_options_set) const +{ + // Find which program contains the kernel + auto kernel_program_it = _kernel_program_map.find(kernel_name); + + if (_kernel_program_map.end() == kernel_program_it) + { + ARM_COMPUTE_ERROR("Kernel %s not found in the CLKernelLibrary", kernel_name.c_str()); + } + std::string concat_str; + + if (fp16_supported(_device)) + { + concat_str += " -DARM_COMPUTE_OPENCL_FP16_ENABLED=1 "; + } + + if (get_cl_version(_device) == CLVersion::CL20) + { + concat_str += " -cl-std=CL2.0 "; + } + else if 
(arm_non_uniform_workgroup_supported(_device)) + { + concat_str += " -cl-arm-non-uniform-work-group-size "; + } + else + { + ARM_COMPUTE_ERROR("Non uniform workgroup size is not supported!!"); + } + + // Check if the program has been built before with same build options. + const std::string program_name = kernel_program_it->second; + const std::string build_options = stringify_set(build_options_set) + concat_str; + + const std::string built_program_name = program_name + "_" + build_options; + auto built_program_it = _built_programs_map.find(built_program_name); + + cl::Program cl_program; + + if (_built_programs_map.end() != built_program_it) + { + // If program has been built, retrieve to create kernel from it + cl_program = built_program_it->second; + } + else + { + // Get program + Program program = load_program(program_name); + + // Build program + cl_program = program.build(build_options); + + // Add built program to internal map + _built_programs_map.emplace(built_program_name, cl_program); + } + + // Create and return kernel + return Kernel(kernel_name, cl_program); +} + +void CLKernelLibraryEx::add_built_program(const std::string &built_program_name, + cl::Program program) +{ + _built_programs_map.emplace(built_program_name, program); +} + +const Program &CLKernelLibraryEx::load_program(const std::string &program_name) const +{ + const auto program_it = _programs_map.find(program_name); + + if (program_it != _programs_map.end()) + { + return program_it->second; + } + + Program program; + +#ifdef EMBEDDED_KERNELS + const auto program_source_it = _program_source_map.find(program_name); + + if (_program_source_map.end() == program_source_it) + { + ARM_COMPUTE_ERROR("Embedded program for %s does not exist.", program_name.c_str()); + } + + program = Program(_context, program_name, program_source_it->second); +#else /* EMBEDDED_KERNELS */ + // Check for binary + std::string source_name = _kernel_path + program_name; + std::string binary_name = source_name + 
"bin"; + + if (std::ifstream(binary_name).is_open()) + { + const std::string program_binary = read_file(binary_name, true); + program = Program(_context, _device, program_name, + std::vector<unsigned char>(program_binary.begin(), program_binary.end())); + } + else if (std::ifstream(source_name).is_open()) + { + program = Program(_context, program_name, read_file(source_name, false)); + } + else + { + ARM_COMPUTE_ERROR("Kernel file %s does not exist.", source_name.c_str()); + } +#endif /* EMBEDDED_KERNELS */ + + // Insert program to program map + const auto new_program = _programs_map.emplace(program_name, std::move(program)); + + return new_program.first->second; +} + +std::string CLKernelLibraryEx::stringify_set(const StringSet &s) const +{ + std::string concat_set; + +#ifndef EMBEDDED_KERNELS + concat_set += "-I" + _kernel_path + " "; +#endif /* EMBEDDED_KERNELS */ + + // Concatenate set + for (const auto &el : s) + { + concat_set += " " + el; + } + + return concat_set; +} + +std::string CLKernelLibraryEx::get_program_source(const std::string &program_name) +{ + const auto program_source_it = _program_source_map.find(program_name); + + if (program_source_it == _program_source_map.end()) + { + ARM_COMPUTE_ERROR("Embedded program for %s does not exist.", program_name.c_str()); + } + + return program_source_it->second; +} + +size_t CLKernelLibraryEx::max_local_workgroup_size(const cl::Kernel &kernel) const +{ + size_t result; + + size_t err = kernel.getWorkGroupInfo(_device, CL_KERNEL_WORK_GROUP_SIZE, &result); + ARM_COMPUTE_ERROR_ON_MSG( + err != 0, + "clGetKernelWorkGroupInfo failed to return the maximum workgroup size for the kernel"); + ARM_COMPUTE_UNUSED(err); + + return result; +} + +cl::NDRange CLKernelLibraryEx::default_ndrange() const +{ + cl::Device device = cl::Device::getDefault(); + GPUTarget _target = get_target_from_device(device); + cl::NDRange default_range; + + switch (_target) + { + case GPUTarget::MIDGARD: + case GPUTarget::T600: + case 
GPUTarget::T700: + case GPUTarget::T800: + default_range = cl::NDRange(128u, 1); + break; + default: + default_range = cl::NullRange; + } + + return default_range; +} + +std::string CLKernelLibraryEx::get_device_version() { return _device.getInfo<CL_DEVICE_VERSION>(); } diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/arithmetic_op_quantized.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/arithmetic_op_quantized.cl new file mode 100644 index 000000000..0c0a9ede6 --- /dev/null +++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/arithmetic_op_quantized.cl @@ -0,0 +1,138 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */
+#include "helpers_asymm.h"
+
+#if defined(FIXED_POINT_POSITION)
+#include "fixed_point.h"
+#endif /* FIXED_POINT_POSITION */
+
+#ifdef SATURATE
+#define ADD(x, y) add_sat((x), (y))
+#define SUB(x, y) sub_sat((x), (y))
+#else /* SATURATE */
+#define ADD(x, y) (x) + (y)
+#define SUB(x, y) (x) - (y)
+#endif /* SATURATE */
+
+/** Performs a pixelwise addition used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8
+ *
+ * The following computations will be performed:
+ *
+ * -# Add offset terms to inputs
+ * -# Get scaled value of two inputs
+ * -# Add inputs
+ * -# Add offset terms to final result
+ * -# Multiply each entry of result by result_mult_int
+ * -# Shift the int32 accumulator by result_shift
+ * -# Clamp the resulting int32 values to the [0..255] range and cast to QASYMM8.
+ *
+ * @attention The inputs and output data types need to be passed at compile time using -DDATA_TYPE_IN1, -DDATA_TYPE_IN2 and -DDATA_TYPE_OUT:
+ * e.g. -DDATA_TYPE_IN1=uchar -DDATA_TYPE_IN2=uchar -DDATA_TYPE_OUT=uchar
+ * @attention The number of bits to shift left of input tensors must be passed at compile time using -DLEFT_SHIFT
+ * @attention The offset, scalar scale factor and number of bits to shift right of input tensors must be passed at compile time using -DIN1_OFFSET, -DIN1_MULT_INT, -DIN1_SHIFT, -DIN2_OFFSET, -DIN2_MULT_INT and -DIN2_SHIFT
+ * @attention The offset, scalar scale factor and number of bits to shift right of output tensor must be passed at compile time using -DRESULT_OFFSET, -DRESULT_MULT_INT and -DRESULT_SHIFT
+ *
+ * @attention The input and output data_types need to be passed at compile time using -DDATA_TYPE_IN1, -DDATA_TYPE_IN2 and -DDATA_TYPE_OUT:
+ * e.g. -DDATA_TYPE_IN1=uchar -DDATA_TYPE_IN2=uchar -DDATA_TYPE_OUT=uchar
+ * @attention The inputs and output scale information of qasymm8 need to be passed at compile time using -DSCALE_IN1, -DSCALE_IN2 and -DSCALE_OUT:
+ * e.g. 
-DSCALE_IN1=1.f -DSCALE_IN2=1.f -DSCALE_OUT=2.f + * @attention The inputs and output scale offset need to be passed at compile time using -DOFFSET_IN1, -DOFFSET_IN2 and -DOFFSET_OUT: + * e.g. -DOFFSET_IN1=0 -DOFFSET_IN2=0 -DOFFSET_OUT=0 + * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16 + * @attention To perform saturating operation -DSATURATE has to be passed to the compiler otherwise wrapping policy will be used. + * + * @param[in] in1_ptr Pointer to the source tensor. Supported data types: QASYMM8 + * @param[in] in1_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] in1_step_x in1_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] in1_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] in1_step_y in1_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] in1_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] in1_step_z in1_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] in1_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[in] in2_ptr Pointer to the source tensor. 
Supported data types: QASYMM8 + * @param[in] in2_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] in2_step_x in2_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] in2_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] in2_step_y in2_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] in2_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] in2_step_z in2_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] in2_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] out_ptr Pointer to the destination tensor. Supported data types: QASYMM8 + * @param[in] out_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] out_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] out_step_y out_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] out_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] out_step_z out_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination tensor + */ +__kernel void arithmetic_add_qasymm8( + TENSOR3D_DECLARATION(in1), + TENSOR3D_DECLARATION(in2), + TENSOR3D_DECLARATION(out)) +{ + // Get pixels pointer + Tensor3D in1 = CONVERT_TO_TENSOR3D_STRUCT(in1); + Tensor3D in2 = CONVERT_TO_TENSOR3D_STRUCT(in2); + Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out); + + // Load data + VEC_DATA_TYPE(int, 16) + in1_data = CONVERT(vload16(0, (__global DATA_TYPE_IN1 *)in1.ptr), VEC_DATA_TYPE(int, 16)); + VEC_DATA_TYPE(int, 16) + in2_data = CONVERT(vload16(0, (__global DATA_TYPE_IN2 *)in2.ptr), VEC_DATA_TYPE(int, 16)); + 
+ // Get scaled value of two inputs + VEC_DATA_TYPE(int, 16) in1_val = in1_data + (VEC_DATA_TYPE(int, 16))(IN1_OFFSET); + VEC_DATA_TYPE(int, 16) in2_val = in2_data + (VEC_DATA_TYPE(int, 16))(IN2_OFFSET); + + VEC_DATA_TYPE(int, 16) left_shift = (VEC_DATA_TYPE(int, 16))1 << (VEC_DATA_TYPE(int, 16))(LEFT_SHIFT); + VEC_DATA_TYPE(int, 16) shifted_in1_val = in1_val * left_shift; + VEC_DATA_TYPE(int, 16) shifted_in2_val = in2_val * left_shift; + + VEC_DATA_TYPE(int, 16) scaled_in1_val = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(shifted_in1_val, IN1_MULT_INT, IN1_SHIFT, 16); + VEC_DATA_TYPE(int, 16) scaled_in2_val = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(shifted_in2_val, IN2_MULT_INT, IN2_SHIFT, 16); + + // Add inputs and multiply with a multiplier smaller than 1 + VEC_DATA_TYPE(int, 16) sum_val = scaled_in1_val + scaled_in2_val; + VEC_DATA_TYPE(int, 16) out_val = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(sum_val, RESULT_MULT_INT, RESULT_SHIFT, 16); + out_val += (VEC_DATA_TYPE(int, 16))(RESULT_OFFSET); + + VEC_DATA_TYPE(uchar, 16) res = CONVERT(out_val, VEC_DATA_TYPE(uchar, 16)); + +// TODO: Apply min-max BOUND to support fuse with relu. +/* +#if defined(MIN_BOUND) + res = max(res, (uchar16)MIN_BOUND); +#endif // defined(MIN_BOUND) +#if defined(MAX_BOUND) + res = min(res, (uchar16)MAX_BOUND); +#endif // defined(MAX_BOUND) +*/ + + // Store result + VSTORE(16)(CONVERT(res, VEC_DATA_TYPE(DATA_TYPE_OUT, 16)), + 0, (__global DATA_TYPE_OUT *)out.ptr); +} diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/cast.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/cast.cl new file mode 100644 index 000000000..113804cca --- /dev/null +++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/cast.cl @@ -0,0 +1,148 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2017 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "helpers.h" + +#ifndef SCALE_IN +#define SCALE_IN 1.0f +#endif +#ifndef OFFSET_IN +#define OFFSET_IN 0 +#endif + +/** Perform a cast operation on an input tensor. + * + * @attention Data type can be passed using the -DDATA_TYPE_IN compile flag, e.g. -DDATA_TYPE_IN=float + * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16 + * + * @param[in] input_ptr Pointer to the source image. 
Supported data types: F16/F32 + * @param[in] input_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image + * @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p input_ptr + * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image + */ +__kernel void cast( + TENSOR3D_DECLARATION(input), + TENSOR3D_DECLARATION(output)) +{ + Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input); + Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); + + VSTORE(VEC_SIZE)(CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_IN *)input.ptr), + VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)), + 0, (__global DATA_TYPE_OUT *)output.ptr); +} + + +/** Perform a cast operation on an QASYMM8 input tensor. 
+ * + * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16 + * + * @param[in] input_ptr Pointer to the source image. Supported data types: F16/F32 + * @param[in] input_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image + * @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p input_ptr + * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image + */ +__kernel void cast_qasymm_in( + TENSOR3D_DECLARATION(input), + TENSOR3D_DECLARATION(output)) +{ + Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input); + Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); + + VEC_DATA_TYPE(DATA_TYPE_IN, VEC_SIZE) in_data = + VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_IN *)input.ptr); + 
VEC_DATA_TYPE(int, VEC_SIZE) offset = (VEC_DATA_TYPE(int, VEC_SIZE))(OFFSET_IN); + VEC_DATA_TYPE(float, VEC_SIZE) scale = (VEC_DATA_TYPE(float, VEC_SIZE))(SCALE_IN); + + VEC_DATA_TYPE(int, VEC_SIZE) tmp = CONVERT(in_data, VEC_DATA_TYPE(int, VEC_SIZE)) - offset; + VEC_DATA_TYPE(float, VEC_SIZE) out_data = CONVERT(tmp, VEC_DATA_TYPE(float, VEC_SIZE)) * scale; + + VSTORE(VEC_SIZE)(CONVERT(out_data, VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)), + 0, (__global DATA_TYPE_OUT *)output.ptr); +} + + +/** Perform a cast operation on an QASYMM8 output tensor. + * + * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16 + * + * @param[in] input_ptr Pointer to the source image. Supported data types: F16/F32 + * @param[in] input_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image + * @param[out] output_ptr Pointer to the destination image. 
Supported data types: U8 + * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image + */ +__kernel void cast_qasymm_out( + TENSOR3D_DECLARATION(input), + TENSOR3D_DECLARATION(output)) +{ + Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input); + Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); + + VEC_DATA_TYPE(DATA_TYPE_IN, VEC_SIZE) in_data = + VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_IN *)input.ptr); + VEC_DATA_TYPE(int, VEC_SIZE) offset = (VEC_DATA_TYPE(int, VEC_SIZE))(OFFSET_IN); + VEC_DATA_TYPE(float, VEC_SIZE) scale = (VEC_DATA_TYPE(float, VEC_SIZE))(SCALE_IN); + + VEC_DATA_TYPE(float, VEC_SIZE) tmp = CONVERT(in_data, VEC_DATA_TYPE(float, VEC_SIZE)) / scale; + VEC_DATA_TYPE(float, VEC_SIZE) out_data = tmp + CONVERT(offset, VEC_DATA_TYPE(float, VEC_SIZE)); + + VSTORE(VEC_SIZE)(CONVERT(out_data, VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)), + 0, (__global DATA_TYPE_OUT *)output.ptr); +} diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/fixed_point.h b/libs/ARMComputeEx/src/core/CL/cl_kernels/fixed_point.h new file mode 100644 index 000000000..7807533e2 --- /dev/null +++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/fixed_point.h @@ -0,0 +1,565 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2017-2018 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
 */
#ifndef ARM_COMPUTE_FIXED_POINT_H
#define ARM_COMPUTE_FIXED_POINT_H

/* Declares the scalar alias plus the x1/x2/x3/x4/x8/x16 vector aliases for a
 * fixed point storage type, e.g. TYPE_ALIAS(char, qs8) -> qs8, qs8x2 ... qs8x16. */
#define TYPE_ALIAS(type, alias) \
  typedef type alias;           \
  typedef type alias##x##1;     \
  typedef type##2 alias##x##2;  \
  typedef type##3 alias##x##3;  \
  typedef type##4 alias##x##4;  \
  typedef type##8 alias##x##8;  \
  typedef type##16 alias##x##16;

TYPE_ALIAS(char, qs8)
TYPE_ALIAS(short, qs16)
TYPE_ALIAS(int, qs32)

/* Numeric limits of each signed fixed point storage type. */
#define qs8_MIN ((char)CHAR_MIN)
#define qs8_MAX ((char)CHAR_MAX)
#define qs16_MIN ((short)SHRT_MIN)
#define qs16_MAX ((short)SHRT_MAX)
#define qs32_MIN ((int)INT_MIN)
#define qs32_MAX ((int)INT_MAX)

/* Numeric limits of each unsigned fixed point storage type. */
#define qu8_MIN ((uchar)0)
#define qu8_MAX ((uchar)UCHAR_MAX)
#define qu16_MIN ((ushort)0)
#define qu16_MAX ((ushort)USHRT_MAX)
#define qu32_MIN ((uint)0)
#define qu32_MAX ((uint)UINT_MAX)

/* Maps every qsN[xM] alias back to the built-in OpenCL type it stands for.
 * These *_TYPE names are token-pasted by the CONVERT/CONVERT_SAT macros below
 * to form the corresponding convert_<type>[_sat] built-in names. */
#define qs8_TYPE char
#define qs8x1_TYPE char
#define qs8x2_TYPE char2
#define qs8x3_TYPE char3
#define qs8x4_TYPE char4
#define qs8x8_TYPE char8
#define qs8x16_TYPE char16

#define qs16_TYPE short
#define qs16x1_TYPE short
#define qs16x2_TYPE short2
#define qs16x3_TYPE short3
#define qs16x4_TYPE short4
#define qs16x8_TYPE short8
#define qs16x16_TYPE short16

#define qs32_TYPE int
#define qs32x1_TYPE int
#define qs32x2_TYPE int2
#define qs32x3_TYPE int3
#define qs32x4_TYPE int4
#define qs32x8_TYPE int8
#define qs32x16_TYPE int16

/* All internal constants are represented in the maximum supported fixed point format (QS16),
 * thus we define an additional shift parameter required to convert the constant
 * from the maximum supported format to the required one.
+ */ +#define qs8_SHIFT 8 +#define qs16_SHIFT 0 + +#undef VEC_DATA_TYPE_STR +#undef VEC_DATA_TYPE +#undef CONVERT_STR +#undef CONVERT +#undef CONVERT_SAT_STR +#undef CONVERT_SAT + +#define VEC_DATA_TYPE_STR(type, size) type##x##size +#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size) + +#define CONVERT_STR3(x, type, rtype) (convert_##rtype((x))) +#define CONVERT_STR2(x, type, rtype) CONVERT_STR3(x, type, rtype) +#define CONVERT_STR(x, type) CONVERT_STR2(x, type, type##_TYPE) +#define CONVERT(x, type) CONVERT_STR(x, type) + +#define CONVERT_SAT_STR3(x, type, rtype) (convert_##rtype##_sat((x))) +#define CONVERT_SAT_STR2(x, type, rtype) CONVERT_SAT_STR3(x, type, rtype) +#define CONVERT_SAT_STR(x, type) CONVERT_SAT_STR2(x, type, type##_TYPE) +#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type) + +/** Computes saturating absolute value of fixed point vector. + * + * @param[in] type the actual data type. + * + * @return The result of the fixed point absolute value. + */ +#define ABSQ_SAT_IMPL(type) \ + inline type abs_##type##_sat(type VopA) { return CONVERT_SAT(abs(VopA), type); } + +ABSQ_SAT_IMPL(qs8x16) +ABSQ_SAT_IMPL(qs16x8) + +#define ABS_SAT_OP_EXPAND_STR(a, type, size) abs_##type##x##size##_sat((a)) +#define ABS_SAT_OP_EXPAND(a, type, size) ABS_SAT_OP_EXPAND_STR(a, type, size) + +/** Computes max of fixed point types. + * + * @param[in] type the actual data type. + * + * @return The result of the fixed point maximum. + */ +#define MAXQ_IMPL(type) \ + inline type max_##type(type VopA, type VopB) { return max(VopA, VopB); } + +MAXQ_IMPL(qs8x1) +MAXQ_IMPL(qs8x2) +MAXQ_IMPL(qs8x4) +MAXQ_IMPL(qs8x8) +MAXQ_IMPL(qs8x16) +MAXQ_IMPL(qs16x1) +MAXQ_IMPL(qs16x2) +MAXQ_IMPL(qs16x4) +MAXQ_IMPL(qs16x8) +MAXQ_IMPL(qs16x16) + +#define MAX_OP_EXPAND_STR(a, b, type, size) max_##type##x##size((a), (b)) +#define MAX_OP_EXPAND(a, b, type, size) MAX_OP_EXPAND_STR(a, b, type, size) + +/** Computes saturated addition of fixed point types. 
+ * + * @param[in] type the actual data type. + * + * @return The result of the fixed point addition. The result is saturated in case of overflow + */ +#define ADDQ_SAT_IMPL(type) \ + inline type add_sat_##type(type VopA, type VopB) { return add_sat(VopA, VopB); } + +ADDQ_SAT_IMPL(qs8x1) +ADDQ_SAT_IMPL(qs8x2) +ADDQ_SAT_IMPL(qs8x4) +ADDQ_SAT_IMPL(qs8x8) +ADDQ_SAT_IMPL(qs8x16) +ADDQ_SAT_IMPL(qs16x1) +ADDQ_SAT_IMPL(qs16x2) +ADDQ_SAT_IMPL(qs16x4) +ADDQ_SAT_IMPL(qs16x8) +ADDQ_SAT_IMPL(qs16x16) +ADDQ_SAT_IMPL(qs32x1) +ADDQ_SAT_IMPL(qs32x2) +ADDQ_SAT_IMPL(qs32x4) +ADDQ_SAT_IMPL(qs32x8) +ADDQ_SAT_IMPL(qs32x16) + +#define ADD_SAT_OP_EXPAND_STR(a, b, type, size) add_sat_##type##x##size((a), (b)) +#define ADD_SAT_OP_EXPAND(a, b, type, size) ADD_SAT_OP_EXPAND_STR(a, b, type, size) + +/** Computes saturated subtraction of fixed point types. + * + * @param[in] type the actual data type. + * + * @return The result of the fixed point subtraction. The result is saturated in case of overflow + */ +#define SUBQ_SAT_IMPL(type) \ + inline type sub_sat_##type(type VopA, type VopB) { return sub_sat(VopA, VopB); } + +SUBQ_SAT_IMPL(qs8x1) +SUBQ_SAT_IMPL(qs8x2) +SUBQ_SAT_IMPL(qs8x4) +SUBQ_SAT_IMPL(qs8x8) +SUBQ_SAT_IMPL(qs8x16) +SUBQ_SAT_IMPL(qs16x1) +SUBQ_SAT_IMPL(qs16x2) +SUBQ_SAT_IMPL(qs16x4) +SUBQ_SAT_IMPL(qs16x8) +SUBQ_SAT_IMPL(qs16x16) + +#define SUB_SAT_OP_EXPAND_STR(a, b, type, size) sub_sat_##type##x##size((a), (b)) +#define SUB_SAT_OP_EXPAND(a, b, type, size) SUB_SAT_OP_EXPAND_STR(a, b, type, size) + +/* Multiply of two fixed point numbers + * + * @param[in] type the actual data type. + * @param[in] itype the intermediate data type. + * + * @return The result of the fixed point multiplication. 
+ */ +#define MULQ_IMPL(type, itype) \ + inline type mul_##type(type VopA, type VopB, int fixed_point_position) \ + { \ + itype round_val = (itype)(1 << (fixed_point_position - 1)); \ + itype res = CONVERT((VopA), itype) * CONVERT((VopB), itype) + round_val; \ + return CONVERT((res >> (itype)fixed_point_position), type); \ + } + +MULQ_IMPL(qs8x8, qs16x8) +MULQ_IMPL(qs16x8, qs32x8) +MULQ_IMPL(qs8x16, qs16x16) +MULQ_IMPL(qs16x16, qs32x16) + +#define MUL_OP_EXPAND_STR(a, b, type, size, position) mul_##type##x##size((a), (b), (position)) +#define MUL_OP_EXPAND(a, b, type, size, position) MUL_OP_EXPAND_STR(a, b, type, size, position) + +/* Saturate multiply of two fixed point numbers + * + * @param[in] type the actual data type. + * @param[in] itype the intermediate data type. + * + * @return The result of the fixed point multiplication. The result is saturated in case of overflow + */ +#define MULQ_SAT_IMPL(type, itype) \ + inline type mul_sat_##type(type VopA, type VopB, int fixed_point_position) \ + { \ + itype round_val = (itype)(1 << (fixed_point_position - 1)); \ + itype res = mad_sat(CONVERT((VopA), itype), CONVERT((VopB), itype), round_val); \ + return CONVERT_SAT((res >> (itype)fixed_point_position), type); \ + } + +MULQ_SAT_IMPL(qs8x1, qs16x1) +MULQ_SAT_IMPL(qs8x2, qs16x2) +MULQ_SAT_IMPL(qs8x3, qs16x3) +MULQ_SAT_IMPL(qs8x4, qs16x4) +MULQ_SAT_IMPL(qs8x8, qs16x8) +MULQ_SAT_IMPL(qs8x16, qs16x16) +MULQ_SAT_IMPL(qs16x1, qs32x1) +MULQ_SAT_IMPL(qs16x2, qs32x2) +MULQ_SAT_IMPL(qs16x3, qs32x3) +MULQ_SAT_IMPL(qs16x4, qs32x4) +MULQ_SAT_IMPL(qs16x8, qs32x8) +MULQ_SAT_IMPL(qs16x16, qs32x16) + +#define MUL_SAT_OP_EXPAND_STR(a, b, type, size, position) \ + mul_sat_##type##x##size((a), (b), (position)) +#define MUL_SAT_OP_EXPAND(a, b, type, size, position) \ + MUL_SAT_OP_EXPAND_STR(a, b, type, size, position) + +/** Saturate multiply-accumulate + * + * @param[in] type the actual data type. + * @param[in] itype the intermediate data type. 
+ * + * @return The result of the fixed point multiply-accumulate. The result is saturated in case of + * overflow + */ +#define MLAQ_SAT_IMPL(type, itype) \ + type mla_sat_##type(type VopA, type VopB, type VopC, int fixed_point_position) \ + { \ + itype res = mad_sat(CONVERT(VopB, itype), CONVERT(VopC, itype), \ + (itype)(1 << (fixed_point_position - 1))); \ + return add_sat(VopA, CONVERT_SAT(res >> (itype)fixed_point_position, type)); \ + } + +MLAQ_SAT_IMPL(qs8x8, qs16x8) +MLAQ_SAT_IMPL(qs8x16, qs16x16) +MLAQ_SAT_IMPL(qs16x8, qs32x8) + +#define MLA_SAT_OP_EXPAND_STR(a, b, c, type, size, position) \ + mla_sat_##type##x##size((a), (b), (c), (position)) +#define MLA_SAT_OP_EXPAND(a, b, c, type, size, position) \ + MLA_SAT_OP_EXPAND_STR(a, b, c, type, size, position) + +/** Saturate multiply-accumulate long + * + * @param[in] type the actual data type. + * @param[in] itype the intermediate data type. + * + * @return The result of the fixed point multiply-accumulate long. The result is saturated in case + * of overflow + */ +#define MLALQ_SAT_IMPL(type, itype) \ + itype mlal_sat_##type(itype VopA, type VopB, type VopC, int fixed_point_position) \ + { \ + itype res = mad_sat(CONVERT(VopB, itype), CONVERT(VopC, itype), \ + (itype)(1 << (fixed_point_position - 1))); \ + return add_sat(VopA, res >> (itype)fixed_point_position); \ + } + +MLALQ_SAT_IMPL(qs8x8, qs16x8) +MLALQ_SAT_IMPL(qs16x8, qs32x8) + +#define MLAL_SAT_OP_EXPAND_STR(a, b, c, type, size, position) \ + mlal_sat_##type##x##size((a), (b), (c), (position)) +#define MLAL_SAT_OP_EXPAND(a, b, c, type, size, position) \ + MLAL_SAT_OP_EXPAND_STR(a, b, c, type, size, position) + +/** Saturate division of two fixed point vectors + * + * @param[in] stype the actual scalar data type. + * @param[in] type the actual data type. + * @param[in] itype the intermediate data type. + * + * @return The result of the fixed point division. 
The result is saturated in case of overflow + */ +#define DIVQ_SAT_IMPL(stype, type, itype) \ + inline type div_sat_##type(type VopA, type VopB, int fixed_point_position) \ + { \ + itype conv_a = CONVERT((VopA), itype); \ + itype denominator = CONVERT((VopB), itype); \ + itype numerator = conv_a << (itype)(fixed_point_position); \ + itype res = select((itype)(numerator / denominator), \ + select((itype)stype##_MAX, (itype)stype##_MIN, (itype)(conv_a < (itype)0)), \ + (itype)(denominator == (itype)0)); \ + return CONVERT_SAT((res), type); \ + } + +DIVQ_SAT_IMPL(qs8, qs8x16, qs16x16) +DIVQ_SAT_IMPL(qs16, qs16x8, qs32x8) +DIVQ_SAT_IMPL(qs16, qs16x16, qs32x16) +DIVQ_SAT_IMPL(qs8, qs8, qs16) +DIVQ_SAT_IMPL(qs16, qs16, qs32) + +#define DIV_SAT_OP_EXPAND_STR(a, b, type, position) div_sat_##type((a), (b), (position)) +#define DIV_SAT_OP_EXPAND(a, b, type, position) DIV_SAT_OP_EXPAND_STR(a, b, type, position) + +#define DIV_SAT_OP_VEC_EXPAND_STR(a, b, type, size, position) \ + div_sat_##type##x##size((a), (b), (position)) +#define DIV_SAT_OP_VEC_EXPAND(a, b, type, size, position) \ + DIV_SAT_OP_VEC_EXPAND_STR(a, b, type, size, position) + +/** Saturate exponential of a fixed point vector + * + * @note Implemented approach uses taylor polynomial to approximate the exponential function. + * + * @param[in] stype the actual scalar data type. + * @param[in] type the actual data type. + * @param[in] size the number of the calculated elements. + * + * @return The result of the fixed point exponential. 
The result is saturated in case of overflow + */ +#define EXPQ_IMPL(stype, type, size) \ + inline type exp_sat_##type(type VopA, int fixed_point_position) \ + { \ + type const_one = (type)(1 << (fixed_point_position)); \ + type ln2 = (type)((((0x58B9 >> (14 - fixed_point_position))) + 1) >> 1); \ + type inv_ln2 = (type)((((0x38AA >> (14 - fixed_point_position)) + 1) >> 1)) | const_one; \ + type A = (type)(((0x7FBA >> (14 - fixed_point_position)) + 1) >> 1); \ + type B = (type)(((0x3FE9 >> (14 - fixed_point_position)) + 1) >> 1); \ + type C = (type)(((0x1693 >> (14 - fixed_point_position)) + 1) >> 1); \ + type D = (type)(((0x0592 >> (14 - fixed_point_position)) + 1) >> 1); \ + type m = MUL_SAT_OP_EXPAND(VopA, inv_ln2, stype, size, fixed_point_position); \ + type dec_m = m >> (type)fixed_point_position; \ + type alpha = MUL_SAT_OP_EXPAND(dec_m << (type)fixed_point_position, ln2, stype, size, \ + fixed_point_position); \ + alpha = CONVERT(abs_diff(VopA, alpha), type); \ + type sum = add_sat(MUL_SAT_OP_EXPAND(alpha, D, stype, size, fixed_point_position), C); \ + sum = add_sat(MUL_SAT_OP_EXPAND(alpha, sum, stype, size, fixed_point_position), B); \ + sum = add_sat(MUL_SAT_OP_EXPAND(alpha, sum, stype, size, fixed_point_position), A); \ + sum = add_sat(MUL_SAT_OP_EXPAND(alpha, sum, stype, size, fixed_point_position), const_one); \ + return select((type)stype##_MAX, select(sum << dec_m, sum >> -dec_m, dec_m < (type)0), \ + clz(sum) > dec_m); /* Saturate result if needed */ \ + } + +EXPQ_IMPL(qs8, qs8x2, 2) +EXPQ_IMPL(qs8, qs8x4, 4) +EXPQ_IMPL(qs8, qs8x8, 8) +EXPQ_IMPL(qs8, qs8x16, 16) +EXPQ_IMPL(qs16, qs16x2, 2) +EXPQ_IMPL(qs16, qs16x4, 4) +EXPQ_IMPL(qs16, qs16x8, 8) +EXPQ_IMPL(qs16, qs16x16, 16) + +#define EXP_OP_EXPAND_STR(a, type, size, position) exp_sat_##type##x##size((a), (position)) +#define EXP_OP_EXPAND(a, type, size, position) EXP_OP_EXPAND_STR(a, type, size, position) + +/** Saturate logarithm of a fixed point vector + * + * @note Implemented approach uses taylor 
polynomial to approximate the logarithm function. + * + * @param[in] stype the actual scalar data type. + * @param[in] type the actual data type. + * @param[in] size the number of the calculated elements. + * + * @return The result of the fixed point logarithm. The result is saturated in case of overflow + */ +#define LOGQ_IMPL(stype, type, size) \ + inline type log_sat_##type(type VopA, int fixed_point_position) \ + { \ + type const_one = (type)(1 << (fixed_point_position)); \ + type ln2 = (type)(0x58B9 >> (15 - fixed_point_position)); /* 1.4384189 */ \ + type A = (type)(0x5C0F >> (14 - fixed_point_position)); /* 1.4384189 */ \ + type B = -(type)(0x56AE >> (15 - fixed_point_position)); /* -0.6771900 */ \ + type C = (type)(0x2933 >> (15 - fixed_point_position)); /* 0.3218538 */ \ + type D = -(type)(0x0AA7 >> (15 - fixed_point_position)); /* -0.0832229 */ \ + type inter_a = \ + select(VopA, DIV_SAT_OP_VEC_EXPAND(const_one, VopA, stype, size, fixed_point_position), \ + VopA < const_one); \ + type shift_val = (type)(15 - stype##_SHIFT) - clz(inter_a >> (type)fixed_point_position); \ + inter_a = inter_a >> shift_val; \ + inter_a = sub_sat(inter_a, const_one); \ + type sum = add_sat(MUL_SAT_OP_EXPAND(inter_a, D, stype, size, fixed_point_position), C); \ + sum = add_sat(MUL_SAT_OP_EXPAND(inter_a, sum, stype, size, fixed_point_position), B); \ + sum = add_sat(MUL_SAT_OP_EXPAND(inter_a, sum, stype, size, fixed_point_position), A); \ + sum = MUL_SAT_OP_EXPAND(inter_a, sum, stype, size, fixed_point_position); \ + sum = MUL_SAT_OP_EXPAND(add_sat(sum, shift_val << (type)fixed_point_position), ln2, stype, \ + size, fixed_point_position); \ + return select(select(sum, -sum, VopA < const_one), (type)0, \ + VopA < (type)0); /* Saturate result if needed */ \ + } + +LOGQ_IMPL(qs8, qs8x16, 16) +LOGQ_IMPL(qs16, qs16x8, 8) +LOGQ_IMPL(qs16, qs16x16, 16) + +#define LOG_OP_EXPAND_STR(a, type, size, position) log_sat_##type##x##size((a), (position)) +#define LOG_OP_EXPAND(a, type, size, 
position) LOG_OP_EXPAND_STR(a, type, size, position) + +/** Saturate inverse square root of a fixed point vector + * + * @note Implemented approach uses Newton's method to approximate the inverse square root function. + * + * @param[in] stype the actual scalar data type. + * @param[in] type the actual data type. + * @param[in] size the number of the calculated elements. + * + * @return The result of the fixed point inverse square root. The result is saturated in case of + * overflow + */ +#define INVSQRTQ_IMPL(stype, type, size) \ + inline type invsqrt_sat_##type(type VopA, int fixed_point_position) \ + { \ + type const_three = (type)(3 << (fixed_point_position)); \ + type shift_value = (type)(16 - stype##_SHIFT) - (clz(VopA) + (type)fixed_point_position); \ + type temp = select((type)(VopA >> shift_value), \ + select((type)stype##_MAX, (type)(VopA << (-shift_value)), \ + (type)(clz(VopA) > (-shift_value))), \ + (type)(shift_value < (type)0)); \ + type x = temp; \ + x = MUL_SAT_OP_EXPAND( \ + x, sub_sat(const_three, MUL_SAT_OP_EXPAND(MUL_SAT_OP_EXPAND(x, x, stype, size, \ + fixed_point_position), \ + temp, stype, size, fixed_point_position)), \ + stype, size, fixed_point_position) >> \ + 1; \ + x = MUL_SAT_OP_EXPAND( \ + x, sub_sat(const_three, MUL_SAT_OP_EXPAND(MUL_SAT_OP_EXPAND(x, x, stype, size, \ + fixed_point_position), \ + temp, stype, size, fixed_point_position)), \ + stype, size, fixed_point_position) >> \ + 1; \ + x = MUL_SAT_OP_EXPAND( \ + x, sub_sat(const_three, MUL_SAT_OP_EXPAND(MUL_SAT_OP_EXPAND(x, x, stype, size, \ + fixed_point_position), \ + temp, stype, size, fixed_point_position)), \ + stype, size, fixed_point_position) >> \ + 1; \ + if (sizeof((stype)(1)) > 1) /* Perform more iterations if datatype is QS16 */ \ + { \ + x = MUL_SAT_OP_EXPAND( \ + x, sub_sat(const_three, MUL_SAT_OP_EXPAND(MUL_SAT_OP_EXPAND(x, x, stype, size, \ + fixed_point_position), \ + temp, stype, size, fixed_point_position)), \ + stype, size, fixed_point_position) >> \ + 1; \ 
+ x = MUL_SAT_OP_EXPAND( \ + x, sub_sat(const_three, MUL_SAT_OP_EXPAND(MUL_SAT_OP_EXPAND(x, x, stype, size, \ + fixed_point_position), \ + temp, stype, size, fixed_point_position)), \ + stype, size, fixed_point_position) >> \ + 1; \ + } \ + type shift_value2 = select(shift_value >> 1, (-shift_value) >> 1, shift_value < (type)0); \ + return select((type)(x >> shift_value2), select((type)stype##_MAX, (type)(x << shift_value2), \ + (type)(clz(x) > shift_value2)), \ + (type)(shift_value < (type)0)); /* Saturate result if needed */ \ + } + +INVSQRTQ_IMPL(qs8, qs8x1, 1) +INVSQRTQ_IMPL(qs16, qs16x1, 1) +INVSQRTQ_IMPL(qs8, qs8x16, 16) +INVSQRTQ_IMPL(qs16, qs16x8, 8) + +#define INVSQRT_OP_EXPAND_STR(a, type, size, position) invsqrt_sat_##type##x##size((a), (position)) +#define INVSQRT_OP_EXPAND(a, type, size, position) INVSQRT_OP_EXPAND_STR(a, type, size, position) + +/** Saturate hyperbolic tangent of a fixed point vector + * + * tanh(x) = (e^2x - 1)/(e^2x + 1) + * + * @param[in] stype the actual scalar data type. + * @param[in] type the actual data type. + * @param[in] size the number of the calculated elements. + * + * @return The result of the fixed point hyperbolic tangent. 
The result is saturated in case of + * overflow + */ +#define TANHQ_IMPL(stype, type, size) \ + inline type tanh_sat_##type(type VopA, int fixed_point_position) \ + { \ + type const_one = (type)(1 << (fixed_point_position)); \ + type const_two = (type)(2 << (fixed_point_position)); \ + type exp2x = \ + EXP_OP_EXPAND(MUL_SAT_OP_EXPAND(const_two, VopA, stype, size, fixed_point_position), \ + stype, size, fixed_point_position); \ + type num = SUB_SAT_OP_EXPAND(exp2x, const_one, stype, size); \ + type den = ADD_SAT_OP_EXPAND(exp2x, const_one, stype, size); \ + return DIV_SAT_OP_VEC_EXPAND(num, den, stype, size, fixed_point_position); \ + } + +TANHQ_IMPL(qs8, qs8x16, 16) +TANHQ_IMPL(qs16, qs16x8, 8) + +#define TANH_OP_EXPAND_STR(a, type, size, position) tanh_sat_##type##x##size((a), (position)) +#define TANH_OP_EXPAND(a, type, size, position) TANH_OP_EXPAND_STR(a, type, size, position) + +#define floatx16 float16 +#define float16_TYPE float16 + +#define CONVERTQ_DOWN_IMPL(in_type, out_type) \ + inline out_type convert_##out_type##_##in_type(in_type a, int fixed_point_position) \ + { \ + return CONVERT(a * (1 << fixed_point_position) + \ + select((in_type)-0.5f, (in_type)0.5f, isgreater(a, (in_type)0)), \ + out_type); \ + } + +CONVERTQ_DOWN_IMPL(float16, qs8x16) +CONVERTQ_DOWN_IMPL(float16, qs16x16) + +#define CONVERTQ_DOWN_SAT_IMPL(in_type, out_type) \ + inline out_type convert_##out_type##_##in_type##_sat(in_type a, int fixed_point_position) \ + { \ + return CONVERT_SAT(a * (1 << fixed_point_position) + \ + select((in_type)-0.5f, (in_type)0.5f, isgreater(a, (in_type)0)), \ + out_type); \ + } + +CONVERTQ_DOWN_SAT_IMPL(float16, qs8x16) +CONVERTQ_DOWN_SAT_IMPL(float16, qs16x16) + +#define CONVERTQ_UP_IMPL(in_type, out_type) \ + inline out_type convert_##out_type##_##in_type(in_type a, int fixed_point_position) \ + { \ + return CONVERT(a, out_type) / (1 << fixed_point_position); \ + } + +CONVERTQ_UP_IMPL(qs8x16, float16) +CONVERTQ_UP_IMPL(qs16x16, float16) + +#define 
SQCVT_SAT_IMPL(type) \ + inline type sqcvt_##type##_sat(float a, int fixed_point_position) \ + { \ + return CONVERT_SAT((a * (1 << fixed_point_position) + ((a < 0) ? -0.5f : 0.5f)), type); \ + } + +SQCVT_SAT_IMPL(qs8) +SQCVT_SAT_IMPL(qs16) + +#define SQCVT_SAT_OP_EXPAND_STR(a, type, position) sqcvt_##type##_sat((a), (position)) +#define SQCVT_SAT_OP_EXPAND(a, type, position) SQCVT_SAT_OP_EXPAND_STR((a), type, position) + +#endif // ARM_COMPUTE_FIXED_POINT_H diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/gather.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/gather.cl new file mode 100644 index 000000000..25e20f5f2 --- /dev/null +++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/gather.cl @@ -0,0 +1,106 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "helpers.h" + +/** Perform gather + * + * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short + * + * @param[in] input1_ptr Pointer to the first source tensor. Supported data types: U8/S32/F32 + * @param[in] input1_stride_x Stride of the first source tensor in X dimension (in bytes) + * @param[in] input1_step_x input_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] input1_stride_y Stride of the first source tensor in Y dimension (in bytes) + * @param[in] input1_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] input1_stride_z Stride of the first source tensor in Z dimension (in bytes) + * @param[in] input1_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] input1_offset_first_element_in_bytes The offset of the first element in the first source tensor + * @param[in] input2_ptr Pointer to the first source tensor. Supported data types: U32 + * @param[in] input2_stride_x Stride of the first source tensor in X dimension (in bytes) + * @param[in] input2_step_x input_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] input2_offset_first_element_in_bytes The offset of the first element in the first source tensor + * @param[out] output_ptr Pointer to the destination tensor. 
Supported data types: same as @p input_ptr + * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor + */ +__kernel void gather(IMAGE_DECLARATION(input1), + VECTOR_DECLARATION(input2), + IMAGE_DECLARATION(output)) +{ + Image in1 = CONVERT_TO_IMAGE_STRUCT_NO_STEP(input1); + Vector in2 = CONVERT_TO_VECTOR_STRUCT(input2); + Image out = CONVERT_TO_IMAGE_STRUCT_NO_STEP(output); + + VEC_DATA_TYPE(DATA_TYPE_IN2, 2) + in2_data = CONVERT(vload2(0, (__global DATA_TYPE_IN2 *)in2.ptr), VEC_DATA_TYPE(DATA_TYPE_IN2, 2)); + + //TODO: performance tuning for memcopy + int index = in2_data.s0; + int stride=input1_stride_y/input1_stride_x; + + for(int i=0; i<stride; i++){ + *((__global DATA_TYPE_OUT *)offset(&out, i,get_global_id(0)))=*((__global DATA_TYPE_IN1 *)offset(&in1, i,index)); + } +} + +__kernel void gather_1d_out(IMAGE_DECLARATION(input1), + VECTOR_DECLARATION(input2), + VECTOR_DECLARATION(output)) +{ + Image in1 = CONVERT_TO_IMAGE_STRUCT_NO_STEP(input1); + Vector in2 = CONVERT_TO_VECTOR_STRUCT(input2); + Vector out = CONVERT_TO_VECTOR_STRUCT_NO_STEP(output); + + VEC_DATA_TYPE(DATA_TYPE_IN2, 2) + in2_data = CONVERT(vload2(0, (__global DATA_TYPE_IN2 *)in2.ptr), VEC_DATA_TYPE(DATA_TYPE_IN2, 2)); + + //TODO: performance tuning for memcopy + int index = in2_data.s0; + int stride=input1_stride_y/input1_stride_x; + + for(int i=0; i<stride; i++){ + 
*((__global DATA_TYPE_OUT *)vector_offset(&out, i+get_global_id(0)))=*((__global DATA_TYPE_IN1 *)offset(&in1, i, index)); + } +} + +__kernel void gather_1d(VECTOR_DECLARATION(input1), + VECTOR_DECLARATION(input2), + VECTOR_DECLARATION(output)) +{ + Vector in1 = CONVERT_TO_VECTOR_STRUCT_NO_STEP(input1); + Vector in2 = CONVERT_TO_VECTOR_STRUCT(input2); + Vector out = CONVERT_TO_VECTOR_STRUCT_NO_STEP(output); + + VEC_DATA_TYPE(DATA_TYPE_IN2, 2) + in2_data = CONVERT(vload2(0, (__global DATA_TYPE_IN2 *)in2.ptr), VEC_DATA_TYPE(DATA_TYPE_IN2, 2)); + + //TODO: performance tuning for memcopy + int index = in2_data.s0; + *((__global DATA_TYPE_OUT *)vector_offset(&out, get_global_id(0)))=*((__global DATA_TYPE_IN1 *)vector_offset(&in1, index)); +} diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/helpers.h b/libs/ARMComputeEx/src/core/CL/cl_kernels/helpers.h new file mode 100644 index 000000000..8143d2398 --- /dev/null +++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/helpers.h @@ -0,0 +1,344 @@ +/* + * Copyright (c) 2016-2018 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_HELPER_H +#define ARM_COMPUTE_HELPER_H + +#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) +#pragma OPENCL EXTENSION cl_khr_fp16 : enable +#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) + +#if defined(ARM_COMPUTE_DEBUG_ENABLED) +#if defined(cl_arm_printf) +#pragma OPENCL EXTENSION cl_arm_printf : enable +#endif // defined(cl_arm_printf) +#endif // defined(ARM_COMPUTE_DEBUG_ENABLED) + +#define EXPAND(x) x + +#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val) + +#define VLOAD_STR(size) vload##size +#define VLOAD(size) VLOAD_STR(size) + +#define VSTORE_STR(size) vstore##size +#define VSTORE(size) VSTORE_STR(size) + +#define VEC_DATA_TYPE_STR(type, size) type##size +#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size) + +#define CL_VEC_DATA_TYPE_STR(type, size) type##size +#define CL_VEC_DATA_TYPE(type, size) CL_VEC_DATA_TYPE_STR(type, size) + +#define CONVERT_STR(x, type) (convert_##type((x))) +#define CONVERT(x, type) CONVERT_STR(x, type) + +#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x))) +#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type) + +#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x))) +#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round) + +#define VECTOR_DECLARATION(name) \ + __global uchar *name##_ptr, uint name##_stride_x, uint name##_step_x, \ + uint name##_offset_first_element_in_bytes + +#define IMAGE_DECLARATION(name) \ + __global uchar *name##_ptr, uint name##_stride_x, uint name##_step_x, uint name##_stride_y, \ + uint name##_step_y, uint name##_offset_first_element_in_bytes + +#define TENSOR3D_DECLARATION(name) \ + __global uchar *name##_ptr, uint 
name##_stride_x, uint name##_step_x, uint name##_stride_y, \ + uint name##_step_y, uint name##_stride_z, uint name##_step_z, \ + uint name##_offset_first_element_in_bytes + +#define TENSOR4D_DECLARATION(name) \ + __global uchar *name##_ptr, uint name##_stride_x, uint name##_step_x, uint name##_stride_y, \ + uint name##_step_y, uint name##_stride_z, uint name##_step_z, uint name##_stride_w, \ + uint name##_step_w, uint name##_offset_first_element_in_bytes + +#define CONVERT_TO_VECTOR_STRUCT(name) \ + update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, \ + name##_step_x) + +#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \ + update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0) + +#define CONVERT_TO_IMAGE_STRUCT(name) \ + update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, \ + name##_step_x, name##_stride_y, name##_step_y) + +#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \ + update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, \ + name##_stride_y, 0) + +#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \ + update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, \ + name##_stride_x, name##_step_x, name##_stride_y, \ + name##_step_y, name##_stride_z, name##_step_z) + +#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \ + update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, \ + name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, \ + name##_step_z) + +#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \ + update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, \ + name##_stride_x, name##_step_x, name##_stride_y, \ + name##_step_y, name##_stride_z, name##_step_z) + +#define CONVERT_TO_TENSOR3D_STRUCT(name) \ + update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, 
name##_stride_x, \ + name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, \ + name##_step_z) + +#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \ + update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, \ + 0, name##_stride_y, 0, name##_stride_z, 0) + +#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size) \ + update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, \ + name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, \ + name##_step_z, name##_stride_w, name##_step_w, mod_size) + +#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \ + update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, \ + 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, \ + mod_size) + +/** Structure to hold Vector information */ +typedef struct Vector +{ + __global uchar *ptr; /**< Pointer to the starting postion of the buffer */ + int offset_first_element_in_bytes; /**< The offset of the first element in the source image */ + int stride_x; /**< Stride of the image in X dimension (in bytes) */ +} Vector; + +/** Structure to hold Image information */ +typedef struct Image +{ + __global uchar *ptr; /**< Pointer to the starting postion of the buffer */ + int offset_first_element_in_bytes; /**< The offset of the first element in the source image */ + int stride_x; /**< Stride of the image in X dimension (in bytes) */ + int stride_y; /**< Stride of the image in Y dimension (in bytes) */ +} Image; + +/** Structure to hold 3D tensor information */ +typedef struct Tensor3D +{ + __global uchar *ptr; /**< Pointer to the starting postion of the buffer */ + int offset_first_element_in_bytes; /**< The offset of the first element in the source image */ + int stride_x; /**< Stride of the image in X dimension (in bytes) */ + int stride_y; /**< Stride of the image in Y dimension (in bytes) */ + int stride_z; /**< Stride of the image in Z dimension 
(in bytes) */ +} Tensor3D; + +/** Structure to hold 4D tensor information */ +typedef struct Tensor4D +{ + __global uchar *ptr; /**< Pointer to the starting postion of the buffer */ + int offset_first_element_in_bytes; /**< The offset of the first element in the source image */ + int stride_x; /**< Stride of the image in X dimension (in bytes) */ + int stride_y; /**< Stride of the image in Y dimension (in bytes) */ + int stride_z; /**< Stride of the image in Z dimension (in bytes) */ + int stride_w; /**< Stride of the image in W dimension (in bytes) */ +} Tensor4D; + +/** Wrap vector information into an Vector structure, and make the pointer point at this workitem's + * data. + * + * @param[in] ptr Pointer to the starting postion of the buffer + * @param[in] offset_first_element_in_bytes The offset of the first element in the source vector + * @param[in] stride_x Stride of the vector in X dimension (in bytes) + * @param[in] step_x stride_x * number of elements along X processed per + * workitem(in bytes) + * + * @return An image object + */ +Vector inline update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, + uint stride_x, uint step_x) +{ + Vector vector = { + .ptr = ptr, + .offset_first_element_in_bytes = offset_first_element_in_bytes, + .stride_x = stride_x, + }; + vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x; + return vector; +} + +/** Wrap image information into an Image structure, and make the pointer point at this workitem's + * data. 
+ * + * @param[in] ptr Pointer to the starting postion of the buffer + * @param[in] offset_first_element_in_bytes The offset of the first element in the source image + * @param[in] stride_x Stride of the image in X dimension (in bytes) + * @param[in] step_x stride_x * number of elements along X processed per + * workitem(in bytes) + * @param[in] stride_y Stride of the image in Y dimension (in bytes) + * @param[in] step_y stride_y * number of elements along Y processed per + * workitem(in bytes) + * + * @return An image object + */ +Image inline update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, + uint stride_x, uint step_x, uint stride_y, uint step_y) +{ + Image img = {.ptr = ptr, + .offset_first_element_in_bytes = offset_first_element_in_bytes, + .stride_x = stride_x, + .stride_y = stride_y}; + img.ptr += + img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y; + return img; +} + +/** Wrap 3D tensor information into an image structure, and make the pointer point at this + * workitem's data. 
+ * + * @param[in] ptr Pointer to the starting postion of the buffer + * @param[in] offset_first_element_in_bytes The offset of the first element in the source image + * @param[in] stride_x Stride of the image in X dimension (in bytes) + * @param[in] step_x stride_x * number of elements along X processed per + * workitem(in bytes) + * @param[in] stride_y Stride of the image in Y dimension (in bytes) + * @param[in] step_y stride_y * number of elements along Y processed per + * workitem(in bytes) + * @param[in] stride_z Stride of the image in Z dimension (in bytes) + * @param[in] step_z stride_z * number of elements along Z processed per + * workitem(in bytes) + * + * @return A 3D tensor object + */ +Image inline update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, + uint offset_first_element_in_bytes, + uint stride_x, uint step_x, uint stride_y, + uint step_y, uint stride_z, uint step_z) +{ + Image img = {.ptr = ptr, + .offset_first_element_in_bytes = offset_first_element_in_bytes, + .stride_x = stride_x, + .stride_y = stride_y}; + img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + + get_global_id(1) * step_y + get_global_id(2) * step_z; + return img; +} + +/** Wrap 3D tensor information into an tensor structure, and make the pointer point at this + * workitem's data. 
+ * + * @param[in] ptr Pointer to the starting postion of the buffer + * @param[in] offset_first_element_in_bytes The offset of the first element in the source image + * @param[in] stride_x Stride of the image in X dimension (in bytes) + * @param[in] step_x stride_x * number of elements along X processed per + * workitem(in bytes) + * @param[in] stride_y Stride of the image in Y dimension (in bytes) + * @param[in] step_y stride_y * number of elements along Y processed per + * workitem(in bytes) + * @param[in] stride_z Stride of the image in Z dimension (in bytes) + * @param[in] step_z stride_z * number of elements along Z processed per + * workitem(in bytes) + * + * @return A 3D tensor object + */ +Tensor3D inline update_tensor3D_workitem_ptr(__global uchar *ptr, + uint offset_first_element_in_bytes, uint stride_x, + uint step_x, uint stride_y, uint step_y, uint stride_z, + uint step_z) +{ + Tensor3D tensor = {.ptr = ptr, + .offset_first_element_in_bytes = offset_first_element_in_bytes, + .stride_x = stride_x, + .stride_y = stride_y, + .stride_z = stride_z}; + tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + + get_global_id(1) * step_y + get_global_id(2) * step_z; + return tensor; +} + +Tensor4D inline update_tensor4D_workitem_ptr(__global uchar *ptr, + uint offset_first_element_in_bytes, uint stride_x, + uint step_x, uint stride_y, uint step_y, uint stride_z, + uint step_z, uint stride_w, uint step_w, uint mod_size) +{ + Tensor4D tensor = {.ptr = ptr, + .offset_first_element_in_bytes = offset_first_element_in_bytes, + .stride_x = stride_x, + .stride_y = stride_y, + .stride_z = stride_z, + .stride_w = stride_w}; + + tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + + (get_global_id(2) / mod_size) * step_w; + return tensor; +} + +/** Get the pointer position of a Vector + * + * @param[in] vec Pointer to the starting position of the 
buffer + * @param[in] x Relative X position + */ +__global inline const uchar *vector_offset(const Vector *vec, int x) +{ + return vec->ptr + x * vec->stride_x; +} + +/** Get the pointer position of a Image + * + * @param[in] img Pointer to the starting position of the buffer + * @param[in] x Relative X position + * @param[in] y Relative Y position + */ +__global inline uchar *offset(const Image *img, int x, int y) +{ + return img->ptr + x * img->stride_x + y * img->stride_y; +} + +/** Get the pointer position of a Tensor3D + * + * @param[in] tensor Pointer to the starting position of the buffer + * @param[in] x Relative X position + * @param[in] y Relative Y position + * @param[in] z Relative Z position + */ +__global inline const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z) +{ + return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z; +} + +/** Get the pointer position of a Tensor4D + * + * @param[in] tensor Pointer to the starting position of the buffer + * @param[in] x Relative X position + * @param[in] y Relative Y position + * @param[in] z Relative Z position + * @param[in] w Relative W position + */ +__global inline const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w) +{ + return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + + w * tensor->stride_w; +} + +#endif // _HELPER_H diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/helpers_asymm.h b/libs/ARMComputeEx/src/core/CL/cl_kernels/helpers_asymm.h new file mode 100644 index 000000000..c39138caa --- /dev/null +++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/helpers_asymm.h @@ -0,0 +1,406 @@ +/* + * Copyright (c) 2017-2018 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_HELPERS_ASYMM_H +#define ARM_COMPUTE_HELPERS_ASYMM_H + +#include "helpers.h" + +/** Correctly-rounded-to-nearest division by a power-of-two. + * + * @param[in] size Size of vector. + * + * @return Correctly-rounded-to-nearest division by a power-of-two. 
+ */ +#define ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(size) \ + inline VEC_DATA_TYPE(int, size) \ + asymm_rounding_divide_by_POW2_##size(VEC_DATA_TYPE(int, size) x, int exponent) \ + { \ + VEC_DATA_TYPE(int, size) \ + mask = (1 << exponent) - 1; \ + const VEC_DATA_TYPE(int, size) zero = 0; \ + const VEC_DATA_TYPE(int, size) one = 1; \ + VEC_DATA_TYPE(int, size) \ + threshold = (mask >> 1) + select(zero, one, x < 0); \ + return (x >> exponent) + select(zero, one, (x & mask) > threshold); \ + } + +/** Product of two numbers, interpreting them as fixed-point values in the interval [-1, 1), + * rounding to the nearest value, and saturating -1 * -1 to the maximum value. + * + * @param[in] size Size of vector. + * + * @return Product of two fixed-point numbers. + */ +#define ASYMM_MULT_IMPL(size) \ + inline VEC_DATA_TYPE(int, size) \ + asymm_mult##size(VEC_DATA_TYPE(int, size) a, VEC_DATA_TYPE(int, size) b) \ + { \ + VEC_DATA_TYPE(int, size) \ + overflow = a == b && a == INT_MIN; \ + VEC_DATA_TYPE(long, size) \ + a_64 = convert_long##size(a); \ + VEC_DATA_TYPE(long, size) \ + b_64 = convert_long##size(b); \ + VEC_DATA_TYPE(long, size) \ + ab_64 = a_64 * b_64; \ + /* COMPMID-907 */ \ + VEC_DATA_TYPE(int, size) \ + ab_x2_high32 = convert_int##size(((ab_64 + (1 << 30)) >> 31)); \ + return select(ab_x2_high32, INT_MAX, overflow); \ + } + +/** Calculates \f$ exp(x) \f$ for x in [-1/4, 0). + * + * @param[in] size Size of vector. + * + * @return Result in fixed-point format Q0. 
+ */ +#define ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(size) \ + inline VEC_DATA_TYPE(int, size) \ + asymm_exp_on_interval_between_negative_one_quarter_and_0_excl##size(VEC_DATA_TYPE(int, size) \ + a) \ + { \ + const VEC_DATA_TYPE(int, size) constant_term = 1895147668; \ + const VEC_DATA_TYPE(int, size) constant_1_over_3 = 715827883; \ + const int k_fractional_bits = 31; \ + VEC_DATA_TYPE(int, size) \ + x = a + (1 << (k_fractional_bits - 3)); \ + VEC_DATA_TYPE(int, size) \ + x2 = ASYMM_MULT(x, x, size); \ + VEC_DATA_TYPE(int, size) \ + x3 = ASYMM_MULT(x2, x, size); \ + VEC_DATA_TYPE(int, size) \ + x4 = ASYMM_MULT(x2, x2, size); \ + VEC_DATA_TYPE(int, size) \ + x4_over_4 = ASYMM_ROUNDING_DIVIDE_BY_POW2(x4, 2, size); \ + VEC_DATA_TYPE(int, size) \ + x4_over_24_plus_x3_over_6_plus_x2 = \ + ASYMM_MULT((x4_over_4 + x3), constant_1_over_3, size) + x2; \ + VEC_DATA_TYPE(int, size) \ + x4_over_24_plus_x3_over_6_plus_x2_over_2 = \ + ASYMM_ROUNDING_DIVIDE_BY_POW2(x4_over_24_plus_x3_over_6_plus_x2, 1, size); \ + return constant_term + \ + ASYMM_MULT(constant_term, x + x4_over_24_plus_x3_over_6_plus_x2_over_2, size); \ + } + +/** Each bit of the result is set to the corresponding bit of either then_val or + * else_val depending on whether the corresponding bit of if_mask is set. + * Equivalent to the VBSL instruction in ARM NEON. + * + * @param[in] size Size of vector. + * + * @returns Result contaning bits from @p then_val or from @p else_val depending on corresponding + * bit in @p if_mask is set or not. + */ +#define ASYMM_SELECT_USING_MASK_IMPL(size) \ + inline VEC_DATA_TYPE(int, size) asymm_select_using_mask##size(VEC_DATA_TYPE(int, size) if_mask, \ + VEC_DATA_TYPE(int, size) then_val, \ + VEC_DATA_TYPE(int, size) else_val) \ + { \ + return (if_mask & then_val) ^ (~if_mask & else_val); \ + } + +/** For each element of input vector, the corresponding bits of the result item are set + * if the input item is zero. 
+ * + * @param[in] size Size of vector. + * + * @returns Output vector with bits set when corresponding bit in @p a is zero. + */ +#define ASYMM_MASK_IF_ZERO_IMPL(size) \ + inline VEC_DATA_TYPE(int, size) asymm_mask_if_zero##size(VEC_DATA_TYPE(int, size) a) \ + { \ + const VEC_DATA_TYPE(int, size) all_zeros = 0; \ + const VEC_DATA_TYPE(int, size) all_ones = ~0; \ + return select(all_zeros, all_ones, a == 0); \ + } + +/** For each element of input vector, the corresponding bits of the result item are set + * if the input item is non-zero. + * + * @param[in] size Size of vector. + * + * @returns Output vector with bits set when corresponding bit in @p a is non zero. + */ +#define ASYMM_MASK_IF_NON_ZERO_IMPL(size) \ + inline VEC_DATA_TYPE(int, size) asymm_mask_if_non_zero##size(VEC_DATA_TYPE(int, size) a) \ + { \ + const VEC_DATA_TYPE(int, size) all_zeros = 0; \ + const VEC_DATA_TYPE(int, size) all_ones = ~0; \ + return select(all_zeros, all_ones, a != 0); \ + } + +#define EXP_BARREL_SHIFTER_IMPL(size) \ + inline VEC_DATA_TYPE(int, size) exp_barrel_shifter##size( \ + VEC_DATA_TYPE(int, size) result, int exponent, int fp_multiplier, int k_integer_bits, \ + int k_fractional_bits, VEC_DATA_TYPE(int, size) remainder) \ + { \ + if (k_integer_bits > exponent) \ + { \ + const int k_shift_amount = k_integer_bits > exponent ? k_fractional_bits + exponent : 0; \ + return ASYMM_SELECT_USING_MASK( \ + ASYMM_MASK_IF_NON_ZERO(remainder & (1 << k_shift_amount), size), \ + ASYMM_MULT(result, fp_multiplier, size), result, size); \ + } \ + \ + return result; \ + } + +/** Calculates \f$ exp(x) \f$ for x < 0. + * + * @param[in] size Size of vector. + * + * @return Result in fixed-point format Q0. 
+ */ +#define ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(size) \ + inline VEC_DATA_TYPE(int, size) \ + asymm_exp_on_negative_values##size(VEC_DATA_TYPE(int, size) a, int k_integer_bits) \ + { \ + const int k_fractional_bits = 31 - k_integer_bits; \ + VEC_DATA_TYPE(int, size) \ + k_one_quarter = 1 << (k_fractional_bits - 2); \ + VEC_DATA_TYPE(int, size) \ + mask = k_one_quarter - 1; \ + VEC_DATA_TYPE(int, size) \ + a_mod_quarter_minus_one_quarter = (a & mask) - k_one_quarter; \ + VEC_DATA_TYPE(int, size) \ + a_mod_quarter_minus_one_quarter_scaled = a_mod_quarter_minus_one_quarter << k_integer_bits; \ + VEC_DATA_TYPE(int, size) \ + result = ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL( \ + a_mod_quarter_minus_one_quarter_scaled, size); \ + VEC_DATA_TYPE(int, size) \ + remainder = a_mod_quarter_minus_one_quarter - a; \ + \ + result = EXP_BARREL_SHIFTER(result, -2, 1672461947, k_integer_bits, k_fractional_bits, \ + remainder, size); \ + result = EXP_BARREL_SHIFTER(result, -1, 1302514674, k_integer_bits, k_fractional_bits, \ + remainder, size); \ + result = EXP_BARREL_SHIFTER(result, +0, 790015084, k_integer_bits, k_fractional_bits, \ + remainder, size); \ + result = EXP_BARREL_SHIFTER(result, +1, 290630308, k_integer_bits, k_fractional_bits, \ + remainder, size); \ + result = EXP_BARREL_SHIFTER(result, +2, 39332535, k_integer_bits, k_fractional_bits, \ + remainder, size); \ + result = EXP_BARREL_SHIFTER(result, +3, 720401, k_integer_bits, k_fractional_bits, remainder, \ + size); \ + result = \ + EXP_BARREL_SHIFTER(result, +4, 242, k_integer_bits, k_fractional_bits, remainder, size); \ + \ + if (k_integer_bits > 5) \ + { \ + const VEC_DATA_TYPE(int, size) clamp = -(1 << (k_fractional_bits + 5)); \ + result = ASYMM_SELECT_USING_MASK(ASYMM_MASK_IF_NON_ZERO(a < clamp, size), 0, result, size); \ + } \ + \ + const VEC_DATA_TYPE(int, size) Q0_one = INT_MAX; \ + return ASYMM_SELECT_USING_MASK(ASYMM_MASK_IF_ZERO(a, size), Q0_one, result, size); \ + } + +/** Calculates 
the product of a integer value by a power of two, with either a positive exponent + * (equivalent to an arithmetic left shift, saturating) or a negative exponent + * (equivalent to an arithmetic right shift, rounding to nearest). + * + * @param[in] size Size of vector. + * + * @return Arithmetic left or right shift. + */ +#define ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(size) \ + inline VEC_DATA_TYPE(int, size) \ + asymm_saturating_rounding_mult_by_pow2##size(VEC_DATA_TYPE(int, size) x, int exponent) \ + { \ + if (exponent < 0) \ + { \ + return ASYMM_ROUNDING_DIVIDE_BY_POW2(x, -exponent, size); \ + } \ + \ + const VEC_DATA_TYPE(int, size) min = INT_MIN; \ + const VEC_DATA_TYPE(int, size) max = INT_MAX; \ + int threshold = ((1 << (31 - exponent)) - 1); \ + VEC_DATA_TYPE(int, size) \ + positive_mask = ASYMM_MASK_IF_NON_ZERO(x > threshold, size); \ + VEC_DATA_TYPE(int, size) \ + negative_mask = ASYMM_MASK_IF_NON_ZERO(x < -threshold, size); \ + VEC_DATA_TYPE(int, size) \ + result = x << exponent; \ + result = ASYMM_SELECT_USING_MASK(positive_mask, max, result, size); \ + result = ASYMM_SELECT_USING_MASK(negative_mask, min, result, size); \ + return result; \ + } + +/** Calculates (a+b)/2, rounded to the nearest integer. + * Equivalent to VRHADD in the ARM NEON instruction set. + * + * @param[in] size Size of vector. + * + * @return (a+b)/2, rounded to the nearest integer. 
+ */ +#define ASYMM_ROUNDING_HALF_SUM_IMPL(size) \ + inline VEC_DATA_TYPE(int, size) \ + asymm_rounding_half_sum##size(VEC_DATA_TYPE(int, size) a, VEC_DATA_TYPE(int, size) b) \ + { \ + VEC_DATA_TYPE(long, size) \ + a64 = convert_long##size(a); \ + VEC_DATA_TYPE(long, size) \ + b64 = convert_long##size(b); \ + VEC_DATA_TYPE(long, size) \ + sum = a64 + b64; \ + const VEC_DATA_TYPE(long, size) one = 1; \ + const VEC_DATA_TYPE(long, size) minus_one = -1; \ + VEC_DATA_TYPE(long, size) \ + sign = select(minus_one, one, sum >= 0); \ + return convert_int##size((sum + sign) / 2); \ + } + +/** Calculates \f$ 1 / (1 + x) \f$ for x in (0, 1). + * + * @param[in] size Size of vector. + * + * @return Result in fixed-point format Q0. + */ +#define ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(size) \ + inline VEC_DATA_TYPE(int, size) \ + asymm_one_over_one_plus_x_for_x_in_0_1##size(VEC_DATA_TYPE(int, size) a) \ + { \ + const VEC_DATA_TYPE(int, size) Q0_one = INT_MAX; \ + const VEC_DATA_TYPE(int, size) Q2_one = 1 << (31 - 2); \ + VEC_DATA_TYPE(int, size) \ + half_denominator = ASYMM_ROUNDING_HALF_SUM(a, Q0_one, size); \ + const VEC_DATA_TYPE(int, size) Q2_48_over_17 = 1515870810; \ + const VEC_DATA_TYPE(int, size) Q2_neg_32_over_17 = -1010580540; \ + VEC_DATA_TYPE(int, size) \ + x = Q2_48_over_17 + ASYMM_MULT(half_denominator, Q2_neg_32_over_17, size); \ + for (int i = 0; i < 3; i++) \ + { \ + VEC_DATA_TYPE(int, size) \ + half_denominator_times_x = ASYMM_MULT(half_denominator, x, size); \ + VEC_DATA_TYPE(int, size) \ + one_minus_half_denominator_times_x = Q2_one - half_denominator_times_x; \ + VEC_DATA_TYPE(int, size) \ + tmp = ASYMM_MULT(x, one_minus_half_denominator_times_x, size); \ + x = x + ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(tmp, 2, size); \ + } \ + return ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(x, 1, size); \ + } + +/** Considering the integer value as fixed-point, change the number of integer bits and update value + * accordingly. + * + * @param[in] size Size of vector. 
+ * + * @return Rescaled value. + */ +#define ASYMM_RESCALE_IMPL(size) \ + inline VEC_DATA_TYPE(int, size) asymm_rescale##size(VEC_DATA_TYPE(int, size) value, \ + int src_integer_bits, int dst_integer_bits) \ + { \ + int exponent = src_integer_bits - dst_integer_bits; \ + return ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(value, exponent, size); \ + } + +#define ASYMM_ROUNDING_DIVIDE_BY_POW2(x, exponent, size) \ + asymm_rounding_divide_by_POW2_##size(x, exponent) +#define ASYMM_MULT(a, b, size) asymm_mult##size(a, b) +#define ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(x, quantized_multiplier, right_shift, size) \ + ASYMM_ROUNDING_DIVIDE_BY_POW2(ASYMM_MULT(x, quantized_multiplier, size), right_shift, size) +#define ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL(a, size) \ + asymm_exp_on_interval_between_negative_one_quarter_and_0_excl##size(a) +#define ASYMM_SELECT_USING_MASK(if_mask, then_val, else_val, size) \ + asymm_select_using_mask##size(if_mask, then_val, else_val) +#define ASYMM_MASK_IF_ZERO(a, size) asymm_mask_if_zero##size(a) +#define ASYMM_MASK_IF_NON_ZERO(a, size) asymm_mask_if_non_zero##size(a) +#define EXP_BARREL_SHIFTER(result, exponent, fp_multiplier, k_integer_bits, k_fractional_bits, \ + remainder, size) \ + exp_barrel_shifter##size(result, exponent, fp_multiplier, k_integer_bits, k_fractional_bits, \ + remainder) +#define ASYMM_EXP_ON_NEGATIVE_VALUES(a, k_integer_bits, size) \ + asymm_exp_on_negative_values##size(a, k_integer_bits) +#define ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1(a, size) \ + asymm_one_over_one_plus_x_for_x_in_0_1##size(a) +#define ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(x, exponent, size) \ + asymm_saturating_rounding_mult_by_pow2##size(x, exponent) +#define ASYMM_ROUNDING_HALF_SUM(a, b, size) asymm_rounding_half_sum##size(a, b) +#define ASYMM_RESCALE(value, src_integer_bits, dst_integer_bits, size) \ + asymm_rescale##size(value, src_integer_bits, dst_integer_bits) + +ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(2) 
+ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(4) +ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(8) +ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(16) + +ASYMM_MULT_IMPL(2) +ASYMM_MULT_IMPL(4) +ASYMM_MULT_IMPL(8) +ASYMM_MULT_IMPL(16) + +ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(2) +ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(4) +ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(8) +ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(16) + +ASYMM_SELECT_USING_MASK_IMPL(2) +ASYMM_SELECT_USING_MASK_IMPL(4) +ASYMM_SELECT_USING_MASK_IMPL(8) +ASYMM_SELECT_USING_MASK_IMPL(16) + +ASYMM_MASK_IF_ZERO_IMPL(2) +ASYMM_MASK_IF_ZERO_IMPL(4) +ASYMM_MASK_IF_ZERO_IMPL(8) +ASYMM_MASK_IF_ZERO_IMPL(16) + +ASYMM_MASK_IF_NON_ZERO_IMPL(2) +ASYMM_MASK_IF_NON_ZERO_IMPL(4) +ASYMM_MASK_IF_NON_ZERO_IMPL(8) +ASYMM_MASK_IF_NON_ZERO_IMPL(16) + +EXP_BARREL_SHIFTER_IMPL(2) +EXP_BARREL_SHIFTER_IMPL(4) +EXP_BARREL_SHIFTER_IMPL(8) +EXP_BARREL_SHIFTER_IMPL(16) + +ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(2) +ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(4) +ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(8) +ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(16) + +ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(2) +ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(4) +ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(8) +ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(16) + +ASYMM_ROUNDING_HALF_SUM_IMPL(2) +ASYMM_ROUNDING_HALF_SUM_IMPL(4) +ASYMM_ROUNDING_HALF_SUM_IMPL(8) +ASYMM_ROUNDING_HALF_SUM_IMPL(16) + +ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(2) +ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(4) +ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(8) +ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(16) + +ASYMM_RESCALE_IMPL(2) +ASYMM_RESCALE_IMPL(4) +ASYMM_RESCALE_IMPL(8) +ASYMM_RESCALE_IMPL(16) + +#endif // ARM_COMPUTE_HELPERS_ASYMM_H
\ No newline at end of file diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_div_float.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_div_float.cl new file mode 100644 index 000000000..512c62023 --- /dev/null +++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_div_float.cl @@ -0,0 +1,96 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "helpers.h" + +#ifdef SATURATE +#define CONVERT_OP_FLOAT_STR(x, type, round) (convert_##type##_sat##round(x)) +#else /* SATURATE */ +#define CONVERT_OP_FLOAT_STR(x, type, round) (convert_##type##round(x)) +#endif /* SATURATE */ +#define CONVERT_OP_FLOAT(x, type, round) CONVERT_OP_FLOAT_STR(x, type, round) + +/** Performs a pixelwise division with float scale of either integer or float inputs. 
+ * + * @attention The inputs and output data types need to be passed at compile time using -DDATA_TYPE_IN1, -DDATA_TYPE_IN2 and -DDATA_TYPE_OUT: + * e.g. -DDATA_TYPE_IN1=uchar -DDATA_TYPE_IN2=ushort -DDATA_TYPE_OUT=short + * @attention The data type of the intermediate result of the division should be passed as well using -DDATA_TYPE_RES. + * e.g. If one of the inputs is S16 -DDATA_TYPE_RES=int should be passed else -DDATA_TYPE_RES=short. + * @attention -DDATA_TYPE_FLOAT must be passed if floating point inputs are provided. + * + * @param[in] in1_ptr Pointer to the source image. Supported data types: U8, S16, F16, F32 + * @param[in] in1_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] in1_step_x in1_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] in1_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] in1_step_y in1_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] in1_stride_z Stride of the source image in Z dimension (in bytes) + * @param[in] in1_step_z in1_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] in1_offset_first_element_in_bytes The offset of the first element in the source image + * @param[in] in2_ptr Pointer to the source image. 
Supported data types: U8, S16, F16, F32 + * @param[in] in2_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] in2_step_x in2_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] in2_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] in2_step_y in2_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] in2_stride_z Stride of the source image in Z dimension (in bytes) + * @param[in] in2_step_z in2_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] in2_offset_first_element_in_bytes The offset of the first element in the source image + * @param[out] out_ptr Pointer to the destination image. Supported data types: U8, S16, F16, F32 + * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes) + * @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes) + * @param[in] out_step_y out_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] out_stride_z Stride of the destination image in Z dimension (in bytes) + * @param[in] out_step_z out_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image + * @param[in] scale Float scaling factor. 
Supported data types: F32 + */ +__kernel void pixelwise_div_float( + TENSOR3D_DECLARATION(in1), + TENSOR3D_DECLARATION(in2), + TENSOR3D_DECLARATION(out), + const float scale) +{ + // Get pixels pointer + Tensor3D in1 = CONVERT_TO_TENSOR3D_STRUCT(in1); + Tensor3D in2 = CONVERT_TO_TENSOR3D_STRUCT(in2); + Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out); + + // Load data + VEC_DATA_TYPE(DATA_TYPE_RES, 16) + in1_data = CONVERT(vload16(0, (__global DATA_TYPE_IN1 *)in1.ptr), VEC_DATA_TYPE(DATA_TYPE_RES, 16)); + VEC_DATA_TYPE(DATA_TYPE_RES, 16) + in2_data = CONVERT(vload16(0, (__global DATA_TYPE_IN2 *)in2.ptr), VEC_DATA_TYPE(DATA_TYPE_RES, 16)); + + // Perform division +#ifdef DATA_TYPE_FLOAT + VEC_DATA_TYPE(DATA_TYPE_OUT, 16) + res = CONVERT(in1_data / in2_data * (DATA_TYPE_RES)scale, VEC_DATA_TYPE(DATA_TYPE_OUT, 16)); +#else /* DATA_TYPE_FLOAT */ + VEC_DATA_TYPE(DATA_TYPE_OUT, 16) + res = CONVERT_OP_FLOAT(CONVERT_OP_FLOAT((convert_float16(in1_data / in2_data) * scale), VEC_DATA_TYPE(DATA_TYPE_RES, 16), ROUND), VEC_DATA_TYPE(DATA_TYPE_OUT, 16), ROUND); +#endif /* DATA_TYPE_FLOAT */ + + // Store result + vstore16(res, 0, (__global DATA_TYPE_OUT *)out.ptr); +} diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_div_int.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_div_int.cl new file mode 100644 index 000000000..82edf3b1d --- /dev/null +++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_div_int.cl @@ -0,0 +1,103 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016, 2017 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "helpers.h" + +#if defined(FIXED_POINT_POSITION) + +#include "fixed_point.h" + +#if defined(SATURATE) +#define DIV_OP(x, y, scale, type, size) DIV_SAT_OP_EXPAND((x), (y), type, size, FIXED_POINT_POSITION) +#else // SATURATE +#define DIV_OP(x, y, scale, type, size) DIV_OP_EXPAND((x), (y), type, size, FIXED_POINT_POSITION) +#endif // SATURATE + +#else // FIXED_POINT_POSITION + +#if defined(SATURATE) +#define CONVERT_OP_INT_STR(x, type, size) (convert_##type##size##_sat(x)) +#else // SATURATE +#define CONVERT_OP_INT_STR(x, type, size) (convert_##type##size(x)) +#endif // SATURATE +#define CONVERT_OP_INT(x, type, size) CONVERT_OP_INT_STR(x, type, size) + +#define DIV_OP(x, y, scale, type, size) CONVERT_OP_INT((x) / (y) >> scale, type, size) + +#endif // FIXED_POINT_POSITION + +/** Performs a pixelwise division with integer scale of integer inputs. + * + * @attention The inputs and output data types need to be passed at compile time using -DDATA_TYPE_IN1, -DDATA_TYPE_IN2 and -DDATA_TYPE_OUT: + * e.g. -DDATA_TYPE_IN1=uchar -DDATA_TYPE_IN2=ushort -DDATA_TYPE_OUT=short + * @attention The data_type of the intermediate result of the division should passed as well using -DDATA_TYPE_RES. + * e.g. If one of inputs is S16 -DDATA_TYPE_RES=int should be passed else -DDATA_TYPE_RES=short. + * @note In case of fixed-point operation -DFIXED_POINT_POSITION=fixed_point_position must be provided: e.g. -DFIXED_POINT_POSITION=3 + * + * @param[in] in1_ptr Pointer to the source image. 
Supported data types: U8/QS8/QS16/S16
+ * @param[in] in1_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] in1_step_x in1_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] in1_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] in1_step_y in1_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] in1_stride_z Stride of the source image in Z dimension (in bytes)
+ * @param[in] in1_step_z in1_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] in1_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] in2_ptr Pointer to the source image. Supported data types: same as @p in1_ptr
+ * @param[in] in2_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] in2_step_x in2_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] in2_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] in2_step_y in2_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] in2_stride_z Stride of the source image in Z dimension (in bytes)
+ * @param[in] in2_step_z in2_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] in2_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] out_ptr Pointer to the destination image.
Supported data types: same as @p in1_ptr + * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes) + * @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes) + * @param[in] out_step_y out_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] out_stride_z Stride of the destination image in Y dimension (in bytes) + * @param[in] out_step_z out_stride_z * number of elements along Y processed per workitem(in bytes) + * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image + * @param[in] scale Integer scaling factor. Supported data types: S32 (ignored for QS8 and QS16 as the assumption is scale = 1). + */ +__kernel void pixelwise_div_int( + TENSOR3D_DECLARATION(in1), + TENSOR3D_DECLARATION(in2), + TENSOR3D_DECLARATION(out), + const uint scale) +{ + // Get pixels pointer + Tensor3D in1 = CONVERT_TO_TENSOR3D_STRUCT(in1); + Tensor3D in2 = CONVERT_TO_TENSOR3D_STRUCT(in2); + Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out); + + // Load data + VEC_DATA_TYPE(DATA_TYPE_RES, 16) + in1_data = CONVERT(vload16(0, (__global DATA_TYPE_IN1 *)in1.ptr), VEC_DATA_TYPE(DATA_TYPE_RES, 16)); + VEC_DATA_TYPE(DATA_TYPE_RES, 16) + in2_data = CONVERT(vload16(0, (__global DATA_TYPE_IN2 *)in2.ptr), VEC_DATA_TYPE(DATA_TYPE_RES, 16)); + + // Perform division and store result + vstore16(DIV_OP(in1_data, in2_data, scale, DATA_TYPE_OUT, 16), 0, (__global DATA_TYPE_OUT *)out.ptr); +} diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_mul_quantized.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_mul_quantized.cl new file mode 100644 index 000000000..ddc9d5a27 --- /dev/null +++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/pixelwise_mul_quantized.cl @@ -0,0 +1,119 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. 
All Rights Reserved + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "helpers_asymm.h" + +#ifdef SATURATE +#define CONVERT_OP_FLOAT_STR(x, type, round) (convert_##type##_sat##round(x)) +#else /* SATURATE */ +#define CONVERT_OP_FLOAT_STR(x, type, round) (convert_##type##round(x)) +#endif /* SATURATE */ +#define CONVERT_OP_FLOAT(x, type, round) CONVERT_OP_FLOAT_STR(x, type, round) + +#if defined(RESULT_OFFSET) && defined(RESULT_MULT_INT) && defined(RESULT_SHIFT) +/** Performs a pixelwise multiplication used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8 + * + * The following computations will be performed by the kernel: + * + * -# Add offset terms to inputs + * -# Multiply inputs + * -# Add offset terms to final result + * -# Multiply each entry of result by result_mult_int + * -# Shift the int32 accumulator by result_shift + * -# Clamp the resulting int32 values to the [0..255] range and cast to QASYMM8. + * + * @attention The inputs and output data types need to be passed at compile time using -DDATA_TYPE_IN1, -DDATA_TYPE_IN2 and -DDATA_TYPE_OUT: + * e.g. -DDATA_TYPE_IN1=uchar -DDATA_TYPE_IN2=uchar -DDATA_TYPE_OUT=uchar + * @attention The offset factor of inputs must be passed at compile time using -DIN1_OFFSET and -DIN2_OFFSET + * @attention The offset, scalar scale factor and number of bits to shift right of output tensor must be passed at compile time using -DRESULT_OFFSET, -RESULT_MULT_INT and -DRESULT_SHIFT + * + * @param[in] in1_ptr Pointer to the source image. 
Supported data types: U8
+ * @param[in] in1_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] in1_step_x in1_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] in1_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] in1_step_y in1_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] in1_stride_z Stride of the source image in Z dimension (in bytes)
+ * @param[in] in1_step_z in1_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] in1_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] in2_ptr Pointer to the source image. Supported data types: U8
+ * @param[in] in2_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] in2_step_x in2_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] in2_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] in2_step_y in2_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] in2_stride_z Stride of the source image in Z dimension (in bytes)
+ * @param[in] in2_step_z in2_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] in2_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] out_ptr Pointer to the destination image.
Supported data types: U8 + * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes) + * @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes) + * @param[in] out_step_y out_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] out_stride_z Stride of the destination image in Y dimension (in bytes) + * @param[in] out_step_z out_stride_z * number of elements along Y processed per workitem(in bytes) + * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image + * @param[in] scale Float scaling factor. Supported data types: F32 + */ +__kernel void pixelwise_mul_qasymm8( + TENSOR3D_DECLARATION(in1), + TENSOR3D_DECLARATION(in2), + TENSOR3D_DECLARATION(out), + const float scale) +{ + // Get pixels pointer + Tensor3D in1 = CONVERT_TO_TENSOR3D_STRUCT(in1); + Tensor3D in2 = CONVERT_TO_TENSOR3D_STRUCT(in2); + Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out); + + // Load data + VEC_DATA_TYPE(int, 16) + in1_data = CONVERT(vload16(0, (__global DATA_TYPE_IN1 *)in1.ptr), VEC_DATA_TYPE(int, 16)); + VEC_DATA_TYPE(int, 16) + in2_data = CONVERT(vload16(0, (__global DATA_TYPE_IN2 *)in2.ptr), VEC_DATA_TYPE(int, 16)); + + // Perform multiplication of two inputs + VEC_DATA_TYPE(int, 16) in1_val = in1_data + (VEC_DATA_TYPE(int, 16))(IN1_OFFSET); + VEC_DATA_TYPE(int, 16) in2_val = in2_data + (VEC_DATA_TYPE(int, 16))(IN2_OFFSET); + VEC_DATA_TYPE(int, 16) out_val = in1_val * in2_val; + + // Multiply with a multiplier smaller than 1 + out_val = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(out_val, RESULT_MULT_INT, RESULT_SHIFT, 16); + out_val += (VEC_DATA_TYPE(int, 16))(RESULT_OFFSET); + + VEC_DATA_TYPE(uchar, 16) res = CONVERT(out_val, VEC_DATA_TYPE(uchar, 16)); + +// TODO: Apply min-max BOUND to support fuse with relu. 
+/* +#if defined(MIN_BOUND) + res = max(res, (uchar16)MIN_BOUND); +#endif // defined(MIN_BOUND) +#if defined(MAX_BOUND) + res = min(res, (uchar16)MAX_BOUND); +#endif // defined(MAX_BOUND) +*/ + + // Store result + VSTORE(16)(CONVERT(res, VEC_DATA_TYPE(DATA_TYPE_OUT, 16)), + 0, (__global DATA_TYPE_OUT *)out.ptr); +} +#endif // defined(RESULT_OFFSET) && defined(RESULT_MULT_INT) && defined(RESULT_SHIFT) diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/reduce_max.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/reduce_max.cl new file mode 100644 index 000000000..dfa3b85f4 --- /dev/null +++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/reduce_max.cl @@ -0,0 +1,60 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "helpers.h" + +#if defined(WIDTH) +/** Perform reduce max + * + * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short + * + * @param[in] input_ptr Pointer to the first source tensor. Supported data types: F16/F32 + * @param[in] input_stride_x Stride of the first source tensor in X dimension (in bytes) + * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source tensor + * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr + * @param[out] output_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[out] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) + * @param[out] output_offset_first_element_in_bytes The offset of the first element in the destination tensor + */ +__kernel void reduce_max(VECTOR_DECLARATION(input), + VECTOR_DECLARATION(output)) +{ + Vector input = CONVERT_TO_VECTOR_STRUCT(input); + Vector output = CONVERT_TO_VECTOR_STRUCT(output); + + __global float *input_addr = (__global float *)(input.ptr); + __global float *output_addr = (__global float *)(output.ptr); + + float max_value = *input_addr; + for(int x = 1; x < WIDTH; x++) + { + float value = *(input_addr + x); + max_value = max(value, max_value); + } + + // Store max + *output_addr = max_value; +} +#endif // defined(WIDTH) diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/reduction_mean.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/reduction_mean.cl new file mode 100644 index 000000000..1a96eea61 --- /dev/null +++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/reduction_mean.cl @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016, 2017 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "helpers.h" + +inline DATA_TYPE sum_8(__global const DATA_TYPE *input) +{ + VEC_DATA_TYPE(DATA_TYPE, 8) + in = vload8(0, input); + in.s0123 += in.s4567; + in.s01 += in.s23; + return ((in.s0 + in.s1)); +} + +/** This function calculates the sum and sum of squares of a given input image. + * + * @note To enable calculation sum of squares -DSTDDEV should be passed as a preprocessor argument. + * + * @param[in] src_ptr Pointer to the source image. 
Supported data types: U8 + * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image + * @param[out] local_sum Local sum of all elements + * @param[in] height Height of the input image + * @param[in] divider Divider to calculate mean + */ +__kernel void reduction_mean( + IMAGE_DECLARATION(src), + IMAGE_DECLARATION(dst), + __local DATA_TYPE *local_sums, + int height, + int divider) +{ + // Get pixels pointer + Image src = CONVERT_TO_IMAGE_STRUCT(src); + Image dst = CONVERT_TO_IMAGE_STRUCT(dst); + + float8 tmp_sum = 0; + // Calculate partial sum + + for(int i = 0; i < height; i++) + { + local_sums[0] += sum_8((__global DATA_TYPE *)offset(&src, 0, i)); + } + ((__global DATA_TYPE *)offset(&dst, get_global_id(0), get_global_id(1)))[0] = local_sums[0]/divider; +} diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/strided_slice.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/strided_slice.cl new file mode 100644 index 000000000..c5ff82f9e --- /dev/null +++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/strided_slice.cl @@ -0,0 +1,104 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2017 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "helpers.h" + + +inline Tensor4D tensor4D_from_vector_no_step(const Vector *vector, int dim_x, int dim_y, int dim_z, int dim_w) +{ + int stride_x = vector->stride_x; + int stride_y = stride_x * dim_x; + int stride_z = stride_y * dim_y; + int stride_w = stride_z * dim_z; + Tensor4D tensor = + { + .ptr = vector->ptr, + .offset_first_element_in_bytes = vector->offset_first_element_in_bytes, + .stride_x = stride_x, + .stride_y = stride_y, + .stride_z = stride_z, + .stride_w = stride_w, + }; + return tensor; +} + +/** Extracts a strided slice up to 4-dimensions + * + * @note Datatype should be given as a preprocessor argument using -DELEMENT_DATA_TYPE=type. e.g. -DELEMENT_DATA_TYPE=short + * @note The size of an element should be given as a preprocessor argument using -DELEMENT_SIZE=size. e.g. 
-DELEMENT_SIZE=2 + * + * @param[in] input_ptr Pointer to the first source tensor. Supported data types: U8/S8/QS8/QASYMM8/U16/S16/QS16/U32/S32/F16/F32 + * @param[in] input_stride_x Stride of the first source tensor in X dimension (in bytes) + * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source tensor + * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr + * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor + * @param[in] dims_in The 4-dimensional dimension of the input. Supported data types: S32 + * @param[in] dims_out The 4-dimensional dimension of the output. Supported data types: S32 + * @param[in] starts The stride of X dimension of input tensor to be sliced. Supported data types: S32 + * @param[in] strides The stride of Y dimension of input tensor to be sliced. 
Supported data types: S32 + */ +__kernel void strided_slice(VECTOR_DECLARATION(input), + VECTOR_DECLARATION(output), + const int4 dims_in, + const int4 dims_out, + const int4 starts, + const int4 strides) +{ + // TODO: Should be change to CONVERT_TO_TENSOR4D_STRUCT in order to reduce inference of the offset + Vector vec_out = CONVERT_TO_VECTOR_STRUCT_NO_STEP(output); + Vector vec_in = CONVERT_TO_VECTOR_STRUCT_NO_STEP(input); + + // Implemenation + // Infer a Tensor4D from output Vector and output's dimensions info + // Infer a Tensor4D from input Vector and input's dimensions info + // Infer indices of output as 4D from the offset of output vector + // Infer indices of input as 4D from indices of output + // out(offset of output vector) = in(offset of input) + + Tensor4D tensor_out = tensor4D_from_vector_no_step(&vec_out, dims_out.x, dims_out.y, dims_out.z, dims_out.w); + Tensor4D tensor_in = tensor4D_from_vector_no_step(&vec_in, dims_in.x, dims_in.y, dims_in.z, dims_in.w); + + // Must be output_step_x == output_stride_x == an element's size + const int offset_out = get_global_id(0) * output_stride_x; + int4 indices_out = + { + get_global_id(0) % dims_out.x, + (offset_out / tensor_out.stride_y) % dims_out.y, + (offset_out / tensor_out.stride_z) % dims_out.z, + (offset_out / tensor_out.stride_w) % dims_out.w, + }; + + int4 indices_in = + { + starts.x + (strides.x * indices_out.x), + starts.y + (strides.y * indices_out.y), + starts.z + (strides.z * indices_out.z), + starts.w + (strides.w * indices_out.w), + }; + + *((__global ELEMENT_DATA_TYPE *)vector_offset(&vec_out, get_global_id(0))) = *((__global ELEMENT_DATA_TYPE *)tensor4D_offset(&tensor_in, indices_in.x, indices_in.y, indices_in.z, indices_in.w)); +} diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/topkv2.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/topkv2.cl new file mode 100644 index 000000000..0b0cf8218 --- /dev/null +++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/topkv2.cl @@ -0,0 +1,111 @@ +/* + * 
Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "helpers.h" + +__kernel void topkv2_init(VECTOR_DECLARATION(input), + __global float* in_key_buf, + __global int* in_ind_buf, + const int n) +{ + int gid = get_global_id(0); + int lws = get_local_size(0); + int groups = get_num_groups(0); + int gws = lws * groups; + int iter = n / gws; + + Vector input = CONVERT_TO_VECTOR_STRUCT_NO_STEP(input); + + for(int i = 0; i < iter; ++i) + { + int idx = i * gws + gid; + in_key_buf[idx] = *(__global float*)(input.ptr + idx * input.stride_x); + in_ind_buf[idx] = idx; + } +} + +__kernel void topkv2_find_first_negative( + __global float *out_key_buf, + __global int *first_negative_idx, + int n) +{ + int gid = get_global_id(0); + + if( gid == n - 1 ) + { + // if the last item is positive, the first negative index is n. + if( out_key_buf[gid] > 0.f ) + *first_negative_idx = n; + } else if ( gid == 0 ) { + // if the first item is negative, set it 0. + if( out_key_buf[gid] < 0.f ) + *first_negative_idx = 0; + } else { + // if its left is positive and it is negative, then it is the first negative item. 
+ if( out_key_buf[gid-1] > 0.f && out_key_buf[gid] < 0.f ) + *first_negative_idx = gid; + } +} + +__kernel void topkv2_reorder_negatives( + __global float* in_key_buf, + __global float* out_key_buf, + __global float* in_ind_buf, + __global float* out_ind_buf, + __global int* first_negative_idx, + int n) +{ + int gid = get_global_id(0); + + int num_negs = n - *first_negative_idx; + int in_idx; + + if( gid < num_negs ) { + in_idx = n - 1 - gid; + } else { + in_idx = gid - num_negs; + } + + out_key_buf[gid] = in_key_buf[in_idx]; + out_ind_buf[gid] = in_ind_buf[in_idx]; +} + +__kernel void topkv2_store( + VECTOR_DECLARATION(values), + VECTOR_DECLARATION(indices), + __global float *out_key_buf, + __global int *out_ind_buf, + int n) +{ + int gid = get_global_id(0); + + Vector values = CONVERT_TO_VECTOR_STRUCT_NO_STEP(values); + Vector indices = CONVERT_TO_VECTOR_STRUCT_NO_STEP(indices); + + int idx = n - 1 - gid; + + *(__global float*)(values.ptr + gid * values.stride_x) = out_key_buf[idx]; + *(__global int*)(indices.ptr + gid * indices.stride_x) = out_ind_buf[idx]; +} diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/topkv2_quicksort.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/topkv2_quicksort.cl new file mode 100644 index 000000000..deadf8412 --- /dev/null +++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/topkv2_quicksort.cl @@ -0,0 +1,138 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2017 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "helpers.h" + +__global inline float* get_vec_elem(Vector* vec, int idx) +{ + return (__global float*)(vec->ptr + idx * vec->stride_x); +} + +__global inline int* get_vec_elem_int(Vector* vec, int idx) +{ + return (__global int*)(vec->ptr + idx * vec->stride_x); +} + +// A utility function to swap two elements +void swap(__global float *a, __global float *b) +{ + float t = *a; + *a = *b; + *b = t; +} + +void swap_idx(__global int *a, __global int *b) +{ + int t = *a; + *a = *b; + *b = t; +} + +/* This function is same in both iterative and recursive*/ +int partition (Vector* arr, __global int* indices, int l, int h) +{ + float x = *get_vec_elem(arr, h); + int i = (l - 1); + + for (int j = l; j <= h- 1; j++) + { + if (*get_vec_elem(arr, j) >= x) + { + i++; + swap (get_vec_elem(arr,i), get_vec_elem(arr,j)); + swap_idx(&indices[i], &indices[j]); + } + } + swap (get_vec_elem(arr, i + 1), get_vec_elem(arr, h)); + swap_idx(&indices[i + 1], &indices[h]); + return (i + 1); +} + +/* A[] --> Array to be sorted, + l --> Starting index, + h --> Ending index */ +void quickSortIterative (Vector* arr, __global int* indices, + __global int *stack, int l, int h) +{ + // Create an auxiliary stack + + // initialize top of stack + int top = -1; + + // push initial values of l and h to stack + stack[ ++top ] = l; + stack[ ++top ] = h; + + // Keep popping from stack while is not empty + while ( top >= 0 ) + { + // Pop h and l + h = stack[ top-- ]; + l = stack[ top-- ]; + + // Set pivot element at its correct position + // in sorted array + int p = partition( arr, indices, l, h ); + + // If there are elements on left side of pivot, + // then push left side to stack + if ( p-1 > l ) + { + stack[ ++top ] = l; + stack[ ++top ] = p - 1; + } + + // If there are elements on right side of pivot, + // then push right side to stack + if ( p+1 < h ) + { + stack[ ++top ] = p + 1; + stack[ ++top ] = h; + } + } +} + +__kernel void topkv2_quicksort(VECTOR_DECLARATION(input), + 
VECTOR_DECLARATION(topk_values), VECTOR_DECLARATION(topk_indices), + __global int* indices, __global int* temp_stack, int k, int n) +{ + Vector input = CONVERT_TO_VECTOR_STRUCT_NO_STEP(input); + Vector topk_values = CONVERT_TO_VECTOR_STRUCT_NO_STEP(topk_values); + Vector topk_indices = CONVERT_TO_VECTOR_STRUCT_NO_STEP(topk_indices); + + for( int i = 0; i < n; ++i ) + { + indices[i] = i; + } + + quickSortIterative(&input, indices, temp_stack, 0, n-1); + + // extract k items. + for(int i = 0; i < k; ++i) + { + *get_vec_elem(&topk_values, i) = *get_vec_elem(&input, i); + *get_vec_elem_int(&topk_indices, i) = indices[i]; + } +} diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/topkv2_radixsort.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/topkv2_radixsort.cl new file mode 100644 index 000000000..cac0c071e --- /dev/null +++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/topkv2_radixsort.cl @@ -0,0 +1,279 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
 IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+// reference:
+// https://code.google.com/archive/p/ocl-radix-sort/source/default/source
+// OpenCL kernel sources for the CLRadixSort class
+// the #include does not exist in OpenCL
+// Copyright Philippe Helluy, Université de Strasbourg, France, 2011, helluy@math.unistra.fr
+// licensed under the GNU Lesser General Public License see http://www.gnu.org/copyleft/lesser.html
+// if you find this software useful you can cite the following work in your reports or articles:
+// Philippe HELLUY, A portable implementation of the radix sort algorithm in OpenCL, 2011.
+// http://hal.archives-ouvertes.fr/hal-00596730
+
+// Reference for floating point radix sort:
+// http://www.codercorner.com/RadixSortRevisited.htm
+
+// compute the histogram for each radix and each virtual processor for the pass
+__kernel void radixsort_histogram(__global float* in_key_buf,
+                                  __global int* d_Histograms,
+                                  const int pass,
+                                  __local int* loc_histo,
+                                  const int n)
+{
+  int it = get_local_id(0); // i local number of the processor
+  int ig = get_global_id(0); // global number = i + g I
+
+  int gr = get_group_id(0); // g group number
+
+  int groups = get_num_groups(0);
+  int items = get_local_size(0);
+
+  // set the local histograms to zero
+  for(int ir=0;ir<_RADIX;ir++){
+    loc_histo[ir * items + it] = 0;
+  }
+
+  barrier(CLK_LOCAL_MEM_FENCE);
+
+  // range of keys that are analyzed by the work item
+  int size= n/groups/items; // size of the sub-list
+  int start= ig * size; // beginning of the sub-list
+
+  unsigned int key;
+  int shortkey,k;
+
+  // compute the index
+  // the computation depends on the transposition
+  for(int j = 0; j < size ; j++) {
+#ifdef TRANSPOSE
+    k= groups * items * j + ig;
+#else
+    k=j+start;
+#endif + + key = *((__global unsigned int*)(in_key_buf + k)); + + // extract the group of _BITS bits of the pass + // the result is in the range 0.._RADIX-1 + shortkey=(( key >> (pass * _BITS)) & (_RADIX-1)); + + // increment the local histogram + loc_histo[shortkey * items + it ]++; + } + + barrier(CLK_LOCAL_MEM_FENCE); + + // copy the local histogram to the global one + for(int ir=0;ir<_RADIX;ir++) { + d_Histograms[items * (ir * groups + gr) + it] = loc_histo[ir * items + it]; + } + + barrier(CLK_GLOBAL_MEM_FENCE); +} + +// initial transpose of the list for improving +// coalescent memory access +__kernel void transpose(const __global int* invect, + __global int* outvect, + const int nbcol, + const int nbrow, + const __global int* inperm, + __global int* outperm, + __local int* blockmat, + __local int* blockperm, + const int tilesize){ + + int i0 = get_global_id(0)*tilesize; // first row index + int j = get_global_id(1); // column index + + int jloc = get_local_id(1); // local column index + + // fill the cache + for(int iloc=0;iloc<tilesize;iloc++){ + int k=(i0+iloc)*nbcol+j; // position in the matrix + blockmat[iloc*tilesize+jloc]=invect[k]; +#ifdef PERMUT + blockperm[iloc*tilesize+jloc]=inperm[k]; +#endif + } + + barrier(CLK_LOCAL_MEM_FENCE); + + // first row index in the transpose + int j0=get_group_id(1)*tilesize; + + // put the cache at the good place + for(int iloc=0;iloc<tilesize;iloc++){ + int kt=(j0+iloc)*nbrow+i0+jloc; // position in the transpose + outvect[kt]=blockmat[jloc*tilesize+iloc]; +#ifdef PERMUT + outperm[kt]=blockperm[jloc*tilesize+iloc]; +#endif + } + +} + +// each virtual processor reorders its data using the scanned histogram +__kernel void radixsort_reorder(__global float* in_key, + __global float* out_key, + __global int* d_Histograms, + const int pass, + __global int* indices_in, + __global int* indices_out, + __local int* loc_histo, + const int n){ + + int it = get_local_id(0); + int ig = get_global_id(0); + + int gr = 
get_group_id(0); + int groups=get_num_groups(0); + int items=get_local_size(0); + + int start= ig *(n/groups/items); + int size= n/groups/items; + + // take the histogram in the cache + for(int ir=0;ir<_RADIX;ir++){ + loc_histo[ir * items + it]= + d_Histograms[items * (ir * groups + gr) + it]; + } + barrier(CLK_LOCAL_MEM_FENCE); + + int newpos,shortkey,k,newpost; + unsigned int key; + + for(int j= 0; j< size;j++){ +#ifdef TRANSPOSE + k= groups * items * j + ig; +#else + k=j+start; +#endif + float org_value = in_key[k]; + key = *(__global unsigned int*)(in_key + k); + shortkey=((key >> (pass * _BITS)) & (_RADIX-1)); + + newpos=loc_histo[shortkey * items + it]; + +#ifdef TRANSPOSE + int ignew,jnew; + ignew= newpos/(n/groups/items); + jnew = newpos%(n/groups/items); + newpost = jnew * (groups*items) + ignew; +#else + newpost=newpos; +#endif + + //d_outKeys[newpost]= key; // killing line !!! + out_key[newpost] = org_value; + +#ifdef PERMUT + indices_out[newpost] = indices_in[k]; +#endif + + newpos++; + loc_histo[shortkey * items + it]=newpos; + } +} + +// perform a parallel prefix sum (a scan) on the local histograms +// (see Blelloch 1990) each workitem worries about two memories +// see also http://http.developer.nvidia.com/GPUGems3/gpugems3_ch39.html +__kernel void radixsort_scanhistograms(__global int* histo, __local int* temp, __global int* globsum) +{ + int it = get_local_id(0); + int ig = get_global_id(0); + int decale = 1; + int n=get_local_size(0) * 2 ; + int gr=get_group_id(0); + + // load input into local memory + // up sweep phase + temp[2*it] = histo[2*ig]; + temp[2*it+1] = histo[2*ig+1]; + + // parallel prefix sum (algorithm of Blelloch 1990) + for (int d = n>>1; d > 0; d >>= 1){ + barrier(CLK_LOCAL_MEM_FENCE); + if (it < d){ + int ai = decale*(2*it+1)-1; + int bi = decale*(2*it+2)-1; + temp[bi] += temp[ai]; + } + decale *= 2; + } + + // store the last element in the global sum vector + // (maybe used in the next step for constructing the global scan) + 
// clear the last element + if (it == 0) { + globsum[gr]=temp[n-1]; + temp[n - 1] = 0; + } + + // down sweep phase + for (int d = 1; d < n; d *= 2){ + decale >>= 1; + barrier(CLK_LOCAL_MEM_FENCE); + + if (it < d){ + int ai = decale*(2*it+1)-1; + int bi = decale*(2*it+2)-1; + + int t = temp[ai]; + temp[ai] = temp[bi]; + temp[bi] += t; + } + + } + barrier(CLK_LOCAL_MEM_FENCE); + + // write results to device memory + + histo[2*ig] = temp[2*it]; + histo[2*ig+1] = temp[2*it+1]; + + barrier(CLK_GLOBAL_MEM_FENCE); + +} + +// use the global sum for updating the local histograms +// each work item updates two values +__kernel void radixsort_pastehistograms( __global int* histo,__global int* globsum) +{ + int ig = get_global_id(0); + int gr=get_group_id(0); + + int s; + + s=globsum[gr]; + + // write results to device memory + histo[2*ig] += s; + histo[2*ig+1] += s; + + barrier(CLK_GLOBAL_MEM_FENCE); +} diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLCastKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLCastKernel.cpp new file mode 100644 index 000000000..b019e8c33 --- /dev/null +++ b/libs/ARMComputeEx/src/core/CL/kernels/CLCastKernel.cpp @@ -0,0 +1,109 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "arm_compute/core/CL/kernels/CLCastKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/CLKernelLibraryEx.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/IAccessWindow.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +using namespace arm_compute; + +CLCastKernel::CLCastKernel() : _input(nullptr), _output(nullptr) {} + +void CLCastKernel::configure(const ICLTensor *input, ICLTensor *output) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::QASYMM8, + DataType::S16, DataType::S32, DataType::F16, + DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QASYMM8, + DataType::S16, DataType::S32, DataType::F16, + DataType::F32); + ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output); + + _input = input; + _output = output; + + constexpr unsigned int num_elems_processed_per_iteration = 16; + + // Set kernel build options + std::set<std::string> build_opts; + build_opts.emplace("-DDATA_TYPE_IN=" + get_cl_type_from_data_type(input->info()->data_type())); + build_opts.emplace("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(output->info()->data_type())); + build_opts.emplace( + ("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration))); + + // Create kernel + if (is_data_type_quantized_asymmetric(input->info()->data_type())) + { + const float scale_in = input->info()->quantization_info().scale; + const int offset_in = input->info()->quantization_info().offset; + build_opts.emplace("-DSCALE_IN=" + float_to_string_with_full_precision(scale_in)); + build_opts.emplace("-DOFFSET_IN=" + support::cpp11::to_string(offset_in)); + + _kernel = 
static_cast<cl::Kernel>( + CLKernelLibraryEx::get().create_kernel("cast_qasymm_in", build_opts)); + } + else if (is_data_type_quantized_asymmetric(output->info()->data_type())) + { + const float scale_in = output->info()->quantization_info().scale; + const int offset_in = output->info()->quantization_info().offset; + build_opts.emplace("-DSCALE_IN=" + float_to_string_with_full_precision(scale_in)); + build_opts.emplace("-DOFFSET_IN=" + support::cpp11::to_string(offset_in)); + + _kernel = static_cast<cl::Kernel>( + CLKernelLibraryEx::get().create_kernel("cast_qasymm_out", build_opts)); + } + else + { + _kernel = static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel("cast", build_opts)); + } + + // Configure kernel window + Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration)); + AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration); + AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration); + update_window_and_padding(win, input_access, output_access); + output_access.set_valid_region(win, input->info()->valid_region()); + + ICLKernel::configure(win); +} + +void CLCastKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + + Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ); + Window slice = collapsed.first_slice_window_3D(); + + do + { + unsigned int idx = 0; + add_3D_tensor_argument(idx, _input, slice); + add_3D_tensor_argument(idx, _output, slice); + enqueue(queue, *this, slice); + } while (collapsed.slide_window_slice_3D(slice)); +} diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLGatherKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLGatherKernel.cpp new file mode 100644 index 000000000..23efafa6a --- /dev/null +++ b/libs/ARMComputeEx/src/core/CL/kernels/CLGatherKernel.cpp @@ -0,0 
+1,142 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "arm_compute/core/CL/kernels/CLGatherKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/CLKernelLibraryEx.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/CL/OpenCL.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +#include <cmath> +#include <cstdlib> +#include <set> +#include <string> + +using namespace arm_compute; + +namespace +{ +constexpr unsigned int num_elems_processed_per_iteration = 16; + +Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2, + const ITensorInfo *output) +{ + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::S32, + DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::S32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S32, + DataType::F32); + + return Status{}; +} + +} // namespace + +CLGatherKernel::CLGatherKernel() : _input1(nullptr), _input2(nullptr), _output(nullptr) {} + +void CLGatherKernel::configure(const ICLTensor *input1, const 
ICLTensor *input2, ICLTensor *output) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::S32); + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input1, output); + + _input1 = input1; + _input2 = input2; + _output = output; + + // Construct kernel name + std::string kernel_name = "gather"; + if (input1->info()->num_dimensions() == 1) + { + kernel_name = "gather_1d"; + } + else if (input1->info()->num_dimensions() == 2) + { + if (_output->info()->num_dimensions() == 1) + { + kernel_name = "gather_1d_out"; + } + } + + // Set kernel build options + std::set<std::string> build_opts; + build_opts.emplace("-DDATA_TYPE_IN1=" + get_cl_type_from_data_type(input1->info()->data_type())); + build_opts.emplace("-DDATA_TYPE_IN2=" + get_cl_type_from_data_type(input2->info()->data_type())); + build_opts.emplace("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(output->info()->data_type())); + + // Create kernel + _kernel = + static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts)); + + // Configure kernel window + const unsigned int num_elems_processed_per_iteration = 1; + Window win = calculate_max_window(*input2->info(), Steps(num_elems_processed_per_iteration)); + output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape())); + + ICLKernel::configure(win); +} + +Status CLGatherKernel::validate(const ITensorInfo *input1, const ITensorInfo *input2, + const ITensorInfo *output) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output); + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input1, input2, output)); + + return Status{}; +} + +void CLGatherKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); + + if (_input1->info()->num_dimensions() == 1) + { + Window slice = window.first_slice_window_1D(); + + unsigned int idx = 
0; + add_1D_tensor_argument(idx, _input1, slice); + add_1D_tensor_argument(idx, _input2, slice); + add_1D_tensor_argument(idx, _output, slice); + enqueue(queue, *this, slice); + } + else if (_input1->info()->num_dimensions() == 2) + { + Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimY); + Window slice = window.collapse_if_possible(ICLKernel::window(), Window::DimX); + + // Set inputs + unsigned int idx = 0; + add_2D_tensor_argument(idx, _input1, window_collapsed); + add_1D_tensor_argument(idx, _input2, slice); + if (_output->info()->num_dimensions() == 1) + { + add_1D_tensor_argument(idx, _output, slice); + } + else + { + add_2D_tensor_argument(idx, _output, window_collapsed); + } + enqueue(queue, *this, slice); + } +} diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLPixelWiseDivisionKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLPixelWiseDivisionKernel.cpp new file mode 100644 index 000000000..a3e0163de --- /dev/null +++ b/libs/ARMComputeEx/src/core/CL/kernels/CLPixelWiseDivisionKernel.cpp @@ -0,0 +1,322 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "arm_compute/core/CL/kernels/CLPixelWiseDivisionKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/CLKernelLibraryEx.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/CL/OpenCL.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +#include <cmath> +#include <cstdlib> +#include <set> +#include <string> + +using namespace arm_compute; + +namespace +{ +constexpr unsigned int num_elems_processed_per_iteration = 16; + +Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2, + const ITensorInfo *output, float scale, ConvertPolicy overflow_policy, + RoundingPolicy rounding_policy) +{ + ARM_COMPUTE_UNUSED(overflow_policy); + ARM_COMPUTE_UNUSED(rounding_policy); + + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::QS8, + DataType::QS16, DataType::S16, DataType::F16, + DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::QS8, + DataType::QS16, DataType::S16, DataType::F16, + DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(scale < 0, "Scale cannot be negative."); + + const TensorShape &out_shape = + TensorShape::broadcast_shape(input1->tensor_shape(), input2->tensor_shape()); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, + "Inputs are not broadcast compatible"); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input1, input2); + + if (is_data_type_fixed_point(input1->data_type())) + { + // All data types must be all QS8 or all QS16 + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input1, input2); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(scale != 1, + "Unsupported scaling factor for QS8/QS16. 
Scale must be 1."); + } + + // Validate in case of configured output + if (output->total_size() > 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QS8, + DataType::QS16, DataType::S16, + DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + output->data_type() == DataType::U8 && + (input1->data_type() != DataType::U8 || input2->data_type() != DataType::U8), + "Output can only be U8 if both inputs are U8"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + detail::have_different_dimensions(out_shape, output->tensor_shape(), 0), + "Wrong shape for output"); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input1, output); + if (is_data_type_fixed_point(input1->data_type())) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input1, output); + } + } + + return Status{}; +} + +std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input1, ITensorInfo *input2, + ITensorInfo *output) +{ + const std::pair<TensorShape, ValidRegion> broadcast_pair = + ITensorInfo::broadcast_shape_and_valid_region(*input1, *input2); + const TensorShape &out_shape = broadcast_pair.first; + const ValidRegion &valid_region = broadcast_pair.second; + + // Auto initialize output if not initialized + { + set_shape_if_empty(*output, out_shape); + + if (input1->data_type() == DataType::S16 || input2->data_type() == DataType::S16) + { + set_format_if_unknown(*output, Format::S16); + } + else if (input1->data_type() == DataType::F32 || input2->data_type() == DataType::F32) + { + set_format_if_unknown(*output, Format::F32); + } + } + + Window win = calculate_max_window(valid_region, Steps(num_elems_processed_per_iteration)); + Window win_input1 = win.broadcast_if_dimension_le_one(*input1); + Window win_input2 = win.broadcast_if_dimension_le_one(*input2); + + AccessWindowHorizontal input1_access(input1, 0, num_elems_processed_per_iteration); + AccessWindowHorizontal input2_access(input2, 0, num_elems_processed_per_iteration); + 
 AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
+
+  bool window_changed = update_window_and_padding(win_input1, input1_access) ||
+                        update_window_and_padding(win_input2, input2_access) ||
+                        update_window_and_padding(win, output_access);
+
+  output_access.set_valid_region(win, valid_region);
+
+  Status err = (window_changed)
+                   ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!")
+                   : Status{};
+  return std::make_pair(err, win);
+}
+} // namespace
+
+CLPixelWiseDivisionKernel::CLPixelWiseDivisionKernel()
+    : _input1(nullptr), _input2(nullptr), _output(nullptr)
+{
+}
+
+void CLPixelWiseDivisionKernel::configure(const ICLTensor *input1, const ICLTensor *input2,
+                                          ICLTensor *output, float scale,
+                                          ConvertPolicy overflow_policy,
+                                          RoundingPolicy rounding_policy)
+{
+  ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
+  ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input1->info(), input2->info(), output->info(),
+                                                scale, overflow_policy, rounding_policy));
+
+  // Configure kernel window
+  auto win_config = validate_and_configure_window(input1->info(), input2->info(), output->info());
+  ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+
+  _input1 = input1;
+  _input2 = input2;
+  _output = output;
+
+  int scale_int = -1;
+  // Extract sign, exponent and mantissa
+  int exponent = 0;
+  float normalized_mantissa = std::frexp(scale, &exponent);
+  // Use int scaling if factor is equal to 1/2^n for 0 <= n <= 15
+  // frexp returns 0.5 as mantissa, so for scale = 1/2^n the exponent is 1 - n, i.e. in the range
+  // -14 <= e <= 1
+  // Moreover, it will be negative (or 1) as we deal with 1/2^n
+  if ((normalized_mantissa == 0.5f) && (-14 <= exponent) && (exponent <= 1))
+  {
+    // Store the positive exponent.
We know that we compute 1/2^n + // Additionally we need to subtract 1 to compensate that frexp used a mantissa of 0.5 + scale_int = std::abs(exponent - 1); + } + + std::string data_type; + std::string compute_type; + // Check if it has float inputs and output + if (is_data_type_float(input1->info()->data_type()) || + is_data_type_float(input2->info()->data_type())) + { + scale_int = -1; + compute_type = (input1->info()->data_type() == DataType::F32 || + input2->info()->data_type() == DataType::F32) + ? "float" + : "half"; + data_type = "DATA_TYPE_FLOAT"; + } + else + { + if (input1->info()->data_type() == DataType::S16 || + input2->info()->data_type() == DataType::S16) + { + compute_type = "int"; + } + else if (input1->info()->data_type() == DataType::QS8) + { + compute_type = "qs8"; + } + else if (input1->info()->data_type() == DataType::QS16) + { + compute_type = "qs16"; + } + else + { + compute_type = "ushort"; + } + data_type = "DATA_TYPE_INT"; + } + + // Construct kernel name + std::string kernel_name = "pixelwise_div"; + kernel_name += (scale_int >= 0) ? "_int" : "_float"; + + // Set kernel build options + std::set<std::string> build_opts; + build_opts.emplace( + (overflow_policy == ConvertPolicy::WRAP || is_data_type_float(output->info()->data_type())) + ? "-DWRAP" + : "-DSATURATE"); + build_opts.emplace((rounding_policy == RoundingPolicy::TO_ZERO) ? 
"-DROUND=_rtz" + : "-DROUND=_rte"); + if (is_data_type_fixed_point(input1->info()->data_type())) + { + build_opts.emplace("-DFIXED_POINT_POSITION=" + + support::cpp11::to_string(input1->info()->fixed_point_position())); + } + build_opts.emplace("-DDATA_TYPE_IN1=" + get_cl_type_from_data_type(input1->info()->data_type())); + build_opts.emplace("-DDATA_TYPE_IN2=" + get_cl_type_from_data_type(input2->info()->data_type())); + build_opts.emplace("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(output->info()->data_type())); + build_opts.emplace("-DDATA_TYPE_RES=" + compute_type); + build_opts.emplace("-D" + data_type); + + // Create kernel + _kernel = + static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts)); + + // Set scale argument + unsigned int idx = 3 * num_arguments_per_3D_tensor(); // Skip the inputs and output parameters + + if (scale_int >= 0) + { + _kernel.setArg(idx++, scale_int); + } + else + { + _kernel.setArg(idx++, scale); + } + + ICLKernel::configure(win_config.second); +} + +Status CLPixelWiseDivisionKernel::validate(const ITensorInfo *input1, const ITensorInfo *input2, + const ITensorInfo *output, float scale, + ConvertPolicy overflow_policy, + RoundingPolicy rounding_policy) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output); + ARM_COMPUTE_RETURN_ON_ERROR( + validate_arguments(input1, input2, output, scale, overflow_policy, rounding_policy)); + ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input1->clone().get(), + input2->clone().get(), + output->clone().get()) + .first); + + return Status{}; +} + +void CLPixelWiseDivisionKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + + const TensorShape &in_shape1 = _input1->info()->tensor_shape(); + const TensorShape &in_shape2 = _input2->info()->tensor_shape(); + const TensorShape &out_shape = _output->info()->tensor_shape(); + 
+ bool can_collapse = true; + if (std::min(in_shape1.total_size(), in_shape2.total_size()) > 1) + { + can_collapse = + (std::min(in_shape1.num_dimensions(), in_shape2.num_dimensions()) > Window::DimZ); + for (size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); ++d) + { + can_collapse = (in_shape1[d] == in_shape2[d]); + } + } + + bool has_collapsed = false; + Window collapsed = + can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed) + : window; + + const TensorShape &in_shape1_collapsed = + has_collapsed ? in_shape1.collapsed_from(Window::DimZ) : in_shape1; + const TensorShape &in_shape2_collapsed = + has_collapsed ? in_shape2.collapsed_from(Window::DimZ) : in_shape2; + + Window slice = collapsed.first_slice_window_3D(); + Window slice_input1 = slice.broadcast_if_dimension_le_one(in_shape1_collapsed); + Window slice_input2 = slice.broadcast_if_dimension_le_one(in_shape2_collapsed); + + do + { + unsigned int idx = 0; + add_3D_tensor_argument(idx, _input1, slice_input1); + add_3D_tensor_argument(idx, _input2, slice_input2); + add_3D_tensor_argument(idx, _output, slice); + enqueue(queue, *this, slice); + + collapsed.slide_window_slice_3D(slice_input1); + collapsed.slide_window_slice_3D(slice_input2); + } while (collapsed.slide_window_slice_3D(slice)); +} + +BorderSize CLPixelWiseDivisionKernel::border_size() const +{ + const unsigned int replicateSize = + _output->info()->dimension(0) - + std::min(_input1->info()->dimension(0), _input2->info()->dimension(0)); + const unsigned int border = + std::min<unsigned int>(num_elems_processed_per_iteration - 1U, replicateSize); + return BorderSize(0, border, 0, 0); +} diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLReduceMaxKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLReduceMaxKernel.cpp new file mode 100644 index 000000000..168b246bf --- /dev/null +++ b/libs/ARMComputeEx/src/core/CL/kernels/CLReduceMaxKernel.cpp @@ -0,0 +1,129 @@ +/* + * Copyright (c) 
2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "arm_compute/core/CL/kernels/CLReduceMaxKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/CLKernelLibraryEx.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/CL/OpenCL.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +#include <cmath> +#include <cstdlib> +#include <set> +#include <string> + +using namespace arm_compute; + +namespace +{ +constexpr unsigned int num_elems_processed_per_iteration = 16; + +Status validate_arguments(const ITensorInfo *input, int32_t axis, const ITensorInfo *output) +{ + // We can handle for simple case only + // Input rank: 2 + // Output rank: 1 + // Axis: one axis value, restrict to 1 + + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis != 1, "Axis only allowed 1"); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->tensor_shape().total_size() == 0, + "Inputs are not broadcast compatible"); + + // Validate in case of configured output + if (output->total_size() > 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, 
DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->data_type() != input->data_type(), + "Output same type allowed for input and output"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->tensor_shape().num_dimensions() != 1, + "Only support for output dimension 1"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->tensor_shape().num_dimensions() != 2, + "Only support for input dimension 2"); + } + + return Status{}; +} + +} // namespace + +CLReduceMaxKernel::CLReduceMaxKernel() : _input(nullptr), _output(nullptr), _axis(0) {} + +void CLReduceMaxKernel::configure(const ICLTensor *input, int32_t axis, ICLTensor *output) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), axis, output->info())); + + _input = input; + _output = output; + _axis = axis; + + // Configure kernel window + int cols = _input->info()->tensor_shape()[0]; + int rows = _input->info()->tensor_shape()[1]; + Window win; + win.set(0, Window::Dimension(0, cols, 1)); + win.set(1, Window::Dimension(0, rows, 1)); + + // Construct kernel name + std::string kernel_name = "reduce_max"; + + // Set kernel build options + std::set<std::string> build_opts; + build_opts.emplace("-DWIDTH=" + support::cpp11::to_string(cols)); + + // Create kernel + _kernel = + static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts)); + + ICLKernel::configure(win); +} + +Status CLReduceMaxKernel::validate(const ITensorInfo *input, int32_t axis, + const ITensorInfo *output) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, axis, output)); + + return Status{}; +} + +void CLReduceMaxKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + + Window window_input = window; + Window slice_input = window_input.first_slice_window_1D(); + + do + { + Window 
slice_output = slice_input.shift_dimensions(1); + unsigned int idx = 0; + add_1D_tensor_argument(idx, _input, slice_input); + add_1D_tensor_argument(idx, _output, slice_output); + enqueue(queue, *this, slice_input); + + } while (window_input.slide_window_slice_1D(slice_input)); +} diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLReductionMeanKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLReductionMeanKernel.cpp new file mode 100644 index 000000000..84a77122d --- /dev/null +++ b/libs/ARMComputeEx/src/core/CL/kernels/CLReductionMeanKernel.cpp @@ -0,0 +1,198 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2017-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "arm_compute/core/CL/kernels/CLReductionMeanKernel.h" + +#include "arm_compute/core/AccessWindowStatic.h" +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/CLKernelLibraryEx.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/FixedPoint.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +#include "support/ToolchainSupport.h" + +using namespace arm_compute; + +namespace +{ +Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, + std::vector<uint32_t> axis) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON(input->data_layout() != DataLayout::NCHW); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis.size() >= TensorShape::num_max_dimensions, + "Reduction axis greater than max number of dimensions"); + + std::vector<uint32_t>::const_iterator it; + bool axis_w = false; + bool axis_h = false; + for (it = axis.begin(); it != axis.end(); ++it) + { + if ((*it) == 0) + { + axis_w = true; + } + else if ((*it) == 1) + { + axis_h = true; + } + else + { + ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Unsupported axis!"); + } + } + // TODO Other axises (currently, only axises for both width and height are supported.) 
+ if (!axis_w || !axis_h) + { + ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Unsupported axis!"); + } + + if (output->total_size() != 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + ARM_COMPUTE_RETURN_ERROR_ON(output->data_layout() != DataLayout::NCHW); + } + + return Status{}; +} + +std::tuple<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, + std::vector<uint32_t> axis) +{ + // Output tensor auto initialization if not yet initialized + TensorShape output_shape{input->tensor_shape()}; + output_shape.set(0, 1); + output_shape.set(1, 1); + auto_init_if_empty(*output, output_shape, output->num_channels(), input->data_type(), + input->fixed_point_position()); + + // Configure kernel window + constexpr unsigned int num_elems_processed_per_iteration_x = 8; // step + const unsigned int num_elems_processed_per_iteration_y = input->dimension(1); + + Window win = calculate_max_window( + *input, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); + AccessWindowRectangle input_access(input, 0, 0, num_elems_processed_per_iteration_x, + num_elems_processed_per_iteration_y); + AccessWindowHorizontal output_access(output, 0, 1); + bool window_changed = update_window_and_padding(win, input_access, output_access); + output_access.set_valid_region(win, output->valid_region()); + + Status err = (window_changed) + ? 
ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") + : Status{}; + + return std::make_tuple(err, win); +} +} // namespace + +CLReductionMeanKernel::CLReductionMeanKernel() + : _input(nullptr), _output(nullptr), _reduction_axis(), _border_size() +{ +} + +BorderSize CLReductionMeanKernel::border_size() const { return _border_size; } + +void CLReductionMeanKernel::configure(const ICLTensor *input, ICLTensor *output, + std::vector<uint32_t> axis) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), axis)); + + _input = input; + _output = output; + _reduction_axis = axis; + + constexpr unsigned int num_elems_processed_per_iteration_x = 8; // step + + // Set border size + _border_size = BorderSize( + ceil_to_multiple(input->info()->dimension(0), num_elems_processed_per_iteration_x) - + input->info()->dimension(0)); + + // Set build options + std::set<std::string> build_opts; + build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()))); + // build_opts.emplace(("-DVEC_SIZE=" + + // support::cpp11::to_string(num_elems_processed_per_iteration))); + if (is_data_type_fixed_point(input->info()->data_type())) + { + build_opts.emplace("-DFIXED_POINT_POSITION=" + + support::cpp11::to_string(input->info()->fixed_point_position())); + } + + // Create kernel + _kernel = + static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel("reduction_mean", build_opts)); + + // Configure kernel window + auto win_config = validate_and_configure_window(_input->info(), _output->info(), axis); + + ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config)); + + ICLKernel::configure(std::get<1>(win_config)); +} + +Status CLReductionMeanKernel::validate(const ITensorInfo *input, const ITensorInfo *output, + std::vector<uint32_t> axis) +{ + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, axis)); + ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>( + 
validate_and_configure_window(input->clone().get(), output->clone().get(), axis))); + + return Status{}; +} + +void CLReductionMeanKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); + + // Set out window + Window out_window(window); + out_window.set(Window::DimX, Window::Dimension(0, 0, 0)); + + // Get first input and output slices + Window in_slice = window.first_slice_window_2D(); + Window out_slice = out_window.first_slice_window_2D(); + + // Set local sums buffer + // TODO work_group + unsigned int local_sum_size = _lws_hint[0] * _input->info()->element_size(); + + unsigned int idx = 2 * num_arguments_per_2D_tensor(); + _kernel.setArg(idx++, local_sum_size, nullptr); + _kernel.setArg<cl_int>(idx++, static_cast<cl_int>(_input->info()->dimension(1))); // height + _kernel.setArg<cl_int>(idx++, static_cast<cl_int>(_input->info()->dimension(0) * + _input->info()->dimension(1))); // divider + + do + { + unsigned int idx = 0; + add_2D_tensor_argument(idx, _input, in_slice); + in_slice.set_dimension_step(Window::DimY, _input->info()->dimension(1)); + add_2D_tensor_argument(idx, _output, out_slice); + enqueue(queue, *this, in_slice); + } while (window.slide_window_slice_2D(in_slice) && window.slide_window_slice_2D(out_slice)); +} diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLStridedSliceKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLStridedSliceKernel.cpp new file mode 100644 index 000000000..80ffd423a --- /dev/null +++ b/libs/ARMComputeEx/src/core/CL/kernels/CLStridedSliceKernel.cpp @@ -0,0 +1,304 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2017 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "arm_compute/core/CL/kernels/CLStridedSliceKernel.h" + +#include "arm_compute/core/AccessWindowStatic.h" +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/CLKernelLibraryEx.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/CL/OpenCL.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/IAccessWindow.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +#include <string> + +using namespace std; +using namespace arm_compute; + +static const int32_t maxDim = 4; + +CLStridedSliceKernel::CLStridedSliceKernel() + : _input(nullptr), _output(nullptr), _beginData(nullptr), _endData(nullptr), + _stridesData(nullptr), _beginMask(0), _endMask(0), _shrinkAxisMask(0) +{ +} + +Status CLStridedSliceKernel::validate(const ITensorInfo *input, const ITensorInfo *output, + const ITensorInfo *begin, const ITensorInfo *end, + const ITensorInfo *strides, int32_t beginMask, + int32_t endMask, int32_t shrinkAxisMask) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, begin, end, strides); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN( + input, 1, DataType::U8, DataType::S8, DataType::QS8, DataType::QASYMM8, DataType::U16, + DataType::S16, DataType::QS16, DataType::U32, DataType::S32, DataType::F16, DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(begin, 1, DataType::S32); + 
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(end, 1, DataType::S32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(strides, 1, DataType::S32); + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + + ARM_COMPUTE_ERROR_ON(begin->num_dimensions() != 1 || begin->dimension(0) > 4); + ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(begin->tensor_shape(), end->tensor_shape(), + strides->tensor_shape()); + + return Status{}; +} + +// Return the index for the first element along that axis. This index will be a +// positive integer between [0, axisSize - 1] that can be used to index +// directly into the data. +inline int32_t StartForAxis(int32_t beginMask, int32_t begin, int32_t stride, + const TensorShape &inputShape, int32_t axis) +{ + // Begin with the specified index + int32_t start = begin; + + // beginMask override + if (beginMask & 1 << axis) + { + if (stride > 0) + { + // Forward iteration - use the first element. These values will get + // clamped below (Note: We could have set them to 0 and axisSize-1, but + // use lowest() and max() to maintain symmetry with StopForAxis()) + start = std::numeric_limits<int32_t>::lowest(); + } + else + { + // Backward iteration - use the last element. + start = std::numeric_limits<int32_t>::max(); + } + } + + // Handle negative indices + int32_t axisSize = inputShape[axis]; + if (start < 0) + { + start += axisSize; + } + + // Clamping + start = arm_compute::utility::clamp(start, 0, axisSize - 1); + + return start; +} + +// Return the "real" index for the end of iteration along that axis. This is an +// "end" in the traditional C sense, in that it points to one past the last +// element. ie. So if you were iterating through all elements of a 1D array of +// size 4, this function would return 4 as the stop, because it is one past the +// "real" indices of 0, 1, 2 & 3. 
+inline int32_t StopForAxis(int32_t endMask, int32_t end, int32_t stride, + const TensorShape &inputShape, int32_t axis) +{ + // Begin with the specified index + int32_t stop = end; + + // endMask override + if (endMask & (1 << axis)) + { + if (stride > 0) + { + // Forward iteration - use the last element. These values will get + // clamped below + stop = std::numeric_limits<int32_t>::max(); + } + else + { + // Backward iteration - use the first element. + stop = std::numeric_limits<int32_t>::lowest(); + } + } + + // Handle negative indices + int32_t axisSize = inputShape[axis]; + if (stop < 0) + { + stop += axisSize; + } + + // Clamping + // Because the end index points one past the last element, we need slightly + // different clamping ranges depending on the direction. + if (stride > 0) + { + // Forward iteration + stop = arm_compute::utility::clamp(stop, 0, axisSize); + } + else + { + // Backward iteration + stop = arm_compute::utility::clamp(stop, -1, axisSize - 1); + } + + return stop; +} + +inline int32_t offset4D(const TensorShape &shape, int32_t b, int32_t d, int32_t h, int32_t w) +{ + int32_t offset = b * shape[2] * shape[1] * shape[0]; + offset += d * shape[1] * shape[0]; + offset += h * shape[0]; + offset += w; + return offset; +} + +inline int32_t getOutDim(int32_t start, int32_t stop, int32_t stride) +{ + int32_t ret = 0; + if (stride > 0) + { + ret = ((stop - start - 1) / stride) + 1; + } + else + { + ret = ((stop - start + 1) / stride) + 1; + } + ARM_COMPUTE_ERROR_ON_MSG(ret < 0, "The dimension must be the natural number"); + return ret; +} + +void CLStridedSliceKernel::configure(const ICLTensor *input, ICLTensor *output, + ICLTensor *beginData, ICLTensor *endData, + ICLTensor *stridesData, int32_t beginMask, int32_t endMask, + int32_t shrinkAxisMask) +{ + ARM_COMPUTE_ERROR_THROW_ON(validate(input->info(), output->info(), beginData->info(), + endData->info(), stridesData->info(), beginMask, endMask, + shrinkAxisMask)); + + _input = input; + _output 
= output; + _beginData = beginData; + _endData = endData; + _stridesData = stridesData; + _beginMask = beginMask; + _endMask = endMask; + _shrinkAxisMask = shrinkAxisMask; + + constexpr unsigned int num_elems_processed_per_iteration = 1; + + // Set kernel build options + std::set<std::string> build_opts; + build_opts.emplace("-DELEMENT_DATA_TYPE=" + + get_cl_type_from_data_type(input->info()->data_type())); + build_opts.emplace("-DELEMENT_SIZE=" + support::cpp11::to_string(input->info()->element_size())); + + // Create kernel + _kernel = + static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel("strided_slice", build_opts)); + + // Create output's window without padding + TensorShape collapsed = output->info()->tensor_shape(); + collapsed.collapse(4); + TensorInfo info = *output->info(); + info.set_tensor_shape(collapsed); + Window win = calculate_max_window(info, Steps(num_elems_processed_per_iteration)); + + ICLKernel::configure(win); +} + +void CLStridedSliceKernel::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); + + // Create input window + TensorShape collapsed = _input->info()->tensor_shape(); + collapsed.collapse(4); + TensorInfo info = *_input->info(); + info.set_tensor_shape(collapsed); + Window win_in = calculate_max_window(info, Steps(_input->info()->tensor_shape().total_size())); + + _beginData->map(queue); + _endData->map(queue); + _stridesData->map(queue); + + std::vector<int32_t> dimsIn; + std::vector<int32_t> dimsOut; + std::vector<int32_t> starts; + std::vector<int32_t> stops; + std::vector<int32_t> strides; + + for (uint32_t n = 0; n < _beginData->info()->tensor_shape().total_size(); ++n) + { + const TensorShape shape = _input->info()->tensor_shape(); + starts.emplace_back( + StartForAxis(_beginMask, reinterpret_cast<int32_t *>(_beginData->buffer())[n], + reinterpret_cast<int32_t *>(_stridesData->buffer())[n], shape, 
n)); + + stops.emplace_back(StopForAxis(_endMask, reinterpret_cast<int32_t *>(_endData->buffer())[n], + reinterpret_cast<int32_t *>(_stridesData->buffer())[n], shape, + n)); + + strides.emplace_back(reinterpret_cast<int32_t *>(_stridesData->buffer())[n]); + dimsIn.emplace_back(shape[n]); + dimsOut.emplace_back(getOutDim(starts[n], stops[n], strides[n])); + } + + for (uint32_t n = _beginData->info()->tensor_shape().total_size(); n < 4; n++) + { + starts.emplace_back(0); + stops.emplace_back(1); + strides.emplace_back(1); + dimsIn.emplace_back(1); + dimsOut.emplace_back(1); + } + // TODO: Apply shrinkAxisMask + + _beginData->unmap(queue); + _stridesData->unmap(queue); + _endData->unmap(queue); + + // Set parameters + unsigned int idx = 2 * num_arguments_per_1D_tensor(); // Skip the input and output parameters + const cl_int4 dimsInArg = {{ + static_cast<cl_int>(dimsIn[0]), static_cast<cl_int>(dimsIn[1]), + static_cast<cl_int>(dimsIn[2]), static_cast<cl_int>(dimsIn[3]), + }}; + _kernel.setArg<cl_int4>(idx++, dimsInArg); + + const cl_int4 dimsOutArg = {{ + static_cast<cl_int>(dimsOut[0]), static_cast<cl_int>(dimsOut[1]), + static_cast<cl_int>(dimsOut[2]), static_cast<cl_int>(dimsOut[3]), + }}; + _kernel.setArg<cl_int4>(idx++, dimsOutArg); + + const cl_int4 startsArg = {{ + static_cast<cl_int>(starts[0]), static_cast<cl_int>(starts[1]), + static_cast<cl_int>(starts[2]), static_cast<cl_int>(starts[3]), + }}; + _kernel.setArg<cl_int4>(idx++, startsArg); + + const cl_int4 stridesArg = {{ + static_cast<cl_int>(strides[0]), static_cast<cl_int>(strides[1]), + static_cast<cl_int>(strides[2]), static_cast<cl_int>(strides[3]), + }}; + _kernel.setArg<cl_int4>(idx++, stridesArg); + + // TODO: Apply slicing output's window + idx = 0; + add_1D_tensor_argument(idx, _input, win_in); + add_1D_tensor_argument(idx, _output, window); + + enqueue(queue, *this, window); +} diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLTopKV2Kernel.cpp 
b/libs/ARMComputeEx/src/core/CL/kernels/CLTopKV2Kernel.cpp new file mode 100644 index 000000000..d95b485b7 --- /dev/null +++ b/libs/ARMComputeEx/src/core/CL/kernels/CLTopKV2Kernel.cpp @@ -0,0 +1,475 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2017 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "arm_compute/core/CL/kernels/CLTopKV2Kernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/CLKernelLibraryEx.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +#include <climits> +#include <cassert> + +namespace arm_compute +{ +//////////////////////////////////////////////////////////////////////////////// +CLTopKV2Single::CLTopKV2Single() : _input(nullptr), _topk_values(nullptr), _topk_indices(nullptr) {} + +void CLTopKV2Single::configure(ICLTensor *input, ICLTensor *topk_values, ICLTensor *topk_indices, + cl::Buffer *indices, cl::Buffer *temp_stack, int k, int n) +{ + ARM_COMPUTE_ERROR_ON(input == nullptr && indices == nullptr); + ARM_COMPUTE_ERROR_ON(topk_values == nullptr && topk_indices == nullptr); + ARM_COMPUTE_ERROR_ON(n == 0); + + _input = input; + _topk_values = topk_values; + _topk_indices = topk_indices; + + // Set kernel build options + std::set<std::string> build_opts; + + // 
Create kernel + _kernel = static_cast<cl::Kernel>( + CLKernelLibraryEx::get().create_kernel("topkv2_quicksort", build_opts)); + + unsigned int idx = 3 * num_arguments_per_1D_tensor(); + _kernel.setArg(idx++, *indices); + _kernel.setArg(idx++, *temp_stack); + _kernel.setArg<cl_int>(idx++, k); + _kernel.setArg<cl_int>(idx++, n); + + // Configure kernel window + Window win; + win.set(0, Window::Dimension(0, 1, 1)); + ICLKernel::configure(win); +} + +void CLTopKV2Single::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); + + unsigned int idx = 0; + add_1D_tensor_argument(idx, _input, window); + add_1D_tensor_argument(idx, _topk_values, window); + add_1D_tensor_argument(idx, _topk_indices, window); + + enqueue(queue, *this, window); +} + +//////////////////////////////////////////////////////////////////////////////// +CLTopKV2Init::CLTopKV2Init() : _input(nullptr) {} + +void CLTopKV2Init::configure(ICLTensor *input, cl::Buffer *in_key_buf, cl::Buffer *in_ind_buf, + int n) +{ + ARM_COMPUTE_ERROR_ON(input == nullptr && in_key_buf == nullptr); + ARM_COMPUTE_ERROR_ON(in_ind_buf == nullptr); + ARM_COMPUTE_ERROR_ON(n == 0); + + _input = input; + + // Set kernel build options + std::set<std::string> build_opts; + + // Create kernel + _kernel = + static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel("topkv2_init", build_opts)); + + unsigned int idx = num_arguments_per_1D_tensor(); + _kernel.setArg(idx++, *in_key_buf); + _kernel.setArg(idx++, *in_ind_buf); + _kernel.setArg<cl_int>(idx++, n); + + // Configure kernel window + Window win; + win.set(0, Window::Dimension(0, n, 1)); + ICLKernel::configure(win); +} + +void CLTopKV2Init::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); + + unsigned int idx = 0; + 
add_1D_tensor_argument(idx, _input, window); + + enqueue(queue, *this, window); +} + +//////////////////////////////////////////////////////////////////////////////// +// This kernel makes a histogram of radix for each work item. +CLRadixSortHistogram::CLRadixSortHistogram() : _pass(0), _in_key_buf(nullptr) {} + +void CLRadixSortHistogram::configure(cl::Buffer *hist_buf, int bits, int n) +{ + ARM_COMPUTE_ERROR_ON(hist_buf == nullptr); + + unsigned int radix = 1 << bits; + // Set kernel build options + std::set<std::string> build_opts; + build_opts.emplace("-D_BITS=" + support::cpp11::to_string(bits)); + build_opts.emplace("-D_RADIX=" + support::cpp11::to_string(radix)); + build_opts.emplace("-DPERMUT=1"); + + // Create kernel + _kernel = static_cast<cl::Kernel>( + CLKernelLibraryEx::get().create_kernel("radixsort_histogram", build_opts)); + + int loc_histo_size = radix * _ITEMS * sizeof(cl_int); + + unsigned int idx = 1; + _kernel.setArg(idx++, *hist_buf); + + idx = 3; + _kernel.setArg(idx++, loc_histo_size, nullptr); + _kernel.setArg<cl_int>(idx++, n); + + // Configure kernel window + Window win; + win.set(0, Window::Dimension(0, _GROUPS * _ITEMS, 1)); + ICLKernel::configure(win); +} + +void CLRadixSortHistogram::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); + + _kernel.setArg(0, *_in_key_buf); + _kernel.setArg<cl_int>(2, _pass); + + cl::NDRange lws = cl::NDRange(_ITEMS, 1); + + enqueue(queue, *this, window, lws); +} + +//////////////////////////////////////////////////////////////////////////////// +CLRadixSortScanHistogram::CLRadixSortScanHistogram() {} + +void CLRadixSortScanHistogram::configure(cl::Buffer *hist_buf, cl::Buffer *glob_sum_buf, int bits) +{ + ARM_COMPUTE_ERROR_ON(hist_buf == nullptr && glob_sum_buf == nullptr); + + unsigned int radix = 1 << bits; + // Set kernel build options + std::set<std::string> build_opts; + 
build_opts.emplace("-D_BITS=" + support::cpp11::to_string(bits)); + build_opts.emplace("-D_RADIX=" + support::cpp11::to_string(radix)); + build_opts.emplace("-DPERMUT=1"); + + // Create kernel + _kernel = static_cast<cl::Kernel>( + CLKernelLibraryEx::get().create_kernel("radixsort_scanhistograms", build_opts)); + + int temp_size = + std::max<uint32_t>(_HISTOSPLIT, _ITEMS * _GROUPS * radix / _HISTOSPLIT) * sizeof(cl_uint); + + unsigned int idx = 0; + _kernel.setArg(idx++, *hist_buf); + _kernel.setArg(idx++, temp_size, nullptr); + _kernel.setArg(idx++, *glob_sum_buf); + + // Configure kernel window + Window win; + win.set(0, Window::Dimension(0, radix * _GROUPS * _ITEMS / 2, 1)); + ICLKernel::configure(win); +} + +void CLRadixSortScanHistogram::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); + + const unsigned int gws_x = (window.x().end() - window.x().start()) / window.x().step(); + cl::NDRange lws = cl::NDRange(gws_x / _HISTOSPLIT, 1); + + enqueue(queue, *this, window, lws); +} + +//////////////////////////////////////////////////////////////////////////////// +CLRadixSortGlobalScanHistogram::CLRadixSortGlobalScanHistogram() {} + +void CLRadixSortGlobalScanHistogram::configure(cl::Buffer *glob_sum_buf, cl::Buffer *temp_buf, + int bits) +{ + ARM_COMPUTE_ERROR_ON(glob_sum_buf == nullptr && temp_buf == nullptr); + + unsigned int radix = 1 << bits; + // Set kernel build options + std::set<std::string> build_opts; + build_opts.emplace("-D_BITS=" + support::cpp11::to_string(bits)); + build_opts.emplace("-D_RADIX=" + support::cpp11::to_string(radix)); + build_opts.emplace("-DPERMUT=1"); + + // Create kernel + _kernel = static_cast<cl::Kernel>( + CLKernelLibraryEx::get().create_kernel("radixsort_scanhistograms", build_opts)); + + int temp_size = + std::max<uint32_t>(_HISTOSPLIT, _ITEMS * _GROUPS * radix / _HISTOSPLIT) * sizeof(cl_uint); + + unsigned 
int idx = 0; + _kernel.setArg(idx++, *glob_sum_buf); + _kernel.setArg(idx++, temp_size, nullptr); + _kernel.setArg(idx++, *temp_buf); + + // Configure kernel window + Window win; + win.set(0, Window::Dimension(0, _HISTOSPLIT / 2, 1)); + ICLKernel::configure(win); +} + +void CLRadixSortGlobalScanHistogram::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); + + const unsigned int gws_x = (window.x().end() - window.x().start()) / window.x().step(); + cl::NDRange lws = cl::NDRange(gws_x, 1); + + enqueue(queue, *this, window, lws); +} + +//////////////////////////////////////////////////////////////////////////////// +CLRadixSortPasteHistogram::CLRadixSortPasteHistogram() {} + +void CLRadixSortPasteHistogram::configure(cl::Buffer *hist_buf, cl::Buffer *glob_sum_buf, int bits) +{ + ARM_COMPUTE_ERROR_ON(hist_buf == nullptr && glob_sum_buf == nullptr); + + unsigned int radix = 1 << bits; + // Set kernel build options + std::set<std::string> build_opts; + build_opts.emplace("-D_BITS=" + support::cpp11::to_string(bits)); + build_opts.emplace("-D_RADIX=" + support::cpp11::to_string(radix)); + build_opts.emplace("-DPERMUT=1"); + + // Create kernel + _kernel = static_cast<cl::Kernel>( + CLKernelLibraryEx::get().create_kernel("radixsort_pastehistograms", build_opts)); + + unsigned int idx = 0; + _kernel.setArg(idx++, *hist_buf); + _kernel.setArg(idx++, *glob_sum_buf); + + // Configure kernel window + Window win; + win.set(0, Window::Dimension(0, radix * _GROUPS * _ITEMS / 2, 1)); + ICLKernel::configure(win); +} + +void CLRadixSortPasteHistogram::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); + + const unsigned int gws_x = (window.x().end() - window.x().start()) / window.x().step(); + cl::NDRange lws = cl::NDRange(gws_x / _HISTOSPLIT, 
1); + + enqueue(queue, *this, window, lws); +} + +//////////////////////////////////////////////////////////////////////////////// +CLRadixSortReorder::CLRadixSortReorder() + : _pass(0), _in_key_buf(nullptr), _out_key_buf(nullptr), _in_ind_buf(nullptr), + _out_ind_buf(nullptr) +{ +} + +void CLRadixSortReorder::configure(cl::Buffer *hist_buf, int bits, int n) +{ + ARM_COMPUTE_ERROR_ON(hist_buf == nullptr); + ARM_COMPUTE_ERROR_ON(n == 0); + + unsigned int radix = 1 << bits; + // Set kernel build options + std::set<std::string> build_opts; + build_opts.emplace("-D_BITS=" + support::cpp11::to_string(bits)); + build_opts.emplace("-D_RADIX=" + support::cpp11::to_string(radix)); + build_opts.emplace("-DPERMUT=1"); + + // Create kernel + _kernel = static_cast<cl::Kernel>( + CLKernelLibraryEx::get().create_kernel("radixsort_reorder", build_opts)); + + unsigned int idx = 2; + _kernel.setArg(idx++, *hist_buf); + + idx = 6; + _kernel.setArg(idx++, sizeof(uint) * radix * _ITEMS, nullptr); + _kernel.setArg<cl_int>(idx++, n); + + // Configure kernel window + Window win; + win.set(0, Window::Dimension(0, _GROUPS * _ITEMS, 1)); + ICLKernel::configure(win); +} + +void CLRadixSortReorder::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); + + const unsigned int gws_x = (window.x().end() - window.x().start()) / window.x().step(); + unsigned int lx = std::max(1U, (gws_x / _HISTOSPLIT)); + cl::NDRange lws = (lx < gws_x) ? 
cl::NDRange(lx, 1) : cl::NDRange(1, 1); + + _kernel.setArg(0, *_in_key_buf); + _kernel.setArg(1, *_out_key_buf); + _kernel.setArg<cl_int>(3, _pass); + _kernel.setArg(4, *_in_ind_buf); + _kernel.setArg(5, *_out_ind_buf); + + enqueue(queue, *this, window, lws); +} + +//////////////////////////////////////////////////////////////////////////////// +CLTopKV2FindFirstNegative::CLTopKV2FindFirstNegative() : _out_key_buf(nullptr) {} + +void CLTopKV2FindFirstNegative::configure(cl::Buffer *first_negative_idx_buf, int n) +{ + ARM_COMPUTE_ERROR_ON(first_negative_idx_buf == nullptr); + ARM_COMPUTE_ERROR_ON(n == 0); + + // Set kernel build options + std::set<std::string> build_opts; + + // Create kernel + _kernel = static_cast<cl::Kernel>( + CLKernelLibraryEx::get().create_kernel("topkv2_find_first_negative", build_opts)); + + unsigned int idx = 1; + _kernel.setArg(idx++, *first_negative_idx_buf); + _kernel.setArg<cl_int>(idx++, n); + + // Configure kernel window + Window win; + win.set(0, Window::Dimension(0, n, 1)); + ICLKernel::configure(win); +} + +void CLTopKV2FindFirstNegative::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); + + unsigned int idx = 0; + _kernel.setArg(idx++, *_out_key_buf); + + enqueue(queue, *this, window); +} + +//////////////////////////////////////////////////////////////////////////////// +CLTopKV2ReorderNegatives::CLTopKV2ReorderNegatives() + : _in_key_buf(nullptr), _out_key_buf(nullptr), _in_ind_buf(nullptr), _out_ind_buf(nullptr) +{ +} + +void CLTopKV2ReorderNegatives::configure(cl::Buffer *first_negative_idx_buf, int n) +{ + ARM_COMPUTE_ERROR_ON(first_negative_idx_buf == nullptr); + ARM_COMPUTE_ERROR_ON(n == 0); + + // Set kernel build options + std::set<std::string> build_opts; + + // Create kernel + _kernel = static_cast<cl::Kernel>( + CLKernelLibraryEx::get().create_kernel("topkv2_reorder_negatives", build_opts)); + + 
unsigned int idx = 4; + _kernel.setArg(idx++, *first_negative_idx_buf); + _kernel.setArg<cl_int>(idx++, n); + + // Configure kernel window + Window win; + win.set(0, Window::Dimension(0, n, 1)); + ICLKernel::configure(win); +} + +void CLTopKV2ReorderNegatives::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); + + unsigned int idx = 0; + _kernel.setArg(idx++, *_in_key_buf); + _kernel.setArg(idx++, *_out_key_buf); + _kernel.setArg(idx++, *_in_ind_buf); + _kernel.setArg(idx++, *_out_ind_buf); + + enqueue(queue, *this, window); +} + +//////////////////////////////////////////////////////////////////////////////// +CLTopKV2Store::CLTopKV2Store() + : _values(nullptr), _indices(nullptr), _out_key_buf(nullptr), _out_ind_buf(nullptr) +{ +} + +void CLTopKV2Store::configure(ICLTensor *values, ICLTensor *indices, int k, int n) +{ + ARM_COMPUTE_ERROR_ON(values == nullptr && indices == nullptr); + ARM_COMPUTE_ERROR_ON(k == 0); + ARM_COMPUTE_ERROR_ON(k > n); + + _values = values; + _indices = indices; + + // Set kernel build options + std::set<std::string> build_opts; + + // Create kernel + _kernel = + static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel("topkv2_store", build_opts)); + + unsigned int idx = 2 * num_arguments_per_1D_tensor() + 2; + _kernel.setArg<cl_int>(idx++, n); + + // Configure kernel window + Window win; + win.set(0, Window::Dimension(0, k, 1)); + ICLKernel::configure(win); +} + +void CLTopKV2Store::setOutputBuffers(cl::Buffer *out_key_buf, cl::Buffer *out_ind_buf) +{ + _out_key_buf = out_key_buf; + _out_ind_buf = out_ind_buf; +} + +void CLTopKV2Store::run(const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); + + unsigned int idx = 0; + add_1D_tensor_argument(idx, _values, window); + add_1D_tensor_argument(idx, 
_indices, window); + _kernel.setArg(idx++, *_out_key_buf); + _kernel.setArg(idx++, *_out_ind_buf); + + enqueue(queue, *this, window); +} + +} // namespace arm_compute diff --git a/libs/kernel/acl/src/Init_acl.cpp b/libs/ARMComputeEx/src/runtime/CL/functions/CLCast.cpp index cabf079fa..e1059ab53 100644 --- a/libs/kernel/acl/src/Init_acl.cpp +++ b/libs/ARMComputeEx/src/runtime/CL/functions/CLCast.cpp @@ -1,11 +1,12 @@ /* * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, @@ -13,20 +14,16 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ +#include "arm_compute/runtime/CL/functions/CLCast.h" -#include <arm_compute/runtime/CL/CLScheduler.h> -#include <kernel/acl/nnfw_kernel_acl.h> +#include "arm_compute/core/CL/kernels/CLCastKernel.h" +#include "support/ToolchainSupport.h" -namespace nnfw { -namespace kernel { -namespace acl { +using namespace arm_compute; -// This will do one time initialization but can be called multiple times -void Initialize(void) +void CLCast::configure(ICLTensor *input, ICLTensor *output) { - arm_compute::CLScheduler::get().default_init(); + auto k = arm_compute::support::cpp14::make_unique<CLCastKernel>(); + k->configure(input, output); + _kernel = std::move(k); } - -} // namespace acl -} // namespace kernel -} // namespace nnfw diff --git a/libs/ARMComputeEx/src/runtime/CL/functions/CLGather.cpp b/libs/ARMComputeEx/src/runtime/CL/functions/CLGather.cpp new file mode 100644 index 000000000..5552cbc6f --- /dev/null +++ b/libs/ARMComputeEx/src/runtime/CL/functions/CLGather.cpp @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "arm_compute/runtime/CL/functions/CLGather.h" + +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/CL/kernels/CLGatherKernel.h" +#include "support/ToolchainSupport.h" + +#include <utility> + +using namespace arm_compute; + +void CLGather::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output) +{ + auto k = arm_compute::support::cpp14::make_unique<CLGatherKernel>(); + k->configure(input1, input2, output); + _kernel = std::move(k); +} + +Status CLGather::validate(const ITensorInfo *input1, const ITensorInfo *input2, + const ITensorInfo *output) +{ + return CLGatherKernel::validate(input1, input2, output); +} diff --git a/libs/ARMComputeEx/src/runtime/CL/functions/CLPixelWiseDivision.cpp b/libs/ARMComputeEx/src/runtime/CL/functions/CLPixelWiseDivision.cpp new file mode 100644 index 000000000..e1add5e90 --- /dev/null +++ b/libs/ARMComputeEx/src/runtime/CL/functions/CLPixelWiseDivision.cpp @@ -0,0 +1,52 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2016-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "arm_compute/runtime/CL/functions/CLPixelWiseDivision.h" + +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/CL/kernels/CLPixelWiseDivisionKernel.h" +#include "support/ToolchainSupport.h" + +#include <utility> + +using namespace arm_compute; + +void CLPixelWiseDivision::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, + float scale, ConvertPolicy overflow_policy, + RoundingPolicy rounding_policy) +{ + auto k = arm_compute::support::cpp14::make_unique<CLPixelWiseDivisionKernel>(); + k->configure(input1, input2, output, scale, overflow_policy, rounding_policy); + _kernel = std::move(k); + + if (output->info()->dimension(0) > 1) + { + ICLTensor *broadcasted_info = (input1->info()->dimension(0) == 1) ? input1 : input2; + + if (broadcasted_info->info()->dimension(0) == 1) + { + _border_handler.configure(broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE); + } + } +} + +Status CLPixelWiseDivision::validate(const ITensorInfo *input1, const ITensorInfo *input2, + const ITensorInfo *output, float scale, + ConvertPolicy overflow_policy, RoundingPolicy rounding_policy) +{ + return CLPixelWiseDivisionKernel::validate(input1, input2, output, scale, overflow_policy, + rounding_policy); +} diff --git a/libs/ARMComputeEx/src/runtime/CL/functions/CLReduceMax.cpp b/libs/ARMComputeEx/src/runtime/CL/functions/CLReduceMax.cpp new file mode 100644 index 000000000..3382058db --- /dev/null +++ b/libs/ARMComputeEx/src/runtime/CL/functions/CLReduceMax.cpp @@ -0,0 +1,121 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2017 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "arm_compute/runtime/CL/functions/CLReduceMax.h" + +#include "arm_compute/core/CL/ICLTensor.h" +#include "support/ToolchainSupport.h" +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/kernels/CLReduceMaxKernel.h" + +#include <vector> +#include <algorithm> + +#include <utility> + +#define REDUCE_MAX_RUN_ON_CPU 1 + +namespace arm_compute +{ + +CLReduceMax::CLReduceMax() : _axis(0), _input(nullptr), _output(nullptr), _kernel(nullptr) {} + +void CLReduceMax::configure(ICLTensor *input, int axis, ICLTensor *output) +{ + _axis = axis; + + _input = input; + _output = output; + + auto k = arm_compute::support::cpp14::make_unique<CLReduceMaxKernel>(); + k->configure(input, axis, output); + _kernel = std::move(k); + + // We can handle for simple case only + // Output rank: 1 + // Axis: one axis value, restrict to 1 + ARM_COMPUTE_ERROR_ON(input->info()->tensor_shape().num_dimensions() != 2); + ARM_COMPUTE_ERROR_ON(output->info()->tensor_shape().num_dimensions() != 1); + ARM_COMPUTE_ERROR_ON(axis != 1); +} + +Status CLReduceMax::validate(const ITensorInfo *input, int32_t axis, const ITensorInfo *output) +{ + return CLReduceMaxKernel::validate(input, axis, output); +} + +void CLReduceMax::run() +{ +#if REDUCE_MAX_RUN_ON_CPU + run_on_cpu(); + + arm_compute::CLScheduler::get().sync(); +#else + arm_compute::CLScheduler::get().enqueue(*_kernel); +#endif +} + +void CLReduceMax::run_on_cpu() +{ + cl::CommandQueue q = CLScheduler::get().queue(); + + _input->map(q); + _output->map(q); + + // Compute by CPU for simple case + // 
Input rank: 2 + // Output rank: 1 + // Axis: one axis value, restrict to 1 + + float *input_data = (float *)_input->buffer(); + float *output_data = (float *)_output->buffer(); + + std::vector<float> container_max; + int cols = _input->info()->tensor_shape()[0]; + int rows = _input->info()->tensor_shape()[1]; + container_max.resize(rows); + + // Initialize as 1st element in row + float *input_pointer = input_data; + for (int i = 0; i < rows; i++) + { + container_max[i] = *input_pointer; + input_pointer += cols; + } + + // Update max value in row + for (int i = 0; i < rows; i++) + { + float max_in_row = container_max[i]; + for (int j = 1; j < cols; j++) + { + if (max_in_row < input_data[i * cols + j]) + { + max_in_row = input_data[i * cols + j]; + } + } + container_max[i] = max_in_row; + } + + for (int i = 0; i < rows; i++) + { + output_data[i] = container_max[i]; + } + + _input->unmap(q); + _output->unmap(q); +} +} // namespace arm_compute diff --git a/libs/ARMComputeEx/src/runtime/CL/functions/CLReductionMean.cpp b/libs/ARMComputeEx/src/runtime/CL/functions/CLReductionMean.cpp new file mode 100644 index 000000000..ab724e752 --- /dev/null +++ b/libs/ARMComputeEx/src/runtime/CL/functions/CLReductionMean.cpp @@ -0,0 +1,51 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2017-2018 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "arm_compute/runtime/CL/functions/CLReductionMean.h" + +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/CL/kernels/CLReductionMeanKernel.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/PixelValue.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/runtime/CL/CLScheduler.h" +#include "arm_compute/runtime/Tensor.h" +#include "support/ToolchainSupport.h" + +using namespace arm_compute; + +CLReductionMean::CLReductionMean() : _reduction_mean_kernel(), _fill_border_kernel() {} + +Status CLReductionMean::validate(const ITensorInfo *input, const ITensorInfo *output, + std::vector<uint32_t> axis) +{ + ARM_COMPUTE_RETURN_ON_ERROR(CLReductionMeanKernel::validate(input, output, axis)); + return Status{}; +} + +void CLReductionMean::configure(ICLTensor *input, ICLTensor *output, std::vector<uint32_t> axis) +{ + _reduction_mean_kernel.configure(input, output, axis); + _fill_border_kernel.configure(input, _reduction_mean_kernel.border_size(), BorderMode::CONSTANT, + PixelValue(0)); +} + +void CLReductionMean::run() +{ + CLScheduler::get().enqueue(_fill_border_kernel); + CLScheduler::get().enqueue(_reduction_mean_kernel); +} diff --git a/libs/ARMComputeEx/src/runtime/CL/functions/CLStridedSlice.cpp b/libs/ARMComputeEx/src/runtime/CL/functions/CLStridedSlice.cpp new file mode 100644 index 000000000..cd576cec1 --- /dev/null +++ b/libs/ARMComputeEx/src/runtime/CL/functions/CLStridedSlice.cpp @@ -0,0 +1,307 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2017 ARM Limited. + * Copyright 2018 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "arm_compute/runtime/CL/functions/CLStridedSlice.h" + +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/CL/kernels/CLStridedSliceKernel.h" +#include "arm_compute/core/utils/misc/Utility.h" +#include "arm_compute/runtime/CL/CLScheduler.h" +#include "support/ToolchainSupport.h" +#include <vector> + +using namespace arm_compute; + +static const int32_t maxDims = 4; + +// Return the index for the first element along that axis. This index will be a +// positive integer between [0, axisSize - 1] that can be used to index +// directly into the data. +inline int32_t StartForAxis(int32_t beginMask, std::vector<int32_t> const &startIndices, + std::vector<int32_t> const &strides, const TensorShape &inputShape, + int32_t axis) +{ + // Begin with the specified index + int32_t start = startIndices[axis]; + + // beginMask override + if (beginMask & 1 << axis) + { + if (strides[axis] > 0) + { + // Forward iteration - use the first element. These values will get + // clamped below (Note: We could have set them to 0 and axisSize-1, but + // use lowest() and max() to maintain symmetry with StopForAxis()) + start = std::numeric_limits<int32_t>::lowest(); + } + else + { + // Backward iteration - use the last element. 
+ start = std::numeric_limits<int32_t>::max(); + } + } + + // Handle negative indices + int32_t axisSize = inputShape[axis]; + if (start < 0) + { + start += axisSize; + } + + // Clamping + start = arm_compute::utility::clamp(start, 0, axisSize - 1); + + return start; +} + +// Return the "real" index for the end of iteration along that axis. This is an +// "end" in the traditional C sense, in that it points to one past the last +// element. ie. So if you were iterating through all elements of a 1D array of +// size 4, this function would return 4 as the stop, because it is one past the +// "real" indices of 0, 1, 2 & 3. +inline int32_t StopForAxis(int32_t endMask, std::vector<int32_t> const &stopIndices, + std::vector<int32_t> const &strides, const TensorShape &inputShape, + int32_t axis) +{ + // Begin with the specified index + int32_t stop = stopIndices[axis]; + + // endMask override + if (endMask & (1 << axis)) + { + if (strides[axis] > 0) + { + // Forward iteration - use the last element. These values will get + // clamped below + stop = std::numeric_limits<int32_t>::max(); + } + else + { + // Backward iteration - use the first element. + stop = std::numeric_limits<int32_t>::lowest(); + } + } + + // Handle negative indices + int32_t axisSize = inputShape[axis]; + if (stop < 0) + { + stop += axisSize; + } + + // Clamping + // Because the end index points one past the last element, we need slightly + // different clamping ranges depending on the direction. 
+ if (strides[axis] > 0) + { + // Forward iteration + stop = arm_compute::utility::clamp(stop, 0, axisSize); + } + else + { + // Backward iteration + stop = arm_compute::utility::clamp(stop, -1, axisSize - 1); + } + + return stop; +} + +inline int32_t offset4D(const TensorShape &shape, int32_t b, int32_t d, int32_t h, int32_t w) +{ + int32_t offset = b * shape[2] * shape[1] * shape[0]; + offset += d * shape[1] * shape[0]; + offset += h * shape[0]; + offset += w; + return offset; +} + +void CLStridedSlice::configure(const ICLTensor *input, ICLTensor *output, ICLTensor *beginData, + ICLTensor *endData, ICLTensor *stridesData, int32_t beginMask, + int32_t endMask, int32_t shrinkAxisMask) +{ + auto k = arm_compute::support::cpp14::make_unique<CLStridedSliceKernel>(); + k->configure(input, output, beginData, endData, stridesData, beginMask, endMask, shrinkAxisMask); + _kernel = std::move(k); +} + +void CLStridedSliceCPU::configure(ICLTensor *input, ICLTensor *output, ICLTensor *beginData, + ICLTensor *endData, ICLTensor *stridesData, int32_t beginMask, + int32_t endMask, int32_t shrinkAxisMask) +{ + ARM_COMPUTE_ERROR_THROW_ON(CLStridedSliceKernel::validate( + input->info(), output->info(), beginData->info(), endData->info(), stridesData->info(), + beginMask, endMask, shrinkAxisMask)); + + _input = input; + _output = output; + _beginData = beginData; + _endData = endData; + _stridesData = stridesData; + _beginMask = beginMask; + _endMask = endMask; + _shrinkAxisMask = shrinkAxisMask; +} + +void CLStridedSliceCPU::run() +{ + run_on_cpu(); + + arm_compute::CLScheduler::get().sync(); +} + +inline int32_t getOutDim(int32_t start, int32_t stop, int32_t stride) +{ + if (stride > 0) + { + return ((stop - start - 1) / stride) + 1; + } + else + { + return ((stop - start + 1) / stride) + 1; + } +} + +template <typename T> +inline void StridedSlice(const T *inputData, const TensorShape &inputShape, int32_t beginMask, + int32_t endMask, const std::vector<int32_t> &startIndices, + 
const std::vector<int32_t> &stopIndices, + const std::vector<int32_t> &strides, T *outputData) +{ + ARM_COMPUTE_ERROR_ON(startIndices.size() != maxDims); + ARM_COMPUTE_ERROR_ON(stopIndices.size() != maxDims); + ARM_COMPUTE_ERROR_ON(strides.size() != maxDims); + + const int32_t start_b = StartForAxis(beginMask, startIndices, strides, inputShape, 3); + const int32_t stop_b = StopForAxis(endMask, stopIndices, strides, inputShape, 3); + const int32_t start_d = StartForAxis(beginMask, startIndices, strides, inputShape, 2); + const int32_t stop_d = StopForAxis(endMask, stopIndices, strides, inputShape, 2); + const int32_t start_h = StartForAxis(beginMask, startIndices, strides, inputShape, 1); + const int32_t stop_h = StopForAxis(endMask, stopIndices, strides, inputShape, 1); + const int32_t start_w = StartForAxis(beginMask, startIndices, strides, inputShape, 0); + const int32_t stop_w = StopForAxis(endMask, stopIndices, strides, inputShape, 0); + + // The shape of outputData may collapse in one-dimension. + // Therefore, it is necessary to create a shape that matches the result of the outputData. + TensorShape outputShape( + getOutDim(start_w, stop_w, strides[0]), getOutDim(start_h, stop_h, strides[1]), + getOutDim(start_d, stop_d, strides[2]), getOutDim(start_b, stop_b, strides[3])); + for (int32_t in_b = start_b, b = 0; strides[3] > 0 ? in_b < stop_b : in_b > stop_b; + in_b += strides[3], b++) + { + for (int32_t in_d = start_d, d = 0; strides[2] > 0 ? in_d < stop_d : in_d > stop_d; + in_d += strides[2], d++) + { + for (int32_t in_h = start_h, h = 0; strides[1] > 0 ? in_h < stop_h : in_h > stop_h; + in_h += strides[1], h++) + { + for (int32_t in_w = start_w, w = 0; strides[0] > 0 ? 
in_w < stop_w : in_w > stop_w; + in_w += strides[0], w++) + { + outputData[offset4D(outputShape, b, d, h, w)] = + inputData[offset4D(inputShape, in_b, in_d, in_h, in_w)]; + } + } + } + } +} + +void CLStridedSliceCPU::run_on_cpu() +{ + // TODO: Support shrinkAxisMask + cl::CommandQueue q = CLScheduler::get().queue(); + + _input->map(q); + _output->map(q); + _beginData->map(q); + _endData->map(q); + _stridesData->map(q); + + TensorShape inputShape = _input->info()->tensor_shape(); + TensorShape outputShape = _output->info()->tensor_shape(); + + std::vector<int32_t> starts; + std::vector<int32_t> stops; + std::vector<int32_t> strides; + + for (uint32_t idx = 0; idx <= _input->info()->num_dimensions() - 1; ++idx) + { + starts.emplace_back(reinterpret_cast<int32_t *>(_beginData->buffer())[idx]); + stops.emplace_back(reinterpret_cast<int32_t *>(_endData->buffer())[idx]); + strides.emplace_back(reinterpret_cast<int32_t *>(_stridesData->buffer())[idx]); + } + + for (uint32_t i = _input->info()->num_dimensions(); i < maxDims; i++) + { + starts.emplace_back(0); + stops.emplace_back(1); + strides.emplace_back(1); + } + + switch (_input->info()->data_type()) + { + case DataType::U8: + case DataType::QASYMM8: + StridedSlice(reinterpret_cast<const uint8_t *>(_input->buffer()), inputShape, _beginMask, + _endMask, starts, stops, strides, + reinterpret_cast<uint8_t *>(_output->buffer())); + break; + case DataType::S8: + case DataType::QS8: + StridedSlice(reinterpret_cast<const int8_t *>(_input->buffer()), inputShape, _beginMask, + _endMask, starts, stops, strides, reinterpret_cast<int8_t *>(_output->buffer())); + break; + case DataType::U16: + StridedSlice(reinterpret_cast<const uint16_t *>(_input->buffer()), inputShape, _beginMask, + _endMask, starts, stops, strides, + reinterpret_cast<uint16_t *>(_output->buffer())); + break; + case DataType::S16: + case DataType::QS16: + StridedSlice(reinterpret_cast<const int16_t *>(_input->buffer()), inputShape, _beginMask, + _endMask, starts, 
stops, strides, + reinterpret_cast<int16_t *>(_output->buffer())); + break; + case DataType::F16: + // Not sure this works. + StridedSlice(reinterpret_cast<const half *>(_input->buffer()), inputShape, _beginMask, + _endMask, starts, stops, strides, reinterpret_cast<half *>(_output->buffer())); + break; + case DataType::U32: + StridedSlice(reinterpret_cast<const uint32_t *>(_input->buffer()), inputShape, _beginMask, + _endMask, starts, stops, strides, + reinterpret_cast<uint32_t *>(_output->buffer())); + break; + case DataType::S32: + StridedSlice(reinterpret_cast<const int32_t *>(_input->buffer()), inputShape, _beginMask, + _endMask, starts, stops, strides, + reinterpret_cast<int32_t *>(_output->buffer())); + break; + case DataType::F32: + StridedSlice(reinterpret_cast<const float *>(_input->buffer()), inputShape, _beginMask, + _endMask, starts, stops, strides, reinterpret_cast<float *>(_output->buffer())); + break; + default: + ARM_COMPUTE_ERROR("DataType not supported"); + break; + } + + _input->unmap(q); + _output->unmap(q); + _beginData->unmap(q); + _endData->unmap(q); + _stridesData->unmap(q); +} diff --git a/libs/ARMComputeEx/src/runtime/CL/functions/CLTopKV2.cpp b/libs/ARMComputeEx/src/runtime/CL/functions/CLTopKV2.cpp new file mode 100644 index 000000000..6426364c9 --- /dev/null +++ b/libs/ARMComputeEx/src/runtime/CL/functions/CLTopKV2.cpp @@ -0,0 +1,305 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (c) 2017 ARM Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "arm_compute/runtime/CL/functions/CLTopKV2.h" + +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/CL/CLHelpers.h" + +#include <vector> +#include <algorithm> + +#include "../../topk_v2.h" + +namespace arm_compute +{ + +CLTopKV2::CLTopKV2() + : _k(0), _total_bits(0), _bits(0), _radix(0), _hist_buf_size(0), _glob_sum_buf_size(0), _n(0), + _input(nullptr), _values(nullptr), _indices(nullptr), _qs_idx_buf(), _qs_temp_buf(), + _hist_buf(), _glob_sum_buf(), _temp_buf(), _first_negative_idx_buf(), _in_key_buf(), + _out_key_buf(), _in_ind_buf(), _out_ind_buf(), _p_in_key_buf(nullptr), + _p_out_key_buf(nullptr), _p_in_ind_buf(nullptr), _p_out_ind_buf(nullptr), _qs_kernel(), + _init_kernel(), _hist_kernel(), _scan_hist_kernel(), _glob_scan_hist_kernel(), + _paste_hist_kernel(), _reorder_kernel(), _find_first_negative_kernel(), + _reorder_negatives_kernel(), _store_kernel() +{ +} + +void CLTopKV2::configure(ICLTensor *input, int k, ICLTensor *values, ICLTensor *indices, + int total_bits, int bits) +{ + _total_bits = total_bits; + _bits = bits; + _n = input->info()->tensor_shape()[0]; + + // _total_bits should be divided by _bits. 
+ ARM_COMPUTE_ERROR_ON((_total_bits % _bits) != 0); + + _k = k; + _radix = 1 << bits; + + _input = input; + _values = values; + _indices = indices; + + std::string topk_env; + + char *env = getenv("ACL_TOPKV2"); + if (env) + topk_env = env; + + if (topk_env == "GPU_SINGLE") + { + _qs_idx_buf = cl::Buffer(CLScheduler::get().context(), + CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, sizeof(cl_int) * _n); + _qs_temp_buf = cl::Buffer(CLScheduler::get().context(), + CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, sizeof(cl_int) * _n); + + _qs_kernel.configure(input, values, indices, &_qs_idx_buf, &_qs_temp_buf, k, _n); + } + else if (topk_env == "GPU") + { + // n should be divided by (_GROUPS * _ITEMS) + ARM_COMPUTE_ERROR_ON((_n % (_GROUPS * _ITEMS)) != 0); + + _hist_buf_size = _radix * _GROUPS * _ITEMS; + _glob_sum_buf_size = _HISTOSPLIT; + + _hist_buf = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, + sizeof(cl_int) * _hist_buf_size); + _glob_sum_buf = + cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, + sizeof(cl_int) * _glob_sum_buf_size); + _temp_buf = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, + sizeof(cl_int) * _glob_sum_buf_size); + _first_negative_idx_buf = cl::Buffer(CLScheduler::get().context(), + CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, sizeof(cl_int)); + _in_key_buf = cl::Buffer(CLScheduler::get().context(), + CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, sizeof(cl_float) * _n); + _out_key_buf = cl::Buffer(CLScheduler::get().context(), + CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, sizeof(cl_float) * _n); + _in_ind_buf = cl::Buffer(CLScheduler::get().context(), + CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, sizeof(cl_int) * _n); + _out_ind_buf = cl::Buffer(CLScheduler::get().context(), + CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, sizeof(cl_int) * _n); + + _p_in_key_buf = &_in_key_buf; + _p_out_key_buf = &_out_key_buf; + _p_in_ind_buf = &_in_ind_buf; + 
_p_out_ind_buf = &_out_ind_buf; + + _init_kernel.configure(input, _p_in_key_buf, _p_in_ind_buf, _n); + _hist_kernel.configure(&_hist_buf, bits, _n); + _scan_hist_kernel.configure(&_hist_buf, &_glob_sum_buf, bits); + _glob_scan_hist_kernel.configure(&_glob_sum_buf, &_temp_buf, bits); + _paste_hist_kernel.configure(&_hist_buf, &_glob_sum_buf, bits); + _reorder_kernel.configure(&_hist_buf, bits, _n); + _find_first_negative_kernel.configure(&_first_negative_idx_buf, _n); + _reorder_negatives_kernel.configure(&_first_negative_idx_buf, _n); + _store_kernel.configure(values, indices, k, _n); + } + else + { + // DO NOTHING for CPU. + } +} + +void CLTopKV2::run() +{ + std::string topk_env; + + char *env = getenv("ACL_TOPKV2"); + if (env) + topk_env = env; + + if (topk_env == "GPU_SINGLE") + { + run_on_gpu_single_quicksort(); + } + else if (topk_env == "GPU") + { + run_on_gpu(); + } + else + { + run_on_cpu(); + } +} + +void CLTopKV2::run_on_gpu_single_quicksort() +{ + // This is a single threaded quick sort implementation. + CLScheduler::get().enqueue(_qs_kernel, false); + + arm_compute::CLScheduler::get().sync(); +} + +void CLTopKV2::run_on_gpu() +{ + cl::CommandQueue q = CLScheduler::get().queue(); + + // 1. CLTopKV2Init set key buffer and index buffer. + // - Key buffer is set as the same value of the layer's input + // - Values in the index buffer are set as their indices. + CLScheduler::get().enqueue(_init_kernel, false); + + int n_passes = _total_bits / _bits; + + // 2. Repeat (total_bits/bits) times. + // - total_bits is the number of bits of the data type (e.g., 32 for float) + // - bits defines number of buckets (e.g. 16 buckets where bit is 4) + for (int pass = 0; pass < n_passes; ++pass) + { + arm_compute::CLScheduler::get().sync(); + + // 2.1. Calculate histogram with _GROUPS * _ITEMS threads + _hist_kernel.setPass(pass, _p_in_key_buf); + CLScheduler::get().enqueue(_hist_kernel, false); + + // 2.2. 
Calculate prefix sum locally with multiple threads + CLScheduler::get().enqueue(_scan_hist_kernel, false); + // 2.3. Calculate prefix sum within a work group + CLScheduler::get().enqueue(_glob_scan_hist_kernel, false); + // 2.4. Calculate global prefix sum + CLScheduler::get().enqueue(_paste_hist_kernel, false); + + // 2.5. Reorder keys and indices based on the global prefix sum + _reorder_kernel.setPass(pass, _p_in_key_buf, _p_out_key_buf, _p_in_ind_buf, _p_out_ind_buf); + CLScheduler::get().enqueue(_reorder_kernel, false); + + cl::Buffer *tmp; + // swap key buffers + tmp = _p_in_key_buf; + _p_in_key_buf = _p_out_key_buf; + _p_out_key_buf = tmp; + + // swap index buffers + tmp = _p_in_ind_buf; + _p_in_ind_buf = _p_out_ind_buf; + _p_out_ind_buf = tmp; + } + + // 3. Get the first negative index + // Because we swap in_buf and out_buf at the end of the above for loop, + // the output buffers are in bufs. + _find_first_negative_kernel.setOutputBuffer(_p_in_key_buf); + CLScheduler::get().enqueue(_find_first_negative_kernel, false); + + // 4. Correct odering of negatives + // - Since radix sort does not consider negatives, negatives are considered as bigger values + // than positives. + // reordered data will be stored in _p_out_key_buf and _p_out_ind_buf + _reorder_negatives_kernel.setBuffers(_p_in_key_buf, _p_out_key_buf, _p_in_ind_buf, + _p_out_ind_buf); + CLScheduler::get().enqueue(_reorder_negatives_kernel, false); + + // 5. Extract top k values from sorted keys and indices. + _store_kernel.setOutputBuffers(_p_out_key_buf, _p_out_ind_buf); + CLScheduler::get().enqueue(_store_kernel, false); + + arm_compute::CLScheduler::get().sync(); + +#if 0 + // below code is left for debugging. 
+ int first_neg; + q.enqueueReadBuffer(_first_negative_idx_buf, CL_TRUE, 0, sizeof(cl_int), &first_neg); + std::cout << "first neg = " << first_neg << std::endl; + + float in_key[_n]; + q.enqueueReadBuffer(*_p_in_key_buf, CL_TRUE, 0, sizeof(cl_float)*_n, in_key); + for(uint32_t i = 0 ; i < _n; ++i) { + std::cout << "in_key[" << i << "] = " << in_key[i] << std::endl; + } + + float out_key[_n]; + q.enqueueReadBuffer(*_p_out_key_buf, CL_TRUE, 0, sizeof(cl_float)*_n, out_key); + for(uint32_t i = 0 ; i < _n; ++i) { + std::cout << "out_key[" << i << "] = " << out_key[i] << std::endl; + } + + int in_ind[_n]; + q.enqueueReadBuffer(*_p_in_ind_buf, CL_TRUE, 0, sizeof(cl_int)*_n, in_ind); + for(uint32_t i = 0 ; i < _n; ++i) { + std::cout << "in_ind[" << i << "] = " << in_ind[i] << std::endl; + } + + int out_ind[_n]; + q.enqueueReadBuffer(*_p_out_ind_buf, CL_TRUE, 0, sizeof(cl_int)*_n, out_ind); + for(uint32_t i = 0 ; i < _n; ++i) { + std::cout << "out_ind[" << i << "] = " << out_ind[i] << std::endl; + } + + int hist_buf[_hist_buf_size]; + q.enqueueReadBuffer(_hist_buf, CL_TRUE, 0, sizeof(cl_int)*_hist_buf_size, hist_buf); + for(uint32_t i = 0 ; i < _hist_buf_size; ++i) { + std::cout << "hist_buf[" << i << "] = " << hist_buf[i] << std::endl; + } + + int glob_sum_buf[_glob_sum_buf_size]; + q.enqueueReadBuffer(_glob_sum_buf, CL_TRUE, 0, sizeof(cl_int)*_glob_sum_buf_size, glob_sum_buf); + for(uint32_t i = 0 ; i < _glob_sum_buf_size; ++i) { + std::cout << "glob_sum_buf[" << i << "] = " << glob_sum_buf[i] << std::endl; + } + +#endif +} + +void CLTopKV2::run_on_cpu() +{ + cl::CommandQueue q = CLScheduler::get().queue(); + // const Window& w = _topkv2_kernel.window(); + + _input->map(q); + _values->map(q); + _indices->map(q); + + // int row_size = (w[0].end() - w[0].start()) / w[0].step(); + int row_size = _input->info()->tensor_shape()[0]; + int rank = _input->info()->num_dimensions(); + + if (rank > 2) + throw std::runtime_error("Not supported type."); + + int row_num = (rank == 2 
? _input->info()->tensor_shape()[1] : 1); + + if (_input->info()->data_type() == DataType::F32) + { + nnfw::rt::optimized_ops::TopK<float>(row_size, row_num, (float *)_input->buffer(), _k, + (int32 *)_indices->buffer(), (float *)_values->buffer()); + } + else if (_input->info()->data_type() == DataType::S32) + { + nnfw::rt::optimized_ops::TopK<int32_t>(row_size, row_num, (int32_t *)_input->buffer(), _k, + (int32 *)_indices->buffer(), + (int32_t *)_values->buffer()); + } + else if (_input->info()->data_type() == DataType::QASYMM8) + { + nnfw::rt::optimized_ops::TopK<uint8_t>(row_size, row_num, (uint8_t *)_input->buffer(), _k, + (int32 *)_indices->buffer(), + (uint8_t *)_values->buffer()); + } + else + { + throw std::runtime_error("Not supported type."); + } + + _input->unmap(q); + _values->unmap(q); + _indices->unmap(q); +} +} // namespace arm_compute diff --git a/libs/ARMComputeEx/src/runtime/topk_v2.h b/libs/ARMComputeEx/src/runtime/topk_v2.h new file mode 100644 index 000000000..a18ff0b0d --- /dev/null +++ b/libs/ARMComputeEx/src/runtime/topk_v2.h @@ -0,0 +1,143 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2018 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
/*
 * Apache-2.0. Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
 * Copyright 2018 The TensorFlow Authors. All Rights Reserved.
 */

#ifndef __NNFW_RT_OPTIMIZED_OPS_TOPK_V2_H__
#define __NNFW_RT_OPTIMIZED_OPS_TOPK_V2_H__

// Fix: this header used std::vector / the <algorithm> heap family / int32_t
// without including anything, so it only compiled through its includers.
#include <algorithm> // std::min, std::sort, std::*_heap, std::copy, std::transform
#include <cstddef>   // size_t
#include <cstdint>   // int32_t
#include <vector>

typedef int32_t int32;

namespace nnfw
{
namespace rt
{
namespace optimized_ops
{
// The following code is implemented and modified while referring to the TFLite topk_v2.cc file.
// TopK_v2 of NN Runtime supports TENSOR_FLOAT32, TENSOR_QUANT8_ASYMM, TENSOR_INT32 other than
// TFLite.
// (TFLite additionally supports kTfLiteInt64.)

// The class that collects top indexes of k values. Based on template
// tensorflow::gtl::TopN<> but, for optimization, it re-uses the same container.
template <typename T> class TopContainer
{
public:
  TopContainer() = delete;
  /**
   * @param k        number of top elements to keep
   * @param row_size number of candidates in one input row
   */
  TopContainer(int32 k, int32 row_size) : k_(k), container_(), values_(nullptr)
  {
    // At most k+1 indexes are ever buffered: k winners plus one candidate slot.
    container_.reserve(std::min(k, row_size) + 1);
  }

  /** Prevent instances of this class from being copied (As this class contains pointers) */
  TopContainer(const TopContainer &) = delete;
  /** Prevent instances of this class from being copied (As this class contains pointers) */
  TopContainer &operator=(const TopContainer &) = delete;

  /** Re-arm the container for a new row; indexes pushed afterwards refer into @p values. */
  void start_collecting(const T *values)
  {
    values_ = values;
    container_.clear();
  }

  /** Offer element index @p a of the current row as a top-k candidate. */
  void push(int32 a)
  {
    // Lambda parameters renamed (lhs/rhs) so they no longer shadow parameter `a`.
    auto comparator = [this](int32 lhs, int32 rhs) { return compare_fun(lhs, rhs); };
    if (container_.size() <= (size_t)k_)
    {
      // Filling phase: accept unconditionally until k+1 indexes are buffered,
      // then heapify once; pop_heap parks the current worst element at the back.
      container_.push_back(a);
      if (container_.size() == (size_t)(k_ + 1))
      {
        std::make_heap(container_.begin(), container_.end(), comparator);
        std::pop_heap(container_.begin(), container_.end(), comparator);
      }
    }
    else if (comparator(a, container_.front()))
    {
      // Steady state: `a` beats the current worst of the kept k (the heap
      // front); overwrite the parked back slot and restore the heap invariant.
      container_.back() = a;
      std::push_heap(container_.begin(), container_.end(), comparator);
      std::pop_heap(container_.begin(), container_.end(), comparator);
    }
  }

  /**
   * @return indexes of the top-k values, best first; ties are ordered by the
   *         smaller index (see compare_fun), so the result is deterministic.
   *         Returns fewer than k entries if fewer than k were pushed.
   */
  const std::vector<int32> &sorted_result()
  {
    auto comparator = [this](int32 lhs, int32 rhs) { return compare_fun(lhs, rhs); };
    if (container_.size() <= (size_t)(k_))
    {
      // Fewer than k+1 elements were ever pushed; a plain sort suffices.
      std::sort(container_.begin(), container_.end(), comparator);
    }
    else
    {
      // Heap of k elements plus the parked worst at the back: sort the heap
      // part (everything but the last slot) and drop the extra slot.
      std::sort_heap(container_.begin(), container_.end() - 1, comparator);
      container_.resize(k_);
    }
    return container_;
  }

private:
  int32 k_;
  std::vector<int32> container_;
  const T *values_ = nullptr;

  // Orders index a before index b when values_[a] > values_[b]; equal values
  // fall back to the smaller index, which fixes the tie-break order.
  bool compare_fun(int32 a, int32 b) const
  {
    if (values_[b] < values_[a])
    {
      return true;
    }
    else if (values_[b] > values_[a])
    {
      return false;
    }
    else
    {
      return a < b;
    }
  }
};

/**
 * Extracts the k largest elements of each row of a row-major matrix.
 *
 * @param row_size       number of elements per row
 * @param num_rows       number of rows in @p data
 * @param data           input, row-major [num_rows x row_size]
 * @param k              number of elements to select per row
 * @param output_indexes output, [num_rows x k]; row-local source indexes, best first
 * @param output_values  output, [num_rows x k]; the corresponding values
 */
template <typename T>
void TopK(int32 row_size, int32 num_rows, const T *data, int32 k, int32 *output_indexes,
          T *output_values)
{
  TopContainer<T> topc(k, row_size);
  for (int row = 0; row < num_rows; ++row)
  {
    const T *values_row = data + row * row_size;
    topc.start_collecting(values_row);
    for (int32 c = 0; c < row_size; ++c)
    {
      topc.push(c);
    }

    // Prepare output buffers.
    int32 *indexes_row = output_indexes + row * k;
    T *output_row = output_values + row * k;
    // We always assume that the output is sorted.
    const auto &top_k = topc.sorted_result();
    std::copy(top_k.begin(), top_k.end(), indexes_row);
    std::transform(top_k.begin(), top_k.end(), output_row,
                   [values_row](const int32 loc) { return values_row[loc]; });
  }
}

} // namespace optimized_ops
} // namespace rt
} // namespace nnfw

#endif // __NNFW_RT_OPTIMIZED_OPS_TOPK_V2_H__
"src/IO_accessor.cpp" - "src/shape.cpp" - "src/support.cpp" - "src/cl/Conv2D.cpp" - "src/cl/DepthwiseConv2D.cpp" - "src/cl/FullyConnected.cpp" - "src/cl/Pooling.cpp" - "src/cl/Reshape.cpp" - "src/cl/Softmax.cpp" - "src/cl/Concatenation.cpp" - "src/neon/Conv2D.cpp" - "src/neon/DepthwiseConv2D.cpp" - "src/neon/FullyConnected.cpp" - "src/neon/Pooling.cpp" - "src/neon/Softmax.cpp" - "src/neon/Reshape.cpp" - "src/neon/Concatenation.cpp" - ) - -add_library(${LIB_KERNELACL} SHARED ${KERNELACL_SRCS}) -target_include_directories(${LIB_KERNELACL} PUBLIC - ${NNFW_INCLUDE_DIR} - ${RUNTIME_INCLUDES} - ${NNFW_ACL_INCLUDES} - ${CMAKE_SOURCE_DIR}/include - ) -target_link_libraries(${LIB_KERNELACL} nnfw_support_nnapi) -if (${TARGET_OS} STREQUAL "tizen") - target_link_libraries(${LIB_KERNELACL} nnfw_util ${NNFW_ACL_LIBS} OpenCL) -else() - target_link_libraries(${LIB_KERNELACL} nnfw_util ${NNFW_ACL_LIBS}) -endif() -install(TARGETS ${LIB_KERNELACL} DESTINATION lib) - -# kernel test executable -set(KERNELACL_TEST_SRCS "src/util.cpp" - "src/gtest_env.cpp" - "src/cl/Conv2D.test.cpp" - "src/cl/DepthwiseConv2D.test.cpp" - "src/cl/FullyConnected.test.cpp" - "src/cl/Pooling.test.cpp" - "src/cl/Reshape.test.cpp" - "src/cl/Softmax.test.cpp" - "src/cl/Concatenation.test.cpp" - "src/neon/Conv2D.test.cpp" - "src/neon/DepthwiseConv2D.test.cpp" - "src/neon/FullyConnected.test.cpp" - "src/neon/Pooling.test.cpp" - "src/neon/Softmax.test.cpp" - "src/neon/Reshape.test.cpp" - "src/neon/Concatenation.test.cpp" - ) - -add_executable(${LIB_KERNELACL_TEST} ${KERNELACL_TEST_SRCS}) -target_include_directories(${LIB_KERNELACL_TEST} PUBLIC - ${NNFW_INCLUDE_DIR} - ${RUNTIME_INCLUDES} - ${NNFW_ACL_INCLUDES} - ) -if (NOT ${TARGET_OS} STREQUAL "tizen") - add_dependencies(${LIB_KERNELACL_TEST} googletest) -endif() -target_link_libraries(${LIB_KERNELACL_TEST} - ${LIB_KERNELACL} - nnfw_util ${NNFW_ACL_LIBS} - ${NNFW_GTEST_LIBS} - ) -install(TARGETS ${LIB_KERNELACL_TEST} DESTINATION unittest) diff --git 
a/libs/kernel/acl/src/CLUniqueTensor.h b/libs/kernel/acl/src/CLUniqueTensor.h deleted file mode 100644 index 6844e4565..000000000 --- a/libs/kernel/acl/src/CLUniqueTensor.h +++ /dev/null @@ -1,63 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef __NNFW_KERNEL_ACL_CLUNIQUETENSOR_H__ -#define __NNFW_KERNEL_ACL_CLUNIQUETENSOR_H__ - -#include <arm_compute/runtime/CL/CLTensor.h> - -namespace nnfw { -namespace kernel { -namespace acl { - -class CLUniqueTensor -{ -public: - CLUniqueTensor(const ::arm_compute::TensorInfo &info) - { - _tensor.allocator()->init(info); - } - -public: - // Both copy and move are not allowed - CLUniqueTensor(const CLUniqueTensor &) = delete; - CLUniqueTensor(CLUniqueTensor &&) = delete; - -public: - ~CLUniqueTensor() - { - _tensor.allocator()->free(); - } - -public: - void allocate() - { - _tensor.allocator()->allocate(); - } - -public: - ::arm_compute::CLTensor &ref(void) { return _tensor; } - ::arm_compute::CLTensor *ptr(void) { return &_tensor; } - -private: - ::arm_compute::CLTensor _tensor; -}; - -} // namespace acl -} // namespace kernel -} // namespace nnfw - -#endif //__NNFW_KERNEL_ACL_CLUNIQUETENSOR_H__ diff --git a/libs/kernel/acl/src/DepthwiseConv2D.h b/libs/kernel/acl/src/DepthwiseConv2D.h deleted file mode 100644 index 8af8d4fd0..000000000 --- a/libs/kernel/acl/src/DepthwiseConv2D.h +++ /dev/null @@ -1,98 +0,0 @@ -/* - * 
Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef __NNFW_KERNEL_ACL_DEPTHWISECONV2D_COMMON_H__ -#define __NNFW_KERNEL_ACL_DEPTHWISECONV2D_COMMON_H__ - -#include <OperationsUtils.h> -#include <arm_compute/core/TensorShape.h> -#include <arm_compute/core/TensorInfo.h> -#include <arm_compute/runtime/IFunction.h> - -#include "shape.h" -#include "IO_accessor.h" - -namespace nnfw { -namespace kernel { -namespace acl { - -namespace common { - -typedef std::function<void (void)> sync_scheduler_f; - -template<class TensorT, class LayerT, class ActT> -bool depthwiseConvFloat32(const float* inputData, const nnfw::rt::Shape& inputShape, - const float* filterData, const nnfw::rt::Shape& filterShape, - const float* biasData, const nnfw::rt::Shape& biasShape, - int32_t padding_left, int32_t padding_right, - int32_t padding_top, int32_t padding_bottom, - int32_t stride_width, int32_t stride_height, - int32_t depth_multiplier, int32_t activation, - float* outputData, const nnfw::rt::Shape& outputShape, - sync_scheduler_f sync_scheduler) { - auto inputShapeACL = util::fromNNShape(inputShape); - auto weightsShapeACL = util::fromNNShape(filterShape); - auto biasShapeACL = util::fromNNShape(biasShape); - auto outputShapeACL = util::fromNNShape(outputShape); - - TensorT input(arm_compute::TensorInfo(inputShapeACL, arm_compute::Format::F32)); - TensorT 
weights(arm_compute::TensorInfo(weightsShapeACL, arm_compute::Format::F32)); - TensorT bias(arm_compute::TensorInfo(biasShapeACL, arm_compute::Format::F32)); - TensorT output(arm_compute::TensorInfo(outputShapeACL, arm_compute::Format::F32)); - - arm_compute::PadStrideInfo psinfo = arm_compute::PadStrideInfo(stride_width, stride_height, - padding_left, padding_right, - padding_top, padding_bottom, - arm_compute::DimensionRoundingType::FLOOR); - - auto l = std::make_shared<LayerT>(); - l->configure(input.ptr(), weights.ptr(), bias.ptr(), output.ptr(), psinfo); - - std::vector<std::shared_ptr<arm_compute::IFunction>> fns; - - fns.emplace_back(l); - - util::insertFusedActivationLayer<TensorT, ActT>(output, activation, fns); - - input.allocate(); - output.allocate(); - bias.allocate(); - weights.allocate(); - - // TODO: Do we need 2D tensor accessor for the input feature? - TensorAccess<InputAccessor>(input.ref(), inputData, inputShape); - TensorAccess<BiasAccessor>(bias.ref(), biasData, biasShape); - TensorAccess<WeightAccessor>(weights.ref(), filterData, filterShape); - - for (const auto &fn : fns) - { - fn->run(); - } - - sync_scheduler(); - - TensorAccess<OutputAccessor>(output.ref(), outputData, outputShape); - - return true; -} - -} // namespace common - -} // namespace acl -} // namespace kernel -} // namespace nnfw - -#endif // __NNFW_KERNEL_ACL_DEPTHWISECONV2D_COMMON_H__ diff --git a/libs/kernel/acl/src/DepthwiseConv2D.test.h b/libs/kernel/acl/src/DepthwiseConv2D.test.h deleted file mode 100644 index b2c8592ee..000000000 --- a/libs/kernel/acl/src/DepthwiseConv2D.test.h +++ /dev/null @@ -1,245 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include <gtest/gtest.h> -#include <OperationsUtils.h> -#include <kernel/acl/nnfw_kernel_acl.h> -#include <kernel/acl/DepthwiseConv2D.h> - -// TODO: fix include path in CMakeFiles -#include "util.h" - -#ifndef ACL_TEST -#error "ACL_TEST should be defined first!" -#endif // ACL_TEST - -#ifndef ACL_CORE_FUNC_NAME -#error "ACL_CORE_FUNC_NAME should be defined first!" -#endif // ACL_CORE_FUNC_NAME - -using namespace nnfw::kernel::acl; - -ACL_TEST(KernelACL_TC, dwise_conv2d_1) { - uint32_t input_n = 1; - uint32_t input_h = 3; - uint32_t input_w = 3; - uint32_t input_c = 1; - uint32_t filter_h = 3; - uint32_t filter_w = 3; - uint32_t filter_c = 1; - uint32_t out_h = 1; - uint32_t out_w = 1; - - int32_t padding_left = 0; - int32_t padding_right = 0; - int32_t padding_top = 0; - int32_t padding_bottom = 0; - int32_t stride_width = 1; - int32_t stride_height = 1; - int32_t depth_multiplier = 1; - - util::TensorWrapper input({input_n, input_h, input_w, input_c}); - util::TensorWrapper weights({1, filter_h, filter_w, filter_c}); - util::TensorWrapper bias({filter_c}); - util::TensorWrapper output({1, out_h, out_w, filter_c}); - - int32_t activation = static_cast<int32_t>(FusedActivationFunc::RELU); - - input.initValue([&](uint32_t n, uint32_t c, uint32_t h, uint32_t w) { - uint32_t N = input_n; - uint32_t H = input_h; - uint32_t W = input_w; - uint32_t C = input_c; - - return n*H*W*C + h*W*C + w*C + c; - }); - weights.initValue([&](uint32_t n, uint32_t c, uint32_t h, uint32_t w) { - uint32_t N = 1; - uint32_t H = filter_h; - uint32_t W = filter_w; - 
uint32_t C = filter_c; - - return n*H*W*C + h*W*C + w*C + c; - }); - bias.initValue([](uint32_t w) { - return 0.f; - }); - output.initValue([](uint32_t n, uint32_t c, uint32_t h, uint32_t w) { - return 0.f; - }); - - bool bret = ACL_CORE_FUNC_NAME(input.ptr<float>(), input.shape(), - weights.ptr<float>(), weights.shape(), - bias.ptr<float>(), bias.shape(), - padding_left, padding_right, - padding_top, padding_bottom, - stride_width, stride_height, - depth_multiplier, activation, - output.ptr<float>(), output.shape()); - - EXPECT_EQ(bret, true); - - util::TensorWrapper expected({1, out_h, out_w, filter_c}); - expected.initValue([&](uint32_t n, uint32_t c, uint32_t h, uint32_t w) { - return 204.f; - }); - - EXPECT_EQ(output, expected); -} - -ACL_TEST(KernelACL_TC, dwise_conv2d_multi_channel) { - uint32_t input_n = 1; - uint32_t input_h = 3; - uint32_t input_w = 3; - uint32_t input_c = 3; - uint32_t filter_h = 3; - uint32_t filter_w = 3; - uint32_t filter_c = input_c; - uint32_t out_h = 1; - uint32_t out_w = 1; - - int32_t padding_left = 0; - int32_t padding_right = 0; - int32_t padding_top = 0; - int32_t padding_bottom = 0; - int32_t stride_width = 1; - int32_t stride_height = 1; - int32_t depth_multiplier = 1; - - util::TensorWrapper input({input_n, input_h, input_w, input_c}); - util::TensorWrapper weights({1, filter_h, filter_w, filter_c}); - util::TensorWrapper bias({filter_c}); - util::TensorWrapper output({1, out_h, out_w, filter_c}); - - int32_t activation = static_cast<int32_t>(FusedActivationFunc::RELU); - - input.initValue([&](uint32_t n, uint32_t c, uint32_t h, uint32_t w) { - uint32_t N = input_n; - uint32_t H = input_h; - uint32_t W = input_w; - uint32_t C = input_c; - - return n*H*W*C + h*W*C + w*C + c; - }); - weights.initValue([&](uint32_t n, uint32_t c, uint32_t h, uint32_t w) { - uint32_t N = 1; - uint32_t H = filter_h; - uint32_t W = filter_w; - uint32_t C = filter_c; - - return n*H*W*C + h*W*C + w*C + c; - }); - bias.initValue([](uint32_t w) { - 
return 0.f; - }); - output.initValue([](uint32_t n, uint32_t c, uint32_t h, uint32_t w) { - return 0.f; - }); - - bool bret = ACL_CORE_FUNC_NAME(input.ptr<float>(), input.shape(), - weights.ptr<float>(), weights.shape(), - bias.ptr<float>(), bias.shape(), - padding_left, padding_right, - padding_top, padding_bottom, - stride_width, stride_height, - depth_multiplier, activation, - output.ptr<float>(), output.shape()); - - EXPECT_EQ(bret, true); - - util::TensorWrapper expected({1, out_h, out_w, filter_c}); - expected.initValue({ - 1836.f, - 2061.f, - 2304.f - }); - - EXPECT_EQ(output, expected); -} - -ACL_TEST(KernelACL_TC, dwise_conv2d_inception_1) { - uint32_t input_n = 1; - uint32_t input_h = 112; - uint32_t input_w = 112; - uint32_t input_c = 32; - uint32_t filter_h = 3; - uint32_t filter_w = 3; - uint32_t filter_c = input_c; - uint32_t out_h = 112; - uint32_t out_w = 112; - - int32_t padding_left = 1; - int32_t padding_right = 1; - int32_t padding_top = 1; - int32_t padding_bottom = 1; - int32_t stride_width = 1; - int32_t stride_height = 1; - int32_t depth_multiplier = 1; - - util::TensorWrapper input({input_n, input_h, input_w, input_c}); - util::TensorWrapper weights({1, filter_h, filter_w, filter_c}); - util::TensorWrapper bias({filter_c}); - util::TensorWrapper output({1, out_h, out_w, filter_c}); - - int32_t activation = static_cast<int32_t>(FusedActivationFunc::RELU6); - - input.initValue([](uint32_t n, uint32_t c, uint32_t h, uint32_t w) { - return c; - }); - weights.initValue([](uint32_t n, uint32_t c, uint32_t h, uint32_t w) { - return c; - }); - bias.initValue([](uint32_t w) { - return 0.f; - }); - output.initValue([](uint32_t n, uint32_t c, uint32_t h, uint32_t w) { - return 0.f; - }); - - bool bret = ACL_CORE_FUNC_NAME(input.ptr<float>(), input.shape(), - weights.ptr<float>(), weights.shape(), - bias.ptr<float>(), bias.shape(), - padding_left, padding_right, - padding_top, padding_bottom, - stride_width, stride_height, - depth_multiplier, 
activation, - output.ptr<float>(), output.shape()); - - EXPECT_EQ(bret, true); - - util::TensorWrapper expected({1, out_h, out_w, filter_c}); - expected.initValue([&](uint32_t n, uint32_t c, uint32_t h, uint32_t w) { - float v = 9.f; - if( h == 0 || h == out_h-1 ) - v -= 3.f; - if( w == 0 || w == out_w-1 ) - v -= 3.f; - - // four corners - if( (w == 0 && h == 0) - || (w == 0 && h == out_h-1) - || (w == out_w-1 && h == 0) - || (w == out_w-1 && h == out_h-1) ) - v += 1.f; - - // Assumption: negative numbers cannot appear because - // only positive numbers exist in the input and weights. - float ret = c*c*v; - return std::min(ret, 6.f); - }); - - EXPECT_EQ(output, expected); -} diff --git a/libs/kernel/acl/src/FullyConnected.h b/libs/kernel/acl/src/FullyConnected.h deleted file mode 100644 index 5030a8548..000000000 --- a/libs/kernel/acl/src/FullyConnected.h +++ /dev/null @@ -1,149 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifndef __NNFW_KERNEL_ACL_FULLYCONNECTED_COMMON_H__ -#define __NNFW_KERNEL_ACL_FULLYCONNECTED_COMMON_H__ - -#include <OperationsUtils.h> -#include <arm_compute/core/TensorShape.h> -#include <arm_compute/core/TensorInfo.h> -#include <arm_compute/runtime/IFunction.h> - -#include "shape.h" -#include "IO_accessor.h" - -namespace nnfw { -namespace kernel { -namespace acl { - -namespace common { - -typedef std::function<void (void)> sync_scheduler_f; - -template<class TensorT, class LayerT, class ActT> -bool fullyConnectedFloat32(const float* inputData, const nnfw::rt::Shape& inputShape, - const float* weightsData, const nnfw::rt::Shape& weightsShape, - const float* biasData, const nnfw::rt::Shape& biasShape, - int32_t activation, - float* outputData, const nnfw::rt::Shape& outputShape, - sync_scheduler_f sync_scheduler) { - - // NNAPI specification: https://developer.android.com/ndk/reference/group___neural_networks.html#ggaabbe492c60331b13038e39d4207940e0aaada7a3dbaf4676aba560c933ff610c5 - - // According to the NNAPI Specification, - // INPUT - // 1. input rank is up to 4. - // 2. if input rank > 2, it is flattened to rank 2 [batch_size, input_size] - nnfw::rt::Shape flattenedInputShape = inputShape; - switch(inputShape.dimensions.size()) { - case 1: - { - assert("Need to be implemented." && 0); - break; - } - case 2: - { - // DO NOTHING. - break; - } - case 3: - { - assert("Need to be implemented." && 0); - break; - } - case 4: - { - auto N = inputShape.dimensions[0]; - auto H = inputShape.dimensions[1]; - auto W = inputShape.dimensions[2]; - auto C = inputShape.dimensions[3]; - flattenedInputShape.dimensions = {N, H*W*C}; - break; - } - default: - assert(inputShape.dimensions.size() <= 4); - } - // Finally, flattenedInputShape is a 2D tensor. - - // WEIGHTS is a 2D tensor - assert(weightsShape.dimensions.size() == 2); - - // BIAS is a 1D tensor - assert(biasShape.dimensions.size() == 1); - - // OUTPUT is a 2D tensor. 
- assert(outputShape.dimensions.size() == 2); - - auto input_shape = util::fromNNShape(flattenedInputShape); - auto weights_shape = util::fromNNShape(weightsShape); - auto bias_shape = util::fromNNShape(biasShape); - auto output_shape = util::fromNNShape(outputShape); - - assert(activation == ANEURALNETWORKS_FUSED_NONE || activation == ANEURALNETWORKS_FUSED_RELU); - - std::vector<std::shared_ptr<arm_compute::IFunction>> fns; - - TensorT input(arm_compute::TensorInfo(input_shape, arm_compute::Format::F32)); - TensorT output(arm_compute::TensorInfo(output_shape, arm_compute::Format::F32)); - TensorT bias(arm_compute::TensorInfo(bias_shape, arm_compute::Format::F32)); - TensorT weights(arm_compute::TensorInfo(weights_shape, arm_compute::Format::F32)); - - auto fc = std::make_shared<LayerT>(); - fc->configure(input.ptr(), weights.ptr(), bias.ptr(), output.ptr()); - - fns.emplace_back(fc); - - if (ANEURALNETWORKS_FUSED_RELU == activation) - { - auto relu_f = std::make_shared<ActT>(); - - const arm_compute::ActivationLayerInfo relu_info{arm_compute::ActivationLayerInfo::ActivationFunction::RELU}; - - // Do in-place update - relu_f->configure(output.ptr(), nullptr, relu_info); - - fns.emplace_back(relu_f); - } - - input.allocate(); - output.allocate(); - bias.allocate(); - weights.allocate(); - - // TODO: Do we need 2D tensor accessor for the input feature? 
- TensorAccess<MatrixWeightAccessor>(input.ref(), inputData, inputShape); - TensorAccess<BiasAccessor>(bias.ref(), biasData, biasShape); - TensorAccess<MatrixWeightAccessor>(weights.ref(), weightsData, weightsShape); - - for (const auto &fn : fns) - { - fn->run(); - } - - sync_scheduler(); - - TensorAccess<MatrixOutputAccessor>(output.ref(), outputData, outputShape); - - return true; -} - -} // namespace common - -} // namespace acl -} // namespace kernel -} // namespace nnfw - -#endif // __NNFW_KERNEL_ACL_FULLYCONNECTED_COMMON_H__ diff --git a/libs/kernel/acl/src/FullyConnected.test.h b/libs/kernel/acl/src/FullyConnected.test.h deleted file mode 100644 index 01bbff802..000000000 --- a/libs/kernel/acl/src/FullyConnected.test.h +++ /dev/null @@ -1,266 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include <gtest/gtest.h> -#include <OperationsUtils.h> -#include <kernel/acl/nnfw_kernel_acl.h> -#include <kernel/acl/FullyConnected.h> - -// TODO: fix include path in CMakeFiles -#include "util.h" - -#ifndef ACL_TEST -#error "ACL_TEST should be defined first!" -#endif // ACL_TEST - -#ifndef ACL_CORE_FUNC_NAME -#error "ACL_CORE_FUNC_NAME should be defined first!" 
-#endif // ACL_CORE_FUNC_NAME - -using namespace nnfw::kernel::acl; -using fullyConnectedFloat32T = bool (*)(const float* inputData, const nnfw::rt::Shape& inputShape, - const float* weightsData, const nnfw::rt::Shape& weightsShape, - const float* biasData, const nnfw::rt::Shape& biasShape, - int32_t activation, - float* outputData, const nnfw::rt::Shape& outputShape); - -ACL_TEST(KernelACL_TC, fcFloat32_1) { - - util::TensorWrapper input({1,1,1,100}); - util::TensorWrapper weights({1,100}); - util::TensorWrapper bias({1}); - util::TensorWrapper output({1,1}); - - int32_t activation = static_cast<int32_t>(FusedActivationFunc::RELU); - - input.initValue([](uint32_t n, uint32_t c, uint32_t h, uint32_t w) { - return 1.f; - }); - weights.initValue([](uint32_t h, uint32_t w) { - return 1.f; - }); - bias.initValue([](uint32_t w) { - return 0.f; - }); - output.initValue([](uint32_t h, uint32_t w) { - return 0.f; - }); - - bool bret = ACL_CORE_FUNC_NAME(input.ptr<float>(), input.shape(), - weights.ptr<float>(), weights.shape(), - bias.ptr<float>(), bias.shape(), - activation, - output.ptr<float>(), output.shape()); - - EXPECT_EQ(bret, true); - - util::TensorWrapper expected({1,1}); - expected.initValue([](uint32_t h, uint32_t w) { - return 100.f; - }); - - EXPECT_EQ(output, expected); -} - -ACL_TEST(KernelACL_TC, fcFloat32_relu) { - - util::TensorWrapper input({1,1,1,100}); - util::TensorWrapper weights({1,100}); - util::TensorWrapper bias({1}); - util::TensorWrapper output({1,1}); - - int32_t activation = static_cast<int32_t>(FusedActivationFunc::RELU); - - input.initValue([](uint32_t n, uint32_t c, uint32_t h, uint32_t w) { - return 1.f; - }); - weights.initValue([](uint32_t h, uint32_t w) { - return -1.f; - }); - bias.initValue([](uint32_t w) { - return 0.f; - }); - output.initValue([](uint32_t h, uint32_t w) { - return 0.f; - }); - - bool bret = ACL_CORE_FUNC_NAME(input.ptr<float>(), input.shape(), - weights.ptr<float>(), weights.shape(), - bias.ptr<float>(), 
bias.shape(), - activation, - output.ptr<float>(), output.shape()); - - EXPECT_EQ(bret, true); - - util::TensorWrapper expected({1,1}); - expected.initValue([](uint32_t h, uint32_t w) { - return 0.f; - }); - - EXPECT_EQ(output, expected); -} - -ACL_TEST(KernelACL_TC, fcFloat32_conv_fc) { - uint32_t input_n = 1; - uint32_t input_c = 5; - uint32_t input_h = 4; - uint32_t input_w = 4; - uint32_t weight_n = 6; - - int32_t activation = static_cast<int32_t>(FusedActivationFunc::RELU); - - util::TensorWrapper input({input_n, input_h, input_w, input_c}); - util::TensorWrapper weight({weight_n, input_c*input_h*input_w}); - util::TensorWrapper bias({weight_n}); - util::TensorWrapper output({1, weight_n}); - - input.initValue([&](uint32_t n, uint32_t c, uint32_t h, uint32_t w) { - uint32_t N = input_n; - uint32_t H = input_h; - uint32_t W = input_w; - uint32_t C = input_c; - - return n*H*W*C + h*W*C + w*C + c; - }); - - weight.initValue([&](uint32_t h, uint32_t w) { - uint32_t H = weight_n; - uint32_t W = input_c*input_h*input_w; - - return h*W + w; - }); - - bias.initValue([](uint32_t w) { - return 0.f; - }); - - output.initValue([](uint32_t h, uint32_t w) { - return 0.f; - }); - - bool bret = ACL_CORE_FUNC_NAME(input.ptr<float>(), input.shape(), - weight.ptr<float>(), weight.shape(), - bias.ptr<float>(), bias.shape(), - activation, - output.ptr<float>(), output.shape()); - - EXPECT_EQ(bret, true); - - util::TensorWrapper expected({1, weight_n}); - expected.initValue({ - 167480.f, - 420280.f, - 673080.f, - 925880.f, - 1178680.f, - 1431480.f}); - - EXPECT_EQ(output, expected); -} - -ACL_TEST(KernelACL_TC, fcFloat32_fc_fc) { - uint32_t input_n = 6; - uint32_t weight_n = 6; - - int32_t activation = static_cast<int32_t>(FusedActivationFunc::RELU); - - util::TensorWrapper input({1, input_n}); - util::TensorWrapper weight({weight_n, input_n}); - util::TensorWrapper bias({weight_n}); - util::TensorWrapper output({1, weight_n}); - - input.initValue([&](uint32_t h, uint32_t w) { - // 
not use h because h = 0. - return (float)w; - }); - - weight.initValue([&](uint32_t h, uint32_t w) { - uint32_t H = weight_n; - uint32_t W = input_n; - - return (float)(h*W + w); - }); - - bias.initValue([](uint32_t w) { - return 0.f; - }); - - output.initValue([](uint32_t h, uint32_t w) { - return 0.f; - }); - - bool bret = ACL_CORE_FUNC_NAME(input.ptr<float>(), input.shape(), - weight.ptr<float>(), weight.shape(), - bias.ptr<float>(), bias.shape(), - activation, - output.ptr<float>(), output.shape()); - - EXPECT_EQ(bret, true); - - util::TensorWrapper expected({1, weight_n}); - expected.initValue({ - 55.f, - 145.f, - 235.f, - 325.f, - 415.f, - 505.f, - }); - - EXPECT_EQ(output, expected); -} - -ACL_TEST(KernelACL_TC, fcFloat32_inceptionv3) { - - uint32_t input_c = 2048; - uint32_t weight_n = 1008; - - util::TensorWrapper input({1,1,1,input_c}); - util::TensorWrapper weight({weight_n,input_c}); - util::TensorWrapper bias({weight_n}); - util::TensorWrapper output({1, weight_n}); - - int32_t activation = static_cast<int32_t>(FusedActivationFunc::RELU); - - input.initValue([&](uint32_t n, uint32_t c, uint32_t h, uint32_t w) { - return 1.f; - }); - weight.initValue([&](uint32_t h, uint32_t w) { - return (float)h; - }); - bias.initValue([](uint32_t w) { - return 0.f; - }); - output.initValue([](uint32_t h, uint32_t w) { - return 0.f; - }); - - bool bret = ACL_CORE_FUNC_NAME(input.ptr<float>(), input.shape(), - weight.ptr<float>(), weight.shape(), - bias.ptr<float>(), bias.shape(), - activation, - output.ptr<float>(), output.shape()); - - EXPECT_EQ(bret, true); - - util::TensorWrapper expected({1, weight_n}); - expected.initValue([&](uint32_t h, uint32_t w) { - return w*input_c; - }); - - EXPECT_EQ(output, expected); -} - diff --git a/libs/kernel/acl/src/IO_accessor.cpp b/libs/kernel/acl/src/IO_accessor.cpp deleted file mode 100644 index 410fb8ea5..000000000 --- a/libs/kernel/acl/src/IO_accessor.cpp +++ /dev/null @@ -1,310 +0,0 @@ -/* - * Copyright (c) 2018 Samsung 
Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "IO_accessor.h" - -#include <cassert> - -namespace nnfw { -namespace kernel { -namespace acl { - -InputAccessor::InputAccessor(const float* inputData, const nnfw::rt::Shape& inputShape) - : _inputData(inputData) - , _inputShape(inputShape) -{ -} - -MatrixInputAccessor::MatrixInputAccessor(const float* inputData, const nnfw::rt::Shape& inputShape) - : _inputData(inputData) - , _inputShape(inputShape) -{ -} - -VectorInputAccessor::VectorInputAccessor(const float* inputData, const nnfw::rt::Shape& inputShape) - : _inputData(inputData) - , _inputShape(inputShape) -{ -} - -WeightAccessor::WeightAccessor(const float* filterData, const nnfw::rt::Shape& filterShape) - : _filterData(filterData) - , _filterShape(filterShape) -{ -} - -MatrixWeightAccessor::MatrixWeightAccessor(const float* filterData, const nnfw::rt::Shape& filterShape) - : _filterData(filterData) - , _filterShape(filterShape) -{ -} - -BiasAccessor::BiasAccessor(const float* biasData, const nnfw::rt::Shape& biasShape) - : _biasData(biasData) - , _biasShape(biasShape) -{ -} - -OutputAccessor::OutputAccessor(float* outputData, const nnfw::rt::Shape& outputShape) - : _outputData(outputData) - , _outputShape(outputShape) -{ -} - -MatrixOutputAccessor::MatrixOutputAccessor(float* outputData, const nnfw::rt::Shape& outputShape) - : _outputData(outputData) - , _outputShape(outputShape) -{ -} - 
-VectorOutputAccessor::VectorOutputAccessor(float* outputData, const nnfw::rt::Shape& outputShape) - : _outputData(outputData) - , _outputShape(outputShape) -{ -} - -static uint32_t getOffsetNCHW(const nnfw::rt::Shape& shape, const arm_compute::Coordinates& id) -{ - // get offset for ACL(NCHW) from data of NNAPI(NHWC) - uint32_t num = getSizeOfDimension(shape, 0); - uint32_t height = getSizeOfDimension(shape, 1); - uint32_t width = getSizeOfDimension(shape, 2); - uint32_t chann = getSizeOfDimension(shape, 3); - uint32_t stride = 1; - uint32_t offset = 0; - uint32_t numdim = id.num_dimensions(); - offset += numdim > 0 ? id[0] * stride : 0; stride *= width; - offset += numdim > 1 ? id[1] * stride : 0; stride *= height; - offset += numdim > 2 ? id[2] * stride : 0; stride *= chann; - offset += numdim > 3 ? id[3] * stride : 0; stride *= num; - return offset; -} - -static uint32_t getElementOffset(const nnfw::rt::Shape& shape, - uint32_t ch, uint32_t row, uint32_t col) -{ - assert(getSizeOfDimension(shape, 0) == 1); - assert(shape.dimensions.size() == 4); - - // TODO Optimize this! - const uint32_t W = getSizeOfDimension(shape, 2); - const uint32_t C = getSizeOfDimension(shape, 3); - - int offset = 0; - - // NNAPI uses NHWC ordering - offset += row * W * C; - offset += col * C; - offset += ch; - - return offset; -} - -static uint32_t getElementOffset(const nnfw::rt::Shape& shape, - uint32_t nth, uint32_t ch, uint32_t row, uint32_t col) -{ - assert(shape.dimensions.size() == 4); - - // TODO Optimize this! 
- const uint32_t H = getSizeOfDimension(shape, 1); - const uint32_t W = getSizeOfDimension(shape, 2); - const uint32_t C = getSizeOfDimension(shape, 3); - - int offset = 0; - - // NNAPI uses NHWC ordering - offset += nth * H * W * C; - offset += row * W * C; - offset += col * C; - offset += ch; - - return offset; -} - -bool InputAccessor::access_tensor(arm_compute::ITensor &tensor) -{ - arm_compute::Window window; - window.use_tensor_dimensions(tensor.info()->tensor_shape()); - - execute_window_loop(window, [&](const arm_compute::Coordinates& id) - { - const uint32_t ch = id[2]; - const uint32_t row = id[1]; - const uint32_t col = id[0]; - - uint32_t offset = getElementOffset(_inputShape, ch, row, col); - - *reinterpret_cast<float *>(tensor.ptr_to_element(id)) = - *(_inputData + offset); - }); - return true; -} - -bool MatrixInputAccessor::access_tensor(arm_compute::ITensor &tensor) -{ - arm_compute::Window window; - window.use_tensor_dimensions(tensor.info()->tensor_shape()); - - assert(tensor.info()->tensor_shape().num_dimensions() <= 2); - - execute_window_loop(window, [&](const arm_compute::Coordinates& id) - { - const auto row = id[1]; - const auto col = id[0]; - const auto W = tensor.info()->tensor_shape().x(); - - const auto offset = row * W + col; - - *reinterpret_cast<float *>(tensor.ptr_to_element(id)) = - *(_inputData + offset); - }); - return true; -} - -bool VectorInputAccessor::access_tensor(arm_compute::ITensor &tensor) -{ - arm_compute::Window window; - window.use_tensor_dimensions(tensor.info()->tensor_shape()); - - assert(tensor.info()->tensor_shape().num_dimensions() == 1); - - execute_window_loop(window, [&](const arm_compute::Coordinates& id) - { - uint32_t offset = id[0]; - - *reinterpret_cast<float *>(tensor.ptr_to_element(id)) = - *(_inputData + offset); - }); - return true; -} - -bool WeightAccessor::access_tensor(arm_compute::ITensor &tensor) -{ - arm_compute::Window window; - window.use_tensor_dimensions(tensor.info()->tensor_shape()); - 
- execute_window_loop(window, [&](const arm_compute::Coordinates& id) - { - const uint32_t nth = id[3]; - const uint32_t ch = id[2]; - const uint32_t row = id[1]; - const uint32_t col = id[0]; - - uint32_t offset = getElementOffset(_filterShape, nth, ch, row, col); - - *reinterpret_cast<float *>(tensor.ptr_to_element(id)) = - *(_filterData + offset); - }); - return true; -} - -bool MatrixWeightAccessor::access_tensor(arm_compute::ITensor &tensor) -{ - arm_compute::Window window; - window.use_tensor_dimensions(tensor.info()->tensor_shape()); - - assert(tensor.info()->tensor_shape().num_dimensions() <= 2); - - execute_window_loop(window, [&](const arm_compute::Coordinates& id) - { - const auto row = id[1]; - const auto col = id[0]; - const auto W = tensor.info()->tensor_shape().x(); - - uint32_t offset = row * W + col; - - *reinterpret_cast<float *>(tensor.ptr_to_element(id)) = - *(_filterData + offset); - }); - return true; -} - -bool BiasAccessor::access_tensor(arm_compute::ITensor &tensor) -{ - arm_compute::Window window; - window.use_tensor_dimensions(tensor.info()->tensor_shape()); - - execute_window_loop(window, [&](const arm_compute::Coordinates& id) - { - uint32_t offset = getOffsetNCHW(_biasShape, id); - *reinterpret_cast<float *>(tensor.ptr_to_element(id)) = - *(_biasData + offset); - }); - return true; -} - -bool OutputAccessor::access_tensor(arm_compute::ITensor &tensor) -{ - arm_compute::Window window; - window.use_tensor_dimensions(tensor.info()->tensor_shape()); - - execute_window_loop(window, [&](const arm_compute::Coordinates& id) - { - const uint32_t ch = id[2]; - const uint32_t row = id[1]; - const uint32_t col = id[0]; - - uint32_t offset = getElementOffset(_outputShape, ch, row, col); - - *(_outputData + offset) = - *reinterpret_cast<float *>(tensor.ptr_to_element(id)); - }); - return false; // end the network -} - -bool VectorOutputAccessor::access_tensor(arm_compute::ITensor &tensor) -{ - arm_compute::Window window; - 
window.use_tensor_dimensions(tensor.info()->tensor_shape()); - - assert(tensor.info()->tensor_shape().num_dimensions() == 1); - - execute_window_loop(window, [&](const arm_compute::Coordinates& id) - { - const uint32_t x = id[0]; - - uint32_t offset = x; - - *(_outputData + offset) = - *reinterpret_cast<float *>(tensor.ptr_to_element(id)); - }); - return false; // end the network -} - -bool MatrixOutputAccessor::access_tensor(arm_compute::ITensor &tensor) -{ - arm_compute::Window window; - window.use_tensor_dimensions(tensor.info()->tensor_shape()); - - assert(tensor.info()->tensor_shape().num_dimensions() <= 2); - - execute_window_loop(window, [&](const arm_compute::Coordinates& id) - { - const auto row = id[1]; - const auto col = id[0]; - const auto W = tensor.info()->tensor_shape().x(); - - const auto offset = row * W + col; - - *(_outputData + offset) = - *reinterpret_cast<float *>(tensor.ptr_to_element(id)); - }); - return false; // end the network -} - -} // namespace acl -} // namespace kernel -} // namespace nnfw diff --git a/libs/kernel/acl/src/IO_accessor.h b/libs/kernel/acl/src/IO_accessor.h deleted file mode 100644 index e7670f15c..000000000 --- a/libs/kernel/acl/src/IO_accessor.h +++ /dev/null @@ -1,196 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifndef __NNFW_KERNEL_ACL_IO_ACCESSOR_H__ -#define __NNFW_KERNEL_ACL_IO_ACCESSOR_H__ - -#include <arm_compute/graph/ITensorAccessor.h> -#include <arm_compute/runtime/CL/CLFunctions.h> -#include <arm_compute/runtime/NEON/NEFunctions.h> - -#include <OperationsUtils.h> // for nnfw::rt::Shape - -namespace nnfw { -namespace kernel { -namespace acl { - -class InputAccessor : public arm_compute::graph::ITensorAccessor -{ -public: - InputAccessor(const float* inputData, const nnfw::rt::Shape& inputShape); - InputAccessor(InputAccessor&&) = default; - - // Inherited methods overriden: - bool access_tensor(arm_compute::ITensor& tensor) override; - -private: - const float* _inputData; - const nnfw::rt::Shape& _inputShape; -}; - -class MatrixInputAccessor : public arm_compute::graph::ITensorAccessor -{ -public: - MatrixInputAccessor(const float* inputData, const nnfw::rt::Shape& inputShape); - MatrixInputAccessor(MatrixInputAccessor&&) = default; - - // Inherited methods overriden: - bool access_tensor(arm_compute::ITensor& tensor) override; - -private: - const float* _inputData; - const nnfw::rt::Shape& _inputShape; -}; - -class VectorInputAccessor : public arm_compute::graph::ITensorAccessor -{ -public: - VectorInputAccessor(const float* inputData, const nnfw::rt::Shape& inputShape); - VectorInputAccessor(VectorInputAccessor&&) = default; - - // Inherited methods overriden: - bool access_tensor(arm_compute::ITensor& tensor) override; - -private: - const float* _inputData; - const nnfw::rt::Shape& _inputShape; -}; - -class WeightAccessor : public arm_compute::graph::ITensorAccessor -{ -public: - WeightAccessor(const float* filterData, const nnfw::rt::Shape& filterShape); - WeightAccessor(WeightAccessor&&) = default; - - // Inherited methods overriden: - bool access_tensor(arm_compute::ITensor& tensor) override; - -private: - const float* _filterData; - const nnfw::rt::Shape& _filterShape; -}; - -class MatrixWeightAccessor : public arm_compute::graph::ITensorAccessor 
-{ -public: - MatrixWeightAccessor(const float* filterData, const nnfw::rt::Shape& filterShape); - MatrixWeightAccessor(MatrixWeightAccessor&&) = default; - - // Inherited methods overriden: - bool access_tensor(arm_compute::ITensor& tensor) override; - -private: - const float* _filterData; - const nnfw::rt::Shape& _filterShape; -}; - -class BiasAccessor : public arm_compute::graph::ITensorAccessor -{ -public: - BiasAccessor(const float* biasData, const nnfw::rt::Shape& biasShape); - BiasAccessor(BiasAccessor&&) = default; - - // Inherited methods overriden: - bool access_tensor(arm_compute::ITensor& tensor) override; - -private: - const float* _biasData; - const nnfw::rt::Shape& _biasShape; -}; - -class OutputAccessor : public arm_compute::graph::ITensorAccessor -{ -public: - OutputAccessor(float* outputData, const nnfw::rt::Shape& outputShape); - OutputAccessor(OutputAccessor&&) = default; - - // Inherited methods overriden: - bool access_tensor(arm_compute::ITensor& tensor) override; - -private: - float* _outputData; - const nnfw::rt::Shape& _outputShape; -}; - -class MatrixOutputAccessor : public arm_compute::graph::ITensorAccessor -{ -public: - MatrixOutputAccessor(float* outputData, const nnfw::rt::Shape& outputShape); - MatrixOutputAccessor(MatrixOutputAccessor&&) = default; - - // Inherited methods overriden: - bool access_tensor(arm_compute::ITensor& tensor) override; - -private: - float* _outputData; - const nnfw::rt::Shape& _outputShape; -}; - -class VectorOutputAccessor : public arm_compute::graph::ITensorAccessor -{ -public: - VectorOutputAccessor(float* outputData, const nnfw::rt::Shape& outputShape); - VectorOutputAccessor(VectorOutputAccessor&&) = default; - - // Inherited methods overriden: - bool access_tensor(arm_compute::ITensor& tensor) override; - -private: - float* _outputData; - const nnfw::rt::Shape& _outputShape; -}; - -template<typename AccessorType> -inline void TensorAccess(arm_compute::CLTensor& tensor, const float* data, - const 
nnfw::rt::Shape& shape) -{ - tensor.map(); - AccessorType accessor(data, shape); - accessor.access_tensor(tensor); - tensor.unmap(); -} - -template<typename AccessorType> -inline void TensorAccess(arm_compute::CLTensor& tensor, float* data, - const nnfw::rt::Shape& shape) -{ - tensor.map(); - AccessorType accessor(data, shape); - accessor.access_tensor(tensor); - tensor.unmap(); -} - -template<typename AccessorType> -inline void TensorAccess(arm_compute::Tensor& tensor, const float* data, - const nnfw::rt::Shape& shape) -{ - AccessorType accessor(data, shape); - accessor.access_tensor(tensor); -} - -template<typename AccessorType> -inline void TensorAccess(arm_compute::Tensor& tensor, float* data, - const nnfw::rt::Shape& shape) -{ - AccessorType accessor(data, shape); - accessor.access_tensor(tensor); -} - -} // namespace acl -} // namespace kernel -} // namespace nnfw - -#endif // __NNFW_KERNEL_ACL_IO_ACCESSOR_H__ diff --git a/libs/kernel/acl/src/NEUniqueTensor.h b/libs/kernel/acl/src/NEUniqueTensor.h deleted file mode 100644 index 34412f9e3..000000000 --- a/libs/kernel/acl/src/NEUniqueTensor.h +++ /dev/null @@ -1,64 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifndef __NNFW_KERNEL_ACL_NEUNIQUETENSOR_H__ -#define __NNFW_KERNEL_ACL_NEUNIQUETENSOR_H__ - -#include <arm_compute/runtime/Tensor.h> - -namespace nnfw { -namespace kernel { -namespace acl { - -// TODO: find a way to merge CLUniqueTensor and NEUniqueTensor. -class NEUniqueTensor -{ -public: - NEUniqueTensor(const ::arm_compute::TensorInfo &info) - { - _tensor.allocator()->init(info); - } - -public: - // Both copy and move are not allowed - NEUniqueTensor(const NEUniqueTensor &) = delete; - NEUniqueTensor(NEUniqueTensor &&) = delete; - -public: - ~NEUniqueTensor() - { - _tensor.allocator()->free(); - } - -public: - void allocate() - { - _tensor.allocator()->allocate(); - } - -public: - ::arm_compute::Tensor &ref(void) { return _tensor; } - ::arm_compute::Tensor *ptr(void) { return &_tensor; } - -private: - ::arm_compute::Tensor _tensor; -}; - -} // namespace acl -} // namespace kernel -} // namespace nnfw - -#endif //__NNFW_KERNEL_ACL_NEUNIQUETENSOR_H__ diff --git a/libs/kernel/acl/src/Reshape.h b/libs/kernel/acl/src/Reshape.h deleted file mode 100644 index ebd82477d..000000000 --- a/libs/kernel/acl/src/Reshape.h +++ /dev/null @@ -1,70 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifndef __NNFW_KERNEL_ACL_RESHAPE_COMMON_H__ -#define __NNFW_KERNEL_ACL_RESHAPE_COMMON_H__ -#include <OperationsUtils.h> -#include <arm_compute/core/TensorShape.h> -#include <arm_compute/core/TensorInfo.h> - -// TODO: fix include path in CMakeFiles -#include "IO_accessor.h" -#include "shape.h" - -namespace nnfw { -namespace kernel { -namespace acl { - -namespace common { - -typedef std::function<void (void)> sync_scheduler_f; - -template<class TensorT, class LayerT> -bool reshapeGeneric(const void* inputData, const nnfw::rt::Shape& inputShape, - void* outputData, const nnfw::rt::Shape& outputShape, - sync_scheduler_f sync_scheduler) { - - auto input_shape = util::fromNNShape(inputShape); - auto output_shape = util::fromNNShape(outputShape); - - TensorT input(arm_compute::TensorInfo(input_shape, arm_compute::Format::F32)); - TensorT output(arm_compute::TensorInfo(output_shape, arm_compute::Format::F32)); - - LayerT l; - - l.configure(input.ptr(), output.ptr()); - - input.allocate(); - output.allocate(); - - TensorAccess<InputAccessor>(input.ref(), (float*)inputData, inputShape); - - l.run(); - - sync_scheduler(); - - TensorAccess<OutputAccessor>(output.ref(), (float*)outputData, outputShape); - - return true; -} - -} // namespace common - -} // namespace acl -} // namespace kernel -} // namespace nnfw - -#endif // __NNFW_KERNEL_ACL_RESHAPE_COMMON_H__ diff --git a/libs/kernel/acl/src/Reshape.test.h b/libs/kernel/acl/src/Reshape.test.h deleted file mode 100644 index a96a896a6..000000000 --- a/libs/kernel/acl/src/Reshape.test.h +++ /dev/null @@ -1,51 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include <gtest/gtest.h> -#include <OperationsUtils.h> -#include <kernel/acl/nnfw_kernel_acl.h> -#include <kernel/acl/Reshape.h> - -// TODO: fix include path in CMakeFiles -#include "util.h" - -#ifndef ACL_TEST -#error "ACL_TEST should be defined first!" -#endif // ACL_TEST - -#ifndef ACL_CORE_FUNC_NAME -#error "ACL_CORE_FUNC_NAME should be defined first!" -#endif // ACL_CORE_FUNC_NAME - -using namespace nnfw::kernel::acl; - -ACL_TEST(KernelACL_TC, reshape_1) { - const nnfw::rt::Shape inputShape = {OperandType::FLOAT32, {1,1,9,1}, 1.0, 0}; - float inputData[9] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f}; - - const nnfw::rt::Shape outputShape = { OperandType::FLOAT32, {1,3,3,1}, 1.0, 0 }; - float outputData[9] = {0}; - - bool bret = ACL_CORE_FUNC_NAME(inputData, inputShape, - outputData, outputShape); - - EXPECT_EQ(bret, true); - - float expectData[9] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f}; - bret = util::compareData(outputData, expectData, outputShape); - EXPECT_EQ(bret, true); - -} diff --git a/libs/kernel/acl/src/cl/Concatenation.cpp b/libs/kernel/acl/src/cl/Concatenation.cpp deleted file mode 100644 index 9376006ca..000000000 --- a/libs/kernel/acl/src/cl/Concatenation.cpp +++ /dev/null @@ -1,104 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include <OperationsUtils.h> -#include <arm_compute/core/TensorShape.h> -#include <arm_compute/core/TensorInfo.h> - -#include <cassert> - -// TODO: fix include path in CMakeFiles -#include "../IO_accessor.h" -#include "../shape.h" - -namespace nnfw { -namespace kernel { -namespace acl { - -bool concatenationFloat32(const std::vector<const float*>& inputDataPtrs, - const std::vector<nnfw::rt::Shape>& inputShapes, int32_t axis, - float* outputData, const nnfw::rt::Shape& outputShape) -{ - if (axis != 3) - { - assert("Only support axis=3 for ACL" && 0); - return false; - } - assert(inputDataPtrs.size() == inputShapes.size()); - - std::vector<arm_compute::CLTensor*> inputPtrs; - std::vector<arm_compute::ICLTensor*> inputIptrs; - arm_compute::CLTensor output; - - // init Tensors - std::vector<nnfw::rt::Shape>::const_iterator it_inputShape = inputShapes.begin(); - for (auto inputData : inputDataPtrs) - { - const nnfw::rt::Shape& inputShape = *it_inputShape; - arm_compute::TensorShape input_shape = util::fromNNShape(inputShape); - arm_compute::CLTensor* inputPtr = new arm_compute::CLTensor(); - - inputPtr->allocator()->init(arm_compute::TensorInfo(input_shape, arm_compute::Format::F32)); - inputPtrs.push_back(inputPtr); - inputIptrs.push_back(inputPtr); - - it_inputShape++; - } - arm_compute::TensorShape output_shape = util::fromNNShape(outputShape); - output.allocator()->init(arm_compute::TensorInfo(output_shape, arm_compute::Format::F32)); - - // prepare ACL Concatenate and configure tensors - auto concat = 
std::make_shared<arm_compute::CLDepthConcatenateLayer>(); - concat->configure(inputIptrs, &output); - - // allocate Tensors - it_inputShape = inputShapes.begin(); - std::vector<const float*>::const_iterator it_inputData = inputDataPtrs.begin(); - for (auto inputPtr : inputPtrs) - { - inputPtr->allocator()->allocate(); - - const float* inputData = *it_inputData; - const nnfw::rt::Shape& inputShape = *it_inputShape; - - TensorAccess<InputAccessor>(*inputPtr, inputData, inputShape); - - it_inputShape++; - it_inputData++; - } - output.allocator()->allocate(); - - // run - concat->run(); - arm_compute::CLScheduler::get().sync(); - - // get output - TensorAccess<OutputAccessor>(output, outputData, outputShape); - - // cleanup - for (auto inputPtr : inputPtrs) - { - inputPtr->allocator()->free(); - delete inputPtr; - } - output.allocator()->free(); - - return true; -} - -} // namespace acl -} // namespace kernel -} // namespace nnfw diff --git a/libs/kernel/acl/src/cl/Concatenation.test.cpp b/libs/kernel/acl/src/cl/Concatenation.test.cpp deleted file mode 100644 index b2c5a5891..000000000 --- a/libs/kernel/acl/src/cl/Concatenation.test.cpp +++ /dev/null @@ -1,62 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include <gtest/gtest.h> -#include <OperationsUtils.h> -#include <kernel/acl/nnfw_kernel_acl.h> -#include <kernel/acl/Concatenation.h> - -// TODO: fix include path in CMakeFiles -#include "../util.h" - -using namespace nnfw::kernel::acl; - -TEST(KernelACL_TC, concatFloat32_1) -{ - float inputData_1[6] = { - 1, 2, 3, 4, 5, 6 // [ [ [1],[2],[3] ], [ [4],[5],[6] ] ] - }; - float inputData_2[6] = { - 7, 8, 9, 10, 11, 12 // [ [ [7],[8],[9] ], [ [10],[11],[12] ] ] - }; - const nnfw::rt::Shape inputShape_1 = { OperandType::FLOAT32, {1,2,3,1}, 1.0, 0 }; - const nnfw::rt::Shape inputShape_2 = { OperandType::FLOAT32, {1,2,3,1}, 1.0, 0 }; - std::vector<const float*> inputDataPtrs; - std::vector<nnfw::rt::Shape> inputShapes; - float outputData[12]; - const nnfw::rt::Shape outputShape = { OperandType::FLOAT32, {1,2,3,2}, 1.0, 0 }; - bool bret; - - inputDataPtrs.push_back(inputData_1); - inputDataPtrs.push_back(inputData_2); - inputShapes.push_back(inputShape_1); - inputShapes.push_back(inputShape_2); - - util::initData(outputData, sizeof(outputData) / sizeof(outputData[0]), 0.0); - - bret = concatenationFloat32(inputDataPtrs, inputShapes, 3, - outputData, outputShape); - EXPECT_EQ(bret, true); - - float expectNCHW[] = { - 1, 2, 3, 4, 5, 6, - 7, 8, 9, 10, 11, 12 - }; - float expectData[12]; // [ [ [1,7],[2,8],[3,9] ], [ [4,10],[5,11],[6,12] ] ] - util::NCHW2NHWC(expectNCHW, expectData, outputShape); - bret = util::compareData(outputData, expectData, outputShape); - EXPECT_EQ(bret, true); -} diff --git a/libs/kernel/acl/src/cl/Conv2D.cpp b/libs/kernel/acl/src/cl/Conv2D.cpp deleted file mode 100644 index 4783bdc1d..000000000 --- a/libs/kernel/acl/src/cl/Conv2D.cpp +++ /dev/null @@ -1,113 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include <OperationsUtils.h> -#include <NeuralNetworks.h> - -#include <arm_compute/core/TensorShape.h> -#include <arm_compute/core/TensorInfo.h> - -#include <util/environment.h> - -#include "../IO_accessor.h" -#include "../util.h" -#include "../shape.h" -#include "../CLUniqueTensor.h" -#include "../support.h" - -#include "util/feature/TextFormatter.h" - -#include "support/nnapi/feature/Reader.h" - -namespace nnfw { -namespace kernel { -namespace acl { - -static int verbose = 0; - -bool convFloat32(const float* inputData, const nnfw::rt::Shape& inputShape, - const float* filterData, const nnfw::rt::Shape& filterShape, - const float* biasData, const nnfw::rt::Shape& biasShape, - int32_t padding_left, int32_t padding_right, - int32_t padding_top, int32_t padding_bottom, - int32_t stride_width, int32_t stride_height, - int32_t activation, - float* outputData, const nnfw::rt::Shape& outputShape) -{ - arm_compute::TensorShape input_shape = util::fromNNShape(inputShape); - arm_compute::TensorShape filter_shape = util::fromNNShape(filterShape); - arm_compute::TensorShape bias_shape = util::fromVectorNNShape(biasShape); - arm_compute::TensorShape output_shape = util::fromNNShape(outputShape); - arm_compute::PadStrideInfo conv_info = arm_compute::PadStrideInfo(stride_width, stride_height, - padding_left, padding_right, - padding_top, padding_bottom, - arm_compute::DimensionRoundingType::FLOOR); - - CLUniqueTensor input(arm_compute::TensorInfo(input_shape, arm_compute::Format::F32)); - CLUniqueTensor output(arm_compute::TensorInfo(output_shape, 
arm_compute::Format::F32)); - CLUniqueTensor bias(arm_compute::TensorInfo(bias_shape, arm_compute::Format::F32)); - CLUniqueTensor filter(arm_compute::TensorInfo(filter_shape, arm_compute::Format::F32)); - - std::vector<std::shared_ptr<arm_compute::IFunction>> fns; - - auto conv_f = std::make_shared<arm_compute::CLConvolutionLayer>(); - - conv_f->configure(input.ptr(), filter.ptr(), bias.ptr(), output.ptr(), conv_info); - - fns.emplace_back(conv_f); - - util::insertFusedActivationLayer<CLUniqueTensor, arm_compute::CLActivationLayer>(output, activation, fns); - - input.allocate(); - output.allocate(); - bias.allocate(); - filter.allocate(); - - TensorAccess<InputAccessor>(input.ref(), inputData, inputShape); - TensorAccess<BiasAccessor>(bias.ref(), biasData, biasShape); - TensorAccess<WeightAccessor>(filter.ref(), filterData, filterShape); - - nnfw::util::env::IntAccessor("CONV2D_VERBOSE").access(verbose); - if (verbose) - { - input.ref().map(); - auto ifm_shape = nnfw::support::nnapi::feature::asFeatureShape(inputShape); - nnfw::support::nnapi::feature::Reader<float> nnapi_ifm_reader{ifm_shape, inputData}; - nnfw::support::acl::feature::Reader<float> acl_ifm_reader{input.ptr()}; - - std::cout << "NNAPI IFM:" << std::endl; - std::cout << nnfw::util::feature::TextFormatter<float>{ifm_shape, nnapi_ifm_reader} << std::endl; - - std::cout << "ARM Compute IFM:" << std::endl; - std::cout << nnfw::util::feature::TextFormatter<float>{ifm_shape, acl_ifm_reader} << std::endl; - input.ref().unmap(); - } - - for (const auto &fn : fns) - { - fn->run(); - } - - arm_compute::CLScheduler::get().sync(); - - TensorAccess<OutputAccessor>(output.ref(), outputData, outputShape); - - return true; -} - -} // namespace acl -} // namespace kernel -} // namespace nnfw diff --git a/libs/kernel/acl/src/cl/Conv2D.test.cpp b/libs/kernel/acl/src/cl/Conv2D.test.cpp deleted file mode 100644 index e34cdeea5..000000000 --- a/libs/kernel/acl/src/cl/Conv2D.test.cpp +++ /dev/null @@ -1,202 +0,0 @@ -/* - 
* Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include <gtest/gtest.h> -#include <OperationsUtils.h> -#include <kernel/acl/nnfw_kernel_acl.h> -#include <kernel/acl/Conv2D.h> - -// TODO: fix include path in CMakeFiles -#include "../util.h" - -using namespace nnfw::kernel::acl; - -TEST(KernelACL_TC, convFloat32_3x3to1x1) -{ - float inputData[9]; - const nnfw::rt::Shape inputShape = { OperandType::FLOAT32, {1,3,3,1}, 1.0, 0 }; - float filterData[9]; - const nnfw::rt::Shape filterShape = { OperandType::FLOAT32, {1,3,3,1}, 1.0, 0 }; - float biasData[1] = { 1.0 }; - const nnfw::rt::Shape biasShape = { OperandType::FLOAT32, {1}, 1.0, 0 }; - int32_t padding_left = 0; - int32_t padding_right = 0; - int32_t padding_top = 0; - int32_t padding_bottom = 0; - int32_t stride_width = 1; - int32_t stride_height = 1; - int32_t activation = static_cast<int32_t>(FusedActivationFunc::RELU); - float outputData[1]; - const nnfw::rt::Shape outputShape = { OperandType::FLOAT32, {1,1,1,1}, 1.0, 0 }; - bool bret; - - util::initData(inputData, sizeof(inputData) / sizeof(inputData[0]), 1.0); - util::initData(filterData, sizeof(filterData) / sizeof(filterData[0]), 1.0); - util::initData(outputData, sizeof(outputData) / sizeof(outputData[0]), 0.0); - - bret = convFloat32(inputData, inputShape, - filterData, filterShape, - biasData, biasShape, - padding_left, padding_right, - padding_top, padding_bottom, - 
stride_width, stride_height, - activation, - outputData, outputShape); - EXPECT_EQ(bret, true); - - float expectData[] = { 10.0f }; - bret = util::compareData(outputData, expectData, outputShape); - EXPECT_EQ(bret, true); -} - -TEST(KernelACL_TC, convFloat32_3x3to3x3) -{ - float inputData[9]; - const nnfw::rt::Shape inputShape = { OperandType::FLOAT32, {1,3,3,1}, 1.0, 0 }; - float filterData[9]; - const nnfw::rt::Shape filterShape = { OperandType::FLOAT32, {1,3,3,1}, 1.0, 0 }; - float biasData[1] = { 1.0 }; - const nnfw::rt::Shape biasShape = { OperandType::FLOAT32, {1}, 1.0, 0 }; - int32_t padding_left = 1; - int32_t padding_right = 1; - int32_t padding_top = 1; - int32_t padding_bottom = 1; - int32_t stride_width = 1; - int32_t stride_height = 1; - int32_t activation = static_cast<int32_t>(FusedActivationFunc::RELU); - float outputData[9]; - const nnfw::rt::Shape outputShape = { OperandType::FLOAT32, {1,3,3,1}, 1.0, 0 }; - bool bret; - - util::initData(inputData, sizeof(inputData) / sizeof(inputData[0]), 1.0); - util::initData(filterData, sizeof(filterData) / sizeof(filterData[0]), 1.0); - util::initData(outputData, sizeof(outputData) / sizeof(outputData[0]), 0.0); - - bret = convFloat32(inputData, inputShape, - filterData, filterShape, - biasData, biasShape, - padding_left, padding_right, - padding_top, padding_bottom, - stride_width, stride_height, - activation, - outputData, outputShape); - EXPECT_EQ(bret, true); - - float expectData[] = { - 5.0f, 7.0f, 5.0f, - 7.0f, 10.0f, 7.0f, - 5.0f, 7.0f, 5.0f - }; - bret = util::compareData(outputData, expectData, outputShape); - EXPECT_EQ(bret, true); -} - -TEST(KernelACL_TC, convFloat32_3x3to3x3_RELU) -{ - float inputData[9]; - const nnfw::rt::Shape inputShape = { OperandType::FLOAT32, {1,3,3,1}, 1.0, 0 }; - float filterData[9]; - const nnfw::rt::Shape filterShape = { OperandType::FLOAT32, {1,3,3,1}, 1.0, 0 }; - float biasData[1] = { -5.0f }; - const nnfw::rt::Shape biasShape = { OperandType::FLOAT32, {1}, 1.0, 0 }; - 
int32_t padding_left = 1; - int32_t padding_right = 1; - int32_t padding_top = 1; - int32_t padding_bottom = 1; - int32_t stride_width = 1; - int32_t stride_height = 1; - int32_t activation = static_cast<int32_t>(FusedActivationFunc::RELU); - float outputData[9]; - const nnfw::rt::Shape outputShape = { OperandType::FLOAT32, {1,3,3,1}, 1.0, 0 }; - bool bret; - - util::initData(inputData, sizeof(inputData) / sizeof(inputData[0]), 1.0); - util::initData(filterData, sizeof(filterData) / sizeof(filterData[0]), 1.0); - util::initData(outputData, sizeof(outputData) / sizeof(outputData[0]), 0.0); - - bret = convFloat32(inputData, inputShape, - filterData, filterShape, - biasData, biasShape, - padding_left, padding_right, - padding_top, padding_bottom, - stride_width, stride_height, - activation, - outputData, outputShape); - EXPECT_EQ(bret, true); - - float expectData[] = - { - 0.0f, 1.0f, 0.0f, - 1.0f, 4.0f, 1.0f, - 0.0f, 1.0f, 0.0f - }; - - bret = util::compareData(outputData, expectData, outputShape); - EXPECT_EQ(bret, true); -} - -TEST(KernelACL_TC, convFloat32_3x5to3x3) -{ - float inputData[15] = { - 1,2,3,4,5, - 6,7,8,9,10, - 11,12,13,14,15 - }; - const nnfw::rt::Shape inputShape = { OperandType::FLOAT32, {1,3,5,1}, 1.0, 0 }; - float filterData[18] = { - 1,1,1, 1,1,1, 1,1,1, - 2,2,2, 2,2,2, 2,2,2 - }; - const nnfw::rt::Shape filterShape = { OperandType::FLOAT32, {2,3,3,1}, 1.0, 0 }; - float biasData[2] = { 1.0, 1.0 }; - const nnfw::rt::Shape biasShape = { OperandType::FLOAT32, {2}, 1.0, 0 }; - int32_t padding_left = 1; - int32_t padding_right = 1; - int32_t padding_top = 1; - int32_t padding_bottom = 1; - int32_t stride_width = 1; - int32_t stride_height = 1; - int32_t activation = static_cast<int32_t>(FusedActivationFunc::RELU); - float outputData[30]; - const nnfw::rt::Shape outputShape = { OperandType::FLOAT32, {1,3,5,2}, 1.0, 0 }; - bool bret; - - util::initData(outputData, sizeof(outputData) / sizeof(outputData[0]), 0.0); - - bret = convFloat32(inputData, 
inputShape, - filterData, filterShape, - biasData, biasShape, - padding_left, padding_right, - padding_top, padding_bottom, - stride_width, stride_height, - activation, - outputData, outputShape); - EXPECT_EQ(bret, true); - - float expectNCHW[] = { - 17.0f, 28.0f, 34.0f, 40.0f, 29.0f, - 40.0f, 64.0f, 73.0f, 82.0f, 58.0f, - 37.0f, 58.0f, 64.0f, 70.0f, 49.0f, - - 33.0f, 55.0f, 67.0f, 79.0f, 57.0f, - 79.0f, 127.0f, 145.0f, 163.0f, 115.0f, - 73.0f, 115.0f, 127.0f, 139.0f, 97.0f - }; - float expectData[30]; - util::NCHW2NHWC(expectNCHW, expectData, outputShape); - bret = util::compareData(outputData, expectData, outputShape); - EXPECT_EQ(bret, true); -} diff --git a/libs/kernel/acl/src/cl/DepthwiseConv2D.cpp b/libs/kernel/acl/src/cl/DepthwiseConv2D.cpp deleted file mode 100644 index 7593a99f4..000000000 --- a/libs/kernel/acl/src/cl/DepthwiseConv2D.cpp +++ /dev/null @@ -1,60 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include <OperationsUtils.h> -#include <arm_compute/core/TensorShape.h> -#include <arm_compute/core/TensorInfo.h> - -#include <cassert> - -// TODO: fix include path in CMakeFiles -#include "../IO_accessor.h" -#include "../shape.h" -#include "../CLUniqueTensor.h" -#include "../DepthwiseConv2D.h" - -namespace nnfw { -namespace kernel { -namespace acl { - -static void sync_scheduler() { - arm_compute::CLScheduler::get().sync(); -} - -bool depthwiseConvFloat32(const float* inputData, const nnfw::rt::Shape& inputShape, - const float* filterData, const nnfw::rt::Shape& filterShape, - const float* biasData, const nnfw::rt::Shape& biasShape, - int32_t padding_left, int32_t padding_right, - int32_t padding_top, int32_t padding_bottom, - int32_t stride_width, int32_t stride_height, - int32_t depth_multiplier, int32_t activation, - float* outputData, const nnfw::rt::Shape& outputShape) { - return common::depthwiseConvFloat32<CLUniqueTensor, arm_compute::CLDepthwiseConvolutionLayer, - arm_compute::CLActivationLayer>(inputData, inputShape, - filterData, filterShape, - biasData, biasShape, - padding_left, padding_right, - padding_top, padding_bottom, - stride_width, stride_height, - depth_multiplier, activation, - outputData, outputShape, - sync_scheduler); -} - -} // namespace acl -} // namespace kernel -} // namespace nnfw - diff --git a/libs/kernel/acl/src/cl/DepthwiseConv2D.test.cpp b/libs/kernel/acl/src/cl/DepthwiseConv2D.test.cpp deleted file mode 100644 index 695563383..000000000 --- a/libs/kernel/acl/src/cl/DepthwiseConv2D.test.cpp +++ /dev/null @@ -1,20 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#define ACL_CORE_FUNC_NAME depthwiseConvFloat32 -#define ACL_TEST(tc, t) TEST(tc, cl_##t) - -#include "../DepthwiseConv2D.test.h" diff --git a/libs/kernel/acl/src/cl/FullyConnected.cpp b/libs/kernel/acl/src/cl/FullyConnected.cpp deleted file mode 100644 index 7513355ab..000000000 --- a/libs/kernel/acl/src/cl/FullyConnected.cpp +++ /dev/null @@ -1,53 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include <OperationsUtils.h> -#include <arm_compute/core/TensorShape.h> -#include <arm_compute/core/TensorInfo.h> - -#include <cassert> - -// TODO: fix include path in CMakeFiles -#include "../IO_accessor.h" -#include "../shape.h" -#include "../CLUniqueTensor.h" -#include "../FullyConnected.h" - -namespace nnfw { -namespace kernel { -namespace acl { - -void sync_scheduler() { - arm_compute::CLScheduler::get().sync(); -} - -bool fullyConnectedFloat32(const float* inputData, const nnfw::rt::Shape& inputShape, - const float* weightsData, const nnfw::rt::Shape& weightsShape, - const float* biasData, const nnfw::rt::Shape& biasShape, - int32_t activation, - float* outputData, const nnfw::rt::Shape& outputShape) { - return common::fullyConnectedFloat32<CLUniqueTensor, arm_compute::CLFullyConnectedLayer, - arm_compute::CLActivationLayer>(inputData, inputShape, - weightsData, weightsShape, - biasData, biasShape, - activation, - outputData, outputShape, - sync_scheduler); -} - -} // namespace acl -} // namespace kernel -} // namespace nnfw diff --git a/libs/kernel/acl/src/cl/FullyConnected.test.cpp b/libs/kernel/acl/src/cl/FullyConnected.test.cpp deleted file mode 100644 index b1f5a095f..000000000 --- a/libs/kernel/acl/src/cl/FullyConnected.test.cpp +++ /dev/null @@ -1,20 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#define ACL_CORE_FUNC_NAME fullyConnectedFloat32 -#define ACL_TEST(tc, t) TEST(tc, cl_##t) - -#include "../FullyConnected.test.h" diff --git a/libs/kernel/acl/src/cl/Pooling.cpp b/libs/kernel/acl/src/cl/Pooling.cpp deleted file mode 100644 index e22eacccc..000000000 --- a/libs/kernel/acl/src/cl/Pooling.cpp +++ /dev/null @@ -1,130 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include <OperationsUtils.h> -#include <arm_compute/core/TensorShape.h> -#include <arm_compute/core/TensorInfo.h> -#include "../IO_accessor.h" -#include "../shape.h" -#include "../CLUniqueTensor.h" - -#include <cassert> - -namespace nnfw { -namespace kernel { -namespace acl { - -bool maxPoolFloat32(const float* inputData, const nnfw::rt::Shape& inputShape, - int32_t padding_left, int32_t padding_right, - int32_t padding_top, int32_t padding_bottom, - int32_t stride_width, int32_t stride_height, - int32_t filter_width, int32_t filter_height, - int32_t activation, - float* outputData, const nnfw::rt::Shape& outputShape) -{ - arm_compute::TensorShape input_shape = util::fromNNShape(inputShape); - arm_compute::TensorShape output_shape = util::fromNNShape(outputShape); - - std::vector<std::shared_ptr<arm_compute::IFunction>> fns; - - arm_compute::PadStrideInfo pad_info = arm_compute::PadStrideInfo(stride_width, stride_height, - padding_left, padding_right, - padding_top, padding_bottom, - 
arm_compute::DimensionRoundingType::FLOOR); - - arm_compute::PoolingLayerInfo maxpool_info = arm_compute::PoolingLayerInfo(arm_compute::PoolingType::MAX, - arm_compute::Size2D(filter_width,filter_height), - pad_info, false); - - CLUniqueTensor input(arm_compute::TensorInfo(input_shape, arm_compute::Format::F32)); - CLUniqueTensor output(arm_compute::TensorInfo(output_shape, arm_compute::Format::F32)); - - auto pool_f = std::make_shared<arm_compute::CLPoolingLayer>(); - pool_f->configure(input.ptr(), output.ptr(), maxpool_info); - - fns.emplace_back(pool_f); - - input.allocate(); - output.allocate(); - - util::insertFusedActivationLayer<CLUniqueTensor, arm_compute::CLActivationLayer>(output, activation, fns); - - TensorAccess<InputAccessor>(input.ref(), inputData, inputShape); - - for (const auto &fn : fns) - { - fn->run(); - } - - arm_compute::CLScheduler::get().sync(); - - TensorAccess<OutputAccessor>(output.ref(), outputData, outputShape); - - return true; -} - -bool averagePoolFloat32(const float* inputData, const nnfw::rt::Shape& inputShape, - int32_t padding_left, int32_t padding_right, - int32_t padding_top, int32_t padding_bottom, - int32_t stride_width, int32_t stride_height, - int32_t filter_width, int32_t filter_height, - int32_t activation, - float* outputData, const nnfw::rt::Shape& outputShape) -{ - arm_compute::TensorShape input_shape = util::fromNNShape(inputShape); - arm_compute::TensorShape output_shape = util::fromNNShape(outputShape); - - std::vector<std::shared_ptr<arm_compute::IFunction>> fns; - - arm_compute::PadStrideInfo pad_info = arm_compute::PadStrideInfo(stride_width, stride_height, - padding_left, padding_right, - padding_top, padding_bottom, - arm_compute::DimensionRoundingType::FLOOR); - - arm_compute::PoolingLayerInfo pool_info = arm_compute::PoolingLayerInfo(arm_compute::PoolingType::AVG, - arm_compute::Size2D(filter_width,filter_height), - pad_info, true); - - CLUniqueTensor input(arm_compute::TensorInfo(input_shape, 
arm_compute::Format::F32)); - CLUniqueTensor output(arm_compute::TensorInfo(output_shape, arm_compute::Format::F32)); - - auto pool_f = std::make_shared<arm_compute::CLPoolingLayer>(); - pool_f->configure(input.ptr(), output.ptr(), pool_info); - - fns.emplace_back(pool_f); - - input.allocate(); - output.allocate(); - - util::insertFusedActivationLayer<CLUniqueTensor, arm_compute::CLActivationLayer>(output, activation, fns); - - TensorAccess<InputAccessor>(input.ref(), inputData, inputShape); - - for (const auto &fn : fns) - { - fn->run(); - } - - arm_compute::CLScheduler::get().sync(); - - TensorAccess<OutputAccessor>(output.ref(), outputData, outputShape); - - return true; -} - -} // namespace acl -} // namespace kernel -} // namespace nnfw diff --git a/libs/kernel/acl/src/cl/Pooling.test.cpp b/libs/kernel/acl/src/cl/Pooling.test.cpp deleted file mode 100644 index 8112e7a45..000000000 --- a/libs/kernel/acl/src/cl/Pooling.test.cpp +++ /dev/null @@ -1,482 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include <gtest/gtest.h> -#include <OperationsUtils.h> -#include <kernel/acl/nnfw_kernel_acl.h> -#include <arm_compute/core/Types.h> -#include <kernel/acl/Pooling.h> - -#include "../util.h" - -using namespace nnfw::kernel::acl; - -TEST(KernelACL_TC, maxPoolFloat32_3x3to1x1) -{ - util::TensorWrapper input({1,3,3,1}); - const nnfw::rt::Shape inputShape = { OperandType::FLOAT32, {1,3,3,1}, 1.0, 0 }; - int32_t padding_left = 0; - int32_t padding_right = 0; - int32_t padding_top = 0; - int32_t padding_bottom = 0; - int32_t stride_width = 1; - int32_t stride_height = 1; - int32_t filter_width = 3; - int32_t filter_height = 3; - - float outputData[1]; - const nnfw::rt::Shape outputShape = { OperandType::FLOAT32, {1,1,1,1}, 1.0, 0 }; - bool bret; - - float value = 1.0f; - input.initValue([&value](uint32_t n, uint32_t c, uint32_t h, uint32_t w) { - return value++; - }); - - util::initData(outputData, sizeof(outputData) / sizeof(outputData[0]), 0.0); - - int32_t activation = ANEURALNETWORKS_FUSED_NONE; - - bret = maxPoolFloat32(input.ptr<float>(), inputShape, - padding_left, padding_right, - padding_top, padding_bottom, - stride_width, stride_height, - filter_width, filter_height, - activation, - outputData, outputShape); - EXPECT_EQ(bret, true); - - float expectData[] = { 9.0f }; - bret = util::compareData(outputData, expectData, outputShape); - EXPECT_EQ(bret, true); -} - -TEST(KernelACL_TC, maxPoolFloat32_3x3to1x1_RELU) -{ - util::TensorWrapper input({1,3,3,1}); - const nnfw::rt::Shape inputShape = { OperandType::FLOAT32, {1,3,3,1}, 1.0, 0 }; - int32_t padding_left = 0; - int32_t padding_right = 0; - int32_t padding_top = 0; - int32_t padding_bottom = 0; - int32_t stride_width = 1; - int32_t stride_height = 1; - int32_t filter_width = 3; - int32_t filter_height = 3; - - float outputData[1]; - const nnfw::rt::Shape outputShape = { OperandType::FLOAT32, {1,1,1,1}, 1.0, 0 }; - bool bret; - - float value = -1.0f; - input.initValue([&value](uint32_t n, uint32_t c, 
uint32_t h, uint32_t w) { - return value--; - }); - - util::initData(outputData, sizeof(outputData) / sizeof(outputData[0]), 0.0); - - int32_t activation = ANEURALNETWORKS_FUSED_RELU; - - bret = maxPoolFloat32(input.ptr<float>(), inputShape, - padding_left, padding_right, - padding_top, padding_bottom, - stride_width, stride_height, - filter_width, filter_height, - activation, - outputData, outputShape); - EXPECT_EQ(bret, true); - - float expectData[] = { 0.0f }; - bret = util::compareData(outputData, expectData, outputShape); - EXPECT_EQ(bret, true); -} - -TEST(KernelACL_TC, maxPoolFloat32_3x3to2x2) -{ - util::TensorWrapper input({1,3,3,1}); - const nnfw::rt::Shape inputShape = { OperandType::FLOAT32, {1,3,3,1}, 1.0, 0 }; - int32_t padding_left = 0; - int32_t padding_right = 1; - int32_t padding_top = 0; - int32_t padding_bottom = 1; - int32_t stride_width = 2; - int32_t stride_height = 2; - int32_t filter_width = 2; - int32_t filter_height = 2; - - float outputData[4]; - const nnfw::rt::Shape outputShape = { OperandType::FLOAT32, {1,2,2,1}, 1.0, 0 }; - bool bret; - - float value = 1.0f; - input.initValue([&value](uint32_t n, uint32_t c, uint32_t h, uint32_t w) { - return value++; - }); - - util::initData(outputData, sizeof(outputData) / sizeof(outputData[0]), 0.0); - - int32_t activation = ANEURALNETWORKS_FUSED_NONE; - - bret = maxPoolFloat32(input.ptr<float>(), inputShape, - padding_left, padding_right, - padding_top, padding_bottom, - stride_width, stride_height, - filter_width, filter_height, - activation, - outputData, outputShape); - EXPECT_EQ(bret, true); - - float expectData[] = { - 5.0f, 6.0f, - 8.0f, 9.0f - }; - bret = util::compareData(outputData, expectData, outputShape); - EXPECT_EQ(bret, true); -} - -TEST(KernelACL_TC, maxPoolFloat32_147x147to73x73) -{ - util::TensorWrapper input({1,147,147,64}); - util::TensorWrapper output({1,73,73,64}); - - int32_t padding_left = 0; - int32_t padding_right = 0; - int32_t padding_top = 0; - int32_t padding_bottom = 
0; - int32_t stride_width = 2; - int32_t stride_height = 2; - int32_t filter_width = 3; - int32_t filter_height = 3; - - input.initValue([](uint32_t n, uint32_t c, uint32_t h, uint32_t w) { - return 1.0f; - }); - - output.initValue([](uint32_t n, uint32_t c, uint32_t h, uint32_t w) { - return 0.f; - }); - - int32_t activation = ANEURALNETWORKS_FUSED_NONE; - - bool bret = maxPoolFloat32(input.ptr<float>(), input.shape(), - padding_left, padding_right, - padding_top, padding_bottom, - stride_width, stride_height, - filter_width, filter_height, - activation, - output.ptr<float>(), output.shape()); - EXPECT_EQ(bret, true); - - util::TensorWrapper expected({1,73,73,64}); - expected.initValue([](uint32_t n, uint32_t c, uint32_t h, uint32_t w) { - return 1.0f; - }); - - EXPECT_EQ(output, expected); -} - -TEST(KernelACL_TC, maxPoolFloat32_71x71to35x35) -{ - util::TensorWrapper input({1,71,71,192}); - util::TensorWrapper output({1,35,35,192}); - - int32_t padding_left = 0; - int32_t padding_right = 0; - int32_t padding_top = 0; - int32_t padding_bottom = 0; - int32_t stride_width = 2; - int32_t stride_height = 2; - int32_t filter_width = 3; - int32_t filter_height = 3; - - input.initValue([](uint32_t n, uint32_t c, uint32_t h, uint32_t w) { - return 1.0f; - }); - - output.initValue([](uint32_t n, uint32_t c, uint32_t h, uint32_t w) { - return 0.f; - }); - - int32_t activation = ANEURALNETWORKS_FUSED_NONE; - - bool bret = maxPoolFloat32(input.ptr<float>(), input.shape(), - padding_left, padding_right, - padding_top, padding_bottom, - stride_width, stride_height, - filter_width, filter_height, - activation, - output.ptr<float>(), output.shape()); - EXPECT_EQ(bret, true); - - util::TensorWrapper expected({1,35,35,192}); - expected.initValue([](uint32_t n, uint32_t c, uint32_t h, uint32_t w) { - return 1.0f; - }); - - EXPECT_EQ(output, expected); -} - -TEST(KernelACL_TC, averagePoolFloat32_3x3to1x1) -{ - util::TensorWrapper input({1,3,3,1}); - const nnfw::rt::Shape inputShape = 
{ OperandType::FLOAT32, {1,3,3,1}, 1.0, 0 }; - int32_t padding_left = 0; - int32_t padding_right = 0; - int32_t padding_top = 0; - int32_t padding_bottom = 0; - int32_t stride_width = 1; - int32_t stride_height = 1; - int32_t filter_width = 3; - int32_t filter_height = 3; - - float outputData[1]; - const nnfw::rt::Shape outputShape = { OperandType::FLOAT32, {1,1,1,1}, 1.0, 0 }; - bool bret; - - float value = 1.0f; - input.initValue([&value](uint32_t n, uint32_t c, uint32_t h, uint32_t w) { - return value++; - }); - - util::initData(outputData, sizeof(outputData) / sizeof(outputData[0]), 0.0); - - int32_t activation = ANEURALNETWORKS_FUSED_NONE; - - bret = averagePoolFloat32(input.ptr<float>(), inputShape, - padding_left, padding_right, - padding_top, padding_bottom, - stride_width, stride_height, - filter_width, filter_height, - activation, - outputData, outputShape); - EXPECT_EQ(bret, true); - - float expectData[] = { 5.0f }; - bret = util::compareData(outputData, expectData, outputShape); - EXPECT_EQ(bret, true); -} - -TEST(KernelACL_TC, averagePoolFloat32_3x3to1x1_RELU) -{ - util::TensorWrapper input({1,3,3,1}); - const nnfw::rt::Shape inputShape = { OperandType::FLOAT32, {1,3,3,1}, 1.0, 0 }; - int32_t padding_left = 0; - int32_t padding_right = 0; - int32_t padding_top = 0; - int32_t padding_bottom = 0; - int32_t stride_width = 1; - int32_t stride_height = 1; - int32_t filter_width = 3; - int32_t filter_height = 3; - - float outputData[1]; - const nnfw::rt::Shape outputShape = { OperandType::FLOAT32, {1,1,1,1}, 1.0, 0 }; - bool bret; - - float value = 3.0f; - input.initValue([&value](uint32_t n, uint32_t c, uint32_t h, uint32_t w) { - return value--; - }); - - util::initData(outputData, sizeof(outputData) / sizeof(outputData[0]), 0.0); - - int32_t activation = ANEURALNETWORKS_FUSED_RELU; - - bret = averagePoolFloat32(input.ptr<float>(), inputShape, - padding_left, padding_right, - padding_top, padding_bottom, - stride_width, stride_height, - filter_width, 
filter_height, - activation, - outputData, outputShape); - EXPECT_EQ(bret, true); - - float expectData[] = { 0.0f }; - bret = util::compareData(outputData, expectData, outputShape); - EXPECT_EQ(bret, true); -} - -TEST(KernelACL_TC, averagePoolFloat32_3x3to2x2) -{ - util::TensorWrapper input({1,3,3,1}); - const nnfw::rt::Shape inputShape = { OperandType::FLOAT32, {1,3,3,1}, 1.0, 0 }; - int32_t padding_left = 0; - int32_t padding_right = 0; - int32_t padding_top = 0; - int32_t padding_bottom = 0; - int32_t stride_width = 1; - int32_t stride_height = 1; - int32_t filter_width = 2; - int32_t filter_height = 2; - - float outputData[4]; - const nnfw::rt::Shape outputShape = { OperandType::FLOAT32, {1,2,2,1}, 1.0, 0 }; - bool bret; - - float value = 1.0f; - input.initValue([&value](uint32_t n, uint32_t c, uint32_t h, uint32_t w) { - return value++; - }); - - util::initData(outputData, sizeof(outputData) / sizeof(outputData[0]), 0.0); - - int32_t activation = ANEURALNETWORKS_FUSED_NONE; - - bret = averagePoolFloat32(input.ptr<float>(), inputShape, - padding_left, padding_right, - padding_top, padding_bottom, - stride_width, stride_height, - filter_width, filter_height, - activation, - outputData, outputShape); - EXPECT_EQ(bret, true); - - float expectData[] = { - 3.0f, 4.0f, - 6.0f, 7.0f - }; - bret = util::compareData(outputData, expectData, outputShape); - EXPECT_EQ(bret, true); -} - -TEST(KernelACL_TC, averagePoolFloat32_3x3to3x3) -{ - std::vector<uint32_t> dims = {1,3,3,1}; - util::TensorWrapper input(dims); - util::TensorWrapper output(dims); - - int32_t padding_left = 1; - int32_t padding_right = 1; - int32_t padding_top = 1; - int32_t padding_bottom = 1; - int32_t stride_width = 1; - int32_t stride_height = 1; - int32_t filter_width = 3; - int32_t filter_height = 3; - - int32_t value=1.0f; - input.initValue([&value](uint32_t n, uint32_t c, uint32_t h, uint32_t w) { - return value++; - }); - - output.initValue([](uint32_t n, uint32_t c, uint32_t h, uint32_t w) { - 
return 0.f; - }); - - int32_t activation = ANEURALNETWORKS_FUSED_NONE; - - bool bret = averagePoolFloat32(input.ptr<float>(), input.shape(), - padding_left, padding_right, - padding_top, padding_bottom, - stride_width, stride_height, - filter_width, filter_height, - activation, - output.ptr<float>(), output.shape()); - EXPECT_EQ(bret, true); - - util::TensorWrapper expected(dims); - float v=2.5f; - expected.initValue([&v](uint32_t n, uint32_t c, uint32_t h, uint32_t w) { - v = v + 0.5f; - return v; - }); - - EXPECT_EQ(output, expected); -} - -TEST(KernelACL_TC, averagePoolFloat32_35x35to35x35) -{ - int32_t N=35; - std::vector<uint32_t> dims = {1,35,35,768}; - util::TensorWrapper input(dims); - util::TensorWrapper output(dims); - - int32_t padding_left = 1; - int32_t padding_right = 1; - int32_t padding_top = 1; - int32_t padding_bottom = 1; - int32_t stride_width = 1; - int32_t stride_height = 1; - int32_t filter_width = 3; - int32_t filter_height = 3; - - input.initValue([](uint32_t n, uint32_t c, uint32_t h, uint32_t w) { - return 1.0f; - }); - - output.initValue([](uint32_t n, uint32_t c, uint32_t h, uint32_t w) { - return 0.f; - }); - - int32_t activation = ANEURALNETWORKS_FUSED_NONE; - - bool bret = averagePoolFloat32(input.ptr<float>(), input.shape(), - padding_left, padding_right, - padding_top, padding_bottom, - stride_width, stride_height, - filter_width, filter_height, - activation, - output.ptr<float>(), output.shape()); - EXPECT_EQ(bret, true); - - util::TensorWrapper expected(dims); - expected.initValue([&N](uint32_t n, uint32_t c, uint32_t h, uint32_t w) { - return 1.0f; - }); - - EXPECT_EQ(output, expected); -} - -TEST(KernelACL_TC, averagePoolFloat32_8x8to1x1) -{ - util::TensorWrapper input({1,8,8,2048}); - util::TensorWrapper output({1,1,1,2048}); - - int32_t padding_left = 0; - int32_t padding_right = 0; - int32_t padding_top = 0; - int32_t padding_bottom = 0; - int32_t stride_width = 2; - int32_t stride_height = 2; - int32_t filter_width = 8; - 
int32_t filter_height = 8; - - input.initValue([](uint32_t n, uint32_t c, uint32_t h, uint32_t w) { - return 1.0f; - }); - - output.initValue([](uint32_t n, uint32_t c, uint32_t h, uint32_t w) { - return 0.f; - }); - - int32_t activation = ANEURALNETWORKS_FUSED_NONE; - - bool bret = averagePoolFloat32(input.ptr<float>(), input.shape(), - padding_left, padding_right, - padding_top, padding_bottom, - stride_width, stride_height, - filter_width, filter_height, - activation, - output.ptr<float>(), output.shape()); - EXPECT_EQ(bret, true); - - util::TensorWrapper expected({1,1,1,2048}); - expected.initValue([](uint32_t n, uint32_t c, uint32_t h, uint32_t w) { - return 1.0f; - }); - - EXPECT_EQ(output, expected); -} diff --git a/libs/kernel/acl/src/cl/Reshape.cpp b/libs/kernel/acl/src/cl/Reshape.cpp deleted file mode 100644 index e420ab92b..000000000 --- a/libs/kernel/acl/src/cl/Reshape.cpp +++ /dev/null @@ -1,43 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include <OperationsUtils.h> -#include <arm_compute/core/TensorShape.h> -#include <arm_compute/core/TensorInfo.h> - -// TODO: fix include path in CMakeFiles -#include "../IO_accessor.h" -#include "../shape.h" -#include "../CLUniqueTensor.h" -#include "../Reshape.h" - -namespace nnfw { -namespace kernel { -namespace acl { - -static void sync_scheduler() { - arm_compute::CLScheduler::get().sync(); -} - -bool reshapeGeneric(const void* inputData, const nnfw::rt::Shape& inputShape, - void* outputData, const nnfw::rt::Shape& outputShape) { - return common::reshapeGeneric<CLUniqueTensor, arm_compute::CLReshapeLayer> - (inputData, inputShape, outputData, outputShape, sync_scheduler); -} - -} // namespace acl -} // namespace kernel -} // namespace nnfw diff --git a/libs/kernel/acl/src/cl/Reshape.test.cpp b/libs/kernel/acl/src/cl/Reshape.test.cpp deleted file mode 100644 index db23a6d3d..000000000 --- a/libs/kernel/acl/src/cl/Reshape.test.cpp +++ /dev/null @@ -1,20 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#define ACL_CORE_FUNC_NAME reshapeGeneric -#define ACL_TEST(tc, t) TEST(tc, cl_##t) - -#include "../Reshape.test.h" diff --git a/libs/kernel/acl/src/cl/Softmax.cpp b/libs/kernel/acl/src/cl/Softmax.cpp deleted file mode 100644 index a628f05fe..000000000 --- a/libs/kernel/acl/src/cl/Softmax.cpp +++ /dev/null @@ -1,78 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include <OperationsUtils.h> -#include <NeuralNetworks.h> - -#include <arm_compute/core/TensorShape.h> -#include <arm_compute/core/TensorInfo.h> -#include "../IO_accessor.h" -#include "../shape.h" -#include "../CLUniqueTensor.h" -#include "../util.h" - -namespace nnfw { -namespace kernel { -namespace acl { - -bool softmaxFloat32(const float* inputData, const nnfw::rt::Shape& inputShape, - const float beta, - float* outputData, const nnfw::rt::Shape& outputShape) -{ - arm_compute::TensorShape input_shape = util::fromNNShape(inputShape); - arm_compute::TensorShape output_shape = util::fromNNShape(outputShape); - - CLUniqueTensor input(arm_compute::TensorInfo(input_shape, arm_compute::Format::F32)); - CLUniqueTensor output(arm_compute::TensorInfo(output_shape, arm_compute::Format::F32)); - - auto softmax_f = std::make_shared<arm_compute::CLSoftmaxLayer>(); - softmax_f->configure(input.ptr(), output.ptr(), beta); - - input.allocate(); - output.allocate(); - - if (inputShape.dimensions.size() == 4) - { - 
TensorAccess<InputAccessor>(input.ref(), inputData, inputShape); - - softmax_f->run(); - - arm_compute::CLScheduler::get().sync(); - - TensorAccess<OutputAccessor>(output.ref(), outputData, outputShape); - } - else if (inputShape.dimensions.size() == 2) - { - TensorAccess<MatrixInputAccessor>(input.ref(), inputData, inputShape); - - softmax_f->run(); - - arm_compute::CLScheduler::get().sync(); - - TensorAccess<MatrixOutputAccessor>(output.ref(), outputData, outputShape); - } - else - { - assert("undefined dimension of input" && 0); - return false; - } - - return true; -} - -} // namespace acl -} // namespace kernel -} // namespace nnfw diff --git a/libs/kernel/acl/src/cl/Softmax.test.cpp b/libs/kernel/acl/src/cl/Softmax.test.cpp deleted file mode 100644 index 8ee8b41e2..000000000 --- a/libs/kernel/acl/src/cl/Softmax.test.cpp +++ /dev/null @@ -1,105 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include <gtest/gtest.h> -#include <OperationsUtils.h> -#include <kernel/acl/nnfw_kernel_acl.h> -#include <arm_compute/core/Types.h> -#include <kernel/acl/Softmax.h> - -#include "../util.h" - -using namespace nnfw::kernel::acl; - -TEST(KernelACL_TC, softmaxFloat32_1xn) -{ - float inputData[4]; - const nnfw::rt::Shape inputShape = { OperandType::FLOAT32, {1,4}, 1.0, 0 }; - float outputData[4]; - const nnfw::rt::Shape outputShape = { OperandType::FLOAT32, {1,4}, 1.0, 0 }; - const float beta = 1.0f; - bool bret; - - util::initData(inputData, sizeof(inputData) / sizeof(inputData[0]), 1.0); - util::initData(outputData, sizeof(outputData) / sizeof(outputData[0]), 0.0); - - bret = softmaxFloat32(inputData, inputShape, beta, outputData, outputShape); - EXPECT_EQ(bret, true); - - float expectData[] = { 0.25f, 0.25f, 0.25f, 0.25f }; - bret = util::compareData(outputData, expectData, outputShape); - EXPECT_EQ(bret, true); -} - -TEST(KernelACL_TC, softmaxFloat32_4d) -{ - float inputData[4]; - const nnfw::rt::Shape inputShape = { OperandType::FLOAT32, {1,1,4,1}, 1.0, 0 }; - float outputData[4]; - const nnfw::rt::Shape outputShape = { OperandType::FLOAT32, {1,1,4,1}, 1.0, 0 }; - const float beta = 1.0f; - bool bret; - - util::initData(inputData, sizeof(inputData) / sizeof(inputData[0]), 1.0); - util::initData(outputData, sizeof(outputData) / sizeof(outputData[0]), 0.0); - - bret = softmaxFloat32(inputData, inputShape, beta, outputData, outputShape); - EXPECT_EQ(bret, true); - - float expectData[] = { 0.25f, 0.25f, 0.25f, 0.25f }; - bret = util::compareData(outputData, expectData, outputShape); - EXPECT_EQ(bret, true); -} - -TEST(KernelACL_TC, softmaxFloat32_1xn_seq) -{ - float inputData[4]; - const nnfw::rt::Shape inputShape = { OperandType::FLOAT32, {1,4}, 1.0, 0 }; - float outputData[4]; - const nnfw::rt::Shape outputShape = { OperandType::FLOAT32, {1,4}, 1.0, 0 }; - const float beta = 1.0f; - bool bret; - - util::initData_Increasing(inputData, sizeof(inputData) / 
sizeof(inputData[0]), 1.0); - util::initData(outputData, sizeof(outputData) / sizeof(outputData[0]), 0.0); - - bret = softmaxFloat32(inputData, inputShape, beta, outputData, outputShape); - EXPECT_EQ(bret, true); - - float expectData[] = {0.032058603280085, 0.0871443187420326, 0.23688281808991, 0.643914259887972}; - bret = util::compareData(outputData, expectData, outputShape); - EXPECT_EQ(bret, true); -} - -TEST(KernelACL_TC, softmaxFloat32_4d_seq) -{ - float inputData[4]; - const nnfw::rt::Shape inputShape = { OperandType::FLOAT32, {1,1,4,1}, 1.0, 0 }; - float outputData[4]; - const nnfw::rt::Shape outputShape = { OperandType::FLOAT32, {1,1,4,1}, 1.0, 0 }; - const float beta = 1.0f; - bool bret; - - util::initData_Increasing(inputData, sizeof(inputData) / sizeof(inputData[0]), 1.0); - util::initData(outputData, sizeof(outputData) / sizeof(outputData[0]), 0.0); - - bret = softmaxFloat32(inputData, inputShape, beta, outputData, outputShape); - EXPECT_EQ(bret, true); - - float expectData[] = {0.032058603280085, 0.0871443187420326, 0.23688281808991, 0.643914259887972}; - bret = util::compareData(outputData, expectData, outputShape); - EXPECT_EQ(bret, true); -} diff --git a/libs/kernel/acl/src/gtest_env.cpp b/libs/kernel/acl/src/gtest_env.cpp deleted file mode 100644 index f6fc52f7a..000000000 --- a/libs/kernel/acl/src/gtest_env.cpp +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include <gtest/gtest.h> -#include <kernel/acl/nnfw_kernel_acl.h> - -class TestEnvironment : public ::testing::Environment -{ -public: - virtual ~TestEnvironment() = default; - - virtual void SetUp() - { - nnfw::kernel::acl::Initialize(); - } - - virtual void TearDown() - { - // DO NOTHING - } -}; - -static ::testing::Environment* const testingenv = - ::testing::AddGlobalTestEnvironment(new TestEnvironment); diff --git a/libs/kernel/acl/src/neon/Concatenation.cpp b/libs/kernel/acl/src/neon/Concatenation.cpp deleted file mode 100644 index 8738a9d12..000000000 --- a/libs/kernel/acl/src/neon/Concatenation.cpp +++ /dev/null @@ -1,105 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include <OperationsUtils.h> -#include <arm_compute/core/TensorShape.h> -#include <arm_compute/core/TensorInfo.h> - -#include <cassert> - -// TODO: fix include path in CMakeFiles -#include "../IO_accessor.h" -#include "../shape.h" - -namespace nnfw { -namespace kernel { -namespace acl { -namespace neon { - -bool concatenationFloat32(const std::vector<const float*>& inputDataPtrs, - const std::vector<nnfw::rt::Shape>& inputShapes, int32_t axis, - float* outputData, const nnfw::rt::Shape& outputShape) -{ - if (axis != 3) - { - assert("Only support axis=3 for ACL" && 0); - return false; - } - assert(inputDataPtrs.size() == inputShapes.size()); - - std::vector<arm_compute::Tensor*> inputPtrs; - std::vector<arm_compute::ITensor*> inputIptrs; - arm_compute::Tensor output; - - // init Tensors - std::vector<nnfw::rt::Shape>::const_iterator it_inputShape = inputShapes.begin(); - for (auto inputData : inputDataPtrs) - { - const nnfw::rt::Shape& inputShape = *it_inputShape; - arm_compute::TensorShape input_shape = util::fromNNShape(inputShape); - arm_compute::Tensor* inputPtr = new arm_compute::Tensor(); - - inputPtr->allocator()->init(arm_compute::TensorInfo(input_shape, arm_compute::Format::F32)); - inputPtrs.push_back(inputPtr); - inputIptrs.push_back(inputPtr); - - it_inputShape++; - } - arm_compute::TensorShape output_shape = util::fromNNShape(outputShape); - output.allocator()->init(arm_compute::TensorInfo(output_shape, arm_compute::Format::F32)); - - // prepare ACL Concatenate and configure tensors - auto concat = std::make_shared<arm_compute::NEDepthConcatenateLayer>(); - concat->configure(inputIptrs, &output); - - // allocate Tensors - it_inputShape = inputShapes.begin(); - std::vector<const float*>::const_iterator it_inputData = inputDataPtrs.begin(); - for (auto inputPtr : inputPtrs) - { - inputPtr->allocator()->allocate(); - - const float* inputData = *it_inputData; - const nnfw::rt::Shape& inputShape = *it_inputShape; - - 
TensorAccess<InputAccessor>(*inputPtr, inputData, inputShape); - - it_inputShape++; - it_inputData++; - } - output.allocator()->allocate(); - - // run - concat->run(); - - // get output - TensorAccess<OutputAccessor>(output, outputData, outputShape); - - // cleanup - for (auto inputPtr : inputPtrs) - { - inputPtr->allocator()->free(); - delete inputPtr; - } - output.allocator()->free(); - - return true; -} - -} // namespace neon -} // namespace acl -} // namespace kernel -} // namespace nnfw diff --git a/libs/kernel/acl/src/neon/Concatenation.test.cpp b/libs/kernel/acl/src/neon/Concatenation.test.cpp deleted file mode 100644 index 03b05bd24..000000000 --- a/libs/kernel/acl/src/neon/Concatenation.test.cpp +++ /dev/null @@ -1,62 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include <gtest/gtest.h> -#include <OperationsUtils.h> -#include <kernel/acl/nnfw_kernel_acl.h> -#include <kernel/acl/Concatenation.h> - -// TODO: fix include path in CMakeFiles -#include "../util.h" - -using namespace nnfw::kernel::acl; - -TEST(KernelACL_TC, neon_concatFloat32_1) -{ - float inputData_1[6] = { - 1, 2, 3, 4, 5, 6 // [ [ [1],[2],[3] ], [ [4],[5],[6] ] ] - }; - float inputData_2[6] = { - 7, 8, 9, 10, 11, 12 // [ [ [7],[8],[9] ], [ [10],[11],[12] ] ] - }; - const nnfw::rt::Shape inputShape_1 = { OperandType::FLOAT32, {1,2,3,1}, 1.0, 0 }; - const nnfw::rt::Shape inputShape_2 = { OperandType::FLOAT32, {1,2,3,1}, 1.0, 0 }; - std::vector<const float*> inputDataPtrs; - std::vector<nnfw::rt::Shape> inputShapes; - float outputData[12]; - const nnfw::rt::Shape outputShape = { OperandType::FLOAT32, {1,2,3,2}, 1.0, 0 }; - bool bret; - - inputDataPtrs.push_back(inputData_1); - inputDataPtrs.push_back(inputData_2); - inputShapes.push_back(inputShape_1); - inputShapes.push_back(inputShape_2); - - util::initData(outputData, sizeof(outputData) / sizeof(outputData[0]), 0.0); - - bret = neon::concatenationFloat32(inputDataPtrs, inputShapes, 3, - outputData, outputShape); - EXPECT_EQ(bret, true); - - float expectNCHW[] = { - 1, 2, 3, 4, 5, 6, - 7, 8, 9, 10, 11, 12 - }; - float expectData[12]; // [ [ [1,7],[2,8],[3,9] ], [ [4,10],[5,11],[6,12] ] ] - util::NCHW2NHWC(expectNCHW, expectData, outputShape); - bret = util::compareData(outputData, expectData, outputShape); - EXPECT_EQ(bret, true); -} diff --git a/libs/kernel/acl/src/neon/Conv2D.cpp b/libs/kernel/acl/src/neon/Conv2D.cpp deleted file mode 100644 index 679ecfced..000000000 --- a/libs/kernel/acl/src/neon/Conv2D.cpp +++ /dev/null @@ -1,111 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include <OperationsUtils.h> -#include <NeuralNetworks.h> - -#include <arm_compute/core/TensorShape.h> -#include <arm_compute/core/TensorInfo.h> - -#include <util/environment.h> - -#include "../IO_accessor.h" -#include "../util.h" -#include "../shape.h" -#include "../NEUniqueTensor.h" -#include "../support.h" - -#include "util/feature/TextFormatter.h" - -#include "support/nnapi/feature/Reader.h" - -namespace nnfw { -namespace kernel { -namespace acl { -namespace neon { - -static int verbose = 0; - -bool convFloat32(const float* inputData, const nnfw::rt::Shape& inputShape, - const float* filterData, const nnfw::rt::Shape& filterShape, - const float* biasData, const nnfw::rt::Shape& biasShape, - int32_t padding_left, int32_t padding_right, - int32_t padding_top, int32_t padding_bottom, - int32_t stride_width, int32_t stride_height, - int32_t activation, - float* outputData, const nnfw::rt::Shape& outputShape) -{ - arm_compute::TensorShape input_shape = util::fromNNShape(inputShape); - arm_compute::TensorShape filter_shape = util::fromNNShape(filterShape); - arm_compute::TensorShape bias_shape = util::fromVectorNNShape(biasShape); - arm_compute::TensorShape output_shape = util::fromNNShape(outputShape); - arm_compute::PadStrideInfo conv_info = arm_compute::PadStrideInfo(stride_width, stride_height, - padding_left, padding_right, - padding_top, padding_bottom, - arm_compute::DimensionRoundingType::FLOOR); - - NEUniqueTensor input(arm_compute::TensorInfo(input_shape, arm_compute::Format::F32)); - NEUniqueTensor 
output(arm_compute::TensorInfo(output_shape, arm_compute::Format::F32)); - NEUniqueTensor bias(arm_compute::TensorInfo(bias_shape, arm_compute::Format::F32)); - NEUniqueTensor filter(arm_compute::TensorInfo(filter_shape, arm_compute::Format::F32)); - - std::vector<std::shared_ptr<arm_compute::IFunction>> fns; - - auto conv_f = std::make_shared<arm_compute::NEConvolutionLayer>(); - - conv_f->configure(input.ptr(), filter.ptr(), bias.ptr(), output.ptr(), conv_info); - - fns.emplace_back(conv_f); - - util::insertFusedActivationLayer<NEUniqueTensor, arm_compute::NEActivationLayer>(output, activation, fns); - - input.allocate(); - output.allocate(); - bias.allocate(); - filter.allocate(); - - TensorAccess<InputAccessor>(input.ref(), inputData, inputShape); - TensorAccess<BiasAccessor>(bias.ref(), biasData, biasShape); - TensorAccess<WeightAccessor>(filter.ref(), filterData, filterShape); - - nnfw::util::env::IntAccessor("CONV2D_VERBOSE").access(verbose); - if (verbose) - { - auto ifm_shape = nnfw::support::nnapi::feature::asFeatureShape(inputShape); - nnfw::support::nnapi::feature::Reader<float> nnapi_ifm_reader{ifm_shape, inputData}; - nnfw::support::acl::feature::Reader<float> acl_ifm_reader{ input.ptr() }; - - std::cout << "NNAPI IFM:" << std::endl; - std::cout << nnfw::util::feature::TextFormatter<float>{ifm_shape, nnapi_ifm_reader} << std::endl; - - std::cout << "ARM Compute IFM:" << std::endl; - std::cout << nnfw::util::feature::TextFormatter<float>{ifm_shape, acl_ifm_reader} << std::endl; - } - - for (const auto &fn : fns) - { - fn->run(); - } - - TensorAccess<OutputAccessor>(output.ref(), outputData, outputShape); - - return true; -} - -} // namespace neon -} // namespace acl -} // namespace kernel -} // namespace nnfw diff --git a/libs/kernel/acl/src/neon/Conv2D.test.cpp b/libs/kernel/acl/src/neon/Conv2D.test.cpp deleted file mode 100644 index 6a3de1c43..000000000 --- a/libs/kernel/acl/src/neon/Conv2D.test.cpp +++ /dev/null @@ -1,202 +0,0 @@ -/* - * Copyright 
(c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include <gtest/gtest.h> -#include <OperationsUtils.h> -#include <kernel/acl/nnfw_kernel_acl.h> -#include <kernel/acl/Conv2D.h> - -// TODO: fix include path in CMakeFiles -#include "../util.h" - -using namespace nnfw::kernel::acl; - -TEST(KernelACL_TC, neon_convFloat32_3x3to1x1) -{ - float inputData[9]; - const nnfw::rt::Shape inputShape = { OperandType::FLOAT32, {1,3,3,1}, 1.0, 0 }; - float filterData[9]; - const nnfw::rt::Shape filterShape = { OperandType::FLOAT32, {1,3,3,1}, 1.0, 0 }; - float biasData[1] = { 1.0 }; - const nnfw::rt::Shape biasShape = { OperandType::FLOAT32, {1}, 1.0, 0 }; - int32_t padding_left = 0; - int32_t padding_right = 0; - int32_t padding_top = 0; - int32_t padding_bottom = 0; - int32_t stride_width = 1; - int32_t stride_height = 1; - int32_t activation = static_cast<int32_t>(FusedActivationFunc::RELU); - float outputData[1]; - const nnfw::rt::Shape outputShape = { OperandType::FLOAT32, {1,1,1,1}, 1.0, 0 }; - bool bret; - - util::initData(inputData, sizeof(inputData) / sizeof(inputData[0]), 1.0); - util::initData(filterData, sizeof(filterData) / sizeof(filterData[0]), 1.0); - util::initData(outputData, sizeof(outputData) / sizeof(outputData[0]), 0.0); - - bret = neon::convFloat32(inputData, inputShape, - filterData, filterShape, - biasData, biasShape, - padding_left, padding_right, - padding_top, padding_bottom, - 
stride_width, stride_height, - activation, - outputData, outputShape); - EXPECT_EQ(bret, true); - - float expectData[] = { 10.0f }; - bret = util::compareData(outputData, expectData, outputShape); - EXPECT_EQ(bret, true); -} - -TEST(KernelACL_TC, neon_convFloat32_3x3to3x3) -{ - float inputData[9]; - const nnfw::rt::Shape inputShape = { OperandType::FLOAT32, {1,3,3,1}, 1.0, 0 }; - float filterData[9]; - const nnfw::rt::Shape filterShape = { OperandType::FLOAT32, {1,3,3,1}, 1.0, 0 }; - float biasData[1] = { 1.0 }; - const nnfw::rt::Shape biasShape = { OperandType::FLOAT32, {1}, 1.0, 0 }; - int32_t padding_left = 1; - int32_t padding_right = 1; - int32_t padding_top = 1; - int32_t padding_bottom = 1; - int32_t stride_width = 1; - int32_t stride_height = 1; - int32_t activation = static_cast<int32_t>(FusedActivationFunc::RELU); - float outputData[9]; - const nnfw::rt::Shape outputShape = { OperandType::FLOAT32, {1,3,3,1}, 1.0, 0 }; - bool bret; - - util::initData(inputData, sizeof(inputData) / sizeof(inputData[0]), 1.0); - util::initData(filterData, sizeof(filterData) / sizeof(filterData[0]), 1.0); - util::initData(outputData, sizeof(outputData) / sizeof(outputData[0]), 0.0); - - bret = neon::convFloat32(inputData, inputShape, - filterData, filterShape, - biasData, biasShape, - padding_left, padding_right, - padding_top, padding_bottom, - stride_width, stride_height, - activation, - outputData, outputShape); - EXPECT_EQ(bret, true); - - float expectData[] = { - 5.0f, 7.0f, 5.0f, - 7.0f, 10.0f, 7.0f, - 5.0f, 7.0f, 5.0f - }; - bret = util::compareData(outputData, expectData, outputShape); - EXPECT_EQ(bret, true); -} - -TEST(KernelACL_TC, neon_convFloat32_3x3to3x3_RELU) -{ - float inputData[9]; - const nnfw::rt::Shape inputShape = { OperandType::FLOAT32, {1,3,3,1}, 1.0, 0 }; - float filterData[9]; - const nnfw::rt::Shape filterShape = { OperandType::FLOAT32, {1,3,3,1}, 1.0, 0 }; - float biasData[1] = { -5.0f }; - const nnfw::rt::Shape biasShape = { OperandType::FLOAT32, 
{1}, 1.0, 0 }; - int32_t padding_left = 1; - int32_t padding_right = 1; - int32_t padding_top = 1; - int32_t padding_bottom = 1; - int32_t stride_width = 1; - int32_t stride_height = 1; - int32_t activation = static_cast<int32_t>(FusedActivationFunc::RELU); - float outputData[9]; - const nnfw::rt::Shape outputShape = { OperandType::FLOAT32, {1,3,3,1}, 1.0, 0 }; - bool bret; - - util::initData(inputData, sizeof(inputData) / sizeof(inputData[0]), 1.0); - util::initData(filterData, sizeof(filterData) / sizeof(filterData[0]), 1.0); - util::initData(outputData, sizeof(outputData) / sizeof(outputData[0]), 0.0); - - bret = neon::convFloat32(inputData, inputShape, - filterData, filterShape, - biasData, biasShape, - padding_left, padding_right, - padding_top, padding_bottom, - stride_width, stride_height, - activation, - outputData, outputShape); - EXPECT_EQ(bret, true); - - float expectData[] = - { - 0.0f, 1.0f, 0.0f, - 1.0f, 4.0f, 1.0f, - 0.0f, 1.0f, 0.0f - }; - - bret = util::compareData(outputData, expectData, outputShape); - EXPECT_EQ(bret, true); -} - -TEST(KernelACL_TC, neon_convFloat32_3x5to3x3) -{ - float inputData[15] = { - 1,2,3,4,5, - 6,7,8,9,10, - 11,12,13,14,15 - }; - const nnfw::rt::Shape inputShape = { OperandType::FLOAT32, {1,3,5,1}, 1.0, 0 }; - float filterData[18] = { - 1,1,1, 1,1,1, 1,1,1, - 2,2,2, 2,2,2, 2,2,2 - }; - const nnfw::rt::Shape filterShape = { OperandType::FLOAT32, {2,3,3,1}, 1.0, 0 }; - float biasData[2] = { 1.0, 1.0 }; - const nnfw::rt::Shape biasShape = { OperandType::FLOAT32, {2}, 1.0, 0 }; - int32_t padding_left = 1; - int32_t padding_right = 1; - int32_t padding_top = 1; - int32_t padding_bottom = 1; - int32_t stride_width = 1; - int32_t stride_height = 1; - int32_t activation = static_cast<int32_t>(FusedActivationFunc::RELU); - float outputData[30]; - const nnfw::rt::Shape outputShape = { OperandType::FLOAT32, {1,3,5,2}, 1.0, 0 }; - bool bret; - - util::initData(outputData, sizeof(outputData) / sizeof(outputData[0]), 0.0); - - bret = 
neon::convFloat32(inputData, inputShape, - filterData, filterShape, - biasData, biasShape, - padding_left, padding_right, - padding_top, padding_bottom, - stride_width, stride_height, - activation, - outputData, outputShape); - EXPECT_EQ(bret, true); - - float expectNCHW[] = { - 17.0f, 28.0f, 34.0f, 40.0f, 29.0f, - 40.0f, 64.0f, 73.0f, 82.0f, 58.0f, - 37.0f, 58.0f, 64.0f, 70.0f, 49.0f, - - 33.0f, 55.0f, 67.0f, 79.0f, 57.0f, - 79.0f, 127.0f, 145.0f, 163.0f, 115.0f, - 73.0f, 115.0f, 127.0f, 139.0f, 97.0f - }; - float expectData[30]; - util::NCHW2NHWC(expectNCHW, expectData, outputShape); - bret = util::compareData(outputData, expectData, outputShape); - EXPECT_EQ(bret, true); -} diff --git a/libs/kernel/acl/src/neon/DepthwiseConv2D.cpp b/libs/kernel/acl/src/neon/DepthwiseConv2D.cpp deleted file mode 100644 index bcf56c667..000000000 --- a/libs/kernel/acl/src/neon/DepthwiseConv2D.cpp +++ /dev/null @@ -1,61 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include <OperationsUtils.h> -#include <arm_compute/core/TensorShape.h> -#include <arm_compute/core/TensorInfo.h> -#include <arm_compute/runtime/NEON/NEScheduler.h> - -#include <cassert> - -// TODO: fix include path in CMakeFiles -#include "../IO_accessor.h" -#include "../shape.h" -#include "../NEUniqueTensor.h" -#include "../DepthwiseConv2D.h" - -namespace nnfw { -namespace kernel { -namespace acl { - -namespace neon { -static void sync_scheduler() { -} - -bool depthwiseConvFloat32(const float* inputData, const nnfw::rt::Shape& inputShape, - const float* filterData, const nnfw::rt::Shape& filterShape, - const float* biasData, const nnfw::rt::Shape& biasShape, - int32_t padding_left, int32_t padding_right, - int32_t padding_top, int32_t padding_bottom, - int32_t stride_width, int32_t stride_height, - int32_t depth_multiplier, int32_t activation, - float* outputData, const nnfw::rt::Shape& outputShape) { - return common::depthwiseConvFloat32<NEUniqueTensor, arm_compute::NEDepthwiseConvolutionLayer, - arm_compute::NEActivationLayer>(inputData, inputShape, - filterData, filterShape, - biasData, biasShape, - padding_left, padding_right, - padding_top, padding_bottom, - stride_width, stride_height, - depth_multiplier, activation, - outputData, outputShape, - sync_scheduler); -} - -} // namespace neon -} // namespace acl -} // namespace kernel -} // namespace nnfw diff --git a/libs/kernel/acl/src/neon/FullyConnected.cpp b/libs/kernel/acl/src/neon/FullyConnected.cpp deleted file mode 100644 index 86229cbf2..000000000 --- a/libs/kernel/acl/src/neon/FullyConnected.cpp +++ /dev/null @@ -1,58 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include <OperationsUtils.h> -#include <arm_compute/core/TensorShape.h> -#include <arm_compute/core/TensorInfo.h> -#include <arm_compute/runtime/NEON/NEScheduler.h> - -#include <cassert> - -// TODO: fix include path in CMakeFiles -#include "../IO_accessor.h" -#include "../shape.h" -#include "../NEUniqueTensor.h" -#include "../FullyConnected.h" - -namespace nnfw { -namespace kernel { -namespace acl { - -namespace neon { - -void sync_scheduler() { -} - -bool fullyConnectedFloat32(const float* inputData, const nnfw::rt::Shape& inputShape, - const float* weightsData, const nnfw::rt::Shape& weightsShape, - const float* biasData, const nnfw::rt::Shape& biasShape, - int32_t activation, - float* outputData, const nnfw::rt::Shape& outputShape) { - - return common::fullyConnectedFloat32<NEUniqueTensor, arm_compute::NEFullyConnectedLayer, - arm_compute::NEActivationLayer>(inputData, inputShape, - weightsData, weightsShape, - biasData, biasShape, - activation, - outputData, outputShape, - sync_scheduler); -} - -} // namespace neon -} // namespace acl -} // namespace kernel -} // namespace nnfw - diff --git a/libs/kernel/acl/src/neon/FullyConnected.test.cpp b/libs/kernel/acl/src/neon/FullyConnected.test.cpp deleted file mode 100644 index d4c95e4cb..000000000 --- a/libs/kernel/acl/src/neon/FullyConnected.test.cpp +++ /dev/null @@ -1,21 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. 
All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#define ACL_CORE_FUNC_NAME neon::fullyConnectedFloat32 -#define ACL_TEST(tc, t) TEST(tc, neon_##t) - -#include "../FullyConnected.test.h" - diff --git a/libs/kernel/acl/src/neon/Pooling.cpp b/libs/kernel/acl/src/neon/Pooling.cpp deleted file mode 100644 index 5c58ae0b5..000000000 --- a/libs/kernel/acl/src/neon/Pooling.cpp +++ /dev/null @@ -1,128 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include <OperationsUtils.h> -#include <arm_compute/core/TensorShape.h> -#include <arm_compute/core/TensorInfo.h> -#include "../IO_accessor.h" -#include "../shape.h" -#include "../NEUniqueTensor.h" - -#include <cassert> - -namespace nnfw { -namespace kernel { -namespace acl { -namespace neon { - -bool maxPoolFloat32(const float* inputData, const nnfw::rt::Shape& inputShape, - int32_t padding_left, int32_t padding_right, - int32_t padding_top, int32_t padding_bottom, - int32_t stride_width, int32_t stride_height, - int32_t filter_width, int32_t filter_height, - int32_t activation, - float* outputData, const nnfw::rt::Shape& outputShape) -{ - arm_compute::TensorShape input_shape = util::fromNNShape(inputShape); - arm_compute::TensorShape output_shape = util::fromNNShape(outputShape); - - std::vector<std::shared_ptr<arm_compute::IFunction>> fns; - - arm_compute::PadStrideInfo pad_info = arm_compute::PadStrideInfo(stride_width, stride_height, - padding_left, padding_right, - padding_top, padding_bottom, - arm_compute::DimensionRoundingType::FLOOR); - - arm_compute::PoolingLayerInfo maxpool_info = arm_compute::PoolingLayerInfo(arm_compute::PoolingType::MAX, - arm_compute::Size2D(filter_width,filter_height), - pad_info, false); - - NEUniqueTensor input(arm_compute::TensorInfo(input_shape, arm_compute::Format::F32)); - NEUniqueTensor output(arm_compute::TensorInfo(output_shape, arm_compute::Format::F32)); - - auto pool_f = std::make_shared<arm_compute::NEPoolingLayer>(); - pool_f->configure(input.ptr(), output.ptr(), maxpool_info); - - fns.emplace_back(pool_f); - - util::insertFusedActivationLayer<NEUniqueTensor, arm_compute::NEActivationLayer>(output, activation, fns); - - input.allocate(); - output.allocate(); - - TensorAccess<InputAccessor>(input.ref(), inputData, inputShape); - - for (const auto &fn : fns) - { - fn->run(); - } - - TensorAccess<OutputAccessor>(output.ref(), outputData, outputShape); - - return true; -} - -bool averagePoolFloat32(const float* 
inputData, const nnfw::rt::Shape& inputShape, - int32_t padding_left, int32_t padding_right, - int32_t padding_top, int32_t padding_bottom, - int32_t stride_width, int32_t stride_height, - int32_t filter_width, int32_t filter_height, - int32_t activation, - float* outputData, const nnfw::rt::Shape& outputShape) -{ - arm_compute::TensorShape input_shape = util::fromNNShape(inputShape); - arm_compute::TensorShape output_shape = util::fromNNShape(outputShape); - - std::vector<std::shared_ptr<arm_compute::IFunction>> fns; - - arm_compute::PadStrideInfo pad_info = arm_compute::PadStrideInfo(stride_width, stride_height, - padding_left, padding_right, - padding_top, padding_bottom, - arm_compute::DimensionRoundingType::FLOOR); - - arm_compute::PoolingLayerInfo pool_info = arm_compute::PoolingLayerInfo(arm_compute::PoolingType::AVG, - arm_compute::Size2D(filter_width,filter_height), - pad_info, true); - - NEUniqueTensor input(arm_compute::TensorInfo(input_shape, arm_compute::Format::F32)); - NEUniqueTensor output(arm_compute::TensorInfo(output_shape, arm_compute::Format::F32)); - - auto pool_f = std::make_shared<arm_compute::NEPoolingLayer>(); - pool_f->configure(input.ptr(), output.ptr(), pool_info); - - fns.emplace_back(pool_f); - - util::insertFusedActivationLayer<NEUniqueTensor, arm_compute::NEActivationLayer>(output, activation, fns); - - input.allocate(); - output.allocate(); - - TensorAccess<InputAccessor>(input.ref(), inputData, inputShape); - - for (const auto &fn : fns) - { - fn->run(); - } - - TensorAccess<OutputAccessor>(output.ref(), outputData, outputShape); - - return true; -} - -} // namespace neon -} // namespace acl -} // namespace kernel -} // namespace nnfw diff --git a/libs/kernel/acl/src/neon/Pooling.test.cpp b/libs/kernel/acl/src/neon/Pooling.test.cpp deleted file mode 100644 index 4e6593921..000000000 --- a/libs/kernel/acl/src/neon/Pooling.test.cpp +++ /dev/null @@ -1,436 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. 
All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include <gtest/gtest.h> -#include <OperationsUtils.h> -#include <kernel/acl/nnfw_kernel_acl.h> -#include <arm_compute/core/Types.h> -#include <kernel/acl/Pooling.h> - -#include "../util.h" - -using namespace nnfw::kernel::acl; - -TEST(KernelACL_TC, neon_maxPoolFloat32_3x3to1x1) -{ - util::TensorWrapper input({1,3,3,1}); - const nnfw::rt::Shape inputShape = { OperandType::FLOAT32, {1,3,3,1}, 1.0, 0 }; - int32_t padding_left = 0; - int32_t padding_right = 0; - int32_t padding_top = 0; - int32_t padding_bottom = 0; - int32_t stride_width = 1; - int32_t stride_height = 1; - int32_t filter_width = 3; - int32_t filter_height = 3; - - float outputData[1]; - const nnfw::rt::Shape outputShape = { OperandType::FLOAT32, {1,1,1,1}, 1.0, 0 }; - bool bret; - - float value = 1.0f; - input.initValue([&value](uint32_t n, uint32_t c, uint32_t h, uint32_t w) { - return value++; - }); - - util::initData(outputData, sizeof(outputData) / sizeof(outputData[0]), 0.0); - - int32_t activation = ANEURALNETWORKS_FUSED_NONE; - - bret = neon::maxPoolFloat32(input.ptr<float>(), inputShape, - padding_left, padding_right, - padding_top, padding_bottom, - stride_width, stride_height, - filter_width, filter_height, - activation, - outputData, outputShape); - EXPECT_EQ(bret, true); - - float expectData[] = { 9.0f }; - bret = util::compareData(outputData, expectData, outputShape); - EXPECT_EQ(bret, true); -} - 
-TEST(KernelACL_TC, neon_maxPoolFloat32_3x3to1x1_RELU) -{ - util::TensorWrapper input({1,3,3,1}); - const nnfw::rt::Shape inputShape = { OperandType::FLOAT32, {1,3,3,1}, 1.0, 0 }; - int32_t padding_left = 0; - int32_t padding_right = 0; - int32_t padding_top = 0; - int32_t padding_bottom = 0; - int32_t stride_width = 1; - int32_t stride_height = 1; - int32_t filter_width = 3; - int32_t filter_height = 3; - - float outputData[1]; - const nnfw::rt::Shape outputShape = { OperandType::FLOAT32, {1,1,1,1}, 1.0, 0 }; - bool bret; - - float value = -1.0f; - input.initValue([&value](uint32_t n, uint32_t c, uint32_t h, uint32_t w) { - return value--; - }); - - util::initData(outputData, sizeof(outputData) / sizeof(outputData[0]), 0.0); - - int32_t activation = ANEURALNETWORKS_FUSED_RELU; - - bret = neon::maxPoolFloat32(input.ptr<float>(), inputShape, - padding_left, padding_right, - padding_top, padding_bottom, - stride_width, stride_height, - filter_width, filter_height, - activation, - outputData, outputShape); - EXPECT_EQ(bret, true); - - float expectData[] = { 0.0f }; - bret = util::compareData(outputData, expectData, outputShape); - EXPECT_EQ(bret, true); -} - -TEST(KernelACL_TC, neon_maxPoolFloat32_3x3to2x2) -{ - util::TensorWrapper input({1,3,3,1}); - const nnfw::rt::Shape inputShape = { OperandType::FLOAT32, {1,3,3,1}, 1.0, 0 }; - int32_t padding_left = 0; - int32_t padding_right = 1; - int32_t padding_top = 0; - int32_t padding_bottom = 1; - int32_t stride_width = 2; - int32_t stride_height = 2; - int32_t filter_width = 2; - int32_t filter_height = 2; - - float outputData[4]; - const nnfw::rt::Shape outputShape = { OperandType::FLOAT32, {1,2,2,1}, 1.0, 0 }; - bool bret; - - float value = 1.0f; - input.initValue([&value](uint32_t n, uint32_t c, uint32_t h, uint32_t w) { - return value++; - }); - - util::initData(outputData, sizeof(outputData) / sizeof(outputData[0]), 0.0); - - int32_t activation = ANEURALNETWORKS_FUSED_NONE; - - bret = 
neon::maxPoolFloat32(input.ptr<float>(), inputShape, - padding_left, padding_right, - padding_top, padding_bottom, - stride_width, stride_height, - filter_width, filter_height, - activation, - outputData, outputShape); - EXPECT_EQ(bret, true); - - float expectData[] = { - 5.0f, 6.0f, - 8.0f, 9.0f - }; - bret = util::compareData(outputData, expectData, outputShape); - EXPECT_EQ(bret, true); -} - -TEST(KernelACL_TC, neon_maxPoolFloat32_147x147to73x73) -{ - util::TensorWrapper input({1,147,147,64}); - util::TensorWrapper output({1,73,73,64}); - - int32_t padding_left = 0; - int32_t padding_right = 0; - int32_t padding_top = 0; - int32_t padding_bottom = 0; - int32_t stride_width = 2; - int32_t stride_height = 2; - int32_t filter_width = 3; - int32_t filter_height = 3; - - input.initValue([](uint32_t n, uint32_t c, uint32_t h, uint32_t w) { - return 1.0f; - }); - - output.initValue([](uint32_t n, uint32_t c, uint32_t h, uint32_t w) { - return 0.f; - }); - - int32_t activation = ANEURALNETWORKS_FUSED_NONE; - - bool bret = neon::maxPoolFloat32(input.ptr<float>(), input.shape(), - padding_left, padding_right, - padding_top, padding_bottom, - stride_width, stride_height, - filter_width, filter_height, - activation, - output.ptr<float>(), output.shape()); - EXPECT_EQ(bret, true); - - util::TensorWrapper expected({1,73,73,64}); - expected.initValue([](uint32_t n, uint32_t c, uint32_t h, uint32_t w) { - return 1.0f; - }); - - EXPECT_EQ(output, expected); -} - -TEST(KernelACL_TC, neon_maxPoolFloat32_71x71to35x35) -{ - util::TensorWrapper input({1,71,71,192}); - util::TensorWrapper output({1,35,35,192}); - - int32_t padding_left = 0; - int32_t padding_right = 0; - int32_t padding_top = 0; - int32_t padding_bottom = 0; - int32_t stride_width = 2; - int32_t stride_height = 2; - int32_t filter_width = 3; - int32_t filter_height = 3; - - input.initValue([](uint32_t n, uint32_t c, uint32_t h, uint32_t w) { - return 1.0f; - }); - - output.initValue([](uint32_t n, uint32_t c, uint32_t 
h, uint32_t w) { - return 0.f; - }); - - int32_t activation = ANEURALNETWORKS_FUSED_NONE; - - bool bret = neon::maxPoolFloat32(input.ptr<float>(), input.shape(), - padding_left, padding_right, - padding_top, padding_bottom, - stride_width, stride_height, - filter_width, filter_height, - activation, - output.ptr<float>(), output.shape()); - EXPECT_EQ(bret, true); - - util::TensorWrapper expected({1,35,35,192}); - expected.initValue([](uint32_t n, uint32_t c, uint32_t h, uint32_t w) { - return 1.0f; - }); - - EXPECT_EQ(output, expected); -} - -TEST(KernelACL_TC, neon_averagePoolFloat32_3x3to1x1) -{ - util::TensorWrapper input({1,3,3,1}); - const nnfw::rt::Shape inputShape = { OperandType::FLOAT32, {1,3,3,1}, 1.0, 0 }; - int32_t padding_left = 0; - int32_t padding_right = 0; - int32_t padding_top = 0; - int32_t padding_bottom = 0; - int32_t stride_width = 1; - int32_t stride_height = 1; - int32_t filter_width = 3; - int32_t filter_height = 3; - - float outputData[1]; - const nnfw::rt::Shape outputShape = { OperandType::FLOAT32, {1,1,1,1}, 1.0, 0 }; - bool bret; - - float value = 1.0f; - input.initValue([&value](uint32_t n, uint32_t c, uint32_t h, uint32_t w) { - return value++; - }); - - util::initData(outputData, sizeof(outputData) / sizeof(outputData[0]), 0.0); - - int32_t activation = ANEURALNETWORKS_FUSED_NONE; - - bret = neon::averagePoolFloat32(input.ptr<float>(), inputShape, - padding_left, padding_right, - padding_top, padding_bottom, - stride_width, stride_height, - filter_width, filter_height, - activation, - outputData, outputShape); - EXPECT_EQ(bret, true); - - float expectData[] = { 5.0f }; - bret = util::compareData(outputData, expectData, outputShape); - EXPECT_EQ(bret, true); -} - -TEST(KernelACL_TC, neon_averagePoolFloat32_3x3to1x1_RELU) -{ - util::TensorWrapper input({1,3,3,1}); - const nnfw::rt::Shape inputShape = { OperandType::FLOAT32, {1,3,3,1}, 1.0, 0 }; - int32_t padding_left = 0; - int32_t padding_right = 0; - int32_t padding_top = 0; - 
int32_t padding_bottom = 0; - int32_t stride_width = 1; - int32_t stride_height = 1; - int32_t filter_width = 3; - int32_t filter_height = 3; - - float outputData[1]; - const nnfw::rt::Shape outputShape = { OperandType::FLOAT32, {1,1,1,1}, 1.0, 0 }; - bool bret; - - float value = 3.0f; - input.initValue([&value](uint32_t n, uint32_t c, uint32_t h, uint32_t w) { - return value--; - }); - - util::initData(outputData, sizeof(outputData) / sizeof(outputData[0]), 0.0); - - int32_t activation = ANEURALNETWORKS_FUSED_RELU; - - bret = neon::averagePoolFloat32(input.ptr<float>(), inputShape, - padding_left, padding_right, - padding_top, padding_bottom, - stride_width, stride_height, - filter_width, filter_height, - activation, - outputData, outputShape); - EXPECT_EQ(bret, true); - - float expectData[] = { 0.0f }; - bret = util::compareData(outputData, expectData, outputShape); - EXPECT_EQ(bret, true); -} - -TEST(KernelACL_TC, neon_averagePoolFloat32_3x3to2x2) -{ - util::TensorWrapper input({1,3,3,1}); - const nnfw::rt::Shape inputShape = { OperandType::FLOAT32, {1,3,3,1}, 1.0, 0 }; - int32_t padding_left = 0; - int32_t padding_right = 0; - int32_t padding_top = 0; - int32_t padding_bottom = 0; - int32_t stride_width = 1; - int32_t stride_height = 1; - int32_t filter_width = 2; - int32_t filter_height = 2; - - float outputData[4]; - const nnfw::rt::Shape outputShape = { OperandType::FLOAT32, {1,2,2,1}, 1.0, 0 }; - bool bret; - - float value = 1.0f; - input.initValue([&value](uint32_t n, uint32_t c, uint32_t h, uint32_t w) { - return value++; - }); - - util::initData(outputData, sizeof(outputData) / sizeof(outputData[0]), 0.0); - - int32_t activation = ANEURALNETWORKS_FUSED_NONE; - - bret = neon::averagePoolFloat32(input.ptr<float>(), inputShape, - padding_left, padding_right, - padding_top, padding_bottom, - stride_width, stride_height, - filter_width, filter_height, - activation, - outputData, outputShape); - EXPECT_EQ(bret, true); - - float expectData[] = { - 3.0f, 4.0f, - 
6.0f, 7.0f - }; - bret = util::compareData(outputData, expectData, outputShape); - EXPECT_EQ(bret, true); -} - -TEST(KernelACL_TC, neon_averagePoolFloat32_35x35to35x35) -{ - std::vector<uint32_t> dims = {1,35,35,192}; - util::TensorWrapper input(dims); - util::TensorWrapper output(dims); - - int32_t padding_left = 1; - int32_t padding_right = 1; - int32_t padding_top = 1; - int32_t padding_bottom = 1; - int32_t stride_width = 1; - int32_t stride_height = 1; - int32_t filter_width = 3; - int32_t filter_height = 3; - - input.initValue([](uint32_t n, uint32_t c, uint32_t h, uint32_t w) { - return 1.0f; - }); - - output.initValue([](uint32_t n, uint32_t c, uint32_t h, uint32_t w) { - return 0.f; - }); - - int32_t activation = ANEURALNETWORKS_FUSED_NONE; - - bool bret = neon::averagePoolFloat32(input.ptr<float>(), input.shape(), - padding_left, padding_right, - padding_top, padding_bottom, - stride_width, stride_height, - filter_width, filter_height, - activation, - output.ptr<float>(), output.shape()); - EXPECT_EQ(bret, true); - - util::TensorWrapper expected(dims); - expected.initValue([](uint32_t n, uint32_t c, uint32_t h, uint32_t w) { - return 1.0f; - }); - - EXPECT_EQ(output, expected); -} - -TEST(KernelACL_TC, neon_averagePoolFloat32_8x8to1x1) -{ - util::TensorWrapper input({1,8,8,2048}); - util::TensorWrapper output({1,1,1,2048}); - - int32_t padding_left = 0; - int32_t padding_right = 0; - int32_t padding_top = 0; - int32_t padding_bottom = 0; - int32_t stride_width = 2; - int32_t stride_height = 2; - int32_t filter_width = 8; - int32_t filter_height = 8; - - input.initValue([](uint32_t n, uint32_t c, uint32_t h, uint32_t w) { - return 1.0f; - }); - - output.initValue([](uint32_t n, uint32_t c, uint32_t h, uint32_t w) { - return 0.f; - }); - - int32_t activation = ANEURALNETWORKS_FUSED_NONE; - - bool bret = neon::averagePoolFloat32(input.ptr<float>(), input.shape(), - padding_left, padding_right, - padding_top, padding_bottom, - stride_width, stride_height, - 
filter_width, filter_height, - activation, - output.ptr<float>(), output.shape()); - EXPECT_EQ(bret, true); - - util::TensorWrapper expected({1,1,1,2048}); - expected.initValue([](uint32_t n, uint32_t c, uint32_t h, uint32_t w) { - return 1.0f; - }); - - EXPECT_EQ(output, expected); -} diff --git a/libs/kernel/acl/src/neon/Reshape.cpp b/libs/kernel/acl/src/neon/Reshape.cpp deleted file mode 100644 index cef84c7f3..000000000 --- a/libs/kernel/acl/src/neon/Reshape.cpp +++ /dev/null @@ -1,48 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include <OperationsUtils.h> -#include <arm_compute/core/TensorShape.h> -#include <arm_compute/core/TensorInfo.h> - -// TODO: fix include path in CMakeFiles -#include "../IO_accessor.h" -#include "../shape.h" -#include "../NEUniqueTensor.h" -#include "../Reshape.h" - -namespace nnfw { -namespace kernel { -namespace acl { - -namespace neon { - -static void sync_scheduler() { - arm_compute::CLScheduler::get().sync(); -} - -bool reshapeGeneric(const void* inputData, const nnfw::rt::Shape& inputShape, - void* outputData, const nnfw::rt::Shape& outputShape) { - return common::reshapeGeneric<NEUniqueTensor, arm_compute::NEReshapeLayer> - (inputData, inputShape, outputData, outputShape, sync_scheduler); -} - -} // namespace neon - -} // namespace acl -} // namespace kernel -} // namespace nnfw - diff --git a/libs/kernel/acl/src/neon/Reshape.test.cpp b/libs/kernel/acl/src/neon/Reshape.test.cpp deleted file mode 100644 index 9aca45e7e..000000000 --- a/libs/kernel/acl/src/neon/Reshape.test.cpp +++ /dev/null @@ -1,20 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#define ACL_CORE_FUNC_NAME neon::reshapeGeneric -#define ACL_TEST(tc, t) TEST(tc, neon_##t) - -#include "../Reshape.test.h" diff --git a/libs/kernel/acl/src/neon/Softmax.cpp b/libs/kernel/acl/src/neon/Softmax.cpp deleted file mode 100644 index 79d614418..000000000 --- a/libs/kernel/acl/src/neon/Softmax.cpp +++ /dev/null @@ -1,77 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include <OperationsUtils.h> -#include <NeuralNetworks.h> - -#include <arm_compute/core/TensorShape.h> -#include <arm_compute/core/TensorInfo.h> -#include "../IO_accessor.h" -#include "../shape.h" -#include "../util.h" -#include "../NEUniqueTensor.h" - -namespace nnfw { -namespace kernel { -namespace acl { -namespace neon { - -bool softmaxFloat32(const float* inputData, const nnfw::rt::Shape& inputShape, - const float beta, - float* outputData, const nnfw::rt::Shape& outputShape) -{ - arm_compute::TensorShape input_shape = util::fromNNShape(inputShape); - arm_compute::TensorShape output_shape = util::fromNNShape(outputShape); - - NEUniqueTensor input(arm_compute::TensorInfo(input_shape, arm_compute::Format::F32)); - NEUniqueTensor output(arm_compute::TensorInfo(output_shape, arm_compute::Format::F32)); - - auto softmax_f = std::make_shared<arm_compute::NESoftmaxLayer>(); - softmax_f->configure(input.ptr(), output.ptr(), beta); - - input.allocate(); - output.allocate(); - - if 
(inputShape.dimensions.size() == 4) - { - TensorAccess<InputAccessor>(input.ref(), inputData, inputShape); - - softmax_f->run(); - - TensorAccess<OutputAccessor>(output.ref(), outputData, outputShape); - } - else if (inputShape.dimensions.size() == 2) - { - // Softmax comes with 1xN matrix and this is translated to N vector in arm_compute::TensorShape - TensorAccess<VectorInputAccessor>(input.ref(), inputData, inputShape); - - softmax_f->run(); - - TensorAccess<VectorOutputAccessor>(output.ref(), outputData, outputShape); - } - else - { - assert("undefined dimension of input" && 0); - return false; - } - - return true; -} - -} // namespace neon -} // namespace acl -} // namespace kernel -} // namespace nnfw diff --git a/libs/kernel/acl/src/neon/Softmax.test.cpp b/libs/kernel/acl/src/neon/Softmax.test.cpp deleted file mode 100644 index 988f55078..000000000 --- a/libs/kernel/acl/src/neon/Softmax.test.cpp +++ /dev/null @@ -1,105 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include <gtest/gtest.h> -#include <OperationsUtils.h> -#include <kernel/acl/nnfw_kernel_acl.h> -#include <arm_compute/core/Types.h> -#include <kernel/acl/Softmax.h> - -#include "../util.h" - -using namespace nnfw::kernel::acl; - -TEST(KernelACL_TC, neon_softmaxFloat32_1xn) -{ - float inputData[4]; - const nnfw::rt::Shape inputShape = { OperandType::FLOAT32, {1,4}, 1.0, 0 }; - float outputData[4]; - const nnfw::rt::Shape outputShape = { OperandType::FLOAT32, {1,4}, 1.0, 0 }; - const float beta = 1.0f; - bool bret; - - util::initData(inputData, sizeof(inputData) / sizeof(inputData[0]), 1.0); - util::initData(outputData, sizeof(outputData) / sizeof(outputData[0]), 0.0); - - bret = neon::softmaxFloat32(inputData, inputShape, beta, outputData, outputShape); - EXPECT_EQ(bret, true); - - float expectData[] = { 0.25f, 0.25f, 0.25f, 0.25f }; - bret = util::compareData(outputData, expectData, outputShape); - EXPECT_EQ(bret, true); -} - -TEST(KernelACL_TC, neon_softmaxFloat32_4d) -{ - float inputData[4]; - const nnfw::rt::Shape inputShape = { OperandType::FLOAT32, {1,1,4,1}, 1.0, 0 }; - float outputData[4]; - const nnfw::rt::Shape outputShape = { OperandType::FLOAT32, {1,1,4,1}, 1.0, 0 }; - const float beta = 1.0f; - bool bret; - - util::initData(inputData, sizeof(inputData) / sizeof(inputData[0]), 1.0); - util::initData(outputData, sizeof(outputData) / sizeof(outputData[0]), 0.0); - - bret = neon::softmaxFloat32(inputData, inputShape, beta, outputData, outputShape); - EXPECT_EQ(bret, true); - - float expectData[] = { 0.25f, 0.25f, 0.25f, 0.25f }; - bret = util::compareData(outputData, expectData, outputShape); - EXPECT_EQ(bret, true); -} - -TEST(KernelACL_TC, neon_softmaxFloat32_1xn_seq) -{ - float inputData[4]; - const nnfw::rt::Shape inputShape = { OperandType::FLOAT32, {1,1,4,1}, 1.0, 0 }; - float outputData[4]; - const nnfw::rt::Shape outputShape = { OperandType::FLOAT32, {1,1,4,1}, 1.0, 0 }; - const float beta = 1.0f; - bool bret; - - 
util::initData_Increasing(inputData, sizeof(inputData) / sizeof(inputData[0]), 1.0); - util::initData(outputData, sizeof(outputData) / sizeof(outputData[0]), 0.0); - - bret = neon::softmaxFloat32(inputData, inputShape, beta, outputData, outputShape); - EXPECT_EQ(bret, true); - - float expectData[] = {0.032058603280085, 0.0871443187420326, 0.23688281808991, 0.643914259887972}; - bret = util::compareData(outputData, expectData, outputShape); - EXPECT_EQ(bret, true); -} - -TEST(KernelACL_TC, neon_softmaxFloat32_4d_seq) -{ - float inputData[4]; - const nnfw::rt::Shape inputShape = { OperandType::FLOAT32, {1,1,4,1}, 1.0, 0 }; - float outputData[4]; - const nnfw::rt::Shape outputShape = { OperandType::FLOAT32, {1,1,4,1}, 1.0, 0 }; - const float beta = 1.0f; - bool bret; - - util::initData_Increasing(inputData, sizeof(inputData) / sizeof(inputData[0]), 1.0); - util::initData(outputData, sizeof(outputData) / sizeof(outputData[0]), 0.0); - - bret = neon::softmaxFloat32(inputData, inputShape, beta, outputData, outputShape); - EXPECT_EQ(bret, true); - - float expectData[] = {0.032058603280085, 0.0871443187420326, 0.23688281808991, 0.643914259887972}; - bret = util::compareData(outputData, expectData, outputShape); - EXPECT_EQ(bret, true); -} diff --git a/libs/kernel/acl/src/shape.cpp b/libs/kernel/acl/src/shape.cpp deleted file mode 100644 index 3c976ae94..000000000 --- a/libs/kernel/acl/src/shape.cpp +++ /dev/null @@ -1,89 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include <cassert> - -#include "shape.h" - -namespace nnfw { -namespace rt { - -// TODO remove from this source and use it from runtime -uint32_t getSizeOfDimension(const Shape& shape, uint32_t dimensionIdx) { - if (dimensionIdx >= shape.dimensions.size()) { - // TODO, log the error - return 0; - } - return shape.dimensions[dimensionIdx]; -} - -} // namespace rt -} // namespace nnfw - -namespace nnfw { -namespace kernel { -namespace acl { -namespace util { - -arm_compute::TensorShape fromVectorNNShape(const nnfw::rt::Shape& shape) -{ - assert(shape.dimensions.size() == 1); - - const uint32_t len = nnfw::rt::getSizeOfDimension(shape, 0); - - return arm_compute::TensorShape(len); -} - -arm_compute::TensorShape fromMatrixNNShape(const nnfw::rt::Shape& shape) -{ - assert(shape.dimensions.size() == 2); - - const uint32_t n = nnfw::rt::getSizeOfDimension(shape, 0); - const uint32_t c = nnfw::rt::getSizeOfDimension(shape, 1); - - return arm_compute::TensorShape(c, n); -} - -arm_compute::TensorShape fromNNShape(const nnfw::rt::Shape& shape) -{ - if( shape.dimensions.size() == 1 ) - return fromVectorNNShape(shape); - else if( shape.dimensions.size() == 2 ) - return fromMatrixNNShape(shape); - - // TODO: need to treat 3D tensors. 
- - assert(shape.dimensions.size() == 4); - - // NNAPI assumes the following ordering: - // - // dim(0) -> N - // dim(1) -> H - // dim(2) -> W - // dim(3) -> C - // - uint32_t c = nnfw::rt::getSizeOfDimension(shape, 3); - uint32_t h = nnfw::rt::getSizeOfDimension(shape, 1); - uint32_t w = nnfw::rt::getSizeOfDimension(shape, 2); - uint32_t n = nnfw::rt::getSizeOfDimension(shape, 0); - - return arm_compute::TensorShape(w, h, c, n); -} - -} // namespace util -} // namespace acl -} // namespace kernel -} // namespace nnfw diff --git a/libs/kernel/acl/src/shape.h b/libs/kernel/acl/src/shape.h deleted file mode 100644 index 902115ebd..000000000 --- a/libs/kernel/acl/src/shape.h +++ /dev/null @@ -1,93 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifndef __NNFW_KERNEL_ACL_SHAPE_H__ -#define __NNFW_KERNEL_ACL_SHAPE_H__ - -#include <OperationsUtils.h> -#include <arm_compute/core/TensorShape.h> -#include <arm_compute/core/TensorInfo.h> -#include <arm_compute/runtime/IFunction.h> -#include <cassert> - -namespace nnfw { -namespace rt { - -// TODO remove from this source and use it from runtime -uint32_t getSizeOfDimension(const Shape& shape, uint32_t dimensionIdx); - -} // namespace rt -} // namespace nnfw - -namespace nnfw { -namespace kernel { -namespace acl { -namespace util { - -arm_compute::TensorShape fromVectorNNShape(const nnfw::rt::Shape& shape); -arm_compute::TensorShape fromNNShape(const nnfw::rt::Shape& shape); - -template<class TensorT, class ActT> -void insertFusedActivationLayer(TensorT& out, int activation, - std::vector<std::shared_ptr<arm_compute::IFunction>>& fns) { - auto relu_f = std::make_shared<ActT>(); - - switch(activation) { - case ANEURALNETWORKS_FUSED_NONE: - // DO NOTHING - return; - - case ANEURALNETWORKS_FUSED_RELU: - { - const arm_compute::ActivationLayerInfo relu_info(arm_compute::ActivationLayerInfo::ActivationFunction::RELU); - - // Do in-place update - relu_f->configure(out.ptr(), nullptr, relu_info); - } - break; - - case ANEURALNETWORKS_FUSED_RELU1: - { - const arm_compute::ActivationLayerInfo relu_info(arm_compute::ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 1.f); - - // Do in-place update - relu_f->configure(out.ptr(), nullptr, relu_info); - } - break; - - case ANEURALNETWORKS_FUSED_RELU6: - { - const arm_compute::ActivationLayerInfo relu_info(arm_compute::ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 6.f); - - // Do in-place update - relu_f->configure(out.ptr(), nullptr, relu_info); - } - break; - - default: - assert("Undefined activation type." 
&& 0); - break; - } - - fns.emplace_back(relu_f); -} - -} // namespace util -} // namespace acl -} // namespace kernel -} // namespace nnfw - -#endif // __NNFW_KERNEL_ACL_SHAPE_H__ diff --git a/libs/kernel/acl/src/support.cpp b/libs/kernel/acl/src/support.cpp deleted file mode 100644 index d04aef59e..000000000 --- a/libs/kernel/acl/src/support.cpp +++ /dev/null @@ -1,51 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "support.h" - -namespace nnfw -{ -namespace support -{ -namespace nnapi -{ -namespace feature -{ - -// TODO Extract this function as utility function -// NOTE It is not a good design to access nnfw::rt::Shape nnfw_support_nnapi lib -nnfw::util::feature::Shape asFeatureShape(const nnfw::rt::Shape& shape) -{ - // NNAPI assumes the following ordering: - // - // dim(0) -> N - // dim(1) -> H - // dim(2) -> W - // dim(3) -> C - // - int32_t c = nnfw::rt::getSizeOfDimension(shape, 3); - int32_t h = nnfw::rt::getSizeOfDimension(shape, 1); - int32_t w = nnfw::rt::getSizeOfDimension(shape, 2); - - assert(nnfw::rt::getSizeOfDimension(shape, 0) == 1); - - return nnfw::util::feature::Shape{c, h, w}; -} - -} // namespace feature -} // namespace nnapi -} // namespace support -} // namespace nnfw diff --git a/libs/kernel/acl/src/support.h b/libs/kernel/acl/src/support.h deleted file mode 100644 index 751d2c6cb..000000000 --- a/libs/kernel/acl/src/support.h +++ /dev/null @@ -1,93 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifndef __NNFW_KERNEL_SUPPORT_H_TEMPORARY__ -#define __NNFW_KERNEL_SUPPORT_H_TEMPORARY__ - -// NOTE these are not decided yet but need to be moved out from Conv2D -// to separate NEON implementation to it's folder -// TODO move to some folder where it should be - -#include <cassert> - -#include "util/feature/Shape.h" - -#include <OperationsUtils.h> - -namespace nnfw -{ -namespace support -{ -namespace nnapi -{ -namespace feature -{ - -// TODO Extract this function as utility function -// NOTE It is not a good design to access nnfw::rt::Shape nnfw_support_nnapi lib -nnfw::util::feature::Shape asFeatureShape(const nnfw::rt::Shape& shape); - -} // namespace feature -} // namespace nnapi -} // namespace support -} // namespace nnfw - -#include <arm_compute/core/ITensor.h> - -#include "util/feature/Reader.h" - -namespace nnfw -{ -namespace support -{ -namespace acl -{ -namespace feature -{ - -template<typename T> class Reader; - -template<> class Reader<float> final : public nnfw::util::feature::Reader<float> -{ -public: - Reader(arm_compute::ITensor *tensor) : _tensor{tensor} - { - assert(tensor->info()->data_type() == arm_compute::DataType::F32); - } - -public: - float at(uint32_t ch, uint32_t row, uint32_t col) const override - { - return *ptr_to_element(ch, row, col); - } - -private: - float *ptr_to_element(uint32_t ch, uint32_t row, uint32_t col) const - { - // ARM Compute uses CHW ordering - return reinterpret_cast<float *>(_tensor->ptr_to_element(arm_compute::Coordinates{col, row, ch})); - } - -private: - arm_compute::ITensor *_tensor; -}; - -} // namespace feature -} // namespace acl -} // namespace support -} // namespace nnfw - -#endif // __NNFW_KERNEL_SUPPORT_H_TEMPORARY__ diff --git a/libs/kernel/acl/src/util.cpp b/libs/kernel/acl/src/util.cpp deleted file mode 100644 index 7e5df534e..000000000 --- a/libs/kernel/acl/src/util.cpp +++ /dev/null @@ -1,108 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. 
All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include <util/fp32.h> - -#include "util.h" - -namespace nnfw { -namespace kernel { -namespace acl { -namespace util { - -void initData(float* data, int num, float value) -{ - for (int i = 0; i < num; i++) { - *(data + i) = value; - } -} - -void initData_Increasing(float* data, int num, float value) -{ - for (int i = 0; i < num; i++) { - *(data + i) = value; - value++; - } -} - -// compareData -// return true if result == expected with the shape info, -// otherwise false -bool compareData(const float* result, const float* expected, const nnfw::rt::Shape& shape) -{ - if (shape.dimensions.size() == 4) - { - // TODO fix indentation - uint32_t height = nnfw::rt::getSizeOfDimension(shape, 1); - uint32_t width = nnfw::rt::getSizeOfDimension(shape, 2); - uint32_t numitems = height * width; - for (int item = 0; item < numitems; item++) { - if (!::nnfw::util::fp32::epsilon_equal(*(result + item), *(expected + item), 1)) { - LOG(ERROR) << "compareData failed: result " << *(result + item) - << ", expected " << *(expected + item) - << ", diff " << ::nnfw::util::fp32::relative_diff(*(result + item), *(expected + item)) - << std::endl; - return false; - } - } - } - else if (shape.dimensions.size() == 2) - { - uint32_t height = nnfw::rt::getSizeOfDimension(shape, 0); - uint32_t width = nnfw::rt::getSizeOfDimension(shape, 1); - uint32_t numitems = height * width; - for (int item = 0; item < 
numitems; item++) { - if (!::nnfw::util::fp32::epsilon_equal(*(result + item), *(expected + item), 1)) { - LOG(ERROR) << "compareData failed: result " << *(result + item) - << ", expected " << *(expected + item) - << ", diff " << ::nnfw::util::fp32::relative_diff(*(result + item), *(expected + item)) - << std::endl; - return false; - } - } - } - else - { - // TODO: add a handler for rank 1 and 3 - LOG(ERROR) << "Unhandled shape: " << shape.dimensions.size() << std::endl; - } - return true; -} - -void NCHW2NHWC(const float* nchw, float* nhwc, const nnfw::rt::Shape& shape) -{ - uint32_t N = nnfw::rt::getSizeOfDimension(shape, 0); - uint32_t H = nnfw::rt::getSizeOfDimension(shape, 1); - uint32_t W = nnfw::rt::getSizeOfDimension(shape, 2); - uint32_t C = nnfw::rt::getSizeOfDimension(shape, 3); - - for (uint32_t n = 0; n < N; n++) { - for (uint32_t c = 0; c < C; c++) { - for (uint32_t h = 0; h < H; h++) { - for (uint32_t w = 0; w < W; w++) { - uint32_t soffset = w + (h * W) + (c * W * H) + (n * W * H * C); - uint32_t doffset = c + (w * C) + (h * C * W) + (n * C * W * H); - *(nhwc + doffset) = *(nchw + soffset); - } - } - } - } -} - -} // namespace util -} // namespace acl -} // namespace kernel -} // namespace nnfw diff --git a/libs/kernel/acl/src/util.h b/libs/kernel/acl/src/util.h deleted file mode 100644 index 48ed02783..000000000 --- a/libs/kernel/acl/src/util.h +++ /dev/null @@ -1,193 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef __NNFW_KERNEL_ACL_UTIL_H__ -#define __NNFW_KERNEL_ACL_UTIL_H__ -#include <OperationsUtils.h> - -#include <cmath> -#include <cassert> -#include <functional> - -namespace nnfw { -namespace kernel { -namespace acl { -namespace util { - -// TODO: make a separate module. -class TensorWrapper { -public: - TensorWrapper(std::vector<uint32_t> dims, - OperandType type = OperandType::FLOAT32, - float scale = 1.0, - int32_t offset = 0) - :_shape{type, dims, scale, offset} - { - - // currently, we support only FLOAT32 for now. - assert( type == OperandType::FLOAT32); - - uint32_t size_bytes = sizeof(float); - - _num_elems = 1; - for( auto& d: dims ) { - _num_elems *= d; - } - - _data = new uint8_t[_num_elems * size_bytes]; - } - - ~TensorWrapper() { - delete [] _data; - } - - const nnfw::rt::Shape shape() const { - return _shape; - } - - uint32_t num_elems() const { return _num_elems; } - - template<class T> - T at(const uint32_t& idx) const { - return reinterpret_cast<T*>(_data)[idx]; - } - - template<class T> - T& at(const uint32_t& idx) { - return reinterpret_cast<T*>(_data)[idx]; - } - - template<class T> - T* ptr() { return reinterpret_cast<T*>(_data); } - - void initValue(float f) { - for( uint32_t i = 0; i < _num_elems; ++i ) { - at<float>(i) = f; - } - } - - typedef std::function<float(uint32_t n, uint32_t c, uint32_t h, uint32_t w)> funcInit4; - void initValue(funcInit4 f) { - assert(_shape.dimensions.size() == 4); - - int N = _shape.dimensions[0]; - int H = _shape.dimensions[1]; - int W = _shape.dimensions[2]; - int C = _shape.dimensions[3]; - - for(int n = 0; n < N; ++n) { - for(int h = 0; h < H; ++h) { - for(int w = 0; w < W; ++w) { - for(int c = 0; c < C; ++c) { - uint32_t offset = n*H*W*C + h*W*C + w*C + c; - at<float>(offset) = f(n,c,h,w); - } - } - } - } - } - - typedef std::function<float(uint32_t c, uint32_t h, uint32_t w)> funcInit3; - 
void initValue(funcInit3 f) { - assert(_shape.dimensions.size() == 3); - - int C = _shape.dimensions[0]; - int H = _shape.dimensions[1]; - int W = _shape.dimensions[2]; - - for(int h = 0; h < H; ++h) { - for(int w = 0; w < W; ++w) { - for(int c = 0; c < C; ++c) { - uint32_t offset = h*W*C + w*C + c; - at<float>(offset) = f(c,h,w); - } - } - } - } - - typedef std::function<float(uint32_t h, uint32_t w)> funcInit2; - void initValue(funcInit2 f) { - assert(_shape.dimensions.size() == 2); - - int H = _shape.dimensions[0]; - int W = _shape.dimensions[1]; - - for(int h = 0; h < H; ++h) { - for(int w = 0; w < W; ++w) { - uint32_t offset = h*W + w; - at<float>(offset) = f(h,w); - } - } - } - - typedef std::function<float(uint32_t w)> funcInit1; - void initValue(funcInit1 f) { - assert(_shape.dimensions.size() == 1); - - int W = _shape.dimensions[0]; - - for(int w = 0; w < W; ++w) { - uint32_t offset = w; - at<float>(offset) = f(w); - } - } - - void initValue(std::vector<float> v) { - assert(v.size() == _num_elems); - for( uint32_t i = 0; i < _num_elems; ++i ) { - at<float>(i) = v[i]; - } - } - - bool operator==(const TensorWrapper &t) const { - // compare the shape - assert(num_elems() == t.num_elems()); - assert(_shape.type == t.shape().type); - assert(_shape.scale == t.shape().scale); - assert(_shape.offset == t.shape().offset); - assert(_shape.dimensions == t.shape().dimensions); - - // currently, we support only FLOAT32. 
- assert(_shape.type == OperandType::FLOAT32); - - for( uint32_t i = 0; i < _num_elems; ++i ) { - if( std::fabs(static_cast<float>(at<float>(i) - t.at<float>(i))) > 0.001f ) { - std::cout << "Comparing [" << i << "] " << at<float>(i) << "," << t.at<float>(i) << std::endl; - return false; - } - } - - return true; - } - -private: - nnfw::rt::Shape _shape; - uint32_t _num_elems; - uint8_t* _data; -}; - -void initData(float* data, int num, float value); -bool compareData(const float* result, const float* expected, const nnfw::rt::Shape& shape); -void initData_Increasing(float* data, int num, float value); - -void NCHW2NHWC(const float* nchw, float* nhwc, const nnfw::rt::Shape& shape); - -} // namespace util -} // namespace acl -} // namespace kernel -} // namespace nnfw - -#endif // __NNFW_KERNEL_ACL_UTIL_H__ diff --git a/libs/support/nnapi/CMakeLists.txt b/libs/support/nnapi/CMakeLists.txt index cd1f365cf..193bcbd4e 100644 --- a/libs/support/nnapi/CMakeLists.txt +++ b/libs/support/nnapi/CMakeLists.txt @@ -3,4 +3,4 @@ file(GLOB_RECURSE SOURCES "src/*.cpp") add_library(nnfw_support_nnapi ${SOURCES}) set_property(TARGET nnfw_support_nnapi PROPERTY POSITION_INDEPENDENT_CODE ON) target_include_directories(nnfw_support_nnapi PUBLIC ${CMAKE_SOURCE_DIR}/include) -target_link_libraries(nnfw_support_nnapi nnfw_util) +target_link_libraries(nnfw_support_nnapi static_nnfw_util) diff --git a/libs/support/nnapi/src/Utils.cpp b/libs/support/nnapi/src/Utils.cpp new file mode 100644 index 000000000..ae1076fd1 --- /dev/null +++ b/libs/support/nnapi/src/Utils.cpp @@ -0,0 +1,29 @@ +#include "support/nnapi/Utils.h" + +#include <cassert> + +namespace nnfw +{ +namespace support +{ +namespace nnapi +{ + +const char *to_string(const PaddingCode &code) +{ + assert((ANEURALNETWORKS_PADDING_SAME == code) || (ANEURALNETWORKS_PADDING_VALID == code)); + + switch (code) + { + case ANEURALNETWORKS_PADDING_SAME: + return "ANEURALNETWORKS_PADDING_SAME"; + case ANEURALNETWORKS_PADDING_VALID: + return 
"ANEURALNETWORKS_PADDING_VALID"; + } + + return nullptr; +} + +} // namespace nnapi +} // namespace support +} // namespace nnfw diff --git a/libs/support/tflite/CMakeLists.txt b/libs/support/tflite/CMakeLists.txt index cccc7de3d..667b3bc11 100644 --- a/libs/support/tflite/CMakeLists.txt +++ b/libs/support/tflite/CMakeLists.txt @@ -2,9 +2,11 @@ file(GLOB_RECURSE SOURCES "src/*.cpp") file(GLOB_RECURSE TESTS "src/*.test.cpp") list(REMOVE_ITEM SOURCES ${TESTS}) -add_library(nnfw_support_tflite ${SOURCES}) +add_library(nnfw_support_tflite STATIC ${SOURCES}) +set_target_properties(nnfw_support_tflite PROPERTIES POSITION_INDEPENDENT_CODE ON) target_include_directories(nnfw_support_tflite PUBLIC ${CMAKE_SOURCE_DIR}/include) -target_link_libraries(nnfw_support_tflite nnfw_util tensorflow-lite ${LIB_PTHREAD} dl) +target_link_libraries(nnfw_support_tflite tensorflow-lite ${LIB_PTHREAD} dl) +target_link_libraries(nnfw_support_tflite static_nnfw_util) add_executable(nnfw_support_tflite_test_TensorView src/TensorView.test.cpp) target_link_libraries(nnfw_support_tflite_test_TensorView nnfw_support_tflite) diff --git a/libs/support/tflite/src/Diff.cpp b/libs/support/tflite/src/Diff.cpp index f382df2d6..e875571cb 100644 --- a/libs/support/tflite/src/Diff.cpp +++ b/libs/support/tflite/src/Diff.cpp @@ -15,24 +15,31 @@ */ #include "support/tflite/Diff.h" +#include "support/tflite/nnapi_delegate.h" #include "util/fp32.h" #include "util/tensor/IndexIterator.h" #include "util/tensor/IndexFormatter.h" #include "util/tensor/Zipper.h" +#include "util/tensor/Comparator.h" + +#include "util/environment.h" #include <iostream> +#include <cassert> -class DiffSummary : public TfLiteTensorComparator::Observer +class DiffSummary : public nnfw::util::tensor::Comparator::Observer { public: DiffSummary() - : max_abs_diff_index(0), max_abs_diff_value{0.0f}, - max_rel_diff_index(0), max_rel_diff_value{0.0f} + : max_abs_diff_index(0), max_abs_diff_expected{0.0f}, max_abs_diff_obtained{0.0f}, + 
max_abs_diff_value{0.0f}, max_rel_diff_index(0), max_rel_diff_expected{0.0f}, + max_rel_diff_obtained{0.0f}, max_rel_diff_value{0.0f} { // DO NOTHING } + public: void notify(const nnfw::util::tensor::Index &index, float expected, float obtained) override; @@ -71,170 +78,422 @@ void DiffSummary::notify(const nnfw::util::tensor::Index &index, float expected, } } -std::vector<TfLiteTensorDiff> -TfLiteTensorComparator::compare(const nnfw::support::tflite::TensorView<float> &expected, - const nnfw::support::tflite::TensorView<float> &obtained, - Observer *observer) const +template <typename T> +bool TfLiteInterpMatchApp::compareSingleTensorView( + const nnfw::support::tflite::TensorView<T> &expected, + const nnfw::support::tflite::TensorView<T> &obtained, int id) const { - std::vector<TfLiteTensorDiff> res; - + std::vector<nnfw::util::tensor::Diff<T>> diffs; assert(expected.shape() == obtained.shape()); - nnfw::util::tensor::zip(expected.shape(), expected, obtained) << - [&] (const nnfw::util::tensor::Index &index, float expected_value, float obtained_value) + using nnfw::util::tensor::zip; + using nnfw::util::tensor::Index; + + zip(expected.shape(), expected, obtained) + << [&](const Index &index, T expected_value, T obtained_value) { + if (expected_value != obtained_value) + { + diffs.emplace_back(index, expected_value, obtained_value); + } + }; + + // TODO Unify summary generation code + if (diffs.size() == 0) { - const auto relative_diff = nnfw::util::fp32::relative_diff(expected_value, obtained_value); + std::cout << " Tensor #" << id << ": MATCHED" << std::endl; + } + else + { + std::cout << " Tensor #" << id << ": UNMATCHED" << std::endl; + std::cout << " " << diffs.size() << " diffs are detected" << std::endl; + } - if (!_compare_fn(expected_value, obtained_value)) + if (diffs.size() > 0 && _verbose != 0) + { + std::cout << " ---- Details ---" << std::endl; + for (const auto &diff : diffs) { - TfLiteTensorDiff diff(index); + std::cout << " Diff at [" << 
nnfw::util::tensor::IndexFormatter(diff.index) << "]" + << std::endl; + std::cout << " expected: " << diff.expected << std::endl; + std::cout << " obtained: " << diff.obtained << std::endl; + } + } - diff.expected = expected_value; - diff.obtained = obtained_value; + return diffs.size() == 0; +} - res.emplace_back(diff); - } +template <> +bool TfLiteInterpMatchApp::compareSingleTensorView<float>( + const nnfw::support::tflite::TensorView<float> &expected, + const nnfw::support::tflite::TensorView<float> &obtained, int id) const +{ + DiffSummary summary; - // Update max_diff_index, if necessary - if (observer != nullptr) + assert(expected.shape() == obtained.shape()); + auto diffs = _comparator.compare(expected.shape(), expected, obtained, &summary); + + // TODO Unify summary generation code + if (diffs.size() == 0) + { + std::cout << " Tensor #" << id << ": MATCHED" << std::endl; + } + else + { + std::cout << " Tensor #" << id << ": UNMATCHED" << std::endl; + std::cout << " " << diffs.size() << " diffs are detected" << std::endl; + } + + // Print out max_diff + if (summary.max_abs_diff_value > 0) + { + std::cout << " Max absolute diff at [" + << nnfw::util::tensor::IndexFormatter(summary.max_abs_diff_index) << "]" << std::endl; + std::cout << " expected: " << summary.max_abs_diff_expected << std::endl; + std::cout << " obtained: " << summary.max_abs_diff_obtained << std::endl; + std::cout << " absolute diff: " << summary.max_abs_diff_value << std::endl; + } + + if (summary.max_rel_diff_value > 0) + { + const auto tolerance_level = summary.max_rel_diff_value / FLT_EPSILON; + + std::cout << " Max relative diff at [" + << nnfw::util::tensor::IndexFormatter(summary.max_rel_diff_index) << "]" << std::endl; + std::cout << " expected: " << summary.max_rel_diff_expected << std::endl; + std::cout << " obtained: " << summary.max_rel_diff_obtained << std::endl; + std::cout << " relative diff: " << summary.max_rel_diff_value << std::endl; + std::cout << " (tolerance level = " 
<< tolerance_level << ")" << std::endl; + } + + if (diffs.size() > 0) + { + if (_verbose != 0) { - observer->notify(index, expected_value, obtained_value); + std::cout << " ---- Details ---" << std::endl; + for (const auto &diff : diffs) + { + const auto absolute_diff = std::fabs(diff.expected - diff.obtained); + const auto relative_diff = nnfw::util::fp32::relative_diff(diff.expected, diff.obtained); + const auto tolerance_level = relative_diff / FLT_EPSILON; + + std::cout << " Diff at [" << nnfw::util::tensor::IndexFormatter(diff.index) << "]" + << std::endl; + std::cout << " expected: " << diff.expected << std::endl; + std::cout << " obtained: " << diff.obtained << std::endl; + std::cout << " absolute diff: " << absolute_diff << std::endl; + std::cout << " relative diff: " << relative_diff << std::endl; + std::cout << " (tolerance level = " << tolerance_level << ")" << std::endl; + } } - }; - return res; + return false; + } + return true; } +#include <map> + bool TfLiteInterpMatchApp::run(::tflite::Interpreter &interp, ::tflite::Interpreter &nnapi) const { assert(interp.outputs() == nnapi.outputs()); - for (const auto &id : interp.outputs()) - { + bool all_matched = true; + + using Comparator = std::function<bool(int id, ::tflite::Interpreter &, ::tflite::Interpreter &)>; + + std::map<TfLiteType, Comparator> comparators; + + comparators[kTfLiteUInt8] = [this](int id, ::tflite::Interpreter &interp, + ::tflite::Interpreter &nnapi) { + const auto expected = nnfw::support::tflite::TensorView<uint8_t>::make(interp, id); + const auto obtained = nnfw::support::tflite::TensorView<uint8_t>::make(nnapi, id); + + return compareSingleTensorView(expected, obtained, id); + }; + + comparators[kTfLiteInt32] = [this](int id, ::tflite::Interpreter &interp, + ::tflite::Interpreter &nnapi) { + const auto expected = nnfw::support::tflite::TensorView<int32_t>::make(interp, id); + const auto obtained = nnfw::support::tflite::TensorView<int32_t>::make(nnapi, id); + + return 
compareSingleTensorView(expected, obtained, id); + }; + + comparators[kTfLiteFloat32] = [this](int id, ::tflite::Interpreter &interp, + ::tflite::Interpreter &nnapi) { const auto expected = nnfw::support::tflite::TensorView<float>::make(interp, id); const auto obtained = nnfw::support::tflite::TensorView<float>::make(nnapi, id); - DiffSummary summary; + return compareSingleTensorView(expected, obtained, id); + }; - auto diffs = _comparator.compare(expected, obtained, &summary); + for (const auto &id : interp.outputs()) + { + assert(interp.tensor(id)->type == nnapi.tensor(id)->type); - if (diffs.size() == 0) - { - std::cout << " Tensor #" << id << ": MATCHED" << std::endl; - } - else - { - std::cout << " Tensor #" << id << ": UNMATCHED" << std::endl; - std::cout << " " << diffs.size() << " diffs are detected" << std::endl; - } + auto it = comparators.find(interp.tensor(id)->type); - // Print out max_diff - if (summary.max_abs_diff_value > 0) + if (it == comparators.end()) { - std::cout << " Max absolute diff at [" << nnfw::util::tensor::IndexFormatter(summary.max_abs_diff_index) << "]" << std::endl; - std::cout << " expected: " << summary.max_abs_diff_expected << std::endl; - std::cout << " obtained: " << summary.max_abs_diff_obtained << std::endl; - std::cout << " absolute diff: " << summary.max_abs_diff_value << std::endl; + throw std::runtime_error{"Not supported output type"}; } - if (summary.max_rel_diff_value > 0) - { - const auto tolerance_level = summary.max_rel_diff_value / FLT_EPSILON; + const auto &comparator = it->second; - std::cout << " Max relative diff at [" << nnfw::util::tensor::IndexFormatter(summary.max_rel_diff_index) << "]" << std::endl; - std::cout << " expected: " << summary.max_rel_diff_expected << std::endl; - std::cout << " obtained: " << summary.max_rel_diff_obtained << std::endl; - std::cout << " relative diff: " << summary.max_rel_diff_value << std::endl; - std::cout << " (tolerance level = " << tolerance_level << ")" << std::endl; - } 
- - if (diffs.size() > 0) + if (!comparator(id, interp, nnapi)) { - if (_verbose != 0) - { - std::cout << " ---- Details ---" << std::endl; - for (const auto &diff : diffs) - { - const auto absolute_diff = std::fabs(diff.expected - diff.obtained); - const auto relative_diff = nnfw::util::fp32::relative_diff(diff.expected, diff.obtained); - const auto tolerance_level = relative_diff / FLT_EPSILON; - - std::cout << " Diff at [" << nnfw::util::tensor::IndexFormatter(diff.index) << "]" << std::endl; - std::cout << " expected: " << diff.expected << std::endl; - std::cout << " obtained: " << diff.obtained << std::endl; - std::cout << " absolute diff: " << absolute_diff << std::endl; - std::cout << " relative diff: " << relative_diff << std::endl; - std::cout << " (tolerance level = " << tolerance_level << ")" << std::endl; - } - } - - return false; + all_matched = false; } } - return true; + return all_matched; } #include "util/tensor/Object.h" +using namespace std::placeholders; + +template <> uint8_t RandomGenerator::generate<uint8_t>(void) +{ + // The value of type_range is 255. + float type_range = static_cast<float>(std::numeric_limits<uint8_t>::max()) - + static_cast<float>(std::numeric_limits<uint8_t>::min()); + // Most _dist values range from -5.0 to 5.0. 
+ float min_range = -5.0f; + float max_range = 5.0f; + return static_cast<uint8_t>((_dist(_rand) - min_range) * type_range / (max_range - min_range)); +} + +#include "support/tflite/TensorLogger.h" // // Random Test Runner // int RandomTestRunner::run(const nnfw::support::tflite::interp::Builder &builder) { - auto pure = builder.build(); + auto tfl_interp = builder.build(); auto nnapi = builder.build(); - pure->UseNNAPI(false); - nnapi->UseNNAPI(true); + tfl_interp->UseNNAPI(false); // Allocate Tensors - pure->AllocateTensors(); + tfl_interp->AllocateTensors(); nnapi->AllocateTensors(); - assert(pure->inputs() == nnapi->inputs()); + assert(tfl_interp->inputs() == nnapi->inputs()); - // Fill IFM with random numbers - auto ifm_gen = [this] (const nnfw::util::tensor::Shape &, const nnfw::util::tensor::Index &) - { - // TODO Allow users to set min/max and distribution - std::normal_distribution<float> dist(0.0f, 2.0f); - return dist(_rand); + using ::tflite::Interpreter; + using Initializer = std::function<void(int id, Interpreter *, Interpreter *)>; + + std::map<TfLiteType, Initializer> initializers; + std::map<TfLiteType, Initializer> reseters; + + // Generate singed 32-bit integer (s32) input + initializers[kTfLiteInt32] = [&](int id, Interpreter *tfl_interp, Interpreter *nnapi) { + assert(tfl_interp->tensor(id)->type == kTfLiteInt32); + assert(nnapi->tensor(id)->type == kTfLiteInt32); + + auto tfl_interp_view = nnfw::support::tflite::TensorView<int32_t>::make(*tfl_interp, id); + auto nnapi_view = nnfw::support::tflite::TensorView<int32_t>::make(*nnapi, id); + + assert(tfl_interp_view.shape() == nnapi_view.shape()); + + int32_t value = 0; + + nnfw::util::tensor::iterate(tfl_interp_view.shape()) + << [&](const nnfw::util::tensor::Index &ind) { + // TODO Generate random values + tfl_interp_view.at(ind) = value; + nnapi_view.at(ind) = value; + ++value; + }; }; - for (const auto id : pure->inputs()) - { - auto pure_view = 
nnfw::support::tflite::TensorView<float>::make(*pure, id); + // Generate singed 32-bit integer (s32) input + reseters[kTfLiteInt32] = [&](int id, Interpreter *tfl_interp, Interpreter *nnapi) { + assert(tfl_interp->tensor(id)->type == kTfLiteInt32); + assert(nnapi->tensor(id)->type == kTfLiteInt32); + + auto tfl_interp_view = nnfw::support::tflite::TensorView<int32_t>::make(*tfl_interp, id); + auto nnapi_view = nnfw::support::tflite::TensorView<int32_t>::make(*nnapi, id); + + assert(tfl_interp_view.shape() == nnapi_view.shape()); + + int32_t value = 0; + + nnfw::util::tensor::iterate(tfl_interp_view.shape()) + << [&](const nnfw::util::tensor::Index &ind) { + // TODO Generate random values + tfl_interp_view.at(ind) = value; + nnapi_view.at(ind) = value; + }; + }; + + initializers[kTfLiteUInt8] = [&](int id, Interpreter *tfl_interp, Interpreter *nnapi) { + assert(tfl_interp->tensor(id)->type == kTfLiteUInt8); + assert(nnapi->tensor(id)->type == kTfLiteUInt8); + + auto tfl_interp_view = nnfw::support::tflite::TensorView<uint8_t>::make(*tfl_interp, id); + auto nnapi_view = nnfw::support::tflite::TensorView<uint8_t>::make(*nnapi, id); + + assert(tfl_interp_view.shape() == nnapi_view.shape()); + + auto fp = static_cast<uint8_t (RandomGenerator::*)(const ::nnfw::util::tensor::Shape &, + const ::nnfw::util::tensor::Index &)>( + &RandomGenerator::generate<uint8_t>); + const nnfw::util::tensor::Object<uint8_t> data(tfl_interp_view.shape(), + std::bind(fp, _randgen, _1, _2)); + assert(tfl_interp_view.shape() == data.shape()); + + nnfw::util::tensor::iterate(tfl_interp_view.shape()) + << [&](const nnfw::util::tensor::Index &ind) { + const auto value = data.at(ind); + + tfl_interp_view.at(ind) = value; + nnapi_view.at(ind) = value; + }; + }; + + reseters[kTfLiteUInt8] = [&](int id, Interpreter *tfl_interp, Interpreter *nnapi) { + assert(tfl_interp->tensor(id)->type == kTfLiteUInt8); + assert(nnapi->tensor(id)->type == kTfLiteUInt8); + + auto tfl_interp_view = 
nnfw::support::tflite::TensorView<uint8_t>::make(*tfl_interp, id); + auto nnapi_view = nnfw::support::tflite::TensorView<uint8_t>::make(*nnapi, id); + + assert(tfl_interp_view.shape() == nnapi_view.shape()); + + auto fp = static_cast<uint8_t (RandomGenerator::*)(const ::nnfw::util::tensor::Shape &, + const ::nnfw::util::tensor::Index &)>( + &RandomGenerator::generate<uint8_t>); + const nnfw::util::tensor::Object<uint8_t> data(tfl_interp_view.shape(), + std::bind(fp, _randgen, _1, _2)); + assert(tfl_interp_view.shape() == data.shape()); + + uint8_t value = 0; + + nnfw::util::tensor::iterate(tfl_interp_view.shape()) + << [&](const nnfw::util::tensor::Index &ind) { + tfl_interp_view.at(ind) = value; + nnapi_view.at(ind) = value; + }; + }; + + initializers[kTfLiteFloat32] = [&](int id, Interpreter *tfl_interp, Interpreter *nnapi) { + assert(tfl_interp->tensor(id)->type == kTfLiteFloat32); + assert(nnapi->tensor(id)->type == kTfLiteFloat32); + + auto tfl_interp_view = nnfw::support::tflite::TensorView<float>::make(*tfl_interp, id); auto nnapi_view = nnfw::support::tflite::TensorView<float>::make(*nnapi, id); - assert(pure_view.shape() == nnapi_view.shape()); + assert(tfl_interp_view.shape() == nnapi_view.shape()); - const nnfw::util::tensor::Object<float> data(pure_view.shape(), ifm_gen); + auto fp = static_cast<float (RandomGenerator::*)(const ::nnfw::util::tensor::Shape &, + const ::nnfw::util::tensor::Index &)>( + &RandomGenerator::generate<float>); + const nnfw::util::tensor::Object<float> data(tfl_interp_view.shape(), + std::bind(fp, _randgen, _1, _2)); + + assert(tfl_interp_view.shape() == data.shape()); + + nnfw::util::tensor::iterate(tfl_interp_view.shape()) + << [&](const nnfw::util::tensor::Index &ind) { + const auto value = data.at(ind); + + tfl_interp_view.at(ind) = value; + nnapi_view.at(ind) = value; + }; + }; - assert(pure_view.shape() == data.shape()); + reseters[kTfLiteFloat32] = [&](int id, Interpreter *tfl_interp, Interpreter *nnapi) { + 
assert(tfl_interp->tensor(id)->type == kTfLiteFloat32); + assert(nnapi->tensor(id)->type == kTfLiteFloat32); + + auto tfl_interp_view = nnfw::support::tflite::TensorView<float>::make(*tfl_interp, id); + auto nnapi_view = nnfw::support::tflite::TensorView<float>::make(*nnapi, id); + + assert(tfl_interp_view.shape() == nnapi_view.shape()); + + auto fp = static_cast<float (RandomGenerator::*)(const ::nnfw::util::tensor::Shape &, + const ::nnfw::util::tensor::Index &)>( + &RandomGenerator::generate<float>); + const nnfw::util::tensor::Object<float> data(tfl_interp_view.shape(), + std::bind(fp, _randgen, _1, _2)); + + assert(tfl_interp_view.shape() == data.shape()); + + float value = 0; + + nnfw::util::tensor::iterate(tfl_interp_view.shape()) + << [&](const nnfw::util::tensor::Index &ind) { + tfl_interp_view.at(ind) = value; + nnapi_view.at(ind) = value; + }; + }; - nnfw::util::tensor::iterate(pure_view.shape()) << [&] (const nnfw::util::tensor::Index &ind) + // Fill IFM with random numbers + for (const auto id : tfl_interp->inputs()) + { + assert(tfl_interp->tensor(id)->type == nnapi->tensor(id)->type); + + auto it = initializers.find(tfl_interp->tensor(id)->type); + + if (it == initializers.end()) { - const auto value = data.at(ind); + throw std::runtime_error{"Not supported input type"}; + } - pure_view.at(ind) = value; - nnapi_view.at(ind) = value; - }; + it->second(id, tfl_interp.get(), nnapi.get()); + } + + // Fill OFM with 0 + for (const auto id : tfl_interp->outputs()) + { + assert(tfl_interp->tensor(id)->type == nnapi->tensor(id)->type); + + auto it = reseters.find(tfl_interp->tensor(id)->type); + + if (it == reseters.end()) + { + throw std::runtime_error{"Not supported input type"}; + } + + it->second(id, tfl_interp.get(), nnapi.get()); } std::cout << "[NNAPI TEST] Run T/F Lite Interpreter without NNAPI" << std::endl; - pure->Invoke(); + tfl_interp->Invoke(); std::cout << "[NNAPI TEST] Run T/F Lite Interpreter with NNAPI" << std::endl; - nnapi->Invoke(); + + 
char *env = getenv("UPSTREAM_DELEGATE"); + + if (env && !std::string(env).compare("1")) + { + nnapi->UseNNAPI(true); + nnapi->Invoke(); + } + else + { + nnfw::NNAPIDelegate d; + + if (d.BuildGraph(nnapi.get())) + { + throw std::runtime_error{"Failed to BuildGraph"}; + } + + if (d.Invoke(nnapi.get())) + { + throw std::runtime_error{"Failed to BuildGraph"}; + } + } // Compare OFM std::cout << "[NNAPI TEST] Compare the result" << std::endl; const auto tolerance = _param.tolerance; - auto equals = [tolerance] (float lhs, float rhs) - { + auto equals = [tolerance](float lhs, float rhs) { // NOTE Hybrid approach // TODO Allow users to set tolerance for absolute_epsilon_equal if (nnfw::util::fp32::absolute_epsilon_equal(lhs, rhs)) @@ -245,12 +504,12 @@ int RandomTestRunner::run(const nnfw::support::tflite::interp::Builder &builder) return nnfw::util::fp32::epsilon_equal(lhs, rhs, tolerance); }; - TfLiteTensorComparator comparator(equals); + nnfw::util::tensor::Comparator comparator(equals); TfLiteInterpMatchApp app(comparator); app.verbose() = _param.verbose; - bool res = app.run(*pure, *nnapi); + bool res = app.run(*tfl_interp, *nnapi); if (!res) { @@ -258,5 +517,22 @@ int RandomTestRunner::run(const nnfw::support::tflite::interp::Builder &builder) } std::cout << "[NNAPI TEST] PASSED" << std::endl; + + if (_param.tensor_logging) + nnfw::support::tflite::TensorLogger::instance().save(_param.log_path, *tfl_interp); + return 0; } + +RandomTestRunner RandomTestRunner::make(int seed) +{ + RandomTestParam param; + + param.verbose = 0; + param.tolerance = 1; + + nnfw::util::env::IntAccessor("VERBOSE").access(param.verbose); + nnfw::util::env::IntAccessor("TOLERANCE").access(param.tolerance); + + return RandomTestRunner{seed, param}; +} diff --git a/libs/support/tflite/src/FeatureView.cpp b/libs/support/tflite/src/FeatureView.cpp index 50f599d2e..4c7636780 100644 --- a/libs/support/tflite/src/FeatureView.cpp +++ b/libs/support/tflite/src/FeatureView.cpp @@ -28,11 +28,8 @@ 
namespace tflite nnfw::util::feature::Shape getFeatureShape(const TfLiteTensor *tensor) { - nnfw::util::feature::Shape shape; - - shape.C = tensor->dims->data[3]; - shape.H = tensor->dims->data[1]; - shape.W = tensor->dims->data[2]; + nnfw::util::feature::Shape shape{tensor->dims->data[3], tensor->dims->data[1], + tensor->dims->data[2]}; return shape; } diff --git a/libs/kernel/acl/src/neon/DepthwiseConv2D.test.cpp b/libs/support/tflite/src/Quantization.cpp index d729d538e..b23204d41 100644 --- a/libs/kernel/acl/src/neon/DepthwiseConv2D.test.cpp +++ b/libs/support/tflite/src/Quantization.cpp @@ -14,7 +14,9 @@ * limitations under the License. */ -#define ACL_CORE_FUNC_NAME neon::depthwiseConvFloat32 -#define ACL_TEST(tc, t) TEST(tc, neon_##t) +#include "support/tflite/Quantization.h" -#include "../DepthwiseConv2D.test.h" +TfLiteQuantizationParams make_default_quantization(void) +{ + return TfLiteQuantizationParams{0.0f, 0}; +} diff --git a/libs/support/tflite/src/TensorShapeUtils.cpp b/libs/support/tflite/src/TensorShapeUtils.cpp new file mode 100644 index 000000000..611ba920e --- /dev/null +++ b/libs/support/tflite/src/TensorShapeUtils.cpp @@ -0,0 +1,51 @@ +#include "support/tflite/TensorShapeUtils.h" + +namespace nnfw +{ +namespace support +{ +namespace tflite +{ + +nnfw::util::tensor::Shape broadcast(const nnfw::util::tensor::Shape &lhs_shape, + const nnfw::util::tensor::Shape &rhs_shape) +{ + const uint32_t lhs_rank = lhs_shape.rank(); + const uint32_t rhs_rank = rhs_shape.rank(); + const uint32_t out_rank = std::max(lhs_rank, rhs_rank); + + // TODO Simplify implementation + std::vector<int32_t> lhs_normalized_dims; + std::vector<int32_t> rhs_normalized_dims; + + for (uint32_t n = 0; n < out_rank - lhs_rank; ++n) + { + lhs_normalized_dims.emplace_back(1); + } + for (uint32_t axis = 0; axis < lhs_rank; ++axis) + { + lhs_normalized_dims.emplace_back(lhs_shape.dim(axis)); + } + + for (uint32_t n = 0; n < out_rank - rhs_rank; ++n) + { + 
rhs_normalized_dims.emplace_back(1); + } + for (uint32_t axis = 0; axis < rhs_rank; ++axis) + { + rhs_normalized_dims.emplace_back(rhs_shape.dim(axis)); + } + + nnfw::util::tensor::Shape out_shape(out_rank); + + for (uint32_t axis = 0; axis < out_rank; ++axis) + { + out_shape.dim(axis) = std::max(lhs_normalized_dims.at(axis), rhs_normalized_dims.at(axis)); + } + + return out_shape; +} + +} // namespace tflite +} // namespace support +} // namespace nnfw diff --git a/libs/support/tflite/src/TensorView.cpp b/libs/support/tflite/src/TensorView.cpp deleted file mode 100644 index 9e164acc2..000000000 --- a/libs/support/tflite/src/TensorView.cpp +++ /dev/null @@ -1,69 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "support/tflite/TensorView.h" - -#include <cassert> - -namespace nnfw -{ -namespace support -{ -namespace tflite -{ - -TensorView<float>::TensorView(const nnfw::util::tensor::Shape &shape, float *base) : _shape{shape}, _base{base} -{ - // Set 'stride' - _stride.init(_shape); -} - -float TensorView<float>::at(const nnfw::util::tensor::Index &index) const -{ - const auto offset = _stride.offset(index); - - return *(_base + offset); -} - -float &TensorView<float>::at(const nnfw::util::tensor::Index &index) -{ - const auto offset = _stride.offset(index); - - return *(_base + offset); -} - -TensorView<float> TensorView<float>::make(::tflite::Interpreter &interp, int tensor_index) -{ - auto tensor_ptr = interp.tensor(tensor_index); - - // TODO Enable the following assets - // assert(isFloatTensor(tensor_ptr)); - // assert(isFeatureTensor(tensor_ptr)); - - // Set 'shape' - nnfw::util::tensor::Shape shape(tensor_ptr->dims->size); - - for (uint32_t axis = 0; axis < shape.rank(); ++axis) - { - shape.dim(axis) = tensor_ptr->dims->data[axis]; - } - - return TensorView<float>(shape, interp.typed_tensor<float>(tensor_index)); -} - -} // namespace tflite -} // namespace support -} // namespace nnfw diff --git a/libs/support/tflite/src/TensorView.test.cpp b/libs/support/tflite/src/TensorView.test.cpp index 75993a6da..1d3a70500 100644 --- a/libs/support/tflite/src/TensorView.test.cpp +++ b/libs/support/tflite/src/TensorView.test.cpp @@ -18,9 +18,24 @@ #include <cassert> +void int_test(void) +{ + int value[6] = {1, 2, 3, 4, 5, 6}; + + const nnfw::util::tensor::Shape shape{2, 3}; + const nnfw::support::tflite::TensorView<int> view{shape, value}; + + assert(view.at(nnfw::util::tensor::Index{0, 0}) == 1); + assert(view.at(nnfw::util::tensor::Index{0, 1}) == 2); + assert(view.at(nnfw::util::tensor::Index{0, 2}) == 3); + assert(view.at(nnfw::util::tensor::Index{1, 0}) == 4); + assert(view.at(nnfw::util::tensor::Index{1, 1}) == 5); + 
assert(view.at(nnfw::util::tensor::Index{1, 2}) == 6); +} + int main(int argc, char **argv) { - float value[6] = { 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f }; + float value[6] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f}; const nnfw::util::tensor::Shape shape{2, 3}; const nnfw::support::tflite::TensorView<float> view{shape, value}; @@ -32,5 +47,7 @@ int main(int argc, char **argv) assert(view.at(nnfw::util::tensor::Index{1, 1}) == 5.0f); assert(view.at(nnfw::util::tensor::Index{1, 2}) == 6.0f); + int_test(); + return 0; } diff --git a/libs/support/tflite/src/interp/FlatBufferBuilder.cpp b/libs/support/tflite/src/interp/FlatBufferBuilder.cpp index f46c74652..67df13f34 100644 --- a/libs/support/tflite/src/interp/FlatBufferBuilder.cpp +++ b/libs/support/tflite/src/interp/FlatBufferBuilder.cpp @@ -16,7 +16,7 @@ #include "support/tflite/interp/FlatBufferBuilder.h" -#include <tensorflow/contrib/lite/kernels/register.h> +#include "support/tflite/kernels/register.h" namespace nnfw { diff --git a/libs/support/tflite/src/kernels/RSQRT.cpp b/libs/support/tflite/src/kernels/RSQRT.cpp new file mode 100644 index 000000000..13efe0ed9 --- /dev/null +++ b/libs/support/tflite/src/kernels/RSQRT.cpp @@ -0,0 +1,83 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "support/tflite/kernels/RSQRT.h" +#include "tensorflow/contrib/lite/kernels/kernel_util.h" + +#include <cmath> +#include <iostream> + +namespace tflite +{ +namespace ops +{ +namespace custom +{ +namespace nnfw +{ +namespace RSQRT +{ + +void *InitRSQRT(TfLiteContext *context, const char *buffer, size_t length) { return nullptr; } + +void FreeRSQRT(TfLiteContext *context, void *buffer) {} + +TfLiteStatus PrepareRSQRT(TfLiteContext *context, TfLiteNode *node) +{ + TF_LITE_ENSURE_EQ(context, NumInputs(node), 1); + TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1); + const TfLiteTensor *input = GetInput(context, node, 0); + TfLiteTensor *output = GetOutput(context, node, 0); + TF_LITE_ENSURE_EQ(context, input->type, output->type); + // Quantized float is not supported yet. + TF_LITE_ENSURE_EQ(context, input->type, kTfLiteFloat32); + return context->ResizeTensor(context, output, TfLiteIntArrayCopy(input->dims)); +} + +inline TfLiteStatus Eval(TfLiteContext *context, TfLiteNode *node, float float_func(float)) +{ + const TfLiteTensor *input = GetInput(context, node, 0); + TfLiteTensor *output = GetOutput(context, node, 0); + switch (input->type) + { + case kTfLiteFloat32: + { + size_t elements = NumElements(input); + const float *in = input->data.f; + const float *in_end = in + elements; + float *out = output->data.f; + for (; in < in_end; in++, out++) + *out = float_func(*in); + return kTfLiteOk; + } + default: + { + context->ReportError(context, "Input type is %d, requires float32", input->type); + return kTfLiteError; + } + } +} + +TfLiteStatus EvalRSQRT(TfLiteContext *context, TfLiteNode *node) +{ + return Eval(context, node, [](float f) { return 1.f / std::sqrt(f); }); +} + +} // namespace RSQRT +} // namespace nnfw +} // namespace custom +} // namespace ops +} // namespace tflite diff --git a/libs/support/tflite/src/kernels/SquaredDifference.cpp b/libs/support/tflite/src/kernels/SquaredDifference.cpp new file mode 100644 index 000000000..25e10a8ed --- 
/dev/null +++ b/libs/support/tflite/src/kernels/SquaredDifference.cpp @@ -0,0 +1,115 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "support/tflite/kernels/SquaredDifference.h" +#include "tensorflow/contrib/lite/kernels/kernel_util.h" + +#include <iostream> + +namespace tflite +{ +namespace ops +{ +namespace custom +{ +namespace nnfw +{ +namespace SquaredDifference +{ + +void *InitSquaredDifference(TfLiteContext *context, const char *buffer, size_t length) +{ + return nullptr; +} + +void FreeSquaredDifference(TfLiteContext *context, void *buffer) {} + +TfLiteStatus PrepareSquaredDifference(TfLiteContext *context, TfLiteNode *node) +{ + TF_LITE_ENSURE_EQ(context, NumInputs(node), 2); + TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1); + + const TfLiteTensor *input1 = GetInput(context, node, 0); + const TfLiteTensor *input2 = GetInput(context, node, 1); + TfLiteTensor *output = GetOutput(context, node, 0); + + TF_LITE_ENSURE_EQ(context, input1->type, input2->type); + TF_LITE_ENSURE_EQ(context, input1->type, output->type); + + return context->ResizeTensor(context, output, TfLiteIntArrayCopy(input1->dims)); +} + +TfLiteStatus EvalSquaredDifference(TfLiteContext *context, TfLiteNode *node) +{ + + const TfLiteTensor *input1 = GetInput(context, node, 0); + const TfLiteTensor *input2 = GetInput(context, node, 1); + + TfLiteTensor *output = GetOutput(context, node, 0); + 
+ size_t elements = NumElements(input1); + + switch (input1->type) + { + case kTfLiteFloat32: + { + const float *in1 = input1->data.f; + const float *in2 = input2->data.f; + const float *in_end1 = in1 + elements; + float *out = output->data.f; + + for (; in1 < in_end1; in1++, in2++, out++) + *out = ((*in1 - *in2) * (*in1 - *in2)); + + return kTfLiteOk; + } + case kTfLiteInt32: + { + const int *in1 = input1->data.i32; + const int *in2 = input2->data.i32; + const int *in_end1 = in1 + elements; + int *out = output->data.i32; + + for (; in1 < in_end1; in1++, in2++, out++) + *out = ((*in1 - *in2) * (*in1 - *in2)); + + return kTfLiteOk; + } + case kTfLiteInt64: + { + const int64_t *in1 = input1->data.i64; + const int64_t *in2 = input1->data.i64; + const int64_t *in_end1 = in1 + elements; + int64_t *out = output->data.i64; + + for (; in1 < in_end1; in1++, in2++, out++) + *out = ((*in1 - *in2) * (*in1 - *in2)); + + return kTfLiteOk; + } + default: + { + context->ReportError(context, "InputType is %d Unsupported", input1->type); + return kTfLiteError; + } + } +} + +} // namespace SquaredDifference +} // nnfw +} // namespace custom +} // namespace ops +} // namespace tflite diff --git a/libs/support/tflite/src/kernels/TensorFlowMax.cpp b/libs/support/tflite/src/kernels/TensorFlowMax.cpp new file mode 100644 index 000000000..abc6fda4e --- /dev/null +++ b/libs/support/tflite/src/kernels/TensorFlowMax.cpp @@ -0,0 +1,390 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "support/tflite/kernels/TensorFlowMax.h" +#include "tensorflow/contrib/lite/kernels/kernel_util.h" + +#include <iostream> + +namespace tflite +{ +namespace ops +{ +namespace custom +{ +namespace nnfw +{ +namespace TensorFlowMax +{ + +struct TensorFlowMaxOp +{ + TensorFlowMaxOp(TfLiteContext *context, TfLiteNode *node) + { + input = tflite::GetInput(context, node, 0); + axis = tflite::GetInput(context, node, 1); + output = tflite::GetOutput(context, node, 0); + } + const TfLiteTensor *input; + const TfLiteTensor *axis; + TfLiteTensor *output; +}; + +void *InitTensorFlowMax(TfLiteContext *context, const char *buffer, size_t length) +{ + // Creates two temp tensors to store index and axis for internal + // implementation only. + auto *scratch_tensor_index = new int; + context->AddTensors(context, 2, scratch_tensor_index); + return scratch_tensor_index; +} + +void FreeTensorFlowMax(TfLiteContext *context, void *buffer) +{ + delete static_cast<TensorFlowMaxOp *>(buffer); +} + +// Resizes the temp tensor that stores resolved axis. +TfLiteStatus ResizeTempAxis(TfLiteContext *context, TensorFlowMaxOp *op_context, + TfLiteTensor *resolved_axis) +{ + TfLiteIntArray *axis_size = TfLiteIntArrayCreate(1); + axis_size->data[0] = static_cast<int>(tflite::NumElements(op_context->axis)); + return context->ResizeTensor(context, resolved_axis, axis_size); +} + +// Resizes output array based on the input size and resolved axis. +TfLiteStatus ResizeOutputTensor(TfLiteContext *context, TensorFlowMaxOp *op_context) +{ + size_t num_axis = tflite::NumElements(op_context->axis); + const TfLiteIntArray *input_dims = op_context->input->dims; + int input_num_dims = tflite::NumDimensions(op_context->input); + const int *axis = op_context->axis->data.i32; + + { + // Calculates size of reducing axis. 
+ int num_reduce_axis = num_axis; + for (int i = 0; i < num_axis; ++i) + { + int current = axis[i]; + if (current < 0) + { + current += input_num_dims; + } + TF_LITE_ENSURE(context, current >= 0 && current < input_num_dims); + for (int j = 0; j < i; ++j) + { + int previous = axis[j]; + if (previous < 0) + { + previous += input_num_dims; + } + if (current == previous) + { + --num_reduce_axis; + break; + } + } + } + // Determines output dimensions. + TfLiteIntArray *output_dims = TfLiteIntArrayCreate(input_num_dims - num_reduce_axis); + int num_skip_axis = 0; + for (int idx = 0; idx < input_num_dims; ++idx) + { + bool is_axis = false; + for (int axis_idx = 0; axis_idx < num_axis; ++axis_idx) + { + if (axis[axis_idx] == idx || axis[axis_idx] + input_num_dims == idx) + { + ++num_skip_axis; + is_axis = true; + break; + } + } + if (!is_axis) + { + output_dims->data[idx - num_skip_axis] = input_dims->data[idx]; + } + } + return context->ResizeTensor(context, op_context->output, output_dims); + } +} + +// Initializes temp tensors to store index and resolved axis. +TfLiteStatus InitializeTemporaries(TfLiteContext *context, TfLiteNode *node, + TensorFlowMaxOp *op_context) +{ + // Creates a temp index to iterate through input data. + int *scratch_tensor_index = reinterpret_cast<int *>(node->user_data); + TfLiteIntArrayFree(node->temporaries); + node->temporaries = TfLiteIntArrayCreate(2); + node->temporaries->data[0] = *scratch_tensor_index; + TfLiteTensor *scratch_tensor = &context->tensors[node->temporaries->data[0]]; + scratch_tensor->type = kTfLiteInt32; + scratch_tensor->allocation_type = kTfLiteArenaRw; + TfLiteIntArray *index_size = TfLiteIntArrayCreate(1); + index_size->data[0] = tflite::NumDimensions(op_context->input); + TF_LITE_ENSURE_OK(context, context->ResizeTensor(context, scratch_tensor, index_size)); + + // Creates a temp tensor to store resolved axis given input data. 
+ node->temporaries->data[1] = *scratch_tensor_index + 1; + TfLiteTensor *resolved_axis = &context->tensors[node->temporaries->data[1]]; + resolved_axis->type = kTfLiteInt32; + return kTfLiteOk; +} + +TfLiteStatus PrepareTensorFlowMax(TfLiteContext *context, TfLiteNode *node) +{ + TF_LITE_ENSURE_EQ(context, tflite::NumInputs(node), 2); + TF_LITE_ENSURE_EQ(context, tflite::NumOutputs(node), 1); + + TensorFlowMaxOp op_context(context, node); + TF_LITE_ENSURE_OK(context, InitializeTemporaries(context, node, &op_context)); + + TfLiteTensor *resolved_axis = &context->tensors[node->temporaries->data[1]]; + // Leaves work to Eval if axis is not constant; else resizes output. + if (!tflite::IsConstantTensor(op_context.axis)) + { + tflite::SetTensorToDynamic(op_context.output); + tflite::SetTensorToDynamic(resolved_axis); + return kTfLiteOk; + } + resolved_axis->allocation_type = kTfLiteArenaRw; + TF_LITE_ENSURE_OK(context, ResizeTempAxis(context, &op_context, resolved_axis)); + return ResizeOutputTensor(context, &op_context); +} + +// Gets offset of index if expanded on axis. When expanded, the flattened offset +// will not change, if the output index changes on the given axis. For example, +// if you have a 2D tensor and you are expanding to 3D on axis 0, +// then index (0, 1, 2) and index (1, 1, 2) will map from the same flattened +// offset. 
+inline size_t ExpandedInputOffset(const int num_dims, const int *dims, const int *index, + const int num_axis, const int *axis) +{ + size_t offset = 0; + int out_idx = 0; + for (int in_idx = 0; in_idx < num_dims; ++in_idx) + { + // if we need to expand this axis + bool is_axis = false; + if (axis != nullptr) + { + for (int axis_idx = 0; axis_idx < num_axis; ++axis_idx) + { + if (in_idx == axis[axis_idx]) + { + is_axis = true; + break; + } + } + } + if (!is_axis) + { + offset = offset * static_cast<size_t>(dims[in_idx]) + static_cast<size_t>(index[out_idx]); + out_idx++; + } + else + { + offset = offset * static_cast<size_t>(dims[in_idx]); + } + } + return offset; +} + +// Gets offset of index if reducing on axis. When reducing, the flattened offset +// will not change, if the input index changes on the given axis. For example, +// if you have a 3D tensor and you are reducing to 2D by eliminating axis 0, +// then index (0, 1, 2) and index (1, 1, 2) will map to the same flattened +// offset. +// TODO(kanlig): uses Dims to represent dimensions. +inline size_t ReducedOutputOffset(const int num_dims, const int *dims, const int *index, + const int num_axis, const int *axis) +{ + size_t offset = 0; + for (int idx = 0; idx < num_dims; ++idx) + { + // if we need to skip this axis + bool is_axis = false; + if (axis != nullptr) + { + for (int axis_idx = 0; axis_idx < num_axis; ++axis_idx) + { + if (idx == axis[axis_idx]) + { + is_axis = true; + break; + } + } + } + if (!is_axis) + { + offset = offset * static_cast<size_t>(dims[idx]) + static_cast<size_t>(index[idx]); + } + } + return offset; +} + +// Gets next index to iterate through a multidimensional array. 
+inline bool NextIndex(TfLiteContext *context, const int num_dims, const int *dims, int *current) +{ + int carry = 1; + for (int idx = num_dims - 1; idx >= 0; --idx) + { + int current_val = current[idx] + carry; + TF_LITE_ENSURE(context, (dims[idx] >= current_val)); + if (dims[idx] == current_val) + { + current[idx] = 0; + } + else + { + current[idx] = current_val; + carry = 0; + break; + } + } + return (carry == 0); +} + +template <typename T> +inline TfLiteStatus +CustomMax(TfLiteContext *context, T *input_data, const int *input_dims, const int input_num_dims, + T *output_data, const int *output_dims, const int output_num_dims, const int *axis, + const int num_axis_dimensions, bool keep_dims, int *temp_index, int *resolved_axis) +{ + // resolves axis. + int num_resolved_axis = 0; + for (int idx = 0; idx < num_axis_dimensions; ++idx) + { + int current = axis[idx]; + TF_LITE_ENSURE(context, (current < input_num_dims && current + input_num_dims >= 0)); + if (current < 0) + { + current += input_num_dims; + } + bool is_dup = false; + for (int j = 0; j < num_resolved_axis; ++j) + { + if (resolved_axis[j] == current) + { + is_dup = true; + break; + } + } + if (!is_dup) + { + resolved_axis[num_resolved_axis++] = current; + } + } + + TF_LITE_ENSURE(context, (input_num_dims > 0)); + TF_LITE_ENSURE(context, (input_dims != nullptr)); + TF_LITE_ENSURE(context, (temp_index != nullptr)); + + // resets output data. + for (int idx = 0; idx < output_num_dims; ++idx) + { + temp_index[idx] = 0; + } + for (bool has_next = true; has_next; + has_next = NextIndex(context, output_num_dims, output_dims, temp_index)) + { + size_t output_offset = + ReducedOutputOffset(output_num_dims, output_dims, temp_index, 0, nullptr); + size_t input_offset = ExpandedInputOffset(input_num_dims, input_dims, temp_index, + num_resolved_axis, resolved_axis); + output_data[output_offset] = input_data[input_offset]; + } + + // resets temp index. 
+ for (int idx = 0; idx < input_num_dims; ++idx) + { + temp_index[idx] = 0; + } + + // iterates through input_data. + for (bool has_next = true; has_next; + has_next = NextIndex(context, input_num_dims, input_dims, temp_index)) + { + size_t input_offset = ReducedOutputOffset(input_num_dims, input_dims, temp_index, 0, nullptr); + size_t output_offset = ReducedOutputOffset(input_num_dims, input_dims, temp_index, + num_resolved_axis, resolved_axis); + if (output_data[output_offset] < input_data[input_offset]) + { + output_data[output_offset] = input_data[input_offset]; + } + } + + return kTfLiteOk; +} + +TfLiteStatus EvalTensorFlowMax(TfLiteContext *context, TfLiteNode *node) +{ + + TensorFlowMaxOp op_context(context, node); + int num_axis = static_cast<int>(tflite::NumElements(op_context.axis)); + TfLiteTensor *temp_index = &context->tensors[node->temporaries->data[0]]; + TfLiteTensor *resolved_axis = &context->tensors[node->temporaries->data[1]]; + // Resize the output tensor if the output tensor is dynamic. 
+ if (tflite::IsDynamicTensor(op_context.output)) + { + TF_LITE_ENSURE_OK(context, ResizeTempAxis(context, &op_context, resolved_axis)); + TF_LITE_ENSURE_OK(context, ResizeOutputTensor(context, &op_context)); + } + + TfLiteStatus returnStatus = kTfLiteOk; + switch (op_context.input->type) + { + case kTfLiteFloat32: + returnStatus = CustomMax<float>( + context, op_context.input->data.f, op_context.input->dims->data, + op_context.input->dims->size, op_context.output->data.f, op_context.output->dims->data, + op_context.output->dims->size, op_context.axis->data.i32, num_axis, false, + temp_index->data.i32, resolved_axis->data.i32); + break; + case kTfLiteInt32: + returnStatus = CustomMax<int>(context, op_context.input->data.i32, + op_context.input->dims->data, op_context.input->dims->size, + op_context.output->data.i32, op_context.output->dims->data, + op_context.output->dims->size, op_context.axis->data.i32, + num_axis, false, temp_index->data.i32, resolved_axis->data.i32); + break; + case kTfLiteUInt8: + returnStatus = CustomMax<uint8_t>( + context, op_context.input->data.uint8, op_context.input->dims->data, + op_context.input->dims->size, op_context.output->data.uint8, + op_context.output->dims->data, op_context.output->dims->size, op_context.axis->data.i32, + num_axis, false, temp_index->data.i32, resolved_axis->data.i32); + break; + case kTfLiteInt64: + returnStatus = CustomMax<int64_t>( + context, op_context.input->data.i64, op_context.input->dims->data, + op_context.input->dims->size, op_context.output->data.i64, op_context.output->dims->data, + op_context.output->dims->size, op_context.axis->data.i32, num_axis, false, + temp_index->data.i32, resolved_axis->data.i32); + break; + default: + returnStatus = kTfLiteError; + } + + return returnStatus; +} +} // namespace TensorFlowMax +} // namespace nnfw +} // namespace custom +} // namespace ops +} // namespace tflite diff --git a/libs/support/tflite/src/kernels/register.cpp 
b/libs/support/tflite/src/kernels/register.cpp new file mode 100644 index 000000000..6700b4de4 --- /dev/null +++ b/libs/support/tflite/src/kernels/register.cpp @@ -0,0 +1,169 @@ +/* Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// NOTE This code is derived from the following file (in TensorFlow) +// 'externals/tensorflow/tensorflow/contrib/lite/kernels/register.cc' +#include "support/tflite/kernels/register.h" +#include "support/tflite/kernels/CustomOps.h" + +// TODO Use namespace nnfw +namespace tflite +{ +namespace ops +{ +namespace builtin +{ + +TfLiteRegistration *Register_RELU(); +TfLiteRegistration *Register_RELU_N1_TO_1(); +TfLiteRegistration *Register_RELU6(); +TfLiteRegistration *Register_TANH(); +TfLiteRegistration *Register_LOGISTIC(); +TfLiteRegistration *Register_AVERAGE_POOL_2D(); +TfLiteRegistration *Register_MAX_POOL_2D(); +TfLiteRegistration *Register_L2_POOL_2D(); +TfLiteRegistration *Register_CONV_2D(); +TfLiteRegistration *Register_DEPTHWISE_CONV_2D(); +TfLiteRegistration *Register_SVDF(); +TfLiteRegistration *Register_RNN(); +TfLiteRegistration *Register_BIDIRECTIONAL_SEQUENCE_RNN(); +TfLiteRegistration *Register_UNIDIRECTIONAL_SEQUENCE_RNN(); +TfLiteRegistration *Register_EMBEDDING_LOOKUP(); +TfLiteRegistration *Register_EMBEDDING_LOOKUP_SPARSE(); 
+TfLiteRegistration *Register_FULLY_CONNECTED(); +TfLiteRegistration *Register_LSH_PROJECTION(); +TfLiteRegistration *Register_HASHTABLE_LOOKUP(); +TfLiteRegistration *Register_SOFTMAX(); +TfLiteRegistration *Register_CONCATENATION(); +TfLiteRegistration *Register_ADD(); +TfLiteRegistration *Register_SPACE_TO_BATCH_ND(); +TfLiteRegistration *Register_DIV(); +TfLiteRegistration *Register_SUB(); +TfLiteRegistration *Register_BATCH_TO_SPACE_ND(); +TfLiteRegistration *Register_MUL(); +TfLiteRegistration *Register_L2_NORMALIZATION(); +TfLiteRegistration *Register_LOCAL_RESPONSE_NORMALIZATION(); +TfLiteRegistration *Register_LSTM(); +TfLiteRegistration *Register_BIDIRECTIONAL_SEQUENCE_LSTM(); +TfLiteRegistration *Register_UNIDIRECTIONAL_SEQUENCE_LSTM(); +TfLiteRegistration *Register_PAD(); +TfLiteRegistration *Register_PADV2(); +TfLiteRegistration *Register_RESHAPE(); +TfLiteRegistration *Register_RESIZE_BILINEAR(); +TfLiteRegistration *Register_SKIP_GRAM(); +TfLiteRegistration *Register_SPACE_TO_DEPTH(); +TfLiteRegistration *Register_GATHER(); +TfLiteRegistration *Register_TRANSPOSE(); +TfLiteRegistration *Register_MEAN(); +TfLiteRegistration *Register_SPLIT(); +TfLiteRegistration *Register_SQUEEZE(); +TfLiteRegistration *Register_STRIDED_SLICE(); +TfLiteRegistration *Register_EXP(); +TfLiteRegistration *Register_TOPK_V2(); +TfLiteRegistration *Register_LOG_SOFTMAX(); +TfLiteRegistration *Register_CAST(); +TfLiteRegistration *Register_DEQUANTIZE(); +TfLiteRegistration *Register_PRELU(); +TfLiteRegistration *Register_MAXIMUM(); +TfLiteRegistration *Register_MINIMUM(); +TfLiteRegistration *Register_ARG_MAX(); +TfLiteRegistration *Register_GREATER(); +TfLiteRegistration *Register_GREATER_EQUAL(); +TfLiteRegistration *Register_LESS(); +TfLiteRegistration *Register_LESS_EQUAL(); +TfLiteRegistration *Register_FLOOR(); +TfLiteRegistration *Register_NEG(); +TfLiteRegistration *Register_SELECT(); +TfLiteRegistration *Register_SLICE(); +TfLiteRegistration *Register_SIN(); 
+TfLiteRegistration *Register_TRANSPOSE_CONV(); +TfLiteRegistration *Register_SPARSE_TO_DENSE(); + +BuiltinOpResolver::BuiltinOpResolver() +{ + AddBuiltin(BuiltinOperator_RELU, Register_RELU()); + AddBuiltin(BuiltinOperator_RELU_N1_TO_1, Register_RELU_N1_TO_1()); + AddBuiltin(BuiltinOperator_RELU6, Register_RELU6()); + AddBuiltin(BuiltinOperator_TANH, Register_TANH()); + AddBuiltin(BuiltinOperator_LOGISTIC, Register_LOGISTIC()); + AddBuiltin(BuiltinOperator_AVERAGE_POOL_2D, Register_AVERAGE_POOL_2D()); + AddBuiltin(BuiltinOperator_MAX_POOL_2D, Register_MAX_POOL_2D()); + AddBuiltin(BuiltinOperator_L2_POOL_2D, Register_L2_POOL_2D()); + AddBuiltin(BuiltinOperator_CONV_2D, Register_CONV_2D()); + AddBuiltin(BuiltinOperator_DEPTHWISE_CONV_2D, Register_DEPTHWISE_CONV_2D()); + AddBuiltin(BuiltinOperator_SVDF, Register_SVDF()); + AddBuiltin(BuiltinOperator_RNN, Register_RNN()); + AddBuiltin(BuiltinOperator_BIDIRECTIONAL_SEQUENCE_RNN, Register_BIDIRECTIONAL_SEQUENCE_RNN()); + AddBuiltin(BuiltinOperator_UNIDIRECTIONAL_SEQUENCE_RNN, Register_UNIDIRECTIONAL_SEQUENCE_RNN()); + AddBuiltin(BuiltinOperator_EMBEDDING_LOOKUP, Register_EMBEDDING_LOOKUP()); + AddBuiltin(BuiltinOperator_EMBEDDING_LOOKUP_SPARSE, Register_EMBEDDING_LOOKUP_SPARSE()); + AddBuiltin(BuiltinOperator_FULLY_CONNECTED, Register_FULLY_CONNECTED()); + AddBuiltin(BuiltinOperator_LSH_PROJECTION, Register_LSH_PROJECTION()); + AddBuiltin(BuiltinOperator_HASHTABLE_LOOKUP, Register_HASHTABLE_LOOKUP()); + AddBuiltin(BuiltinOperator_SOFTMAX, Register_SOFTMAX()); + AddBuiltin(BuiltinOperator_CONCATENATION, Register_CONCATENATION()); + AddBuiltin(BuiltinOperator_ADD, Register_ADD()); + AddBuiltin(BuiltinOperator_SPACE_TO_BATCH_ND, Register_SPACE_TO_BATCH_ND()); + AddBuiltin(BuiltinOperator_BATCH_TO_SPACE_ND, Register_BATCH_TO_SPACE_ND()); + AddBuiltin(BuiltinOperator_MUL, Register_MUL()); + AddBuiltin(BuiltinOperator_L2_NORMALIZATION, Register_L2_NORMALIZATION()); + AddBuiltin(BuiltinOperator_LOCAL_RESPONSE_NORMALIZATION, 
Register_LOCAL_RESPONSE_NORMALIZATION()); + AddBuiltin(BuiltinOperator_LSTM, Register_LSTM()); + AddBuiltin(BuiltinOperator_BIDIRECTIONAL_SEQUENCE_LSTM, Register_BIDIRECTIONAL_SEQUENCE_LSTM()); + AddBuiltin(BuiltinOperator_UNIDIRECTIONAL_SEQUENCE_LSTM, Register_UNIDIRECTIONAL_SEQUENCE_LSTM()); + AddBuiltin(BuiltinOperator_PAD, Register_PAD()); + AddBuiltin(BuiltinOperator_PADV2, Register_PADV2()); + AddBuiltin(BuiltinOperator_RESHAPE, Register_RESHAPE()); + AddBuiltin(BuiltinOperator_RESIZE_BILINEAR, Register_RESIZE_BILINEAR()); + AddBuiltin(BuiltinOperator_SKIP_GRAM, Register_SKIP_GRAM()); + AddBuiltin(BuiltinOperator_SPACE_TO_DEPTH, Register_SPACE_TO_DEPTH()); + AddBuiltin(BuiltinOperator_GATHER, Register_GATHER()); + AddBuiltin(BuiltinOperator_TRANSPOSE, Register_TRANSPOSE()); + AddBuiltin(BuiltinOperator_MEAN, Register_MEAN()); + AddBuiltin(BuiltinOperator_DIV, Register_DIV()); + AddBuiltin(BuiltinOperator_SUB, Register_SUB()); + AddBuiltin(BuiltinOperator_SPLIT, Register_SPLIT()); + AddBuiltin(BuiltinOperator_SQUEEZE, Register_SQUEEZE()); + AddBuiltin(BuiltinOperator_STRIDED_SLICE, Register_STRIDED_SLICE()); + AddBuiltin(BuiltinOperator_EXP, Register_EXP()); + AddBuiltin(BuiltinOperator_TOPK_V2, Register_TOPK_V2()); + AddBuiltin(BuiltinOperator_LOG_SOFTMAX, Register_LOG_SOFTMAX()); + AddBuiltin(BuiltinOperator_CAST, Register_CAST()); + AddBuiltin(BuiltinOperator_DEQUANTIZE, Register_DEQUANTIZE()); + AddBuiltin(BuiltinOperator_PRELU, Register_PRELU()); + AddBuiltin(BuiltinOperator_MAXIMUM, Register_MAXIMUM()); + AddBuiltin(BuiltinOperator_MINIMUM, Register_MINIMUM()); + AddBuiltin(BuiltinOperator_ARG_MAX, Register_ARG_MAX()); + AddBuiltin(BuiltinOperator_GREATER, Register_GREATER()); + AddBuiltin(BuiltinOperator_GREATER_EQUAL, Register_GREATER_EQUAL()); + AddBuiltin(BuiltinOperator_LESS, Register_LESS()); + AddBuiltin(BuiltinOperator_LESS_EQUAL, Register_LESS_EQUAL()); + AddBuiltin(BuiltinOperator_FLOOR, Register_FLOOR()); + AddBuiltin(BuiltinOperator_NEG, 
Register_NEG()); + AddBuiltin(BuiltinOperator_SELECT, Register_SELECT()); + AddBuiltin(BuiltinOperator_SLICE, Register_SLICE()); + AddBuiltin(BuiltinOperator_SIN, Register_SIN()); + AddBuiltin(BuiltinOperator_TRANSPOSE_CONV, Register_TRANSPOSE_CONV()); + AddBuiltin(BuiltinOperator_SPARSE_TO_DENSE, Register_SPARSE_TO_DENSE()); + + AddCustom("TensorFlowMax", tflite::ops::custom::nnfw::Register_TensorFlowMax()); + AddCustom("RSQRT", tflite::ops::custom::nnfw::Register_RSQRT()); + AddCustom("SquaredDifference", tflite::ops::custom::nnfw::Register_SquaredDifference()); +} + +} // namespace builtin +} // namespace ops +} // namespace tflite diff --git a/libs/support/tflite/src/nnapi_delegate.cpp b/libs/support/tflite/src/nnapi_delegate.cpp new file mode 100644 index 000000000..1eada4bca --- /dev/null +++ b/libs/support/tflite/src/nnapi_delegate.cpp @@ -0,0 +1,720 @@ +/* Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +// NOTE To minimize diff with upstream tensorflow, disable clang-format +// clang-format off + +// NOTE This code is derived from the following file (in TensorFlow) +// 'externals/tensorflow/tensorflow/contrib/lite/nnapi_delegate.cc' +#include "support/tflite/nnapi_delegate.h" +#include <fcntl.h> +#include <sys/mman.h> +#include <sys/stat.h> +#include <sys/types.h> +#include "tensorflow/contrib/lite/builtin_op_data.h" +#include "tensorflow/contrib/lite/error_reporter.h" +#include "tensorflow/contrib/lite/model.h" +#include "NeuralNetworksShim.h" +#include "NeuralNetworksExShim.h" + +#ifdef __ANDROID__ +#include <sys/system_properties.h> +#endif + +namespace nnfw +{ + +// TODO(aselle): FATAL leaves resources hanging. +void FATAL(const char* format, ...) { + va_list args; + va_start(args, format); + vfprintf(stderr, format, args); + va_end(args); + fflush(stderr); + exit(1); +} + +// TODO(aselle): Change the error model to use status codes. 
+#define CHECK_TFLITE_SUCCESS(x) \ + if (x != kTfLiteOk) { \ + FATAL("Aborting since tflite returned failure."); \ + } + +#define CHECK_NN(x) \ + if (x != ANEURALNETWORKS_NO_ERROR) { \ + FATAL("Aborting since tflite returned failure."); \ + } + +namespace { + +int32_t GetAndroidSdkVersion() { +#ifdef __ANDROID__ + const char* sdkProp = "ro.build.version.sdk"; + char sdkVersion[PROP_VALUE_MAX]; + int length = __system_property_get(sdkProp, sdkVersion); + if (length != 0) { + for (int i = 0; i < length; ++i) { + int digit = sdkVersion[i] - '0'; + if (digit < 0 || digit > 9) { + // Non-numeric SDK version, assume it's higher then expected; + return 0xFFFF; + } + } + return atoi(sdkVersion); + } + FATAL("No %s prop", sdkProp); +#endif // __ANDROID__ + return 0; +} + +static const int32_t kAndroidSdkVersion = GetAndroidSdkVersion(); + +} // namespace + +NNAPIAllocation::NNAPIAllocation(const char* filename, + ::tflite::ErrorReporter* error_reporter) + : MMAPAllocation(filename, error_reporter) { + if (mmapped_buffer_ != MAP_FAILED) + CHECK_NN(ANeuralNetworksMemory_createFromFd(buffer_size_bytes_, PROT_READ, + mmap_fd_, 0, &handle_)); +} + +NNAPIAllocation::~NNAPIAllocation() { + if (handle_) { + ANeuralNetworksMemory_free(handle_); + } +} + +NNAPIDelegate::~NNAPIDelegate() { + if (nn_compiled_model_) { + ANeuralNetworksCompilation_free(nn_compiled_model_); + nn_compiled_model_ = nullptr; + } + if (nn_model_) { + ANeuralNetworksModel_free(nn_model_); + nn_model_ = nullptr; + // TODO(aselle): Is this thread-safe and callable multiple times? + } + // ANeuralNetworksShutdown(); +} + +// Adds the tensors of the interpreter to the NN API model. +// Returns the number of operands added. +uint32_t addTensorOperands(tflite::Interpreter* interpreter, + ANeuralNetworksModel* nn_model, + const std::vector<uint32_t>& skip_list) { + uint32_t next_id = 0; + for (size_t i = 0; i < interpreter->tensors_size(); i++) { + // skip temporaries tensors. 
+ bool shouldSkip = false; + for (auto skip_idx : skip_list) { + if (i == skip_idx) { + shouldSkip = true; + break; + } + } + if (shouldSkip) continue; + + int32_t nn_type = 0; + // NNAPI requires 32-bit float scale to be zero, tflite doesn't care + float scale = 0.0f; + int32_t zeroPoint = 0; + TfLiteTensor* tensor = interpreter->tensor(i); + switch (tensor->type) { + case kTfLiteNoType: + // Tensors added during initialization of Ops don't have a type yet and + // should not be registered with the NNAPI. + continue; + case kTfLiteFloat32: + nn_type = ANEURALNETWORKS_TENSOR_FLOAT32; + break; + case kTfLiteUInt8: + nn_type = ANEURALNETWORKS_TENSOR_QUANT8_ASYMM; + scale = tensor->params.scale; + // FIXME The next line is a workaround because currently zero scale is + // passed down from TF + // Lite. Note that the latest NeuralNetworks.h (see + // https://android.googlesource.com/platform/frameworks/ml/+/master/nn/runtime/include/NeuralNetworks.h) + // requires scale to be greater than zero. Remove this workaround + // when the scale + // value is correctly passed. + scale = (scale == 0.0f) ? 1.0f : scale; + zeroPoint = tensor->params.zero_point; + break; + case kTfLiteInt32: + nn_type = ANEURALNETWORKS_TENSOR_INT32; + scale = tensor->params.scale; + zeroPoint = tensor->params.zero_point; + break; + default: + FATAL("Unsupported type."); + } + // TODO(aselle): Note, many of these are intermediate results. Do I need + // to ever specify these sizes. I am currently below doing setValue + // on all of them, but I shouldn't in the future. + // Answer(jeanluc): If all the operators can set the dimension correctly, + // you won't need to. 
+ ANeuralNetworksOperandType operand_type{ + nn_type, static_cast<uint32_t>(tensor->dims->size), + reinterpret_cast<uint32_t*>(tensor->dims->data), scale, zeroPoint}; + CHECK_NN(ANeuralNetworksModel_addOperand(nn_model, &operand_type)); + // TODO(aselle): Based on Michael's suggestion, limiting this to read + // only memory + if (tensor->allocation_type == kTfLiteMmapRo) { + if (const NNAPIAllocation* alloc = dynamic_cast<const NNAPIAllocation*>( + static_cast<const ::tflite::Allocation*>(tensor->allocation))) { + CHECK_NN(ANeuralNetworksModel_setOperandValueFromMemory( + nn_model, next_id, alloc->memory(), alloc->offset(tensor->data.raw), + tensor->bytes)); + } else { + CHECK_NN(ANeuralNetworksModel_setOperandValue( + nn_model, next_id, tensor->data.raw, tensor->bytes)); + } + } else if (tensor->bytes == 0) { + // These size 0 tensors are optional tensors reserved. + CHECK_NN( + ANeuralNetworksModel_setOperandValue(nn_model, next_id, nullptr, 0)); + } + + ++next_id; + } + return next_id; +} + +// Adds the operations and their parameters to the NN API model. +// 'next-id' is the operand ID of the next operand of the model. +void AddOpsAndParams(tflite::Interpreter* interpreter, + ANeuralNetworksModel* nn_model, uint32_t next_id, + std::vector<int>* model_state_inputs, + std::vector<int>* model_state_outputs) { + for (size_t i = 0; i < interpreter->nodes_size(); i++) { + const auto* node_and_registration = interpreter->node_and_registration(i); + const TfLiteNode& node = node_and_registration->first; + const TfLiteRegistration& registration = node_and_registration->second; + tflite::BuiltinOperator builtin = + static_cast<tflite::BuiltinOperator>(registration.builtin_code); + + // Add the parameters. 
+ std::vector<uint32_t> augmented_inputs( + node.inputs->data, node.inputs->data + node.inputs->size); + std::vector<uint32_t> augmented_outputs( + node.outputs->data, node.outputs->data + node.outputs->size); + + auto add_scalar_int32 = [&nn_model, &augmented_inputs, + &next_id](int value) { + ANeuralNetworksOperandType operand_type{.type = ANEURALNETWORKS_INT32}; + CHECK_NN(ANeuralNetworksModel_addOperand(nn_model, &operand_type)) + CHECK_NN(ANeuralNetworksModel_setOperandValue(nn_model, next_id, &value, + sizeof(int32_t))) + augmented_inputs.push_back(next_id++); + }; + + auto add_scalar_float32 = [&nn_model, &augmented_inputs, + &next_id](float value) { + ANeuralNetworksOperandType operand_type{.type = ANEURALNETWORKS_FLOAT32}; + CHECK_NN(ANeuralNetworksModel_addOperand(nn_model, &operand_type)) + CHECK_NN(ANeuralNetworksModel_setOperandValue(nn_model, next_id, &value, + sizeof(float))) + augmented_inputs.push_back(next_id++); + }; + + // Handle state tensors of RNN, LSTM, SVDF. + // For each state_out tensor, a corresponding state_in operand needs to be + // created for NNAPI. 
+ auto duplicate_state_tensor_float32 = + [interpreter, &nn_model, &next_id, &augmented_inputs, + &model_state_inputs, &model_state_outputs](int tensor_id) { + const TfLiteTensor* tensor = interpreter->tensor(tensor_id); + ANeuralNetworksOperandType operand_type{ + ANEURALNETWORKS_TENSOR_FLOAT32, + static_cast<uint32_t>(tensor->dims->size), + reinterpret_cast<uint32_t*>(tensor->dims->data), + tensor->params.scale, tensor->params.zero_point}; + CHECK_NN(ANeuralNetworksModel_addOperand(nn_model, &operand_type)); + augmented_inputs.push_back(next_id); + model_state_inputs->push_back(next_id); + model_state_outputs->push_back(tensor_id); + next_id++; + }; + + auto add_add_params = [&add_scalar_int32]() { add_scalar_int32(0); }; + + auto add_pooling_params = [&add_scalar_int32](void* data) { + auto builtin = reinterpret_cast<TfLitePoolParams*>(data); + add_scalar_int32(builtin->padding); + add_scalar_int32(builtin->stride_width); + add_scalar_int32(builtin->stride_height); + add_scalar_int32(builtin->filter_width); + add_scalar_int32(builtin->filter_height); + add_scalar_int32(builtin->activation); + }; + + auto add_convolution_params = [&add_scalar_int32](void* data) { + auto builtin = reinterpret_cast<TfLiteConvParams*>(data); + add_scalar_int32(builtin->padding); + add_scalar_int32(builtin->stride_width); + add_scalar_int32(builtin->stride_height); + add_scalar_int32(builtin->activation); + }; + + auto add_depthwise_conv_params = [&add_scalar_int32](void* data) { + auto builtin = reinterpret_cast<TfLiteDepthwiseConvParams*>(data); + add_scalar_int32(builtin->padding); + add_scalar_int32(builtin->stride_width); + add_scalar_int32(builtin->stride_height); + add_scalar_int32(builtin->depth_multiplier); + add_scalar_int32(builtin->activation); + }; + + auto add_fully_connected_params = [&add_scalar_int32](void* data) { + auto builtin = reinterpret_cast<TfLiteFullyConnectedParams*>(data); + add_scalar_int32(builtin->activation); + }; + + auto add_concatenation_params = 
[&add_scalar_int32](void* data) { + auto builtin = reinterpret_cast<TfLiteConcatenationParams*>(data); + add_scalar_int32(builtin->axis); + if (builtin->activation != kTfLiteActNone) { + FATAL("Concatenation does not support fused activation in NNAPI"); + } + }; + + auto add_softmax_params = [&add_scalar_float32](void* data) { + auto builtin = reinterpret_cast<TfLiteSoftmaxParams*>(data); + add_scalar_float32(builtin->beta); + }; + + auto add_space_to_depth_params = [&add_scalar_int32](void* data) { + auto builtin = reinterpret_cast<TfLiteSpaceToDepthParams*>(data); + add_scalar_int32(builtin->block_size); + }; + + auto add_lstm_params = [&add_scalar_int32, + &add_scalar_float32](void* data) { + auto builtin = reinterpret_cast<TfLiteLSTMParams*>(data); + add_scalar_int32(builtin->activation); + add_scalar_float32(builtin->cell_clip); + add_scalar_float32(builtin->proj_clip); + }; + + // LSTM in NNAPI requires scratch tensor as an output operand. + auto add_lstm_scratch_tensor_float32 = [interpreter, &node, &nn_model, + &next_id, &augmented_outputs]() { + int scratch_buffer_index = node.temporaries->data[0]; + const TfLiteTensor* tensor = interpreter->tensor(scratch_buffer_index); + ANeuralNetworksOperandType operand_type{ + ANEURALNETWORKS_TENSOR_FLOAT32, + static_cast<uint32_t>(tensor->dims->size), + reinterpret_cast<uint32_t*>(tensor->dims->data), tensor->params.scale, + tensor->params.zero_point}; + CHECK_NN(ANeuralNetworksModel_addOperand(nn_model, &operand_type)); + augmented_outputs.insert(augmented_outputs.begin(), next_id++); + }; + + auto add_mean_params = [&add_scalar_int32](void* data) { + auto builtin = reinterpret_cast<TfLiteMeanParams*>(data); + add_scalar_int32(builtin->keep_dims); + }; + + auto add_svdf_params = [&add_scalar_int32](void* data) { + auto builtin = reinterpret_cast<TfLiteSVDFParams*>(data); + add_scalar_int32(builtin->rank); + add_scalar_int32(builtin->activation); + }; + + auto add_rnn_params = [&add_scalar_int32](void* data) { + auto 
builtin = reinterpret_cast<TfLiteRNNParams*>(data); + add_scalar_int32(builtin->activation); + }; + + // Handle optional input tensors. + auto add_optional_tensors = [&nn_model, &augmented_inputs, + &next_id](int nn_type) { + for (size_t idx = 0; idx < augmented_inputs.size(); idx++) { + if (augmented_inputs[idx] == kOptionalTensor) { + const std::vector<uint32_t> dim = {0, 0}; + ANeuralNetworksOperandType operand_type{nn_type, 2, dim.data(), 0, 0}; + CHECK_NN(ANeuralNetworksModel_addOperand(nn_model, &operand_type)) + CHECK_NN(ANeuralNetworksModel_setOperandValue(nn_model, next_id, + nullptr, 0)) + augmented_inputs[idx] = next_id++; + } + } + }; + + int nnapi_version = 10; +#include "nnapi_delegate_ex_AddOpsAndParams_lambda.inc" + + ANeuralNetworksOperationType nn_op_type; + + switch (builtin) { + case tflite::BuiltinOperator_ADD: + nn_op_type = ANEURALNETWORKS_ADD; + add_add_params(); + break; + case tflite::BuiltinOperator_MUL: + nn_op_type = ANEURALNETWORKS_MUL; + add_add_params(); + break; + case tflite::BuiltinOperator_AVERAGE_POOL_2D: + add_pooling_params(node.builtin_data); + nn_op_type = ANEURALNETWORKS_AVERAGE_POOL_2D; + break; + case tflite::BuiltinOperator_MAX_POOL_2D: + add_pooling_params(node.builtin_data); + nn_op_type = ANEURALNETWORKS_MAX_POOL_2D; + break; + case tflite::BuiltinOperator_L2_POOL_2D: + add_pooling_params(node.builtin_data); + nn_op_type = ANEURALNETWORKS_L2_POOL_2D; + break; + case tflite::BuiltinOperator_CONV_2D: + add_convolution_params(node.builtin_data); + nn_op_type = ANEURALNETWORKS_CONV_2D; + break; + case tflite::BuiltinOperator_RELU: + nn_op_type = ANEURALNETWORKS_RELU; + break; + case tflite::BuiltinOperator_RELU_N1_TO_1: + nn_op_type = ANEURALNETWORKS_RELU1; + break; + case tflite::BuiltinOperator_RELU6: + nn_op_type = ANEURALNETWORKS_RELU6; + break; + case tflite::BuiltinOperator_TANH: + nn_op_type = ANEURALNETWORKS_TANH; + break; + case tflite::BuiltinOperator_FLOOR: + nn_op_type = ANEURALNETWORKS_FLOOR; + break; + case 
tflite::BuiltinOperator_LOGISTIC: + nn_op_type = ANEURALNETWORKS_LOGISTIC; + break; + case tflite::BuiltinOperator_DEPTHWISE_CONV_2D: + add_depthwise_conv_params(node.builtin_data); + nn_op_type = ANEURALNETWORKS_DEPTHWISE_CONV_2D; + break; + case tflite::BuiltinOperator_CONCATENATION: + add_concatenation_params(node.builtin_data); + nn_op_type = ANEURALNETWORKS_CONCATENATION; + break; + case tflite::BuiltinOperator_SOFTMAX: + add_softmax_params(node.builtin_data); + nn_op_type = ANEURALNETWORKS_SOFTMAX; + break; + case tflite::BuiltinOperator_FULLY_CONNECTED: + add_fully_connected_params(node.builtin_data); + nn_op_type = ANEURALNETWORKS_FULLY_CONNECTED; + break; + case tflite::BuiltinOperator_RESHAPE: + nn_op_type = ANEURALNETWORKS_RESHAPE; + // add_reshape_params(node.builtin_data); + break; + case tflite::BuiltinOperator_RESIZE_BILINEAR: + add_resize_bilinear_params(node.builtin_data); + nn_op_type = ANEURALNETWORKS_RESIZE_BILINEAR; + break; + case tflite::BuiltinOperator_SPACE_TO_DEPTH: + add_space_to_depth_params(node.builtin_data); + nn_op_type = ANEURALNETWORKS_SPACE_TO_DEPTH; + break; + case tflite::BuiltinOperator_LSTM: { + duplicate_state_tensor_float32( + node.outputs->data[/*kOutputStateTensor*/ 0]); + duplicate_state_tensor_float32( + node.outputs->data[/*kCellStateTensor*/ 1]); + add_lstm_params(node.builtin_data); + add_lstm_scratch_tensor_float32(); + add_optional_tensors(ANEURALNETWORKS_TENSOR_FLOAT32); + nn_op_type = ANEURALNETWORKS_LSTM; + break; + } + case tflite::BuiltinOperator_DEQUANTIZE: + nn_op_type = ANEURALNETWORKS_DEQUANTIZE; + break; + case tflite::BuiltinOperator_SVDF: { + duplicate_state_tensor_float32(node.outputs->data[/*kStateTensor*/ 0]); + add_svdf_params(node.builtin_data); + nn_op_type = ANEURALNETWORKS_SVDF; + break; + } + case tflite::BuiltinOperator_RNN: { + duplicate_state_tensor_float32( + node.outputs->data[/*kHiddenStateTensor*/ 0]); + add_rnn_params(node.builtin_data); + nn_op_type = ANEURALNETWORKS_RNN; + break; + } + 
case tflite::BuiltinOperator_EMBEDDING_LOOKUP: + nn_op_type = ANEURALNETWORKS_EMBEDDING_LOOKUP; + break; + case tflite::BuiltinOperator_PAD: + nnapi_version = 11; // require NNAPI 1.1 + nn_op_type = ANEURALNETWORKS_PAD; + break; + case tflite::BuiltinOperator_MEAN: + nnapi_version = 11; // require NNAPI 1.1 + add_mean_params(node.builtin_data); + nn_op_type = ANEURALNETWORKS_MEAN; + break; + case tflite::BuiltinOperator_DIV: + nnapi_version = 11; // require NNAPI 1.1 + nn_op_type = ANEURALNETWORKS_DIV; + add_add_params(); + break; + case tflite::BuiltinOperator_SUB: + nnapi_version = 11; // require NNAPI 1.1 + nn_op_type = ANEURALNETWORKS_SUB; + add_add_params(); + break; + case tflite::BuiltinOperator_STRIDED_SLICE: + add_strided_slice_params(node.builtin_data); + nn_op_type = ANEURALNETWORKS_STRIDED_SLICE; + break; + case tflite::BuiltinOperator_CAST: + CHECK_NN(ANeuralNetworksModel_addOperationEx( + nn_model, ANEURALNETWORKS_CAST_EX, + static_cast<uint32_t>(augmented_inputs.size()), + augmented_inputs.data(), static_cast<uint32_t>(node.outputs->size), + reinterpret_cast<uint32_t*>(node.outputs->data))); + continue; + case tflite::BuiltinOperator_TOPK_V2: + CHECK_NN(ANeuralNetworksModel_addOperationEx( + nn_model, ANEURALNETWORKS_TOPK_V2_EX, + static_cast<uint32_t>(augmented_inputs.size()), + augmented_inputs.data(), static_cast<uint32_t>(node.outputs->size), + reinterpret_cast<uint32_t*>(node.outputs->data))); + continue; + case tflite::BuiltinOperator_GATHER: + add_gather_ex_params(node.builtin_data); + CHECK_NN(ANeuralNetworksModel_addOperationEx( + nn_model, ANEURALNETWORKS_GATHER_EX, + static_cast<uint32_t>(augmented_inputs.size()), + augmented_inputs.data(), static_cast<uint32_t>(node.outputs->size), + reinterpret_cast<uint32_t*>(node.outputs->data))); + continue; + case tflite::BuiltinOperator_SPLIT: + CHECK_NN(ANeuralNetworksModel_addOperationEx( + nn_model, ANEURALNETWORKS_SPLIT_EX, + static_cast<uint32_t>(augmented_inputs.size()), + 
augmented_inputs.data(), static_cast<uint32_t>(node.outputs->size), + reinterpret_cast<uint32_t*>(node.outputs->data))); + continue; + case tflite::BuiltinOperator_TRANSPOSE: + nn_op_type = ANEURALNETWORKS_TRANSPOSE; + // param is almost same as reshape + break; + case tflite::BuiltinOperator_CONCAT_EMBEDDINGS: + case tflite::BuiltinOperator_LSH_PROJECTION: + case tflite::BuiltinOperator_HASHTABLE_LOOKUP: + case tflite::BuiltinOperator_BIDIRECTIONAL_SEQUENCE_RNN: + case tflite::BuiltinOperator_UNIDIRECTIONAL_SEQUENCE_RNN: + case tflite::BuiltinOperator_EMBEDDING_LOOKUP_SPARSE: + case tflite::BuiltinOperator_BIDIRECTIONAL_SEQUENCE_LSTM: + case tflite::BuiltinOperator_UNIDIRECTIONAL_SEQUENCE_LSTM: + case tflite::BuiltinOperator_L2_NORMALIZATION: + case tflite::BuiltinOperator_LOCAL_RESPONSE_NORMALIZATION: + case tflite::BuiltinOperator_PADV2: + case tflite::BuiltinOperator_CALL: + case tflite::BuiltinOperator_SKIP_GRAM: + case tflite::BuiltinOperator_SPACE_TO_BATCH_ND: + case tflite::BuiltinOperator_BATCH_TO_SPACE_ND: + case tflite::BuiltinOperator_SQUEEZE: + case tflite::BuiltinOperator_EXP: + case tflite::BuiltinOperator_LOG_SOFTMAX: + case tflite::BuiltinOperator_DELEGATE: + case tflite::BuiltinOperator_PRELU: + case tflite::BuiltinOperator_MAXIMUM: + case tflite::BuiltinOperator_MINIMUM: + case tflite::BuiltinOperator_ARG_MAX: + case tflite::BuiltinOperator_GREATER: + case tflite::BuiltinOperator_GREATER_EQUAL: + case tflite::BuiltinOperator_LESS: + case tflite::BuiltinOperator_LESS_EQUAL: + case tflite::BuiltinOperator_NEG: + case tflite::BuiltinOperator_SELECT: + case tflite::BuiltinOperator_SLICE: + case tflite::BuiltinOperator_SIN: + case tflite::BuiltinOperator_TRANSPOSE_CONV: + case tflite::BuiltinOperator_SPARSE_TO_DENSE: + FATAL("Op code %d is currently not delegated to NNAPI", builtin); + nn_op_type = -1; // set to invalid + break; + case tflite::BuiltinOperator_CUSTOM: + std::string custom_name(registration.custom_name); + if 
(custom_name.compare("TensorFlowMax") == 0) { + CHECK_NN(ANeuralNetworksModel_addOperationEx( + nn_model, ANEURALNETWORKS_TENSORFLOW_MAX_EX, + static_cast<uint32_t>(augmented_inputs.size()), + augmented_inputs.data(), + static_cast<uint32_t>(node.outputs->size), + reinterpret_cast<uint32_t*>(node.outputs->data))); + continue; + } + else if (custom_name.compare("RSQRT") == 0) { + CHECK_NN(ANeuralNetworksModel_addOperationEx( + nn_model, ANEURALNETWORKS_RSQRT_EX, + static_cast<uint32_t>(augmented_inputs.size()), + augmented_inputs.data(), + static_cast<uint32_t>(node.outputs->size), + reinterpret_cast<uint32_t*>(node.outputs->data))); + continue; + } + else if (custom_name.compare("SquaredDifference") == 0) { + CHECK_NN(ANeuralNetworksModel_addOperationEx( + nn_model, ANEURALNETWORKS_SQUARED_DIFFERENCE_EX, + static_cast<uint32_t>(augmented_inputs.size()), + augmented_inputs.data(), + static_cast<uint32_t>(node.outputs->size), + reinterpret_cast<uint32_t*>(node.outputs->data))); + continue; + } + + FATAL("Custom operations are not supported when using NNAPI."); + nn_op_type = -1; // set to invalid + break; + } + + //if (nnapi_version == 11 && kAndroidSdkVersion < 28) { + // FATAL("Op %d needs NNAPI1.1", builtin); + //} + + // Add the operation. + CHECK_NN(ANeuralNetworksModel_addOperation( + nn_model, nn_op_type, static_cast<uint32_t>(augmented_inputs.size()), + augmented_inputs.data(), + static_cast<uint32_t>(augmented_outputs.size()), + reinterpret_cast<uint32_t*>(augmented_outputs.data()))); + } +} + +TfLiteStatus NNAPIDelegate::BuildGraph(::tflite::Interpreter* interpreter) { + // TODO(aselle): This is not correct. need to handle resize invalidation. + if (nn_model_ && nn_compiled_model_) return kTfLiteOk; + + if (!nn_model_) { + CHECK_NN(ANeuralNetworksModel_create(&nn_model_)); + + // Find all the temporary tensors and put them in a skip_list. 
+ std::vector<uint32_t> skip_list; + for (size_t i = 0; i < interpreter->nodes_size(); i++) { + const auto* node_and_registration = interpreter->node_and_registration(i); + const TfLiteNode& node = node_and_registration->first; + if (node.temporaries != nullptr) { + for (int j = 0; j < node.temporaries->size; j++) { + skip_list.push_back(static_cast<uint32_t>(node.temporaries->data[j])); + } + } + } + + uint32_t next_id = addTensorOperands(interpreter, nn_model_, skip_list); + AddOpsAndParams(interpreter, nn_model_, next_id, &model_states_inputs_, + &model_states_outputs_); + + std::vector<int> augmented_inputs = interpreter->inputs(); + std::vector<int> augmented_outputs = interpreter->outputs(); + + // All state tensors input/output need to be treated as model input/output. + augmented_inputs.insert(augmented_inputs.end(), + model_states_inputs_.begin(), + model_states_inputs_.end()); + augmented_outputs.insert(augmented_outputs.end(), + model_states_outputs_.begin(), + model_states_outputs_.end()); + + CHECK_NN(ANeuralNetworksModel_identifyInputsAndOutputs( + nn_model_, static_cast<uint32_t>(augmented_inputs.size()), + reinterpret_cast<const uint32_t*>(augmented_inputs.data()), + static_cast<uint32_t>(augmented_outputs.size()), + reinterpret_cast<const uint32_t*>(augmented_outputs.data()))); + CHECK_NN(ANeuralNetworksModel_finish(nn_model_)); + } + if (!nn_compiled_model_) { + CHECK_NN(ANeuralNetworksCompilation_create(nn_model_, &nn_compiled_model_)); + CHECK_NN(ANeuralNetworksCompilation_finish(nn_compiled_model_)); + } + return kTfLiteOk; +} + +TfLiteStatus NNAPIDelegate::Invoke(::tflite::Interpreter* interpreter) { + if (!nn_model_) { + TF_LITE_ENSURE_STATUS(BuildGraph(interpreter)); + } + + ANeuralNetworksExecution* execution = nullptr; + CHECK_NN(ANeuralNetworksExecution_create(nn_compiled_model_, &execution)); + + // Currently perform deep copy of input buffer + for (size_t i = 0; i < interpreter->inputs().size(); i++) { + int input = 
interpreter->inputs()[i]; + // TODO(aselle): Is this what we want or do we want input instead? + // TODO(aselle): This should be called setInputValue maybe to be cons. + TfLiteTensor* tensor = interpreter->tensor(input); + CHECK_NN(ANeuralNetworksExecution_setInput( + execution, i, nullptr, tensor->data.raw, tensor->bytes)); + } + + // Tell nn api where to place final data. + for (size_t i = 0; i < interpreter->outputs().size(); i++) { + int output = interpreter->outputs()[i]; + TfLiteTensor* tensor = interpreter->tensor(output); + CHECK_NN(ANeuralNetworksExecution_setOutput( + execution, i, nullptr, tensor->data.raw, tensor->bytes)); + } + + // The state_out of previous invocation need to be mapped to state_in of + // current invocation. + for (size_t i = 0; i < model_states_outputs_.size(); i++) { + int state_tensor_idx = model_states_outputs_[i]; + TfLiteTensor* tensor = interpreter->tensor(state_tensor_idx); + // Here we are using a deep copy for state_in tensors so that we are not + // reading and writing into the same buffer during a invocation. + // TODO(miaowang): using double shared buffer to minimize the copies. + CHECK_NN(ANeuralNetworksExecution_setInput( + execution, i + interpreter->inputs().size(), nullptr, tensor->data.raw, + tensor->bytes)); + // Tell NNAPI where to output the state_out. + CHECK_NN(ANeuralNetworksExecution_setOutput( + execution, i + interpreter->outputs().size(), nullptr, tensor->data.raw, + tensor->bytes)); + } + + // Currently use blocking compute. 
+ ANeuralNetworksEvent* event = nullptr; + CHECK_NN(ANeuralNetworksExecution_startCompute(execution, &event)); + CHECK_NN(ANeuralNetworksEvent_wait(event)); + ANeuralNetworksEvent_free(event); + ANeuralNetworksExecution_free(execution); + +#if 0 + printf("From the NN API:\n"); + TfLiteTensor* tensor = interpreter->tensor(interpreter->outputs()[0]); + if (float* data = + interpreter->typed_tensor<float>(interpreter->outputs()[0])) { + size_t num = tensor->bytes / sizeof(float); + for (float* p = data; p < data + num; p++) { + printf(" %f", *p); + } + printf("\n"); + } +#endif + + return kTfLiteOk; +} + +} // namespace nnfw + +// clang-format on diff --git a/libs/support/tflite/src/nnapi_delegate_ex_AddOpsAndParams_lambda.inc b/libs/support/tflite/src/nnapi_delegate_ex_AddOpsAndParams_lambda.inc new file mode 100644 index 000000000..ea485fe45 --- /dev/null +++ b/libs/support/tflite/src/nnapi_delegate_ex_AddOpsAndParams_lambda.inc @@ -0,0 +1,41 @@ +// This file is included from AddOpsAndParams defined in nnapi_delegate.cc +// and contains lambda for extened implementation to original Tensorflow Lite. 
+ auto add_resize_bilinear_params = [&add_scalar_int32, &interpreter, &augmented_inputs](void* data) { + auto builtin = reinterpret_cast<TfLiteResizeBilinearParams*>(data); + if (builtin->align_corners) { + FATAL("Resize bilinear does not support align corners in NNAPI"); + } + + TfLiteTensor* tensor = interpreter->tensor(augmented_inputs.back()); + assert(tensor->type == kTfLiteInt32); + assert(tensor->bytes == sizeof(int)*2); + augmented_inputs.pop_back(); + + int height = ((int*)(tensor->data.raw))[1]; + int width = ((int*)(tensor->data.raw))[0]; + add_scalar_int32(height); + add_scalar_int32(width); + }; + + auto add_strided_slice_params = [&add_scalar_int32](void* data) { + auto builtin = reinterpret_cast<TfLiteStridedSliceParams*>(data); + add_scalar_int32(builtin->begin_mask); + add_scalar_int32(builtin->end_mask); + // ellipsis_mask and new_axis_mask are not supported on nn runtime + // cf) tflite interpreter supports both operations + if (builtin->ellipsis_mask) { + FATAL("STRIDE_SLICE does not support ellipsis_mask in NNAPI"); + } + if (builtin->new_axis_mask) { + FATAL("STRIDE_SLICE does not support new_axis_mask in NNAPI"); + } + add_scalar_int32(builtin->shrink_axis_mask); + }; + + auto add_gather_ex_params = [&add_scalar_int32](void* data) { + auto builtin = reinterpret_cast<TfLiteGatherParams*>(data); + add_scalar_int32(builtin->axis); + if (builtin->axis != 0) { + FATAL("GATHER does not support axis>0 in NNAPI"); + } + }; diff --git a/libs/util/CMakeLists.txt b/libs/util/CMakeLists.txt index 565aaf75e..eaa7ae8cf 100644 --- a/libs/util/CMakeLists.txt +++ b/libs/util/CMakeLists.txt @@ -3,12 +3,18 @@ set(NNFW_UTILITY_SRCS src/environment.cpp) list(APPEND NNFW_UTILITY_SRCS src/tensor/Shape.cpp) list(APPEND NNFW_UTILITY_SRCS src/tensor/NonIncreasingStride.cpp) list(APPEND NNFW_UTILITY_SRCS src/tensor/IndexFormatter.cpp) - -set(NNFW_INCLUDE_DIR include) +list(APPEND NNFW_UTILITY_SRCS src/tensor/Comparator.cpp) +if(BUILD_TFLITE_BENCHMARK_MODEL) + 
list(APPEND NNFW_UTILITY_SRCS src/profiling/time.cc) +endif() add_library(nnfw_util SHARED ${NNFW_UTILITY_SRCS}) target_include_directories(nnfw_util PUBLIC ${NNFW_INCLUDE_DIR}) +add_library(static_nnfw_util STATIC ${NNFW_UTILITY_SRCS}) +target_include_directories(static_nnfw_util PUBLIC ${NNFW_INCLUDE_DIR}) +set_target_properties(static_nnfw_util PROPERTIES POSITION_INDEPENDENT_CODE ON) + install(TARGETS nnfw_util RUNTIME DESTINATION bin COMPONENT libraries LIBRARY DESTINATION lib COMPONENT libraries) diff --git a/libs/util/examples/tensor_index_iterator.cpp b/libs/util/examples/tensor_index_iterator.cpp index a05d78dc4..284e04aa0 100644 --- a/libs/util/examples/tensor_index_iterator.cpp +++ b/libs/util/examples/tensor_index_iterator.cpp @@ -16,16 +16,52 @@ #include "util/tensor/IndexIterator.h" +#include <array> + #include <iostream> +#include <algorithm> + +#include <cassert> + +void test_iterate(void) +{ + const nnfw::util::tensor::Shape shape{3, 4, 7}; + + std::array<int, 3 * 4 * 7> array; + + array.fill(0); + + using nnfw::util::tensor::iterate; + using nnfw::util::tensor::Index; + + iterate(shape) << [&](const Index &index) { + assert(index.rank() == shape.rank()); + + const size_t rank = index.rank(); + + uint32_t offset = index.at(0); + + for (size_t axis = 1; axis < rank; ++axis) + { + offset *= shape.dim(axis); + offset += index.at(axis); + } + + array[offset] += 1; + }; + + assert(std::all_of(array.begin(), array.end(), [](int num) { return num == 1; })); +} int main(int argc, char **argv) { + test_iterate(); + nnfw::util::tensor::Shape shape{3, 4, 3, 4}; std::cout << "Iterate over tensor{3, 4, 3, 4}" << std::endl; - nnfw::util::tensor::iterate(shape) << [] (const nnfw::util::tensor::Index &index) - { + nnfw::util::tensor::iterate(shape) << [](const nnfw::util::tensor::Index &index) { std::cout << "rank: " << index.rank() << std::endl; for (size_t d = 0; d < index.rank(); ++d) diff --git a/libs/util/include/util/benchmark.h 
b/libs/util/include/util/benchmark.h deleted file mode 100644 index c451eddec..000000000 --- a/libs/util/include/util/benchmark.h +++ /dev/null @@ -1,66 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef __NNFW_UTIL_BENCHMARK_H__ -#define __NNFW_UTIL_BENCHMARK_H__ - -#include <chrono> - -namespace nnfw -{ -namespace util -{ -// Benckmark support -namespace benchmark -{ - -template <typename T> class Accumulator -{ -public: - Accumulator(T &ref) : _ref(ref) - { - // DO NOTHING - } - -public: - T &operator()(void) { return _ref; } - -private: - T &_ref; -}; - -template <typename T, typename Callable> -Accumulator<T> &operator<<(Accumulator<T> &&acc, Callable cb) -{ - auto begin = std::chrono::steady_clock::now(); - cb(); - auto end = std::chrono::steady_clock::now(); - - acc() += std::chrono::duration_cast<T>(end - begin); - - return acc; -} - -template <typename T> Accumulator<T> measure(T &out) -{ - return Accumulator<T>(out); -} - -} // namespace benchmark -} // namespace util -} // namespace nnfw - -#endif // __NNFW_UTIL_BENCHMARK_H__ diff --git a/libs/util/include/util/environment.h b/libs/util/include/util/environment.h deleted file mode 100644 index fa9dd519d..000000000 --- a/libs/util/include/util/environment.h +++ /dev/null @@ -1,63 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. 
All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef __UTIL_ENVIRONMENT_H__ -#define __UTIL_ENVIRONMENT_H__ - -namespace nnfw -{ -namespace util -{ - -int get_env_int(const char* name); -bool get_env_bool(const char* name); - -} -} - -#include <string> - -namespace nnfw -{ -namespace util -{ -namespace env -{ - -template <typename T> struct Accessor -{ - virtual ~Accessor() = default; - - virtual bool access(T &out) const = 0; -}; - -class IntAccessor : public Accessor<int> -{ -public: - IntAccessor(const std::string &tag); - -public: - bool access(int &out) const override; - -private: - std::string _tag; -}; - -} // namespace env -} // namespace util -} // namespace nnfw - -#endif // __UTIL_ENVIRONMENT_H__ diff --git a/libs/util/include/util/feature/Index.h b/libs/util/include/util/feature/Index.h deleted file mode 100644 index e77816669..000000000 --- a/libs/util/include/util/feature/Index.h +++ /dev/null @@ -1,60 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef __NNFW_UTIL_FEATURE_INDEX_H__ -#define __NNFW_UTIL_FEATURE_INDEX_H__ - -#include <cstdint> - -namespace nnfw -{ -namespace util -{ -namespace feature -{ - -class Index -{ -public: - Index() = default; - -public: - Index(int32_t ch, int32_t row, int32_t col) : _ch{ch}, _row{row}, _col{col} - { - // DO NOTHING - } - -public: - int32_t ch(void) const { return _ch; } - int32_t row(void) const { return _row; } - int32_t col(void) const { return _col; } - -public: - int32_t &ch(void) { return _ch; } - int32_t &row(void) { return _row; } - int32_t &col(void) { return _col; } - -private: - int32_t _ch; - int32_t _row; - int32_t _col; -}; - -} // namespace feature -} // namespace util -} // namespace nnfw - -#endif // __NNFW_UTIL_FEATURE_INDEX_H__ diff --git a/libs/util/include/util/feature/IndexIterator.h b/libs/util/include/util/feature/IndexIterator.h deleted file mode 100644 index dd029f4b6..000000000 --- a/libs/util/include/util/feature/IndexIterator.h +++ /dev/null @@ -1,69 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef __NNFW_UTIL_FEATURE_INDEX_ITERATOR_H__ -#define __NNFW_UTIL_FEATURE_INDEX_ITERATOR_H__ - -#include "util/feature/Shape.h" - -namespace nnfw -{ -namespace util -{ -namespace feature -{ - -class IndexIterator -{ -public: - IndexIterator(const Shape &shape) : _shape{shape} - { - // DO NOTHING - } - -public: - template <typename Callable> IndexIterator &iter(Callable cb) - { - for (uint32_t ch = 0; ch < _shape.C; ++ch) - { - for (uint32_t row = 0; row < _shape.H; ++row) - { - for (uint32_t col = 0; col < _shape.W; ++col) - { - cb(ch, row, col); - } - } - } - - return (*this); - } - -private: - const Shape _shape; -}; - -IndexIterator iterate(const Shape &shape) { return IndexIterator{shape}; } - -template <typename Callable> IndexIterator &operator<<(IndexIterator &&it, Callable cb) -{ - return it.iter(cb); -} - -} // namespace feature -} // namespace util -} // namespace nnfw - -#endif // __NNFW_UTIL_FEATURE_INDEX_ITERATOR_H__ diff --git a/libs/util/include/util/feature/Object.h b/libs/util/include/util/feature/Object.h deleted file mode 100644 index ca217b4a8..000000000 --- a/libs/util/include/util/feature/Object.h +++ /dev/null @@ -1,79 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifndef __NNFW_UTIL_FEATURE_OBJECT_H__ -#define __NNFW_UTIL_FEATURE_OBJECT_H__ - -#include "util/feature/Shape.h" -#include "util/feature/Index.h" -#include "util/feature/Reader.h" - -#include <vector> - -namespace nnfw -{ -namespace util -{ -namespace feature -{ - -template<typename T> class Object final : public Reader<T> -{ -public: - using Generator = std::function<T (const Shape &shape, const Index &index)>; - -public: - Object(const Shape &shape, const Generator &fn) : _shape{shape} - { - _value.resize(_shape.C * _shape.H * _shape.W); - - for (int32_t ch = 0; ch < _shape.C; ++ch) - { - for (int32_t row = 0; row < _shape.H; ++row) - { - for (int32_t col = 0; col < _shape.W; ++col) - { - _value.at(offsetOf(ch, row, col)) = fn(_shape, Index{ch, row, col}); - } - } - } - } - -public: - const Shape &shape(void) const { return _shape; } - -public: - T at(uint32_t ch, uint32_t row, uint32_t col) const override - { - return _value.at(offsetOf(ch, row, col)); - } - -private: - uint32_t offsetOf(uint32_t ch, uint32_t row, uint32_t col) const - { - return ch * _shape.H * _shape.W + row * _shape.W + col; - } - -private: - Shape _shape; - std::vector<T> _value; -}; - -} // namespace feature -} // namespace util -} // namespace nnfw - -#endif // __NNFW_UTIL_FEATURE_OBJECT_H__ diff --git a/libs/util/include/util/feature/Reader.h b/libs/util/include/util/feature/Reader.h deleted file mode 100644 index 112503d80..000000000 --- a/libs/util/include/util/feature/Reader.h +++ /dev/null @@ -1,40 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef __NNFW_UTIL_FEATURE_READER_H__ -#define __NNFW_UTIL_FEATURE_READER_H__ - -#include <cstdint> - -namespace nnfw -{ -namespace util -{ -namespace feature -{ - -template <typename T> struct Reader -{ - virtual ~Reader() = default; - - virtual T at(uint32_t ch, uint32_t row, uint32_t col) const = 0; -}; - -} // namespace feature -} // namespace util -} // namespace nnfw - -#endif // __NNFW_UTIL_FEATURE_READER_H__ diff --git a/libs/util/include/util/feature/Shape.h b/libs/util/include/util/feature/Shape.h deleted file mode 100644 index e05c97f51..000000000 --- a/libs/util/include/util/feature/Shape.h +++ /dev/null @@ -1,47 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifndef __NNFW_UTIL_FEATURE_SHAPE_H__ -#define __NNFW_UTIL_FEATURE_SHAPE_H__ - -#include <cstdint> - -namespace nnfw -{ -namespace util -{ -namespace feature -{ - -struct Shape -{ - int32_t C; // Depth - int32_t H; // Height - int32_t W; // Width - - Shape() = default; - Shape(int32_t depth, int32_t height, int32_t width) : C{depth}, H{height}, W{width} - { - // DO NOTHING - } - -}; - -} // namespace feature -} // namespace util -} // namespace nnfw - -#endif // __NNFW_UTIL_FEATURE_H__ diff --git a/libs/util/include/util/feature/TextFormatter.h b/libs/util/include/util/feature/TextFormatter.h deleted file mode 100644 index 91b4c9fff..000000000 --- a/libs/util/include/util/feature/TextFormatter.h +++ /dev/null @@ -1,84 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifndef __NNFW_UTIL_FEATURE_TEXT_FORMATTER_H__ -#define __NNFW_UTIL_FEATURE_TEXT_FORMATTER_H__ - -#include "util/feature/Shape.h" -#include "util/feature/Reader.h" - -#include <ostream> -#include <iomanip> -#include <limits> - -namespace nnfw -{ -namespace util -{ -namespace feature -{ - -template <typename T> class TextFormatter -{ -public: - TextFormatter(const Shape &shape, const Reader<T> &data) - : _shape(shape), _data(data) - { - // DO NOTHING - } - -public: - const Shape &shape(void) const { return _shape; } - const Reader<T> &data(void) const { return _data; } - -private: - const Shape &_shape; - const Reader<T> &_data; -}; - -template <typename T> -std::ostream &operator<<(std::ostream &os, const TextFormatter<T> &fmt) -{ - const auto &shape = fmt.shape(); - - for (uint32_t ch = 0; ch < shape.C; ++ch) - { - os << " Channel " << ch << ":" << std::endl; - for (uint32_t row = 0; row < shape.H; ++row) - { - os << " "; - for (uint32_t col = 0; col < shape.W; ++col) - { - const auto value = fmt.data().at(ch, row, col); - os << std::right; - os << std::fixed; - os << std::setw(std::numeric_limits<T>::digits10 + 2); - os << std::setprecision(5); - os << value; - os << " "; - } - os << std::endl; - } - } - - return os; -} - -} // namespace feature -} // namespace util -} // namespace nnfw - -#endif // __NNFW_UTIL_FEATURE_TEXT_FORMATTER_H__ diff --git a/libs/util/include/util/fp32.h b/libs/util/include/util/fp32.h deleted file mode 100644 index 604435470..000000000 --- a/libs/util/include/util/fp32.h +++ /dev/null @@ -1,71 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef __NNFW_UTIL_FP32_H__ -#define __NNFW_UTIL_FP32_H__ - -#include <cmath> -#include <cfloat> -#include <algorithm> -#include <cstdint> - -namespace nnfw -{ -namespace util -{ -namespace fp32 -{ - -inline float relative_diff(float lhs, float rhs) -{ - const auto diff = std::fabs(lhs - rhs); - const auto base = std::max(std::fabs(lhs), std::fabs(rhs)); - - return diff / base; -} - -inline bool epsilon_equal(float expected, float obtained, uint32_t tolerance = 1) -{ - if (std::isnan(expected) && std::isnan(obtained)) - { - return true; - } - - // Let's use relative epsilon comparision - const auto diff = std::fabs(expected - obtained); - const auto max = std::max(std::fabs(expected), std::fabs(obtained)); - - return diff <= (max * FLT_EPSILON * tolerance); -} - -inline bool absolute_epsilon_equal(float expected, float obtained, float tolerance = 0.001) -{ - if (std::isnan(expected) && std::isnan(obtained)) - { - return true; - } - - // Let's use absolute epsilon comparision - const auto diff = std::fabs(expected - obtained); - - return diff <= tolerance; -} - -} // namespace fp32 -} // namespace util -} // namespace nnfw - -#endif // __NNFW_UTIL_FP32_H__ diff --git a/libs/util/include/util/kernel/IndexIterator.h b/libs/util/include/util/kernel/IndexIterator.h deleted file mode 100644 index ea6b48826..000000000 --- a/libs/util/include/util/kernel/IndexIterator.h +++ /dev/null @@ -1,72 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. 
All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef __NNFW_UTIL_KERNEL_INDEX_ITERATOR_H__ -#define __NNFW_UTIL_KERNEL_INDEX_ITERATOR_H__ - -#include "util/kernel/Shape.h" - -namespace nnfw -{ -namespace util -{ -namespace kernel -{ - -class IndexIterator -{ -public: - IndexIterator(const Shape &shape) : _shape{shape} - { - // DO NOTHING - } - -public: - template <typename Callable> IndexIterator &iter(Callable cb) - { - for (uint32_t nth = 0; nth < _shape.N; ++nth) - { - for (uint32_t ch = 0; ch < _shape.C; ++ch) - { - for (uint32_t row = 0; row < _shape.H; ++row) - { - for (uint32_t col = 0; col < _shape.W; ++col) - { - cb(nth, ch, row, col); - } - } - } - } - - return (*this); - } - -private: - const Shape _shape; -}; - -IndexIterator iterate(const Shape &shape) { return IndexIterator{shape}; } - -template <typename Callable> IndexIterator &operator<<(IndexIterator &&it, Callable cb) -{ - return it.iter(cb); -} - -} // namespace kernel -} // namespace util -} // namespace nnfw - -#endif // __NNFW_UTIL_FEATURE_INDEX_ITERATOR_H__ diff --git a/libs/util/include/util/kernel/RandomObject.h b/libs/util/include/util/kernel/RandomObject.h deleted file mode 100644 index ceed7a0b0..000000000 --- a/libs/util/include/util/kernel/RandomObject.h +++ /dev/null @@ -1,71 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. 
All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef __NNFW_UTIL_KERNEL_RANDOM_OBJECT_H__ -#define __NNFW_UTIL_KERNEL_RANDOM_OBJECT_H__ - -#include "util/kernel/Shape.h" -#include "util/kernel/Reader.h" - -#include <vector> - -namespace nnfw -{ -namespace util -{ -namespace kernel -{ - -template<typename T> class RandomObject final : public Reader<T> -{ -public: - RandomObject(const Shape &shape) : _shape{shape} - { - const uint32_t size = _shape.N * _shape.C * _shape.H * _shape.W; - - // TODO Use random number - for (uint32_t off = 0; off < size; ++off) - { - _value.emplace_back(static_cast<float>(off)); - } - } - -public: - const Shape &shape(void) const { return _shape; } - -public: - T at(uint32_t nth, uint32_t ch, uint32_t row, uint32_t col) const override - { - uint32_t index = 0; - - index += nth * _shape.C * _shape.H * _shape.W; - index += ch * _shape.H * _shape.W; - index += row * _shape.W; - index += col; - - return _value.at(index); - } - -private: - const Shape _shape; - std::vector<T> _value; -}; - -} // namespace kernel -} // namespace util -} // namespace nnfw - -#endif // __NNFW_UTIL_KERNEL_RANDOM_OBJECT_H__ diff --git a/libs/util/include/util/kernel/Reader.h b/libs/util/include/util/kernel/Reader.h deleted file mode 100644 index 9d8f33ad6..000000000 --- a/libs/util/include/util/kernel/Reader.h +++ /dev/null @@ -1,40 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. 
All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef __NNFW_UTIL_KERNEL_READER_H__ -#define __NNFW_UTIL_KERNEL_READER_H__ - -#include <cstdint> - -namespace nnfw -{ -namespace util -{ -namespace kernel -{ - -template <typename T> struct Reader -{ - virtual ~Reader() = default; - - virtual T at(uint32_t nth, uint32_t ch, uint32_t row, uint32_t col) const = 0; -}; - -} // namespace kernel -} // namespace util -} // namespace nnfw - -#endif // __NNFW_UTIL_KERNEL_READER_H__ diff --git a/libs/util/include/util/kernel/Shape.h b/libs/util/include/util/kernel/Shape.h deleted file mode 100644 index bd2332989..000000000 --- a/libs/util/include/util/kernel/Shape.h +++ /dev/null @@ -1,48 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifndef __NNFW_UTIL_KERNEL_SHAPE_H__ -#define __NNFW_UTIL_KERNEL_SHAPE_H__ - -#include <cstdint> - -namespace nnfw -{ -namespace util -{ -namespace kernel -{ - -struct Shape -{ - int32_t N; - int32_t C; - int32_t H; - int32_t W; - - Shape() = default; - Shape(int32_t count, int32_t depth, int32_t height, int32_t width) - : N{count}, C{depth}, H{height}, W{width} - { - // DO NOTHING - } -}; - -} // namespace kernel -} // namespace util -} // namespace nnfw - -#endif // __NNFW_UTIL_KERNEL_SHAPE_H__ diff --git a/libs/util/include/util/tensor/Index.h b/libs/util/include/util/tensor/Index.h deleted file mode 100644 index e74b09229..000000000 --- a/libs/util/include/util/tensor/Index.h +++ /dev/null @@ -1,62 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifndef __NNFW_UTIL_TENSOR_INDEX_H__ -#define __NNFW_UTIL_TENSOR_INDEX_H__ - -#include <cstdint> -#include <cstddef> - -#include <vector> -#include <initializer_list> - -namespace nnfw -{ -namespace util -{ -namespace tensor -{ - -struct Index -{ -public: - Index(size_t rank) - { - _offsets.resize(rank); - } - -public: - Index(std::initializer_list<int32_t> offsets) : _offsets{offsets} - { - // DO NOTHING - } - -public: - size_t rank(void) const { return _offsets.size(); } - -public: - int32_t at(size_t n) const { return _offsets.at(n); } - int32_t &at(size_t n) { return _offsets.at(n); } - -private: - std::vector<int32_t> _offsets; -}; - -} // namespace tensor -} // namespace util -} // namespace nnfw - -#endif // __NNFW_UTIL_TENSOR_INDEX_H__ diff --git a/libs/util/include/util/tensor/IndexFormatter.h b/libs/util/include/util/tensor/IndexFormatter.h deleted file mode 100644 index 8014a42b6..000000000 --- a/libs/util/include/util/tensor/IndexFormatter.h +++ /dev/null @@ -1,52 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifndef __NNFW_UTIL_TENSOR_INDEX_FORMATTER_H__ -#define __NNFW_UTIL_TENSOR_INDEX_FORMATTER_H__ - -#include "util/tensor/Index.h" - -#include <ostream> - -namespace nnfw -{ -namespace util -{ -namespace tensor -{ - -class IndexFormatter -{ -public: - IndexFormatter(const nnfw::util::tensor::Index &index) : _index(index) - { - // DO NOTHING - } - -public: - const nnfw::util::tensor::Index &index(void) const { return _index; } - -private: - const nnfw::util::tensor::Index &_index; -}; - -std::ostream &operator<<(std::ostream &os, const IndexFormatter &fmt); - -} // namespace tensor -} // namespace util -} // namespace nnfw - -#endif // __NNFW_UTIL_TENSOR_INDEX_FORMATTER_H__ diff --git a/libs/util/include/util/tensor/IndexIterator.h b/libs/util/include/util/tensor/IndexIterator.h deleted file mode 100644 index 56a8c7dd2..000000000 --- a/libs/util/include/util/tensor/IndexIterator.h +++ /dev/null @@ -1,104 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifndef __NNFW_UTIL_TENSOR_INDEX_ITERATOR_H__ -#define __NNFW_UTIL_TENSOR_INDEX_ITERATOR_H__ - -#include "util/tensor/Shape.h" -#include "util/tensor/Index.h" - -namespace nnfw -{ -namespace util -{ -namespace tensor -{ - -class IndexIterator -{ -public: - IndexIterator(const Shape &shape) : _shape(shape) - { - // DO NOTHING - } - -public: - // Allow move, but disallow copy - IndexIterator(IndexIterator &&) = default; - IndexIterator(const IndexIterator &) = delete; - -public: - template <typename Callable> IndexIterator &iter(Callable fn) - { - Index index(_shape.rank()); - - for (size_t d = 0; d < _shape.rank(); ++d) - { - index.at(d) = 0; - } - - size_t cursor = 0; - - while (cursor < _shape.rank()) - { - fn(index); - - if (index.at(cursor) + 1 < _shape.dim(cursor)) - { - index.at(cursor) += 1; - } - else - { - while ((cursor < _shape.rank()) && (index.at(cursor) + 1 == _shape.dim(cursor))) - { - ++cursor; - } - - if (cursor == _shape.rank()) - { - break; - } - - index.at(cursor) += 1; - - for (size_t d = 0; d < cursor; ++d) - { - index.at(d) = 0; - } - - cursor = 0; - } - } - - return (*this); - } - -private: - const Shape &_shape; -}; - -inline IndexIterator iterate(const Shape &shape) { return IndexIterator{shape}; } - -template <typename Callable> IndexIterator &operator<<(IndexIterator &&it, Callable cb) -{ - return it.iter(cb); -} - -} // namespace tensor -} // namespace util -} // namespace nnfw - -#endif // __NNFW_UTIL_TENSOR_INDEX_ITERATOR_H__ diff --git a/libs/util/include/util/tensor/NonIncreasingStride.h b/libs/util/include/util/tensor/NonIncreasingStride.h deleted file mode 100644 index ff013ffa2..000000000 --- a/libs/util/include/util/tensor/NonIncreasingStride.h +++ /dev/null @@ -1,61 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef __NNFW_UTIL_TENSOR_NON_INCREASING_STRIDE_H__ -#define __NNFW_UTIL_TENSOR_NON_INCREASING_STRIDE_H__ - -#include "util/tensor/Shape.h" -#include "util/tensor/Index.h" - -#include <vector> - -namespace nnfw -{ -namespace util -{ -namespace tensor -{ - -// As its name suggests, stride[N-1] >= stride[N] holds for all N < rank in NonIncreasingStride. -class NonIncreasingStride -{ -public: - void init(const Shape &shape) - { - _stride.resize(shape.rank()); - _stride.at(shape.rank() - 1) = 1; - - for (uint32_t axis = shape.rank() - 1; axis > 0; --axis) - { - _stride.at(axis - 1) = _stride.at(axis) * shape.dim(axis); - } - } - -public: - uint32_t at(uint32_t axis) const { return _stride.at(axis); } - -public: - uint32_t offset(const Index &index) const; - -private: - std::vector<uint32_t> _stride; -}; - -} // namespace tensor -} // namespace util -} // namespace nnfw - -#endif // __NNFW_UTIL_TENSOR_NON_INCREASING_STRIDE_H__ diff --git a/libs/util/include/util/tensor/Object.h b/libs/util/include/util/tensor/Object.h deleted file mode 100644 index 839bce236..000000000 --- a/libs/util/include/util/tensor/Object.h +++ /dev/null @@ -1,77 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef __NNFW_UTIL_TENSOR_OBJECT_H__ -#define __NNFW_UTIL_TENSOR_OBJECT_H__ - -#include "util/tensor/Shape.h" -#include "util/tensor/Index.h" -#include "util/tensor/IndexIterator.h" -#include "util/tensor/NonIncreasingStride.h" -#include "util/tensor/Reader.h" - -#include <vector> - -namespace nnfw -{ -namespace util -{ -namespace tensor -{ - -template<typename T> class Object final : public Reader<T> -{ -public: - using Generator = std::function<T (const Shape &shape, const Index &index)>; - -public: - Object(const Shape &shape, const Generator &fn) : _shape{shape} - { - // Set 'stride' - _stride.init(shape); - - // Pre-allocate buffer - _values.resize(_shape.dim(0) * _stride.at(0)); - - // Set 'value' - iterate(_shape) << [this, &fn] (const Index &index) - { - _values.at(_stride.offset(index)) = fn(_shape, index); - }; - } - -public: - const Shape &shape(void) const { return _shape; } - -public: - T at(const Index &index) const override - { - return _values.at(_stride.offset(index)); - } - -private: - Shape _shape; - NonIncreasingStride _stride; - -private: - std::vector<T> _values; -}; - -} // namespace tensor -} // namespace util -} // namespace nnfw - -#endif // __NNFW_UTIL_FEATURE_OBJECT_H__ diff --git a/libs/util/include/util/tensor/Reader.h b/libs/util/include/util/tensor/Reader.h deleted file mode 100644 index 654214880..000000000 --- a/libs/util/include/util/tensor/Reader.h +++ /dev/null @@ -1,40 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. 
All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef __NNFW_UTIL_TENSOR_READER_H__ -#define __NNFW_UTIL_TENSOR_READER_H__ - -#include "util/tensor/Index.h" - -namespace nnfw -{ -namespace util -{ -namespace tensor -{ - -template <typename T> struct Reader -{ - virtual ~Reader() = default; - - virtual T at(const Index &index) const = 0; -}; - -} // namespace tensor -} // namespace util -} // namespace nnfw - -#endif // __NNFW_UTIL_TENSOR_READER_H__ diff --git a/libs/util/include/util/tensor/Shape.h b/libs/util/include/util/tensor/Shape.h deleted file mode 100644 index d4edeaada..000000000 --- a/libs/util/include/util/tensor/Shape.h +++ /dev/null @@ -1,63 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifndef __NNFW_UTIL_TENSOR_SHAPE_H__ -#define __NNFW_UTIL_TENSOR_SHAPE_H__ - -#include <cstdint> -#include <cstddef> -#include <vector> -#include <initializer_list> - -namespace nnfw -{ -namespace util -{ -namespace tensor -{ - -class Shape -{ -public: - Shape(size_t rank) - { - _dimensions.resize(rank); - } - -public: - Shape(const std::initializer_list<int32_t> &dimensions) : _dimensions{dimensions} - { - // DO NOTHING - } - -public: - size_t rank(void) const { return _dimensions.size(); } - -public: - int32_t dim(size_t n) const { return _dimensions.at(n); } - int32_t &dim(size_t n) { return _dimensions.at(n); } - -private: - std::vector<int32_t> _dimensions; -}; - -bool operator==(const Shape &, const Shape &); - -} // namespace tensor -} // namespace util -} // namespace nnfw - -#endif // __NNFW_UTIL_TENSOR_SHAPE_H__ diff --git a/libs/util/include/util/tensor/Zipper.h b/libs/util/include/util/tensor/Zipper.h deleted file mode 100644 index fc2d94e57..000000000 --- a/libs/util/include/util/tensor/Zipper.h +++ /dev/null @@ -1,72 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifndef __NNFW_UTIL_TENSOR_ZIPPER_H__ -#define __NNFW_UTIL_TENSOR_ZIPPER_H__ - -#include "util/tensor/Index.h" -#include "util/tensor/IndexIterator.h" -#include "util/tensor/Reader.h" - -namespace nnfw -{ -namespace util -{ -namespace tensor -{ - -template <typename T> class Zipper -{ -public: - Zipper(const Shape &shape, const Reader<T> &lhs, const Reader<T> &rhs) - : _shape{shape}, _lhs{lhs}, _rhs{rhs} - { - // DO NOTHING - } - -public: - template <typename Callable> void zip(Callable cb) const - { - iterate(_shape) << [this, &cb] (const Index &index) - { - cb(index, _lhs.at(index), _rhs.at(index)); - }; - } - -private: - const Shape &_shape; - const Reader<T> &_lhs; - const Reader<T> &_rhs; -}; - -template<typename T, typename Callable> -const Zipper<T> &operator<<(const Zipper<T> &zipper, Callable cb) -{ - zipper.zip(cb); - return zipper; -} - -template<typename T> -Zipper<T> zip(const Shape &shape, const Reader<T> &lhs, const Reader<T> &rhs) -{ - return Zipper<T>{shape, lhs, rhs}; -} - -} // namespace tensor -} // namespace util -} // namespace nnfw - -#endif // __NNFW_UTIL_TENSOR_ZIPPER_H__ diff --git a/libs/util/include/util/vector.h b/libs/util/include/util/vector.h deleted file mode 100644 index 49a58a41e..000000000 --- a/libs/util/include/util/vector.h +++ /dev/null @@ -1,41 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifndef __NNFW_UTIL_VECTOR_H__ -#define __NNFW_UTIL_VECTOR_H__ - -#include <vector> - -template <typename T> -bool operator==(const std::vector<T> &lhs, const std::vector<T> &rhs) -{ - if (lhs.size() != rhs.size()) - { - return false; - } - - for (size_t ind = 0; ind < lhs.size(); ++ind) - { - if (lhs.at(ind) != rhs.at(ind)) - { - return false; - } - } - - return true; -} - -#endif // __NNFW_UTIL_VECTOR_H__ diff --git a/libs/util/include/util/vector/Object.h b/libs/util/include/util/vector/Object.h deleted file mode 100644 index b1bc521da..000000000 --- a/libs/util/include/util/vector/Object.h +++ /dev/null @@ -1,63 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifndef __NNFW_UTIL_VECTOR_OBJECT_H__ -#define __NNFW_UTIL_VECTOR_OBJECT_H__ - -#include "util/vector/Reader.h" - -#include <vector> -#include <functional> - -namespace nnfw -{ -namespace util -{ -namespace vector -{ - -template<typename T> class Object final : public Reader<T> -{ -public: - using Generator = std::function<T (int32_t size, int32_t offset)>; - -public: - Object(int32_t size, const Generator &gen) : _size{size} - { - _value.resize(_size); - - for (int32_t offset = 0; offset < size; ++offset) - { - _value.at(offset) = gen(size, offset); - } - } - -public: - int32_t size(void) const { return _size; } - -public: - T at(uint32_t nth) const override { return _value.at(nth); } - -private: - const int32_t _size; - std::vector<T> _value; -}; - -} // namespace vector -} // namespace util -} // namespace nnfw - -#endif // __NNFW_UTIL_VECTOR_OBJECT_H__ diff --git a/libs/util/include/util/vector/Reader.h b/libs/util/include/util/vector/Reader.h deleted file mode 100644 index a3c5cb359..000000000 --- a/libs/util/include/util/vector/Reader.h +++ /dev/null @@ -1,40 +0,0 @@ -/* - * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#ifndef __NNFW_UTIL_VECTOR_READER_H__ -#define __NNFW_UTIL_VECTOR_READER_H__ - -#include <cstdint> - -namespace nnfw -{ -namespace util -{ -namespace vector -{ - -template <typename T> struct Reader -{ - virtual ~Reader() = default; - - virtual T at(uint32_t nth) const = 0; -}; - -} // namespace vector -} // namespace util -} // namespace nnfw - -#endif // __NNFW_UTIL_VECTOR_READER_H__ diff --git a/libs/util/src/environment.cpp b/libs/util/src/environment.cpp index dca6c5c55..4b18b409f 100644 --- a/libs/util/src/environment.cpp +++ b/libs/util/src/environment.cpp @@ -25,25 +25,23 @@ namespace nnfw namespace util { -int get_env_int(const char* name) +int get_env_int(const char *name, int defaultValue) { const char *value = std::getenv(name); if (value != nullptr) return std::stoi(value); - return 0; + return defaultValue; } -bool get_env_bool(const char* name) +bool get_env_bool(const char *name, bool defaultValue) { const char *value = std::getenv(name); if (value != nullptr) { - if (std::stoi(value)) - return true; - if (!strcasecmp(value, "true")) - return true; + return std::stoi(value) != 0; } - return false; + + return defaultValue; } } // namespace util @@ -74,6 +72,24 @@ bool IntAccessor::access(int &out) const return true; } +FloatAccessor::FloatAccessor(const std::string &tag) : _tag{tag} +{ + // DO NOTHING +} + +bool FloatAccessor::access(float &out) const +{ + auto value = std::getenv(_tag.c_str()); + + if (value == nullptr) + { + return false; + } + + out = std::stof(value); + return true; +} + } // namespace env } // namespace util } // namespace nnfw diff --git a/libs/util/src/profiling/time.cc b/libs/util/src/profiling/time.cc new file mode 100644 index 000000000..6fe1b54dc --- /dev/null +++ b/libs/util/src/profiling/time.cc @@ -0,0 +1,49 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. 
All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#include "util/profiling/time.h" + +#include <sys/time.h> + +namespace tflite +{ +namespace profiling +{ +namespace time +{ +uint64_t NowMicros() +{ + struct timeval tv; + gettimeofday(&tv, nullptr); + return static_cast<uint64_t>(tv.tv_sec) * 1000000 + tv.tv_usec; +} +} // namespace time +} // namespace profiling +} // namespace tflite diff --git a/libs/util/src/tensor/Comparator.cpp b/libs/util/src/tensor/Comparator.cpp new file mode 100644 index 000000000..89cd687e9 --- /dev/null +++ b/libs/util/src/tensor/Comparator.cpp @@ -0,0 +1,40 @@ +#include "util/tensor/Comparator.h" +#include "util/tensor/Zipper.h" + +#include "util/fp32.h" + +namespace nnfw +{ +namespace util +{ +namespace tensor +{ + +std::vector<Diff<float>> Comparator::compare(const Shape &shape, const Reader<float> &expected, + const Reader<float> &obtained, + Observer *observer) const +{ + std::vector<Diff<float>> res; + + zip(shape, expected, obtained) << + [&](const Index &index, float expected_value, float obtained_value) { + const auto relative_diff = nnfw::util::fp32::relative_diff(expected_value, obtained_value); + + if (!_compare_fn(expected_value, obtained_value)) + { + res.emplace_back(index, expected_value, obtained_value); + } + + // Update max_diff_index, if necessary + if (observer != nullptr) + { + observer->notify(index, expected_value, obtained_value); + } + }; + + return res; +} + +} // namespace tensor +} // namespace util +} // namespace nnfw diff --git a/libs/util/src/tensor/Shape.cpp b/libs/util/src/tensor/Shape.cpp index d177d1382..f1de26fdc 100644 --- a/libs/util/src/tensor/Shape.cpp +++ b/libs/util/src/tensor/Shape.cpp @@ -16,6 +16,8 @@ #include "util/tensor/Shape.h" +#include <cassert> + namespace nnfw { namespace util @@ -32,7 +34,7 @@ bool operator==(const Shape &lhs, const Shape &rhs) for (size_t axis = 0; axis < lhs.rank(); ++axis) { - if(lhs.dim(axis) != rhs.dim(axis)) + if 
(lhs.dim(axis) != rhs.dim(axis)) { return false; } @@ -41,6 +43,57 @@ bool operator==(const Shape &lhs, const Shape &rhs) return true; } +Shape Shape::from(const std::string &str) +{ + Shape shape(0); + + bool pending = false; + int value = 0; + + for (const char *cur = str.c_str(); true; ++cur) + { + if (*cur == ',' || *cur == '\0') + { + if (pending) + { + shape.append(value); + } + + if (*cur == '\0') + { + break; + } + + pending = false; + value = 0; + continue; + } + + assert(*cur >= '0' && *cur <= '9'); + + pending = true; + value *= 10; + value += *cur - '0'; + } + + return shape; +} + +std::ostream &operator<<(std::ostream &os, const Shape &shape) +{ + if (shape.rank() > 0) + { + os << shape.dim(0); + + for (uint32_t axis = 1; axis < shape.rank(); ++axis) + { + os << "," << shape.dim(axis); + } + } + + return os; +} + } // namespace tensor } // namespace util } // namespace nnfw |