author    | Chunseok Lee <chunseok.lee@samsung.com> | 2020-08-14 15:19:19 +0900
committer | Chunseok Lee <chunseok.lee@samsung.com> | 2020-08-14 15:19:19 +0900
commit    | 042b262b3633b6c0f577aed6cb4b980ad0c1dcf3 (patch)
tree      | e79fb9ffe65b21bdc5863306db2757ab187a3306 /compute/cker/include
parent    | 05e0ec30a632339a8533082476f27bda31ccde16 (diff)
Imported Upstream version 1.8.0 (tags: upstream/1.8.0, submit/tizen/20200814.062151)
Diffstat (limited to 'compute/cker/include')
23 files changed, 2377 insertions, 73 deletions
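A recurring change in the TensorUtils and FullyConnected diffs below is that the hybrid (quantized-weight) kernels now take an explicit ruy::Context * parameter instead of NeonCpuBackendGemm fetching one internally via ruy_support::GetRuyContext(). The sketch below shows what a call site might look like after this change; it is an illustration only, not code from this commit: the wrapper function and the thread-local context are assumptions, and preparing the shapes, buffers, and FCTempArena is left to the caller.

#include <ruy/context.h>
#include "cker/operation/FullyConnected.h"

// Illustrative wrapper (hypothetical): the caller now owns the ruy::Context
// and passes it down to the cker kernel.
void RunFullyConnectedHybrid(const nnfw::cker::FullyConnectedParams &params,
                             const nnfw::cker::Shape &input_shape, const float *input_data,
                             const nnfw::cker::Shape &filter_shape, const int8_t *filter_data,
                             const nnfw::cker::Shape &bias_shape, const float *bias_data,
                             const nnfw::cker::Shape &output_shape, float *output_data,
                             nnfw::cker::FCTempArena &temp_arena)
{
  // Assumption: one long-lived context per thread, so ruy can reuse the
  // per-context resources it caches (such as its thread pool).
  static thread_local ruy::Context ruy_context;
  nnfw::cker::FullyConnectedHybrid(params, input_shape, input_data, filter_shape, filter_data,
                                   bias_shape, bias_data, output_shape, output_data, temp_arena,
                                   &ruy_context);
}

In the non-ruy path of FullyConnectedHybrid (the #else branch visible in the hunk below) the new argument is only passed to UNUSED_RELEASE, so passing a valid context is harmless either way.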
diff --git a/compute/cker/include/cker/NeonTensorUtils.h b/compute/cker/include/cker/NeonTensorUtils.h index 5c38bc6f3..246fd9a46 100644 --- a/compute/cker/include/cker/NeonTensorUtils.h +++ b/compute/cker/include/cker/NeonTensorUtils.h @@ -546,7 +546,7 @@ bool NeonIsZeroVector(const float *vector, int v_size) void NeonCpuBackendGemm(const int8_t *input, const int32_t *bias, const int8_t *input_to_gate_weights, int32_t n_batch, int32_t n_input, - int32_t n_output, int32_t, int32_t *scratch) + int32_t n_output, int32_t, int32_t *scratch, ruy::Context *ruy_context) { MatrixParams<int8_t> lhs_params; lhs_params.order = Order::kRowMajor; @@ -571,8 +571,6 @@ void NeonCpuBackendGemm(const int8_t *input, const int32_t *bias, } // Below code is from tflite::cpu_backend_gemm::detail::GemmImplUsingRuy - ruy::Context *ruy_context = ruy_support::GetRuyContext(); - ruy::Matrix<int8_t> ruy_lhs; ruy::Matrix<int8_t> ruy_rhs; ruy::Matrix<int32_t> ruy_dst; @@ -851,13 +849,13 @@ void NeonMatrixBatchVectorMultiplyAccumulate(const int8_t *__restrict__ matrix, const int m_cols, const int8_t *__restrict__ vectors, const float *scaling_factors, int n_batch, int32_t *scratch, float *__restrict__ result, - int result_stride) + int result_stride, ruy::Context *ruy_context) { if (m_rows % 4 == 0 && result_stride == 1) { const int32_t *bias = static_cast<const int32_t *>(nullptr); NeonCpuBackendGemm(vectors, bias, matrix, n_batch, m_cols, m_rows, - /*output_zp =*/0, scratch); + /*output_zp =*/0, scratch, ruy_context); // Multiply by float scaling factors and write to result const int total_size = n_batch * m_rows; diff --git a/compute/cker/include/cker/PortableTensorUtils.h b/compute/cker/include/cker/PortableTensorUtils.h index 9769d4ba6..54714e214 100644 --- a/compute/cker/include/cker/PortableTensorUtils.h +++ b/compute/cker/include/cker/PortableTensorUtils.h @@ -20,6 +20,7 @@ #include "cker/Types.h" #include "cker/neon/neon_check.h" +#include <ruy/context.h> #include <cstring> #include <cmath> @@ -142,7 +143,7 @@ void PortableMatrixBatchVectorMultiplyAccumulate(const int8_t *__restrict__ matr const int8_t *__restrict__ vector, const float *scaling_factors, int n_batch, int32_t *, float *__restrict__ result, - int result_stride) + int result_stride, ruy::Context *) { PortableMatrixBatchVectorMultiplyAccumulate(matrix, m_rows, m_cols, vector, scaling_factors, n_batch, result, result_stride); diff --git a/compute/cker/include/cker/TensorUtils.h b/compute/cker/include/cker/TensorUtils.h index 6b23c0b30..e07c91239 100644 --- a/compute/cker/include/cker/TensorUtils.h +++ b/compute/cker/include/cker/TensorUtils.h @@ -73,10 +73,10 @@ void MatrixBatchVectorMultiplyAccumulate(const float *matrix, int m_rows, int m_ void MatrixBatchVectorMultiplyAccumulate(const int8_t *matrix, const int m_rows, const int m_cols, const int8_t *vectors, const float *scaling_factors, int n_batch, int32_t *scratch, float *result, - int result_stride) + int result_stride, ruy::Context *ruy_context) { NEON_OR_PORTABLE(MatrixBatchVectorMultiplyAccumulate, matrix, m_rows, m_cols, vectors, - scaling_factors, n_batch, scratch, result, result_stride); + scaling_factors, n_batch, scratch, result, result_stride, ruy_context); } void ZeroVector(float *vector, int v_size) { PortableZeroVector(vector, v_size); } diff --git a/compute/cker/include/cker/Types.h b/compute/cker/include/cker/Types.h index 41b1916cf..886ce5e5e 100644 --- a/compute/cker/include/cker/Types.h +++ b/compute/cker/include/cker/Types.h @@ -259,6 +259,12 @@ struct FullyConnectedParams // 
FullyConnectedWeightsFormat weights_format; }; +struct L2NormParams +{ + // uint8 inference params. + int32_t input_zero_point; +}; + struct GatherParams { int32_t axis; @@ -271,6 +277,14 @@ struct InstanceNormParams float float_activation_max; }; +struct ResizeBilinearParams +{ + int32_t output_height; + int32_t output_width; + bool align_corners; + bool half_pixel_centers; +}; + struct TransposeConvParams { PaddingType padding_type; @@ -325,6 +339,12 @@ struct SplitParams int16_t axis; }; +struct SplitVParams +{ + uint16_t num_split; + int16_t axis; +}; + struct FusedBatchNormParams { bool is_training; @@ -338,6 +358,11 @@ struct SpaceToBatchParams int32_t output_offset; }; +struct SpaceToDepthParams +{ + int32_t block_size; +}; + enum class Order { kColMajor, diff --git a/compute/cker/include/cker/Utils.h b/compute/cker/include/cker/Utils.h index b69d55c26..2abb998d0 100644 --- a/compute/cker/include/cker/Utils.h +++ b/compute/cker/include/cker/Utils.h @@ -123,6 +123,68 @@ inline int CountLeadingZeros(uint32_t integer_input) return leading_zeros; } +inline void GetInvSqrtQuantizedMultiplierExp(int32_t input, int reverse_shift, + int32_t *output_inv_sqrt, int *output_shift) +{ + assert(input >= 0); + if (input <= 1) + { + // Handle the input value 1 separately to avoid overflow in that case + // in the general computation below (b/143972021). Also handle 0 as if it + // were a 1. 0 is an invalid input here (divide by zero) and 1 is a valid + // but rare/unrealistic input value. We can expect both to occur in some + // incompletely trained models, but probably not in fully trained models. + *output_inv_sqrt = std::numeric_limits<std::int32_t>::max(); + *output_shift = 0; + return; + } + assert(input > 1); + *output_shift = 11; + while (input >= (1 << 29)) + { + input /= 4; + ++*output_shift; + } + const unsigned max_left_shift_bits = CountLeadingZeros(static_cast<uint32_t>(input)) - 1; + const unsigned max_left_shift_bit_pairs = max_left_shift_bits / 2; + const unsigned left_shift_bit_pairs = max_left_shift_bit_pairs - 1; + *output_shift -= left_shift_bit_pairs; + input <<= 2 * left_shift_bit_pairs; + assert(input >= (1 << 27)); + assert(input < (1 << 29)); + using gemmlowp::FixedPoint; + using gemmlowp::Rescale; + using gemmlowp::SaturatingRoundingMultiplyByPOT; + // Using 3 integer bits gives us enough room for the internal arithmetic in + // this Newton-Raphson iteration. + using F3 = FixedPoint<int32_t, 3>; + using F0 = FixedPoint<int32_t, 0>; + const F3 fixedpoint_input = F3::FromRaw(input >> 1); + const F3 fixedpoint_half_input = SaturatingRoundingMultiplyByPOT<-1>(fixedpoint_input); + const F3 fixedpoint_half_three = + GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT(F3, (1 << 28) + (1 << 27), 1.5); + // Newton-Raphson iteration + // Naive unoptimized starting guess: x = 1 + F3 x = F3::One(); + // Naive unoptimized number of iterations: 5 + for (int i = 0; i < 5; i++) + { + const F3 x3 = Rescale<3>(x * x * x); + x = Rescale<3>(fixedpoint_half_three * x - fixedpoint_half_input * x3); + } + const F0 fixedpoint_half_sqrt_2 = + GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT(F0, 1518500250, std::sqrt(2.) / 2.); + x = x * fixedpoint_half_sqrt_2; + *output_inv_sqrt = x.raw(); + if (*output_shift < 0) + { + *output_inv_sqrt <<= -*output_shift; + *output_shift = 0; + } + // Convert right shift (right is positive) to left shift. 
+ *output_shift *= reverse_shift; +} + // Comment from tensorflow lite: // // DO NOT USE THIS STRUCT FOR NEW FUNCTIONALITY BEYOND IMPLEMENTING diff --git a/compute/cker/include/cker/operation/BatchToSpaceND.h b/compute/cker/include/cker/operation/BatchToSpaceND.h new file mode 100644 index 000000000..e33b2fba5 --- /dev/null +++ b/compute/cker/include/cker/operation/BatchToSpaceND.h @@ -0,0 +1,133 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2017 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __NNFW_CKER_BATCH_TO_SPACE_ND_H__ +#define __NNFW_CKER_BATCH_TO_SPACE_ND_H__ + +#include "cker/Shape.h" + +#define UNUSED(x) ((void)(x)) + +namespace nnfw +{ +namespace cker +{ + +// Helper methods for BatchToSpaceND. +// `spatial_index_dim` specifies post-crop offset index in this spatial +// dimension, i.e. spatial offset introduced by flattening batch to spatial +// dimension minus the crop size at beginning. `block_shape_dim` is the block +// size in current dimension. `input_dim` and `output_dim` are input and output +// size of BatchToSpaceND operation in current dimension. +// Output start index is inclusive and end index is exclusive. +inline void GetIndexRange(int spatial_index_dim, int block_shape_dim, int input_dim, int output_dim, + int *start_index, int *end_index) +{ + // (*start_index) * block_shape_dim is effectively rounded up to the next + // multiple of block_shape_dim by the integer division. + *start_index = std::max(0, (-spatial_index_dim + block_shape_dim - 1) / block_shape_dim); + // Similarly, (*end_index) * block_shape_dim is rounded up too (note that + // end_index is exclusive). + *end_index = + std::min(input_dim, (output_dim - spatial_index_dim + block_shape_dim - 1) / block_shape_dim); +} + +template <typename T> +inline void BatchToSpaceND(const Shape &unextended_input1_shape, const T *input1_data, + const int32_t *block_shape_data, const int32_t *crops_data, + const Shape &unextended_output_shape, T *output_data) +{ + auto input_dim = unextended_input1_shape.DimensionsCount(); + auto output_dim = unextended_output_shape.DimensionsCount(); + + assert(input_dim == 3 || input_dim == 4); + assert(input_dim == output_dim); + + UNUSED(input_dim); + UNUSED(output_dim); + + // Extends the input/output shape from 3D to 4D if needed, NHC -> NH1C. 
+ auto extend_shape = [](const Shape &shape) { + if (shape.DimensionsCount() == 4) + { + return shape; + } + Shape new_shape(4, 1); + new_shape.SetDim(0, shape.Dims(0)); + new_shape.SetDim(1, shape.Dims(1)); + new_shape.SetDim(3, shape.Dims(2)); + return new_shape; + }; + const Shape input1_shape = extend_shape(unextended_input1_shape); + const Shape output_shape = extend_shape(unextended_output_shape); + + const int32_t output_width = output_shape.Dims(2); + const int32_t output_height = output_shape.Dims(1); + const int32_t output_batch_size = output_shape.Dims(0); + + const int32_t depth = input1_shape.Dims(3); + const int32_t input_width = input1_shape.Dims(2); + const int32_t input_height = input1_shape.Dims(1); + const int32_t input_batch_size = input1_shape.Dims(0); + + const int32_t block_shape_height = block_shape_data[0]; + const int32_t block_shape_width = block_shape_data[1]; + + const int32_t crops_top = crops_data[0]; + const int32_t crops_left = crops_data[2]; + + for (int in_batch = 0; in_batch < input_batch_size; ++in_batch) + { + const int out_batch = in_batch % output_batch_size; + const int spatial_offset = in_batch / output_batch_size; + + int in_h_start = 0; + int in_h_end = 0; + // GetIndexRange ensures start and end indices are in [0, output_height). + GetIndexRange(spatial_offset / block_shape_width - crops_top, block_shape_height, input_height, + output_height, &in_h_start, &in_h_end); + + for (int in_h = in_h_start; in_h < in_h_end; ++in_h) + { + const int out_h = in_h * block_shape_height + spatial_offset / block_shape_width - crops_top; + assert(out_h >= 0); + assert(out_h < output_height); + + int in_w_start = 0; + int in_w_end = 0; + // GetIndexRange ensures start and end indices are in [0, output_width). + GetIndexRange(spatial_offset % block_shape_width - crops_left, block_shape_width, input_width, + output_width, &in_w_start, &in_w_end); + + for (int in_w = in_w_start; in_w < in_w_end; ++in_w) + { + const int out_w = + in_w * block_shape_width + spatial_offset % block_shape_width - crops_left; + assert(out_w >= 0); + assert(out_w < output_width); + T *out = output_data + Offset(output_shape, out_batch, out_h, out_w, 0); + const T *in = input1_data + Offset(input1_shape, in_batch, in_h, in_w, 0); + memcpy(out, in, depth * sizeof(T)); + } + } + } +} + +} // namespace cker +} // namespace nnfw + +#endif // __NNFW_CKER_BATCH_TO_SPACE_ND_H__ diff --git a/compute/cker/include/cker/operation/FullyConnected.h b/compute/cker/include/cker/operation/FullyConnected.h index 9bcf3fd82..4280c9ae2 100644 --- a/compute/cker/include/cker/operation/FullyConnected.h +++ b/compute/cker/include/cker/operation/FullyConnected.h @@ -18,6 +18,7 @@ #ifndef __NNFW_CKER_FULLY_CONNECTED_H__ #define __NNFW_CKER_FULLY_CONNECTED_H__ +#include <ruy/context.h> #include "cker/Shape.h" #include "cker/Types.h" #include "cker/Utils.h" @@ -78,8 +79,11 @@ inline void FullyConnected(const FullyConnectedParams ¶ms, const Shape &inpu MatrixBatchVectorMultiplyAccumulate(weights_data, num_units, input_size, input_data, batch_size, output_data, /*result_stride=*/1); - // Apply activation function - ApplyActivationToVector(output_data, batch_size * num_units, params.activation, output_data); + if (params.activation != FusedActivationFunctionType::kNone) + { + // Apply activation function + ApplyActivationToVector(output_data, batch_size * num_units, params.activation, output_data); + } } inline void FullyConnected(const FullyConnectedParams ¶ms, const Shape &input_shape, @@ -140,7 +144,7 @@ inline void 
FullyConnectedHybrid(const FullyConnectedParams ¶ms, const Shape const float *input_data, const Shape &filter_shape, const int8_t *filter_data, const Shape &, const float *bias_data, const Shape &output_shape, float *output_data, - FCTempArena &temp_arena) + FCTempArena &temp_arena, ruy::Context *ruy_context) { int total_input_size = input_shape.FlatSize(); const int input_size = filter_shape.Dims(1); @@ -186,19 +190,72 @@ inline void FullyConnectedHybrid(const FullyConnectedParams ¶ms, const Shape int32_t *scratch = temp_arena.accum_scratch.data(); MatrixBatchVectorMultiplyAccumulate(filter_data, num_units, input_size, quant_data, scaling_factors_ptr, batch_size, scratch, output_data, - /*result_stride=*/1); + /*result_stride=*/1, ruy_context); #else MatrixBatchVectorMultiplyAccumulate(filter_data, num_units, input_size, quant_data, scaling_factors_ptr, batch_size, output_data, /*result_stride=*/1); + UNUSED_RELEASE(ruy_context); UNUSED_RELEASE(output_shape); #endif // Apply activation function to floats. - ApplyActivationToVector(output_data, batch_size * num_units, params.activation, output_data); + if (params.activation != FusedActivationFunctionType::kNone) + { + // Apply activation function + ApplyActivationToVector(output_data, batch_size * num_units, params.activation, output_data); + } return; } +inline void FullyConnectedSparseWeight(const FullyConnectedParams ¶ms, const Shape &input_shape, + const float *input_data, const Shape &weights_shape, + const float *weights_data, const Shape &bias_shape, + const float *bias_data, const Shape &output_shape, + float *output_data, int w0_size, const uint16_t *w1_segments, + const uint16_t *w1_indices) +{ + UNUSED_RELEASE(params); + UNUSED_RELEASE(input_shape); + + assert(weights_shape.DimensionsCount() == 2); + assert(output_shape.DimensionsCount() == 2); + + const int output_dims_count = output_shape.DimensionsCount(); + const int weights_dims_count = weights_shape.DimensionsCount(); + const int batches = FlatSizeSkipDim(output_shape, output_dims_count - 1); + const int output_depth = + MatchingDim(weights_shape, weights_dims_count - 2, output_shape, output_dims_count - 1); + const int accum_depth = weights_shape.Dims(weights_dims_count - 1); + + UNUSED_RELEASE(bias_shape); + if (bias_data) + { + VectorBatchVectorAssign(bias_data, output_depth, batches, output_data); + } + else + { + ZeroVector(output_data, batches * output_depth); + } + for (int b = 0; b < batches; ++b) + { + for (int idx_0 = 0; idx_0 < w0_size; ++idx_0) + { + for (int pw1 = w1_segments[idx_0]; pw1 < w1_segments[idx_0 + 1]; ++pw1) + { + int idx_1 = w1_indices[pw1]; + output_data[b * output_depth + idx_0] += + weights_data[pw1] * input_data[b * accum_depth + idx_1]; + } + } + } + if (params.activation != FusedActivationFunctionType::kNone) + { + // Apply activation function + ApplyActivationToVector(output_data, batches * output_depth, params.activation, output_data); + } +} + } // namespace cker } // namespace nnfw diff --git a/compute/cker/include/cker/operation/Helper/PhiloxRandom.h b/compute/cker/include/cker/operation/Helper/PhiloxRandom.h new file mode 100644 index 000000000..8e8879ce9 --- /dev/null +++ b/compute/cker/include/cker/operation/Helper/PhiloxRandom.h @@ -0,0 +1,276 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2015 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef TENSORFLOW_CORE_LIB_RANDOM_PHILOX_RANDOM_H_ +#define TENSORFLOW_CORE_LIB_RANDOM_PHILOX_RANDOM_H_ + +#include <stdlib.h> + +#include "cker/Types.h" +#include "cker/Shape.h" +#include "cker/Utils.h" + +// Function qualifiers that need to work on both CPU and GPU. +#if defined(__CUDACC__) || defined(__HIPCC__) +// For nvcc. +#define PHILOX_DEVICE_FUNC __host__ __device__ +#define PHILOX_INLINE __inline__ +#else +// For non-nvcc. +#define PHILOX_DEVICE_FUNC +#define PHILOX_INLINE inline +#endif +#define PHILOX_DEVICE_INLINE PHILOX_DEVICE_FUNC PHILOX_INLINE + +#include <math.h> + +namespace nnfw +{ +namespace cker +{ +namespace random +{ + +// A class that represents an inline array. It can be used on both CPU and GPU, +// and also trivially copyable between CPU and GPU. +// Arguments: +// T: the array element type; +// ElementCount: the fixed size of the array; +template <typename T, int ElementCount> class Array +{ +public: + static constexpr int kElementCount = ElementCount; + PHILOX_DEVICE_INLINE Array() + { + for (int i = 0; i < ElementCount; ++i) + { + data_[i] = T(0); + } + } + + PHILOX_DEVICE_INLINE const T &operator[](int index) const { return data_[index]; } + + PHILOX_DEVICE_INLINE T &operator[](int index) { return data_[index]; } + + size_t size() const { return ElementCount; } + +private: + T data_[ElementCount]; +}; + +// A class that encapsulates all the states for a random number generator using +// the philox_4x32_10 algorithm. Each invocation returns a 128-bit random bits +// in the form of four uint32. +// There are multiple variants of this algorithm, we picked the 4x32_10 version +// that is most suited for our applications. +// Since this class is meant to be copied between CPU to GPU, it maintains a +// value semantics. +// +// For example: To use this class and populate an array of 1024 randoms on CPU +// with two threads, +// +// void Fill(PhiloxRandom rnd, uint32* output, int start, int limit) { +// assert(start % 4 == 0); +// assert(limit % 4 == 0); +// rnd.Skip(start / 4); +// for (int i = start; i < limit; i += 4) { +// auto sample = rnd(); +// ... copy sample[0..3] to output[i..i+3] +// } +// } +// +// PhiloxRandom rng(seed); +// PhiloxRandom rng_copy = rng; +// rng.Skip(1000/4); +// +// ... schedule Fill(rng_copy, output, 0, 512) in thread 1; +// ... schedule Fill(rng_copy, output, 512, 1024) in thread 2; +// ... wait for thread 1 & 2 to finish executing Fill(). +// +// NOTE: +// 1. PhiloxRandom is trivially copyable. +// 2. PhiloxRandom is compilable by gcc and nvcc. +class PhiloxRandom +{ +public: + using ResultType = Array<uint32_t, 4>; + using ResultElementType = uint32_t; + // The number of elements that will be returned. + static constexpr int kResultElementCount = 4; + // Cost of generation of a single element (in cycles). + static constexpr int kElementCost = 10; + // The type for the 64-bit key stored in the form of two 32-bit uint + // that are used in the diffusion process. 
+ using Key = Array<uint32_t, 2>; + + PHILOX_DEVICE_INLINE + PhiloxRandom() {} + + PHILOX_DEVICE_INLINE + explicit PhiloxRandom(uint64_t seed) + { + key_[0] = static_cast<uint32_t>(seed); + key_[1] = static_cast<uint32_t>(seed >> 32); + } + + PHILOX_DEVICE_INLINE + explicit PhiloxRandom(uint64_t seed_lo, uint64_t seed_hi) + { + key_[0] = static_cast<uint32_t>(seed_lo); + key_[1] = static_cast<uint32_t>(seed_lo >> 32); + counter_[2] = static_cast<uint32_t>(seed_hi); + counter_[3] = static_cast<uint32_t>(seed_hi >> 32); + } + + PHILOX_DEVICE_INLINE + PhiloxRandom(ResultType counter, Key key) : counter_(counter), key_(key) {} + + PHILOX_DEVICE_INLINE + ResultType const &counter() const { return counter_; } + + PHILOX_DEVICE_INLINE + Key const &key() const { return key_; } + + // Skip the specified number of samples of 128-bits in the current stream. + PHILOX_DEVICE_INLINE + void Skip(uint64_t count) + { + const uint32_t count_lo = static_cast<uint32_t>(count); + uint32_t count_hi = static_cast<uint32_t>(count >> 32); + + counter_[0] += count_lo; + if (counter_[0] < count_lo) + { + ++count_hi; + } + + counter_[1] += count_hi; + if (counter_[1] < count_hi) + { + if (++counter_[2] == 0) + { + ++counter_[3]; + } + } + } + + // Returns a group of four random numbers using the underlying Philox + // algorithm. + PHILOX_DEVICE_INLINE ResultType operator()() + { + ResultType counter = counter_; + Key key = key_; + + // Run the single rounds for ten times. Manually unrolling the loop + // for better performance. + counter = ComputeSingleRound(counter, key); + RaiseKey(&key); + counter = ComputeSingleRound(counter, key); + RaiseKey(&key); + counter = ComputeSingleRound(counter, key); + RaiseKey(&key); + counter = ComputeSingleRound(counter, key); + RaiseKey(&key); + counter = ComputeSingleRound(counter, key); + RaiseKey(&key); + counter = ComputeSingleRound(counter, key); + RaiseKey(&key); + counter = ComputeSingleRound(counter, key); + RaiseKey(&key); + counter = ComputeSingleRound(counter, key); + RaiseKey(&key); + counter = ComputeSingleRound(counter, key); + RaiseKey(&key); + counter = ComputeSingleRound(counter, key); + + SkipOne(); + + return counter; + } + +private: + // We use the same constants as recommended by the original paper. + static constexpr uint32_t kPhiloxW32A = 0x9E3779B9; + static constexpr uint32_t kPhiloxW32B = 0xBB67AE85; + static constexpr uint32_t kPhiloxM4x32A = 0xD2511F53; + static constexpr uint32_t kPhiloxM4x32B = 0xCD9E8D57; + + // Helper function to skip the next sample of 128-bits in the current stream. + PHILOX_DEVICE_INLINE void SkipOne() + { + if (++counter_[0] == 0) + { + if (++counter_[1] == 0) + { + if (++counter_[2] == 0) + { + ++counter_[3]; + } + } + } + } + + // Helper function to return the lower and higher 32-bits from two 32-bit + // integer multiplications. + PHILOX_DEVICE_INLINE + static void MultiplyHighLow(uint32_t a, uint32_t b, uint32_t *result_low, uint32_t *result_high) + { +#ifndef __CUDA_ARCH__ + const uint64_t product = static_cast<uint64_t>(a) * b; + *result_low = static_cast<uint32_t>(product); + *result_high = static_cast<uint32_t>(product >> 32); +#else + *result_low = a * b; + *result_high = __umulhi(a, b); +#endif + } + + // Helper function for a single round of the underlying Philox algorithm. 
+ PHILOX_DEVICE_INLINE static ResultType ComputeSingleRound(const ResultType &counter, + const Key &key) + { + uint32_t lo0; + uint32_t hi0; + MultiplyHighLow(kPhiloxM4x32A, counter[0], &lo0, &hi0); + + uint32_t lo1; + uint32_t hi1; + MultiplyHighLow(kPhiloxM4x32B, counter[2], &lo1, &hi1); + + ResultType result; + result[0] = hi1 ^ counter[1] ^ key[0]; + result[1] = lo1; + result[2] = hi0 ^ counter[3] ^ key[1]; + result[3] = lo0; + return result; + } + + PHILOX_DEVICE_INLINE void RaiseKey(Key *key) + { + (*key)[0] += kPhiloxW32A; + (*key)[1] += kPhiloxW32B; + } + +private: + ResultType counter_; + Key key_; +}; + +} // namespace random +} // namespace cker +} // namespace nnfw +#endif // TENSORFLOW_CORE_LIB_RANDOM_PHILOX_RANDOM_H_ diff --git a/compute/cker/include/cker/operation/Helper/RandomDistributions.h b/compute/cker/include/cker/operation/Helper/RandomDistributions.h new file mode 100644 index 000000000..baeafd7c9 --- /dev/null +++ b/compute/cker/include/cker/operation/Helper/RandomDistributions.h @@ -0,0 +1,778 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2015 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __NNFW_CKER_HELPER_RANDOM_DISTRIBUTIONS_H__ +#define __NNFW_CKER_HELPER_RANDOM_DISTRIBUTIONS_H__ + +#include <string.h> + +#include <cmath> + +#include <algorithm> +#include <type_traits> + +#include "cker/Types.h" +#include "cker/Shape.h" +#include "cker/Utils.h" + +#include "cker/eigen/EigenSupport.h" +#include "cker/operation/Helper/PhiloxRandom.h" + +namespace nnfw +{ +namespace cker +{ +namespace random +{ + +// Helper function to convert a 16-bit integer to a half between [0..1). +PHILOX_DEVICE_INLINE Eigen::half Uint16ToHalf(uint16_t x); +// Helper function to convert a 16-bit integer to a bfloat16 between [0..1). +// PHILOX_DEVICE_INLINE bfloat16 Uint16ToGfloat16(uint16 x); +// Helper function to convert a 32-bit integer to a float between [0..1). +PHILOX_DEVICE_INLINE float Uint32ToFloat(uint32_t x); +// Helper function to convert two 32-bit integers to a double between [0..1). +PHILOX_DEVICE_INLINE double Uint64ToDouble(uint32_t x0, uint32_t x1); + +// Computes a + b. Requires that the result is representable in the destination +// type and that b is not maximal (i.e. b + 1 is not 0). Notably, the addend b +// need *not* be representable in that type. (The condition on b excludes the +// extremal case INT_MIN + UINT_MAX = INT_MAX, which this function cannot +// compute.) +template <typename Int> +PHILOX_DEVICE_INLINE Int SignedAdd(Int a, typename std::make_unsigned<Int>::type b) +{ + // Implementation note: both b_div_2 and b - b_div_2 are positive and + // representable as Int. + auto b_div_2 = b >> 1; + return a + static_cast<Int>(b_div_2) + static_cast<Int>(b - b_div_2); +} + +// A class that generates uniform distribution random numbers from the +// underlying random integer generator. 
+// Arguments: +// Generator: a generator type that returns a number of uint32 upon each +// invocation. It needs to define kResultElementCount for the +// sample count for each invocation, and ResultType for the +// actual returned sample type. +// RealType: the data type of the real numbers that will be returned by the +// distribution. This could be either float or double for now. +// This class is meant to be implemented through specialization. The default +// is not defined by design. +template <class Generator, typename RealType> class UniformDistribution; + +template <class Generator> class UniformDistribution<Generator, Eigen::half> +{ +public: + // The number of elements that will be returned. + static constexpr int kResultElementCount = Generator::kResultElementCount; + // Cost of generation of a single element (in cycles). + static constexpr int kElementCost = 3; + // Indicate that this distribution may take variable number of samples + // during the runtime. + static constexpr bool kVariableSamplesPerOutput = false; + typedef Array<Eigen::half, kResultElementCount> ResultType; + typedef Eigen::half ResultElementType; + + PHILOX_DEVICE_INLINE + ResultType operator()(Generator *gen) + { + typename Generator::ResultType sample = (*gen)(); + ResultType result; + for (int i = 0; i < kResultElementCount; ++i) + { + result[i] = Uint16ToHalf(sample[i]); // Truncate the upper 16 bits. + } + return result; + } +}; + +template <class Generator> class UniformDistribution<Generator, float> +{ +public: + // The number of elements that will be returned. + static constexpr int kResultElementCount = Generator::kResultElementCount; + // Cost of generation of a single element (in cycles). + static constexpr int kElementCost = 3; + // Indicate that this distribution may take variable number of samples + // during the runtime. + static constexpr bool kVariableSamplesPerOutput = false; + typedef Array<float, kResultElementCount> ResultType; + typedef float ResultElementType; + + PHILOX_DEVICE_INLINE + ResultType operator()(Generator *gen) + { + typename Generator::ResultType sample = (*gen)(); + ResultType result; + for (int i = 0; i < kResultElementCount; ++i) + { + result[i] = Uint32ToFloat(sample[i]); + } + return result; + } +}; + +template <class Generator> class UniformDistribution<Generator, double> +{ +public: + // The number of elements that will be returned. + static constexpr int kResultElementCount = Generator::kResultElementCount / 2; + // Cost of generation of a single element (in cycles). + static constexpr int kElementCost = 3; + // Indicate that this distribution may take variable number of samples + // during the runtime. + static constexpr bool kVariableSamplesPerOutput = false; + typedef Array<double, kResultElementCount> ResultType; + typedef double ResultElementType; + + PHILOX_DEVICE_INLINE + ResultType operator()(Generator *gen) + { + typename Generator::ResultType sample = (*gen)(); + ResultType result; + for (int i = 0; i < kResultElementCount; ++i) + { + result[i] = Uint64ToDouble(sample[2 * i], sample[2 * i + 1]); + } + return result; + } +}; + +template <class Generator> class UniformDistribution<Generator, int32_t> +{ +public: + // The number of elements that will be returned. + static constexpr int kResultElementCount = Generator::kResultElementCount; + // Cost of generation of a single element (in cycles). + static constexpr int kElementCost = 3; + // Indicate that this distribution may take variable number of samples + // during the runtime. 
+ static constexpr bool kVariableSamplesPerOutput = false; + typedef Array<int32_t, kResultElementCount> ResultType; + typedef int32_t ResultElementType; + + // Must have lo < hi + UniformDistribution(int32_t lo, int32_t hi) + : lo_(lo), range_(static_cast<uint32_t>(hi) - static_cast<uint32_t>(lo)) + { + } + + PHILOX_DEVICE_INLINE + ResultType operator()(Generator *gen) + { + typename Generator::ResultType sample = (*gen)(); + ResultType result; + for (int i = 0; i < kResultElementCount; ++i) + { + result[i] = SignedAdd(lo_, sample[i] % range_); + } + return result; + } + +private: + // Note that lo_ is intentionally signed while range_ is intentionally + // unsigned. This is because hi - lo can overflow signed integers if + // lo < 0 < hi, but always fits in unsigned. + int32_t lo_; + int32_t range_; +}; + +template <class Generator> class UniformDistribution<Generator, int64_t> +{ +public: + // The number of elements that will be returned. + static constexpr int kResultElementCount = Generator::kResultElementCount / 2; + // Cost of generation of a single element (in cycles). + static constexpr int kElementCost = 3; + // Indicate that this distribution may take variable number of samples + // during the runtime. + static constexpr bool kVariableSamplesPerOutput = false; + typedef Array<int64_t, kResultElementCount> ResultType; + typedef int64_t ResultElementType; + + // Must have lo < hi + UniformDistribution(int64_t lo, int64_t hi) + : lo_(lo), range_(static_cast<uint64_t>(hi) - static_cast<uint64_t>(lo)) + { + } + + PHILOX_DEVICE_INLINE + ResultType operator()(Generator *gen) + { + typename Generator::ResultType sample = (*gen)(); + ResultType result; + for (int i = 0; i < kResultElementCount; ++i) + { + auto bits = sample[2 * i] | static_cast<uint64_t>(sample[2 * i + 1]) << 32; + result[i] = SignedAdd(lo_, bits % range_); + } + return result; + } + +private: + // Note that lo_ is intentionally signed while range_ is intentionally + // unsigned. This is because hi - lo can overflow signed integers if + // lo < 0 < hi, but always fits in unsigned. + int64_t lo_; + uint64_t range_; +}; + +// Similar to `UniformDistribution`, except that instead of generating numbers +// in the range [low, high), it generates numbers covering the whole range of +// the integer type. +template <typename Generator, typename IntType> class UniformFullIntDistribution; + +template <typename Generator, typename IntType> class UniformFullIntDistribution32 +{ +public: + // The number of elements that will be returned. + static constexpr int kResultElementCount = Generator::kResultElementCount; + // Cost of generation of a single element (in cycles). + static constexpr int kElementCost = 3; + // Indicate that this distribution may take variable number of samples + // during the runtime. + static constexpr bool kVariableSamplesPerOutput = false; + typedef Array<IntType, kResultElementCount> ResultType; + typedef IntType ResultElementType; + + PHILOX_DEVICE_INLINE + ResultType operator()(Generator *gen) + { + typename Generator::ResultType sample = (*gen)(); + ResultType result; + for (int i = 0; i < kResultElementCount; ++i) + { + result[i] = sample[i]; + } + return result; + } +}; + +template <typename Generator, typename IntType> class UniformFullIntDistribution64 +{ +public: + // The number of elements that will be returned. + static constexpr int kResultElementCount = Generator::kResultElementCount / 2; + // Cost of generation of a single element (in cycles). 
+ static constexpr int kElementCost = 3; + // Indicate that this distribution may take variable number of samples + // during the runtime. + static constexpr bool kVariableSamplesPerOutput = false; + typedef Array<IntType, kResultElementCount> ResultType; + typedef IntType ResultElementType; + + PHILOX_DEVICE_INLINE + ResultType operator()(Generator *gen) + { + typename Generator::ResultType sample = (*gen)(); + ResultType result; + for (int i = 0; i < kResultElementCount; ++i) + { + result[i] = sample[2 * i] | static_cast<uint64_t>(sample[2 * i + 1]) << 32; + } + return result; + } +}; + +template <typename Generator> +class UniformFullIntDistribution<Generator, int32_t> + : public UniformFullIntDistribution32<Generator, int32_t> +{ +}; +template <typename Generator> +class UniformFullIntDistribution<Generator, uint32_t> + : public UniformFullIntDistribution32<Generator, uint32_t> +{ +}; +template <typename Generator> +class UniformFullIntDistribution<Generator, int64_t> + : public UniformFullIntDistribution64<Generator, int64_t> +{ +}; +template <typename Generator> +class UniformFullIntDistribution<Generator, uint64_t> + : public UniformFullIntDistribution64<Generator, uint64_t> +{ +}; + +// A class that adapts the underlying native multiple samples to return a single +// sample at a time. +template <class Generator> class SingleSampleAdapter +{ +public: + // The number of elements that will be returned. + static constexpr int kResultElementCount = 1; + // The number of elements that will be returned by the underlying generator. + static constexpr int kNativeElementCount = Generator::kResultElementCount; + typedef typename Generator::ResultElementType ResultType; + typedef typename Generator::ResultElementType ResultElementType; + + PHILOX_DEVICE_INLINE + explicit SingleSampleAdapter(Generator *gen) + : generator_(gen), used_result_index_(Generator::kResultElementCount) + { + } + + PHILOX_DEVICE_INLINE + ResultType operator()() + { + if (used_result_index_ == Generator::kResultElementCount) + { + unused_results_ = (*generator_)(); + used_result_index_ = 0; + } + + return unused_results_[used_result_index_++]; + } + + PHILOX_DEVICE_INLINE + void Skip(uint64_t num_skips) + { + if (!num_skips) + { + return; + } + int num_unused_results = kNativeElementCount - used_result_index_; + if (num_skips <= num_unused_results) + { + used_result_index_ += num_skips; + return; + } + num_skips -= num_unused_results; + used_result_index_ = kNativeElementCount; + SkipFromGenerator(num_skips / kNativeElementCount); + num_skips = num_skips % kNativeElementCount; + if (num_skips) + { + unused_results_ = (*generator_)(); + used_result_index_ = num_skips; + } + } + +private: + // This implementation iteratively skips over `num_skips` samples + // from `generator_`. There is an O(1) implementation for PhiloxRandom + // in random_distributions.cc. + PHILOX_DEVICE_INLINE + void SkipFromGenerator(uint64_t num_skips) + { + while (num_skips--) + { + (*generator_)(); + } + } + + Generator *generator_; + typename Generator::ResultType unused_results_; + int used_result_index_; +}; + +// A class that generates unit normal distribution random numbers from the +// underlying random integer generator. +// Arguments: +// Generator: a generator type that returns a number of uint32 upon each +// each invocation. It needs to define kResultElementCount for the +// sample count for each invocation, and ResultType for actual +// returned sample type. 
+// RealType: the data type of the real numbers that will be returned by the +// distribution. This could be either float or double for now. +// This class is meant to be implemented through specialization. The default +// is not defined by design. +template <class Generator, typename RealType> class NormalDistribution; + +PHILOX_DEVICE_INLINE +void BoxMullerFloat(uint32_t x0, uint32_t x1, float *f0, float *f1); + +PHILOX_DEVICE_INLINE +void BoxMullerDouble(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3, double *d0, double *d1); + +// Exactly like the float version, except that we convert to half afterwards; +// since we don't have half-precision sin/cos even on GPUs, there's nothing to +// gain from working in half internally. +template <class Generator> class NormalDistribution<Generator, Eigen::half> +{ +public: + // The number of elements that will be returned. + static constexpr int kResultElementCount = Generator::kResultElementCount; + // Cost of generation of a single element (in cycles). + static constexpr int kElementCost = 70; + // Indicate that this distribution may take variable number of samples + // during the runtime. + static constexpr bool kVariableSamplesPerOutput = false; + typedef Array<Eigen::half, kResultElementCount> ResultType; + typedef Eigen::half ResultElementType; + + PHILOX_DEVICE_INLINE + ResultType operator()(Generator *gen) + { + typename Generator::ResultType sample = (*gen)(); + ResultType result; + for (int i = 0; i < kResultElementCount; i += 2) + { + float f[2]; + BoxMullerFloat(sample[i], sample[i + 1], &f[0], &f[1]); + result[i] = Eigen::half(f[0]); + result[i + 1] = Eigen::half(f[1]); + } + return result; + } +}; + +template <class Generator> class NormalDistribution<Generator, float> +{ +public: + // The number of elements that will be returned. + static constexpr int kResultElementCount = Generator::kResultElementCount; + // Cost of generation of a single element (in cycles). + static constexpr int kElementCost = 70; + // Indicate that this distribution may take variable number of samples + // during the runtime. + static constexpr bool kVariableSamplesPerOutput = false; + typedef Array<float, kResultElementCount> ResultType; + typedef float ResultElementType; + + PHILOX_DEVICE_INLINE + ResultType operator()(Generator *gen) + { + typename Generator::ResultType sample = (*gen)(); + ResultType result; + for (int i = 0; i < kResultElementCount; i += 2) + { + BoxMullerFloat(sample[i], sample[i + 1], &result[i], &result[i + 1]); + } + return result; + } +}; + +template <class Generator> class NormalDistribution<Generator, double> +{ +public: + // The number of elements that will be returned. + static constexpr int kResultElementCount = Generator::kResultElementCount / 2; + // Cost of generation of a single element (in cycles). + static constexpr int kElementCost = 70; + // Indicate that this distribution may take variable number of samples + // during the runtime. 
+ static constexpr bool kVariableSamplesPerOutput = false; + typedef Array<double, kResultElementCount> ResultType; + typedef double ResultElementType; + + PHILOX_DEVICE_INLINE + ResultType operator()(Generator *gen) + { + typename Generator::ResultType sample = (*gen)(); + ResultType result; + for (int i = 0; i < kResultElementCount; i += 2) + { + const int i2 = 2 * i; + BoxMullerDouble(sample[i2], sample[i2 + 1], sample[i2 + 2], sample[i2 + 3], &result[i], + &result[i + 1]); + } + return result; + } +}; + +// A class that returns standard normal distribution between +// [-kTruncateValue, kTruncateValue]. +// Arguments: +// Generator: a generator type that returns a number of uint32 upon each +// each invocation. It needs to define kResultElementCount for the +// sample count for each invocation, and ResultType for actual +// returned sample type. +// RealType: the data type of the real numbers that will be returned by the +// distribution. This could be either float or double for now. +// This class is meant to be implemented through specialization. The default +// is not defined by design. +template <class SingleSampleGenerator, typename RealType> class TruncatedNormalDistribution; + +// Exactly like the float version, except that we convert to half afterwards; +// since we don't have half-precision sin/cos even on GPUs, there's nothing to +// gain from working in half internally. +template <class SingleSampleGenerator> +class TruncatedNormalDistribution<SingleSampleGenerator, Eigen::half> +{ +public: + // The number of elements that will be returned. + static constexpr int kResultElementCount = SingleSampleGenerator::kNativeElementCount; + // Cost of generation of a single element (in cycles). + static constexpr int kElementCost = 90; + // Indicate that this distribution may take variable number of samples + // during the runtime. + static constexpr bool kVariableSamplesPerOutput = true; + // The threshold where the normal distribution is truncated. + const float kTruncateValue = 2.0f; + + typedef Array<Eigen::half, kResultElementCount> ResultType; + typedef Eigen::half ResultElementType; + + PHILOX_DEVICE_INLINE + ResultType operator()(SingleSampleGenerator *gen) + { + ResultType results; + int index = 0; + while (true) + { + // Repeatedly take samples from the normal distribution, until we have + // the desired number of elements that fall within the pre-defined cutoff + // threshold. + const uint32_t x0 = (*gen)(); + const uint32_t x1 = (*gen)(); + float f[2]; + BoxMullerFloat(x0, x1, &f[0], &f[1]); + + if (Eigen::numext::abs(f[0]) < kTruncateValue) + { + results[index++] = Eigen::half(f[0]); + if (index >= kResultElementCount) + { + return results; + } + } + if (Eigen::numext::abs(f[1]) < kTruncateValue) + { + results[index++] = Eigen::half(f[1]); + if (index >= kResultElementCount) + { + return results; + } + } + } + } +}; + +// Partial specialization for float. +template <class SingleSampleGenerator> +class TruncatedNormalDistribution<SingleSampleGenerator, float> +{ +public: + // The number of elements that will be returned. + static constexpr int kResultElementCount = SingleSampleGenerator::kNativeElementCount; + // Cost of generation of a single element (in cycles). + static constexpr int kElementCost = 90; + // Indicate that this distribution may take variable number of samples + // during the runtime. + static constexpr bool kVariableSamplesPerOutput = true; + // The threshold where the normal distribution is truncated. 
+ const float kTruncateValue = 2.0f; + + typedef Array<float, kResultElementCount> ResultType; + typedef float ResultElementType; + + PHILOX_DEVICE_INLINE + ResultType operator()(SingleSampleGenerator *gen) + { + ResultType results; + int index = 0; + while (true) + { + // Repeatedly take samples from the normal distribution, until we have + // the desired number of elements that fall within the pre-defined cutoff + // threshold. + const uint32_t x0 = (*gen)(); + const uint32_t x1 = (*gen)(); + float f[2]; + BoxMullerFloat(x0, x1, &f[0], &f[1]); + + if (Eigen::numext::abs(f[0]) < kTruncateValue) + { + results[index++] = f[0]; + if (index >= kResultElementCount) + { + return results; + } + } + if (Eigen::numext::abs(f[1]) < kTruncateValue) + { + results[index++] = f[1]; + if (index >= kResultElementCount) + { + return results; + } + } + } + } +}; + +// Partial specialization for double. +template <class SingleSampleGenerator> +class TruncatedNormalDistribution<SingleSampleGenerator, double> +{ +public: + // The number of elements that will be returned. + static constexpr int kResultElementCount = (SingleSampleGenerator::kNativeElementCount > 1) + ? SingleSampleGenerator::kNativeElementCount / 2 + : 1; + // Cost of generation of a single element (in cycles). + static constexpr int kElementCost = 90; + // Indicate that this distribution may take variable number of samples + // during the runtime. + static constexpr bool kVariableSamplesPerOutput = true; + typedef Array<double, kResultElementCount> ResultType; + typedef double ResultElementType; + const double kTruncateValue = 2.0; + + PHILOX_DEVICE_INLINE + ResultType operator()(SingleSampleGenerator *gen) + { + ResultType results; + int index = 0; + while (1) + { + const uint32_t x0 = (*gen)(); + const uint32_t x1 = (*gen)(); + const uint32_t x2 = (*gen)(); + const uint32_t x3 = (*gen)(); + double d[2]; + BoxMullerDouble(x0, x1, x2, x3, &d[0], &d[1]); + + if (Eigen::numext::abs(d[0]) < kTruncateValue) + { + results[index++] = d[0]; + if (index >= kResultElementCount) + { + return results; + } + } + if (Eigen::numext::abs(d[1]) < kTruncateValue) + { + results[index++] = d[1]; + if (index >= kResultElementCount) + { + return results; + } + } + } + } +}; + +// Helper function to convert two 32-bit uniform integers to two floats +// under the unit normal distribution. +PHILOX_DEVICE_INLINE +void BoxMullerFloat(uint32_t x0, uint32_t x1, float *f0, float *f1) +{ + // This function implements the Box-Muller transform: + // http://en.wikipedia.org/wiki/Box%E2%80%93Muller_transform#Basic_form + // Do not send a really small number to log(). + // We cannot mark "epsilon" as "static const" because NVCC would complain + const float epsilon = 1.0e-7f; + float u1 = Uint32ToFloat(x0); + if (u1 < epsilon) + { + u1 = epsilon; + } + const float v1 = 2.0f * M_PI * Uint32ToFloat(x1); + const float u2 = Eigen::numext::sqrt(-2.0f * Eigen::numext::log(u1)); +#if defined(TENSORFLOW_USE_SYCL) || !defined(__linux__) + *f0 = Eigen::numext::sin(v1); + *f1 = Eigen::numext::cos(v1); +#else + sincosf(v1, f0, f1); +#endif + *f0 *= u2; + *f1 *= u2; +} + +// Helper function to convert four 32-bit uniform integers to two doubles +// under the unit normal distribution. +PHILOX_DEVICE_INLINE +void BoxMullerDouble(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3, double *d0, double *d1) +{ + // This function implements the Box-Muller transform: + // http://en.wikipedia.org/wiki/Box%E2%80%93Muller_transform#Basic_form + // Do not send a really small number to log(). 
+ // We cannot mark "epsilon" as "static const" because NVCC would complain + const double epsilon = 1.0e-7; + double u1 = Uint64ToDouble(x0, x1); + if (u1 < epsilon) + { + u1 = epsilon; + } + const double v1 = 2 * M_PI * Uint64ToDouble(x2, x3); + const double u2 = Eigen::numext::sqrt(-2.0 * Eigen::numext::log(u1)); +#if defined(TENSORFLOW_USE_SYCL) || !defined(__linux__) + *d0 = Eigen::numext::sin(v1); + *d1 = Eigen::numext::cos(v1); +#else + sincos(v1, d0, d1); +#endif + *d0 *= u2; + *d1 *= u2; +} + +// Helper function to convert an 16-bit integer to a half between [0..1). +PHILOX_DEVICE_INLINE Eigen::half Uint16ToHalf(uint16_t x) +{ + // IEEE754 halfs are formatted as follows (MSB first): + // sign(1) exponent(5) mantissa(10) + // Conceptually construct the following: + // sign == 0 + // exponent == 15 -- an excess 15 representation of a zero exponent + // mantissa == 10 random bits + const uint16_t man = x & 0x3ffu; // 10 bit mantissa + const uint16_t exp = static_cast<uint16_t>(15); + const uint16_t val = (exp << 10) | man; + + Eigen::half result; + result.x = val; + return result - Eigen::half(1.0); +} + +// Helper function to convert an 32-bit integer to a float between [0..1). +PHILOX_DEVICE_INLINE float Uint32ToFloat(uint32_t x) +{ + // IEEE754 floats are formatted as follows (MSB first): + // sign(1) exponent(8) mantissa(23) + // Conceptually construct the following: + // sign == 0 + // exponent == 127 -- an excess 127 representation of a zero exponent + // mantissa == 23 random bits + const uint32_t man = x & 0x7fffffu; // 23 bit mantissa + const uint32_t exp = static_cast<uint32_t>(127); + const uint32_t val = (exp << 23) | man; + + // Assumes that endian-ness is same for float and uint32. + float result; + memcpy(&result, &val, sizeof(val)); + return result - 1.0f; +} + +// Helper function to convert two 32-bit integers to a double between [0..1). +PHILOX_DEVICE_INLINE double Uint64ToDouble(uint32_t x0, uint32_t x1) +{ + // IEEE754 doubles are formatted as follows (MSB first): + // sign(1) exponent(11) mantissa(52) + // Conceptually construct the following: + // sign == 0 + // exponent == 1023 -- an excess 1023 representation of a zero exponent + // mantissa == 52 random bits + const uint32_t mhi = x0 & 0xfffffu; // upper 20 bits of mantissa + const uint32_t mlo = x1; // lower 32 bits of mantissa + const uint64_t man = (static_cast<uint64_t>(mhi) << 32) | mlo; // mantissa + const uint64_t exp = static_cast<uint64_t>(1023); + const uint64_t val = (exp << 52) | man; + // Assumes that endian-ness is same for double and uint64. + double result; + memcpy(&result, &val, sizeof(val)); + return result - 1.0; +} + +} // namespace random +} // namespace tensorflow +} + +#endif // __NNFW_CKER_HELPER_RANDOM_DISTRIBUTIONS_H__ diff --git a/compute/cker/include/cker/operation/Helper/RandomOp.h b/compute/cker/include/cker/operation/Helper/RandomOp.h new file mode 100644 index 000000000..7dc51fe94 --- /dev/null +++ b/compute/cker/include/cker/operation/Helper/RandomOp.h @@ -0,0 +1,52 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2015 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __NNFW_CKER_HELPER_RANDOM_OP_H__ +#define __NNFW_CKER_HELPER_RANDOM_OP_H__ + +#include "cker/Types.h" +#include "cker/Shape.h" +#include "cker/Utils.h" + +#include "cker/operation/Helper/RandomDistributions.h" + +namespace nnfw +{ +namespace cker +{ + +namespace functor +{ + +template <typename Device, class Distribution> struct FillPhiloxRandom; + +typedef Eigen::ThreadPoolDevice CPUDevice; +// Declares the partially CPU-specialized functor struct. +// +// NOTE: Due to inlining done by the compiler, you may need to add +// explicit instantiation of the functor in random_op.cc. See example +// functor::FillPhiloxRandom<CPUDevice, random::UniformDistribution>. +template <class Distribution> struct FillPhiloxRandom<CPUDevice, Distribution> +{ + void operator()(random::PhiloxRandom gen, typename Distribution::ResultElementType *data, + int64_t size, Distribution dist); +}; + +} // namespace functor +} // namespace tensorflow +} +#endif // __NNFW_CKER_HELPER_RANDOM_OP_H__ diff --git a/compute/cker/include/cker/operation/Helper/RandomOpCpu.h b/compute/cker/include/cker/operation/Helper/RandomOpCpu.h new file mode 100644 index 000000000..85d267723 --- /dev/null +++ b/compute/cker/include/cker/operation/Helper/RandomOpCpu.h @@ -0,0 +1,163 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2019 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef __NNFW_CKER_HELPER_RANDOM_OP_CPU_H__ +#define __NNFW_CKER_HELPER_RANDOM_OP_CPU_H__ + +#define EIGEN_USE_THREADS + +#include <algorithm> +#include <cmath> +#include <memory> + +#include "cker/Types.h" +#include "cker/Shape.h" +#include "cker/Utils.h" + +#include "cker/eigen/EigenSupport.h" + +#include "cker/operation/Helper/PhiloxRandom.h" +#include "cker/operation/Helper/RandomOp.h" +#include "cker/operation/Helper/RandomDistributions.h" + +#if EIGEN_COMP_GNUC && __cplusplus > 199711L +#define DISABLE_FLOAT_EQUALITY_WARNING \ + _Pragma("GCC diagnostic push") _Pragma("GCC diagnostic ignored \"-Wfloat-equal\"") +#define ENABLE_FLOAT_EQUALITY_WARNING _Pragma("GCC diagnostic pop") +#else +#define DISABLE_FLOAT_EQUALITY_WARNING +#define ENABLE_FLOAT_EQUALITY_WARNING +#endif + +namespace nnfw +{ +namespace cker +{ + +typedef Eigen::ThreadPoolDevice CPUDevice; + +namespace functor +{ +using random::PhiloxRandom; +using random::SingleSampleAdapter; + +// The default implementation of the functor, which should never be invoked +// But we still need to provide implementation for now for the linker to work, +// since we do not support all the distributions yet. +template <typename Device, class Distribution> struct FillPhiloxRandom +{ + typedef typename Distribution::ResultElementType T; + void operator()() {} +}; + +// A class to fill a specified range of random groups +template <class Distribution, bool VariableSamplesPerOutput> struct FillPhiloxRandomTask; + +// Specialization for distribution that takes a fixed number of samples for +// each output. +template <class Distribution> struct FillPhiloxRandomTask<Distribution, false> +{ + typedef typename Distribution::ResultElementType T; + static void Run(random::PhiloxRandom gen, T *data, int64_t size, Distribution dist) + { + const int kGroupSize = Distribution::kResultElementCount; + gen.Skip(0); + int64_t offset = 0; + + // First fill all the full-size groups + int64_t limit_group_full = size / kGroupSize; + for (int64_t index = 0; index < limit_group_full; ++index) + { + auto samples = dist(&gen); + std::copy(&samples[0], &samples[0] + kGroupSize, data + offset); + offset += kGroupSize; + } + + int64_t remaining_size = size - limit_group_full * kGroupSize; + + // If there are any remaining elements that need to be filled, process them + if (remaining_size > 0) + { + auto samples = dist(&gen); + std::copy(&samples[0], &samples[0] + remaining_size, data + offset); + } + } +}; + +// Specialization for distribution that takes a variable number of samples for +// each output. This will be slower due to the generality. 
+template <class Distribution> struct FillPhiloxRandomTask<Distribution, true> +{ + typedef typename Distribution::ResultElementType T; + static constexpr int64_t kReservedSamplesPerOutput = 256; + + static void Run(random::PhiloxRandom base_gen, T *data, int64_t size, Distribution dist) + { + const int kGroupSize = Distribution::kResultElementCount; + static const int kGeneratorSkipPerOutputGroup = + kGroupSize * kReservedSamplesPerOutput / PhiloxRandom::kResultElementCount; + + int64_t offset = 0; + + // First fill all the full-size groups + int64_t limit_group_full = size / kGroupSize; + int64_t group_index; + for (group_index = 0; group_index < limit_group_full; ++group_index) + { + // Reset the generator to the beginning of the output group region + // This is necessary if we want the results to be independent of order + // of work + PhiloxRandom gen = base_gen; + gen.Skip(group_index * kGeneratorSkipPerOutputGroup); + SingleSampleAdapter<PhiloxRandom> single_samples(&gen); + + auto samples = dist(&single_samples); + std::copy(&samples[0], &samples[0] + kGroupSize, data + offset); + offset += kGroupSize; + } + + int64_t remaining_size = size - limit_group_full * kGroupSize; + // If there are any remaining elements that need to be filled, process them + if (remaining_size > 0) + { + PhiloxRandom gen = base_gen; + gen.Skip(group_index * kGeneratorSkipPerOutputGroup); + SingleSampleAdapter<PhiloxRandom> single_samples(&gen); + + auto samples = dist(&single_samples); + std::copy(&samples[0], &samples[0] + remaining_size, data + offset); + } + } +}; + +// Partial specialization for CPU to fill the entire region with randoms +// It splits the work into several tasks and run them in parallel +template <class Distribution> +void FillPhiloxRandom<CPUDevice, Distribution>:: +operator()(random::PhiloxRandom gen, typename Distribution::ResultElementType *data, int64_t size, + Distribution dist) +{ + FillPhiloxRandomTask<Distribution, Distribution::kVariableSamplesPerOutput>::Run(gen, data, size, + dist); +} + +} // namespace functor + +} // end namespace tensorflow +} + +#endif // __NNFW_CKER_HELPER_RANDOM_OP_CPU_H__ diff --git a/compute/cker/include/cker/operation/L2Normalize.h b/compute/cker/include/cker/operation/L2Normalize.h new file mode 100644 index 000000000..a0075c3d0 --- /dev/null +++ b/compute/cker/include/cker/operation/L2Normalize.h @@ -0,0 +1,94 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2017 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef __NNFW_CKER_L2NORMALIZE_H__ +#define __NNFW_CKER_L2NORMALIZE_H__ + +#include "cker/Shape.h" +#include "cker/Utils.h" +#include "cker/Types.h" + +namespace nnfw +{ +namespace cker +{ + +void L2NormalizeFloat32(const Shape &input_shape, const float *input_data, + const Shape &output_shape, float *output_data) +{ + float epsilon = 1e-6; + const int trailing_dim = input_shape.DimensionsCount() - 1; + const int outer_size = MatchingFlatSizeSkipDim(input_shape, trailing_dim, output_shape); + const int depth = MatchingDim(input_shape, trailing_dim, output_shape, trailing_dim); + for (int i = 0; i < outer_size; ++i) + { + float squared_l2_norm = 0; + for (int c = 0; c < depth; ++c) + { + const float val = input_data[c]; + squared_l2_norm += val * val; + } + float l2_norm = std::sqrt(squared_l2_norm); + l2_norm = std::max(l2_norm, epsilon); + for (int c = 0; c < depth; ++c) + { + *output_data = *input_data / l2_norm; + ++output_data; + ++input_data; + } + } +} + +void L2NormalizeQuant8(L2NormParams ¶ms, const Shape &input_shape, const uint8_t *input_data, + const Shape &output_shape, uint8_t *output_data) +{ + const int trailing_dim = input_shape.DimensionsCount() - 1; + const int depth = MatchingDim(input_shape, trailing_dim, output_shape, trailing_dim); + const int outer_size = MatchingFlatSizeSkipDim(input_shape, trailing_dim, output_shape); + const int32_t input_zero_point = params.input_zero_point; + + for (int i = 0; i < outer_size; ++i) + { + int32_t square_l2_norm = 0; + for (int c = 0; c < depth; c++) + { + // Note that input_data advances by depth in the second pass below. + int32_t diff = input_data[c] - input_zero_point; + square_l2_norm += diff * diff; + } + int32_t inv_l2norm_multiplier; + int inv_l2norm_shift; + GetInvSqrtQuantizedMultiplierExp(square_l2_norm, -1, &inv_l2norm_multiplier, &inv_l2norm_shift); + for (int c = 0; c < depth; c++) + { + int32_t diff = *input_data - input_zero_point; + int32_t rescaled_diff = MultiplyByQuantizedMultiplierSmallerThanOneExp( + 128 * diff, inv_l2norm_multiplier, inv_l2norm_shift); + int32_t unclamped_output_val = 128 + rescaled_diff; + int32_t output_val = std::min(static_cast<int32_t>(255), + std::max(static_cast<int32_t>(0), unclamped_output_val)); + *output_data = static_cast<uint8_t>(output_val); + ++input_data; + ++output_data; + } + } +} + +} // namespace cker +} // namespace nnfw + +#endif // __NNFW_CKER_L2NORMALIZE_H__ diff --git a/compute/cker/include/cker/operation/Logistic.h b/compute/cker/include/cker/operation/Logistic.h index 7477858fc..3d3e59e55 100644 --- a/compute/cker/include/cker/operation/Logistic.h +++ b/compute/cker/include/cker/operation/Logistic.h @@ -32,18 +32,9 @@ namespace cker inline void Logistic(const Shape &input_shape, const float *input_data, const Shape &output_shape, float *output_data) { -#ifdef __aarch64__ auto input_map = MapAsVector(input_data, input_shape); auto output_map = MapAsVector(output_data, output_shape); output_map.array() = input_map.array().unaryExpr(Eigen::internal::scalar_logistic_op<float>()); -#else - // Note, this can be done using TANH: (1/2) + (1/2) * TANH(x/2) - const int size = MatchingFlatSize(input_shape, output_shape); - for (int i = 0; i < size; i++) - { - output_data[i] = 1.f / (1.f + std::exp(-input_data[i])); - } -#endif } } // namespace cker diff --git a/compute/cker/include/cker/operation/MatrixBandPart.h b/compute/cker/include/cker/operation/MatrixBandPart.h index 9f49c8fdd..5674ff3ef 100644 --- a/compute/cker/include/cker/operation/MatrixBandPart.h +++ 
b/compute/cker/include/cker/operation/MatrixBandPart.h @@ -32,10 +32,10 @@ void MatrixBandPart(const T num_lower_diags, const T num_upper_diags, const Shap { auto last_dim = input_shape.DimensionsCount() - 1; - T batch_num = 0; - for (int dim = 0; dim < last_dim - 2; dim++) + T batch_num = 1; + for (int dim = 0; dim < input_shape.DimensionsCount() - 2; dim++) { - batch_num += input_shape.Dims(dim); + batch_num *= input_shape.Dims(dim); } const T row_num = input_shape.Dims(last_dim - 1); diff --git a/compute/cker/include/cker/operation/Pad.h b/compute/cker/include/cker/operation/Pad.h index af432f3a8..4a2732d82 100644 --- a/compute/cker/include/cker/operation/Pad.h +++ b/compute/cker/include/cker/operation/Pad.h @@ -26,9 +26,10 @@ namespace nnfw { namespace cker { +template <typename T> inline void Pad(const int32_t *padding_data, int32_t pad_rank, const Shape &input_shape, - const float *input_data, const Shape &output_shape, float *output_data, - const float *constant_value_data) + const T *input_data, const Shape &output_shape, T *output_data, + const T *constant_value_data) { // Note, this is pad with mode=`CONSTANT`: it doesn't support `REFLECT` and `SYMMETRIC` // TODO: come up with more subtle solution that uses subtensors like arm compute @@ -38,7 +39,7 @@ inline void Pad(const int32_t *padding_data, int32_t pad_rank, const Shape &inpu /** List of padding information */ using PaddingList = std::vector<PaddingInfo>; - auto constant_value = constant_value_data ? *constant_value_data : 0; + const T constant_value = constant_value_data ? *constant_value_data : 0; assert(output_shape.DimensionsCount() == input_shape.DimensionsCount()); PaddingList padding_list(pad_rank); @@ -64,7 +65,7 @@ inline void Pad(const int32_t *padding_data, int32_t pad_rank, const Shape &inpu { const int32_t in_row_len = input_shape.Dims(0); std::fill_n(output_data, padding_list[0].first, constant_value); - std::memcpy(output_data + padding_list[0].first, input_data, in_row_len * sizeof(float)); + std::memcpy(output_data + padding_list[0].first, input_data, in_row_len * sizeof(T)); std::fill_n(output_data + padding_list[0].first + in_row_len, padding_list[0].second, constant_value); break; @@ -89,7 +90,7 @@ inline void Pad(const int32_t *padding_data, int32_t pad_rank, const Shape &inpu out_offset += padding_list[1].first; // copy a row of input data - memcpy(output_data + out_offset, input_data + in_offset, in_row_len * sizeof(float)); + memcpy(output_data + out_offset, input_data + in_offset, in_row_len * sizeof(T)); out_offset += in_row_len; @@ -132,7 +133,7 @@ inline void Pad(const int32_t *padding_data, int32_t pad_rank, const Shape &inpu out_offset += padding_list[2].first; // copy a row of input data - memcpy(output_data + out_offset, input_data + in_offset, in_row_len * sizeof(float)); + memcpy(output_data + out_offset, input_data + in_offset, in_row_len * sizeof(T)); out_offset += in_row_len; @@ -191,7 +192,7 @@ inline void Pad(const int32_t *padding_data, int32_t pad_rank, const Shape &inpu out_c_offset += padding_list[3].first; // copy a row of input data - memcpy(output_data + out_c_offset, input_data + in_offset, in_row_len * sizeof(float)); + memcpy(output_data + out_c_offset, input_data + in_offset, in_row_len * sizeof(T)); out_c_offset += in_row_len; diff --git a/compute/cker/include/cker/operation/Quantize.h b/compute/cker/include/cker/operation/Quantize.h new file mode 100644 index 000000000..5c82d111f --- /dev/null +++ b/compute/cker/include/cker/operation/Quantize.h @@ -0,0 +1,47 @@ +/* + * 
Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __NNFW_CKER_QUANTIZE_H__ +#define __NNFW_CKER_QUANTIZE_H__ + +#include "cker/Shape.h" +#include "cker/Types.h" +#include "cker/Utils.h" +#include <stdexcept> +#include <iostream> +namespace nnfw +{ +namespace cker +{ +template <typename InputT, typename OutputT> +inline void Quantize(const Shape &input_shape, const InputT *input_data, const Shape &output_shape, + OutputT *output_data, const float output_scale, const int32_t output_offset) +{ + const int flat_size = MatchingFlatSize(input_shape, output_shape); + int min_val = std::numeric_limits<OutputT>::min(); + int max_val = std::numeric_limits<OutputT>::max(); + + for (int i = 0; i < flat_size; i++) + { + int32_t unclamped = static_cast<int32_t>(round(input_data[i] / output_scale)) + output_offset; + int32_t clamped = std::min(std::max(unclamped, min_val), max_val); + output_data[i] = clamped; + } +} +} // namespace cker +} // namespace nnfw + +#endif // __NNFW_CKER_QUANTIZE_H__ diff --git a/compute/cker/include/cker/operation/ReLU6.h b/compute/cker/include/cker/operation/ReLU6.h new file mode 100644 index 000000000..20df561dc --- /dev/null +++ b/compute/cker/include/cker/operation/ReLU6.h @@ -0,0 +1,56 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2018 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
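Editorial aside, not part of this patch: a hedged sketch of how the new Quantize kernel above might be called. The scale and zero point are made-up values, and the brace-initialized Shape assumes the initializer-list constructor cker's Shape shares with TFLite's RuntimeShape.

#include <cstdint>
#include <vector>

#include "cker/operation/Quantize.h"

// Quantize four floats to uint8 with q = round(x / scale) + zero_point, clamped to [0, 255].
void QuantizeSketch()
{
  const nnfw::cker::Shape shape{1, 2, 2, 1}; // assumed initializer-list constructor
  const std::vector<float> input{-1.0f, 0.0f, 0.5f, 1.0f};
  std::vector<uint8_t> output(input.size());

  nnfw::cker::Quantize(shape, input.data(), shape, output.data(),
                       /*output_scale=*/0.0078125f, /*output_offset=*/128);
  // With these values: -1.0f -> 0, 0.0f -> 128, 0.5f -> 192, 1.0f -> 255.
}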
+ */ + +#ifndef __NNFW_CKER_RELU6_H__ +#define __NNFW_CKER_RELU6_H__ + +#include "cker/Shape.h" +#include "cker/eigen/Utils.h" + +#include <cmath> +#include <Eigen/Core> + +namespace nnfw +{ +namespace cker +{ + +inline void ReLU6(const Shape &input_shape, const float *input_data, float *output_data) +{ + int size = input_shape.FlatSize(); + + for (int i = 0; i < size; ++i) + { + if (input_data[i] <= 0) + { + output_data[i] = 0; + } + else if (input_data[i] > 6.0) + { + output_data[i] = 6.0; + } + else + { + output_data[i] = input_data[i]; + } + } +} + +} // namespace cker +} // namespace nnfw + +#endif // __NNFW_CKER_RELU6_H__ diff --git a/compute/cker/include/cker/operation/Reduce.h b/compute/cker/include/cker/operation/Reduce.h index 4ba3652d3..cf9634a67 100644 --- a/compute/cker/include/cker/operation/Reduce.h +++ b/compute/cker/include/cker/operation/Reduce.h @@ -159,6 +159,92 @@ public: num_resolved_axis, temp_index_data(), reducer, output_data); } + // Computes the mean of elements across dimensions given in axis. + // It does so in two stages, first calculates the sum of elements along the axis + // then divides it by the number of element in axis for quantized values. + template <typename T, typename U> + inline bool QuantizedMeanOrSum(const T *input_data, int32_t input_zero_point, float input_scale, + const Shape &input_shape, T *output_data, + int32_t output_zero_point, float output_scale, + const Shape &output_shape, const std::vector<int> &axes, + bool /*keep_dims*/, U *temp_sum, bool compute_sum, + U reducer(const U current, const T in)) + { + // Reset output data. + size_t num_outputs = 1; + for (int idx = 0; idx < output_shape.DimensionsCount(); ++idx) + { + size_t current = static_cast<size_t>(output_shape.Dims(idx)); + // Overflow prevention. + if (num_outputs > std::numeric_limits<size_t>::max() / current) + { + return false; + } + num_outputs *= current; + } + for (size_t idx = 0; idx < num_outputs; ++idx) + { + output_data[idx] = T(); + temp_sum[idx] = U(); + } + + // Resolve axis. + int num_resolved_axis = 0; + if (!ResolveAxis(input_shape.DimensionsCount(), axes, resolved_axis_data(), &num_resolved_axis)) + { + return false; + } + + if (!ReduceImpl<T, U>(input_data, input_shape, output_shape, resolved_axis_data(), + num_resolved_axis, temp_index_data(), reducer, temp_sum)) + { + return false; + } + + // Calculate mean by dividing output_data by num of aggregated element. + U num_elements_in_axis = 1; + for (int idx = 0; idx < num_resolved_axis; ++idx) + { + size_t current = static_cast<size_t>(input_shape.Dims(resolved_axis_data()[idx])); + // Overflow prevention. + if (current > static_cast<size_t>(std::numeric_limits<U>::max() / num_elements_in_axis)) + { + return false; + } + num_elements_in_axis *= current; + } + + if (num_elements_in_axis > 0) + { + const float scale = input_scale / output_scale; + if (compute_sum) + { + // TODO(b/116341117): Eliminate float and do this completely in 8bit. 
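      // Editorial note (not part of the upstream code): the rescaling below maps the
      // accumulated integer sum back onto the output quantization, i.e.
      //   out = round(s_in / s_out * (sum_q - n * zp_in)) + zp_out
      // for the sum, and the same expression applied to sum_q / n for the mean.
      // For example, with s_in = 0.5, s_out = 1.0, both zero points 0, n = 4 and
      // sum_q = 22, the mean path yields round(22 / 4 * 0.5 + 0.5) = 3, which is the
      // real mean 2.75 re-quantized at the output scale; the extra +0.5f folded into
      // `bias` matches the rounding behaviour of the code below.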
+ const float bias = -input_zero_point * scale * num_elements_in_axis + 0.5f; + for (size_t idx = 0; idx < num_outputs; ++idx) + { + const U value = + static_cast<U>(std::round(temp_sum[idx] * scale + bias)) + output_zero_point; + output_data[idx] = static_cast<T>(value); + } + } + else + { + const float bias = -input_zero_point * scale + 0.5f; + for (size_t idx = 0; idx < num_outputs; ++idx) + { + float float_mean = + static_cast<float>(temp_sum[idx]) / static_cast<float>(num_elements_in_axis); + float result = std::min(std::round(float_mean * scale + bias) + output_zero_point, + static_cast<float>(std::numeric_limits<T>::max())); + result = std::max(result, static_cast<float>(std::numeric_limits<T>::min())); + output_data[idx] = static_cast<T>(result); + } + } + } + return true; + } + inline int32_t *resolved_axis_data(void) { return _resolved_axis.size() ? _resolved_axis.data() : _resolved_axis_small; diff --git a/compute/cker/include/cker/operation/ResizeBilinear.h b/compute/cker/include/cker/operation/ResizeBilinear.h new file mode 100644 index 000000000..7fc1e9123 --- /dev/null +++ b/compute/cker/include/cker/operation/ResizeBilinear.h @@ -0,0 +1,270 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2017 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __NNFW_CKER_RESIZEBILINEAR_H__ +#define __NNFW_CKER_RESIZEBILINEAR_H__ + +#include "cker/Shape.h" +#include "cker/Types.h" +#include <cmath> + +namespace nnfw +{ +namespace cker +{ + +inline void ResizeBilinearKernel2x2(int32_t x0, int32_t x1, int32_t y0, int32_t y1, int32_t x, + int32_t y, int32_t depth, int32_t batch, + const Shape &input_shape, const float *input_data, + const Shape &output_shape, float *output_data) +{ + const int32_t input_width = input_shape.Dims(2); + const int32_t output_width = output_shape.Dims(2); + + const int32_t input_x_offset = (x1 - x0) * depth; + const int32_t input_y_offset = (y1 - y0) * depth * input_width; + const int32_t output_x_offset = depth; + const int32_t output_y_offset = depth * output_width; + + for (int ch = 0; ch < depth; ch++) + { + const int32_t input_offset = Offset(input_shape, batch, y0, x0, ch); + + float x0y0 = input_data[input_offset]; + float x1y0 = input_data[input_offset + input_x_offset]; + float x0y1 = input_data[input_offset + input_y_offset]; + float x1y1 = input_data[input_offset + input_x_offset + input_y_offset]; + + // Top left corner. + const int32_t output_offset = Offset(output_shape, batch, y, x, ch); + output_data[output_offset] = x0y0; + + // Top right corner. + output_data[output_offset + output_x_offset] = (x0y0 + x1y0) / 2; + + // Bottom left corner. + float output = (x0y0 + x0y1) / 2; + output_data[output_offset + output_y_offset] = output; + + // Bottom right corner. 
+ output_data[output_offset + output_x_offset + output_y_offset] = + (output + ((x1y0 + x1y1) / 2)) / 2; + } +} + +inline void ResizeBilinear2x2(int32_t batches, int32_t input_height, int32_t input_width, + int32_t depth, int32_t output_height, int32_t output_width, + const Shape &input_shape, const float *input_data, + const Shape &output_shape, float *output_data) +{ + for (int b = 0; b < batches; b++) + { + for (int y0 = 0, y = 0; y <= output_height - 2; y += 2, y0++) + { + for (int x0 = 0, x = 0; x <= output_width - 2; x += 2, x0++) + { + int32_t x1 = std::min(x0 + 1, input_width - 1); + int32_t y1 = std::min(y0 + 1, input_height - 1); + ResizeBilinearKernel2x2(x0, x1, y0, y1, x, y, depth, b, input_shape, input_data, + output_shape, output_data); + } + } + } +} + +inline void ResizeBilinearKernel(const float *input_ptr, int32_t depth, float scale, + float *output_ptr) +{ + for (int32_t i = 0; i < depth; i++) + { + *output_ptr += *input_ptr * scale; + output_ptr++; + input_ptr++; + } +} + +inline void ComputeInterpolationValues(const float value, const float scale, + const bool half_pixel_centers, int32_t input_size, + float *scaled_value, int32_t *lower_bound, + int32_t *upper_bound) +{ + if (half_pixel_centers) + { + *scaled_value = (value + 0.5f) * scale - 0.5f; + } + else + { + *scaled_value = value * scale; + } + float scaled_value_floor = std::floor(*scaled_value); + *lower_bound = std::max(static_cast<int32_t>(scaled_value_floor), static_cast<int32_t>(0)); + *upper_bound = std::min(static_cast<int32_t>(std::ceil(*scaled_value)), input_size - 1); +} + +inline void ResizeBilinearGeneric(int32_t batches, int32_t input_height, int32_t input_width, + int32_t depth, int32_t output_height, int32_t output_width, + float height_scale, float width_scale, const Shape &input_shape, + const float *input_data, float *output_data, + const bool half_pixel_centers) +{ + memset(output_data, 0, batches * output_height * output_width * depth * sizeof(float)); + + int32_t output_offset = 0; + for (int b = 0; b < batches; ++b) + { + for (int y = 0; y < output_height; ++y) + { + float input_y; + int32_t y0, y1; + ComputeInterpolationValues(y, height_scale, half_pixel_centers, input_height, &input_y, &y0, + &y1); + for (int x = 0; x < output_width; ++x) + { + float input_x; + int32_t x0, x1; + ComputeInterpolationValues(x, width_scale, half_pixel_centers, input_width, &input_x, &x0, + &x1); + float *output_ptr = &output_data[output_offset]; + + // Run kernel on the 4 corners of the bilinear resize algorithm. 
+ int32_t input_offset = Offset(input_shape, b, y0, x0, 0); + float scale = (1 - (input_y - y0)) * (1 - (input_x - x0)); + const float *input_ptr = &input_data[input_offset]; + ResizeBilinearKernel(input_ptr, depth, scale, output_ptr); + + input_offset = Offset(input_shape, b, y0, x1, 0); + scale = (1 - (input_y - y0)) * (input_x - x0); + input_ptr = &input_data[input_offset]; + ResizeBilinearKernel(input_ptr, depth, scale, output_ptr); + + input_offset = Offset(input_shape, b, y1, x0, 0); + scale = (input_y - y0) * (1 - (input_x - x0)); + input_ptr = &input_data[input_offset]; + ResizeBilinearKernel(input_ptr, depth, scale, output_ptr); + + input_offset = Offset(input_shape, b, y1, x1, 0); + scale = (input_y - y0) * (input_x - x0); + input_ptr = &input_data[input_offset]; + ResizeBilinearKernel(input_ptr, depth, scale, output_ptr); + + output_offset += depth; + } + } + } +} + +template <typename T> +inline void ResizeBilinearGenericSmallChannel(int32_t batches, int32_t input_height, + int32_t input_width, int32_t depth, + int32_t output_height, int32_t output_width, + float height_scale, float width_scale, + const Shape &input_shape, const T *input_data, + T *output_data, const bool half_pixel_centers) +{ + T *output_ptr = &output_data[0]; + for (int b = 0; b < batches; ++b) + { + for (int y = 0; y < output_height; ++y) + { + float input_y; + int32_t y0, y1; + ComputeInterpolationValues(y, height_scale, half_pixel_centers, input_height, &input_y, &y0, + &y1); + for (int x = 0; x < output_width; ++x) + { + float input_x; + int32_t x0, x1; + ComputeInterpolationValues(x, width_scale, half_pixel_centers, input_width, &input_x, &x0, + &x1); + + int32_t input_offset[4] = { + Offset(input_shape, b, y0, x0, 0), Offset(input_shape, b, y0, x1, 0), + Offset(input_shape, b, y1, x0, 0), Offset(input_shape, b, y1, x1, 0)}; + float scale[4] = {(1 - (input_y - y0)) * (1 - (input_x - x0)), + (1 - (input_y - y0)) * (input_x - x0), + (input_y - y0) * (1 - (input_x - x0)), (input_y - y0) * (input_x - x0)}; + + for (int d = 0; d < depth; d++) + { + const T *input_ptr = &input_data[d]; + *output_ptr++ = static_cast<T>( + input_ptr[input_offset[0]] * scale[0] + input_ptr[input_offset[1]] * scale[1] + + input_ptr[input_offset[2]] * scale[2] + input_ptr[input_offset[3]] * scale[3]); + } + } + } + } +} + +void ResizeBilinear(ResizeBilinearParams ¶ms, const Shape &input_shape, const float *input_data, + const Shape &output_shape, float *output_data) +{ + int32_t batches = static_cast<int32_t>(MatchingDim(input_shape, 0, output_shape, 0)); + int32_t input_height = input_shape.Dims(1); + int32_t input_width = input_shape.Dims(2); + int32_t depth = static_cast<int32_t>(MatchingDim(input_shape, 3, output_shape, 3)); + + // Specialize for 2x2 upsample. 
+ if (!params.align_corners && !params.half_pixel_centers && + params.output_height == 2 * input_height && params.output_width == 2 * input_width) + { + ResizeBilinear2x2(batches, input_height, input_width, depth, params.output_height, + params.output_width, input_shape, input_data, output_shape, output_data); + } + else + { + float height_scale = static_cast<float>(input_height) / params.output_height; + float width_scale = static_cast<float>(input_width) / params.output_width; + if (params.align_corners && params.output_height > 1) + { + height_scale = static_cast<float>(input_height - 1) / (params.output_height - 1); + } + if (params.align_corners && params.output_width > 1) + { + width_scale = static_cast<float>(input_width - 1) / (params.output_width - 1); + } + + ResizeBilinearGeneric(batches, input_height, input_width, depth, params.output_height, + params.output_width, height_scale, width_scale, input_shape, input_data, + output_data, params.half_pixel_centers); + } +} + +void ResizeBilinear(ResizeBilinearParams ¶ms, const Shape &input_shape, + const uint8_t *input_data, const Shape &output_shape, uint8_t *output_data) +{ + int32_t batches = MatchingDim(input_shape, 0, output_shape, 0); + int32_t input_height = input_shape.Dims(1); + int32_t input_width = input_shape.Dims(2); + int32_t depth = MatchingDim(input_shape, 3, output_shape, 3); + + float height_scale = (params.align_corners && params.output_height > 1) + ? (static_cast<float>(input_height - 1) / (params.output_height - 1)) + : (static_cast<float>(input_height) / params.output_height); + + float width_scale = (params.align_corners && params.output_width > 1) + ? (static_cast<float>(input_width - 1) / (params.output_width - 1)) + : (static_cast<float>(input_width) / params.output_width); + + ResizeBilinearGenericSmallChannel<uint8_t>( + batches, input_height, input_width, depth, params.output_height, params.output_width, + height_scale, width_scale, input_shape, input_data, output_data, params.half_pixel_centers); +} +} // namespace cker +} // namespace nnfw + +#endif // __NNFW_CKER_RESIZEBILINEAR_H__ diff --git a/compute/cker/include/cker/operation/SpaceToDepth.h b/compute/cker/include/cker/operation/SpaceToDepth.h new file mode 100644 index 000000000..ef679315e --- /dev/null +++ b/compute/cker/include/cker/operation/SpaceToDepth.h @@ -0,0 +1,71 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2017 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
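Editorial aside, not part of this patch: a hypothetical call into the float ResizeBilinear path above. Shapes and data are made up, and the brace-initialized Shape again assumes an initializer-list constructor. With exactly 2x upscaling and both align_corners and half_pixel_centers false, this request takes the specialized 2x2 kernel.

#include <vector>

#include "cker/operation/ResizeBilinear.h"

// Upscale a 1x2x2x1 float tensor to 1x4x4x1 with plain bilinear interpolation.
void ResizeBilinearSketch()
{
  nnfw::cker::ResizeBilinearParams params;
  params.output_height = 4;
  params.output_width = 4;
  params.align_corners = false;
  params.half_pixel_centers = false;

  const nnfw::cker::Shape input_shape{1, 2, 2, 1}; // assumed initializer-list constructor
  const nnfw::cker::Shape output_shape{1, 4, 4, 1};
  const std::vector<float> input{1.0f, 2.0f, 3.0f, 4.0f};
  std::vector<float> output(16);

  nnfw::cker::ResizeBilinear(params, input_shape, input.data(), output_shape, output.data());
}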
+ */ + +#ifndef __NNFW_CKER_SPACE_TO_DEPTH_H__ +#define __NNFW_CKER_SPACE_TO_DEPTH_H__ + +#include "cker/Shape.h" +#include "cker/Types.h" + +namespace nnfw +{ +namespace cker +{ + +template <typename T> +inline void SpaceToDepth(const SpaceToDepthParams ¶ms, const Shape &unextended_input_shape, + const T *input_data, const Shape &unextended_output_shape, T *output_data) +{ + assert(unextended_input_shape.DimensionsCount() <= 4); + assert(unextended_output_shape.DimensionsCount() <= 4); + const Shape input_shape = Shape::ExtendedShape(4, unextended_input_shape); + const Shape output_shape = Shape::ExtendedShape(4, unextended_output_shape); + + const int output_depth = output_shape.Dims(3); + const int output_width = output_shape.Dims(2); + const int output_height = output_shape.Dims(1); + + const int input_depth = input_shape.Dims(3); + const int batch_size = input_shape.Dims(0); + + // Number of continuous values that we can copy in one interation. + const int stride = params.block_size * input_depth; + + for (int batch = 0; batch < batch_size; ++batch) + { + for (int out_h = 0; out_h < output_height; ++out_h) + { + T *output_ptr = output_data + Offset(output_shape, batch, out_h, 0, 0); + for (int offset_h = 0; offset_h < params.block_size; ++offset_h) + { + T *dst = output_ptr; + for (int out_w = 0; out_w < output_width; ++out_w) + { + memcpy(dst, input_data, stride * sizeof(T)); + input_data += stride; + dst += output_depth; + } + output_ptr += stride; + } + } + } +} + +} // namespace cker +} // namespace nnfw + +#endif // __NNFW_CKER_SPACE_TO_DEPTH_H__ diff --git a/compute/cker/include/cker/operation/SplitV.h b/compute/cker/include/cker/operation/SplitV.h new file mode 100644 index 000000000..9e46f4b04 --- /dev/null +++ b/compute/cker/include/cker/operation/SplitV.h @@ -0,0 +1,81 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2017 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __NNFW_CKER_SPLIT_V_H__ +#define __NNFW_CKER_SPLIT_V_H__ + +#include "cker/Shape.h" +#include "cker/Types.h" + +namespace nnfw +{ +namespace cker +{ + +template <typename Scalar> +void SplitV(const SplitVParams ¶ms, const Shape &input_shape, const Scalar *input_data, + std::vector<nnfw::cker::Shape> &output_shapes, Scalar *const *output_data) +{ + const int split_dimensions = input_shape.DimensionsCount(); + int axis = params.axis < 0 ? 
params.axis + split_dimensions : params.axis; + int outputs_count = params.num_split; + + int64_t split_size = 0; + + for (int i = 0; i < outputs_count; i++) + { + // TFLITE_DCHECK_EQ(output_shapes[i]->DimensionsCount(), split_dimensions); + for (int j = 0; j < split_dimensions; j++) + { + if (j != axis) + { + MatchingDim(output_shapes[i], j, input_shape, j); + } + } + split_size += output_shapes[i].Dims(axis); + } + + int64_t outer_size = 1; + for (int i = 0; i < axis; ++i) + { + outer_size *= input_shape.Dims(i); + } + // For all output arrays, + // FlatSize() = outer_size * Dims(axis) * base_inner_size; + int64_t base_inner_size = 1; + for (int i = axis + 1; i < split_dimensions; ++i) + { + base_inner_size *= input_shape.Dims(i); + } + + const Scalar *input_ptr = input_data; + int copy_size = 0; + for (int k = 0; k < outer_size; k++) + { + for (int i = 0; i < outputs_count; ++i) + { + copy_size = output_shapes[i].Dims(axis) * base_inner_size; + memcpy(output_data[i] + k * copy_size, input_ptr, copy_size * sizeof(Scalar)); + input_ptr += copy_size; + } + } +} + +} // namespace cker +} // namespace nnfw + +#endif // __NNFW_CKER_SPLIT_V_H__ diff --git a/compute/cker/include/cker/operation/StatelessRandomUniform.h b/compute/cker/include/cker/operation/StatelessRandomUniform.h new file mode 100644 index 000000000..d5952ae23 --- /dev/null +++ b/compute/cker/include/cker/operation/StatelessRandomUniform.h @@ -0,0 +1,103 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2017 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __NNFW_CKER_STATELESS_RANDOM_UNIFORM_H__ +#define __NNFW_CKER_STATELESS_RANDOM_UNIFORM_H__ + +#include "cker/Types.h" +#include "cker/Shape.h" +#include "cker/Utils.h" + +#include "cker/eigen/EigenSupport.h" + +#include "cker/operation/Helper/Tensor.h" +#include "cker/operation/Helper/PhiloxRandom.h" +#include "cker/operation/Helper/RandomOpCpu.h" +#include "cker/operation/Helper/RandomDistributions.h" + +namespace nnfw +{ +namespace cker +{ + +void GenerateKey(Tensor seed, random::PhiloxRandom::Key *out_key, + random::PhiloxRandom::ResultType *out_counter) +{ + // Grab the two seeds + uint32_t seed0; + uint32_t seed1; + + const auto seed_vals = seed.flat<int32_t>(); + + seed0 = seed_vals(0); + seed1 = seed_vals(1); + // Scramble the seeds so that the user doesn't need to worry about which + // part of the seed needs to be strong. 
+ (*out_key)[0] = 0x3ec8f720; + (*out_key)[1] = 0x02461e29; + (*out_counter)[0] = static_cast<uint32_t>(seed0); + (*out_counter)[1] = (*out_counter)[3] = 0; + (*out_counter)[2] = static_cast<uint32_t>(seed1); + const auto mix = random::PhiloxRandom(*out_counter, *out_key)(); + (*out_key)[0] = mix[0]; + (*out_key)[1] = mix[1]; + (*out_counter)[0] = (*out_counter)[1] = 0; + (*out_counter)[2] = mix[2]; + (*out_counter)[3] = mix[3]; +} + +template <typename Device, class Distribution> +void Fill(random::PhiloxRandom random, Tensor *output) +{ + // Build distribution + typedef typename Distribution::ResultElementType T; + + auto flat = output->flat<T>(); + // Reuse the compute kernels from the stateful random ops + functor::FillPhiloxRandom<Device, Distribution>()(random, flat.data(), flat.size(), + Distribution()); +} + +inline void StatelessRandomUniform(const Shape &shape_shape, const int *shape_data, + const Shape &seed_shape, const int *seed_data, + const Shape &output_shape, float *output_data) +{ + Tensor shape_t; + Tensor seed_t; + + shape_t.shape.ReplaceWith(shape_shape.DimensionsCount(), shape_shape.DimsData()); + shape_t.buffer = (void *)shape_data; + + seed_t.shape.ReplaceWith(seed_shape.DimensionsCount(), seed_shape.DimsData()); + seed_t.buffer = (void *)seed_data; + + Tensor output_t; + output_t.shape.ReplaceWith(output_shape.DimensionsCount(), output_shape.DimsData()); + output_t.buffer = output_data; + + random::PhiloxRandom::Key key; + random::PhiloxRandom::ResultType counter; + + GenerateKey(seed_t, &key, &counter); + + Fill<Eigen::ThreadPoolDevice, random::UniformDistribution<random::PhiloxRandom, float>>( + random::PhiloxRandom(counter, key), &output_t); +} +} // namespace cker +} // namespace nnfw + +#endif // __NNFW_CKER_STATELESS_RANDOM_UNIFORM_H__ diff --git a/compute/cker/include/cker/ruy/RuySupport.h b/compute/cker/include/cker/ruy/RuySupport.h index 432b181bd..9612dd517 100644 --- a/compute/cker/include/cker/ruy/RuySupport.h +++ b/compute/cker/include/cker/ruy/RuySupport.h @@ -22,11 +22,6 @@ #include <ruy/context.h> #include "cker/Types.h" -namespace -{ -const int kDefaultNumThreadpoolThreads = 4; -} - namespace nnfw { namespace cker @@ -34,42 +29,6 @@ namespace cker namespace ruy_support { -struct RuyContext -{ -public: - RuyContext() : ruy_context_(new ruy::Context) - { - SetMaxNumThreads(onert::util::getConfigInt(onert::util::config::RUY_THREADS)); -#ifdef USE_RUY_GEMV - ruy_context_->cache_policy = ruy::kCacheLHSOnNarrowMul; -#endif - }; - - ruy::Context *ruy_context() const { return ruy_context_.get(); } - - static inline RuyContext &GetRuyContext() - { - static thread_local RuyContext instance; - return instance; - } - - void SetMaxNumThreads(int max_num_threads) - { - const int target_num_threads = - max_num_threads > -1 ? max_num_threads : kDefaultNumThreadpoolThreads; - ruy_context_->max_num_threads = target_num_threads; - } - -private: - const std::unique_ptr<ruy::Context> ruy_context_; -}; - -inline ruy::Context *GetRuyContext() -{ - auto &ctx = RuyContext::GetRuyContext(); - return ctx.ruy_context(); -} - template <typename Scalar, typename DataPointer> void MakeRuyMatrix(const MatrixParams<Scalar> ¶ms, DataPointer data_ptr, ruy::Matrix<Scalar> *dst) |
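Editorial aside, not part of this patch: a hedged end-to-end sketch of the StatelessRandomUniform entry point added earlier in this change. The seed values are arbitrary and the brace-initialized Shape assumes an initializer-list constructor. Being stateless, the same (shape, seed) pair always yields the same output tensor.

#include <vector>

#include "cker/operation/StatelessRandomUniform.h"

// Generate a deterministic 3x4 tensor of uniform floats from a two-element seed.
void StatelessRandomUniformSketch()
{
  const nnfw::cker::Shape shape_shape{2}; // assumed initializer-list constructor
  const nnfw::cker::Shape seed_shape{2};
  const nnfw::cker::Shape output_shape{3, 4};

  const int shape_data[2] = {3, 4}; // requested output shape
  const int seed_data[2] = {7, 11}; // arbitrary seed pair
  std::vector<float> output(3 * 4);

  nnfw::cker::StatelessRandomUniform(shape_shape, shape_data, seed_shape, seed_data, output_shape,
                                     output.data());
}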