path: root/compiler/nnc/backends/soft_backend/code_snippets/cpp_operations.def
diff options
Diffstat (limited to 'compiler/nnc/backends/soft_backend/code_snippets/cpp_operations.def')
1 files changed, 656 insertions, 0 deletions
diff --git a/compiler/nnc/backends/soft_backend/code_snippets/cpp_operations.def b/compiler/nnc/backends/soft_backend/code_snippets/cpp_operations.def
new file mode 100644
index 000000000..f78274e5c
--- /dev/null
+++ b/compiler/nnc/backends/soft_backend/code_snippets/cpp_operations.def
@@ -0,0 +1,656 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <string>
+#include <cstdint>
+#include <sys/mman.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <cstring>
+using namespace std;
+#define UNUSED(x) ((void)(x)) // casts x to void; used below on values that are otherwise read only inside assert()
+static_assert(numeric_limits<float>::is_iec559, "Unsupported float type"); // compile-time check that float is an IEC 559 type
+/**
+ * @brief Opens the parameters file, checks its header and maps the whole file read-only.
+ *
+ * Every check is an assert(), so nothing is validated in builds that compile asserts out.
+ * The header fields are read with read() only to be compared; the mapping itself starts
+ * at file offset 0 and therefore still contains the header.
+ *
+ * @param data            receives the start address of the mapping
+ * @param len             receives the file size in bytes (also the length of the mapping)
+ * @param path            path of the file to open
+ * @param expectedVersion value the version field of the header must equal
+ * @param expectedHash    value the hash field of the header must equal
+ */
+void readParameters(char *&data, size_t &len, const string &path,
+                    uint32_t expectedVersion, uint32_t expectedHash)
+{
+  static_assert(sizeof(expectedVersion) == params::VERSION_LEN, "version length mismatch");
+  static_assert(sizeof(expectedHash) == params::HASH_LEN, "hash length mismatch");
+  const int fd = open(path.c_str(), O_RDONLY);
+  assert(fd != -1);
+  // gather file info
+  struct stat st;
+  int statRes = fstat(fd, &st);
+  assert(statRes != -1);
+  UNUSED(statRes);
+  len = static_cast<size_t>(st.st_size);
+  assert(len >= params::HEADER_LEN);
+  // check magic correctness
+  char magic[params::MAGIC_LEN + 1] = {};
+  ssize_t magic_len = read(fd, magic, params::MAGIC_LEN);
+  assert(magic_len == params::MAGIC_LEN);
+  UNUSED(magic_len);
+  assert(strncmp(magic, params::MAGIC, params::MAGIC_LEN) == 0);
+  UNUSED(magic);
+  // check version correctness
+  decltype(expectedVersion) version;
+  ssize_t version_len = read(fd, &version, sizeof(version));
+  assert(version_len == sizeof(version));
+  UNUSED(version_len);
+  assert(version == expectedVersion);
+  UNUSED(version);
+  // check hash correctness
+  decltype(expectedHash) hash;
+  ssize_t hash_len = read(fd, &hash, sizeof(hash));
+  assert(hash_len == sizeof(hash));
+  UNUSED(hash_len);
+  assert(hash == expectedHash);
+  UNUSED(hash);
+  // mmap reports failure with MAP_FAILED, not with a null pointer
+  void *mapping = mmap(nullptr, len, PROT_READ, MAP_PRIVATE, fd, 0);
+  assert(mapping != MAP_FAILED && "Can not map file!");
+  data = static_cast<char *>(mapping);
+  // the descriptor is no longer needed once the mapping exists
+  int is_error = close(fd);
+  assert(!is_error && "Can not close file!");
+  UNUSED(is_error);
+}
+/**
+ * @brief Unmaps a memory region with munmap().
+ * @param data start address of the region
+ * @param len  length of the region in bytes
+ */
+void releaseParameters(char *data, size_t len)
+{
+  const int unmap_status = munmap(data, len);
+  assert(unmap_status == 0);
+  UNUSED(unmap_status);
+}
+/**
+ * @brief Multiplies together the first `rank` entries of d.sizes.
+ * @param d dimensions descriptor
+ * @return the product of the sizes; 1 when rank is 0
+ */
+template <int rank>
+size_t volume(Dims<rank> d)
+{
+  size_t product = 1;
+  int axis = 0;
+  while (axis < rank)
+  {
+    product *= d.sizes[axis];
+    ++axis;
+  }
+  return product;
+}
+/**
+ * @brief Copies the dimensions of a Shape into a RuntimeShape, keeping their order.
+ * @param s source shape
+ * @return RuntimeShape with s.getDims() dimensions
+ */
+RuntimeShape shapeToRuntimeShape(const Shape& s) {
+  const int num_dims = static_cast<int>(s.getDims());
+  RuntimeShape result(num_dims);
+  for (int d = 0; d < num_dims; ++d) {
+    const int32_t extent = static_cast<int32_t>(s[d]);
+    result.SetDim(d, extent);
+  }
+  return result;
+}
+/**
+ * @brief Converts a Shape of rank 1..4 into a Dims<4> descriptor.
+ *
+ * The dimension order is reversed: entry i of the result comes from s[rank - 1 - i].
+ * Strides are the running product of the sizes already written. Entries at index
+ * rank and above get size 1 and the final running product as stride.
+ *
+ * @param s source shape
+ * @return filled Dims<4>
+ */
+Dims<4> shapeToDims(const Shape &s)
+{
+  Dims<4> dims;
+  const int rank = static_cast<int>(s.getDims());
+  assert(rank >= 1 && rank <= 4);
+  int running_stride = 1;
+  for (int i = 0; i < 4; ++i)
+  {
+    dims.strides[i] = running_stride;
+    if (i < rank)
+    {
+      const int src = rank - 1 - i;
+      dims.sizes[i] = static_cast<int>(s[src]);
+      running_stride *= s[src];
+    }
+    else
+    {
+      dims.sizes[i] = 1;
+    }
+  }
+  return dims;
+}
+/**
+ * @brief Reads one value of type T from the byte buffer and advances the cursor.
+ *
+ * The bytes are copied into a local object, so buf does not have to be aligned for T.
+ *
+ * @param buf cursor into the buffer; moved forward by sizeof(T)
+ * @return the value read
+ */
+template <class T>
+static inline T deserializeT(const char *&buf)
+{
+  T value;
+  memcpy(&value, buf, sizeof(T));
+  buf += sizeof(T);
+  return value;
+}
+/**
+ * @brief Reads a Shape from the buffer: an int32 rank followed by that many int32 sizes.
+ * @param buf cursor into the buffer; advanced past the shape
+ * @return the shape read
+ */
+static inline Shape deserializeShape(const char *&buf)
+{
+  Shape shape;
+  const int32_t num_dims = deserializeT<int32_t>(buf);
+  shape.setDims(num_dims);
+  for (int d = 0; d < num_dims; ++d)
+  {
+    shape[d] = deserializeT<int32_t>(buf);
+  }
+  return shape;
+}
+/**
+ * @brief Reads a list of strides from the buffer: an int32 count followed by that many
+ *        int32 values.
+ *
+ * The count is read as int32_t, like every other field in this file, so that the
+ * number of bytes consumed does not depend on the width of int.
+ *
+ * @param buf cursor into the buffer; advanced past the list
+ * @return the strides in the order they were stored
+ */
+static inline vector<int32_t> deserializeStrides(const char *&buf)
+{
+  vector<int32_t> strides;
+  const int32_t num_strides = deserializeT<int32_t>(buf);
+  assert(num_strides >= 0);
+  if (num_strides > 0)
+  {
+    strides.reserve(static_cast<size_t>(num_strides));
+  }
+  for (int32_t i = 0; i < num_strides; ++i)
+  {
+    strides.emplace_back(deserializeT<int32_t>(buf));
+  }
+  return strides;
+}
+/**
+ * @brief Tells whether an address is a multiple of the given alignment.
+ * @param data      address to test
+ * @param alignment alignment in bytes
+ * @return true when the numeric value of data is divisible by alignment
+ */
+static bool isAddrAligned(const void *data, int alignment)
+{
+  const uintptr_t address = reinterpret_cast<uintptr_t>(data);
+  const uintptr_t remainder = address % alignment;
+  return remainder == 0;
+}
+/**
+ * @brief Reads a tensor description from the buffer and builds a Tensor on the data that
+ *        follows it.
+ *
+ * Layout read: int32 data type (must be 1), int32 element size (must be 4), a shape, then
+ * the element data. The Tensor is constructed from a pointer into buf.
+ * NOTE(review): whether Tensor copies or aliases that memory is not visible here — confirm.
+ *
+ * @param buf cursor into the buffer; advanced past the tensor data
+ * @return the tensor
+ */
+static inline Tensor deserializeTensor(const char*& buf)
+{
+  int32_t d_type = deserializeT<int32_t>(buf);
+  assert(d_type == 1 && "Unknown data type");
+  // d_type is read only by the assert above
+  UNUSED(d_type);
+  int32_t element_size = deserializeT<int32_t>(buf);
+  assert(element_size == 4 && "Unsupported element size");
+  Shape shape = deserializeShape(buf);
+  const float* data = reinterpret_cast<const float*>(buf);
+  assert(isAddrAligned(data, 4));
+  Tensor tensor(shape, const_cast<float*>(data));
+  buf += element_size * shape.getNumElems();
+  return tensor;
+}
+// This operation takes multiple tensors as input, at least 2, likely fewer than 7.
+// The parameter pack generalizes it to any number of inputs.
+/**
+ * @brief Concatenates the input tensors along one axis.
+ * @param out    resized to the deserialized output shape and filled with the result
+ * @param params serialized parameters: int32 axis, then the output shape
+ * @param inputs tensors to concatenate
+ */
+template <class ...Args>
+void concat(Tensor &out, const char *params, const Args &...inputs)
+{
+  const float *input_data[] = {inputs.getData()...};
+  Dims<4> input_dims[] = {shapeToDims(inputs.getShape())...};
+  const int32_t serialized_axis = deserializeT<int32_t>(params);
+  Shape out_s = deserializeShape(params);
+  // the inner function counts axes in reverse order
+  const int axis = static_cast<int>(out_s.getDims()) - 1 - serialized_axis;
+  int inputs_count = sizeof(input_data) / sizeof(input_data[0]);
+  out.reshape(out_s);
+  Concatenation(axis,
+                input_data, input_dims, inputs_count,
+                out.getData(), shapeToDims(out.getShape()));
+}
+/**
+ * @brief 2D convolution.
+ *
+ * Reads strides, pads and the output shape from params, in that order, and calls Conv.
+ *
+ * @param out       resized to the deserialized output shape and filled with the result
+ * @param params    serialized operation parameters
+ * @param input     input tensor
+ * @param kernel    kernel tensor
+ * @param temporary scratch tensor; its storage is handed to Conv as the im2col buffer
+ *                  when one is needed
+ */
+void conv2d(Tensor& out, const char* params, const Tensor& input, const Tensor& kernel,
+            Tensor& temporary) {
+  const vector<int32_t> stride_values = deserializeStrides(params);
+  const Shape pad_values = deserializeShape(params);
+  const Shape result_shape = deserializeShape(params);
+  out.reshape(result_shape);
+  assert(stride_values.size() == 2);
+  const auto stride_h = static_cast<int16>(stride_values[0]);
+  const auto stride_w = static_cast<int16>(stride_values[1]);
+  assert(pad_values.getDims() == 2);
+  const auto pad_h = static_cast<int16>(pad_values[0]);
+  const auto pad_w = static_cast<int16>(pad_values[1]);
+  const Shape& filter_shape = kernel.getShape();
+  const Shape im2col_shape{result_shape[0], result_shape[1], result_shape[2],
+                           filter_shape[1] * filter_shape[2] * filter_shape[3]};
+  // The scratch buffer is passed on only when a stride differs from 1 or kernel
+  // dimension 1 or 2 differs from 1; otherwise Conv receives a null im2col pointer.
+  const bool has_stride = stride_w != 1 || stride_h != 1;
+  const bool has_window = filter_shape[1] != 1 || filter_shape[2] != 1;
+  float* im2col_data = (has_stride || has_window) ? temporary.getData() : nullptr;
+  const ConvParams conv_params{{pad_w, pad_h}, stride_w, stride_h};
+  Conv(conv_params,
+       shapeToRuntimeShape(input.getShape()), input.getData(),
+       shapeToRuntimeShape(filter_shape), kernel.getData(),
+       shapeToRuntimeShape(result_shape), out.getData(),
+       shapeToRuntimeShape(im2col_shape), im2col_data);
+}
+/**
+ * @brief Transposed 2D convolution.
+ *
+ * Reads strides, pads and the output shape from params, in that order, reorders the
+ * kernel with permutation {2, 0, 1, 3} into a scratch buffer and calls TransposeConv.
+ *
+ * @param out       resized to the deserialized output shape and filled with the result
+ * @param params    serialized operation parameters
+ * @param input     input tensor
+ * @param kernel    kernel tensor
+ * @param temporary scratch tensor; its storage is handed to TransposeConv as the im2col buffer
+ */
+void convTransposed2d(Tensor& out, const char* params, const Tensor& input, const Tensor& kernel,
+                      Tensor& temporary) {
+  const vector<int32_t> stride_values = deserializeStrides(params);
+  const Shape pad_values = deserializeShape(params);
+  const Shape result_shape = deserializeShape(params);
+  out.reshape(result_shape);
+  assert(stride_values.size() == 2);
+  const auto stride_h = static_cast<int16>(stride_values[0]);
+  const auto stride_w = static_cast<int16>(stride_values[1]);
+  assert(pad_values.getDims() == 2);
+  const auto pad_h = static_cast<int16>(pad_values[0]);
+  const auto pad_w = static_cast<int16>(pad_values[1]);
+  const RuntimeShape in_rt_shape = shapeToRuntimeShape(input.getShape());
+  const RuntimeShape out_rt_shape = shapeToRuntimeShape(result_shape);
+  // Transpose the kernel from HWOI to OHWI format.
+  const Shape& hwoi_shape = kernel.getShape();
+  const RuntimeShape ohwi_shape = {static_cast<int>(hwoi_shape[2]),
+                                   static_cast<int>(hwoi_shape[0]),
+                                   static_cast<int>(hwoi_shape[1]),
+                                   static_cast<int>(hwoi_shape[3])};
+  unique_ptr<float[]> ohwi_data(new float[ohwi_shape.FlatSize()]);
+  TransposeParams to_ohwi{4, {2, 0, 1, 3}};
+  Transpose(to_ohwi,
+            shapeToRuntimeShape(hwoi_shape), kernel.getData(),
+            ohwi_shape, ohwi_data.get());
+  const int32 kernel_height = ohwi_shape.Dims(1);
+  const int32 kernel_width = ohwi_shape.Dims(2);
+  const RuntimeShape im2col_shape{out_rt_shape.Dims(0),
+                                  out_rt_shape.Dims(1),
+                                  out_rt_shape.Dims(2),
+                                  in_rt_shape.Dims(3) * kernel_width * kernel_height};
+  ConvParams conv_params{{pad_w, pad_h}, stride_w, stride_h};
+  TransposeConv(conv_params,
+                in_rt_shape, input.getData(),
+                ohwi_shape, ohwi_data.get(),
+                out_rt_shape, out.getData(),
+                im2col_shape, temporary.getData());
+}
+/**
+ * @brief Depthwise 2D convolution.
+ *
+ * Reads strides, pads and the output shape from params, in that order. The depth
+ * multiplier is the ratio of output dimension 3 to input dimension 3.
+ *
+ * @param out    resized to the deserialized output shape and filled with the result
+ * @param params serialized operation parameters
+ * @param input  input tensor
+ * @param kernel kernel tensor
+ */
+void depthwiseConv2d(Tensor& out, const char* params, const Tensor& input, const Tensor& kernel) {
+  const vector<int32_t> stride_values = deserializeStrides(params);
+  const Shape pad_values = deserializeShape(params);
+  const Shape result_shape = deserializeShape(params);
+  out.reshape(result_shape);
+  assert(stride_values.size() == 2);
+  const auto stride_h = static_cast<int16>(stride_values[0]);
+  const auto stride_w = static_cast<int16>(stride_values[1]);
+  assert(pad_values.getDims() == 2);
+  const auto pad_h = static_cast<int16>(pad_values[0]);
+  const auto pad_w = static_cast<int16>(pad_values[1]);
+  const RuntimeShape in_rt_shape = shapeToRuntimeShape(input.getShape());
+  const RuntimeShape filter_rt_shape = shapeToRuntimeShape(kernel.getShape());
+  const RuntimeShape out_rt_shape = shapeToRuntimeShape(result_shape);
+  const auto depth_multiplier =
+      static_cast<int16>(out_rt_shape.Dims(3) / in_rt_shape.Dims(3));
+  assert(out_rt_shape.Dims(3) % in_rt_shape.Dims(3) == 0);
+  // Reshape the kernel: its dimensions 2 and 3 are merged into a single last dimension
+  // and a leading dimension of 1 is added.
+  const int output_channels = filter_rt_shape.Dims(3) * filter_rt_shape.Dims(2);
+  assert(output_channels == out_rt_shape.Dims(3));
+  const int kernel_w = filter_rt_shape.Dims(1);
+  const int kernel_h = filter_rt_shape.Dims(0);
+  const RuntimeShape merged_filter_shape = {1, kernel_h, kernel_w, output_channels};
+  const DepthwiseParams depthwise_conv_params = {{pad_w, pad_h}, stride_w,
+                                                 stride_h, 1, 1,
+                                                 depth_multiplier};
+  // TODO Fusing bias into depthwise conv is close to a no-op due to the nature of the operation
+  // consider doing that
+  DepthwiseConv(depthwise_conv_params,
+                in_rt_shape, input.getData(),
+                merged_filter_shape, kernel.getData(),
+                out_rt_shape, out.getData());
+}
+/**
+ * @brief Softmax over the last axis with beta fixed to 1.
+ * @param out    resized to the shape of in and filled with the result
+ * @param params serialized parameters: an int32 axis, asserted to be the last axis of in
+ * @param in     input tensor
+ */
+void softmax(Tensor &out, const char *params, const Tensor &in)
+{
+  const float *src = in.getData();
+  Dims<4> src_dims = shapeToDims(in.getShape());
+  float beta = 1;
+  int32_t axis = deserializeT<int32_t>(params);
+  assert(axis == in.getShape().getDims() - 1);
+  UNUSED(axis);
+  out.reshape(in.getShape());
+  float *dst = out.getData();
+  Softmax(src, src_dims, beta, dst, src_dims);
+}
+/**
+ * @brief Average pooling.
+ *
+ * Reads from params, in this order: window shape, strides, pads, an int32 include-pad
+ * flag and the output shape. Window, strides and pads each hold two values, index 0 for
+ * height and index 1 for width.
+ *
+ * @param out    resized to the deserialized output shape and filled with the result
+ * @param params serialized operation parameters
+ * @param in     input tensor
+ */
+void avgPool(Tensor &out, const char *params, const Tensor &in)
+{
+  const float *src = in.getData();
+  Dims<4> src_dims = shapeToDims(in.getShape());
+  Shape window_shape = deserializeShape(params);
+  vector<int32_t> stride_values = deserializeStrides(params);
+  Shape pad_values = deserializeShape(params);
+  bool include_pad = deserializeT<int32_t>(params);
+  Shape result_shape = deserializeShape(params);
+  assert(window_shape.getDims() == 2);
+  const int window_h = static_cast<int>(window_shape[0]);
+  const int window_w = static_cast<int>(window_shape[1]);
+  assert(stride_values.size() == 2);
+  const int stride_h = static_cast<int>(stride_values[0]);
+  const int stride_w = static_cast<int>(stride_values[1]);
+  assert(pad_values.getDims() == 2);
+  const int pad_h = static_cast<int>(pad_values[0]);
+  const int pad_w = static_cast<int>(pad_values[1]);
+  out.reshape(result_shape);
+  Dims<4> dst_dims = shapeToDims(result_shape);
+  AveragePool(src, src_dims,
+              stride_w, stride_h,
+              pad_w, pad_h,
+              window_w, window_h,
+              out.getData(), dst_dims,
+              include_pad);
+}
+/**
+ * @brief Max pooling.
+ *
+ * Reads from params, in this order: window shape, strides, pads and the output shape.
+ * Window, strides and pads each hold two values, index 0 for height and index 1 for width.
+ *
+ * @param out    resized to the deserialized output shape and filled with the result
+ * @param params serialized operation parameters
+ * @param in     input tensor
+ */
+void maxPool(Tensor &out, const char *params, const Tensor &in)
+{
+  const float *src = in.getData();
+  Dims<4> src_dims = shapeToDims(in.getShape());
+  Shape window_shape = deserializeShape(params);
+  vector<int32_t> stride_values = deserializeStrides(params);
+  Shape pad_values = deserializeShape(params);
+  Shape result_shape = deserializeShape(params);
+  assert(window_shape.getDims() == 2);
+  const int window_h = static_cast<int>(window_shape[0]);
+  const int window_w = static_cast<int>(window_shape[1]);
+  assert(stride_values.size() == 2);
+  const int stride_h = static_cast<int>(stride_values[0]);
+  const int stride_w = static_cast<int>(stride_values[1]);
+  assert(pad_values.getDims() == 2);
+  const int pad_h = static_cast<int>(pad_values[0]);
+  const int pad_w = static_cast<int>(pad_values[1]);
+  out.reshape(result_shape);
+  Dims<4> dst_dims = shapeToDims(result_shape);
+  MaxPool(src, src_dims,
+          stride_w, stride_h,
+          pad_w, pad_h,
+          window_w, window_h,
+          out.getData(), dst_dims);
+}
+/**
+ * @brief Fully connected layer: forwards input and weights to FullyConnected.
+ * @param out    resized to the deserialized output shape and filled with the result
+ * @param params serialized parameters: the output shape
+ * @param in     input tensor
+ * @param w      weights tensor
+ */
+void fullConnect(Tensor& out, const char* params, const Tensor& in, const Tensor& w) {
+  const Shape result_shape = deserializeShape(params);
+  out.reshape(result_shape);
+  FullyConnected(in.getData(), shapeToDims(in.getShape()),
+                 w.getData(), shapeToDims(w.getShape()),
+                 out.getData(), shapeToDims(result_shape));
+}
+/**
+ * @brief Resize assuming tflite axis order (NHWC)
+ *
+ * Both tensors must be 4D and agree in dimensions 0 and 3; only dimensions 1 and 2 are
+ * resized, using ResizeNearestNeighbor.
+ *
+ * @param out    resized to the deserialized output shape and filled with the result
+ * @param params serialized parameters: the output shape
+ * @param in     input tensor
+ */
+void resize(Tensor& out, const char* params, const Tensor& in) {
+  // The Tensorflow version of this op allows resize on the width and height
+  // axis only.
+  const float* src = in.getData();
+  assert(in.getShape().getDims() == 4 && "Should be a 4d tensor");
+  RuntimeShape src_shape = shapeToRuntimeShape(in.getShape());
+  Shape result_shape = deserializeShape(params);
+  out.reshape(result_shape);
+  assert(result_shape.getDims() == 4 && "Should be a 4d tensor");
+  RuntimeShape dst_shape = shapeToRuntimeShape(result_shape);
+  assert(result_shape[0] == src_shape.Dims(0) && result_shape[3] == src_shape.Dims(3) &&
+         "Resize is unly supported over hight and width");
+  const int new_height = static_cast<int>(result_shape[1]);
+  const int new_width = static_cast<int>(result_shape[2]);
+  ResizeNearestNeighbor<float>(
+    src_shape, src,
+    new_height, new_width,
+    dst_shape, out.getData());
+}
+/**
+ * @brief Capped ReLU: forwards the input and the cap value to CappedRelu.
+ * @param out    resized to the shape of in and filled with the result
+ * @param params serialized parameters: the cap as a float
+ * @param in     input tensor
+ */
+void cappedRelu(Tensor &out, const char *params, const Tensor &in)
+{
+  const float *src = in.getData();
+  Dims<4> src_dims = shapeToDims(in.getShape());
+  float cap = deserializeT<float>(params);
+  out.reshape(in.getShape());
+  float *dst = out.getData();
+  CappedRelu(src, src_dims, cap, dst, src_dims);
+}
+/**
+ * @brief Extracts a slice of the input tensor.
+ *
+ * Reads from params, in this order: start offsets, slice sizes and the output shape.
+ * Starts and sizes must have the same number of entries, at most 4.
+ *
+ * @param out    resized to the deserialized output shape and filled with the result
+ * @param params serialized operation parameters
+ * @param in     input tensor
+ */
+void slice(Tensor& out, const char* params, const Tensor& in) {
+  Shape begin_values = deserializeShape(params);
+  Shape size_values = deserializeShape(params);
+  Shape result_shape = deserializeShape(params);
+  out.reshape(result_shape);
+  SliceParams slice_params;
+  slice_params.begin_count = static_cast<uint8>(begin_values.getDims());
+  slice_params.size_count = static_cast<uint8>(size_values.getDims());
+  assert(slice_params.begin_count <= 4);
+  assert(slice_params.size_count <= 4);
+  assert(begin_values.getDims() == size_values.getDims());
+  for (int d = 0; d < slice_params.begin_count; ++d) {
+    slice_params.begin[d] = static_cast<int32>(begin_values[d]);
+    slice_params.size[d] = static_cast<int32>(size_values[d]);
+  }
+  Slice(slice_params,
+        shapeToRuntimeShape(in.getShape()), in.getData(),
+        shapeToRuntimeShape(result_shape), out.getData());
+}
+/**
+ * @brief ReLU activation: forwards the input to Relu.
+ * @param out    resized to the shape of in and filled with the result
+ * @param params not read by this operation
+ * @param in     input tensor
+ */
+void relu(Tensor &out, const char *params, const Tensor &in)
+{
+  // nothing is deserialized here; mark the parameter as tanhActivation does
+  UNUSED(params);
+  const float *input = in.getData();
+  Dims<4> input_d = shapeToDims(in.getShape());
+  out.reshape(in.getShape());
+  Relu(input, input_d, out.getData(), input_d);
+}
+/**
+ * @brief Sigmoid activation: forwards the input to Logistic.
+ * @param out    resized to the shape of in and filled with the result
+ * @param params not read by this operation
+ * @param in     input tensor
+ */
+void sigmoid(Tensor& out, const char* params, const Tensor& in) {
+  // nothing is deserialized here; mark the parameter as tanhActivation does
+  UNUSED(params);
+  out.reshape(in.getShape());
+  Logistic(shapeToRuntimeShape(in.getShape()), in.getData(),
+           shapeToRuntimeShape(out.getShape()), out.getData());
+}
+/**
+ * @brief ELU activation: forwards the input and alpha to ELU.
+ * @param out    resized to the shape of in and filled with the result
+ * @param params serialized parameters: alpha as a float
+ * @param in     input tensor
+ */
+void elu(Tensor &out, const char* params, const Tensor& in) {
+  const float* src = in.getData();
+  const Dims<4> src_dims = shapeToDims(in.getShape());
+  const float alpha = deserializeT<float>(params);
+  out.reshape(in.getShape());
+  float* dst = out.getData();
+  ELU(src, src_dims, alpha, dst, src_dims);
+}
+/**
+ * @brief Tanh activation: forwards the input to Tanh.
+ * @param out    resized to the shape of in and filled with the result
+ * @param params not read by this operation
+ * @param in     input tensor
+ */
+void tanhActivation(Tensor &out, const char* params, const Tensor& in) {
+  UNUSED(params);
+  const float* src = in.getData();
+  const Dims<4> src_dims = shapeToDims(in.getShape());
+  out.reshape(in.getShape());
+  float* dst = out.getData();
+  // output dimensions are derived from the input shape as well
+  const Dims<4> dst_dims = shapeToDims(in.getShape());
+  Tanh(src, src_dims, dst, dst_dims);
+}
+/**
+ * @brief Binary element-wise operation; the actual computation is F::Call.
+ * @tparam F     type providing a static Call(lhs data, lhs shape, rhs data, rhs shape,
+ *               out data, out shape)
+ * @param out    resized to the deserialized output shape and filled with the result
+ * @param params serialized parameters: the output shape
+ * @param lhs    left operand
+ * @param rhs    right operand
+ */
+template <typename F>
+void ElementWise(Tensor &out, const char *params, const Tensor &lhs, const Tensor &rhs)
+{
+  const float *left = lhs.getData();
+  const float *right = rhs.getData();
+  const Shape result_shape = deserializeShape(params);
+  out.reshape(result_shape);
+  F::Call(left, shapeToRuntimeShape(lhs.getShape()),
+          right, shapeToRuntimeShape(rhs.getShape()),
+          out.getData(), shapeToRuntimeShape(result_shape));
+}
+// TODO refactor tflite's code for this op
+/**
+ * @brief Gives out the deserialized shape and fills it with the elements of in.
+ *
+ * The element count of the new shape must equal that of the input.
+ *
+ * @param out    resized to the deserialized output shape and filled from in
+ * @param params serialized parameters: the output shape
+ * @param in     input tensor
+ */
+void reshape(Tensor& out, const char* params, const Tensor& in) {
+  Shape target_shape = deserializeShape(params);
+  assert(target_shape.getNumElems() == in.getShape().getNumElems());
+  out.reshape(target_shape);
+  out.fillData(in.getData(), in.getShape().getNumElems());
+}
+/**
+ * @brief Mean reduction over the given axes.
+ *
+ * Reads from params, in this order: the reduction axes (stored as a shape), an int32
+ * keep-dims flag and the output shape. Shapes and axes are copied into fixed arrays of
+ * 8 entries before Mean is called.
+ *
+ * @param out    resized to the deserialized output shape and filled with the result
+ * @param params serialized operation parameters
+ * @param in     input tensor
+ */
+void reduceMean(Tensor& out, const char* params, const Tensor& in) {
+  Shape tmp_reduction_dims = deserializeShape(params);
+  bool keep_dims = static_cast<bool>(deserializeT<int32_t>(params));
+  Shape out_s = deserializeShape(params);
+  out.reshape(out_s);
+  const int32_t rank_inp = static_cast<int32_t>(in.getShape().getDims());
+  const int32_t rank_out = static_cast<int32_t>(out_s.getDims());
+  const int32_t rank_axis = static_cast<int32_t>(tmp_reduction_dims.getDims());
+  int32_t in_dim[8];
+  int32_t tmp_index[8]; // input iterator storage
+  // NOTE(review): this bound is strict while the two below are not — confirm it is intended
+  assert(rank_inp < 8);
+  for (int i = 0; i < rank_inp; i++) {
+    in_dim[i] = static_cast<int32_t>(in.getShape()[i]);
+  }
+  int32_t out_dim[8];
+  assert(rank_out <= 8);
+  for (int i = 0; i < rank_out; i++) {
+    out_dim[i] = static_cast<int32_t>(out.getShape()[i]);
+  }
+  int32_t axis[8];
+  int32_t resolved_axis[8]; // in case there are negative or duplicate indexes
+  assert(rank_axis <= 8);
+  for (int i = 0; i < rank_axis; i++) {
+    axis[i] = static_cast<int32_t>(tmp_reduction_dims[i]);
+  }
+  // scratch buffer for Mean; unique_ptr releases it on every exit path
+  unique_ptr<float[]> temp_sum(new float[out_s.getNumElems()]);
+  bool succ = Mean(
+    in.getData(), in_dim, rank_inp,
+    out.getData(), out_dim, rank_out,
+    axis, rank_axis, keep_dims,
+    tmp_index, resolved_axis, temp_sum.get()
+  );
+  assert(succ && "Mean failed!");
+  // succ is read only by the assert above
+  UNUSED(succ);
+}
+/**
+ * @brief Pads the input tensor.
+ *
+ * Reads from params, in this order: the output shape, an int32 number of dimensions and,
+ * for each dimension, an int32 left padding followed by an int32 right padding. Both
+ * padding lists are then extended with zeros up to 4 entries.
+ *
+ * @param out    resized to the deserialized output shape and filled with the result
+ * @param params serialized operation parameters
+ * @param in     input tensor
+ */
+void pad(Tensor& out, const char* params, const Tensor& in) {
+  const float* src = in.getData();
+  const Dims<4> src_dims = shapeToDims(in.getShape());
+  // deserialize output shape
+  Shape result_shape = deserializeShape(params);
+  // deserialize number of dimensions
+  const int32_t num_dim = deserializeT<int32_t>(params);
+  // deserialize paddings
+  std::vector<int> left_paddings;
+  std::vector<int> right_paddings;
+  for (int d = 0; d < num_dim; ++d) {
+    const int32_t before = deserializeT<int32_t>(params);
+    left_paddings.push_back(before);
+    const int32_t after = deserializeT<int32_t>(params);
+    right_paddings.push_back(after);
+  }
+  // the remaining dimensions, up to 4, are not padded
+  for (int d = num_dim; d < 4; ++d) {
+    left_paddings.push_back(0);
+    right_paddings.push_back(0);
+  }
+  out.reshape(result_shape);
+  float* dst = out.getData();
+  const Dims<4> dst_dims = shapeToDims(out.getShape());
+  Pad(src, src_dims, left_paddings, right_paddings, dst, dst_dims);
+}
+/**
+ * @brief Square root: forwards the input to Sqrt.
+ * @param out    resized to the shape of in and filled with the result
+ * @param params not read by this operation
+ * @param in     input tensor
+ */
+void sqrtFN(Tensor& out, const char* params, const Tensor& in) {
+  // no params to deserialize
+  UNUSED(params);
+  const float* input = in.getData();
+  const Dims<4> inp_d = shapeToDims(in.getShape());
+  out.reshape(in.getShape());
+  Sqrt(input, inp_d, out.getData());
+}
+/**
+ * @brief Absolute value of every element.
+ * @param out    resized to the shape of in and filled with the result
+ * @param params not read by this operation
+ * @param in     input tensor
+ */
+void absFN(Tensor &out, const char *params, const Tensor& in) {
+  // nothing is deserialized here; mark the parameter as tanhActivation does
+  UNUSED(params);
+  out.reshape(in.getShape());
+  const float* in_data = in.getData();
+  float* out_data = out.getData();
+  const index_t num_elements = in.getShape().getNumElems();
+  for (index_t i = 0; i < num_elements; ++i) {
+    // NOTE(review): relies on a floating-point overload of abs being visible here — confirm
+    out_data[i] = abs(in_data[i]);
+  }
+}
+/**
+ * @brief Permutes the axes of the input tensor.
+ *
+ * Reads from params, in this order: an int32 permutation length, that many int32
+ * permutation entries and the output shape, whose element count must equal the input's.
+ *
+ * @param out    resized to the deserialized output shape and filled with the result
+ * @param params serialized operation parameters
+ * @param in     input tensor
+ */
+void transpose(Tensor &out, const char *params, const Tensor &in) {
+  TransposeParams transpose_params;
+  transpose_params.perm_count = static_cast<int8>(deserializeT<int32_t>(params));
+  // NOTE(review): perm_count is not checked against the capacity of perm — confirm the bound
+  for (int p = 0; p < transpose_params.perm_count; ++p) {
+    transpose_params.perm[p] = deserializeT<int32_t>(params);
+  }
+  Shape result_shape = deserializeShape(params);
+  assert(result_shape.getNumElems() == in.getShape().getNumElems());
+  out.reshape(result_shape);
+  Transpose(transpose_params,
+            shapeToRuntimeShape(in.getShape()), in.getData(),
+            shapeToRuntimeShape(out.getShape()), out.getData());
+}
+/**
+ * @brief Gathers slices of data selected by indices.
+ * @param out     resized to the deserialized output shape and filled with the result
+ * @param params  serialized parameters: an int32 axis, then the output shape
+ * @param data    tensor to gather from
+ * @param indices tensor holding the indices
+ */
+void gather(Tensor &out, const char *params, const Tensor &data, const Tensor &indices) {
+  GatherParams gather_params;
+  gather_params.axis = static_cast<int16>(deserializeT<int32_t>(params));
+  Shape result_shape = deserializeShape(params);
+  out.reshape(result_shape);
+  // NOTE(review): an earlier comment here said a reinterpret_cast is applied because indices
+  // in ModelIR are integral while getData returns a pointer to float. No such cast is present;
+  // the float pointer is passed as is — confirm what Gather expects.
+  Gather(gather_params,
+         shapeToRuntimeShape(data.getShape()), data.getData(),
+         shapeToRuntimeShape(indices.getShape()), indices.getData(),
+         shapeToRuntimeShape(out.getShape()), out.getData());
+}
+/**
+ * @brief Broadcasts the input to the deserialized output shape using Broadcast4DSlow.
+ * @param out    resized to the deserialized output shape and filled with the result
+ * @param params serialized parameters: the output shape
+ * @param in     input tensor
+ */
+void broadcast(Tensor &out, const char *params, const Tensor &in)
+{
+  Shape result_shape = deserializeShape(params);
+  out.reshape(result_shape);
+  Broadcast4DSlow(shapeToRuntimeShape(in.getShape()), in.getData(),
+                  shapeToRuntimeShape(result_shape), out.getData());
+}
+/**
+ * @brief Assigns to out the tensor deserialized from params.
+ * @param out    receives the deserialized tensor
+ * @param params serialized tensor (see deserializeTensor)
+ */
+void constant(Tensor& out, const char* params) {
+  out = deserializeTensor(params);
+}
+void out(const char* params, const Tensor& in) {