summaryrefslogtreecommitdiff
path: root/compiler/luci-interpreter/pal/mcu
diff options
context:
space:
mode:
Diffstat (limited to 'compiler/luci-interpreter/pal/mcu')
-rw-r--r--compiler/luci-interpreter/pal/mcu/KernelsToBuild.lst63
-rw-r--r--compiler/luci-interpreter/pal/mcu/PALArgMax.h33
-rw-r--r--compiler/luci-interpreter/pal/mcu/PALAveragePool2d.h73
-rw-r--r--compiler/luci-interpreter/pal/mcu/PALBatchToSpaceND.h37
-rw-r--r--compiler/luci-interpreter/pal/mcu/PALConv2d.h85
-rw-r--r--compiler/luci-interpreter/pal/mcu/PALDepthToSpace.h35
-rw-r--r--compiler/luci-interpreter/pal/mcu/PALDepthwiseConv2d.h91
-rw-r--r--compiler/luci-interpreter/pal/mcu/PALDequantize.h44
-rw-r--r--compiler/luci-interpreter/pal/mcu/PALElu.h33
-rw-r--r--compiler/luci-interpreter/pal/mcu/PALFullyConnected.h61
-rw-r--r--compiler/luci-interpreter/pal/mcu/PALL2Normalize.h34
-rw-r--r--compiler/luci-interpreter/pal/mcu/PALL2Pool2D.h33
-rw-r--r--compiler/luci-interpreter/pal/mcu/PALLeakyRelu.h32
-rw-r--r--compiler/luci-interpreter/pal/mcu/PALMul.h45
-rw-r--r--compiler/luci-interpreter/pal/mcu/PALNeg.h32
-rw-r--r--compiler/luci-interpreter/pal/mcu/PALQuantize.h44
-rw-r--r--compiler/luci-interpreter/pal/mcu/PALResizeBilinear.h37
-rw-r--r--compiler/luci-interpreter/pal/mcu/PALResizeNearestNeighbor.h37
-rw-r--r--compiler/luci-interpreter/pal/mcu/PALSVDF.h258
-rw-r--r--compiler/luci-interpreter/pal/mcu/PALSoftmax.h62
-rw-r--r--compiler/luci-interpreter/pal/mcu/PALSpaceToBatchND.h38
-rw-r--r--compiler/luci-interpreter/pal/mcu/PALSpaceToDepth.h35
-rw-r--r--compiler/luci-interpreter/pal/mcu/PALSub.h35
-rw-r--r--compiler/luci-interpreter/pal/mcu/PALreference_ops.h1556
-rw-r--r--compiler/luci-interpreter/pal/mcu/pal.cmake56
25 files changed, 2889 insertions, 0 deletions
diff --git a/compiler/luci-interpreter/pal/mcu/KernelsToBuild.lst b/compiler/luci-interpreter/pal/mcu/KernelsToBuild.lst
new file mode 100644
index 000000000..fe3f73f5d
--- /dev/null
+++ b/compiler/luci-interpreter/pal/mcu/KernelsToBuild.lst
@@ -0,0 +1,63 @@
+REGISTER_KERNEL(Abs)
+REGISTER_KERNEL(Add)
+REGISTER_KERNEL(ArgMax)
+REGISTER_KERNEL(AveragePool2D)
+REGISTER_KERNEL(BatchToSpaceND)
+REGISTER_KERNEL(Cast)
+REGISTER_KERNEL(Concatenation)
+REGISTER_KERNEL(Conv2D)
+REGISTER_KERNEL(DepthToSpace)
+REGISTER_KERNEL(DepthwiseConv2D)
+REGISTER_KERNEL(Dequantize)
+REGISTER_KERNEL(Div)
+REGISTER_KERNEL(Elu)
+REGISTER_KERNEL(Exp)
+REGISTER_KERNEL(ExpandDims)
+REGISTER_KERNEL(Fill)
+REGISTER_KERNEL(Floor)
+REGISTER_KERNEL(FloorDiv)
+REGISTER_KERNEL(Equal)
+REGISTER_KERNEL(FullyConnected)
+REGISTER_KERNEL(Greater)
+REGISTER_KERNEL(GreaterEqual)
+REGISTER_KERNEL(If)
+REGISTER_KERNEL(InstanceNorm)
+REGISTER_KERNEL(L2Normalize)
+REGISTER_KERNEL(L2Pool2D)
+REGISTER_KERNEL(LeakyRelu)
+REGISTER_KERNEL(Less)
+REGISTER_KERNEL(LessEqual)
+REGISTER_KERNEL(LogicalAnd)
+REGISTER_KERNEL(LogicalNot)
+REGISTER_KERNEL(LogicalOr)
+REGISTER_KERNEL(Logistic)
+REGISTER_KERNEL(Maximum)
+REGISTER_KERNEL(MaxPool2D)
+REGISTER_KERNEL(Minimum)
+REGISTER_KERNEL(MirrorPad)
+REGISTER_KERNEL(Mul)
+REGISTER_KERNEL(Neg)
+REGISTER_KERNEL(NotEqual)
+REGISTER_KERNEL(Pad)
+REGISTER_KERNEL(PadV2)
+REGISTER_KERNEL(PRelu)
+REGISTER_KERNEL(Quantize)
+REGISTER_KERNEL(Reshape)
+REGISTER_KERNEL(ResizeBilinear)
+REGISTER_KERNEL(ResizeNearestNeighbor)
+REGISTER_KERNEL(Rsqrt)
+REGISTER_KERNEL(Shape)
+REGISTER_KERNEL(Softmax)
+REGISTER_KERNEL(SpaceToBatchND)
+REGISTER_KERNEL(SpaceToDepth)
+REGISTER_KERNEL(StridedSlice)
+REGISTER_KERNEL(Sqrt)
+REGISTER_KERNEL(Square)
+REGISTER_KERNEL(SquaredDifference)
+REGISTER_KERNEL(Squeeze)
+REGISTER_KERNEL(Sub)
+REGISTER_KERNEL(SVDF)
+REGISTER_KERNEL(Tanh)
+REGISTER_KERNEL(Transpose)
+REGISTER_KERNEL(TransposeConv)
+REGISTER_KERNEL(While)
diff --git a/compiler/luci-interpreter/pal/mcu/PALArgMax.h b/compiler/luci-interpreter/pal/mcu/PALArgMax.h
new file mode 100644
index 000000000..21e63296d
--- /dev/null
+++ b/compiler/luci-interpreter/pal/mcu/PALArgMax.h
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_PAL_ARGMAX_H
+#define LUCI_INTERPRETER_PAL_ARGMAX_H
+
+#include <tensorflow/lite/kernels/internal/reference/arg_min_max.h>
+
+namespace luci_interpreter_pal
+{
+template <typename T1, typename T2, typename T3>
+static inline void ArgMinMax(const tflite::RuntimeShape &input1_shape, const T1 *input1_data,
+ const T2 *axis, const tflite::RuntimeShape &output_shape,
+ T3 *output_data, const std::greater<T1> cmp)
+{
+ tflite::reference_ops::ArgMinMax(input1_shape, input1_data, axis, output_shape, output_data, cmp);
+}
+} // namespace luci_interpreter_pal
+
+#endif // LUCI_INTERPRETER_PAL_ARGMAX_H
diff --git a/compiler/luci-interpreter/pal/mcu/PALAveragePool2d.h b/compiler/luci-interpreter/pal/mcu/PALAveragePool2d.h
new file mode 100644
index 000000000..cce30601f
--- /dev/null
+++ b/compiler/luci-interpreter/pal/mcu/PALAveragePool2d.h
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_PAL_AVERAGEPOOL2D_H
+#define LUCI_INTERPRETER_PAL_AVERAGEPOOL2D_H
+
+#include <tensorflow/lite/kernels/internal/reference/integer_ops/pooling.h>
+#include <tensorflow/lite/kernels/internal/reference/pooling.h>
+
+namespace luci_interpreter_pal
+{
+template <typename T>
+static inline void AveragePool(const tflite::PoolParams &params,
+ const tflite::RuntimeShape &input_shape, const T *input_data,
+ const tflite::RuntimeShape &output_shape, T *output_data,
+ const tflite::RuntimeShape &scratchpad_shape, T *scratchpad_data)
+{
+ {
+ // MARK: At this moment this operation doesn't support
+ assert(false && "AveragePool NYI");
+ (void)params;
+ (void)input_shape;
+ (void)input_data;
+ (void)output_shape;
+ (void)output_data;
+ (void)scratchpad_shape;
+ (void)scratchpad_data;
+ }
+}
+
+template <>
+inline void AveragePool<int8_t>(const tflite::PoolParams &params,
+ const tflite::RuntimeShape &input_shape, const int8_t *input_data,
+ const tflite::RuntimeShape &output_shape, int8_t *output_data,
+ const tflite::RuntimeShape &scratchpad_shape,
+ int8_t *scratchpad_data)
+{
+ (void)scratchpad_shape;
+ (void)scratchpad_data;
+
+ tflite::reference_integer_ops::AveragePool(params, input_shape, input_data, output_shape,
+ output_data);
+}
+
+static inline void SetupScratchpadTensor(luci_interpreter::Tensor *scratchpad,
+ const luci_interpreter::DataType &input_data_type,
+ const tflite::RuntimeShape &input_shape,
+ const tflite::RuntimeShape &output_shape)
+
+{
+ (void)input_data_type;
+ (void)input_shape;
+ (void)output_shape;
+
+ scratchpad->set_allocatable(false);
+}
+
+} // namespace luci_interpreter_pal
+
+#endif // LUCI_INTERPRETER_PAL_AVERAGEPOOL2D_H
diff --git a/compiler/luci-interpreter/pal/mcu/PALBatchToSpaceND.h b/compiler/luci-interpreter/pal/mcu/PALBatchToSpaceND.h
new file mode 100644
index 000000000..f8a4a8036
--- /dev/null
+++ b/compiler/luci-interpreter/pal/mcu/PALBatchToSpaceND.h
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_PAL_BATCHTOSPACEND_H
+#define LUCI_INTERPRETER_PAL_BATCHTOSPACEND_H
+
+#include <tensorflow/lite/kernels/internal/reference/batch_to_space_nd.h>
+
+namespace luci_interpreter_pal
+{
+template <typename T>
+static inline void
+BatchToSpaceND(const tflite::RuntimeShape &unextended_input1_shape, const T *input1_data,
+ const tflite::RuntimeShape &unextended_input2_shape, const int32 *block_shape_data,
+ const tflite::RuntimeShape &unextended_input3_shape, const int32 *crops_data,
+ const tflite::RuntimeShape &unextended_output_shape, T *output_data)
+{
+ tflite::reference_ops::BatchToSpaceND(
+ unextended_input1_shape, input1_data, unextended_input2_shape, block_shape_data,
+ unextended_input3_shape, crops_data, unextended_output_shape, output_data);
+}
+} // namespace luci_interpreter_pal
+
+#endif // LUCI_INTERPRETER_PAL_BATCHTOSPACEND_H
diff --git a/compiler/luci-interpreter/pal/mcu/PALConv2d.h b/compiler/luci-interpreter/pal/mcu/PALConv2d.h
new file mode 100644
index 000000000..13976877a
--- /dev/null
+++ b/compiler/luci-interpreter/pal/mcu/PALConv2d.h
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_PAL_CONV2D_H
+#define LUCI_INTERPRETER_PAL_CONV2D_H
+
+#include <tensorflow/lite/kernels/internal/reference/conv.h>
+#include <tensorflow/lite/kernels/internal/reference/integer_ops/conv.h>
+
+namespace luci_interpreter_pal
+{
+static inline void Conv(const tflite::ConvParams &params, const tflite::RuntimeShape &input_shape,
+ const float *input_data, const tflite::RuntimeShape &filter_shape,
+ const float *filter_data, const tflite::RuntimeShape &bias_shape,
+ const float *bias_data, const tflite::RuntimeShape &output_shape,
+ float *output_data, const tflite::RuntimeShape &scratchpad_shape,
+ float *scratchpad_data)
+{
+ (void)scratchpad_shape;
+ (void)scratchpad_data;
+ tflite::reference_ops::Conv(params, input_shape, input_data, filter_shape, filter_data,
+ bias_shape, bias_data, output_shape, output_data,
+ tflite::RuntimeShape(), nullptr);
+}
+
+static inline void Conv(const tflite::ConvParams &params, const tflite::RuntimeShape &input_shape,
+ const uint8 *input_data, const tflite::RuntimeShape &filter_shape,
+ const uint8 *filter_data, const tflite::RuntimeShape &bias_shape,
+ const int32 *bias_data, const tflite::RuntimeShape &output_shape,
+ uint8 *output_data, const tflite::RuntimeShape &scratchpad_shape,
+ uint8 *scratchpad_data)
+{
+ (void)scratchpad_shape;
+ (void)scratchpad_data;
+ tflite::reference_ops::Conv(params, input_shape, input_data, filter_shape, filter_data,
+ bias_shape, bias_data, output_shape, output_data, scratchpad_shape,
+ scratchpad_data, nullptr);
+}
+
+static inline void ConvPerChannel(const tflite::ConvParams &params, const int32_t *mult,
+ const int32_t *shifts, const tflite::RuntimeShape &input_shape,
+ const int8 *input_data, const tflite::RuntimeShape &filter_shape,
+ const int8 *filter_data, const tflite::RuntimeShape &bias_shape,
+ const int32 *bias_data, const tflite::RuntimeShape &output_shape,
+ int8 *output_data, const tflite::RuntimeShape &scratchpad_shape,
+ int8 *scratchpad_data)
+{
+ (void)scratchpad_shape;
+ (void)scratchpad_data;
+ tflite::reference_integer_ops::ConvPerChannel(params, mult, shifts, input_shape, input_data,
+ filter_shape, filter_data, bias_shape, bias_data,
+ output_shape, output_data);
+}
+
+static inline void SetupScratchpadTensor(luci_interpreter::Tensor *scratchpad,
+ const luci_interpreter::DataType &input_data_type,
+ const tflite::ConvParams &params,
+ const tflite::RuntimeShape &input_shape,
+ const tflite::RuntimeShape &filter_shape,
+ const tflite::RuntimeShape &output_shape)
+{
+ (void)input_data_type;
+ (void)params;
+ (void)input_shape;
+ (void)filter_shape;
+ (void)output_shape;
+ scratchpad->set_allocatable(false);
+}
+
+} // namespace luci_interpreter_pal
+
+#endif // LUCI_INTERPRETER_PAL_CONV2D_H
diff --git a/compiler/luci-interpreter/pal/mcu/PALDepthToSpace.h b/compiler/luci-interpreter/pal/mcu/PALDepthToSpace.h
new file mode 100644
index 000000000..8463e571e
--- /dev/null
+++ b/compiler/luci-interpreter/pal/mcu/PALDepthToSpace.h
@@ -0,0 +1,35 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_PAL_DEPTHTOSPACE_H
+#define LUCI_INTERPRETER_PAL_DEPTHTOSPACE_H
+
+#include <tensorflow/lite/kernels/internal/reference/depth_to_space.h>
+
+namespace luci_interpreter_pal
+{
+template <typename T>
+static inline void DepthToSpace(const tflite::DepthToSpaceParams &op_params,
+ const tflite::RuntimeShape &unextended_input_shape,
+ const T *input_data,
+ const tflite::RuntimeShape &unextended_output_shape, T *output_data)
+{
+ tflite::reference_ops::DepthToSpace(op_params, unextended_input_shape, input_data,
+ unextended_output_shape, output_data);
+}
+} // namespace luci_interpreter_pal
+
+#endif // LUCI_INTERPRETER_PAL_DEPTHTOSPACE_H
diff --git a/compiler/luci-interpreter/pal/mcu/PALDepthwiseConv2d.h b/compiler/luci-interpreter/pal/mcu/PALDepthwiseConv2d.h
new file mode 100644
index 000000000..c9d1a2948
--- /dev/null
+++ b/compiler/luci-interpreter/pal/mcu/PALDepthwiseConv2d.h
@@ -0,0 +1,91 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_PAL_DEPTHWISECONV2D_H
+#define LUCI_INTERPRETER_PAL_DEPTHWISECONV2D_H
+
+#include <tensorflow/lite/kernels/internal/reference/depthwiseconv_float.h>
+#include <tensorflow/lite/kernels/internal/reference/depthwiseconv_uint8.h>
+#include <tensorflow/lite/kernels/internal/reference/integer_ops/depthwise_conv.h>
+
+namespace luci_interpreter_pal
+{
+template <typename T>
+static inline void
+DepthwiseConvPerChannel(const tflite::DepthwiseParams &params, const int32_t *output_multiplier,
+ const int32_t *output_shift, const tflite::RuntimeShape &input_shape,
+ const T *input_data, const tflite::RuntimeShape &filter_shape,
+ const T *filter_data, const tflite::RuntimeShape &bias_shape,
+ const int32_t *bias_data, const tflite::RuntimeShape &output_shape,
+ T *output_data, const tflite::RuntimeShape &scratchpad_shape,
+ T *scratchpad_data)
+{
+ {
+ // MARK: At this moment this operation is not supported
+ assert(false && "DepthwiseConvPerChannel NYI");
+ (void)params;
+ (void)output_multiplier;
+ (void)output_shift;
+ (void)input_shape;
+ (void)output_data;
+ (void)input_data;
+ (void)filter_shape;
+ (void)filter_data;
+ (void)bias_shape;
+ (void)bias_data;
+ (void)output_shape;
+ (void)output_data;
+ (void)scratchpad_shape;
+ (void)scratchpad_data;
+ }
+}
+
+template <>
+inline void DepthwiseConvPerChannel<int8_t>(
+ const tflite::DepthwiseParams &params, const int32_t *output_multiplier,
+ const int32_t *output_shift, const tflite::RuntimeShape &input_shape, const int8_t *input_data,
+ const tflite::RuntimeShape &filter_shape, const int8_t *filter_data,
+ const tflite::RuntimeShape &bias_shape, const int32_t *bias_data,
+ const tflite::RuntimeShape &output_shape, int8_t *output_data,
+ const tflite::RuntimeShape &scratchpad_shape, int8_t *scratchpad_data)
+{
+ (void)scratchpad_shape;
+ (void)scratchpad_data;
+ tflite::reference_integer_ops::DepthwiseConvPerChannel(
+ params, output_multiplier, output_shift, input_shape, input_data, filter_shape, filter_data,
+ bias_shape, bias_data, output_shape, output_data);
+}
+
+static inline void SetupScratchpadTensor(luci_interpreter::Tensor *scratchpad,
+ const tflite::DepthwiseParams &params,
+ const luci_interpreter::DataType &input_data_type,
+ const tflite::RuntimeShape &input_shape,
+ const tflite::RuntimeShape &filter_shape,
+ const tflite::RuntimeShape &output_shape)
+
+{
+ (void)params;
+ (void)input_data_type;
+ (void)input_shape;
+ (void)filter_shape;
+ (void)output_shape;
+
+ scratchpad->set_allocatable(false);
+}
+
+} // namespace luci_interpreter_pal
+
+#endif // LUCI_INTERPRETER_PAL_DEPTHWISECONV2D_H
diff --git a/compiler/luci-interpreter/pal/mcu/PALDequantize.h b/compiler/luci-interpreter/pal/mcu/PALDequantize.h
new file mode 100644
index 000000000..efa6b167e
--- /dev/null
+++ b/compiler/luci-interpreter/pal/mcu/PALDequantize.h
@@ -0,0 +1,44 @@
+/*
+ * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_PAL_DEQUANTIZE_H
+#define LUCI_INTERPRETER_PAL_DEQUANTIZE_H
+
+#include "tensorflow/lite/kernels/internal/reference/integer_ops/dequantize.h"
+#include "PALreference_ops.h"
+
+namespace luci_interpreter_pal
+{
+
+template <typename T>
+static inline void Dequantize(tflite::DequantizationParams &params,
+ const tflite::RuntimeShape &input_shape, const T *input_data,
+ const tflite::RuntimeShape &output_shape, float *output_data)
+{
+ tflite::reference_integer_ops::Dequantize<T>(params, input_shape, input_data, output_shape,
+ output_data);
+}
+
+static inline void Dequantize(tflite::DequantizationParams &params,
+ const tflite::RuntimeShape &input_shape, const uint8_t *input_data,
+ const tflite::RuntimeShape &output_shape, float *output_data)
+{
+ tflite::reference_ops::Dequantize(params, input_shape, input_data, output_shape, output_data);
+}
+
+} // namespace luci_interpreter_pal
+
+#endif // LUCI_INTERPRETER_PAL_DEQUANTIZE_H
diff --git a/compiler/luci-interpreter/pal/mcu/PALElu.h b/compiler/luci-interpreter/pal/mcu/PALElu.h
new file mode 100644
index 000000000..4089d0a0c
--- /dev/null
+++ b/compiler/luci-interpreter/pal/mcu/PALElu.h
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_PAL_ELU_H
+#define LUCI_INTERPRETER_PAL_ELU_H
+
+#include <tensorflow/lite/kernels/internal/reference/elu.h>
+
+namespace luci_interpreter_pal
+{
+
+static inline void Elu(const tflite::RuntimeShape &input_shape, const float *input_data,
+ const tflite::RuntimeShape &output_shape, float *output_data)
+{
+ tflite::reference_ops::Elu(input_shape, input_data, output_shape, output_data);
+}
+
+} // namespace luci_interpreter_pal
+
+#endif // LUCI_INTERPRETER_PAL_ELU_H
diff --git a/compiler/luci-interpreter/pal/mcu/PALFullyConnected.h b/compiler/luci-interpreter/pal/mcu/PALFullyConnected.h
new file mode 100644
index 000000000..048624d74
--- /dev/null
+++ b/compiler/luci-interpreter/pal/mcu/PALFullyConnected.h
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_PAL_FULLYCONNECTED_H
+#define LUCI_INTERPRETER_PAL_FULLYCONNECTED_H
+
+#include <tensorflow/lite/kernels/internal/reference/fully_connected.h>
+#include <tensorflow/lite/kernels/internal/reference/integer_ops/fully_connected.h>
+
+namespace luci_interpreter_pal
+{
+template <typename T>
+static inline void FullyConnected(const tflite::FullyConnectedParams &params,
+ const tflite::RuntimeShape &input_shape, const T *input_data,
+ const tflite::RuntimeShape &filter_shape, const T *filter_data,
+ const tflite::RuntimeShape &bias_shape, const int32_t *bias_data,
+ const tflite::RuntimeShape &output_shape, T *output_data)
+{
+ {
+ // MARK: At this moment this operation is not supported
+ assert(false && "FullyConnected NYI");
+ (void)params;
+ (void)input_shape;
+ (void)input_data;
+ (void)filter_shape;
+ (void)filter_data;
+ (void)bias_shape;
+ (void)bias_data;
+ (void)output_shape;
+ (void)output_data;
+ }
+}
+
+template <>
+inline void
+FullyConnected<int8_t>(const tflite::FullyConnectedParams &params,
+ const tflite::RuntimeShape &input_shape, const int8_t *input_data,
+ const tflite::RuntimeShape &filter_shape, const int8_t *filter_data,
+ const tflite::RuntimeShape &bias_shape, const int32_t *bias_data,
+ const tflite::RuntimeShape &output_shape, int8_t *output_data)
+{
+ tflite::reference_integer_ops::FullyConnected(params, input_shape, input_data, filter_shape,
+ filter_data, bias_shape, bias_data, output_shape,
+ output_data);
+}
+} // namespace luci_interpreter_pal
+
+#endif // LUCI_INTERPRETER_PAL_FULLYCONNECTED_H
diff --git a/compiler/luci-interpreter/pal/mcu/PALL2Normalize.h b/compiler/luci-interpreter/pal/mcu/PALL2Normalize.h
new file mode 100644
index 000000000..f84742a44
--- /dev/null
+++ b/compiler/luci-interpreter/pal/mcu/PALL2Normalize.h
@@ -0,0 +1,34 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_PAL_L2NORMALIZE_H
+#define LUCI_INTERPRETER_PAL_L2NORMALIZE_H
+
+#include <tensorflow/lite/kernels/internal/reference/l2normalization.h>
+
+namespace luci_interpreter_pal
+{
+template <typename T>
+static inline void L2Normalization(const tflite::L2NormalizationParams &op_params,
+ const tflite::RuntimeShape &input_shape, const T *input_data,
+ const tflite::RuntimeShape &output_shape, T *output_data)
+{
+ tflite::reference_ops::L2Normalization(op_params, input_shape, input_data, output_shape,
+ output_data);
+}
+} // namespace luci_interpreter_pal
+
+#endif // LUCI_INTERPRETER_PAL_L2NORMALIZE_H
diff --git a/compiler/luci-interpreter/pal/mcu/PALL2Pool2D.h b/compiler/luci-interpreter/pal/mcu/PALL2Pool2D.h
new file mode 100644
index 000000000..38a302fc6
--- /dev/null
+++ b/compiler/luci-interpreter/pal/mcu/PALL2Pool2D.h
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_PAL_L2POOL2D_H
+#define LUCI_INTERPRETER_PAL_L2POOL2D_H
+
+#include <tensorflow/lite/kernels/internal/reference/pooling.h>
+
+namespace luci_interpreter_pal
+{
+template <typename T>
+static inline void L2Pool(const tflite::PoolParams &params, const tflite::RuntimeShape &input_shape,
+ const T *input_data, const tflite::RuntimeShape &output_shape,
+ T *output_data)
+{
+ tflite::reference_ops::L2Pool(params, input_shape, input_data, output_shape, output_data);
+}
+} // namespace luci_interpreter_pal
+
+#endif // LUCI_INTERPRETER_PAL_L2POOL2D_H
diff --git a/compiler/luci-interpreter/pal/mcu/PALLeakyRelu.h b/compiler/luci-interpreter/pal/mcu/PALLeakyRelu.h
new file mode 100644
index 000000000..9ccd2224f
--- /dev/null
+++ b/compiler/luci-interpreter/pal/mcu/PALLeakyRelu.h
@@ -0,0 +1,32 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_PAL_LEAKYRELU_H
+#define LUCI_INTERPRETER_PAL_LEAKYRELU_H
+
+#include <tensorflow/lite/kernels/internal/reference/leaky_relu.h>
+
+namespace luci_interpreter_pal
+{
+static inline void LeakyRelu(const tflite::LeakyReluParams &params,
+ const tflite::RuntimeShape &input_shape, const float *input_data,
+ const tflite::RuntimeShape &output_shape, float *output_data)
+{
+ tflite::reference_ops::LeakyRelu(params, input_shape, input_data, output_shape, output_data);
+}
+} // namespace luci_interpreter_pal
+
+#endif // LUCI_INTERPRETER_PAL_LEAKYRELU_H
diff --git a/compiler/luci-interpreter/pal/mcu/PALMul.h b/compiler/luci-interpreter/pal/mcu/PALMul.h
new file mode 100644
index 000000000..347a97a83
--- /dev/null
+++ b/compiler/luci-interpreter/pal/mcu/PALMul.h
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_PAL_MUL_H
+#define LUCI_INTERPRETER_PAL_MUL_H
+
+#include <tensorflow/lite/kernels/internal/reference/mul.h>
+
+namespace luci_interpreter_pal
+{
+template <typename T>
+static inline void Mul(tflite::ArithmeticParams &params, const tflite::RuntimeShape &input1_shape,
+ const T *input1_data, const tflite::RuntimeShape &input2_shape,
+ const T *input2_data, const tflite::RuntimeShape &output_shape,
+ T *output_data)
+{
+ tflite::reference_ops::BroadcastMul4DSlow(params, input1_shape, input1_data, input2_shape,
+ input2_data, output_shape, output_data);
+}
+
+template <typename T>
+static inline void
+BroadcastMul4DSlow(tflite::ArithmeticParams &params, const tflite::RuntimeShape &input1_shape,
+ const T *input1_data, const tflite::RuntimeShape &input2_shape,
+ const T *input2_data, const tflite::RuntimeShape &output_shape, T *output_data)
+{
+ tflite::reference_ops::BroadcastMul4DSlow(params, input1_shape, input1_data, input2_shape,
+ input2_data, output_shape, output_data);
+}
+} // namespace luci_interpreter_pal
+
+#endif // LUCI_INTERPRETER_PAL_MUL_H
diff --git a/compiler/luci-interpreter/pal/mcu/PALNeg.h b/compiler/luci-interpreter/pal/mcu/PALNeg.h
new file mode 100644
index 000000000..be5903a0c
--- /dev/null
+++ b/compiler/luci-interpreter/pal/mcu/PALNeg.h
@@ -0,0 +1,32 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_PAL_NEG_H
+#define LUCI_INTERPRETER_PAL_NEG_H
+
+#include <tensorflow/lite/kernels/internal/reference/neg.h>
+
+namespace luci_interpreter_pal
+{
+template <typename T>
+static inline void Negate(const tflite::RuntimeShape &input_shape, const T *input_data,
+ const tflite::RuntimeShape &output_shape, T *output_data)
+{
+ tflite::reference_ops::Negate(input_shape, input_data, output_shape, output_data);
+}
+} // namespace luci_interpreter_pal
+
+#endif // LUCI_INTERPRETER_PAL_NEG_H
diff --git a/compiler/luci-interpreter/pal/mcu/PALQuantize.h b/compiler/luci-interpreter/pal/mcu/PALQuantize.h
new file mode 100644
index 000000000..effb85d54
--- /dev/null
+++ b/compiler/luci-interpreter/pal/mcu/PALQuantize.h
@@ -0,0 +1,44 @@
+/*
+ * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_PAL_QUANTIZE_H
+#define LUCI_INTERPRETER_PAL_QUANTIZE_H
+
+#include "PALreference_ops.h"
+
+namespace luci_interpreter_pal
+{
+template <typename T>
+static inline void Quantize(tflite::QuantizationParams &params,
+ const tflite::RuntimeShape &input_shape, const float *input_data,
+ const tflite::RuntimeShape &output_shape, T *output_data)
+{
+ tflite::reference_ops::AffineQuantize(params, input_shape, input_data, output_shape, output_data);
+}
+
+template <typename Input, typename Output>
+static inline void Requantize(const Input *input_data, int32_t size,
+ int32_t effective_scale_multiplier, int32_t effective_scale_shift,
+ int32_t input_zero_point, int32_t output_zero_point,
+ Output *output_data)
+{
+ tflite::reference_ops::Requantize(input_data, size, effective_scale_multiplier,
+ effective_scale_shift, input_zero_point, output_zero_point,
+ output_data);
+}
+} // namespace luci_interpreter_pal
+
+#endif // LUCI_INTERPRETER_PAL_QUANTIZE_H
diff --git a/compiler/luci-interpreter/pal/mcu/PALResizeBilinear.h b/compiler/luci-interpreter/pal/mcu/PALResizeBilinear.h
new file mode 100644
index 000000000..cc9f0fd54
--- /dev/null
+++ b/compiler/luci-interpreter/pal/mcu/PALResizeBilinear.h
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_PAL_RESIZEBILINEAR_H
+#define LUCI_INTERPRETER_PAL_RESIZEBILINEAR_H
+
+#include <tensorflow/lite/kernels/internal/reference/resize_bilinear.h>
+
+namespace luci_interpreter_pal
+{
+template <typename T>
+static inline void
+ResizeBilinear(const tflite::ResizeBilinearParams &op_params,
+ const tflite::RuntimeShape &unextended_input_shape, const T *input_data,
+ const tflite::RuntimeShape &output_size_shape, const int32 *output_size_data,
+ const tflite::RuntimeShape &unextended_output_shape, T *output_data)
+{
+ tflite::reference_ops::ResizeBilinear(op_params, unextended_input_shape, input_data,
+ output_size_shape, output_size_data,
+ unextended_output_shape, output_data);
+}
+} // namespace luci_interpreter_pal
+
+#endif // LUCI_INTERPRETER_PAL_RESIZEBILINEAR_H
diff --git a/compiler/luci-interpreter/pal/mcu/PALResizeNearestNeighbor.h b/compiler/luci-interpreter/pal/mcu/PALResizeNearestNeighbor.h
new file mode 100644
index 000000000..f4d5a6ed3
--- /dev/null
+++ b/compiler/luci-interpreter/pal/mcu/PALResizeNearestNeighbor.h
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_PAL_RESIZENEARESTNEIGHBOR_H
+#define LUCI_INTERPRETER_PAL_RESIZENEARESTNEIGHBOR_H
+
+#include <tensorflow/lite/kernels/internal/reference/resize_nearest_neighbor.h>
+
+namespace luci_interpreter_pal
+{
+template <typename T>
+static inline void
+ResizeNearestNeighbor(const tflite::ResizeNearestNeighborParams &op_params,
+ const tflite::RuntimeShape &unextended_input_shape, const T *input_data,
+ const tflite::RuntimeShape &output_size_shape, const int32 *output_size_data,
+ const tflite::RuntimeShape &unextended_output_shape, T *output_data)
+{
+ tflite::reference_ops::ResizeNearestNeighbor(op_params, unextended_input_shape, input_data,
+ output_size_shape, output_size_data,
+ unextended_output_shape, output_data);
+}
+} // namespace luci_interpreter_pal
+
+#endif // LUCI_INTERPRETER_PAL_RESIZENEARESTNEIGHBOR_H
diff --git a/compiler/luci-interpreter/pal/mcu/PALSVDF.h b/compiler/luci-interpreter/pal/mcu/PALSVDF.h
new file mode 100644
index 000000000..3bba668fb
--- /dev/null
+++ b/compiler/luci-interpreter/pal/mcu/PALSVDF.h
@@ -0,0 +1,258 @@
+/*
+ * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_PAL_SVDF_H
+#define LUCI_INTERPRETER_PAL_SVDF_H
+
+#include <tensorflow/lite/kernels/internal/reference/svdf.h>
+
+namespace luci_interpreter_pal
+{
+static inline void
+IntegerSVDF(const TfLiteSVDFParams &params, const tflite::RuntimeShape &input_shape,
+ const int8_t *input_data, const tflite::RuntimeShape &weight_feature_shape,
+ const int8_t *weight_feature_data, const tflite::RuntimeShape &weight_time_shape,
+ const int16_t *weight_time_data, const tflite::RuntimeShape &bias_shape,
+ const int32_t *bias_data, int16_t *activation_state_data,
+ const tflite::RuntimeShape &output_shape, int8_t *output_data, int32_t *scratchpad_data,
+ int32_t *output_temp_data, int32_t scale_1_a, int scale_1_b, int32_t scale_2_a,
+ int scale_2_b, int32_t input_zp, int32_t output_zp)
+{
+ const int n_rank = params.rank;
+ const int n_batch = input_shape.Dims(0);
+ const int n_input = input_shape.Dims(1);
+ const int n_filter = weight_feature_shape.Dims(0);
+ const int n_unit = n_filter / n_rank;
+ const int n_memory = weight_time_shape.Dims(1);
+
+ // Left shift the activation_state.
+ {
+ int16_t *new_state_start = activation_state_data;
+ const int16_t *old_state_start = activation_state_data + 1;
+ const int16_t *old_state_end = activation_state_data + n_batch * n_filter * n_memory;
+ while (old_state_start != old_state_end)
+ {
+ *new_state_start++ = *old_state_start++;
+ }
+ }
+
+ // Note: no need to clear the latest activation, matmul is not accumulative.
+
+ // Feature matmul.
+ {
+ const int32_t output_max = std::numeric_limits<int16_t>::max();
+ const int32_t output_min = std::numeric_limits<int16_t>::min();
+ int16_t *result_in_batch = activation_state_data + (n_memory - 1);
+ for (int b = 0; b < n_batch; b++)
+ {
+ const int8_t *matrix_ptr = weight_feature_data;
+ for (int r = 0; r < n_filter; r++)
+ {
+ int32_t dot_prod = 0;
+ const int8_t *vector_in_batch = input_data + b * n_input;
+ for (int c = 0; c < n_input; c++)
+ {
+ dot_prod += *matrix_ptr++ * (*vector_in_batch++ - input_zp);
+ }
+ dot_prod = tflite::MultiplyByQuantizedMultiplier(dot_prod, scale_1_a, scale_1_b);
+ dot_prod = std::min(std::max(output_min, dot_prod), output_max);
+ // This assumes state is symmetrically quantized. Otherwise last bit of
+ // state should be initialized to its zero point and accumulate the
+ // dot_prod.
+ // Equivalent as the following:
+ // result_in_batch = zero point, which happens to be zero.
+ // result_in_batch += dot_prod_56.
+ *result_in_batch = dot_prod;
+ result_in_batch += n_memory;
+ }
+ }
+ }
+
+ // Time.
+ {
+ for (int b = 0; b < n_batch; ++b)
+ {
+ int32_t *scratch_ptr_batch = scratchpad_data + b * n_filter;
+
+ // Perform batched vector dot product:
+ const int16_t *vector1_ptr = weight_time_data;
+ const int16_t *vector2_ptr = activation_state_data + b * n_memory * n_filter;
+
+ for (int i = 0; i < n_filter; i++)
+ {
+ *scratch_ptr_batch = 0;
+ for (int j = 0; j < n_memory; j++)
+ {
+ *scratch_ptr_batch += *vector1_ptr++ * *vector2_ptr++;
+ }
+ scratch_ptr_batch++;
+ }
+ }
+ }
+
+ // Reduce, add bias, rescale, activation.
+ {
+ // Add bias.
+ if (bias_data)
+ {
+ // Vector batch assign:
+ for (int i = 0; i < n_batch; ++i)
+ {
+ int32_t *output_ptr = output_temp_data + i * n_unit;
+ const int32_t *bias_ptr = bias_data;
+ for (int j = 0; j < n_unit; ++j)
+ {
+ *output_ptr++ = *bias_ptr++;
+ }
+ }
+ }
+ else
+ {
+ int32_t *output_ptr = output_temp_data;
+ for (int i = 0; i < n_batch * n_unit; ++i)
+ {
+ *output_ptr++ = 0;
+ }
+ }
+
+ // Reduce.
+ for (int b = 0; b < n_batch; ++b)
+ {
+ int32_t *output_temp_ptr = output_temp_data + b * n_unit;
+ int32_t *scratch_ptr_batch = scratchpad_data + b * n_filter;
+
+ // Reduction sum vector
+ for (int i = 0; i < n_unit; ++i)
+ {
+ for (int j = 0; j < n_rank; ++j)
+ {
+ output_temp_ptr[i] += *scratch_ptr_batch++;
+ }
+ }
+ }
+
+ // Rescale.
+ const int32_t output_max = std::numeric_limits<int8_t>::max();
+ const int32_t output_min = std::numeric_limits<int8_t>::min();
+ for (int i = 0; i < n_batch * n_unit; ++i)
+ {
+ int32_t x1 = output_temp_data[i];
+ int32_t x2 = tflite::MultiplyByQuantizedMultiplier(x1, scale_2_a, scale_2_b);
+ int32_t x3 = x2 + output_zp;
+ int32_t x4 = std::min(std::max(output_min, x3), output_max);
+ output_data[i] = static_cast<int8_t>(x4);
+ }
+ }
+}
+static inline void
+FloatSVDF(const TfLiteSVDFParams &params, const tflite::RuntimeShape &input_shape,
+ const float *input_data, const tflite::RuntimeShape &weight_feature_shape,
+ const float *weight_feature_data, const tflite::RuntimeShape &weight_time_shape,
+ const float *weight_time_data, const tflite::RuntimeShape &bias_shape,
+ const float *bias_data, float *scratchpad_data, float *activation_state_data,
+ const tflite::RuntimeShape &output_shape, float *output_data)
+{
+ const int32_t rank = params.rank;
+ const int32_t batch_size = input_shape.Dims(0);
+ const int32_t input_size = input_shape.Dims(1);
+ const int32_t num_filters = weight_feature_shape.Dims(0);
+ const int32_t num_units = num_filters / rank;
+ const int32_t memory_size = weight_time_shape.Dims(1);
+
+ // Left shift the activation_state.
+ {
+ float *new_state_start = activation_state_data;
+ const float *old_state_start = activation_state_data + 1;
+ const float *old_state_end = activation_state_data + batch_size * num_filters * memory_size;
+ while (old_state_start != old_state_end)
+ {
+ *new_state_start++ = *old_state_start++;
+ }
+ }
+
+ // Note: no need to clear the latest activation, matmul is not accumulative.
+
+ // Compute conv1d(inputs, weights_feature).
+ // The activation_state's rightmost column is used to save current cycle
+ // activation. This is achieved by starting at state_ptr[memory_size - 1] and
+ // having the stride equal to memory_size.
+
+ // Perform batched matrix vector multiply operation:
+ {
+ const float *matrix = weight_feature_data;
+ const float *vector = input_data;
+ float *result = &activation_state_data[memory_size - 1];
+ float *result_in_batch = result;
+ for (int i = 0; i < batch_size; ++i)
+ {
+ const float *matrix_ptr = matrix;
+ for (int j = 0; j < num_filters; ++j)
+ {
+ float dot_prod = 0.0f;
+ const float *vector_in_batch = vector + i * input_size;
+ for (int k = 0; k < input_size; ++k)
+ {
+ dot_prod += *matrix_ptr++ * *vector_in_batch++;
+ }
+ *result_in_batch = dot_prod;
+ result_in_batch += memory_size;
+ }
+ }
+ }
+
+ tflite::reference_ops::ApplyTimeWeightsBiasAndActivation(
+ batch_size, memory_size, num_filters, num_units, rank, weight_time_data, bias_data,
+ params.activation, activation_state_data, scratchpad_data, output_data);
+}
+
+static inline void SetupScratchpadTensor(
+ const luci_interpreter::DataType &input_data_type,
+ const luci_interpreter::DataType &weight_feature_data_type,
+ luci_interpreter::Tensor *scratchpad_1, luci_interpreter::Tensor *scratchpad_2,
+ luci_interpreter::Tensor *scratchpad_3, luci_interpreter::Tensor *scratchpad_4,
+ luci_interpreter::Tensor *scratchpad_5, luci_interpreter::Tensor *scratchpad_6,
+ const luci_interpreter::Shape input_shape, const luci_interpreter::Shape weight_time_shape,
+ const int32_t batch_size, const int32_t num_filters, const int32_t num_units)
+{
+
+ if (input_data_type == loco::DataType::FLOAT32 &&
+ (weight_feature_data_type == loco::DataType::S8 ||
+ weight_feature_data_type == loco::DataType::U8))
+ {
+ (void)input_shape;
+ (void)weight_time_shape;
+ (void)scratchpad_3;
+ (void)scratchpad_4;
+ (void)scratchpad_5;
+ (void)scratchpad_6;
+
+ throw std::runtime_error("Hybrid type is not currently supported for mcu platform");
+ }
+
+ // Resize scratchpad_1 tensor
+ scratchpad_1->resize({batch_size, num_filters});
+
+ if (input_data_type == loco::DataType::S8)
+ {
+ // Resize scratchpad_2 for full_integer op
+ scratchpad_2->resize({batch_size, num_units});
+ }
+}
+
+} // namespace luci_interpreter_pal
+
+#endif // LUCI_INTERPRETER_PAL_SVDF_H
diff --git a/compiler/luci-interpreter/pal/mcu/PALSoftmax.h b/compiler/luci-interpreter/pal/mcu/PALSoftmax.h
new file mode 100644
index 000000000..9838b542d
--- /dev/null
+++ b/compiler/luci-interpreter/pal/mcu/PALSoftmax.h
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_PAL_SOFTMAX_H
+#define LUCI_INTERPRETER_PAL_SOFTMAX_H
+
+#include <tensorflow/lite/kernels/internal/reference/softmax.h>
+
+namespace luci_interpreter_pal
+{
+static inline void PopulateSoftmaxLookupTable(tflite::SoftmaxParams *data, float input_scale,
+ float beta)
+{
+ // Do nothing for mcu
+ (void)data;
+ (void)input_scale;
+ (void)beta;
+}
+
+static inline void InitializeParams(tflite::SoftmaxParams *params, float input_scale, float beta)
+{
+ int32 input_beta_multiplier;
+ int input_beta_left_shift;
+ static const int kScaledDiffIntegerBits = 5;
+ tflite::PreprocessSoftmaxScaling(beta, input_scale, kScaledDiffIntegerBits,
+ &input_beta_multiplier, &input_beta_left_shift);
+
+ params->input_multiplier = input_beta_multiplier;
+ params->input_left_shift = input_beta_left_shift;
+ params->diff_min =
+ -tflite::CalculateInputRadius(kScaledDiffIntegerBits, params->input_left_shift);
+}
+
+template <typename T>
+static inline void Softmax(const tflite::SoftmaxParams &params,
+ const tflite::RuntimeShape &input_shape, const T *input_data,
+ const tflite::RuntimeShape &output_shape, T *output_data)
+{
+ // MARK: At this moment this operation doesn't support on mcu
+ assert(false && "Softmax NYI");
+ (void)params;
+ (void)input_shape;
+ (void)input_data;
+ (void)output_shape;
+ (void)output_data;
+}
+} // namespace luci_interpreter_pal
+
+#endif // LUCI_INTERPRETER_PAL_SOFTMAX_H
diff --git a/compiler/luci-interpreter/pal/mcu/PALSpaceToBatchND.h b/compiler/luci-interpreter/pal/mcu/PALSpaceToBatchND.h
new file mode 100644
index 000000000..fdddaa929
--- /dev/null
+++ b/compiler/luci-interpreter/pal/mcu/PALSpaceToBatchND.h
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_PAL_SPACETOBATCHND_H
+#define LUCI_INTERPRETER_PAL_SPACETOBATCHND_H
+
+#include <tensorflow/lite/kernels/internal/reference/space_to_batch_nd.h>
+
+namespace luci_interpreter_pal
+{
+template <typename T>
+static inline void
+SpaceToBatchND(const tflite::SpaceToBatchParams &params,
+ const tflite::RuntimeShape &unextended_input1_shape, const T *input1_data,
+ const tflite::RuntimeShape &unextended_input2_shape, const int32 *block_shape_data,
+ const tflite::RuntimeShape &unextended_input3_shape, const int32 *paddings_data,
+ const tflite::RuntimeShape &unextended_output_shape, T *output_data)
+{
+ tflite::reference_ops::SpaceToBatchND(
+ params, unextended_input1_shape, input1_data, unextended_input2_shape, block_shape_data,
+ unextended_input3_shape, paddings_data, unextended_output_shape, output_data);
+}
+} // namespace luci_interpreter_pal
+
+#endif // LUCI_INTERPRETER_PAL_SPACETOBATCHND_H
diff --git a/compiler/luci-interpreter/pal/mcu/PALSpaceToDepth.h b/compiler/luci-interpreter/pal/mcu/PALSpaceToDepth.h
new file mode 100644
index 000000000..816b7f663
--- /dev/null
+++ b/compiler/luci-interpreter/pal/mcu/PALSpaceToDepth.h
@@ -0,0 +1,35 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_PAL_SPACETODEPTH_H
+#define LUCI_INTERPRETER_PAL_SPACETODEPTH_H
+
+#include <tensorflow/lite/kernels/internal/reference/space_to_depth.h>
+
+namespace luci_interpreter_pal
+{
+template <typename T>
+static inline void SpaceToDepth(const tflite::SpaceToDepthParams &op_params,
+ const tflite::RuntimeShape &unextended_input_shape,
+ const T *input_data,
+ const tflite::RuntimeShape &unextended_output_shape, T *output_data)
+{
+ tflite::reference_ops::SpaceToDepth(op_params, unextended_input_shape, input_data,
+ unextended_output_shape, output_data);
+}
+} // namespace luci_interpreter_pal
+
+#endif // LUCI_INTERPRETER_PAL_SPACETODEPTH_H
diff --git a/compiler/luci-interpreter/pal/mcu/PALSub.h b/compiler/luci-interpreter/pal/mcu/PALSub.h
new file mode 100644
index 000000000..ea57578c6
--- /dev/null
+++ b/compiler/luci-interpreter/pal/mcu/PALSub.h
@@ -0,0 +1,35 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_PAL_SUB_H
+#define LUCI_INTERPRETER_PAL_SUB_H
+
+#include <tensorflow/lite/kernels/internal/reference/sub.h>
+
+namespace luci_interpreter_pal
+{
+template <typename T>
+static inline void Sub(const tflite::ArithmeticParams &params,
+ const tflite::RuntimeShape &input1_shape, const T *input1_data,
+ const tflite::RuntimeShape &input2_shape, const T *input2_data,
+ const tflite::RuntimeShape &output_shape, T *output_data)
+{
+ tflite::reference_ops::Sub(params, input1_shape, input1_data, input2_shape, input2_data,
+ output_shape, output_data);
+}
+} // namespace luci_interpreter_pal
+
+#endif // LUCI_INTERPRETER_PAL_SUB_H
diff --git a/compiler/luci-interpreter/pal/mcu/PALreference_ops.h b/compiler/luci-interpreter/pal/mcu/PALreference_ops.h
new file mode 100644
index 000000000..62c720937
--- /dev/null
+++ b/compiler/luci-interpreter/pal/mcu/PALreference_ops.h
@@ -0,0 +1,1556 @@
+/*
+ * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_PAL_REFERENCE_OPS_H
+#define LUCI_INTERPRETER_PAL_REFERENCE_OPS_H
+
+#include <stdint.h>
+#include <sys/types.h>
+
+#include <algorithm>
+#include <cmath>
+#include <cstring>
+#include <functional>
+#include <limits>
+#include <memory>
+#include <type_traits>
+
+#include "third_party/eigen3/Eigen/Core"
+#include "fixedpoint/fixedpoint.h"
+#include "ruy/profiler/instrumentation.h" // from @ruy
+#include "tensorflow/lite/c/common.h"
+#include "tensorflow/lite/kernels/internal/common.h"
+#include "tensorflow/lite/kernels/internal/quantization_util.h"
+#include "tensorflow/lite/kernels/internal/reference/add.h"
+#include "tensorflow/lite/kernels/internal/reference/add_n.h"
+#include "tensorflow/lite/kernels/internal/reference/arg_min_max.h"
+#include "tensorflow/lite/kernels/internal/reference/batch_matmul.h"
+#include "tensorflow/lite/kernels/internal/reference/batch_to_space_nd.h"
+#include "tensorflow/lite/kernels/internal/reference/binary_function.h"
+#include "tensorflow/lite/kernels/internal/reference/cast.h"
+#include "tensorflow/lite/kernels/internal/reference/ceil.h"
+#include "tensorflow/lite/kernels/internal/reference/comparisons.h"
+#include "tensorflow/lite/kernels/internal/reference/concatenation.h"
+#include "tensorflow/lite/kernels/internal/reference/conv.h"
+#include "tensorflow/lite/kernels/internal/reference/depth_to_space.h"
+#include "tensorflow/lite/kernels/internal/reference/dequantize.h"
+#include "tensorflow/lite/kernels/internal/reference/div.h"
+#include "tensorflow/lite/kernels/internal/reference/elu.h"
+#include "tensorflow/lite/kernels/internal/reference/exp.h"
+#include "tensorflow/lite/kernels/internal/reference/fill.h"
+#include "tensorflow/lite/kernels/internal/reference/floor.h"
+#include "tensorflow/lite/kernels/internal/reference/floor_div.h"
+#include "tensorflow/lite/kernels/internal/reference/floor_mod.h"
+#include "tensorflow/lite/kernels/internal/reference/fully_connected.h"
+#include "tensorflow/lite/kernels/internal/reference/gather.h"
+#include "tensorflow/lite/kernels/internal/reference/hard_swish.h"
+#include "tensorflow/lite/kernels/internal/reference/l2normalization.h"
+#include "tensorflow/lite/kernels/internal/reference/leaky_relu.h"
+#include "tensorflow/lite/kernels/internal/reference/log_softmax.h"
+#include "tensorflow/lite/kernels/internal/reference/logistic.h"
+#include "tensorflow/lite/kernels/internal/reference/maximum_minimum.h"
+#include "tensorflow/lite/kernels/internal/reference/mul.h"
+#include "tensorflow/lite/kernels/internal/reference/neg.h"
+#include "tensorflow/lite/kernels/internal/reference/pad.h"
+#include "tensorflow/lite/kernels/internal/reference/pooling.h"
+#include "tensorflow/lite/kernels/internal/reference/prelu.h"
+#include "tensorflow/lite/kernels/internal/reference/process_broadcast_shapes.h"
+#include "tensorflow/lite/kernels/internal/reference/quantize.h"
+#include "tensorflow/lite/kernels/internal/reference/reduce.h"
+#include "tensorflow/lite/kernels/internal/reference/requantize.h"
+#include "tensorflow/lite/kernels/internal/reference/resize_bilinear.h"
+#include "tensorflow/lite/kernels/internal/reference/resize_nearest_neighbor.h"
+#include "tensorflow/lite/kernels/internal/reference/round.h"
+#include "tensorflow/lite/kernels/internal/reference/softmax.h"
+#include "tensorflow/lite/kernels/internal/reference/space_to_batch_nd.h"
+#include "tensorflow/lite/kernels/internal/reference/space_to_depth.h"
+#include "tensorflow/lite/kernels/internal/reference/strided_slice.h"
+#include "tensorflow/lite/kernels/internal/reference/string_comparisons.h"
+#include "tensorflow/lite/kernels/internal/reference/sub.h"
+#include "tensorflow/lite/kernels/internal/reference/tanh.h"
+#include "tensorflow/lite/kernels/internal/reference/transpose.h"
+#include "tensorflow/lite/kernels/internal/reference/transpose_conv.h"
+#include "tensorflow/lite/kernels/internal/strided_slice_logic.h"
+#include "tensorflow/lite/kernels/internal/tensor.h"
+#include "tensorflow/lite/kernels/internal/types.h"
+namespace tflite
+{
+
+namespace reference_ops
+{
+
+template <typename T>
+inline void Relu(const RuntimeShape &input_shape, const T *input_data,
+ const RuntimeShape &output_shape, T *output_data)
+{
+ const int flat_size = MatchingFlatSize(input_shape, output_shape);
+ for (int i = 0; i < flat_size; ++i)
+ {
+ const T val = input_data[i];
+ const T lower = 0;
+ const T clamped = val < lower ? lower : val;
+ output_data[i] = clamped;
+ }
+}
+
+template <typename T>
+inline void Relu1(const RuntimeShape &input_shape, const T *input_data,
+ const RuntimeShape &output_shape, T *output_data)
+{
+ ruy::profiler::ScopeLabel label("Relu1 (not fused)");
+ const int flat_size = MatchingFlatSize(input_shape, output_shape);
+ for (int i = 0; i < flat_size; ++i)
+ {
+ const T val = input_data[i];
+ const T upper = 1;
+ const T lower = -1;
+ const T clamped = val > upper ? upper : val < lower ? lower : val;
+ output_data[i] = clamped;
+ }
+}
+
+inline void Relu6(const RuntimeShape &input_shape, const float *input_data,
+ const RuntimeShape &output_shape, float *output_data)
+{
+ ruy::profiler::ScopeLabel label("Relu6 (not fused)");
+ const int flat_size = MatchingFlatSize(input_shape, output_shape);
+ for (int i = 0; i < flat_size; ++i)
+ {
+ const float val = input_data[i];
+ const float upper = 6;
+ const float lower = 0;
+ const float clamped = val > upper ? upper : val < lower ? lower : val;
+ output_data[i] = clamped;
+ }
+}
+
+template <typename T>
+inline void ReluX(const tflite::ReluParams &params, const RuntimeShape &input_shape,
+ const T *input_data, const RuntimeShape &output_shape, T *output_data)
+{
+ ruy::profiler::ScopeLabel label("Quantized ReluX (not fused)");
+ const int flat_size = MatchingFlatSize(input_shape, output_shape);
+ for (int i = 0; i < flat_size; ++i)
+ {
+ const int32 val = static_cast<int32_t>(input_data[i]);
+ int32 clamped = params.output_offset + MultiplyByQuantizedMultiplier(val - params.input_offset,
+ params.output_multiplier,
+ params.output_shift);
+ clamped = std::max(params.quantized_activation_min, clamped);
+ clamped = std::min(params.quantized_activation_max, clamped);
+ output_data[i] = static_cast<T>(clamped);
+ }
+}
+
+template <typename T>
+inline void ReluX(const tflite::ActivationParams &params, const RuntimeShape &input_shape,
+ const T *input_data, const RuntimeShape &output_shape, T *output_data)
+{
+ ruy::profiler::ScopeLabel label("Quantized ReluX (not fused)");
+ const int flat_size = MatchingFlatSize(input_shape, output_shape);
+ const T max_value = params.quantized_activation_max;
+ const T min_value = params.quantized_activation_min;
+ for (int i = 0; i < flat_size; ++i)
+ {
+ const T val = input_data[i];
+ const T clamped = val > max_value ? max_value : val < min_value ? min_value : val;
+ output_data[i] = clamped;
+ }
+}
+
+// TODO(jiawen): We can implement BroadcastMul on buffers of arbitrary
+// dimensionality if the runtime code does a single loop over one dimension
+// that handles broadcasting as the base case. The code generator would then
+// generate max(D1, D2) nested for loops.
+inline void BroadcastMulFivefold(const ArithmeticParams &unswitched_params,
+ const RuntimeShape &unswitched_input1_shape,
+ const uint8 *unswitched_input1_data,
+ const RuntimeShape &unswitched_input2_shape,
+ const uint8 *unswitched_input2_data,
+ const RuntimeShape &output_shape, uint8 *output_data)
+{
+ ArithmeticParams switched_params = unswitched_params;
+ switched_params.input1_offset = unswitched_params.input2_offset;
+ switched_params.input2_offset = unswitched_params.input1_offset;
+
+ const bool use_unswitched = unswitched_params.broadcast_category ==
+ tflite::BroadcastableOpCategory::kFirstInputBroadcastsFast;
+
+ const ArithmeticParams &params = use_unswitched ? unswitched_params : switched_params;
+ const uint8 *input1_data = use_unswitched ? unswitched_input1_data : unswitched_input2_data;
+ const uint8 *input2_data = use_unswitched ? unswitched_input2_data : unswitched_input1_data;
+
+ // Fivefold nested loops. The second input resets its position for each
+ // iteration of the second loop. The first input resets its position at the
+ // beginning of the fourth loop. The innermost loop is an elementwise Mul of
+ // sections of the arrays.
+ uint8 *output_data_ptr = output_data;
+ const uint8 *input1_data_ptr = input1_data;
+ const uint8 *input2_data_reset = input2_data;
+ int y0 = params.broadcast_shape[0];
+ int y1 = params.broadcast_shape[1];
+ int y2 = params.broadcast_shape[2];
+ int y3 = params.broadcast_shape[3];
+ int y4 = params.broadcast_shape[4];
+ for (int i0 = 0; i0 < y0; ++i0)
+ {
+ const uint8 *input2_data_ptr;
+ for (int i1 = 0; i1 < y1; ++i1)
+ {
+ input2_data_ptr = input2_data_reset;
+ for (int i2 = 0; i2 < y2; ++i2)
+ {
+ for (int i3 = 0; i3 < y3; ++i3)
+ {
+ MulElementwise(y4, params, input1_data_ptr, input2_data_ptr, output_data_ptr);
+ input2_data_ptr += y4;
+ output_data_ptr += y4;
+ }
+ input1_data_ptr += y4;
+ }
+ }
+ input2_data_reset = input2_data_ptr;
+ }
+}
+
+inline void Mul(const ArithmeticParams &params, const RuntimeShape &input1_shape,
+ const int16 *input1_data, const RuntimeShape &input2_shape,
+ const int16 *input2_data, const RuntimeShape &output_shape, int16 *output_data)
+{
+ ruy::profiler::ScopeLabel label("Mul/Int16");
+
+ const int flat_size = MatchingElementsSize(input1_shape, input2_shape, output_shape);
+
+ for (int i = 0; i < flat_size; i++)
+ {
+ // F0 uses 0 integer bits, range [-1, 1].
+ using F0 = gemmlowp::FixedPoint<std::int16_t, 0>;
+
+ F0 unclamped_result = F0::FromRaw(input1_data[i]) * F0::FromRaw(input2_data[i]);
+ output_data[i] = unclamped_result.raw();
+ }
+}
+
+inline void Mul(const ArithmeticParams &params, const RuntimeShape &input1_shape,
+ const int16 *input1_data, const RuntimeShape &input2_shape,
+ const int16 *input2_data, const RuntimeShape &output_shape, uint8 *output_data)
+{
+ ruy::profiler::ScopeLabel label("Mul/Int16Uint8");
+ int32 output_offset = params.output_offset;
+ int32 output_activation_min = params.quantized_activation_min;
+ int32 output_activation_max = params.quantized_activation_max;
+ TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
+
+ const int flat_size = MatchingElementsSize(input1_shape, input2_shape, output_shape);
+
+ for (int i = 0; i < flat_size; i++)
+ {
+ // F0 uses 0 integer bits, range [-1, 1].
+ using F0 = gemmlowp::FixedPoint<std::int16_t, 0>;
+
+ F0 unclamped_result = F0::FromRaw(input1_data[i]) * F0::FromRaw(input2_data[i]);
+ int16 rescaled_result = gemmlowp::RoundingDivideByPOT(unclamped_result.raw(), 8);
+ int16 clamped_result = std::min<int16>(output_activation_max - output_offset, rescaled_result);
+ clamped_result = std::max<int16>(output_activation_min - output_offset, clamped_result);
+ output_data[i] = output_offset + clamped_result;
+ }
+}
+
+inline void Sub16(const ArithmeticParams &params, const RuntimeShape &input1_shape,
+ const int16_t *input1_data, const RuntimeShape &input2_shape,
+ const int16_t *input2_data, const RuntimeShape &output_shape,
+ int16_t *output_data)
+{
+ ruy::profiler::ScopeLabel label("Sub/Int16");
+ const int input1_shift = params.input1_shift;
+ const int flat_size = MatchingElementsSize(input1_shape, input2_shape, output_shape);
+ const int16 output_activation_min = params.quantized_activation_min;
+ const int16 output_activation_max = params.quantized_activation_max;
+
+ TFLITE_DCHECK(input1_shift == 0 || params.input2_shift == 0);
+ TFLITE_DCHECK_LE(input1_shift, 0);
+ TFLITE_DCHECK_LE(params.input2_shift, 0);
+ const int16 *not_shift_input = input1_shift == 0 ? input1_data : input2_data;
+ const int16 *shift_input = input1_shift == 0 ? input2_data : input1_data;
+ const int input_right_shift = input1_shift == 0 ? -params.input2_shift : -input1_shift;
+
+ if (input1_shift == 0)
+ {
+ // F0 uses 0 integer bits, range [-1, 1].
+ using F0 = gemmlowp::FixedPoint<std::int16_t, 0>;
+ for (int i = 0; i < flat_size; ++i)
+ {
+ F0 input_ready_scaled = F0::FromRaw(not_shift_input[i]);
+ F0 scaled_input =
+ F0::FromRaw(gemmlowp::RoundingDivideByPOT(shift_input[i], input_right_shift));
+ F0 result = SaturatingSub(input_ready_scaled, scaled_input);
+ const int16 raw_output = result.raw();
+ const int16 clamped_output =
+ std::min(output_activation_max, std::max(output_activation_min, raw_output));
+ output_data[i] = clamped_output;
+ }
+ }
+ else
+ {
+ // F0 uses 0 integer bits, range [-1, 1].
+ using F0 = gemmlowp::FixedPoint<std::int16_t, 0>;
+ for (int i = 0; i < flat_size; ++i)
+ {
+ F0 input_ready_scaled = F0::FromRaw(not_shift_input[i]);
+ F0 scaled_input =
+ F0::FromRaw(gemmlowp::RoundingDivideByPOT(shift_input[i], input_right_shift));
+ F0 result = SaturatingSub(scaled_input, input_ready_scaled);
+ const int16 raw_output = result.raw();
+ const int16 clamped_output =
+ std::min(output_activation_max, std::max(output_activation_min, raw_output));
+ output_data[i] = clamped_output;
+ }
+ }
+}
+
+template <typename Scalar>
+void Pack(const PackParams &params, const RuntimeShape *const *input_shapes,
+ const Scalar *const *input_data, const RuntimeShape &output_shape, Scalar *output_data)
+{
+ ruy::profiler::ScopeLabel label("Pack");
+ const int dimensions = output_shape.DimensionsCount();
+ int axis = params.axis;
+ int inputs_count = params.inputs_count;
+
+ int outer_size = 1;
+ for (int i = 0; i < axis; i++)
+ {
+ outer_size *= output_shape.Dims(i);
+ }
+ int copy_size = 1;
+ for (int i = params.axis + 1; i < dimensions; i++)
+ {
+ copy_size *= output_shape.Dims(i);
+ }
+ TFLITE_DCHECK_EQ((**input_shapes).FlatSize(), copy_size * outer_size);
+
+ for (int i = 0; i < inputs_count; ++i)
+ {
+ for (int k = 0; k < outer_size; k++)
+ {
+ const Scalar *input_ptr = input_data[i] + copy_size * k;
+ int loc = k * inputs_count * copy_size + i * copy_size;
+ memcpy(output_data + loc, input_ptr, copy_size * sizeof(Scalar));
+ }
+ }
+}
+
+template <typename Scalar>
+void Unpack(const UnpackParams &params, const RuntimeShape &input_shape, const Scalar *input_data,
+ const RuntimeShape &output_shape, Scalar *const *output_datas)
+{
+ ruy::profiler::ScopeLabel label("Unpack");
+ const int dimensions = input_shape.DimensionsCount();
+ const int outputs_count = params.num_split;
+
+ int outer_size = 1;
+ int axis = params.axis;
+ if (axis < 0)
+ {
+ axis += dimensions;
+ }
+ TFLITE_DCHECK_GE(axis, 0);
+ TFLITE_DCHECK_LT(axis, dimensions);
+ for (int i = 0; i < axis; ++i)
+ {
+ outer_size *= input_shape.Dims(i);
+ }
+ int copy_size = 1;
+ for (int i = axis + 1; i < dimensions; ++i)
+ {
+ copy_size *= input_shape.Dims(i);
+ }
+ TFLITE_DCHECK_EQ(output_shape.FlatSize(), copy_size * outer_size);
+
+ for (int i = 0; i < outputs_count; ++i)
+ {
+ for (int k = 0; k < outer_size; k++)
+ {
+ Scalar *output_ptr = output_datas[i] + copy_size * k;
+ int loc = k * outputs_count * copy_size + i * copy_size;
+ memcpy(output_ptr, input_data + loc, copy_size * sizeof(Scalar));
+ }
+ }
+}
+
+template <typename Scalar>
+void PackWithScaling(const PackParams &params, const RuntimeShape *const *input_shapes,
+ const uint8 *const *input_data, const RuntimeShape &output_shape,
+ uint8 *output_data)
+{
+ ruy::profiler::ScopeLabel label("PackWithScaling");
+ const int dimensions = output_shape.DimensionsCount();
+ int axis = params.axis;
+ const int32 *input_zeropoint = params.input_zeropoint;
+ const float *input_scale = params.input_scale;
+ int inputs_count = params.inputs_count;
+ const int32 output_zeropoint = params.output_zeropoint;
+ const float output_scale = params.output_scale;
+
+ int outer_size = 1;
+ for (int i = 0; i < axis; i++)
+ {
+ outer_size *= output_shape.Dims(i);
+ }
+ int copy_size = 1;
+ for (int i = axis + 1; i < dimensions; i++)
+ {
+ copy_size *= output_shape.Dims(i);
+ }
+ TFLITE_DCHECK_EQ((**input_shapes).FlatSize(), copy_size * outer_size);
+
+ Scalar *output_ptr = output_data;
+ const float inverse_output_scale = 1.f / output_scale;
+ for (int k = 0; k < outer_size; k++)
+ {
+ for (int i = 0; i < inputs_count; ++i)
+ {
+ if (input_zeropoint[i] == output_zeropoint && input_scale[i] == output_scale)
+ {
+ memcpy(output_ptr, input_data[i] + k * copy_size, copy_size * sizeof(Scalar));
+ }
+ else
+ {
+ assert(false);
+ const float scale = input_scale[i] * inverse_output_scale;
+ const float bias = -input_zeropoint[i] * scale;
+ auto input_ptr = input_data[i];
+ for (int j = 0; j < copy_size; ++j)
+ {
+ const int value =
+ static_cast<int32_t>(std::round(input_ptr[j] * scale + bias)) + output_zeropoint;
+ output_ptr[j] = static_cast<uint8_t>(std::max(std::min(255, value), 0));
+ }
+ }
+ output_ptr += copy_size;
+ }
+ }
+}
+
+template <typename Scalar>
+void DepthConcatenation(const ConcatenationParams &params, const RuntimeShape *const *input_shapes,
+ const Scalar *const *input_data, const RuntimeShape &output_shape,
+ Scalar *output_data)
+{
+ ruy::profiler::ScopeLabel label("DepthConcatenation");
+ auto params_copy = params;
+ params_copy.axis = 3;
+ Concatenation(params_copy, input_shapes, input_data, output_shape, output_data);
+}
+
+inline void LstmCell(const LstmCellParams &params, const RuntimeShape &unextended_input_shape,
+ const float *input_data, const RuntimeShape &unextended_prev_activ_shape,
+ const float *prev_activ_data, const RuntimeShape &weights_shape,
+ const float *weights_data, const RuntimeShape &unextended_bias_shape,
+ const float *bias_data, const RuntimeShape &unextended_prev_state_shape,
+ const float *prev_state_data,
+ const RuntimeShape &unextended_output_state_shape, float *output_state_data,
+ const RuntimeShape &unextended_output_activ_shape, float *output_activ_data,
+ const RuntimeShape &unextended_concat_temp_shape, float *concat_temp_data,
+ const RuntimeShape &unextended_activ_temp_shape, float *activ_temp_data)
+{
+ TFLITE_DCHECK_LE(unextended_input_shape.DimensionsCount(), 4);
+ TFLITE_DCHECK_LE(unextended_prev_activ_shape.DimensionsCount(), 4);
+ TFLITE_DCHECK_LE(unextended_bias_shape.DimensionsCount(), 4);
+ TFLITE_DCHECK_LE(unextended_prev_state_shape.DimensionsCount(), 4);
+ TFLITE_DCHECK_LE(unextended_output_state_shape.DimensionsCount(), 4);
+ TFLITE_DCHECK_LE(unextended_output_activ_shape.DimensionsCount(), 4);
+ TFLITE_DCHECK_LE(unextended_concat_temp_shape.DimensionsCount(), 4);
+ TFLITE_DCHECK_LE(unextended_activ_temp_shape.DimensionsCount(), 4);
+ const RuntimeShape input_shape = RuntimeShape::ExtendedShape(4, unextended_input_shape);
+ const RuntimeShape prev_activ_shape = RuntimeShape::ExtendedShape(4, unextended_prev_activ_shape);
+ const RuntimeShape bias_shape = RuntimeShape::ExtendedShape(4, unextended_bias_shape);
+ const RuntimeShape prev_state_shape = RuntimeShape::ExtendedShape(4, unextended_prev_state_shape);
+ const RuntimeShape output_state_shape =
+ RuntimeShape::ExtendedShape(4, unextended_output_state_shape);
+ const RuntimeShape output_activ_shape =
+ RuntimeShape::ExtendedShape(4, unextended_output_activ_shape);
+ const RuntimeShape concat_temp_shape =
+ RuntimeShape::ExtendedShape(4, unextended_concat_temp_shape);
+ const RuntimeShape activ_temp_shape = RuntimeShape::ExtendedShape(4, unextended_activ_temp_shape);
+ TFLITE_DCHECK_GE(weights_shape.DimensionsCount(), 2);
+
+ const int weights_dim_count = weights_shape.DimensionsCount();
+ const int batches = MatchingDim(input_shape, 0, prev_activ_shape, 0, prev_state_shape, 0,
+ output_state_shape, 0, output_activ_shape, 0);
+ const int height = MatchingDim(input_shape, 1, prev_activ_shape, 1, prev_state_shape, 1,
+ output_state_shape, 1, output_activ_shape, 1);
+ const int width = MatchingDim(input_shape, 2, prev_activ_shape, 2, prev_state_shape, 2,
+ output_state_shape, 2, output_activ_shape, 2);
+ const int input_depth = input_shape.Dims(3);
+ const int prev_activ_depth = prev_activ_shape.Dims(3);
+ const int total_input_depth = prev_activ_depth + input_depth;
+ TFLITE_DCHECK_EQ(weights_shape.Dims(weights_dim_count - 1), total_input_depth);
+ TFLITE_DCHECK_EQ(FlatSizeSkipDim(bias_shape, 3), 1);
+ const int intern_activ_depth = MatchingDim(weights_shape, weights_dim_count - 2, bias_shape, 3);
+ TFLITE_DCHECK_EQ(weights_shape.FlatSize(), intern_activ_depth * total_input_depth);
+ TFLITE_DCHECK_EQ(intern_activ_depth % 4, 0);
+ const int output_depth = MatchingDim(prev_state_shape, 3, prev_activ_shape, 3, output_state_shape,
+ 3, output_activ_shape, 3);
+ TFLITE_DCHECK_EQ(output_depth, intern_activ_depth / 4);
+
+ // Concatenate prev_activ and input data together
+ std::vector<float const *> concat_input_arrays_data;
+ std::vector<RuntimeShape const *> concat_input_arrays_shapes;
+ concat_input_arrays_data.push_back(input_data);
+ concat_input_arrays_data.push_back(prev_activ_data);
+ concat_input_arrays_shapes.push_back(&input_shape);
+ concat_input_arrays_shapes.push_back(&prev_activ_shape);
+ tflite::ConcatenationParams concat_params;
+ concat_params.axis = 3;
+ concat_params.inputs_count = concat_input_arrays_data.size();
+ Concatenation(concat_params, &(concat_input_arrays_shapes[0]), &(concat_input_arrays_data[0]),
+ concat_temp_shape, concat_temp_data);
+
+ // Fully connected
+ tflite::FullyConnectedParams fc_params;
+ fc_params.float_activation_min = std::numeric_limits<float>::lowest();
+ fc_params.float_activation_max = std::numeric_limits<float>::max();
+ FullyConnected(fc_params, concat_temp_shape, concat_temp_data, weights_shape, weights_data,
+ bias_shape, bias_data, activ_temp_shape, activ_temp_data);
+
+ // Memory state update (the LSTM "guts")
+ for (int b = 0; b < batches; ++b)
+ {
+ for (int w = 0; w < width; ++w)
+ {
+ for (int h = 0; h < height; ++h)
+ {
+ for (int c = 0; c < output_depth; ++c)
+ {
+ const float input_gate =
+ 1.f /
+ (1.f +
+ std::exp(-activ_temp_data[Offset(activ_temp_shape, b, h, w, 0 * output_depth + c)]));
+ const float new_input =
+ std::tanh(activ_temp_data[Offset(activ_temp_shape, b, h, w, 1 * output_depth + c)]);
+ const float forget_gate =
+ 1.f /
+ (1.f +
+ std::exp(-activ_temp_data[Offset(activ_temp_shape, b, h, w, 2 * output_depth + c)]));
+ const float output_gate =
+ 1.f /
+ (1.f +
+ std::exp(-activ_temp_data[Offset(activ_temp_shape, b, h, w, 3 * output_depth + c)]));
+ const float new_state =
+ input_gate * new_input +
+ forget_gate * prev_state_data[Offset(prev_state_shape, b, h, w, c)];
+ output_state_data[Offset(output_state_shape, b, h, w, c)] = new_state;
+ output_activ_data[Offset(output_activ_shape, b, h, w, c)] =
+ output_gate * std::tanh(new_state);
+ }
+ }
+ }
+ }
+}
+
+// Quantized LSTM cell implementation.
+// The quantization of the input, output arrays is as follows:
+// - The input activations are quantized as uint8 on the interval
+// [-1, 127/128].
+// The rationale for that is that is the natural interval for output
+// activations (see next point) and these need to be concatenated together.
+// We could accommodate different ranges by re-scaling, but we empirically
+// found that setting the input activations range to be [-1, 127/128] in the
+// first place, removing the need for re-scaling, greatly improves accuracy.
+// - The output activations are quantized as uint8 on the interval
+// [-1, 127/128].
+// The rationale for that is that the definition of a LSTM cell makes them
+// intrinsically constrained in [-1, 1]; tweaking that to [-1, 127/128]
+// makes for simpler, more accurate fixed-point arithmetic.
+// - The output-at-previous-timestep state array is obviously quantized as
+// the output activations.
+// - The internal LSTM memory (not the output-at-previous-timestep, the other
+// internal state array) is int16-quantized and may use any power-of-two,
+// symmetric range i.e. [-2^N, 2^N * 32767/32768] for any N, which we call
+// StateIntegerBits below, see the below discussion of that template
+// parameter ("The StateIntegerBits template parameter").
+// - The output of the internal fully-connected node is int16-quantized
+// on the interval [-8, 8 * 32767/32768], the rationale for which is
+// explained just below ("Why [-8, 8] for fully-connected output?").
+//
+//
+// === The StateIntegerBits template parameter ===
+//
+// The StateIntegerBits template parameter controls the fixed-point format used
+// to represent the internal memory of the LSTM cell (not the
+// output-at-previous-timestep, the other internal state array). It's currently
+// a template parameter so that the model can control that. The most typical
+// value for StateIntegerBits is 4. Other plausible values are anywhere between
+// 3 and 5. We might eventually standardize on a single supported value, e.g. 4,
+// and drop that template parameter. The reason why it can't be a runtime
+// parameter is that this controls the fixed-point format used, i.e. we need to
+// generate actually different code based on it. In particular, we generate code
+// for a fixed-point tanh() implementation for that format, which internally
+// uses a fixed-point exp() implementation, which internally uses a
+// barrel-shifter with a number of steps that depends on StateIntegerBits.
+// Another consequence of that is that a higher value of StateIntegerBits
+// results in a more expensive implementation (more barrel shifter steps
+// needed).
+//
+//
+// === Why [-8, 8] for fully-connected output? ===
+//
+// This array is only fed to Logistic and Tanh functions, for which
+// the quantized implementation will want to use fixed-point arithmetic,
+// requiring a power-of-two representation interval. Thus, we should right
+// away quantize this array to a power-of-two interval; otherwise,
+// implementation will need to rescale that, losing any benefit that a tighter
+// representation interval might otherwise yield, while introducing some
+// numerical error and computational overhead.
+//
+// Now, Logistic and Tanh
+// are nearly constant (nearly equal to their horizontal asymptotes)
+// outside of a small bounded interval around 0:
+//
+// Logistic(4) = 1 - 1.8e-2 Tanh(4) = 1 - 6.7e-4
+// Logistic(8) = 1 - 3.4e-4 Tanh(8) = 1 - 2.3e-7
+// Logistic(16) = 1 - 1.1e-7 Tanh(16) = 1 - 2.5e-14
+//
+// From this, we see that clamping to [-4, 4] would be too inaccurate
+// (the error of 1.8e-2 on Logistic would be felt even in 8bit precision)
+// while clamping to [-16, 16] would make no difference even in float32.
+// However, for a fixed-point implementation in 16-bit integers, using 5
+// integer bits to represent the [-16, 16] range would leave only 11
+// fractional bits, giving an increment of 2^-11 = 4.9e-4 between consecutive
+// representable values. Notice that is higher than the
+// worst-case clamping error with clamping to [-8, 8]: 3.4e-4 for Logistic.
+// Using [-8, 8] thus seems like the better compromise overall, enjoying
+// an increment of 2.4e-4 between representable values and a worst-case
+// clamping error of 3.4e-4, both better than the increment of 4.9e-4 with
+// [-16, 16].
+//
+// Moreover, all other things being equal, it is nice to choose the narrower
+// representation range, as that makes the implementation of fixed-point
+// math functions a little cheaper (each integer bit requires an additional
+// barrel-shifter atep in the implementation of exp(-x)). That is further
+// reason to prefer [-8, 8] over [-16, 16]. The choice of [-16, 16] would make
+// sense for 32-bit float or 32-bit fixed-point quantization, but we are
+// aiming for 16-bit fixed-point quantization of these internal nodes here.
+//
+template <int StateIntegerBits>
+inline void
+LstmCell(const LstmCellParams &params, const RuntimeShape &unextended_input_shape,
+ const uint8 *input_data_uint8, const RuntimeShape &unextended_prev_activ_shape,
+ const uint8 *prev_activ_data_uint8, const RuntimeShape &weights_shape,
+ const uint8 *weights_data_uint8, const RuntimeShape &unextended_bias_shape,
+ const int32 *bias_data_int32, const RuntimeShape &unextended_prev_state_shape,
+ const int16 *prev_state_data_int16, const RuntimeShape &unextended_output_state_shape,
+ int16 *output_state_data_int16, const RuntimeShape &unextended_output_activ_shape,
+ uint8 *output_activ_data_uint8, const RuntimeShape &unextended_concat_temp_shape,
+ uint8 *concat_temp_data_uint8, const RuntimeShape &unextended_activ_temp_shape,
+ int16 *activ_temp_data_int16, void *gemmlowp_context)
+{
+ (void)gemmlowp_context; // only used in optimized code.
+ int32 weights_zero_point = params.weights_zero_point;
+ int32 accum_multiplier = params.accum_multiplier;
+ int accum_shift = params.accum_shift;
+ TFLITE_DCHECK_LE(unextended_input_shape.DimensionsCount(), 4);
+ TFLITE_DCHECK_LE(unextended_prev_activ_shape.DimensionsCount(), 4);
+ TFLITE_DCHECK_LE(unextended_bias_shape.DimensionsCount(), 4);
+ TFLITE_DCHECK_LE(unextended_prev_state_shape.DimensionsCount(), 4);
+ TFLITE_DCHECK_LE(unextended_output_state_shape.DimensionsCount(), 4);
+ TFLITE_DCHECK_LE(unextended_output_activ_shape.DimensionsCount(), 4);
+ TFLITE_DCHECK_LE(unextended_concat_temp_shape.DimensionsCount(), 4);
+ TFLITE_DCHECK_LE(unextended_activ_temp_shape.DimensionsCount(), 4);
+ const RuntimeShape input_shape = RuntimeShape::ExtendedShape(4, unextended_input_shape);
+ const RuntimeShape prev_activ_shape = RuntimeShape::ExtendedShape(4, unextended_prev_activ_shape);
+ const RuntimeShape bias_shape = RuntimeShape::ExtendedShape(4, unextended_bias_shape);
+ const RuntimeShape prev_state_shape = RuntimeShape::ExtendedShape(4, unextended_prev_state_shape);
+ const RuntimeShape output_state_shape =
+ RuntimeShape::ExtendedShape(4, unextended_output_state_shape);
+ const RuntimeShape output_activ_shape =
+ RuntimeShape::ExtendedShape(4, unextended_output_activ_shape);
+ const RuntimeShape concat_temp_shape =
+ RuntimeShape::ExtendedShape(4, unextended_concat_temp_shape);
+ const RuntimeShape activ_temp_shape = RuntimeShape::ExtendedShape(4, unextended_activ_temp_shape);
+ TFLITE_DCHECK_GE(weights_shape.DimensionsCount(), 2);
+
+ // Gather dimensions information, and perform consistency checks.
+ const int weights_dim_count = weights_shape.DimensionsCount();
+ const int outer_size = MatchingFlatSizeSkipDim(input_shape, 3, prev_activ_shape, prev_state_shape,
+ output_state_shape, output_activ_shape);
+ const int input_depth = input_shape.Dims(3);
+ const int prev_activ_depth = prev_activ_shape.Dims(3);
+ const int total_input_depth = prev_activ_depth + input_depth;
+ TFLITE_DCHECK_EQ(weights_shape.Dims(weights_dim_count - 1), total_input_depth);
+ const int intern_activ_depth = MatchingDim(weights_shape, weights_dim_count - 2, bias_shape, 3);
+ TFLITE_DCHECK_EQ(weights_shape.FlatSize(), intern_activ_depth * total_input_depth);
+ TFLITE_DCHECK_EQ(FlatSizeSkipDim(bias_shape, 3), 1);
+ TFLITE_DCHECK_EQ(intern_activ_depth % 4, 0);
+ const int output_depth = MatchingDim(prev_state_shape, 3, prev_activ_shape, 3, output_state_shape,
+ 3, output_activ_shape, 3);
+ TFLITE_DCHECK_EQ(output_depth, intern_activ_depth / 4);
+ const int fc_batches = FlatSizeSkipDim(activ_temp_shape, 3);
+ const int fc_output_depth =
+ MatchingDim(weights_shape, weights_dim_count - 2, activ_temp_shape, 3);
+ const int fc_accum_depth = total_input_depth;
+ TFLITE_DCHECK_EQ(fc_output_depth, 4 * output_depth);
+
+ // Depth-concatenate prev_activ and input data together.
+ uint8 const *concat_input_arrays_data[2] = {input_data_uint8, prev_activ_data_uint8};
+ const RuntimeShape *concat_input_arrays_shapes[2] = {&input_shape, &prev_activ_shape};
+ tflite::ConcatenationParams concat_params;
+ concat_params.axis = 3;
+ concat_params.inputs_count = 2;
+ Concatenation(concat_params, concat_input_arrays_shapes, concat_input_arrays_data,
+ concat_temp_shape, concat_temp_data_uint8);
+
+ // Implementation of the fully connected node inside the LSTM cell.
+ // The operands are 8-bit integers, the accumulators are internally 32bit
+ // integers, and the output is 16-bit fixed-point with 3 integer bits so
+ // the output range is [-2^3, 2^3] == [-8, 8]. The rationale for that
+ // is explained in the function comment above.
+ for (int b = 0; b < fc_batches; ++b)
+ {
+ for (int out_c = 0; out_c < fc_output_depth; ++out_c)
+ {
+ // Internal accumulation.
+ // Initialize accumulator with the bias-value.
+ int32 accum = bias_data_int32[out_c];
+ // Accumulation loop.
+ for (int d = 0; d < fc_accum_depth; ++d)
+ {
+ int16 input_val = concat_temp_data_uint8[b * fc_accum_depth + d] - 128;
+ int16 weights_val = weights_data_uint8[out_c * fc_accum_depth + d] - weights_zero_point;
+ accum += input_val * weights_val;
+ }
+ // Down-scale the final int32 accumulator to the scale used by our
+ // (16-bit, using 3 integer bits) fixed-point format. The quantized
+ // multiplier and shift here have been pre-computed offline
+ // (e.g. by toco).
+ accum = MultiplyByQuantizedMultiplier(accum, accum_multiplier, accum_shift);
+ // Saturate, cast to int16, and store to the temporary activations array.
+ accum = std::max(-32768, std::min(32767, static_cast<int>(accum)));
+ activ_temp_data_int16[out_c + fc_output_depth * b] = accum;
+ }
+ }
+
+ // Rest of the LSTM cell: tanh and logistic math functions, and some adds
+ // and muls, all done in 16-bit fixed-point.
+ for (int b = 0; b < outer_size; ++b)
+ {
+ for (int c = 0; c < output_depth; ++c)
+ {
+ // Define the fixed-point data types that we will use here. All use
+ // int16 as the underlying integer type i.e. all are 16-bit fixed-point.
+ // They only differ by the number of integral vs. fractional bits,
+ // determining the range of values that they can represent.
+ //
+ // F0 uses 0 integer bits, range [-1, 1].
+ // This is the return type of math functions such as tanh, logistic,
+ // whose range is in [-1, 1].
+ using F0 = gemmlowp::FixedPoint<std::int16_t, 0>;
+ // F3 uses 3 integer bits, range [-8, 8].
+ // This is the range of the previous fully-connected node's output,
+ // which is our input here.
+ using F3 = gemmlowp::FixedPoint<std::int16_t, 3>;
+ // FS uses StateIntegerBits integer bits, range [-2^StateIntegerBits,
+ // 2^StateIntegerBits]. It's used to represent the internal state, whose
+ // number of integer bits is currently dictated by the model. See comment
+ // on the StateIntegerBits template parameter above.
+ using FS = gemmlowp::FixedPoint<std::int16_t, StateIntegerBits>;
+ // Implementation of input gate, using fixed-point logistic function.
+ F3 input_gate_input =
+ F3::FromRaw(activ_temp_data_int16[b * fc_output_depth + 0 * output_depth + c]);
+ F0 input_gate_output = gemmlowp::logistic(input_gate_input);
+ // Implementation of input modulation gate, using fixed-point tanh
+ // function.
+ F3 input_modulation_gate_input =
+ F3::FromRaw(activ_temp_data_int16[b * fc_output_depth + 1 * output_depth + c]);
+ F0 input_modulation_gate_output = gemmlowp::tanh(input_modulation_gate_input);
+ // Implementation of forget gate, using fixed-point logistic function.
+ F3 forget_gate_input =
+ F3::FromRaw(activ_temp_data_int16[b * fc_output_depth + 2 * output_depth + c]);
+ F0 forget_gate_output = gemmlowp::logistic(forget_gate_input);
+ // Implementation of output gate, using fixed-point logistic function.
+ F3 output_gate_input =
+ F3::FromRaw(activ_temp_data_int16[b * fc_output_depth + 3 * output_depth + c]);
+ F0 output_gate_output = gemmlowp::logistic(output_gate_input);
+ // Implementation of internal multiplication nodes, still in fixed-point.
+ F0 input_times_input_modulation = input_gate_output * input_modulation_gate_output;
+ FS prev_state = FS::FromRaw(prev_state_data_int16[b * output_depth + c]);
+ FS prev_state_times_forget_state = forget_gate_output * prev_state;
+ // Implementation of internal addition node, saturating.
+ FS new_state =
+ gemmlowp::SaturatingAdd(gemmlowp::Rescale<StateIntegerBits>(input_times_input_modulation),
+ prev_state_times_forget_state);
+ // Implementation of last internal Tanh node, still in fixed-point.
+ // Since a Tanh fixed-point implementation is specialized for a given
+ // number or integer bits, and each specialization can have a substantial
+ // code size, and we already used above a Tanh on an input with 3 integer
+ // bits, and per the table in the above function comment there is no
+ // significant accuracy to be lost by clamping to [-8, +8] for a
+ // 3-integer-bits representation, let us just do that. This helps people
+ // porting this to targets where code footprint must be minimized.
+ F3 new_state_f3 = gemmlowp::Rescale<3>(new_state);
+ F0 output_activ_int16 = output_gate_output * gemmlowp::tanh(new_state_f3);
+ // Store the new internal state back to memory, as 16-bit integers.
+ // Note: here we store the original value with StateIntegerBits, not
+ // the rescaled 3-integer-bits value fed to tanh.
+ output_state_data_int16[b * output_depth + c] = new_state.raw();
+ // Down-scale the output activations to 8-bit integers, saturating,
+ // and store back to memory.
+ int16 rescaled_output_activ = gemmlowp::RoundingDivideByPOT(output_activ_int16.raw(), 8);
+ int16 clamped_output_activ =
+ std::max<int16>(-128, std::min<int16>(127, rescaled_output_activ));
+ output_activ_data_uint8[b * output_depth + c] = 128 + clamped_output_activ;
+ }
+ }
+}
+
+template <typename Scalar>
+void Split(const SplitParams &params, const RuntimeShape &input_shape, const Scalar *input_data,
+ const RuntimeShape *const *output_shapes, Scalar *const *output_data)
+{
+ ruy::profiler::ScopeLabel label("Split");
+ const int split_dimensions = input_shape.DimensionsCount();
+ int axis = params.axis < 0 ? params.axis + split_dimensions : params.axis;
+ int outputs_count = params.num_split;
+ TFLITE_DCHECK_LT(axis, split_dimensions);
+
+ int64_t split_size = 0;
+ for (int i = 0; i < outputs_count; i++)
+ {
+ TFLITE_DCHECK_EQ(output_shapes[i]->DimensionsCount(), split_dimensions);
+ for (int j = 0; j < split_dimensions; j++)
+ {
+ if (j != axis)
+ {
+ MatchingDim(*output_shapes[i], j, input_shape, j);
+ }
+ }
+ split_size += output_shapes[i]->Dims(axis);
+ }
+ TFLITE_DCHECK_EQ(split_size, input_shape.Dims(axis));
+ int64_t outer_size = 1;
+ for (int i = 0; i < axis; ++i)
+ {
+ outer_size *= input_shape.Dims(i);
+ }
+ // For all output arrays,
+ // FlatSize() = outer_size * Dims(axis) * base_inner_size;
+ int64_t base_inner_size = 1;
+ for (int i = axis + 1; i < split_dimensions; ++i)
+ {
+ base_inner_size *= input_shape.Dims(i);
+ }
+
+ const Scalar *input_ptr = input_data;
+ for (int k = 0; k < outer_size; k++)
+ {
+ for (int i = 0; i < outputs_count; ++i)
+ {
+ const int copy_size = output_shapes[i]->Dims(axis) * base_inner_size;
+ memcpy(output_data[i] + k * copy_size, input_ptr, copy_size * sizeof(Scalar));
+ input_ptr += copy_size;
+ }
+ }
+}
+
+inline int NodeOffset(int b, int h, int w, int height, int width)
+{
+ return (b * height + h) * width + w;
+}
+
+inline void LocalResponseNormalization(const tflite::LocalResponseNormalizationParams &op_params,
+ const RuntimeShape &input_shape, const float *input_data,
+ const RuntimeShape &output_shape, float *output_data)
+{
+ const int trailing_dim = input_shape.DimensionsCount() - 1;
+ const int outer_size = MatchingFlatSizeSkipDim(input_shape, trailing_dim, output_shape);
+ const int depth = MatchingDim(input_shape, trailing_dim, output_shape, trailing_dim);
+
+ for (int i = 0; i < outer_size; ++i)
+ {
+ for (int c = 0; c < depth; ++c)
+ {
+ const int begin_input_c = std::max(0, static_cast<int>(c - op_params.range));
+ const int end_input_c = std::min(depth, static_cast<int>(c + op_params.range));
+ float accum = 0.f;
+ for (int input_c = begin_input_c; input_c < end_input_c; ++input_c)
+ {
+ const float input_val = input_data[i * depth + input_c];
+ accum += input_val * input_val;
+ }
+ const float multiplier = std::pow(op_params.bias + op_params.alpha * accum, -op_params.beta);
+ output_data[i * depth + c] = input_data[i * depth + c] * multiplier;
+ }
+ }
+}
+
+inline void Dequantize(const RuntimeShape &input_shape, const Eigen::half *input_data,
+ const RuntimeShape &output_shape, float *output_data)
+{
+ const int flat_size = MatchingFlatSize(input_shape, output_shape);
+ for (int i = 0; i < flat_size; i++)
+ {
+ output_data[i] = static_cast<float>(input_data[i]);
+ }
+}
+
+inline void FakeQuant(const tflite::FakeQuantParams &op_params, const RuntimeShape &input_shape,
+ const float *input_data, const RuntimeShape &output_shape, float *output_data)
+{
+ ruy::profiler::ScopeLabel label("FakeQuant");
+ float rmin = op_params.minmax.min;
+ float rmax = op_params.minmax.max;
+ int num_bits = op_params.num_bits;
+ // 0 should always be a representable value. Let's assume that the initial
+ // min,max range contains 0.
+ TFLITE_DCHECK_LE(rmin, 0.0f);
+ TFLITE_DCHECK_GE(rmax, 0.0f);
+ TFLITE_DCHECK_LT(rmin, rmax);
+
+ // Code matches tensorflow's FakeQuantWithMinMaxArgsFunctor.
+ int quant_min = 0;
+ int quant_max = (1 << num_bits) - 1;
+ float nudged_min, nudged_max, nudged_scale;
+ NudgeQuantizationRange(rmin, rmax, quant_min, quant_max, &nudged_min, &nudged_max, &nudged_scale);
+ const int flat_size = MatchingFlatSize(input_shape, output_shape);
+ FakeQuantizeArray(nudged_scale, nudged_min, nudged_max, input_data, output_data, flat_size);
+}
+
+// Common subroutine for both `GatherNd` and `GatherNdString`.
+struct GatherNdHelperResult
+{
+ int n_slices;
+ int slice_size;
+ int indices_nd;
+ std::vector<int> dims_to_count;
+};
+
+// Returns common values being used on both `GatherNd` and `GatherNdString`.
+inline GatherNdHelperResult GatherNdHelper(const RuntimeShape &params_shape,
+ const RuntimeShape &indices_shape)
+{
+ GatherNdHelperResult ret;
+ ret.n_slices = 1;
+ ret.slice_size = 1;
+ const int indices_dims = indices_shape.DimensionsCount();
+ ret.indices_nd = indices_shape.Dims(indices_dims - 1);
+ const int params_dims = params_shape.DimensionsCount();
+ for (int i = 0; i < indices_dims - 1; ++i)
+ {
+ ret.n_slices *= indices_shape.Dims(i);
+ }
+ for (int i = ret.indices_nd; i < params_dims; ++i)
+ {
+ ret.slice_size *= params_shape.Dims(i);
+ }
+
+ int remain_flat_size = params_shape.FlatSize();
+ ret.dims_to_count = std::vector<int>(ret.indices_nd, 0);
+ for (int i = 0; i < ret.indices_nd; ++i)
+ {
+ ret.dims_to_count[i] = remain_flat_size / params_shape.Dims(i);
+ remain_flat_size = ret.dims_to_count[i];
+ }
+
+ return ret;
+}
+
+template <typename ParamsT, typename IndicesT = int32>
+inline void GatherNd(const RuntimeShape &params_shape, const ParamsT *params_data,
+ const RuntimeShape &indices_shape, const IndicesT *indices_data,
+ const RuntimeShape &output_shape, ParamsT *output_data)
+{
+ ruy::profiler::ScopeLabel label("GatherNd");
+
+ const GatherNdHelperResult res = GatherNdHelper(params_shape, indices_shape);
+ for (int i = 0; i < res.n_slices; ++i)
+ {
+ int from_pos = 0;
+ for (int j = 0; j < res.indices_nd; ++j)
+ {
+ from_pos += indices_data[i * res.indices_nd + j] * res.dims_to_count[j];
+ }
+ std::memcpy(output_data + i * res.slice_size, params_data + from_pos,
+ sizeof(ParamsT) * res.slice_size);
+ }
+}
+
+#ifndef TF_LITE_STATIC_MEMORY
+template <typename IndicesT = int32>
+inline void GatherNdString(const RuntimeShape &params_shape, const TfLiteTensor *params_data,
+ const RuntimeShape &indices_shape, const IndicesT *indices_data,
+ const RuntimeShape &output_shape, TfLiteTensor *output_data)
+{
+ ruy::profiler::ScopeLabel label("GatherNdString");
+
+ const GatherNdHelperResult res = GatherNdHelper(params_shape, indices_shape);
+ DynamicBuffer buffer;
+ for (int i = 0; i < res.n_slices; ++i)
+ {
+ int from_pos = 0;
+ for (int j = 0; j < res.indices_nd; ++j)
+ {
+ from_pos += indices_data[i * res.indices_nd + j] * res.dims_to_count[j];
+ }
+ for (int j = 0; j < res.slice_size; ++j)
+ {
+ buffer.AddString(GetString(params_data, from_pos + j));
+ }
+ }
+ buffer.WriteToTensor(output_data, /*new_shape=*/nullptr);
+}
+#endif
+
+template <typename IndicesT, typename UpdatesT>
+inline void ScatterNd(const RuntimeShape &indices_shape, const IndicesT *indices_data,
+ const RuntimeShape &updates_shape, const UpdatesT *updates_data,
+ const RuntimeShape &output_shape, UpdatesT *output_data)
+{
+ ruy::profiler::ScopeLabel label("ScatterNd");
+
+ int n_slices = 1;
+ int slice_size = 1;
+ const int outer_dims = indices_shape.DimensionsCount() - 1;
+ const int indices_nd = indices_shape.Dims(outer_dims);
+ const int updates_dims = updates_shape.DimensionsCount();
+ for (int i = 0; i < outer_dims; ++i)
+ {
+ n_slices *= indices_shape.Dims(i);
+ }
+ for (int i = outer_dims; i < updates_dims; ++i)
+ {
+ slice_size *= updates_shape.Dims(i);
+ }
+
+ int output_flat_size = output_shape.FlatSize();
+ int remain_flat_size = output_flat_size;
+ std::vector<int> dims_to_count(indices_nd, 0);
+ for (int i = 0; i < indices_nd; ++i)
+ {
+ dims_to_count[i] = remain_flat_size / output_shape.Dims(i);
+ remain_flat_size = dims_to_count[i];
+ }
+
+ memset(output_data, 0, sizeof(UpdatesT) * output_flat_size);
+ for (int i = 0; i < n_slices; ++i)
+ {
+ int to_pos = 0;
+ for (int j = 0; j < indices_nd; ++j)
+ {
+ IndicesT idx = indices_data[i * indices_nd + j];
+ TFLITE_DCHECK(0 <= idx && idx < output_shape.Dims(j));
+ to_pos += idx * dims_to_count[j];
+ }
+ for (int j = 0; j < slice_size; j++)
+ {
+ output_data[to_pos + j] += updates_data[i * slice_size + j];
+ }
+ }
+}
+
+template <typename T>
+inline void Slice(const tflite::SliceParams &op_params, const RuntimeShape &input_shape,
+ const RuntimeShape &output_shape, SequentialTensorWriter<T> *writer)
+{
+ const RuntimeShape ext_shape = RuntimeShape::ExtendedShape(5, input_shape);
+ TFLITE_DCHECK_LE(op_params.begin_count, 5);
+ TFLITE_DCHECK_LE(op_params.size_count, 5);
+ const int begin_count = op_params.begin_count;
+ const int size_count = op_params.size_count;
+ // We front-pad the begin and size vectors.
+ std::array<int, 5> start;
+ std::array<int, 5> stop;
+ for (int i = 0; i < 5; ++i)
+ {
+ int padded_i = 5 - i;
+ start[i] = begin_count < padded_i ? 0 : op_params.begin[begin_count - padded_i];
+ stop[i] = (size_count < padded_i || op_params.size[size_count - padded_i] == -1)
+ ? ext_shape.Dims(i)
+ : start[i] + op_params.size[size_count - padded_i];
+ }
+
+ for (int i0 = start[0]; i0 < stop[0]; ++i0)
+ {
+ for (int i1 = start[1]; i1 < stop[1]; ++i1)
+ {
+ for (int i2 = start[2]; i2 < stop[2]; ++i2)
+ {
+ for (int i3 = start[3]; i3 < stop[3]; ++i3)
+ {
+ for (int i4 = start[4]; i4 < stop[4]; ++i4)
+ {
+ writer->Write(Offset(ext_shape, i0, i1, i2, i3, i4));
+ }
+ }
+ }
+ }
+ }
+}
+
+template <typename T>
+inline void Slice(const tflite::SliceParams &op_params, const RuntimeShape &input_shape,
+ const T *input_data, const RuntimeShape &output_shape, T *output_data)
+{
+ SequentialTensorWriter<T> writer(input_data, output_data);
+ return Slice(op_params, input_shape, output_shape, &writer);
+}
+
+template <typename T>
+inline void Slice(const tflite::SliceParams &op_params, const RuntimeShape &input_shape,
+ const TfLiteTensor *input, const RuntimeShape &output_shape, TfLiteTensor *output)
+{
+ SequentialTensorWriter<T> writer(input, output);
+ return Slice(op_params, input_shape, output_shape, &writer);
+}
+
+template <typename T>
+void Minimum(const RuntimeShape &input1_shape, const T *input1_data, const T *input2_data,
+ const RuntimeShape &output_shape, T *output_data)
+{
+ const int flat_size = MatchingFlatSize(input1_shape, output_shape);
+
+ auto min_value = input2_data[0];
+ for (int i = 0; i < flat_size; i++)
+ {
+ output_data[i] = input1_data[i] > min_value ? min_value : input1_data[i];
+ }
+}
+
+// Convenience version that allows, for example, generated-code calls to be
+// the same as other binary ops.
+template <typename T>
+inline void Minimum(const RuntimeShape &input1_shape, const T *input1_data, const RuntimeShape &,
+ const T *input2_data, const RuntimeShape &output_shape, T *output_data)
+{
+ // Drop shape of second input: not needed.
+ Minimum(input1_shape, input1_data, input2_data, output_shape, output_data);
+}
+
+template <typename T>
+void Maximum(const RuntimeShape &input1_shape, const T *input1_data, const T *input2_data,
+ const RuntimeShape &output_shape, T *output_data)
+{
+ const int flat_size = MatchingFlatSize(input1_shape, output_shape);
+
+ auto max_value = input2_data[0];
+ for (int i = 0; i < flat_size; i++)
+ {
+ output_data[i] = input1_data[i] < max_value ? max_value : input1_data[i];
+ }
+}
+
+// Convenience version that allows, for example, generated-code calls to be
+// the same as other binary ops.
+template <typename T>
+inline void Maximum(const RuntimeShape &input1_shape, const T *input1_data, const RuntimeShape &,
+ const T *input2_data, const RuntimeShape &output_shape, T *output_data)
+{
+ // Drop shape of second input: not needed.
+ Maximum(input1_shape, input1_data, input2_data, output_shape, output_data);
+}
+
+template <typename T1, typename T2, typename T3>
+void ArgMax(const RuntimeShape &input1_shape, const T1 *input1_data, const T3 *input2_data,
+ const RuntimeShape &output_shape, T2 *output_data)
+{
+ ArgMinMax(input1_shape, input1_data, input2_data, output_shape, output_data, std::greater<T1>());
+}
+
+// Convenience version that allows, for example, generated-code calls to be
+// the same as other binary ops.
+template <typename T1, typename T2, typename T3>
+inline void ArgMax(const RuntimeShape &input1_shape, const T1 *input1_data,
+ const RuntimeShape &input2_shape, const T3 *input2_data,
+ const RuntimeShape &output_shape, T2 *output_data)
+{
+ // Drop shape of second input: not needed.
+ ArgMax(input1_shape, input1_data, input2_data, output_shape, output_data);
+}
+
+template <typename D, typename T>
+void Select(const RuntimeShape &input_condition_shape, const D *input_condition_data,
+ const RuntimeShape &input_x_shape, const T *input_x_data,
+ const RuntimeShape &input_y_shape, const T *input_y_data,
+ const RuntimeShape &output_shape, T *output_data)
+{
+ int64_t flatsize;
+ // Allow select operator executions on mixed scalar tensors and one element
+ // tensors.
+ if (input_condition_shape.FlatSize() == 1 && input_x_shape.FlatSize() == 1 &&
+ input_y_shape.FlatSize() == 1 && output_shape.FlatSize() == 1)
+ {
+ flatsize = 1;
+ }
+ else
+ {
+ flatsize = MatchingFlatSize(input_condition_shape, input_x_shape, input_y_shape, output_shape);
+ }
+ for (int64_t i = 0; i < flatsize; ++i)
+ {
+ output_data[i] = input_condition_data[i] ? input_x_data[i] : input_y_data[i];
+ }
+}
+
+template <typename D, typename T>
+void RankOneSelect(const RuntimeShape &input_condition_shape, const D *input_condition_data,
+ const RuntimeShape &input_x_shape, const T *input_x_data,
+ const RuntimeShape &input_y_shape, const T *input_y_data,
+ const RuntimeShape &output_shape, T *output_data)
+{
+ const int64_t outer_size = input_condition_shape.FlatSize();
+ int64_t inner_size;
+ if (input_condition_shape.DimensionsCount() == 0)
+ {
+ inner_size = MatchingFlatSize(input_x_shape, input_y_shape, output_shape);
+ }
+ else
+ {
+ TFLITE_DCHECK_EQ(MatchingDim(input_x_shape, 0, input_y_shape, 0, output_shape, 0), outer_size);
+ inner_size = MatchingFlatSizeSkipDim(input_x_shape, 0, input_y_shape, output_shape);
+ }
+
+ int64_t offset = 0;
+ for (int64_t i = 0; i < outer_size; i++)
+ {
+ const T *input_data = input_condition_data[i] ? input_x_data : input_y_data;
+ memcpy(output_data + offset, input_data + offset, inner_size * sizeof(T));
+ offset += inner_size;
+ }
+}
+
+template <typename D, typename T>
+void BroadcastSelect4DSlow(const RuntimeShape &input_condition_shape, const D *input_condition_data,
+ const RuntimeShape &input_x_shape, const T *input_x_data,
+ const RuntimeShape &input_y_shape, const T *input_y_data,
+ const RuntimeShape &output_shape, T *output_data)
+{
+ TFLITE_DCHECK_LE(input_condition_shape.DimensionsCount(), 4);
+ TFLITE_DCHECK_LE(input_x_shape.DimensionsCount(), 4);
+ TFLITE_DCHECK_LE(input_y_shape.DimensionsCount(), 4);
+ TFLITE_DCHECK_LE(output_shape.DimensionsCount(), 4);
+
+ const RuntimeShape extended_output_shape = RuntimeShape::ExtendedShape(4, output_shape);
+
+ NdArrayDesc<4> desc_condition;
+ NdArrayDesc<4> desc_x;
+ NdArrayDesc<4> desc_y;
+ NdArrayDescsForElementwiseBroadcast(input_condition_shape, input_x_shape, input_y_shape,
+ &desc_condition, &desc_x, &desc_y);
+
+ // In Tensorflow, the dimensions are canonically named (batch_number, row,
+ // col, channel), with extents (batches, height, width, depth), with the
+ // trailing dimension changing most rapidly (channels has the smallest
+ // stride, typically 1 element).
+ //
+ // In generated C code, we store arrays with the dimensions reversed. The
+ // first dimension has smallest stride.
+ //
+ // We name our variables by their Tensorflow convention, but generate C code
+ // nesting loops such that the innermost loop has the smallest stride for
+ // the best cache behavior.
+ for (int b = 0; b < extended_output_shape.Dims(0); ++b)
+ {
+ for (int y = 0; y < extended_output_shape.Dims(1); ++y)
+ {
+ for (int x = 0; x < extended_output_shape.Dims(2); ++x)
+ {
+ for (int c = 0; c < extended_output_shape.Dims(3); ++c)
+ {
+ const int condition_index = SubscriptToIndex(desc_condition, b, y, x, c);
+ const int x_index = SubscriptToIndex(desc_x, b, y, x, c);
+ const int y_index = SubscriptToIndex(desc_y, b, y, x, c);
+ output_data[Offset(extended_output_shape, b, y, x, c)] =
+ input_condition_data[condition_index] ? input_x_data[x_index] : input_y_data[y_index];
+ }
+ }
+ }
+ }
+}
+
+template <typename D, typename T>
+void SelectTrueCoords(const RuntimeShape &input_condition_shape, const D *input_condition_data,
+ T *output_data)
+{
+ const size_t size = input_condition_shape.FlatSize();
+ if (size == 0)
+ {
+ // Dimension is zero, in which case we don't need to output.
+ return;
+ }
+ const size_t cond_rank = input_condition_shape.DimensionsCount();
+
+ std::vector<int> dims_to_count(cond_rank, 0);
+ int cur_flat_size = size;
+ for (int i = 0; i < cond_rank; ++i)
+ {
+ dims_to_count[i] = cur_flat_size / input_condition_shape.Dims(i);
+ cur_flat_size = dims_to_count[i];
+ }
+
+ int output_index = 0;
+ for (int i = 0; i < size; ++i)
+ {
+ if (input_condition_data[i])
+ {
+ // Insert the coordinate of the current item (row major) into output.
+ int flat_index = i;
+ for (int j = 0; j < cond_rank; ++j)
+ {
+ int coord_j = flat_index / dims_to_count[j];
+ output_data[output_index * cond_rank + j] = coord_j;
+ flat_index %= dims_to_count[j];
+ }
+ output_index++;
+ }
+ }
+}
+
+// For easy implementation, the indices is always a vector of size-4 vectors.
+template <typename T, typename TI>
+inline void SparseToDense(const std::vector<std::vector<TI>> &indices, const T *values,
+ T default_value, bool value_is_scalar,
+ const RuntimeShape &unextended_output_shape, T *output_data)
+{
+ TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), 4);
+ const RuntimeShape output_shape = RuntimeShape::ExtendedShape(4, unextended_output_shape);
+ const int value_count = indices.size();
+
+ // First fill the output_data with default value.
+ const int num_elements = output_shape.FlatSize();
+ for (int i = 0; i < num_elements; ++i)
+ {
+ output_data[i] = default_value;
+ }
+
+ // Special handle for value is scalar case to avoid checking the boolean
+ // condition within the loop every time.
+ if (value_is_scalar)
+ {
+ for (int i = 0; i < value_count; ++i)
+ {
+ const std::vector<TI> &index = indices[i];
+ TFLITE_DCHECK_EQ(index.size(), 4);
+ const T value = *values; // just use the first value.
+ output_data[Offset(output_shape, index[0], index[1], index[2], index[3])] = value;
+ }
+ return;
+ }
+
+ // Go through the values and indices to fill the sparse values.
+ for (int i = 0; i < value_count; ++i)
+ {
+ const std::vector<TI> &index = indices[i];
+ TFLITE_DCHECK_EQ(index.size(), 4);
+ const T value = values[i];
+ output_data[Offset(output_shape, index[0], index[1], index[2], index[3])] = value;
+ }
+}
+
+template <typename T>
+inline void Pow(const RuntimeShape &input1_shape, const T *input1_data,
+ const RuntimeShape &input2_shape, const T *input2_data,
+ const RuntimeShape &output_shape, T *output_data)
+{
+ const int flat_size = MatchingFlatSize(input1_shape, input2_shape, output_shape);
+ for (int i = 0; i < flat_size; ++i)
+ {
+ output_data[i] = std::pow(input1_data[i], input2_data[i]);
+ }
+}
+
+template <typename T>
+inline void BroadcastPow4DSlow(const RuntimeShape &unextended_input1_shape, const T *input1_data,
+ const RuntimeShape &unextended_input2_shape, const T *input2_data,
+ const RuntimeShape &unextended_output_shape, T *output_data)
+{
+ TFLITE_DCHECK_LE(unextended_input1_shape.DimensionsCount(), 4);
+ TFLITE_DCHECK_LE(unextended_input2_shape.DimensionsCount(), 4);
+ TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), 4);
+ const RuntimeShape output_shape = RuntimeShape::ExtendedShape(4, unextended_output_shape);
+
+ NdArrayDesc<4> desc1;
+ NdArrayDesc<4> desc2;
+ NdArrayDescsForElementwiseBroadcast(unextended_input1_shape, unextended_input2_shape, &desc1,
+ &desc2);
+
+ for (int b = 0; b < output_shape.Dims(0); ++b)
+ {
+ for (int y = 0; y < output_shape.Dims(1); ++y)
+ {
+ for (int x = 0; x < output_shape.Dims(2); ++x)
+ {
+ for (int c = 0; c < output_shape.Dims(3); ++c)
+ {
+ auto out_idx = Offset(output_shape, b, y, x, c);
+ auto in1_idx = SubscriptToIndex(desc1, b, y, x, c);
+ auto in2_idx = SubscriptToIndex(desc2, b, y, x, c);
+ auto in1_val = input1_data[in1_idx];
+ auto in2_val = input2_data[in2_idx];
+ output_data[out_idx] = std::pow(in1_val, in2_val);
+ }
+ }
+ }
+ }
+}
+
+template <typename Scalar>
+void Reverse(int axis, const RuntimeShape &input_shape, const Scalar *input_data,
+ const RuntimeShape &output_shape, Scalar *output_data)
+{
+ ruy::profiler::ScopeLabel label("Reverse");
+
+ int outer_size = 1;
+ for (int i = 0; i < axis; ++i)
+ {
+ outer_size *= input_shape.Dims(i);
+ }
+
+ int copy_size = 1;
+ for (int i = axis + 1; i < input_shape.DimensionsCount(); ++i)
+ {
+ copy_size *= input_shape.Dims(i);
+ }
+
+ const int dims_at_axis = input_shape.Dims(axis);
+ for (int i = 0; i < outer_size; ++i)
+ {
+ for (int j = 0; j < dims_at_axis; ++j)
+ {
+ const int start_pos = (i * dims_at_axis + j) * copy_size;
+ Scalar *output_ptr = output_data + start_pos;
+ int loc = (i * dims_at_axis + dims_at_axis - j - 1) * copy_size;
+ memcpy(output_ptr, input_data + loc, copy_size * sizeof(Scalar));
+ }
+ }
+}
+
+template <typename Scalar, typename TS>
+void ReverseSequence(const TS *seq_lengths, const int seq_dim, const int batch_dim,
+ const RuntimeShape &input_shape, const Scalar *input_data,
+ const RuntimeShape &output_shape, Scalar *output_data)
+{
+ ruy::profiler::ScopeLabel label("ReverseSequence");
+
+ int outer_size = 1;
+ int outer_dim = std::min(batch_dim, seq_dim);
+ int medium_dim = std::max(batch_dim, seq_dim);
+ for (int i = 0; i < outer_dim; ++i)
+ {
+ outer_size *= input_shape.Dims(i);
+ }
+
+ int medium_size = 1;
+ for (int i = outer_dim + 1; i < medium_dim; ++i)
+ {
+ medium_size *= input_shape.Dims(i);
+ }
+
+ int copy_size = 1;
+ for (int i = medium_dim + 1; i < input_shape.DimensionsCount(); ++i)
+ {
+ copy_size *= input_shape.Dims(i);
+ }
+
+ const int dims_at_outer_dim = input_shape.Dims(outer_dim);
+ const int dims_at_medium_dim = input_shape.Dims(medium_dim);
+
+ Scalar *output_ptr;
+ if (batch_dim > seq_dim)
+ {
+ for (int i = 0; i < outer_size; ++i)
+ {
+ for (int j = 0; j < dims_at_outer_dim; ++j)
+ {
+ const int in_pos_base = (i * dims_at_outer_dim + j) * medium_size;
+ for (int p = 0; p < medium_size; ++p)
+ {
+ for (int q = 0; q < dims_at_medium_dim; ++q)
+ {
+ const int in_pos = ((in_pos_base + p) * dims_at_medium_dim + q) * copy_size;
+ const Scalar *in_ptr = input_data + in_pos;
+ int sl = seq_lengths[q] - 1;
+ if (j > sl)
+ {
+ output_ptr = output_data + in_pos;
+ }
+ else
+ {
+ const int out_pos_base = (i * dims_at_outer_dim + sl - j) * medium_size;
+ const int out_pos = ((out_pos_base + p) * dims_at_medium_dim + q) * copy_size;
+ output_ptr = output_data + out_pos;
+ }
+ memcpy(output_ptr, in_ptr, copy_size * sizeof(Scalar));
+ }
+ }
+ }
+ }
+ }
+ else if (batch_dim < seq_dim)
+ {
+ for (int i = 0; i < outer_size; ++i)
+ {
+ for (int j = 0; j < dims_at_outer_dim; ++j)
+ {
+ const int in_pos_base = (i * dims_at_outer_dim + j) * medium_size;
+ int sl = seq_lengths[j] - 1;
+ const int out_pos_base = (i * dims_at_outer_dim + j) * medium_size;
+ for (int p = 0; p < medium_size; ++p)
+ {
+ for (int q = 0; q < dims_at_medium_dim; ++q)
+ {
+ const int in_pos = ((in_pos_base + p) * dims_at_medium_dim + q) * copy_size;
+ const Scalar *in_ptr = input_data + in_pos;
+ if (q > sl)
+ {
+ output_ptr = output_data + in_pos;
+ }
+ else
+ {
+ const int out_pos = ((out_pos_base + p) * dims_at_medium_dim + sl - q) * copy_size;
+ output_ptr = output_data + out_pos;
+ }
+ memcpy(output_ptr, in_ptr, copy_size * sizeof(Scalar));
+ }
+ }
+ }
+ }
+ }
+}
+
+template <typename T>
+inline void SegmentSum(const RuntimeShape &input_shape, const T *input_data,
+ const RuntimeShape &segment_ids_shape, const int32_t *segment_ids_data,
+ const RuntimeShape &output_shape, T *output_data)
+{
+ const int segment_flat_size = MatchingFlatSizeSkipDim(input_shape, 0, output_shape);
+
+ memset(output_data, 0, sizeof(T) * output_shape.FlatSize());
+
+ for (int i = 0; i < input_shape.Dims(0); i++)
+ {
+ int output_index = segment_ids_data[i];
+ for (int j = 0; j < segment_flat_size; ++j)
+ {
+ output_data[output_index * segment_flat_size + j] += input_data[i * segment_flat_size + j];
+ }
+ }
+}
+
+} // namespace reference_ops
+} // namespace tflite
+
+#endif // LUCI_INTERPRETER_PAL_REFERENCE_OPS_H
diff --git a/compiler/luci-interpreter/pal/mcu/pal.cmake b/compiler/luci-interpreter/pal/mcu/pal.cmake
new file mode 100644
index 000000000..907d51de6
--- /dev/null
+++ b/compiler/luci-interpreter/pal/mcu/pal.cmake
@@ -0,0 +1,56 @@
+macro(initialize_pal)
+ nnas_find_package(TensorFlowSource EXACT 2.6.0 QUIET)
+ nnas_find_package(TensorFlowGEMMLowpSource EXACT 2.6.0 QUIET)
+ nnas_find_package(TensorFlowEigenSource EXACT 2.6.0 QUIET)
+ nnas_find_package(TensorFlowRuySource EXACT 2.6.0 QUIET)
+
+ if (NOT TensorFlowSource_FOUND)
+ message(STATUS "Skipping luci-interpreter: TensorFlow not found")
+ return()
+ endif ()
+
+ if (NOT TensorFlowGEMMLowpSource_FOUND)
+ message(STATUS "Skipping luci-interpreter: gemmlowp not found")
+ return()
+ endif ()
+
+ if (NOT TensorFlowEigenSource_FOUND)
+ message(STATUS "Skipping luci-interpreter: Eigen not found")
+ return()
+ endif ()
+
+ if (NOT TensorFlowRuySource_FOUND)
+ message(STATUS "Skipping luci-interpreter: Ruy not found")
+ return()
+ endif ()
+ #find_package(Threads REQUIRED)
+
+ set(PAL_INITIALIZED TRUE)
+endmacro()
+
+macro(add_pal_to_target TGT)
+ target_include_directories(${TGT} PRIVATE "${PAL}")
+ target_include_directories(${TGT} PRIVATE
+ "${TensorFlowRuySource_DIR}"
+ "${TensorFlowGEMMLowpSource_DIR}"
+ "${TensorFlowEigenSource_DIR}"
+ "${TensorFlowSource_DIR}")
+ target_include_directories(${TGT} PRIVATE ${LUCI_INTERPRETER_PAL_DIR})
+
+ # TODO put it back, I changed my mind.
+ # instead add sources with visitors in this library
+ set(PAL_SOURCES ${TensorFlowSource_DIR}/tensorflow/lite/kernels/internal/quantization_util.cc
+ ${TensorFlowSource_DIR}/tensorflow/lite/kernels/internal/tensor_utils.cc
+ ${TensorFlowSource_DIR}/tensorflow/lite/kernels/internal/reference/portable_tensor_utils.cc)
+ add_library(luci_interpreter_mcu_pal STATIC ${PAL_SOURCES})
+ set_target_properties(luci_interpreter_mcu_pal PROPERTIES POSITION_INDEPENDENT_CODE ON)
+ target_include_directories(luci_interpreter_mcu_pal PRIVATE
+ "${TensorFlowRuySource_DIR}"
+ "${TensorFlowGEMMLowpSource_DIR}"
+ "${TensorFlowEigenSource_DIR}"
+ "${TensorFlowSource_DIR}"
+ )
+
+ target_link_libraries(${TGT} PRIVATE luci_interpreter_mcu_pal)
+ #target_link_libraries(${TGT} PRIVATE Threads::Threads luci_interpreter_mcu_pal)
+endmacro()