Diffstat (limited to 'compiler/luci-interpreter')
-rw-r--r--  compiler/luci-interpreter/pal/cmsisnn/KernelsToBuild.lst | 2
-rw-r--r--  compiler/luci-interpreter/pal/cmsisnn/PALDequantize.h | 2
-rw-r--r--  compiler/luci-interpreter/pal/cmsisnn/PALQuantize.h | 2
-rw-r--r--  compiler/luci-interpreter/pal/cmsisnn/PALreference_ops.h | 1568
-rw-r--r--  compiler/luci-interpreter/pal/linux/KernelsToBuild.lst | 3
-rw-r--r--  compiler/luci-interpreter/pal/linux/PALreference_ops.h | 22
-rw-r--r--  compiler/luci-interpreter/pal/mcu/KernelsToBuild.lst | 2
-rw-r--r--  compiler/luci-interpreter/pal/mcu/PALDequantize.h | 2
-rw-r--r--  compiler/luci-interpreter/pal/mcu/PALQuantize.h | 2
-rw-r--r--  compiler/luci-interpreter/pal/mcu/PALreference_ops.h | 1556
-rw-r--r--  compiler/luci-interpreter/src/core/KernelParams.h | 5
-rw-r--r--  compiler/luci-interpreter/src/kernels/Fill.cpp | 117
-rw-r--r--  compiler/luci-interpreter/src/kernels/Fill.h | 47
-rw-r--r--  compiler/luci-interpreter/src/kernels/Fill.test.cpp | 169
-rw-r--r--  compiler/luci-interpreter/src/kernels/MirrorPad.cpp | 2
-rw-r--r--  compiler/luci-interpreter/src/kernels/Pack.cpp | 5
-rw-r--r--  compiler/luci-interpreter/src/kernels/Pack.test.cpp | 20
-rw-r--r--  compiler/luci-interpreter/src/kernels/Pad.cpp | 2
-rw-r--r--  compiler/luci-interpreter/src/kernels/PadV2.cpp | 2
-rw-r--r--  compiler/luci-interpreter/src/kernels/ReduceMax.cpp | 181
-rw-r--r--  compiler/luci-interpreter/src/kernels/ReduceMax.h | 50
-rw-r--r--  compiler/luci-interpreter/src/kernels/ReduceMax.test.cpp | 103
-rw-r--r--  compiler/luci-interpreter/src/kernels/Shape.cpp | 70
-rw-r--r--  compiler/luci-interpreter/src/kernels/Shape.h | 46
-rw-r--r--  compiler/luci-interpreter/src/kernels/Shape.test.cpp | 89
-rw-r--r--  compiler/luci-interpreter/src/kernels/SplitV.cpp | 28
-rw-r--r--  compiler/luci-interpreter/src/kernels/StridedSlice.cpp | 5
-rw-r--r--  compiler/luci-interpreter/src/loader/GraphLoader.cpp | 2
-rw-r--r--  compiler/luci-interpreter/src/loader/nodes/Add.cpp | 4
-rw-r--r--  compiler/luci-interpreter/src/loader/nodes/ArgMax.cpp | 4
-rw-r--r--  compiler/luci-interpreter/src/loader/nodes/AveragePool2D.cpp | 4
-rw-r--r--  compiler/luci-interpreter/src/loader/nodes/BatchMatMul.cpp | 4
-rw-r--r--  compiler/luci-interpreter/src/loader/nodes/BatchToSpaceND.cpp | 4
-rw-r--r--  compiler/luci-interpreter/src/loader/nodes/Cast.cpp | 4
-rw-r--r--  compiler/luci-interpreter/src/loader/nodes/Concatenation.cpp | 4
-rw-r--r--  compiler/luci-interpreter/src/loader/nodes/Conv2D.cpp | 4
-rw-r--r--  compiler/luci-interpreter/src/loader/nodes/DepthToSpace.cpp | 4
-rw-r--r--  compiler/luci-interpreter/src/loader/nodes/DepthwiseConv2D.cpp | 4
-rw-r--r--  compiler/luci-interpreter/src/loader/nodes/Dequantize.cpp | 4
-rw-r--r--  compiler/luci-interpreter/src/loader/nodes/Div.cpp | 4
-rw-r--r--  compiler/luci-interpreter/src/loader/nodes/Elu.cpp | 4
-rw-r--r--  compiler/luci-interpreter/src/loader/nodes/Equal.cpp | 4
-rw-r--r--  compiler/luci-interpreter/src/loader/nodes/Exp.cpp | 4
-rw-r--r--  compiler/luci-interpreter/src/loader/nodes/Fill.cpp | 37
-rw-r--r--  compiler/luci-interpreter/src/loader/nodes/Floor.cpp | 4
-rw-r--r--  compiler/luci-interpreter/src/loader/nodes/FloorDiv.cpp | 4
-rw-r--r--  compiler/luci-interpreter/src/loader/nodes/FullyConnected.cpp | 4
-rw-r--r--  compiler/luci-interpreter/src/loader/nodes/Gather.cpp | 4
-rw-r--r--  compiler/luci-interpreter/src/loader/nodes/Greater.cpp | 4
-rw-r--r--  compiler/luci-interpreter/src/loader/nodes/GreaterEqual.cpp | 4
-rw-r--r--  compiler/luci-interpreter/src/loader/nodes/If.cpp | 4
-rw-r--r--  compiler/luci-interpreter/src/loader/nodes/InstanceNorm.cpp | 4
-rw-r--r--  compiler/luci-interpreter/src/loader/nodes/L2Normalize.cpp | 4
-rw-r--r--  compiler/luci-interpreter/src/loader/nodes/L2Pool2D.cpp | 4
-rw-r--r--  compiler/luci-interpreter/src/loader/nodes/LeakyRelu.cpp | 4
-rw-r--r--  compiler/luci-interpreter/src/loader/nodes/Less.cpp | 4
-rw-r--r--  compiler/luci-interpreter/src/loader/nodes/LessEqual.cpp | 4
-rw-r--r--  compiler/luci-interpreter/src/loader/nodes/LocalResponseNormalization.cpp | 4
-rw-r--r--  compiler/luci-interpreter/src/loader/nodes/LogSoftmax.cpp | 4
-rw-r--r--  compiler/luci-interpreter/src/loader/nodes/LogicalAnd.cpp | 4
-rw-r--r--  compiler/luci-interpreter/src/loader/nodes/LogicalNot.cpp | 4
-rw-r--r--  compiler/luci-interpreter/src/loader/nodes/LogicalOr.cpp | 4
-rw-r--r--  compiler/luci-interpreter/src/loader/nodes/Logistic.cpp | 4
-rw-r--r--  compiler/luci-interpreter/src/loader/nodes/MaxPool2D.cpp | 4
-rw-r--r--  compiler/luci-interpreter/src/loader/nodes/Maximum.cpp | 4
-rw-r--r--  compiler/luci-interpreter/src/loader/nodes/Mean.cpp | 4
-rw-r--r--  compiler/luci-interpreter/src/loader/nodes/Minimum.cpp | 4
-rw-r--r--  compiler/luci-interpreter/src/loader/nodes/MirrorPad.cpp | 4
-rw-r--r--  compiler/luci-interpreter/src/loader/nodes/Mul.cpp | 4
-rw-r--r--  compiler/luci-interpreter/src/loader/nodes/Neg.cpp | 4
-rw-r--r--  compiler/luci-interpreter/src/loader/nodes/NotEqual.cpp | 4
-rw-r--r--  compiler/luci-interpreter/src/loader/nodes/PRelu.cpp | 4
-rw-r--r--  compiler/luci-interpreter/src/loader/nodes/Pack.cpp | 4
-rw-r--r--  compiler/luci-interpreter/src/loader/nodes/Pad.cpp | 4
-rw-r--r--  compiler/luci-interpreter/src/loader/nodes/PadV2.cpp | 4
-rw-r--r--  compiler/luci-interpreter/src/loader/nodes/Pow.cpp | 4
-rw-r--r--  compiler/luci-interpreter/src/loader/nodes/Quantize.cpp | 5
-rw-r--r--  compiler/luci-interpreter/src/loader/nodes/ReduceMax.cpp | 55
-rw-r--r--  compiler/luci-interpreter/src/loader/nodes/Relu.cpp | 4
-rw-r--r--  compiler/luci-interpreter/src/loader/nodes/Relu6.cpp | 4
-rw-r--r--  compiler/luci-interpreter/src/loader/nodes/Reshape.cpp | 4
-rw-r--r--  compiler/luci-interpreter/src/loader/nodes/ResizeBilinear.cpp | 4
-rw-r--r--  compiler/luci-interpreter/src/loader/nodes/ResizeNearestNeighbor.cpp | 4
-rw-r--r--  compiler/luci-interpreter/src/loader/nodes/ReverseV2.cpp | 4
-rw-r--r--  compiler/luci-interpreter/src/loader/nodes/Rsqrt.cpp | 4
-rw-r--r--  compiler/luci-interpreter/src/loader/nodes/SVDF.cpp | 5
-rw-r--r--  compiler/luci-interpreter/src/loader/nodes/Shape.cpp | 39
-rw-r--r--  compiler/luci-interpreter/src/loader/nodes/Slice.cpp | 4
-rw-r--r--  compiler/luci-interpreter/src/loader/nodes/Softmax.cpp | 4
-rw-r--r--  compiler/luci-interpreter/src/loader/nodes/SpaceToBatchND.cpp | 4
-rw-r--r--  compiler/luci-interpreter/src/loader/nodes/SpaceToDepth.cpp | 4
-rw-r--r--  compiler/luci-interpreter/src/loader/nodes/Split.cpp | 4
-rw-r--r--  compiler/luci-interpreter/src/loader/nodes/SplitV.cpp | 4
-rw-r--r--  compiler/luci-interpreter/src/loader/nodes/Sqrt.cpp | 4
-rw-r--r--  compiler/luci-interpreter/src/loader/nodes/Square.cpp | 4
-rw-r--r--  compiler/luci-interpreter/src/loader/nodes/SquaredDifference.cpp | 4
-rw-r--r--  compiler/luci-interpreter/src/loader/nodes/Squeeze.cpp | 4
-rw-r--r--  compiler/luci-interpreter/src/loader/nodes/StridedSlice.cpp | 4
-rw-r--r--  compiler/luci-interpreter/src/loader/nodes/Sub.cpp | 4
-rw-r--r--  compiler/luci-interpreter/src/loader/nodes/Tanh.cpp | 4
-rw-r--r--  compiler/luci-interpreter/src/loader/nodes/Transpose.cpp | 4
-rw-r--r--  compiler/luci-interpreter/src/loader/nodes/TransposeConv.cpp | 4
-rw-r--r--  compiler/luci-interpreter/src/loader/nodes/Unpack.cpp | 4
-rw-r--r--  compiler/luci-interpreter/src/loader/nodes/While.cpp | 4
104 files changed, 4295 insertions(+), 234 deletions(-)
diff --git a/compiler/luci-interpreter/pal/cmsisnn/KernelsToBuild.lst b/compiler/luci-interpreter/pal/cmsisnn/KernelsToBuild.lst
index d134a6b95..f0df58db3 100644
--- a/compiler/luci-interpreter/pal/cmsisnn/KernelsToBuild.lst
+++ b/compiler/luci-interpreter/pal/cmsisnn/KernelsToBuild.lst
@@ -12,6 +12,7 @@ REGISTER_KERNEL(Div)
REGISTER_KERNEL(Elu)
REGISTER_KERNEL(Exp)
REGISTER_KERNEL(ExpandDims)
+REGISTER_KERNEL(Fill)
REGISTER_KERNEL(Floor)
REGISTER_KERNEL(FloorDiv)
REGISTER_KERNEL(Equal)
@@ -44,6 +45,7 @@ REGISTER_KERNEL(Reshape)
REGISTER_KERNEL(ResizeBilinear)
REGISTER_KERNEL(ResizeNearestNeighbor)
REGISTER_KERNEL(Rsqrt)
+REGISTER_KERNEL(Shape)
REGISTER_KERNEL(Softmax)
REGISTER_KERNEL(SpaceToBatchND)
REGISTER_KERNEL(SpaceToDepth)
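KernelsToBuild.lst is an X-macro style list: each PAL backend compiles only the kernels named in it, so the Fill and Shape entries added above are what pull the new kernel sources into the CMSIS-NN build. A minimal sketch of how such a list is typically consumed, with the macro body and function name purely illustrative (the real consumer is luci-interpreter's kernel builder, which is not part of this diff):

// Hypothetical consumer: expand the list into one declaration per enabled kernel.
#define REGISTER_KERNEL(name) void register_kernel_##name();
#include "KernelsToBuild.lst"
#undef REGISTER_KERNEL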
diff --git a/compiler/luci-interpreter/pal/cmsisnn/PALDequantize.h b/compiler/luci-interpreter/pal/cmsisnn/PALDequantize.h
index 15ff0327b..efa6b167e 100644
--- a/compiler/luci-interpreter/pal/cmsisnn/PALDequantize.h
+++ b/compiler/luci-interpreter/pal/cmsisnn/PALDequantize.h
@@ -18,7 +18,7 @@
#define LUCI_INTERPRETER_PAL_DEQUANTIZE_H
#include "tensorflow/lite/kernels/internal/reference/integer_ops/dequantize.h"
-#include "tensorflow/lite/kernels/internal/reference/reference_ops.h"
+#include "PALreference_ops.h"
namespace luci_interpreter_pal
{
diff --git a/compiler/luci-interpreter/pal/cmsisnn/PALQuantize.h b/compiler/luci-interpreter/pal/cmsisnn/PALQuantize.h
index 6046789ae..effb85d54 100644
--- a/compiler/luci-interpreter/pal/cmsisnn/PALQuantize.h
+++ b/compiler/luci-interpreter/pal/cmsisnn/PALQuantize.h
@@ -17,7 +17,7 @@
#ifndef LUCI_INTERPRETER_PAL_QUANTIZE_H
#define LUCI_INTERPRETER_PAL_QUANTIZE_H
-#include "tensorflow/lite/kernels/internal/reference/reference_ops.h"
+#include "PALreference_ops.h"
namespace luci_interpreter_pal
{
diff --git a/compiler/luci-interpreter/pal/cmsisnn/PALreference_ops.h b/compiler/luci-interpreter/pal/cmsisnn/PALreference_ops.h
new file mode 100644
index 000000000..813b1ec2c
--- /dev/null
+++ b/compiler/luci-interpreter/pal/cmsisnn/PALreference_ops.h
@@ -0,0 +1,1568 @@
+/*
+ * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef LUCI_INTERPRETER_PAL_REFERENCE_OPS_H
+#define LUCI_INTERPRETER_PAL_REFERENCE_OPS_H
+
+#include <stdint.h>
+#include <sys/types.h>
+
+#include <algorithm>
+#include <cmath>
+#include <cstring>
+#include <functional>
+#include <limits>
+#include <memory>
+#include <type_traits>
+
+#include "third_party/eigen3/Eigen/Core"
+#include "fixedpoint/fixedpoint.h"
+#include "ruy/profiler/instrumentation.h" // from @ruy
+#include "tensorflow/lite/c/common.h"
+#include "tensorflow/lite/kernels/internal/common.h"
+#include "tensorflow/lite/kernels/internal/quantization_util.h"
+#include "tensorflow/lite/kernels/internal/reference/add.h"
+#include "tensorflow/lite/kernels/internal/reference/add_n.h"
+#include "tensorflow/lite/kernels/internal/reference/arg_min_max.h"
+#include "tensorflow/lite/kernels/internal/reference/batch_matmul.h"
+#include "tensorflow/lite/kernels/internal/reference/batch_to_space_nd.h"
+#include "tensorflow/lite/kernels/internal/reference/binary_function.h"
+#include "tensorflow/lite/kernels/internal/reference/cast.h"
+#include "tensorflow/lite/kernels/internal/reference/ceil.h"
+#include "tensorflow/lite/kernels/internal/reference/comparisons.h"
+#include "tensorflow/lite/kernels/internal/reference/concatenation.h"
+#include "tensorflow/lite/kernels/internal/reference/conv.h"
+#include "tensorflow/lite/kernels/internal/reference/depth_to_space.h"
+#include "tensorflow/lite/kernels/internal/reference/dequantize.h"
+#include "tensorflow/lite/kernels/internal/reference/div.h"
+#include "tensorflow/lite/kernels/internal/reference/elu.h"
+#include "tensorflow/lite/kernels/internal/reference/exp.h"
+#include "tensorflow/lite/kernels/internal/reference/fill.h"
+#include "tensorflow/lite/kernels/internal/reference/floor.h"
+#include "tensorflow/lite/kernels/internal/reference/floor_div.h"
+#include "tensorflow/lite/kernels/internal/reference/floor_mod.h"
+#include "tensorflow/lite/kernels/internal/reference/fully_connected.h"
+#include "tensorflow/lite/kernels/internal/reference/gather.h"
+#include "tensorflow/lite/kernels/internal/reference/hard_swish.h"
+#include "tensorflow/lite/kernels/internal/reference/l2normalization.h"
+#include "tensorflow/lite/kernels/internal/reference/leaky_relu.h"
+#include "tensorflow/lite/kernels/internal/reference/log_softmax.h"
+#include "tensorflow/lite/kernels/internal/reference/logistic.h"
+#include "tensorflow/lite/kernels/internal/reference/maximum_minimum.h"
+#include "tensorflow/lite/kernels/internal/reference/mul.h"
+#include "tensorflow/lite/kernels/internal/reference/neg.h"
+#include "tensorflow/lite/kernels/internal/reference/pad.h"
+#include "tensorflow/lite/kernels/internal/reference/pooling.h"
+#include "tensorflow/lite/kernels/internal/reference/prelu.h"
+#include "tensorflow/lite/kernels/internal/reference/process_broadcast_shapes.h"
+#include "tensorflow/lite/kernels/internal/reference/quantize.h"
+#include "tensorflow/lite/kernels/internal/reference/reduce.h"
+#include "tensorflow/lite/kernels/internal/reference/requantize.h"
+#include "tensorflow/lite/kernels/internal/reference/resize_bilinear.h"
+#include "tensorflow/lite/kernels/internal/reference/resize_nearest_neighbor.h"
+#include "tensorflow/lite/kernels/internal/reference/round.h"
+#include "tensorflow/lite/kernels/internal/reference/softmax.h"
+#include "tensorflow/lite/kernels/internal/reference/space_to_batch_nd.h"
+#include "tensorflow/lite/kernels/internal/reference/space_to_depth.h"
+#include "tensorflow/lite/kernels/internal/reference/strided_slice.h"
+#include "tensorflow/lite/kernels/internal/reference/string_comparisons.h"
+#include "tensorflow/lite/kernels/internal/reference/sub.h"
+#include "tensorflow/lite/kernels/internal/reference/tanh.h"
+#include "tensorflow/lite/kernels/internal/reference/transpose.h"
+#include "tensorflow/lite/kernels/internal/reference/transpose_conv.h"
+#include "tensorflow/lite/kernels/internal/strided_slice_logic.h"
+#include "tensorflow/lite/kernels/internal/tensor.h"
+#include "tensorflow/lite/kernels/internal/types.h"
+namespace tflite
+{
+
+namespace reference_ops
+{
+
+template <typename T>
+inline void Relu(const RuntimeShape &input_shape, const T *input_data,
+ const RuntimeShape &output_shape, T *output_data)
+{
+ const int flat_size = MatchingFlatSize(input_shape, output_shape);
+ for (int i = 0; i < flat_size; ++i)
+ {
+ const T val = input_data[i];
+ const T lower = 0;
+ const T clamped = val < lower ? lower : val;
+ output_data[i] = clamped;
+ }
+}
+
+template <typename T>
+inline void Relu1(const RuntimeShape &input_shape, const T *input_data,
+ const RuntimeShape &output_shape, T *output_data)
+{
+ ruy::profiler::ScopeLabel label("Relu1 (not fused)");
+ const int flat_size = MatchingFlatSize(input_shape, output_shape);
+ for (int i = 0; i < flat_size; ++i)
+ {
+ const T val = input_data[i];
+ const T upper = 1;
+ const T lower = -1;
+ const T clamped = val > upper ? upper : val < lower ? lower : val;
+ output_data[i] = clamped;
+ }
+}
+
+inline void Relu6(const RuntimeShape &input_shape, const float *input_data,
+ const RuntimeShape &output_shape, float *output_data)
+{
+ ruy::profiler::ScopeLabel label("Relu6 (not fused)");
+ const int flat_size = MatchingFlatSize(input_shape, output_shape);
+ for (int i = 0; i < flat_size; ++i)
+ {
+ const float val = input_data[i];
+ const float upper = 6;
+ const float lower = 0;
+ const float clamped = val > upper ? upper : val < lower ? lower : val;
+ output_data[i] = clamped;
+ }
+}
+
+template <typename T>
+inline void ReluX(const tflite::ReluParams &params, const RuntimeShape &input_shape,
+ const T *input_data, const RuntimeShape &output_shape, T *output_data)
+{
+ ruy::profiler::ScopeLabel label("Quantized ReluX (not fused)");
+ const int flat_size = MatchingFlatSize(input_shape, output_shape);
+ for (int i = 0; i < flat_size; ++i)
+ {
+ const int32 val = static_cast<int32_t>(input_data[i]);
+ int32 clamped = params.output_offset + MultiplyByQuantizedMultiplier(val - params.input_offset,
+ params.output_multiplier,
+ params.output_shift);
+ clamped = std::max(params.quantized_activation_min, clamped);
+ clamped = std::min(params.quantized_activation_max, clamped);
+ output_data[i] = static_cast<T>(clamped);
+ }
+}
+
+template <typename T>
+inline void ReluX(const tflite::ActivationParams &params, const RuntimeShape &input_shape,
+ const T *input_data, const RuntimeShape &output_shape, T *output_data)
+{
+ ruy::profiler::ScopeLabel label("Quantized ReluX (not fused)");
+ const int flat_size = MatchingFlatSize(input_shape, output_shape);
+ const T max_value = params.quantized_activation_max;
+ const T min_value = params.quantized_activation_min;
+ for (int i = 0; i < flat_size; ++i)
+ {
+ const T val = input_data[i];
+ const T clamped = val > max_value ? max_value : val < min_value ? min_value : val;
+ output_data[i] = clamped;
+ }
+}
+
+// TODO(jiawen): We can implement BroadcastMul on buffers of arbitrary
+// dimensionality if the runtime code does a single loop over one dimension
+// that handles broadcasting as the base case. The code generator would then
+// generate max(D1, D2) nested for loops.
+inline void BroadcastMulFivefold(const ArithmeticParams &unswitched_params,
+ const RuntimeShape &unswitched_input1_shape,
+ const uint8 *unswitched_input1_data,
+ const RuntimeShape &unswitched_input2_shape,
+ const uint8 *unswitched_input2_data,
+ const RuntimeShape &output_shape, uint8 *output_data)
+{
+ ArithmeticParams switched_params = unswitched_params;
+ switched_params.input1_offset = unswitched_params.input2_offset;
+ switched_params.input2_offset = unswitched_params.input1_offset;
+
+ const bool use_unswitched = unswitched_params.broadcast_category ==
+ tflite::BroadcastableOpCategory::kFirstInputBroadcastsFast;
+
+ const ArithmeticParams &params = use_unswitched ? unswitched_params : switched_params;
+ const uint8 *input1_data = use_unswitched ? unswitched_input1_data : unswitched_input2_data;
+ const uint8 *input2_data = use_unswitched ? unswitched_input2_data : unswitched_input1_data;
+
+ // Fivefold nested loops. The second input resets its position for each
+ // iteration of the second loop. The first input resets its position at the
+ // beginning of the fourth loop. The innermost loop is an elementwise Mul of
+ // sections of the arrays.
+ uint8 *output_data_ptr = output_data;
+ const uint8 *input1_data_ptr = input1_data;
+ const uint8 *input2_data_reset = input2_data;
+ int y0 = params.broadcast_shape[0];
+ int y1 = params.broadcast_shape[1];
+ int y2 = params.broadcast_shape[2];
+ int y3 = params.broadcast_shape[3];
+ int y4 = params.broadcast_shape[4];
+ for (int i0 = 0; i0 < y0; ++i0)
+ {
+ const uint8 *input2_data_ptr;
+ for (int i1 = 0; i1 < y1; ++i1)
+ {
+ input2_data_ptr = input2_data_reset;
+ for (int i2 = 0; i2 < y2; ++i2)
+ {
+ for (int i3 = 0; i3 < y3; ++i3)
+ {
+ MulElementwise(y4, params, input1_data_ptr, input2_data_ptr, output_data_ptr);
+ input2_data_ptr += y4;
+ output_data_ptr += y4;
+ }
+ input1_data_ptr += y4;
+ }
+ }
+ input2_data_reset = input2_data_ptr;
+ }
+}
+
+inline void Mul(const ArithmeticParams &params, const RuntimeShape &input1_shape,
+ const int16 *input1_data, const RuntimeShape &input2_shape,
+ const int16 *input2_data, const RuntimeShape &output_shape, int16 *output_data)
+{
+ ruy::profiler::ScopeLabel label("Mul/Int16");
+
+ const int flat_size = MatchingElementsSize(input1_shape, input2_shape, output_shape);
+
+ for (int i = 0; i < flat_size; i++)
+ {
+ // F0 uses 0 integer bits, range [-1, 1].
+ using F0 = gemmlowp::FixedPoint<std::int16_t, 0>;
+
+ F0 unclamped_result = F0::FromRaw(input1_data[i]) * F0::FromRaw(input2_data[i]);
+ output_data[i] = unclamped_result.raw();
+ }
+}
+
+inline void Mul(const ArithmeticParams &params, const RuntimeShape &input1_shape,
+ const int16 *input1_data, const RuntimeShape &input2_shape,
+ const int16 *input2_data, const RuntimeShape &output_shape, uint8 *output_data)
+{
+ ruy::profiler::ScopeLabel label("Mul/Int16Uint8");
+ int32 output_offset = params.output_offset;
+ int32 output_activation_min = params.quantized_activation_min;
+ int32 output_activation_max = params.quantized_activation_max;
+ TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
+
+ const int flat_size = MatchingElementsSize(input1_shape, input2_shape, output_shape);
+
+ for (int i = 0; i < flat_size; i++)
+ {
+ // F0 uses 0 integer bits, range [-1, 1].
+ using F0 = gemmlowp::FixedPoint<std::int16_t, 0>;
+
+ F0 unclamped_result = F0::FromRaw(input1_data[i]) * F0::FromRaw(input2_data[i]);
+ int16 rescaled_result = gemmlowp::RoundingDivideByPOT(unclamped_result.raw(), 8);
+ int16 clamped_result = std::min<int16>(output_activation_max - output_offset, rescaled_result);
+ clamped_result = std::max<int16>(output_activation_min - output_offset, clamped_result);
+ output_data[i] = output_offset + clamped_result;
+ }
+}
+
+inline void Sub16(const ArithmeticParams &params, const RuntimeShape &input1_shape,
+ const int16_t *input1_data, const RuntimeShape &input2_shape,
+ const int16_t *input2_data, const RuntimeShape &output_shape,
+ int16_t *output_data)
+{
+ ruy::profiler::ScopeLabel label("Sub/Int16");
+ const int input1_shift = params.input1_shift;
+ const int flat_size = MatchingElementsSize(input1_shape, input2_shape, output_shape);
+ const int16 output_activation_min = params.quantized_activation_min;
+ const int16 output_activation_max = params.quantized_activation_max;
+
+ TFLITE_DCHECK(input1_shift == 0 || params.input2_shift == 0);
+ TFLITE_DCHECK_LE(input1_shift, 0);
+ TFLITE_DCHECK_LE(params.input2_shift, 0);
+ const int16 *not_shift_input = input1_shift == 0 ? input1_data : input2_data;
+ const int16 *shift_input = input1_shift == 0 ? input2_data : input1_data;
+ const int input_right_shift = input1_shift == 0 ? -params.input2_shift : -input1_shift;
+
+ if (input1_shift == 0)
+ {
+ // F0 uses 0 integer bits, range [-1, 1].
+ using F0 = gemmlowp::FixedPoint<std::int16_t, 0>;
+ for (int i = 0; i < flat_size; ++i)
+ {
+ F0 input_ready_scaled = F0::FromRaw(not_shift_input[i]);
+ F0 scaled_input =
+ F0::FromRaw(gemmlowp::RoundingDivideByPOT(shift_input[i], input_right_shift));
+ F0 result = SaturatingSub(input_ready_scaled, scaled_input);
+ const int16 raw_output = result.raw();
+ const int16 clamped_output =
+ std::min(output_activation_max, std::max(output_activation_min, raw_output));
+ output_data[i] = clamped_output;
+ }
+ }
+ else
+ {
+ // F0 uses 0 integer bits, range [-1, 1].
+ using F0 = gemmlowp::FixedPoint<std::int16_t, 0>;
+ for (int i = 0; i < flat_size; ++i)
+ {
+ F0 input_ready_scaled = F0::FromRaw(not_shift_input[i]);
+ F0 scaled_input =
+ F0::FromRaw(gemmlowp::RoundingDivideByPOT(shift_input[i], input_right_shift));
+ F0 result = SaturatingSub(scaled_input, input_ready_scaled);
+ const int16 raw_output = result.raw();
+ const int16 clamped_output =
+ std::min(output_activation_max, std::max(output_activation_min, raw_output));
+ output_data[i] = clamped_output;
+ }
+ }
+}
+
+template <typename Scalar>
+void Pack(const PackParams &params, const RuntimeShape *const *input_shapes,
+ const Scalar *const *input_data, const RuntimeShape &output_shape, Scalar *output_data)
+{
+ ruy::profiler::ScopeLabel label("Pack");
+ const int dimensions = output_shape.DimensionsCount();
+ int axis = params.axis;
+ int inputs_count = params.inputs_count;
+
+ int outer_size = 1;
+ for (int i = 0; i < axis; i++)
+ {
+ outer_size *= output_shape.Dims(i);
+ }
+ int copy_size = 1;
+ for (int i = params.axis + 1; i < dimensions; i++)
+ {
+ copy_size *= output_shape.Dims(i);
+ }
+ TFLITE_DCHECK_EQ((**input_shapes).FlatSize(), copy_size * outer_size);
+
+ for (int i = 0; i < inputs_count; ++i)
+ {
+ for (int k = 0; k < outer_size; k++)
+ {
+ const Scalar *input_ptr = input_data[i] + copy_size * k;
+ int loc = k * inputs_count * copy_size + i * copy_size;
+ memcpy(output_data + loc, input_ptr, copy_size * sizeof(Scalar));
+ }
+ }
+}
+
+template <typename Scalar>
+void Unpack(const UnpackParams &params, const RuntimeShape &input_shape, const Scalar *input_data,
+ const RuntimeShape &output_shape, Scalar *const *output_datas)
+{
+ ruy::profiler::ScopeLabel label("Unpack");
+ const int dimensions = input_shape.DimensionsCount();
+ const int outputs_count = params.num_split;
+
+ int outer_size = 1;
+ int axis = params.axis;
+ if (axis < 0)
+ {
+ axis += dimensions;
+ }
+ TFLITE_DCHECK_GE(axis, 0);
+ TFLITE_DCHECK_LT(axis, dimensions);
+ for (int i = 0; i < axis; ++i)
+ {
+ outer_size *= input_shape.Dims(i);
+ }
+ int copy_size = 1;
+ for (int i = axis + 1; i < dimensions; ++i)
+ {
+ copy_size *= input_shape.Dims(i);
+ }
+ TFLITE_DCHECK_EQ(output_shape.FlatSize(), copy_size * outer_size);
+
+ for (int i = 0; i < outputs_count; ++i)
+ {
+ for (int k = 0; k < outer_size; k++)
+ {
+ Scalar *output_ptr = output_datas[i] + copy_size * k;
+ int loc = k * outputs_count * copy_size + i * copy_size;
+ memcpy(output_ptr, input_data + loc, copy_size * sizeof(Scalar));
+ }
+ }
+}
+
+template <typename Scalar>
+void PackWithScaling(const PackParams &params, const RuntimeShape *const *input_shapes,
+ const uint8 *const *input_data, const RuntimeShape &output_shape,
+ uint8 *output_data)
+{
+ ruy::profiler::ScopeLabel label("PackWithScaling");
+ const int dimensions = output_shape.DimensionsCount();
+ int axis = params.axis;
+ const int32 *input_zeropoint = params.input_zeropoint;
+ const float *input_scale = params.input_scale;
+ int inputs_count = params.inputs_count;
+ const int32 output_zeropoint = params.output_zeropoint;
+ const float output_scale = params.output_scale;
+
+ int outer_size = 1;
+ for (int i = 0; i < axis; i++)
+ {
+ outer_size *= output_shape.Dims(i);
+ }
+ int copy_size = 1;
+ for (int i = axis + 1; i < dimensions; i++)
+ {
+ copy_size *= output_shape.Dims(i);
+ }
+ TFLITE_DCHECK_EQ((**input_shapes).FlatSize(), copy_size * outer_size);
+
+ Scalar *output_ptr = output_data;
+ const float inverse_output_scale = 1.f / output_scale;
+ for (int k = 0; k < outer_size; k++)
+ {
+ for (int i = 0; i < inputs_count; ++i)
+ {
+ if (input_zeropoint[i] == output_zeropoint && input_scale[i] == output_scale)
+ {
+ memcpy(output_ptr, input_data[i] + k * copy_size, copy_size * sizeof(Scalar));
+ }
+ else
+ {
+ assert(false);
+ const float scale = input_scale[i] * inverse_output_scale;
+ const float bias = -input_zeropoint[i] * scale;
+ auto input_ptr = input_data[i];
+ for (int j = 0; j < copy_size; ++j)
+ {
+ const int value =
+ static_cast<int32_t>(std::round(input_ptr[j] * scale + bias)) + output_zeropoint;
+ output_ptr[j] = static_cast<uint8_t>(std::max(std::min(255, value), 0));
+ }
+ }
+ output_ptr += copy_size;
+ }
+ }
+}
+
+template <typename Scalar>
+void DepthConcatenation(const ConcatenationParams &params, const RuntimeShape *const *input_shapes,
+ const Scalar *const *input_data, const RuntimeShape &output_shape,
+ Scalar *output_data)
+{
+ ruy::profiler::ScopeLabel label("DepthConcatenation");
+ auto params_copy = params;
+ params_copy.axis = 3;
+ Concatenation(params_copy, input_shapes, input_data, output_shape, output_data);
+}
+
+inline void LstmCell(const LstmCellParams &params, const RuntimeShape &unextended_input_shape,
+ const float *input_data, const RuntimeShape &unextended_prev_activ_shape,
+ const float *prev_activ_data, const RuntimeShape &weights_shape,
+ const float *weights_data, const RuntimeShape &unextended_bias_shape,
+ const float *bias_data, const RuntimeShape &unextended_prev_state_shape,
+ const float *prev_state_data,
+ const RuntimeShape &unextended_output_state_shape, float *output_state_data,
+ const RuntimeShape &unextended_output_activ_shape, float *output_activ_data,
+ const RuntimeShape &unextended_concat_temp_shape, float *concat_temp_data,
+ const RuntimeShape &unextended_activ_temp_shape, float *activ_temp_data)
+{
+ TFLITE_DCHECK_LE(unextended_input_shape.DimensionsCount(), 4);
+ TFLITE_DCHECK_LE(unextended_prev_activ_shape.DimensionsCount(), 4);
+ TFLITE_DCHECK_LE(unextended_bias_shape.DimensionsCount(), 4);
+ TFLITE_DCHECK_LE(unextended_prev_state_shape.DimensionsCount(), 4);
+ TFLITE_DCHECK_LE(unextended_output_state_shape.DimensionsCount(), 4);
+ TFLITE_DCHECK_LE(unextended_output_activ_shape.DimensionsCount(), 4);
+ TFLITE_DCHECK_LE(unextended_concat_temp_shape.DimensionsCount(), 4);
+ TFLITE_DCHECK_LE(unextended_activ_temp_shape.DimensionsCount(), 4);
+ const RuntimeShape input_shape = RuntimeShape::ExtendedShape(4, unextended_input_shape);
+ const RuntimeShape prev_activ_shape = RuntimeShape::ExtendedShape(4, unextended_prev_activ_shape);
+ const RuntimeShape bias_shape = RuntimeShape::ExtendedShape(4, unextended_bias_shape);
+ const RuntimeShape prev_state_shape = RuntimeShape::ExtendedShape(4, unextended_prev_state_shape);
+ const RuntimeShape output_state_shape =
+ RuntimeShape::ExtendedShape(4, unextended_output_state_shape);
+ const RuntimeShape output_activ_shape =
+ RuntimeShape::ExtendedShape(4, unextended_output_activ_shape);
+ const RuntimeShape concat_temp_shape =
+ RuntimeShape::ExtendedShape(4, unextended_concat_temp_shape);
+ const RuntimeShape activ_temp_shape = RuntimeShape::ExtendedShape(4, unextended_activ_temp_shape);
+ TFLITE_DCHECK_GE(weights_shape.DimensionsCount(), 2);
+
+ const int weights_dim_count = weights_shape.DimensionsCount();
+ const int batches = MatchingDim(input_shape, 0, prev_activ_shape, 0, prev_state_shape, 0,
+ output_state_shape, 0, output_activ_shape, 0);
+ const int height = MatchingDim(input_shape, 1, prev_activ_shape, 1, prev_state_shape, 1,
+ output_state_shape, 1, output_activ_shape, 1);
+ const int width = MatchingDim(input_shape, 2, prev_activ_shape, 2, prev_state_shape, 2,
+ output_state_shape, 2, output_activ_shape, 2);
+ const int input_depth = input_shape.Dims(3);
+ const int prev_activ_depth = prev_activ_shape.Dims(3);
+ const int total_input_depth = prev_activ_depth + input_depth;
+ TFLITE_DCHECK_EQ(weights_shape.Dims(weights_dim_count - 1), total_input_depth);
+ TFLITE_DCHECK_EQ(FlatSizeSkipDim(bias_shape, 3), 1);
+ const int intern_activ_depth = MatchingDim(weights_shape, weights_dim_count - 2, bias_shape, 3);
+ TFLITE_DCHECK_EQ(weights_shape.FlatSize(), intern_activ_depth * total_input_depth);
+ TFLITE_DCHECK_EQ(intern_activ_depth % 4, 0);
+ const int output_depth = MatchingDim(prev_state_shape, 3, prev_activ_shape, 3, output_state_shape,
+ 3, output_activ_shape, 3);
+ TFLITE_DCHECK_EQ(output_depth, intern_activ_depth / 4);
+
+ // Concatenate prev_activ and input data together
+ std::vector<float const *> concat_input_arrays_data;
+ std::vector<RuntimeShape const *> concat_input_arrays_shapes;
+ concat_input_arrays_data.push_back(input_data);
+ concat_input_arrays_data.push_back(prev_activ_data);
+ concat_input_arrays_shapes.push_back(&input_shape);
+ concat_input_arrays_shapes.push_back(&prev_activ_shape);
+ tflite::ConcatenationParams concat_params;
+ concat_params.axis = 3;
+ concat_params.inputs_count = concat_input_arrays_data.size();
+ Concatenation(concat_params, &(concat_input_arrays_shapes[0]), &(concat_input_arrays_data[0]),
+ concat_temp_shape, concat_temp_data);
+
+ // Fully connected
+ tflite::FullyConnectedParams fc_params;
+ fc_params.float_activation_min = std::numeric_limits<float>::lowest();
+ fc_params.float_activation_max = std::numeric_limits<float>::max();
+ FullyConnected(fc_params, concat_temp_shape, concat_temp_data, weights_shape, weights_data,
+ bias_shape, bias_data, activ_temp_shape, activ_temp_data);
+
+ // Memory state update (the LSTM "guts")
+ for (int b = 0; b < batches; ++b)
+ {
+ for (int w = 0; w < width; ++w)
+ {
+ for (int h = 0; h < height; ++h)
+ {
+ for (int c = 0; c < output_depth; ++c)
+ {
+ const float input_gate =
+ 1.f /
+ (1.f +
+ std::exp(-activ_temp_data[Offset(activ_temp_shape, b, h, w, 0 * output_depth + c)]));
+ const float new_input =
+ std::tanh(activ_temp_data[Offset(activ_temp_shape, b, h, w, 1 * output_depth + c)]);
+ const float forget_gate =
+ 1.f /
+ (1.f +
+ std::exp(-activ_temp_data[Offset(activ_temp_shape, b, h, w, 2 * output_depth + c)]));
+ const float output_gate =
+ 1.f /
+ (1.f +
+ std::exp(-activ_temp_data[Offset(activ_temp_shape, b, h, w, 3 * output_depth + c)]));
+ const float new_state =
+ input_gate * new_input +
+ forget_gate * prev_state_data[Offset(prev_state_shape, b, h, w, c)];
+ output_state_data[Offset(output_state_shape, b, h, w, c)] = new_state;
+ output_activ_data[Offset(output_activ_shape, b, h, w, c)] =
+ output_gate * std::tanh(new_state);
+ }
+ }
+ }
+ }
+}
+
+// Quantized LSTM cell implementation.
+// The quantization of the input, output arrays is as follows:
+// - The input activations are quantized as uint8 on the interval
+// [-1, 127/128].
+// The rationale for that is that this is the natural interval for output
+// activations (see next point) and these need to be concatenated together.
+// We could accommodate different ranges by re-scaling, but we empirically
+// found that setting the input activations range to be [-1, 127/128] in the
+// first place, removing the need for re-scaling, greatly improves accuracy.
+// - The output activations are quantized as uint8 on the interval
+// [-1, 127/128].
+// The rationale for that is that the definition of an LSTM cell makes them
+// intrinsically constrained in [-1, 1]; tweaking that to [-1, 127/128]
+// makes for simpler, more accurate fixed-point arithmetic.
+// - The output-at-previous-timestep state array is obviously quantized as
+// the output activations.
+// - The internal LSTM memory (not the output-at-previous-timestep, the other
+// internal state array) is int16-quantized and may use any power-of-two,
+// symmetric range i.e. [-2^N, 2^N * 32767/32768] for any N, which we call
+// StateIntegerBits below, see the below discussion of that template
+// parameter ("The StateIntegerBits template parameter").
+// - The output of the internal fully-connected node is int16-quantized
+// on the interval [-8, 8 * 32767/32768], the rationale for which is
+// explained just below ("Why [-8, 8] for fully-connected output?").
+//
+//
+// === The StateIntegerBits template parameter ===
+//
+// The StateIntegerBits template parameter controls the fixed-point format used
+// to represent the internal memory of the LSTM cell (not the
+// output-at-previous-timestep, the other internal state array). It's currently
+// a template parameter so that the model can control that. The most typical
+// value for StateIntegerBits is 4. Other plausible values are anywhere between
+// 3 and 5. We might eventually standardize on a single supported value, e.g. 4,
+// and drop that template parameter. The reason why it can't be a runtime
+// parameter is that this controls the fixed-point format used, i.e. we need to
+// generate actually different code based on it. In particular, we generate code
+// for a fixed-point tanh() implementation for that format, which internally
+// uses a fixed-point exp() implementation, which internally uses a
+// barrel-shifter with a number of steps that depends on StateIntegerBits.
+// Another consequence of that is that a higher value of StateIntegerBits
+// results in a more expensive implementation (more barrel shifter steps
+// needed).
+//
+//
+// === Why [-8, 8] for fully-connected output? ===
+//
+// This array is only fed to Logistic and Tanh functions, for which
+// the quantized implementation will want to use fixed-point arithmetic,
+// requiring a power-of-two representation interval. Thus, we should right
+// away quantize this array to a power-of-two interval; otherwise,
+// implementation will need to rescale that, losing any benefit that a tighter
+// representation interval might otherwise yield, while introducing some
+// numerical error and computational overhead.
+//
+// Now, Logistic and Tanh
+// are nearly constant (nearly equal to their horizontal asymptotes)
+// outside of a small bounded interval around 0:
+//
+// Logistic(4) = 1 - 1.8e-2 Tanh(4) = 1 - 6.7e-4
+// Logistic(8) = 1 - 3.4e-4 Tanh(8) = 1 - 2.3e-7
+// Logistic(16) = 1 - 1.1e-7 Tanh(16) = 1 - 2.5e-14
+//
+// From this, we see that clamping to [-4, 4] would be too inaccurate
+// (the error of 1.8e-2 on Logistic would be felt even in 8bit precision)
+// while clamping to [-16, 16] would make no difference even in float32.
+// However, for a fixed-point implementation in 16-bit integers, using 5
+// integer bits to represent the [-16, 16] range would leave only 11
+// fractional bits, giving an increment of 2^-11 = 4.9e-4 between consecutive
+// representable values. Notice that this is higher than the
+// worst-case clamping error with clamping to [-8, 8]: 3.4e-4 for Logistic.
+// Using [-8, 8] thus seems like the better compromise overall, enjoying
+// an increment of 2.4e-4 between representable values and a worst-case
+// clamping error of 3.4e-4, both better than the increment of 4.9e-4 with
+// [-16, 16].
+//
+// Moreover, all other things being equal, it is nice to choose the narrower
+// representation range, as that makes the implementation of fixed-point
+// math functions a little cheaper (each integer bit requires an additional
+// barrel-shifter step in the implementation of exp(-x)). That is further
+// reason to prefer [-8, 8] over [-16, 16]. The choice of [-16, 16] would make
+// sense for 32-bit float or 32-bit fixed-point quantization, but we are
+// aiming for 16-bit fixed-point quantization of these internal nodes here.
+//
+template <int StateIntegerBits>
+inline void
+LstmCell(const LstmCellParams &params, const RuntimeShape &unextended_input_shape,
+ const uint8 *input_data_uint8, const RuntimeShape &unextended_prev_activ_shape,
+ const uint8 *prev_activ_data_uint8, const RuntimeShape &weights_shape,
+ const uint8 *weights_data_uint8, const RuntimeShape &unextended_bias_shape,
+ const int32 *bias_data_int32, const RuntimeShape &unextended_prev_state_shape,
+ const int16 *prev_state_data_int16, const RuntimeShape &unextended_output_state_shape,
+ int16 *output_state_data_int16, const RuntimeShape &unextended_output_activ_shape,
+ uint8 *output_activ_data_uint8, const RuntimeShape &unextended_concat_temp_shape,
+ uint8 *concat_temp_data_uint8, const RuntimeShape &unextended_activ_temp_shape,
+ int16 *activ_temp_data_int16, void *gemmlowp_context)
+{
+ (void)gemmlowp_context; // only used in optimized code.
+ int32 weights_zero_point = params.weights_zero_point;
+ int32 accum_multiplier = params.accum_multiplier;
+ int accum_shift = params.accum_shift;
+ TFLITE_DCHECK_LE(unextended_input_shape.DimensionsCount(), 4);
+ TFLITE_DCHECK_LE(unextended_prev_activ_shape.DimensionsCount(), 4);
+ TFLITE_DCHECK_LE(unextended_bias_shape.DimensionsCount(), 4);
+ TFLITE_DCHECK_LE(unextended_prev_state_shape.DimensionsCount(), 4);
+ TFLITE_DCHECK_LE(unextended_output_state_shape.DimensionsCount(), 4);
+ TFLITE_DCHECK_LE(unextended_output_activ_shape.DimensionsCount(), 4);
+ TFLITE_DCHECK_LE(unextended_concat_temp_shape.DimensionsCount(), 4);
+ TFLITE_DCHECK_LE(unextended_activ_temp_shape.DimensionsCount(), 4);
+ const RuntimeShape input_shape = RuntimeShape::ExtendedShape(4, unextended_input_shape);
+ const RuntimeShape prev_activ_shape = RuntimeShape::ExtendedShape(4, unextended_prev_activ_shape);
+ const RuntimeShape bias_shape = RuntimeShape::ExtendedShape(4, unextended_bias_shape);
+ const RuntimeShape prev_state_shape = RuntimeShape::ExtendedShape(4, unextended_prev_state_shape);
+ const RuntimeShape output_state_shape =
+ RuntimeShape::ExtendedShape(4, unextended_output_state_shape);
+ const RuntimeShape output_activ_shape =
+ RuntimeShape::ExtendedShape(4, unextended_output_activ_shape);
+ const RuntimeShape concat_temp_shape =
+ RuntimeShape::ExtendedShape(4, unextended_concat_temp_shape);
+ const RuntimeShape activ_temp_shape = RuntimeShape::ExtendedShape(4, unextended_activ_temp_shape);
+ TFLITE_DCHECK_GE(weights_shape.DimensionsCount(), 2);
+
+ // Gather dimensions information, and perform consistency checks.
+ const int weights_dim_count = weights_shape.DimensionsCount();
+ const int outer_size = MatchingFlatSizeSkipDim(input_shape, 3, prev_activ_shape, prev_state_shape,
+ output_state_shape, output_activ_shape);
+ const int input_depth = input_shape.Dims(3);
+ const int prev_activ_depth = prev_activ_shape.Dims(3);
+ const int total_input_depth = prev_activ_depth + input_depth;
+ TFLITE_DCHECK_EQ(weights_shape.Dims(weights_dim_count - 1), total_input_depth);
+ const int intern_activ_depth = MatchingDim(weights_shape, weights_dim_count - 2, bias_shape, 3);
+ TFLITE_DCHECK_EQ(weights_shape.FlatSize(), intern_activ_depth * total_input_depth);
+ TFLITE_DCHECK_EQ(FlatSizeSkipDim(bias_shape, 3), 1);
+ TFLITE_DCHECK_EQ(intern_activ_depth % 4, 0);
+ const int output_depth = MatchingDim(prev_state_shape, 3, prev_activ_shape, 3, output_state_shape,
+ 3, output_activ_shape, 3);
+ TFLITE_DCHECK_EQ(output_depth, intern_activ_depth / 4);
+ const int fc_batches = FlatSizeSkipDim(activ_temp_shape, 3);
+ const int fc_output_depth =
+ MatchingDim(weights_shape, weights_dim_count - 2, activ_temp_shape, 3);
+ const int fc_accum_depth = total_input_depth;
+ TFLITE_DCHECK_EQ(fc_output_depth, 4 * output_depth);
+
+ // Depth-concatenate prev_activ and input data together.
+ uint8 const *concat_input_arrays_data[2] = {input_data_uint8, prev_activ_data_uint8};
+ const RuntimeShape *concat_input_arrays_shapes[2] = {&input_shape, &prev_activ_shape};
+ tflite::ConcatenationParams concat_params;
+ concat_params.axis = 3;
+ concat_params.inputs_count = 2;
+ Concatenation(concat_params, concat_input_arrays_shapes, concat_input_arrays_data,
+ concat_temp_shape, concat_temp_data_uint8);
+
+ // Implementation of the fully connected node inside the LSTM cell.
+ // The operands are 8-bit integers, the accumulators are internally 32bit
+ // integers, and the output is 16-bit fixed-point with 3 integer bits so
+ // the output range is [-2^3, 2^3] == [-8, 8]. The rationale for that
+ // is explained in the function comment above.
+ for (int b = 0; b < fc_batches; ++b)
+ {
+ for (int out_c = 0; out_c < fc_output_depth; ++out_c)
+ {
+ // Internal accumulation.
+ // Initialize accumulator with the bias-value.
+ int32 accum = bias_data_int32[out_c];
+ // Accumulation loop.
+ for (int d = 0; d < fc_accum_depth; ++d)
+ {
+ int16 input_val = concat_temp_data_uint8[b * fc_accum_depth + d] - 128;
+ int16 weights_val = weights_data_uint8[out_c * fc_accum_depth + d] - weights_zero_point;
+ accum += input_val * weights_val;
+ }
+ // Down-scale the final int32 accumulator to the scale used by our
+ // (16-bit, using 3 integer bits) fixed-point format. The quantized
+ // multiplier and shift here have been pre-computed offline
+ // (e.g. by toco).
+ accum = MultiplyByQuantizedMultiplier(accum, accum_multiplier, accum_shift);
+ // Saturate, cast to int16, and store to the temporary activations array.
+ accum = std::max(-32768, std::min(32767, static_cast<int>(accum)));
+ activ_temp_data_int16[out_c + fc_output_depth * b] = accum;
+ }
+ }
+
+ // Rest of the LSTM cell: tanh and logistic math functions, and some adds
+ // and muls, all done in 16-bit fixed-point.
+ for (int b = 0; b < outer_size; ++b)
+ {
+ for (int c = 0; c < output_depth; ++c)
+ {
+ // Define the fixed-point data types that we will use here. All use
+ // int16 as the underlying integer type i.e. all are 16-bit fixed-point.
+ // They only differ by the number of integral vs. fractional bits,
+ // determining the range of values that they can represent.
+ //
+ // F0 uses 0 integer bits, range [-1, 1].
+ // This is the return type of math functions such as tanh, logistic,
+ // whose range is in [-1, 1].
+ using F0 = gemmlowp::FixedPoint<std::int16_t, 0>;
+ // F3 uses 3 integer bits, range [-8, 8].
+ // This is the range of the previous fully-connected node's output,
+ // which is our input here.
+ using F3 = gemmlowp::FixedPoint<std::int16_t, 3>;
+ // FS uses StateIntegerBits integer bits, range [-2^StateIntegerBits,
+ // 2^StateIntegerBits]. It's used to represent the internal state, whose
+ // number of integer bits is currently dictated by the model. See comment
+ // on the StateIntegerBits template parameter above.
+ using FS = gemmlowp::FixedPoint<std::int16_t, StateIntegerBits>;
+ // Implementation of input gate, using fixed-point logistic function.
+ F3 input_gate_input =
+ F3::FromRaw(activ_temp_data_int16[b * fc_output_depth + 0 * output_depth + c]);
+ F0 input_gate_output = gemmlowp::logistic(input_gate_input);
+ // Implementation of input modulation gate, using fixed-point tanh
+ // function.
+ F3 input_modulation_gate_input =
+ F3::FromRaw(activ_temp_data_int16[b * fc_output_depth + 1 * output_depth + c]);
+ F0 input_modulation_gate_output = gemmlowp::tanh(input_modulation_gate_input);
+ // Implementation of forget gate, using fixed-point logistic function.
+ F3 forget_gate_input =
+ F3::FromRaw(activ_temp_data_int16[b * fc_output_depth + 2 * output_depth + c]);
+ F0 forget_gate_output = gemmlowp::logistic(forget_gate_input);
+ // Implementation of output gate, using fixed-point logistic function.
+ F3 output_gate_input =
+ F3::FromRaw(activ_temp_data_int16[b * fc_output_depth + 3 * output_depth + c]);
+ F0 output_gate_output = gemmlowp::logistic(output_gate_input);
+ // Implementation of internal multiplication nodes, still in fixed-point.
+ F0 input_times_input_modulation = input_gate_output * input_modulation_gate_output;
+ FS prev_state = FS::FromRaw(prev_state_data_int16[b * output_depth + c]);
+ FS prev_state_times_forget_state = forget_gate_output * prev_state;
+ // Implementation of internal addition node, saturating.
+ FS new_state =
+ gemmlowp::SaturatingAdd(gemmlowp::Rescale<StateIntegerBits>(input_times_input_modulation),
+ prev_state_times_forget_state);
+ // Implementation of last internal Tanh node, still in fixed-point.
+ // Since a Tanh fixed-point implementation is specialized for a given
+      // number of integer bits, and each specialization can have a substantial
+ // code size, and we already used above a Tanh on an input with 3 integer
+ // bits, and per the table in the above function comment there is no
+ // significant accuracy to be lost by clamping to [-8, +8] for a
+ // 3-integer-bits representation, let us just do that. This helps people
+ // porting this to targets where code footprint must be minimized.
+ F3 new_state_f3 = gemmlowp::Rescale<3>(new_state);
+ F0 output_activ_int16 = output_gate_output * gemmlowp::tanh(new_state_f3);
+ // Store the new internal state back to memory, as 16-bit integers.
+ // Note: here we store the original value with StateIntegerBits, not
+ // the rescaled 3-integer-bits value fed to tanh.
+ output_state_data_int16[b * output_depth + c] = new_state.raw();
+ // Down-scale the output activations to 8-bit integers, saturating,
+ // and store back to memory.
+ int16 rescaled_output_activ = gemmlowp::RoundingDivideByPOT(output_activ_int16.raw(), 8);
+ int16 clamped_output_activ =
+ std::max<int16>(-128, std::min<int16>(127, rescaled_output_activ));
+ output_activ_data_uint8[b * output_depth + c] = 128 + clamped_output_activ;
+ }
+ }
+}
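// Illustrative aside (not part of the original header): the arithmetic behind the
// [-8, 8] choice discussed in the comment above, spelled out. A 16-bit fixed-point
// format with F fractional bits advances in steps of 2^-F.
constexpr double kStepRange16 = 1.0 / (1 << 11); // [-16, 16] -> 11 fractional bits, ~4.9e-4
constexpr double kStepRange8 = 1.0 / (1 << 12);  // [-8, 8]   -> 12 fractional bits, ~2.4e-4
constexpr double kLogisticClampErrorAt8 = 3.4e-4; // 1 - Logistic(8), from the table above
// The [-8, 8] step stays below the worst-case clamping error, while the [-16, 16]
// step exceeds it, so the wider range buys no additional accuracy.
static_assert(kStepRange8 < kLogisticClampErrorAt8, "");
static_assert(kStepRange16 > kLogisticClampErrorAt8, "");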
+
+template <typename Scalar>
+void Split(const SplitParams &params, const RuntimeShape &input_shape, const Scalar *input_data,
+ const RuntimeShape *const *output_shapes, Scalar *const *output_data)
+{
+ ruy::profiler::ScopeLabel label("Split");
+ const int split_dimensions = input_shape.DimensionsCount();
+ int axis = params.axis < 0 ? params.axis + split_dimensions : params.axis;
+ int outputs_count = params.num_split;
+ TFLITE_DCHECK_LT(axis, split_dimensions);
+
+ int64_t split_size = 0;
+ for (int i = 0; i < outputs_count; i++)
+ {
+ TFLITE_DCHECK_EQ(output_shapes[i]->DimensionsCount(), split_dimensions);
+ for (int j = 0; j < split_dimensions; j++)
+ {
+ if (j != axis)
+ {
+ MatchingDim(*output_shapes[i], j, input_shape, j);
+ }
+ }
+ split_size += output_shapes[i]->Dims(axis);
+ }
+ TFLITE_DCHECK_EQ(split_size, input_shape.Dims(axis));
+ int64_t outer_size = 1;
+ for (int i = 0; i < axis; ++i)
+ {
+ outer_size *= input_shape.Dims(i);
+ }
+ // For all output arrays,
+ // FlatSize() = outer_size * Dims(axis) * base_inner_size;
+ int64_t base_inner_size = 1;
+ for (int i = axis + 1; i < split_dimensions; ++i)
+ {
+ base_inner_size *= input_shape.Dims(i);
+ }
+
+ const Scalar *input_ptr = input_data;
+ for (int k = 0; k < outer_size; k++)
+ {
+ for (int i = 0; i < outputs_count; ++i)
+ {
+ const int copy_size = output_shapes[i]->Dims(axis) * base_inner_size;
+ memcpy(output_data[i] + k * copy_size, input_ptr, copy_size * sizeof(Scalar));
+ input_ptr += copy_size;
+ }
+ }
+}
+
+inline int NodeOffset(int b, int h, int w, int height, int width)
+{
+ return (b * height + h) * width + w;
+}
+
+inline void LocalResponseNormalization(const tflite::LocalResponseNormalizationParams &op_params,
+ const RuntimeShape &input_shape, const float *input_data,
+ const RuntimeShape &output_shape, float *output_data)
+{
+ const int trailing_dim = input_shape.DimensionsCount() - 1;
+ const int outer_size = MatchingFlatSizeSkipDim(input_shape, trailing_dim, output_shape);
+ const int depth = MatchingDim(input_shape, trailing_dim, output_shape, trailing_dim);
+
+ for (int i = 0; i < outer_size; ++i)
+ {
+ for (int c = 0; c < depth; ++c)
+ {
+ const int begin_input_c = std::max(0, static_cast<int>(c - op_params.range));
+ const int end_input_c = std::min(depth, static_cast<int>(c + op_params.range));
+ float accum = 0.f;
+ for (int input_c = begin_input_c; input_c < end_input_c; ++input_c)
+ {
+ const float input_val = input_data[i * depth + input_c];
+ accum += input_val * input_val;
+ }
+ const float multiplier = std::pow(op_params.bias + op_params.alpha * accum, -op_params.beta);
+ output_data[i * depth + c] = input_data[i * depth + c] * multiplier;
+ }
+ }
+}
+
+inline void Dequantize(const RuntimeShape &input_shape, const Eigen::half *input_data,
+ const RuntimeShape &output_shape, float *output_data)
+{
+ const int flat_size = MatchingFlatSize(input_shape, output_shape);
+ for (int i = 0; i < flat_size; i++)
+ {
+ output_data[i] = static_cast<float>(input_data[i]);
+ }
+}
+
+inline void FakeQuant(const tflite::FakeQuantParams &op_params, const RuntimeShape &input_shape,
+ const float *input_data, const RuntimeShape &output_shape, float *output_data)
+{
+ ruy::profiler::ScopeLabel label("FakeQuant");
+ float rmin = op_params.minmax.min;
+ float rmax = op_params.minmax.max;
+ int num_bits = op_params.num_bits;
+ // 0 should always be a representable value. Let's assume that the initial
+ // min,max range contains 0.
+ TFLITE_DCHECK_LE(rmin, 0.0f);
+ TFLITE_DCHECK_GE(rmax, 0.0f);
+ TFLITE_DCHECK_LT(rmin, rmax);
+
+ // Code matches tensorflow's FakeQuantWithMinMaxArgsFunctor.
+ int quant_min = 0;
+ int quant_max = (1 << num_bits) - 1;
+ float nudged_min, nudged_max, nudged_scale;
+ NudgeQuantizationRange(rmin, rmax, quant_min, quant_max, &nudged_min, &nudged_max, &nudged_scale);
+ const int flat_size = MatchingFlatSize(input_shape, output_shape);
+ FakeQuantizeArray(nudged_scale, nudged_min, nudged_max, input_data, output_data, flat_size);
+}
+
+// Common subroutine for both `GatherNd` and `GatherNdString`.
+struct GatherNdHelperResult
+{
+ int n_slices;
+ int slice_size;
+ int indices_nd;
+ std::vector<int> dims_to_count;
+};
+
+// Returns common values being used on both `GatherNd` and `GatherNdString`.
+inline GatherNdHelperResult GatherNdHelper(const RuntimeShape &params_shape,
+ const RuntimeShape &indices_shape)
+{
+ GatherNdHelperResult ret;
+ ret.n_slices = 1;
+ ret.slice_size = 1;
+ const int indices_dims = indices_shape.DimensionsCount();
+ ret.indices_nd = indices_shape.Dims(indices_dims - 1);
+ const int params_dims = params_shape.DimensionsCount();
+ for (int i = 0; i < indices_dims - 1; ++i)
+ {
+ ret.n_slices *= indices_shape.Dims(i);
+ }
+ for (int i = ret.indices_nd; i < params_dims; ++i)
+ {
+ ret.slice_size *= params_shape.Dims(i);
+ }
+
+ int remain_flat_size = params_shape.FlatSize();
+ ret.dims_to_count = std::vector<int>(ret.indices_nd, 0);
+ for (int i = 0; i < ret.indices_nd; ++i)
+ {
+ ret.dims_to_count[i] = remain_flat_size / params_shape.Dims(i);
+ remain_flat_size = ret.dims_to_count[i];
+ }
+
+ return ret;
+}
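// Worked example of the stride computation above (illustrative only): for
// params_shape = [4, 3, 2] and indices_nd = 2, FlatSize() = 24, so
// dims_to_count = {24 / 4, 6 / 3} = {6, 2}; an index pair (i, j) then selects the
// slice starting at flat offset i * 6 + j * 2, with slice_size = 2.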
+
+template <typename ParamsT, typename IndicesT = int32>
+inline void GatherNd(const RuntimeShape &params_shape, const ParamsT *params_data,
+ const RuntimeShape &indices_shape, const IndicesT *indices_data,
+ const RuntimeShape &output_shape, ParamsT *output_data)
+{
+ ruy::profiler::ScopeLabel label("GatherNd");
+
+ const GatherNdHelperResult res = GatherNdHelper(params_shape, indices_shape);
+ for (int i = 0; i < res.n_slices; ++i)
+ {
+ int from_pos = 0;
+ for (int j = 0; j < res.indices_nd; ++j)
+ {
+ from_pos += indices_data[i * res.indices_nd + j] * res.dims_to_count[j];
+ }
+ std::memcpy(output_data + i * res.slice_size, params_data + from_pos,
+ sizeof(ParamsT) * res.slice_size);
+ }
+}
+
+#ifndef TF_LITE_STATIC_MEMORY
+template <typename IndicesT = int32>
+inline void GatherNdString(const RuntimeShape &params_shape, const TfLiteTensor *params_data,
+ const RuntimeShape &indices_shape, const IndicesT *indices_data,
+ const RuntimeShape &output_shape, TfLiteTensor *output_data)
+{
+ ruy::profiler::ScopeLabel label("GatherNdString");
+
+ const GatherNdHelperResult res = GatherNdHelper(params_shape, indices_shape);
+ DynamicBuffer buffer;
+ for (int i = 0; i < res.n_slices; ++i)
+ {
+ int from_pos = 0;
+ for (int j = 0; j < res.indices_nd; ++j)
+ {
+ from_pos += indices_data[i * res.indices_nd + j] * res.dims_to_count[j];
+ }
+ for (int j = 0; j < res.slice_size; ++j)
+ {
+ buffer.AddString(GetString(params_data, from_pos + j));
+ }
+ }
+ buffer.WriteToTensor(output_data, /*new_shape=*/nullptr);
+}
+#endif
+
+template <typename IndicesT, typename UpdatesT>
+inline void ScatterNd(const RuntimeShape &indices_shape, const IndicesT *indices_data,
+ const RuntimeShape &updates_shape, const UpdatesT *updates_data,
+ const RuntimeShape &output_shape, UpdatesT *output_data)
+{
+ ruy::profiler::ScopeLabel label("ScatterNd");
+
+ int n_slices = 1;
+ int slice_size = 1;
+ const int outer_dims = indices_shape.DimensionsCount() - 1;
+ const int indices_nd = indices_shape.Dims(outer_dims);
+ const int updates_dims = updates_shape.DimensionsCount();
+ for (int i = 0; i < outer_dims; ++i)
+ {
+ n_slices *= indices_shape.Dims(i);
+ }
+ for (int i = outer_dims; i < updates_dims; ++i)
+ {
+ slice_size *= updates_shape.Dims(i);
+ }
+
+ int output_flat_size = output_shape.FlatSize();
+ int remain_flat_size = output_flat_size;
+ std::vector<int> dims_to_count(indices_nd, 0);
+ for (int i = 0; i < indices_nd; ++i)
+ {
+ dims_to_count[i] = remain_flat_size / output_shape.Dims(i);
+ remain_flat_size = dims_to_count[i];
+ }
+
+ memset(output_data, 0, sizeof(UpdatesT) * output_flat_size);
+ for (int i = 0; i < n_slices; ++i)
+ {
+ int to_pos = 0;
+ for (int j = 0; j < indices_nd; ++j)
+ {
+ IndicesT idx = indices_data[i * indices_nd + j];
+ TFLITE_DCHECK(0 <= idx && idx < output_shape.Dims(j));
+ to_pos += idx * dims_to_count[j];
+ }
+ for (int j = 0; j < slice_size; j++)
+ {
+ output_data[to_pos + j] += updates_data[i * slice_size + j];
+ }
+ }
+}
+
+template <typename T>
+inline void Slice(const tflite::SliceParams &op_params, const RuntimeShape &input_shape,
+ const RuntimeShape &output_shape, SequentialTensorWriter<T> *writer)
+{
+ const RuntimeShape ext_shape = RuntimeShape::ExtendedShape(5, input_shape);
+ TFLITE_DCHECK_LE(op_params.begin_count, 5);
+ TFLITE_DCHECK_LE(op_params.size_count, 5);
+ const int begin_count = op_params.begin_count;
+ const int size_count = op_params.size_count;
+ // We front-pad the begin and size vectors.
+ std::array<int, 5> start;
+ std::array<int, 5> stop;
+ for (int i = 0; i < 5; ++i)
+ {
+ int padded_i = 5 - i;
+ start[i] = begin_count < padded_i ? 0 : op_params.begin[begin_count - padded_i];
+ stop[i] = (size_count < padded_i || op_params.size[size_count - padded_i] == -1)
+ ? ext_shape.Dims(i)
+ : start[i] + op_params.size[size_count - padded_i];
+ }
+
+ for (int i0 = start[0]; i0 < stop[0]; ++i0)
+ {
+ for (int i1 = start[1]; i1 < stop[1]; ++i1)
+ {
+ for (int i2 = start[2]; i2 < stop[2]; ++i2)
+ {
+ for (int i3 = start[3]; i3 < stop[3]; ++i3)
+ {
+ for (int i4 = start[4]; i4 < stop[4]; ++i4)
+ {
+ writer->Write(Offset(ext_shape, i0, i1, i2, i3, i4));
+ }
+ }
+ }
+ }
+ }
+}
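// Worked example of the front-padding above (illustrative only): for a rank-3 input
// of shape [4, 5, 6] with begin = {1, 0, 2} and size = {2, -1, 3}, the shape is
// extended to [1, 1, 4, 5, 6], start becomes {0, 0, 1, 0, 2} and stop becomes
// {1, 1, 3, 5, 5}: a size of -1 keeps the whole dimension, and the two padded
// leading dimensions are traversed in full.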
+
+template <typename T>
+inline void Slice(const tflite::SliceParams &op_params, const RuntimeShape &input_shape,
+ const T *input_data, const RuntimeShape &output_shape, T *output_data)
+{
+ SequentialTensorWriter<T> writer(input_data, output_data);
+ return Slice(op_params, input_shape, output_shape, &writer);
+}
+
+template <typename T>
+inline void Slice(const tflite::SliceParams &op_params, const RuntimeShape &input_shape,
+ const TfLiteTensor *input, const RuntimeShape &output_shape, TfLiteTensor *output)
+{
+ SequentialTensorWriter<T> writer(input, output);
+ return Slice(op_params, input_shape, output_shape, &writer);
+}
+
+template <typename T>
+void Minimum(const RuntimeShape &input1_shape, const T *input1_data, const T *input2_data,
+ const RuntimeShape &output_shape, T *output_data)
+{
+ const int flat_size = MatchingFlatSize(input1_shape, output_shape);
+
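+ // Note: the second input is treated as a scalar here; only input2_data[0]
+ // is read.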
+ auto min_value = input2_data[0];
+ for (int i = 0; i < flat_size; i++)
+ {
+ output_data[i] = input1_data[i] > min_value ? min_value : input1_data[i];
+ }
+}
+
+// Convenience version that allows, for example, generated-code calls to be
+// the same as other binary ops.
+template <typename T>
+inline void Minimum(const RuntimeShape &input1_shape, const T *input1_data, const RuntimeShape &,
+ const T *input2_data, const RuntimeShape &output_shape, T *output_data)
+{
+ // Drop shape of second input: not needed.
+ Minimum(input1_shape, input1_data, input2_data, output_shape, output_data);
+}
+
+template <typename T>
+void Maximum(const RuntimeShape &input1_shape, const T *input1_data, const T *input2_data,
+ const RuntimeShape &output_shape, T *output_data)
+{
+ const int flat_size = MatchingFlatSize(input1_shape, output_shape);
+
+ auto max_value = input2_data[0];
+ for (int i = 0; i < flat_size; i++)
+ {
+ output_data[i] = input1_data[i] < max_value ? max_value : input1_data[i];
+ }
+}
+
+// Convenience version that allows, for example, generated-code calls to be
+// the same as other binary ops.
+template <typename T>
+inline void Maximum(const RuntimeShape &input1_shape, const T *input1_data, const RuntimeShape &,
+ const T *input2_data, const RuntimeShape &output_shape, T *output_data)
+{
+ // Drop shape of second input: not needed.
+ Maximum(input1_shape, input1_data, input2_data, output_shape, output_data);
+}
+
+template <typename T1, typename T2, typename T3>
+void ArgMax(const RuntimeShape &input1_shape, const T1 *input1_data, const T3 *input2_data,
+ const RuntimeShape &output_shape, T2 *output_data)
+{
+ ArgMinMax(input1_shape, input1_data, input2_data, output_shape, output_data, std::greater<T1>());
+}
+
+// Convenience version that allows, for example, generated-code calls to be
+// the same as other binary ops.
+template <typename T1, typename T2, typename T3>
+inline void ArgMax(const RuntimeShape &input1_shape, const T1 *input1_data,
+ const RuntimeShape &input2_shape, const T3 *input2_data,
+ const RuntimeShape &output_shape, T2 *output_data)
+{
+ // Drop shape of second input: not needed.
+ ArgMax(input1_shape, input1_data, input2_data, output_shape, output_data);
+}
+
+template <typename D, typename T>
+void Select(const RuntimeShape &input_condition_shape, const D *input_condition_data,
+ const RuntimeShape &input_x_shape, const T *input_x_data,
+ const RuntimeShape &input_y_shape, const T *input_y_data,
+ const RuntimeShape &output_shape, T *output_data)
+{
+ int64_t flatsize;
+ // Allow the select operator to execute on a mix of scalar and one-element
+ // tensors.
+ if (input_condition_shape.FlatSize() == 1 && input_x_shape.FlatSize() == 1 &&
+ input_y_shape.FlatSize() == 1 && output_shape.FlatSize() == 1)
+ {
+ flatsize = 1;
+ }
+ else
+ {
+ flatsize = MatchingFlatSize(input_condition_shape, input_x_shape, input_y_shape, output_shape);
+ }
+ for (int64_t i = 0; i < flatsize; ++i)
+ {
+ output_data[i] = input_condition_data[i] ? input_x_data[i] : input_y_data[i];
+ }
+}
+
+template <typename D, typename T>
+void RankOneSelect(const RuntimeShape &input_condition_shape, const D *input_condition_data,
+ const RuntimeShape &input_x_shape, const T *input_x_data,
+ const RuntimeShape &input_y_shape, const T *input_y_data,
+ const RuntimeShape &output_shape, T *output_data)
+{
+ const int64_t outer_size = input_condition_shape.FlatSize();
+ int64_t inner_size;
+ if (input_condition_shape.DimensionsCount() == 0)
+ {
+ inner_size = MatchingFlatSize(input_x_shape, input_y_shape, output_shape);
+ }
+ else
+ {
+ TFLITE_DCHECK_EQ(MatchingDim(input_x_shape, 0, input_y_shape, 0, output_shape, 0), outer_size);
+ inner_size = MatchingFlatSizeSkipDim(input_x_shape, 0, input_y_shape, output_shape);
+ }
+
+ int64_t offset = 0;
+ for (int64_t i = 0; i < outer_size; i++)
+ {
+ const T *input_data = input_condition_data[i] ? input_x_data : input_y_data;
+ memcpy(output_data + offset, input_data + offset, inner_size * sizeof(T));
+ offset += inner_size;
+ }
+}
+
+template <typename D, typename T>
+void BroadcastSelect4DSlow(const RuntimeShape &input_condition_shape, const D *input_condition_data,
+ const RuntimeShape &input_x_shape, const T *input_x_data,
+ const RuntimeShape &input_y_shape, const T *input_y_data,
+ const RuntimeShape &output_shape, T *output_data)
+{
+ TFLITE_DCHECK_LE(input_condition_shape.DimensionsCount(), 4);
+ TFLITE_DCHECK_LE(input_x_shape.DimensionsCount(), 4);
+ TFLITE_DCHECK_LE(input_y_shape.DimensionsCount(), 4);
+ TFLITE_DCHECK_LE(output_shape.DimensionsCount(), 4);
+
+ const RuntimeShape extended_output_shape = RuntimeShape::ExtendedShape(4, output_shape);
+
+ NdArrayDesc<4> desc_condition;
+ NdArrayDesc<4> desc_x;
+ NdArrayDesc<4> desc_y;
+ NdArrayDescsForElementwiseBroadcast(input_condition_shape, input_x_shape, input_y_shape,
+ &desc_condition, &desc_x, &desc_y);
+
+ // In TensorFlow, the dimensions are canonically named (batch_number, row,
+ // col, channel), with extents (batches, height, width, depth), with the
+ // trailing dimension changing most rapidly (channels has the smallest
+ // stride, typically 1 element).
+ //
+ // In generated C code, we store arrays with the dimensions reversed. The
+ // first dimension has smallest stride.
+ //
+ // We name our variables by their TensorFlow convention, but generate C code
+ // nesting loops such that the innermost loop has the smallest stride for
+ // the best cache behavior.
+ for (int b = 0; b < extended_output_shape.Dims(0); ++b)
+ {
+ for (int y = 0; y < extended_output_shape.Dims(1); ++y)
+ {
+ for (int x = 0; x < extended_output_shape.Dims(2); ++x)
+ {
+ for (int c = 0; c < extended_output_shape.Dims(3); ++c)
+ {
+ const int condition_index = SubscriptToIndex(desc_condition, b, y, x, c);
+ const int x_index = SubscriptToIndex(desc_x, b, y, x, c);
+ const int y_index = SubscriptToIndex(desc_y, b, y, x, c);
+ output_data[Offset(extended_output_shape, b, y, x, c)] =
+ input_condition_data[condition_index] ? input_x_data[x_index] : input_y_data[y_index];
+ }
+ }
+ }
+ }
+}
+
+template <typename D, typename T>
+void SelectTrueCoords(const RuntimeShape &input_condition_shape, const D *input_condition_data,
+ T *output_data)
+{
+ const size_t size = input_condition_shape.FlatSize();
+ if (size == 0)
+ {
+ // The flat size is zero, so there is nothing to output.
+ return;
+ }
+ const size_t cond_rank = input_condition_shape.DimensionsCount();
+
+ std::vector<int> dims_to_count(cond_rank, 0);
+ int cur_flat_size = size;
+ for (int i = 0; i < cond_rank; ++i)
+ {
+ dims_to_count[i] = cur_flat_size / input_condition_shape.Dims(i);
+ cur_flat_size = dims_to_count[i];
+ }
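+ // E.g. (hypothetical): for a condition shape of [2, 3], dims_to_count == {3, 1};
+ // a true element at flat index 4 is emitted as the coordinate (1, 1) below.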
+
+ int output_index = 0;
+ for (int i = 0; i < size; ++i)
+ {
+ if (input_condition_data[i])
+ {
+ // Insert the coordinate of the current item (row major) into output.
+ int flat_index = i;
+ for (int j = 0; j < cond_rank; ++j)
+ {
+ int coord_j = flat_index / dims_to_count[j];
+ output_data[output_index * cond_rank + j] = coord_j;
+ flat_index %= dims_to_count[j];
+ }
+ output_index++;
+ }
+ }
+}
+
+// For ease of implementation, the indices are always a vector of size-4 vectors.
+template <typename T, typename TI>
+inline void SparseToDense(const std::vector<std::vector<TI>> &indices, const T *values,
+ T default_value, bool value_is_scalar,
+ const RuntimeShape &unextended_output_shape, T *output_data)
+{
+ TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), 4);
+ const RuntimeShape output_shape = RuntimeShape::ExtendedShape(4, unextended_output_shape);
+ const int value_count = indices.size();
+
+ // First fill the output_data with default value.
+ const int num_elements = output_shape.FlatSize();
+ for (int i = 0; i < num_elements; ++i)
+ {
+ output_data[i] = default_value;
+ }
+
+ // Special-case the scalar-value path to avoid checking the boolean
+ // condition inside the loop on every iteration.
+ if (value_is_scalar)
+ {
+ for (int i = 0; i < value_count; ++i)
+ {
+ const std::vector<TI> &index = indices[i];
+ TFLITE_DCHECK_EQ(index.size(), 4);
+ const T value = *values; // just use the first value.
+ output_data[Offset(output_shape, index[0], index[1], index[2], index[3])] = value;
+ }
+ return;
+ }
+
+ // Go through the values and indices to fill the sparse values.
+ for (int i = 0; i < value_count; ++i)
+ {
+ const std::vector<TI> &index = indices[i];
+ TFLITE_DCHECK_EQ(index.size(), 4);
+ const T value = values[i];
+ output_data[Offset(output_shape, index[0], index[1], index[2], index[3])] = value;
+ }
+}
+
+template <typename T>
+inline void Pow(const RuntimeShape &input1_shape, const T *input1_data,
+ const RuntimeShape &input2_shape, const T *input2_data,
+ const RuntimeShape &output_shape, T *output_data)
+{
+ const int flat_size = MatchingFlatSize(input1_shape, input2_shape, output_shape);
+ for (int i = 0; i < flat_size; ++i)
+ {
+ output_data[i] = std::pow(input1_data[i], input2_data[i]);
+ }
+}
+
+template <typename T>
+inline void BroadcastPow4DSlow(const RuntimeShape &unextended_input1_shape, const T *input1_data,
+ const RuntimeShape &unextended_input2_shape, const T *input2_data,
+ const RuntimeShape &unextended_output_shape, T *output_data)
+{
+ TFLITE_DCHECK_LE(unextended_input1_shape.DimensionsCount(), 4);
+ TFLITE_DCHECK_LE(unextended_input2_shape.DimensionsCount(), 4);
+ TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), 4);
+ const RuntimeShape output_shape = RuntimeShape::ExtendedShape(4, unextended_output_shape);
+
+ NdArrayDesc<4> desc1;
+ NdArrayDesc<4> desc2;
+ NdArrayDescsForElementwiseBroadcast(unextended_input1_shape, unextended_input2_shape, &desc1,
+ &desc2);
+
+ for (int b = 0; b < output_shape.Dims(0); ++b)
+ {
+ for (int y = 0; y < output_shape.Dims(1); ++y)
+ {
+ for (int x = 0; x < output_shape.Dims(2); ++x)
+ {
+ for (int c = 0; c < output_shape.Dims(3); ++c)
+ {
+ auto out_idx = Offset(output_shape, b, y, x, c);
+ auto in1_idx = SubscriptToIndex(desc1, b, y, x, c);
+ auto in2_idx = SubscriptToIndex(desc2, b, y, x, c);
+ auto in1_val = input1_data[in1_idx];
+ auto in2_val = input2_data[in2_idx];
+ output_data[out_idx] = std::pow(in1_val, in2_val);
+ }
+ }
+ }
+ }
+}
+
+template <typename Scalar>
+void Reverse(int axis, const RuntimeShape &input_shape, const Scalar *input_data,
+ const RuntimeShape &output_shape, Scalar *output_data)
+{
+ ruy::profiler::ScopeLabel label("Reverse");
+
+ int outer_size = 1;
+ for (int i = 0; i < axis; ++i)
+ {
+ outer_size *= input_shape.Dims(i);
+ }
+
+ int copy_size = 1;
+ for (int i = axis + 1; i < input_shape.DimensionsCount(); ++i)
+ {
+ copy_size *= input_shape.Dims(i);
+ }
+
+ const int dims_at_axis = input_shape.Dims(axis);
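+ // E.g. (hypothetical): for an input of shape [2, 3] and axis == 1, outer_size
+ // is 2, copy_size is 1 and dims_at_axis is 3, so each row [a, b, c] is written
+ // out as [c, b, a].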
+ for (int i = 0; i < outer_size; ++i)
+ {
+ for (int j = 0; j < dims_at_axis; ++j)
+ {
+ const int start_pos = (i * dims_at_axis + j) * copy_size;
+ Scalar *output_ptr = output_data + start_pos;
+ int loc = (i * dims_at_axis + dims_at_axis - j - 1) * copy_size;
+ memcpy(output_ptr, input_data + loc, copy_size * sizeof(Scalar));
+ }
+ }
+}
+
+template <typename Scalar, typename TS>
+void ReverseSequence(const TS *seq_lengths, const int seq_dim, const int batch_dim,
+ const RuntimeShape &input_shape, const Scalar *input_data,
+ const RuntimeShape &output_shape, Scalar *output_data)
+{
+ ruy::profiler::ScopeLabel label("ReverseSequence");
+
+ int outer_size = 1;
+ int outer_dim = std::min(batch_dim, seq_dim);
+ int medium_dim = std::max(batch_dim, seq_dim);
+ for (int i = 0; i < outer_dim; ++i)
+ {
+ outer_size *= input_shape.Dims(i);
+ }
+
+ int medium_size = 1;
+ for (int i = outer_dim + 1; i < medium_dim; ++i)
+ {
+ medium_size *= input_shape.Dims(i);
+ }
+
+ int copy_size = 1;
+ for (int i = medium_dim + 1; i < input_shape.DimensionsCount(); ++i)
+ {
+ copy_size *= input_shape.Dims(i);
+ }
+
+ const int dims_at_outer_dim = input_shape.Dims(outer_dim);
+ const int dims_at_medium_dim = input_shape.Dims(medium_dim);
+
+ Scalar *output_ptr;
+ if (batch_dim > seq_dim)
+ {
+ for (int i = 0; i < outer_size; ++i)
+ {
+ for (int j = 0; j < dims_at_outer_dim; ++j)
+ {
+ const int in_pos_base = (i * dims_at_outer_dim + j) * medium_size;
+ for (int p = 0; p < medium_size; ++p)
+ {
+ for (int q = 0; q < dims_at_medium_dim; ++q)
+ {
+ const int in_pos = ((in_pos_base + p) * dims_at_medium_dim + q) * copy_size;
+ const Scalar *in_ptr = input_data + in_pos;
+ int sl = seq_lengths[q] - 1;
+ if (j > sl)
+ {
+ output_ptr = output_data + in_pos;
+ }
+ else
+ {
+ const int out_pos_base = (i * dims_at_outer_dim + sl - j) * medium_size;
+ const int out_pos = ((out_pos_base + p) * dims_at_medium_dim + q) * copy_size;
+ output_ptr = output_data + out_pos;
+ }
+ memcpy(output_ptr, in_ptr, copy_size * sizeof(Scalar));
+ }
+ }
+ }
+ }
+ }
+ else if (batch_dim < seq_dim)
+ {
+ for (int i = 0; i < outer_size; ++i)
+ {
+ for (int j = 0; j < dims_at_outer_dim; ++j)
+ {
+ const int in_pos_base = (i * dims_at_outer_dim + j) * medium_size;
+ int sl = seq_lengths[j] - 1;
+ const int out_pos_base = (i * dims_at_outer_dim + j) * medium_size;
+ for (int p = 0; p < medium_size; ++p)
+ {
+ for (int q = 0; q < dims_at_medium_dim; ++q)
+ {
+ const int in_pos = ((in_pos_base + p) * dims_at_medium_dim + q) * copy_size;
+ const Scalar *in_ptr = input_data + in_pos;
+ if (q > sl)
+ {
+ output_ptr = output_data + in_pos;
+ }
+ else
+ {
+ const int out_pos = ((out_pos_base + p) * dims_at_medium_dim + sl - q) * copy_size;
+ output_ptr = output_data + out_pos;
+ }
+ memcpy(output_ptr, in_ptr, copy_size * sizeof(Scalar));
+ }
+ }
+ }
+ }
+ }
+}
+
+template <typename T>
+inline void SegmentSum(const RuntimeShape &input_shape, const T *input_data,
+ const RuntimeShape &segment_ids_shape, const int32_t *segment_ids_data,
+ const RuntimeShape &output_shape, T *output_data)
+{
+ const int segment_flat_size = MatchingFlatSizeSkipDim(input_shape, 0, output_shape);
+
+ memset(output_data, 0, sizeof(T) * output_shape.FlatSize());
+
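+ // E.g. (hypothetical): for an input of shape [3, 2] with segment_ids = {0, 0, 1},
+ // output row 0 accumulates input rows 0 and 1, while output row 1 receives
+ // input row 2.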
+ for (int i = 0; i < input_shape.Dims(0); i++)
+ {
+ int output_index = segment_ids_data[i];
+ for (int j = 0; j < segment_flat_size; ++j)
+ {
+ output_data[output_index * segment_flat_size + j] += input_data[i * segment_flat_size + j];
+ }
+ }
+}
+
+} // namespace reference_ops
+} // namespace tflite
+
+#endif // LUCI_INTERPRETER_PAL_REFERENCE_OPS_H
diff --git a/compiler/luci-interpreter/pal/linux/KernelsToBuild.lst b/compiler/luci-interpreter/pal/linux/KernelsToBuild.lst
index 428b15ee0..1e6c41ecc 100644
--- a/compiler/luci-interpreter/pal/linux/KernelsToBuild.lst
+++ b/compiler/luci-interpreter/pal/linux/KernelsToBuild.lst
@@ -13,6 +13,7 @@ REGISTER_KERNEL(Div)
REGISTER_KERNEL(Elu)
REGISTER_KERNEL(Exp)
REGISTER_KERNEL(ExpandDims)
+REGISTER_KERNEL(Fill)
REGISTER_KERNEL(Floor)
REGISTER_KERNEL(FloorDiv)
REGISTER_KERNEL(Equal)
@@ -48,6 +49,7 @@ REGISTER_KERNEL(PadV2)
REGISTER_KERNEL(Pow)
REGISTER_KERNEL(PRelu)
REGISTER_KERNEL(Quantize)
+REGISTER_KERNEL(ReduceMax)
REGISTER_KERNEL(Relu)
REGISTER_KERNEL(Relu6)
REGISTER_KERNEL(Reshape)
@@ -55,6 +57,7 @@ REGISTER_KERNEL(ResizeBilinear)
REGISTER_KERNEL(ResizeNearestNeighbor)
REGISTER_KERNEL(ReverseV2)
REGISTER_KERNEL(Rsqrt)
+REGISTER_KERNEL(Shape)
REGISTER_KERNEL(Slice)
REGISTER_KERNEL(Softmax)
REGISTER_KERNEL(SpaceToBatchND)
diff --git a/compiler/luci-interpreter/pal/linux/PALreference_ops.h b/compiler/luci-interpreter/pal/linux/PALreference_ops.h
new file mode 100644
index 000000000..825ebfe8e
--- /dev/null
+++ b/compiler/luci-interpreter/pal/linux/PALreference_ops.h
@@ -0,0 +1,22 @@
+/*
+ * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_PAL_REFERENCE_OPS_H
+#define LUCI_INTERPRETER_PAL_REFERENCE_OPS_H
+
+#include <tensorflow/lite/kernels/internal/reference/reference_ops.h>
+
+#endif // LUCI_INTERPRETER_PAL_REFERENCE_OPS_H
diff --git a/compiler/luci-interpreter/pal/mcu/KernelsToBuild.lst b/compiler/luci-interpreter/pal/mcu/KernelsToBuild.lst
index d134a6b95..f0df58db3 100644
--- a/compiler/luci-interpreter/pal/mcu/KernelsToBuild.lst
+++ b/compiler/luci-interpreter/pal/mcu/KernelsToBuild.lst
@@ -12,6 +12,7 @@ REGISTER_KERNEL(Div)
REGISTER_KERNEL(Elu)
REGISTER_KERNEL(Exp)
REGISTER_KERNEL(ExpandDims)
+REGISTER_KERNEL(Fill)
REGISTER_KERNEL(Floor)
REGISTER_KERNEL(FloorDiv)
REGISTER_KERNEL(Equal)
@@ -44,6 +45,7 @@ REGISTER_KERNEL(Reshape)
REGISTER_KERNEL(ResizeBilinear)
REGISTER_KERNEL(ResizeNearestNeighbor)
REGISTER_KERNEL(Rsqrt)
+REGISTER_KERNEL(Shape)
REGISTER_KERNEL(Softmax)
REGISTER_KERNEL(SpaceToBatchND)
REGISTER_KERNEL(SpaceToDepth)
diff --git a/compiler/luci-interpreter/pal/mcu/PALDequantize.h b/compiler/luci-interpreter/pal/mcu/PALDequantize.h
index 15ff0327b..efa6b167e 100644
--- a/compiler/luci-interpreter/pal/mcu/PALDequantize.h
+++ b/compiler/luci-interpreter/pal/mcu/PALDequantize.h
@@ -18,7 +18,7 @@
#define LUCI_INTERPRETER_PAL_DEQUANTIZE_H
#include "tensorflow/lite/kernels/internal/reference/integer_ops/dequantize.h"
-#include "tensorflow/lite/kernels/internal/reference/reference_ops.h"
+#include "PALreference_ops.h"
namespace luci_interpreter_pal
{
diff --git a/compiler/luci-interpreter/pal/mcu/PALQuantize.h b/compiler/luci-interpreter/pal/mcu/PALQuantize.h
index 6046789ae..effb85d54 100644
--- a/compiler/luci-interpreter/pal/mcu/PALQuantize.h
+++ b/compiler/luci-interpreter/pal/mcu/PALQuantize.h
@@ -17,7 +17,7 @@
#ifndef LUCI_INTERPRETER_PAL_QUANTIZE_H
#define LUCI_INTERPRETER_PAL_QUANTIZE_H
-#include "tensorflow/lite/kernels/internal/reference/reference_ops.h"
+#include "PALreference_ops.h"
namespace luci_interpreter_pal
{
diff --git a/compiler/luci-interpreter/pal/mcu/PALreference_ops.h b/compiler/luci-interpreter/pal/mcu/PALreference_ops.h
new file mode 100644
index 000000000..62c720937
--- /dev/null
+++ b/compiler/luci-interpreter/pal/mcu/PALreference_ops.h
@@ -0,0 +1,1556 @@
+/*
+ * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_PAL_REFERENCE_OPS_H
+#define LUCI_INTERPRETER_PAL_REFERENCE_OPS_H
+
+#include <stdint.h>
+#include <sys/types.h>
+
+#include <algorithm>
+#include <cmath>
+#include <cstring>
+#include <functional>
+#include <limits>
+#include <memory>
+#include <type_traits>
+
+#include "third_party/eigen3/Eigen/Core"
+#include "fixedpoint/fixedpoint.h"
+#include "ruy/profiler/instrumentation.h" // from @ruy
+#include "tensorflow/lite/c/common.h"
+#include "tensorflow/lite/kernels/internal/common.h"
+#include "tensorflow/lite/kernels/internal/quantization_util.h"
+#include "tensorflow/lite/kernels/internal/reference/add.h"
+#include "tensorflow/lite/kernels/internal/reference/add_n.h"
+#include "tensorflow/lite/kernels/internal/reference/arg_min_max.h"
+#include "tensorflow/lite/kernels/internal/reference/batch_matmul.h"
+#include "tensorflow/lite/kernels/internal/reference/batch_to_space_nd.h"
+#include "tensorflow/lite/kernels/internal/reference/binary_function.h"
+#include "tensorflow/lite/kernels/internal/reference/cast.h"
+#include "tensorflow/lite/kernels/internal/reference/ceil.h"
+#include "tensorflow/lite/kernels/internal/reference/comparisons.h"
+#include "tensorflow/lite/kernels/internal/reference/concatenation.h"
+#include "tensorflow/lite/kernels/internal/reference/conv.h"
+#include "tensorflow/lite/kernels/internal/reference/depth_to_space.h"
+#include "tensorflow/lite/kernels/internal/reference/dequantize.h"
+#include "tensorflow/lite/kernels/internal/reference/div.h"
+#include "tensorflow/lite/kernels/internal/reference/elu.h"
+#include "tensorflow/lite/kernels/internal/reference/exp.h"
+#include "tensorflow/lite/kernels/internal/reference/fill.h"
+#include "tensorflow/lite/kernels/internal/reference/floor.h"
+#include "tensorflow/lite/kernels/internal/reference/floor_div.h"
+#include "tensorflow/lite/kernels/internal/reference/floor_mod.h"
+#include "tensorflow/lite/kernels/internal/reference/fully_connected.h"
+#include "tensorflow/lite/kernels/internal/reference/gather.h"
+#include "tensorflow/lite/kernels/internal/reference/hard_swish.h"
+#include "tensorflow/lite/kernels/internal/reference/l2normalization.h"
+#include "tensorflow/lite/kernels/internal/reference/leaky_relu.h"
+#include "tensorflow/lite/kernels/internal/reference/log_softmax.h"
+#include "tensorflow/lite/kernels/internal/reference/logistic.h"
+#include "tensorflow/lite/kernels/internal/reference/maximum_minimum.h"
+#include "tensorflow/lite/kernels/internal/reference/mul.h"
+#include "tensorflow/lite/kernels/internal/reference/neg.h"
+#include "tensorflow/lite/kernels/internal/reference/pad.h"
+#include "tensorflow/lite/kernels/internal/reference/pooling.h"
+#include "tensorflow/lite/kernels/internal/reference/prelu.h"
+#include "tensorflow/lite/kernels/internal/reference/process_broadcast_shapes.h"
+#include "tensorflow/lite/kernels/internal/reference/quantize.h"
+#include "tensorflow/lite/kernels/internal/reference/reduce.h"
+#include "tensorflow/lite/kernels/internal/reference/requantize.h"
+#include "tensorflow/lite/kernels/internal/reference/resize_bilinear.h"
+#include "tensorflow/lite/kernels/internal/reference/resize_nearest_neighbor.h"
+#include "tensorflow/lite/kernels/internal/reference/round.h"
+#include "tensorflow/lite/kernels/internal/reference/softmax.h"
+#include "tensorflow/lite/kernels/internal/reference/space_to_batch_nd.h"
+#include "tensorflow/lite/kernels/internal/reference/space_to_depth.h"
+#include "tensorflow/lite/kernels/internal/reference/strided_slice.h"
+#include "tensorflow/lite/kernels/internal/reference/string_comparisons.h"
+#include "tensorflow/lite/kernels/internal/reference/sub.h"
+#include "tensorflow/lite/kernels/internal/reference/tanh.h"
+#include "tensorflow/lite/kernels/internal/reference/transpose.h"
+#include "tensorflow/lite/kernels/internal/reference/transpose_conv.h"
+#include "tensorflow/lite/kernels/internal/strided_slice_logic.h"
+#include "tensorflow/lite/kernels/internal/tensor.h"
+#include "tensorflow/lite/kernels/internal/types.h"
+namespace tflite
+{
+
+namespace reference_ops
+{
+
+template <typename T>
+inline void Relu(const RuntimeShape &input_shape, const T *input_data,
+ const RuntimeShape &output_shape, T *output_data)
+{
+ const int flat_size = MatchingFlatSize(input_shape, output_shape);
+ for (int i = 0; i < flat_size; ++i)
+ {
+ const T val = input_data[i];
+ const T lower = 0;
+ const T clamped = val < lower ? lower : val;
+ output_data[i] = clamped;
+ }
+}
+
+template <typename T>
+inline void Relu1(const RuntimeShape &input_shape, const T *input_data,
+ const RuntimeShape &output_shape, T *output_data)
+{
+ ruy::profiler::ScopeLabel label("Relu1 (not fused)");
+ const int flat_size = MatchingFlatSize(input_shape, output_shape);
+ for (int i = 0; i < flat_size; ++i)
+ {
+ const T val = input_data[i];
+ const T upper = 1;
+ const T lower = -1;
+ const T clamped = val > upper ? upper : val < lower ? lower : val;
+ output_data[i] = clamped;
+ }
+}
+
+inline void Relu6(const RuntimeShape &input_shape, const float *input_data,
+ const RuntimeShape &output_shape, float *output_data)
+{
+ ruy::profiler::ScopeLabel label("Relu6 (not fused)");
+ const int flat_size = MatchingFlatSize(input_shape, output_shape);
+ for (int i = 0; i < flat_size; ++i)
+ {
+ const float val = input_data[i];
+ const float upper = 6;
+ const float lower = 0;
+ const float clamped = val > upper ? upper : val < lower ? lower : val;
+ output_data[i] = clamped;
+ }
+}
+
+template <typename T>
+inline void ReluX(const tflite::ReluParams &params, const RuntimeShape &input_shape,
+ const T *input_data, const RuntimeShape &output_shape, T *output_data)
+{
+ ruy::profiler::ScopeLabel label("Quantized ReluX (not fused)");
+ const int flat_size = MatchingFlatSize(input_shape, output_shape);
+ for (int i = 0; i < flat_size; ++i)
+ {
+ const int32 val = static_cast<int32_t>(input_data[i]);
+ int32 clamped = params.output_offset + MultiplyByQuantizedMultiplier(val - params.input_offset,
+ params.output_multiplier,
+ params.output_shift);
+ clamped = std::max(params.quantized_activation_min, clamped);
+ clamped = std::min(params.quantized_activation_max, clamped);
+ output_data[i] = static_cast<T>(clamped);
+ }
+}
+
+template <typename T>
+inline void ReluX(const tflite::ActivationParams &params, const RuntimeShape &input_shape,
+ const T *input_data, const RuntimeShape &output_shape, T *output_data)
+{
+ ruy::profiler::ScopeLabel label("Quantized ReluX (not fused)");
+ const int flat_size = MatchingFlatSize(input_shape, output_shape);
+ const T max_value = params.quantized_activation_max;
+ const T min_value = params.quantized_activation_min;
+ for (int i = 0; i < flat_size; ++i)
+ {
+ const T val = input_data[i];
+ const T clamped = val > max_value ? max_value : val < min_value ? min_value : val;
+ output_data[i] = clamped;
+ }
+}
+
+// TODO(jiawen): We can implement BroadcastMul on buffers of arbitrary
+// dimensionality if the runtime code does a single loop over one dimension
+// that handles broadcasting as the base case. The code generator would then
+// generate max(D1, D2) nested for loops.
+inline void BroadcastMulFivefold(const ArithmeticParams &unswitched_params,
+ const RuntimeShape &unswitched_input1_shape,
+ const uint8 *unswitched_input1_data,
+ const RuntimeShape &unswitched_input2_shape,
+ const uint8 *unswitched_input2_data,
+ const RuntimeShape &output_shape, uint8 *output_data)
+{
+ ArithmeticParams switched_params = unswitched_params;
+ switched_params.input1_offset = unswitched_params.input2_offset;
+ switched_params.input2_offset = unswitched_params.input1_offset;
+
+ const bool use_unswitched = unswitched_params.broadcast_category ==
+ tflite::BroadcastableOpCategory::kFirstInputBroadcastsFast;
+
+ const ArithmeticParams &params = use_unswitched ? unswitched_params : switched_params;
+ const uint8 *input1_data = use_unswitched ? unswitched_input1_data : unswitched_input2_data;
+ const uint8 *input2_data = use_unswitched ? unswitched_input2_data : unswitched_input1_data;
+
+ // Fivefold nested loops. The second input resets its position for each
+ // iteration of the second loop. The first input resets its position at the
+ // beginning of the fourth loop. The innermost loop is an elementwise Mul of
+ // sections of the arrays.
+ uint8 *output_data_ptr = output_data;
+ const uint8 *input1_data_ptr = input1_data;
+ const uint8 *input2_data_reset = input2_data;
+ int y0 = params.broadcast_shape[0];
+ int y1 = params.broadcast_shape[1];
+ int y2 = params.broadcast_shape[2];
+ int y3 = params.broadcast_shape[3];
+ int y4 = params.broadcast_shape[4];
+ for (int i0 = 0; i0 < y0; ++i0)
+ {
+ const uint8 *input2_data_ptr;
+ for (int i1 = 0; i1 < y1; ++i1)
+ {
+ input2_data_ptr = input2_data_reset;
+ for (int i2 = 0; i2 < y2; ++i2)
+ {
+ for (int i3 = 0; i3 < y3; ++i3)
+ {
+ MulElementwise(y4, params, input1_data_ptr, input2_data_ptr, output_data_ptr);
+ input2_data_ptr += y4;
+ output_data_ptr += y4;
+ }
+ input1_data_ptr += y4;
+ }
+ }
+ input2_data_reset = input2_data_ptr;
+ }
+}
+
+inline void Mul(const ArithmeticParams &params, const RuntimeShape &input1_shape,
+ const int16 *input1_data, const RuntimeShape &input2_shape,
+ const int16 *input2_data, const RuntimeShape &output_shape, int16 *output_data)
+{
+ ruy::profiler::ScopeLabel label("Mul/Int16");
+
+ const int flat_size = MatchingElementsSize(input1_shape, input2_shape, output_shape);
+
+ for (int i = 0; i < flat_size; i++)
+ {
+ // F0 uses 0 integer bits, range [-1, 1].
+ using F0 = gemmlowp::FixedPoint<std::int16_t, 0>;
+
+ F0 unclamped_result = F0::FromRaw(input1_data[i]) * F0::FromRaw(input2_data[i]);
+ output_data[i] = unclamped_result.raw();
+ }
+}
+
+inline void Mul(const ArithmeticParams &params, const RuntimeShape &input1_shape,
+ const int16 *input1_data, const RuntimeShape &input2_shape,
+ const int16 *input2_data, const RuntimeShape &output_shape, uint8 *output_data)
+{
+ ruy::profiler::ScopeLabel label("Mul/Int16Uint8");
+ int32 output_offset = params.output_offset;
+ int32 output_activation_min = params.quantized_activation_min;
+ int32 output_activation_max = params.quantized_activation_max;
+ TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
+
+ const int flat_size = MatchingElementsSize(input1_shape, input2_shape, output_shape);
+
+ for (int i = 0; i < flat_size; i++)
+ {
+ // F0 uses 0 integer bits, range [-1, 1].
+ using F0 = gemmlowp::FixedPoint<std::int16_t, 0>;
+
+ F0 unclamped_result = F0::FromRaw(input1_data[i]) * F0::FromRaw(input2_data[i]);
+ int16 rescaled_result = gemmlowp::RoundingDivideByPOT(unclamped_result.raw(), 8);
+ int16 clamped_result = std::min<int16>(output_activation_max - output_offset, rescaled_result);
+ clamped_result = std::max<int16>(output_activation_min - output_offset, clamped_result);
+ output_data[i] = output_offset + clamped_result;
+ }
+}
+
+inline void Sub16(const ArithmeticParams &params, const RuntimeShape &input1_shape,
+ const int16_t *input1_data, const RuntimeShape &input2_shape,
+ const int16_t *input2_data, const RuntimeShape &output_shape,
+ int16_t *output_data)
+{
+ ruy::profiler::ScopeLabel label("Sub/Int16");
+ const int input1_shift = params.input1_shift;
+ const int flat_size = MatchingElementsSize(input1_shape, input2_shape, output_shape);
+ const int16 output_activation_min = params.quantized_activation_min;
+ const int16 output_activation_max = params.quantized_activation_max;
+
+ TFLITE_DCHECK(input1_shift == 0 || params.input2_shift == 0);
+ TFLITE_DCHECK_LE(input1_shift, 0);
+ TFLITE_DCHECK_LE(params.input2_shift, 0);
+ const int16 *not_shift_input = input1_shift == 0 ? input1_data : input2_data;
+ const int16 *shift_input = input1_shift == 0 ? input2_data : input1_data;
+ const int input_right_shift = input1_shift == 0 ? -params.input2_shift : -input1_shift;
+
+ if (input1_shift == 0)
+ {
+ // F0 uses 0 integer bits, range [-1, 1].
+ using F0 = gemmlowp::FixedPoint<std::int16_t, 0>;
+ for (int i = 0; i < flat_size; ++i)
+ {
+ F0 input_ready_scaled = F0::FromRaw(not_shift_input[i]);
+ F0 scaled_input =
+ F0::FromRaw(gemmlowp::RoundingDivideByPOT(shift_input[i], input_right_shift));
+ F0 result = SaturatingSub(input_ready_scaled, scaled_input);
+ const int16 raw_output = result.raw();
+ const int16 clamped_output =
+ std::min(output_activation_max, std::max(output_activation_min, raw_output));
+ output_data[i] = clamped_output;
+ }
+ }
+ else
+ {
+ // F0 uses 0 integer bits, range [-1, 1].
+ using F0 = gemmlowp::FixedPoint<std::int16_t, 0>;
+ for (int i = 0; i < flat_size; ++i)
+ {
+ F0 input_ready_scaled = F0::FromRaw(not_shift_input[i]);
+ F0 scaled_input =
+ F0::FromRaw(gemmlowp::RoundingDivideByPOT(shift_input[i], input_right_shift));
+ F0 result = SaturatingSub(scaled_input, input_ready_scaled);
+ const int16 raw_output = result.raw();
+ const int16 clamped_output =
+ std::min(output_activation_max, std::max(output_activation_min, raw_output));
+ output_data[i] = clamped_output;
+ }
+ }
+}
+
+template <typename Scalar>
+void Pack(const PackParams &params, const RuntimeShape *const *input_shapes,
+ const Scalar *const *input_data, const RuntimeShape &output_shape, Scalar *output_data)
+{
+ ruy::profiler::ScopeLabel label("Pack");
+ const int dimensions = output_shape.DimensionsCount();
+ int axis = params.axis;
+ int inputs_count = params.inputs_count;
+
+ int outer_size = 1;
+ for (int i = 0; i < axis; i++)
+ {
+ outer_size *= output_shape.Dims(i);
+ }
+ int copy_size = 1;
+ for (int i = params.axis + 1; i < dimensions; i++)
+ {
+ copy_size *= output_shape.Dims(i);
+ }
+ TFLITE_DCHECK_EQ((**input_shapes).FlatSize(), copy_size * outer_size);
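+ // E.g. (hypothetical): with inputs_count == 2, outer_size == 2 and copy_size == 3,
+ // the output is laid out as [in0[0:3], in1[0:3], in0[3:6], in1[3:6]].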
+
+ for (int i = 0; i < inputs_count; ++i)
+ {
+ for (int k = 0; k < outer_size; k++)
+ {
+ const Scalar *input_ptr = input_data[i] + copy_size * k;
+ int loc = k * inputs_count * copy_size + i * copy_size;
+ memcpy(output_data + loc, input_ptr, copy_size * sizeof(Scalar));
+ }
+ }
+}
+
+template <typename Scalar>
+void Unpack(const UnpackParams &params, const RuntimeShape &input_shape, const Scalar *input_data,
+ const RuntimeShape &output_shape, Scalar *const *output_datas)
+{
+ ruy::profiler::ScopeLabel label("Unpack");
+ const int dimensions = input_shape.DimensionsCount();
+ const int outputs_count = params.num_split;
+
+ int outer_size = 1;
+ int axis = params.axis;
+ if (axis < 0)
+ {
+ axis += dimensions;
+ }
+ TFLITE_DCHECK_GE(axis, 0);
+ TFLITE_DCHECK_LT(axis, dimensions);
+ for (int i = 0; i < axis; ++i)
+ {
+ outer_size *= input_shape.Dims(i);
+ }
+ int copy_size = 1;
+ for (int i = axis + 1; i < dimensions; ++i)
+ {
+ copy_size *= input_shape.Dims(i);
+ }
+ TFLITE_DCHECK_EQ(output_shape.FlatSize(), copy_size * outer_size);
+
+ for (int i = 0; i < outputs_count; ++i)
+ {
+ for (int k = 0; k < outer_size; k++)
+ {
+ Scalar *output_ptr = output_datas[i] + copy_size * k;
+ int loc = k * outputs_count * copy_size + i * copy_size;
+ memcpy(output_ptr, input_data + loc, copy_size * sizeof(Scalar));
+ }
+ }
+}
+
+template <typename Scalar>
+void PackWithScaling(const PackParams &params, const RuntimeShape *const *input_shapes,
+ const uint8 *const *input_data, const RuntimeShape &output_shape,
+ uint8 *output_data)
+{
+ ruy::profiler::ScopeLabel label("PackWithScaling");
+ const int dimensions = output_shape.DimensionsCount();
+ int axis = params.axis;
+ const int32 *input_zeropoint = params.input_zeropoint;
+ const float *input_scale = params.input_scale;
+ int inputs_count = params.inputs_count;
+ const int32 output_zeropoint = params.output_zeropoint;
+ const float output_scale = params.output_scale;
+
+ int outer_size = 1;
+ for (int i = 0; i < axis; i++)
+ {
+ outer_size *= output_shape.Dims(i);
+ }
+ int copy_size = 1;
+ for (int i = axis + 1; i < dimensions; i++)
+ {
+ copy_size *= output_shape.Dims(i);
+ }
+ TFLITE_DCHECK_EQ((**input_shapes).FlatSize(), copy_size * outer_size);
+
+ Scalar *output_ptr = output_data;
+ const float inverse_output_scale = 1.f / output_scale;
+ for (int k = 0; k < outer_size; k++)
+ {
+ for (int i = 0; i < inputs_count; ++i)
+ {
+ if (input_zeropoint[i] == output_zeropoint && input_scale[i] == output_scale)
+ {
+ memcpy(output_ptr, input_data[i] + k * copy_size, copy_size * sizeof(Scalar));
+ }
+ else
+ {
+ assert(false);
+ const float scale = input_scale[i] * inverse_output_scale;
+ const float bias = -input_zeropoint[i] * scale;
+ auto input_ptr = input_data[i];
+ for (int j = 0; j < copy_size; ++j)
+ {
+ const int value =
+ static_cast<int32_t>(std::round(input_ptr[j] * scale + bias)) + output_zeropoint;
+ output_ptr[j] = static_cast<uint8_t>(std::max(std::min(255, value), 0));
+ }
+ }
+ output_ptr += copy_size;
+ }
+ }
+}
+
+template <typename Scalar>
+void DepthConcatenation(const ConcatenationParams &params, const RuntimeShape *const *input_shapes,
+ const Scalar *const *input_data, const RuntimeShape &output_shape,
+ Scalar *output_data)
+{
+ ruy::profiler::ScopeLabel label("DepthConcatenation");
+ auto params_copy = params;
+ params_copy.axis = 3;
+ Concatenation(params_copy, input_shapes, input_data, output_shape, output_data);
+}
+
+inline void LstmCell(const LstmCellParams &params, const RuntimeShape &unextended_input_shape,
+ const float *input_data, const RuntimeShape &unextended_prev_activ_shape,
+ const float *prev_activ_data, const RuntimeShape &weights_shape,
+ const float *weights_data, const RuntimeShape &unextended_bias_shape,
+ const float *bias_data, const RuntimeShape &unextended_prev_state_shape,
+ const float *prev_state_data,
+ const RuntimeShape &unextended_output_state_shape, float *output_state_data,
+ const RuntimeShape &unextended_output_activ_shape, float *output_activ_data,
+ const RuntimeShape &unextended_concat_temp_shape, float *concat_temp_data,
+ const RuntimeShape &unextended_activ_temp_shape, float *activ_temp_data)
+{
+ TFLITE_DCHECK_LE(unextended_input_shape.DimensionsCount(), 4);
+ TFLITE_DCHECK_LE(unextended_prev_activ_shape.DimensionsCount(), 4);
+ TFLITE_DCHECK_LE(unextended_bias_shape.DimensionsCount(), 4);
+ TFLITE_DCHECK_LE(unextended_prev_state_shape.DimensionsCount(), 4);
+ TFLITE_DCHECK_LE(unextended_output_state_shape.DimensionsCount(), 4);
+ TFLITE_DCHECK_LE(unextended_output_activ_shape.DimensionsCount(), 4);
+ TFLITE_DCHECK_LE(unextended_concat_temp_shape.DimensionsCount(), 4);
+ TFLITE_DCHECK_LE(unextended_activ_temp_shape.DimensionsCount(), 4);
+ const RuntimeShape input_shape = RuntimeShape::ExtendedShape(4, unextended_input_shape);
+ const RuntimeShape prev_activ_shape = RuntimeShape::ExtendedShape(4, unextended_prev_activ_shape);
+ const RuntimeShape bias_shape = RuntimeShape::ExtendedShape(4, unextended_bias_shape);
+ const RuntimeShape prev_state_shape = RuntimeShape::ExtendedShape(4, unextended_prev_state_shape);
+ const RuntimeShape output_state_shape =
+ RuntimeShape::ExtendedShape(4, unextended_output_state_shape);
+ const RuntimeShape output_activ_shape =
+ RuntimeShape::ExtendedShape(4, unextended_output_activ_shape);
+ const RuntimeShape concat_temp_shape =
+ RuntimeShape::ExtendedShape(4, unextended_concat_temp_shape);
+ const RuntimeShape activ_temp_shape = RuntimeShape::ExtendedShape(4, unextended_activ_temp_shape);
+ TFLITE_DCHECK_GE(weights_shape.DimensionsCount(), 2);
+
+ const int weights_dim_count = weights_shape.DimensionsCount();
+ const int batches = MatchingDim(input_shape, 0, prev_activ_shape, 0, prev_state_shape, 0,
+ output_state_shape, 0, output_activ_shape, 0);
+ const int height = MatchingDim(input_shape, 1, prev_activ_shape, 1, prev_state_shape, 1,
+ output_state_shape, 1, output_activ_shape, 1);
+ const int width = MatchingDim(input_shape, 2, prev_activ_shape, 2, prev_state_shape, 2,
+ output_state_shape, 2, output_activ_shape, 2);
+ const int input_depth = input_shape.Dims(3);
+ const int prev_activ_depth = prev_activ_shape.Dims(3);
+ const int total_input_depth = prev_activ_depth + input_depth;
+ TFLITE_DCHECK_EQ(weights_shape.Dims(weights_dim_count - 1), total_input_depth);
+ TFLITE_DCHECK_EQ(FlatSizeSkipDim(bias_shape, 3), 1);
+ const int intern_activ_depth = MatchingDim(weights_shape, weights_dim_count - 2, bias_shape, 3);
+ TFLITE_DCHECK_EQ(weights_shape.FlatSize(), intern_activ_depth * total_input_depth);
+ TFLITE_DCHECK_EQ(intern_activ_depth % 4, 0);
+ const int output_depth = MatchingDim(prev_state_shape, 3, prev_activ_shape, 3, output_state_shape,
+ 3, output_activ_shape, 3);
+ TFLITE_DCHECK_EQ(output_depth, intern_activ_depth / 4);
+
+ // Concatenate prev_activ and input data together
+ std::vector<float const *> concat_input_arrays_data;
+ std::vector<RuntimeShape const *> concat_input_arrays_shapes;
+ concat_input_arrays_data.push_back(input_data);
+ concat_input_arrays_data.push_back(prev_activ_data);
+ concat_input_arrays_shapes.push_back(&input_shape);
+ concat_input_arrays_shapes.push_back(&prev_activ_shape);
+ tflite::ConcatenationParams concat_params;
+ concat_params.axis = 3;
+ concat_params.inputs_count = concat_input_arrays_data.size();
+ Concatenation(concat_params, &(concat_input_arrays_shapes[0]), &(concat_input_arrays_data[0]),
+ concat_temp_shape, concat_temp_data);
+
+ // Fully connected
+ tflite::FullyConnectedParams fc_params;
+ fc_params.float_activation_min = std::numeric_limits<float>::lowest();
+ fc_params.float_activation_max = std::numeric_limits<float>::max();
+ FullyConnected(fc_params, concat_temp_shape, concat_temp_data, weights_shape, weights_data,
+ bias_shape, bias_data, activ_temp_shape, activ_temp_data);
+
+ // Memory state update (the LSTM "guts")
+ for (int b = 0; b < batches; ++b)
+ {
+ for (int w = 0; w < width; ++w)
+ {
+ for (int h = 0; h < height; ++h)
+ {
+ for (int c = 0; c < output_depth; ++c)
+ {
+ const float input_gate =
+ 1.f /
+ (1.f +
+ std::exp(-activ_temp_data[Offset(activ_temp_shape, b, h, w, 0 * output_depth + c)]));
+ const float new_input =
+ std::tanh(activ_temp_data[Offset(activ_temp_shape, b, h, w, 1 * output_depth + c)]);
+ const float forget_gate =
+ 1.f /
+ (1.f +
+ std::exp(-activ_temp_data[Offset(activ_temp_shape, b, h, w, 2 * output_depth + c)]));
+ const float output_gate =
+ 1.f /
+ (1.f +
+ std::exp(-activ_temp_data[Offset(activ_temp_shape, b, h, w, 3 * output_depth + c)]));
+ const float new_state =
+ input_gate * new_input +
+ forget_gate * prev_state_data[Offset(prev_state_shape, b, h, w, c)];
+ output_state_data[Offset(output_state_shape, b, h, w, c)] = new_state;
+ output_activ_data[Offset(output_activ_shape, b, h, w, c)] =
+ output_gate * std::tanh(new_state);
+ }
+ }
+ }
+ }
+}
+
+// Quantized LSTM cell implementation.
+// The quantization of the input, output arrays is as follows:
+// - The input activations are quantized as uint8 on the interval
+// [-1, 127/128].
+// The rationale for that is that it is the natural interval for output
+// activations (see next point) and these need to be concatenated together.
+// We could accommodate different ranges by re-scaling, but we empirically
+// found that setting the input activations range to be [-1, 127/128] in the
+// first place, removing the need for re-scaling, greatly improves accuracy.
+// - The output activations are quantized as uint8 on the interval
+// [-1, 127/128].
+// The rationale for that is that the definition of an LSTM cell makes them
+// intrinsically constrained in [-1, 1]; tweaking that to [-1, 127/128]
+// makes for simpler, more accurate fixed-point arithmetic.
+// - The output-at-previous-timestep state array is, naturally, quantized in
+// the same way as the output activations.
+// - The internal LSTM memory (not the output-at-previous-timestep, the other
+// internal state array) is int16-quantized and may use any power-of-two,
+// symmetric range i.e. [-2^N, 2^N * 32767/32768] for any N, which we call
+// StateIntegerBits below, see the below discussion of that template
+// parameter ("The StateIntegerBits template parameter").
+// - The output of the internal fully-connected node is int16-quantized
+// on the interval [-8, 8 * 32767/32768], the rationale for which is
+// explained just below ("Why [-8, 8] for fully-connected output?").
+//
+//
+// === The StateIntegerBits template parameter ===
+//
+// The StateIntegerBits template parameter controls the fixed-point format used
+// to represent the internal memory of the LSTM cell (not the
+// output-at-previous-timestep, the other internal state array). It's currently
+// a template parameter so that the model can control that. The most typical
+// value for StateIntegerBits is 4. Other plausible values are anywhere between
+// 3 and 5. We might eventually standardize on a single supported value, e.g. 4,
+// and drop that template parameter. The reason why it can't be a runtime
+// parameter is that this controls the fixed-point format used, i.e. we need to
+// generate actually different code based on it. In particular, we generate code
+// for a fixed-point tanh() implementation for that format, which internally
+// uses a fixed-point exp() implementation, which internally uses a
+// barrel-shifter with a number of steps that depends on StateIntegerBits.
+// Another consequence of that is that a higher value of StateIntegerBits
+// results in a more expensive implementation (more barrel shifter steps
+// needed).
+//
+//
+// === Why [-8, 8] for fully-connected output? ===
+//
+// This array is only fed to Logistic and Tanh functions, for which
+// the quantized implementation will want to use fixed-point arithmetic,
+// requiring a power-of-two representation interval. Thus, we should right
+// away quantize this array to a power-of-two interval; otherwise,
+// implementation will need to rescale that, losing any benefit that a tighter
+// representation interval might otherwise yield, while introducing some
+// numerical error and computational overhead.
+//
+// Now, Logistic and Tanh
+// are nearly constant (nearly equal to their horizontal asymptotes)
+// outside of a small bounded interval around 0:
+//
+// Logistic(4) = 1 - 1.8e-2 Tanh(4) = 1 - 6.7e-4
+// Logistic(8) = 1 - 3.4e-4 Tanh(8) = 1 - 2.3e-7
+// Logistic(16) = 1 - 1.1e-7 Tanh(16) = 1 - 2.5e-14
+//
+// From this, we see that clamping to [-4, 4] would be too inaccurate
+// (the error of 1.8e-2 on Logistic would be felt even in 8-bit precision)
+// while clamping to [-16, 16] would make no difference even in float32.
+// However, for a fixed-point implementation in 16-bit integers, using 5
+// integer bits to represent the [-16, 16] range would leave only 11
+// fractional bits, giving an increment of 2^-11 = 4.9e-4 between consecutive
+// representable values. Notice that this is higher than the
+// worst-case clamping error with clamping to [-8, 8]: 3.4e-4 for Logistic.
+// Using [-8, 8] thus seems like the better compromise overall, enjoying
+// an increment of 2.4e-4 between representable values and a worst-case
+// clamping error of 3.4e-4, both better than the increment of 4.9e-4 with
+// [-16, 16].
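+// (Concretely: the [-8, 8] format leaves 12 fractional bits, so its step is
+// 2^-12 = 2.4e-4, versus the 2^-11 = 4.9e-4 step of the 11-fractional-bit
+// [-16, 16] format above.)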
+//
+// Moreover, all other things being equal, it is nice to choose the narrower
+// representation range, as that makes the implementation of fixed-point
+// math functions a little cheaper (each integer bit requires an additional
+// barrel-shifter step in the implementation of exp(-x)). That is further
+// reason to prefer [-8, 8] over [-16, 16]. The choice of [-16, 16] would make
+// sense for 32-bit float or 32-bit fixed-point quantization, but we are
+// aiming for 16-bit fixed-point quantization of these internal nodes here.
+//
+template <int StateIntegerBits>
+inline void
+LstmCell(const LstmCellParams &params, const RuntimeShape &unextended_input_shape,
+ const uint8 *input_data_uint8, const RuntimeShape &unextended_prev_activ_shape,
+ const uint8 *prev_activ_data_uint8, const RuntimeShape &weights_shape,
+ const uint8 *weights_data_uint8, const RuntimeShape &unextended_bias_shape,
+ const int32 *bias_data_int32, const RuntimeShape &unextended_prev_state_shape,
+ const int16 *prev_state_data_int16, const RuntimeShape &unextended_output_state_shape,
+ int16 *output_state_data_int16, const RuntimeShape &unextended_output_activ_shape,
+ uint8 *output_activ_data_uint8, const RuntimeShape &unextended_concat_temp_shape,
+ uint8 *concat_temp_data_uint8, const RuntimeShape &unextended_activ_temp_shape,
+ int16 *activ_temp_data_int16, void *gemmlowp_context)
+{
+ (void)gemmlowp_context; // only used in optimized code.
+ int32 weights_zero_point = params.weights_zero_point;
+ int32 accum_multiplier = params.accum_multiplier;
+ int accum_shift = params.accum_shift;
+ TFLITE_DCHECK_LE(unextended_input_shape.DimensionsCount(), 4);
+ TFLITE_DCHECK_LE(unextended_prev_activ_shape.DimensionsCount(), 4);
+ TFLITE_DCHECK_LE(unextended_bias_shape.DimensionsCount(), 4);
+ TFLITE_DCHECK_LE(unextended_prev_state_shape.DimensionsCount(), 4);
+ TFLITE_DCHECK_LE(unextended_output_state_shape.DimensionsCount(), 4);
+ TFLITE_DCHECK_LE(unextended_output_activ_shape.DimensionsCount(), 4);
+ TFLITE_DCHECK_LE(unextended_concat_temp_shape.DimensionsCount(), 4);
+ TFLITE_DCHECK_LE(unextended_activ_temp_shape.DimensionsCount(), 4);
+ const RuntimeShape input_shape = RuntimeShape::ExtendedShape(4, unextended_input_shape);
+ const RuntimeShape prev_activ_shape = RuntimeShape::ExtendedShape(4, unextended_prev_activ_shape);
+ const RuntimeShape bias_shape = RuntimeShape::ExtendedShape(4, unextended_bias_shape);
+ const RuntimeShape prev_state_shape = RuntimeShape::ExtendedShape(4, unextended_prev_state_shape);
+ const RuntimeShape output_state_shape =
+ RuntimeShape::ExtendedShape(4, unextended_output_state_shape);
+ const RuntimeShape output_activ_shape =
+ RuntimeShape::ExtendedShape(4, unextended_output_activ_shape);
+ const RuntimeShape concat_temp_shape =
+ RuntimeShape::ExtendedShape(4, unextended_concat_temp_shape);
+ const RuntimeShape activ_temp_shape = RuntimeShape::ExtendedShape(4, unextended_activ_temp_shape);
+ TFLITE_DCHECK_GE(weights_shape.DimensionsCount(), 2);
+
+ // Gather dimension information and perform consistency checks.
+ const int weights_dim_count = weights_shape.DimensionsCount();
+ const int outer_size = MatchingFlatSizeSkipDim(input_shape, 3, prev_activ_shape, prev_state_shape,
+ output_state_shape, output_activ_shape);
+ const int input_depth = input_shape.Dims(3);
+ const int prev_activ_depth = prev_activ_shape.Dims(3);
+ const int total_input_depth = prev_activ_depth + input_depth;
+ TFLITE_DCHECK_EQ(weights_shape.Dims(weights_dim_count - 1), total_input_depth);
+ const int intern_activ_depth = MatchingDim(weights_shape, weights_dim_count - 2, bias_shape, 3);
+ TFLITE_DCHECK_EQ(weights_shape.FlatSize(), intern_activ_depth * total_input_depth);
+ TFLITE_DCHECK_EQ(FlatSizeSkipDim(bias_shape, 3), 1);
+ TFLITE_DCHECK_EQ(intern_activ_depth % 4, 0);
+ const int output_depth = MatchingDim(prev_state_shape, 3, prev_activ_shape, 3, output_state_shape,
+ 3, output_activ_shape, 3);
+ TFLITE_DCHECK_EQ(output_depth, intern_activ_depth / 4);
+ const int fc_batches = FlatSizeSkipDim(activ_temp_shape, 3);
+ const int fc_output_depth =
+ MatchingDim(weights_shape, weights_dim_count - 2, activ_temp_shape, 3);
+ const int fc_accum_depth = total_input_depth;
+ TFLITE_DCHECK_EQ(fc_output_depth, 4 * output_depth);
+
+ // Depth-concatenate prev_activ and input data together.
+ uint8 const *concat_input_arrays_data[2] = {input_data_uint8, prev_activ_data_uint8};
+ const RuntimeShape *concat_input_arrays_shapes[2] = {&input_shape, &prev_activ_shape};
+ tflite::ConcatenationParams concat_params;
+ concat_params.axis = 3;
+ concat_params.inputs_count = 2;
+ Concatenation(concat_params, concat_input_arrays_shapes, concat_input_arrays_data,
+ concat_temp_shape, concat_temp_data_uint8);
+
+ // Implementation of the fully connected node inside the LSTM cell.
+ // The operands are 8-bit integers, the accumulators are internally 32-bit
+ // integers, and the output is 16-bit fixed-point with 3 integer bits so
+ // the output range is [-2^3, 2^3] == [-8, 8]. The rationale for that
+ // is explained in the function comment above.
+ for (int b = 0; b < fc_batches; ++b)
+ {
+ for (int out_c = 0; out_c < fc_output_depth; ++out_c)
+ {
+ // Internal accumulation.
+ // Initialize accumulator with the bias-value.
+ int32 accum = bias_data_int32[out_c];
+ // Accumulation loop.
+ for (int d = 0; d < fc_accum_depth; ++d)
+ {
+ int16 input_val = concat_temp_data_uint8[b * fc_accum_depth + d] - 128;
+ int16 weights_val = weights_data_uint8[out_c * fc_accum_depth + d] - weights_zero_point;
+ accum += input_val * weights_val;
+ }
+ // Down-scale the final int32 accumulator to the scale used by our
+ // (16-bit, using 3 integer bits) fixed-point format. The quantized
+ // multiplier and shift here have been pre-computed offline
+ // (e.g. by toco).
+ accum = MultiplyByQuantizedMultiplier(accum, accum_multiplier, accum_shift);
+ // Saturate, cast to int16, and store to the temporary activations array.
+ accum = std::max(-32768, std::min(32767, static_cast<int>(accum)));
+ activ_temp_data_int16[out_c + fc_output_depth * b] = accum;
+ }
+ }
+
+ // Rest of the LSTM cell: tanh and logistic math functions, and some adds
+ // and muls, all done in 16-bit fixed-point.
+ for (int b = 0; b < outer_size; ++b)
+ {
+ for (int c = 0; c < output_depth; ++c)
+ {
+ // Define the fixed-point data types that we will use here. All use
+ // int16 as the underlying integer type i.e. all are 16-bit fixed-point.
+ // They only differ by the number of integral vs. fractional bits,
+ // determining the range of values that they can represent.
+ //
+ // F0 uses 0 integer bits, range [-1, 1].
+ // This is the return type of math functions such as tanh, logistic,
+ // whose range is in [-1, 1].
+ using F0 = gemmlowp::FixedPoint<std::int16_t, 0>;
+ // F3 uses 3 integer bits, range [-8, 8].
+ // This is the range of the previous fully-connected node's output,
+ // which is our input here.
+ using F3 = gemmlowp::FixedPoint<std::int16_t, 3>;
+ // FS uses StateIntegerBits integer bits, range [-2^StateIntegerBits,
+ // 2^StateIntegerBits]. It's used to represent the internal state, whose
+ // number of integer bits is currently dictated by the model. See comment
+ // on the StateIntegerBits template parameter above.
+ using FS = gemmlowp::FixedPoint<std::int16_t, StateIntegerBits>;
+ // Implementation of input gate, using fixed-point logistic function.
+ F3 input_gate_input =
+ F3::FromRaw(activ_temp_data_int16[b * fc_output_depth + 0 * output_depth + c]);
+ F0 input_gate_output = gemmlowp::logistic(input_gate_input);
+ // Implementation of input modulation gate, using fixed-point tanh
+ // function.
+ F3 input_modulation_gate_input =
+ F3::FromRaw(activ_temp_data_int16[b * fc_output_depth + 1 * output_depth + c]);
+ F0 input_modulation_gate_output = gemmlowp::tanh(input_modulation_gate_input);
+ // Implementation of forget gate, using fixed-point logistic function.
+ F3 forget_gate_input =
+ F3::FromRaw(activ_temp_data_int16[b * fc_output_depth + 2 * output_depth + c]);
+ F0 forget_gate_output = gemmlowp::logistic(forget_gate_input);
+ // Implementation of output gate, using fixed-point logistic function.
+ F3 output_gate_input =
+ F3::FromRaw(activ_temp_data_int16[b * fc_output_depth + 3 * output_depth + c]);
+ F0 output_gate_output = gemmlowp::logistic(output_gate_input);
+ // Implementation of internal multiplication nodes, still in fixed-point.
+ F0 input_times_input_modulation = input_gate_output * input_modulation_gate_output;
+ FS prev_state = FS::FromRaw(prev_state_data_int16[b * output_depth + c]);
+ FS prev_state_times_forget_state = forget_gate_output * prev_state;
+ // Implementation of internal addition node, saturating.
+ FS new_state =
+ gemmlowp::SaturatingAdd(gemmlowp::Rescale<StateIntegerBits>(input_times_input_modulation),
+ prev_state_times_forget_state);
+ // Implementation of last internal Tanh node, still in fixed-point.
+ // Since a Tanh fixed-point implementation is specialized for a given
+ // number of integer bits, and each specialization can have a substantial
+ // code size, and we already used above a Tanh on an input with 3 integer
+ // bits, and per the table in the above function comment there is no
+ // significant accuracy to be lost by clamping to [-8, +8] for a
+ // 3-integer-bits representation, let us just do that. This helps people
+ // porting this to targets where code footprint must be minimized.
+ F3 new_state_f3 = gemmlowp::Rescale<3>(new_state);
+ F0 output_activ_int16 = output_gate_output * gemmlowp::tanh(new_state_f3);
+ // Store the new internal state back to memory, as 16-bit integers.
+ // Note: here we store the original value with StateIntegerBits, not
+ // the rescaled 3-integer-bits value fed to tanh.
+ output_state_data_int16[b * output_depth + c] = new_state.raw();
+ // Down-scale the output activations to 8-bit integers, saturating,
+ // and store back to memory.
+ int16 rescaled_output_activ = gemmlowp::RoundingDivideByPOT(output_activ_int16.raw(), 8);
+ int16 clamped_output_activ =
+ std::max<int16>(-128, std::min<int16>(127, rescaled_output_activ));
+ output_activ_data_uint8[b * output_depth + c] = 128 + clamped_output_activ;
+ }
+ }
+}
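+
+// Illustrative note (not used by the kernel above): a gemmlowp
+// FixedPoint<int16_t, I> raw value r represents the real number
+// r / 2^(15 - I). A minimal sketch of that mapping, assuming only the
+// gemmlowp types already used above:
+//
+//   using F3 = gemmlowp::FixedPoint<std::int16_t, 3>;
+//   F3 one = F3::FromRaw(1 << 12);           // raw 4096 represents 1.0
+//   double real_value = one.raw() / 4096.0;  // == 1.0; F3 spans [-8, +8)
+//   // F0 divides by 2^15 instead, so its raw range maps to [-1, +1).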
+
+template <typename Scalar>
+void Split(const SplitParams &params, const RuntimeShape &input_shape, const Scalar *input_data,
+ const RuntimeShape *const *output_shapes, Scalar *const *output_data)
+{
+ ruy::profiler::ScopeLabel label("Split");
+ const int split_dimensions = input_shape.DimensionsCount();
+ int axis = params.axis < 0 ? params.axis + split_dimensions : params.axis;
+ int outputs_count = params.num_split;
+ TFLITE_DCHECK_LT(axis, split_dimensions);
+
+ int64_t split_size = 0;
+ for (int i = 0; i < outputs_count; i++)
+ {
+ TFLITE_DCHECK_EQ(output_shapes[i]->DimensionsCount(), split_dimensions);
+ for (int j = 0; j < split_dimensions; j++)
+ {
+ if (j != axis)
+ {
+ MatchingDim(*output_shapes[i], j, input_shape, j);
+ }
+ }
+ split_size += output_shapes[i]->Dims(axis);
+ }
+ TFLITE_DCHECK_EQ(split_size, input_shape.Dims(axis));
+ int64_t outer_size = 1;
+ for (int i = 0; i < axis; ++i)
+ {
+ outer_size *= input_shape.Dims(i);
+ }
+ // For all output arrays,
+ // FlatSize() = outer_size * Dims(axis) * base_inner_size;
+ int64_t base_inner_size = 1;
+ for (int i = axis + 1; i < split_dimensions; ++i)
+ {
+ base_inner_size *= input_shape.Dims(i);
+ }
+
+ const Scalar *input_ptr = input_data;
+ for (int k = 0; k < outer_size; k++)
+ {
+ for (int i = 0; i < outputs_count; ++i)
+ {
+ const int copy_size = output_shapes[i]->Dims(axis) * base_inner_size;
+ memcpy(output_data[i] + k * copy_size, input_ptr, copy_size * sizeof(Scalar));
+ input_ptr += copy_size;
+ }
+ }
+}
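+
+// Example (illustrative only): for an input of shape {2, 6, 3} split along
+// axis 1 into outputs with axis sizes {2, 4}, outer_size = 2 and
+// base_inner_size = 3, so each outer iteration copies 2*3 = 6 values into the
+// first output and 4*3 = 12 values into the second, consuming 18 contiguous
+// input values per iteration. A minimal call sketch, with shapes and pointers
+// assumed to be set up by the caller:
+//
+//   SplitParams params;
+//   params.axis = 1;
+//   params.num_split = 2;
+//   Split<float>(params, input_shape, input_data, output_shapes, output_data);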
+
+inline int NodeOffset(int b, int h, int w, int height, int width)
+{
+ return (b * height + h) * width + w;
+}
+
+inline void LocalResponseNormalization(const tflite::LocalResponseNormalizationParams &op_params,
+ const RuntimeShape &input_shape, const float *input_data,
+ const RuntimeShape &output_shape, float *output_data)
+{
+ const int trailing_dim = input_shape.DimensionsCount() - 1;
+ const int outer_size = MatchingFlatSizeSkipDim(input_shape, trailing_dim, output_shape);
+ const int depth = MatchingDim(input_shape, trailing_dim, output_shape, trailing_dim);
+
+ for (int i = 0; i < outer_size; ++i)
+ {
+ for (int c = 0; c < depth; ++c)
+ {
+ const int begin_input_c = std::max(0, static_cast<int>(c - op_params.range));
+ const int end_input_c = std::min(depth, static_cast<int>(c + op_params.range));
+ float accum = 0.f;
+ for (int input_c = begin_input_c; input_c < end_input_c; ++input_c)
+ {
+ const float input_val = input_data[i * depth + input_c];
+ accum += input_val * input_val;
+ }
+ const float multiplier = std::pow(op_params.bias + op_params.alpha * accum, -op_params.beta);
+ output_data[i * depth + c] = input_data[i * depth + c] * multiplier;
+ }
+ }
+}
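+
+// In formula form (matching the loop above): for each position i and channel c,
+//
+//   output[i, c] = input[i, c] *
+//                  (bias + alpha * sum_{k in window(c)} input[i, k]^2) ^ (-beta)
+//
+// where window(c) = [max(0, c - range), min(depth, c + range)).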
+
+inline void Dequantize(const RuntimeShape &input_shape, const Eigen::half *input_data,
+ const RuntimeShape &output_shape, float *output_data)
+{
+ const int flat_size = MatchingFlatSize(input_shape, output_shape);
+ for (int i = 0; i < flat_size; i++)
+ {
+ output_data[i] = static_cast<float>(input_data[i]);
+ }
+}
+
+inline void FakeQuant(const tflite::FakeQuantParams &op_params, const RuntimeShape &input_shape,
+ const float *input_data, const RuntimeShape &output_shape, float *output_data)
+{
+ ruy::profiler::ScopeLabel label("FakeQuant");
+ float rmin = op_params.minmax.min;
+ float rmax = op_params.minmax.max;
+ int num_bits = op_params.num_bits;
+ // 0 should always be a representable value. Let's assume that the initial
+ // min,max range contains 0.
+ TFLITE_DCHECK_LE(rmin, 0.0f);
+ TFLITE_DCHECK_GE(rmax, 0.0f);
+ TFLITE_DCHECK_LT(rmin, rmax);
+
+ // Code matches tensorflow's FakeQuantWithMinMaxArgsFunctor.
+ int quant_min = 0;
+ int quant_max = (1 << num_bits) - 1;
+ float nudged_min, nudged_max, nudged_scale;
+ NudgeQuantizationRange(rmin, rmax, quant_min, quant_max, &nudged_min, &nudged_max, &nudged_scale);
+ const int flat_size = MatchingFlatSize(input_shape, output_shape);
+ FakeQuantizeArray(nudged_scale, nudged_min, nudged_max, input_data, output_data, flat_size);
+}
+
+// Common subroutine for both `GatherNd` and `GatherNdString`.
+struct GatherNdHelperResult
+{
+ int n_slices;
+ int slice_size;
+ int indices_nd;
+ std::vector<int> dims_to_count;
+};
+
+// Returns the common values used by both `GatherNd` and `GatherNdString`.
+inline GatherNdHelperResult GatherNdHelper(const RuntimeShape &params_shape,
+ const RuntimeShape &indices_shape)
+{
+ GatherNdHelperResult ret;
+ ret.n_slices = 1;
+ ret.slice_size = 1;
+ const int indices_dims = indices_shape.DimensionsCount();
+ ret.indices_nd = indices_shape.Dims(indices_dims - 1);
+ const int params_dims = params_shape.DimensionsCount();
+ for (int i = 0; i < indices_dims - 1; ++i)
+ {
+ ret.n_slices *= indices_shape.Dims(i);
+ }
+ for (int i = ret.indices_nd; i < params_dims; ++i)
+ {
+ ret.slice_size *= params_shape.Dims(i);
+ }
+
+ int remain_flat_size = params_shape.FlatSize();
+ ret.dims_to_count = std::vector<int>(ret.indices_nd, 0);
+ for (int i = 0; i < ret.indices_nd; ++i)
+ {
+ ret.dims_to_count[i] = remain_flat_size / params_shape.Dims(i);
+ remain_flat_size = ret.dims_to_count[i];
+ }
+
+ return ret;
+}
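+
+// Example (illustrative only): with params of shape {3, 4, 5} and indices of
+// shape {N, 2} (so indices_nd == 2), the helper computes slice_size = 5 and
+// dims_to_count = {20, 5}. An index pair (i0, i1) then addresses the
+// contiguous slice starting at params_data[i0 * 20 + i1 * 5], which GatherNd
+// copies wholesale into the output.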
+
+template <typename ParamsT, typename IndicesT = int32>
+inline void GatherNd(const RuntimeShape &params_shape, const ParamsT *params_data,
+ const RuntimeShape &indices_shape, const IndicesT *indices_data,
+ const RuntimeShape &output_shape, ParamsT *output_data)
+{
+ ruy::profiler::ScopeLabel label("GatherNd");
+
+ const GatherNdHelperResult res = GatherNdHelper(params_shape, indices_shape);
+ for (int i = 0; i < res.n_slices; ++i)
+ {
+ int from_pos = 0;
+ for (int j = 0; j < res.indices_nd; ++j)
+ {
+ from_pos += indices_data[i * res.indices_nd + j] * res.dims_to_count[j];
+ }
+ std::memcpy(output_data + i * res.slice_size, params_data + from_pos,
+ sizeof(ParamsT) * res.slice_size);
+ }
+}
+
+#ifndef TF_LITE_STATIC_MEMORY
+template <typename IndicesT = int32>
+inline void GatherNdString(const RuntimeShape &params_shape, const TfLiteTensor *params_data,
+ const RuntimeShape &indices_shape, const IndicesT *indices_data,
+ const RuntimeShape &output_shape, TfLiteTensor *output_data)
+{
+ ruy::profiler::ScopeLabel label("GatherNdString");
+
+ const GatherNdHelperResult res = GatherNdHelper(params_shape, indices_shape);
+ DynamicBuffer buffer;
+ for (int i = 0; i < res.n_slices; ++i)
+ {
+ int from_pos = 0;
+ for (int j = 0; j < res.indices_nd; ++j)
+ {
+ from_pos += indices_data[i * res.indices_nd + j] * res.dims_to_count[j];
+ }
+ for (int j = 0; j < res.slice_size; ++j)
+ {
+ buffer.AddString(GetString(params_data, from_pos + j));
+ }
+ }
+ buffer.WriteToTensor(output_data, /*new_shape=*/nullptr);
+}
+#endif
+
+template <typename IndicesT, typename UpdatesT>
+inline void ScatterNd(const RuntimeShape &indices_shape, const IndicesT *indices_data,
+ const RuntimeShape &updates_shape, const UpdatesT *updates_data,
+ const RuntimeShape &output_shape, UpdatesT *output_data)
+{
+ ruy::profiler::ScopeLabel label("ScatterNd");
+
+ int n_slices = 1;
+ int slice_size = 1;
+ const int outer_dims = indices_shape.DimensionsCount() - 1;
+ const int indices_nd = indices_shape.Dims(outer_dims);
+ const int updates_dims = updates_shape.DimensionsCount();
+ for (int i = 0; i < outer_dims; ++i)
+ {
+ n_slices *= indices_shape.Dims(i);
+ }
+ for (int i = outer_dims; i < updates_dims; ++i)
+ {
+ slice_size *= updates_shape.Dims(i);
+ }
+
+ int output_flat_size = output_shape.FlatSize();
+ int remain_flat_size = output_flat_size;
+ std::vector<int> dims_to_count(indices_nd, 0);
+ for (int i = 0; i < indices_nd; ++i)
+ {
+ dims_to_count[i] = remain_flat_size / output_shape.Dims(i);
+ remain_flat_size = dims_to_count[i];
+ }
+
+ memset(output_data, 0, sizeof(UpdatesT) * output_flat_size);
+ for (int i = 0; i < n_slices; ++i)
+ {
+ int to_pos = 0;
+ for (int j = 0; j < indices_nd; ++j)
+ {
+ IndicesT idx = indices_data[i * indices_nd + j];
+ TFLITE_DCHECK(0 <= idx && idx < output_shape.Dims(j));
+ to_pos += idx * dims_to_count[j];
+ }
+ for (int j = 0; j < slice_size; j++)
+ {
+ output_data[to_pos + j] += updates_data[i * slice_size + j];
+ }
+ }
+}
+
+template <typename T>
+inline void Slice(const tflite::SliceParams &op_params, const RuntimeShape &input_shape,
+ const RuntimeShape &output_shape, SequentialTensorWriter<T> *writer)
+{
+ const RuntimeShape ext_shape = RuntimeShape::ExtendedShape(5, input_shape);
+ TFLITE_DCHECK_LE(op_params.begin_count, 5);
+ TFLITE_DCHECK_LE(op_params.size_count, 5);
+ const int begin_count = op_params.begin_count;
+ const int size_count = op_params.size_count;
+ // We front-pad the begin and size vectors.
+ std::array<int, 5> start;
+ std::array<int, 5> stop;
+ for (int i = 0; i < 5; ++i)
+ {
+ int padded_i = 5 - i;
+ start[i] = begin_count < padded_i ? 0 : op_params.begin[begin_count - padded_i];
+ stop[i] = (size_count < padded_i || op_params.size[size_count - padded_i] == -1)
+ ? ext_shape.Dims(i)
+ : start[i] + op_params.size[size_count - padded_i];
+ }
+
+ for (int i0 = start[0]; i0 < stop[0]; ++i0)
+ {
+ for (int i1 = start[1]; i1 < stop[1]; ++i1)
+ {
+ for (int i2 = start[2]; i2 < stop[2]; ++i2)
+ {
+ for (int i3 = start[3]; i3 < stop[3]; ++i3)
+ {
+ for (int i4 = start[4]; i4 < stop[4]; ++i4)
+ {
+ writer->Write(Offset(ext_shape, i0, i1, i2, i3, i4));
+ }
+ }
+ }
+ }
+ }
+}
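+
+// Example (illustrative only): slicing a rank-2 input of shape {4, 6} with
+// begin = {1, 2} and size = {2, 3} first extends the shape to {1, 1, 1, 4, 6};
+// the three leading (padded) dimensions get start = 0 and stop = 1, while the
+// real dimensions get start = {1, 2} and stop = {3, 5}. The writer then visits
+// the 2 x 3 block of offsets in row-major order. A size entry of -1 means
+// "to the end of that dimension", as handled above.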
+
+template <typename T>
+inline void Slice(const tflite::SliceParams &op_params, const RuntimeShape &input_shape,
+ const T *input_data, const RuntimeShape &output_shape, T *output_data)
+{
+ SequentialTensorWriter<T> writer(input_data, output_data);
+ return Slice(op_params, input_shape, output_shape, &writer);
+}
+
+template <typename T>
+inline void Slice(const tflite::SliceParams &op_params, const RuntimeShape &input_shape,
+ const TfLiteTensor *input, const RuntimeShape &output_shape, TfLiteTensor *output)
+{
+ SequentialTensorWriter<T> writer(input, output);
+ return Slice(op_params, input_shape, output_shape, &writer);
+}
+
+template <typename T>
+void Minimum(const RuntimeShape &input1_shape, const T *input1_data, const T *input2_data,
+ const RuntimeShape &output_shape, T *output_data)
+{
+ const int flat_size = MatchingFlatSize(input1_shape, output_shape);
+
+ auto min_value = input2_data[0];
+ for (int i = 0; i < flat_size; i++)
+ {
+ output_data[i] = input1_data[i] > min_value ? min_value : input1_data[i];
+ }
+}
+
+// Convenience version that allows, for example, generated-code calls to be
+// the same as other binary ops.
+template <typename T>
+inline void Minimum(const RuntimeShape &input1_shape, const T *input1_data, const RuntimeShape &,
+ const T *input2_data, const RuntimeShape &output_shape, T *output_data)
+{
+ // Drop shape of second input: not needed.
+ Minimum(input1_shape, input1_data, input2_data, output_shape, output_data);
+}
+
+template <typename T>
+void Maximum(const RuntimeShape &input1_shape, const T *input1_data, const T *input2_data,
+ const RuntimeShape &output_shape, T *output_data)
+{
+ const int flat_size = MatchingFlatSize(input1_shape, output_shape);
+
+ auto max_value = input2_data[0];
+ for (int i = 0; i < flat_size; i++)
+ {
+ output_data[i] = input1_data[i] < max_value ? max_value : input1_data[i];
+ }
+}
+
+// Convenience version that allows, for example, generated-code calls to be
+// the same as other binary ops.
+template <typename T>
+inline void Maximum(const RuntimeShape &input1_shape, const T *input1_data, const RuntimeShape &,
+ const T *input2_data, const RuntimeShape &output_shape, T *output_data)
+{
+ // Drop shape of second input: not needed.
+ Maximum(input1_shape, input1_data, input2_data, output_shape, output_data);
+}
+
+template <typename T1, typename T2, typename T3>
+void ArgMax(const RuntimeShape &input1_shape, const T1 *input1_data, const T3 *input2_data,
+ const RuntimeShape &output_shape, T2 *output_data)
+{
+ ArgMinMax(input1_shape, input1_data, input2_data, output_shape, output_data, std::greater<T1>());
+}
+
+// Convenience version that allows, for example, generated-code calls to be
+// the same as other binary ops.
+template <typename T1, typename T2, typename T3>
+inline void ArgMax(const RuntimeShape &input1_shape, const T1 *input1_data,
+ const RuntimeShape &input2_shape, const T3 *input2_data,
+ const RuntimeShape &output_shape, T2 *output_data)
+{
+ // Drop shape of second input: not needed.
+ ArgMax(input1_shape, input1_data, input2_data, output_shape, output_data);
+}
+
+template <typename D, typename T>
+void Select(const RuntimeShape &input_condition_shape, const D *input_condition_data,
+ const RuntimeShape &input_x_shape, const T *input_x_data,
+ const RuntimeShape &input_y_shape, const T *input_y_data,
+ const RuntimeShape &output_shape, T *output_data)
+{
+ int64_t flatsize;
+ // Allow the select operator to run on mixed scalar and single-element
+ // tensors.
+ if (input_condition_shape.FlatSize() == 1 && input_x_shape.FlatSize() == 1 &&
+ input_y_shape.FlatSize() == 1 && output_shape.FlatSize() == 1)
+ {
+ flatsize = 1;
+ }
+ else
+ {
+ flatsize = MatchingFlatSize(input_condition_shape, input_x_shape, input_y_shape, output_shape);
+ }
+ for (int64_t i = 0; i < flatsize; ++i)
+ {
+ output_data[i] = input_condition_data[i] ? input_x_data[i] : input_y_data[i];
+ }
+}
+
+template <typename D, typename T>
+void RankOneSelect(const RuntimeShape &input_condition_shape, const D *input_condition_data,
+ const RuntimeShape &input_x_shape, const T *input_x_data,
+ const RuntimeShape &input_y_shape, const T *input_y_data,
+ const RuntimeShape &output_shape, T *output_data)
+{
+ const int64_t outer_size = input_condition_shape.FlatSize();
+ int64_t inner_size;
+ if (input_condition_shape.DimensionsCount() == 0)
+ {
+ inner_size = MatchingFlatSize(input_x_shape, input_y_shape, output_shape);
+ }
+ else
+ {
+ TFLITE_DCHECK_EQ(MatchingDim(input_x_shape, 0, input_y_shape, 0, output_shape, 0), outer_size);
+ inner_size = MatchingFlatSizeSkipDim(input_x_shape, 0, input_y_shape, output_shape);
+ }
+
+ int64_t offset = 0;
+ for (int64_t i = 0; i < outer_size; i++)
+ {
+ const T *input_data = input_condition_data[i] ? input_x_data : input_y_data;
+ memcpy(output_data + offset, input_data + offset, inner_size * sizeof(T));
+ offset += inner_size;
+ }
+}
+
+template <typename D, typename T>
+void BroadcastSelect4DSlow(const RuntimeShape &input_condition_shape, const D *input_condition_data,
+ const RuntimeShape &input_x_shape, const T *input_x_data,
+ const RuntimeShape &input_y_shape, const T *input_y_data,
+ const RuntimeShape &output_shape, T *output_data)
+{
+ TFLITE_DCHECK_LE(input_condition_shape.DimensionsCount(), 4);
+ TFLITE_DCHECK_LE(input_x_shape.DimensionsCount(), 4);
+ TFLITE_DCHECK_LE(input_y_shape.DimensionsCount(), 4);
+ TFLITE_DCHECK_LE(output_shape.DimensionsCount(), 4);
+
+ const RuntimeShape extended_output_shape = RuntimeShape::ExtendedShape(4, output_shape);
+
+ NdArrayDesc<4> desc_condition;
+ NdArrayDesc<4> desc_x;
+ NdArrayDesc<4> desc_y;
+ NdArrayDescsForElementwiseBroadcast(input_condition_shape, input_x_shape, input_y_shape,
+ &desc_condition, &desc_x, &desc_y);
+
+ // In TensorFlow, the dimensions are canonically named (batch_number, row,
+ // col, channel), with extents (batches, height, width, depth), with the
+ // trailing dimension changing most rapidly (channels has the smallest
+ // stride, typically 1 element).
+ //
+ // In generated C code, we store arrays with the dimensions reversed. The
+ // first dimension has smallest stride.
+ //
+ // We name our variables by their TensorFlow convention, but generate C code
+ // nesting loops such that the innermost loop has the smallest stride for
+ // the best cache behavior.
+ for (int b = 0; b < extended_output_shape.Dims(0); ++b)
+ {
+ for (int y = 0; y < extended_output_shape.Dims(1); ++y)
+ {
+ for (int x = 0; x < extended_output_shape.Dims(2); ++x)
+ {
+ for (int c = 0; c < extended_output_shape.Dims(3); ++c)
+ {
+ const int condition_index = SubscriptToIndex(desc_condition, b, y, x, c);
+ const int x_index = SubscriptToIndex(desc_x, b, y, x, c);
+ const int y_index = SubscriptToIndex(desc_y, b, y, x, c);
+ output_data[Offset(extended_output_shape, b, y, x, c)] =
+ input_condition_data[condition_index] ? input_x_data[x_index] : input_y_data[y_index];
+ }
+ }
+ }
+ }
+}
+
+template <typename D, typename T>
+void SelectTrueCoords(const RuntimeShape &input_condition_shape, const D *input_condition_data,
+ T *output_data)
+{
+ const size_t size = input_condition_shape.FlatSize();
+ if (size == 0)
+ {
+ // The condition tensor is empty, so there is nothing to output.
+ return;
+ }
+ const size_t cond_rank = input_condition_shape.DimensionsCount();
+
+ std::vector<int> dims_to_count(cond_rank, 0);
+ int cur_flat_size = size;
+ for (int i = 0; i < cond_rank; ++i)
+ {
+ dims_to_count[i] = cur_flat_size / input_condition_shape.Dims(i);
+ cur_flat_size = dims_to_count[i];
+ }
+
+ int output_index = 0;
+ for (int i = 0; i < size; ++i)
+ {
+ if (input_condition_data[i])
+ {
+ // Insert the coordinate of the current item (row major) into output.
+ int flat_index = i;
+ for (int j = 0; j < cond_rank; ++j)
+ {
+ int coord_j = flat_index / dims_to_count[j];
+ output_data[output_index * cond_rank + j] = coord_j;
+ flat_index %= dims_to_count[j];
+ }
+ output_index++;
+ }
+ }
+}
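+
+// Example (illustrative only): for a condition tensor of shape {2, 3},
+// dims_to_count = {3, 1}. If input_condition_data[4] is true, its flat index 4
+// decomposes as 4 / 3 = 1 and (4 % 3) / 1 = 1, so the coordinate {1, 1} is
+// appended to the output, one cond_rank-sized row per true element.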
+
+// For ease of implementation, the indices are always given as a vector of size-4 vectors.
+template <typename T, typename TI>
+inline void SparseToDense(const std::vector<std::vector<TI>> &indices, const T *values,
+ T default_value, bool value_is_scalar,
+ const RuntimeShape &unextended_output_shape, T *output_data)
+{
+ TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), 4);
+ const RuntimeShape output_shape = RuntimeShape::ExtendedShape(4, unextended_output_shape);
+ const int value_count = indices.size();
+
+ // First fill the output_data with default value.
+ const int num_elements = output_shape.FlatSize();
+ for (int i = 0; i < num_elements; ++i)
+ {
+ output_data[i] = default_value;
+ }
+
+ // Handle the scalar-value case separately to avoid checking the boolean
+ // condition inside the loop on every iteration.
+ if (value_is_scalar)
+ {
+ for (int i = 0; i < value_count; ++i)
+ {
+ const std::vector<TI> &index = indices[i];
+ TFLITE_DCHECK_EQ(index.size(), 4);
+ const T value = *values; // just use the first value.
+ output_data[Offset(output_shape, index[0], index[1], index[2], index[3])] = value;
+ }
+ return;
+ }
+
+ // Go through the values and indices to fill the sparse values.
+ for (int i = 0; i < value_count; ++i)
+ {
+ const std::vector<TI> &index = indices[i];
+ TFLITE_DCHECK_EQ(index.size(), 4);
+ const T value = values[i];
+ output_data[Offset(output_shape, index[0], index[1], index[2], index[3])] = value;
+ }
+}
+
+template <typename T>
+inline void Pow(const RuntimeShape &input1_shape, const T *input1_data,
+ const RuntimeShape &input2_shape, const T *input2_data,
+ const RuntimeShape &output_shape, T *output_data)
+{
+ const int flat_size = MatchingFlatSize(input1_shape, input2_shape, output_shape);
+ for (int i = 0; i < flat_size; ++i)
+ {
+ output_data[i] = std::pow(input1_data[i], input2_data[i]);
+ }
+}
+
+template <typename T>
+inline void BroadcastPow4DSlow(const RuntimeShape &unextended_input1_shape, const T *input1_data,
+ const RuntimeShape &unextended_input2_shape, const T *input2_data,
+ const RuntimeShape &unextended_output_shape, T *output_data)
+{
+ TFLITE_DCHECK_LE(unextended_input1_shape.DimensionsCount(), 4);
+ TFLITE_DCHECK_LE(unextended_input2_shape.DimensionsCount(), 4);
+ TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), 4);
+ const RuntimeShape output_shape = RuntimeShape::ExtendedShape(4, unextended_output_shape);
+
+ NdArrayDesc<4> desc1;
+ NdArrayDesc<4> desc2;
+ NdArrayDescsForElementwiseBroadcast(unextended_input1_shape, unextended_input2_shape, &desc1,
+ &desc2);
+
+ for (int b = 0; b < output_shape.Dims(0); ++b)
+ {
+ for (int y = 0; y < output_shape.Dims(1); ++y)
+ {
+ for (int x = 0; x < output_shape.Dims(2); ++x)
+ {
+ for (int c = 0; c < output_shape.Dims(3); ++c)
+ {
+ auto out_idx = Offset(output_shape, b, y, x, c);
+ auto in1_idx = SubscriptToIndex(desc1, b, y, x, c);
+ auto in2_idx = SubscriptToIndex(desc2, b, y, x, c);
+ auto in1_val = input1_data[in1_idx];
+ auto in2_val = input2_data[in2_idx];
+ output_data[out_idx] = std::pow(in1_val, in2_val);
+ }
+ }
+ }
+ }
+}
+
+template <typename Scalar>
+void Reverse(int axis, const RuntimeShape &input_shape, const Scalar *input_data,
+ const RuntimeShape &output_shape, Scalar *output_data)
+{
+ ruy::profiler::ScopeLabel label("Reverse");
+
+ int outer_size = 1;
+ for (int i = 0; i < axis; ++i)
+ {
+ outer_size *= input_shape.Dims(i);
+ }
+
+ int copy_size = 1;
+ for (int i = axis + 1; i < input_shape.DimensionsCount(); ++i)
+ {
+ copy_size *= input_shape.Dims(i);
+ }
+
+ const int dims_at_axis = input_shape.Dims(axis);
+ for (int i = 0; i < outer_size; ++i)
+ {
+ for (int j = 0; j < dims_at_axis; ++j)
+ {
+ const int start_pos = (i * dims_at_axis + j) * copy_size;
+ Scalar *output_ptr = output_data + start_pos;
+ int loc = (i * dims_at_axis + dims_at_axis - j - 1) * copy_size;
+ memcpy(output_ptr, input_data + loc, copy_size * sizeof(Scalar));
+ }
+ }
+}
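+
+// Example (illustrative only): reversing axis 1 of a {2, 3, 2} tensor gives
+// outer_size = 2, dims_at_axis = 3 and copy_size = 2; for the first outer
+// slice, output rows 0, 1, 2 are copied from input rows 2, 1, 0 respectively,
+// each row being a contiguous block of 2 scalars.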
+
+template <typename Scalar, typename TS>
+void ReverseSequence(const TS *seq_lengths, const int seq_dim, const int batch_dim,
+ const RuntimeShape &input_shape, const Scalar *input_data,
+ const RuntimeShape &output_shape, Scalar *output_data)
+{
+ ruy::profiler::ScopeLabel label("ReverseSequence");
+
+ int outer_size = 1;
+ int outer_dim = std::min(batch_dim, seq_dim);
+ int medium_dim = std::max(batch_dim, seq_dim);
+ for (int i = 0; i < outer_dim; ++i)
+ {
+ outer_size *= input_shape.Dims(i);
+ }
+
+ int medium_size = 1;
+ for (int i = outer_dim + 1; i < medium_dim; ++i)
+ {
+ medium_size *= input_shape.Dims(i);
+ }
+
+ int copy_size = 1;
+ for (int i = medium_dim + 1; i < input_shape.DimensionsCount(); ++i)
+ {
+ copy_size *= input_shape.Dims(i);
+ }
+
+ const int dims_at_outer_dim = input_shape.Dims(outer_dim);
+ const int dims_at_medium_dim = input_shape.Dims(medium_dim);
+
+ Scalar *output_ptr;
+ if (batch_dim > seq_dim)
+ {
+ for (int i = 0; i < outer_size; ++i)
+ {
+ for (int j = 0; j < dims_at_outer_dim; ++j)
+ {
+ const int in_pos_base = (i * dims_at_outer_dim + j) * medium_size;
+ for (int p = 0; p < medium_size; ++p)
+ {
+ for (int q = 0; q < dims_at_medium_dim; ++q)
+ {
+ const int in_pos = ((in_pos_base + p) * dims_at_medium_dim + q) * copy_size;
+ const Scalar *in_ptr = input_data + in_pos;
+ int sl = seq_lengths[q] - 1;
+ if (j > sl)
+ {
+ output_ptr = output_data + in_pos;
+ }
+ else
+ {
+ const int out_pos_base = (i * dims_at_outer_dim + sl - j) * medium_size;
+ const int out_pos = ((out_pos_base + p) * dims_at_medium_dim + q) * copy_size;
+ output_ptr = output_data + out_pos;
+ }
+ memcpy(output_ptr, in_ptr, copy_size * sizeof(Scalar));
+ }
+ }
+ }
+ }
+ }
+ else if (batch_dim < seq_dim)
+ {
+ for (int i = 0; i < outer_size; ++i)
+ {
+ for (int j = 0; j < dims_at_outer_dim; ++j)
+ {
+ const int in_pos_base = (i * dims_at_outer_dim + j) * medium_size;
+ int sl = seq_lengths[j] - 1;
+ const int out_pos_base = (i * dims_at_outer_dim + j) * medium_size;
+ for (int p = 0; p < medium_size; ++p)
+ {
+ for (int q = 0; q < dims_at_medium_dim; ++q)
+ {
+ const int in_pos = ((in_pos_base + p) * dims_at_medium_dim + q) * copy_size;
+ const Scalar *in_ptr = input_data + in_pos;
+ if (q > sl)
+ {
+ output_ptr = output_data + in_pos;
+ }
+ else
+ {
+ const int out_pos = ((out_pos_base + p) * dims_at_medium_dim + sl - q) * copy_size;
+ output_ptr = output_data + out_pos;
+ }
+ memcpy(output_ptr, in_ptr, copy_size * sizeof(Scalar));
+ }
+ }
+ }
+ }
+ }
+}
+
+template <typename T>
+inline void SegmentSum(const RuntimeShape &input_shape, const T *input_data,
+ const RuntimeShape &segment_ids_shape, const int32_t *segment_ids_data,
+ const RuntimeShape &output_shape, T *output_data)
+{
+ const int segment_flat_size = MatchingFlatSizeSkipDim(input_shape, 0, output_shape);
+
+ memset(output_data, 0, sizeof(T) * output_shape.FlatSize());
+
+ for (int i = 0; i < input_shape.Dims(0); i++)
+ {
+ int output_index = segment_ids_data[i];
+ for (int j = 0; j < segment_flat_size; ++j)
+ {
+ output_data[output_index * segment_flat_size + j] += input_data[i * segment_flat_size + j];
+ }
+ }
+}
+
+} // namespace reference_ops
+} // namespace tflite
+
+#endif // LUCI_INTERPRETER_PAL_REFERENCE_OPS_H
diff --git a/compiler/luci-interpreter/src/core/KernelParams.h b/compiler/luci-interpreter/src/core/KernelParams.h
index 958fd4b74..6c0220c62 100644
--- a/compiler/luci-interpreter/src/core/KernelParams.h
+++ b/compiler/luci-interpreter/src/core/KernelParams.h
@@ -170,6 +170,11 @@ struct ResizeNearestNeighborParams
bool half_pixel_centers;
};
+struct ShapeParams
+{
+ loco::DataType out_type;
+};
+
struct SubParams
{
Activation activation;
diff --git a/compiler/luci-interpreter/src/kernels/Fill.cpp b/compiler/luci-interpreter/src/kernels/Fill.cpp
new file mode 100644
index 000000000..e09d6331a
--- /dev/null
+++ b/compiler/luci-interpreter/src/kernels/Fill.cpp
@@ -0,0 +1,117 @@
+/*
+ * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/Fill.h"
+#include "kernels/Utils.h"
+#include "tensorflow/lite/kernels/internal/reference/reference_ops.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+
+Fill::Fill(const Tensor *dims, const Tensor *value, Tensor *output)
+ : Kernel({dims, value}, {output})
+{
+}
+
+template <typename T> void Fill::configureShape()
+{
+ const auto dims_data = getTensorData<T>(dims());
+ Shape output_shape(dims()->shape().dim(0));
+
+ for (int i = 0; i < output_shape.num_dims(); ++i)
+ {
+ T data = dims_data[i];
+ if (data < 0)
+ throw std::runtime_error("Fill dimensions must be >= 0");
+
+ output_shape.dim(i) = data;
+ }
+
+ output()->resize(output_shape);
+}
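+
+// Example (illustrative only): a dims tensor holding {2, 3} resizes the output
+// to shape {2, 3}; execute() then broadcasts the scalar value tensor into all
+// six output elements via tflite::reference_ops::Fill.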
+
+void Fill::configure()
+{
+ const auto dims_shape = dims()->shape();
+ const auto value_shape = value()->shape();
+
+ // Make sure the 1st input tensor is 1-D
+ LUCI_INTERPRETER_CHECK(dims_shape.num_dims() == 1);
+
+ // Make sure the 1st input tensor is int32 or int64
+ LUCI_INTERPRETER_CHECK(dims()->element_type() == DataType::S32 or
+ dims()->element_type() == DataType::S64);
+
+ // Make sure the 2nd input tensor is a scalar
+ LUCI_INTERPRETER_CHECK(value_shape.num_dims() == 0);
+
+ // Check zero point and scale for S16 and S8
+ if (value()->element_type() == loco::DataType::S16 or
+ value()->element_type() == loco::DataType::S8)
+ {
+ LUCI_INTERPRETER_CHECK(value()->scale() == output()->scale());
+ LUCI_INTERPRETER_CHECK(value()->zero_point() == output()->zero_point());
+
+ if (value()->element_type() == loco::DataType::S16)
+ LUCI_INTERPRETER_CHECK(value()->zero_point() == 0);
+ }
+ // Resize output
+ switch (dims()->element_type())
+ {
+ case DataType::S32:
+ configureShape<int32_t>();
+ break;
+ case DataType::S64:
+ configureShape<int64_t>();
+ break;
+ default:
+ throw std::runtime_error("Unsupported type.");
+ }
+}
+
+void Fill::execute() const
+{
+ switch (output()->element_type())
+ {
+ case DataType::S8:
+ tflite::reference_ops::Fill(getTensorShape(value()), getTensorData<int8_t>(value()),
+ getTensorShape(output()), getTensorData<int8_t>(output()));
+ break;
+ case DataType::S16:
+ tflite::reference_ops::Fill(getTensorShape(value()), getTensorData<int16_t>(value()),
+ getTensorShape(output()), getTensorData<int16_t>(output()));
+ break;
+ case DataType::S32:
+ tflite::reference_ops::Fill(getTensorShape(value()), getTensorData<int32_t>(value()),
+ getTensorShape(output()), getTensorData<int32_t>(output()));
+ break;
+ case DataType::S64:
+ tflite::reference_ops::Fill(getTensorShape(value()), getTensorData<int64_t>(value()),
+ getTensorShape(output()), getTensorData<int64_t>(output()));
+ break;
+ case DataType::FLOAT32:
+ tflite::reference_ops::Fill(getTensorShape(value()), getTensorData<float>(value()),
+ getTensorShape(output()), getTensorData<float>(output()));
+ break;
+ default:
+ throw std::runtime_error("Unsupported type.");
+ }
+}
+
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-interpreter/src/kernels/Fill.h b/compiler/luci-interpreter/src/kernels/Fill.h
new file mode 100644
index 000000000..184f0cb83
--- /dev/null
+++ b/compiler/luci-interpreter/src/kernels/Fill.h
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_KERNELS_FILL_H
+#define LUCI_INTERPRETER_KERNELS_FILL_H
+
+#include "core/Kernel.h"
+#include "core/KernelParams.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+
+class Fill : public Kernel
+{
+public:
+ Fill(const Tensor *dims, const Tensor *value, Tensor *output);
+
+ const Tensor *dims() const { return _inputs[0]; }
+ const Tensor *value() const { return _inputs[1]; }
+ Tensor *output() const { return _outputs[0]; }
+
+ void configure() override;
+ void execute() const override;
+
+private:
+ template <typename T> void configureShape();
+};
+
+} // namespace kernels
+} // namespace luci_interpreter
+
+#endif // LUCI_INTERPRETER_KERNELS_FILL_H
diff --git a/compiler/luci-interpreter/src/kernels/Fill.test.cpp b/compiler/luci-interpreter/src/kernels/Fill.test.cpp
new file mode 100644
index 000000000..cf56df507
--- /dev/null
+++ b/compiler/luci-interpreter/src/kernels/Fill.test.cpp
@@ -0,0 +1,169 @@
+/*
+ * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/Fill.h"
+#include "kernels/TestUtils.h"
+#include "luci_interpreter/TestMemoryManager.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+namespace
+{
+
+using namespace testing;
+
+class FillTest : public ::testing::Test
+{
+protected:
+ void SetUp() override { _memory_manager = std::make_unique<TestMemoryManager>(); }
+
+ std::unique_ptr<IMemoryManager> _memory_manager;
+};
+
+template <typename T, DataType DT> void runFillIntKernel(IMemoryManager *memory_manager)
+{
+ Shape dims_shape{2};
+
+ std::vector<int32_t> dims_data = {2, 3};
+ std::vector<T> value_data = {5};
+
+ Tensor dims = makeInputTensor<loco::DataType::S32>(dims_shape, dims_data, memory_manager);
+ Tensor value = makeInputTensor<DT>(/*scalar*/ {}, value_data, memory_manager);
+
+ Tensor output_tensor = makeOutputTensor(DT);
+
+ Fill kernel(&dims, &value, &output_tensor);
+
+ kernel.configure();
+ memory_manager->allocate_memory(output_tensor);
+ kernel.execute();
+
+ std::vector<T> ref_output_data{5, 5, 5, 5, 5, 5};
+ EXPECT_THAT(extractTensorData<T>(output_tensor), ref_output_data);
+
+ std::vector<int32_t> ref_output_shape{2, 3};
+ EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(ref_output_shape));
+}
+
+template <DataType DT> void runFillQuantIntKernel(IMemoryManager *memory_manager)
+{
+ Shape dims_shape{2};
+
+ std::vector<int32_t> dims_data = {2, 3};
+ std::vector<float> value_data = {5};
+
+ int32_t zero_point = 0;
+
+ if (DT == loco::DataType::S8)
+ zero_point = 1;
+
+ Tensor dims = makeInputTensor<loco::DataType::S32>(dims_shape, dims_data, memory_manager);
+ Tensor value = makeInputTensor<DT>(/*scalar*/ {}, /*scale*/ 0.25, /*zero_point*/ zero_point,
+ value_data, memory_manager);
+
+ Tensor output_tensor = makeOutputTensor(DT, /*scale*/ 0.25, /*zero_point*/ zero_point);
+
+ Fill kernel(&dims, &value, &output_tensor);
+
+ kernel.configure();
+ memory_manager->allocate_memory(output_tensor);
+ kernel.execute();
+
+ std::vector<float> ref_output_data{5, 5, 5, 5, 5, 5};
+ EXPECT_THAT(dequantizeTensorData(output_tensor), FloatArrayNear(ref_output_data));
+
+ std::vector<int32_t> ref_output_shape{2, 3};
+ EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(ref_output_shape));
+}
+
+TEST_F(FillTest, FillInt)
+{
+ // Run for int32_t input
+ runFillIntKernel<int32_t, loco::DataType::S32>(_memory_manager.get());
+ // Run for int64_t input
+ runFillIntKernel<int64_t, loco::DataType::S64>(_memory_manager.get());
+ // Run for int8_t input
+ runFillQuantIntKernel<loco::DataType::S8>(_memory_manager.get());
+ // Run for int16_t input
+ runFillQuantIntKernel<loco::DataType::S16>(_memory_manager.get());
+
+ SUCCEED();
+}
+
+TEST_F(FillTest, FillFloat)
+{
+ Shape dims_shape{3};
+
+ std::vector<int64_t> dims_data = {2, 2, 2};
+ std::vector<float> value_data = {5};
+
+ Tensor dims = makeInputTensor<loco::DataType::S64>(dims_shape, dims_data, _memory_manager.get());
+ Tensor value =
+ makeInputTensor<loco::DataType::FLOAT32>(/*scalar*/ {}, value_data, _memory_manager.get());
+
+ Tensor output_tensor = makeOutputTensor(loco::DataType::FLOAT32);
+
+ Fill kernel(&dims, &value, &output_tensor);
+
+ kernel.configure();
+ _memory_manager->allocate_memory(output_tensor);
+ kernel.execute();
+
+ std::vector<float> ref_output_data{5, 5, 5, 5, 5, 5, 5, 5};
+
+ std::vector<int32_t> ref_output_shape{2, 2, 2};
+ EXPECT_THAT(extractTensorData<float>(output_tensor), ref_output_data);
+ EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(ref_output_shape));
+}
+
+TEST_F(FillTest, Invalid_Input_Shape_NEG)
+{
+ Shape dims_shape{1, 3};
+
+ std::vector<int32_t> dims_data = {2, 2, 2};
+ std::vector<float> value_data = {5};
+
+ Tensor dims = makeInputTensor<loco::DataType::S32>(dims_shape, dims_data, _memory_manager.get());
+ Tensor value =
+ makeInputTensor<loco::DataType::FLOAT32>(/*scalar*/ {}, value_data, _memory_manager.get());
+
+ Tensor output_tensor = makeOutputTensor(loco::DataType::FLOAT32);
+
+ Fill kernel(&dims, &value, &output_tensor);
+ EXPECT_ANY_THROW(kernel.configure());
+}
+
+TEST_F(FillTest, Invalid_Value_Shape_NEG)
+{
+ Shape dims_shape{3};
+
+ std::vector<int32_t> dims_data = {2, 2, 2};
+ std::vector<float> value_data = {5};
+
+ Tensor dims = makeInputTensor<loco::DataType::S32>(dims_shape, dims_data, _memory_manager.get());
+ Tensor value = makeInputTensor<loco::DataType::FLOAT32>({1}, value_data, _memory_manager.get());
+
+ Tensor output_tensor = makeOutputTensor(loco::DataType::FLOAT32);
+
+ Fill kernel(&dims, &value, &output_tensor);
+ EXPECT_ANY_THROW(kernel.configure());
+}
+
+} // namespace
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-interpreter/src/kernels/MirrorPad.cpp b/compiler/luci-interpreter/src/kernels/MirrorPad.cpp
index 2fbeefce4..bae1eac70 100644
--- a/compiler/luci-interpreter/src/kernels/MirrorPad.cpp
+++ b/compiler/luci-interpreter/src/kernels/MirrorPad.cpp
@@ -19,6 +19,8 @@
#include "kernels/Utils.h"
+#include <limits>
+
namespace luci_interpreter
{
namespace kernels
diff --git a/compiler/luci-interpreter/src/kernels/Pack.cpp b/compiler/luci-interpreter/src/kernels/Pack.cpp
index 6fee93890..42aab330c 100644
--- a/compiler/luci-interpreter/src/kernels/Pack.cpp
+++ b/compiler/luci-interpreter/src/kernels/Pack.cpp
@@ -76,9 +76,8 @@ void Pack::configure()
}
}
- if (t0->element_type() == DataType::S32 || t0->element_type() == DataType::U8 ||
- t0->element_type() == DataType::S8 || t0->element_type() == DataType::S16 ||
- t0->element_type() == DataType::S64)
+ if (t0->element_type() == DataType::U8 || t0->element_type() == DataType::S8 ||
+ t0->element_type() == DataType::S16)
{
LUCI_INTERPRETER_CHECK(output()->zero_point() == t0->zero_point());
LUCI_INTERPRETER_CHECK(output()->scale() == t0->scale());
diff --git a/compiler/luci-interpreter/src/kernels/Pack.test.cpp b/compiler/luci-interpreter/src/kernels/Pack.test.cpp
index 2404e4303..d16320b78 100644
--- a/compiler/luci-interpreter/src/kernels/Pack.test.cpp
+++ b/compiler/luci-interpreter/src/kernels/Pack.test.cpp
@@ -38,18 +38,26 @@ void Check(std::vector<std::initializer_list<int32_t>> input_shapes,
std::vector<Tensor> tmp_inputs;
for (int i = 0; i < input_datas.size(); i++)
{
- if (std::is_same<T, float>::value)
+ if (std::is_same<T, float>::value || std::is_same<T, int32_t>::value ||
+ std::is_same<T, int64_t>::value)
{
tmp_inputs.push_back(Tensor(element_type, input_shapes[i], {}, ""));
memory_manager->allocate_memory(tmp_inputs[i]);
tmp_inputs[i].writeData(input_datas[i].data(), input_datas[i].size() * sizeof(T));
}
- else
+ else if (std::is_same<T, uint8_t>::value || std::is_same<T, int8_t>::value)
{
tmp_inputs.push_back(Tensor(element_type, input_shapes[i], {{1.0f / 255}, {128}}, ""));
memory_manager->allocate_memory(tmp_inputs[i]);
tmp_inputs[i].writeData(input_datas[i].data(), input_datas[i].size() * sizeof(T));
}
+ else
+ {
+ assert((std::is_same<T, int16_t>::value) && "unexpected dtype is tested");
+ tmp_inputs.push_back(Tensor(element_type, input_shapes[i], {{1.0f}, {0}}, ""));
+ memory_manager->allocate_memory(tmp_inputs[i]);
+ tmp_inputs[i].writeData(input_datas[i].data(), input_datas[i].size() * sizeof(T));
+ }
}
for (int i = 0; i < input_datas.size(); i++)
{
@@ -57,10 +65,14 @@ void Check(std::vector<std::initializer_list<int32_t>> input_shapes,
}
Tensor output_tensor = makeOutputTensor(element_type);
- if (!std::is_same<T, float>::value)
+ if (std::is_same<T, uint8_t>::value || std::is_same<T, int8_t>::value)
{
output_tensor = makeOutputTensor(element_type, 1.0f / 255, 128);
}
+ else if (std::is_same<T, int16_t>::value)
+ {
+ output_tensor = makeOutputTensor(element_type, 1.0f, 0);
+ }
PackParams params{};
params.axis = axis;
@@ -79,7 +91,7 @@ template <typename T> class PackTest : public ::testing::Test
{
};
-using DataTypes = ::testing::Types<uint8_t, float>;
+using DataTypes = ::testing::Types<uint8_t, int8_t, int16_t, int32_t, int64_t, float>;
TYPED_TEST_SUITE(PackTest, DataTypes);
TYPED_TEST(PackTest, ThreeInputs)
diff --git a/compiler/luci-interpreter/src/kernels/Pad.cpp b/compiler/luci-interpreter/src/kernels/Pad.cpp
index fe172884b..c07f6e310 100644
--- a/compiler/luci-interpreter/src/kernels/Pad.cpp
+++ b/compiler/luci-interpreter/src/kernels/Pad.cpp
@@ -20,6 +20,8 @@
#include <tensorflow/lite/kernels/internal/reference/pad.h>
+#include <limits>
+
namespace luci_interpreter
{
namespace kernels
diff --git a/compiler/luci-interpreter/src/kernels/PadV2.cpp b/compiler/luci-interpreter/src/kernels/PadV2.cpp
index e90469239..197cdaa69 100644
--- a/compiler/luci-interpreter/src/kernels/PadV2.cpp
+++ b/compiler/luci-interpreter/src/kernels/PadV2.cpp
@@ -20,6 +20,8 @@
#include <tensorflow/lite/kernels/internal/reference/pad.h>
+#include <limits>
+
namespace luci_interpreter
{
namespace kernels
diff --git a/compiler/luci-interpreter/src/kernels/ReduceMax.cpp b/compiler/luci-interpreter/src/kernels/ReduceMax.cpp
new file mode 100644
index 000000000..d58cd1563
--- /dev/null
+++ b/compiler/luci-interpreter/src/kernels/ReduceMax.cpp
@@ -0,0 +1,181 @@
+/*
+ * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/ReduceMax.h"
+
+#include "kernels/Utils.h"
+
+#include <tensorflow/lite/kernels/internal/reference/reduce.h>
+
+#include <stdexcept>
+#include <limits>
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+
+// Returns the number of axes that will be reduced, counting duplicate axes only once.
+static int getAxisReductionCount(const int32_t *axes_data, int num_axes, int input_num_dims)
+{
+ int reduction_count = num_axes;
+ for (int i = 0; i < num_axes; ++i)
+ {
+ int current = axes_data[i] >= 0 ? axes_data[i] : axes_data[i] + input_num_dims;
+ assert(current >= 0 && current < input_num_dims);
+ for (int j = 0; j < i; j++)
+ {
+ int previous = axes_data[j] >= 0 ? axes_data[j] : axes_data[j] + input_num_dims;
+ // This checks for duplicate axis
+ if (current == previous)
+ {
+ --reduction_count;
+ break;
+ }
+ }
+ }
+ return reduction_count;
+}
+
+static Shape getOutputShape(const Shape &input_shape, const int32_t *axes_data, int num_axes,
+ bool keep_dims)
+{
+ int input_num_dims = input_shape.num_dims();
+ if (input_num_dims == 0)
+ {
+ return Shape(0);
+ }
+
+ if (keep_dims)
+ {
+ Shape output_shape(input_num_dims);
+ for (int idx = 0; idx < input_num_dims; ++idx)
+ {
+ bool is_axis = false;
+ for (int axis_idx = 0; axis_idx < num_axes; ++axis_idx)
+ {
+ if (axes_data[axis_idx] == idx || axes_data[axis_idx] + input_num_dims == idx)
+ {
+ is_axis = true;
+ break;
+ }
+ }
+ if (is_axis)
+ {
+ output_shape.dim(idx) = 1;
+ }
+ else
+ {
+ output_shape.dim(idx) = input_shape.dim(idx);
+ }
+ }
+ return output_shape;
+ }
+ else
+ {
+ int num_reduce_axes = getAxisReductionCount(axes_data, num_axes, input_num_dims);
+ Shape output_shape(input_num_dims - num_reduce_axes);
+ int num_skip_axes = 0;
+ for (int idx = 0; idx < input_num_dims; ++idx)
+ {
+ bool is_axis = false;
+ for (int axis_idx = 0; axis_idx < num_axes; ++axis_idx)
+ {
+ if (axes_data[axis_idx] == idx || axes_data[axis_idx] + input_num_dims == idx)
+ {
+ ++num_skip_axes;
+ is_axis = true;
+ break;
+ }
+ }
+ if (!is_axis)
+ {
+ output_shape.dim(idx - num_skip_axes) = input_shape.dim(idx);
+ }
+ }
+ return output_shape;
+ }
+}
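+
+// Example (illustrative only): for an input of shape {4, 3, 2} reduced over
+// axes {0, 2}, this returns {1, 3, 1} when keep_dims is true and {3} when it
+// is false; duplicate axes are counted only once when computing the reduced
+// rank, and negative axes are resolved against the input rank.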
+
+ReduceMax::ReduceMax(const Tensor *input, const Tensor *axes, Tensor *output, Tensor *temp_index,
+ Tensor *resolved_axes, const ReducerParams &params)
+ : KernelWithParams<ReducerParams>({input, axes}, {output, temp_index, resolved_axes}, params)
+{
+}
+
+void ReduceMax::configure()
+{
+ LUCI_INTERPRETER_CHECK(input()->element_type() == output()->element_type());
+ LUCI_INTERPRETER_CHECK(axes()->element_type() == DataType::S32);
+
+ const Shape &input_shape = input()->shape();
+ int input_num_dims = input_shape.num_dims();
+
+ const auto *axes_data = getTensorData<int32_t>(axes());
+ int num_axes = axes()->shape().num_elements();
+ LUCI_INTERPRETER_CHECK(num_axes <= 4);
+
+ // We compute shapes of outputs in configure, assuming that outputs have
+ // static shape
+ // TODO Support dynamic shape
+ Shape output_shape = getOutputShape(input_shape, axes_data, num_axes, _params.keep_dims);
+ output()->resize(output_shape);
+
+ auto temp_index = getOutputTensors()[1];
+ auto resolved_axes = getOutputTensors()[2];
+
+ temp_index->resize(Shape(input_num_dims));
+ resolved_axes->resize(Shape(num_axes));
+}
+
+void ReduceMax::execute() const
+{
+ switch (input()->element_type())
+ {
+ case DataType::FLOAT32:
+ evalFloat();
+ break;
+ // TODO Support quantized kernels
+ default:
+ throw std::runtime_error("Unsupported type.");
+ }
+}
+
+void ReduceMax::evalFloat() const
+{
+ const auto *axes_data = getTensorData<int32_t>(axes());
+ int num_axes = axes()->shape().num_elements();
+
+ auto temp_index = getOutputTensors()[1];
+ auto resolved_axes = getOutputTensors()[2];
+
+ int num_resolved_axis = 0;
+ LUCI_INTERPRETER_CHECK(
+ tflite::reference_ops::ResolveAxis(input()->shape().num_dims(), axes_data, num_axes,
+ getTensorData<int>(resolved_axes), &num_resolved_axis));
+
+ float init_value = std::numeric_limits<float>::lowest();
+ tflite::reference_ops::ReduceGeneric<float>(
+ getTensorData<float>(input()), getTensorShape(input()).DimsData(), input()->shape().num_dims(),
+ getTensorData<float>(output()), getTensorShape(output()).DimsData(),
+ output()->shape().num_dims(), axes_data, num_axes, _params.keep_dims,
+ getTensorData<int>(temp_index), getTensorData<int>(resolved_axes), init_value,
+ [](const float current, const float in) -> float { return (in > current) ? in : current; });
+}
+
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-interpreter/src/kernels/ReduceMax.h b/compiler/luci-interpreter/src/kernels/ReduceMax.h
new file mode 100644
index 000000000..25a66278a
--- /dev/null
+++ b/compiler/luci-interpreter/src/kernels/ReduceMax.h
@@ -0,0 +1,50 @@
+/*
+ * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_KERNELS_REDUCE_MAX_H
+#define LUCI_INTERPRETER_KERNELS_REDUCE_MAX_H
+
+#include "core/Kernel.h"
+#include "core/KernelParams.h"
+
+#include <memory>
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+
+class ReduceMax : public KernelWithParams<ReducerParams>
+{
+public:
+ ReduceMax(const Tensor *input, const Tensor *axes, Tensor *output, Tensor *temp_index,
+ Tensor *resolved_axes, const ReducerParams &params);
+
+ const Tensor *input() const { return _inputs[0]; }
+ const Tensor *axes() const { return _inputs[1]; }
+ Tensor *output() const { return _outputs[0]; }
+
+ void configure() override;
+ void execute() const override;
+
+private:
+ void evalFloat() const;
+};
+
+} // namespace kernels
+} // namespace luci_interpreter
+
+#endif // LUCI_INTERPRETER_KERNELS_REDUCE_MAX_H
diff --git a/compiler/luci-interpreter/src/kernels/ReduceMax.test.cpp b/compiler/luci-interpreter/src/kernels/ReduceMax.test.cpp
new file mode 100644
index 000000000..ab688827b
--- /dev/null
+++ b/compiler/luci-interpreter/src/kernels/ReduceMax.test.cpp
@@ -0,0 +1,103 @@
+/*
+ * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/ReduceMax.h"
+#include "kernels/TestUtils.h"
+#include "luci_interpreter/TestMemoryManager.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+namespace
+{
+
+using namespace testing;
+
+class ReduceMaxTest : public ::testing::Test
+{
+protected:
+ void SetUp() override { _memory_manager = std::make_unique<TestMemoryManager>(); }
+
+ std::unique_ptr<IMemoryManager> _memory_manager;
+};
+
+TEST_F(ReduceMaxTest, FloatNotKeepDims)
+{
+ std::vector<float> input_data = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0,
+ 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
+ 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0};
+
+ std::vector<int32_t> axis_data{1, 0, -3, -3};
+ Tensor input_tensor =
+ makeInputTensor<DataType::FLOAT32>({4, 3, 2}, input_data, _memory_manager.get());
+ Tensor axis_tensor = makeInputTensor<DataType::S32>({4}, axis_data, _memory_manager.get());
+ Tensor temp_index(DataType::S32, Shape({}), {}, "");
+ Tensor resolved_axes(DataType::S32, Shape({}), {}, "");
+ Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
+
+ ReducerParams params{};
+ params.keep_dims = false;
+
+ ReduceMax kernel(&input_tensor, &axis_tensor, &output_tensor, &temp_index, &resolved_axes,
+ params);
+ kernel.configure();
+ _memory_manager->allocate_memory(temp_index);
+ _memory_manager->allocate_memory(resolved_axes);
+ _memory_manager->allocate_memory(output_tensor);
+ kernel.execute();
+
+ std::vector<float> ref_output_data{23, 24};
+ std::initializer_list<int32_t> ref_output_shape{2};
+ EXPECT_THAT(extractTensorData<float>(output_tensor), FloatArrayNear(ref_output_data));
+ EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(ref_output_shape));
+}
+
+TEST_F(ReduceMaxTest, FloatKeepDims)
+{
+ std::vector<float> input_data = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0,
+ 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
+ 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0};
+
+ std::vector<int32_t> axis_data{0, 2};
+ Tensor input_tensor =
+ makeInputTensor<DataType::FLOAT32>({4, 3, 2}, input_data, _memory_manager.get());
+ Tensor axis_tensor = makeInputTensor<DataType::S32>({2}, axis_data, _memory_manager.get());
+ Tensor temp_index(DataType::S32, Shape({}), {}, "");
+ Tensor resolved_axes(DataType::S32, Shape({}), {}, "");
+ Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
+
+ ReducerParams params{};
+ params.keep_dims = true;
+
+ ReduceMax kernel(&input_tensor, &axis_tensor, &output_tensor, &temp_index, &resolved_axes,
+ params);
+ kernel.configure();
+ _memory_manager->allocate_memory(temp_index);
+ _memory_manager->allocate_memory(resolved_axes);
+ _memory_manager->allocate_memory(output_tensor);
+ kernel.execute();
+
+ std::vector<float> ref_output_data{20, 22, 24};
+ std::initializer_list<int32_t> ref_output_shape{1, 3, 1};
+ EXPECT_THAT(extractTensorData<float>(output_tensor), FloatArrayNear(ref_output_data));
+ EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(ref_output_shape));
+}
+
+} // namespace
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-interpreter/src/kernels/Shape.cpp b/compiler/luci-interpreter/src/kernels/Shape.cpp
new file mode 100644
index 000000000..0429fe1e5
--- /dev/null
+++ b/compiler/luci-interpreter/src/kernels/Shape.cpp
@@ -0,0 +1,70 @@
+/*
+ * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/Shape.h"
+#include "kernels/Utils.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+
+ShapeKernel::ShapeKernel(const Tensor *input, Tensor *output, const ShapeParams &params)
+ : KernelWithParams<ShapeParams>({input}, {output}, params)
+{
+}
+
+void ShapeKernel::configure()
+{
+ LUCI_INTERPRETER_CHECK(output()->element_type() == DataType::S32 or
+ output()->element_type() == DataType::S64);
+ const auto input_shape = input()->shape();
+
+ Shape output_shape(1);
+ output_shape.dim(0) = input_shape.num_dims();
+
+ output()->resize(output_shape);
+}
+
+void ShapeKernel::execute() const
+{
+ switch (params().out_type)
+ {
+ case DataType::S32:
+ evalInt<int32_t>();
+ break;
+ case DataType::S64:
+ evalInt<int64_t>();
+ break;
+ default:
+ throw std::runtime_error("Unsupported type.");
+ }
+}
+
+template <typename T> void ShapeKernel::evalInt() const
+{
+ const auto input_shape = input()->shape();
+
+ auto output_data = getTensorData<T>(output());
+
+ for (int i = 0; i < input_shape.num_dims(); ++i)
+ {
+ output_data[i] = input_shape.dim(i);
+ }
+}
+
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-interpreter/src/kernels/Shape.h b/compiler/luci-interpreter/src/kernels/Shape.h
new file mode 100644
index 000000000..cfaadec91
--- /dev/null
+++ b/compiler/luci-interpreter/src/kernels/Shape.h
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_KERNELS_SHAPE_H
+#define LUCI_INTERPRETER_KERNELS_SHAPE_H
+
+#include "core/Kernel.h"
+#include "core/KernelParams.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+
+class ShapeKernel : public KernelWithParams<ShapeParams>
+{
+public:
+ ShapeKernel(const Tensor *input, Tensor *output, const ShapeParams &params);
+
+ const Tensor *input() const { return _inputs[0]; }
+ Tensor *output() const { return _outputs[0]; }
+
+ void configure() override;
+ void execute() const override;
+
+private:
+ template <typename T> void evalInt() const;
+};
+
+} // namespace kernels
+} // namespace luci_interpreter
+
+#endif // LUCI_INTERPRETER_KERNELS_SHAPE_H
diff --git a/compiler/luci-interpreter/src/kernels/Shape.test.cpp b/compiler/luci-interpreter/src/kernels/Shape.test.cpp
new file mode 100644
index 000000000..4763e016c
--- /dev/null
+++ b/compiler/luci-interpreter/src/kernels/Shape.test.cpp
@@ -0,0 +1,89 @@
+/*
+ * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/Shape.h"
+#include "kernels/TestUtils.h"
+#include "luci_interpreter/TestMemoryManager.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+namespace
+{
+
+using namespace testing;
+
+class ShapeTest : public ::testing::Test
+{
+protected:
+ void SetUp() override { _memory_manager = std::make_unique<TestMemoryManager>(); }
+
+ std::unique_ptr<IMemoryManager> _memory_manager;
+};
+
+template <typename T> void runShapeKernel(loco::DataType dataType, IMemoryManager *memory_manager)
+{
+ Shape input_shape{1, 3, 1, 3, 5};
+
+ Tensor input_tensor = Tensor(loco::DataType::FLOAT32, input_shape, {}, "");
+ Tensor output_tensor = makeOutputTensor(dataType);
+
+ ShapeParams params{};
+ params.out_type = dataType;
+
+ ShapeKernel kernel(&input_tensor, &output_tensor, params);
+
+ kernel.configure();
+ memory_manager->allocate_memory(output_tensor);
+ kernel.execute();
+
+ std::vector<T> ref_output_data{1, 3, 1, 3, 5};
+ EXPECT_THAT(extractTensorData<T>(output_tensor), ref_output_data);
+
+ std::vector<int32_t> ref_output_shape{5};
+ EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(ref_output_shape));
+}
+
+TEST_F(ShapeTest, OutTypeInt)
+{
+
+ // Run for int32_t output
+ runShapeKernel<int32_t>(loco::DataType::S32, _memory_manager.get());
+ // Run for int64_t output
+ runShapeKernel<int64_t>(loco::DataType::S64, _memory_manager.get());
+
+ SUCCEED();
+}
+
+TEST_F(ShapeTest, Invalid_Output_Type_NEG)
+{
+ Shape input_shape{1, 3};
+
+ Tensor input_tensor = Tensor(loco::DataType::FLOAT32, input_shape, {}, "");
+ Tensor output_tensor = makeOutputTensor(loco::DataType::FLOAT32);
+
+ ShapeParams params{};
+ params.out_type = loco::DataType::FLOAT32;
+
+ ShapeKernel kernel(&input_tensor, &output_tensor, params);
+
+ EXPECT_ANY_THROW(kernel.configure());
+}
+
+} // namespace
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-interpreter/src/kernels/SplitV.cpp b/compiler/luci-interpreter/src/kernels/SplitV.cpp
index 281988272..aa6820889 100644
--- a/compiler/luci-interpreter/src/kernels/SplitV.cpp
+++ b/compiler/luci-interpreter/src/kernels/SplitV.cpp
@@ -43,14 +43,36 @@ void SplitV::configure()
auto sizes_data = getTensorData<int32_t>(size_splits());
assert(size_splits()->shape().num_dims() == 1);
+
+ int32_t sum = 0;
+ const auto num_dims_size_splits = size_splits()->shape().dim(0);
+ int32_t count_neg_dim = 0;
+
+ for (int32_t i = 0; i < num_dims_size_splits - 1; ++i)
+ {
+ if (sizes_data[i] != -1)
+ {
+ sum += sizes_data[i];
+ }
+ else
+ {
+ count_neg_dim++;
+ }
+ }
+ assert(count_neg_dim < 2);
assert(size_splits()->shape().num_elements() == num_split);
- assert(std::accumulate(sizes_data, sizes_data + num_split, 0) ==
- input()->shape().dim(_axis_value));
auto output_shape = input()->shape();
for (int32_t i = 0; i < num_split; ++i)
{
- output_shape.dim(_axis_value) = sizes_data[i];
+ if (sizes_data[i] == -1)
+ {
+ output_shape.dim(_axis_value) = input()->shape().dim(_axis_value) - sum;
+ }
+ else
+ {
+ output_shape.dim(_axis_value) = sizes_data[i];
+ }
_outputs[i]->resize(output_shape);
}
}
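
Note (not part of the diff): a simplified standalone sketch of the new size_splits semantics, assuming at most one -1 wildcard entry; explicit sizes are summed and the wildcard receives the remaining extent of the split axis. Names here are illustrative only.

#include <cassert>
#include <cstdint>
#include <vector>

// Resolve an optional single -1 entry in size_splits against the axis length.
std::vector<int32_t> resolve_split_sizes(std::vector<int32_t> sizes, int32_t axis_len)
{
  int32_t sum = 0;
  int32_t num_wildcards = 0;
  for (int32_t s : sizes)
  {
    if (s == -1)
      ++num_wildcards;
    else
      sum += s;
  }
  assert(num_wildcards <= 1);
  for (int32_t &s : sizes)
  {
    if (s == -1)
      s = axis_len - sum; // the single -1 takes whatever remains on the axis
  }
  return sizes;
}

// Example: resolve_split_sizes({2, -1, 3}, 10) yields {2, 5, 3}.
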
diff --git a/compiler/luci-interpreter/src/kernels/StridedSlice.cpp b/compiler/luci-interpreter/src/kernels/StridedSlice.cpp
index c6452cdb0..a8730d861 100644
--- a/compiler/luci-interpreter/src/kernels/StridedSlice.cpp
+++ b/compiler/luci-interpreter/src/kernels/StridedSlice.cpp
@@ -136,6 +136,11 @@ void StridedSlice::execute() const
getTensorData<uint8_t>(input()), getTensorShape(output()),
getTensorData<uint8_t>(output()));
break;
+ case DataType::S32:
+ tflite::reference_ops::StridedSlice(op_params, getTensorShape(input()),
+ getTensorData<int32_t>(input()), getTensorShape(output()),
+ getTensorData<int32_t>(output()));
+ break;
default:
throw std::runtime_error("Unsupported type.");
}
diff --git a/compiler/luci-interpreter/src/loader/GraphLoader.cpp b/compiler/luci-interpreter/src/loader/GraphLoader.cpp
index dba39050c..40207090b 100644
--- a/compiler/luci-interpreter/src/loader/GraphLoader.cpp
+++ b/compiler/luci-interpreter/src/loader/GraphLoader.cpp
@@ -187,7 +187,7 @@ void GraphLoader::loadTensors()
const auto *node = loco::must_cast<const luci::CircleNode *>(_graph->nodes()->at(i));
if (node->opcode() == luci::CircleOpcode::CUSTOM && !isSupportedCustomNode(node))
- throw std::runtime_error("Unknown Custom Node, yet.");
+ throw std::runtime_error("Unsupported Custom operator. " + node->name());
if (!isTensorProducingNode(node))
continue;
diff --git a/compiler/luci-interpreter/src/loader/nodes/Add.cpp b/compiler/luci-interpreter/src/loader/nodes/Add.cpp
index decccaa1d..501e84752 100644
--- a/compiler/luci-interpreter/src/loader/nodes/Add.cpp
+++ b/compiler/luci-interpreter/src/loader/nodes/Add.cpp
@@ -24,9 +24,7 @@ namespace luci_interpreter
std::unique_ptr<Kernel> build_kernel_CircleAdd(const luci::CircleNode *circle_node,
KernelBuilderHelper &helper)
{
- const auto *node = dynamic_cast<const luci::CircleAdd *>(circle_node);
- if (node == nullptr)
- throw std::runtime_error("wrong builder for operation");
+ const auto *node = loco::must_cast<const luci::CircleAdd *>(circle_node);
assert(node->arity() == 2);
const Tensor *input1 = helper.getInputTensor(node->x());
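
Note (not part of the diff): the dynamic_cast-plus-throw guard is replaced by loco::must_cast throughout the builders below, since each builder is already dispatched by the node's opcode. A rough sketch of what such a checked cast helper looks like (the real definition lives in loco and may differ):

#include <cassert>

// Hypothetical checked cast: same intent as the removed guard, but a type
// mismatch is treated as a programming error (assert) rather than a
// recoverable runtime_error.
template <typename Derived, typename Base> Derived checked_cast(Base base)
{
  auto derived = dynamic_cast<Derived>(base);
  assert(derived != nullptr && "wrong builder for operation");
  return derived;
}

// Usage: const auto *node = checked_cast<const luci::CircleAdd *>(circle_node);
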
diff --git a/compiler/luci-interpreter/src/loader/nodes/ArgMax.cpp b/compiler/luci-interpreter/src/loader/nodes/ArgMax.cpp
index 0ee367748..f3ca55744 100644
--- a/compiler/luci-interpreter/src/loader/nodes/ArgMax.cpp
+++ b/compiler/luci-interpreter/src/loader/nodes/ArgMax.cpp
@@ -24,9 +24,7 @@ namespace luci_interpreter
std::unique_ptr<Kernel> build_kernel_CircleArgMax(const luci::CircleNode *circle_node,
KernelBuilderHelper &helper)
{
- const auto *node = dynamic_cast<const luci::CircleArgMax *>(circle_node);
- if (node == nullptr)
- throw std::runtime_error("wrong builder for operation");
+ const auto *node = loco::must_cast<const luci::CircleArgMax *>(circle_node);
assert(node->arity() == 2);
const Tensor *input = helper.getInputTensor(node->input());
const Tensor *axis = helper.getInputTensor(node->dimension());
diff --git a/compiler/luci-interpreter/src/loader/nodes/AveragePool2D.cpp b/compiler/luci-interpreter/src/loader/nodes/AveragePool2D.cpp
index efb011257..a8135706f 100644
--- a/compiler/luci-interpreter/src/loader/nodes/AveragePool2D.cpp
+++ b/compiler/luci-interpreter/src/loader/nodes/AveragePool2D.cpp
@@ -25,9 +25,7 @@ namespace luci_interpreter
std::unique_ptr<Kernel> build_kernel_CircleAveragePool2D(const luci::CircleNode *circle_node,
KernelBuilderHelper &helper)
{
- const auto *node = dynamic_cast<const luci::CircleAveragePool2D *>(circle_node);
- if (node == nullptr)
- throw std::runtime_error("wrong builder for operation");
+ const auto *node = loco::must_cast<const luci::CircleAveragePool2D *>(circle_node);
assert(node->arity() == 1);
const Tensor *input = helper.getInputTensor(node->value());
diff --git a/compiler/luci-interpreter/src/loader/nodes/BatchMatMul.cpp b/compiler/luci-interpreter/src/loader/nodes/BatchMatMul.cpp
index aae3dbab1..9da2f6d93 100644
--- a/compiler/luci-interpreter/src/loader/nodes/BatchMatMul.cpp
+++ b/compiler/luci-interpreter/src/loader/nodes/BatchMatMul.cpp
@@ -25,9 +25,7 @@ namespace luci_interpreter
std::unique_ptr<Kernel> build_kernel_CircleBatchMatMul(const luci::CircleNode *circle_node,
KernelBuilderHelper &helper)
{
- const auto *node = dynamic_cast<const luci::CircleBatchMatMul *>(circle_node);
- if (node == nullptr)
- throw std::runtime_error("wrong builder for operation");
+ const auto *node = loco::must_cast<const luci::CircleBatchMatMul *>(circle_node);
assert(node->arity() == 2);
const Tensor *lhs = helper.getInputTensor(node->x());
diff --git a/compiler/luci-interpreter/src/loader/nodes/BatchToSpaceND.cpp b/compiler/luci-interpreter/src/loader/nodes/BatchToSpaceND.cpp
index 33d0e2db6..ac6ebb30f 100644
--- a/compiler/luci-interpreter/src/loader/nodes/BatchToSpaceND.cpp
+++ b/compiler/luci-interpreter/src/loader/nodes/BatchToSpaceND.cpp
@@ -24,9 +24,7 @@ namespace luci_interpreter
std::unique_ptr<Kernel> build_kernel_CircleBatchToSpaceND(const luci::CircleNode *circle_node,
KernelBuilderHelper &helper)
{
- const auto *node = dynamic_cast<const luci::CircleBatchToSpaceND *>(circle_node);
- if (node == nullptr)
- throw std::runtime_error("wrong builder for operation");
+ const auto *node = loco::must_cast<const luci::CircleBatchToSpaceND *>(circle_node);
assert(node->arity() == 3);
const Tensor *input = helper.getInputTensor(node->input());
diff --git a/compiler/luci-interpreter/src/loader/nodes/Cast.cpp b/compiler/luci-interpreter/src/loader/nodes/Cast.cpp
index 21ea5ceab..a16354c96 100644
--- a/compiler/luci-interpreter/src/loader/nodes/Cast.cpp
+++ b/compiler/luci-interpreter/src/loader/nodes/Cast.cpp
@@ -24,9 +24,7 @@ namespace luci_interpreter
std::unique_ptr<Kernel> build_kernel_CircleCast(const luci::CircleNode *circle_node,
KernelBuilderHelper &helper)
{
- const auto *node = dynamic_cast<const luci::CircleCast *>(circle_node);
- if (node == nullptr)
- throw std::runtime_error("wrong builder for operation");
+ const auto *node = loco::must_cast<const luci::CircleCast *>(circle_node);
assert(node->arity() == 1);
diff --git a/compiler/luci-interpreter/src/loader/nodes/Concatenation.cpp b/compiler/luci-interpreter/src/loader/nodes/Concatenation.cpp
index 7823a9967..ba2564ea2 100644
--- a/compiler/luci-interpreter/src/loader/nodes/Concatenation.cpp
+++ b/compiler/luci-interpreter/src/loader/nodes/Concatenation.cpp
@@ -24,9 +24,7 @@ namespace luci_interpreter
std::unique_ptr<Kernel> build_kernel_CircleConcatenation(const luci::CircleNode *circle_node,
KernelBuilderHelper &helper)
{
- const auto *node = dynamic_cast<const luci::CircleConcatenation *>(circle_node);
- if (node == nullptr)
- throw std::runtime_error("wrong builder for operation");
+ const auto *node = loco::must_cast<const luci::CircleConcatenation *>(circle_node);
std::vector<const Tensor *> inputs(node->numValues());
for (uint32_t i = 0; i < node->numValues(); ++i)
{
diff --git a/compiler/luci-interpreter/src/loader/nodes/Conv2D.cpp b/compiler/luci-interpreter/src/loader/nodes/Conv2D.cpp
index b48d97d19..218165e20 100644
--- a/compiler/luci-interpreter/src/loader/nodes/Conv2D.cpp
+++ b/compiler/luci-interpreter/src/loader/nodes/Conv2D.cpp
@@ -25,9 +25,7 @@ namespace luci_interpreter
std::unique_ptr<Kernel> build_kernel_CircleConv2D(const luci::CircleNode *circle_node,
KernelBuilderHelper &helper)
{
- const auto *node = dynamic_cast<const luci::CircleConv2D *>(circle_node);
- if (node == nullptr)
- throw std::runtime_error("wrong builder for operation");
+ const auto *node = loco::must_cast<const luci::CircleConv2D *>(circle_node);
assert(node->arity() == 3);
const Tensor *input = helper.getInputTensor(node->input());
diff --git a/compiler/luci-interpreter/src/loader/nodes/DepthToSpace.cpp b/compiler/luci-interpreter/src/loader/nodes/DepthToSpace.cpp
index 0310fb23f..174946367 100644
--- a/compiler/luci-interpreter/src/loader/nodes/DepthToSpace.cpp
+++ b/compiler/luci-interpreter/src/loader/nodes/DepthToSpace.cpp
@@ -24,9 +24,7 @@ namespace luci_interpreter
std::unique_ptr<Kernel> build_kernel_CircleDepthToSpace(const luci::CircleNode *circle_node,
KernelBuilderHelper &helper)
{
- const auto *node = dynamic_cast<const luci::CircleDepthToSpace *>(circle_node);
- if (node == nullptr)
- throw std::runtime_error("wrong builder for operation");
+ const auto *node = loco::must_cast<const luci::CircleDepthToSpace *>(circle_node);
assert(node->arity() == 1);
const Tensor *input = helper.getInputTensor(node->input());
diff --git a/compiler/luci-interpreter/src/loader/nodes/DepthwiseConv2D.cpp b/compiler/luci-interpreter/src/loader/nodes/DepthwiseConv2D.cpp
index db26ecf2e..8af1e3b58 100644
--- a/compiler/luci-interpreter/src/loader/nodes/DepthwiseConv2D.cpp
+++ b/compiler/luci-interpreter/src/loader/nodes/DepthwiseConv2D.cpp
@@ -25,9 +25,7 @@ namespace luci_interpreter
std::unique_ptr<Kernel> build_kernel_CircleDepthwiseConv2D(const luci::CircleNode *circle_node,
KernelBuilderHelper &helper)
{
- const auto *node = dynamic_cast<const luci::CircleDepthwiseConv2D *>(circle_node);
- if (node == nullptr)
- throw std::runtime_error("wrong builder for operation");
+ const auto *node = loco::must_cast<const luci::CircleDepthwiseConv2D *>(circle_node);
assert(node->arity() == 3);
const Tensor *input = helper.getInputTensor(node->input());
diff --git a/compiler/luci-interpreter/src/loader/nodes/Dequantize.cpp b/compiler/luci-interpreter/src/loader/nodes/Dequantize.cpp
index 4aae56469..787322e9b 100644
--- a/compiler/luci-interpreter/src/loader/nodes/Dequantize.cpp
+++ b/compiler/luci-interpreter/src/loader/nodes/Dequantize.cpp
@@ -24,9 +24,7 @@ namespace luci_interpreter
std::unique_ptr<Kernel> build_kernel_CircleDequantize(const luci::CircleNode *circle_node,
KernelBuilderHelper &helper)
{
- const auto *node = dynamic_cast<const luci::CircleDequantize *>(circle_node);
- if (node == nullptr)
- throw std::runtime_error("wrong builder for operation");
+ const auto *node = loco::must_cast<const luci::CircleDequantize *>(circle_node);
const Tensor *input = helper.getInputTensor(node->input());
Tensor *output = helper.getOutputTensor(node);
diff --git a/compiler/luci-interpreter/src/loader/nodes/Div.cpp b/compiler/luci-interpreter/src/loader/nodes/Div.cpp
index 56c2e98f2..0611dfdab 100644
--- a/compiler/luci-interpreter/src/loader/nodes/Div.cpp
+++ b/compiler/luci-interpreter/src/loader/nodes/Div.cpp
@@ -24,9 +24,7 @@ namespace luci_interpreter
std::unique_ptr<Kernel> build_kernel_CircleDiv(const luci::CircleNode *circle_node,
KernelBuilderHelper &helper)
{
- const auto *node = dynamic_cast<const luci::CircleDiv *>(circle_node);
- if (node == nullptr)
- throw std::runtime_error("wrong builder for operation");
+ const auto *node = loco::must_cast<const luci::CircleDiv *>(circle_node);
assert(node->arity() == 2);
const Tensor *input1 = helper.getInputTensor(node->x());
const Tensor *input2 = helper.getInputTensor(node->y());
diff --git a/compiler/luci-interpreter/src/loader/nodes/Elu.cpp b/compiler/luci-interpreter/src/loader/nodes/Elu.cpp
index 98ee78be7..a79985e3b 100644
--- a/compiler/luci-interpreter/src/loader/nodes/Elu.cpp
+++ b/compiler/luci-interpreter/src/loader/nodes/Elu.cpp
@@ -24,9 +24,7 @@ namespace luci_interpreter
std::unique_ptr<Kernel> build_kernel_CircleElu(const luci::CircleNode *circle_node,
KernelBuilderHelper &helper)
{
- const auto *node = dynamic_cast<const luci::CircleElu *>(circle_node);
- if (node == nullptr)
- throw std::runtime_error("wrong builder for operation");
+ const auto *node = loco::must_cast<const luci::CircleElu *>(circle_node);
assert(node->arity() == 1);
const Tensor *input = helper.getInputTensor(node->features());
diff --git a/compiler/luci-interpreter/src/loader/nodes/Equal.cpp b/compiler/luci-interpreter/src/loader/nodes/Equal.cpp
index 649d9bfe9..59692883f 100644
--- a/compiler/luci-interpreter/src/loader/nodes/Equal.cpp
+++ b/compiler/luci-interpreter/src/loader/nodes/Equal.cpp
@@ -25,9 +25,7 @@ std::unique_ptr<Kernel> build_kernel_CircleEqual(const luci::CircleNode *circle_
KernelBuilderHelper &helper)
{
- const auto *node = dynamic_cast<const luci::CircleEqual *>(circle_node);
- if (node == nullptr)
- throw std::runtime_error("wrong builder for operation");
+ const auto *node = loco::must_cast<const luci::CircleEqual *>(circle_node);
assert(node->arity() == 2);
const Tensor *x = helper.getInputTensor(node->x());
diff --git a/compiler/luci-interpreter/src/loader/nodes/Exp.cpp b/compiler/luci-interpreter/src/loader/nodes/Exp.cpp
index 411d142c3..30d11cb89 100644
--- a/compiler/luci-interpreter/src/loader/nodes/Exp.cpp
+++ b/compiler/luci-interpreter/src/loader/nodes/Exp.cpp
@@ -24,9 +24,7 @@ namespace luci_interpreter
std::unique_ptr<Kernel> build_kernel_CircleExp(const luci::CircleNode *circle_node,
KernelBuilderHelper &helper)
{
- const auto *node = dynamic_cast<const luci::CircleExp *>(circle_node);
- if (node == nullptr)
- throw std::runtime_error("wrong builder for operation");
+ const auto *node = loco::must_cast<const luci::CircleExp *>(circle_node);
assert(node->arity() == 1);
const Tensor *input = helper.getInputTensor(node->x());
diff --git a/compiler/luci-interpreter/src/loader/nodes/Fill.cpp b/compiler/luci-interpreter/src/loader/nodes/Fill.cpp
new file mode 100644
index 000000000..3aefdf1c5
--- /dev/null
+++ b/compiler/luci-interpreter/src/loader/nodes/Fill.cpp
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Builders.h"
+
+#include "kernels/Fill.h"
+
+namespace luci_interpreter
+{
+
+std::unique_ptr<Kernel> build_kernel_CircleFill(const luci::CircleNode *circle_node,
+ KernelBuilderHelper &helper)
+{
+ const auto *node = loco::must_cast<const luci::CircleFill *>(circle_node);
+ assert(node->arity() == 2);
+
+ const auto dims = helper.getInputTensor(node->dims());
+ const auto value = helper.getInputTensor(node->value());
+ auto output = helper.getOutputTensor(node);
+
+ return std::make_unique<kernels::Fill>(dims, value, output);
+}
+
+} // namespace luci_interpreter
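
Note (not part of the diff): the Fill kernel wired up above takes a dims tensor and a scalar value; a standalone sketch of the semantics, with illustrative names only:

#include <cstdint>
#include <vector>

// Fill semantics: the output has the shape given by "dims" and every element
// equals "value".
template <typename T> std::vector<T> fill_like(const std::vector<int32_t> &dims, T value)
{
  int64_t count = 1;
  for (int32_t d : dims)
    count *= d;
  return std::vector<T>(static_cast<size_t>(count), value);
}

// Example: fill_like<float>({2, 3}, 1.5f) yields six elements, all equal to 1.5f.
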
diff --git a/compiler/luci-interpreter/src/loader/nodes/Floor.cpp b/compiler/luci-interpreter/src/loader/nodes/Floor.cpp
index 6d8435f6c..e0a223116 100644
--- a/compiler/luci-interpreter/src/loader/nodes/Floor.cpp
+++ b/compiler/luci-interpreter/src/loader/nodes/Floor.cpp
@@ -24,9 +24,7 @@ namespace luci_interpreter
std::unique_ptr<Kernel> build_kernel_CircleFloor(const luci::CircleNode *circle_node,
KernelBuilderHelper &helper)
{
- const auto *node = dynamic_cast<const luci::CircleFloor *>(circle_node);
- if (node == nullptr)
- throw std::runtime_error("wrong builder for operation");
+ const auto *node = loco::must_cast<const luci::CircleFloor *>(circle_node);
assert(node->arity() == 1);
const Tensor *input = helper.getInputTensor(node->x());
diff --git a/compiler/luci-interpreter/src/loader/nodes/FloorDiv.cpp b/compiler/luci-interpreter/src/loader/nodes/FloorDiv.cpp
index cae2e186e..a45d89e38 100644
--- a/compiler/luci-interpreter/src/loader/nodes/FloorDiv.cpp
+++ b/compiler/luci-interpreter/src/loader/nodes/FloorDiv.cpp
@@ -24,9 +24,7 @@ namespace luci_interpreter
std::unique_ptr<Kernel> build_kernel_CircleFloorDiv(const luci::CircleNode *circle_node,
KernelBuilderHelper &helper)
{
- const auto *node = dynamic_cast<const luci::CircleFloorDiv *>(circle_node);
- if (node == nullptr)
- throw std::runtime_error("wrong builder for operation");
+ const auto *node = loco::must_cast<const luci::CircleFloorDiv *>(circle_node);
assert(node->arity() == 2);
const Tensor *x = helper.getInputTensor(node->x());
diff --git a/compiler/luci-interpreter/src/loader/nodes/FullyConnected.cpp b/compiler/luci-interpreter/src/loader/nodes/FullyConnected.cpp
index 0b8ac44bd..b7b742b8a 100644
--- a/compiler/luci-interpreter/src/loader/nodes/FullyConnected.cpp
+++ b/compiler/luci-interpreter/src/loader/nodes/FullyConnected.cpp
@@ -24,9 +24,7 @@ namespace luci_interpreter
std::unique_ptr<Kernel> build_kernel_CircleFullyConnected(const luci::CircleNode *circle_node,
KernelBuilderHelper &helper)
{
- const auto *node = dynamic_cast<const luci::CircleFullyConnected *>(circle_node);
- if (node == nullptr)
- throw std::runtime_error("wrong builder for operation");
+ const auto *node = loco::must_cast<const luci::CircleFullyConnected *>(circle_node);
assert(node->arity() == 3);
const Tensor *input = helper.getInputTensor(node->input());
diff --git a/compiler/luci-interpreter/src/loader/nodes/Gather.cpp b/compiler/luci-interpreter/src/loader/nodes/Gather.cpp
index 9df9775c5..2ee2906e0 100644
--- a/compiler/luci-interpreter/src/loader/nodes/Gather.cpp
+++ b/compiler/luci-interpreter/src/loader/nodes/Gather.cpp
@@ -24,9 +24,7 @@ namespace luci_interpreter
std::unique_ptr<Kernel> build_kernel_CircleGather(const luci::CircleNode *circle_node,
KernelBuilderHelper &helper)
{
- const auto *node = dynamic_cast<const luci::CircleGather *>(circle_node);
- if (node == nullptr)
- throw std::runtime_error("wrong builder for operation");
+ const auto *node = loco::must_cast<const luci::CircleGather *>(circle_node);
assert(node->arity() == 2);
const Tensor *params = helper.getInputTensor(node->params());
diff --git a/compiler/luci-interpreter/src/loader/nodes/Greater.cpp b/compiler/luci-interpreter/src/loader/nodes/Greater.cpp
index 3db11b840..80aa63cf0 100644
--- a/compiler/luci-interpreter/src/loader/nodes/Greater.cpp
+++ b/compiler/luci-interpreter/src/loader/nodes/Greater.cpp
@@ -24,9 +24,7 @@ namespace luci_interpreter
std::unique_ptr<Kernel> build_kernel_CircleGreater(const luci::CircleNode *circle_node,
KernelBuilderHelper &helper)
{
- const auto *node = dynamic_cast<const luci::CircleGreater *>(circle_node);
- if (node == nullptr)
- throw std::runtime_error("wrong builder for operation");
+ const auto *node = loco::must_cast<const luci::CircleGreater *>(circle_node);
assert(node->arity() == 2);
const Tensor *x = helper.getInputTensor(node->x());
diff --git a/compiler/luci-interpreter/src/loader/nodes/GreaterEqual.cpp b/compiler/luci-interpreter/src/loader/nodes/GreaterEqual.cpp
index dbe051d67..272f2843b 100644
--- a/compiler/luci-interpreter/src/loader/nodes/GreaterEqual.cpp
+++ b/compiler/luci-interpreter/src/loader/nodes/GreaterEqual.cpp
@@ -24,9 +24,7 @@ namespace luci_interpreter
std::unique_ptr<Kernel> build_kernel_CircleGreaterEqual(const luci::CircleNode *circle_node,
KernelBuilderHelper &helper)
{
- const auto *node = dynamic_cast<const luci::CircleGreaterEqual *>(circle_node);
- if (node == nullptr)
- throw std::runtime_error("wrong builder for operation");
+ const auto *node = loco::must_cast<const luci::CircleGreaterEqual *>(circle_node);
assert(node->arity() == 2);
const Tensor *x = helper.getInputTensor(node->x());
diff --git a/compiler/luci-interpreter/src/loader/nodes/If.cpp b/compiler/luci-interpreter/src/loader/nodes/If.cpp
index 5983f4d3b..3ac7d4941 100644
--- a/compiler/luci-interpreter/src/loader/nodes/If.cpp
+++ b/compiler/luci-interpreter/src/loader/nodes/If.cpp
@@ -24,9 +24,7 @@ namespace luci_interpreter
std::unique_ptr<Kernel> build_kernel_CircleIf(const luci::CircleNode *circle_node,
KernelBuilderHelper &helper)
{
- const auto *node = dynamic_cast<const luci::CircleIf *>(circle_node);
- if (node == nullptr)
- throw std::runtime_error("wrong builder for operation");
+ const auto *node = loco::must_cast<const luci::CircleIf *>(circle_node);
auto output_nodes = collectOutputNodes<luci::CircleIfOut>(node);
assert(node->arity() == 1 + node->input_count());
assert(output_nodes.size() == static_cast<size_t>(node->output_count()));
diff --git a/compiler/luci-interpreter/src/loader/nodes/InstanceNorm.cpp b/compiler/luci-interpreter/src/loader/nodes/InstanceNorm.cpp
index 0a8fb85e2..06031e5bc 100644
--- a/compiler/luci-interpreter/src/loader/nodes/InstanceNorm.cpp
+++ b/compiler/luci-interpreter/src/loader/nodes/InstanceNorm.cpp
@@ -24,9 +24,7 @@ namespace luci_interpreter
std::unique_ptr<Kernel> build_kernel_CircleInstanceNorm(const luci::CircleNode *circle_node,
KernelBuilderHelper &helper)
{
- const auto *node = dynamic_cast<const luci::CircleInstanceNorm *>(circle_node);
- if (node == nullptr)
- throw std::runtime_error("wrong builder for operation");
+ const auto *node = loco::must_cast<const luci::CircleInstanceNorm *>(circle_node);
assert(node->arity() == 3);
const Tensor *input = helper.getInputTensor(node->input());
diff --git a/compiler/luci-interpreter/src/loader/nodes/L2Normalize.cpp b/compiler/luci-interpreter/src/loader/nodes/L2Normalize.cpp
index 05f920266..6e22e6d4e 100644
--- a/compiler/luci-interpreter/src/loader/nodes/L2Normalize.cpp
+++ b/compiler/luci-interpreter/src/loader/nodes/L2Normalize.cpp
@@ -24,9 +24,7 @@ namespace luci_interpreter
std::unique_ptr<Kernel> build_kernel_CircleL2Normalize(const luci::CircleNode *circle_node,
KernelBuilderHelper &helper)
{
- const auto *node = dynamic_cast<const luci::CircleL2Normalize *>(circle_node);
- if (node == nullptr)
- throw std::runtime_error("wrong builder for operation");
+ const auto *node = loco::must_cast<const luci::CircleL2Normalize *>(circle_node);
assert(node->arity() == 1);
const Tensor *input = helper.getInputTensor(node->x());
diff --git a/compiler/luci-interpreter/src/loader/nodes/L2Pool2D.cpp b/compiler/luci-interpreter/src/loader/nodes/L2Pool2D.cpp
index 0e70afafa..95b55896f 100644
--- a/compiler/luci-interpreter/src/loader/nodes/L2Pool2D.cpp
+++ b/compiler/luci-interpreter/src/loader/nodes/L2Pool2D.cpp
@@ -24,9 +24,7 @@ namespace luci_interpreter
std::unique_ptr<Kernel> build_kernel_CircleL2Pool2D(const luci::CircleNode *circle_node,
KernelBuilderHelper &helper)
{
- const auto *node = dynamic_cast<const luci::CircleL2Pool2D *>(circle_node);
- if (node == nullptr)
- throw std::runtime_error("wrong builder for operation");
+ const auto *node = loco::must_cast<const luci::CircleL2Pool2D *>(circle_node);
assert(node->arity() == 1);
const Tensor *input = helper.getInputTensor(node->value());
diff --git a/compiler/luci-interpreter/src/loader/nodes/LeakyRelu.cpp b/compiler/luci-interpreter/src/loader/nodes/LeakyRelu.cpp
index 7b229ad0e..bbf5067b1 100644
--- a/compiler/luci-interpreter/src/loader/nodes/LeakyRelu.cpp
+++ b/compiler/luci-interpreter/src/loader/nodes/LeakyRelu.cpp
@@ -24,9 +24,7 @@ namespace luci_interpreter
std::unique_ptr<Kernel> build_kernel_CircleLeakyRelu(const luci::CircleNode *circle_node,
KernelBuilderHelper &helper)
{
- const auto *node = dynamic_cast<const luci::CircleLeakyRelu *>(circle_node);
- if (node == nullptr)
- throw std::runtime_error("wrong builder for operation");
+ const auto *node = loco::must_cast<const luci::CircleLeakyRelu *>(circle_node);
assert(node->arity() == 1);
const Tensor *input = helper.getInputTensor(node->features());
Tensor *output = helper.getOutputTensor(node);
diff --git a/compiler/luci-interpreter/src/loader/nodes/Less.cpp b/compiler/luci-interpreter/src/loader/nodes/Less.cpp
index 81156f275..ae914ecc9 100644
--- a/compiler/luci-interpreter/src/loader/nodes/Less.cpp
+++ b/compiler/luci-interpreter/src/loader/nodes/Less.cpp
@@ -24,9 +24,7 @@ namespace luci_interpreter
std::unique_ptr<Kernel> build_kernel_CircleLess(const luci::CircleNode *circle_node,
KernelBuilderHelper &helper)
{
- const auto *node = dynamic_cast<const luci::CircleLess *>(circle_node);
- if (node == nullptr)
- throw std::runtime_error("wrong builder for operation");
+ const auto *node = loco::must_cast<const luci::CircleLess *>(circle_node);
assert(node->arity() == 2);
const Tensor *x = helper.getInputTensor(node->x());
diff --git a/compiler/luci-interpreter/src/loader/nodes/LessEqual.cpp b/compiler/luci-interpreter/src/loader/nodes/LessEqual.cpp
index 82141e5ae..f1b424b55 100644
--- a/compiler/luci-interpreter/src/loader/nodes/LessEqual.cpp
+++ b/compiler/luci-interpreter/src/loader/nodes/LessEqual.cpp
@@ -24,9 +24,7 @@ namespace luci_interpreter
std::unique_ptr<Kernel> build_kernel_CircleLessEqual(const luci::CircleNode *circle_node,
KernelBuilderHelper &helper)
{
- const auto *node = dynamic_cast<const luci::CircleLessEqual *>(circle_node);
- if (node == nullptr)
- throw std::runtime_error("wrong builder for operation");
+ const auto *node = loco::must_cast<const luci::CircleLessEqual *>(circle_node);
assert(node->arity() == 2);
const Tensor *x = helper.getInputTensor(node->x());
diff --git a/compiler/luci-interpreter/src/loader/nodes/LocalResponseNormalization.cpp b/compiler/luci-interpreter/src/loader/nodes/LocalResponseNormalization.cpp
index a12dce0a0..962ca2d7c 100644
--- a/compiler/luci-interpreter/src/loader/nodes/LocalResponseNormalization.cpp
+++ b/compiler/luci-interpreter/src/loader/nodes/LocalResponseNormalization.cpp
@@ -25,9 +25,7 @@ std::unique_ptr<Kernel>
build_kernel_CircleLocalResponseNormalization(const luci::CircleNode *circle_node,
KernelBuilderHelper &helper)
{
- const auto *node = dynamic_cast<const luci::CircleLocalResponseNormalization *>(circle_node);
- if (node == nullptr)
- throw std::runtime_error("wrong builder for operation");
+ const auto *node = loco::must_cast<const luci::CircleLocalResponseNormalization *>(circle_node);
assert(node->arity() == 1);
const Tensor *input = helper.getInputTensor(node->input());
Tensor *output = helper.getOutputTensor(node);
diff --git a/compiler/luci-interpreter/src/loader/nodes/LogSoftmax.cpp b/compiler/luci-interpreter/src/loader/nodes/LogSoftmax.cpp
index 6cf547aae..432204115 100644
--- a/compiler/luci-interpreter/src/loader/nodes/LogSoftmax.cpp
+++ b/compiler/luci-interpreter/src/loader/nodes/LogSoftmax.cpp
@@ -24,9 +24,7 @@ namespace luci_interpreter
std::unique_ptr<Kernel> build_kernel_CircleLogSoftmax(const luci::CircleNode *circle_node,
KernelBuilderHelper &helper)
{
- const auto *node = dynamic_cast<const luci::CircleLogSoftmax *>(circle_node);
- if (node == nullptr)
- throw std::runtime_error("wrong builder for operation");
+ const auto *node = loco::must_cast<const luci::CircleLogSoftmax *>(circle_node);
assert(node->arity() == 1);
const Tensor *input = helper.getInputTensor(node->logits());
diff --git a/compiler/luci-interpreter/src/loader/nodes/LogicalAnd.cpp b/compiler/luci-interpreter/src/loader/nodes/LogicalAnd.cpp
index 2c9549f71..bf3cb671a 100644
--- a/compiler/luci-interpreter/src/loader/nodes/LogicalAnd.cpp
+++ b/compiler/luci-interpreter/src/loader/nodes/LogicalAnd.cpp
@@ -24,9 +24,7 @@ namespace luci_interpreter
std::unique_ptr<Kernel> build_kernel_CircleLogicalAnd(const luci::CircleNode *circle_node,
KernelBuilderHelper &helper)
{
- const auto *node = dynamic_cast<const luci::CircleLogicalAnd *>(circle_node);
- if (node == nullptr)
- throw std::runtime_error("wrong builder for operation");
+ const auto *node = loco::must_cast<const luci::CircleLogicalAnd *>(circle_node);
assert(node->arity() == 2);
const Tensor *input1 = helper.getInputTensor(node->x());
diff --git a/compiler/luci-interpreter/src/loader/nodes/LogicalNot.cpp b/compiler/luci-interpreter/src/loader/nodes/LogicalNot.cpp
index 3d327d6c4..fefcd9a06 100644
--- a/compiler/luci-interpreter/src/loader/nodes/LogicalNot.cpp
+++ b/compiler/luci-interpreter/src/loader/nodes/LogicalNot.cpp
@@ -24,9 +24,7 @@ namespace luci_interpreter
std::unique_ptr<Kernel> build_kernel_CircleLogicalNot(const luci::CircleNode *circle_node,
KernelBuilderHelper &helper)
{
- const auto *node = dynamic_cast<const luci::CircleLogicalNot *>(circle_node);
- if (node == nullptr)
- throw std::runtime_error("wrong builder for operation");
+ const auto *node = loco::must_cast<const luci::CircleLogicalNot *>(circle_node);
assert(node->arity() == 1);
const Tensor *input = helper.getInputTensor(node->x());
diff --git a/compiler/luci-interpreter/src/loader/nodes/LogicalOr.cpp b/compiler/luci-interpreter/src/loader/nodes/LogicalOr.cpp
index 50566bb30..a416cb401 100644
--- a/compiler/luci-interpreter/src/loader/nodes/LogicalOr.cpp
+++ b/compiler/luci-interpreter/src/loader/nodes/LogicalOr.cpp
@@ -24,9 +24,7 @@ namespace luci_interpreter
std::unique_ptr<Kernel> build_kernel_CircleLogicalOr(const luci::CircleNode *circle_node,
KernelBuilderHelper &helper)
{
- const auto *node = dynamic_cast<const luci::CircleLogicalOr *>(circle_node);
- if (node == nullptr)
- throw std::runtime_error("wrong builder for operation");
+ const auto *node = loco::must_cast<const luci::CircleLogicalOr *>(circle_node);
assert(node->arity() == 2);
const Tensor *input1 = helper.getInputTensor(node->x());
diff --git a/compiler/luci-interpreter/src/loader/nodes/Logistic.cpp b/compiler/luci-interpreter/src/loader/nodes/Logistic.cpp
index e4160edb3..4a69deef1 100644
--- a/compiler/luci-interpreter/src/loader/nodes/Logistic.cpp
+++ b/compiler/luci-interpreter/src/loader/nodes/Logistic.cpp
@@ -24,9 +24,7 @@ namespace luci_interpreter
std::unique_ptr<Kernel> build_kernel_CircleLogistic(const luci::CircleNode *circle_node,
KernelBuilderHelper &helper)
{
- const auto *node = dynamic_cast<const luci::CircleLogistic *>(circle_node);
- if (node == nullptr)
- throw std::runtime_error("wrong builder for operation");
+ const auto *node = loco::must_cast<const luci::CircleLogistic *>(circle_node);
assert(node->arity() == 1);
const Tensor *input = helper.getInputTensor(node->x());
diff --git a/compiler/luci-interpreter/src/loader/nodes/MaxPool2D.cpp b/compiler/luci-interpreter/src/loader/nodes/MaxPool2D.cpp
index 914f22838..f66a206ca 100644
--- a/compiler/luci-interpreter/src/loader/nodes/MaxPool2D.cpp
+++ b/compiler/luci-interpreter/src/loader/nodes/MaxPool2D.cpp
@@ -24,9 +24,7 @@ namespace luci_interpreter
std::unique_ptr<Kernel> build_kernel_CircleMaxPool2D(const luci::CircleNode *circle_node,
KernelBuilderHelper &helper)
{
- const auto *node = dynamic_cast<const luci::CircleMaxPool2D *>(circle_node);
- if (node == nullptr)
- throw std::runtime_error("wrong builder for operation");
+ const auto *node = loco::must_cast<const luci::CircleMaxPool2D *>(circle_node);
assert(node->arity() == 1);
const Tensor *input = helper.getInputTensor(node->value());
diff --git a/compiler/luci-interpreter/src/loader/nodes/Maximum.cpp b/compiler/luci-interpreter/src/loader/nodes/Maximum.cpp
index dc50d6773..d0bff776a 100644
--- a/compiler/luci-interpreter/src/loader/nodes/Maximum.cpp
+++ b/compiler/luci-interpreter/src/loader/nodes/Maximum.cpp
@@ -24,9 +24,7 @@ namespace luci_interpreter
std::unique_ptr<Kernel> build_kernel_CircleMaximum(const luci::CircleNode *circle_node,
KernelBuilderHelper &helper)
{
- const auto *node = dynamic_cast<const luci::CircleMaximum *>(circle_node);
- if (node == nullptr)
- throw std::runtime_error("wrong builder for operation");
+ const auto *node = loco::must_cast<const luci::CircleMaximum *>(circle_node);
assert(node->arity() == 2);
const Tensor *input1 = helper.getInputTensor(node->x());
diff --git a/compiler/luci-interpreter/src/loader/nodes/Mean.cpp b/compiler/luci-interpreter/src/loader/nodes/Mean.cpp
index 97d91207f..0dec63e79 100644
--- a/compiler/luci-interpreter/src/loader/nodes/Mean.cpp
+++ b/compiler/luci-interpreter/src/loader/nodes/Mean.cpp
@@ -24,9 +24,7 @@ namespace luci_interpreter
std::unique_ptr<Kernel> build_kernel_CircleMean(const luci::CircleNode *circle_node,
KernelBuilderHelper &helper)
{
- const auto *node = dynamic_cast<const luci::CircleMean *>(circle_node);
- if (node == nullptr)
- throw std::runtime_error("wrong builder for operation");
+ const auto *node = loco::must_cast<const luci::CircleMean *>(circle_node);
assert(node->arity() == 2);
const Tensor *input = helper.getInputTensor(node->input());
diff --git a/compiler/luci-interpreter/src/loader/nodes/Minimum.cpp b/compiler/luci-interpreter/src/loader/nodes/Minimum.cpp
index ff659524a..1a49c1090 100644
--- a/compiler/luci-interpreter/src/loader/nodes/Minimum.cpp
+++ b/compiler/luci-interpreter/src/loader/nodes/Minimum.cpp
@@ -24,9 +24,7 @@ namespace luci_interpreter
std::unique_ptr<Kernel> build_kernel_CircleMinimum(const luci::CircleNode *circle_node,
KernelBuilderHelper &helper)
{
- const auto *node = dynamic_cast<const luci::CircleMinimum *>(circle_node);
- if (node == nullptr)
- throw std::runtime_error("wrong builder for operation");
+ const auto *node = loco::must_cast<const luci::CircleMinimum *>(circle_node);
assert(node->arity() == 2);
const Tensor *input1 = helper.getInputTensor(node->x());
diff --git a/compiler/luci-interpreter/src/loader/nodes/MirrorPad.cpp b/compiler/luci-interpreter/src/loader/nodes/MirrorPad.cpp
index ebf294583..b221b4574 100644
--- a/compiler/luci-interpreter/src/loader/nodes/MirrorPad.cpp
+++ b/compiler/luci-interpreter/src/loader/nodes/MirrorPad.cpp
@@ -24,9 +24,7 @@ namespace luci_interpreter
std::unique_ptr<Kernel> build_kernel_CircleMirrorPad(const luci::CircleNode *circle_node,
KernelBuilderHelper &helper)
{
- const auto *node = dynamic_cast<const luci::CircleMirrorPad *>(circle_node);
- if (node == nullptr)
- throw std::runtime_error("wrong builder for operation");
+ const auto *node = loco::must_cast<const luci::CircleMirrorPad *>(circle_node);
assert(node->arity() == 2);
const Tensor *input = helper.getInputTensor(node->input());
diff --git a/compiler/luci-interpreter/src/loader/nodes/Mul.cpp b/compiler/luci-interpreter/src/loader/nodes/Mul.cpp
index 4f9da967d..f9984853a 100644
--- a/compiler/luci-interpreter/src/loader/nodes/Mul.cpp
+++ b/compiler/luci-interpreter/src/loader/nodes/Mul.cpp
@@ -24,9 +24,7 @@ namespace luci_interpreter
std::unique_ptr<Kernel> build_kernel_CircleMul(const luci::CircleNode *circle_node,
KernelBuilderHelper &helper)
{
- const auto *node = dynamic_cast<const luci::CircleMul *>(circle_node);
- if (node == nullptr)
- throw std::runtime_error("wrong builder for operation");
+ const auto *node = loco::must_cast<const luci::CircleMul *>(circle_node);
assert(node->arity() == 2);
const Tensor *input1 = helper.getInputTensor(node->x());
diff --git a/compiler/luci-interpreter/src/loader/nodes/Neg.cpp b/compiler/luci-interpreter/src/loader/nodes/Neg.cpp
index 23c00537b..9a9ecf991 100644
--- a/compiler/luci-interpreter/src/loader/nodes/Neg.cpp
+++ b/compiler/luci-interpreter/src/loader/nodes/Neg.cpp
@@ -24,9 +24,7 @@ namespace luci_interpreter
std::unique_ptr<Kernel> build_kernel_CircleNeg(const luci::CircleNode *circle_node,
KernelBuilderHelper &helper)
{
- const auto *node = dynamic_cast<const luci::CircleNeg *>(circle_node);
- if (node == nullptr)
- throw std::runtime_error("wrong builder for operation");
+ const auto *node = loco::must_cast<const luci::CircleNeg *>(circle_node);
assert(node->arity() == 1);
const Tensor *input = helper.getInputTensor(node->x());
diff --git a/compiler/luci-interpreter/src/loader/nodes/NotEqual.cpp b/compiler/luci-interpreter/src/loader/nodes/NotEqual.cpp
index 8e5711fc1..3916a5854 100644
--- a/compiler/luci-interpreter/src/loader/nodes/NotEqual.cpp
+++ b/compiler/luci-interpreter/src/loader/nodes/NotEqual.cpp
@@ -24,9 +24,7 @@ namespace luci_interpreter
std::unique_ptr<Kernel> build_kernel_CircleNotEqual(const luci::CircleNode *circle_node,
KernelBuilderHelper &helper)
{
- const auto *node = dynamic_cast<const luci::CircleNotEqual *>(circle_node);
- if (node == nullptr)
- throw std::runtime_error("wrong builder for operation");
+ const auto *node = loco::must_cast<const luci::CircleNotEqual *>(circle_node);
assert(node->arity() == 2);
const Tensor *x = helper.getInputTensor(node->x());
diff --git a/compiler/luci-interpreter/src/loader/nodes/PRelu.cpp b/compiler/luci-interpreter/src/loader/nodes/PRelu.cpp
index e31601bf6..f3d700c95 100644
--- a/compiler/luci-interpreter/src/loader/nodes/PRelu.cpp
+++ b/compiler/luci-interpreter/src/loader/nodes/PRelu.cpp
@@ -24,9 +24,7 @@ namespace luci_interpreter
std::unique_ptr<Kernel> build_kernel_CirclePRelu(const luci::CircleNode *circle_node,
KernelBuilderHelper &helper)
{
- const auto *node = dynamic_cast<const luci::CirclePRelu *>(circle_node);
- if (node == nullptr)
- throw std::runtime_error("wrong builder for operation");
+ const auto *node = loco::must_cast<const luci::CirclePRelu *>(circle_node);
assert(node->arity() == 2);
const Tensor *input = helper.getInputTensor(node->input());
diff --git a/compiler/luci-interpreter/src/loader/nodes/Pack.cpp b/compiler/luci-interpreter/src/loader/nodes/Pack.cpp
index 699472081..efc5850e0 100644
--- a/compiler/luci-interpreter/src/loader/nodes/Pack.cpp
+++ b/compiler/luci-interpreter/src/loader/nodes/Pack.cpp
@@ -24,9 +24,7 @@ namespace luci_interpreter
std::unique_ptr<Kernel> build_kernel_CirclePack(const luci::CircleNode *circle_node,
KernelBuilderHelper &helper)
{
- const auto *node = dynamic_cast<const luci::CirclePack *>(circle_node);
- if (node == nullptr)
- throw std::runtime_error("wrong builder for operation");
+ const auto *node = loco::must_cast<const luci::CirclePack *>(circle_node);
assert(node->arity() == node->values_count());
std::vector<const Tensor *> inputs(node->values_count());
diff --git a/compiler/luci-interpreter/src/loader/nodes/Pad.cpp b/compiler/luci-interpreter/src/loader/nodes/Pad.cpp
index 770549295..67ce997a7 100644
--- a/compiler/luci-interpreter/src/loader/nodes/Pad.cpp
+++ b/compiler/luci-interpreter/src/loader/nodes/Pad.cpp
@@ -24,9 +24,7 @@ namespace luci_interpreter
std::unique_ptr<Kernel> build_kernel_CirclePad(const luci::CircleNode *circle_node,
KernelBuilderHelper &helper)
{
- const auto *node = dynamic_cast<const luci::CirclePad *>(circle_node);
- if (node == nullptr)
- throw std::runtime_error("wrong builder for operation");
+ const auto *node = loco::must_cast<const luci::CirclePad *>(circle_node);
assert(node->arity() == 2);
const Tensor *input = helper.getInputTensor(node->input());
diff --git a/compiler/luci-interpreter/src/loader/nodes/PadV2.cpp b/compiler/luci-interpreter/src/loader/nodes/PadV2.cpp
index 12deb15f0..e378a972a 100644
--- a/compiler/luci-interpreter/src/loader/nodes/PadV2.cpp
+++ b/compiler/luci-interpreter/src/loader/nodes/PadV2.cpp
@@ -24,9 +24,7 @@ namespace luci_interpreter
std::unique_ptr<Kernel> build_kernel_CirclePadV2(const luci::CircleNode *circle_node,
KernelBuilderHelper &helper)
{
- const auto *node = dynamic_cast<const luci::CirclePadV2 *>(circle_node);
- if (node == nullptr)
- throw std::runtime_error("wrong builder for operation");
+ const auto *node = loco::must_cast<const luci::CirclePadV2 *>(circle_node);
assert(node->arity() == 3);
const Tensor *input = helper.getInputTensor(node->input());
diff --git a/compiler/luci-interpreter/src/loader/nodes/Pow.cpp b/compiler/luci-interpreter/src/loader/nodes/Pow.cpp
index b430bc94f..d32fc3dbb 100644
--- a/compiler/luci-interpreter/src/loader/nodes/Pow.cpp
+++ b/compiler/luci-interpreter/src/loader/nodes/Pow.cpp
@@ -24,9 +24,7 @@ namespace luci_interpreter
std::unique_ptr<Kernel> build_kernel_CirclePow(const luci::CircleNode *circle_node,
KernelBuilderHelper &helper)
{
- const auto *node = dynamic_cast<const luci::CirclePow *>(circle_node);
- if (node == nullptr)
- throw std::runtime_error("wrong builder for operation");
+ const auto *node = loco::must_cast<const luci::CirclePow *>(circle_node);
assert(node->arity() == 2);
const Tensor *input1 = helper.getInputTensor(node->x());
diff --git a/compiler/luci-interpreter/src/loader/nodes/Quantize.cpp b/compiler/luci-interpreter/src/loader/nodes/Quantize.cpp
index fd9836345..cb36fb6da 100644
--- a/compiler/luci-interpreter/src/loader/nodes/Quantize.cpp
+++ b/compiler/luci-interpreter/src/loader/nodes/Quantize.cpp
@@ -24,9 +24,8 @@ namespace luci_interpreter
std::unique_ptr<Kernel> build_kernel_CircleQuantize(const luci::CircleNode *circle_node,
KernelBuilderHelper &helper)
{
- const auto *node = dynamic_cast<const luci::CircleQuantize *>(circle_node);
- if (node == nullptr)
- throw std::runtime_error("wrong builder for operation");
+ const auto *node = loco::must_cast<const luci::CircleQuantize *>(circle_node);
+ assert(node->arity() == 1);
const Tensor *input = helper.getInputTensor(node->input());
Tensor *output = helper.getOutputTensor(node);
diff --git a/compiler/luci-interpreter/src/loader/nodes/ReduceMax.cpp b/compiler/luci-interpreter/src/loader/nodes/ReduceMax.cpp
new file mode 100644
index 000000000..1a8522dd6
--- /dev/null
+++ b/compiler/luci-interpreter/src/loader/nodes/ReduceMax.cpp
@@ -0,0 +1,55 @@
+/*
+ * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Builders.h"
+
+#include "kernels/ReduceMax.h"
+
+namespace luci_interpreter
+{
+
+std::unique_ptr<Kernel> build_kernel_CircleReduceMax(const luci::CircleNode *circle_node,
+ KernelBuilderHelper &helper)
+{
+ const auto *node = loco::must_cast<const luci::CircleReduceMax *>(circle_node);
+ assert(node->arity() == 2);
+
+ const Tensor *input = helper.getInputTensor(node->input());
+ const Tensor *axes = helper.getInputTensor(node->reduction_indices());
+ Tensor *output = helper.getOutputTensor(node);
+
+ auto temp_index_unique =
+ std::make_unique<Tensor>(DataType::S32, Shape({}), AffineQuantization{}, "");
+ temp_index_unique->set_observable(false);
+ temp_index_unique->set_data_buffer(nullptr);
+ Tensor *temp_index =
+ helper.getRuntimeGraph(node->graph())->addTensor(std::move(temp_index_unique));
+
+ auto resolved_axes_unique =
+ std::make_unique<Tensor>(DataType::S32, Shape({}), AffineQuantization{}, "");
+ resolved_axes_unique->set_observable(false);
+ resolved_axes_unique->set_data_buffer(nullptr);
+ Tensor *resolved_axes =
+ helper.getRuntimeGraph(node->graph())->addTensor(std::move(resolved_axes_unique));
+
+ ReducerParams params{};
+ params.keep_dims = node->keep_dims();
+
+ return std::make_unique<kernels::ReduceMax>(input, axes, output, temp_index, resolved_axes,
+ params);
+}
+
+} // namespace luci_interpreter
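
Note (not part of the diff): ReduceMax needs two scratch tensors (temporary indices and resolved axes) that live in the runtime graph but are never observable outputs. A hypothetical helper capturing that pattern, using only the calls visible in the builder above and assuming the surrounding luci_interpreter types (RuntimeGraph, Tensor, etc.):

#include <memory>

// Hypothetical helper; not part of the diff.
Tensor *make_scratch_tensor(RuntimeGraph *graph)
{
  auto scratch = std::make_unique<Tensor>(DataType::S32, Shape({}), AffineQuantization{}, "");
  scratch->set_observable(false);    // never exposed as a graph output
  scratch->set_data_buffer(nullptr); // storage is assigned later by the memory manager
  return graph->addTensor(std::move(scratch));
}
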
diff --git a/compiler/luci-interpreter/src/loader/nodes/Relu.cpp b/compiler/luci-interpreter/src/loader/nodes/Relu.cpp
index d53a66a06..1d64c1c4e 100644
--- a/compiler/luci-interpreter/src/loader/nodes/Relu.cpp
+++ b/compiler/luci-interpreter/src/loader/nodes/Relu.cpp
@@ -24,9 +24,7 @@ namespace luci_interpreter
std::unique_ptr<Kernel> build_kernel_CircleRelu(const luci::CircleNode *circle_node,
KernelBuilderHelper &helper)
{
- const auto *node = dynamic_cast<const luci::CircleRelu *>(circle_node);
- if (node == nullptr)
- throw std::runtime_error("wrong builder for operation");
+ const auto *node = loco::must_cast<const luci::CircleRelu *>(circle_node);
assert(node->arity() == 1);
const Tensor *input = helper.getInputTensor(node->features());
diff --git a/compiler/luci-interpreter/src/loader/nodes/Relu6.cpp b/compiler/luci-interpreter/src/loader/nodes/Relu6.cpp
index f1b5d219b..e50cd2545 100644
--- a/compiler/luci-interpreter/src/loader/nodes/Relu6.cpp
+++ b/compiler/luci-interpreter/src/loader/nodes/Relu6.cpp
@@ -24,9 +24,7 @@ namespace luci_interpreter
std::unique_ptr<Kernel> build_kernel_CircleRelu6(const luci::CircleNode *circle_node,
KernelBuilderHelper &helper)
{
- const auto *node = dynamic_cast<const luci::CircleRelu6 *>(circle_node);
- if (node == nullptr)
- throw std::runtime_error("wrong builder for operation");
+ const auto *node = loco::must_cast<const luci::CircleRelu6 *>(circle_node);
assert(node->arity() == 1);
const Tensor *input = helper.getInputTensor(node->features());
diff --git a/compiler/luci-interpreter/src/loader/nodes/Reshape.cpp b/compiler/luci-interpreter/src/loader/nodes/Reshape.cpp
index 89e3ecebf..76ddd88a3 100644
--- a/compiler/luci-interpreter/src/loader/nodes/Reshape.cpp
+++ b/compiler/luci-interpreter/src/loader/nodes/Reshape.cpp
@@ -24,9 +24,7 @@ namespace luci_interpreter
std::unique_ptr<Kernel> build_kernel_CircleReshape(const luci::CircleNode *circle_node,
KernelBuilderHelper &helper)
{
- const auto *node = dynamic_cast<const luci::CircleReshape *>(circle_node);
- if (node == nullptr)
- throw std::runtime_error("wrong builder for operation");
+ const auto *node = loco::must_cast<const luci::CircleReshape *>(circle_node);
assert(node->arity() == 2);
const Tensor *input = helper.getInputTensor(node->tensor());
diff --git a/compiler/luci-interpreter/src/loader/nodes/ResizeBilinear.cpp b/compiler/luci-interpreter/src/loader/nodes/ResizeBilinear.cpp
index dca56588d..dc2b88ad3 100644
--- a/compiler/luci-interpreter/src/loader/nodes/ResizeBilinear.cpp
+++ b/compiler/luci-interpreter/src/loader/nodes/ResizeBilinear.cpp
@@ -24,9 +24,7 @@ namespace luci_interpreter
std::unique_ptr<Kernel> build_kernel_CircleResizeBilinear(const luci::CircleNode *circle_node,
KernelBuilderHelper &helper)
{
- const auto *node = dynamic_cast<const luci::CircleResizeBilinear *>(circle_node);
- if (node == nullptr)
- throw std::runtime_error("wrong builder for operation");
+ const auto *node = loco::must_cast<const luci::CircleResizeBilinear *>(circle_node);
assert(node->arity() == 2);
const Tensor *input = helper.getInputTensor(node->input());
diff --git a/compiler/luci-interpreter/src/loader/nodes/ResizeNearestNeighbor.cpp b/compiler/luci-interpreter/src/loader/nodes/ResizeNearestNeighbor.cpp
index d1ea19c0f..c7058ae78 100644
--- a/compiler/luci-interpreter/src/loader/nodes/ResizeNearestNeighbor.cpp
+++ b/compiler/luci-interpreter/src/loader/nodes/ResizeNearestNeighbor.cpp
@@ -25,9 +25,7 @@ std::unique_ptr<Kernel>
build_kernel_CircleResizeNearestNeighbor(const luci::CircleNode *circle_node,
KernelBuilderHelper &helper)
{
- const auto *node = dynamic_cast<const luci::CircleResizeNearestNeighbor *>(circle_node);
- if (node == nullptr)
- throw std::runtime_error("wrong builder for operation");
+ const auto *node = loco::must_cast<const luci::CircleResizeNearestNeighbor *>(circle_node);
assert(node->arity() == 2);
const Tensor *input = helper.getInputTensor(node->input());
diff --git a/compiler/luci-interpreter/src/loader/nodes/ReverseV2.cpp b/compiler/luci-interpreter/src/loader/nodes/ReverseV2.cpp
index ea00f5408..c1a7f5350 100644
--- a/compiler/luci-interpreter/src/loader/nodes/ReverseV2.cpp
+++ b/compiler/luci-interpreter/src/loader/nodes/ReverseV2.cpp
@@ -24,9 +24,7 @@ namespace luci_interpreter
std::unique_ptr<Kernel> build_kernel_CircleReverseV2(const luci::CircleNode *circle_node,
KernelBuilderHelper &helper)
{
- const auto *node = dynamic_cast<const luci::CircleReverseV2 *>(circle_node);
- if (node == nullptr)
- throw std::runtime_error("wrong builder for operation");
+ const auto *node = loco::must_cast<const luci::CircleReverseV2 *>(circle_node);
assert(node->arity() == 2);
const Tensor *input = helper.getInputTensor(node->tensor());
diff --git a/compiler/luci-interpreter/src/loader/nodes/Rsqrt.cpp b/compiler/luci-interpreter/src/loader/nodes/Rsqrt.cpp
index ff87f435c..0714a5dba 100644
--- a/compiler/luci-interpreter/src/loader/nodes/Rsqrt.cpp
+++ b/compiler/luci-interpreter/src/loader/nodes/Rsqrt.cpp
@@ -24,9 +24,7 @@ namespace luci_interpreter
std::unique_ptr<Kernel> build_kernel_CircleRsqrt(const luci::CircleNode *circle_node,
KernelBuilderHelper &helper)
{
- const auto *node = dynamic_cast<const luci::CircleRsqrt *>(circle_node);
- if (node == nullptr)
- throw std::runtime_error("wrong builder for operation");
+ const auto *node = loco::must_cast<const luci::CircleRsqrt *>(circle_node);
assert(node->arity() == 1);
const Tensor *input = helper.getInputTensor(node->x());
diff --git a/compiler/luci-interpreter/src/loader/nodes/SVDF.cpp b/compiler/luci-interpreter/src/loader/nodes/SVDF.cpp
index 89528d5ee..d172ef438 100644
--- a/compiler/luci-interpreter/src/loader/nodes/SVDF.cpp
+++ b/compiler/luci-interpreter/src/loader/nodes/SVDF.cpp
@@ -24,9 +24,8 @@ namespace luci_interpreter
std::unique_ptr<Kernel> build_kernel_CircleSVDF(const luci::CircleNode *circle_node,
KernelBuilderHelper &helper)
{
- const auto *node = dynamic_cast<const luci::CircleSVDF *>(circle_node);
- if (node == nullptr)
- throw std::runtime_error("wrong builder for operation");
+ const auto *node = loco::must_cast<const luci::CircleSVDF *>(circle_node);
+ assert(node->arity() == 5);
const Tensor *input = helper.getInputTensor(node->input());
const Tensor *feature = helper.getInputTensor(node->weight_feature());
diff --git a/compiler/luci-interpreter/src/loader/nodes/Shape.cpp b/compiler/luci-interpreter/src/loader/nodes/Shape.cpp
new file mode 100644
index 000000000..d1edbc794
--- /dev/null
+++ b/compiler/luci-interpreter/src/loader/nodes/Shape.cpp
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2022 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Builders.h"
+
+#include "kernels/Shape.h"
+
+namespace luci_interpreter
+{
+
+std::unique_ptr<Kernel> build_kernel_CircleShape(const luci::CircleNode *circle_node,
+ KernelBuilderHelper &helper)
+{
+ const auto *node = loco::must_cast<const luci::CircleShape *>(circle_node);
+ assert(node->arity() == 1);
+
+ const auto input = helper.getInputTensor(node->input());
+ auto output = helper.getOutputTensor(node);
+
+ ShapeParams shape_params{};
+ shape_params.out_type = node->out_type();
+
+ return std::make_unique<kernels::ShapeKernel>(input, output, shape_params);
+}
+
+} // namespace luci_interpreter
diff --git a/compiler/luci-interpreter/src/loader/nodes/Slice.cpp b/compiler/luci-interpreter/src/loader/nodes/Slice.cpp
index 741cd0806..60ac6417c 100644
--- a/compiler/luci-interpreter/src/loader/nodes/Slice.cpp
+++ b/compiler/luci-interpreter/src/loader/nodes/Slice.cpp
@@ -24,9 +24,7 @@ namespace luci_interpreter
std::unique_ptr<Kernel> build_kernel_CircleSlice(const luci::CircleNode *circle_node,
KernelBuilderHelper &helper)
{
- const auto *node = dynamic_cast<const luci::CircleSlice *>(circle_node);
- if (node == nullptr)
- throw std::runtime_error("wrong builder for operation");
+ const auto *node = loco::must_cast<const luci::CircleSlice *>(circle_node);
assert(node->arity() == 3);
const Tensor *input = helper.getInputTensor(node->input());
diff --git a/compiler/luci-interpreter/src/loader/nodes/Softmax.cpp b/compiler/luci-interpreter/src/loader/nodes/Softmax.cpp
index b15e4b6f3..f41f63f6f 100644
--- a/compiler/luci-interpreter/src/loader/nodes/Softmax.cpp
+++ b/compiler/luci-interpreter/src/loader/nodes/Softmax.cpp
@@ -24,9 +24,7 @@ namespace luci_interpreter
std::unique_ptr<Kernel> build_kernel_CircleSoftmax(const luci::CircleNode *circle_node,
KernelBuilderHelper &helper)
{
- const auto *node = dynamic_cast<const luci::CircleSoftmax *>(circle_node);
- if (node == nullptr)
- throw std::runtime_error("wrong builder for operation");
+ const auto *node = loco::must_cast<const luci::CircleSoftmax *>(circle_node);
assert(node->arity() == 1);
const Tensor *input = helper.getInputTensor(node->logits());
diff --git a/compiler/luci-interpreter/src/loader/nodes/SpaceToBatchND.cpp b/compiler/luci-interpreter/src/loader/nodes/SpaceToBatchND.cpp
index 91c237aa5..b6e6cf516 100644
--- a/compiler/luci-interpreter/src/loader/nodes/SpaceToBatchND.cpp
+++ b/compiler/luci-interpreter/src/loader/nodes/SpaceToBatchND.cpp
@@ -24,9 +24,7 @@ namespace luci_interpreter
std::unique_ptr<Kernel> build_kernel_CircleSpaceToBatchND(const luci::CircleNode *circle_node,
KernelBuilderHelper &helper)
{
- const auto *node = dynamic_cast<const luci::CircleSpaceToBatchND *>(circle_node);
- if (node == nullptr)
- throw std::runtime_error("wrong builder for operation");
+ const auto *node = loco::must_cast<const luci::CircleSpaceToBatchND *>(circle_node);
assert(node->arity() == 3);
const Tensor *input = helper.getInputTensor(node->input());
diff --git a/compiler/luci-interpreter/src/loader/nodes/SpaceToDepth.cpp b/compiler/luci-interpreter/src/loader/nodes/SpaceToDepth.cpp
index 3cbbd9718..63fdb95ec 100644
--- a/compiler/luci-interpreter/src/loader/nodes/SpaceToDepth.cpp
+++ b/compiler/luci-interpreter/src/loader/nodes/SpaceToDepth.cpp
@@ -24,9 +24,7 @@ namespace luci_interpreter
std::unique_ptr<Kernel> build_kernel_CircleSpaceToDepth(const luci::CircleNode *circle_node,
KernelBuilderHelper &helper)
{
- const auto *node = dynamic_cast<const luci::CircleSpaceToDepth *>(circle_node);
- if (node == nullptr)
- throw std::runtime_error("wrong builder for operation");
+ const auto *node = loco::must_cast<const luci::CircleSpaceToDepth *>(circle_node);
assert(node->arity() == 1);
const Tensor *input = helper.getInputTensor(node->input());
diff --git a/compiler/luci-interpreter/src/loader/nodes/Split.cpp b/compiler/luci-interpreter/src/loader/nodes/Split.cpp
index 32553ad5e..3f6d4a7df 100644
--- a/compiler/luci-interpreter/src/loader/nodes/Split.cpp
+++ b/compiler/luci-interpreter/src/loader/nodes/Split.cpp
@@ -24,9 +24,7 @@ namespace luci_interpreter
std::unique_ptr<Kernel> build_kernel_CircleSplit(const luci::CircleNode *circle_node,
KernelBuilderHelper &helper)
{
- const auto *node = dynamic_cast<const luci::CircleSplit *>(circle_node);
- if (node == nullptr)
- throw std::runtime_error("wrong builder for operation");
+ const auto *node = loco::must_cast<const luci::CircleSplit *>(circle_node);
auto output_nodes = collectOutputNodes<luci::CircleSplitOut>(node);
assert(node->arity() == 2);
assert(output_nodes.size() == static_cast<size_t>(node->num_split()));
diff --git a/compiler/luci-interpreter/src/loader/nodes/SplitV.cpp b/compiler/luci-interpreter/src/loader/nodes/SplitV.cpp
index d78816447..0788822ca 100644
--- a/compiler/luci-interpreter/src/loader/nodes/SplitV.cpp
+++ b/compiler/luci-interpreter/src/loader/nodes/SplitV.cpp
@@ -24,9 +24,7 @@ namespace luci_interpreter
std::unique_ptr<Kernel> build_kernel_CircleSplitV(const luci::CircleNode *circle_node,
KernelBuilderHelper &helper)
{
- const auto *node = dynamic_cast<const luci::CircleSplitV *>(circle_node);
- if (node == nullptr)
- throw std::runtime_error("wrong builder for operation");
+ const auto *node = loco::must_cast<const luci::CircleSplitV *>(circle_node);
auto output_nodes = collectOutputNodes<luci::CircleSplitVOut>(node);
assert(node->arity() == 3);
assert(output_nodes.size() == static_cast<size_t>(node->num_split()));
diff --git a/compiler/luci-interpreter/src/loader/nodes/Sqrt.cpp b/compiler/luci-interpreter/src/loader/nodes/Sqrt.cpp
index 56dd986f1..b9843fe0b 100644
--- a/compiler/luci-interpreter/src/loader/nodes/Sqrt.cpp
+++ b/compiler/luci-interpreter/src/loader/nodes/Sqrt.cpp
@@ -24,9 +24,7 @@ namespace luci_interpreter
std::unique_ptr<Kernel> build_kernel_CircleSqrt(const luci::CircleNode *circle_node,
KernelBuilderHelper &helper)
{
- const auto *node = dynamic_cast<const luci::CircleSqrt *>(circle_node);
- if (node == nullptr)
- throw std::runtime_error("wrong builder for operation");
+ const auto *node = loco::must_cast<const luci::CircleSqrt *>(circle_node);
assert(node->arity() == 1);
const Tensor *input = helper.getInputTensor(node->x());
diff --git a/compiler/luci-interpreter/src/loader/nodes/Square.cpp b/compiler/luci-interpreter/src/loader/nodes/Square.cpp
index 43aadb969..0ad7c1772 100644
--- a/compiler/luci-interpreter/src/loader/nodes/Square.cpp
+++ b/compiler/luci-interpreter/src/loader/nodes/Square.cpp
@@ -24,9 +24,7 @@ namespace luci_interpreter
std::unique_ptr<Kernel> build_kernel_CircleSquare(const luci::CircleNode *circle_node,
KernelBuilderHelper &helper)
{
- const auto *node = dynamic_cast<const luci::CircleSquare *>(circle_node);
- if (node == nullptr)
- throw std::runtime_error("wrong builder for operation");
+ const auto *node = loco::must_cast<const luci::CircleSquare *>(circle_node);
assert(node->arity() == 1);
const Tensor *input = helper.getInputTensor(node->x());
diff --git a/compiler/luci-interpreter/src/loader/nodes/SquaredDifference.cpp b/compiler/luci-interpreter/src/loader/nodes/SquaredDifference.cpp
index 6a2717aa2..e4c6fd851 100644
--- a/compiler/luci-interpreter/src/loader/nodes/SquaredDifference.cpp
+++ b/compiler/luci-interpreter/src/loader/nodes/SquaredDifference.cpp
@@ -24,9 +24,7 @@ namespace luci_interpreter
std::unique_ptr<Kernel> build_kernel_CircleSquaredDifference(const luci::CircleNode *circle_node,
KernelBuilderHelper &helper)
{
- const auto *node = dynamic_cast<const luci::CircleSquaredDifference *>(circle_node);
- if (node == nullptr)
- throw std::runtime_error("wrong builder for operation");
+ const auto *node = loco::must_cast<const luci::CircleSquaredDifference *>(circle_node);
assert(node->arity() == 2);
const Tensor *input1 = helper.getInputTensor(node->x());
diff --git a/compiler/luci-interpreter/src/loader/nodes/Squeeze.cpp b/compiler/luci-interpreter/src/loader/nodes/Squeeze.cpp
index 583ff9314..6885f8077 100644
--- a/compiler/luci-interpreter/src/loader/nodes/Squeeze.cpp
+++ b/compiler/luci-interpreter/src/loader/nodes/Squeeze.cpp
@@ -24,9 +24,7 @@ namespace luci_interpreter
std::unique_ptr<Kernel> build_kernel_CircleSqueeze(const luci::CircleNode *circle_node,
KernelBuilderHelper &helper)
{
- const auto *node = dynamic_cast<const luci::CircleSqueeze *>(circle_node);
- if (node == nullptr)
- throw std::runtime_error("wrong builder for operation");
+ const auto *node = loco::must_cast<const luci::CircleSqueeze *>(circle_node);
assert(node->arity() == 1);
const Tensor *input = helper.getInputTensor(node->input());
diff --git a/compiler/luci-interpreter/src/loader/nodes/StridedSlice.cpp b/compiler/luci-interpreter/src/loader/nodes/StridedSlice.cpp
index fe5fa7707..359b4e3e9 100644
--- a/compiler/luci-interpreter/src/loader/nodes/StridedSlice.cpp
+++ b/compiler/luci-interpreter/src/loader/nodes/StridedSlice.cpp
@@ -24,9 +24,7 @@ namespace luci_interpreter
std::unique_ptr<Kernel> build_kernel_CircleStridedSlice(const luci::CircleNode *circle_node,
KernelBuilderHelper &helper)
{
- const auto *node = dynamic_cast<const luci::CircleStridedSlice *>(circle_node);
- if (node == nullptr)
- throw std::runtime_error("wrong builder for operation");
+ const auto *node = loco::must_cast<const luci::CircleStridedSlice *>(circle_node);
assert(node->arity() == 4);
const Tensor *input = helper.getInputTensor(node->input());
diff --git a/compiler/luci-interpreter/src/loader/nodes/Sub.cpp b/compiler/luci-interpreter/src/loader/nodes/Sub.cpp
index bad4fbb13..a6252cb53 100644
--- a/compiler/luci-interpreter/src/loader/nodes/Sub.cpp
+++ b/compiler/luci-interpreter/src/loader/nodes/Sub.cpp
@@ -24,9 +24,7 @@ namespace luci_interpreter
std::unique_ptr<Kernel> build_kernel_CircleSub(const luci::CircleNode *circle_node,
KernelBuilderHelper &helper)
{
- const auto *node = dynamic_cast<const luci::CircleSub *>(circle_node);
- if (node == nullptr)
- throw std::runtime_error("wrong builder for operation");
+ const auto *node = loco::must_cast<const luci::CircleSub *>(circle_node);
assert(node->arity() == 2);
const Tensor *input1 = helper.getInputTensor(node->x());
diff --git a/compiler/luci-interpreter/src/loader/nodes/Tanh.cpp b/compiler/luci-interpreter/src/loader/nodes/Tanh.cpp
index f4255291b..a58ef60a8 100644
--- a/compiler/luci-interpreter/src/loader/nodes/Tanh.cpp
+++ b/compiler/luci-interpreter/src/loader/nodes/Tanh.cpp
@@ -24,9 +24,7 @@ namespace luci_interpreter
std::unique_ptr<Kernel> build_kernel_CircleTanh(const luci::CircleNode *circle_node,
KernelBuilderHelper &helper)
{
- const auto *node = dynamic_cast<const luci::CircleTanh *>(circle_node);
- if (node == nullptr)
- throw std::runtime_error("wrong builder for operation");
+ const auto *node = loco::must_cast<const luci::CircleTanh *>(circle_node);
assert(node->arity() == 1);
const Tensor *input = helper.getInputTensor(node->x());
diff --git a/compiler/luci-interpreter/src/loader/nodes/Transpose.cpp b/compiler/luci-interpreter/src/loader/nodes/Transpose.cpp
index 4e095fbbc..ea17d8311 100644
--- a/compiler/luci-interpreter/src/loader/nodes/Transpose.cpp
+++ b/compiler/luci-interpreter/src/loader/nodes/Transpose.cpp
@@ -24,9 +24,7 @@ namespace luci_interpreter
std::unique_ptr<Kernel> build_kernel_CircleTranspose(const luci::CircleNode *circle_node,
KernelBuilderHelper &helper)
{
- const auto *node = dynamic_cast<const luci::CircleTranspose *>(circle_node);
- if (node == nullptr)
- throw std::runtime_error("wrong builder for operation");
+ const auto *node = loco::must_cast<const luci::CircleTranspose *>(circle_node);
assert(node->arity() == 2);
const Tensor *input = helper.getInputTensor(node->a());
diff --git a/compiler/luci-interpreter/src/loader/nodes/TransposeConv.cpp b/compiler/luci-interpreter/src/loader/nodes/TransposeConv.cpp
index 1b954c35c..d773e301e 100644
--- a/compiler/luci-interpreter/src/loader/nodes/TransposeConv.cpp
+++ b/compiler/luci-interpreter/src/loader/nodes/TransposeConv.cpp
@@ -24,9 +24,7 @@ namespace luci_interpreter
std::unique_ptr<Kernel> build_kernel_CircleTransposeConv(const luci::CircleNode *circle_node,
KernelBuilderHelper &helper)
{
- const auto *node = dynamic_cast<const luci::CircleTransposeConv *>(circle_node);
- if (node == nullptr)
- throw std::runtime_error("wrong builder for operation");
+ const auto *node = loco::must_cast<const luci::CircleTransposeConv *>(circle_node);
assert(node->arity() == 4);
const Tensor *input_sizes = helper.getInputTensor(node->inputSizes());
diff --git a/compiler/luci-interpreter/src/loader/nodes/Unpack.cpp b/compiler/luci-interpreter/src/loader/nodes/Unpack.cpp
index 978c738c6..a1c0d323a 100644
--- a/compiler/luci-interpreter/src/loader/nodes/Unpack.cpp
+++ b/compiler/luci-interpreter/src/loader/nodes/Unpack.cpp
@@ -24,9 +24,7 @@ namespace luci_interpreter
std::unique_ptr<Kernel> build_kernel_CircleUnpack(const luci::CircleNode *circle_node,
KernelBuilderHelper &helper)
{
- const auto *node = dynamic_cast<const luci::CircleUnpack *>(circle_node);
- if (node == nullptr)
- throw std::runtime_error("wrong builder for operation");
+ const auto *node = loco::must_cast<const luci::CircleUnpack *>(circle_node);
auto output_nodes = collectOutputNodes<luci::CircleUnpackOut>(node);
assert(node->arity() == 1);
assert(output_nodes.size() == static_cast<size_t>(node->num()));
diff --git a/compiler/luci-interpreter/src/loader/nodes/While.cpp b/compiler/luci-interpreter/src/loader/nodes/While.cpp
index 284dc0c68..8fde6ec8a 100644
--- a/compiler/luci-interpreter/src/loader/nodes/While.cpp
+++ b/compiler/luci-interpreter/src/loader/nodes/While.cpp
@@ -24,9 +24,7 @@ namespace luci_interpreter
std::unique_ptr<Kernel> build_kernel_CircleWhile(const luci::CircleNode *circle_node,
KernelBuilderHelper &helper)
{
- const auto *node = dynamic_cast<const luci::CircleWhile *>(circle_node);
- if (node == nullptr)
- throw std::runtime_error("wrong builder for operation");
+ const auto *node = loco::must_cast<const luci::CircleWhile *>(circle_node);
auto output_nodes = collectOutputNodes<luci::CircleWhileOut>(node);
assert(node->arity() == node->input_count());