7 files changed, 301 insertions, 179 deletions
diff --git a/runtimes/neurun/src/kernel/cpu/AvgPoolLayer.cc b/runtimes/neurun/src/kernel/cpu/AvgPoolLayer.cc
index 2a6a84e10..f434a6dec 100644
--- a/runtimes/neurun/src/kernel/cpu/AvgPoolLayer.cc
+++ b/runtimes/neurun/src/kernel/cpu/AvgPoolLayer.cc
@@ -27,14 +27,14 @@ namespace kernel
 namespace cpu
 {
 
-#define AVGPOOLING_PARAMETERS                               \
-  uint32_t height = getSizeOfDimension(_inputShape, 1);     \
-  uint32_t width = getSizeOfDimension(_inputShape, 2);      \
-  uint32_t outHeight = getSizeOfDimension(_outputShape, 1); \
-  uint32_t outWidth = getSizeOfDimension(_outputShape, 2);  \
-                                                            \
-  uint32_t paddingHeight = (uint32_t)_paddingTop;           \
-  uint32_t paddingWidth = (uint32_t)_paddingLeft;
+#define AVGPOOLING_PARAMETERS /* declares a local `op_params` and copies the layer's stride/kernel/padding members into it */ \
+  tflite::PoolParams op_params;                          \
+  op_params.stride_height = _strideHeight;               \
+  op_params.stride_width = _strideWidth;                 \
+  op_params.filter_height = _kernelHeight;               \
+  op_params.filter_width = _kernelWidth;                 \
+  op_params.padding_values.height = (int8_t)_paddingTop; \
+  op_params.padding_values.width = (int8_t)_paddingLeft; /* NOTE(review): the int8_t casts narrow the padding values - confirm the expected range */
 
 AvgPoolLayer::AvgPoolLayer()
     : _inputData(nullptr), _outputData(nullptr), _inputShape(), _outputShape(), _paddingLeft(0),
@@ -47,31 +47,31 @@ AvgPoolLayer::AvgPoolLayer()
 
 bool AvgPoolLayer::averagePoolFloat32()
 {
-
   AVGPOOLING_PARAMETERS
   float output_activation_min, output_activation_max;
   CalculateActivationRangeFloat(_activation, &output_activation_min, &output_activation_max);
+  op_params.float_activation_min = output_activation_min; // bounds filled in by the call above
+  op_params.float_activation_max = output_activation_max;
 
-  ::tflite::optimized_ops::AveragePool(
-      reinterpret_cast<const float *>(_inputData), convertShapeToDims(_inputShape), _strideWidth,
-      _strideHeight, paddingWidth, paddingHeight, _kernelWidth, _kernelHeight,
-      output_activation_min, output_activation_max, reinterpret_cast<float *>(_outputData),
-      convertShapeToDims(_outputShape));
+  ::tflite::optimized_ops::AveragePool(op_params, convertShapeToTFLiteShape(_inputShape),
+                                       reinterpret_cast<const float *>(_inputData),
+                                       convertShapeToTFLiteShape(_outputShape),
+                                       reinterpret_cast<float *>(_outputData)); // op_params is declared by AVGPOOLING_PARAMETERS
   return true;
 }
 bool AvgPoolLayer::averagePoolQuant8()
 {
-
   AVGPOOLING_PARAMETERS
   int32_t output_activation_min = 0;
   int32_t output_activation_max = 0;
   CalculateActivationRangeUint8(_activation, _outputShape, &output_activation_min,
                                 &output_activation_max);
+  op_params.quantized_activation_min = output_activation_min; // bounds filled in by the call above
+  op_params.quantized_activation_max = output_activation_max;
 
-  ::tflite::optimized_ops::AveragePool(_inputData, convertShapeToDims(_inputShape), _strideWidth,
-                                       _strideHeight, paddingWidth, paddingHeight, _kernelWidth,
-                                       _kernelHeight, output_activation_min, output_activation_max,
-                                       _outputData, convertShapeToDims(_outputShape));
+  ::tflite::optimized_ops::AveragePool(op_params, convertShapeToTFLiteShape(_inputShape),
+                                       _inputData, convertShapeToTFLiteShape(_outputShape),
+                                       _outputData); // op_params is declared by AVGPOOLING_PARAMETERS
   return true;
 }
 
diff --git a/runtimes/neurun/src/kernel/cpu/ConcatLayer.cc b/runtimes/neurun/src/kernel/cpu/ConcatLayer.cc
index 5fe5e3993..be093b437 100644
--- a/runtimes/neurun/src/kernel/cpu/ConcatLayer.cc
+++ b/runtimes/neurun/src/kernel/cpu/ConcatLayer.cc
@@ -24,6 +24,7 @@ namespace neurun
 {
 namespace kernel
 {
+
 namespace cpu
 {
 
@@ -36,13 +37,21 @@ ConcatLayer::ConcatLayer()
 
 bool ConcatLayer::concatenationFloat32()
 {
-  int num_inputs = _inputShapes.size();
-  std::vector<::tflite::Dims<4> *> inputDimsPtr(num_inputs);
-  std::vector<::tflite::Dims<4>> inputDims(num_inputs);
-  for (int i = 0; i < num_inputs; i++)
+  uint32_t num_inputs = _inputShapes.size();
+
+  tflite::ConcatenationParams op_params;
+  op_params.axis = _axis;
+  op_params.inputs_count = num_inputs;
+
+  std::vector<::tflite::RuntimeShape *> inputDimsPtr;
+  std::vector<::tflite::RuntimeShape> inputDims;
+  inputDimsPtr.reserve(num_inputs);
+  inputDims.reserve(num_inputs);
+
+  for (uint32_t i = 0; i < num_inputs; i++)
   {
-    inputDims[i] = convertShapeToDims(_inputShapes[i]);
-    inputDimsPtr[i] = &inputDims[i];
+    inputDims.push_back(convertShapeToTFLiteShape(_inputShapes[i]));
+    inputDimsPtr.push_back(&inputDims[i]);
   }
 
   std::vector<const float *> inputFloatPtrs;
@@ -52,24 +61,44 @@ bool ConcatLayer::concatenationFloat32()
     inputFloatPtrs.emplace_back(reinterpret_cast<const float *>(ptr));
   }
 
-  ::tflite::optimized_ops::Concatenation<::tflite::FusedActivationFunctionType::kNone, float>(
-      getNumberOfDimensions(_outputShape) - _axis - 1, inputFloatPtrs.data(), inputDimsPtr.data(),
-      num_inputs, reinterpret_cast<float *>(_outputData), convertShapeToDims(_outputShape));
+  ::tflite::optimized_ops::Concatenation<float>(
+      op_params, inputDimsPtr.data(), inputFloatPtrs.data(),
+      convertShapeToTFLiteShape(_outputShape), reinterpret_cast<float *>(_outputData));
   return true;
 }
 bool ConcatLayer::concatenationQuant8()
 {
   int num_inputs = _inputShapes.size();
-  std::vector<::tflite::Dims<4> *> inputDimsPtr(num_inputs);
-  std::vector<::tflite::Dims<4>> inputDims(num_inputs);
-  for (int i = 0; i < num_inputs; i++)
+  // Per-input quantization parameters; op_params below only stores raw pointers into these vectors.
+  std::vector<int32_t> input_zeropoints(num_inputs);
+  std::vector<float> input_scales(num_inputs);
+  for (int i = 0; i < num_inputs; i++) // int index: matches the signed type of num_inputs
   {
-    inputDims[i] = convertShapeToDims(_inputShapes[i]);
-    inputDimsPtr[i] = &inputDims[i];
+    input_zeropoints[i] = _inputShapes[i].offset;
+    input_scales[i] = _inputShapes[i].scale;
   }
-  ::tflite::optimized_ops::Concatenation<::tflite::FusedActivationFunctionType::kNone, uint8_t>(
-      getNumberOfDimensions(_outputShape) - _axis - 1, _inputDataPtrs.data(), inputDimsPtr.data(),
-      num_inputs, _outputData, convertShapeToDims(_outputShape));
+
+  tflite::ConcatenationParams op_params;
+  op_params.axis = _axis;
+  op_params.inputs_count = num_inputs;
+  op_params.input_zeropoint = input_zeropoints.data();
+  op_params.input_scale = input_scales.data();
+  op_params.output_zeropoint = _outputShape.offset;
+  op_params.output_scale = _outputShape.scale;
+  // reserve() first: inputDimsPtr keeps addresses of inputDims elements, so inputDims must not reallocate.
+  std::vector<::tflite::RuntimeShape *> inputDimsPtr;
+  std::vector<::tflite::RuntimeShape> inputDims;
+  inputDimsPtr.reserve(num_inputs);
+  inputDims.reserve(num_inputs);
+  for (int i = 0; i < num_inputs; i++) // int index: matches the signed type of num_inputs
+  {
+    inputDims.push_back(convertShapeToTFLiteShape(_inputShapes[i]));
+    inputDimsPtr.push_back(&inputDims[i]);
+  }
+
+  ::tflite::optimized_ops::Concatenation<uint8_t>(
+      op_params, inputDimsPtr.data(), _inputDataPtrs.data(),
+      convertShapeToTFLiteShape(_outputShape), _outputData);
   return true;
 }
 
diff --git a/runtimes/neurun/src/kernel/cpu/ConvolutionLayer.cc b/runtimes/neurun/src/kernel/cpu/ConvolutionLayer.cc
index 81e88e0f0..c694fa75f 100644
--- a/runtimes/neurun/src/kernel/cpu/ConvolutionLayer.cc
+++ b/runtimes/neurun/src/kernel/cpu/ConvolutionLayer.cc
@@ -33,55 +33,51 @@ static constexpr int kStaticBufferSize = 1605632;
 static char static_scratch_buffer[kStaticBufferSize];
 static std::mutex executionMutex;
 
-#define ANDROID_NN_CONV_PARAMETERS(Type)                                      \
-  uint32_t height = getSizeOfDimension(_inputShape, 1);                       \
-  uint32_t width = getSizeOfDimension(_inputShape, 2);                        \
-  uint32_t kernelHeight = getSizeOfDimension(_kernelShape, 1);                \
-  uint32_t kernelWidth = getSizeOfDimension(_kernelShape, 2);                 \
-  uint32_t outHeight = getSizeOfDimension(_outputShape, 1);                   \
-  uint32_t outWidth = getSizeOfDimension(_outputShape, 2);                    \
-  uint32_t inDepth = getSizeOfDimension(_inputShape, 3);                      \
-                                                                              \
-  uint32_t paddingHeight = (uint32_t)_paddingTop;                             \
-  uint32_t paddingWidth = (uint32_t)_paddingLeft;                             \
-                                                                              \
-  ::tflite::Dims<4> im2colDim;                                                \
-  im2colDim.sizes[3] = (int)getSizeOfDimension(_outputShape, 0);              \
-  im2colDim.sizes[2] = (int)getSizeOfDimension(_outputShape, 1);              \
-  im2colDim.sizes[1] = (int)getSizeOfDimension(_outputShape, 2);              \
-  im2colDim.sizes[0] = (int)inDepth * kernelHeight * kernelWidth;             \
-                                                                              \
-  im2colDim.strides[0] = 1;                                                   \
-  for (int i = 1; i < 4; i++)                                                 \
-  {                                                                           \
-    im2colDim.strides[i] = im2colDim.strides[i - 1] * im2colDim.sizes[i - 1]; \
-  }                                                                           \
-  Type *im2colData = nullptr;                                                 \
-  uint64_t im2colByteSize = sizeof(Type);                                     \
-  std::unique_ptr<Type[]> im2colGuard;                                        \
-  for (int i = 0; i < 4; i++)                                                 \
-  {                                                                           \
-    im2colByteSize *= im2colDim.sizes[i];                                     \
-  }                                                                           \
-  /* http://b/77982879, tflite::optimized_ops::Conv uses int for offsets */   \
-  if (im2colByteSize >= 0x7fffffff)                                           \
-  {                                                                           \
-    std::cout << "Conv size is too large, not enough memory" << std::endl;    \
-    return false;                                                             \
-  }                                                                           \
-  if (im2colByteSize <= kStaticBufferSize)                                    \
-  {                                                                           \
-    im2colData = reinterpret_cast<Type *>(static_scratch_buffer);             \
-  }                                                                           \
-  else                                                                        \
-  {                                                                           \
-    im2colData = new (std::nothrow) Type[im2colByteSize / sizeof(Type)];      \
-    if (im2colData == nullptr)                                                \
-    {                                                                         \
-      std::cout << "Conv size is too large, not enough memory" << std::endl;  \
-      return false;                                                           \
-    }                                                                         \
-    im2colGuard.reset(im2colData);                                            \
+#define ANDROID_NN_CONV_PARAMETERS(Type) /* expands to statements that may `return false` from the enclosing method */ \
+  uint32_t height = getSizeOfDimension(_inputShape, 1);                      \
+  uint32_t width = getSizeOfDimension(_inputShape, 2);                       \
+  uint32_t kernelHeight = getSizeOfDimension(_kernelShape, 1);               \
+  uint32_t kernelWidth = getSizeOfDimension(_kernelShape, 2);                \
+  uint32_t outHeight = getSizeOfDimension(_outputShape, 1);                  \
+  uint32_t outWidth = getSizeOfDimension(_outputShape, 2);                   \
+  uint32_t inDepth = getSizeOfDimension(_inputShape, 3);                     \
+  /* Only _paddingTop and _paddingLeft are read for the padding values. */   \
+  uint32_t paddingHeight = (uint32_t)_paddingTop;                            \
+  uint32_t paddingWidth = (uint32_t)_paddingLeft;                            \
+  /* im2col shape: output dims 0..2 plus inDepth*kernelHeight*kernelWidth */ \
+  Shape im2colShape;                                                         \
+  im2colShape.dimensions.resize(4);                                          \
+  im2colShape.dimensions[0] = getSizeOfDimension(_outputShape, 0);           \
+  im2colShape.dimensions[1] = getSizeOfDimension(_outputShape, 1);           \
+  im2colShape.dimensions[2] = getSizeOfDimension(_outputShape, 2);           \
+  im2colShape.dimensions[3] = inDepth * kernelHeight * kernelWidth;          \
+  /* Scratch storage: the static buffer when it fits, else a heap array. */  \
+  Type *im2colData = nullptr;                                                \
+  uint64_t im2colByteSize = sizeof(Type);                                    \
+  std::unique_ptr<Type[]> im2colGuard;                                       \
+  for (int i = 0; i < 4; i++)                                                \
+  {                                                                          \
+    im2colByteSize *= im2colShape.dimensions[i];                             \
+  }                                                                          \
+  /* http://b/77982879, tflite::optimized_ops::Conv uses int for offsets */  \
+  if (im2colByteSize >= 0x7fffffff)                                          \
+  {                                                                          \
+    std::cout << "Conv size is too large, not enough memory" << std::endl;   \
+    return false;                                                            \
+  }                                                                          \
+  if (im2colByteSize <= kStaticBufferSize)                                   \
+  {                                                                          \
+    im2colData = reinterpret_cast<Type *>(static_scratch_buffer);            \
+  }                                                                          \
+  else                                                                       \
+  {                                                                          \
+    im2colData = new (std::nothrow) Type[im2colByteSize / sizeof(Type)];     \
+    if (im2colData == nullptr)                                               \
+    {                                                                        \
+      std::cout << "Conv size is too large, not enough memory" << std::endl; \
+      return false;                                                          \
+    }                                                                        \
+    im2colGuard.reset(im2colData);                                           \
+  }
 
 ConvolutionLayer::ConvolutionLayer()
@@ -112,19 +108,32 @@ bool ConvolutionLayer::convFloat32()
   float output_activation_min, output_activation_max;
   CalculateActivationRangeFloat(_activation, &output_activation_min, &output_activation_max);
   int32_t dilationWidthFactor = 1, dilationHeightFactor = 1;
+
+  ::tflite::ConvParams op_params;
+  op_params.padding_type = ::tflite::PaddingType::kSame;
+  op_params.padding_values.width = paddingWidth;
+  op_params.padding_values.height = paddingHeight;
+  op_params.stride_width = _strideWidth;
+  op_params.stride_height = _strideHeight;
+  op_params.dilation_width_factor = dilationWidthFactor;
+  op_params.dilation_height_factor = dilationHeightFactor;
+  op_params.float_activation_min = output_activation_min;
+  op_params.float_activation_max = output_activation_max;
+
   ::tflite::optimized_ops::Conv(
-      reinterpret_cast<const float *>(_inputData), convertShapeToDims(_inputShape),
-      reinterpret_cast<const float *>(_kernelData), convertShapeToDims(_kernelShape),
-      reinterpret_cast<const float *>(_biasData), convertShapeToDims(_biasShape), _strideWidth,
-      _strideHeight, dilationWidthFactor, dilationHeightFactor, paddingWidth, paddingHeight,
-      output_activation_min, output_activation_max, reinterpret_cast<float *>(_outputData),
-      convertShapeToDims(_outputShape), im2colDataToPass, im2colDim);
+      op_params, convertShapeToTFLiteShape(_inputShape),
+      reinterpret_cast<const float *>(_inputData), convertShapeToTFLiteShape(_kernelShape),
+      reinterpret_cast<const float *>(_kernelData), convertShapeToTFLiteShape(_biasShape),
+      reinterpret_cast<const float *>(_biasData), convertShapeToTFLiteShape(_outputShape),
+      reinterpret_cast<float *>(_outputData), convertShapeToTFLiteShape(im2colShape),
+      im2colDataToPass);
   return true;
 }
 
 bool ConvolutionLayer::convQuant8()
 {
   ANDROID_NN_CONV_PARAMETERS(uint8_t)
+
   int32_t inputOffset = -_inputShape.offset;
   int32_t kernelOffset = -_kernelShape.offset;
   int32_t outputOffset = _outputShape.offset;
@@ -141,6 +150,24 @@ bool ConvolutionLayer::convQuant8()
   }
   CalculateActivationRangeUint8(_activation, _outputShape, &output_activation_min,
                                 &output_activation_max);
+  int32_t dilationWidthFactor = 1, dilationHeightFactor = 1;
+
+  ::tflite::ConvParams op_params;
+  op_params.padding_type = ::tflite::PaddingType::kSame;
+  op_params.padding_values.width = paddingWidth;
+  op_params.padding_values.height = paddingHeight;
+  op_params.stride_width = _strideWidth;
+  op_params.stride_height = _strideHeight;
+  op_params.dilation_width_factor = dilationWidthFactor;
+  op_params.dilation_height_factor = dilationHeightFactor;
+  op_params.input_offset = inputOffset;
+  op_params.weights_offset = kernelOffset;
+  op_params.output_offset = outputOffset;
+  op_params.output_multiplier = output_multiplier;
+  op_params.output_shift = output_shift;
+  op_params.quantized_activation_min = output_activation_min;
+  op_params.quantized_activation_max = output_activation_max;
+
   static gemmlowp::GemmContext gemm_context;
   // Prevent concurrent executions that may access the scratch buffer and
   // gemm_context.
@@ -148,11 +175,10 @@ bool ConvolutionLayer::convQuant8()
   // Alow gemmlowp automatically decide how many threads to use.
   gemm_context.set_max_num_threads(0);
   ::tflite::optimized_ops::Conv(
-      _inputData, convertShapeToDims(_inputShape), inputOffset, _kernelData,
-      convertShapeToDims(_kernelShape), kernelOffset, reinterpret_cast<const int32_t *>(_biasData),
-      convertShapeToDims(_biasShape), _strideWidth, _strideHeight, paddingWidth, paddingHeight,
-      outputOffset, output_multiplier, output_shift, output_activation_min, output_activation_max,
-      _outputData, convertShapeToDims(_outputShape), im2colData, im2colDim, &gemm_context);
+      op_params, convertShapeToTFLiteShape(_inputShape), _inputData,
+      convertShapeToTFLiteShape(_kernelShape), _kernelData, convertShapeToTFLiteShape(_biasShape),
+      reinterpret_cast<const int32_t *>(_biasData), convertShapeToTFLiteShape(_outputShape),
+      _outputData, convertShapeToTFLiteShape(im2colShape), im2colData, &gemm_context);
   return true;
 }
 
diff --git a/runtimes/neurun/src/kernel/cpu/FullyConnectedLayer.cc b/runtimes/neurun/src/kernel/cpu/FullyConnectedLayer.cc
index 41b9afc0c..abe82db5e 100644
--- a/runtimes/neurun/src/kernel/cpu/FullyConnectedLayer.cc
+++ b/runtimes/neurun/src/kernel/cpu/FullyConnectedLayer.cc
@@ -44,64 +44,39 @@ FullyConnectedLayer::FullyConnectedLayer()
 static std::mutex executionMutex;
 bool FullyConnectedLayer::fullyConnectedFloat32()
 {
-  float output_activation_min, output_activation_max;
-  CalculateActivationRangeFloat(_activation, &output_activation_min, &output_activation_max);
-  // b/80425683, optimized implementation produces incorrect results when the
-  // number of input elements is the squre of batch_size.
-  uint32_t batch_size = getSizeOfDimension(_outputShape, 0);
-  uint32_t input_n_elements = getNumberOfElements(_inputShape);
-  if (batch_size * batch_size == input_n_elements)
+  int total_input_size = 1;
+  for (size_t i = 0; i < _inputShape.dimensions.size(); i++) // size_t: same signedness as size()
   {
-    ::tflite::reference_ops::FullyConnected(
-        reinterpret_cast<const float *>(_inputData), convertShapeToDims(_inputShape),
-        reinterpret_cast<const float *>(_weightsData), convertShapeToDims(_weightsShape),
-        reinterpret_cast<const float *>(_biasData), convertShapeToDims(_biasShape),
-        output_activation_min, output_activation_max, reinterpret_cast<float *>(_outputData),
-        convertShapeToDims(_outputShape));
-  }
-  else
-  {
-    ::tflite::optimized_ops::FullyConnected(
-        reinterpret_cast<const float *>(_inputData), convertShapeToDims(_inputShape),
-        reinterpret_cast<const float *>(_weightsData), convertShapeToDims(_weightsShape),
-        reinterpret_cast<const float *>(_biasData), convertShapeToDims(_biasShape),
-        output_activation_min, output_activation_max, reinterpret_cast<float *>(_outputData),
-        convertShapeToDims(_outputShape));
+    total_input_size *= _inputShape.dimensions[i];
   }
+  // Weights are read as [num_units, input_size]; batch_size is the input element count / input_size.
+  int input_size = _weightsShape.dimensions[1];
+  const int batch_size = total_input_size / input_size;
+  const int num_units = _weightsShape.dimensions[0];
+
+  TfLiteFusedActivation act = convertFusedActivation(_activation);
+  // Seed the output with the bias (presumably one copy of the bias vector per batch row - confirm).
+  ::tflite::tensor_utils::VectorBatchVectorAssign(reinterpret_cast<const float *>(_biasData),
+                                                  num_units, batch_size,
+                                                  reinterpret_cast<float *>(_outputData));
+
+  // Compute output += weight * input
+  ::tflite::tensor_utils::MatrixBatchVectorMultiplyAccumulate(
+      reinterpret_cast<const float *>(_weightsData), num_units, input_size,
+      reinterpret_cast<const float *>(_inputData), batch_size,
+      reinterpret_cast<float *>(_outputData), /*result_stride=*/1);
+
+  // Apply activation function
+  ::tflite::tensor_utils::ApplyActivationToVector(reinterpret_cast<float *>(_outputData),
+                                                  batch_size * num_units, act,
+                                                  reinterpret_cast<float *>(_outputData));
+
   return true;
 }
 
 bool FullyConnectedLayer::fullyConnectedQuant8()
 {
-  int32_t inputOffset = -_inputShape.offset;
-  int32_t weightsOffset = -_weightsShape.offset;
-  int32_t outputOffset = _outputShape.offset;
-  float real_multiplier = 0.0;
-  int32_t output_multiplier = 0;
-  int32_t output_shift = 0;
-  int32_t output_activation_min = 0;
-  int32_t output_activation_max = 0;
-  // Caution : 'Convolution' can make misleading. It seems it is just math term.
-  if (!GetQuantizedConvolutionMultipler(_inputShape, _weightsShape, _biasShape, _outputShape,
-                                        &real_multiplier) ||
-      !QuantizeMultiplierSmallerThanOne(real_multiplier, &output_multiplier, &output_shift))
-  {
-    return false;
-  }
-  CalculateActivationRangeUint8(_activation, _outputShape, &output_activation_min,
-                                &output_activation_max);
-  static gemmlowp::GemmContext gemm_context;
-  // Prevent concurrent executions that access gemm_context.
-  std::unique_lock<std::mutex> lock(executionMutex);
-  // Alow gemmlowp automatically decide how many threads to use.
-  gemm_context.set_max_num_threads(0);
-  ::tflite::optimized_ops::FullyConnected(
-      _inputData, convertShapeToDims(_inputShape), inputOffset, _weightsData,
-      convertShapeToDims(_weightsShape), weightsOffset,
-      reinterpret_cast<const int32_t *>(_biasData), convertShapeToDims(_biasShape), outputOffset,
-      output_multiplier, output_shift, output_activation_min, output_activation_max, _outputData,
-      convertShapeToDims(_outputShape), &gemm_context);
-  return true;
+  throw std::runtime_error{"FullyConnectedLayer : Not tested for TENSOR_QUANT8_ASYMM"}; // never returns: the quant8 path now always throws
 }
 
 void FullyConnectedLayer::configure(uint8_t *inputData, const Shape inputShape,
diff --git a/runtimes/neurun/src/kernel/cpu/MaxPoolLayer.cc b/runtimes/neurun/src/kernel/cpu/MaxPoolLayer.cc
index 3d96bb401..c4a288b07 100644
--- a/runtimes/neurun/src/kernel/cpu/MaxPoolLayer.cc
+++ b/runtimes/neurun/src/kernel/cpu/MaxPoolLayer.cc
@@ -26,14 +26,14 @@ namespace kernel
 namespace cpu
 {
 
-#define MAXPOOLING_PARAMETERS                               \
-  uint32_t height = getSizeOfDimension(_inputShape, 1);     \
-  uint32_t width = getSizeOfDimension(_inputShape, 2);      \
-  uint32_t outHeight = getSizeOfDimension(_outputShape, 1); \
-  uint32_t outWidth = getSizeOfDimension(_outputShape, 2);  \
-                                                            \
-  uint32_t paddingHeight = (uint32_t)_paddingTop;           \
-  uint32_t paddingWidth = (uint32_t)_paddingLeft;
+#define MAXPOOLING_PARAMETERS /* declares a local `op_params` and copies the layer's stride/kernel/padding members into it */ \
+  tflite::PoolParams op_params;                          \
+  op_params.stride_height = _strideHeight;               \
+  op_params.stride_width = _strideWidth;                 \
+  op_params.filter_height = _kernelHeight;               \
+  op_params.filter_width = _kernelWidth;                 \
+  op_params.padding_values.height = (int8_t)_paddingTop; \
+  op_params.padding_values.width = (int8_t)_paddingLeft; /* NOTE(review): the int8_t casts narrow the padding values - confirm the expected range */
 
 MaxPoolLayer::MaxPoolLayer()
     : _inputData(nullptr), _outputData(nullptr), _inputShape(), _outputShape(), _paddingLeft(0),
@@ -46,31 +46,30 @@ MaxPoolLayer::MaxPoolLayer()
 
 bool MaxPoolLayer::maxPoolFloat32()
 {
-
   MAXPOOLING_PARAMETERS
   float output_activation_min, output_activation_max;
   CalculateActivationRangeFloat(_activation, &output_activation_min, &output_activation_max);
+  op_params.float_activation_min = output_activation_min; // bounds filled in by the call above
+  op_params.float_activation_max = output_activation_max;
 
-  ::tflite::optimized_ops::MaxPool(
-      reinterpret_cast<const float *>(_inputData), convertShapeToDims(_inputShape), _strideWidth,
-      _strideHeight, paddingWidth, paddingHeight, _kernelWidth, _kernelHeight,
-      output_activation_min, output_activation_max, reinterpret_cast<float *>(_outputData),
-      convertShapeToDims(_outputShape));
+  ::tflite::optimized_ops::MaxPool(op_params, convertShapeToTFLiteShape(_inputShape),
+                                   reinterpret_cast<const float *>(_inputData),
+                                   convertShapeToTFLiteShape(_outputShape),
+                                   reinterpret_cast<float *>(_outputData)); // op_params is declared by MAXPOOLING_PARAMETERS
   return true;
 }
 bool MaxPoolLayer::maxPoolQuant8()
 {
-
   MAXPOOLING_PARAMETERS
   int32_t output_activation_min = 0;
   int32_t output_activation_max = 0;
   CalculateActivationRangeUint8(_activation, _outputShape, &output_activation_min,
                                 &output_activation_max);
+  op_params.quantized_activation_min = output_activation_min; // bounds filled in by the call above
+  op_params.quantized_activation_max = output_activation_max;
 
-  ::tflite::optimized_ops::MaxPool(_inputData, convertShapeToDims(_inputShape), _strideWidth,
-                                   _strideHeight, paddingWidth, paddingHeight, _kernelWidth,
-                                   _kernelHeight, output_activation_min, output_activation_max,
-                                   _outputData, convertShapeToDims(_outputShape));
+  ::tflite::optimized_ops::MaxPool(op_params, convertShapeToTFLiteShape(_inputShape), _inputData,
+                                   convertShapeToTFLiteShape(_outputShape), _outputData); // op_params is declared by MAXPOOLING_PARAMETERS
   return true;
 }
 
diff --git a/runtimes/neurun/src/kernel/cpu/OperationUtils.h b/runtimes/neurun/src/kernel/cpu/OperationUtils.h
index 5914d04e3..066b1e573 100644
--- a/runtimes/neurun/src/kernel/cpu/OperationUtils.h
+++ b/runtimes/neurun/src/kernel/cpu/OperationUtils.h
@@ -23,7 +23,9 @@
 #include <limits>
 #include <vector>
 
+#include "tensorflow/contrib/lite/c/builtin_op_data.h"
 #include "tensorflow/contrib/lite/kernels/internal/types.h"
+#include "tensorflow/contrib/lite/kernels/internal/tensor.h"
 #include "graph/operand/Object.h"
 #include "graph/operand/DataType.h"
 
@@ -75,6 +77,51 @@ inline ::tflite::Dims<4> convertShapeToDims(const Shape &shape)
   return dims;
 }
 
+inline ::tflite::RuntimeShape convertShapeToTFLiteShape(const Shape &shape)
+{
+  std::vector<int32_t> raw_shape;
+  raw_shape.resize(4);
+  // The result always has exactly 4 dims: missing trailing dims become 1, dims past index 3 are ignored.
+  for (uint32_t i = 0; i < 4; ++i)
+  {
+    if (i >= shape.dimensions.size())
+    {
+      raw_shape[i] = 1; // pad: the source shape has fewer than i + 1 dimensions
+    }
+    else
+    {
+      raw_shape[i] = shape.dimensions[i]; // copied in the same order as the source shape
+    }
+  }
+
+  return ::tflite::GetTensorShape(raw_shape);
+}
+
+inline TfLiteFusedActivation convertFusedActivation(FuseCode act)
+{
+  if (act == ANEURALNETWORKS_FUSED_NONE) // each ANEURALNETWORKS_FUSED_* code maps to one kTfLiteAct* value
+  {
+    return kTfLiteActNone;
+  }
+
+  if (act == ANEURALNETWORKS_FUSED_RELU)
+  {
+    return kTfLiteActRelu;
+  }
+
+  if (act == ANEURALNETWORKS_FUSED_RELU1)
+  {
+    return kTfLiteActRelu1;
+  }
+
+  if (act == ANEURALNETWORKS_FUSED_RELU6)
+  {
+    return kTfLiteActRelu6;
+  }
+  // Any other FuseCode value falls back to "no activation".
+  return kTfLiteActNone;
+}
+
 __wur bool QuantizeMultiplierSmallerThanOne(double double_multiplier, int32_t *quantized_multiplier,
                                             int32_t *right_shift);
 
diff --git a/runtimes/neurun/src/kernel/cpu/SoftMaxLayer.cc b/runtimes/neurun/src/kernel/cpu/SoftMaxLayer.cc
index 4f5a69f2e..c998c65f6 100644
--- a/runtimes/neurun/src/kernel/cpu/SoftMaxLayer.cc
+++ b/runtimes/neurun/src/kernel/cpu/SoftMaxLayer.cc
@@ -33,45 +33,86 @@ SoftMaxLayer::SoftMaxLayer()
   // DO NOTHING
 }
 
+// Row-wise softmax: reads `in` as batch_size consecutive rows of input_size floats and writes `out`.
+void Softmax(const float *in, const int input_size, const int batch_size, const float beta,
+             float *out)
+{
+  TF_LITE_ASSERT(input_size > 0); // in[0] is read unconditionally for every row below
+
+  // For each batch
+  for (int b = 0; b < batch_size; b++)
+  {
+    // Find the max coeff.
+    float max_coeff = in[0];
+    for (int i = 1; i < input_size; i++)
+    {
+      if (in[i] > max_coeff)
+        max_coeff = in[i];
+    }
+
+    // Compute the normalized sum of exps.
+    float exp_sum = 0.0;
+    for (int i = 0; i < input_size; i++)
+    {
+      out[i] = std::exp((in[i] - max_coeff) * beta); // each value is shifted by the row max, then scaled by beta
+      exp_sum += out[i];
+    }
+
+    // Divide by the sum of exps.
+    float reciprocal_sum_exp = 1.f / exp_sum;
+    for (int i = 0; i < input_size; i++)
+    {
+      out[i] *= reciprocal_sum_exp;
+    }
+
+    // Advance in and out pointers for the next batch.
+    in += input_size;
+    out += input_size;
+  }
+}
+
 bool SoftMaxLayer::softmaxFloat32()
 {
-  ::tflite::Dims<4> dim;
+  // 2-D inputs are handled by the local row-wise Softmax() helper, 4-D inputs by the TFLite
+  // optimized kernel; any other rank is rejected with `false`.
   if (getNumberOfDimensions(_inputShape) == 2)
   {
     uint32_t batch_size = getSizeOfDimension(_inputShape, 0);
     uint32_t input_size = getNumberOfElements(_inputShape) / batch_size;
-    Shape shapeIn4D;
-    shapeIn4D.dimensions = {batch_size, 1, 1, input_size};
-    dim = convertShapeToDims(shapeIn4D);
+    Softmax(reinterpret_cast<const float *>(_inputData), input_size, batch_size, _beta,
+            reinterpret_cast<float *>(_outputData));
   }
   else if (getNumberOfDimensions(_inputShape) == 4)
   {
-    dim = convertShapeToDims(_inputShape);
+    ::tflite::SoftmaxParams op_params;
+    op_params.beta = _beta;
+    ::tflite::optimized_ops::Softmax(op_params, convertShapeToTFLiteShape(_inputShape),
+                                     reinterpret_cast<const float *>(_inputData),
+                                     convertShapeToTFLiteShape(_outputShape),
+                                     reinterpret_cast<float *>(_outputData));
   }
   else
   {
     std::cout << "only 2D and 4D tensors supported" << std::endl;
     return false;
   }
-  ::tflite::optimized_ops::Softmax(reinterpret_cast<const float *>(_inputData), dim, _beta,
-                                   reinterpret_cast<float *>(_outputData), dim);
+
   return true;
 }
 
 bool SoftMaxLayer::softmaxQuant8()
 {
-  ::tflite::Dims<4> dim;
+  Shape shapeIn4D = _inputShape;
+
   if (getNumberOfDimensions(_inputShape) == 2)
   {
     uint32_t batch_size = getSizeOfDimension(_inputShape, 0);
     uint32_t input_size = getNumberOfElements(_inputShape) / batch_size;
-    Shape shapeIn4D;
     shapeIn4D.dimensions = {batch_size, 1, 1, input_size};
-    dim = convertShapeToDims(shapeIn4D);
   }
   else if (getNumberOfDimensions(_inputShape) == 4)
   {
-    dim = convertShapeToDims(_inputShape);
+    shapeIn4D = _inputShape;
   }
   else
   {
@@ -94,8 +135,13 @@ bool SoftMaxLayer::softmaxQuant8()
     return false;
   }
   float diff_min = -1.0f * CalculateInputRadius(kScaledDiffIntegerBits, input_left_shift);
-  ::tflite::optimized_ops::Softmax(_inputData, dim, input_multiplier, input_left_shift, diff_min,
-                                   _outputData, dim);
+
+  ::tflite::SoftmaxParams op_params;
+  op_params.input_multiplier = input_multiplier;
+  op_params.input_left_shift = input_left_shift;
+  op_params.diff_min = diff_min;
+  ::tflite::optimized_ops::Softmax(op_params, convertShapeToTFLiteShape(shapeIn4D), _inputData,
+                                   convertShapeToTFLiteShape(shapeIn4D), _outputData);
   return true;
 }