summaryrefslogtreecommitdiff
path: root/runtime/neurun/backend/cpu/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'runtime/neurun/backend/cpu/kernel')
-rw-r--r--runtime/neurun/backend/cpu/kernel/AddLayer.cc101
-rw-r--r--runtime/neurun/backend/cpu/kernel/AddLayer.h77
-rw-r--r--runtime/neurun/backend/cpu/kernel/AvgPoolLayer.cc116
-rw-r--r--runtime/neurun/backend/cpu/kernel/AvgPoolLayer.h85
-rw-r--r--runtime/neurun/backend/cpu/kernel/ConcatLayer.cc137
-rw-r--r--runtime/neurun/backend/cpu/kernel/ConcatLayer.h73
-rw-r--r--runtime/neurun/backend/cpu/kernel/ConvolutionLayer.cc140
-rw-r--r--runtime/neurun/backend/cpu/kernel/ConvolutionLayer.h88
-rw-r--r--runtime/neurun/backend/cpu/kernel/DepthwiseConvolutionLayer.cc143
-rw-r--r--runtime/neurun/backend/cpu/kernel/DepthwiseConvolutionLayer.h90
-rw-r--r--runtime/neurun/backend/cpu/kernel/FullyConnectedLayer.cc119
-rw-r--r--runtime/neurun/backend/cpu/kernel/FullyConnectedLayer.h77
-rw-r--r--runtime/neurun/backend/cpu/kernel/GatherLayer.cc79
-rw-r--r--runtime/neurun/backend/cpu/kernel/GatherLayer.h74
-rw-r--r--runtime/neurun/backend/cpu/kernel/LogisticLayer.cc75
-rw-r--r--runtime/neurun/backend/cpu/kernel/LogisticLayer.h69
-rw-r--r--runtime/neurun/backend/cpu/kernel/MaxPoolLayer.cc116
-rw-r--r--runtime/neurun/backend/cpu/kernel/MaxPoolLayer.h85
-rw-r--r--runtime/neurun/backend/cpu/kernel/MulLayer.cc101
-rw-r--r--runtime/neurun/backend/cpu/kernel/MulLayer.h77
-rw-r--r--runtime/neurun/backend/cpu/kernel/OperationUtils.cc273
-rw-r--r--runtime/neurun/backend/cpu/kernel/OperationUtils.h152
-rw-r--r--runtime/neurun/backend/cpu/kernel/PadLayer.cc76
-rw-r--r--runtime/neurun/backend/cpu/kernel/PadLayer.h75
-rw-r--r--runtime/neurun/backend/cpu/kernel/PermuteLayer.cc71
-rw-r--r--runtime/neurun/backend/cpu/kernel/PermuteLayer.h209
-rw-r--r--runtime/neurun/backend/cpu/kernel/ReshapeLayer.cc54
-rw-r--r--runtime/neurun/backend/cpu/kernel/ReshapeLayer.h65
-rw-r--r--runtime/neurun/backend/cpu/kernel/SoftMaxLayer.cc172
-rw-r--r--runtime/neurun/backend/cpu/kernel/SoftMaxLayer.h71
-rw-r--r--runtime/neurun/backend/cpu/kernel/SubLayer.cc100
-rw-r--r--runtime/neurun/backend/cpu/kernel/SubLayer.h77
32 files changed, 3317 insertions, 0 deletions
diff --git a/runtime/neurun/backend/cpu/kernel/AddLayer.cc b/runtime/neurun/backend/cpu/kernel/AddLayer.cc
new file mode 100644
index 000000000..8a2d872e5
--- /dev/null
+++ b/runtime/neurun/backend/cpu/kernel/AddLayer.cc
@@ -0,0 +1,101 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "AddLayer.h"
+
+#include <cker/operation/BinaryArithmeticOps.h>
+
+#include "OperationUtils.h"
+
+namespace neurun
+{
+namespace backend
+{
+namespace cpu
+{
+namespace kernel
+{
+
+// Element-wise float32 addition. Uses the slow broadcast kernel when the two
+// operand shapes differ; otherwise the regular (same-shape) cker kernel.
+void AddLayer::addFloat32()
+{
+ float output_activation_min, output_activation_max;
+ CalculateActivationRangeFloat(_activation, &output_activation_min, &output_activation_max);
+ nnfw::cker::BinaryArithmeticOpParam op_params;
+ op_params.float_activation_max = output_activation_max;
+ op_params.float_activation_min = output_activation_min;
+
+ // The op itself is passed in as a callable so the same cker entry point can
+ // serve add/sub/mul; here it is plain addition.
+ const std::function<float(const float &, const float &)> fn = [](const float &a, const float &b) {
+ return a + b;
+ };
+
+ if (!HaveSameShapes(&_lhsDescr, &_rhsDescr))
+ {
+ // Shapes differ: take the broadcasting path (extended-rank shapes required).
+ nnfw::cker::BroadcastBinaryArithmeticOpSlow(
+ op_params, convertToExtendedCkerShape(_lhsDescr), _lhsData.f,
+ convertToExtendedCkerShape(_rhsDescr), _rhsData.f, convertToExtendedCkerShape(_outputDescr),
+ _outputData.f, fn);
+ return;
+ }
+
+ nnfw::cker::BinaryArithmeticOp(op_params, convertTensorDescriptorToCkerShape(_lhsDescr),
+ _lhsData.f, convertTensorDescriptorToCkerShape(_rhsDescr),
+ _rhsData.f, convertTensorDescriptorToCkerShape(_outputDescr),
+ _outputData.f, fn);
+}
+
+// Quantized (uint8 asymmetric) addition — not implemented yet, always throws.
+// NOTE(review): the activation range below is computed and then discarded;
+// with the commented-out block it only triggers unused-variable warnings.
+// Consider deleting it until the cker quant8 kernel lands.
+void AddLayer::addQuant8()
+{
+ int32_t output_activation_min, output_activation_max;
+ CalculateActivationRangeUint8(_activation, _outputDescr, &output_activation_min,
+ &output_activation_max);
+ // nnfw::cker::BinaryArithmeticOpParam op_params;
+ // op_params.quantized_activation_max = output_activation_max;
+ // op_params.quantized_activation_min = output_activation_min;
+
+ // cker quant8 add is not implemented yet
+ throw std::runtime_error{"NYI"};
+}
+
+// Stores operand pointers/descriptors for a later run(). The element type is
+// taken from the LHS descriptor; RHS is assumed to match (not checked here).
+void AddLayer::configure(uint8_t *lhsData, const TensorDescriptor &lhsDescr, uint8_t *rhsData,
+ const TensorDescriptor &rhsDescr, const ir::Activation activation,
+ uint8_t *outputData, const TensorDescriptor &outputDescr)
+{
+ _lhsData.u8 = lhsData;
+ _lhsDescr = lhsDescr;
+ _rhsData.u8 = rhsData;
+ _rhsDescr = rhsDescr;
+ _inputType = lhsDescr.type;
+ _activation = activation;
+ _outputData.u8 = outputData;
+ _outputDescr = outputDescr;
+}
+
+// Dispatches on the configured operand type.
+// NOTE(review): any type other than FLOAT32/QUANT8_ASYMM silently does
+// nothing — an `else throw std::runtime_error{...}` would surface misuse.
+void AddLayer::run()
+{
+ if (_inputType == OperandType::FLOAT32)
+ {
+ addFloat32();
+ }
+ else if (_inputType == OperandType::QUANT8_ASYMM)
+ {
+ addQuant8();
+ }
+}
+
+} // namespace kernel
+} // namespace cpu
+} // namespace backend
+} // namespace neurun
diff --git a/runtime/neurun/backend/cpu/kernel/AddLayer.h b/runtime/neurun/backend/cpu/kernel/AddLayer.h
new file mode 100644
index 000000000..7018e4c48
--- /dev/null
+++ b/runtime/neurun/backend/cpu/kernel/AddLayer.h
@@ -0,0 +1,77 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NEURUN_BACKEND_CPU_KERNEL_ADDLAYER_H__
+#define __NEURUN_BACKEND_CPU_KERNEL_ADDLAYER_H__
+
+#include <exec/IFunction.h>
+
+#include "OperationUtils.h"
+
+namespace neurun
+{
+namespace backend
+{
+namespace cpu
+{
+namespace kernel
+{
+
+// CPU kernel implementing element-wise Add with an optional fused activation.
+// Lifecycle: construct, configure() once with operand buffers/descriptors,
+// then run() per execution. Non-owning: the raw buffers are owned elsewhere.
+class AddLayer : public ::neurun::exec::IFunction
+{
+public:
+ AddLayer() : _lhsData(), _rhsData(), _outputData(), _lhsDescr(), _rhsDescr(), _outputDescr()
+ {
+ // DO NOTHING
+ }
+
+public:
+ // Float32 path (supports broadcasting); see AddLayer.cc.
+ void addFloat32();
+
+ // Quantized path — currently throws "NYI".
+ void addQuant8();
+
+ // Records operand pointers and descriptors; element type is taken from lhsDescr.
+ void configure(uint8_t *lhsData, const TensorDescriptor &lhsDescr, uint8_t *rhsData,
+ const TensorDescriptor &rhsDescr, const ir::Activation activation,
+ uint8_t *outputData, const TensorDescriptor &outputDescr);
+
+ void run();
+ void runSync()
+ {
+ // this abstract method is used just for profiling and called for
+ // backend::acl_common::AclFunction
+ run();
+ }
+
+private:
+ // Type-punned views of the externally owned operand buffers.
+ DataPtr _lhsData;
+ DataPtr _rhsData;
+ DataPtr _outputData;
+
+ TensorDescriptor _lhsDescr;
+ TensorDescriptor _rhsDescr;
+ TensorDescriptor _outputDescr;
+
+ ir::Activation _activation{ir::Activation::NONE};
+
+ OperandType _inputType{OperandType::FLOAT32};
+};
+
+} // namespace kernel
+} // namespace cpu
+} // namespace backend
+} // namespace neurun
+
+#endif // __NEURUN_BACKEND_CPU_KERNEL_ADDLAYER_H__
diff --git a/runtime/neurun/backend/cpu/kernel/AvgPoolLayer.cc b/runtime/neurun/backend/cpu/kernel/AvgPoolLayer.cc
new file mode 100644
index 000000000..389955796
--- /dev/null
+++ b/runtime/neurun/backend/cpu/kernel/AvgPoolLayer.cc
@@ -0,0 +1,116 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (C) 2017 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "AvgPoolLayer.h"
+
+#include "OperationUtils.h"
+
+#include <cker/operation/AveragePool.h>
+
+namespace neurun
+{
+namespace backend
+{
+namespace cpu
+{
+namespace kernel
+{
+
+// Shared cker PoolParams setup for both dtype paths.
+// NOTE(review): the (int8_t) casts narrow the uint32_t padding values; any
+// padding > 127 would wrap. cker's PaddingValues fields are wider than int8
+// upstream — confirm and drop the casts (or cast to the actual field type).
+#define AVGPOOLING_PARAMETERS \
+ nnfw::cker::PoolParams op_params; \
+ op_params.stride_height = _strideHeight; \
+ op_params.stride_width = _strideWidth; \
+ op_params.filter_height = _kernelHeight; \
+ op_params.filter_width = _kernelWidth; \
+ op_params.padding_values.height = (int8_t)_paddingTop; \
+ op_params.padding_values.width = (int8_t)_paddingLeft;
+
+AvgPoolLayer::AvgPoolLayer()
+ : _inputData(), _outputData(), _inputDescr(), _outputDescr(), _paddingLeft(0), _paddingTop(0),
+ _paddingRight(0), _paddingBottom(0), _strideWidth(0), _strideHeight(0), _kernelWidth(0),
+ _kernelHeight(0), _activation(ir::Activation::NONE), _inputType(OperandType::FLOAT32)
+{
+ // DO NOTHING
+}
+
+// Float32 average pooling with fused-activation clamping.
+void AvgPoolLayer::averagePoolFloat32()
+{
+ AVGPOOLING_PARAMETERS
+ float output_activation_min, output_activation_max;
+ CalculateActivationRangeFloat(_activation, &output_activation_min, &output_activation_max);
+ op_params.float_activation_min = output_activation_min;
+ op_params.float_activation_max = output_activation_max;
+
+ nnfw::cker::AveragePool(op_params, convertTensorDescriptorToCkerShape(_inputDescr), _inputData.f,
+ convertTensorDescriptorToCkerShape(_outputDescr), _outputData.f);
+}
+// Quantized (uint8) average pooling; clamp range derives from the output
+// descriptor's scale/offset.
+void AvgPoolLayer::averagePoolQuant8()
+{
+ AVGPOOLING_PARAMETERS
+ int32_t output_activation_min = 0;
+ int32_t output_activation_max = 0;
+ CalculateActivationRangeUint8(_activation, _outputDescr, &output_activation_min,
+ &output_activation_max);
+ op_params.quantized_activation_min = output_activation_min;
+ op_params.quantized_activation_max = output_activation_max;
+
+ nnfw::cker::AveragePool(op_params, convertTensorDescriptorToCkerShape(_inputDescr), _inputData.u8,
+ convertTensorDescriptorToCkerShape(_outputDescr), _outputData.u8);
+}
+
+// Stores all pooling parameters for run().
+// NOTE(review): paddingRight/paddingBottom are stored but never forwarded to
+// cker (the macro passes only top/left) — presumably cker derives the rest
+// from shapes/strides; verify.
+void AvgPoolLayer::configure(uint8_t *inputData, const TensorDescriptor inputDescr,
+ const uint32_t paddingLeft, const uint32_t paddingRight,
+ const uint32_t paddingTop, const uint32_t paddingBottom,
+ const uint32_t strideWidth, const uint32_t strideHeight,
+ const uint32_t kernelWidth, const uint32_t kernelHeight,
+ const ir::Activation activation, uint8_t *outputData,
+ const TensorDescriptor outputDescr)
+{
+ _inputData.u8 = inputData;
+ _inputDescr = inputDescr;
+ _inputType = inputDescr.type;
+ _paddingLeft = paddingLeft;
+ _paddingRight = paddingRight;
+ _paddingTop = paddingTop;
+ _paddingBottom = paddingBottom;
+ _strideWidth = strideWidth;
+ _strideHeight = strideHeight;
+ _kernelWidth = kernelWidth;
+ _kernelHeight = kernelHeight;
+ _activation = activation;
+ _outputData.u8 = outputData;
+ _outputDescr = outputDescr;
+}
+
+// Dispatches on dtype. NOTE(review): unsupported types silently no-op.
+void AvgPoolLayer::run()
+{
+ if (_inputType == OperandType::FLOAT32)
+ {
+ averagePoolFloat32();
+ }
+ else if (_inputType == OperandType::QUANT8_ASYMM)
+ {
+ averagePoolQuant8();
+ }
+}
+
+#undef AVGPOOLING_PARAMETERS
+
+} // namespace kernel
+} // namespace cpu
+} // namespace backend
+} // namespace neurun
diff --git a/runtime/neurun/backend/cpu/kernel/AvgPoolLayer.h b/runtime/neurun/backend/cpu/kernel/AvgPoolLayer.h
new file mode 100644
index 000000000..6339efa41
--- /dev/null
+++ b/runtime/neurun/backend/cpu/kernel/AvgPoolLayer.h
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NEURUN_BACKEND_CPU_KERNEL_AVGPOOLLAYER_H__
+#define __NEURUN_BACKEND_CPU_KERNEL_AVGPOOLLAYER_H__
+
+#include <exec/IFunction.h>
+
+#include "OperationUtils.h"
+
+namespace neurun
+{
+namespace backend
+{
+namespace cpu
+{
+namespace kernel
+{
+
+// CPU kernel for 2D average pooling (float32 and asymmetric-uint8 paths).
+// Non-owning over the input/output buffers; configure() must precede run().
+class AvgPoolLayer : public ::neurun::exec::IFunction
+{
+public:
+ AvgPoolLayer();
+
+public:
+ void averagePoolFloat32();
+
+ void averagePoolQuant8();
+
+ // Records buffers, padding, strides, kernel size and fused activation.
+ void configure(uint8_t *inputData, const TensorDescriptor inputDescr, const uint32_t paddingLeft,
+ const uint32_t paddingRight, const uint32_t paddingTop,
+ const uint32_t paddingBottom, const uint32_t strideWidth,
+ const uint32_t strideHeight, const uint32_t kernelWidth,
+ const uint32_t kernelHeight, const ir::Activation activation, uint8_t *outputData,
+ const TensorDescriptor outputDescr);
+
+ void run();
+ void runSync()
+ {
+ // this abstract method is used just for profiling and called for
+ // backend::acl_common::AclFunction
+ run();
+ }
+
+private:
+ DataPtr _inputData;
+ DataPtr _outputData;
+
+ TensorDescriptor _inputDescr;
+ TensorDescriptor _outputDescr;
+
+ // Explicit per-side padding (right/bottom currently unused by the .cc — see note there).
+ uint32_t _paddingLeft;
+ uint32_t _paddingTop;
+ uint32_t _paddingRight;
+ uint32_t _paddingBottom;
+
+ uint32_t _strideWidth;
+ uint32_t _strideHeight;
+ uint32_t _kernelWidth;
+ uint32_t _kernelHeight;
+
+ ir::Activation _activation;
+
+ OperandType _inputType;
+};
+
+} // namespace kernel
+} // namespace cpu
+} // namespace backend
+} // namespace neurun
+
+#endif // __NEURUN_BACKEND_CPU_KERNEL_AVGPOOLLAYER_H__
diff --git a/runtime/neurun/backend/cpu/kernel/ConcatLayer.cc b/runtime/neurun/backend/cpu/kernel/ConcatLayer.cc
new file mode 100644
index 000000000..471c9b3bb
--- /dev/null
+++ b/runtime/neurun/backend/cpu/kernel/ConcatLayer.cc
@@ -0,0 +1,137 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (C) 2017 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "ConcatLayer.h"
+
+#include "OperationUtils.h"
+
+#include <cker/operation/Concatenation.h>
+
+namespace neurun
+{
+namespace backend
+{
+namespace cpu
+{
+namespace kernel
+{
+
+ConcatLayer::ConcatLayer()
+ : _inputDataPtrs(), _outputData(), _axis(0), _inputDescriptors(), _outputDescr(),
+ _inputType(OperandType::FLOAT32)
+{
+ // DO NOTHING
+}
+
+// Float32 concatenation along _axis across all configured inputs.
+void ConcatLayer::concatenationFloat32()
+{
+ uint32_t num_inputs = _inputDescriptors.size();
+
+ nnfw::cker::ConcatenationParams op_params;
+ op_params.axis = _axis;
+ op_params.inputs_count = num_inputs;
+
+ // inputDimsPtr holds pointers into inputDims; this is safe ONLY because
+ // reserve(num_inputs) precedes the push_backs (no reallocation). Keep the
+ // reserve if this is ever touched.
+ std::vector<nnfw::cker::Shape *> inputDimsPtr;
+ std::vector<nnfw::cker::Shape> inputDims;
+ inputDimsPtr.reserve(num_inputs);
+ inputDims.reserve(num_inputs);
+
+ for (uint32_t i = 0; i < num_inputs; i++)
+ {
+ inputDims.push_back(convertTensorDescriptorToCkerShape(_inputDescriptors[i]));
+ inputDimsPtr.push_back(&inputDims[i]);
+ }
+
+ std::vector<const float *> inputFloatPtrs;
+
+ for (auto ptr : _inputDataPtrs)
+ {
+ inputFloatPtrs.emplace_back(reinterpret_cast<const float *>(ptr));
+ }
+
+ nnfw::cker::Concatenation<float>(op_params, inputDimsPtr.data(), inputFloatPtrs.data(),
+ convertTensorDescriptorToCkerShape(_outputDescr), _outputData.f);
+}
+// Quantized concatenation; passes per-input scale/zero-point so cker can
+// requantize each input into the output's quantization.
+void ConcatLayer::concatenationQuant8()
+{
+ uint32_t num_inputs = _inputDescriptors.size();
+
+ std::vector<int32_t> input_zeropoints(num_inputs);
+ std::vector<float> input_scales(num_inputs);
+ for (uint32_t i = 0; i < num_inputs; i++)
+ {
+ input_zeropoints[i] = _inputDescriptors[i].offset;
+ input_scales[i] = _inputDescriptors[i].scale;
+ }
+
+ nnfw::cker::ConcatenationParams op_params;
+ op_params.axis = _axis;
+ op_params.inputs_count = num_inputs;
+ op_params.input_zeropoint = input_zeropoints.data();
+ op_params.input_scale = input_scales.data();
+ op_params.output_zeropoint = _outputDescr.offset;
+ op_params.output_scale = _outputDescr.scale;
+
+ // Same pointer-into-reserved-vector pattern as the float path.
+ std::vector<nnfw::cker::Shape *> inputDimsPtr;
+ std::vector<nnfw::cker::Shape> inputDims;
+ inputDimsPtr.reserve(num_inputs);
+ inputDims.reserve(num_inputs);
+ for (uint32_t i = 0; i < num_inputs; i++)
+ {
+ inputDims.push_back(convertTensorDescriptorToCkerShape(_inputDescriptors[i]));
+ inputDimsPtr.push_back(&inputDims[i]);
+ }
+
+ nnfw::cker::Concatenation<uint8_t>(op_params, inputDimsPtr.data(), _inputDataPtrs.data(),
+ convertTensorDescriptorToCkerShape(_outputDescr),
+ _outputData.u8);
+}
+
+// Records inputs, axis and output.
+// NOTE(review): _inputType is overwritten every loop iteration, so it ends up
+// as the LAST input's type — fine only if all inputs share one type; an
+// assertion would make that assumption explicit.
+void ConcatLayer::configure(const std::vector<const uint8_t *> &inputDataPtrs,
+ const std::vector<TensorDescriptor> &inputDescriptors, int32_t axis,
+ uint8_t *outputData, const TensorDescriptor outputDescr)
+{
+ _inputDataPtrs = inputDataPtrs;
+
+ for (auto inputDescr : inputDescriptors)
+ {
+ _inputDescriptors.emplace_back(inputDescr);
+ _inputType = inputDescr.type;
+ }
+
+ _axis = axis;
+
+ _outputData.u8 = outputData;
+ _outputDescr = outputDescr;
+}
+
+// Dispatches on dtype. NOTE(review): unsupported types silently no-op.
+void ConcatLayer::run()
+{
+ if (_inputType == OperandType::FLOAT32)
+ {
+ concatenationFloat32();
+ }
+ else if (_inputType == OperandType::QUANT8_ASYMM)
+ {
+ concatenationQuant8();
+ }
+}
+
+} // namespace kernel
+} // namespace cpu
+} // namespace backend
+} // namespace neurun
diff --git a/runtime/neurun/backend/cpu/kernel/ConcatLayer.h b/runtime/neurun/backend/cpu/kernel/ConcatLayer.h
new file mode 100644
index 000000000..048aa4208
--- /dev/null
+++ b/runtime/neurun/backend/cpu/kernel/ConcatLayer.h
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (C) 2017 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NEURUN_BACKEND_CPU_KERNEL_CONCATLAYER_H__
+#define __NEURUN_BACKEND_CPU_KERNEL_CONCATLAYER_H__
+
+#include <exec/IFunction.h>
+
+#include "OperationUtils.h"
+
+namespace neurun
+{
+namespace backend
+{
+namespace cpu
+{
+namespace kernel
+{
+
+// CPU kernel concatenating N input tensors along a configured axis.
+// Non-owning over all buffers; configure() must precede run().
+class ConcatLayer : public ::neurun::exec::IFunction
+{
+public:
+ ConcatLayer();
+
+public:
+ void concatenationFloat32();
+
+ void concatenationQuant8();
+
+ // Records input pointers/descriptors, the concat axis, and the output.
+ void configure(const std::vector<const uint8_t *> &inputDataPtrs,
+ const std::vector<TensorDescriptor> &inputDescriptors, int32_t axis,
+ uint8_t *outputData, const TensorDescriptor outputDescr);
+
+ void run();
+ void runSync()
+ {
+ // this abstract method is used just for profiling and called for
+ // backend::acl_common::AclFunction
+ run();
+ }
+
+private:
+ std::vector<const uint8_t *> _inputDataPtrs;
+ DataPtr _outputData;
+
+ int32_t _axis;
+
+ std::vector<TensorDescriptor> _inputDescriptors;
+ TensorDescriptor _outputDescr;
+
+ // Assumed common type of all inputs (set from the last descriptor in configure()).
+ OperandType _inputType;
+};
+
+} // namespace kernel
+} // namespace cpu
+} // namespace backend
+} // namespace neurun
+
+#endif // __NEURUN_BACKEND_CPU_KERNEL_CONCATLAYER_H__
diff --git a/runtime/neurun/backend/cpu/kernel/ConvolutionLayer.cc b/runtime/neurun/backend/cpu/kernel/ConvolutionLayer.cc
new file mode 100644
index 000000000..2fdb0baf7
--- /dev/null
+++ b/runtime/neurun/backend/cpu/kernel/ConvolutionLayer.cc
@@ -0,0 +1,140 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "ConvolutionLayer.h"
+
+#include <cker/operation/Conv.h>
+
+#include "OperationUtils.h"
+
+namespace neurun
+{
+namespace backend
+{
+namespace cpu
+{
+namespace kernel
+{
+ConvolutionLayer::ConvolutionLayer()
+ : _inputData(), _kernelData(), _outputData(), _biasData(), _inputDescr(), _kernelDescr(),
+ _outputDescr(), _biasDescr(), _paddingLeft(0), _paddingTop(0), _paddingRight(0),
+ _paddingBottom(0), _strideWidth(0), _strideHeight(0), _activation(ir::Activation::NONE),
+ _inputType(OperandType::FLOAT32)
+{
+ // DO NOTHING
+}
+
+// Float32 convolution (dilation fixed at 1) with fused-activation clamping.
+void ConvolutionLayer::convFloat32()
+{
+ float output_activation_min, output_activation_max;
+ CalculateActivationRangeFloat(_activation, &output_activation_min, &output_activation_max);
+
+ nnfw::cker::ConvParams op_params;
+ op_params.padding_values.width = _paddingLeft;
+ op_params.padding_values.height = _paddingTop;
+ op_params.stride_width = _strideWidth;
+ op_params.stride_height = _strideHeight;
+ op_params.dilation_width_factor = 1;
+ op_params.dilation_height_factor = 1;
+ op_params.float_activation_min = output_activation_min;
+ op_params.float_activation_max = output_activation_max;
+
+ nnfw::cker::Conv(op_params, convertTensorDescriptorToCkerShape(_inputDescr), _inputData.f,
+ convertTensorDescriptorToCkerShape(_kernelDescr), _kernelData.f,
+ convertTensorDescriptorToCkerShape(_biasDescr), _biasData.f,
+ convertTensorDescriptorToCkerShape(_outputDescr), _outputData.f);
+}
+
+// Quantized (uint8) convolution: derives the fixed-point output multiplier /
+// shift from the input, kernel, bias and output quantization parameters.
+// Note the negated input/weights offsets — cker expects offsets to ADD.
+void ConvolutionLayer::convQuant8()
+{
+ int32_t output_activation_min = 0;
+ int32_t output_activation_max = 0;
+ CalculateActivationRangeUint8(_activation, _outputDescr, &output_activation_min,
+ &output_activation_max);
+
+ float real_multiplier = 0.0;
+ int32_t output_multiplier = 0;
+ int32_t output_shift = 0;
+ GetQuantizedConvolutionMultiplier(_inputDescr, _kernelDescr, _biasDescr, _outputDescr,
+ &real_multiplier);
+ QuantizeMultiplier(real_multiplier, &output_multiplier, &output_shift);
+
+ nnfw::cker::ConvParams op_params;
+ op_params.stride_width = _strideWidth;
+ op_params.stride_height = _strideHeight;
+ op_params.dilation_width_factor = 1;
+ op_params.dilation_height_factor = 1;
+ op_params.padding_values.width = _paddingLeft;
+ op_params.padding_values.height = _paddingTop;
+ op_params.input_offset = -_inputDescr.offset;
+ op_params.weights_offset = -_kernelDescr.offset;
+ op_params.output_offset = _outputDescr.offset;
+ op_params.output_multiplier = output_multiplier;
+ op_params.output_shift = output_shift;
+ op_params.quantized_activation_min = output_activation_min;
+ op_params.quantized_activation_max = output_activation_max;
+
+ nnfw::cker::Conv(op_params, convertTensorDescriptorToCkerShape(_inputDescr), _inputData.u8,
+ convertTensorDescriptorToCkerShape(_kernelDescr), _kernelData.u8,
+ convertTensorDescriptorToCkerShape(_biasDescr), _biasData.i32,
+ convertTensorDescriptorToCkerShape(_outputDescr), _outputData.u8);
+}
+
+// Records all operands and hyper-parameters for run(). Right/bottom padding
+// is stored but not forwarded to cker (only left/top are) — see AvgPool note.
+void ConvolutionLayer::configure(uint8_t *inputData, const TensorDescriptor inputDescr,
+ uint8_t *kernelData, const TensorDescriptor kernelDescr,
+ uint8_t *biasData, const TensorDescriptor biasDescr,
+ const uint32_t paddingLeft, const uint32_t paddingRight,
+ const uint32_t paddingTop, const uint32_t paddingBottom,
+ const uint32_t strideWidth, const uint32_t strideHeight,
+ const ir::Activation activation, uint8_t *outputData,
+ const TensorDescriptor outputDescr)
+{
+ _inputData.u8 = inputData;
+ _inputDescr = inputDescr;
+ _inputType = inputDescr.type;
+ _kernelData.u8 = kernelData;
+ _kernelDescr = kernelDescr;
+ _biasData.u8 = biasData;
+ _biasDescr = biasDescr;
+ _paddingLeft = paddingLeft;
+ _paddingRight = paddingRight;
+ _paddingTop = paddingTop;
+ _paddingBottom = paddingBottom;
+ _strideWidth = strideWidth;
+ _strideHeight = strideHeight;
+ _activation = activation;
+ _outputData.u8 = outputData;
+ _outputDescr = outputDescr;
+}
+
+// Dispatches on dtype. NOTE(review): unsupported types silently no-op.
+void ConvolutionLayer::run()
+{
+ if (_inputType == OperandType::FLOAT32)
+ {
+ convFloat32();
+ }
+ else if (_inputType == OperandType::QUANT8_ASYMM)
+ {
+ convQuant8();
+ }
+}
+
+// NOTE(review): leftover #undef — ANDROID_NN_CONV_PARAMETERS is never defined
+// in this file (copied from the AOSP NN runtime); safe but should be removed.
+#undef ANDROID_NN_CONV_PARAMETERS
+
+} // namespace kernel
+} // namespace cpu
+} // namespace backend
+} // namespace neurun
diff --git a/runtime/neurun/backend/cpu/kernel/ConvolutionLayer.h b/runtime/neurun/backend/cpu/kernel/ConvolutionLayer.h
new file mode 100644
index 000000000..16669f316
--- /dev/null
+++ b/runtime/neurun/backend/cpu/kernel/ConvolutionLayer.h
@@ -0,0 +1,88 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NEURUN_BACKEND_CPU_KERNEL_CONVOLUTIONLAYER_H__
+#define __NEURUN_BACKEND_CPU_KERNEL_CONVOLUTIONLAYER_H__
+
+#include <exec/IFunction.h>
+
+#include "OperationUtils.h"
+
+namespace neurun
+{
+namespace backend
+{
+namespace cpu
+{
+namespace kernel
+{
+
+// CPU kernel for 2D convolution (float32 and asymmetric-uint8 paths).
+// Non-owning over input/kernel/bias/output buffers; configure() then run().
+class ConvolutionLayer : public ::neurun::exec::IFunction
+{
+public:
+ ConvolutionLayer();
+
+public:
+ void convFloat32();
+
+ void convQuant8();
+
+ // Records operands, per-side padding, strides and fused activation.
+ void configure(uint8_t *inputData, const TensorDescriptor inputDescr, uint8_t *kernelData,
+ const TensorDescriptor kernelDescr, uint8_t *biasData,
+ const TensorDescriptor biasDescr, const uint32_t paddingLeft,
+ const uint32_t paddingRight, const uint32_t paddingTop,
+ const uint32_t paddingBottom, const uint32_t strideW, const uint32_t strideH,
+ const ir::Activation activation, uint8_t *outputData,
+ const TensorDescriptor outputDescr);
+
+ void run();
+ void runSync()
+ {
+ // this abstract method is used just for profiling and called for
+ // backend::acl_common::AclFunction
+ run();
+ }
+
+private:
+ DataPtr _inputData;
+ DataPtr _kernelData;
+ DataPtr _outputData;
+ DataPtr _biasData;
+
+ TensorDescriptor _inputDescr;
+ TensorDescriptor _kernelDescr;
+ TensorDescriptor _outputDescr;
+ TensorDescriptor _biasDescr;
+
+ uint32_t _paddingLeft;
+ uint32_t _paddingTop;
+ uint32_t _paddingRight;
+ uint32_t _paddingBottom;
+
+ uint32_t _strideWidth;
+ uint32_t _strideHeight;
+
+ ir::Activation _activation;
+
+ OperandType _inputType;
+};
+
+} // namespace kernel
+} // namespace cpu
+} // namespace backend
+} // namespace neurun
+
+#endif // __NEURUN_BACKEND_CPU_KERNEL_CONVOLUTIONLAYER_H__
diff --git a/runtime/neurun/backend/cpu/kernel/DepthwiseConvolutionLayer.cc b/runtime/neurun/backend/cpu/kernel/DepthwiseConvolutionLayer.cc
new file mode 100644
index 000000000..e33e3465e
--- /dev/null
+++ b/runtime/neurun/backend/cpu/kernel/DepthwiseConvolutionLayer.cc
@@ -0,0 +1,143 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "DepthwiseConvolutionLayer.h"
+
+#include <cker/operation/DepthwiseConv.h>
+
+namespace neurun
+{
+namespace backend
+{
+namespace cpu
+{
+namespace kernel
+{
+
+DepthwiseConvolutionLayer::DepthwiseConvolutionLayer()
+ : _inputData(), _kernelData(), _outputData(), _biasData(), _inputDescr(), _kernelDescr(),
+ _outputDescr(), _biasDescr(), _paddingLeft(0), _paddingTop(0), _paddingRight(0),
+ _paddingBottom(0), _strideWidth(0), _strideHeight(0), _multiplier(0),
+ _activation(ir::Activation::NONE), _inputType(OperandType::FLOAT32)
+{
+ // DO NOTHING
+}
+
+// Float32 depthwise convolution (dilation fixed at 1); _multiplier is the
+// depth multiplier (output channels per input channel).
+void DepthwiseConvolutionLayer::convFloat32()
+{
+ float output_activation_min, output_activation_max;
+ CalculateActivationRangeFloat(_activation, &output_activation_min, &output_activation_max);
+
+ nnfw::cker::DepthwiseConvParams op_params;
+ op_params.stride_width = _strideWidth;
+ op_params.stride_height = _strideHeight;
+ op_params.dilation_width_factor = 1;
+ op_params.dilation_height_factor = 1;
+ op_params.padding_values.width = _paddingLeft;
+ op_params.padding_values.height = _paddingTop;
+ op_params.depth_multiplier = _multiplier;
+ op_params.float_activation_min = output_activation_min;
+ op_params.float_activation_max = output_activation_max;
+
+ nnfw::cker::DepthwiseConv(op_params, convertTensorDescriptorToCkerShape(_inputDescr),
+ _inputData.f, convertTensorDescriptorToCkerShape(_kernelDescr),
+ _kernelData.f, convertTensorDescriptorToCkerShape(_biasDescr),
+ _biasData.f, convertTensorDescriptorToCkerShape(_outputDescr),
+ _outputData.f);
+}
+
+// Quantized (uint8) depthwise convolution; same requantization scheme as
+// ConvolutionLayer::convQuant8 (negated input/weight offsets, fixed-point
+// output multiplier + shift).
+void DepthwiseConvolutionLayer::convQuant8()
+{
+ int32_t output_activation_min = 0;
+ int32_t output_activation_max = 0;
+ CalculateActivationRangeUint8(_activation, _outputDescr, &output_activation_min,
+ &output_activation_max);
+
+ float real_multiplier = 0.0;
+ int32_t output_multiplier = 0;
+ int32_t output_shift = 0;
+ GetQuantizedConvolutionMultiplier(_inputDescr, _kernelDescr, _biasDescr, _outputDescr,
+ &real_multiplier);
+ QuantizeMultiplier(real_multiplier, &output_multiplier, &output_shift);
+
+ nnfw::cker::DepthwiseConvParams op_params;
+ op_params.stride_width = _strideWidth;
+ op_params.stride_height = _strideHeight;
+ op_params.dilation_width_factor = 1;
+ op_params.dilation_height_factor = 1;
+ op_params.padding_values.width = _paddingLeft;
+ op_params.padding_values.height = _paddingTop;
+ op_params.depth_multiplier = _multiplier;
+ op_params.input_offset = -_inputDescr.offset;
+ op_params.weights_offset = -_kernelDescr.offset;
+ op_params.output_offset = _outputDescr.offset;
+ op_params.output_multiplier = output_multiplier;
+ op_params.output_shift = output_shift;
+ op_params.quantized_activation_min = output_activation_min;
+ op_params.quantized_activation_max = output_activation_max;
+
+ nnfw::cker::DepthwiseConv(op_params, convertTensorDescriptorToCkerShape(_inputDescr),
+ _inputData.u8, convertTensorDescriptorToCkerShape(_kernelDescr),
+ _kernelData.u8, convertTensorDescriptorToCkerShape(_biasDescr),
+ _biasData.i32, convertTensorDescriptorToCkerShape(_outputDescr),
+ _outputData.u8);
+}
+
+// Records operands and hyper-parameters. Right/bottom padding is stored but
+// not forwarded to cker (only left/top are) — same pattern as ConvolutionLayer.
+void DepthwiseConvolutionLayer::configure(uint8_t *inputData, const TensorDescriptor inputDescr,
+ uint8_t *kernelData, const TensorDescriptor kernelDescr,
+ uint8_t *biasData, const TensorDescriptor biasDescr,
+ const uint32_t paddingLeft, const uint32_t paddingRight,
+ const uint32_t paddingTop, const uint32_t paddingBottom,
+ const uint32_t strideWidth, const uint32_t strideHeight,
+ const uint32_t multiplier,
+ const ir::Activation activation, uint8_t *outputData,
+ const TensorDescriptor outputDescr)
+{
+ _inputData.u8 = inputData;
+ _inputDescr = inputDescr;
+ _inputType = inputDescr.type;
+ _kernelData.u8 = kernelData;
+ _kernelDescr = kernelDescr;
+ _biasData.u8 = biasData;
+ _biasDescr = biasDescr;
+ _paddingLeft = paddingLeft;
+ _paddingRight = paddingRight;
+ _paddingTop = paddingTop;
+ _paddingBottom = paddingBottom;
+ _strideWidth = strideWidth;
+ _strideHeight = strideHeight;
+ _multiplier = multiplier;
+ _activation = activation;
+ _outputData.u8 = outputData;
+ _outputDescr = outputDescr;
+}
+
+// Dispatches on dtype. NOTE(review): unsupported types silently no-op.
+void DepthwiseConvolutionLayer::run()
+{
+ if (_inputType == OperandType::FLOAT32)
+ {
+ convFloat32();
+ }
+ else if (_inputType == OperandType::QUANT8_ASYMM)
+ {
+ convQuant8();
+ }
+}
+
+} // namespace kernel
+} // namespace cpu
+} // namespace backend
+} // namespace neurun
diff --git a/runtime/neurun/backend/cpu/kernel/DepthwiseConvolutionLayer.h b/runtime/neurun/backend/cpu/kernel/DepthwiseConvolutionLayer.h
new file mode 100644
index 000000000..575cc0ab1
--- /dev/null
+++ b/runtime/neurun/backend/cpu/kernel/DepthwiseConvolutionLayer.h
@@ -0,0 +1,90 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NEURUN_BACKEND_CPU_KERNEL_DEPTHWISECONVOLUTIONLAYER_H__
+#define __NEURUN_BACKEND_CPU_KERNEL_DEPTHWISECONVOLUTIONLAYER_H__
+
+#include <exec/IFunction.h>
+
+#include "OperationUtils.h"
+
+namespace neurun
+{
+namespace backend
+{
+namespace cpu
+{
+namespace kernel
+{
+
+class DepthwiseConvolutionLayer : public ::neurun::exec::IFunction
+{
+public:
+ DepthwiseConvolutionLayer();
+
+public:
+ void convFloat32();
+
+ void convQuant8();
+
+ void configure(uint8_t *inputData, const TensorDescriptor inputDescr, uint8_t *kernelData,
+ const TensorDescriptor kernelDescr, uint8_t *biasData,
+ const TensorDescriptor biasDescr, const uint32_t paddingLeft,
+ const uint32_t paddingRight, const uint32_t paddingTop,
+ const uint32_t paddingBottom, const uint32_t strideW, const uint32_t strideH,
+ const uint32_t multiplier, const ir::Activation activation, uint8_t *outputData,
+ const TensorDescriptor outputDescr);
+
+ void run();
+ void runSync()
+ {
+ // this abstract method is used just for profiling and called for
+ // backend::acl_common::AclFunction
+ run();
+ }
+
+private:
+ DataPtr _inputData;
+ DataPtr _kernelData;
+ DataPtr _outputData;
+ DataPtr _biasData;
+
+ TensorDescriptor _inputDescr;
+ TensorDescriptor _kernelDescr;
+ TensorDescriptor _outputDescr;
+ TensorDescriptor _biasDescr;
+
+ uint32_t _paddingLeft;
+ uint32_t _paddingTop;
+ uint32_t _paddingRight;
+ uint32_t _paddingBottom;
+
+ uint32_t _strideWidth;
+ uint32_t _strideHeight;
+
+ uint32_t _multiplier;
+
+ ir::Activation _activation;
+
+ OperandType _inputType;
+};
+
+} // namespace kernel
+} // namespace cpu
+} // namespace backend
+} // namespace neurun
+
+#endif // __NEURUN_BACKEND_CPU_KERNEL_DEPTHWISECONVOLUTIONLAYER_H__
diff --git a/runtime/neurun/backend/cpu/kernel/FullyConnectedLayer.cc b/runtime/neurun/backend/cpu/kernel/FullyConnectedLayer.cc
new file mode 100644
index 000000000..055f71590
--- /dev/null
+++ b/runtime/neurun/backend/cpu/kernel/FullyConnectedLayer.cc
@@ -0,0 +1,119 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (C) 2017 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "FullyConnectedLayer.h"
+
+#include <cker/operation/FullyConnected.h>
+
+#include "OperationUtils.h"
+
+namespace neurun
+{
+namespace backend
+{
+namespace cpu
+{
+namespace kernel
+{
+
+FullyConnectedLayer::FullyConnectedLayer()
+ : _inputData(), _weightsData(), _biasData(), _outputData(), _inputDescr(), _weightsDescr(),
+ _biasDescr(), _outputDescr(), _activation(ir::Activation::NONE),
+ _inputType(OperandType::FLOAT32)
+{
+ // DO NOTHING
+}
+
+void FullyConnectedLayer::fullyConnectedFloat32()
+{
+ float output_activation_min, output_activation_max;
+ CalculateActivationRangeFloat(_activation, &output_activation_min, &output_activation_max);
+
+ nnfw::cker::FullyConnectedParams op_params;
+ op_params.float_activation_min = output_activation_min;
+ op_params.float_activation_max = output_activation_max;
+
+ nnfw::cker::FullyConnected(op_params, convertToExtendedCkerShape(_inputDescr), _inputData.f,
+ convertToExtendedCkerShape(_weightsDescr), _weightsData.f,
+ convertToExtendedCkerShape(_biasDescr), _biasData.f,
+ convertToExtendedCkerShape(_outputDescr), _outputData.f);
+}
+
+// executionMutex is used to protect concurrent access of non-threadsafe resources
+// like gemmlowp::GemmContext.
+void FullyConnectedLayer::fullyConnectedQuant8()
+{
+ float real_multiplier = 0.0;
+ int32_t output_multiplier = 0;
+ int32_t output_shift = 0;
+ int32_t output_activation_min = 0;
+ int32_t output_activation_max = 0;
+ GetQuantizedConvolutionMultiplier(_inputDescr, _weightsDescr, _biasDescr, _outputDescr,
+ &real_multiplier);
+ QuantizeMultiplier(real_multiplier, &output_multiplier, &output_shift);
+ CalculateActivationRangeUint8(_activation, _outputDescr, &output_activation_min,
+ &output_activation_max);
+
+ nnfw::cker::FullyConnectedParams op_params;
+ op_params.input_offset = -_inputDescr.offset;
+ op_params.weights_offset = -_weightsDescr.offset;
+ op_params.output_offset = _outputDescr.offset;
+ op_params.output_multiplier = output_multiplier;
+ op_params.output_shift = output_shift;
+ op_params.quantized_activation_min = output_activation_min;
+ op_params.quantized_activation_max = output_activation_max;
+
+ nnfw::cker::FullyConnected(op_params, convertToExtendedCkerShape(_inputDescr), _inputData.u8,
+ convertToExtendedCkerShape(_weightsDescr), _weightsData.u8,
+ convertToExtendedCkerShape(_biasDescr), _biasData.i32,
+ convertToExtendedCkerShape(_outputDescr), _outputData.u8);
+}
+
+void FullyConnectedLayer::configure(uint8_t *inputData, const TensorDescriptor inputDescr,
+ uint8_t *weightsData, const TensorDescriptor weightsDescr,
+ uint8_t *biasData, const TensorDescriptor biasDescr,
+ ir::Activation activation, uint8_t *outputData,
+ const TensorDescriptor outputDescr)
+{
+ _inputData.u8 = inputData;
+ _inputDescr = inputDescr;
+ _inputType = inputDescr.type;
+ _weightsData.u8 = weightsData;
+ _weightsDescr = weightsDescr;
+ _biasData.u8 = biasData;
+ _biasDescr = biasDescr;
+ _activation = activation;
+ _outputData.u8 = outputData;
+ _outputDescr = outputDescr;
+}
+
+void FullyConnectedLayer::run()
+{
+ if (_inputType == OperandType::FLOAT32)
+ {
+ fullyConnectedFloat32();
+ }
+ else if (_inputType == OperandType::QUANT8_ASYMM)
+ {
+ fullyConnectedQuant8();
+ }
+}
+
+} // namespace kernel
+} // namespace cpu
+} // namespace backend
+} // namespace neurun
diff --git a/runtime/neurun/backend/cpu/kernel/FullyConnectedLayer.h b/runtime/neurun/backend/cpu/kernel/FullyConnectedLayer.h
new file mode 100644
index 000000000..9fdc393a4
--- /dev/null
+++ b/runtime/neurun/backend/cpu/kernel/FullyConnectedLayer.h
@@ -0,0 +1,77 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NEURUN_BACKEND_CPU_KERNEL_FULLYCONNECTEDLAYER_H__
+#define __NEURUN_BACKEND_CPU_KERNEL_FULLYCONNECTEDLAYER_H__
+
+#include <exec/IFunction.h>
+
+#include "OperationUtils.h"
+
+namespace neurun
+{
+namespace backend
+{
+namespace cpu
+{
+namespace kernel
+{
+
+class FullyConnectedLayer : public ::neurun::exec::IFunction
+{
+public:
+ FullyConnectedLayer();
+
+public:
+ void fullyConnectedFloat32();
+
+ void fullyConnectedQuant8();
+
+ void configure(uint8_t *inputData, const TensorDescriptor inputDescr, uint8_t *weightsData,
+ const TensorDescriptor weightsDescr, uint8_t *biasData,
+ const TensorDescriptor biasDescr, ir::Activation activation, uint8_t *outputData,
+ const TensorDescriptor outputDescr);
+
+ void run();
+ void runSync()
+ {
+ // this abstract method is used just for profiling and called for
+ // backend::acl_common::AclFunction
+ run();
+ }
+
+private:
+ DataPtr _inputData;
+ DataPtr _weightsData;
+ DataPtr _biasData;
+ DataPtr _outputData;
+
+ TensorDescriptor _inputDescr;
+ TensorDescriptor _weightsDescr;
+ TensorDescriptor _biasDescr;
+ TensorDescriptor _outputDescr;
+
+ ir::Activation _activation;
+
+ OperandType _inputType;
+};
+
+} // namespace kernel
+} // namespace cpu
+} // namespace backend
+} // namespace neurun
+
+#endif // __NEURUN_BACKEND_CPU_KERNEL_FULLYCONNECTEDLAYER_H__
diff --git a/runtime/neurun/backend/cpu/kernel/GatherLayer.cc b/runtime/neurun/backend/cpu/kernel/GatherLayer.cc
new file mode 100644
index 000000000..b29acba79
--- /dev/null
+++ b/runtime/neurun/backend/cpu/kernel/GatherLayer.cc
@@ -0,0 +1,79 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "GatherLayer.h"
+
+#include <cker/operation/Gather.h>
+
+#include "OperationUtils.h"
+
+namespace neurun
+{
+namespace backend
+{
+namespace cpu
+{
+namespace kernel
+{
+
+void GatherLayer::configure(uint8_t *inputData, const TensorDescriptor &inputDescr,
+ uint8_t *indicesData, const TensorDescriptor &indicesDescr,
+ uint8_t *outputData, const TensorDescriptor &outputDescr, int32_t axis)
+{
+ _inputData.u8 = inputData;
+ _inputDescr = inputDescr;
+ _indicesData.u8 = indicesData;
+ _indicesDescr = indicesDescr;
+ _axis = axis;
+ _inputType = inputDescr.type;
+ _outputData.u8 = outputData;
+ _outputDescr = outputDescr;
+}
+
+void GatherLayer::run()
+{
+ nnfw::cker::GatherParams op_params;
+ op_params.axis = _axis;
+
+ switch (_inputType)
+ {
+ case OperandType::FLOAT32:
+ nnfw::cker::Gather<float>(op_params, convertTensorDescriptorToCkerShape(_inputDescr),
+ _inputData.f, convertTensorDescriptorToCkerShape(_indicesDescr),
+ _indicesData.i32, convertTensorDescriptorToCkerShape(_outputDescr),
+ _outputData.f);
+ break;
+ case OperandType::QUANT8_ASYMM:
+ nnfw::cker::Gather<uint8_t>(op_params, convertTensorDescriptorToCkerShape(_inputDescr),
+ _inputData.u8, convertTensorDescriptorToCkerShape(_indicesDescr),
+ _indicesData.i32,
+ convertTensorDescriptorToCkerShape(_outputDescr), _outputData.u8);
+ break;
+ case OperandType::INT32:
+ nnfw::cker::Gather<int32_t>(
+ op_params, convertTensorDescriptorToCkerShape(_inputDescr), _inputData.i32,
+ convertTensorDescriptorToCkerShape(_indicesDescr), _indicesData.i32,
+ convertTensorDescriptorToCkerShape(_outputDescr), _outputData.i32);
+ break;
+ default:
+ throw std::runtime_error("Gather NYI for this operand type!");
+ }
+}
+
+} // namespace kernel
+} // namespace cpu
+} // namespace backend
+} // namespace neurun
diff --git a/runtime/neurun/backend/cpu/kernel/GatherLayer.h b/runtime/neurun/backend/cpu/kernel/GatherLayer.h
new file mode 100644
index 000000000..af4f8b8f6
--- /dev/null
+++ b/runtime/neurun/backend/cpu/kernel/GatherLayer.h
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NEURUN_BACKEND_CPU_KERNEL_GATHERLAYER_H__
+#define __NEURUN_BACKEND_CPU_KERNEL_GATHERLAYER_H__
+
+#include <exec/IFunction.h>
+
+#include "OperationUtils.h"
+
+namespace neurun
+{
+namespace backend
+{
+namespace cpu
+{
+namespace kernel
+{
+
+class GatherLayer : public ::neurun::exec::IFunction
+{
+public:
+ GatherLayer()
+ : _inputData{nullptr}, _indicesData{nullptr}, _outputData{nullptr}, _axis{-1},
+ _inputType{OperandType::FLOAT32}
+ {
+ // DO NOTHING
+ }
+
+public:
+ void configure(uint8_t *inputData, const TensorDescriptor &inputDescr, uint8_t *indicesData,
+ const TensorDescriptor &indicesDescr, uint8_t *outputData,
+ const TensorDescriptor &outputDescr, int32_t axis);
+
+ void run();
+ void runSync()
+ {
+ // this abstract method is used just for profiling and called for
+ // backend::acl_common::AclFunction
+ run();
+ }
+
+private:
+ DataPtr _inputData;
+ DataPtr _indicesData;
+ DataPtr _outputData;
+
+ TensorDescriptor _inputDescr;
+ TensorDescriptor _indicesDescr;
+ TensorDescriptor _outputDescr;
+
+ int32_t _axis;
+ OperandType _inputType;
+};
+
+} // namespace kernel
+} // namespace cpu
+} // namespace backend
+} // namespace neurun
+
+#endif // __NEURUN_BACKEND_CPU_KERNEL_GATHERLAYER_H__
diff --git a/runtime/neurun/backend/cpu/kernel/LogisticLayer.cc b/runtime/neurun/backend/cpu/kernel/LogisticLayer.cc
new file mode 100644
index 000000000..d9916964e
--- /dev/null
+++ b/runtime/neurun/backend/cpu/kernel/LogisticLayer.cc
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "LogisticLayer.h"
+
+#include <cker/operation/Logistic.h>
+
+#include "OperationUtils.h"
+
+namespace neurun
+{
+namespace backend
+{
+namespace cpu
+{
+namespace kernel
+{
+
+LogisticLayer::LogisticLayer()
+ : _inputData(), _outputData(), _inputDescr(), _outputDescr(), _inputType(OperandType::FLOAT32)
+{
+ // DO NOTHING
+}
+
+void LogisticLayer::logisticFloat32()
+{
+ nnfw::cker::Logistic(convertTensorDescriptorToCkerShape(_inputDescr), _inputData.f,
+ convertTensorDescriptorToCkerShape(_outputDescr), _outputData.f);
+}
+
+void LogisticLayer::logisticQuant8()
+{
+ // cker quant8 logistic is not implemented yet
+ throw std::runtime_error{"NYI"};
+}
+
+void LogisticLayer::configure(uint8_t *inputData, const TensorDescriptor &inputDescr,
+ uint8_t *outputData, const TensorDescriptor &outputDescr)
+{
+ _inputData.u8 = inputData;
+ _inputDescr = inputDescr;
+ _inputType = inputDescr.type;
+ _outputData.u8 = outputData;
+ _outputDescr = outputDescr;
+}
+
+void LogisticLayer::run()
+{
+ if (_inputType == OperandType::FLOAT32)
+ {
+ logisticFloat32();
+ }
+ else if (_inputType == OperandType::QUANT8_ASYMM)
+ {
+ logisticQuant8();
+ }
+}
+
+} // namespace kernel
+} // namespace cpu
+} // namespace backend
+} // namespace neurun
diff --git a/runtime/neurun/backend/cpu/kernel/LogisticLayer.h b/runtime/neurun/backend/cpu/kernel/LogisticLayer.h
new file mode 100644
index 000000000..33fcd6fed
--- /dev/null
+++ b/runtime/neurun/backend/cpu/kernel/LogisticLayer.h
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NEURUN_BACKEND_CPU_KERNEL_LOGISTICLAYER_H__
+#define __NEURUN_BACKEND_CPU_KERNEL_LOGISTICLAYER_H__
+
+#include <exec/IFunction.h>
+
+#include "OperationUtils.h"
+
+namespace neurun
+{
+namespace backend
+{
+namespace cpu
+{
+namespace kernel
+{
+
+class LogisticLayer : public ::neurun::exec::IFunction
+{
+public:
+ LogisticLayer();
+
+public:
+ void logisticFloat32();
+
+ void logisticQuant8();
+
+ void configure(uint8_t *inputData, const TensorDescriptor &inputDescr, uint8_t *outputData,
+ const TensorDescriptor &outputDescr);
+
+ void run();
+ void runSync()
+ {
+ // this abstract method is used just for profiling and called for
+ // backend::acl_common::AclFunction
+ run();
+ }
+
+private:
+ DataPtr _inputData;
+ DataPtr _outputData;
+
+ TensorDescriptor _inputDescr;
+ TensorDescriptor _outputDescr;
+
+ OperandType _inputType;
+};
+
+} // namespace kernel
+} // namespace cpu
+} // namespace backend
+} // namespace neurun
+
+#endif // __NEURUN_BACKEND_CPU_KERNEL_LOGISTICLAYER_H__
diff --git a/runtime/neurun/backend/cpu/kernel/MaxPoolLayer.cc b/runtime/neurun/backend/cpu/kernel/MaxPoolLayer.cc
new file mode 100644
index 000000000..095cd6d1d
--- /dev/null
+++ b/runtime/neurun/backend/cpu/kernel/MaxPoolLayer.cc
@@ -0,0 +1,116 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "MaxPoolLayer.h"
+
+#include <cker/operation/MaxPool.h>
+
+#include "OperationUtils.h"
+
+namespace neurun
+{
+namespace backend
+{
+namespace cpu
+{
+namespace kernel
+{
+
+#define MAXPOOLING_PARAMETERS \
+ nnfw::cker::PoolParams op_params; \
+ op_params.stride_height = _strideHeight; \
+ op_params.stride_width = _strideWidth; \
+ op_params.filter_height = _kernelHeight; \
+ op_params.filter_width = _kernelWidth; \
+ op_params.padding_values.height = (int8_t)_paddingTop; \
+ op_params.padding_values.width = (int8_t)_paddingLeft;
+
+MaxPoolLayer::MaxPoolLayer()
+ : _inputData(), _outputData(), _inputDescr(), _outputDescr(), _paddingLeft(0), _paddingTop(0),
+ _paddingRight(0), _paddingBottom(0), _strideWidth(0), _strideHeight(0), _kernelWidth(0),
+ _kernelHeight(0), _activation(ir::Activation::NONE), _inputType(OperandType::FLOAT32)
+{
+ // DO NOTHING
+}
+
+void MaxPoolLayer::maxPoolFloat32()
+{
+ MAXPOOLING_PARAMETERS
+ float output_activation_min, output_activation_max;
+ CalculateActivationRangeFloat(_activation, &output_activation_min, &output_activation_max);
+ op_params.float_activation_min = output_activation_min;
+ op_params.float_activation_max = output_activation_max;
+
+ nnfw::cker::MaxPool(op_params, convertTensorDescriptorToCkerShape(_inputDescr), _inputData.f,
+ convertTensorDescriptorToCkerShape(_outputDescr), _outputData.f);
+}
+void MaxPoolLayer::maxPoolQuant8()
+{
+ MAXPOOLING_PARAMETERS
+ int32_t output_activation_min = 0;
+ int32_t output_activation_max = 0;
+ CalculateActivationRangeUint8(_activation, _outputDescr, &output_activation_min,
+ &output_activation_max);
+ op_params.quantized_activation_min = output_activation_min;
+ op_params.quantized_activation_max = output_activation_max;
+
+ nnfw::cker::MaxPool(op_params, convertTensorDescriptorToCkerShape(_inputDescr), _inputData.u8,
+ convertTensorDescriptorToCkerShape(_outputDescr), _outputData.u8);
+}
+
+void MaxPoolLayer::configure(uint8_t *inputData, const TensorDescriptor inputDescr,
+ const uint32_t paddingLeft, const uint32_t paddingRight,
+ const uint32_t paddingTop, const uint32_t paddingBottom,
+ const uint32_t strideWidth, const uint32_t strideHeight,
+ const uint32_t kernelWidth, const uint32_t kernelHeight,
+ const ir::Activation activation, uint8_t *outputData,
+ const TensorDescriptor outputDescr)
+{
+ _inputData.u8 = inputData;
+
+ _inputDescr = inputDescr;
+ _inputType = inputDescr.type;
+ _paddingLeft = paddingLeft;
+ _paddingRight = paddingRight;
+ _paddingTop = paddingTop;
+ _paddingBottom = paddingBottom;
+ _strideWidth = strideWidth;
+ _strideHeight = strideHeight;
+ _kernelWidth = kernelWidth;
+ _kernelHeight = kernelHeight;
+ _activation = activation;
+ _outputData.u8 = outputData;
+ _outputDescr = outputDescr;
+}
+
+void MaxPoolLayer::run()
+{
+ if (_inputType == OperandType::FLOAT32)
+ {
+ maxPoolFloat32();
+ }
+ else if (_inputType == OperandType::QUANT8_ASYMM)
+ {
+ maxPoolQuant8();
+ }
+}
+
+#undef MAXPOOLING_PARAMETERS
+
+} // namespace kernel
+} // namespace cpu
+} // namespace backend
+} // namespace neurun
diff --git a/runtime/neurun/backend/cpu/kernel/MaxPoolLayer.h b/runtime/neurun/backend/cpu/kernel/MaxPoolLayer.h
new file mode 100644
index 000000000..88a574c42
--- /dev/null
+++ b/runtime/neurun/backend/cpu/kernel/MaxPoolLayer.h
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NEURUN_BACKEND_CPU_KERNEL_MAXPOOLLAYER_H__
+#define __NEURUN_BACKEND_CPU_KERNEL_MAXPOOLLAYER_H__
+
+#include <exec/IFunction.h>
+
+#include "OperationUtils.h"
+
+namespace neurun
+{
+namespace backend
+{
+namespace cpu
+{
+namespace kernel
+{
+
+class MaxPoolLayer : public ::neurun::exec::IFunction
+{
+public:
+ MaxPoolLayer();
+
+public:
+ void maxPoolFloat32();
+
+ void maxPoolQuant8();
+
+ void configure(uint8_t *inputData, const TensorDescriptor inputDescr, const uint32_t paddingLeft,
+ const uint32_t paddingRight, const uint32_t paddingTop,
+ const uint32_t paddingBottom, const uint32_t strideWidth,
+ const uint32_t strideHeight, const uint32_t kernelWidth,
+ const uint32_t kernelHeight, const ir::Activation activation, uint8_t *outputData,
+ const TensorDescriptor outputDescr);
+
+ void run();
+ void runSync()
+ {
+ // this abstract method is used just for profiling and called for
+ // backend::acl_common::AclFunction
+ run();
+ }
+
+private:
+ DataPtr _inputData;
+ DataPtr _outputData;
+
+ TensorDescriptor _inputDescr;
+ TensorDescriptor _outputDescr;
+
+ uint32_t _paddingLeft;
+ uint32_t _paddingTop;
+ uint32_t _paddingRight;
+ uint32_t _paddingBottom;
+
+ uint32_t _strideWidth;
+ uint32_t _strideHeight;
+ uint32_t _kernelWidth;
+ uint32_t _kernelHeight;
+
+ ir::Activation _activation;
+
+ OperandType _inputType;
+};
+
+} // namespace kernel
+} // namespace cpu
+} // namespace backend
+} // namespace neurun
+
+#endif // __NEURUN_BACKEND_CPU_KERNEL_MAXPOOLLAYER_H__
diff --git a/runtime/neurun/backend/cpu/kernel/MulLayer.cc b/runtime/neurun/backend/cpu/kernel/MulLayer.cc
new file mode 100644
index 000000000..d6ce2cfad
--- /dev/null
+++ b/runtime/neurun/backend/cpu/kernel/MulLayer.cc
@@ -0,0 +1,101 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "MulLayer.h"
+
+#include <cker/operation/BinaryArithmeticOps.h>
+
+#include "OperationUtils.h"
+
+namespace neurun
+{
+namespace backend
+{
+namespace cpu
+{
+namespace kernel
+{
+
+void MulLayer::mulFloat32()
+{
+ float output_activation_min, output_activation_max;
+ CalculateActivationRangeFloat(_activation, &output_activation_min, &output_activation_max);
+ nnfw::cker::BinaryArithmeticOpParam op_params;
+ op_params.float_activation_max = output_activation_max;
+ op_params.float_activation_min = output_activation_min;
+
+ const std::function<float(const float &, const float &)> fn = [](const float &a, const float &b) {
+ return a * b;
+ };
+
+ if (!HaveSameShapes(&_lhsDescr, &_rhsDescr))
+ {
+ nnfw::cker::BroadcastBinaryArithmeticOpSlow(
+ op_params, convertToExtendedCkerShape(_lhsDescr), _lhsData.f,
+ convertToExtendedCkerShape(_rhsDescr), _rhsData.f, convertToExtendedCkerShape(_outputDescr),
+ _outputData.f, fn);
+ return;
+ }
+
+ nnfw::cker::BinaryArithmeticOp(op_params, convertTensorDescriptorToCkerShape(_lhsDescr),
+ _lhsData.f, convertTensorDescriptorToCkerShape(_rhsDescr),
+ _rhsData.f, convertTensorDescriptorToCkerShape(_outputDescr),
+ _outputData.f, fn);
+}
+
+void MulLayer::mulQuant8()
+{
+ int32_t output_activation_min, output_activation_max;
+ CalculateActivationRangeUint8(_activation, _outputDescr, &output_activation_min,
+ &output_activation_max);
+ // nnfw::cker::BinaryArithmeticOpParam op_params;
+ // op_params.quantized_activation_max = output_activation_max;
+ // op_params.quantized_activation_min = output_activation_min;
+
+ // cker quant8 mul is not implemented yet
+ throw std::runtime_error{"Mul NYI for quantized"};
+}
+
+void MulLayer::configure(uint8_t *lhsData, const TensorDescriptor &lhsDescr, uint8_t *rhsData,
+ const TensorDescriptor &rhsDescr, const ir::Activation activation,
+ uint8_t *outputData, const TensorDescriptor &outputDescr)
+{
+ _lhsData.u8 = lhsData;
+ _lhsDescr = lhsDescr;
+ _rhsData.u8 = rhsData;
+ _rhsDescr = rhsDescr;
+ _inputType = lhsDescr.type;
+ _activation = activation;
+ _outputData.u8 = outputData;
+ _outputDescr = outputDescr;
+}
+
+void MulLayer::run()
+{
+ if (_inputType == OperandType::FLOAT32)
+ {
+ mulFloat32();
+ }
+ else if (_inputType == OperandType::QUANT8_ASYMM)
+ {
+ mulQuant8();
+ }
+}
+
+} // namespace kernel
+} // namespace cpu
+} // namespace backend
+} // namespace neurun
diff --git a/runtime/neurun/backend/cpu/kernel/MulLayer.h b/runtime/neurun/backend/cpu/kernel/MulLayer.h
new file mode 100644
index 000000000..05fc3052f
--- /dev/null
+++ b/runtime/neurun/backend/cpu/kernel/MulLayer.h
@@ -0,0 +1,77 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NEURUN_BACKEND_CPU_KERNEL_MULLAYER_H__
+#define __NEURUN_BACKEND_CPU_KERNEL_MULLAYER_H__
+
+#include <exec/IFunction.h>
+
+#include "OperationUtils.h"
+
+namespace neurun
+{
+namespace backend
+{
+namespace cpu
+{
+namespace kernel
+{
+
+class MulLayer : public ::neurun::exec::IFunction
+{
+public:
+ MulLayer() : _lhsData(), _rhsData(), _outputData(), _lhsDescr(), _rhsDescr(), _outputDescr()
+ {
+ // DO NOTHING
+ }
+
+public:
+ void mulFloat32();
+
+ void mulQuant8();
+
+ void configure(uint8_t *lhsData, const TensorDescriptor &lhsDescr, uint8_t *rhsData,
+ const TensorDescriptor &rhsDescr, const ir::Activation activation,
+ uint8_t *outputData, const TensorDescriptor &outputDescr);
+
+ void run();
+ void runSync()
+ {
+ // this abstract method is used just for profiling and called for
+ // backend::acl_common::AclFunction
+ run();
+ }
+
+private:
+ DataPtr _lhsData;
+ DataPtr _rhsData;
+ DataPtr _outputData;
+
+ TensorDescriptor _lhsDescr;
+ TensorDescriptor _rhsDescr;
+ TensorDescriptor _outputDescr;
+
+ ir::Activation _activation{ir::Activation::NONE};
+
+ OperandType _inputType{OperandType::FLOAT32};
+};
+
+} // namespace kernel
+} // namespace cpu
+} // namespace backend
+} // namespace neurun
+
+#endif // __NEURUN_BACKEND_CPU_KERNEL_MULLAYER_H__
diff --git a/runtime/neurun/backend/cpu/kernel/OperationUtils.cc b/runtime/neurun/backend/cpu/kernel/OperationUtils.cc
new file mode 100644
index 000000000..8aa15dcbd
--- /dev/null
+++ b/runtime/neurun/backend/cpu/kernel/OperationUtils.cc
@@ -0,0 +1,273 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "OperationUtils.h"
+
+#include <cmath>
+#include <algorithm>
+#include <cassert>
+
+namespace neurun
+{
+namespace backend
+{
+namespace cpu
+{
+namespace kernel
+{
+
+uint32_t getNumberOfDimensions(const TensorDescriptor &descr) { return descr.dimensions.size(); }
+
+uint32_t getNumberOfElements(const TensorDescriptor &descr)
+{
+ uint32_t count = 1;
+ for (size_t i = 0; i < descr.dimensions.size(); i++)
+ {
+ count *= descr.dimensions[i];
+ }
+ return count;
+}
+
+uint32_t getSizeOfDimension(const TensorDescriptor &descr, uint32_t dimensionIdx)
+{
+ if (dimensionIdx >= descr.dimensions.size())
+ {
+ // TODO, log the error
+ return 0;
+ }
+ return descr.dimensions[dimensionIdx];
+}
+
+void QuantizeMultiplier(double double_multiplier, int32_t *quantized_multiplier, int *shift)
+{
+ if (double_multiplier == 0.)
+ {
+ *quantized_multiplier = 0;
+ *shift = 0;
+ return;
+ }
+ const double q = std::frexp(double_multiplier, shift);
+ auto q_fixed = static_cast<int64_t>(std::round(q * (1ll << 31)));
+
+ assert(q_fixed <= (1ll << 31));
+ if (q_fixed == (1ll << 31))
+ {
+ q_fixed /= 2;
+ ++*shift;
+ }
+ assert(q_fixed <= std::numeric_limits<int32_t>::max());
+ *quantized_multiplier = static_cast<int32_t>(q_fixed);
+}
+
+void GetQuantizedConvolutionMultiplier(const TensorDescriptor &inputDescr,
+ const TensorDescriptor &filterDescr,
+ const TensorDescriptor &biasDescr,
+ const TensorDescriptor &outputDescr, float *multiplier)
+{
+ const float input_product_scale = inputDescr.scale * filterDescr.scale;
+ const float bias_scale = biasDescr.scale;
+ const float output_scale = outputDescr.scale;
+ // The following conditions must be guaranteed by the training pipeline.
+ UNUSED_RELEASE(bias_scale);
+ assert(std::abs(input_product_scale - bias_scale) <=
+ 1e-6 * std::min(input_product_scale, bias_scale));
+ assert(input_product_scale >= 0);
+ assert(input_product_scale < output_scale);
+ *multiplier = input_product_scale / output_scale;
+}
+
+void QuantizeMultiplierGreaterThanOne(double double_multiplier, int32_t *quantized_multiplier,
+ int *left_shift)
+{
+ assert(double_multiplier > 1.);
+ const double q = std::frexp(double_multiplier, left_shift);
+ int64_t q_fixed = static_cast<int64_t>(std::round(q * (1ll << 31)));
+ assert(q_fixed <= (1ll << 31));
+ if (q_fixed == (1ll << 31))
+ {
+ q_fixed /= 2;
+ ++*left_shift;
+ }
+ assert(*left_shift >= 0);
+ assert(q_fixed <= std::numeric_limits<int32_t>::max());
+ *quantized_multiplier = static_cast<int32_t>(q_fixed);
+}
+
+void CalculateActivationRangeFloat(ir::Activation activation, float *activation_min,
+ float *activation_max)
+{
+ if (activation == ir::Activation::RELU)
+ {
+ *activation_min = 0.f;
+ *activation_max = std::numeric_limits<float>::max();
+ }
+ else if (activation == ir::Activation::RELU6)
+ {
+ *activation_min = 0.f;
+ *activation_max = 6.f;
+ }
+ else if (activation == ir::Activation::RELU1)
+ {
+ *activation_min = -1.f;
+ *activation_max = 1.f;
+ }
+ else if (activation == ir::Activation::SIGMOID)
+ {
+ *activation_min = 0.f;
+ *activation_max = 1.f;
+ }
+ else if (activation == ir::Activation::NONE)
+ {
+ *activation_min = std::numeric_limits<float>::lowest();
+ *activation_max = std::numeric_limits<float>::max();
+ }
+ else
+ {
+ std::cout << "Unsupported fused activation function." << std::endl;
+ }
+}
+
+void CalculateActivationRangeUint8(ir::Activation activation, const TensorDescriptor &outputDescr,
+ int32_t *act_min, int32_t *act_max)
+{
+ const int32_t qmin = std::numeric_limits<uint8_t>::min();
+ const int32_t qmax = std::numeric_limits<uint8_t>::max();
+ const auto scale = outputDescr.scale;
+ const auto zero_point = outputDescr.offset;
+ auto quantize = [scale, zero_point](float f) {
+ return zero_point + static_cast<int32_t>(std::round(f / scale));
+ };
+ if (activation == ir::Activation::RELU)
+ {
+ *act_min = std::max(qmin, quantize(0.0));
+ *act_max = qmax;
+ }
+ else if (activation == ir::Activation::RELU6)
+ {
+ *act_min = std::max(qmin, quantize(0.0));
+ *act_max = std::min(qmax, quantize(6.0));
+ }
+ else if (activation == ir::Activation::RELU1)
+ {
+ *act_min = std::max(qmin, quantize(-1.0));
+ *act_max = std::min(qmax, quantize(1.0));
+ }
+ else if (activation == ir::Activation::SIGMOID)
+ {
+ *act_min = std::max(qmin, quantize(0.0));
+ *act_max = std::min(qmax, quantize(1.0));
+ }
+ else if (activation == ir::Activation::NONE)
+ {
+ *act_min = qmin;
+ *act_max = qmax;
+ }
+ else
+ {
+ std::cout << "Unsupported fused activation function." << std::endl;
+ }
+}
+
+bool HaveSameShapes(const TensorDescriptor *input1, const TensorDescriptor *input2)
+{
+ if (input1 == input2)
+ return true;
+  if (input1 == NULL || input2 == NULL)
+ return false;
+
+ if (input1 == NULL)
+ {
+ return (getNumberOfDimensions(*input2) == 0);
+ }
+
+ if (getNumberOfDimensions(*input1) != getNumberOfDimensions(*input2))
+ return false;
+
+ for (uint32_t i = 0; i < getNumberOfDimensions(*input1); i++)
+ if (input1->dimensions[i] != input2->dimensions[i])
+ return false;
+
+ return true;
+}
+
+int32_t CalculateInputRadius(int input_integer_bits, int input_left_shift)
+{
+ const double max_input_rescaled = 1.0 * ((1 << input_integer_bits) - 1) *
+ (1ll << (31 - input_integer_bits)) / (1ll << input_left_shift);
+ // Tighten bound using floor. Suppose that we could use the exact value.
+ // After scaling the difference, the result would be at the maximum. Thus we
+ // must ensure that our value has lower magnitude.
+ return static_cast<int32_t>(std::floor(max_input_rescaled));
+}
+
+TensorDescriptor getTensorDescriptor(const ir::Operand &o, ir::Layout frontend_layout)
+{
+ TensorDescriptor descriptor;
+
+ auto dims = o.shape().dims();
+ if (frontend_layout == ir::Layout::NCHW && o.shape().rank() == 4)
+ {
+ // NCHW -> NHWC
+ uint32_t permutation[4] = {0, 2, 3, 1};
+ for (int i = 0; i < o.shape().rank(); ++i)
+ {
+ dims.at(i) = o.shape().dim(permutation[i]);
+ }
+ }
+ descriptor.dimensions = std::vector<uint32_t>(dims.begin(), dims.end());
+ descriptor.type = static_cast<OperandType>(static_cast<int32_t>(o.typeInfo().type()));
+ descriptor.scale = o.typeInfo().scale();
+ descriptor.offset = o.typeInfo().offset();
+
+  // The CPU backend assumes that neurun's internal shape rank is always at most 4
+ assert(descriptor.dimensions.size() <= 4);
+
+ return descriptor;
+}
+
+uint32_t sizeOfData(OperandType type, const std::vector<uint32_t> &dimensions)
+{
+ uint32_t size = 4;
+
+ switch (type)
+ {
+ case OperandType::FLOAT32:
+ case OperandType::INT32:
+ case OperandType::UINT32:
+ size = 4;
+ break;
+ case OperandType::BOOL8:
+ case OperandType::QUANT8_ASYMM:
+ case OperandType::QUANT8_SYMM:
+ size = 1;
+ break;
+ default:
+ throw std::runtime_error("Not supported operand type.");
+ break;
+ }
+
+ for (auto d : dimensions)
+ {
+ size *= d;
+ }
+
+ return size;
+}
+
+} // namespace kernel
+} // namespace cpu
+} // namespace backend
+} // namespace neurun
diff --git a/runtime/neurun/backend/cpu/kernel/OperationUtils.h b/runtime/neurun/backend/cpu/kernel/OperationUtils.h
new file mode 100644
index 000000000..b9e8c8974
--- /dev/null
+++ b/runtime/neurun/backend/cpu/kernel/OperationUtils.h
@@ -0,0 +1,152 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_SUPPORT_NNAPI_OPERATION_UTILS_H__
+#define __NNFW_SUPPORT_NNAPI_OPERATION_UTILS_H__
+
+#include <iostream>
+#include <limits>
+#include <vector>
+
+#include <cker/Shape.h>
+
+#include "ir/Operand.h"
+#include "ir/DataType.h"
+#include <ir/InternalType.h>
+
+using OperandType = neurun::ir::DataType;
+
+namespace neurun
+{
+namespace backend
+{
+namespace cpu
+{
+namespace kernel
+{
+
+struct TensorDescriptor
+{
+ OperandType type;
+ std::vector<uint32_t> dimensions;
+ float scale;
+ int32_t offset;
+};
+
+union DataPtr {
+ uint8_t *u8;
+ int8_t *i8;
+ int32_t *i32;
+ float *f;
+ void *v;
+};
+
+uint32_t getNumberOfDimensions(const TensorDescriptor &descr);
+
+uint32_t getNumberOfElements(const TensorDescriptor &descr);
+
+uint32_t getSizeOfDimension(const TensorDescriptor &descr, uint32_t dimensionIdx);
+
+inline nnfw::cker::Shape convertToExtendedCkerShape(const TensorDescriptor &descr)
+{
+ std::vector<int32_t> raw_shape;
+ raw_shape.resize(4);
+
+ uint32_t src = 4 - descr.dimensions.size();
+ for (uint32_t i = 0; i < 4; ++i)
+ {
+ if (i < src)
+ {
+ raw_shape[i] = 1;
+ }
+ else
+ {
+ raw_shape[i] = descr.dimensions[i - src];
+ }
+ }
+
+ return nnfw::cker::GetShape(raw_shape);
+}
+
+inline nnfw::cker::Shape convertTensorDescriptorToCkerShape(const TensorDescriptor &descr)
+{
+ std::vector<int32_t> raw_shape;
+ raw_shape.resize(4);
+
+ for (uint32_t i = 0; i < 4; ++i)
+ {
+ if (i >= descr.dimensions.size())
+ {
+ raw_shape[i] = 1;
+ }
+ else
+ {
+ raw_shape[i] = descr.dimensions[i];
+ }
+ }
+
+ return nnfw::cker::GetShape(raw_shape);
+}
+
+inline int32_t getAxis(uint32_t rank, int32_t axis, ir::Layout frontend_layout)
+{
+ auto ret = axis;
+
+ if (axis < 0)
+ {
+ ret += rank;
+ }
+
+ // NCHW -> NHWC
+ if (frontend_layout == ir::Layout::NCHW)
+ {
+ int32_t permutation[4] = {0, 3, 1, 2};
+ ret = permutation[ret];
+ }
+
+ return ret;
+}
+
+void QuantizeMultiplier(double double_multiplier, int32_t *quantized_multiplier, int *shift);
+
+void GetQuantizedConvolutionMultiplier(const TensorDescriptor &inputDescr,
+ const TensorDescriptor &filterDescr,
+ const TensorDescriptor &biasDescr,
+ const TensorDescriptor &outputDescr, float *multiplier);
+
+void QuantizeMultiplierGreaterThanOne(double double_multiplier, int32_t *quantized_multiplier,
+ int *left_shift);
+
+void CalculateActivationRangeFloat(ir::Activation activation, float *activation_min,
+ float *activation_max);
+
+void CalculateActivationRangeUint8(ir::Activation activation, const TensorDescriptor &outputDescr,
+ int32_t *act_min, int32_t *act_max);
+
+bool HaveSameShapes(const TensorDescriptor *input1, const TensorDescriptor *input2);
+
+int32_t CalculateInputRadius(int input_integer_bits, int input_left_shift);
+
+TensorDescriptor getTensorDescriptor(const ir::Operand &o, ir::Layout frontend_layout);
+
+uint32_t sizeOfData(OperandType type, const std::vector<uint32_t> &dimensions);
+
+} // namespace kernel
+} // namespace cpu
+} // namespace backend
+} // namespace neurun
+
+#endif // __NNFW_SUPPORT_NNAPI_OPERATION_UTILS_H__
diff --git a/runtime/neurun/backend/cpu/kernel/PadLayer.cc b/runtime/neurun/backend/cpu/kernel/PadLayer.cc
new file mode 100644
index 000000000..1fd9429b5
--- /dev/null
+++ b/runtime/neurun/backend/cpu/kernel/PadLayer.cc
@@ -0,0 +1,76 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "PadLayer.h"
+
+#include "OperationUtils.h"
+
+#include <cker/operation/Pad.h>
+
+namespace neurun
+{
+namespace backend
+{
+namespace cpu
+{
+namespace kernel
+{
+
+PadLayer::PadLayer()
+ : _inputData(), _outputData(), _inputDescr(), _outputDescr(), _padData(), _padRank(),
+ _constantValueData(), _inputType(OperandType::FLOAT32)
+{
+ // DO NOTHING
+}
+
+void PadLayer::padFloat32()
+{
+ nnfw::cker::Pad(_padData, _padRank, convertTensorDescriptorToCkerShape(_inputDescr), _inputData.f,
+ convertTensorDescriptorToCkerShape(_outputDescr), _outputData.f,
+ _constantValueData.f);
+}
+void PadLayer::padQuant8() { throw std::runtime_error("Quantized Pad isn't supported NYI"); }
+
+void PadLayer::configure(uint8_t *inputData, const TensorDescriptor inputDescr, uint8_t *outputData,
+ const TensorDescriptor outputDescr, const int32_t *padData,
+ int32_t padRank, uint8_t *constantValueData)
+{
+ _inputData.u8 = inputData;
+ _inputDescr = inputDescr;
+ _inputType = inputDescr.type;
+ _outputData.u8 = outputData;
+ _outputDescr = outputDescr;
+ _padData = padData;
+ _padRank = padRank;
+ _constantValueData.u8 = constantValueData;
+}
+
+void PadLayer::run()
+{
+ if (_inputType == OperandType::FLOAT32)
+ {
+ padFloat32();
+ }
+ else if (_inputType == OperandType::QUANT8_ASYMM)
+ {
+ padQuant8();
+ }
+}
+
+} // namespace kernel
+} // namespace cpu
+} // namespace backend
+} // namespace neurun
diff --git a/runtime/neurun/backend/cpu/kernel/PadLayer.h b/runtime/neurun/backend/cpu/kernel/PadLayer.h
new file mode 100644
index 000000000..f4413a8ed
--- /dev/null
+++ b/runtime/neurun/backend/cpu/kernel/PadLayer.h
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NEURUN_BACKEND_CPU_KERNEL_PADLAYER_H__
+#define __NEURUN_BACKEND_CPU_KERNEL_PADLAYER_H__
+
+#include <exec/IFunction.h>
+
+#include "OperationUtils.h"
+
+namespace neurun
+{
+namespace backend
+{
+namespace cpu
+{
+namespace kernel
+{
+
+// Note, this is pad with mode=`CONSTANT`: it doesn't support `REFLECT` and `SYMMETRIC`
+class PadLayer : public ::neurun::exec::IFunction
+{
+public:
+ PadLayer();
+
+public:
+ void padFloat32();
+
+ void padQuant8();
+
+ void configure(uint8_t *inputData, const TensorDescriptor inputDescr, uint8_t *outputData,
+ const TensorDescriptor outputDescr, const int32_t *padData, int32_t padRank,
+ uint8_t *constantValueData = nullptr);
+
+ void run();
+ void runSync()
+ {
+ // this abstract method is used just for profiling and called for
+ // backend::acl_common::AclFunction
+ run();
+ }
+
+private:
+ DataPtr _inputData;
+ DataPtr _outputData;
+
+ TensorDescriptor _inputDescr;
+ TensorDescriptor _outputDescr;
+
+ const int32_t *_padData;
+ int32_t _padRank;
+ DataPtr _constantValueData;
+
+ OperandType _inputType;
+};
+
+} // namespace kernel
+} // namespace cpu
+} // namespace backend
+} // namespace neurun
+
+#endif // __NEURUN_BACKEND_CPU_KERNEL_PADLAYER_H__
diff --git a/runtime/neurun/backend/cpu/kernel/PermuteLayer.cc b/runtime/neurun/backend/cpu/kernel/PermuteLayer.cc
new file mode 100644
index 000000000..6f28d8436
--- /dev/null
+++ b/runtime/neurun/backend/cpu/kernel/PermuteLayer.cc
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "PermuteLayer.h"
+
+namespace neurun
+{
+namespace backend
+{
+namespace cpu
+{
+namespace kernel
+{
+
+using Type = ir::operation::Permute::Type;
+
+void PermuteLayer::configure(std::shared_ptr<backend::operand::ITensor> input,
+ std::shared_ptr<backend::operand::ITensor> output,
+ const ir::Shape &output_shape, Type type, ir::DataType dataType)
+{
+ _input = input;
+ _output = output;
+ _output_shape = output_shape;
+ _type = type;
+ _dataType = dataType;
+}
+
+void PermuteLayer::run()
+{
+ using ir::DataType;
+ switch (_dataType)
+ {
+ case DataType::FLOAT32:
+ runTempl<float>();
+ break;
+ case DataType::INT32:
+ runTempl<int32_t>();
+ break;
+ case DataType::UINT32:
+ runTempl<uint32_t>();
+ break;
+ case DataType::BOOL8:
+ case DataType::QUANT8_ASYMM:
+ runTempl<uint8_t>();
+ break;
+ case DataType::QUANT8_SYMM:
+ runTempl<int8_t>();
+ break;
+ default:
+ throw std::runtime_error("NYI");
+ break;
+ }
+}
+
+} // namespace kernel
+} // namespace cpu
+} // namespace backend
+} // namespace neurun
diff --git a/runtime/neurun/backend/cpu/kernel/PermuteLayer.h b/runtime/neurun/backend/cpu/kernel/PermuteLayer.h
new file mode 100644
index 000000000..1f9110807
--- /dev/null
+++ b/runtime/neurun/backend/cpu/kernel/PermuteLayer.h
@@ -0,0 +1,209 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NEURUN_BACKEND_CPU_KERNEL_PERMUTE_LAYER_H__
+#define __NEURUN_BACKEND_CPU_KERNEL_PERMUTE_LAYER_H__
+
+#include <exec/IFunction.h>
+
+#include "util/feature/nhwc/View.h"
+#include "OperationUtils.h"
+#include "ir/operation/Permute.h"
+#include "util/feature/nhwc/Reader.h"
+#include "util/feature/nchw/View.h"
+#include "util/Coordinates.h"
+
+#include <misc/feature/IndexIterator.h>
+namespace neurun
+{
+namespace backend
+{
+namespace cpu
+{
+namespace kernel
+{
+
+class PermuteLayer : public ::neurun::exec::IFunction
+{
+public:
+ PermuteLayer() = default;
+
+public:
+ void configure(std::shared_ptr<backend::operand::ITensor> input,
+ std::shared_ptr<backend::operand::ITensor> output, const ir::Shape &output_shape,
+ ir::operation::Permute::Type type, ir::DataType dataType);
+ void run();
+ void runSync()
+ {
+ // this abstract method is used just for profiling and called for
+ // backend::acl_common::AclFunction
+ run();
+ }
+
+private:
+ template <class T> void runTempl()
+ {
+ auto rank = _output_shape.rank();
+ auto fn = [&](::neurun::backend::operand::ITensor &in_tensor) {
+ _output->access([&](::neurun::backend::operand::ITensor &out_tensor) {
+ auto input_buffer = in_tensor.buffer();
+ auto input_size = in_tensor.total_size();
+ auto output_buffer = out_tensor.buffer();
+ if (_type == ir::operation::Permute::Type::COPY)
+ {
+ assert(in_tensor.layout() == out_tensor.layout());
+ if (!in_tensor.has_padding() && !out_tensor.has_padding())
+ {
+ assert(input_size == out_tensor.total_size());
+ memcpy(output_buffer, input_buffer, input_size);
+ return;
+ }
+ }
+ switch (rank)
+ {
+ case 0:
+ case 1:
+ {
+ const int32_t copy_len = _output_shape.dim(0);
+
+ memcpy(output_buffer, input_buffer, copy_len);
+ break;
+ }
+ case 2:
+ {
+ const int32_t copy_len = _output_shape.dim(1);
+
+ for (auto i = 0; i < _output_shape.dim(0); ++i)
+ {
+ neurun::util::Coordinates coords{i, 0};
+ memcpy(output_buffer + out_tensor.calcOffset(coords),
+ input_buffer + in_tensor.calcOffset(coords), copy_len * sizeof(T));
+ }
+ break;
+ }
+ case 3:
+ {
+ const int32_t copy_len = _output_shape.dim(2);
+
+ for (auto i = 0; i < _output_shape.dim(0); ++i)
+ {
+ for (auto j = 0; j < _output_shape.dim(1); ++j)
+ {
+ neurun::util::Coordinates coords{i, j, 0};
+ memcpy(output_buffer + out_tensor.calcOffset(coords),
+ input_buffer + in_tensor.calcOffset(coords), copy_len * sizeof(T));
+ }
+ }
+ break;
+ }
+ case 4:
+ {
+ // TODO Unify permute type and remove switch case
+ switch (_type)
+ {
+ case ir::operation::Permute::Type::NHWC_TO_NCHW:
+ {
+ for (auto n = 0; n < _output_shape.dim(0); ++n)
+ {
+ for (auto c = 0; c < _output_shape.dim(1); ++c)
+ {
+ for (auto h = 0; h < _output_shape.dim(2); ++h)
+ {
+ for (auto w = 0; w < _output_shape.dim(3); ++w)
+ {
+ const neurun::util::Coordinates in_coords{n, h, w, c};
+ const auto out_coords =
+ convertCoordinates(in_coords, in_tensor.layout(), out_tensor.layout());
+ const auto value =
+ *reinterpret_cast<T *>(input_buffer + in_tensor.calcOffset(in_coords));
+ *reinterpret_cast<T *>(output_buffer + out_tensor.calcOffset(out_coords)) =
+ value;
+ }
+ }
+ }
+ }
+ break;
+ }
+ case ir::operation::Permute::Type::NCHW_TO_NHWC:
+ {
+ for (auto n = 0; n < _output_shape.dim(0); ++n)
+ {
+ for (auto h = 0; h < _output_shape.dim(1); ++h)
+ {
+ for (auto w = 0; w < _output_shape.dim(2); ++w)
+ {
+ for (auto c = 0; c < _output_shape.dim(3); ++c)
+ {
+ const neurun::util::Coordinates in_coords{n, c, h, w};
+ const auto out_coords =
+ convertCoordinates(in_coords, in_tensor.layout(), out_tensor.layout());
+ const auto value =
+ *reinterpret_cast<T *>(input_buffer + in_tensor.calcOffset(in_coords));
+ *reinterpret_cast<T *>(output_buffer + out_tensor.calcOffset(out_coords)) =
+ value;
+ }
+ }
+ }
+ }
+ break;
+ }
+ case ir::operation::Permute::Type::COPY:
+ {
+ const int32_t copy_len = _output_shape.dim(3);
+
+ for (auto i = 0; i < _output_shape.dim(0); ++i)
+ {
+ for (auto j = 0; j < _output_shape.dim(1); ++j)
+ {
+ for (auto k = 0; k < _output_shape.dim(2); ++k)
+ {
+ neurun::util::Coordinates coords{i, j, k, 0};
+ memcpy(output_buffer + out_tensor.calcOffset(coords),
+ input_buffer + in_tensor.calcOffset(coords), copy_len * sizeof(T));
+ }
+ }
+ }
+ break;
+ }
+ default:
+ throw std::runtime_error("NYI");
+ break;
+ }
+ break;
+ }
+ default:
+ throw std::runtime_error("NYI");
+ break;
+ }
+ });
+ };
+ _input->access(fn);
+ }
+
+private:
+ std::shared_ptr<backend::operand::ITensor> _input{nullptr};
+ std::shared_ptr<backend::operand::ITensor> _output{nullptr};
+ ir::Shape _output_shape{};
+ ir::operation::Permute::Type _type{ir::operation::Permute::Type::COPY};
+ ir::DataType _dataType{ir::DataType::FLOAT32};
+};
+
+} // namespace kernel
+} // namespace cpu
+} // namespace backend
+} // namespace neurun
+
+#endif // __NEURUN_BACKEND_CPU_KERNEL_PERMUTE_LAYER_H__
diff --git a/runtime/neurun/backend/cpu/kernel/ReshapeLayer.cc b/runtime/neurun/backend/cpu/kernel/ReshapeLayer.cc
new file mode 100644
index 000000000..caeee9f12
--- /dev/null
+++ b/runtime/neurun/backend/cpu/kernel/ReshapeLayer.cc
@@ -0,0 +1,54 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (C) 2017 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "ReshapeLayer.h"
+
+namespace neurun
+{
+namespace backend
+{
+namespace cpu
+{
+namespace kernel
+{
+
+ReshapeLayer::ReshapeLayer() : _inputData(), _outputData(), _inputDescr(), _outputDescr()
+{
+ // DO NOTHING
+}
+
+void ReshapeLayer::reshapeGeneric()
+{
+ size_t count = sizeOfData(_inputDescr.type, _inputDescr.dimensions);
+ memcpy(_outputData.v, _inputData.v, count);
+}
+
+void ReshapeLayer::configure(uint8_t *inputData, const TensorDescriptor &inputDescr,
+ uint8_t *outputData, const TensorDescriptor &outputDescr)
+{
+ _inputData.u8 = inputData;
+ _inputDescr = inputDescr;
+ _outputData.u8 = outputData;
+ _outputDescr = outputDescr;
+}
+
+void ReshapeLayer::run() { reshapeGeneric(); }
+
+} // namespace kernel
+} // namespace cpu
+} // namespace backend
+} // namespace neurun
diff --git a/runtime/neurun/backend/cpu/kernel/ReshapeLayer.h b/runtime/neurun/backend/cpu/kernel/ReshapeLayer.h
new file mode 100644
index 000000000..25dd851b2
--- /dev/null
+++ b/runtime/neurun/backend/cpu/kernel/ReshapeLayer.h
@@ -0,0 +1,65 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NEURUN_BACKEND_CPU_KERNEL_RESHAPELAYER_H__
+#define __NEURUN_BACKEND_CPU_KERNEL_RESHAPELAYER_H__
+
+#include <exec/IFunction.h>
+
+#include "OperationUtils.h"
+
+namespace neurun
+{
+namespace backend
+{
+namespace cpu
+{
+namespace kernel
+{
+
+class ReshapeLayer : public ::neurun::exec::IFunction
+{
+public:
+ ReshapeLayer();
+
+public:
+ void reshapeGeneric();
+
+ void configure(uint8_t *inputData, const TensorDescriptor &inputDescr, uint8_t *outputData,
+ const TensorDescriptor &outputDescr);
+
+ void run();
+ void runSync()
+ {
+ // this abstract method is used just for profiling and called for
+ // backend::acl_common::AclFunction
+ run();
+ }
+
+private:
+ DataPtr _inputData;
+ DataPtr _outputData;
+
+ TensorDescriptor _inputDescr;
+ TensorDescriptor _outputDescr;
+};
+
+} // namespace kernel
+} // namespace cpu
+} // namespace backend
+} // namespace neurun
+
+#endif // __NEURUN_BACKEND_CPU_KERNEL_RESHAPELAYER_H__
diff --git a/runtime/neurun/backend/cpu/kernel/SoftMaxLayer.cc b/runtime/neurun/backend/cpu/kernel/SoftMaxLayer.cc
new file mode 100644
index 000000000..58ba109b4
--- /dev/null
+++ b/runtime/neurun/backend/cpu/kernel/SoftMaxLayer.cc
@@ -0,0 +1,172 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "SoftMaxLayer.h"
+
+#include <cker/operation/SoftMax.h>
+
+#include "OperationUtils.h"
+
+namespace neurun
+{
+namespace backend
+{
+namespace cpu
+{
+namespace kernel
+{
+
+SoftMaxLayer::SoftMaxLayer()
+ : _inputData(), _outputData(), _beta(0.0), _inputDescr(), _outputDescr(),
+ _inputType(OperandType::FLOAT32)
+{
+ // DO NOTHING
+}
+
+// Performs softmax along the input of size (input_size * batch_size).
+void Softmax(const float *in, const int input_size, const int batch_size, const float beta,
+ float *out)
+{
+ assert(input_size > 0);
+
+ // For each batch
+ for (int b = 0; b < batch_size; b++)
+ {
+ // Find the max coeff.
+ float max_coeff = in[0];
+ for (int i = 1; i < input_size; i++)
+ {
+ if (in[i] > max_coeff)
+ max_coeff = in[i];
+ }
+
+ // Compute the normalized sum of exps.
+ float exp_sum = 0.0;
+ for (int i = 0; i < input_size; i++)
+ {
+ out[i] = std::exp((in[i] - max_coeff) * beta);
+ exp_sum += out[i];
+ }
+
+ // Divide by the sum of exps.
+ float reciprocal_sum_exp = 1.f / exp_sum;
+ for (int i = 0; i < input_size; i++)
+ {
+ out[i] *= reciprocal_sum_exp;
+ }
+
+ // Advance in and out pointers for the next batch.
+ in += input_size;
+ out += input_size;
+ }
+}
+
+void SoftMaxLayer::softmaxFloat32()
+{
+ TensorDescriptor descrIn4D;
+
+ if (getNumberOfDimensions(_inputDescr) == 2)
+ {
+ uint32_t batch_size = getSizeOfDimension(_inputDescr, 0);
+ if (batch_size == 0)
+ throw std::runtime_error("batch_size should not be 0");
+
+ uint32_t input_size = getNumberOfElements(_inputDescr) / batch_size;
+ Softmax(_inputData.f, input_size, batch_size, _beta, _outputData.f);
+ }
+ else if (getNumberOfDimensions(_inputDescr) == 4)
+ {
+ nnfw::cker::SoftmaxParams op_params;
+ op_params.beta = _beta;
+ nnfw::cker::Softmax(op_params, convertTensorDescriptorToCkerShape(_inputDescr), _inputData.f,
+ convertTensorDescriptorToCkerShape(_outputDescr), _outputData.f);
+ }
+ else
+ {
+ throw std::runtime_error{"only 2D and 4D tensors supported"};
+ }
+}
+
+void SoftMaxLayer::softmaxQuant8()
+{
+ TensorDescriptor descrIn4D = _inputDescr;
+
+ if (getNumberOfDimensions(_inputDescr) == 2)
+ {
+ uint32_t batch_size = getSizeOfDimension(_inputDescr, 0);
+ if (batch_size == 0)
+ throw std::runtime_error("batch_size should not be 0");
+
+ uint32_t input_size = getNumberOfElements(_inputDescr) / batch_size;
+ descrIn4D.dimensions = {batch_size, 1, 1, input_size};
+ }
+ else if (getNumberOfDimensions(_inputDescr) == 4)
+ {
+ descrIn4D = _inputDescr;
+ }
+ else
+ {
+ throw std::runtime_error{"only 2D and 4D tensors supported"};
+ }
+ if (_outputDescr.offset != 0 || _outputDescr.scale != 1.f / 256)
+ {
+ throw std::runtime_error{"incorrect scale / offset for output"};
+ }
+ static const int32_t kScaledDiffIntegerBits = 5;
+ const double input_beta_real_multiplier = std::min(
+ 1.0 * _beta * _inputDescr.scale * (1 << (31 - kScaledDiffIntegerBits)), (1ll << 31) - 1.0);
+ int32_t input_multiplier = 0;
+ int32_t input_left_shift = 0;
+ QuantizeMultiplierGreaterThanOne(input_beta_real_multiplier, &input_multiplier,
+ &input_left_shift);
+ float diff_min = -1.0f * CalculateInputRadius(kScaledDiffIntegerBits, input_left_shift);
+
+ nnfw::cker::SoftmaxParams op_params;
+ op_params.input_multiplier = input_multiplier;
+ op_params.input_left_shift = input_left_shift;
+ op_params.diff_min = diff_min;
+ nnfw::cker::Softmax(op_params, convertTensorDescriptorToCkerShape(descrIn4D), _inputData.u8,
+ convertTensorDescriptorToCkerShape(descrIn4D), _outputData.u8);
+}
+
+void SoftMaxLayer::configure(uint8_t *inputData, const TensorDescriptor &inputDescr,
+ const float beta, uint8_t *outputData,
+ const TensorDescriptor &outputDescr)
+{
+ _inputData.u8 = inputData;
+ _inputDescr = inputDescr;
+ _inputType = inputDescr.type;
+ _outputData.u8 = outputData;
+ _outputDescr = outputDescr;
+ _beta = beta;
+}
+
+void SoftMaxLayer::run()
+{
+ if (_inputType == OperandType::FLOAT32)
+ {
+ softmaxFloat32();
+ }
+ else if (_inputType == OperandType::QUANT8_ASYMM)
+ {
+ softmaxQuant8();
+ }
+}
+
+} // namespace kernel
+} // namespace cpu
+} // namespace backend
+} // namespace neurun
diff --git a/runtime/neurun/backend/cpu/kernel/SoftMaxLayer.h b/runtime/neurun/backend/cpu/kernel/SoftMaxLayer.h
new file mode 100644
index 000000000..4723afb72
--- /dev/null
+++ b/runtime/neurun/backend/cpu/kernel/SoftMaxLayer.h
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NEURUN_BACKEND_CPU_KERNEL_SOFTMAXLAYER_H__
+#define __NEURUN_BACKEND_CPU_KERNEL_SOFTMAXLAYER_H__
+
+#include <exec/IFunction.h>
+
+#include "OperationUtils.h"
+
+namespace neurun
+{
+namespace backend
+{
+namespace cpu
+{
+namespace kernel
+{
+
+// Softmax activation kernel for the CPU backend.
+// Supports FLOAT32 and QUANT8_ASYMM inputs; the path is selected in run()
+// from the input operand type recorded by configure().
+class SoftMaxLayer : public ::neurun::exec::IFunction
+{
+public:
+  SoftMaxLayer();
+
+public:
+  // float32 path
+  void softmaxFloat32();
+
+  // quantized (asymmetric uint8) path
+  void softmaxQuant8();
+
+  // Record input/output buffers, descriptors and the beta scaling factor.
+  void configure(uint8_t *inputData, const TensorDescriptor &inputDescr, const float beta,
+                 uint8_t *outputData, const TensorDescriptor &outputDescr);
+
+  void run() override;
+  void runSync() override
+  {
+    // this abstract method is used just for profiling and called for
+    // backend::acl_common::AclFunction
+    run();
+  }
+
+private:
+  DataPtr _inputData;
+  DataPtr _outputData;
+
+  // In-class defaults guard against use before configure() (mirrors SubLayer.h).
+  float _beta{0.0f};
+
+  TensorDescriptor _inputDescr;
+  TensorDescriptor _outputDescr;
+
+  OperandType _inputType{OperandType::FLOAT32};
+};
+
+} // namespace kernel
+} // namespace cpu
+} // namespace backend
+} // namespace neurun
+
+#endif // __NEURUN_BACKEND_CPU_KERNEL_SOFTMAXLAYER_H__
diff --git a/runtime/neurun/backend/cpu/kernel/SubLayer.cc b/runtime/neurun/backend/cpu/kernel/SubLayer.cc
new file mode 100644
index 000000000..c6f7188e0
--- /dev/null
+++ b/runtime/neurun/backend/cpu/kernel/SubLayer.cc
@@ -0,0 +1,100 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "SubLayer.h"
+
+#include <cker/operation/BinaryArithmeticOps.h>
+
+#include "OperationUtils.h"
+
+namespace neurun
+{
+namespace backend
+{
+namespace cpu
+{
+namespace kernel
+{
+
+// Element-wise float32 subtraction (lhs - rhs) with the fused activation
+// applied as a clamp on the result.
+void SubLayer::subFloat32()
+{
+  // Clamp bounds derived from the fused activation.
+  float act_min, act_max;
+  CalculateActivationRangeFloat(_activation, &act_min, &act_max);
+
+  nnfw::cker::BinaryArithmeticOpParam params;
+  params.float_activation_min = act_min;
+  params.float_activation_max = act_max;
+
+  const std::function<float(const float &, const float &)> sub_fn =
+      [](const float &lhs, const float &rhs) { return lhs - rhs; };
+
+  if (HaveSameShapes(&_lhsDescr, &_rhsDescr))
+  {
+    nnfw::cker::BinaryArithmeticOp(params, convertTensorDescriptorToCkerShape(_lhsDescr),
+                                   _lhsData.f, convertTensorDescriptorToCkerShape(_rhsDescr),
+                                   _rhsData.f, convertTensorDescriptorToCkerShape(_outputDescr),
+                                   _outputData.f, sub_fn);
+  }
+  else
+  {
+    // Shapes differ: fall back to the (slower) broadcasting kernel.
+    nnfw::cker::BroadcastBinaryArithmeticOpSlow(
+        params, convertToExtendedCkerShape(_lhsDescr), _lhsData.f,
+        convertToExtendedCkerShape(_rhsDescr), _rhsData.f,
+        convertToExtendedCkerShape(_outputDescr), _outputData.f, sub_fn);
+  }
+}
+
+// Quantized (asymmetric uint8) subtraction — not implemented yet.
+// The activation range is computed up front so the commented-out param
+// block below can be enabled as-is once the cker kernel lands.
+void SubLayer::subQuant8()
+{
+  int32_t output_activation_min, output_activation_max;
+  CalculateActivationRangeUint8(_activation, _outputDescr, &output_activation_min,
+                                &output_activation_max);
+  // nnfw::cker::SubParam op_params;
+  // op_params.quantized_activation_max = output_activation_max;
+  // op_params.quantized_activation_min = output_activation_min;
+
+  // cker quant8 sub is not implemented yet
+  throw std::runtime_error{"NYI"};
+}
+
+// Record everything run() needs: operand buffers, their descriptors and the
+// fused activation. The dispatch type is taken from the lhs operand.
+void SubLayer::configure(uint8_t *lhsData, const TensorDescriptor &lhsDescr, uint8_t *rhsData,
+                         const TensorDescriptor &rhsDescr, const ir::Activation activation,
+                         uint8_t *outputData, const TensorDescriptor &outputDescr)
+{
+  _activation = activation;
+  _inputType = lhsDescr.type;
+
+  _lhsData.u8 = lhsData;
+  _lhsDescr = lhsDescr;
+
+  _rhsData.u8 = rhsData;
+  _rhsDescr = rhsDescr;
+
+  _outputData.u8 = outputData;
+  _outputDescr = outputDescr;
+}
+
+// Dispatch to the kernel matching the configured input type.
+void SubLayer::run()
+{
+  if (_inputType == OperandType::FLOAT32)
+  {
+    subFloat32();
+  }
+  else if (_inputType == OperandType::QUANT8_ASYMM)
+  {
+    subQuant8();
+  }
+  else
+  {
+    // Previously an unsupported operand type fell through silently,
+    // leaving the output buffer untouched; fail loudly instead.
+    throw std::runtime_error{"Sub: unsupported input type"};
+  }
+}
+
+} // namespace kernel
+} // namespace cpu
+} // namespace backend
+} // namespace neurun
diff --git a/runtime/neurun/backend/cpu/kernel/SubLayer.h b/runtime/neurun/backend/cpu/kernel/SubLayer.h
new file mode 100644
index 000000000..c9abdb48c
--- /dev/null
+++ b/runtime/neurun/backend/cpu/kernel/SubLayer.h
@@ -0,0 +1,77 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NEURUN_BACKEND_CPU_KERNEL_SUBLAYER_H__
+#define __NEURUN_BACKEND_CPU_KERNEL_SUBLAYER_H__
+
+#include <exec/IFunction.h>
+
+#include "OperationUtils.h"
+
+namespace neurun
+{
+namespace backend
+{
+namespace cpu
+{
+namespace kernel
+{
+
+// Element-wise subtraction (lhs - rhs) with an optional fused activation.
+// FLOAT32 is implemented; QUANT8_ASYMM currently throws (cker kernel NYI).
+class SubLayer : public ::neurun::exec::IFunction
+{
+public:
+  SubLayer() : _lhsData(), _rhsData(), _outputData(), _lhsDescr(), _rhsDescr(), _outputDescr()
+  {
+    // DO NOTHING
+  }
+
+public:
+  // float32 path
+  void subFloat32();
+
+  // quantized (asymmetric uint8) path — not yet implemented
+  void subQuant8();
+
+  // Record operand buffers/descriptors and the fused activation to apply.
+  void configure(uint8_t *lhsData, const TensorDescriptor &lhsDescr, uint8_t *rhsData,
+                 const TensorDescriptor &rhsDescr, const ir::Activation activation,
+                 uint8_t *outputData, const TensorDescriptor &outputDescr);
+
+  void run() override;
+  void runSync() override
+  {
+    // this abstract method is used just for profiling and called for
+    // backend::acl_common::AclFunction
+    run();
+  }
+
+private:
+  DataPtr _lhsData;
+  DataPtr _rhsData;
+  DataPtr _outputData;
+
+  TensorDescriptor _lhsDescr;
+  TensorDescriptor _rhsDescr;
+  TensorDescriptor _outputDescr;
+
+  ir::Activation _activation{ir::Activation::NONE};
+
+  OperandType _inputType{OperandType::FLOAT32};
+};
+
+} // namespace kernel
+} // namespace cpu
+} // namespace backend
+} // namespace neurun
+
+#endif // __NEURUN_BACKEND_CPU_KERNEL_SUBLAYER_H__