47 files changed, 5248 insertions, 0 deletions
diff --git a/libs/kernel/acl/CMakeLists.txt b/libs/kernel/acl/CMakeLists.txt
new file mode 100644
index 000000000..8f0486e56
--- /dev/null
+++ b/libs/kernel/acl/CMakeLists.txt
@@ -0,0 +1,94 @@
+set(LIB_KERNELACL kernelacl)            # shared library target
+set(LIB_KERNELACL_TEST kernelacl_test)  # unit-test executable target
+
+# TODO remove this when default goes to c++14
+if(CMAKE_VERSION VERSION_LESS 3.1.0)  # CMAKE_CXX_STANDARD was added in 3.1
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++14")
+else()
+  set(CMAKE_CXX_STANDARD 14)
+endif()
+
+# runtime information
+set(PATH_RUNTIME_NN ${CMAKE_SOURCE_DIR}/runtimes/nn)
+# Include directories of the NN runtime and of the code under its depend/ tree.
+set(RUNTIME_INCLUDES
+    ${PATH_RUNTIME_NN}/common/include
+    ${PATH_RUNTIME_NN}/runtime/include
+    ${PATH_RUNTIME_NN}/depend/hal/include
+    ${PATH_RUNTIME_NN}/depend/libhidl/base/include
+    ${PATH_RUNTIME_NN}/depend/libcutils/include
+    ${PATH_RUNTIME_NN}/depend/libutils/include
+    ${PATH_RUNTIME_NN}/depend/android-base/include)
+# common: directory-scoped, so it applies to every target created below.
+link_directories(${CMAKE_INSTALL_PREFIX}/lib)
+
+# kernel library: common sources first, then the src/cl and src/neon kernels
+set(KERNELACL_SRCS
+    src/Init_acl.cpp
+    src/IO_accessor.cpp
+    src/shape.cpp
+    src/support.cpp
+    src/cl/Conv2D.cpp
+    src/cl/DepthwiseConv2D.cpp
+    src/cl/FullyConnected.cpp
+    src/cl/Pooling.cpp
+    src/cl/Reshape.cpp
+    src/cl/Softmax.cpp
+    src/cl/Concatenation.cpp
+    src/neon/Conv2D.cpp
+    src/neon/DepthwiseConv2D.cpp
+    src/neon/FullyConnected.cpp
+    src/neon/Pooling.cpp
+    src/neon/Softmax.cpp
+    src/neon/Reshape.cpp
+    src/neon/Concatenation.cpp)
+
+# Shared library built from KERNELACL_SRCS and installed into lib/.
+add_library(${LIB_KERNELACL} SHARED ${KERNELACL_SRCS})
+target_include_directories(${LIB_KERNELACL} PUBLIC
+                           ${NNFW_INCLUDE_DIR}
+                           ${RUNTIME_INCLUDES}
+                           ${NNFW_ACL_INCLUDES}
+                           ${CMAKE_SOURCE_DIR}/include
+                           )
+target_link_libraries(${LIB_KERNELACL} nnfw_support_nnapi nnfw_util ${NNFW_ACL_LIBS})
+# Quoted: an unset or empty TARGET_OS must compare as "" instead of breaking if().
+if("${TARGET_OS}" STREQUAL "tizen")
+  target_link_libraries(${LIB_KERNELACL} OpenCL)  # only Tizen links OpenCL here
+endif()
+install(TARGETS ${LIB_KERNELACL} DESTINATION lib)
+
+# kernel test executable: shared test helpers, then one test file per kernel
+set(KERNELACL_TEST_SRCS
+    src/util.cpp
+    src/gtest_env.cpp
+    src/cl/Conv2D.test.cpp
+    src/cl/DepthwiseConv2D.test.cpp
+    src/cl/FullyConnected.test.cpp
+    src/cl/Pooling.test.cpp
+    src/cl/Reshape.test.cpp
+    src/cl/Softmax.test.cpp
+    src/cl/Concatenation.test.cpp
+    src/neon/Conv2D.test.cpp
+    src/neon/DepthwiseConv2D.test.cpp
+    src/neon/FullyConnected.test.cpp
+    src/neon/Pooling.test.cpp
+    src/neon/Softmax.test.cpp
+    src/neon/Reshape.test.cpp
+    src/neon/Concatenation.test.cpp)
+
+add_executable(${LIB_KERNELACL_TEST} ${KERNELACL_TEST_SRCS})
+target_include_directories(${LIB_KERNELACL_TEST} PUBLIC
+                           ${NNFW_INCLUDE_DIR}
+                           ${RUNTIME_INCLUDES}
+                           ${NNFW_ACL_INCLUDES}
+                           )
+# Quoted: an unset or empty TARGET_OS must compare as "" instead of breaking if().
+if(NOT "${TARGET_OS}" STREQUAL "tizen")
+  add_dependencies(${LIB_KERNELACL_TEST} googletest)
+endif()
+target_link_libraries(${LIB_KERNELACL_TEST}
+                      ${LIB_KERNELACL}
+                      nnfw_util ${NNFW_ACL_LIBS}
+                      ${NNFW_GTEST_LIBS})
+install(TARGETS ${LIB_KERNELACL_TEST} DESTINATION unittest)
diff --git a/libs/kernel/acl/src/CLUniqueTensor.h b/libs/kernel/acl/src/CLUniqueTensor.h
new file mode 100644
index 000000000..6844e4565
--- /dev/null
+++ b/libs/kernel/acl/src/CLUniqueTensor.h
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_KERNEL_ACL_CLUNIQUETENSOR_H__
+#define __NNFW_KERNEL_ACL_CLUNIQUETENSOR_H__
+
+#include <arm_compute/runtime/CL/CLTensor.h>
+
+namespace nnfw {
+namespace kernel {
+namespace acl {
+
+// Wrapper that owns exactly one ::arm_compute::CLTensor.
+//
+// Construction passes a TensorInfo to init() on the tensor's allocator and
+// destruction calls free() on it. allocate() is a separate, explicit step.
+// Instances can be neither copy- nor move-constructed.
+//
+class CLUniqueTensor
+{
+public:
+  // Hands `info` to init() on the tensor's allocator.
+  CLUniqueTensor(const ::arm_compute::TensorInfo &info) { _tensor.allocator()->init(info); }
+
+  // Calls free() on the tensor's allocator.
+  ~CLUniqueTensor() { _tensor.allocator()->free(); }
+
+  // Both copy and move are not allowed
+  CLUniqueTensor(CLUniqueTensor &&) = delete;
+  CLUniqueTensor(const CLUniqueTensor &) = delete;
+
+  // Forwards to allocate() on the tensor's allocator.
+  void allocate() { _tensor.allocator()->allocate(); }
+
+  // Address of the wrapped tensor.
+  ::arm_compute::CLTensor *ptr() { return &_tensor; }
+
+  // Reference to the wrapped tensor.
+  ::arm_compute::CLTensor &ref() { return _tensor; }
+
+private:
+  // The single tensor held by this wrapper.
+  ::arm_compute::CLTensor _tensor;
+};
+
+} // namespace acl
+} // namespace kernel
+} // namespace nnfw
+
+#endif //__NNFW_KERNEL_ACL_CLUNIQUETENSOR_H__
diff --git a/libs/kernel/acl/src/DepthwiseConv2D.h b/libs/kernel/acl/src/DepthwiseConv2D.h
new file mode 100644
index 000000000..8af8d4fd0
--- /dev/null
+++ b/libs/kernel/acl/src/DepthwiseConv2D.h
@@ -0,0 +1,98 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_KERNEL_ACL_DEPTHWISECONV2D_COMMON_H__
+#define __NNFW_KERNEL_ACL_DEPTHWISECONV2D_COMMON_H__
+
+#include <OperationsUtils.h>
+#include <arm_compute/core/TensorShape.h>
+#include <arm_compute/core/TensorInfo.h>
+#include <arm_compute/runtime/IFunction.h>
+
+#include "shape.h"
+#include "IO_accessor.h"
+
+namespace nnfw {
+namespace kernel {
+namespace acl {
+
+namespace common {
+
+typedef std::function<void (void)> sync_scheduler_f;
+
+// Runs one float32 depthwise convolution through ACL; always returns true.
+// TensorT must offer ptr(), ref() and allocate(). sync_scheduler is called
+// once after every function has run and before outputData is written.
+// NOTE(review): depth_multiplier is accepted but never read in this body.
+template<class TensorT, class LayerT, class ActT>
+bool depthwiseConvFloat32(const float* inputData, const nnfw::rt::Shape& inputShape,
+                          const float* filterData, const nnfw::rt::Shape& filterShape,
+                          const float* biasData, const nnfw::rt::Shape& biasShape,
+                          int32_t padding_left, int32_t padding_right,
+                          int32_t padding_top, int32_t padding_bottom,
+                          int32_t stride_width, int32_t stride_height,
+                          int32_t depth_multiplier, int32_t activation,
+                          float* outputData, const nnfw::rt::Shape& outputShape,
+                          sync_scheduler_f sync_scheduler) {
+  const auto in_shape = util::fromNNShape(inputShape);
+  const auto kernel_shape = util::fromNNShape(filterShape);
+  const auto bias_shape = util::fromNNShape(biasShape);
+  const auto out_shape = util::fromNNShape(outputShape);
+
+  TensorT input(arm_compute::TensorInfo(in_shape, arm_compute::Format::F32));
+  TensorT weights(arm_compute::TensorInfo(kernel_shape, arm_compute::Format::F32));
+  TensorT bias(arm_compute::TensorInfo(bias_shape, arm_compute::Format::F32));
+  TensorT output(arm_compute::TensorInfo(out_shape, arm_compute::Format::F32));
+
+  // Arguments: strides, then left/right/top/bottom padding, then FLOOR rounding.
+  arm_compute::PadStrideInfo conv_info(stride_width, stride_height,
+                                       padding_left, padding_right,
+                                       padding_top, padding_bottom,
+                                       arm_compute::DimensionRoundingType::FLOOR);
+
+  // The convolution is queued first; insertFusedActivationLayer gets the same queue.
+  std::vector<std::shared_ptr<arm_compute::IFunction>> pipeline;
+  auto conv = std::make_shared<LayerT>();
+  conv->configure(input.ptr(), weights.ptr(), bias.ptr(), output.ptr(), conv_info);
+  pipeline.push_back(conv);
+  util::insertFusedActivationLayer<TensorT, ActT>(output, activation, pipeline);
+
+  input.allocate();
+  output.allocate();
+  bias.allocate();
+  weights.allocate();
+
+  // TODO: Do we need 2D tensor accessor for the input feature?
+  TensorAccess<InputAccessor>(input.ref(), inputData, inputShape);
+  TensorAccess<BiasAccessor>(bias.ref(), biasData, biasShape);
+  TensorAccess<WeightAccessor>(weights.ref(), filterData, filterShape);
+
+  for (auto it = pipeline.begin(); it != pipeline.end(); ++it)
+    (*it)->run();
+  sync_scheduler();
+
+  TensorAccess<OutputAccessor>(output.ref(), outputData, outputShape);
+
+  return true;
+}
+
+} // namespace common
+
+} // namespace acl
+} // namespace kernel
+} // namespace nnfw
+
+#endif // __NNFW_KERNEL_ACL_DEPTHWISECONV2D_COMMON_H__
diff --git a/libs/kernel/acl/src/DepthwiseConv2D.test.h b/libs/kernel/acl/src/DepthwiseConv2D.test.h
new file mode 100644
index 000000000..b2c8592ee
--- /dev/null
+++ b/libs/kernel/acl/src/DepthwiseConv2D.test.h
@@ -0,0 +1,245 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <gtest/gtest.h>
+#include <OperationsUtils.h>
+#include <kernel/acl/nnfw_kernel_acl.h>
+#include <kernel/acl/DepthwiseConv2D.h>
+
+// TODO: fix include path in CMakeFiles
+#include "util.h"
+
+#ifndef ACL_TEST
+#error "ACL_TEST should be defined first!"
+#endif // ACL_TEST
+
+#ifndef ACL_CORE_FUNC_NAME
+#error "ACL_CORE_FUNC_NAME should be defined first!"
+#endif // ACL_CORE_FUNC_NAME
+
+using namespace nnfw::kernel::acl;
+
+// 1x3x3x1 input and 1x3x3x1 filter are both filled with their NHWC linear
+// offsets (0..8); with no padding and stride 1 the output is 0^2+...+8^2 = 204.
+ACL_TEST(KernelACL_TC, dwise_conv2d_1) {
+  uint32_t input_n = 1;
+  uint32_t input_h = 3;
+  uint32_t input_w = 3;
+  uint32_t input_c = 1;
+  uint32_t filter_h = 3;
+  uint32_t filter_w = 3;
+  uint32_t filter_c = 1;
+  uint32_t out_h = 1;
+  uint32_t out_w = 1;
+
+  int32_t padding_left = 0;
+  int32_t padding_right = 0;
+  int32_t padding_top = 0;
+  int32_t padding_bottom = 0;
+  int32_t stride_width = 1;
+  int32_t stride_height = 1;
+  int32_t depth_multiplier = 1;
+
+  util::TensorWrapper input({input_n, input_h, input_w, input_c});
+  util::TensorWrapper weights({1, filter_h, filter_w, filter_c});
+  util::TensorWrapper bias({filter_c});
+  util::TensorWrapper output({1, out_h, out_w, filter_c});
+
+  int32_t activation = static_cast<int32_t>(FusedActivationFunc::RELU);
+
+  input.initValue([&](uint32_t n, uint32_t c, uint32_t h, uint32_t w) {
+    uint32_t H = input_h;
+    uint32_t W = input_w;
+    uint32_t C = input_c;
+
+    return n*H*W*C + h*W*C + w*C + c;
+  });
+  weights.initValue([&](uint32_t n, uint32_t c, uint32_t h, uint32_t w) {
+    uint32_t H = filter_h;
+    uint32_t W = filter_w;
+    uint32_t C = filter_c;
+
+    return n*H*W*C + h*W*C + w*C + c;
+  });
+  bias.initValue([](uint32_t w) {
+    return 0.f;
+  });
+  output.initValue([](uint32_t n, uint32_t c, uint32_t h, uint32_t w) {
+    return 0.f;
+  });
+
+  bool bret = ACL_CORE_FUNC_NAME(input.ptr<float>(), input.shape(),
+      weights.ptr<float>(), weights.shape(),
+      bias.ptr<float>(), bias.shape(),
+      padding_left, padding_right,
+      padding_top, padding_bottom,
+      stride_width, stride_height,
+      depth_multiplier, activation,
+      output.ptr<float>(), output.shape());
+
+  EXPECT_EQ(bret, true);
+
+  util::TensorWrapper expected({1, out_h, out_w, filter_c});
+  expected.initValue([&](uint32_t n, uint32_t c, uint32_t h, uint32_t w) {
+    return 204.f;
+  });
+
+  EXPECT_EQ(output, expected);
+}
+
+// Three-channel variant: input and filter hold their NHWC linear offsets, so
+// channel c is expected to yield the sum over p = 0..8 of (3p + c)^2.
+ACL_TEST(KernelACL_TC, dwise_conv2d_multi_channel) {
+  uint32_t input_n = 1;
+  uint32_t input_h = 3;
+  uint32_t input_w = 3;
+  uint32_t input_c = 3;
+  uint32_t filter_h = 3;
+  uint32_t filter_w = 3;
+  uint32_t filter_c = input_c;
+  uint32_t out_h = 1;
+  uint32_t out_w = 1;
+
+  int32_t padding_left = 0;
+  int32_t padding_right = 0;
+  int32_t padding_top = 0;
+  int32_t padding_bottom = 0;
+  int32_t stride_width = 1;
+  int32_t stride_height = 1;
+  int32_t depth_multiplier = 1;
+
+  util::TensorWrapper input({input_n, input_h, input_w, input_c});
+  util::TensorWrapper weights({1, filter_h, filter_w, filter_c});
+  util::TensorWrapper bias({filter_c});
+  util::TensorWrapper output({1, out_h, out_w, filter_c});
+
+  int32_t activation = static_cast<int32_t>(FusedActivationFunc::RELU);
+
+  input.initValue([&](uint32_t n, uint32_t c, uint32_t h, uint32_t w) {
+    uint32_t H = input_h;
+    uint32_t W = input_w;
+    uint32_t C = input_c;
+
+    return n*H*W*C + h*W*C + w*C + c;
+  });
+  weights.initValue([&](uint32_t n, uint32_t c, uint32_t h, uint32_t w) {
+    uint32_t H = filter_h;
+    uint32_t W = filter_w;
+    uint32_t C = filter_c;
+
+    return n*H*W*C + h*W*C + w*C + c;
+  });
+  bias.initValue([](uint32_t w) {
+    return 0.f;
+  });
+  output.initValue([](uint32_t n, uint32_t c, uint32_t h, uint32_t w) {
+    return 0.f;
+  });
+
+  bool bret = ACL_CORE_FUNC_NAME(input.ptr<float>(), input.shape(),
+      weights.ptr<float>(), weights.shape(),
+      bias.ptr<float>(), bias.shape(),
+      padding_left, padding_right,
+      padding_top, padding_bottom,
+      stride_width, stride_height,
+      depth_multiplier, activation,
+      output.ptr<float>(), output.shape());
+
+  EXPECT_EQ(bret, true);
+
+  util::TensorWrapper expected({1, out_h, out_w, filter_c});
+  expected.initValue({
+    1836.f,
+    2061.f,
+    2304.f
+  });
+
+  EXPECT_EQ(output, expected);
+}
+
+// 112x112x32 input and a 3x3x32 filter whose init lambdas both return their
+// `c` argument; padding 1, stride 1, RELU6. Every output element is expected
+// to be min(c*c*taps, 6), where taps counts the filter cells over real input.
+ACL_TEST(KernelACL_TC, dwise_conv2d_inception_1) {
+  uint32_t input_n = 1;
+  uint32_t input_h = 112;
+  uint32_t input_w = 112;
+  uint32_t input_c = 32;
+  uint32_t filter_h = 3;
+  uint32_t filter_w = 3;
+  uint32_t filter_c = input_c;
+  uint32_t out_h = 112;
+  uint32_t out_w = 112;
+
+  int32_t padding_left = 1;
+  int32_t padding_right = 1;
+  int32_t padding_top = 1;
+  int32_t padding_bottom = 1;
+  int32_t stride_width = 1;
+  int32_t stride_height = 1;
+  int32_t depth_multiplier = 1;
+
+  util::TensorWrapper input({input_n, input_h, input_w, input_c});
+  util::TensorWrapper weights({1, filter_h, filter_w, filter_c});
+  util::TensorWrapper bias({filter_c});
+  util::TensorWrapper output({1, out_h, out_w, filter_c});
+
+  int32_t activation = static_cast<int32_t>(FusedActivationFunc::RELU6);
+
+  input.initValue([](uint32_t n, uint32_t c, uint32_t h, uint32_t w) {
+    return c;
+  });
+  weights.initValue([](uint32_t n, uint32_t c, uint32_t h, uint32_t w) {
+    return c;
+  });
+  bias.initValue([](uint32_t w) {
+    return 0.f;
+  });
+  output.initValue([](uint32_t n, uint32_t c, uint32_t h, uint32_t w) {
+    return 0.f;
+  });
+
+  bool bret = ACL_CORE_FUNC_NAME(input.ptr<float>(), input.shape(),
+      weights.ptr<float>(), weights.shape(),
+      bias.ptr<float>(), bias.shape(),
+      padding_left, padding_right,
+      padding_top, padding_bottom,
+      stride_width, stride_height,
+      depth_multiplier, activation,
+      output.ptr<float>(), output.shape());
+
+  EXPECT_EQ(bret, true);
+
+  util::TensorWrapper expected({1, out_h, out_w, filter_c});
+  expected.initValue([&](uint32_t n, uint32_t c, uint32_t h, uint32_t w) {
+    // Count the 3x3 filter taps that fall on real (non-padding) input:
+    // 3 along an axis in the interior, 2 on the first/last row or column.
+    const bool edge_row = (h == 0 || h == out_h-1);
+    const bool edge_col = (w == 0 || w == out_w-1);
+    const float rows = edge_row ? 2.f : 3.f;
+    const float cols = edge_col ? 2.f : 3.f;
+
+    // 9 in the interior, 6 on an edge, 4 in a corner.
+    float v = rows * cols;
+
+    // Assumption: negative numbers cannot appear because
+    // only positive numbers exist in the input and weights.
+    float ret = c*c*v;
+    return std::min(ret, 6.f);
+  });
+
+  EXPECT_EQ(output, expected);
+}
diff --git a/libs/kernel/acl/src/FullyConnected.h b/libs/kernel/acl/src/FullyConnected.h
new file mode 100644
index 000000000..5030a8548
--- /dev/null
+++ b/libs/kernel/acl/src/FullyConnected.h
@@ -0,0 +1,149 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_KERNEL_ACL_FULLYCONNECTED_COMMON_H__
+#define __NNFW_KERNEL_ACL_FULLYCONNECTED_COMMON_H__
+
+#include <OperationsUtils.h>
+#include <arm_compute/core/TensorShape.h>
+#include <arm_compute/core/TensorInfo.h>
+#include <arm_compute/runtime/IFunction.h>
+
+#include "shape.h"
+#include "IO_accessor.h"
+
+namespace nnfw {
+namespace kernel {
+namespace acl {
+
+namespace common {
+
+typedef std::function<void (void)> sync_scheduler_f;
+
+// Runs a float32 fully-connected layer through ACL and copies the result out.
+//
+// TensorT must provide ptr(), ref() and allocate(). LayerT is configured with
+// (input, weights, bias, output). ActT is configured in place on the output
+// tensor when activation is ANEURALNETWORKS_FUSED_RELU.
+// Accepted arguments:
+//   input       rank 2, or rank 4 [N,H,W,C] which is flattened to [N, H*W*C]
+//   weights     rank 2
+//   bias        rank 1
+//   output      rank 2
+//   activation  ANEURALNETWORKS_FUSED_NONE or ANEURALNETWORKS_FUSED_RELU
+// sync_scheduler is invoked once, after every function has run and before the
+// result is copied into outputData.
+//
+// Returns true on success. Arguments outside the list above return false
+// before any tensor is created (most of them also trip an assert in debug
+// builds), so builds without asserts do not run on unsupported arguments.
+template<class TensorT, class LayerT, class ActT>
+bool fullyConnectedFloat32(const float* inputData, const nnfw::rt::Shape& inputShape,
+                           const float* weightsData, const nnfw::rt::Shape& weightsShape,
+                           const float* biasData, const nnfw::rt::Shape& biasShape,
+                           int32_t activation,
+                           float* outputData, const nnfw::rt::Shape& outputShape,
+                           sync_scheduler_f sync_scheduler) {
+  // NNAPI specification: https://developer.android.com/ndk/reference/group___neural_networks.html#ggaabbe492c60331b13038e39d4207940e0aaada7a3dbaf4676aba560c933ff610c5
+
+  // According to the NNAPI Specification,
+  // INPUT
+  // 1. input rank is up to 4.
+  // 2. if input rank > 2, it is flattened to rank 2 [batch_size, input_size]
+  nnfw::rt::Shape flattenedInputShape = inputShape;
+  const auto inputRank = inputShape.dimensions.size();
+  if (inputRank == 4)
+  {
+    const auto N = inputShape.dimensions[0];
+    const auto H = inputShape.dimensions[1];
+    const auto W = inputShape.dimensions[2];
+    const auto C = inputShape.dimensions[3];
+    flattenedInputShape.dimensions = {N, H*W*C};
+  }
+  else if (inputRank != 2)
+  {
+    // Rank 1 and rank 3 are not implemented; any other rank is unsupported.
+    assert("Need to be implemented." && inputRank != 1 && inputRank != 3);
+    assert(inputRank <= 4);
+    return false;
+  }
+  // Finally, flattenedInputShape is a 2D tensor.
+
+  // WEIGHTS is a 2D tensor, BIAS is a 1D tensor, OUTPUT is a 2D tensor.
+  const bool ranksSupported = weightsShape.dimensions.size() == 2 &&
+                              biasShape.dimensions.size() == 1 &&
+                              outputShape.dimensions.size() == 2;
+  assert(ranksSupported);
+  if (!ranksSupported) return false;
+
+  // Only "no activation" and RELU can be fused by this function.
+  const bool fuseRelu = (ANEURALNETWORKS_FUSED_RELU == activation);
+  assert(activation == ANEURALNETWORKS_FUSED_NONE || fuseRelu);
+  if (activation != ANEURALNETWORKS_FUSED_NONE && !fuseRelu) return false;
+
+  auto input_shape = util::fromNNShape(flattenedInputShape);
+  auto weights_shape = util::fromNNShape(weightsShape);
+  auto bias_shape = util::fromNNShape(biasShape);
+  auto output_shape = util::fromNNShape(outputShape);
+
+  std::vector<std::shared_ptr<arm_compute::IFunction>> fns;
+
+  TensorT input(arm_compute::TensorInfo(input_shape, arm_compute::Format::F32));
+  TensorT output(arm_compute::TensorInfo(output_shape, arm_compute::Format::F32));
+  TensorT bias(arm_compute::TensorInfo(bias_shape, arm_compute::Format::F32));
+  TensorT weights(arm_compute::TensorInfo(weights_shape, arm_compute::Format::F32));
+
+  auto fc = std::make_shared<LayerT>();
+  fc->configure(input.ptr(), weights.ptr(), bias.ptr(), output.ptr());
+  fns.emplace_back(fc);
+
+  if (fuseRelu)
+  {
+    const arm_compute::ActivationLayerInfo relu_info{arm_compute::ActivationLayerInfo::ActivationFunction::RELU};
+    // Do in-place update
+    auto relu_f = std::make_shared<ActT>();
+    relu_f->configure(output.ptr(), nullptr, relu_info);
+    fns.emplace_back(relu_f);
+  }
+
+  input.allocate();
+  output.allocate();
+  bias.allocate();
+  weights.allocate();
+
+  // TODO: Do we need 2D tensor accessor for the input feature?
+  TensorAccess<MatrixWeightAccessor>(input.ref(), inputData, inputShape);
+  TensorAccess<BiasAccessor>(bias.ref(), biasData, biasShape);
+  TensorAccess<MatrixWeightAccessor>(weights.ref(), weightsData, weightsShape);
+
+  for (const auto &fn : fns)
+  {
+    fn->run();
+  }
+
+  sync_scheduler();
+
+  TensorAccess<MatrixOutputAccessor>(output.ref(), outputData, outputShape);
+  return true;
+}
+
+} // namespace common
+
+} // namespace acl
+} // namespace kernel
+} // namespace nnfw
+
+#endif // __NNFW_KERNEL_ACL_FULLYCONNECTED_COMMON_H__
diff --git a/libs/kernel/acl/src/FullyConnected.test.h b/libs/kernel/acl/src/FullyConnected.test.h
new file mode 100644
index 000000000..01bbff802
--- /dev/null
+++ b/libs/kernel/acl/src/FullyConnected.test.h
@@ -0,0 +1,266 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <gtest/gtest.h>
+#include <OperationsUtils.h>
+#include <kernel/acl/nnfw_kernel_acl.h>
+#include <kernel/acl/FullyConnected.h>
+
+// TODO: fix include path in CMakeFiles
+#include "util.h"
+
+#ifndef ACL_TEST
+#error "ACL_TEST should be defined first!"
+#endif // ACL_TEST
+
+#ifndef ACL_CORE_FUNC_NAME
+#error "ACL_CORE_FUNC_NAME should be defined first!"
+#endif // ACL_CORE_FUNC_NAME
+
+using namespace nnfw::kernel::acl;
+using fullyConnectedFloat32T = bool (*)(const float* inputData, const nnfw::rt::Shape& inputShape,
+                           const float* weightsData, const nnfw::rt::Shape& weightsShape,
+                           const float* biasData, const nnfw::rt::Shape& biasShape,
+                           int32_t activation,
+                           float* outputData, const nnfw::rt::Shape& outputShape);
+
+// All-ones 1x1x1x100 input against all-ones 1x100 weights with zero bias:
+// the single output element is expected to be 100.
+ACL_TEST(KernelACL_TC, fcFloat32_1) {
+  int32_t activation = static_cast<int32_t>(FusedActivationFunc::RELU);
+
+  util::TensorWrapper input({1,1,1,100});
+  util::TensorWrapper weights({1,100});
+  util::TensorWrapper bias({1});
+  util::TensorWrapper output({1,1});
+  util::TensorWrapper expected({1,1});
+
+  input.initValue([](uint32_t n, uint32_t c, uint32_t h, uint32_t w) {
+    return 1.f;
+  });
+  weights.initValue([](uint32_t h, uint32_t w) {
+    return 1.f;
+  });
+  bias.initValue([](uint32_t w) {
+    return 0.f;
+  });
+  output.initValue([](uint32_t h, uint32_t w) {
+    return 0.f;
+  });
+  expected.initValue([](uint32_t h, uint32_t w) {
+    return 100.f;
+  });
+
+  bool bret = ACL_CORE_FUNC_NAME(input.ptr<float>(), input.shape(),
+      weights.ptr<float>(), weights.shape(),
+      bias.ptr<float>(), bias.shape(),
+      activation,
+      output.ptr<float>(), output.shape());
+
+  EXPECT_EQ(bret, true);
+
+  EXPECT_EQ(output, expected);
+}
+
+// All-ones 1x1x1x100 input against all -1 weights with zero bias and fused
+// RELU: the single output element is expected to be 0.
+ACL_TEST(KernelACL_TC, fcFloat32_relu) {
+  int32_t activation = static_cast<int32_t>(FusedActivationFunc::RELU);
+
+  util::TensorWrapper input({1,1,1,100});
+  util::TensorWrapper weights({1,100});
+  util::TensorWrapper bias({1});
+  util::TensorWrapper output({1,1});
+  util::TensorWrapper expected({1,1});
+
+  input.initValue([](uint32_t n, uint32_t c, uint32_t h, uint32_t w) {
+    return 1.f;
+  });
+  weights.initValue([](uint32_t h, uint32_t w) {
+    return -1.f;
+  });
+  bias.initValue([](uint32_t w) {
+    return 0.f;
+  });
+  output.initValue([](uint32_t h, uint32_t w) {
+    return 0.f;
+  });
+  expected.initValue([](uint32_t h, uint32_t w) {
+    return 0.f;
+  });
+
+  bool bret = ACL_CORE_FUNC_NAME(input.ptr<float>(), input.shape(),
+      weights.ptr<float>(), weights.shape(),
+      bias.ptr<float>(), bias.shape(),
+      activation,
+      output.ptr<float>(), output.shape());
+
+  EXPECT_EQ(bret, true);
+
+  EXPECT_EQ(output, expected);
+}
+
+// 1x4x4x5 input holding its NHWC linear offsets i = 0..79, 6x80 weights with
+// weight[h][w] = 80*h + w: output h is expected to be the sum of i*(80*h + i).
+ACL_TEST(KernelACL_TC, fcFloat32_conv_fc) {
+  uint32_t input_n = 1;
+  uint32_t input_c = 5;
+  uint32_t input_h = 4;
+  uint32_t input_w = 4;
+  uint32_t weight_n = 6;
+
+  int32_t activation = static_cast<int32_t>(FusedActivationFunc::RELU);
+
+  util::TensorWrapper input({input_n, input_h, input_w, input_c});
+  util::TensorWrapper weight({weight_n, input_c*input_h*input_w});
+  util::TensorWrapper bias({weight_n});
+  util::TensorWrapper output({1, weight_n});
+
+  input.initValue([&](uint32_t n, uint32_t c, uint32_t h, uint32_t w) {
+    uint32_t H = input_h;
+    uint32_t W = input_w;
+    uint32_t C = input_c;
+
+    return n*H*W*C + h*W*C + w*C + c;
+  });
+
+  weight.initValue([&](uint32_t h, uint32_t w) {
+    uint32_t W = input_c*input_h*input_w;
+
+    return h*W + w;
+  });
+
+  bias.initValue([](uint32_t w) {
+    return 0.f;
+  });
+
+  output.initValue([](uint32_t h, uint32_t w) {
+    return 0.f;
+  });
+
+  bool bret = ACL_CORE_FUNC_NAME(input.ptr<float>(), input.shape(),
+      weight.ptr<float>(), weight.shape(),
+      bias.ptr<float>(), bias.shape(),
+      activation,
+      output.ptr<float>(), output.shape());
+
+  EXPECT_EQ(bret, true);
+
+  util::TensorWrapper expected({1, weight_n});
+  expected.initValue({
+      167480.f,
+      420280.f,
+      673080.f,
+      925880.f,
+      1178680.f,
+      1431480.f});
+
+  EXPECT_EQ(output, expected);
+}
+
+// 1x6 input holding w = 0..5 against 6x6 weights with weight[h][w] = 6*h + w.
+ACL_TEST(KernelACL_TC, fcFloat32_fc_fc) {
+  uint32_t input_n = 6;
+  uint32_t weight_n = 6;
+
+  int32_t activation = static_cast<int32_t>(FusedActivationFunc::RELU);
+
+  util::TensorWrapper input({1, input_n});
+  util::TensorWrapper weight({weight_n, input_n});
+  util::TensorWrapper bias({weight_n});
+  util::TensorWrapper output({1, weight_n});
+
+  input.initValue([&](uint32_t h, uint32_t w) {
+    // not use h because h = 0.
+    return (float)w;
+  });
+
+  weight.initValue([&](uint32_t h, uint32_t w) {
+    uint32_t W = input_n;
+
+    return (float)(h*W + w);
+  });
+
+  bias.initValue([](uint32_t w) {
+    return 0.f;
+  });
+
+  output.initValue([](uint32_t h, uint32_t w) {
+    return 0.f;
+  });
+
+  bool bret = ACL_CORE_FUNC_NAME(input.ptr<float>(), input.shape(),
+      weight.ptr<float>(), weight.shape(),
+      bias.ptr<float>(), bias.shape(),
+      activation,
+      output.ptr<float>(), output.shape());
+
+  EXPECT_EQ(bret, true);
+
+  util::TensorWrapper expected({1, weight_n});
+  expected.initValue({
+    55.f,
+    145.f,
+    235.f,
+    325.f,
+    415.f,
+    505.f,
+  });
+
+  EXPECT_EQ(output, expected);
+}
+
+// 2048 ones as input; the 1008x2048 weight lambda returns its first index, so
+// output element i is expected to be i * 2048.
+ACL_TEST(KernelACL_TC, fcFloat32_inceptionv3) {
+  uint32_t input_c = 2048;
+  uint32_t weight_n = 1008;
+
+  int32_t activation = static_cast<int32_t>(FusedActivationFunc::RELU);
+
+  util::TensorWrapper input({1,1,1,input_c});
+  util::TensorWrapper weight({weight_n,input_c});
+  util::TensorWrapper bias({weight_n});
+  util::TensorWrapper output({1, weight_n});
+  util::TensorWrapper expected({1, weight_n});
+
+  input.initValue([&](uint32_t n, uint32_t c, uint32_t h, uint32_t w) {
+    return 1.f;
+  });
+  weight.initValue([&](uint32_t h, uint32_t w) {
+    return (float)h;
+  });
+  bias.initValue([](uint32_t w) {
+    return 0.f;
+  });
+  output.initValue([](uint32_t h, uint32_t w) {
+    return 0.f;
+  });
+  expected.initValue([&](uint32_t h, uint32_t w) {
+    return w*input_c;
+  });
+
+  bool bret = ACL_CORE_FUNC_NAME(input.ptr<float>(), input.shape(),
+      weight.ptr<float>(), weight.shape(),
+      bias.ptr<float>(), bias.shape(),
+      activation,
+      output.ptr<float>(), output.shape());
+
+  EXPECT_EQ(bret, true);
+
+  EXPECT_EQ(output, expected);
+}
+
diff --git a/libs/kernel/acl/src/IO_accessor.cpp b/libs/kernel/acl/src/IO_accessor.cpp
new file mode 100644
index 000000000..410fb8ea5
--- /dev/null
+++ b/libs/kernel/acl/src/IO_accessor.cpp
@@ -0,0 +1,310 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "IO_accessor.h"
+
+#include <cassert>
+
+namespace nnfw {
+namespace kernel {
+namespace acl {
+// Constructors: each accessor only stores the caller's data pointer and binds its Shape reference member; nothing is copied.
+InputAccessor::InputAccessor(const float* inputData, const nnfw::rt::Shape& inputShape)
+  : _inputData(inputData)
+  , _inputShape(inputShape)
+{
+}
+
+MatrixInputAccessor::MatrixInputAccessor(const float* inputData, const nnfw::rt::Shape& inputShape)
+  : _inputData(inputData)
+  , _inputShape(inputShape)
+{
+}
+
+VectorInputAccessor::VectorInputAccessor(const float* inputData, const nnfw::rt::Shape& inputShape)
+  : _inputData(inputData)
+  , _inputShape(inputShape)
+{
+}
+
+WeightAccessor::WeightAccessor(const float* filterData, const nnfw::rt::Shape& filterShape)
+  : _filterData(filterData)
+  , _filterShape(filterShape)
+{
+}
+
+MatrixWeightAccessor::MatrixWeightAccessor(const float* filterData, const nnfw::rt::Shape& filterShape)
+  : _filterData(filterData)
+  , _filterShape(filterShape)
+{
+}
+
+BiasAccessor::BiasAccessor(const float* biasData, const nnfw::rt::Shape& biasShape)
+  : _biasData(biasData)
+  , _biasShape(biasShape)
+{
+}
+
+OutputAccessor::OutputAccessor(float* outputData, const nnfw::rt::Shape& outputShape)
+  : _outputData(outputData)
+  , _outputShape(outputShape)
+{
+}
+
+MatrixOutputAccessor::MatrixOutputAccessor(float* outputData, const nnfw::rt::Shape& outputShape)
+  : _outputData(outputData)
+  , _outputShape(outputShape)
+{
+}
+
+VectorOutputAccessor::VectorOutputAccessor(float* outputData, const nnfw::rt::Shape& outputShape)
+  : _outputData(outputData)
+  , _outputShape(outputShape)
+{
+}
+
+static uint32_t getOffsetNCHW(const nnfw::rt::Shape& shape, const arm_compute::Coordinates& id)
+{
+  // Offset of ACL coordinate `id` in a buffer whose extents, fastest first, are the
+  // W, H and C sizes of the NNAPI (NHWC) `shape`; absent coordinates count as 0.
+  const uint32_t height = getSizeOfDimension(shape, 1);
+  const uint32_t width  = getSizeOfDimension(shape, 2);
+  const uint32_t chann  = getSizeOfDimension(shape, 3);
+  const uint32_t numdim = id.num_dimensions();
+  uint32_t stride = 1;
+  uint32_t offset = 0;
+  offset += numdim > 0 ? id[0] * stride : 0; stride *= width;
+  offset += numdim > 1 ? id[1] * stride : 0; stride *= height;
+  offset += numdim > 2 ? id[2] * stride : 0; stride *= chann;
+  offset += numdim > 3 ? id[3] * stride : 0; // last stride: batch size is never needed
+  return offset;
+}
+
+static uint32_t getElementOffset(const nnfw::rt::Shape& shape,
+                                 uint32_t ch, uint32_t row, uint32_t col)
+{
+  // Index of element (row, col, ch) in a single-batch NHWC buffer of `shape`.
+  // The rank is checked first so the batch-size check reads a valid dimension.
+  assert(shape.dimensions.size() == 4);
+  assert(getSizeOfDimension(shape, 0) == 1);
+
+  // TODO Optimize this!
+  const uint32_t W = getSizeOfDimension(shape, 2);
+  const uint32_t C = getSizeOfDimension(shape, 3);
+
+  // NNAPI uses NHWC ordering. The sum stays unsigned end to end: the previous
+  // `int` accumulator mixed signedness with the uint32_t operands and result.
+  uint32_t offset = row * W * C;
+  offset += col * C;
+  offset += ch;
+  return offset;
+}
+
+static uint32_t getElementOffset(const nnfw::rt::Shape& shape,
+                                 uint32_t nth, uint32_t ch, uint32_t row, uint32_t col)
+{
+  // Index of element (nth, row, col, ch) in an NHWC buffer described by `shape`.
+  assert(shape.dimensions.size() == 4);
+
+  // TODO Optimize this!
+  const uint32_t H = getSizeOfDimension(shape, 1);
+  const uint32_t W = getSizeOfDimension(shape, 2);
+  const uint32_t C = getSizeOfDimension(shape, 3);
+
+  // NNAPI uses NHWC ordering. The sum stays unsigned end to end: the previous
+  // `int` accumulator mixed signedness with the uint32_t operands and result.
+  uint32_t offset = nth * H * W * C;
+  offset += row * W * C;
+  offset += col * C;
+  offset += ch;
+
+  return offset;
+}
+
+bool InputAccessor::access_tensor(arm_compute::ITensor &tensor)
+{
+  // Fill `tensor` from the caller's NHWC buffer, one element per tensor coordinate.
+  arm_compute::Window whole;
+  whole.use_tensor_dimensions(tensor.info()->tensor_shape());
+
+  execute_window_loop(whole, [&](const arm_compute::Coordinates& pos)
+  {
+    // ACL coordinate order is (col, row, channel); the source index is NHWC.
+    const uint32_t col = pos[0];
+    const uint32_t row = pos[1];
+    const uint32_t ch = pos[2];
+    const uint32_t src = getElementOffset(_inputShape, ch, row, col);
+    auto *dst = reinterpret_cast<float *>(tensor.ptr_to_element(pos));
+    *dst = _inputData[src];
+  });
+  return true;
+}
+
+bool MatrixInputAccessor::access_tensor(arm_compute::ITensor &tensor)
+{
+  // Fill a (at most 2-D) `tensor` from the caller's dense row-major buffer.
+  const auto &dims = tensor.info()->tensor_shape();
+  assert(dims.num_dimensions() <= 2);
+
+  arm_compute::Window whole;
+  whole.use_tensor_dimensions(dims);
+
+  execute_window_loop(whole, [&](const arm_compute::Coordinates& pos)
+  {
+    // Rows are `width` elements apart; width is the tensor's x extent.
+    const auto width = tensor.info()->tensor_shape().x();
+    const auto src = pos[1] * width + pos[0];
+
+    auto *dst = reinterpret_cast<float *>(tensor.ptr_to_element(pos));
+    *dst = *(_inputData + src);
+  });
+  return true;
+}
+
+bool VectorInputAccessor::access_tensor(arm_compute::ITensor &tensor)
+{
+  // Fill a 1-D `tensor` element by element from the caller's buffer.
+  const auto &dims = tensor.info()->tensor_shape();
+  assert(dims.num_dimensions() == 1);
+
+  arm_compute::Window whole;
+  whole.use_tensor_dimensions(dims);
+
+  execute_window_loop(whole, [&](const arm_compute::Coordinates& pos)
+  {
+    const uint32_t src = pos[0]; // the x coordinate is the linear index
+    *reinterpret_cast<float *>(tensor.ptr_to_element(pos)) = _inputData[src];
+  });
+  return true;
+}
+
+bool WeightAccessor::access_tensor(arm_compute::ITensor &tensor)
+{
+  // Fill the filter `tensor` from the caller's NHWC filter buffer.
+  arm_compute::Window whole;
+  whole.use_tensor_dimensions(tensor.info()->tensor_shape());
+  execute_window_loop(whole, [&](const arm_compute::Coordinates& pos)
+  {
+    // ACL coordinate order is (col, row, channel, nth).
+    const uint32_t col = pos[0];
+    const uint32_t row = pos[1];
+    const uint32_t ch = pos[2];
+    const uint32_t nth = pos[3];
+    const uint32_t src = getElementOffset(_filterShape, nth, ch, row, col);
+
+    auto *dst = reinterpret_cast<float *>(tensor.ptr_to_element(pos));
+    *dst = _filterData[src];
+  });
+  return true;
+}
+
+bool MatrixWeightAccessor::access_tensor(arm_compute::ITensor &tensor)
+{
+  // Fill a (at most 2-D) weight `tensor` from the caller's dense row-major buffer.
+  const auto &dims = tensor.info()->tensor_shape();
+  assert(dims.num_dimensions() <= 2);
+
+  arm_compute::Window whole;
+  whole.use_tensor_dimensions(dims);
+
+  execute_window_loop(whole, [&](const arm_compute::Coordinates& pos)
+  {
+    // Rows are `width` elements apart; width is the tensor's x extent.
+    const auto width = tensor.info()->tensor_shape().x();
+    const uint32_t src = pos[1] * width + pos[0];
+
+    auto *dst = reinterpret_cast<float *>(tensor.ptr_to_element(pos));
+    *dst = *(_filterData + src);
+  });
+  return true;
+}
+
+bool BiasAccessor::access_tensor(arm_compute::ITensor &tensor)
+{
+  // Fill the bias `tensor`; source offsets come from getOffsetNCHW().
+  arm_compute::Window whole;
+  whole.use_tensor_dimensions(tensor.info()->tensor_shape());
+  execute_window_loop(whole, [&](const arm_compute::Coordinates& pos)
+  {
+    const uint32_t src = getOffsetNCHW(_biasShape, pos);
+    auto *dst = reinterpret_cast<float *>(tensor.ptr_to_element(pos));
+    *dst = _biasData[src];
+  });
+  return true;
+}
+
+bool OutputAccessor::access_tensor(arm_compute::ITensor &tensor)
+{
+  // Copy every element of `tensor` out to the caller's NHWC buffer.
+  arm_compute::Window whole;
+  whole.use_tensor_dimensions(tensor.info()->tensor_shape());
+
+  execute_window_loop(whole, [&](const arm_compute::Coordinates& pos)
+  {
+    // ACL coordinate order is (col, row, channel); the destination index is NHWC.
+    const uint32_t col = pos[0];
+    const uint32_t row = pos[1];
+    const uint32_t ch = pos[2];
+    const uint32_t dst = getElementOffset(_outputShape, ch, row, col);
+
+    _outputData[dst] = *reinterpret_cast<float *>(tensor.ptr_to_element(pos));
+  });
+  return false; // end the network
+}
+
+bool VectorOutputAccessor::access_tensor(arm_compute::ITensor &tensor)
+{
+  // Copy a 1-D `tensor` element by element out to the caller's buffer.
+  const auto &dims = tensor.info()->tensor_shape();
+  assert(dims.num_dimensions() == 1);
+
+  arm_compute::Window whole;
+  whole.use_tensor_dimensions(dims);
+
+  execute_window_loop(whole, [&](const arm_compute::Coordinates& pos)
+  {
+    // The x coordinate doubles as the linear destination index.
+    const uint32_t dst = pos[0];
+
+    _outputData[dst] = *reinterpret_cast<float *>(tensor.ptr_to_element(pos));
+  });
+  return false; // end the network
+}
+
+bool MatrixOutputAccessor::access_tensor(arm_compute::ITensor &tensor)
+{
+  // Copy a (at most 2-D) `tensor` out to the caller's dense row-major buffer.
+  const auto &dims = tensor.info()->tensor_shape();
+  assert(dims.num_dimensions() <= 2);
+
+  arm_compute::Window whole;
+  whole.use_tensor_dimensions(dims);
+
+  execute_window_loop(whole, [&](const arm_compute::Coordinates& pos)
+  {
+    // Rows are `width` elements apart; width is the tensor's x extent.
+    const auto width = tensor.info()->tensor_shape().x();
+    const auto dst = pos[1] * width + pos[0];
+
+    const auto *src = reinterpret_cast<float *>(tensor.ptr_to_element(pos));
+    *(_outputData + dst) = *src;
+  });
+  return false; // end the network
+}
+
+} // namespace acl
+} // namespace kernel
+} // namespace nnfw
diff --git a/libs/kernel/acl/src/IO_accessor.h b/libs/kernel/acl/src/IO_accessor.h
new file mode 100644
index 000000000..e7670f15c
--- /dev/null
+++ b/libs/kernel/acl/src/IO_accessor.h
@@ -0,0 +1,196 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_KERNEL_ACL_IO_ACCESSOR_H__
+#define __NNFW_KERNEL_ACL_IO_ACCESSOR_H__
+
+#include <arm_compute/graph/ITensorAccessor.h>
+#include <arm_compute/runtime/CL/CLFunctions.h>
+#include <arm_compute/runtime/NEON/NEFunctions.h>
+
+#include <OperationsUtils.h> // for nnfw::rt::Shape
+
+namespace nnfw {
+namespace kernel {
+namespace acl {
+// Loads a caller-owned NHWC float buffer into an ACL tensor (the .cpp asserts a batch size of 1).
+class InputAccessor : public arm_compute::graph::ITensorAccessor
+{
+public:
+    InputAccessor(const float* inputData, const nnfw::rt::Shape& inputShape);
+    InputAccessor(InputAccessor&&) = default;
+
+    // Inherited methods overridden:
+    bool access_tensor(arm_compute::ITensor& tensor) override;
+
+private:
+    const float* _inputData;            // caller-owned; only the pointer is stored
+    const nnfw::rt::Shape& _inputShape; // reference member: the Shape must outlive this accessor
+};
+// Loads a caller-owned row-major float matrix into an ACL tensor of at most 2 dimensions.
+class MatrixInputAccessor : public arm_compute::graph::ITensorAccessor
+{
+public:
+    MatrixInputAccessor(const float* inputData, const nnfw::rt::Shape& inputShape);
+    MatrixInputAccessor(MatrixInputAccessor&&) = default;
+
+    // Inherited methods overridden:
+    bool access_tensor(arm_compute::ITensor& tensor) override;
+
+private:
+    const float* _inputData;            // caller-owned; only the pointer is stored
+    const nnfw::rt::Shape& _inputShape; // reference member: the Shape must outlive this accessor
+};
+// Loads a caller-owned float array into a 1-D ACL tensor.
+class VectorInputAccessor : public arm_compute::graph::ITensorAccessor
+{
+public:
+    VectorInputAccessor(const float* inputData, const nnfw::rt::Shape& inputShape);
+    VectorInputAccessor(VectorInputAccessor&&) = default;
+
+    // Inherited methods overridden:
+    bool access_tensor(arm_compute::ITensor& tensor) override;
+
+private:
+    const float* _inputData;            // caller-owned; only the pointer is stored
+    const nnfw::rt::Shape& _inputShape; // reference member: the Shape must outlive this accessor
+};
+// Loads a caller-owned 4-D NHWC filter buffer into an ACL tensor.
+class WeightAccessor : public arm_compute::graph::ITensorAccessor
+{
+public:
+    WeightAccessor(const float* filterData, const nnfw::rt::Shape& filterShape);
+    WeightAccessor(WeightAccessor&&) = default;
+
+    // Inherited methods overridden:
+    bool access_tensor(arm_compute::ITensor& tensor) override;
+
+private:
+    const float* _filterData;            // caller-owned; only the pointer is stored
+    const nnfw::rt::Shape& _filterShape; // reference member: the Shape must outlive this accessor
+};
+// Loads a caller-owned row-major weight matrix into an ACL tensor of at most 2 dimensions.
+class MatrixWeightAccessor : public arm_compute::graph::ITensorAccessor
+{
+public:
+    MatrixWeightAccessor(const float* filterData, const nnfw::rt::Shape& filterShape);
+    MatrixWeightAccessor(MatrixWeightAccessor&&) = default;
+
+    // Inherited methods overridden:
+    bool access_tensor(arm_compute::ITensor& tensor) override;
+
+private:
+    const float* _filterData;            // caller-owned; only the pointer is stored
+    const nnfw::rt::Shape& _filterShape; // reference member: the Shape must outlive this accessor
+};
+// Loads a caller-owned bias buffer into an ACL tensor; source offsets come from getOffsetNCHW() in the .cpp.
+class BiasAccessor : public arm_compute::graph::ITensorAccessor
+{
+public:
+    BiasAccessor(const float* biasData, const nnfw::rt::Shape& biasShape);
+    BiasAccessor(BiasAccessor&&) = default;
+
+    // Inherited methods overridden:
+    bool access_tensor(arm_compute::ITensor& tensor) override;
+
+private:
+    const float* _biasData;            // caller-owned; only the pointer is stored
+    const nnfw::rt::Shape& _biasShape; // reference member: the Shape must outlive this accessor
+};
+// Stores an ACL tensor into a caller-owned NHWC float buffer (the .cpp asserts a batch size of 1).
+class OutputAccessor : public arm_compute::graph::ITensorAccessor
+{
+public:
+    OutputAccessor(float* outputData, const nnfw::rt::Shape& outputShape);
+    OutputAccessor(OutputAccessor&&) = default;
+
+    // Inherited methods overridden:
+    bool access_tensor(arm_compute::ITensor& tensor) override;
+
+private:
+    float* _outputData;                  // caller-owned destination; only the pointer is stored
+    const nnfw::rt::Shape& _outputShape; // reference member: the Shape must outlive this accessor
+};
+// Stores an ACL tensor of at most 2 dimensions into a caller-owned row-major float matrix.
+class MatrixOutputAccessor : public arm_compute::graph::ITensorAccessor
+{
+public:
+    MatrixOutputAccessor(float* outputData, const nnfw::rt::Shape& outputShape);
+    MatrixOutputAccessor(MatrixOutputAccessor&&) = default;
+
+    // Inherited methods overridden:
+    bool access_tensor(arm_compute::ITensor& tensor) override;
+
+private:
+    float* _outputData;                  // caller-owned destination; only the pointer is stored
+    const nnfw::rt::Shape& _outputShape; // reference member: the Shape must outlive this accessor
+};
+// Stores a 1-D ACL tensor into a caller-owned float array.
+class VectorOutputAccessor : public arm_compute::graph::ITensorAccessor
+{
+public:
+    VectorOutputAccessor(float* outputData, const nnfw::rt::Shape& outputShape);
+    VectorOutputAccessor(VectorOutputAccessor&&) = default;
+
+    // Inherited methods overridden:
+    bool access_tensor(arm_compute::ITensor& tensor) override;
+
+private:
+    float* _outputData;                  // caller-owned destination; only the pointer is stored
+    const nnfw::rt::Shape& _outputShape; // reference member: the Shape must outlive this accessor
+};
+
+// OpenCL tensor, read-only host data: map, run a temporary AccessorType, unmap.
+template<typename AccessorType>
+inline void TensorAccess(arm_compute::CLTensor& tensor, const float* data,
+                         const nnfw::rt::Shape& shape)
+{
+  tensor.map();
+  AccessorType(data, shape).access_tensor(tensor);
+  tensor.unmap();
+}
+
+// OpenCL tensor, writable host data: map, run a temporary AccessorType, unmap.
+template<typename AccessorType>
+inline void TensorAccess(arm_compute::CLTensor& tensor, float* data,
+                         const nnfw::rt::Shape& shape)
+{
+  tensor.map();
+  AccessorType(data, shape).access_tensor(tensor);
+  tensor.unmap();
+}
+
+// arm_compute::Tensor, read-only host data: the accessor runs with no map/unmap step.
+template<typename AccessorType>
+inline void TensorAccess(arm_compute::Tensor& tensor, const float* data,
+                         const nnfw::rt::Shape& shape)
+{
+  AccessorType(data, shape).access_tensor(tensor);
+}
+
+// arm_compute::Tensor, writable host data: the accessor runs with no map/unmap step.
+template<typename AccessorType>
+inline void TensorAccess(arm_compute::Tensor& tensor, float* data,
+                         const nnfw::rt::Shape& shape)
+{
+  AccessorType(data, shape).access_tensor(tensor);
+}
+
+} // namespace acl
+} // namespace kernel
+} // namespace nnfw
+
+#endif // __NNFW_KERNEL_ACL_IO_ACCESSOR_H__
diff --git a/libs/kernel/acl/src/Init_acl.cpp b/libs/kernel/acl/src/Init_acl.cpp
new file mode 100644
index 000000000..cabf079fa
--- /dev/null
+++ b/libs/kernel/acl/src/Init_acl.cpp
@@ -0,0 +1,32 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <arm_compute/runtime/CL/CLScheduler.h>
+#include <kernel/acl/nnfw_kernel_acl.h>
+
+namespace nnfw {
+namespace kernel {
+namespace acl {
+
+// One-time ACL OpenCL scheduler setup via default_init(); meant to be callable repeatedly (NOTE(review): confirm).
+void Initialize(void)
+{
+  arm_compute::CLScheduler::get().default_init();
+}
+
+} // namespace acl
+} // namespace kernel
+} // namespace nnfw
diff --git a/libs/kernel/acl/src/NEUniqueTensor.h b/libs/kernel/acl/src/NEUniqueTensor.h
new file mode 100644
index 000000000..34412f9e3
--- /dev/null
+++ b/libs/kernel/acl/src/NEUniqueTensor.h
@@ -0,0 +1,64 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_KERNEL_ACL_NEUNIQUETENSOR_H__
+#define __NNFW_KERNEL_ACL_NEUNIQUETENSOR_H__
+
+#include <arm_compute/runtime/Tensor.h>
+
+namespace nnfw {
+namespace kernel {
+namespace acl {
+
+// TODO: find a way to merge CLUniqueTensor and NEUniqueTensor.
+class NEUniqueTensor
+{
+public:
+  // Describes the wrapped tensor with `info`; allocate() is a separate, later step.
+  NEUniqueTensor(const ::arm_compute::TensorInfo &info)
+  {
+    _tensor.allocator()->init(info);
+  }
+
+  // Both copy and move are not allowed
+  NEUniqueTensor(const NEUniqueTensor &) = delete;
+  NEUniqueTensor(NEUniqueTensor &&) = delete;
+
+  // Calls free() on the tensor's allocator when the wrapper is destroyed.
+  ~NEUniqueTensor()
+  {
+    _tensor.allocator()->free();
+  }
+
+  // Calls allocate() on the tensor's allocator.
+  void allocate()
+  {
+    _tensor.allocator()->allocate();
+  }
+
+  // Access to the owned tensor by reference or by address.
+  ::arm_compute::Tensor &ref() { return _tensor; }
+  ::arm_compute::Tensor *ptr() { return &_tensor; }
+
+private:
+  ::arm_compute::Tensor _tensor;
+};
+
+} // namespace acl
+} // namespace kernel
+} // namespace nnfw
+
+#endif //__NNFW_KERNEL_ACL_NEUNIQUETENSOR_H__
diff --git a/libs/kernel/acl/src/Reshape.h b/libs/kernel/acl/src/Reshape.h
new file mode 100644
index 000000000..ebd82477d
--- /dev/null
+++ b/libs/kernel/acl/src/Reshape.h
@@ -0,0 +1,70 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_KERNEL_ACL_RESHAPE_COMMON_H__
+#define __NNFW_KERNEL_ACL_RESHAPE_COMMON_H__
+#include <OperationsUtils.h>
+#include <arm_compute/core/TensorShape.h>
+#include <arm_compute/core/TensorInfo.h>
+#include <functional>
+// TODO: fix include path in CMakeFiles
+#include "IO_accessor.h"
+#include "shape.h"
+
+namespace nnfw {
+namespace kernel {
+namespace acl {
+
+namespace common {
+
+typedef std::function<void (void)> sync_scheduler_f;
+
+template<class TensorT, class LayerT>
+bool reshapeGeneric(const void* inputData, const nnfw::rt::Shape& inputShape,
+                    void* outputData, const nnfw::rt::Shape& outputShape,
+                    sync_scheduler_f sync_scheduler) {
+  // Reshapes a float32 buffer by running LayerT between two TensorT wrappers.
+  // sync_scheduler is invoked after run() and before the output is read back.
+  // Always returns true.
+  auto input_shape = util::fromNNShape(inputShape);
+  auto output_shape = util::fromNNShape(outputShape);
+
+  TensorT input(arm_compute::TensorInfo(input_shape, arm_compute::Format::F32));
+  TensorT output(arm_compute::TensorInfo(output_shape, arm_compute::Format::F32));
+
+  // The layer is configured before either tensor is allocated.
+  LayerT l;
+  l.configure(input.ptr(), output.ptr());
+  input.allocate();
+  output.allocate();
+
+  // static_cast keeps the input const instead of casting it away C-style.
+  TensorAccess<InputAccessor>(input.ref(), static_cast<const float*>(inputData), inputShape);
+
+  l.run();
+  sync_scheduler();
+
+  TensorAccess<OutputAccessor>(output.ref(), static_cast<float*>(outputData), outputShape);
+  return true;
+}
+
+} // namespace common
+
+} // namespace acl
+} // namespace kernel
+} // namespace nnfw
+
+#endif // __NNFW_KERNEL_ACL_RESHAPE_COMMON_H__
diff --git a/libs/kernel/acl/src/Reshape.test.h b/libs/kernel/acl/src/Reshape.test.h
new file mode 100644
index 000000000..a96a896a6
--- /dev/null
+++ b/libs/kernel/acl/src/Reshape.test.h
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <gtest/gtest.h>
+#include <OperationsUtils.h>
+#include <kernel/acl/nnfw_kernel_acl.h>
+#include <kernel/acl/Reshape.h>
+
+// TODO: fix include path in CMakeFiles
+#include "util.h"
+
+#ifndef ACL_TEST
+#error "ACL_TEST should be defined first!"
+#endif // ACL_TEST
+
+#ifndef ACL_CORE_FUNC_NAME
+#error "ACL_CORE_FUNC_NAME should be defined first!"
+#endif // ACL_CORE_FUNC_NAME
+
+using namespace nnfw::kernel::acl;
+
+ACL_TEST(KernelACL_TC, reshape_1) {
+  // A 1x1x9x1 input reshaped to 1x3x3x1 must keep its nine values in order.
+  float inputData[9] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f};
+  float outputData[9] = {0};
+  float expectData[9] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f};
+
+  const nnfw::rt::Shape inputShape = {OperandType::FLOAT32, {1,1,9,1}, 1.0, 0};
+  const nnfw::rt::Shape outputShape = { OperandType::FLOAT32, {1,3,3,1}, 1.0, 0 };
+
+  // Kernel under test (selected by the including file through ACL_CORE_FUNC_NAME).
+  const bool ran = ACL_CORE_FUNC_NAME(inputData, inputShape,
+      outputData, outputShape);
+  EXPECT_EQ(ran, true);
+
+  const bool same = util::compareData(outputData, expectData, outputShape);
+  EXPECT_EQ(same, true);
+}
diff --git a/libs/kernel/acl/src/cl/Concatenation.cpp b/libs/kernel/acl/src/cl/Concatenation.cpp
new file mode 100644
index 000000000..9376006ca
--- /dev/null
+++ b/libs/kernel/acl/src/cl/Concatenation.cpp
@@ -0,0 +1,104 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <OperationsUtils.h>
+#include <arm_compute/core/TensorShape.h>
+#include <arm_compute/core/TensorInfo.h>
+
+#include <cassert>
+
+// TODO: fix include path in CMakeFiles
+#include "../IO_accessor.h"
+#include "../shape.h"
+
+namespace nnfw {
+namespace kernel {
+namespace acl {
+
+bool concatenationFloat32(const std::vector<const float*>& inputDataPtrs,
+                          const std::vector<nnfw::rt::Shape>& inputShapes, int32_t axis,
+                          float* outputData, const nnfw::rt::Shape& outputShape)
+{
+  // Concatenates float32 NHWC inputs along the channel axis (axis 3) with
+  // CLDepthConcatenateLayer and copies the result into outputData.
+  // inputDataPtrs / inputShapes: one buffer and one shape per input, same order.
+  // axis: must be 3; any other value is rejected.
+  // Returns true on success, false when the arguments are not supported.
+  if (axis != 3)
+  {
+    assert("Only support axis=3 for ACL" && 0);
+    return false;
+  }
+  // A size mismatch used to be assert-only, so release builds walked past the
+  // end of inputShapes; reject it explicitly instead.
+  if (inputDataPtrs.size() != inputShapes.size())
+  {
+    assert("Each input needs exactly one shape" && 0);
+    return false;
+  }
+
+  // unique_ptr owns the input tensors so an exception cannot leak them, while
+  // inputIptrs holds the non-owning view that configure() expects.
+  std::vector<std::unique_ptr<arm_compute::CLTensor>> inputPtrs;
+  std::vector<arm_compute::ICLTensor*> inputIptrs;
+  arm_compute::CLTensor output;
+
+  // init Tensors
+  for (const auto& inputShape : inputShapes)
+  {
+    arm_compute::TensorShape input_shape = util::fromNNShape(inputShape);
+    inputPtrs.push_back(std::unique_ptr<arm_compute::CLTensor>(new arm_compute::CLTensor()));
+    arm_compute::CLTensor* inputPtr = inputPtrs.back().get();
+
+    inputPtr->allocator()->init(arm_compute::TensorInfo(input_shape, arm_compute::Format::F32));
+    inputIptrs.push_back(inputPtr);
+  }
+  arm_compute::TensorShape output_shape = util::fromNNShape(outputShape);
+  output.allocator()->init(arm_compute::TensorInfo(output_shape, arm_compute::Format::F32));
+
+  // prepare ACL Concatenate and configure tensors
+  auto concat = std::make_shared<arm_compute::CLDepthConcatenateLayer>();
+  concat->configure(inputIptrs, &output);
+
+  // allocate Tensors and upload each input
+  for (size_t i = 0; i < inputPtrs.size(); ++i)
+  {
+    inputPtrs[i]->allocator()->allocate();
+    TensorAccess<InputAccessor>(*inputPtrs[i], inputDataPtrs[i], inputShapes[i]);
+  }
+  output.allocator()->allocate();
+
+  // run
+  concat->run();
+  arm_compute::CLScheduler::get().sync();
+
+  // get output
+  TensorAccess<OutputAccessor>(output, outputData, outputShape);
+
+  // cleanup: free tensor memory explicitly, as before; the unique_ptrs then
+  // delete the tensor objects when inputPtrs goes out of scope.
+  for (auto& inputPtr : inputPtrs)
+  {
+    inputPtr->allocator()->free();
+  }
+  output.allocator()->free();
+
+  return true;
+}
+
+} // namespace acl
+} // namespace kernel
+} // namespace nnfw
diff --git a/libs/kernel/acl/src/cl/Concatenation.test.cpp b/libs/kernel/acl/src/cl/Concatenation.test.cpp
new file mode 100644
index 000000000..b2c5a5891
--- /dev/null
+++ b/libs/kernel/acl/src/cl/Concatenation.test.cpp
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <gtest/gtest.h>
+#include <OperationsUtils.h>
+#include <kernel/acl/nnfw_kernel_acl.h>
+#include <kernel/acl/Concatenation.h>
+
+// TODO: fix include path in CMakeFiles
+#include "../util.h"
+
+using namespace nnfw::kernel::acl;
+
+TEST(KernelACL_TC, concatFloat32_1)
+{
+  // Two 1x2x3x1 inputs joined along axis 3 must interleave into one 1x2x3x2 output.
+  float inputData_1[6] = {
+    1, 2, 3, 4, 5, 6      // [ [ [1],[2],[3] ], [ [4],[5],[6] ] ]
+  };
+  float inputData_2[6] = {
+    7, 8, 9, 10, 11, 12   // [ [ [7],[8],[9] ], [ [10],[11],[12] ] ]
+  };
+  const nnfw::rt::Shape inputShape_1 = { OperandType::FLOAT32, {1,2,3,1}, 1.0, 0 };
+  const nnfw::rt::Shape inputShape_2 = { OperandType::FLOAT32, {1,2,3,1}, 1.0, 0 };
+  const nnfw::rt::Shape outputShape = { OperandType::FLOAT32, {1,2,3,2}, 1.0, 0 };
+
+  std::vector<const float*> inputDataPtrs;
+  inputDataPtrs.push_back(inputData_1);
+  inputDataPtrs.push_back(inputData_2);
+  std::vector<nnfw::rt::Shape> inputShapes;
+  inputShapes.push_back(inputShape_1);
+  inputShapes.push_back(inputShape_2);
+  float outputData[12];
+  util::initData(outputData, sizeof(outputData) / sizeof(outputData[0]), 0.0);
+
+  const bool ran = concatenationFloat32(inputDataPtrs, inputShapes, 3,
+                                        outputData, outputShape);
+  EXPECT_EQ(ran, true);
+
+  // Expected values are written plane by plane (NCHW) and converted to NHWC.
+  float expectNCHW[] = {
+    1, 2, 3, 4, 5, 6,
+    7, 8, 9, 10, 11, 12
+  };
+  float expectData[12]; // [ [ [1,7],[2,8],[3,9] ], [ [4,10],[5,11],[6,12] ] ]
+  util::NCHW2NHWC(expectNCHW, expectData, outputShape);
+  const bool same = util::compareData(outputData, expectData, outputShape);
+  EXPECT_EQ(same, true);
+}
diff --git a/libs/kernel/acl/src/cl/Conv2D.cpp b/libs/kernel/acl/src/cl/Conv2D.cpp
new file mode 100644
index 000000000..4783bdc1d
--- /dev/null
+++ b/libs/kernel/acl/src/cl/Conv2D.cpp
@@ -0,0 +1,113 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <OperationsUtils.h>
+#include <NeuralNetworks.h>
+
+#include <arm_compute/core/TensorShape.h>
+#include <arm_compute/core/TensorInfo.h>
+
+#include <util/environment.h>
+
+#include "../IO_accessor.h"
+#include "../util.h"
+#include "../shape.h"
+#include "../CLUniqueTensor.h"
+#include "../support.h"
+
+#include "util/feature/TextFormatter.h"
+
+#include "support/nnapi/feature/Reader.h"
+
+namespace nnfw {
+namespace kernel {
+namespace acl {
+
+static int verbose = 0;
+// float32 2-D convolution run through ACL's CLConvolutionLayer; as written it always returns true.
+bool convFloat32(const float* inputData, const nnfw::rt::Shape& inputShape,
+                 const float* filterData, const nnfw::rt::Shape& filterShape,
+                 const float* biasData, const nnfw::rt::Shape& biasShape,
+                 int32_t padding_left, int32_t padding_right,
+                 int32_t padding_top, int32_t padding_bottom,
+                 int32_t stride_width, int32_t stride_height,
+                 int32_t activation,
+                 float* outputData, const nnfw::rt::Shape& outputShape)
+{
+  arm_compute::TensorShape input_shape = util::fromNNShape(inputShape);
+  arm_compute::TensorShape filter_shape = util::fromNNShape(filterShape);
+  arm_compute::TensorShape bias_shape = util::fromVectorNNShape(biasShape);
+  arm_compute::TensorShape output_shape = util::fromNNShape(outputShape);
+  arm_compute::PadStrideInfo conv_info = arm_compute::PadStrideInfo(stride_width, stride_height,
+                                              padding_left, padding_right,
+                                              padding_top, padding_bottom,
+                                              arm_compute::DimensionRoundingType::FLOOR);
+  // The tensors are only described here; allocate() is called on them after configure() below.
+  CLUniqueTensor input(arm_compute::TensorInfo(input_shape, arm_compute::Format::F32));
+  CLUniqueTensor output(arm_compute::TensorInfo(output_shape, arm_compute::Format::F32));
+  CLUniqueTensor bias(arm_compute::TensorInfo(bias_shape, arm_compute::Format::F32));
+  CLUniqueTensor filter(arm_compute::TensorInfo(filter_shape, arm_compute::Format::F32));
+  // Functions to run, in order; the list is also handed to insertFusedActivationLayer below.
+  std::vector<std::shared_ptr<arm_compute::IFunction>> fns;
+
+  auto conv_f = std::make_shared<arm_compute::CLConvolutionLayer>();
+
+  conv_f->configure(input.ptr(), filter.ptr(), bias.ptr(), output.ptr(), conv_info);
+
+  fns.emplace_back(conv_f);
+
+  util::insertFusedActivationLayer<CLUniqueTensor, arm_compute::CLActivationLayer>(output, activation, fns);
+
+  input.allocate();
+  output.allocate();
+  bias.allocate();
+  filter.allocate();
+  // Copy the caller's input, bias and filter buffers into the ACL tensors.
+  TensorAccess<InputAccessor>(input.ref(), inputData, inputShape);
+  TensorAccess<BiasAccessor>(bias.ref(), biasData, biasShape);
+  TensorAccess<WeightAccessor>(filter.ref(), filterData, filterShape);
+  // Optional dump of the input feature map; `verbose` is refreshed on every call (presumably from the CONV2D_VERBOSE environment variable).
+  nnfw::util::env::IntAccessor("CONV2D_VERBOSE").access(verbose);
+  if (verbose)
+  {
+    input.ref().map();
+    auto ifm_shape = nnfw::support::nnapi::feature::asFeatureShape(inputShape);
+    nnfw::support::nnapi::feature::Reader<float> nnapi_ifm_reader{ifm_shape, inputData};
+    nnfw::support::acl::feature::Reader<float> acl_ifm_reader{input.ptr()};
+
+    std::cout << "NNAPI IFM:" << std::endl;
+    std::cout << nnfw::util::feature::TextFormatter<float>{ifm_shape, nnapi_ifm_reader} << std::endl;
+
+    std::cout << "ARM Compute IFM:" << std::endl;
+    std::cout << nnfw::util::feature::TextFormatter<float>{ifm_shape, acl_ifm_reader} << std::endl;
+    input.ref().unmap();
+  }
+  // Run the collected functions in order, then call sync() on the CL scheduler.
+  for (const auto &fn : fns)
+  {
+    fn->run();
+  }
+
+  arm_compute::CLScheduler::get().sync();
+  // Copy the result back into the caller's buffer.
+  TensorAccess<OutputAccessor>(output.ref(), outputData, outputShape);
+
+  return true;
+}
+
+} // namespace acl
+} // namespace kernel
+} // namespace nnfw
diff --git a/libs/kernel/acl/src/cl/Conv2D.test.cpp b/libs/kernel/acl/src/cl/Conv2D.test.cpp
new file mode 100644
index 000000000..e34cdeea5
--- /dev/null
+++ b/libs/kernel/acl/src/cl/Conv2D.test.cpp
@@ -0,0 +1,202 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <gtest/gtest.h>
+#include <OperationsUtils.h>
+#include <kernel/acl/nnfw_kernel_acl.h>
+#include <kernel/acl/Conv2D.h>
+
+// TODO: fix include path in CMakeFiles
+#include "../util.h"
+
+using namespace nnfw::kernel::acl;
+
+TEST(KernelACL_TC, convFloat32_3x3to1x1)
+{
+  // Unpadded, unit-stride 3x3 convolution over a 3x3 input, so the output is a single
+  // value. Assuming util::initData fills a buffer with the given value, every input and
+  // weight is 1 and the bias is 1.
+  const nnfw::rt::Shape inputShape = { OperandType::FLOAT32, {1,3,3,1}, 1.0, 0 };
+  const nnfw::rt::Shape filterShape = { OperandType::FLOAT32, {1,3,3,1}, 1.0, 0 };
+  const nnfw::rt::Shape biasShape = { OperandType::FLOAT32, {1}, 1.0, 0 };
+  const nnfw::rt::Shape outputShape = { OperandType::FLOAT32, {1,1,1,1}, 1.0, 0 };
+
+  // Host-side buffers handed to the kernel.
+  float inputData[9];
+  float filterData[9];
+  float biasData[1] = { 1.0 };
+  float outputData[1];
+  util::initData(inputData, sizeof(inputData) / sizeof(inputData[0]), 1.0);
+  util::initData(filterData, sizeof(filterData) / sizeof(filterData[0]), 1.0);
+  util::initData(outputData, sizeof(outputData) / sizeof(outputData[0]), 0.0);
+
+  // Convolution hyper-parameters.
+  const int32_t padding_left = 0, padding_right = 0;
+  const int32_t padding_top = 0, padding_bottom = 0;
+  const int32_t stride_width = 1, stride_height = 1;
+  const int32_t activation = static_cast<int32_t>(FusedActivationFunc::RELU);
+
+  const bool ran = convFloat32(inputData, inputShape, filterData, filterShape,
+                               biasData, biasShape,
+                               padding_left, padding_right, padding_top, padding_bottom,
+                               stride_width, stride_height, activation,
+                               outputData, outputShape);
+  EXPECT_TRUE(ran);
+
+  // Nine taps of 1 * 1 plus the bias of 1.
+  float expectData[] = { 10.0f };
+  const bool matched = util::compareData(outputData, expectData, outputShape);
+  EXPECT_TRUE(matched);
+}
+
+TEST(KernelACL_TC, convFloat32_3x3to3x3)
+{
+  // 3x3 convolution with one element of padding on every side and unit stride, so the
+  // output keeps the 3x3 extent. Assuming util::initData fills a buffer with the given
+  // value, every input and weight is 1 and the bias is 1.
+  const nnfw::rt::Shape inputShape = { OperandType::FLOAT32, {1,3,3,1}, 1.0, 0 };
+  const nnfw::rt::Shape filterShape = { OperandType::FLOAT32, {1,3,3,1}, 1.0, 0 };
+  const nnfw::rt::Shape biasShape = { OperandType::FLOAT32, {1}, 1.0, 0 };
+  const nnfw::rt::Shape outputShape = { OperandType::FLOAT32, {1,3,3,1}, 1.0, 0 };
+
+  // Host-side buffers handed to the kernel.
+  float inputData[9];
+  float filterData[9];
+  float biasData[1] = { 1.0 };
+  float outputData[9];
+  util::initData(inputData, sizeof(inputData) / sizeof(inputData[0]), 1.0);
+  util::initData(filterData, sizeof(filterData) / sizeof(filterData[0]), 1.0);
+  util::initData(outputData, sizeof(outputData) / sizeof(outputData[0]), 0.0);
+
+  // Convolution hyper-parameters.
+  const int32_t padding_left = 1, padding_right = 1;
+  const int32_t padding_top = 1, padding_bottom = 1;
+  const int32_t stride_width = 1, stride_height = 1;
+  const int32_t activation = static_cast<int32_t>(FusedActivationFunc::RELU);
+
+  const bool ran = convFloat32(inputData, inputShape, filterData, filterShape,
+                               biasData, biasShape,
+                               padding_left, padding_right, padding_top, padding_bottom,
+                               stride_width, stride_height, activation,
+                               outputData, outputShape);
+  EXPECT_TRUE(ran);
+
+  // Corners overlap 4 input cells, edges 6 and the centre 9; each sum gets the bias of 1.
+  float expectData[] = {
+    5.0f, 7.0f, 5.0f,
+    7.0f, 10.0f, 7.0f,
+    5.0f, 7.0f, 5.0f
+  };
+  const bool matched = util::compareData(outputData, expectData, outputShape);
+  EXPECT_TRUE(matched);
+}
+
+TEST(KernelACL_TC, convFloat32_3x3to3x3_RELU)
+{
+  // Same padded 3x3 convolution as convFloat32_3x3to3x3 but with a bias of -5, so some raw
+  // sums are negative and the fused RELU has something to clamp. Assuming util::initData
+  // fills a buffer with the given value, every input and weight is 1.
+  const nnfw::rt::Shape inputShape = { OperandType::FLOAT32, {1,3,3,1}, 1.0, 0 };
+  const nnfw::rt::Shape filterShape = { OperandType::FLOAT32, {1,3,3,1}, 1.0, 0 };
+  const nnfw::rt::Shape biasShape = { OperandType::FLOAT32, {1}, 1.0, 0 };
+  const nnfw::rt::Shape outputShape = { OperandType::FLOAT32, {1,3,3,1}, 1.0, 0 };
+
+  // Host-side buffers handed to the kernel.
+  float inputData[9];
+  float filterData[9];
+  float biasData[1] = { -5.0f };
+  float outputData[9];
+  util::initData(inputData, sizeof(inputData) / sizeof(inputData[0]), 1.0);
+  util::initData(filterData, sizeof(filterData) / sizeof(filterData[0]), 1.0);
+  util::initData(outputData, sizeof(outputData) / sizeof(outputData[0]), 0.0);
+
+  // Convolution hyper-parameters.
+  const int32_t padding_left = 1, padding_right = 1;
+  const int32_t padding_top = 1, padding_bottom = 1;
+  const int32_t stride_width = 1, stride_height = 1;
+  const int32_t activation = static_cast<int32_t>(FusedActivationFunc::RELU);
+
+  const bool ran = convFloat32(inputData, inputShape, filterData, filterShape,
+                               biasData, biasShape,
+                               padding_left, padding_right, padding_top, padding_bottom,
+                               stride_width, stride_height, activation,
+                               outputData, outputShape);
+  EXPECT_TRUE(ran);
+
+  // Raw sums are 4 (corners), 6 (edges) and 9 (centre); adding the bias of -5 gives
+  // -1, 1 and 4, and the expected table holds 0 where the result would be negative.
+  float expectData[] = {
+    0.0f, 1.0f, 0.0f,
+    1.0f, 4.0f, 1.0f,
+    0.0f, 1.0f, 0.0f
+  };
+
+  const bool matched = util::compareData(outputData, expectData, outputShape);
+  EXPECT_TRUE(matched);
+}
+
+TEST(KernelACL_TC, convFloat32_3x5to3x3)
+{
+  // Padded, unit-stride 3x3 convolution of a {1,3,5,1} input holding 1..15 with two
+  // filters (all ones and all twos) and a bias of 1 per filter. The output shape is
+  // {1,3,5,2}, i.e. the spatial extent is preserved and there is one channel per filter.
+  const nnfw::rt::Shape inputShape = { OperandType::FLOAT32, {1,3,5,1}, 1.0, 0 };
+  const nnfw::rt::Shape filterShape = { OperandType::FLOAT32, {2,3,3,1}, 1.0, 0 };
+  const nnfw::rt::Shape biasShape = { OperandType::FLOAT32, {2}, 1.0, 0 };
+  const nnfw::rt::Shape outputShape = { OperandType::FLOAT32, {1,3,5,2}, 1.0, 0 };
+
+  float inputData[15] = {
+    1,2,3,4,5,
+    6,7,8,9,10,
+    11,12,13,14,15
+  };
+  float filterData[18] = {
+    1,1,1, 1,1,1, 1,1,1,
+    2,2,2, 2,2,2, 2,2,2
+  };
+  float biasData[2] = { 1.0, 1.0 };
+  float outputData[30];
+  util::initData(outputData, sizeof(outputData) / sizeof(outputData[0]), 0.0);
+
+  // Convolution hyper-parameters.
+  const int32_t padding_left = 1, padding_right = 1;
+  const int32_t padding_top = 1, padding_bottom = 1;
+  const int32_t stride_width = 1, stride_height = 1;
+  const int32_t activation = static_cast<int32_t>(FusedActivationFunc::RELU);
+
+  const bool ran = convFloat32(inputData, inputShape, filterData, filterShape,
+                               biasData, biasShape,
+                               padding_left, padding_right, padding_top, padding_bottom,
+                               stride_width, stride_height, activation,
+                               outputData, outputShape);
+  EXPECT_TRUE(ran);
+
+  // The reference is written one output channel at a time; util::NCHW2NHWC presumably
+  // reorders it into the layout of outputData before the comparison (TODO confirm).
+  float expectNCHW[] = {
+    17.0f, 28.0f, 34.0f, 40.0f, 29.0f,
+    40.0f, 64.0f, 73.0f, 82.0f, 58.0f,
+    37.0f, 58.0f, 64.0f, 70.0f, 49.0f,
+
+    33.0f, 55.0f, 67.0f, 79.0f, 57.0f,
+    79.0f, 127.0f, 145.0f, 163.0f, 115.0f,
+    73.0f, 115.0f, 127.0f, 139.0f, 97.0f
+  };
+  float expectData[30];
+  util::NCHW2NHWC(expectNCHW, expectData, outputShape);
+  const bool matched = util::compareData(outputData, expectData, outputShape);
+  EXPECT_TRUE(matched);
+}
diff --git a/libs/kernel/acl/src/cl/DepthwiseConv2D.cpp b/libs/kernel/acl/src/cl/DepthwiseConv2D.cpp
new file mode 100644
index 000000000..7593a99f4
--- /dev/null
+++ b/libs/kernel/acl/src/cl/DepthwiseConv2D.cpp
@@ -0,0 +1,60 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <OperationsUtils.h>
+#include <arm_compute/core/TensorShape.h>
+#include <arm_compute/core/TensorInfo.h>
+
+#include <cassert>
+
+// TODO: fix include path in CMakeFiles
+#include "../IO_accessor.h"
+#include "../shape.h"
+#include "../CLUniqueTensor.h"
+#include "../DepthwiseConv2D.h"
+
+namespace nnfw {
+namespace kernel {
+namespace acl {
+
+// Synchronization callback: calls sync() on the scheduler returned by CLScheduler::get().
+static void sync_scheduler() {
+  arm_compute::CLScheduler::get().sync();
+}
+
+// OpenCL flavour of the float32 depthwise convolution. Every argument is forwarded unchanged
+// to common::depthwiseConvFloat32, instantiated with the CL tensor and layer types.
+bool depthwiseConvFloat32(const float* inputData, const nnfw::rt::Shape& inputShape,
+                          const float* filterData, const nnfw::rt::Shape& filterShape,
+                          const float* biasData, const nnfw::rt::Shape& biasShape,
+                          int32_t padding_left, int32_t padding_right,
+                          int32_t padding_top, int32_t padding_bottom,
+                          int32_t stride_width, int32_t stride_height,
+                          int32_t depth_multiplier, int32_t activation,
+                          float* outputData, const nnfw::rt::Shape& outputShape) {
+  using ConvLayer = arm_compute::CLDepthwiseConvolutionLayer;
+  using ActLayer = arm_compute::CLActivationLayer;
+  return common::depthwiseConvFloat32<CLUniqueTensor, ConvLayer, ActLayer>(
+      inputData, inputShape, filterData, filterShape, biasData, biasShape,
+      padding_left, padding_right, padding_top, padding_bottom,
+      stride_width, stride_height, depth_multiplier, activation,
+      outputData, outputShape, sync_scheduler);
+}
+
+} // namespace acl
+} // namespace kernel
+} // namespace nnfw
+
diff --git a/libs/kernel/acl/src/cl/DepthwiseConv2D.test.cpp b/libs/kernel/acl/src/cl/DepthwiseConv2D.test.cpp
new file mode 100644
index 000000000..695563383
--- /dev/null
+++ b/libs/kernel/acl/src/cl/DepthwiseConv2D.test.cpp
@@ -0,0 +1,20 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// ACL_TEST prefixes each test name with cl_; the header below presumably consumes both macros.
+#define ACL_CORE_FUNC_NAME depthwiseConvFloat32
+#define ACL_TEST(tc, t) TEST(tc, cl_##t)
+#include "../DepthwiseConv2D.test.h"
diff --git a/libs/kernel/acl/src/cl/FullyConnected.cpp b/libs/kernel/acl/src/cl/FullyConnected.cpp
new file mode 100644
index 000000000..7513355ab
--- /dev/null
+++ b/libs/kernel/acl/src/cl/FullyConnected.cpp
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <OperationsUtils.h>
+#include <arm_compute/core/TensorShape.h>
+#include <arm_compute/core/TensorInfo.h>
+
+#include <cassert>
+
+// TODO: fix include path in CMakeFiles
+#include "../IO_accessor.h"
+#include "../shape.h"
+#include "../CLUniqueTensor.h"
+#include "../FullyConnected.h"
+
+namespace nnfw {
+namespace kernel {
+namespace acl {
+
+// Synchronization callback: calls sync() on the scheduler returned by CLScheduler::get().
+// Declared static so this helper stays local to the file and cannot clash at link time.
+static void sync_scheduler() {
+  arm_compute::CLScheduler::get().sync();
+}
+
+// OpenCL flavour: forwards every argument unchanged to common::fullyConnectedFloat32.
+bool fullyConnectedFloat32(const float* inputData, const nnfw::rt::Shape& inputShape,
+                           const float* weightsData, const nnfw::rt::Shape& weightsShape,
+                           const float* biasData, const nnfw::rt::Shape& biasShape,
+                           int32_t activation,
+                           float* outputData, const nnfw::rt::Shape& outputShape) {
+  return common::fullyConnectedFloat32<CLUniqueTensor, arm_compute::CLFullyConnectedLayer,
+                                       arm_compute::CLActivationLayer>(
+      inputData, inputShape, weightsData, weightsShape, biasData, biasShape,
+      activation, outputData, outputShape, sync_scheduler);
+}
+
+} // namespace acl
+} // namespace kernel
+} // namespace nnfw
diff --git a/libs/kernel/acl/src/cl/FullyConnected.test.cpp b/libs/kernel/acl/src/cl/FullyConnected.test.cpp
new file mode 100644
index 000000000..b1f5a095f
--- /dev/null
+++ b/libs/kernel/acl/src/cl/FullyConnected.test.cpp
@@ -0,0 +1,20 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// ACL_TEST prefixes each test name with cl_; the header below presumably consumes both macros.
+#define ACL_CORE_FUNC_NAME fullyConnectedFloat32
+#define ACL_TEST(tc, t) TEST(tc, cl_##t)
+#include "../FullyConnected.test.h"
diff --git a/libs/kernel/acl/src/cl/Pooling.cpp b/libs/kernel/acl/src/cl/Pooling.cpp
new file mode 100644
index 000000000..e22eacccc
--- /dev/null
+++ b/libs/kernel/acl/src/cl/Pooling.cpp
@@ -0,0 +1,130 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <OperationsUtils.h>
+#include <arm_compute/core/TensorShape.h>
+#include <arm_compute/core/TensorInfo.h>
+#include "../IO_accessor.h"
+#include "../shape.h"
+#include "../CLUniqueTensor.h"
+
+#include <cassert>
+
+namespace nnfw {
+namespace kernel {
+namespace acl {
+
+bool maxPoolFloat32(const float* inputData, const nnfw::rt::Shape& inputShape,
+                    int32_t padding_left, int32_t padding_right,
+                    int32_t padding_top, int32_t padding_bottom,
+                    int32_t stride_width, int32_t stride_height,
+                    int32_t filter_width, int32_t filter_height,
+                    int32_t activation,
+                    float* outputData, const nnfw::rt::Shape& outputShape)
+{
+  // Float32 max pooling through arm_compute::CLPoolingLayer, followed by whatever
+  // util::insertFusedActivationLayer appends for `activation`. Always returns true.
+  arm_compute::TensorShape input_shape = util::fromNNShape(inputShape);
+  arm_compute::TensorShape output_shape = util::fromNNShape(outputShape);
+
+  std::vector<std::shared_ptr<arm_compute::IFunction>> fns;
+
+  arm_compute::PadStrideInfo pad_info = arm_compute::PadStrideInfo(stride_width, stride_height,
+                                              padding_left, padding_right,
+                                              padding_top, padding_bottom,
+                                              arm_compute::DimensionRoundingType::FLOOR);
+
+  arm_compute::PoolingLayerInfo maxpool_info = arm_compute::PoolingLayerInfo(arm_compute::PoolingType::MAX,
+                                                        arm_compute::Size2D(filter_width,filter_height),
+                                                        pad_info, false);
+
+  CLUniqueTensor input(arm_compute::TensorInfo(input_shape, arm_compute::Format::F32));
+  CLUniqueTensor output(arm_compute::TensorInfo(output_shape, arm_compute::Format::F32));
+
+  auto pool_f = std::make_shared<arm_compute::CLPoolingLayer>();
+  pool_f->configure(input.ptr(), output.ptr(), maxpool_info);
+  fns.emplace_back(pool_f);
+
+  // Set up every function, including the fused activation, before the tensors are
+  // allocated (the same order the CL Conv2D kernel uses).
+  util::insertFusedActivationLayer<CLUniqueTensor, arm_compute::CLActivationLayer>(output, activation, fns);
+
+  input.allocate();
+  output.allocate();
+  TensorAccess<InputAccessor>(input.ref(), inputData, inputShape);
+
+  for (const auto &fn : fns)
+  {
+    fn->run();
+  }
+  arm_compute::CLScheduler::get().sync();
+
+  TensorAccess<OutputAccessor>(output.ref(), outputData, outputShape);
+  return true;
+}
+
+bool averagePoolFloat32(const float* inputData, const nnfw::rt::Shape& inputShape,
+                        int32_t padding_left, int32_t padding_right,
+                        int32_t padding_top, int32_t padding_bottom,
+                        int32_t stride_width, int32_t stride_height,
+                        int32_t filter_width, int32_t filter_height,
+                        int32_t activation,
+                        float* outputData, const nnfw::rt::Shape& outputShape)
+{
+  // Float32 average pooling through arm_compute::CLPoolingLayer, followed by whatever
+  // util::insertFusedActivationLayer appends for `activation`. Always returns true.
+  arm_compute::TensorShape input_shape = util::fromNNShape(inputShape);
+  arm_compute::TensorShape output_shape = util::fromNNShape(outputShape);
+
+  std::vector<std::shared_ptr<arm_compute::IFunction>> fns;
+
+  arm_compute::PadStrideInfo pad_info = arm_compute::PadStrideInfo(stride_width, stride_height,
+                                              padding_left, padding_right,
+                                              padding_top, padding_bottom,
+                                              arm_compute::DimensionRoundingType::FLOOR);
+
+  arm_compute::PoolingLayerInfo pool_info = arm_compute::PoolingLayerInfo(arm_compute::PoolingType::AVG,
+                                                        arm_compute::Size2D(filter_width,filter_height),
+                                                        pad_info, true);
+
+  CLUniqueTensor input(arm_compute::TensorInfo(input_shape, arm_compute::Format::F32));
+  CLUniqueTensor output(arm_compute::TensorInfo(output_shape, arm_compute::Format::F32));
+
+  auto pool_f = std::make_shared<arm_compute::CLPoolingLayer>();
+  pool_f->configure(input.ptr(), output.ptr(), pool_info);
+  fns.emplace_back(pool_f);
+
+  // Set up every function, including the fused activation, before the tensors are
+  // allocated (the same order the CL Conv2D kernel uses).
+  util::insertFusedActivationLayer<CLUniqueTensor, arm_compute::CLActivationLayer>(output, activation, fns);
+
+  input.allocate();
+  output.allocate();
+  TensorAccess<InputAccessor>(input.ref(), inputData, inputShape);
+
+  for (const auto &fn : fns)
+  {
+    fn->run();
+  }
+  arm_compute::CLScheduler::get().sync();
+
+  TensorAccess<OutputAccessor>(output.ref(), outputData, outputShape);
+  return true;
+}
+
+} // namespace acl
+} // namespace kernel
+} // namespace nnfw
diff --git a/libs/kernel/acl/src/cl/Pooling.test.cpp b/libs/kernel/acl/src/cl/Pooling.test.cpp
new file mode 100644
index 000000000..8112e7a45
--- /dev/null
+++ b/libs/kernel/acl/src/cl/Pooling.test.cpp
@@ -0,0 +1,482 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <gtest/gtest.h>
+#include <OperationsUtils.h>
+#include <kernel/acl/nnfw_kernel_acl.h>
+#include <arm_compute/core/Types.h>
+#include <kernel/acl/Pooling.h>
+
+#include "../util.h"
+
+using namespace nnfw::kernel::acl;
+
+TEST(KernelACL_TC, maxPoolFloat32_3x3to1x1)
+{
+  // A 3x3 max-pooling window over a 3x3 input without padding yields one value. Assuming
+  // initValue calls the generator once per element, the input holds 1..9 and the maximum
+  // is 9 regardless of the fill order.
+  const nnfw::rt::Shape inputShape = { OperandType::FLOAT32, {1,3,3,1}, 1.0, 0 };
+  const nnfw::rt::Shape outputShape = { OperandType::FLOAT32, {1,1,1,1}, 1.0, 0 };
+
+  // Input values come from the counting generator; outputData is reset via util::initData.
+  util::TensorWrapper input({1,3,3,1});
+  float next = 1.0f;
+  input.initValue([&next](uint32_t n, uint32_t c, uint32_t h, uint32_t w) {
+    return next++;
+  });
+
+  float outputData[1];
+  util::initData(outputData, sizeof(outputData) / sizeof(outputData[0]), 0.0);
+
+  // Pooling hyper-parameters.
+  const int32_t padding_left = 0, padding_right = 0;
+  const int32_t padding_top = 0, padding_bottom = 0;
+  const int32_t stride_width = 1, stride_height = 1;
+  const int32_t filter_width = 3, filter_height = 3;
+  const int32_t activation = ANEURALNETWORKS_FUSED_NONE;
+
+  const bool ran = maxPoolFloat32(input.ptr<float>(), inputShape,
+                                  padding_left, padding_right,
+                                  padding_top, padding_bottom,
+                                  stride_width, stride_height,
+                                  filter_width, filter_height,
+                                  activation,
+                                  outputData, outputShape);
+  EXPECT_TRUE(ran);
+
+  // Largest generated value.
+  float expectData[] = { 9.0f };
+  const bool matched = util::compareData(outputData, expectData, outputShape);
+  EXPECT_TRUE(matched);
+}
+
+TEST(KernelACL_TC, maxPoolFloat32_3x3to1x1_RELU)
+{
+  // Same single-window max pooling as maxPoolFloat32_3x3to1x1, but the generator counts
+  // down from -1, so (assuming one call per element) every input is negative and the
+  // fused RELU is expected to turn the maximum of -1 into 0.
+  const nnfw::rt::Shape inputShape = { OperandType::FLOAT32, {1,3,3,1}, 1.0, 0 };
+  const nnfw::rt::Shape outputShape = { OperandType::FLOAT32, {1,1,1,1}, 1.0, 0 };
+
+  // Input values come from the counting generator; outputData is reset via util::initData.
+  util::TensorWrapper input({1,3,3,1});
+  float next = -1.0f;
+  input.initValue([&next](uint32_t n, uint32_t c, uint32_t h, uint32_t w) {
+    return next--;
+  });
+
+  float outputData[1];
+  util::initData(outputData, sizeof(outputData) / sizeof(outputData[0]), 0.0);
+
+  // Pooling hyper-parameters.
+  const int32_t padding_left = 0, padding_right = 0;
+  const int32_t padding_top = 0, padding_bottom = 0;
+  const int32_t stride_width = 1, stride_height = 1;
+  const int32_t filter_width = 3, filter_height = 3;
+  const int32_t activation = ANEURALNETWORKS_FUSED_RELU;
+
+  const bool ran = maxPoolFloat32(input.ptr<float>(), inputShape,
+                                  padding_left, padding_right,
+                                  padding_top, padding_bottom,
+                                  stride_width, stride_height,
+                                  filter_width, filter_height,
+                                  activation,
+                                  outputData, outputShape);
+  EXPECT_TRUE(ran);
+
+  // Negative maximum clamped by the activation.
+  float expectData[] = { 0.0f };
+  const bool matched = util::compareData(outputData, expectData, outputShape);
+  EXPECT_TRUE(matched);
+}
+
+TEST(KernelACL_TC, maxPoolFloat32_3x3to2x2)
+{
+  // 2x2 max pooling with stride 2 and one element of padding on the right and bottom
+  // only, giving a 2x2 output. The expected values assume initValue fills the 3x3 input
+  // row by row with 1..9 (TODO confirm the fill order).
+  const nnfw::rt::Shape inputShape = { OperandType::FLOAT32, {1,3,3,1}, 1.0, 0 };
+  const nnfw::rt::Shape outputShape = { OperandType::FLOAT32, {1,2,2,1}, 1.0, 0 };
+
+  // Input values come from the counting generator; outputData is reset via util::initData.
+  util::TensorWrapper input({1,3,3,1});
+  float next = 1.0f;
+  input.initValue([&next](uint32_t n, uint32_t c, uint32_t h, uint32_t w) {
+    return next++;
+  });
+
+  float outputData[4];
+  util::initData(outputData, sizeof(outputData) / sizeof(outputData[0]), 0.0);
+
+  // Pooling hyper-parameters.
+  const int32_t padding_left = 0, padding_right = 1;
+  const int32_t padding_top = 0, padding_bottom = 1;
+  const int32_t stride_width = 2, stride_height = 2;
+  const int32_t filter_width = 2, filter_height = 2;
+  const int32_t activation = ANEURALNETWORKS_FUSED_NONE;
+
+  const bool ran = maxPoolFloat32(input.ptr<float>(), inputShape,
+                                  padding_left, padding_right,
+                                  padding_top, padding_bottom,
+                                  stride_width, stride_height,
+                                  filter_width, filter_height,
+                                  activation,
+                                  outputData, outputShape);
+  EXPECT_TRUE(ran);
+
+  // Maximum of each (partially padded) 2x2 window.
+  float expectData[] = {
+    5.0f, 6.0f,
+    8.0f, 9.0f
+  };
+  const bool matched = util::compareData(outputData, expectData, outputShape);
+  EXPECT_TRUE(matched);
+}
+
+TEST(KernelACL_TC, maxPoolFloat32_147x147to73x73)
+{
+  // 3x3 max pooling with stride 2 and no padding from a {1,147,147,64} tensor to a
+  // {1,73,73,64} tensor. The input generator always returns 1, so every output element
+  // is expected to be 1; the output generator returns 0.
+  util::TensorWrapper input({1,147,147,64});
+  util::TensorWrapper output({1,73,73,64});
+
+  // Constant generators; `ones` is reused for the expected tensor further down.
+  const auto ones = [](uint32_t n, uint32_t c, uint32_t h, uint32_t w) {
+    return 1.0f;
+  };
+  const auto zeros = [](uint32_t n, uint32_t c, uint32_t h, uint32_t w) {
+    return 0.f;
+  };
+  input.initValue(ones);
+  output.initValue(zeros);
+
+  // Pooling hyper-parameters.
+  const int32_t padding_left = 0, padding_right = 0;
+  const int32_t padding_top = 0, padding_bottom = 0;
+  const int32_t stride_width = 2, stride_height = 2;
+  const int32_t filter_width = 3, filter_height = 3;
+  const int32_t activation = ANEURALNETWORKS_FUSED_NONE;
+
+  const bool ran = maxPoolFloat32(input.ptr<float>(), input.shape(),
+                                  padding_left, padding_right,
+                                  padding_top, padding_bottom,
+                                  stride_width, stride_height,
+                                  filter_width, filter_height,
+                                  activation,
+                                  output.ptr<float>(), output.shape());
+  EXPECT_TRUE(ran);
+
+  // The maximum over any window of ones is 1.
+  util::TensorWrapper expected({1,73,73,64});
+  expected.initValue(ones);
+
+  EXPECT_EQ(output, expected);
+}
+
+TEST(KernelACL_TC, maxPoolFloat32_71x71to35x35)
+{
+  // 3x3 max pooling with stride 2 and no padding from a {1,71,71,192} tensor to a
+  // {1,35,35,192} tensor. The input generator always returns 1, so every output element
+  // is expected to be 1; the output generator returns 0.
+  util::TensorWrapper input({1,71,71,192});
+  util::TensorWrapper output({1,35,35,192});
+
+  // Constant generators; `ones` is reused for the expected tensor further down.
+  const auto ones = [](uint32_t n, uint32_t c, uint32_t h, uint32_t w) {
+    return 1.0f;
+  };
+  const auto zeros = [](uint32_t n, uint32_t c, uint32_t h, uint32_t w) {
+    return 0.f;
+  };
+  input.initValue(ones);
+  output.initValue(zeros);
+
+  // Pooling hyper-parameters.
+  const int32_t padding_left = 0, padding_right = 0;
+  const int32_t padding_top = 0, padding_bottom = 0;
+  const int32_t stride_width = 2, stride_height = 2;
+  const int32_t filter_width = 3, filter_height = 3;
+  const int32_t activation = ANEURALNETWORKS_FUSED_NONE;
+
+  const bool ran = maxPoolFloat32(input.ptr<float>(), input.shape(),
+                                  padding_left, padding_right,
+                                  padding_top, padding_bottom,
+                                  stride_width, stride_height,
+                                  filter_width, filter_height,
+                                  activation,
+                                  output.ptr<float>(), output.shape());
+  EXPECT_TRUE(ran);
+
+  // The maximum over any window of ones is 1.
+  util::TensorWrapper expected({1,35,35,192});
+  expected.initValue(ones);
+
+  EXPECT_EQ(output, expected);
+}
+
+TEST(KernelACL_TC, averagePoolFloat32_3x3to1x1)
+{
+  // A 3x3 average-pooling window over a 3x3 input without padding yields one value.
+  // Assuming initValue calls the generator once per element, the input holds 1..9 and
+  // the mean is 45 / 9 = 5 regardless of the fill order.
+  const nnfw::rt::Shape inputShape = { OperandType::FLOAT32, {1,3,3,1}, 1.0, 0 };
+  const nnfw::rt::Shape outputShape = { OperandType::FLOAT32, {1,1,1,1}, 1.0, 0 };
+
+  // Input values come from the counting generator; outputData is reset via util::initData.
+  util::TensorWrapper input({1,3,3,1});
+  float next = 1.0f;
+  input.initValue([&next](uint32_t n, uint32_t c, uint32_t h, uint32_t w) {
+    return next++;
+  });
+
+  float outputData[1];
+  util::initData(outputData, sizeof(outputData) / sizeof(outputData[0]), 0.0);
+
+  // Pooling hyper-parameters.
+  const int32_t padding_left = 0, padding_right = 0;
+  const int32_t padding_top = 0, padding_bottom = 0;
+  const int32_t stride_width = 1, stride_height = 1;
+  const int32_t filter_width = 3, filter_height = 3;
+  const int32_t activation = ANEURALNETWORKS_FUSED_NONE;
+
+  const bool ran = averagePoolFloat32(input.ptr<float>(), inputShape,
+                                      padding_left, padding_right,
+                                      padding_top, padding_bottom,
+                                      stride_width, stride_height,
+                                      filter_width, filter_height,
+                                      activation,
+                                      outputData, outputShape);
+  EXPECT_TRUE(ran);
+
+  // Mean of the nine generated values.
+  float expectData[] = { 5.0f };
+  const bool matched = util::compareData(outputData, expectData, outputShape);
+  EXPECT_TRUE(matched);
+}
+
+TEST(KernelACL_TC, averagePoolFloat32_3x3to1x1_RELU)
+{
+  // Single-window average pooling with a generator counting down from 3. Assuming one
+  // call per element, the input holds 3, 2, ..., -5, whose mean is -1; the fused RELU
+  // is expected to turn that into 0.
+  const nnfw::rt::Shape inputShape = { OperandType::FLOAT32, {1,3,3,1}, 1.0, 0 };
+  const nnfw::rt::Shape outputShape = { OperandType::FLOAT32, {1,1,1,1}, 1.0, 0 };
+
+  // Input values come from the counting generator; outputData is reset via util::initData.
+  util::TensorWrapper input({1,3,3,1});
+  float next = 3.0f;
+  input.initValue([&next](uint32_t n, uint32_t c, uint32_t h, uint32_t w) {
+    return next--;
+  });
+
+  float outputData[1];
+  util::initData(outputData, sizeof(outputData) / sizeof(outputData[0]), 0.0);
+
+  // Pooling hyper-parameters.
+  const int32_t padding_left = 0, padding_right = 0;
+  const int32_t padding_top = 0, padding_bottom = 0;
+  const int32_t stride_width = 1, stride_height = 1;
+  const int32_t filter_width = 3, filter_height = 3;
+  const int32_t activation = ANEURALNETWORKS_FUSED_RELU;
+
+  const bool ran = averagePoolFloat32(input.ptr<float>(), inputShape,
+                                      padding_left, padding_right,
+                                      padding_top, padding_bottom,
+                                      stride_width, stride_height,
+                                      filter_width, filter_height,
+                                      activation,
+                                      outputData, outputShape);
+  EXPECT_TRUE(ran);
+
+  // Negative mean clamped by the activation.
+  float expectData[] = { 0.0f };
+  const bool matched = util::compareData(outputData, expectData, outputShape);
+  EXPECT_TRUE(matched);
+}
+
+TEST(KernelACL_TC, averagePoolFloat32_3x3to2x2)
+{
+  // 2x2 average pooling with unit stride and no padding over a 3x3 input, giving a 2x2
+  // output. The expected values assume initValue fills the input row by row with 1..9
+  // (TODO confirm the fill order).
+  const nnfw::rt::Shape inputShape = { OperandType::FLOAT32, {1,3,3,1}, 1.0, 0 };
+  const nnfw::rt::Shape outputShape = { OperandType::FLOAT32, {1,2,2,1}, 1.0, 0 };
+
+  // Input values come from the counting generator; outputData is reset via util::initData.
+  util::TensorWrapper input({1,3,3,1});
+  float next = 1.0f;
+  input.initValue([&next](uint32_t n, uint32_t c, uint32_t h, uint32_t w) {
+    return next++;
+  });
+
+  float outputData[4];
+  util::initData(outputData, sizeof(outputData) / sizeof(outputData[0]), 0.0);
+
+  // Pooling hyper-parameters.
+  const int32_t padding_left = 0, padding_right = 0;
+  const int32_t padding_top = 0, padding_bottom = 0;
+  const int32_t stride_width = 1, stride_height = 1;
+  const int32_t filter_width = 2, filter_height = 2;
+  const int32_t activation = ANEURALNETWORKS_FUSED_NONE;
+
+  const bool ran = averagePoolFloat32(input.ptr<float>(), inputShape,
+                                      padding_left, padding_right,
+                                      padding_top, padding_bottom,
+                                      stride_width, stride_height,
+                                      filter_width, filter_height,
+                                      activation,
+                                      outputData, outputShape);
+  EXPECT_TRUE(ran);
+
+  // Mean of each 2x2 window, e.g. (1 + 2 + 4 + 5) / 4 = 3 for the top-left one.
+  float expectData[] = {
+    3.0f, 4.0f,
+    6.0f, 7.0f
+  };
+  const bool matched = util::compareData(outputData, expectData, outputShape);
+  EXPECT_TRUE(matched);
+}
+
+TEST(KernelACL_TC, averagePoolFloat32_3x3to3x3)
+{
+  // 3x3 average pooling with unit stride and one element of padding on every side, so
+  // the {1,3,3,1} input and output share the same dimensions.
+  std::vector<uint32_t> dims = {1,3,3,1};
+  util::TensorWrapper input(dims);
+  util::TensorWrapper output(dims);
+
+  int32_t padding_left = 1, padding_right = 1;
+  int32_t padding_top = 1, padding_bottom = 1;
+  int32_t stride_width = 1, stride_height = 1;
+  int32_t filter_width = 3, filter_height = 3;
+  int32_t activation = ANEURALNETWORKS_FUSED_NONE;
+
+  // The counter is a float so the generator returns the same type as the sibling tests'
+  // generators; it yields 1, 2, ..., 9 on successive calls.
+  float value = 1.0f;
+  input.initValue([&value](uint32_t n, uint32_t c, uint32_t h, uint32_t w) {
+    return value++;
+  });
+
+  output.initValue([](uint32_t n, uint32_t c, uint32_t h, uint32_t w) {
+    return 0.f;
+  });
+
+  bool bret = averagePoolFloat32(input.ptr<float>(), input.shape(),
+                                 padding_left, padding_right,
+                                 padding_top, padding_bottom,
+                                 stride_width, stride_height,
+                                 filter_width, filter_height,
+                                 activation,
+                                 output.ptr<float>(), output.shape());
+  EXPECT_TRUE(bret);
+
+  // The expected generator adds 0.5 before returning, producing 3.0, 3.5, ..., 7.0.
+  util::TensorWrapper expected(dims);
+  float v = 2.5f;
+  expected.initValue([&v](uint32_t n, uint32_t c, uint32_t h, uint32_t w) {
+    v = v + 0.5f;
+    return v;
+  });
+
+  EXPECT_EQ(output, expected);
+}
+
+TEST(KernelACL_TC, averagePoolFloat32_35x35to35x35)
+{
+  // Padded 3x3 average pooling, unit stride; input generator returns 1, expected output is 1.
+  std::vector<uint32_t> dims = {1,35,35,768};
+  util::TensorWrapper input(dims);
+  util::TensorWrapper output(dims);
+
+  int32_t padding_left = 1;
+  int32_t padding_right = 1;
+  int32_t padding_top = 1;
+  int32_t padding_bottom = 1;
+  int32_t stride_width = 1;
+  int32_t stride_height = 1;
+  int32_t filter_width = 3;
+  int32_t filter_height = 3;
+
+  input.initValue([](uint32_t n, uint32_t c, uint32_t h, uint32_t w) {
+    return 1.0f;
+  });
+
+  output.initValue([](uint32_t n, uint32_t c, uint32_t h, uint32_t w) {
+    return 0.f;
+  });
+
+  int32_t activation = ANEURALNETWORKS_FUSED_NONE;
+
+  bool bret = averagePoolFloat32(input.ptr<float>(), input.shape(),
+                                 padding_left, padding_right,
+                                 padding_top, padding_bottom,
+                                 stride_width, stride_height,
+                                 filter_width, filter_height,
+                                 activation,
+                                 output.ptr<float>(), output.shape());
+  EXPECT_TRUE(bret);
+
+  util::TensorWrapper expected(dims);
+  expected.initValue([](uint32_t n, uint32_t c, uint32_t h, uint32_t w) {
+    return 1.0f;
+  });
+
+  EXPECT_EQ(output, expected);
+}
+
+TEST(KernelACL_TC, averagePoolFloat32_8x8to1x1)
+{
+  // 8x8 average pooling (stride 2, no padding) reducing a {1,8,8,2048} tensor to
+  // {1,1,1,2048}. The input generator always returns 1, so every output element is
+  // expected to be 1; the output generator returns 0.
+  util::TensorWrapper input({1,8,8,2048});
+  util::TensorWrapper output({1,1,1,2048});
+
+  // Constant generators; `ones` is reused for the expected tensor further down.
+  const auto ones = [](uint32_t n, uint32_t c, uint32_t h, uint32_t w) {
+    return 1.0f;
+  };
+  const auto zeros = [](uint32_t n, uint32_t c, uint32_t h, uint32_t w) {
+    return 0.f;
+  };
+  input.initValue(ones);
+  output.initValue(zeros);
+
+  // Pooling hyper-parameters.
+  const int32_t padding_left = 0, padding_right = 0;
+  const int32_t padding_top = 0, padding_bottom = 0;
+  const int32_t stride_width = 2, stride_height = 2;
+  const int32_t filter_width = 8, filter_height = 8;
+  const int32_t activation = ANEURALNETWORKS_FUSED_NONE;
+
+  const bool ran = averagePoolFloat32(input.ptr<float>(), input.shape(),
+                                      padding_left, padding_right,
+                                      padding_top, padding_bottom,
+                                      stride_width, stride_height,
+                                      filter_width, filter_height,
+                                      activation,
+                                      output.ptr<float>(), output.shape());
+  EXPECT_TRUE(ran);
+
+  // The mean over any window of ones is 1.
+  util::TensorWrapper expected({1,1,1,2048});
+  expected.initValue(ones);
+
+  EXPECT_EQ(output, expected);
+}
diff --git a/libs/kernel/acl/src/cl/Reshape.cpp b/libs/kernel/acl/src/cl/Reshape.cpp
new file mode 100644
index 000000000..e420ab92b
--- /dev/null
+++ b/libs/kernel/acl/src/cl/Reshape.cpp
@@ -0,0 +1,43 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <OperationsUtils.h>
+#include <arm_compute/core/TensorShape.h>
+#include <arm_compute/core/TensorInfo.h>
+
+// TODO: fix include path in CMakeFiles
+#include "../IO_accessor.h"
+#include "../shape.h"
+#include "../CLUniqueTensor.h"
+#include "../Reshape.h"
+
+namespace nnfw {
+namespace kernel {
+namespace acl {
+
+// Sync hook handed to common::reshapeGeneric below; it calls sync() on the
+// global CL scheduler.
+static void sync_scheduler() { arm_compute::CLScheduler::get().sync(); }
+
+bool reshapeGeneric(const void* inputData, const nnfw::rt::Shape& inputShape,
+                    void* outputData, const nnfw::rt::Shape& outputShape) {
+  // OpenCL entry point: forwards to the shared template with the CL tensor and layer types.
+  return common::reshapeGeneric<CLUniqueTensor, arm_compute::CLReshapeLayer>(
+      inputData, inputShape, outputData, outputShape, sync_scheduler);
+}
+
+} // namespace acl
+} // namespace kernel
+} // namespace nnfw
diff --git a/libs/kernel/acl/src/cl/Reshape.test.cpp b/libs/kernel/acl/src/cl/Reshape.test.cpp
new file mode 100644
index 000000000..db23a6d3d
--- /dev/null
+++ b/libs/kernel/acl/src/cl/Reshape.test.cpp
@@ -0,0 +1,20 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#define ACL_CORE_FUNC_NAME reshapeGeneric  // entry point selected for this translation unit
+#define ACL_TEST(tc, t) TEST(tc, cl_##t)   // test names get a cl_ prefix
+// NOTE(review): both macros are presumably consumed by the header below - confirm.
+#include "../Reshape.test.h"
diff --git a/libs/kernel/acl/src/cl/Softmax.cpp b/libs/kernel/acl/src/cl/Softmax.cpp
new file mode 100644
index 000000000..a628f05fe
--- /dev/null
+++ b/libs/kernel/acl/src/cl/Softmax.cpp
@@ -0,0 +1,78 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <OperationsUtils.h>
+#include <NeuralNetworks.h>
+
+#include <arm_compute/core/TensorShape.h>
+#include <arm_compute/core/TensorInfo.h>
+#include "../IO_accessor.h"
+#include "../shape.h"
+#include "../CLUniqueTensor.h"
+#include "../util.h"
+
+namespace nnfw {
+namespace kernel {
+namespace acl {
+
+bool softmaxFloat32(const float* inputData, const nnfw::rt::Shape& inputShape,
+                    const float beta,
+                    float* outputData, const nnfw::rt::Shape& outputShape)
+{
+  // Softmax on the OpenCL backend. inputData/outputData are float buffers
+  // described by inputShape/outputShape; beta is passed to the layer.
+  // Returns false (and asserts) when the input rank is neither 2 nor 4.
+  // The rank is checked first so the failure path creates no CL tensors.
+  const auto rank = inputShape.dimensions.size();
+  if (rank != 4 && rank != 2)
+  {
+    assert("undefined dimension of input" && 0);
+    return false;
+  }
+
+  arm_compute::TensorShape input_shape = util::fromNNShape(inputShape);
+  arm_compute::TensorShape output_shape = util::fromNNShape(outputShape);
+
+  CLUniqueTensor input(arm_compute::TensorInfo(input_shape, arm_compute::Format::F32));
+  CLUniqueTensor output(arm_compute::TensorInfo(output_shape, arm_compute::Format::F32));
+
+  // Configure the layer first, then allocate the tensors.
+  arm_compute::CLSoftmaxLayer softmax_f;
+  softmax_f.configure(input.ptr(), output.ptr(), beta);
+  input.allocate();
+  output.allocate();
+
+  // Hand the input buffer to the accessor that matches its rank.
+  if (rank == 4)
+    TensorAccess<InputAccessor>(input.ref(), inputData, inputShape);
+  else
+    TensorAccess<MatrixInputAccessor>(input.ref(), inputData, inputShape);
+
+  softmax_f.run();
+  arm_compute::CLScheduler::get().sync();
+
+  // Fetch the result through the matching output accessor.
+  if (rank == 4)
+    TensorAccess<OutputAccessor>(output.ref(), outputData, outputShape);
+  else
+    TensorAccess<MatrixOutputAccessor>(output.ref(), outputData, outputShape);
+
+  return true;
+}
+
+} // namespace acl
+} // namespace kernel
+} // namespace nnfw
diff --git a/libs/kernel/acl/src/cl/Softmax.test.cpp b/libs/kernel/acl/src/cl/Softmax.test.cpp
new file mode 100644
index 000000000..8ee8b41e2
--- /dev/null
+++ b/libs/kernel/acl/src/cl/Softmax.test.cpp
@@ -0,0 +1,105 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <gtest/gtest.h>
+#include <OperationsUtils.h>
+#include <kernel/acl/nnfw_kernel_acl.h>
+#include <arm_compute/core/Types.h>
+#include <kernel/acl/Softmax.h>
+
+#include "../util.h"
+
+using namespace nnfw::kernel::acl;
+
+TEST(KernelACL_TC, softmaxFloat32_1xn)
+{
+  // Rank-2 {1,4} input set up through util::initData(..., 1.0); expects 0.25 everywhere.
+  const nnfw::rt::Shape inputShape = { OperandType::FLOAT32, {1,4}, 1.0, 0 };
+  const nnfw::rt::Shape outputShape = { OperandType::FLOAT32, {1,4}, 1.0, 0 };
+  const float beta = 1.0f;
+  float inputData[4];
+  float outputData[4];
+
+  util::initData(inputData, sizeof(inputData) / sizeof(*inputData), 1.0);
+  util::initData(outputData, sizeof(outputData) / sizeof(*outputData), 0.0);
+
+  bool bret = softmaxFloat32(inputData, inputShape, beta, outputData, outputShape);
+  EXPECT_EQ(bret, true);
+
+  float expectData[] = { 0.25f, 0.25f, 0.25f, 0.25f };
+  bret = util::compareData(outputData, expectData, outputShape);
+  EXPECT_EQ(bret, true);
+}
+
+TEST(KernelACL_TC, softmaxFloat32_4d)
+{
+  // Rank-4 {1,1,4,1} input set up through util::initData(..., 1.0); expects 0.25 everywhere.
+  const nnfw::rt::Shape inputShape = { OperandType::FLOAT32, {1,1,4,1}, 1.0, 0 };
+  const nnfw::rt::Shape outputShape = { OperandType::FLOAT32, {1,1,4,1}, 1.0, 0 };
+  const float beta = 1.0f;
+  float inputData[4];
+  float outputData[4];
+
+  util::initData(inputData, sizeof(inputData) / sizeof(*inputData), 1.0);
+  util::initData(outputData, sizeof(outputData) / sizeof(*outputData), 0.0);
+
+  bool bret = softmaxFloat32(inputData, inputShape, beta, outputData, outputShape);
+  EXPECT_EQ(bret, true);
+
+  float expectData[] = { 0.25f, 0.25f, 0.25f, 0.25f };
+  bret = util::compareData(outputData, expectData, outputShape);
+  EXPECT_EQ(bret, true);
+}
+
+TEST(KernelACL_TC, softmaxFloat32_1xn_seq)
+{
+  // Rank-2 case; the reference equals softmax(1,2,3,4), so initData_Increasing presumably yields 1..4.
+  const nnfw::rt::Shape inputShape = { OperandType::FLOAT32, {1,4}, 1.0, 0 };
+  const nnfw::rt::Shape outputShape = { OperandType::FLOAT32, {1,4}, 1.0, 0 };
+  const float beta = 1.0f;
+  float inputData[4];
+  float outputData[4];
+
+  util::initData_Increasing(inputData, sizeof(inputData) / sizeof(*inputData), 1.0);
+  util::initData(outputData, sizeof(outputData) / sizeof(*outputData), 0.0);
+
+  bool bret = softmaxFloat32(inputData, inputShape, beta, outputData, outputShape);
+  EXPECT_EQ(bret, true);
+
+  float expectData[] = {0.032058603280085, 0.0871443187420326, 0.23688281808991, 0.643914259887972};
+  bret = util::compareData(outputData, expectData, outputShape);
+  EXPECT_EQ(bret, true);
+}
+
+TEST(KernelACL_TC, softmaxFloat32_4d_seq)
+{
+  // Rank-4 case; the reference equals softmax(1,2,3,4), so initData_Increasing presumably yields 1..4.
+  const nnfw::rt::Shape inputShape = { OperandType::FLOAT32, {1,1,4,1}, 1.0, 0 };
+  const nnfw::rt::Shape outputShape = { OperandType::FLOAT32, {1,1,4,1}, 1.0, 0 };
+  const float beta = 1.0f;
+  float inputData[4];
+  float outputData[4];
+
+  util::initData_Increasing(inputData, sizeof(inputData) / sizeof(*inputData), 1.0);
+  util::initData(outputData, sizeof(outputData) / sizeof(*outputData), 0.0);
+
+  bool bret = softmaxFloat32(inputData, inputShape, beta, outputData, outputShape);
+  EXPECT_EQ(bret, true);
+
+  float expectData[] = {0.032058603280085, 0.0871443187420326, 0.23688281808991, 0.643914259887972};
+  bret = util::compareData(outputData, expectData, outputShape);
+  EXPECT_EQ(bret, true);
+}
diff --git a/libs/kernel/acl/src/gtest_env.cpp b/libs/kernel/acl/src/gtest_env.cpp
new file mode 100644
index 000000000..f6fc52f7a
--- /dev/null
+++ b/libs/kernel/acl/src/gtest_env.cpp
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <gtest/gtest.h>
+#include <kernel/acl/nnfw_kernel_acl.h>
+
+// Global gtest environment: initializes the ACL kernel library in SetUp().
+class TestEnvironment : public ::testing::Environment
+{
+public:
+  ~TestEnvironment() override = default;
+
+  // `override` makes the compiler reject a misspelled or mismatched hook.
+  void SetUp() override { nnfw::kernel::acl::Initialize(); }
+
+  // Nothing is released here; kept as an explicit override of the base hook.
+  void TearDown() override
+  {
+    // DO NOTHING
+  }
+};
+
+static ::testing::Environment* const testingenv =  // namespace-scope initializer: registration runs at static-init time
+  ::testing::AddGlobalTestEnvironment(new TestEnvironment);
diff --git a/libs/kernel/acl/src/neon/Concatenation.cpp b/libs/kernel/acl/src/neon/Concatenation.cpp
new file mode 100644
index 000000000..8738a9d12
--- /dev/null
+++ b/libs/kernel/acl/src/neon/Concatenation.cpp
@@ -0,0 +1,105 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <OperationsUtils.h>
+#include <arm_compute/core/TensorShape.h>
+#include <arm_compute/core/TensorInfo.h>
+
+#include <cassert>
+
+// TODO: fix include path in CMakeFiles
+#include "../IO_accessor.h"
+#include "../shape.h"
+
+namespace nnfw {
+namespace kernel {
+namespace acl {
+namespace neon {
+
+bool concatenationFloat32(const std::vector<const float*>& inputDataPtrs,
+                          const std::vector<nnfw::rt::Shape>& inputShapes, int32_t axis,
+                          float* outputData, const nnfw::rt::Shape& outputShape)
+{
+  // Concatenates float tensors with NEDepthConcatenateLayer.
+  //   inputDataPtrs - one source buffer per input tensor
+  //   inputShapes   - shape of each input, in the same order as inputDataPtrs
+  //   axis          - concatenation axis; only 3 is accepted
+  //   outputData    - destination buffer described by outputShape
+  // Returns true on success and false when the arguments are rejected.
+  if (axis != 3)
+  {
+    assert("Only support axis=3 for ACL" && 0);
+    return false;
+  }
+  // The two vectors are walked in lock-step below, so a size mismatch would
+  // index past the end of the shorter one when asserts are compiled out.
+  assert(inputDataPtrs.size() == inputShapes.size());
+  if (inputDataPtrs.size() != inputShapes.size())
+  {
+    return false;
+  }
+  const size_t num_inputs = inputDataPtrs.size();
+
+  // Owning handles: the tensor objects are deleted even if an ACL call throws.
+  std::vector<std::unique_ptr<arm_compute::Tensor>> inputs;
+  std::vector<arm_compute::ITensor*> inputIptrs;
+  arm_compute::Tensor output;
+
+  // init Tensors
+  for (size_t i = 0; i < num_inputs; ++i)
+  {
+    arm_compute::TensorShape input_shape = util::fromNNShape(inputShapes[i]);
+    std::unique_ptr<arm_compute::Tensor> input(new arm_compute::Tensor());
+
+    input->allocator()->init(arm_compute::TensorInfo(input_shape, arm_compute::Format::F32));
+    inputIptrs.push_back(input.get());
+    inputs.push_back(std::move(input));
+  }
+  arm_compute::TensorShape output_shape = util::fromNNShape(outputShape);
+  output.allocator()->init(arm_compute::TensorInfo(output_shape, arm_compute::Format::F32));
+
+  // prepare ACL Concatenate and configure tensors
+  auto concat = std::make_shared<arm_compute::NEDepthConcatenateLayer>();
+  concat->configure(inputIptrs, &output);
+
+  // allocate Tensors and pass each input buffer to its tensor
+  for (size_t i = 0; i < num_inputs; ++i)
+  {
+    inputs[i]->allocator()->allocate();
+    TensorAccess<InputAccessor>(*inputs[i], inputDataPtrs[i], inputShapes[i]);
+  }
+  output.allocator()->allocate();
+
+  // run
+  concat->run();
+
+  // get output
+  TensorAccess<OutputAccessor>(output, outputData, outputShape);
+
+  // cleanup: free the buffers here; unique_ptr deletes the Tensor objects on return
+  for (auto& input : inputs)
+  {
+    input->allocator()->free();
+  }
+  output.allocator()->free();
+
+  return true;
+}
+
+} // namespace neon
+} // namespace acl
+} // namespace kernel
+} // namespace nnfw
diff --git a/libs/kernel/acl/src/neon/Concatenation.test.cpp b/libs/kernel/acl/src/neon/Concatenation.test.cpp
new file mode 100644
index 000000000..03b05bd24
--- /dev/null
+++ b/libs/kernel/acl/src/neon/Concatenation.test.cpp
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <gtest/gtest.h>
+#include <OperationsUtils.h>
+#include <kernel/acl/nnfw_kernel_acl.h>
+#include <kernel/acl/Concatenation.h>
+
+// TODO: fix include path in CMakeFiles
+#include "../util.h"
+
+using namespace nnfw::kernel::acl;
+
+TEST(KernelACL_TC, neon_concatFloat32_1)
+{
+  // Two {1,2,3,1} inputs joined along axis 3 into a {1,2,3,2} output.
+  float inputData_1[6] = {
+    1, 2, 3, 4, 5, 6      // [ [ [1],[2],[3] ], [ [4],[5],[6] ] ]
+  };
+  float inputData_2[6] = {
+    7, 8, 9, 10, 11, 12   // [ [ [7],[8],[9] ], [ [10],[11],[12] ] ]
+  };
+
+  const nnfw::rt::Shape inputShape_1 = { OperandType::FLOAT32, {1,2,3,1}, 1.0, 0 };
+  const nnfw::rt::Shape inputShape_2 = { OperandType::FLOAT32, {1,2,3,1}, 1.0, 0 };
+  const nnfw::rt::Shape outputShape = { OperandType::FLOAT32, {1,2,3,2}, 1.0, 0 };
+
+  const std::vector<const float*> inputDataPtrs = { inputData_1, inputData_2 };
+  const std::vector<nnfw::rt::Shape> inputShapes = { inputShape_1, inputShape_2 };
+
+  float outputData[12];
+  util::initData(outputData, sizeof(outputData) / sizeof(*outputData), 0.0);
+
+  bool bret = neon::concatenationFloat32(inputDataPtrs, inputShapes, 3,
+                                         outputData, outputShape);
+  EXPECT_EQ(bret, true);
+
+  // The reference is written out input by input and run through
+  // util::NCHW2NHWC before it is compared with outputData.
+  float expectNCHW[] = {
+    1, 2, 3, 4, 5, 6,
+    7, 8, 9, 10, 11, 12
+  };
+  float expectData[12]; // [ [ [1,7],[2,8],[3,9] ], [ [4,10],[5,11],[6,12] ] ]
+  util::NCHW2NHWC(expectNCHW, expectData, outputShape);
+
+  bret = util::compareData(outputData, expectData, outputShape);
+  EXPECT_EQ(bret, true);
+}
diff --git a/libs/kernel/acl/src/neon/Conv2D.cpp b/libs/kernel/acl/src/neon/Conv2D.cpp
new file mode 100644
index 000000000..679ecfced
--- /dev/null
+++ b/libs/kernel/acl/src/neon/Conv2D.cpp
@@ -0,0 +1,111 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <OperationsUtils.h>
+#include <NeuralNetworks.h>
+
+#include <arm_compute/core/TensorShape.h>
+#include <arm_compute/core/TensorInfo.h>
+
+#include <util/environment.h>
+
+#include "../IO_accessor.h"
+#include "../util.h"
+#include "../shape.h"
+#include "../NEUniqueTensor.h"
+#include "../support.h"
+
+#include "util/feature/TextFormatter.h"
+
+#include "support/nnapi/feature/Reader.h"
+
+namespace nnfw {
+namespace kernel {
+namespace acl {
+namespace neon {
+
+static int verbose = 0; // handed to IntAccessor("CONV2D_VERBOSE") in convFloat32(); non-zero enables its IFM dump
+
+bool convFloat32(const float* inputData, const nnfw::rt::Shape& inputShape,
+                 const float* filterData, const nnfw::rt::Shape& filterShape,
+                 const float* biasData, const nnfw::rt::Shape& biasShape,
+                 int32_t padding_left, int32_t padding_right,
+                 int32_t padding_top, int32_t padding_bottom,
+                 int32_t stride_width, int32_t stride_height,
+                 int32_t activation,
+                 float* outputData, const nnfw::rt::Shape& outputShape)
+{
+  // Float convolution through NEConvolutionLayer, followed by whatever
+  // insertFusedActivationLayer() appends for `activation`. Always returns true.
+  const arm_compute::TensorShape input_shape = util::fromNNShape(inputShape);
+  const arm_compute::TensorShape filter_shape = util::fromNNShape(filterShape);
+  const arm_compute::TensorShape bias_shape = util::fromVectorNNShape(biasShape);
+  const arm_compute::TensorShape output_shape = util::fromNNShape(outputShape);
+  const arm_compute::PadStrideInfo conv_info(stride_width, stride_height,
+                                             padding_left, padding_right,
+                                             padding_top, padding_bottom,
+                                             arm_compute::DimensionRoundingType::FLOOR);
+
+  NEUniqueTensor input(arm_compute::TensorInfo(input_shape, arm_compute::Format::F32));
+  NEUniqueTensor output(arm_compute::TensorInfo(output_shape, arm_compute::Format::F32));
+  NEUniqueTensor bias(arm_compute::TensorInfo(bias_shape, arm_compute::Format::F32));
+  NEUniqueTensor filter(arm_compute::TensorInfo(filter_shape, arm_compute::Format::F32));
+
+  // Functions are queued in execution order: the convolution goes first.
+  std::vector<std::shared_ptr<arm_compute::IFunction>> fns;
+  auto conv_f = std::make_shared<arm_compute::NEConvolutionLayer>();
+  conv_f->configure(input.ptr(), filter.ptr(), bias.ptr(), output.ptr(), conv_info);
+  fns.push_back(conv_f);
+
+  util::insertFusedActivationLayer<NEUniqueTensor, arm_compute::NEActivationLayer>(output, activation, fns);
+
+  // Tensors are allocated only after the functions have been configured.
+  input.allocate();
+  output.allocate();
+  bias.allocate();
+  filter.allocate();
+
+  TensorAccess<InputAccessor>(input.ref(), inputData, inputShape);
+  TensorAccess<BiasAccessor>(bias.ref(), biasData, biasShape);
+  TensorAccess<WeightAccessor>(filter.ref(), filterData, filterShape);
+
+  // Debug aid: a non-zero `verbose` prints the input feature map via both readers.
+  nnfw::util::env::IntAccessor("CONV2D_VERBOSE").access(verbose);
+  if (verbose)
+  {
+    auto ifm_shape = nnfw::support::nnapi::feature::asFeatureShape(inputShape);
+    nnfw::support::nnapi::feature::Reader<float> nnapi_ifm_reader{ifm_shape, inputData};
+    nnfw::support::acl::feature::Reader<float> acl_ifm_reader{ input.ptr() };
+    std::cout << "NNAPI IFM:" << std::endl
+              << nnfw::util::feature::TextFormatter<float>{ifm_shape, nnapi_ifm_reader} << std::endl;
+    std::cout << "ARM Compute IFM:" << std::endl
+              << nnfw::util::feature::TextFormatter<float>{ifm_shape, acl_ifm_reader} << std::endl;
+  }
+
+  for (const auto &fn : fns)
+  {
+    fn->run();
+  }
+
+  TensorAccess<OutputAccessor>(output.ref(), outputData, outputShape);
+
+  return true;
+}
+
+} // namespace neon
+} // namespace acl
+} // namespace kernel
+} // namespace nnfw
diff --git a/libs/kernel/acl/src/neon/Conv2D.test.cpp b/libs/kernel/acl/src/neon/Conv2D.test.cpp
new file mode 100644
index 000000000..6a3de1c43
--- /dev/null
+++ b/libs/kernel/acl/src/neon/Conv2D.test.cpp
@@ -0,0 +1,202 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <gtest/gtest.h>
+#include <OperationsUtils.h>
+#include <kernel/acl/nnfw_kernel_acl.h>
+#include <kernel/acl/Conv2D.h>
+
+// TODO: fix include path in CMakeFiles
+#include "../util.h"
+
+using namespace nnfw::kernel::acl;
+
+TEST(KernelACL_TC, neon_convFloat32_3x3to1x1)
+{
+  // 3x3 input and 3x3 filter set up through util::initData(..., 1.0), no
+  // padding, bias 1: the single output element is expected to be 9 + 1 = 10.
+  const nnfw::rt::Shape inputShape = { OperandType::FLOAT32, {1,3,3,1}, 1.0, 0 };
+  const nnfw::rt::Shape filterShape = { OperandType::FLOAT32, {1,3,3,1}, 1.0, 0 };
+  const nnfw::rt::Shape biasShape = { OperandType::FLOAT32, {1}, 1.0, 0 };
+  const nnfw::rt::Shape outputShape = { OperandType::FLOAT32, {1,1,1,1}, 1.0, 0 };
+
+  float inputData[9];
+  float filterData[9];
+  float biasData[1] = { 1.0 };
+  float outputData[1];
+
+  util::initData(inputData, sizeof(inputData) / sizeof(*inputData), 1.0);
+  util::initData(filterData, sizeof(filterData) / sizeof(*filterData), 1.0);
+  util::initData(outputData, sizeof(outputData) / sizeof(*outputData), 0.0);
+
+  const int32_t padding_left = 0;
+  const int32_t padding_right = 0;
+  const int32_t padding_top = 0;
+  const int32_t padding_bottom = 0;
+  const int32_t stride_width = 1;
+  const int32_t stride_height = 1;
+  const int32_t activation = static_cast<int32_t>(FusedActivationFunc::RELU);
+
+  bool bret = neon::convFloat32(inputData, inputShape, filterData, filterShape,
+                                biasData, biasShape,
+                                padding_left, padding_right, padding_top, padding_bottom,
+                                stride_width, stride_height, activation,
+                                outputData, outputShape);
+  EXPECT_EQ(bret, true);
+
+  float expectData[] = { 10.0f };
+  bret = util::compareData(outputData, expectData, outputShape);
+  EXPECT_EQ(bret, true);
+}
+
+TEST(KernelACL_TC, neon_convFloat32_3x3to3x3)
+{
+  // 3x3 input and filter set up through util::initData(..., 1.0), padding 1
+  // and bias 1: expected values are 4 + 1, 6 + 1 and 9 + 1 by position.
+  const nnfw::rt::Shape inputShape = { OperandType::FLOAT32, {1,3,3,1}, 1.0, 0 };
+  const nnfw::rt::Shape filterShape = { OperandType::FLOAT32, {1,3,3,1}, 1.0, 0 };
+  const nnfw::rt::Shape biasShape = { OperandType::FLOAT32, {1}, 1.0, 0 };
+  const nnfw::rt::Shape outputShape = { OperandType::FLOAT32, {1,3,3,1}, 1.0, 0 };
+
+  float inputData[9];
+  float filterData[9];
+  float biasData[1] = { 1.0 };
+  float outputData[9];
+
+  util::initData(inputData, sizeof(inputData) / sizeof(*inputData), 1.0);
+  util::initData(filterData, sizeof(filterData) / sizeof(*filterData), 1.0);
+  util::initData(outputData, sizeof(outputData) / sizeof(*outputData), 0.0);
+
+  const int32_t padding_left = 1;
+  const int32_t padding_right = 1;
+  const int32_t padding_top = 1;
+  const int32_t padding_bottom = 1;
+  const int32_t stride_width = 1;
+  const int32_t stride_height = 1;
+  const int32_t activation = static_cast<int32_t>(FusedActivationFunc::RELU);
+
+  bool bret = neon::convFloat32(inputData, inputShape, filterData, filterShape,
+                                biasData, biasShape,
+                                padding_left, padding_right, padding_top, padding_bottom,
+                                stride_width, stride_height, activation,
+                                outputData, outputShape);
+  EXPECT_EQ(bret, true);
+
+  float expectData[] = {
+    5.0f, 7.0f, 5.0f,
+    7.0f, 10.0f, 7.0f,
+    5.0f, 7.0f, 5.0f
+  };
+  bret = util::compareData(outputData, expectData, outputShape);
+  EXPECT_EQ(bret, true);
+}
+
+TEST(KernelACL_TC, neon_convFloat32_3x3to3x3_RELU)
+{
+  // 3x3 input and filter set up through util::initData(..., 1.0), padding 1
+  // and a bias of -5: the corner sums (4 - 5) are negative, and the reference
+  // data expects the fused RELU to turn them into 0.
+  const nnfw::rt::Shape inputShape = { OperandType::FLOAT32, {1,3,3,1}, 1.0, 0 };
+  const nnfw::rt::Shape filterShape = { OperandType::FLOAT32, {1,3,3,1}, 1.0, 0 };
+  const nnfw::rt::Shape biasShape = { OperandType::FLOAT32, {1}, 1.0, 0 };
+  const nnfw::rt::Shape outputShape = { OperandType::FLOAT32, {1,3,3,1}, 1.0, 0 };
+
+  float inputData[9];
+  float filterData[9];
+  float biasData[1] = { -5.0f };
+  float outputData[9];
+
+  util::initData(inputData, sizeof(inputData) / sizeof(*inputData), 1.0);
+  util::initData(filterData, sizeof(filterData) / sizeof(*filterData), 1.0);
+  util::initData(outputData, sizeof(outputData) / sizeof(*outputData), 0.0);
+
+  const int32_t padding_left = 1;
+  const int32_t padding_right = 1;
+  const int32_t padding_top = 1;
+  const int32_t padding_bottom = 1;
+  const int32_t stride_width = 1;
+  const int32_t stride_height = 1;
+  const int32_t activation = static_cast<int32_t>(FusedActivationFunc::RELU);
+
+  bool bret = neon::convFloat32(inputData, inputShape, filterData, filterShape,
+                                biasData, biasShape,
+                                padding_left, padding_right, padding_top, padding_bottom,
+                                stride_width, stride_height, activation,
+                                outputData, outputShape);
+  EXPECT_EQ(bret, true);
+
+  float expectData[] = {
+    0.0f, 1.0f, 0.0f,
+    1.0f, 4.0f, 1.0f,
+    0.0f, 1.0f, 0.0f
+  };
+
+  bret = util::compareData(outputData, expectData, outputShape);
+  EXPECT_EQ(bret, true);
+}
+
+TEST(KernelACL_TC, neon_convFloat32_3x5to3x3)
+{
+  // 3x5 input holding 1..15, two 3x3 filters (all 1s and all 2s), bias 1 and
+  // padding 1; the reference below is listed one filter after the other.
+  float inputData[15] = {
+    1,2,3,4,5,
+    6,7,8,9,10,
+    11,12,13,14,15
+  };
+  float filterData[18] = {
+    1,1,1, 1,1,1, 1,1,1,
+    2,2,2, 2,2,2, 2,2,2
+  };
+  float biasData[2] = { 1.0, 1.0 };
+  float outputData[30];
+  util::initData(outputData, sizeof(outputData) / sizeof(*outputData), 0.0);
+
+  const nnfw::rt::Shape inputShape = { OperandType::FLOAT32, {1,3,5,1}, 1.0, 0 };
+  const nnfw::rt::Shape filterShape = { OperandType::FLOAT32, {2,3,3,1}, 1.0, 0 };
+  const nnfw::rt::Shape biasShape = { OperandType::FLOAT32, {2}, 1.0, 0 };
+  const nnfw::rt::Shape outputShape = { OperandType::FLOAT32, {1,3,5,2}, 1.0, 0 };
+
+  const int32_t padding_left = 1;
+  const int32_t padding_right = 1;
+  const int32_t padding_top = 1;
+  const int32_t padding_bottom = 1;
+  const int32_t stride_width = 1;
+  const int32_t stride_height = 1;
+  const int32_t activation = static_cast<int32_t>(FusedActivationFunc::RELU);
+
+  bool bret = neon::convFloat32(inputData, inputShape, filterData, filterShape,
+                                biasData, biasShape,
+                                padding_left, padding_right, padding_top, padding_bottom,
+                                stride_width, stride_height, activation,
+                                outputData, outputShape);
+  EXPECT_EQ(bret, true);
+
+  float expectNCHW[] = {
+    17.0f, 28.0f, 34.0f, 40.0f, 29.0f,
+    40.0f, 64.0f, 73.0f, 82.0f, 58.0f,
+    37.0f, 58.0f, 64.0f, 70.0f, 49.0f,
+
+    33.0f, 55.0f, 67.0f, 79.0f, 57.0f,
+    79.0f, 127.0f, 145.0f, 163.0f, 115.0f,
+    73.0f, 115.0f, 127.0f, 139.0f, 97.0f
+  };
+  float expectData[30];
+  util::NCHW2NHWC(expectNCHW, expectData, outputShape);
+
+  bret = util::compareData(outputData, expectData, outputShape);
+  EXPECT_EQ(bret, true);
+}
diff --git a/libs/kernel/acl/src/neon/DepthwiseConv2D.cpp b/libs/kernel/acl/src/neon/DepthwiseConv2D.cpp
new file mode 100644
index 000000000..bcf56c667
--- /dev/null
+++ b/libs/kernel/acl/src/neon/DepthwiseConv2D.cpp
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <OperationsUtils.h>
+#include <arm_compute/core/TensorShape.h>
+#include <arm_compute/core/TensorInfo.h>
+#include <arm_compute/runtime/NEON/NEScheduler.h>
+
+#include <cassert>
+
+// TODO: fix include path in CMakeFiles
+#include "../IO_accessor.h"
+#include "../shape.h"
+#include "../NEUniqueTensor.h"
+#include "../DepthwiseConv2D.h"
+
+namespace nnfw {
+namespace kernel {
+namespace acl {
+
+namespace neon {
+// Empty sync hook passed to common::depthwiseConvFloat32 below.
+static void sync_scheduler() {}
+
+bool depthwiseConvFloat32(const float* inputData, const nnfw::rt::Shape& inputShape,
+                          const float* filterData, const nnfw::rt::Shape& filterShape,
+                          const float* biasData, const nnfw::rt::Shape& biasShape,
+                          int32_t padding_left, int32_t padding_right,
+                          int32_t padding_top, int32_t padding_bottom,
+                          int32_t stride_width, int32_t stride_height,
+                          int32_t depth_multiplier, int32_t activation,
+                          float* outputData, const nnfw::rt::Shape& outputShape) {
+  // NEON entry point: binds the shared implementation to the NEON tensor,
+  // depthwise-convolution and activation types and forwards every argument.
+  using Tensor = NEUniqueTensor;
+  using Conv = arm_compute::NEDepthwiseConvolutionLayer;
+  using Act = arm_compute::NEActivationLayer;
+  return common::depthwiseConvFloat32<Tensor, Conv, Act>(
+      inputData, inputShape, filterData, filterShape, biasData, biasShape,
+      padding_left, padding_right, padding_top, padding_bottom,
+      stride_width, stride_height, depth_multiplier, activation,
+      outputData, outputShape, sync_scheduler);
+}
+
+} // namespace neon
+} // namespace acl
+} // namespace kernel
+} // namespace nnfw
diff --git a/libs/kernel/acl/src/neon/DepthwiseConv2D.test.cpp b/libs/kernel/acl/src/neon/DepthwiseConv2D.test.cpp
new file mode 100644
index 000000000..d729d538e
--- /dev/null
+++ b/libs/kernel/acl/src/neon/DepthwiseConv2D.test.cpp
@@ -0,0 +1,20 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#define ACL_CORE_FUNC_NAME neon::depthwiseConvFloat32  // entry point selected for this translation unit
+#define ACL_TEST(tc, t) TEST(tc, neon_##t)             // test names get a neon_ prefix
+// NOTE(review): both macros are presumably consumed by the header below - confirm.
+#include "../DepthwiseConv2D.test.h"
diff --git a/libs/kernel/acl/src/neon/FullyConnected.cpp b/libs/kernel/acl/src/neon/FullyConnected.cpp
new file mode 100644
index 000000000..86229cbf2
--- /dev/null
+++ b/libs/kernel/acl/src/neon/FullyConnected.cpp
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <OperationsUtils.h>
+#include <arm_compute/core/TensorShape.h>
+#include <arm_compute/core/TensorInfo.h>
+#include <arm_compute/runtime/NEON/NEScheduler.h>
+
+#include <cassert>
+
+// TODO: fix include path in CMakeFiles
+#include "../IO_accessor.h"
+#include "../shape.h"
+#include "../NEUniqueTensor.h"
+#include "../FullyConnected.h"
+
+namespace nnfw {
+namespace kernel {
+namespace acl {
+
+namespace neon {
+
+static void sync_scheduler() { // file-local sync hook passed to the common kernel; static avoids link-time clashes
+}                              // body intentionally empty: nothing is synchronized here
+
+// NEON entry point: forwards every argument to common::fullyConnectedFloat32, instantiated with
+// NEUniqueTensor, NEFullyConnectedLayer and NEActivationLayer, plus this file's sync_scheduler hook.
+bool fullyConnectedFloat32(const float* inputData, const nnfw::rt::Shape& inputShape,
+                           const float* weightsData, const nnfw::rt::Shape& weightsShape,
+                           const float* biasData, const nnfw::rt::Shape& biasShape,
+                           int32_t activation,
+                           float* outputData, const nnfw::rt::Shape& outputShape) {
+  using TensorT = NEUniqueTensor;
+  using LayerT = arm_compute::NEFullyConnectedLayer;
+  using ActT = arm_compute::NEActivationLayer;
+  return common::fullyConnectedFloat32<TensorT, LayerT, ActT>(
+      inputData, inputShape, weightsData, weightsShape, biasData, biasShape,
+      activation, outputData, outputShape, sync_scheduler);
+}
+
+} // namespace neon
+} // namespace acl
+} // namespace kernel
+} // namespace nnfw
+
diff --git a/libs/kernel/acl/src/neon/FullyConnected.test.cpp b/libs/kernel/acl/src/neon/FullyConnected.test.cpp
new file mode 100644
index 000000000..d4c95e4cb
--- /dev/null
+++ b/libs/kernel/acl/src/neon/FullyConnected.test.cpp
@@ -0,0 +1,21 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#define ACL_CORE_FUNC_NAME neon::fullyConnectedFloat32 // NEON fully-connected entry point named for the tests
+#define ACL_TEST(tc, t) TEST(tc, neon_##t)             // gtest TEST whose test name gets a "neon_" prefix
+// NOTE(review): both macros are presumably consumed by the shared test header included below - confirm.
+#include "../FullyConnected.test.h"
+
diff --git a/libs/kernel/acl/src/neon/Pooling.cpp b/libs/kernel/acl/src/neon/Pooling.cpp
new file mode 100644
index 000000000..5c58ae0b5
--- /dev/null
+++ b/libs/kernel/acl/src/neon/Pooling.cpp
@@ -0,0 +1,128 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <OperationsUtils.h>
+#include <arm_compute/core/TensorShape.h>
+#include <arm_compute/core/TensorInfo.h>
+#include "../IO_accessor.h"
+#include "../shape.h"
+#include "../NEUniqueTensor.h"
+
+#include <cassert>
+
+namespace nnfw {
+namespace kernel {
+namespace acl {
+namespace neon {
+
+// Shared body of the NEON float32 pooling kernels below.
+//
+// Configures an arm_compute::NEPoolingLayer of the given pool_type, lets
+// util::insertFusedActivationLayer append the fused activation for `activation`, copies
+// inputData into the input tensor, runs the collected functions in order and copies the
+// result into outputData.
+//
+//   pool_type        pooling operation (the wrappers pass MAX or AVG)
+//   exclude_padding  fourth argument of arm_compute::PoolingLayerInfo
+//   padding_*, stride_*, filter_*  pooling geometry, forwarded to PadStrideInfo / Size2D
+//   activation       fused activation code handed to util::insertFusedActivationLayer
+//
+// Always returns true.
+static bool poolFloat32(arm_compute::PoolingType pool_type, bool exclude_padding,
+                        const float* inputData, const nnfw::rt::Shape& inputShape,
+                        int32_t padding_left, int32_t padding_right,
+                        int32_t padding_top, int32_t padding_bottom,
+                        int32_t stride_width, int32_t stride_height,
+                        int32_t filter_width, int32_t filter_height,
+                        int32_t activation,
+                        float* outputData, const nnfw::rt::Shape& outputShape)
+{
+  const arm_compute::TensorShape input_shape = util::fromNNShape(inputShape);
+  const arm_compute::TensorShape output_shape = util::fromNNShape(outputShape);
+
+  const arm_compute::PadStrideInfo pad_info(stride_width, stride_height,
+                                            padding_left, padding_right,
+                                            padding_top, padding_bottom,
+                                            arm_compute::DimensionRoundingType::FLOOR);
+  const arm_compute::PoolingLayerInfo pool_info(pool_type,
+                                                arm_compute::Size2D(filter_width, filter_height),
+                                                pad_info, exclude_padding);
+
+  NEUniqueTensor input(arm_compute::TensorInfo(input_shape, arm_compute::Format::F32));
+  NEUniqueTensor output(arm_compute::TensorInfo(output_shape, arm_compute::Format::F32));
+
+  // The functions are configured before the tensors are allocated.
+  std::vector<std::shared_ptr<arm_compute::IFunction>> fns;
+  auto pool_f = std::make_shared<arm_compute::NEPoolingLayer>();
+  pool_f->configure(input.ptr(), output.ptr(), pool_info);
+  fns.emplace_back(pool_f);
+  util::insertFusedActivationLayer<NEUniqueTensor, arm_compute::NEActivationLayer>(output, activation, fns);
+
+  input.allocate();
+  output.allocate();
+
+  TensorAccess<InputAccessor>(input.ref(), inputData, inputShape);
+  for (const auto &fn : fns)
+  {
+    fn->run();
+  }
+  TensorAccess<OutputAccessor>(output.ref(), outputData, outputShape);
+
+  return true;
+}
+
+// Max pooling over float32 data: PoolingType::MAX with exclude_padding = false.
+bool maxPoolFloat32(const float* inputData, const nnfw::rt::Shape& inputShape,
+                 int32_t padding_left, int32_t padding_right,
+                 int32_t padding_top, int32_t padding_bottom,
+                 int32_t stride_width, int32_t stride_height,
+                 int32_t filter_width, int32_t filter_height,
+                 int32_t activation,
+                 float* outputData, const nnfw::rt::Shape& outputShape)
+{
+  return poolFloat32(arm_compute::PoolingType::MAX, false,
+                     inputData, inputShape,
+                     padding_left, padding_right,
+                     padding_top, padding_bottom,
+                     stride_width, stride_height,
+                     filter_width, filter_height,
+                     activation,
+                     outputData, outputShape);
+}
+
+// Average pooling over float32 data: PoolingType::AVG with exclude_padding = true.
+bool averagePoolFloat32(const float* inputData, const nnfw::rt::Shape& inputShape,
+                 int32_t padding_left, int32_t padding_right,
+                 int32_t padding_top, int32_t padding_bottom,
+                 int32_t stride_width, int32_t stride_height,
+                 int32_t filter_width, int32_t filter_height,
+                 int32_t activation,
+                 float* outputData, const nnfw::rt::Shape& outputShape)
+{
+  return poolFloat32(arm_compute::PoolingType::AVG, true,
+                     inputData, inputShape,
+                     padding_left, padding_right,
+                     padding_top, padding_bottom,
+                     stride_width, stride_height,
+                     filter_width, filter_height,
+                     activation,
+                     outputData, outputShape);
+}
+
+} // namespace neon
+} // namespace acl
+} // namespace kernel
+} // namespace nnfw
diff --git a/libs/kernel/acl/src/neon/Pooling.test.cpp b/libs/kernel/acl/src/neon/Pooling.test.cpp
new file mode 100644
index 000000000..4e6593921
--- /dev/null
+++ b/libs/kernel/acl/src/neon/Pooling.test.cpp
@@ -0,0 +1,436 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <gtest/gtest.h>
+#include <OperationsUtils.h>
+#include <kernel/acl/nnfw_kernel_acl.h>
+#include <arm_compute/core/Types.h>
+#include <kernel/acl/Pooling.h>
+
+#include "../util.h"
+
+using namespace nnfw::kernel::acl;
+
+TEST(KernelACL_TC, neon_maxPoolFloat32_3x3to1x1)
+{
+  // Max pooling with one 3x3 window over a 1x3x3x1 input that holds the values 1..9.
+  const nnfw::rt::Shape inputShape = { OperandType::FLOAT32, {1,3,3,1}, 1.0, 0 };
+  const nnfw::rt::Shape outputShape = { OperandType::FLOAT32, {1,1,1,1}, 1.0, 0 };
+
+  // Pooling geometry: no padding, unit stride, 3x3 filter, no fused activation.
+  const int32_t padding_left = 0;
+  const int32_t padding_right = 0;
+  const int32_t padding_top = 0;
+  const int32_t padding_bottom = 0;
+  const int32_t stride_width = 1;
+  const int32_t stride_height = 1;
+  const int32_t filter_width = 3;
+  const int32_t filter_height = 3;
+  const int32_t activation = ANEURALNETWORKS_FUSED_NONE;
+
+  // Each element visited by initValue receives the next value of the running counter.
+  util::TensorWrapper input({1,3,3,1});
+  float next = 1.0f;
+  input.initValue([&next](uint32_t, uint32_t, uint32_t, uint32_t) { return next++; });
+
+  // Output buffer, initialised through util::initData with 0.0.
+  float outputData[1];
+  util::initData(outputData, sizeof(outputData) / sizeof(outputData[0]), 0.0);
+
+  const bool ran = neon::maxPoolFloat32(input.ptr<float>(), inputShape,
+                                        padding_left, padding_right,
+                                        padding_top, padding_bottom,
+                                        stride_width, stride_height,
+                                        filter_width, filter_height,
+                                        activation,
+                                        outputData, outputShape);
+  EXPECT_TRUE(ran);
+
+  // The window spans the whole input, so the expected result is the largest value, 9.
+  float expectData[] = { 9.0f };
+  EXPECT_TRUE(util::compareData(outputData, expectData, outputShape));
+}
+
+TEST(KernelACL_TC, neon_maxPoolFloat32_3x3to1x1_RELU)
+{
+  // Max pooling of all-negative data (-1..-9) with one 3x3 window, followed by a fused RELU.
+  const nnfw::rt::Shape inputShape = { OperandType::FLOAT32, {1,3,3,1}, 1.0, 0 };
+  const nnfw::rt::Shape outputShape = { OperandType::FLOAT32, {1,1,1,1}, 1.0, 0 };
+
+  // Pooling geometry: no padding, unit stride, 3x3 filter, RELU fused after the pooling.
+  const int32_t padding_left = 0;
+  const int32_t padding_right = 0;
+  const int32_t padding_top = 0;
+  const int32_t padding_bottom = 0;
+  const int32_t stride_width = 1;
+  const int32_t stride_height = 1;
+  const int32_t filter_width = 3;
+  const int32_t filter_height = 3;
+  const int32_t activation = ANEURALNETWORKS_FUSED_RELU;
+
+  // Each element visited by initValue receives the next value of the descending counter.
+  util::TensorWrapper input({1,3,3,1});
+  float next = -1.0f;
+  input.initValue([&next](uint32_t, uint32_t, uint32_t, uint32_t) { return next--; });
+
+  // Output buffer, initialised through util::initData with 0.0.
+  float outputData[1];
+  util::initData(outputData, sizeof(outputData) / sizeof(outputData[0]), 0.0);
+
+  const bool ran = neon::maxPoolFloat32(input.ptr<float>(), inputShape,
+                                        padding_left, padding_right,
+                                        padding_top, padding_bottom,
+                                        stride_width, stride_height,
+                                        filter_width, filter_height,
+                                        activation,
+                                        outputData, outputShape);
+  EXPECT_TRUE(ran);
+
+  // The pooled maximum is -1; the test expects the fused RELU to turn it into 0.
+  float expectData[] = { 0.0f };
+  EXPECT_TRUE(util::compareData(outputData, expectData, outputShape));
+}
+
+TEST(KernelACL_TC, neon_maxPoolFloat32_3x3to2x2)
+{
+  // 2x2 max pooling with stride 2 over a 1x3x3x1 input (values 1..9) padded on the right/bottom.
+  const nnfw::rt::Shape inputShape = { OperandType::FLOAT32, {1,3,3,1}, 1.0, 0 };
+  const nnfw::rt::Shape outputShape = { OperandType::FLOAT32, {1,2,2,1}, 1.0, 0 };
+
+  // Pooling geometry: one element of padding right and bottom, stride 2, 2x2 filter.
+  const int32_t padding_left = 0;
+  const int32_t padding_right = 1;
+  const int32_t padding_top = 0;
+  const int32_t padding_bottom = 1;
+  const int32_t stride_width = 2;
+  const int32_t stride_height = 2;
+  const int32_t filter_width = 2;
+  const int32_t filter_height = 2;
+  const int32_t activation = ANEURALNETWORKS_FUSED_NONE;
+
+  // Each element visited by initValue receives the next value of the running counter.
+  util::TensorWrapper input({1,3,3,1});
+  float next = 1.0f;
+  input.initValue([&next](uint32_t, uint32_t, uint32_t, uint32_t) { return next++; });
+
+  // Output buffer, initialised through util::initData with 0.0.
+  float outputData[4];
+  util::initData(outputData, sizeof(outputData) / sizeof(outputData[0]), 0.0);
+
+  const bool ran = neon::maxPoolFloat32(input.ptr<float>(), inputShape,
+                                        padding_left, padding_right,
+                                        padding_top, padding_bottom,
+                                        stride_width, stride_height,
+                                        filter_width, filter_height,
+                                        activation,
+                                        outputData, outputShape);
+  EXPECT_TRUE(ran);
+
+  // Expected maximum of each of the four windows.
+  float expectData[] = {
+    5.0f, 6.0f,
+    8.0f, 9.0f
+  };
+  EXPECT_TRUE(util::compareData(outputData, expectData, outputShape));
+}
+
+TEST(KernelACL_TC, neon_maxPoolFloat32_147x147to73x73)
+{
+  // 3x3 max pooling with stride 2 over a 1x147x147x64 tensor of ones, producing 1x73x73x64.
+  const auto ones = [](uint32_t, uint32_t, uint32_t, uint32_t) { return 1.0f; };
+  const auto zeros = [](uint32_t, uint32_t, uint32_t, uint32_t) { return 0.f; };
+
+  // Constant input; the output tensor starts out filled with zeros.
+  util::TensorWrapper input({1,147,147,64});
+  input.initValue(ones);
+
+  util::TensorWrapper output({1,73,73,64});
+  output.initValue(zeros);
+
+  // Pooling geometry: no padding, stride 2, 3x3 filter, no fused activation.
+  const int32_t padding_left = 0;
+  const int32_t padding_right = 0;
+  const int32_t padding_top = 0;
+  const int32_t padding_bottom = 0;
+  const int32_t stride_width = 2;
+  const int32_t stride_height = 2;
+  const int32_t filter_width = 3;
+  const int32_t filter_height = 3;
+  const int32_t activation = ANEURALNETWORKS_FUSED_NONE;
+
+  // Run the kernel under test.
+  const bool ran = neon::maxPoolFloat32(input.ptr<float>(), input.shape(),
+                                        padding_left, padding_right,
+                                        padding_top, padding_bottom,
+                                        stride_width, stride_height,
+                                        filter_width, filter_height,
+                                        activation,
+                                        output.ptr<float>(), output.shape());
+  EXPECT_TRUE(ran);
+
+  // The maximum over any window of ones is 1, so every output element must be 1.
+  util::TensorWrapper expected({1,73,73,64});
+  expected.initValue(ones);
+
+  EXPECT_EQ(output, expected);
+}
+
+TEST(KernelACL_TC, neon_maxPoolFloat32_71x71to35x35)
+{
+  // 3x3 max pooling with stride 2 over a 1x71x71x192 tensor of ones, producing 1x35x35x192.
+  const auto ones = [](uint32_t, uint32_t, uint32_t, uint32_t) { return 1.0f; };
+  const auto zeros = [](uint32_t, uint32_t, uint32_t, uint32_t) { return 0.f; };
+
+  // Constant input; the output tensor starts out filled with zeros.
+  util::TensorWrapper input({1,71,71,192});
+  input.initValue(ones);
+
+  util::TensorWrapper output({1,35,35,192});
+  output.initValue(zeros);
+
+  // Pooling geometry: no padding, stride 2, 3x3 filter, no fused activation.
+  const int32_t padding_left = 0;
+  const int32_t padding_right = 0;
+  const int32_t padding_top = 0;
+  const int32_t padding_bottom = 0;
+  const int32_t stride_width = 2;
+  const int32_t stride_height = 2;
+  const int32_t filter_width = 3;
+  const int32_t filter_height = 3;
+  const int32_t activation = ANEURALNETWORKS_FUSED_NONE;
+
+  // Run the kernel under test.
+  const bool ran = neon::maxPoolFloat32(input.ptr<float>(), input.shape(),
+                                        padding_left, padding_right,
+                                        padding_top, padding_bottom,
+                                        stride_width, stride_height,
+                                        filter_width, filter_height,
+                                        activation,
+                                        output.ptr<float>(), output.shape());
+  EXPECT_TRUE(ran);
+
+  // The maximum over any window of ones is 1, so every output element must be 1.
+  util::TensorWrapper expected({1,35,35,192});
+  expected.initValue(ones);
+
+  EXPECT_EQ(output, expected);
+}
+
+TEST(KernelACL_TC, neon_averagePoolFloat32_3x3to1x1)
+{
+  // Average pooling with one 3x3 window over a 1x3x3x1 input that holds the values 1..9.
+  const nnfw::rt::Shape inputShape = { OperandType::FLOAT32, {1,3,3,1}, 1.0, 0 };
+  const nnfw::rt::Shape outputShape = { OperandType::FLOAT32, {1,1,1,1}, 1.0, 0 };
+
+  // Pooling geometry: no padding, unit stride, 3x3 filter, no fused activation.
+  const int32_t padding_left = 0;
+  const int32_t padding_right = 0;
+  const int32_t padding_top = 0;
+  const int32_t padding_bottom = 0;
+  const int32_t stride_width = 1;
+  const int32_t stride_height = 1;
+  const int32_t filter_width = 3;
+  const int32_t filter_height = 3;
+  const int32_t activation = ANEURALNETWORKS_FUSED_NONE;
+
+  // Each element visited by initValue receives the next value of the running counter.
+  util::TensorWrapper input({1,3,3,1});
+  float next = 1.0f;
+  input.initValue([&next](uint32_t, uint32_t, uint32_t, uint32_t) { return next++; });
+
+  // Output buffer, initialised through util::initData with 0.0.
+  float outputData[1];
+  util::initData(outputData, sizeof(outputData) / sizeof(outputData[0]), 0.0);
+
+  const bool ran = neon::averagePoolFloat32(input.ptr<float>(), inputShape,
+                                            padding_left, padding_right,
+                                            padding_top, padding_bottom,
+                                            stride_width, stride_height,
+                                            filter_width, filter_height,
+                                            activation,
+                                            outputData, outputShape);
+  EXPECT_TRUE(ran);
+
+  // The window spans the whole input: (1 + 2 + ... + 9) / 9 = 5.
+  float expectData[] = { 5.0f };
+  EXPECT_TRUE(util::compareData(outputData, expectData, outputShape));
+}
+
+TEST(KernelACL_TC, neon_averagePoolFloat32_3x3to1x1_RELU)
+{
+  // Average pooling of the values 3, 2, ..., -5 with one 3x3 window, followed by a fused RELU.
+  const nnfw::rt::Shape inputShape = { OperandType::FLOAT32, {1,3,3,1}, 1.0, 0 };
+  const nnfw::rt::Shape outputShape = { OperandType::FLOAT32, {1,1,1,1}, 1.0, 0 };
+
+  // Pooling geometry: no padding, unit stride, 3x3 filter, RELU fused after the pooling.
+  const int32_t padding_left = 0;
+  const int32_t padding_right = 0;
+  const int32_t padding_top = 0;
+  const int32_t padding_bottom = 0;
+  const int32_t stride_width = 1;
+  const int32_t stride_height = 1;
+  const int32_t filter_width = 3;
+  const int32_t filter_height = 3;
+  const int32_t activation = ANEURALNETWORKS_FUSED_RELU;
+
+  // Each element visited by initValue receives the next value of the descending counter.
+  util::TensorWrapper input({1,3,3,1});
+  float next = 3.0f;
+  input.initValue([&next](uint32_t, uint32_t, uint32_t, uint32_t) { return next--; });
+
+  // Output buffer, initialised through util::initData with 0.0.
+  float outputData[1];
+  util::initData(outputData, sizeof(outputData) / sizeof(outputData[0]), 0.0);
+
+  const bool ran = neon::averagePoolFloat32(input.ptr<float>(), inputShape,
+                                            padding_left, padding_right,
+                                            padding_top, padding_bottom,
+                                            stride_width, stride_height,
+                                            filter_width, filter_height,
+                                            activation,
+                                            outputData, outputShape);
+  EXPECT_TRUE(ran);
+
+  // The nine inputs average to -1; the test expects the fused RELU to turn that into 0.
+  float expectData[] = { 0.0f };
+  EXPECT_TRUE(util::compareData(outputData, expectData, outputShape));
+}
+
+TEST(KernelACL_TC, neon_averagePoolFloat32_3x3to2x2)
+{
+  // 2x2 average pooling with unit stride over a 1x3x3x1 input that holds the values 1..9.
+  const nnfw::rt::Shape inputShape = { OperandType::FLOAT32, {1,3,3,1}, 1.0, 0 };
+  const nnfw::rt::Shape outputShape = { OperandType::FLOAT32, {1,2,2,1}, 1.0, 0 };
+
+  // Pooling geometry: no padding, unit stride, 2x2 filter, no fused activation.
+  const int32_t padding_left = 0;
+  const int32_t padding_right = 0;
+  const int32_t padding_top = 0;
+  const int32_t padding_bottom = 0;
+  const int32_t stride_width = 1;
+  const int32_t stride_height = 1;
+  const int32_t filter_width = 2;
+  const int32_t filter_height = 2;
+  const int32_t activation = ANEURALNETWORKS_FUSED_NONE;
+
+  // Each element visited by initValue receives the next value of the running counter.
+  util::TensorWrapper input({1,3,3,1});
+  float next = 1.0f;
+  input.initValue([&next](uint32_t, uint32_t, uint32_t, uint32_t) { return next++; });
+
+  // Output buffer, initialised through util::initData with 0.0.
+  float outputData[4];
+  util::initData(outputData, sizeof(outputData) / sizeof(outputData[0]), 0.0);
+
+  const bool ran = neon::averagePoolFloat32(input.ptr<float>(), inputShape,
+                                            padding_left, padding_right,
+                                            padding_top, padding_bottom,
+                                            stride_width, stride_height,
+                                            filter_width, filter_height,
+                                            activation,
+                                            outputData, outputShape);
+  EXPECT_TRUE(ran);
+
+  // Expected mean of each of the four overlapping 2x2 windows.
+  float expectData[] = {
+    3.0f, 4.0f,
+    6.0f, 7.0f
+  };
+  EXPECT_TRUE(util::compareData(outputData, expectData, outputShape));
+}
+
+TEST(KernelACL_TC, neon_averagePoolFloat32_35x35to35x35)
+{
+  // 3x3 average pooling, stride 1, padding 1 on every side, over a 1x35x35x192 tensor of ones.
+  const auto ones = [](uint32_t, uint32_t, uint32_t, uint32_t) { return 1.0f; };
+  const auto zeros = [](uint32_t, uint32_t, uint32_t, uint32_t) { return 0.f; };
+
+  // Input, output and expected tensors all share these dimensions.
+  std::vector<uint32_t> dims = {1,35,35,192};
+  util::TensorWrapper input(dims);
+  input.initValue(ones);
+
+  util::TensorWrapper output(dims);
+  output.initValue(zeros);
+
+  // Pooling geometry: padding 1 on all sides, unit stride, 3x3 filter, no fused activation.
+  const int32_t padding_left = 1;
+  const int32_t padding_right = 1;
+  const int32_t padding_top = 1;
+  const int32_t padding_bottom = 1;
+  const int32_t stride_width = 1;
+  const int32_t stride_height = 1;
+  const int32_t filter_width = 3;
+  const int32_t filter_height = 3;
+  const int32_t activation = ANEURALNETWORKS_FUSED_NONE;
+
+  // Run the kernel under test.
+  const bool ran = neon::averagePoolFloat32(input.ptr<float>(), input.shape(),
+                                            padding_left, padding_right,
+                                            padding_top, padding_bottom,
+                                            stride_width, stride_height,
+                                            filter_width, filter_height,
+                                            activation,
+                                            output.ptr<float>(), output.shape());
+  EXPECT_TRUE(ran);
+
+  // The test expects 1 at every position, border outputs included.
+  util::TensorWrapper expected(dims);
+  expected.initValue(ones);
+
+  EXPECT_EQ(output, expected);
+}
+
+TEST(KernelACL_TC, neon_averagePoolFloat32_8x8to1x1)
+{
+  // 8x8 average pooling over a 1x8x8x2048 tensor of ones, producing 1x1x1x2048.
+  const auto ones = [](uint32_t, uint32_t, uint32_t, uint32_t) { return 1.0f; };
+  const auto zeros = [](uint32_t, uint32_t, uint32_t, uint32_t) { return 0.f; };
+
+  // Constant input; the output tensor starts out filled with zeros.
+  util::TensorWrapper input({1,8,8,2048});
+  input.initValue(ones);
+
+  util::TensorWrapper output({1,1,1,2048});
+  output.initValue(zeros);
+
+  // Pooling geometry: no padding, stride 2, 8x8 filter, no fused activation.
+  const int32_t padding_left = 0;
+  const int32_t padding_right = 0;
+  const int32_t padding_top = 0;
+  const int32_t padding_bottom = 0;
+  const int32_t stride_width = 2;
+  const int32_t stride_height = 2;
+  const int32_t filter_width = 8;
+  const int32_t filter_height = 8;
+  const int32_t activation = ANEURALNETWORKS_FUSED_NONE;
+
+  // Run the kernel under test.
+  const bool ran = neon::averagePoolFloat32(input.ptr<float>(), input.shape(),
+                                            padding_left, padding_right,
+                                            padding_top, padding_bottom,
+                                            stride_width, stride_height,
+                                            filter_width, filter_height,
+                                            activation,
+                                            output.ptr<float>(), output.shape());
+  EXPECT_TRUE(ran);
+
+  // The mean over a window of ones is 1, so every output element must be 1.
+  util::TensorWrapper expected({1,1,1,2048});
+  expected.initValue(ones);
+
+  EXPECT_EQ(output, expected);
+}
diff --git a/libs/kernel/acl/src/neon/Reshape.cpp b/libs/kernel/acl/src/neon/Reshape.cpp
new file mode 100644
index 000000000..cef84c7f3
--- /dev/null
+++ b/libs/kernel/acl/src/neon/Reshape.cpp
@@ -0,0 +1,48 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <OperationsUtils.h>
+#include <arm_compute/core/TensorShape.h>
+#include <arm_compute/core/TensorInfo.h>
+
+// TODO: fix include path in CMakeFiles
+#include "../IO_accessor.h"
+#include "../shape.h"
+#include "../NEUniqueTensor.h"
+#include "../Reshape.h"
+
+namespace nnfw {
+namespace kernel {
+namespace acl {
+
+namespace neon {
+
+static void sync_scheduler() {
+  // Intentionally empty: the NEON path must not sync the OpenCL scheduler (its header is not included directly here).
+}
+
+// NEON reshape: delegates to common::reshapeGeneric with NEUniqueTensor / NEReshapeLayer and this file's sync hook.
+bool reshapeGeneric(const void* inputData, const nnfw::rt::Shape& inputShape,
+                    void* outputData, const nnfw::rt::Shape& outputShape) {
+  return common::reshapeGeneric<NEUniqueTensor, arm_compute::NEReshapeLayer>(inputData, inputShape, outputData, outputShape, sync_scheduler);
+}
+
+} // namespace neon
+
+} // namespace acl
+} // namespace kernel
+} // namespace nnfw
+
diff --git a/libs/kernel/acl/src/neon/Reshape.test.cpp b/libs/kernel/acl/src/neon/Reshape.test.cpp
new file mode 100644
index 000000000..9aca45e7e
--- /dev/null
+++ b/libs/kernel/acl/src/neon/Reshape.test.cpp
@@ -0,0 +1,20 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#define ACL_CORE_FUNC_NAME neon::reshapeGeneric // NEON reshape entry point named for the tests
+#define ACL_TEST(tc, t) TEST(tc, neon_##t)      // gtest TEST whose test name gets a "neon_" prefix
+// NOTE(review): both macros are presumably consumed by the shared test header included below - confirm.
+#include "../Reshape.test.h"
diff --git a/libs/kernel/acl/src/neon/Softmax.cpp b/libs/kernel/acl/src/neon/Softmax.cpp
new file mode 100644
index 000000000..79d614418
--- /dev/null
+++ b/libs/kernel/acl/src/neon/Softmax.cpp
@@ -0,0 +1,77 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <OperationsUtils.h>
+#include <NeuralNetworks.h>
+
+#include <arm_compute/core/TensorShape.h>
+#include <arm_compute/core/TensorInfo.h>
+#include "../IO_accessor.h"
+#include "../shape.h"
+#include "../util.h"
+#include "../NEUniqueTensor.h"
+
+namespace nnfw {
+namespace kernel {
+namespace acl {
+namespace neon {
+
+// NEON float32 softmax. Only rank-4 and rank-2 (1xN) inputs are handled; any other rank trips the
+// assert and, when asserts are compiled out, returns false. beta is forwarded to
+// NESoftmaxLayer::configure. Returns true once the result has been copied to outputData.
+bool softmaxFloat32(const float* inputData, const nnfw::rt::Shape& inputShape,
+                    const float beta,
+                    float* outputData, const nnfw::rt::Shape& outputShape)
+{
+  // Reject unsupported ranks up front, before any shape conversion or tensor allocation.
+  const auto rank = inputShape.dimensions.size();
+  if (rank != 2 && rank != 4)
+  {
+    assert("undefined dimension of input" && 0);
+    return false;
+  }
+
+  const arm_compute::TensorShape input_shape = util::fromNNShape(inputShape);
+  const arm_compute::TensorShape output_shape = util::fromNNShape(outputShape);
+  NEUniqueTensor input(arm_compute::TensorInfo(input_shape, arm_compute::Format::F32));
+  NEUniqueTensor output(arm_compute::TensorInfo(output_shape, arm_compute::Format::F32));
+
+  arm_compute::NESoftmaxLayer softmax_f;
+  softmax_f.configure(input.ptr(), output.ptr(), beta);
+  input.allocate();
+  output.allocate();
+
+  if (rank == 4)
+  {
+    TensorAccess<InputAccessor>(input.ref(), inputData, inputShape);
+    softmax_f.run();
+    TensorAccess<OutputAccessor>(output.ref(), outputData, outputShape);
+  }
+  else
+  {
+    // Softmax comes with 1xN matrix and this is translated to N vector in arm_compute::TensorShape
+    TensorAccess<VectorInputAccessor>(input.ref(), inputData, inputShape);
+    softmax_f.run();
+    TensorAccess<VectorOutputAccessor>(output.ref(), outputData, outputShape);
+  }
+
+  return true;
+}
+
+} // namespace neon
+} // namespace acl
+} // namespace kernel
+} // namespace nnfw
diff --git a/libs/kernel/acl/src/neon/Softmax.test.cpp b/libs/kernel/acl/src/neon/Softmax.test.cpp
new file mode 100644
index 000000000..988f55078
--- /dev/null
+++ b/libs/kernel/acl/src/neon/Softmax.test.cpp
@@ -0,0 +1,105 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <gtest/gtest.h>
+#include <OperationsUtils.h>
+#include <kernel/acl/nnfw_kernel_acl.h>
+#include <arm_compute/core/Types.h>
+#include <kernel/acl/Softmax.h>
+
+#include "../util.h"
+
+using namespace nnfw::kernel::acl;
+
+TEST(KernelACL_TC, neon_softmaxFloat32_1xn)
+{
+  // Softmax over a rank-2 {1,4} input whose four values are all initialised with 1.0.
+  const nnfw::rt::Shape inputShape = { OperandType::FLOAT32, {1,4}, 1.0, 0 };
+  const nnfw::rt::Shape outputShape = { OperandType::FLOAT32, {1,4}, 1.0, 0 };
+  const float beta = 1.0f;
+
+  float inputData[4];
+  util::initData(inputData, sizeof(inputData) / sizeof(inputData[0]), 1.0);
+
+  float outputData[4];
+  util::initData(outputData, sizeof(outputData) / sizeof(outputData[0]), 0.0);
+
+  EXPECT_TRUE(neon::softmaxFloat32(inputData, inputShape, beta, outputData, outputShape));
+
+  // Four equal inputs share the probability mass evenly.
+  float expectData[] = { 0.25f, 0.25f, 0.25f, 0.25f };
+  EXPECT_TRUE(util::compareData(outputData, expectData, outputShape));
+}
+
+TEST(KernelACL_TC, neon_softmaxFloat32_4d)
+{
+  // Softmax over a rank-4 {1,1,4,1} input whose four values are all initialised with 1.0.
+  const nnfw::rt::Shape inputShape = { OperandType::FLOAT32, {1,1,4,1}, 1.0, 0 };
+  const nnfw::rt::Shape outputShape = { OperandType::FLOAT32, {1,1,4,1}, 1.0, 0 };
+  const float beta = 1.0f;
+
+  float inputData[4];
+  util::initData(inputData, sizeof(inputData) / sizeof(inputData[0]), 1.0);
+
+  float outputData[4];
+  util::initData(outputData, sizeof(outputData) / sizeof(outputData[0]), 0.0);
+
+  EXPECT_TRUE(neon::softmaxFloat32(inputData, inputShape, beta, outputData, outputShape));
+
+  // Four equal inputs share the probability mass evenly.
+  float expectData[] = { 0.25f, 0.25f, 0.25f, 0.25f };
+  EXPECT_TRUE(util::compareData(outputData, expectData, outputShape));
+}
+
+TEST(KernelACL_TC, neon_softmaxFloat32_1xn_seq)
+{
+  // Rank-2 {1,4} shapes, as the test name says, so the 2-D softmax path sees non-uniform input.
+  const nnfw::rt::Shape inputShape = { OperandType::FLOAT32, {1,4}, 1.0, 0 };
+  const nnfw::rt::Shape outputShape = { OperandType::FLOAT32, {1,4}, 1.0, 0 };
+  const float beta = 1.0f;
+
+  float inputData[4];
+  util::initData_Increasing(inputData, sizeof(inputData) / sizeof(inputData[0]), 1.0);
+
+  float outputData[4];
+  util::initData(outputData, sizeof(outputData) / sizeof(outputData[0]), 0.0);
+
+  EXPECT_TRUE(neon::softmaxFloat32(inputData, inputShape, beta, outputData, outputShape));
+
+  // Softmax of (1, 2, 3, 4) - presumably what initData_Increasing(..., 1.0) generates; confirm.
+  float expectData[] = {0.032058603280085, 0.0871443187420326, 0.23688281808991, 0.643914259887972};
+  EXPECT_TRUE(util::compareData(outputData, expectData, outputShape));
+}
+
+TEST(KernelACL_TC, neon_softmaxFloat32_4d_seq)
+{
+  // Softmax over a rank-4 {1,1,4,1} input filled by util::initData_Increasing from 1.0.
+  const nnfw::rt::Shape inputShape = { OperandType::FLOAT32, {1,1,4,1}, 1.0, 0 };
+  const nnfw::rt::Shape outputShape = { OperandType::FLOAT32, {1,1,4,1}, 1.0, 0 };
+  const float beta = 1.0f;
+
+  float inputData[4];
+  util::initData_Increasing(inputData, sizeof(inputData) / sizeof(inputData[0]), 1.0);
+
+  float outputData[4];
+  util::initData(outputData, sizeof(outputData) / sizeof(outputData[0]), 0.0);
+
+  EXPECT_TRUE(neon::softmaxFloat32(inputData, inputShape, beta, outputData, outputShape));
+
+  // Softmax of (1, 2, 3, 4) - presumably what initData_Increasing(..., 1.0) generates; confirm.
+  float expectData[] = {0.032058603280085, 0.0871443187420326, 0.23688281808991, 0.643914259887972};
+  EXPECT_TRUE(util::compareData(outputData, expectData, outputShape));
+}
diff --git a/libs/kernel/acl/src/shape.cpp b/libs/kernel/acl/src/shape.cpp
new file mode 100644
index 000000000..3c976ae94
--- /dev/null
+++ b/libs/kernel/acl/src/shape.cpp
@@ -0,0 +1,89 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cassert>
+
+#include "shape.h"
+
+namespace nnfw {
+namespace rt {
+
+// Returns the size of dimension `dimensionIdx` of `shape`, or 0 when the
+// index is not smaller than the number of dimensions.
+// TODO remove from this source and use it from runtime
+uint32_t getSizeOfDimension(const Shape& shape, uint32_t dimensionIdx) {
+    const bool inRange = dimensionIdx < shape.dimensions.size();
+    // TODO, log the error
+    return inRange ? shape.dimensions[dimensionIdx] : 0;
+}
+
+} // namespace rt
+} // namespace nnfw
+
+namespace nnfw {
+namespace kernel {
+namespace acl {
+namespace util {
+
+// Converts a rank-1 NN shape into a one-dimensional ACL TensorShape.
+// The rank is verified by assert only.
+arm_compute::TensorShape fromVectorNNShape(const nnfw::rt::Shape& shape)
+{
+  assert(shape.dimensions.size() == 1);
+
+  return arm_compute::TensorShape(nnfw::rt::getSizeOfDimension(shape, 0));
+}
+
+// Converts a rank-2 NN shape {dim0, dim1} into an ACL TensorShape(dim1, dim0),
+// i.e. the dimension order is reversed. The rank is verified by assert only.
+arm_compute::TensorShape fromMatrixNNShape(const nnfw::rt::Shape& shape)
+{
+  assert(shape.dimensions.size() == 2);
+
+  const uint32_t rows = nnfw::rt::getSizeOfDimension(shape, 0);
+  return arm_compute::TensorShape(nnfw::rt::getSizeOfDimension(shape, 1), rows);
+}
+
+// Translates an NN runtime shape into the equivalent ACL TensorShape.
+// Rank 1 and rank 2 are delegated to the vector/matrix helpers; anything
+// else is expected to be rank 4, which is verified by assert only.
+arm_compute::TensorShape fromNNShape(const nnfw::rt::Shape& shape)
+{
+  switch (shape.dimensions.size())
+  {
+    case 1:
+      return fromVectorNNShape(shape);
+    case 2:
+      return fromMatrixNNShape(shape);
+    default:
+      break; // TODO: need to treat 3D tensors.
+  }
+
+  assert(shape.dimensions.size() == 4);
+
+  // NNAPI orders a rank-4 shape as dim(0) = N, dim(1) = H, dim(2) = W,
+  // dim(3) = C; the ACL shape is built from them as (W, H, C, N).
+  const uint32_t n = nnfw::rt::getSizeOfDimension(shape, 0);
+  const uint32_t h = nnfw::rt::getSizeOfDimension(shape, 1);
+  const uint32_t w = nnfw::rt::getSizeOfDimension(shape, 2);
+  const uint32_t c = nnfw::rt::getSizeOfDimension(shape, 3);
+  return arm_compute::TensorShape(w, h, c, n);
+}
+
+} // namespace util
+} // namespace acl
+} // namespace kernel
+} // namespace nnfw
diff --git a/libs/kernel/acl/src/shape.h b/libs/kernel/acl/src/shape.h
new file mode 100644
index 000000000..902115ebd
--- /dev/null
+++ b/libs/kernel/acl/src/shape.h
@@ -0,0 +1,93 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_KERNEL_ACL_SHAPE_H__
+#define __NNFW_KERNEL_ACL_SHAPE_H__
+
+#include <OperationsUtils.h>
+#include <arm_compute/core/TensorShape.h>
+#include <arm_compute/core/TensorInfo.h>
+#include <arm_compute/runtime/IFunction.h>
+#include <cassert>
+
+namespace nnfw {
+namespace rt {
+
+// TODO remove from this source and use it from runtime
+uint32_t getSizeOfDimension(const Shape& shape, uint32_t dimensionIdx);
+
+} // namespace rt
+} // namespace nnfw
+
+namespace nnfw {
+namespace kernel {
+namespace acl {
+namespace util {
+
+arm_compute::TensorShape fromVectorNNShape(const nnfw::rt::Shape& shape);
+arm_compute::TensorShape fromNNShape(const nnfw::rt::Shape& shape);
+
+// Appends the activation fused into an operation as a separate ACL function.
+//   out        - output tensor wrapper; out.ptr() is activated in place
+//   activation - one of the ANEURALNETWORKS_FUSED_* codes
+//   fns        - function list that receives the configured ActT layer
+// ANEURALNETWORKS_FUSED_NONE appends nothing. An unknown code asserts; when
+// asserts are compiled out it also appends nothing, since a layer that was
+// never configured must not be run. The ActT object is only created once it
+// is known that a layer is really needed.
+template<class TensorT, class ActT>
+void insertFusedActivationLayer(TensorT& out, int activation,
+    std::vector<std::shared_ptr<arm_compute::IFunction>>& fns) {
+  using ActFn = arm_compute::ActivationLayerInfo::ActivationFunction;
+
+  // Creates, configures and queues one activation layer.
+  auto append = [&out, &fns](const arm_compute::ActivationLayerInfo& info) {
+    auto relu_f = std::make_shared<ActT>();
+    // Do in-place update
+    relu_f->configure(out.ptr(), nullptr, info);
+    fns.emplace_back(relu_f);
+  };
+
+  switch(activation) {
+  case ANEURALNETWORKS_FUSED_NONE:
+    // DO NOTHING
+    return;
+
+  case ANEURALNETWORKS_FUSED_RELU:
+    append(arm_compute::ActivationLayerInfo(ActFn::RELU));
+    return;
+
+  case ANEURALNETWORKS_FUSED_RELU1:
+    // NNAPI RELU1 clamps to [-1, 1], so an upper and a lower bound are needed.
+    append(arm_compute::ActivationLayerInfo(ActFn::LU_BOUNDED_RELU, 1.f, -1.f));
+    return;
+
+  case ANEURALNETWORKS_FUSED_RELU6:
+    append(arm_compute::ActivationLayerInfo(ActFn::BOUNDED_RELU, 6.f));
+    return;
+
+  default:
+    assert("Undefined activation type." && 0);
+    return;
+  }
+}
+
+} // namespace util
+} // namespace acl
+} // namespace kernel
+} // namespace nnfw
+
+#endif // __NNFW_KERNEL_ACL_SHAPE_H__
diff --git a/libs/kernel/acl/src/support.cpp b/libs/kernel/acl/src/support.cpp
new file mode 100644
index 000000000..d04aef59e
--- /dev/null
+++ b/libs/kernel/acl/src/support.cpp
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "support.h"
+
+namespace nnfw
+{
+namespace support
+{
+namespace nnapi
+{
+namespace feature
+{
+
+// TODO Extract this function as utility function
+// NOTE It is not a good design to access nnfw::rt::Shape from the nnfw_support_nnapi lib
+nnfw::util::feature::Shape asFeatureShape(const nnfw::rt::Shape& shape)
+{
+  // Converts an NNAPI shape into a feature shape.
+  //
+  // NNAPI assumes the ordering dim(0) -> N, dim(1) -> H, dim(2) -> W,
+  // dim(3) -> C. The batch dimension is dropped, so it has to be 1; this is
+  // verified by assert only.
+  assert(nnfw::rt::getSizeOfDimension(shape, 0) == 1);
+
+  const int32_t height = nnfw::rt::getSizeOfDimension(shape, 1);
+  const int32_t width = nnfw::rt::getSizeOfDimension(shape, 2);
+  const int32_t channels = nnfw::rt::getSizeOfDimension(shape, 3);
+
+  // The values are handed over in the order channels, height, width.
+  return nnfw::util::feature::Shape{channels, height, width};
+}
+
+
+} // namespace feature
+} // namespace nnapi
+} // namespace support
+} // namespace nnfw
diff --git a/libs/kernel/acl/src/support.h b/libs/kernel/acl/src/support.h
new file mode 100644
index 000000000..751d2c6cb
--- /dev/null
+++ b/libs/kernel/acl/src/support.h
@@ -0,0 +1,93 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_KERNEL_SUPPORT_H_TEMPORARY__
+#define __NNFW_KERNEL_SUPPORT_H_TEMPORARY__
+
+// NOTE these are not decided yet but need to be moved out from Conv2D
+//      to separate the NEON implementation into its own folder
+// TODO move to some folder where it should be
+
+#include <cassert>
+
+#include "util/feature/Shape.h"
+
+#include <OperationsUtils.h>
+
+namespace nnfw
+{
+namespace support
+{
+namespace nnapi
+{
+namespace feature
+{
+
+// TODO Extract this function as utility function
+// NOTE It is not a good design to access nnfw::rt::Shape from the nnfw_support_nnapi lib
+nnfw::util::feature::Shape asFeatureShape(const nnfw::rt::Shape& shape);
+
+} // namespace feature
+} // namespace nnapi
+} // namespace support
+} // namespace nnfw
+
+#include <arm_compute/core/ITensor.h>
+
+#include "util/feature/Reader.h"
+
+namespace nnfw
+{
+namespace support
+{
+namespace acl
+{
+namespace feature
+{
+
+template<typename T> class Reader;
+
+// Read-only view that exposes a float (F32) ARM Compute tensor through the
+// nnfw::util::feature::Reader<float> interface.
+template<> class Reader<float> final : public nnfw::util::feature::Reader<float>
+{
+public:
+  // Stores the pointer only; the tensor is neither copied nor released here.
+  // The tensor's data type must be F32, which is verified by assert only.
+  Reader(arm_compute::ITensor *tensor) : _tensor{tensor}
+  {
+    assert(tensor->info()->data_type() == arm_compute::DataType::F32);
+  }
+
+  // Returns the element at channel `ch`, row `row` and column `col`.
+  float at(uint32_t ch, uint32_t row, uint32_t col) const override
+  {
+    // ARM Compute uses CHW ordering
+    const arm_compute::Coordinates where{col, row, ch};
+    const auto *elem = reinterpret_cast<const float *>(_tensor->ptr_to_element(where));
+    return *elem;
+  }
+
+private:
+  arm_compute::ITensor *_tensor;
+};
+
+} // namespace feature
+} // namespace acl
+} // namespace support
+} // namespace nnfw
+
+#endif // __NNFW_KERNEL_SUPPORT_H_TEMPORARY__
diff --git a/libs/kernel/acl/src/util.cpp b/libs/kernel/acl/src/util.cpp
new file mode 100644
index 000000000..7e5df534e
--- /dev/null
+++ b/libs/kernel/acl/src/util.cpp
@@ -0,0 +1,108 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <util/fp32.h>
+
+#include "util.h"
+
+namespace nnfw {
+namespace kernel {
+namespace acl {
+namespace util {
+
+// Sets each of the first `num` elements of `data` to `value`.
+void initData(float* data, int num, float value)
+{
+  for (int idx = 0; idx < num; ++idx)
+    data[idx] = value;
+}
+
+// Fills the first `num` elements of `data` with an increasing sequence.
+void initData_Increasing(float* data, int num, float value)
+{
+  // The first element is `value`; each later one is the previous plus 1.
+  for (int idx = 0; idx < num; ++idx, ++value)
+    data[idx] = value;
+}
+
+// compareData
+// Compares `result` with `expected` element by element over the whole tensor
+// described by `shape`, i.e. over the product of all its dimensions.
+//
+//   result   - values produced by the code under test
+//   expected - reference values; needs at least as many elements as `shape`
+//   shape    - shape of both buffers; ranks 1 to 4 are supported
+//
+// Returns true if every pair of elements is accepted by
+// ::nnfw::util::fp32::epsilon_equal (called with 1 as its third argument).
+// Returns false at the first mismatch, which is logged, and also for an
+// unsupported rank: a comparison that was never made is not a success.
+bool compareData(const float* result, const float* expected, const nnfw::rt::Shape& shape)
+{
+  const auto rank = shape.dimensions.size();
+  if (rank < 1 || rank > 4)
+  {
+    LOG(ERROR) << "Unhandled shape: " << shape.dimensions.size() << std::endl;
+    return false;
+  }
+
+  // Count every dimension (batch and channels too), not only height * width.
+  uint32_t numitems = 1;
+  for (uint32_t dim = 0; dim < rank; dim++) {
+    numitems *= nnfw::rt::getSizeOfDimension(shape, dim);
+  }
+
+  // The counter is unsigned like numitems, so no signed/unsigned comparison.
+  for (uint32_t item = 0; item < numitems; item++) {
+    const float got = *(result + item);
+    const float want = *(expected + item);
+    if (!::nnfw::util::fp32::epsilon_equal(got, want, 1)) {
+      LOG(ERROR) << "compareData failed: result " << got
+                 << ", expected " << want
+                 << ", diff " << ::nnfw::util::fp32::relative_diff(got, want)
+                 << std::endl;
+      return false;
+    }
+  }
+
+  // Every element matched (or the tensor is empty).
+  return true;
+}
+
+// Copies a rank-4 tensor from NCHW element order (`nchw`) into NHWC order
+// (`nhwc`). `shape` gives the extents as dim 0 = N, 1 = H, 2 = W, 3 = C.
+void NCHW2NHWC(const float* nchw, float* nhwc, const nnfw::rt::Shape& shape)
+{
+  const uint32_t N = nnfw::rt::getSizeOfDimension(shape, 0);
+  const uint32_t H = nnfw::rt::getSizeOfDimension(shape, 1);
+  const uint32_t W = nnfw::rt::getSizeOfDimension(shape, 2);
+  const uint32_t C = nnfw::rt::getSizeOfDimension(shape, 3);
+
+  for (uint32_t n = 0; n < N; ++n) {
+    for (uint32_t c = 0; c < C; ++c) {
+      for (uint32_t h = 0; h < H; ++h) {
+        for (uint32_t w = 0; w < W; ++w) {
+          nhwc[((n * H + h) * W + w) * C + c] = nchw[((n * C + c) * H + h) * W + w];
+        }
+      }
+    }
+  }
+}
+
+} // namespace util
+} // namespace acl
+} // namespace kernel
+} // namespace nnfw
diff --git a/libs/kernel/acl/src/util.h b/libs/kernel/acl/src/util.h
new file mode 100644
index 000000000..48ed02783
--- /dev/null
+++ b/libs/kernel/acl/src/util.h
@@ -0,0 +1,193 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_KERNEL_ACL_UTIL_H__
+#define __NNFW_KERNEL_ACL_UTIL_H__
+#include <OperationsUtils.h>
+
+#include <cmath>
+#include <cassert>
+#include <functional>
+
+namespace nnfw {
+namespace kernel {
+namespace acl {
+namespace util {
+
+// Test helper that owns a flat element buffer together with the
+// nnfw::rt::Shape describing it. Only FLOAT32 tensors are supported.
+// The buffer is a std::vector, so copies are deep and cleanup is automatic.
+// TODO: make a separate module.
+class TensorWrapper {
+public:
+  // Builds the shape from the arguments and allocates a zero-filled buffer
+  // large enough for the product of all `dims` float elements.
+  TensorWrapper(std::vector<uint32_t> dims,
+      OperandType type = OperandType::FLOAT32,
+      float scale = 1.0,
+      int32_t offset = 0)
+  :_shape{type, dims, scale, offset}
+  {
+    // currently, we support only FLOAT32 for now.
+    assert( type == OperandType::FLOAT32);
+
+    _num_elems = 1;
+    for( auto& d: dims ) {
+      _num_elems *= d;
+    }
+
+    _data.resize(_num_elems * sizeof(float));
+  }
+
+  // Returns a copy of the shape.
+  const nnfw::rt::Shape shape() const { return _shape; }
+
+  // Number of elements, i.e. the product of all dimensions.
+  uint32_t num_elems() const { return _num_elems; }
+
+  // Element `idx` of the buffer viewed as an array of T. No bounds checking.
+  template<class T>
+  T at(const uint32_t& idx) const {
+    return reinterpret_cast<const T*>(_data.data())[idx];
+  }
+
+  template<class T>
+  T& at(const uint32_t& idx) {
+    return reinterpret_cast<T*>(_data.data())[idx];
+  }
+
+  // Start of the buffer viewed as an array of T.
+  template<class T>
+  T* ptr() { return reinterpret_cast<T*>(_data.data()); }
+
+  // Sets every element to `f`.
+  void initValue(float f) {
+    for( uint32_t i = 0; i < _num_elems; ++i ) {
+      at<float>(i) = f;
+    }
+  }
+
+  // Rank 4, dimensions read as {N, H, W, C}: the element at flat index
+  // n*H*W*C + h*W*C + w*C + c is set to f(n, c, h, w).
+  typedef std::function<float(uint32_t n, uint32_t c, uint32_t h, uint32_t w)> funcInit4;
+  void initValue(funcInit4 f) {
+    assert(_shape.dimensions.size() == 4);
+    int N = _shape.dimensions[0];
+    int H = _shape.dimensions[1];
+    int W = _shape.dimensions[2];
+    int C = _shape.dimensions[3];
+
+    for(int n = 0; n < N; ++n) {
+      for(int h = 0; h < H; ++h) {
+        for(int w = 0; w < W; ++w) {
+          for(int c = 0; c < C; ++c) {
+            at<float>(n*H*W*C + h*W*C + w*C + c) = f(n,c,h,w);
+          }
+        }
+      }
+    }
+  }
+
+  // Rank 3, dimensions read as {C, H, W}: index h*W*C + w*C + c is set to f(c, h, w).
+  // NOTE(review): dimensions are read C-first but indexed C-last - confirm this is intended.
+  typedef std::function<float(uint32_t c, uint32_t h, uint32_t w)> funcInit3;
+  void initValue(funcInit3 f) {
+    assert(_shape.dimensions.size() == 3);
+    int C = _shape.dimensions[0];
+    int H = _shape.dimensions[1];
+    int W = _shape.dimensions[2];
+
+    for(int h = 0; h < H; ++h) {
+      for(int w = 0; w < W; ++w) {
+        for(int c = 0; c < C; ++c) {
+          at<float>(h*W*C + w*C + c) = f(c,h,w);
+        }
+      }
+    }
+  }
+
+  // Rank 2, dimensions read as {H, W}: index h*W + w is set to f(h, w).
+  typedef std::function<float(uint32_t h, uint32_t w)> funcInit2;
+  void initValue(funcInit2 f) {
+    assert(_shape.dimensions.size() == 2);
+    int H = _shape.dimensions[0];
+    int W = _shape.dimensions[1];
+
+    for(int h = 0; h < H; ++h) {
+      for(int w = 0; w < W; ++w) {
+        at<float>(h*W + w) = f(h,w);
+      }
+    }
+  }
+
+  // Rank 1: element w is set to f(w).
+  typedef std::function<float(uint32_t w)> funcInit1;
+  void initValue(funcInit1 f) {
+    assert(_shape.dimensions.size() == 1);
+    int W = _shape.dimensions[0];
+
+    for(int w = 0; w < W; ++w) {
+      at<float>(w) = f(w);
+    }
+  }
+
+  // Copies `v` into the buffer; `v` must hold exactly num_elems() values.
+  void initValue(std::vector<float> v) {
+    assert(v.size() == _num_elems);
+    for( uint32_t i = 0; i < _num_elems; ++i ) {
+      at<float>(i) = v[i];
+    }
+  }
+
+  // Compares element-wise with an absolute tolerance of 0.001 and prints the
+  // first mismatch. Both tensors must have equal shapes (verified by assert only).
+  bool operator==(const TensorWrapper &t) const {
+    // compare the shape
+    assert(num_elems() == t.num_elems());
+    assert(_shape.type == t.shape().type);
+    assert(_shape.scale == t.shape().scale);
+    assert(_shape.offset == t.shape().offset);
+    assert(_shape.dimensions == t.shape().dimensions);
+    // currently, we support only FLOAT32.
+    assert(_shape.type == OperandType::FLOAT32);
+
+    for( uint32_t i = 0; i < _num_elems; ++i ) {
+      if( std::fabs(static_cast<float>(at<float>(i) - t.at<float>(i))) > 0.001f ) {
+        std::cout << "Comparing [" << i << "] " << at<float>(i) << "," << t.at<float>(i) << std::endl;
+        return false;
+      }
+    }
+    return true;
+  }
+
+private:
+  nnfw::rt::Shape      _shape;
+  uint32_t             _num_elems;
+  std::vector<uint8_t> _data; // raw bytes: num_elems * sizeof(float)
+};
+
+void initData(float* data, int num, float value);
+bool compareData(const float* result, const float* expected, const nnfw::rt::Shape& shape);
+void initData_Increasing(float* data, int num, float value);
+
+void NCHW2NHWC(const float* nchw, float* nhwc, const nnfw::rt::Shape& shape);
+
+} // namespace util
+} // namespace acl
+} // namespace kernel
+} // namespace nnfw
+
+#endif // __NNFW_KERNEL_ACL_UTIL_H__