diff options
Diffstat (limited to 'runtimes/neurun/backend/acl_cl')
23 files changed, 3947 insertions, 0 deletions
diff --git a/runtimes/neurun/backend/acl_cl/Backend.h b/runtimes/neurun/backend/acl_cl/Backend.h new file mode 100644 index 000000000..7c69d7b40 --- /dev/null +++ b/runtimes/neurun/backend/acl_cl/Backend.h @@ -0,0 +1,64 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __NEURUN_BACKEND_ACL_CL_BACKEND_H__ +#define __NEURUN_BACKEND_ACL_CL_BACKEND_H__ + +#include <memory> +#include <backend/Backend.h> +#include <model/Operands.h> + +#include "Config.h" +#include "ConstantInitializer.h" +#include "KernelGenerator.h" +#include "ShapeFixer.h" +#include "TensorManager.h" +#include "backend/CustomKernelRegistry.h" + +namespace neurun +{ +namespace backend +{ +namespace acl_cl +{ + +class Backend : public ::neurun::backend::Backend +{ +public: + Backend() : _config{std::make_shared<Config>()} {} + + std::shared_ptr<IConfig> config() const override { return _config; } + + std::unique_ptr<BackendContext> + newContext(const model::Operands &operands, + const std::shared_ptr<custom::KernelRegistry> &) const override + { + auto tensor_builder = std::make_shared<TensorBuilder>(createTensorManager()); + return std::unique_ptr<BackendContext>{new BackendContext{ + this, tensor_builder, std::make_shared<ConstantInitializer>(operands, tensor_builder), + std::make_shared<KernelGenerator>(operands, tensor_builder), + std::make_shared<ShapeFixer>(operands, tensor_builder)}}; + } + 
+private: + std::shared_ptr<IConfig> _config; +}; + +} // namespace acl_cl +} // namespace backend +} // namespace neurun + +#endif // __NEURUN_BACKEND_ACL_CL_BACKEND_H__ diff --git a/runtimes/neurun/backend/acl_cl/CLTimer.h b/runtimes/neurun/backend/acl_cl/CLTimer.h new file mode 100644 index 000000000..3939ee722 --- /dev/null +++ b/runtimes/neurun/backend/acl_cl/CLTimer.h @@ -0,0 +1,108 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef __NEURUN_BACKEND_ACL_CL_CLTIMER_H__ +#define __NEURUN_BACKEND_ACL_CL_CLTIMER_H__ + +#include <util/ITimer.h> +#include <arm_compute/core/CL/OpenCL.h> +#include <arm_compute/runtime/CL/CLScheduler.h> +#include <chrono> +#include <list> +#include <sstream> + +namespace neurun +{ +namespace backend +{ +namespace acl_cl +{ + +/** + * @brief Class to measure CL kernels execution time + */ +class CLTimer : public util::ITimer +{ +public: + /** + * @brief This function replaces CL function, which enqueues a command to execute a kernel + * with a wrapper which remembers enqueued kernels + */ + void handleBegin() override + { + _measured_events.clear(); + + _origin_enqueue_function = arm_compute::CLSymbols::get().clEnqueueNDRangeKernel_ptr; + + auto _timer_enqueue_function = [this](cl_command_queue command_queue, cl_kernel kernel, + cl_uint work_dim, const size_t *gwo, const size_t *gws, + const size_t *lws, cl_uint num_events_in_wait_list, + const cl_event *event_wait_list, cl_event *usr_event) { + cl_event event; + cl_int enqueue_res = + this->_origin_enqueue_function(command_queue, kernel, work_dim, gwo, gws, lws, + num_events_in_wait_list, event_wait_list, &event); + this->_measured_events.emplace_back(event); + + // According to spec, if NULL was provided in usr_event - event shouldn't be returned + if (usr_event != nullptr) + { + clRetainEvent(event); + *usr_event = event; + } + return enqueue_res; + }; + arm_compute::CLSymbols::get().clEnqueueNDRangeKernel_ptr = _timer_enqueue_function; + + // Set CL_QUEUE_PROFILING_ENABLE flag for the CL command-queue, if it isn't already set + auto &cl_scheduler = arm_compute::CLScheduler::get(); + auto props = cl_scheduler.queue().getInfo<CL_QUEUE_PROPERTIES>(); + if ((props & CL_QUEUE_PROFILING_ENABLE) == 0) + { + cl_scheduler.set_queue( + cl::CommandQueue(cl_scheduler.context(), props | CL_QUEUE_PROFILING_ENABLE)); + } + }; + + /** + * @brief Get timer result by addition executed CL kernels durations + */ + void 
handleEnd() override + { + _timer_res = 0; + for (auto const &event : _measured_events) + { + cl_ulong start; + cl_ulong end; + event.getProfilingInfo(CL_PROFILING_COMMAND_START, &start); + event.getProfilingInfo(CL_PROFILING_COMMAND_END, &end); + _timer_res += (end - start) / 1000.f; // nanoseconds -> microseconds + } + + // Restore origin CL enqueue function + arm_compute::CLSymbols::get().clEnqueueNDRangeKernel_ptr = _origin_enqueue_function; + }; + +private: + std::function<decltype(clEnqueueNDRangeKernel)> _origin_enqueue_function; + std::list<::cl::Event> _measured_events; +}; + +} // namespace acl_cl +} // namespace backend +} // namespace neurun + +#endif // __NEURUN_BACKEND_ACL_CL_CLTIMER_H__ diff --git a/runtimes/neurun/backend/acl_cl/CMakeLists.txt b/runtimes/neurun/backend/acl_cl/CMakeLists.txt new file mode 100644 index 000000000..7d7b50cf0 --- /dev/null +++ b/runtimes/neurun/backend/acl_cl/CMakeLists.txt @@ -0,0 +1,21 @@ +# Unsupported architecture +nnfw_find_package(ARMCompute QUIET) +if(NOT ARMCompute_FOUND) + return() +endif(NOT ARMCompute_FOUND) + +set(LIB_NEURUN_BACKEND_ACL_CL neurun_backend_acl_cl) + +file(GLOB_RECURSE SOURCES "*.cc") + +add_library(${LIB_NEURUN_BACKEND_ACL_CL} SHARED ${SOURCES}) + +target_include_directories(${LIB_NEURUN_BACKEND_ACL_CL} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}) +target_link_libraries(${LIB_NEURUN_BACKEND_ACL_CL} PRIVATE neurun_core) +target_link_libraries(${LIB_NEURUN_BACKEND_ACL_CL} PRIVATE ${LIB_NEURUN_BACKEND_ACL_COMMON}) +target_link_libraries(${LIB_NEURUN_BACKEND_ACL_CL} PRIVATE nnfw_common) +target_link_libraries(${LIB_NEURUN_BACKEND_ACL_CL} PRIVATE nnfw_coverage) + +set_target_properties(${LIB_NEURUN_BACKEND_ACL_CL} PROPERTIES OUTPUT_NAME backend_acl_cl) + +install(TARGETS ${LIB_NEURUN_BACKEND_ACL_CL} DESTINATION lib) diff --git a/runtimes/neurun/backend/acl_cl/Config.cc b/runtimes/neurun/backend/acl_cl/Config.cc new file mode 100644 index 000000000..0c0769184 --- /dev/null +++ 
b/runtimes/neurun/backend/acl_cl/Config.cc @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// For CLKernelLibraryEx initialization +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/CLKernelLibraryEx.h" + +#include <arm_compute/runtime/CL/CLScheduler.h> + +#include "Config.h" + +namespace neurun +{ +namespace backend +{ +namespace acl_cl +{ + +void Config::initialize() +{ + arm_compute::CLScheduler::get().default_init(); + // NOTE CLKernelLibraryEx must use the same context as CLScheduler + // It did not check whether another device is available. + arm_compute::CLKernelLibraryEx::get().init( + "./cl_kernels/", arm_compute::CLScheduler::get().context(), cl::Device::getDefault()); +} + +} // namespace acl_cl +} // namespace backend +} // namespace neurun diff --git a/runtimes/neurun/backend/acl_cl/Config.h b/runtimes/neurun/backend/acl_cl/Config.h new file mode 100644 index 000000000..185765161 --- /dev/null +++ b/runtimes/neurun/backend/acl_cl/Config.h @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __NEURUN_BACKEND_ACL_CL_CONFIG_H__ +#define __NEURUN_BACKEND_ACL_CL_CONFIG_H__ + +#include "CLTimer.h" +#include <cpp14/memory.h> +#include <backend/IConfig.h> + +namespace neurun +{ +namespace backend +{ +namespace acl_cl +{ + +class Config : public IConfig +{ +public: + std::string id() override { return "acl_cl"; } + void initialize() override; + bool SupportSubTensorAlloc() override { return true; } + std::unique_ptr<util::ITimer> timer() override { return nnfw::cpp14::make_unique<CLTimer>(); } +}; + +} // namespace acl_cl +} // namespace backend +} // namespace neurun + +#endif // __NEURUN_BACKEND_ACL_CL_CONFIG_H__ diff --git a/runtimes/neurun/backend/acl_cl/ConstantInitializer.cc b/runtimes/neurun/backend/acl_cl/ConstantInitializer.cc new file mode 100644 index 000000000..0a8f536ec --- /dev/null +++ b/runtimes/neurun/backend/acl_cl/ConstantInitializer.cc @@ -0,0 +1,214 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "ConstantInitializer.h" + +namespace neurun +{ +namespace backend +{ +namespace acl_cl +{ + +ConstantInitializer::ConstantInitializer(const model::Operands &operands, + const std::shared_ptr<TensorBuilder> &tensor_builder) + : _operands{operands}, _tensor_builder{tensor_builder} +{ + // DO NOTHING +} + +void ConstantInitializer::run() +{ + for (const auto &it : _init_map) + { + const auto &ind = it.first; + const auto &fn = it.second; + + const auto &model_obj = _operands.at(ind); + auto tensor_obj = _tensor_builder->wrapTensor(ind); + fn(model_obj, *tensor_obj); + } + + _init_map.clear(); +} + +void ConstantInitializer::visit(const model::operation::Conv2DNode &node) +{ + const auto &kernel_index = node.getInputs().at(model::operation::Conv2DNode::KERNEL); + const auto &kernel_obj = _operands.at(kernel_index); + registerPermuteInitializer(kernel_index, kernel_obj); + + const auto &bias_index = node.getInputs().at(model::operation::Conv2DNode::BIAS); + const auto &bias_obj = _operands.at(bias_index); + registerCopyInitializer(bias_index, bias_obj); +} + +void ConstantInitializer::visit(const model::operation::DepthwiseConv2DNode &node) +{ + const auto &kernel_index = node.getInputs().at(model::operation::DepthwiseConv2DNode::KERNEL); + const auto &kernel_obj = _operands.at(kernel_index); + registerPermuteInitializer(kernel_index, kernel_obj); + + const auto &bias_index = node.getInputs().at(model::operation::DepthwiseConv2DNode::BIAS); + const auto &bias_obj = _operands.at(bias_index); + registerCopyInitializer(bias_index, bias_obj); +} + +void ConstantInitializer::visit(const model::operation::EmbeddingLookupNode &node) +{ + const auto &lookups_index = node.getInputs().at(model::operation::EmbeddingLookupNode::LOOKUPS); + const auto &lookups_obj = _operands.at(lookups_index); + registerCopyInitializer(lookups_index, lookups_obj); +} + +void ConstantInitializer::visit(const model::operation::FullyConnectedNode &node) +{ + const auto &weight_index = 
node.getInputs().at(model::operation::FullyConnectedNode::WEIGHT); + const auto &weight_obj = _operands.at(weight_index); + registerCopyInitializer(weight_index, weight_obj); + + const auto &bias_index = node.getInputs().at(model::operation::FullyConnectedNode::BIAS); + const auto &bias_obj = _operands.at(bias_index); + registerCopyInitializer(bias_index, bias_obj); +} + +void ConstantInitializer::visit(const model::operation::GatherNode &node) +{ + const auto &indices_index = node.getInputs().at(model::operation::GatherNode::INDICES); + const auto &indices_obj = _operands.at(indices_index); + registerCopyInitializer(indices_index, indices_obj); +} + +void ConstantInitializer::visit(const model::operation::HashtableLookupNode &node) +{ + const auto &lookups_index = node.getInputs().at(model::operation::HashtableLookupNode::LOOKUPS); + const auto &lookups_obj = _operands.at(lookups_index); + registerCopyInitializer(lookups_index, lookups_obj); + + const auto &keys_index = node.getInputs().at(model::operation::HashtableLookupNode::KEYS); + const auto &keys_obj = _operands.at(keys_index); + registerCopyInitializer(keys_index, keys_obj); +} + +void ConstantInitializer::visit(const model::operation::LSTMNode &node) +{ + const auto &input_to_input_weights_index = + node.getInputs().at(model::operation::LSTMNode::INPUT_TO_INPUT_WEIGHTS); + const auto &input_to_input_weights_obj = _operands.at(input_to_input_weights_index); + registerCopyInitializer(input_to_input_weights_index, input_to_input_weights_obj); + + const auto &input_to_forget_weights_index = + node.getInputs().at(model::operation::LSTMNode::INPUT_TO_FORGET_WEIGHTS); + const auto &input_to_forget_weights_obj = _operands.at(input_to_forget_weights_index); + registerCopyInitializer(input_to_forget_weights_index, input_to_forget_weights_obj); + + const auto &input_to_cell_weights_index = + node.getInputs().at(model::operation::LSTMNode::INPUT_TO_CELL_WEIGHTS); + const auto &input_to_cell_weights_obj = 
_operands.at(input_to_cell_weights_index); + registerCopyInitializer(input_to_cell_weights_index, input_to_cell_weights_obj); + + const auto &input_to_output_weights_index = + node.getInputs().at(model::operation::LSTMNode::INPUT_TO_OUTPUT_WEIGHTS); + const auto &input_to_output_weights_obj = _operands.at(input_to_output_weights_index); + registerCopyInitializer(input_to_output_weights_index, input_to_output_weights_obj); + + const auto &recurrent_to_input_weights_index = + node.getInputs().at(model::operation::LSTMNode::RECURRENT_TO_INPUT_WEIGHTS); + const auto &recurrent_to_input_weights_obj = _operands.at(recurrent_to_input_weights_index); + registerCopyInitializer(recurrent_to_input_weights_index, recurrent_to_input_weights_obj); + + const auto &recurrent_to_forget_weights_index = + node.getInputs().at(model::operation::LSTMNode::RECURRENT_TO_FORGET_WEIGHTS); + const auto &recurrent_to_forget_weights_obj = _operands.at(recurrent_to_forget_weights_index); + registerCopyInitializer(recurrent_to_forget_weights_index, recurrent_to_forget_weights_obj); + + const auto &recurrent_to_cell_weights_index = + node.getInputs().at(model::operation::LSTMNode::RECURRENT_TO_CELL_WEIGHTS); + const auto &recurrent_to_cell_weights_obj = _operands.at(recurrent_to_cell_weights_index); + registerCopyInitializer(recurrent_to_cell_weights_index, recurrent_to_cell_weights_obj); + + const auto &recurrent_to_output_weights_index = + node.getInputs().at(model::operation::LSTMNode::RECURRENT_TO_OUTPUT_WEIGHTS); + const auto &recurrent_to_output_weights_obj = _operands.at(recurrent_to_output_weights_index); + registerCopyInitializer(recurrent_to_output_weights_index, recurrent_to_output_weights_obj); + + const auto &cell_to_input_weights_index = + node.getInputs().at(model::operation::LSTMNode::CELL_TO_INPUT_WEIGHTS); + const auto &cell_to_input_weights_obj = _operands.at(cell_to_input_weights_index); + registerCopyInitializer(cell_to_input_weights_index, cell_to_input_weights_obj); + + 
const auto &cell_to_forget_weights_index = + node.getInputs().at(model::operation::LSTMNode::CELL_TO_FORGET_WEIGHTS); + const auto &cell_to_forget_weights_obj = _operands.at(cell_to_forget_weights_index); + registerCopyInitializer(cell_to_forget_weights_index, cell_to_forget_weights_obj); + + const auto &cell_to_output_weights_index = + node.getInputs().at(model::operation::LSTMNode::CELL_TO_OUTPUT_WEIGHTS); + const auto &cell_to_output_weights_obj = _operands.at(cell_to_output_weights_index); + registerCopyInitializer(cell_to_output_weights_index, cell_to_output_weights_obj); + + const auto &input_gate_bias_index = + node.getInputs().at(model::operation::LSTMNode::INPUT_GATE_BIAS); + const auto &input_gate_bias_obj = _operands.at(input_gate_bias_index); + registerCopyInitializer(input_gate_bias_index, input_gate_bias_obj); + + const auto &forget_gate_bias_index = + node.getInputs().at(model::operation::LSTMNode::FORGET_GATE_BIAS); + const auto &forget_gate_bias_obj = _operands.at(forget_gate_bias_index); + registerCopyInitializer(forget_gate_bias_index, forget_gate_bias_obj); + + const auto &output_gate_bias_index = + node.getInputs().at(model::operation::LSTMNode::OUTPUT_GATE_BIAS); + const auto &output_gate_bias_obj = _operands.at(output_gate_bias_index); + registerCopyInitializer(output_gate_bias_index, output_gate_bias_obj); + + const auto &projection_weights_index = + node.getInputs().at(model::operation::LSTMNode::PROJECTION_WEIGHTS); + const auto &projection_weights_obj = _operands.at(projection_weights_index); + registerCopyInitializer(projection_weights_index, projection_weights_obj); + + const auto &projection_bias_index = + node.getInputs().at(model::operation::LSTMNode::PROJECTION_BIAS); + const auto &projection_bias_obj = _operands.at(projection_bias_index); + registerCopyInitializer(projection_bias_index, projection_bias_obj); +} + +void ConstantInitializer::visit(const model::operation::RNNNode &node) +{ + const auto &weights_index = 
node.getInputs().at(model::operation::RNNNode::WEIGHTS); + const auto &weights_obj = _operands.at(weights_index); + registerCopyInitializer(weights_index, weights_obj); + + const auto &recurrent_weights_index = + node.getInputs().at(model::operation::RNNNode::RECURRENT_WEIGHTS); + const auto &recurrent_weights_obj = _operands.at(recurrent_weights_index); + registerCopyInitializer(recurrent_weights_index, recurrent_weights_obj); + + const auto &bias_index = node.getInputs().at(model::operation::RNNNode::BIAS); + const auto &bias_obj = _operands.at(bias_index); + registerCopyInitializer(bias_index, bias_obj); +} + +void ConstantInitializer::visit(const model::operation::TransposeConvNode &node) +{ + const auto &kernel_index = node.getInputs().at(model::operation::TransposeConvNode::KERNEL); + const auto &kernel_obj = _operands.at(kernel_index); + registerPermuteInitializer(kernel_index, kernel_obj); +} + +} // namespace acl_cl +} // namespace backend +} // namespace neurun diff --git a/runtimes/neurun/backend/acl_cl/ConstantInitializer.h b/runtimes/neurun/backend/acl_cl/ConstantInitializer.h new file mode 100644 index 000000000..59772e0f7 --- /dev/null +++ b/runtimes/neurun/backend/acl_cl/ConstantInitializer.h @@ -0,0 +1,60 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef __NEURUN_COMPILER_ACL_CL_CONSTANT_INITIALIZER_H__ +#define __NEURUN_COMPILER_ACL_CL_CONSTANT_INITIALIZER_H__ + +#include <backend/IConstantInitializer.h> +#include <model/Operands.h> +#include "TensorBuilder.h" + +namespace neurun +{ +namespace backend +{ +namespace acl_cl +{ + +class ConstantInitializer : public IConstantInitializer +{ +public: + ConstantInitializer(const model::Operands &operands, + const std::shared_ptr<TensorBuilder> &tensor_builder); + +public: + void run() override; + +public: + void visit(const model::operation::Conv2DNode &) override; + void visit(const model::operation::DepthwiseConv2DNode &) override; + void visit(const model::operation::EmbeddingLookupNode &) override; + void visit(const model::operation::FullyConnectedNode &) override; + void visit(const model::operation::GatherNode &) override; + void visit(const model::operation::HashtableLookupNode &) override; + void visit(const model::operation::LSTMNode &) override; + void visit(const model::operation::RNNNode &) override; + void visit(const model::operation::TransposeConvNode &) override; + +private: + const model::Operands &_operands; + std::shared_ptr<TensorBuilder> _tensor_builder; +}; + +} // namespace acl_cl +} // namespace backend +} // namespace neurun + +#endif // __NEURUN_COMPILER_ACL_CL_CONSTANT_INITIALIZER_H__ diff --git a/runtimes/neurun/backend/acl_cl/KernelGenerator.cc b/runtimes/neurun/backend/acl_cl/KernelGenerator.cc new file mode 100644 index 000000000..8b019a45a --- /dev/null +++ b/runtimes/neurun/backend/acl_cl/KernelGenerator.cc @@ -0,0 +1,2034 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "KernelGenerator.h" + +#include <arm_compute/runtime/CL/CLFunctions.h> // Include all ARM Compute CL functions +#include <arm_compute/runtime/CL/CLFunctionsEx.h> // Include all ARM Compute EX CL functions +#include <arm_compute/runtime/misc/functions/GenericGather.h> +#include <arm_compute/runtime/misc/functions/GenericReshapeLayer.h> + +#include <AclFunction.h> +#include <Convert.h> +#include <Swizzle.h> + +#include "kernel/ConcatLayer.h" +#include "model/Index.h" +#include "model/DataType.h" +#include "model/InternalType.h" +#include "compiler/IExecutionBuilder.h" +#include "exec/NopFunction.h" +#include "util/logging.h" +#include "util/Utils.h" +#include "util/Padding.h" + +using ::neurun::compiler::IExecutionBuilder; + +namespace neurun +{ +namespace backend +{ +namespace acl_cl +{ + +using ::neurun::backend::acl_common::asAclFunction; + +// +// ActivationBuilder +// +class ActivationBuilder +{ +public: + explicit ActivationBuilder(IExecutionBuilder &builder) : _builder(builder) + { + // DO NOTHING + } + +private: + void appendReLU(::arm_compute::ICLTensor *ifm_alloc); + void appendReLU1(::arm_compute::ICLTensor *ifm_alloc); + void appendReLU6(::arm_compute::ICLTensor *ifm_alloc); + +public: + void append(model::Activation code, ::arm_compute::ICLTensor *ifm_alloc); + +private: + IExecutionBuilder &_builder; +}; + +void ActivationBuilder::appendReLU(::arm_compute::ICLTensor *ifm_alloc) +{ + const ::arm_compute::ActivationLayerInfo act_info{ + ::arm_compute::ActivationLayerInfo::ActivationFunction::RELU}; + + auto fn = 
nnfw::cpp14::make_unique<::arm_compute::CLActivationLayer>(); + + fn->configure(ifm_alloc, nullptr, act_info); + + auto acl_fn = asAclFunction(std::move(fn)); + + _builder.append(std::move(acl_fn)); +} + +void ActivationBuilder::appendReLU1(::arm_compute::ICLTensor *ifm_alloc) +{ + const ::arm_compute::ActivationLayerInfo act_info{ + ::arm_compute::ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 1.0f, -1.0f}; + + auto fn = nnfw::cpp14::make_unique<::arm_compute::CLActivationLayer>(); + + fn->configure(ifm_alloc, nullptr, act_info); + + auto acl_fn = asAclFunction(std::move(fn)); + + _builder.append(std::move(acl_fn)); +} + +void ActivationBuilder::appendReLU6(::arm_compute::ICLTensor *ifm_alloc) +{ + const ::arm_compute::ActivationLayerInfo act_info{ + ::arm_compute::ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 6.0f, 0.0f}; + + auto fn = nnfw::cpp14::make_unique<::arm_compute::CLActivationLayer>(); + + fn->configure(ifm_alloc, nullptr, act_info); + + auto acl_fn = asAclFunction(std::move(fn)); + + _builder.append(std::move(acl_fn)); +} + +void ActivationBuilder::append(model::Activation code, ::arm_compute::ICLTensor *ifm_alloc) +{ + switch (code) + { + case model::Activation::NONE: + { + // DO NOTHING + break; + } + case model::Activation::RELU: + { + appendReLU(ifm_alloc); + break; + } + case model::Activation::RELU1: + { + appendReLU1(ifm_alloc); + break; + } + case model::Activation::RELU6: + { + appendReLU6(ifm_alloc); + break; + } + default: + { + throw std::runtime_error("Not supported, yet"); + } + } +} + +// +// KernelGenerator +// +KernelGenerator::KernelGenerator(const neurun::model::Operands &ctx, + const std::shared_ptr<TensorBuilder> &tensor_builder) + : _ctx(ctx), _tensor_builder(tensor_builder), _current_subg_layout(model::Layout::UNKNOWN) +{ + // DO NOTHING +} + +void KernelGenerator::visit(const model::Subgraph &subgraph) +{ + _current_subg_layout = subgraph.getLayout(); + for (const auto &e : subgraph.operations()) + { + 
const auto &node = *(e.node); + _tensor_builder->preVisit(node); + node.accept(*this); + _tensor_builder->postVisit(node); + } +} + +void KernelGenerator::visit(const model::operation::CastNode &node) +{ + const auto ofm_index{node.getOutputs().at(0)}; + const auto ifm_index{node.getInputs().at(model::operation::CastNode::Input::INPUT)}; + + auto ofm_alloc = _tensor_builder->at(ofm_index).get(); + auto ifm_alloc = _tensor_builder->at(ifm_index).get(); + + auto fn = nnfw::cpp14::make_unique<::arm_compute::CLCast>(); + + fn->configure(ifm_alloc->handle(), ofm_alloc->handle()); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); +} + +void KernelGenerator::visit(const model::operation::Conv2DNode &node) +{ + using model::operation::Conv2DNode; + + const auto ofm_index{node.getOutputs().at(0)}; + const auto ifm_index{node.getInputs().at(Conv2DNode::Input::INPUT)}; + const auto ker_index{node.getInputs().at(Conv2DNode::Input::KERNEL)}; + const auto bias_index{node.getInputs().at(Conv2DNode::Input::BIAS)}; + + const auto ifm_shape = _ctx.at(ifm_index).shape().asFeature(_current_subg_layout); + const auto ofm_shape = _ctx.at(ofm_index).shape().asFeature(_current_subg_layout); + // Kernel format is [depth_out, kernel_height, kernel_width, depth_in]. 
+ const auto &ker_shape = _ctx.at(ker_index).shape(); + const auto ker_height = ker_shape.dim(1); + const auto ker_width = ker_shape.dim(2); + + const auto stride = node.param().stride; + const auto padding = neurun::util::calculatePadding(node.param().padding, ifm_shape, ofm_shape, + stride, ker_width, ker_height); + const auto activation = node.param().activation; + + auto ofm_alloc = _tensor_builder->at(ofm_index).get(); + auto ifm_alloc = _tensor_builder->at(ifm_index).get(); + auto ker_alloc = _tensor_builder->at(ker_index).get(); + auto bias_alloc = _tensor_builder->at(bias_index).get(); + + const auto conv_info = acl_common::asPadStrideInfo(padding, stride); + const auto act_info = acl_common::asActivationLayerInfo(activation); + + auto fn = nnfw::cpp14::make_unique<::arm_compute::CLConvolutionLayer>( + _tensor_builder->acl_tensor_manager()->internal_buffer_manager()); + + fn->configure(ifm_alloc->handle(), ker_alloc->handle(), bias_alloc->handle(), ofm_alloc->handle(), + conv_info, ::arm_compute::WeightsInfo(), ::arm_compute::Size2D(1U, 1U), act_info); + + _execution_builder->append(asAclFunction(std::move(fn))); +} + +void KernelGenerator::visit(const model::operation::DepthwiseConv2DNode &node) +{ + using model::operation::DepthwiseConv2DNode; + + const auto ofm_index{node.getOutputs().at(0)}; + const auto ifm_index{node.getInputs().at(DepthwiseConv2DNode::Input::INPUT)}; + const auto ker_index{node.getInputs().at(DepthwiseConv2DNode::Input::KERNEL)}; + const auto bias_index{node.getInputs().at(DepthwiseConv2DNode::Input::BIAS)}; + + const auto ifm_shape = _ctx.at(ifm_index).shape().asFeature(_current_subg_layout); + const auto ofm_shape = _ctx.at(ofm_index).shape().asFeature(_current_subg_layout); + // Kernel format is [1, kernel_height, kernel_width, depth_out]. 
+ const auto &ker_shape = _ctx.at(ker_index).shape(); + const auto ker_height = ker_shape.dim(1); + const auto ker_width = ker_shape.dim(2); + + const auto stride = node.param().stride; + const auto padding = neurun::util::calculatePadding(node.param().padding, ifm_shape, ofm_shape, + stride, ker_width, ker_height); + const auto multiplier = node.param().multiplier; + const auto activation = node.param().activation; + + auto ofm_alloc = _tensor_builder->at(ofm_index).get(); + auto ifm_alloc = _tensor_builder->at(ifm_index).get(); + auto ker_alloc = _tensor_builder->at(ker_index).get(); + auto bias_alloc = _tensor_builder->at(bias_index).get(); + + const auto conv_info = acl_common::asPadStrideInfo(padding, stride); + // TODO Use `activation` instead of `model::Activation::NONE`. See below. + const auto act_info = acl_common::asActivationLayerInfo(model::Activation::NONE); + + if (ker_height == 3 && ker_width == 3) + { + auto fn = nnfw::cpp14::make_unique<::arm_compute::CLDepthwiseConvolutionLayer3x3>( + _tensor_builder->acl_tensor_manager()->internal_buffer_manager()); + + fn->configure(ifm_alloc->handle(), ker_alloc->handle(), bias_alloc->handle(), + ofm_alloc->handle(), conv_info, multiplier, act_info); + + _execution_builder->append(asAclFunction(std::move(fn))); + } + else + { + auto fn = nnfw::cpp14::make_unique<::arm_compute::CLDepthwiseConvolutionLayer>(); + + fn->configure(ifm_alloc->handle(), ker_alloc->handle(), bias_alloc->handle(), + ofm_alloc->handle(), conv_info, multiplier, act_info); + + _execution_builder->append(asAclFunction(std::move(fn))); + } + + // TODO Use fused activation instead of separate layer after switching to ACL version >= v19.05. + // Prior versions had a bug due to which the fused activation did not apply in some cases. 
  // The same ACL-version TODO as above applies here: the activation is appended
  // as a separate function instead of being fused into the convolution.
  ActivationBuilder{*_execution_builder}.append(activation, ofm_alloc->handle());
}

// Lowers a max-pooling node to an ACL CLPoolingLayer (PoolingType::MAX),
// followed by a separately appended activation function.
void KernelGenerator::visit(const model::operation::MaxPool2DNode &node)
{
  const auto ofm_index{node.getOutputs().at(0)};
  const auto ifm_index{node.getInputs().at(model::operation::MaxPool2DNode::Input::INPUT)};

  const auto ofm_shape = _ctx.at(ofm_index).shape().asFeature(_current_subg_layout);
  const auto ifm_shape = _ctx.at(ifm_index).shape().asFeature(_current_subg_layout);

  const auto kh = node.param().kh;
  const auto kw = node.param().kw;
  const auto stride = node.param().stride;
  // Explicit padding is derived from the frontend padding scheme plus the
  // input/output feature shapes.
  const auto padding =
      neurun::util::calculatePadding(node.param().padding, ifm_shape, ofm_shape, stride, kw, kh);
  const auto activation = node.param().activation;

  VERBOSE(MaxPool2D) << "IFM_H: " << ifm_shape.H << std::endl;
  VERBOSE(MaxPool2D) << "IFM_W: " << ifm_shape.W << std::endl;
  VERBOSE(MaxPool2D) << "OFM_H: " << ofm_shape.H << std::endl;
  VERBOSE(MaxPool2D) << "OFM_W: " << ofm_shape.W << std::endl;
  VERBOSE(MaxPool2D) << "KER_H: " << kh << std::endl;
  VERBOSE(MaxPool2D) << "KER_W: " << kw << std::endl;
  VERBOSE(MaxPool2D) << "STRIDE_H: " << stride.vertical << std::endl;
  VERBOSE(MaxPool2D) << "STRIDE_W: " << stride.horizontal << std::endl;
  VERBOSE(MaxPool2D) << "PAD(T): " << padding.top << std::endl;
  VERBOSE(MaxPool2D) << "PAD(B): " << padding.bottom << std::endl;
  VERBOSE(MaxPool2D) << "PAD(L): " << padding.left << std::endl;
  VERBOSE(MaxPool2D) << "PAD(R): " << padding.right << std::endl;

  auto ofm_alloc = _tensor_builder->at(ofm_index).get();
  auto ifm_alloc = _tensor_builder->at(ifm_index).get();

  ::arm_compute::PoolingLayerInfo info{::arm_compute::PoolingType::MAX,
                                       ::arm_compute::Size2D{kw, kh},
                                       acl_common::asPadStrideInfo(padding, stride)};

  auto fn = nnfw::cpp14::make_unique<::arm_compute::CLPoolingLayer>();

  fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), info);

  auto acl_fn = asAclFunction(std::move(fn));

  _execution_builder->append((std::move(acl_fn)));

  // Pooling has no fused activation here; append it as its own function.
  ActivationBuilder{*_execution_builder}.append(activation, ofm_alloc->handle());
}

// Lowers an average-pooling node to an ACL CLPoolingLayer (PoolingType::AVG).
void KernelGenerator::visit(const model::operation::AvgPool2DNode &node)
{
  const auto ofm_index{node.getOutputs().at(0)};
  const auto ifm_index{node.getInputs().at(model::operation::AvgPool2DNode::Input::INPUT)};

  const auto ofm_shape = _ctx.at(ofm_index).shape().asFeature(_current_subg_layout);
  const auto ifm_shape = _ctx.at(ifm_index).shape().asFeature(_current_subg_layout);

  const auto kh = node.param().kh;
  const auto kw = node.param().kw;
  const auto stride = node.param().stride;
  const auto padding =
      neurun::util::calculatePadding(node.param().padding, ifm_shape, ofm_shape, stride, kw, kh);
  const auto activation = node.param().activation;

  VERBOSE(AvgPool2D) << "IFM_H: " << ifm_shape.H << std::endl;
  VERBOSE(AvgPool2D) << "IFM_W: " << ifm_shape.W << std::endl;
  VERBOSE(AvgPool2D) << "OFM_H: " << ofm_shape.H << std::endl;
  VERBOSE(AvgPool2D) << "OFM_W: " << ofm_shape.W << std::endl;
  VERBOSE(AvgPool2D) << "KER_H: " << kh << std::endl;
  VERBOSE(AvgPool2D) << "KER_W: " << kw << std::endl;
  VERBOSE(AvgPool2D) << "STRIDE_H: " << stride.vertical << std::endl;
  VERBOSE(AvgPool2D) << "STRIDE_W: " << stride.horizontal << std::endl;
  VERBOSE(AvgPool2D) << "PAD(T): " << padding.top << std::endl;
  VERBOSE(AvgPool2D) << "PAD(B): " << padding.bottom << std::endl;
  VERBOSE(AvgPool2D) << "PAD(L): " << padding.left << std::endl;
  VERBOSE(AvgPool2D) << "PAD(R): " << padding.right << std::endl;

  auto ofm_alloc = _tensor_builder->at(ofm_index).get();
  auto ifm_alloc = _tensor_builder->at(ifm_index).get();

  // exclude_padding = true: padded cells do not contribute to the average.
  ::arm_compute::PoolingLayerInfo info{
      ::arm_compute::PoolingType::AVG, ::arm_compute::Size2D{kw, kh},
      acl_common::asPadStrideInfo(padding, stride), true /* exclude_padding */};

  auto fn = nnfw::cpp14::make_unique<::arm_compute::CLPoolingLayer>();

  fn->configure(ifm_alloc->handle(),
                ofm_alloc->handle(), info);

  auto acl_fn = asAclFunction(std::move(fn));

  _execution_builder->append((std::move(acl_fn)));

  // Pooling has no fused activation; append it as a separate function.
  ActivationBuilder{*_execution_builder}.append(activation, ofm_alloc->handle());
}

// Lowers a concatenation node. If every input has been allocated as a
// sub-tensor of the output, the copy is unnecessary and a NOP is emitted.
void KernelGenerator::visit(const model::operation::ConcatNode &node)
{
  const auto ofm_index{node.getOutputs().at(0)};

  std::vector<model::OperandIndex> input_indexes;

  for (const auto &input : node.getInputs())
    input_indexes.emplace_back(input);

  const auto axis = node.param().axis;

  // If tensor allocator allocate as subtensor
  bool canEliminate = true;
  for (auto &ifm_ind : input_indexes)
  {
    if (!_tensor_builder->isSubTensorOf(ofm_index, ifm_ind))
    {
      canEliminate = false;
      break;
    }
  }
  if (canEliminate)
  {
    // If concat eliminated, return a NOP IFunction
    _execution_builder->append(nnfw::cpp14::make_unique<exec::NopFunction>());
    return;
  }

  auto output_alloc = static_cast<::neurun::backend::acl_cl::operand::Object *>(
      _tensor_builder->wrapTensor(ofm_index).get());

  std::vector<::neurun::backend::acl_cl::operand::Object *> input_allocs;
  for (auto &ifm_ind : input_indexes)
    input_allocs.emplace_back(static_cast<::neurun::backend::acl_cl::operand::Object *>(
        _tensor_builder->wrapTensor(ifm_ind).get()));

  auto fn = nnfw::cpp14::make_unique<::neurun::backend::acl_cl::kernel::ConcatLayer>();

  // The frontend concat axis is translated into the backend tensor layout.
  const auto rank = _ctx.at(ofm_index).shape().rank();
  const auto frontend_layout = _current_subg_layout;
  const auto backend_layout = output_alloc->ptr()->layout();
  const auto fixed_axis =
      acl_common::ToARMComputeAxis(rank, axis, frontend_layout, backend_layout).value();

  fn->configure(input_allocs, fixed_axis, output_alloc);

  auto acl_fn = asAclFunction(std::move(fn));

  _execution_builder->append(std::move(acl_fn));
}

void KernelGenerator::visit(const model::operation::FullyConnectedNode &node)
{
  using model::operation::FullyConnectedNode;

  const auto
output_index{node.getOutputs().at(0)}; + const auto input_index{node.getInputs().at(FullyConnectedNode::Input::INPUT)}; + const auto weight_index{node.getInputs().at(FullyConnectedNode::Input::WEIGHT)}; + const auto bias_index{node.getInputs().at(FullyConnectedNode::Input::BIAS)}; + + const auto input_rank = _ctx.at(input_index).shape().rank(); + // TODO Currently we are not handling where the case is that the input's rank is 3. + // The handling should be added in the future. + assert(input_rank != 3); + + const auto output_size = _ctx.at(output_index).shape().dim(1); + UNUSED_RELEASE(output_size); + assert(_ctx.at(bias_index).shape().dim(0) == output_size); + assert(_ctx.at(weight_index).shape().dim(0) == output_size); + const auto batch_size = _ctx.at(output_index).shape().dim(0); + const auto input_size = _ctx.at(weight_index).shape().dim(1); + + // Check for reshaping input's shape into rank-2 + bool needs_reshape = false; + neurun::model::Shape reshape(2); + if (input_rank == 4) + { + const auto feature_size = _ctx.at(input_index).shape().num_elements(); + + UNUSED_RELEASE(feature_size); + assert((batch_size * input_size) >= 0); + assert(feature_size == static_cast<uint64_t>(batch_size * input_size)); + + // for reshaping + needs_reshape = true; + reshape.dim(0) = batch_size; /* H */ + reshape.dim(1) = input_size; /* W */ + } + + const auto activation = node.param().activation; + + auto output_alloc = _tensor_builder->at(output_index).get(); + auto input_alloc = _tensor_builder->at(input_index).get(); + auto weight_alloc = _tensor_builder->at(weight_index).get(); + auto bias_alloc = _tensor_builder->at(bias_index).get(); + auto acl_layout = output_alloc->handle()->info()->data_layout(); + + auto fn = nnfw::cpp14::make_unique<arm_compute::CLFullyConnectedReshapingLayer>( + _tensor_builder->acl_tensor_manager()->internal_buffer_manager()); + + fn->configure( + input_alloc->handle(), weight_alloc->handle(), bias_alloc->handle(), output_alloc->handle(), + 
needs_reshape, + ::neurun::backend::acl_common::asTensorShape(/* TODO Support NCHW frontend */ + reshape, model::Layout::NHWC, + ::neurun::backend::acl_common::asRuntimeLayout( + acl_layout))); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); + + ActivationBuilder{*_execution_builder}.append(activation, output_alloc->handle()); +} + +void KernelGenerator::visit(const model::operation::MulNode &node) +{ + const auto ofm_index{node.getOutputs().at(0)}; + const auto lhs_index{node.getInputs().at(model::operation::MulNode::Input::LHS)}; + const auto rhs_index{node.getInputs().at(model::operation::MulNode::Input::RHS)}; + + const auto activation = node.param().activation; + + auto ofm_alloc = _tensor_builder->at(ofm_index).get(); + auto lhs_alloc = _tensor_builder->at(lhs_index).get(); + auto rhs_alloc = _tensor_builder->at(rhs_index).get(); + + auto fn = nnfw::cpp14::make_unique<::arm_compute::CLPixelWiseMultiplication>(); + + fn->configure(lhs_alloc->handle(), rhs_alloc->handle(), ofm_alloc->handle(), 1.0, // scale + arm_compute::ConvertPolicy::SATURATE, arm_compute::RoundingPolicy::TO_NEAREST_EVEN); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); + + ActivationBuilder{*_execution_builder}.append(activation, ofm_alloc->handle()); +} + +void KernelGenerator::visit(const model::operation::ReduceSumNode &node) +{ + const auto output_index{node.getOutputs().at(0)}; + const auto input_index{node.getInputs().at(model::operation::ReduceSumNode::Input::INPUT)}; + const auto axis_index{node.param().axis_index}; + + const auto axis_base = _ctx.at(axis_index).data().base(); + const auto axis_size = _ctx.at(axis_index).shape().num_elements(); + const auto input_rank = _ctx.at(input_index).shape().rank(); + + auto output_alloc = _tensor_builder->at(output_index).get(); + auto input_alloc = _tensor_builder->at(input_index).get(); + const auto frontend_layout = _current_subg_layout; 
  const auto backend_layout = input_alloc->layout();
  // The axis's data must exist as constant values
  assert(axis_base != nullptr);
  std::set<uint32_t> axes;
  for (size_t n = 0; n < axis_size; ++n)
  {
    int32_t axis_value = *(reinterpret_cast<const int32_t *>(axis_base) + n);
    // Negative axes count from the back, as in the NN API / TensorFlow.
    if (axis_value < 0)
    {
      axis_value += input_rank;
    }
    // Translate each frontend axis into the backend layout before reducing.
    axes.insert(::neurun::backend::acl_common::ToARMComputeAxis(input_rank, axis_value,
                                                                frontend_layout, backend_layout)
                    .value());
  }

  auto fn = nnfw::cpp14::make_unique<::arm_compute::CLReduceOperation>();

  fn->configure(input_alloc->handle(), output_alloc->handle(), axes,
                ::arm_compute::ReduceOperation::SUM);

  auto acl_fn = asAclFunction(std::move(fn));

  _execution_builder->append(std::move(acl_fn));
}

// Lowers Reshape via GenericReshapeLayer (handles layout differences).
void KernelGenerator::visit(const model::operation::ReshapeNode &node)
{
  const auto output_index{node.getOutputs().at(0)};
  const auto input_index{node.getInputs().at(model::operation::ReshapeNode::Input::INPUT)};

  auto output_alloc = _tensor_builder->at(output_index).get();
  auto input_alloc = _tensor_builder->at(input_index).get();

  // NOTE This operation must not be changed the layout from frontend to backend
  // However, this runtime can be change the layout of this operation from NHWC to NCHW now
  // TODO Change the layout of frontend and backend to be the same and layer to CLReshapeLayer
  auto fn = nnfw::cpp14::make_unique<::arm_compute::misc::GenericReshapeLayer>();

  fn->configure(input_alloc->handle(), output_alloc->handle());

  auto acl_fn = asAclFunction(std::move(fn));

  _execution_builder->append(std::move(acl_fn));
}

void KernelGenerator::visit(const model::operation::SqueezeNode &node)
{
  // Squeeze is identical to reshape except that it has an optional dimensions input.
  // In addition, optional dims_index is ignored since output tensor already has squeezed shape
  // by freezer and toco
  // TODO Support multi-layout for frontend and backend
  const auto output_index{node.getOutputs().at(0)};
  const auto input_index{node.getInputs().at(model::operation::SqueezeNode::Input::INPUT)};
  const auto dims_index{node.param().dims};
  (void)dims_index;

  auto output_alloc = _tensor_builder->at(output_index).get();
  auto input_alloc = _tensor_builder->at(input_index).get();
  // A plain CLReshapeLayer suffices: the output operand already carries the
  // squeezed shape.
  auto fn = nnfw::cpp14::make_unique<arm_compute::CLReshapeLayer>();
  fn->configure(input_alloc->handle(), output_alloc->handle());
  auto acl_fn = asAclFunction(std::move(fn));
  _execution_builder->append(std::move(acl_fn));
}

// Lowers Tanh to CLActivationLayer (TANH with a = 1.0, b = 1.0).
void KernelGenerator::visit(const model::operation::TanhNode &node)
{
  const auto output_index{node.getOutputs().at(0)};
  const auto input_index{node.getInputs().at(model::operation::TanhNode::Input::INPUT)};

  auto output_alloc = _tensor_builder->at(output_index).get();
  auto input_alloc = _tensor_builder->at(input_index).get();

  auto fn = nnfw::cpp14::make_unique<arm_compute::CLActivationLayer>();

  const ::arm_compute::ActivationLayerInfo act_info{
      ::arm_compute::ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f};

  fn->configure(input_alloc->handle(), output_alloc->handle(), act_info);

  auto acl_fn = asAclFunction(std::move(fn));

  _execution_builder->append(std::move(acl_fn));
}

// Lowers Softmax to CLSoftmaxLayer; `beta` scales the logits.
void KernelGenerator::visit(const model::operation::SoftmaxNode &node)
{
  const auto output_index{node.getOutputs().at(0)};
  const auto input_index{node.getInputs().at(model::operation::SoftmaxNode::Input::INPUT)};

  const auto beta = node.param().beta;

  auto output_alloc = _tensor_builder->at(output_index).get();
  auto input_alloc = _tensor_builder->at(input_index).get();

  auto fn = nnfw::cpp14::make_unique<::arm_compute::CLSoftmaxLayer>(
      _tensor_builder->acl_tensor_manager()->internal_buffer_manager());

  fn->configure(input_alloc->handle(), output_alloc->handle(), beta);

  auto acl_fn = asAclFunction(std::move(fn));

  _execution_builder->append(std::move(acl_fn));
}

// Lowers StridedSlice to CLStridedSlice. The constant start/end/stride vectors
// and the mask bits are reordered from the frontend layout into the backend
// tensor layout before being handed to ACL.
void KernelGenerator::visit(const model::operation::StridedSliceNode &node)
{
  const auto output_index{node.getOutputs().at(0)};
  const auto input_index{node.getInputs().at(model::operation::StridedSliceNode::Input::INPUT)};
  const auto startData_index{node.param().startData_index};
  const auto endData_index{node.param().endData_index};
  const auto stridesData_index{node.param().stridesData_index};
  const auto beginMask_index{node.param().beginMask_index};
  const auto endMask_index{node.param().endMask_index};
  const auto shrinkAxisMask_index{node.param().shrinkAxisMask_index};

  auto outputData_alloc = _tensor_builder->at(output_index).get();
  auto inputData_alloc = _tensor_builder->at(input_index).get();
  const auto frontend_layout = _current_subg_layout;
  const auto backend_layout = inputData_alloc->layout();

  // Set initializers for indices data such as order of inputData
  int input_rank = _ctx.at(input_index).shape().rank();
  std::vector<int32_t> starts;
  std::vector<int32_t> ends;
  std::vector<int32_t> strides;
  starts.resize(input_rank, 0);
  ends.resize(input_rank, 0);
  strides.resize(input_rank, 0);
  {
    auto input_shape = _ctx.at(input_index).shape();
    auto startData_base = _ctx.at(startData_index).data().base();
    auto endData_base = _ctx.at(endData_index).data().base();
    auto stridesData_base = _ctx.at(stridesData_index).data().base();
    const int startData_size = _ctx.at(startData_index).shape().num_elements();
    const int endData_size = _ctx.at(endData_index).shape().num_elements();
    const int stridesData_size = _ctx.at(stridesData_index).shape().num_elements();

    using neurun::model::DataType;

    UNUSED_RELEASE(startData_size);
    UNUSED_RELEASE(endData_size);
    UNUSED_RELEASE(stridesData_size);

    // start/end/strides must be INT32 constant vectors of length `input_rank`.
    assert(_ctx.at(startData_index).typeInfo().type() == DataType::INT32);
    assert(_ctx.at(endData_index).typeInfo().type() == DataType::INT32);
    assert(_ctx.at(stridesData_index).typeInfo().type() == DataType::INT32);
    assert(startData_size == input_rank);
    assert(endData_size == input_rank);
    assert(stridesData_size == input_rank);

    assert(startData_base != nullptr);
    for (int n = 0; n < input_rank; ++n)
    {
      // Place each frontend-axis entry at its backend-layout position.
      auto axis = ::neurun::backend::acl_common::ToARMComputeAxis(input_rank, n, frontend_layout,
                                                                  backend_layout)
                      .value();

      int32_t start_value = *(reinterpret_cast<const int32_t *>(startData_base) + n);
      starts[axis] = start_value;

      int32_t end_value = *(reinterpret_cast<const int32_t *>(endData_base) + n);
      ends[axis] = end_value;

      int32_t strides_value = *(reinterpret_cast<const int32_t *>(stridesData_base) + n);
      strides[axis] = strides_value;
    }
  }

  // Set mask bits such as order of inputData
  const auto beginMask = ::neurun::backend::acl_common::ReorderBits<int32_t>(
      _ctx.at(beginMask_index).asScalar<int32_t>(), input_rank, frontend_layout, backend_layout);
  const auto endMask = ::neurun::backend::acl_common::ReorderBits<int32_t>(
      _ctx.at(endMask_index).asScalar<int32_t>(), input_rank, frontend_layout, backend_layout);
  const auto shrinkAxisMask = ::neurun::backend::acl_common::ReorderBits<int32_t>(
      _ctx.at(shrinkAxisMask_index).asScalar<int32_t>(), input_rank, frontend_layout,
      backend_layout);

  ::arm_compute::Coordinates starts_set;
  ::arm_compute::Coordinates ends_set;
  ::arm_compute::BiStrides strides_set;

  for (size_t i = 0; i < starts.size(); ++i)
  {
    starts_set.set(i, starts[i]);
    ends_set.set(i, ends[i]);
    strides_set.set(i, strides[i]);
  }

  auto fn = nnfw::cpp14::make_unique<::arm_compute::CLStridedSlice>();

  fn->configure(inputData_alloc->handle(), outputData_alloc->handle(), starts_set, ends_set,
                strides_set, beginMask, endMask, shrinkAxisMask);

  auto acl_fn = asAclFunction(std::move(fn));

_execution_builder->append(std::move(acl_fn)); +} + +void KernelGenerator::visit(const model::operation::TransposeNode &node) +{ + const auto ofm_idx{node.getOutputs().at(0)}; + const auto ifm_idx{node.getInputs().at(model::operation::TransposeNode::Input::INPUT)}; + const auto perm{node.param().perm}; + + const auto rank = _ctx.at(ifm_idx).shape().rank(); + std::vector<int32_t> pv; + const auto perm_base = _ctx.at(perm).data().base(); + const int perm_size = _ctx.at(perm).shape().num_elements(); + + assert(perm_base != nullptr); + for (int32_t n = 0; n < perm_size; ++n) + { + int32_t perm_value = *(reinterpret_cast<const int32_t *>(perm_base) + n); + assert(perm_value < rank); + pv.emplace_back(perm_value); + } + + auto ofm_alloc = _tensor_builder->at(ofm_idx).get(); + auto ifm_alloc = _tensor_builder->at(ifm_idx).get(); + const auto frontend_layout = _current_subg_layout; + const auto backend_layout = ifm_alloc->layout(); + // Reversed + auto backend_pv = ::neurun::backend::acl_common::getARMComputePermutationVector( + rank, pv, frontend_layout, backend_layout); + + auto fn = nnfw::cpp14::make_unique<::arm_compute::CLPermute>(); + + fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), backend_pv); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); +} + +void KernelGenerator::visit(const model::operation::AddNode &node) +{ + const auto ofm_index{node.getOutputs().at(0)}; + const auto lhs_index{node.getInputs().at(model::operation::AddNode::Input::LHS)}; + const auto rhs_index{node.getInputs().at(model::operation::AddNode::Input::RHS)}; + + const auto activation = node.param().activation; + + auto ofm_alloc = _tensor_builder->at(ofm_index).get(); + auto lhs_alloc = _tensor_builder->at(lhs_index).get(); + auto rhs_alloc = _tensor_builder->at(rhs_index).get(); + + auto fn = nnfw::cpp14::make_unique<::arm_compute::CLArithmeticAddition>(); + + fn->configure(lhs_alloc->handle(), rhs_alloc->handle(), ofm_alloc->handle(), 
+ arm_compute::ConvertPolicy::SATURATE); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); + + ActivationBuilder{*_execution_builder}.append(activation, ofm_alloc->handle()); +} + +void KernelGenerator::visit(const model::operation::SubNode &node) +{ + const auto ofm_index{node.getOutputs().at(0)}; + const auto lhs_index{node.getInputs().at(model::operation::SubNode::Input::LHS)}; + const auto rhs_index{node.getInputs().at(model::operation::SubNode::Input::RHS)}; + + const auto activation = node.param().activation; + + auto ofm_alloc = _tensor_builder->at(ofm_index).get(); + auto lhs_alloc = _tensor_builder->at(lhs_index).get(); + auto rhs_alloc = _tensor_builder->at(rhs_index).get(); + + auto fn = nnfw::cpp14::make_unique<::arm_compute::CLArithmeticSubtraction>(); + + fn->configure(lhs_alloc->handle(), rhs_alloc->handle(), ofm_alloc->handle(), + arm_compute::ConvertPolicy::SATURATE); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); + + ActivationBuilder{*_execution_builder}.append(activation, ofm_alloc->handle()); +} + +void KernelGenerator::visit(const model::operation::DivNode &node) +{ + const auto ofm_index{node.getOutputs().at(0)}; + const auto lhs_index{node.getInputs().at(model::operation::DivNode::Input::LHS)}; + const auto rhs_index{node.getInputs().at(model::operation::DivNode::Input::RHS)}; + + const auto activation = node.param().activation; + + auto ofm_alloc = _tensor_builder->at(ofm_index).get(); + auto lhs_alloc = _tensor_builder->at(lhs_index).get(); + auto rhs_alloc = _tensor_builder->at(rhs_index).get(); + + auto fn = nnfw::cpp14::make_unique<::arm_compute::CLArithmeticDivision>(); + + fn->configure(lhs_alloc->handle(), rhs_alloc->handle(), ofm_alloc->handle()); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); + + ActivationBuilder{*_execution_builder}.append(activation, ofm_alloc->handle()); +} + 
+void KernelGenerator::visit(const model::operation::ExpNode &node) +{ + const auto output_index{node.getOutputs().at(0)}; + const auto input_index{node.getInputs().at(model::operation::ExpNode::Input::INPUT)}; + + auto output_alloc = _tensor_builder->at(output_index).get(); + auto input_alloc = _tensor_builder->at(input_index).get(); + + auto fn = nnfw::cpp14::make_unique<::arm_compute::CLExpLayer>(); + + fn->configure(input_alloc->handle(), output_alloc->handle()); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); +} + +void KernelGenerator::visit(const model::operation::LogisticNode &node) +{ + const auto ofm_index{node.getOutputs().at(0)}; + const auto ifm_index{node.getInputs().at(model::operation::LogisticNode::Input::INPUT)}; + + auto ofm_alloc = _tensor_builder->at(ofm_index).get(); + auto ifm_alloc = _tensor_builder->at(ifm_index).get(); + + const ::arm_compute::ActivationLayerInfo act_info{ + ::arm_compute::ActivationLayerInfo::ActivationFunction::LOGISTIC}; + + auto fn = nnfw::cpp14::make_unique<::arm_compute::CLActivationLayer>(); + + fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), act_info); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); +} + +void KernelGenerator::visit(const model::operation::LogicalAndNode &node) +{ + const auto output_index{node.getOutputs().at(0)}; + const auto input0_index{node.getInputs().at(model::operation::LogicalAndNode::Input::INPUT0)}; + const auto input1_index{node.getInputs().at(model::operation::LogicalAndNode::Input::INPUT1)}; + + auto output_alloc = _tensor_builder->at(output_index).get(); + auto input0_alloc = _tensor_builder->at(input0_index).get(); + auto input1_alloc = _tensor_builder->at(input1_index).get(); + + auto fn = nnfw::cpp14::make_unique<::arm_compute::CLBinaryLogicalOp>(); + + fn->configure(input0_alloc->handle(), input1_alloc->handle(), output_alloc->handle(), + 
::arm_compute::BinaryLogicalOperation::AND); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); +} + +void KernelGenerator::visit(const model::operation::LSTMNode &node) +{ + // TODO Support dynamic rnn + // TODO Fix subtle error in the case of non-CIFG, non-peephole and No Projection. + const auto scratch_buffer_index{ + node.getOutputs().at(model::operation::LSTMNode::Output::SCRATCH_BUFFER)}; + const auto output_state_out_index{ + node.getOutputs().at(model::operation::LSTMNode::Output::OUTPUT_STATE_OUT)}; + const auto cell_state_out_index{ + node.getOutputs().at(model::operation::LSTMNode::Output::CELL_STATE_OUT)}; + const auto output_index{node.getOutputs().at(model::operation::LSTMNode::Output::OUTPUT)}; + + const auto input_index{node.getInputs().at(model::operation::LSTMNode::Input::INPUT)}; + const auto input_to_input_weights_index{ + node.getInputs().at(model::operation::LSTMNode::Input::INPUT_TO_INPUT_WEIGHTS)}; // optional + const auto input_to_forget_weights_index{ + node.getInputs().at(model::operation::LSTMNode::Input::INPUT_TO_FORGET_WEIGHTS)}; + const auto input_to_cell_weights_index{ + node.getInputs().at(model::operation::LSTMNode::Input::INPUT_TO_CELL_WEIGHTS)}; + const auto input_to_output_weights_index{ + node.getInputs().at(model::operation::LSTMNode::Input::INPUT_TO_OUTPUT_WEIGHTS)}; + const auto recurrent_to_input_weights_index{node.getInputs().at( + model::operation::LSTMNode::Input::RECURRENT_TO_INPUT_WEIGHTS)}; // optional + const auto recurrent_to_forget_weights_index{ + node.getInputs().at(model::operation::LSTMNode::Input::RECURRENT_TO_FORGET_WEIGHTS)}; + const auto recurrent_to_cell_weights_index{ + node.getInputs().at(model::operation::LSTMNode::Input::RECURRENT_TO_CELL_WEIGHTS)}; + const auto recurrent_to_output_weights_index{ + node.getInputs().at(model::operation::LSTMNode::Input::RECURRENT_TO_OUTPUT_WEIGHTS)}; + const auto cell_to_input_weights_index{ + 
node.getInputs().at(model::operation::LSTMNode::Input::CELL_TO_INPUT_WEIGHTS)}; // optional + const auto cell_to_forget_weights_index{ + node.getInputs().at(model::operation::LSTMNode::Input::CELL_TO_FORGET_WEIGHTS)}; // optional + const auto cell_to_output_weights_index{ + node.getInputs().at(model::operation::LSTMNode::Input::CELL_TO_OUTPUT_WEIGHTS)}; // optional + const auto input_gate_bias_index{ + node.getInputs().at(model::operation::LSTMNode::Input::INPUT_GATE_BIAS)}; + const auto forget_gate_bias_index{ + node.getInputs().at(model::operation::LSTMNode::Input::FORGET_GATE_BIAS)}; + const auto cell_bias_index{node.getInputs().at(model::operation::LSTMNode::Input::CELL_BIAS)}; + const auto output_gate_bias_index{ + node.getInputs().at(model::operation::LSTMNode::Input::OUTPUT_GATE_BIAS)}; + const auto projection_weights_index{ + node.getInputs().at(model::operation::LSTMNode::Input::PROJECTION_WEIGHTS)}; // optional + const auto projection_bias_index{ + node.getInputs().at(model::operation::LSTMNode::Input::PROJECTION_BIAS)}; // optional + const auto output_state_in_index{ + node.getInputs().at(model::operation::LSTMNode::Input::OUTPUT_STATE_IN)}; + const auto cell_state_in_index{ + node.getInputs().at(model::operation::LSTMNode::Input::CELL_STATE_IN)}; + const auto cell_threshold = node.param().cell_threshold; + const auto projection_threshold = node.param().projection_threshold; + + bool has_input_to_input_weights = _ctx.at(input_to_input_weights_index).shape().dim(0) != 0 && + _ctx.at(input_to_input_weights_index).shape().dim(1) != 0; + bool has_recurrent_to_input_weights = + _ctx.at(recurrent_to_input_weights_index).shape().dim(0) != 0 && + _ctx.at(recurrent_to_input_weights_index).shape().dim(1) != 0; + bool has_cell_to_forget_weights = _ctx.at(cell_to_forget_weights_index).shape().dim(0) != 0; + bool has_cell_to_output_weights = _ctx.at(cell_to_output_weights_index).shape().dim(0) != 0; + bool has_projection_weights = 
_ctx.at(projection_weights_index).shape().dim(0) != 0 && + _ctx.at(projection_weights_index).shape().dim(1) != 0; + bool has_projection_bias = _ctx.at(projection_bias_index).shape().dim(0); + + // NOTE The input_to_input_weights and the recurrent_to_input_weights do not exist in CIFG. + // true: no CIFG + // false: CIFG + // NOTE The cell_to_input_weights does not exist in non-peephole although regular LSTM(non-CIFG). + bool has_cifg_param = has_input_to_input_weights && has_recurrent_to_input_weights; + + // NOTE The cell_to_forget_weights and the cell_to_output_weights exist in peephole. + // But the cell_to_input_weights does not exist in regular CIFG although peephole. + // true: peephole + // false: no peephole + bool has_peephole_param = has_cell_to_forget_weights && has_cell_to_output_weights; + + // NOTE Although the projection weights has data the projection bias may not have data. + bool has_projection_param = has_projection_weights; + + const auto activation = node.param().activation; + const auto cell_clip = cell_threshold; + const auto projection_clip = projection_threshold; + assert(cell_clip >= 0.f && projection_clip >= 0.f); + + auto scratch_buffer_alloc = _tensor_builder->at(scratch_buffer_index).get(); + auto output_state_out_alloc = _tensor_builder->at(output_state_out_index).get(); + auto cell_state_out_alloc = _tensor_builder->at(cell_state_out_index).get(); + auto output_alloc = _tensor_builder->at(output_index).get(); + + auto input_alloc = _tensor_builder->at(input_index).get(); + + auto input_to_forget_weights_alloc = _tensor_builder->at(input_to_forget_weights_index).get(); + auto input_to_cell_weights_alloc = _tensor_builder->at(input_to_cell_weights_index).get(); + auto input_to_output_weights_alloc = _tensor_builder->at(input_to_output_weights_index).get(); + auto recurrent_to_forget_weights_alloc = + _tensor_builder->at(recurrent_to_forget_weights_index).get(); + auto recurrent_to_cell_weights_alloc = 
_tensor_builder->at(recurrent_to_cell_weights_index).get(); + auto recurrent_to_output_weights_alloc = + _tensor_builder->at(recurrent_to_output_weights_index).get(); + + auto forget_gate_bias_alloc = _tensor_builder->at(forget_gate_bias_index).get(); + auto cell_bias_alloc = _tensor_builder->at(cell_bias_index).get(); + auto output_gate_bias_alloc = _tensor_builder->at(output_gate_bias_index).get(); + auto output_state_in_alloc = _tensor_builder->at(output_state_in_index).get(); + auto cell_state_in_alloc = _tensor_builder->at(cell_state_in_index).get(); + + auto act_info = ::neurun::backend::acl_common::asActivationLayerInfo(activation); + + auto fn = nnfw::cpp14::make_unique<::arm_compute::CLLSTMLayer>(); + + ::arm_compute::LSTMParams<::arm_compute::ICLTensor> lstm_params{}; + if (has_cifg_param) + { + auto input_to_input_weights_alloc = + _tensor_builder->at(input_to_input_weights_index).get(); // optional + auto recurrent_to_input_weights_alloc = + _tensor_builder->at(recurrent_to_input_weights_index).get(); // optional + auto cell_to_input_weights_handle = + has_peephole_param ? 
_tensor_builder->at(cell_to_input_weights_index).get()->handle() + : nullptr; // optional (non-cifg && peephole) + auto input_gate_bias_alloc = _tensor_builder->at(input_gate_bias_index).get(); // optional + lstm_params.set_cifg_params(input_to_input_weights_alloc->handle(), + recurrent_to_input_weights_alloc->handle(), + cell_to_input_weights_handle, input_gate_bias_alloc->handle()); + } + if (has_peephole_param) + { + auto cell_to_forget_weights_alloc = + _tensor_builder->at(cell_to_forget_weights_index).get(); // optional + auto cell_to_output_weights_alloc = + _tensor_builder->at(cell_to_output_weights_index).get(); // optional + lstm_params.set_peephole_params(cell_to_forget_weights_alloc->handle(), + cell_to_output_weights_alloc->handle()); + } + if (has_projection_param) + { + auto projection_weights_alloc = _tensor_builder->at(projection_weights_index).get(); // optional + auto projection_bias_handle = has_projection_bias + ? _tensor_builder->at(projection_bias_index).get()->handle() + : nullptr; // optional + lstm_params.set_projection_params(projection_weights_alloc->handle(), projection_bias_handle); + } + + fn->configure( + input_alloc->handle(), input_to_forget_weights_alloc->handle(), + input_to_cell_weights_alloc->handle(), input_to_output_weights_alloc->handle(), + recurrent_to_forget_weights_alloc->handle(), recurrent_to_cell_weights_alloc->handle(), + recurrent_to_output_weights_alloc->handle(), forget_gate_bias_alloc->handle(), + cell_bias_alloc->handle(), output_gate_bias_alloc->handle(), output_state_in_alloc->handle(), + cell_state_in_alloc->handle(), scratch_buffer_alloc->handle(), + output_state_out_alloc->handle(), cell_state_out_alloc->handle(), output_alloc->handle(), + lstm_params, act_info, cell_clip, projection_clip); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); +} + +void KernelGenerator::visit(const model::operation::ReduceMaxNode &node) +{ + const auto 
  output_index{node.getOutputs().at(0)};
  const auto input_index{node.getInputs().at(model::operation::ReduceMaxNode::Input::INPUT)};
  const auto axis_index{node.param().axis_index};

  auto input_shape = _ctx.at(input_index).shape();
  auto axis_shape = _ctx.at(axis_index).shape();

  auto ofm_alloc = _tensor_builder->at(output_index).get();
  auto ifm_alloc = _tensor_builder->at(input_index).get();
  std::set<uint32_t> axes;
  const auto frontend_layout = _current_subg_layout;
  const auto backend_layout = ifm_alloc->layout();
  {
    // The axis operand may be either a scalar or a vector of constants;
    // anything else is unsupported.
    const auto ifm_rank = input_shape.rank();
    switch (axis_shape.rank())
    {
      case 0: // scalar
      {
        int32_t axis_value = _ctx.at(axis_index).asScalar<int32_t>();
        // Negative axes count from the back.
        if (axis_value < 0)
        {
          axis_value += ifm_rank;
        }
        axes.insert(::neurun::backend::acl_common::ToARMComputeAxis(ifm_rank, axis_value,
                                                                    frontend_layout, backend_layout)
                        .value());
        break;
      }
      case 1: // vector
      {
        const auto axis_base = _ctx.at(axis_index).data().base();
        const int axis_size = axis_shape.num_elements();

        // If axis's data does not exist as constant values and can be gotten as input data, we have
        // to find a way to infer output shape when sinking output.
        assert(axis_base != nullptr);
        for (int32_t n = 0; n < axis_size; ++n)
        {
          int32_t axis_value = *(reinterpret_cast<const int32_t *>(axis_base) + n);
          if (axis_value < 0)
          {
            axis_value += ifm_rank;
          }
          axes.insert(::neurun::backend::acl_common::ToARMComputeAxis(
                          ifm_rank, axis_value, frontend_layout, backend_layout)
                          .value());
        }
        break;
      }
      default:
        throw std::runtime_error("Not supported");
        break;
    }
  }

  auto fn = nnfw::cpp14::make_unique<::arm_compute::CLReduceOperation>();

  fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), axes, arm_compute::ReduceOperation::MAX);

  auto acl_fn = asAclFunction(std::move(fn));

  _execution_builder->append(std::move(acl_fn));
}

// Lowers a comparison node to CLComparison; the frontend comparison type is
// cast directly to the ACL ComparisonOperation enum.
void KernelGenerator::visit(const model::operation::ComparisonNode &node)
{
  const auto output_index{node.getOutputs().at(0)};
  const auto input0_index{node.getInputs().at(model::operation::ComparisonNode::Input::INPUT0)};
  const auto input1_index{node.getInputs().at(model::operation::ComparisonNode::Input::INPUT1)};

  const auto comparison_type = node.param().comparison_type;

  auto output_alloc = _tensor_builder->at(output_index).get();
  auto input0_alloc = _tensor_builder->at(input0_index).get();
  auto input1_alloc = _tensor_builder->at(input1_index).get();

  auto fn = nnfw::cpp14::make_unique<::arm_compute::CLComparison>();

  fn->configure(input0_alloc->handle(), input1_alloc->handle(), output_alloc->handle(),
                (arm_compute::ComparisonOperation)comparison_type);

  auto acl_fn = asAclFunction(std::move(fn));

  _execution_builder->append(std::move(acl_fn));
}

// Lowers RSQRT (reciprocal square root) to CLRsqrtLayer.
void KernelGenerator::visit(const model::operation::RSQRTNode &node)
{
  const auto ofm_index{node.getOutputs().at(0)};
  const auto ifm_index{node.getInputs().at(model::operation::RSQRTNode::Input::INPUT)};

  auto ofm_alloc = _tensor_builder->at(ofm_index).get();
  auto ifm_alloc = _tensor_builder->at(ifm_index).get();

  auto fn =
nnfw::cpp14::make_unique<::arm_compute::CLRsqrtLayer>(); + + fn->configure(ifm_alloc->handle(), ofm_alloc->handle()); + + _execution_builder->append(asAclFunction(std::move(fn))); +} + +void KernelGenerator::visit(const model::operation::ReLUNode &node) +{ + const auto output_index{node.getOutputs().at(0)}; + const auto input_index{node.getInputs().at(model::operation::ReLUNode::Input::INPUT)}; + + auto output_alloc = _tensor_builder->at(output_index).get(); + auto input_alloc = _tensor_builder->at(input_index).get(); + + auto fn = nnfw::cpp14::make_unique<arm_compute::CLActivationLayer>(); + + const ::arm_compute::ActivationLayerInfo act_info{ + ::arm_compute::ActivationLayerInfo::ActivationFunction::RELU}; + + fn->configure(input_alloc->handle(), output_alloc->handle(), act_info); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); +} + +void KernelGenerator::visit(const model::operation::ResizeBilinearNode &node) +{ + const auto ofm_index{node.getOutputs().at(0)}; + + const auto ifm_index{node.getInputs().at(model::operation::ResizeBilinearNode::Input::INPUT)}; + const auto height_index{node.param().height_index}; + const auto width_index{node.param().width_index}; + (void)height_index; + (void)width_index; + + auto ofm_alloc = _tensor_builder->at(ofm_index).get(); + auto ifm_alloc = _tensor_builder->at(ifm_index).get(); + + auto fn = nnfw::cpp14::make_unique<::arm_compute::CLScale>(); + + fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), + ::arm_compute::InterpolationPolicy::BILINEAR, ::arm_compute::BorderMode::REPLICATE, + ::arm_compute::PixelValue(0.f), ::arm_compute::SamplingPolicy::TOP_LEFT); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); +} + +void KernelGenerator::visit(const model::operation::ReLU1Node &node) +{ + const auto ofm_index{node.getOutputs().at(0)}; + const auto ifm_index{node.getInputs().at(model::operation::ReLU1Node::Input::INPUT)}; + + 
auto ofm_alloc = _tensor_builder->at(ofm_index).get(); + auto ifm_alloc = _tensor_builder->at(ifm_index).get(); + + const ::arm_compute::ActivationLayerInfo act_info{ + ::arm_compute::ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 1.0f, -1.0f}; + + auto fn = nnfw::cpp14::make_unique<::arm_compute::CLActivationLayer>(); + + fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), act_info); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); +} + +void KernelGenerator::visit(const model::operation::ReLU6Node &node) +{ + const auto ofm_index{node.getOutputs().at(0)}; + const auto ifm_index{node.getInputs().at(model::operation::ReLU6Node::Input::INPUT)}; + + auto ofm_alloc = _tensor_builder->at(ofm_index).get(); + auto ifm_alloc = _tensor_builder->at(ifm_index).get(); + + const ::arm_compute::ActivationLayerInfo act_info{ + ::arm_compute::ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 6.0f}; + + auto fn = nnfw::cpp14::make_unique<::arm_compute::CLActivationLayer>(); + + fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), act_info); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); +} + +void KernelGenerator::visit(const model::operation::RNNNode &node) +{ + const auto output_index{node.getOutputs().at(model::operation::RNNNode::Output::OUTPUT)}; + const auto hidden_state_out_index{ + node.getOutputs().at(model::operation::RNNNode::Output::HIDDEN_STATE_OUT)}; + + const auto input_index{node.getInputs().at(model::operation::RNNNode::Input::INPUT)}; + const auto weights_index{node.getInputs().at(model::operation::RNNNode::Input::WEIGHTS)}; + const auto recurrent_weights_index{ + node.getInputs().at(model::operation::RNNNode::Input::RECURRENT_WEIGHTS)}; + const auto bias_index{node.getInputs().at(model::operation::RNNNode::Input::BIAS)}; + const auto hidden_state_in_index{ + node.getInputs().at(model::operation::RNNNode::Input::HIDDEN_STATE_IN)}; + + const 
auto activation = node.param().activation; + + auto output_alloc = _tensor_builder->at(output_index).get(); + auto hidden_state_out_alloc = _tensor_builder->at(hidden_state_out_index).get(); + + auto input_alloc = _tensor_builder->at(input_index).get(); + auto weights_alloc = _tensor_builder->at(weights_index).get(); + auto recurrent_weights_alloc = _tensor_builder->at(recurrent_weights_index).get(); + auto bias_alloc = _tensor_builder->at(bias_index).get(); + auto hidden_state_in_alloc = _tensor_builder->at(hidden_state_in_index).get(); + auto act_info = ::neurun::backend::acl_common::asActivationLayerInfo(activation); + + auto copy_layer = nnfw::cpp14::make_unique<::arm_compute::CLCopy>(); + copy_layer->configure(hidden_state_in_alloc->handle(), hidden_state_out_alloc->handle()); + _execution_builder->append(asAclFunction(std::move(copy_layer))); + + auto fn = nnfw::cpp14::make_unique<::arm_compute::CLRNNLayerEx>( + _tensor_builder->acl_tensor_manager()->internal_buffer_manager()); + fn->configure(input_alloc->handle(), weights_alloc->handle(), recurrent_weights_alloc->handle(), + bias_alloc->handle(), hidden_state_out_alloc->handle(), output_alloc->handle(), + act_info); + _execution_builder->append(asAclFunction(std::move(fn))); +} + +void KernelGenerator::visit(const model::operation::FloorNode &node) +{ + const auto ofm_index{node.getOutputs().at(0)}; + const auto ifm_index{node.getInputs().at(model::operation::FloorNode::Input::INPUT)}; + + auto ofm_alloc = _tensor_builder->at(ofm_index).get(); + auto ifm_alloc = _tensor_builder->at(ifm_index).get(); + + auto fn = nnfw::cpp14::make_unique<::arm_compute::CLFloor>(); + + fn->configure(ifm_alloc->handle(), ofm_alloc->handle()); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); +} + +void KernelGenerator::visit(const model::operation::SpaceToDepthNode &node) +{ + const auto ofm_index{node.getOutputs().at(0)}; + const auto 
ifm_index{node.getInputs().at(model::operation::SpaceToDepthNode::Input::INPUT)}; + const auto block_size_index{node.param().block_size_index}; + + auto block_size = _ctx.at(block_size_index).asScalar<int32_t>(); + + auto ofm_alloc = _tensor_builder->at(ofm_index).get(); + auto ifm_alloc = _tensor_builder->at(ifm_index).get(); + + auto fn = nnfw::cpp14::make_unique<::arm_compute::CLSpaceToDepth>(); + + fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), block_size); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); +} + +void KernelGenerator::visit(const model::operation::L2Pool2DNode &node) +{ + const auto ofm_index{node.getOutputs().at(0)}; + const auto ifm_index{node.getInputs().at(model::operation::L2Pool2DNode::Input::INPUT)}; + + const auto ifm_shape = _ctx.at(ifm_index).shape().asFeature(_current_subg_layout); + const auto ofm_shape = _ctx.at(ofm_index).shape().asFeature(_current_subg_layout); + + uint32_t kw = node.param().kw; + uint32_t kh = node.param().kh; + const auto stride = node.param().stride; + const auto padding = + neurun::util::calculatePadding(node.param().padding, ifm_shape, ofm_shape, stride, kw, kh); + const auto activation = node.param().activation; + + auto ofm_alloc = _tensor_builder->at(ofm_index).get(); + auto ifm_alloc = _tensor_builder->at(ifm_index).get(); + + ::arm_compute::PoolingLayerInfo info{ + ::arm_compute::PoolingType::L2, ::arm_compute::Size2D{kw, kh}, + ::neurun::backend::acl_common::asPadStrideInfo(padding, stride)}; + + auto fn = nnfw::cpp14::make_unique<::arm_compute::CLPoolingLayer>(); + + fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), info); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); + + ActivationBuilder{*_execution_builder}.append(activation, ofm_alloc->handle()); +} + +void KernelGenerator::visit(const model::operation::EmbeddingLookupNode &node) +{ + const auto output_index{node.getOutputs().at(0)}; 
+ const auto lookups_index{ + node.getInputs().at(model::operation::EmbeddingLookupNode::Input::LOOKUPS)}; + const auto values_index{ + node.getInputs().at(model::operation::EmbeddingLookupNode::Input::VALUES)}; + + auto output_alloc = _tensor_builder->at(output_index).get(); + auto lookups_alloc = _tensor_builder->at(lookups_index).get(); + auto values_alloc = _tensor_builder->at(values_index).get(); + + auto fn = nnfw::cpp14::make_unique<::arm_compute::CLEmbeddingLookup>(); + + fn->configure(values_alloc->handle(), output_alloc->handle(), lookups_alloc->handle()); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); +} + +void KernelGenerator::visit(const model::operation::L2NormalizationNode &node) +{ + const auto ofm_index{node.getOutputs().at(0)}; + const auto ifm_index{node.getInputs().at(model::operation::L2NormalizationNode::Input::INPUT)}; + + // {CL|Neon}L2Normalization performs the reduction only along dimension 0 + // L2 Normalization always performs the reduction along the depth axis + // Thus, we repurpose {CL|Neon}NormalizationLayers to act as depthwise L2 normalizations by + // choosing normalization parameters as below + + const auto &ifm_shape = _ctx.at(ifm_index).shape(); + // TODO Support optional constant dimension that normalization would be performed on + const auto normalization_axis = ifm_shape.rank() - 1; + int32_t radius = + 2 * ifm_shape.dim(normalization_axis) + 1; // normSize = depth(last dimension) * 2 + 1 + float alpha = 1.0f; // In the implementation to make alpha_ become 1 + float beta = 0.5f; // pow(reduction, -0.5) = 1 / sqrt(reduction) + float bias = 0.0f; // Don't offset the reduction. 
+ + auto ofm_alloc = _tensor_builder->at(ofm_index).get(); + auto ifm_alloc = _tensor_builder->at(ifm_index).get(); + + const auto norm_info = ::arm_compute::NormalizationLayerInfo(::arm_compute::NormType::CROSS_MAP, + radius, alpha, beta, bias, false); + + auto fn = nnfw::cpp14::make_unique<::arm_compute::CLNormalizationLayer>(); + + fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), norm_info); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); +} + +void KernelGenerator::visit(const model::operation::HashtableLookupNode &node) +{ + const auto output_index{ + node.getOutputs().at(model::operation::HashtableLookupNode::Output::OUTPUT)}; + const auto hits_index{node.getOutputs().at(model::operation::HashtableLookupNode::Output::HITS)}; + + const auto lookups_index{ + node.getInputs().at(model::operation::HashtableLookupNode::Input::LOOKUPS)}; + const auto keys_index{node.getInputs().at(model::operation::HashtableLookupNode::Input::KEYS)}; + const auto values_index{ + node.getInputs().at(model::operation::HashtableLookupNode::Input::VALUES)}; + + auto output_alloc = _tensor_builder->at(output_index).get(); + auto hits_alloc = _tensor_builder->at(hits_index).get(); + + auto lookups_alloc = _tensor_builder->at(lookups_index).get(); + auto keys_alloc = _tensor_builder->at(keys_index).get(); + auto values_alloc = _tensor_builder->at(values_index).get(); + + auto fn = nnfw::cpp14::make_unique<::arm_compute::CLHashtableLookup>(); + + fn->configure(lookups_alloc->handle(), keys_alloc->handle(), values_alloc->handle(), + output_alloc->handle(), hits_alloc->handle()); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); +} + +void KernelGenerator::visit(const model::operation::PReLUNode &node) +{ + const auto ofm_index{node.getOutputs().at(0)}; + const auto ifm_index{node.getInputs().at(model::operation::PReLUNode::Input::INPUT)}; + const auto 
alpha_index{node.getInputs().at(model::operation::PReLUNode::Input::ALPHA)}; + + auto ofm_alloc = _tensor_builder->at(ofm_index).get(); + auto ifm_alloc = _tensor_builder->at(ifm_index).get(); + auto alpha_alloc = _tensor_builder->at(alpha_index).get(); + + auto fn = nnfw::cpp14::make_unique<::arm_compute::CLPReLU>(); + + fn->configure(ifm_alloc->handle(), alpha_alloc->handle(), ofm_alloc->handle()); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); +} + +void KernelGenerator::visit(const model::operation::TransposeConvNode &node) +{ + const auto ofm_index{node.getOutputs().at(0)}; + const auto output_shape_index{ + node.getInputs().at(model::operation::TransposeConvNode::Input::OUTPUT_SHAPE)}; + const auto ker_index{node.getInputs().at(model::operation::TransposeConvNode::Input::KERNEL)}; + const auto ifm_index{node.getInputs().at(model::operation::TransposeConvNode::Input::INPUT)}; + + const auto ofm_shape = _ctx.at(ofm_index).shape().asFeature(_current_subg_layout); + const auto ifm_shape = _ctx.at(ifm_index).shape().asFeature(_current_subg_layout); + const auto ker_shape = _ctx.at(ker_index).shape().asFeature(_current_subg_layout); + + const auto stride = node.param().stride; + + assert((node.param().padding.type == model::PaddingType::SAME) || + (node.param().padding.type == model::PaddingType::VALID)); + auto padding = neurun::util::calculatePadding(node.param().padding, ofm_shape, ifm_shape, stride, + ker_shape.W, ker_shape.H); + + uint32_t invalid_horizontal = 0; + uint32_t invalid_vertical = 0; + if (node.param().padding.type == model::PaddingType::VALID) + { + invalid_horizontal = + ofm_shape.W - (1 + (ifm_shape.W - 1) * stride.horizontal) - (ker_shape.W - 1); + invalid_vertical = ofm_shape.H - (1 + (ifm_shape.H - 1) * stride.vertical) - (ker_shape.H - 1); + } + + auto ofm_alloc = _tensor_builder->at(ofm_index).get(); + auto ifm_alloc = _tensor_builder->at(ifm_index).get(); + auto ker_alloc = 
_tensor_builder->at(ker_index).get(); + + const auto tconv_info = acl_common::asPadStrideInfo(padding, stride); + + auto fn = nnfw::cpp14::make_unique<::arm_compute::CLTransposeConvLayer>( + _tensor_builder->acl_tensor_manager()->internal_buffer_manager()); + + fn->configure(ifm_alloc->handle(), ker_alloc->handle(), nullptr, ofm_alloc->handle(), tconv_info, + invalid_horizontal, invalid_vertical); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); +} + +void KernelGenerator::visit(const model::operation::SQRTNode &node) +{ + const auto output_index{node.getOutputs().at(0)}; + const auto input_index{node.getInputs().at(model::operation::SQRTNode::Input::INPUT)}; + + auto output_alloc = _tensor_builder->at(output_index).get(); + auto input_alloc = _tensor_builder->at(input_index).get(); + + const ::arm_compute::ActivationLayerInfo act_info{ + ::arm_compute::ActivationLayerInfo::ActivationFunction::SQRT}; + + auto fn = nnfw::cpp14::make_unique<::arm_compute::CLActivationLayer>(); + + fn->configure(input_alloc->handle(), output_alloc->handle(), act_info); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); +} + +void KernelGenerator::visit(const model::operation::LogicalOrNode &node) +{ + const auto output_index{node.getOutputs().at(0)}; + const auto input0_index{node.getInputs().at(model::operation::LogicalOrNode::Input::INPUT0)}; + const auto input1_index{node.getInputs().at(model::operation::LogicalOrNode::Input::INPUT1)}; + + auto output_alloc = _tensor_builder->at(output_index).get(); + auto input0_alloc = _tensor_builder->at(input0_index).get(); + auto input1_alloc = _tensor_builder->at(input1_index).get(); + + auto fn = nnfw::cpp14::make_unique<::arm_compute::CLBitwiseOr>(); + + fn->configure(input0_alloc->handle(), input1_alloc->handle(), output_alloc->handle()); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); +} + 
+void KernelGenerator::visit(const model::operation::LogicalNotNode &node) +{ + const auto output_index{node.getOutputs().at(0)}; + const auto input_index{node.getInputs().at(model::operation::LogicalNotNode::Input::INPUT)}; + + auto output_alloc = _tensor_builder->at(output_index).get(); + auto input_alloc = _tensor_builder->at(input_index).get(); + + auto fn = nnfw::cpp14::make_unique<::arm_compute::CLBitwiseNot>(); + + fn->configure(input_alloc->handle(), output_alloc->handle()); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); +} + +void KernelGenerator::visit(const model::operation::SquaredDifferenceNode &node) +{ + const auto ofm_index{node.getOutputs().at(0)}; + const auto lhs_index{node.getInputs().at(model::operation::SquaredDifferenceNode::Input::LHS)}; + const auto rhs_index{node.getInputs().at(model::operation::SquaredDifferenceNode::Input::RHS)}; + + auto ofm_alloc = _tensor_builder->at(ofm_index).get(); + auto lhs_alloc = _tensor_builder->at(lhs_index).get(); + auto rhs_alloc = _tensor_builder->at(rhs_index).get(); + + auto fn = nnfw::cpp14::make_unique<::arm_compute::CLElementwiseSquaredDiff>(); + + fn->configure(lhs_alloc->handle(), rhs_alloc->handle(), ofm_alloc->handle()); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); +} + +void KernelGenerator::visit(const model::operation::TopKV2Node &node) +{ + const auto outputValues_index{ + node.getOutputs().at(model::operation::TopKV2Node::Output::OUTPUT_VALUES)}; + const auto outputIndices_index{ + node.getOutputs().at(model::operation::TopKV2Node::Output::OUTPUT_INDICES)}; + + const auto inputData_index{node.getInputs().at(model::operation::TopKV2Node::Input::INPUT)}; + const auto k_index{node.param().k_index}; + + // Currently, we only support the vector input. 
+ assert(_ctx.at(inputData_index).shape().rank() == 1 || + _ctx.at(inputData_index).shape().rank() == 2); + + const auto k = _ctx.at(k_index).asScalar<int32_t>(); + + auto values_alloc = _tensor_builder->at(outputValues_index).get(); + auto indices_alloc = _tensor_builder->at(outputIndices_index).get(); + auto input_alloc = _tensor_builder->at(inputData_index).get(); + + auto fn = nnfw::cpp14::make_unique<::arm_compute::CLTopKV2>(); + + fn->configure(input_alloc->handle(), k, values_alloc->handle(), indices_alloc->handle()); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); +} + +void KernelGenerator::visit(const model::operation::GatherNode &node) +{ + const auto ofm_index{node.getOutputs().at(0)}; + + const auto ifm_index{node.getInputs().at(model::operation::GatherNode::Input::INPUT)}; + const auto indices_index{node.getInputs().at(model::operation::GatherNode::Input::INDICES)}; + + const auto axis_index{node.param().axis_index}; + + const auto ifm_shape = _ctx.at(ifm_index).shape(); + + const auto axis_value = static_cast<int>(_ctx.at(axis_index).asScalar<int32_t>()); + // Converting in reverse order + const int axis = + ::neurun::backend::acl_common::ToARMComputeAxis(ifm_shape.rank(), axis_value).value(); + + auto ofm_alloc = _tensor_builder->at(ofm_index).get(); + auto ifm_alloc = _tensor_builder->at(ifm_index).get(); + auto indices_alloc = _tensor_builder->at(indices_index).get(); + auto acl_layout = ofm_alloc->handle()->info()->data_layout(); + UNUSED_RELEASE(acl_layout); + + // NOTE The frontend layout and backend layout must be the same for this operation. + // If not the same, we have to add a stage(?) to perform permutation of output tensor. It + // is not efficient even if it works well. If so, it would be better to set the + // layout of these backend tensors to the same layout. + // There is also one thing we have to think about. This operation depends on the layout of + // a model. 
For example, if a model in NHWC has this operation as output rank == 4, indices + // rank == 2 and axis == 2, this operation should work as the axis W and C, but the axis W + // and C are not sequential in NCHW. So the backend in NCHW cannot handle this case. + // TODO Remove this workaround + // It is a workaround how to set the layout of these backend tensors to the layout of the + // frontend when creating them + // TODO Supports front-end in NCHW + // TODO Change the layout of frontend and backend to be the same + // assert(::arm_compute::DataLayout::NHWC == acl_layout); + assert(acl_layout == ifm_alloc->handle()->info()->data_layout()); + assert(acl_layout == indices_alloc->handle()->info()->data_layout()); + + // TODO Change to CLGather + auto fn = nnfw::cpp14::make_unique<::arm_compute::misc::GenericGather>(); + + fn->configure(ifm_alloc->handle(), indices_alloc->handle(), ofm_alloc->handle(), axis); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); +} + +void KernelGenerator::visit(const model::operation::NegNode &node) +{ + const auto ofm_index{node.getOutputs().at(0)}; + const auto ifm_index{node.getInputs().at(model::operation::NegNode::Input::INPUT)}; + + auto ofm_alloc = _tensor_builder->at(ofm_index).get(); + auto ifm_alloc = _tensor_builder->at(ifm_index).get(); + + auto fn = nnfw::cpp14::make_unique<::arm_compute::CLNeg>(); + + fn->configure(ifm_alloc->handle(), ofm_alloc->handle()); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); +} + +void KernelGenerator::visit(const model::operation::AbsNode &node) +{ + const auto output_index{node.getOutputs().at(0)}; + const auto input_index{node.getInputs().at(model::operation::AbsNode::Input::INPUT)}; + + auto output_alloc = _tensor_builder->at(output_index).get(); + auto input_alloc = _tensor_builder->at(input_index).get(); + + const ::arm_compute::ActivationLayerInfo act_info{ + 
::arm_compute::ActivationLayerInfo::ActivationFunction::ABS}; + + auto fn = nnfw::cpp14::make_unique<::arm_compute::CLActivationLayer>(); + + fn->configure(input_alloc->handle(), output_alloc->handle(), act_info); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); +} + +void KernelGenerator::visit(const model::operation::ArgMaxNode &node) +{ + const auto ofm_index{node.getOutputs().at(0)}; + const auto ifm_index{node.getInputs().at(model::operation::ArgMaxNode::Input::INPUT)}; + const auto axis_index{node.param().axis_index}; + + auto ifm_shape = _ctx.at(ifm_index).shape(); + auto ofm_shape = _ctx.at(ofm_index).shape(); + auto axis_shape = _ctx.at(axis_index).shape(); + + assert(_ctx.at(axis_index).isConstant()); + // Axis dimension is always 1. + assert(axis_shape.rank() == 1); + assert((ifm_shape.rank() - 1) == ofm_shape.rank()); + + const int axis_size = axis_shape.num_elements(); + auto axis_base = _ctx.at(axis_index).data().base(); + // TODO Should support axis size > 1. + assert(axis_size == 1); + // axis is tensor with 1 dimension - always a vector. 
+ assert(axis_base != nullptr); + + auto ofm_alloc = _tensor_builder->at(ofm_index).get(); + auto ifm_alloc = _tensor_builder->at(ifm_index).get(); + const auto ifm_rank = ifm_shape.rank(); + auto frontend_layout = _current_subg_layout; + auto backend_layout = ifm_alloc->layout(); + std::set<uint32_t> axes; + for (int32_t n = 0; n < axis_size; ++n) + { + int32_t axis_value = *(reinterpret_cast<const int32_t *>(axis_base) + n); + if (axis_value < 0) + { + axis_value += ifm_rank; + } + axes.insert(acl_common::ToARMComputeAxis(ifm_rank, axis_value, frontend_layout, backend_layout) + .value()); + } + std::vector<uint32_t> fixed_axes(axes.begin(), axes.end()); + + auto fn = nnfw::cpp14::make_unique<::arm_compute::CLArgOperation>(); + + fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), fixed_axes, + ::arm_compute::ArgOperation::MAX); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); +} + +void KernelGenerator::visit(const model::operation::DequantizeNode &node) +{ + const auto output_index{node.getOutputs().at(0)}; + const auto input_index{node.getInputs().at(model::operation::DequantizeNode::Input::INPUT)}; + + auto output_alloc = _tensor_builder->at(output_index).get(); + auto input_alloc = _tensor_builder->at(input_index).get(); + + auto fn = nnfw::cpp14::make_unique<::arm_compute::CLCast>(); + + fn->configure(input_alloc->handle(), output_alloc->handle()); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); +} + +void KernelGenerator::visit(const model::operation::MeanNode &node) +{ + const auto ofm_index{node.getOutputs().at(0)}; + const auto ifm_index{node.getInputs().at(model::operation::MeanNode::Input::INPUT)}; + + const auto axis_index{node.param().axis_index}; + const auto keep_dims{node.param().keep_dims}; + (void)keep_dims; + + const auto ifm_shape = _ctx.at(ifm_index).shape(); + + auto ofm_alloc = _tensor_builder->at(ofm_index).get(); + auto ifm_alloc = 
_tensor_builder->at(ifm_index).get(); + std::set<uint32_t> axes; + { + const auto ifm_rank = ifm_shape.rank(); + const auto frontend_layout = _current_subg_layout; + const auto backend_layout = ifm_alloc->layout(); + const auto axis_shape = _ctx.at(axis_index).shape(); + switch (axis_shape.rank()) + { + case 0: // scalar + { + auto axis_value = _ctx.at(axis_index).asScalar<int32_t>(); + if (axis_value < 0) + { + axis_value += ifm_rank; + } + axes.insert(::neurun::backend::acl_common::ToARMComputeAxis(ifm_rank, axis_value, + frontend_layout, backend_layout) + .value()); + break; + } + case 1: // vector + { + const auto axis_base = _ctx.at(axis_index).data().base(); + const int axis_size = axis_shape.num_elements(); + + // If axis's data does not exist as constant values and can be gotten as input data, we have + // to find a way to infer output shape when sinking output. + assert(axis_base != nullptr); + for (int32_t n = 0; n < axis_size; ++n) + { + int32_t axis_value = *(reinterpret_cast<const int32_t *>(axis_base) + n); + if (axis_value < 0) + { + axis_value += ifm_rank; + } + axes.insert(::neurun::backend::acl_common::ToARMComputeAxis( + ifm_rank, axis_value, frontend_layout, backend_layout) + .value()); + } + break; + } + default: + throw std::runtime_error("Not supported"); + } + } + + // NOTE CLReduceMean has a bug that does not support NHWC layout + // CLReduceMean intermediate tensors are always NCHW layout + auto fn = nnfw::cpp14::make_unique<::arm_compute::CLReduceOperation>(); + + fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), axes, + ::arm_compute::ReduceOperation::MEAN); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); +} + +void KernelGenerator::visit(const model::operation::LocalResponseNormalizationNode &node) +{ + const auto ofm_index{node.getOutputs().at(0)}; + const auto ifm_index{ + node.getInputs().at(model::operation::LocalResponseNormalizationNode::Input::INPUT)}; + const auto 
radius_index{node.param().radius_index}; + const auto bias_index{node.param().bias_index}; + const auto alpha_index{node.param().alpha_index}; + const auto beta_index{node.param().beta_index}; + + auto radius = _ctx.at(radius_index).asScalar<int32_t>(); + auto alpha = _ctx.at(alpha_index).asScalar<float>(); + auto beta = _ctx.at(beta_index).asScalar<float>(); + auto bias = _ctx.at(bias_index).asScalar<float>(); + + auto ofm_alloc = _tensor_builder->at(ofm_index).get(); + auto ifm_alloc = _tensor_builder->at(ifm_index).get(); + + const auto norm_info = ::arm_compute::NormalizationLayerInfo( + ::arm_compute::NormType::CROSS_MAP, radius * 2 + 1, alpha, beta, bias, false); + + auto fn = nnfw::cpp14::make_unique<::arm_compute::CLNormalizationLayer>(); + + fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), norm_info); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); +} + +void KernelGenerator::visit(const model::operation::DepthToSpaceNode &node) +{ + const auto output_index{node.getOutputs().at(0)}; + const auto input_index{node.getInputs().at(model::operation::DepthToSpaceNode::Input::INPUT)}; + const auto block_size_index{node.param().block_size_index}; + + auto block_size = _ctx.at(block_size_index).asScalar<int32_t>(); + assert(block_size > 0); + + auto output_alloc = _tensor_builder->at(output_index).get(); + auto input_alloc = _tensor_builder->at(input_index).get(); + + auto fn = nnfw::cpp14::make_unique<::arm_compute::CLDepthToSpace>(); + + fn->configure(input_alloc->handle(), output_alloc->handle(), block_size); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); +} + +void KernelGenerator::visit(const model::operation::ReduceMinNode &node) +{ + const auto ofm_index{node.getOutputs().at(0)}; + const auto ifm_index{node.getInputs().at(model::operation::ReduceMinNode::Input::INPUT)}; + const auto axis_index{node.param().axis_index}; + + auto ifm_shape = 
_ctx.at(ifm_index).shape(); + auto ofm_shape = _ctx.at(ofm_index).shape(); + auto axis_shape = _ctx.at(axis_index).shape(); + + const auto ifm_rank = ifm_shape.rank(); + auto ofm_alloc = _tensor_builder->at(ofm_index).get(); + auto ifm_alloc = _tensor_builder->at(ifm_index).get(); + std::set<uint32_t> axes; + { + const auto frontend_layout = _current_subg_layout; + const auto backend_layout = ifm_alloc->layout(); + switch (axis_shape.rank()) + { + case 0: // scalar + { + auto axis_value = _ctx.at(axis_index).asScalar<int32_t>(); + if (axis_value < 0) + { + axis_value += ifm_rank; + } + axes.insert(::neurun::backend::acl_common::ToARMComputeAxis(ifm_rank, axis_value, + frontend_layout, backend_layout) + .value()); + break; + } + case 1: // vector + { + const auto axis_base = _ctx.at(axis_index).data().base(); + const int axis_size = axis_shape.num_elements(); + + // If axis's data does not exist as constant values and can be gotten as input data, we have + // to find a way to infer output shape when sinking output. 
+ assert(axis_base != nullptr); + for (int32_t n = 0; n < axis_size; ++n) + { + int32_t axis_value = *(reinterpret_cast<const int32_t *>(axis_base) + n); + if (axis_value < 0) + { + axis_value += ifm_rank; + } + axes.insert(::neurun::backend::acl_common::ToARMComputeAxis( + ifm_rank, axis_value, frontend_layout, backend_layout) + .value()); + } + break; + } + default: + throw std::runtime_error("Not supported"); + break; + } + } + + auto fn = nnfw::cpp14::make_unique<::arm_compute::CLReduceOperation>(); + + fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), axes, + ::arm_compute::ReduceOperation::MIN); + + auto acl_fn = asAclFunction(std::move(fn)); + + _execution_builder->append(std::move(acl_fn)); +} + +void KernelGenerator::visit(const model::operation::SplitNode &node) +{ + const auto ifm_index{node.getInputs().at(model::operation::SplitNode::Input::INPUT)}; + const auto axis_index{node.param().axis_index}; + const auto num_of_splits_index{node.param().num_of_splits_index}; + + assert(_ctx.at(num_of_splits_index).asScalar<unsigned int>() == node.getOutputs().size()); + + const auto ifm_rank = _ctx.at(ifm_index).shape().rank(); + std::vector<model::OperandIndex> output_indexes; + for (const auto &output : node.getOutputs()) + output_indexes.emplace_back(output); + + auto ifm_alloc = _tensor_builder->at(ifm_index).get(); + std::vector<arm_compute::ICLTensor *> output_allocs; + for (const auto &ofm_ind : output_indexes) + output_allocs.emplace_back(_tensor_builder->at(ofm_ind).get()->handle()); + + const auto frontend_layout = _current_subg_layout; + const auto backend_layout = ifm_alloc->layout(); + auto axis = _ctx.at(axis_index).asScalar<int32_t>(); + if (axis < 0) + axis += ifm_rank; + axis = acl_common::ToARMComputeAxis(ifm_rank, axis, frontend_layout, backend_layout).value(); + + auto fn = nnfw::cpp14::make_unique<::arm_compute::CLSplit>(); + + fn->configure(ifm_alloc->handle(), output_allocs, axis); + + auto acl_fn = asAclFunction(std::move(fn)); + + 
_execution_builder->append(std::move(acl_fn)); +} + +void KernelGenerator::visit(const model::operation::UnpackNode &node) +{ + const auto input_index{node.getInputs().at(model::operation::UnpackNode::Input::INPUT)}; + auto axis{node.param().axis}; + + const auto input_rank = _ctx.at(input_index).shape().rank(); + + std::vector<model::OperandIndex> output_indexes; + for (const auto &output_index : node.getOutputs()) + output_indexes.emplace_back(output_index); + + auto input = _tensor_builder->at(input_index).get()->handle(); + std::vector<arm_compute::ICLTensor *> outputs; + for (const auto &output_index : output_indexes) + outputs.emplace_back(_tensor_builder->at(output_index)->handle()); + + const auto frontend_layout = _current_subg_layout; + const auto backend_layout = _tensor_builder->at(input_index).get()->layout(); + if (axis < 0) + axis += input_rank; + axis = acl_common::ToARMComputeAxis(input_rank, axis, frontend_layout, backend_layout).value(); + + auto fn = nnfw::cpp14::make_unique<::arm_compute::CLUnstack>(); + + fn->configure(input, outputs, axis); + + _execution_builder->append(asAclFunction(std::move(fn))); +} + +void KernelGenerator::visit(const model::operation::PadNode &node) +{ + const auto input_index{node.getInputs().at(model::operation::PadNode::Input::INPUT)}; + const auto pad_index{node.getInputs().at(model::operation::PadNode::Input::PAD)}; + const auto output_index{node.getOutputs().at(0)}; + assert(_ctx.at(pad_index).isConstant()); + + auto rank = _ctx.at(pad_index).shape().dim(0); + auto pad_base = _ctx.at(pad_index).data().base(); + + auto input_type = _ctx.at(input_index).typeInfo(); + auto data_type = acl_common::asDataType(input_type.type()); + auto quant_info = ::arm_compute::QuantizationInfo(input_type.scale(), input_type.offset()); + const auto pixel_value = ::arm_compute::PixelValue(0, data_type, quant_info); + + auto input = _tensor_builder->at(input_index).get()->handle(); + auto output = 
_tensor_builder->at(output_index).get()->handle(); + + ::arm_compute::PaddingList padding_list; + padding_list.resize(rank); + for (int32_t n = 0; n < rank; ++n) + { + const int32_t *from = reinterpret_cast<const int32_t *>(pad_base) + (n * 2); + + const auto frontend_layout = _current_subg_layout; + const auto backend_layout = _tensor_builder->at(input_index).get()->layout(); + const auto axis = + acl_common::ToARMComputeAxis(rank, n, frontend_layout, backend_layout).value(); + padding_list[axis] = ::arm_compute::PaddingInfo{from[0], from[1]}; + } + auto fn = nnfw::cpp14::make_unique<::arm_compute::CLPadLayer>(); + fn->configure(input, output, padding_list, pixel_value); + + _execution_builder->append(asAclFunction(std::move(fn))); +} + +} // namespace acl_cl +} // namespace backend +} // namespace neurun diff --git a/runtimes/neurun/backend/acl_cl/KernelGenerator.h b/runtimes/neurun/backend/acl_cl/KernelGenerator.h new file mode 100644 index 000000000..db9bf4199 --- /dev/null +++ b/runtimes/neurun/backend/acl_cl/KernelGenerator.h @@ -0,0 +1,105 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef __NEURUN_BACKEND_ACL_CL_KERNEL_GENERATOR_H__ +#define __NEURUN_BACKEND_ACL_CL_KERNEL_GENERATOR_H__ + +#include <backend/IKernelGenerator.h> + +#include "model/Operands.h" +#include "TensorBuilder.h" + +namespace neurun +{ +namespace backend +{ +namespace acl_cl +{ + +class KernelGenerator : public IKernelGenerator +{ +public: + KernelGenerator(const neurun::model::Operands &ctx, + const std::shared_ptr<TensorBuilder> &tensor_builder); + + void visit(const model::Subgraph &) override; + void visit(const model::operation::Conv2DNode &) override; + void visit(const model::operation::DepthwiseConv2DNode &) override; + void visit(const model::operation::MaxPool2DNode &) override; + void visit(const model::operation::AvgPool2DNode &) override; + void visit(const model::operation::ConcatNode &) override; + void visit(const model::operation::FullyConnectedNode &) override; + void visit(const model::operation::MulNode &) override; + void visit(const model::operation::ReduceSumNode &) override; + void visit(const model::operation::ReshapeNode &) override; + void visit(const model::operation::SqueezeNode &) override; + void visit(const model::operation::TanhNode &) override; + void visit(const model::operation::SoftmaxNode &) override; + void visit(const model::operation::StridedSliceNode &) override; + void visit(const model::operation::TransposeNode &) override; + void visit(const model::operation::AddNode &) override; + void visit(const model::operation::SubNode &) override; + void visit(const model::operation::CastNode &) override; + void visit(const model::operation::DivNode &) override; + void visit(const model::operation::ExpNode &) override; + void visit(const model::operation::LogisticNode &) override; + void visit(const model::operation::ReduceMaxNode &) override; + void visit(const model::operation::ComparisonNode &) override; + void visit(const model::operation::LogicalAndNode &) override; + void visit(const model::operation::LSTMNode &) override; 
+ void visit(const model::operation::RSQRTNode &) override; + void visit(const model::operation::ReLUNode &) override; + void visit(const model::operation::ResizeBilinearNode &) override; + void visit(const model::operation::ReLU1Node &) override; + void visit(const model::operation::ReLU6Node &) override; + void visit(const model::operation::RNNNode &) override; + void visit(const model::operation::FloorNode &) override; + void visit(const model::operation::SpaceToDepthNode &) override; + void visit(const model::operation::L2Pool2DNode &) override; + void visit(const model::operation::EmbeddingLookupNode &) override; + void visit(const model::operation::L2NormalizationNode &) override; + void visit(const model::operation::HashtableLookupNode &) override; + void visit(const model::operation::PReLUNode &) override; + void visit(const model::operation::TransposeConvNode &) override; + void visit(const model::operation::SQRTNode &) override; + void visit(const model::operation::LogicalOrNode &) override; + void visit(const model::operation::LogicalNotNode &) override; + void visit(const model::operation::SquaredDifferenceNode &) override; + void visit(const model::operation::TopKV2Node &) override; + void visit(const model::operation::GatherNode &) override; + void visit(const model::operation::NegNode &) override; + void visit(const model::operation::AbsNode &) override; + void visit(const model::operation::ArgMaxNode &) override; + void visit(const model::operation::DequantizeNode &) override; + void visit(const model::operation::MeanNode &) override; + void visit(const model::operation::LocalResponseNormalizationNode &) override; + void visit(const model::operation::DepthToSpaceNode &) override; + void visit(const model::operation::ReduceMinNode &) override; + void visit(const model::operation::SplitNode &) override; + void visit(const model::operation::UnpackNode &) override; + void visit(const model::operation::PadNode &) override; + +private: + const 
neurun::model::Operands &_ctx; + std::shared_ptr<TensorBuilder> _tensor_builder; + model::Layout _current_subg_layout; +}; + +} // namespace acl_cl +} // namespace backend +} // namespace neurun + +#endif // __NEURUN_BACKEND_ACL_CL_KERNEL_GENERATOR_H__ diff --git a/runtimes/neurun/backend/acl_cl/PluginClassesAllocator.cc b/runtimes/neurun/backend/acl_cl/PluginClassesAllocator.cc new file mode 100644 index 000000000..ac3f0acff --- /dev/null +++ b/runtimes/neurun/backend/acl_cl/PluginClassesAllocator.cc @@ -0,0 +1,33 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include <util/logging.h> + +#include "Backend.h" + +extern "C" { +neurun::backend::Backend *neurun_backend_create() +{ + VERBOSE(neurun_backend_create) << "'acl_cl' loaded\n"; + return new neurun::backend::acl_cl::Backend; +} + +void neurun_backend_destroy(neurun::backend::Backend *backend) +{ + VERBOSE(neurun_backend_destroy) << "'acl_cl' unloaded\n"; + delete backend; +} +} diff --git a/runtimes/neurun/backend/acl_cl/ShapeFixer.cc b/runtimes/neurun/backend/acl_cl/ShapeFixer.cc new file mode 100644 index 000000000..e6744cc24 --- /dev/null +++ b/runtimes/neurun/backend/acl_cl/ShapeFixer.cc @@ -0,0 +1,361 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd.
All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "ShapeFixer.h" + +#include <arm_compute/runtime/CL/CLFunctions.h> // Include all ARM Compute CL functions +#include <arm_compute/runtime/CL/CLFunctionsEx.h> // Include all ARM Compute EX CL functions +#include <arm_compute/runtime/misc/functions/GenericGather.h> +#include <arm_compute/runtime/misc/functions/GenericReshapeLayer.h> + +#include <AclFunction.h> +#include <Convert.h> +#include <Swizzle.h> + +#include "kernel/ConcatLayer.h" +#include "model/Index.h" +#include "model/DataType.h" +#include "model/InternalType.h" +#include "compiler/IExecutionBuilder.h" +#include "exec/NopFunction.h" +#include "util/logging.h" +#include "util/Utils.h" +#include "util/Padding.h" + +using ::neurun::compiler::IExecutionBuilder; + +namespace neurun +{ +namespace backend +{ +namespace acl_cl +{ + +using ::neurun::backend::acl_common::asAclFunction; + +ShapeFixer::ShapeFixer(const neurun::model::Operands &ctx, + const std::shared_ptr<TensorBuilder> &tensor_builder) + : _ctx(ctx), _tensor_builder(tensor_builder) +{ + assert(tensor_builder); +} + +void ShapeFixer::visit(const model::operation::CastNode &) { /* DO NOTHING */} + +void ShapeFixer::visit(const model::operation::Conv2DNode &) { /* DO NOTHING */} + +void ShapeFixer::visit(const model::operation::DepthwiseConv2DNode &) { /* DO NOTHING */} + +void ShapeFixer::visit(const model::operation::MaxPool2DNode &) { /* DO NOTHING */} + 
+void ShapeFixer::visit(const model::operation::AvgPool2DNode &) { /* DO NOTHING */} + +void ShapeFixer::visit(const model::operation::ConcatNode &node) +{ + const auto ofm_index{node.getOutputs().at(0)}; + _tensor_builder->dimCorrection(ofm_index, false); + for (const auto &input : node.getInputs()) + _tensor_builder->dimCorrection(input, false); +} + +void ShapeFixer::visit(const model::operation::FullyConnectedNode &node) +{ + using model::operation::FullyConnectedNode; + const auto input_index{node.getInputs().at(FullyConnectedNode::Input::INPUT)}; + const auto input_rank = _ctx.at(input_index).shape().rank(); + if (input_rank == 4) + _tensor_builder->dimCorrection(input_index, false); +} + +void ShapeFixer::visit(const model::operation::MulNode &node) +{ + const auto lhs_index{node.getInputs().at(model::operation::MulNode::Input::LHS)}; + const auto rhs_index{node.getInputs().at(model::operation::MulNode::Input::RHS)}; + + if (!(_ctx.at(lhs_index).shape() == _ctx.at(rhs_index).shape())) + { + const auto broadcast_rank = + std::max(_ctx.at(lhs_index).shape().rank(), _ctx.at(rhs_index).shape().rank()); + + // TODO remove const_cast later. 
For example, _ctx may need to be a non const variable or + // a node to extend shape may be inserted in front of this operation + const_cast<::neurun::model::Shape &>(_ctx.at(lhs_index).shape()).extendRank(broadcast_rank); + const_cast<::neurun::model::Shape &>(_ctx.at(rhs_index).shape()).extendRank(broadcast_rank); + } +} + +void ShapeFixer::visit(const model::operation::ReduceSumNode &) { /* DO NOTHING */} + +void ShapeFixer::visit(const model::operation::ReshapeNode &node) +{ + const auto output_index{node.getOutputs().at(0)}; + const auto input_index{node.getInputs().at(model::operation::ReshapeNode::Input::INPUT)}; + _tensor_builder->dimCorrection(input_index, false); + _tensor_builder->dimCorrection(output_index, false); +} + +void ShapeFixer::visit(const model::operation::SqueezeNode &node) +{ + const auto output_index{node.getOutputs().at(0)}; + const auto input_index{node.getInputs().at(model::operation::SqueezeNode::Input::INPUT)}; + _tensor_builder->dimCorrection(input_index, false); + _tensor_builder->dimCorrection(output_index, false); +} + +void ShapeFixer::visit(const model::operation::TanhNode &) { /* DO NOTHING */} + +void ShapeFixer::visit(const model::operation::SoftmaxNode &) { /* DO NOTHING */} + +void ShapeFixer::visit(const model::operation::StridedSliceNode &) { /* DO NOTHING */} + +void ShapeFixer::visit(const model::operation::TransposeNode &) { /* DO NOTHING */} + +void ShapeFixer::visit(const model::operation::AddNode &node) +{ + const auto lhs_index{node.getInputs().at(model::operation::AddNode::Input::LHS)}; + const auto rhs_index{node.getInputs().at(model::operation::AddNode::Input::RHS)}; + + if (!(_ctx.at(lhs_index).shape() == _ctx.at(rhs_index).shape())) + { + const auto broadcast_rank = + std::max(_ctx.at(lhs_index).shape().rank(), _ctx.at(rhs_index).shape().rank()); + const_cast<::neurun::model::Shape &>(_ctx.at(lhs_index).shape()).extendRank(broadcast_rank); + const_cast<::neurun::model::Shape 
&>(_ctx.at(rhs_index).shape()).extendRank(broadcast_rank); + } +} + +void ShapeFixer::visit(const model::operation::SubNode &node) +{ + const auto lhs_index{node.getInputs().at(model::operation::SubNode::Input::LHS)}; + const auto rhs_index{node.getInputs().at(model::operation::SubNode::Input::RHS)}; + + if (!(_ctx.at(lhs_index).shape() == _ctx.at(rhs_index).shape())) + { + const auto broadcast_rank = + std::max(_ctx.at(lhs_index).shape().rank(), _ctx.at(rhs_index).shape().rank()); + + // TODO remove const_cast later. For example, _ctx may need to be a non const variable or + // a node to extend shape may be inserted in front of this operation + const_cast<::neurun::model::Shape &>(_ctx.at(lhs_index).shape()).extendRank(broadcast_rank); + const_cast<::neurun::model::Shape &>(_ctx.at(rhs_index).shape()).extendRank(broadcast_rank); + } +} + +void ShapeFixer::visit(const model::operation::DivNode &node) +{ + const auto lhs_index{node.getInputs().at(model::operation::DivNode::Input::LHS)}; + const auto rhs_index{node.getInputs().at(model::operation::DivNode::Input::RHS)}; + + if (!(_ctx.at(lhs_index).shape() == _ctx.at(rhs_index).shape())) + { + const auto broadcast_rank = + std::max(_ctx.at(lhs_index).shape().rank(), _ctx.at(rhs_index).shape().rank()); + + // TODO remove const_cast later. 
For example, _ctx may need to be a non const variable or + // a node to extend shape may be inserted in front of this operation + const_cast<::neurun::model::Shape &>(_ctx.at(lhs_index).shape()).extendRank(broadcast_rank); + const_cast<::neurun::model::Shape &>(_ctx.at(rhs_index).shape()).extendRank(broadcast_rank); + } +} + +void ShapeFixer::visit(const model::operation::ExpNode &) { /* DO NOTHING */} + +void ShapeFixer::visit(const model::operation::LogisticNode &) { /* DO NOTHING */} + +void ShapeFixer::visit(const model::operation::LogicalAndNode &node) +{ + const auto input0_index{node.getInputs().at(model::operation::LogicalAndNode::Input::INPUT0)}; + const auto input1_index{node.getInputs().at(model::operation::LogicalAndNode::Input::INPUT1)}; + + if (!(_ctx.at(input0_index).shape() == _ctx.at(input1_index).shape())) + { + const auto broadcast_rank = + std::max(_ctx.at(input0_index).shape().rank(), _ctx.at(input1_index).shape().rank()); + + // TODO remove const_cast later. For example, _ctx may need to be a non const variable or + // a node to extend shape may be inserted in front of this operation + const_cast<::neurun::model::Shape &>(_ctx.at(input0_index).shape()).extendRank(broadcast_rank); + const_cast<::neurun::model::Shape &>(_ctx.at(input1_index).shape()).extendRank(broadcast_rank); + } +} + +void ShapeFixer::visit(const model::operation::LSTMNode &) { /* DO NOTHING */} + +void ShapeFixer::visit(const model::operation::ReduceMaxNode &) { /* DO NOTHING */} + +void ShapeFixer::visit(const model::operation::ComparisonNode &node) +{ + const auto input0_index{node.getInputs().at(model::operation::ComparisonNode::Input::INPUT0)}; + const auto input1_index{node.getInputs().at(model::operation::ComparisonNode::Input::INPUT1)}; + + if (!(_ctx.at(input0_index).shape() == _ctx.at(input1_index).shape())) + { + const auto broadcast_rank = + std::max(_ctx.at(input0_index).shape().rank(), _ctx.at(input1_index).shape().rank()); + + // TODO remove const_cast later. 
For example, _ctx may need to be a non const variable or + // a node to extend shape may be inserted in front of this operation + const_cast<::neurun::model::Shape &>(_ctx.at(input0_index).shape()).extendRank(broadcast_rank); + const_cast<::neurun::model::Shape &>(_ctx.at(input1_index).shape()).extendRank(broadcast_rank); + } +} + +void ShapeFixer::visit(const model::operation::RSQRTNode &) { /* DO NOTHING */} + +void ShapeFixer::visit(const model::operation::ReLUNode &) { /* DO NOTHING */} + +void ShapeFixer::visit(const model::operation::ResizeBilinearNode &) { /* DO NOTHING */} + +void ShapeFixer::visit(const model::operation::ReLU1Node &) { /* DO NOTHING */} + +void ShapeFixer::visit(const model::operation::ReLU6Node &) { /* DO NOTHING */} + +void ShapeFixer::visit(const model::operation::RNNNode &) { /* DO NOTHING */} + +void ShapeFixer::visit(const model::operation::FloorNode &) { /* DO NOTHING */} + +void ShapeFixer::visit(const model::operation::SpaceToDepthNode &node) +{ + const auto ofm_index{node.getOutputs().at(0)}; + const auto ifm_index{node.getInputs().at(model::operation::SpaceToDepthNode::Input::INPUT)}; + _tensor_builder->dimCorrection(ofm_index, false); + _tensor_builder->dimCorrection(ifm_index, false); +} + +void ShapeFixer::visit(const model::operation::L2Pool2DNode &) { /* DO NOTHING */} + +void ShapeFixer::visit(const model::operation::EmbeddingLookupNode &node) +{ + const auto output_index{node.getOutputs().at(0)}; + const auto values_index{ + node.getInputs().at(model::operation::EmbeddingLookupNode::Input::VALUES)}; + _tensor_builder->dimCorrection(values_index, false); + _tensor_builder->dimCorrection(output_index, false); +} + +void ShapeFixer::visit(const model::operation::L2NormalizationNode &) { /* DO NOTHING */} + +void ShapeFixer::visit(const model::operation::HashtableLookupNode &) { /* DO NOTHING */} + +void ShapeFixer::visit(const model::operation::PReLUNode &node) +{ + const auto 
ifm_index{node.getInputs().at(model::operation::PReLUNode::Input::INPUT)}; + const auto alpha_index{node.getInputs().at(model::operation::PReLUNode::Input::ALPHA)}; + + if (!(_ctx.at(ifm_index).shape() == _ctx.at(alpha_index).shape())) + { + const auto broadcast_rank = + std::max(_ctx.at(ifm_index).shape().rank(), _ctx.at(alpha_index).shape().rank()); + const_cast<::neurun::model::Shape &>(_ctx.at(ifm_index).shape()).extendRank(broadcast_rank); + const_cast<::neurun::model::Shape &>(_ctx.at(alpha_index).shape()).extendRank(broadcast_rank); + } +} + +void ShapeFixer::visit(const model::operation::TransposeConvNode &) { /* DO NOTHING */} + +void ShapeFixer::visit(const model::operation::SQRTNode &) { /* DO NOTHING */} + +void ShapeFixer::visit(const model::operation::LogicalOrNode &node) +{ + const auto input0_index{node.getInputs().at(model::operation::LogicalOrNode::Input::INPUT0)}; + const auto input1_index{node.getInputs().at(model::operation::LogicalOrNode::Input::INPUT1)}; + + if (!(_ctx.at(input0_index).shape() == _ctx.at(input1_index).shape())) + { + const auto broadcast_rank = + std::max(_ctx.at(input0_index).shape().rank(), _ctx.at(input1_index).shape().rank()); + const_cast<::neurun::model::Shape &>(_ctx.at(input0_index).shape()).extendRank(broadcast_rank); + const_cast<::neurun::model::Shape &>(_ctx.at(input1_index).shape()).extendRank(broadcast_rank); + } +} + +void ShapeFixer::visit(const model::operation::LogicalNotNode &) { /* DO NOTHING */} + +void ShapeFixer::visit(const model::operation::SquaredDifferenceNode &node) +{ + const auto lhs_index{node.getInputs().at(model::operation::SquaredDifferenceNode::Input::LHS)}; + const auto rhs_index{node.getInputs().at(model::operation::SquaredDifferenceNode::Input::RHS)}; + + if (!(_ctx.at(lhs_index).shape() == _ctx.at(rhs_index).shape())) + { + const auto broadcast_rank = + std::max(_ctx.at(lhs_index).shape().rank(), _ctx.at(rhs_index).shape().rank()); + const_cast<neurun::model::Shape 
&>(_ctx.at(lhs_index).shape()).extendRank(broadcast_rank); + const_cast<neurun::model::Shape &>(_ctx.at(rhs_index).shape()).extendRank(broadcast_rank); + } +} + +void ShapeFixer::visit(const model::operation::TopKV2Node &) { /* DO NOTHING */} + +void ShapeFixer::visit(const model::operation::GatherNode &node) +{ + const auto ofm_index{node.getOutputs().at(0)}; + const auto ifm_index{node.getInputs().at(model::operation::GatherNode::Input::INPUT)}; + const auto indices_index{node.getInputs().at(model::operation::GatherNode::Input::INDICES)}; + _tensor_builder->dimCorrection(ofm_index, false); + _tensor_builder->dimCorrection(ifm_index, false); + _tensor_builder->dimCorrection(indices_index, false); +} + +void ShapeFixer::visit(const model::operation::NegNode &) { /* DO NOTHING */} + +void ShapeFixer::visit(const model::operation::AbsNode &) { /* DO NOTHING */} + +void ShapeFixer::visit(const model::operation::ArgMaxNode &node) +{ + const auto ofm_index{node.getOutputs().at(0)}; + const auto ifm_index{node.getInputs().at(model::operation::ArgMaxNode::Input::INPUT)}; + _tensor_builder->dimCorrection(ofm_index, false); + _tensor_builder->dimCorrection(ifm_index, false); +} + +void ShapeFixer::visit(const model::operation::DequantizeNode &) { /* DO NOTHING */} + +void ShapeFixer::visit(const model::operation::MeanNode &) { /* DO NOTHING */} + +void ShapeFixer::visit(const model::operation::LocalResponseNormalizationNode &) { /* DO NOTHING */} + +void ShapeFixer::visit(const model::operation::DepthToSpaceNode &) { /* DO NOTHING */} + +void ShapeFixer::visit(const model::operation::ReduceMinNode &) { /* DO NOTHING */} + +void ShapeFixer::visit(const model::operation::SplitNode &node) +{ + const auto input_index{node.getInputs().at(model::operation::SplitNode::Input::INPUT)}; + _tensor_builder->dimCorrection(input_index, false); + for (const auto &output : node.getOutputs()) + _tensor_builder->dimCorrection(output, false); +} + +void ShapeFixer::visit(const 
model::operation::UnpackNode &node) +{ + const auto input_index{node.getInputs().at(model::operation::UnpackNode::Input::INPUT)}; + _tensor_builder->dimCorrection(input_index, false); + for (const auto &output_index : node.getOutputs()) + _tensor_builder->dimCorrection(output_index, false); +} + +void ShapeFixer::visit(const model::operation::PadNode &node) +{ + const auto input_index{node.getInputs().at(model::operation::PadNode::Input::INPUT)}; + const auto output_index{node.getOutputs().at(0)}; + _tensor_builder->dimCorrection(input_index, false); + _tensor_builder->dimCorrection(output_index, false); +} + +} // namespace acl_cl +} // namespace backend +} // namespace neurun diff --git a/runtimes/neurun/backend/acl_cl/ShapeFixer.h b/runtimes/neurun/backend/acl_cl/ShapeFixer.h new file mode 100644 index 000000000..519d1bafb --- /dev/null +++ b/runtimes/neurun/backend/acl_cl/ShapeFixer.h @@ -0,0 +1,105 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef __NEURUN_BACKEND_ACL_CL_SHAPE_FIXER_H__ +#define __NEURUN_BACKEND_ACL_CL_SHAPE_FIXER_H__ + +#include <backend/IShapeFixer.h> + +#include "model/Operands.h" +#include "TensorBuilder.h" + +namespace neurun +{ +namespace backend +{ +namespace acl_cl +{ + +class ShapeFixer : public IShapeFixer +{ +public: + ShapeFixer(const neurun::model::Operands &ctx, + const std::shared_ptr<TensorBuilder> &tensor_builder); + + std::shared_ptr<ITensorBuilder> tensor_builder() override { return _tensor_builder; } + + void visit(const model::operation::Conv2DNode &) override; + void visit(const model::operation::DepthwiseConv2DNode &) override; + void visit(const model::operation::MaxPool2DNode &) override; + void visit(const model::operation::AvgPool2DNode &) override; + void visit(const model::operation::ConcatNode &) override; + void visit(const model::operation::FullyConnectedNode &) override; + void visit(const model::operation::MulNode &) override; + void visit(const model::operation::ReduceSumNode &) override; + void visit(const model::operation::ReshapeNode &) override; + void visit(const model::operation::SqueezeNode &) override; + void visit(const model::operation::TanhNode &) override; + void visit(const model::operation::SoftmaxNode &) override; + void visit(const model::operation::StridedSliceNode &) override; + void visit(const model::operation::TransposeNode &) override; + void visit(const model::operation::AddNode &) override; + void visit(const model::operation::SubNode &) override; + void visit(const model::operation::CastNode &) override; + void visit(const model::operation::DivNode &) override; + void visit(const model::operation::ExpNode &) override; + void visit(const model::operation::LogisticNode &) override; + void visit(const model::operation::ReduceMaxNode &) override; + void visit(const model::operation::ComparisonNode &) override; + void visit(const model::operation::LogicalAndNode &) override; + void visit(const model::operation::LSTMNode 
&) override; + void visit(const model::operation::RSQRTNode &) override; + void visit(const model::operation::ReLUNode &) override; + void visit(const model::operation::ResizeBilinearNode &) override; + void visit(const model::operation::ReLU1Node &) override; + void visit(const model::operation::ReLU6Node &) override; + void visit(const model::operation::RNNNode &) override; + void visit(const model::operation::FloorNode &) override; + void visit(const model::operation::SpaceToDepthNode &) override; + void visit(const model::operation::L2Pool2DNode &) override; + void visit(const model::operation::EmbeddingLookupNode &) override; + void visit(const model::operation::L2NormalizationNode &) override; + void visit(const model::operation::HashtableLookupNode &) override; + void visit(const model::operation::PReLUNode &) override; + void visit(const model::operation::TransposeConvNode &) override; + void visit(const model::operation::SQRTNode &) override; + void visit(const model::operation::LogicalOrNode &) override; + void visit(const model::operation::LogicalNotNode &) override; + void visit(const model::operation::SquaredDifferenceNode &) override; + void visit(const model::operation::TopKV2Node &) override; + void visit(const model::operation::GatherNode &) override; + void visit(const model::operation::NegNode &) override; + void visit(const model::operation::AbsNode &) override; + void visit(const model::operation::ArgMaxNode &) override; + void visit(const model::operation::DequantizeNode &) override; + void visit(const model::operation::MeanNode &) override; + void visit(const model::operation::LocalResponseNormalizationNode &) override; + void visit(const model::operation::DepthToSpaceNode &) override; + void visit(const model::operation::ReduceMinNode &) override; + void visit(const model::operation::SplitNode &) override; + void visit(const model::operation::UnpackNode &) override; + void visit(const model::operation::PadNode &) override; + +private: + 
const neurun::model::Operands &_ctx; + std::shared_ptr<TensorBuilder> _tensor_builder; +}; + +} // namespace acl_cl +} // namespace backend +} // namespace neurun + +#endif // __NEURUN_BACKEND_ACL_CL_SHAPE_FIXER_H__ diff --git a/runtimes/neurun/backend/acl_cl/TensorBuilder.h b/runtimes/neurun/backend/acl_cl/TensorBuilder.h new file mode 100644 index 000000000..8ce69a6c2 --- /dev/null +++ b/runtimes/neurun/backend/acl_cl/TensorBuilder.h @@ -0,0 +1,42 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef __NEURUN_BACKEND_ACL_CL_TENSOR_BUILDER_H__ +#define __NEURUN_BACKEND_ACL_CL_TENSOR_BUILDER_H__ + +#include <TemplTensorBuilder.h> + +#include "operand/CLTensor.h" +#include "operand/CLSubTensor.h" +#include "operand/Object.h" + +namespace neurun +{ +namespace backend +{ +namespace acl_cl +{ + +using TensorBuilder = + ::neurun::backend::acl_common::TemplTensorBuilder<::neurun::backend::acl_cl::operand::ICLTensor, + operand::CLTensor, operand::CLSubTensor, + operand::Object>; + +} // namespace acl_cl +} // namespace backend +} // namespace neurun + +#endif // __NEURUN_BACKEND_ACL_CL_TENSOR_BUILDER_H__ diff --git a/runtimes/neurun/backend/acl_cl/TensorManager.h b/runtimes/neurun/backend/acl_cl/TensorManager.h new file mode 100644 index 000000000..10145bb9a --- /dev/null +++ b/runtimes/neurun/backend/acl_cl/TensorManager.h @@ -0,0 +1,84 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef __NEURUN_BACKEND_ACL_CL_TENSOR_MANAGER_H__ +#define __NEURUN_BACKEND_ACL_CL_TENSOR_MANAGER_H__ + +#include <arm_compute/runtime/CL/CLBufferAllocator.h> +#include <arm_compute/runtime/PoolManager.h> +#include <arm_compute/runtime/BlobLifetimeManager.h> +#include <arm_compute/runtime/MemoryManagerOnDemand.h> +#include <arm_compute/runtime/CL/CLMemoryGroup.h> + +#include <AclMemoryManager.h> +#include <AclLinearMemoryManager.h> +#include <AclInternalBufferManager.h> +#include <AclTensorManager.h> + +#include "operand/CLTensor.h" +#include "operand/CLSubTensor.h" +#include "operand/Object.h" + +#include "util/logging.h" + +namespace neurun +{ +namespace backend +{ +namespace acl_cl +{ + +using MemoryManager = + ::neurun::backend::acl_common::AclMemoryManager<operand::ICLTensor, operand::CLTensor, + operand::CLSubTensor, operand::Object>; + +using LinearMemoryManager = ::neurun::backend::acl_common::AclLinearMemoryManager< + operand::ICLTensor, operand::CLTensor, operand::CLSubTensor, operand::Object, + ::arm_compute::MemoryManagerOnDemand, ::arm_compute::PoolManager, + ::arm_compute::BlobLifetimeManager, ::arm_compute::CLBufferAllocator, + ::arm_compute::CLMemoryGroup>; + +using InternalBufferManager = ::neurun::backend::acl_common::AclInternalBufferManager< + ::arm_compute::MemoryManagerOnDemand, ::arm_compute::PoolManager, + ::arm_compute::BlobLifetimeManager, ::arm_compute::CLBufferAllocator>; + +using TensorManager = + ::neurun::backend::acl_common::AclTensorManager<::neurun::backend::acl_cl::operand::ICLTensor, + operand::CLTensor, operand::CLSubTensor, + operand::Object>; + +inline TensorManager *createTensorManager() +{ + const std::string executor_str = util::getConfigString(util::config::EXECUTOR); + + if (executor_str == "Linear") + { + VERBOSE(acl_cl_createTensorManager) << "AclTensorManager as Linear" << std::endl; + return new TensorManager(new MemoryManager(), new LinearMemoryManager(), + new InternalBufferManager()); + } + else + {
VERBOSE(acl_cl_createTensorManager) << "AclTensorManager" << std::endl; + return new TensorManager(new MemoryManager(), new MemoryManager(), new InternalBufferManager()); + } +} + +} // namespace acl_cl +} // namespace backend +} // namespace neurun + +#endif // __NEURUN_BACKEND_ACL_CL_TENSOR_MANAGER_H__ diff --git a/runtimes/neurun/backend/acl_cl/kernel/ConcatLayer.cc b/runtimes/neurun/backend/acl_cl/kernel/ConcatLayer.cc new file mode 100644 index 000000000..aa1fd9aed --- /dev/null +++ b/runtimes/neurun/backend/acl_cl/kernel/ConcatLayer.cc @@ -0,0 +1,165 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "ConcatLayer.h" + +#include <arm_compute/runtime/CL/CLScheduler.h> + +#include "util/feature/nchw/View.h" +#include "util/logging.h" + +namespace +{ + +inline bool matchSizeExceptAxis(const ::neurun::backend::acl_cl::operand::ICLTensor *t1, + const ::neurun::backend::acl_cl::operand::ICLTensor *t2, + uint32_t axis) +{ + assert(t1->num_dimensions() <= 4); + assert(t2->num_dimensions() <= 4); + + for (uint32_t i = 0; i < 4; i++) + { + if (axis == i) + continue; + if (t1->info()->dimension(i) != t2->info()->dimension(i)) + return false; + } + return true; +} + +} // namespace {anonymous} + +namespace neurun +{ +namespace backend +{ +namespace acl_cl +{ +namespace kernel +{ + +ConcatLayer::ConcatLayer() + : _input_allocs(), _output_alloc(nullptr), _axis(0), _input_type(arm_compute::DataType::F32) +{ + // DO NOTHING +} + +template <typename T> bool ConcatLayer::concatenate() +{ + // Input and output size check + { + // NOTE Support only tensor with dimension 4 or less + + uint32_t axis_sum = 0; + + for (auto input : _input_allocs) + { + assert(_output_alloc->ptr()->layout() == input->ptr()->layout()); + assert(matchSizeExceptAxis(_output_alloc->ptr(), input->ptr(), _axis)); + axis_sum += input->ptr()->info()->dimension(_axis); + } + + assert(_output_alloc->ptr()->info()->dimension(_axis) == axis_sum); + } + + VERBOSE(Concat_RUN) << "START Concat" << std::endl; + + // Perform operation + { + uint32_t axis_offset = 0; + + auto outout_fn = [&](::neurun::backend::operand::ITensor &out_tensor) { + for (auto input : _input_allocs) + { + auto &out_cl_tensor = + static_cast<::neurun::backend::acl_cl::operand::ICLTensor &>(out_tensor); + auto input_fn = [&](::neurun::backend::operand::ITensor &in_tensor) { + auto &in_cl_tensor = + static_cast<::neurun::backend::acl_cl::operand::ICLTensor &>(in_tensor); + for (uint32_t i = 0; i < in_cl_tensor.info()->dimension(0); i++) + { + for (uint32_t j = 0; j < in_cl_tensor.info()->dimension(1); j++) + { + for (uint32_t k = 
0; k < in_cl_tensor.info()->dimension(2); k++) + { + for (uint32_t l = 0; l < in_cl_tensor.info()->dimension(3); l++) + { + int32_t io = (_axis == 0) ? axis_offset : 0; + int32_t jo = (_axis == 1) ? axis_offset : 0; + int32_t ko = (_axis == 2) ? axis_offset : 0; + int32_t lo = (_axis == 3) ? axis_offset : 0; + T value = + *reinterpret_cast<T *>(in_cl_tensor.handle()->ptr_to_element({i, j, k, l})); + *reinterpret_cast<T *>(out_cl_tensor.handle()->ptr_to_element( + {i + io, j + jo, k + ko, l + lo})) = value; + } + } + } + } + if (_axis == 0) + axis_offset += in_cl_tensor.info()->dimension(0); + if (_axis == 1) + axis_offset += in_cl_tensor.info()->dimension(1); + if (_axis == 2) + axis_offset += in_cl_tensor.info()->dimension(2); + if (_axis == 3) + axis_offset += in_cl_tensor.info()->dimension(3); + }; + input->access(input_fn); + } + }; + _output_alloc->access(outout_fn); + } + + VERBOSE(Concat_RUN) << "End Concat" << std::endl; + + return true; +} + +void ConcatLayer::configure( + const std::vector<::neurun::backend::acl_cl::operand::Object *> &input_allocs, int32_t axis, + ::neurun::backend::acl_cl::operand::Object *output_alloc) +{ + _input_allocs = input_allocs; + _output_alloc = output_alloc; + + assert(axis < 4); + + // TODO Handle when axis is negative + assert(axis >= 0); + + _axis = axis; + + _input_type = input_allocs[0]->ptr()->data_type(); +} + +void ConcatLayer::run() +{ + if (_input_type == arm_compute::DataType::F32) + { + concatenate<float>(); + } + else if (_input_type == arm_compute::DataType::QASYMM8) + { + concatenate<uint8_t>(); + } +} + +} // namespace kernel +} // namespace acl_cl +} // namespace backend +} // namespace neurun diff --git a/runtimes/neurun/backend/acl_cl/kernel/ConcatLayer.h b/runtimes/neurun/backend/acl_cl/kernel/ConcatLayer.h new file mode 100644 index 000000000..ed273e297 --- /dev/null +++ b/runtimes/neurun/backend/acl_cl/kernel/ConcatLayer.h @@ -0,0 +1,66 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. 
All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __NEURUN_BACKEND_ACL_CL_KERNEL_CONCAT_LAYER_H__ +#define __NEURUN_BACKEND_ACL_CL_KERNEL_CONCAT_LAYER_H__ + +#include <arm_compute/runtime/IFunction.h> +#include <arm_compute/core/Types.h> + +#include "operand/Object.h" + +namespace neurun +{ +namespace backend +{ +namespace acl_cl +{ +namespace kernel +{ + +// +// neurun::backend::acl_cl::kernel::ConcatLayer +// A naive implementation of ConcatLayer for ACL +// + +class ConcatLayer : public ::arm_compute::IFunction +{ +public: + ConcatLayer(); + +public: + void configure(const std::vector<::neurun::backend::acl_cl::operand::Object *> &input_allocs, + int32_t axis /* NNAPI tensor axis from NHWC order */, + ::neurun::backend::acl_cl::operand::Object *output_alloc); + + void run(); + +private: + template <typename T> bool concatenate(); + +private: + std::vector<::neurun::backend::acl_cl::operand::Object *> _input_allocs; + ::neurun::backend::acl_cl::operand::Object *_output_alloc; + int32_t _axis; + arm_compute::DataType _input_type; +}; + +} // namespace kernel +} // namespace acl_cl +} // namespace backend +} // namespace neurun + +#endif // __NEURUN_BACKEND_ACL_CL_KERNEL_CONCAT_LAYER_H__ diff --git a/runtimes/neurun/backend/acl_cl/operand/CLSubTensor.cc b/runtimes/neurun/backend/acl_cl/operand/CLSubTensor.cc new file mode 100644 index 000000000..70c8829d9 --- /dev/null +++ 
b/runtimes/neurun/backend/acl_cl/operand/CLSubTensor.cc @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "CLSubTensor.h" + +namespace neurun +{ +namespace backend +{ +namespace acl_cl +{ +namespace operand +{ + +CLSubTensor::CLSubTensor(ICLTensor *parent, const arm_compute::TensorShape &tensor_shape, + const arm_compute::Coordinates &coords, size_t rank, bool extend_parent) + : _cl_sub_tensor(std::make_shared<arm_compute::CLSubTensor>(parent->handle(), tensor_shape, + coords, extend_parent)), + _rank{rank} +{ + // DO NOTHING +} + +const arm_compute::CLSubTensor *CLSubTensor::handle() const { return _cl_sub_tensor.get(); } + +arm_compute::CLSubTensor *CLSubTensor::handle() { return _cl_sub_tensor.get(); } + +} // namespace operand +} // namespace acl_cl +} // namespace backend +} // namespace neurun diff --git a/runtimes/neurun/backend/acl_cl/operand/CLSubTensor.h b/runtimes/neurun/backend/acl_cl/operand/CLSubTensor.h new file mode 100644 index 000000000..8eba3760f --- /dev/null +++ b/runtimes/neurun/backend/acl_cl/operand/CLSubTensor.h @@ -0,0 +1,63 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __NEURUN_BACKEND_ACL_CL_OPERAND_CL_SUB_TENSOR_H__ +#define __NEURUN_BACKEND_ACL_CL_OPERAND_CL_SUB_TENSOR_H__ + +#include <arm_compute/runtime/CL/CLSubTensor.h> +#include "ICLTensor.h" +#include "compiler/SubTensorInfo.h" + +namespace neurun +{ +namespace backend +{ +namespace acl_cl +{ +namespace operand +{ + +class CLSubTensor : public ICLTensor +{ +public: + CLSubTensor() = delete; + +public: + CLSubTensor(ICLTensor *parent, const arm_compute::TensorShape &tensor_shape, + const arm_compute::Coordinates &coords, size_t rank, bool extend_parent = false); + +public: + size_t num_dimensions() const final { return _rank; } + +public: + const arm_compute::CLSubTensor *handle() const override; + arm_compute::CLSubTensor *handle() override; + +public: + // This method is used to prevent the use of memcpy for SubTensor + bool has_padding() const override { return true; } + +private: + std::shared_ptr<arm_compute::CLSubTensor> _cl_sub_tensor; + size_t _rank; +}; + +} // namespace operand +} // namespace acl_cl +} // namespace backend +} // namespace neurun + +#endif // __NEURUN_BACKEND_ACL_CL_OPERAND_CL_SUB_TENSOR_H__ diff --git a/runtimes/neurun/backend/acl_cl/operand/CLTensor.cc b/runtimes/neurun/backend/acl_cl/operand/CLTensor.cc new file mode 100644 index 000000000..6153fc2e4 --- /dev/null +++ b/runtimes/neurun/backend/acl_cl/operand/CLTensor.cc @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. 
All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "CLTensor.h" + +#include <arm_compute/runtime/CL/CLScheduler.h> +#include <arm_compute/runtime/CL/CLMemory.h> +#include <arm_compute/runtime/CL/CLMemoryRegion.h> + +#include <Convert.h> + +namespace neurun +{ +namespace backend +{ +namespace acl_cl +{ +namespace operand +{ + +CLTensor::CLTensor(const arm_compute::TensorInfo &info, size_t rank) + : _cl_tensor(std::make_shared<arm_compute::CLTensor>()), _rank{rank} +{ + allocator()->init(info); +} + +const arm_compute::CLTensor *CLTensor::handle() const { return _cl_tensor.get(); } + +arm_compute::CLTensor *CLTensor::handle() { return _cl_tensor.get(); } + +arm_compute::CLTensorAllocator *CLTensor::allocator() { return _cl_tensor->allocator(); } + +void CLTensor::map(bool blocking) { _cl_tensor->map(blocking); } + +void CLTensor::unmap() { _cl_tensor->unmap(); } + +void CLTensor::setBuffer(void *host_ptr) +{ + // Constructs a Buffer on a user-supplied memory + auto buffer = cl::Buffer(arm_compute::CLScheduler::get().context(), + CL_MEM_USE_HOST_PTR | CL_MEM_READ_WRITE, info()->total_size(), host_ptr); + // import memory + allocator()->import_memory(buffer); +} + +} // namespace operand +} // namespace acl_cl +} // namespace backend +} // namespace neurun diff --git a/runtimes/neurun/backend/acl_cl/operand/CLTensor.h b/runtimes/neurun/backend/acl_cl/operand/CLTensor.h new file mode 100644 index 000000000..952851623 --- 
/*
 * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef __NEURUN_BACKEND_ACL_CL_OPERAND_CL_TENSOR_H__
#define __NEURUN_BACKEND_ACL_CL_OPERAND_CL_TENSOR_H__

#include <arm_compute/core/TensorInfo.h>
#include <arm_compute/runtime/CL/CLTensor.h>
#include <arm_compute/runtime/CL/CLScheduler.h>
#include "arm_compute/runtime/CL/CLTensorAllocator.h"
#include "ICLTensor.h"

namespace neurun
{
namespace backend
{
namespace acl_cl
{
namespace operand
{

// An ICLTensor backed by an owning arm_compute::CLTensor (as opposed to
// CLSubTensor, which only views a parent's buffer).
class CLTensor : public ICLTensor
{
public:
  CLTensor() = delete;

public:
  // Initializes the allocator with `info`; `rank` is recorded separately
  // and reported by num_dimensions().
  CLTensor(const arm_compute::TensorInfo &info, size_t rank);

public:
  // Reports the rank supplied at construction rather than asking ACL.
  size_t num_dimensions() const final { return _rank; }

public:
  const arm_compute::CLTensor *handle() const override;
  arm_compute::CLTensor *handle() override;

public:
  arm_compute::CLTensorAllocator *allocator();
  // Map/unmap the tensor into host-visible memory via the default queue.
  void map(bool blocking = true);
  void unmap();
  /** Set given buffer as the buffer of the tensor
   *
   * @note Ownership of the memory is not transferred to this object.
   *       Thus management (allocate/free) should be done by the client.
   *
   * @param[in] host_ptr Storage to be used.
   */
  void setBuffer(void *host_ptr);

private:
  std::shared_ptr<arm_compute::CLTensor> _cl_tensor;
  size_t _rank;
};

} // namespace operand
} // namespace acl_cl
} // namespace backend
} // namespace neurun

#endif // __NEURUN_BACKEND_ACL_CL_OPERAND_CL_TENSOR_H__
+ */ + +#ifndef __NEURUN_BACKEND_ACL_CL_OPERAND_I_CL_TENSOR_H__ +#define __NEURUN_BACKEND_ACL_CL_OPERAND_I_CL_TENSOR_H__ + +#include <arm_compute/core/CL/ICLTensor.h> + +#include <IACLTensor.h> + +namespace neurun +{ +namespace backend +{ +namespace acl_cl +{ +namespace operand +{ + +class ICLTensor : public acl_common::IACLTensor +{ +public: + const arm_compute::ICLTensor *handle() const override = 0; + arm_compute::ICLTensor *handle() override = 0; + +public: + void map(cl::CommandQueue &q, bool blocking = true) { return handle()->map(q, blocking); } + void unmap(cl::CommandQueue &q) { return handle()->unmap(q); } +}; + +} // namespace operand +} // namespace acl_cl +} // namespace backend +} // namespace neurun + +#endif // __NEURUN_BACKEND_ACL_CL_OPERAND_I_CL_TENSOR_H__ diff --git a/runtimes/neurun/backend/acl_cl/operand/Object.cc b/runtimes/neurun/backend/acl_cl/operand/Object.cc new file mode 100644 index 000000000..8f9b2a181 --- /dev/null +++ b/runtimes/neurun/backend/acl_cl/operand/Object.cc @@ -0,0 +1,46 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "Object.h" + +#include <arm_compute/runtime/CL/CLScheduler.h> + +namespace neurun +{ +namespace backend +{ +namespace acl_cl +{ +namespace operand +{ + +void Object::access(const std::function<void(backend::operand::ITensor &tensor)> &fn) const +{ + auto &queue = ::arm_compute::CLScheduler::get().queue(); + + // This is an optional input + if (_tensor->total_size() == 0) + return; + + _tensor->map(queue); + fn(*_tensor); + _tensor->unmap(queue); +} + +} // namespace operand +} // namespace acl_cl +} // namespace backend +} // namespace neurun diff --git a/runtimes/neurun/backend/acl_cl/operand/Object.h b/runtimes/neurun/backend/acl_cl/operand/Object.h new file mode 100644 index 000000000..a4308feed --- /dev/null +++ b/runtimes/neurun/backend/acl_cl/operand/Object.h @@ -0,0 +1,60 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef __NEURUN_BACKEND_ACL_CL_OPERAND_OBJECT_H__ +#define __NEURUN_BACKEND_ACL_CL_OPERAND_OBJECT_H__ + +#include <memory> + +#include <backend/operand/IObject.h> +#include "operand/ICLTensor.h" + +namespace neurun +{ +namespace backend +{ +namespace acl_cl +{ +namespace operand +{ + +class Object : public backend::operand::IObject +{ +public: + Object() = default; + +public: + Object(const std::shared_ptr<acl_cl::operand::ICLTensor> &tensor) : _tensor{tensor} + { + // DO NOTHING + } + +public: + acl_cl::operand::ICLTensor *ptr(void) const override { return _tensor.get(); } + +private: + std::shared_ptr<acl_cl::operand::ICLTensor> _tensor; + +public: + void access(const std::function<void(backend::operand::ITensor &tensor)> &fn) const override; +}; + +} // namespace operand +} // namespace acl_cl +} // namespace backend +} // namespace neurun + +#endif // __NEURUN_BACKEND_ACL_CL_OPERAND_OBJECT_H__ |