10 files changed, 986 insertions, 0 deletions
diff --git a/runtimes/contrib/labs/CMakeLists.txt b/runtimes/contrib/labs/CMakeLists.txt
new file mode 100644
index 000000000..57e28c11a
--- /dev/null
+++ b/runtimes/contrib/labs/CMakeLists.txt
@@ -0,0 +1,5 @@
+if(NOT BUILD_LABS)
+  return()
+endif(NOT BUILD_LABS)
+
+add_subdirectories()
diff --git a/runtimes/contrib/labs/jniacl/CMakeLists.txt b/runtimes/contrib/labs/jniacl/CMakeLists.txt
new file mode 100644
index 000000000..f66127b84
--- /dev/null
+++ b/runtimes/contrib/labs/jniacl/CMakeLists.txt
@@ -0,0 +1,18 @@
+#
+# Simple Android JNI execution test of ACL
+#
+
+if(NOT "${TARGET_OS}" STREQUAL "android")
+  return()
+endif(NOT "${TARGET_OS}" STREQUAL "android")
+
+nnfw_find_package(ARMCompute REQUIRED)
+
+link_directories(${CMAKE_INSTALL_PREFIX}/lib)
+
+set(JNIACL_SRCS src/jniacl_main.cc
+                src/io_accessor.cc)
+
+add_library(jniacl_jni SHARED ${JNIACL_SRCS})
+target_include_directories(jniacl_jni PUBLIC ${TFLITE_JNI_INCLUDES} src)
+target_link_libraries(jniacl_jni arm_compute_graph log)
diff --git a/runtimes/contrib/labs/jniacl/src/io_accessor.cc b/runtimes/contrib/labs/jniacl/src/io_accessor.cc
new file mode 100644
index 000000000..076c93f3d
--- /dev/null
+++ b/runtimes/contrib/labs/jniacl/src/io_accessor.cc
@@ -0,0 +1,96 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "io_accessor.h"
+#include <ostream>
+#include <android/log.h>
+
+bool InputAccessor::access_tensor(arm_compute::ITensor &tensor)
+{
+  // Subtract the mean value from each channel
+  arm_compute::Window window;
+  window.use_tensor_dimensions(tensor.info()->tensor_shape());
+
+  execute_window_loop(window, [&](const arm_compute::Coordinates &id) {
+    *reinterpret_cast<float *>(tensor.ptr_to_element(id)) = _test_input;
+    _test_input += _inc ? 1.0 : 0.0;
+
+    __android_log_print(ANDROID_LOG_DEBUG, "LOG_TAG", "Input %d, %d = %lf\r\n", id.y(), id.x(),
+                        *reinterpret_cast<float *>(tensor.ptr_to_element(id)));
+  });
+  return true;
+}
+
+bool OutputAccessor::access_tensor(arm_compute::ITensor &tensor)
+{
+  // Subtract the mean value from each channel
+  arm_compute::Window window;
+  window.use_tensor_dimensions(tensor.info()->tensor_shape());
+
+  execute_window_loop(window, [&](const arm_compute::Coordinates &id) {
+    __android_log_print(ANDROID_LOG_DEBUG, "Output", "Input %d, %d = %lf\r\n", id.y(), id.x(),
+                        *reinterpret_cast<float *>(tensor.ptr_to_element(id)));
+  });
+  return false; // end the network
+}
+
+bool WeightAccessor::access_tensor(arm_compute::ITensor &tensor)
+{
+  // Subtract the mean value from each channel
+  arm_compute::Window window;
+  window.use_tensor_dimensions(tensor.info()->tensor_shape());
+
+  execute_window_loop(window, [&](const arm_compute::Coordinates &id) {
+    *reinterpret_cast<float *>(tensor.ptr_to_element(id)) = _test_weight;
+    _test_weight += _inc ? 1.0 : 0.0;
+  });
+  return true;
+}
+
+bool BiasAccessor::access_tensor(arm_compute::ITensor &tensor)
+{
+  // Subtract the mean value from each channel
+  arm_compute::Window window;
+  window.use_tensor_dimensions(tensor.info()->tensor_shape());
+
+  execute_window_loop(window, [&](const arm_compute::Coordinates &id) {
+    *reinterpret_cast<float *>(tensor.ptr_to_element(id)) = 0.0;
+  });
+  return true;
+}
diff --git a/runtimes/contrib/labs/jniacl/src/io_accessor.h b/runtimes/contrib/labs/jniacl/src/io_accessor.h
new file mode 100644
index 000000000..bc4376644
--- /dev/null
+++ b/runtimes/contrib/labs/jniacl/src/io_accessor.h
@@ -0,0 +1,93 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __IO_ACCESSOR_H__
+#define __IO_ACCESSOR_H__
+
+#include <arm_compute/graph/ITensorAccessor.h>
+
+class InputAccessor : public arm_compute::graph::ITensorAccessor
+{
+public:
+  InputAccessor(bool inc) : _inc(inc) { _test_input = 1.0; }
+  InputAccessor(InputAccessor &&) = default;
+
+  // Inherited methods overriden:
+  bool access_tensor(arm_compute::ITensor &tensor) override;
+
+private:
+  bool _inc;
+  float _test_input;
+};
+
+class OutputAccessor : public arm_compute::graph::ITensorAccessor
+{
+public:
+  OutputAccessor() = default;
+  OutputAccessor(OutputAccessor &&) = default;
+
+  // Inherited methods overriden:
+  bool access_tensor(arm_compute::ITensor &tensor) override;
+};
+
+class WeightAccessor : public arm_compute::graph::ITensorAccessor
+{
+public:
+  WeightAccessor(bool inc) : _inc(inc) { _test_weight = 1.0; }
+  WeightAccessor(WeightAccessor &&) = default;
+
+  // Inherited methods overriden:
+  bool access_tensor(arm_compute::ITensor &tensor) override;
+
+private:
+  bool _inc;
+  float _test_weight;
+};
+
+class BiasAccessor : public arm_compute::graph::ITensorAccessor
+{
+public:
+  BiasAccessor() = default;
+  BiasAccessor(BiasAccessor &&) = default;
+
+  // Inherited methods overriden:
+  bool access_tensor(arm_compute::ITensor &tensor) override;
+};
+
+#endif // __IO_ACCESSOR_H__
diff --git a/runtimes/contrib/labs/jniacl/src/jniacl_main.cc b/runtimes/contrib/labs/jniacl/src/jniacl_main.cc
new file mode 100644
index 000000000..4e5f10d1f
--- /dev/null
+++ b/runtimes/contrib/labs/jniacl/src/jniacl_main.cc
@@ -0,0 +1,37 @@
+#include <jni.h>
+#include <string>
+
+#include <arm_compute/graph/Graph.h>
+#include <arm_compute/graph/Nodes.h>
+
+#include "io_accessor.h"
+
+extern "C" JNIEXPORT jstring JNICALL
+Java_com_samsung_testaclexec_ActivityMain_RunACLJNI(JNIEnv *env, jobject)
+{
+  using arm_compute::DataType;
+  using arm_compute::graph::Tensor;
+  using arm_compute::graph::TargetHint;
+  using arm_compute::graph::Graph;
+  using arm_compute::TensorInfo;
+  using arm_compute::TensorShape;
+
+  arm_compute::graph::Graph graph;
+  TargetHint target_hint = TargetHint::OPENCL;
+  bool autoinc = true;
+
+  graph << target_hint << Tensor(TensorInfo(TensorShape(3U, 3U, 1U, 1U), 1, DataType::F32),
+                                 std::unique_ptr<InputAccessor>(new InputAccessor(autoinc)))
+        << arm_compute::graph::ConvolutionLayer(
+               3U, 3U, 1U, std::unique_ptr<WeightAccessor>(new WeightAccessor(autoinc)),
+               std::unique_ptr<BiasAccessor>(new BiasAccessor()),
+               arm_compute::PadStrideInfo(1, 1, 0, 0))
+        << Tensor(std::unique_ptr<OutputAccessor>(new OutputAccessor()));
+  ;
+
+  graph.run();
+
+  std::string hello = "SoftMax Run OK";
+
+  return env->NewStringUTF(hello.c_str());
+}
diff --git a/runtimes/contrib/labs/opencl_test/CMakeLists.txt b/runtimes/contrib/labs/opencl_test/CMakeLists.txt
new file mode 100644
index 000000000..dc8f5f661
--- /dev/null
+++ b/runtimes/contrib/labs/opencl_test/CMakeLists.txt
@@ -0,0 +1,11 @@
+if(NOT ${TARGET_ARCH_BASE} STREQUAL "arm")
+  return()
+endif(NOT ${TARGET_ARCH_BASE} STREQUAL "arm")
+
+list(APPEND OPENCL_INFO_SOURCE "src/opencl_test.cc")
+
+nnfw_find_package(ARMCompute REQUIRED)
+
+add_executable(opencl_test ${OPENCL_INFO_SOURCE})
+target_link_libraries(opencl_test arm_compute)
+target_link_libraries(opencl_test arm_compute_ex)
diff --git a/runtimes/contrib/labs/opencl_test/README.md b/runtimes/contrib/labs/opencl_test/README.md
new file mode 100644
index 000000000..950528f81
--- /dev/null
+++ b/runtimes/contrib/labs/opencl_test/README.md
@@ -0,0 +1,8 @@
+This directory contains experients of OpenCL code.
+
+How to run:
+```
+LD_LIBRARY_PATH=Product/out/lib Product/obj/contrib/opencl_test/opencl_test [option]
+```
+   - `[option]`
+      - `-g`: prints devices inside GPU and check if they use same memory address
diff --git a/runtimes/contrib/labs/opencl_test/src/opencl_test.cc b/runtimes/contrib/labs/opencl_test/src/opencl_test.cc
new file mode 100644
index 000000000..1faa91478
--- /dev/null
+++ b/runtimes/contrib/labs/opencl_test/src/opencl_test.cc
@@ -0,0 +1,386 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*******************************************************************************
+ * Copyright (c) 2008-2015 The Khronos Group Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and/or associated documentation files (the
+ * "Materials"), to deal in the Materials without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Materials, and to
+ * permit persons to whom the Materials are furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Materials.
+ *
+ * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
+ ******************************************************************************/
+
+#include "arm_compute/core/CL/OpenCL.h"
+
+#include <iostream>
+#include <vector>
+
+void printDeviceInfo(int n, cl::Device &device, cl::Device &default_device)
+{
+  bool is_default = (device() == default_device());
+  std::cout << "\t\t\t#" << n << " Device: (id: " << device() << ") "
+            << (is_default ? " -> default" : "") << "\n";
+
+  const auto name = device.getInfo<CL_DEVICE_NAME>();
+  std::cout << "\t\t\t\tName: " << name << "\n";
+
+  const auto compute_unit = device.getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>();
+  std::cout << "\t\t\t\tMax Compute Unit: " << compute_unit << "\n";
+
+  const auto max_work_item_size = device.getInfo<CL_DEVICE_MAX_WORK_ITEM_SIZES>();
+  std::cout << "\t\t\t\tMax Work Item Size: [";
+  for (auto size : max_work_item_size)
+    std::cout << size << ",";
+  std::cout << "]\n";
+
+  const auto max_work_group_size = device.getInfo<CL_DEVICE_MAX_WORK_GROUP_SIZE>();
+  std::cout << "\t\t\t\tMax Work Grpup Size: " << max_work_group_size << "\n";
+
+  const auto max_clock_frequency = device.getInfo<CL_DEVICE_MAX_CLOCK_FREQUENCY>();
+  std::cout << "\t\t\t\tMax Clock Frequency: " << max_clock_frequency << "\n";
+
+  std::cout << "\n";
+}
+
+class OpenCLGpu
+{
+public:
+  cl::Platform platform_;
+  cl::Context context_;
+  cl::vector<cl::Device> devices_;
+  std::vector<cl::CommandQueue *> q_;
+  cl::Program program_;
+
+  OpenCLGpu()
+  {
+    cl_int cl_error;
+
+    platform_ = cl::Platform::getDefault();
+
+    try
+    {
+      cl_context_properties properties[3] = {CL_CONTEXT_PLATFORM,
+                                             (cl_context_properties)platform_(), 0};
+
+      context_ = cl::Context(CL_DEVICE_TYPE_GPU, properties, NULL, NULL, &cl_error);
+    }
+    catch (cl::Error &err) // thrown when there is no Context for this platform
+    {
+      std::cout << "\t\t No Context Found\n";
+      return;
+    }
+
+    devices_ = context_.getInfo<CL_CONTEXT_DEVICES>();
+
+    for (int dev_id = 0; dev_id < devices_.size(); dev_id++)
+    {
+      cl::CommandQueue *que = new cl::CommandQueue(context_, devices_[dev_id]);
+      q_.emplace_back(que);
+    }
+  }
+
+  ~OpenCLGpu()
+  {
+    for (auto each_q : q_)
+      delete each_q;
+  }
+
+  void buildProgram(std::string &kernel_source_code)
+  {
+    std::vector<std::string> programStrings{kernel_source_code};
+
+    program_ = cl::Program(context_, programStrings);
+
+    try
+    {
+      program_.build("-cl-std=CL1.2");
+    }
+    catch (cl::Error &err)
+    {
+      cl_int buildErr = CL_SUCCESS;
+      auto buildInfo = program_.getBuildInfo<CL_PROGRAM_BUILD_LOG>(&buildErr);
+      for (auto &pair : buildInfo)
+      {
+        std::cerr << pair.second << std::endl << std::endl;
+      }
+    }
+  }
+};
+
+void checkContextMem()
+{
+  cl_int cl_error;
+
+  // get context, devices
+  //
+  std::cout << "\nChecking if devices in GPU shares the same memory address:\n\n";
+
+  OpenCLGpu gpu;
+
+  std::cout << "\nDevices in GPU:\n\n";
+
+  auto &devices = gpu.devices_;
+  auto default_device = cl::Device::getDefault();
+
+  int d = 0;
+  for (auto device : devices)
+    printDeviceInfo(++d, device, default_device);
+
+  if (d < 2)
+  {
+    std::cout << "\t\t This options works when there are n (>= 2) devices.\n";
+    return;
+  }
+
+  // allocate and map memory
+
+  typedef cl_int T;
+  const int items_per_device = 128;
+  const int length = items_per_device * devices.size();
+
+  std::vector<T> input(length);
+  std::vector<T> output(length, 0);
+
+  for (int i = 0; i < length; i++)
+    input[i] = i;
+
+  cl::Buffer input_buf(gpu.context_, (cl_mem_flags)CL_MEM_USE_HOST_PTR, length * sizeof(T),
+                       input.data(), &cl_error);
+  cl::Buffer output_buf(gpu.context_, (cl_mem_flags)CL_MEM_USE_HOST_PTR, length * sizeof(T),
+                        output.data(), &cl_error);
+
+  // compile test cl code
+
+  std::string kernel_source{"typedef int T;                                                 \n"
+                            "kernel void memory_test(                                       \n"
+                            "   const int dev_id,                                           \n"
+                            "   global T* input,                                            \n"
+                            "   global T* output,                                           \n"
+                            "   const int start_idx,                                        \n"
+                            "   const int count)                                            \n"
+                            "{                                                              \n"
+                            "   int input_idx = get_global_id(0);                           \n"
+                            "   if(input_idx < count)                                       \n"
+                            "   {                                                           \n"
+                            "       int output_idx = start_idx + input_idx;                 \n"
+                            "       output[output_idx] = input[input_idx] + dev_id;         \n"
+                            "   }                                                           \n"
+                            "}                                                              \n"};
+
+  gpu.buildProgram(kernel_source);
+
+  try
+  {
+    auto kernel_functor = cl::KernelFunctor<cl_int, cl::Buffer, cl::Buffer, cl_int, cl_int>(
+        gpu.program_, "memory_test"); // name should be same as cl function name
+
+    // create a queue per device and queue a kernel job
+
+    for (int dev_id = 0; dev_id < devices.size(); dev_id++)
+    {
+      kernel_functor(cl::EnqueueArgs(*(gpu.q_[dev_id]), cl::NDRange(items_per_device)),
+                     (cl_int)dev_id, // dev id
+                     input_buf, output_buf,
+                     (cl_int)(items_per_device * dev_id), // start index
+                     (cl_int)(items_per_device),          // count
+                     cl_error);
+    }
+
+    // sync
+
+    for (d = 0; d < devices.size(); d++)
+      (gpu.q_[d])->finish();
+
+    // check if memory state changed by all devices
+
+    cl::copy(*(gpu.q_[0]), output_buf, begin(output), end(output));
+
+    bool use_same_memory = true;
+
+    for (int dev_id = 0; dev_id < devices.size(); dev_id++)
+    {
+      for (int i = 0; i < items_per_device; ++i)
+      {
+        int output_idx = items_per_device * dev_id + i;
+        if (output[output_idx] != input[i] + dev_id)
+        {
+          std::cout << "Output[" << output_idx << "] : "
+                    << "expected = " << input[i] + dev_id << "; actual = " << output[output_idx]
+                    << "\n";
+          use_same_memory = false;
+          break;
+        }
+      }
+    }
+
+    if (use_same_memory)
+      std::cout << "\n=> Mapped memory addresses used by devices in GPU are same.\n\n";
+    else
+      std::cout << "\n=> Mapped memory addresses used by devices in GPU are different.\n\n";
+  }
+  catch (cl::Error &err)
+  {
+    std::cerr << "error: code: " << err.err() << ", what: " << err.what() << std::endl;
+  }
+}
+
+void printHelp()
+{
+  std::cout << "opencl information: \n\n";
+  std::cout << "\t -h : help\n";
+  std::cout
+      << "\t -g : print if memory map is shared among devices in GPU (in default platform)\n\n";
+  std::cout << "\t -s : test for synchronized work by two devices in a GPU\n\n";
+}
+
+#include <mutex>
+#include <chrono>
+#include <thread>
+#include <condition_variable>
+
+#define MAX_DEVICE_NUM 8 // just for testing
+
+int kernel_idx[MAX_DEVICE_NUM];
+unsigned char kernel_completed = 0x00; // bit 0 = 1 means kernel by device[0] was completed.
+unsigned char
+    kernel_completed_flag; // if comparing kernel_completed with this var, all kernels are completed
+int device_num;
+std::mutex kernel_complete_handler_mutex;
+
+std::condition_variable wakeup_main;
+std::mutex wakeup_main_mutex;
+
+void notifyKernelFinished(cl_event ev, cl_int ev_info, void *device_idx)
+{
+  std::cout << "callback from device[" << *((int *)device_idx) << "] : ==> completed.\n";
+
+  std::unique_lock<std::mutex> lock(kernel_complete_handler_mutex);
+
+  kernel_completed |= 0x01 << *((int *)device_idx);
+  if (kernel_completed == kernel_completed_flag)
+    wakeup_main.notify_one();
+}
+
+void testSync()
+{
+  OpenCLGpu gpu;
+
+  cl_int cl_error;
+  typedef cl_int T;
+  const int items_per_device = 1024 * 768;
+  const int length = items_per_device * gpu.devices_.size();
+
+  std::vector<T> output(length, 0);
+
+  cl::Buffer output_buf(gpu.context_, (cl_mem_flags)CL_MEM_USE_HOST_PTR, length * sizeof(T),
+                        output.data(), &cl_error);
+
+  std::string kernel_source{"kernel void test(global float* output, const int count)  \n"
+                            "{                                                        \n"
+                            "   int idx = get_global_id(0);                           \n"
+                            "   if(idx < count)                                       \n"
+                            "   {                                                     \n"
+                            "       float x = hypot(idx/1.111, idx*1.111);            \n"
+                            "       for (int y = 0; y < 200; y++)                     \n"
+                            "         x = rootn(log(pown(rootn(log(pown(x, 20)), 5), 20)), 5);  \n"
+                            "       output[idx] = x;                                  \n"
+                            "   }                                                     \n"
+                            "}                                                        \n"};
+
+  gpu.buildProgram(kernel_source);
+
+  try
+  {
+    auto kernel_functor = cl::KernelFunctor<cl::Buffer, cl_int>(
+        gpu.program_, "test"); // name should be same as cl function name
+
+    // variable init
+    cl::Event ev[MAX_DEVICE_NUM];
+
+    device_num = gpu.devices_.size();
+
+    kernel_completed = 0;
+    kernel_completed_flag = 0;
+    for (int i = 0; i < device_num; i++)
+    {
+      kernel_idx[i] = i;
+      kernel_completed_flag |= 0x01 << i;
+    }
+
+    // create a queue per device and queue a kernel job
+    // queueing with callback function
+    for (int dev_id = 0; dev_id < gpu.devices_.size(); dev_id++)
+    {
+      ev[dev_id] = kernel_functor(cl::EnqueueArgs(*(gpu.q_[dev_id]), cl::NDRange(items_per_device)),
+                                  output_buf,
+                                  (cl_int)(items_per_device), // count
+                                  cl_error);
+      ev[dev_id].setCallback(CL_COMPLETE, notifyKernelFinished, (void *)(kernel_idx + dev_id));
+
+      // how to check kernel execution status
+      //
+      // auto status  = ev[dev_id].getInfo<CL_EVENT_COMMAND_EXECUTION_STATUS>();
+      // std::cout << "Event status = " << (status == CL_QUEUED ? "CL_QUEUED" : status ==
+      // CL_SUBMITTED ? "CL_SUBMITTED" : status == CL_COMPLETE ? "CL_COMPLETE" : "unknown")
+      //           << std::endl;
+      // std::cout << "Event status code = " << status << std::endl;
+    }
+
+    // long wait until kernels are over
+    {
+      std::unique_lock<std::mutex> lk(wakeup_main_mutex);
+      wakeup_main.wait(lk, [] { return (kernel_completed == kernel_completed_flag); });
+
+      std::cout << "all devices were completed.\n";
+    }
+  }
+  catch (cl::Error &err)
+  {
+    std::cerr << "error: code: " << err.err() << ", what: " << err.what() << std::endl;
+  }
+}
+
+int main(const int argc, char **argv)
+{
+  if (argc < 2)
+    printHelp();
+  else
+  {
+    std::string option = argv[1];
+
+    if (option == "-h") // help
+      printHelp();
+    else if (option == "-g") // check if devices in GPU uses same memory address
+      checkContextMem();
+    else if (option == "-s") // check synchronization between devices in GPU
+      testSync();
+  }
+  return 0;
+}
diff --git a/runtimes/contrib/labs/tflite_examples/CMakeLists.txt b/runtimes/contrib/labs/tflite_examples/CMakeLists.txt
new file mode 100644
index 000000000..463bc5531
--- /dev/null
+++ b/runtimes/contrib/labs/tflite_examples/CMakeLists.txt
@@ -0,0 +1,2 @@
+add_executable(tflite_conv_example "src/conv.cpp")
+target_link_libraries(tflite_conv_example tensorflow-lite ${LIB_PTHREAD} dl nnfw_lib_tflite)
diff --git a/runtimes/contrib/labs/tflite_examples/src/conv.cpp b/runtimes/contrib/labs/tflite_examples/src/conv.cpp
new file mode 100644
index 000000000..3117c316c
--- /dev/null
+++ b/runtimes/contrib/labs/tflite_examples/src/conv.cpp
@@ -0,0 +1,330 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "tflite/ext/kernels/register.h"
+#include "tensorflow/lite/model.h"
+#include "tensorflow/lite/builtin_op_data.h"
+
+#include <iostream>
+
+using namespace tflite;
+using namespace nnfw::tflite;
+
+namespace vector
+{
+
+template <typename T> struct View
+{
+  virtual ~View() = default;
+
+  virtual int32_t size(void) const = 0;
+  virtual T at(uint32_t off) const = 0;
+};
+}
+
+namespace feature
+{
+
+struct Shape
+{
+  int32_t C;
+  int32_t H;
+  int32_t W;
+};
+
+template <typename T> struct View
+{
+  virtual ~View() = default;
+
+  virtual const Shape &shape(void) const = 0;
+  virtual T at(uint32_t ch, uint32_t row, uint32_t col) const = 0;
+};
+}
+
+namespace kernel
+{
+
+struct Shape
+{
+  int32_t N;
+  int32_t C;
+  int32_t H;
+  int32_t W;
+};
+
+template <typename T> struct View
+{
+  virtual ~View() = default;
+
+  virtual const Shape &shape(void) const = 0;
+  virtual T at(uint32_t nth, uint32_t ch, uint32_t row, uint32_t col) const = 0;
+};
+}
+
+const int32_t N = 1;
+const int32_t C = 2;
+
+class SampleBiasObject final : public vector::View<float>
+{
+public:
+  SampleBiasObject() : _size(N)
+  {
+    // DO NOTHING
+  }
+
+public:
+  int32_t size(void) const override { return _size; }
+
+  float at(uint32_t off) const override { return 0.0f; }
+
+private:
+  int32_t _size;
+};
+
+class SampleFeatureObject final : public feature::View<float>
+{
+public:
+  SampleFeatureObject()
+  {
+    _shape.C = C;
+    _shape.H = 3;
+    _shape.W = 4;
+
+    const uint32_t size = _shape.C * _shape.H * _shape.W;
+
+    for (uint32_t off = 0; off < size; ++off)
+    {
+      _value.emplace_back(off);
+    }
+
+    assert(_value.size() == size);
+  }
+
+public:
+  const feature::Shape &shape(void) const override { return _shape; };
+
+  float at(uint32_t ch, uint32_t row, uint32_t col) const override
+  {
+    return _value.at(ch * _shape.H * _shape.W + row * _shape.W + col);
+  }
+
+public:
+  float &at(uint32_t ch, uint32_t row, uint32_t col)
+  {
+    return _value.at(ch * _shape.H * _shape.W + row * _shape.W + col);
+  }
+
+private:
+  feature::Shape _shape;
+  std::vector<float> _value;
+};
+
+class SampleKernelObject final : public kernel::View<float>
+{
+public:
+  SampleKernelObject()
+  {
+    _shape.N = N;
+    _shape.C = C;
+    _shape.H = 3;
+    _shape.W = 4;
+
+    const uint32_t size = _shape.N * _shape.C * _shape.H * _shape.W;
+
+    for (uint32_t off = 0; off < size; ++off)
+    {
+      _value.emplace_back(off);
+    }
+
+    assert(_value.size() == size);
+  }
+
+public:
+  const kernel::Shape &shape(void) const override { return _shape; };
+
+  float at(uint32_t nth, uint32_t ch, uint32_t row, uint32_t col) const override
+  {
+    return _value.at(nth * _shape.C * _shape.H * _shape.W + ch * _shape.H * _shape.W +
+                     row * _shape.W + col);
+  }
+
+private:
+  kernel::Shape _shape;
+  std::vector<float> _value;
+};
+
+int main(int argc, char **argv)
+{
+  const SampleFeatureObject ifm;
+  const SampleKernelObject kernel;
+  const SampleBiasObject bias;
+
+  const int32_t IFM_C = ifm.shape().C;
+  const int32_t IFM_H = ifm.shape().H;
+  const int32_t IFM_W = ifm.shape().W;
+
+  const int32_t KER_N = kernel.shape().N;
+  const int32_t KER_C = kernel.shape().C;
+  const int32_t KER_H = kernel.shape().H;
+  const int32_t KER_W = kernel.shape().W;
+
+  const int32_t OFM_C = kernel.shape().N;
+  const int32_t OFM_H = (IFM_H - KER_H) + 1;
+  const int32_t OFM_W = (IFM_W - KER_W) + 1;
+
+  // Assumption on this example
+  assert(IFM_C == KER_C);
+  assert(KER_N == bias.size());
+
+  // Comment from 'context.h'
+  //
+  // Parameters for asymmetric quantization. Quantized values can be converted
+  // back to float using:
+  //    real_value = scale * (quantized_value - zero_point);
+  //
+  // Q: Is this necessary?
+  TfLiteQuantizationParams quantization;
+
+  quantization.scale = 1;
+  quantization.zero_point = 0;
+
+  Interpreter interp;
+
+  // On AddTensors(N) call, T/F Lite interpreter creates N tensors whose index is [0 ~ N)
+  interp.AddTensors(5);
+
+  // Configure OFM
+  interp.SetTensorParametersReadWrite(0, kTfLiteFloat32 /* type */, "output" /* name */,
+                                      {1 /*N*/, OFM_H, OFM_W, OFM_C} /* dims */, quantization);
+
+  // Configure IFM
+  interp.SetTensorParametersReadWrite(1, kTfLiteFloat32 /* type */, "input" /* name */,
+                                      {1 /*N*/, IFM_H, IFM_W, IFM_C} /* dims */, quantization);
+
+  // Configure Filter
+  const uint32_t kernel_size = KER_N * KER_C * KER_H * KER_W;
+  float kernel_data[kernel_size] = {
+      0.0f,
+  };
+
+  // Fill kernel data in NHWC order
+  {
+    uint32_t off = 0;
+
+    for (uint32_t nth = 0; nth < KER_N; ++nth)
+    {
+      for (uint32_t row = 0; row < KER_H; ++row)
+      {
+        for (uint32_t col = 0; col < KER_W; ++col)
+        {
+          for (uint32_t ch = 0; ch < KER_C; ++ch)
+          {
+            const auto value = kernel.at(nth, ch, row, col);
+            kernel_data[off++] = value;
+          }
+        }
+      }
+    }
+
+    assert(kernel_size == off);
+  }
+
+  interp.SetTensorParametersReadOnly(
+      2, kTfLiteFloat32 /* type */, "filter" /* name */, {KER_N, KER_H, KER_W, KER_C} /* dims */,
+      quantization, reinterpret_cast<const char *>(kernel_data), sizeof(kernel_data));
+
+  // Configure Bias
+  const uint32_t bias_size = bias.size();
+  float bias_data[bias_size] = {
+      0.0f,
+  };
+
+  // Fill bias data
+  for (uint32_t off = 0; off < bias.size(); ++off)
+  {
+    bias_data[off] = bias.at(off);
+  }
+
+  interp.SetTensorParametersReadOnly(3, kTfLiteFloat32 /* type */, "bias" /* name */,
+                                     {bias.size()} /* dims */, quantization,
+                                     reinterpret_cast<const char *>(bias_data), sizeof(bias_data));
+
+  // Add Convolution Node
+  //
+  // NOTE AddNodeWithParameters take the ownership of param, and deallocate it with free
+  //      So, param should be allocated with malloc
+  TfLiteConvParams *param = reinterpret_cast<TfLiteConvParams *>(malloc(sizeof(TfLiteConvParams)));
+
+  param->padding = kTfLitePaddingValid;
+  param->stride_width = 1;
+  param->stride_height = 1;
+  param->activation = kTfLiteActRelu;
+
+  // Run Convolution and store its result into Tensor #0
+  //  - Read IFM from Tensor #1
+  //  - Read Filter from Tensor #2,
+  //  - Read Bias from Tensor #3
+  interp.AddNodeWithParameters({1, 2, 3}, {0}, nullptr, 0, reinterpret_cast<void *>(param),
+                               BuiltinOpResolver().FindOp(BuiltinOperator_CONV_2D, 1));
+
+  // Set Tensor #1 as Input #0, and Tensor #0 as Output #0
+  interp.SetInputs({1});
+  interp.SetOutputs({0});
+
+  // Let's use NNAPI (if possible)
+  interp.UseNNAPI(true);
+
+  // Allocate Tensor
+  interp.AllocateTensors();
+
+  // Fill IFM data in HWC order
+  {
+    uint32_t off = 0;
+
+    for (uint32_t row = 0; row < ifm.shape().H; ++row)
+    {
+      for (uint32_t col = 0; col < ifm.shape().W; ++col)
+      {
+        for (uint32_t ch = 0; ch < ifm.shape().C; ++ch)
+        {
+          const auto value = ifm.at(ch, row, col);
+          interp.typed_input_tensor<float>(0)[off++] = value;
+        }
+      }
+    }
+  }
+
+  // Let's Rock-n-Roll!
+  interp.Invoke();
+
+  // Print OFM
+  {
+    uint32_t off = 0;
+
+    for (uint32_t row = 0; row < OFM_H; ++row)
+    {
+      for (uint32_t col = 0; col < OFM_W; ++col)
+      {
+        for (uint32_t ch = 0; ch < kernel.shape().N; ++ch)
+        {
+          std::cout << interp.typed_output_tensor<float>(0)[off++] << std::endl;
+        }
+      }
+    }
+  }
+
+  return 0;
+}