Diffstat (limited to 'runtimes/nn/common')
36 files changed, 10846 insertions, 0 deletions
diff --git a/runtimes/nn/common/CMakeLists.txt b/runtimes/nn/common/CMakeLists.txt new file mode 100644 index 000000000..31d2d8086 --- /dev/null +++ b/runtimes/nn/common/CMakeLists.txt @@ -0,0 +1,31 @@ +SET (CUR_INCS + ${CMAKE_CURRENT_SOURCE_DIR}/include + ${CMAKE_CURRENT_SOURCE_DIR}/operations + ${CMAKE_CURRENT_SOURCE_DIR} +) +SET (INC_DIRS + ${INC_DIRS} + ${CUR_INCS} + PARENT_SCOPE +) + +SET (CUR_SRCS + ${CMAKE_CURRENT_SOURCE_DIR}/CpuExecutor.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/OperationsUtils.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/Utils.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/NNFWKernels.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/operations/Activation.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/operations/Conv2D.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/operations/Concatenation.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/operations/FullyConnected.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/operations/Pooling.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/operations/SimpleMath.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/operations/Reshape.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/Logging.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/operations/DepthwiseConv2D.cpp +) +SET (SRCS + ${SRCS} + ${CUR_SRCS} + PARENT_SCOPE +) diff --git a/runtimes/nn/common/CpuExecutor.cpp b/runtimes/nn/common/CpuExecutor.cpp new file mode 100644 index 000000000..5a8f6f18b --- /dev/null +++ b/runtimes/nn/common/CpuExecutor.cpp @@ -0,0 +1,1324 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (C) 2017 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#define LOG_TAG "CpuExecutor" + +#include "CpuExecutor.h" + +#include "NeuralNetworks.h" +#include "Operations.h" + +#include "NNFWKernels.h" + +#include <sys/mman.h> + +namespace nnfw { +namespace rt { + +// TODO: short term, make share memory mapping and updating a utility function. +// TODO: long term, implement mmap_fd as a hidl IMemory service. 
+bool RunTimePoolInfo::set(const hidl_memory& hidlMemory) { + this->hidlMemory = hidlMemory; + auto memType = hidlMemory.name(); + if (memType == "ashmem") { +#if 0 // REF-ANN Enable if ashmem type and IMemory are use + memory = mapMemory(hidlMemory); + if (memory == nullptr) { + LOG(ERROR) << "Can't map shared memory."; + return false; + } + memory->update(); + buffer = reinterpret_cast<uint8_t*>(static_cast<void*>(memory->getPointer())); + if (buffer == nullptr) { + LOG(ERROR) << "Can't access shared memory."; + return false; + } + return true; +#endif + LOG(ERROR) << "Currently, Not Support \"ashmem\" type"; + return false; + } else if (memType == "mmap_fd") { + size_t size = hidlMemory.size(); + int fd = hidlMemory.handle()->data[0]; + int prot = hidlMemory.handle()->data[1]; + size_t offset = getSizeFromInts(hidlMemory.handle()->data[2], + hidlMemory.handle()->data[3]); + buffer = static_cast<uint8_t*>(mmap(nullptr, size, prot, MAP_SHARED, fd, offset)); + if (buffer == MAP_FAILED) { + LOG(ERROR) << "Can't mmap the file descriptor."; + return false; + } + return true; + } else { + LOG(ERROR) << "unsupported hidl_memory type"; + return false; + } +} + +// Making sure the output data are correctly updated after execution. +bool RunTimePoolInfo::update() { + auto memType = hidlMemory.name(); + if (memType == "ashmem") { +#if 0 // REF-ANN Enable if ashmem type and IMemory are use + memory->commit(); + return true; +#endif + LOG(ERROR) << "Currently, Not Support \"ashmem\" type"; + return false; + } else if (memType == "mmap_fd") { + int prot = hidlMemory.handle()->data[1]; + if (prot & PROT_WRITE) { + size_t size = hidlMemory.size(); + return msync(buffer, size, MS_SYNC) == 0; + } + } + // No-op for other types of memory. + return true; +} + +bool setRunTimePoolInfosFromHidlMemories(std::vector<RunTimePoolInfo>* poolInfos, + const hidl_vec<hidl_memory>& pools) { + poolInfos->resize(pools.size()); + for (size_t i = 0; i < pools.size(); i++) { + auto& poolInfo = (*poolInfos)[i]; + if (!poolInfo.set(pools[i])) { + LOG(ERROR) << "Could not map pool"; + return false; + } + } + return true; +} + +// Updates the RunTimeOperandInfo with the newly calculated shape. +// Allocate the buffer if we need to. +static bool setInfoAndAllocateIfNeeded(RunTimeOperandInfo* info, const Shape& shape) { + // For user-provided model output operands, the parameters must match the Shape + // calculated from the preparation step. + if (info->lifetime == OperandLifeTime::MODEL_OUTPUT) { + if (info->type != shape.type || + info->dimensions != shape.dimensions) { + LOG(ERROR) << "Invalid type or dimensions for model output"; + return false; + } + if (info->type == OperandType::TENSOR_QUANT8_ASYMM && + (info->scale != shape.scale || info->zeroPoint != shape.offset)) { + LOG(ERROR) << "Invalid scale or zeroPoint for model output"; + return false; + } + } + info->type = shape.type; + info->dimensions = shape.dimensions; + info->scale = shape.scale; + info->zeroPoint = shape.offset; + if (info->lifetime == OperandLifeTime::TEMPORARY_VARIABLE && info->buffer == nullptr) { + uint32_t length = sizeOfData(info->type, info->dimensions); + info->buffer = new uint8_t[length]; + if (info->buffer == nullptr) { + return false; + } + } + return true; +} + +// Ignore the .pools entry in model and request. This will have been taken care of +// by the caller. 
+int CpuExecutor::run(const Model& model, const Request& request, + const std::vector<RunTimePoolInfo>& modelPoolInfos, + const std::vector<RunTimePoolInfo>& requestPoolInfos) { + VLOG(CPUEXE) << "CpuExecutor::run()"; + // VLOG(CPUEXE) << "model: " << toString(model); +#if 0 // REF-ANN + VLOG(CPUEXE) << "request: " << toString(request); +#endif + + // Prepare NNFW_KERNELS + nnfw::rt::init_nnfw_kernels(); + + mModel = &model; + mRequest = &request; // TODO check if mRequest is needed + initializeRunTimeInfo(modelPoolInfos, requestPoolInfos); + // The model has serialized the operation in execution order. + for (const auto& operation : model.operations) { + int n = executeOperation(operation); + if (n != ANEURALNETWORKS_NO_ERROR) { + return n; + } + } + for (auto runtimeInfo : modelPoolInfos) { + runtimeInfo.update(); + } + for (auto runtimeInfo : requestPoolInfos) { + runtimeInfo.update(); + } + mModel = nullptr; + mRequest = nullptr; + VLOG(CPUEXE) << "Completed run normally"; + return ANEURALNETWORKS_NO_ERROR; +} + +bool CpuExecutor::initializeRunTimeInfo(const std::vector<RunTimePoolInfo>& modelPoolInfos, + const std::vector<RunTimePoolInfo>& requestPoolInfos) { + VLOG(CPUEXE) << "CpuExecutor::initializeRunTimeInfo"; + const size_t count = mModel->operands.size(); + mOperands.resize(count); + + // Start by setting the runtime info to what's in the model. + for (size_t i = 0; i < count; i++) { + const Operand& from = mModel->operands[i]; + RunTimeOperandInfo& to = mOperands[i]; + to.type = from.type; + to.dimensions = from.dimensions; + to.scale = from.scale; + to.zeroPoint = from.zeroPoint; + to.length = from.location.length; + to.lifetime = from.lifetime; + switch (from.lifetime) { + case OperandLifeTime::TEMPORARY_VARIABLE: + to.buffer = nullptr; + to.numberOfUsesLeft = from.numberOfConsumers; + break; + case OperandLifeTime::CONSTANT_COPY: + to.buffer = const_cast<uint8_t*>(&mModel->operandValues[from.location.offset]); + to.numberOfUsesLeft = 0; + break; + case OperandLifeTime::CONSTANT_REFERENCE: { + auto poolIndex = from.location.poolIndex; + nnAssert(poolIndex < modelPoolInfos.size()); + auto& r = modelPoolInfos[poolIndex]; + to.buffer = r.buffer + from.location.offset; + to.numberOfUsesLeft = 0; + break; + } + case OperandLifeTime::MODEL_INPUT: + case OperandLifeTime::MODEL_OUTPUT: + case OperandLifeTime::NO_VALUE: + to.buffer = nullptr; + to.numberOfUsesLeft = 0; + break; + default: + nnAssert(false); + break; + } + } + + // Adjust the runtime info for the arguments passed to the model, + // modifying the buffer location, and possibly the dimensions. + auto updateForArguments = [this, &requestPoolInfos](const std::vector<uint32_t>& indexes, + const hidl_vec<RequestArgument>& arguments) { + nnAssert(indexes.size() == arguments.size()); + for (size_t i = 0; i < indexes.size(); i++) { + const uint32_t operandIndex = indexes[i]; + const RequestArgument& from = arguments[i]; + RunTimeOperandInfo& to = mOperands[operandIndex]; + if (from.dimensions.size() > 0) { + // It's the responsibility of the caller to validate that + // from.dimensions only modifies the dimensions that were + // unspecified in the model. That's the case in SampleDriver.cpp + // with the call to validateRequest(). + // TODO make sure that's the case for the default CPU path. 
+ to.dimensions = from.dimensions; + } + if (from.hasNoValue) { + to.lifetime = OperandLifeTime::NO_VALUE; + nnAssert(to.buffer == nullptr); + } else { + auto poolIndex = from.location.poolIndex; + nnAssert(poolIndex < requestPoolInfos.size()); + auto& r = requestPoolInfos[poolIndex]; + to.buffer = r.buffer + from.location.offset; + } + } + }; + updateForArguments(mModel->inputIndexes, mRequest->inputs); + updateForArguments(mModel->outputIndexes, mRequest->outputs); + + return true; +} + +void CpuExecutor::freeNoLongerUsedOperands(const std::vector<uint32_t>& inputs) { + for (uint32_t i : inputs) { + auto& info = mOperands[i]; + // Check if it's a static or model input/output. + if (info.numberOfUsesLeft == 0) { + continue; + } + info.numberOfUsesLeft--; + if (info.numberOfUsesLeft == 0) { + nnAssert(info.buffer != nullptr); + delete[] info.buffer; + info.buffer = nullptr; + } + } +} + +#ifdef NNFW_KERNEL +#error NNFW_KERNEL should not be defined elsewhere. +#else +#define NNFW_KERNEL(_func_name_, _kernel_name_) \ + auto _func_name_ = _kernel_name_; \ + { \ + auto target = std::getenv("NNFW_KERNEL_" #_kernel_name_); \ + if (target != nullptr) \ + { \ + auto it = nnfw_kernels_##_kernel_name_.find(target); \ + if (it != nnfw_kernels_##_kernel_name_.end()) \ + { \ + _func_name_ = it->second; \ + } \ + } \ + } +#endif + +int CpuExecutor::executeOperation(const Operation& operation) { + // VLOG(CPUEXE) << "CpuExecutor::executeOperation(" << toString(operation) << ")"; + const hidl_vec<uint32_t>& ins = operation.inputs; + const hidl_vec<uint32_t>& outs = operation.outputs; + bool success = false; + + // Function to verify that the number of input and output parameters + // matches what is expected. Also checks that all the parameters have + // values. This function is to be used only for operations that do not + // accept optional arguments. + // TODO Have a version that works for optional arguments. + auto allParametersPresent = [&operation, &ins, &outs, this](size_t requiredIns, + size_t requiredOuts) -> bool { + auto verify = [&operation, this](size_t requiredCount, const hidl_vec<uint32_t>& indexes, + const char* type) -> bool { + size_t actualCount = indexes.size(); + if (actualCount != requiredCount) { + LOG(ERROR) << getOperationName(operation.type) + << ": Invalid number of " << type << " operands. 
Got " << actualCount + << " of " << requiredCount; + return false; + } + for (size_t i = 0; i < actualCount; i++) { + if (mOperands[indexes[i]].lifetime == OperandLifeTime::NO_VALUE) { + LOG(ERROR) << getOperationName(operation.type) << " " << type + << " operand " << i << " is required but missing."; + return false; + } + } + return true; + }; + return verify(requiredIns, ins, "in") && verify(requiredOuts, outs, "out"); + }; + + switch (operation.type) { +#if 0 // REF-ANN + case OperationType::OEM_OPERATION: { + LOG(ERROR) << "OEM operation not supported for CPU execution"; + success = false; + } break; +#endif + case OperationType::ADD: { + if (!allParametersPresent(3, 1)) { + return ANEURALNETWORKS_BAD_DATA; + } + const RunTimeOperandInfo& in1 = mOperands[ins[0]]; + const RunTimeOperandInfo& in2 = mOperands[ins[1]]; + int32_t activation = getScalarData<int32_t>(mOperands[ins[2]]); + + RunTimeOperandInfo& out = mOperands[outs[0]]; + Shape outShape = out.shape(); + + if (in1.type == OperandType::TENSOR_FLOAT32) { + success = addMulPrepare(in1.shape(), in2.shape(), &outShape) && + setInfoAndAllocateIfNeeded(&out, outShape) && + addFloat32(reinterpret_cast<const float*>(in1.buffer), + in1.shape(), + reinterpret_cast<const float*>(in2.buffer), + in2.shape(), + activation, + reinterpret_cast<float*>(out.buffer), + outShape); + } else if (in1.type == OperandType::TENSOR_QUANT8_ASYMM) { + success = addMulPrepare(in1.shape(), in2.shape(), &outShape) && + setInfoAndAllocateIfNeeded(&out, outShape) && + addQuant8(reinterpret_cast<const uint8_t*>(in1.buffer), + in1.shape(), + reinterpret_cast<const uint8_t*>(in2.buffer), + in2.shape(), + activation, + reinterpret_cast<uint8_t*>(out.buffer), + outShape); + } + } break; + case OperationType::MUL: { + if (!allParametersPresent(3, 1)) { + return ANEURALNETWORKS_BAD_DATA; + } + const RunTimeOperandInfo& in1 = mOperands[ins[0]]; + const RunTimeOperandInfo& in2 = mOperands[ins[1]]; + int32_t activation = getScalarData<int32_t>(mOperands[ins[2]]); + + RunTimeOperandInfo& out = mOperands[outs[0]]; + Shape outShape = out.shape(); + + if (in1.type == OperandType::TENSOR_FLOAT32) { + success = addMulPrepare(in1.shape(), in2.shape(), &outShape) && + setInfoAndAllocateIfNeeded(&out, outShape) && + mulFloat32(reinterpret_cast<const float*>(in1.buffer), + in1.shape(), + reinterpret_cast<const float*>(in2.buffer), + in2.shape(), + activation, + reinterpret_cast<float*>(out.buffer), + outShape); + } else if (in1.type == OperandType::TENSOR_QUANT8_ASYMM) { + success = addMulPrepare(in1.shape(), in2.shape(), &outShape) && + setInfoAndAllocateIfNeeded(&out, outShape) && + mulQuant8(reinterpret_cast<const uint8_t*>(in1.buffer), + in1.shape(), + reinterpret_cast<const uint8_t*>(in2.buffer), + in2.shape(), + activation, + reinterpret_cast<uint8_t*>(out.buffer), + outShape); + } + } break; +#if 0 // REF-ANN + case OperationType::FLOOR: { + if (!allParametersPresent(1, 1)) { + return ANEURALNETWORKS_BAD_DATA; + } + const RunTimeOperandInfo& input = mOperands[ins[0]]; + RunTimeOperandInfo& output = mOperands[outs[0]]; + Shape outShape = output.shape(); + + if (input.type == OperandType::TENSOR_FLOAT32) { + success = floorPrepare(input.shape(), &outShape) && + setInfoAndAllocateIfNeeded(&output, outShape) && + floorFloat32(reinterpret_cast<const float*>(input.buffer), + reinterpret_cast<float*>(output.buffer), + outShape); + } + } break; + case OperationType::DEQUANTIZE: { + if (!allParametersPresent(1, 1)) { + return ANEURALNETWORKS_BAD_DATA; + } + const RunTimeOperandInfo& 
input = mOperands[ins[0]]; + RunTimeOperandInfo& output = mOperands[outs[0]]; + Shape outShape = output.shape(); + + if (input.type == OperandType::TENSOR_QUANT8_ASYMM) { + success = dequantizePrepare(input.shape(), &outShape) && + setInfoAndAllocateIfNeeded(&output, outShape) && + dequantizeQuant8ToFloat32( + reinterpret_cast<const uint8_t*>(input.buffer), + reinterpret_cast<float*>(output.buffer), + input.shape()); + } + } break; +#endif + case OperationType::DEPTHWISE_CONV_2D: { + const size_t inCount = ins.size(); + if ((inCount != 11 && inCount != 8) || + !allParametersPresent(inCount, 1)) { + return ANEURALNETWORKS_BAD_DATA; + } + const RunTimeOperandInfo& input = mOperands[ins[0]]; + const RunTimeOperandInfo& filter = mOperands[ins[1]]; + const RunTimeOperandInfo& bias = mOperands[ins[2]]; + + int32_t padding_left, padding_right; + int32_t padding_top, padding_bottom; + int32_t stride_width, stride_height; + int32_t depth_multiplier; + int32_t activation; + + if (inCount == 11) { + padding_left = getScalarData<int32_t>(mOperands[ins[3]]); + padding_right = getScalarData<int32_t>(mOperands[ins[4]]); + padding_top = getScalarData<int32_t>(mOperands[ins[5]]); + padding_bottom = getScalarData<int32_t>(mOperands[ins[6]]); + stride_width = getScalarData<int32_t>(mOperands[ins[7]]); + stride_height = getScalarData<int32_t>(mOperands[ins[8]]); + depth_multiplier = getScalarData<int32_t>(mOperands[ins[9]]); + activation = getScalarData<int32_t>(mOperands[ins[10]]); + } else { + int32_t padding_implicit = getScalarData<int32_t>(mOperands[ins[3]]); + stride_width = getScalarData<int32_t>(mOperands[ins[4]]); + stride_height = getScalarData<int32_t>(mOperands[ins[5]]); + depth_multiplier = getScalarData<int32_t>(mOperands[ins[6]]); + activation = getScalarData<int32_t>(mOperands[ins[7]]); + + Shape inputShape = input.shape(); + Shape filterShape = filter.shape(); + int32_t input_width = getSizeOfDimension(inputShape, 2); + int32_t input_height = getSizeOfDimension(inputShape, 1); + int32_t filter_width = getSizeOfDimension(filterShape, 2); + int32_t filter_height = getSizeOfDimension(filterShape, 1); + calculateExplicitPadding(input_width, stride_width, + filter_width, padding_implicit, + &padding_left, &padding_right); + calculateExplicitPadding(input_height, stride_height, + filter_height, padding_implicit, + &padding_top, &padding_bottom); + } + + RunTimeOperandInfo& output = mOperands[outs[0]]; + Shape outShape = output.shape(); + + if (input.type == OperandType::TENSOR_FLOAT32) { + NNFW_KERNEL(func, depthwiseConvFloat32); + success = depthwiseConvPrepare(input.shape(), filter.shape(), bias.shape(), + padding_left, padding_right, + padding_top, padding_bottom, + stride_width, stride_height, + &outShape) && + setInfoAndAllocateIfNeeded(&output, outShape) && + func(reinterpret_cast<const float*>(input.buffer), + input.shape(), + reinterpret_cast<const float*>(filter.buffer), + filter.shape(), + reinterpret_cast<const float*>(bias.buffer), + bias.shape(), + padding_left, padding_right, + padding_top, padding_bottom, + stride_width, stride_height, + depth_multiplier, activation, + reinterpret_cast<float*>(output.buffer), + outShape); + } else if (input.type == OperandType::TENSOR_QUANT8_ASYMM) { +#if 0 // REF-ANN We don't support depthwiseConvQuant8 yet + success = depthwiseConvPrepare(input.shape(), filter.shape(), bias.shape(), + padding_left, padding_right, + padding_top, padding_bottom, + stride_width, stride_height, + &outShape) && + setInfoAndAllocateIfNeeded(&output, outShape) && + 
depthwiseConvQuant8(reinterpret_cast<const uint8_t*>(input.buffer), + input.shape(), + reinterpret_cast<const uint8_t*>(filter.buffer), + filter.shape(), + reinterpret_cast<const int32_t*>(bias.buffer), + bias.shape(), + padding_left, padding_right, + padding_top, padding_bottom, + stride_width, stride_height, + depth_multiplier, activation, + reinterpret_cast<uint8_t*>(output.buffer), + outShape); +#else // REF-ANN + LOG(ERROR) << getOperationName(operation.type) << " failed."; + NYI("We dont' support TENSOR_QUANT8_ASYMM yet."); +#endif // REF-ANN + } + + } break; + case OperationType::CONV_2D: { + const size_t inCount = ins.size(); + if ((inCount != 10 && inCount != 7) || + !allParametersPresent(inCount, 1)) { + return ANEURALNETWORKS_BAD_DATA; + } + const RunTimeOperandInfo& input = mOperands[ins[0]]; + const RunTimeOperandInfo& filter = mOperands[ins[1]]; + const RunTimeOperandInfo& bias = mOperands[ins[2]]; + + int32_t padding_left, padding_right; + int32_t padding_top, padding_bottom; + int32_t stride_width, stride_height; + int32_t activation; + + if (inCount == 10) { + padding_left = getScalarData<int32_t>(mOperands[ins[3]]); + padding_right = getScalarData<int32_t>(mOperands[ins[4]]); + padding_top = getScalarData<int32_t>(mOperands[ins[5]]); + padding_bottom = getScalarData<int32_t>(mOperands[ins[6]]); + stride_width = getScalarData<int32_t>(mOperands[ins[7]]); + stride_height = getScalarData<int32_t>(mOperands[ins[8]]); + activation = getScalarData<int32_t>(mOperands[ins[9]]); + } else { + int32_t padding_implicit = getScalarData<int32_t>(mOperands[ins[3]]); + stride_width = getScalarData<int32_t>(mOperands[ins[4]]); + stride_height = getScalarData<int32_t>(mOperands[ins[5]]); + activation = getScalarData<int32_t>(mOperands[ins[6]]); + + Shape inputShape = input.shape(); + Shape filterShape = filter.shape(); + int32_t input_width = getSizeOfDimension(inputShape, 2); + int32_t input_height = getSizeOfDimension(inputShape, 1); + int32_t filter_width = getSizeOfDimension(filterShape, 2); + int32_t filter_height = getSizeOfDimension(filterShape, 1); + calculateExplicitPadding(input_width, stride_width, + filter_width, padding_implicit, + &padding_left, &padding_right); + calculateExplicitPadding(input_height, stride_height, + filter_height, padding_implicit, + &padding_top, &padding_bottom); + } + + RunTimeOperandInfo& output = mOperands[outs[0]]; + Shape outShape = output.shape(); + + if (input.type == OperandType::TENSOR_FLOAT32) { + NNFW_KERNEL(func, convFloat32); + success = convPrepare(input.shape(), filter.shape(), bias.shape(), + padding_left, padding_right, + padding_top, padding_bottom, + stride_width, stride_height, + &outShape) && + setInfoAndAllocateIfNeeded(&output, outShape) && + func(reinterpret_cast<const float*>(input.buffer), input.shape(), + reinterpret_cast<const float*>(filter.buffer), filter.shape(), + reinterpret_cast<const float*>(bias.buffer), bias.shape(), + padding_left, padding_right, + padding_top, padding_bottom, + stride_width, stride_height, activation, + reinterpret_cast<float*>(output.buffer), outShape); + } else if (input.type == OperandType::TENSOR_QUANT8_ASYMM) { + success = convPrepare(input.shape(), filter.shape(), bias.shape(), + padding_left, padding_right, + padding_top, padding_bottom, + stride_width, stride_height, + &outShape) && + setInfoAndAllocateIfNeeded(&output, outShape) && + convQuant8(reinterpret_cast<const uint8_t*>(input.buffer), + input.shape(), + reinterpret_cast<const uint8_t*>(filter.buffer), + filter.shape(), + 
reinterpret_cast<const int32_t*>(bias.buffer), + bias.shape(), + padding_left, padding_right, + padding_top, padding_bottom, + stride_width, stride_height, activation, + reinterpret_cast<uint8_t*>(output.buffer), + outShape); + } + } break; + case OperationType::AVERAGE_POOL_2D: { + const size_t inCount = ins.size(); + if ((inCount != 10 && inCount != 7) || + !allParametersPresent(inCount, 1)) { + return ANEURALNETWORKS_BAD_DATA; + } + const RunTimeOperandInfo& input = mOperands[ins[0]]; + + int32_t padding_left, padding_right; + int32_t padding_top, padding_bottom; + int32_t stride_width, stride_height; + int32_t filter_width, filter_height; + int32_t activation; + + if (inCount == 10) { + padding_left = getScalarData<int32_t>(mOperands[ins[1]]); + padding_right = getScalarData<int32_t>(mOperands[ins[2]]); + padding_top = getScalarData<int32_t>(mOperands[ins[3]]); + padding_bottom = getScalarData<int32_t>(mOperands[ins[4]]); + stride_width = getScalarData<int32_t>(mOperands[ins[5]]); + stride_height = getScalarData<int32_t>(mOperands[ins[6]]); + filter_width = getScalarData<int32_t>(mOperands[ins[7]]); + filter_height = getScalarData<int32_t>(mOperands[ins[8]]); + activation = getScalarData<int32_t>(mOperands[ins[9]]); + } else { + int32_t padding_implicit = getScalarData<int32_t>(mOperands[ins[1]]); + stride_width = getScalarData<int32_t>(mOperands[ins[2]]); + stride_height = getScalarData<int32_t>(mOperands[ins[3]]); + filter_width = getScalarData<int32_t>(mOperands[ins[4]]); + filter_height = getScalarData<int32_t>(mOperands[ins[5]]); + activation = getScalarData<int32_t>(mOperands[ins[6]]); + + Shape inputShape = input.shape(); + int32_t input_width = getSizeOfDimension(inputShape, 2); + int32_t input_height = getSizeOfDimension(inputShape, 1); + calculateExplicitPadding(input_width, stride_width, + filter_width, padding_implicit, + &padding_left, &padding_right); + calculateExplicitPadding(input_height, stride_height, + filter_height, padding_implicit, + &padding_top, &padding_bottom); + } + + RunTimeOperandInfo& output = mOperands[outs[0]]; + Shape outShape = output.shape(); + + if (input.type == OperandType::TENSOR_FLOAT32) { + NNFW_KERNEL(func, averagePoolFloat32); + success = genericPoolingPrepare(input.shape(), + padding_left, padding_right, + padding_top, padding_bottom, + stride_width, stride_height, + filter_width, filter_height, + &outShape) && + setInfoAndAllocateIfNeeded(&output, outShape) && + func(reinterpret_cast<const float*>(input.buffer), + input.shape(), + padding_left, padding_right, + padding_top, padding_bottom, + stride_width, stride_height, + filter_width, filter_height, activation, + reinterpret_cast<float*>(output.buffer), + outShape); + } else if (input.type == OperandType::TENSOR_QUANT8_ASYMM) { + success = genericPoolingPrepare(input.shape(), + padding_left, padding_right, + padding_top, padding_bottom, + stride_width, stride_height, + filter_width, filter_height, + &outShape) && + setInfoAndAllocateIfNeeded(&output, outShape) && + averagePoolQuant8(reinterpret_cast<const uint8_t*>(input.buffer), + input.shape(), + padding_left, padding_right, + padding_top, padding_bottom, + stride_width, stride_height, + filter_width, filter_height, activation, + reinterpret_cast<uint8_t*>(output.buffer), + outShape); + } + } break; +#if 0 // REF-ANN + case OperationType::L2_POOL_2D: { + const size_t inCount = ins.size(); + if ((inCount != 10 && inCount != 7) || + !allParametersPresent(inCount, 1)) { + return ANEURALNETWORKS_BAD_DATA; + } + const RunTimeOperandInfo& 
input = mOperands[ins[0]]; + + int32_t padding_left, padding_right; + int32_t padding_top, padding_bottom; + int32_t stride_width, stride_height; + int32_t filter_width, filter_height; + int32_t activation; + + if (inCount == 10) { + padding_left = getScalarData<int32_t>(mOperands[ins[1]]); + padding_right = getScalarData<int32_t>(mOperands[ins[2]]); + padding_top = getScalarData<int32_t>(mOperands[ins[3]]); + padding_bottom = getScalarData<int32_t>(mOperands[ins[4]]); + stride_width = getScalarData<int32_t>(mOperands[ins[5]]); + stride_height = getScalarData<int32_t>(mOperands[ins[6]]); + filter_width = getScalarData<int32_t>(mOperands[ins[7]]); + filter_height = getScalarData<int32_t>(mOperands[ins[8]]); + activation = getScalarData<int32_t>(mOperands[ins[9]]); + } else { + int32_t padding_implicit = getScalarData<int32_t>(mOperands[ins[1]]); + stride_width = getScalarData<int32_t>(mOperands[ins[2]]); + stride_height = getScalarData<int32_t>(mOperands[ins[3]]); + filter_width = getScalarData<int32_t>(mOperands[ins[4]]); + filter_height = getScalarData<int32_t>(mOperands[ins[5]]); + activation = getScalarData<int32_t>(mOperands[ins[6]]); + + Shape inputShape = input.shape(); + int32_t input_width = getSizeOfDimension(inputShape, 2); + int32_t input_height = getSizeOfDimension(inputShape, 1); + calculateExplicitPadding(input_width, stride_width, + filter_width, padding_implicit, + &padding_left, &padding_right); + calculateExplicitPadding(input_height, stride_height, + filter_height, padding_implicit, + &padding_top, &padding_bottom); + } + + RunTimeOperandInfo& output = mOperands[outs[0]]; + Shape outShape = output.shape(); + + if (input.type == OperandType::TENSOR_FLOAT32) { + success = genericPoolingPrepare(input.shape(), + padding_left, padding_right, + padding_top, padding_bottom, + stride_width, stride_height, + filter_width, filter_height, + &outShape) && + setInfoAndAllocateIfNeeded(&output, outShape) && + l2PoolFloat32(reinterpret_cast<const float*>(input.buffer), + input.shape(), + padding_left, padding_right, + padding_top, padding_bottom, + stride_width, stride_height, + filter_width, filter_height, activation, + reinterpret_cast<float*>(output.buffer), + outShape); + } + } break; +#endif // REF-ANN + case OperationType::MAX_POOL_2D: { + const size_t inCount = ins.size(); + if ((inCount != 10 && inCount != 7) || + !allParametersPresent(inCount, 1)) { + return ANEURALNETWORKS_BAD_DATA; + } + const RunTimeOperandInfo& input = mOperands[ins[0]]; + + int32_t padding_left, padding_right; + int32_t padding_top, padding_bottom; + int32_t stride_width, stride_height; + int32_t filter_width, filter_height; + int32_t activation; + + if (inCount == 10) { + padding_left = getScalarData<int32_t>(mOperands[ins[1]]); + padding_right = getScalarData<int32_t>(mOperands[ins[2]]); + padding_top = getScalarData<int32_t>(mOperands[ins[3]]); + padding_bottom = getScalarData<int32_t>(mOperands[ins[4]]); + stride_width = getScalarData<int32_t>(mOperands[ins[5]]); + stride_height = getScalarData<int32_t>(mOperands[ins[6]]); + filter_width = getScalarData<int32_t>(mOperands[ins[7]]); + filter_height = getScalarData<int32_t>(mOperands[ins[8]]); + activation = getScalarData<int32_t>(mOperands[ins[9]]); + } else { + int32_t padding_implicit = getScalarData<int32_t>(mOperands[ins[1]]); + stride_width = getScalarData<int32_t>(mOperands[ins[2]]); + stride_height = getScalarData<int32_t>(mOperands[ins[3]]); + filter_width = getScalarData<int32_t>(mOperands[ins[4]]); + filter_height = 
getScalarData<int32_t>(mOperands[ins[5]]); + activation = getScalarData<int32_t>(mOperands[ins[6]]); + + Shape inputShape = input.shape(); + int32_t input_width = getSizeOfDimension(inputShape, 2); + int32_t input_height = getSizeOfDimension(inputShape, 1); + calculateExplicitPadding(input_width, stride_width, + filter_width, padding_implicit, + &padding_left, &padding_right); + calculateExplicitPadding(input_height, stride_height, + filter_height, padding_implicit, + &padding_top, &padding_bottom); + } + + RunTimeOperandInfo& output = mOperands[outs[0]]; + Shape outShape = output.shape(); + + if (input.type == OperandType::TENSOR_FLOAT32) { + NNFW_KERNEL(func, maxPoolFloat32); + success = genericPoolingPrepare(input.shape(), + padding_left, padding_right, + padding_top, padding_bottom, + stride_width, stride_height, + filter_width, filter_height, + &outShape) && + setInfoAndAllocateIfNeeded(&output, outShape) && + func(reinterpret_cast<const float*>(input.buffer), + input.shape(), + padding_left, padding_right, + padding_top, padding_bottom, + stride_width, stride_height, + filter_width, filter_height, activation, + reinterpret_cast<float*>(output.buffer), + outShape); + } else if (input.type == OperandType::TENSOR_QUANT8_ASYMM) { + success = genericPoolingPrepare(input.shape(), + padding_left, padding_right, + padding_top, padding_bottom, + stride_width, stride_height, + filter_width, filter_height, + &outShape) && + setInfoAndAllocateIfNeeded(&output, outShape) && + maxPoolQuant8(reinterpret_cast<const uint8_t*>(input.buffer), + input.shape(), + padding_left, padding_right, + padding_top, padding_bottom, + stride_width, stride_height, + filter_width, filter_height, activation, + reinterpret_cast<uint8_t*>(output.buffer), + outShape); + } + + } break; + case OperationType::RELU: { + if (!allParametersPresent(1, 1)) { + return ANEURALNETWORKS_BAD_DATA; + } + const RunTimeOperandInfo& input = mOperands[ins[0]]; + RunTimeOperandInfo& output = mOperands[outs[0]]; + Shape outShape = output.shape(); + + if (input.type == OperandType::TENSOR_FLOAT32) { + success = genericActivationPrepare(input.shape(), &outShape) && + setInfoAndAllocateIfNeeded(&output, outShape) && + reluFloat32(reinterpret_cast<const float*>(input.buffer), + input.shape(), + reinterpret_cast<float*>(output.buffer), + outShape); + } else if (input.type == OperandType::TENSOR_QUANT8_ASYMM) { + success = genericActivationPrepare(input.shape(), &outShape) && + setInfoAndAllocateIfNeeded(&output, outShape) && + reluQuant8(reinterpret_cast<const uint8_t*>(input.buffer), + input.shape(), + reinterpret_cast<uint8_t*>(output.buffer), + outShape); + } + } break; +#if 0 // REF-ANN + case OperationType::RELU1: { + if (!allParametersPresent(1, 1)) { + return ANEURALNETWORKS_BAD_DATA; + } + const RunTimeOperandInfo& input = mOperands[ins[0]]; + RunTimeOperandInfo& output = mOperands[outs[0]]; + Shape outShape = output.shape(); + + if (input.type == OperandType::TENSOR_FLOAT32) { + success = genericActivationPrepare(input.shape(), &outShape) && + setInfoAndAllocateIfNeeded(&output, outShape) && + relu1Float32(reinterpret_cast<const float*>(input.buffer), + input.shape(), + reinterpret_cast<float*>(output.buffer), + outShape); + } else if (input.type == OperandType::TENSOR_QUANT8_ASYMM) { + success = genericActivationPrepare(input.shape(), &outShape) && + setInfoAndAllocateIfNeeded(&output, outShape) && + relu1Quant8(reinterpret_cast<const uint8_t*>(input.buffer), + input.shape(), + reinterpret_cast<uint8_t*>(output.buffer), + outShape); + 
} + } break; +#endif // REF-ANN + case OperationType::RELU6: { + if (!allParametersPresent(1, 1)) { + return ANEURALNETWORKS_BAD_DATA; + } + const RunTimeOperandInfo& input = mOperands[ins[0]]; + RunTimeOperandInfo& output = mOperands[outs[0]]; + Shape outShape = output.shape(); + + if (input.type == OperandType::TENSOR_FLOAT32) { + success = genericActivationPrepare(input.shape(), &outShape) && + setInfoAndAllocateIfNeeded(&output, outShape) && + relu6Float32(reinterpret_cast<const float*>(input.buffer), + input.shape(), + reinterpret_cast<float*>(output.buffer), + outShape); + } else if (input.type == OperandType::TENSOR_QUANT8_ASYMM) { + success = genericActivationPrepare(input.shape(), &outShape) && + setInfoAndAllocateIfNeeded(&output, outShape) && + relu6Quant8(reinterpret_cast<const uint8_t*>(input.buffer), + input.shape(), + reinterpret_cast<uint8_t*>(output.buffer), + outShape); + } + } break; +#if 0 // REF-ANN + case OperationType::TANH: { + if (!allParametersPresent(1, 1)) { + return ANEURALNETWORKS_BAD_DATA; + } + const RunTimeOperandInfo& input = mOperands[ins[0]]; + RunTimeOperandInfo& output = mOperands[outs[0]]; + Shape outShape = output.shape(); + + if (input.type == OperandType::TENSOR_FLOAT32) { + success = genericActivationPrepare(input.shape(), &outShape) && + setInfoAndAllocateIfNeeded(&output, outShape) && + tanhFloat32(reinterpret_cast<const float*>(input.buffer), + input.shape(), + reinterpret_cast<float*>(output.buffer), + outShape); + } + } break; + case OperationType::LOGISTIC: { + if (!allParametersPresent(1, 1)) { + return ANEURALNETWORKS_BAD_DATA; + } + const RunTimeOperandInfo& input = mOperands[ins[0]]; + RunTimeOperandInfo& output = mOperands[outs[0]]; + Shape outShape = output.shape(); + + if (input.type == OperandType::TENSOR_FLOAT32) { + success = genericActivationPrepare(input.shape(), &outShape) && + setInfoAndAllocateIfNeeded(&output, outShape) && + logisticFloat32(reinterpret_cast<const float*>(input.buffer), + input.shape(), + reinterpret_cast<float*>(output.buffer), + outShape); + } else if (input.type == OperandType::TENSOR_QUANT8_ASYMM) { + success = genericActivationPrepare(input.shape(), &outShape) && + setInfoAndAllocateIfNeeded(&output, outShape) && + logisticQuant8(reinterpret_cast<const uint8_t*>(input.buffer), + input.shape(), + reinterpret_cast<uint8_t*>(output.buffer), + outShape); + } + } break; +#endif // REF-ANN + case OperationType::SOFTMAX: { + if (!allParametersPresent(2, 1)) { + return ANEURALNETWORKS_BAD_DATA; + } + RunTimeOperandInfo& input = mOperands[ins[0]]; + float beta = getScalarData<float>(mOperands[ins[1]]); + if (beta <= 0.0f) { + LOG(ERROR) << "beta must be positive for softmax"; + return ANEURALNETWORKS_BAD_DATA; + } + + RunTimeOperandInfo& output = mOperands[outs[0]]; + Shape outShape = output.shape(); + + if (input.type == OperandType::TENSOR_FLOAT32) { + NNFW_KERNEL(func, softmaxFloat32); + success = genericActivationPrepare(input.shape(), &outShape) && + setInfoAndAllocateIfNeeded(&output, outShape) && + func(reinterpret_cast<const float*>(input.buffer), + input.shape(), + beta, + reinterpret_cast<float*>(output.buffer), + output.shape()); + } else if (input.type == OperandType::TENSOR_QUANT8_ASYMM) { + success = genericActivationPrepare(input.shape(), &outShape) && + setInfoAndAllocateIfNeeded(&output, outShape) && + softmaxQuant8(reinterpret_cast<const uint8_t*>(input.buffer), + input.shape(), + beta, + reinterpret_cast<uint8_t*>(output.buffer), + output.shape()); + } + } break; + case 
OperationType::FULLY_CONNECTED: { + if (!allParametersPresent(4, 1)) { + return ANEURALNETWORKS_BAD_DATA; + } + RunTimeOperandInfo& input = mOperands[ins[0]]; + RunTimeOperandInfo& weights = mOperands[ins[1]]; + RunTimeOperandInfo& bias = mOperands[ins[2]]; + + int32_t activation = getScalarData<int32_t>(mOperands[ins[3]]); + + RunTimeOperandInfo& output = mOperands[outs[0]]; + Shape outShape = output.shape(); + + if (input.type == OperandType::TENSOR_FLOAT32) { + NNFW_KERNEL(func, fullyConnectedFloat32); + success = fullyConnectedPrepare(input.shape(), weights.shape(), bias.shape(), + &outShape) && + setInfoAndAllocateIfNeeded(&output, outShape) && + func(reinterpret_cast<const float*>(input.buffer), + input.shape(), + reinterpret_cast<const float*>(weights.buffer), + weights.shape(), + reinterpret_cast<const float*>(bias.buffer), + bias.shape(), + activation, + reinterpret_cast<float*>(output.buffer), + outShape); + } else if (input.type == OperandType::TENSOR_QUANT8_ASYMM) { + success = fullyConnectedPrepare(input.shape(), weights.shape(), bias.shape(), + &outShape) && + setInfoAndAllocateIfNeeded(&output, outShape) && + fullyConnectedQuant8(reinterpret_cast<const uint8_t*>(input.buffer), + input.shape(), + reinterpret_cast<const uint8_t*>(weights.buffer), + weights.shape(), + reinterpret_cast<const int32_t*>(bias.buffer), + bias.shape(), + activation, + reinterpret_cast<uint8_t*>(output.buffer), + outShape); + } + } break; + case OperationType::CONCATENATION: { + if (outs.size() != 1 || ins.size() < 2) { + return ANEURALNETWORKS_BAD_DATA; + } + int numInputTensors = ins.size() - 1; + int32_t axis = getScalarData<int32_t>(mOperands[ins[numInputTensors]]); + + RunTimeOperandInfo& output = mOperands[outs[0]]; + Shape outShape = output.shape(); + + const RunTimeOperandInfo& firstInput = mOperands[ins[0]]; + if (firstInput.type == OperandType::TENSOR_FLOAT32) { + std::vector<Shape> inputShapes(numInputTensors); + std::vector<const float*> inputDataPtrs(numInputTensors); + + for (int i=0; i<numInputTensors; i++) { + RunTimeOperandInfo& input = mOperands[ins[i]]; + inputShapes[i] = input.shape(); + inputDataPtrs[i] = reinterpret_cast<const float*>(input.buffer); + } + NNFW_KERNEL(func, concatenationFloat32); + success = concatenationPrepare(inputShapes, axis, &outShape) && + setInfoAndAllocateIfNeeded(&output, outShape) && + func(inputDataPtrs, inputShapes, axis, + reinterpret_cast<float*>(output.buffer), outShape); + } else if (firstInput.type == OperandType::TENSOR_QUANT8_ASYMM) { + std::vector<Shape> inputShapes(numInputTensors); + std::vector<const uint8_t*> inputDataPtrs(numInputTensors); + + for (int i=0; i<numInputTensors; i++) { + RunTimeOperandInfo& input = mOperands[ins[i]]; + inputShapes[i] = input.shape(); + inputDataPtrs[i] = reinterpret_cast<const uint8_t*>(input.buffer); + } + success = concatenationPrepare(inputShapes, axis, &outShape) && + setInfoAndAllocateIfNeeded(&output, outShape) && + concatenationQuant8(inputDataPtrs, inputShapes, axis, + reinterpret_cast<uint8_t*>(output.buffer), + outShape); + } + } break; +#if 0 // REF-ANN + case OperationType::L2_NORMALIZATION: { + if (!allParametersPresent(1, 1)) { + return ANEURALNETWORKS_BAD_DATA; + } + const RunTimeOperandInfo& input = mOperands[ins[0]]; + RunTimeOperandInfo& output = mOperands[outs[0]]; + Shape outShape = output.shape(); + + if (input.type == OperandType::TENSOR_FLOAT32) { + success = genericNormalizationPrepare(input.shape(), &outShape) && + setInfoAndAllocateIfNeeded(&output, outShape) && + 
l2normFloat32(reinterpret_cast<const float*>(input.buffer), + input.shape(), + reinterpret_cast<float*>(output.buffer), + outShape); + } else if (input.type == OperandType::TENSOR_QUANT8_ASYMM) { + success = genericNormalizationPrepare(input.shape(), &outShape) && + setInfoAndAllocateIfNeeded(&output, outShape) && + l2normQuant8(reinterpret_cast<const uint8_t*>(input.buffer), + input.shape(), + reinterpret_cast<uint8_t*>(output.buffer), + outShape); + } + } break; + case OperationType::LOCAL_RESPONSE_NORMALIZATION: { + if (!allParametersPresent(5, 1)) { + return ANEURALNETWORKS_BAD_DATA; + } + const RunTimeOperandInfo& input = mOperands[ins[0]]; + int32_t radius = getScalarData<int32_t>(mOperands[ins[1]]); + float bias = getScalarData<float>(mOperands[ins[2]]); + float alpha = getScalarData<float>(mOperands[ins[3]]); + float beta = getScalarData<float>(mOperands[ins[4]]); + + RunTimeOperandInfo& output = mOperands[outs[0]]; + Shape outShape = output.shape(); + + if (input.type == OperandType::TENSOR_FLOAT32) { + success = genericNormalizationPrepare(input.shape(), &outShape) && + setInfoAndAllocateIfNeeded(&output, outShape) && + localResponseNormFloat32(reinterpret_cast<const float*>(input.buffer), + input.shape(), + radius, bias, alpha, beta, + reinterpret_cast<float*>(output.buffer), + outShape); + } + } break; +#endif //REF_ANN + case OperationType::RESHAPE: { + if (!allParametersPresent(2, 1)) { + return ANEURALNETWORKS_BAD_DATA; + } + const RunTimeOperandInfo& input = mOperands[ins[0]]; + const RunTimeOperandInfo& targetShape = mOperands[ins[1]]; + + RunTimeOperandInfo& output = mOperands[outs[0]]; + Shape outShape = output.shape(); + + success = reshapePrepare(input.shape(), + reinterpret_cast<const int32_t*>(targetShape.buffer), + getNumberOfElements(targetShape.shape()), + &outShape) && + setInfoAndAllocateIfNeeded(&output, outShape) && + reshapeGeneric(reinterpret_cast<const void*>(input.buffer), + input.shape(), + reinterpret_cast<void*>(output.buffer), + outShape); + } break; +#if 0 //REF-ANN + case OperationType::RESIZE_BILINEAR: { + if (!allParametersPresent(3, 1)) { + return ANEURALNETWORKS_BAD_DATA; + } + const RunTimeOperandInfo& input = mOperands[ins[0]]; + int32_t width = getScalarData<int32_t>(mOperands[ins[1]]); + int32_t height = getScalarData<int32_t>(mOperands[ins[2]]); + + RunTimeOperandInfo& output = mOperands[outs[0]]; + Shape outShape = output.shape(); + + if (input.type == OperandType::TENSOR_FLOAT32) { + success = resizeBilinearPrepare(input.shape(), + width, height, + &outShape) && + setInfoAndAllocateIfNeeded(&output, outShape) && + resizeBilinearFloat32(reinterpret_cast<const float*>(input.buffer), + input.shape(), + reinterpret_cast<float*>(output.buffer), + outShape); + } + } break; + case OperationType::DEPTH_TO_SPACE: { + if (!allParametersPresent(2, 1)) { + return ANEURALNETWORKS_BAD_DATA; + } + const RunTimeOperandInfo& input = mOperands[ins[0]]; + int32_t blockSize = getScalarData<int32_t>(mOperands[ins[1]]); + + RunTimeOperandInfo& output = mOperands[outs[0]]; + Shape outShape = output.shape(); + + success = depthToSpacePrepare(input.shape(), + blockSize, + &outShape) && + setInfoAndAllocateIfNeeded(&output, outShape) && + depthToSpaceGeneric(input.buffer, + input.shape(), + blockSize, + output.buffer, + outShape); + } break; + case OperationType::SPACE_TO_DEPTH: { + if (!allParametersPresent(2, 1)) { + return ANEURALNETWORKS_BAD_DATA; + } + const RunTimeOperandInfo& input = mOperands[ins[0]]; + int32_t blockSize = 
getScalarData<int32_t>(mOperands[ins[1]]); + + RunTimeOperandInfo& output = mOperands[outs[0]]; + Shape outShape = output.shape(); + + success = spaceToDepthPrepare(input.shape(), + blockSize, + &outShape) && + setInfoAndAllocateIfNeeded(&output, outShape) && + spaceToDepthGeneric(input.buffer, + input.shape(), + blockSize, + output.buffer, + outShape); + } break; + case OperationType::EMBEDDING_LOOKUP: { + const RunTimeOperandInfo &values = + mOperands[ins[EmbeddingLookup::kValueTensor]]; + const RunTimeOperandInfo &lookups = + mOperands[ins[EmbeddingLookup::kLookupTensor]]; + RunTimeOperandInfo &output = + mOperands[outs[EmbeddingLookup::kOutputTensor]]; + + Shape outputShape; + EmbeddingLookup lookup(operation, mOperands); + + success = embeddingLookupPrepare(values.shape(), lookups.shape(), &outputShape) && + setInfoAndAllocateIfNeeded(&output, outputShape) && + lookup.Eval(); + } break; + case OperationType::HASHTABLE_LOOKUP: { + const RunTimeOperandInfo &lookups = + mOperands[ins[HashtableLookup::kLookupTensor]]; + const RunTimeOperandInfo &keys = + mOperands[ins[HashtableLookup::kKeyTensor]]; + const RunTimeOperandInfo &values = + mOperands[ins[HashtableLookup::kValueTensor]]; + + RunTimeOperandInfo &output = + mOperands[outs[HashtableLookup::kOutputTensor]]; + RunTimeOperandInfo &hits = + mOperands[outs[HashtableLookup::kHitsTensor]]; + + Shape outputShape, hitShape; + HashtableLookup lookup(operation, mOperands); + + success = hashtableLookupPrepare(lookups.shape(), keys.shape(), values.shape(), + &outputShape, &hitShape) && + setInfoAndAllocateIfNeeded(&output, outputShape) && + setInfoAndAllocateIfNeeded(&hits, hitShape) && + lookup.Eval(); + } break; + case OperationType::LSH_PROJECTION: { + RunTimeOperandInfo &output = + mOperands[outs[LSHProjection::kOutputTensor]]; + + Shape outputShape; + LSHProjection lsh(operation, mOperands); + + success = LSHProjection::Prepare(operation, mOperands, + &outputShape) && + setInfoAndAllocateIfNeeded(&output, outputShape) && + lsh.Eval(); + } break; + case OperationType::LSTM: { + RunTimeOperandInfo &scratch = + mOperands[outs[LSTMCell::kScratchBufferTensor]]; + RunTimeOperandInfo &outputStateOut = + mOperands[outs[LSTMCell::kOutputStateOutTensor]]; + RunTimeOperandInfo &cellStateOut = + mOperands[outs[LSTMCell::kCellStateOutTensor]]; + RunTimeOperandInfo &output = + mOperands[outs[LSTMCell::kOutputTensor]]; + + Shape scratchShape, outputStateShape, cellStateShape, outputShape; + LSTMCell lstm_cell(operation, mOperands); + + success = LSTMCell::Prepare(operation, mOperands, + &scratchShape, &outputStateShape, + &cellStateShape, &outputShape) && + setInfoAndAllocateIfNeeded(&scratch, scratchShape) && + setInfoAndAllocateIfNeeded(&outputStateOut, outputStateShape) && + setInfoAndAllocateIfNeeded(&cellStateOut, cellStateShape) && + setInfoAndAllocateIfNeeded(&output, outputShape) && + lstm_cell.Eval(); + } break; + case OperationType::RNN: { + RunTimeOperandInfo &hiddenStateOut = + mOperands[outs[RNN::kHiddenStateOutTensor]]; + RunTimeOperandInfo &output = + mOperands[outs[RNN::kOutputTensor]]; + + Shape hiddenStateShape, outputShape; + RNN rnn_cell(operation, mOperands); + + success = RNN::Prepare(operation, mOperands, + &hiddenStateShape, &outputShape) && + setInfoAndAllocateIfNeeded(&hiddenStateOut, hiddenStateShape) && + setInfoAndAllocateIfNeeded(&output, outputShape) && + rnn_cell.Eval(); + } break; + case OperationType::SVDF: { + RunTimeOperandInfo &stateOut = + mOperands[outs[SVDF::kStateOutTensor]]; + RunTimeOperandInfo &output = + 
mOperands[outs[SVDF::kOutputTensor]]; + + Shape stateShape, outputShape; + SVDF svdf(operation, mOperands); + + success = SVDF::Prepare(operation, mOperands, + &stateShape, &outputShape) && + setInfoAndAllocateIfNeeded(&stateOut, stateShape) && + setInfoAndAllocateIfNeeded(&output, outputShape) && + svdf.Eval(); + } break; +#endif // REF-ANN + default: +#if 0 // TODO-NNRT : Enable if it is needed. + nnAssert(false); +#endif + NYI(getOperationName(operation.type)); + break; + } + if (!success) { + LOG(ERROR) << getOperationName(operation.type) << " failed."; + return ANEURALNETWORKS_OP_FAILED; + } + + freeNoLongerUsedOperands(ins); + return ANEURALNETWORKS_NO_ERROR; +} + +#ifdef NNFW_KERNEL +#undef NNFW_KERNEL +#else +#error NNFW_KERNEL should be defined +#endif + +} // namespace rt +} // namespace nnfw diff --git a/runtimes/nn/common/Logging.cpp b/runtimes/nn/common/Logging.cpp new file mode 100644 index 000000000..21107bcf9 --- /dev/null +++ b/runtimes/nn/common/Logging.cpp @@ -0,0 +1,51 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "Logging.h" + +namespace nnfw { +namespace rt{ + +BoolConfig::BoolConfig(const std::string &tag, bool default_value) : _value(default_value) +{ + const char *str = getenv(tag.c_str()); + + if (str != nullptr) + { + std::string s = std::string(str); + _value = ((s != "0") && (s != "false") && (s != "FALSE")); + } +} + +VLogging::VLogging() +{ + BoolConfig vlog_enabled("VLOG", false); + _enabled = vlog_enabled.value(); +} + +VLogging& VLogging::access() +{ + static VLogging instance; + return instance; +} + +std::ostream& VLogging::stream() +{ + return std::cout; +} + +} // namespace rt +} // namespace nnfw diff --git a/runtimes/nn/common/NNFWKernels.cpp b/runtimes/nn/common/NNFWKernels.cpp new file mode 100644 index 000000000..dd5c2d2bc --- /dev/null +++ b/runtimes/nn/common/NNFWKernels.cpp @@ -0,0 +1,72 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "CpuExecutor.h" +#include "NeuralNetworks.h" +#include "Operations.h" + +#include "NNFWKernels.h" + +#ifdef USE_NNFW_ACL_KERNELS +#include "kernel/acl/Conv2D.h" +#include "kernel/acl/DepthwiseConv2D.h" +#include "kernel/acl/Pooling.h" +#include "kernel/acl/Softmax.h" +#include "kernel/acl/FullyConnected.h" +#include "kernel/acl/Concatenation.h" +#include "kernel/acl/Reshape.h" +#include "kernel/acl/nnfw_kernel_acl.h" +#endif // USE_NNFW_ACL_KERNELS + +#include <map> + +namespace nnfw { +namespace rt { + +#define NNFW_KERNEL(Name, Ret, Params) \ + NNFW_KERNELS_##Name nnfw_kernels_##Name; + +#include "NNFWKernels.lst" +#undef NNFW_KERNEL + +void init_nnfw_kernels() +{ +#ifdef USE_NNFW_ACL_KERNELS + nnfw::kernel::acl::Initialize(); + + nnfw_kernels_convFloat32["acl"] = nnfw::kernel::acl::convFloat32; + nnfw_kernels_depthwiseConvFloat32["acl"] = nnfw::kernel::acl::depthwiseConvFloat32; + nnfw_kernels_averagePoolFloat32["acl"] = nnfw::kernel::acl::averagePoolFloat32; + nnfw_kernels_maxPoolFloat32["acl"] = nnfw::kernel::acl::maxPoolFloat32; + nnfw_kernels_softmaxFloat32["acl"] = nnfw::kernel::acl::softmaxFloat32; + nnfw_kernels_fullyConnectedFloat32["acl"] = nnfw::kernel::acl::fullyConnectedFloat32; + nnfw_kernels_concatenationFloat32["acl"] = nnfw::kernel::acl::concatenationFloat32; + nnfw_kernels_reshapeGeneric["acl"] = nnfw::kernel::acl::reshapeGeneric; + + nnfw_kernels_convFloat32["neon"] = nnfw::kernel::acl::neon::convFloat32; + nnfw_kernels_depthwiseConvFloat32["neon"] = nnfw::kernel::acl::neon::depthwiseConvFloat32; + nnfw_kernels_averagePoolFloat32["neon"] = nnfw::kernel::acl::neon::averagePoolFloat32; + nnfw_kernels_maxPoolFloat32["neon"] = nnfw::kernel::acl::neon::maxPoolFloat32; + nnfw_kernels_softmaxFloat32["neon"] = nnfw::kernel::acl::neon::softmaxFloat32; + nnfw_kernels_fullyConnectedFloat32["neon"] = nnfw::kernel::acl::neon::fullyConnectedFloat32; + nnfw_kernels_concatenationFloat32["neon"] = nnfw::kernel::acl::neon::concatenationFloat32; + nnfw_kernels_reshapeGeneric["neon"] = nnfw::kernel::acl::reshapeGeneric; +#endif // USE_NNFW_ACL_KERNELS + return; +} + +} // namespace rt +} // namespace nnfw diff --git a/runtimes/nn/common/NNFWKernels.h b/runtimes/nn/common/NNFWKernels.h new file mode 100644 index 000000000..f38431d15 --- /dev/null +++ b/runtimes/nn/common/NNFWKernels.h @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef __NNFW_RT_NNFW_KERNELS_H__ +#define __NNFW_RT_NNFW_KERNELS_H__ + +#include "CpuExecutor.h" +#include "NeuralNetworks.h" +#include "Operations.h" + +#include <map> + +namespace nnfw { +namespace rt { + +#define NNFW_KERNEL(Name, Ret, Params) \ + typedef Ret (*KERNEL_##Name) Params; \ + typedef std::map<std::string, KERNEL_##Name> NNFW_KERNELS_##Name; \ + extern NNFW_KERNELS_##Name nnfw_kernels_##Name; + +#include "NNFWKernels.lst" +#undef NNFW_KERNEL + +void init_nnfw_kernels(); + +} // namespace rt +} // namespace nnfw +#endif // __NNFW_RT_NNFW_KERNELS_H__ diff --git a/runtimes/nn/common/NNFWKernels.lst b/runtimes/nn/common/NNFWKernels.lst new file mode 100644 index 000000000..2a60e0120 --- /dev/null +++ b/runtimes/nn/common/NNFWKernels.lst @@ -0,0 +1,80 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +NNFW_KERNEL(convFloat32, bool, + (const float* inputData, const Shape& inputShape, + const float* filterData, const Shape& filterShape, + const float* biasData, const Shape& biasShape, + int32_t padding_left, int32_t padding_right, + int32_t padding_top, int32_t padding_bottom, + int32_t stride_width, int32_t stride_height, + int32_t activation, + float* outputData, const Shape& outputShape) + ); + +NNFW_KERNEL(depthwiseConvFloat32, bool, + (const float* inputData, const Shape& inputShape, + const float* filterData, const Shape& filterShape, + const float* biasData, const Shape& biasShape, + int32_t padding_left, int32_t padding_right, + int32_t padding_top, int32_t padding_bottom, + int32_t stride_width, int32_t stride_height, + int32_t depth_multiplier, int32_t activation, + float* outputData, const Shape& outputShape) + ); + +NNFW_KERNEL(averagePoolFloat32, bool, + (const float* inputData, const Shape& inputShape, + int32_t padding_left, int32_t padding_right, + int32_t padding_top, int32_t padding_bottom, + int32_t stride_width, int32_t stride_height, + int32_t filter_width, int32_t filter_height, int32_t activation, + float* outputData, const Shape& outputShape) + ); + +NNFW_KERNEL(maxPoolFloat32, bool, + (const float* inputData, const Shape& inputShape, + int32_t padding_left, int32_t padding_right, + int32_t padding_top, int32_t padding_bottom, + int32_t stride_width, int32_t stride_height, + int32_t filter_width, int32_t filter_height, int32_t activation, + float* outputData, const Shape& outputShape) + ); + +NNFW_KERNEL(softmaxFloat32, bool, + (const float* inputData, const Shape& inputShape, + const float beta, + float* outputData, const Shape& outputShape) + ); + +NNFW_KERNEL(fullyConnectedFloat32, bool, + (const float* inputData, const Shape& inputShape, + const float* weights, const Shape& weightsShape, + const float* biasData, const Shape& biasShape, + int32_t activation, + float* outputData, const Shape& outputShape) + ); + +NNFW_KERNEL(concatenationFloat32, bool, + (const std::vector<const float*>& inputDataPtrs, + const std::vector<Shape>& inputShapes, int32_t axis, 
+ float* outputData, const Shape& outputShape) + ); + +NNFW_KERNEL(reshapeGeneric, bool, + (const void* inputData, const Shape& inputShape, + void* outputData, const Shape& outputShape) + ); diff --git a/runtimes/nn/common/OperationsUtils.cpp b/runtimes/nn/common/OperationsUtils.cpp new file mode 100644 index 000000000..04e54d0f3 --- /dev/null +++ b/runtimes/nn/common/OperationsUtils.cpp @@ -0,0 +1,565 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (C) 2017 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#define LOG_TAG "OperationsUtils" + +#include "OperationsUtils.h" +#include "Operations.h" +#include "Utils.h" + +// TODO-NNRT In Android NN, ActivationFunctor.h is included from Utils.h through RNN.h +// Remove this when Utils.h includes RNN.h +#include "ActivationFunctor.h" + +#include <cmath> + +namespace nnfw { +namespace rt { + +bool SameShape(const Shape& in1, const Shape& in2) { + if (in1.type != in2.type || in1.dimensions.size() != in2.dimensions.size()) { + return false; + } + for (size_t i = 0; i < in1.dimensions.size(); i++) { + if (in1.dimensions[i] != in2.dimensions[i]) { + return false; + } + } + return true; +} + +bool SetShape(const Shape& in, Shape* out) { + if (in.type != out->type || in.dimensions.size() != out->dimensions.size()) { + return false; + } + out->dimensions = in.dimensions; + return true; +} + +uint32_t getNumberOfElements(const Shape& shape) { + uint32_t count = 1; + for (size_t i = 0; i < shape.dimensions.size(); i++) { + count *= shape.dimensions[i]; + } + return count; +} + +uint32_t getNumberOfDimensions(const Shape& shape) { + return shape.dimensions.size(); +} + +uint32_t getSizeOfDimension(const Shape& shape, uint32_t dimensionIdx) { + if (dimensionIdx >= shape.dimensions.size()) { + // TODO, log the error + return 0; + } + return shape.dimensions[dimensionIdx]; +} + +bool QuantizeMultiplierSmallerThanOne(double double_multiplier, + int32_t* quantized_multiplier, + int32_t* right_shift) { + NN_OPS_CHECK(double_multiplier >= 0.); + NN_OPS_CHECK(double_multiplier < 1.); + if (double_multiplier == 0.) 
{ + *quantized_multiplier = 0; + *right_shift = 0; + return true; + } + NN_OPS_CHECK(double_multiplier > 0.); + const double q = std::frexp(double_multiplier, right_shift); + *right_shift *= -1; + int64_t q_fixed = static_cast<int64_t>(std::round(q * (1ll << 31))); + NN_OPS_CHECK(q_fixed <= (1ll << 31)); + if (q_fixed == (1ll << 31)) { + q_fixed /= 2; + --*right_shift; + } + NN_OPS_CHECK(*right_shift >= 0); + NN_OPS_CHECK(q_fixed <= std::numeric_limits<int32_t>::max()); + *quantized_multiplier = static_cast<int32_t>(q_fixed); + return true; +} + +bool QuantizeMultiplierGreaterThanOne(double double_multiplier, + int32_t* quantized_multiplier, + int* left_shift) { + NN_OPS_CHECK(double_multiplier > 1.); + const double q = std::frexp(double_multiplier, left_shift); + int64_t q_fixed = static_cast<int64_t>(std::round(q * (1ll << 31))); + NN_OPS_CHECK(q_fixed <= (1ll << 31)); + if (q_fixed == (1ll << 31)) { + q_fixed /= 2; + ++*left_shift; + } + NN_OPS_CHECK(*left_shift >= 0); + NN_OPS_CHECK(q_fixed <= std::numeric_limits<int32_t>::max()); + *quantized_multiplier = static_cast<int32_t>(q_fixed); + return true; +} + +bool GetQuantizedConvolutionMultipler(const Shape& inputShape, + const Shape& filterShape, + const Shape& biasShape, + const Shape& outputShape, + float* multiplier) { + const float input_product_scale = inputShape.scale * filterShape.scale; + const float bias_scale = biasShape.scale; + const float output_scale = outputShape.scale; + + // The following conditions must be guaranteed by the training pipeline. + NN_OPS_CHECK(std::abs(input_product_scale - bias_scale) <= + 1e-6 * std::min(input_product_scale, bias_scale)); + NN_OPS_CHECK(input_product_scale >= 0); + NN_OPS_CHECK(input_product_scale < output_scale); + *multiplier = input_product_scale / output_scale; + return true; +} + +void CalculateActivationRangeUint8(int32_t activation, + const Shape& outputShape, + int32_t* act_min, + int32_t* act_max) { + const int32_t qmin = std::numeric_limits<uint8_t>::min(); + const int32_t qmax = std::numeric_limits<uint8_t>::max(); + + const auto scale = outputShape.scale; + const auto zero_point = outputShape.offset; + + auto quantize = [scale, zero_point](float f) { + return zero_point + static_cast<int32_t>(std::round(f / scale)); + }; + + if (activation == kActivationRelu) { + *act_min = std::max(qmin, quantize(0.0)); + *act_max = qmax; + } else if (activation == kActivationRelu6) { + *act_min = std::max(qmin, quantize(0.0)); + *act_max = std::min(qmax, quantize(6.0)); + } else if (activation == kActivationRelu1) { + *act_min = std::max(qmin, quantize(-1.0)); + *act_max = std::min(qmax, quantize(1.0)); + } else { + *act_min = qmin; + *act_max = qmax; + } +} + +int32_t CalculateInputRadius(int input_integer_bits, int input_left_shift) { + const double max_input_rescaled = 1.0 * ((1 << input_integer_bits) - 1) * + (1ll << (31 - input_integer_bits)) / + (1ll << input_left_shift); + // Tighten bound using floor. Suppose that we could use the exact value. + // After scaling the difference, the result would be at the maximum. Thus we + // must ensure that our value has lower magnitude. 
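+    // For example, with input_integer_bits = 4 and input_left_shift = 20,
+    // max_input_rescaled = 15 * 2^27 / 2^20 = 1920, so the radius returned
+    // below is 1920.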
+ return static_cast<int32_t>(std::floor(max_input_rescaled)); +} + +bool addMulPrepare(const Shape& in1, const Shape& in2, Shape* out) { + NN_OPS_CHECK(getNumberOfDimensions(in1) <= 4 && getNumberOfDimensions(in2) <= 4); + NN_OPS_CHECK(in1.type == in2.type); + if (SameShape(in1, in2)) { + return SetShape(in1, out); + } else { + // BroadcastAdd needed + uint32_t numberOfDims1 = getNumberOfDimensions(in1); + uint32_t numberOfDims2 = getNumberOfDimensions(in2); + uint32_t maxDims = std::max(numberOfDims1, numberOfDims2); + out->dimensions = std::vector<uint32_t>(maxDims); + for (uint32_t i = 1; i <= maxDims; i++) { + uint32_t dim1 = 1; + if (i <= numberOfDims1) { + dim1 = getSizeOfDimension(in1, numberOfDims1 - i); + } + uint32_t dim2 = 1; + if (i <= numberOfDims2) { + dim2 = getSizeOfDimension(in2, numberOfDims2 - i); + } + if (dim1 != dim2 && dim1 != 1 && dim2 != 1) { + LOG(ERROR) << "Dimensions mismatch for BroadcastAdd"; + return false; + } + out->dimensions[maxDims - i] = std::max(dim1, dim2); + } + } + return true; +} + +bool floorPrepare(const Shape& input, Shape* output) { + return SetShape(input, output); +} + +bool dequantizePrepare(const Shape& input, Shape* output) { + if (input.type != OperandType::TENSOR_QUANT8_ASYMM || + output->type != OperandType::TENSOR_FLOAT32) { + LOG(ERROR) << "bad input / output operand type."; + return false; + } + if (input.dimensions.size() != output->dimensions.size()) { + LOG(ERROR) << "input and output tensors don't have the same rank."; + return false; + } + output->dimensions = input.dimensions; + return true; +} + +bool convPrepare(const Shape& input, + const Shape& filter, + const Shape& bias, + int32_t padding_left, int32_t padding_right, + int32_t padding_top, int32_t padding_bottom, + int32_t stride_width, int32_t stride_height, + Shape* output) { + NN_OPS_CHECK(input.type == filter.type); + if (input.type == OperandType::TENSOR_QUANT8_ASYMM) { + NN_OPS_CHECK(bias.type == OperandType::TENSOR_INT32); + } else { + NN_OPS_CHECK(input.type == bias.type); + } + NN_OPS_CHECK(getNumberOfDimensions(input) == 4); + NN_OPS_CHECK(getNumberOfDimensions(filter) == 4); + NN_OPS_CHECK(getNumberOfDimensions(bias) == 1); + + NN_OPS_CHECK(getSizeOfDimension(filter, 0) == getSizeOfDimension(bias, 0)); + NN_OPS_CHECK(getSizeOfDimension(filter, 3) == getSizeOfDimension(input, 3)); + + uint32_t channels_out = getSizeOfDimension(filter, 0); + uint32_t width = getSizeOfDimension(input, 2); + uint32_t height = getSizeOfDimension(input, 1); + uint32_t filterWidth = getSizeOfDimension(filter, 2); + uint32_t filterHeight = getSizeOfDimension(filter, 1); + uint32_t batches = getSizeOfDimension(input, 0); + + uint32_t outWidth = computeOutSize(width, filterWidth, stride_width, + padding_left, padding_right); + uint32_t outHeight = computeOutSize(height, filterHeight, stride_height, + padding_top, padding_bottom); + + output->type = input.type; + output->dimensions = {batches, outHeight, outWidth, channels_out}; + return true; +} + +bool depthwiseConvPrepare(const Shape& input, + const Shape& filter, + const Shape& bias, + int32_t padding_left, int32_t padding_right, + int32_t padding_top, int32_t padding_bottom, + int32_t stride_width, int32_t stride_height, + Shape* output) { + NN_OPS_CHECK(input.type == filter.type); + if (input.type == OperandType::TENSOR_QUANT8_ASYMM) { + NN_OPS_CHECK(bias.type == OperandType::TENSOR_INT32); + } else { + NN_OPS_CHECK(input.type == bias.type); + } + NN_OPS_CHECK(getNumberOfDimensions(input) == 4); + 
NN_OPS_CHECK(getNumberOfDimensions(filter) == 4); + NN_OPS_CHECK(getNumberOfDimensions(bias) == 1); + + NN_OPS_CHECK(getSizeOfDimension(filter, 3) == getSizeOfDimension(bias, 0)); + + uint32_t channels_out = getSizeOfDimension(filter, 3); + uint32_t width = getSizeOfDimension(input, 2); + uint32_t height = getSizeOfDimension(input, 1); + uint32_t filterWidth = getSizeOfDimension(filter, 2); + uint32_t filterHeight = getSizeOfDimension(filter, 1); + uint32_t batches = getSizeOfDimension(input, 0); + + uint32_t outWidth = computeOutSize(width, filterWidth, stride_width, + padding_left, padding_right); + uint32_t outHeight = computeOutSize(height, filterHeight, stride_height, + padding_top, padding_bottom); + + output->type = input.type; + output->dimensions = {batches, outHeight, outWidth, channels_out}; + return true; +} + + +bool genericPoolingPrepare(const Shape& input, + int32_t padding_left, int32_t padding_right, + int32_t padding_top, int32_t padding_bottom, + int32_t stride_width, int32_t stride_height, + int32_t filter_width, int32_t filter_height, + Shape* output) { + NN_OPS_CHECK(getNumberOfDimensions(input) == 4); + + uint32_t batches = getSizeOfDimension(input, 0); + uint32_t width = getSizeOfDimension(input, 2); + uint32_t height = getSizeOfDimension(input, 1); + uint32_t channels_out = getSizeOfDimension(input, 3); + + uint32_t outWidth = computeOutSize(width, filter_width, stride_width, + padding_left, padding_right); + uint32_t outHeight = computeOutSize(height, filter_height, stride_height, + padding_top, padding_bottom); + + output->type = input.type; + output->dimensions = {batches, outHeight, outWidth, channels_out}; + return true; +} + + +bool genericActivationPrepare(const Shape& input, + Shape* output) { + NN_OPS_CHECK(getNumberOfDimensions(input) <= 4); + return SetShape(input, output); +} + +bool fullyConnectedPrepare(const Shape& input, + const Shape& weights, + const Shape& bias, + Shape* output) { + // Check all the parameters of tensor match within themselves and match the + // input configuration. 
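+    // For example, an input of shape [2, 20] (40 elements), weights of shape
+    // [10, 20] and a bias of shape [10] give batch_size = 40 / 20 = 2 and an
+    // output of shape [2, 10].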
+ NN_OPS_CHECK(input.type == weights.type); + if (input.type == OperandType::TENSOR_QUANT8_ASYMM) { + NN_OPS_CHECK(bias.type == OperandType::TENSOR_INT32); + } else { + NN_OPS_CHECK(input.type == bias.type); + } + NN_OPS_CHECK(getNumberOfDimensions(input) >= 2); + uint32_t input_size = getNumberOfElements(input); + uint32_t num_units = getSizeOfDimension(weights, 0); + + // modified to resolve Coverity 118949 (Apr 25, 2018) by hyunsik.yoon + // Original Code: + // uint32_t batch_size = input_size / getSizeOfDimension(weights, 1); + // + // Coverity Detection: Division by zero + // + // Code below is modified code + + uint32_t shape_size = getSizeOfDimension(weights, 1); + if (shape_size == 0) + { + return false; + } + + uint32_t batch_size = input_size / shape_size; + + NN_OPS_CHECK(getSizeOfDimension(bias, 0) == num_units); + NN_OPS_CHECK(getSizeOfDimension(weights, 1) * batch_size == input_size); + NN_OPS_CHECK(getNumberOfDimensions(weights) == 2); + + output->type = input.type; + output->dimensions = {batch_size, num_units}; + + return true; +} + +bool concatenationPrepare(const std::vector<Shape>& inputShapes, + int32_t axis, + Shape* output) { + + int num_inputs = inputShapes.size(); + OperandType input_type = inputShapes[0].type; + uint32_t num_dimensions = getNumberOfDimensions(inputShapes[0]); + + NN_OPS_CHECK(axis >= 0); + NN_OPS_CHECK(axis < (int32_t)num_dimensions); + + int sum_axis = getSizeOfDimension(inputShapes[0], axis); + for (int i = 1; i < num_inputs; ++i) { + NN_OPS_CHECK(getNumberOfDimensions(inputShapes[i]) == num_dimensions); + NN_OPS_CHECK(inputShapes[i].type == inputShapes[0].type); + if (input_type == OperandType::TENSOR_QUANT8_ASYMM) { + NN_OPS_CHECK(inputShapes[0].offset == inputShapes[i].offset); + NN_OPS_CHECK(inputShapes[0].scale == inputShapes[i].scale); + } + for (int d = 0; d < (int32_t)num_dimensions; ++d) { + if (d == axis) { + sum_axis += getSizeOfDimension(inputShapes[i], axis); + } else { + NN_OPS_CHECK(getSizeOfDimension(inputShapes[0], d) == + getSizeOfDimension(inputShapes[i], d)); + } + } + } + + output->type = input_type; + output->dimensions = inputShapes[0].dimensions; + output->dimensions[axis] = sum_axis; + + if (input_type == OperandType::TENSOR_QUANT8_ASYMM) { + NN_OPS_CHECK(inputShapes[0].offset == output->offset); + NN_OPS_CHECK(inputShapes[0].scale == output->scale); + } + + return true; +} + + +bool genericNormalizationPrepare(const Shape& input, Shape* output) { + NN_OPS_CHECK(getNumberOfDimensions(input) == 4); + return SetShape(input, output); +} + +bool reshapePrepare(const Shape& input, + const int32_t* targetDims, + const int32_t targetDimsSize, + Shape* output) { + // Reshape allows one of the targetDims components to have the + // special -1 value, meaning it will be calculated automatically based on the + // input. Here we calculate what that dimension should be so that the number + // of output elements in the same as the number of input elements. 
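+    // For example, reshaping an input with 24 elements to targetDims {2, -1, 4}
+    // resolves the -1 ("stretch") dimension to 24 / (2 * 4) = 3, giving an
+    // output shape of {2, 3, 4}.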
+ int32_t numInputElements = (int32_t) getNumberOfElements(input); + + std::vector<uint32_t> outDims(targetDimsSize); + int32_t numOutputElements = 1; + int32_t strechDim = -1; + for (int32_t i = 0; i < targetDimsSize; ++i) { + int32_t value = targetDims[i]; + if (value == -1) { + NN_OPS_CHECK(strechDim == -1); + strechDim = i; + } else { + numOutputElements *= value; + outDims[i] = (uint32_t)value; + } + } + if (strechDim != -1) { + int32_t strechValue = numInputElements / numOutputElements; + outDims[strechDim] = (uint32_t) strechValue; + numOutputElements *= strechValue; + } + + NN_OPS_CHECK(numInputElements == numOutputElements); + + output->type = input.type; + output->dimensions = outDims; + output->offset = input.offset; + output->scale = input.scale; + + return true; +} + +bool resizeBilinearPrepare(const Shape& input, + int32_t width, + int32_t height, + Shape* output) { + NN_OPS_CHECK(getNumberOfDimensions(input) == 4); + uint32_t batches = getSizeOfDimension(input, 0); + uint32_t channels = getSizeOfDimension(input, 3); + + output->type = input.type; + output->dimensions = {batches, (uint32_t)height, (uint32_t)width, channels}; + + return true; +} + +bool depthToSpacePrepare(const Shape& input, + int32_t blockSize, + Shape* output) { + NN_OPS_CHECK(getNumberOfDimensions(input) == 4); + NN_OPS_CHECK(blockSize > 0); + + uint32_t batches = getSizeOfDimension(input, 0); + uint32_t height = getSizeOfDimension(input, 1); + uint32_t width = getSizeOfDimension(input, 2); + uint32_t channels = getSizeOfDimension(input, 3); + + NN_OPS_CHECK(channels % (blockSize * blockSize) == 0); + output->type = input.type; + output->dimensions = {batches, + height * blockSize, + width * blockSize, + channels / (blockSize * blockSize)}; + output->offset = input.offset; + output->scale = input.scale; + + return true; +} + +bool spaceToDepthPrepare(const Shape& input, + int32_t blockSize, + Shape* output) { + NN_OPS_CHECK(getNumberOfDimensions(input) == 4); + NN_OPS_CHECK(blockSize > 0); + + uint32_t batches = getSizeOfDimension(input, 0); + uint32_t height = getSizeOfDimension(input, 1); + uint32_t width = getSizeOfDimension(input, 2); + uint32_t channels = getSizeOfDimension(input, 3); + + NN_OPS_CHECK(height % blockSize == 0); + NN_OPS_CHECK(width % blockSize == 0); + + output->type = input.type; + output->dimensions = {batches, + height / blockSize, + width / blockSize, + channels * (blockSize * blockSize)}; + output->offset = input.offset; + output->scale = input.scale; + + return true; +} + +bool embeddingLookupPrepare(const Shape &valueShape, + const Shape &lookupShape, + Shape *outputShape) { + NN_OPS_CHECK(getNumberOfDimensions(valueShape) >= 2); + NN_OPS_CHECK(getNumberOfDimensions(lookupShape) == 1); + + const uint32_t rows = getSizeOfDimension(valueShape, 0); + const uint32_t columns = getSizeOfDimension(valueShape, 1); + + const uint32_t lookups = getSizeOfDimension(lookupShape, 0); + + outputShape->type = valueShape.type; + outputShape->dimensions = { lookups, columns }; + for (uint32_t i = 2; i < getNumberOfDimensions(valueShape); i++) { + outputShape->dimensions.push_back(getSizeOfDimension(valueShape, i)); + } + outputShape->offset = valueShape.offset; + outputShape->scale = valueShape.scale; + + return true; +} + +bool hashtableLookupPrepare(const Shape &lookupShape, + const Shape &keyShape, + const Shape &valueShape, + Shape *outputShape, + Shape *hitShape) { + NN_OPS_CHECK(getNumberOfDimensions(lookupShape) == 1); + NN_OPS_CHECK(getNumberOfDimensions(keyShape) == 1); + 
NN_OPS_CHECK(getNumberOfDimensions(valueShape) >= 1); + + const uint32_t lookups = getSizeOfDimension(lookupShape, 0); + const uint32_t keys = getSizeOfDimension(keyShape, 0); + const uint32_t rows = getSizeOfDimension(valueShape, 0); + outputShape->type = valueShape.type; + outputShape->dimensions = { lookups }; + for (uint32_t i = 1; i < getNumberOfDimensions(valueShape); i++) { + outputShape->dimensions.push_back(getSizeOfDimension(valueShape, i)); + } + outputShape->offset = valueShape.offset; + outputShape->scale = valueShape.scale; + + hitShape->type = OperandType::TENSOR_QUANT8_ASYMM; + hitShape->dimensions = { lookups }; + hitShape->offset = 0; + hitShape->scale = 1.f; + + return true; +} + +} // namespace rt +} // namespace nnfw diff --git a/runtimes/nn/common/Utils.cpp b/runtimes/nn/common/Utils.cpp new file mode 100644 index 000000000..7f0adea8e --- /dev/null +++ b/runtimes/nn/common/Utils.cpp @@ -0,0 +1,397 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (C) 2017 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "Utils.h" +#include "NeuralNetworks.h" + +namespace nnfw { +namespace rt { + +#define COUNT(X) (sizeof(X) / sizeof(X[0])) + +const char* kTypeNames[kNumberOfDataTypes] = { + "FLOAT32", "INT32", "UINT32", + "TENSOR_FLOAT32", "TENSOR_INT32", "TENSOR_QUANT8_ASYMM", +}; + +static_assert(COUNT(kTypeNames) == kNumberOfDataTypes, "kTypeNames is incorrect"); + +const char* kTypeNamesOEM[kNumberOfDataTypesOEM] = { + "OEM", "TENSOR_OEM_BYTE", +}; + +static_assert(COUNT(kTypeNamesOEM) == kNumberOfDataTypesOEM, "kTypeNamesOEM is incorrect"); + +// TODO Check if this useful +const char* kErrorNames[] = { + "NO_ERROR", "OUT_OF_MEMORY", "INCOMPLETE", "NULL", "BAD_DATA", +}; + +namespace { + +template <typename EntryType, uint32_t entryCount, uint32_t entryCountOEM> +EntryType tableLookup(const EntryType (&table)[entryCount], + const EntryType (&tableOEM)[entryCountOEM], + uint32_t code) { + if (code < entryCount) { + return table[code]; + } else if (code >= kOEMCodeBase && (code - kOEMCodeBase) < entryCountOEM) { + return tableOEM[code - kOEMCodeBase]; + } else { + nnAssert(!"tableLookup: bad code"); + return EntryType(); + } +} + +}; // anonymous namespace + +const char* kOperationNames[kNumberOfOperationTypes] = { + "ADD", + "AVERAGE_POOL", + "CONCATENATION", + "CONV", + "DEPTHWISE_CONV", + "DEPTH_TO_SPACE", + "DEQUANTIZE", + "EMBEDDING_LOOKUP", + "FLOOR", + "FULLY_CONNECTED", + "HASHTABLE_LOOKUP", + "L2_NORMALIZATION", + "L2_POOL", + "LOCAL_RESPONSE_NORMALIZATION", + "LOGISTIC", + "LSH_PROJECTION", + "LSTM", + "MAX_POOL", + "MUL", + "RELU", + "RELU1", + "RELU6", + "RESHAPE", + "RESIZE_BILINEAR", + "RNN", + "SOFTMAX", + "SPACE_TO_DEPTH", + "SVDF", + "TANH", +}; + +static_assert(COUNT(kOperationNames) == kNumberOfOperationTypes, "kOperationNames is incorrect"); + +const char* kOperationNamesOEM[kNumberOfOperationTypesOEM] = { + "OEM_OPERATION", +}; + 
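+// Operation codes below kNumberOfOperationTypes index kOperationNames directly,
+// while OEM codes are offset by kOEMCodeBase (10000) into kOperationNamesOEM.
+// For example, getOperationName (below) maps code 3 to "CONV" and code 10000
+// to "OEM_OPERATION" via tableLookup above.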
+static_assert(COUNT(kOperationNamesOEM) == kNumberOfOperationTypesOEM, + "kOperationNamesOEM is incorrect"); + +const char* getOperationName(OperationType type) { + uint32_t n = static_cast<uint32_t>(type); + return tableLookup(kOperationNames, kOperationNamesOEM, n); +} + +const uint32_t kSizeOfDataType[]{ + 4, // ANEURALNETWORKS_FLOAT32 + 4, // ANEURALNETWORKS_INT32 + 4, // ANEURALNETWORKS_UINT32 + 4, // ANEURALNETWORKS_TENSOR_FLOAT32 + 4, // ANEURALNETWORKS_TENSOR_INT32 + 1 // ANEURALNETWORKS_TENSOR_SYMMETRICAL_QUANT8 +}; + +static_assert(COUNT(kSizeOfDataType) == kNumberOfDataTypes, "kSizeOfDataType is incorrect"); + +const bool kScalarDataType[]{ + true, // ANEURALNETWORKS_FLOAT32 + true, // ANEURALNETWORKS_INT32 + true, // ANEURALNETWORKS_UINT32 + false, // ANEURALNETWORKS_TENSOR_FLOAT32 + false, // ANEURALNETWORKS_TENSOR_INT32 + false, // ANEURALNETWORKS_TENSOR_SYMMETRICAL_QUANT8 +}; + +static_assert(COUNT(kScalarDataType) == kNumberOfDataTypes, "kScalarDataType is incorrect"); + +const uint32_t kSizeOfDataTypeOEM[]{ + 0, // ANEURALNETWORKS_OEM + 1, // ANEURALNETWORKS_TENSOR_OEM_BYTE +}; + +static_assert(COUNT(kSizeOfDataTypeOEM) == kNumberOfDataTypesOEM, + "kSizeOfDataTypeOEM is incorrect"); + +const bool kScalarDataTypeOEM[]{ + true, // ANEURALNETWORKS_OEM + false, // ANEURALNETWORKS_TENSOR_OEM_BYTE +}; + +static_assert(COUNT(kScalarDataTypeOEM) == kNumberOfDataTypesOEM, + "kScalarDataTypeOEM is incorrect"); + +uint32_t sizeOfData(OperandType type, const std::vector<uint32_t>& dimensions) { + int n = static_cast<int>(type); + + uint32_t size = tableLookup(kSizeOfDataType, kSizeOfDataTypeOEM, n); + + if (tableLookup(kScalarDataType, kScalarDataTypeOEM, n) == true) { + return size; + } + + for (auto d : dimensions) { + size *= d; + } + return size; +} + +// TODO-NNRT : Should be changed to allocate hidl_memory using Allocator. +// And Should change naming to "allocateMemory". +hidl_memory allocateSharedMemory(int64_t size) { + hidl_memory memory; +#if 0 // TODO-NNRT : Use shared memory or hidl memory + + // TODO: should we align memory size to nearest page? doesn't seem necessary... + const std::string& type = "ashmem"; + sp<IAllocator> allocator = IAllocator::getService(type); + allocator->allocate(size, [&](bool success, const hidl_memory& mem) { + if (!success) { + LOG(ERROR) << "unable to allocate " << size << " bytes of " << type; + } else { + memory = mem; + } + }); +#endif + LOG(ERROR) << "Not support to allocate shared memory now."; + return memory; +} + +uint32_t alignBytesNeeded(uint32_t index, size_t length) { + uint32_t pattern; + if (length < 2) { + pattern = 0; // No alignment necessary + } else if (length < 4) { + pattern = 1; // Align on 2-byte boundary + } else { + pattern = 3; // Align on 4-byte boundary + } + uint32_t extra = (~(index - 1)) & pattern; + return extra; +} + +// Validates the type. The used dimensions can be underspecified. 
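+// For example, with allowPartial == true a tensor type may carry dimensions
+// such as {0, 224, 224, 3}, where 0 marks a dimension that is not yet known;
+// with allowPartial == false any zero dimension is rejected as
+// ANEURALNETWORKS_BAD_DATA.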
+int validateOperandType(const ANeuralNetworksOperandType& type, const char* tag, + bool allowPartial) { + if (!allowPartial) { + for (uint32_t i = 0; i < type.dimensionCount; i++) { + if (type.dimensions[i] == 0) { + LOG(ERROR) << tag << " OperandType invalid dimensions[" << i + << "] = " << type.dimensions[i]; + return ANEURALNETWORKS_BAD_DATA; + } + } + } + if (!validCode(kNumberOfDataTypes, kNumberOfDataTypesOEM, type.type)) { + LOG(ERROR) << tag << " OperandType invalid type " << type.type; + return ANEURALNETWORKS_BAD_DATA; + } + if (type.type == ANEURALNETWORKS_TENSOR_QUANT8_ASYMM) { + if (type.zeroPoint < 0 || type.zeroPoint > 255) { + LOG(ERROR) << tag << " OperandType invalid zeroPoint " << type.zeroPoint; + return ANEURALNETWORKS_BAD_DATA; + } + if (type.scale < 0.f) { + LOG(ERROR) << tag << " OperandType invalid scale " << type.scale; + return ANEURALNETWORKS_BAD_DATA; + } + } + + // TODO-NNRT : add 'type.type == ANEURALNETWORKS_OEM_SCALAR' later. + // OEM operaters are not supported now. + if (type.type == ANEURALNETWORKS_FLOAT32 || + type.type == ANEURALNETWORKS_INT32 || + type.type == ANEURALNETWORKS_UINT32) { + if (type.dimensionCount != 0 || type.dimensions != nullptr) { + LOG(ERROR) << tag << " Invalid dimensions for scalar type"; + return ANEURALNETWORKS_BAD_DATA; + } + } + + return ANEURALNETWORKS_NO_ERROR; +} + +int validateOperandList(uint32_t count, const uint32_t* list, uint32_t operandCount, + const char* tag) { + for (uint32_t i = 0; i < count; i++) { + if (list[i] >= operandCount) { + LOG(ERROR) << tag << " invalid operand index at " << i << " = " << list[i] + << ", operandCount " << operandCount; + return ANEURALNETWORKS_BAD_DATA; + } + } + return ANEURALNETWORKS_NO_ERROR; +} + +static bool validOperandIndexes(const hidl_vec<uint32_t> indexes, size_t operandCount) { + for (uint32_t i : indexes) { + if (i >= operandCount) { + LOG(ERROR) << "Index out of range " << i << "/" << operandCount; + return false; + } + } + return true; +} + +static bool validOperands(const hidl_vec<Operand>& operands, const hidl_vec<uint8_t>& operandValues, + size_t poolCount) { + for (auto& operand : operands) { + if (!validCode(kNumberOfDataTypes, kNumberOfDataTypesOEM, + static_cast<uint32_t>(operand.type))) { + LOG(ERROR) << "Invalid operand type "; + return false; + } + /* TODO validate dim with type + if (!validOperandIndexes(operand.dimensions, mDimensions)) { + return false; + } + */ + switch (operand.lifetime) { + case OperandLifeTime::CONSTANT_COPY: + if (operand.location.offset + operand.location.length > operandValues.size()) { + LOG(ERROR) << "OperandValue location out of range. Starts at " + << operand.location.offset << ", length " << operand.location.length + << ", max " << operandValues.size(); + return false; + } + break; + case OperandLifeTime::TEMPORARY_VARIABLE: + case OperandLifeTime::MODEL_INPUT: + case OperandLifeTime::MODEL_OUTPUT: + case OperandLifeTime::NO_VALUE: + if (operand.location.offset != 0 || operand.location.length != 0) { + LOG(ERROR) << "Unexpected offset " << operand.location.offset << " or length " + << operand.location.length << " for runtime location."; + return false; + } + break; + case OperandLifeTime::CONSTANT_REFERENCE: + if (operand.location.poolIndex >= poolCount) { + LOG(ERROR) << "Invalid poolIndex " << operand.location.poolIndex << "/" + << poolCount; + return false; + } + break; + // TODO: Validate that we are within the pool. 
+ default: + LOG(ERROR) << "Invalid lifetime"; + return false; + } + } + return true; +} + +static bool validOperations(const hidl_vec<Operation>& operations, size_t operandCount) { + for (auto& op : operations) { + if (!validCode(kNumberOfOperationTypes, kNumberOfOperationTypesOEM, + static_cast<uint32_t>(op.type))) { + LOG(ERROR) << "Invalid operation type "; + return false; + } + if (!validOperandIndexes(op.inputs, operandCount) || + !validOperandIndexes(op.outputs, operandCount)) { + return false; + } + } + return true; +} + +// TODO doublecheck +bool validateModel(const Model& model) { + const size_t operandCount = model.operands.size(); + return (validOperands(model.operands, model.operandValues, model.pools.size()) && + validOperations(model.operations, operandCount) && + validOperandIndexes(model.inputIndexes, operandCount) && + validOperandIndexes(model.outputIndexes, operandCount)); +} + +bool validRequestArguments(const hidl_vec<RequestArgument>& arguments, + const hidl_vec<uint32_t>& operandIndexes, + const hidl_vec<Operand>& operands, size_t poolCount, + const char* type) { + const size_t argumentCount = arguments.size(); + if (argumentCount != operandIndexes.size()) { + LOG(ERROR) << "Request specifies " << argumentCount << " " << type << "s but the model has " + << operandIndexes.size(); + return false; + } + for (size_t argumentIndex = 0; argumentIndex < argumentCount; argumentIndex++) { + const RequestArgument& argument = arguments[argumentIndex]; + const uint32_t operandIndex = operandIndexes[argumentIndex]; + const Operand& operand = operands[operandIndex]; + if (argument.hasNoValue) { + if (argument.location.poolIndex != 0 || + argument.location.offset != 0 || + argument.location.length != 0 || + argument.dimensions.size() != 0) { + LOG(ERROR) << "Request " << type << " " << argumentIndex + << " has no value yet has details."; + return false; + } + } + if (argument.location.poolIndex >= poolCount) { + LOG(ERROR) << "Request " << type << " " << argumentIndex << " has an invalid poolIndex " + << argument.location.poolIndex << "/" << poolCount; + return false; + } + // TODO: Validate that we are within the pool. 
+ uint32_t rank = argument.dimensions.size(); + if (rank > 0) { + if (rank != operand.dimensions.size()) { + LOG(ERROR) << "Request " << type << " " << argumentIndex + << " has number of dimensions (" << rank + << ") different than the model's (" << operand.dimensions.size() << ")"; + return false; + } + for (size_t i = 0; i < rank; i++) { + if (argument.dimensions[i] != operand.dimensions[i] && + operand.dimensions[i] != 0) { + LOG(ERROR) << "Request " << type << " " << argumentIndex + << " has dimension " << i << " of " << operand.dimensions[i] + << " different than the model's " << operand.dimensions[i]; + return false; + } + if (argument.dimensions[i] == 0) { + LOG(ERROR) << "Request " << type << " " << argumentIndex + << " has dimension " << i << " of zero"; + return false; + } + } + } + } + return true; +} + +// TODO doublecheck +bool validateRequest(const Request& request, const Model& model) { + const size_t poolCount = request.pools.size(); + return (validRequestArguments(request.inputs, model.inputIndexes, model.operands, poolCount, + "input") && + validRequestArguments(request.outputs, model.outputIndexes, model.operands, poolCount, + "output")); +} + +} // namespace rt +} // namespace nnfw diff --git a/runtimes/nn/common/include/ActivationFunctor.h b/runtimes/nn/common/include/ActivationFunctor.h new file mode 100644 index 000000000..788962e4c --- /dev/null +++ b/runtimes/nn/common/include/ActivationFunctor.h @@ -0,0 +1,70 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (C) 2017 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __NNFW_RT_ACTIVATION_FUNCTOR_H__ +#define __NNFW_RT_ACTIVATION_FUNCTOR_H__ + +#if 0 // REF-ANN +#include "android/log.h" +#endif + +#include <algorithm> +#include <cmath> +#include <vector> +#include <cstdint> + +enum ActivationFn { + kActivationNone = 0, + kActivationRelu, + kActivationRelu1, + kActivationRelu6, + kActivationTanh, + kActivationSignBit, + kActivationSigmoid, +}; + +class ActivationFunctor { + public: + explicit ActivationFunctor(ActivationFn act) : act_(act) {} + + float operator()(float a) const { + switch (act_) { + case kActivationNone: + return a; + case kActivationRelu: + return a < 0.f ? 0.f : a; + case kActivationRelu6: + return std::max(0.f, std::min(a, 6.f)); + case kActivationTanh: + return std::tanh(a); + case kActivationSigmoid: + return 1.0f / (1.0f + std::exp(-a)); + default: +#if 0 // REF-ANN + __android_log_print(ANDROID_LOG_ERROR, "NN API", + "Invalid enum value for activation function: 0x%0X", + act_); +#endif + exit(1); + } + } + + private: + ActivationFn act_; +}; + +#endif // __NNFW_RT_ACTIVATION_FUNCTOR_H__ diff --git a/runtimes/nn/common/include/CpuExecutor.h b/runtimes/nn/common/include/CpuExecutor.h new file mode 100644 index 000000000..385a461de --- /dev/null +++ b/runtimes/nn/common/include/CpuExecutor.h @@ -0,0 +1,165 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. 
All Rights Reserved + * Copyright (C) 2017 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __NNFW_RT_CPU_EXECUTOR_H__ +#define __NNFW_RT_CPU_EXECUTOR_H__ + +#include "HalInterfaces.h" +#include "OperationsUtils.h" +#include "Utils.h" + +#include <algorithm> +#include <vector> + +namespace nnfw { +namespace rt { + +// Information we maintain about each operand during execution that +// may change during execution. +struct RunTimeOperandInfo { + // TODO Storing the type here is redundant, as it won't change during execution. + OperandType type; + // The type and dimensions of the operand. The dimensions can + // change at runtime. We include the type because it's useful + // to pass together with the dimension to the functions implementing + // the operators. + std::vector<uint32_t> dimensions; + + float scale; + int32_t zeroPoint; + // Where the operand's data is stored. Check the corresponding + // location information in the model to figure out if this points + // to memory we have allocated for an temporary operand. + uint8_t* buffer; + // The length of the buffer. + uint32_t length; + // Whether this is a temporary variable, a model input, a constant, etc. + OperandLifeTime lifetime; + // Keeps track of how many operations have yet to make use + // of this temporary variable. When the count is decremented to 0, + // we free the buffer. For non-temporary variables, this count is + // always 0. + uint32_t numberOfUsesLeft; + + Shape shape() const { + return Shape{.type = type, .dimensions = dimensions, .scale = scale, .offset = zeroPoint}; + } +}; + +// Used to keep a pointer to each of the memory pools. +struct RunTimePoolInfo { +#if 0 // REF-ANN + sp<IMemory> memory; +#endif + hidl_memory hidlMemory; + uint8_t* buffer; + + bool set(const hidl_memory& hidlMemory); + bool update(); +}; + +bool setRunTimePoolInfosFromHidlMemories(std::vector<RunTimePoolInfo>* poolInfos, + const hidl_vec<hidl_memory>& pools); + +// This class is used to execute a model on the CPU. +class CpuExecutor { +public: + // Executes the model. The results will be stored at the locations + // specified in the constructor. + // The model must outlive the executor. We prevent it from being modified + // while this is executing. + int run(const Model& model, const Request& request, + const std::vector<RunTimePoolInfo>& modelPoolInfos, + const std::vector<RunTimePoolInfo>& requestPoolInfos); + +private: + bool initializeRunTimeInfo(const std::vector<RunTimePoolInfo>& modelPoolInfos, + const std::vector<RunTimePoolInfo>& requestPoolInfos); + // Runs one operation of the graph. + int executeOperation(const Operation& entry); + // Decrement the usage count for the operands listed. Frees the memory + // allocated for any temporary variable with a count of zero. + void freeNoLongerUsedOperands(const std::vector<uint32_t>& inputs); + + // The model and the request that we'll execute. Only valid while run() + // is being executed. 
+ const Model* mModel = nullptr; + const Request* mRequest = nullptr; + + // We're copying the list of all the dimensions from the model, as + // these may be modified when we run the operatins. Since we're + // making a full copy, the indexes used in the operand description + // stay valid. + // std::vector<uint32_t> mDimensions; + // Runtime information about all the operands. + std::vector<RunTimeOperandInfo> mOperands; +}; + +namespace { + +template <typename T> +T getScalarData(const RunTimeOperandInfo& info) { + // TODO: Check buffer is at least as long as size of data. + T* data = reinterpret_cast<T*>(info.buffer); + return data[0]; +} + +inline bool IsNullInput(const RunTimeOperandInfo *input) { + return input->lifetime == OperandLifeTime::NO_VALUE; +} + +#if 0 // REF-ANN +inline int NumInputsWithValues(const Operation &operation, + std::vector<RunTimeOperandInfo> &operands) { + const std::vector<uint32_t> &inputs = operation.inputs; + return std::count_if(inputs.begin(), inputs.end(), + [&operands](uint32_t i) { + return !IsNullInput(&operands[i]); + }); +} + +inline int NumOutputs(const Operation &operation) { + return operation.outputs.size(); +} + +inline size_t NumDimensions(const RunTimeOperandInfo *operand) { + return operand->shape().dimensions.size(); +} + +inline uint32_t SizeOfDimension(const RunTimeOperandInfo *operand, int i) { + return operand->shape().dimensions[i]; +} + +inline RunTimeOperandInfo *GetInput(const Operation &operation, + std::vector<RunTimeOperandInfo> &operands, + int index) { + return &operands[operation.inputs[index]]; +} + +inline RunTimeOperandInfo *GetOutput(const Operation &operation, + std::vector<RunTimeOperandInfo> &operands, + int index) { + return &operands[operation.outputs[index]]; +} +#endif + +} // anonymous namespace + +} // namespace rt +} // namespace nnfw + +#endif // __NNFW_RT_CPU_EXECUTOR_H__ diff --git a/runtimes/nn/common/include/HalInterfaces.h b/runtimes/nn/common/include/HalInterfaces.h new file mode 100644 index 000000000..9a086c09d --- /dev/null +++ b/runtimes/nn/common/include/HalInterfaces.h @@ -0,0 +1,82 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (C) 2017 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef __NNFW_RT_HAL_INTERFACES_H__ +#define __NNFW_RT_HAL_INTERFACES_H__ + +#if 0 // REF-ANN +#include <android/hardware/neuralnetworks/1.0/IDevice.h> +#include <android/hardware/neuralnetworks/1.0/IExecutionCallback.h> +#include <android/hardware/neuralnetworks/1.0/IPreparedModel.h> +#include <android/hardware/neuralnetworks/1.0/IPreparedModelCallback.h> +#endif +#include <android/hardware/neuralnetworks/1.0/types.h> + +#if 0 // REF-ANN +#include <android/hidl/allocator/1.0/IAllocator.h> +#include <android/hidl/memory/1.0/IMemory.h> +#include <hidlmemory/mapping.h> +#endif + +using ::android::hardware::hidl_memory; +using ::android::hardware::hidl_vec; +using ::android::hardware::neuralnetworks::V1_0::DataLocation; +using ::android::hardware::neuralnetworks::V1_0::ErrorStatus; +using ::android::hardware::neuralnetworks::V1_0::FusedActivationFunc; +using ::android::hardware::neuralnetworks::V1_0::Model; +using ::android::hardware::neuralnetworks::V1_0::Operand; +using ::android::hardware::neuralnetworks::V1_0::OperandLifeTime; +using ::android::hardware::neuralnetworks::V1_0::OperandType; +using ::android::hardware::neuralnetworks::V1_0::Operation; +using ::android::hardware::neuralnetworks::V1_0::OperationType; +using ::android::hardware::neuralnetworks::V1_0::PerformanceInfo; +using ::android::hardware::neuralnetworks::V1_0::Request; +using ::android::hardware::neuralnetworks::V1_0::RequestArgument; +#if 0 // REF-ANN +using ::android::hardware::Return; +using ::android::hardware::Void; +using ::android::hardware::hidl_memory; +using ::android::hardware::hidl_string; +using ::android::hardware::hidl_vec; +using ::android::hardware::neuralnetworks::V1_0::Capabilities; +using ::android::hardware::neuralnetworks::V1_0::DataLocation; +using ::android::hardware::neuralnetworks::V1_0::DeviceStatus; +using ::android::hardware::neuralnetworks::V1_0::FusedActivationFunc; +using ::android::hardware::neuralnetworks::V1_0::IDevice; +using ::android::hardware::neuralnetworks::V1_0::IExecutionCallback; +using ::android::hardware::neuralnetworks::V1_0::IPreparedModel; +using ::android::hardware::neuralnetworks::V1_0::IPreparedModelCallback; +using ::android::hardware::neuralnetworks::V1_0::Model; +using ::android::hardware::neuralnetworks::V1_0::Operand; +using ::android::hardware::neuralnetworks::V1_0::OperandLifeTime; +using ::android::hardware::neuralnetworks::V1_0::OperandType; +using ::android::hardware::neuralnetworks::V1_0::Operation; +using ::android::hardware::neuralnetworks::V1_0::OperationType; +using ::android::hardware::neuralnetworks::V1_0::PerformanceInfo; +using ::android::hardware::neuralnetworks::V1_0::Request; +using ::android::hardware::neuralnetworks::V1_0::RequestArgument; +using ::android::hidl::allocator::V1_0::IAllocator; +using ::android::hidl::memory::V1_0::IMemory; +#endif + +namespace nnfw { +namespace rt { + +} // namespace rt +} // namespace nnfw + +#endif // __NNFW_RT_HAL_INTERFACES_H__ diff --git a/runtimes/nn/common/include/Logging.h b/runtimes/nn/common/include/Logging.h new file mode 100644 index 000000000..060458b85 --- /dev/null +++ b/runtimes/nn/common/include/Logging.h @@ -0,0 +1,61 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __NNFW_RT_LOGGING_H__ +#define __NNFW_RT_LOGGING_H__ + +#include <iostream> + +namespace nnfw { +namespace rt { + +// TODO-NNRT Move this to proper place +class BoolConfig +{ +public: + BoolConfig(const std::string &tag, bool default_value); + +public: + bool value(void) const { return _value; } + +private: + bool _value; +}; + +class VLogging +{ +public: + static VLogging& access(void); + bool enabled() const { return _enabled; } + std::ostream& stream(void); + +private: + VLogging(); + +private: + bool _enabled; +}; + +#define LOG(...) std::cout << std::endl +#define VLOG(...) if (VLogging::access().enabled()) \ + (VLogging::access().stream() << std::endl) +#define PLOG(...) LOG(...) +#define NYI(module) std::cout << "NYI : '" << module << "' is not supported now." << std::endl; + +} // namespace rt +} // namespace nnfw + +#endif // __NNFW_RT_LOGGING_H__ diff --git a/runtimes/nn/common/include/Operations.h b/runtimes/nn/common/include/Operations.h new file mode 100644 index 000000000..33730ea17 --- /dev/null +++ b/runtimes/nn/common/include/Operations.h @@ -0,0 +1,203 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (C) 2017 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef __NNFW_RT_OPERATIONS_H__ +#define __NNFW_RT_OPERATIONS_H__ + +#if 0 // REF-ANN +#include "operations/EmbeddingLookup.h" +#include "operations/HashtableLookup.h" +#include "operations/LSHProjection.h" +#include "operations/LSTM.h" +#include "operations/RNN.h" +#include "operations/SVDF.h" +#endif + +#include <stddef.h> + +#include <cstdint> +#include <vector> + +namespace nnfw { +namespace rt { + +struct Shape; + +bool addFloat32(const float* in1, const Shape& shape1, + const float* in2, const Shape& shape2, + int32_t activation, + float* out, const Shape& shapeOut); +bool addQuant8(const uint8_t* in1, const Shape& shape1, + const uint8_t* in2, const Shape& shape2, + int32_t activation, + uint8_t* out, const Shape& shapeOut); + +bool mulFloat32(const float* in1, const Shape& shape1, + const float* in2, const Shape& shape2, + int32_t activation, + float* out, const Shape& shapeOut); +bool mulQuant8(const uint8_t* in1, const Shape& shape1, + const uint8_t* in2, const Shape& shape2, + int32_t activation, + uint8_t* out, const Shape& shapeOut); + +bool floorFloat32(const float* inputData, + float* outputData, + const Shape& shape); + +bool dequantizeQuant8ToFloat32(const uint8_t* inputData, + float* outputData, + const Shape& shape); + +bool depthwiseConvFloat32(const float* inputData, const Shape& inputShape, + const float* filterData, const Shape& filterShape, + const float* biasData, const Shape& biasShape, + int32_t padding_left, int32_t padding_right, + int32_t padding_top, int32_t padding_bottom, + int32_t stride_width, int32_t stride_height, + int32_t depth_multiplier, int32_t activation, + float* outputData, const Shape& outputShape); +#if 0 // REF-ANN We don't support depthwiseConvQuant8 yet +bool depthwiseConvQuant8(const uint8_t* inputData, const Shape& inputShape, + const uint8_t* filterData, const Shape& filterShape, + const int32_t* biasData, const Shape& biasShape, + int32_t padding_left, int32_t padding_right, + int32_t padding_top, int32_t padding_bottom, + int32_t stride_width, int32_t stride_height, + int32_t depth_multiplier, int32_t activation, + uint8_t* outputData, const Shape& outputShape); +#endif // REF-ANN + +bool convFloat32(const float* inputData, const Shape& inputShape, + const float* filterData, const Shape& filterShape, + const float* biasData, const Shape& biasShape, + int32_t padding_left, int32_t padding_right, + int32_t padding_top, int32_t padding_bottom, + int32_t stride_width, int32_t stride_height, + int32_t activation, + float* outputData, const Shape& outputShape); +bool convQuant8(const uint8_t* inputData, const Shape& inputShape, + const uint8_t* filterData, const Shape& filterShape, + const int32_t* biasData, const Shape& biasShape, + int32_t padding_left, int32_t padding_right, + int32_t padding_top, int32_t padding_bottom, + int32_t stride_width, int32_t stride_height, + int32_t activation, + uint8_t* outputData, const Shape& outputShape); + +bool averagePoolFloat32(const float* inputData, const Shape& inputShape, + int32_t padding_left, int32_t padding_right, + int32_t padding_top, int32_t padding_bottom, + int32_t stride_width, int32_t stride_height, + int32_t filter_width, int32_t filter_height, int32_t activation, + float* outputData, const Shape& outputShape); +bool averagePoolQuant8(const uint8_t* inputData, const Shape& inputShape, + int32_t padding_left, int32_t padding_right, + int32_t padding_top, int32_t padding_bottom, + int32_t stride_width, int32_t stride_height, + int32_t filter_width, int32_t filter_height, int32_t 
activation, + uint8_t* outputData, const Shape& outputShape); +bool l2PoolFloat32(const float* inputData, const Shape& inputShape, + int32_t padding_left, int32_t padding_right, + int32_t padding_top, int32_t padding_bottom, + int32_t stride_width, int32_t stride_height, + int32_t filter_width, int32_t filter_height, int32_t activation, + float* outputData, const Shape& outputShape); +bool maxPoolFloat32(const float* inputData, const Shape& inputShape, + int32_t padding_left, int32_t padding_right, + int32_t padding_top, int32_t padding_bottom, + int32_t stride_width, int32_t stride_height, + int32_t filter_width, int32_t filter_height, int32_t activation, + float* outputData, const Shape& outputShape); +bool maxPoolQuant8(const uint8_t* inputData, const Shape& inputShape, + int32_t padding_left, int32_t padding_right, + int32_t padding_top, int32_t padding_bottom, + int32_t stride_width, int32_t stride_height, + int32_t filter_width, int32_t filter_height, int32_t activation, + uint8_t* outputData, const Shape& outputShape); + +bool reluFloat32(const float* inputData, const Shape& inputShape, + float* outputData, const Shape& outputShape); +bool relu1Float32(const float* inputData, const Shape& inputShape, + float* outputData, const Shape& outputShape); +bool relu6Float32(const float* inputData, const Shape& inputShape, + float* outputData, const Shape& outputShape); +bool tanhFloat32(const float* inputData, const Shape& inputShape, + float* outputData, const Shape& outputShape); +bool logisticFloat32(const float* inputData, const Shape& inputShape, + float* outputData, const Shape& outputShape); +bool softmaxFloat32(const float* inputData, const Shape& inputShape, + const float beta, + float* outputData, const Shape& outputShape); +bool reluQuant8(const uint8_t* inputData, const Shape& inputShape, + uint8_t* outputData, const Shape& outputShape); +bool relu1Quant8(const uint8_t* inputData, const Shape& inputShape, + uint8_t* outputData, const Shape& outputShape); +bool relu6Quant8(const uint8_t* inputData, const Shape& inputShape, + uint8_t* outputData, const Shape& outputShape); +bool logisticQuant8(const uint8_t* inputData, const Shape& inputShape, + uint8_t* outputData, const Shape& outputShape); +bool softmaxQuant8(const uint8_t* inputData, const Shape& inputShape, + const float beta, + uint8_t* outputData, const Shape& outputShape); + +bool fullyConnectedFloat32(const float* inputData, const Shape& inputShape, + const float* weights, const Shape& weightsShape, + const float* biasData, const Shape& biasShape, + int32_t activation, + float* outputData, const Shape& outputShape); +bool fullyConnectedQuant8(const uint8_t* inputData, const Shape& inputShape, + const uint8_t* weights, const Shape& weightsShape, + const int32_t* biasData, const Shape& biasShape, + int32_t activation, + uint8_t* outputData, const Shape& outputShape); + +bool concatenationFloat32(const std::vector<const float*>& inputDataPtrs, + const std::vector<Shape>& inputShapes, int32_t axis, + float* outputData, const Shape& outputShape); +bool concatenationQuant8(const std::vector<const uint8_t*>& inputDataPtrs, + const std::vector<Shape>& inputShapes, int32_t axis, + uint8_t* outputData, const Shape& outputShape); + +bool l2normFloat32(const float* inputData, const Shape& inputShape, + float* outputData, const Shape& outputShape); +bool l2normQuant8(const uint8_t* inputData, const Shape& inputShape, + uint8_t* outputData, const Shape& outputShape); +bool localResponseNormFloat32(const float* inputData, const Shape& 
inputShape, + int32_t radius, float bias, float alpha, float beta, + float* outputData, const Shape& outputShape); + +bool reshapeGeneric(const void* inputData, const Shape& inputShape, + void* outputData, const Shape& outputShape); + +bool resizeBilinearFloat32(const float* inputData, + const Shape& inputShape, + float* outputData, + const Shape& outputShape); + +bool depthToSpaceGeneric(const uint8_t* inputData, const Shape& inputShape, + int32_t blockSize, + uint8_t* outputData, const Shape& outputShape); + +bool spaceToDepthGeneric(const uint8_t* inputData, const Shape& inputShape, + int32_t blockSize, + uint8_t* outputData, const Shape& outputShape); + +} // namespace rt +} // namespace nnfw + +#endif // __NNFW_RT_OPERATIONS_H__ diff --git a/runtimes/nn/common/include/OperationsUtils.h b/runtimes/nn/common/include/OperationsUtils.h new file mode 100644 index 000000000..c66ad891b --- /dev/null +++ b/runtimes/nn/common/include/OperationsUtils.h @@ -0,0 +1,247 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (C) 2017 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __NNFW_RT_OPERATIONS_UTILS_H__ +#define __NNFW_RT_OPERATIONS_UTILS_H__ + +#include "Utils.h" + +#include <cstdint> +#include <vector> + +// Macro to check if the input parameters for operation are valid or not. +#define NN_CHECK(v) \ + do { \ + if (!(v)) { \ + LOG(ERROR) << "NN_CHECK failed: " << #v << "'\n"; \ + return false; \ + } \ + } while(0); + +#define NN_CHECK_EQ(actual, expected) \ + NN_CHECK((actual) == (expected)) + +#define NN_OPS_CHECK NN_CHECK + +namespace nnfw { +namespace rt { + +enum PaddingScheme { + kPaddingUnknown = 0, + kPaddingSame = 1, + kPaddingValid = 2, +}; + +// The type and dimensions of an operand. +struct Shape { + OperandType type; + std::vector<uint32_t> dimensions; + float scale; + int32_t offset; +}; + +// Verifies that the two shapes are the same. +bool SameShape(const Shape& in1, const Shape& in2); + +// Sets out to the same shape as in. +bool SetShape(const Shape& in, Shape* out); + +// Return the total number of elements, i.e. all the dimensions multiplied +// together. For a scalar, returns one. 
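+// For example, dimensions {2, 3, 4} give 2 * 3 * 4 = 24 elements.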
+uint32_t getNumberOfElements(const Shape& shape); + +uint32_t getNumberOfDimensions(const Shape& shape); + +uint32_t getSizeOfDimension(const Shape& shape, uint32_t dimensionIdx); + +inline uint32_t computeOutSize(uint32_t imageSize, uint32_t filterSize, uint32_t stride, + uint32_t paddingHead, uint32_t paddingTail) { + return (imageSize - filterSize + stride + paddingHead + paddingTail) / stride; +} + +__wur +bool QuantizeMultiplierSmallerThanOne(double double_multiplier, + int32_t* quantized_multiplier, + int32_t* right_shift); + +__wur +bool QuantizeMultiplierGreaterThanOne(double double_multiplier, + int32_t* quantized_multiplier, + int* left_shift); + +__wur +bool GetQuantizedConvolutionMultipler(const Shape& inputShape, + const Shape& filterShape, + const Shape& biasShape, + const Shape& outputShape, + float* multiplier); + +void CalculateActivationRangeUint8(int32_t activation, + const Shape& outputShape, + int32_t* act_min, + int32_t* act_max); + +int32_t CalculateInputRadius(int input_integer_bits, int input_left_shift); + +inline void calculateExplicitPadding(int32_t in_size, int32_t stride, + int32_t filter_size, int32_t padding_implicit, + int32_t* padding_head, int32_t* padding_tail) { + *padding_head = 0; + *padding_tail = 0; + + if (padding_implicit == kPaddingSame) { + int32_t out_size = (in_size + stride - 1) / stride; + int32_t tmp = (out_size - 1) * stride + filter_size; + if (tmp > in_size) { + *padding_head = (tmp - in_size) / 2; + *padding_tail = (tmp - in_size) - *padding_head; + } + } +} + +inline PaddingScheme getPaddingScheme(int32_t inWidth, int32_t inHeight, + int32_t strideWidth, int32_t strideHeight, + int32_t filterWidth, int32_t filterHeight, + int32_t paddingLeft, int32_t paddingRight, + int32_t paddingTop, int32_t paddingBottom) { + if (paddingLeft == 0 && paddingRight == 0 && paddingTop == 0 && paddingBottom == 0) { + return kPaddingValid; + } + + int32_t expectedPaddingLeft, expectedPaddingRight; + int32_t expectedPaddingTop, expectedPaddingBottom; + + calculateExplicitPadding(inWidth, strideWidth, filterWidth, kPaddingSame, + &expectedPaddingLeft, &expectedPaddingRight); + calculateExplicitPadding(inHeight, strideHeight, filterHeight, kPaddingSame, + &expectedPaddingTop, &expectedPaddingBottom); + if (expectedPaddingLeft == paddingLeft && expectedPaddingRight == paddingRight && + expectedPaddingTop == paddingTop && expectedPaddingBottom == paddingBottom) { + return kPaddingSame; + } else { + return kPaddingUnknown; + } +} + +// Preparation functions for the corresponding ops +bool addMulPrepare(const Shape& in1, const Shape& in2, Shape* out1); + +bool floorPrepare(const Shape& input, Shape* output); + +bool dequantizePrepare(const Shape& input, Shape* output); + +bool depthwiseConvPrepare(const Shape& input, + const Shape& filter, + const Shape& bias, + int32_t padding_left, int32_t padding_right, + int32_t padding_top, int32_t padding_bottom, + int32_t stride_width, int32_t stride_height, + Shape* output); + +bool convPrepare(const Shape& input, + const Shape& filter, + const Shape& bias, + int32_t padding_left, int32_t padding_right, + int32_t padding_top, int32_t padding_bottom, + int32_t stride_width, int32_t stride_height, + Shape* output); + +bool genericPoolingPrepare(const Shape& input, + int32_t padding_left, int32_t padding_right, + int32_t padding_top, int32_t padding_bottom, + int32_t stride_width, int32_t stride_height, + int32_t filter_width, int32_t filter_height, + Shape* output); + +bool genericActivationPrepare(const Shape& input, 
Shape* output); + +bool fullyConnectedPrepare(const Shape& input, + const Shape& weights, + const Shape& bias, + Shape* output); + +bool concatenationPrepare(const std::vector<Shape>& inputShapes, + int32_t axis, + Shape* output); + +bool genericNormalizationPrepare(const Shape& input, Shape* output); + +bool reshapePrepare(const Shape& input, + const int32_t* targetDims, + const int32_t targetDimsSize, + Shape* output); + +bool resizeBilinearPrepare(const Shape& input, + int32_t height, + int32_t width, + Shape* output); + +bool depthToSpacePrepare(const Shape& input, + int32_t blockSize, + Shape* output); + +bool spaceToDepthPrepare(const Shape& input, + int32_t blockSize, + Shape* output); + +bool embeddingLookupPrepare(const Shape &valueShape, + const Shape &lookupShape, + Shape *outputShape); + +bool hashtableLookupPrepare(const Shape &lookupShape, + const Shape &keyShape, + const Shape &valueShape, + Shape *outputShape, + Shape *hitShape); + +#define ANDROID_NN_MACRO_DISPATCH_INTERNAL(macro) \ + case (int32_t) FusedActivationFunc::NONE: \ + macro(kNone); \ + break; \ + case (int32_t) FusedActivationFunc::RELU: \ + macro(kRelu); \ + break; \ + case (int32_t) FusedActivationFunc::RELU1: \ + macro(kRelu1); \ + break; \ + case (int32_t) FusedActivationFunc::RELU6: \ + macro(kRelu6); \ + break; + +#define ANDROID_NN_MACRO_DISPATCH(macro) \ + switch (activation) { \ + ANDROID_NN_MACRO_DISPATCH_INTERNAL(macro) \ + default: \ + LOG(ERROR) << "Unsupported fused activation function type"; \ + return false; \ + } + +#define ANDROID_NN_MACRO_DISPATCH_WITH_DELETE(macro) \ + switch (activation) { \ + ANDROID_NN_MACRO_DISPATCH_INTERNAL(macro) \ + default: \ + LOG(ERROR) << "Unsupported fused activation function type"; \ + if (im2colByteSize > kStaticBufferSize) { \ + delete[] im2colData; \ + } \ + return false; \ + } + +} // namespace rt +} // namespace nnfw + +#endif // __NNFW_RT_OPERATIONS_UTILS_H__ diff --git a/runtimes/nn/common/include/Utils.h b/runtimes/nn/common/include/Utils.h new file mode 100644 index 000000000..aae4cff90 --- /dev/null +++ b/runtimes/nn/common/include/Utils.h @@ -0,0 +1,128 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (C) 2017 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __NNFW_RT_UTILS_H__ +#define __NNFW_RT_UTILS_H__ + +#include "HalInterfaces.h" +#include "NeuralNetworks.h" +#include "Logging.h" + +#include <vector> + +namespace nnfw { +namespace rt { + +// The number of data types (OperandCode) defined in NeuralNetworks.h. +const int kNumberOfDataTypes = 6; + +// The number of operation types (OperationCode) defined in NeuralNetworks.h. +const int kNumberOfOperationTypes = 30; + +// The number of execution preferences defined in NeuralNetworks.h. +const int kNumberOfPreferences = 3; + +// The number of data types (OperandCode) defined in NeuralNetworksOEM.h. 
+const int kNumberOfDataTypesOEM = 2; + +// The number of operation types (OperationCode) defined in NeuralNetworksOEM.h. +const int kNumberOfOperationTypesOEM = 1; + +// The lowest number assigned to any OEM Code in NeuralNetworksOEM.h. +const int kOEMCodeBase = 10000; + +// Assert macro, as Android does not generally support assert. +#define nnAssert(v) \ + do { \ + if (!(v)) { \ + LOG(ERROR) << "nnAssert failed at " << __FILE__ << ":" << __LINE__ << " - '" << #v \ + << "'\n"; \ + abort(); \ + } \ + } while (0) +// Returns the amount of space needed to store a value of the specified +// dimensions and type. +uint32_t sizeOfData(OperandType type, const std::vector<uint32_t>& dimensions); + +// Returns the amount of space needed to store a value of the dimensions and +// type of this operand. +inline uint32_t sizeOfData(const Operand& operand) { + return sizeOfData(operand.type, operand.dimensions); +} + +// Returns the name of the operation in ASCII. +const char* getOperationName(OperationType opCode); +// Memory is unmapped. +// Memory is reference counted by hidl_memory instances, and is deallocated +// once there are no more references. +hidl_memory allocateSharedMemory(int64_t size); + +// Returns the number of padding bytes needed to align data of the +// specified length. It aligns object of length: +// 2, 3 on a 2 byte boundary, +// 4+ on a 4 byte boundary. +// We may want to have different alignments for tensors. +// TODO: This is arbitrary, more a proof of concept. We need +// to determine what this should be. +uint32_t alignBytesNeeded(uint32_t index, size_t length); + +inline void setFromIntList(hidl_vec<uint32_t>* vec, uint32_t count, const uint32_t* data) { + vec->resize(count); + for (uint32_t i = 0; i < count; i++) { + (*vec)[i] = data[i]; + } +} + +inline void setFromIntList(std::vector<uint32_t>* vec, uint32_t count, const uint32_t* data) { + vec->resize(count); + for (uint32_t i = 0; i < count; i++) { + (*vec)[i] = data[i]; + } +} + +inline std::string toString(uint32_t obj) { + return std::to_string(obj); +} + +template <typename Type> +std::string toString(const std::vector<Type>& range) { + std::string os = "["; + for (size_t i = 0; i < range.size(); ++i) { + os += (i == 0 ? "" : ", ") + toString(range[i]); + } + return os += "]"; +} + +inline bool validCode(uint32_t codeCount, uint32_t codeCountOEM, uint32_t code) { + return (code < codeCount) || (code >= kOEMCodeBase && (code - kOEMCodeBase) < codeCountOEM); +} + +int validateOperandType(const ANeuralNetworksOperandType& type, const char* tag, bool allowPartial); +int validateOperandList(uint32_t count, const uint32_t* list, uint32_t operandCount, + const char* tag); + +bool validateModel(const Model& model); +bool validateRequest(const Request& request, const Model& model); + +inline size_t getSizeFromInts(int lower, int higher) { + return (uint32_t)(lower) + ((uint64_t)(uint32_t)(higher) << 32); +} + +} // namespace rt +} // namespace nnfw + +#endif // __NNFW_RT_UTILS_H__ diff --git a/runtimes/nn/common/operations/Activation.cpp b/runtimes/nn/common/operations/Activation.cpp new file mode 100644 index 000000000..091ffabb3 --- /dev/null +++ b/runtimes/nn/common/operations/Activation.cpp @@ -0,0 +1,211 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (C) 2017 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "Operations.h" +#include "OperationsUtils.h" + +#include "internal/optimized/optimized_ops.h" + +// TODO-NNRT: There was no inlcude "ActivationFunctor.h" in Android NN. +// This may be included from some other header files. +#include "ActivationFunctor.h" + +namespace nnfw { +namespace rt { + +bool reluFloat32(const float* inputData, const Shape& inputShape, + float* outputData, const Shape& outputShape) { + int numElements = getNumberOfElements(inputShape); + for (int i=0; i<numElements; i++, inputData++, outputData++) { + *outputData = std::max(0.f, *inputData); + } + return true; +} + +bool relu1Float32(const float* inputData, const Shape& inputShape, + float* outputData, const Shape& outputShape) { + int numElements = getNumberOfElements(inputShape); + for (int i=0; i<numElements; i++, inputData++, outputData++) { + *outputData = std::min(std::max(-1.f, *inputData), 1.f); + } + return true; +} + +bool relu6Float32(const float* inputData, const Shape& inputShape, + float* outputData, const Shape& outputShape) { + int numElements = getNumberOfElements(inputShape); + for (int i=0; i<numElements; i++, inputData++, outputData++) { + *outputData = std::min(std::max(0.f, *inputData), 6.f); + } + return true; +} + +bool tanhFloat32(const float* inputData, const Shape& inputShape, + float* outputData, const Shape& outputShape) { + int numElements = getNumberOfElements(inputShape); + for (int i=0; i<numElements; i++, inputData++, outputData++) { + *outputData = std::tanh(*inputData); + } + return true; +} + +bool logisticFloat32(const float* inputData, const Shape& inputShape, + float* outputData, const Shape& outputShape) { + int numElements = getNumberOfElements(inputShape); + for (int i=0; i<numElements; i++, inputData++, outputData++) { + *outputData = 1.f / (1.f + std::exp(-*inputData)); + } + return true; +} + +bool softmaxFloat32(const float* inputData, const Shape& inputShape, + const float beta, + float* outputData, const Shape& outputShape) { + Dims<4> dim; + if (getNumberOfDimensions(inputShape) == 2) { + uint32_t batch_size = getSizeOfDimension(inputShape, 0); + uint32_t input_size = getNumberOfElements(inputShape) / batch_size; + + Shape shapeIn4D; + shapeIn4D.dimensions = {batch_size, 1, 1, input_size}; + dim = convertShapeToDims(shapeIn4D); + } else if (getNumberOfDimensions(inputShape) == 4) { + dim = convertShapeToDims(inputShape); + } else { + LOG(ERROR) << "only 2D and 4D tensors supported"; + return false; + } + + optimized_ops::Softmax(inputData, dim, beta, + outputData, dim); + return true; +} + +#define ANDROID_NN_RELUX_QUANT8(activation) \ + int numElements = getNumberOfElements(inputShape); \ + int32_t output_activation_min = 0; \ + int32_t output_activation_max = 0; \ + \ + CalculateActivationRangeUint8(activation, inputShape, \ + &output_activation_min, \ + &output_activation_max); \ + \ + for (int i=0; i<numElements; i++, inputData++, outputData++) { \ + *outputData = std::min((uint8_t)output_activation_max, \ + std::max((uint8_t)output_activation_min, *inputData)); \ + } + + +bool reluQuant8(const uint8_t* inputData, const Shape& 
inputShape, + uint8_t* outputData, const Shape& outputShape) { + ANDROID_NN_RELUX_QUANT8(kActivationRelu) + return true; +} + +bool relu1Quant8(const uint8_t* inputData, const Shape& inputShape, + uint8_t* outputData, const Shape& outputShape) { + ANDROID_NN_RELUX_QUANT8(kActivationRelu1) + return true; +} + +bool relu6Quant8(const uint8_t* inputData, const Shape& inputShape, + uint8_t* outputData, const Shape& outputShape) { + ANDROID_NN_RELUX_QUANT8(kActivationRelu6) + return true; +} + +#undef ANDROID_NN_RELUX_QUANT8 + +bool logisticQuant8(const uint8_t* inputData, const Shape& inputShape, + uint8_t* outputData, const Shape& outputShape) { + if (outputShape.offset != 0 || outputShape.scale != 1.f / 256) { + LOG(ERROR) << "incorrect scale / offset for output"; + return false; + } + + static constexpr int kInputIntegerBits = 4; + + const double input_real_multiplier = + inputShape.scale * + static_cast<double>(1 << (31 - kInputIntegerBits)); + + int32_t input_multiplier = 0; + int32_t input_left_shift = 0; + if (!QuantizeMultiplierGreaterThanOne(input_real_multiplier, + &input_multiplier, + &input_left_shift)) { + return false; + } + int32_t input_range_radius = + CalculateInputRadius(kInputIntegerBits, input_left_shift); + + optimized_ops::Logistic( + inputData, convertShapeToDims(inputShape), + inputShape.offset, input_range_radius, + input_multiplier, input_left_shift, + outputData, convertShapeToDims(outputShape)); + + return true; +} + +bool softmaxQuant8(const uint8_t* inputData, const Shape& inputShape, + const float beta, + uint8_t* outputData, const Shape& outputShape) { + Dims<4> dim; + if (getNumberOfDimensions(inputShape) == 2) { + uint32_t batch_size = getSizeOfDimension(inputShape, 0); + uint32_t input_size = getNumberOfElements(inputShape) / batch_size; + + Shape shapeIn4D; + shapeIn4D.dimensions = {batch_size, 1, 1, input_size}; + dim = convertShapeToDims(shapeIn4D); + } else if (getNumberOfDimensions(inputShape) == 4) { + dim = convertShapeToDims(inputShape); + } else { + LOG(ERROR) << "only 2D and 4D tensors supported"; + return false; + } + + if (outputShape.offset != 0 || outputShape.scale != 1.f / 256) { + LOG(ERROR) << "incorrect scale / offset for output"; + return false; + } + + static const int32_t kScaledDiffIntegerBits = 5; + const double input_beta_real_multiplier = std::min( + 1.0 * beta * inputShape.scale * (1 << (31 - kScaledDiffIntegerBits)), + (1ll << 31) - 1.0); + + int32_t input_multiplier = 0; + int32_t input_left_shift = 0; + if (!QuantizeMultiplierGreaterThanOne(input_beta_real_multiplier, + &input_multiplier, + &input_left_shift)) { + return false; + } + float diff_min = -1.0f * CalculateInputRadius(kScaledDiffIntegerBits, + input_left_shift); + + optimized_ops::Softmax(inputData, dim, input_multiplier, + input_left_shift, diff_min, + outputData, dim); + return true; +} + + +} // namespace rt +} // namespace nnfw diff --git a/runtimes/nn/common/operations/Concatenation.cpp b/runtimes/nn/common/operations/Concatenation.cpp new file mode 100644 index 000000000..55de24d4d --- /dev/null +++ b/runtimes/nn/common/operations/Concatenation.cpp @@ -0,0 +1,64 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (C) 2017 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "Operations.h" +#include "OperationsUtils.h" + +#include "internal/optimized/optimized_ops.h" + +namespace nnfw { +namespace rt { + +bool concatenationFloat32(const std::vector<const float*>& inputDataPtrs, + const std::vector<Shape>& inputShapes, int32_t axis, + float* outputData, const Shape& outputShape) { + int num_inputs = inputShapes.size(); + std::vector<Dims<4>*> inputDimsPtr(num_inputs); + std::vector<Dims<4> > inputDims(num_inputs); + for (int i=0; i<num_inputs; i++) { + inputDims[i] = convertShapeToDims(inputShapes[i]); + inputDimsPtr[i] = &inputDims[i]; + } + + optimized_ops::Concatenation<FusedActivationFunctionType::kNone, float>( + getNumberOfDimensions(outputShape) - axis - 1, + inputDataPtrs.data(), inputDimsPtr.data(), num_inputs, + outputData, convertShapeToDims(outputShape)); + + return true; +} + +bool concatenationQuant8(const std::vector<const uint8_t*>& inputDataPtrs, + const std::vector<Shape>& inputShapes, int32_t axis, + uint8_t* outputData, const Shape& outputShape) { + int num_inputs = inputShapes.size(); + std::vector<Dims<4>*> inputDimsPtr(num_inputs); + std::vector<Dims<4> > inputDims(num_inputs); + for (int i=0; i<num_inputs; i++) { + inputDims[i] = convertShapeToDims(inputShapes[i]); + inputDimsPtr[i] = &inputDims[i]; + } + + optimized_ops::Concatenation<FusedActivationFunctionType::kNone, uint8_t>( + getNumberOfDimensions(outputShape) - axis - 1, + inputDataPtrs.data(), inputDimsPtr.data(), num_inputs, + outputData, convertShapeToDims(outputShape)); + + return true; +} +} // namespace rt +} // namespace nnfw diff --git a/runtimes/nn/common/operations/Conv2D.cpp b/runtimes/nn/common/operations/Conv2D.cpp new file mode 100644 index 000000000..01f6797e3 --- /dev/null +++ b/runtimes/nn/common/operations/Conv2D.cpp @@ -0,0 +1,154 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (C) 2017 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "Operations.h" +#include "OperationsUtils.h" + +#include "internal/optimized/optimized_ops.h" + +namespace nnfw { +namespace rt { + +// If possible we will use this static buffer for the tensor. 
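The "tensor" in question is the im2col patch matrix built by the convolution kernels below: one row per output pixel, with inDepth * filterHeight * filterWidth entries each. When that matrix fits in the roughly 1.5 MiB static scratch buffer declared next, the kernels reuse it; otherwise they fall back to a heap allocation (new (std::nothrow)) that is freed once the kernel has run, with ANDROID_NN_MACRO_DISPATCH_WITH_DELETE covering the early-return path for an unsupported activation.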
+static constexpr int kStaticBufferSize = 1605632; +static char static_scratch_buffer[kStaticBufferSize]; + +#define ANDROID_NN_CONV_PARAMETERS(Type) \ + uint32_t height = getSizeOfDimension(inputShape, 1); \ + uint32_t width = getSizeOfDimension(inputShape, 2); \ + uint32_t filterHeight = getSizeOfDimension(filterShape, 1); \ + uint32_t filterWidth = getSizeOfDimension(filterShape, 2); \ + uint32_t outHeight = getSizeOfDimension(outputShape, 1); \ + uint32_t outWidth = getSizeOfDimension(outputShape, 2); \ + uint32_t inDepth = getSizeOfDimension(inputShape, 3); \ + \ + uint32_t paddingHeight = (uint32_t)padding_top; \ + uint32_t paddingWidth = (uint32_t)padding_left; \ + \ + Dims<4> im2colDim; \ + im2colDim.sizes[3] = (int)getSizeOfDimension(outputShape, 0); \ + im2colDim.sizes[2] = (int)getSizeOfDimension(outputShape, 1); \ + im2colDim.sizes[1] = (int)getSizeOfDimension(outputShape, 2); \ + im2colDim.sizes[0] = (int)inDepth * filterHeight * filterWidth; \ + \ + im2colDim.strides[0] = 1; \ + for (int i=1; i<4; i++) { \ + im2colDim.strides[i] = im2colDim.strides[i-1] * im2colDim.sizes[i-1]; \ + } \ + \ + Type* im2colData = nullptr; \ + int im2colByteSize = sizeof(Type); \ + for (int i=0; i<4; i++) { \ + im2colByteSize *= im2colDim.sizes[i]; \ + } \ + if (im2colByteSize <= kStaticBufferSize) { \ + im2colData = reinterpret_cast<Type *>(static_scratch_buffer); \ + } else { \ + im2colData = new (std::nothrow) Type[im2colByteSize / sizeof(Type)]; \ + } + +bool convFloat32(const float* inputData, const Shape& inputShape, + const float* filterData, const Shape& filterShape, + const float* biasData, const Shape& biasShape, + int32_t padding_left, int32_t padding_right, + int32_t padding_top, int32_t padding_bottom, + int32_t stride_width, int32_t stride_height, + int32_t activation, + float* outputData, const Shape& outputShape) { + + ANDROID_NN_CONV_PARAMETERS(float) + + #define ANDROID_NN_CONV(activation) \ + optimized_ops::Conv<FusedActivationFunctionType::activation>( \ + inputData, convertShapeToDims(inputShape), \ + filterData, convertShapeToDims(filterShape), \ + biasData, convertShapeToDims(biasShape), \ + stride_width, stride_height, paddingWidth, paddingHeight, \ + outputData, convertShapeToDims(outputShape), \ + im2colData, im2colDim) + + ANDROID_NN_MACRO_DISPATCH_WITH_DELETE(ANDROID_NN_CONV) + #undef ANDROID_NN_CONV + + if (im2colByteSize > kStaticBufferSize) { + delete[] im2colData; + } + return true; +} + +bool convQuant8(const uint8_t* inputData, const Shape& inputShape, + const uint8_t* filterData, const Shape& filterShape, + const int32_t* biasData, const Shape& biasShape, + int32_t padding_left, int32_t padding_right, + int32_t padding_top, int32_t padding_bottom, + int32_t stride_width, int32_t stride_height, + int32_t activation, + uint8_t* outputData, const Shape& outputShape) { + + ANDROID_NN_CONV_PARAMETERS(uint8_t) + + int32_t inputOffset = -inputShape.offset; + int32_t filterOffset = -filterShape.offset; + int32_t outputOffset = outputShape.offset; + + float real_multiplier = 0.0; + int32_t output_multiplier = 0; + int32_t output_shift = 0; + int32_t output_activation_min = 0; + int32_t output_activation_max = 0; + + if (!GetQuantizedConvolutionMultipler(inputShape, filterShape, biasShape, + outputShape, &real_multiplier) || + !QuantizeMultiplierSmallerThanOne(real_multiplier, &output_multiplier, + &output_shift)){ + // Following code inserted to resolve Coverity (118950 Resource leak) + if (im2colByteSize > kStaticBufferSize) { + delete[] im2colData; + } + return false; + 
} + CalculateActivationRangeUint8(activation, outputShape, + &output_activation_min, + &output_activation_max); + + static gemmlowp::GemmContext gemm_context; + // Alow gemmlowp automatcally decide how many threads to use. + gemm_context.set_max_num_threads(0); + + #define ANDROID_NN_CONV(activation) \ + optimized_ops::Conv<FusedActivationFunctionType::activation>( \ + inputData, convertShapeToDims(inputShape), inputOffset, \ + filterData, convertShapeToDims(filterShape), filterOffset, \ + biasData, convertShapeToDims(biasShape), \ + stride_width, stride_height, paddingWidth, paddingHeight, \ + outputOffset, output_multiplier, output_shift, \ + output_activation_min, output_activation_max, \ + outputData, convertShapeToDims(outputShape), \ + im2colData, im2colDim, &gemm_context) + + ANDROID_NN_MACRO_DISPATCH_WITH_DELETE(ANDROID_NN_CONV) + #undef ANDROID_NN_CONV + + if (im2colByteSize > kStaticBufferSize) { + delete[] im2colData; + } + return true; +} + +#undef ANDROID_NN_CONV_PARAMETERS +} // namespace rt +} // namespace nnfw diff --git a/runtimes/nn/common/operations/DepthwiseConv2D.cpp b/runtimes/nn/common/operations/DepthwiseConv2D.cpp new file mode 100644 index 000000000..94a78f942 --- /dev/null +++ b/runtimes/nn/common/operations/DepthwiseConv2D.cpp @@ -0,0 +1,119 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (C) 2017 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "Operations.h" +#include "OperationsUtils.h" + +#include "internal/optimized/depthwiseconv_float.h" +#if 0 // REF-ANN We don't support depthwiseConvQuant8 yet +#include "internal/optimized/depthwiseconv_uint8.h" +#endif + +namespace nnfw { +namespace rt { + +#define ANDROID_NN_DEPTHWISE_CONV_PARAMETERS \ + uint32_t height = getSizeOfDimension(inputShape, 1); \ + uint32_t width = getSizeOfDimension(inputShape, 2); \ + uint32_t filterHeight = getSizeOfDimension(filterShape, 1); \ + uint32_t filterWidth = getSizeOfDimension(filterShape, 2); \ + uint32_t outHeight = getSizeOfDimension(outputShape, 1); \ + uint32_t outWidth = getSizeOfDimension(outputShape, 2); \ + \ + uint32_t paddingHeight = (uint32_t)padding_top; \ + uint32_t paddingWidth = (uint32_t)padding_left; + +bool depthwiseConvFloat32(const float* inputData, const Shape& inputShape, + const float* filterData, const Shape& filterShape, + const float* biasData, const Shape& biasShape, + int32_t padding_left, int32_t padding_right, + int32_t padding_top, int32_t padding_bottom, + int32_t stride_width, int32_t stride_height, + int32_t depth_multiplier, int32_t activation, + float* outputData, const Shape& outputShape) { + + ANDROID_NN_DEPTHWISE_CONV_PARAMETERS + + #define ANDROID_NN_DEPTHWISE_CONV(activation) \ + optimized_ops::DepthwiseConv<FusedActivationFunctionType::activation>( \ + inputData, convertShapeToDims(inputShape), \ + filterData, convertShapeToDims(filterShape), \ + biasData, convertShapeToDims(biasShape), \ + stride_width, stride_height, \ + paddingWidth, paddingHeight, depth_multiplier, \ + outputData, convertShapeToDims(outputShape)) + + ANDROID_NN_MACRO_DISPATCH(ANDROID_NN_DEPTHWISE_CONV) + #undef ANDROID_NN_DEPTHWISE_CONV + + return true; +} + + +#if 0 // REF-ANN We don't support depthwiseConvQuant8 yet +bool depthwiseConvQuant8(const uint8_t* inputData, const Shape& inputShape, + const uint8_t* filterData, const Shape& filterShape, + const int32_t* biasData, const Shape& biasShape, + int32_t padding_left, int32_t padding_right, + int32_t padding_top, int32_t padding_bottom, + int32_t stride_width, int32_t stride_height, + int32_t depth_multiplier, int32_t activation, + uint8_t* outputData, const Shape& outputShape) { + + ANDROID_NN_DEPTHWISE_CONV_PARAMETERS + + float real_multiplier = 0.0; + int32_t output_multiplier = 0; + int32_t output_shift = 0; + int32_t output_activation_min = 0; + int32_t output_activation_max = 0; + + + if (!GetQuantizedConvolutionMultipler(inputShape, filterShape, biasShape, + outputShape, &real_multiplier) || + !QuantizeMultiplierSmallerThanOne(real_multiplier, &output_multiplier, + &output_shift)) { + return false; + } + CalculateActivationRangeUint8(activation, outputShape, + &output_activation_min, + &output_activation_max); + + uint32_t inputOffset = -inputShape.offset; + uint32_t filterOffset = -filterShape.offset; + uint32_t outputOffset = outputShape.offset; + #define ANDROID_NN_DEPTHWISE_CONV(activation) \ + optimized_ops::DepthwiseConv<FusedActivationFunctionType::activation>( \ + inputData, convertShapeToDims(inputShape), inputOffset, \ + filterData, convertShapeToDims(filterShape), filterOffset, \ + biasData, convertShapeToDims(biasShape), \ + stride_width, stride_height, \ + paddingWidth, paddingHeight, depth_multiplier, \ + outputOffset, output_multiplier, output_shift, \ + output_activation_min, output_activation_max, \ + outputData, convertShapeToDims(outputShape)) + + ANDROID_NN_MACRO_DISPATCH(ANDROID_NN_DEPTHWISE_CONV) + #undef ANDROID_NN_DEPTHWISE_CONV + + 
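Both quantized convolution paths (Conv2D.cpp above and this disabled depthwise variant) requantize their int32 accumulators through the pair GetQuantizedConvolutionMultipler / QuantizeMultiplierSmallerThanOne. The combined multiplier (input_scale * filter_scale / output_scale in the Android NN reference code) is expressed as a Q0.31 fixed-point value plus a right shift; the figures below are illustrative only:

    // Example scales: input 0.5, filter 0.25, output 1.0
    //   real_multiplier = 0.5 * 0.25 / 1.0 = 0.125
    // QuantizeMultiplierSmallerThanOne(0.125, &m, &shift) gives
    //   m = 1 << 30   (0.5 in Q0.31),   shift = 2
    // so each accumulator is scaled as
    //   out = RoundingDivideByPOT(SaturatingRoundingDoublingHighMul(acc, m), shift)
    //       ~= (acc * 0.5) / 4 = acc * 0.125
    // before the output zero point is added and the result is clamped to the
    // range produced by CalculateActivationRangeUint8().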
return true; +} +#endif // REF-ANN + +#undef ANDROID_NN_DEPTHWISE_CONV_PARAMETERS +} // namespace rt +} // namespace nnfw diff --git a/runtimes/nn/common/operations/FullyConnected.cpp b/runtimes/nn/common/operations/FullyConnected.cpp new file mode 100644 index 000000000..393d0ff9e --- /dev/null +++ b/runtimes/nn/common/operations/FullyConnected.cpp @@ -0,0 +1,87 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (C) 2017 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "Operations.h" +#include "OperationsUtils.h" + +#include "internal/optimized/optimized_ops.h" + +namespace nnfw { +namespace rt { + +bool fullyConnectedFloat32(const float* inputData, const Shape& inputShape, + const float* weightsData, const Shape& weightsShape, + const float* biasData, const Shape& biasShape, + int32_t activation, + float* outputData, const Shape& outputShape) { + + #define ANDROID_NN_FULLY_CONNECTED(activation) \ + optimized_ops::FullyConnected<FusedActivationFunctionType::activation>( \ + inputData, convertShapeToDims(inputShape), \ + weightsData, convertShapeToDims(weightsShape), \ + biasData, convertShapeToDims(biasShape), \ + outputData, convertShapeToDims(outputShape)) + + ANDROID_NN_MACRO_DISPATCH(ANDROID_NN_FULLY_CONNECTED) + #undef ANDROID_NN_FULLY_CONNECTED + return true; +} + +bool fullyConnectedQuant8(const uint8_t* inputData, const Shape& inputShape, + const uint8_t* weightsData, const Shape& weightsShape, + const int32_t* biasData, const Shape& biasShape, + int32_t activation, + uint8_t* outputData, const Shape& outputShape) { + int32_t inputOffset = -inputShape.offset; + int32_t weightsOffset = -weightsShape.offset; + int32_t outputOffset = outputShape.offset; + + float real_multiplier = 0.0; + int32_t output_multiplier = 0; + int32_t output_shift = 0; + int32_t output_activation_min = 0; + int32_t output_activation_max = 0; + + if (!GetQuantizedConvolutionMultipler(inputShape, weightsShape, biasShape, + outputShape, &real_multiplier) || + !QuantizeMultiplierSmallerThanOne(real_multiplier, &output_multiplier, + &output_shift)) { + return false; + } + CalculateActivationRangeUint8(activation, outputShape, + &output_activation_min, + &output_activation_max); + + static gemmlowp::GemmContext gemm_context; + // Alow gemmlowp automatcally decide how many threads to use. 
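Passing 0 here asks gemmlowp to size its worker pool from the cores it detects at run time; any positive value would instead act as an upper bound on the number of threads. The quantized Conv2D path above uses the same pattern.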
+ gemm_context.set_max_num_threads(0); + + #define ANDROID_NN_FULLY_CONNECTED(activation) \ + optimized_ops::FullyConnected<FusedActivationFunctionType::activation>( \ + inputData, convertShapeToDims(inputShape), inputOffset, \ + weightsData, convertShapeToDims(weightsShape), weightsOffset, \ + biasData, convertShapeToDims(biasShape), \ + outputOffset, output_multiplier, output_shift, \ + output_activation_min, output_activation_max, \ + outputData, convertShapeToDims(outputShape), &gemm_context) + + ANDROID_NN_MACRO_DISPATCH(ANDROID_NN_FULLY_CONNECTED) + #undef ANDROID_NN_FULLY_CONNECTED + return true; +} +} // namespace rt +} // namespace nnfw diff --git a/runtimes/nn/common/operations/Pooling.cpp b/runtimes/nn/common/operations/Pooling.cpp new file mode 100644 index 000000000..958164c1b --- /dev/null +++ b/runtimes/nn/common/operations/Pooling.cpp @@ -0,0 +1,163 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (C) 2017 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "Operations.h" +#include "OperationsUtils.h" + +#include "internal/optimized/optimized_ops.h" + +namespace nnfw { +namespace rt { + +#define ANDROID_NN_POOLING_PARAMETERS \ + uint32_t height = getSizeOfDimension(inputShape, 1); \ + uint32_t width = getSizeOfDimension(inputShape, 2); \ + uint32_t outHeight = getSizeOfDimension(outputShape, 1); \ + uint32_t outWidth = getSizeOfDimension(outputShape, 2); \ + \ + uint32_t paddingHeight = (uint32_t)padding_top; \ + uint32_t paddingWidth = (uint32_t)padding_left; + +bool averagePoolFloat32(const float* inputData, const Shape& inputShape, + int32_t padding_left, int32_t padding_right, + int32_t padding_top, int32_t padding_bottom, + int32_t stride_width, int32_t stride_height, + int32_t filter_width, int32_t filter_height, int32_t activation, + float* outputData, const Shape& outputShape) { + + ANDROID_NN_POOLING_PARAMETERS + + #define ANDROID_NN_AVERAGE_POOL(activation) \ + optimized_ops::AveragePool<FusedActivationFunctionType::activation>( \ + inputData, convertShapeToDims(inputShape), \ + stride_width, stride_height, paddingWidth, paddingHeight, \ + filter_width, filter_height, \ + outputData, convertShapeToDims(outputShape)) + + ANDROID_NN_MACRO_DISPATCH(ANDROID_NN_AVERAGE_POOL) + #undef ANDROID_NN_AVERAGE_POOL + + return true; +} + +bool averagePoolQuant8(const uint8_t* inputData, const Shape& inputShape, + int32_t padding_left, int32_t padding_right, + int32_t padding_top, int32_t padding_bottom, + int32_t stride_width, int32_t stride_height, + int32_t filter_width, int32_t filter_height, int32_t activation, + uint8_t* outputData, const Shape& outputShape) { + + ANDROID_NN_POOLING_PARAMETERS + + int32_t output_activation_min = 0; + int32_t output_activation_max = 0; + + CalculateActivationRangeUint8(activation, outputShape, + &output_activation_min, + &output_activation_max); + + #define ANDROID_NN_AVERAGE_POOL(activation) \ + 
optimized_ops::AveragePool<FusedActivationFunctionType::activation>( \ + inputData, convertShapeToDims(inputShape), \ + stride_width, stride_height, paddingWidth, paddingHeight, \ + filter_width, filter_height, \ + output_activation_min, output_activation_max, \ + outputData, convertShapeToDims(outputShape)) + + ANDROID_NN_MACRO_DISPATCH(ANDROID_NN_AVERAGE_POOL) + #undef ANDROID_NN_AVERAGE_POOL + + return true; +} + +bool l2PoolFloat32(const float* inputData, const Shape& inputShape, + int32_t padding_left, int32_t padding_right, + int32_t padding_top, int32_t padding_bottom, + int32_t stride_width, int32_t stride_height, + int32_t filter_width, int32_t filter_height, int32_t activation, + float* outputData, const Shape& outputShape) { + + ANDROID_NN_POOLING_PARAMETERS + + #define ANDROID_NN_L2_POOL(activation) \ + optimized_ops::L2Pool<FusedActivationFunctionType::activation>( \ + inputData, convertShapeToDims(inputShape), \ + stride_width, stride_height, paddingWidth, paddingHeight, \ + filter_width, filter_height, \ + outputData, convertShapeToDims(outputShape)) + + ANDROID_NN_MACRO_DISPATCH(ANDROID_NN_L2_POOL) + #undef ANDROID_NN_L2_POOL + + return true; +} + +bool maxPoolFloat32(const float* inputData, const Shape& inputShape, + int32_t padding_left, int32_t padding_right, + int32_t padding_top, int32_t padding_bottom, + int32_t stride_width, int32_t stride_height, + int32_t filter_width, int32_t filter_height, int32_t activation, + float* outputData, const Shape& outputShape) { + + ANDROID_NN_POOLING_PARAMETERS + + #define ANDROID_NN_MAX_POOL(activation) \ + optimized_ops::MaxPool<FusedActivationFunctionType::activation>( \ + inputData, convertShapeToDims(inputShape), \ + stride_width, stride_height, paddingWidth, paddingHeight, \ + filter_width, filter_height, \ + outputData, convertShapeToDims(outputShape)) + + ANDROID_NN_MACRO_DISPATCH(ANDROID_NN_MAX_POOL) + #undef ANDROID_NN_MAX_POOL + + return true; +} + +bool maxPoolQuant8(const uint8_t* inputData, const Shape& inputShape, + int32_t padding_left, int32_t padding_right, + int32_t padding_top, int32_t padding_bottom, + int32_t stride_width, int32_t stride_height, + int32_t filter_width, int32_t filter_height, int32_t activation, + uint8_t* outputData, const Shape& outputShape) { + + ANDROID_NN_POOLING_PARAMETERS + + int32_t output_activation_min = 0; + int32_t output_activation_max = 0; + + CalculateActivationRangeUint8(activation, outputShape, + &output_activation_min, + &output_activation_max); + + #define ANDROID_NN_MAX_POOL(activation) \ + optimized_ops::MaxPool<FusedActivationFunctionType::activation>( \ + inputData, convertShapeToDims(inputShape), \ + stride_width, stride_height, paddingWidth, paddingHeight, \ + filter_width, filter_height, \ + output_activation_min, output_activation_max, \ + outputData, convertShapeToDims(outputShape)) + + ANDROID_NN_MACRO_DISPATCH(ANDROID_NN_MAX_POOL) + #undef ANDROID_NN_MAX_POOL + + return true; +} + +#undef ANDROID_NN_POOLING_PARAMETERS +} // namespace rt +} // namespace nnfw diff --git a/runtimes/nn/common/operations/Reshape.cpp b/runtimes/nn/common/operations/Reshape.cpp new file mode 100644 index 000000000..120918b0d --- /dev/null +++ b/runtimes/nn/common/operations/Reshape.cpp @@ -0,0 +1,103 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (C) 2017 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// Contains the implementation of the operations. + +#define LOG_TAG "Operations" + +#include "Operations.h" +#include "OperationsUtils.h" + +#include "internal/optimized/optimized_ops.h" + +namespace nnfw { +namespace rt { + +bool reshapeGeneric(const void* inputData, const Shape& inputShape, + void* outputData, const Shape& outputShape) { + size_t count = sizeOfData(inputShape.type, inputShape.dimensions); + memcpy(outputData, inputData, count); + return true; +} + +bool resizeBilinearFloat32(const float* inputData, const Shape& inputShape, + float* outputData, const Shape& outputShape) { + int32_t height = (int32_t) getSizeOfDimension(outputShape, 1); + int32_t width = (int32_t) getSizeOfDimension(outputShape, 2); + + int32_t outDimData[2] = {height, width}; + // We have to fake a tensor here, to satisfy ResizeBilinear(). + Shape outDimShape; + outDimShape.dimensions = {1, 1, 1, 2}; + + optimized_ops::ResizeBilinear( + inputData, convertShapeToDims(inputShape), + outDimData, convertShapeToDims(outDimShape), + outputData, convertShapeToDims(outputShape)); + return true; +} + +bool depthToSpaceGeneric(const uint8_t* inputData, const Shape& inputShape, + int32_t blockSize, + uint8_t* outputData, const Shape& outputShape) { + if (inputShape.type == OperandType::TENSOR_FLOAT32) { + optimized_ops::DepthToSpace( + reinterpret_cast<const float*>(inputData), + convertShapeToDims(inputShape), + blockSize, + reinterpret_cast<float*>(outputData), + convertShapeToDims(outputShape)); + } else if (inputShape.type == OperandType::TENSOR_QUANT8_ASYMM) { + optimized_ops::DepthToSpace( + reinterpret_cast<const uint8_t*>(inputData), + convertShapeToDims(inputShape), + blockSize, + reinterpret_cast<uint8_t*>(outputData), + convertShapeToDims(outputShape)); + } else { + LOG(ERROR) << "Unsupported data type"; + return false; + } + return true; +} + +bool spaceToDepthGeneric(const uint8_t* inputData, const Shape& inputShape, + int32_t blockSize, + uint8_t* outputData, const Shape& outputShape) { + if (inputShape.type == OperandType::TENSOR_FLOAT32) { + optimized_ops::SpaceToDepth( + reinterpret_cast<const float*>(inputData), + convertShapeToDims(inputShape), + blockSize, + reinterpret_cast<float*>(outputData), + convertShapeToDims(outputShape)); + } else if (inputShape.type == OperandType::TENSOR_QUANT8_ASYMM) { + optimized_ops::SpaceToDepth( + reinterpret_cast<const uint8_t*>(inputData), + convertShapeToDims(inputShape), + blockSize, + reinterpret_cast<uint8_t*>(outputData), + convertShapeToDims(outputShape)); + } else { + LOG(ERROR) << "Unsupported data type"; + return false; + } + return true; +} + +} // namespace rt +} // namespace nnfw diff --git a/runtimes/nn/common/operations/SimpleMath.cpp b/runtimes/nn/common/operations/SimpleMath.cpp new file mode 100644 index 000000000..79b1175c0 --- /dev/null +++ b/runtimes/nn/common/operations/SimpleMath.cpp @@ -0,0 +1,217 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. 
All Rights Reserved + * Copyright (C) 2017 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// Contains the implementation of the operations. + +#define LOG_TAG "Operations" + +#include "Operations.h" +#include "OperationsUtils.h" + +#include "internal/optimized/optimized_ops.h" + +namespace nnfw { +namespace rt { +bool addFloat32(const float* in1, const Shape& shape1, + const float* in2, const Shape& shape2, + int32_t activation, + float* out, const Shape& shapeOut) { + bool needBroadcast = !SameShape(shape1, shape2); + + #define ANDROID_NN_NORMAL_ADD(activation) \ + optimized_ops::Add<FusedActivationFunctionType::activation>( \ + in1, convertShapeToDims(shape1), \ + in2, convertShapeToDims(shape2), \ + out, convertShapeToDims(shapeOut)) + + #define ANDROID_NN_BROADCAST_ADD(activation) \ + optimized_ops::BroadcastAdd<FusedActivationFunctionType::activation>( \ + in1, convertShapeToDims(shape1), \ + in2, convertShapeToDims(shape2), \ + out, convertShapeToDims(shapeOut)) + + if (needBroadcast) { + ANDROID_NN_MACRO_DISPATCH(ANDROID_NN_BROADCAST_ADD) + } else { + ANDROID_NN_MACRO_DISPATCH(ANDROID_NN_NORMAL_ADD) + } + + #undef ANDROID_NN_NORMAL_ADD + #undef ANDROID_NN_BROADCAST_ADD + return true; +} + +bool addQuant8(const uint8_t* in1, const Shape& shape1, + const uint8_t* in2, const Shape& shape2, + int32_t activation, + uint8_t* out, const Shape& shapeOut) { + bool needBroadcast = !SameShape(shape1, shape2); + + const int32_t input1_offset = -shape1.offset; + const int32_t input2_offset = -shape2.offset; + const int32_t output_offset = shapeOut.offset; + const int left_shift = 20; + const double twice_max_input_scale = 2 * std::max(shape1.scale, shape2.scale); + const double real_input1_multiplier = shape1.scale / twice_max_input_scale; + const double real_input2_multiplier = shape2.scale / twice_max_input_scale; + const double real_output_multiplier = + twice_max_input_scale / + ((1 << left_shift) * shapeOut.scale); + + int32_t input1_multiplier; + int32_t input1_shift; + if (!QuantizeMultiplierSmallerThanOne(real_input1_multiplier, + &input1_multiplier, &input1_shift)) { + return false; + } + int32_t input2_multiplier; + int32_t input2_shift; + if (!QuantizeMultiplierSmallerThanOne(real_input2_multiplier, + &input2_multiplier, &input2_shift)) { + return false; + } + int32_t output_multiplier; + int32_t output_shift; + if (!QuantizeMultiplierSmallerThanOne(real_output_multiplier, + &output_multiplier, &output_shift)) { + return false; + } + int32_t output_activation_min; + int32_t output_activation_max; + CalculateActivationRangeUint8(activation, shapeOut, + &output_activation_min, + &output_activation_max); + + #define ANDROID_NN_NORMAL_ADD(activation) \ + optimized_ops::Add<FusedActivationFunctionType::activation>( \ + left_shift, \ + in1, convertShapeToDims(shape1), \ + input1_offset, input1_multiplier, input1_shift, \ + in2, convertShapeToDims(shape2), \ + input2_offset, input2_multiplier, input2_shift, \ + output_offset, 
output_multiplier, output_shift, \ + output_activation_min, output_activation_max, \ + out, convertShapeToDims(shapeOut)) + + #define ANDROID_NN_BROADCAST_ADD(activation) \ + optimized_ops::BroadcastAdd<FusedActivationFunctionType::activation>( \ + left_shift, \ + in1, convertShapeToDims(shape1), \ + input1_offset, input1_multiplier, input1_shift, \ + in2, convertShapeToDims(shape2), \ + input2_offset, input2_multiplier, input2_shift, \ + output_offset, output_multiplier, output_shift, \ + output_activation_min, output_activation_max, \ + out, convertShapeToDims(shapeOut)) + + if (needBroadcast) { + ANDROID_NN_MACRO_DISPATCH(ANDROID_NN_BROADCAST_ADD) + } else { + ANDROID_NN_MACRO_DISPATCH(ANDROID_NN_NORMAL_ADD) + } + + #undef ANDROID_NN_NORMAL_ADD + #undef ANDROID_NN_BROADCAST_ADD + return true; +} + +bool mulFloat32(const float* in1, const Shape& shape1, + const float* in2, const Shape& shape2, + int32_t activation, + float* out, const Shape& shapeOut) { + bool needBroadcast = !SameShape(shape1, shape2); + + #define ANDROID_NN_NORMAL_MUL(activation) \ + optimized_ops::Mul<FusedActivationFunctionType::activation>( \ + in1, convertShapeToDims(shape1), \ + in2, convertShapeToDims(shape2), \ + out, convertShapeToDims(shapeOut)) + + #define ANDROID_NN_BROADCAST_MUL(activation) \ + optimized_ops::BroadcastMul<FusedActivationFunctionType::activation>( \ + in1, convertShapeToDims(shape1), \ + in2, convertShapeToDims(shape2), \ + out, convertShapeToDims(shapeOut)) + + if (needBroadcast) { + ANDROID_NN_MACRO_DISPATCH(ANDROID_NN_BROADCAST_MUL) + } else { + ANDROID_NN_MACRO_DISPATCH(ANDROID_NN_NORMAL_MUL) + } + + #undef ANDROID_NN_NORMAL_MUL + #undef ANDROID_NN_BROADCAST_MUL + return true; +} + +bool mulQuant8(const uint8_t* in1, const Shape& shape1, + const uint8_t* in2, const Shape& shape2, + int32_t activation, + uint8_t* out, const Shape& shapeOut) { + const int32_t input1_offset = -shape1.offset; + const int32_t input2_offset = -shape2.offset; + const int32_t output_offset = shapeOut.offset; + const double input_product_scale = shape1.scale * shape2.scale; + const double real_multiplier = input_product_scale / shapeOut.scale; + int32 output_multiplier; + int output_shift; + if (!QuantizeMultiplierSmallerThanOne(real_multiplier, &output_multiplier, + &output_shift)) { + return false; + } + int32_t output_activation_min; + int32_t output_activation_max; + CalculateActivationRangeUint8(activation, shapeOut, + &output_activation_min, + &output_activation_max); + + // Use BROADCAST version to handle the normal case until we have a optimized Mul. 
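The rescaling constants computed in addQuant8 and mulQuant8 above follow directly from dequantizing each operand as scale * (q - zeroPoint). For multiplication the scales simply combine, so a single multiplier S1 * S2 / Sout is enough. For addition the operands must first be brought to a common scale, which is what the left shift of 20 and the per-input multipliers do; a worked example with illustrative scales:

    // S1 = 0.5, S2 = 0.25, Sout = 1.0, left_shift = 20
    //   twice_max = 2 * max(0.5, 0.25) = 1.0
    //   m1 = 0.5 / 1.0 = 0.5    m2 = 0.25 / 1.0 = 0.25    mout = 1.0 / (2^20 * 1.0) = 2^-20
    // A real value of 2.0 in either tensor (q1 - z1 = 4, or q2 - z2 = 8):
    //   (4 << 20) * m1 = 2^21        (8 << 20) * m2 = 2^21
    // Sum = 2^22, and 2^22 * mout = 4, i.e. 4 * Sout = 4.0 = 2.0 + 2.0 as expected.
    // Dividing by twice_max keeps every multiplier inside (0, 1) so that
    // QuantizeMultiplierSmallerThanOne() applies, and the 20-bit shift preserves
    // precision through the integer arithmetic.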
+ #define ANDROID_NN_BROADCAST_MUL(activation) \ + optimized_ops::BroadcastMul<FusedActivationFunctionType::activation>( \ + in1, convertShapeToDims(shape1), input1_offset, \ + in2, convertShapeToDims(shape2), input2_offset, \ + output_offset, output_multiplier, output_shift, \ + output_activation_min, output_activation_max, \ + out, convertShapeToDims(shapeOut)) + + ANDROID_NN_MACRO_DISPATCH(ANDROID_NN_BROADCAST_MUL) + + #undef ANDROID_NN_NORMAL_MUL + #undef ANDROID_NN_BROADCAST_MUL + return true; +} + +bool floorFloat32(const float* inputData, + float* outputData, + const Shape& shape) { + Dims<4> dim = convertShapeToDims(shape); + optimized_ops::Floor(inputData, dim, outputData, dim); + return true; +} + +bool dequantizeQuant8ToFloat32(const uint8_t* inputData, + float* outputData, + const Shape& shape) { + Dims<4> dim = convertShapeToDims(shape); + optimized_ops::Dequantize(inputData, dim, + shape.offset, shape.scale, + outputData, dim); + return true; +} + +} // namespace rt +} // namespace nnfw diff --git a/runtimes/nn/common/operations/internal/common.h b/runtimes/nn/common/operations/internal/common.h new file mode 100644 index 000000000..1bf1050fd --- /dev/null +++ b/runtimes/nn/common/operations/internal/common.h @@ -0,0 +1,80 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (C) 2017 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __NNFW_RT_COMMON_H__ +#define __NNFW_RT_COMMON_H__ + +#ifndef USE_NEON +#if defined(__ARM_NEON__) || defined(__ARM_NEON) +#define USE_NEON +#include <arm_neon.h> +#endif +#endif + +#include "gemmlowp.h" +#include "types.h" + +namespace nnfw { +namespace rt { + +template <FusedActivationFunctionType Ac> +struct ActivationFunctionImpl {}; + +template <> +struct ActivationFunctionImpl<FusedActivationFunctionType::kNone> { + static float Eval(float x) { return x; } +}; + +template <> +struct ActivationFunctionImpl<FusedActivationFunctionType::kRelu> { + static float Eval(float x) { return x < 0.f ? 0.f : x; } +}; + +template <> +struct ActivationFunctionImpl<FusedActivationFunctionType::kRelu1> { + static float Eval(float x) { return x > 1.f ? 1.f : x < -1.f ? -1.f : x; } +}; + +template <> +struct ActivationFunctionImpl<FusedActivationFunctionType::kRelu6> { + static float Eval(float x) { return x > 6.f ? 6.f : x < 0.f ? 
0.f : x; } +}; + +template <FusedActivationFunctionType Ac> +float ActivationFunction(float x) { + return ActivationFunctionImpl<Ac>::Eval(x); +} + +inline int32 MultiplyByQuantizedMultiplierSmallerThanOne( + int32 x, int32 quantized_multiplier, int right_shift) { + using gemmlowp::RoundingDivideByPOT; + using gemmlowp::SaturatingRoundingDoublingHighMul; + return RoundingDivideByPOT( + SaturatingRoundingDoublingHighMul(x, quantized_multiplier), right_shift); +} + +inline int32 MultiplyByQuantizedMultiplierGreaterThanOne( + int32 x, int32 quantized_multiplier, int left_shift) { + using gemmlowp::SaturatingRoundingDoublingHighMul; + return SaturatingRoundingDoublingHighMul(x * (1 << left_shift), + quantized_multiplier); +} + +} // namespace rt +} // namespace nnfw + +#endif // __NNFW_RT_COMMON_H__ diff --git a/runtimes/nn/common/operations/internal/compatibility.h b/runtimes/nn/common/operations/internal/compatibility.h new file mode 100644 index 000000000..fd33cbd97 --- /dev/null +++ b/runtimes/nn/common/operations/internal/compatibility.h @@ -0,0 +1,57 @@ +/* + * Copyright (C) 2017 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __NNFW_RT_COMPATIBILITY_H__ +#define __NNFW_RT_COMPATIBILITY_H__ + +#include <cassert> +#include <cstdint> + +#ifndef DCHECK +#define DCHECK(condition) (condition) ? (void)0 : assert(false) +#endif + +#ifndef DCHECK_EQ +#define DCHECK_EQ(x, y) ((x) == (y)) ? (void)0 : assert(false) +#endif + +#ifndef DCHECK_GE +#define DCHECK_GE(x, y) ((x) >= (y)) ? (void)0 : assert(false) +#endif + +#ifndef DCHECK_GT +#define DCHECK_GT(x, y) ((x) > (y)) ? (void)0 : assert(false) +#endif + +#ifndef DCHECK_LE +#define DCHECK_LE(x, y) ((x) <= (y)) ? (void)0 : assert(false) +#endif + +#ifndef DCHECK_LT +#define DCHECK_LT(x, y) ((x) < (y)) ? (void)0 : assert(false) +#endif + +#ifndef CHECK_EQ +#define CHECK_EQ(x, y) ((x) == (y)) ? (void)0 : assert(false) +#endif + +using uint8 = std::uint8_t; +using int16 = std::int16_t; +using uint16 = std::uint16_t; +using int32 = std::int32_t; +using uint32 = std::uint32_t; + +#endif // __NNFW_RT_COMPATIBILITY_H__ diff --git a/runtimes/nn/common/operations/internal/optimized/cpu_check.h b/runtimes/nn/common/operations/internal/optimized/cpu_check.h new file mode 100644 index 000000000..02f42fd42 --- /dev/null +++ b/runtimes/nn/common/operations/internal/optimized/cpu_check.h @@ -0,0 +1,28 @@ +/* + * Copyright (C) 2017 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef FRAMEWORKS_ML_NN_COMMON_OPERATIONS_INTERNAL_OPTIMIZED_CPU_CHECK_ +#define FRAMEWORKS_ML_NN_COMMON_OPERATIONS_INTERNAL_OPTIMIZED_CPU_CHECK_ + +// NEON_OR_PORTABLE(SomeFunc, arcs) calls NeonSomeFunc(args) if NEON is +// enabled at build time, or PortableSomeFunc(args) otherwise. +#if defined(__ARM_NEON__) || defined(__ARM_NEON) +#define NEON_OR_PORTABLE(funcname, ...) Neon##funcname(__VA_ARGS__) +#else +#define NEON_OR_PORTABLE(funcname, ...) Portable##funcname(__VA_ARGS__) +#endif + +#endif // FRAMEWORKS_ML_NN_COMMON_OPERATIONS_INTERNAL_OPTIMIZED_CPU_CHECK_ diff --git a/runtimes/nn/common/operations/internal/optimized/depthwiseconv_float.h b/runtimes/nn/common/operations/internal/optimized/depthwiseconv_float.h new file mode 100644 index 000000000..5c05bf20f --- /dev/null +++ b/runtimes/nn/common/operations/internal/optimized/depthwiseconv_float.h @@ -0,0 +1,792 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (C) 2017 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __NNFW_RT_OPTIMIZED_OPS_DEPTHWISECONV_FLOAT_H__ +#define __NNFW_RT_OPTIMIZED_OPS_DEPTHWISECONV_FLOAT_H__ + +#include "gemmlowp.h" +#include "../common.h" +#include "../types.h" + +namespace nnfw { +namespace rt { +namespace optimized_ops { + +// Implementation of float DepthwiseConv + +template <bool kAllowStrided, int kFixedInputDepth, int kFixedDepthMultiplier> +struct FloatDepthwiseConvKernel {}; + +#ifdef USE_NEON + +template <> +struct FloatDepthwiseConvKernel<false, 8, 1> { + static void Run(int num_output_pixels, int input_depth, int depth_multiplier, + const float* input_ptr, int input_ptr_increment, + const float* filter_ptr, float* acc_buffer_ptr) { + // Load the filters + float32x4_t filter[2]; + for (int i = 0; i < 2; i++) { + filter[i] = vld1q_f32(filter_ptr + 4 * i); + } + int outp = 0; + // Handle 2 output pixels at a time. + for (; outp <= num_output_pixels - 2; outp += 2) { + // Load the inputs + float32x4_t input[4]; + for (int i = 0; i < 4; i++) { + input[i] = vld1q_f32(input_ptr + 4 * i); + } + input_ptr += 16; + // Load the accumulators from acc_buffer + float32x4_t acc[4]; + for (int i = 0; i < 4; i++) { + acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i); + } + // Multiply-accumulate + acc[0] = vmlaq_f32(acc[0], input[0], filter[0]); + acc[1] = vmlaq_f32(acc[1], input[1], filter[1]); + acc[2] = vmlaq_f32(acc[2], input[2], filter[0]); + acc[3] = vmlaq_f32(acc[3], input[3], filter[1]); + // Store the accumulators back to acc_buffer + for (int i = 0; i < 4; i++) { + vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 16; + } + // Handle one output pixel at a time. 
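The specializations of FloatDepthwiseConvKernel are chosen by three compile-time facts: whether a per-pixel input stride is needed (kAllowStrided), a fixed input depth, and a fixed depth multiplier. The <false, 8, 1> kernel above keeps the eight per-channel filter weights resident in two q-registers and retires two output pixels per iteration of its main loop; the loop that follows handles a trailing odd pixel with the same registers.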
+ for (; outp < num_output_pixels; outp++) { + // Load the inputs + float32x4_t input[2]; + for (int i = 0; i < 2; i++) { + input[i] = vld1q_f32(input_ptr + 4 * i); + } + input_ptr += 8; + // Load the accumulators from acc_buffer + float32x4_t acc[2]; + for (int i = 0; i < 2; i++) { + acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i); + } + // Multiply-accumulate + for (int i = 0; i < 2; i++) { + acc[i] = vmlaq_f32(acc[i], input[i], filter[i]); + } + // Store the accumulators back to acc_buffer + for (int i = 0; i < 2; i++) { + vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 8; + } + } +}; + +template <> +struct FloatDepthwiseConvKernel<false, 2, 1> { + static void Run(int num_output_pixels, int input_depth, int depth_multiplier, + const float* input_ptr, int input_ptr_increment, + const float* filter_ptr, float* acc_buffer_ptr) { + const float32x2_t filters = vld1_f32(filter_ptr); + const float32x4_t filters_dup2 = vcombine_f32(filters, filters); + int outp = 0; + // Handle 8 output pixels at a time. + for (; outp <= num_output_pixels - 8; outp += 8) { + // Load the inputs + float32x4_t input[4]; + for (int i = 0; i < 4; i++) { + input[i] = vld1q_f32(input_ptr + 4 * i); + } + input_ptr += 16; + // Load the accumulators from acc_buffer + float32x4_t acc[4]; + for (int i = 0; i < 4; i++) { + acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i); + } + // Multiply-accumulate + for (int i = 0; i < 4; i++) { + acc[i] = vmlaq_f32(acc[i], input[i], filters_dup2); + } + // Store the accumulators back to acc_buffer + for (int i = 0; i < 4; i++) { + vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 16; + } + // Handle 4 output pixels at a time. + for (; outp <= num_output_pixels - 4; outp += 4) { + // Load the inputs + float32x4_t input[2]; + for (int i = 0; i < 2; i++) { + input[i] = vld1q_f32(input_ptr + 4 * i); + } + input_ptr += 8; + // Load the accumulators from acc_buffer + float32x4_t acc[2]; + for (int i = 0; i < 2; i++) { + acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i); + } + // Multiply-accumulate + for (int i = 0; i < 2; i++) { + acc[i] = vmlaq_f32(acc[i], input[i], filters_dup2); + } + // Store the accumulators back to acc_buffer + for (int i = 0; i < 2; i++) { + vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 8; + } + // Handle 2 output pixels at a time. + for (; outp <= num_output_pixels - 2; outp += 2) { + // Load the inputs + const float32x4_t input = vld1q_f32(input_ptr); + input_ptr += 4; + // Load the accumulators from acc_buffer + float32x4_t acc = vld1q_f32(acc_buffer_ptr); + // Multiply-accumulate + acc = vmlaq_f32(acc, input, filters_dup2); + // Store the accumulators back to acc_buffer + vst1q_f32(acc_buffer_ptr, acc); + acc_buffer_ptr += 4; + } + // Handle 1 output pixel at a time + for (; outp < num_output_pixels; outp++) { + // Load the inputs + const float32x2_t input = vld1_f32(input_ptr); + input_ptr += 2; + // Load the accumulators from acc_buffer + float32x2_t acc = vld1_f32(acc_buffer_ptr); + // Multiply-accumulate + acc = vmla_f32(acc, input, filters); + // Store the accumulators back to acc_buffer + vst1_f32(acc_buffer_ptr, acc); + acc_buffer_ptr += 2; + } + } +}; + +template <> +struct FloatDepthwiseConvKernel<true, 0, 1> { + static void Run(int num_output_pixels, int input_depth, int depth_multiplier, + const float* input_ptr, int input_ptr_increment, + const float* filter_ptr, float* acc_buffer_ptr) { + // Handle one output pixel at a time. 
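FloatDepthwiseConvKernel<true, 0, 1> is the generic depth-multiplier-1 path: the input depth is only known at run time (the 0 parameter), so the loop below walks each output pixel's channels in blocks of 16, then 4, then singly, and the kAllowStrided = true variant advances input_ptr by input_ptr_increment between pixels so it also serves strided convolutions.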
+ for (int outp = 0; outp < num_output_pixels; outp++) { + const float* local_filter_ptr = filter_ptr; + const float* local_input_ptr = input_ptr; + int ic = 0; + // Handle 16 input channels at a time. + for (; ic <= input_depth - 16; ic += 16) { + // Load the filters + float32x4_t filter[4]; + for (int i = 0; i < 4; i++) { + filter[i] = vld1q_f32(local_filter_ptr + 4 * i); + } + local_filter_ptr += 16; + // Load the inputs + float32x4_t input[4]; + for (int i = 0; i < 4; i++) { + input[i] = vld1q_f32(local_input_ptr + 4 * i); + } + local_input_ptr += 16; + // Load the accumulators from acc_buffer + float32x4_t acc[4]; + for (int i = 0; i < 4; i++) { + acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i); + } + // Multiply-accumulate + for (int i = 0; i < 4; i++) { + acc[i] = vmlaq_f32(acc[i], input[i], filter[i]); + } + // Store the accumulators back to acc_buffer + for (int i = 0; i < 4; i++) { + vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 16; + } + // Handle 4 input channels at a time. + for (; ic <= input_depth - 4; ic += 4) { + // Load the filters + float32x4_t filter; + filter = vld1q_f32(local_filter_ptr); + local_filter_ptr += 4; + // Load the inputs + float32x4_t input; + input = vld1q_f32(local_input_ptr); + local_input_ptr += 4; + // Load the accumulators from acc_buffer + float32x4_t acc; + acc = vld1q_f32(acc_buffer_ptr); + // Multiply-accumulate + acc = vmlaq_f32(acc, input, filter); + // Store the accumulators back to acc_buffer + vst1q_f32(acc_buffer_ptr, acc); + acc_buffer_ptr += 4; + } + // Handle one input channel at a time. + for (; ic < input_depth; ic++) { + const float input_val = *local_input_ptr++; + const float filter_val = *local_filter_ptr++; + *acc_buffer_ptr++ += filter_val * input_val; + } + input_ptr += input_ptr_increment; + } + } +}; + +template <> +struct FloatDepthwiseConvKernel<true, 0, 8> { + static void Run(int num_output_pixels, int input_depth, int depth_multiplier, + const float* input_ptr, int input_ptr_increment, + const float* filter_ptr, float* acc_buffer_ptr) { + // Handle one output pixel at a time. + for (int outp = 0; outp < num_output_pixels; outp++) { + const float* local_filter_ptr = filter_ptr; + const float* local_input_ptr = input_ptr; + int ic = 0; + // Handle 2 input channels at a time. + for (; ic <= input_depth - 2; ic += 2) { + // Load the filters + float32x4_t filter[4]; + for (int i = 0; i < 4; i++) { + filter[i] = vld1q_f32(local_filter_ptr + 4 * i); + } + local_filter_ptr += 16; + // Load the inputs + const float32x2_t input = vld1_f32(local_input_ptr); + local_input_ptr += 2; + // Load the accumulators from acc_buffer + float32x4_t acc[4]; + for (int i = 0; i < 4; i++) { + acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i); + } + // Multiply-accumulate + acc[0] = vmlaq_lane_f32(acc[0], filter[0], input, 0); + acc[1] = vmlaq_lane_f32(acc[1], filter[1], input, 0); + acc[2] = vmlaq_lane_f32(acc[2], filter[2], input, 1); + acc[3] = vmlaq_lane_f32(acc[3], filter[3], input, 1); + // Store the accumulators back to acc_buffer + for (int i = 0; i < 4; i++) { + vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 16; + } + // Handle one input channel at a time. 
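In the depth-multiplier-8 kernel above, each input channel fans out to eight consecutive output channels, so the two-channel block loads sixteen filter floats and uses vmlaq_lane_f32 to broadcast input lane 0 into the first two accumulators and lane 1 into the last two; the single-channel tail below does the same with vmlaq_n_f32.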
+ for (; ic < input_depth; ic++) { + // Load the filters + float32x4_t filter[2]; + for (int i = 0; i < 2; i++) { + filter[i] = vld1q_f32(local_filter_ptr + 4 * i); + } + local_filter_ptr += 8; + // Load the inputs + const float input_val = *local_input_ptr++; + // Load the accumulators from acc_buffer + float32x4_t acc[2]; + for (int i = 0; i < 2; i++) { + acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i); + } + // Multiply-accumulate + for (int i = 0; i < 2; i++) { + acc[i] = vmlaq_n_f32(acc[i], filter[i], input_val); + } + // Store the accumulators back to acc_buffer + for (int i = 0; i < 2; i++) { + vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 8; + } + input_ptr += input_ptr_increment; + } + } +}; + +template <> +struct FloatDepthwiseConvKernel<true, 0, 2> { + static void Run(int num_output_pixels, int input_depth, int depth_multiplier, + const float* input_ptr, int input_ptr_increment, + const float* filter_ptr, float* acc_buffer_ptr) { + // Handle one output pixel at a time. + for (int outp = 0; outp < num_output_pixels; outp++) { + const float* local_filter_ptr = filter_ptr; + const float* local_input_ptr = input_ptr; + int ic = 0; + // Handle 8 input channels at a time. + for (; ic <= input_depth - 8; ic += 8) { + // Load the filters + float32x4_t filter[4]; + for (int i = 0; i < 4; i++) { + filter[i] = vld1q_f32(local_filter_ptr + 4 * i); + } + local_filter_ptr += 16; + // Load the inputs + float32x4x2_t input_dup2[2]; + for (int i = 0; i < 2; i++) { + const float32x4_t input = vld1q_f32(local_input_ptr + 4 * i); + input_dup2[i] = vzipq_f32(input, input); + } + local_input_ptr += 8; + // Load the accumulators from acc_buffer + float32x4_t acc[4]; + for (int i = 0; i < 4; i++) { + acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i); + } + // Multiply-accumulate + acc[0] = vmlaq_f32(acc[0], filter[0], input_dup2[0].val[0]); + acc[1] = vmlaq_f32(acc[1], filter[1], input_dup2[0].val[1]); + acc[2] = vmlaq_f32(acc[2], filter[2], input_dup2[1].val[0]); + acc[3] = vmlaq_f32(acc[3], filter[3], input_dup2[1].val[1]); + // Store the accumulators back to acc_buffer + for (int i = 0; i < 4; i++) { + vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 16; + } + // Handle 4 input channels at a time. + for (; ic <= input_depth - 4; ic += 4) { + // Load the filters + float32x2_t filter[4]; + for (int i = 0; i < 4; i++) { + filter[i] = vld1_f32(local_filter_ptr + 2 * i); + } + local_filter_ptr += 8; + // Load the inputs + const float32x4_t input = vld1q_f32(local_input_ptr); + local_input_ptr += 4; + // Load the accumulators from acc_buffer + float32x2_t acc[4]; + for (int i = 0; i < 4; i++) { + acc[i] = vld1_f32(acc_buffer_ptr + 2 * i); + } + // Multiply-accumulate + acc[0] = vmla_lane_f32(acc[0], filter[0], vget_low_f32(input), 0); + acc[1] = vmla_lane_f32(acc[1], filter[1], vget_low_f32(input), 1); + acc[2] = vmla_lane_f32(acc[2], filter[2], vget_high_f32(input), 0); + acc[3] = vmla_lane_f32(acc[3], filter[3], vget_high_f32(input), 1); + // Store the accumulators back to acc_buffer + for (int i = 0; i < 4; i++) { + vst1_f32(acc_buffer_ptr + 2 * i, acc[i]); + } + acc_buffer_ptr += 8; + } + // Handle 2 input channels at a time. 
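+ // Two channels yield four outputs here: the low half of the filter vector
+ // holds channel 0's two taps and the high half channel 1's, each multiplied
+ // by the matching input lane with vmla_lane_f32.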
+ for (; ic <= input_depth - 2; ic += 2) { + // Load the filters + const float32x4_t filter = vld1q_f32(local_filter_ptr); + local_filter_ptr += 4; + // Load the inputs + const float32x2_t input = vld1_f32(local_input_ptr); + local_input_ptr += 2; + // Load the accumulators from acc_buffer + float32x2_t acc[2]; + for (int i = 0; i < 2; i++) { + acc[i] = vld1_f32(acc_buffer_ptr + 2 * i); + } + // Multiply-accumulate + acc[0] = vmla_lane_f32(acc[0], vget_low_f32(filter), input, 0); + acc[1] = vmla_lane_f32(acc[1], vget_high_f32(filter), input, 1); + // Store the accumulators back to acc_buffer + for (int i = 0; i < 2; i++) { + vst1_f32(acc_buffer_ptr + 2 * i, acc[i]); + } + acc_buffer_ptr += 4; + } + // Handle one input channel at a time. + for (; ic < input_depth; ic++) { + // Load the inputs + const float input_val = *local_input_ptr++; + // Multiply-accumulate + for (int i = 0; i < 2; i++) { + acc_buffer_ptr[i] += local_filter_ptr[i] * input_val; + } + local_filter_ptr += 2; + acc_buffer_ptr += 2; + } + input_ptr += input_ptr_increment; + } + } +}; + +template <> +struct FloatDepthwiseConvKernel<true, 1, 8> { + static void Run(int num_output_pixels, int input_depth, int depth_multiplier, + const float* input_ptr, int input_ptr_increment, + const float* filter_ptr, float* acc_buffer_ptr) { + // Handle one output pixel at a time. + for (int outp = 0; outp < num_output_pixels; outp++) { + // Load the filters + float32x4_t filter[2]; + for (int i = 0; i < 2; i++) { + filter[i] = vld1q_f32(filter_ptr + 4 * i); + } + // Load the inputs + const float input_val = *input_ptr; + input_ptr += input_ptr_increment; + // Load the accumulators from acc_buffer + float32x4_t acc[2]; + for (int i = 0; i < 2; i++) { + acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i); + } + // Multiply-accumulate + for (int i = 0; i < 2; i++) { + acc[i] = vmlaq_n_f32(acc[i], filter[i], input_val); + } + // Store the accumulators back to acc_buffer + for (int i = 0; i < 2; i++) { + vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 8; + } + } +}; + +template <> +struct FloatDepthwiseConvKernel<true, 0, 16> { + static void Run(int num_output_pixels, int input_depth, int depth_multiplier, + const float* input_ptr, int input_ptr_increment, + const float* filter_ptr, float* acc_buffer_ptr) { + // Handle one output pixel at a time. + for (int outp = 0; outp < num_output_pixels; outp++) { + const float* local_filter_ptr = filter_ptr; + const float* local_input_ptr = input_ptr; + for (int ic = 0; ic < input_depth; ic++) { + // Load the filters + float32x4_t filter[4]; + for (int i = 0; i < 4; i++) { + filter[i] = vld1q_f32(local_filter_ptr + 4 * i); + } + local_filter_ptr += 16; + // Load the inputs + const float input_val = *local_input_ptr++; + // Load the accumulators from acc_buffer + float32x4_t acc[4]; + for (int i = 0; i < 4; i++) { + acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i); + } + // Multiply-accumulate + for (int i = 0; i < 4; i++) { + acc[i] = vmlaq_n_f32(acc[i], filter[i], input_val); + } + // Store the accumulators back to acc_buffer + for (int i = 0; i < 4; i++) { + vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 16; + } + input_ptr += input_ptr_increment; + } + } +}; +#endif + +// Accumulates the effect of one row of the filter, on a segment of one row +// of the output, accessing the corresponding one row of the input. 
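+ // Illustrative example of the boundary computation done inside: with
+ // stride == 1, pad_width == 1, filter_width == 3 and input_width == 8, the
+ // tap at filter_x == 0 can only touch output x in
+ // [pad_width - filter_x, pad_width + input_width - filter_x) == [1, 9);
+ // that range is then intersected with [out_x_buffer_start, out_x_buffer_end)
+ // before the per-pixel kernel is invoked.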
+template <bool kAllowStrided, int kFixedInputDepth, int kFixedDepthMultiplier> +void FloatDepthwiseConvAccumRow(int stride, int input_depth, int input_width, + const float* input_data, int pad_width, + int depth_multiplier, int filter_width, + const float* filter_data, + int out_x_buffer_start, int out_x_buffer_end, + int output_depth, float* acc_buffer) { +#ifdef GEMMLOWP_PROFILING + gemmlowp::ScopedProfilingLabel label(__PRETTY_FUNCTION__); +#endif + // Sanity check parameters. This is important in particular to ensure + // that we keep the number of template instantiations minimal, so we don't + // increase binary size unnecessarily. + static_assert(kFixedDepthMultiplier || !kFixedInputDepth, ""); + static_assert(kFixedInputDepth || kAllowStrided, ""); + DCHECK(stride == 1 || kAllowStrided); + if (kFixedInputDepth) { + DCHECK_EQ(input_depth, kFixedInputDepth); + } + if (kFixedDepthMultiplier) { + DCHECK_EQ(depth_multiplier, kFixedDepthMultiplier); + } + DCHECK_EQ(output_depth, input_depth * depth_multiplier); + const int input_ptr_increment = stride * input_depth; + const float* filter_base_ptr = filter_data; + for (int filter_x = 0; filter_x < filter_width; ++filter_x) { + // For the current (filter_x, filter_y) point in the filter, + // compute the boundaries of the corresponding output row segment. + int out_x_loop_start_unclampled = 0; + int out_x_loop_end_unclampled = 0; + if (kAllowStrided) { + if (stride == 2) { + out_x_loop_start_unclampled = (pad_width - filter_x + 1) / 2; + out_x_loop_end_unclampled = + (pad_width + input_width - filter_x + 1) / 2; + } else if (stride == 4) { + out_x_loop_start_unclampled = (pad_width - filter_x + 3) / 4; + out_x_loop_end_unclampled = + (pad_width + input_width - filter_x + 3) / 4; + } else { + out_x_loop_start_unclampled = + (pad_width - filter_x + stride - 1) / stride; + out_x_loop_end_unclampled = + (pad_width + input_width - filter_x + stride - 1) / stride; + } + } else { + out_x_loop_start_unclampled = pad_width - filter_x; + out_x_loop_end_unclampled = pad_width + input_width - filter_x; + } + // The kernel will have to iterate on the segment of the + // output row that starts at out_x_loop_start and out_x_loop_end. + const int out_x_loop_start = + std::max(out_x_buffer_start, out_x_loop_start_unclampled); + const int out_x_loop_end = + std::min(out_x_buffer_end, out_x_loop_end_unclampled); + + float* acc_buffer_ptr = + acc_buffer + (out_x_loop_start - out_x_buffer_start) * output_depth; + const int in_x_origin = (out_x_loop_start * stride) - pad_width + filter_x; + const float* input_ptr = input_data + in_x_origin * input_depth; + const int num_output_pixels = out_x_loop_end - out_x_loop_start; + FloatDepthwiseConvKernel<kAllowStrided, kFixedInputDepth, + kFixedDepthMultiplier>::Run(num_output_pixels, + input_depth, + depth_multiplier, + input_ptr, + input_ptr_increment, + filter_base_ptr, + acc_buffer_ptr); + filter_base_ptr += output_depth; + } +} + +// generic fallback of FloatDepthwiseConvAccumRow, portable, non-templatized. 
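+ // It is selected whenever no specialized FloatDepthwiseConvKernel matches
+ // the current stride / input_depth / depth_multiplier combination, and
+ // accumulates with plain scalar loops over output x, input channel and
+ // depth multiplier.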
+inline void FloatDepthwiseConvAccumRowGeneric( + int stride, int input_depth, int input_width, const float* input_data, + int pad_width, int depth_multiplier, int filter_width, + const float* filter_data, int out_x_buffer_start, int out_x_buffer_end, + int output_depth, float* acc_buffer) { + gemmlowp::ScopedProfilingLabel label("DepthwiseConvAccumRowGeneric (slow)"); + const float* filter_base_ptr = filter_data; + for (int filter_x = 0; filter_x < filter_width; ++filter_x) { + const int out_x_loop_start = std::max( + out_x_buffer_start, (pad_width - filter_x + stride - 1) / stride); + const int out_x_loop_end = + std::min(out_x_buffer_end, + (pad_width + input_width - filter_x + stride - 1) / stride); + + float* acc_buffer_ptr = + acc_buffer + (out_x_loop_start - out_x_buffer_start) * output_depth; + const int in_x_origin = (out_x_loop_start * stride) - pad_width + filter_x; + const float* input_ptr = input_data + in_x_origin * input_depth; + const int input_ptr_increment = (stride - 1) * input_depth; + for (int out_x = out_x_loop_start; out_x < out_x_loop_end; out_x++) { + const float* filter_ptr = filter_base_ptr; + for (int ic = 0; ic < input_depth; ++ic) { + const float input_val = *input_ptr++; + for (int m = 0; m < depth_multiplier; m++) { + const float filter_val = *filter_ptr++; + *acc_buffer_ptr++ += filter_val * input_val; + } + } + input_ptr += input_ptr_increment; + } + filter_base_ptr += output_depth; + } +} + +// Initializes the accumulator buffer with bias values. +inline void DepthwiseConvInitAccBuffer(int num_output_pixels, int output_depth, + const float* bias_data, + float* acc_buffer) { + for (int i = 0; i < num_output_pixels; i++) { + memcpy(acc_buffer + i * output_depth, bias_data, + sizeof(acc_buffer[0]) * output_depth); + } +} + +template <FusedActivationFunctionType Ac> +void DepthwiseConv(const float* input_data, const Dims<4>& input_dims, + const float* filter_data, const Dims<4>& filter_dims, + const float* bias_data, const Dims<4>& bias_dims, + int stride_width, int stride_height, + int pad_width, int pad_height, int depth_multiplier, + float* output_data, const Dims<4>& output_dims) { + gemmlowp::ScopedProfilingLabel label("DepthwiseConv"); + static_assert(Ac == FusedActivationFunctionType::kNone || + Ac == FusedActivationFunctionType::kRelu || + Ac == FusedActivationFunctionType::kRelu6 || + Ac == FusedActivationFunctionType::kRelu1, + ""); + const int batches = MatchingArraySize(input_dims, 3, output_dims, 3); + const int output_depth = MatchingArraySize(filter_dims, 0, output_dims, 0); + const int input_height = ArraySize(input_dims, 2); + const int input_width = ArraySize(input_dims, 1); + const int input_depth = ArraySize(input_dims, 0); + const int filter_height = ArraySize(filter_dims, 2); + const int filter_width = ArraySize(filter_dims, 1); + const int output_height = ArraySize(output_dims, 2); + const int output_width = ArraySize(output_dims, 1); +#if 0 // TODO-NNRT : Check if assertion is needed, output depth some times not equal to input * depthmultiplier + DCHECK(output_depth == input_depth * depth_multiplier); +#endif + + static const int kAccBufferMaxSize = 1024; + float acc_buffer[kAccBufferMaxSize]; + DCHECK_GE(kAccBufferMaxSize, output_depth); + const int kOutputPixelsInAccBuffer = kAccBufferMaxSize / output_depth; + const int kAccBufferActualSize = kOutputPixelsInAccBuffer * output_depth; + DCHECK_LE(kOutputPixelsInAccBuffer * output_depth, kAccBufferActualSize); + DCHECK_LE(kAccBufferActualSize, kAccBufferMaxSize); + 
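+ // Illustrative sizing example: with output_depth == 64,
+ // kOutputPixelsInAccBuffer == 1024 / 64 == 16 and
+ // kAccBufferActualSize == 16 * 64 == 1024, so each pass over acc_buffer
+ // below covers up to 16 output pixels.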
DCHECK_GE(kOutputPixelsInAccBuffer, 1); + + // row_accum_func will point to the core accumulation function to be used + // for this DepthwiseConv op. + auto* row_accum_func = FloatDepthwiseConvAccumRowGeneric; + + const int kMaxFixedDepthMultiplier = 16; + int fixed_depth_multiplier = 0; + if (depth_multiplier <= kMaxFixedDepthMultiplier) { + fixed_depth_multiplier = depth_multiplier; + } + // kMaxUnrolling is the max number of output values that we aim to handle + // in one unrolled iteration of the inner loop. For practical performance + // reasons, it is limited by the number of available registers. We could + // fine-tune it depending on the architecture, but that's not worth doing + // since this whole code is not very optimized to begin with. The + // present value reflects what's realistic on ARM 32bit NEON with 16 128-bit + // vector registers. + const int kMaxUnrolling = 8; + int fixed_input_depth = 0; + if (fixed_depth_multiplier && + input_depth * fixed_depth_multiplier <= kMaxUnrolling) { + fixed_input_depth = input_depth; + } +#define TFMINI_USE_DEPTHWISECONV_KERNEL(ALLOW_STRIDED, FIXED_INPUT_DEPTH, \ + FIXED_DEPTH_MULTIPLIER) \ + if ((stride_width == 1 || ALLOW_STRIDED) && \ + fixed_input_depth == FIXED_INPUT_DEPTH && \ + fixed_depth_multiplier == FIXED_DEPTH_MULTIPLIER) { \ + row_accum_func = \ + FloatDepthwiseConvAccumRow<ALLOW_STRIDED, FIXED_INPUT_DEPTH, \ + FIXED_DEPTH_MULTIPLIER>; \ + } + +#ifdef USE_NEON + TFMINI_USE_DEPTHWISECONV_KERNEL(true, 0, 1) + TFMINI_USE_DEPTHWISECONV_KERNEL(true, 0, 8) + TFMINI_USE_DEPTHWISECONV_KERNEL(true, 0, 2) + TFMINI_USE_DEPTHWISECONV_KERNEL(false, 8, 1) + TFMINI_USE_DEPTHWISECONV_KERNEL(false, 2, 1) + TFMINI_USE_DEPTHWISECONV_KERNEL(true, 0, 16) + TFMINI_USE_DEPTHWISECONV_KERNEL(true, 1, 8) +#endif // USE_NEON + +#undef TFMINI_USE_DEPTHWISECONV_KERNEL + + // Now that we have determined row_accum_func, we can start work. + float* output_ptr = output_data; + for (int b = 0; b < batches; ++b) { + for (int out_y = 0; out_y < output_height; ++out_y) { + const int in_y_origin = (out_y * stride_height) - pad_height; + const int filter_y_start = std::max(0, -in_y_origin); + const int filter_y_end = + std::min(filter_height, input_height - in_y_origin); + for (int out_x_buffer_start = 0; out_x_buffer_start < output_width; + out_x_buffer_start += kOutputPixelsInAccBuffer) { + const int out_x_buffer_end = std::min( + output_width, out_x_buffer_start + kOutputPixelsInAccBuffer); + // We call a 'pixel' a group of activation that share all but the + // 'depth'/'channel' coordinate. num_output_pixels is the number of + // output pixels that we will accumulate in this loop iteration. + const int num_output_pixels = out_x_buffer_end - out_x_buffer_start; + // Initialize our local accumulator with the bias values, so we don't + // have to add them later. + DepthwiseConvInitAccBuffer(num_output_pixels, output_depth, bias_data, + acc_buffer); + // Accumulation loop. Most of the time should be spent in here. + for (int filter_y = filter_y_start; filter_y < filter_y_end; + ++filter_y) { + const int in_y = in_y_origin + filter_y; + row_accum_func(stride_width, input_depth, input_width, + input_data + in_y * input_dims.strides[2] + + b * input_dims.strides[3], + pad_width, depth_multiplier, filter_width, + filter_data + filter_y * filter_dims.strides[2], + out_x_buffer_start, out_x_buffer_end, output_depth, + acc_buffer); + } + // Finished accumulating. Now store to destination. 
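+ // The fused activation (none / ReLU / ReLU6 / ReLU1) is applied while
+ // storing: 16 and then 4 values at a time with NEON min/max, and finally
+ // one by one for the leftovers.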
+ const int num_output_values = output_depth * num_output_pixels; + int i = 0; +#ifdef USE_NEON + // Handle 16 values at a time + for (; i <= num_output_values - 16; i += 16) { + float32x4_t acc[4]; + for (int k = 0; k < 4; k++) { + acc[k] = vld1q_f32(acc_buffer + i + 4 * k); + } + if (Ac == FusedActivationFunctionType::kRelu) { + for (int k = 0; k < 4; k++) { + acc[k] = vmaxq_f32(vdupq_n_f32(0.f), acc[k]); + } + } else if (Ac == FusedActivationFunctionType::kRelu6) { + for (int k = 0; k < 4; k++) { + acc[k] = vmaxq_f32(vdupq_n_f32(0.f), + vminq_f32(vdupq_n_f32(6.f), acc[k])); + } + } else if (Ac == FusedActivationFunctionType::kRelu1) { + for (int k = 0; k < 4; k++) { + acc[k] = vmaxq_f32(vdupq_n_f32(-1.f), + vminq_f32(vdupq_n_f32(1.f), acc[k])); + } + } + for (int k = 0; k < 4; k++) { + vst1q_f32(output_ptr + 4 * k, acc[k]); + } + output_ptr += 16; + } + // Handle 4 values at a time + for (; i <= num_output_values - 4; i += 4) { + float32x4_t acc = vld1q_f32(acc_buffer + i); + if (Ac == FusedActivationFunctionType::kRelu) { + acc = vmaxq_f32(vdupq_n_f32(0.f), acc); + } else if (Ac == FusedActivationFunctionType::kRelu6) { + acc = vmaxq_f32(vdupq_n_f32(0.f), vminq_f32(vdupq_n_f32(6.f), acc)); + } else if (Ac == FusedActivationFunctionType::kRelu1) { + acc = + vmaxq_f32(vdupq_n_f32(-1.f), vminq_f32(vdupq_n_f32(1.f), acc)); + } + vst1q_f32(output_ptr, acc); + output_ptr += 4; + } +#endif + // Handle leftover values, one by one. This is very slow. + for (; i < num_output_values; i++) { + float acc = acc_buffer[i]; + if (Ac == FusedActivationFunctionType::kRelu) { + acc = std::max(0.f, acc); + } else if (Ac == FusedActivationFunctionType::kRelu6) { + acc = std::max(0.f, std::min(6.f, acc)); + } else if (Ac == FusedActivationFunctionType::kRelu1) { + acc = std::max(-1.f, std::min(1.f, acc)); + } + *output_ptr++ = acc; + } + } + } + } +} + +} // namespace optimized_ops +} // namespace rt +} // namespace nnfw + + +#endif // __NNFW_RT_OPTIMIZED_OPS_DEPTHWISECONV_FLOAT_H__ diff --git a/runtimes/nn/common/operations/internal/optimized/depthwiseconv_uint8.h b/runtimes/nn/common/operations/internal/optimized/depthwiseconv_uint8.h new file mode 100644 index 000000000..220f8793e --- /dev/null +++ b/runtimes/nn/common/operations/internal/optimized/depthwiseconv_uint8.h @@ -0,0 +1,1606 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (C) 2017 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef __NNFW_RT_OPTIMIZED_OPS_DEPTHWISECONV_UINT8_H__ +#define __NNFW_RT_OPTIMIZED_OPS_DEPTHWISECONV_UINT8_H__ + +#include "fixedpoint.h" +#include "gemmlowp.h" +#include "../common.h" +#include "../types.h" + +namespace nnfw { +namespace rt { +namespace optimized_ops { + +// Implementation of quantized DepthwiseConv + +template <bool kAllowStrided, int kFixedInputDepth, int kFixedDepthMultiplier> +struct QuantizedDepthwiseConvKernel {}; + +#ifdef USE_NEON +template <> +struct QuantizedDepthwiseConvKernel<true, 8, 2> { + static void Run(int num_output_pixels, int input_depth, int depth_multiplier, + const uint8* input_ptr, int16 input_offset, + int input_ptr_increment, const uint8* filter_ptr, + int16 filter_offset, int32* acc_buffer_ptr) { + // Load the filters, add filter_offset. + uint8x8x2_t filter_u8; + filter_u8.val[0] = vld1_u8(filter_ptr); + filter_u8.val[1] = vld1_u8(filter_ptr + 8); + int16x8_t filter[2]; + for (int i = 0; i < 2; i++) { + filter[i] = vaddq_s16(vreinterpretq_s16_u16(vmovl_u8(filter_u8.val[i])), + vdupq_n_s16(filter_offset)); + } + // Handle one output pixel at a time. + for (int outp = 0; outp < num_output_pixels; outp++) { + // Load the accumulators from acc_buffer + int32x4x2_t acc[2]; + for (int i = 0; i < 2; i++) { + acc[i].val[0] = vld1q_s32(acc_buffer_ptr + 4 * i); + acc[i].val[1] = vld1q_s32(acc_buffer_ptr + 4 * i + 8); + } + // Load the inputs, add input_offset. + const uint8x8_t input_u8 = vld1_u8(input_ptr); + input_ptr += input_ptr_increment; + const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8)); + const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset)); + // Duplicate the input values, 2-fold + const int16x8x2_t input_dup2 = vzipq_s16(input, input); + // Multiply-accumulate + for (int i = 0; i < 2; i++) { + acc[0].val[i] = vmlal_s16(acc[0].val[i], vget_low_s16(filter[i]), + vget_low_s16(input_dup2.val[i])); + acc[1].val[i] = vmlal_s16(acc[1].val[i], vget_high_s16(filter[i]), + vget_high_s16(input_dup2.val[i])); + } + // Store the accumulators back to acc_buffer + for (int i = 0; i < 2; i++) { + vst1q_s32(acc_buffer_ptr + 4 * i, acc[i].val[0]); + vst1q_s32(acc_buffer_ptr + 4 * i + 8, acc[i].val[1]); + } + acc_buffer_ptr += 16; + } + } +}; + +template <> +struct QuantizedDepthwiseConvKernel<false, 8, 1> { + static void Run(int num_output_pixels, int input_depth, int depth_multiplier, + const uint8* input_ptr, int16 input_offset, + int input_ptr_increment, const uint8* filter_ptr, + int16 filter_offset, int32* acc_buffer_ptr) { + // Load the filters, add filter_offset. + const uint8x8_t filter_u8 = vld1_u8(filter_ptr); + const int16x8_t filter_s16 = vreinterpretq_s16_u16(vmovl_u8(filter_u8)); + const int16x8_t filter = vaddq_s16(filter_s16, vdupq_n_s16(filter_offset)); + + int outp = 0; + // Handle 2 output pixels at a time. + for (; outp <= num_output_pixels - 2; outp += 2) { + // Load the accumulators from acc_buffer. + int32x4_t acc[4]; + for (int i = 0; i < 4; i++) { + acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i); + } + // Load the inputs, add input_offset. + uint8x8_t input_u8[2]; + for (int i = 0; i < 2; i++) { + input_u8[i] = vld1_u8(input_ptr + 8 * i); + } + input_ptr += 16; + int16x8_t input[2]; + for (int i = 0; i < 2; i++) { + input[i] = vreinterpretq_s16_u16(vmovl_u8(input_u8[i])); + } + for (int i = 0; i < 2; i++) { + input[i] = vaddq_s16(input[i], vdupq_n_s16(input_offset)); + } + // Multiply-accumulate. 
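+ // vmlal_s16 widens the offset-corrected 16-bit inputs and filter values
+ // and accumulates their products into the 32-bit accumulators, one half
+ // (4 lanes) at a time.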
+ acc[0] = vmlal_s16(acc[0], vget_low_s16(filter), vget_low_s16(input[0])); + acc[1] = + vmlal_s16(acc[1], vget_high_s16(filter), vget_high_s16(input[0])); + acc[2] = vmlal_s16(acc[2], vget_low_s16(filter), vget_low_s16(input[1])); + acc[3] = + vmlal_s16(acc[3], vget_high_s16(filter), vget_high_s16(input[1])); + // Store the accumulators back to acc_buffer + for (int i = 0; i < 4; i++) { + vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 16; + } + // Handle 1 output pixel at a time. + for (; outp < num_output_pixels; outp++) { + // Load the accumulators from acc_buffer. + int32x4_t acc[2]; + acc[0] = vld1q_s32(acc_buffer_ptr); + acc[1] = vld1q_s32(acc_buffer_ptr + 4); + + // Load the inputs, add input_offset. + const uint8x8_t input_u8 = vld1_u8(input_ptr); + input_ptr += 8; + const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8)); + const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset)); + // Multiply-accumulate. + acc[0] = vmlal_s16(acc[0], vget_low_s16(filter), vget_low_s16(input)); + acc[1] = vmlal_s16(acc[1], vget_high_s16(filter), vget_high_s16(input)); + // Store the accumulators back to acc_buffer + vst1q_s32(acc_buffer_ptr, acc[0]); + vst1q_s32(acc_buffer_ptr + 4, acc[1]); + acc_buffer_ptr += 8; + } + } +}; + +template <> +struct QuantizedDepthwiseConvKernel<false, 4, 2> { + static void Run(int num_output_pixels, int input_depth, int depth_multiplier, + const uint8* input_ptr, int16 input_offset, + int input_ptr_increment, const uint8* filter_ptr, + int16 filter_offset, int32* acc_buffer_ptr) { + // Load the filters, add filter_offset. + const uint8x8_t filter_u8 = vld1_u8(filter_ptr); + const int16x8_t filter_s16 = vreinterpretq_s16_u16(vmovl_u8(filter_u8)); + const int16x8_t filter = vaddq_s16(filter_s16, vdupq_n_s16(filter_offset)); + + int outp = 0; + // Handle 2 output pixels at a time. + for (; outp <= num_output_pixels - 2; outp += 2) { + // Load the accumulators from acc_buffer + int32x4_t acc[4]; + for (int i = 0; i < 4; i++) { + acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i); + } + // Load the inputs, add input_offset. + const uint8x8_t input_u8 = vld1_u8(input_ptr); + input_ptr += 8; + const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8)); + const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset)); + // Duplicate the input values, 2-fold + const int16x8x2_t input_dup2 = vzipq_s16(input, input); + // Multiply-accumulate + for (int i = 0; i < 2; i++) { + acc[2 * i + 0] = vmlal_s16(acc[2 * i + 0], vget_low_s16(filter), + vget_low_s16(input_dup2.val[i])); + acc[2 * i + 1] = vmlal_s16(acc[2 * i + 1], vget_high_s16(filter), + vget_high_s16(input_dup2.val[i])); + } + // Store the accumulators back to acc_buffer + for (int i = 0; i < 4; i++) { + vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 16; + } + // Handle one output pixel at a time. + for (; outp < num_output_pixels; outp++) { + // Load the accumulators from acc_buffer + int32x4_t acc[2]; + for (int i = 0; i < 2; i++) { + acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i); + } + // Load the inputs, add input_offset. 
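+ // Only 4 input bytes are needed here, so they are inserted lane by lane
+ // into a zeroed register instead of using a full 8-byte vld1_u8.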
+ uint8x8_t input_u8 = vdup_n_u8(0); + input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0); + input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1); + input_u8 = vset_lane_u8(input_ptr[2], input_u8, 2); + input_u8 = vset_lane_u8(input_ptr[3], input_u8, 3); + input_ptr += 4; + const int16x4_t input_s16 = + vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8))); + const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset)); + // Duplicate the input values, 2-fold + const int16x4x2_t input_dup2 = vzip_s16(input, input); + // Multiply-accumulate + acc[0] = vmlal_s16(acc[0], vget_low_s16(filter), input_dup2.val[0]); + acc[1] = vmlal_s16(acc[1], vget_high_s16(filter), input_dup2.val[1]); + // Store the accumulators back to acc_buffer + for (int i = 0; i < 2; i++) { + vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 8; + } + } +}; + +template <> +struct QuantizedDepthwiseConvKernel<false, 2, 8> { + static void Run(int num_output_pixels, int input_depth, int depth_multiplier, + const uint8* input_ptr, int16 input_offset, + int input_ptr_increment, const uint8* filter_ptr, + int16 filter_offset, int32* acc_buffer_ptr) { + // Load the filters, add filter_offset. + int16x8_t filter[2]; + for (int i = 0; i < 2; i++) { + const uint8x8_t filter_u8 = vld1_u8(filter_ptr + 8 * i); + const int16x8_t filter_s16 = vreinterpretq_s16_u16(vmovl_u8(filter_u8)); + filter[i] = vaddq_s16(filter_s16, vdupq_n_s16(filter_offset)); + } + int outp = 0; + // Handle two output pixels at a time. + for (; outp <= num_output_pixels - 2; outp += 2) { + // Load the accumulators from acc_buffer. + int32x4_t acc[8]; + for (int i = 0; i < 8; i++) { + acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i); + } + // Load the inputs, add input_offset. + uint8x8_t input_u8 = vdup_n_u8(0); + input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0); + input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1); + input_u8 = vset_lane_u8(input_ptr[2], input_u8, 2); + input_u8 = vset_lane_u8(input_ptr[3], input_u8, 3); + input_ptr += 4; + const int16x4_t input_s16 = + vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8))); + const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset)); + // Multiply-accumulate. + acc[0] = vmlal_lane_s16(acc[0], vget_low_s16(filter[0]), input, 0); + acc[1] = vmlal_lane_s16(acc[1], vget_high_s16(filter[0]), input, 0); + acc[2] = vmlal_lane_s16(acc[2], vget_low_s16(filter[1]), input, 1); + acc[3] = vmlal_lane_s16(acc[3], vget_high_s16(filter[1]), input, 1); + acc[4] = vmlal_lane_s16(acc[4], vget_low_s16(filter[0]), input, 2); + acc[5] = vmlal_lane_s16(acc[5], vget_high_s16(filter[0]), input, 2); + acc[6] = vmlal_lane_s16(acc[6], vget_low_s16(filter[1]), input, 3); + acc[7] = vmlal_lane_s16(acc[7], vget_high_s16(filter[1]), input, 3); + // Store the accumulators back to acc_buffer. + for (int i = 0; i < 8; i++) { + vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 32; + } + // Handle one output pixel at a time. + for (; outp < num_output_pixels; outp++) { + // Load the accumulators from acc_buffer. + int32x4_t acc[4]; + for (int i = 0; i < 4; i++) { + acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i); + } + // Load the inputs, add input_offset. + uint8x8_t input_u8 = vdup_n_u8(0); + input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0); + input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1); + input_ptr += 2; + const int16x4_t input_s16 = + vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8))); + const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset)); + + // Multiply-accumulate. 
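+ // Input lane 0 (channel 0) is broadcast against the 8 filter values in
+ // filter[0] and lane 1 (channel 1) against filter[1], producing the
+ // 16 outputs of this pixel.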
+ acc[0] = vmlal_lane_s16(acc[0], vget_low_s16(filter[0]), input, 0); + acc[1] = vmlal_lane_s16(acc[1], vget_high_s16(filter[0]), input, 0); + acc[2] = vmlal_lane_s16(acc[2], vget_low_s16(filter[1]), input, 1); + acc[3] = vmlal_lane_s16(acc[3], vget_high_s16(filter[1]), input, 1); + + // Store the accumulators back to acc_buffer. + for (int i = 0; i < 4; i++) { + vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 16; + } + } +}; + +template <> +struct QuantizedDepthwiseConvKernel<false, 2, 2> { + static void Run(int num_output_pixels, int input_depth, int depth_multiplier, + const uint8* input_ptr, int16 input_offset, + int input_ptr_increment, const uint8* filter_ptr, + int16 filter_offset, int32* acc_buffer_ptr) { + // Load the filters, add filter_offset. + uint8x8_t filter_u8 = vdup_n_u8(0); + filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 0); + filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 1); + filter_u8 = vset_lane_u8(filter_ptr[2], filter_u8, 2); + filter_u8 = vset_lane_u8(filter_ptr[3], filter_u8, 3); + const int16x4_t filter_s16 = + vreinterpret_s16_u16(vget_low_u16(vmovl_u8(filter_u8))); + const int16x4_t filter = vadd_s16(filter_s16, vdup_n_s16(filter_offset)); + + int outp = 0; + // Handle 4 output pixels at a time. + for (; outp <= num_output_pixels - 4; outp += 4) { + // Load the accumulators from acc_buffer + int32x4_t acc[4]; + for (int i = 0; i < 4; i++) { + acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i); + } + + // Load the inputs, add input_offset. + const uint8x8_t input_u8 = vld1_u8(input_ptr); + input_ptr += 8; + const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8)); + const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset)); + // Duplicate the input values, 2-fold + const int16x8x2_t input_dup2 = vzipq_s16(input, input); + // Multiply-accumulate + acc[0] = vmlal_s16(acc[0], filter, vget_low_s16(input_dup2.val[0])); + acc[1] = vmlal_s16(acc[1], filter, vget_high_s16(input_dup2.val[0])); + acc[2] = vmlal_s16(acc[2], filter, vget_low_s16(input_dup2.val[1])); + acc[3] = vmlal_s16(acc[3], filter, vget_high_s16(input_dup2.val[1])); + // Store the accumulators back to acc_buffer + for (int i = 0; i < 4; i++) { + vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 16; + } + // Handle one output pixel at a time. + for (; outp < num_output_pixels; outp++) { + // Load the accumulators from acc_buffer + int32x4_t acc = vld1q_s32(acc_buffer_ptr); + + uint8x8_t input_u8 = vdup_n_u8(0); + input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0); + input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1); + input_ptr += 2; + const int16x4_t input_s16 = + vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8))); + const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset)); + // Duplicate the input values, 2-fold + const int16x4_t input_dup2 = vzip_s16(input, input).val[0]; + // Multiply-accumulate + acc = vmlal_s16(acc, filter, input_dup2); + // Store the accumulators back to acc_buffer + vst1q_s32(acc_buffer_ptr, acc); + acc_buffer_ptr += 4; + } + } +}; + +template <> +struct QuantizedDepthwiseConvKernel<false, 2, 1> { + static void Run(int num_output_pixels, int input_depth, int depth_multiplier, + const uint8* input_ptr, int16 input_offset, + int input_ptr_increment, const uint8* filter_ptr, + int16 filter_offset, int32* acc_buffer_ptr) { + // Load the filters, add filter_offset. 
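+ // The two filter taps are duplicated into both halves of a 4-lane vector
+ // ([f0, f1, f0, f1]) so that a single vmlal_s16 can update two output
+ // pixels at once in the loops below.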
+ uint8x8_t filter_u8 = vdup_n_u8(0); + filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 0); + filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 1); + filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 2); + filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 3); + const int16x4_t filter_s16 = + vreinterpret_s16_u16(vget_low_u16(vmovl_u8(filter_u8))); + const int16x4_t filter = vadd_s16(filter_s16, vdup_n_s16(filter_offset)); + + int outp = 0; + // Handle 8 output pixels at a time. + for (; outp <= num_output_pixels - 8; outp += 8) { + // Load the accumulators from acc_buffer. + int32x4_t acc[4]; + for (int i = 0; i < 4; i++) { + acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i); + } + // Load the inputs, add input_offset. + uint8x8_t input_u8[2]; + for (int i = 0; i < 2; i++) { + input_u8[i] = vld1_u8(input_ptr + 8 * i); + } + input_ptr += 16; + int16x8_t input[2]; + for (int i = 0; i < 2; i++) { + input[i] = vreinterpretq_s16_u16(vmovl_u8(input_u8[i])); + } + for (int i = 0; i < 2; i++) { + input[i] = vaddq_s16(input[i], vdupq_n_s16(input_offset)); + } + + // Multiply-accumulate. + acc[0] = vmlal_s16(acc[0], filter, vget_low_s16(input[0])); + acc[1] = vmlal_s16(acc[1], filter, vget_high_s16(input[0])); + acc[2] = vmlal_s16(acc[2], filter, vget_low_s16(input[1])); + acc[3] = vmlal_s16(acc[3], filter, vget_high_s16(input[1])); + // Store the accumulators back to acc_buffer. + for (int i = 0; i < 4; i++) { + vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 16; + } + // Handle 4 output pixels at a time. + for (; outp <= num_output_pixels - 4; outp += 4) { + // Load the accumulators from acc_buffer. + int32x4_t acc[2]; + for (int i = 0; i < 2; i++) { + acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i); + } + // Load the inputs, add input_offset. + const uint8x8_t input_u8 = vld1_u8(input_ptr); + input_ptr += 8; + const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8)); + const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset)); + + // Multiply-accumulate. + acc[0] = vmlal_s16(acc[0], filter, vget_low_s16(input)); + acc[1] = vmlal_s16(acc[1], filter, vget_high_s16(input)); + // Store the accumulators back to acc_buffer. + for (int i = 0; i < 2; i++) { + vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 8; + } + // Handle 2 output pixels at a time. + for (; outp <= num_output_pixels - 2; outp += 2) { + // Load the accumulators from acc_buffer. + int32x4_t acc = vld1q_s32(acc_buffer_ptr); + // Load the inputs, add input_offset. + uint8x8_t input_u8 = vdup_n_u8(0); + input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0); + input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1); + input_u8 = vset_lane_u8(input_ptr[2], input_u8, 2); + input_u8 = vset_lane_u8(input_ptr[3], input_u8, 3); + input_ptr += 4; + const int16x4_t input_s16 = + vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8))); + const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset)); + + // Multiply-accumulate. + acc = vmlal_s16(acc, filter, input); + // Store the accumulators back to acc_buffer. + vst1q_s32(acc_buffer_ptr, acc); + acc_buffer_ptr += 4; + } + // Handle 1 output pixel at a time. + for (; outp < num_output_pixels; outp++) { + // Load the accumulators from acc_buffer. + int32x2_t acc = vld1_s32(acc_buffer_ptr); + // Load the inputs, add input_offset. 
+ uint8x8_t input_u8 = vdup_n_u8(0); + input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0); + input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1); + input_ptr += 2; + const int16x4_t input_s16 = + vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8))); + const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset)); + + // Multiply-accumulate. + acc = vget_low_s32(vmlal_s16(vcombine_s32(acc, acc), filter, input)); + // Store the accumulators back to acc_buffer. + vst1_s32(acc_buffer_ptr, acc); + acc_buffer_ptr += 2; + } + } +}; + +template <> +struct QuantizedDepthwiseConvKernel<false, 1, 2> { + static void Run(int num_output_pixels, int input_depth, int depth_multiplier, + const uint8* input_ptr, int16 input_offset, + int input_ptr_increment, const uint8* filter_ptr, + int16 filter_offset, int32* acc_buffer_ptr) { + // Load the filters, add filter_offset. + uint8x8_t filter_u8 = vdup_n_u8(0); + filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 0); + filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 1); + filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 2); + filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 3); + const int16x4_t filter_s16 = + vreinterpret_s16_u16(vget_low_u16(vmovl_u8(filter_u8))); + const int16x4_t filter = vadd_s16(filter_s16, vdup_n_s16(filter_offset)); + + int outp = 0; + // Handle 8 output pixels at a time. + for (; outp <= num_output_pixels - 8; outp += 8) { + // Load the accumulators from acc_buffer + int32x4_t acc[4]; + for (int i = 0; i < 4; i++) { + acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i); + } + + // Load the inputs, add input_offset. + const uint8x8_t input_u8 = vld1_u8(input_ptr); + input_ptr += 8; + const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8)); + const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset)); + // Duplicate the input values, 2-fold + const int16x8x2_t input_dup2 = vzipq_s16(input, input); + // Multiply-accumulate + acc[0] = vmlal_s16(acc[0], filter, vget_low_s16(input_dup2.val[0])); + acc[1] = vmlal_s16(acc[1], filter, vget_high_s16(input_dup2.val[0])); + acc[2] = vmlal_s16(acc[2], filter, vget_low_s16(input_dup2.val[1])); + acc[3] = vmlal_s16(acc[3], filter, vget_high_s16(input_dup2.val[1])); + // Store the accumulators back to acc_buffer + for (int i = 0; i < 4; i++) { + vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 16; + } + // Handle one output pixel at a time. + for (; outp < num_output_pixels; outp++) { + // Load the accumulators from acc_buffer + int32x2_t acc = vld1_s32(acc_buffer_ptr); + + // Load the inputs, add input_offset. + const uint32 input = *input_ptr++ + input_offset; + + // Multiply-accumulate + acc = vget_low_s32(vmlal_n_s16(vcombine_s32(acc, acc), filter, input)); + // Store the accumulators back to acc_buffer + vst1_s32(acc_buffer_ptr, acc); + acc_buffer_ptr += 2; + } + } +}; + +template <> +struct QuantizedDepthwiseConvKernel<false, 1, 4> { + static void Run(int num_output_pixels, int input_depth, int depth_multiplier, + const uint8* input_ptr, int16 input_offset, + int input_ptr_increment, const uint8* filter_ptr, + int16 filter_offset, int32* acc_buffer_ptr) { + // Load the filters, add filter_offset. 
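+ // With input_depth == 1 and depth_multiplier == 4, the four filter taps are
+ // loaded once up front; each output pixel then broadcasts its single input
+ // value against them.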
+ uint8x8_t filter_u8 = vdup_n_u8(0); + filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 0); + filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 1); + filter_u8 = vset_lane_u8(filter_ptr[2], filter_u8, 2); + filter_u8 = vset_lane_u8(filter_ptr[3], filter_u8, 3); + const int16x4_t filter_s16 = + vreinterpret_s16_u16(vget_low_u16(vmovl_u8(filter_u8))); + const int16x4_t filter = vadd_s16(filter_s16, vdup_n_s16(filter_offset)); + + int outp = 0; + // Handle 8 output pixels at a time. + for (; outp <= num_output_pixels - 8; outp += 8) { + // Load the accumulators from acc_buffer + int32x4_t acc[8]; + for (int i = 0; i < 8; i++) { + acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i); + } + + // Load the inputs, add input_offset. + uint8x8_t input_u8 = vld1_u8(input_ptr); + input_ptr += 8; + const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8)); + const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset)); + + // Multiply-accumulate + acc[0] = vmlal_lane_s16(acc[0], filter, vget_low_s16(input), 0); + acc[1] = vmlal_lane_s16(acc[1], filter, vget_low_s16(input), 1); + acc[2] = vmlal_lane_s16(acc[2], filter, vget_low_s16(input), 2); + acc[3] = vmlal_lane_s16(acc[3], filter, vget_low_s16(input), 3); + acc[4] = vmlal_lane_s16(acc[4], filter, vget_high_s16(input), 0); + acc[5] = vmlal_lane_s16(acc[5], filter, vget_high_s16(input), 1); + acc[6] = vmlal_lane_s16(acc[6], filter, vget_high_s16(input), 2); + acc[7] = vmlal_lane_s16(acc[7], filter, vget_high_s16(input), 3); + + // Store the accumulators back to acc_buffer + for (int i = 0; i < 8; i++) { + vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 32; + } + // Handle 4 output pixels at a time. + for (; outp <= num_output_pixels - 4; outp += 4) { + // Load the accumulators from acc_buffer + int32x4_t acc[4]; + for (int i = 0; i < 4; i++) { + acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i); + } + + // Load the inputs, add input_offset. + uint8x8_t input_u8 = vdup_n_u8(0); + input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0); + input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1); + input_u8 = vset_lane_u8(input_ptr[2], input_u8, 2); + input_u8 = vset_lane_u8(input_ptr[3], input_u8, 3); + input_ptr += 4; + const int16x4_t input_s16 = + vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8))); + const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset)); + + // Multiply-accumulate + acc[0] = vmlal_lane_s16(acc[0], filter, input, 0); + acc[1] = vmlal_lane_s16(acc[1], filter, input, 1); + acc[2] = vmlal_lane_s16(acc[2], filter, input, 2); + acc[3] = vmlal_lane_s16(acc[3], filter, input, 3); + + // Store the accumulators back to acc_buffer + for (int i = 0; i < 4; i++) { + vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 16; + } + // Handle one output pixel at a time. + for (; outp < num_output_pixels; outp++) { + // Load the accumulators from acc_buffer + int32x4_t acc = vld1q_s32(acc_buffer_ptr); + + // Load the inputs, add input_offset. 
+ const uint32 input = *input_ptr++ + input_offset; + + // Multiply-accumulate + acc = vmlal_n_s16(acc, filter, input); + // Store the accumulators back to acc_buffer + vst1q_s32(acc_buffer_ptr, acc); + acc_buffer_ptr += 4; + } + } +}; + +template <> +struct QuantizedDepthwiseConvKernel<false, 4, 1> { + static void Run(int num_output_pixels, int input_depth, int depth_multiplier, + const uint8* input_ptr, int16 input_offset, + int input_ptr_increment, const uint8* filter_ptr, + int16 filter_offset, int32* acc_buffer_ptr) { + // Load the filters, add filter_offset. + uint8x8_t filter_u8 = vdup_n_u8(0); + filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 0); + filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 1); + filter_u8 = vset_lane_u8(filter_ptr[2], filter_u8, 2); + filter_u8 = vset_lane_u8(filter_ptr[3], filter_u8, 3); + const int16x4_t filter_s16 = + vreinterpret_s16_u16(vget_low_u16(vmovl_u8(filter_u8))); + const int16x4_t filter = vadd_s16(filter_s16, vdup_n_s16(filter_offset)); + + int outp = 0; + // Handle 4 output pixels at a time. + for (; outp <= num_output_pixels - 4; outp += 4) { + // Load the accumulators from acc_buffer + int32x4_t acc[4]; + for (int i = 0; i < 4; i++) { + acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i); + } + // Load the inputs, add input_offset. + int16x8_t input[2]; + for (int i = 0; i < 2; i++) { + const uint8x8_t input_u8 = vld1_u8(input_ptr + 8 * i); + const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8)); + input[i] = vaddq_s16(input_s16, vdupq_n_s16(input_offset)); + } + input_ptr += 16; + // Multiply-accumulate + for (int i = 0; i < 2; i++) { + acc[2 * i + 0] = + vmlal_s16(acc[2 * i + 0], filter, vget_low_s16(input[i])); + acc[2 * i + 1] = + vmlal_s16(acc[2 * i + 1], filter, vget_high_s16(input[i])); + } + // Store the accumulators back to acc_buffer + for (int i = 0; i < 4; i++) { + vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 16; + } + // Handle one output pixel at a time. + for (; outp < num_output_pixels; outp++) { + // Load the accumulators from acc_buffer + int32x4_t acc; + acc = vld1q_s32(acc_buffer_ptr); + + // Load the inputs, add input_offset. + uint8x8_t input_u8 = vdup_n_u8(0); + input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0); + input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1); + input_u8 = vset_lane_u8(input_ptr[2], input_u8, 2); + input_u8 = vset_lane_u8(input_ptr[3], input_u8, 3); + input_ptr += 4; + const int16x4_t input_s16 = + vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8))); + const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset)); + // Multiply-accumulate + acc = vmlal_s16(acc, filter, input); + // Store the accumulators back to acc_buffer + vst1q_s32(acc_buffer_ptr, acc); + acc_buffer_ptr += 4; + } + } +}; + +template <> +struct QuantizedDepthwiseConvKernel<false, 4, 4> { + static void Run(int num_output_pixels, int input_depth, int depth_multiplier, + const uint8* input_ptr, int16 input_offset, + int input_ptr_increment, const uint8* filter_ptr, + int16 filter_offset, int32* acc_buffer_ptr) { + // Load the filters, add filter_offset. + int16x8_t filter[2]; + for (int i = 0; i < 2; i++) { + const uint8x8_t filter_u8 = vld1_u8(filter_ptr + 8 * i); + const int16x8_t filter_s16 = vreinterpretq_s16_u16(vmovl_u8(filter_u8)); + filter[i] = vaddq_s16(filter_s16, vdupq_n_s16(filter_offset)); + } + + int outp = 0; + // Handle 2 output pixels at a time. 
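+ // Two pixels with input_depth == 4 and depth_multiplier == 4 give 32
+ // accumulators; each of the 8 input lanes is broadcast against its 4
+ // filter taps with vmlal_lane_s16.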
+ for (; outp <= num_output_pixels - 2; outp += 2) { + // Load the accumulators from acc_buffer + int32x4_t acc[8]; + for (int i = 0; i < 8; i++) { + acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i); + } + + // Load the inputs, add input_offset. + uint8x8_t input_u8 = vld1_u8(input_ptr); + input_ptr += 8; + const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8)); + const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset)); + + // Multiply-accumulate + acc[0] = vmlal_lane_s16(acc[0], vget_low_s16(filter[0]), + vget_low_s16(input), 0); + acc[1] = vmlal_lane_s16(acc[1], vget_high_s16(filter[0]), + vget_low_s16(input), 1); + acc[2] = vmlal_lane_s16(acc[2], vget_low_s16(filter[1]), + vget_low_s16(input), 2); + acc[3] = vmlal_lane_s16(acc[3], vget_high_s16(filter[1]), + vget_low_s16(input), 3); + acc[4] = vmlal_lane_s16(acc[4], vget_low_s16(filter[0]), + vget_high_s16(input), 0); + acc[5] = vmlal_lane_s16(acc[5], vget_high_s16(filter[0]), + vget_high_s16(input), 1); + acc[6] = vmlal_lane_s16(acc[6], vget_low_s16(filter[1]), + vget_high_s16(input), 2); + acc[7] = vmlal_lane_s16(acc[7], vget_high_s16(filter[1]), + vget_high_s16(input), 3); + // Store the accumulators back to acc_buffer + for (int i = 0; i < 8; i++) { + vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 32; + } + // Handle one output pixel at a time. + for (; outp < num_output_pixels; outp++) { + // Load the accumulators from acc_buffer + int32x4_t acc[4]; + for (int i = 0; i < 4; i++) { + acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i); + } + + // Load the inputs, add input_offset. + uint8x8_t input_u8 = vdup_n_u8(0); + input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0); + input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1); + input_u8 = vset_lane_u8(input_ptr[2], input_u8, 2); + input_u8 = vset_lane_u8(input_ptr[3], input_u8, 3); + input_ptr += 4; + const int16x4_t input_s16 = + vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8))); + const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset)); + + // Multiply-accumulate + acc[0] = vmlal_lane_s16(acc[0], vget_low_s16(filter[0]), input, 0); + acc[1] = vmlal_lane_s16(acc[1], vget_high_s16(filter[0]), input, 1); + acc[2] = vmlal_lane_s16(acc[2], vget_low_s16(filter[1]), input, 2); + acc[3] = vmlal_lane_s16(acc[3], vget_high_s16(filter[1]), input, 3); + // Store the accumulators back to acc_buffer + for (int i = 0; i < 4; i++) { + vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 16; + } + } +}; + +template <> +struct QuantizedDepthwiseConvKernel<true, 0, 3> { + static void Run(int num_output_pixels, int input_depth, int depth_multiplier, + const uint8* input_ptr, int16 input_offset, + int input_ptr_increment, const uint8* filter_ptr, + int16 filter_offset, int32* acc_buffer_ptr) { + // We will have to duplicate bytes in a NEON register, 3-fold. + // We will do that by register-level table-look-up using VTBL instructions. + // Here we prepare the registers containing the table-lookup indices. + static const uint8 dup3_indices_array[3][8] = {{0, 0, 0, 1, 1, 1, 2, 2}, + {2, 3, 3, 3, 4, 4, 4, 5}, + {5, 5, 6, 6, 6, 7, 7, 7}}; + uint8x8_t dup3_indices[3]; + for (int i = 0; i < 3; i++) { + dup3_indices[i] = vld1_u8(dup3_indices_array[i]); + } + + // Handle one output pixel at a time. + for (int outp = 0; outp < num_output_pixels; outp++) { + const uint8* local_filter_ptr = filter_ptr; + const uint8* local_input_ptr = input_ptr; + int ic = 0; + // Handle 8 input channels at a time. 
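+ // For example, input bytes [a b c d e f g h] become, after the three VTBL
+ // lookups below, [a a a b b b c c], [c d d d e e e f] and
+ // [f f g g g h h h], i.e. every byte repeated 3 times across 24 lanes.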
+ for (; ic <= input_depth - 8; ic += 8) { + // Load the filters, add filter_offset. + int16x8_t filter[3]; + uint8x8x3_t filter_u8; + filter_u8.val[0] = vld1_u8(local_filter_ptr); + filter_u8.val[1] = vld1_u8(local_filter_ptr + 8); + filter_u8.val[2] = vld1_u8(local_filter_ptr + 16); + local_filter_ptr += 24; + for (int i = 0; i < 3; i++) { + const int16x8_t filter_s16 = + vreinterpretq_s16_u16(vmovl_u8(filter_u8.val[i])); + filter[i] = vaddq_s16(filter_s16, vdupq_n_s16(filter_offset)); + } + // Load the inputs, duplicate 3-fold, add input_offset. + const uint8x8_t input_u8 = vld1_u8(local_input_ptr); + local_input_ptr += 8; + + uint8x8_t input_u8_dup3[3]; + for (int i = 0; i < 3; i++) { + input_u8_dup3[i] = vtbl1_u8(input_u8, dup3_indices[i]); + } + int16x8_t input_dup3[3]; + for (int i = 0; i < 3; i++) { + const int16x8_t input_s16_dup3 = + vreinterpretq_s16_u16(vmovl_u8(input_u8_dup3[i])); + input_dup3[i] = vaddq_s16(input_s16_dup3, vdupq_n_s16(input_offset)); + } + // Load the accumulators from acc_buffer + int32x4x3_t acc[2]; + for (int i = 0; i < 2; i++) { + acc[i].val[0] = vld1q_s32(acc_buffer_ptr + 4 * i); + acc[i].val[1] = vld1q_s32(acc_buffer_ptr + 4 * i + 8); + acc[i].val[2] = vld1q_s32(acc_buffer_ptr + 4 * i + 16); + } + // Multiply-accumulate + for (int j = 0; j < 3; j++) { + acc[0].val[j] = vmlal_s16(acc[0].val[j], vget_low_s16(input_dup3[j]), + vget_low_s16(filter[j])); + acc[1].val[j] = vmlal_s16(acc[1].val[j], vget_high_s16(input_dup3[j]), + vget_high_s16(filter[j])); + } + // Store the accumulators back to acc_buffer + for (int i = 0; i < 2; i++) { + vst1q_s32(acc_buffer_ptr + 4 * i, acc[i].val[0]); + vst1q_s32(acc_buffer_ptr + 4 * i + 8, acc[i].val[1]); + vst1q_s32(acc_buffer_ptr + 4 * i + 16, acc[i].val[2]); + } + acc_buffer_ptr += 24; + } + // Handle one input channel at a time. + for (; ic < input_depth; ic++) { + const int16 input_val = *local_input_ptr++ + input_offset; + for (int i = 0; i < 3; i++) { + const int16 filter_val = local_filter_ptr[i] + filter_offset; + *acc_buffer_ptr++ += static_cast<int32>(filter_val) * input_val; + } + local_filter_ptr += 3; + } + input_ptr += input_ptr_increment; + } + } +}; + +template <> +struct QuantizedDepthwiseConvKernel<true, 0, 2> { + static void Run(int num_output_pixels, int input_depth, int depth_multiplier, + const uint8* input_ptr, int16 input_offset, + int input_ptr_increment, const uint8* filter_ptr, + int16 filter_offset, int32* acc_buffer_ptr) { + // Handle one output pixel at a time. + for (int outp = 0; outp < num_output_pixels; outp++) { + const uint8* local_filter_ptr = filter_ptr; + const uint8* local_input_ptr = input_ptr; + int ic = 0; + // Handle 8 input channels at a time. + for (; ic <= input_depth - 8; ic += 8) { + // Load the filters, add filter_offset. + int16x8_t filter[2]; + uint8x8x2_t filter_u8; + filter_u8.val[0] = vld1_u8(local_filter_ptr); + filter_u8.val[1] = vld1_u8(local_filter_ptr + 8); + local_filter_ptr += 16; + for (int i = 0; i < 2; i++) { + const int16x8_t filter_s16 = + vreinterpretq_s16_u16(vmovl_u8(filter_u8.val[i])); + filter[i] = vaddq_s16(filter_s16, vdupq_n_s16(filter_offset)); + } + // Load the inputs, add input_offset, duplicate 2-fold. + const uint8x8_t input_u8 = vld1_u8(local_input_ptr); + local_input_ptr += 8; + const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8)); + const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset)); + const int16x8x2_t input_dup2 = vzipq_s16(input, input); + // Load the accumulators from acc_buffer. 
+ int32x4x2_t acc[2]; + for (int i = 0; i < 2; i++) { + acc[i].val[0] = vld1q_s32(acc_buffer_ptr + 4 * i); + acc[i].val[1] = vld1q_s32(acc_buffer_ptr + 4 * i + 8); + } + // Multiply-accumulate. + for (int j = 0; j < 2; j++) { + acc[0].val[j] = vmlal_s16(acc[0].val[j], vget_low_s16(filter[j]), + vget_low_s16(input_dup2.val[j])); + acc[1].val[j] = vmlal_s16(acc[1].val[j], vget_high_s16(filter[j]), + vget_high_s16(input_dup2.val[j])); + } + // Store the accumulators back to acc_buffer. + for (int i = 0; i < 2; i++) { + vst1q_s32(acc_buffer_ptr + 4 * i, acc[i].val[0]); + vst1q_s32(acc_buffer_ptr + 4 * i + 8, acc[i].val[1]); + } + acc_buffer_ptr += 16; + } + // Handle one input channel at a time. + for (; ic < input_depth; ic++) { + // Load the inputs. + const int16 input_val = *local_input_ptr++ + input_offset; + for (int i = 0; i < 2; i++) { + const int16 filter_val = local_filter_ptr[i] + filter_offset; + *acc_buffer_ptr++ += static_cast<int32>(filter_val) * input_val; + } + local_filter_ptr += 2; + } + input_ptr += input_ptr_increment; + } + } +}; + +template <> +struct QuantizedDepthwiseConvKernel<true, 0, 1> { + static void Run(int num_output_pixels, int input_depth, int depth_multiplier, + const uint8* input_ptr, int16 input_offset, + int input_ptr_increment, const uint8* filter_ptr, + int16 filter_offset, int32* acc_buffer_ptr) { + // Handle one output pixel at a time. + for (int outp = 0; outp < num_output_pixels; outp++) { + const uint8* local_filter_ptr = filter_ptr; + const uint8* local_input_ptr = input_ptr; + int ic = 0; + // Handle 16 input channels at a time. + for (; ic <= input_depth - 16; ic += 16) { + // Load the filters, add filter_offset. + uint8x8_t filter_u8[2]; + for (int i = 0; i < 2; i++) { + filter_u8[i] = vld1_u8(local_filter_ptr + 8 * i); + } + local_filter_ptr += 16; + int16x8_t filter[2]; + for (int i = 0; i < 2; i++) { + filter[i] = vreinterpretq_s16_u16(vmovl_u8(filter_u8[i])); + } + for (int i = 0; i < 2; i++) { + filter[i] = vaddq_s16(filter[i], vdupq_n_s16(filter_offset)); + } + // Load the inputs, add input_offset. + uint8x8_t input_u8[2]; + for (int i = 0; i < 2; i++) { + input_u8[i] = vld1_u8(local_input_ptr + 8 * i); + } + local_input_ptr += 16; + int16x8_t input[2]; + for (int i = 0; i < 2; i++) { + input[i] = vreinterpretq_s16_u16(vmovl_u8(input_u8[i])); + } + for (int i = 0; i < 2; i++) { + input[i] = vaddq_s16(input[i], vdupq_n_s16(input_offset)); + } + // Load the accumulators from acc_buffer + int32x4_t acc[4]; + for (int i = 0; i < 4; i++) { + acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i); + } + // Multiply-accumulate + for (int i = 0; i < 2; i++) { + acc[2 * i + 0] = vmlal_s16(acc[2 * i + 0], vget_low_s16(input[i]), + vget_low_s16(filter[i])); + acc[2 * i + 1] = vmlal_s16(acc[2 * i + 1], vget_high_s16(input[i]), + vget_high_s16(filter[i])); + } + // Store the accumulators back to acc_buffer + for (int i = 0; i < 4; i++) { + vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 16; + } + // Handle 8 input channels at a time. + for (; ic <= input_depth - 8; ic += 8) { + // Load the filters, add filter_offset. + const uint8x8_t filter_u8 = vld1_u8(local_filter_ptr); + local_filter_ptr += 8; + const int16x8_t filter_s16 = vreinterpretq_s16_u16(vmovl_u8(filter_u8)); + const int16x8_t filter = + vaddq_s16(filter_s16, vdupq_n_s16(filter_offset)); + // Load the inputs, add input_offset. 
+ const uint8x8_t input_u8 = vld1_u8(local_input_ptr); + local_input_ptr += 8; + const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8)); + const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset)); + // Load the accumulators from acc_buffer + int32x4_t acc[2]; + for (int i = 0; i < 2; i++) { + acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i); + } + // Multiply-accumulate + acc[0] = vmlal_s16(acc[0], vget_low_s16(input), vget_low_s16(filter)); + acc[1] = vmlal_s16(acc[1], vget_high_s16(input), vget_high_s16(filter)); + // Store the accumulators back to acc_buffer + for (int i = 0; i < 2; i++) { + vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 8; + } + // Handle one input channel at a time. + for (; ic < input_depth; ic++) { + const int16 input_val = *local_input_ptr++ + input_offset; + const int16 filter_val = *local_filter_ptr++ + filter_offset; + *acc_buffer_ptr++ += static_cast<int32>(filter_val) * input_val; + } + input_ptr += input_ptr_increment; + } + } +}; + +template <> +struct QuantizedDepthwiseConvKernel<true, 16, 1> { + static void Run(int num_output_pixels, int input_depth, int depth_multiplier, + const uint8* input_ptr, int16 input_offset, + int input_ptr_increment, const uint8* filter_ptr, + int16 filter_offset, int32* acc_buffer_ptr) { + // Load the filters, add filter_offset. + uint8x8_t filter_u8[2]; + for (int i = 0; i < 2; i++) { + filter_u8[i] = vld1_u8(filter_ptr + 8 * i); + } + int16x8_t filter[2]; + for (int i = 0; i < 2; i++) { + filter[i] = vreinterpretq_s16_u16(vmovl_u8(filter_u8[i])); + } + for (int i = 0; i < 2; i++) { + filter[i] = vaddq_s16(filter[i], vdupq_n_s16(filter_offset)); + } + // Handle one output pixel at a time. + for (int outp = 0; outp < num_output_pixels; outp++) { + // Load the inputs, add input_offset. + uint8x8_t input_u8[2]; + for (int i = 0; i < 2; i++) { + input_u8[i] = vld1_u8(input_ptr + 8 * i); + } + input_ptr += input_ptr_increment; + int16x8_t input[2]; + for (int i = 0; i < 2; i++) { + input[i] = vreinterpretq_s16_u16(vmovl_u8(input_u8[i])); + } + for (int i = 0; i < 2; i++) { + input[i] = vaddq_s16(input[i], vdupq_n_s16(input_offset)); + } + // Load the accumulators from acc_buffer + int32x4_t acc[4]; + for (int i = 0; i < 4; i++) { + acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i); + } + // Multiply-accumulate + for (int i = 0; i < 2; i++) { + acc[2 * i + 0] = vmlal_s16(acc[2 * i + 0], vget_low_s16(input[i]), + vget_low_s16(filter[i])); + acc[2 * i + 1] = vmlal_s16(acc[2 * i + 1], vget_high_s16(input[i]), + vget_high_s16(filter[i])); + } + // Store the accumulators back to acc_buffer + for (int i = 0; i < 4; i++) { + vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 16; + } + } +}; + +template <> +struct QuantizedDepthwiseConvKernel<true, 1, 16> { + static void Run(int num_output_pixels, int input_depth, int depth_multiplier, + const uint8* input_ptr, int16 input_offset, + int input_ptr_increment, const uint8* filter_ptr, + int16 filter_offset, int32* acc_buffer_ptr) { + // Load the filters, add filter_offset. + uint8x8_t filter_u8[2]; + for (int i = 0; i < 2; i++) { + filter_u8[i] = vld1_u8(filter_ptr + 8 * i); + } + int16x8_t filter[2]; + for (int i = 0; i < 2; i++) { + filter[i] = vreinterpretq_s16_u16(vmovl_u8(filter_u8[i])); + } + for (int i = 0; i < 2; i++) { + filter[i] = vaddq_s16(filter[i], vdupq_n_s16(filter_offset)); + } + // Handle one output pixel at a time. 
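+ // Each pixel has a single input byte; it is offset-corrected and broadcast
+ // against the 16 preloaded filter values with vmlal_n_s16.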
+ for (int outp = 0; outp < num_output_pixels; outp++) { + uint8 input_u8 = *input_ptr; + input_ptr += input_ptr_increment; + int16 input = static_cast<int16>(input_u8 + input_offset); + // Load the accumulators from acc_buffer + int32x4_t acc[4]; + for (int i = 0; i < 4; i++) { + acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i); + } + // Multiply-accumulate + for (int i = 0; i < 2; i++) { + acc[2 * i + 0] = + vmlal_n_s16(acc[2 * i + 0], vget_low_s16(filter[i]), input); + acc[2 * i + 1] = + vmlal_n_s16(acc[2 * i + 1], vget_high_s16(filter[i]), input); + } + // Store the accumulators back to acc_buffer + for (int i = 0; i < 4; i++) { + vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 16; + } + } +}; + +template <> +struct QuantizedDepthwiseConvKernel<true, 1, 8> { + static void Run(int num_output_pixels, int input_depth, int depth_multiplier, + const uint8* input_ptr, int16 input_offset, + int input_ptr_increment, const uint8* filter_ptr, + int16 filter_offset, int32* acc_buffer_ptr) { + // Load the filters, add filter_offset. + const uint8x8_t filter_u8 = vld1_u8(filter_ptr); + const int16x8_t filter = vaddq_s16( + vreinterpretq_s16_u16(vmovl_u8(filter_u8)), vdupq_n_s16(filter_offset)); + // Handle one output pixel at a time. + for (int outp = 0; outp < num_output_pixels; outp++) { + uint8 input_u8 = *input_ptr; + input_ptr += input_ptr_increment; + int16 input = static_cast<int16>(input_u8 + input_offset); + // Load the accumulators from acc_buffer + int32x4_t acc[2]; + for (int i = 0; i < 2; i++) { + acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i); + } + // Multiply-accumulate + acc[0] = vmlal_n_s16(acc[0], vget_low_s16(filter), input); + acc[1] = vmlal_n_s16(acc[1], vget_high_s16(filter), input); + // Store the accumulators back to acc_buffer + for (int i = 0; i < 2; i++) { + vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 8; + } + } +}; +#endif + +// Accumulates the effect of one row of the filter, on a segment of one row +// of the output, accessing the corresponding one row of the input. +template <bool kAllowStrided, int kFixedInputDepth, int kFixedDepthMultiplier> +void QuantizedDepthwiseConvAccumRow( + int stride, int input_depth, int input_width, const uint8* input_data, + int16 input_offset, int pad_width, int depth_multiplier, int filter_width, + const uint8* filter_data, int16 filter_offset, int out_x_buffer_start, + int out_x_buffer_end, int output_depth, int32* acc_buffer) { +#ifdef GEMMLOWP_PROFILING + gemmlowp::ScopedProfilingLabel label(__PRETTY_FUNCTION__); +#endif + // Sanity check parameters. This is important in particular to ensure + // that we keep the number of template instantiations minimal, so we don't + // increase binary size unnecessarily. + static_assert(kFixedDepthMultiplier || !kFixedInputDepth, ""); + static_assert(kFixedInputDepth || kAllowStrided, ""); + DCHECK(stride == 1 || kAllowStrided); + if (kFixedInputDepth) { + DCHECK_EQ(input_depth, kFixedInputDepth); + } + if (kFixedDepthMultiplier) { + DCHECK_EQ(depth_multiplier, kFixedDepthMultiplier); + } + DCHECK_EQ(output_depth, input_depth * depth_multiplier); + const int input_ptr_increment = stride * input_depth; + const uint8* filter_base_ptr = filter_data; + for (int filter_x = 0; filter_x < filter_width; ++filter_x) { + // For the current (filter_x, filter_y) point in the filter, + // compute the boundaries of the corresponding output row segment. 
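+    // The formulas below follow from the mapping
+    //   in_x = out_x * stride - pad_width + filter_x;
+    // requiring 0 <= in_x < input_width gives the half-open out_x range
+    //   [ceil((pad_width - filter_x) / stride),
+    //    ceil((pad_width + input_width - filter_x) / stride)),
+    // and the stride == 2 / stride == 4 branches are just that ceiling
+    // division spelled out. For example, pad_width = 1, filter_x = 0,
+    // stride = 2, input_width = 5 yields the range [1, 3): out_x = 1 reads
+    // in_x = 1 and out_x = 2 reads in_x = 3, both in bounds.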
+ int out_x_loop_start_unclampled = 0; + int out_x_loop_end_unclampled = 0; + if (kAllowStrided) { + if (stride == 2) { + out_x_loop_start_unclampled = (pad_width - filter_x + 1) / 2; + out_x_loop_end_unclampled = + (pad_width + input_width - filter_x + 1) / 2; + } else if (stride == 4) { + out_x_loop_start_unclampled = (pad_width - filter_x + 3) / 4; + out_x_loop_end_unclampled = + (pad_width + input_width - filter_x + 3) / 4; + } else { + out_x_loop_start_unclampled = + (pad_width - filter_x + stride - 1) / stride; + out_x_loop_end_unclampled = + (pad_width + input_width - filter_x + stride - 1) / stride; + } + } else { + out_x_loop_start_unclampled = pad_width - filter_x; + out_x_loop_end_unclampled = pad_width + input_width - filter_x; + } + // The kernel will have to iterate on the segment of the + // output row that starts at out_x_loop_start and out_x_loop_end. + const int out_x_loop_start = + std::max(out_x_buffer_start, out_x_loop_start_unclampled); + const int out_x_loop_end = + std::min(out_x_buffer_end, out_x_loop_end_unclampled); + + int32* acc_buffer_ptr = + acc_buffer + (out_x_loop_start - out_x_buffer_start) * output_depth; + const int in_x_origin = (out_x_loop_start * stride) - pad_width + filter_x; + const uint8* input_ptr = input_data + in_x_origin * input_depth; + const int num_output_pixels = out_x_loop_end - out_x_loop_start; + QuantizedDepthwiseConvKernel< + kAllowStrided, kFixedInputDepth, + kFixedDepthMultiplier>::Run(num_output_pixels, input_depth, + depth_multiplier, input_ptr, input_offset, + input_ptr_increment, filter_base_ptr, + filter_offset, acc_buffer_ptr); + filter_base_ptr += output_depth; + } +} + +// generic fallback of DepthwiseConvAccumRow, portable, non-templatized. +inline void QuantizedDepthwiseConvAccumRowGeneric( + int stride, int input_depth, int input_width, const uint8* input_data, + int16 input_offset, int pad_width, int depth_multiplier, int filter_width, + const uint8* filter_data, int16 filter_offset, int out_x_buffer_start, + int out_x_buffer_end, int output_depth, int32* acc_buffer) { + gemmlowp::ScopedProfilingLabel label("DepthwiseConvAccumRowGeneric (slow)"); + const uint8* filter_base_ptr = filter_data; + for (int filter_x = 0; filter_x < filter_width; ++filter_x) { + const int out_x_loop_start = std::max( + out_x_buffer_start, (pad_width - filter_x + stride - 1) / stride); + const int out_x_loop_end = + std::min(out_x_buffer_end, + (pad_width + input_width - filter_x + stride - 1) / stride); + + int32* acc_buffer_ptr = + acc_buffer + (out_x_loop_start - out_x_buffer_start) * output_depth; + const int in_x_origin = (out_x_loop_start * stride) - pad_width + filter_x; + const uint8* input_ptr = input_data + in_x_origin * input_depth; + const int input_ptr_increment = (stride - 1) * input_depth; + for (int out_x = out_x_loop_start; out_x < out_x_loop_end; out_x++) { + const uint8* filter_ptr = filter_base_ptr; + for (int ic = 0; ic < input_depth; ++ic) { + const int16 input_val = *input_ptr++ + input_offset; + for (int m = 0; m < depth_multiplier; m++) { + const int16 filter_val = *filter_ptr++ + filter_offset; + *acc_buffer_ptr++ += static_cast<int32>(filter_val) * input_val; + } + } + input_ptr += input_ptr_increment; + } + filter_base_ptr += output_depth; + } +} + +// Initializes the accumulator buffer with bias values. 
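+// Note on layout: acc_buffer holds num_output_pixels * output_depth int32
+// accumulators in pixel-major order, and each pixel's depth slice starts as a
+// copy of bias_data. For example, output_depth == 2 with bias {b0, b1} yields
+// b0, b1, b0, b1, ... The NEON branches below merely replicate that pattern
+// with 128-bit stores for the common small depths; the trailing memcpy loop
+// covers any other depth.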
+inline void DepthwiseConvInitAccBuffer(int num_output_pixels, int output_depth, + const int32* bias_data, + int32* acc_buffer) { + int i = 0; +#ifdef USE_NEON + if (output_depth == 1) { + const int32x4_t b = vdupq_n_s32(bias_data[0]); + for (; i <= num_output_pixels - 16; i += 16) { + vst1q_s32(acc_buffer + i + 0, b); + vst1q_s32(acc_buffer + i + 4, b); + vst1q_s32(acc_buffer + i + 8, b); + vst1q_s32(acc_buffer + i + 12, b); + } + for (; i <= num_output_pixels - 4; i += 4) { + vst1q_s32(acc_buffer + i, b); + } + } else if (output_depth == 2) { + int32x4_t b = vdupq_n_s32(bias_data[0]); + b = vsetq_lane_s32(bias_data[1], b, 1); + b = vsetq_lane_s32(bias_data[1], b, 3); + for (; i <= num_output_pixels - 8; i += 8) { + vst1q_s32(acc_buffer + 2 * i + 0, b); + vst1q_s32(acc_buffer + 2 * i + 4, b); + vst1q_s32(acc_buffer + 2 * i + 8, b); + vst1q_s32(acc_buffer + 2 * i + 12, b); + } + for (; i <= num_output_pixels - 2; i += 2) { + vst1q_s32(acc_buffer + 2 * i, b); + } + } else if (output_depth == 4) { + const int32x4_t b = vld1q_s32(bias_data); + for (; i <= num_output_pixels - 4; i += 4) { + vst1q_s32(acc_buffer + 4 * i + 0, b); + vst1q_s32(acc_buffer + 4 * i + 4, b); + vst1q_s32(acc_buffer + 4 * i + 8, b); + vst1q_s32(acc_buffer + 4 * i + 12, b); + } + for (; i < num_output_pixels; i++) { + vst1q_s32(acc_buffer + 4 * i, b); + } + } else if (output_depth == 8) { + const int32x4_t b0 = vld1q_s32(bias_data); + const int32x4_t b1 = vld1q_s32(bias_data + 4); + for (; i <= num_output_pixels - 2; i += 2) { + vst1q_s32(acc_buffer + 8 * i + 0, b0); + vst1q_s32(acc_buffer + 8 * i + 4, b1); + vst1q_s32(acc_buffer + 8 * i + 8, b0); + vst1q_s32(acc_buffer + 8 * i + 12, b1); + } + for (; i < num_output_pixels; i++) { + vst1q_s32(acc_buffer + 8 * i + 0, b0); + vst1q_s32(acc_buffer + 8 * i + 4, b1); + } + } else if (output_depth == 16) { + const int32x4_t b0 = vld1q_s32(bias_data); + const int32x4_t b1 = vld1q_s32(bias_data + 4); + const int32x4_t b2 = vld1q_s32(bias_data + 8); + const int32x4_t b3 = vld1q_s32(bias_data + 12); + for (; i < num_output_pixels; i++) { + vst1q_s32(acc_buffer + 16 * i + 0, b0); + vst1q_s32(acc_buffer + 16 * i + 4, b1); + vst1q_s32(acc_buffer + 16 * i + 8, b2); + vst1q_s32(acc_buffer + 16 * i + 12, b3); + } + } +#endif + for (; i < num_output_pixels; i++) { + memcpy(acc_buffer + i * output_depth, bias_data, + sizeof(acc_buffer[0]) * output_depth); + } +} + +template <FusedActivationFunctionType Ac> +void DepthwiseConv(const uint8* input_data, const Dims<4>& input_dims, + int32 input_offset, const uint8* filter_data, + const Dims<4>& filter_dims, int32 filter_offset, + const int32* bias_data, const Dims<4>& bias_dims, + int stride_width, int stride_height, + int pad_width, int pad_height, int depth_multiplier, + int32 output_offset, int32 output_multiplier, + int output_shift, int32 output_activation_min, + int32 output_activation_max, uint8* output_data, + const Dims<4>& output_dims) { + gemmlowp::ScopedProfilingLabel label("DepthwiseConv/8bit"); + static_assert(Ac == FusedActivationFunctionType::kNone || + Ac == FusedActivationFunctionType::kRelu || + Ac == FusedActivationFunctionType::kRelu6 || + Ac == FusedActivationFunctionType::kRelu1, + ""); + DCHECK_LE(output_activation_min, output_activation_max); + if (Ac == FusedActivationFunctionType::kNone) { + DCHECK_EQ(output_activation_min, 0); + DCHECK_EQ(output_activation_max, 255); + } + const int batches = MatchingArraySize(input_dims, 3, output_dims, 3); + const int output_depth = MatchingArraySize(filter_dims, 0, output_dims, 
0); + const int input_height = ArraySize(input_dims, 2); + const int input_width = ArraySize(input_dims, 1); + const int input_depth = ArraySize(input_dims, 0); + const int filter_height = ArraySize(filter_dims, 2); + const int filter_width = ArraySize(filter_dims, 1); + const int output_height = ArraySize(output_dims, 2); + const int output_width = ArraySize(output_dims, 1); + DCHECK(output_depth == input_depth * depth_multiplier); + + static const int kAccBufferMaxSize = 1024; + int32 acc_buffer[kAccBufferMaxSize]; + DCHECK_GE(kAccBufferMaxSize, output_depth); + const int kOutputPixelsInAccBuffer = kAccBufferMaxSize / output_depth; + const int kAccBufferActualSize = kOutputPixelsInAccBuffer * output_depth; + DCHECK_LE(kOutputPixelsInAccBuffer * output_depth, kAccBufferActualSize); + DCHECK_LE(kAccBufferActualSize, kAccBufferMaxSize); + DCHECK_GE(kOutputPixelsInAccBuffer, 1); + + // row_accum_func will point to the core accumulation function to be used + // for this DepthwiseConv op. + auto* row_accum_func = QuantizedDepthwiseConvAccumRowGeneric; + + const int kMaxFixedDepthMultiplier = 16; + int fixed_depth_multiplier = 0; + if (depth_multiplier <= kMaxFixedDepthMultiplier) { + fixed_depth_multiplier = depth_multiplier; + } + // kMaxUnrolling is the max number of output values that we aim to handle + // in one unrolled iteration of the inner loop. For practical performance + // reasons, it is limited by the number of available registers. We could + // fine-tune it depending on the architecture, but that's not worth doing + // since this whole code is not very optimized to begin with. The + // present value reflects what's realistic on ARM 32bit NEON with 16 128-bit + // vector registers. + const int kMaxUnrolling = 16; + int fixed_input_depth = 0; + if (fixed_depth_multiplier && + input_depth * fixed_depth_multiplier <= kMaxUnrolling) { + fixed_input_depth = input_depth; + } +#define TFMINI_USE_DEPTHWISECONV_KERNEL(ALLOW_STRIDED, FIXED_INPUT_DEPTH, \ + FIXED_DEPTH_MULTIPLIER) \ + if ((stride_width == 1 || ALLOW_STRIDED) && \ + fixed_input_depth == FIXED_INPUT_DEPTH && \ + fixed_depth_multiplier == FIXED_DEPTH_MULTIPLIER) { \ + row_accum_func = \ + QuantizedDepthwiseConvAccumRow<ALLOW_STRIDED, FIXED_INPUT_DEPTH, \ + FIXED_DEPTH_MULTIPLIER>; \ + } + +#ifdef USE_NEON + TFMINI_USE_DEPTHWISECONV_KERNEL(true, 0, 1) + TFMINI_USE_DEPTHWISECONV_KERNEL(true, 0, 2) + TFMINI_USE_DEPTHWISECONV_KERNEL(true, 0, 3) + TFMINI_USE_DEPTHWISECONV_KERNEL(false, 1, 2) + TFMINI_USE_DEPTHWISECONV_KERNEL(false, 2, 2) + TFMINI_USE_DEPTHWISECONV_KERNEL(false, 4, 2) + TFMINI_USE_DEPTHWISECONV_KERNEL(true, 8, 2) + TFMINI_USE_DEPTHWISECONV_KERNEL(false, 1, 4) + TFMINI_USE_DEPTHWISECONV_KERNEL(false, 4, 1) + TFMINI_USE_DEPTHWISECONV_KERNEL(false, 4, 4) + TFMINI_USE_DEPTHWISECONV_KERNEL(true, 16, 1) + TFMINI_USE_DEPTHWISECONV_KERNEL(true, 1, 16) + TFMINI_USE_DEPTHWISECONV_KERNEL(false, 8, 1) + TFMINI_USE_DEPTHWISECONV_KERNEL(false, 2, 8) + TFMINI_USE_DEPTHWISECONV_KERNEL(false, 2, 1) + TFMINI_USE_DEPTHWISECONV_KERNEL(true, 1, 8) +#endif // USE_NEON + +#undef TFMINI_USE_DEPTHWISECONV_KERNEL + + // Now that we have determined row_accum_func, we can start work. 
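+  // Sketch of the per-chunk flow, assuming the usual gemmlowp definitions of
+  // the fixed-point helpers: each chunk of kOutputPixelsInAccBuffer output
+  // pixels is accumulated as int32 in acc_buffer (e.g. output_depth == 64
+  // gives 1024 / 64 == 16 pixels per chunk) and then requantized to uint8 as
+  //   out = clamp(RoundingDivideByPOT(
+  //                   SaturatingRoundingDoublingHighMul(acc, output_multiplier),
+  //                   output_shift)
+  //                 + output_offset,
+  //               output_activation_min, output_activation_max);
+  // the vqrdmulhq_n_s32 / RoundingDivideByPOT NEON sequence and the scalar
+  // MultiplyByQuantizedMultiplierSmallerThanOne fallback below both compute
+  // this expression.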
+ uint8* output_ptr = output_data; + for (int b = 0; b < batches; ++b) { + for (int out_y = 0; out_y < output_height; ++out_y) { + const int in_y_origin = (out_y * stride_height) - pad_height; + const int filter_y_start = std::max(0, -in_y_origin); + const int filter_y_end = + std::min(filter_height, input_height - in_y_origin); + for (int out_x_buffer_start = 0; out_x_buffer_start < output_width; + out_x_buffer_start += kOutputPixelsInAccBuffer) { + const int out_x_buffer_end = std::min( + output_width, out_x_buffer_start + kOutputPixelsInAccBuffer); + // We call a 'pixel' a group of activation that share all but the + // 'depth'/'channel' coordinate. num_output_pixels is the number of + // output pixels that we will accumulate in this loop iteration. + const int num_output_pixels = out_x_buffer_end - out_x_buffer_start; + // Initialize our local accumulator with the bias values, so we don't + // have to add them later. + DepthwiseConvInitAccBuffer(num_output_pixels, output_depth, bias_data, + acc_buffer); + // Accumulation loop. Most of the time should be spent in here. + for (int filter_y = filter_y_start; filter_y < filter_y_end; + ++filter_y) { + const int in_y = in_y_origin + filter_y; + row_accum_func( + stride_width, input_depth, input_width, + input_data + in_y * input_dims.strides[2] + + b * input_dims.strides[3], + input_offset, pad_width, depth_multiplier, filter_width, + filter_data + filter_y * filter_dims.strides[2], filter_offset, + out_x_buffer_start, out_x_buffer_end, output_depth, acc_buffer); + } + // Finished accumulating int32 values. Now need to convert them to + // the final 8bit form and store them. + gemmlowp::ScopedProfilingLabel label("downquantize+store"); + const int num_output_values = output_depth * num_output_pixels; + int i = 0; +#ifdef USE_NEON + using gemmlowp::RoundingDivideByPOT; + const int32x4_t output_offset_vec = vdupq_n_s32(output_offset); + const int32x4_t output_activation_min_vec = + vdupq_n_s32(output_activation_min); + const int32x4_t output_activation_max_vec = + vdupq_n_s32(output_activation_max); + // Handle 16 values at once. + // This allows us to issue 4 mutually independent int32 + // multiplications (vqrdmulh), which should alleviate most of their + // high latency. + for (; i <= num_output_values - 16; i += 16) { + int32x4_t acc[4]; + for (int j = 0; j < 4; j++) { + acc[j] = vld1q_s32(acc_buffer + i + 4 * j); + } + + // Fixed-point multiplication. + for (int j = 0; j < 4; j++) { + acc[j] = vqrdmulhq_n_s32(acc[j], output_multiplier); + } + for (int j = 0; j < 4; j++) { + acc[j] = RoundingDivideByPOT(acc[j], output_shift); + } + // Add the output offset. + for (int j = 0; j < 4; j++) { + acc[j] = vaddq_s32(acc[j], output_offset_vec); + } + // Apply the activation function. + if (Ac != FusedActivationFunctionType::kNone) { + for (int j = 0; j < 4; j++) { + acc[j] = vmaxq_s32(acc[j], output_activation_min_vec); + } + for (int j = 0; j < 4; j++) { + acc[j] = vminq_s32(acc[j], output_activation_max_vec); + } + } + // Saturating cast to uint8 and store to destination. + int16x4_t acc_s16[4]; + for (int j = 0; j < 4; j++) { + acc_s16[j] = vqmovn_s32(acc[j]); + } + const int16x8_t res_s16_0 = vcombine_s16(acc_s16[0], acc_s16[1]); + const int16x8_t res_s16_1 = vcombine_s16(acc_s16[2], acc_s16[3]); + const uint8x8_t res_u8_0 = vqmovun_s16(res_s16_0); + const uint8x8_t res_u8_1 = vqmovun_s16(res_s16_1); + vst1q_u8(output_ptr, vcombine_u8(res_u8_0, res_u8_1)); + output_ptr += 16; + } + // Handle 8 values at once. 
+ // Not as good as 16 (now we're only issuing 2 mutually independent + // vqrdmulh instructions, so we're probably paying for their high + // latency). + for (; i <= num_output_values - 8; i += 8) { + int32x4_t acc0 = vld1q_s32(acc_buffer + i); + int32x4_t acc1 = vld1q_s32(acc_buffer + i + 4); + // Fixed-point multiplication. + acc0 = vqrdmulhq_n_s32(acc0, output_multiplier); + acc1 = vqrdmulhq_n_s32(acc1, output_multiplier); + // Rounding right shift. + acc0 = RoundingDivideByPOT(acc0, output_shift); + acc1 = RoundingDivideByPOT(acc1, output_shift); + // Add the output offset. + acc0 = vaddq_s32(acc0, output_offset_vec); + acc1 = vaddq_s32(acc1, output_offset_vec); + // Apply the activation function. + if (Ac != FusedActivationFunctionType::kNone) { + acc0 = vmaxq_s32(acc0, output_activation_min_vec); + acc1 = vmaxq_s32(acc1, output_activation_min_vec); + acc0 = vminq_s32(acc0, output_activation_max_vec); + acc1 = vminq_s32(acc1, output_activation_max_vec); + } + // Saturating cast to uint8 and store to destination. + const int16x4_t acc0_s16 = vqmovn_s32(acc0); + const int16x4_t acc1_s16 = vqmovn_s32(acc1); + const int16x8_t res_s16 = vcombine_s16(acc0_s16, acc1_s16); + const uint8x8_t res_u8 = vqmovun_s16(res_s16); + vst1_u8(output_ptr, res_u8); + output_ptr += 8; + } + // Handle 4 values at once. Now we're paying the full price of the + // high latency of vqrdmulh. Also, storing only 4 bytes at the end + // (without any alignment) can only be done 1 byte at a time. + // Yet, that is still worth doing to minimize the amount of leftover + // that will have to go through the very slow scalar code. + for (; i <= num_output_values - 4; i += 4) { + int32x4_t acc = vld1q_s32(acc_buffer + i); + // Fixed-point multiplication. + acc = vqrdmulhq_n_s32(acc, output_multiplier); + // Rounding right shift. + acc = RoundingDivideByPOT(acc, output_shift); + // Add the output offset. + acc = vaddq_s32(acc, output_offset_vec); + // Apply the activation function. + if (Ac != FusedActivationFunctionType::kNone) { + acc = vmaxq_s32(acc, output_activation_min_vec); + acc = vminq_s32(acc, output_activation_max_vec); + } + // Saturating cast to uint8 and store to destination. + const int16x4_t acc_s16 = vqmovn_s32(acc); + const int16x8_t res_s16 = vcombine_s16(acc_s16, acc_s16); + const uint8x8_t res_u8 = vqmovun_s16(res_s16); + vst1_lane_u8(output_ptr + 0, res_u8, 0); + vst1_lane_u8(output_ptr + 1, res_u8, 1); + vst1_lane_u8(output_ptr + 2, res_u8, 2); + vst1_lane_u8(output_ptr + 3, res_u8, 3); + output_ptr += 4; + } +#endif // USE_NEON + + // Handle leftover values, one by one. This is very slow. + for (; i < num_output_values; i++) { + int32 acc = acc_buffer[i]; + acc = MultiplyByQuantizedMultiplierSmallerThanOne( + acc, output_multiplier, output_shift); + acc += output_offset; + acc = std::max(acc, output_activation_min); + acc = std::min(acc, output_activation_max); + *output_ptr++ = static_cast<uint8>(acc); + } + } + } + } +} + +} // namespace optimized_ops +} // namespace rt +} // namespace nnfw + +#endif // __NNFW_RT_OPTIMIZED_OPS_DEPTHWISECONV_UINT8_H__ diff --git a/runtimes/nn/common/operations/internal/optimized/neon_tensor_utils.cc b/runtimes/nn/common/operations/internal/optimized/neon_tensor_utils.cc new file mode 100644 index 000000000..7af122517 --- /dev/null +++ b/runtimes/nn/common/operations/internal/optimized/neon_tensor_utils.cc @@ -0,0 +1,217 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. 
All Rights Reserved + * Copyright 2017 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include <string.h> + +#include "ActivationFunctor.h" +#include "tensor_utils_impl.h" + +#ifdef USE_NEON + +#include <arm_neon.h> +#define kFloatWeightsPerNeonLane 4 + +namespace nnfw { +namespace rt { +namespace tensor_utils { + +void NeonMatrixBatchVectorMultiplyAccumulate(const float* matrix, int m_rows, + int m_cols, const float* vector, + int n_batch, float* result, + int result_stride) { + // If v_size is not divisible by kWeightsPerNeonLane, we cannot use the main + // vectorized loop, and we need to process sequentially. postamble_start shows + // the start index where this should happen. + const int postamble_start = + m_cols - (m_cols & (kFloatWeightsPerNeonLane - 1)); + + // The arrays used to cache the vector. + float32x4_t* vector_cache_float32x4 = + new float32x4_t[(m_cols / kFloatWeightsPerNeonLane) * + sizeof(float32x4_t)]; + + for (int b = 0; b < n_batch; b++) { + float* result_in_batch = result + b * m_rows; + const float* vector_in_batch = vector + b * m_cols; + const float* matrix_ptr = matrix; + for (int c = 0; c < postamble_start; c += kFloatWeightsPerNeonLane) { + vector_cache_float32x4[c >> 2] = vld1q_f32(vector_in_batch + c); + } + for (int r = 0; r < m_rows; r++) { + float32x4_t acc_32x4 = vmovq_n_f32(0.0); + for (int c = 0; c < postamble_start; c += kFloatWeightsPerNeonLane) { + float32x4_t temp = vector_cache_float32x4[c >> 2]; + // Load 4 float values from vector1 and vector2 and accumulator. + float32x4_t v1_f32x4 = vld1q_f32(matrix_ptr + c); + // Vector multiply-accumulate 4 float + acc_32x4 = vmlaq_f32(acc_32x4, v1_f32x4, temp); + } + // Add the 4 intermediate sum values to get the final dot-prod value for + // this column. + *result_in_batch += + (vgetq_lane_f32(acc_32x4, 0) + vgetq_lane_f32(acc_32x4, 1) + + vgetq_lane_f32(acc_32x4, 2) + vgetq_lane_f32(acc_32x4, 3)); + for (int c = postamble_start; c < m_cols; c++) { + *result_in_batch += matrix_ptr[c] * vector_in_batch[c]; + } + matrix_ptr += m_cols; + result_in_batch += result_stride; + } + } + delete[] vector_cache_float32x4; +} + +void NeonVectorVectorCwiseProduct(const float* vector1, const float* vector2, + int v_size, float* result) { + // If v_size is not divisible by kWeightsPerNeonLane, we cannot use the main + // vectorized loop, and we need to process sequentially. postamble_start shows + // the start index where this should happen. + const int postamble_start = + v_size - (v_size & (kFloatWeightsPerNeonLane - 1)); + for (int v = 0; v < postamble_start; v += kFloatWeightsPerNeonLane) { + // Load 4 float values from vector1 and vector2. + float32x4_t v1_f32x4 = vld1q_f32(vector1 + v); + float32x4_t v2_f32x4 = vld1q_f32(vector2 + v); + // Vector multiply 4 float + float32x4_t mul_32x4 = vmulq_f32(v1_f32x4, v2_f32x4); + // Save to result array. 
+ vst1q_f32(&result[v], mul_32x4); + } + for (int v = postamble_start; v < v_size; v++) { + result[v] = vector1[v] * vector2[v]; + } +} + +void NeonVectorVectorCwiseProductAccumulate(const float* vector1, + const float* vector2, int v_size, + float* result) { + // If v_size is not divisible by kWeightsPerNeonLane, we cannot use the main + // vectorized loop, and we need to process sequentially. postamble_start shows + // the start index where this should happen. + const int postamble_start = + v_size - (v_size & (kFloatWeightsPerNeonLane - 1)); + for (int v = 0; v < postamble_start; v += kFloatWeightsPerNeonLane) { + // Load 4 float values from vector1 and vector2 and accumulator. + float32x4_t v1_f32x4 = vld1q_f32(vector1 + v); + float32x4_t v2_f32x4 = vld1q_f32(vector2 + v); + float32x4_t acc_32x4 = vld1q_f32(result + v); + // Vector multiply-accumulate 4 float + acc_32x4 = vmlaq_f32(acc_32x4, v1_f32x4, v2_f32x4); + // Save to result array. + vst1q_f32(&result[v], acc_32x4); + } + for (int v = postamble_start; v < v_size; v++) { + result[v] += vector1[v] * vector2[v]; + } +} + +void NeonVectorBatchVectorCwiseProductAccumulate(const float* vector, + int v_size, + const float* batch_vector, + int n_batch, float* result) { + // If v_size is not divisible by kWeightsPerNeonLane, we cannot use the main + // vectorized loop, and we need to process sequentially. postamble_start shows + // the start index where this should happen. + const int postamble_start = + v_size - (v_size & (kFloatWeightsPerNeonLane - 1)); + + // The arrays used to cache the vector. + float32x4_t* vector_cache_float32x4 = + new float32x4_t[(v_size / kFloatWeightsPerNeonLane) * + sizeof(float32x4_t)]; + for (int v = 0; v < postamble_start; v += kFloatWeightsPerNeonLane) { + vector_cache_float32x4[v >> 2] = vld1q_f32(vector + v); + } + + float* result_ptr = result; + const float* batch_vector_ptr = batch_vector; + for (int b = 0; b < n_batch; b++) { + for (int v = 0; v < postamble_start; v += kFloatWeightsPerNeonLane) { + // Load from memory to vectors. + float32x4_t result_f32x4 = vld1q_f32(result_ptr + v); + float32x4_t batch_vector_f32x4 = vld1q_f32(batch_vector_ptr + v); + // Multiply-accumulate. + result_f32x4 = vmlaq_f32(result_f32x4, batch_vector_f32x4, + vector_cache_float32x4[v >> 2]); + // Store. + vst1q_f32(result_ptr + v, result_f32x4); + } + // Postamble loop + for (int v = postamble_start; v < v_size; v++) { + result_ptr[v] += vector[v] * batch_vector_ptr[v]; + } + // Update the pointers. + result_ptr += v_size; + batch_vector_ptr += v_size; + } + delete[] vector_cache_float32x4; +} + +void NeonSub1Vector(const float* vector, int v_size, float* result) { + // If v_size is not divisible by kWeightsPerNeonLane, we cannot use the main + // vectorized loop, and we need to process sequentially. postamble_start shows + // the start index where this should happen. + const int postamble_start = + v_size - (v_size & (kFloatWeightsPerNeonLane - 1)); + + float32x4_t one_f32x4 = vmovq_n_f32(1.0); + for (int v = 0; v < postamble_start; v += kFloatWeightsPerNeonLane) { + // Load 4 float values from the current pointers of the input column and + // subtract from 1. + float32x4_t v_f32x4 = vld1q_f32(vector + v); + float32x4_t result_f32x4 = vsubq_f32(one_f32x4, v_f32x4); + // Save to output. 
+ vst1q_f32(result + v, result_f32x4); + } + for (int v = postamble_start; v < v_size; v++) { + result[v] = 1.0f - vector[v]; + } +} + +void NeonClipVector(const float* vector, int v_size, float abs_limit, + float* result) { + // If v_size is not divisible by kWeightsPerNeonLane, we cannot use the main + // vectorized loop, and we need to process sequentially. postamble_start shows + // the start index where this should happen. + const int postamble_start = + v_size - (v_size & (kFloatWeightsPerNeonLane - 1)); + + // Replicate abs_limit and -abs_limit in two vectors. + const float32x4_t abs_limit_f32x4 = vmovq_n_f32(abs_limit); + const float32x4_t neg_abs_limit_f32x4 = vmovq_n_f32(-abs_limit); + + for (int v = 0; v < postamble_start; v += kFloatWeightsPerNeonLane) { + // Load from memory to vector. + float32x4_t v_f32x4 = vld1q_f32(vector + v); + // Clip between abs_limit and -abs_limit. + float32x4_t result_f32x4 = vminq_f32(abs_limit_f32x4, v_f32x4); + result_f32x4 = vmaxq_f32(neg_abs_limit_f32x4, result_f32x4); + // Save to output. + vst1q_f32(result + v, result_f32x4); + } + // Postamble loop. + for (int v = postamble_start; v < v_size; v++) { + result[v] = (abs_limit < vector[v]) ? abs_limit : vector[v]; + result[v] = (-abs_limit > result[v]) ? -abs_limit : result[v]; + } +} + +} // namespace tensor_utils +} // namespace rt +} // namespace nnfw + +#endif // USE_NEON diff --git a/runtimes/nn/common/operations/internal/optimized/neon_tensor_utils.h b/runtimes/nn/common/operations/internal/optimized/neon_tensor_utils.h new file mode 100644 index 000000000..2a6f31572 --- /dev/null +++ b/runtimes/nn/common/operations/internal/optimized/neon_tensor_utils.h @@ -0,0 +1,119 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2017 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef __NNFW_RT_NEON_TENSOR_UTILS_H__ +#define __NNFW_RT_NEON_TENSOR_UTILS_H__ + +#include "ActivationFunctor.h" +#include "cpu_check.h" +#include "tensor_utils_impl.h" + +namespace nnfw { +namespace rt { +namespace tensor_utils { + +void MatrixBatchVectorMultiplyAccumulate(const float* matrix, int m_rows, + int m_cols, const float* vector, + int n_batch, float* result, + int result_stride) { + NEON_OR_PORTABLE(MatrixBatchVectorMultiplyAccumulate, matrix, m_rows, m_cols, + vector, n_batch, result, result_stride); +} + +void VectorVectorCwiseProduct(const float* vector1, const float* vector2, + int v_size, float* result) { + NEON_OR_PORTABLE(VectorVectorCwiseProduct, vector1, vector2, v_size, result); +} + +void VectorVectorCwiseProductAccumulate(const float* vector1, + const float* vector2, int v_size, + float* result) { + NEON_OR_PORTABLE(VectorVectorCwiseProductAccumulate, vector1, vector2, v_size, + result); +} + +void VectorBatchVectorCwiseProductAccumulate(const float* vector, int v_size, + const float* batch_vector, + int n_batch, float* result) { + NEON_OR_PORTABLE(VectorBatchVectorCwiseProductAccumulate, vector, v_size, + batch_vector, n_batch, result); +} + +float VectorVectorDotProduct(const float* vector1, const float* vector2, + int v_size) { + return PortableVectorVectorDotProduct(vector1, vector2, v_size); +} + +void BatchVectorBatchVectorDotProduct(const float* vector1, + const float* vector2, int v_size, + int n_batch, float* result, + int result_stride) { + PortableBatchVectorBatchVectorDotProduct(vector1, vector2, v_size, n_batch, + result, result_stride); +} + +void VectorBatchVectorAssign(const float* vector, int v_size, int n_batch, + float* batch_vector) { + PortableVectorBatchVectorAssign(vector, v_size, n_batch, batch_vector); +} + +void ApplySigmoidToVector(const float* vector, int v_size, float* result) { + PortableApplySigmoidToVector(vector, v_size, result); +} + +void ApplyActivationToVector(const float* vector, int v_size, + ActivationFn activation, float* result) { + PortableApplyActivationToVector(vector, v_size, activation, result); +} + +void CopyVector(const float* vector, int v_size, float* result) { + PortableCopyVector(vector, v_size, result); +} + +void Sub1Vector(const float* vector, int v_size, float* result) { + NEON_OR_PORTABLE(Sub1Vector, vector, v_size, result); +} + +void ZeroVector(float* vector, int v_size) { + PortableZeroVector(vector, v_size); +} + +float Clip(float f, float abs_limit) { return PortableClip(f, abs_limit); } + +void ClipVector(const float* vector, int v_size, float abs_limit, + float* result) { + NEON_OR_PORTABLE(ClipVector, vector, v_size, abs_limit, result); +} + +// TODO(ghodrat): Implement Neon version. +void VectorShiftLeft(float* vector, int v_size, float shift_value) { + PortableVectorShiftLeft(vector, v_size, shift_value); +} + +// TODO(ghodrat): Implement Neon version. 
+void ReductionSumVector(const float* input_vector, int input_stride, + float* output_vector, int output_size, + int reduction_size) { + PortableReductionSumVector(input_vector, input_stride, output_vector, + output_size, reduction_size); +} + +} // namespace tensor_utils +} // namespace rt +} // namespace nnfw + +#endif // __NNFW_RT_NEON_TENSOR_UTILS_H__ diff --git a/runtimes/nn/common/operations/internal/optimized/optimized_ops.h b/runtimes/nn/common/operations/internal/optimized/optimized_ops.h new file mode 100644 index 000000000..33862a0d7 --- /dev/null +++ b/runtimes/nn/common/operations/internal/optimized/optimized_ops.h @@ -0,0 +1,2717 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (C) 2017 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __NNFW_RT_OPTIMIZED_OPS_H__ +#define __NNFW_RT_OPTIMIZED_OPS_H__ + +#include <assert.h> +#include <stdint.h> +#include <sys/types.h> +#include <algorithm> +#include <cmath> +#include <limits> +#include <memory> +#include <tuple> +#include <type_traits> + +#include "Eigen/Core" +#include "fixedpoint.h" +#include "gemmlowp.h" +#include "../common.h" +#include "../types.h" + +namespace nnfw { +namespace rt { +namespace optimized_ops { + +// Make a local VectorMap typedef allowing to map a float array +// as a Eigen vector expression. The std::conditional here is to +// construct the suitable Eigen type for the constness of the +// data. Indeed, for const data, we need to produce +// Eigen::Map<const Eigen::Matrix<float, ...>> +// and not the more straightforward +// Eigen::Map<Eigen::Matrix<const float, ...>> +template <typename Scalar> +using VectorMap = typename std::conditional< + std::is_const<Scalar>::value, + Eigen::Map<const Eigen::Matrix<typename std::remove_const<Scalar>::type, + Eigen::Dynamic, 1>>, + Eigen::Map<Eigen::Matrix<Scalar, Eigen::Dynamic, 1>>>::type; + +template <typename Scalar, int N> +VectorMap<Scalar> MapAsVector(Scalar* data, const Dims<N>& dims) { + const int size = RequiredBufferSizeForDims(dims); + return VectorMap<Scalar>(data, size, 1); +} + +// Make a local VectorMap typedef allowing to map a float array +// as a Eigen matrix expression. The same explanation as for VectorMap +// above also applies here. 
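+// For instance, MapAsMatrixWithFirstDimAsRows below views a packed Dims<4>
+// buffer with sizes {depth, width, height, batches} as a column-major Eigen
+// matrix of shape depth x (width * height * batches); this is what allows
+// FullyConnected and Conv further down to be expressed as a single Gemm call.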
+template <typename Scalar> +using MatrixMap = typename std::conditional< + std::is_const<Scalar>::value, + Eigen::Map<const Eigen::Matrix<typename std::remove_const<Scalar>::type, + Eigen::Dynamic, Eigen::Dynamic>>, + Eigen::Map<Eigen::Matrix<Scalar, Eigen::Dynamic, Eigen::Dynamic>>>::type; + +template <typename Scalar, int N> +MatrixMap<Scalar> MapAsMatrixWithFirstDimAsRows(Scalar* data, + const Dims<N>& dims) { + const int rows = dims.sizes[0]; + int cols = 1; + for (int d = 1; d < N; d++) { + cols *= dims.sizes[d]; + } + return MatrixMap<Scalar>(data, rows, cols); +} + +template <typename Scalar, int N> +MatrixMap<Scalar> MapAsMatrixWithLastDimAsCols(Scalar* data, + const Dims<N>& dims) { + const int cols = dims.sizes[N - 1]; + int rows = 1; + for (int d = 0; d < N - 1; d++) { + rows *= dims.sizes[d]; + } + return MatrixMap<Scalar>(data, rows, cols); +} + +template <typename Scalar> +using ArrayMap = typename std::conditional< + std::is_const<Scalar>::value, + Eigen::Map<const Eigen::Array<typename std::remove_const<Scalar>::type, + Eigen::Dynamic, Eigen::Dynamic>>, + Eigen::Map<Eigen::Array<Scalar, Eigen::Dynamic, Eigen::Dynamic>>>::type; + +template <typename Scalar, int N> +ArrayMap<Scalar> MapAsArrayWithFirstDimAsRows(Scalar* data, + const Dims<N>& dims) { + const int rows = dims.sizes[0]; + int cols = 1; + for (int d = 1; d < N; d++) { + cols *= dims.sizes[d]; + } + return ArrayMap<Scalar>(data, rows, cols); +} + +// TODO(b/62193649): this function is only needed as long +// as we have the --variable_batch hack. +template <typename Scalar, int N> +MatrixMap<Scalar> MapAsMatrixWithGivenNumberOfRows(Scalar* data, + const Dims<N>& dims, + int rows) { + int cols = 1; + bool matched_rows = false; + for (int d = 0; d < N; d++) { + cols *= dims.sizes[d]; + if (cols == rows) { + matched_rows = true; + cols = 1; + } + } + DCHECK(matched_rows); + return MatrixMap<Scalar>(data, rows, cols); +} + +// DO NOT USE THIS STRUCT FOR NEW FUNCTIONALITY BEYOND IMPLEMENTING ELEMENT-WISE +// BROADCASTING. +// +// NdArrayDesc<N> describes the shape and memory layout of an N-dimensional +// rectangular array of numbers. +// +// NdArrayDesc<N> is basically identical to Dims<N> defined in types.h. +// However, as Dims<N> is to be deprecated, this class exists as an adaptor +// to enable simple unoptimized implementations of element-wise broadcasting +// operations. +template<int N> +struct NdArrayDesc { + // The "extent" of each dimension. Indices along dimension d must be in the + // half-open interval [0, extents[d]). + int extents[N]; + + // The number of *elements* (not bytes) between consecutive indices of each + // dimension. + int strides[N]; +}; + +// DO NOT USE THIS FUNCTION FOR NEW FUNCTIONALITY BEYOND IMPLEMENTING +// ELEMENT-WISE BROADCASTING. +// +// Same as Offset(), except takes as NdArrayDesc<N> instead of Dims<N>. +inline int SubscriptToIndex(const NdArrayDesc<4>& desc, int i0, int i1, int i2, + int i3) { + DCHECK(i0 >= 0 && i0 < desc.extents[0]); + DCHECK(i1 >= 0 && i1 < desc.extents[1]); + DCHECK(i2 >= 0 && i2 < desc.extents[2]); + DCHECK(i3 >= 0 && i3 < desc.extents[3]); + return i0 * desc.strides[0] + i1 * desc.strides[1] + i2 * desc.strides[2] + + i3 * desc.strides[3]; +} + +// Given the dimensions of the operands for an element-wise binary broadcast, +// adjusts them so that they can be directly iterated over with simple loops. +// Returns the adjusted dims as instances of NdArrayDesc in 'desc0_out' and +// 'desc1_out'. 'desc0_out' and 'desc1_out' cannot be nullptr. 
+// +// This function assumes that the two input shapes are compatible up to +// broadcasting and the shorter one has already been prepended with 1s to be the +// same length. E.g., if shape0 is (1, 16, 16, 64) and shape1 is (1, 64), +// shape1 must already have been prepended to be (1, 1, 1, 64). Recall that +// Dims<N> refer to shapes in reverse order. In this case, input0_dims will be +// (64, 16, 16, 1) and input1_dims will be (64, 1, 1, 1). +// +// When two shapes are compatible up to broadcasting, for each dimension d, +// the input extents are either equal, or one of them is 1. +// +// This function performs the following for each dimension d: +// - If the extents are equal, then do nothing since the loop that walks over +// both of the input arrays is correct. +// - Otherwise, one (and only one) of the extents must be 1. Say extent0 is 1 +// and extent1 is e1. Then set extent0 to e1 and stride0 *to 0*. This allows +// array0 to be referenced *at any index* in dimension d and still access the +// same slice. +template <int N> +inline void NdArrayDescsForElementwiseBroadcast(const Dims<N>& input0_dims, + const Dims<N>& input1_dims, + NdArrayDesc<N>* desc0_out, + NdArrayDesc<N>* desc1_out) { + DCHECK(desc0_out != nullptr); + DCHECK(desc1_out != nullptr); + + // Copy dims to desc. + for (int i = 0; i < N; ++i) { + desc0_out->extents[i] = input0_dims.sizes[i]; + desc0_out->strides[i] = input0_dims.strides[i]; + desc1_out->extents[i] = input1_dims.sizes[i]; + desc1_out->strides[i] = input1_dims.strides[i]; + } + + // Walk over each dimension. If the extents are equal do nothing. + // Otherwise, set the desc with extent 1 to have extent equal to the other and + // stride 0. + for (int i = 0; i < N; ++i) { + const int extent0 = ArraySize(input0_dims, i); + const int extent1 = ArraySize(input1_dims, i); + if (extent0 != extent1) { + if (extent0 == 1) { + desc0_out->strides[i] = 0; + desc0_out->extents[i] = extent1; + } else { + DCHECK_EQ(extent1, 1); + desc1_out->strides[i] = 0; + desc1_out->extents[i] = extent0; + } + } + } +} + +#ifdef USE_NEON +template <FusedActivationFunctionType Ac> +void AddBiasAndEvalActivationFunction(const float* bias_data, + const Dims<4>& bias_dims, + float* array_data, + const Dims<4>& array_dims) { + gemmlowp::ScopedProfilingLabel label("AddBiasAndEvalActivationFunction"); + const int bias_size = bias_dims.sizes[3] * bias_dims.strides[3]; + const int array_size = array_dims.sizes[3] * array_dims.strides[3]; + DCHECK_EQ((array_size % bias_size), 0); + float* array_ptr = array_data; + float* array_end_ptr = array_ptr + array_size; + const auto zero = vdupq_n_f32(0); + const auto six = vdupq_n_f32(6); + const auto neg_one = vdupq_n_f32(-1); + const auto one = vdupq_n_f32(1); + for (; array_ptr != array_end_ptr; array_ptr += bias_size) { + int i = 0; + for (; i <= bias_size - 16; i += 16) { + auto b0 = vld1q_f32(bias_data + i); + auto b1 = vld1q_f32(bias_data + i + 4); + auto b2 = vld1q_f32(bias_data + i + 8); + auto b3 = vld1q_f32(bias_data + i + 12); + auto a0 = vld1q_f32(array_ptr + i); + auto a1 = vld1q_f32(array_ptr + i + 4); + auto a2 = vld1q_f32(array_ptr + i + 8); + auto a3 = vld1q_f32(array_ptr + i + 12); + auto x0 = vaddq_f32(a0, b0); + auto x1 = vaddq_f32(a1, b1); + auto x2 = vaddq_f32(a2, b2); + auto x3 = vaddq_f32(a3, b3); + if (Ac == FusedActivationFunctionType::kRelu || + Ac == FusedActivationFunctionType::kRelu6) { + x0 = vmaxq_f32(zero, x0); + x1 = vmaxq_f32(zero, x1); + x2 = vmaxq_f32(zero, x2); + x3 = vmaxq_f32(zero, x3); + if (Ac == 
FusedActivationFunctionType::kRelu6) { + x0 = vminq_f32(six, x0); + x1 = vminq_f32(six, x1); + x2 = vminq_f32(six, x2); + x3 = vminq_f32(six, x3); + } + } else if (Ac == FusedActivationFunctionType::kRelu1) { + x0 = vmaxq_f32(neg_one, x0); + x1 = vmaxq_f32(neg_one, x1); + x2 = vmaxq_f32(neg_one, x2); + x3 = vmaxq_f32(neg_one, x3); + x0 = vminq_f32(one, x0); + x1 = vminq_f32(one, x1); + x2 = vminq_f32(one, x2); + x3 = vminq_f32(one, x3); + } + vst1q_f32(array_ptr + i, x0); + vst1q_f32(array_ptr + i + 4, x1); + vst1q_f32(array_ptr + i + 8, x2); + vst1q_f32(array_ptr + i + 12, x3); + } + for (; i <= bias_size - 4; i += 4) { + auto b = vld1q_f32(bias_data + i); + auto a = vld1q_f32(array_ptr + i); + auto x = vaddq_f32(a, b); + if (Ac == FusedActivationFunctionType::kRelu || + Ac == FusedActivationFunctionType::kRelu6) { + x = vmaxq_f32(zero, x); + if (Ac == FusedActivationFunctionType::kRelu6) { + x = vminq_f32(six, x); + } + } else if (Ac == FusedActivationFunctionType::kRelu1) { + x = vmaxq_f32(neg_one, x); + x = vminq_f32(one, x); + } + vst1q_f32(array_ptr + i, x); + } + for (; i < bias_size; i++) { + array_ptr[i] = ActivationFunction<Ac>(array_ptr[i] + bias_data[i]); + } + } +} +#else // not NEON +template <FusedActivationFunctionType Ac> +void AddBiasAndEvalActivationFunction(const float* bias_data, + const Dims<4>& bias_dims, + float* array_data, + const Dims<4>& array_dims) { + gemmlowp::ScopedProfilingLabel label("AddBiasAndEvalActivationFunction"); + const int bias_size = bias_dims.sizes[3] * bias_dims.strides[3]; + const int array_size = array_dims.sizes[3] * array_dims.strides[3]; + DCHECK_EQ((array_size % bias_size), 0); + for (int array_offset = 0; array_offset < array_size; + array_offset += bias_size) { + for (int i = 0; i < bias_size; i++) { + array_data[array_offset + i] = + ActivationFunction<Ac>(array_data[array_offset + i] + bias_data[i]); + } + } +} +#endif + +template <typename Lhs, typename Rhs, typename Result> +void Gemm(const Eigen::MatrixBase<Lhs>& lhs, const Eigen::MatrixBase<Rhs>& rhs, + Eigen::MatrixBase<Result>* result) { + if (rhs.cols() == 1) { + gemmlowp::ScopedProfilingLabel label("GEMV"); + result->col(0).noalias() = lhs * rhs.col(0); + } else { + gemmlowp::ScopedProfilingLabel label("GEMM"); + result->noalias() = lhs * rhs; + } +} + +template <FusedActivationFunctionType Ac> +void FullyConnected(const float* input_data, const Dims<4>& input_dims, + const float* weights_data, const Dims<4>& weights_dims, + const float* bias_data, const Dims<4>& bias_dims, + float* output_data, const Dims<4>& output_dims) { + gemmlowp::ScopedProfilingLabel label("FullyConnected"); + // TODO(b/62193649): this convoluted shape computation (determining + // input_rows from the weights_dims, then MapAsMatrixWithGivenNumberOfRows) + // is because the current --variable_batch hack consists in overwriting the + // 3rd dimension with the runtime batch size, as we don't keep track for each + // array of which dimension is the batch dimension in it. 
+ // When that is fixed, this should become: + // const auto input_matrix_map = + // MapAsMatrixWithFirstDimAsRows(input_data, input_dims); + const int input_rows = ArraySize(weights_dims, 0); + const auto input_matrix_map = + MapAsMatrixWithGivenNumberOfRows(input_data, input_dims, input_rows); + const auto filter_matrix_map = + MapAsMatrixWithFirstDimAsRows(weights_data, weights_dims); + auto output_matrix_map = + MapAsMatrixWithFirstDimAsRows(output_data, output_dims); + + Gemm(filter_matrix_map.transpose(), input_matrix_map, &output_matrix_map); + AddBiasAndEvalActivationFunction<Ac>(bias_data, bias_dims, output_data, + output_dims); +} + +inline void preload_l1_stream(const uint8* ptr) { +#ifdef GEMMLOWP_ARM_64 + asm volatile("prfm pldl1strm, [%[ptr]]\n" ::[ptr] "r"(ptr) :); +#else + gemmlowp::Prefetch(ptr); +#endif +} + +#ifdef USE_NEON +template <FusedActivationFunctionType Ac> +void FullyConnectedAsGEMV(const uint8* input_data, const Dims<4>& input_dims, + int32 input_offset, const uint8* filter_data, + const Dims<4>& filter_dims, int32 filter_offset, + const int32* bias_data, const Dims<4>& bias_dims, + int32 output_offset, int32 output_multiplier, + int output_shift, int32 output_activation_min, + int32 output_activation_max, uint8* output_data, + const Dims<4>& output_dims) { + gemmlowp::ScopedProfilingLabel label("FullyConnectedAsGEMV/8bit"); + static_assert(Ac == FusedActivationFunctionType::kNone || + Ac == FusedActivationFunctionType::kRelu || + Ac == FusedActivationFunctionType::kRelu6 || + Ac == FusedActivationFunctionType::kRelu1, + ""); + DCHECK(IsPackedWithoutStrides(input_dims)); + DCHECK(IsPackedWithoutStrides(filter_dims)); + DCHECK(IsPackedWithoutStrides(bias_dims)); + DCHECK(IsPackedWithoutStrides(output_dims)); + DCHECK_EQ(ArraySize(output_dims, 1) * ArraySize(output_dims, 2) * + ArraySize(output_dims, 3), + 1); + const int input_size = input_dims.strides[3]; + const int output_size = MatchingArraySize(filter_dims, 1, output_dims, 0); + static constexpr int kPeel = 4; + for (int k = 0; k < input_size; k += 64) { + preload_l1_stream(input_data + k); + } + for (int k = 0; k < kPeel * input_size; k += 64) { + preload_l1_stream(filter_data + k); + } + DCHECK(!(output_size % kPeel)); + const int32* bias_ptr = bias_data; + uint8* output_ptr = output_data; + for (int out = 0; out < output_size; out += kPeel) { + int32x4_t acc[kPeel]; + for (int k = 0; k < kPeel; k++) { + acc[k] = vdupq_n_s32(0); + } + const int16x8_t input_offset_vec = vdupq_n_s16(input_offset); + const int16x8_t filter_offset_vec = vdupq_n_s16(filter_offset); + int in = 0; + for (; in <= input_size - 16; in += 16) { + const uint8x16_t input_val_u8 = vld1q_u8(input_data + in); + uint8x16_t filter_val_u8[kPeel]; + for (int k = 0; k < kPeel; k++) { + const uint8* filter_ptr = filter_data + in + (out + k) * input_size; + filter_val_u8[k] = vld1q_u8(filter_ptr); + preload_l1_stream(filter_ptr + 64); + } + int16x8_t input_val[2]; + const uint8x8_t low = vget_low_u8(input_val_u8); + const uint8x8_t high = vget_high_u8(input_val_u8); + input_val[0] = vreinterpretq_s16_u16(vmovl_u8(low)); + input_val[1] = vreinterpretq_s16_u16(vmovl_u8(high)); + input_val[0] = vaddq_s16(input_val[0], input_offset_vec); + input_val[1] = vaddq_s16(input_val[1], input_offset_vec); + int16x8_t filter_val[kPeel][2]; + for (int k = 0; k < kPeel; k++) { + const uint8x8_t low = vget_low_u8(filter_val_u8[k]); + const uint8x8_t high = vget_high_u8(filter_val_u8[k]); + filter_val[k][0] = vreinterpretq_s16_u16(vmovl_u8(low)); + 
filter_val[k][1] = vreinterpretq_s16_u16(vmovl_u8(high)); + filter_val[k][0] = vaddq_s16(filter_val[k][0], filter_offset_vec); + filter_val[k][1] = vaddq_s16(filter_val[k][1], filter_offset_vec); + } + for (int p = 0; p < 2; p++) { + for (int k = 0; k < kPeel; k++) { + acc[k] = vmlal_s16(acc[k], vget_low_s16(filter_val[k][p]), + vget_low_s16(input_val[p])); + } + for (int k = 0; k < kPeel; k++) { + acc[k] = vmlal_s16(acc[k], vget_high_s16(filter_val[k][p]), + vget_high_s16(input_val[p])); + } + } + } + for (; in <= input_size - 8; in += 8) { + const uint8x8_t input_val_u8 = vld1_u8(input_data + in); + uint8x8_t filter_val_u8[kPeel]; + for (int k = 0; k < kPeel; k++) { + const uint8* filter_ptr = filter_data + in + (out + k) * input_size; + filter_val_u8[k] = vld1_u8(filter_ptr); + } + int16x8_t input_val; + input_val = vreinterpretq_s16_u16(vmovl_u8(input_val_u8)); + input_val = vaddq_s16(input_val, input_offset_vec); + int16x8_t filter_val[kPeel]; + for (int k = 0; k < kPeel; k++) { + filter_val[k] = vreinterpretq_s16_u16(vmovl_u8(filter_val_u8[k])); + filter_val[k] = vaddq_s16(filter_val[k], filter_offset_vec); + } + for (int k = 0; k < kPeel; k++) { + acc[k] = vmlal_s16(acc[k], vget_low_s16(filter_val[k]), + vget_low_s16(input_val)); + } + for (int k = 0; k < kPeel; k++) { + acc[k] = vmlal_s16(acc[k], vget_high_s16(filter_val[k]), + vget_high_s16(input_val)); + } + } + if (in < input_size) { + int32 buf[4 * kPeel]; + for (int k = 0; k < 4; k++) { + vst1q_s32(buf + 4 * k, acc[k]); + } + for (; in < input_size; in++) { + int lane = (in + 8 - input_size) % 4; + const int32 input_val = input_data[in] + input_offset; + for (int k = 0; k < kPeel; k++) { + int32 filter_val = + filter_data[in + (out + k) * input_size] + filter_offset; + buf[lane + 4 * k] += filter_val * input_val; + } + } + for (int k = 0; k < 4; k++) { + acc[k] = vld1q_s32(buf + 4 * k); + } + } + + // Horizontally reduce accumulators + int32x2_t pairwise_reduced_acc[kPeel]; + for (int k = 0; k < kPeel; k++) { + pairwise_reduced_acc[k] = + vpadd_s32(vget_low_s32(acc[k]), vget_high_s32(acc[k])); + } + static_assert(kPeel == 4, "the code below currently assumes kPeel = 4"); + const int32x2_t reduced_lo = + vpadd_s32(pairwise_reduced_acc[0], pairwise_reduced_acc[1]); + const int32x2_t reduced_hi = + vpadd_s32(pairwise_reduced_acc[2], pairwise_reduced_acc[3]); + int32x4_t reduced = vcombine_s32(reduced_lo, reduced_hi); + // Add bias values. + int32x4_t bias_vec = vld1q_s32(bias_ptr); + bias_ptr += 4; + reduced = vaddq_s32(reduced, bias_vec); + // Multiply by the fixed-point multiplier. + reduced = vqrdmulhq_n_s32(reduced, output_multiplier); + // Rounding-shift-right. + using gemmlowp::RoundingDivideByPOT; + reduced = RoundingDivideByPOT(reduced, output_shift); + // Add the output offset. + const int32x4_t output_offset_vec = vdupq_n_s32(output_offset); + reduced = vaddq_s32(reduced, output_offset_vec); + // Narrow values down to 16 bit signed. + const int16x4_t res16 = vqmovn_s32(reduced); + // Narrow values down to 8 bit unsigned, saturating. + uint8x8_t res8 = vqmovun_s16(vcombine_s16(res16, res16)); + if (Ac != FusedActivationFunctionType::kNone) { + // Apply the clamping from the activation function + res8 = vmax_u8(res8, vdup_n_u8(output_activation_min)); + res8 = vmin_u8(res8, vdup_n_u8(output_activation_max)); + } + // Store results to destination. Assumes 32bit alignment. 
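+    // res8 holds kPeel == 4 valid uint8 results in its low lanes, so
+    // reinterpreting them as one uint32 lane and issuing a single 32-bit
+    // store writes all four outputs at once; hence the alignment assumption
+    // and the DCHECK above that output_size is a multiple of kPeel.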
+ vst1_lane_u32(reinterpret_cast<uint32*>(output_ptr), + vreinterpret_u32_u8(res8), 0); + output_ptr += kPeel; + } +} +#endif // USE_NEON + +template <FusedActivationFunctionType Ac> +struct GemmlowpOutputPipeline { + typedef gemmlowp::VectorMap<const int32, gemmlowp::VectorShape::Col> + ColVectorMap; + typedef std::tuple< + gemmlowp::OutputStageBiasAddition<ColVectorMap>, + gemmlowp::OutputStageQuantizeDownInt32ToUint8ScaleByFixedPoint, + gemmlowp::OutputStageClamp, gemmlowp::OutputStageSaturatingCastToUint8> + Pipeline; + static Pipeline Make(const int32* bias_data, int output_rows, + int32 output_offset, int32 output_multiplier, + int output_shift, int32 output_activation_min, + int32 output_activation_max) { + ColVectorMap bias_vector(bias_data, output_rows); + gemmlowp::OutputStageBiasAddition<ColVectorMap> bias_addition_stage; + bias_addition_stage.bias_vector = bias_vector; + gemmlowp::OutputStageQuantizeDownInt32ToUint8ScaleByFixedPoint + quantize_down_stage; + quantize_down_stage.result_offset_after_shift = output_offset; + quantize_down_stage.result_fixedpoint_multiplier = output_multiplier; + quantize_down_stage.result_shift = output_shift; + gemmlowp::OutputStageClamp clamp_stage; + clamp_stage.min = output_activation_min; + clamp_stage.max = output_activation_max; + gemmlowp::OutputStageSaturatingCastToUint8 saturating_cast_stage; + return std::make_tuple(bias_addition_stage, quantize_down_stage, + clamp_stage, saturating_cast_stage); + } +}; + +template <> +struct GemmlowpOutputPipeline<FusedActivationFunctionType::kNone> { + typedef gemmlowp::VectorMap<const int32, gemmlowp::VectorShape::Col> + ColVectorMap; + typedef std::tuple< + gemmlowp::OutputStageBiasAddition<ColVectorMap>, + gemmlowp::OutputStageQuantizeDownInt32ToUint8ScaleByFixedPoint, + gemmlowp::OutputStageSaturatingCastToUint8> + Pipeline; + static Pipeline Make(const int32* bias_data, int output_rows, + int32 output_offset, int32 output_multiplier, + int output_shift, int32 output_activation_min, + int32 output_activation_max) { + DCHECK_EQ(output_activation_min, 0); + DCHECK_EQ(output_activation_max, 255); + ColVectorMap bias_vector(bias_data, output_rows); + gemmlowp::OutputStageBiasAddition<ColVectorMap> bias_addition_stage; + bias_addition_stage.bias_vector = bias_vector; + gemmlowp::OutputStageQuantizeDownInt32ToUint8ScaleByFixedPoint + quantize_down_stage; + quantize_down_stage.result_offset_after_shift = output_offset; + quantize_down_stage.result_fixedpoint_multiplier = output_multiplier; + quantize_down_stage.result_shift = output_shift; + gemmlowp::OutputStageSaturatingCastToUint8 saturating_cast_stage; + return std::make_tuple(bias_addition_stage, quantize_down_stage, + saturating_cast_stage); + } +}; + +template <FusedActivationFunctionType Ac> +void FullyConnected(const uint8* input_data, const Dims<4>& input_dims, + int32 input_offset, const uint8* filter_data, + const Dims<4>& filter_dims, int32 filter_offset, + const int32* bias_data, const Dims<4>& bias_dims, + int32 output_offset, int32 output_multiplier, + int output_shift, int32 output_activation_min, + int32 output_activation_max, uint8* output_data, + const Dims<4>& output_dims, + gemmlowp::GemmContext* gemm_context) { + gemmlowp::ScopedProfilingLabel label("FullyConnected/8bit"); + static_assert(Ac == FusedActivationFunctionType::kNone || + Ac == FusedActivationFunctionType::kRelu || + Ac == FusedActivationFunctionType::kRelu6 || + Ac == FusedActivationFunctionType::kRelu1, + ""); + // TODO: This really should be: + // const int batches = 
ArraySize(output_dims, 1); + // but the current --variable_batch hack consists in overwriting the 3rd + // dimension with the runtime batch size, as we don't keep track for each + // array of which dimension is the batch dimension in it. + const int batches = ArraySize(output_dims, 1) * ArraySize(output_dims, 2) * + ArraySize(output_dims, 3); +#ifdef USE_NEON + const int output_size = MatchingArraySize(filter_dims, 1, output_dims, 0); + if (batches == 1 && !(output_size % 4)) { + return FullyConnectedAsGEMV<Ac>( + input_data, input_dims, input_offset, filter_data, filter_dims, + filter_offset, bias_data, bias_dims, output_offset, output_multiplier, + output_shift, output_activation_min, output_activation_max, output_data, + output_dims); + } +#endif // USE_NEON + const int filter_rows = filter_dims.sizes[1]; + const int filter_cols = filter_dims.sizes[0]; + DCHECK_EQ(filter_dims.sizes[2], 1); + DCHECK_EQ(filter_dims.sizes[3], 1); + const int output_rows = output_dims.sizes[0]; + DCHECK_EQ(output_rows, filter_rows); + DCHECK_EQ(bias_dims.sizes[0], output_rows); + DCHECK_EQ(bias_dims.sizes[1], 1); + DCHECK_EQ(bias_dims.sizes[2], 1); + DCHECK_EQ(bias_dims.sizes[3], 1); + + gemmlowp::MatrixMap<const uint8, gemmlowp::MapOrder::RowMajor> filter_matrix( + filter_data, output_rows, filter_cols, filter_cols); + gemmlowp::MatrixMap<const uint8, gemmlowp::MapOrder::ColMajor> input_matrix( + input_data, filter_cols, batches, filter_cols); + gemmlowp::MatrixMap<uint8, gemmlowp::MapOrder::ColMajor> output_matrix( + output_data, output_rows, batches, output_rows); + const auto& output_pipeline = GemmlowpOutputPipeline<Ac>::Make( + bias_data, output_rows, output_offset, output_multiplier, output_shift, + output_activation_min, output_activation_max); + gemmlowp::GemmWithOutputPipeline<uint8, uint8, + gemmlowp::L8R8WithLhsNonzeroBitDepthParams>( + gemm_context, filter_matrix, input_matrix, &output_matrix, filter_offset, + input_offset, output_pipeline); +} + +template <typename T> +inline void ExtractPatchIntoBufferColumn( + const Dims<4>& input_dims, int w, int h, int b, int kheight, int kwidth, + int stride_width, int stride_height, int pad_width, int pad_height, + int in_width, int in_height, int in_depth, int single_buffer_length, + int buffer_id, const T* in_data, T* conv_buffer_data, uint8 byte_zero) { + gemmlowp::ScopedProfilingLabel label("ExtractPatchIntoBufferColumn"); + // This chunk of code reshapes all the inputs corresponding to + // output (b, h, w) to a column vector in conv_buffer(:, buffer_id). + const int kwidth_times_indepth = kwidth * in_depth; + const int inwidth_times_indepth = in_width * in_depth; + const int ih_ungated_start = h * stride_height - pad_height; + const int ih_ungated_end = (ih_ungated_start + kheight); + const int ih_end = std::min(ih_ungated_end, in_height); + const int iw_ungated_start = w * stride_width - pad_width; + const int iw_ungated_end = (iw_ungated_start + kwidth); + const int iw_end = std::min(iw_ungated_end, in_width); + // If the patch is off the edge of the input image, skip writing those rows + // and columns from the patch into the output array. 
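+  // For example, with kheight == kwidth == 3, pad_height == 1, stride == 1
+  // and output pixel (h, w) == (0, 0), ih_ungated_start is -1, so h_offset
+  // (the top_padding below) becomes 1 and the first kwidth * in_depth entries
+  // of this column are filled with byte_zero instead of image data; the
+  // w_offset / left_padding handling is analogous in the horizontal
+  // direction.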
+ const int h_offset = std::max(0, -ih_ungated_start); + const int w_offset = std::max(0, -iw_ungated_start); + const int ih_start = std::max(0, ih_ungated_start); + const int iw_start = std::max(0, iw_ungated_start); + const int single_row_num = + std::min(kwidth - w_offset, in_width - iw_start) * in_depth; + const int output_row_offset = (buffer_id * single_buffer_length); + int out_offset = + output_row_offset + (h_offset * kwidth + w_offset) * in_depth; + int in_offset = Offset(input_dims, 0, iw_start, ih_start, b); + + // Express all of the calculations as padding around the input patch. + const int top_padding = h_offset; + const int bottom_padding = (ih_ungated_end - ih_end); + const int left_padding = w_offset; + const int right_padding = (iw_ungated_end - iw_end); + assert(single_row_num == + ((kwidth - (left_padding + right_padding)) * in_depth)); + + // Write out zeroes to the elements representing the top rows of the input + // patch that are off the edge of the input image. + if (top_padding > 0) { + const int top_row_elements = (top_padding * kwidth * in_depth); + memset(conv_buffer_data + output_row_offset, byte_zero, + (top_row_elements * sizeof(T))); + } + + // If the patch is on the interior of the input image horizontally, just copy + // over the rows sequentially, otherwise add zero padding at the start or end. + if ((left_padding == 0) && (right_padding == 0)) { + for (int ih = ih_start; ih < ih_end; ++ih) { + memcpy(conv_buffer_data + out_offset, in_data + in_offset, + single_row_num * sizeof(T)); + out_offset += kwidth_times_indepth; + in_offset += inwidth_times_indepth; + } + } else { + for (int ih = ih_start; ih < ih_end; ++ih) { + if (left_padding > 0) { + const int left_start = (out_offset - (left_padding * in_depth)); + memset(conv_buffer_data + left_start, byte_zero, + (left_padding * in_depth * sizeof(T))); + } + memcpy(conv_buffer_data + out_offset, in_data + in_offset, + single_row_num * sizeof(T)); + if (right_padding > 0) { + const int right_start = (out_offset + single_row_num); + memset(conv_buffer_data + right_start, byte_zero, + (right_padding * in_depth * sizeof(T))); + } + out_offset += kwidth_times_indepth; + in_offset += inwidth_times_indepth; + } + } + + // If the bottom of the patch falls off the input image, pad the values + // representing those input rows with zeroes. + if (bottom_padding > 0) { + const int bottom_row_elements = (bottom_padding * kwidth * in_depth); + const int bottom_start = + output_row_offset + + ((top_padding + (ih_end - ih_start)) * kwidth * in_depth); + memset(conv_buffer_data + bottom_start, byte_zero, + (bottom_row_elements * sizeof(T))); + } +} + +template <typename T> +void Im2col(const T* input_data, const Dims<4>& input_dims, int stride_width, + int stride_height, int pad_width, int pad_height, int kheight, + int kwidth, uint8 byte_zero, T* output_data, + const Dims<4>& output_dims) { + gemmlowp::ScopedProfilingLabel label("Im2col"); + DCHECK(IsPackedWithoutStrides(input_dims)); + DCHECK(IsPackedWithoutStrides(output_dims)); + const int batches = MatchingArraySize(input_dims, 3, output_dims, 3); + const int input_depth = ArraySize(input_dims, 0); + const int input_width = ArraySize(input_dims, 1); + const int input_height = ArraySize(input_dims, 2); + const int output_depth = ArraySize(output_dims, 0); + const int output_width = ArraySize(output_dims, 1); + const int output_height = ArraySize(output_dims, 2); + + int buffer_id = 0; + // Loop over the output nodes. 
+ for (int b = 0; b < batches; ++b) { + for (int h = 0; h < output_height; ++h) { + for (int w = 0; w < output_width; ++w) { + ExtractPatchIntoBufferColumn( + input_dims, w, h, b, kheight, kwidth, stride_width, stride_height, + pad_width, pad_height, input_width, input_height, input_depth, + output_depth, buffer_id, input_data, output_data, byte_zero); + ++buffer_id; + } + } + } +} + +template <FusedActivationFunctionType Ac> +void Conv(const float* input_data, const Dims<4>& input_dims, + const float* filter_data, const Dims<4>& filter_dims, + const float* bias_data, const Dims<4>& bias_dims, int stride_width, + int stride_height, int pad_width, int pad_height, float* output_data, + const Dims<4>& output_dims, float* im2col_data, + const Dims<4>& im2col_dims) { + (void)im2col_data; + (void)im2col_dims; + gemmlowp::ScopedProfilingLabel label("Conv"); + + const float* gemm_input_data = nullptr; + const Dims<4>* gemm_input_dims = nullptr; + const int filter_width = ArraySize(filter_dims, 1); + const int filter_height = ArraySize(filter_dims, 2); + const bool need_im2col = stride_width != 1 || stride_height != 1 || + filter_width != 1 || filter_height != 1; + if (need_im2col) { + DCHECK(im2col_data); + Im2col(input_data, input_dims, stride_width, stride_height, pad_width, + pad_height, filter_height, filter_width, 0, im2col_data, + im2col_dims); + gemm_input_data = im2col_data; + gemm_input_dims = &im2col_dims; + } else { +#if 0 // TODO-NNRT : Check if it needs, 'im2col_data' seems to be always not null. + DCHECK(!im2col_data); +#endif + gemm_input_data = input_data; + gemm_input_dims = &input_dims; + } + + const auto im2col_matrix_map = + MapAsMatrixWithFirstDimAsRows(gemm_input_data, *gemm_input_dims); + const auto filter_matrix_map = + MapAsMatrixWithLastDimAsCols(filter_data, filter_dims); + auto output_matrix_map = + MapAsMatrixWithFirstDimAsRows(output_data, output_dims); + + Gemm(filter_matrix_map.transpose(), im2col_matrix_map, &output_matrix_map); + + AddBiasAndEvalActivationFunction<Ac>(bias_data, bias_dims, output_data, + output_dims); +} + +template <FusedActivationFunctionType Ac> +void Conv(const uint8* input_data, const Dims<4>& input_dims, + int32 input_offset, const uint8* filter_data, + const Dims<4>& filter_dims, int32 filter_offset, + const int32* bias_data, const Dims<4>& bias_dims, int stride_width, + int stride_height, int pad_width, int pad_height, int32 output_offset, + int32 output_multiplier, int output_shift, + int32 output_activation_min, int32 output_activation_max, + uint8* output_data, const Dims<4>& output_dims, uint8* im2col_data, + const Dims<4>& im2col_dims, gemmlowp::GemmContext* gemm_context) { + gemmlowp::ScopedProfilingLabel label("Conv/8bit"); + + DCHECK(IsPackedWithoutStrides(input_dims)); + DCHECK(IsPackedWithoutStrides(filter_dims)); + DCHECK(IsPackedWithoutStrides(output_dims)); + + static_assert(Ac == FusedActivationFunctionType::kNone || + Ac == FusedActivationFunctionType::kRelu || + Ac == FusedActivationFunctionType::kRelu6 || + Ac == FusedActivationFunctionType::kRelu1, + ""); + + const uint8* gemm_input_data = nullptr; + const Dims<4>* gemm_input_dims = nullptr; + const int filter_width = ArraySize(filter_dims, 1); + const int filter_height = ArraySize(filter_dims, 2); + const bool need_im2col = stride_width != 1 || stride_height != 1 || + filter_width != 1 || filter_height != 1; + if (need_im2col) { + DCHECK(im2col_data); + const int input_zero_point = -input_offset; + DCHECK_GE(input_zero_point, 0); + DCHECK_LE(input_zero_point, 255); + 
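The im2col call below pads with input_zero_point, the uint8 value that dequantizes to real 0.0, so the padding behaves like zero padding of the underlying float tensor. A small sketch with hypothetical scale and offset values:

  #include <cassert>
  #include <cstdint>

  int main() {
    const float input_scale = 0.5f;     // hypothetical quantization scale
    const int32_t input_offset = -128;  // as passed to Conv(): minus the zero point
    const int32_t input_zero_point = -input_offset;  // 128, used as byte_zero
    // Dequantization: real = scale * (quantized + offset).
    const float real = input_scale * static_cast<float>(input_zero_point + input_offset);
    assert(real == 0.0f);  // padding byte represents real 0.0
    return 0;
  }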
Im2col(input_data, input_dims, stride_width, stride_height, pad_width, + pad_height, filter_height, filter_width, input_zero_point, + im2col_data, im2col_dims); + gemm_input_data = im2col_data; + gemm_input_dims = &im2col_dims; + } else { +#if 0 // TODO-NNRT : Check if it needs, 'im2col_data' seems to be always not null. + DCHECK(!im2col_data); +#endif + gemm_input_data = input_data; + gemm_input_dims = &input_dims; + } + + const int gemm_input_rows = gemm_input_dims->sizes[0]; + const int gemm_input_cols = gemm_input_dims->sizes[1] * + gemm_input_dims->sizes[2] * + gemm_input_dims->sizes[3]; + const int filter_rows = filter_dims.sizes[3]; + const int filter_cols = + filter_dims.sizes[0] * filter_dims.sizes[1] * filter_dims.sizes[2]; + const int output_rows = output_dims.sizes[0]; + const int output_cols = + output_dims.sizes[1] * output_dims.sizes[2] * output_dims.sizes[3]; + DCHECK_EQ(output_rows, filter_rows); + DCHECK_EQ(output_cols, gemm_input_cols); + DCHECK_EQ(filter_cols, gemm_input_rows); + DCHECK_EQ(bias_dims.sizes[0], output_rows); + DCHECK_EQ(bias_dims.sizes[1], 1); + DCHECK_EQ(bias_dims.sizes[2], 1); + DCHECK_EQ(bias_dims.sizes[3], 1); + gemmlowp::MatrixMap<const uint8, gemmlowp::MapOrder::RowMajor> filter_matrix( + filter_data, filter_rows, filter_cols); + gemmlowp::MatrixMap<const uint8, gemmlowp::MapOrder::ColMajor> input_matrix( + gemm_input_data, gemm_input_rows, gemm_input_cols); + gemmlowp::MatrixMap<uint8, gemmlowp::MapOrder::ColMajor> output_matrix( + output_data, output_rows, output_cols); + const auto& output_pipeline = GemmlowpOutputPipeline<Ac>::Make( + bias_data, output_rows, output_offset, output_multiplier, output_shift, + output_activation_min, output_activation_max); + gemmlowp::GemmWithOutputPipeline<uint8, uint8, + gemmlowp::L8R8WithLhsNonzeroBitDepthParams>( + gemm_context, filter_matrix, input_matrix, &output_matrix, filter_offset, + input_offset, output_pipeline); +} + +template <typename T> +inline void DepthToSpace(const T* input_data, const Dims<4>& input_dims, + int block_size, T* output_data, + const Dims<4>& output_dims) { + gemmlowp::ScopedProfilingLabel label("DepthToSpace"); + + const int input_depth = ArraySize(input_dims, 0); + const int input_width = ArraySize(input_dims, 1); + const int input_height = ArraySize(input_dims, 2); + + const int output_depth = ArraySize(output_dims, 0); + const int batch_size = ArraySize(output_dims, 3); + + // Number of continuous values that we can copy in one interation. 
+ const int stride = block_size * output_depth; + + for (int batch = 0; batch < batch_size; ++batch) { + for (int in_h = 0; in_h < input_height; ++in_h) { + const T* input_ptr = input_data + Offset(input_dims, 0, 0, in_h, batch); + for (int offset_h = 0; offset_h < block_size; ++offset_h) { + const T* src = input_ptr; + for (int in_w = 0; in_w < input_width; ++in_w) { + memcpy(output_data, src, stride * sizeof(T)); + output_data += stride; + src += input_depth; + } + input_ptr += stride; + } + } + } +} + +// legacy, for compatibility with old checked-in code +template <FusedActivationFunctionType Ac, typename T> +void Im2col(const T* input_data, const Dims<4>& input_dims, int stride, + int pad_width, int pad_height, int kheight, int kwidth, + uint8 byte_zero, T* output_data, const Dims<4>& output_dims) { + Im2col(input_data, input_dims, stride, stride, pad_width, pad_height, kheight, + kwidth, byte_zero, output_data, output_dims); +} + +// legacy, for compatibility with old checked-in code +template <FusedActivationFunctionType Ac> +void ConvAsGemm(const float* input_data, const Dims<4>& input_dims, + const float* filter_data, const Dims<4>& filter_dims, + const float* bias_data, const Dims<4>& bias_dims, + float* output_data, const Dims<4>& output_dims) { + gemmlowp::ScopedProfilingLabel label("ConvAsGemm"); + + const auto input_matrix_map = + MapAsMatrixWithFirstDimAsRows(input_data, input_dims); + const auto filter_matrix_map = + MapAsMatrixWithLastDimAsCols(filter_data, filter_dims); + auto output_matrix_map = + MapAsMatrixWithFirstDimAsRows(output_data, output_dims); + + Gemm(filter_matrix_map.transpose(), input_matrix_map, &output_matrix_map); + + AddBiasAndEvalActivationFunction<Ac>(bias_data, bias_dims, output_data, + output_dims); +} + +// legacy, for compatibility with old checked-in code +template <FusedActivationFunctionType Ac> +void ConvAsGemm(const uint8* input_data, const Dims<4>& input_dims, + int32 input_offset, const uint8* filter_data, + const Dims<4>& filter_dims, int32 filter_offset, + const int32* bias_data, const Dims<4>& bias_dims, + int32 output_offset, int32 output_multiplier, int output_shift, + int32 output_activation_min, int32 output_activation_max, + uint8* output_data, const Dims<4>& output_dims, + gemmlowp::GemmContext* gemm_context) { + gemmlowp::ScopedProfilingLabel label("ConvAsGemm/8bit"); + static_assert(Ac == FusedActivationFunctionType::kNone || + Ac == FusedActivationFunctionType::kRelu || + Ac == FusedActivationFunctionType::kRelu6 || + Ac == FusedActivationFunctionType::kRelu1, + ""); + const int input_rows = input_dims.sizes[0]; + const int input_cols = + input_dims.sizes[1] * input_dims.sizes[2] * input_dims.sizes[3]; + const int filter_rows = filter_dims.sizes[3]; + const int filter_cols = + filter_dims.sizes[0] * filter_dims.sizes[1] * filter_dims.sizes[2]; + const int output_rows = output_dims.sizes[0]; + const int output_cols = + output_dims.sizes[1] * output_dims.sizes[2] * output_dims.sizes[3]; + DCHECK_EQ(output_rows, filter_rows); + DCHECK_EQ(output_cols, input_cols); + DCHECK_EQ(filter_cols, input_rows); + DCHECK_EQ(bias_dims.sizes[0], output_rows); + DCHECK_EQ(bias_dims.sizes[1], 1); + DCHECK_EQ(bias_dims.sizes[2], 1); + DCHECK_EQ(bias_dims.sizes[3], 1); + gemmlowp::MatrixMap<const uint8, gemmlowp::MapOrder::RowMajor> filter_matrix( + filter_data, output_rows, filter_cols, filter_cols); + gemmlowp::MatrixMap<const uint8, gemmlowp::MapOrder::ColMajor> input_matrix( + input_data, filter_cols, output_cols, filter_cols); + 
gemmlowp::MatrixMap<uint8, gemmlowp::MapOrder::ColMajor> output_matrix( + output_data, output_rows, output_cols, output_rows); + const auto& output_pipeline = GemmlowpOutputPipeline<Ac>::Make( + bias_data, output_rows, output_offset, output_multiplier, output_shift, + output_activation_min, output_activation_max); + gemmlowp::GemmWithOutputPipeline<uint8, uint8, + gemmlowp::L8R8WithLhsNonzeroBitDepthParams>( + gemm_context, filter_matrix, input_matrix, &output_matrix, filter_offset, + input_offset, output_pipeline); +} + +template <typename T> +inline void SpaceToDepth(const T* input_data, const Dims<4>& input_dims, + int block_size, T* output_data, + const Dims<4>& output_dims) { + gemmlowp::ScopedProfilingLabel label("SpaceToDepth"); + + const int output_depth = ArraySize(output_dims, 0); + const int output_width = ArraySize(output_dims, 1); + const int output_height = ArraySize(output_dims, 2); + + const int input_depth = ArraySize(input_dims, 0); + const int batch_size = ArraySize(input_dims, 3); + + // Number of continuous values that we can copy in one interation. + const int stride = block_size * input_depth; + + for (int batch = 0; batch < batch_size; ++batch) { + for (int out_h = 0; out_h < output_height; ++out_h) { + T* output_ptr = output_data + Offset(output_dims, 0, 0, out_h, batch); + for (int offset_h = 0; offset_h < block_size; ++offset_h) { + T* dst = output_ptr; + for (int out_w = 0; out_w < output_width; ++out_w) { + memcpy(dst, input_data, stride * sizeof(T)); + input_data += stride; + dst += output_depth; + } + output_ptr += stride; + } + } + } +} + +template <FusedActivationFunctionType Ac> +void NonGlobalBatchNormalization( + const float* input_data, const Dims<4>& input_dims, const float* mean_data, + const Dims<4>& mean_dims, const float* multiplier_data, + const Dims<4>& multiplier_dims, const float* offset_data, + const Dims<4>& offset_dims, float* output_data, + const Dims<4>& output_dims) { + gemmlowp::ScopedProfilingLabel label("NonGlobalBatchNormalization"); + const int batches = MatchingArraySize(input_dims, 3, output_dims, 3); + const int height = + MatchingArraySize(input_dims, 2, mean_dims, 2, multiplier_dims, 2, + offset_dims, 2, output_dims, 2); + const int width = + MatchingArraySize(input_dims, 1, mean_dims, 1, multiplier_dims, 1, + offset_dims, 1, output_dims, 1); + const int depth = + MatchingArraySize(input_dims, 0, mean_dims, 0, multiplier_dims, 0, + offset_dims, 0, output_dims, 0); + + for (int b = 0; b < batches; ++b) { + for (int y = 0; y < height; ++y) { + for (int x = 0; x < width; ++x) { + for (int c = 0; c < depth; ++c) { + output_data[Offset(output_dims, c, x, y, b)] = ActivationFunction<Ac>( + (input_data[Offset(input_dims, c, x, y, b)] - + mean_data[Offset(mean_dims, c, x, y, 0)]) * + multiplier_data[Offset(multiplier_dims, c, x, y, 0)] + + offset_data[Offset(offset_dims, c, x, y, 0)]); + } + } + } + } +} + +template <FusedActivationFunctionType Ac> +void GlobalBatchNormalization(const float* input_data, + const Dims<4>& input_dims, const float* mean_data, + const Dims<4>& mean_dims, + const float* multiplier_data, + const Dims<4>& multiplier_dims, + const float* offset_data, + const Dims<4>& offset_dims, float* output_data, + const Dims<4>& output_dims) { + gemmlowp::ScopedProfilingLabel label("GlobalBatchNormalization"); + const int batches = MatchingArraySize(input_dims, 3, output_dims, 3); + const int height = MatchingArraySize(input_dims, 2, output_dims, 2); + const int width = MatchingArraySize(input_dims, 1, output_dims, 1); + 
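Both batch-normalization kernels compute (input - mean) * multiplier + offset per element. A scalar sketch, assuming the usual folding of gamma and the variance into the multiplier (an assumption, not stated in this code), with hypothetical per-channel values:

  #include <cassert>
  #include <cmath>

  int main() {
    const float mean = 0.25f, variance = 4.0f, gamma = 1.5f, beta = 0.1f, eps = 1e-5f;
    // Assumed folding: multiplier = gamma / sqrt(variance + eps), offset = beta.
    const float multiplier = gamma / std::sqrt(variance + eps);
    const float offset = beta;
    const float input = 2.0f;
    // What the innermost loop computes for one element, before any fused activation.
    const float output = (input - mean) * multiplier + offset;
    assert(output > 1.3f && output < 1.5f);  // roughly 1.41 with these values
    return 0;
  }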
const int depth = + MatchingArraySize(input_dims, 0, mean_dims, 0, multiplier_dims, 0, + offset_dims, 0, output_dims, 0); + + for (int b = 0; b < batches; ++b) { + for (int y = 0; y < height; ++y) { + for (int x = 0; x < width; ++x) { + for (int c = 0; c < depth; ++c) { + output_data[Offset(output_dims, c, x, y, b)] = ActivationFunction<Ac>( + (input_data[Offset(input_dims, c, x, y, b)] - + mean_data[Offset(mean_dims, c, 0, 0, 0)]) * + multiplier_data[Offset(multiplier_dims, c, 0, 0, 0)] + + offset_data[Offset(offset_dims, c, 0, 0, 0)]); + } + } + } + } +} + +inline void Relu(const float* input_data, const Dims<4>& input_dims, + float* output_data, const Dims<4>& output_dims) { + gemmlowp::ScopedProfilingLabel label("Relu (not fused)"); + const int batches = MatchingArraySize(input_dims, 3, output_dims, 3); + const int height = MatchingArraySize(input_dims, 2, output_dims, 2); + const int width = MatchingArraySize(input_dims, 1, output_dims, 1); + const int depth = MatchingArraySize(input_dims, 0, output_dims, 0); + for (int b = 0; b < batches; ++b) { + for (int y = 0; y < height; ++y) { + for (int x = 0; x < width; ++x) { + for (int c = 0; c < depth; ++c) { + float val = input_data[Offset(input_dims, c, x, y, b)]; + const float lower = 0; + float clamped = val < lower ? lower : val; + output_data[Offset(output_dims, c, x, y, b)] = clamped; + } + } + } + } +} + +inline void Relu1(const float* input_data, const Dims<4>& input_dims, + float* output_data, const Dims<4>& output_dims) { + gemmlowp::ScopedProfilingLabel label("Relu1 (not fused)"); + const int batches = MatchingArraySize(input_dims, 3, output_dims, 3); + const int height = MatchingArraySize(input_dims, 2, output_dims, 2); + const int width = MatchingArraySize(input_dims, 1, output_dims, 1); + const int depth = MatchingArraySize(input_dims, 0, output_dims, 0); + for (int b = 0; b < batches; ++b) { + for (int y = 0; y < height; ++y) { + for (int x = 0; x < width; ++x) { + for (int c = 0; c < depth; ++c) { + float val = input_data[Offset(input_dims, c, x, y, b)]; + const float upper = 1; + const float lower = -1; + float clamped = val > upper ? upper : val < lower ? lower : val; + output_data[Offset(output_dims, c, x, y, b)] = clamped; + } + } + } + } +} + +inline void Relu6(const float* input_data, const Dims<4>& input_dims, + float* output_data, const Dims<4>& output_dims) { + gemmlowp::ScopedProfilingLabel label("Relu6 (not fused)"); + const int batches = MatchingArraySize(input_dims, 3, output_dims, 3); + const int height = MatchingArraySize(input_dims, 2, output_dims, 2); + const int width = MatchingArraySize(input_dims, 1, output_dims, 1); + const int depth = MatchingArraySize(input_dims, 0, output_dims, 0); + for (int b = 0; b < batches; ++b) { + for (int y = 0; y < height; ++y) { + for (int x = 0; x < width; ++x) { + for (int c = 0; c < depth; ++c) { + float val = input_data[Offset(input_dims, c, x, y, b)]; + const float upper = 6; + const float lower = 0; + float clamped = val > upper ? upper : val < lower ? 
lower : val; + output_data[Offset(output_dims, c, x, y, b)] = clamped; + } + } + } + } +} + +template <FusedActivationFunctionType Ac> +void L2Normalization(const float* input_data, const Dims<4>& input_dims, + float* output_data, const Dims<4>& output_dims) { + gemmlowp::ScopedProfilingLabel label("L2Normalization"); + static_assert(Ac == FusedActivationFunctionType::kNone, ""); + const int batches = MatchingArraySize(input_dims, 3, output_dims, 3); + const int height = MatchingArraySize(input_dims, 2, output_dims, 2); + const int width = MatchingArraySize(input_dims, 1, output_dims, 1); + const int depth = MatchingArraySize(input_dims, 0, output_dims, 0); + for (int b = 0; b < batches; ++b) { + for (int y = 0; y < height; ++y) { + for (int x = 0; x < width; ++x) { + float squared_l2_norm = 0; + for (int c = 0; c < depth; ++c) { + float val = input_data[Offset(input_dims, c, x, y, b)]; + squared_l2_norm += val * val; + } + float inverse_l2_norm = 1.0f / std::sqrt(squared_l2_norm); + for (int c = 0; c < depth; ++c) { + output_data[Offset(output_dims, c, x, y, b)] = + input_data[Offset(input_dims, c, x, y, b)] * inverse_l2_norm; + } + } + } + } +} + +inline void GetInvSqrtQuantizedMultiplier(int32 input, int32* output_inv_sqrt, + int* output_shift) { + *output_shift = 11; + while (input >= (1 << 29)) { + input /= 4; + ++*output_shift; + } + DCHECK_GT(input, 0); + const unsigned max_left_shift_bits = __builtin_clz(input) - 1; + const unsigned max_left_shift_bit_pairs = max_left_shift_bits / 2; + const unsigned left_shift_bit_pairs = max_left_shift_bit_pairs - 1; + *output_shift -= left_shift_bit_pairs; + input <<= 2 * left_shift_bit_pairs; + DCHECK_GE(input, (1 << 27)); + DCHECK_LT(input, (1 << 29)); + using gemmlowp::FixedPoint; + using gemmlowp::Rescale; + using gemmlowp::SaturatingRoundingMultiplyByPOT; + // Using 3 integer bits gives us enough room for the internal arithmetic in + // this Newton-Raphson iteration. + using F3 = FixedPoint<int32, 3>; + using F0 = FixedPoint<int32, 0>; + const F3 fixedpoint_input = F3::FromRaw(input >> 1); + const F3 fixedpoint_half_input = + SaturatingRoundingMultiplyByPOT<-1>(fixedpoint_input); + const F3 fixedpoint_half_three = + GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT(F3, (1 << 28) + (1 << 27), 1.5); + // Newton-Raphson iteration + // Naive unoptimized starting guess: x = 1 + F3 x = F3::One(); + // Naive unoptimized number of iterations: 5 + for (int i = 0; i < 5; i++) { + const F3 x3 = Rescale<3>(x * x * x); + x = Rescale<3>(fixedpoint_half_three * x - fixedpoint_half_input * x3); + } + const F0 fixedpoint_half_sqrt_2 = + GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT(F0, 1518500250, std::sqrt(2.) 
/ 2.); + x = x * fixedpoint_half_sqrt_2; + *output_inv_sqrt = x.raw(); + if (*output_shift < 0) { + *output_inv_sqrt <<= -*output_shift; + *output_shift = 0; + } +} + +inline void L2Normalization(const uint8* input_data, const Dims<4>& input_dims, + int32 input_zero_point, uint8* output_data, + const Dims<4>& output_dims) { + gemmlowp::ScopedProfilingLabel label("L2Normalization/8bit"); + const int batches = MatchingArraySize(input_dims, 3, output_dims, 3); + const int height = MatchingArraySize(input_dims, 2, output_dims, 2); + const int width = MatchingArraySize(input_dims, 1, output_dims, 1); + const int depth = MatchingArraySize(input_dims, 0, output_dims, 0); + DCHECK(IsPackedWithoutStrides(input_dims)); + DCHECK(IsPackedWithoutStrides(output_dims)); + DCHECK_EQ(batches, 1); + DCHECK_EQ(height, 1); + DCHECK_EQ(width, 1); + int32 square_l2_norm = 0; + for (int i = 0; i < depth; i++) { + int32 diff = input_data[i] - input_zero_point; + square_l2_norm += diff * diff; + } + int32 inv_l2norm_multiplier; + int inv_l2norm_shift; + GetInvSqrtQuantizedMultiplier(square_l2_norm, &inv_l2norm_multiplier, + &inv_l2norm_shift); + + for (int i = 0; i < depth; i++) { + int32 diff = input_data[i] - input_zero_point; + int32 rescaled_diff = MultiplyByQuantizedMultiplierSmallerThanOne( + 128 * diff, inv_l2norm_multiplier, inv_l2norm_shift); + int32 unclamped_output_val = 128 + rescaled_diff; + int32 output_val = std::min(255, std::max(0, unclamped_output_val)); + output_data[i] = static_cast<uint8>(output_val); + } +} + +template <FusedActivationFunctionType Ac> +void Add(const float* input1_data, const Dims<4>& input1_dims, + const float* input2_data, const Dims<4>& input2_dims, + float* output_data, const Dims<4>& output_dims) { + gemmlowp::ScopedProfilingLabel label("Add"); + /* const int batches = */ MatchingArraySize(input1_dims, 3, input2_dims, 3, + output_dims, 3); + /* const int height = */ MatchingArraySize(input1_dims, 2, input2_dims, 2, + output_dims, 2); + /* const int width = */ MatchingArraySize(input1_dims, 1, input2_dims, 1, + output_dims, 1); + /* const int depth = */ MatchingArraySize(input1_dims, 0, input2_dims, 0, + output_dims, 0); + DCHECK(IsPackedWithoutStrides(input1_dims)); + DCHECK(IsPackedWithoutStrides(input2_dims)); + DCHECK(IsPackedWithoutStrides(output_dims)); + + int i = 0; + const int size = input1_dims.sizes[3] * input1_dims.strides[3]; +#ifdef USE_NEON + const auto zero = vdupq_n_f32(0); + const auto six = vdupq_n_f32(6); + const auto neg_one = vdupq_n_f32(-1); + const auto one = vdupq_n_f32(1); + for (; i <= size - 16; i += 16) { + auto a10 = vld1q_f32(input1_data + i); + auto a11 = vld1q_f32(input1_data + i + 4); + auto a12 = vld1q_f32(input1_data + i + 8); + auto a13 = vld1q_f32(input1_data + i + 12); + auto a20 = vld1q_f32(input2_data + i); + auto a21 = vld1q_f32(input2_data + i + 4); + auto a22 = vld1q_f32(input2_data + i + 8); + auto a23 = vld1q_f32(input2_data + i + 12); + auto x0 = vaddq_f32(a10, a20); + auto x1 = vaddq_f32(a11, a21); + auto x2 = vaddq_f32(a12, a22); + auto x3 = vaddq_f32(a13, a23); + if (Ac == FusedActivationFunctionType::kRelu || + Ac == FusedActivationFunctionType::kRelu6) { + x0 = vmaxq_f32(zero, x0); + x1 = vmaxq_f32(zero, x1); + x2 = vmaxq_f32(zero, x2); + x3 = vmaxq_f32(zero, x3); + if (Ac == FusedActivationFunctionType::kRelu6) { + x0 = vminq_f32(six, x0); + x1 = vminq_f32(six, x1); + x2 = vminq_f32(six, x2); + x3 = vminq_f32(six, x3); + } + } else if (Ac == FusedActivationFunctionType::kRelu1) { + x0 = vmaxq_f32(neg_one, x0); + x1 = 
vmaxq_f32(neg_one, x1); + x2 = vmaxq_f32(neg_one, x2); + x3 = vmaxq_f32(neg_one, x3); + x0 = vminq_f32(one, x0); + x1 = vminq_f32(one, x1); + x2 = vminq_f32(one, x2); + x3 = vminq_f32(one, x3); + } + vst1q_f32(output_data + i, x0); + vst1q_f32(output_data + i + 4, x1); + vst1q_f32(output_data + i + 8, x2); + vst1q_f32(output_data + i + 12, x3); + } + for (; i <= size - 4; i += 4) { + auto a1 = vld1q_f32(input1_data + i); + auto a2 = vld1q_f32(input2_data + i); + auto x = vaddq_f32(a1, a2); + if (Ac == FusedActivationFunctionType::kRelu || + Ac == FusedActivationFunctionType::kRelu6) { + x = vmaxq_f32(zero, x); + if (Ac == FusedActivationFunctionType::kRelu6) { + x = vminq_f32(six, x); + } + } else if (Ac == FusedActivationFunctionType::kRelu1) { + x = vmaxq_f32(neg_one, x); + x = vminq_f32(one, x); + } + vst1q_f32(output_data + i, x); + } +#endif // NEON + + for (; i < size; i++) { + auto x = input1_data[i] + input2_data[i]; + output_data[i] = ActivationFunction<Ac>(x); + } +} + +template <FusedActivationFunctionType Ac> +inline void Add(int left_shift, const uint8* input1_data, + const Dims<4>& input1_dims, int32 input1_offset, + int32 input1_multiplier, int input1_shift, + const uint8* input2_data, const Dims<4>& input2_dims, + int32 input2_offset, int32 input2_multiplier, int input2_shift, + int32 output_offset, int32 output_multiplier, int output_shift, + int32 output_activation_min, int32 output_activation_max, + uint8* output_data, const Dims<4>& output_dims) { + static_assert(Ac == FusedActivationFunctionType::kNone || + Ac == FusedActivationFunctionType::kRelu || + Ac == FusedActivationFunctionType::kRelu6 || + Ac == FusedActivationFunctionType::kRelu1, + ""); + DCHECK_LE(output_activation_min, output_activation_max); + if (Ac == FusedActivationFunctionType::kNone) { + DCHECK_EQ(output_activation_min, 0); + DCHECK_EQ(output_activation_max, 255); + } + gemmlowp::ScopedProfilingLabel label("Add/8bit"); + /* const int batches = */ MatchingArraySize(input1_dims, 3, input2_dims, 3, + output_dims, 3); + /* const int height = */ MatchingArraySize(input1_dims, 2, input2_dims, 2, + output_dims, 2); + /* const int width = */ MatchingArraySize(input1_dims, 1, input2_dims, 1, + output_dims, 1); + /* const int depth = */ MatchingArraySize(input1_dims, 0, input2_dims, 0, + output_dims, 0); + DCHECK(IsPackedWithoutStrides(input1_dims)); + DCHECK(IsPackedWithoutStrides(input2_dims)); + DCHECK(IsPackedWithoutStrides(output_dims)); + + int i = 0; + const int size = input1_dims.sizes[3] * input1_dims.strides[3]; + DCHECK_GT(input1_offset, -256); + DCHECK_GT(input2_offset, -256); + DCHECK_LT(input1_offset, 256); + DCHECK_LT(input2_offset, 256); +#ifdef USE_NEON + for (; i <= size - 8; i += 8) { + const auto input1_val_original = vld1_u8(input1_data + i); + const auto input2_val_original = vld1_u8(input2_data + i); + const auto input1_val_s16 = + vreinterpretq_s16_u16(vmovl_u8(input1_val_original)); + const auto input2_val_s16 = + vreinterpretq_s16_u16(vmovl_u8(input2_val_original)); + const auto input1_val = + vaddq_s16(input1_val_s16, vdupq_n_s16(input1_offset)); + const auto input2_val = + vaddq_s16(input2_val_s16, vdupq_n_s16(input2_offset)); + const auto input1_val_high = vget_high_s16(input1_val); + const auto input1_val_low = vget_low_s16(input1_val); + const auto input2_val_high = vget_high_s16(input2_val); + const auto input2_val_low = vget_low_s16(input2_val); + auto x11 = vmovl_s16(input1_val_low); + auto x12 = vmovl_s16(input1_val_high); + auto x21 = vmovl_s16(input2_val_low); + auto x22 = 
vmovl_s16(input2_val_high); + const auto left_shift_dup = vdupq_n_s32(left_shift); + x11 = vshlq_s32(x11, left_shift_dup); + x12 = vshlq_s32(x12, left_shift_dup); + x21 = vshlq_s32(x21, left_shift_dup); + x22 = vshlq_s32(x22, left_shift_dup); + x11 = vqrdmulhq_n_s32(x11, input1_multiplier); + x12 = vqrdmulhq_n_s32(x12, input1_multiplier); + x21 = vqrdmulhq_n_s32(x21, input2_multiplier); + x22 = vqrdmulhq_n_s32(x22, input2_multiplier); + const auto input1_shift_dup = vdupq_n_s32(-input1_shift); + const auto input2_shift_dup = vdupq_n_s32(-input2_shift); + x11 = vshlq_s32(x11, input1_shift_dup); + x12 = vshlq_s32(x12, input1_shift_dup); + x21 = vshlq_s32(x21, input2_shift_dup); + x22 = vshlq_s32(x22, input2_shift_dup); + auto s1 = vaddq_s32(x11, x21); + auto s2 = vaddq_s32(x12, x22); + s1 = vqrdmulhq_n_s32(s1, output_multiplier); + s2 = vqrdmulhq_n_s32(s2, output_multiplier); + using gemmlowp::RoundingDivideByPOT; + s1 = RoundingDivideByPOT(s1, output_shift); + s2 = RoundingDivideByPOT(s2, output_shift); + const auto s1_narrowed = vmovn_s32(s1); + const auto s2_narrowed = vmovn_s32(s2); + const auto s = vaddq_s16(vcombine_s16(s1_narrowed, s2_narrowed), + vdupq_n_s16(output_offset)); + vst1_u8(output_data + i, vqmovun_s16(s)); + } +#endif // NEON + + for (; i < size; i++) { + const int32 input1_val = input1_offset + input1_data[i]; + const int32 input2_val = input2_offset + input2_data[i]; + const int32 shifted_input1_val = input1_val * (1 << left_shift); + const int32 shifted_input2_val = input2_val * (1 << left_shift); + const int32 scaled_input1_val = MultiplyByQuantizedMultiplierSmallerThanOne( + shifted_input1_val, input1_multiplier, input1_shift); + const int32 scaled_input2_val = MultiplyByQuantizedMultiplierSmallerThanOne( + shifted_input2_val, input2_multiplier, input2_shift); + const int32 raw_sum = scaled_input1_val + scaled_input2_val; + const int32 raw_output = MultiplyByQuantizedMultiplierSmallerThanOne( + raw_sum, output_multiplier, output_shift) + + output_offset; + const int32 clamped_output = std::min( + output_activation_max, std::max(output_activation_min, raw_output)); + output_data[i] = static_cast<uint8>(clamped_output); + } +} + + +// TODO: We can implement BroadcastAdd on buffers of arbitrary +// dimensionality if the runtime code does a single loop over one dimension +// that handles broadcasting as the base case. The code generator would then +// generate max(D1, D2) nested for loops. +// TODO: BroadcastAdd is intentionally duplicated from +// reference_ops.h. Once an optimized version is implemented and NdArrayDesc<T> +// is no longer referenced in this file, move NdArrayDesc<T> from types.h to +// reference_ops.h. +template <FusedActivationFunctionType Ac> +void BroadcastAdd(const float* input1_data, const Dims<4>& input1_dims, + const float* input2_data, const Dims<4>& input2_dims, + float* output_data, const Dims<4>& output_dims) { + gemmlowp::ScopedProfilingLabel label("BroadcastAdd"); + + NdArrayDesc<4> desc1; + NdArrayDesc<4> desc2; + NdArrayDescsForElementwiseBroadcast(input1_dims, input2_dims, &desc1, &desc2); + + // In Tensorflow, the dimensions are canonically named (batch_number, row, + // col, channel), with extents (batches, height, width, depth), with the + // trailing dimension changing most rapidly (channels has the smallest stride, + // typically 1 element). + // + // In generated C code, we store arrays with the dimensions reversed. The + // first dimension has smallest stride. 
+ // + // We name our variables by their Tensorflow convention, but generate C code + // nesting loops such that the innermost loop has the smallest stride for the + // best cache behavior. + for (int b = 0; b < ArraySize(output_dims, 3); ++b) { + for (int y = 0; y < ArraySize(output_dims, 2); ++y) { + for (int x = 0; x < ArraySize(output_dims, 1); ++x) { + for (int c = 0; c < ArraySize(output_dims, 0); ++c) { + output_data[Offset(output_dims, c, x, y, b)] = ActivationFunction<Ac>( + input1_data[SubscriptToIndex(desc1, c, x, y, b)] + + input2_data[SubscriptToIndex(desc2, c, x, y, b)]); + } + } + } + } +} + +template <FusedActivationFunctionType Ac> +inline void BroadcastAdd(int left_shift, const uint8* input1_data, + const Dims<4>& input1_dims, int32 input1_offset, + int32 input1_multiplier, int input1_shift, + const uint8* input2_data, const Dims<4>& input2_dims, + int32 input2_offset, int32 input2_multiplier, + int input2_shift, int32 output_offset, + int32 output_multiplier, int output_shift, + int32 output_activation_min, + int32 output_activation_max, uint8* output_data, + const Dims<4>& output_dims) { + static_assert(Ac == FusedActivationFunctionType::kNone || + Ac == FusedActivationFunctionType::kRelu || + Ac == FusedActivationFunctionType::kRelu6 || + Ac == FusedActivationFunctionType::kRelu1, + ""); + DCHECK_LE(output_activation_min, output_activation_max); + if (Ac == FusedActivationFunctionType::kNone) { + DCHECK_EQ(output_activation_min, 0); + DCHECK_EQ(output_activation_max, 255); + } + gemmlowp::ScopedProfilingLabel label("BroadcastAdd/8bit"); + + NdArrayDesc<4> desc1; + NdArrayDesc<4> desc2; + NdArrayDescsForElementwiseBroadcast(input1_dims, input2_dims, &desc1, &desc2); + + // In Tensorflow, the dimensions are canonically named (batch_number, row, + // col, channel), with extents (batches, height, width, depth), with the + // trailing dimension changing most rapidly (channels has the smallest stride, + // typically 1 element). + // + // In generated C code, we store arrays with the dimensions reversed. The + // first dimension has smallest stride. + // + // We name our variables by their Tensorflow convention, but generate C code + // nesting loops such that the innermost loop has the smallest stride for the + // best cache behavior. 
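One way to picture the broadcasting that NdArrayDescsForElementwiseBroadcast and SubscriptToIndex implement in the loops below is a stride of zero along the broadcast dimension. A minimal sketch of that idea, with hypothetical shapes:

  int main() {
    const int depth = 4;
    const float input1[4] = {1.0f, 2.0f, 3.0f, 4.0f};  // shape [4]
    const float input2[1] = {10.0f};                   // shape [1], broadcast over depth
    const int stride2 = 0;                             // broadcasting as a zero stride
    float output[4];
    for (int c = 0; c < depth; ++c) {
      output[c] = input1[c] + input2[c * stride2];
    }
    return output[3] == 14.0f ? 0 : 1;
  }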
+ for (int b = 0; b < ArraySize(output_dims, 3); ++b) { + for (int y = 0; y < ArraySize(output_dims, 2); ++y) { + for (int x = 0; x < ArraySize(output_dims, 1); ++x) { + for (int c = 0; c < ArraySize(output_dims, 0); ++c) { + const int32 input1_val = + input1_offset + input1_data[SubscriptToIndex(desc1, c, x, y, b)]; + const int32 input2_val = + input2_offset + input2_data[SubscriptToIndex(desc2, c, x, y, b)]; + const int32 shifted_input1_val = input1_val * (1 << left_shift); + const int32 shifted_input2_val = input2_val * (1 << left_shift); + const int32 scaled_input1_val = + MultiplyByQuantizedMultiplierSmallerThanOne( + shifted_input1_val, input1_multiplier, input1_shift); + const int32 scaled_input2_val = + MultiplyByQuantizedMultiplierSmallerThanOne( + shifted_input2_val, input2_multiplier, input2_shift); + const int32 raw_sum = scaled_input1_val + scaled_input2_val; + const int32 raw_output = + MultiplyByQuantizedMultiplierSmallerThanOne( + raw_sum, output_multiplier, output_shift) + + output_offset; + const int32 clamped_output = + std::min(output_activation_max, + std::max(output_activation_min, raw_output)); + output_data[Offset(output_dims, c, x, y, b)] = + static_cast<uint8>(clamped_output); + } + } + } + } +} + +template <FusedActivationFunctionType Ac> +void Mul(const float* input1_data, const Dims<4>& input1_dims, + const float* input2_data, const Dims<4>& input2_dims, + float* output_data, const Dims<4>& output_dims) { + gemmlowp::ScopedProfilingLabel label("Mul"); + /* const int batches = */ MatchingArraySize(input1_dims, 3, input2_dims, 3, + output_dims, 3); + /* const int height = */ MatchingArraySize(input1_dims, 2, input2_dims, 2, + output_dims, 2); + /* const int width = */ MatchingArraySize(input1_dims, 1, input2_dims, 1, + output_dims, 1); + /* const int depth = */ MatchingArraySize(input1_dims, 0, input2_dims, 0, + output_dims, 0); + DCHECK(IsPackedWithoutStrides(input1_dims)); + DCHECK(IsPackedWithoutStrides(input2_dims)); + DCHECK(IsPackedWithoutStrides(output_dims)); + + int i = 0; + const int size = input1_dims.sizes[3] * input1_dims.strides[3]; +#ifdef USE_NEON + const auto zero = vdupq_n_f32(0); + const auto six = vdupq_n_f32(6); + const auto neg_one = vdupq_n_f32(-1); + const auto one = vdupq_n_f32(1); + for (; i <= size - 16; i += 16) { + auto a10 = vld1q_f32(input1_data + i); + auto a11 = vld1q_f32(input1_data + i + 4); + auto a12 = vld1q_f32(input1_data + i + 8); + auto a13 = vld1q_f32(input1_data + i + 12); + auto a20 = vld1q_f32(input2_data + i); + auto a21 = vld1q_f32(input2_data + i + 4); + auto a22 = vld1q_f32(input2_data + i + 8); + auto a23 = vld1q_f32(input2_data + i + 12); + auto x0 = vmulq_f32(a10, a20); + auto x1 = vmulq_f32(a11, a21); + auto x2 = vmulq_f32(a12, a22); + auto x3 = vmulq_f32(a13, a23); + if (Ac == FusedActivationFunctionType::kRelu || + Ac == FusedActivationFunctionType::kRelu6) { + x0 = vmaxq_f32(zero, x0); + x1 = vmaxq_f32(zero, x1); + x2 = vmaxq_f32(zero, x2); + x3 = vmaxq_f32(zero, x3); + if (Ac == FusedActivationFunctionType::kRelu6) { + x0 = vminq_f32(six, x0); + x1 = vminq_f32(six, x1); + x2 = vminq_f32(six, x2); + x3 = vminq_f32(six, x3); + } + } else if (Ac == FusedActivationFunctionType::kRelu1) { + x0 = vmaxq_f32(neg_one, x0); + x1 = vmaxq_f32(neg_one, x1); + x2 = vmaxq_f32(neg_one, x2); + x3 = vmaxq_f32(neg_one, x3); + x0 = vminq_f32(one, x0); + x1 = vminq_f32(one, x1); + x2 = vminq_f32(one, x2); + x3 = vminq_f32(one, x3); + } + vst1q_f32(output_data + i, x0); + vst1q_f32(output_data + i + 4, x1); + 
vst1q_f32(output_data + i + 8, x2); + vst1q_f32(output_data + i + 12, x3); + } + for (; i <= size - 4; i += 4) { + auto a1 = vld1q_f32(input1_data + i); + auto a2 = vld1q_f32(input2_data + i); + auto x = vmulq_f32(a1, a2); + if (Ac == FusedActivationFunctionType::kRelu || + Ac == FusedActivationFunctionType::kRelu6) { + x = vmaxq_f32(zero, x); + if (Ac == FusedActivationFunctionType::kRelu6) { + x = vminq_f32(six, x); + } + } else if (Ac == FusedActivationFunctionType::kRelu1) { + x = vmaxq_f32(neg_one, x); + x = vminq_f32(one, x); + } + vst1q_f32(output_data + i, x); + } +#endif // NEON + + for (; i < size; i++) { + auto x = input1_data[i] * input2_data[i]; + output_data[i] = ActivationFunction<Ac>(x); + } +} + +// TODO: We can implement BroadcastMul on buffers of arbitrary +// dimensionality if the runtime code does a single loop over one dimension +// that handles broadcasting as the base case. The code generator would then +// generate max(D1, D2) nested for loops. +// TODO: BroadcastMul is intentionally duplicated from +// reference_ops.h. Once an optimized version is implemented and NdArrayDesc<T> +// is no longer referenced in this file, move NdArrayDesc<T> from types.h to +// reference_ops.h. +template <FusedActivationFunctionType Ac> +void BroadcastMul(const float* input1_data, const Dims<4>& input1_dims, + const float* input2_data, const Dims<4>& input2_dims, + float* output_data, const Dims<4>& output_dims) { + gemmlowp::ScopedProfilingLabel label("BroadcastMul"); + + NdArrayDesc<4> desc1; + NdArrayDesc<4> desc2; + NdArrayDescsForElementwiseBroadcast(input1_dims, input2_dims, &desc1, &desc2); + + // In Tensorflow, the dimensions are canonically named (batch_number, row, + // col, channel), with extents (batches, height, width, depth), with the + // trailing dimension changing most rapidly (channels has the smallest stride, + // typically 1 element). + // + // In generated C code, we store arrays with the dimensions reversed. The + // first dimension has smallest stride. + // + // We name our variables by their Tensorflow convention, but generate C code + // nesting loops such that the innermost loop has the smallest stride for the + // best cache behavior. 
+ for (int b = 0; b < ArraySize(output_dims, 3); ++b) { + for (int y = 0; y < ArraySize(output_dims, 2); ++y) { + for (int x = 0; x < ArraySize(output_dims, 1); ++x) { + for (int c = 0; c < ArraySize(output_dims, 0); ++c) { + output_data[Offset(output_dims, c, x, y, b)] = ActivationFunction<Ac>( + input1_data[SubscriptToIndex(desc1, c, x, y, b)] * + input2_data[SubscriptToIndex(desc2, c, x, y, b)]); + } + } + } + } +} + +template <FusedActivationFunctionType Ac> +inline void BroadcastMul(const uint8* input1_data, const Dims<4>& input1_dims, + int32 input1_offset, const uint8* input2_data, + const Dims<4>& input2_dims, int32 input2_offset, + int32 output_offset, int32 output_multiplier, + int output_shift, int32 output_activation_min, + int32 output_activation_max, uint8* output_data, + const Dims<4>& output_dims) { + static_assert(Ac == FusedActivationFunctionType::kNone || + Ac == FusedActivationFunctionType::kRelu || + Ac == FusedActivationFunctionType::kRelu6 || + Ac == FusedActivationFunctionType::kRelu1, + ""); + DCHECK_LE(output_activation_min, output_activation_max); + if (Ac == FusedActivationFunctionType::kNone) { + DCHECK_EQ(output_activation_min, 0); + DCHECK_EQ(output_activation_max, 255); + } + gemmlowp::ScopedProfilingLabel label("BroadcastMul/8bit"); + + NdArrayDesc<4> desc1; + NdArrayDesc<4> desc2; + NdArrayDescsForElementwiseBroadcast(input1_dims, input2_dims, &desc1, &desc2); + + // In Tensorflow, the dimensions are canonically named (batch_number, row, + // col, channel), with extents (batches, height, width, depth), with the + // trailing dimension changing most rapidly (channels has the smallest stride, + // typically 1 element). + // + // In generated C code, we store arrays with the dimensions reversed. The + // first dimension has smallest stride. + // + // We name our variables by their Tensorflow convention, but generate C code + // nesting loops such that the innermost loop has the smallest stride for the + // best cache behavior. 
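In the 8-bit loop below, the fixed-point output_multiplier/output_shift pair is assumed to encode the real-valued rescaling factor input1_scale * input2_scale / output_scale. A float-arithmetic sketch of the same requantization, with hypothetical quantization parameters:

  #include <cmath>
  #include <cstdint>

  int main() {
    // Hypothetical (scale, zero point) for both inputs and the output.
    const float s1 = 0.02f, s2 = 0.05f, s_out = 0.1f;
    const int32_t z1 = 3, z2 = 7, z_out = 5;  // the offsets passed in are their negatives
    const int32_t q1 = 130, q2 = 99;
    // Real product of the two dequantized values, requantized to the output scale.
    const float real = (s1 * (q1 - z1)) * (s2 * (q2 - z2));
    const int32_t q_out = z_out + static_cast<int32_t>(std::lround(real / s_out));
    // The kernel reaches the same value as
    //   z_out + ((q1 - z1) * (q2 - z2)) * (s1 * s2 / s_out)
    // with the last factor expressed as output_multiplier * 2^-output_shift.
    return (q_out >= 0 && q_out <= 255) ? 0 : 1;
  }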
+ for (int b = 0; b < ArraySize(output_dims, 3); ++b) { + for (int y = 0; y < ArraySize(output_dims, 2); ++y) { + for (int x = 0; x < ArraySize(output_dims, 1); ++x) { + for (int c = 0; c < ArraySize(output_dims, 0); ++c) { + const int32 input1_val = + input1_offset + input1_data[SubscriptToIndex(desc1, c, x, y, b)]; + const int32 input2_val = + input2_offset + input2_data[SubscriptToIndex(desc2, c, x, y, b)]; + const int32 unclamped_result = + output_offset + + MultiplyByQuantizedMultiplierSmallerThanOne( + input1_val * input2_val, output_multiplier, output_shift); + const int32 clamped_output = + std::min(output_activation_max, + std::max(output_activation_min, unclamped_result)); + output_data[Offset(output_dims, c, x, y, b)] = + static_cast<uint8>(clamped_output); + } + } + } + } +} + +template <FusedActivationFunctionType Ac, typename Scalar> +void Concatenation(int concat_dim, const Scalar* const* input_data, + const Dims<4>* const* input_dims, int inputs_count, + Scalar* output_data, const Dims<4>& output_dims) { + gemmlowp::ScopedProfilingLabel label("Concatenation"); + DCHECK_GT(inputs_count, 1); + int concat_size = 0; + for (int i = 0; i < inputs_count; i++) { + for (int j = 0; j < 4; j++) { + if (j != concat_dim) { + MatchingArraySize(*input_dims[i], j, output_dims, j); + } + } + concat_size += ArraySize(*input_dims[i], concat_dim); + } + DCHECK_EQ(concat_size, ArraySize(output_dims, concat_dim)); + DCHECK(IsPackedWithoutStrides(output_dims)); + // for now we dont have a model with a Concatenation + // with fused activation function. + DCHECK(Ac == FusedActivationFunctionType::kNone); + int outer_size = 1; + for (int i = concat_dim + 1; i < 4; i++) { + outer_size *= output_dims.sizes[i]; + } + Scalar* output_ptr = output_data; + for (int k = 0; k < outer_size; k++) { + for (int i = 0; i < inputs_count; ++i) { + const int copy_size = + input_dims[i]->sizes[concat_dim] * input_dims[i]->strides[concat_dim]; + memcpy(output_ptr, input_data[i] + k * copy_size, + copy_size * sizeof(Scalar)); + output_ptr += copy_size; + } + } +} + +template <FusedActivationFunctionType Ac, typename Scalar> +void DepthConcatenation(const Scalar* const* input_data, + const Dims<4>* const* input_dims, int inputs_count, + Scalar* output_data, const Dims<4>& output_dims) { + Concatenation<Ac, Scalar>(0, input_data, input_dims, inputs_count, + output_data, output_dims); +} + +inline void LstmCell(const float* input_data, const Dims<4>& input_dims, + const float* prev_activ_data, + const Dims<4>& prev_activ_dims, const float* weights_data, + const Dims<4>& weights_dims, const float* bias_data, + const Dims<4>& bias_dims, const float* prev_state_data, + const Dims<4>& prev_state_dims, float* output_state_data, + const Dims<4>& output_state_dims, float* output_activ_data, + const Dims<4>& output_activ_dims, float* concat_temp_data, + const Dims<4>& concat_temp_dims, float* activ_temp_data, + const Dims<4>& activ_temp_dims) { + gemmlowp::ScopedProfilingLabel label("LstmCell"); + MatchingArraySize( // batches + input_dims, 3, prev_activ_dims, 3, prev_state_dims, 3, output_state_dims, + 3, output_activ_dims, 3); + MatchingArraySize( // height + input_dims, 2, prev_activ_dims, 2, prev_state_dims, 2, output_state_dims, + 2, output_activ_dims, 2); + MatchingArraySize( // width + input_dims, 1, prev_activ_dims, 1, prev_state_dims, 1, output_state_dims, + 1, output_activ_dims, 1); + CHECK_EQ(ArraySize(weights_dims, 2), 1); + CHECK_EQ(ArraySize(weights_dims, 3), 1); + const int input_depth = ArraySize(input_dims, 0); + 
const int prev_activ_depth = ArraySize(prev_activ_dims, 0); + const int total_input_depth = prev_activ_depth + input_depth; + CHECK_EQ(ArraySize(weights_dims, 0), total_input_depth); + CHECK_EQ(MatchingArraySize(bias_dims, 1, bias_dims, 2, bias_dims, 3), 1); + const int intern_activ_depth = MatchingArraySize( + weights_dims, 1, + bias_dims, 0); + CHECK_EQ(intern_activ_depth % 4, 0); + const int output_depth = MatchingArraySize( + prev_state_dims, 0, + prev_activ_dims, 0, + output_state_dims, 0, + output_activ_dims, 0); + CHECK_EQ(output_depth, intern_activ_depth / 4); + + // Concatenate prev_activ and input data together + std::vector<float const*> concat_input_arrays_data; + std::vector<Dims<4> const*> concat_input_arrays_dims; + concat_input_arrays_data.push_back(input_data); + concat_input_arrays_data.push_back(prev_activ_data); + concat_input_arrays_dims.push_back(&input_dims); + concat_input_arrays_dims.push_back(&prev_activ_dims); + Concatenation<FusedActivationFunctionType::kNone, float>( + 0, &(concat_input_arrays_data[0]), &(concat_input_arrays_dims[0]), + concat_input_arrays_data.size(), concat_temp_data, concat_temp_dims); + + // Fully connected + FullyConnected<FusedActivationFunctionType::kNone>( + concat_temp_data, concat_temp_dims, weights_data, weights_dims, bias_data, + bias_dims, activ_temp_data, activ_temp_dims); + + // Map raw arrays to Eigen arrays so we can use Eigen's optimized array + // operations. + ArrayMap<float> activ_temp_map = + MapAsArrayWithFirstDimAsRows(activ_temp_data, activ_temp_dims); + auto input_gate_sm = activ_temp_map.block(0 * output_depth, 0, output_depth, + activ_temp_map.cols()); + auto new_input_sm = activ_temp_map.block(1 * output_depth, 0, output_depth, + activ_temp_map.cols()); + auto forget_gate_sm = activ_temp_map.block(2 * output_depth, 0, output_depth, + activ_temp_map.cols()); + auto output_gate_sm = activ_temp_map.block(3 * output_depth, 0, output_depth, + activ_temp_map.cols()); + ArrayMap<const float> prev_state_map = + MapAsArrayWithFirstDimAsRows(prev_state_data, prev_state_dims); + ArrayMap<float> output_state_map = + MapAsArrayWithFirstDimAsRows(output_state_data, output_state_dims); + ArrayMap<float> output_activ_map = + MapAsArrayWithFirstDimAsRows(output_activ_data, output_activ_dims); + + // Combined memory state and final output calculation + gemmlowp::ScopedProfilingLabel label2("MemoryStateAndFinalOutput"); + output_state_map = + input_gate_sm.unaryExpr(Eigen::internal::scalar_sigmoid_op<float>()) * + new_input_sm.tanh() + + forget_gate_sm.unaryExpr(Eigen::internal::scalar_sigmoid_op<float>()) * + prev_state_map; + output_activ_map = + output_gate_sm.unaryExpr(Eigen::internal::scalar_sigmoid_op<float>()) * + output_state_map.tanh(); +} + +template <FusedActivationFunctionType Ac, typename Scalar> +void TensorFlowSplit(const Scalar* input_data, const Dims<4>& input_dims, + int outputs_count, Scalar* const* output_data, + const Dims<4>* const* output_dims) { + gemmlowp::ScopedProfilingLabel label("TensorFlowSplit"); + DCHECK_GE(outputs_count, 1); + for (int i = 0; i < outputs_count; i++) { + /* batches = */ MatchingArraySize(*output_dims[i], 3, input_dims, 3); + /* height = */ MatchingArraySize(*output_dims[i], 2, input_dims, 2); + /* width = */ MatchingArraySize(*output_dims[i], 1, input_dims, 1); + } + const int batches = MatchingArraySize(*output_dims[0], 3, input_dims, 3); + const int height = MatchingArraySize(*output_dims[0], 2, input_dims, 2); + const int width = MatchingArraySize(*output_dims[0], 1, input_dims, 1); 
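The combined memory-state and output step in LstmCell above is the standard LSTM cell update applied element-wise by the Eigen expressions. A scalar sketch for one element, with hypothetical gate pre-activation values:

  #include <cmath>

  static float sigmoid(float x) { return 1.0f / (1.0f + std::exp(-x)); }

  int main() {
    // Hypothetical pre-activations for one element, i.e. one row of activ_temp
    // split into its four gate blocks.
    const float input_gate = 0.5f, new_input = -0.2f, forget_gate = 1.0f, output_gate = 0.3f;
    const float prev_state = 0.8f;
    const float output_state =
        sigmoid(input_gate) * std::tanh(new_input) + sigmoid(forget_gate) * prev_state;
    const float output_activ = sigmoid(output_gate) * std::tanh(output_state);
    return (output_activ > -1.0f && output_activ < 1.0f) ? 0 : 1;
  }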
+ DCHECK(IsPackedWithoutStrides(input_dims)); + // for now we dont have a model with a TensorFlowSplit + // with fused activation function. + DCHECK(Ac == FusedActivationFunctionType::kNone); + const int whb = width * height * batches; + const Scalar* input_ptr = input_data; + for (int k = 0; k < whb; k++) { + for (int i = 0; i < outputs_count; ++i) { + memcpy(output_data[i] + k * output_dims[i]->sizes[0], input_ptr, + output_dims[i]->sizes[0] * sizeof(Scalar)); + input_ptr += output_dims[i]->sizes[0]; + } + } +} + +inline int NodeOffset(int b, int h, int w, int height, int width) { + return (b * height + h) * width + w; +} + +template <FusedActivationFunctionType Ac> +void AveragePool(const float* input_data, const Dims<4>& input_dims, + int stride_width, int stride_height, + int pad_width, int pad_height, int kwidth, int kheight, + float* output_data, const Dims<4>& output_dims) { + gemmlowp::ScopedProfilingLabel label("AveragePool"); + const int batches = MatchingArraySize(input_dims, 3, output_dims, 3); + const int input_height = ArraySize(input_dims, 2); + const int input_width = ArraySize(input_dims, 1); + const int output_height = ArraySize(output_dims, 2); + const int output_width = ArraySize(output_dims, 1); + const int depth = MatchingArraySize(input_dims, 0, output_dims, 0); + + const auto in_mat = MapAsMatrixWithFirstDimAsRows(input_data, input_dims); + auto out_mat = MapAsMatrixWithFirstDimAsRows(output_data, output_dims); + // TODO: get rid of the dynamic memory allocation here! + Eigen::VectorXf out_count(out_mat.cols()); + out_count.setZero(); + // Prefill the output to 0. + out_mat.setZero(); + for (int b = 0; b < batches; ++b) { + for (int h = 0; h < input_height; ++h) { + for (int w = 0; w < input_width; ++w) { + // (h_start, h_end) * (w_start, w_end) is the range that the input + // vector projects to. + int hpad = h + pad_height; + int wpad = w + pad_width; + int h_start = (hpad < kheight) ? 0 : (hpad - kheight) / stride_height + 1; + int h_end = std::min(hpad / stride_height + 1, output_height); + int w_start = (wpad < kwidth) ? 
0 : (wpad - kwidth) / stride_width + 1; + int w_end = std::min(wpad / stride_width + 1, output_width); + // compute elementwise sum + for (int ph = h_start; ph < h_end; ++ph) { + for (int pw = w_start; pw < w_end; ++pw) { + int out_offset = NodeOffset(b, ph, pw, output_height, output_width); + out_mat.col(out_offset) += + in_mat.col(NodeOffset(b, h, w, input_height, input_width)); + out_count(out_offset)++; + } + } + } + } + } + // Divide the output by the actual number of elements being averaged over + DCHECK_GT(out_count.minCoeff(), 0); + out_mat.array().rowwise() /= out_count.transpose().array(); + + for (int b = 0; b < batches; ++b) { + for (int y = 0; y < output_height; ++y) { + for (int x = 0; x < output_width; ++x) { + for (int c = 0; c < depth; ++c) { + output_data[Offset(output_dims, c, x, y, b)] = ActivationFunction<Ac>( + output_data[Offset(output_dims, c, x, y, b)]); + } + } + } + } +} + +template <FusedActivationFunctionType Ac> +void AveragePool(const uint8* input_data, const Dims<4>& input_dims, + int stride_width, int stride_height, + int pad_width, int pad_height, int filter_width, + int filter_height, int32 output_activation_min, + int32 output_activation_max, uint8* output_data, + const Dims<4>& output_dims) { + gemmlowp::ScopedProfilingLabel label("AveragePool/8bit"); + static_assert(Ac == FusedActivationFunctionType::kNone || + Ac == FusedActivationFunctionType::kRelu || + Ac == FusedActivationFunctionType::kRelu6 || + Ac == FusedActivationFunctionType::kRelu1, + ""); + DCHECK_LE(output_activation_min, output_activation_max); + if (Ac == FusedActivationFunctionType::kNone) { + DCHECK_EQ(output_activation_min, 0); + DCHECK_EQ(output_activation_max, 255); + } + const int batches = MatchingArraySize(input_dims, 3, output_dims, 3); + const int depth = MatchingArraySize(input_dims, 0, output_dims, 0); + const int input_height = ArraySize(input_dims, 2); + const int input_width = ArraySize(input_dims, 1); + const int output_height = ArraySize(output_dims, 2); + const int output_width = ArraySize(output_dims, 1); + for (int batch = 0; batch < batches; ++batch) { + for (int out_y = 0; out_y < output_height; ++out_y) { + for (int out_x = 0; out_x < output_width; ++out_x) { + const int in_x_origin = (out_x * stride_width) - pad_width; + const int in_y_origin = (out_y * stride_height) - pad_height; + const int filter_x_start = std::max(0, -in_x_origin); + const int filter_x_end = + std::min(filter_width, input_width - in_x_origin); + const int filter_y_start = std::max(0, -in_y_origin); + const int filter_y_end = + std::min(filter_height, input_height - in_y_origin); + const int filter_count = + (filter_x_end - filter_x_start) * (filter_y_end - filter_y_start); + // TODO: Add a dynamic buffer allocation path instead of hardcoded size. 
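The 8-bit path below accumulates each channel into a uint16 buffer and then divides by the number of contributing inputs with rounding rather than truncation. A small sketch of that rounding division, with hypothetical values:

  #include <cstdint>

  int main() {
    const int filter_count = 9;   // e.g. a fully interior 3x3 window
    const uint16_t acc = 23;      // hypothetical sum of 9 uint8 inputs
    // (acc + filter_count / 2) / filter_count rounds to nearest instead of
    // truncating: 23 / 9 = 2.55..., so the result is 3 rather than 2.
    const uint16_t avg = (acc + filter_count / 2) / filter_count;
    return avg == 3 ? 0 : 1;
  }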
+ static constexpr int kAccBufferMaxSize = 2048; + DCHECK_LE(depth, kAccBufferMaxSize); + uint16 acc[kAccBufferMaxSize]; + memset(acc, 0, depth * sizeof(acc[0])); + const uint8* input_ptr = + input_data + input_dims.strides[1] * in_x_origin + + input_dims.strides[2] * in_y_origin + input_dims.strides[3] * batch; + for (int fy = filter_y_start; fy < filter_y_end; fy++) { + const uint8* input_row_ptr = input_ptr + fy * input_dims.strides[2] + + filter_x_start * input_dims.strides[1]; + for (int fx = filter_x_start; fx < filter_x_end; fx++) { + int channel = 0; +#ifdef USE_NEON + for (; channel <= depth - 16; channel += 16) { + uint16x8_t acc_reg[2]; + for (int i = 0; i < 2; i++) { + acc_reg[i] = vld1q_u16(acc + channel + 8 * i); + } + uint8x16_t input_reg = vld1q_u8(input_row_ptr); + input_row_ptr += 16; + acc_reg[0] = vaddw_u8(acc_reg[0], vget_low_u8(input_reg)); + acc_reg[1] = vaddw_u8(acc_reg[1], vget_high_u8(input_reg)); + for (int i = 0; i < 2; i++) { + vst1q_u16(acc + channel + 8 * i, acc_reg[i]); + } + } + for (; channel <= depth - 8; channel += 8) { + uint16x8_t acc_reg = vld1q_u16(acc + channel); + uint8x8_t input_reg = vld1_u8(input_row_ptr); + input_row_ptr += 8; + acc_reg = vaddw_u8(acc_reg, input_reg); + vst1q_u16(acc + channel, acc_reg); + } +#endif + for (; channel < depth; ++channel) { + acc[channel] += *input_row_ptr++; + } + } + } + uint8* output_ptr = + output_data + Offset(output_dims, 0, out_x, out_y, batch); + int channel = 0; +#ifdef USE_NEON +#define AVGPOOL_DIVIDING_BY(FILTER_COUNT) \ + if (filter_count == FILTER_COUNT) { \ + for (; channel <= depth - 8; channel += 8) { \ + uint16 buf[8]; \ + for (int i = 0; i < 8; i++) { \ + buf[i] = (acc[channel + i] + FILTER_COUNT / 2) / FILTER_COUNT; \ + } \ + uint8x8_t buf8 = vqmovn_u16(vld1q_u16(buf)); \ + buf8 = vmin_u8(buf8, vdup_n_u8(output_activation_max)); \ + buf8 = vmax_u8(buf8, vdup_n_u8(output_activation_min)); \ + vst1_u8(output_ptr + channel, buf8); \ + } \ + } + AVGPOOL_DIVIDING_BY(9) + AVGPOOL_DIVIDING_BY(15) +#undef AVGPOOL_DIVIDING_BY + for (; channel <= depth - 8; channel += 8) { + uint16 buf[8]; + for (int i = 0; i < 8; i++) { + buf[i] = (acc[channel + i] + filter_count / 2) / filter_count; + } + uint8x8_t buf8 = vqmovn_u16(vld1q_u16(buf)); + buf8 = vmin_u8(buf8, vdup_n_u8(output_activation_max)); + buf8 = vmax_u8(buf8, vdup_n_u8(output_activation_min)); + vst1_u8(output_ptr + channel, buf8); + } +#endif + for (; channel < depth; ++channel) { + uint16 a = (acc[channel] + filter_count / 2) / filter_count; + a = std::max<uint16>(a, output_activation_min); + a = std::min<uint16>(a, output_activation_max); + output_ptr[channel] = static_cast<uint8>(a); + } + } + } + } +} + +template <FusedActivationFunctionType Ac> +void MaxPool(const float* input_data, const Dims<4>& input_dims, + int stride_width, int stride_height, + int pad_width, int pad_height, int kwidth, int kheight, + float* output_data, const Dims<4>& output_dims) { + gemmlowp::ScopedProfilingLabel label("MaxPool"); + const int batches = MatchingArraySize(input_dims, 3, output_dims, 3); + const int input_height = ArraySize(input_dims, 2); + const int input_width = ArraySize(input_dims, 1); + const int output_height = ArraySize(output_dims, 2); + const int output_width = ArraySize(output_dims, 1); + const int depth = MatchingArraySize(input_dims, 0, output_dims, 0); + + const auto in_mat = MapAsMatrixWithFirstDimAsRows(input_data, input_dims); + auto out_mat = MapAsMatrixWithFirstDimAsRows(output_data, output_dims); + // Prefill the output to minimum 
representable float value
+  out_mat.setConstant(std::numeric_limits<float>::lowest());
+  for (int b = 0; b < batches; ++b) {
+    for (int h = 0; h < input_height; ++h) {
+      for (int w = 0; w < input_width; ++w) {
+        // (h_start, h_end) * (w_start, w_end) is the range that the input
+        // vector projects to.
+        int hpad = h + pad_height;
+        int wpad = w + pad_width;
+        int h_start = (hpad < kheight) ? 0 : (hpad - kheight) / stride_height + 1;
+        int h_end = std::min(hpad / stride_height + 1, output_height);
+        int w_start = (wpad < kwidth) ? 0 : (wpad - kwidth) / stride_width + 1;
+        int w_end = std::min(wpad / stride_width + 1, output_width);
+        // compute elementwise max
+        for (int ph = h_start; ph < h_end; ++ph) {
+          for (int pw = w_start; pw < w_end; ++pw) {
+            int out_offset = NodeOffset(b, ph, pw, output_height, output_width);
+            out_mat.col(out_offset) =
+                out_mat.col(out_offset)
+                    .cwiseMax(in_mat.col(
+                        NodeOffset(b, h, w, input_height, input_width)));
+          }
+        }
+      }
+    }
+  }
+
+  for (int b = 0; b < batches; ++b) {
+    for (int y = 0; y < output_height; ++y) {
+      for (int x = 0; x < output_width; ++x) {
+        for (int c = 0; c < depth; ++c) {
+          output_data[Offset(output_dims, c, x, y, b)] = ActivationFunction<Ac>(
+              output_data[Offset(output_dims, c, x, y, b)]);
+        }
+      }
+    }
+  }
+}
+
+template <FusedActivationFunctionType Ac>
+void MaxPool(const uint8* input_data, const Dims<4>& input_dims,
+             int stride_width, int stride_height,
+             int pad_width, int pad_height, int filter_width, int filter_height,
+             int32 output_activation_min, int32 output_activation_max,
+             uint8* output_data, const Dims<4>& output_dims) {
+  gemmlowp::ScopedProfilingLabel label("MaxPool/8bit");
+  static_assert(Ac == FusedActivationFunctionType::kNone ||
+                    Ac == FusedActivationFunctionType::kRelu ||
+                    Ac == FusedActivationFunctionType::kRelu6 ||
+                    Ac == FusedActivationFunctionType::kRelu1,
+                "");
+  DCHECK_LE(output_activation_min, output_activation_max);
+  if (Ac == FusedActivationFunctionType::kNone) {
+    DCHECK_EQ(output_activation_min, 0);
+    DCHECK_EQ(output_activation_max, 255);
+  }
+  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
+  const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
+  const int input_height = ArraySize(input_dims, 2);
+  const int input_width = ArraySize(input_dims, 1);
+  const int output_height = ArraySize(output_dims, 2);
+  const int output_width = ArraySize(output_dims, 1);
+  for (int batch = 0; batch < batches; ++batch) {
+    for (int out_y = 0; out_y < output_height; ++out_y) {
+      for (int out_x = 0; out_x < output_width; ++out_x) {
+        const int in_x_origin = (out_x * stride_width) - pad_width;
+        const int in_y_origin = (out_y * stride_height) - pad_height;
+        const int filter_x_start = std::max(0, -in_x_origin);
+        const int filter_x_end =
+            std::min(filter_width, input_width - in_x_origin);
+        const int filter_y_start = std::max(0, -in_y_origin);
+        const int filter_y_end =
+            std::min(filter_height, input_height - in_y_origin);
+        // TODO: Add a dynamic buffer allocation path instead of hardcoded size.
+ static constexpr int kAccBufferMaxSize = 2048; + DCHECK_LE(depth, kAccBufferMaxSize); + uint8 acc[kAccBufferMaxSize]; + memset(acc, 0, depth * sizeof(acc[0])); + const uint8* input_ptr = + input_data + input_dims.strides[1] * in_x_origin + + input_dims.strides[2] * in_y_origin + input_dims.strides[3] * batch; + for (int fy = filter_y_start; fy < filter_y_end; fy++) { + const uint8* input_row_ptr = input_ptr + fy * input_dims.strides[2] + + filter_x_start * input_dims.strides[1]; + for (int fx = filter_x_start; fx < filter_x_end; fx++) { + int channel = 0; +#ifdef USE_NEON + for (; channel <= depth - 16; channel += 16) { + uint8x16_t acc_reg = vld1q_u8(acc + channel); + uint8x16_t input_reg = vld1q_u8(input_row_ptr); + input_row_ptr += 16; + acc_reg = vmaxq_u8(acc_reg, input_reg); + vst1q_u8(acc + channel, acc_reg); + } + + for (; channel <= depth - 8; channel += 8) { + uint8x8_t acc_reg = vld1_u8(acc + channel); + uint8x8_t input_reg = vld1_u8(input_row_ptr); + input_row_ptr += 8; + acc_reg = vmax_u8(acc_reg, input_reg); + vst1_u8(acc + channel, acc_reg); + } +#endif + for (; channel < depth; ++channel) { + acc[channel] = std::max(acc[channel], *input_row_ptr++); + } + } + } + uint8* output_ptr = + output_data + Offset(output_dims, 0, out_x, out_y, batch); + int channel = 0; +#ifdef USE_NEON + for (; channel <= depth - 16; channel += 16) { + uint8x16_t a = vld1q_u8(acc + channel); + a = vminq_u8(a, vdupq_n_u8(output_activation_max)); + a = vmaxq_u8(a, vdupq_n_u8(output_activation_min)); + vst1q_u8(output_ptr + channel, a); + } + for (; channel <= depth - 8; channel += 8) { + uint8x8_t a = vld1_u8(acc + channel); + a = vmin_u8(a, vdup_n_u8(output_activation_max)); + a = vmax_u8(a, vdup_n_u8(output_activation_min)); + vst1_u8(output_ptr + channel, a); + } +#endif + for (; channel < depth; ++channel) { + uint8 a = acc[channel]; + a = std::max<uint8>(a, output_activation_min); + a = std::min<uint8>(a, output_activation_max); + output_ptr[channel] = static_cast<uint8>(a); + } + } + } + } +} + +template <FusedActivationFunctionType Ac> +void L2Pool(const float* input_data, const Dims<4>& input_dims, + int stride_width, int stride_height, + int pad_width, int pad_height, int filter_width, int filter_height, + float* output_data, const Dims<4>& output_dims) { + gemmlowp::ScopedProfilingLabel label("L2Pool"); + const int batches = MatchingArraySize(input_dims, 3, output_dims, 3); + const int input_height = ArraySize(input_dims, 2); + const int input_width = ArraySize(input_dims, 1); + const int output_height = ArraySize(output_dims, 2); + const int output_width = ArraySize(output_dims, 1); + // Actually carry out L2 Pool. Code is written in forward mode: we go through + // the input values once, and write to all the pooled regions that it maps to. + const auto in_mat = MapAsMatrixWithFirstDimAsRows(input_data, input_dims); + auto out_mat = MapAsMatrixWithFirstDimAsRows(output_data, output_dims); + Eigen::VectorXf in_square(in_mat.rows()); + Eigen::VectorXf out_count(out_mat.cols()); + out_count.setZero(); + // Prefill the output to 0. + out_mat.setZero(); + for (int b = 0; b < batches; ++b) { + for (int h = 0; h < input_height; ++h) { + for (int w = 0; w < input_width; ++w) { + // (h_start, h_end) * (w_start, w_end) is the range that the input + // vector projects to. + const int hpad = h + pad_height; + const int wpad = w + pad_width; + const int h_start = + (hpad < filter_height) ? 
0 : (hpad - filter_height) / stride_height + 1; + const int h_end = std::min(hpad / stride_height + 1, output_height); + const int w_start = + (wpad < filter_width) ? 0 : (wpad - filter_width) / stride_width + 1; + const int w_end = std::min(wpad / stride_width + 1, output_width); + // pre-compute square + const int in_offset = w + input_width * (h + input_height * b); + in_square = + in_mat.col(in_offset).array() * in_mat.col(in_offset).array(); + // compute elementwise sum of squares + for (int ph = h_start; ph < h_end; ++ph) { + for (int pw = w_start; pw < w_end; ++pw) { + const int out_offset = pw + output_width * (ph + output_height * b); + out_mat.col(out_offset) += in_square; + out_count(out_offset)++; + } + } + } + } + } + + out_count = out_count.array().inverse(); + out_mat = + (out_mat.array().rowwise() * out_count.transpose().array()).cwiseSqrt(); +} + +inline void LocalResponseNormalization(const float* input_data, + const Dims<4>& input_dims, int range, + float bias, float alpha, float beta, + float* output_data, + const Dims<4>& output_dims) { + /* const int batches = */ MatchingArraySize(input_dims, 3, output_dims, 3); + /* const int height = */ MatchingArraySize(input_dims, 2, output_dims, 2); + /* const int width = */ MatchingArraySize(input_dims, 1, output_dims, 1); + /* const int depth = */ MatchingArraySize(input_dims, 0, output_dims, 0); + + const auto data_in = MapAsMatrixWithFirstDimAsRows(input_data, input_dims); + auto data_out = MapAsMatrixWithFirstDimAsRows(output_data, output_dims); + + // Carry out local response normalization, vector by vector. + // Since the data are stored column major, making row-wise operation + // probably not memory efficient anyway, we do an explicit for loop over + // the columns. + const int double_range = range * 2; + Eigen::VectorXf padded_square(data_in.rows() + double_range); + padded_square.setZero(); + for (int r = 0; r < data_in.cols(); ++r) { + // Do local response normalization for data_in(:, r) + // first, compute the square and store them in buffer for repeated use + padded_square.block(range, 0, data_in.rows(), 1) = + data_in.col(r).cwiseProduct(data_in.col(r)) * alpha; + // Then, compute the scale and writes them to data_out + float accumulated_scale = 0; + for (int i = 0; i < double_range; ++i) { + accumulated_scale += padded_square(i); + } + for (int i = 0; i < data_in.rows(); ++i) { + accumulated_scale += padded_square(i + double_range); + data_out(i, r) = bias + accumulated_scale; + accumulated_scale -= padded_square(i); + } + } + + // In a few cases, the pow computation could benefit from speedups. 
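// Note: at this point data_out holds the raw scale, bias + alpha * (sum of
// squares over the window). The branches below compute data_in * scale^(-beta),
// special-casing beta == 1 (plain reciprocal) and beta == 0.5 (reciprocal
// square root) to avoid the generic pow().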
+ if (beta == 1) { + data_out.array() = data_in.array() * data_out.array().inverse(); + } else if (beta == 0.5) { + data_out.array() = data_in.array() * data_out.array().sqrt().inverse(); + } else { + data_out.array() = data_in.array() * data_out.array().pow(-beta); + } +} + +inline void Softmax(const float* input_data, const Dims<4>& input_dims, + float beta, float* output_data, + const Dims<4>& output_dims) { + gemmlowp::ScopedProfilingLabel label("Softmax"); + /* const int batches = */ MatchingArraySize(input_dims, 3, output_dims, 3); + /* const int height = */ MatchingArraySize(input_dims, 2, output_dims, 2); + /* const int width = */ MatchingArraySize(input_dims, 1, output_dims, 1); + /* const int depth = */ MatchingArraySize(input_dims, 0, output_dims, 0); + + const auto in_mat = MapAsMatrixWithFirstDimAsRows(input_data, input_dims); + auto out_mat = MapAsMatrixWithFirstDimAsRows(output_data, output_dims); + // Compute the exponential first, removing the max coefficient for numerical + // stability. + out_mat = (in_mat.rowwise() - in_mat.colwise().maxCoeff()).array() * beta; + // We are separating out the exp function so that exp can be vectorized. + out_mat = out_mat.array().exp(); + // Normalize to get the activations. + Eigen::Array<float, 1, Eigen::Dynamic> scale = + out_mat.array().colwise().sum().inverse(); + out_mat.array().rowwise() *= scale; +} + +inline void Softmax(const uint8* input_data, const Dims<4>& input_dims, + int32 input_beta_multiplier, int32 input_beta_left_shift, + int diff_min, uint8* output_data, + const Dims<4>& output_dims) { + // The representation chosen for the input to the exp() function is Q5.26. + // We need to leave extra space since values that we skip might be as large as + // -32 before multiplying by input_beta_multiplier, and therefore as large as + // -16 afterwards. Note that exp(-8) is definitely not insignificant to + // accumulation, but exp(-16) definitely is. 
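// Note: kScaledDiffIntegerBits = 5 makes the rescaled (input - max) differences
// Q5.26 fixed-point values (magnitudes up to about 32), and
// kAccumulationIntegerBits = 12 gives the Q12.19 accumulator headroom to sum
// several thousand exp() terms, each of which lies in (0, 1].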
+ static const int kScaledDiffIntegerBits = 5; + static const int kAccumulationIntegerBits = 12; + using FixedPointScaledDiff = + gemmlowp::FixedPoint<int32, kScaledDiffIntegerBits>; + using FixedPointAccum = gemmlowp::FixedPoint<int32, kAccumulationIntegerBits>; + using FixedPoint0 = gemmlowp::FixedPoint<int32, 0>; + + gemmlowp::ScopedProfilingLabel label("Softmax"); + const int batches = MatchingArraySize(input_dims, 3, output_dims, 3); + const int height = MatchingArraySize(input_dims, 2, output_dims, 2); + const int width = MatchingArraySize(input_dims, 1, output_dims, 1); + const int depth = MatchingArraySize(input_dims, 0, output_dims, 0); + + for (int b = 0; b < batches; ++b) { + for (int x = 0; x < width; ++x) { + for (int y = 0; y < height; ++y) { + uint8 max_in_row = 0; + for (int c = 0; c < depth; ++c) { + max_in_row = + std::max(max_in_row, input_data[Offset(input_dims, c, x, y, b)]); + } + + FixedPointAccum sum_of_exps = FixedPointAccum::Zero(); + for (int c = 0; c < depth; ++c) { + int32 input_diff = + static_cast<int32>(input_data[Offset(input_dims, c, x, y, b)]) - + max_in_row; + if (input_diff >= diff_min) { + const int32 input_diff_rescaled = + MultiplyByQuantizedMultiplierGreaterThanOne( + input_diff, input_beta_multiplier, input_beta_left_shift); + const FixedPointScaledDiff scaled_diff_f8 = + FixedPointScaledDiff::FromRaw(input_diff_rescaled); + sum_of_exps = + sum_of_exps + gemmlowp::Rescale<kAccumulationIntegerBits>( + exp_on_negative_values(scaled_diff_f8)); + } + } + + int32 fixed_sum_of_exps = sum_of_exps.raw(); + // TODO: Use a NEON intrinsic like vclzq_u32 instead. + int headroom_plus_one = + __builtin_clz(static_cast<uint32>(fixed_sum_of_exps)); + // This is the number of bits to the left of the binary point above 1.0. + // Consider fixed_sum_of_exps=1.25. In that case shifted_scale=0.8 and + // no later adjustment will be needed. 
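// Note: headroom_plus_one is the number of leading zero bits in the Q12.19 sum;
// shifting left by it normalizes the sum into [1, 2), so subtracting 1 << 31
// leaves (normalized_sum - 1) in [0, 1) as a Q0.31 value, the domain expected
// by one_over_one_plus_x_for_x_in_0_1 below.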
+ int num_bits_over_unit = kAccumulationIntegerBits - headroom_plus_one; + int32 shifted_sum_minus_one = static_cast<int32>( + (static_cast<uint32>(fixed_sum_of_exps) << headroom_plus_one) - + (static_cast<uint32>(1) << 31)); + + FixedPoint0 shifted_scale = gemmlowp::one_over_one_plus_x_for_x_in_0_1( + FixedPoint0::FromRaw(shifted_sum_minus_one)); + + for (int c = 0; c < depth; ++c) { + int32 input_diff = + static_cast<int32>(input_data[Offset(input_dims, c, x, y, b)]) - + max_in_row; + if (input_diff >= diff_min) { + const int32 input_diff_rescaled = + MultiplyByQuantizedMultiplierGreaterThanOne( + input_diff, input_beta_multiplier, input_beta_left_shift); + const FixedPointScaledDiff scaled_diff_f8 = + FixedPointScaledDiff::FromRaw(input_diff_rescaled); + + FixedPoint0 exp_in_0 = exp_on_negative_values(scaled_diff_f8); + int32 unsat_output = gemmlowp::RoundingDivideByPOT( + (shifted_scale * exp_in_0).raw(), num_bits_over_unit + 31 - 8); + + output_data[Offset(output_dims, c, x, y, b)] = + std::max(std::min(unsat_output, 255), 0); + + } else { + output_data[Offset(output_dims, c, x, y, b)] = 0; + } + } + } + } + } +} + +inline void Logistic(const float* input_data, const Dims<4>& input_dims, + float* output_data, const Dims<4>& output_dims) { + gemmlowp::ScopedProfilingLabel label("Logistic"); + auto input_map = MapAsVector(input_data, input_dims); + auto output_map = MapAsVector(output_data, output_dims); + output_map.array() = + input_map.array().unaryExpr(Eigen::internal::scalar_sigmoid_op<float>()); +} + +inline void Logistic(const uint8* input_data, const Dims<4>& input_dims, + int32 input_zero_point, int32 input_range_radius, + int32 input_multiplier, int input_left_shift, + uint8* output_data, const Dims<4>& output_dims) { + gemmlowp::ScopedProfilingLabel label("Logistic"); + const int batches = MatchingArraySize(input_dims, 3, output_dims, 3); + const int height = MatchingArraySize(input_dims, 2, output_dims, 2); + const int width = MatchingArraySize(input_dims, 1, output_dims, 1); + const int depth = MatchingArraySize(input_dims, 0, output_dims, 0); + for (int b = 0; b < batches; ++b) { + for (int y = 0; y < height; ++y) { + for (int x = 0; x < width; ++x) { + for (int c = 0; c < depth; ++c) { + const uint8 input_val_u8 = input_data[Offset(input_dims, c, x, y, b)]; + const int32 input_val_centered = + static_cast<int32>(input_val_u8) - input_zero_point; + uint8 output_val; + if (input_val_centered < -input_range_radius) { + output_val = 0; + } else if (input_val_centered > input_range_radius) { + output_val = 255; + } else { + const int32 input_val_rescaled = + MultiplyByQuantizedMultiplierGreaterThanOne( + input_val_centered, input_multiplier, input_left_shift); + using FixedPoint4 = gemmlowp::FixedPoint<int32, 4>; + using FixedPoint0 = gemmlowp::FixedPoint<int32, 0>; + const FixedPoint4 input_val_f4 = + FixedPoint4::FromRaw(input_val_rescaled); + const FixedPoint0 output_val_f0 = gemmlowp::logistic(input_val_f4); + using gemmlowp::RoundingDivideByPOT; + int32 output_val_s32 = RoundingDivideByPOT(output_val_f0.raw(), 23); + if (output_val_s32 == 256) { + output_val_s32 = 255; + } + DCHECK_GE(output_val_s32, 0); + DCHECK_LE(output_val_s32, 255); + output_val = static_cast<uint8>(output_val_s32); + } + output_data[Offset(output_dims, c, x, y, b)] = output_val; + } + } + } + } +} + +inline void Tanh(const float* input_data, const Dims<4>& input_dims, + float* output_data, const Dims<4>& output_dims) { + gemmlowp::ScopedProfilingLabel label("Tanh"); + auto input_map = 
MapAsVector(input_data, input_dims); + auto output_map = MapAsVector(output_data, output_dims); + output_map.array() = input_map.array().tanh(); +} + +inline void Dequantize(const uint8* input_data, const Dims<4>& input_dims, + int32 zero_point, double scale, float* output_data, + const Dims<4>& output_dims) { + gemmlowp::ScopedProfilingLabel label("Dequantize"); + const int batches = MatchingArraySize(input_dims, 3, output_dims, 3); + const int height = MatchingArraySize(input_dims, 2, output_dims, 2); + const int width = MatchingArraySize(input_dims, 1, output_dims, 1); + const int depth = MatchingArraySize(input_dims, 0, output_dims, 0); + for (int b = 0; b < batches; ++b) { + for (int y = 0; y < height; ++y) { + for (int x = 0; x < width; ++x) { + for (int c = 0; c < depth; ++c) { + int32 val = input_data[Offset(input_dims, c, x, y, b)]; + float result = static_cast<float>(scale * (val - zero_point)); + output_data[Offset(output_dims, c, x, y, b)] = result; + } + } + } + } +} + +inline void FakeQuant(const float* input_data, const Dims<4>& input_dims, + float rmin, float rmax, float* output_data, + const Dims<4>& output_dims) { + gemmlowp::ScopedProfilingLabel label("FakeQuant"); + + // 0 should always be a representable value. Let's assume that the initial + // min,max range contains 0. + DCHECK_LE(rmin, 0.); + DCHECK_GE(rmax, 0.); + + // Determine quantization parameters: zero_point, scale. + using Integer = uint8; + const Integer qmin = std::numeric_limits<Integer>::min(); + const Integer qmax = std::numeric_limits<Integer>::max(); + const float qmin_float = qmin; + const float qmax_float = qmax; + int32 zero_point = 0; + float scale = 0.f; + // If rmin==rmax, both must be zero per the above assertion, + // so we are done. + if (rmin != rmax) { + // First determine the scale. + scale = (rmax - rmin) / (qmax_float - qmin_float); + + // Zero-point computation. + // First the initial floating-point computation. The zero-point can be + // determined from solving an affine equation for any known pair + // (real value, corresponding quantized value). + // We know two such pairs: (rmin, qmin) and (rmax, qmax). + // The arithmetic error on the zero point computed from either pair + // will be roughly machine_epsilon * (sum of absolute values of terms) + // so we want to use the variant that adds the smaller terms. + const float zero_point_from_min = qmin_float - rmin / scale; + const float zero_point_from_max = qmax_float - rmax / scale; + const float zero_point_from_min_error = + std::abs(qmin_float) + std::abs(rmin / scale); + const float zero_point_from_max_error = + std::abs(qmax_float) + std::abs(rmax / scale); + + const float zero_point_float = + zero_point_from_min_error < zero_point_from_max_error + ? zero_point_from_min + : zero_point_from_max; + + // Now we need to nudge the zero point to be an integer + // (our zero points are integer, and this is motivated by the requirement + // to be able to represent the real value "0" exactly as a quantized value, + // which is required in multiple places, for example in Im2col with SAME + // padding). + if (zero_point_float < qmin_float) { + zero_point = qmin; + } else if (zero_point_float > qmax_float) { + zero_point = qmax; + } else { + zero_point = static_cast<int32>(std::round(zero_point_float)); + } + // The zero point should always be in the range of quantized value, + // [qmin, qmax]. 
+ DCHECK_GE(zero_point, qmin); + DCHECK_LE(zero_point, qmax); + } + + const int batches = MatchingArraySize(input_dims, 3, output_dims, 3); + const int height = MatchingArraySize(input_dims, 2, output_dims, 2); + const int width = MatchingArraySize(input_dims, 1, output_dims, 1); + const int depth = MatchingArraySize(input_dims, 0, output_dims, 0); + for (int b = 0; b < batches; ++b) { + for (int y = 0; y < height; ++y) { + for (int x = 0; x < width; ++x) { + for (int c = 0; c < depth; ++c) { + const float src_val = input_data[Offset(input_dims, c, x, y, b)]; + const float unclamped_quantized_val = + std::round(zero_point + src_val / scale); + const float quantized_val = std::min( + qmax_float, std::max(qmin_float, unclamped_quantized_val)); + const float dst_val = scale * (quantized_val - zero_point); + output_data[Offset(output_dims, c, x, y, b)] = dst_val; + } + } + } + } +} + +template <typename SrcT, typename DstT> +inline void Cast(const SrcT* input_data, const Dims<4>& input_dims, + DstT* output_data, const Dims<4>& output_dims) { + gemmlowp::ScopedProfilingLabel label("Cast"); + auto input_map = MapAsVector(input_data, input_dims); + auto output_map = MapAsVector(output_data, output_dims); + output_map.array() = input_map.array().template cast<DstT>(); +} + +inline void Floor(const float* input_data, const Dims<4>& input_dims, + float* output_data, const Dims<4>& output_dims) { + gemmlowp::ScopedProfilingLabel label("Floor"); + auto input_map = MapAsVector(input_data, input_dims); + auto output_map = MapAsVector(output_data, output_dims); + output_map.array() = Eigen::floor(input_map.array()); +} + +template <typename T> +inline void Gather(const T* input_data, const Dims<4>& input_dims, + const int32* coords_data, const Dims<4>& coords_dims, + T* output_data, const Dims<4>& output_dims) { + gemmlowp::ScopedProfilingLabel label("Gather"); + DCHECK_EQ(RequiredBufferSizeForDims(output_dims), + RequiredBufferSizeForDims(coords_dims)); + for (int i = 0; i < RequiredBufferSizeForDims(coords_dims); i++) { + DCHECK_GE(coords_data[i], 0); + DCHECK_LT(coords_data[i], RequiredBufferSizeForDims(input_dims)); + output_data[i] = input_data[coords_data[i]]; + } +} + +inline void ResizeBilinear(const float* input_data, const Dims<4>& input_dims, + const int32* output_size_data, + const Dims<4>& output_size_dims, float* output_data, + const Dims<4>& output_dims) { + gemmlowp::ScopedProfilingLabel label("ResizeBilinear"); + int32 batches = MatchingArraySize(input_dims, 3, output_dims, 3); + int32 input_height = ArraySize(input_dims, 2); + int32 input_width = ArraySize(input_dims, 1); + int32 depth = MatchingArraySize(input_dims, 0, output_dims, 0); + + DCHECK_EQ(ArraySize(output_size_dims, 3), 1); + DCHECK_EQ(ArraySize(output_size_dims, 2), 1); + DCHECK_EQ(ArraySize(output_size_dims, 1), 1); + DCHECK_EQ(ArraySize(output_size_dims, 0), 2); + int32 output_height = output_size_data[Offset(output_size_dims, 0, 0, 0, 0)]; + int32 output_width = output_size_data[Offset(output_size_dims, 1, 0, 0, 0)]; + float height_scale = static_cast<float>(input_height) / output_height; + float width_scale = static_cast<float>(input_width) / output_width; + + for (int b = 0; b < batches; ++b) { + for (int y = 0; y < output_height; ++y) { + float input_y = y * height_scale; + int32 y0 = static_cast<int32>(input_y); + int32 y1 = std::min(y0 + 1, input_height - 1); + for (int x = 0; x < output_width; ++x) { + float input_x = x * width_scale; + int32 x0 = static_cast<int32>(input_x); + int32 x1 = std::min(x0 + 1, 
input_width - 1); + for (int c = 0; c < depth; ++c) { + float interpolation = input_data[Offset(input_dims, c, x0, y0, b)] * + (1 - (input_y - y0)) * + (1 - (input_x - x0)) + + input_data[Offset(input_dims, c, x0, y1, b)] * + (input_y - y0) * (1 - (input_x - x0)) + + input_data[Offset(input_dims, c, x1, y0, b)] * + (1 - (input_y - y0)) * (input_x - x0) + + input_data[Offset(input_dims, c, x1, y1, b)] * + (input_y - y0) * (input_x - x0); + output_data[Offset(output_dims, c, x, y, b)] = interpolation; + } + } + } + } +} + +} // namespace optimized_ops +} // namespace rt +} // namespace nnfw + +#if defined OPTIMIZED_OPS_H__IGNORE_DEPRECATED_DECLARATIONS +#undef OPTIMIZED_OPS_H__IGNORE_DEPRECATED_DECLARATIONS +#pragma GCC diagnostic pop +#endif + +#endif // __NNFW_RT_OPTIMIZED_OPS_H__ diff --git a/runtimes/nn/common/operations/internal/optimized/tensor_utils_impl.h b/runtimes/nn/common/operations/internal/optimized/tensor_utils_impl.h new file mode 100644 index 000000000..bf659d0a3 --- /dev/null +++ b/runtimes/nn/common/operations/internal/optimized/tensor_utils_impl.h @@ -0,0 +1,133 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (C) 2017 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __NNFW_RT_TENSOR_UTILS_IMPL_H__ +#define __NNFW_RT_TENSOR_UTILS_IMPL_H__ + +#include "ActivationFunctor.h" + +#ifndef USE_NEON +#if defined(__ARM_NEON__) || defined(__ARM_NEON) +#define USE_NEON +#endif // defined(__ARM_NEON__) || defined(__ARM_NEON) +#endif // USE_NEON + +namespace nnfw { +namespace rt { +namespace tensor_utils { + +// Multiply a matrix by a batch vector, and store results in a batch-size +// vector. +void PortableMatrixBatchVectorMultiplyAccumulate(const float* matrix, + int m_rows, int m_cols, + const float* vector, + int n_batch, float* result, + int result_stride); +void NeonMatrixBatchVectorMultiplyAccumulate(const float* matrix, int m_rows, + int m_cols, const float* vector, + int n_batch, float* result, + int result_stride); + +// Cwise product of two vectors. +void PortableVectorVectorCwiseProduct(const float* vector1, + const float* vector2, int v_size, + float* result); +void NeonVectorVectorCwiseProduct(const float* vector1, const float* vector2, + int v_size, float* result); + +// Cwise product and accumulate of two vectors. Since it's a MAC operation, the +// assumption here is that result array is initialized to valid values. +void PortableVectorVectorCwiseProductAccumulate(const float* vector1, + const float* vector2, + int v_size, float* result); +void NeonVectorVectorCwiseProductAccumulate(const float* vector1, + const float* vector2, int v_size, + float* result); + +// Dot product of two vectors. +float PortableVectorVectorDotProduct(const float* vector1, const float* vector2, + int v_size); + +// Dot product of two batch vectors. 
+void PortableBatchVectorBatchVectorDotProduct(const float* vector1, + const float* vector2, int v_size, + int n_batch, float* result, + int result_stride); + +// Cwise product and accumulate of a vector and a batch-vector. Since it's a MAC +// operation, the assumption here is that result array is initialized to valid +// values. +void PortableVectorBatchVectorCwiseProductAccumulate(const float* vector, + int v_size, + const float* batch_vector, + int n_batch, + float* result); +void NeonVectorBatchVectorCwiseProductAccumulate(const float* vector, + int v_size, + const float* batch_vector, + int n_batch, float* result); + +// Compute "1.0f - elements of vector" (used in CIFG). +void PortableSub1Vector(const float* vector, int v_size, float* result); +void NeonSub1Vector(const float* vector, int v_size, float* result); + +// Clip elements of a vector using a abs_limit value. +void PortableClipVector(const float* vector, int v_size, float abs_limit, + float* result); +void NeonClipVector(const float* vector, int v_size, float abs_limit, + float* result); + +// Batch vector initialization with another vector. +void PortableVectorBatchVectorAssign(const float* vector, int v_size, + int n_batch, float* batch_vector); + +// Apply sigmoid to elements of a vector. +void PortableApplySigmoidToVector(const float* vector, int v_size, + float* result); + +// Apply activation function to elements of a vector. +void PortableApplyActivationToVector(const float* vector, int v_size, + ActivationFn activation, + float* result); + +// Copy vector to another vector. +void PortableCopyVector(const float* vector, int v_size, float* result); + +// Fill vector with 0.f. +void PortableZeroVector(float* vector, int v_size); + +// Limit a float input f between +abs_limit and -abs_limit. +float PortableClip(float f, float abs_limit); + +// Shift left a vector in place with v_size size. +void PortableVectorShiftLeft(float* vector, int v_size, float shift_value); + +// Reduce-sum on a float input vector: +// input_vector: float pointer to input vector. +// input_stride: input vector stride. +// output_vector: float pointer to vector. +// output_size: output vector size. +// reduction_size: number of consecutive elements from input vector which are +// added to get one element of output. +void PortableReductionSumVector(const float* input_vector, int input_stride, + float* output_vector, int output_size, + int reduction_size); +} // namespace tensor_utils +} // namespace rt +} // namespace nnfw + +#endif // __NNFW_RT_TENSOR_UTILS_IMPL_H__ diff --git a/runtimes/nn/common/operations/internal/tensor_utils.cc b/runtimes/nn/common/operations/internal/tensor_utils.cc new file mode 100644 index 000000000..78275bb29 --- /dev/null +++ b/runtimes/nn/common/operations/internal/tensor_utils.cc @@ -0,0 +1,29 @@ +/* + * Copyright 2017 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+#include "tensor_utils.h"
+
+#ifndef USE_NEON
+#if defined(__ARM_NEON__) || defined(__ARM_NEON)
+#define USE_NEON
+#endif  // defined(__ARM_NEON__) || defined(__ARM_NEON)
+#endif  // USE_NEON
+
+#ifdef USE_NEON
+#include "optimized/neon_tensor_utils.h"
+#else
+#include "reference/portable_tensor_utils.h"
+#endif  // USE_NEON
diff --git a/runtimes/nn/common/operations/internal/tensor_utils.h b/runtimes/nn/common/operations/internal/tensor_utils.h
new file mode 100644
index 000000000..df3d4e27b
--- /dev/null
+++ b/runtimes/nn/common/operations/internal/tensor_utils.h
@@ -0,0 +1,123 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2017 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_RT_TENSOR_UTILS_H__
+#define __NNFW_RT_TENSOR_UTILS_H__
+
+#include "ActivationFunctor.h"
+
+namespace nnfw {
+namespace rt {
+namespace tensor_utils {
+
+// Limit a float input f between +abs_limit and -abs_limit.
+float Clip(float f, float abs_limit);
+
+// Multiply a matrix by a batch vector, and store results in a batch-size
+// vector using a stride value provided in result_stride. 'result_stride' gives
+// the number of elements between consecutive result values. For example,
+// result_stride = 1 will cause the output to look like this:
+// [O_1, O_2, ... O_rows] in memory, but result_stride = 3 will cause it to be
+// arranged like this in memory: [O_1, x, x, O_2, x, x, ..., O_rows]
+void MatrixBatchVectorMultiplyAccumulate(const float* matrix, int m_rows,
+                                         int m_cols, const float* vector,
+                                         int n_batch, float* result,
+                                         int result_stride);
+
+// Cwise product of two vectors.
+void VectorVectorCwiseProduct(const float* vector1, const float* vector2,
+                              int v_size, float* result);
+
+// Cwise product and accumulate of two vectors. Since it's a MAC operation, the
+// assumption here is that result array is initialized to valid values.
+void VectorVectorCwiseProductAccumulate(const float* vector1,
+                                        const float* vector2, int v_size,
+                                        float* result);
+
+// Dot product of two vectors.
+float VectorVectorDotProduct(const float* vector1, const float* vector2,
+                             int v_size);
+
+// Dot product of two batch vectors of size n_batch * v_size:
+// vector1 = [x_1_1, x_1_2, ..., x_1_vsize,
+//            x_2_1, x_2_2, ..., x_2_vsize,
+//            ...
+//            x_nbatch_1,..., x_nbatch_vsize]
+// vector2 = [y_1_1, y_1_2, ..., y_1_vsize,
+//            y_2_1, y_2_2, ..., y_2_vsize,
+//            ...
+//            y_nbatch_1,..., y_nbatch_vsize]
+// Then result will be a vector of n_batch size which will be saved with a
+// stride of result_stride in memory starting from 'result':
+// [x_1_1 * y_1_1 + x_1_2 * y_1_2 + ... + x_1_vsize * y_1_vsize,
+//  x_2_1 * y_2_1 + x_2_2 * y_2_2 + ... + x_2_vsize * y_2_vsize,
+//  ...
+//  x_nbatch_1 * y_nbatch_1 + ...
+ x_nbatch_vsize * y_nbatch_vsize] +void BatchVectorBatchVectorDotProduct(const float* vector1, + const float* vector2, int v_size, + int n_batch, float* result, + int result_stride); + +// Cwise product and accumulate of a vector and a batch-vector. Since it's a MAC +// operation, the assumption here is that result array is initialized to valid +// values. +void VectorBatchVectorCwiseProductAccumulate(const float* vector, int v_size, + const float* batch_vector, + int n_batch, float* result); + +// Batch vector initialization with another vector. +void VectorBatchVectorAssign(const float* vector, int v_size, int n_batch, + float* batch_vector); + +// Apply sigmoid to elements of a vector. +void ApplySigmoidToVector(const float* vector, int v_size, float* result); + +// Apply activation function to elements of a vector. +void ApplyActivationToVector(const float* vector, int v_size, + ActivationFn activation, float* result); + +// Copy vector to another vector. +void CopyVector(const float* vector, int v_size, float* result); + +// Compute "1.0f - elements of vector" (used in CIFG). +void Sub1Vector(const float* vector, int v_size, float* result); + +// Fill vector with 0.f. +void ZeroVector(float* vector, int v_size); + +// Clip elements of a vector using a abs_limit value. +void ClipVector(const float* vector, int v_size, float abs_limit, + float* result); + +// Shift left a vector in place with v_size size. +void VectorShiftLeft(float* vector, int v_size, float shift_value); + +// Reduce-sum on a float input vector: +// input_vector: float pointer to input vector. +// input_stride: input vector stride. +// output_vector: float pointer to vector. +// output_size: output vector size. +// reduction_size: number of consecutive elements from input vector which are +// added to get one element of output. +void ReductionSumVector(const float* input_vector, int input_stride, + float* output_vector, int output_size, + int reduction_size); +} // namespace tensor_utils +} // namespace rt +} // namespace nnfw + +#endif // __NNFW_RT_TENSOR_UTILS_H__ diff --git a/runtimes/nn/common/operations/internal/tensor_utils_test.cc b/runtimes/nn/common/operations/internal/tensor_utils_test.cc new file mode 100644 index 000000000..b68982164 --- /dev/null +++ b/runtimes/nn/common/operations/internal/tensor_utils_test.cc @@ -0,0 +1,198 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2017 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "gmock/gmock-matchers.h" +#include "gtest/gtest.h" +#include "tensor_utils.h" + +namespace nnfw { +namespace rt { +namespace tensor_utils { + +namespace { + +using ::testing::FloatNear; +using ::testing::Matcher; + +std::vector<Matcher<float>> ArrayFloatNear(const std::vector<float>& values, + float max_abs_error=1.e-6) { + std::vector<Matcher<float>> matchers; + matchers.reserve(values.size()); + for (const float& v : values) { + matchers.emplace_back(FloatNear(v, max_abs_error)); + } + return matchers; +} + +} // anonymous namespace + +TEST(uKernels, ClipTest) { + constexpr int kVectorSize = 10; + constexpr float kAbsLimit = 2.0; + static float input[kVectorSize] = {0.0, -0.5, 1.0, -1.5, 2.0, + -2.5, 3.0, -3.5, 4.0, -4.5}; + std::vector<float> output(kVectorSize); + ClipVector(input, kVectorSize, kAbsLimit, output.data()); + EXPECT_THAT(output, + ElementsAreArray(ArrayFloatNear( + {0.0, -0.5, 1.0, -1.5, 2.0, -2.0, 2.0, -2.0, 2.0, -2.0}))); +} + +TEST(uKernels, MatrixBatchVectorMultiplyAccumulateTest) { + constexpr int kRow = 3; + constexpr int kCol = 4; + constexpr int kBatch = 2; + static float matrix[kRow * kCol] = {1.0, 2.0, 3.0, 4.0, // + -1.0, -2.0, -3.0, -4.0, // + 1.0, -2.0, 3.0, -4.0}; + static float vector[kCol * kBatch] = {1.0, -1.0, 1.0, -1.0, // + 2.0, -2.0, 2.0, -2.0}; + std::vector<float> output(kRow * kBatch); + std::fill(output.begin(), output.end(), 3.0); + MatrixBatchVectorMultiplyAccumulate(matrix, kRow, kCol, vector, kBatch, + output.data(), /*result_stride=*/1); + EXPECT_THAT(output, ElementsAreArray(ArrayFloatNear({1., 5., 13., // + -1., 7., 23.}))); +} + +TEST(uKernels, VectorVectorCwiseProductTest) { + constexpr int kVectorSize = 10; + static float input1[kVectorSize] = {0.0, -0.5, 1.0, -1.5, 2.0, + -2.5, 3.0, -3.5, 4.0, -4.5}; + static float input2[kVectorSize] = {0.1, -0.1, 0.1, -0.1, 0.1, + -0.1, 0.1, -0.1, 0.1, -0.1}; + std::vector<float> output(kVectorSize); + VectorVectorCwiseProduct(input1, input2, kVectorSize, output.data()); + EXPECT_THAT(output, + ElementsAreArray(ArrayFloatNear( + {0.0, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45}))); +} + +TEST(uKernels, VectorVectorCwiseProductAccumulateTest) { + constexpr int kVectorSize = 10; + static float input1[kVectorSize] = {0.0, -0.5, 1.0, -1.5, 2.0, + -2.5, 3.0, -3.5, 4.0, -4.5}; + static float input2[kVectorSize] = {0.1, -0.1, 0.1, -0.1, 0.1, + -0.1, 0.1, -0.1, 0.1, -0.1}; + std::vector<float> output(kVectorSize); + std::fill(output.begin(), output.end(), 1.0); + VectorVectorCwiseProductAccumulate(input1, input2, kVectorSize, + output.data()); + EXPECT_THAT(output, + ElementsAreArray(ArrayFloatNear( + {1.0, 1.05, 1.1, 1.15, 1.2, 1.25, 1.3, 1.35, 1.4, 1.45}))); +} + +TEST(uKernels, VectorBatchVectorAssignTest) { + constexpr int kVectorSize = 5; + constexpr int kBatchSize = 3; + static float input[kVectorSize] = {0.0, -0.5, 1.0, -1.5, 2.0}; + std::vector<float> output(kVectorSize * kBatchSize); + VectorBatchVectorAssign(input, kVectorSize, kBatchSize, output.data()); + EXPECT_THAT(output, ElementsAreArray(ArrayFloatNear( + {0.0, -0.5, 1.0, -1.5, 2.0, 0.0, -0.5, 1.0, -1.5, 2.0, + 0.0, -0.5, 1.0, -1.5, 2.0}))); +} + +TEST(uKernels, ApplySigmoidToVectorTest) { + constexpr int kVectorSize = 5; + static float input[kVectorSize] = {0.0, -0.5, 1.0, -1.5, 2.0}; + std::vector<float> output(kVectorSize); + ApplySigmoidToVector(input, kVectorSize, output.data()); + EXPECT_THAT(output, ElementsAreArray(ArrayFloatNear( + {0.5, 0.377541, 0.731059, 0.182426, 0.880797}))); +} + +TEST(uKernels, 
ApplyActivationToVectorTest) { + constexpr int kVectorSize = 5; + static float input[kVectorSize] = {0.0, -0.5, 1.0, -1.5, 2.0}; + std::vector<float> output(kVectorSize); + ApplyActivationToVector(input, kVectorSize, kActivationRelu, output.data()); + EXPECT_THAT(output, + ElementsAreArray(ArrayFloatNear({0.0, 0.0, 1.0, 0.0, 2.0}))); + + ApplyActivationToVector(input, kVectorSize, kActivationTanh, output.data()); + EXPECT_THAT(output, ElementsAreArray(ArrayFloatNear( + {0.0, -0.462117, 0.761594, -0.905148, 0.964028}))); +} + +TEST(uKernels, CopyVectorTest) { + constexpr int kVectorSize = 5; + static float input[kVectorSize] = {0.0, -0.5, 1.0, -1.5, 2.0}; + std::vector<float> output(kVectorSize); + CopyVector(input, kVectorSize, output.data()); + EXPECT_THAT(output, + ElementsAreArray(ArrayFloatNear({0.0, -0.5, 1.0, -1.5, 2.0}))); +} + +TEST(uKernels, Sub1VectorTest) { + constexpr int kVectorSize = 5; + static float input[kVectorSize] = {0.0, -0.5, 1.0, -1.5, 2.0}; + std::vector<float> output(kVectorSize); + Sub1Vector(input, kVectorSize, output.data()); + EXPECT_THAT(output, + ElementsAreArray(ArrayFloatNear({1.0, 1.5, 0.0, 2.5, -1.0}))); +} + +TEST(uKernels, ZeroVectorTest) { + constexpr int kVectorSize = 5; + std::vector<float> output(kVectorSize); + ZeroVector(output.data(), kVectorSize); + EXPECT_THAT(output, + ElementsAreArray(ArrayFloatNear({0.0, 0.0, 0.0, 0.0, 0.0}))); +} + +TEST(uKernels, BatchVectorBatchVectorDotProductTest) { + constexpr int kVectorSize = 5; + constexpr int kBatch = 2; + static float input1[kVectorSize * kBatch] = {0.0, -0.5, 1.0, -1.5, 2.0, + -2.5, 3.0, -3.5, 4.0, -4.5}; + static float input2[kVectorSize * kBatch] = {0.1, -0.1, 0.1, -0.1, 0.1, + -0.1, 0.1, -0.1, 0.1, -0.1}; + std::vector<float> output(kBatch); + BatchVectorBatchVectorDotProduct(input1, input2, kVectorSize, kBatch, + output.data(), /*result_stride=*/1); + EXPECT_THAT(output, ElementsAreArray(ArrayFloatNear({0.5, 1.75}))); +} + +TEST(uKernels, VectorShiftLeftTest) { + constexpr int kVectorSize = 5; + static float input[kVectorSize] = {0.0, -0.5, 1.0, -1.5, 2.0}; + std::vector<float> result(kVectorSize); + VectorShiftLeft(input, kVectorSize, 3.0); + result.assign(input, input + kVectorSize); + EXPECT_THAT(result, + ElementsAreArray(ArrayFloatNear({-0.5, 1.0, -1.5, 2.0, 3.0}))); +} + +TEST(uKernels, ReductionSumVectorTest) { + constexpr int kInputVectorSize = 10; + constexpr int kOutputVectorSize = 5; + constexpr int kReductionSize = 2; + static float input[kInputVectorSize] = {0.0, -0.5, 1.0, -1.5, 2.0, + 0.0, -0.5, 1.0, 1.0, 2.0}; + std::vector<float> result(kOutputVectorSize); + ReductionSumVector(input, + /*input_stride=*/1, result.data(), kOutputVectorSize, + kReductionSize); + EXPECT_THAT(result, + ElementsAreArray(ArrayFloatNear({-0.5, -0.5, 2.0, 0.5, 3.0}))); +} + +} // namespace tensor_utils +} // namespace rt +} // namespace nnfw diff --git a/runtimes/nn/common/operations/internal/types.h b/runtimes/nn/common/operations/internal/types.h new file mode 100644 index 000000000..bd5880edd --- /dev/null +++ b/runtimes/nn/common/operations/internal/types.h @@ -0,0 +1,112 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (C) 2017 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __NNFW_RT_TYPES_H__ +#define __NNFW_RT_TYPES_H__ + +#include "compatibility.h" + +namespace nnfw { +namespace rt { + +enum class FusedActivationFunctionType { kNone, kRelu6, kRelu1, kRelu }; + +template <int N> +struct Dims { + int sizes[N]; + int strides[N]; +}; + +struct Shape; + +inline Dims<4> convertShapeToDims(const Shape& shape) { + Dims<4> dims; + for (int i=0; i<4; i++) { + dims.sizes[i] = 1; + } + + if (shape.dimensions.size() == 1) { + dims.sizes[0] = (int)getSizeOfDimension(shape, 0); + } else { + for (int i=0; i<4; i++) { + int src = (int)shape.dimensions.size()-i-1; + if (src >= 0) { + dims.sizes[i] = (int)getSizeOfDimension(shape, src); + } + } + } + + dims.strides[0] = 1; + for (int i = 1; i<4; i++) { + dims.strides[i] = dims.strides[i-1] * dims.sizes[i-1]; + } + return dims; +} + +inline int Offset(const Dims<4>& dims, int i0, int i1, int i2, int i3) { + DCHECK(i0 >= 0 && i0 < dims.sizes[0]); + DCHECK(i1 >= 0 && i1 < dims.sizes[1]); + DCHECK(i2 >= 0 && i2 < dims.sizes[2]); + DCHECK(i3 >= 0 && i3 < dims.sizes[3]); + return i0 * dims.strides[0] + i1 * dims.strides[1] + i2 * dims.strides[2] + + i3 * dims.strides[3]; +} + +// Get array size, DCHECKing that the dim index is in range. +template <int N> +int ArraySize(const Dims<N>& array, int index) { + DCHECK(index >= 0 && index < N); + return array.sizes[index]; +} + +// Get common array size, DCHECKing that they all agree. +template <typename ArrayType1, typename ArrayType2> +int MatchingArraySize(const ArrayType1& array1, int index1, + const ArrayType2& array2, int index2) { + DCHECK_EQ(ArraySize(array1, index1), ArraySize(array2, index2)); + return ArraySize(array1, index1); +} + +template <typename ArrayType1, typename ArrayType2, typename... Args> +int MatchingArraySize(const ArrayType1& array1, int index1, + const ArrayType2& array2, int index2, Args... args) { + DCHECK_EQ(ArraySize(array1, index1), ArraySize(array2, index2)); + return MatchingArraySize(array1, index1, args...); +} + +inline int RequiredBufferSizeForDims(const Dims<4>& dims) { + int max_offset = 0; + for (int i = 0; i < 4; i++) { + max_offset += (dims.sizes[i] - 1) * dims.strides[i]; + } + return max_offset + 1; +} + +template <int N> +bool IsPackedWithoutStrides(const Dims<N>& dims) { + int expected_stride = 1; + for (int d = 0; d < N; d++) { + if (dims.strides[d] != expected_stride) return false; + expected_stride *= dims.sizes[d]; + } + return true; +} + +} // namespace rt +} // namespace nnfw + +#endif // __NNFW_RT_TYPES_H__ |
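A minimal sketch (an editorial illustration, not part of the diff) of how the Dims<4> helpers in types.h above fit together; the include path and the shape values are assumptions chosen for the example:

#include <cassert>
#include "types.h"  // Dims<4>, Offset, RequiredBufferSizeForDims, IsPackedWithoutStrides

int main() {
  using nnfw::rt::Dims;
  // Innermost dimension first, as produced by convertShapeToDims():
  // {depth, width, height, batch}.
  Dims<4> dims;
  const int sizes[4] = {2, 3, 4, 1};
  for (int i = 0; i < 4; i++) dims.sizes[i] = sizes[i];
  dims.strides[0] = 1;
  for (int i = 1; i < 4; i++) dims.strides[i] = dims.strides[i - 1] * dims.sizes[i - 1];

  // strides == {1, 2, 6, 24}; element (c=1, x=2, y=3, b=0) lives at 1 + 2*2 + 3*6 = 23.
  assert(nnfw::rt::Offset(dims, 1, 2, 3, 0) == 23);
  // A packed tensor of this shape needs 2*3*4*1 == 24 elements.
  assert(nnfw::rt::RequiredBufferSizeForDims(dims) == 24);
  assert(nnfw::rt::IsPackedWithoutStrides(dims));
  return 0;
}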