Diffstat (limited to 'runtimes/nn/common')
36 files changed, 10846 insertions, 0 deletions
diff --git a/runtimes/nn/common/CMakeLists.txt b/runtimes/nn/common/CMakeLists.txt new file mode 100644 index 000000000..31d2d8086 --- /dev/null +++ b/runtimes/nn/common/CMakeLists.txt @@ -0,0 +1,31 @@ +SET (CUR_INCS + ${CMAKE_CURRENT_SOURCE_DIR}/include + ${CMAKE_CURRENT_SOURCE_DIR}/operations + ${CMAKE_CURRENT_SOURCE_DIR} +) +SET (INC_DIRS + ${INC_DIRS} + ${CUR_INCS} + PARENT_SCOPE +) + +SET (CUR_SRCS + ${CMAKE_CURRENT_SOURCE_DIR}/CpuExecutor.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/OperationsUtils.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/Utils.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/NNFWKernels.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/operations/Activation.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/operations/Conv2D.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/operations/Concatenation.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/operations/FullyConnected.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/operations/Pooling.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/operations/SimpleMath.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/operations/Reshape.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/Logging.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/operations/DepthwiseConv2D.cpp +) +SET (SRCS + ${SRCS} + ${CUR_SRCS} + PARENT_SCOPE +) diff --git a/runtimes/nn/common/CpuExecutor.cpp b/runtimes/nn/common/CpuExecutor.cpp new file mode 100644 index 000000000..5a8f6f18b --- /dev/null +++ b/runtimes/nn/common/CpuExecutor.cpp @@ -0,0 +1,1324 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (C) 2017 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#define LOG_TAG "CpuExecutor" + +#include "CpuExecutor.h" + +#include "NeuralNetworks.h" +#include "Operations.h" + +#include "NNFWKernels.h" + +#include <sys/mman.h> + +namespace nnfw { +namespace rt { + +// TODO: short term, make share memory mapping and updating a utility function. +// TODO: long term, implement mmap_fd as a hidl IMemory service. 
+bool RunTimePoolInfo::set(const hidl_memory& hidlMemory) { + this->hidlMemory = hidlMemory; + auto memType = hidlMemory.name(); + if (memType == "ashmem") { +#if 0 // REF-ANN Enable if ashmem type and IMemory are use + memory = mapMemory(hidlMemory); + if (memory == nullptr) { + LOG(ERROR) << "Can't map shared memory."; + return false; + } + memory->update(); + buffer = reinterpret_cast<uint8_t*>(static_cast<void*>(memory->getPointer())); + if (buffer == nullptr) { + LOG(ERROR) << "Can't access shared memory."; + return false; + } + return true; +#endif + LOG(ERROR) << "Currently, Not Support \"ashmem\" type"; + return false; + } else if (memType == "mmap_fd") { + size_t size = hidlMemory.size(); + int fd = hidlMemory.handle()->data[0]; + int prot = hidlMemory.handle()->data[1]; + size_t offset = getSizeFromInts(hidlMemory.handle()->data[2], + hidlMemory.handle()->data[3]); + buffer = static_cast<uint8_t*>(mmap(nullptr, size, prot, MAP_SHARED, fd, offset)); + if (buffer == MAP_FAILED) { + LOG(ERROR) << "Can't mmap the file descriptor."; + return false; + } + return true; + } else { + LOG(ERROR) << "unsupported hidl_memory type"; + return false; + } +} + +// Making sure the output data are correctly updated after execution. +bool RunTimePoolInfo::update() { + auto memType = hidlMemory.name(); + if (memType == "ashmem") { +#if 0 // REF-ANN Enable if ashmem type and IMemory are use + memory->commit(); + return true; +#endif + LOG(ERROR) << "Currently, Not Support \"ashmem\" type"; + return false; + } else if (memType == "mmap_fd") { + int prot = hidlMemory.handle()->data[1]; + if (prot & PROT_WRITE) { + size_t size = hidlMemory.size(); + return msync(buffer, size, MS_SYNC) == 0; + } + } + // No-op for other types of memory. + return true; +} + +bool setRunTimePoolInfosFromHidlMemories(std::vector<RunTimePoolInfo>* poolInfos, + const hidl_vec<hidl_memory>& pools) { + poolInfos->resize(pools.size()); + for (size_t i = 0; i < pools.size(); i++) { + auto& poolInfo = (*poolInfos)[i]; + if (!poolInfo.set(pools[i])) { + LOG(ERROR) << "Could not map pool"; + return false; + } + } + return true; +} + +// Updates the RunTimeOperandInfo with the newly calculated shape. +// Allocate the buffer if we need to. +static bool setInfoAndAllocateIfNeeded(RunTimeOperandInfo* info, const Shape& shape) { + // For user-provided model output operands, the parameters must match the Shape + // calculated from the preparation step. + if (info->lifetime == OperandLifeTime::MODEL_OUTPUT) { + if (info->type != shape.type || + info->dimensions != shape.dimensions) { + LOG(ERROR) << "Invalid type or dimensions for model output"; + return false; + } + if (info->type == OperandType::TENSOR_QUANT8_ASYMM && + (info->scale != shape.scale || info->zeroPoint != shape.offset)) { + LOG(ERROR) << "Invalid scale or zeroPoint for model output"; + return false; + } + } + info->type = shape.type; + info->dimensions = shape.dimensions; + info->scale = shape.scale; + info->zeroPoint = shape.offset; + if (info->lifetime == OperandLifeTime::TEMPORARY_VARIABLE && info->buffer == nullptr) { + uint32_t length = sizeOfData(info->type, info->dimensions); + info->buffer = new uint8_t[length]; + if (info->buffer == nullptr) { + return false; + } + } + return true; +} + +// Ignore the .pools entry in model and request. This will have been taken care of +// by the caller. 
+int CpuExecutor::run(const Model& model, const Request& request, + const std::vector<RunTimePoolInfo>& modelPoolInfos, + const std::vector<RunTimePoolInfo>& requestPoolInfos) { + VLOG(CPUEXE) << "CpuExecutor::run()"; + // VLOG(CPUEXE) << "model: " << toString(model); +#if 0 // REF-ANN + VLOG(CPUEXE) << "request: " << toString(request); +#endif + + // Prepare NNFW_KERNELS + nnfw::rt::init_nnfw_kernels(); + + mModel = &model; + mRequest = &request; // TODO check if mRequest is needed + initializeRunTimeInfo(modelPoolInfos, requestPoolInfos); + // The model has serialized the operation in execution order. + for (const auto& operation : model.operations) { + int n = executeOperation(operation); + if (n != ANEURALNETWORKS_NO_ERROR) { + return n; + } + } + for (auto runtimeInfo : modelPoolInfos) { + runtimeInfo.update(); + } + for (auto runtimeInfo : requestPoolInfos) { + runtimeInfo.update(); + } + mModel = nullptr; + mRequest = nullptr; + VLOG(CPUEXE) << "Completed run normally"; + return ANEURALNETWORKS_NO_ERROR; +} + +bool CpuExecutor::initializeRunTimeInfo(const std::vector<RunTimePoolInfo>& modelPoolInfos, + const std::vector<RunTimePoolInfo>& requestPoolInfos) { + VLOG(CPUEXE) << "CpuExecutor::initializeRunTimeInfo"; + const size_t count = mModel->operands.size(); + mOperands.resize(count); + + // Start by setting the runtime info to what's in the model. + for (size_t i = 0; i < count; i++) { + const Operand& from = mModel->operands[i]; + RunTimeOperandInfo& to = mOperands[i]; + to.type = from.type; + to.dimensions = from.dimensions; + to.scale = from.scale; + to.zeroPoint = from.zeroPoint; + to.length = from.location.length; + to.lifetime = from.lifetime; + switch (from.lifetime) { + case OperandLifeTime::TEMPORARY_VARIABLE: + to.buffer = nullptr; + to.numberOfUsesLeft = from.numberOfConsumers; + break; + case OperandLifeTime::CONSTANT_COPY: + to.buffer = const_cast<uint8_t*>(&mModel->operandValues[from.location.offset]); + to.numberOfUsesLeft = 0; + break; + case OperandLifeTime::CONSTANT_REFERENCE: { + auto poolIndex = from.location.poolIndex; + nnAssert(poolIndex < modelPoolInfos.size()); + auto& r = modelPoolInfos[poolIndex]; + to.buffer = r.buffer + from.location.offset; + to.numberOfUsesLeft = 0; + break; + } + case OperandLifeTime::MODEL_INPUT: + case OperandLifeTime::MODEL_OUTPUT: + case OperandLifeTime::NO_VALUE: + to.buffer = nullptr; + to.numberOfUsesLeft = 0; + break; + default: + nnAssert(false); + break; + } + } + + // Adjust the runtime info for the arguments passed to the model, + // modifying the buffer location, and possibly the dimensions. + auto updateForArguments = [this, &requestPoolInfos](const std::vector<uint32_t>& indexes, + const hidl_vec<RequestArgument>& arguments) { + nnAssert(indexes.size() == arguments.size()); + for (size_t i = 0; i < indexes.size(); i++) { + const uint32_t operandIndex = indexes[i]; + const RequestArgument& from = arguments[i]; + RunTimeOperandInfo& to = mOperands[operandIndex]; + if (from.dimensions.size() > 0) { + // It's the responsibility of the caller to validate that + // from.dimensions only modifies the dimensions that were + // unspecified in the model. That's the case in SampleDriver.cpp + // with the call to validateRequest(). + // TODO make sure that's the case for the default CPU path. 
+ to.dimensions = from.dimensions; + } + if (from.hasNoValue) { + to.lifetime = OperandLifeTime::NO_VALUE; + nnAssert(to.buffer == nullptr); + } else { + auto poolIndex = from.location.poolIndex; + nnAssert(poolIndex < requestPoolInfos.size()); + auto& r = requestPoolInfos[poolIndex]; + to.buffer = r.buffer + from.location.offset; + } + } + }; + updateForArguments(mModel->inputIndexes, mRequest->inputs); + updateForArguments(mModel->outputIndexes, mRequest->outputs); + + return true; +} + +void CpuExecutor::freeNoLongerUsedOperands(const std::vector<uint32_t>& inputs) { + for (uint32_t i : inputs) { + auto& info = mOperands[i]; + // Check if it's a static or model input/output. + if (info.numberOfUsesLeft == 0) { + continue; + } + info.numberOfUsesLeft--; + if (info.numberOfUsesLeft == 0) { + nnAssert(info.buffer != nullptr); + delete[] info.buffer; + info.buffer = nullptr; + } + } +} + +#ifdef NNFW_KERNEL +#error NNFW_KERNEL should not be defined elsewhere. +#else +#define NNFW_KERNEL(_func_name_, _kernel_name_) \ + auto _func_name_ = _kernel_name_; \ + { \ + auto target = std::getenv("NNFW_KERNEL_" #_kernel_name_); \ + if (target != nullptr) \ + { \ + auto it = nnfw_kernels_##_kernel_name_.find(target); \ + if (it != nnfw_kernels_##_kernel_name_.end()) \ + { \ + _func_name_ = it->second; \ + } \ + } \ + } +#endif + +int CpuExecutor::executeOperation(const Operation& operation) { + // VLOG(CPUEXE) << "CpuExecutor::executeOperation(" << toString(operation) << ")"; + const hidl_vec<uint32_t>& ins = operation.inputs; + const hidl_vec<uint32_t>& outs = operation.outputs; + bool success = false; + + // Function to verify that the number of input and output parameters + // matches what is expected. Also checks that all the parameters have + // values. This function is to be used only for operations that do not + // accept optional arguments. + // TODO Have a version that works for optional arguments. + auto allParametersPresent = [&operation, &ins, &outs, this](size_t requiredIns, + size_t requiredOuts) -> bool { + auto verify = [&operation, this](size_t requiredCount, const hidl_vec<uint32_t>& indexes, + const char* type) -> bool { + size_t actualCount = indexes.size(); + if (actualCount != requiredCount) { + LOG(ERROR) << getOperationName(operation.type) + << ": Invalid number of " << type << " operands. 
Got " << actualCount + << " of " << requiredCount; + return false; + } + for (size_t i = 0; i < actualCount; i++) { + if (mOperands[indexes[i]].lifetime == OperandLifeTime::NO_VALUE) { + LOG(ERROR) << getOperationName(operation.type) << " " << type + << " operand " << i << " is required but missing."; + return false; + } + } + return true; + }; + return verify(requiredIns, ins, "in") && verify(requiredOuts, outs, "out"); + }; + + switch (operation.type) { +#if 0 // REF-ANN + case OperationType::OEM_OPERATION: { + LOG(ERROR) << "OEM operation not supported for CPU execution"; + success = false; + } break; +#endif + case OperationType::ADD: { + if (!allParametersPresent(3, 1)) { + return ANEURALNETWORKS_BAD_DATA; + } + const RunTimeOperandInfo& in1 = mOperands[ins[0]]; + const RunTimeOperandInfo& in2 = mOperands[ins[1]]; + int32_t activation = getScalarData<int32_t>(mOperands[ins[2]]); + + RunTimeOperandInfo& out = mOperands[outs[0]]; + Shape outShape = out.shape(); + + if (in1.type == OperandType::TENSOR_FLOAT32) { + success = addMulPrepare(in1.shape(), in2.shape(), &outShape) && + setInfoAndAllocateIfNeeded(&out, outShape) && + addFloat32(reinterpret_cast<const float*>(in1.buffer), + in1.shape(), + reinterpret_cast<const float*>(in2.buffer), + in2.shape(), + activation, + reinterpret_cast<float*>(out.buffer), + outShape); + } else if (in1.type == OperandType::TENSOR_QUANT8_ASYMM) { + success = addMulPrepare(in1.shape(), in2.shape(), &outShape) && + setInfoAndAllocateIfNeeded(&out, outShape) && + addQuant8(reinterpret_cast<const uint8_t*>(in1.buffer), + in1.shape(), + reinterpret_cast<const uint8_t*>(in2.buffer), + in2.shape(), + activation, + reinterpret_cast<uint8_t*>(out.buffer), + outShape); + } + } break; + case OperationType::MUL: { + if (!allParametersPresent(3, 1)) { + return ANEURALNETWORKS_BAD_DATA; + } + const RunTimeOperandInfo& in1 = mOperands[ins[0]]; + const RunTimeOperandInfo& in2 = mOperands[ins[1]]; + int32_t activation = getScalarData<int32_t>(mOperands[ins[2]]); + + RunTimeOperandInfo& out = mOperands[outs[0]]; + Shape outShape = out.shape(); + + if (in1.type == OperandType::TENSOR_FLOAT32) { + success = addMulPrepare(in1.shape(), in2.shape(), &outShape) && + setInfoAndAllocateIfNeeded(&out, outShape) && + mulFloat32(reinterpret_cast<const float*>(in1.buffer), + in1.shape(), + reinterpret_cast<const float*>(in2.buffer), + in2.shape(), + activation, + reinterpret_cast<float*>(out.buffer), + outShape); + } else if (in1.type == OperandType::TENSOR_QUANT8_ASYMM) { + success = addMulPrepare(in1.shape(), in2.shape(), &outShape) && + setInfoAndAllocateIfNeeded(&out, outShape) && + mulQuant8(reinterpret_cast<const uint8_t*>(in1.buffer), + in1.shape(), + reinterpret_cast<const uint8_t*>(in2.buffer), + in2.shape(), + activation, + reinterpret_cast<uint8_t*>(out.buffer), + outShape); + } + } break; +#if 0 // REF-ANN + case OperationType::FLOOR: { + if (!allParametersPresent(1, 1)) { + return ANEURALNETWORKS_BAD_DATA; + } + const RunTimeOperandInfo& input = mOperands[ins[0]]; + RunTimeOperandInfo& output = mOperands[outs[0]]; + Shape outShape = output.shape(); + + if (input.type == OperandType::TENSOR_FLOAT32) { + success = floorPrepare(input.shape(), &outShape) && + setInfoAndAllocateIfNeeded(&output, outShape) && + floorFloat32(reinterpret_cast<const float*>(input.buffer), + reinterpret_cast<float*>(output.buffer), + outShape); + } + } break; + case OperationType::DEQUANTIZE: { + if (!allParametersPresent(1, 1)) { + return ANEURALNETWORKS_BAD_DATA; + } + const RunTimeOperandInfo& 
input = mOperands[ins[0]]; + RunTimeOperandInfo& output = mOperands[outs[0]]; + Shape outShape = output.shape(); + + if (input.type == OperandType::TENSOR_QUANT8_ASYMM) { + success = dequantizePrepare(input.shape(), &outShape) && + setInfoAndAllocateIfNeeded(&output, outShape) && + dequantizeQuant8ToFloat32( + reinterpret_cast<const uint8_t*>(input.buffer), + reinterpret_cast<float*>(output.buffer), + input.shape()); + } + } break; +#endif + case OperationType::DEPTHWISE_CONV_2D: { + const size_t inCount = ins.size(); + if ((inCount != 11 && inCount != 8) || + !allParametersPresent(inCount, 1)) { + return ANEURALNETWORKS_BAD_DATA; + } + const RunTimeOperandInfo& input = mOperands[ins[0]]; + const RunTimeOperandInfo& filter = mOperands[ins[1]]; + const RunTimeOperandInfo& bias = mOperands[ins[2]]; + + int32_t padding_left, padding_right; + int32_t padding_top, padding_bottom; + int32_t stride_width, stride_height; + int32_t depth_multiplier; + int32_t activation; + + if (inCount == 11) { + padding_left = getScalarData<int32_t>(mOperands[ins[3]]); + padding_right = getScalarData<int32_t>(mOperands[ins[4]]); + padding_top = getScalarData<int32_t>(mOperands[ins[5]]); + padding_bottom = getScalarData<int32_t>(mOperands[ins[6]]); + stride_width = getScalarData<int32_t>(mOperands[ins[7]]); + stride_height = getScalarData<int32_t>(mOperands[ins[8]]); + depth_multiplier = getScalarData<int32_t>(mOperands[ins[9]]); + activation = getScalarData<int32_t>(mOperands[ins[10]]); + } else { + int32_t padding_implicit = getScalarData<int32_t>(mOperands[ins[3]]); + stride_width = getScalarData<int32_t>(mOperands[ins[4]]); + stride_height = getScalarData<int32_t>(mOperands[ins[5]]); + depth_multiplier = getScalarData<int32_t>(mOperands[ins[6]]); + activation = getScalarData<int32_t>(mOperands[ins[7]]); + + Shape inputShape = input.shape(); + Shape filterShape = filter.shape(); + int32_t input_width = getSizeOfDimension(inputShape, 2); + int32_t input_height = getSizeOfDimension(inputShape, 1); + int32_t filter_width = getSizeOfDimension(filterShape, 2); + int32_t filter_height = getSizeOfDimension(filterShape, 1); + calculateExplicitPadding(input_width, stride_width, + filter_width, padding_implicit, + &padding_left, &padding_right); + calculateExplicitPadding(input_height, stride_height, + filter_height, padding_implicit, + &padding_top, &padding_bottom); + } + + RunTimeOperandInfo& output = mOperands[outs[0]]; + Shape outShape = output.shape(); + + if (input.type == OperandType::TENSOR_FLOAT32) { + NNFW_KERNEL(func, depthwiseConvFloat32); + success = depthwiseConvPrepare(input.shape(), filter.shape(), bias.shape(), + padding_left, padding_right, + padding_top, padding_bottom, + stride_width, stride_height, + &outShape) && + setInfoAndAllocateIfNeeded(&output, outShape) && + func(reinterpret_cast<const float*>(input.buffer), + input.shape(), + reinterpret_cast<const float*>(filter.buffer), + filter.shape(), + reinterpret_cast<const float*>(bias.buffer), + bias.shape(), + padding_left, padding_right, + padding_top, padding_bottom, + stride_width, stride_height, + depth_multiplier, activation, + reinterpret_cast<float*>(output.buffer), + outShape); + } else if (input.type == OperandType::TENSOR_QUANT8_ASYMM) { +#if 0 // REF-ANN We don't support depthwiseConvQuant8 yet + success = depthwiseConvPrepare(input.shape(), filter.shape(), bias.shape(), + padding_left, padding_right, + padding_top, padding_bottom, + stride_width, stride_height, + &outShape) && + setInfoAndAllocateIfNeeded(&output, outShape) && + 
depthwiseConvQuant8(reinterpret_cast<const uint8_t*>(input.buffer), + input.shape(), + reinterpret_cast<const uint8_t*>(filter.buffer), + filter.shape(), + reinterpret_cast<const int32_t*>(bias.buffer), + bias.shape(), + padding_left, padding_right, + padding_top, padding_bottom, + stride_width, stride_height, + depth_multiplier, activation, + reinterpret_cast<uint8_t*>(output.buffer), + outShape); +#else // REF-ANN + LOG(ERROR) << getOperationName(operation.type) << " failed."; + NYI("We dont' support TENSOR_QUANT8_ASYMM yet."); +#endif // REF-ANN + } + + } break; + case OperationType::CONV_2D: { + const size_t inCount = ins.size(); + if ((inCount != 10 && inCount != 7) || + !allParametersPresent(inCount, 1)) { + return ANEURALNETWORKS_BAD_DATA; + } + const RunTimeOperandInfo& input = mOperands[ins[0]]; + const RunTimeOperandInfo& filter = mOperands[ins[1]]; + const RunTimeOperandInfo& bias = mOperands[ins[2]]; + + int32_t padding_left, padding_right; + int32_t padding_top, padding_bottom; + int32_t stride_width, stride_height; + int32_t activation; + + if (inCount == 10) { + padding_left = getScalarData<int32_t>(mOperands[ins[3]]); + padding_right = getScalarData<int32_t>(mOperands[ins[4]]); + padding_top = getScalarData<int32_t>(mOperands[ins[5]]); + padding_bottom = getScalarData<int32_t>(mOperands[ins[6]]); + stride_width = getScalarData<int32_t>(mOperands[ins[7]]); + stride_height = getScalarData<int32_t>(mOperands[ins[8]]); + activation = getScalarData<int32_t>(mOperands[ins[9]]); + } else { + int32_t padding_implicit = getScalarData<int32_t>(mOperands[ins[3]]); + stride_width = getScalarData<int32_t>(mOperands[ins[4]]); + stride_height = getScalarData<int32_t>(mOperands[ins[5]]); + activation = getScalarData<int32_t>(mOperands[ins[6]]); + + Shape inputShape = input.shape(); + Shape filterShape = filter.shape(); + int32_t input_width = getSizeOfDimension(inputShape, 2); + int32_t input_height = getSizeOfDimension(inputShape, 1); + int32_t filter_width = getSizeOfDimension(filterShape, 2); + int32_t filter_height = getSizeOfDimension(filterShape, 1); + calculateExplicitPadding(input_width, stride_width, + filter_width, padding_implicit, + &padding_left, &padding_right); + calculateExplicitPadding(input_height, stride_height, + filter_height, padding_implicit, + &padding_top, &padding_bottom); + } + + RunTimeOperandInfo& output = mOperands[outs[0]]; + Shape outShape = output.shape(); + + if (input.type == OperandType::TENSOR_FLOAT32) { + NNFW_KERNEL(func, convFloat32); + success = convPrepare(input.shape(), filter.shape(), bias.shape(), + padding_left, padding_right, + padding_top, padding_bottom, + stride_width, stride_height, + &outShape) && + setInfoAndAllocateIfNeeded(&output, outShape) && + func(reinterpret_cast<const float*>(input.buffer), input.shape(), + reinterpret_cast<const float*>(filter.buffer), filter.shape(), + reinterpret_cast<const float*>(bias.buffer), bias.shape(), + padding_left, padding_right, + padding_top, padding_bottom, + stride_width, stride_height, activation, + reinterpret_cast<float*>(output.buffer), outShape); + } else if (input.type == OperandType::TENSOR_QUANT8_ASYMM) { + success = convPrepare(input.shape(), filter.shape(), bias.shape(), + padding_left, padding_right, + padding_top, padding_bottom, + stride_width, stride_height, + &outShape) && + setInfoAndAllocateIfNeeded(&output, outShape) && + convQuant8(reinterpret_cast<const uint8_t*>(input.buffer), + input.shape(), + reinterpret_cast<const uint8_t*>(filter.buffer), + filter.shape(), + 
reinterpret_cast<const int32_t*>(bias.buffer), + bias.shape(), + padding_left, padding_right, + padding_top, padding_bottom, + stride_width, stride_height, activation, + reinterpret_cast<uint8_t*>(output.buffer), + outShape); + } + } break; + case OperationType::AVERAGE_POOL_2D: { + const size_t inCount = ins.size(); + if ((inCount != 10 && inCount != 7) || + !allParametersPresent(inCount, 1)) { + return ANEURALNETWORKS_BAD_DATA; + } + const RunTimeOperandInfo& input = mOperands[ins[0]]; + + int32_t padding_left, padding_right; + int32_t padding_top, padding_bottom; + int32_t stride_width, stride_height; + int32_t filter_width, filter_height; + int32_t activation; + + if (inCount == 10) { + padding_left = getScalarData<int32_t>(mOperands[ins[1]]); + padding_right = getScalarData<int32_t>(mOperands[ins[2]]); + padding_top = getScalarData<int32_t>(mOperands[ins[3]]); + padding_bottom = getScalarData<int32_t>(mOperands[ins[4]]); + stride_width = getScalarData<int32_t>(mOperands[ins[5]]); + stride_height = getScalarData<int32_t>(mOperands[ins[6]]); + filter_width = getScalarData<int32_t>(mOperands[ins[7]]); + filter_height = getScalarData<int32_t>(mOperands[ins[8]]); + activation = getScalarData<int32_t>(mOperands[ins[9]]); + } else { + int32_t padding_implicit = getScalarData<int32_t>(mOperands[ins[1]]); + stride_width = getScalarData<int32_t>(mOperands[ins[2]]); + stride_height = getScalarData<int32_t>(mOperands[ins[3]]); + filter_width = getScalarData<int32_t>(mOperands[ins[4]]); + filter_height = getScalarData<int32_t>(mOperands[ins[5]]); + activation = getScalarData<int32_t>(mOperands[ins[6]]); + + Shape inputShape = input.shape(); + int32_t input_width = getSizeOfDimension(inputShape, 2); + int32_t input_height = getSizeOfDimension(inputShape, 1); + calculateExplicitPadding(input_width, stride_width, + filter_width, padding_implicit, + &padding_left, &padding_right); + calculateExplicitPadding(input_height, stride_height, + filter_height, padding_implicit, + &padding_top, &padding_bottom); + } + + RunTimeOperandInfo& output = mOperands[outs[0]]; + Shape outShape = output.shape(); + + if (input.type == OperandType::TENSOR_FLOAT32) { + NNFW_KERNEL(func, averagePoolFloat32); + success = genericPoolingPrepare(input.shape(), + padding_left, padding_right, + padding_top, padding_bottom, + stride_width, stride_height, + filter_width, filter_height, + &outShape) && + setInfoAndAllocateIfNeeded(&output, outShape) && + func(reinterpret_cast<const float*>(input.buffer), + input.shape(), + padding_left, padding_right, + padding_top, padding_bottom, + stride_width, stride_height, + filter_width, filter_height, activation, + reinterpret_cast<float*>(output.buffer), + outShape); + } else if (input.type == OperandType::TENSOR_QUANT8_ASYMM) { + success = genericPoolingPrepare(input.shape(), + padding_left, padding_right, + padding_top, padding_bottom, + stride_width, stride_height, + filter_width, filter_height, + &outShape) && + setInfoAndAllocateIfNeeded(&output, outShape) && + averagePoolQuant8(reinterpret_cast<const uint8_t*>(input.buffer), + input.shape(), + padding_left, padding_right, + padding_top, padding_bottom, + stride_width, stride_height, + filter_width, filter_height, activation, + reinterpret_cast<uint8_t*>(output.buffer), + outShape); + } + } break; +#if 0 // REF-ANN + case OperationType::L2_POOL_2D: { + const size_t inCount = ins.size(); + if ((inCount != 10 && inCount != 7) || + !allParametersPresent(inCount, 1)) { + return ANEURALNETWORKS_BAD_DATA; + } + const RunTimeOperandInfo& 
input = mOperands[ins[0]]; + + int32_t padding_left, padding_right; + int32_t padding_top, padding_bottom; + int32_t stride_width, stride_height; + int32_t filter_width, filter_height; + int32_t activation; + + if (inCount == 10) { + padding_left = getScalarData<int32_t>(mOperands[ins[1]]); + padding_right = getScalarData<int32_t>(mOperands[ins[2]]); + padding_top = getScalarData<int32_t>(mOperands[ins[3]]); + padding_bottom = getScalarData<int32_t>(mOperands[ins[4]]); + stride_width = getScalarData<int32_t>(mOperands[ins[5]]); + stride_height = getScalarData<int32_t>(mOperands[ins[6]]); + filter_width = getScalarData<int32_t>(mOperands[ins[7]]); + filter_height = getScalarData<int32_t>(mOperands[ins[8]]); + activation = getScalarData<int32_t>(mOperands[ins[9]]); + } else { + int32_t padding_implicit = getScalarData<int32_t>(mOperands[ins[1]]); + stride_width = getScalarData<int32_t>(mOperands[ins[2]]); + stride_height = getScalarData<int32_t>(mOperands[ins[3]]); + filter_width = getScalarData<int32_t>(mOperands[ins[4]]); + filter_height = getScalarData<int32_t>(mOperands[ins[5]]); + activation = getScalarData<int32_t>(mOperands[ins[6]]); + + Shape inputShape = input.shape(); + int32_t input_width = getSizeOfDimension(inputShape, 2); + int32_t input_height = getSizeOfDimension(inputShape, 1); + calculateExplicitPadding(input_width, stride_width, + filter_width, padding_implicit, + &padding_left, &padding_right); + calculateExplicitPadding(input_height, stride_height, + filter_height, padding_implicit, + &padding_top, &padding_bottom); + } + + RunTimeOperandInfo& output = mOperands[outs[0]]; + Shape outShape = output.shape(); + + if (input.type == OperandType::TENSOR_FLOAT32) { + success = genericPoolingPrepare(input.shape(), + padding_left, padding_right, + padding_top, padding_bottom, + stride_width, stride_height, + filter_width, filter_height, + &outShape) && + setInfoAndAllocateIfNeeded(&output, outShape) && + l2PoolFloat32(reinterpret_cast<const float*>(input.buffer), + input.shape(), + padding_left, padding_right, + padding_top, padding_bottom, + stride_width, stride_height, + filter_width, filter_height, activation, + reinterpret_cast<float*>(output.buffer), + outShape); + } + } break; +#endif // REF-ANN + case OperationType::MAX_POOL_2D: { + const size_t inCount = ins.size(); + if ((inCount != 10 && inCount != 7) || + !allParametersPresent(inCount, 1)) { + return ANEURALNETWORKS_BAD_DATA; + } + const RunTimeOperandInfo& input = mOperands[ins[0]]; + + int32_t padding_left, padding_right; + int32_t padding_top, padding_bottom; + int32_t stride_width, stride_height; + int32_t filter_width, filter_height; + int32_t activation; + + if (inCount == 10) { + padding_left = getScalarData<int32_t>(mOperands[ins[1]]); + padding_right = getScalarData<int32_t>(mOperands[ins[2]]); + padding_top = getScalarData<int32_t>(mOperands[ins[3]]); + padding_bottom = getScalarData<int32_t>(mOperands[ins[4]]); + stride_width = getScalarData<int32_t>(mOperands[ins[5]]); + stride_height = getScalarData<int32_t>(mOperands[ins[6]]); + filter_width = getScalarData<int32_t>(mOperands[ins[7]]); + filter_height = getScalarData<int32_t>(mOperands[ins[8]]); + activation = getScalarData<int32_t>(mOperands[ins[9]]); + } else { + int32_t padding_implicit = getScalarData<int32_t>(mOperands[ins[1]]); + stride_width = getScalarData<int32_t>(mOperands[ins[2]]); + stride_height = getScalarData<int32_t>(mOperands[ins[3]]); + filter_width = getScalarData<int32_t>(mOperands[ins[4]]); + filter_height = 
getScalarData<int32_t>(mOperands[ins[5]]); + activation = getScalarData<int32_t>(mOperands[ins[6]]); + + Shape inputShape = input.shape(); + int32_t input_width = getSizeOfDimension(inputShape, 2); + int32_t input_height = getSizeOfDimension(inputShape, 1); + calculateExplicitPadding(input_width, stride_width, + filter_width, padding_implicit, + &padding_left, &padding_right); + calculateExplicitPadding(input_height, stride_height, + filter_height, padding_implicit, + &padding_top, &padding_bottom); + } + + RunTimeOperandInfo& output = mOperands[outs[0]]; + Shape outShape = output.shape(); + + if (input.type == OperandType::TENSOR_FLOAT32) { + NNFW_KERNEL(func, maxPoolFloat32); + success = genericPoolingPrepare(input.shape(), + padding_left, padding_right, + padding_top, padding_bottom, + stride_width, stride_height, + filter_width, filter_height, + &outShape) && + setInfoAndAllocateIfNeeded(&output, outShape) && + func(reinterpret_cast<const float*>(input.buffer), + input.shape(), + padding_left, padding_right, + padding_top, padding_bottom, + stride_width, stride_height, + filter_width, filter_height, activation, + reinterpret_cast<float*>(output.buffer), + outShape); + } else if (input.type == OperandType::TENSOR_QUANT8_ASYMM) { + success = genericPoolingPrepare(input.shape(), + padding_left, padding_right, + padding_top, padding_bottom, + stride_width, stride_height, + filter_width, filter_height, + &outShape) && + setInfoAndAllocateIfNeeded(&output, outShape) && + maxPoolQuant8(reinterpret_cast<const uint8_t*>(input.buffer), + input.shape(), + padding_left, padding_right, + padding_top, padding_bottom, + stride_width, stride_height, + filter_width, filter_height, activation, + reinterpret_cast<uint8_t*>(output.buffer), + outShape); + } + + } break; + case OperationType::RELU: { + if (!allParametersPresent(1, 1)) { + return ANEURALNETWORKS_BAD_DATA; + } + const RunTimeOperandInfo& input = mOperands[ins[0]]; + RunTimeOperandInfo& output = mOperands[outs[0]]; + Shape outShape = output.shape(); + + if (input.type == OperandType::TENSOR_FLOAT32) { + success = genericActivationPrepare(input.shape(), &outShape) && + setInfoAndAllocateIfNeeded(&output, outShape) && + reluFloat32(reinterpret_cast<const float*>(input.buffer), + input.shape(), + reinterpret_cast<float*>(output.buffer), + outShape); + } else if (input.type == OperandType::TENSOR_QUANT8_ASYMM) { + success = genericActivationPrepare(input.shape(), &outShape) && + setInfoAndAllocateIfNeeded(&output, outShape) && + reluQuant8(reinterpret_cast<const uint8_t*>(input.buffer), + input.shape(), + reinterpret_cast<uint8_t*>(output.buffer), + outShape); + } + } break; +#if 0 // REF-ANN + case OperationType::RELU1: { + if (!allParametersPresent(1, 1)) { + return ANEURALNETWORKS_BAD_DATA; + } + const RunTimeOperandInfo& input = mOperands[ins[0]]; + RunTimeOperandInfo& output = mOperands[outs[0]]; + Shape outShape = output.shape(); + + if (input.type == OperandType::TENSOR_FLOAT32) { + success = genericActivationPrepare(input.shape(), &outShape) && + setInfoAndAllocateIfNeeded(&output, outShape) && + relu1Float32(reinterpret_cast<const float*>(input.buffer), + input.shape(), + reinterpret_cast<float*>(output.buffer), + outShape); + } else if (input.type == OperandType::TENSOR_QUANT8_ASYMM) { + success = genericActivationPrepare(input.shape(), &outShape) && + setInfoAndAllocateIfNeeded(&output, outShape) && + relu1Quant8(reinterpret_cast<const uint8_t*>(input.buffer), + input.shape(), + reinterpret_cast<uint8_t*>(output.buffer), + outShape); + 
} + } break; +#endif // REF-ANN + case OperationType::RELU6: { + if (!allParametersPresent(1, 1)) { + return ANEURALNETWORKS_BAD_DATA; + } + const RunTimeOperandInfo& input = mOperands[ins[0]]; + RunTimeOperandInfo& output = mOperands[outs[0]]; + Shape outShape = output.shape(); + + if (input.type == OperandType::TENSOR_FLOAT32) { + success = genericActivationPrepare(input.shape(), &outShape) && + setInfoAndAllocateIfNeeded(&output, outShape) && + relu6Float32(reinterpret_cast<const float*>(input.buffer), + input.shape(), + reinterpret_cast<float*>(output.buffer), + outShape); + } else if (input.type == OperandType::TENSOR_QUANT8_ASYMM) { + success = genericActivationPrepare(input.shape(), &outShape) && + setInfoAndAllocateIfNeeded(&output, outShape) && + relu6Quant8(reinterpret_cast<const uint8_t*>(input.buffer), + input.shape(), + reinterpret_cast<uint8_t*>(output.buffer), + outShape); + } + } break; +#if 0 // REF-ANN + case OperationType::TANH: { + if (!allParametersPresent(1, 1)) { + return ANEURALNETWORKS_BAD_DATA; + } + const RunTimeOperandInfo& input = mOperands[ins[0]]; + RunTimeOperandInfo& output = mOperands[outs[0]]; + Shape outShape = output.shape(); + + if (input.type == OperandType::TENSOR_FLOAT32) { + success = genericActivationPrepare(input.shape(), &outShape) && + setInfoAndAllocateIfNeeded(&output, outShape) && + tanhFloat32(reinterpret_cast<const float*>(input.buffer), + input.shape(), + reinterpret_cast<float*>(output.buffer), + outShape); + } + } break; + case OperationType::LOGISTIC: { + if (!allParametersPresent(1, 1)) { + return ANEURALNETWORKS_BAD_DATA; + } + const RunTimeOperandInfo& input = mOperands[ins[0]]; + RunTimeOperandInfo& output = mOperands[outs[0]]; + Shape outShape = output.shape(); + + if (input.type == OperandType::TENSOR_FLOAT32) { + success = genericActivationPrepare(input.shape(), &outShape) && + setInfoAndAllocateIfNeeded(&output, outShape) && + logisticFloat32(reinterpret_cast<const float*>(input.buffer), + input.shape(), + reinterpret_cast<float*>(output.buffer), + outShape); + } else if (input.type == OperandType::TENSOR_QUANT8_ASYMM) { + success = genericActivationPrepare(input.shape(), &outShape) && + setInfoAndAllocateIfNeeded(&output, outShape) && + logisticQuant8(reinterpret_cast<const uint8_t*>(input.buffer), + input.shape(), + reinterpret_cast<uint8_t*>(output.buffer), + outShape); + } + } break; +#endif // REF-ANN + case OperationType::SOFTMAX: { + if (!allParametersPresent(2, 1)) { + return ANEURALNETWORKS_BAD_DATA; + } + RunTimeOperandInfo& input = mOperands[ins[0]]; + float beta = getScalarData<float>(mOperands[ins[1]]); + if (beta <= 0.0f) { + LOG(ERROR) << "beta must be positive for softmax"; + return ANEURALNETWORKS_BAD_DATA; + } + + RunTimeOperandInfo& output = mOperands[outs[0]]; + Shape outShape = output.shape(); + + if (input.type == OperandType::TENSOR_FLOAT32) { + NNFW_KERNEL(func, softmaxFloat32); + success = genericActivationPrepare(input.shape(), &outShape) && + setInfoAndAllocateIfNeeded(&output, outShape) && + func(reinterpret_cast<const float*>(input.buffer), + input.shape(), + beta, + reinterpret_cast<float*>(output.buffer), + output.shape()); + } else if (input.type == OperandType::TENSOR_QUANT8_ASYMM) { + success = genericActivationPrepare(input.shape(), &outShape) && + setInfoAndAllocateIfNeeded(&output, outShape) && + softmaxQuant8(reinterpret_cast<const uint8_t*>(input.buffer), + input.shape(), + beta, + reinterpret_cast<uint8_t*>(output.buffer), + output.shape()); + } + } break; + case 
OperationType::FULLY_CONNECTED: { + if (!allParametersPresent(4, 1)) { + return ANEURALNETWORKS_BAD_DATA; + } + RunTimeOperandInfo& input = mOperands[ins[0]]; + RunTimeOperandInfo& weights = mOperands[ins[1]]; + RunTimeOperandInfo& bias = mOperands[ins[2]]; + + int32_t activation = getScalarData<int32_t>(mOperands[ins[3]]); + + RunTimeOperandInfo& output = mOperands[outs[0]]; + Shape outShape = output.shape(); + + if (input.type == OperandType::TENSOR_FLOAT32) { + NNFW_KERNEL(func, fullyConnectedFloat32); + success = fullyConnectedPrepare(input.shape(), weights.shape(), bias.shape(), + &outShape) && + setInfoAndAllocateIfNeeded(&output, outShape) && + func(reinterpret_cast<const float*>(input.buffer), + input.shape(), + reinterpret_cast<const float*>(weights.buffer), + weights.shape(), + reinterpret_cast<const float*>(bias.buffer), + bias.shape(), + activation, + reinterpret_cast<float*>(output.buffer), + outShape); + } else if (input.type == OperandType::TENSOR_QUANT8_ASYMM) { + success = fullyConnectedPrepare(input.shape(), weights.shape(), bias.shape(), + &outShape) && + setInfoAndAllocateIfNeeded(&output, outShape) && + fullyConnectedQuant8(reinterpret_cast<const uint8_t*>(input.buffer), + input.shape(), + reinterpret_cast<const uint8_t*>(weights.buffer), + weights.shape(), + reinterpret_cast<const int32_t*>(bias.buffer), + bias.shape(), + activation, + reinterpret_cast<uint8_t*>(output.buffer), + outShape); + } + } break; + case OperationType::CONCATENATION: { + if (outs.size() != 1 || ins.size() < 2) { + return ANEURALNETWORKS_BAD_DATA; + } + int numInputTensors = ins.size() - 1; + int32_t axis = getScalarData<int32_t>(mOperands[ins[numInputTensors]]); + + RunTimeOperandInfo& output = mOperands[outs[0]]; + Shape outShape = output.shape(); + + const RunTimeOperandInfo& firstInput = mOperands[ins[0]]; + if (firstInput.type == OperandType::TENSOR_FLOAT32) { + std::vector<Shape> inputShapes(numInputTensors); + std::vector<const float*> inputDataPtrs(numInputTensors); + + for (int i=0; i<numInputTensors; i++) { + RunTimeOperandInfo& input = mOperands[ins[i]]; + inputShapes[i] = input.shape(); + inputDataPtrs[i] = reinterpret_cast<const float*>(input.buffer); + } + NNFW_KERNEL(func, concatenationFloat32); + success = concatenationPrepare(inputShapes, axis, &outShape) && + setInfoAndAllocateIfNeeded(&output, outShape) && + func(inputDataPtrs, inputShapes, axis, + reinterpret_cast<float*>(output.buffer), outShape); + } else if (firstInput.type == OperandType::TENSOR_QUANT8_ASYMM) { + std::vector<Shape> inputShapes(numInputTensors); + std::vector<const uint8_t*> inputDataPtrs(numInputTensors); + + for (int i=0; i<numInputTensors; i++) { + RunTimeOperandInfo& input = mOperands[ins[i]]; + inputShapes[i] = input.shape(); + inputDataPtrs[i] = reinterpret_cast<const uint8_t*>(input.buffer); + } + success = concatenationPrepare(inputShapes, axis, &outShape) && + setInfoAndAllocateIfNeeded(&output, outShape) && + concatenationQuant8(inputDataPtrs, inputShapes, axis, + reinterpret_cast<uint8_t*>(output.buffer), + outShape); + } + } break; +#if 0 // REF-ANN + case OperationType::L2_NORMALIZATION: { + if (!allParametersPresent(1, 1)) { + return ANEURALNETWORKS_BAD_DATA; + } + const RunTimeOperandInfo& input = mOperands[ins[0]]; + RunTimeOperandInfo& output = mOperands[outs[0]]; + Shape outShape = output.shape(); + + if (input.type == OperandType::TENSOR_FLOAT32) { + success = genericNormalizationPrepare(input.shape(), &outShape) && + setInfoAndAllocateIfNeeded(&output, outShape) && + 
l2normFloat32(reinterpret_cast<const float*>(input.buffer), + input.shape(), + reinterpret_cast<float*>(output.buffer), + outShape); + } else if (input.type == OperandType::TENSOR_QUANT8_ASYMM) { + success = genericNormalizationPrepare(input.shape(), &outShape) && + setInfoAndAllocateIfNeeded(&output, outShape) && + l2normQuant8(reinterpret_cast<const uint8_t*>(input.buffer), + input.shape(), + reinterpret_cast<uint8_t*>(output.buffer), + outShape); + } + } break; + case OperationType::LOCAL_RESPONSE_NORMALIZATION: { + if (!allParametersPresent(5, 1)) { + return ANEURALNETWORKS_BAD_DATA; + } + const RunTimeOperandInfo& input = mOperands[ins[0]]; + int32_t radius = getScalarData<int32_t>(mOperands[ins[1]]); + float bias = getScalarData<float>(mOperands[ins[2]]); + float alpha = getScalarData<float>(mOperands[ins[3]]); + float beta = getScalarData<float>(mOperands[ins[4]]); + + RunTimeOperandInfo& output = mOperands[outs[0]]; + Shape outShape = output.shape(); + + if (input.type == OperandType::TENSOR_FLOAT32) { + success = genericNormalizationPrepare(input.shape(), &outShape) && + setInfoAndAllocateIfNeeded(&output, outShape) && + localResponseNormFloat32(reinterpret_cast<const float*>(input.buffer), + input.shape(), + radius, bias, alpha, beta, + reinterpret_cast<float*>(output.buffer), + outShape); + } + } break; +#endif //REF_ANN + case OperationType::RESHAPE: { + if (!allParametersPresent(2, 1)) { + return ANEURALNETWORKS_BAD_DATA; + } + const RunTimeOperandInfo& input = mOperands[ins[0]]; + const RunTimeOperandInfo& targetShape = mOperands[ins[1]]; + + RunTimeOperandInfo& output = mOperands[outs[0]]; + Shape outShape = output.shape(); + + success = reshapePrepare(input.shape(), + reinterpret_cast<const int32_t*>(targetShape.buffer), + getNumberOfElements(targetShape.shape()), + &outShape) && + setInfoAndAllocateIfNeeded(&output, outShape) && + reshapeGeneric(reinterpret_cast<const void*>(input.buffer), + input.shape(), + reinterpret_cast<void*>(output.buffer), + outShape); + } break; +#if 0 //REF-ANN + case OperationType::RESIZE_BILINEAR: { + if (!allParametersPresent(3, 1)) { + return ANEURALNETWORKS_BAD_DATA; + } + const RunTimeOperandInfo& input = mOperands[ins[0]]; + int32_t width = getScalarData<int32_t>(mOperands[ins[1]]); + int32_t height = getScalarData<int32_t>(mOperands[ins[2]]); + + RunTimeOperandInfo& output = mOperands[outs[0]]; + Shape outShape = output.shape(); + + if (input.type == OperandType::TENSOR_FLOAT32) { + success = resizeBilinearPrepare(input.shape(), + width, height, + &outShape) && + setInfoAndAllocateIfNeeded(&output, outShape) && + resizeBilinearFloat32(reinterpret_cast<const float*>(input.buffer), + input.shape(), + reinterpret_cast<float*>(output.buffer), + outShape); + } + } break; + case OperationType::DEPTH_TO_SPACE: { + if (!allParametersPresent(2, 1)) { + return ANEURALNETWORKS_BAD_DATA; + } + const RunTimeOperandInfo& input = mOperands[ins[0]]; + int32_t blockSize = getScalarData<int32_t>(mOperands[ins[1]]); + + RunTimeOperandInfo& output = mOperands[outs[0]]; + Shape outShape = output.shape(); + + success = depthToSpacePrepare(input.shape(), + blockSize, + &outShape) && + setInfoAndAllocateIfNeeded(&output, outShape) && + depthToSpaceGeneric(input.buffer, + input.shape(), + blockSize, + output.buffer, + outShape); + } break; + case OperationType::SPACE_TO_DEPTH: { + if (!allParametersPresent(2, 1)) { + return ANEURALNETWORKS_BAD_DATA; + } + const RunTimeOperandInfo& input = mOperands[ins[0]]; + int32_t blockSize = 
getScalarData<int32_t>(mOperands[ins[1]]); + + RunTimeOperandInfo& output = mOperands[outs[0]]; + Shape outShape = output.shape(); + + success = spaceToDepthPrepare(input.shape(), + blockSize, + &outShape) && + setInfoAndAllocateIfNeeded(&output, outShape) && + spaceToDepthGeneric(input.buffer, + input.shape(), + blockSize, + output.buffer, + outShape); + } break; + case OperationType::EMBEDDING_LOOKUP: { + const RunTimeOperandInfo &values = + mOperands[ins[EmbeddingLookup::kValueTensor]]; + const RunTimeOperandInfo &lookups = + mOperands[ins[EmbeddingLookup::kLookupTensor]]; + RunTimeOperandInfo &output = + mOperands[outs[EmbeddingLookup::kOutputTensor]]; + + Shape outputShape; + EmbeddingLookup lookup(operation, mOperands); + + success = embeddingLookupPrepare(values.shape(), lookups.shape(), &outputShape) && + setInfoAndAllocateIfNeeded(&output, outputShape) && + lookup.Eval(); + } break; + case OperationType::HASHTABLE_LOOKUP: { + const RunTimeOperandInfo &lookups = + mOperands[ins[HashtableLookup::kLookupTensor]]; + const RunTimeOperandInfo &keys = + mOperands[ins[HashtableLookup::kKeyTensor]]; + const RunTimeOperandInfo &values = + mOperands[ins[HashtableLookup::kValueTensor]]; + + RunTimeOperandInfo &output = + mOperands[outs[HashtableLookup::kOutputTensor]]; + RunTimeOperandInfo &hits = + mOperands[outs[HashtableLookup::kHitsTensor]]; + + Shape outputShape, hitShape; + HashtableLookup lookup(operation, mOperands); + + success = hashtableLookupPrepare(lookups.shape(), keys.shape(), values.shape(), + &outputShape, &hitShape) && + setInfoAndAllocateIfNeeded(&output, outputShape) && + setInfoAndAllocateIfNeeded(&hits, hitShape) && + lookup.Eval(); + } break; + case OperationType::LSH_PROJECTION: { + RunTimeOperandInfo &output = + mOperands[outs[LSHProjection::kOutputTensor]]; + + Shape outputShape; + LSHProjection lsh(operation, mOperands); + + success = LSHProjection::Prepare(operation, mOperands, + &outputShape) && + setInfoAndAllocateIfNeeded(&output, outputShape) && + lsh.Eval(); + } break; + case OperationType::LSTM: { + RunTimeOperandInfo &scratch = + mOperands[outs[LSTMCell::kScratchBufferTensor]]; + RunTimeOperandInfo &outputStateOut = + mOperands[outs[LSTMCell::kOutputStateOutTensor]]; + RunTimeOperandInfo &cellStateOut = + mOperands[outs[LSTMCell::kCellStateOutTensor]]; + RunTimeOperandInfo &output = + mOperands[outs[LSTMCell::kOutputTensor]]; + + Shape scratchShape, outputStateShape, cellStateShape, outputShape; + LSTMCell lstm_cell(operation, mOperands); + + success = LSTMCell::Prepare(operation, mOperands, + &scratchShape, &outputStateShape, + &cellStateShape, &outputShape) && + setInfoAndAllocateIfNeeded(&scratch, scratchShape) && + setInfoAndAllocateIfNeeded(&outputStateOut, outputStateShape) && + setInfoAndAllocateIfNeeded(&cellStateOut, cellStateShape) && + setInfoAndAllocateIfNeeded(&output, outputShape) && + lstm_cell.Eval(); + } break; + case OperationType::RNN: { + RunTimeOperandInfo &hiddenStateOut = + mOperands[outs[RNN::kHiddenStateOutTensor]]; + RunTimeOperandInfo &output = + mOperands[outs[RNN::kOutputTensor]]; + + Shape hiddenStateShape, outputShape; + RNN rnn_cell(operation, mOperands); + + success = RNN::Prepare(operation, mOperands, + &hiddenStateShape, &outputShape) && + setInfoAndAllocateIfNeeded(&hiddenStateOut, hiddenStateShape) && + setInfoAndAllocateIfNeeded(&output, outputShape) && + rnn_cell.Eval(); + } break; + case OperationType::SVDF: { + RunTimeOperandInfo &stateOut = + mOperands[outs[SVDF::kStateOutTensor]]; + RunTimeOperandInfo &output = + 
mOperands[outs[SVDF::kOutputTensor]]; + + Shape stateShape, outputShape; + SVDF svdf(operation, mOperands); + + success = SVDF::Prepare(operation, mOperands, + &stateShape, &outputShape) && + setInfoAndAllocateIfNeeded(&stateOut, stateShape) && + setInfoAndAllocateIfNeeded(&output, outputShape) && + svdf.Eval(); + } break; +#endif // REF-ANN + default: +#if 0 // TODO-NNRT : Enable if it is needed. + nnAssert(false); +#endif + NYI(getOperationName(operation.type)); + break; + } + if (!success) { + LOG(ERROR) << getOperationName(operation.type) << " failed."; + return ANEURALNETWORKS_OP_FAILED; + } + + freeNoLongerUsedOperands(ins); + return ANEURALNETWORKS_NO_ERROR; +} + +#ifdef NNFW_KERNEL +#undef NNFW_KERNEL +#else +#error NNFW_KERNEL should be defined +#endif + +} // namespace rt +} // namespace nnfw diff --git a/runtimes/nn/common/Logging.cpp b/runtimes/nn/common/Logging.cpp new file mode 100644 index 000000000..21107bcf9 --- /dev/null +++ b/runtimes/nn/common/Logging.cpp @@ -0,0 +1,51 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "Logging.h" + +namespace nnfw { +namespace rt{ + +BoolConfig::BoolConfig(const std::string &tag, bool default_value) : _value(default_value) +{ + const char *str = getenv(tag.c_str()); + + if (str != nullptr) + { + std::string s = std::string(str); + _value = ((s != "0") && (s != "false") && (s != "FALSE")); + } +} + +VLogging::VLogging() +{ + BoolConfig vlog_enabled("VLOG", false); + _enabled = vlog_enabled.value(); +} + +VLogging& VLogging::access() +{ + static VLogging instance; + return instance; +} + +std::ostream& VLogging::stream() +{ + return std::cout; +} + +} // namespace rt +} // namespace nnfw diff --git a/runtimes/nn/common/NNFWKernels.cpp b/runtimes/nn/common/NNFWKernels.cpp new file mode 100644 index 000000000..dd5c2d2bc --- /dev/null +++ b/runtimes/nn/common/NNFWKernels.cpp @@ -0,0 +1,72 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "CpuExecutor.h" +#include "NeuralNetworks.h" +#include "Operations.h" + +#include "NNFWKernels.h" + +#ifdef USE_NNFW_ACL_KERNELS +#include "kernel/acl/Conv2D.h" +#include "kernel/acl/DepthwiseConv2D.h" +#include "kernel/acl/Pooling.h" +#include "kernel/acl/Softmax.h" +#include "kernel/acl/FullyConnected.h" +#include "kernel/acl/Concatenation.h" +#include "kernel/acl/Reshape.h" +#include "kernel/acl/nnfw_kernel_acl.h" +#endif // USE_NNFW_ACL_KERNELS + +#include <map> + +namespace nnfw { +namespace rt { + +#define NNFW_KERNEL(Name, Ret, Params) \ + NNFW_KERNELS_##Name nnfw_kernels_##Name; + +#include "NNFWKernels.lst" +#undef NNFW_KERNEL + +void init_nnfw_kernels() +{ +#ifdef USE_NNFW_ACL_KERNELS + nnfw::kernel::acl::Initialize(); + + nnfw_kernels_convFloat32["acl"] = nnfw::kernel::acl::convFloat32; + nnfw_kernels_depthwiseConvFloat32["acl"] = nnfw::kernel::acl::depthwiseConvFloat32; + nnfw_kernels_averagePoolFloat32["acl"] = nnfw::kernel::acl::averagePoolFloat32; + nnfw_kernels_maxPoolFloat32["acl"] = nnfw::kernel::acl::maxPoolFloat32; + nnfw_kernels_softmaxFloat32["acl"] = nnfw::kernel::acl::softmaxFloat32; + nnfw_kernels_fullyConnectedFloat32["acl"] = nnfw::kernel::acl::fullyConnectedFloat32; + nnfw_kernels_concatenationFloat32["acl"] = nnfw::kernel::acl::concatenationFloat32; + nnfw_kernels_reshapeGeneric["acl"] = nnfw::kernel::acl::reshapeGeneric; + + nnfw_kernels_convFloat32["neon"] = nnfw::kernel::acl::neon::convFloat32; + nnfw_kernels_depthwiseConvFloat32["neon"] = nnfw::kernel::acl::neon::depthwiseConvFloat32; + nnfw_kernels_averagePoolFloat32["neon"] = nnfw::kernel::acl::neon::averagePoolFloat32; + nnfw_kernels_maxPoolFloat32["neon"] = nnfw::kernel::acl::neon::maxPoolFloat32; + nnfw_kernels_softmaxFloat32["neon"] = nnfw::kernel::acl::neon::softmaxFloat32; + nnfw_kernels_fullyConnectedFloat32["neon"] = nnfw::kernel::acl::neon::fullyConnectedFloat32; + nnfw_kernels_concatenationFloat32["neon"] = nnfw::kernel::acl::neon::concatenationFloat32; + nnfw_kernels_reshapeGeneric["neon"] = nnfw::kernel::acl::reshapeGeneric; +#endif // USE_NNFW_ACL_KERNELS + return; +} + +} // namespace rt +} // namespace nnfw diff --git a/runtimes/nn/common/NNFWKernels.h b/runtimes/nn/common/NNFWKernels.h new file mode 100644 index 000000000..f38431d15 --- /dev/null +++ b/runtimes/nn/common/NNFWKernels.h @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef __NNFW_RT_NNFW_KERNELS_H__ +#define __NNFW_RT_NNFW_KERNELS_H__ + +#include "CpuExecutor.h" +#include "NeuralNetworks.h" +#include "Operations.h" + +#include <map> + +namespace nnfw { +namespace rt { + +#define NNFW_KERNEL(Name, Ret, Params) \ + typedef Ret (*KERNEL_##Name) Params; \ + typedef std::map<std::string, KERNEL_##Name> NNFW_KERNELS_##Name; \ + extern NNFW_KERNELS_##Name nnfw_kernels_##Name; + +#include "NNFWKernels.lst" +#undef NNFW_KERNEL + +void init_nnfw_kernels(); + +} // namespace rt +} // namespace nnfw +#endif // __NNFW_RT_NNFW_KERNELS_H__ diff --git a/runtimes/nn/common/NNFWKernels.lst b/runtimes/nn/common/NNFWKernels.lst new file mode 100644 index 000000000..2a60e0120 --- /dev/null +++ b/runtimes/nn/common/NNFWKernels.lst @@ -0,0 +1,80 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +NNFW_KERNEL(convFloat32, bool, + (const float* inputData, const Shape& inputShape, + const float* filterData, const Shape& filterShape, + const float* biasData, const Shape& biasShape, + int32_t padding_left, int32_t padding_right, + int32_t padding_top, int32_t padding_bottom, + int32_t stride_width, int32_t stride_height, + int32_t activation, + float* outputData, const Shape& outputShape) + ); + +NNFW_KERNEL(depthwiseConvFloat32, bool, + (const float* inputData, const Shape& inputShape, + const float* filterData, const Shape& filterShape, + const float* biasData, const Shape& biasShape, + int32_t padding_left, int32_t padding_right, + int32_t padding_top, int32_t padding_bottom, + int32_t stride_width, int32_t stride_height, + int32_t depth_multiplier, int32_t activation, + float* outputData, const Shape& outputShape) + ); + +NNFW_KERNEL(averagePoolFloat32, bool, + (const float* inputData, const Shape& inputShape, + int32_t padding_left, int32_t padding_right, + int32_t padding_top, int32_t padding_bottom, + int32_t stride_width, int32_t stride_height, + int32_t filter_width, int32_t filter_height, int32_t activation, + float* outputData, const Shape& outputShape) + ); + +NNFW_KERNEL(maxPoolFloat32, bool, + (const float* inputData, const Shape& inputShape, + int32_t padding_left, int32_t padding_right, + int32_t padding_top, int32_t padding_bottom, + int32_t stride_width, int32_t stride_height, + int32_t filter_width, int32_t filter_height, int32_t activation, + float* outputData, const Shape& outputShape) + ); + +NNFW_KERNEL(softmaxFloat32, bool, + (const float* inputData, const Shape& inputShape, + const float beta, + float* outputData, const Shape& outputShape) + ); + +NNFW_KERNEL(fullyConnectedFloat32, bool, + (const float* inputData, const Shape& inputShape, + const float* weights, const Shape& weightsShape, + const float* biasData, const Shape& biasShape, + int32_t activation, + float* outputData, const Shape& outputShape) + ); + +NNFW_KERNEL(concatenationFloat32, bool, + (const std::vector<const float*>& inputDataPtrs, + const std::vector<Shape>& inputShapes, int32_t axis, 
+ float* outputData, const Shape& outputShape) + ); + +NNFW_KERNEL(reshapeGeneric, bool, + (const void* inputData, const Shape& inputShape, + void* outputData, const Shape& outputShape) + ); diff --git a/runtimes/nn/common/OperationsUtils.cpp b/runtimes/nn/common/OperationsUtils.cpp new file mode 100644 index 000000000..04e54d0f3 --- /dev/null +++ b/runtimes/nn/common/OperationsUtils.cpp @@ -0,0 +1,565 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (C) 2017 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#define LOG_TAG "OperationsUtils" + +#include "OperationsUtils.h" +#include "Operations.h" +#include "Utils.h" + +// TODO-NNRT In Android NN, ActivationFunctor.h is included from Utils.h through RNN.h +// Remove this when Utils.h includes RNN.h +#include "ActivationFunctor.h" + +#include <cmath> + +namespace nnfw { +namespace rt { + +bool SameShape(const Shape& in1, const Shape& in2) { + if (in1.type != in2.type || in1.dimensions.size() != in2.dimensions.size()) { + return false; + } + for (size_t i = 0; i < in1.dimensions.size(); i++) { + if (in1.dimensions[i] != in2.dimensions[i]) { + return false; + } + } + return true; +} + +bool SetShape(const Shape& in, Shape* out) { + if (in.type != out->type || in.dimensions.size() != out->dimensions.size()) { + return false; + } + out->dimensions = in.dimensions; + return true; +} + +uint32_t getNumberOfElements(const Shape& shape) { + uint32_t count = 1; + for (size_t i = 0; i < shape.dimensions.size(); i++) { + count *= shape.dimensions[i]; + } + return count; +} + +uint32_t getNumberOfDimensions(const Shape& shape) { + return shape.dimensions.size(); +} + +uint32_t getSizeOfDimension(const Shape& shape, uint32_t dimensionIdx) { + if (dimensionIdx >= shape.dimensions.size()) { + // TODO, log the error + return 0; + } + return shape.dimensions[dimensionIdx]; +} + +bool QuantizeMultiplierSmallerThanOne(double double_multiplier, + int32_t* quantized_multiplier, + int32_t* right_shift) { + NN_OPS_CHECK(double_multiplier >= 0.); + NN_OPS_CHECK(double_multiplier < 1.); + if (double_multiplier == 0.) 
{ + *quantized_multiplier = 0; + *right_shift = 0; + return true; + } + NN_OPS_CHECK(double_multiplier > 0.); + const double q = std::frexp(double_multiplier, right_shift); + *right_shift *= -1; + int64_t q_fixed = static_cast<int64_t>(std::round(q * (1ll << 31))); + NN_OPS_CHECK(q_fixed <= (1ll << 31)); + if (q_fixed == (1ll << 31)) { + q_fixed /= 2; + --*right_shift; + } + NN_OPS_CHECK(*right_shift >= 0); + NN_OPS_CHECK(q_fixed <= std::numeric_limits<int32_t>::max()); + *quantized_multiplier = static_cast<int32_t>(q_fixed); + return true; +} + +bool QuantizeMultiplierGreaterThanOne(double double_multiplier, + int32_t* quantized_multiplier, + int* left_shift) { + NN_OPS_CHECK(double_multiplier > 1.); + const double q = std::frexp(double_multiplier, left_shift); + int64_t q_fixed = static_cast<int64_t>(std::round(q * (1ll << 31))); + NN_OPS_CHECK(q_fixed <= (1ll << 31)); + if (q_fixed == (1ll << 31)) { + q_fixed /= 2; + ++*left_shift; + } + NN_OPS_CHECK(*left_shift >= 0); + NN_OPS_CHECK(q_fixed <= std::numeric_limits<int32_t>::max()); + *quantized_multiplier = static_cast<int32_t>(q_fixed); + return true; +} + +bool GetQuantizedConvolutionMultipler(const Shape& inputShape, + const Shape& filterShape, + const Shape& biasShape, + const Shape& outputShape, + float* multiplier) { + const float input_product_scale = inputShape.scale * filterShape.scale; + const float bias_scale = biasShape.scale; + const float output_scale = outputShape.scale; + + // The following conditions must be guaranteed by the training pipeline. + NN_OPS_CHECK(std::abs(input_product_scale - bias_scale) <= + 1e-6 * std::min(input_product_scale, bias_scale)); + NN_OPS_CHECK(input_product_scale >= 0); + NN_OPS_CHECK(input_product_scale < output_scale); + *multiplier = input_product_scale / output_scale; + return true; +} + +void CalculateActivationRangeUint8(int32_t activation, + const Shape& outputShape, + int32_t* act_min, + int32_t* act_max) { + const int32_t qmin = std::numeric_limits<uint8_t>::min(); + const int32_t qmax = std::numeric_limits<uint8_t>::max(); + + const auto scale = outputShape.scale; + const auto zero_point = outputShape.offset; + + auto quantize = [scale, zero_point](float f) { + return zero_point + static_cast<int32_t>(std::round(f / scale)); + }; + + if (activation == kActivationRelu) { + *act_min = std::max(qmin, quantize(0.0)); + *act_max = qmax; + } else if (activation == kActivationRelu6) { + *act_min = std::max(qmin, quantize(0.0)); + *act_max = std::min(qmax, quantize(6.0)); + } else if (activation == kActivationRelu1) { + *act_min = std::max(qmin, quantize(-1.0)); + *act_max = std::min(qmax, quantize(1.0)); + } else { + *act_min = qmin; + *act_max = qmax; + } +} + +int32_t CalculateInputRadius(int input_integer_bits, int input_left_shift) { + const double max_input_rescaled = 1.0 * ((1 << input_integer_bits) - 1) * + (1ll << (31 - input_integer_bits)) / + (1ll << input_left_shift); + // Tighten bound using floor. Suppose that we could use the exact value. + // After scaling the difference, the result would be at the maximum. Thus we + // must ensure that our value has lower magnitude. 
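+    // For example, with input_integer_bits = 4 and input_left_shift = 20,
+    // max_input_rescaled = 15 * 2^27 / 2^20 = 1920, so the radius returned
+    // below is 1920.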
+ return static_cast<int32_t>(std::floor(max_input_rescaled)); +} + +bool addMulPrepare(const Shape& in1, const Shape& in2, Shape* out) { + NN_OPS_CHECK(getNumberOfDimensions(in1) <= 4 && getNumberOfDimensions(in2) <= 4); + NN_OPS_CHECK(in1.type == in2.type); + if (SameShape(in1, in2)) { + return SetShape(in1, out); + } else { + // BroadcastAdd needed + uint32_t numberOfDims1 = getNumberOfDimensions(in1); + uint32_t numberOfDims2 = getNumberOfDimensions(in2); + uint32_t maxDims = std::max(numberOfDims1, numberOfDims2); + out->dimensions = std::vector<uint32_t>(maxDims); + for (uint32_t i = 1; i <= maxDims; i++) { + uint32_t dim1 = 1; + if (i <= numberOfDims1) { + dim1 = getSizeOfDimension(in1, numberOfDims1 - i); + } + uint32_t dim2 = 1; + if (i <= numberOfDims2) { + dim2 = getSizeOfDimension(in2, numberOfDims2 - i); + } + if (dim1 != dim2 && dim1 != 1 && dim2 != 1) { + LOG(ERROR) << "Dimensions mismatch for BroadcastAdd"; + return false; + } + out->dimensions[maxDims - i] = std::max(dim1, dim2); + } + } + return true; +} + +bool floorPrepare(const Shape& input, Shape* output) { + return SetShape(input, output); +} + +bool dequantizePrepare(const Shape& input, Shape* output) { + if (input.type != OperandType::TENSOR_QUANT8_ASYMM || + output->type != OperandType::TENSOR_FLOAT32) { + LOG(ERROR) << "bad input / output operand type."; + return false; + } + if (input.dimensions.size() != output->dimensions.size()) { + LOG(ERROR) << "input and output tensors don't have the same rank."; + return false; + } + output->dimensions = input.dimensions; + return true; +} + +bool convPrepare(const Shape& input, + const Shape& filter, + const Shape& bias, + int32_t padding_left, int32_t padding_right, + int32_t padding_top, int32_t padding_bottom, + int32_t stride_width, int32_t stride_height, + Shape* output) { + NN_OPS_CHECK(input.type == filter.type); + if (input.type == OperandType::TENSOR_QUANT8_ASYMM) { + NN_OPS_CHECK(bias.type == OperandType::TENSOR_INT32); + } else { + NN_OPS_CHECK(input.type == bias.type); + } + NN_OPS_CHECK(getNumberOfDimensions(input) == 4); + NN_OPS_CHECK(getNumberOfDimensions(filter) == 4); + NN_OPS_CHECK(getNumberOfDimensions(bias) == 1); + + NN_OPS_CHECK(getSizeOfDimension(filter, 0) == getSizeOfDimension(bias, 0)); + NN_OPS_CHECK(getSizeOfDimension(filter, 3) == getSizeOfDimension(input, 3)); + + uint32_t channels_out = getSizeOfDimension(filter, 0); + uint32_t width = getSizeOfDimension(input, 2); + uint32_t height = getSizeOfDimension(input, 1); + uint32_t filterWidth = getSizeOfDimension(filter, 2); + uint32_t filterHeight = getSizeOfDimension(filter, 1); + uint32_t batches = getSizeOfDimension(input, 0); + + uint32_t outWidth = computeOutSize(width, filterWidth, stride_width, + padding_left, padding_right); + uint32_t outHeight = computeOutSize(height, filterHeight, stride_height, + padding_top, padding_bottom); + + output->type = input.type; + output->dimensions = {batches, outHeight, outWidth, channels_out}; + return true; +} + +bool depthwiseConvPrepare(const Shape& input, + const Shape& filter, + const Shape& bias, + int32_t padding_left, int32_t padding_right, + int32_t padding_top, int32_t padding_bottom, + int32_t stride_width, int32_t stride_height, + Shape* output) { + NN_OPS_CHECK(input.type == filter.type); + if (input.type == OperandType::TENSOR_QUANT8_ASYMM) { + NN_OPS_CHECK(bias.type == OperandType::TENSOR_INT32); + } else { + NN_OPS_CHECK(input.type == bias.type); + } + NN_OPS_CHECK(getNumberOfDimensions(input) == 4); + 
NN_OPS_CHECK(getNumberOfDimensions(filter) == 4); + NN_OPS_CHECK(getNumberOfDimensions(bias) == 1); + + NN_OPS_CHECK(getSizeOfDimension(filter, 3) == getSizeOfDimension(bias, 0)); + + uint32_t channels_out = getSizeOfDimension(filter, 3); + uint32_t width = getSizeOfDimension(input, 2); + uint32_t height = getSizeOfDimension(input, 1); + uint32_t filterWidth = getSizeOfDimension(filter, 2); + uint32_t filterHeight = getSizeOfDimension(filter, 1); + uint32_t batches = getSizeOfDimension(input, 0); + + uint32_t outWidth = computeOutSize(width, filterWidth, stride_width, + padding_left, padding_right); + uint32_t outHeight = computeOutSize(height, filterHeight, stride_height, + padding_top, padding_bottom); + + output->type = input.type; + output->dimensions = {batches, outHeight, outWidth, channels_out}; + return true; +} + + +bool genericPoolingPrepare(const Shape& input, + int32_t padding_left, int32_t padding_right, + int32_t padding_top, int32_t padding_bottom, + int32_t stride_width, int32_t stride_height, + int32_t filter_width, int32_t filter_height, + Shape* output) { + NN_OPS_CHECK(getNumberOfDimensions(input) == 4); + + uint32_t batches = getSizeOfDimension(input, 0); + uint32_t width = getSizeOfDimension(input, 2); + uint32_t height = getSizeOfDimension(input, 1); + uint32_t channels_out = getSizeOfDimension(input, 3); + + uint32_t outWidth = computeOutSize(width, filter_width, stride_width, + padding_left, padding_right); + uint32_t outHeight = computeOutSize(height, filter_height, stride_height, + padding_top, padding_bottom); + + output->type = input.type; + output->dimensions = {batches, outHeight, outWidth, channels_out}; + return true; +} + + +bool genericActivationPrepare(const Shape& input, + Shape* output) { + NN_OPS_CHECK(getNumberOfDimensions(input) <= 4); + return SetShape(input, output); +} + +bool fullyConnectedPrepare(const Shape& input, + const Shape& weights, + const Shape& bias, + Shape* output) { + // Check all the parameters of tensor match within themselves and match the + // input configuration. 
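+    // For example, an input of shape [2, 20] (40 elements), weights of shape
+    // [10, 20] and a bias of shape [10] give batch_size = 40 / 20 = 2 and an
+    // output of shape [2, 10].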
+ NN_OPS_CHECK(input.type == weights.type); + if (input.type == OperandType::TENSOR_QUANT8_ASYMM) { + NN_OPS_CHECK(bias.type == OperandType::TENSOR_INT32); + } else { + NN_OPS_CHECK(input.type == bias.type); + } + NN_OPS_CHECK(getNumberOfDimensions(input) >= 2); + uint32_t input_size = getNumberOfElements(input); + uint32_t num_units = getSizeOfDimension(weights, 0); + + // modified to resolve Coverity 118949 (Apr 25, 2018) by hyunsik.yoon + // Original Code: + // uint32_t batch_size = input_size / getSizeOfDimension(weights, 1); + // + // Coverity Detection: Division by zero + // + // Code below is modified code + + uint32_t shape_size = getSizeOfDimension(weights, 1); + if (shape_size == 0) + { + return false; + } + + uint32_t batch_size = input_size / shape_size; + + NN_OPS_CHECK(getSizeOfDimension(bias, 0) == num_units); + NN_OPS_CHECK(getSizeOfDimension(weights, 1) * batch_size == input_size); + NN_OPS_CHECK(getNumberOfDimensions(weights) == 2); + + output->type = input.type; + output->dimensions = {batch_size, num_units}; + + return true; +} + +bool concatenationPrepare(const std::vector<Shape>& inputShapes, + int32_t axis, + Shape* output) { + + int num_inputs = inputShapes.size(); + OperandType input_type = inputShapes[0].type; + uint32_t num_dimensions = getNumberOfDimensions(inputShapes[0]); + + NN_OPS_CHECK(axis >= 0); + NN_OPS_CHECK(axis < (int32_t)num_dimensions); + + int sum_axis = getSizeOfDimension(inputShapes[0], axis); + for (int i = 1; i < num_inputs; ++i) { + NN_OPS_CHECK(getNumberOfDimensions(inputShapes[i]) == num_dimensions); + NN_OPS_CHECK(inputShapes[i].type == inputShapes[0].type); + if (input_type == OperandType::TENSOR_QUANT8_ASYMM) { + NN_OPS_CHECK(inputShapes[0].offset == inputShapes[i].offset); + NN_OPS_CHECK(inputShapes[0].scale == inputShapes[i].scale); + } + for (int d = 0; d < (int32_t)num_dimensions; ++d) { + if (d == axis) { + sum_axis += getSizeOfDimension(inputShapes[i], axis); + } else { + NN_OPS_CHECK(getSizeOfDimension(inputShapes[0], d) == + getSizeOfDimension(inputShapes[i], d)); + } + } + } + + output->type = input_type; + output->dimensions = inputShapes[0].dimensions; + output->dimensions[axis] = sum_axis; + + if (input_type == OperandType::TENSOR_QUANT8_ASYMM) { + NN_OPS_CHECK(inputShapes[0].offset == output->offset); + NN_OPS_CHECK(inputShapes[0].scale == output->scale); + } + + return true; +} + + +bool genericNormalizationPrepare(const Shape& input, Shape* output) { + NN_OPS_CHECK(getNumberOfDimensions(input) == 4); + return SetShape(input, output); +} + +bool reshapePrepare(const Shape& input, + const int32_t* targetDims, + const int32_t targetDimsSize, + Shape* output) { + // Reshape allows one of the targetDims components to have the + // special -1 value, meaning it will be calculated automatically based on the + // input. Here we calculate what that dimension should be so that the number + // of output elements in the same as the number of input elements. 
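+    // For example, reshaping an input with 24 elements to targetDims {2, -1, 4}
+    // resolves the -1 ("stretch") dimension to 24 / (2 * 4) = 3, giving an
+    // output shape of {2, 3, 4}.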
+ int32_t numInputElements = (int32_t) getNumberOfElements(input); + + std::vector<uint32_t> outDims(targetDimsSize); + int32_t numOutputElements = 1; + int32_t strechDim = -1; + for (int32_t i = 0; i < targetDimsSize; ++i) { + int32_t value = targetDims[i]; + if (value == -1) { + NN_OPS_CHECK(strechDim == -1); + strechDim = i; + } else { + numOutputElements *= value; + outDims[i] = (uint32_t)value; + } + } + if (strechDim != -1) { + int32_t strechValue = numInputElements / numOutputElements; + outDims[strechDim] = (uint32_t) strechValue; + numOutputElements *= strechValue; + } + + NN_OPS_CHECK(numInputElements == numOutputElements); + + output->type = input.type; + output->dimensions = outDims; + output->offset = input.offset; + output->scale = input.scale; + + return true; +} + +bool resizeBilinearPrepare(const Shape& input, + int32_t width, + int32_t height, + Shape* output) { + NN_OPS_CHECK(getNumberOfDimensions(input) == 4); + uint32_t batches = getSizeOfDimension(input, 0); + uint32_t channels = getSizeOfDimension(input, 3); + + output->type = input.type; + output->dimensions = {batches, (uint32_t)height, (uint32_t)width, channels}; + + return true; +} + +bool depthToSpacePrepare(const Shape& input, + int32_t blockSize, + Shape* output) { + NN_OPS_CHECK(getNumberOfDimensions(input) == 4); + NN_OPS_CHECK(blockSize > 0); + + uint32_t batches = getSizeOfDimension(input, 0); + uint32_t height = getSizeOfDimension(input, 1); + uint32_t width = getSizeOfDimension(input, 2); + uint32_t channels = getSizeOfDimension(input, 3); + + NN_OPS_CHECK(channels % (blockSize * blockSize) == 0); + output->type = input.type; + output->dimensions = {batches, + height * blockSize, + width * blockSize, + channels / (blockSize * blockSize)}; + output->offset = input.offset; + output->scale = input.scale; + + return true; +} + +bool spaceToDepthPrepare(const Shape& input, + int32_t blockSize, + Shape* output) { + NN_OPS_CHECK(getNumberOfDimensions(input) == 4); + NN_OPS_CHECK(blockSize > 0); + + uint32_t batches = getSizeOfDimension(input, 0); + uint32_t height = getSizeOfDimension(input, 1); + uint32_t width = getSizeOfDimension(input, 2); + uint32_t channels = getSizeOfDimension(input, 3); + + NN_OPS_CHECK(height % blockSize == 0); + NN_OPS_CHECK(width % blockSize == 0); + + output->type = input.type; + output->dimensions = {batches, + height / blockSize, + width / blockSize, + channels * (blockSize * blockSize)}; + output->offset = input.offset; + output->scale = input.scale; + + return true; +} + +bool embeddingLookupPrepare(const Shape &valueShape, + const Shape &lookupShape, + Shape *outputShape) { + NN_OPS_CHECK(getNumberOfDimensions(valueShape) >= 2); + NN_OPS_CHECK(getNumberOfDimensions(lookupShape) == 1); + + const uint32_t rows = getSizeOfDimension(valueShape, 0); + const uint32_t columns = getSizeOfDimension(valueShape, 1); + + const uint32_t lookups = getSizeOfDimension(lookupShape, 0); + + outputShape->type = valueShape.type; + outputShape->dimensions = { lookups, columns }; + for (uint32_t i = 2; i < getNumberOfDimensions(valueShape); i++) { + outputShape->dimensions.push_back(getSizeOfDimension(valueShape, i)); + } + outputShape->offset = valueShape.offset; + outputShape->scale = valueShape.scale; + + return true; +} + +bool hashtableLookupPrepare(const Shape &lookupShape, + const Shape &keyShape, + const Shape &valueShape, + Shape *outputShape, + Shape *hitShape) { + NN_OPS_CHECK(getNumberOfDimensions(lookupShape) == 1); + NN_OPS_CHECK(getNumberOfDimensions(keyShape) == 1); + 
NN_OPS_CHECK(getNumberOfDimensions(valueShape) >= 1); + + const uint32_t lookups = getSizeOfDimension(lookupShape, 0); + const uint32_t keys = getSizeOfDimension(keyShape, 0); + const uint32_t rows = getSizeOfDimension(valueShape, 0); + outputShape->type = valueShape.type; + outputShape->dimensions = { lookups }; + for (uint32_t i = 1; i < getNumberOfDimensions(valueShape); i++) { + outputShape->dimensions.push_back(getSizeOfDimension(valueShape, i)); + } + outputShape->offset = valueShape.offset; + outputShape->scale = valueShape.scale; + + hitShape->type = OperandType::TENSOR_QUANT8_ASYMM; + hitShape->dimensions = { lookups }; + hitShape->offset = 0; + hitShape->scale = 1.f; + + return true; +} + +} // namespace rt +} // namespace nnfw diff --git a/runtimes/nn/common/Utils.cpp b/runtimes/nn/common/Utils.cpp new file mode 100644 index 000000000..7f0adea8e --- /dev/null +++ b/runtimes/nn/common/Utils.cpp @@ -0,0 +1,397 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (C) 2017 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "Utils.h" +#include "NeuralNetworks.h" + +namespace nnfw { +namespace rt { + +#define COUNT(X) (sizeof(X) / sizeof(X[0])) + +const char* kTypeNames[kNumberOfDataTypes] = { + "FLOAT32", "INT32", "UINT32", + "TENSOR_FLOAT32", "TENSOR_INT32", "TENSOR_QUANT8_ASYMM", +}; + +static_assert(COUNT(kTypeNames) == kNumberOfDataTypes, "kTypeNames is incorrect"); + +const char* kTypeNamesOEM[kNumberOfDataTypesOEM] = { + "OEM", "TENSOR_OEM_BYTE", +}; + +static_assert(COUNT(kTypeNamesOEM) == kNumberOfDataTypesOEM, "kTypeNamesOEM is incorrect"); + +// TODO Check if this useful +const char* kErrorNames[] = { + "NO_ERROR", "OUT_OF_MEMORY", "INCOMPLETE", "NULL", "BAD_DATA", +}; + +namespace { + +template <typename EntryType, uint32_t entryCount, uint32_t entryCountOEM> +EntryType tableLookup(const EntryType (&table)[entryCount], + const EntryType (&tableOEM)[entryCountOEM], + uint32_t code) { + if (code < entryCount) { + return table[code]; + } else if (code >= kOEMCodeBase && (code - kOEMCodeBase) < entryCountOEM) { + return tableOEM[code - kOEMCodeBase]; + } else { + nnAssert(!"tableLookup: bad code"); + return EntryType(); + } +} + +}; // anonymous namespace + +const char* kOperationNames[kNumberOfOperationTypes] = { + "ADD", + "AVERAGE_POOL", + "CONCATENATION", + "CONV", + "DEPTHWISE_CONV", + "DEPTH_TO_SPACE", + "DEQUANTIZE", + "EMBEDDING_LOOKUP", + "FLOOR", + "FULLY_CONNECTED", + "HASHTABLE_LOOKUP", + "L2_NORMALIZATION", + "L2_POOL", + "LOCAL_RESPONSE_NORMALIZATION", + "LOGISTIC", + "LSH_PROJECTION", + "LSTM", + "MAX_POOL", + "MUL", + "RELU", + "RELU1", + "RELU6", + "RESHAPE", + "RESIZE_BILINEAR", + "RNN", + "SOFTMAX", + "SPACE_TO_DEPTH", + "SVDF", + "TANH", +}; + +static_assert(COUNT(kOperationNames) == kNumberOfOperationTypes, "kOperationNames is incorrect"); + +const char* kOperationNamesOEM[kNumberOfOperationTypesOEM] = { + "OEM_OPERATION", +}; + 
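+// Operation codes below kNumberOfOperationTypes index kOperationNames directly,
+// while OEM codes are offset by kOEMCodeBase (10000) into kOperationNamesOEM.
+// For example, getOperationName (below) maps code 3 to "CONV" and code 10000
+// to "OEM_OPERATION" via tableLookup above.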
+static_assert(COUNT(kOperationNamesOEM) == kNumberOfOperationTypesOEM, + "kOperationNamesOEM is incorrect"); + +const char* getOperationName(OperationType type) { + uint32_t n = static_cast<uint32_t>(type); + return tableLookup(kOperationNames, kOperationNamesOEM, n); +} + +const uint32_t kSizeOfDataType[]{ + 4, // ANEURALNETWORKS_FLOAT32 + 4, // ANEURALNETWORKS_INT32 + 4, // ANEURALNETWORKS_UINT32 + 4, // ANEURALNETWORKS_TENSOR_FLOAT32 + 4, // ANEURALNETWORKS_TENSOR_INT32 + 1 // ANEURALNETWORKS_TENSOR_SYMMETRICAL_QUANT8 +}; + +static_assert(COUNT(kSizeOfDataType) == kNumberOfDataTypes, "kSizeOfDataType is incorrect"); + +const bool kScalarDataType[]{ + true, // ANEURALNETWORKS_FLOAT32 + true, // ANEURALNETWORKS_INT32 + true, // ANEURALNETWORKS_UINT32 + false, // ANEURALNETWORKS_TENSOR_FLOAT32 + false, // ANEURALNETWORKS_TENSOR_INT32 + false, // ANEURALNETWORKS_TENSOR_SYMMETRICAL_QUANT8 +}; + +static_assert(COUNT(kScalarDataType) == kNumberOfDataTypes, "kScalarDataType is incorrect"); + +const uint32_t kSizeOfDataTypeOEM[]{ + 0, // ANEURALNETWORKS_OEM + 1, // ANEURALNETWORKS_TENSOR_OEM_BYTE +}; + +static_assert(COUNT(kSizeOfDataTypeOEM) == kNumberOfDataTypesOEM, + "kSizeOfDataTypeOEM is incorrect"); + +const bool kScalarDataTypeOEM[]{ + true, // ANEURALNETWORKS_OEM + false, // ANEURALNETWORKS_TENSOR_OEM_BYTE +}; + +static_assert(COUNT(kScalarDataTypeOEM) == kNumberOfDataTypesOEM, + "kScalarDataTypeOEM is incorrect"); + +uint32_t sizeOfData(OperandType type, const std::vector<uint32_t>& dimensions) { + int n = static_cast<int>(type); + + uint32_t size = tableLookup(kSizeOfDataType, kSizeOfDataTypeOEM, n); + + if (tableLookup(kScalarDataType, kScalarDataTypeOEM, n) == true) { + return size; + } + + for (auto d : dimensions) { + size *= d; + } + return size; +} + +// TODO-NNRT : Should be changed to allocate hidl_memory using Allocator. +// And Should change naming to "allocateMemory". +hidl_memory allocateSharedMemory(int64_t size) { + hidl_memory memory; +#if 0 // TODO-NNRT : Use shared memory or hidl memory + + // TODO: should we align memory size to nearest page? doesn't seem necessary... + const std::string& type = "ashmem"; + sp<IAllocator> allocator = IAllocator::getService(type); + allocator->allocate(size, [&](bool success, const hidl_memory& mem) { + if (!success) { + LOG(ERROR) << "unable to allocate " << size << " bytes of " << type; + } else { + memory = mem; + } + }); +#endif + LOG(ERROR) << "Not support to allocate shared memory now."; + return memory; +} + +uint32_t alignBytesNeeded(uint32_t index, size_t length) { + uint32_t pattern; + if (length < 2) { + pattern = 0; // No alignment necessary + } else if (length < 4) { + pattern = 1; // Align on 2-byte boundary + } else { + pattern = 3; // Align on 4-byte boundary + } + uint32_t extra = (~(index - 1)) & pattern; + return extra; +} + +// Validates the type. The used dimensions can be underspecified. 
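+// For example, with allowPartial == true a tensor type may carry dimensions
+// such as {0, 224, 224, 3}, where 0 marks a dimension that is not yet known;
+// with allowPartial == false any zero dimension is rejected as
+// ANEURALNETWORKS_BAD_DATA.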
+int validateOperandType(const ANeuralNetworksOperandType& type, const char* tag, + bool allowPartial) { + if (!allowPartial) { + for (uint32_t i = 0; i < type.dimensionCount; i++) { + if (type.dimensions[i] == 0) { + LOG(ERROR) << tag << " OperandType invalid dimensions[" << i + << "] = " << type.dimensions[i]; + return ANEURALNETWORKS_BAD_DATA; + } + } + } + if (!validCode(kNumberOfDataTypes, kNumberOfDataTypesOEM, type.type)) { + LOG(ERROR) << tag << " OperandType invalid type " << type.type; + return ANEURALNETWORKS_BAD_DATA; + } + if (type.type == ANEURALNETWORKS_TENSOR_QUANT8_ASYMM) { + if (type.zeroPoint < 0 || type.zeroPoint > 255) { + LOG(ERROR) << tag << " OperandType invalid zeroPoint " << type.zeroPoint; + return ANEURALNETWORKS_BAD_DATA; + } + if (type.scale < 0.f) { + LOG(ERROR) << tag << " OperandType invalid scale " << type.scale; + return ANEURALNETWORKS_BAD_DATA; + } + } + + // TODO-NNRT : add 'type.type == ANEURALNETWORKS_OEM_SCALAR' later. + // OEM operaters are not supported now. + if (type.type == ANEURALNETWORKS_FLOAT32 || + type.type == ANEURALNETWORKS_INT32 || + type.type == ANEURALNETWORKS_UINT32) { + if (type.dimensionCount != 0 || type.dimensions != nullptr) { + LOG(ERROR) << tag << " Invalid dimensions for scalar type"; + return ANEURALNETWORKS_BAD_DATA; + } + } + + return ANEURALNETWORKS_NO_ERROR; +} + +int validateOperandList(uint32_t count, const uint32_t* list, uint32_t operandCount, + const char* tag) { + for (uint32_t i = 0; i < count; i++) { + if (list[i] >= operandCount) { + LOG(ERROR) << tag << " invalid operand index at " << i << " = " << list[i] + << ", operandCount " << operandCount; + return ANEURALNETWORKS_BAD_DATA; + } + } + return ANEURALNETWORKS_NO_ERROR; +} + +static bool validOperandIndexes(const hidl_vec<uint32_t> indexes, size_t operandCount) { + for (uint32_t i : indexes) { + if (i >= operandCount) { + LOG(ERROR) << "Index out of range " << i << "/" << operandCount; + return false; + } + } + return true; +} + +static bool validOperands(const hidl_vec<Operand>& operands, const hidl_vec<uint8_t>& operandValues, + size_t poolCount) { + for (auto& operand : operands) { + if (!validCode(kNumberOfDataTypes, kNumberOfDataTypesOEM, + static_cast<uint32_t>(operand.type))) { + LOG(ERROR) << "Invalid operand type "; + return false; + } + /* TODO validate dim with type + if (!validOperandIndexes(operand.dimensions, mDimensions)) { + return false; + } + */ + switch (operand.lifetime) { + case OperandLifeTime::CONSTANT_COPY: + if (operand.location.offset + operand.location.length > operandValues.size()) { + LOG(ERROR) << "OperandValue location out of range. Starts at " + << operand.location.offset << ", length " << operand.location.length + << ", max " << operandValues.size(); + return false; + } + break; + case OperandLifeTime::TEMPORARY_VARIABLE: + case OperandLifeTime::MODEL_INPUT: + case OperandLifeTime::MODEL_OUTPUT: + case OperandLifeTime::NO_VALUE: + if (operand.location.offset != 0 || operand.location.length != 0) { + LOG(ERROR) << "Unexpected offset " << operand.location.offset << " or length " + << operand.location.length << " for runtime location."; + return false; + } + break; + case OperandLifeTime::CONSTANT_REFERENCE: + if (operand.location.poolIndex >= poolCount) { + LOG(ERROR) << "Invalid poolIndex " << operand.location.poolIndex << "/" + << poolCount; + return false; + } + break; + // TODO: Validate that we are within the pool. 
+ default: + LOG(ERROR) << "Invalid lifetime"; + return false; + } + } + return true; +} + +static bool validOperations(const hidl_vec<Operation>& operations, size_t operandCount) { + for (auto& op : operations) { + if (!validCode(kNumberOfOperationTypes, kNumberOfOperationTypesOEM, + static_cast<uint32_t>(op.type))) { + LOG(ERROR) << "Invalid operation type "; + return false; + } + if (!validOperandIndexes(op.inputs, operandCount) || + !validOperandIndexes(op.outputs, operandCount)) { + return false; + } + } + return true; +} + +// TODO doublecheck +bool validateModel(const Model& model) { + const size_t operandCount = model.operands.size(); + return (validOperands(model.operands, model.operandValues, model.pools.size()) && + validOperations(model.operations, operandCount) && + validOperandIndexes(model.inputIndexes, operandCount) && + validOperandIndexes(model.outputIndexes, operandCount)); +} + +bool validRequestArguments(const hidl_vec<RequestArgument>& arguments, + const hidl_vec<uint32_t>& operandIndexes, + const hidl_vec<Operand>& operands, size_t poolCount, + const char* type) { + const size_t argumentCount = arguments.size(); + if (argumentCount != operandIndexes.size()) { + LOG(ERROR) << "Request specifies " << argumentCount << " " << type << "s but the model has " + << operandIndexes.size(); + return false; + } + for (size_t argumentIndex = 0; argumentIndex < argumentCount; argumentIndex++) { + const RequestArgument& argument = arguments[argumentIndex]; + const uint32_t operandIndex = operandIndexes[argumentIndex]; + const Operand& operand = operands[operandIndex]; + if (argument.hasNoValue) { + if (argument.location.poolIndex != 0 || + argument.location.offset != 0 || + argument.location.length != 0 || + argument.dimensions.size() != 0) { + LOG(ERROR) << "Request " << type << " " << argumentIndex + << " has no value yet has details."; + return false; + } + } + if (argument.location.poolIndex >= poolCount) { + LOG(ERROR) << "Request " << type << " " << argumentIndex << " has an invalid poolIndex " + << argument.location.poolIndex << "/" << poolCount; + return false; + } + // TODO: Validate that we are within the pool. 
+ uint32_t rank = argument.dimensions.size(); + if (rank > 0) { + if (rank != operand.dimensions.size()) { + LOG(ERROR) << "Request " << type << " " << argumentIndex + << " has number of dimensions (" << rank + << ") different than the model's (" << operand.dimensions.size() << ")"; + return false; + } + for (size_t i = 0; i < rank; i++) { + if (argument.dimensions[i] != operand.dimensions[i] && + operand.dimensions[i] != 0) { + LOG(ERROR) << "Request " << type << " " << argumentIndex + << " has dimension " << i << " of " << operand.dimensions[i] + << " different than the model's " << operand.dimensions[i]; + return false; + } + if (argument.dimensions[i] == 0) { + LOG(ERROR) << "Request " << type << " " << argumentIndex + << " has dimension " << i << " of zero"; + return false; + } + } + } + } + return true; +} + +// TODO doublecheck +bool validateRequest(const Request& request, const Model& model) { + const size_t poolCount = request.pools.size(); + return (validRequestArguments(request.inputs, model.inputIndexes, model.operands, poolCount, + "input") && + validRequestArguments(request.outputs, model.outputIndexes, model.operands, poolCount, + "output")); +} + +} // namespace rt +} // namespace nnfw diff --git a/runtimes/nn/common/include/ActivationFunctor.h b/runtimes/nn/common/include/ActivationFunctor.h new file mode 100644 index 000000000..788962e4c --- /dev/null +++ b/runtimes/nn/common/include/ActivationFunctor.h @@ -0,0 +1,70 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (C) 2017 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __NNFW_RT_ACTIVATION_FUNCTOR_H__ +#define __NNFW_RT_ACTIVATION_FUNCTOR_H__ + +#if 0 // REF-ANN +#include "android/log.h" +#endif + +#include <algorithm> +#include <cmath> +#include <vector> +#include <cstdint> + +enum ActivationFn { + kActivationNone = 0, + kActivationRelu, + kActivationRelu1, + kActivationRelu6, + kActivationTanh, + kActivationSignBit, + kActivationSigmoid, +}; + +class ActivationFunctor { + public: + explicit ActivationFunctor(ActivationFn act) : act_(act) {} + + float operator()(float a) const { + switch (act_) { + case kActivationNone: + return a; + case kActivationRelu: + return a < 0.f ? 0.f : a; + case kActivationRelu6: + return std::max(0.f, std::min(a, 6.f)); + case kActivationTanh: + return std::tanh(a); + case kActivationSigmoid: + return 1.0f / (1.0f + std::exp(-a)); + default: +#if 0 // REF-ANN + __android_log_print(ANDROID_LOG_ERROR, "NN API", + "Invalid enum value for activation function: 0x%0X", + act_); +#endif + exit(1); + } + } + + private: + ActivationFn act_; +}; + +#endif // __NNFW_RT_ACTIVATION_FUNCTOR_H__ diff --git a/runtimes/nn/common/include/CpuExecutor.h b/runtimes/nn/common/include/CpuExecutor.h new file mode 100644 index 000000000..385a461de --- /dev/null +++ b/runtimes/nn/common/include/CpuExecutor.h @@ -0,0 +1,165 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. 
All Rights Reserved + * Copyright (C) 2017 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __NNFW_RT_CPU_EXECUTOR_H__ +#define __NNFW_RT_CPU_EXECUTOR_H__ + +#include "HalInterfaces.h" +#include "OperationsUtils.h" +#include "Utils.h" + +#include <algorithm> +#include <vector> + +namespace nnfw { +namespace rt { + +// Information we maintain about each operand during execution that +// may change during execution. +struct RunTimeOperandInfo { + // TODO Storing the type here is redundant, as it won't change during execution. + OperandType type; + // The type and dimensions of the operand. The dimensions can + // change at runtime. We include the type because it's useful + // to pass together with the dimension to the functions implementing + // the operators. + std::vector<uint32_t> dimensions; + + float scale; + int32_t zeroPoint; + // Where the operand's data is stored. Check the corresponding + // location information in the model to figure out if this points + // to memory we have allocated for an temporary operand. + uint8_t* buffer; + // The length of the buffer. + uint32_t length; + // Whether this is a temporary variable, a model input, a constant, etc. + OperandLifeTime lifetime; + // Keeps track of how many operations have yet to make use + // of this temporary variable. When the count is decremented to 0, + // we free the buffer. For non-temporary variables, this count is + // always 0. + uint32_t numberOfUsesLeft; + + Shape shape() const { + return Shape{.type = type, .dimensions = dimensions, .scale = scale, .offset = zeroPoint}; + } +}; + +// Used to keep a pointer to each of the memory pools. +struct RunTimePoolInfo { +#if 0 // REF-ANN + sp<IMemory> memory; +#endif + hidl_memory hidlMemory; + uint8_t* buffer; + + bool set(const hidl_memory& hidlMemory); + bool update(); +}; + +bool setRunTimePoolInfosFromHidlMemories(std::vector<RunTimePoolInfo>* poolInfos, + const hidl_vec<hidl_memory>& pools); + +// This class is used to execute a model on the CPU. +class CpuExecutor { +public: + // Executes the model. The results will be stored at the locations + // specified in the constructor. + // The model must outlive the executor. We prevent it from being modified + // while this is executing. + int run(const Model& model, const Request& request, + const std::vector<RunTimePoolInfo>& modelPoolInfos, + const std::vector<RunTimePoolInfo>& requestPoolInfos); + +private: + bool initializeRunTimeInfo(const std::vector<RunTimePoolInfo>& modelPoolInfos, + const std::vector<RunTimePoolInfo>& requestPoolInfos); + // Runs one operation of the graph. + int executeOperation(const Operation& entry); + // Decrement the usage count for the operands listed. Frees the memory + // allocated for any temporary variable with a count of zero. + void freeNoLongerUsedOperands(const std::vector<uint32_t>& inputs); + + // The model and the request that we'll execute. Only valid while run() + // is being executed. 
+ const Model* mModel = nullptr; + const Request* mRequest = nullptr; + + // We're copying the list of all the dimensions from the model, as + // these may be modified when we run the operatins. Since we're + // making a full copy, the indexes used in the operand description + // stay valid. + // std::vector<uint32_t> mDimensions; + // Runtime information about all the operands. + std::vector<RunTimeOperandInfo> mOperands; +}; + +namespace { + +template <typename T> +T getScalarData(const RunTimeOperandInfo& info) { + // TODO: Check buffer is at least as long as size of data. + T* data = reinterpret_cast<T*>(info.buffer); + return data[0]; +} + +inline bool IsNullInput(const RunTimeOperandInfo *input) { + return input->lifetime == OperandLifeTime::NO_VALUE; +} + +#if 0 // REF-ANN +inline int NumInputsWithValues(const Operation &operation, + std::vector<RunTimeOperandInfo> &operands) { + const std::vector<uint32_t> &inputs = operation.inputs; + return std::count_if(inputs.begin(), inputs.end(), + [&operands](uint32_t i) { + return !IsNullInput(&operands[i]); + }); +} + +inline int NumOutputs(const Operation &operation) { + return operation.outputs.size(); +} + +inline size_t NumDimensions(const RunTimeOperandInfo *operand) { + return operand->shape().dimensions.size(); +} + +inline uint32_t SizeOfDimension(const RunTimeOperandInfo *operand, int i) { + return operand->shape().dimensions[i]; +} + +inline RunTimeOperandInfo *GetInput(const Operation &operation, + std::vector<RunTimeOperandInfo> &operands, + int index) { + return &operands[operation.inputs[index]]; +} + +inline RunTimeOperandInfo *GetOutput(const Operation &operation, + std::vector<RunTimeOperandInfo> &operands, + int index) { + return &operands[operation.outputs[index]]; +} +#endif + +} // anonymous namespace + +} // namespace rt +} // namespace nnfw + +#endif // __NNFW_RT_CPU_EXECUTOR_H__ diff --git a/runtimes/nn/common/include/HalInterfaces.h b/runtimes/nn/common/include/HalInterfaces.h new file mode 100644 index 000000000..9a086c09d --- /dev/null +++ b/runtimes/nn/common/include/HalInterfaces.h @@ -0,0 +1,82 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (C) 2017 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef __NNFW_RT_HAL_INTERFACES_H__ +#define __NNFW_RT_HAL_INTERFACES_H__ + +#if 0 // REF-ANN +#include <android/hardware/neuralnetworks/1.0/IDevice.h> +#include <android/hardware/neuralnetworks/1.0/IExecutionCallback.h> +#include <android/hardware/neuralnetworks/1.0/IPreparedModel.h> +#include <android/hardware/neuralnetworks/1.0/IPreparedModelCallback.h> +#endif +#include <android/hardware/neuralnetworks/1.0/types.h> + +#if 0 // REF-ANN +#include <android/hidl/allocator/1.0/IAllocator.h> +#include <android/hidl/memory/1.0/IMemory.h> +#include <hidlmemory/mapping.h> +#endif + +using ::android::hardware::hidl_memory; +using ::android::hardware::hidl_vec; +using ::android::hardware::neuralnetworks::V1_0::DataLocation; +using ::android::hardware::neuralnetworks::V1_0::ErrorStatus; +using ::android::hardware::neuralnetworks::V1_0::FusedActivationFunc; +using ::android::hardware::neuralnetworks::V1_0::Model; +using ::android::hardware::neuralnetworks::V1_0::Operand; +using ::android::hardware::neuralnetworks::V1_0::OperandLifeTime; +using ::android::hardware::neuralnetworks::V1_0::OperandType; +using ::android::hardware::neuralnetworks::V1_0::Operation; +using ::android::hardware::neuralnetworks::V1_0::OperationType; +using ::android::hardware::neuralnetworks::V1_0::PerformanceInfo; +using ::android::hardware::neuralnetworks::V1_0::Request; +using ::android::hardware::neuralnetworks::V1_0::RequestArgument; +#if 0 // REF-ANN +using ::android::hardware::Return; +using ::android::hardware::Void; +using ::android::hardware::hidl_memory; +using ::android::hardware::hidl_string; +using ::android::hardware::hidl_vec; +using ::android::hardware::neuralnetworks::V1_0::Capabilities; +using ::android::hardware::neuralnetworks::V1_0::DataLocation; +using ::android::hardware::neuralnetworks::V1_0::DeviceStatus; +using ::android::hardware::neuralnetworks::V1_0::FusedActivationFunc; +using ::android::hardware::neuralnetworks::V1_0::IDevice; +using ::android::hardware::neuralnetworks::V1_0::IExecutionCallback; +using ::android::hardware::neuralnetworks::V1_0::IPreparedModel; +using ::android::hardware::neuralnetworks::V1_0::IPreparedModelCallback; +using ::android::hardware::neuralnetworks::V1_0::Model; +using ::android::hardware::neuralnetworks::V1_0::Operand; +using ::android::hardware::neuralnetworks::V1_0::OperandLifeTime; +using ::android::hardware::neuralnetworks::V1_0::OperandType; +using ::android::hardware::neuralnetworks::V1_0::Operation; +using ::android::hardware::neuralnetworks::V1_0::OperationType; +using ::android::hardware::neuralnetworks::V1_0::PerformanceInfo; +using ::android::hardware::neuralnetworks::V1_0::Request; +using ::android::hardware::neuralnetworks::V1_0::RequestArgument; +using ::android::hidl::allocator::V1_0::IAllocator; +using ::android::hidl::memory::V1_0::IMemory; +#endif + +namespace nnfw { +namespace rt { + +} // namespace rt +} // namespace nnfw + +#endif // __NNFW_RT_HAL_INTERFACES_H__ diff --git a/runtimes/nn/common/include/Logging.h b/runtimes/nn/common/include/Logging.h new file mode 100644 index 000000000..060458b85 --- /dev/null +++ b/runtimes/nn/common/include/Logging.h @@ -0,0 +1,61 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __NNFW_RT_LOGGING_H__ +#define __NNFW_RT_LOGGING_H__ + +#include <iostream> + +namespace nnfw { +namespace rt { + +// TODO-NNRT Move this to proper place +class BoolConfig +{ +public: + BoolConfig(const std::string &tag, bool default_value); + +public: + bool value(void) const { return _value; } + +private: + bool _value; +}; + +class VLogging +{ +public: + static VLogging& access(void); + bool enabled() const { return _enabled; } + std::ostream& stream(void); + +private: + VLogging(); + +private: + bool _enabled; +}; + +#define LOG(...) std::cout << std::endl +#define VLOG(...) if (VLogging::access().enabled()) \ + (VLogging::access().stream() << std::endl) +#define PLOG(...) LOG(...) +#define NYI(module) std::cout << "NYI : '" << module << "' is not supported now." << std::endl; + +} // namespace rt +} // namespace nnfw + +#endif // __NNFW_RT_LOGGING_H__ diff --git a/runtimes/nn/common/include/Operations.h b/runtimes/nn/common/include/Operations.h new file mode 100644 index 000000000..33730ea17 --- /dev/null +++ b/runtimes/nn/common/include/Operations.h @@ -0,0 +1,203 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (C) 2017 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef __NNFW_RT_OPERATIONS_H__ +#define __NNFW_RT_OPERATIONS_H__ + +#if 0 // REF-ANN +#include "operations/EmbeddingLookup.h" +#include "operations/HashtableLookup.h" +#include "operations/LSHProjection.h" +#include "operations/LSTM.h" +#include "operations/RNN.h" +#include "operations/SVDF.h" +#endif + +#include <stddef.h> + +#include <cstdint> +#include <vector> + +namespace nnfw { +namespace rt { + +struct Shape; + +bool addFloat32(const float* in1, const Shape& shape1, + const float* in2, const Shape& shape2, + int32_t activation, + float* out, const Shape& shapeOut); +bool addQuant8(const uint8_t* in1, const Shape& shape1, + const uint8_t* in2, const Shape& shape2, + int32_t activation, + uint8_t* out, const Shape& shapeOut); + +bool mulFloat32(const float* in1, const Shape& shape1, + const float* in2, const Shape& shape2, + int32_t activation, + float* out, const Shape& shapeOut); +bool mulQuant8(const uint8_t* in1, const Shape& shape1, + const uint8_t* in2, const Shape& shape2, + int32_t activation, + uint8_t* out, const Shape& shapeOut); + +bool floorFloat32(const float* inputData, + float* outputData, + const Shape& shape); + +bool dequantizeQuant8ToFloat32(const uint8_t* inputData, + float* outputData, + const Shape& shape); + +bool depthwiseConvFloat32(const float* inputData, const Shape& inputShape, + const float* filterData, const Shape& filterShape, + const float* biasData, const Shape& biasShape, + int32_t padding_left, int32_t padding_right, + int32_t padding_top, int32_t padding_bottom, + int32_t stride_width, int32_t stride_height, + int32_t depth_multiplier, int32_t activation, + float* outputData, const Shape& outputShape); +#if 0 // REF-ANN We don't support depthwiseConvQuant8 yet +bool depthwiseConvQuant8(const uint8_t* inputData, const Shape& inputShape, + const uint8_t* filterData, const Shape& filterShape, + const int32_t* biasData, const Shape& biasShape, + int32_t padding_left, int32_t padding_right, + int32_t padding_top, int32_t padding_bottom, + int32_t stride_width, int32_t stride_height, + int32_t depth_multiplier, int32_t activation, + uint8_t* outputData, const Shape& outputShape); +#endif // REF-ANN + +bool convFloat32(const float* inputData, const Shape& inputShape, + const float* filterData, const Shape& filterShape, + const float* biasData, const Shape& biasShape, + int32_t padding_left, int32_t padding_right, + int32_t padding_top, int32_t padding_bottom, + int32_t stride_width, int32_t stride_height, + int32_t activation, + float* outputData, const Shape& outputShape); +bool convQuant8(const uint8_t* inputData, const Shape& inputShape, + const uint8_t* filterData, const Shape& filterShape, + const int32_t* biasData, const Shape& biasShape, + int32_t padding_left, int32_t padding_right, + int32_t padding_top, int32_t padding_bottom, + int32_t stride_width, int32_t stride_height, + int32_t activation, + uint8_t* outputData, const Shape& outputShape); + +bool averagePoolFloat32(const float* inputData, const Shape& inputShape, + int32_t padding_left, int32_t padding_right, + int32_t padding_top, int32_t padding_bottom, + int32_t stride_width, int32_t stride_height, + int32_t filter_width, int32_t filter_height, int32_t activation, + float* outputData, const Shape& outputShape); +bool averagePoolQuant8(const uint8_t* inputData, const Shape& inputShape, + int32_t padding_left, int32_t padding_right, + int32_t padding_top, int32_t padding_bottom, + int32_t stride_width, int32_t stride_height, + int32_t filter_width, int32_t filter_height, int32_t 
activation, + uint8_t* outputData, const Shape& outputShape); +bool l2PoolFloat32(const float* inputData, const Shape& inputShape, + int32_t padding_left, int32_t padding_right, + int32_t padding_top, int32_t padding_bottom, + int32_t stride_width, int32_t stride_height, + int32_t filter_width, int32_t filter_height, int32_t activation, + float* outputData, const Shape& outputShape); +bool maxPoolFloat32(const float* inputData, const Shape& inputShape, + int32_t padding_left, int32_t padding_right, + int32_t padding_top, int32_t padding_bottom, + int32_t stride_width, int32_t stride_height, + int32_t filter_width, int32_t filter_height, int32_t activation, + float* outputData, const Shape& outputShape); +bool maxPoolQuant8(const uint8_t* inputData, const Shape& inputShape, + int32_t padding_left, int32_t padding_right, + int32_t padding_top, int32_t padding_bottom, + int32_t stride_width, int32_t stride_height, + int32_t filter_width, int32_t filter_height, int32_t activation, + uint8_t* outputData, const Shape& outputShape); + +bool reluFloat32(const float* inputData, const Shape& inputShape, + float* outputData, const Shape& outputShape); +bool relu1Float32(const float* inputData, const Shape& inputShape, + float* outputData, const Shape& outputShape); +bool relu6Float32(const float* inputData, const Shape& inputShape, + float* outputData, const Shape& outputShape); +bool tanhFloat32(const float* inputData, const Shape& inputShape, + float* outputData, const Shape& outputShape); +bool logisticFloat32(const float* inputData, const Shape& inputShape, + float* outputData, const Shape& outputShape); +bool softmaxFloat32(const float* inputData, const Shape& inputShape, + const float beta, + float* outputData, const Shape& outputShape); +bool reluQuant8(const uint8_t* inputData, const Shape& inputShape, + uint8_t* outputData, const Shape& outputShape); +bool relu1Quant8(const uint8_t* inputData, const Shape& inputShape, + uint8_t* outputData, const Shape& outputShape); +bool relu6Quant8(const uint8_t* inputData, const Shape& inputShape, + uint8_t* outputData, const Shape& outputShape); +bool logisticQuant8(const uint8_t* inputData, const Shape& inputShape, + uint8_t* outputData, const Shape& outputShape); +bool softmaxQuant8(const uint8_t* inputData, const Shape& inputShape, + const float beta, + uint8_t* outputData, const Shape& outputShape); + +bool fullyConnectedFloat32(const float* inputData, const Shape& inputShape, + const float* weights, const Shape& weightsShape, + const float* biasData, const Shape& biasShape, + int32_t activation, + float* outputData, const Shape& outputShape); +bool fullyConnectedQuant8(const uint8_t* inputData, const Shape& inputShape, + const uint8_t* weights, const Shape& weightsShape, + const int32_t* biasData, const Shape& biasShape, + int32_t activation, + uint8_t* outputData, const Shape& outputShape); + +bool concatenationFloat32(const std::vector<const float*>& inputDataPtrs, + const std::vector<Shape>& inputShapes, int32_t axis, + float* outputData, const Shape& outputShape); +bool concatenationQuant8(const std::vector<const uint8_t*>& inputDataPtrs, + const std::vector<Shape>& inputShapes, int32_t axis, + uint8_t* outputData, const Shape& outputShape); + +bool l2normFloat32(const float* inputData, const Shape& inputShape, + float* outputData, const Shape& outputShape); +bool l2normQuant8(const uint8_t* inputData, const Shape& inputShape, + uint8_t* outputData, const Shape& outputShape); +bool localResponseNormFloat32(const float* inputData, const Shape& 
inputShape, + int32_t radius, float bias, float alpha, float beta, + float* outputData, const Shape& outputShape); + +bool reshapeGeneric(const void* inputData, const Shape& inputShape, + void* outputData, const Shape& outputShape); + +bool resizeBilinearFloat32(const float* inputData, + const Shape& inputShape, + float* outputData, + const Shape& outputShape); + +bool depthToSpaceGeneric(const uint8_t* inputData, const Shape& inputShape, + int32_t blockSize, + uint8_t* outputData, const Shape& outputShape); + +bool spaceToDepthGeneric(const uint8_t* inputData, const Shape& inputShape, + int32_t blockSize, + uint8_t* outputData, const Shape& outputShape); + +} // namespace rt +} // namespace nnfw + +#endif // __NNFW_RT_OPERATIONS_H__ diff --git a/runtimes/nn/common/include/OperationsUtils.h b/runtimes/nn/common/include/OperationsUtils.h new file mode 100644 index 000000000..c66ad891b --- /dev/null +++ b/runtimes/nn/common/include/OperationsUtils.h @@ -0,0 +1,247 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (C) 2017 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __NNFW_RT_OPERATIONS_UTILS_H__ +#define __NNFW_RT_OPERATIONS_UTILS_H__ + +#include "Utils.h" + +#include <cstdint> +#include <vector> + +// Macro to check if the input parameters for operation are valid or not. +#define NN_CHECK(v) \ + do { \ + if (!(v)) { \ + LOG(ERROR) << "NN_CHECK failed: " << #v << "'\n"; \ + return false; \ + } \ + } while(0); + +#define NN_CHECK_EQ(actual, expected) \ + NN_CHECK((actual) == (expected)) + +#define NN_OPS_CHECK NN_CHECK + +namespace nnfw { +namespace rt { + +enum PaddingScheme { + kPaddingUnknown = 0, + kPaddingSame = 1, + kPaddingValid = 2, +}; + +// The type and dimensions of an operand. +struct Shape { + OperandType type; + std::vector<uint32_t> dimensions; + float scale; + int32_t offset; +}; + +// Verifies that the two shapes are the same. +bool SameShape(const Shape& in1, const Shape& in2); + +// Sets out to the same shape as in. +bool SetShape(const Shape& in, Shape* out); + +// Return the total number of elements, i.e. all the dimensions multiplied +// together. For a scalar, returns one. 
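+// For example, dimensions {2, 3, 4} give 2 * 3 * 4 = 24 elements.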
+uint32_t getNumberOfElements(const Shape& shape); + +uint32_t getNumberOfDimensions(const Shape& shape); + +uint32_t getSizeOfDimension(const Shape& shape, uint32_t dimensionIdx); + +inline uint32_t computeOutSize(uint32_t imageSize, uint32_t filterSize, uint32_t stride, + uint32_t paddingHead, uint32_t paddingTail) { + return (imageSize - filterSize + stride + paddingHead + paddingTail) / stride; +} + +__wur +bool QuantizeMultiplierSmallerThanOne(double double_multiplier, + int32_t* quantized_multiplier, + int32_t* right_shift); + +__wur +bool QuantizeMultiplierGreaterThanOne(double double_multiplier, + int32_t* quantized_multiplier, + int* left_shift); + +__wur +bool GetQuantizedConvolutionMultipler(const Shape& inputShape, + const Shape& filterShape, + const Shape& biasShape, + const Shape& outputShape, + float* multiplier); + +void CalculateActivationRangeUint8(int32_t activation, + const Shape& outputShape, + int32_t* act_min, + int32_t* act_max); + +int32_t CalculateInputRadius(int input_integer_bits, int input_left_shift); + +inline void calculateExplicitPadding(int32_t in_size, int32_t stride, + int32_t filter_size, int32_t padding_implicit, + int32_t* padding_head, int32_t* padding_tail) { + *padding_head = 0; + *padding_tail = 0; + + if (padding_implicit == kPaddingSame) { + int32_t out_size = (in_size + stride - 1) / stride; + int32_t tmp = (out_size - 1) * stride + filter_size; + if (tmp > in_size) { + *padding_head = (tmp - in_size) / 2; + *padding_tail = (tmp - in_size) - *padding_head; + } + } +} + +inline PaddingScheme getPaddingScheme(int32_t inWidth, int32_t inHeight, + int32_t strideWidth, int32_t strideHeight, + int32_t filterWidth, int32_t filterHeight, + int32_t paddingLeft, int32_t paddingRight, + int32_t paddingTop, int32_t paddingBottom) { + if (paddingLeft == 0 && paddingRight == 0 && paddingTop == 0 && paddingBottom == 0) { + return kPaddingValid; + } + + int32_t expectedPaddingLeft, expectedPaddingRight; + int32_t expectedPaddingTop, expectedPaddingBottom; + + calculateExplicitPadding(inWidth, strideWidth, filterWidth, kPaddingSame, + &expectedPaddingLeft, &expectedPaddingRight); + calculateExplicitPadding(inHeight, strideHeight, filterHeight, kPaddingSame, + &expectedPaddingTop, &expectedPaddingBottom); + if (expectedPaddingLeft == paddingLeft && expectedPaddingRight == paddingRight && + expectedPaddingTop == paddingTop && expectedPaddingBottom == paddingBottom) { + return kPaddingSame; + } else { + return kPaddingUnknown; + } +} + +// Preparation functions for the corresponding ops +bool addMulPrepare(const Shape& in1, const Shape& in2, Shape* out1); + +bool floorPrepare(const Shape& input, Shape* output); + +bool dequantizePrepare(const Shape& input, Shape* output); + +bool depthwiseConvPrepare(const Shape& input, + const Shape& filter, + const Shape& bias, + int32_t padding_left, int32_t padding_right, + int32_t padding_top, int32_t padding_bottom, + int32_t stride_width, int32_t stride_height, + Shape* output); + +bool convPrepare(const Shape& input, + const Shape& filter, + const Shape& bias, + int32_t padding_left, int32_t padding_right, + int32_t padding_top, int32_t padding_bottom, + int32_t stride_width, int32_t stride_height, + Shape* output); + +bool genericPoolingPrepare(const Shape& input, + int32_t padding_left, int32_t padding_right, + int32_t padding_top, int32_t padding_bottom, + int32_t stride_width, int32_t stride_height, + int32_t filter_width, int32_t filter_height, + Shape* output); + +bool genericActivationPrepare(const Shape& input, 
Shape* output); + +bool fullyConnectedPrepare(const Shape& input, + const Shape& weights, + const Shape& bias, + Shape* output); + +bool concatenationPrepare(const std::vector<Shape>& inputShapes, + int32_t axis, + Shape* output); + +bool genericNormalizationPrepare(const Shape& input, Shape* output); + +bool reshapePrepare(const Shape& input, + const int32_t* targetDims, + const int32_t targetDimsSize, + Shape* output); + +bool resizeBilinearPrepare(const Shape& input, + int32_t height, + int32_t width, + Shape* output); + +bool depthToSpacePrepare(const Shape& input, + int32_t blockSize, + Shape* output); + +bool spaceToDepthPrepare(const Shape& input, + int32_t blockSize, + Shape* output); + +bool embeddingLookupPrepare(const Shape &valueShape, + const Shape &lookupShape, + Shape *outputShape); + +bool hashtableLookupPrepare(const Shape &lookupShape, + const Shape &keyShape, + const Shape &valueShape, + Shape *outputShape, + Shape *hitShape); + +#define ANDROID_NN_MACRO_DISPATCH_INTERNAL(macro) \ + case (int32_t) FusedActivationFunc::NONE: \ + macro(kNone); \ + break; \ + case (int32_t) FusedActivationFunc::RELU: \ + macro(kRelu); \ + break; \ + case (int32_t) FusedActivationFunc::RELU1: \ + macro(kRelu1); \ + break; \ + case (int32_t) FusedActivationFunc::RELU6: \ + macro(kRelu6); \ + break; + +#define ANDROID_NN_MACRO_DISPATCH(macro) \ + switch (activation) { \ + ANDROID_NN_MACRO_DISPATCH_INTERNAL(macro) \ + default: \ + LOG(ERROR) << "Unsupported fused activation function type"; \ + return false; \ + } + +#define ANDROID_NN_MACRO_DISPATCH_WITH_DELETE(macro) \ + switch (activation) { \ + ANDROID_NN_MACRO_DISPATCH_INTERNAL(macro) \ + default: \ + LOG(ERROR) << "Unsupported fused activation function type"; \ + if (im2colByteSize > kStaticBufferSize) { \ + delete[] im2colData; \ + } \ + return false; \ + } + +} // namespace rt +} // namespace nnfw + +#endif // __NNFW_RT_OPERATIONS_UTILS_H__ diff --git a/runtimes/nn/common/include/Utils.h b/runtimes/nn/common/include/Utils.h new file mode 100644 index 000000000..aae4cff90 --- /dev/null +++ b/runtimes/nn/common/include/Utils.h @@ -0,0 +1,128 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (C) 2017 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __NNFW_RT_UTILS_H__ +#define __NNFW_RT_UTILS_H__ + +#include "HalInterfaces.h" +#include "NeuralNetworks.h" +#include "Logging.h" + +#include <vector> + +namespace nnfw { +namespace rt { + +// The number of data types (OperandCode) defined in NeuralNetworks.h. +const int kNumberOfDataTypes = 6; + +// The number of operation types (OperationCode) defined in NeuralNetworks.h. +const int kNumberOfOperationTypes = 30; + +// The number of execution preferences defined in NeuralNetworks.h. +const int kNumberOfPreferences = 3; + +// The number of data types (OperandCode) defined in NeuralNetworksOEM.h. 
+const int kNumberOfDataTypesOEM = 2; + +// The number of operation types (OperationCode) defined in NeuralNetworksOEM.h. +const int kNumberOfOperationTypesOEM = 1; + +// The lowest number assigned to any OEM Code in NeuralNetworksOEM.h. +const int kOEMCodeBase = 10000; + +// Assert macro, as Android does not generally support assert. +#define nnAssert(v) \ + do { \ + if (!(v)) { \ + LOG(ERROR) << "nnAssert failed at " << __FILE__ << ":" << __LINE__ << " - '" << #v \ + << "'\n"; \ + abort(); \ + } \ + } while (0) +// Returns the amount of space needed to store a value of the specified +// dimensions and type. +uint32_t sizeOfData(OperandType type, const std::vector<uint32_t>& dimensions); + +// Returns the amount of space needed to store a value of the dimensions and +// type of this operand. +inline uint32_t sizeOfData(const Operand& operand) { + return sizeOfData(operand.type, operand.dimensions); +} + +// Returns the name of the operation in ASCII. +const char* getOperationName(OperationType opCode); +// Memory is unmapped. +// Memory is reference counted by hidl_memory instances, and is deallocated +// once there are no more references. +hidl_memory allocateSharedMemory(int64_t size); + +// Returns the number of padding bytes needed to align data of the +// specified length. It aligns object of length: +// 2, 3 on a 2 byte boundary, +// 4+ on a 4 byte boundary. +// We may want to have different alignments for tensors. +// TODO: This is arbitrary, more a proof of concept. We need +// to determine what this should be. +uint32_t alignBytesNeeded(uint32_t index, size_t length); + +inline void setFromIntList(hidl_vec<uint32_t>* vec, uint32_t count, const uint32_t* data) { + vec->resize(count); + for (uint32_t i = 0; i < count; i++) { + (*vec)[i] = data[i]; + } +} + +inline void setFromIntList(std::vector<uint32_t>* vec, uint32_t count, const uint32_t* data) { + vec->resize(count); + for (uint32_t i = 0; i < count; i++) { + (*vec)[i] = data[i]; + } +} + +inline std::string toString(uint32_t obj) { + return std::to_string(obj); +} + +template <typename Type> +std::string toString(const std::vector<Type>& range) { + std::string os = "["; + for (size_t i = 0; i < range.size(); ++i) { + os += (i == 0 ? "" : ", ") + toString(range[i]); + } + return os += "]"; +} + +inline bool validCode(uint32_t codeCount, uint32_t codeCountOEM, uint32_t code) { + return (code < codeCount) || (code >= kOEMCodeBase && (code - kOEMCodeBase) < codeCountOEM); +} + +int validateOperandType(const ANeuralNetworksOperandType& type, const char* tag, bool allowPartial); +int validateOperandList(uint32_t count, const uint32_t* list, uint32_t operandCount, + const char* tag); + +bool validateModel(const Model& model); +bool validateRequest(const Request& request, const Model& model); + +inline size_t getSizeFromInts(int lower, int higher) { + return (uint32_t)(lower) + ((uint64_t)(uint32_t)(higher) << 32); +} + +} // namespace rt +} // namespace nnfw + +#endif // __NNFW_RT_UTILS_H__ diff --git a/runtimes/nn/common/operations/Activation.cpp b/runtimes/nn/common/operations/Activation.cpp new file mode 100644 index 000000000..091ffabb3 --- /dev/null +++ b/runtimes/nn/common/operations/Activation.cpp @@ -0,0 +1,211 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (C) 2017 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "Operations.h" +#include "OperationsUtils.h" + +#include "internal/optimized/optimized_ops.h" + +// TODO-NNRT: There was no inlcude "ActivationFunctor.h" in Android NN. +// This may be included from some other header files. +#include "ActivationFunctor.h" + +namespace nnfw { +namespace rt { + +bool reluFloat32(const float* inputData, const Shape& inputShape, + float* outputData, const Shape& outputShape) { + int numElements = getNumberOfElements(inputShape); + for (int i=0; i<numElements; i++, inputData++, outputData++) { + *outputData = std::max(0.f, *inputData); + } + return true; +} + +bool relu1Float32(const float* inputData, const Shape& inputShape, + float* outputData, const Shape& outputShape) { + int numElements = getNumberOfElements(inputShape); + for (int i=0; i<numElements; i++, inputData++, outputData++) { + *outputData = std::min(std::max(-1.f, *inputData), 1.f); + } + return true; +} + +bool relu6Float32(const float* inputData, const Shape& inputShape, + float* outputData, const Shape& outputShape) { + int numElements = getNumberOfElements(inputShape); + for (int i=0; i<numElements; i++, inputData++, outputData++) { + *outputData = std::min(std::max(0.f, *inputData), 6.f); + } + return true; +} + +bool tanhFloat32(const float* inputData, const Shape& inputShape, + float* outputData, const Shape& outputShape) { + int numElements = getNumberOfElements(inputShape); + for (int i=0; i<numElements; i++, inputData++, outputData++) { + *outputData = std::tanh(*inputData); + } + return true; +} + +bool logisticFloat32(const float* inputData, const Shape& inputShape, + float* outputData, const Shape& outputShape) { + int numElements = getNumberOfElements(inputShape); + for (int i=0; i<numElements; i++, inputData++, outputData++) { + *outputData = 1.f / (1.f + std::exp(-*inputData)); + } + return true; +} + +bool softmaxFloat32(const float* inputData, const Shape& inputShape, + const float beta, + float* outputData, const Shape& outputShape) { + Dims<4> dim; + if (getNumberOfDimensions(inputShape) == 2) { + uint32_t batch_size = getSizeOfDimension(inputShape, 0); + uint32_t input_size = getNumberOfElements(inputShape) / batch_size; + + Shape shapeIn4D; + shapeIn4D.dimensions = {batch_size, 1, 1, input_size}; + dim = convertShapeToDims(shapeIn4D); + } else if (getNumberOfDimensions(inputShape) == 4) { + dim = convertShapeToDims(inputShape); + } else { + LOG(ERROR) << "only 2D and 4D tensors supported"; + return false; + } + + optimized_ops::Softmax(inputData, dim, beta, + outputData, dim); + return true; +} + +#define ANDROID_NN_RELUX_QUANT8(activation) \ + int numElements = getNumberOfElements(inputShape); \ + int32_t output_activation_min = 0; \ + int32_t output_activation_max = 0; \ + \ + CalculateActivationRangeUint8(activation, inputShape, \ + &output_activation_min, \ + &output_activation_max); \ + \ + for (int i=0; i<numElements; i++, inputData++, outputData++) { \ + *outputData = std::min((uint8_t)output_activation_max, \ + std::max((uint8_t)output_activation_min, *inputData)); \ + } + + +bool reluQuant8(const uint8_t* inputData, const Shape& 
inputShape, + uint8_t* outputData, const Shape& outputShape) { + ANDROID_NN_RELUX_QUANT8(kActivationRelu) + return true; +} + +bool relu1Quant8(const uint8_t* inputData, const Shape& inputShape, + uint8_t* outputData, const Shape& outputShape) { + ANDROID_NN_RELUX_QUANT8(kActivationRelu1) + return true; +} + +bool relu6Quant8(const uint8_t* inputData, const Shape& inputShape, + uint8_t* outputData, const Shape& outputShape) { + ANDROID_NN_RELUX_QUANT8(kActivationRelu6) + return true; +} + +#undef ANDROID_NN_RELUX_QUANT8 + +bool logisticQuant8(const uint8_t* inputData, const Shape& inputShape, + uint8_t* outputData, const Shape& outputShape) { + if (outputShape.offset != 0 || outputShape.scale != 1.f / 256) { + LOG(ERROR) << "incorrect scale / offset for output"; + return false; + } + + static constexpr int kInputIntegerBits = 4; + + const double input_real_multiplier = + inputShape.scale * + static_cast<double>(1 << (31 - kInputIntegerBits)); + + int32_t input_multiplier = 0; + int32_t input_left_shift = 0; + if (!QuantizeMultiplierGreaterThanOne(input_real_multiplier, + &input_multiplier, + &input_left_shift)) { + return false; + } + int32_t input_range_radius = + CalculateInputRadius(kInputIntegerBits, input_left_shift); + + optimized_ops::Logistic( + inputData, convertShapeToDims(inputShape), + inputShape.offset, input_range_radius, + input_multiplier, input_left_shift, + outputData, convertShapeToDims(outputShape)); + + return true; +} + +bool softmaxQuant8(const uint8_t* inputData, const Shape& inputShape, + const float beta, + uint8_t* outputData, const Shape& outputShape) { + Dims<4> dim; + if (getNumberOfDimensions(inputShape) == 2) { + uint32_t batch_size = getSizeOfDimension(inputShape, 0); + uint32_t input_size = getNumberOfElements(inputShape) / batch_size; + + Shape shapeIn4D; + shapeIn4D.dimensions = {batch_size, 1, 1, input_size}; + dim = convertShapeToDims(shapeIn4D); + } else if (getNumberOfDimensions(inputShape) == 4) { + dim = convertShapeToDims(inputShape); + } else { + LOG(ERROR) << "only 2D and 4D tensors supported"; + return false; + } + + if (outputShape.offset != 0 || outputShape.scale != 1.f / 256) { + LOG(ERROR) << "incorrect scale / offset for output"; + return false; + } + + static const int32_t kScaledDiffIntegerBits = 5; + const double input_beta_real_multiplier = std::min( + 1.0 * beta * inputShape.scale * (1 << (31 - kScaledDiffIntegerBits)), + (1ll << 31) - 1.0); + + int32_t input_multiplier = 0; + int32_t input_left_shift = 0; + if (!QuantizeMultiplierGreaterThanOne(input_beta_real_multiplier, + &input_multiplier, + &input_left_shift)) { + return false; + } + float diff_min = -1.0f * CalculateInputRadius(kScaledDiffIntegerBits, + input_left_shift); + + optimized_ops::Softmax(inputData, dim, input_multiplier, + input_left_shift, diff_min, + outputData, dim); + return true; +} + + +} // namespace rt +} // namespace nnfw diff --git a/runtimes/nn/common/operations/Concatenation.cpp b/runtimes/nn/common/operations/Concatenation.cpp new file mode 100644 index 000000000..55de24d4d --- /dev/null +++ b/runtimes/nn/common/operations/Concatenation.cpp @@ -0,0 +1,64 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (C) 2017 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "Operations.h" +#include "OperationsUtils.h" + +#include "internal/optimized/optimized_ops.h" + +namespace nnfw { +namespace rt { + +bool concatenationFloat32(const std::vector<const float*>& inputDataPtrs, + const std::vector<Shape>& inputShapes, int32_t axis, + float* outputData, const Shape& outputShape) { + int num_inputs = inputShapes.size(); + std::vector<Dims<4>*> inputDimsPtr(num_inputs); + std::vector<Dims<4> > inputDims(num_inputs); + for (int i=0; i<num_inputs; i++) { + inputDims[i] = convertShapeToDims(inputShapes[i]); + inputDimsPtr[i] = &inputDims[i]; + } + + optimized_ops::Concatenation<FusedActivationFunctionType::kNone, float>( + getNumberOfDimensions(outputShape) - axis - 1, + inputDataPtrs.data(), inputDimsPtr.data(), num_inputs, + outputData, convertShapeToDims(outputShape)); + + return true; +} + +bool concatenationQuant8(const std::vector<const uint8_t*>& inputDataPtrs, + const std::vector<Shape>& inputShapes, int32_t axis, + uint8_t* outputData, const Shape& outputShape) { + int num_inputs = inputShapes.size(); + std::vector<Dims<4>*> inputDimsPtr(num_inputs); + std::vector<Dims<4> > inputDims(num_inputs); + for (int i=0; i<num_inputs; i++) { + inputDims[i] = convertShapeToDims(inputShapes[i]); + inputDimsPtr[i] = &inputDims[i]; + } + + optimized_ops::Concatenation<FusedActivationFunctionType::kNone, uint8_t>( + getNumberOfDimensions(outputShape) - axis - 1, + inputDataPtrs.data(), inputDimsPtr.data(), num_inputs, + outputData, convertShapeToDims(outputShape)); + + return true; +} +} // namespace rt +} // namespace nnfw diff --git a/runtimes/nn/common/operations/Conv2D.cpp b/runtimes/nn/common/operations/Conv2D.cpp new file mode 100644 index 000000000..01f6797e3 --- /dev/null +++ b/runtimes/nn/common/operations/Conv2D.cpp @@ -0,0 +1,154 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (C) 2017 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "Operations.h" +#include "OperationsUtils.h" + +#include "internal/optimized/optimized_ops.h" + +namespace nnfw { +namespace rt { + +// If possible we will use this static buffer for the tensor. 
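The "tensor" in question is the im2col patch matrix built by the convolution kernels below: one row per output pixel, with inDepth * filterHeight * filterWidth entries each. When that matrix fits in the roughly 1.5 MiB static scratch buffer declared next, the kernels reuse it; otherwise they fall back to a heap allocation (new (std::nothrow)) that is freed once the kernel has run, with ANDROID_NN_MACRO_DISPATCH_WITH_DELETE covering the early-return path for an unsupported activation.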
+static constexpr int kStaticBufferSize = 1605632; +static char static_scratch_buffer[kStaticBufferSize]; + +#define ANDROID_NN_CONV_PARAMETERS(Type) \ + uint32_t height = getSizeOfDimension(inputShape, 1); \ + uint32_t width = getSizeOfDimension(inputShape, 2); \ + uint32_t filterHeight = getSizeOfDimension(filterShape, 1); \ + uint32_t filterWidth = getSizeOfDimension(filterShape, 2); \ + uint32_t outHeight = getSizeOfDimension(outputShape, 1); \ + uint32_t outWidth = getSizeOfDimension(outputShape, 2); \ + uint32_t inDepth = getSizeOfDimension(inputShape, 3); \ + \ + uint32_t paddingHeight = (uint32_t)padding_top; \ + uint32_t paddingWidth = (uint32_t)padding_left; \ + \ + Dims<4> im2colDim; \ + im2colDim.sizes[3] = (int)getSizeOfDimension(outputShape, 0); \ + im2colDim.sizes[2] = (int)getSizeOfDimension(outputShape, 1); \ + im2colDim.sizes[1] = (int)getSizeOfDimension(outputShape, 2); \ + im2colDim.sizes[0] = (int)inDepth * filterHeight * filterWidth; \ + \ + im2colDim.strides[0] = 1; \ + for (int i=1; i<4; i++) { \ + im2colDim.strides[i] = im2colDim.strides[i-1] * im2colDim.sizes[i-1]; \ + } \ + \ + Type* im2colData = nullptr; \ + int im2colByteSize = sizeof(Type); \ + for (int i=0; i<4; i++) { \ + im2colByteSize *= im2colDim.sizes[i]; \ + } \ + if (im2colByteSize <= kStaticBufferSize) { \ + im2colData = reinterpret_cast<Type *>(static_scratch_buffer); \ + } else { \ + im2colData = new (std::nothrow) Type[im2colByteSize / sizeof(Type)]; \ + } + +bool convFloat32(const float* inputData, const Shape& inputShape, + const float* filterData, const Shape& filterShape, + const float* biasData, const Shape& biasShape, + int32_t padding_left, int32_t padding_right, + int32_t padding_top, int32_t padding_bottom, + int32_t stride_width, int32_t stride_height, + int32_t activation, + float* outputData, const Shape& outputShape) { + + ANDROID_NN_CONV_PARAMETERS(float) + + #define ANDROID_NN_CONV(activation) \ + optimized_ops::Conv<FusedActivationFunctionType::activation>( \ + inputData, convertShapeToDims(inputShape), \ + filterData, convertShapeToDims(filterShape), \ + biasData, convertShapeToDims(biasShape), \ + stride_width, stride_height, paddingWidth, paddingHeight, \ + outputData, convertShapeToDims(outputShape), \ + im2colData, im2colDim) + + ANDROID_NN_MACRO_DISPATCH_WITH_DELETE(ANDROID_NN_CONV) + #undef ANDROID_NN_CONV + + if (im2colByteSize > kStaticBufferSize) { + delete[] im2colData; + } + return true; +} + +bool convQuant8(const uint8_t* inputData, const Shape& inputShape, + const uint8_t* filterData, const Shape& filterShape, + const int32_t* biasData, const Shape& biasShape, + int32_t padding_left, int32_t padding_right, + int32_t padding_top, int32_t padding_bottom, + int32_t stride_width, int32_t stride_height, + int32_t activation, + uint8_t* outputData, const Shape& outputShape) { + + ANDROID_NN_CONV_PARAMETERS(uint8_t) + + int32_t inputOffset = -inputShape.offset; + int32_t filterOffset = -filterShape.offset; + int32_t outputOffset = outputShape.offset; + + float real_multiplier = 0.0; + int32_t output_multiplier = 0; + int32_t output_shift = 0; + int32_t output_activation_min = 0; + int32_t output_activation_max = 0; + + if (!GetQuantizedConvolutionMultipler(inputShape, filterShape, biasShape, + outputShape, &real_multiplier) || + !QuantizeMultiplierSmallerThanOne(real_multiplier, &output_multiplier, + &output_shift)){ + // Following code inserted to resolve Coverity (118950 Resource leak) + if (im2colByteSize > kStaticBufferSize) { + delete[] im2colData; + } + return false; + 
} + CalculateActivationRangeUint8(activation, outputShape, + &output_activation_min, + &output_activation_max); + + static gemmlowp::GemmContext gemm_context; + // Alow gemmlowp automatcally decide how many threads to use. + gemm_context.set_max_num_threads(0); + + #define ANDROID_NN_CONV(activation) \ + optimized_ops::Conv<FusedActivationFunctionType::activation>( \ + inputData, convertShapeToDims(inputShape), inputOffset, \ + filterData, convertShapeToDims(filterShape), filterOffset, \ + biasData, convertShapeToDims(biasShape), \ + stride_width, stride_height, paddingWidth, paddingHeight, \ + outputOffset, output_multiplier, output_shift, \ + output_activation_min, output_activation_max, \ + outputData, convertShapeToDims(outputShape), \ + im2colData, im2colDim, &gemm_context) + + ANDROID_NN_MACRO_DISPATCH_WITH_DELETE(ANDROID_NN_CONV) + #undef ANDROID_NN_CONV + + if (im2colByteSize > kStaticBufferSize) { + delete[] im2colData; + } + return true; +} + +#undef ANDROID_NN_CONV_PARAMETERS +} // namespace rt +} // namespace nnfw diff --git a/runtimes/nn/common/operations/DepthwiseConv2D.cpp b/runtimes/nn/common/operations/DepthwiseConv2D.cpp new file mode 100644 index 000000000..94a78f942 --- /dev/null +++ b/runtimes/nn/common/operations/DepthwiseConv2D.cpp @@ -0,0 +1,119 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (C) 2017 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "Operations.h" +#include "OperationsUtils.h" + +#include "internal/optimized/depthwiseconv_float.h" +#if 0 // REF-ANN We don't support depthwiseConvQuant8 yet +#include "internal/optimized/depthwiseconv_uint8.h" +#endif + +namespace nnfw { +namespace rt { + +#define ANDROID_NN_DEPTHWISE_CONV_PARAMETERS \ + uint32_t height = getSizeOfDimension(inputShape, 1); \ + uint32_t width = getSizeOfDimension(inputShape, 2); \ + uint32_t filterHeight = getSizeOfDimension(filterShape, 1); \ + uint32_t filterWidth = getSizeOfDimension(filterShape, 2); \ + uint32_t outHeight = getSizeOfDimension(outputShape, 1); \ + uint32_t outWidth = getSizeOfDimension(outputShape, 2); \ + \ + uint32_t paddingHeight = (uint32_t)padding_top; \ + uint32_t paddingWidth = (uint32_t)padding_left; + +bool depthwiseConvFloat32(const float* inputData, const Shape& inputShape, + const float* filterData, const Shape& filterShape, + const float* biasData, const Shape& biasShape, + int32_t padding_left, int32_t padding_right, + int32_t padding_top, int32_t padding_bottom, + int32_t stride_width, int32_t stride_height, + int32_t depth_multiplier, int32_t activation, + float* outputData, const Shape& outputShape) { + + ANDROID_NN_DEPTHWISE_CONV_PARAMETERS + + #define ANDROID_NN_DEPTHWISE_CONV(activation) \ + optimized_ops::DepthwiseConv<FusedActivationFunctionType::activation>( \ + inputData, convertShapeToDims(inputShape), \ + filterData, convertShapeToDims(filterShape), \ + biasData, convertShapeToDims(biasShape), \ + stride_width, stride_height, \ + paddingWidth, paddingHeight, depth_multiplier, \ + outputData, convertShapeToDims(outputShape)) + + ANDROID_NN_MACRO_DISPATCH(ANDROID_NN_DEPTHWISE_CONV) + #undef ANDROID_NN_DEPTHWISE_CONV + + return true; +} + + +#if 0 // REF-ANN We don't support depthwiseConvQuant8 yet +bool depthwiseConvQuant8(const uint8_t* inputData, const Shape& inputShape, + const uint8_t* filterData, const Shape& filterShape, + const int32_t* biasData, const Shape& biasShape, + int32_t padding_left, int32_t padding_right, + int32_t padding_top, int32_t padding_bottom, + int32_t stride_width, int32_t stride_height, + int32_t depth_multiplier, int32_t activation, + uint8_t* outputData, const Shape& outputShape) { + + ANDROID_NN_DEPTHWISE_CONV_PARAMETERS + + float real_multiplier = 0.0; + int32_t output_multiplier = 0; + int32_t output_shift = 0; + int32_t output_activation_min = 0; + int32_t output_activation_max = 0; + + + if (!GetQuantizedConvolutionMultipler(inputShape, filterShape, biasShape, + outputShape, &real_multiplier) || + !QuantizeMultiplierSmallerThanOne(real_multiplier, &output_multiplier, + &output_shift)) { + return false; + } + CalculateActivationRangeUint8(activation, outputShape, + &output_activation_min, + &output_activation_max); + + uint32_t inputOffset = -inputShape.offset; + uint32_t filterOffset = -filterShape.offset; + uint32_t outputOffset = outputShape.offset; + #define ANDROID_NN_DEPTHWISE_CONV(activation) \ + optimized_ops::DepthwiseConv<FusedActivationFunctionType::activation>( \ + inputData, convertShapeToDims(inputShape), inputOffset, \ + filterData, convertShapeToDims(filterShape), filterOffset, \ + biasData, convertShapeToDims(biasShape), \ + stride_width, stride_height, \ + paddingWidth, paddingHeight, depth_multiplier, \ + outputOffset, output_multiplier, output_shift, \ + output_activation_min, output_activation_max, \ + outputData, convertShapeToDims(outputShape)) + + ANDROID_NN_MACRO_DISPATCH(ANDROID_NN_DEPTHWISE_CONV) + #undef ANDROID_NN_DEPTHWISE_CONV + + 
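Both quantized convolution paths (Conv2D.cpp above and this disabled depthwise variant) requantize their int32 accumulators through the pair GetQuantizedConvolutionMultipler / QuantizeMultiplierSmallerThanOne. The combined multiplier (input_scale * filter_scale / output_scale in the Android NN reference code) is expressed as a Q0.31 fixed-point value plus a right shift; the figures below are illustrative only:

    // Example scales: input 0.5, filter 0.25, output 1.0
    //   real_multiplier = 0.5 * 0.25 / 1.0 = 0.125
    // QuantizeMultiplierSmallerThanOne(0.125, &m, &shift) gives
    //   m = 1 << 30   (0.5 in Q0.31),   shift = 2
    // so each accumulator is scaled as
    //   out = RoundingDivideByPOT(SaturatingRoundingDoublingHighMul(acc, m), shift)
    //       ~= (acc * 0.5) / 4 = acc * 0.125
    // before the output zero point is added and the result is clamped to the
    // range produced by CalculateActivationRangeUint8().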
return true; +} +#endif // REF-ANN + +#undef ANDROID_NN_DEPTHWISE_CONV_PARAMETERS +} // namespace rt +} // namespace nnfw diff --git a/runtimes/nn/common/operations/FullyConnected.cpp b/runtimes/nn/common/operations/FullyConnected.cpp new file mode 100644 index 000000000..393d0ff9e --- /dev/null +++ b/runtimes/nn/common/operations/FullyConnected.cpp @@ -0,0 +1,87 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (C) 2017 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "Operations.h" +#include "OperationsUtils.h" + +#include "internal/optimized/optimized_ops.h" + +namespace nnfw { +namespace rt { + +bool fullyConnectedFloat32(const float* inputData, const Shape& inputShape, + const float* weightsData, const Shape& weightsShape, + const float* biasData, const Shape& biasShape, + int32_t activation, + float* outputData, const Shape& outputShape) { + + #define ANDROID_NN_FULLY_CONNECTED(activation) \ + optimized_ops::FullyConnected<FusedActivationFunctionType::activation>( \ + inputData, convertShapeToDims(inputShape), \ + weightsData, convertShapeToDims(weightsShape), \ + biasData, convertShapeToDims(biasShape), \ + outputData, convertShapeToDims(outputShape)) + + ANDROID_NN_MACRO_DISPATCH(ANDROID_NN_FULLY_CONNECTED) + #undef ANDROID_NN_FULLY_CONNECTED + return true; +} + +bool fullyConnectedQuant8(const uint8_t* inputData, const Shape& inputShape, + const uint8_t* weightsData, const Shape& weightsShape, + const int32_t* biasData, const Shape& biasShape, + int32_t activation, + uint8_t* outputData, const Shape& outputShape) { + int32_t inputOffset = -inputShape.offset; + int32_t weightsOffset = -weightsShape.offset; + int32_t outputOffset = outputShape.offset; + + float real_multiplier = 0.0; + int32_t output_multiplier = 0; + int32_t output_shift = 0; + int32_t output_activation_min = 0; + int32_t output_activation_max = 0; + + if (!GetQuantizedConvolutionMultipler(inputShape, weightsShape, biasShape, + outputShape, &real_multiplier) || + !QuantizeMultiplierSmallerThanOne(real_multiplier, &output_multiplier, + &output_shift)) { + return false; + } + CalculateActivationRangeUint8(activation, outputShape, + &output_activation_min, + &output_activation_max); + + static gemmlowp::GemmContext gemm_context; + // Alow gemmlowp automatcally decide how many threads to use. 
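Passing 0 here asks gemmlowp to size its worker pool from the cores it detects at run time; any positive value would instead act as an upper bound on the number of threads. The quantized Conv2D path above uses the same pattern.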
+ gemm_context.set_max_num_threads(0); + + #define ANDROID_NN_FULLY_CONNECTED(activation) \ + optimized_ops::FullyConnected<FusedActivationFunctionType::activation>( \ + inputData, convertShapeToDims(inputShape), inputOffset, \ + weightsData, convertShapeToDims(weightsShape), weightsOffset, \ + biasData, convertShapeToDims(biasShape), \ + outputOffset, output_multiplier, output_shift, \ + output_activation_min, output_activation_max, \ + outputData, convertShapeToDims(outputShape), &gemm_context) + + ANDROID_NN_MACRO_DISPATCH(ANDROID_NN_FULLY_CONNECTED) + #undef ANDROID_NN_FULLY_CONNECTED + return true; +} +} // namespace rt +} // namespace nnfw diff --git a/runtimes/nn/common/operations/Pooling.cpp b/runtimes/nn/common/operations/Pooling.cpp new file mode 100644 index 000000000..958164c1b --- /dev/null +++ b/runtimes/nn/common/operations/Pooling.cpp @@ -0,0 +1,163 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (C) 2017 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "Operations.h" +#include "OperationsUtils.h" + +#include "internal/optimized/optimized_ops.h" + +namespace nnfw { +namespace rt { + +#define ANDROID_NN_POOLING_PARAMETERS \ + uint32_t height = getSizeOfDimension(inputShape, 1); \ + uint32_t width = getSizeOfDimension(inputShape, 2); \ + uint32_t outHeight = getSizeOfDimension(outputShape, 1); \ + uint32_t outWidth = getSizeOfDimension(outputShape, 2); \ + \ + uint32_t paddingHeight = (uint32_t)padding_top; \ + uint32_t paddingWidth = (uint32_t)padding_left; + +bool averagePoolFloat32(const float* inputData, const Shape& inputShape, + int32_t padding_left, int32_t padding_right, + int32_t padding_top, int32_t padding_bottom, + int32_t stride_width, int32_t stride_height, + int32_t filter_width, int32_t filter_height, int32_t activation, + float* outputData, const Shape& outputShape) { + + ANDROID_NN_POOLING_PARAMETERS + + #define ANDROID_NN_AVERAGE_POOL(activation) \ + optimized_ops::AveragePool<FusedActivationFunctionType::activation>( \ + inputData, convertShapeToDims(inputShape), \ + stride_width, stride_height, paddingWidth, paddingHeight, \ + filter_width, filter_height, \ + outputData, convertShapeToDims(outputShape)) + + ANDROID_NN_MACRO_DISPATCH(ANDROID_NN_AVERAGE_POOL) + #undef ANDROID_NN_AVERAGE_POOL + + return true; +} + +bool averagePoolQuant8(const uint8_t* inputData, const Shape& inputShape, + int32_t padding_left, int32_t padding_right, + int32_t padding_top, int32_t padding_bottom, + int32_t stride_width, int32_t stride_height, + int32_t filter_width, int32_t filter_height, int32_t activation, + uint8_t* outputData, const Shape& outputShape) { + + ANDROID_NN_POOLING_PARAMETERS + + int32_t output_activation_min = 0; + int32_t output_activation_max = 0; + + CalculateActivationRangeUint8(activation, outputShape, + &output_activation_min, + &output_activation_max); + + #define ANDROID_NN_AVERAGE_POOL(activation) \ + 
optimized_ops::AveragePool<FusedActivationFunctionType::activation>( \ + inputData, convertShapeToDims(inputShape), \ + stride_width, stride_height, paddingWidth, paddingHeight, \ + filter_width, filter_height, \ + output_activation_min, output_activation_max, \ + outputData, convertShapeToDims(outputShape)) + + ANDROID_NN_MACRO_DISPATCH(ANDROID_NN_AVERAGE_POOL) + #undef ANDROID_NN_AVERAGE_POOL + + return true; +} + +bool l2PoolFloat32(const float* inputData, const Shape& inputShape, + int32_t padding_left, int32_t padding_right, + int32_t padding_top, int32_t padding_bottom, + int32_t stride_width, int32_t stride_height, + int32_t filter_width, int32_t filter_height, int32_t activation, + float* outputData, const Shape& outputShape) { + + ANDROID_NN_POOLING_PARAMETERS + + #define ANDROID_NN_L2_POOL(activation) \ + optimized_ops::L2Pool<FusedActivationFunctionType::activation>( \ + inputData, convertShapeToDims(inputShape), \ + stride_width, stride_height, paddingWidth, paddingHeight, \ + filter_width, filter_height, \ + outputData, convertShapeToDims(outputShape)) + + ANDROID_NN_MACRO_DISPATCH(ANDROID_NN_L2_POOL) + #undef ANDROID_NN_L2_POOL + + return true; +} + +bool maxPoolFloat32(const float* inputData, const Shape& inputShape, + int32_t padding_left, int32_t padding_right, + int32_t padding_top, int32_t padding_bottom, + int32_t stride_width, int32_t stride_height, + int32_t filter_width, int32_t filter_height, int32_t activation, + float* outputData, const Shape& outputShape) { + + ANDROID_NN_POOLING_PARAMETERS + + #define ANDROID_NN_MAX_POOL(activation) \ + optimized_ops::MaxPool<FusedActivationFunctionType::activation>( \ + inputData, convertShapeToDims(inputShape), \ + stride_width, stride_height, paddingWidth, paddingHeight, \ + filter_width, filter_height, \ + outputData, convertShapeToDims(outputShape)) + + ANDROID_NN_MACRO_DISPATCH(ANDROID_NN_MAX_POOL) + #undef ANDROID_NN_MAX_POOL + + return true; +} + +bool maxPoolQuant8(const uint8_t* inputData, const Shape& inputShape, + int32_t padding_left, int32_t padding_right, + int32_t padding_top, int32_t padding_bottom, + int32_t stride_width, int32_t stride_height, + int32_t filter_width, int32_t filter_height, int32_t activation, + uint8_t* outputData, const Shape& outputShape) { + + ANDROID_NN_POOLING_PARAMETERS + + int32_t output_activation_min = 0; + int32_t output_activation_max = 0; + + CalculateActivationRangeUint8(activation, outputShape, + &output_activation_min, + &output_activation_max); + + #define ANDROID_NN_MAX_POOL(activation) \ + optimized_ops::MaxPool<FusedActivationFunctionType::activation>( \ + inputData, convertShapeToDims(inputShape), \ + stride_width, stride_height, paddingWidth, paddingHeight, \ + filter_width, filter_height, \ + output_activation_min, output_activation_max, \ + outputData, convertShapeToDims(outputShape)) + + ANDROID_NN_MACRO_DISPATCH(ANDROID_NN_MAX_POOL) + #undef ANDROID_NN_MAX_POOL + + return true; +} + +#undef ANDROID_NN_POOLING_PARAMETERS +} // namespace rt +} // namespace nnfw diff --git a/runtimes/nn/common/operations/Reshape.cpp b/runtimes/nn/common/operations/Reshape.cpp new file mode 100644 index 000000000..120918b0d --- /dev/null +++ b/runtimes/nn/common/operations/Reshape.cpp @@ -0,0 +1,103 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (C) 2017 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// Contains the implementation of the operations. + +#define LOG_TAG "Operations" + +#include "Operations.h" +#include "OperationsUtils.h" + +#include "internal/optimized/optimized_ops.h" + +namespace nnfw { +namespace rt { + +bool reshapeGeneric(const void* inputData, const Shape& inputShape, + void* outputData, const Shape& outputShape) { + size_t count = sizeOfData(inputShape.type, inputShape.dimensions); + memcpy(outputData, inputData, count); + return true; +} + +bool resizeBilinearFloat32(const float* inputData, const Shape& inputShape, + float* outputData, const Shape& outputShape) { + int32_t height = (int32_t) getSizeOfDimension(outputShape, 1); + int32_t width = (int32_t) getSizeOfDimension(outputShape, 2); + + int32_t outDimData[2] = {height, width}; + // We have to fake a tensor here, to satisfy ResizeBilinear(). + Shape outDimShape; + outDimShape.dimensions = {1, 1, 1, 2}; + + optimized_ops::ResizeBilinear( + inputData, convertShapeToDims(inputShape), + outDimData, convertShapeToDims(outDimShape), + outputData, convertShapeToDims(outputShape)); + return true; +} + +bool depthToSpaceGeneric(const uint8_t* inputData, const Shape& inputShape, + int32_t blockSize, + uint8_t* outputData, const Shape& outputShape) { + if (inputShape.type == OperandType::TENSOR_FLOAT32) { + optimized_ops::DepthToSpace( + reinterpret_cast<const float*>(inputData), + convertShapeToDims(inputShape), + blockSize, + reinterpret_cast<float*>(outputData), + convertShapeToDims(outputShape)); + } else if (inputShape.type == OperandType::TENSOR_QUANT8_ASYMM) { + optimized_ops::DepthToSpace( + reinterpret_cast<const uint8_t*>(inputData), + convertShapeToDims(inputShape), + blockSize, + reinterpret_cast<uint8_t*>(outputData), + convertShapeToDims(outputShape)); + } else { + LOG(ERROR) << "Unsupported data type"; + return false; + } + return true; +} + +bool spaceToDepthGeneric(const uint8_t* inputData, const Shape& inputShape, + int32_t blockSize, + uint8_t* outputData, const Shape& outputShape) { + if (inputShape.type == OperandType::TENSOR_FLOAT32) { + optimized_ops::SpaceToDepth( + reinterpret_cast<const float*>(inputData), + convertShapeToDims(inputShape), + blockSize, + reinterpret_cast<float*>(outputData), + convertShapeToDims(outputShape)); + } else if (inputShape.type == OperandType::TENSOR_QUANT8_ASYMM) { + optimized_ops::SpaceToDepth( + reinterpret_cast<const uint8_t*>(inputData), + convertShapeToDims(inputShape), + blockSize, + reinterpret_cast<uint8_t*>(outputData), + convertShapeToDims(outputShape)); + } else { + LOG(ERROR) << "Unsupported data type"; + return false; + } + return true; +} + +} // namespace rt +} // namespace nnfw diff --git a/runtimes/nn/common/operations/SimpleMath.cpp b/runtimes/nn/common/operations/SimpleMath.cpp new file mode 100644 index 000000000..79b1175c0 --- /dev/null +++ b/runtimes/nn/common/operations/SimpleMath.cpp @@ -0,0 +1,217 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. 
All Rights Reserved + * Copyright (C) 2017 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// Contains the implementation of the operations. + +#define LOG_TAG "Operations" + +#include "Operations.h" +#include "OperationsUtils.h" + +#include "internal/optimized/optimized_ops.h" + +namespace nnfw { +namespace rt { +bool addFloat32(const float* in1, const Shape& shape1, + const float* in2, const Shape& shape2, + int32_t activation, + float* out, const Shape& shapeOut) { + bool needBroadcast = !SameShape(shape1, shape2); + + #define ANDROID_NN_NORMAL_ADD(activation) \ + optimized_ops::Add<FusedActivationFunctionType::activation>( \ + in1, convertShapeToDims(shape1), \ + in2, convertShapeToDims(shape2), \ + out, convertShapeToDims(shapeOut)) + + #define ANDROID_NN_BROADCAST_ADD(activation) \ + optimized_ops::BroadcastAdd<FusedActivationFunctionType::activation>( \ + in1, convertShapeToDims(shape1), \ + in2, convertShapeToDims(shape2), \ + out, convertShapeToDims(shapeOut)) + + if (needBroadcast) { + ANDROID_NN_MACRO_DISPATCH(ANDROID_NN_BROADCAST_ADD) + } else { + ANDROID_NN_MACRO_DISPATCH(ANDROID_NN_NORMAL_ADD) + } + + #undef ANDROID_NN_NORMAL_ADD + #undef ANDROID_NN_BROADCAST_ADD + return true; +} + +bool addQuant8(const uint8_t* in1, const Shape& shape1, + const uint8_t* in2, const Shape& shape2, + int32_t activation, + uint8_t* out, const Shape& shapeOut) { + bool needBroadcast = !SameShape(shape1, shape2); + + const int32_t input1_offset = -shape1.offset; + const int32_t input2_offset = -shape2.offset; + const int32_t output_offset = shapeOut.offset; + const int left_shift = 20; + const double twice_max_input_scale = 2 * std::max(shape1.scale, shape2.scale); + const double real_input1_multiplier = shape1.scale / twice_max_input_scale; + const double real_input2_multiplier = shape2.scale / twice_max_input_scale; + const double real_output_multiplier = + twice_max_input_scale / + ((1 << left_shift) * shapeOut.scale); + + int32_t input1_multiplier; + int32_t input1_shift; + if (!QuantizeMultiplierSmallerThanOne(real_input1_multiplier, + &input1_multiplier, &input1_shift)) { + return false; + } + int32_t input2_multiplier; + int32_t input2_shift; + if (!QuantizeMultiplierSmallerThanOne(real_input2_multiplier, + &input2_multiplier, &input2_shift)) { + return false; + } + int32_t output_multiplier; + int32_t output_shift; + if (!QuantizeMultiplierSmallerThanOne(real_output_multiplier, + &output_multiplier, &output_shift)) { + return false; + } + int32_t output_activation_min; + int32_t output_activation_max; + CalculateActivationRangeUint8(activation, shapeOut, + &output_activation_min, + &output_activation_max); + + #define ANDROID_NN_NORMAL_ADD(activation) \ + optimized_ops::Add<FusedActivationFunctionType::activation>( \ + left_shift, \ + in1, convertShapeToDims(shape1), \ + input1_offset, input1_multiplier, input1_shift, \ + in2, convertShapeToDims(shape2), \ + input2_offset, input2_multiplier, input2_shift, \ + output_offset, 
output_multiplier, output_shift, \ + output_activation_min, output_activation_max, \ + out, convertShapeToDims(shapeOut)) + + #define ANDROID_NN_BROADCAST_ADD(activation) \ + optimized_ops::BroadcastAdd<FusedActivationFunctionType::activation>( \ + left_shift, \ + in1, convertShapeToDims(shape1), \ + input1_offset, input1_multiplier, input1_shift, \ + in2, convertShapeToDims(shape2), \ + input2_offset, input2_multiplier, input2_shift, \ + output_offset, output_multiplier, output_shift, \ + output_activation_min, output_activation_max, \ + out, convertShapeToDims(shapeOut)) + + if (needBroadcast) { + ANDROID_NN_MACRO_DISPATCH(ANDROID_NN_BROADCAST_ADD) + } else { + ANDROID_NN_MACRO_DISPATCH(ANDROID_NN_NORMAL_ADD) + } + + #undef ANDROID_NN_NORMAL_ADD + #undef ANDROID_NN_BROADCAST_ADD + return true; +} + +bool mulFloat32(const float* in1, const Shape& shape1, + const float* in2, const Shape& shape2, + int32_t activation, + float* out, const Shape& shapeOut) { + bool needBroadcast = !SameShape(shape1, shape2); + + #define ANDROID_NN_NORMAL_MUL(activation) \ + optimized_ops::Mul<FusedActivationFunctionType::activation>( \ + in1, convertShapeToDims(shape1), \ + in2, convertShapeToDims(shape2), \ + out, convertShapeToDims(shapeOut)) + + #define ANDROID_NN_BROADCAST_MUL(activation) \ + optimized_ops::BroadcastMul<FusedActivationFunctionType::activation>( \ + in1, convertShapeToDims(shape1), \ + in2, convertShapeToDims(shape2), \ + out, convertShapeToDims(shapeOut)) + + if (needBroadcast) { + ANDROID_NN_MACRO_DISPATCH(ANDROID_NN_BROADCAST_MUL) + } else { + ANDROID_NN_MACRO_DISPATCH(ANDROID_NN_NORMAL_MUL) + } + + #undef ANDROID_NN_NORMAL_MUL + #undef ANDROID_NN_BROADCAST_MUL + return true; +} + +bool mulQuant8(const uint8_t* in1, const Shape& shape1, + const uint8_t* in2, const Shape& shape2, + int32_t activation, + uint8_t* out, const Shape& shapeOut) { + const int32_t input1_offset = -shape1.offset; + const int32_t input2_offset = -shape2.offset; + const int32_t output_offset = shapeOut.offset; + const double input_product_scale = shape1.scale * shape2.scale; + const double real_multiplier = input_product_scale / shapeOut.scale; + int32 output_multiplier; + int output_shift; + if (!QuantizeMultiplierSmallerThanOne(real_multiplier, &output_multiplier, + &output_shift)) { + return false; + } + int32_t output_activation_min; + int32_t output_activation_max; + CalculateActivationRangeUint8(activation, shapeOut, + &output_activation_min, + &output_activation_max); + + // Use BROADCAST version to handle the normal case until we have a optimized Mul. 
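The rescaling constants computed in addQuant8 and mulQuant8 above follow directly from dequantizing each operand as scale * (q - zeroPoint). For multiplication the scales simply combine, so a single multiplier S1 * S2 / Sout is enough. For addition the operands must first be brought to a common scale, which is what the left shift of 20 and the per-input multipliers do; a worked example with illustrative scales:

    // S1 = 0.5, S2 = 0.25, Sout = 1.0, left_shift = 20
    //   twice_max = 2 * max(0.5, 0.25) = 1.0
    //   m1 = 0.5 / 1.0 = 0.5    m2 = 0.25 / 1.0 = 0.25    mout = 1.0 / (2^20 * 1.0) = 2^-20
    // A real value of 2.0 in either tensor (q1 - z1 = 4, or q2 - z2 = 8):
    //   (4 << 20) * m1 = 2^21        (8 << 20) * m2 = 2^21
    // Sum = 2^22, and 2^22 * mout = 4, i.e. 4 * Sout = 4.0 = 2.0 + 2.0 as expected.
    // Dividing by twice_max keeps every multiplier inside (0, 1) so that
    // QuantizeMultiplierSmallerThanOne() applies, and the 20-bit shift preserves
    // precision through the integer arithmetic.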
+ #define ANDROID_NN_BROADCAST_MUL(activation) \ + optimized_ops::BroadcastMul<FusedActivationFunctionType::activation>( \ + in1, convertShapeToDims(shape1), input1_offset, \ + in2, convertShapeToDims(shape2), input2_offset, \ + output_offset, output_multiplier, output_shift, \ + output_activation_min, output_activation_max, \ + out, convertShapeToDims(shapeOut)) + + ANDROID_NN_MACRO_DISPATCH(ANDROID_NN_BROADCAST_MUL) + + #undef ANDROID_NN_NORMAL_MUL + #undef ANDROID_NN_BROADCAST_MUL + return true; +} + +bool floorFloat32(const float* inputData, + float* outputData, + const Shape& shape) { + Dims<4> dim = convertShapeToDims(shape); + optimized_ops::Floor(inputData, dim, outputData, dim); + return true; +} + +bool dequantizeQuant8ToFloat32(const uint8_t* inputData, + float* outputData, + const Shape& shape) { + Dims<4> dim = convertShapeToDims(shape); + optimized_ops::Dequantize(inputData, dim, + shape.offset, shape.scale, + outputData, dim); + return true; +} + +} // namespace rt +} // namespace nnfw diff --git a/runtimes/nn/common/operations/internal/common.h b/runtimes/nn/common/operations/internal/common.h new file mode 100644 index 000000000..1bf1050fd --- /dev/null +++ b/runtimes/nn/common/operations/internal/common.h @@ -0,0 +1,80 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (C) 2017 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __NNFW_RT_COMMON_H__ +#define __NNFW_RT_COMMON_H__ + +#ifndef USE_NEON +#if defined(__ARM_NEON__) || defined(__ARM_NEON) +#define USE_NEON +#include <arm_neon.h> +#endif +#endif + +#include "gemmlowp.h" +#include "types.h" + +namespace nnfw { +namespace rt { + +template <FusedActivationFunctionType Ac> +struct ActivationFunctionImpl {}; + +template <> +struct ActivationFunctionImpl<FusedActivationFunctionType::kNone> { + static float Eval(float x) { return x; } +}; + +template <> +struct ActivationFunctionImpl<FusedActivationFunctionType::kRelu> { + static float Eval(float x) { return x < 0.f ? 0.f : x; } +}; + +template <> +struct ActivationFunctionImpl<FusedActivationFunctionType::kRelu1> { + static float Eval(float x) { return x > 1.f ? 1.f : x < -1.f ? -1.f : x; } +}; + +template <> +struct ActivationFunctionImpl<FusedActivationFunctionType::kRelu6> { + static float Eval(float x) { return x > 6.f ? 6.f : x < 0.f ? 
0.f : x; } +}; + +template <FusedActivationFunctionType Ac> +float ActivationFunction(float x) { + return ActivationFunctionImpl<Ac>::Eval(x); +} + +inline int32 MultiplyByQuantizedMultiplierSmallerThanOne( + int32 x, int32 quantized_multiplier, int right_shift) { + using gemmlowp::RoundingDivideByPOT; + using gemmlowp::SaturatingRoundingDoublingHighMul; + return RoundingDivideByPOT( + SaturatingRoundingDoublingHighMul(x, quantized_multiplier), right_shift); +} + +inline int32 MultiplyByQuantizedMultiplierGreaterThanOne( + int32 x, int32 quantized_multiplier, int left_shift) { + using gemmlowp::SaturatingRoundingDoublingHighMul; + return SaturatingRoundingDoublingHighMul(x * (1 << left_shift), + quantized_multiplier); +} + +} // namespace rt +} // namespace nnfw + +#endif // __NNFW_RT_COMMON_H__ diff --git a/runtimes/nn/common/operations/internal/compatibility.h b/runtimes/nn/common/operations/internal/compatibility.h new file mode 100644 index 000000000..fd33cbd97 --- /dev/null +++ b/runtimes/nn/common/operations/internal/compatibility.h @@ -0,0 +1,57 @@ +/* + * Copyright (C) 2017 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __NNFW_RT_COMPATIBILITY_H__ +#define __NNFW_RT_COMPATIBILITY_H__ + +#include <cassert> +#include <cstdint> + +#ifndef DCHECK +#define DCHECK(condition) (condition) ? (void)0 : assert(false) +#endif + +#ifndef DCHECK_EQ +#define DCHECK_EQ(x, y) ((x) == (y)) ? (void)0 : assert(false) +#endif + +#ifndef DCHECK_GE +#define DCHECK_GE(x, y) ((x) >= (y)) ? (void)0 : assert(false) +#endif + +#ifndef DCHECK_GT +#define DCHECK_GT(x, y) ((x) > (y)) ? (void)0 : assert(false) +#endif + +#ifndef DCHECK_LE +#define DCHECK_LE(x, y) ((x) <= (y)) ? (void)0 : assert(false) +#endif + +#ifndef DCHECK_LT +#define DCHECK_LT(x, y) ((x) < (y)) ? (void)0 : assert(false) +#endif + +#ifndef CHECK_EQ +#define CHECK_EQ(x, y) ((x) == (y)) ? (void)0 : assert(false) +#endif + +using uint8 = std::uint8_t; +using int16 = std::int16_t; +using uint16 = std::uint16_t; +using int32 = std::int32_t; +using uint32 = std::uint32_t; + +#endif // __NNFW_RT_COMPATIBILITY_H__ diff --git a/runtimes/nn/common/operations/internal/optimized/cpu_check.h b/runtimes/nn/common/operations/internal/optimized/cpu_check.h new file mode 100644 index 000000000..02f42fd42 --- /dev/null +++ b/runtimes/nn/common/operations/internal/optimized/cpu_check.h @@ -0,0 +1,28 @@ +/* + * Copyright (C) 2017 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef FRAMEWORKS_ML_NN_COMMON_OPERATIONS_INTERNAL_OPTIMIZED_CPU_CHECK_ +#define FRAMEWORKS_ML_NN_COMMON_OPERATIONS_INTERNAL_OPTIMIZED_CPU_CHECK_ + +// NEON_OR_PORTABLE(SomeFunc, arcs) calls NeonSomeFunc(args) if NEON is +// enabled at build time, or PortableSomeFunc(args) otherwise. +#if defined(__ARM_NEON__) || defined(__ARM_NEON) +#define NEON_OR_PORTABLE(funcname, ...) Neon##funcname(__VA_ARGS__) +#else +#define NEON_OR_PORTABLE(funcname, ...) Portable##funcname(__VA_ARGS__) +#endif + +#endif // FRAMEWORKS_ML_NN_COMMON_OPERATIONS_INTERNAL_OPTIMIZED_CPU_CHECK_ diff --git a/runtimes/nn/common/operations/internal/optimized/depthwiseconv_float.h b/runtimes/nn/common/operations/internal/optimized/depthwiseconv_float.h new file mode 100644 index 000000000..5c05bf20f --- /dev/null +++ b/runtimes/nn/common/operations/internal/optimized/depthwiseconv_float.h @@ -0,0 +1,792 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (C) 2017 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __NNFW_RT_OPTIMIZED_OPS_DEPTHWISECONV_FLOAT_H__ +#define __NNFW_RT_OPTIMIZED_OPS_DEPTHWISECONV_FLOAT_H__ + +#include "gemmlowp.h" +#include "../common.h" +#include "../types.h" + +namespace nnfw { +namespace rt { +namespace optimized_ops { + +// Implementation of float DepthwiseConv + +template <bool kAllowStrided, int kFixedInputDepth, int kFixedDepthMultiplier> +struct FloatDepthwiseConvKernel {}; + +#ifdef USE_NEON + +template <> +struct FloatDepthwiseConvKernel<false, 8, 1> { + static void Run(int num_output_pixels, int input_depth, int depth_multiplier, + const float* input_ptr, int input_ptr_increment, + const float* filter_ptr, float* acc_buffer_ptr) { + // Load the filters + float32x4_t filter[2]; + for (int i = 0; i < 2; i++) { + filter[i] = vld1q_f32(filter_ptr + 4 * i); + } + int outp = 0; + // Handle 2 output pixels at a time. + for (; outp <= num_output_pixels - 2; outp += 2) { + // Load the inputs + float32x4_t input[4]; + for (int i = 0; i < 4; i++) { + input[i] = vld1q_f32(input_ptr + 4 * i); + } + input_ptr += 16; + // Load the accumulators from acc_buffer + float32x4_t acc[4]; + for (int i = 0; i < 4; i++) { + acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i); + } + // Multiply-accumulate + acc[0] = vmlaq_f32(acc[0], input[0], filter[0]); + acc[1] = vmlaq_f32(acc[1], input[1], filter[1]); + acc[2] = vmlaq_f32(acc[2], input[2], filter[0]); + acc[3] = vmlaq_f32(acc[3], input[3], filter[1]); + // Store the accumulators back to acc_buffer + for (int i = 0; i < 4; i++) { + vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 16; + } + // Handle one output pixel at a time. 
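The specializations of FloatDepthwiseConvKernel are chosen by three compile-time facts: whether a per-pixel input stride is needed (kAllowStrided), a fixed input depth, and a fixed depth multiplier. The <false, 8, 1> kernel above keeps the eight per-channel filter weights resident in two q-registers and retires two output pixels per iteration of its main loop; the loop that follows handles a trailing odd pixel with the same registers.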
+ for (; outp < num_output_pixels; outp++) { + // Load the inputs + float32x4_t input[2]; + for (int i = 0; i < 2; i++) { + input[i] = vld1q_f32(input_ptr + 4 * i); + } + input_ptr += 8; + // Load the accumulators from acc_buffer + float32x4_t acc[2]; + for (int i = 0; i < 2; i++) { + acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i); + } + // Multiply-accumulate + for (int i = 0; i < 2; i++) { + acc[i] = vmlaq_f32(acc[i], input[i], filter[i]); + } + // Store the accumulators back to acc_buffer + for (int i = 0; i < 2; i++) { + vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 8; + } + } +}; + +template <> +struct FloatDepthwiseConvKernel<false, 2, 1> { + static void Run(int num_output_pixels, int input_depth, int depth_multiplier, + const float* input_ptr, int input_ptr_increment, + const float* filter_ptr, float* acc_buffer_ptr) { + const float32x2_t filters = vld1_f32(filter_ptr); + const float32x4_t filters_dup2 = vcombine_f32(filters, filters); + int outp = 0; + // Handle 8 output pixels at a time. + for (; outp <= num_output_pixels - 8; outp += 8) { + // Load the inputs + float32x4_t input[4]; + for (int i = 0; i < 4; i++) { + input[i] = vld1q_f32(input_ptr + 4 * i); + } + input_ptr += 16; + // Load the accumulators from acc_buffer + float32x4_t acc[4]; + for (int i = 0; i < 4; i++) { + acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i); + } + // Multiply-accumulate + for (int i = 0; i < 4; i++) { + acc[i] = vmlaq_f32(acc[i], input[i], filters_dup2); + } + // Store the accumulators back to acc_buffer + for (int i = 0; i < 4; i++) { + vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 16; + } + // Handle 4 output pixels at a time. + for (; outp <= num_output_pixels - 4; outp += 4) { + // Load the inputs + float32x4_t input[2]; + for (int i = 0; i < 2; i++) { + input[i] = vld1q_f32(input_ptr + 4 * i); + } + input_ptr += 8; + // Load the accumulators from acc_buffer + float32x4_t acc[2]; + for (int i = 0; i < 2; i++) { + acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i); + } + // Multiply-accumulate + for (int i = 0; i < 2; i++) { + acc[i] = vmlaq_f32(acc[i], input[i], filters_dup2); + } + // Store the accumulators back to acc_buffer + for (int i = 0; i < 2; i++) { + vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 8; + } + // Handle 2 output pixels at a time. + for (; outp <= num_output_pixels - 2; outp += 2) { + // Load the inputs + const float32x4_t input = vld1q_f32(input_ptr); + input_ptr += 4; + // Load the accumulators from acc_buffer + float32x4_t acc = vld1q_f32(acc_buffer_ptr); + // Multiply-accumulate + acc = vmlaq_f32(acc, input, filters_dup2); + // Store the accumulators back to acc_buffer + vst1q_f32(acc_buffer_ptr, acc); + acc_buffer_ptr += 4; + } + // Handle 1 output pixel at a time + for (; outp < num_output_pixels; outp++) { + // Load the inputs + const float32x2_t input = vld1_f32(input_ptr); + input_ptr += 2; + // Load the accumulators from acc_buffer + float32x2_t acc = vld1_f32(acc_buffer_ptr); + // Multiply-accumulate + acc = vmla_f32(acc, input, filters); + // Store the accumulators back to acc_buffer + vst1_f32(acc_buffer_ptr, acc); + acc_buffer_ptr += 2; + } + } +}; + +template <> +struct FloatDepthwiseConvKernel<true, 0, 1> { + static void Run(int num_output_pixels, int input_depth, int depth_multiplier, + const float* input_ptr, int input_ptr_increment, + const float* filter_ptr, float* acc_buffer_ptr) { + // Handle one output pixel at a time. 
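FloatDepthwiseConvKernel<true, 0, 1> is the generic depth-multiplier-1 path: the input depth is only known at run time (the 0 parameter), so the loop below walks each output pixel's channels in blocks of 16, then 4, then singly, and the kAllowStrided = true variant advances input_ptr by input_ptr_increment between pixels so it also serves strided convolutions.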
+ for (int outp = 0; outp < num_output_pixels; outp++) { + const float* local_filter_ptr = filter_ptr; + const float* local_input_ptr = input_ptr; + int ic = 0; + // Handle 16 input channels at a time. + for (; ic <= input_depth - 16; ic += 16) { + // Load the filters + float32x4_t filter[4]; + for (int i = 0; i < 4; i++) { + filter[i] = vld1q_f32(local_filter_ptr + 4 * i); + } + local_filter_ptr += 16; + // Load the inputs + float32x4_t input[4]; + for (int i = 0; i < 4; i++) { + input[i] = vld1q_f32(local_input_ptr + 4 * i); + } + local_input_ptr += 16; + // Load the accumulators from acc_buffer + float32x4_t acc[4]; + for (int i = 0; i < 4; i++) { + acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i); + } + // Multiply-accumulate + for (int i = 0; i < 4; i++) { + acc[i] = vmlaq_f32(acc[i], input[i], filter[i]); + } + // Store the accumulators back to acc_buffer + for (int i = 0; i < 4; i++) { + vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 16; + } + // Handle 4 input channels at a time. + for (; ic <= input_depth - 4; ic += 4) { + // Load the filters + float32x4_t filter; + filter = vld1q_f32(local_filter_ptr); + local_filter_ptr += 4; + // Load the inputs + float32x4_t input; + input = vld1q_f32(local_input_ptr); + local_input_ptr += 4; + // Load the accumulators from acc_buffer + float32x4_t acc; + acc = vld1q_f32(acc_buffer_ptr); + // Multiply-accumulate + acc = vmlaq_f32(acc, input, filter); + // Store the accumulators back to acc_buffer + vst1q_f32(acc_buffer_ptr, acc); + acc_buffer_ptr += 4; + } + // Handle one input channel at a time. + for (; ic < input_depth; ic++) { + const float input_val = *local_input_ptr++; + const float filter_val = *local_filter_ptr++; + *acc_buffer_ptr++ += filter_val * input_val; + } + input_ptr += input_ptr_increment; + } + } +}; + +template <> +struct FloatDepthwiseConvKernel<true, 0, 8> { + static void Run(int num_output_pixels, int input_depth, int depth_multiplier, + const float* input_ptr, int input_ptr_increment, + const float* filter_ptr, float* acc_buffer_ptr) { + // Handle one output pixel at a time. + for (int outp = 0; outp < num_output_pixels; outp++) { + const float* local_filter_ptr = filter_ptr; + const float* local_input_ptr = input_ptr; + int ic = 0; + // Handle 2 input channels at a time. + for (; ic <= input_depth - 2; ic += 2) { + // Load the filters + float32x4_t filter[4]; + for (int i = 0; i < 4; i++) { + filter[i] = vld1q_f32(local_filter_ptr + 4 * i); + } + local_filter_ptr += 16; + // Load the inputs + const float32x2_t input = vld1_f32(local_input_ptr); + local_input_ptr += 2; + // Load the accumulators from acc_buffer + float32x4_t acc[4]; + for (int i = 0; i < 4; i++) { + acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i); + } + // Multiply-accumulate + acc[0] = vmlaq_lane_f32(acc[0], filter[0], input, 0); + acc[1] = vmlaq_lane_f32(acc[1], filter[1], input, 0); + acc[2] = vmlaq_lane_f32(acc[2], filter[2], input, 1); + acc[3] = vmlaq_lane_f32(acc[3], filter[3], input, 1); + // Store the accumulators back to acc_buffer + for (int i = 0; i < 4; i++) { + vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 16; + } + // Handle one input channel at a time. 
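In the depth-multiplier-8 kernel above, each input channel fans out to eight consecutive output channels, so the two-channel block loads sixteen filter floats and uses vmlaq_lane_f32 to broadcast input lane 0 into the first two accumulators and lane 1 into the last two; the single-channel tail below does the same with vmlaq_n_f32.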
+ for (; ic < input_depth; ic++) { + // Load the filters + float32x4_t filter[2]; + for (int i = 0; i < 2; i++) { + filter[i] = vld1q_f32(local_filter_ptr + 4 * i); + } + local_filter_ptr += 8; + // Load the inputs + const float input_val = *local_input_ptr++; + // Load the accumulators from acc_buffer + float32x4_t acc[2]; + for (int i = 0; i < 2; i++) { + acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i); + } + // Multiply-accumulate + for (int i = 0; i < 2; i++) { + acc[i] = vmlaq_n_f32(acc[i], filter[i], input_val); + } + // Store the accumulators back to acc_buffer + for (int i = 0; i < 2; i++) { + vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 8; + } + input_ptr += input_ptr_increment; + } + } +}; + +template <> +struct FloatDepthwiseConvKernel<true, 0, 2> { + static void Run(int num_output_pixels, int input_depth, int depth_multiplier, + const float* input_ptr, int input_ptr_increment, + const float* filter_ptr, float* acc_buffer_ptr) { + // Handle one output pixel at a time. + for (int outp = 0; outp < num_output_pixels; outp++) { + const float* local_filter_ptr = filter_ptr; + const float* local_input_ptr = input_ptr; + int ic = 0; + // Handle 8 input channels at a time. + for (; ic <= input_depth - 8; ic += 8) { + // Load the filters + float32x4_t filter[4]; + for (int i = 0; i < 4; i++) { + filter[i] = vld1q_f32(local_filter_ptr + 4 * i); + } + local_filter_ptr += 16; + // Load the inputs + float32x4x2_t input_dup2[2]; + for (int i = 0; i < 2; i++) { + const float32x4_t input = vld1q_f32(local_input_ptr + 4 * i); + input_dup2[i] = vzipq_f32(input, input); + } + local_input_ptr += 8; + // Load the accumulators from acc_buffer + float32x4_t acc[4]; + for (int i = 0; i < 4; i++) { + acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i); + } + // Multiply-accumulate + acc[0] = vmlaq_f32(acc[0], filter[0], input_dup2[0].val[0]); + acc[1] = vmlaq_f32(acc[1], filter[1], input_dup2[0].val[1]); + acc[2] = vmlaq_f32(acc[2], filter[2], input_dup2[1].val[0]); + acc[3] = vmlaq_f32(acc[3], filter[3], input_dup2[1].val[1]); + // Store the accumulators back to acc_buffer + for (int i = 0; i < 4; i++) { + vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 16; + } + // Handle 4 input channels at a time. + for (; ic <= input_depth - 4; ic += 4) { + // Load the filters + float32x2_t filter[4]; + for (int i = 0; i < 4; i++) { + filter[i] = vld1_f32(local_filter_ptr + 2 * i); + } + local_filter_ptr += 8; + // Load the inputs + const float32x4_t input = vld1q_f32(local_input_ptr); + local_input_ptr += 4; + // Load the accumulators from acc_buffer + float32x2_t acc[4]; + for (int i = 0; i < 4; i++) { + acc[i] = vld1_f32(acc_buffer_ptr + 2 * i); + } + // Multiply-accumulate + acc[0] = vmla_lane_f32(acc[0], filter[0], vget_low_f32(input), 0); + acc[1] = vmla_lane_f32(acc[1], filter[1], vget_low_f32(input), 1); + acc[2] = vmla_lane_f32(acc[2], filter[2], vget_high_f32(input), 0); + acc[3] = vmla_lane_f32(acc[3], filter[3], vget_high_f32(input), 1); + // Store the accumulators back to acc_buffer + for (int i = 0; i < 4; i++) { + vst1_f32(acc_buffer_ptr + 2 * i, acc[i]); + } + acc_buffer_ptr += 8; + } + // Handle 2 input channels at a time. 
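+ // Two channels yield four outputs here: the low half of the filter vector
+ // holds channel 0's two taps and the high half channel 1's, each multiplied
+ // by the matching input lane with vmla_lane_f32.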
+ for (; ic <= input_depth - 2; ic += 2) { + // Load the filters + const float32x4_t filter = vld1q_f32(local_filter_ptr); + local_filter_ptr += 4; + // Load the inputs + const float32x2_t input = vld1_f32(local_input_ptr); + local_input_ptr += 2; + // Load the accumulators from acc_buffer + float32x2_t acc[2]; + for (int i = 0; i < 2; i++) { + acc[i] = vld1_f32(acc_buffer_ptr + 2 * i); + } + // Multiply-accumulate + acc[0] = vmla_lane_f32(acc[0], vget_low_f32(filter), input, 0); + acc[1] = vmla_lane_f32(acc[1], vget_high_f32(filter), input, 1); + // Store the accumulators back to acc_buffer + for (int i = 0; i < 2; i++) { + vst1_f32(acc_buffer_ptr + 2 * i, acc[i]); + } + acc_buffer_ptr += 4; + } + // Handle one input channel at a time. + for (; ic < input_depth; ic++) { + // Load the inputs + const float input_val = *local_input_ptr++; + // Multiply-accumulate + for (int i = 0; i < 2; i++) { + acc_buffer_ptr[i] += local_filter_ptr[i] * input_val; + } + local_filter_ptr += 2; + acc_buffer_ptr += 2; + } + input_ptr += input_ptr_increment; + } + } +}; + +template <> +struct FloatDepthwiseConvKernel<true, 1, 8> { + static void Run(int num_output_pixels, int input_depth, int depth_multiplier, + const float* input_ptr, int input_ptr_increment, + const float* filter_ptr, float* acc_buffer_ptr) { + // Handle one output pixel at a time. + for (int outp = 0; outp < num_output_pixels; outp++) { + // Load the filters + float32x4_t filter[2]; + for (int i = 0; i < 2; i++) { + filter[i] = vld1q_f32(filter_ptr + 4 * i); + } + // Load the inputs + const float input_val = *input_ptr; + input_ptr += input_ptr_increment; + // Load the accumulators from acc_buffer + float32x4_t acc[2]; + for (int i = 0; i < 2; i++) { + acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i); + } + // Multiply-accumulate + for (int i = 0; i < 2; i++) { + acc[i] = vmlaq_n_f32(acc[i], filter[i], input_val); + } + // Store the accumulators back to acc_buffer + for (int i = 0; i < 2; i++) { + vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 8; + } + } +}; + +template <> +struct FloatDepthwiseConvKernel<true, 0, 16> { + static void Run(int num_output_pixels, int input_depth, int depth_multiplier, + const float* input_ptr, int input_ptr_increment, + const float* filter_ptr, float* acc_buffer_ptr) { + // Handle one output pixel at a time. + for (int outp = 0; outp < num_output_pixels; outp++) { + const float* local_filter_ptr = filter_ptr; + const float* local_input_ptr = input_ptr; + for (int ic = 0; ic < input_depth; ic++) { + // Load the filters + float32x4_t filter[4]; + for (int i = 0; i < 4; i++) { + filter[i] = vld1q_f32(local_filter_ptr + 4 * i); + } + local_filter_ptr += 16; + // Load the inputs + const float input_val = *local_input_ptr++; + // Load the accumulators from acc_buffer + float32x4_t acc[4]; + for (int i = 0; i < 4; i++) { + acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i); + } + // Multiply-accumulate + for (int i = 0; i < 4; i++) { + acc[i] = vmlaq_n_f32(acc[i], filter[i], input_val); + } + // Store the accumulators back to acc_buffer + for (int i = 0; i < 4; i++) { + vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 16; + } + input_ptr += input_ptr_increment; + } + } +}; +#endif + +// Accumulates the effect of one row of the filter, on a segment of one row +// of the output, accessing the corresponding one row of the input. 
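+ // Illustrative example of the boundary computation done inside: with
+ // stride == 1, pad_width == 1, filter_width == 3 and input_width == 8, the
+ // tap at filter_x == 0 can only touch output x in
+ // [pad_width - filter_x, pad_width + input_width - filter_x) == [1, 9);
+ // that range is then intersected with [out_x_buffer_start, out_x_buffer_end)
+ // before the per-pixel kernel is invoked.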
+template <bool kAllowStrided, int kFixedInputDepth, int kFixedDepthMultiplier> +void FloatDepthwiseConvAccumRow(int stride, int input_depth, int input_width, + const float* input_data, int pad_width, + int depth_multiplier, int filter_width, + const float* filter_data, + int out_x_buffer_start, int out_x_buffer_end, + int output_depth, float* acc_buffer) { +#ifdef GEMMLOWP_PROFILING + gemmlowp::ScopedProfilingLabel label(__PRETTY_FUNCTION__); +#endif + // Sanity check parameters. This is important in particular to ensure + // that we keep the number of template instantiations minimal, so we don't + // increase binary size unnecessarily. + static_assert(kFixedDepthMultiplier || !kFixedInputDepth, ""); + static_assert(kFixedInputDepth || kAllowStrided, ""); + DCHECK(stride == 1 || kAllowStrided); + if (kFixedInputDepth) { + DCHECK_EQ(input_depth, kFixedInputDepth); + } + if (kFixedDepthMultiplier) { + DCHECK_EQ(depth_multiplier, kFixedDepthMultiplier); + } + DCHECK_EQ(output_depth, input_depth * depth_multiplier); + const int input_ptr_increment = stride * input_depth; + const float* filter_base_ptr = filter_data; + for (int filter_x = 0; filter_x < filter_width; ++filter_x) { + // For the current (filter_x, filter_y) point in the filter, + // compute the boundaries of the corresponding output row segment. + int out_x_loop_start_unclampled = 0; + int out_x_loop_end_unclampled = 0; + if (kAllowStrided) { + if (stride == 2) { + out_x_loop_start_unclampled = (pad_width - filter_x + 1) / 2; + out_x_loop_end_unclampled = + (pad_width + input_width - filter_x + 1) / 2; + } else if (stride == 4) { + out_x_loop_start_unclampled = (pad_width - filter_x + 3) / 4; + out_x_loop_end_unclampled = + (pad_width + input_width - filter_x + 3) / 4; + } else { + out_x_loop_start_unclampled = + (pad_width - filter_x + stride - 1) / stride; + out_x_loop_end_unclampled = + (pad_width + input_width - filter_x + stride - 1) / stride; + } + } else { + out_x_loop_start_unclampled = pad_width - filter_x; + out_x_loop_end_unclampled = pad_width + input_width - filter_x; + } + // The kernel will have to iterate on the segment of the + // output row that starts at out_x_loop_start and out_x_loop_end. + const int out_x_loop_start = + std::max(out_x_buffer_start, out_x_loop_start_unclampled); + const int out_x_loop_end = + std::min(out_x_buffer_end, out_x_loop_end_unclampled); + + float* acc_buffer_ptr = + acc_buffer + (out_x_loop_start - out_x_buffer_start) * output_depth; + const int in_x_origin = (out_x_loop_start * stride) - pad_width + filter_x; + const float* input_ptr = input_data + in_x_origin * input_depth; + const int num_output_pixels = out_x_loop_end - out_x_loop_start; + FloatDepthwiseConvKernel<kAllowStrided, kFixedInputDepth, + kFixedDepthMultiplier>::Run(num_output_pixels, + input_depth, + depth_multiplier, + input_ptr, + input_ptr_increment, + filter_base_ptr, + acc_buffer_ptr); + filter_base_ptr += output_depth; + } +} + +// generic fallback of FloatDepthwiseConvAccumRow, portable, non-templatized. 
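+ // It is selected whenever no specialized FloatDepthwiseConvKernel matches
+ // the current stride / input_depth / depth_multiplier combination, and
+ // accumulates with plain scalar loops over output x, input channel and
+ // depth multiplier.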
+inline void FloatDepthwiseConvAccumRowGeneric( + int stride, int input_depth, int input_width, const float* input_data, + int pad_width, int depth_multiplier, int filter_width, + const float* filter_data, int out_x_buffer_start, int out_x_buffer_end, + int output_depth, float* acc_buffer) { + gemmlowp::ScopedProfilingLabel label("DepthwiseConvAccumRowGeneric (slow)"); + const float* filter_base_ptr = filter_data; + for (int filter_x = 0; filter_x < filter_width; ++filter_x) { + const int out_x_loop_start = std::max( + out_x_buffer_start, (pad_width - filter_x + stride - 1) / stride); + const int out_x_loop_end = + std::min(out_x_buffer_end, + (pad_width + input_width - filter_x + stride - 1) / stride); + + float* acc_buffer_ptr = + acc_buffer + (out_x_loop_start - out_x_buffer_start) * output_depth; + const int in_x_origin = (out_x_loop_start * stride) - pad_width + filter_x; + const float* input_ptr = input_data + in_x_origin * input_depth; + const int input_ptr_increment = (stride - 1) * input_depth; + for (int out_x = out_x_loop_start; out_x < out_x_loop_end; out_x++) { + const float* filter_ptr = filter_base_ptr; + for (int ic = 0; ic < input_depth; ++ic) { + const float input_val = *input_ptr++; + for (int m = 0; m < depth_multiplier; m++) { + const float filter_val = *filter_ptr++; + *acc_buffer_ptr++ += filter_val * input_val; + } + } + input_ptr += input_ptr_increment; + } + filter_base_ptr += output_depth; + } +} + +// Initializes the accumulator buffer with bias values. +inline void DepthwiseConvInitAccBuffer(int num_output_pixels, int output_depth, + const float* bias_data, + float* acc_buffer) { + for (int i = 0; i < num_output_pixels; i++) { + memcpy(acc_buffer + i * output_depth, bias_data, + sizeof(acc_buffer[0]) * output_depth); + } +} + +template <FusedActivationFunctionType Ac> +void DepthwiseConv(const float* input_data, const Dims<4>& input_dims, + const float* filter_data, const Dims<4>& filter_dims, + const float* bias_data, const Dims<4>& bias_dims, + int stride_width, int stride_height, + int pad_width, int pad_height, int depth_multiplier, + float* output_data, const Dims<4>& output_dims) { + gemmlowp::ScopedProfilingLabel label("DepthwiseConv"); + static_assert(Ac == FusedActivationFunctionType::kNone || + Ac == FusedActivationFunctionType::kRelu || + Ac == FusedActivationFunctionType::kRelu6 || + Ac == FusedActivationFunctionType::kRelu1, + ""); + const int batches = MatchingArraySize(input_dims, 3, output_dims, 3); + const int output_depth = MatchingArraySize(filter_dims, 0, output_dims, 0); + const int input_height = ArraySize(input_dims, 2); + const int input_width = ArraySize(input_dims, 1); + const int input_depth = ArraySize(input_dims, 0); + const int filter_height = ArraySize(filter_dims, 2); + const int filter_width = ArraySize(filter_dims, 1); + const int output_height = ArraySize(output_dims, 2); + const int output_width = ArraySize(output_dims, 1); +#if 0 // TODO-NNRT : Check if assertion is needed, output depth some times not equal to input * depthmultiplier + DCHECK(output_depth == input_depth * depth_multiplier); +#endif + + static const int kAccBufferMaxSize = 1024; + float acc_buffer[kAccBufferMaxSize]; + DCHECK_GE(kAccBufferMaxSize, output_depth); + const int kOutputPixelsInAccBuffer = kAccBufferMaxSize / output_depth; + const int kAccBufferActualSize = kOutputPixelsInAccBuffer * output_depth; + DCHECK_LE(kOutputPixelsInAccBuffer * output_depth, kAccBufferActualSize); + DCHECK_LE(kAccBufferActualSize, kAccBufferMaxSize); + 
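+ // Illustrative sizing example: with output_depth == 64,
+ // kOutputPixelsInAccBuffer == 1024 / 64 == 16 and
+ // kAccBufferActualSize == 16 * 64 == 1024, so each pass over acc_buffer
+ // below covers up to 16 output pixels.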
DCHECK_GE(kOutputPixelsInAccBuffer, 1); + + // row_accum_func will point to the core accumulation function to be used + // for this DepthwiseConv op. + auto* row_accum_func = FloatDepthwiseConvAccumRowGeneric; + + const int kMaxFixedDepthMultiplier = 16; + int fixed_depth_multiplier = 0; + if (depth_multiplier <= kMaxFixedDepthMultiplier) { + fixed_depth_multiplier = depth_multiplier; + } + // kMaxUnrolling is the max number of output values that we aim to handle + // in one unrolled iteration of the inner loop. For practical performance + // reasons, it is limited by the number of available registers. We could + // fine-tune it depending on the architecture, but that's not worth doing + // since this whole code is not very optimized to begin with. The + // present value reflects what's realistic on ARM 32bit NEON with 16 128-bit + // vector registers. + const int kMaxUnrolling = 8; + int fixed_input_depth = 0; + if (fixed_depth_multiplier && + input_depth * fixed_depth_multiplier <= kMaxUnrolling) { + fixed_input_depth = input_depth; + } +#define TFMINI_USE_DEPTHWISECONV_KERNEL(ALLOW_STRIDED, FIXED_INPUT_DEPTH, \ + FIXED_DEPTH_MULTIPLIER) \ + if ((stride_width == 1 || ALLOW_STRIDED) && \ + fixed_input_depth == FIXED_INPUT_DEPTH && \ + fixed_depth_multiplier == FIXED_DEPTH_MULTIPLIER) { \ + row_accum_func = \ + FloatDepthwiseConvAccumRow<ALLOW_STRIDED, FIXED_INPUT_DEPTH, \ + FIXED_DEPTH_MULTIPLIER>; \ + } + +#ifdef USE_NEON + TFMINI_USE_DEPTHWISECONV_KERNEL(true, 0, 1) + TFMINI_USE_DEPTHWISECONV_KERNEL(true, 0, 8) + TFMINI_USE_DEPTHWISECONV_KERNEL(true, 0, 2) + TFMINI_USE_DEPTHWISECONV_KERNEL(false, 8, 1) + TFMINI_USE_DEPTHWISECONV_KERNEL(false, 2, 1) + TFMINI_USE_DEPTHWISECONV_KERNEL(true, 0, 16) + TFMINI_USE_DEPTHWISECONV_KERNEL(true, 1, 8) +#endif // USE_NEON + +#undef TFMINI_USE_DEPTHWISECONV_KERNEL + + // Now that we have determined row_accum_func, we can start work. + float* output_ptr = output_data; + for (int b = 0; b < batches; ++b) { + for (int out_y = 0; out_y < output_height; ++out_y) { + const int in_y_origin = (out_y * stride_height) - pad_height; + const int filter_y_start = std::max(0, -in_y_origin); + const int filter_y_end = + std::min(filter_height, input_height - in_y_origin); + for (int out_x_buffer_start = 0; out_x_buffer_start < output_width; + out_x_buffer_start += kOutputPixelsInAccBuffer) { + const int out_x_buffer_end = std::min( + output_width, out_x_buffer_start + kOutputPixelsInAccBuffer); + // We call a 'pixel' a group of activation that share all but the + // 'depth'/'channel' coordinate. num_output_pixels is the number of + // output pixels that we will accumulate in this loop iteration. + const int num_output_pixels = out_x_buffer_end - out_x_buffer_start; + // Initialize our local accumulator with the bias values, so we don't + // have to add them later. + DepthwiseConvInitAccBuffer(num_output_pixels, output_depth, bias_data, + acc_buffer); + // Accumulation loop. Most of the time should be spent in here. + for (int filter_y = filter_y_start; filter_y < filter_y_end; + ++filter_y) { + const int in_y = in_y_origin + filter_y; + row_accum_func(stride_width, input_depth, input_width, + input_data + in_y * input_dims.strides[2] + + b * input_dims.strides[3], + pad_width, depth_multiplier, filter_width, + filter_data + filter_y * filter_dims.strides[2], + out_x_buffer_start, out_x_buffer_end, output_depth, + acc_buffer); + } + // Finished accumulating. Now store to destination. 
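+ // The fused activation (none / ReLU / ReLU6 / ReLU1) is applied while
+ // storing: 16 and then 4 values at a time with NEON min/max, and finally
+ // one by one for the leftovers.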
+ const int num_output_values = output_depth * num_output_pixels; + int i = 0; +#ifdef USE_NEON + // Handle 16 values at a time + for (; i <= num_output_values - 16; i += 16) { + float32x4_t acc[4]; + for (int k = 0; k < 4; k++) { + acc[k] = vld1q_f32(acc_buffer + i + 4 * k); + } + if (Ac == FusedActivationFunctionType::kRelu) { + for (int k = 0; k < 4; k++) { + acc[k] = vmaxq_f32(vdupq_n_f32(0.f), acc[k]); + } + } else if (Ac == FusedActivationFunctionType::kRelu6) { + for (int k = 0; k < 4; k++) { + acc[k] = vmaxq_f32(vdupq_n_f32(0.f), + vminq_f32(vdupq_n_f32(6.f), acc[k])); + } + } else if (Ac == FusedActivationFunctionType::kRelu1) { + for (int k = 0; k < 4; k++) { + acc[k] = vmaxq_f32(vdupq_n_f32(-1.f), + vminq_f32(vdupq_n_f32(1.f), acc[k])); + } + } + for (int k = 0; k < 4; k++) { + vst1q_f32(output_ptr + 4 * k, acc[k]); + } + output_ptr += 16; + } + // Handle 4 values at a time + for (; i <= num_output_values - 4; i += 4) { + float32x4_t acc = vld1q_f32(acc_buffer + i); + if (Ac == FusedActivationFunctionType::kRelu) { + acc = vmaxq_f32(vdupq_n_f32(0.f), acc); + } else if (Ac == FusedActivationFunctionType::kRelu6) { + acc = vmaxq_f32(vdupq_n_f32(0.f), vminq_f32(vdupq_n_f32(6.f), acc)); + } else if (Ac == FusedActivationFunctionType::kRelu1) { + acc = + vmaxq_f32(vdupq_n_f32(-1.f), vminq_f32(vdupq_n_f32(1.f), acc)); + } + vst1q_f32(output_ptr, acc); + output_ptr += 4; + } +#endif + // Handle leftover values, one by one. This is very slow. + for (; i < num_output_values; i++) { + float acc = acc_buffer[i]; + if (Ac == FusedActivationFunctionType::kRelu) { + acc = std::max(0.f, acc); + } else if (Ac == FusedActivationFunctionType::kRelu6) { + acc = std::max(0.f, std::min(6.f, acc)); + } else if (Ac == FusedActivationFunctionType::kRelu1) { + acc = std::max(-1.f, std::min(1.f, acc)); + } + *output_ptr++ = acc; + } + } + } + } +} + +} // namespace optimized_ops +} // namespace rt +} // namespace nnfw + + +#endif // __NNFW_RT_OPTIMIZED_OPS_DEPTHWISECONV_FLOAT_H__ diff --git a/runtimes/nn/common/operations/internal/optimized/depthwiseconv_uint8.h b/runtimes/nn/common/operations/internal/optimized/depthwiseconv_uint8.h new file mode 100644 index 000000000..220f8793e --- /dev/null +++ b/runtimes/nn/common/operations/internal/optimized/depthwiseconv_uint8.h @@ -0,0 +1,1606 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (C) 2017 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef __NNFW_RT_OPTIMIZED_OPS_DEPTHWISECONV_UINT8_H__ +#define __NNFW_RT_OPTIMIZED_OPS_DEPTHWISECONV_UINT8_H__ + +#include "fixedpoint.h" +#include "gemmlowp.h" +#include "../common.h" +#include "../types.h" + +namespace nnfw { +namespace rt { +namespace optimized_ops { + +// Implementation of quantized DepthwiseConv + +template <bool kAllowStrided, int kFixedInputDepth, int kFixedDepthMultiplier> +struct QuantizedDepthwiseConvKernel {}; + +#ifdef USE_NEON +template <> +struct QuantizedDepthwiseConvKernel<true, 8, 2> { + static void Run(int num_output_pixels, int input_depth, int depth_multiplier, + const uint8* input_ptr, int16 input_offset, + int input_ptr_increment, const uint8* filter_ptr, + int16 filter_offset, int32* acc_buffer_ptr) { + // Load the filters, add filter_offset. + uint8x8x2_t filter_u8; + filter_u8.val[0] = vld1_u8(filter_ptr); + filter_u8.val[1] = vld1_u8(filter_ptr + 8); + int16x8_t filter[2]; + for (int i = 0; i < 2; i++) { + filter[i] = vaddq_s16(vreinterpretq_s16_u16(vmovl_u8(filter_u8.val[i])), + vdupq_n_s16(filter_offset)); + } + // Handle one output pixel at a time. + for (int outp = 0; outp < num_output_pixels; outp++) { + // Load the accumulators from acc_buffer + int32x4x2_t acc[2]; + for (int i = 0; i < 2; i++) { + acc[i].val[0] = vld1q_s32(acc_buffer_ptr + 4 * i); + acc[i].val[1] = vld1q_s32(acc_buffer_ptr + 4 * i + 8); + } + // Load the inputs, add input_offset. + const uint8x8_t input_u8 = vld1_u8(input_ptr); + input_ptr += input_ptr_increment; + const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8)); + const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset)); + // Duplicate the input values, 2-fold + const int16x8x2_t input_dup2 = vzipq_s16(input, input); + // Multiply-accumulate + for (int i = 0; i < 2; i++) { + acc[0].val[i] = vmlal_s16(acc[0].val[i], vget_low_s16(filter[i]), + vget_low_s16(input_dup2.val[i])); + acc[1].val[i] = vmlal_s16(acc[1].val[i], vget_high_s16(filter[i]), + vget_high_s16(input_dup2.val[i])); + } + // Store the accumulators back to acc_buffer + for (int i = 0; i < 2; i++) { + vst1q_s32(acc_buffer_ptr + 4 * i, acc[i].val[0]); + vst1q_s32(acc_buffer_ptr + 4 * i + 8, acc[i].val[1]); + } + acc_buffer_ptr += 16; + } + } +}; + +template <> +struct QuantizedDepthwiseConvKernel<false, 8, 1> { + static void Run(int num_output_pixels, int input_depth, int depth_multiplier, + const uint8* input_ptr, int16 input_offset, + int input_ptr_increment, const uint8* filter_ptr, + int16 filter_offset, int32* acc_buffer_ptr) { + // Load the filters, add filter_offset. + const uint8x8_t filter_u8 = vld1_u8(filter_ptr); + const int16x8_t filter_s16 = vreinterpretq_s16_u16(vmovl_u8(filter_u8)); + const int16x8_t filter = vaddq_s16(filter_s16, vdupq_n_s16(filter_offset)); + + int outp = 0; + // Handle 2 output pixels at a time. + for (; outp <= num_output_pixels - 2; outp += 2) { + // Load the accumulators from acc_buffer. + int32x4_t acc[4]; + for (int i = 0; i < 4; i++) { + acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i); + } + // Load the inputs, add input_offset. + uint8x8_t input_u8[2]; + for (int i = 0; i < 2; i++) { + input_u8[i] = vld1_u8(input_ptr + 8 * i); + } + input_ptr += 16; + int16x8_t input[2]; + for (int i = 0; i < 2; i++) { + input[i] = vreinterpretq_s16_u16(vmovl_u8(input_u8[i])); + } + for (int i = 0; i < 2; i++) { + input[i] = vaddq_s16(input[i], vdupq_n_s16(input_offset)); + } + // Multiply-accumulate. 
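+ // vmlal_s16 widens the offset-corrected 16-bit inputs and filter values
+ // and accumulates their products into the 32-bit accumulators, one half
+ // (4 lanes) at a time.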
+ acc[0] = vmlal_s16(acc[0], vget_low_s16(filter), vget_low_s16(input[0])); + acc[1] = + vmlal_s16(acc[1], vget_high_s16(filter), vget_high_s16(input[0])); + acc[2] = vmlal_s16(acc[2], vget_low_s16(filter), vget_low_s16(input[1])); + acc[3] = + vmlal_s16(acc[3], vget_high_s16(filter), vget_high_s16(input[1])); + // Store the accumulators back to acc_buffer + for (int i = 0; i < 4; i++) { + vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 16; + } + // Handle 1 output pixel at a time. + for (; outp < num_output_pixels; outp++) { + // Load the accumulators from acc_buffer. + int32x4_t acc[2]; + acc[0] = vld1q_s32(acc_buffer_ptr); + acc[1] = vld1q_s32(acc_buffer_ptr + 4); + + // Load the inputs, add input_offset. + const uint8x8_t input_u8 = vld1_u8(input_ptr); + input_ptr += 8; + const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8)); + const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset)); + // Multiply-accumulate. + acc[0] = vmlal_s16(acc[0], vget_low_s16(filter), vget_low_s16(input)); + acc[1] = vmlal_s16(acc[1], vget_high_s16(filter), vget_high_s16(input)); + // Store the accumulators back to acc_buffer + vst1q_s32(acc_buffer_ptr, acc[0]); + vst1q_s32(acc_buffer_ptr + 4, acc[1]); + acc_buffer_ptr += 8; + } + } +}; + +template <> +struct QuantizedDepthwiseConvKernel<false, 4, 2> { + static void Run(int num_output_pixels, int input_depth, int depth_multiplier, + const uint8* input_ptr, int16 input_offset, + int input_ptr_increment, const uint8* filter_ptr, + int16 filter_offset, int32* acc_buffer_ptr) { + // Load the filters, add filter_offset. + const uint8x8_t filter_u8 = vld1_u8(filter_ptr); + const int16x8_t filter_s16 = vreinterpretq_s16_u16(vmovl_u8(filter_u8)); + const int16x8_t filter = vaddq_s16(filter_s16, vdupq_n_s16(filter_offset)); + + int outp = 0; + // Handle 2 output pixels at a time. + for (; outp <= num_output_pixels - 2; outp += 2) { + // Load the accumulators from acc_buffer + int32x4_t acc[4]; + for (int i = 0; i < 4; i++) { + acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i); + } + // Load the inputs, add input_offset. + const uint8x8_t input_u8 = vld1_u8(input_ptr); + input_ptr += 8; + const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8)); + const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset)); + // Duplicate the input values, 2-fold + const int16x8x2_t input_dup2 = vzipq_s16(input, input); + // Multiply-accumulate + for (int i = 0; i < 2; i++) { + acc[2 * i + 0] = vmlal_s16(acc[2 * i + 0], vget_low_s16(filter), + vget_low_s16(input_dup2.val[i])); + acc[2 * i + 1] = vmlal_s16(acc[2 * i + 1], vget_high_s16(filter), + vget_high_s16(input_dup2.val[i])); + } + // Store the accumulators back to acc_buffer + for (int i = 0; i < 4; i++) { + vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 16; + } + // Handle one output pixel at a time. + for (; outp < num_output_pixels; outp++) { + // Load the accumulators from acc_buffer + int32x4_t acc[2]; + for (int i = 0; i < 2; i++) { + acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i); + } + // Load the inputs, add input_offset. 
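+ // Only 4 input bytes are needed here, so they are inserted lane by lane
+ // into a zeroed register instead of using a full 8-byte vld1_u8.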
+ uint8x8_t input_u8 = vdup_n_u8(0); + input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0); + input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1); + input_u8 = vset_lane_u8(input_ptr[2], input_u8, 2); + input_u8 = vset_lane_u8(input_ptr[3], input_u8, 3); + input_ptr += 4; + const int16x4_t input_s16 = + vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8))); + const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset)); + // Duplicate the input values, 2-fold + const int16x4x2_t input_dup2 = vzip_s16(input, input); + // Multiply-accumulate + acc[0] = vmlal_s16(acc[0], vget_low_s16(filter), input_dup2.val[0]); + acc[1] = vmlal_s16(acc[1], vget_high_s16(filter), input_dup2.val[1]); + // Store the accumulators back to acc_buffer + for (int i = 0; i < 2; i++) { + vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 8; + } + } +}; + +template <> +struct QuantizedDepthwiseConvKernel<false, 2, 8> { + static void Run(int num_output_pixels, int input_depth, int depth_multiplier, + const uint8* input_ptr, int16 input_offset, + int input_ptr_increment, const uint8* filter_ptr, + int16 filter_offset, int32* acc_buffer_ptr) { + // Load the filters, add filter_offset. + int16x8_t filter[2]; + for (int i = 0; i < 2; i++) { + const uint8x8_t filter_u8 = vld1_u8(filter_ptr + 8 * i); + const int16x8_t filter_s16 = vreinterpretq_s16_u16(vmovl_u8(filter_u8)); + filter[i] = vaddq_s16(filter_s16, vdupq_n_s16(filter_offset)); + } + int outp = 0; + // Handle two output pixels at a time. + for (; outp <= num_output_pixels - 2; outp += 2) { + // Load the accumulators from acc_buffer. + int32x4_t acc[8]; + for (int i = 0; i < 8; i++) { + acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i); + } + // Load the inputs, add input_offset. + uint8x8_t input_u8 = vdup_n_u8(0); + input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0); + input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1); + input_u8 = vset_lane_u8(input_ptr[2], input_u8, 2); + input_u8 = vset_lane_u8(input_ptr[3], input_u8, 3); + input_ptr += 4; + const int16x4_t input_s16 = + vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8))); + const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset)); + // Multiply-accumulate. + acc[0] = vmlal_lane_s16(acc[0], vget_low_s16(filter[0]), input, 0); + acc[1] = vmlal_lane_s16(acc[1], vget_high_s16(filter[0]), input, 0); + acc[2] = vmlal_lane_s16(acc[2], vget_low_s16(filter[1]), input, 1); + acc[3] = vmlal_lane_s16(acc[3], vget_high_s16(filter[1]), input, 1); + acc[4] = vmlal_lane_s16(acc[4], vget_low_s16(filter[0]), input, 2); + acc[5] = vmlal_lane_s16(acc[5], vget_high_s16(filter[0]), input, 2); + acc[6] = vmlal_lane_s16(acc[6], vget_low_s16(filter[1]), input, 3); + acc[7] = vmlal_lane_s16(acc[7], vget_high_s16(filter[1]), input, 3); + // Store the accumulators back to acc_buffer. + for (int i = 0; i < 8; i++) { + vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 32; + } + // Handle one output pixel at a time. + for (; outp < num_output_pixels; outp++) { + // Load the accumulators from acc_buffer. + int32x4_t acc[4]; + for (int i = 0; i < 4; i++) { + acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i); + } + // Load the inputs, add input_offset. + uint8x8_t input_u8 = vdup_n_u8(0); + input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0); + input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1); + input_ptr += 2; + const int16x4_t input_s16 = + vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8))); + const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset)); + + // Multiply-accumulate. 
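+ // Input lane 0 (channel 0) is broadcast against the 8 filter values in
+ // filter[0] and lane 1 (channel 1) against filter[1], producing the
+ // 16 outputs of this pixel.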
+ acc[0] = vmlal_lane_s16(acc[0], vget_low_s16(filter[0]), input, 0); + acc[1] = vmlal_lane_s16(acc[1], vget_high_s16(filter[0]), input, 0); + acc[2] = vmlal_lane_s16(acc[2], vget_low_s16(filter[1]), input, 1); + acc[3] = vmlal_lane_s16(acc[3], vget_high_s16(filter[1]), input, 1); + + // Store the accumulators back to acc_buffer. + for (int i = 0; i < 4; i++) { + vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 16; + } + } +}; + +template <> +struct QuantizedDepthwiseConvKernel<false, 2, 2> { + static void Run(int num_output_pixels, int input_depth, int depth_multiplier, + const uint8* input_ptr, int16 input_offset, + int input_ptr_increment, const uint8* filter_ptr, + int16 filter_offset, int32* acc_buffer_ptr) { + // Load the filters, add filter_offset. + uint8x8_t filter_u8 = vdup_n_u8(0); + filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 0); + filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 1); + filter_u8 = vset_lane_u8(filter_ptr[2], filter_u8, 2); + filter_u8 = vset_lane_u8(filter_ptr[3], filter_u8, 3); + const int16x4_t filter_s16 = + vreinterpret_s16_u16(vget_low_u16(vmovl_u8(filter_u8))); + const int16x4_t filter = vadd_s16(filter_s16, vdup_n_s16(filter_offset)); + + int outp = 0; + // Handle 4 output pixels at a time. + for (; outp <= num_output_pixels - 4; outp += 4) { + // Load the accumulators from acc_buffer + int32x4_t acc[4]; + for (int i = 0; i < 4; i++) { + acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i); + } + + // Load the inputs, add input_offset. + const uint8x8_t input_u8 = vld1_u8(input_ptr); + input_ptr += 8; + const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8)); + const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset)); + // Duplicate the input values, 2-fold + const int16x8x2_t input_dup2 = vzipq_s16(input, input); + // Multiply-accumulate + acc[0] = vmlal_s16(acc[0], filter, vget_low_s16(input_dup2.val[0])); + acc[1] = vmlal_s16(acc[1], filter, vget_high_s16(input_dup2.val[0])); + acc[2] = vmlal_s16(acc[2], filter, vget_low_s16(input_dup2.val[1])); + acc[3] = vmlal_s16(acc[3], filter, vget_high_s16(input_dup2.val[1])); + // Store the accumulators back to acc_buffer + for (int i = 0; i < 4; i++) { + vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 16; + } + // Handle one output pixel at a time. + for (; outp < num_output_pixels; outp++) { + // Load the accumulators from acc_buffer + int32x4_t acc = vld1q_s32(acc_buffer_ptr); + + uint8x8_t input_u8 = vdup_n_u8(0); + input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0); + input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1); + input_ptr += 2; + const int16x4_t input_s16 = + vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8))); + const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset)); + // Duplicate the input values, 2-fold + const int16x4_t input_dup2 = vzip_s16(input, input).val[0]; + // Multiply-accumulate + acc = vmlal_s16(acc, filter, input_dup2); + // Store the accumulators back to acc_buffer + vst1q_s32(acc_buffer_ptr, acc); + acc_buffer_ptr += 4; + } + } +}; + +template <> +struct QuantizedDepthwiseConvKernel<false, 2, 1> { + static void Run(int num_output_pixels, int input_depth, int depth_multiplier, + const uint8* input_ptr, int16 input_offset, + int input_ptr_increment, const uint8* filter_ptr, + int16 filter_offset, int32* acc_buffer_ptr) { + // Load the filters, add filter_offset. 
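+ // The two filter taps are duplicated into both halves of a 4-lane vector
+ // ([f0, f1, f0, f1]) so that a single vmlal_s16 can update two output
+ // pixels at once in the loops below.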
+ uint8x8_t filter_u8 = vdup_n_u8(0); + filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 0); + filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 1); + filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 2); + filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 3); + const int16x4_t filter_s16 = + vreinterpret_s16_u16(vget_low_u16(vmovl_u8(filter_u8))); + const int16x4_t filter = vadd_s16(filter_s16, vdup_n_s16(filter_offset)); + + int outp = 0; + // Handle 8 output pixels at a time. + for (; outp <= num_output_pixels - 8; outp += 8) { + // Load the accumulators from acc_buffer. + int32x4_t acc[4]; + for (int i = 0; i < 4; i++) { + acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i); + } + // Load the inputs, add input_offset. + uint8x8_t input_u8[2]; + for (int i = 0; i < 2; i++) { + input_u8[i] = vld1_u8(input_ptr + 8 * i); + } + input_ptr += 16; + int16x8_t input[2]; + for (int i = 0; i < 2; i++) { + input[i] = vreinterpretq_s16_u16(vmovl_u8(input_u8[i])); + } + for (int i = 0; i < 2; i++) { + input[i] = vaddq_s16(input[i], vdupq_n_s16(input_offset)); + } + + // Multiply-accumulate. + acc[0] = vmlal_s16(acc[0], filter, vget_low_s16(input[0])); + acc[1] = vmlal_s16(acc[1], filter, vget_high_s16(input[0])); + acc[2] = vmlal_s16(acc[2], filter, vget_low_s16(input[1])); + acc[3] = vmlal_s16(acc[3], filter, vget_high_s16(input[1])); + // Store the accumulators back to acc_buffer. + for (int i = 0; i < 4; i++) { + vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 16; + } + // Handle 4 output pixels at a time. + for (; outp <= num_output_pixels - 4; outp += 4) { + // Load the accumulators from acc_buffer. + int32x4_t acc[2]; + for (int i = 0; i < 2; i++) { + acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i); + } + // Load the inputs, add input_offset. + const uint8x8_t input_u8 = vld1_u8(input_ptr); + input_ptr += 8; + const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8)); + const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset)); + + // Multiply-accumulate. + acc[0] = vmlal_s16(acc[0], filter, vget_low_s16(input)); + acc[1] = vmlal_s16(acc[1], filter, vget_high_s16(input)); + // Store the accumulators back to acc_buffer. + for (int i = 0; i < 2; i++) { + vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 8; + } + // Handle 2 output pixels at a time. + for (; outp <= num_output_pixels - 2; outp += 2) { + // Load the accumulators from acc_buffer. + int32x4_t acc = vld1q_s32(acc_buffer_ptr); + // Load the inputs, add input_offset. + uint8x8_t input_u8 = vdup_n_u8(0); + input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0); + input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1); + input_u8 = vset_lane_u8(input_ptr[2], input_u8, 2); + input_u8 = vset_lane_u8(input_ptr[3], input_u8, 3); + input_ptr += 4; + const int16x4_t input_s16 = + vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8))); + const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset)); + + // Multiply-accumulate. + acc = vmlal_s16(acc, filter, input); + // Store the accumulators back to acc_buffer. + vst1q_s32(acc_buffer_ptr, acc); + acc_buffer_ptr += 4; + } + // Handle 1 output pixel at a time. + for (; outp < num_output_pixels; outp++) { + // Load the accumulators from acc_buffer. + int32x2_t acc = vld1_s32(acc_buffer_ptr); + // Load the inputs, add input_offset. 
+ uint8x8_t input_u8 = vdup_n_u8(0); + input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0); + input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1); + input_ptr += 2; + const int16x4_t input_s16 = + vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8))); + const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset)); + + // Multiply-accumulate. + acc = vget_low_s32(vmlal_s16(vcombine_s32(acc, acc), filter, input)); + // Store the accumulators back to acc_buffer. + vst1_s32(acc_buffer_ptr, acc); + acc_buffer_ptr += 2; + } + } +}; + +template <> +struct QuantizedDepthwiseConvKernel<false, 1, 2> { + static void Run(int num_output_pixels, int input_depth, int depth_multiplier, + const uint8* input_ptr, int16 input_offset, + int input_ptr_increment, const uint8* filter_ptr, + int16 filter_offset, int32* acc_buffer_ptr) { + // Load the filters, add filter_offset. + uint8x8_t filter_u8 = vdup_n_u8(0); + filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 0); + filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 1); + filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 2); + filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 3); + const int16x4_t filter_s16 = + vreinterpret_s16_u16(vget_low_u16(vmovl_u8(filter_u8))); + const int16x4_t filter = vadd_s16(filter_s16, vdup_n_s16(filter_offset)); + + int outp = 0; + // Handle 8 output pixels at a time. + for (; outp <= num_output_pixels - 8; outp += 8) { + // Load the accumulators from acc_buffer + int32x4_t acc[4]; + for (int i = 0; i < 4; i++) { + acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i); + } + + // Load the inputs, add input_offset. + const uint8x8_t input_u8 = vld1_u8(input_ptr); + input_ptr += 8; + const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8)); + const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset)); + // Duplicate the input values, 2-fold + const int16x8x2_t input_dup2 = vzipq_s16(input, input); + // Multiply-accumulate + acc[0] = vmlal_s16(acc[0], filter, vget_low_s16(input_dup2.val[0])); + acc[1] = vmlal_s16(acc[1], filter, vget_high_s16(input_dup2.val[0])); + acc[2] = vmlal_s16(acc[2], filter, vget_low_s16(input_dup2.val[1])); + acc[3] = vmlal_s16(acc[3], filter, vget_high_s16(input_dup2.val[1])); + // Store the accumulators back to acc_buffer + for (int i = 0; i < 4; i++) { + vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 16; + } + // Handle one output pixel at a time. + for (; outp < num_output_pixels; outp++) { + // Load the accumulators from acc_buffer + int32x2_t acc = vld1_s32(acc_buffer_ptr); + + // Load the inputs, add input_offset. + const uint32 input = *input_ptr++ + input_offset; + + // Multiply-accumulate + acc = vget_low_s32(vmlal_n_s16(vcombine_s32(acc, acc), filter, input)); + // Store the accumulators back to acc_buffer + vst1_s32(acc_buffer_ptr, acc); + acc_buffer_ptr += 2; + } + } +}; + +template <> +struct QuantizedDepthwiseConvKernel<false, 1, 4> { + static void Run(int num_output_pixels, int input_depth, int depth_multiplier, + const uint8* input_ptr, int16 input_offset, + int input_ptr_increment, const uint8* filter_ptr, + int16 filter_offset, int32* acc_buffer_ptr) { + // Load the filters, add filter_offset. 
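+ // With input_depth == 1 and depth_multiplier == 4, the four filter taps are
+ // loaded once up front; each output pixel then broadcasts its single input
+ // value against them.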
+ uint8x8_t filter_u8 = vdup_n_u8(0); + filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 0); + filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 1); + filter_u8 = vset_lane_u8(filter_ptr[2], filter_u8, 2); + filter_u8 = vset_lane_u8(filter_ptr[3], filter_u8, 3); + const int16x4_t filter_s16 = + vreinterpret_s16_u16(vget_low_u16(vmovl_u8(filter_u8))); + const int16x4_t filter = vadd_s16(filter_s16, vdup_n_s16(filter_offset)); + + int outp = 0; + // Handle 8 output pixels at a time. + for (; outp <= num_output_pixels - 8; outp += 8) { + // Load the accumulators from acc_buffer + int32x4_t acc[8]; + for (int i = 0; i < 8; i++) { + acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i); + } + + // Load the inputs, add input_offset. + uint8x8_t input_u8 = vld1_u8(input_ptr); + input_ptr += 8; + const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8)); + const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset)); + + // Multiply-accumulate + acc[0] = vmlal_lane_s16(acc[0], filter, vget_low_s16(input), 0); + acc[1] = vmlal_lane_s16(acc[1], filter, vget_low_s16(input), 1); + acc[2] = vmlal_lane_s16(acc[2], filter, vget_low_s16(input), 2); + acc[3] = vmlal_lane_s16(acc[3], filter, vget_low_s16(input), 3); + acc[4] = vmlal_lane_s16(acc[4], filter, vget_high_s16(input), 0); + acc[5] = vmlal_lane_s16(acc[5], filter, vget_high_s16(input), 1); + acc[6] = vmlal_lane_s16(acc[6], filter, vget_high_s16(input), 2); + acc[7] = vmlal_lane_s16(acc[7], filter, vget_high_s16(input), 3); + + // Store the accumulators back to acc_buffer + for (int i = 0; i < 8; i++) { + vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 32; + } + // Handle 4 output pixels at a time. + for (; outp <= num_output_pixels - 4; outp += 4) { + // Load the accumulators from acc_buffer + int32x4_t acc[4]; + for (int i = 0; i < 4; i++) { + acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i); + } + + // Load the inputs, add input_offset. + uint8x8_t input_u8 = vdup_n_u8(0); + input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0); + input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1); + input_u8 = vset_lane_u8(input_ptr[2], input_u8, 2); + input_u8 = vset_lane_u8(input_ptr[3], input_u8, 3); + input_ptr += 4; + const int16x4_t input_s16 = + vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8))); + const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset)); + + // Multiply-accumulate + acc[0] = vmlal_lane_s16(acc[0], filter, input, 0); + acc[1] = vmlal_lane_s16(acc[1], filter, input, 1); + acc[2] = vmlal_lane_s16(acc[2], filter, input, 2); + acc[3] = vmlal_lane_s16(acc[3], filter, input, 3); + + // Store the accumulators back to acc_buffer + for (int i = 0; i < 4; i++) { + vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 16; + } + // Handle one output pixel at a time. + for (; outp < num_output_pixels; outp++) { + // Load the accumulators from acc_buffer + int32x4_t acc = vld1q_s32(acc_buffer_ptr); + + // Load the inputs, add input_offset. 
+ const uint32 input = *input_ptr++ + input_offset; + + // Multiply-accumulate + acc = vmlal_n_s16(acc, filter, input); + // Store the accumulators back to acc_buffer + vst1q_s32(acc_buffer_ptr, acc); + acc_buffer_ptr += 4; + } + } +}; + +template <> +struct QuantizedDepthwiseConvKernel<false, 4, 1> { + static void Run(int num_output_pixels, int input_depth, int depth_multiplier, + const uint8* input_ptr, int16 input_offset, + int input_ptr_increment, const uint8* filter_ptr, + int16 filter_offset, int32* acc_buffer_ptr) { + // Load the filters, add filter_offset. + uint8x8_t filter_u8 = vdup_n_u8(0); + filter_u8 = vset_lane_u8(filter_ptr[0], filter_u8, 0); + filter_u8 = vset_lane_u8(filter_ptr[1], filter_u8, 1); + filter_u8 = vset_lane_u8(filter_ptr[2], filter_u8, 2); + filter_u8 = vset_lane_u8(filter_ptr[3], filter_u8, 3); + const int16x4_t filter_s16 = + vreinterpret_s16_u16(vget_low_u16(vmovl_u8(filter_u8))); + const int16x4_t filter = vadd_s16(filter_s16, vdup_n_s16(filter_offset)); + + int outp = 0; + // Handle 4 output pixels at a time. + for (; outp <= num_output_pixels - 4; outp += 4) { + // Load the accumulators from acc_buffer + int32x4_t acc[4]; + for (int i = 0; i < 4; i++) { + acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i); + } + // Load the inputs, add input_offset. + int16x8_t input[2]; + for (int i = 0; i < 2; i++) { + const uint8x8_t input_u8 = vld1_u8(input_ptr + 8 * i); + const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8)); + input[i] = vaddq_s16(input_s16, vdupq_n_s16(input_offset)); + } + input_ptr += 16; + // Multiply-accumulate + for (int i = 0; i < 2; i++) { + acc[2 * i + 0] = + vmlal_s16(acc[2 * i + 0], filter, vget_low_s16(input[i])); + acc[2 * i + 1] = + vmlal_s16(acc[2 * i + 1], filter, vget_high_s16(input[i])); + } + // Store the accumulators back to acc_buffer + for (int i = 0; i < 4; i++) { + vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 16; + } + // Handle one output pixel at a time. + for (; outp < num_output_pixels; outp++) { + // Load the accumulators from acc_buffer + int32x4_t acc; + acc = vld1q_s32(acc_buffer_ptr); + + // Load the inputs, add input_offset. + uint8x8_t input_u8 = vdup_n_u8(0); + input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0); + input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1); + input_u8 = vset_lane_u8(input_ptr[2], input_u8, 2); + input_u8 = vset_lane_u8(input_ptr[3], input_u8, 3); + input_ptr += 4; + const int16x4_t input_s16 = + vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8))); + const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset)); + // Multiply-accumulate + acc = vmlal_s16(acc, filter, input); + // Store the accumulators back to acc_buffer + vst1q_s32(acc_buffer_ptr, acc); + acc_buffer_ptr += 4; + } + } +}; + +template <> +struct QuantizedDepthwiseConvKernel<false, 4, 4> { + static void Run(int num_output_pixels, int input_depth, int depth_multiplier, + const uint8* input_ptr, int16 input_offset, + int input_ptr_increment, const uint8* filter_ptr, + int16 filter_offset, int32* acc_buffer_ptr) { + // Load the filters, add filter_offset. + int16x8_t filter[2]; + for (int i = 0; i < 2; i++) { + const uint8x8_t filter_u8 = vld1_u8(filter_ptr + 8 * i); + const int16x8_t filter_s16 = vreinterpretq_s16_u16(vmovl_u8(filter_u8)); + filter[i] = vaddq_s16(filter_s16, vdupq_n_s16(filter_offset)); + } + + int outp = 0; + // Handle 2 output pixels at a time. 
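+ // Two pixels with input_depth == 4 and depth_multiplier == 4 give 32
+ // accumulators; each of the 8 input lanes is broadcast against its 4
+ // filter taps with vmlal_lane_s16.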
+ for (; outp <= num_output_pixels - 2; outp += 2) { + // Load the accumulators from acc_buffer + int32x4_t acc[8]; + for (int i = 0; i < 8; i++) { + acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i); + } + + // Load the inputs, add input_offset. + uint8x8_t input_u8 = vld1_u8(input_ptr); + input_ptr += 8; + const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8)); + const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset)); + + // Multiply-accumulate + acc[0] = vmlal_lane_s16(acc[0], vget_low_s16(filter[0]), + vget_low_s16(input), 0); + acc[1] = vmlal_lane_s16(acc[1], vget_high_s16(filter[0]), + vget_low_s16(input), 1); + acc[2] = vmlal_lane_s16(acc[2], vget_low_s16(filter[1]), + vget_low_s16(input), 2); + acc[3] = vmlal_lane_s16(acc[3], vget_high_s16(filter[1]), + vget_low_s16(input), 3); + acc[4] = vmlal_lane_s16(acc[4], vget_low_s16(filter[0]), + vget_high_s16(input), 0); + acc[5] = vmlal_lane_s16(acc[5], vget_high_s16(filter[0]), + vget_high_s16(input), 1); + acc[6] = vmlal_lane_s16(acc[6], vget_low_s16(filter[1]), + vget_high_s16(input), 2); + acc[7] = vmlal_lane_s16(acc[7], vget_high_s16(filter[1]), + vget_high_s16(input), 3); + // Store the accumulators back to acc_buffer + for (int i = 0; i < 8; i++) { + vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 32; + } + // Handle one output pixel at a time. + for (; outp < num_output_pixels; outp++) { + // Load the accumulators from acc_buffer + int32x4_t acc[4]; + for (int i = 0; i < 4; i++) { + acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i); + } + + // Load the inputs, add input_offset. + uint8x8_t input_u8 = vdup_n_u8(0); + input_u8 = vset_lane_u8(input_ptr[0], input_u8, 0); + input_u8 = vset_lane_u8(input_ptr[1], input_u8, 1); + input_u8 = vset_lane_u8(input_ptr[2], input_u8, 2); + input_u8 = vset_lane_u8(input_ptr[3], input_u8, 3); + input_ptr += 4; + const int16x4_t input_s16 = + vreinterpret_s16_u16(vget_low_u16(vmovl_u8(input_u8))); + const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset)); + + // Multiply-accumulate + acc[0] = vmlal_lane_s16(acc[0], vget_low_s16(filter[0]), input, 0); + acc[1] = vmlal_lane_s16(acc[1], vget_high_s16(filter[0]), input, 1); + acc[2] = vmlal_lane_s16(acc[2], vget_low_s16(filter[1]), input, 2); + acc[3] = vmlal_lane_s16(acc[3], vget_high_s16(filter[1]), input, 3); + // Store the accumulators back to acc_buffer + for (int i = 0; i < 4; i++) { + vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 16; + } + } +}; + +template <> +struct QuantizedDepthwiseConvKernel<true, 0, 3> { + static void Run(int num_output_pixels, int input_depth, int depth_multiplier, + const uint8* input_ptr, int16 input_offset, + int input_ptr_increment, const uint8* filter_ptr, + int16 filter_offset, int32* acc_buffer_ptr) { + // We will have to duplicate bytes in a NEON register, 3-fold. + // We will do that by register-level table-look-up using VTBL instructions. + // Here we prepare the registers containing the table-lookup indices. + static const uint8 dup3_indices_array[3][8] = {{0, 0, 0, 1, 1, 1, 2, 2}, + {2, 3, 3, 3, 4, 4, 4, 5}, + {5, 5, 6, 6, 6, 7, 7, 7}}; + uint8x8_t dup3_indices[3]; + for (int i = 0; i < 3; i++) { + dup3_indices[i] = vld1_u8(dup3_indices_array[i]); + } + + // Handle one output pixel at a time. + for (int outp = 0; outp < num_output_pixels; outp++) { + const uint8* local_filter_ptr = filter_ptr; + const uint8* local_input_ptr = input_ptr; + int ic = 0; + // Handle 8 input channels at a time. 
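+ // For example, input bytes [a b c d e f g h] become, after the three VTBL
+ // lookups below, [a a a b b b c c], [c d d d e e e f] and
+ // [f f g g g h h h], i.e. every byte repeated 3 times across 24 lanes.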
+ for (; ic <= input_depth - 8; ic += 8) { + // Load the filters, add filter_offset. + int16x8_t filter[3]; + uint8x8x3_t filter_u8; + filter_u8.val[0] = vld1_u8(local_filter_ptr); + filter_u8.val[1] = vld1_u8(local_filter_ptr + 8); + filter_u8.val[2] = vld1_u8(local_filter_ptr + 16); + local_filter_ptr += 24; + for (int i = 0; i < 3; i++) { + const int16x8_t filter_s16 = + vreinterpretq_s16_u16(vmovl_u8(filter_u8.val[i])); + filter[i] = vaddq_s16(filter_s16, vdupq_n_s16(filter_offset)); + } + // Load the inputs, duplicate 3-fold, add input_offset. + const uint8x8_t input_u8 = vld1_u8(local_input_ptr); + local_input_ptr += 8; + + uint8x8_t input_u8_dup3[3]; + for (int i = 0; i < 3; i++) { + input_u8_dup3[i] = vtbl1_u8(input_u8, dup3_indices[i]); + } + int16x8_t input_dup3[3]; + for (int i = 0; i < 3; i++) { + const int16x8_t input_s16_dup3 = + vreinterpretq_s16_u16(vmovl_u8(input_u8_dup3[i])); + input_dup3[i] = vaddq_s16(input_s16_dup3, vdupq_n_s16(input_offset)); + } + // Load the accumulators from acc_buffer + int32x4x3_t acc[2]; + for (int i = 0; i < 2; i++) { + acc[i].val[0] = vld1q_s32(acc_buffer_ptr + 4 * i); + acc[i].val[1] = vld1q_s32(acc_buffer_ptr + 4 * i + 8); + acc[i].val[2] = vld1q_s32(acc_buffer_ptr + 4 * i + 16); + } + // Multiply-accumulate + for (int j = 0; j < 3; j++) { + acc[0].val[j] = vmlal_s16(acc[0].val[j], vget_low_s16(input_dup3[j]), + vget_low_s16(filter[j])); + acc[1].val[j] = vmlal_s16(acc[1].val[j], vget_high_s16(input_dup3[j]), + vget_high_s16(filter[j])); + } + // Store the accumulators back to acc_buffer + for (int i = 0; i < 2; i++) { + vst1q_s32(acc_buffer_ptr + 4 * i, acc[i].val[0]); + vst1q_s32(acc_buffer_ptr + 4 * i + 8, acc[i].val[1]); + vst1q_s32(acc_buffer_ptr + 4 * i + 16, acc[i].val[2]); + } + acc_buffer_ptr += 24; + } + // Handle one input channel at a time. + for (; ic < input_depth; ic++) { + const int16 input_val = *local_input_ptr++ + input_offset; + for (int i = 0; i < 3; i++) { + const int16 filter_val = local_filter_ptr[i] + filter_offset; + *acc_buffer_ptr++ += static_cast<int32>(filter_val) * input_val; + } + local_filter_ptr += 3; + } + input_ptr += input_ptr_increment; + } + } +}; + +template <> +struct QuantizedDepthwiseConvKernel<true, 0, 2> { + static void Run(int num_output_pixels, int input_depth, int depth_multiplier, + const uint8* input_ptr, int16 input_offset, + int input_ptr_increment, const uint8* filter_ptr, + int16 filter_offset, int32* acc_buffer_ptr) { + // Handle one output pixel at a time. + for (int outp = 0; outp < num_output_pixels; outp++) { + const uint8* local_filter_ptr = filter_ptr; + const uint8* local_input_ptr = input_ptr; + int ic = 0; + // Handle 8 input channels at a time. + for (; ic <= input_depth - 8; ic += 8) { + // Load the filters, add filter_offset. + int16x8_t filter[2]; + uint8x8x2_t filter_u8; + filter_u8.val[0] = vld1_u8(local_filter_ptr); + filter_u8.val[1] = vld1_u8(local_filter_ptr + 8); + local_filter_ptr += 16; + for (int i = 0; i < 2; i++) { + const int16x8_t filter_s16 = + vreinterpretq_s16_u16(vmovl_u8(filter_u8.val[i])); + filter[i] = vaddq_s16(filter_s16, vdupq_n_s16(filter_offset)); + } + // Load the inputs, add input_offset, duplicate 2-fold. + const uint8x8_t input_u8 = vld1_u8(local_input_ptr); + local_input_ptr += 8; + const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8)); + const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset)); + const int16x8x2_t input_dup2 = vzipq_s16(input, input); + // Load the accumulators from acc_buffer. 
+ int32x4x2_t acc[2]; + for (int i = 0; i < 2; i++) { + acc[i].val[0] = vld1q_s32(acc_buffer_ptr + 4 * i); + acc[i].val[1] = vld1q_s32(acc_buffer_ptr + 4 * i + 8); + } + // Multiply-accumulate. + for (int j = 0; j < 2; j++) { + acc[0].val[j] = vmlal_s16(acc[0].val[j], vget_low_s16(filter[j]), + vget_low_s16(input_dup2.val[j])); + acc[1].val[j] = vmlal_s16(acc[1].val[j], vget_high_s16(filter[j]), + vget_high_s16(input_dup2.val[j])); + } + // Store the accumulators back to acc_buffer. + for (int i = 0; i < 2; i++) { + vst1q_s32(acc_buffer_ptr + 4 * i, acc[i].val[0]); + vst1q_s32(acc_buffer_ptr + 4 * i + 8, acc[i].val[1]); + } + acc_buffer_ptr += 16; + } + // Handle one input channel at a time. + for (; ic < input_depth; ic++) { + // Load the inputs. + const int16 input_val = *local_input_ptr++ + input_offset; + for (int i = 0; i < 2; i++) { + const int16 filter_val = local_filter_ptr[i] + filter_offset; + *acc_buffer_ptr++ += static_cast<int32>(filter_val) * input_val; + } + local_filter_ptr += 2; + } + input_ptr += input_ptr_increment; + } + } +}; + +template <> +struct QuantizedDepthwiseConvKernel<true, 0, 1> { + static void Run(int num_output_pixels, int input_depth, int depth_multiplier, + const uint8* input_ptr, int16 input_offset, + int input_ptr_increment, const uint8* filter_ptr, + int16 filter_offset, int32* acc_buffer_ptr) { + // Handle one output pixel at a time. + for (int outp = 0; outp < num_output_pixels; outp++) { + const uint8* local_filter_ptr = filter_ptr; + const uint8* local_input_ptr = input_ptr; + int ic = 0; + // Handle 16 input channels at a time. + for (; ic <= input_depth - 16; ic += 16) { + // Load the filters, add filter_offset. + uint8x8_t filter_u8[2]; + for (int i = 0; i < 2; i++) { + filter_u8[i] = vld1_u8(local_filter_ptr + 8 * i); + } + local_filter_ptr += 16; + int16x8_t filter[2]; + for (int i = 0; i < 2; i++) { + filter[i] = vreinterpretq_s16_u16(vmovl_u8(filter_u8[i])); + } + for (int i = 0; i < 2; i++) { + filter[i] = vaddq_s16(filter[i], vdupq_n_s16(filter_offset)); + } + // Load the inputs, add input_offset. + uint8x8_t input_u8[2]; + for (int i = 0; i < 2; i++) { + input_u8[i] = vld1_u8(local_input_ptr + 8 * i); + } + local_input_ptr += 16; + int16x8_t input[2]; + for (int i = 0; i < 2; i++) { + input[i] = vreinterpretq_s16_u16(vmovl_u8(input_u8[i])); + } + for (int i = 0; i < 2; i++) { + input[i] = vaddq_s16(input[i], vdupq_n_s16(input_offset)); + } + // Load the accumulators from acc_buffer + int32x4_t acc[4]; + for (int i = 0; i < 4; i++) { + acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i); + } + // Multiply-accumulate + for (int i = 0; i < 2; i++) { + acc[2 * i + 0] = vmlal_s16(acc[2 * i + 0], vget_low_s16(input[i]), + vget_low_s16(filter[i])); + acc[2 * i + 1] = vmlal_s16(acc[2 * i + 1], vget_high_s16(input[i]), + vget_high_s16(filter[i])); + } + // Store the accumulators back to acc_buffer + for (int i = 0; i < 4; i++) { + vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 16; + } + // Handle 8 input channels at a time. + for (; ic <= input_depth - 8; ic += 8) { + // Load the filters, add filter_offset. + const uint8x8_t filter_u8 = vld1_u8(local_filter_ptr); + local_filter_ptr += 8; + const int16x8_t filter_s16 = vreinterpretq_s16_u16(vmovl_u8(filter_u8)); + const int16x8_t filter = + vaddq_s16(filter_s16, vdupq_n_s16(filter_offset)); + // Load the inputs, add input_offset. 
+ const uint8x8_t input_u8 = vld1_u8(local_input_ptr); + local_input_ptr += 8; + const int16x8_t input_s16 = vreinterpretq_s16_u16(vmovl_u8(input_u8)); + const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset)); + // Load the accumulators from acc_buffer + int32x4_t acc[2]; + for (int i = 0; i < 2; i++) { + acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i); + } + // Multiply-accumulate + acc[0] = vmlal_s16(acc[0], vget_low_s16(input), vget_low_s16(filter)); + acc[1] = vmlal_s16(acc[1], vget_high_s16(input), vget_high_s16(filter)); + // Store the accumulators back to acc_buffer + for (int i = 0; i < 2; i++) { + vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 8; + } + // Handle one input channel at a time. + for (; ic < input_depth; ic++) { + const int16 input_val = *local_input_ptr++ + input_offset; + const int16 filter_val = *local_filter_ptr++ + filter_offset; + *acc_buffer_ptr++ += static_cast<int32>(filter_val) * input_val; + } + input_ptr += input_ptr_increment; + } + } +}; + +template <> +struct QuantizedDepthwiseConvKernel<true, 16, 1> { + static void Run(int num_output_pixels, int input_depth, int depth_multiplier, + const uint8* input_ptr, int16 input_offset, + int input_ptr_increment, const uint8* filter_ptr, + int16 filter_offset, int32* acc_buffer_ptr) { + // Load the filters, add filter_offset. + uint8x8_t filter_u8[2]; + for (int i = 0; i < 2; i++) { + filter_u8[i] = vld1_u8(filter_ptr + 8 * i); + } + int16x8_t filter[2]; + for (int i = 0; i < 2; i++) { + filter[i] = vreinterpretq_s16_u16(vmovl_u8(filter_u8[i])); + } + for (int i = 0; i < 2; i++) { + filter[i] = vaddq_s16(filter[i], vdupq_n_s16(filter_offset)); + } + // Handle one output pixel at a time. + for (int outp = 0; outp < num_output_pixels; outp++) { + // Load the inputs, add input_offset. + uint8x8_t input_u8[2]; + for (int i = 0; i < 2; i++) { + input_u8[i] = vld1_u8(input_ptr + 8 * i); + } + input_ptr += input_ptr_increment; + int16x8_t input[2]; + for (int i = 0; i < 2; i++) { + input[i] = vreinterpretq_s16_u16(vmovl_u8(input_u8[i])); + } + for (int i = 0; i < 2; i++) { + input[i] = vaddq_s16(input[i], vdupq_n_s16(input_offset)); + } + // Load the accumulators from acc_buffer + int32x4_t acc[4]; + for (int i = 0; i < 4; i++) { + acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i); + } + // Multiply-accumulate + for (int i = 0; i < 2; i++) { + acc[2 * i + 0] = vmlal_s16(acc[2 * i + 0], vget_low_s16(input[i]), + vget_low_s16(filter[i])); + acc[2 * i + 1] = vmlal_s16(acc[2 * i + 1], vget_high_s16(input[i]), + vget_high_s16(filter[i])); + } + // Store the accumulators back to acc_buffer + for (int i = 0; i < 4; i++) { + vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 16; + } + } +}; + +template <> +struct QuantizedDepthwiseConvKernel<true, 1, 16> { + static void Run(int num_output_pixels, int input_depth, int depth_multiplier, + const uint8* input_ptr, int16 input_offset, + int input_ptr_increment, const uint8* filter_ptr, + int16 filter_offset, int32* acc_buffer_ptr) { + // Load the filters, add filter_offset. + uint8x8_t filter_u8[2]; + for (int i = 0; i < 2; i++) { + filter_u8[i] = vld1_u8(filter_ptr + 8 * i); + } + int16x8_t filter[2]; + for (int i = 0; i < 2; i++) { + filter[i] = vreinterpretq_s16_u16(vmovl_u8(filter_u8[i])); + } + for (int i = 0; i < 2; i++) { + filter[i] = vaddq_s16(filter[i], vdupq_n_s16(filter_offset)); + } + // Handle one output pixel at a time. 
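+ // Each pixel has a single input byte; it is offset-corrected and broadcast
+ // against the 16 preloaded filter values with vmlal_n_s16.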
+ for (int outp = 0; outp < num_output_pixels; outp++) { + uint8 input_u8 = *input_ptr; + input_ptr += input_ptr_increment; + int16 input = static_cast<int16>(input_u8 + input_offset); + // Load the accumulators from acc_buffer + int32x4_t acc[4]; + for (int i = 0; i < 4; i++) { + acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i); + } + // Multiply-accumulate + for (int i = 0; i < 2; i++) { + acc[2 * i + 0] = + vmlal_n_s16(acc[2 * i + 0], vget_low_s16(filter[i]), input); + acc[2 * i + 1] = + vmlal_n_s16(acc[2 * i + 1], vget_high_s16(filter[i]), input); + } + // Store the accumulators back to acc_buffer + for (int i = 0; i < 4; i++) { + vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 16; + } + } +}; + +template <> +struct QuantizedDepthwiseConvKernel<true, 1, 8> { + static void Run(int num_output_pixels, int input_depth, int depth_multiplier, + const uint8* input_ptr, int16 input_offset, + int input_ptr_increment, const uint8* filter_ptr, + int16 filter_offset, int32* acc_buffer_ptr) { + // Load the filters, add filter_offset. + const uint8x8_t filter_u8 = vld1_u8(filter_ptr); + const int16x8_t filter = vaddq_s16( + vreinterpretq_s16_u16(vmovl_u8(filter_u8)), vdupq_n_s16(filter_offset)); + // Handle one output pixel at a time. + for (int outp = 0; outp < num_output_pixels; outp++) { + uint8 input_u8 = *input_ptr; + input_ptr += input_ptr_increment; + int16 input = static_cast<int16>(input_u8 + input_offset); + // Load the accumulators from acc_buffer + int32x4_t acc[2]; + for (int i = 0; i < 2; i++) { + acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i); + } + // Multiply-accumulate + acc[0] = vmlal_n_s16(acc[0], vget_low_s16(filter), input); + acc[1] = vmlal_n_s16(acc[1], vget_high_s16(filter), input); + // Store the accumulators back to acc_buffer + for (int i = 0; i < 2; i++) { + vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]); + } + acc_buffer_ptr += 8; + } + } +}; +#endif + +// Accumulates the effect of one row of the filter, on a segment of one row +// of the output, accessing the corresponding one row of the input. +template <bool kAllowStrided, int kFixedInputDepth, int kFixedDepthMultiplier> +void QuantizedDepthwiseConvAccumRow( + int stride, int input_depth, int input_width, const uint8* input_data, + int16 input_offset, int pad_width, int depth_multiplier, int filter_width, + const uint8* filter_data, int16 filter_offset, int out_x_buffer_start, + int out_x_buffer_end, int output_depth, int32* acc_buffer) { +#ifdef GEMMLOWP_PROFILING + gemmlowp::ScopedProfilingLabel label(__PRETTY_FUNCTION__); +#endif + // Sanity check parameters. This is important in particular to ensure + // that we keep the number of template instantiations minimal, so we don't + // increase binary size unnecessarily. + static_assert(kFixedDepthMultiplier || !kFixedInputDepth, ""); + static_assert(kFixedInputDepth || kAllowStrided, ""); + DCHECK(stride == 1 || kAllowStrided); + if (kFixedInputDepth) { + DCHECK_EQ(input_depth, kFixedInputDepth); + } + if (kFixedDepthMultiplier) { + DCHECK_EQ(depth_multiplier, kFixedDepthMultiplier); + } + DCHECK_EQ(output_depth, input_depth * depth_multiplier); + const int input_ptr_increment = stride * input_depth; + const uint8* filter_base_ptr = filter_data; + for (int filter_x = 0; filter_x < filter_width; ++filter_x) { + // For the current (filter_x, filter_y) point in the filter, + // compute the boundaries of the corresponding output row segment. 
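+    // The formulas below follow from the mapping
+    //   in_x = out_x * stride - pad_width + filter_x;
+    // requiring 0 <= in_x < input_width gives the half-open out_x range
+    //   [ceil((pad_width - filter_x) / stride),
+    //    ceil((pad_width + input_width - filter_x) / stride)),
+    // and the stride == 2 / stride == 4 branches are just that ceiling
+    // division spelled out. For example, pad_width = 1, filter_x = 0,
+    // stride = 2, input_width = 5 yields the range [1, 3): out_x = 1 reads
+    // in_x = 1 and out_x = 2 reads in_x = 3, both in bounds.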
+ int out_x_loop_start_unclampled = 0; + int out_x_loop_end_unclampled = 0; + if (kAllowStrided) { + if (stride == 2) { + out_x_loop_start_unclampled = (pad_width - filter_x + 1) / 2; + out_x_loop_end_unclampled = + (pad_width + input_width - filter_x + 1) / 2; + } else if (stride == 4) { + out_x_loop_start_unclampled = (pad_width - filter_x + 3) / 4; + out_x_loop_end_unclampled = + (pad_width + input_width - filter_x + 3) / 4; + } else { + out_x_loop_start_unclampled = + (pad_width - filter_x + stride - 1) / stride; + out_x_loop_end_unclampled = + (pad_width + input_width - filter_x + stride - 1) / stride; + } + } else { + out_x_loop_start_unclampled = pad_width - filter_x; + out_x_loop_end_unclampled = pad_width + input_width - filter_x; + } + // The kernel will have to iterate on the segment of the + // output row that starts at out_x_loop_start and out_x_loop_end. + const int out_x_loop_start = + std::max(out_x_buffer_start, out_x_loop_start_unclampled); + const int out_x_loop_end = + std::min(out_x_buffer_end, out_x_loop_end_unclampled); + + int32* acc_buffer_ptr = + acc_buffer + (out_x_loop_start - out_x_buffer_start) * output_depth; + const int in_x_origin = (out_x_loop_start * stride) - pad_width + filter_x; + const uint8* input_ptr = input_data + in_x_origin * input_depth; + const int num_output_pixels = out_x_loop_end - out_x_loop_start; + QuantizedDepthwiseConvKernel< + kAllowStrided, kFixedInputDepth, + kFixedDepthMultiplier>::Run(num_output_pixels, input_depth, + depth_multiplier, input_ptr, input_offset, + input_ptr_increment, filter_base_ptr, + filter_offset, acc_buffer_ptr); + filter_base_ptr += output_depth; + } +} + +// generic fallback of DepthwiseConvAccumRow, portable, non-templatized. +inline void QuantizedDepthwiseConvAccumRowGeneric( + int stride, int input_depth, int input_width, const uint8* input_data, + int16 input_offset, int pad_width, int depth_multiplier, int filter_width, + const uint8* filter_data, int16 filter_offset, int out_x_buffer_start, + int out_x_buffer_end, int output_depth, int32* acc_buffer) { + gemmlowp::ScopedProfilingLabel label("DepthwiseConvAccumRowGeneric (slow)"); + const uint8* filter_base_ptr = filter_data; + for (int filter_x = 0; filter_x < filter_width; ++filter_x) { + const int out_x_loop_start = std::max( + out_x_buffer_start, (pad_width - filter_x + stride - 1) / stride); + const int out_x_loop_end = + std::min(out_x_buffer_end, + (pad_width + input_width - filter_x + stride - 1) / stride); + + int32* acc_buffer_ptr = + acc_buffer + (out_x_loop_start - out_x_buffer_start) * output_depth; + const int in_x_origin = (out_x_loop_start * stride) - pad_width + filter_x; + const uint8* input_ptr = input_data + in_x_origin * input_depth; + const int input_ptr_increment = (stride - 1) * input_depth; + for (int out_x = out_x_loop_start; out_x < out_x_loop_end; out_x++) { + const uint8* filter_ptr = filter_base_ptr; + for (int ic = 0; ic < input_depth; ++ic) { + const int16 input_val = *input_ptr++ + input_offset; + for (int m = 0; m < depth_multiplier; m++) { + const int16 filter_val = *filter_ptr++ + filter_offset; + *acc_buffer_ptr++ += static_cast<int32>(filter_val) * input_val; + } + } + input_ptr += input_ptr_increment; + } + filter_base_ptr += output_depth; + } +} + +// Initializes the accumulator buffer with bias values. 
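+// Note on layout: acc_buffer holds num_output_pixels * output_depth int32
+// accumulators in pixel-major order, and each pixel's depth slice starts as a
+// copy of bias_data. For example, output_depth == 2 with bias {b0, b1} yields
+// b0, b1, b0, b1, ... The NEON branches below merely replicate that pattern
+// with 128-bit stores for the common small depths; the trailing memcpy loop
+// covers any other depth.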
+inline void DepthwiseConvInitAccBuffer(int num_output_pixels, int output_depth, + const int32* bias_data, + int32* acc_buffer) { + int i = 0; +#ifdef USE_NEON + if (output_depth == 1) { + const int32x4_t b = vdupq_n_s32(bias_data[0]); + for (; i <= num_output_pixels - 16; i += 16) { + vst1q_s32(acc_buffer + i + 0, b); + vst1q_s32(acc_buffer + i + 4, b); + vst1q_s32(acc_buffer + i + 8, b); + vst1q_s32(acc_buffer + i + 12, b); + } + for (; i <= num_output_pixels - 4; i += 4) { + vst1q_s32(acc_buffer + i, b); + } + } else if (output_depth == 2) { + int32x4_t b = vdupq_n_s32(bias_data[0]); + b = vsetq_lane_s32(bias_data[1], b, 1); + b = vsetq_lane_s32(bias_data[1], b, 3); + for (; i <= num_output_pixels - 8; i += 8) { + vst1q_s32(acc_buffer + 2 * i + 0, b); + vst1q_s32(acc_buffer + 2 * i + 4, b); + vst1q_s32(acc_buffer + 2 * i + 8, b); + vst1q_s32(acc_buffer + 2 * i + 12, b); + } + for (; i <= num_output_pixels - 2; i += 2) { + vst1q_s32(acc_buffer + 2 * i, b); + } + } else if (output_depth == 4) { + const int32x4_t b = vld1q_s32(bias_data); + for (; i <= num_output_pixels - 4; i += 4) { + vst1q_s32(acc_buffer + 4 * i + 0, b); + vst1q_s32(acc_buffer + 4 * i + 4, b); + vst1q_s32(acc_buffer + 4 * i + 8, b); + vst1q_s32(acc_buffer + 4 * i + 12, b); + } + for (; i < num_output_pixels; i++) { + vst1q_s32(acc_buffer + 4 * i, b); + } + } else if (output_depth == 8) { + const int32x4_t b0 = vld1q_s32(bias_data); + const int32x4_t b1 = vld1q_s32(bias_data + 4); + for (; i <= num_output_pixels - 2; i += 2) { + vst1q_s32(acc_buffer + 8 * i + 0, b0); + vst1q_s32(acc_buffer + 8 * i + 4, b1); + vst1q_s32(acc_buffer + 8 * i + 8, b0); + vst1q_s32(acc_buffer + 8 * i + 12, b1); + } + for (; i < num_output_pixels; i++) { + vst1q_s32(acc_buffer + 8 * i + 0, b0); + vst1q_s32(acc_buffer + 8 * i + 4, b1); + } + } else if (output_depth == 16) { + const int32x4_t b0 = vld1q_s32(bias_data); + const int32x4_t b1 = vld1q_s32(bias_data + 4); + const int32x4_t b2 = vld1q_s32(bias_data + 8); + const int32x4_t b3 = vld1q_s32(bias_data + 12); + for (; i < num_output_pixels; i++) { + vst1q_s32(acc_buffer + 16 * i + 0, b0); + vst1q_s32(acc_buffer + 16 * i + 4, b1); + vst1q_s32(acc_buffer + 16 * i + 8, b2); + vst1q_s32(acc_buffer + 16 * i + 12, b3); + } + } +#endif + for (; i < num_output_pixels; i++) { + memcpy(acc_buffer + i * output_depth, bias_data, + sizeof(acc_buffer[0]) * output_depth); + } +} + +template <FusedActivationFunctionType Ac> +void DepthwiseConv(const uint8* input_data, const Dims<4>& input_dims, + int32 input_offset, const uint8* filter_data, + const Dims<4>& filter_dims, int32 filter_offset, + const int32* bias_data, const Dims<4>& bias_dims, + int stride_width, int stride_height, + int pad_width, int pad_height, int depth_multiplier, + int32 output_offset, int32 output_multiplier, + int output_shift, int32 output_activation_min, + int32 output_activation_max, uint8* output_data, + const Dims<4>& output_dims) { + gemmlowp::ScopedProfilingLabel label("DepthwiseConv/8bit"); + static_assert(Ac == FusedActivationFunctionType::kNone || + Ac == FusedActivationFunctionType::kRelu || + Ac == FusedActivationFunctionType::kRelu6 || + Ac == FusedActivationFunctionType::kRelu1, + ""); + DCHECK_LE(output_activation_min, output_activation_max); + if (Ac == FusedActivationFunctionType::kNone) { + DCHECK_EQ(output_activation_min, 0); + DCHECK_EQ(output_activation_max, 255); + } + const int batches = MatchingArraySize(input_dims, 3, output_dims, 3); + const int output_depth = MatchingArraySize(filter_dims, 0, output_dims, 
0); + const int input_height = ArraySize(input_dims, 2); + const int input_width = ArraySize(input_dims, 1); + const int input_depth = ArraySize(input_dims, 0); + const int filter_height = ArraySize(filter_dims, 2); + const int filter_width = ArraySize(filter_dims, 1); + const int output_height = ArraySize(output_dims, 2); + const int output_width = ArraySize(output_dims, 1); + DCHECK(output_depth == input_depth * depth_multiplier); + + static const int kAccBufferMaxSize = 1024; + int32 acc_buffer[kAccBufferMaxSize]; + DCHECK_GE(kAccBufferMaxSize, output_depth); + const int kOutputPixelsInAccBuffer = kAccBufferMaxSize / output_depth; + const int kAccBufferActualSize = kOutputPixelsInAccBuffer * output_depth; + DCHECK_LE(kOutputPixelsInAccBuffer * output_depth, kAccBufferActualSize); + DCHECK_LE(kAccBufferActualSize, kAccBufferMaxSize); + DCHECK_GE(kOutputPixelsInAccBuffer, 1); + + // row_accum_func will point to the core accumulation function to be used + // for this DepthwiseConv op. + auto* row_accum_func = QuantizedDepthwiseConvAccumRowGeneric; + + const int kMaxFixedDepthMultiplier = 16; + int fixed_depth_multiplier = 0; + if (depth_multiplier <= kMaxFixedDepthMultiplier) { + fixed_depth_multiplier = depth_multiplier; + } + // kMaxUnrolling is the max number of output values that we aim to handle + // in one unrolled iteration of the inner loop. For practical performance + // reasons, it is limited by the number of available registers. We could + // fine-tune it depending on the architecture, but that's not worth doing + // since this whole code is not very optimized to begin with. The + // present value reflects what's realistic on ARM 32bit NEON with 16 128-bit + // vector registers. + const int kMaxUnrolling = 16; + int fixed_input_depth = 0; + if (fixed_depth_multiplier && + input_depth * fixed_depth_multiplier <= kMaxUnrolling) { + fixed_input_depth = input_depth; + } +#define TFMINI_USE_DEPTHWISECONV_KERNEL(ALLOW_STRIDED, FIXED_INPUT_DEPTH, \ + FIXED_DEPTH_MULTIPLIER) \ + if ((stride_width == 1 || ALLOW_STRIDED) && \ + fixed_input_depth == FIXED_INPUT_DEPTH && \ + fixed_depth_multiplier == FIXED_DEPTH_MULTIPLIER) { \ + row_accum_func = \ + QuantizedDepthwiseConvAccumRow<ALLOW_STRIDED, FIXED_INPUT_DEPTH, \ + FIXED_DEPTH_MULTIPLIER>; \ + } + +#ifdef USE_NEON + TFMINI_USE_DEPTHWISECONV_KERNEL(true, 0, 1) + TFMINI_USE_DEPTHWISECONV_KERNEL(true, 0, 2) + TFMINI_USE_DEPTHWISECONV_KERNEL(true, 0, 3) + TFMINI_USE_DEPTHWISECONV_KERNEL(false, 1, 2) + TFMINI_USE_DEPTHWISECONV_KERNEL(false, 2, 2) + TFMINI_USE_DEPTHWISECONV_KERNEL(false, 4, 2) + TFMINI_USE_DEPTHWISECONV_KERNEL(true, 8, 2) + TFMINI_USE_DEPTHWISECONV_KERNEL(false, 1, 4) + TFMINI_USE_DEPTHWISECONV_KERNEL(false, 4, 1) + TFMINI_USE_DEPTHWISECONV_KERNEL(false, 4, 4) + TFMINI_USE_DEPTHWISECONV_KERNEL(true, 16, 1) + TFMINI_USE_DEPTHWISECONV_KERNEL(true, 1, 16) + TFMINI_USE_DEPTHWISECONV_KERNEL(false, 8, 1) + TFMINI_USE_DEPTHWISECONV_KERNEL(false, 2, 8) + TFMINI_USE_DEPTHWISECONV_KERNEL(false, 2, 1) + TFMINI_USE_DEPTHWISECONV_KERNEL(true, 1, 8) +#endif // USE_NEON + +#undef TFMINI_USE_DEPTHWISECONV_KERNEL + + // Now that we have determined row_accum_func, we can start work. 
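+  // Sketch of the per-chunk flow, assuming the usual gemmlowp definitions of
+  // the fixed-point helpers: each chunk of kOutputPixelsInAccBuffer output
+  // pixels is accumulated as int32 in acc_buffer (e.g. output_depth == 64
+  // gives 1024 / 64 == 16 pixels per chunk) and then requantized to uint8 as
+  //   out = clamp(RoundingDivideByPOT(
+  //                   SaturatingRoundingDoublingHighMul(acc, output_multiplier),
+  //                   output_shift)
+  //                 + output_offset,
+  //               output_activation_min, output_activation_max);
+  // the vqrdmulhq_n_s32 / RoundingDivideByPOT NEON sequence and the scalar
+  // MultiplyByQuantizedMultiplierSmallerThanOne fallback below both compute
+  // this expression.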
+ uint8* output_ptr = output_data; + for (int b = 0; b < batches; ++b) { + for (int out_y = 0; out_y < output_height; ++out_y) { + const int in_y_origin = (out_y * stride_height) - pad_height; + const int filter_y_start = std::max(0, -in_y_origin); + const int filter_y_end = + std::min(filter_height, input_height - in_y_origin); + for (int out_x_buffer_start = 0; out_x_buffer_start < output_width; + out_x_buffer_start += kOutputPixelsInAccBuffer) { + const int out_x_buffer_end = std::min( + output_width, out_x_buffer_start + kOutputPixelsInAccBuffer); + // We call a 'pixel' a group of activation that share all but the + // 'depth'/'channel' coordinate. num_output_pixels is the number of + // output pixels that we will accumulate in this loop iteration. + const int num_output_pixels = out_x_buffer_end - out_x_buffer_start; + // Initialize our local accumulator with the bias values, so we don't + // have to add them later. + DepthwiseConvInitAccBuffer(num_output_pixels, output_depth, bias_data, + acc_buffer); + // Accumulation loop. Most of the time should be spent in here. + for (int filter_y = filter_y_start; filter_y < filter_y_end; + ++filter_y) { + const int in_y = in_y_origin + filter_y; + row_accum_func( + stride_width, input_depth, input_width, + input_data + in_y * input_dims.strides[2] + + b * input_dims.strides[3], + input_offset, pad_width, depth_multiplier, filter_width, + filter_data + filter_y * filter_dims.strides[2], filter_offset, + out_x_buffer_start, out_x_buffer_end, output_depth, acc_buffer); + } + // Finished accumulating int32 values. Now need to convert them to + // the final 8bit form and store them. + gemmlowp::ScopedProfilingLabel label("downquantize+store"); + const int num_output_values = output_depth * num_output_pixels; + int i = 0; +#ifdef USE_NEON + using gemmlowp::RoundingDivideByPOT; + const int32x4_t output_offset_vec = vdupq_n_s32(output_offset); + const int32x4_t output_activation_min_vec = + vdupq_n_s32(output_activation_min); + const int32x4_t output_activation_max_vec = + vdupq_n_s32(output_activation_max); + // Handle 16 values at once. + // This allows us to issue 4 mutually independent int32 + // multiplications (vqrdmulh), which should alleviate most of their + // high latency. + for (; i <= num_output_values - 16; i += 16) { + int32x4_t acc[4]; + for (int j = 0; j < 4; j++) { + acc[j] = vld1q_s32(acc_buffer + i + 4 * j); + } + + // Fixed-point multiplication. + for (int j = 0; j < 4; j++) { + acc[j] = vqrdmulhq_n_s32(acc[j], output_multiplier); + } + for (int j = 0; j < 4; j++) { + acc[j] = RoundingDivideByPOT(acc[j], output_shift); + } + // Add the output offset. + for (int j = 0; j < 4; j++) { + acc[j] = vaddq_s32(acc[j], output_offset_vec); + } + // Apply the activation function. + if (Ac != FusedActivationFunctionType::kNone) { + for (int j = 0; j < 4; j++) { + acc[j] = vmaxq_s32(acc[j], output_activation_min_vec); + } + for (int j = 0; j < 4; j++) { + acc[j] = vminq_s32(acc[j], output_activation_max_vec); + } + } + // Saturating cast to uint8 and store to destination. + int16x4_t acc_s16[4]; + for (int j = 0; j < 4; j++) { + acc_s16[j] = vqmovn_s32(acc[j]); + } + const int16x8_t res_s16_0 = vcombine_s16(acc_s16[0], acc_s16[1]); + const int16x8_t res_s16_1 = vcombine_s16(acc_s16[2], acc_s16[3]); + const uint8x8_t res_u8_0 = vqmovun_s16(res_s16_0); + const uint8x8_t res_u8_1 = vqmovun_s16(res_s16_1); + vst1q_u8(output_ptr, vcombine_u8(res_u8_0, res_u8_1)); + output_ptr += 16; + } + // Handle 8 values at once. 
+ // Not as good as 16 (now we're only issuing 2 mutually independent + // vqrdmulh instructions, so we're probably paying for their high + // latency). + for (; i <= num_output_values - 8; i += 8) { + int32x4_t acc0 = vld1q_s32(acc_buffer + i); + int32x4_t acc1 = vld1q_s32(acc_buffer + i + 4); + // Fixed-point multiplication. + acc0 = vqrdmulhq_n_s32(acc0, output_multiplier); + acc1 = vqrdmulhq_n_s32(acc1, output_multiplier); + // Rounding right shift. + acc0 = RoundingDivideByPOT(acc0, output_shift); + acc1 = RoundingDivideByPOT(acc1, output_shift); + // Add the output offset. + acc0 = vaddq_s32(acc0, output_offset_vec); + acc1 = vaddq_s32(acc1, output_offset_vec); + // Apply the activation function. + if (Ac != FusedActivationFunctionType::kNone) { + acc0 = vmaxq_s32(acc0, output_activation_min_vec); + acc1 = vmaxq_s32(acc1, output_activation_min_vec); + acc0 = vminq_s32(acc0, output_activation_max_vec); + acc1 = vminq_s32(acc1, output_activation_max_vec); + } + // Saturating cast to uint8 and store to destination. + const int16x4_t acc0_s16 = vqmovn_s32(acc0); + const int16x4_t acc1_s16 = vqmovn_s32(acc1); + const int16x8_t res_s16 = vcombine_s16(acc0_s16, acc1_s16); + const uint8x8_t res_u8 = vqmovun_s16(res_s16); + vst1_u8(output_ptr, res_u8); + output_ptr += 8; + } + // Handle 4 values at once. Now we're paying the full price of the + // high latency of vqrdmulh. Also, storing only 4 bytes at the end + // (without any alignment) can only be done 1 byte at a time. + // Yet, that is still worth doing to minimize the amount of leftover + // that will have to go through the very slow scalar code. + for (; i <= num_output_values - 4; i += 4) { + int32x4_t acc = vld1q_s32(acc_buffer + i); + // Fixed-point multiplication. + acc = vqrdmulhq_n_s32(acc, output_multiplier); + // Rounding right shift. + acc = RoundingDivideByPOT(acc, output_shift); + // Add the output offset. + acc = vaddq_s32(acc, output_offset_vec); + // Apply the activation function. + if (Ac != FusedActivationFunctionType::kNone) { + acc = vmaxq_s32(acc, output_activation_min_vec); + acc = vminq_s32(acc, output_activation_max_vec); + } + // Saturating cast to uint8 and store to destination. + const int16x4_t acc_s16 = vqmovn_s32(acc); + const int16x8_t res_s16 = vcombine_s16(acc_s16, acc_s16); + const uint8x8_t res_u8 = vqmovun_s16(res_s16); + vst1_lane_u8(output_ptr + 0, res_u8, 0); + vst1_lane_u8(output_ptr + 1, res_u8, 1); + vst1_lane_u8(output_ptr + 2, res_u8, 2); + vst1_lane_u8(output_ptr + 3, res_u8, 3); + output_ptr += 4; + } +#endif // USE_NEON + + // Handle leftover values, one by one. This is very slow. + for (; i < num_output_values; i++) { + int32 acc = acc_buffer[i]; + acc = MultiplyByQuantizedMultiplierSmallerThanOne( + acc, output_multiplier, output_shift); + acc += output_offset; + acc = std::max(acc, output_activation_min); + acc = std::min(acc, output_activation_max); + *output_ptr++ = static_cast<uint8>(acc); + } + } + } + } +} + +} // namespace optimized_ops +} // namespace rt +} // namespace nnfw + +#endif // __NNFW_RT_OPTIMIZED_OPS_DEPTHWISECONV_UINT8_H__ diff --git a/runtimes/nn/common/operations/internal/optimized/neon_tensor_utils.cc b/runtimes/nn/common/operations/internal/optimized/neon_tensor_utils.cc new file mode 100644 index 000000000..7af122517 --- /dev/null +++ b/runtimes/nn/common/operations/internal/optimized/neon_tensor_utils.cc @@ -0,0 +1,217 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. 
All Rights Reserved + * Copyright 2017 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include <string.h> + +#include "ActivationFunctor.h" +#include "tensor_utils_impl.h" + +#ifdef USE_NEON + +#include <arm_neon.h> +#define kFloatWeightsPerNeonLane 4 + +namespace nnfw { +namespace rt { +namespace tensor_utils { + +void NeonMatrixBatchVectorMultiplyAccumulate(const float* matrix, int m_rows, + int m_cols, const float* vector, + int n_batch, float* result, + int result_stride) { + // If v_size is not divisible by kWeightsPerNeonLane, we cannot use the main + // vectorized loop, and we need to process sequentially. postamble_start shows + // the start index where this should happen. + const int postamble_start = + m_cols - (m_cols & (kFloatWeightsPerNeonLane - 1)); + + // The arrays used to cache the vector. + float32x4_t* vector_cache_float32x4 = + new float32x4_t[(m_cols / kFloatWeightsPerNeonLane) * + sizeof(float32x4_t)]; + + for (int b = 0; b < n_batch; b++) { + float* result_in_batch = result + b * m_rows; + const float* vector_in_batch = vector + b * m_cols; + const float* matrix_ptr = matrix; + for (int c = 0; c < postamble_start; c += kFloatWeightsPerNeonLane) { + vector_cache_float32x4[c >> 2] = vld1q_f32(vector_in_batch + c); + } + for (int r = 0; r < m_rows; r++) { + float32x4_t acc_32x4 = vmovq_n_f32(0.0); + for (int c = 0; c < postamble_start; c += kFloatWeightsPerNeonLane) { + float32x4_t temp = vector_cache_float32x4[c >> 2]; + // Load 4 float values from vector1 and vector2 and accumulator. + float32x4_t v1_f32x4 = vld1q_f32(matrix_ptr + c); + // Vector multiply-accumulate 4 float + acc_32x4 = vmlaq_f32(acc_32x4, v1_f32x4, temp); + } + // Add the 4 intermediate sum values to get the final dot-prod value for + // this column. + *result_in_batch += + (vgetq_lane_f32(acc_32x4, 0) + vgetq_lane_f32(acc_32x4, 1) + + vgetq_lane_f32(acc_32x4, 2) + vgetq_lane_f32(acc_32x4, 3)); + for (int c = postamble_start; c < m_cols; c++) { + *result_in_batch += matrix_ptr[c] * vector_in_batch[c]; + } + matrix_ptr += m_cols; + result_in_batch += result_stride; + } + } + delete[] vector_cache_float32x4; +} + +void NeonVectorVectorCwiseProduct(const float* vector1, const float* vector2, + int v_size, float* result) { + // If v_size is not divisible by kWeightsPerNeonLane, we cannot use the main + // vectorized loop, and we need to process sequentially. postamble_start shows + // the start index where this should happen. + const int postamble_start = + v_size - (v_size & (kFloatWeightsPerNeonLane - 1)); + for (int v = 0; v < postamble_start; v += kFloatWeightsPerNeonLane) { + // Load 4 float values from vector1 and vector2. + float32x4_t v1_f32x4 = vld1q_f32(vector1 + v); + float32x4_t v2_f32x4 = vld1q_f32(vector2 + v); + // Vector multiply 4 float + float32x4_t mul_32x4 = vmulq_f32(v1_f32x4, v2_f32x4); + // Save to result array. 
+ vst1q_f32(&result[v], mul_32x4); + } + for (int v = postamble_start; v < v_size; v++) { + result[v] = vector1[v] * vector2[v]; + } +} + +void NeonVectorVectorCwiseProductAccumulate(const float* vector1, + const float* vector2, int v_size, + float* result) { + // If v_size is not divisible by kWeightsPerNeonLane, we cannot use the main + // vectorized loop, and we need to process sequentially. postamble_start shows + // the start index where this should happen. + const int postamble_start = + v_size - (v_size & (kFloatWeightsPerNeonLane - 1)); + for (int v = 0; v < postamble_start; v += kFloatWeightsPerNeonLane) { + // Load 4 float values from vector1 and vector2 and accumulator. + float32x4_t v1_f32x4 = vld1q_f32(vector1 + v); + float32x4_t v2_f32x4 = vld1q_f32(vector2 + v); + float32x4_t acc_32x4 = vld1q_f32(result + v); + // Vector multiply-accumulate 4 float + acc_32x4 = vmlaq_f32(acc_32x4, v1_f32x4, v2_f32x4); + // Save to result array. + vst1q_f32(&result[v], acc_32x4); + } + for (int v = postamble_start; v < v_size; v++) { + result[v] += vector1[v] * vector2[v]; + } +} + +void NeonVectorBatchVectorCwiseProductAccumulate(const float* vector, + int v_size, + const float* batch_vector, + int n_batch, float* result) { + // If v_size is not divisible by kWeightsPerNeonLane, we cannot use the main + // vectorized loop, and we need to process sequentially. postamble_start shows + // the start index where this should happen. + const int postamble_start = + v_size - (v_size & (kFloatWeightsPerNeonLane - 1)); + + // The arrays used to cache the vector. + float32x4_t* vector_cache_float32x4 = + new float32x4_t[(v_size / kFloatWeightsPerNeonLane) * + sizeof(float32x4_t)]; + for (int v = 0; v < postamble_start; v += kFloatWeightsPerNeonLane) { + vector_cache_float32x4[v >> 2] = vld1q_f32(vector + v); + } + + float* result_ptr = result; + const float* batch_vector_ptr = batch_vector; + for (int b = 0; b < n_batch; b++) { + for (int v = 0; v < postamble_start; v += kFloatWeightsPerNeonLane) { + // Load from memory to vectors. + float32x4_t result_f32x4 = vld1q_f32(result_ptr + v); + float32x4_t batch_vector_f32x4 = vld1q_f32(batch_vector_ptr + v); + // Multiply-accumulate. + result_f32x4 = vmlaq_f32(result_f32x4, batch_vector_f32x4, + vector_cache_float32x4[v >> 2]); + // Store. + vst1q_f32(result_ptr + v, result_f32x4); + } + // Postamble loop + for (int v = postamble_start; v < v_size; v++) { + result_ptr[v] += vector[v] * batch_vector_ptr[v]; + } + // Update the pointers. + result_ptr += v_size; + batch_vector_ptr += v_size; + } + delete[] vector_cache_float32x4; +} + +void NeonSub1Vector(const float* vector, int v_size, float* result) { + // If v_size is not divisible by kWeightsPerNeonLane, we cannot use the main + // vectorized loop, and we need to process sequentially. postamble_start shows + // the start index where this should happen. + const int postamble_start = + v_size - (v_size & (kFloatWeightsPerNeonLane - 1)); + + float32x4_t one_f32x4 = vmovq_n_f32(1.0); + for (int v = 0; v < postamble_start; v += kFloatWeightsPerNeonLane) { + // Load 4 float values from the current pointers of the input column and + // subtract from 1. + float32x4_t v_f32x4 = vld1q_f32(vector + v); + float32x4_t result_f32x4 = vsubq_f32(one_f32x4, v_f32x4); + // Save to output. 
+ vst1q_f32(result + v, result_f32x4); + } + for (int v = postamble_start; v < v_size; v++) { + result[v] = 1.0f - vector[v]; + } +} + +void NeonClipVector(const float* vector, int v_size, float abs_limit, + float* result) { + // If v_size is not divisible by kWeightsPerNeonLane, we cannot use the main + // vectorized loop, and we need to process sequentially. postamble_start shows + // the start index where this should happen. + const int postamble_start = + v_size - (v_size & (kFloatWeightsPerNeonLane - 1)); + + // Replicate abs_limit and -abs_limit in two vectors. + const float32x4_t abs_limit_f32x4 = vmovq_n_f32(abs_limit); + const float32x4_t neg_abs_limit_f32x4 = vmovq_n_f32(-abs_limit); + + for (int v = 0; v < postamble_start; v += kFloatWeightsPerNeonLane) { + // Load from memory to vector. + float32x4_t v_f32x4 = vld1q_f32(vector + v); + // Clip between abs_limit and -abs_limit. + float32x4_t result_f32x4 = vminq_f32(abs_limit_f32x4, v_f32x4); + result_f32x4 = vmaxq_f32(neg_abs_limit_f32x4, result_f32x4); + // Save to output. + vst1q_f32(result + v, result_f32x4); + } + // Postamble loop. + for (int v = postamble_start; v < v_size; v++) { + result[v] = (abs_limit < vector[v]) ? abs_limit : vector[v]; + result[v] = (-abs_limit > result[v]) ? -abs_limit : result[v]; + } +} + +} // namespace tensor_utils +} // namespace rt +} // namespace nnfw + +#endif // USE_NEON diff --git a/runtimes/nn/common/operations/internal/optimized/neon_tensor_utils.h b/runtimes/nn/common/operations/internal/optimized/neon_tensor_utils.h new file mode 100644 index 000000000..2a6f31572 --- /dev/null +++ b/runtimes/nn/common/operations/internal/optimized/neon_tensor_utils.h @@ -0,0 +1,119 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2017 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef __NNFW_RT_NEON_TENSOR_UTILS_H__ +#define __NNFW_RT_NEON_TENSOR_UTILS_H__ + +#include "ActivationFunctor.h" +#include "cpu_check.h" +#include "tensor_utils_impl.h" + +namespace nnfw { +namespace rt { +namespace tensor_utils { + +void MatrixBatchVectorMultiplyAccumulate(const float* matrix, int m_rows, + int m_cols, const float* vector, + int n_batch, float* result, + int result_stride) { + NEON_OR_PORTABLE(MatrixBatchVectorMultiplyAccumulate, matrix, m_rows, m_cols, + vector, n_batch, result, result_stride); +} + +void VectorVectorCwiseProduct(const float* vector1, const float* vector2, + int v_size, float* result) { + NEON_OR_PORTABLE(VectorVectorCwiseProduct, vector1, vector2, v_size, result); +} + +void VectorVectorCwiseProductAccumulate(const float* vector1, + const float* vector2, int v_size, + float* result) { + NEON_OR_PORTABLE(VectorVectorCwiseProductAccumulate, vector1, vector2, v_size, + result); +} + +void VectorBatchVectorCwiseProductAccumulate(const float* vector, int v_size, + const float* batch_vector, + int n_batch, float* result) { + NEON_OR_PORTABLE(VectorBatchVectorCwiseProductAccumulate, vector, v_size, + batch_vector, n_batch, result); +} + +float VectorVectorDotProduct(const float* vector1, const float* vector2, + int v_size) { + return PortableVectorVectorDotProduct(vector1, vector2, v_size); +} + +void BatchVectorBatchVectorDotProduct(const float* vector1, + const float* vector2, int v_size, + int n_batch, float* result, + int result_stride) { + PortableBatchVectorBatchVectorDotProduct(vector1, vector2, v_size, n_batch, + result, result_stride); +} + +void VectorBatchVectorAssign(const float* vector, int v_size, int n_batch, + float* batch_vector) { + PortableVectorBatchVectorAssign(vector, v_size, n_batch, batch_vector); +} + +void ApplySigmoidToVector(const float* vector, int v_size, float* result) { + PortableApplySigmoidToVector(vector, v_size, result); +} + +void ApplyActivationToVector(const float* vector, int v_size, + ActivationFn activation, float* result) { + PortableApplyActivationToVector(vector, v_size, activation, result); +} + +void CopyVector(const float* vector, int v_size, float* result) { + PortableCopyVector(vector, v_size, result); +} + +void Sub1Vector(const float* vector, int v_size, float* result) { + NEON_OR_PORTABLE(Sub1Vector, vector, v_size, result); +} + +void ZeroVector(float* vector, int v_size) { + PortableZeroVector(vector, v_size); +} + +float Clip(float f, float abs_limit) { return PortableClip(f, abs_limit); } + +void ClipVector(const float* vector, int v_size, float abs_limit, + float* result) { + NEON_OR_PORTABLE(ClipVector, vector, v_size, abs_limit, result); +} + +// TODO(ghodrat): Implement Neon version. +void VectorShiftLeft(float* vector, int v_size, float shift_value) { + PortableVectorShiftLeft(vector, v_size, shift_value); +} + +// TODO(ghodrat): Implement Neon version. 
+void ReductionSumVector(const float* input_vector, int input_stride, + float* output_vector, int output_size, + int reduction_size) { + PortableReductionSumVector(input_vector, input_stride, output_vector, + output_size, reduction_size); +} + +} // namespace tensor_utils +} // namespace rt +} // namespace nnfw + +#endif // __NNFW_RT_NEON_TENSOR_UTILS_H__ diff --git a/runtimes/nn/common/operations/internal/optimized/optimized_ops.h b/runtimes/nn/common/operations/internal/optimized/optimized_ops.h new file mode 100644 index 000000000..33862a0d7 --- /dev/null +++ b/runtimes/nn/common/operations/internal/optimized/optimized_ops.h @@ -0,0 +1,2717 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (C) 2017 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __NNFW_RT_OPTIMIZED_OPS_H__ +#define __NNFW_RT_OPTIMIZED_OPS_H__ + +#include <assert.h> +#include <stdint.h> +#include <sys/types.h> +#include <algorithm> +#include <cmath> +#include <limits> +#include <memory> +#include <tuple> +#include <type_traits> + +#include "Eigen/Core" +#include "fixedpoint.h" +#include "gemmlowp.h" +#include "../common.h" +#include "../types.h" + +namespace nnfw { +namespace rt { +namespace optimized_ops { + +// Make a local VectorMap typedef allowing to map a float array +// as a Eigen vector expression. The std::conditional here is to +// construct the suitable Eigen type for the constness of the +// data. Indeed, for const data, we need to produce +// Eigen::Map<const Eigen::Matrix<float, ...>> +// and not the more straightforward +// Eigen::Map<Eigen::Matrix<const float, ...>> +template <typename Scalar> +using VectorMap = typename std::conditional< + std::is_const<Scalar>::value, + Eigen::Map<const Eigen::Matrix<typename std::remove_const<Scalar>::type, + Eigen::Dynamic, 1>>, + Eigen::Map<Eigen::Matrix<Scalar, Eigen::Dynamic, 1>>>::type; + +template <typename Scalar, int N> +VectorMap<Scalar> MapAsVector(Scalar* data, const Dims<N>& dims) { + const int size = RequiredBufferSizeForDims(dims); + return VectorMap<Scalar>(data, size, 1); +} + +// Make a local VectorMap typedef allowing to map a float array +// as a Eigen matrix expression. The same explanation as for VectorMap +// above also applies here. 
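+// For instance, MapAsMatrixWithFirstDimAsRows below views a packed Dims<4>
+// buffer with sizes {depth, width, height, batches} as a column-major Eigen
+// matrix of shape depth x (width * height * batches); this is what allows
+// FullyConnected and Conv further down to be expressed as a single Gemm call.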
+template <typename Scalar> +using MatrixMap = typename std::conditional< + std::is_const<Scalar>::value, + Eigen::Map<const Eigen::Matrix<typename std::remove_const<Scalar>::type, + Eigen::Dynamic, Eigen::Dynamic>>, + Eigen::Map<Eigen::Matrix<Scalar, Eigen::Dynamic, Eigen::Dynamic>>>::type; + +template <typename Scalar, int N> +MatrixMap<Scalar> MapAsMatrixWithFirstDimAsRows(Scalar* data, + const Dims<N>& dims) { + const int rows = dims.sizes[0]; + int cols = 1; + for (int d = 1; d < N; d++) { + cols *= dims.sizes[d]; + } + return MatrixMap<Scalar>(data, rows, cols); +} + +template <typename Scalar, int N> +MatrixMap<Scalar> MapAsMatrixWithLastDimAsCols(Scalar* data, + const Dims<N>& dims) { + const int cols = dims.sizes[N - 1]; + int rows = 1; + for (int d = 0; d < N - 1; d++) { + rows *= dims.sizes[d]; + } + return MatrixMap<Scalar>(data, rows, cols); +} + +template <typename Scalar> +using ArrayMap = typename std::conditional< + std::is_const<Scalar>::value, + Eigen::Map<const Eigen::Array<typename std::remove_const<Scalar>::type, + Eigen::Dynamic, Eigen::Dynamic>>, + Eigen::Map<Eigen::Array<Scalar, Eigen::Dynamic, Eigen::Dynamic>>>::type; + +template <typename Scalar, int N> +ArrayMap<Scalar> MapAsArrayWithFirstDimAsRows(Scalar* data, + const Dims<N>& dims) { + const int rows = dims.sizes[0]; + int cols = 1; + for (int d = 1; d < N; d++) { + cols *= dims.sizes[d]; + } + return ArrayMap<Scalar>(data, rows, cols); +} + +// TODO(b/62193649): this function is only needed as long +// as we have the --variable_batch hack. +template <typename Scalar, int N> +MatrixMap<Scalar> MapAsMatrixWithGivenNumberOfRows(Scalar* data, + const Dims<N>& dims, + int rows) { + int cols = 1; + bool matched_rows = false; + for (int d = 0; d < N; d++) { + cols *= dims.sizes[d]; + if (cols == rows) { + matched_rows = true; + cols = 1; + } + } + DCHECK(matched_rows); + return MatrixMap<Scalar>(data, rows, cols); +} + +// DO NOT USE THIS STRUCT FOR NEW FUNCTIONALITY BEYOND IMPLEMENTING ELEMENT-WISE +// BROADCASTING. +// +// NdArrayDesc<N> describes the shape and memory layout of an N-dimensional +// rectangular array of numbers. +// +// NdArrayDesc<N> is basically identical to Dims<N> defined in types.h. +// However, as Dims<N> is to be deprecated, this class exists as an adaptor +// to enable simple unoptimized implementations of element-wise broadcasting +// operations. +template<int N> +struct NdArrayDesc { + // The "extent" of each dimension. Indices along dimension d must be in the + // half-open interval [0, extents[d]). + int extents[N]; + + // The number of *elements* (not bytes) between consecutive indices of each + // dimension. + int strides[N]; +}; + +// DO NOT USE THIS FUNCTION FOR NEW FUNCTIONALITY BEYOND IMPLEMENTING +// ELEMENT-WISE BROADCASTING. +// +// Same as Offset(), except takes as NdArrayDesc<N> instead of Dims<N>. +inline int SubscriptToIndex(const NdArrayDesc<4>& desc, int i0, int i1, int i2, + int i3) { + DCHECK(i0 >= 0 && i0 < desc.extents[0]); + DCHECK(i1 >= 0 && i1 < desc.extents[1]); + DCHECK(i2 >= 0 && i2 < desc.extents[2]); + DCHECK(i3 >= 0 && i3 < desc.extents[3]); + return i0 * desc.strides[0] + i1 * desc.strides[1] + i2 * desc.strides[2] + + i3 * desc.strides[3]; +} + +// Given the dimensions of the operands for an element-wise binary broadcast, +// adjusts them so that they can be directly iterated over with simple loops. +// Returns the adjusted dims as instances of NdArrayDesc in 'desc0_out' and +// 'desc1_out'. 'desc0_out' and 'desc1_out' cannot be nullptr. 
+// +// This function assumes that the two input shapes are compatible up to +// broadcasting and the shorter one has already been prepended with 1s to be the +// same length. E.g., if shape0 is (1, 16, 16, 64) and shape1 is (1, 64), +// shape1 must already have been prepended to be (1, 1, 1, 64). Recall that +// Dims<N> refer to shapes in reverse order. In this case, input0_dims will be +// (64, 16, 16, 1) and input1_dims will be (64, 1, 1, 1). +// +// When two shapes are compatible up to broadcasting, for each dimension d, +// the input extents are either equal, or one of them is 1. +// +// This function performs the following for each dimension d: +// - If the extents are equal, then do nothing since the loop that walks over +// both of the input arrays is correct. +// - Otherwise, one (and only one) of the extents must be 1. Say extent0 is 1 +// and extent1 is e1. Then set extent0 to e1 and stride0 *to 0*. This allows +// array0 to be referenced *at any index* in dimension d and still access the +// same slice. +template <int N> +inline void NdArrayDescsForElementwiseBroadcast(const Dims<N>& input0_dims, + const Dims<N>& input1_dims, + NdArrayDesc<N>* desc0_out, + NdArrayDesc<N>* desc1_out) { + DCHECK(desc0_out != nullptr); + DCHECK(desc1_out != nullptr); + + // Copy dims to desc. + for (int i = 0; i < N; ++i) { + desc0_out->extents[i] = input0_dims.sizes[i]; + desc0_out->strides[i] = input0_dims.strides[i]; + desc1_out->extents[i] = input1_dims.sizes[i]; + desc1_out->strides[i] = input1_dims.strides[i]; + } + + // Walk over each dimension. If the extents are equal do nothing. + // Otherwise, set the desc with extent 1 to have extent equal to the other and + // stride 0. + for (int i = 0; i < N; ++i) { + const int extent0 = ArraySize(input0_dims, i); + const int extent1 = ArraySize(input1_dims, i); + if (extent0 != extent1) { + if (extent0 == 1) { + desc0_out->strides[i] = 0; + desc0_out->extents[i] = extent1; + } else { + DCHECK_EQ(extent1, 1); + desc1_out->strides[i] = 0; + desc1_out->extents[i] = extent0; + } + } + } +} + +#ifdef USE_NEON +template <FusedActivationFunctionType Ac> +void AddBiasAndEvalActivationFunction(const float* bias_data, + const Dims<4>& bias_dims, + float* array_data, + const Dims<4>& array_dims) { + gemmlowp::ScopedProfilingLabel label("AddBiasAndEvalActivationFunction"); + const int bias_size = bias_dims.sizes[3] * bias_dims.strides[3]; + const int array_size = array_dims.sizes[3] * array_dims.strides[3]; + DCHECK_EQ((array_size % bias_size), 0); + float* array_ptr = array_data; + float* array_end_ptr = array_ptr + array_size; + const auto zero = vdupq_n_f32(0); + const auto six = vdupq_n_f32(6); + const auto neg_one = vdupq_n_f32(-1); + const auto one = vdupq_n_f32(1); + for (; array_ptr != array_end_ptr; array_ptr += bias_size) { + int i = 0; + for (; i <= bias_size - 16; i += 16) { + auto b0 = vld1q_f32(bias_data + i); + auto b1 = vld1q_f32(bias_data + i + 4); + auto b2 = vld1q_f32(bias_data + i + 8); + auto b3 = vld1q_f32(bias_data + i + 12); + auto a0 = vld1q_f32(array_ptr + i); + auto a1 = vld1q_f32(array_ptr + i + 4); + auto a2 = vld1q_f32(array_ptr + i + 8); + auto a3 = vld1q_f32(array_ptr + i + 12); + auto x0 = vaddq_f32(a0, b0); + auto x1 = vaddq_f32(a1, b1); + auto x2 = vaddq_f32(a2, b2); + auto x3 = vaddq_f32(a3, b3); + if (Ac == FusedActivationFunctionType::kRelu || + Ac == FusedActivationFunctionType::kRelu6) { + x0 = vmaxq_f32(zero, x0); + x1 = vmaxq_f32(zero, x1); + x2 = vmaxq_f32(zero, x2); + x3 = vmaxq_f32(zero, x3); + if (Ac == 
FusedActivationFunctionType::kRelu6) { + x0 = vminq_f32(six, x0); + x1 = vminq_f32(six, x1); + x2 = vminq_f32(six, x2); + x3 = vminq_f32(six, x3); + } + } else if (Ac == FusedActivationFunctionType::kRelu1) { + x0 = vmaxq_f32(neg_one, x0); + x1 = vmaxq_f32(neg_one, x1); + x2 = vmaxq_f32(neg_one, x2); + x3 = vmaxq_f32(neg_one, x3); + x0 = vminq_f32(one, x0); + x1 = vminq_f32(one, x1); + x2 = vminq_f32(one, x2); + x3 = vminq_f32(one, x3); + } + vst1q_f32(array_ptr + i, x0); + vst1q_f32(array_ptr + i + 4, x1); + vst1q_f32(array_ptr + i + 8, x2); + vst1q_f32(array_ptr + i + 12, x3); + } + for (; i <= bias_size - 4; i += 4) { + auto b = vld1q_f32(bias_data + i); + auto a = vld1q_f32(array_ptr + i); + auto x = vaddq_f32(a, b); + if (Ac == FusedActivationFunctionType::kRelu || + Ac == FusedActivationFunctionType::kRelu6) { + x = vmaxq_f32(zero, x); + if (Ac == FusedActivationFunctionType::kRelu6) { + x = vminq_f32(six, x); + } + } else if (Ac == FusedActivationFunctionType::kRelu1) { + x = vmaxq_f32(neg_one, x); + x = vminq_f32(one, x); + } + vst1q_f32(array_ptr + i, x); + } + for (; i < bias_size; i++) { + array_ptr[i] = ActivationFunction<Ac>(array_ptr[i] + bias_data[i]); + } + } +} +#else // not NEON +template <FusedActivationFunctionType Ac> +void AddBiasAndEvalActivationFunction(const float* bias_data, + const Dims<4>& bias_dims, + float* array_data, + const Dims<4>& array_dims) { + gemmlowp::ScopedProfilingLabel label("AddBiasAndEvalActivationFunction"); + const int bias_size = bias_dims.sizes[3] * bias_dims.strides[3]; + const int array_size = array_dims.sizes[3] * array_dims.strides[3]; + DCHECK_EQ((array_size % bias_size), 0); + for (int array_offset = 0; array_offset < array_size; + array_offset += bias_size) { + for (int i = 0; i < bias_size; i++) { + array_data[array_offset + i] = + ActivationFunction<Ac>(array_data[array_offset + i] + bias_data[i]); + } + } +} +#endif + +template <typename Lhs, typename Rhs, typename Result> +void Gemm(const Eigen::MatrixBase<Lhs>& lhs, const Eigen::MatrixBase<Rhs>& rhs, + Eigen::MatrixBase<Result>* result) { + if (rhs.cols() == 1) { + gemmlowp::ScopedProfilingLabel label("GEMV"); + result->col(0).noalias() = lhs * rhs.col(0); + } else { + gemmlowp::ScopedProfilingLabel label("GEMM"); + result->noalias() = lhs * rhs; + } +} + +template <FusedActivationFunctionType Ac> +void FullyConnected(const float* input_data, const Dims<4>& input_dims, + const float* weights_data, const Dims<4>& weights_dims, + const float* bias_data, const Dims<4>& bias_dims, + float* output_data, const Dims<4>& output_dims) { + gemmlowp::ScopedProfilingLabel label("FullyConnected"); + // TODO(b/62193649): this convoluted shape computation (determining + // input_rows from the weights_dims, then MapAsMatrixWithGivenNumberOfRows) + // is because the current --variable_batch hack consists in overwriting the + // 3rd dimension with the runtime batch size, as we don't keep track for each + // array of which dimension is the batch dimension in it. 
+ // When that is fixed, this should become: + // const auto input_matrix_map = + // MapAsMatrixWithFirstDimAsRows(input_data, input_dims); + const int input_rows = ArraySize(weights_dims, 0); + const auto input_matrix_map = + MapAsMatrixWithGivenNumberOfRows(input_data, input_dims, input_rows); + const auto filter_matrix_map = + MapAsMatrixWithFirstDimAsRows(weights_data, weights_dims); + auto output_matrix_map = + MapAsMatrixWithFirstDimAsRows(output_data, output_dims); + + Gemm(filter_matrix_map.transpose(), input_matrix_map, &output_matrix_map); + AddBiasAndEvalActivationFunction<Ac>(bias_data, bias_dims, output_data, + output_dims); +} + +inline void preload_l1_stream(const uint8* ptr) { +#ifdef GEMMLOWP_ARM_64 + asm volatile("prfm pldl1strm, [%[ptr]]\n" ::[ptr] "r"(ptr) :); +#else + gemmlowp::Prefetch(ptr); +#endif +} + +#ifdef USE_NEON +template <FusedActivationFunctionType Ac> +void FullyConnectedAsGEMV(const uint8* input_data, const Dims<4>& input_dims, + int32 input_offset, const uint8* filter_data, + const Dims<4>& filter_dims, int32 filter_offset, + const int32* bias_data, const Dims<4>& bias_dims, + int32 output_offset, int32 output_multiplier, + int output_shift, int32 output_activation_min, + int32 output_activation_max, uint8* output_data, + const Dims<4>& output_dims) { + gemmlowp::ScopedProfilingLabel label("FullyConnectedAsGEMV/8bit"); + static_assert(Ac == FusedActivationFunctionType::kNone || + Ac == FusedActivationFunctionType::kRelu || + Ac == FusedActivationFunctionType::kRelu6 || + Ac == FusedActivationFunctionType::kRelu1, + ""); + DCHECK(IsPackedWithoutStrides(input_dims)); + DCHECK(IsPackedWithoutStrides(filter_dims)); + DCHECK(IsPackedWithoutStrides(bias_dims)); + DCHECK(IsPackedWithoutStrides(output_dims)); + DCHECK_EQ(ArraySize(output_dims, 1) * ArraySize(output_dims, 2) * + ArraySize(output_dims, 3), + 1); + const int input_size = input_dims.strides[3]; + const int output_size = MatchingArraySize(filter_dims, 1, output_dims, 0); + static constexpr int kPeel = 4; + for (int k = 0; k < input_size; k += 64) { + preload_l1_stream(input_data + k); + } + for (int k = 0; k < kPeel * input_size; k += 64) { + preload_l1_stream(filter_data + k); + } + DCHECK(!(output_size % kPeel)); + const int32* bias_ptr = bias_data; + uint8* output_ptr = output_data; + for (int out = 0; out < output_size; out += kPeel) { + int32x4_t acc[kPeel]; + for (int k = 0; k < kPeel; k++) { + acc[k] = vdupq_n_s32(0); + } + const int16x8_t input_offset_vec = vdupq_n_s16(input_offset); + const int16x8_t filter_offset_vec = vdupq_n_s16(filter_offset); + int in = 0; + for (; in <= input_size - 16; in += 16) { + const uint8x16_t input_val_u8 = vld1q_u8(input_data + in); + uint8x16_t filter_val_u8[kPeel]; + for (int k = 0; k < kPeel; k++) { + const uint8* filter_ptr = filter_data + in + (out + k) * input_size; + filter_val_u8[k] = vld1q_u8(filter_ptr); + preload_l1_stream(filter_ptr + 64); + } + int16x8_t input_val[2]; + const uint8x8_t low = vget_low_u8(input_val_u8); + const uint8x8_t high = vget_high_u8(input_val_u8); + input_val[0] = vreinterpretq_s16_u16(vmovl_u8(low)); + input_val[1] = vreinterpretq_s16_u16(vmovl_u8(high)); + input_val[0] = vaddq_s16(input_val[0], input_offset_vec); + input_val[1] = vaddq_s16(input_val[1], input_offset_vec); + int16x8_t filter_val[kPeel][2]; + for (int k = 0; k < kPeel; k++) { + const uint8x8_t low = vget_low_u8(filter_val_u8[k]); + const uint8x8_t high = vget_high_u8(filter_val_u8[k]); + filter_val[k][0] = vreinterpretq_s16_u16(vmovl_u8(low)); + 
filter_val[k][1] = vreinterpretq_s16_u16(vmovl_u8(high)); + filter_val[k][0] = vaddq_s16(filter_val[k][0], filter_offset_vec); + filter_val[k][1] = vaddq_s16(filter_val[k][1], filter_offset_vec); + } + for (int p = 0; p < 2; p++) { + for (int k = 0; k < kPeel; k++) { + acc[k] = vmlal_s16(acc[k], vget_low_s16(filter_val[k][p]), + vget_low_s16(input_val[p])); + } + for (int k = 0; k < kPeel; k++) { + acc[k] = vmlal_s16(acc[k], vget_high_s16(filter_val[k][p]), + vget_high_s16(input_val[p])); + } + } + } + for (; in <= input_size - 8; in += 8) { + const uint8x8_t input_val_u8 = vld1_u8(input_data + in); + uint8x8_t filter_val_u8[kPeel]; + for (int k = 0; k < kPeel; k++) { + const uint8* filter_ptr = filter_data + in + (out + k) * input_size; + filter_val_u8[k] = vld1_u8(filter_ptr); + } + int16x8_t input_val; + input_val = vreinterpretq_s16_u16(vmovl_u8(input_val_u8)); + input_val = vaddq_s16(input_val, input_offset_vec); + int16x8_t filter_val[kPeel]; + for (int k = 0; k < kPeel; k++) { + filter_val[k] = vreinterpretq_s16_u16(vmovl_u8(filter_val_u8[k])); + filter_val[k] = vaddq_s16(filter_val[k], filter_offset_vec); + } + for (int k = 0; k < kPeel; k++) { + acc[k] = vmlal_s16(acc[k], vget_low_s16(filter_val[k]), + vget_low_s16(input_val)); + } + for (int k = 0; k < kPeel; k++) { + acc[k] = vmlal_s16(acc[k], vget_high_s16(filter_val[k]), + vget_high_s16(input_val)); + } + } + if (in < input_size) { + int32 buf[4 * kPeel]; + for (int k = 0; k < 4; k++) { + vst1q_s32(buf + 4 * k, acc[k]); + } + for (; in < input_size; in++) { + int lane = (in + 8 - input_size) % 4; + const int32 input_val = input_data[in] + input_offset; + for (int k = 0; k < kPeel; k++) { + int32 filter_val = + filter_data[in + (out + k) * input_size] + filter_offset; + buf[lane + 4 * k] += filter_val * input_val; + } + } + for (int k = 0; k < 4; k++) { + acc[k] = vld1q_s32(buf + 4 * k); + } + } + + // Horizontally reduce accumulators + int32x2_t pairwise_reduced_acc[kPeel]; + for (int k = 0; k < kPeel; k++) { + pairwise_reduced_acc[k] = + vpadd_s32(vget_low_s32(acc[k]), vget_high_s32(acc[k])); + } + static_assert(kPeel == 4, "the code below currently assumes kPeel = 4"); + const int32x2_t reduced_lo = + vpadd_s32(pairwise_reduced_acc[0], pairwise_reduced_acc[1]); + const int32x2_t reduced_hi = + vpadd_s32(pairwise_reduced_acc[2], pairwise_reduced_acc[3]); + int32x4_t reduced = vcombine_s32(reduced_lo, reduced_hi); + // Add bias values. + int32x4_t bias_vec = vld1q_s32(bias_ptr); + bias_ptr += 4; + reduced = vaddq_s32(reduced, bias_vec); + // Multiply by the fixed-point multiplier. + reduced = vqrdmulhq_n_s32(reduced, output_multiplier); + // Rounding-shift-right. + using gemmlowp::RoundingDivideByPOT; + reduced = RoundingDivideByPOT(reduced, output_shift); + // Add the output offset. + const int32x4_t output_offset_vec = vdupq_n_s32(output_offset); + reduced = vaddq_s32(reduced, output_offset_vec); + // Narrow values down to 16 bit signed. + const int16x4_t res16 = vqmovn_s32(reduced); + // Narrow values down to 8 bit unsigned, saturating. + uint8x8_t res8 = vqmovun_s16(vcombine_s16(res16, res16)); + if (Ac != FusedActivationFunctionType::kNone) { + // Apply the clamping from the activation function + res8 = vmax_u8(res8, vdup_n_u8(output_activation_min)); + res8 = vmin_u8(res8, vdup_n_u8(output_activation_max)); + } + // Store results to destination. Assumes 32bit alignment. 
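+    // res8 holds kPeel == 4 valid uint8 results in its low lanes, so
+    // reinterpreting them as one uint32 lane and issuing a single 32-bit
+    // store writes all four outputs at once; hence the alignment assumption
+    // and the DCHECK above that output_size is a multiple of kPeel.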
+ vst1_lane_u32(reinterpret_cast<uint32*>(output_ptr), + vreinterpret_u32_u8(res8), 0); + output_ptr += kPeel; + } +} +#endif // USE_NEON + +template <FusedActivationFunctionType Ac> +struct GemmlowpOutputPipeline { + typedef gemmlowp::VectorMap<const int32, gemmlowp::VectorShape::Col> + ColVectorMap; + typedef std::tuple< + gemmlowp::OutputStageBiasAddition<ColVectorMap>, + gemmlowp::OutputStageQuantizeDownInt32ToUint8ScaleByFixedPoint, + gemmlowp::OutputStageClamp, gemmlowp::OutputStageSaturatingCastToUint8> + Pipeline; + static Pipeline Make(const int32* bias_data, int output_rows, + int32 output_offset, int32 output_multiplier, + int output_shift, int32 output_activation_min, + int32 output_activation_max) { + ColVectorMap bias_vector(bias_data, output_rows); + gemmlowp::OutputStageBiasAddition<ColVectorMap> bias_addition_stage; + bias_addition_stage.bias_vector = bias_vector; + gemmlowp::OutputStageQuantizeDownInt32ToUint8ScaleByFixedPoint + quantize_down_stage; + quantize_down_stage.result_offset_after_shift = output_offset; + quantize_down_stage.result_fixedpoint_multiplier = output_multiplier; + quantize_down_stage.result_shift = output_shift; + gemmlowp::OutputStageClamp clamp_stage; + clamp_stage.min = output_activation_min; + clamp_stage.max = output_activation_max; + gemmlowp::OutputStageSaturatingCastToUint8 saturating_cast_stage; + return std::make_tuple(bias_addition_stage, quantize_down_stage, + clamp_stage, saturating_cast_stage); + } +}; + +template <> +struct GemmlowpOutputPipeline<FusedActivationFunctionType::kNone> { + typedef gemmlowp::VectorMap<const int32, gemmlowp::VectorShape::Col> + ColVectorMap; + typedef std::tuple< + gemmlowp::OutputStageBiasAddition<ColVectorMap>, + gemmlowp::OutputStageQuantizeDownInt32ToUint8ScaleByFixedPoint, + gemmlowp::OutputStageSaturatingCastToUint8> + Pipeline; + static Pipeline Make(const int32* bias_data, int output_rows, + int32 output_offset, int32 output_multiplier, + int output_shift, int32 output_activation_min, + int32 output_activation_max) { + DCHECK_EQ(output_activation_min, 0); + DCHECK_EQ(output_activation_max, 255); + ColVectorMap bias_vector(bias_data, output_rows); + gemmlowp::OutputStageBiasAddition<ColVectorMap> bias_addition_stage; + bias_addition_stage.bias_vector = bias_vector; + gemmlowp::OutputStageQuantizeDownInt32ToUint8ScaleByFixedPoint + quantize_down_stage; + quantize_down_stage.result_offset_after_shift = output_offset; + quantize_down_stage.result_fixedpoint_multiplier = output_multiplier; + quantize_down_stage.result_shift = output_shift; + gemmlowp::OutputStageSaturatingCastToUint8 saturating_cast_stage; + return std::make_tuple(bias_addition_stage, quantize_down_stage, + saturating_cast_stage); + } +}; + +template <FusedActivationFunctionType Ac> +void FullyConnected(const uint8* input_data, const Dims<4>& input_dims, + int32 input_offset, const uint8* filter_data, + const Dims<4>& filter_dims, int32 filter_offset, + const int32* bias_data, const Dims<4>& bias_dims, + int32 output_offset, int32 output_multiplier, + int output_shift, int32 output_activation_min, + int32 output_activation_max, uint8* output_data, + const Dims<4>& output_dims, + gemmlowp::GemmContext* gemm_context) { + gemmlowp::ScopedProfilingLabel label("FullyConnected/8bit"); + static_assert(Ac == FusedActivationFunctionType::kNone || + Ac == FusedActivationFunctionType::kRelu || + Ac == FusedActivationFunctionType::kRelu6 || + Ac == FusedActivationFunctionType::kRelu1, + ""); + // TODO: This really should be: + // const int batches = 
ArraySize(output_dims, 1); + // but the current --variable_batch hack consists in overwriting the 3rd + // dimension with the runtime batch size, as we don't keep track for each + // array of which dimension is the batch dimension in it. + const int batches = ArraySize(output_dims, 1) * ArraySize(output_dims, 2) * + ArraySize(output_dims, 3); +#ifdef USE_NEON + const int output_size = MatchingArraySize(filter_dims, 1, output_dims, 0); + if (batches == 1 && !(output_size % 4)) { + return FullyConnectedAsGEMV<Ac>( + input_data, input_dims, input_offset, filter_data, filter_dims, + filter_offset, bias_data, bias_dims, output_offset, output_multiplier, + output_shift, output_activation_min, output_activation_max, output_data, + output_dims); + } +#endif // USE_NEON + const int filter_rows = filter_dims.sizes[1]; + const int filter_cols = filter_dims.sizes[0]; + DCHECK_EQ(filter_dims.sizes[2], 1); + DCHECK_EQ(filter_dims.sizes[3], 1); + const int output_rows = output_dims.sizes[0]; + DCHECK_EQ(output_rows, filter_rows); + DCHECK_EQ(bias_dims.sizes[0], output_rows); + DCHECK_EQ(bias_dims.sizes[1], 1); + DCHECK_EQ(bias_dims.sizes[2], 1); + DCHECK_EQ(bias_dims.sizes[3], 1); + + gemmlowp::MatrixMap<const uint8, gemmlowp::MapOrder::RowMajor> filter_matrix( + filter_data, output_rows, filter_cols, filter_cols); + gemmlowp::MatrixMap<const uint8, gemmlowp::MapOrder::ColMajor> input_matrix( + input_data, filter_cols, batches, filter_cols); + gemmlowp::MatrixMap<uint8, gemmlowp::MapOrder::ColMajor> output_matrix( + output_data, output_rows, batches, output_rows); + const auto& output_pipeline = GemmlowpOutputPipeline<Ac>::Make( + bias_data, output_rows, output_offset, output_multiplier, output_shift, + output_activation_min, output_activation_max); + gemmlowp::GemmWithOutputPipeline<uint8, uint8, + gemmlowp::L8R8WithLhsNonzeroBitDepthParams>( + gemm_context, filter_matrix, input_matrix, &output_matrix, filter_offset, + input_offset, output_pipeline); +} + +template <typename T> +inline void ExtractPatchIntoBufferColumn( + const Dims<4>& input_dims, int w, int h, int b, int kheight, int kwidth, + int stride_width, int stride_height, int pad_width, int pad_height, + int in_width, int in_height, int in_depth, int single_buffer_length, + int buffer_id, const T* in_data, T* conv_buffer_data, uint8 byte_zero) { + gemmlowp::ScopedProfilingLabel label("ExtractPatchIntoBufferColumn"); + // This chunk of code reshapes all the inputs corresponding to + // output (b, h, w) to a column vector in conv_buffer(:, buffer_id). + const int kwidth_times_indepth = kwidth * in_depth; + const int inwidth_times_indepth = in_width * in_depth; + const int ih_ungated_start = h * stride_height - pad_height; + const int ih_ungated_end = (ih_ungated_start + kheight); + const int ih_end = std::min(ih_ungated_end, in_height); + const int iw_ungated_start = w * stride_width - pad_width; + const int iw_ungated_end = (iw_ungated_start + kwidth); + const int iw_end = std::min(iw_ungated_end, in_width); + // If the patch is off the edge of the input image, skip writing those rows + // and columns from the patch into the output array. 
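+  // For example, with kheight == kwidth == 3, pad_height == 1, stride == 1
+  // and output pixel (h, w) == (0, 0), ih_ungated_start is -1, so h_offset
+  // (the top_padding below) becomes 1 and the first kwidth * in_depth entries
+  // of this column are filled with byte_zero instead of image data; the
+  // w_offset / left_padding handling is analogous in the horizontal
+  // direction.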
+ const int h_offset = std::max(0, -ih_ungated_start); + const int w_offset = std::max(0, -iw_ungated_start); + const int ih_start = std::max(0, ih_ungated_start); + const int iw_start = std::max(0, iw_ungated_start); + const int single_row_num = + std::min(kwidth - w_offset, in_width - iw_start) * in_depth; + const int output_row_offset = (buffer_id * single_buffer_length); + int out_offset = + output_row_offset + (h_offset * kwidth + w_offset) * in_depth; + int in_offset = Offset(input_dims, 0, iw_start, ih_start, b); + + // Express all of the calculations as padding around the input patch. + const int top_padding = h_offset; + const int bottom_padding = (ih_ungated_end - ih_end); + const int left_padding = w_offset; + const int right_padding = (iw_ungated_end - iw_end); + assert(single_row_num == + ((kwidth - (left_padding + right_padding)) * in_depth)); + + // Write out zeroes to the elements representing the top rows of the input + // patch that are off the edge of the input image. + if (top_padding > 0) { + const int top_row_elements = (top_padding * kwidth * in_depth); + memset(conv_buffer_data + output_row_offset, byte_zero, + (top_row_elements * sizeof(T))); + } + + // If the patch is on the interior of the input image horizontally, just copy + // over the rows sequentially, otherwise add zero padding at the start or end. + if ((left_padding == 0) && (right_padding == 0)) { + for (int ih = ih_start; ih < ih_end; ++ih) { + memcpy(conv_buffer_data + out_offset, in_data + in_offset, + single_row_num * sizeof(T)); + out_offset += kwidth_times_indepth; + in_offset += inwidth_times_indepth; + } + } else { + for (int ih = ih_start; ih < ih_end; ++ih) { + if (left_padding > 0) { + const int left_start = (out_offset - (left_padding * in_depth)); + memset(conv_buffer_data + left_start, byte_zero, + (left_padding * in_depth * sizeof(T))); + } + memcpy(conv_buffer_data + out_offset, in_data + in_offset, + single_row_num * sizeof(T)); + if (right_padding > 0) { + const int right_start = (out_offset + single_row_num); + memset(conv_buffer_data + right_start, byte_zero, + (right_padding * in_depth * sizeof(T))); + } + out_offset += kwidth_times_indepth; + in_offset += inwidth_times_indepth; + } + } + + // If the bottom of the patch falls off the input image, pad the values + // representing those input rows with zeroes. + if (bottom_padding > 0) { + const int bottom_row_elements = (bottom_padding * kwidth * in_depth); + const int bottom_start = + output_row_offset + + ((top_padding + (ih_end - ih_start)) * kwidth * in_depth); + memset(conv_buffer_data + bottom_start, byte_zero, + (bottom_row_elements * sizeof(T))); + } +} + +template <typename T> +void Im2col(const T* input_data, const Dims<4>& input_dims, int stride_width, + int stride_height, int pad_width, int pad_height, int kheight, + int kwidth, uint8 byte_zero, T* output_data, + const Dims<4>& output_dims) { + gemmlowp::ScopedProfilingLabel label("Im2col"); + DCHECK(IsPackedWithoutStrides(input_dims)); + DCHECK(IsPackedWithoutStrides(output_dims)); + const int batches = MatchingArraySize(input_dims, 3, output_dims, 3); + const int input_depth = ArraySize(input_dims, 0); + const int input_width = ArraySize(input_dims, 1); + const int input_height = ArraySize(input_dims, 2); + const int output_depth = ArraySize(output_dims, 0); + const int output_width = ArraySize(output_dims, 1); + const int output_height = ArraySize(output_dims, 2); + + int buffer_id = 0; + // Loop over the output nodes. 
+ for (int b = 0; b < batches; ++b) { + for (int h = 0; h < output_height; ++h) { + for (int w = 0; w < output_width; ++w) { + ExtractPatchIntoBufferColumn( + input_dims, w, h, b, kheight, kwidth, stride_width, stride_height, + pad_width, pad_height, input_width, input_height, input_depth, + output_depth, buffer_id, input_data, output_data, byte_zero); + ++buffer_id; + } + } + } +} + +template <FusedActivationFunctionType Ac> +void Conv(const float* input_data, const Dims<4>& input_dims, + const float* filter_data, const Dims<4>& filter_dims, + const float* bias_data, const Dims<4>& bias_dims, int stride_width, + int stride_height, int pad_width, int pad_height, float* output_data, + const Dims<4>& output_dims, float* im2col_data, + const Dims<4>& im2col_dims) { + (void)im2col_data; + (void)im2col_dims; + gemmlowp::ScopedProfilingLabel label("Conv"); + + const float* gemm_input_data = nullptr; + const Dims<4>* gemm_input_dims = nullptr; + const int filter_width = ArraySize(filter_dims, 1); + const int filter_height = ArraySize(filter_dims, 2); + const bool need_im2col = stride_width != 1 || stride_height != 1 || + filter_width != 1 || filter_height != 1; + if (need_im2col) { + DCHECK(im2col_data); + Im2col(input_data, input_dims, stride_width, stride_height, pad_width, + pad_height, filter_height, filter_width, 0, im2col_data, + im2col_dims); + gemm_input_data = im2col_data; + gemm_input_dims = &im2col_dims; + } else { +#if 0 // TODO-NNRT : Check if it needs, 'im2col_data' seems to be always not null. + DCHECK(!im2col_data); +#endif + gemm_input_data = input_data; + gemm_input_dims = &input_dims; + } + + const auto im2col_matrix_map = + MapAsMatrixWithFirstDimAsRows(gemm_input_data, *gemm_input_dims); + const auto filter_matrix_map = + MapAsMatrixWithLastDimAsCols(filter_data, filter_dims); + auto output_matrix_map = + MapAsMatrixWithFirstDimAsRows(output_data, output_dims); + + Gemm(filter_matrix_map.transpose(), im2col_matrix_map, &output_matrix_map); + + AddBiasAndEvalActivationFunction<Ac>(bias_data, bias_dims, output_data, + output_dims); +} + +template <FusedActivationFunctionType Ac> +void Conv(const uint8* input_data, const Dims<4>& input_dims, + int32 input_offset, const uint8* filter_data, + const Dims<4>& filter_dims, int32 filter_offset, + const int32* bias_data, const Dims<4>& bias_dims, int stride_width, + int stride_height, int pad_width, int pad_height, int32 output_offset, + int32 output_multiplier, int output_shift, + int32 output_activation_min, int32 output_activation_max, + uint8* output_data, const Dims<4>& output_dims, uint8* im2col_data, + const Dims<4>& im2col_dims, gemmlowp::GemmContext* gemm_context) { + gemmlowp::ScopedProfilingLabel label("Conv/8bit"); + + DCHECK(IsPackedWithoutStrides(input_dims)); + DCHECK(IsPackedWithoutStrides(filter_dims)); + DCHECK(IsPackedWithoutStrides(output_dims)); + + static_assert(Ac == FusedActivationFunctionType::kNone || + Ac == FusedActivationFunctionType::kRelu || + Ac == FusedActivationFunctionType::kRelu6 || + Ac == FusedActivationFunctionType::kRelu1, + ""); + + const uint8* gemm_input_data = nullptr; + const Dims<4>* gemm_input_dims = nullptr; + const int filter_width = ArraySize(filter_dims, 1); + const int filter_height = ArraySize(filter_dims, 2); + const bool need_im2col = stride_width != 1 || stride_height != 1 || + filter_width != 1 || filter_height != 1; + if (need_im2col) { + DCHECK(im2col_data); + const int input_zero_point = -input_offset; + DCHECK_GE(input_zero_point, 0); + DCHECK_LE(input_zero_point, 255); + 
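The im2col call below pads with input_zero_point, the uint8 value that dequantizes to real 0.0, so the padding behaves like zero padding of the underlying float tensor. A small sketch with hypothetical scale and offset values:

  #include <cassert>
  #include <cstdint>

  int main() {
    const float input_scale = 0.5f;     // hypothetical quantization scale
    const int32_t input_offset = -128;  // as passed to Conv(): minus the zero point
    const int32_t input_zero_point = -input_offset;  // 128, used as byte_zero
    // Dequantization: real = scale * (quantized + offset).
    const float real = input_scale * static_cast<float>(input_zero_point + input_offset);
    assert(real == 0.0f);  // padding byte represents real 0.0
    return 0;
  }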
Im2col(input_data, input_dims, stride_width, stride_height, pad_width, + pad_height, filter_height, filter_width, input_zero_point, + im2col_data, im2col_dims); + gemm_input_data = im2col_data; + gemm_input_dims = &im2col_dims; + } else { +#if 0 // TODO-NNRT : Check if it needs, 'im2col_data' seems to be always not null. + DCHECK(!im2col_data); +#endif + gemm_input_data = input_data; + gemm_input_dims = &input_dims; + } + + const int gemm_input_rows = gemm_input_dims->sizes[0]; + const int gemm_input_cols = gemm_input_dims->sizes[1] * + gemm_input_dims->sizes[2] * + gemm_input_dims->sizes[3]; + const int filter_rows = filter_dims.sizes[3]; + const int filter_cols = + filter_dims.sizes[0] * filter_dims.sizes[1] * filter_dims.sizes[2]; + const int output_rows = output_dims.sizes[0]; + const int output_cols = + output_dims.sizes[1] * output_dims.sizes[2] * output_dims.sizes[3]; + DCHECK_EQ(output_rows, filter_rows); + DCHECK_EQ(output_cols, gemm_input_cols); + DCHECK_EQ(filter_cols, gemm_input_rows); + DCHECK_EQ(bias_dims.sizes[0], output_rows); + DCHECK_EQ(bias_dims.sizes[1], 1); + DCHECK_EQ(bias_dims.sizes[2], 1); + DCHECK_EQ(bias_dims.sizes[3], 1); + gemmlowp::MatrixMap<const uint8, gemmlowp::MapOrder::RowMajor> filter_matrix( + filter_data, filter_rows, filter_cols); + gemmlowp::MatrixMap<const uint8, gemmlowp::MapOrder::ColMajor> input_matrix( + gemm_input_data, gemm_input_rows, gemm_input_cols); + gemmlowp::MatrixMap<uint8, gemmlowp::MapOrder::ColMajor> output_matrix( + output_data, output_rows, output_cols); + const auto& output_pipeline = GemmlowpOutputPipeline<Ac>::Make( + bias_data, output_rows, output_offset, output_multiplier, output_shift, + output_activation_min, output_activation_max); + gemmlowp::GemmWithOutputPipeline<uint8, uint8, + gemmlowp::L8R8WithLhsNonzeroBitDepthParams>( + gemm_context, filter_matrix, input_matrix, &output_matrix, filter_offset, + input_offset, output_pipeline); +} + +template <typename T> +inline void DepthToSpace(const T* input_data, const Dims<4>& input_dims, + int block_size, T* output_data, + const Dims<4>& output_dims) { + gemmlowp::ScopedProfilingLabel label("DepthToSpace"); + + const int input_depth = ArraySize(input_dims, 0); + const int input_width = ArraySize(input_dims, 1); + const int input_height = ArraySize(input_dims, 2); + + const int output_depth = ArraySize(output_dims, 0); + const int batch_size = ArraySize(output_dims, 3); + + // Number of continuous values that we can copy in one interation. 
+ const int stride = block_size * output_depth; + + for (int batch = 0; batch < batch_size; ++batch) { + for (int in_h = 0; in_h < input_height; ++in_h) { + const T* input_ptr = input_data + Offset(input_dims, 0, 0, in_h, batch); + for (int offset_h = 0; offset_h < block_size; ++offset_h) { + const T* src = input_ptr; + for (int in_w = 0; in_w < input_width; ++in_w) { + memcpy(output_data, src, stride * sizeof(T)); + output_data += stride; + src += input_depth; + } + input_ptr += stride; + } + } + } +} + +// legacy, for compatibility with old checked-in code +template <FusedActivationFunctionType Ac, typename T> +void Im2col(const T* input_data, const Dims<4>& input_dims, int stride, + int pad_width, int pad_height, int kheight, int kwidth, + uint8 byte_zero, T* output_data, const Dims<4>& output_dims) { + Im2col(input_data, input_dims, stride, stride, pad_width, pad_height, kheight, + kwidth, byte_zero, output_data, output_dims); +} + +// legacy, for compatibility with old checked-in code +template <FusedActivationFunctionType Ac> +void ConvAsGemm(const float* input_data, const Dims<4>& input_dims, + const float* filter_data, const Dims<4>& filter_dims, + const float* bias_data, const Dims<4>& bias_dims, + float* output_data, const Dims<4>& output_dims) { + gemmlowp::ScopedProfilingLabel label("ConvAsGemm"); + + const auto input_matrix_map = + MapAsMatrixWithFirstDimAsRows(input_data, input_dims); + const auto filter_matrix_map = + MapAsMatrixWithLastDimAsCols(filter_data, filter_dims); + auto output_matrix_map = + MapAsMatrixWithFirstDimAsRows(output_data, output_dims); + + Gemm(filter_matrix_map.transpose(), input_matrix_map, &output_matrix_map); + + AddBiasAndEvalActivationFunction<Ac>(bias_data, bias_dims, output_data, + output_dims); +} + +// legacy, for compatibility with old checked-in code +template <FusedActivationFunctionType Ac> +void ConvAsGemm(const uint8* input_data, const Dims<4>& input_dims, + int32 input_offset, const uint8* filter_data, + const Dims<4>& filter_dims, int32 filter_offset, + const int32* bias_data, const Dims<4>& bias_dims, + int32 output_offset, int32 output_multiplier, int output_shift, + int32 output_activation_min, int32 output_activation_max, + uint8* output_data, const Dims<4>& output_dims, + gemmlowp::GemmContext* gemm_context) { + gemmlowp::ScopedProfilingLabel label("ConvAsGemm/8bit"); + static_assert(Ac == FusedActivationFunctionType::kNone || + Ac == FusedActivationFunctionType::kRelu || + Ac == FusedActivationFunctionType::kRelu6 || + Ac == FusedActivationFunctionType::kRelu1, + ""); + const int input_rows = input_dims.sizes[0]; + const int input_cols = + input_dims.sizes[1] * input_dims.sizes[2] * input_dims.sizes[3]; + const int filter_rows = filter_dims.sizes[3]; + const int filter_cols = + filter_dims.sizes[0] * filter_dims.sizes[1] * filter_dims.sizes[2]; + const int output_rows = output_dims.sizes[0]; + const int output_cols = + output_dims.sizes[1] * output_dims.sizes[2] * output_dims.sizes[3]; + DCHECK_EQ(output_rows, filter_rows); + DCHECK_EQ(output_cols, input_cols); + DCHECK_EQ(filter_cols, input_rows); + DCHECK_EQ(bias_dims.sizes[0], output_rows); + DCHECK_EQ(bias_dims.sizes[1], 1); + DCHECK_EQ(bias_dims.sizes[2], 1); + DCHECK_EQ(bias_dims.sizes[3], 1); + gemmlowp::MatrixMap<const uint8, gemmlowp::MapOrder::RowMajor> filter_matrix( + filter_data, output_rows, filter_cols, filter_cols); + gemmlowp::MatrixMap<const uint8, gemmlowp::MapOrder::ColMajor> input_matrix( + input_data, filter_cols, output_cols, filter_cols); + 
gemmlowp::MatrixMap<uint8, gemmlowp::MapOrder::ColMajor> output_matrix( + output_data, output_rows, output_cols, output_rows); + const auto& output_pipeline = GemmlowpOutputPipeline<Ac>::Make( + bias_data, output_rows, output_offset, output_multiplier, output_shift, + output_activation_min, output_activation_max); + gemmlowp::GemmWithOutputPipeline<uint8, uint8, + gemmlowp::L8R8WithLhsNonzeroBitDepthParams>( + gemm_context, filter_matrix, input_matrix, &output_matrix, filter_offset, + input_offset, output_pipeline); +} + +template <typename T> +inline void SpaceToDepth(const T* input_data, const Dims<4>& input_dims, + int block_size, T* output_data, + const Dims<4>& output_dims) { + gemmlowp::ScopedProfilingLabel label("SpaceToDepth"); + + const int output_depth = ArraySize(output_dims, 0); + const int output_width = ArraySize(output_dims, 1); + const int output_height = ArraySize(output_dims, 2); + + const int input_depth = ArraySize(input_dims, 0); + const int batch_size = ArraySize(input_dims, 3); + + // Number of continuous values that we can copy in one interation. + const int stride = block_size * input_depth; + + for (int batch = 0; batch < batch_size; ++batch) { + for (int out_h = 0; out_h < output_height; ++out_h) { + T* output_ptr = output_data + Offset(output_dims, 0, 0, out_h, batch); + for (int offset_h = 0; offset_h < block_size; ++offset_h) { + T* dst = output_ptr; + for (int out_w = 0; out_w < output_width; ++out_w) { + memcpy(dst, input_data, stride * sizeof(T)); + input_data += stride; + dst += output_depth; + } + output_ptr += stride; + } + } + } +} + +template <FusedActivationFunctionType Ac> +void NonGlobalBatchNormalization( + const float* input_data, const Dims<4>& input_dims, const float* mean_data, + const Dims<4>& mean_dims, const float* multiplier_data, + const Dims<4>& multiplier_dims, const float* offset_data, + const Dims<4>& offset_dims, float* output_data, + const Dims<4>& output_dims) { + gemmlowp::ScopedProfilingLabel label("NonGlobalBatchNormalization"); + const int batches = MatchingArraySize(input_dims, 3, output_dims, 3); + const int height = + MatchingArraySize(input_dims, 2, mean_dims, 2, multiplier_dims, 2, + offset_dims, 2, output_dims, 2); + const int width = + MatchingArraySize(input_dims, 1, mean_dims, 1, multiplier_dims, 1, + offset_dims, 1, output_dims, 1); + const int depth = + MatchingArraySize(input_dims, 0, mean_dims, 0, multiplier_dims, 0, + offset_dims, 0, output_dims, 0); + + for (int b = 0; b < batches; ++b) { + for (int y = 0; y < height; ++y) { + for (int x = 0; x < width; ++x) { + for (int c = 0; c < depth; ++c) { + output_data[Offset(output_dims, c, x, y, b)] = ActivationFunction<Ac>( + (input_data[Offset(input_dims, c, x, y, b)] - + mean_data[Offset(mean_dims, c, x, y, 0)]) * + multiplier_data[Offset(multiplier_dims, c, x, y, 0)] + + offset_data[Offset(offset_dims, c, x, y, 0)]); + } + } + } + } +} + +template <FusedActivationFunctionType Ac> +void GlobalBatchNormalization(const float* input_data, + const Dims<4>& input_dims, const float* mean_data, + const Dims<4>& mean_dims, + const float* multiplier_data, + const Dims<4>& multiplier_dims, + const float* offset_data, + const Dims<4>& offset_dims, float* output_data, + const Dims<4>& output_dims) { + gemmlowp::ScopedProfilingLabel label("GlobalBatchNormalization"); + const int batches = MatchingArraySize(input_dims, 3, output_dims, 3); + const int height = MatchingArraySize(input_dims, 2, output_dims, 2); + const int width = MatchingArraySize(input_dims, 1, output_dims, 1); + 
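Both batch-normalization kernels compute (input - mean) * multiplier + offset per element. A scalar sketch, assuming the usual folding of gamma and the variance into the multiplier (an assumption, not stated in this code), with hypothetical per-channel values:

  #include <cassert>
  #include <cmath>

  int main() {
    const float mean = 0.25f, variance = 4.0f, gamma = 1.5f, beta = 0.1f, eps = 1e-5f;
    // Assumed folding: multiplier = gamma / sqrt(variance + eps), offset = beta.
    const float multiplier = gamma / std::sqrt(variance + eps);
    const float offset = beta;
    const float input = 2.0f;
    // What the innermost loop computes for one element, before any fused activation.
    const float output = (input - mean) * multiplier + offset;
    assert(output > 1.3f && output < 1.5f);  // roughly 1.41 with these values
    return 0;
  }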
const int depth = + MatchingArraySize(input_dims, 0, mean_dims, 0, multiplier_dims, 0, + offset_dims, 0, output_dims, 0); + + for (int b = 0; b < batches; ++b) { + for (int y = 0; y < height; ++y) { + for (int x = 0; x < width; ++x) { + for (int c = 0; c < depth; ++c) { + output_data[Offset(output_dims, c, x, y, b)] = ActivationFunction<Ac>( + (input_data[Offset(input_dims, c, x, y, b)] - + mean_data[Offset(mean_dims, c, 0, 0, 0)]) * + multiplier_data[Offset(multiplier_dims, c, 0, 0, 0)] + + offset_data[Offset(offset_dims, c, 0, 0, 0)]); + } + } + } + } +} + +inline void Relu(const float* input_data, const Dims<4>& input_dims, + float* output_data, const Dims<4>& output_dims) { + gemmlowp::ScopedProfilingLabel label("Relu (not fused)"); + const int batches = MatchingArraySize(input_dims, 3, output_dims, 3); + const int height = MatchingArraySize(input_dims, 2, output_dims, 2); + const int width = MatchingArraySize(input_dims, 1, output_dims, 1); + const int depth = MatchingArraySize(input_dims, 0, output_dims, 0); + for (int b = 0; b < batches; ++b) { + for (int y = 0; y < height; ++y) { + for (int x = 0; x < width; ++x) { + for (int c = 0; c < depth; ++c) { + float val = input_data[Offset(input_dims, c, x, y, b)]; + const float lower = 0; + float clamped = val < lower ? lower : val; + output_data[Offset(output_dims, c, x, y, b)] = clamped; + } + } + } + } +} + +inline void Relu1(const float* input_data, const Dims<4>& input_dims, + float* output_data, const Dims<4>& output_dims) { + gemmlowp::ScopedProfilingLabel label("Relu1 (not fused)"); + const int batches = MatchingArraySize(input_dims, 3, output_dims, 3); + const int height = MatchingArraySize(input_dims, 2, output_dims, 2); + const int width = MatchingArraySize(input_dims, 1, output_dims, 1); + const int depth = MatchingArraySize(input_dims, 0, output_dims, 0); + for (int b = 0; b < batches; ++b) { + for (int y = 0; y < height; ++y) { + for (int x = 0; x < width; ++x) { + for (int c = 0; c < depth; ++c) { + float val = input_data[Offset(input_dims, c, x, y, b)]; + const float upper = 1; + const float lower = -1; + float clamped = val > upper ? upper : val < lower ? lower : val; + output_data[Offset(output_dims, c, x, y, b)] = clamped; + } + } + } + } +} + +inline void Relu6(const float* input_data, const Dims<4>& input_dims, + float* output_data, const Dims<4>& output_dims) { + gemmlowp::ScopedProfilingLabel label("Relu6 (not fused)"); + const int batches = MatchingArraySize(input_dims, 3, output_dims, 3); + const int height = MatchingArraySize(input_dims, 2, output_dims, 2); + const int width = MatchingArraySize(input_dims, 1, output_dims, 1); + const int depth = MatchingArraySize(input_dims, 0, output_dims, 0); + for (int b = 0; b < batches; ++b) { + for (int y = 0; y < height; ++y) { + for (int x = 0; x < width; ++x) { + for (int c = 0; c < depth; ++c) { + float val = input_data[Offset(input_dims, c, x, y, b)]; + const float upper = 6; + const float lower = 0; + float clamped = val > upper ? upper : val < lower ? 
lower : val; + output_data[Offset(output_dims, c, x, y, b)] = clamped; + } + } + } + } +} + +template <FusedActivationFunctionType Ac> +void L2Normalization(const float* input_data, const Dims<4>& input_dims, + float* output_data, const Dims<4>& output_dims) { + gemmlowp::ScopedProfilingLabel label("L2Normalization"); + static_assert(Ac == FusedActivationFunctionType::kNone, ""); + const int batches = MatchingArraySize(input_dims, 3, output_dims, 3); + const int height = MatchingArraySize(input_dims, 2, output_dims, 2); + const int width = MatchingArraySize(input_dims, 1, output_dims, 1); + const int depth = MatchingArraySize(input_dims, 0, output_dims, 0); + for (int b = 0; b < batches; ++b) { + for (int y = 0; y < height; ++y) { + for (int x = 0; x < width; ++x) { + float squared_l2_norm = 0; + for (int c = 0; c < depth; ++c) { + float val = input_data[Offset(input_dims, c, x, y, b)]; + squared_l2_norm += val * val; + } + float inverse_l2_norm = 1.0f / std::sqrt(squared_l2_norm); + for (int c = 0; c < depth; ++c) { + output_data[Offset(output_dims, c, x, y, b)] = + input_data[Offset(input_dims, c, x, y, b)] * inverse_l2_norm; + } + } + } + } +} + +inline void GetInvSqrtQuantizedMultiplier(int32 input, int32* output_inv_sqrt, + int* output_shift) { + *output_shift = 11; + while (input >= (1 << 29)) { + input /= 4; + ++*output_shift; + } + DCHECK_GT(input, 0); + const unsigned max_left_shift_bits = __builtin_clz(input) - 1; + const unsigned max_left_shift_bit_pairs = max_left_shift_bits / 2; + const unsigned left_shift_bit_pairs = max_left_shift_bit_pairs - 1; + *output_shift -= left_shift_bit_pairs; + input <<= 2 * left_shift_bit_pairs; + DCHECK_GE(input, (1 << 27)); + DCHECK_LT(input, (1 << 29)); + using gemmlowp::FixedPoint; + using gemmlowp::Rescale; + using gemmlowp::SaturatingRoundingMultiplyByPOT; + // Using 3 integer bits gives us enough room for the internal arithmetic in + // this Newton-Raphson iteration. + using F3 = FixedPoint<int32, 3>; + using F0 = FixedPoint<int32, 0>; + const F3 fixedpoint_input = F3::FromRaw(input >> 1); + const F3 fixedpoint_half_input = + SaturatingRoundingMultiplyByPOT<-1>(fixedpoint_input); + const F3 fixedpoint_half_three = + GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT(F3, (1 << 28) + (1 << 27), 1.5); + // Newton-Raphson iteration + // Naive unoptimized starting guess: x = 1 + F3 x = F3::One(); + // Naive unoptimized number of iterations: 5 + for (int i = 0; i < 5; i++) { + const F3 x3 = Rescale<3>(x * x * x); + x = Rescale<3>(fixedpoint_half_three * x - fixedpoint_half_input * x3); + } + const F0 fixedpoint_half_sqrt_2 = + GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT(F0, 1518500250, std::sqrt(2.) 
/ 2.); + x = x * fixedpoint_half_sqrt_2; + *output_inv_sqrt = x.raw(); + if (*output_shift < 0) { + *output_inv_sqrt <<= -*output_shift; + *output_shift = 0; + } +} + +inline void L2Normalization(const uint8* input_data, const Dims<4>& input_dims, + int32 input_zero_point, uint8* output_data, + const Dims<4>& output_dims) { + gemmlowp::ScopedProfilingLabel label("L2Normalization/8bit"); + const int batches = MatchingArraySize(input_dims, 3, output_dims, 3); + const int height = MatchingArraySize(input_dims, 2, output_dims, 2); + const int width = MatchingArraySize(input_dims, 1, output_dims, 1); + const int depth = MatchingArraySize(input_dims, 0, output_dims, 0); + DCHECK(IsPackedWithoutStrides(input_dims)); + DCHECK(IsPackedWithoutStrides(output_dims)); + DCHECK_EQ(batches, 1); + DCHECK_EQ(height, 1); + DCHECK_EQ(width, 1); + int32 square_l2_norm = 0; + for (int i = 0; i < depth; i++) { + int32 diff = input_data[i] - input_zero_point; + square_l2_norm += diff * diff; + } + int32 inv_l2norm_multiplier; + int inv_l2norm_shift; + GetInvSqrtQuantizedMultiplier(square_l2_norm, &inv_l2norm_multiplier, + &inv_l2norm_shift); + + for (int i = 0; i < depth; i++) { + int32 diff = input_data[i] - input_zero_point; + int32 rescaled_diff = MultiplyByQuantizedMultiplierSmallerThanOne( + 128 * diff, inv_l2norm_multiplier, inv_l2norm_shift); + int32 unclamped_output_val = 128 + rescaled_diff; + int32 output_val = std::min(255, std::max(0, unclamped_output_val)); + output_data[i] = static_cast<uint8>(output_val); + } +} + +template <FusedActivationFunctionType Ac> +void Add(const float* input1_data, const Dims<4>& input1_dims, + const float* input2_data, const Dims<4>& input2_dims, + float* output_data, const Dims<4>& output_dims) { + gemmlowp::ScopedProfilingLabel label("Add"); + /* const int batches = */ MatchingArraySize(input1_dims, 3, input2_dims, 3, + output_dims, 3); + /* const int height = */ MatchingArraySize(input1_dims, 2, input2_dims, 2, + output_dims, 2); + /* const int width = */ MatchingArraySize(input1_dims, 1, input2_dims, 1, + output_dims, 1); + /* const int depth = */ MatchingArraySize(input1_dims, 0, input2_dims, 0, + output_dims, 0); + DCHECK(IsPackedWithoutStrides(input1_dims)); + DCHECK(IsPackedWithoutStrides(input2_dims)); + DCHECK(IsPackedWithoutStrides(output_dims)); + + int i = 0; + const int size = input1_dims.sizes[3] * input1_dims.strides[3]; +#ifdef USE_NEON + const auto zero = vdupq_n_f32(0); + const auto six = vdupq_n_f32(6); + const auto neg_one = vdupq_n_f32(-1); + const auto one = vdupq_n_f32(1); + for (; i <= size - 16; i += 16) { + auto a10 = vld1q_f32(input1_data + i); + auto a11 = vld1q_f32(input1_data + i + 4); + auto a12 = vld1q_f32(input1_data + i + 8); + auto a13 = vld1q_f32(input1_data + i + 12); + auto a20 = vld1q_f32(input2_data + i); + auto a21 = vld1q_f32(input2_data + i + 4); + auto a22 = vld1q_f32(input2_data + i + 8); + auto a23 = vld1q_f32(input2_data + i + 12); + auto x0 = vaddq_f32(a10, a20); + auto x1 = vaddq_f32(a11, a21); + auto x2 = vaddq_f32(a12, a22); + auto x3 = vaddq_f32(a13, a23); + if (Ac == FusedActivationFunctionType::kRelu || + Ac == FusedActivationFunctionType::kRelu6) { + x0 = vmaxq_f32(zero, x0); + x1 = vmaxq_f32(zero, x1); + x2 = vmaxq_f32(zero, x2); + x3 = vmaxq_f32(zero, x3); + if (Ac == FusedActivationFunctionType::kRelu6) { + x0 = vminq_f32(six, x0); + x1 = vminq_f32(six, x1); + x2 = vminq_f32(six, x2); + x3 = vminq_f32(six, x3); + } + } else if (Ac == FusedActivationFunctionType::kRelu1) { + x0 = vmaxq_f32(neg_one, x0); + x1 = 
vmaxq_f32(neg_one, x1); + x2 = vmaxq_f32(neg_one, x2); + x3 = vmaxq_f32(neg_one, x3); + x0 = vminq_f32(one, x0); + x1 = vminq_f32(one, x1); + x2 = vminq_f32(one, x2); + x3 = vminq_f32(one, x3); + } + vst1q_f32(output_data + i, x0); + vst1q_f32(output_data + i + 4, x1); + vst1q_f32(output_data + i + 8, x2); + vst1q_f32(output_data + i + 12, x3); + } + for (; i <= size - 4; i += 4) { + auto a1 = vld1q_f32(input1_data + i); + auto a2 = vld1q_f32(input2_data + i); + auto x = vaddq_f32(a1, a2); + if (Ac == FusedActivationFunctionType::kRelu || + Ac == FusedActivationFunctionType::kRelu6) { + x = vmaxq_f32(zero, x); + if (Ac == FusedActivationFunctionType::kRelu6) { + x = vminq_f32(six, x); + } + } else if (Ac == FusedActivationFunctionType::kRelu1) { + x = vmaxq_f32(neg_one, x); + x = vminq_f32(one, x); + } + vst1q_f32(output_data + i, x); + } +#endif // NEON + + for (; i < size; i++) { + auto x = input1_data[i] + input2_data[i]; + output_data[i] = ActivationFunction<Ac>(x); + } +} + +template <FusedActivationFunctionType Ac> +inline void Add(int left_shift, const uint8* input1_data, + const Dims<4>& input1_dims, int32 input1_offset, + int32 input1_multiplier, int input1_shift, + const uint8* input2_data, const Dims<4>& input2_dims, + int32 input2_offset, int32 input2_multiplier, int input2_shift, + int32 output_offset, int32 output_multiplier, int output_shift, + int32 output_activation_min, int32 output_activation_max, + uint8* output_data, const Dims<4>& output_dims) { + static_assert(Ac == FusedActivationFunctionType::kNone || + Ac == FusedActivationFunctionType::kRelu || + Ac == FusedActivationFunctionType::kRelu6 || + Ac == FusedActivationFunctionType::kRelu1, + ""); + DCHECK_LE(output_activation_min, output_activation_max); + if (Ac == FusedActivationFunctionType::kNone) { + DCHECK_EQ(output_activation_min, 0); + DCHECK_EQ(output_activation_max, 255); + } + gemmlowp::ScopedProfilingLabel label("Add/8bit"); + /* const int batches = */ MatchingArraySize(input1_dims, 3, input2_dims, 3, + output_dims, 3); + /* const int height = */ MatchingArraySize(input1_dims, 2, input2_dims, 2, + output_dims, 2); + /* const int width = */ MatchingArraySize(input1_dims, 1, input2_dims, 1, + output_dims, 1); + /* const int depth = */ MatchingArraySize(input1_dims, 0, input2_dims, 0, + output_dims, 0); + DCHECK(IsPackedWithoutStrides(input1_dims)); + DCHECK(IsPackedWithoutStrides(input2_dims)); + DCHECK(IsPackedWithoutStrides(output_dims)); + + int i = 0; + const int size = input1_dims.sizes[3] * input1_dims.strides[3]; + DCHECK_GT(input1_offset, -256); + DCHECK_GT(input2_offset, -256); + DCHECK_LT(input1_offset, 256); + DCHECK_LT(input2_offset, 256); +#ifdef USE_NEON + for (; i <= size - 8; i += 8) { + const auto input1_val_original = vld1_u8(input1_data + i); + const auto input2_val_original = vld1_u8(input2_data + i); + const auto input1_val_s16 = + vreinterpretq_s16_u16(vmovl_u8(input1_val_original)); + const auto input2_val_s16 = + vreinterpretq_s16_u16(vmovl_u8(input2_val_original)); + const auto input1_val = + vaddq_s16(input1_val_s16, vdupq_n_s16(input1_offset)); + const auto input2_val = + vaddq_s16(input2_val_s16, vdupq_n_s16(input2_offset)); + const auto input1_val_high = vget_high_s16(input1_val); + const auto input1_val_low = vget_low_s16(input1_val); + const auto input2_val_high = vget_high_s16(input2_val); + const auto input2_val_low = vget_low_s16(input2_val); + auto x11 = vmovl_s16(input1_val_low); + auto x12 = vmovl_s16(input1_val_high); + auto x21 = vmovl_s16(input2_val_low); + auto x22 = 
vmovl_s16(input2_val_high); + const auto left_shift_dup = vdupq_n_s32(left_shift); + x11 = vshlq_s32(x11, left_shift_dup); + x12 = vshlq_s32(x12, left_shift_dup); + x21 = vshlq_s32(x21, left_shift_dup); + x22 = vshlq_s32(x22, left_shift_dup); + x11 = vqrdmulhq_n_s32(x11, input1_multiplier); + x12 = vqrdmulhq_n_s32(x12, input1_multiplier); + x21 = vqrdmulhq_n_s32(x21, input2_multiplier); + x22 = vqrdmulhq_n_s32(x22, input2_multiplier); + const auto input1_shift_dup = vdupq_n_s32(-input1_shift); + const auto input2_shift_dup = vdupq_n_s32(-input2_shift); + x11 = vshlq_s32(x11, input1_shift_dup); + x12 = vshlq_s32(x12, input1_shift_dup); + x21 = vshlq_s32(x21, input2_shift_dup); + x22 = vshlq_s32(x22, input2_shift_dup); + auto s1 = vaddq_s32(x11, x21); + auto s2 = vaddq_s32(x12, x22); + s1 = vqrdmulhq_n_s32(s1, output_multiplier); + s2 = vqrdmulhq_n_s32(s2, output_multiplier); + using gemmlowp::RoundingDivideByPOT; + s1 = RoundingDivideByPOT(s1, output_shift); + s2 = RoundingDivideByPOT(s2, output_shift); + const auto s1_narrowed = vmovn_s32(s1); + const auto s2_narrowed = vmovn_s32(s2); + const auto s = vaddq_s16(vcombine_s16(s1_narrowed, s2_narrowed), + vdupq_n_s16(output_offset)); + vst1_u8(output_data + i, vqmovun_s16(s)); + } +#endif // NEON + + for (; i < size; i++) { + const int32 input1_val = input1_offset + input1_data[i]; + const int32 input2_val = input2_offset + input2_data[i]; + const int32 shifted_input1_val = input1_val * (1 << left_shift); + const int32 shifted_input2_val = input2_val * (1 << left_shift); + const int32 scaled_input1_val = MultiplyByQuantizedMultiplierSmallerThanOne( + shifted_input1_val, input1_multiplier, input1_shift); + const int32 scaled_input2_val = MultiplyByQuantizedMultiplierSmallerThanOne( + shifted_input2_val, input2_multiplier, input2_shift); + const int32 raw_sum = scaled_input1_val + scaled_input2_val; + const int32 raw_output = MultiplyByQuantizedMultiplierSmallerThanOne( + raw_sum, output_multiplier, output_shift) + + output_offset; + const int32 clamped_output = std::min( + output_activation_max, std::max(output_activation_min, raw_output)); + output_data[i] = static_cast<uint8>(clamped_output); + } +} + + +// TODO: We can implement BroadcastAdd on buffers of arbitrary +// dimensionality if the runtime code does a single loop over one dimension +// that handles broadcasting as the base case. The code generator would then +// generate max(D1, D2) nested for loops. +// TODO: BroadcastAdd is intentionally duplicated from +// reference_ops.h. Once an optimized version is implemented and NdArrayDesc<T> +// is no longer referenced in this file, move NdArrayDesc<T> from types.h to +// reference_ops.h. +template <FusedActivationFunctionType Ac> +void BroadcastAdd(const float* input1_data, const Dims<4>& input1_dims, + const float* input2_data, const Dims<4>& input2_dims, + float* output_data, const Dims<4>& output_dims) { + gemmlowp::ScopedProfilingLabel label("BroadcastAdd"); + + NdArrayDesc<4> desc1; + NdArrayDesc<4> desc2; + NdArrayDescsForElementwiseBroadcast(input1_dims, input2_dims, &desc1, &desc2); + + // In Tensorflow, the dimensions are canonically named (batch_number, row, + // col, channel), with extents (batches, height, width, depth), with the + // trailing dimension changing most rapidly (channels has the smallest stride, + // typically 1 element). + // + // In generated C code, we store arrays with the dimensions reversed. The + // first dimension has smallest stride. 
+ // + // We name our variables by their Tensorflow convention, but generate C code + // nesting loops such that the innermost loop has the smallest stride for the + // best cache behavior. + for (int b = 0; b < ArraySize(output_dims, 3); ++b) { + for (int y = 0; y < ArraySize(output_dims, 2); ++y) { + for (int x = 0; x < ArraySize(output_dims, 1); ++x) { + for (int c = 0; c < ArraySize(output_dims, 0); ++c) { + output_data[Offset(output_dims, c, x, y, b)] = ActivationFunction<Ac>( + input1_data[SubscriptToIndex(desc1, c, x, y, b)] + + input2_data[SubscriptToIndex(desc2, c, x, y, b)]); + } + } + } + } +} + +template <FusedActivationFunctionType Ac> +inline void BroadcastAdd(int left_shift, const uint8* input1_data, + const Dims<4>& input1_dims, int32 input1_offset, + int32 input1_multiplier, int input1_shift, + const uint8* input2_data, const Dims<4>& input2_dims, + int32 input2_offset, int32 input2_multiplier, + int input2_shift, int32 output_offset, + int32 output_multiplier, int output_shift, + int32 output_activation_min, + int32 output_activation_max, uint8* output_data, + const Dims<4>& output_dims) { + static_assert(Ac == FusedActivationFunctionType::kNone || + Ac == FusedActivationFunctionType::kRelu || + Ac == FusedActivationFunctionType::kRelu6 || + Ac == FusedActivationFunctionType::kRelu1, + ""); + DCHECK_LE(output_activation_min, output_activation_max); + if (Ac == FusedActivationFunctionType::kNone) { + DCHECK_EQ(output_activation_min, 0); + DCHECK_EQ(output_activation_max, 255); + } + gemmlowp::ScopedProfilingLabel label("BroadcastAdd/8bit"); + + NdArrayDesc<4> desc1; + NdArrayDesc<4> desc2; + NdArrayDescsForElementwiseBroadcast(input1_dims, input2_dims, &desc1, &desc2); + + // In Tensorflow, the dimensions are canonically named (batch_number, row, + // col, channel), with extents (batches, height, width, depth), with the + // trailing dimension changing most rapidly (channels has the smallest stride, + // typically 1 element). + // + // In generated C code, we store arrays with the dimensions reversed. The + // first dimension has smallest stride. + // + // We name our variables by their Tensorflow convention, but generate C code + // nesting loops such that the innermost loop has the smallest stride for the + // best cache behavior. 
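One way to picture the broadcasting that NdArrayDescsForElementwiseBroadcast and SubscriptToIndex implement in the loops below is a stride of zero along the broadcast dimension. A minimal sketch of that idea, with hypothetical shapes:

  int main() {
    const int depth = 4;
    const float input1[4] = {1.0f, 2.0f, 3.0f, 4.0f};  // shape [4]
    const float input2[1] = {10.0f};                   // shape [1], broadcast over depth
    const int stride2 = 0;                             // broadcasting as a zero stride
    float output[4];
    for (int c = 0; c < depth; ++c) {
      output[c] = input1[c] + input2[c * stride2];
    }
    return output[3] == 14.0f ? 0 : 1;
  }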
+ for (int b = 0; b < ArraySize(output_dims, 3); ++b) { + for (int y = 0; y < ArraySize(output_dims, 2); ++y) { + for (int x = 0; x < ArraySize(output_dims, 1); ++x) { + for (int c = 0; c < ArraySize(output_dims, 0); ++c) { + const int32 input1_val = + input1_offset + input1_data[SubscriptToIndex(desc1, c, x, y, b)]; + const int32 input2_val = + input2_offset + input2_data[SubscriptToIndex(desc2, c, x, y, b)]; + const int32 shifted_input1_val = input1_val * (1 << left_shift); + const int32 shifted_input2_val = input2_val * (1 << left_shift); + const int32 scaled_input1_val = + MultiplyByQuantizedMultiplierSmallerThanOne( + shifted_input1_val, input1_multiplier, input1_shift); + const int32 scaled_input2_val = + MultiplyByQuantizedMultiplierSmallerThanOne( + shifted_input2_val, input2_multiplier, input2_shift); + const int32 raw_sum = scaled_input1_val + scaled_input2_val; + const int32 raw_output = + MultiplyByQuantizedMultiplierSmallerThanOne( + raw_sum, output_multiplier, output_shift) + + output_offset; + const int32 clamped_output = + std::min(output_activation_max, + std::max(output_activation_min, raw_output)); + output_data[Offset(output_dims, c, x, y, b)] = + static_cast<uint8>(clamped_output); + } + } + } + } +} + +template <FusedActivationFunctionType Ac> +void Mul(const float* input1_data, const Dims<4>& input1_dims, + const float* input2_data, const Dims<4>& input2_dims, + float* output_data, const Dims<4>& output_dims) { + gemmlowp::ScopedProfilingLabel label("Mul"); + /* const int batches = */ MatchingArraySize(input1_dims, 3, input2_dims, 3, + output_dims, 3); + /* const int height = */ MatchingArraySize(input1_dims, 2, input2_dims, 2, + output_dims, 2); + /* const int width = */ MatchingArraySize(input1_dims, 1, input2_dims, 1, + output_dims, 1); + /* const int depth = */ MatchingArraySize(input1_dims, 0, input2_dims, 0, + output_dims, 0); + DCHECK(IsPackedWithoutStrides(input1_dims)); + DCHECK(IsPackedWithoutStrides(input2_dims)); + DCHECK(IsPackedWithoutStrides(output_dims)); + + int i = 0; + const int size = input1_dims.sizes[3] * input1_dims.strides[3]; +#ifdef USE_NEON + const auto zero = vdupq_n_f32(0); + const auto six = vdupq_n_f32(6); + const auto neg_one = vdupq_n_f32(-1); + const auto one = vdupq_n_f32(1); + for (; i <= size - 16; i += 16) { + auto a10 = vld1q_f32(input1_data + i); + auto a11 = vld1q_f32(input1_data + i + 4); + auto a12 = vld1q_f32(input1_data + i + 8); + auto a13 = vld1q_f32(input1_data + i + 12); + auto a20 = vld1q_f32(input2_data + i); + auto a21 = vld1q_f32(input2_data + i + 4); + auto a22 = vld1q_f32(input2_data + i + 8); + auto a23 = vld1q_f32(input2_data + i + 12); + auto x0 = vmulq_f32(a10, a20); + auto x1 = vmulq_f32(a11, a21); + auto x2 = vmulq_f32(a12, a22); + auto x3 = vmulq_f32(a13, a23); + if (Ac == FusedActivationFunctionType::kRelu || + Ac == FusedActivationFunctionType::kRelu6) { + x0 = vmaxq_f32(zero, x0); + x1 = vmaxq_f32(zero, x1); + x2 = vmaxq_f32(zero, x2); + x3 = vmaxq_f32(zero, x3); + if (Ac == FusedActivationFunctionType::kRelu6) { + x0 = vminq_f32(six, x0); + x1 = vminq_f32(six, x1); + x2 = vminq_f32(six, x2); + x3 = vminq_f32(six, x3); + } + } else if (Ac == FusedActivationFunctionType::kRelu1) { + x0 = vmaxq_f32(neg_one, x0); + x1 = vmaxq_f32(neg_one, x1); + x2 = vmaxq_f32(neg_one, x2); + x3 = vmaxq_f32(neg_one, x3); + x0 = vminq_f32(one, x0); + x1 = vminq_f32(one, x1); + x2 = vminq_f32(one, x2); + x3 = vminq_f32(one, x3); + } + vst1q_f32(output_data + i, x0); + vst1q_f32(output_data + i + 4, x1); + 
vst1q_f32(output_data + i + 8, x2); + vst1q_f32(output_data + i + 12, x3); + } + for (; i <= size - 4; i += 4) { + auto a1 = vld1q_f32(input1_data + i); + auto a2 = vld1q_f32(input2_data + i); + auto x = vmulq_f32(a1, a2); + if (Ac == FusedActivationFunctionType::kRelu || + Ac == FusedActivationFunctionType::kRelu6) { + x = vmaxq_f32(zero, x); + if (Ac == FusedActivationFunctionType::kRelu6) { + x = vminq_f32(six, x); + } + } else if (Ac == FusedActivationFunctionType::kRelu1) { + x = vmaxq_f32(neg_one, x); + x = vminq_f32(one, x); + } + vst1q_f32(output_data + i, x); + } +#endif // NEON + + for (; i < size; i++) { + auto x = input1_data[i] * input2_data[i]; + output_data[i] = ActivationFunction<Ac>(x); + } +} + +// TODO: We can implement BroadcastMul on buffers of arbitrary +// dimensionality if the runtime code does a single loop over one dimension +// that handles broadcasting as the base case. The code generator would then +// generate max(D1, D2) nested for loops. +// TODO: BroadcastMul is intentionally duplicated from +// reference_ops.h. Once an optimized version is implemented and NdArrayDesc<T> +// is no longer referenced in this file, move NdArrayDesc<T> from types.h to +// reference_ops.h. +template <FusedActivationFunctionType Ac> +void BroadcastMul(const float* input1_data, const Dims<4>& input1_dims, + const float* input2_data, const Dims<4>& input2_dims, + float* output_data, const Dims<4>& output_dims) { + gemmlowp::ScopedProfilingLabel label("BroadcastMul"); + + NdArrayDesc<4> desc1; + NdArrayDesc<4> desc2; + NdArrayDescsForElementwiseBroadcast(input1_dims, input2_dims, &desc1, &desc2); + + // In Tensorflow, the dimensions are canonically named (batch_number, row, + // col, channel), with extents (batches, height, width, depth), with the + // trailing dimension changing most rapidly (channels has the smallest stride, + // typically 1 element). + // + // In generated C code, we store arrays with the dimensions reversed. The + // first dimension has smallest stride. + // + // We name our variables by their Tensorflow convention, but generate C code + // nesting loops such that the innermost loop has the smallest stride for the + // best cache behavior. 
+ for (int b = 0; b < ArraySize(output_dims, 3); ++b) { + for (int y = 0; y < ArraySize(output_dims, 2); ++y) { + for (int x = 0; x < ArraySize(output_dims, 1); ++x) { + for (int c = 0; c < ArraySize(output_dims, 0); ++c) { + output_data[Offset(output_dims, c, x, y, b)] = ActivationFunction<Ac>( + input1_data[SubscriptToIndex(desc1, c, x, y, b)] * + input2_data[SubscriptToIndex(desc2, c, x, y, b)]); + } + } + } + } +} + +template <FusedActivationFunctionType Ac> +inline void BroadcastMul(const uint8* input1_data, const Dims<4>& input1_dims, + int32 input1_offset, const uint8* input2_data, + const Dims<4>& input2_dims, int32 input2_offset, + int32 output_offset, int32 output_multiplier, + int output_shift, int32 output_activation_min, + int32 output_activation_max, uint8* output_data, + const Dims<4>& output_dims) { + static_assert(Ac == FusedActivationFunctionType::kNone || + Ac == FusedActivationFunctionType::kRelu || + Ac == FusedActivationFunctionType::kRelu6 || + Ac == FusedActivationFunctionType::kRelu1, + ""); + DCHECK_LE(output_activation_min, output_activation_max); + if (Ac == FusedActivationFunctionType::kNone) { + DCHECK_EQ(output_activation_min, 0); + DCHECK_EQ(output_activation_max, 255); + } + gemmlowp::ScopedProfilingLabel label("BroadcastMul/8bit"); + + NdArrayDesc<4> desc1; + NdArrayDesc<4> desc2; + NdArrayDescsForElementwiseBroadcast(input1_dims, input2_dims, &desc1, &desc2); + + // In Tensorflow, the dimensions are canonically named (batch_number, row, + // col, channel), with extents (batches, height, width, depth), with the + // trailing dimension changing most rapidly (channels has the smallest stride, + // typically 1 element). + // + // In generated C code, we store arrays with the dimensions reversed. The + // first dimension has smallest stride. + // + // We name our variables by their Tensorflow convention, but generate C code + // nesting loops such that the innermost loop has the smallest stride for the + // best cache behavior. 
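In the 8-bit loop below, the fixed-point output_multiplier/output_shift pair is assumed to encode the real-valued rescaling factor input1_scale * input2_scale / output_scale. A float-arithmetic sketch of the same requantization, with hypothetical quantization parameters:

  #include <cmath>
  #include <cstdint>

  int main() {
    // Hypothetical (scale, zero point) for both inputs and the output.
    const float s1 = 0.02f, s2 = 0.05f, s_out = 0.1f;
    const int32_t z1 = 3, z2 = 7, z_out = 5;  // the offsets passed in are their negatives
    const int32_t q1 = 130, q2 = 99;
    // Real product of the two dequantized values, requantized to the output scale.
    const float real = (s1 * (q1 - z1)) * (s2 * (q2 - z2));
    const int32_t q_out = z_out + static_cast<int32_t>(std::lround(real / s_out));
    // The kernel reaches the same value as
    //   z_out + ((q1 - z1) * (q2 - z2)) * (s1 * s2 / s_out)
    // with the last factor expressed as output_multiplier * 2^-output_shift.
    return (q_out >= 0 && q_out <= 255) ? 0 : 1;
  }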
+ for (int b = 0; b < ArraySize(output_dims, 3); ++b) { + for (int y = 0; y < ArraySize(output_dims, 2); ++y) { + for (int x = 0; x < ArraySize(output_dims, 1); ++x) { + for (int c = 0; c < ArraySize(output_dims, 0); ++c) { + const int32 input1_val = + input1_offset + input1_data[SubscriptToIndex(desc1, c, x, y, b)]; + const int32 input2_val = + input2_offset + input2_data[SubscriptToIndex(desc2, c, x, y, b)]; + const int32 unclamped_result = + output_offset + + MultiplyByQuantizedMultiplierSmallerThanOne( + input1_val * input2_val, output_multiplier, output_shift); + const int32 clamped_output = + std::min(output_activation_max, + std::max(output_activation_min, unclamped_result)); + output_data[Offset(output_dims, c, x, y, b)] = + static_cast<uint8>(clamped_output); + } + } + } + } +} + +template <FusedActivationFunctionType Ac, typename Scalar> +void Concatenation(int concat_dim, const Scalar* const* input_data, + const Dims<4>* const* input_dims, int inputs_count, + Scalar* output_data, const Dims<4>& output_dims) { + gemmlowp::ScopedProfilingLabel label("Concatenation"); + DCHECK_GT(inputs_count, 1); + int concat_size = 0; + for (int i = 0; i < inputs_count; i++) { + for (int j = 0; j < 4; j++) { + if (j != concat_dim) { + MatchingArraySize(*input_dims[i], j, output_dims, j); + } + } + concat_size += ArraySize(*input_dims[i], concat_dim); + } + DCHECK_EQ(concat_size, ArraySize(output_dims, concat_dim)); + DCHECK(IsPackedWithoutStrides(output_dims)); + // for now we dont have a model with a Concatenation + // with fused activation function. + DCHECK(Ac == FusedActivationFunctionType::kNone); + int outer_size = 1; + for (int i = concat_dim + 1; i < 4; i++) { + outer_size *= output_dims.sizes[i]; + } + Scalar* output_ptr = output_data; + for (int k = 0; k < outer_size; k++) { + for (int i = 0; i < inputs_count; ++i) { + const int copy_size = + input_dims[i]->sizes[concat_dim] * input_dims[i]->strides[concat_dim]; + memcpy(output_ptr, input_data[i] + k * copy_size, + copy_size * sizeof(Scalar)); + output_ptr += copy_size; + } + } +} + +template <FusedActivationFunctionType Ac, typename Scalar> +void DepthConcatenation(const Scalar* const* input_data, + const Dims<4>* const* input_dims, int inputs_count, + Scalar* output_data, const Dims<4>& output_dims) { + Concatenation<Ac, Scalar>(0, input_data, input_dims, inputs_count, + output_data, output_dims); +} + +inline void LstmCell(const float* input_data, const Dims<4>& input_dims, + const float* prev_activ_data, + const Dims<4>& prev_activ_dims, const float* weights_data, + const Dims<4>& weights_dims, const float* bias_data, + const Dims<4>& bias_dims, const float* prev_state_data, + const Dims<4>& prev_state_dims, float* output_state_data, + const Dims<4>& output_state_dims, float* output_activ_data, + const Dims<4>& output_activ_dims, float* concat_temp_data, + const Dims<4>& concat_temp_dims, float* activ_temp_data, + const Dims<4>& activ_temp_dims) { + gemmlowp::ScopedProfilingLabel label("LstmCell"); + MatchingArraySize( // batches + input_dims, 3, prev_activ_dims, 3, prev_state_dims, 3, output_state_dims, + 3, output_activ_dims, 3); + MatchingArraySize( // height + input_dims, 2, prev_activ_dims, 2, prev_state_dims, 2, output_state_dims, + 2, output_activ_dims, 2); + MatchingArraySize( // width + input_dims, 1, prev_activ_dims, 1, prev_state_dims, 1, output_state_dims, + 1, output_activ_dims, 1); + CHECK_EQ(ArraySize(weights_dims, 2), 1); + CHECK_EQ(ArraySize(weights_dims, 3), 1); + const int input_depth = ArraySize(input_dims, 0); + 
const int prev_activ_depth = ArraySize(prev_activ_dims, 0); + const int total_input_depth = prev_activ_depth + input_depth; + CHECK_EQ(ArraySize(weights_dims, 0), total_input_depth); + CHECK_EQ(MatchingArraySize(bias_dims, 1, bias_dims, 2, bias_dims, 3), 1); + const int intern_activ_depth = MatchingArraySize( + weights_dims, 1, + bias_dims, 0); + CHECK_EQ(intern_activ_depth % 4, 0); + const int output_depth = MatchingArraySize( + prev_state_dims, 0, + prev_activ_dims, 0, + output_state_dims, 0, + output_activ_dims, 0); + CHECK_EQ(output_depth, intern_activ_depth / 4); + + // Concatenate prev_activ and input data together + std::vector<float const*> concat_input_arrays_data; + std::vector<Dims<4> const*> concat_input_arrays_dims; + concat_input_arrays_data.push_back(input_data); + concat_input_arrays_data.push_back(prev_activ_data); + concat_input_arrays_dims.push_back(&input_dims); + concat_input_arrays_dims.push_back(&prev_activ_dims); + Concatenation<FusedActivationFunctionType::kNone, float>( + 0, &(concat_input_arrays_data[0]), &(concat_input_arrays_dims[0]), + concat_input_arrays_data.size(), concat_temp_data, concat_temp_dims); + + // Fully connected + FullyConnected<FusedActivationFunctionType::kNone>( + concat_temp_data, concat_temp_dims, weights_data, weights_dims, bias_data, + bias_dims, activ_temp_data, activ_temp_dims); + + // Map raw arrays to Eigen arrays so we can use Eigen's optimized array + // operations. + ArrayMap<float> activ_temp_map = + MapAsArrayWithFirstDimAsRows(activ_temp_data, activ_temp_dims); + auto input_gate_sm = activ_temp_map.block(0 * output_depth, 0, output_depth, + activ_temp_map.cols()); + auto new_input_sm = activ_temp_map.block(1 * output_depth, 0, output_depth, + activ_temp_map.cols()); + auto forget_gate_sm = activ_temp_map.block(2 * output_depth, 0, output_depth, + activ_temp_map.cols()); + auto output_gate_sm = activ_temp_map.block(3 * output_depth, 0, output_depth, + activ_temp_map.cols()); + ArrayMap<const float> prev_state_map = + MapAsArrayWithFirstDimAsRows(prev_state_data, prev_state_dims); + ArrayMap<float> output_state_map = + MapAsArrayWithFirstDimAsRows(output_state_data, output_state_dims); + ArrayMap<float> output_activ_map = + MapAsArrayWithFirstDimAsRows(output_activ_data, output_activ_dims); + + // Combined memory state and final output calculation + gemmlowp::ScopedProfilingLabel label2("MemoryStateAndFinalOutput"); + output_state_map = + input_gate_sm.unaryExpr(Eigen::internal::scalar_sigmoid_op<float>()) * + new_input_sm.tanh() + + forget_gate_sm.unaryExpr(Eigen::internal::scalar_sigmoid_op<float>()) * + prev_state_map; + output_activ_map = + output_gate_sm.unaryExpr(Eigen::internal::scalar_sigmoid_op<float>()) * + output_state_map.tanh(); +} + +template <FusedActivationFunctionType Ac, typename Scalar> +void TensorFlowSplit(const Scalar* input_data, const Dims<4>& input_dims, + int outputs_count, Scalar* const* output_data, + const Dims<4>* const* output_dims) { + gemmlowp::ScopedProfilingLabel label("TensorFlowSplit"); + DCHECK_GE(outputs_count, 1); + for (int i = 0; i < outputs_count; i++) { + /* batches = */ MatchingArraySize(*output_dims[i], 3, input_dims, 3); + /* height = */ MatchingArraySize(*output_dims[i], 2, input_dims, 2); + /* width = */ MatchingArraySize(*output_dims[i], 1, input_dims, 1); + } + const int batches = MatchingArraySize(*output_dims[0], 3, input_dims, 3); + const int height = MatchingArraySize(*output_dims[0], 2, input_dims, 2); + const int width = MatchingArraySize(*output_dims[0], 1, input_dims, 1); 
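The combined memory-state and output step in LstmCell above is the standard LSTM cell update applied element-wise by the Eigen expressions. A scalar sketch for one element, with hypothetical gate pre-activation values:

  #include <cmath>

  static float sigmoid(float x) { return 1.0f / (1.0f + std::exp(-x)); }

  int main() {
    // Hypothetical pre-activations for one element, i.e. one row of activ_temp
    // split into its four gate blocks.
    const float input_gate = 0.5f, new_input = -0.2f, forget_gate = 1.0f, output_gate = 0.3f;
    const float prev_state = 0.8f;
    const float output_state =
        sigmoid(input_gate) * std::tanh(new_input) + sigmoid(forget_gate) * prev_state;
    const float output_activ = sigmoid(output_gate) * std::tanh(output_state);
    return (output_activ > -1.0f && output_activ < 1.0f) ? 0 : 1;
  }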
+ DCHECK(IsPackedWithoutStrides(input_dims)); + // for now we dont have a model with a TensorFlowSplit + // with fused activation function. + DCHECK(Ac == FusedActivationFunctionType::kNone); + const int whb = width * height * batches; + const Scalar* input_ptr = input_data; + for (int k = 0; k < whb; k++) { + for (int i = 0; i < outputs_count; ++i) { + memcpy(output_data[i] + k * output_dims[i]->sizes[0], input_ptr, + output_dims[i]->sizes[0] * sizeof(Scalar)); + input_ptr += output_dims[i]->sizes[0]; + } + } +} + +inline int NodeOffset(int b, int h, int w, int height, int width) { + return (b * height + h) * width + w; +} + +template <FusedActivationFunctionType Ac> +void AveragePool(const float* input_data, const Dims<4>& input_dims, + int stride_width, int stride_height, + int pad_width, int pad_height, int kwidth, int kheight, + float* output_data, const Dims<4>& output_dims) { + gemmlowp::ScopedProfilingLabel label("AveragePool"); + const int batches = MatchingArraySize(input_dims, 3, output_dims, 3); + const int input_height = ArraySize(input_dims, 2); + const int input_width = ArraySize(input_dims, 1); + const int output_height = ArraySize(output_dims, 2); + const int output_width = ArraySize(output_dims, 1); + const int depth = MatchingArraySize(input_dims, 0, output_dims, 0); + + const auto in_mat = MapAsMatrixWithFirstDimAsRows(input_data, input_dims); + auto out_mat = MapAsMatrixWithFirstDimAsRows(output_data, output_dims); + // TODO: get rid of the dynamic memory allocation here! + Eigen::VectorXf out_count(out_mat.cols()); + out_count.setZero(); + // Prefill the output to 0. + out_mat.setZero(); + for (int b = 0; b < batches; ++b) { + for (int h = 0; h < input_height; ++h) { + for (int w = 0; w < input_width; ++w) { + // (h_start, h_end) * (w_start, w_end) is the range that the input + // vector projects to. + int hpad = h + pad_height; + int wpad = w + pad_width; + int h_start = (hpad < kheight) ? 0 : (hpad - kheight) / stride_height + 1; + int h_end = std::min(hpad / stride_height + 1, output_height); + int w_start = (wpad < kwidth) ? 
0 : (wpad - kwidth) / stride_width + 1; + int w_end = std::min(wpad / stride_width + 1, output_width); + // compute elementwise sum + for (int ph = h_start; ph < h_end; ++ph) { + for (int pw = w_start; pw < w_end; ++pw) { + int out_offset = NodeOffset(b, ph, pw, output_height, output_width); + out_mat.col(out_offset) += + in_mat.col(NodeOffset(b, h, w, input_height, input_width)); + out_count(out_offset)++; + } + } + } + } + } + // Divide the output by the actual number of elements being averaged over + DCHECK_GT(out_count.minCoeff(), 0); + out_mat.array().rowwise() /= out_count.transpose().array(); + + for (int b = 0; b < batches; ++b) { + for (int y = 0; y < output_height; ++y) { + for (int x = 0; x < output_width; ++x) { + for (int c = 0; c < depth; ++c) { + output_data[Offset(output_dims, c, x, y, b)] = ActivationFunction<Ac>( + output_data[Offset(output_dims, c, x, y, b)]); + } + } + } + } +} + +template <FusedActivationFunctionType Ac> +void AveragePool(const uint8* input_data, const Dims<4>& input_dims, + int stride_width, int stride_height, + int pad_width, int pad_height, int filter_width, + int filter_height, int32 output_activation_min, + int32 output_activation_max, uint8* output_data, + const Dims<4>& output_dims) { + gemmlowp::ScopedProfilingLabel label("AveragePool/8bit"); + static_assert(Ac == FusedActivationFunctionType::kNone || + Ac == FusedActivationFunctionType::kRelu || + Ac == FusedActivationFunctionType::kRelu6 || + Ac == FusedActivationFunctionType::kRelu1, + ""); + DCHECK_LE(output_activation_min, output_activation_max); + if (Ac == FusedActivationFunctionType::kNone) { + DCHECK_EQ(output_activation_min, 0); + DCHECK_EQ(output_activation_max, 255); + } + const int batches = MatchingArraySize(input_dims, 3, output_dims, 3); + const int depth = MatchingArraySize(input_dims, 0, output_dims, 0); + const int input_height = ArraySize(input_dims, 2); + const int input_width = ArraySize(input_dims, 1); + const int output_height = ArraySize(output_dims, 2); + const int output_width = ArraySize(output_dims, 1); + for (int batch = 0; batch < batches; ++batch) { + for (int out_y = 0; out_y < output_height; ++out_y) { + for (int out_x = 0; out_x < output_width; ++out_x) { + const int in_x_origin = (out_x * stride_width) - pad_width; + const int in_y_origin = (out_y * stride_height) - pad_height; + const int filter_x_start = std::max(0, -in_x_origin); + const int filter_x_end = + std::min(filter_width, input_width - in_x_origin); + const int filter_y_start = std::max(0, -in_y_origin); + const int filter_y_end = + std::min(filter_height, input_height - in_y_origin); + const int filter_count = + (filter_x_end - filter_x_start) * (filter_y_end - filter_y_start); + // TODO: Add a dynamic buffer allocation path instead of hardcoded size. 
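The 8-bit path below accumulates each channel into a uint16 buffer and then divides by the number of contributing inputs with rounding rather than truncation. A small sketch of that rounding division, with hypothetical values:

  #include <cstdint>

  int main() {
    const int filter_count = 9;   // e.g. a fully interior 3x3 window
    const uint16_t acc = 23;      // hypothetical sum of 9 uint8 inputs
    // (acc + filter_count / 2) / filter_count rounds to nearest instead of
    // truncating: 23 / 9 = 2.55..., so the result is 3 rather than 2.
    const uint16_t avg = (acc + filter_count / 2) / filter_count;
    return avg == 3 ? 0 : 1;
  }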
+ static constexpr int kAccBufferMaxSize = 2048; + DCHECK_LE(depth, kAccBufferMaxSize); + uint16 acc[kAccBufferMaxSize]; + memset(acc, 0, depth * sizeof(acc[0])); + const uint8* input_ptr = + input_data + input_dims.strides[1] * in_x_origin + + input_dims.strides[2] * in_y_origin + input_dims.strides[3] * batch; + for (int fy = filter_y_start; fy < filter_y_end; fy++) { + const uint8* input_row_ptr = input_ptr + fy * input_dims.strides[2] + + filter_x_start * input_dims.strides[1]; + for (int fx = filter_x_start; fx < filter_x_end; fx++) { + int channel = 0; +#ifdef USE_NEON + for (; channel <= depth - 16; channel += 16) { + uint16x8_t acc_reg[2]; + for (int i = 0; i < 2; i++) { + acc_reg[i] = vld1q_u16(acc + channel + 8 * i); + } + uint8x16_t input_reg = vld1q_u8(input_row_ptr); + input_row_ptr += 16; + acc_reg[0] = vaddw_u8(acc_reg[0], vget_low_u8(input_reg)); + acc_reg[1] = vaddw_u8(acc_reg[1], vget_high_u8(input_reg)); + for (int i = 0; i < 2; i++) { + vst1q_u16(acc + channel + 8 * i, acc_reg[i]); + } + } + for (; channel <= depth - 8; channel += 8) { + uint16x8_t acc_reg = vld1q_u16(acc + channel); + uint8x8_t input_reg = vld1_u8(input_row_ptr); + input_row_ptr += 8; + acc_reg = vaddw_u8(acc_reg, input_reg); + vst1q_u16(acc + channel, acc_reg); + } +#endif + for (; channel < depth; ++channel) { + acc[channel] += *input_row_ptr++; + } + } + } + uint8* output_ptr = + output_data + Offset(output_dims, 0, out_x, out_y, batch); + int channel = 0; +#ifdef USE_NEON +#define AVGPOOL_DIVIDING_BY(FILTER_COUNT) \ + if (filter_count == FILTER_COUNT) { \ + for (; channel <= depth - 8; channel += 8) { \ + uint16 buf[8]; \ + for (int i = 0; i < 8; i++) { \ + buf[i] = (acc[channel + i] + FILTER_COUNT / 2) / FILTER_COUNT; \ + } \ + uint8x8_t buf8 = vqmovn_u16(vld1q_u16(buf)); \ + buf8 = vmin_u8(buf8, vdup_n_u8(output_activation_max)); \ + buf8 = vmax_u8(buf8, vdup_n_u8(output_activation_min)); \ + vst1_u8(output_ptr + channel, buf8); \ + } \ + } + AVGPOOL_DIVIDING_BY(9) + AVGPOOL_DIVIDING_BY(15) +#undef AVGPOOL_DIVIDING_BY + for (; channel <= depth - 8; channel += 8) { + uint16 buf[8]; + for (int i = 0; i < 8; i++) { + buf[i] = (acc[channel + i] + filter_count / 2) / filter_count; + } + uint8x8_t buf8 = vqmovn_u16(vld1q_u16(buf)); + buf8 = vmin_u8(buf8, vdup_n_u8(output_activation_max)); + buf8 = vmax_u8(buf8, vdup_n_u8(output_activation_min)); + vst1_u8(output_ptr + channel, buf8); + } +#endif + for (; channel < depth; ++channel) { + uint16 a = (acc[channel] + filter_count / 2) / filter_count; + a = std::max<uint16>(a, output_activation_min); + a = std::min<uint16>(a, output_activation_max); + output_ptr[channel] = static_cast<uint8>(a); + } + } + } + } +} + +template <FusedActivationFunctionType Ac> +void MaxPool(const float* input_data, const Dims<4>& input_dims, + int stride_width, int stride_height, + int pad_width, int pad_height, int kwidth, int kheight, + float* output_data, const Dims<4>& output_dims) { + gemmlowp::ScopedProfilingLabel label("MaxPool"); + const int batches = MatchingArraySize(input_dims, 3, output_dims, 3); + const int input_height = ArraySize(input_dims, 2); + const int input_width = ArraySize(input_dims, 1); + const int output_height = ArraySize(output_dims, 2); + const int output_width = ArraySize(output_dims, 1); + const int depth = MatchingArraySize(input_dims, 0, output_dims, 0); + + const auto in_mat = MapAsMatrixWithFirstDimAsRows(input_data, input_dims); + auto out_mat = MapAsMatrixWithFirstDimAsRows(output_data, output_dims); + // Prefill the output to minimum 
representable float value
+  out_mat.setConstant(std::numeric_limits<float>::lowest());
+  for (int b = 0; b < batches; ++b) {
+    for (int h = 0; h < input_height; ++h) {
+      for (int w = 0; w < input_width; ++w) {
+        // (h_start, h_end) * (w_start, w_end) is the range that the input
+        // vector projects to.
+        int hpad = h + pad_height;
+        int wpad = w + pad_width;
+        int h_start = (hpad < kheight) ? 0 : (hpad - kheight) / stride_height + 1;
+        int h_end = std::min(hpad / stride_height + 1, output_height);
+        int w_start = (wpad < kwidth) ? 0 : (wpad - kwidth) / stride_width + 1;
+        int w_end = std::min(wpad / stride_width + 1, output_width);
+        // compute elementwise max
+        for (int ph = h_start; ph < h_end; ++ph) {
+          for (int pw = w_start; pw < w_end; ++pw) {
+            int out_offset = NodeOffset(b, ph, pw, output_height, output_width);
+            out_mat.col(out_offset) =
+                out_mat.col(out_offset)
+                    .cwiseMax(in_mat.col(
+                        NodeOffset(b, h, w, input_height, input_width)));
+          }
+        }
+      }
+    }
+  }
+
+  for (int b = 0; b < batches; ++b) {
+    for (int y = 0; y < output_height; ++y) {
+      for (int x = 0; x < output_width; ++x) {
+        for (int c = 0; c < depth; ++c) {
+          output_data[Offset(output_dims, c, x, y, b)] = ActivationFunction<Ac>(
+              output_data[Offset(output_dims, c, x, y, b)]);
+        }
+      }
+    }
+  }
+}
+
+template <FusedActivationFunctionType Ac>
+void MaxPool(const uint8* input_data, const Dims<4>& input_dims,
+             int stride_width, int stride_height,
+             int pad_width, int pad_height, int filter_width, int filter_height,
+             int32 output_activation_min, int32 output_activation_max,
+             uint8* output_data, const Dims<4>& output_dims) {
+  gemmlowp::ScopedProfilingLabel label("MaxPool/8bit");
+  static_assert(Ac == FusedActivationFunctionType::kNone ||
+                    Ac == FusedActivationFunctionType::kRelu ||
+                    Ac == FusedActivationFunctionType::kRelu6 ||
+                    Ac == FusedActivationFunctionType::kRelu1,
+                "");
+  DCHECK_LE(output_activation_min, output_activation_max);
+  if (Ac == FusedActivationFunctionType::kNone) {
+    DCHECK_EQ(output_activation_min, 0);
+    DCHECK_EQ(output_activation_max, 255);
+  }
+  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
+  const int depth = MatchingArraySize(input_dims, 0, output_dims, 0);
+  const int input_height = ArraySize(input_dims, 2);
+  const int input_width = ArraySize(input_dims, 1);
+  const int output_height = ArraySize(output_dims, 2);
+  const int output_width = ArraySize(output_dims, 1);
+  for (int batch = 0; batch < batches; ++batch) {
+    for (int out_y = 0; out_y < output_height; ++out_y) {
+      for (int out_x = 0; out_x < output_width; ++out_x) {
+        const int in_x_origin = (out_x * stride_width) - pad_width;
+        const int in_y_origin = (out_y * stride_height) - pad_height;
+        const int filter_x_start = std::max(0, -in_x_origin);
+        const int filter_x_end =
+            std::min(filter_width, input_width - in_x_origin);
+        const int filter_y_start = std::max(0, -in_y_origin);
+        const int filter_y_end =
+            std::min(filter_height, input_height - in_y_origin);
+        // TODO: Add a dynamic buffer allocation path instead of hardcoded size.
+ static constexpr int kAccBufferMaxSize = 2048; + DCHECK_LE(depth, kAccBufferMaxSize); + uint8 acc[kAccBufferMaxSize]; + memset(acc, 0, depth * sizeof(acc[0])); + const uint8* input_ptr = + input_data + input_dims.strides[1] * in_x_origin + + input_dims.strides[2] * in_y_origin + input_dims.strides[3] * batch; + for (int fy = filter_y_start; fy < filter_y_end; fy++) { + const uint8* input_row_ptr = input_ptr + fy * input_dims.strides[2] + + filter_x_start * input_dims.strides[1]; + for (int fx = filter_x_start; fx < filter_x_end; fx++) { + int channel = 0; +#ifdef USE_NEON + for (; channel <= depth - 16; channel += 16) { + uint8x16_t acc_reg = vld1q_u8(acc + channel); + uint8x16_t input_reg = vld1q_u8(input_row_ptr); + input_row_ptr += 16; + acc_reg = vmaxq_u8(acc_reg, input_reg); + vst1q_u8(acc + channel, acc_reg); + } + + for (; channel <= depth - 8; channel += 8) { + uint8x8_t acc_reg = vld1_u8(acc + channel); + uint8x8_t input_reg = vld1_u8(input_row_ptr); + input_row_ptr += 8; + acc_reg = vmax_u8(acc_reg, input_reg); + vst1_u8(acc + channel, acc_reg); + } +#endif + for (; channel < depth; ++channel) { + acc[channel] = std::max(acc[channel], *input_row_ptr++); + } + } + } + uint8* output_ptr = + output_data + Offset(output_dims, 0, out_x, out_y, batch); + int channel = 0; +#ifdef USE_NEON + for (; channel <= depth - 16; channel += 16) { + uint8x16_t a = vld1q_u8(acc + channel); + a = vminq_u8(a, vdupq_n_u8(output_activation_max)); + a = vmaxq_u8(a, vdupq_n_u8(output_activation_min)); + vst1q_u8(output_ptr + channel, a); + } + for (; channel <= depth - 8; channel += 8) { + uint8x8_t a = vld1_u8(acc + channel); + a = vmin_u8(a, vdup_n_u8(output_activation_max)); + a = vmax_u8(a, vdup_n_u8(output_activation_min)); + vst1_u8(output_ptr + channel, a); + } +#endif + for (; channel < depth; ++channel) { + uint8 a = acc[channel]; + a = std::max<uint8>(a, output_activation_min); + a = std::min<uint8>(a, output_activation_max); + output_ptr[channel] = static_cast<uint8>(a); + } + } + } + } +} + +template <FusedActivationFunctionType Ac> +void L2Pool(const float* input_data, const Dims<4>& input_dims, + int stride_width, int stride_height, + int pad_width, int pad_height, int filter_width, int filter_height, + float* output_data, const Dims<4>& output_dims) { + gemmlowp::ScopedProfilingLabel label("L2Pool"); + const int batches = MatchingArraySize(input_dims, 3, output_dims, 3); + const int input_height = ArraySize(input_dims, 2); + const int input_width = ArraySize(input_dims, 1); + const int output_height = ArraySize(output_dims, 2); + const int output_width = ArraySize(output_dims, 1); + // Actually carry out L2 Pool. Code is written in forward mode: we go through + // the input values once, and write to all the pooled regions that it maps to. + const auto in_mat = MapAsMatrixWithFirstDimAsRows(input_data, input_dims); + auto out_mat = MapAsMatrixWithFirstDimAsRows(output_data, output_dims); + Eigen::VectorXf in_square(in_mat.rows()); + Eigen::VectorXf out_count(out_mat.cols()); + out_count.setZero(); + // Prefill the output to 0. + out_mat.setZero(); + for (int b = 0; b < batches; ++b) { + for (int h = 0; h < input_height; ++h) { + for (int w = 0; w < input_width; ++w) { + // (h_start, h_end) * (w_start, w_end) is the range that the input + // vector projects to. + const int hpad = h + pad_height; + const int wpad = w + pad_width; + const int h_start = + (hpad < filter_height) ? 
0 : (hpad - filter_height) / stride_height + 1; + const int h_end = std::min(hpad / stride_height + 1, output_height); + const int w_start = + (wpad < filter_width) ? 0 : (wpad - filter_width) / stride_width + 1; + const int w_end = std::min(wpad / stride_width + 1, output_width); + // pre-compute square + const int in_offset = w + input_width * (h + input_height * b); + in_square = + in_mat.col(in_offset).array() * in_mat.col(in_offset).array(); + // compute elementwise sum of squares + for (int ph = h_start; ph < h_end; ++ph) { + for (int pw = w_start; pw < w_end; ++pw) { + const int out_offset = pw + output_width * (ph + output_height * b); + out_mat.col(out_offset) += in_square; + out_count(out_offset)++; + } + } + } + } + } + + out_count = out_count.array().inverse(); + out_mat = + (out_mat.array().rowwise() * out_count.transpose().array()).cwiseSqrt(); +} + +inline void LocalResponseNormalization(const float* input_data, + const Dims<4>& input_dims, int range, + float bias, float alpha, float beta, + float* output_data, + const Dims<4>& output_dims) { + /* const int batches = */ MatchingArraySize(input_dims, 3, output_dims, 3); + /* const int height = */ MatchingArraySize(input_dims, 2, output_dims, 2); + /* const int width = */ MatchingArraySize(input_dims, 1, output_dims, 1); + /* const int depth = */ MatchingArraySize(input_dims, 0, output_dims, 0); + + const auto data_in = MapAsMatrixWithFirstDimAsRows(input_data, input_dims); + auto data_out = MapAsMatrixWithFirstDimAsRows(output_data, output_dims); + + // Carry out local response normalization, vector by vector. + // Since the data are stored column major, making row-wise operation + // probably not memory efficient anyway, we do an explicit for loop over + // the columns. + const int double_range = range * 2; + Eigen::VectorXf padded_square(data_in.rows() + double_range); + padded_square.setZero(); + for (int r = 0; r < data_in.cols(); ++r) { + // Do local response normalization for data_in(:, r) + // first, compute the square and store them in buffer for repeated use + padded_square.block(range, 0, data_in.rows(), 1) = + data_in.col(r).cwiseProduct(data_in.col(r)) * alpha; + // Then, compute the scale and writes them to data_out + float accumulated_scale = 0; + for (int i = 0; i < double_range; ++i) { + accumulated_scale += padded_square(i); + } + for (int i = 0; i < data_in.rows(); ++i) { + accumulated_scale += padded_square(i + double_range); + data_out(i, r) = bias + accumulated_scale; + accumulated_scale -= padded_square(i); + } + } + + // In a few cases, the pow computation could benefit from speedups. 
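// Note: at this point data_out holds the raw scale, bias + alpha * (sum of
// squares over the window). The branches below compute data_in * scale^(-beta),
// special-casing beta == 1 (plain reciprocal) and beta == 0.5 (reciprocal
// square root) to avoid the generic pow().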
+ if (beta == 1) { + data_out.array() = data_in.array() * data_out.array().inverse(); + } else if (beta == 0.5) { + data_out.array() = data_in.array() * data_out.array().sqrt().inverse(); + } else { + data_out.array() = data_in.array() * data_out.array().pow(-beta); + } +} + +inline void Softmax(const float* input_data, const Dims<4>& input_dims, + float beta, float* output_data, + const Dims<4>& output_dims) { + gemmlowp::ScopedProfilingLabel label("Softmax"); + /* const int batches = */ MatchingArraySize(input_dims, 3, output_dims, 3); + /* const int height = */ MatchingArraySize(input_dims, 2, output_dims, 2); + /* const int width = */ MatchingArraySize(input_dims, 1, output_dims, 1); + /* const int depth = */ MatchingArraySize(input_dims, 0, output_dims, 0); + + const auto in_mat = MapAsMatrixWithFirstDimAsRows(input_data, input_dims); + auto out_mat = MapAsMatrixWithFirstDimAsRows(output_data, output_dims); + // Compute the exponential first, removing the max coefficient for numerical + // stability. + out_mat = (in_mat.rowwise() - in_mat.colwise().maxCoeff()).array() * beta; + // We are separating out the exp function so that exp can be vectorized. + out_mat = out_mat.array().exp(); + // Normalize to get the activations. + Eigen::Array<float, 1, Eigen::Dynamic> scale = + out_mat.array().colwise().sum().inverse(); + out_mat.array().rowwise() *= scale; +} + +inline void Softmax(const uint8* input_data, const Dims<4>& input_dims, + int32 input_beta_multiplier, int32 input_beta_left_shift, + int diff_min, uint8* output_data, + const Dims<4>& output_dims) { + // The representation chosen for the input to the exp() function is Q5.26. + // We need to leave extra space since values that we skip might be as large as + // -32 before multiplying by input_beta_multiplier, and therefore as large as + // -16 afterwards. Note that exp(-8) is definitely not insignificant to + // accumulation, but exp(-16) definitely is. 
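// Note: kScaledDiffIntegerBits = 5 makes the rescaled (input - max) differences
// Q5.26 fixed-point values (magnitudes up to about 32), and
// kAccumulationIntegerBits = 12 gives the Q12.19 accumulator headroom to sum
// several thousand exp() terms, each of which lies in (0, 1].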
+ static const int kScaledDiffIntegerBits = 5; + static const int kAccumulationIntegerBits = 12; + using FixedPointScaledDiff = + gemmlowp::FixedPoint<int32, kScaledDiffIntegerBits>; + using FixedPointAccum = gemmlowp::FixedPoint<int32, kAccumulationIntegerBits>; + using FixedPoint0 = gemmlowp::FixedPoint<int32, 0>; + + gemmlowp::ScopedProfilingLabel label("Softmax"); + const int batches = MatchingArraySize(input_dims, 3, output_dims, 3); + const int height = MatchingArraySize(input_dims, 2, output_dims, 2); + const int width = MatchingArraySize(input_dims, 1, output_dims, 1); + const int depth = MatchingArraySize(input_dims, 0, output_dims, 0); + + for (int b = 0; b < batches; ++b) { + for (int x = 0; x < width; ++x) { + for (int y = 0; y < height; ++y) { + uint8 max_in_row = 0; + for (int c = 0; c < depth; ++c) { + max_in_row = + std::max(max_in_row, input_data[Offset(input_dims, c, x, y, b)]); + } + + FixedPointAccum sum_of_exps = FixedPointAccum::Zero(); + for (int c = 0; c < depth; ++c) { + int32 input_diff = + static_cast<int32>(input_data[Offset(input_dims, c, x, y, b)]) - + max_in_row; + if (input_diff >= diff_min) { + const int32 input_diff_rescaled = + MultiplyByQuantizedMultiplierGreaterThanOne( + input_diff, input_beta_multiplier, input_beta_left_shift); + const FixedPointScaledDiff scaled_diff_f8 = + FixedPointScaledDiff::FromRaw(input_diff_rescaled); + sum_of_exps = + sum_of_exps + gemmlowp::Rescale<kAccumulationIntegerBits>( + exp_on_negative_values(scaled_diff_f8)); + } + } + + int32 fixed_sum_of_exps = sum_of_exps.raw(); + // TODO: Use a NEON intrinsic like vclzq_u32 instead. + int headroom_plus_one = + __builtin_clz(static_cast<uint32>(fixed_sum_of_exps)); + // This is the number of bits to the left of the binary point above 1.0. + // Consider fixed_sum_of_exps=1.25. In that case shifted_scale=0.8 and + // no later adjustment will be needed. 
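// Note: headroom_plus_one is the number of leading zero bits in the Q12.19 sum;
// shifting left by it normalizes the sum into [1, 2), so subtracting 1 << 31
// leaves (normalized_sum - 1) in [0, 1) as a Q0.31 value, the domain expected
// by one_over_one_plus_x_for_x_in_0_1 below.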
+ int num_bits_over_unit = kAccumulationIntegerBits - headroom_plus_one; + int32 shifted_sum_minus_one = static_cast<int32>( + (static_cast<uint32>(fixed_sum_of_exps) << headroom_plus_one) - + (static_cast<uint32>(1) << 31)); + + FixedPoint0 shifted_scale = gemmlowp::one_over_one_plus_x_for_x_in_0_1( + FixedPoint0::FromRaw(shifted_sum_minus_one)); + + for (int c = 0; c < depth; ++c) { + int32 input_diff = + static_cast<int32>(input_data[Offset(input_dims, c, x, y, b)]) - + max_in_row; + if (input_diff >= diff_min) { + const int32 input_diff_rescaled = + MultiplyByQuantizedMultiplierGreaterThanOne( + input_diff, input_beta_multiplier, input_beta_left_shift); + const FixedPointScaledDiff scaled_diff_f8 = + FixedPointScaledDiff::FromRaw(input_diff_rescaled); + + FixedPoint0 exp_in_0 = exp_on_negative_values(scaled_diff_f8); + int32 unsat_output = gemmlowp::RoundingDivideByPOT( + (shifted_scale * exp_in_0).raw(), num_bits_over_unit + 31 - 8); + + output_data[Offset(output_dims, c, x, y, b)] = + std::max(std::min(unsat_output, 255), 0); + + } else { + output_data[Offset(output_dims, c, x, y, b)] = 0; + } + } + } + } + } +} + +inline void Logistic(const float* input_data, const Dims<4>& input_dims, + float* output_data, const Dims<4>& output_dims) { + gemmlowp::ScopedProfilingLabel label("Logistic"); + auto input_map = MapAsVector(input_data, input_dims); + auto output_map = MapAsVector(output_data, output_dims); + output_map.array() = + input_map.array().unaryExpr(Eigen::internal::scalar_sigmoid_op<float>()); +} + +inline void Logistic(const uint8* input_data, const Dims<4>& input_dims, + int32 input_zero_point, int32 input_range_radius, + int32 input_multiplier, int input_left_shift, + uint8* output_data, const Dims<4>& output_dims) { + gemmlowp::ScopedProfilingLabel label("Logistic"); + const int batches = MatchingArraySize(input_dims, 3, output_dims, 3); + const int height = MatchingArraySize(input_dims, 2, output_dims, 2); + const int width = MatchingArraySize(input_dims, 1, output_dims, 1); + const int depth = MatchingArraySize(input_dims, 0, output_dims, 0); + for (int b = 0; b < batches; ++b) { + for (int y = 0; y < height; ++y) { + for (int x = 0; x < width; ++x) { + for (int c = 0; c < depth; ++c) { + const uint8 input_val_u8 = input_data[Offset(input_dims, c, x, y, b)]; + const int32 input_val_centered = + static_cast<int32>(input_val_u8) - input_zero_point; + uint8 output_val; + if (input_val_centered < -input_range_radius) { + output_val = 0; + } else if (input_val_centered > input_range_radius) { + output_val = 255; + } else { + const int32 input_val_rescaled = + MultiplyByQuantizedMultiplierGreaterThanOne( + input_val_centered, input_multiplier, input_left_shift); + using FixedPoint4 = gemmlowp::FixedPoint<int32, 4>; + using FixedPoint0 = gemmlowp::FixedPoint<int32, 0>; + const FixedPoint4 input_val_f4 = + FixedPoint4::FromRaw(input_val_rescaled); + const FixedPoint0 output_val_f0 = gemmlowp::logistic(input_val_f4); + using gemmlowp::RoundingDivideByPOT; + int32 output_val_s32 = RoundingDivideByPOT(output_val_f0.raw(), 23); + if (output_val_s32 == 256) { + output_val_s32 = 255; + } + DCHECK_GE(output_val_s32, 0); + DCHECK_LE(output_val_s32, 255); + output_val = static_cast<uint8>(output_val_s32); + } + output_data[Offset(output_dims, c, x, y, b)] = output_val; + } + } + } + } +} + +inline void Tanh(const float* input_data, const Dims<4>& input_dims, + float* output_data, const Dims<4>& output_dims) { + gemmlowp::ScopedProfilingLabel label("Tanh"); + auto input_map = 
MapAsVector(input_data, input_dims); + auto output_map = MapAsVector(output_data, output_dims); + output_map.array() = input_map.array().tanh(); +} + +inline void Dequantize(const uint8* input_data, const Dims<4>& input_dims, + int32 zero_point, double scale, float* output_data, + const Dims<4>& output_dims) { + gemmlowp::ScopedProfilingLabel label("Dequantize"); + const int batches = MatchingArraySize(input_dims, 3, output_dims, 3); + const int height = MatchingArraySize(input_dims, 2, output_dims, 2); + const int width = MatchingArraySize(input_dims, 1, output_dims, 1); + const int depth = MatchingArraySize(input_dims, 0, output_dims, 0); + for (int b = 0; b < batches; ++b) { + for (int y = 0; y < height; ++y) { + for (int x = 0; x < width; ++x) { + for (int c = 0; c < depth; ++c) { + int32 val = input_data[Offset(input_dims, c, x, y, b)]; + float result = static_cast<float>(scale * (val - zero_point)); + output_data[Offset(output_dims, c, x, y, b)] = result; + } + } + } + } +} + +inline void FakeQuant(const float* input_data, const Dims<4>& input_dims, + float rmin, float rmax, float* output_data, + const Dims<4>& output_dims) { + gemmlowp::ScopedProfilingLabel label("FakeQuant"); + + // 0 should always be a representable value. Let's assume that the initial + // min,max range contains 0. + DCHECK_LE(rmin, 0.); + DCHECK_GE(rmax, 0.); + + // Determine quantization parameters: zero_point, scale. + using Integer = uint8; + const Integer qmin = std::numeric_limits<Integer>::min(); + const Integer qmax = std::numeric_limits<Integer>::max(); + const float qmin_float = qmin; + const float qmax_float = qmax; + int32 zero_point = 0; + float scale = 0.f; + // If rmin==rmax, both must be zero per the above assertion, + // so we are done. + if (rmin != rmax) { + // First determine the scale. + scale = (rmax - rmin) / (qmax_float - qmin_float); + + // Zero-point computation. + // First the initial floating-point computation. The zero-point can be + // determined from solving an affine equation for any known pair + // (real value, corresponding quantized value). + // We know two such pairs: (rmin, qmin) and (rmax, qmax). + // The arithmetic error on the zero point computed from either pair + // will be roughly machine_epsilon * (sum of absolute values of terms) + // so we want to use the variant that adds the smaller terms. + const float zero_point_from_min = qmin_float - rmin / scale; + const float zero_point_from_max = qmax_float - rmax / scale; + const float zero_point_from_min_error = + std::abs(qmin_float) + std::abs(rmin / scale); + const float zero_point_from_max_error = + std::abs(qmax_float) + std::abs(rmax / scale); + + const float zero_point_float = + zero_point_from_min_error < zero_point_from_max_error + ? zero_point_from_min + : zero_point_from_max; + + // Now we need to nudge the zero point to be an integer + // (our zero points are integer, and this is motivated by the requirement + // to be able to represent the real value "0" exactly as a quantized value, + // which is required in multiple places, for example in Im2col with SAME + // padding). + if (zero_point_float < qmin_float) { + zero_point = qmin; + } else if (zero_point_float > qmax_float) { + zero_point = qmax; + } else { + zero_point = static_cast<int32>(std::round(zero_point_float)); + } + // The zero point should always be in the range of quantized value, + // [qmin, qmax]. 
+ DCHECK_GE(zero_point, qmin); + DCHECK_LE(zero_point, qmax); + } + + const int batches = MatchingArraySize(input_dims, 3, output_dims, 3); + const int height = MatchingArraySize(input_dims, 2, output_dims, 2); + const int width = MatchingArraySize(input_dims, 1, output_dims, 1); + const int depth = MatchingArraySize(input_dims, 0, output_dims, 0); + for (int b = 0; b < batches; ++b) { + for (int y = 0; y < height; ++y) { + for (int x = 0; x < width; ++x) { + for (int c = 0; c < depth; ++c) { + const float src_val = input_data[Offset(input_dims, c, x, y, b)]; + const float unclamped_quantized_val = + std::round(zero_point + src_val / scale); + const float quantized_val = std::min( + qmax_float, std::max(qmin_float, unclamped_quantized_val)); + const float dst_val = scale * (quantized_val - zero_point); + output_data[Offset(output_dims, c, x, y, b)] = dst_val; + } + } + } + } +} + +template <typename SrcT, typename DstT> +inline void Cast(const SrcT* input_data, const Dims<4>& input_dims, + DstT* output_data, const Dims<4>& output_dims) { + gemmlowp::ScopedProfilingLabel label("Cast"); + auto input_map = MapAsVector(input_data, input_dims); + auto output_map = MapAsVector(output_data, output_dims); + output_map.array() = input_map.array().template cast<DstT>(); +} + +inline void Floor(const float* input_data, const Dims<4>& input_dims, + float* output_data, const Dims<4>& output_dims) { + gemmlowp::ScopedProfilingLabel label("Floor"); + auto input_map = MapAsVector(input_data, input_dims); + auto output_map = MapAsVector(output_data, output_dims); + output_map.array() = Eigen::floor(input_map.array()); +} + +template <typename T> +inline void Gather(const T* input_data, const Dims<4>& input_dims, + const int32* coords_data, const Dims<4>& coords_dims, + T* output_data, const Dims<4>& output_dims) { + gemmlowp::ScopedProfilingLabel label("Gather"); + DCHECK_EQ(RequiredBufferSizeForDims(output_dims), + RequiredBufferSizeForDims(coords_dims)); + for (int i = 0; i < RequiredBufferSizeForDims(coords_dims); i++) { + DCHECK_GE(coords_data[i], 0); + DCHECK_LT(coords_data[i], RequiredBufferSizeForDims(input_dims)); + output_data[i] = input_data[coords_data[i]]; + } +} + +inline void ResizeBilinear(const float* input_data, const Dims<4>& input_dims, + const int32* output_size_data, + const Dims<4>& output_size_dims, float* output_data, + const Dims<4>& output_dims) { + gemmlowp::ScopedProfilingLabel label("ResizeBilinear"); + int32 batches = MatchingArraySize(input_dims, 3, output_dims, 3); + int32 input_height = ArraySize(input_dims, 2); + int32 input_width = ArraySize(input_dims, 1); + int32 depth = MatchingArraySize(input_dims, 0, output_dims, 0); + + DCHECK_EQ(ArraySize(output_size_dims, 3), 1); + DCHECK_EQ(ArraySize(output_size_dims, 2), 1); + DCHECK_EQ(ArraySize(output_size_dims, 1), 1); + DCHECK_EQ(ArraySize(output_size_dims, 0), 2); + int32 output_height = output_size_data[Offset(output_size_dims, 0, 0, 0, 0)]; + int32 output_width = output_size_data[Offset(output_size_dims, 1, 0, 0, 0)]; + float height_scale = static_cast<float>(input_height) / output_height; + float width_scale = static_cast<float>(input_width) / output_width; + + for (int b = 0; b < batches; ++b) { + for (int y = 0; y < output_height; ++y) { + float input_y = y * height_scale; + int32 y0 = static_cast<int32>(input_y); + int32 y1 = std::min(y0 + 1, input_height - 1); + for (int x = 0; x < output_width; ++x) { + float input_x = x * width_scale; + int32 x0 = static_cast<int32>(input_x); + int32 x1 = std::min(x0 + 1, 
input_width - 1); + for (int c = 0; c < depth; ++c) { + float interpolation = input_data[Offset(input_dims, c, x0, y0, b)] * + (1 - (input_y - y0)) * + (1 - (input_x - x0)) + + input_data[Offset(input_dims, c, x0, y1, b)] * + (input_y - y0) * (1 - (input_x - x0)) + + input_data[Offset(input_dims, c, x1, y0, b)] * + (1 - (input_y - y0)) * (input_x - x0) + + input_data[Offset(input_dims, c, x1, y1, b)] * + (input_y - y0) * (input_x - x0); + output_data[Offset(output_dims, c, x, y, b)] = interpolation; + } + } + } + } +} + +} // namespace optimized_ops +} // namespace rt +} // namespace nnfw + +#if defined OPTIMIZED_OPS_H__IGNORE_DEPRECATED_DECLARATIONS +#undef OPTIMIZED_OPS_H__IGNORE_DEPRECATED_DECLARATIONS +#pragma GCC diagnostic pop +#endif + +#endif // __NNFW_RT_OPTIMIZED_OPS_H__ diff --git a/runtimes/nn/common/operations/internal/optimized/tensor_utils_impl.h b/runtimes/nn/common/operations/internal/optimized/tensor_utils_impl.h new file mode 100644 index 000000000..bf659d0a3 --- /dev/null +++ b/runtimes/nn/common/operations/internal/optimized/tensor_utils_impl.h @@ -0,0 +1,133 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (C) 2017 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __NNFW_RT_TENSOR_UTILS_IMPL_H__ +#define __NNFW_RT_TENSOR_UTILS_IMPL_H__ + +#include "ActivationFunctor.h" + +#ifndef USE_NEON +#if defined(__ARM_NEON__) || defined(__ARM_NEON) +#define USE_NEON +#endif // defined(__ARM_NEON__) || defined(__ARM_NEON) +#endif // USE_NEON + +namespace nnfw { +namespace rt { +namespace tensor_utils { + +// Multiply a matrix by a batch vector, and store results in a batch-size +// vector. +void PortableMatrixBatchVectorMultiplyAccumulate(const float* matrix, + int m_rows, int m_cols, + const float* vector, + int n_batch, float* result, + int result_stride); +void NeonMatrixBatchVectorMultiplyAccumulate(const float* matrix, int m_rows, + int m_cols, const float* vector, + int n_batch, float* result, + int result_stride); + +// Cwise product of two vectors. +void PortableVectorVectorCwiseProduct(const float* vector1, + const float* vector2, int v_size, + float* result); +void NeonVectorVectorCwiseProduct(const float* vector1, const float* vector2, + int v_size, float* result); + +// Cwise product and accumulate of two vectors. Since it's a MAC operation, the +// assumption here is that result array is initialized to valid values. +void PortableVectorVectorCwiseProductAccumulate(const float* vector1, + const float* vector2, + int v_size, float* result); +void NeonVectorVectorCwiseProductAccumulate(const float* vector1, + const float* vector2, int v_size, + float* result); + +// Dot product of two vectors. +float PortableVectorVectorDotProduct(const float* vector1, const float* vector2, + int v_size); + +// Dot product of two batch vectors. 
+void PortableBatchVectorBatchVectorDotProduct(const float* vector1, + const float* vector2, int v_size, + int n_batch, float* result, + int result_stride); + +// Cwise product and accumulate of a vector and a batch-vector. Since it's a MAC +// operation, the assumption here is that result array is initialized to valid +// values. +void PortableVectorBatchVectorCwiseProductAccumulate(const float* vector, + int v_size, + const float* batch_vector, + int n_batch, + float* result); +void NeonVectorBatchVectorCwiseProductAccumulate(const float* vector, + int v_size, + const float* batch_vector, + int n_batch, float* result); + +// Compute "1.0f - elements of vector" (used in CIFG). +void PortableSub1Vector(const float* vector, int v_size, float* result); +void NeonSub1Vector(const float* vector, int v_size, float* result); + +// Clip elements of a vector using a abs_limit value. +void PortableClipVector(const float* vector, int v_size, float abs_limit, + float* result); +void NeonClipVector(const float* vector, int v_size, float abs_limit, + float* result); + +// Batch vector initialization with another vector. +void PortableVectorBatchVectorAssign(const float* vector, int v_size, + int n_batch, float* batch_vector); + +// Apply sigmoid to elements of a vector. +void PortableApplySigmoidToVector(const float* vector, int v_size, + float* result); + +// Apply activation function to elements of a vector. +void PortableApplyActivationToVector(const float* vector, int v_size, + ActivationFn activation, + float* result); + +// Copy vector to another vector. +void PortableCopyVector(const float* vector, int v_size, float* result); + +// Fill vector with 0.f. +void PortableZeroVector(float* vector, int v_size); + +// Limit a float input f between +abs_limit and -abs_limit. +float PortableClip(float f, float abs_limit); + +// Shift left a vector in place with v_size size. +void PortableVectorShiftLeft(float* vector, int v_size, float shift_value); + +// Reduce-sum on a float input vector: +// input_vector: float pointer to input vector. +// input_stride: input vector stride. +// output_vector: float pointer to vector. +// output_size: output vector size. +// reduction_size: number of consecutive elements from input vector which are +// added to get one element of output. +void PortableReductionSumVector(const float* input_vector, int input_stride, + float* output_vector, int output_size, + int reduction_size); +} // namespace tensor_utils +} // namespace rt +} // namespace nnfw + +#endif // __NNFW_RT_TENSOR_UTILS_IMPL_H__ diff --git a/runtimes/nn/common/operations/internal/tensor_utils.cc b/runtimes/nn/common/operations/internal/tensor_utils.cc new file mode 100644 index 000000000..78275bb29 --- /dev/null +++ b/runtimes/nn/common/operations/internal/tensor_utils.cc @@ -0,0 +1,29 @@ +/* + * Copyright 2017 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+#include "tensor_utils.h"
+
+#ifndef USE_NEON
+#if defined(__ARM_NEON__) || defined(__ARM_NEON)
+#define USE_NEON
+#endif  // defined(__ARM_NEON__) || defined(__ARM_NEON)
+#endif  // USE_NEON
+
+#ifdef USE_NEON
+#include "optimized/neon_tensor_utils.h"
+#else
+#include "reference/portable_tensor_utils.h"
+#endif  // USE_NEON
diff --git a/runtimes/nn/common/operations/internal/tensor_utils.h b/runtimes/nn/common/operations/internal/tensor_utils.h
new file mode 100644
index 000000000..df3d4e27b
--- /dev/null
+++ b/runtimes/nn/common/operations/internal/tensor_utils.h
@@ -0,0 +1,123 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2017 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_RT_TENSOR_UTILS_H__
+#define __NNFW_RT_TENSOR_UTILS_H__
+
+#include "ActivationFunctor.h"
+
+namespace nnfw {
+namespace rt {
+namespace tensor_utils {
+
+// Limit a float input f between +abs_limit and -abs_limit.
+float Clip(float f, float abs_limit);
+
+// Multiply a matrix by a batch vector, and store results in a batch-size
+// vector using a stride value provided in result_stride. 'result_stride' gives
+// the number of elements between consecutive result values. For example,
+// result_stride = 1 will cause the output to look like this:
+// [O_1, O_2, ... O_rows] in memory, but result_stride = 3 will cause it to be
+// arranged like this in memory: [O_1, x, x, O_2, x, x, ..., O_rows]
+void MatrixBatchVectorMultiplyAccumulate(const float* matrix, int m_rows,
+                                         int m_cols, const float* vector,
+                                         int n_batch, float* result,
+                                         int result_stride);
+
+// Cwise product of two vectors.
+void VectorVectorCwiseProduct(const float* vector1, const float* vector2,
+                              int v_size, float* result);
+
+// Cwise product and accumulate of two vectors. Since it's a MAC operation, the
+// assumption here is that result array is initialized to valid values.
+void VectorVectorCwiseProductAccumulate(const float* vector1,
+                                        const float* vector2, int v_size,
+                                        float* result);
+
+// Dot product of two vectors.
+float VectorVectorDotProduct(const float* vector1, const float* vector2,
+                             int v_size);
+
+// Dot product of two batch vectors of size n_batch * v_size:
+// vector1 = [x_1_1, x_1_2, ..., x_1_vsize,
+//            x_2_1, x_2_2, ..., x_2_vsize,
+//            ...
+//            x_nbatch_1,..., x_nbatch_vsize]
+// vector2 = [y_1_1, y_1_2, ..., y_1_vsize,
+//            y_2_1, y_2_2, ..., y_2_vsize,
+//            ...
+//            y_nbatch_1,..., y_nbatch_vsize]
+// Then result will be a vector of n_batch size which will be saved with a
+// stride of result_stride in memory starting from 'result':
+// [x_1_1 * y_1_1 + x_1_2 * y_1_2 + ... + x_1_vsize * y_1_vsize,
+//  x_2_1 * y_2_1 + x_2_2 * y_2_2 + ... + x_2_vsize * y_2_vsize,
+//  ...
+//  x_nbatch_1 * y_nbatch_1 + ...
+ x_nbatch_vsize * y_nbatch_vsize] +void BatchVectorBatchVectorDotProduct(const float* vector1, + const float* vector2, int v_size, + int n_batch, float* result, + int result_stride); + +// Cwise product and accumulate of a vector and a batch-vector. Since it's a MAC +// operation, the assumption here is that result array is initialized to valid +// values. +void VectorBatchVectorCwiseProductAccumulate(const float* vector, int v_size, + const float* batch_vector, + int n_batch, float* result); + +// Batch vector initialization with another vector. +void VectorBatchVectorAssign(const float* vector, int v_size, int n_batch, + float* batch_vector); + +// Apply sigmoid to elements of a vector. +void ApplySigmoidToVector(const float* vector, int v_size, float* result); + +// Apply activation function to elements of a vector. +void ApplyActivationToVector(const float* vector, int v_size, + ActivationFn activation, float* result); + +// Copy vector to another vector. +void CopyVector(const float* vector, int v_size, float* result); + +// Compute "1.0f - elements of vector" (used in CIFG). +void Sub1Vector(const float* vector, int v_size, float* result); + +// Fill vector with 0.f. +void ZeroVector(float* vector, int v_size); + +// Clip elements of a vector using a abs_limit value. +void ClipVector(const float* vector, int v_size, float abs_limit, + float* result); + +// Shift left a vector in place with v_size size. +void VectorShiftLeft(float* vector, int v_size, float shift_value); + +// Reduce-sum on a float input vector: +// input_vector: float pointer to input vector. +// input_stride: input vector stride. +// output_vector: float pointer to vector. +// output_size: output vector size. +// reduction_size: number of consecutive elements from input vector which are +// added to get one element of output. +void ReductionSumVector(const float* input_vector, int input_stride, + float* output_vector, int output_size, + int reduction_size); +} // namespace tensor_utils +} // namespace rt +} // namespace nnfw + +#endif // __NNFW_RT_TENSOR_UTILS_H__ diff --git a/runtimes/nn/common/operations/internal/tensor_utils_test.cc b/runtimes/nn/common/operations/internal/tensor_utils_test.cc new file mode 100644 index 000000000..b68982164 --- /dev/null +++ b/runtimes/nn/common/operations/internal/tensor_utils_test.cc @@ -0,0 +1,198 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2017 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "gmock/gmock-matchers.h" +#include "gtest/gtest.h" +#include "tensor_utils.h" + +namespace nnfw { +namespace rt { +namespace tensor_utils { + +namespace { + +using ::testing::FloatNear; +using ::testing::Matcher; + +std::vector<Matcher<float>> ArrayFloatNear(const std::vector<float>& values, + float max_abs_error=1.e-6) { + std::vector<Matcher<float>> matchers; + matchers.reserve(values.size()); + for (const float& v : values) { + matchers.emplace_back(FloatNear(v, max_abs_error)); + } + return matchers; +} + +} // anonymous namespace + +TEST(uKernels, ClipTest) { + constexpr int kVectorSize = 10; + constexpr float kAbsLimit = 2.0; + static float input[kVectorSize] = {0.0, -0.5, 1.0, -1.5, 2.0, + -2.5, 3.0, -3.5, 4.0, -4.5}; + std::vector<float> output(kVectorSize); + ClipVector(input, kVectorSize, kAbsLimit, output.data()); + EXPECT_THAT(output, + ElementsAreArray(ArrayFloatNear( + {0.0, -0.5, 1.0, -1.5, 2.0, -2.0, 2.0, -2.0, 2.0, -2.0}))); +} + +TEST(uKernels, MatrixBatchVectorMultiplyAccumulateTest) { + constexpr int kRow = 3; + constexpr int kCol = 4; + constexpr int kBatch = 2; + static float matrix[kRow * kCol] = {1.0, 2.0, 3.0, 4.0, // + -1.0, -2.0, -3.0, -4.0, // + 1.0, -2.0, 3.0, -4.0}; + static float vector[kCol * kBatch] = {1.0, -1.0, 1.0, -1.0, // + 2.0, -2.0, 2.0, -2.0}; + std::vector<float> output(kRow * kBatch); + std::fill(output.begin(), output.end(), 3.0); + MatrixBatchVectorMultiplyAccumulate(matrix, kRow, kCol, vector, kBatch, + output.data(), /*result_stride=*/1); + EXPECT_THAT(output, ElementsAreArray(ArrayFloatNear({1., 5., 13., // + -1., 7., 23.}))); +} + +TEST(uKernels, VectorVectorCwiseProductTest) { + constexpr int kVectorSize = 10; + static float input1[kVectorSize] = {0.0, -0.5, 1.0, -1.5, 2.0, + -2.5, 3.0, -3.5, 4.0, -4.5}; + static float input2[kVectorSize] = {0.1, -0.1, 0.1, -0.1, 0.1, + -0.1, 0.1, -0.1, 0.1, -0.1}; + std::vector<float> output(kVectorSize); + VectorVectorCwiseProduct(input1, input2, kVectorSize, output.data()); + EXPECT_THAT(output, + ElementsAreArray(ArrayFloatNear( + {0.0, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45}))); +} + +TEST(uKernels, VectorVectorCwiseProductAccumulateTest) { + constexpr int kVectorSize = 10; + static float input1[kVectorSize] = {0.0, -0.5, 1.0, -1.5, 2.0, + -2.5, 3.0, -3.5, 4.0, -4.5}; + static float input2[kVectorSize] = {0.1, -0.1, 0.1, -0.1, 0.1, + -0.1, 0.1, -0.1, 0.1, -0.1}; + std::vector<float> output(kVectorSize); + std::fill(output.begin(), output.end(), 1.0); + VectorVectorCwiseProductAccumulate(input1, input2, kVectorSize, + output.data()); + EXPECT_THAT(output, + ElementsAreArray(ArrayFloatNear( + {1.0, 1.05, 1.1, 1.15, 1.2, 1.25, 1.3, 1.35, 1.4, 1.45}))); +} + +TEST(uKernels, VectorBatchVectorAssignTest) { + constexpr int kVectorSize = 5; + constexpr int kBatchSize = 3; + static float input[kVectorSize] = {0.0, -0.5, 1.0, -1.5, 2.0}; + std::vector<float> output(kVectorSize * kBatchSize); + VectorBatchVectorAssign(input, kVectorSize, kBatchSize, output.data()); + EXPECT_THAT(output, ElementsAreArray(ArrayFloatNear( + {0.0, -0.5, 1.0, -1.5, 2.0, 0.0, -0.5, 1.0, -1.5, 2.0, + 0.0, -0.5, 1.0, -1.5, 2.0}))); +} + +TEST(uKernels, ApplySigmoidToVectorTest) { + constexpr int kVectorSize = 5; + static float input[kVectorSize] = {0.0, -0.5, 1.0, -1.5, 2.0}; + std::vector<float> output(kVectorSize); + ApplySigmoidToVector(input, kVectorSize, output.data()); + EXPECT_THAT(output, ElementsAreArray(ArrayFloatNear( + {0.5, 0.377541, 0.731059, 0.182426, 0.880797}))); +} + +TEST(uKernels, 
ApplyActivationToVectorTest) { + constexpr int kVectorSize = 5; + static float input[kVectorSize] = {0.0, -0.5, 1.0, -1.5, 2.0}; + std::vector<float> output(kVectorSize); + ApplyActivationToVector(input, kVectorSize, kActivationRelu, output.data()); + EXPECT_THAT(output, + ElementsAreArray(ArrayFloatNear({0.0, 0.0, 1.0, 0.0, 2.0}))); + + ApplyActivationToVector(input, kVectorSize, kActivationTanh, output.data()); + EXPECT_THAT(output, ElementsAreArray(ArrayFloatNear( + {0.0, -0.462117, 0.761594, -0.905148, 0.964028}))); +} + +TEST(uKernels, CopyVectorTest) { + constexpr int kVectorSize = 5; + static float input[kVectorSize] = {0.0, -0.5, 1.0, -1.5, 2.0}; + std::vector<float> output(kVectorSize); + CopyVector(input, kVectorSize, output.data()); + EXPECT_THAT(output, + ElementsAreArray(ArrayFloatNear({0.0, -0.5, 1.0, -1.5, 2.0}))); +} + +TEST(uKernels, Sub1VectorTest) { + constexpr int kVectorSize = 5; + static float input[kVectorSize] = {0.0, -0.5, 1.0, -1.5, 2.0}; + std::vector<float> output(kVectorSize); + Sub1Vector(input, kVectorSize, output.data()); + EXPECT_THAT(output, + ElementsAreArray(ArrayFloatNear({1.0, 1.5, 0.0, 2.5, -1.0}))); +} + +TEST(uKernels, ZeroVectorTest) { + constexpr int kVectorSize = 5; + std::vector<float> output(kVectorSize); + ZeroVector(output.data(), kVectorSize); + EXPECT_THAT(output, + ElementsAreArray(ArrayFloatNear({0.0, 0.0, 0.0, 0.0, 0.0}))); +} + +TEST(uKernels, BatchVectorBatchVectorDotProductTest) { + constexpr int kVectorSize = 5; + constexpr int kBatch = 2; + static float input1[kVectorSize * kBatch] = {0.0, -0.5, 1.0, -1.5, 2.0, + -2.5, 3.0, -3.5, 4.0, -4.5}; + static float input2[kVectorSize * kBatch] = {0.1, -0.1, 0.1, -0.1, 0.1, + -0.1, 0.1, -0.1, 0.1, -0.1}; + std::vector<float> output(kBatch); + BatchVectorBatchVectorDotProduct(input1, input2, kVectorSize, kBatch, + output.data(), /*result_stride=*/1); + EXPECT_THAT(output, ElementsAreArray(ArrayFloatNear({0.5, 1.75}))); +} + +TEST(uKernels, VectorShiftLeftTest) { + constexpr int kVectorSize = 5; + static float input[kVectorSize] = {0.0, -0.5, 1.0, -1.5, 2.0}; + std::vector<float> result(kVectorSize); + VectorShiftLeft(input, kVectorSize, 3.0); + result.assign(input, input + kVectorSize); + EXPECT_THAT(result, + ElementsAreArray(ArrayFloatNear({-0.5, 1.0, -1.5, 2.0, 3.0}))); +} + +TEST(uKernels, ReductionSumVectorTest) { + constexpr int kInputVectorSize = 10; + constexpr int kOutputVectorSize = 5; + constexpr int kReductionSize = 2; + static float input[kInputVectorSize] = {0.0, -0.5, 1.0, -1.5, 2.0, + 0.0, -0.5, 1.0, 1.0, 2.0}; + std::vector<float> result(kOutputVectorSize); + ReductionSumVector(input, + /*input_stride=*/1, result.data(), kOutputVectorSize, + kReductionSize); + EXPECT_THAT(result, + ElementsAreArray(ArrayFloatNear({-0.5, -0.5, 2.0, 0.5, 3.0}))); +} + +} // namespace tensor_utils +} // namespace rt +} // namespace nnfw diff --git a/runtimes/nn/common/operations/internal/types.h b/runtimes/nn/common/operations/internal/types.h new file mode 100644 index 000000000..bd5880edd --- /dev/null +++ b/runtimes/nn/common/operations/internal/types.h @@ -0,0 +1,112 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright (C) 2017 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __NNFW_RT_TYPES_H__ +#define __NNFW_RT_TYPES_H__ + +#include "compatibility.h" + +namespace nnfw { +namespace rt { + +enum class FusedActivationFunctionType { kNone, kRelu6, kRelu1, kRelu }; + +template <int N> +struct Dims { + int sizes[N]; + int strides[N]; +}; + +struct Shape; + +inline Dims<4> convertShapeToDims(const Shape& shape) { + Dims<4> dims; + for (int i=0; i<4; i++) { + dims.sizes[i] = 1; + } + + if (shape.dimensions.size() == 1) { + dims.sizes[0] = (int)getSizeOfDimension(shape, 0); + } else { + for (int i=0; i<4; i++) { + int src = (int)shape.dimensions.size()-i-1; + if (src >= 0) { + dims.sizes[i] = (int)getSizeOfDimension(shape, src); + } + } + } + + dims.strides[0] = 1; + for (int i = 1; i<4; i++) { + dims.strides[i] = dims.strides[i-1] * dims.sizes[i-1]; + } + return dims; +} + +inline int Offset(const Dims<4>& dims, int i0, int i1, int i2, int i3) { + DCHECK(i0 >= 0 && i0 < dims.sizes[0]); + DCHECK(i1 >= 0 && i1 < dims.sizes[1]); + DCHECK(i2 >= 0 && i2 < dims.sizes[2]); + DCHECK(i3 >= 0 && i3 < dims.sizes[3]); + return i0 * dims.strides[0] + i1 * dims.strides[1] + i2 * dims.strides[2] + + i3 * dims.strides[3]; +} + +// Get array size, DCHECKing that the dim index is in range. +template <int N> +int ArraySize(const Dims<N>& array, int index) { + DCHECK(index >= 0 && index < N); + return array.sizes[index]; +} + +// Get common array size, DCHECKing that they all agree. +template <typename ArrayType1, typename ArrayType2> +int MatchingArraySize(const ArrayType1& array1, int index1, + const ArrayType2& array2, int index2) { + DCHECK_EQ(ArraySize(array1, index1), ArraySize(array2, index2)); + return ArraySize(array1, index1); +} + +template <typename ArrayType1, typename ArrayType2, typename... Args> +int MatchingArraySize(const ArrayType1& array1, int index1, + const ArrayType2& array2, int index2, Args... args) { + DCHECK_EQ(ArraySize(array1, index1), ArraySize(array2, index2)); + return MatchingArraySize(array1, index1, args...); +} + +inline int RequiredBufferSizeForDims(const Dims<4>& dims) { + int max_offset = 0; + for (int i = 0; i < 4; i++) { + max_offset += (dims.sizes[i] - 1) * dims.strides[i]; + } + return max_offset + 1; +} + +template <int N> +bool IsPackedWithoutStrides(const Dims<N>& dims) { + int expected_stride = 1; + for (int d = 0; d < N; d++) { + if (dims.strides[d] != expected_stride) return false; + expected_stride *= dims.sizes[d]; + } + return true; +} + +} // namespace rt +} // namespace nnfw + +#endif // __NNFW_RT_TYPES_H__ |
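A minimal sketch (an editorial illustration, not part of the diff) of how the Dims<4> helpers in types.h above fit together; the include path and the shape values are assumptions chosen for the example:

#include <cassert>
#include "types.h"  // Dims<4>, Offset, RequiredBufferSizeForDims, IsPackedWithoutStrides

int main() {
  using nnfw::rt::Dims;
  // Innermost dimension first, as produced by convertShapeToDims():
  // {depth, width, height, batch}.
  Dims<4> dims;
  const int sizes[4] = {2, 3, 4, 1};
  for (int i = 0; i < 4; i++) dims.sizes[i] = sizes[i];
  dims.strides[0] = 1;
  for (int i = 1; i < 4; i++) dims.strides[i] = dims.strides[i - 1] * dims.sizes[i - 1];

  // strides == {1, 2, 6, 24}; element (c=1, x=2, y=3, b=0) lives at 1 + 2*2 + 3*6 = 23.
  assert(nnfw::rt::Offset(dims, 1, 2, 3, 0) == 23);
  // A packed tensor of this shape needs 2*3*4*1 == 24 elements.
  assert(nnfw::rt::RequiredBufferSizeForDims(dims) == 24);
  assert(nnfw::rt::IsPackedWithoutStrides(dims));
  return 0;
}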