Diffstat (limited to 'runtime/onert/backend')
43 files changed, 2986 insertions, 1622 deletions
diff --git a/runtime/onert/backend/acl_cl/KernelGenerator.cc b/runtime/onert/backend/acl_cl/KernelGenerator.cc index 3ca405899..a84f983b4 100644 --- a/runtime/onert/backend/acl_cl/KernelGenerator.cc +++ b/runtime/onert/backend/acl_cl/KernelGenerator.cc @@ -31,6 +31,7 @@ #include "exec/FunctionSequence.h" #include "util/logging.h" #include "util/Utils.h" +#include "AclKernelGen.h" namespace onert { @@ -76,15 +77,15 @@ void KernelGenerator::visit(const ir::operation::BatchToSpaceND &node) const auto block_size_index{ node.getInputs().at(ir::operation::BatchToSpaceND::Input::BLOCK_SIZE)}; - auto ofm_alloc = _tensor_builder->at(ofm_index).get(); - auto ifm_alloc = _tensor_builder->at(ifm_index).get(); - auto block_size_alloc = _tensor_builder->at(block_size_index).get(); + auto ofm_tensor = _tensor_builder->at(ofm_index).get(); + auto ifm_tensor = _tensor_builder->at(ifm_index).get(); + auto block_size_tensor = _tensor_builder->at(block_size_index).get(); assert(_ctx.at(block_size_index).data()); auto fn = std::make_unique<::arm_compute::CLBatchToSpaceLayer>(); - fn->configure(ifm_alloc->handle(), block_size_alloc->handle(), ofm_alloc->handle()); + fn->configure(ifm_tensor->handle(), block_size_tensor->handle(), ofm_tensor->handle()); auto acl_fn = asAclClFunction(std::move(fn)); @@ -96,15 +97,27 @@ void KernelGenerator::visit(const ir::operation::Cast &node) const auto ofm_index{node.getOutputs().at(0)}; const auto ifm_index{node.getInputs().at(ir::operation::Cast::Input::INPUT)}; - auto ofm_alloc = _tensor_builder->at(ofm_index).get(); - auto ifm_alloc = _tensor_builder->at(ifm_index).get(); - const auto input_sub_type = _ctx.at(ifm_index).typeInfo().type() == ir::DataType::BOOL8 - ? arm_compute::SubDataType::BOOL - : arm_compute::SubDataType::NONE; + auto ofm_tensor = _tensor_builder->at(ofm_index).get(); + auto ifm_tensor = _tensor_builder->at(ifm_index).get(); - auto fn = std::make_unique<::arm_compute::CLCast>(); + std::unique_ptr<::arm_compute::IFunction> fn; + if (ifm_tensor->data_type() == ofm_tensor->data_type()) + { + auto l = std::make_unique<::arm_compute::CLCopy>(); + + l->configure(ifm_tensor->handle(), ofm_tensor->handle()); + + fn = std::move(l); + } + else + { + auto l = std::make_unique<::arm_compute::CLCast>(); - fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), input_sub_type); + // TODO Support converting float to int32 as round down + l->configure(ifm_tensor->handle(), ofm_tensor->handle(), arm_compute::ConvertPolicy::SATURATE); + + fn = std::move(l); + } auto acl_fn = asAclClFunction(std::move(fn)); @@ -132,10 +145,10 @@ void KernelGenerator::visit(const ir::operation::Conv2D &node) ker_width, ker_height); const auto activation = node.param().activation; - auto ofm_alloc = _tensor_builder->at(ofm_index).get(); - auto ifm_alloc = _tensor_builder->at(ifm_index).get(); - auto ker_alloc = _tensor_builder->at(ker_index).get(); - auto bias_alloc = _tensor_builder->at(bias_index).get(); + auto ofm_tensor = _tensor_builder->at(ofm_index).get(); + auto ifm_tensor = _tensor_builder->at(ifm_index).get(); + auto ker_tensor = _tensor_builder->at(ker_index).get(); + auto bias_tensor = _tensor_builder->at(bias_index).get(); const auto conv_info = acl_common::asPadStrideInfo(padding, stride); const auto act_info = acl_common::asActivationLayerInfo(activation); @@ -143,8 +156,9 @@ void KernelGenerator::visit(const ir::operation::Conv2D &node) auto fn = std::make_unique<::arm_compute::CLConvolutionLayer>( _tensor_builder->acl_tensor_manager()->internal_buffer_manager()); - 
fn->configure(ifm_alloc->handle(), ker_alloc->handle(), bias_alloc->handle(), ofm_alloc->handle(), - conv_info, ::arm_compute::WeightsInfo(), ::arm_compute::Size2D(1U, 1U), act_info); + fn->configure(ifm_tensor->handle(), ker_tensor->handle(), bias_tensor->handle(), + ofm_tensor->handle(), conv_info, ::arm_compute::WeightsInfo(), + ::arm_compute::Size2D(1U, 1U), act_info); _return_fn = asAclClFunction(std::move(fn)); } @@ -171,10 +185,10 @@ void KernelGenerator::visit(const ir::operation::DepthwiseConv2D &node) const auto multiplier = node.param().multiplier; const auto activation = node.param().activation; - auto ofm_alloc = _tensor_builder->at(ofm_index).get(); - auto ifm_alloc = _tensor_builder->at(ifm_index).get(); - auto ker_alloc = _tensor_builder->at(ker_index).get(); - auto bias_alloc = _tensor_builder->at(bias_index).get(); + auto ofm_tensor = _tensor_builder->at(ofm_index).get(); + auto ifm_tensor = _tensor_builder->at(ifm_index).get(); + auto ker_tensor = _tensor_builder->at(ker_index).get(); + auto bias_tensor = _tensor_builder->at(bias_index).get(); const auto conv_info = acl_common::asPadStrideInfo(padding, stride); const auto act_info = acl_common::asActivationLayerInfo(activation); @@ -182,8 +196,8 @@ void KernelGenerator::visit(const ir::operation::DepthwiseConv2D &node) { auto fn = std::make_unique<::arm_compute::CLDepthwiseConvolutionLayer>(); - fn->configure(ifm_alloc->handle(), ker_alloc->handle(), bias_alloc->handle(), - ofm_alloc->handle(), conv_info, multiplier, act_info); + fn->configure(ifm_tensor->handle(), ker_tensor->handle(), bias_tensor->handle(), + ofm_tensor->handle(), conv_info, multiplier, act_info); _return_fn = asAclClFunction(std::move(fn)); } @@ -191,88 +205,28 @@ void KernelGenerator::visit(const ir::operation::DepthwiseConv2D &node) void KernelGenerator::visit(const ir::operation::MaxPool2D &node) { - const auto ofm_index{node.getOutputs().at(0)}; - const auto ifm_index{node.getInputs().at(ir::operation::MaxPool2D::Input::INPUT)}; - - const auto ofm_shape = _ctx.at(ofm_index).shape().asFeature(_current_op_seq_layout); - const auto ifm_shape = _ctx.at(ifm_index).shape().asFeature(_current_op_seq_layout); + auto raw_fn = acl_common::kernelGenPool2D<::arm_compute::CLPoolingLayer>( + node, _ctx, _tensor_builder, _current_op_seq_layout, ::arm_compute::PoolingType::MAX); - const auto kh = node.param().kh; - const auto kw = node.param().kw; - const auto stride = node.param().stride; - const auto padding = - ir::calculatePadding(node.param().padding, ifm_shape, ofm_shape, stride, kw, kh); + const auto ofm_index{node.getOutputs().at(0)}; + auto ofm_tensor = _tensor_builder->at(ofm_index).get(); const auto activation = node.param().activation; - - VERBOSE(MaxPool2D) << "IFM_H: " << ifm_shape.H << std::endl; - VERBOSE(MaxPool2D) << "IFM_W: " << ifm_shape.W << std::endl; - VERBOSE(MaxPool2D) << "OFM_H: " << ofm_shape.H << std::endl; - VERBOSE(MaxPool2D) << "OFM_W: " << ofm_shape.W << std::endl; - VERBOSE(MaxPool2D) << "KER_H: " << kh << std::endl; - VERBOSE(MaxPool2D) << "KER_W: " << kw << std::endl; - VERBOSE(MaxPool2D) << "STRIDE_H: " << stride.vertical << std::endl; - VERBOSE(MaxPool2D) << "STRIDE_W: " << stride.horizontal << std::endl; - VERBOSE(MaxPool2D) << "PAD(T): " << padding.top << std::endl; - VERBOSE(MaxPool2D) << "PAD(B): " << padding.bottom << std::endl; - VERBOSE(MaxPool2D) << "PAD(L): " << padding.left << std::endl; - VERBOSE(MaxPool2D) << "PAD(R): " << padding.right << std::endl; - - auto ofm_alloc = _tensor_builder->at(ofm_index).get(); - auto 
ifm_alloc = _tensor_builder->at(ifm_index).get(); - - ::arm_compute::PoolingLayerInfo info{::arm_compute::PoolingType::MAX, - ::arm_compute::Size2D{kw, kh}, - acl_common::asPadStrideInfo(padding, stride)}; - - auto fn = std::make_unique<::arm_compute::CLPoolingLayer>(); - - fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), info); - _return_fn = std::make_unique<exec::FunctionSequence>( - asAclClFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_alloc->handle())); + asAclClFunction(std::move(raw_fn)), + ActivationBuilder::generate(activation, ofm_tensor->handle())); } void KernelGenerator::visit(const ir::operation::AvgPool2D &node) { - const auto ofm_index{node.getOutputs().at(0)}; - const auto ifm_index{node.getInputs().at(ir::operation::AvgPool2D::Input::INPUT)}; - - const auto ofm_shape = _ctx.at(ofm_index).shape().asFeature(_current_op_seq_layout); - const auto ifm_shape = _ctx.at(ifm_index).shape().asFeature(_current_op_seq_layout); + auto raw_fn = acl_common::kernelGenPool2D<::arm_compute::CLPoolingLayer>( + node, _ctx, _tensor_builder, _current_op_seq_layout, ::arm_compute::PoolingType::AVG); - const auto kh = node.param().kh; - const auto kw = node.param().kw; - const auto stride = node.param().stride; - const auto padding = - ir::calculatePadding(node.param().padding, ifm_shape, ofm_shape, stride, kw, kh); + const auto ofm_index{node.getOutputs().at(0)}; + auto ofm_tensor = _tensor_builder->at(ofm_index).get(); const auto activation = node.param().activation; - - VERBOSE(AvgPool2D) << "IFM_H: " << ifm_shape.H << std::endl; - VERBOSE(AvgPool2D) << "IFM_W: " << ifm_shape.W << std::endl; - VERBOSE(AvgPool2D) << "OFM_H: " << ofm_shape.H << std::endl; - VERBOSE(AvgPool2D) << "OFM_W: " << ofm_shape.W << std::endl; - VERBOSE(AvgPool2D) << "KER_H: " << kh << std::endl; - VERBOSE(AvgPool2D) << "KER_W: " << kw << std::endl; - VERBOSE(AvgPool2D) << "STRIDE_H: " << stride.vertical << std::endl; - VERBOSE(AvgPool2D) << "STRIDE_W: " << stride.horizontal << std::endl; - VERBOSE(AvgPool2D) << "PAD(T): " << padding.top << std::endl; - VERBOSE(AvgPool2D) << "PAD(B): " << padding.bottom << std::endl; - VERBOSE(AvgPool2D) << "PAD(L): " << padding.left << std::endl; - VERBOSE(AvgPool2D) << "PAD(R): " << padding.right << std::endl; - - auto ofm_alloc = _tensor_builder->at(ofm_index).get(); - auto ifm_alloc = _tensor_builder->at(ifm_index).get(); - - ::arm_compute::PoolingLayerInfo info{ - ::arm_compute::PoolingType::AVG, ::arm_compute::Size2D{kw, kh}, - acl_common::asPadStrideInfo(padding, stride), true /* exclude_padding */}; - - auto fn = std::make_unique<::arm_compute::CLPoolingLayer>(); - - fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), info); - _return_fn = std::make_unique<exec::FunctionSequence>( - asAclClFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_alloc->handle())); + asAclClFunction(std::move(raw_fn)), + ActivationBuilder::generate(activation, ofm_tensor->handle())); } void KernelGenerator::visit(const ir::operation::Concat &node) @@ -296,7 +250,7 @@ void KernelGenerator::visit(const ir::operation::Concat &node) return; } - auto output_alloc = _tensor_builder->at(ofm_index).get(); + auto output_tensor = _tensor_builder->at(ofm_index).get(); std::vector<::arm_compute::ICLTensor *> input_tensors; for (auto &ifm_ind : input_indexes) input_tensors.emplace_back(_tensor_builder->at(ifm_ind)->handle()); @@ -305,7 +259,7 @@ void KernelGenerator::visit(const ir::operation::Concat &node) if (input_indexes.size() < 2) { auto l = 
std::make_unique<::arm_compute::CLCopy>(); - l->configure(input_tensors.at(0), output_alloc->handle()); + l->configure(input_tensors.at(0), output_tensor->handle()); fn = std::move(l); } else @@ -313,10 +267,10 @@ void KernelGenerator::visit(const ir::operation::Concat &node) auto l = std::make_unique<::arm_compute::CLConcatenateLayer>(); const auto rank = _ctx.at(ofm_index).shape().rank(); const auto frontend_layout = _current_op_seq_layout; - const auto backend_layout = output_alloc->layout(); + const auto backend_layout = output_tensor->layout(); const auto fixed_axis = acl_common::ToARMComputeAxis(rank, axis, frontend_layout, backend_layout).value(); - l->configure(input_tensors, output_alloc->handle(), fixed_axis); + l->configure(input_tensors, output_tensor->handle(), fixed_axis); fn = std::move(l); } @@ -327,75 +281,15 @@ void KernelGenerator::visit(const ir::operation::Concat &node) void KernelGenerator::visit(const ir::operation::FullyConnected &node) { - using ir::operation::FullyConnected; - const auto output_index{node.getOutputs().at(0)}; - const auto input_index{node.getInputs().at(FullyConnected::Input::INPUT)}; - const auto weight_index{node.getInputs().at(FullyConnected::Input::WEIGHT)}; - const auto bias_index{node.getInputs().at(FullyConnected::Input::BIAS)}; - - const auto input_rank = _ctx.at(input_index).shape().rank(); - - const auto output_size = - _ctx.at(output_index).shape().dim(_ctx.at(output_index).shape().rank() - 1); - UNUSED_RELEASE(output_size); - assert(_ctx.at(bias_index).shape().dim(0) == output_size); - assert(_ctx.at(weight_index).shape().dim(0) == output_size); - const auto batch_size = - _ctx.at(output_index).shape().dim(_ctx.at(output_index).shape().rank() - 2); - const auto input_size = - _ctx.at(weight_index).shape().dim(_ctx.at(weight_index).shape().rank() - 1); - - // Check for reshaping input's shape into rank-2 - bool needs_reshape = false; - ir::Shape reshape(2); - if (input_rank == 3 || input_rank == 4) - { - const auto &ifm_shape = _ctx.at(input_index).shape(); - auto feature_size = 1; - for (int i = 0; i < ifm_shape.rank(); ++i) - { - feature_size *= ifm_shape.dim(i); - } - - UNUSED_RELEASE(feature_size); - assert(feature_size == batch_size * input_size); - - // for reshaping - needs_reshape = true; - reshape.dim(0) = batch_size; /* H */ - reshape.dim(1) = input_size; /* W */ - } - + auto output_tensor = _tensor_builder->at(output_index).get(); const auto activation = node.param().activation; - auto output_alloc = _tensor_builder->at(output_index).get(); - const auto input_alloc = _tensor_builder->at(input_index).get(); - const auto weight_alloc = _tensor_builder->at(weight_index).get(); - const auto bias_alloc = _tensor_builder->at(bias_index).get(); - const auto frontend_layout = _current_op_seq_layout; - const auto acl_layout = output_alloc->handle()->info()->data_layout(); - - auto fn = std::make_unique<arm_compute::CLFullyConnectedReshapingLayer>( - _tensor_builder->acl_tensor_manager()->internal_buffer_manager()); - - arm_compute::CLFullyConnectedReshapingLayer::KernelType kernel_type = - arm_compute::CLFullyConnectedReshapingLayer::KernelType::GENERAL; - if (_ctx.at(weight_index).isConstant()) - { - kernel_type = arm_compute::CLFullyConnectedReshapingLayer::KernelType::PREPROCESSED_WEIGHTS; - assert(_ctx.at(weight_index).data()); - } - fn->configure( - input_alloc->handle(), weight_alloc->handle(), bias_alloc->handle(), output_alloc->handle(), - needs_reshape, - ::onert::backend::acl_common::asTensorShape( - reshape, 
frontend_layout, ::onert::backend::acl_common::asRuntimeLayout(acl_layout)), - kernel_type); - + auto fn = acl_common::kernelGenFullyConnected<acl_common::AclClFunction, ::arm_compute::ICLTensor, + ::arm_compute::CLFullyConnectedReshapingLayer>( + node, _ctx, _tensor_builder, _current_op_seq_layout); _return_fn = std::make_unique<exec::FunctionSequence>( - asAclClFunction(std::move(fn)), - ActivationBuilder::generate(activation, output_alloc->handle())); + std::move(fn), ActivationBuilder::generate(activation, output_tensor->handle())); } void KernelGenerator::visit(const ir::operation::Mul &node) @@ -406,17 +300,18 @@ void KernelGenerator::visit(const ir::operation::Mul &node) const auto activation = node.param().activation; - auto ofm_alloc = _tensor_builder->at(ofm_index).get(); - auto lhs_alloc = _tensor_builder->at(lhs_index).get(); - auto rhs_alloc = _tensor_builder->at(rhs_index).get(); + auto ofm_tensor = _tensor_builder->at(ofm_index).get(); + auto lhs_tensor = _tensor_builder->at(lhs_index).get(); + auto rhs_tensor = _tensor_builder->at(rhs_index).get(); auto fn = std::make_unique<::arm_compute::CLPixelWiseMultiplication>(); - fn->configure(lhs_alloc->handle(), rhs_alloc->handle(), ofm_alloc->handle(), 1.0, // scale + fn->configure(lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle(), 1.0, // scale arm_compute::ConvertPolicy::SATURATE, arm_compute::RoundingPolicy::TO_NEAREST_EVEN); _return_fn = std::make_unique<exec::FunctionSequence>( - asAclClFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_alloc->handle())); + asAclClFunction(std::move(fn)), + ActivationBuilder::generate(activation, ofm_tensor->handle())); } void KernelGenerator::visit(const ir::operation::Reduce &node) @@ -427,14 +322,14 @@ void KernelGenerator::visit(const ir::operation::Reduce &node) const auto keep_dims{node.param().keep_dims}; const auto reduce_type = node.param().reduce_type; - auto output_alloc = _tensor_builder->at(output_index).get(); - auto input_alloc = _tensor_builder->at(input_index).get(); + auto output_tensor = _tensor_builder->at(output_index).get(); + auto input_tensor = _tensor_builder->at(input_index).get(); // Convert to ACL axes taking into account negative values and possible duplicates. 
const auto &axes = _ctx.at(axes_index); const auto input_rank = _ctx.at(input_index).shape().rank(); const auto frontend_layout = _current_op_seq_layout; - const auto backend_layout = input_alloc->layout(); + const auto backend_layout = input_tensor->layout(); std::unique_ptr<arm_compute::IFunction> fn; if (reduce_type == ir::operation::Reduce::ReduceType::MEAN) @@ -443,7 +338,7 @@ void KernelGenerator::visit(const ir::operation::Reduce &node) const auto acl_axes = acl_common::asCoordinates(axes, input_rank, frontend_layout, backend_layout); - l->configure(input_alloc->handle(), acl_axes, keep_dims, output_alloc->handle()); + l->configure(input_tensor->handle(), acl_axes, keep_dims, output_tensor->handle()); fn = std::move(l); } @@ -453,7 +348,7 @@ void KernelGenerator::visit(const ir::operation::Reduce &node) _tensor_builder->acl_tensor_manager()->internal_buffer_manager()); const auto acl_axes = acl_common::asSet(axes, input_rank, frontend_layout, backend_layout); - l->configure(input_alloc->handle(), output_alloc->handle(), acl_axes, keep_dims, + l->configure(input_tensor->handle(), output_tensor->handle(), acl_axes, keep_dims, acl_common::convertReduceType(reduce_type)); fn = std::move(l); @@ -469,13 +364,13 @@ void KernelGenerator::visit(const ir::operation::Reshape &node) const auto output_index{node.getOutputs().at(0)}; const auto input_index{node.getInputs().at(ir::operation::Reshape::Input::INPUT)}; - auto output_alloc = _tensor_builder->at(output_index).get(); - auto input_alloc = _tensor_builder->at(input_index).get(); + auto output_tensor = _tensor_builder->at(output_index).get(); + auto input_tensor = _tensor_builder->at(input_index).get(); // NOTE This operation must not be changed the layout from frontend to backend // So, PermutationOperationPass makes layouts of frontend and backend the same. 
const auto frontend_layout = _current_op_seq_layout; - const auto backend_layout = output_alloc->layout(); + const auto backend_layout = output_tensor->layout(); assert((_ctx.at(input_index).shape().rank() < 4 && _ctx.at(output_index).shape().rank() < 4) || frontend_layout == backend_layout); UNUSED_RELEASE(frontend_layout); @@ -483,7 +378,7 @@ void KernelGenerator::visit(const ir::operation::Reshape &node) auto fn = std::make_unique<::arm_compute::CLReshapeLayer>(); - fn->configure(input_alloc->handle(), output_alloc->handle()); + fn->configure(input_tensor->handle(), output_tensor->handle()); auto acl_fn = asAclClFunction(std::move(fn)); @@ -503,10 +398,10 @@ void KernelGenerator::visit(const ir::operation::Squeeze &node) (void)dims; (void)ndim; - auto output_alloc = _tensor_builder->at(output_index).get(); - auto input_alloc = _tensor_builder->at(input_index).get(); + auto output_tensor = _tensor_builder->at(output_index).get(); + auto input_tensor = _tensor_builder->at(input_index).get(); auto fn = std::make_unique<arm_compute::CLReshapeLayer>(); - fn->configure(input_alloc->handle(), output_alloc->handle()); + fn->configure(input_tensor->handle(), output_tensor->handle()); auto acl_fn = asAclClFunction(std::move(fn)); _return_fn = std::move(acl_fn); } @@ -516,15 +411,15 @@ void KernelGenerator::visit(const ir::operation::Tanh &node) const auto output_index{node.getOutputs().at(0)}; const auto input_index{node.getInputs().at(ir::operation::Tanh::Input::INPUT)}; - auto output_alloc = _tensor_builder->at(output_index).get(); - auto input_alloc = _tensor_builder->at(input_index).get(); + auto output_tensor = _tensor_builder->at(output_index).get(); + auto input_tensor = _tensor_builder->at(input_index).get(); auto fn = std::make_unique<arm_compute::CLActivationLayer>(); const ::arm_compute::ActivationLayerInfo act_info{ ::arm_compute::ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f}; - fn->configure(input_alloc->handle(), output_alloc->handle(), act_info); + fn->configure(input_tensor->handle(), output_tensor->handle(), act_info); auto acl_fn = asAclClFunction(std::move(fn)); @@ -538,13 +433,13 @@ void KernelGenerator::visit(const ir::operation::Softmax &node) const auto beta = node.param().beta; - auto output_alloc = _tensor_builder->at(output_index).get(); - auto input_alloc = _tensor_builder->at(input_index).get(); + auto output_tensor = _tensor_builder->at(output_index).get(); + auto input_tensor = _tensor_builder->at(input_index).get(); auto fn = std::make_unique<::arm_compute::CLSoftmaxLayer>( _tensor_builder->acl_tensor_manager()->internal_buffer_manager()); - fn->configure(input_alloc->handle(), output_alloc->handle(), beta); + fn->configure(input_tensor->handle(), output_tensor->handle(), beta); auto acl_fn = asAclClFunction(std::move(fn)); @@ -558,10 +453,10 @@ void KernelGenerator::visit(const ir::operation::Slice &node) const auto begins_index{node.getInputs().at(ir::operation::Slice::Input::BEGINS)}; const auto sizes_index{node.getInputs().at(ir::operation::Slice::Input::SIZES)}; - auto outputData_alloc = _tensor_builder->at(output_index).get(); - auto inputData_alloc = _tensor_builder->at(input_index).get(); + auto outputData_tensor = _tensor_builder->at(output_index).get(); + auto inputData_tensor = _tensor_builder->at(input_index).get(); const auto frontend_layout = _current_op_seq_layout; - const auto backend_layout = inputData_alloc->layout(); + const auto backend_layout = inputData_tensor->layout(); // Set initializers for indices data such as order of 
inputData int input_rank = _ctx.at(input_index).shape().rank(); @@ -613,7 +508,7 @@ void KernelGenerator::visit(const ir::operation::Slice &node) auto fn = std::make_unique<::arm_compute::CLSlice>(); - fn->configure(inputData_alloc->handle(), outputData_alloc->handle(), starts_set, ends_set); + fn->configure(inputData_tensor->handle(), outputData_tensor->handle(), starts_set, ends_set); auto acl_fn = asAclClFunction(std::move(fn)); @@ -628,10 +523,10 @@ void KernelGenerator::visit(const ir::operation::StridedSlice &node) const auto ends_index{node.getInputs().at(ir::operation::StridedSlice::Input::ENDS)}; const auto strides_index{node.getInputs().at(ir::operation::StridedSlice::Input::STRIDES)}; - auto outputData_alloc = _tensor_builder->at(output_index).get(); - auto inputData_alloc = _tensor_builder->at(input_index).get(); + auto outputData_tensor = _tensor_builder->at(output_index).get(); + auto inputData_tensor = _tensor_builder->at(input_index).get(); const auto frontend_layout = _current_op_seq_layout; - const auto backend_layout = inputData_alloc->layout(); + const auto backend_layout = inputData_tensor->layout(); // Set initializers for indices data such as order of inputData int input_rank = _ctx.at(input_index).shape().rank(); @@ -704,7 +599,7 @@ void KernelGenerator::visit(const ir::operation::StridedSlice &node) auto fn = std::make_unique<::arm_compute::CLStridedSlice>(); - fn->configure(inputData_alloc->handle(), outputData_alloc->handle(), starts_set, ends_set, + fn->configure(inputData_tensor->handle(), outputData_tensor->handle(), starts_set, ends_set, strides_set, begin_mask, end_mask, shrink_axis_mask); auto acl_fn = asAclClFunction(std::move(fn)); @@ -720,10 +615,10 @@ void KernelGenerator::visit(const ir::operation::Transpose &node) const auto rank = _ctx.at(ifm_idx).shape().rank(); - auto ofm_alloc = _tensor_builder->at(ofm_idx).get(); - auto ifm_alloc = _tensor_builder->at(ifm_idx).get(); + auto ofm_tensor = _tensor_builder->at(ofm_idx).get(); + auto ifm_tensor = _tensor_builder->at(ifm_idx).get(); const auto frontend_layout = _current_op_seq_layout; - const auto backend_layout = ifm_alloc->layout(); + const auto backend_layout = ifm_tensor->layout(); std::vector<std::int32_t> pv(perm.cbegin(), perm.cend()); // Reversed @@ -732,7 +627,7 @@ void KernelGenerator::visit(const ir::operation::Transpose &node) auto fn = std::make_unique<::arm_compute::CLPermute>(); - fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), backend_pv); + fn->configure(ifm_tensor->handle(), ofm_tensor->handle(), backend_pv); auto acl_fn = asAclClFunction(std::move(fn)); @@ -747,17 +642,18 @@ void KernelGenerator::visit(const ir::operation::Add &node) const auto activation = node.param().activation; - auto ofm_alloc = _tensor_builder->at(ofm_index).get(); - auto lhs_alloc = _tensor_builder->at(lhs_index).get(); - auto rhs_alloc = _tensor_builder->at(rhs_index).get(); + auto ofm_tensor = _tensor_builder->at(ofm_index).get(); + auto lhs_tensor = _tensor_builder->at(lhs_index).get(); + auto rhs_tensor = _tensor_builder->at(rhs_index).get(); auto fn = std::make_unique<::arm_compute::CLArithmeticAddition>(); - fn->configure(lhs_alloc->handle(), rhs_alloc->handle(), ofm_alloc->handle(), + fn->configure(lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle(), arm_compute::ConvertPolicy::SATURATE); _return_fn = std::make_unique<exec::FunctionSequence>( - asAclClFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_alloc->handle())); + asAclClFunction(std::move(fn)), + 
ActivationBuilder::generate(activation, ofm_tensor->handle())); } void KernelGenerator::visit(const ir::operation::Sub &node) @@ -768,17 +664,18 @@ void KernelGenerator::visit(const ir::operation::Sub &node) const auto activation = node.param().activation; - auto ofm_alloc = _tensor_builder->at(ofm_index).get(); - auto lhs_alloc = _tensor_builder->at(lhs_index).get(); - auto rhs_alloc = _tensor_builder->at(rhs_index).get(); + auto ofm_tensor = _tensor_builder->at(ofm_index).get(); + auto lhs_tensor = _tensor_builder->at(lhs_index).get(); + auto rhs_tensor = _tensor_builder->at(rhs_index).get(); auto fn = std::make_unique<::arm_compute::CLArithmeticSubtraction>(); - fn->configure(lhs_alloc->handle(), rhs_alloc->handle(), ofm_alloc->handle(), + fn->configure(lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle(), arm_compute::ConvertPolicy::SATURATE); _return_fn = std::make_unique<exec::FunctionSequence>( - asAclClFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_alloc->handle())); + asAclClFunction(std::move(fn)), + ActivationBuilder::generate(activation, ofm_tensor->handle())); } void KernelGenerator::visit(const ir::operation::Div &node) @@ -789,16 +686,17 @@ void KernelGenerator::visit(const ir::operation::Div &node) const auto activation = node.param().activation; - auto ofm_alloc = _tensor_builder->at(ofm_index).get(); - auto lhs_alloc = _tensor_builder->at(lhs_index).get(); - auto rhs_alloc = _tensor_builder->at(rhs_index).get(); + auto ofm_tensor = _tensor_builder->at(ofm_index).get(); + auto lhs_tensor = _tensor_builder->at(lhs_index).get(); + auto rhs_tensor = _tensor_builder->at(rhs_index).get(); auto fn = std::make_unique<::arm_compute::CLArithmeticDivision>(); - fn->configure(lhs_alloc->handle(), rhs_alloc->handle(), ofm_alloc->handle()); + fn->configure(lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle()); _return_fn = std::make_unique<exec::FunctionSequence>( - asAclClFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_alloc->handle())); + asAclClFunction(std::move(fn)), + ActivationBuilder::generate(activation, ofm_tensor->handle())); } void KernelGenerator::visit(const ir::operation::Exp &node) @@ -806,12 +704,12 @@ void KernelGenerator::visit(const ir::operation::Exp &node) const auto output_index{node.getOutputs().at(0)}; const auto input_index{node.getInputs().at(ir::operation::Exp::Input::INPUT)}; - auto output_alloc = _tensor_builder->at(output_index).get(); - auto input_alloc = _tensor_builder->at(input_index).get(); + auto output_tensor = _tensor_builder->at(output_index).get(); + auto input_tensor = _tensor_builder->at(input_index).get(); auto fn = std::make_unique<::arm_compute::CLExpLayer>(); - fn->configure(input_alloc->handle(), output_alloc->handle()); + fn->configure(input_tensor->handle(), output_tensor->handle()); auto acl_fn = asAclClFunction(std::move(fn)); @@ -823,12 +721,12 @@ void KernelGenerator::visit(const ir::operation::ExpandDims &node) const auto output_index{node.getOutputs().at(0)}; const auto input_index{node.getInputs().at(ir::operation::ExpandDims::Input::INPUT)}; - auto output_alloc = _tensor_builder->at(output_index).get(); - auto input_alloc = _tensor_builder->at(input_index).get(); + auto output_tensor = _tensor_builder->at(output_index).get(); + auto input_tensor = _tensor_builder->at(input_index).get(); auto fn = std::make_unique<::arm_compute::CLReshapeLayer>(); - fn->configure(input_alloc->handle(), output_alloc->handle()); + fn->configure(input_tensor->handle(), 
output_tensor->handle()); auto acl_fn = asAclClFunction(std::move(fn)); @@ -842,20 +740,21 @@ void KernelGenerator::visit(const ir::operation::InstanceNorm &node) const auto gamma_index{node.getInputs().at(ir::operation::InstanceNorm::Input::GAMMA)}; const auto beta_index{node.getInputs().at(ir::operation::InstanceNorm::Input::BETA)}; - auto ofm_alloc = _tensor_builder->at(ofm_index).get(); - auto ifm_alloc = _tensor_builder->at(ifm_index).get(); - auto gamma_alloc = _tensor_builder->at(gamma_index).get(); - auto beta_alloc = _tensor_builder->at(beta_index).get(); + auto ofm_tensor = _tensor_builder->at(ofm_index).get(); + auto ifm_tensor = _tensor_builder->at(ifm_index).get(); + auto gamma_tensor = _tensor_builder->at(gamma_index).get(); + auto beta_tensor = _tensor_builder->at(beta_index).get(); auto epsilon = node.param().epsilon; auto activation = node.param().activation; auto fn = std::make_unique<::arm_compute::CLInstanceNormalizationLayerEx>(); - fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), gamma_alloc->handle(), - beta_alloc->handle(), epsilon); + fn->configure(ifm_tensor->handle(), ofm_tensor->handle(), gamma_tensor->handle(), + beta_tensor->handle(), epsilon); _return_fn = std::make_unique<exec::FunctionSequence>( - asAclClFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_alloc->handle())); + asAclClFunction(std::move(fn)), + ActivationBuilder::generate(activation, ofm_tensor->handle())); } void KernelGenerator::visit(const ir::operation::Logistic &node) @@ -863,15 +762,15 @@ void KernelGenerator::visit(const ir::operation::Logistic &node) const auto ofm_index{node.getOutputs().at(0)}; const auto ifm_index{node.getInputs().at(ir::operation::Logistic::Input::INPUT)}; - auto ofm_alloc = _tensor_builder->at(ofm_index).get(); - auto ifm_alloc = _tensor_builder->at(ifm_index).get(); + auto ofm_tensor = _tensor_builder->at(ofm_index).get(); + auto ifm_tensor = _tensor_builder->at(ifm_index).get(); const ::arm_compute::ActivationLayerInfo act_info{ ::arm_compute::ActivationLayerInfo::ActivationFunction::LOGISTIC}; auto fn = std::make_unique<::arm_compute::CLActivationLayer>(); - fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), act_info); + fn->configure(ifm_tensor->handle(), ofm_tensor->handle(), act_info); auto acl_fn = asAclClFunction(std::move(fn)); @@ -884,13 +783,13 @@ void KernelGenerator::visit(const ir::operation::LogicalAnd &node) const auto input0_index{node.getInputs().at(ir::operation::LogicalAnd::Input::INPUT0)}; const auto input1_index{node.getInputs().at(ir::operation::LogicalAnd::Input::INPUT1)}; - auto output_alloc = _tensor_builder->at(output_index).get(); - auto input0_alloc = _tensor_builder->at(input0_index).get(); - auto input1_alloc = _tensor_builder->at(input1_index).get(); + auto output_tensor = _tensor_builder->at(output_index).get(); + auto input0_tensor = _tensor_builder->at(input0_index).get(); + auto input1_tensor = _tensor_builder->at(input1_index).get(); auto fn = std::make_unique<::arm_compute::CLBinaryLogicalOp>(); - fn->configure(input0_alloc->handle(), input1_alloc->handle(), output_alloc->handle(), + fn->configure(input0_tensor->handle(), input1_tensor->handle(), output_tensor->handle(), ::arm_compute::BinaryLogicalOperation::AND); auto acl_fn = asAclClFunction(std::move(fn)); @@ -900,159 +799,8 @@ void KernelGenerator::visit(const ir::operation::LogicalAnd &node) void KernelGenerator::visit(const ir::operation::LSTM &node) { - // TODO Support dynamic rnn - // TODO Fix subtle error in the case of non-CIFG, 
non-peephole and No Projection. - const auto scratch_buffer_index{ - node.getOutputs().at(ir::operation::LSTM::Output::SCRATCH_BUFFER)}; - const auto output_state_out_index{ - node.getOutputs().at(ir::operation::LSTM::Output::OUTPUT_STATE_OUT)}; - const auto cell_state_out_index{ - node.getOutputs().at(ir::operation::LSTM::Output::CELL_STATE_OUT)}; - const auto output_index{node.getOutputs().at(ir::operation::LSTM::Output::OUTPUT)}; - - const auto input_index{node.getInputs().at(ir::operation::LSTM::Input::INPUT)}; - const auto input_to_input_weights_index{ - node.getInputs().at(ir::operation::LSTM::Input::INPUT_TO_INPUT_WEIGHTS)}; // optional - const auto input_to_forget_weights_index{ - node.getInputs().at(ir::operation::LSTM::Input::INPUT_TO_FORGET_WEIGHTS)}; - const auto input_to_cell_weights_index{ - node.getInputs().at(ir::operation::LSTM::Input::INPUT_TO_CELL_WEIGHTS)}; - const auto input_to_output_weights_index{ - node.getInputs().at(ir::operation::LSTM::Input::INPUT_TO_OUTPUT_WEIGHTS)}; - const auto recurrent_to_input_weights_index{ - node.getInputs().at(ir::operation::LSTM::Input::RECURRENT_TO_INPUT_WEIGHTS)}; // optional - const auto recurrent_to_forget_weights_index{ - node.getInputs().at(ir::operation::LSTM::Input::RECURRENT_TO_FORGET_WEIGHTS)}; - const auto recurrent_to_cell_weights_index{ - node.getInputs().at(ir::operation::LSTM::Input::RECURRENT_TO_CELL_WEIGHTS)}; - const auto recurrent_to_output_weights_index{ - node.getInputs().at(ir::operation::LSTM::Input::RECURRENT_TO_OUTPUT_WEIGHTS)}; - const auto cell_to_input_weights_index{ - node.getInputs().at(ir::operation::LSTM::Input::CELL_TO_INPUT_WEIGHTS)}; // optional - const auto cell_to_forget_weights_index{ - node.getInputs().at(ir::operation::LSTM::Input::CELL_TO_FORGET_WEIGHTS)}; // optional - const auto cell_to_output_weights_index{ - node.getInputs().at(ir::operation::LSTM::Input::CELL_TO_OUTPUT_WEIGHTS)}; // optional - const auto input_gate_bias_index{ - node.getInputs().at(ir::operation::LSTM::Input::INPUT_GATE_BIAS)}; - const auto forget_gate_bias_index{ - node.getInputs().at(ir::operation::LSTM::Input::FORGET_GATE_BIAS)}; - const auto cell_bias_index{node.getInputs().at(ir::operation::LSTM::Input::CELL_BIAS)}; - const auto output_gate_bias_index{ - node.getInputs().at(ir::operation::LSTM::Input::OUTPUT_GATE_BIAS)}; - const auto projection_weights_index{ - node.getInputs().at(ir::operation::LSTM::Input::PROJECTION_WEIGHTS)}; // optional - const auto projection_bias_index{ - node.getInputs().at(ir::operation::LSTM::Input::PROJECTION_BIAS)}; // optional - const auto output_state_in_index{ - node.getInputs().at(ir::operation::LSTM::Input::OUTPUT_STATE_IN)}; - const auto cell_state_in_index{node.getInputs().at(ir::operation::LSTM::Input::CELL_STATE_IN)}; - const auto cell_threshold = node.param().cell_threshold; - const auto projection_threshold = node.param().projection_threshold; - - bool has_input_to_input_weights = _ctx.at(input_to_input_weights_index).shape().dim(0) != 0 && - _ctx.at(input_to_input_weights_index).shape().dim(1) != 0; - bool has_recurrent_to_input_weights = - _ctx.at(recurrent_to_input_weights_index).shape().dim(0) != 0 && - _ctx.at(recurrent_to_input_weights_index).shape().dim(1) != 0; - bool has_cell_to_forget_weights = _ctx.at(cell_to_forget_weights_index).shape().dim(0) != 0; - bool has_cell_to_output_weights = _ctx.at(cell_to_output_weights_index).shape().dim(0) != 0; - bool has_projection_weights = _ctx.at(projection_weights_index).shape().dim(0) != 0 && - 
_ctx.at(projection_weights_index).shape().dim(1) != 0; - bool has_projection_bias = _ctx.at(projection_bias_index).shape().dim(0); - - // NOTE The input_to_input_weights and the recurrent_to_input_weights do not exist in CIFG. - // true: no CIFG - // false: CIFG - // NOTE The cell_to_input_weights does not exist in non-peephole although regular LSTM(non-CIFG). - bool has_cifg_param = has_input_to_input_weights && has_recurrent_to_input_weights; - - // NOTE The cell_to_forget_weights and the cell_to_output_weights exist in peephole. - // But the cell_to_input_weights does not exist in regular CIFG although peephole. - // true: peephole - // false: no peephole - bool has_peephole_param = has_cell_to_forget_weights && has_cell_to_output_weights; - - // NOTE Although the projection weights has data the projection bias may not have data. - bool has_projection_param = has_projection_weights; - - const auto activation = node.param().activation; - const auto cell_clip = cell_threshold; - const auto projection_clip = projection_threshold; - assert(cell_clip >= 0.f && projection_clip >= 0.f); - - auto scratch_buffer_alloc = _tensor_builder->at(scratch_buffer_index).get(); - auto output_state_out_alloc = _tensor_builder->at(output_state_out_index).get(); - auto cell_state_out_alloc = _tensor_builder->at(cell_state_out_index).get(); - auto output_alloc = _tensor_builder->at(output_index).get(); - - auto input_alloc = _tensor_builder->at(input_index).get(); - - auto input_to_forget_weights_alloc = _tensor_builder->at(input_to_forget_weights_index).get(); - auto input_to_cell_weights_alloc = _tensor_builder->at(input_to_cell_weights_index).get(); - auto input_to_output_weights_alloc = _tensor_builder->at(input_to_output_weights_index).get(); - auto recurrent_to_forget_weights_alloc = - _tensor_builder->at(recurrent_to_forget_weights_index).get(); - auto recurrent_to_cell_weights_alloc = _tensor_builder->at(recurrent_to_cell_weights_index).get(); - auto recurrent_to_output_weights_alloc = - _tensor_builder->at(recurrent_to_output_weights_index).get(); - - auto forget_gate_bias_alloc = _tensor_builder->at(forget_gate_bias_index).get(); - auto cell_bias_alloc = _tensor_builder->at(cell_bias_index).get(); - auto output_gate_bias_alloc = _tensor_builder->at(output_gate_bias_index).get(); - auto output_state_in_alloc = _tensor_builder->at(output_state_in_index).get(); - auto cell_state_in_alloc = _tensor_builder->at(cell_state_in_index).get(); - - auto act_info = ::onert::backend::acl_common::asActivationLayerInfo(activation); - - auto fn = std::make_unique<::arm_compute::CLLSTMLayer>(); - - ::arm_compute::LSTMParams<::arm_compute::ICLTensor> lstm_params{}; - if (has_cifg_param) - { - auto input_to_input_weights_alloc = - _tensor_builder->at(input_to_input_weights_index).get(); // optional - auto recurrent_to_input_weights_alloc = - _tensor_builder->at(recurrent_to_input_weights_index).get(); // optional - auto cell_to_input_weights_handle = - has_peephole_param ? 
_tensor_builder->at(cell_to_input_weights_index).get()->handle() - : nullptr; // optional (non-cifg && peephole) - auto input_gate_bias_alloc = _tensor_builder->at(input_gate_bias_index).get(); // optional - lstm_params.set_cifg_params(input_to_input_weights_alloc->handle(), - recurrent_to_input_weights_alloc->handle(), - cell_to_input_weights_handle, input_gate_bias_alloc->handle()); - } - if (has_peephole_param) - { - auto cell_to_forget_weights_alloc = - _tensor_builder->at(cell_to_forget_weights_index).get(); // optional - auto cell_to_output_weights_alloc = - _tensor_builder->at(cell_to_output_weights_index).get(); // optional - lstm_params.set_peephole_params(cell_to_forget_weights_alloc->handle(), - cell_to_output_weights_alloc->handle()); - } - if (has_projection_param) - { - auto projection_weights_alloc = _tensor_builder->at(projection_weights_index).get(); // optional - auto projection_bias_handle = has_projection_bias - ? _tensor_builder->at(projection_bias_index).get()->handle() - : nullptr; // optional - lstm_params.set_projection_params(projection_weights_alloc->handle(), projection_bias_handle); - } - - fn->configure( - input_alloc->handle(), input_to_forget_weights_alloc->handle(), - input_to_cell_weights_alloc->handle(), input_to_output_weights_alloc->handle(), - recurrent_to_forget_weights_alloc->handle(), recurrent_to_cell_weights_alloc->handle(), - recurrent_to_output_weights_alloc->handle(), forget_gate_bias_alloc->handle(), - cell_bias_alloc->handle(), output_gate_bias_alloc->handle(), output_state_in_alloc->handle(), - cell_state_in_alloc->handle(), scratch_buffer_alloc->handle(), - output_state_out_alloc->handle(), cell_state_out_alloc->handle(), output_alloc->handle(), - lstm_params, act_info, cell_clip, projection_clip); - - auto acl_fn = asAclClFunction(std::move(fn)); - - _return_fn = std::move(acl_fn); + _return_fn = acl_common::kernelGenLSTM<acl_common::AclClFunction, ::arm_compute::ICLTensor, + ::arm_compute::CLLSTMLayer>(node, _ctx, _tensor_builder); } void KernelGenerator::visit(const ir::operation::Comparison &node) @@ -1063,13 +811,13 @@ void KernelGenerator::visit(const ir::operation::Comparison &node) const auto comparison_type = node.param().comparison_type; - auto output_alloc = _tensor_builder->at(output_index).get(); - auto input0_alloc = _tensor_builder->at(input0_index).get(); - auto input1_alloc = _tensor_builder->at(input1_index).get(); + auto output_tensor = _tensor_builder->at(output_index).get(); + auto input0_tensor = _tensor_builder->at(input0_index).get(); + auto input1_tensor = _tensor_builder->at(input1_index).get(); auto fn = std::make_unique<::arm_compute::CLComparison>(); - fn->configure(input0_alloc->handle(), input1_alloc->handle(), output_alloc->handle(), + fn->configure(input0_tensor->handle(), input1_tensor->handle(), output_tensor->handle(), (arm_compute::ComparisonOperation)comparison_type); auto acl_fn = asAclClFunction(std::move(fn)); @@ -1107,13 +855,13 @@ void KernelGenerator::visit(const ir::operation::Pack &node) for (const auto &input_index : input_indexes) { size_t input_rank = _ctx.at(input_index).shape().rank(); - const auto &input_alloc = _tensor_builder->at(input_index); - orig_inputs_acl_tensor_shapes.emplace_back(input_alloc->info()->tensor_shape()); - assert(input_rank == input_alloc->num_dimensions()); - if (input_rank != input_alloc->info()->num_dimensions()) + const auto &input_tensor = _tensor_builder->at(input_index); + orig_inputs_acl_tensor_shapes.emplace_back(input_tensor->info()->tensor_shape()); + 
assert(input_rank == input_tensor->num_dimensions()); + if (input_rank != input_tensor->info()->num_dimensions()) { // This means that high dimension's value is 1 and ifm tensor is applied dim_correction - input_alloc->info()->set_tensor_shape(acl_common::asTensorShape( + input_tensor->info()->set_tensor_shape(acl_common::asTensorShape( _ctx.at(input_index).shape(), _current_op_seq_layout, backend_layout, false)); } } @@ -1135,8 +883,8 @@ void KernelGenerator::visit(const ir::operation::Permute &node) const auto ofm_idx{node.getOutputs().at(0)}; const auto ifm_idx{node.getInputs().at(0)}; const auto permute_type = node.getPermuteType(); - auto ofm_alloc = _tensor_builder->at(ofm_idx).get(); - auto ifm_alloc = _tensor_builder->at(ifm_idx).get(); + auto ofm_tensor = _tensor_builder->at(ofm_idx).get(); + auto ifm_tensor = _tensor_builder->at(ifm_idx).get(); const auto rank = _ctx.at(ofm_idx).shape().rank(); assert(_ctx.at(ifm_idx).shape().rank() == _ctx.at(ofm_idx).shape().rank()); @@ -1149,7 +897,7 @@ void KernelGenerator::visit(const ir::operation::Permute &node) auto l = std::make_unique<::arm_compute::CLPermute>(); - l->configure(ifm_alloc->handle(), ofm_alloc->handle(), pv); + l->configure(ifm_tensor->handle(), ofm_tensor->handle(), pv); fn = std::move(l); } @@ -1160,7 +908,7 @@ void KernelGenerator::visit(const ir::operation::Permute &node) auto l = std::make_unique<::arm_compute::CLPermute>(); - l->configure(ifm_alloc->handle(), ofm_alloc->handle(), pv); + l->configure(ifm_tensor->handle(), ofm_tensor->handle(), pv); fn = std::move(l); } @@ -1168,7 +916,7 @@ void KernelGenerator::visit(const ir::operation::Permute &node) { auto l = std::make_unique<::arm_compute::CLCopy>(); - l->configure(ifm_alloc->handle(), ofm_alloc->handle()); + l->configure(ifm_tensor->handle(), ofm_tensor->handle()); fn = std::move(l); } @@ -1183,12 +931,12 @@ void KernelGenerator::visit(const ir::operation::RSQRT &node) const auto ofm_index{node.getOutputs().at(0)}; const auto ifm_index{node.getInputs().at(ir::operation::RSQRT::Input::INPUT)}; - auto ofm_alloc = _tensor_builder->at(ofm_index).get(); - auto ifm_alloc = _tensor_builder->at(ifm_index).get(); + auto ofm_tensor = _tensor_builder->at(ofm_index).get(); + auto ifm_tensor = _tensor_builder->at(ifm_index).get(); auto fn = std::make_unique<::arm_compute::CLRsqrtLayer>(); - fn->configure(ifm_alloc->handle(), ofm_alloc->handle()); + fn->configure(ifm_tensor->handle(), ofm_tensor->handle()); _return_fn = asAclClFunction(std::move(fn)); } @@ -1198,15 +946,15 @@ void KernelGenerator::visit(const ir::operation::ReLU &node) const auto output_index{node.getOutputs().at(0)}; const auto input_index{node.getInputs().at(ir::operation::ReLU::Input::INPUT)}; - auto output_alloc = _tensor_builder->at(output_index).get(); - auto input_alloc = _tensor_builder->at(input_index).get(); + auto output_tensor = _tensor_builder->at(output_index).get(); + auto input_tensor = _tensor_builder->at(input_index).get(); auto fn = std::make_unique<arm_compute::CLActivationLayer>(); const ::arm_compute::ActivationLayerInfo act_info{ ::arm_compute::ActivationLayerInfo::ActivationFunction::RELU}; - fn->configure(input_alloc->handle(), output_alloc->handle(), act_info); + fn->configure(input_tensor->handle(), output_tensor->handle(), act_info); auto acl_fn = asAclClFunction(std::move(fn)); @@ -1219,12 +967,12 @@ void KernelGenerator::visit(const ir::operation::ResizeBilinear &node) const auto ifm_index{node.getInputs().at(ir::operation::ResizeBilinear::Input::INPUT)}; - auto ofm_alloc = 
_tensor_builder->at(ofm_index).get(); - auto ifm_alloc = _tensor_builder->at(ifm_index).get(); + auto ofm_tensor = _tensor_builder->at(ofm_index).get(); + auto ifm_tensor = _tensor_builder->at(ifm_index).get(); auto fn = std::make_unique<::arm_compute::CLScale>(); - fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), + fn->configure(ifm_tensor->handle(), ofm_tensor->handle(), ::arm_compute::InterpolationPolicy::BILINEAR, ::arm_compute::BorderMode::REPLICATE, ::arm_compute::PixelValue(0.f), ::arm_compute::SamplingPolicy::TOP_LEFT); @@ -1238,15 +986,15 @@ void KernelGenerator::visit(const ir::operation::ReLU1 &node) const auto ofm_index{node.getOutputs().at(0)}; const auto ifm_index{node.getInputs().at(ir::operation::ReLU1::Input::INPUT)}; - auto ofm_alloc = _tensor_builder->at(ofm_index).get(); - auto ifm_alloc = _tensor_builder->at(ifm_index).get(); + auto ofm_tensor = _tensor_builder->at(ofm_index).get(); + auto ifm_tensor = _tensor_builder->at(ifm_index).get(); const ::arm_compute::ActivationLayerInfo act_info{ ::arm_compute::ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 1.0f, -1.0f}; auto fn = std::make_unique<::arm_compute::CLActivationLayer>(); - fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), act_info); + fn->configure(ifm_tensor->handle(), ofm_tensor->handle(), act_info); auto acl_fn = asAclClFunction(std::move(fn)); @@ -1258,15 +1006,15 @@ void KernelGenerator::visit(const ir::operation::ReLU6 &node) const auto ofm_index{node.getOutputs().at(0)}; const auto ifm_index{node.getInputs().at(ir::operation::ReLU6::Input::INPUT)}; - auto ofm_alloc = _tensor_builder->at(ofm_index).get(); - auto ifm_alloc = _tensor_builder->at(ifm_index).get(); + auto ofm_tensor = _tensor_builder->at(ofm_index).get(); + auto ifm_tensor = _tensor_builder->at(ifm_index).get(); const ::arm_compute::ActivationLayerInfo act_info{ ::arm_compute::ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 6.0f}; auto fn = std::make_unique<::arm_compute::CLActivationLayer>(); - fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), act_info); + fn->configure(ifm_tensor->handle(), ofm_tensor->handle(), act_info); auto acl_fn = asAclClFunction(std::move(fn)); @@ -1288,25 +1036,25 @@ void KernelGenerator::visit(const ir::operation::RNN &node) const auto activation = node.param().activation; - auto output_alloc = _tensor_builder->at(output_index).get(); - auto hidden_state_out_alloc = _tensor_builder->at(hidden_state_out_index).get(); + auto output_tensor = _tensor_builder->at(output_index).get(); + auto hidden_state_out_tensor = _tensor_builder->at(hidden_state_out_index).get(); - auto input_alloc = _tensor_builder->at(input_index).get(); - auto weights_alloc = _tensor_builder->at(weights_index).get(); - auto recurrent_weights_alloc = _tensor_builder->at(recurrent_weights_index).get(); - auto bias_alloc = _tensor_builder->at(bias_index).get(); - auto hidden_state_in_alloc = _tensor_builder->at(hidden_state_in_index).get(); + auto input_tensor = _tensor_builder->at(input_index).get(); + auto weights_tensor = _tensor_builder->at(weights_index).get(); + auto recurrent_weights_tensor = _tensor_builder->at(recurrent_weights_index).get(); + auto bias_tensor = _tensor_builder->at(bias_index).get(); + auto hidden_state_in_tensor = _tensor_builder->at(hidden_state_in_index).get(); auto act_info = ::onert::backend::acl_common::asActivationLayerInfo(activation); auto copy_layer = std::make_unique<::arm_compute::CLCopy>(); - copy_layer->configure(hidden_state_in_alloc->handle(), 
hidden_state_out_alloc->handle()); + copy_layer->configure(hidden_state_in_tensor->handle(), hidden_state_out_tensor->handle()); _return_fn = asAclClFunction(std::move(copy_layer)); - auto fn = std::make_unique<::arm_compute::CLRNNLayerEx>( + auto fn = std::make_unique<::arm_compute::CLRNNLayer>( _tensor_builder->acl_tensor_manager()->internal_buffer_manager()); - fn->configure(input_alloc->handle(), weights_alloc->handle(), recurrent_weights_alloc->handle(), - bias_alloc->handle(), hidden_state_out_alloc->handle(), output_alloc->handle(), - act_info); + fn->configure(input_tensor->handle(), weights_tensor->handle(), + recurrent_weights_tensor->handle(), bias_tensor->handle(), + hidden_state_out_tensor->handle(), output_tensor->handle(), act_info); _return_fn = asAclClFunction(std::move(fn)); } @@ -1315,12 +1063,12 @@ void KernelGenerator::visit(const ir::operation::Floor &node) const auto ofm_index{node.getOutputs().at(0)}; const auto ifm_index{node.getInputs().at(ir::operation::Floor::Input::INPUT)}; - auto ofm_alloc = _tensor_builder->at(ofm_index).get(); - auto ifm_alloc = _tensor_builder->at(ifm_index).get(); + auto ofm_tensor = _tensor_builder->at(ofm_index).get(); + auto ifm_tensor = _tensor_builder->at(ifm_index).get(); auto fn = std::make_unique<::arm_compute::CLFloor>(); - fn->configure(ifm_alloc->handle(), ofm_alloc->handle()); + fn->configure(ifm_tensor->handle(), ofm_tensor->handle()); auto acl_fn = asAclClFunction(std::move(fn)); @@ -1335,10 +1083,10 @@ void KernelGenerator::visit(const ir::operation::SpaceToBatchND &node) node.getInputs().at(ir::operation::SpaceToBatchND::Input::BLOCK_SIZE)}; const auto paddings_index{node.getInputs().at(ir::operation::SpaceToBatchND::Input::PADDINGS)}; - auto ofm_alloc = _tensor_builder->at(ofm_index).get(); - auto ifm_alloc = _tensor_builder->at(ifm_index).get(); - auto block_size_alloc = _tensor_builder->at(block_size_index).get(); - auto paddings_alloc = _tensor_builder->at(paddings_index).get(); + auto ofm_tensor = _tensor_builder->at(ofm_index).get(); + auto ifm_tensor = _tensor_builder->at(ifm_index).get(); + auto block_size_tensor = _tensor_builder->at(block_size_index).get(); + auto paddings_tensor = _tensor_builder->at(paddings_index).get(); assert(_ctx.at(block_size_index).data()); assert(_ctx.at(paddings_index).data()); @@ -1346,8 +1094,8 @@ void KernelGenerator::visit(const ir::operation::SpaceToBatchND &node) std::unique_ptr<::arm_compute::IFunction> fn; auto l = std::make_unique<::arm_compute::CLSpaceToBatchLayer>(); - l->configure(ifm_alloc->handle(), block_size_alloc->handle(), paddings_alloc->handle(), - ofm_alloc->handle()); + l->configure(ifm_tensor->handle(), block_size_tensor->handle(), paddings_tensor->handle(), + ofm_tensor->handle()); fn = std::move(l); auto acl_fn = asAclClFunction(std::move(fn)); @@ -1362,12 +1110,12 @@ void KernelGenerator::visit(const ir::operation::SpaceToDepth &node) auto block_size = node.param().block_size; - auto ofm_alloc = _tensor_builder->at(ofm_index).get(); - auto ifm_alloc = _tensor_builder->at(ifm_index).get(); + auto ofm_tensor = _tensor_builder->at(ofm_index).get(); + auto ifm_tensor = _tensor_builder->at(ifm_index).get(); - auto fn = std::make_unique<::arm_compute::CLSpaceToDepth>(); + auto fn = std::make_unique<::arm_compute::CLSpaceToDepthLayer>(); - fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), block_size); + fn->configure(ifm_tensor->handle(), ofm_tensor->handle(), block_size); auto acl_fn = asAclClFunction(std::move(fn)); @@ -1376,32 +1124,15 @@ void 
KernelGenerator::visit(const ir::operation::SpaceToDepth &node) void KernelGenerator::visit(const ir::operation::L2Pool2D &node) { - const auto ofm_index{node.getOutputs().at(0)}; - const auto ifm_index{node.getInputs().at(ir::operation::L2Pool2D::Input::INPUT)}; + auto raw_fn = acl_common::kernelGenPool2D<::arm_compute::CLPoolingLayer>( + node, _ctx, _tensor_builder, _current_op_seq_layout, ::arm_compute::PoolingType::L2); - const auto ifm_shape = _ctx.at(ifm_index).shape().asFeature(_current_op_seq_layout); - const auto ofm_shape = _ctx.at(ofm_index).shape().asFeature(_current_op_seq_layout); - - uint32_t kw = node.param().kw; - uint32_t kh = node.param().kh; - const auto stride = node.param().stride; - const auto padding = - ir::calculatePadding(node.param().padding, ifm_shape, ofm_shape, stride, kw, kh); + const auto ofm_index{node.getOutputs().at(0)}; + auto ofm_tensor = _tensor_builder->at(ofm_index).get(); const auto activation = node.param().activation; - - auto ofm_alloc = _tensor_builder->at(ofm_index).get(); - auto ifm_alloc = _tensor_builder->at(ifm_index).get(); - - ::arm_compute::PoolingLayerInfo info{ - ::arm_compute::PoolingType::L2, ::arm_compute::Size2D{kw, kh}, - ::onert::backend::acl_common::asPadStrideInfo(padding, stride)}; - - auto fn = std::make_unique<::arm_compute::CLPoolingLayer>(); - - fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), info); - _return_fn = std::make_unique<exec::FunctionSequence>( - asAclClFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_alloc->handle())); + asAclClFunction(std::move(raw_fn)), + ActivationBuilder::generate(activation, ofm_tensor->handle())); } void KernelGenerator::visit(const ir::operation::EmbeddingLookup &node) @@ -1410,13 +1141,13 @@ void KernelGenerator::visit(const ir::operation::EmbeddingLookup &node) const auto lookups_index{node.getInputs().at(ir::operation::EmbeddingLookup::Input::LOOKUPS)}; const auto values_index{node.getInputs().at(ir::operation::EmbeddingLookup::Input::VALUES)}; - auto output_alloc = _tensor_builder->at(output_index).get(); - auto lookups_alloc = _tensor_builder->at(lookups_index).get(); - auto values_alloc = _tensor_builder->at(values_index).get(); + auto output_tensor = _tensor_builder->at(output_index).get(); + auto lookups_tensor = _tensor_builder->at(lookups_index).get(); + auto values_tensor = _tensor_builder->at(values_index).get(); auto fn = std::make_unique<::arm_compute::CLEmbeddingLookup>(); - fn->configure(values_alloc->handle(), output_alloc->handle(), lookups_alloc->handle()); + fn->configure(values_tensor->handle(), output_tensor->handle(), lookups_tensor->handle()); auto acl_fn = asAclClFunction(std::move(fn)); @@ -1442,15 +1173,15 @@ void KernelGenerator::visit(const ir::operation::L2Normalization &node) float beta = 0.5f; // pow(reduction, -0.5) = 1 / sqrt(reduction) float bias = 0.0f; // Don't offset the reduction. 
- auto ofm_alloc = _tensor_builder->at(ofm_index).get(); - auto ifm_alloc = _tensor_builder->at(ifm_index).get(); + auto ofm_tensor = _tensor_builder->at(ofm_index).get(); + auto ifm_tensor = _tensor_builder->at(ifm_index).get(); const auto norm_info = ::arm_compute::NormalizationLayerInfo(::arm_compute::NormType::CROSS_MAP, radius, alpha, beta, bias, false); auto fn = std::make_unique<::arm_compute::CLNormalizationLayer>(); - fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), norm_info); + fn->configure(ifm_tensor->handle(), ofm_tensor->handle(), norm_info); auto acl_fn = asAclClFunction(std::move(fn)); @@ -1466,17 +1197,17 @@ void KernelGenerator::visit(const ir::operation::HashtableLookup &node) const auto keys_index{node.getInputs().at(ir::operation::HashtableLookup::Input::KEYS)}; const auto values_index{node.getInputs().at(ir::operation::HashtableLookup::Input::VALUES)}; - auto output_alloc = _tensor_builder->at(output_index).get(); - auto hits_alloc = _tensor_builder->at(hits_index).get(); + auto output_tensor = _tensor_builder->at(output_index).get(); + auto hits_tensor = _tensor_builder->at(hits_index).get(); - auto lookups_alloc = _tensor_builder->at(lookups_index).get(); - auto keys_alloc = _tensor_builder->at(keys_index).get(); - auto values_alloc = _tensor_builder->at(values_index).get(); + auto lookups_tensor = _tensor_builder->at(lookups_index).get(); + auto keys_tensor = _tensor_builder->at(keys_index).get(); + auto values_tensor = _tensor_builder->at(values_index).get(); auto fn = std::make_unique<::arm_compute::CLHashtableLookup>(); - fn->configure(lookups_alloc->handle(), keys_alloc->handle(), values_alloc->handle(), - output_alloc->handle(), hits_alloc->handle()); + fn->configure(lookups_tensor->handle(), keys_tensor->handle(), values_tensor->handle(), + output_tensor->handle(), hits_tensor->handle()); auto acl_fn = asAclClFunction(std::move(fn)); @@ -1489,13 +1220,13 @@ void KernelGenerator::visit(const ir::operation::PReLU &node) const auto ifm_index{node.getInputs().at(ir::operation::PReLU::Input::INPUT)}; const auto alpha_index{node.getInputs().at(ir::operation::PReLU::Input::ALPHA)}; - auto ofm_alloc = _tensor_builder->at(ofm_index).get(); - auto ifm_alloc = _tensor_builder->at(ifm_index).get(); - auto alpha_alloc = _tensor_builder->at(alpha_index).get(); + auto ofm_tensor = _tensor_builder->at(ofm_index).get(); + auto ifm_tensor = _tensor_builder->at(ifm_index).get(); + auto alpha_tensor = _tensor_builder->at(alpha_index).get(); - auto fn = std::make_unique<::arm_compute::CLPReLU>(); + auto fn = std::make_unique<::arm_compute::CLPReluLayer>(); - fn->configure(ifm_alloc->handle(), alpha_alloc->handle(), ofm_alloc->handle()); + fn->configure(ifm_tensor->handle(), alpha_tensor->handle(), ofm_tensor->handle()); auto acl_fn = asAclClFunction(std::move(fn)); @@ -1518,7 +1249,6 @@ void KernelGenerator::visit(const ir::operation::TransposeConv &node) (node.param().padding.type == ir::PaddingType::VALID)); auto padding = ir::calculatePadding(node.param().padding, ofm_shape, ifm_shape, stride, ker_shape.W, ker_shape.H); - uint32_t invalid_horizontal = 0; uint32_t invalid_vertical = 0; if (node.param().padding.type == ir::PaddingType::VALID) @@ -1528,17 +1258,17 @@ void KernelGenerator::visit(const ir::operation::TransposeConv &node) invalid_vertical = ofm_shape.H - (1 + (ifm_shape.H - 1) * stride.vertical) - (ker_shape.H - 1); } - auto ofm_alloc = _tensor_builder->at(ofm_index).get(); - auto ifm_alloc = _tensor_builder->at(ifm_index).get(); - auto ker_alloc = 
_tensor_builder->at(ker_index).get(); + auto ofm_tensor = _tensor_builder->at(ofm_index).get(); + auto ifm_tensor = _tensor_builder->at(ifm_index).get(); + auto ker_tensor = _tensor_builder->at(ker_index).get(); const auto tconv_info = acl_common::asPadStrideInfo(padding, stride); auto fn = std::make_unique<::arm_compute::CLTransposeConvLayer>( _tensor_builder->acl_tensor_manager()->internal_buffer_manager()); - fn->configure(ifm_alloc->handle(), ker_alloc->handle(), nullptr, ofm_alloc->handle(), tconv_info, - invalid_horizontal, invalid_vertical); + fn->configure(ifm_tensor->handle(), ker_tensor->handle(), nullptr, ofm_tensor->handle(), + tconv_info, invalid_horizontal, invalid_vertical); auto acl_fn = asAclClFunction(std::move(fn)); @@ -1550,15 +1280,15 @@ void KernelGenerator::visit(const ir::operation::SQRT &node) const auto output_index{node.getOutputs().at(0)}; const auto input_index{node.getInputs().at(ir::operation::SQRT::Input::INPUT)}; - auto output_alloc = _tensor_builder->at(output_index).get(); - auto input_alloc = _tensor_builder->at(input_index).get(); + auto output_tensor = _tensor_builder->at(output_index).get(); + auto input_tensor = _tensor_builder->at(input_index).get(); const ::arm_compute::ActivationLayerInfo act_info{ ::arm_compute::ActivationLayerInfo::ActivationFunction::SQRT}; auto fn = std::make_unique<::arm_compute::CLActivationLayer>(); - fn->configure(input_alloc->handle(), output_alloc->handle(), act_info); + fn->configure(input_tensor->handle(), output_tensor->handle(), act_info); auto acl_fn = asAclClFunction(std::move(fn)); @@ -1571,13 +1301,13 @@ void KernelGenerator::visit(const ir::operation::LogicalOr &node) const auto input0_index{node.getInputs().at(ir::operation::LogicalOr::Input::INPUT0)}; const auto input1_index{node.getInputs().at(ir::operation::LogicalOr::Input::INPUT1)}; - auto output_alloc = _tensor_builder->at(output_index).get(); - auto input0_alloc = _tensor_builder->at(input0_index).get(); - auto input1_alloc = _tensor_builder->at(input1_index).get(); + auto output_tensor = _tensor_builder->at(output_index).get(); + auto input0_tensor = _tensor_builder->at(input0_index).get(); + auto input1_tensor = _tensor_builder->at(input1_index).get(); auto fn = std::make_unique<::arm_compute::CLBitwiseOr>(); - fn->configure(input0_alloc->handle(), input1_alloc->handle(), output_alloc->handle()); + fn->configure(input0_tensor->handle(), input1_tensor->handle(), output_tensor->handle()); auto acl_fn = asAclClFunction(std::move(fn)); @@ -1589,12 +1319,12 @@ void KernelGenerator::visit(const ir::operation::LogicalNot &node) const auto output_index{node.getOutputs().at(0)}; const auto input_index{node.getInputs().at(ir::operation::LogicalNot::Input::INPUT)}; - auto output_alloc = _tensor_builder->at(output_index).get(); - auto input_alloc = _tensor_builder->at(input_index).get(); + auto output_tensor = _tensor_builder->at(output_index).get(); + auto input_tensor = _tensor_builder->at(input_index).get(); auto fn = std::make_unique<::arm_compute::CLBitwiseNot>(); - fn->configure(input_alloc->handle(), output_alloc->handle()); + fn->configure(input_tensor->handle(), output_tensor->handle()); auto acl_fn = asAclClFunction(std::move(fn)); @@ -1607,13 +1337,13 @@ void KernelGenerator::visit(const ir::operation::SquaredDifference &node) const auto lhs_index{node.getInputs().at(ir::operation::SquaredDifference::Input::LHS)}; const auto rhs_index{node.getInputs().at(ir::operation::SquaredDifference::Input::RHS)}; - auto ofm_alloc = 
_tensor_builder->at(ofm_index).get(); - auto lhs_alloc = _tensor_builder->at(lhs_index).get(); - auto rhs_alloc = _tensor_builder->at(rhs_index).get(); + auto ofm_tensor = _tensor_builder->at(ofm_index).get(); + auto lhs_tensor = _tensor_builder->at(lhs_index).get(); + auto rhs_tensor = _tensor_builder->at(rhs_index).get(); auto fn = std::make_unique<::arm_compute::CLElementwiseSquaredDiff>(); - fn->configure(lhs_alloc->handle(), rhs_alloc->handle(), ofm_alloc->handle()); + fn->configure(lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle()); auto acl_fn = asAclClFunction(std::move(fn)); @@ -1634,13 +1364,13 @@ void KernelGenerator::visit(const ir::operation::TopKV2 &node) const auto k = node.param().k; - auto values_alloc = _tensor_builder->at(outputValues_index).get(); - auto indices_alloc = _tensor_builder->at(outputIndices_index).get(); - auto input_alloc = _tensor_builder->at(inputData_index).get(); + auto values_tensor = _tensor_builder->at(outputValues_index).get(); + auto indices_tensor = _tensor_builder->at(outputIndices_index).get(); + auto input_tensor = _tensor_builder->at(inputData_index).get(); auto fn = std::make_unique<::arm_compute::CLTopKV2>(); - fn->configure(input_alloc->handle(), k, values_alloc->handle(), indices_alloc->handle()); + fn->configure(input_tensor->handle(), k, values_tensor->handle(), indices_tensor->handle()); auto acl_fn = asAclClFunction(std::move(fn)); @@ -1659,9 +1389,9 @@ void KernelGenerator::visit(const ir::operation::Gather &node) const auto axis_value = (axis_raw < 0 ? (ifm_rank + axis_raw) : axis_raw); const int axis = ::onert::backend::acl_common::ToARMComputeAxis(ifm_rank, axis_value).value(); - auto ofm_alloc = _tensor_builder->at(ofm_index).get(); - auto ifm_alloc = _tensor_builder->at(ifm_index).get(); - auto indices_alloc = _tensor_builder->at(indices_index).get(); + auto ofm_tensor = _tensor_builder->at(ofm_index).get(); + auto ifm_tensor = _tensor_builder->at(ifm_index).get(); + auto indices_tensor = _tensor_builder->at(indices_index).get(); // NOTE The frontend layout and backend layout must be the same for this operation. // If not the same, we have to add a stage(?) to perform permutation of output tensor. It @@ -1671,43 +1401,43 @@ void KernelGenerator::visit(const ir::operation::Gather &node) // a model. For example, if a model in NHWC has this operation as output rank == 4, indices // rank == 2 and axis == 2, this operation should work as the axis W and C, but the axis W // and C are not sequential in NCHW. So the backend in NCHW cannot handle this case. 
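The Gather visitor above first wraps a negative frontend axis into [0, ifm_rank) and then converts it into ACL's dimension order, which counts axes from the innermost dimension outward. A rough sketch of that conversion for the case where frontend and backend layouts agree; to_acl_axis is a hypothetical stand-in for acl_common::ToARMComputeAxis:

    #include <cassert>

    // When the frontend and backend layouts are identical, the frontend axis
    // simply maps to the mirrored index, because ACL numbers dimensions from
    // the fastest-moving one.
    int to_acl_axis(int rank, int frontend_axis)
    {
      // Normalize a negative axis first, as the visitor does with axis_raw.
      const int axis = frontend_axis < 0 ? frontend_axis + rank : frontend_axis;
      assert(axis >= 0 && axis < rank);
      return rank - axis - 1;
    }
    // e.g. rank == 4, axis == -1 (last frontend axis) -> to_acl_axis(4, -1) == 0.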
- const auto backend_layout = ofm_alloc->layout(); + const auto backend_layout = ofm_tensor->layout(); UNUSED_RELEASE(backend_layout); - assert(backend_layout == ifm_alloc->layout()); - assert(backend_layout == indices_alloc->layout()); + assert(backend_layout == ifm_tensor->layout()); + assert(backend_layout == indices_tensor->layout()); assert(ifm_rank < 4 || _current_op_seq_layout == backend_layout); auto fn = std::make_unique<::arm_compute::CLGatherEx>(); // input is n-D, indices k-D, output is (n + k - 1)-D size_t n = ifm_rank; - assert(n == ifm_alloc->num_dimensions()); + assert(n == ifm_tensor->num_dimensions()); size_t k = _ctx.at(indices_index).shape().rank(); - assert(k == indices_alloc->num_dimensions()); + assert(k == indices_tensor->num_dimensions()); // Disable applied dim_correction - const auto orig_ifm_acl_tensor_shape = ifm_alloc->info()->tensor_shape(); - if (n != ifm_alloc->info()->num_dimensions()) + const auto orig_ifm_acl_tensor_shape = ifm_tensor->info()->tensor_shape(); + if (n != ifm_tensor->info()->num_dimensions()) { // This means that high dimension's value is 1 and ifm tensor is applied dim_correction const auto ifm = _ctx.at(ifm_index); - ifm_alloc->info()->set_tensor_shape( + ifm_tensor->info()->set_tensor_shape( acl_common::asTensorShape(ifm.shape(), _current_op_seq_layout, backend_layout, false)); } - const auto orig_indice_acl_tensor_shape = indices_alloc->info()->tensor_shape(); - if (k != indices_alloc->info()->num_dimensions()) + const auto orig_indice_acl_tensor_shape = indices_tensor->info()->tensor_shape(); + if (k != indices_tensor->info()->num_dimensions()) { // This means that high dimension's value is 1 and indices tensor is applied dim_correction const auto indices = _ctx.at(indices_index); - indices_alloc->info()->set_tensor_shape( + indices_tensor->info()->set_tensor_shape( acl_common::asTensorShape(indices.shape(), _current_op_seq_layout, backend_layout, false)); } - fn->configure(ifm_alloc->handle(), indices_alloc->handle(), ofm_alloc->handle(), axis); + fn->configure(ifm_tensor->handle(), indices_tensor->handle(), ofm_tensor->handle(), axis); // Revert disabling applied dim_correction - ifm_alloc->info()->set_tensor_shape(orig_ifm_acl_tensor_shape); - indices_alloc->info()->set_tensor_shape(orig_indice_acl_tensor_shape); + ifm_tensor->info()->set_tensor_shape(orig_ifm_acl_tensor_shape); + indices_tensor->info()->set_tensor_shape(orig_indice_acl_tensor_shape); auto acl_fn = asAclClFunction(std::move(fn)); @@ -1719,12 +1449,12 @@ void KernelGenerator::visit(const ir::operation::Neg &node) const auto ofm_index{node.getOutputs().at(0)}; const auto ifm_index{node.getInputs().at(ir::operation::Neg::Input::INPUT)}; - auto ofm_alloc = _tensor_builder->at(ofm_index).get(); - auto ifm_alloc = _tensor_builder->at(ifm_index).get(); + auto ofm_tensor = _tensor_builder->at(ofm_index).get(); + auto ifm_tensor = _tensor_builder->at(ifm_index).get(); auto fn = std::make_unique<::arm_compute::CLNeg>(); - fn->configure(ifm_alloc->handle(), ofm_alloc->handle()); + fn->configure(ifm_tensor->handle(), ofm_tensor->handle()); auto acl_fn = asAclClFunction(std::move(fn)); @@ -1736,15 +1466,15 @@ void KernelGenerator::visit(const ir::operation::Abs &node) const auto output_index{node.getOutputs().at(0)}; const auto input_index{node.getInputs().at(ir::operation::Abs::Input::INPUT)}; - auto output_alloc = _tensor_builder->at(output_index).get(); - auto input_alloc = _tensor_builder->at(input_index).get(); + auto output_tensor = 
_tensor_builder->at(output_index).get(); + auto input_tensor = _tensor_builder->at(input_index).get(); const ::arm_compute::ActivationLayerInfo act_info{ ::arm_compute::ActivationLayerInfo::ActivationFunction::ABS}; auto fn = std::make_unique<::arm_compute::CLActivationLayer>(); - fn->configure(input_alloc->handle(), output_alloc->handle(), act_info); + fn->configure(input_tensor->handle(), output_tensor->handle(), act_info); auto acl_fn = asAclClFunction(std::move(fn)); @@ -1761,11 +1491,11 @@ void KernelGenerator::visit(const ir::operation::ArgMax &node) assert((ifm_shape.rank() - 1) == ofm_shape.rank()); - auto ofm_alloc = _tensor_builder->at(ofm_index).get(); - auto ifm_alloc = _tensor_builder->at(ifm_index).get(); + auto ofm_tensor = _tensor_builder->at(ofm_index).get(); + auto ifm_tensor = _tensor_builder->at(ifm_index).get(); const auto ifm_rank = _ctx.at(ifm_index).shape().rank(); auto frontend_layout = _current_op_seq_layout; - auto backend_layout = ifm_alloc->layout(); + auto backend_layout = ifm_tensor->layout(); int axis_value = node.param().axis; if (axis_value < 0) @@ -1776,10 +1506,10 @@ void KernelGenerator::visit(const ir::operation::ArgMax &node) auto acl_axis = acl_common::ToARMComputeAxis(ifm_rank, axis_value, frontend_layout, backend_layout).value(); - auto fn = std::make_unique<::arm_compute::CLArgOperation>(); + auto fn = std::make_unique<::arm_compute::CLArgMinMaxLayer>(); - fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), {acl_axis}, - ::arm_compute::ArgOperation::MAX); + fn->configure(ifm_tensor->handle(), acl_axis, ofm_tensor->handle(), + ::arm_compute::ReductionOperation::ARG_IDX_MAX); auto acl_fn = asAclClFunction(std::move(fn)); @@ -1791,12 +1521,12 @@ void KernelGenerator::visit(const ir::operation::Dequantize &node) const auto output_index{node.getOutputs().at(0)}; const auto input_index{node.getInputs().at(ir::operation::Dequantize::Input::INPUT)}; - auto output_alloc = _tensor_builder->at(output_index).get(); - auto input_alloc = _tensor_builder->at(input_index).get(); + auto output_tensor = _tensor_builder->at(output_index).get(); + auto input_tensor = _tensor_builder->at(input_index).get(); - auto fn = std::make_unique<::arm_compute::CLCast>(); + auto fn = std::make_unique<::arm_compute::CLDequantizationLayer>(); - fn->configure(input_alloc->handle(), output_alloc->handle(), arm_compute::SubDataType::NONE); + fn->configure(input_tensor->handle(), output_tensor->handle()); auto acl_fn = asAclClFunction(std::move(fn)); @@ -1814,15 +1544,15 @@ void KernelGenerator::visit(const ir::operation::LocalResponseNormalization &nod auto beta = node.param().beta; auto bias = node.param().bias; - auto ofm_alloc = _tensor_builder->at(ofm_index).get(); - auto ifm_alloc = _tensor_builder->at(ifm_index).get(); + auto ofm_tensor = _tensor_builder->at(ofm_index).get(); + auto ifm_tensor = _tensor_builder->at(ifm_index).get(); const auto norm_info = ::arm_compute::NormalizationLayerInfo( ::arm_compute::NormType::CROSS_MAP, radius * 2 + 1, alpha, beta, bias, false); auto fn = std::make_unique<::arm_compute::CLNormalizationLayer>(); - fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), norm_info); + fn->configure(ifm_tensor->handle(), ofm_tensor->handle(), norm_info); auto acl_fn = asAclClFunction(std::move(fn)); @@ -1837,12 +1567,12 @@ void KernelGenerator::visit(const ir::operation::DepthToSpace &node) auto block_size = node.param().block_size; assert(block_size > 0); - auto output_alloc = _tensor_builder->at(output_index).get(); - auto input_alloc = 
_tensor_builder->at(input_index).get(); + auto output_tensor = _tensor_builder->at(output_index).get(); + auto input_tensor = _tensor_builder->at(input_index).get(); - auto fn = std::make_unique<::arm_compute::CLDepthToSpace>(); + auto fn = std::make_unique<::arm_compute::CLDepthToSpaceLayer>(); - fn->configure(input_alloc->handle(), output_alloc->handle(), block_size); + fn->configure(input_tensor->handle(), output_tensor->handle(), block_size); auto acl_fn = asAclClFunction(std::move(fn)); @@ -1860,13 +1590,13 @@ void KernelGenerator::visit(const ir::operation::Split &node) for (const auto &output : node.getOutputs()) output_indexes.emplace_back(output); - auto ifm_alloc = _tensor_builder->at(ifm_index).get(); - std::vector<arm_compute::ICLTensor *> output_allocs; + auto ifm_tensor = _tensor_builder->at(ifm_index).get(); + std::vector<arm_compute::ICLTensor *> output_tensors; for (const auto &ofm_ind : output_indexes) - output_allocs.emplace_back(_tensor_builder->at(ofm_ind).get()->handle()); + output_tensors.emplace_back(_tensor_builder->at(ofm_ind).get()->handle()); const auto frontend_layout = _current_op_seq_layout; - const auto backend_layout = ifm_alloc->layout(); + const auto backend_layout = ifm_tensor->layout(); auto axis = node.param().axis; if (axis < 0) axis += ifm_rank; @@ -1874,7 +1604,7 @@ void KernelGenerator::visit(const ir::operation::Split &node) auto fn = std::make_unique<::arm_compute::CLSplit>(); - fn->configure(ifm_alloc->handle(), output_allocs, axis); + fn->configure(ifm_tensor->handle(), output_tensors, axis); _return_fn = asAclClFunction(std::move(fn)); } @@ -1906,13 +1636,13 @@ void KernelGenerator::visit(const ir::operation::Unpack &node) for (const auto &output_index : output_indexes) { size_t output_rank = _ctx.at(output_index).shape().rank(); - const auto &output_alloc = _tensor_builder->at(output_index); - orig_outputs_acl_tensor_shapes.emplace_back(output_alloc->info()->tensor_shape()); - assert(output_rank == output_alloc->num_dimensions()); - if (output_rank != output_alloc->info()->num_dimensions()) + const auto &output_tensor = _tensor_builder->at(output_index); + orig_outputs_acl_tensor_shapes.emplace_back(output_tensor->info()->tensor_shape()); + assert(output_rank == output_tensor->num_dimensions()); + if (output_rank != output_tensor->info()->num_dimensions()) { // This means that high dimension's value is 1 and ifm tensor is applied dim_correction - output_alloc->info()->set_tensor_shape(acl_common::asTensorShape( + output_tensor->info()->set_tensor_shape(acl_common::asTensorShape( _ctx.at(output_index).shape(), _current_op_seq_layout, backend_layout, false)); } } @@ -1959,12 +1689,12 @@ void KernelGenerator::visit(const ir::operation::Pad &node) // Disable applied dim_correction size_t input_rank = _ctx.at(input_index).shape().rank(); - const auto &input_alloc = _tensor_builder->at(input_index); - assert(input_rank == input_alloc->num_dimensions()); - if (input_rank != input_alloc->info()->num_dimensions()) + const auto &input_tensor = _tensor_builder->at(input_index); + assert(input_rank == input_tensor->num_dimensions()); + if (input_rank != input_tensor->info()->num_dimensions()) { // This means that high dimension's value is 1 and ifm tensor is applied dim_correction - input_alloc->info()->set_tensor_shape(acl_common::asTensorShape( + input_tensor->info()->set_tensor_shape(acl_common::asTensorShape( _ctx.at(input_index).shape(), frontend_layout, backend_layout, false)); } @@ -1982,13 +1712,13 @@ void KernelGenerator::visit(const 
ir::operation::Min &node) const auto lhs_index{node.getInputs().at(ir::operation::Min::Input::LHS)}; const auto rhs_index{node.getInputs().at(ir::operation::Min::Input::RHS)}; - auto ofm_alloc = _tensor_builder->at(ofm_index).get(); - auto lhs_alloc = _tensor_builder->at(lhs_index).get(); - auto rhs_alloc = _tensor_builder->at(rhs_index).get(); + auto ofm_tensor = _tensor_builder->at(ofm_index).get(); + auto lhs_tensor = _tensor_builder->at(lhs_index).get(); + auto rhs_tensor = _tensor_builder->at(rhs_index).get(); auto fn = std::make_unique<::arm_compute::CLElementwiseMin>(); - fn->configure(lhs_alloc->handle(), rhs_alloc->handle(), ofm_alloc->handle()); + fn->configure(lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle()); auto acl_fn = asAclClFunction(std::move(fn)); @@ -2001,13 +1731,13 @@ void KernelGenerator::visit(const ir::operation::Max &node) const auto lhs_index{node.getInputs().at(ir::operation::Max::Input::LHS)}; const auto rhs_index{node.getInputs().at(ir::operation::Max::Input::RHS)}; - auto ofm_alloc = _tensor_builder->at(ofm_index).get(); - auto lhs_alloc = _tensor_builder->at(lhs_index).get(); - auto rhs_alloc = _tensor_builder->at(rhs_index).get(); + auto ofm_tensor = _tensor_builder->at(ofm_index).get(); + auto lhs_tensor = _tensor_builder->at(lhs_index).get(); + auto rhs_tensor = _tensor_builder->at(rhs_index).get(); auto fn = std::make_unique<::arm_compute::CLElementwiseMax>(); - fn->configure(lhs_alloc->handle(), rhs_alloc->handle(), ofm_alloc->handle()); + fn->configure(lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle()); auto acl_fn = asAclClFunction(std::move(fn)); @@ -2019,12 +1749,12 @@ void KernelGenerator::visit(const ir::operation::ConvertFp32ToFp16 &node) const auto ofm_index{node.getOutputs().at(0)}; const auto ifm_index{node.getInputs().at(ir::operation::ConvertFp32ToFp16::Input::INPUT)}; - auto ofm_alloc = _tensor_builder->at(ofm_index).get(); - auto ifm_alloc = _tensor_builder->at(ifm_index).get(); + auto ofm_tensor = _tensor_builder->at(ofm_index).get(); + auto ifm_tensor = _tensor_builder->at(ifm_index).get(); auto fn = std::make_unique<::arm_compute::CLDepthConvertLayer>(); - fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), ::arm_compute::ConvertPolicy::SATURATE, + fn->configure(ifm_tensor->handle(), ofm_tensor->handle(), ::arm_compute::ConvertPolicy::SATURATE, 0); auto acl_fn = asAclClFunction(std::move(fn)); @@ -2037,12 +1767,12 @@ void KernelGenerator::visit(const ir::operation::ConvertFp16ToFp32 &node) const auto ofm_index{node.getOutputs().at(0)}; const auto ifm_index{node.getInputs().at(ir::operation::ConvertFp16ToFp32::Input::INPUT)}; - auto ofm_alloc = _tensor_builder->at(ofm_index).get(); - auto ifm_alloc = _tensor_builder->at(ifm_index).get(); + auto ofm_tensor = _tensor_builder->at(ofm_index).get(); + auto ifm_tensor = _tensor_builder->at(ifm_index).get(); auto fn = std::make_unique<::arm_compute::CLDepthConvertLayer>(); - fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), ::arm_compute::ConvertPolicy::SATURATE, + fn->configure(ifm_tensor->handle(), ofm_tensor->handle(), ::arm_compute::ConvertPolicy::SATURATE, 0); auto acl_fn = asAclClFunction(std::move(fn)); diff --git a/runtime/onert/backend/acl_common/AclKernelGen.h b/runtime/onert/backend/acl_common/AclKernelGen.h new file mode 100644 index 000000000..9f7ce3764 --- /dev/null +++ b/runtime/onert/backend/acl_common/AclKernelGen.h @@ -0,0 +1,315 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. 
All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __ONERT_BACKEND_ACL_COMMON_ACL_KERNEL_GEN_H_ +#define __ONERT_BACKEND_ACL_COMMON_ACL_KERNEL_GEN_H_ + +#include <exec/IFunction.h> +#include <ir/Operands.h> + +#include <ir/operation/LSTM.h> +#include <arm_compute/runtime/CL/CLFunctions.h> + +namespace onert +{ +namespace backend +{ +namespace acl_common +{ + +template <typename T_FunctionWrapper, typename T_Tensor, typename T_ACLLayer, + typename T_TensorBuilder> +std::unique_ptr<exec::IFunction> +kernelGenLSTM(const ir::operation::LSTM &node, const ir::Operands &operands, + const std::shared_ptr<T_TensorBuilder> &tensor_builder) +{ + // TODO Support dynamic rnn + // TODO Fix subtle error in the case of non-CIFG, non-peephole and No Projection. + const auto scratch_buffer_index{ + node.getOutputs().at(ir::operation::LSTM::Output::SCRATCH_BUFFER)}; + const auto output_state_out_index{ + node.getOutputs().at(ir::operation::LSTM::Output::OUTPUT_STATE_OUT)}; + const auto cell_state_out_index{ + node.getOutputs().at(ir::operation::LSTM::Output::CELL_STATE_OUT)}; + const auto output_index{node.getOutputs().at(ir::operation::LSTM::Output::OUTPUT)}; + + const auto input_index{node.getInputs().at(ir::operation::LSTM::Input::INPUT)}; + const auto input_to_input_weights_index{ + node.getInputs().at(ir::operation::LSTM::Input::INPUT_TO_INPUT_WEIGHTS)}; // optional + const auto input_to_forget_weights_index{ + node.getInputs().at(ir::operation::LSTM::Input::INPUT_TO_FORGET_WEIGHTS)}; + const auto input_to_cell_weights_index{ + node.getInputs().at(ir::operation::LSTM::Input::INPUT_TO_CELL_WEIGHTS)}; + const auto input_to_output_weights_index{ + node.getInputs().at(ir::operation::LSTM::Input::INPUT_TO_OUTPUT_WEIGHTS)}; + const auto recurrent_to_input_weights_index{ + node.getInputs().at(ir::operation::LSTM::Input::RECURRENT_TO_INPUT_WEIGHTS)}; // optional + const auto recurrent_to_forget_weights_index{ + node.getInputs().at(ir::operation::LSTM::Input::RECURRENT_TO_FORGET_WEIGHTS)}; + const auto recurrent_to_cell_weights_index{ + node.getInputs().at(ir::operation::LSTM::Input::RECURRENT_TO_CELL_WEIGHTS)}; + const auto recurrent_to_output_weights_index{ + node.getInputs().at(ir::operation::LSTM::Input::RECURRENT_TO_OUTPUT_WEIGHTS)}; + const auto cell_to_input_weights_index{ + node.getInputs().at(ir::operation::LSTM::Input::CELL_TO_INPUT_WEIGHTS)}; // optional + const auto cell_to_forget_weights_index{ + node.getInputs().at(ir::operation::LSTM::Input::CELL_TO_FORGET_WEIGHTS)}; // optional + const auto cell_to_output_weights_index{ + node.getInputs().at(ir::operation::LSTM::Input::CELL_TO_OUTPUT_WEIGHTS)}; // optional + const auto input_gate_bias_index{ + node.getInputs().at(ir::operation::LSTM::Input::INPUT_GATE_BIAS)}; + const auto forget_gate_bias_index{ + node.getInputs().at(ir::operation::LSTM::Input::FORGET_GATE_BIAS)}; + const auto cell_bias_index{node.getInputs().at(ir::operation::LSTM::Input::CELL_BIAS)}; + const auto output_gate_bias_index{ + 
node.getInputs().at(ir::operation::LSTM::Input::OUTPUT_GATE_BIAS)}; + const auto projection_weights_index{ + node.getInputs().at(ir::operation::LSTM::Input::PROJECTION_WEIGHTS)}; // optional + const auto projection_bias_index{ + node.getInputs().at(ir::operation::LSTM::Input::PROJECTION_BIAS)}; // optional + const auto output_state_in_index{ + node.getInputs().at(ir::operation::LSTM::Input::OUTPUT_STATE_IN)}; + const auto cell_state_in_index{node.getInputs().at(ir::operation::LSTM::Input::CELL_STATE_IN)}; + const auto cell_threshold = node.param().cell_threshold; + const auto projection_threshold = node.param().projection_threshold; + + bool has_input_to_input_weights = operands.at(input_to_input_weights_index).shape().dim(0) != 0 && + operands.at(input_to_input_weights_index).shape().dim(1) != 0; + bool has_recurrent_to_input_weights = + operands.at(recurrent_to_input_weights_index).shape().dim(0) != 0 && + operands.at(recurrent_to_input_weights_index).shape().dim(1) != 0; + bool has_cell_to_forget_weights = operands.at(cell_to_forget_weights_index).shape().dim(0) != 0; + bool has_cell_to_output_weights = operands.at(cell_to_output_weights_index).shape().dim(0) != 0; + bool has_projection_weights = operands.at(projection_weights_index).shape().dim(0) != 0 && + operands.at(projection_weights_index).shape().dim(1) != 0; + bool has_projection_bias = operands.at(projection_bias_index).shape().dim(0); + + // NOTE The input_to_input_weights and the recurrent_to_input_weights do not exist in CIFG. + // true: no CIFG + // false: CIFG + // NOTE The cell_to_input_weights does not exist in non-peephole mode, even for a regular (non-CIFG) LSTM. + bool has_cifg_param = has_input_to_input_weights && has_recurrent_to_input_weights; + + // NOTE The cell_to_forget_weights and the cell_to_output_weights exist in peephole. + // But the cell_to_input_weights does not exist in CIFG mode, even when peephole is used. + // true: peephole + // false: no peephole + bool has_peephole_param = has_cell_to_forget_weights && has_cell_to_output_weights; + + // NOTE Although the projection weights have data, the projection bias may not. 
+ bool has_projection_param = has_projection_weights; + + const auto activation = node.param().activation; + const auto cell_clip = cell_threshold; + const auto projection_clip = projection_threshold; + assert(cell_clip >= 0.f && projection_clip >= 0.f); + + auto scratch_buffer_tensor = tensor_builder->at(scratch_buffer_index).get(); + auto output_state_out_tensor = tensor_builder->at(output_state_out_index).get(); + auto cell_state_out_tensor = tensor_builder->at(cell_state_out_index).get(); + auto output_tensor = tensor_builder->at(output_index).get(); + + auto input_tensor = tensor_builder->at(input_index).get(); + + auto input_to_forget_weights_tensor = tensor_builder->at(input_to_forget_weights_index).get(); + auto input_to_cell_weights_tensor = tensor_builder->at(input_to_cell_weights_index).get(); + auto input_to_output_weights_tensor = tensor_builder->at(input_to_output_weights_index).get(); + auto recurrent_to_forget_weights_tensor = + tensor_builder->at(recurrent_to_forget_weights_index).get(); + auto recurrent_to_cell_weights_tensor = tensor_builder->at(recurrent_to_cell_weights_index).get(); + auto recurrent_to_output_weights_tensor = + tensor_builder->at(recurrent_to_output_weights_index).get(); + + auto forget_gate_bias_tensor = tensor_builder->at(forget_gate_bias_index).get(); + auto cell_bias_tensor = tensor_builder->at(cell_bias_index).get(); + auto output_gate_bias_tensor = tensor_builder->at(output_gate_bias_index).get(); + auto output_state_in_tensor = tensor_builder->at(output_state_in_index).get(); + auto cell_state_in_tensor = tensor_builder->at(cell_state_in_index).get(); + + auto act_info = ::onert::backend::acl_common::asActivationLayerInfo(activation); + + auto fn = std::make_unique<T_ACLLayer>(); + + ::arm_compute::LSTMParams<T_Tensor> lstm_params{}; + if (has_cifg_param) + { + auto input_to_input_weights_tensor = + tensor_builder->at(input_to_input_weights_index).get(); // optional + auto recurrent_to_input_weights_tensor = + tensor_builder->at(recurrent_to_input_weights_index).get(); // optional + auto cell_to_input_weights_handle = + has_peephole_param ? tensor_builder->at(cell_to_input_weights_index).get()->handle() + : nullptr; // optional (non-cifg && peephole) + auto input_gate_bias_tensor = tensor_builder->at(input_gate_bias_index).get(); // optional + lstm_params.set_cifg_params(input_to_input_weights_tensor->handle(), + recurrent_to_input_weights_tensor->handle(), + cell_to_input_weights_handle, input_gate_bias_tensor->handle()); + } + if (has_peephole_param) + { + auto cell_to_forget_weights_tensor = + tensor_builder->at(cell_to_forget_weights_index).get(); // optional + auto cell_to_output_weights_tensor = + tensor_builder->at(cell_to_output_weights_index).get(); // optional + lstm_params.set_peephole_params(cell_to_forget_weights_tensor->handle(), + cell_to_output_weights_tensor->handle()); + } + if (has_projection_param) + { + auto projection_weights_tensor = tensor_builder->at(projection_weights_index).get(); // optional + auto projection_bias_handle = has_projection_bias + ? 
tensor_builder->at(projection_bias_index).get()->handle() + : nullptr; // optional + lstm_params.set_projection_params(projection_weights_tensor->handle(), projection_bias_handle); + } + + fn->configure(input_tensor->handle(), input_to_forget_weights_tensor->handle(), + input_to_cell_weights_tensor->handle(), input_to_output_weights_tensor->handle(), + recurrent_to_forget_weights_tensor->handle(), + recurrent_to_cell_weights_tensor->handle(), + recurrent_to_output_weights_tensor->handle(), forget_gate_bias_tensor->handle(), + cell_bias_tensor->handle(), output_gate_bias_tensor->handle(), + output_state_in_tensor->handle(), cell_state_in_tensor->handle(), + scratch_buffer_tensor->handle(), output_state_out_tensor->handle(), + cell_state_out_tensor->handle(), output_tensor->handle(), lstm_params, act_info, + cell_clip, projection_clip); + + return std::make_unique<T_FunctionWrapper>(std::move(fn)); +} + +template <typename T_FunctionWrapper, typename T_Tensor, typename T_ACLLayer, + typename T_TensorBuilder> +std::unique_ptr<exec::IFunction> +kernelGenFullyConnected(const ir::operation::FullyConnected &node, const ir::Operands &operands, + const std::shared_ptr<T_TensorBuilder> &tensor_builder, ir::Layout layout) +{ + using ir::operation::FullyConnected; + + const auto output_index{node.getOutputs().at(0)}; + const auto input_index{node.getInputs().at(FullyConnected::Input::INPUT)}; + const auto weight_index{node.getInputs().at(FullyConnected::Input::WEIGHT)}; + const auto bias_index{node.getInputs().at(FullyConnected::Input::BIAS)}; + + const auto input_rank = operands.at(input_index).shape().rank(); + + const auto output_size = + operands.at(output_index).shape().dim(operands.at(output_index).shape().rank() - 1); + UNUSED_RELEASE(output_size); + assert(operands.at(bias_index).shape().dim(0) == output_size); + assert(operands.at(weight_index).shape().dim(0) == output_size); + const auto batch_size = + operands.at(output_index).shape().dim(operands.at(output_index).shape().rank() - 2); + const auto input_size = + operands.at(weight_index).shape().dim(operands.at(weight_index).shape().rank() - 1); + + // Check for reshaping input's shape into rank-2 + bool needs_reshape = false; + ir::Shape reshape(2); + if (input_rank == 3 || input_rank == 4) + { + const auto &ifm_shape = operands.at(input_index).shape(); + auto feature_size = 1; + for (int i = 0; i < ifm_shape.rank(); ++i) + { + feature_size *= ifm_shape.dim(i); + } + + UNUSED_RELEASE(feature_size); + assert(feature_size == batch_size * input_size); + + // for reshaping + needs_reshape = true; + reshape.dim(0) = batch_size; /* H */ + reshape.dim(1) = input_size; /* W */ + } + + auto output_tensor = tensor_builder->at(output_index).get(); + const auto input_tensor = tensor_builder->at(input_index).get(); + const auto weight_tensor = tensor_builder->at(weight_index).get(); + const auto bias_tensor = tensor_builder->at(bias_index).get(); + const auto frontend_layout = layout; + const auto acl_layout = output_tensor->handle()->info()->data_layout(); + + auto fn = + std::make_unique<T_ACLLayer>(tensor_builder->acl_tensor_manager()->internal_buffer_manager()); + + typename T_ACLLayer::KernelType kernel_type = T_ACLLayer::KernelType::GENERAL; + if (operands.at(weight_index).isConstant()) + { + kernel_type = T_ACLLayer::KernelType::PREPROCESSED_WEIGHTS; + assert(operands.at(weight_index).data()); + } + + fn->configure( + input_tensor->handle(), weight_tensor->handle(), bias_tensor->handle(), + output_tensor->handle(), needs_reshape, + 
::onert::backend::acl_common::asTensorShape( + reshape, frontend_layout, ::onert::backend::acl_common::asRuntimeLayout(acl_layout)), + kernel_type); + + return std::make_unique<T_FunctionWrapper>(std::move(fn)); +} + +template <typename T_ACLLayer, typename T_PoolOp, typename T_TensorBuilder> +std::unique_ptr<::arm_compute::IFunction> +kernelGenPool2D(const T_PoolOp &node, const ir::Operands &operands, + const std::shared_ptr<T_TensorBuilder> &tensor_builder, ir::Layout layout, + ::arm_compute::PoolingType pooling_type) +{ + const auto ofm_index{node.getOutputs().at(0)}; + const auto ifm_index{node.getInputs().at(0)}; + + const auto ofm_shape = operands.at(ofm_index).shape().asFeature(layout); + const auto ifm_shape = operands.at(ifm_index).shape().asFeature(layout); + + const auto kh = node.param().kh; + const auto kw = node.param().kw; + const auto stride = node.param().stride; + const auto padding = + ir::calculatePadding(node.param().padding, ifm_shape, ofm_shape, stride, kw, kh); + + VERBOSE(Pool2DParam) << "IFM_H: " << ifm_shape.H << std::endl; + VERBOSE(Pool2DParam) << "IFM_W: " << ifm_shape.W << std::endl; + VERBOSE(Pool2DParam) << "OFM_H: " << ofm_shape.H << std::endl; + VERBOSE(Pool2DParam) << "OFM_W: " << ofm_shape.W << std::endl; + VERBOSE(Pool2DParam) << "KER_H: " << kh << std::endl; + VERBOSE(Pool2DParam) << "KER_W: " << kw << std::endl; + VERBOSE(Pool2DParam) << "STRIDE_H: " << stride.vertical << std::endl; + VERBOSE(Pool2DParam) << "STRIDE_W: " << stride.horizontal << std::endl; + VERBOSE(Pool2DParam) << "PAD(T): " << padding.top << std::endl; + VERBOSE(Pool2DParam) << "PAD(B): " << padding.bottom << std::endl; + VERBOSE(Pool2DParam) << "PAD(L): " << padding.left << std::endl; + VERBOSE(Pool2DParam) << "PAD(R): " << padding.right << std::endl; + + auto ofm_tensor = tensor_builder->at(ofm_index).get(); + auto ifm_tensor = tensor_builder->at(ifm_index).get(); + + ::arm_compute::PoolingLayerInfo info{ + pooling_type, ::arm_compute::Size2D{kw, kh}, ifm_tensor->info()->data_layout(), + acl_common::asPadStrideInfo(padding, stride), true /* exclude_padding */}; + + auto fn = std::make_unique<T_ACLLayer>(); + + fn->configure(ifm_tensor->handle(), ofm_tensor->handle(), info); + + return fn; +} + +} // namespace acl_common +} // namespace backend +} // namespace onert + +#endif // __ONERT_BACKEND_ACL_COMMON_ACL_KERNEL_GEN_H_ diff --git a/runtime/onert/backend/acl_neon/KernelGenerator.cc b/runtime/onert/backend/acl_neon/KernelGenerator.cc index e47186754..1195b83cc 100644 --- a/runtime/onert/backend/acl_neon/KernelGenerator.cc +++ b/runtime/onert/backend/acl_neon/KernelGenerator.cc @@ -31,6 +31,7 @@ #include "exec/NopFunction.h" #include "util/logging.h" #include "util/Utils.h" +#include "AclKernelGen.h" namespace onert { @@ -74,15 +75,15 @@ void KernelGenerator::visit(const ir::operation::Abs &node) const auto output_index{node.getOutputs().at(0)}; const auto input_index{node.getInputs().at(ir::operation::Abs::Input::INPUT)}; - auto output_alloc = _tensor_builder->at(output_index).get(); - auto input_alloc = _tensor_builder->at(input_index).get(); + auto output_tensor = _tensor_builder->at(output_index).get(); + auto input_tensor = _tensor_builder->at(input_index).get(); const ::arm_compute::ActivationLayerInfo act_info{ ::arm_compute::ActivationLayerInfo::ActivationFunction::ABS}; auto fn = std::make_unique<::arm_compute::NEActivationLayer>(); - fn->configure(input_alloc->handle(), output_alloc->handle(), act_info); + fn->configure(input_tensor->handle(), output_tensor->handle(), 
act_info); auto acl_fn = asAclFunction(std::move(fn)); @@ -96,10 +97,10 @@ void KernelGenerator::visit(const ir::operation::ArgMax &node) const auto ifm_rank = _ctx.at(ifm_index).shape().rank(); - auto ofm_alloc = _tensor_builder->at(ofm_index).get(); - auto ifm_alloc = _tensor_builder->at(ifm_index).get(); + auto ofm_tensor = _tensor_builder->at(ofm_index).get(); + auto ifm_tensor = _tensor_builder->at(ifm_index).get(); auto frontend_layout = _current_op_seq_layout; - auto backend_layout = ifm_alloc->layout(); + auto backend_layout = ifm_tensor->layout(); int axis_value = node.param().axis; if (axis_value < 0) @@ -112,7 +113,7 @@ void KernelGenerator::visit(const ir::operation::ArgMax &node) auto fn = std::make_unique<::arm_compute::NEArgMinMaxLayer>(); - fn->configure(ifm_alloc->handle(), fixed_axis, ofm_alloc->handle(), + fn->configure(ifm_tensor->handle(), fixed_axis, ofm_tensor->handle(), arm_compute::ReductionOperation::ARG_IDX_MAX); auto acl_fn = asAclFunction(std::move(fn)); @@ -127,15 +128,15 @@ void KernelGenerator::visit(const ir::operation::BatchToSpaceND &node) const auto block_size_index{ node.getInputs().at(ir::operation::BatchToSpaceND::Input::BLOCK_SIZE)}; - auto ofm_alloc = _tensor_builder->at(ofm_index).get(); - auto ifm_alloc = _tensor_builder->at(ifm_index).get(); - auto block_size_alloc = _tensor_builder->at(block_size_index).get(); + auto ofm_tensor = _tensor_builder->at(ofm_index).get(); + auto ifm_tensor = _tensor_builder->at(ifm_index).get(); + auto block_size_tensor = _tensor_builder->at(block_size_index).get(); assert(_ctx.at(block_size_index).data()); auto fn = std::make_unique<::arm_compute::NEBatchToSpaceLayer>(); - fn->configure(ifm_alloc->handle(), block_size_alloc->handle(), ofm_alloc->handle()); + fn->configure(ifm_tensor->handle(), block_size_tensor->handle(), ofm_tensor->handle()); auto acl_fn = asAclFunction(std::move(fn)); @@ -147,15 +148,26 @@ void KernelGenerator::visit(const ir::operation::Cast &node) const auto ofm_index{node.getOutputs().at(0)}; const auto ifm_index{node.getInputs().at(ir::operation::Cast::Input::INPUT)}; - auto ofm_alloc = _tensor_builder->at(ofm_index).get(); - auto ifm_alloc = _tensor_builder->at(ifm_index).get(); + auto ofm_tensor = _tensor_builder->at(ofm_index).get(); + auto ifm_tensor = _tensor_builder->at(ifm_index).get(); - auto fn = std::make_unique<::arm_compute::NECast>(); + std::unique_ptr<::arm_compute::IFunction> fn; + if (ifm_tensor->data_type() == ofm_tensor->data_type()) + { + auto l = std::make_unique<::arm_compute::NECopy>(); - auto input_sub_type = _ctx.at(ifm_index).typeInfo().type() == ir::DataType::BOOL8 - ? 
arm_compute::SubDataType::BOOL - : arm_compute::SubDataType::NONE; - fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), input_sub_type); + l->configure(ifm_tensor->handle(), ofm_tensor->handle()); + + fn = std::move(l); + } + else + { + auto l = std::make_unique<::arm_compute::NECast>(); + + l->configure(ifm_tensor->handle(), ofm_tensor->handle(), arm_compute::ConvertPolicy::SATURATE); + + fn = std::move(l); + } auto acl_fn = asAclFunction(std::move(fn)); @@ -183,10 +195,10 @@ void KernelGenerator::visit(const ir::operation::Conv2D &node) ker_width, ker_height); const auto activation = node.param().activation; - auto ofm_alloc = _tensor_builder->at(ofm_index).get(); - auto ifm_alloc = _tensor_builder->at(ifm_index).get(); - auto ker_alloc = _tensor_builder->at(ker_index).get(); - auto bias_alloc = _tensor_builder->at(bias_index).get(); + auto ofm_tensor = _tensor_builder->at(ofm_index).get(); + auto ifm_tensor = _tensor_builder->at(ifm_index).get(); + auto ker_tensor = _tensor_builder->at(ker_index).get(); + auto bias_tensor = _tensor_builder->at(bias_index).get(); const auto conv_info = acl_common::asPadStrideInfo(padding, stride); const auto act_info = acl_common::asActivationLayerInfo(activation); @@ -194,8 +206,9 @@ void KernelGenerator::visit(const ir::operation::Conv2D &node) auto fn = std::make_unique<::arm_compute::NEConvolutionLayer>( _tensor_builder->acl_tensor_manager()->internal_buffer_manager()); - fn->configure(ifm_alloc->handle(), ker_alloc->handle(), bias_alloc->handle(), ofm_alloc->handle(), - conv_info, ::arm_compute::WeightsInfo(), ::arm_compute::Size2D(1U, 1U), act_info); + fn->configure(ifm_tensor->handle(), ker_tensor->handle(), bias_tensor->handle(), + ofm_tensor->handle(), conv_info, ::arm_compute::WeightsInfo(), + ::arm_compute::Size2D(1U, 1U), act_info); _return_fn = asAclFunction(std::move(fn)); } @@ -208,12 +221,12 @@ void KernelGenerator::visit(const ir::operation::DepthToSpace &node) auto block_size = node.param().block_size; assert(block_size > 0); - auto output_alloc = _tensor_builder->at(output_index).get(); - auto input_alloc = _tensor_builder->at(input_index).get(); + auto output_tensor = _tensor_builder->at(output_index).get(); + auto input_tensor = _tensor_builder->at(input_index).get(); - auto fn = std::make_unique<::arm_compute::NEDepthToSpaceLayerEx>(); + auto fn = std::make_unique<::arm_compute::NEDepthToSpaceLayer>(); - fn->configure(input_alloc->handle(), output_alloc->handle(), block_size); + fn->configure(input_tensor->handle(), output_tensor->handle(), block_size); auto acl_fn = asAclFunction(std::move(fn)); @@ -242,10 +255,10 @@ void KernelGenerator::visit(const ir::operation::DepthwiseConv2D &node) const auto multiplier = node.param().multiplier; const auto activation = node.param().activation; - auto ofm_alloc = _tensor_builder->at(ofm_index).get(); - auto ifm_alloc = _tensor_builder->at(ifm_index).get(); - auto ker_alloc = _tensor_builder->at(ker_index).get(); - auto bias_alloc = _tensor_builder->at(bias_index).get(); + auto ofm_tensor = _tensor_builder->at(ofm_index).get(); + auto ifm_tensor = _tensor_builder->at(ifm_index).get(); + auto ker_tensor = _tensor_builder->at(ker_index).get(); + auto bias_tensor = _tensor_builder->at(bias_index).get(); const auto conv_info = acl_common::asPadStrideInfo(padding, stride); const auto act_info = acl_common::asActivationLayerInfo(activation); @@ -253,8 +266,8 @@ void KernelGenerator::visit(const ir::operation::DepthwiseConv2D &node) { auto fn = 
std::make_unique<::arm_compute::NEDepthwiseConvolutionLayer>(); - fn->configure(ifm_alloc->handle(), ker_alloc->handle(), bias_alloc->handle(), - ofm_alloc->handle(), conv_info, multiplier, act_info); + fn->configure(ifm_tensor->handle(), ker_tensor->handle(), bias_tensor->handle(), + ofm_tensor->handle(), conv_info, multiplier, act_info); _return_fn = asAclFunction(std::move(fn)); } @@ -265,12 +278,12 @@ void KernelGenerator::visit(const ir::operation::Dequantize &node) const auto output_index{node.getOutputs().at(0)}; const auto input_index{node.getInputs().at(ir::operation::Dequantize::Input::INPUT)}; - auto output_alloc = _tensor_builder->at(output_index).get(); - auto input_alloc = _tensor_builder->at(input_index).get(); + auto output_tensor = _tensor_builder->at(output_index).get(); + auto input_tensor = _tensor_builder->at(input_index).get(); auto fn = std::make_unique<::arm_compute::NEDequantizationLayer>(); - fn->configure(input_alloc->handle(), output_alloc->handle()); + fn->configure(input_tensor->handle(), output_tensor->handle()); auto acl_fn = asAclFunction(std::move(fn)); @@ -279,88 +292,28 @@ void KernelGenerator::visit(const ir::operation::Dequantize &node) void KernelGenerator::visit(const ir::operation::MaxPool2D &node) { - const auto ofm_index{node.getOutputs().at(0)}; - const auto ifm_index{node.getInputs().at(ir::operation::MaxPool2D::Input::INPUT)}; - - const auto ofm_shape = _ctx.at(ofm_index).shape().asFeature(_current_op_seq_layout); - const auto ifm_shape = _ctx.at(ifm_index).shape().asFeature(_current_op_seq_layout); + auto raw_fn = acl_common::kernelGenPool2D<::arm_compute::NEPoolingLayer>( + node, _ctx, _tensor_builder, _current_op_seq_layout, ::arm_compute::PoolingType::MAX); - const auto kh = node.param().kh; - const auto kw = node.param().kw; - const auto stride = node.param().stride; - const auto padding = - ir::calculatePadding(node.param().padding, ifm_shape, ofm_shape, stride, kw, kh); + const auto ofm_index{node.getOutputs().at(0)}; + auto ofm_tensor = _tensor_builder->at(ofm_index).get(); const auto activation = node.param().activation; - - VERBOSE(MaxPool2D) << "IFM_H: " << ifm_shape.H << std::endl; - VERBOSE(MaxPool2D) << "IFM_W: " << ifm_shape.W << std::endl; - VERBOSE(MaxPool2D) << "OFM_H: " << ofm_shape.H << std::endl; - VERBOSE(MaxPool2D) << "OFM_W: " << ofm_shape.W << std::endl; - VERBOSE(MaxPool2D) << "KER_H: " << kh << std::endl; - VERBOSE(MaxPool2D) << "KER_W: " << kw << std::endl; - VERBOSE(MaxPool2D) << "STRIDE_H: " << stride.vertical << std::endl; - VERBOSE(MaxPool2D) << "STRIDE_W: " << stride.horizontal << std::endl; - VERBOSE(MaxPool2D) << "PAD(T): " << padding.top << std::endl; - VERBOSE(MaxPool2D) << "PAD(B): " << padding.bottom << std::endl; - VERBOSE(MaxPool2D) << "PAD(L): " << padding.left << std::endl; - VERBOSE(MaxPool2D) << "PAD(R): " << padding.right << std::endl; - - auto ofm_alloc = _tensor_builder->at(ofm_index).get(); - auto ifm_alloc = _tensor_builder->at(ifm_index).get(); - - ::arm_compute::PoolingLayerInfo info{::arm_compute::PoolingType::MAX, - ::arm_compute::Size2D{kw, kh}, - acl_common::asPadStrideInfo(padding, stride)}; - - auto fn = std::make_unique<::arm_compute::NEPoolingLayer>(); - - fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), info); - _return_fn = std::make_unique<exec::FunctionSequence>( - asAclFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_alloc->handle())); + asAclFunction(std::move(raw_fn)), + ActivationBuilder::generate(activation, ofm_tensor->handle())); } void 
KernelGenerator::visit(const ir::operation::AvgPool2D &node) { - const auto ofm_index{node.getOutputs().at(0)}; - const auto ifm_index{node.getInputs().at(ir::operation::AvgPool2D::Input::INPUT)}; - - const auto ofm_shape = _ctx.at(ofm_index).shape().asFeature(_current_op_seq_layout); - const auto ifm_shape = _ctx.at(ifm_index).shape().asFeature(_current_op_seq_layout); + auto raw_fn = acl_common::kernelGenPool2D<::arm_compute::NEPoolingLayer>( + node, _ctx, _tensor_builder, _current_op_seq_layout, ::arm_compute::PoolingType::AVG); - const auto kh = node.param().kh; - const auto kw = node.param().kw; - const auto stride = node.param().stride; - const auto padding = - ir::calculatePadding(node.param().padding, ifm_shape, ofm_shape, stride, kw, kh); + const auto ofm_index{node.getOutputs().at(0)}; + auto ofm_tensor = _tensor_builder->at(ofm_index).get(); const auto activation = node.param().activation; - - VERBOSE(AvgPool2D) << "IFM_H: " << ifm_shape.H << std::endl; - VERBOSE(AvgPool2D) << "IFM_W: " << ifm_shape.W << std::endl; - VERBOSE(AvgPool2D) << "OFM_H: " << ofm_shape.H << std::endl; - VERBOSE(AvgPool2D) << "OFM_W: " << ofm_shape.W << std::endl; - VERBOSE(AvgPool2D) << "KER_H: " << kh << std::endl; - VERBOSE(AvgPool2D) << "KER_W: " << kw << std::endl; - VERBOSE(AvgPool2D) << "STRIDE_H: " << stride.vertical << std::endl; - VERBOSE(AvgPool2D) << "STRIDE_W: " << stride.horizontal << std::endl; - VERBOSE(AvgPool2D) << "PAD(T): " << padding.top << std::endl; - VERBOSE(AvgPool2D) << "PAD(B): " << padding.bottom << std::endl; - VERBOSE(AvgPool2D) << "PAD(L): " << padding.left << std::endl; - VERBOSE(AvgPool2D) << "PAD(R): " << padding.right << std::endl; - - auto ofm_alloc = _tensor_builder->at(ofm_index).get(); - auto ifm_alloc = _tensor_builder->at(ifm_index).get(); - - ::arm_compute::PoolingLayerInfo info{ - ::arm_compute::PoolingType::AVG, ::arm_compute::Size2D{kw, kh}, - acl_common::asPadStrideInfo(padding, stride), true /* exclude_padding */}; - - auto fn = std::make_unique<::arm_compute::NEPoolingLayer>(); - - fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), info); - _return_fn = std::make_unique<exec::FunctionSequence>( - asAclFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_alloc->handle())); + asAclFunction(std::move(raw_fn)), + ActivationBuilder::generate(activation, ofm_tensor->handle())); } void KernelGenerator::visit(const ir::operation::Concat &node) @@ -383,7 +336,7 @@ void KernelGenerator::visit(const ir::operation::Concat &node) return; } - auto output_alloc = _tensor_builder->at(ofm_index).get(); + auto output_tensor = _tensor_builder->at(ofm_index).get(); std::vector<::arm_compute::ITensor *> input_tensors; for (const auto &ifm_ind : input_indexes) input_tensors.emplace_back(_tensor_builder->at(ifm_ind)->handle()); @@ -392,7 +345,7 @@ void KernelGenerator::visit(const ir::operation::Concat &node) if (input_indexes.size() < 2) { auto l = std::make_unique<::arm_compute::NECopy>(); - l->configure(input_tensors.at(0), output_alloc->handle()); + l->configure(input_tensors.at(0), output_tensor->handle()); fn = std::move(l); } else @@ -400,10 +353,10 @@ void KernelGenerator::visit(const ir::operation::Concat &node) auto l = std::make_unique<::arm_compute::NEConcatenateLayer>(); const auto rank = _ctx.at(ofm_index).shape().rank(); const auto frontend_layout = _current_op_seq_layout; - const auto backend_layout = output_alloc->layout(); + const auto backend_layout = output_tensor->layout(); const auto fixed_axis = acl_common::ToARMComputeAxis(rank, axis, 
frontend_layout, backend_layout).value(); - l->configure(input_tensors, output_alloc->handle(), fixed_axis); + l->configure(input_tensors, output_tensor->handle(), fixed_axis); fn = std::move(l); } @@ -418,13 +371,13 @@ void KernelGenerator::visit(const ir::operation::EmbeddingLookup &node) const auto lookups_index{node.getInputs().at(ir::operation::EmbeddingLookup::Input::LOOKUPS)}; const auto values_index{node.getInputs().at(ir::operation::EmbeddingLookup::Input::VALUES)}; - auto output_alloc = _tensor_builder->at(output_index).get(); - auto lookups_alloc = _tensor_builder->at(lookups_index).get(); - auto values_alloc = _tensor_builder->at(values_index).get(); + auto output_tensor = _tensor_builder->at(output_index).get(); + auto lookups_tensor = _tensor_builder->at(lookups_index).get(); + auto values_tensor = _tensor_builder->at(values_index).get(); auto fn = std::make_unique<::arm_compute::NEEmbeddingLookup>(); - fn->configure(values_alloc->handle(), output_alloc->handle(), lookups_alloc->handle()); + fn->configure(values_tensor->handle(), output_tensor->handle(), lookups_tensor->handle()); auto acl_fn = asAclFunction(std::move(fn)); @@ -436,12 +389,12 @@ void KernelGenerator::visit(const ir::operation::Floor &node) const auto ofm_index{node.getOutputs().at(0)}; const auto ifm_index{node.getInputs().at(ir::operation::Floor::Input::INPUT)}; - auto ofm_alloc = _tensor_builder->at(ofm_index).get(); - auto ifm_alloc = _tensor_builder->at(ifm_index).get(); + auto ofm_tensor = _tensor_builder->at(ofm_index).get(); + auto ifm_tensor = _tensor_builder->at(ifm_index).get(); auto fn = std::make_unique<::arm_compute::NEFloor>(); - fn->configure(ifm_alloc->handle(), ofm_alloc->handle()); + fn->configure(ifm_tensor->handle(), ofm_tensor->handle()); auto acl_fn = asAclFunction(std::move(fn)); @@ -450,76 +403,15 @@ void KernelGenerator::visit(const ir::operation::Floor &node) void KernelGenerator::visit(const ir::operation::FullyConnected &node) { - using ir::operation::FullyConnected; - const auto output_index{node.getOutputs().at(0)}; - const auto input_index{node.getInputs().at(FullyConnected::Input::INPUT)}; - const auto weight_index{node.getInputs().at(FullyConnected::Input::WEIGHT)}; - const auto bias_index{node.getInputs().at(FullyConnected::Input::BIAS)}; - - const auto input_rank = _ctx.at(input_index).shape().rank(); - - const auto output_size = - _ctx.at(output_index).shape().dim(_ctx.at(output_index).shape().rank() - 1); - UNUSED_RELEASE(output_size); - assert(_ctx.at(bias_index).shape().dim(0) == output_size); - assert(_ctx.at(weight_index).shape().dim(0) == output_size); - const auto batch_size = - _ctx.at(output_index).shape().dim(_ctx.at(output_index).shape().rank() - 2); - const auto input_size = - _ctx.at(weight_index).shape().dim(_ctx.at(weight_index).shape().rank() - 1); - - // Check for reshaping input's shape into rank-2 - bool needs_reshape = false; - ir::Shape reshape(2); - if (input_rank == 3 || input_rank == 4) - { - const auto &ifm_shape = _ctx.at(input_index).shape(); - auto feature_size = 1; - for (int i = 0; i < ifm_shape.rank(); ++i) - { - feature_size *= ifm_shape.dim(i); - } - - UNUSED_RELEASE(feature_size); - assert(feature_size == batch_size * input_size); - - // for reshaping - needs_reshape = true; - reshape.dim(0) = batch_size; /* H */ - reshape.dim(1) = input_size; /* W */ - } - + auto output_tensor = _tensor_builder->at(output_index).get(); const auto activation = node.param().activation; - auto output_alloc = _tensor_builder->at(output_index).get(); - const 
auto input_alloc = _tensor_builder->at(input_index).get(); - const auto weight_alloc = _tensor_builder->at(weight_index).get(); - const auto bias_alloc = _tensor_builder->at(bias_index).get(); - const auto frontend_layout = _current_op_seq_layout; - const auto acl_layout = output_alloc->handle()->info()->data_layout(); - - auto fn = std::make_unique<arm_compute::NEFullyConnectedReshapingLayer>( - _tensor_builder->acl_tensor_manager()->internal_buffer_manager()); - - arm_compute::NEFullyConnectedReshapingLayer::KernelType kernel_type = - arm_compute::NEFullyConnectedReshapingLayer::KernelType::GENERAL; - if (_ctx.at(weight_index).isConstant()) - { - kernel_type = arm_compute::NEFullyConnectedReshapingLayer::KernelType::PREPROCESSED_WEIGHTS; - assert(_ctx.at(weight_index).data()); - } - - fn->configure( - input_alloc->handle(), weight_alloc->handle(), bias_alloc->handle(), output_alloc->handle(), - needs_reshape, - ::onert::backend::acl_common::asTensorShape( - reshape, frontend_layout, ::onert::backend::acl_common::asRuntimeLayout(acl_layout)), - kernel_type); - + auto fn = acl_common::kernelGenFullyConnected<acl_common::AclFunction, ::arm_compute::ITensor, + ::arm_compute::NEFullyConnectedReshapingLayer>( + node, _ctx, _tensor_builder, _current_op_seq_layout); _return_fn = std::make_unique<exec::FunctionSequence>( - asAclFunction(std::move(fn)), - ActivationBuilder::generate(activation, output_alloc->handle())); + std::move(fn), ActivationBuilder::generate(activation, output_tensor->handle())); } void KernelGenerator::visit(const ir::operation::HashtableLookup &node) @@ -531,17 +423,17 @@ void KernelGenerator::visit(const ir::operation::HashtableLookup &node) const auto keys_index{node.getInputs().at(ir::operation::HashtableLookup::Input::KEYS)}; const auto values_index{node.getInputs().at(ir::operation::HashtableLookup::Input::VALUES)}; - auto output_alloc = _tensor_builder->at(output_index).get(); - auto hits_alloc = _tensor_builder->at(hits_index).get(); + auto output_tensor = _tensor_builder->at(output_index).get(); + auto hits_tensor = _tensor_builder->at(hits_index).get(); - auto lookups_alloc = _tensor_builder->at(lookups_index).get(); - auto keys_alloc = _tensor_builder->at(keys_index).get(); - auto values_alloc = _tensor_builder->at(values_index).get(); + auto lookups_tensor = _tensor_builder->at(lookups_index).get(); + auto keys_tensor = _tensor_builder->at(keys_index).get(); + auto values_tensor = _tensor_builder->at(values_index).get(); auto fn = std::make_unique<::arm_compute::NEHashtableLookup>(); - fn->configure(lookups_alloc->handle(), keys_alloc->handle(), values_alloc->handle(), - output_alloc->handle(), hits_alloc->handle()); + fn->configure(lookups_tensor->handle(), keys_tensor->handle(), values_tensor->handle(), + output_tensor->handle(), hits_tensor->handle()); auto acl_fn = asAclFunction(std::move(fn)); @@ -561,10 +453,10 @@ void KernelGenerator::visit(const ir::operation::Gather &node) // Converting in reverse order const int axis = ::onert::backend::acl_common::ToARMComputeAxis(ifm_rank, axis_value).value(); - auto ofm_alloc = _tensor_builder->at(ofm_index).get(); - auto ifm_alloc = _tensor_builder->at(ifm_index).get(); - auto indices_alloc = _tensor_builder->at(indices_index).get(); - const auto backend_layout = ofm_alloc->layout(); + auto ofm_tensor = _tensor_builder->at(ofm_index).get(); + auto ifm_tensor = _tensor_builder->at(ifm_index).get(); + auto indices_tensor = _tensor_builder->at(indices_index).get(); + const auto backend_layout = ofm_tensor->layout(); 
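The asserts that follow encode Gather's rank rule: an n-D input gathered with k-D indices along one axis yields an (n + k - 1)-D output, because the indexed axis is replaced by the indices' shape. A small illustrative sketch (hypothetical helper, frontend dimension order):

    #include <vector>

    // Rank rule asserted by the Gather visitors: the gathered axis of the
    // n-D input is replaced by the k-D shape of the indices, so the output
    // rank is n + k - 1.
    std::vector<int> gather_output_shape(const std::vector<int> &input_shape,
                                         const std::vector<int> &indices_shape,
                                         int axis)
    {
      std::vector<int> out(input_shape.begin(), input_shape.begin() + axis);
      out.insert(out.end(), indices_shape.begin(), indices_shape.end());
      out.insert(out.end(), input_shape.begin() + axis + 1, input_shape.end());
      return out; // e.g. n == 3, k == 2 -> rank-4 output
    }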
UNUSED_RELEASE(backend_layout); // NOTE The frontend layout and backend layout must be the same for this operation. @@ -575,35 +467,35 @@ void KernelGenerator::visit(const ir::operation::Gather &node) // a model. For example, if a model in NHWC has this operation as output rank == 4, indices // rank == 2 and axis == 2, this operation should work as the axis W and C, but the axis W // and C are not sequential in NCHW. So the backend in NCHW cannot handle this case. - assert(backend_layout == ifm_alloc->layout()); - assert(backend_layout == indices_alloc->layout()); + assert(backend_layout == ifm_tensor->layout()); + assert(backend_layout == indices_tensor->layout()); assert(ifm_rank < 4 || _current_op_seq_layout == backend_layout); auto fn = std::make_unique<::arm_compute::NEGatherEx>(); // input is n-D, indices k-D, output is (n + k - 1)-D size_t n = ifm_rank; - assert(n == ifm_alloc->num_dimensions()); + assert(n == ifm_tensor->num_dimensions()); size_t k = _ctx.at(indices_index).shape().rank(); - assert(k == indices_alloc->num_dimensions()); + assert(k == indices_tensor->num_dimensions()); // Disable applied dim_correction - if (n != ifm_alloc->info()->num_dimensions()) + if (n != ifm_tensor->info()->num_dimensions()) { // This means that high dimension's value is 1 and ifm tensor is applied dim_correction const auto ifm = _ctx.at(ifm_index); - ifm_alloc->info()->set_tensor_shape( + ifm_tensor->info()->set_tensor_shape( acl_common::asTensorShape(ifm.shape(), _current_op_seq_layout, backend_layout, false)); } - if (k != indices_alloc->info()->num_dimensions()) + if (k != indices_tensor->info()->num_dimensions()) { // This means that high dimension's value is 1 and indices tensor is applied dim_correction const auto indices = _ctx.at(indices_index); - indices_alloc->info()->set_tensor_shape( + indices_tensor->info()->set_tensor_shape( acl_common::asTensorShape(indices.shape(), _current_op_seq_layout, backend_layout, false)); } - fn->configure(ifm_alloc->handle(), indices_alloc->handle(), ofm_alloc->handle(), axis); + fn->configure(ifm_tensor->handle(), indices_tensor->handle(), ofm_tensor->handle(), axis); // acl_neon does not revert disabling applied dim_correction because acl_neon's kernels would // use arm_compute::TensorInfo::offset_element_in_bytes() @@ -621,20 +513,20 @@ void KernelGenerator::visit(const ir::operation::InstanceNorm &node) const auto gamma_index{node.getInputs().at(ir::operation::InstanceNorm::Input::GAMMA)}; const auto beta_index{node.getInputs().at(ir::operation::InstanceNorm::Input::BETA)}; - auto ofm_alloc = _tensor_builder->at(ofm_index).get(); - auto ifm_alloc = _tensor_builder->at(ifm_index).get(); - auto gamma_alloc = _tensor_builder->at(gamma_index).get(); - auto beta_alloc = _tensor_builder->at(beta_index).get(); + auto ofm_tensor = _tensor_builder->at(ofm_index).get(); + auto ifm_tensor = _tensor_builder->at(ifm_index).get(); + auto gamma_tensor = _tensor_builder->at(gamma_index).get(); + auto beta_tensor = _tensor_builder->at(beta_index).get(); auto epsilon = node.param().epsilon; auto activation = node.param().activation; auto fn = std::make_unique<::arm_compute::NEInstanceNormalizationLayerEx>(); - fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), gamma_alloc->handle(), - beta_alloc->handle(), epsilon); + fn->configure(ifm_tensor->handle(), ofm_tensor->handle(), gamma_tensor->handle(), + beta_tensor->handle(), epsilon); _return_fn = std::make_unique<exec::FunctionSequence>( - asAclFunction(std::move(fn)), ActivationBuilder::generate(activation, 
ofm_alloc->handle())); + asAclFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_tensor->handle())); } void KernelGenerator::visit(const ir::operation::L2Normalization &node) @@ -656,15 +548,15 @@ void KernelGenerator::visit(const ir::operation::L2Normalization &node) float beta = 0.5f; // pow(reduction, -0.5) = 1 / sqrt(reduction) float bias = 0.0f; // Don't offset the reduction. - auto ofm_alloc = _tensor_builder->at(ofm_index).get(); - auto ifm_alloc = _tensor_builder->at(ifm_index).get(); + auto ofm_tensor = _tensor_builder->at(ofm_index).get(); + auto ifm_tensor = _tensor_builder->at(ifm_index).get(); const auto norm_info = ::arm_compute::NormalizationLayerInfo(::arm_compute::NormType::CROSS_MAP, radius, alpha, beta, bias, false); auto fn = std::make_unique<::arm_compute::NENormalizationLayer>(); - fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), norm_info); + fn->configure(ifm_tensor->handle(), ofm_tensor->handle(), norm_info); auto acl_fn = asAclFunction(std::move(fn)); @@ -673,32 +565,15 @@ void KernelGenerator::visit(const ir::operation::L2Normalization &node) void KernelGenerator::visit(const ir::operation::L2Pool2D &node) { - const auto ofm_index{node.getOutputs().at(0)}; - const auto ifm_index{node.getInputs().at(ir::operation::L2Pool2D::Input::INPUT)}; - - const auto ifm_shape = _ctx.at(ifm_index).shape().asFeature(_current_op_seq_layout); - const auto ofm_shape = _ctx.at(ofm_index).shape().asFeature(_current_op_seq_layout); + auto raw_fn = acl_common::kernelGenPool2D<::arm_compute::NEPoolingLayer>( + node, _ctx, _tensor_builder, _current_op_seq_layout, ::arm_compute::PoolingType::L2); - uint32_t kw = node.param().kw; - uint32_t kh = node.param().kh; - const auto stride = node.param().stride; - const auto padding = - ir::calculatePadding(node.param().padding, ifm_shape, ofm_shape, stride, kw, kh); + const auto ofm_index{node.getOutputs().at(0)}; + auto ofm_tensor = _tensor_builder->at(ofm_index).get(); const auto activation = node.param().activation; - - auto ofm_alloc = _tensor_builder->at(ofm_index).get(); - auto ifm_alloc = _tensor_builder->at(ifm_index).get(); - - ::arm_compute::PoolingLayerInfo info{ - ::arm_compute::PoolingType::L2, ::arm_compute::Size2D{kw, kh}, - ::onert::backend::acl_common::asPadStrideInfo(padding, stride)}; - - auto fn = std::make_unique<::arm_compute::NEPoolingLayer>(); - - fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), info); - _return_fn = std::make_unique<exec::FunctionSequence>( - asAclFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_alloc->handle())); + asAclFunction(std::move(raw_fn)), + ActivationBuilder::generate(activation, ofm_tensor->handle())); } void KernelGenerator::visit(const ir::operation::LocalResponseNormalization &node) @@ -712,15 +587,15 @@ void KernelGenerator::visit(const ir::operation::LocalResponseNormalization &nod auto beta = node.param().beta; auto bias = node.param().bias; - auto ofm_alloc = _tensor_builder->at(ofm_index).get(); - auto ifm_alloc = _tensor_builder->at(ifm_index).get(); + auto ofm_tensor = _tensor_builder->at(ofm_index).get(); + auto ifm_tensor = _tensor_builder->at(ifm_index).get(); const auto norm_info = ::arm_compute::NormalizationLayerInfo( ::arm_compute::NormType::CROSS_MAP, radius * 2 + 1, alpha, beta, bias, false); auto fn = std::make_unique<::arm_compute::NENormalizationLayer>(); - fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), norm_info); + fn->configure(ifm_tensor->handle(), ofm_tensor->handle(), norm_info); auto acl_fn = 
asAclFunction(std::move(fn)); @@ -733,13 +608,13 @@ void KernelGenerator::visit(const ir::operation::LogicalAnd &node) const auto input0_index{node.getInputs().at(ir::operation::LogicalAnd::Input::INPUT0)}; const auto input1_index{node.getInputs().at(ir::operation::LogicalAnd::Input::INPUT1)}; - auto output_alloc = _tensor_builder->at(output_index).get(); - auto input0_alloc = _tensor_builder->at(input0_index).get(); - auto input1_alloc = _tensor_builder->at(input1_index).get(); + auto output_tensor = _tensor_builder->at(output_index).get(); + auto input0_tensor = _tensor_builder->at(input0_index).get(); + auto input1_tensor = _tensor_builder->at(input1_index).get(); auto fn = std::make_unique<::arm_compute::NELogicalAnd>(); - fn->configure(input0_alloc->handle(), input1_alloc->handle(), output_alloc->handle()); + fn->configure(input0_tensor->handle(), input1_tensor->handle(), output_tensor->handle()); auto acl_fn = asAclFunction(std::move(fn)); @@ -751,12 +626,12 @@ void KernelGenerator::visit(const ir::operation::LogicalNot &node) const auto output_index{node.getOutputs().at(0)}; const auto input_index{node.getInputs().at(ir::operation::LogicalNot::Input::INPUT)}; - auto output_alloc = _tensor_builder->at(output_index).get(); - auto input_alloc = _tensor_builder->at(input_index).get(); + auto output_tensor = _tensor_builder->at(output_index).get(); + auto input_tensor = _tensor_builder->at(input_index).get(); auto fn = std::make_unique<::arm_compute::NEBitwiseNot>(); - fn->configure(input_alloc->handle(), output_alloc->handle()); + fn->configure(input_tensor->handle(), output_tensor->handle()); auto acl_fn = asAclFunction(std::move(fn)); @@ -769,13 +644,13 @@ void KernelGenerator::visit(const ir::operation::LogicalOr &node) const auto input0_index{node.getInputs().at(ir::operation::LogicalOr::Input::INPUT0)}; const auto input1_index{node.getInputs().at(ir::operation::LogicalOr::Input::INPUT1)}; - auto output_alloc = _tensor_builder->at(output_index).get(); - auto input0_alloc = _tensor_builder->at(input0_index).get(); - auto input1_alloc = _tensor_builder->at(input1_index).get(); + auto output_tensor = _tensor_builder->at(output_index).get(); + auto input0_tensor = _tensor_builder->at(input0_index).get(); + auto input1_tensor = _tensor_builder->at(input1_index).get(); auto fn = std::make_unique<::arm_compute::NELogicalOr>(); - fn->configure(input0_alloc->handle(), input1_alloc->handle(), output_alloc->handle()); + fn->configure(input0_tensor->handle(), input1_tensor->handle(), output_tensor->handle()); auto acl_fn = asAclFunction(std::move(fn)); @@ -787,8 +662,8 @@ void KernelGenerator::visit(const ir::operation::Logistic &node) const auto ofm_index{node.getOutputs().at(0)}; const auto ifm_index{node.getInputs().at(ir::operation::Logistic::Input::INPUT)}; - auto ofm_alloc = _tensor_builder->at(ofm_index).get(); - auto ifm_alloc = _tensor_builder->at(ifm_index).get(); + auto ofm_tensor = _tensor_builder->at(ofm_index).get(); + auto ifm_tensor = _tensor_builder->at(ifm_index).get(); const ::arm_compute::ActivationLayerInfo act_info{ ::arm_compute::ActivationLayerInfo::ActivationFunction::LOGISTIC}; @@ -798,7 +673,7 @@ void KernelGenerator::visit(const ir::operation::Logistic &node) // instead of 'INF', and then the result of this op will be errors due to the 'NaN'. 
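The NaN caveat noted above is presumably why acl_neon reaches for NEActivationLayerEx here rather than the stock NEActivationLayer used for the other activations. As a rough illustration of the saturating behavior such a kernel needs, here is a standalone C++ sketch; it is not the ACL implementation, and the clamp threshold kCap is an assumption chosen only so that exp() can never overflow:

#include <cmath>

// Logistic(x) = 1 / (1 + e^(-x)). A quantized QASYMM8 path can hit its
// input-range boundary and propagate NaN through intermediate values, so a
// robust kernel saturates first. kCap is illustrative: beyond roughly 30, a
// float sigmoid is already 0 or 1 to within representable precision.
float logistic_saturating(float x)
{
  const float kCap = 30.0f;
  if (x >= kCap)
    return 1.0f;
  if (x <= -kCap)
    return 0.0f;
  return 1.0f / (1.0f + std::exp(-x));
}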
auto fn = std::make_unique<::arm_compute::NEActivationLayerEx>(); - fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), act_info); + fn->configure(ifm_tensor->handle(), ofm_tensor->handle(), act_info); auto acl_fn = asAclFunction(std::move(fn)); @@ -807,159 +682,8 @@ void KernelGenerator::visit(const ir::operation::Logistic &node) void KernelGenerator::visit(const ir::operation::LSTM &node) { - // TODO Support dynamic rnn - // TODO Fix subtle error in the case of non-CIFG, non-peephole and No Projection. - const auto scratch_buffer_index{ - node.getOutputs().at(ir::operation::LSTM::Output::SCRATCH_BUFFER)}; - const auto output_state_out_index{ - node.getOutputs().at(ir::operation::LSTM::Output::OUTPUT_STATE_OUT)}; - const auto cell_state_out_index{ - node.getOutputs().at(ir::operation::LSTM::Output::CELL_STATE_OUT)}; - const auto output_index{node.getOutputs().at(ir::operation::LSTM::Output::OUTPUT)}; - - const auto input_index{node.getInputs().at(ir::operation::LSTM::Input::INPUT)}; - const auto input_to_input_weights_index{ - node.getInputs().at(ir::operation::LSTM::Input::INPUT_TO_INPUT_WEIGHTS)}; // optional - const auto input_to_forget_weights_index{ - node.getInputs().at(ir::operation::LSTM::Input::INPUT_TO_FORGET_WEIGHTS)}; - const auto input_to_cell_weights_index{ - node.getInputs().at(ir::operation::LSTM::Input::INPUT_TO_CELL_WEIGHTS)}; - const auto input_to_output_weights_index{ - node.getInputs().at(ir::operation::LSTM::Input::INPUT_TO_OUTPUT_WEIGHTS)}; - const auto recurrent_to_input_weights_index{ - node.getInputs().at(ir::operation::LSTM::Input::RECURRENT_TO_INPUT_WEIGHTS)}; // optional - const auto recurrent_to_forget_weights_index{ - node.getInputs().at(ir::operation::LSTM::Input::RECURRENT_TO_FORGET_WEIGHTS)}; - const auto recurrent_to_cell_weights_index{ - node.getInputs().at(ir::operation::LSTM::Input::RECURRENT_TO_CELL_WEIGHTS)}; - const auto recurrent_to_output_weights_index{ - node.getInputs().at(ir::operation::LSTM::Input::RECURRENT_TO_OUTPUT_WEIGHTS)}; - const auto cell_to_input_weights_index{ - node.getInputs().at(ir::operation::LSTM::Input::CELL_TO_INPUT_WEIGHTS)}; // optional - const auto cell_to_forget_weights_index{ - node.getInputs().at(ir::operation::LSTM::Input::CELL_TO_FORGET_WEIGHTS)}; // optional - const auto cell_to_output_weights_index{ - node.getInputs().at(ir::operation::LSTM::Input::CELL_TO_OUTPUT_WEIGHTS)}; // optional - const auto input_gate_bias_index{ - node.getInputs().at(ir::operation::LSTM::Input::INPUT_GATE_BIAS)}; - const auto forget_gate_bias_index{ - node.getInputs().at(ir::operation::LSTM::Input::FORGET_GATE_BIAS)}; - const auto cell_bias_index{node.getInputs().at(ir::operation::LSTM::Input::CELL_BIAS)}; - const auto output_gate_bias_index{ - node.getInputs().at(ir::operation::LSTM::Input::OUTPUT_GATE_BIAS)}; - const auto projection_weights_index{ - node.getInputs().at(ir::operation::LSTM::Input::PROJECTION_WEIGHTS)}; // optional - const auto projection_bias_index{ - node.getInputs().at(ir::operation::LSTM::Input::PROJECTION_BIAS)}; // optional - const auto output_state_in_index{ - node.getInputs().at(ir::operation::LSTM::Input::OUTPUT_STATE_IN)}; - const auto cell_state_in_index{node.getInputs().at(ir::operation::LSTM::Input::CELL_STATE_IN)}; - const auto cell_threshold = node.param().cell_threshold; - const auto projection_threshold = node.param().projection_threshold; - - bool has_input_to_input_weights = _ctx.at(input_to_input_weights_index).shape().dim(0) != 0 && - _ctx.at(input_to_input_weights_index).shape().dim(1) != 0; - bool 
has_recurrent_to_input_weights = - _ctx.at(recurrent_to_input_weights_index).shape().dim(0) != 0 && - _ctx.at(recurrent_to_input_weights_index).shape().dim(1) != 0; - bool has_cell_to_forget_weights = _ctx.at(cell_to_forget_weights_index).shape().dim(0) != 0; - bool has_cell_to_output_weights = _ctx.at(cell_to_output_weights_index).shape().dim(0) != 0; - bool has_projection_weights = _ctx.at(projection_weights_index).shape().dim(0) != 0 && - _ctx.at(projection_weights_index).shape().dim(1) != 0; - bool has_projection_bias = _ctx.at(projection_bias_index).shape().dim(0); - - // NOTE The input_to_input_weights and the recurrent_to_input_weights do not exist in CIFG. - // true: no CIFG - // false: CIFG - // NOTE The cell_to_input_weights does not exist in non-peephole although regular LSTM(non-CIFG). - bool has_cifg_param = has_input_to_input_weights && has_recurrent_to_input_weights; - - // NOTE The cell_to_forget_weights and the cell_to_output_weights exist in peephole. - // But the cell_to_input_weights does not exist in regular CIFG although peephole. - // true: peephole - // false: no peephole - bool has_peephole_param = has_cell_to_forget_weights && has_cell_to_output_weights; - - // NOTE Although the projection weights has data the projection bias may not have data. - bool has_projection_param = has_projection_weights; - - const auto activation = node.param().activation; - const auto cell_clip = cell_threshold; - const auto projection_clip = projection_threshold; - assert(cell_clip >= 0.f && projection_clip >= 0.f); - - auto scratch_buffer_alloc = _tensor_builder->at(scratch_buffer_index).get(); - auto output_state_out_alloc = _tensor_builder->at(output_state_out_index).get(); - auto cell_state_out_alloc = _tensor_builder->at(cell_state_out_index).get(); - auto output_alloc = _tensor_builder->at(output_index).get(); - - auto input_alloc = _tensor_builder->at(input_index).get(); - - auto input_to_forget_weights_alloc = _tensor_builder->at(input_to_forget_weights_index).get(); - auto input_to_cell_weights_alloc = _tensor_builder->at(input_to_cell_weights_index).get(); - auto input_to_output_weights_alloc = _tensor_builder->at(input_to_output_weights_index).get(); - auto recurrent_to_forget_weights_alloc = - _tensor_builder->at(recurrent_to_forget_weights_index).get(); - auto recurrent_to_cell_weights_alloc = _tensor_builder->at(recurrent_to_cell_weights_index).get(); - auto recurrent_to_output_weights_alloc = - _tensor_builder->at(recurrent_to_output_weights_index).get(); - - auto forget_gate_bias_alloc = _tensor_builder->at(forget_gate_bias_index).get(); - auto cell_bias_alloc = _tensor_builder->at(cell_bias_index).get(); - auto output_gate_bias_alloc = _tensor_builder->at(output_gate_bias_index).get(); - auto output_state_in_alloc = _tensor_builder->at(output_state_in_index).get(); - auto cell_state_in_alloc = _tensor_builder->at(cell_state_in_index).get(); - - auto act_info = ::onert::backend::acl_common::asActivationLayerInfo(activation); - - auto fn = std::make_unique<::arm_compute::NELSTMLayer>(); - - ::arm_compute::LSTMParams<::arm_compute::ITensor> lstm_params{}; - if (has_cifg_param) - { - auto input_to_input_weights_alloc = - _tensor_builder->at(input_to_input_weights_index).get(); // optional - auto recurrent_to_input_weights_alloc = - _tensor_builder->at(recurrent_to_input_weights_index).get(); // optional - auto cell_to_input_weights_handle = - has_peephole_param ? 
_tensor_builder->at(cell_to_input_weights_index).get()->handle() - : nullptr; // optional (non-cifg && peephole) - auto input_gate_bias_alloc = _tensor_builder->at(input_gate_bias_index).get(); // optional - lstm_params.set_cifg_params(input_to_input_weights_alloc->handle(), - recurrent_to_input_weights_alloc->handle(), - cell_to_input_weights_handle, input_gate_bias_alloc->handle()); - } - if (has_peephole_param) - { - auto cell_to_forget_weights_alloc = - _tensor_builder->at(cell_to_forget_weights_index).get(); // optional - auto cell_to_output_weights_alloc = - _tensor_builder->at(cell_to_output_weights_index).get(); // optional - lstm_params.set_peephole_params(cell_to_forget_weights_alloc->handle(), - cell_to_output_weights_alloc->handle()); - } - if (has_projection_param) - { - auto projection_weights_alloc = _tensor_builder->at(projection_weights_index).get(); // optional - auto projection_bias_handle = has_projection_bias - ? _tensor_builder->at(projection_bias_index).get()->handle() - : nullptr; // optional - lstm_params.set_projection_params(projection_weights_alloc->handle(), projection_bias_handle); - } - - fn->configure( - input_alloc->handle(), input_to_forget_weights_alloc->handle(), - input_to_cell_weights_alloc->handle(), input_to_output_weights_alloc->handle(), - recurrent_to_forget_weights_alloc->handle(), recurrent_to_cell_weights_alloc->handle(), - recurrent_to_output_weights_alloc->handle(), forget_gate_bias_alloc->handle(), - cell_bias_alloc->handle(), output_gate_bias_alloc->handle(), output_state_in_alloc->handle(), - cell_state_in_alloc->handle(), scratch_buffer_alloc->handle(), - output_state_out_alloc->handle(), cell_state_out_alloc->handle(), output_alloc->handle(), - lstm_params, act_info, cell_clip, projection_clip); - - auto acl_fn = asAclFunction(std::move(fn)); - - _return_fn = std::move(acl_fn); + _return_fn = acl_common::kernelGenLSTM<acl_common::AclFunction, ::arm_compute::ITensor, + ::arm_compute::NELSTMLayer>(node, _ctx, _tensor_builder); } void KernelGenerator::visit(const ir::operation::Mul &node) @@ -970,18 +694,18 @@ void KernelGenerator::visit(const ir::operation::Mul &node) const auto activation = node.param().activation; - auto ofm_alloc = _tensor_builder->at(ofm_index).get(); - auto lhs_alloc = _tensor_builder->at(lhs_index).get(); - auto rhs_alloc = _tensor_builder->at(rhs_index).get(); + auto ofm_tensor = _tensor_builder->at(ofm_index).get(); + auto lhs_tensor = _tensor_builder->at(lhs_index).get(); + auto rhs_tensor = _tensor_builder->at(rhs_index).get(); auto fn = std::make_unique<::arm_compute::NEPixelWiseMultiplication>(); // RoundingPolicy for scale:1.0 is only allowed RoundingPolicy::TO_ZERO - fn->configure(lhs_alloc->handle(), rhs_alloc->handle(), ofm_alloc->handle(), 1.0, // scale + fn->configure(lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle(), 1.0, // scale arm_compute::ConvertPolicy::SATURATE, arm_compute::RoundingPolicy::TO_ZERO); _return_fn = std::make_unique<exec::FunctionSequence>( - asAclFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_alloc->handle())); + asAclFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_tensor->handle())); } void KernelGenerator::visit(const ir::operation::Neg &node) @@ -989,12 +713,12 @@ void KernelGenerator::visit(const ir::operation::Neg &node) const auto ofm_index{node.getOutputs().at(0)}; const auto ifm_index{node.getInputs().at(ir::operation::Neg::Input::INPUT)}; - auto ofm_alloc = _tensor_builder->at(ofm_index).get(); - auto ifm_alloc = 
_tensor_builder->at(ifm_index).get(); + auto ofm_tensor = _tensor_builder->at(ofm_index).get(); + auto ifm_tensor = _tensor_builder->at(ifm_index).get(); auto fn = std::make_unique<::arm_compute::NENegLayer>(); - fn->configure(ifm_alloc->handle(), ofm_alloc->handle()); + fn->configure(ifm_tensor->handle(), ofm_tensor->handle()); auto acl_fn = asAclFunction(std::move(fn)); @@ -1030,12 +754,12 @@ void KernelGenerator::visit(const ir::operation::Pack &node) for (const auto &input_index : input_indexes) { size_t input_rank = _ctx.at(input_index).shape().rank(); - const auto &input_alloc = _tensor_builder->at(input_index); - assert(input_rank == input_alloc->num_dimensions()); - if (input_rank != input_alloc->info()->num_dimensions()) + const auto &input_tensor = _tensor_builder->at(input_index); + assert(input_rank == input_tensor->num_dimensions()); + if (input_rank != input_tensor->info()->num_dimensions()) { // This means that high dimension's value is 1 and ifm tensor is applied dim_correction - input_alloc->info()->set_tensor_shape(acl_common::asTensorShape( + input_tensor->info()->set_tensor_shape(acl_common::asTensorShape( _ctx.at(input_index).shape(), _current_op_seq_layout, backend_layout, false)); } } @@ -1094,8 +818,8 @@ void KernelGenerator::visit(const ir::operation::Permute &node) const auto ofm_idx{node.getOutputs().at(0)}; const auto ifm_idx{node.getInputs().at(0)}; const auto permute_type = node.getPermuteType(); - auto ofm_alloc = _tensor_builder->at(ofm_idx).get(); - auto ifm_alloc = _tensor_builder->at(ifm_idx).get(); + auto ofm_tensor = _tensor_builder->at(ofm_idx).get(); + auto ifm_tensor = _tensor_builder->at(ifm_idx).get(); const auto rank = _ctx.at(ofm_idx).shape().rank(); assert(_ctx.at(ifm_idx).shape().rank() == _ctx.at(ofm_idx).shape().rank()); @@ -1108,7 +832,7 @@ void KernelGenerator::visit(const ir::operation::Permute &node) auto l = std::make_unique<::arm_compute::NEPermute>(); - l->configure(ifm_alloc->handle(), ofm_alloc->handle(), pv); + l->configure(ifm_tensor->handle(), ofm_tensor->handle(), pv); fn = std::move(l); } @@ -1119,7 +843,7 @@ void KernelGenerator::visit(const ir::operation::Permute &node) auto l = std::make_unique<::arm_compute::NEPermute>(); - l->configure(ifm_alloc->handle(), ofm_alloc->handle(), pv); + l->configure(ifm_tensor->handle(), ofm_tensor->handle(), pv); fn = std::move(l); } @@ -1127,7 +851,7 @@ void KernelGenerator::visit(const ir::operation::Permute &node) { auto l = std::make_unique<::arm_compute::NECopy>(); - l->configure(ifm_alloc->handle(), ofm_alloc->handle()); + l->configure(ifm_tensor->handle(), ofm_tensor->handle()); fn = std::move(l); } @@ -1143,15 +867,15 @@ void KernelGenerator::visit(const ir::operation::PReLU &node) const auto ifm_index{node.getInputs().at(ir::operation::PReLU::Input::INPUT)}; const auto alpha_index{node.getInputs().at(ir::operation::PReLU::Input::ALPHA)}; - auto ofm_alloc = _tensor_builder->at(ofm_index).get(); - auto ifm_alloc = _tensor_builder->at(ifm_index).get(); - auto alpha_alloc = _tensor_builder->at(alpha_index).get(); + auto ofm_tensor = _tensor_builder->at(ofm_index).get(); + auto ifm_tensor = _tensor_builder->at(ifm_index).get(); + auto alpha_tensor = _tensor_builder->at(alpha_index).get(); std::unique_ptr<::arm_compute::IFunction> fn; - auto l = std::make_unique<::arm_compute::NEPReLU>(); + auto l = std::make_unique<::arm_compute::NEPReluLayer>(); - l->configure(ifm_alloc->handle(), alpha_alloc->handle(), ofm_alloc->handle()); + l->configure(ifm_tensor->handle(), alpha_tensor->handle(), 
ofm_tensor->handle()); fn = std::move(l); @@ -1166,14 +890,14 @@ void KernelGenerator::visit(const ir::operation::Reduce &node) const auto input_index{node.getInputs().at(ir::operation::Reduce::Input::INPUT)}; const auto axes_index{node.getInputs().at(ir::operation::Reduce::Input::AXES)}; - auto output_alloc = _tensor_builder->at(output_index).get(); - auto input_alloc = _tensor_builder->at(input_index).get(); + auto output_tensor = _tensor_builder->at(output_index).get(); + auto input_tensor = _tensor_builder->at(input_index).get(); // Convert to ACL axes taking into account negative values and possible duplicates. const auto &axes = _ctx.at(axes_index); const auto input_rank = _ctx.at(input_index).shape().rank(); const auto frontend_layout = _current_op_seq_layout; - const auto backend_layout = input_alloc->layout(); + const auto backend_layout = input_tensor->layout(); const auto reduce_axes = acl_common::asCoordinates(axes, input_rank, frontend_layout, backend_layout); const auto reduce_type = node.param().reduce_type; @@ -1182,11 +906,9 @@ void KernelGenerator::visit(const ir::operation::Reduce &node) std::unique_ptr<::arm_compute::IFunction> fn; if (reduce_type == ir::operation::Reduce::ReduceType::MEAN) { - // NOTE NEReduceMean has a bug that does not support NHWC layout - // NEReduceMean intermediate tensors are always NCHW layout - auto l = std::make_unique<::arm_compute::NEReduceMeanEx>(); + auto l = std::make_unique<::arm_compute::NEReduceMean>(); - l->configure(input_alloc->handle(), reduce_axes, keep_dims, output_alloc->handle()); + l->configure(input_tensor->handle(), reduce_axes, keep_dims, output_tensor->handle()); fn = std::move(l); } @@ -1194,7 +916,7 @@ void KernelGenerator::visit(const ir::operation::Reduce &node) { auto l = std::make_unique<::arm_compute::NEReduceSum>(); - l->configure(input_alloc->handle(), reduce_axes, keep_dims, output_alloc->handle()); + l->configure(input_tensor->handle(), reduce_axes, keep_dims, output_tensor->handle()); fn = std::move(l); } @@ -1202,7 +924,7 @@ void KernelGenerator::visit(const ir::operation::Reduce &node) { auto l = std::make_unique<::arm_compute::NEReduceOperation>(); - l->configure(input_alloc->handle(), reduce_axes, keep_dims, output_alloc->handle(), + l->configure(input_tensor->handle(), reduce_axes, keep_dims, output_tensor->handle(), acl_common::convertReduceType(reduce_type)); fn = std::move(l); @@ -1218,15 +940,15 @@ void KernelGenerator::visit(const ir::operation::ReLU &node) const auto output_index{node.getOutputs().at(0)}; const auto input_index{node.getInputs().at(ir::operation::ReLU::Input::INPUT)}; - auto output_alloc = _tensor_builder->at(output_index).get(); - auto input_alloc = _tensor_builder->at(input_index).get(); + auto output_tensor = _tensor_builder->at(output_index).get(); + auto input_tensor = _tensor_builder->at(input_index).get(); auto fn = std::make_unique<arm_compute::NEActivationLayer>(); const ::arm_compute::ActivationLayerInfo act_info{ ::arm_compute::ActivationLayerInfo::ActivationFunction::RELU}; - fn->configure(input_alloc->handle(), output_alloc->handle(), act_info); + fn->configure(input_tensor->handle(), output_tensor->handle(), act_info); auto acl_fn = asAclFunction(std::move(fn)); @@ -1238,15 +960,15 @@ void KernelGenerator::visit(const ir::operation::ReLU1 &node) const auto ofm_index{node.getOutputs().at(0)}; const auto ifm_index{node.getInputs().at(ir::operation::ReLU1::Input::INPUT)}; - auto ofm_alloc = _tensor_builder->at(ofm_index).get(); - auto ifm_alloc = 
_tensor_builder->at(ifm_index).get(); + auto ofm_tensor = _tensor_builder->at(ofm_index).get(); + auto ifm_tensor = _tensor_builder->at(ifm_index).get(); const ::arm_compute::ActivationLayerInfo act_info{ ::arm_compute::ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 1.0f, -1.0f}; auto fn = std::make_unique<::arm_compute::NEActivationLayer>(); - fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), act_info); + fn->configure(ifm_tensor->handle(), ofm_tensor->handle(), act_info); auto acl_fn = asAclFunction(std::move(fn)); @@ -1258,15 +980,15 @@ void KernelGenerator::visit(const ir::operation::ReLU6 &node) const auto ofm_index{node.getOutputs().at(0)}; const auto ifm_index{node.getInputs().at(ir::operation::ReLU6::Input::INPUT)}; - auto ofm_alloc = _tensor_builder->at(ofm_index).get(); - auto ifm_alloc = _tensor_builder->at(ifm_index).get(); + auto ofm_tensor = _tensor_builder->at(ofm_index).get(); + auto ifm_tensor = _tensor_builder->at(ifm_index).get(); const ::arm_compute::ActivationLayerInfo act_info{ ::arm_compute::ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 6.0f}; auto fn = std::make_unique<::arm_compute::NEActivationLayer>(); - fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), act_info); + fn->configure(ifm_tensor->handle(), ofm_tensor->handle(), act_info); auto acl_fn = asAclFunction(std::move(fn)); @@ -1278,13 +1000,13 @@ void KernelGenerator::visit(const ir::operation::Reshape &node) const auto output_index{node.getOutputs().at(0)}; const auto input_index{node.getInputs().at(ir::operation::Reshape::Input::INPUT)}; - auto output_alloc = _tensor_builder->at(output_index).get(); - auto input_alloc = _tensor_builder->at(input_index).get(); + auto output_tensor = _tensor_builder->at(output_index).get(); + auto input_tensor = _tensor_builder->at(input_index).get(); // NOTE This operation must not change the layout from frontend to backend // So, PermutationOperationPass makes the frontend and backend layouts the same.
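Stepping back from the diff for a moment: the ReLU, ReLU1, and ReLU6 hunks above differ only in their ActivationLayerInfo parameters, so the whole family reduces to a single clamp. A minimal plain-C++ reference sketch (not ACL code) of the mapping:

#include <algorithm>

// RELU            -> max(0, x)
// LU_BOUNDED_RELU -> clamp(x, b, a)  (ReLU1: a = 1.0f, b = -1.0f)
// BOUNDED_RELU    -> clamp(x, 0, a)  (ReLU6: a = 6.0f)
float bounded_relu(float x, float lo, float hi)
{
  return std::min(hi, std::max(lo, x));
}

On the NEON path, fn->configure(...) then applies the chosen variant element-wise inside NEActivationLayer.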
const auto frontend_layout = _current_op_seq_layout; - const auto backend_layout = output_alloc->layout(); + const auto backend_layout = output_tensor->layout(); assert((_ctx.at(input_index).shape().rank() < 4 && _ctx.at(output_index).shape().rank() < 4) || frontend_layout == backend_layout); UNUSED_RELEASE(frontend_layout); @@ -1292,7 +1014,7 @@ void KernelGenerator::visit(const ir::operation::Reshape &node) auto fn = std::make_unique<arm_compute::NEReshapeLayer>(); - fn->configure(input_alloc->handle(), output_alloc->handle()); + fn->configure(input_tensor->handle(), output_tensor->handle()); auto acl_fn = asAclFunction(std::move(fn)); @@ -1305,12 +1027,12 @@ void KernelGenerator::visit(const ir::operation::ResizeBilinear &node) const auto ifm_index{node.getInputs().at(ir::operation::ResizeBilinear::Input::INPUT)}; - auto ofm_alloc = _tensor_builder->at(ofm_index).get(); - auto ifm_alloc = _tensor_builder->at(ifm_index).get(); + auto ofm_tensor = _tensor_builder->at(ofm_index).get(); + auto ifm_tensor = _tensor_builder->at(ifm_index).get(); auto fn = std::make_unique<::arm_compute::NEScale>(); - fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), + fn->configure(ifm_tensor->handle(), ofm_tensor->handle(), ::arm_compute::InterpolationPolicy::BILINEAR, ::arm_compute::BorderMode::REPLICATE, ::arm_compute::PixelValue(0.f), ::arm_compute::SamplingPolicy::TOP_LEFT); @@ -1334,25 +1056,25 @@ void KernelGenerator::visit(const ir::operation::RNN &node) const auto activation = node.param().activation; - auto output_alloc = _tensor_builder->at(output_index).get(); - auto hidden_state_out_alloc = _tensor_builder->at(hidden_state_out_index).get(); + auto output_tensor = _tensor_builder->at(output_index).get(); + auto hidden_state_out_tensor = _tensor_builder->at(hidden_state_out_index).get(); - auto input_alloc = _tensor_builder->at(input_index).get(); - auto weights_alloc = _tensor_builder->at(weights_index).get(); - auto recurrent_weights_alloc = _tensor_builder->at(recurrent_weights_index).get(); - auto bias_alloc = _tensor_builder->at(bias_index).get(); - auto hidden_state_in_alloc = _tensor_builder->at(hidden_state_in_index).get(); + auto input_tensor = _tensor_builder->at(input_index).get(); + auto weights_tensor = _tensor_builder->at(weights_index).get(); + auto recurrent_weights_tensor = _tensor_builder->at(recurrent_weights_index).get(); + auto bias_tensor = _tensor_builder->at(bias_index).get(); + auto hidden_state_in_tensor = _tensor_builder->at(hidden_state_in_index).get(); auto act_info = ::onert::backend::acl_common::asActivationLayerInfo(activation); auto copy_layer = std::make_unique<::arm_compute::NECopy>(); - copy_layer->configure(hidden_state_in_alloc->handle(), hidden_state_out_alloc->handle()); + copy_layer->configure(hidden_state_in_tensor->handle(), hidden_state_out_tensor->handle()); _return_fn = asAclFunction(std::move(copy_layer)); - auto fn = std::make_unique<::arm_compute::NERNNLayerEx>( + auto fn = std::make_unique<::arm_compute::NERNNLayer>( _tensor_builder->acl_tensor_manager()->internal_buffer_manager()); - fn->configure(input_alloc->handle(), weights_alloc->handle(), recurrent_weights_alloc->handle(), - bias_alloc->handle(), hidden_state_out_alloc->handle(), output_alloc->handle(), - act_info); + fn->configure(input_tensor->handle(), weights_tensor->handle(), + recurrent_weights_tensor->handle(), bias_tensor->handle(), + hidden_state_out_tensor->handle(), output_tensor->handle(), act_info); _return_fn = asAclFunction(std::move(fn)); } @@ -1361,12 +1083,12 @@ void 
KernelGenerator::visit(const ir::operation::RSQRT &node) const auto ofm_index{node.getOutputs().at(0)}; const auto ifm_index{node.getInputs().at(ir::operation::RSQRT::Input::INPUT)}; - auto ofm_alloc = _tensor_builder->at(ofm_index).get(); - auto ifm_alloc = _tensor_builder->at(ifm_index).get(); + auto ofm_tensor = _tensor_builder->at(ofm_index).get(); + auto ifm_tensor = _tensor_builder->at(ifm_index).get(); auto fn = std::make_unique<::arm_compute::NERsqrtLayer>(); - fn->configure(ifm_alloc->handle(), ofm_alloc->handle()); + fn->configure(ifm_tensor->handle(), ofm_tensor->handle()); _return_fn = asAclFunction(std::move(fn)); } @@ -1383,10 +1105,10 @@ void KernelGenerator::visit(const ir::operation::Squeeze &node) (void)dims; (void)ndim; - auto output_alloc = _tensor_builder->at(output_index).get(); - auto input_alloc = _tensor_builder->at(input_index).get(); + auto output_tensor = _tensor_builder->at(output_index).get(); + auto input_tensor = _tensor_builder->at(input_index).get(); auto fn = std::make_unique<arm_compute::NEReshapeLayer>(); - fn->configure(input_alloc->handle(), output_alloc->handle()); + fn->configure(input_tensor->handle(), output_tensor->handle()); auto acl_fn = asAclFunction(std::move(fn)); _return_fn = std::move(acl_fn); } @@ -1396,15 +1118,15 @@ void KernelGenerator::visit(const ir::operation::Tanh &node) const auto output_index{node.getOutputs().at(0)}; const auto input_index{node.getInputs().at(ir::operation::Tanh::Input::INPUT)}; - auto output_alloc = _tensor_builder->at(output_index).get(); - auto input_alloc = _tensor_builder->at(input_index).get(); + auto output_tensor = _tensor_builder->at(output_index).get(); + auto input_tensor = _tensor_builder->at(input_index).get(); auto fn = std::make_unique<arm_compute::NEActivationLayer>(); const ::arm_compute::ActivationLayerInfo act_info{ ::arm_compute::ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f}; - fn->configure(input_alloc->handle(), output_alloc->handle(), act_info); + fn->configure(input_tensor->handle(), output_tensor->handle(), act_info); auto acl_fn = asAclFunction(std::move(fn)); @@ -1417,13 +1139,25 @@ void KernelGenerator::visit(const ir::operation::Softmax &node) const auto input_index{node.getInputs().at(ir::operation::Softmax::Input::INPUT)}; const auto beta = node.param().beta; - auto output_alloc = _tensor_builder->at(output_index).get(); - auto input_alloc = _tensor_builder->at(input_index).get(); + auto output_tensor = _tensor_builder->at(output_index).get(); + auto input_tensor = _tensor_builder->at(input_index).get(); + const auto frontend_layout = _current_op_seq_layout; + const auto backend_layout = input_tensor->layout(); + + // Disable applied dim_correction + const size_t input_rank = _ctx.at(input_index).shape().rank(); + if (input_rank != input_tensor->info()->num_dimensions()) + { + // This means that high dimension's value is 1 and input tensor is applied dim_correction + const auto input = _ctx.at(input_index); + input_tensor->info()->set_tensor_shape( + acl_common::asTensorShape(input.shape(), frontend_layout, backend_layout, false)); + } auto fn = std::make_unique<::arm_compute::NESoftmaxLayer>( _tensor_builder->acl_tensor_manager()->internal_buffer_manager()); - fn->configure(input_alloc->handle(), output_alloc->handle(), beta); + fn->configure(input_tensor->handle(), output_tensor->handle(), beta); auto acl_fn = asAclFunction(std::move(fn)); @@ -1438,20 +1172,18 @@ void KernelGenerator::visit(const ir::operation::SpaceToBatchND &node) 
node.getInputs().at(ir::operation::SpaceToBatchND::Input::BLOCK_SIZE)}; const auto paddings_index{node.getInputs().at(ir::operation::SpaceToBatchND::Input::PADDINGS)}; - auto ofm_alloc = _tensor_builder->at(ofm_index).get(); - auto ifm_alloc = _tensor_builder->at(ifm_index).get(); - auto block_size_alloc = _tensor_builder->at(block_size_index).get(); - auto paddings_alloc = _tensor_builder->at(paddings_index).get(); + auto ofm_tensor = _tensor_builder->at(ofm_index).get(); + auto ifm_tensor = _tensor_builder->at(ifm_index).get(); + auto block_size_tensor = _tensor_builder->at(block_size_index).get(); + auto paddings_tensor = _tensor_builder->at(paddings_index).get(); assert(_ctx.at(block_size_index).data()); assert(_ctx.at(paddings_index).data()); - // NESpaceToBatchLayer has a bug that padding's values are 0 even when zero point of QASYMM8 is - // not 0. - auto fn = std::make_unique<::arm_compute::NESpaceToBatchLayerEx>(); + auto fn = std::make_unique<::arm_compute::NESpaceToBatchLayer>(); - fn->configure(ifm_alloc->handle(), block_size_alloc->handle(), paddings_alloc->handle(), - ofm_alloc->handle()); + fn->configure(ifm_tensor->handle(), block_size_tensor->handle(), paddings_tensor->handle(), + ofm_tensor->handle()); auto acl_fn = asAclFunction(std::move(fn)); @@ -1465,12 +1197,12 @@ void KernelGenerator::visit(const ir::operation::SpaceToDepth &node) auto block_size = node.param().block_size; - auto ofm_alloc = _tensor_builder->at(ofm_index).get(); - auto ifm_alloc = _tensor_builder->at(ifm_index).get(); + auto ofm_tensor = _tensor_builder->at(ofm_index).get(); + auto ifm_tensor = _tensor_builder->at(ifm_index).get(); - auto fn = std::make_unique<::arm_compute::NESpaceToDepthLayerEx>(); + auto fn = std::make_unique<::arm_compute::NESpaceToDepthLayer>(); - fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), block_size); + fn->configure(ifm_tensor->handle(), ofm_tensor->handle(), block_size); auto acl_fn = asAclFunction(std::move(fn)); @@ -1489,13 +1221,13 @@ void KernelGenerator::visit(const ir::operation::Split &node) for (const auto &output : node.getOutputs()) output_indexes.emplace_back(output); - auto ifm_alloc = _tensor_builder->at(ifm_index).get(); - std::vector<arm_compute::ITensor *> output_allocs; + auto ifm_tensor = _tensor_builder->at(ifm_index).get(); + std::vector<arm_compute::ITensor *> output_tensors; for (const auto &ofm_ind : output_indexes) - output_allocs.emplace_back(_tensor_builder->at(ofm_ind).get()->handle()); + output_tensors.emplace_back(_tensor_builder->at(ofm_ind).get()->handle()); const auto frontend_layout = _current_op_seq_layout; - const auto backend_layout = ifm_alloc->layout(); + const auto backend_layout = ifm_tensor->layout(); auto axis = node.param().axis; if (axis < 0) axis += ifm_rank; @@ -1503,7 +1235,7 @@ void KernelGenerator::visit(const ir::operation::Split &node) auto fn = std::make_unique<::arm_compute::NESplit>(); - fn->configure(ifm_alloc->handle(), output_allocs, axis); + fn->configure(ifm_tensor->handle(), output_tensors, axis); _return_fn = asAclFunction(std::move(fn)); } @@ -1513,15 +1245,15 @@ void KernelGenerator::visit(const ir::operation::SQRT &node) const auto output_index{node.getOutputs().at(0)}; const auto input_index{node.getInputs().at(ir::operation::SQRT::Input::INPUT)}; - auto output_alloc = _tensor_builder->at(output_index).get(); - auto input_alloc = _tensor_builder->at(input_index).get(); + auto output_tensor = _tensor_builder->at(output_index).get(); + auto input_tensor = _tensor_builder->at(input_index).get(); const 
::arm_compute::ActivationLayerInfo act_info{ ::arm_compute::ActivationLayerInfo::ActivationFunction::SQRT}; auto fn = std::make_unique<::arm_compute::NEActivationLayer>(); - fn->configure(input_alloc->handle(), output_alloc->handle(), act_info); + fn->configure(input_tensor->handle(), output_tensor->handle(), act_info); auto acl_fn = asAclFunction(std::move(fn)); @@ -1534,13 +1266,13 @@ void KernelGenerator::visit(const ir::operation::SquaredDifference &node) const auto lhs_index{node.getInputs().at(ir::operation::SquaredDifference::Input::LHS)}; const auto rhs_index{node.getInputs().at(ir::operation::SquaredDifference::Input::RHS)}; - auto ofm_alloc = _tensor_builder->at(ofm_index).get(); - auto lhs_alloc = _tensor_builder->at(lhs_index).get(); - auto rhs_alloc = _tensor_builder->at(rhs_index).get(); + auto ofm_tensor = _tensor_builder->at(ofm_index).get(); + auto lhs_tensor = _tensor_builder->at(lhs_index).get(); + auto rhs_tensor = _tensor_builder->at(rhs_index).get(); auto fn = std::make_unique<::arm_compute::NEElementwiseSquaredDiff>(); - fn->configure(lhs_alloc->handle(), rhs_alloc->handle(), ofm_alloc->handle()); + fn->configure(lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle()); auto acl_fn = asAclFunction(std::move(fn)); @@ -1555,17 +1287,17 @@ void KernelGenerator::visit(const ir::operation::Sub &node) const auto activation = node.param().activation; - auto ofm_alloc = _tensor_builder->at(ofm_index).get(); - auto lhs_alloc = _tensor_builder->at(lhs_index).get(); - auto rhs_alloc = _tensor_builder->at(rhs_index).get(); + auto ofm_tensor = _tensor_builder->at(ofm_index).get(); + auto lhs_tensor = _tensor_builder->at(lhs_index).get(); + auto rhs_tensor = _tensor_builder->at(rhs_index).get(); auto fn = std::make_unique<::arm_compute::NEArithmeticSubtraction>(); - fn->configure(lhs_alloc->handle(), rhs_alloc->handle(), ofm_alloc->handle(), + fn->configure(lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle(), arm_compute::ConvertPolicy::SATURATE); _return_fn = std::make_unique<exec::FunctionSequence>( - asAclFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_alloc->handle())); + asAclFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_tensor->handle())); } void KernelGenerator::visit(const ir::operation::Slice &node) @@ -1575,10 +1307,10 @@ void KernelGenerator::visit(const ir::operation::Slice &node) const auto begins_index{node.getInputs().at(ir::operation::Slice::Input::BEGINS)}; const auto sizes_index{node.getInputs().at(ir::operation::Slice::Input::SIZES)}; - auto outputData_alloc = _tensor_builder->at(output_index).get(); - auto inputData_alloc = _tensor_builder->at(input_index).get(); + auto outputData_tensor = _tensor_builder->at(output_index).get(); + auto inputData_tensor = _tensor_builder->at(input_index).get(); const auto frontend_layout = _current_op_seq_layout; - const auto backend_layout = inputData_alloc->layout(); + const auto backend_layout = inputData_tensor->layout(); // Set initializers for indices data such as order of inputData int input_rank = _ctx.at(input_index).shape().rank(); @@ -1628,7 +1360,7 @@ void KernelGenerator::visit(const ir::operation::Slice &node) auto fn = std::make_unique<::arm_compute::NESlice>(); - fn->configure(inputData_alloc->handle(), outputData_alloc->handle(), starts_set, ends_set); + fn->configure(inputData_tensor->handle(), outputData_tensor->handle(), starts_set, ends_set); auto acl_fn = asAclFunction(std::move(fn)); @@ -1643,10 +1375,10 @@ void KernelGenerator::visit(const 
ir::operation::StridedSlice &node) const auto ends_index{node.getInputs().at(ir::operation::StridedSlice::Input::ENDS)}; const auto strides_index{node.getInputs().at(ir::operation::StridedSlice::Input::STRIDES)}; - auto outputData_alloc = _tensor_builder->at(output_index).get(); - auto inputData_alloc = _tensor_builder->at(input_index).get(); + auto outputData_tensor = _tensor_builder->at(output_index).get(); + auto inputData_tensor = _tensor_builder->at(input_index).get(); const auto frontend_layout = _current_op_seq_layout; - const auto backend_layout = inputData_alloc->layout(); + const auto backend_layout = inputData_tensor->layout(); // Set initializers for indices data such as order of inputData int input_rank = _ctx.at(input_index).shape().rank(); @@ -1715,7 +1447,7 @@ void KernelGenerator::visit(const ir::operation::StridedSlice &node) auto fn = std::make_unique<::arm_compute::NEStridedSlice>(); - fn->configure(inputData_alloc->handle(), outputData_alloc->handle(), starts_set, ends_set, + fn->configure(inputData_tensor->handle(), outputData_tensor->handle(), starts_set, ends_set, strides_set, begin_mask, end_mask, shrink_axis_mask); auto acl_fn = asAclFunction(std::move(fn)); @@ -1749,16 +1481,16 @@ void KernelGenerator::visit(const ir::operation::TransposeConv &node) invalid_vertical = ofm_shape.H - (1 + (ifm_shape.H - 1) * stride.vertical) - (ker_shape.H - 1); } - auto ofm_alloc = _tensor_builder->at(ofm_index).get(); - auto ifm_alloc = _tensor_builder->at(ifm_index).get(); - auto ker_alloc = _tensor_builder->at(ker_index).get(); + auto ofm_tensor = _tensor_builder->at(ofm_index).get(); + auto ifm_tensor = _tensor_builder->at(ifm_index).get(); + auto ker_tensor = _tensor_builder->at(ker_index).get(); const auto tconv_info = acl_common::asPadStrideInfo(padding, stride); auto fn = std::make_unique<::arm_compute::NETransposeConvLayer>(); - fn->configure(ifm_alloc->handle(), ker_alloc->handle(), nullptr, ofm_alloc->handle(), tconv_info, - invalid_horizontal, invalid_vertical); + fn->configure(ifm_tensor->handle(), ker_tensor->handle(), nullptr, ofm_tensor->handle(), + tconv_info, invalid_horizontal, invalid_vertical); auto acl_fn = asAclFunction(std::move(fn)); @@ -1771,10 +1503,10 @@ void KernelGenerator::visit(const ir::operation::Transpose &node) const auto ifm_idx{node.getInputs().at(ir::operation::Transpose::Input::INPUT)}; const auto &perm{node.param().perm}; - auto ofm_alloc = _tensor_builder->at(ofm_idx).get(); - const auto ifm_alloc = _tensor_builder->at(ifm_idx).get(); + auto ofm_tensor = _tensor_builder->at(ofm_idx).get(); + const auto ifm_tensor = _tensor_builder->at(ifm_idx).get(); const auto frontend_layout = _current_op_seq_layout; - const auto backend_layout = ifm_alloc->layout(); + const auto backend_layout = ifm_tensor->layout(); const auto rank = _ctx.at(ifm_idx).shape().rank(); std::vector<std::int32_t> pv(perm.cbegin(), perm.cend()); @@ -1783,11 +1515,11 @@ void KernelGenerator::visit(const ir::operation::Transpose &node) std::unique_ptr<::arm_compute::IFunction> fn; - if (ifm_alloc->num_dimensions() <= 2 && ofm_alloc->num_dimensions() <= 2) + if (ifm_tensor->num_dimensions() <= 2 && ofm_tensor->num_dimensions() <= 2) { auto l = std::make_unique<::arm_compute::NETranspose>(); - l->configure(ifm_alloc->handle(), ofm_alloc->handle()); + l->configure(ifm_tensor->handle(), ofm_tensor->handle()); fn = std::move(l); } @@ -1795,7 +1527,7 @@ void KernelGenerator::visit(const ir::operation::Transpose &node) { auto l = std::make_unique<::arm_compute::NEPermute>(); - 
l->configure(ifm_alloc->handle(), ofm_alloc->handle(), backend_pv); + l->configure(ifm_tensor->handle(), ofm_tensor->handle(), backend_pv); fn = std::move(l); } @@ -1834,13 +1566,13 @@ void KernelGenerator::visit(const ir::operation::Unpack &node) for (const auto &output_index : output_indexes) { size_t output_rank = _ctx.at(output_index).shape().rank(); - const auto &output_alloc = _tensor_builder->at(output_index); - orig_outputs_acl_tensor_shapes.emplace_back(output_alloc->info()->tensor_shape()); - assert(output_rank == output_alloc->num_dimensions()); - if (output_rank != output_alloc->info()->num_dimensions()) + const auto &output_tensor = _tensor_builder->at(output_index); + orig_outputs_acl_tensor_shapes.emplace_back(output_tensor->info()->tensor_shape()); + assert(output_rank == output_tensor->num_dimensions()); + if (output_rank != output_tensor->info()->num_dimensions()) { // This means that high dimension's value is 1 and ifm tensor is applied dim_correction - output_alloc->info()->set_tensor_shape(acl_common::asTensorShape( + output_tensor->info()->set_tensor_shape(acl_common::asTensorShape( _ctx.at(output_index).shape(), _current_op_seq_layout, backend_layout, false)); } } @@ -1858,17 +1590,17 @@ void KernelGenerator::visit(const ir::operation::Add &node) const auto activation = node.param().activation; - auto ofm_alloc = _tensor_builder->at(ofm_index).get(); - auto lhs_alloc = _tensor_builder->at(lhs_index).get(); - auto rhs_alloc = _tensor_builder->at(rhs_index).get(); + auto ofm_tensor = _tensor_builder->at(ofm_index).get(); + auto lhs_tensor = _tensor_builder->at(lhs_index).get(); + auto rhs_tensor = _tensor_builder->at(rhs_index).get(); auto fn = std::make_unique<::arm_compute::NEArithmeticAddition>(); - fn->configure(lhs_alloc->handle(), rhs_alloc->handle(), ofm_alloc->handle(), + fn->configure(lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle(), arm_compute::ConvertPolicy::SATURATE); _return_fn = std::make_unique<exec::FunctionSequence>( - asAclFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_alloc->handle())); + asAclFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_tensor->handle())); } void KernelGenerator::visit(const ir::operation::Div &node) @@ -1879,16 +1611,16 @@ void KernelGenerator::visit(const ir::operation::Div &node) const auto activation = node.param().activation; - auto ofm_alloc = _tensor_builder->at(ofm_index).get(); - auto lhs_alloc = _tensor_builder->at(lhs_index).get(); - auto rhs_alloc = _tensor_builder->at(rhs_index).get(); + auto ofm_tensor = _tensor_builder->at(ofm_index).get(); + auto lhs_tensor = _tensor_builder->at(lhs_index).get(); + auto rhs_tensor = _tensor_builder->at(rhs_index).get(); auto fn = std::make_unique<::arm_compute::NEElementwiseDivision>(); - fn->configure(lhs_alloc->handle(), rhs_alloc->handle(), ofm_alloc->handle()); + fn->configure(lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle()); _return_fn = std::make_unique<exec::FunctionSequence>( - asAclFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_alloc->handle())); + asAclFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_tensor->handle())); } void KernelGenerator::visit(const ir::operation::Exp &node) @@ -1896,12 +1628,12 @@ void KernelGenerator::visit(const ir::operation::Exp &node) const auto output_index{node.getOutputs().at(0)}; const auto input_index{node.getInputs().at(ir::operation::Exp::Input::INPUT)}; - auto output_alloc = _tensor_builder->at(output_index).get(); - 
auto input_alloc = _tensor_builder->at(input_index).get(); + auto output_tensor = _tensor_builder->at(output_index).get(); + auto input_tensor = _tensor_builder->at(input_index).get(); auto fn = std::make_unique<::arm_compute::NEExpLayer>(); - fn->configure(input_alloc->handle(), output_alloc->handle()); + fn->configure(input_tensor->handle(), output_tensor->handle()); auto acl_fn = asAclFunction(std::move(fn)); @@ -1913,12 +1645,12 @@ void KernelGenerator::visit(const ir::operation::ExpandDims &node) const auto output_index{node.getOutputs().at(0)}; const auto input_index{node.getInputs().at(ir::operation::ExpandDims::Input::INPUT)}; - auto output_alloc = _tensor_builder->at(output_index).get(); - auto input_alloc = _tensor_builder->at(input_index).get(); + auto output_tensor = _tensor_builder->at(output_index).get(); + auto input_tensor = _tensor_builder->at(input_index).get(); auto fn = std::make_unique<::arm_compute::NEReshapeLayer>(); - fn->configure(input_alloc->handle(), output_alloc->handle()); + fn->configure(input_tensor->handle(), output_tensor->handle()); auto acl_fn = asAclFunction(std::move(fn)); @@ -1933,13 +1665,13 @@ void KernelGenerator::visit(const ir::operation::Comparison &node) const auto comparison_type = node.param().comparison_type; - auto output_alloc = _tensor_builder->at(output_index).get(); - auto input0_alloc = _tensor_builder->at(input0_index).get(); - auto input1_alloc = _tensor_builder->at(input1_index).get(); + auto output_tensor = _tensor_builder->at(output_index).get(); + auto input0_tensor = _tensor_builder->at(input0_index).get(); + auto input1_tensor = _tensor_builder->at(input1_index).get(); auto fn = std::make_unique<::arm_compute::NEElementwiseComparison>(); - fn->configure(input0_alloc->handle(), input1_alloc->handle(), output_alloc->handle(), + fn->configure(input0_tensor->handle(), input1_tensor->handle(), output_tensor->handle(), (arm_compute::ComparisonOperation)comparison_type); auto acl_fn = asAclFunction(std::move(fn)); @@ -1953,13 +1685,13 @@ void KernelGenerator::visit(const ir::operation::Min &node) const auto lhs_index{node.getInputs().at(ir::operation::Min::Input::LHS)}; const auto rhs_index{node.getInputs().at(ir::operation::Min::Input::RHS)}; - auto ofm_alloc = _tensor_builder->at(ofm_index).get(); - auto lhs_alloc = _tensor_builder->at(lhs_index).get(); - auto rhs_alloc = _tensor_builder->at(rhs_index).get(); + auto ofm_tensor = _tensor_builder->at(ofm_index).get(); + auto lhs_tensor = _tensor_builder->at(lhs_index).get(); + auto rhs_tensor = _tensor_builder->at(rhs_index).get(); auto fn = std::make_unique<::arm_compute::NEElementwiseMin>(); - fn->configure(lhs_alloc->handle(), rhs_alloc->handle(), ofm_alloc->handle()); + fn->configure(lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle()); auto acl_fn = asAclFunction(std::move(fn)); @@ -1972,13 +1704,13 @@ void KernelGenerator::visit(const ir::operation::Max &node) const auto lhs_index{node.getInputs().at(ir::operation::Max::Input::LHS)}; const auto rhs_index{node.getInputs().at(ir::operation::Max::Input::RHS)}; - auto ofm_alloc = _tensor_builder->at(ofm_index).get(); - auto lhs_alloc = _tensor_builder->at(lhs_index).get(); - auto rhs_alloc = _tensor_builder->at(rhs_index).get(); + auto ofm_tensor = _tensor_builder->at(ofm_index).get(); + auto lhs_tensor = _tensor_builder->at(lhs_index).get(); + auto rhs_tensor = _tensor_builder->at(rhs_index).get(); auto fn = std::make_unique<::arm_compute::NEElementwiseMax>(); - fn->configure(lhs_alloc->handle(), rhs_alloc->handle(), 
ofm_alloc->handle()); + fn->configure(lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle()); auto acl_fn = asAclFunction(std::move(fn)); diff --git a/runtime/onert/backend/cpu/Backend.h b/runtime/onert/backend/cpu/Backend.h index 2daf06aca..56bd352e0 100644 --- a/runtime/onert/backend/cpu/Backend.h +++ b/runtime/onert/backend/cpu/Backend.h @@ -17,6 +17,7 @@ #ifndef __ONERT_BACKEND_CPU_BACKEND_H__ #define __ONERT_BACKEND_CPU_BACKEND_H__ +#include "BackendContext.h" #include "Config.h" #include "ConstantInitializer.h" #include "KernelGenerator.h" @@ -39,9 +40,9 @@ public: std::shared_ptr<IConfig> config() const override { return _config; } - std::unique_ptr<BackendContext> newContext(const ir::Graph &graph, - const std::shared_ptr<custom::IKernelBuilder> &kb, - bool) const override + std::unique_ptr<onert::backend::BackendContext> + newContext(const ir::Graph &graph, const std::shared_ptr<custom::IKernelBuilder> &kb, + bool) const override { const auto &operands = graph.operands(); const auto &operations = graph.operations(); @@ -49,7 +50,8 @@ public: auto tb = std::make_shared<TensorBuilder>(); context->tensor_builder = tb; context->constant_initializer = std::make_shared<ConstantInitializer>(operands, tb); - context->kernel_gen = std::make_shared<KernelGenerator>(operands, operations, tb, kb); + context->kernel_gen = std::make_shared<KernelGenerator>(operands, operations, tb, kb, + context->external_context()); context->tensor_register = nullptr; context->optimizer = nullptr; return context; diff --git a/runtime/onert/backend/cpu/BackendContext.h b/runtime/onert/backend/cpu/BackendContext.h new file mode 100644 index 000000000..f314a8e39 --- /dev/null +++ b/runtime/onert/backend/cpu/BackendContext.h @@ -0,0 +1,58 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef __ONERT_BACKEND_CPU_BACKEND_CONTEXT_H__ +#define __ONERT_BACKEND_CPU_BACKEND_CONTEXT_H__ + +#include <backend/BackendContext.h> +#include "ExternalContext.h" + +namespace onert +{ +namespace backend +{ +namespace cpu +{ + +class BackendContext : public onert::backend::BackendContext +{ +public: + BackendContext(const Backend *backend, const ir::Graph *graph, + std::shared_ptr<ITensorBuilder> tensor_builder = nullptr, + std::shared_ptr<IConstantInitializer> constant_initializer = nullptr, + std::shared_ptr<IKernelGenerator> kernel_gen = nullptr, + std::shared_ptr<ITensorRegister> tensor_register = nullptr, + std::shared_ptr<IOptimizer> optimizer = nullptr) + : onert::backend::BackendContext(backend, graph, tensor_builder, constant_initializer, + kernel_gen, tensor_register, optimizer), + _external_context(new ExternalContext) + { + } + + std::shared_ptr<ExternalContext> external_context() { return _external_context; } + +private: + // NOTE ruy context has a thread pool, and when multiple ruy contexts are created, + // the thread pool is also created in duplicate + // TODO Create one ruy context for session + std::shared_ptr<ExternalContext> _external_context; +}; + +} // namespace cpu +} // namespace backend +} // namespace onert + +#endif // __ONERT_BACKEND_CPU_BACKEND_CONTEXT_H__ diff --git a/runtime/onert/backend/cpu/CMakeLists.txt b/runtime/onert/backend/cpu/CMakeLists.txt index e997a2291..01a3cd178 100644 --- a/runtime/onert/backend/cpu/CMakeLists.txt +++ b/runtime/onert/backend/cpu/CMakeLists.txt @@ -1,5 +1,7 @@ set(LIB_ONERT_BACKEND_CPU onert_backend_cpu) +nnfw_find_package(Ruy REQUIRED) + file(GLOB_RECURSE SOURCES "*.cc") add_library(${LIB_ONERT_BACKEND_CPU} SHARED ${SOURCES}) @@ -8,6 +10,8 @@ target_link_libraries(${LIB_ONERT_BACKEND_CPU} PRIVATE nnfw_lib_cker) target_link_libraries(${LIB_ONERT_BACKEND_CPU} PRIVATE onert_core) target_link_libraries(${LIB_ONERT_BACKEND_CPU} PRIVATE nnfw_common) target_link_libraries(${LIB_ONERT_BACKEND_CPU} PRIVATE nnfw_coverage) +target_link_libraries(${LIB_ONERT_BACKEND_CPU} PRIVATE ruy) +target_link_libraries(${LIB_ONERT_BACKEND_CPU} INTERFACE ruy_instrumentation) set_target_properties(${LIB_ONERT_BACKEND_CPU} PROPERTIES OUTPUT_NAME backend_cpu) diff --git a/runtime/onert/backend/cpu/ConstantInitializer.cc b/runtime/onert/backend/cpu/ConstantInitializer.cc index 71e313628..deb27f0fe 100644 --- a/runtime/onert/backend/cpu/ConstantInitializer.cc +++ b/runtime/onert/backend/cpu/ConstantInitializer.cc @@ -15,6 +15,7 @@ */ #include "ConstantInitializer.h" +#include "Tensor.h" namespace onert { @@ -30,39 +31,61 @@ ConstantInitializer::ConstantInitializer(const ir::Operands &operands, // DO NOTHING } +void ConstantInitializer::registerDefaultInitializer(const ir::OperandIndex &index, + const ir::Operand &obj) +{ + registerExternalInitializer(index, obj); +} + +void ConstantInitializer::registerExternalInitializer(const ir::OperandIndex &index, + const ir::Operand &obj) +{ + // For only CONSTANTS + // TODO Add to check if tensor has been allocated + if (!obj.isConstant()) + return; + + _init_map[index] = [](const onert::ir::Operand &model_obj, onert::backend::ITensor &itensor) { + auto data = model_obj.shareData(); + assert(data && data->base()); + ExternalTensor &tensor = dynamic_cast<ExternalTensor &>(itensor); + tensor.setData(data); + }; +} + void ConstantInitializer::visit(const ir::operation::Conv2D &node) { const auto &kernel_index = node.getInputs().at(ir::operation::Conv2D::KERNEL); const auto &kernel_obj = 
_operands.at(kernel_index); - registerCopyInitializer(kernel_index, kernel_obj); + registerExternalInitializer(kernel_index, kernel_obj); const auto &bias_index = node.getInputs().at(ir::operation::Conv2D::BIAS); const auto &bias_obj = _operands.at(bias_index); - registerCopyInitializer(bias_index, bias_obj); + registerExternalInitializer(bias_index, bias_obj); } void ConstantInitializer::visit(const ir::operation::DepthwiseConv2D &node) { const auto &kernel_index = node.getInputs().at(ir::operation::DepthwiseConv2D::KERNEL); const auto &kernel_obj = _operands.at(kernel_index); - registerCopyInitializer(kernel_index, kernel_obj); + registerExternalInitializer(kernel_index, kernel_obj); const auto &bias_index = node.getInputs().at(ir::operation::DepthwiseConv2D::BIAS); const auto &bias_obj = _operands.at(bias_index); - registerCopyInitializer(bias_index, bias_obj); + registerExternalInitializer(bias_index, bias_obj); } void ConstantInitializer::visit(const ir::operation::FullyConnected &node) { const auto &weight_index = node.getInputs().at(ir::operation::FullyConnected::WEIGHT); const auto &weight_obj = _operands.at(weight_index); - registerCopyInitializer(weight_index, weight_obj); + registerExternalInitializer(weight_index, weight_obj); const auto &bias_index = node.getInputs().at(ir::operation::FullyConnected::BIAS); if (!bias_index.undefined()) { const auto &bias_obj = _operands.at(bias_index); - registerCopyInitializer(bias_index, bias_obj); + registerExternalInitializer(bias_index, bias_obj); } } diff --git a/runtime/onert/backend/cpu/ConstantInitializer.h b/runtime/onert/backend/cpu/ConstantInitializer.h index bd06c64d1..de03a693a 100644 --- a/runtime/onert/backend/cpu/ConstantInitializer.h +++ b/runtime/onert/backend/cpu/ConstantInitializer.h @@ -36,6 +36,15 @@ public: const std::shared_ptr<TensorBuilder> &tensor_builder); public: + void registerDefaultInitializer(const ir::OperandIndex &index, const ir::Operand &obj) override; + + // TODO For now, only the cpu backend supports constant tensors that use external data. + // If other backends come to support this (ExternalTensor may need an abstraction + // such as IExternalTensor for that), this can become an interface of + // IConstantInitializer + void registerExternalInitializer(const ir::OperandIndex &, const ir::Operand &); + +public: void visit(const ir::operation::Conv2D &) override; void visit(const ir::operation::DepthwiseConv2D &) override; void visit(const ir::operation::FullyConnected &) override; diff --git a/runtime/onert/backend/cpu/ExternalContext.h b/runtime/onert/backend/cpu/ExternalContext.h new file mode 100644 index 000000000..6627412d2 --- /dev/null +++ b/runtime/onert/backend/cpu/ExternalContext.h @@ -0,0 +1,64 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ + +#ifndef __ONERT_BACKEND_CPU_EXTERNAL_CONTEXT_H__ +#define __ONERT_BACKEND_CPU_EXTERNAL_CONTEXT_H__ + +#include <backend/IExternalContext.h> +#include <util/ConfigSource.h> +#include <ruy/context.h> + +namespace +{ +const int kDefaultNumThreadpoolThreads = 1; +} + +namespace onert +{ +namespace backend +{ +namespace cpu +{ + +class ExternalContext : public IExternalContext +{ +public: + ExternalContext() : _ruy_context(new ruy::Context) + { + setMaxNumThreads(onert::util::getConfigInt(onert::util::config::RUY_THREADS)); +#ifdef USE_RUY_GEMV + _ruy_context->cache_policy = ruy::kCacheLHSOnNarrowMul; +#endif + } + + void setMaxNumThreads(int max_num_threads) + { + const int target_num_threads = + max_num_threads > -1 ? max_num_threads : kDefaultNumThreadpoolThreads; + _ruy_context->max_num_threads = target_num_threads; + } + + ruy::Context *ruy_context() const { return _ruy_context.get(); } + +private: + const std::unique_ptr<ruy::Context> _ruy_context; +}; + +} // namespace cpu +} // namespace backend +} // namespace onert + +#endif // __ONERT_BACKEND_CPU_EXTERNAL_CONTEXT_H__ diff --git a/runtime/onert/backend/cpu/KernelGenerator.cc b/runtime/onert/backend/cpu/KernelGenerator.cc index 72f960675..7939fe894 100644 --- a/runtime/onert/backend/cpu/KernelGenerator.cc +++ b/runtime/onert/backend/cpu/KernelGenerator.cc @@ -20,6 +20,7 @@ #include "ops/AddLayer.h" #include "ops/ArgMinMaxLayer.h" #include "ops/AvgPoolLayer.h" +#include "ops/BatchToSpaceNDLayer.h" #include "ops/CastLayer.h" #include "ops/CompareLayer.h" #include "ops/ConcatLayer.h" @@ -49,7 +50,9 @@ #include "ops/RangeLayer.h" #include "ops/ReduceLayer.h" #include "ops/ReLULayer.h" +#include "ops/ReLU6Layer.h" #include "ops/ReshapeLayer.h" +#include "ops/ResizeBilinearLayer.h" #include "ops/ReverseLayer.h" #include "ops/RoundLayer.h" #include "ops/RsqrtLayer.h" @@ -60,7 +63,9 @@ #include "ops/SoftMaxLayer.h" #include "ops/StridedSliceLayer.h" #include "ops/SpaceToBatchNDLayer.h" +#include "ops/SpaceToDepthLayer.h" #include "ops/SplitLayer.h" +#include "ops/SplitVLayer.h" #include "ops/SubLayer.h" #include "ops/TanhLayer.h" #include "ops/TileLayer.h" @@ -70,11 +75,14 @@ #include "ops/ZerosLikeLayer.h" #include "ops/SquaredDiffLayer.h" #include "ops/LogicalOrLayer.h" +#include "ops/L2NormLayer.h" #include "ops/MatrixBandPartLayer.h" #include "ops/BatchMatMulLayer.h" #include "ops/BroadcastToLayer.h" #include "ops/FusedBatchNormLayer.h" #include "ops/LogSoftMaxLayer.h" +#include "ops/QuantizeLayer.h" +#include "ops/StatelessRandomUniformLayer.h" #include <backend/Backend.h> #include <backend/IConfig.h> @@ -119,9 +127,11 @@ ops::ReduceType convertReduceType(ir::operation::Reduce::ReduceType reduce_type_ KernelGenerator::KernelGenerator( const ir::Operands &operands_ctx, const ir::Operations &operations_ctx, const std::shared_ptr<TensorBuilder> &tensor_builder, - const std::shared_ptr<backend::custom::IKernelBuilder> &kernel_builder) + const std::shared_ptr<backend::custom::IKernelBuilder> &kernel_builder, + const std::shared_ptr<ExternalContext> &external_context) : _ctx(operands_ctx), _operations_ctx{operations_ctx}, _tensor_builder(tensor_builder), - _kernel_builder(kernel_builder), _current_op_seq_layout(ir::Layout::UNKNOWN) + _kernel_builder(kernel_builder), _current_op_seq_layout(ir::Layout::UNKNOWN), + _external_context(external_context) { // DO NOTHING } @@ -184,10 +194,10 @@ void KernelGenerator::visit(const ir::operation::Conv2D &node) const auto ker_index{node.getInputs().at(Conv2D::Input::KERNEL)}; const auto 
bias_index{node.getInputs().at(Conv2D::Input::BIAS)}; - auto ofm_alloc = _tensor_builder->portableAt(ofm_index).get(); - auto ifm_alloc = _tensor_builder->portableAt(ifm_index).get(); - auto ker_alloc = _tensor_builder->portableAt(ker_index).get(); - auto bias_alloc = _tensor_builder->portableAt(bias_index).get(); + auto ofm_tensor = _tensor_builder->portableAt(ofm_index).get(); + auto ifm_tensor = _tensor_builder->portableAt(ifm_index).get(); + auto ker_tensor = _tensor_builder->portableAt(ker_index).get(); + auto bias_tensor = _tensor_builder->portableAt(bias_index).get(); const auto stride = node.param().stride; const auto activation = node.param().activation; @@ -196,9 +206,9 @@ void KernelGenerator::visit(const ir::operation::Conv2D &node) if (_ctx.at(ifm_index).info().isDynamic() || _ctx.at(ker_index).info().isDynamic()) { - fn->configure(ifm_alloc, ker_alloc, bias_alloc, param_padding.type, param_padding.param.left, + fn->configure(ifm_tensor, ker_tensor, bias_tensor, param_padding.type, param_padding.param.left, param_padding.param.right, param_padding.param.top, param_padding.param.bottom, - stride.horizontal, stride.vertical, activation, ofm_alloc); + stride.horizontal, stride.vertical, activation, ofm_tensor); _return_fn = std::move(fn); return; @@ -213,9 +223,9 @@ void KernelGenerator::visit(const ir::operation::Conv2D &node) const auto padding = ir::calculatePadding(param_padding, ifm_shape, ofm_shape, stride, ker_width, ker_height); - fn->configure(ifm_alloc, ker_alloc, bias_alloc, param_padding.type, padding.left, padding.right, - padding.top, padding.bottom, stride.horizontal, stride.vertical, activation, - ofm_alloc); + fn->configure(ifm_tensor, ker_tensor, bias_tensor, param_padding.type, padding.left, + padding.right, padding.top, padding.bottom, stride.horizontal, stride.vertical, + activation, ofm_tensor); _return_fn = std::move(fn); } @@ -241,16 +251,16 @@ void KernelGenerator::visit(const ir::operation::DepthwiseConv2D &node) const auto multiplier = node.param().multiplier; const auto activation = node.param().activation; - auto ofm_alloc = _tensor_builder->portableAt(ofm_index).get(); - auto ifm_alloc = _tensor_builder->portableAt(ifm_index).get(); - auto ker_alloc = _tensor_builder->portableAt(ker_index).get(); - auto bias_alloc = _tensor_builder->portableAt(bias_index).get(); + auto ofm_tensor = _tensor_builder->portableAt(ofm_index).get(); + auto ifm_tensor = _tensor_builder->portableAt(ifm_index).get(); + auto ker_tensor = _tensor_builder->portableAt(ker_index).get(); + auto bias_tensor = _tensor_builder->portableAt(bias_index).get(); auto fn = std::make_unique<ops::DepthwiseConvolutionLayer>(); - fn->configure(ifm_alloc, ker_alloc, bias_alloc, padding.left, padding.right, padding.top, + fn->configure(ifm_tensor, ker_tensor, bias_tensor, padding.left, padding.right, padding.top, padding.bottom, stride.horizontal, stride.vertical, multiplier, activation, - ofm_alloc); + ofm_tensor); _return_fn = std::move(fn); } @@ -270,13 +280,13 @@ void KernelGenerator::visit(const ir::operation::MaxPool2D &node) ir::calculatePadding(node.param().padding, ifm_shape, ofm_shape, stride, kw, kh); const auto activation = node.param().activation; - auto ofm_alloc = _tensor_builder->portableAt(ofm_index).get(); - auto ifm_alloc = _tensor_builder->portableAt(ifm_index).get(); + auto ofm_tensor = _tensor_builder->portableAt(ofm_index).get(); + auto ifm_tensor = _tensor_builder->portableAt(ifm_index).get(); auto fn = std::make_unique<ops::MaxPoolLayer>(); - fn->configure(ifm_alloc, 
padding.left, padding.right, padding.top, padding.bottom, - stride.horizontal, stride.vertical, kw, kh, activation, ofm_alloc); + fn->configure(ifm_tensor, padding.left, padding.right, padding.top, padding.bottom, + stride.horizontal, stride.vertical, kw, kh, activation, ofm_tensor); _return_fn = std::move(fn); } @@ -295,13 +305,13 @@ void KernelGenerator::visit(const ir::operation::AvgPool2D &node) ir::calculatePadding(node.param().padding, ifm_shape, ofm_shape, stride, kw, kh); const auto activation = node.param().activation; - auto ofm_alloc = _tensor_builder->portableAt(ofm_index).get(); - auto ifm_alloc = _tensor_builder->portableAt(ifm_index).get(); + auto ofm_tensor = _tensor_builder->portableAt(ofm_index).get(); + auto ifm_tensor = _tensor_builder->portableAt(ifm_index).get(); auto fn = std::make_unique<ops::AvgPoolLayer>(); - fn->configure(ifm_alloc, padding.left, padding.right, padding.top, padding.bottom, - stride.horizontal, stride.vertical, kw, kh, activation, ofm_alloc); + fn->configure(ifm_tensor, padding.left, padding.right, padding.top, padding.bottom, + stride.horizontal, stride.vertical, kw, kh, activation, ofm_tensor); _return_fn = std::move(fn); } @@ -313,7 +323,7 @@ void KernelGenerator::visit(const ir::operation::Concat &node) const auto rank = _ctx.at(ofm_index).shape().rank(); const auto axis = ops::getAxis(rank, node.param().axis, _current_op_seq_layout); - auto output_alloc = _tensor_builder->portableAt(ofm_index).get(); + auto output_tensor = _tensor_builder->portableAt(ofm_index).get(); std::vector<const IPortableTensor *> input_tensors; for (auto &ifm_idx : node.getInputs()) @@ -321,7 +331,33 @@ void KernelGenerator::visit(const ir::operation::Concat &node) auto fn = std::make_unique<ops::ConcatLayer>(); - fn->configure(input_tensors, axis, output_alloc); + fn->configure(input_tensors, axis, output_tensor); + + _return_fn = std::move(fn); +} + +void KernelGenerator::visit(const ir::operation::BatchToSpaceND &node) +{ + const auto output_index{node.getOutputs().at(0)}; + const auto input_index{node.getInputs().at(ir::operation::BatchToSpaceND::INPUT)}; + const auto block_size_index{node.getInputs().at(ir::operation::BatchToSpaceND::BLOCK_SIZE)}; + + auto output_tensor = _tensor_builder->portableAt(output_index).get(); + auto input_tensor = _tensor_builder->portableAt(input_index).get(); + auto block_size_tensor = _tensor_builder->portableAt(block_size_index).get(); + + auto fn = std::make_unique<ops::BatchToSpaceNDLayer>(); + + IPortableTensor *crops_tensor = nullptr; + const auto NNApiInputs = 2; + + if (node.getInputs().size() != NNApiInputs) + { + const auto crops_data_index{node.getInputs().at(ir::operation::BatchToSpaceND::CROPS_DATA)}; + crops_tensor = _tensor_builder->portableAt(crops_data_index).get(); + } + + fn->configure(input_tensor, output_tensor, block_size_tensor, crops_tensor); _return_fn = std::move(fn); } @@ -332,13 +368,13 @@ void KernelGenerator::visit(const ir::operation::Fill &node) const auto input_index{node.getInputs().at(ir::operation::Fill::Input::INPUT)}; const auto value_index{node.getInputs().at(ir::operation::Fill::Input::VALUE)}; - auto output_alloc = _tensor_builder->portableAt(output_index).get(); - auto input_alloc = _tensor_builder->portableAt(input_index).get(); - auto value_alloc = _tensor_builder->portableAt(value_index).get(); + auto output_tensor = _tensor_builder->portableAt(output_index).get(); + auto input_tensor = _tensor_builder->portableAt(input_index).get(); + auto value_tensor =
_tensor_builder->portableAt(value_index).get(); auto fn = std::make_unique<ops::FillLayer>(); - fn->configure(input_alloc, value_alloc, output_alloc); + fn->configure(input_tensor, value_tensor, output_tensor); _return_fn = std::move(fn); } @@ -353,15 +389,16 @@ void KernelGenerator::visit(const ir::operation::FullyConnected &node) const auto bias_index{node.getInputs().at(FullyConnected::Input::BIAS)}; const auto activation = node.param().activation; - auto output_alloc = _tensor_builder->portableAt(output_index).get(); - auto input_alloc = _tensor_builder->portableAt(input_index).get(); - auto weight_alloc = _tensor_builder->portableAt(weight_index).get(); - auto bias_alloc = + auto output_tensor = _tensor_builder->portableAt(output_index).get(); + auto input_tensor = _tensor_builder->portableAt(input_index).get(); + auto weight_tensor = _tensor_builder->portableAt(weight_index).get(); + auto bias_tensor = bias_index.undefined() ? nullptr : _tensor_builder->portableAt(bias_index).get(); auto fn = std::make_unique<ops::FullyConnectedLayer>(); - fn->configure(input_alloc, weight_alloc, bias_alloc, activation, output_alloc); + fn->configure(input_tensor, weight_tensor, bias_tensor, activation, output_tensor, + _external_context); _return_fn = std::move(fn); } @@ -371,21 +408,21 @@ void KernelGenerator::visit(const ir::operation::Reshape &node) const auto output_index{node.getOutputs().at(0)}; const auto input_index{node.getInputs().at(ir::operation::Reshape::Input::INPUT)}; - auto output_alloc = _tensor_builder->portableAt(output_index).get(); - auto input_alloc = _tensor_builder->portableAt(input_index).get(); + auto output_tensor = _tensor_builder->portableAt(output_index).get(); + auto input_tensor = _tensor_builder->portableAt(input_index).get(); // optional 2nd input - IPortableTensor *shape_alloc = nullptr; + IPortableTensor *shape_tensor = nullptr; if (node.getInputs().size() == 2) { const auto shape_index{node.getInputs().at(ir::operation::Reshape::Input::SHAPE)}; - shape_alloc = _tensor_builder->portableAt(shape_index).get(); + shape_tensor = _tensor_builder->portableAt(shape_index).get(); } auto fn = std::make_unique<ops::ReshapeLayer>(); - fn->configure(input_alloc, shape_alloc, output_alloc); + fn->configure(input_tensor, shape_tensor, output_tensor); _return_fn = std::move(fn); } @@ -394,13 +431,13 @@ void KernelGenerator::visit(const ir::operation::Squeeze &node) const auto output_index{node.getOutputs().at(0)}; const auto input_index{node.getInputs().at(ir::operation::Squeeze::Input::INPUT)}; - auto output_alloc = _tensor_builder->portableAt(output_index).get(); - auto input_alloc = _tensor_builder->portableAt(input_index).get(); + auto output_tensor = _tensor_builder->portableAt(output_index).get(); + auto input_tensor = _tensor_builder->portableAt(input_index).get(); // Squeeze can share same kernel with reshape auto fn = std::make_unique<ops::ReshapeLayer>(); - fn->configure(input_alloc, nullptr, output_alloc); + fn->configure(input_tensor, nullptr, output_tensor); _return_fn = std::move(fn); } @@ -412,12 +449,12 @@ void KernelGenerator::visit(const ir::operation::Softmax &node) const auto beta = node.param().beta; - auto output_alloc = _tensor_builder->portableAt(output_index).get(); - auto input_alloc = _tensor_builder->portableAt(input_index).get(); + auto output_tensor = _tensor_builder->portableAt(output_index).get(); + auto input_tensor = _tensor_builder->portableAt(input_index).get(); auto fn = std::make_unique<ops::SoftMaxLayer>(); - fn->configure(input_alloc, 
beta, output_alloc); + fn->configure(input_tensor, beta, output_tensor); _return_fn = std::move(fn); } @@ -430,13 +467,13 @@ void KernelGenerator::visit(const ir::operation::Add &node) const auto activation = node.param().activation; - auto ofm_alloc = _tensor_builder->portableAt(ofm_index).get(); - auto lhs_alloc = _tensor_builder->portableAt(lhs_index).get(); - auto rhs_alloc = _tensor_builder->portableAt(rhs_index).get(); + auto ofm_tensor = _tensor_builder->portableAt(ofm_index).get(); + auto lhs_tensor = _tensor_builder->portableAt(lhs_index).get(); + auto rhs_tensor = _tensor_builder->portableAt(rhs_index).get(); auto fn = std::make_unique<ops::AddLayer>(); - fn->configure(lhs_alloc, rhs_alloc, activation, ofm_alloc); + fn->configure(lhs_tensor, rhs_tensor, activation, ofm_tensor); _return_fn = std::move(fn); } @@ -447,15 +484,15 @@ void KernelGenerator::visit(const ir::operation::Comparison &node) const auto lhs_index{node.getInputs().at(ir::operation::Comparison::Input::INPUT0)}; const auto rhs_index{node.getInputs().at(ir::operation::Comparison::Input::INPUT1)}; - auto ofm_alloc = _tensor_builder->portableAt(ofm_index).get(); - auto lhs_alloc = _tensor_builder->portableAt(lhs_index).get(); - auto rhs_alloc = _tensor_builder->portableAt(rhs_index).get(); + auto ofm_tensor = _tensor_builder->portableAt(ofm_index).get(); + auto lhs_tensor = _tensor_builder->portableAt(lhs_index).get(); + auto rhs_tensor = _tensor_builder->portableAt(rhs_index).get(); auto comparison_type = node.param().comparison_type; auto fn = std::make_unique<ops::CompareLayer>(); - fn->configure(lhs_alloc, rhs_alloc, comparison_type, ofm_alloc); + fn->configure(lhs_tensor, rhs_tensor, comparison_type, ofm_tensor); _return_fn = std::move(fn); } @@ -466,11 +503,11 @@ void KernelGenerator::visit(const ir::operation::Gather &node) const auto input_index{node.getInputs().at(ir::operation::Gather::Input::INPUT)}; const auto indices_index{node.getInputs().at(ir::operation::Gather::Input::INDICES)}; - auto output_alloc = _tensor_builder->portableAt(output_index).get(); - auto input_alloc = _tensor_builder->portableAt(input_index).get(); - auto indices_alloc = _tensor_builder->portableAt(indices_index).get(); + auto output_tensor = _tensor_builder->portableAt(output_index).get(); + auto input_tensor = _tensor_builder->portableAt(input_index).get(); + auto indices_tensor = _tensor_builder->portableAt(indices_index).get(); - const auto backend_layout = output_alloc->layout(); + const auto backend_layout = output_tensor->layout(); UNUSED_RELEASE(backend_layout); // NOTE The frontend layout and backend layout must be the same for this operation. @@ -481,8 +518,8 @@ void KernelGenerator::visit(const ir::operation::Gather &node) // a model. For example, if a model in NHWC has this operation as output rank == 4, indices // rank == 2 and axis == 2, this operation should work as the axis W and C, but the axis W // and C are not sequential in NCHW. So the backend in NCHW cannot handle this case. 
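The layout NOTE above is worth unpacking: the gather axis is computed against the frontend layout, so the kernel can only reuse it when the backend tensors keep that same layout. A minimal sketch of why a simple axis remap would not be enough, assuming rank-4 tensors and the usual NHWC/NCHW permutation (the helper name is illustrative, not part of this patch):

```cpp
// Illustrative only, not part of the patch: remapping one NHWC axis to NCHW
// is a fixed permutation for rank-4 tensors.
//   NHWC: 0=N, 1=H, 2=W, 3=C   ->   NCHW: 0=N, 1=C, 2=H, 3=W
inline int nhwc_axis_to_nchw(int nhwc_axis)
{
  constexpr int permutation[4] = {0, 2, 3, 1}; // N->0, H->2, W->3, C->1
  return permutation[nhwc_axis];
}
// Gather with indices of rank >= 2 replaces the gathered axis with all of the
// index axes, which are consecutive in the frontend output. After the
// permutation above those axes need not be consecutive in NCHW (gathering on
// W lands results between W and C), so no single remapped axis value can
// describe the output, and the asserts below require identical layouts instead.
```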
- assert(backend_layout == input_alloc->layout()); - assert(backend_layout == indices_alloc->layout()); + assert(backend_layout == input_tensor->layout()); + assert(backend_layout == indices_tensor->layout()); const auto &input_shape = _ctx.at(input_index).shape(); UNUSED_RELEASE(input_shape); assert(input_shape.rank() < 4 || _current_op_seq_layout == backend_layout); @@ -492,7 +529,7 @@ void KernelGenerator::visit(const ir::operation::Gather &node) auto fn = std::make_unique<ops::GatherLayer>(); - fn->configure(input_alloc, indices_alloc, output_alloc, axis_value); + fn->configure(input_tensor, indices_tensor, output_tensor, axis_value); _return_fn = std::move(fn); } @@ -506,13 +543,13 @@ void KernelGenerator::visit(const ir::operation::Sub &node) const auto activation = node.param().activation; - auto ofm_alloc = _tensor_builder->portableAt(ofm_index).get(); - auto lhs_alloc = _tensor_builder->portableAt(lhs_index).get(); - auto rhs_alloc = _tensor_builder->portableAt(rhs_index).get(); + auto ofm_tensor = _tensor_builder->portableAt(ofm_index).get(); + auto lhs_tensor = _tensor_builder->portableAt(lhs_index).get(); + auto rhs_tensor = _tensor_builder->portableAt(rhs_index).get(); auto fn = std::make_unique<ops::SubLayer>(); - fn->configure(lhs_alloc, rhs_alloc, activation, ofm_alloc); + fn->configure(lhs_tensor, rhs_tensor, activation, ofm_tensor); _return_fn = std::move(fn); } @@ -526,13 +563,13 @@ void KernelGenerator::visit(const ir::operation::Mul &node) const auto activation = node.param().activation; - auto ofm_alloc = _tensor_builder->portableAt(ofm_index).get(); - auto lhs_alloc = _tensor_builder->portableAt(lhs_index).get(); - auto rhs_alloc = _tensor_builder->portableAt(rhs_index).get(); + auto ofm_tensor = _tensor_builder->portableAt(ofm_index).get(); + auto lhs_tensor = _tensor_builder->portableAt(lhs_index).get(); + auto rhs_tensor = _tensor_builder->portableAt(rhs_index).get(); auto fn = std::make_unique<ops::MulLayer>(); - fn->configure(lhs_alloc, rhs_alloc, activation, ofm_alloc); + fn->configure(lhs_tensor, rhs_tensor, activation, ofm_tensor); _return_fn = std::move(fn); } @@ -547,18 +584,18 @@ void KernelGenerator::visit(const ir::operation::OneHot &node) const auto axis = node.param().axis; - auto output_alloc = _tensor_builder->portableAt(output_index).get(); - auto indices_alloc = _tensor_builder->portableAt(indices_index).get(); - auto depth_alloc = _tensor_builder->portableAt(depth_index).get(); - auto onvalue_alloc = _tensor_builder->portableAt(onvalue_index).get(); - auto offvalue_alloc = _tensor_builder->portableAt(offvalue_index).get(); + auto output_tensor = _tensor_builder->portableAt(output_index).get(); + auto indices_tensor = _tensor_builder->portableAt(indices_index).get(); + auto depth_tensor = _tensor_builder->portableAt(depth_index).get(); + auto onvalue_tensor = _tensor_builder->portableAt(onvalue_index).get(); + auto offvalue_tensor = _tensor_builder->portableAt(offvalue_index).get(); - assert(indices_alloc->data_type() == OperandType::INT32); - assert(axis <= static_cast<int>(indices_alloc->num_dimensions())); + assert(indices_tensor->data_type() == OperandType::INT32); + assert(axis <= static_cast<int>(indices_tensor->num_dimensions())); auto fn = std::make_unique<ops::OneHotLayer>(); - fn->configure(indices_alloc, depth_alloc, onvalue_alloc, offvalue_alloc, output_alloc, axis); + fn->configure(indices_tensor, depth_tensor, onvalue_tensor, offvalue_tensor, output_tensor, axis); _return_fn = std::move(fn); } @@ -572,13 +609,13 @@ void 
KernelGenerator::visit(const ir::operation::Div &node) const auto activation = node.param().activation; - auto ofm_alloc = _tensor_builder->portableAt(ofm_index).get(); - auto lhs_alloc = _tensor_builder->portableAt(lhs_index).get(); - auto rhs_alloc = _tensor_builder->portableAt(rhs_index).get(); + auto ofm_tensor = _tensor_builder->portableAt(ofm_index).get(); + auto lhs_tensor = _tensor_builder->portableAt(lhs_index).get(); + auto rhs_tensor = _tensor_builder->portableAt(rhs_index).get(); auto fn = std::make_unique<ops::DivLayer>(); - fn->configure(lhs_alloc, rhs_alloc, activation, ofm_alloc); + fn->configure(lhs_tensor, rhs_tensor, activation, ofm_tensor); _return_fn = std::move(fn); } @@ -587,16 +624,16 @@ void KernelGenerator::visit(const ir::operation::Einsum &node) { const auto ofm_index{node.getOutputs().at(0)}; - auto output_alloc = _tensor_builder->portableAt(ofm_index).get(); - std::vector<const IPortableTensor *> input_allocs; + auto output_tensor = _tensor_builder->portableAt(ofm_index).get(); + std::vector<const IPortableTensor *> input_tensors; for (auto &ifm_idx : node.getInputs()) - input_allocs.emplace_back(_tensor_builder->portableAt(ifm_idx).get()); + input_tensors.emplace_back(_tensor_builder->portableAt(ifm_idx).get()); const auto equation = node.param().equation; auto fn = std::make_unique<ops::EinsumLayer>(); - fn->configure(input_allocs, equation, output_alloc); + fn->configure(input_tensors, equation, output_tensor); _return_fn = std::move(fn); } @@ -605,14 +642,14 @@ void KernelGenerator::visit(const ir::operation::Custom &node) { auto fill_op_info = [&](const ir::OperandIndexSequence &opSeq, std::vector<custom::TypeInfo> &types, - std::vector<std::shared_ptr<IPortableTensor>> &allocs) { + std::vector<std::shared_ptr<IPortableTensor>> &tensors) { for (auto &idx : opSeq) { const auto &operand = _ctx.at(idx); // TODO make sure using `_current_op_seq_layout` is correct for custom operations types.emplace_back(custom::TypeInfo{operand.shape(), operand.typeInfo().type()}); - auto in_alloc = _tensor_builder->portableAt(idx); - allocs.emplace_back(in_alloc); + auto in_tensor = _tensor_builder->portableAt(idx); + tensors.emplace_back(in_tensor); } }; @@ -634,12 +671,12 @@ void KernelGenerator::visit(const ir::operation::Exp &node) const auto output_index{node.getOutputs().at(0)}; const auto input_index{node.getInputs().at(ir::operation::Exp::Input::INPUT)}; - auto output_alloc = _tensor_builder->portableAt(output_index).get(); - auto input_alloc = _tensor_builder->portableAt(input_index).get(); + auto output_tensor = _tensor_builder->portableAt(output_index).get(); + auto input_tensor = _tensor_builder->portableAt(input_index).get(); auto fn = std::make_unique<ops::ExpLayer>(); - fn->configure(input_alloc, output_alloc); + fn->configure(input_tensor, output_tensor); _return_fn = std::move(fn); } @@ -650,13 +687,13 @@ void KernelGenerator::visit(const ir::operation::ExpandDims &node) const auto input_index{node.getInputs().at(ir::operation::ExpandDims::Input::INPUT)}; const auto axis_index{node.getInputs().at(ir::operation::ExpandDims::Input::AXIS)}; - auto output_alloc = _tensor_builder->portableAt(output_index).get(); - auto input_alloc = _tensor_builder->portableAt(input_index).get(); - auto axis_alloc = _tensor_builder->portableAt(axis_index).get(); + auto output_tensor = _tensor_builder->portableAt(output_index).get(); + auto input_tensor = _tensor_builder->portableAt(input_index).get(); + auto axis_tensor = _tensor_builder->portableAt(axis_index).get(); auto fn = 
std::make_unique<ops::ExpandDimsLayer>(); - fn->configure(input_alloc, axis_alloc, output_alloc); + fn->configure(input_tensor, axis_tensor, output_tensor); _return_fn = std::move(fn); } @@ -666,12 +703,12 @@ void KernelGenerator::visit(const ir::operation::Logistic &node) const auto output_index{node.getOutputs().at(0)}; const auto input_index{node.getInputs().at(ir::operation::Logistic::Input::INPUT)}; - auto output_alloc = _tensor_builder->portableAt(output_index).get(); - auto input_alloc = _tensor_builder->portableAt(input_index).get(); + auto output_tensor = _tensor_builder->portableAt(output_index).get(); + auto input_tensor = _tensor_builder->portableAt(input_index).get(); auto fn = std::make_unique<ops::LogisticLayer>(); - fn->configure(input_alloc, output_alloc); + fn->configure(input_tensor, output_tensor); _return_fn = std::move(fn); } @@ -681,12 +718,12 @@ void KernelGenerator::visit(const ir::operation::Tanh &node) const auto output_index{node.getOutputs().at(0)}; const auto input_index{node.getInputs().at(ir::operation::Tanh::Input::INPUT)}; - auto output_alloc = _tensor_builder->portableAt(output_index).get(); - auto input_alloc = _tensor_builder->portableAt(input_index).get(); + auto output_tensor = _tensor_builder->portableAt(output_index).get(); + auto input_tensor = _tensor_builder->portableAt(input_index).get(); auto fn = std::make_unique<ops::TanhLayer>(); - fn->configure(input_alloc, output_alloc); + fn->configure(input_tensor, output_tensor); _return_fn = std::move(fn); } @@ -700,7 +737,7 @@ void KernelGenerator::visit(const ir::operation::Pack &node) assert(-rank <= axis && axis < rank); - auto output_alloc = _tensor_builder->portableAt(ofm_index).get(); + auto output_tensor = _tensor_builder->portableAt(ofm_index).get(); std::vector<const IPortableTensor *> input_tensors; for (auto &ifm_idx : node.getInputs()) @@ -708,7 +745,7 @@ void KernelGenerator::visit(const ir::operation::Pack &node) auto fn = std::make_unique<ops::PackLayer>(); - fn->configure(input_tensors, axis, output_alloc); + fn->configure(input_tensors, axis, output_tensor); _return_fn = std::move(fn); } @@ -722,7 +759,7 @@ void KernelGenerator::visit(const ir::operation::Unpack &node) assert(rank == 0 || (-rank <= axis && axis < rank)); - auto input_alloc = _tensor_builder->portableAt(input_index).get(); + auto input_tensor = _tensor_builder->portableAt(input_index).get(); std::vector<IPortableTensor *> output_tensors; for (auto &output_idx : node.getOutputs()) @@ -732,7 +769,7 @@ void KernelGenerator::visit(const ir::operation::Unpack &node) uint32_t axis_resolved = (axis < 0 ? axis + rank : axis); - fn->configure(input_alloc, axis_resolved, node.param().num, output_tensors); + fn->configure(input_tensor, axis_resolved, node.param().num, output_tensors); _return_fn = std::move(fn); } @@ -751,8 +788,16 @@ void KernelGenerator::visit(const ir::operation::Pad &node) auto fn = std::make_unique<ops::PadLayer>(); - fn->configure(input, output, pad_base, pad_rank); + bool isPadV2 = node.getInputs().size() == 3; + const void *value = nullptr; + + if (isPadV2) + { + const auto value_index{node.getInputs().at(ir::operation::Pad::Input::VALUE)}; + value = reinterpret_cast<const void *>(_ctx.at(value_index).data()->base()); + } + fn->configure(input, output, pad_base, pad_rank, value); _return_fn = std::move(fn); } @@ -762,13 +807,13 @@ void KernelGenerator::visit(const ir::operation::Max &node) const auto lhs_index{node.getInputs().at(ir::operation::Max::Input::LHS)}; const auto rhs_index{node.getInputs().at(ir::operation::Max::Input::RHS)}; - auto ofm_alloc = _tensor_builder->portableAt(ofm_index).get(); - auto lhs_alloc = _tensor_builder->portableAt(lhs_index).get(); - auto rhs_alloc = _tensor_builder->portableAt(rhs_index).get(); + auto ofm_tensor = _tensor_builder->portableAt(ofm_index).get(); + auto lhs_tensor = _tensor_builder->portableAt(lhs_index).get(); + auto rhs_tensor = _tensor_builder->portableAt(rhs_index).get(); auto fn = std::make_unique<ops::MaxLayer>(); - fn->configure(lhs_alloc, rhs_alloc, ofm_alloc); + fn->configure(lhs_tensor, rhs_tensor, ofm_tensor); _return_fn = std::move(fn); } @@ -779,13 +824,13 @@ void KernelGenerator::visit(const ir::operation::Min &node) const auto lhs_index{node.getInputs().at(ir::operation::Min::Input::LHS)}; const auto rhs_index{node.getInputs().at(ir::operation::Min::Input::RHS)}; - auto ofm_alloc = _tensor_builder->portableAt(ofm_index).get(); - auto lhs_alloc = _tensor_builder->portableAt(lhs_index).get(); - auto rhs_alloc = _tensor_builder->portableAt(rhs_index).get(); + auto ofm_tensor = _tensor_builder->portableAt(ofm_index).get(); + auto lhs_tensor = _tensor_builder->portableAt(lhs_index).get(); + auto rhs_tensor = _tensor_builder->portableAt(rhs_index).get(); auto fn = std::make_unique<ops::MinLayer>(); - fn->configure(lhs_alloc, rhs_alloc, ofm_alloc); + fn->configure(lhs_tensor, rhs_tensor, ofm_tensor); _return_fn = std::move(fn); } @@ -795,12 +840,12 @@ void KernelGenerator::visit(const ir::operation::Cast &node) const auto ofm_index{node.getOutputs().at(0)}; const auto ifm_index{node.getInputs().at(ir::operation::Cast::Input::INPUT)}; - auto ofm_alloc = _tensor_builder->portableAt(ofm_index).get(); - auto ifm_alloc = _tensor_builder->portableAt(ifm_index).get(); + auto ofm_tensor = _tensor_builder->portableAt(ofm_index).get(); + auto ifm_tensor = _tensor_builder->portableAt(ifm_index).get(); auto fn = std::make_unique<ops::CastLayer>(); - fn->configure(ifm_alloc, ofm_alloc); + fn->configure(ifm_tensor, ofm_tensor); _return_fn = std::move(fn); } @@ -810,12 +855,12 @@ void KernelGenerator::visit(const ir::operation::Transpose &node) const auto output_index{node.getOutputs().at(0)}; const auto input_index{node.getInputs().at(ir::operation::Transpose::Input::INPUT)}; - auto output_alloc = _tensor_builder->portableAt(output_index).get(); - auto input_alloc = _tensor_builder->portableAt(input_index).get(); + auto output_tensor = _tensor_builder->portableAt(output_index).get(); + auto input_tensor = _tensor_builder->portableAt(input_index).get(); auto fn = std::make_unique<ops::TransposeLayer>(); - fn->configure(input_alloc, output_alloc, node.param().perm); + fn->configure(input_tensor, output_tensor, node.param().perm); _return_fn = std::move(fn); } @@ -827,15 +872,15 @@ void KernelGenerator::visit(const ir::operation::Reduce &node) const auto axes_index{node.getInputs().at(ir::operation::Reduce::Input::AXES)}; const auto keep_dims = node.param().keep_dims; - auto output_alloc = _tensor_builder->portableAt(output_index).get(); - auto
input_alloc = _tensor_builder->portableAt(input_index).get(); - auto axes_alloc = _tensor_builder->portableAt(axes_index).get(); + auto output_tensor = _tensor_builder->portableAt(output_index).get(); + auto input_tensor = _tensor_builder->portableAt(input_index).get(); + auto axes_tensor = _tensor_builder->portableAt(axes_index).get(); if (node.param().reduce_type == ir::operation::Reduce::ReduceType::MEAN) { auto fn = std::make_unique<ops::MeanLayer>(); - fn->configure(input_alloc, axes_alloc, output_alloc, keep_dims); + fn->configure(input_tensor, axes_tensor, output_tensor, keep_dims); _return_fn = std::move(fn); } @@ -844,7 +889,7 @@ void KernelGenerator::visit(const ir::operation::Reduce &node) auto fn = std::make_unique<ops::ReduceLayer>(); const auto reduce_type = convertReduceType(node.param().reduce_type); - fn->configure(input_alloc, axes_alloc, output_alloc, reduce_type, keep_dims); + fn->configure(input_tensor, axes_tensor, output_tensor, reduce_type, keep_dims); _return_fn = std::move(fn); } @@ -855,12 +900,27 @@ void KernelGenerator::visit(const ir::operation::ReLU &node) const auto output_index{node.getOutputs().at(0)}; const auto input_index{node.getInputs().at(0)}; - auto output_alloc = _tensor_builder->portableAt(output_index).get(); - auto input_alloc = _tensor_builder->portableAt(input_index).get(); + auto output_tensor = _tensor_builder->portableAt(output_index).get(); + auto input_tensor = _tensor_builder->portableAt(input_index).get(); auto fn = std::make_unique<ops::ReLULayer>(); - fn->configure(input_alloc, output_alloc); + fn->configure(input_tensor, output_tensor); + + _return_fn = std::move(fn); +} + +void KernelGenerator::visit(const ir::operation::ReLU6 &node) +{ + const auto output_index{node.getOutputs().at(0)}; + const auto input_index{node.getInputs().at(0)}; + + auto output_tensor = _tensor_builder->portableAt(output_index).get(); + auto input_tensor = _tensor_builder->portableAt(input_index).get(); + + auto fn = std::make_unique<ops::ReLU6Layer>(); + + fn->configure(input_tensor, output_tensor); _return_fn = std::move(fn); } @@ -872,14 +932,14 @@ void KernelGenerator::visit(const ir::operation::Select &node) const auto true_index{node.getInputs().at(ir::operation::Select::Input::INPUT_TRUE)}; const auto false_index{node.getInputs().at(ir::operation::Select::Input::INPUT_FALSE)}; - auto output_alloc = _tensor_builder->portableAt(output_index).get(); - auto condition_alloc = _tensor_builder->portableAt(condition_index).get(); - auto true_alloc = _tensor_builder->portableAt(true_index).get(); - auto false_alloc = _tensor_builder->portableAt(false_index).get(); + auto output_tensor = _tensor_builder->portableAt(output_index).get(); + auto condition_tensor = _tensor_builder->portableAt(condition_index).get(); + auto true_tensor = _tensor_builder->portableAt(true_index).get(); + auto false_tensor = _tensor_builder->portableAt(false_index).get(); auto fn = std::make_unique<ops::SelectLayer>(); - fn->configure(condition_alloc, true_alloc, false_alloc, output_alloc); + fn->configure(condition_tensor, true_tensor, false_tensor, output_tensor); _return_fn = std::move(fn); } @@ -891,14 +951,14 @@ void KernelGenerator::visit(const ir::operation::Slice &node) const auto begins_index{node.getInputs().at(ir::operation::Slice::Input::BEGINS)}; const auto sizes_index{node.getInputs().at(ir::operation::Slice::Input::SIZES)}; - auto output_alloc = _tensor_builder->portableAt(output_index).get(); - auto input_alloc = _tensor_builder->portableAt(input_index).get(); - auto 
begins_alloc = _tensor_builder->portableAt(begins_index).get(); - auto sizes_alloc = _tensor_builder->portableAt(sizes_index).get(); + auto output_tensor = _tensor_builder->portableAt(output_index).get(); + auto input_tensor = _tensor_builder->portableAt(input_index).get(); + auto begins_tensor = _tensor_builder->portableAt(begins_index).get(); + auto sizes_tensor = _tensor_builder->portableAt(sizes_index).get(); auto fn = std::make_unique<ops::SliceLayer>(); - fn->configure(input_alloc, begins_alloc, sizes_alloc, output_alloc); + fn->configure(input_tensor, begins_tensor, sizes_tensor, output_tensor); _return_fn = std::move(fn); } @@ -911,11 +971,11 @@ void KernelGenerator::visit(const ir::operation::StridedSlice &node) const auto ends_index{node.getInputs().at(ir::operation::StridedSlice::Input::ENDS)}; const auto strides_index{node.getInputs().at(ir::operation::StridedSlice::Input::STRIDES)}; - auto output_alloc = _tensor_builder->portableAt(output_index).get(); - auto input_alloc = _tensor_builder->portableAt(input_index).get(); - auto starts_alloc = _tensor_builder->portableAt(starts_index).get(); - auto ends_alloc = _tensor_builder->portableAt(ends_index).get(); - auto strides_alloc = _tensor_builder->portableAt(strides_index).get(); + auto output_tensor = _tensor_builder->portableAt(output_index).get(); + auto input_tensor = _tensor_builder->portableAt(input_index).get(); + auto starts_tensor = _tensor_builder->portableAt(starts_index).get(); + auto ends_tensor = _tensor_builder->portableAt(ends_index).get(); + auto strides_tensor = _tensor_builder->portableAt(strides_index).get(); auto begin_mask = node.param().begin_mask; auto end_mask = node.param().end_mask; @@ -923,7 +983,7 @@ void KernelGenerator::visit(const ir::operation::StridedSlice &node) auto fn = std::make_unique<ops::StridedSliceLayer>(); - fn->configure(input_alloc, starts_alloc, ends_alloc, strides_alloc, output_alloc, begin_mask, + fn->configure(input_tensor, starts_tensor, ends_tensor, strides_tensor, output_tensor, begin_mask, end_mask, shrink_axis_mask); _return_fn = std::move(fn); @@ -957,12 +1017,12 @@ void KernelGenerator::visit(const ir::operation::Abs &node) const auto ofm_index{node.getOutputs().at(0)}; const auto ifm_index{node.getInputs().at(ir::operation::Abs::Input::INPUT)}; - auto ofm_alloc = _tensor_builder->portableAt(ofm_index).get(); - auto ifm_alloc = _tensor_builder->portableAt(ifm_index).get(); + auto ofm_tensor = _tensor_builder->portableAt(ofm_index).get(); + auto ifm_tensor = _tensor_builder->portableAt(ifm_index).get(); auto fn = std::make_unique<ops::AbsLayer>(); - fn->configure(ifm_alloc, ofm_alloc); + fn->configure(ifm_tensor, ofm_tensor); _return_fn = std::move(fn); } @@ -972,12 +1032,12 @@ void KernelGenerator::visit(const ir::operation::Sin &node) const auto ofm_index{node.getOutputs().at(0)}; const auto ifm_index{node.getInputs().at(ir::operation::Sin::Input::INPUT)}; - auto ofm_alloc = _tensor_builder->portableAt(ofm_index).get(); - auto ifm_alloc = _tensor_builder->portableAt(ifm_index).get(); + auto ofm_tensor = _tensor_builder->portableAt(ofm_index).get(); + auto ifm_tensor = _tensor_builder->portableAt(ifm_index).get(); auto fn = std::make_unique<ops::SinLayer>(); - fn->configure(ifm_alloc, ofm_alloc); + fn->configure(ifm_tensor, ofm_tensor); _return_fn = std::move(fn); } @@ -987,12 +1047,12 @@ void KernelGenerator::visit(const ir::operation::Cos &node) const auto ofm_index{node.getOutputs().at(0)}; const auto ifm_index{node.getInputs().at(ir::operation::Cos::Input::INPUT)}; - 
auto ofm_alloc = _tensor_builder->portableAt(ofm_index).get(); - auto ifm_alloc = _tensor_builder->portableAt(ifm_index).get(); + auto ofm_tensor = _tensor_builder->portableAt(ofm_index).get(); + auto ifm_tensor = _tensor_builder->portableAt(ifm_index).get(); auto fn = std::make_unique<ops::CosLayer>(); - fn->configure(ifm_alloc, ofm_alloc); + fn->configure(ifm_tensor, ofm_tensor); _return_fn = std::move(fn); } @@ -1002,12 +1062,12 @@ void KernelGenerator::visit(const ir::operation::RSQRT &node) const auto ofm_index{node.getOutputs().at(0)}; const auto ifm_index{node.getInputs().at(ir::operation::RSQRT::Input::INPUT)}; - auto ofm_alloc = _tensor_builder->portableAt(ofm_index).get(); - auto ifm_alloc = _tensor_builder->portableAt(ifm_index).get(); + auto ofm_tensor = _tensor_builder->portableAt(ofm_index).get(); + auto ifm_tensor = _tensor_builder->portableAt(ifm_index).get(); auto fn = std::make_unique<ops::RsqrtLayer>(); - fn->configure(ifm_alloc, ofm_alloc); + fn->configure(ifm_tensor, ofm_tensor); _return_fn = std::move(fn); } @@ -1017,12 +1077,33 @@ void KernelGenerator::visit(const ir::operation::Shape &node) const auto ofm_index{node.getOutputs().at(0)}; const auto ifm_index{node.getInputs().at(ir::operation::Shape::Input::INPUT)}; - auto ofm_alloc = _tensor_builder->portableAt(ofm_index).get(); - auto ifm_alloc = _tensor_builder->portableAt(ifm_index).get(); + auto ofm_tensor = _tensor_builder->portableAt(ofm_index).get(); + auto ifm_tensor = _tensor_builder->portableAt(ifm_index).get(); auto fn = std::make_unique<ops::ShapeLayer>(); - fn->configure(ifm_alloc, ofm_alloc); + fn->configure(ifm_tensor, ofm_tensor); + + _return_fn = std::move(fn); +} + +void KernelGenerator::visit(const ir::operation::ResizeBilinear &node) +{ + const auto output_index{node.getOutputs().at(0)}; + const auto input_index{node.getInputs().at(ir::operation::ResizeBilinear::INPUT)}; + + auto output_height = node.param().height_out; + auto output_width = node.param().width_out; + auto align_corners = node.param().align_corners; + auto half_pixel_centers = node.param().half_pixel_centers; + + auto output_tensor = _tensor_builder->portableAt(output_index).get(); + auto input_tensor = _tensor_builder->portableAt(input_index).get(); + + auto fn = std::make_unique<ops::ResizeBilinearLayer>(); + + fn->configure(input_tensor, output_tensor, output_height, output_width, align_corners, + half_pixel_centers); _return_fn = std::move(fn); } @@ -1033,13 +1114,13 @@ void KernelGenerator::visit(const ir::operation::Reverse &node) const auto input_index{node.getInputs().at(ir::operation::Reverse::INPUT)}; const auto axis_index{node.getInputs().at(ir::operation::Reverse::AXIS)}; - auto output_alloc = _tensor_builder->portableAt(output_index).get(); - auto input_alloc = _tensor_builder->portableAt(input_index).get(); - auto axis_alloc = _tensor_builder->portableAt(axis_index).get(); + auto output_tensor = _tensor_builder->portableAt(output_index).get(); + auto input_tensor = _tensor_builder->portableAt(input_index).get(); + auto axis_tensor = _tensor_builder->portableAt(axis_index).get(); auto fn = std::make_unique<ops::ReverseLayer>(); - fn->configure(input_alloc, axis_alloc, output_alloc); + fn->configure(input_tensor, axis_tensor, output_tensor); _return_fn = std::move(fn); } @@ -1049,12 +1130,12 @@ void KernelGenerator::visit(const ir::operation::Neg &node) const auto ofm_index{node.getOutputs().at(0)}; const auto ifm_index{node.getInputs().at(ir::operation::Neg::Input::INPUT)}; - auto ofm_alloc = 
_tensor_builder->portableAt(ofm_index).get(); - auto ifm_alloc = _tensor_builder->portableAt(ifm_index).get(); + auto ofm_tensor = _tensor_builder->portableAt(ofm_index).get(); + auto ifm_tensor = _tensor_builder->portableAt(ifm_index).get(); auto fn = std::make_unique<ops::NegLayer>(); - fn->configure(ifm_alloc, ofm_alloc); + fn->configure(ifm_tensor, ofm_tensor); _return_fn = std::move(fn); } @@ -1066,12 +1147,12 @@ void KernelGenerator::visit(const ir::operation::ArgMax &node) const auto axis = node.param().axis; - auto output_alloc = _tensor_builder->portableAt(output_index).get(); - auto input_alloc = _tensor_builder->portableAt(input_index).get(); + auto output_tensor = _tensor_builder->portableAt(output_index).get(); + auto input_tensor = _tensor_builder->portableAt(input_index).get(); auto fn = std::make_unique<ops::ArgMinMaxLayer>(); - fn->configure(input_alloc, output_alloc, axis, /* is_arg_max */ true); + fn->configure(input_tensor, output_tensor, axis, /* is_arg_max */ true); _return_fn = std::move(fn); } @@ -1082,13 +1163,13 @@ void KernelGenerator::visit(const ir::operation::Pow &node) const auto lhs_index{node.getInputs().at(ir::operation::Pow::LHS)}; const auto rhs_index{node.getInputs().at(ir::operation::Pow::RHS)}; - auto output_alloc = _tensor_builder->portableAt(output_index).get(); - auto lhs_alloc = _tensor_builder->portableAt(lhs_index).get(); - auto rhs_alloc = _tensor_builder->portableAt(rhs_index).get(); + auto output_tensor = _tensor_builder->portableAt(output_index).get(); + auto lhs_tensor = _tensor_builder->portableAt(lhs_index).get(); + auto rhs_tensor = _tensor_builder->portableAt(rhs_index).get(); auto fn = std::make_unique<ops::PowLayer>(); - fn->configure(lhs_alloc, rhs_alloc, ir::Activation::NONE, output_alloc); + fn->configure(lhs_tensor, rhs_tensor, ir::Activation::NONE, output_tensor); _return_fn = std::move(fn); } @@ -1098,12 +1179,12 @@ void KernelGenerator::visit(const ir::operation::Log &node) const auto ofm_index{node.getOutputs().at(0)}; const auto ifm_index{node.getInputs().at(ir::operation::Log::Input::INPUT)}; - auto ofm_alloc = _tensor_builder->portableAt(ofm_index).get(); - auto ifm_alloc = _tensor_builder->portableAt(ifm_index).get(); + auto ofm_tensor = _tensor_builder->portableAt(ofm_index).get(); + auto ifm_tensor = _tensor_builder->portableAt(ifm_index).get(); auto fn = std::make_unique<ops::LogLayer>(); - fn->configure(ifm_alloc, ofm_alloc); + fn->configure(ifm_tensor, ofm_tensor); _return_fn = std::move(fn); } @@ -1113,12 +1194,12 @@ void KernelGenerator::visit(const ir::operation::Round &node) const auto output_index{node.getOutputs().at(0)}; const auto input_index{node.getInputs().at(ir::operation::Round::INPUT)}; - auto output_alloc = _tensor_builder->portableAt(output_index).get(); - auto input_alloc = _tensor_builder->portableAt(input_index).get(); + auto output_tensor = _tensor_builder->portableAt(output_index).get(); + auto input_tensor = _tensor_builder->portableAt(input_index).get(); auto fn = std::make_unique<ops::RoundLayer>(); - fn->configure(input_alloc, output_alloc); + fn->configure(input_tensor, output_tensor); _return_fn = std::move(fn); } @@ -1128,12 +1209,12 @@ void KernelGenerator::visit(const ir::operation::LogicalNot &node) const auto output_index{node.getOutputs().at(0)}; const auto input_index{node.getInputs().at(ir::operation::LogicalNot::INPUT)}; - auto output_alloc = _tensor_builder->portableAt(output_index).get(); - auto input_alloc = _tensor_builder->portableAt(input_index).get(); + auto output_tensor = 
_tensor_builder->portableAt(output_index).get(); + auto input_tensor = _tensor_builder->portableAt(input_index).get(); auto fn = std::make_unique<ops::LogicalNotLayer>(); - fn->configure(input_alloc, output_alloc); + fn->configure(input_tensor, output_tensor); _return_fn = std::move(fn); } @@ -1144,28 +1225,43 @@ void KernelGenerator::visit(const ir::operation::LogicalOr &node) const auto lhs_index{node.getInputs().at(0)}; const auto rhs_index{node.getInputs().at(1)}; - auto ofm_alloc = _tensor_builder->portableAt(ofm_index).get(); - auto lhs_alloc = _tensor_builder->portableAt(lhs_index).get(); - auto rhs_alloc = _tensor_builder->portableAt(rhs_index).get(); + auto ofm_tensor = _tensor_builder->portableAt(ofm_index).get(); + auto lhs_tensor = _tensor_builder->portableAt(lhs_index).get(); + auto rhs_tensor = _tensor_builder->portableAt(rhs_index).get(); auto fn = std::make_unique<ops::LogicalOrLayer>(); - fn->configure(lhs_alloc, rhs_alloc, ofm_alloc); + fn->configure(lhs_tensor, rhs_tensor, ofm_tensor); _return_fn = std::move(fn); } -void KernelGenerator::visit(const ir::operation::ZerosLike &node) +void KernelGenerator::visit(const ir::operation::L2Normalization &node) { const auto output_index{node.getOutputs().at(0)}; - const auto input_index{node.getInputs().at(ir::operation::ZerosLike::INPUT)}; + const auto input_index{node.getInputs().at(0)}; auto output_alloc = _tensor_builder->portableAt(output_index).get(); auto input_alloc = _tensor_builder->portableAt(input_index).get(); - auto fn = std::make_unique<ops::ZerosLikeLayer>(); + auto fn = std::make_unique<ops::L2NormLayer>(); fn->configure(input_alloc, output_alloc); + + _return_fn = std::move(fn); +} + +void KernelGenerator::visit(const ir::operation::ZerosLike &node) +{ + const auto output_index{node.getOutputs().at(0)}; + const auto input_index{node.getInputs().at(ir::operation::ZerosLike::INPUT)}; + + auto output_tensor = _tensor_builder->portableAt(output_index).get(); + auto input_tensor = _tensor_builder->portableAt(input_index).get(); + + auto fn = std::make_unique<ops::ZerosLikeLayer>(); + + fn->configure(input_tensor, output_tensor); _return_fn = std::move(fn); } @@ -1176,14 +1272,14 @@ void KernelGenerator::visit(const ir::operation::Range &node) const auto limit_index{node.getInputs().at(ir::operation::Range::LIMIT)}; const auto delta_index{node.getInputs().at(ir::operation::Range::DELTA)}; - auto output_alloc = _tensor_builder->portableAt(output_index).get(); - auto start_alloc = _tensor_builder->portableAt(start_index).get(); - auto limit_alloc = _tensor_builder->portableAt(limit_index).get(); - auto delta_alloc = _tensor_builder->portableAt(delta_index).get(); + auto output_tensor = _tensor_builder->portableAt(output_index).get(); + auto start_tensor = _tensor_builder->portableAt(start_index).get(); + auto limit_tensor = _tensor_builder->portableAt(limit_index).get(); + auto delta_tensor = _tensor_builder->portableAt(delta_index).get(); auto fn = std::make_unique<ops::RangeLayer>(); - fn->configure(start_alloc, limit_alloc, delta_alloc, output_alloc); + fn->configure(start_tensor, limit_tensor, delta_tensor, output_tensor); _return_fn = std::move(fn); } @@ -1193,13 +1289,13 @@ void KernelGenerator::visit(const ir::operation::SquaredDifference &node) const auto lhs_index{node.getInputs().at(ir::operation::SquaredDifference::Input::LHS)}; const auto rhs_index{node.getInputs().at(ir::operation::SquaredDifference::Input::RHS)}; - auto ofm_alloc = _tensor_builder->portableAt(ofm_index).get(); - auto lhs_alloc = 
_tensor_builder->portableAt(lhs_index).get(); - auto rhs_alloc = _tensor_builder->portableAt(rhs_index).get(); + auto ofm_tensor = _tensor_builder->portableAt(ofm_index).get(); + auto lhs_tensor = _tensor_builder->portableAt(lhs_index).get(); + auto rhs_tensor = _tensor_builder->portableAt(rhs_index).get(); auto fn = std::make_unique<ops::SqDiffLayer>(); - fn->configure(lhs_alloc, rhs_alloc, ofm_alloc); + fn->configure(lhs_tensor, rhs_tensor, ofm_tensor); _return_fn = std::move(fn); } @@ -1209,13 +1305,13 @@ void KernelGenerator::visit(const ir::operation::Tile &node) const auto input_index{node.getInputs().at(ir::operation::Tile::INPUT)}; const auto multiples_index{node.getInputs().at(ir::operation::Tile::MULTIPLES)}; - auto output_alloc = _tensor_builder->portableAt(output_index).get(); - auto input_alloc = _tensor_builder->portableAt(input_index).get(); - auto multiples_alloc = _tensor_builder->portableAt(multiples_index).get(); + auto output_tensor = _tensor_builder->portableAt(output_index).get(); + auto input_tensor = _tensor_builder->portableAt(input_index).get(); + auto multiples_tensor = _tensor_builder->portableAt(multiples_index).get(); auto fn = std::make_unique<ops::TileLayer>(); - fn->configure(input_alloc, multiples_alloc, output_alloc); + fn->configure(input_tensor, multiples_tensor, output_tensor); _return_fn = std::move(fn); } @@ -1226,14 +1322,14 @@ void KernelGenerator::visit(const ir::operation::MatrixBandPart &node) const auto num_lower_index{node.getInputs().at(ir::operation::MatrixBandPart::NUM_LOWER_DIAG)}; const auto num_upper_index{node.getInputs().at(ir::operation::MatrixBandPart::NUM_UPPER_DIAG)}; - auto output_alloc = _tensor_builder->portableAt(output_index).get(); - auto input_alloc = _tensor_builder->portableAt(input_index).get(); - auto num_lower_alloc = _tensor_builder->portableAt(num_lower_index).get(); - auto num_upper_alloc = _tensor_builder->portableAt(num_upper_index).get(); + auto output_tensor = _tensor_builder->portableAt(output_index).get(); + auto input_tensor = _tensor_builder->portableAt(input_index).get(); + auto num_lower_tensor = _tensor_builder->portableAt(num_lower_index).get(); + auto num_upper_tensor = _tensor_builder->portableAt(num_upper_index).get(); auto fn = std::make_unique<ops::MatrixBandPartLayer>(); - fn->configure(input_alloc, num_lower_alloc, num_upper_alloc, output_alloc); + fn->configure(input_tensor, num_lower_tensor, num_upper_tensor, output_tensor); _return_fn = std::move(fn); } @@ -1243,16 +1339,16 @@ void KernelGenerator::visit(const ir::operation::BatchMatMul &node) const auto lhs_index{node.getInputs().at(ir::operation::BatchMatMul::LHS)}; const auto rhs_index{node.getInputs().at(ir::operation::BatchMatMul::RHS)}; - auto output_alloc = _tensor_builder->portableAt(output_index).get(); - auto lhs_alloc = _tensor_builder->portableAt(lhs_index).get(); - auto rhs_alloc = _tensor_builder->portableAt(rhs_index).get(); + auto output_tensor = _tensor_builder->portableAt(output_index).get(); + auto lhs_tensor = _tensor_builder->portableAt(lhs_index).get(); + auto rhs_tensor = _tensor_builder->portableAt(rhs_index).get(); const auto adj_x = node.param().adj_x; const auto adj_y = node.param().adj_y; auto fn = std::make_unique<ops::BatchMatMulLayer>(); - fn->configure(lhs_alloc, rhs_alloc, adj_x, adj_y, output_alloc); + fn->configure(lhs_tensor, rhs_tensor, adj_x, adj_y, output_tensor); _return_fn = std::move(fn); } @@ -1262,13 +1358,13 @@ void KernelGenerator::visit(const ir::operation::BroadcastTo &node) const auto 
input_index{node.getInputs().at(ir::operation::BroadcastTo::INPUT)}; const auto shape_index{node.getInputs().at(ir::operation::BroadcastTo::SHAPE)}; - auto output_alloc = _tensor_builder->portableAt(output_index).get(); - auto input_alloc = _tensor_builder->portableAt(input_index).get(); - auto shape_alloc = _tensor_builder->portableAt(shape_index).get(); + auto output_tensor = _tensor_builder->portableAt(output_index).get(); + auto input_tensor = _tensor_builder->portableAt(input_index).get(); + auto shape_tensor = _tensor_builder->portableAt(shape_index).get(); auto fn = std::make_unique<ops::BroadcastToLayer>(); - fn->configure(input_alloc, shape_alloc, output_alloc); + fn->configure(input_tensor, shape_tensor, output_tensor); _return_fn = std::move(fn); } @@ -1277,10 +1373,10 @@ void KernelGenerator::visit(const ir::operation::FusedBatchNorm &node) { const auto ofm_index{node.getOutputs().at(0)}; - auto output_alloc = _tensor_builder->portableAt(ofm_index).get(); - std::vector<const IPortableTensor *> input_allocs; + auto output_tensor = _tensor_builder->portableAt(ofm_index).get(); + std::vector<const IPortableTensor *> input_tensors; for (auto &ifm_idx : node.getInputs()) - input_allocs.emplace_back(_tensor_builder->portableAt(ifm_idx).get()); + input_tensors.emplace_back(_tensor_builder->portableAt(ifm_idx).get()); const auto epsilon = node.param().epsilon; const auto is_training = node.param().is_training; @@ -1288,7 +1384,7 @@ void KernelGenerator::visit(const ir::operation::FusedBatchNorm &node) auto fn = std::make_unique<ops::FusedBatchNormLayer>(); - fn->configure(input_allocs, epsilon, is_training, data_format, output_alloc); + fn->configure(input_tensors, epsilon, is_training, data_format, output_tensor); _return_fn = std::move(fn); } @@ -1301,12 +1397,12 @@ void KernelGenerator::visit(const ir::operation::LogSoftmax &node) const auto beta = node.param().beta; const auto axis = node.param().axis; - auto output_alloc = _tensor_builder->at(output_index).get(); - auto input_alloc = _tensor_builder->at(input_index).get(); + auto output_tensor = _tensor_builder->portableAt(output_index).get(); + auto input_tensor = _tensor_builder->portableAt(input_index).get(); auto fn = std::make_unique<ops::LogSoftMaxLayer>(); - fn->configure(input_alloc, beta, axis, output_alloc); + fn->configure(input_tensor, beta, axis, output_tensor); _return_fn = std::move(fn); } @@ -1318,14 +1414,84 @@ void KernelGenerator::visit(const ir::operation::SpaceToBatchND &node) const auto block_shape_index{node.getInputs().at(ir::operation::SpaceToBatchND::BLOCK_SIZE)}; const auto padding_index{node.getInputs().at(ir::operation::SpaceToBatchND::PADDINGS)}; - auto output_alloc = _tensor_builder->portableAt(output_index).get(); - auto input_alloc = _tensor_builder->portableAt(input_index).get(); - auto block_shape_alloc = _tensor_builder->portableAt(block_shape_index).get(); - auto padding_alloc = _tensor_builder->portableAt(padding_index).get(); + auto output_tensor = _tensor_builder->portableAt(output_index).get(); + auto input_tensor = _tensor_builder->portableAt(input_index).get(); + auto block_shape_tensor = _tensor_builder->portableAt(block_shape_index).get(); + auto padding_tensor = _tensor_builder->portableAt(padding_index).get(); auto fn = std::make_unique<ops::SpaceToBatchNDLayer>(); - fn->configure(input_alloc, block_shape_alloc, padding_alloc, output_alloc); + fn->configure(input_tensor, block_shape_tensor, padding_tensor, output_tensor); + + _return_fn = std::move(fn); +} + +void 
KernelGenerator::visit(const ir::operation::Quantize &node) +{ + const auto input_index{node.getInputs().at(ir::operation::Quantize::Input::INPUT)}; + const auto output_index{node.getOutputs().at(0)}; + + auto input_tensor = _tensor_builder->portableAt(input_index).get(); + auto output_tensor = _tensor_builder->portableAt(output_index).get(); + + auto fn = std::make_unique<ops::QuantizeLayer>(); + + fn->configure(input_tensor, output_tensor); + + _return_fn = std::move(fn); +} + +void KernelGenerator::visit(const ir::operation::SpaceToDepth &node) +{ + const auto input_index{node.getInputs().at(ir::operation::SpaceToDepth::Input::INPUT)}; + const auto output_index{node.getOutputs().at(0)}; + auto block_size = node.param().block_size; + + auto input_tensor = _tensor_builder->portableAt(input_index).get(); + auto output_tensor = _tensor_builder->portableAt(output_index).get(); + + auto fn = std::make_unique<ops::SpaceToDepthLayer>(); + + fn->configure(input_tensor, block_size, output_tensor); + _return_fn = std::move(fn); +} + +void KernelGenerator::visit(const ir::operation::StatelessRandomUniform &node) +{ + const auto output_index{node.getOutputs().at(0)}; + const auto shape_index{node.getInputs().at(ir::operation::StatelessRandomUniform::SHAPE)}; + const auto seed_index{node.getInputs().at(ir::operation::StatelessRandomUniform::SEED)}; + + auto output_tensor = _tensor_builder->portableAt(output_index).get(); + auto shape_tensor = _tensor_builder->portableAt(shape_index).get(); + auto seed_tensor = _tensor_builder->portableAt(seed_index).get(); + + auto fn = std::make_unique<ops::StatelessRandomUniformLayer>(); + + fn->configure(shape_tensor, seed_tensor, output_tensor); + _return_fn = std::move(fn); +} + +void KernelGenerator::visit(const ir::operation::SplitV &node) +{ + const auto num_splits = node.param().num_splits; + assert(num_splits == static_cast<int>(node.getOutputs().size())); + + const auto input_idx{node.getInputs().at(ir::operation::SplitV::Input::INPUT)}; + const auto size_splits{node.getInputs().at(ir::operation::SplitV::Input::SIZE_SPLITS)}; + const auto split_dim{node.getInputs().at(ir::operation::SplitV::Input::SPLIT_DIM)}; + + auto in_tensor = _tensor_builder->portableAt(input_idx).get(); + auto in_size_splits = _tensor_builder->portableAt(size_splits).get(); + auto in_split_dim = _tensor_builder->portableAt(split_dim).get(); + + std::vector<IPortableTensor *> out_tensors; + for (auto &output_idx : node.getOutputs()) + out_tensors.emplace_back(_tensor_builder->portableAt(output_idx).get()); + + auto fn = std::make_unique<ops::SplitVLayer>(); + + fn->configure(in_tensor, in_size_splits, in_split_dim, num_splits, out_tensors); _return_fn = std::move(fn); } diff --git a/runtime/onert/backend/cpu/KernelGenerator.h b/runtime/onert/backend/cpu/KernelGenerator.h index d6f4c2825..40c056a96 100644 --- a/runtime/onert/backend/cpu/KernelGenerator.h +++ b/runtime/onert/backend/cpu/KernelGenerator.h @@ -17,6 +17,7 @@ #ifndef __ONERT_BACKEND_CPU_KERNEL_GENERATOR_H__ #define __ONERT_BACKEND_CPU_KERNEL_GENERATOR_H__ +#include "ExternalContext.h" #include "TensorBuilder.h" #include "Tensor.h" @@ -37,7 +38,8 @@ class KernelGenerator : public IKernelGenerator public: KernelGenerator(const ir::Operands &operands_ctx, const ir::Operations &operations_ctx, const std::shared_ptr<TensorBuilder> &tensor_builder, - const std::shared_ptr<custom::IKernelBuilder> &kernel_builder); + const std::shared_ptr<custom::IKernelBuilder> &kernel_builder, + const std::shared_ptr<ExternalContext>
&external_context); using IKernelGenerator::visit; @@ -74,6 +76,7 @@ public: void visit(const ir::operation::Transpose &) override; void visit(const ir::operation::Reduce &) override; void visit(const ir::operation::ReLU &) override; + void visit(const ir::operation::ReLU6 &) override; void visit(const ir::operation::Select &) override; void visit(const ir::operation::Slice &) override; void visit(const ir::operation::StridedSlice &) override; @@ -83,6 +86,7 @@ public: void visit(const ir::operation::Sin &) override; void visit(const ir::operation::RSQRT &) override; void visit(const ir::operation::Shape &) override; + void visit(const ir::operation::ResizeBilinear &node) override; void visit(const ir::operation::Reverse &) override; void visit(const ir::operation::Neg &) override; void visit(const ir::operation::ArgMax &) override; @@ -94,13 +98,19 @@ public: void visit(const ir::operation::SquaredDifference &) override; void visit(const ir::operation::Tile &) override; void visit(const ir::operation::LogicalOr &) override; + void visit(const ir::operation::L2Normalization &) override; void visit(const ir::operation::Range &) override; void visit(const ir::operation::MatrixBandPart &) override; void visit(const ir::operation::BatchMatMul &) override; + void visit(const ir::operation::BatchToSpaceND &) override; void visit(const ir::operation::BroadcastTo &) override; void visit(const ir::operation::FusedBatchNorm &) override; void visit(const ir::operation::LogSoftmax &) override; void visit(const ir::operation::SpaceToBatchND &) override; + void visit(const ir::operation::Quantize &) override; + void visit(const ir::operation::SpaceToDepth &) override; + void visit(const ir::operation::StatelessRandomUniform &) override; + void visit(const ir::operation::SplitV &) override; private: const ir::Operands &_ctx; @@ -108,6 +118,7 @@ private: std::shared_ptr<TensorBuilder> _tensor_builder; std::shared_ptr<backend::custom::IKernelBuilder> _kernel_builder; ir::Layout _current_op_seq_layout; + const std::shared_ptr<ExternalContext> _external_context; }; } // namespace cpu diff --git a/runtime/onert/backend/cpu/StaticTensorManager.cc b/runtime/onert/backend/cpu/StaticTensorManager.cc new file mode 100644 index 000000000..78c98dabf --- /dev/null +++ b/runtime/onert/backend/cpu/StaticTensorManager.cc @@ -0,0 +1,106 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "StaticTensorManager.h" +#include "Tensor.h" + +#include <util/logging.h> + +namespace onert +{ +namespace backend +{ +namespace cpu +{ + +StaticTensorManager::StaticTensorManager(const std::shared_ptr<cpu_common::TensorRegistry> ®, + cpu_common::DynamicTensorManager *dynamic_tensor_manager) + : _nonconst_mgr{new cpu_common::MemoryManager()}, _tensors{reg}, + _dynamic_tensor_manager{dynamic_tensor_manager} +{ + // DO NOTHING +} + +void StaticTensorManager::allocateNonconsts(void) +{ + _nonconst_mgr->allocate(); + + for (auto &pair : _tensors->native_tensors()) + { + const auto &ind = pair.first; + auto tensor = pair.second; + if (!_as_constants[ind] && !tensor->is_dynamic()) + { + auto *buffer = _nonconst_mgr->getBuffer(ind); + tensor->setBuffer(buffer); + + VERBOSE(CPU_StaticTensorManager) << "TENSOR(#" << ind.value() + << "): " << static_cast<void *>(buffer) << std::endl; + } + } +} + +void StaticTensorManager::deallocateNonconsts(void) { _nonconst_mgr->deallocate(); } + +void StaticTensorManager::buildTensor(const ir::OperandIndex &ind, + const ir::OperandInfo &tensor_info, ir::Layout backend_layout, + bool as_const) +{ + assert(!_tensors->getITensor(ind)); + if (as_const) + { + auto tensor = std::make_shared<ExternalTensor>(tensor_info, backend_layout); + _tensors->setNativeTensor(ind, tensor); + } + else + { + auto tensor = std::make_shared<Tensor>(tensor_info, backend_layout, _dynamic_tensor_manager); + _tensors->setNativeTensor(ind, tensor); + } + _as_constants[ind] = as_const; +} + +void StaticTensorManager::claimPlan(const ir::OperandIndex &ind, uint32_t size) +{ + assert(_tensors->getITensor(ind)); + + // This method is called only when a tensor has proper shape + assert(!_tensors->getITensor(ind)->is_dynamic()); + + if (!_as_constants[ind]) + _nonconst_mgr->claimPlan(ind, size); +} + +void StaticTensorManager::releasePlan(const ir::OperandIndex &ind) +{ + assert(_tensors->getITensor(ind)); + + // This method is called only when a tensor has proper shape + assert(!_tensors->getITensor(ind)->is_dynamic()); + + if (!_as_constants[ind]) + _nonconst_mgr->releasePlan(ind); +} + +void StaticTensorManager::iterate(const std::function<void(const ir::OperandIndex &)> &fn) +{ + for (const auto &it : _tensors->native_tensors()) + fn(it.first); +} + +} // namespace cpu +} // namespace backend +} // namespace onert diff --git a/runtime/onert/backend/cpu/StaticTensorManager.h b/runtime/onert/backend/cpu/StaticTensorManager.h new file mode 100644 index 000000000..2af61e4e7 --- /dev/null +++ b/runtime/onert/backend/cpu/StaticTensorManager.h @@ -0,0 +1,64 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
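For context on the flow above: the new StaticTensorManager plans non-constant tensors via claimPlan()/releasePlan(), materializes a single arena in allocateNonconsts(), and then hands each tensor its base pointer through setBuffer(). An illustrative sketch of that two-phase idea, with hypothetical names and a deliberately naive bump planner (the real cpu_common::MemoryManager reuses regions released by releasePlan()):

    #include <cstdint>
    #include <unordered_map>
    #include <vector>

    class MiniStaticManager
    {
    public:
      // Planning phase: record where each tensor will live in the arena.
      void claimPlan(int ind, uint32_t size)
      {
        _offsets[ind] = _total; // toy bump planner: no reuse of released regions
        _total += size;
      }
      // Allocation phase: one arena backs every planned tensor.
      void allocateNonconsts() { _arena.resize(_total); }
      uint8_t *getBuffer(int ind) { return _arena.data() + _offsets.at(ind); }

    private:
      std::unordered_map<int, uint32_t> _offsets;
      std::vector<uint8_t> _arena;
      uint32_t _total = 0;
    };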
+ */ + +#ifndef __ONERT_BACKEND_CPU_STATICTENSOR_MANAGER_H__ +#define __ONERT_BACKEND_CPU_STATICTENSOR_MANAGER_H__ + +#include "backend/IStaticTensorManager.h" +#include "backend/cpu_common/DynamicTensorManager.h" +#include "backend/cpu_common/MemoryManager.h" +#include "backend/cpu_common/TensorRegistry.h" +#include "backend/ITensorManager.h" +#include "ir/OperandIndexMap.h" +#include "ir/OperandInfo.h" + +namespace onert +{ +namespace backend +{ +namespace cpu +{ + +class StaticTensorManager : public backend::IStaticTensorManager +{ +public: + StaticTensorManager(const std::shared_ptr<cpu_common::TensorRegistry> &reg, + cpu_common::DynamicTensorManager *dynamic_tensor_manager); + virtual ~StaticTensorManager() = default; + + void allocateNonconsts(void); + void deallocateNonconsts(void); + + void buildTensor(const ir::OperandIndex &ind, const ir::OperandInfo &tensor_info, + ir::Layout backend_layout, bool as_const); + + void claimPlan(const ir::OperandIndex &ind, uint32_t size); + void releasePlan(const ir::OperandIndex &ind); + + void iterate(const std::function<void(const ir::OperandIndex &)> &fn); + +private: + std::unique_ptr<cpu_common::MemoryManager> _nonconst_mgr; + const std::shared_ptr<cpu_common::TensorRegistry> _tensors; + ir::OperandIndexMap<bool> _as_constants; + cpu_common::DynamicTensorManager *_dynamic_tensor_manager; +}; + +} // namespace cpu +} // namespace backend +} // namespace onert + +#endif // __ONERT_BACKEND_CPU_STATICTENSOR_MANAGER_H__ diff --git a/runtime/onert/backend/cpu/Tensor.h b/runtime/onert/backend/cpu/Tensor.h index 4dd251bd3..20e60260c 100644 --- a/runtime/onert/backend/cpu/Tensor.h +++ b/runtime/onert/backend/cpu/Tensor.h @@ -29,15 +29,22 @@ namespace cpu using Tensor = cpu_common::Tensor; -// Tensor which has data from external. To support this, assume below things -// no padding, always NHWC layout, constant tensor and not dynamic +/** + * @brief Class that uses data from external memory that is not managed by a backend, + * instead of allocating and copying the data. ExternalTensor's data pointer points to + * an address of memory that is already allocated, such as an mmapped area. + * This means that ExternalTensor can take any type of ir::Data. + * To support this, the following is assumed: no padding, always NHWC layout, + * constant tensor, and not dynamic.
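To make the ExternalTensor idea above concrete: it is essentially a non-owning view over memory that already exists, such as a weight region inside an mmapped model file. A minimal sketch under that assumption (hypothetical type, not the actual class):

    #include <cstddef>
    #include <cstdint>

    // Non-owning view: no allocation and no copy; it only remembers where the
    // externally managed (e.g. mmapped) data lives.
    struct ExternalView
    {
      const uint8_t *base = nullptr;
      size_t size = 0;

      void set(const uint8_t *data, size_t len)
      {
        base = data;
        size = len;
      }
      const uint8_t *buffer() const { return base; }
    };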
+ */ class ExternalTensor : public Tensor { public: ExternalTensor() = delete; public: - ExternalTensor(const ir::OperandInfo &info, const ir::Layout layout) : Tensor(info, layout) + ExternalTensor(const ir::OperandInfo &info, const ir::Layout layout) + : Tensor(info, layout, nullptr) { assert(_layout == ir::Layout::NHWC); assert(_info.isConstant()); @@ -45,6 +52,11 @@ public: } public: + /** + * @brief set Data to be shared from external so that this ExternalTensor will not be + * allocated on CPU backend + * @param[in] data data of Operand to be set + */ void setData(const std::shared_ptr<ir::Data> data) { assert(data != nullptr); diff --git a/runtime/onert/backend/cpu/TensorBuilder.cc b/runtime/onert/backend/cpu/TensorBuilder.cc index 886e8d820..ab8ba5756 100644 --- a/runtime/onert/backend/cpu/TensorBuilder.cc +++ b/runtime/onert/backend/cpu/TensorBuilder.cc @@ -29,8 +29,8 @@ namespace cpu TensorBuilder::TensorBuilder() : _tensor_reg{new cpu_common::TensorRegistry()}, - _static_tensor_mgr{new cpu_common::StaticTensorManager(_tensor_reg)}, - _dynamic_tensor_mgr{new cpu_common::DynamicTensorManager(_tensor_reg)} + _dynamic_tensor_mgr{new cpu_common::DynamicTensorManager(_tensor_reg)}, + _static_tensor_mgr{new StaticTensorManager(_tensor_reg, _dynamic_tensor_mgr.get())} { /* empty */ } @@ -77,11 +77,7 @@ bool TensorBuilder::isRegistered(const ir::OperandIndex &ind) const return _tensor_info_map.find(ind) != _tensor_info_map.end(); } -void TensorBuilder::prepare(void) -{ - _static_tensor_mgr->allocateConsts(); - _static_tensor_mgr->allocateNonconsts(); -} +void TensorBuilder::prepare(void) { _static_tensor_mgr->allocateNonconsts(); } void TensorBuilder::allocate() { @@ -99,17 +95,17 @@ std::shared_ptr<IPortableTensor> TensorBuilder::portableAt(const ir::OperandInde return _tensor_reg->getPortableTensor(ind); } -bool TensorBuilder::setExternalTensor(const ir::OperandIndex &ind, - const std::shared_ptr<IPortableTensor> &tensor) +bool TensorBuilder::setMigrantTensor(const ir::OperandIndex &ind, + const std::shared_ptr<IPortableTensor> &tensor) { - return _tensor_reg->setExternalTensor(ind, tensor); + return _tensor_reg->setMigrantTensor(ind, tensor); } void TensorBuilder::iterate(const IterateFunction &fn) { _static_tensor_mgr->iterate(fn); } -std::shared_ptr<cpu_common::Tensor> TensorBuilder::at(const ir::OperandIndex &ind) +std::shared_ptr<Tensor> TensorBuilder::at(const ir::OperandIndex &ind) { - return _tensor_reg->getManagedTensor(ind); + return _tensor_reg->getNativeTensor(ind); } std::unique_ptr<ITensorManager> TensorBuilder::releaseStaticTensorManager(void) diff --git a/runtime/onert/backend/cpu/TensorBuilder.h b/runtime/onert/backend/cpu/TensorBuilder.h index ba25451ec..617136514 100644 --- a/runtime/onert/backend/cpu/TensorBuilder.h +++ b/runtime/onert/backend/cpu/TensorBuilder.h @@ -18,13 +18,14 @@ #define __ONERT_BACKEND_CPU_TENSOR_BUILDER_H__ #include <backend/cpu_common/DynamicTensorManager.h> -#include <backend/cpu_common/StaticTensorManager.h> #include <backend/cpu_common/TensorRegistry.h> -#include <backend/cpu_common/Tensor.h> #include <backend/ITensorBuilder.h> #include <ir/OperandIndexMap.h> +#include "StaticTensorManager.h" +#include "Tensor.h" + #include <unordered_map> namespace onert @@ -80,17 +81,17 @@ public: * If not, program will crash with assert or exception. 
* @return shared_ptr<Tensor> */ - std::shared_ptr<cpu_common::Tensor> at(const ir::OperandIndex &ind); + std::shared_ptr<Tensor> at(const ir::OperandIndex &ind); std::shared_ptr<IPortableTensor> portableAt(const ir::OperandIndex &ind); - bool setExternalTensor(const ir::OperandIndex &ind, - const std::shared_ptr<IPortableTensor> &tensor) override; + bool setMigrantTensor(const ir::OperandIndex &ind, + const std::shared_ptr<IPortableTensor> &tensor) override; std::shared_ptr<ITensorRegistry> tensorRegistry() override { return _tensor_reg; } private: const std::shared_ptr<cpu_common::TensorRegistry> _tensor_reg; - std::unique_ptr<cpu_common::StaticTensorManager> _static_tensor_mgr; std::unique_ptr<cpu_common::DynamicTensorManager> _dynamic_tensor_mgr; + std::unique_ptr<StaticTensorManager> _static_tensor_mgr; ir::OperandIndexMap<ir::OperandInfo> _tensor_info_map; }; diff --git a/runtime/onert/backend/cpu/ops/BatchToSpaceNDLayer.cc b/runtime/onert/backend/cpu/ops/BatchToSpaceNDLayer.cc new file mode 100644 index 000000000..f2f10eb9d --- /dev/null +++ b/runtime/onert/backend/cpu/ops/BatchToSpaceNDLayer.cc @@ -0,0 +1,83 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
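The setExternalTensor -> setMigrantTensor rename above mirrors the registry's split between tensors this backend owns ("native") and tensors handed over from another backend ("migrant"). A hypothetical minimal registry showing that split (illustrative only, not the cpu_common::TensorRegistry API):

    #include <memory>
    #include <unordered_map>

    template <typename TensorT> class MiniRegistry
    {
    public:
      void setNativeTensor(int ind, std::shared_ptr<TensorT> t) { _native[ind] = std::move(t); }

      // Refuse a migrant entry when a native tensor already owns the operand.
      bool setMigrantTensor(int ind, std::shared_ptr<TensorT> t)
      {
        if (_native.count(ind))
          return false;
        _migrant[ind] = std::move(t);
        return true;
      }

    private:
      std::unordered_map<int, std::shared_ptr<TensorT>> _native;  // owned here
      std::unordered_map<int, std::shared_ptr<TensorT>> _migrant; // borrowed
    };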
+ */ + +#include "BatchToSpaceNDLayer.h" + +#include <cker/operation/BatchToSpaceND.h> + +namespace onert +{ +namespace backend +{ +namespace cpu +{ +namespace ops +{ + +BatchToSpaceNDLayer::BatchToSpaceNDLayer() + : _input(nullptr), _output(nullptr), _block_shape(nullptr), _crops(nullptr) +{ + // DO NOTHING +} + +template <typename T> void BatchToSpaceNDLayer::batchToSpaceNDGeneric() +{ + const int32_t NNapiCrops[]{0, 0, 0, 0}; + const int32_t *_crops_buffer; + + if (_crops == nullptr) + { + _crops_buffer = NNapiCrops; + } + else + { + _crops_buffer = reinterpret_cast<const int32_t *>(_crops->buffer()); + } + nnfw::cker::BatchToSpaceND<T>( + getTensorShape(_input), reinterpret_cast<const T *>(_input->buffer()), + reinterpret_cast<const int32_t *>(_block_shape->buffer()), _crops_buffer, + getTensorShape(_output), reinterpret_cast<T *>(_output->buffer())); +} + +void BatchToSpaceNDLayer::configure(const IPortableTensor *input, IPortableTensor *output, + IPortableTensor *block_shape, IPortableTensor *crops) +{ + _output = output; + _input = input; + _block_shape = block_shape; + _crops = crops; +} + +void BatchToSpaceNDLayer::run() +{ + if (_output->data_type() == OperandType::FLOAT32) + { + batchToSpaceNDGeneric<float>(); + } + else if (_output->data_type() == OperandType::QUANT_UINT8_ASYMM) + { + batchToSpaceNDGeneric<uint8_t>(); + } + else + { + throw std::runtime_error{"NYI"}; + } +} + +} // namespace ops +} // namespace cpu +} // namespace backend +} // namespace onert diff --git a/runtime/onert/backend/cpu/ops/BatchToSpaceNDLayer.h b/runtime/onert/backend/cpu/ops/BatchToSpaceNDLayer.h new file mode 100644 index 000000000..6e25b241b --- /dev/null +++ b/runtime/onert/backend/cpu/ops/BatchToSpaceNDLayer.h @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef __ONERT_BACKEND_CPU_OPS_BATCHTOSPACEND_LAYER_H__ +#define __ONERT_BACKEND_CPU_OPS_BATCHTOSPACEND_LAYER_H__ + +#include <backend/IPortableTensor.h> +#include "OperationUtils.h" + +#include <exec/IFunction.h> + +namespace onert +{ +namespace backend +{ +namespace cpu +{ +namespace ops +{ + +class BatchToSpaceNDLayer : public ::onert::exec::IFunction +{ +public: + BatchToSpaceNDLayer(); + +public: + template <typename T> void batchToSpaceNDGeneric(); + + void configure(const IPortableTensor *input, IPortableTensor *output, + IPortableTensor *block_shape, IPortableTensor *crops); + + void run() override; + +private: + const IPortableTensor *_input; + IPortableTensor *_output; + IPortableTensor *_block_shape; + IPortableTensor *_crops; +}; + +} // namespace ops +} // namespace cpu +} // namespace backend +} // namespace onert + +#endif // __ONERT_BACKEND_CPU_OPS_BATCHTOSPACEND_LAYER_H__ diff --git a/runtime/onert/backend/cpu/ops/CompareLayer.cc b/runtime/onert/backend/cpu/ops/CompareLayer.cc index f557f3ade..adf902aaf 100644 --- a/runtime/onert/backend/cpu/ops/CompareLayer.cc +++ b/runtime/onert/backend/cpu/ops/CompareLayer.cc @@ -17,6 +17,7 @@ #include "OperationUtils.h" +#include <assert.h> #include <cker/operation/Comparison.h> using namespace nnfw::cker; namespace onert @@ -34,6 +35,14 @@ namespace using OpType = onert::ir::operation::Comparison::ComparisonType; using namespace onert::backend::cpu; +// Assumes these enum values to be in the order like this +static_assert(static_cast<int>(OpType::Equal) == 0, "An OpType value has changed!"); +static_assert(static_cast<int>(OpType::NotEqual) == 1, "An OpType value has changed!"); +static_assert(static_cast<int>(OpType::Greater) == 2, "An OpType value has changed!"); +static_assert(static_cast<int>(OpType::GreaterEqual) == 3, "An OpType value has changed!"); +static_assert(static_cast<int>(OpType::Less) == 4, "An OpType value has changed!"); +static_assert(static_cast<int>(OpType::LessEqual) == 5, "An OpType value has changed!"); + template <typename T> void compareQuant8(const IPortableTensor *lhs, const IPortableTensor *rhs, IPortableTensor *output, OpType op_type) @@ -52,95 +61,33 @@ void compareQuant8(const IPortableTensor *lhs, const IPortableTensor *rhs, IPort ¶ms.input2_shift); params.is_broadcast = !HaveSameShapes(lhs, rhs); - if (params.is_broadcast) - { - switch (op_type) - { - case OpType::Equal: - Broadcast4DSlowEqualWithScaling( - params, getExtendedTensorShape(lhs), reinterpret_cast<const T *>(lhs->buffer()), - getExtendedTensorShape(rhs), reinterpret_cast<const T *>(rhs->buffer()), - getExtendedTensorShape(output), reinterpret_cast<bool *>(output->buffer())); - break; - case OpType::NotEqual: - Broadcast4DSlowNotEqualWithScaling( - params, getExtendedTensorShape(lhs), reinterpret_cast<const T *>(lhs->buffer()), - getExtendedTensorShape(rhs), reinterpret_cast<const T *>(rhs->buffer()), - getExtendedTensorShape(output), reinterpret_cast<bool *>(output->buffer())); - break; - case OpType::Greater: - Broadcast4DSlowGreaterWithScaling( - params, getExtendedTensorShape(lhs), reinterpret_cast<const T *>(lhs->buffer()), - getExtendedTensorShape(rhs), reinterpret_cast<const T *>(rhs->buffer()), - getExtendedTensorShape(output), reinterpret_cast<bool *>(output->buffer())); - break; - case OpType::GreaterEqual: - Broadcast4DSlowGreaterEqualWithScaling( - params, getExtendedTensorShape(lhs), reinterpret_cast<const T *>(lhs->buffer()), - getExtendedTensorShape(rhs), reinterpret_cast<const T *>(rhs->buffer()), - 
getExtendedTensorShape(output), reinterpret_cast<bool *>(output->buffer())); - break; - case OpType::Less: - Broadcast4DSlowLessWithScaling( - params, getExtendedTensorShape(lhs), reinterpret_cast<const T *>(lhs->buffer()), - getExtendedTensorShape(rhs), reinterpret_cast<const T *>(rhs->buffer()), - getExtendedTensorShape(output), reinterpret_cast<bool *>(output->buffer())); - break; - case OpType::LessEqual: - Broadcast4DSlowLessEqualWithScaling( - params, getExtendedTensorShape(lhs), reinterpret_cast<const T *>(lhs->buffer()), - getExtendedTensorShape(rhs), reinterpret_cast<const T *>(rhs->buffer()), - getExtendedTensorShape(output), reinterpret_cast<bool *>(output->buffer())); - break; - default: - throw std::runtime_error{"Invalid OpType for CompareLayer"}; - } - } - else // if (requires_broadcast == false) - { - switch (op_type) - { - case OpType::Equal: - EqualWithScaling(params, getExtendedTensorShape(lhs), - reinterpret_cast<const T *>(lhs->buffer()), getExtendedTensorShape(rhs), - reinterpret_cast<const T *>(rhs->buffer()), getExtendedTensorShape(output), - reinterpret_cast<bool *>(output->buffer())); - break; - case OpType::NotEqual: - NotEqualWithScaling( - params, getExtendedTensorShape(lhs), reinterpret_cast<const T *>(lhs->buffer()), - getExtendedTensorShape(rhs), reinterpret_cast<const T *>(rhs->buffer()), - getExtendedTensorShape(output), reinterpret_cast<bool *>(output->buffer())); - break; - case OpType::Greater: - GreaterWithScaling( - params, getExtendedTensorShape(lhs), reinterpret_cast<const T *>(lhs->buffer()), - getExtendedTensorShape(rhs), reinterpret_cast<const T *>(rhs->buffer()), - getExtendedTensorShape(output), reinterpret_cast<bool *>(output->buffer())); - break; - case OpType::GreaterEqual: - GreaterEqualWithScaling( - params, getExtendedTensorShape(lhs), reinterpret_cast<const T *>(lhs->buffer()), - getExtendedTensorShape(rhs), reinterpret_cast<const T *>(rhs->buffer()), - getExtendedTensorShape(output), reinterpret_cast<bool *>(output->buffer())); - break; - case OpType::Less: - LessWithScaling(params, getExtendedTensorShape(lhs), - reinterpret_cast<const T *>(lhs->buffer()), getExtendedTensorShape(rhs), - reinterpret_cast<const T *>(rhs->buffer()), getExtendedTensorShape(output), - reinterpret_cast<bool *>(output->buffer())); - break; - case OpType::LessEqual: - LessEqualWithScaling( - params, getExtendedTensorShape(lhs), reinterpret_cast<const T *>(lhs->buffer()), - getExtendedTensorShape(rhs), reinterpret_cast<const T *>(rhs->buffer()), - getExtendedTensorShape(output), reinterpret_cast<bool *>(output->buffer())); - break; - default: - throw std::runtime_error{"Invalid OpType for CompareLayer"}; - } - } - return; + using CompareFunction = + void (*)(ComparisonParams & params, const Shape &input1_shape, const T *input1_data, + const Shape &input2_shape, const T *input2_data, const Shape &output_shape, + bool *output_data); + + static const CompareFunction broadcast_fns[] = { + Broadcast4DSlowEqualWithScaling, Broadcast4DSlowNotEqualWithScaling, + Broadcast4DSlowGreaterWithScaling, Broadcast4DSlowGreaterEqualWithScaling, + Broadcast4DSlowLessWithScaling, Broadcast4DSlowLessEqualWithScaling, + }; + static const CompareFunction non_broadcast_fns[] = { + EqualWithScaling, NotEqualWithScaling, GreaterWithScaling, + GreaterEqualWithScaling, LessWithScaling, LessEqualWithScaling, + }; + + static_assert(sizeof(broadcast_fns) == sizeof(non_broadcast_fns), + "Sizes of broadcast_fns and non_broadcast_fns must match!"); + + auto index = static_cast<int>(op_type); + 
if (index < 0 || index >= static_cast<int>(sizeof(broadcast_fns) / sizeof(broadcast_fns[0]))) + throw std::runtime_error{"Invalid OpType for CompareLayer"}; + + CompareFunction fn = (params.is_broadcast ? broadcast_fns[index] : non_broadcast_fns[index]); + + fn(params, getExtendedTensorShape(lhs), reinterpret_cast<const T *>(lhs->buffer()), + getExtendedTensorShape(rhs), reinterpret_cast<const T *>(rhs->buffer()), + getExtendedTensorShape(output), reinterpret_cast<bool *>(output->buffer())); } template <typename T> @@ -149,94 +96,33 @@ void compareScalar(const IPortableTensor *lhs, const IPortableTensor *rhs, IPort { bool requires_broadcast = !HaveSameShapes(lhs, rhs); - if (requires_broadcast) - { - switch (op_type) - { - case OpType::Equal: - Broadcast4DSlowEqual( - getExtendedTensorShape(lhs), reinterpret_cast<const T *>(lhs->buffer()), - getExtendedTensorShape(rhs), reinterpret_cast<const T *>(rhs->buffer()), - getExtendedTensorShape(output), reinterpret_cast<bool *>(output->buffer())); - break; - case OpType::NotEqual: - Broadcast4DSlowNotEqual( - getExtendedTensorShape(lhs), reinterpret_cast<const T *>(lhs->buffer()), - getExtendedTensorShape(rhs), reinterpret_cast<const T *>(rhs->buffer()), - getExtendedTensorShape(output), reinterpret_cast<bool *>(output->buffer())); - break; - case OpType::Greater: - Broadcast4DSlowGreater( - getExtendedTensorShape(lhs), reinterpret_cast<const T *>(lhs->buffer()), - getExtendedTensorShape(rhs), reinterpret_cast<const T *>(rhs->buffer()), - getExtendedTensorShape(output), reinterpret_cast<bool *>(output->buffer())); - break; - case OpType::GreaterEqual: - Broadcast4DSlowGreaterEqual( - getExtendedTensorShape(lhs), reinterpret_cast<const T *>(lhs->buffer()), - getExtendedTensorShape(rhs), reinterpret_cast<const T *>(rhs->buffer()), - getExtendedTensorShape(output), reinterpret_cast<bool *>(output->buffer())); - break; - case OpType::Less: - Broadcast4DSlowLess(getExtendedTensorShape(lhs), reinterpret_cast<const T *>(lhs->buffer()), - getExtendedTensorShape(rhs), reinterpret_cast<const T *>(rhs->buffer()), - getExtendedTensorShape(output), - reinterpret_cast<bool *>(output->buffer())); - break; - case OpType::LessEqual: - Broadcast4DSlowLessEqual( - getExtendedTensorShape(lhs), reinterpret_cast<const T *>(lhs->buffer()), - getExtendedTensorShape(rhs), reinterpret_cast<const T *>(rhs->buffer()), - getExtendedTensorShape(output), reinterpret_cast<bool *>(output->buffer())); - break; - default: - throw std::runtime_error{"Invalid OpType for CompareLayer"}; - } - } - else // if (requires_broadcast == false) - { - switch (op_type) - { - case OpType::Equal: - EqualNoScaling(getExtendedTensorShape(lhs), reinterpret_cast<const T *>(lhs->buffer()), - getExtendedTensorShape(rhs), reinterpret_cast<const T *>(rhs->buffer()), - getExtendedTensorShape(output), reinterpret_cast<bool *>(output->buffer())); - break; - case OpType::NotEqual: - NotEqualNoScaling(getExtendedTensorShape(lhs), reinterpret_cast<const T *>(lhs->buffer()), - getExtendedTensorShape(rhs), reinterpret_cast<const T *>(rhs->buffer()), - getExtendedTensorShape(output), - reinterpret_cast<bool *>(output->buffer())); - break; - case OpType::Greater: - GreaterNoScaling(getExtendedTensorShape(lhs), reinterpret_cast<const T *>(lhs->buffer()), - getExtendedTensorShape(rhs), reinterpret_cast<const T *>(rhs->buffer()), - getExtendedTensorShape(output), - reinterpret_cast<bool *>(output->buffer())); - break; - case OpType::GreaterEqual: - GreaterEqualNoScaling( - getExtendedTensorShape(lhs), 
reinterpret_cast<const T *>(lhs->buffer()), - getExtendedTensorShape(rhs), reinterpret_cast<const T *>(rhs->buffer()), - getExtendedTensorShape(output), reinterpret_cast<bool *>(output->buffer())); - break; - case OpType::Less: - LessNoScaling(getExtendedTensorShape(lhs), reinterpret_cast<const T *>(lhs->buffer()), - getExtendedTensorShape(rhs), reinterpret_cast<const T *>(rhs->buffer()), - getExtendedTensorShape(output), reinterpret_cast<bool *>(output->buffer())); - break; - case OpType::LessEqual: - LessEqualNoScaling(getExtendedTensorShape(lhs), reinterpret_cast<const T *>(lhs->buffer()), - getExtendedTensorShape(rhs), reinterpret_cast<const T *>(rhs->buffer()), - getExtendedTensorShape(output), - reinterpret_cast<bool *>(output->buffer())); - break; - default: - throw std::runtime_error{"Invalid OpType for CompareLayer"}; - } - } - return; + using CompareFunction = + void (*)(const Shape &input1_shape, const T *input1_data, const Shape &input2_shape, + const T *input2_data, const Shape &output_shape, bool *output_data); + + static const CompareFunction broadcast_fns[] = { + Broadcast4DSlowEqual, Broadcast4DSlowNotEqual, Broadcast4DSlowGreater, + Broadcast4DSlowGreaterEqual, Broadcast4DSlowLess, Broadcast4DSlowLessEqual, + }; + static const CompareFunction non_broadcast_fns[] = { + EqualNoScaling, NotEqualNoScaling, GreaterNoScaling, + GreaterEqualNoScaling, LessNoScaling, LessEqualNoScaling, + }; + + static_assert(sizeof(broadcast_fns) == sizeof(non_broadcast_fns), + "Sizes of broadcast_fns and non_broadcast_fns must match!"); + + auto index = static_cast<int>(op_type); + if (index < 0 || index >= static_cast<int>(sizeof(broadcast_fns) / sizeof(broadcast_fns[0]))) + throw std::runtime_error{"Invalid OpType for CompareLayer"}; + + CompareFunction fn = (requires_broadcast ? broadcast_fns[index] : non_broadcast_fns[index]); + + fn(getExtendedTensorShape(lhs), reinterpret_cast<const T *>(lhs->buffer()), + getExtendedTensorShape(rhs), reinterpret_cast<const T *>(rhs->buffer()), + getExtendedTensorShape(output), reinterpret_cast<bool *>(output->buffer())); } + } // namespace CompareLayer::CompareLayer() diff --git a/runtime/onert/backend/cpu/ops/FullyConnectedLayer.cc b/runtime/onert/backend/cpu/ops/FullyConnectedLayer.cc index c00be64e5..05da33abf 100644 --- a/runtime/onert/backend/cpu/ops/FullyConnectedLayer.cc +++ b/runtime/onert/backend/cpu/ops/FullyConnectedLayer.cc @@ -18,6 +18,8 @@ #include "../Tensor.h" #include <cker/operation/FullyConnected.h> +#include <cker/TensorUtils.h> +#include <misc/polymorphic_downcast.h> namespace onert { @@ -31,7 +33,7 @@ namespace ops FullyConnectedLayer::FullyConnectedLayer() : _input(nullptr), _weights(nullptr), _bias(nullptr), _output(nullptr), _activation(ir::Activation::NONE), _temp_arena(new nnfw::cker::FCTempArena()), - _is_hybrid(false) + _external_context(nullptr), _is_hybrid(false) { // DO NOTHING } @@ -102,7 +104,8 @@ void FullyConnectedLayer::fullyConnectedHybrid() op_params, getTensorShape(_input), reinterpret_cast<const float *>(_input->buffer()), getTensorShape(_weights), reinterpret_cast<const int8_t *>(_weights->buffer()), getTensorShape(_bias), reinterpret_cast<const float *>(_bias ? 
_bias->buffer() : nullptr), - getTensorShape(_output), reinterpret_cast<float *>(_output->buffer()), temp_arena); + getTensorShape(_output), reinterpret_cast<float *>(_output->buffer()), temp_arena, + _external_context->ruy_context()); #else nnfw::cker::FullyConnectedHybrid( op_params, getTensorShape(_input), reinterpret_cast<const float *>(_input->buffer()), @@ -110,31 +113,67 @@ void FullyConnectedLayer::fullyConnectedHybrid() (_cached_weights) ? reinterpret_cast<const int8_t *>(_cached_weights) : reinterpret_cast<const int8_t *>(_weights->buffer()), getTensorShape(_bias), reinterpret_cast<const float *>(_bias ? _bias->buffer() : nullptr), - getTensorShape(_output), reinterpret_cast<float *>(_output->buffer()), temp_arena); + getTensorShape(_output), reinterpret_cast<float *>(_output->buffer()), temp_arena, + _external_context->ruy_context()); -// TODO Enable calling decrease_ref -#if 0 if (_cached_weights == nullptr || _is_weights_freed) return; - auto weight_tensor = dynamic_cast<const Tensor *>(_weights); - if (weight_tensor) + // '_cached_weights is not nullptr and _is_weights_freed is false' means + // this weight shape satisfies the condition of the ruy kernel's prepack cache. + // After entering here, this code will not run again, except for the case below: + // the input is a zero vector. + + // If the input's elements are all zero, the ruy kernel path is bypassed, + // so handle that case here. + const int input_size = getTensorShape(_input).FlatSize(); + if (nnfw::cker::IsZeroVector(reinterpret_cast<float *>(_input->buffer()), input_size)) + return; + + auto weight_tensor = nnfw::misc::polymorphic_downcast<const Tensor *>(_weights); + + // This weight tensor could be another op's constant tensor. + // Therefore, its reference count should be checked as follows. + auto tensor = const_cast<Tensor *>(weight_tensor); + if (tensor->buffer() == nullptr) // ref is already 0? { - auto tensor = const_cast<Tensor *>(weight_tensor); + _is_weights_freed = true; + return; + } - tensor->decrease_ref(); - if (tensor->buffer() == nullptr) // ref == 0? - { - _is_weights_freed = true; - } + tensor->decrease_ref(); + if (tensor->buffer() == nullptr) // ref == 0? + { + _is_weights_freed = true; } -#endif // if 0 #endif } +void FullyConnectedLayer::fullyConnectedSparseWeight() +{ + float output_activation_min = 0, output_activation_max = 0; + CalculateActivationRange(_activation, &output_activation_min, &output_activation_max); + + nnfw::cker::FullyConnectedParams op_params; + op_params.float_activation_min = output_activation_min; + op_params.float_activation_max = output_activation_max; + op_params.activation = convertActivationType(_activation); + + int w0_size = getTensorShape(_weights).Dims(0); + const uint16_t *w1_segments = _weights->w1_segments(); + const uint16_t *w1_indices = _weights->w1_indices(); + + nnfw::cker::FullyConnectedSparseWeight( + op_params, getTensorShape(_input), reinterpret_cast<const float *>(_input->buffer()), + getTensorShape(_weights), reinterpret_cast<const float *>(_weights->buffer()), + getTensorShape(_bias), reinterpret_cast<const float *>(_bias ?
_bias->buffer() : nullptr), + getTensorShape(_output), reinterpret_cast<float *>(_output->buffer()), w0_size, w1_segments, + w1_indices); +} + void FullyConnectedLayer::configure(const IPortableTensor *input, const IPortableTensor *weights, const IPortableTensor *bias, ir::Activation activation, - IPortableTensor *output) + IPortableTensor *output, + const std::shared_ptr<ExternalContext> &external_context) { _input = input; _weights = weights; @@ -143,6 +182,7 @@ void FullyConnectedLayer::configure(const IPortableTensor *input, const IPortabl _output = output; _is_hybrid = input->data_type() == OperandType::FLOAT32 && weights->data_type() == OperandType::QUANT_INT8_SYMM; + _external_context = external_context; } void FullyConnectedLayer::run() @@ -151,6 +191,10 @@ void FullyConnectedLayer::run() { fullyConnectedHybrid(); } + else if (_weights->is_sparse()) + { + fullyConnectedSparseWeight(); + } else if (_input->data_type() == OperandType::FLOAT32) { fullyConnectedFloat32(); @@ -167,7 +211,16 @@ void FullyConnectedLayer::run() void FullyConnectedLayer::prepare() { -#ifdef USE_RUY_GEMV + if (_bias && _bias->is_constant()) + { + const int bias_size = getTensorShape(_bias).FlatSize(); + if (nnfw::cker::IsZeroVector(reinterpret_cast<float *>(_bias->buffer()), bias_size)) + { + _bias = nullptr; + } + } + +#if (defined(__ARM_NEON__) || defined(__ARM_NEON)) && defined(USE_RUY_GEMV) // TODO This is workaround // The only fc hybrid will use ruy kernel if (_input->data_type() != OperandType::FLOAT32 || diff --git a/runtime/onert/backend/cpu/ops/FullyConnectedLayer.h b/runtime/onert/backend/cpu/ops/FullyConnectedLayer.h index dd5ef2436..f1242677c 100644 --- a/runtime/onert/backend/cpu/ops/FullyConnectedLayer.h +++ b/runtime/onert/backend/cpu/ops/FullyConnectedLayer.h @@ -18,6 +18,7 @@ #define __ONERT_BACKEND_CPU_OPS_FULLYCONNECTEDLAYER_H__ #include <backend/IPortableTensor.h> +#include "../ExternalContext.h" #include "OperationUtils.h" #include <exec/IFunction.h> @@ -52,8 +53,11 @@ public: void fullyConnectedHybrid(); + void fullyConnectedSparseWeight(); + void configure(const IPortableTensor *input, const IPortableTensor *weights, - const IPortableTensor *bias, ir::Activation activation, IPortableTensor *output); + const IPortableTensor *bias, ir::Activation activation, IPortableTensor *output, + const std::shared_ptr<ExternalContext> &external_context); void run() override; @@ -68,10 +72,13 @@ private: ir::Activation _activation; std::unique_ptr<nnfw::cker::FCTempArena> _temp_arena; + std::shared_ptr<ExternalContext> _external_context; + bool _is_hybrid; #ifdef USE_RUY_GEMV uint8_t *_cached_weights = nullptr; // weights to be cached and a key + bool _is_weights_freed = false; // is weights freed? #endif }; diff --git a/runtime/onert/backend/cpu/ops/L2NormLayer.cc b/runtime/onert/backend/cpu/ops/L2NormLayer.cc new file mode 100644 index 000000000..0d99b0586 --- /dev/null +++ b/runtime/onert/backend/cpu/ops/L2NormLayer.cc @@ -0,0 +1,71 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
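The prepare() change above drops a constant bias that is entirely zero: adding a zero vector is the identity, so skipping it is safe and saves one pass over the output. nnfw::cker::IsZeroVector amounts to an all-zero scan; a rough equivalent (sketch only, not the cker signature):

    #include <algorithm>

    bool isZeroVector(const float *data, int size)
    {
      // True when every element is exactly 0.0f (the constant-bias case above).
      return std::all_of(data, data + size, [](float v) { return v == 0.0f; });
    }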
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "L2NormLayer.h" + +#include "OperationUtils.h" + +#include <cker/operation/L2Normalize.h> +#include <cker/Types.h> + +namespace onert +{ +namespace backend +{ +namespace cpu +{ +namespace ops +{ + +void L2NormLayer::configure(const IPortableTensor *input, IPortableTensor *output) +{ + assert(input != nullptr); + assert(output != nullptr); + + _input = input; + _output = output; +} + +void L2NormLayer::run() +{ + switch (_input->data_type()) + { + case OperandType::FLOAT32: + nnfw::cker::L2NormalizeFloat32( + getTensorShape(_input), reinterpret_cast<const float *>(_input->buffer()), + getTensorShape(_output), reinterpret_cast<float *>(_output->buffer())); + break; + + case OperandType::QUANT_UINT8_ASYMM: + { + nnfw::cker::L2NormParams params; + assert(_input->data_offset() == 128); + params.input_zero_point = _input->data_offset(); + nnfw::cker::L2NormalizeQuant8( + params, getTensorShape(_input), reinterpret_cast<const uint8_t *>(_input->buffer()), + getTensorShape(_output), reinterpret_cast<uint8_t *>(_output->buffer())); + } + break; + + default: + throw std::runtime_error{"L2Norm: Unsupported data type"}; + } +} + +} // namespace ops +} // namespace cpu +} // namespace backend +} // namespace onert diff --git a/runtime/onert/backend/cpu/ops/L2NormLayer.h b/runtime/onert/backend/cpu/ops/L2NormLayer.h new file mode 100644 index 000000000..63f2d1133 --- /dev/null +++ b/runtime/onert/backend/cpu/ops/L2NormLayer.h @@ -0,0 +1,55 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
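For reference, the math behind L2NormalizeFloat32 above: each row is scaled by the inverse of its Euclidean norm, y_i = x_i / sqrt(sum_j x_j^2). A plain scalar sketch (the production kernel also guards against a zero norm, which this toy version omits):

    #include <cmath>
    #include <vector>

    std::vector<float> l2normalize(const std::vector<float> &x)
    {
      float sum_sq = 0.0f;
      for (float v : x)
        sum_sq += v * v;
      const float inv_norm = 1.0f / std::sqrt(sum_sq); // assumes a non-zero input row
      std::vector<float> y(x.size());
      for (size_t i = 0; i < x.size(); ++i)
        y[i] = x[i] * inv_norm;
      return y;
    }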
+ */ + +#ifndef __ONERT_BACKEND_CPU_OPS_L2NORM_LAYER_H__ +#define __ONERT_BACKEND_CPU_OPS_L2NORM_LAYER_H__ + +#include <backend/IPortableTensor.h> + +#include <exec/IFunction.h> + +namespace onert +{ +namespace backend +{ +namespace cpu +{ +namespace ops +{ +class L2NormLayer : public ::onert::exec::IFunction +{ +public: + L2NormLayer() : _input(nullptr), _output(nullptr) + { + // Nothing + } + +public: + void configure(const IPortableTensor *input, IPortableTensor *output); + + void run() override; + +private: + const IPortableTensor *_input; + IPortableTensor *_output; +}; + +} // namespace ops +} // namespace cpu +} // namespace backend +} // namespace onert + +#endif // __ONERT_BACKEND_CPU_OPS_L2NORM_LAYER_H__ diff --git a/runtime/onert/backend/cpu/ops/LogSoftMaxLayer.cc b/runtime/onert/backend/cpu/ops/LogSoftMaxLayer.cc index d71e325ac..06dde4fc4 100644 --- a/runtime/onert/backend/cpu/ops/LogSoftMaxLayer.cc +++ b/runtime/onert/backend/cpu/ops/LogSoftMaxLayer.cc @@ -49,8 +49,8 @@ void LogSoftMaxLayer::logsoftmaxQuant8() // NYI } -void LogSoftMaxLayer::configure(const Tensor *input, const float beta, const int axis, - Tensor *output) +void LogSoftMaxLayer::configure(const IPortableTensor *input, const float beta, const int axis, + IPortableTensor *output) { _input = input; _output = output; diff --git a/runtime/onert/backend/cpu/ops/LogSoftMaxLayer.h b/runtime/onert/backend/cpu/ops/LogSoftMaxLayer.h index bc145cea7..ba9deca17 100644 --- a/runtime/onert/backend/cpu/ops/LogSoftMaxLayer.h +++ b/runtime/onert/backend/cpu/ops/LogSoftMaxLayer.h @@ -40,13 +40,14 @@ public: void logsoftmaxQuant8(); - void configure(const Tensor *input, const float beta, const int axis, Tensor *output); + void configure(const IPortableTensor *input, const float beta, const int axis, + IPortableTensor *output); void run(); private: - const Tensor *_input; - Tensor *_output; + const IPortableTensor *_input; + IPortableTensor *_output; float _beta; int _axis; diff --git a/runtime/onert/backend/cpu/ops/OperationUtils.h b/runtime/onert/backend/cpu/ops/OperationUtils.h index 8d29374ff..98385521a 100644 --- a/runtime/onert/backend/cpu/ops/OperationUtils.h +++ b/runtime/onert/backend/cpu/ops/OperationUtils.h @@ -52,6 +52,17 @@ union DataPtr { void *v; }; +union ConstDataPtr { + const uint8_t *u8; + const int8_t *i8; + const uint32_t *u32; + const int32_t *i32; + const bool *b; + const float *f; + const int64_t *i64; + const void *v; +}; + uint32_t getNumberOfDimensions(const IPortableTensor *tensor); uint32_t getNumberOfElements(const IPortableTensor *tensor); diff --git a/runtime/onert/backend/cpu/ops/PadLayer.cc b/runtime/onert/backend/cpu/ops/PadLayer.cc index fcfcf7b5e..6a2bf9da0 100644 --- a/runtime/onert/backend/cpu/ops/PadLayer.cc +++ b/runtime/onert/backend/cpu/ops/PadLayer.cc @@ -33,33 +33,40 @@ PadLayer::PadLayer() // DO NOTHING } -void PadLayer::padFloat32() +template <typename T> void PadLayer::padImpl(const T *constant_value_data) { - nnfw::cker::Pad(_padData, _padRank, getTensorShape(_input), - reinterpret_cast<const float *>(_input->buffer()), getTensorShape(_output), - reinterpret_cast<float *>(_output->buffer()), _constantValueData.f); + nnfw::cker::Pad<T>(_padData, _padRank, getTensorShape(_input), + reinterpret_cast<const T *>(_input->buffer()), getTensorShape(_output), + reinterpret_cast<T *>(_output->buffer()), constant_value_data); } -void PadLayer::padQuant8() { throw std::runtime_error("Quantized Pad isn't supported NYI"); } void PadLayer::configure(const IPortableTensor *input, IPortableTensor
*output, - const int32_t *padData, int32_t padRank, uint8_t *constantValueData) + const int32_t *padData, int32_t padRank, const void *constantValueData) { _input = input; _output = output; memcpy(_padData, padData, sizeof(_padData)); _padRank = padRank; - _constantValueData.u8 = constantValueData; + _constantValueData.v = constantValueData; } void PadLayer::run() { if (_input->data_type() == OperandType::FLOAT32) { - padFloat32(); + padImpl<float>(_constantValueData.f); } else if (_input->data_type() == OperandType::QUANT_UINT8_ASYMM) { - padQuant8(); + if (_constantValueData.u8 == nullptr) + { + uint8_t pad_value = static_cast<uint8_t>(_output->data_offset()); + padImpl<uint8_t>(&pad_value); + } + else + { + padImpl<uint8_t>(_constantValueData.u8); + } } else { diff --git a/runtime/onert/backend/cpu/ops/PadLayer.h b/runtime/onert/backend/cpu/ops/PadLayer.h index 85bd2e6f0..efd73d5e5 100644 --- a/runtime/onert/backend/cpu/ops/PadLayer.h +++ b/runtime/onert/backend/cpu/ops/PadLayer.h @@ -39,12 +39,10 @@ public: PadLayer(); public: - void padFloat32(); - - void padQuant8(); + template <typename T> void padImpl(const T *constant_value_data); void configure(const IPortableTensor *input, IPortableTensor *output, const int32_t *padData, - int32_t padRank, uint8_t *constantValueData = nullptr); + int32_t padRank, const void *constantValueData = nullptr); void run() override; @@ -54,7 +52,7 @@ private: int32_t _padData[8]; int32_t _padRank; - DataPtr _constantValueData; + ConstDataPtr _constantValueData; }; } // namespace ops diff --git a/runtime/onert/backend/cpu/ops/QuantizeLayer.cc b/runtime/onert/backend/cpu/ops/QuantizeLayer.cc new file mode 100644 index 000000000..45fc148bf --- /dev/null +++ b/runtime/onert/backend/cpu/ops/QuantizeLayer.cc @@ -0,0 +1,63 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
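The quantized-pad default above follows from the affine scheme real = scale * (q - zero_point): the real value 0 quantizes exactly to the zero point, so padding with _output->data_offset() pads with real zeros. A sketch of that mapping (hypothetical helper, not the cker API):

    #include <algorithm>
    #include <cmath>
    #include <cstdint>

    uint8_t quantize(float x, float scale, int zero_point)
    {
      const int q = static_cast<int>(std::round(x / scale)) + zero_point;
      return static_cast<uint8_t>(std::min(255, std::max(0, q)));
    }
    // quantize(0.0f, scale, zp) == zp for any positive scale and zp in [0, 255],
    // which is exactly the pad_value chosen in PadLayer::run() above.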
+ */ + +#include "QuantizeLayer.h" + +#include <cker/operation/Quantize.h> + +namespace onert +{ +namespace backend +{ +namespace cpu +{ +namespace ops +{ + +QuantizeLayer::QuantizeLayer() : _input(nullptr), _output(nullptr) +{ + // DO NOTHING +} + +template <typename InputT, typename OutputT> void QuantizeLayer::affineQuantize() +{ + nnfw::cker::Quantize(getTensorShape(_input), reinterpret_cast<const InputT *>(_input->buffer()), + getTensorShape(_output), reinterpret_cast<OutputT *>(_output->buffer()), + _output->data_scale(), _output->data_offset()); +} + +void QuantizeLayer::configure(const IPortableTensor *input, IPortableTensor *output) +{ + _input = input; + _output = output; +} + +void QuantizeLayer::run() +{ + if (_input->data_type() == OperandType::FLOAT32) + { + affineQuantize<float, uint8_t>(); + } + else + { + throw std::runtime_error{"Quantize: unsupported data type"}; + } +} + +} // namespace ops +} // namespace cpu +} // namespace backend +} // namespace onert diff --git a/runtime/onert/backend/cpu/ops/QuantizeLayer.h b/runtime/onert/backend/cpu/ops/QuantizeLayer.h new file mode 100644 index 000000000..b4e7aca40 --- /dev/null +++ b/runtime/onert/backend/cpu/ops/QuantizeLayer.h @@ -0,0 +1,56 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __ONERT_BACKEND_CPU_OPS_QUANTIZELAYER_H__ +#define __ONERT_BACKEND_CPU_OPS_QUANTIZELAYER_H__ + +#include <backend/IPortableTensor.h> +#include "OperationUtils.h" + +#include <exec/IFunction.h> + +namespace onert +{ +namespace backend +{ +namespace cpu +{ +namespace ops +{ + +class QuantizeLayer : public ::onert::exec::IFunction +{ +public: + QuantizeLayer(); + +public: + template <typename InputT, typename OutputT> void affineQuantize(); + + void configure(const IPortableTensor *input, IPortableTensor *output); + + void run() override; + +private: + const IPortableTensor *_input; + IPortableTensor *_output; +}; + +} // namespace ops +} // namespace cpu +} // namespace backend +} // namespace onert + +#endif // __ONERT_BACKEND_CPU_OPS_QUANTIZELAYER_H__ diff --git a/runtime/onert/backend/cpu/ops/ReLU6Layer.cc b/runtime/onert/backend/cpu/ops/ReLU6Layer.cc new file mode 100644 index 000000000..26eb35e0d --- /dev/null +++ b/runtime/onert/backend/cpu/ops/ReLU6Layer.cc @@ -0,0 +1,74 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
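Complementing affineQuantize() above, the inverse mapping shows why the scheme is lossy but bounded: dequantizing a rounded value recovers the original to within half a scale step, before saturation. A sketch under the same assumptions as the previous example:

    #include <cstdint>

    float dequantize(uint8_t q, float scale, int zero_point)
    {
      return scale * (static_cast<int>(q) - zero_point);
    }
    // For x inside the representable range, |dequantize(quantize(x)) - x| <= scale / 2.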
+ */ + +#include "ReLU6Layer.h" + +#include "OperationUtils.h" + +#include <cker/operation/ReLU6.h> + +namespace onert +{ +namespace backend +{ +namespace cpu +{ +namespace ops +{ + +ReLU6Layer::ReLU6Layer() : _input(nullptr), _output(nullptr) +{ + // DO NOTHING +} + +void ReLU6Layer::relu6Float32() +{ + nnfw::cker::ReLU6(getTensorShape(_input), reinterpret_cast<const float *>(_input->buffer()), + reinterpret_cast<float *>(_output->buffer())); +} + +void ReLU6Layer::relu6Quant8() +{ + // cker quant8 relu is not implemented yet + throw std::runtime_error{"NYI"}; +} + +void ReLU6Layer::configure(const IPortableTensor *input, IPortableTensor *output) +{ + _input = input; + _output = output; +} + +void ReLU6Layer::run() +{ + if (_input->data_type() == OperandType::FLOAT32) + { + relu6Float32(); + } + else if (_input->data_type() == OperandType::QUANT_UINT8_ASYMM) + { + relu6Quant8(); + } + else + { + throw std::runtime_error{"ReLU6: unsupported data type"}; + } +} + +} // namespace ops +} // namespace cpu +} // namespace backend +} // namespace onert diff --git a/runtime/onert/backend/cpu/ops/ReLU6Layer.h b/runtime/onert/backend/cpu/ops/ReLU6Layer.h new file mode 100644 index 000000000..994d17a30 --- /dev/null +++ b/runtime/onert/backend/cpu/ops/ReLU6Layer.h @@ -0,0 +1,57 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef __ONERT_BACKEND_CPU_OPS_RELU6LAYER_H__ +#define __ONERT_BACKEND_CPU_OPS_RELU6LAYER_H__ + +#include <backend/IPortableTensor.h> + +#include <exec/IFunction.h> + +namespace onert +{ +namespace backend +{ +namespace cpu +{ +namespace ops +{ + +class ReLU6Layer : public ::onert::exec::IFunction +{ +public: + ReLU6Layer(); + +public: + void relu6Float32(); + + void relu6Quant8(); + + void configure(const IPortableTensor *input, IPortableTensor *output); + + void run() override; + +private: + const IPortableTensor *_input; + IPortableTensor *_output; +}; + +} // namespace ops +} // namespace cpu +} // namespace backend +} // namespace onert + +#endif // __ONERT_BACKEND_CPU_OPS_RELU6LAYER_H__ diff --git a/runtime/onert/backend/cpu/ops/ReduceLayer.cc b/runtime/onert/backend/cpu/ops/ReduceLayer.cc index 1dad031aa..fe22dbed7 100644 --- a/runtime/onert/backend/cpu/ops/ReduceLayer.cc +++ b/runtime/onert/backend/cpu/ops/ReduceLayer.cc @@ -116,6 +116,39 @@ void evalGeneric(const IPortableTensor *input, IPortableTensor *output, throw std::runtime_error{"Reduce(generic): unsupported data type"}; } } + +void evalSumQuantized(const IPortableTensor *input, IPortableTensor *output, + const std::vector<int> &axes, bool keep_dims, + nnfw::cker::Reduce &reduce_kernel) +{ + const bool same_scale = (input->data_scale() == output->data_scale() && + input->data_offset() == output->data_offset()); + + reduce_kernel.prepare(input->num_dimensions(), axes.size()); + + if (!same_scale) + { + std::vector<int32_t> temp_sum(output->getShape().num_elements()); + bool result = reduce_kernel.QuantizedMeanOrSum<uint8_t, int32_t>( + reinterpret_cast<const uint8_t *>(input->buffer()), input->data_offset(), + input->data_scale(), getTensorShape(input), reinterpret_cast<uint8_t *>(output->buffer()), + output->data_offset(), output->data_scale(), getTensorShape(output), axes, keep_dims, + temp_sum.data(), true, [](const int32_t current, const uint8_t in) -> int32_t { + const int32_t actual_in = static_cast<int32_t>(in); + return current + actual_in; + }); + + if (!result) + { + throw std::runtime_error{"Reduce: Fail to run"}; + } + + return; + } + + evalGeneric<ReduceType::kSum>(input, output, axes, keep_dims, reduce_kernel); +} + } // namespace ReduceLayer::ReduceLayer() @@ -143,6 +176,11 @@ void ReduceLayer::run() switch (_reduceType) { case ReduceType::kSum: + if (_input->data_type() == OperandType::QUANT_UINT8_ASYMM) + { + evalSumQuantized(_input, _output, axes, _keep_dims, *_reduce_kernel); + return; + } evalGeneric<ReduceType::kSum>(_input, _output, axes, _keep_dims, *_reduce_kernel); break; case ReduceType::kProd: diff --git a/runtime/onert/backend/cpu/ops/ResizeBilinearLayer.cc b/runtime/onert/backend/cpu/ops/ResizeBilinearLayer.cc new file mode 100644 index 000000000..180094bb8 --- /dev/null +++ b/runtime/onert/backend/cpu/ops/ResizeBilinearLayer.cc @@ -0,0 +1,87 @@ +/* + * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
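On evalSumQuantized() above: summing many uint8 values cannot be done in uint8 without overflow, so QuantizedMeanOrSum accumulates into the int32 temp_sum buffer and only then rescales to the output's (scale, offset). A toy scalar version of that rescaling sum (illustrative, not the cker signature):

    #include <algorithm>
    #include <cmath>
    #include <cstdint>
    #include <vector>

    uint8_t quantizedSum(const std::vector<uint8_t> &in, float in_scale, int in_zp,
                         float out_scale, int out_zp)
    {
      int32_t acc = 0; // int32 accumulator, as in temp_sum above
      for (uint8_t v : in)
        acc += v;
      // Real-valued sum: in_scale * (acc - N * in_zp), then requantize for the output.
      const float real = in_scale * (acc - static_cast<int32_t>(in.size()) * in_zp);
      const int q = static_cast<int>(std::round(real / out_scale)) + out_zp;
      return static_cast<uint8_t>(std::min(255, std::max(0, q)));
    }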
+ */ +#include "OperationUtils.h" +#include "ResizeBilinearLayer.h" +#include "cker/operation/ResizeBilinear.h" +#include <cker/Types.h> + +namespace onert +{ +namespace backend +{ +namespace cpu +{ +namespace ops +{ + +ResizeBilinearLayer::ResizeBilinearLayer() + : _input(nullptr), _output(nullptr), _output_height(0), _output_width(0), _align_corners(false), + _half_pixel_centers(false) +{ + // DO NOTHING +} + +void ResizeBilinearLayer::configure(const IPortableTensor *input, IPortableTensor *output, + int32_t output_height, int32_t output_width, bool align_corners, + bool half_pixel_centers) +{ + _input = input; + _output = output; + _output_height = output_height; + _output_width = output_width; + _align_corners = align_corners; + _half_pixel_centers = half_pixel_centers; +} + +void ResizeBilinearLayer::run() +{ + nnfw::cker::ResizeBilinearParams params; + params.align_corners = _align_corners; + params.half_pixel_centers = _half_pixel_centers; + params.output_height = _output_height; + params.output_width = _output_width; + + switch (_input->data_type()) + { + case OperandType::FLOAT32: + nnfw::cker::ResizeBilinear( + params, getTensorShape(_input), reinterpret_cast<const float *>(_input->buffer()), + getTensorShape(_output), reinterpret_cast<float *>(_output->buffer())); + break; + + case OperandType::QUANT_UINT8_ASYMM: + nnfw::cker::ResizeBilinear( + params, getTensorShape(_input), reinterpret_cast<const uint8_t *>(_input->buffer()), + getTensorShape(_output), reinterpret_cast<uint8_t *>(_output->buffer())); + break; + + case OperandType::UINT8: + case OperandType::BOOL8: + case OperandType::FLOAT16: + case OperandType::INT32: + case OperandType::INT64: + case OperandType::QUANT_INT8_SYMM: + std::runtime_error("ResizeBilinear NYI"); + break; + default: + std::runtime_error("ResizeBilinear unsupported data type"); + } +} + +} // namespace ops +} // namespace cpu +} // namespace backend +} // namespace onert diff --git a/runtime/onert/backend/cpu/ops/ResizeBilinearLayer.h b/runtime/onert/backend/cpu/ops/ResizeBilinearLayer.h new file mode 100644 index 000000000..fc49b348e --- /dev/null +++ b/runtime/onert/backend/cpu/ops/ResizeBilinearLayer.h @@ -0,0 +1,58 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in riting, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef __ONERT_BACKEND_CPU_OPS_RESIZEBILINEAR_H__ +#define __ONERT_BACKEND_CPU_OPS_RESIZEBILINEAR_H__ + +#include <backend/IPortableTensor.h> + +#include <exec/IFunction.h> + +namespace onert +{ +namespace backend +{ +namespace cpu +{ +namespace ops +{ + +class ResizeBilinearLayer : public ::onert::exec::IFunction +{ +public: + ResizeBilinearLayer(); + +public: + void configure(const IPortableTensor *input, IPortableTensor *output, int32_t output_height, + int32_t output_width, bool align_corners, bool half_pixel_centers); + + void run() override; + +private: + const IPortableTensor *_input; + IPortableTensor *_output; + int32_t _output_height; + int32_t _output_width; + bool _align_corners; + bool _half_pixel_centers; +}; + +} // namespace ops +} // namespace cpu +} // namespace backend +} // namespace onert + +#endif // __ONERT_BACKEND_CPU_OPS_RESIZEBILINEAR_H__ diff --git a/runtime/onert/backend/cpu/ops/SliceLayer.cc b/runtime/onert/backend/cpu/ops/SliceLayer.cc index a9106c1a2..449c073e6 100644 --- a/runtime/onert/backend/cpu/ops/SliceLayer.cc +++ b/runtime/onert/backend/cpu/ops/SliceLayer.cc @@ -46,7 +46,7 @@ void SliceLayer::GetBeginAndSizeVectors(int dimensions, const IPortableTensor *b } } -void SliceLayer::sliceFloat32() +template <typename T> void SliceLayer::sliceImpl() { const int kMaxDim = nnfw::cker::Shape::kMaxSmallSize; @@ -74,14 +74,8 @@ void SliceLayer::sliceFloat32() } nnfw::cker::Slice(op_params, getExtendedTensorShape(_input), - reinterpret_cast<const float *>(_input->buffer()), - reinterpret_cast<float *>(_output->buffer())); -} - -void SliceLayer::sliceQuant8() -{ - // cker quant8 slice is not implemented yet - throw std::runtime_error{"NYI"}; + reinterpret_cast<const T *>(_input->buffer()), + reinterpret_cast<T *>(_output->buffer())); } void SliceLayer::configure(const IPortableTensor *input, const IPortableTensor *begin, @@ -97,11 +91,11 @@ void SliceLayer::run() { if (_input->data_type() == OperandType::FLOAT32) { - sliceFloat32(); + sliceImpl<float>(); } else if (_input->data_type() == OperandType::QUANT_UINT8_ASYMM) { - sliceQuant8(); + sliceImpl<uint8_t>(); } else { diff --git a/runtime/onert/backend/cpu/ops/SliceLayer.h b/runtime/onert/backend/cpu/ops/SliceLayer.h index 9945d7ee6..650e2c97a 100644 --- a/runtime/onert/backend/cpu/ops/SliceLayer.h +++ b/runtime/onert/backend/cpu/ops/SliceLayer.h @@ -42,8 +42,7 @@ public: void run() override; private: - void sliceFloat32(); - void sliceQuant8(); + template <typename T> void sliceImpl(); template <typename T> void GetBeginAndSizeVectors(int dimensions, const IPortableTensor *begin, diff --git a/runtime/onert/backend/cpu/ops/SpaceToDepthLayer.cc b/runtime/onert/backend/cpu/ops/SpaceToDepthLayer.cc new file mode 100644 index 000000000..a0869aed8 --- /dev/null +++ b/runtime/onert/backend/cpu/ops/SpaceToDepthLayer.cc @@ -0,0 +1,74 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "SpaceToDepthLayer.h" + +#include "OperationUtils.h" + +#include <cker/operation/SpaceToDepth.h> + +namespace onert +{ +namespace backend +{ +namespace cpu +{ +namespace ops +{ +SpaceToDepthLayer::SpaceToDepthLayer() : _input(nullptr), _block_size(0), _output(nullptr) +{ + // DO NOTHING +} + +template <typename T> void SpaceToDepthLayer::spaceToDepth() +{ + + nnfw::cker::SpaceToDepthParams params; + params.block_size = _block_size; + + nnfw::cker::SpaceToDepth(params, getTensorShape(_input), + reinterpret_cast<const T *>(_input->buffer()), getTensorShape(_output), + reinterpret_cast<T *>(_output->buffer())); +} + +void SpaceToDepthLayer::configure(const IPortableTensor *input, const int32_t block_size, + IPortableTensor *output) +{ + _input = input; + _block_size = block_size; + _output = output; +} + +void SpaceToDepthLayer::run() +{ + if (_input->data_type() == OperandType::FLOAT32) + { + spaceToDepth<float>(); + } + else if (_input->data_type() == OperandType::QUANT_UINT8_ASYMM) + { + spaceToDepth<uint8_t>(); + } + else + { + throw std::runtime_error{"SpaceToDepth: unsupported data type"}; + } +} + +} // namespace ops +} // namespace cpu +} // namespace backend +} // namespace onert diff --git a/runtime/onert/backend/cpu/ops/SpaceToDepthLayer.h b/runtime/onert/backend/cpu/ops/SpaceToDepthLayer.h new file mode 100644 index 000000000..c11ef2b0a --- /dev/null +++ b/runtime/onert/backend/cpu/ops/SpaceToDepthLayer.h @@ -0,0 +1,54 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in riting, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __ONERT_BACKEND_CPU_OPS_SPACE_TO_DEPTH_LAYER_H__ +#define __ONERT_BACKEND_CPU_OPS_SPACE_TO_DEPTH_LAYER_H__ + +#include <backend/IPortableTensor.h> + +#include <exec/IFunction.h> + +namespace onert +{ +namespace backend +{ +namespace cpu +{ +namespace ops +{ +class SpaceToDepthLayer : public ::onert::exec::IFunction +{ +public: + SpaceToDepthLayer(); + + void configure(const IPortableTensor *input, const int32_t block_size, IPortableTensor *output); + + void run() override; + +private: + template <typename T> void spaceToDepth(); + + const IPortableTensor *_input; + int32_t _block_size; + IPortableTensor *_output; +}; + +} // namespace ops +} // namespace cpu +} // namespace backend +} // namespace onert + +#endif // __ONERT_BACKEND_CPU_OPS_SPACE_TO_BATCH_ND_LAYER_H__ diff --git a/runtime/onert/backend/cpu/ops/SplitVLayer.cc b/runtime/onert/backend/cpu/ops/SplitVLayer.cc new file mode 100644 index 000000000..d6ca12442 --- /dev/null +++ b/runtime/onert/backend/cpu/ops/SplitVLayer.cc @@ -0,0 +1,99 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "SplitVLayer.h" + +#include "OperationUtils.h" + +#include <cker/operation/SplitV.h> + +namespace onert +{ +namespace backend +{ +namespace cpu +{ +namespace ops +{ + +SplitVLayer::SplitVLayer() + : _input(nullptr), _size_splits(nullptr), _split_dim(nullptr), _num_splits(0), _outputs() +{ + // DO NOTHING +} + +template <typename T> void SplitVLayer::splitV(void) +{ + nnfw::cker::SplitVParams op_params; + op_params.axis = *(reinterpret_cast<const int32_t *>(_split_dim->buffer())); + op_params.num_split = _num_splits; + + std::vector<T *> outputPtrs; + std::vector<nnfw::cker::Shape> outshape; + + for (const auto output : _outputs) + { + assert(output->total_size() == sizeOfData(output->data_type(), output->getShape().dims())); + outputPtrs.emplace_back(reinterpret_cast<T *>(output->buffer())); + outshape.emplace_back(getTensorShape(output)); + } + + assert(_input->total_size() == sizeOfData(_input->data_type(), _input->getShape().dims())); + nnfw::cker::SplitV<T>(op_params, getTensorShape(_input), reinterpret_cast<T *>(_input->buffer()), + outshape, outputPtrs.data()); +} + +void SplitVLayer::configure(const IPortableTensor *input, const IPortableTensor *size_splits, + const IPortableTensor *split_dim, uint16_t num_splits, + std::vector<IPortableTensor *> &outputs) +{ + assert(input != nullptr); + + _num_splits = num_splits; + _size_splits = size_splits; + _input = input; + _split_dim = split_dim; + _outputs = outputs; +} + +void SplitVLayer::run() +{ + if (_input->data_type() == OperandType::FLOAT32) + { + splitV<float>(); + } + else if (_input->data_type() == OperandType::QUANT_UINT8_ASYMM) + { + splitV<uint8_t>(); + } + else if (_input->data_type() == OperandType::INT32) + { + splitV<int32_t>(); + } + else if (_input->data_type() == OperandType::INT64) + { + splitV<int64_t>(); + } + else + { + throw std::runtime_error{"SplitV: unsupported input type"}; + } +} + +} // namespace ops +} // namespace cpu +} // namespace backend +} // namespace onert diff --git a/runtime/onert/backend/cpu/ops/SplitVLayer.h b/runtime/onert/backend/cpu/ops/SplitVLayer.h new file mode 100644 index 000000000..98f2f4406 --- /dev/null +++ b/runtime/onert/backend/cpu/ops/SplitVLayer.h @@ -0,0 +1,60 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef __ONERT_BACKEND_CPU_OPS_SPLIT_V_LAYER_H__ +#define __ONERT_BACKEND_CPU_OPS_SPLIT_V_LAYER_H__ + +#include <backend/IPortableTensor.h> + +#include <exec/IFunction.h> + +namespace onert +{ +namespace backend +{ +namespace cpu +{ +namespace ops +{ + +class SplitVLayer : public ::onert::exec::IFunction +{ +public: + SplitVLayer(); + +public: + template <typename T> void splitV(void); + + void configure(const IPortableTensor *input, const IPortableTensor *size_splits, + const IPortableTensor *split_dim, uint16_t num_splits, + std::vector<IPortableTensor *> &outputs); + + void run() override; + +private: + const IPortableTensor *_input; + const IPortableTensor *_size_splits; + const IPortableTensor *_split_dim; + uint16_t _num_splits; + std::vector<IPortableTensor *> _outputs; +}; + +} // namespace ops +} // namespace cpu +} // namespace backend +} // namespace onert + +#endif // __ONERT_BACKEND_CPU_OPS_SPLIT_V_LAYER_H__ diff --git a/runtime/onert/backend/cpu/ops/StatelessRandomUniformLayer.cc b/runtime/onert/backend/cpu/ops/StatelessRandomUniformLayer.cc new file mode 100644 index 000000000..b8dfcb4b5 --- /dev/null +++ b/runtime/onert/backend/cpu/ops/StatelessRandomUniformLayer.cc @@ -0,0 +1,68 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "StatelessRandomUniformLayer.h" + +#include <cker/operation/StatelessRandomUniform.h> + +namespace onert +{ +namespace backend +{ +namespace cpu +{ +namespace ops +{ + +StatelessRandomUniformLayer::StatelessRandomUniformLayer() + : _shape(nullptr), _seed(nullptr), _output(nullptr) +{ + // DO NOTHING +} + +void StatelessRandomUniformLayer::configure(const IPortableTensor *shape, + const IPortableTensor *seed, IPortableTensor *output) +{ + _shape = shape; + _seed = seed; + _output = output; +} + +void StatelessRandomUniformLayer::StatelessRandomUniformFloat32() +{ + nnfw::cker::StatelessRandomUniform( + getTensorShape(_shape), reinterpret_cast<const int *>(_shape->buffer()), + getTensorShape(_seed), reinterpret_cast<const int *>(_seed->buffer()), + getTensorShape(_output), reinterpret_cast<float *>(_output->buffer())); +} + +void StatelessRandomUniformLayer::run() +{ + switch (_output->data_type()) + { + // TODO: Support INT8 and UINT8 once quantization is applied. + case OperandType::FLOAT32: + StatelessRandomUniformFloat32(); + break; + default: + throw std::runtime_error{"StatelessRandomUniformLayer: unsupported data type"}; + } +} + +} // namespace ops +} // namespace cpu +} // namespace backend +} // namespace onert diff --git a/runtime/onert/backend/cpu/ops/StatelessRandomUniformLayer.h b/runtime/onert/backend/cpu/ops/StatelessRandomUniformLayer.h new file mode 100644 index 000000000..ef11d623d --- /dev/null +++ b/runtime/onert/backend/cpu/ops/StatelessRandomUniformLayer.h @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. 
All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __ONERT_BACKEND_CPU_OPS_STATELESS_RANDOM_UNIFORM_H__ +#define __ONERT_BACKEND_CPU_OPS_STATELESS_RANDOM_UNIFORM_H__ + +#include <backend/IPortableTensor.h> +#include "OperationUtils.h" + +#include <exec/IFunction.h> + +namespace onert +{ +namespace backend +{ +namespace cpu +{ +namespace ops +{ + +class StatelessRandomUniformLayer : public ::onert::exec::IFunction +{ +public: + StatelessRandomUniformLayer(); + +public: + void configure(const IPortableTensor *shape, const IPortableTensor *seed, + IPortableTensor *output); + + void StatelessRandomUniformFloat32(); + + void run() override; + +private: + const IPortableTensor *_shape; + const IPortableTensor *_seed; + + IPortableTensor *_output; +}; + +} // namespace ops +} // namespace cpu +} // namespace backend +} // namespace onert + +#endif // __ONERT_BACKEND_CPU_OPS_STATELESS_RANDOM_UNIFORM_H__
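For reference, a minimal wiring sketch for one of the new layers, assuming only the SplitVLayer API declared in the diff above. The helper name makeSplitV and the free-function form are hypothetical (not part of the diff); in the backend proper the equivalent wiring is done by the cpu KernelGenerator, which configures the layer and schedules it into a FunctionSequence.

#include "SplitVLayer.h" // as added under runtime/onert/backend/cpu/ops

#include <cstdint>
#include <memory>
#include <vector>

// Hypothetical helper: configure() only records the operand pointers; the
// split itself happens in run(), which dispatches on the input data type
// (FLOAT32, QUANT_UINT8_ASYMM, INT32, INT64) and throws for anything else.
std::unique_ptr<onert::exec::IFunction>
makeSplitV(const onert::backend::IPortableTensor *input,
           const onert::backend::IPortableTensor *size_splits,
           const onert::backend::IPortableTensor *split_dim, uint16_t num_splits,
           std::vector<onert::backend::IPortableTensor *> &outputs)
{
  auto fn = std::make_unique<onert::backend::cpu::ops::SplitVLayer>();
  fn->configure(input, size_splits, split_dim, num_splits, outputs);
  return fn; // executed later via IFunction::run()
}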