diff options
Diffstat (limited to 'compiler/luci/pass/src')
-rw-r--r-- | compiler/luci/pass/src/CircleOptimizer.cpp | 121
-rw-r--r-- | compiler/luci/pass/src/CircleOptimizerUtils.cpp | 89
-rw-r--r-- | compiler/luci/pass/src/CircleOptimizerUtils.h | 42
-rw-r--r-- | compiler/luci/pass/src/FuseBCQPass.cpp | 405
-rw-r--r-- | compiler/luci/pass/src/FuseInstanceNormPass.cpp | 231
-rw-r--r-- | compiler/luci/pass/src/FuseInstanceNormPass.test.cpp | 64
-rw-r--r-- | compiler/luci/pass/src/FuseInstanceNormPassInternal.h | 28
-rw-r--r-- | compiler/luci/pass/src/QuantizationUtils.cpp | 172
-rw-r--r-- | compiler/luci/pass/src/QuantizationUtils.h | 38
-rw-r--r-- | compiler/luci/pass/src/QuantizeDequantizeWeightsPass.cpp | 495
-rw-r--r-- | compiler/luci/pass/src/QuantizeWithMinMaxPass.cpp | 551
-rw-r--r-- | compiler/luci/pass/src/ResolveCustomOpAddPass.cpp | 124
-rw-r--r-- | compiler/luci/pass/src/ResolveCustomOpBatchMatMulPass.cpp | 69
-rw-r--r-- | compiler/luci/pass/src/ResolveCustomOpMatMulPass.cpp | 185 |
14 files changed, 2588 insertions, 26 deletions
diff --git a/compiler/luci/pass/src/CircleOptimizer.cpp b/compiler/luci/pass/src/CircleOptimizer.cpp index dcb05a0b5..90fbe9009 100644 --- a/compiler/luci/pass/src/CircleOptimizer.cpp +++ b/compiler/luci/pass/src/CircleOptimizer.cpp @@ -16,16 +16,23 @@ #include "luci/CircleOptimizer.h" +#include "luci/Pass/FuseBCQPass.h" #include "luci/Pass/FuseInstanceNormPass.h" +#include "luci/Pass/ResolveCustomOpAddPass.h" +#include "luci/Pass/ResolveCustomOpBatchMatMulPass.h" +#include "luci/Pass/ResolveCustomOpMatMulPass.h" +#include "luci/Pass/QuantizeWithMinMaxPass.h" +#include "luci/Pass/QuantizeDequantizeWeightsPass.h" // TODO add more passes #include "luci/Pass/ShapeInferencePass.h" #include "luci/Pass/TypeInferencePass.h" // logo passes -#include <logo/RemoveDeadNodePass.h> +#include <logo/RemoveDeadNodeWithQueryPass.h> #include "ProgressReporter.h" +#include "CircleOptimizerUtils.h" #include <logo/Phase.h> @@ -36,18 +43,39 @@ namespace using namespace luci; -class OptimizeOptionsImpl : public luci::CircleOptimizer::Options +class OptimizeOptionsImpl final : public luci::CircleOptimizer::Options { public: void enable(Algorithm) final; + void param(AlgorithmParameters, const std::string &) final; + const std::string param(AlgorithmParameters) const final; bool query(Algorithm) final; private: std::vector<Algorithm> _algorithms; + std::map<AlgorithmParameters, const std::string> _algorithm_params; }; void OptimizeOptionsImpl::enable(Algorithm algo) { _algorithms.push_back(algo); } +void OptimizeOptionsImpl::param(AlgorithmParameters param, const std::string &str) +{ + _algorithm_params.insert(std::pair<AlgorithmParameters, const std::string>(param, str)); +} + +const std::string OptimizeOptionsImpl::param(AlgorithmParameters param) const +{ + auto param_str = _algorithm_params.find(param); + if (param_str != _algorithm_params.end()) + { + return param_str->second; + } + else + { + return std::string(); + } +} + bool OptimizeOptionsImpl::query(Algorithm algo) { 
std::vector<Algorithm>::iterator it = std::find(_algorithms.begin(), _algorithms.end(), algo); @@ -77,14 +105,31 @@ void CircleOptimizer::optimize(loco::Graph *g) const logo::Phase phase; /* TRANSFORM DECLARATION BEGIN */ + if (_options->query(Options::Algorithm::ResolveCustomOpAdd)) + { + phase.emplace_back(std::make_unique<luci::ResolveCustomOpAddPass>()); + } + if (_options->query(Options::Algorithm::ResolveCustomOpBatchMatMul)) + { + phase.emplace_back(std::make_unique<luci::ResolveCustomOpBatchMatMulPass>()); + } + if (_options->query(Options::Algorithm::ResolveCustomOpMatMul)) + { + phase.emplace_back(std::make_unique<luci::ResolveCustomOpMatMulPass>()); + } if (_options->query(Options::Algorithm::FuseInstanceNorm)) { phase.emplace_back(std::make_unique<FuseInstanceNormPass>()); } + if (_options->query(Options::Algorithm::FuseBCQ)) + { + phase.emplace_back(std::make_unique<FuseBCQPass>()); + } + // Shape inference is needed for added nodes doing above transformations phase.emplace_back(std::make_unique<luci::ShapeInferencePass>()); phase.emplace_back(std::make_unique<luci::TypeInferencePass>()); - phase.emplace_back(std::make_unique<logo::RemoveDeadNodePass>()); + phase.emplace_back(std::make_unique<logo::RemoveDeadNodeWithQueryPass>()); /* TRANSFORM DECLARATION END */ ProgressReporter prog(g, logo::PhaseStrategy::Saturate); @@ -93,4 +138,74 @@ void CircleOptimizer::optimize(loco::Graph *g) const phase_runner.run(phase); } +void CircleOptimizer::quantize(loco::Graph *g) const +{ + // Fake quantization of weights + if (_options->query(Options::Algorithm::QuantizeDequantizeWeights)) + { + static const std::vector<std::string> fakeq_supported_input_dtype{"float32"}; + static const std::vector<std::string> fakeq_supported_output_dtype{"uint8"}; + static const std::vector<std::string> fakeq_supported_granularity{"layer"}; + + auto input_dtype = _options->param(Options::AlgorithmParameters::Quantize_input_dtype); + auto output_dtype = 
_options->param(Options::AlgorithmParameters::Quantize_output_dtype); + auto granularity = _options->param(Options::AlgorithmParameters::Quantize_granularity); + + if (!in_array(to_lower_case(input_dtype), fakeq_supported_input_dtype)) + throw std::runtime_error("Unsupported input type. List of supported input type: " + + to_string(fakeq_supported_input_dtype)); + + if (!in_array(to_lower_case(output_dtype), fakeq_supported_output_dtype)) + throw std::runtime_error("Unsupported output type. List of supported output type: " + + to_string(fakeq_supported_output_dtype)); + + if (!in_array(to_lower_case(granularity), fakeq_supported_granularity)) + throw std::runtime_error("Unsupported granularity. List of supported granularity: " + + to_string(fakeq_supported_granularity)); + + luci::QuantizeDequantizeWeightsPass fake_quantizer( + str_to_dtype(input_dtype), str_to_dtype(output_dtype), str_to_granularity(granularity)); + fake_quantizer.run(g); + } + + // Actual quantization of weights, bias, and activation + if (_options->query(Options::Algorithm::QuantizeWithMinMax)) + { + static const std::vector<std::string> qwmm_supported_input_dtype{"float32"}; + static const std::vector<std::string> qwmm_supported_output_dtype{"uint8"}; + static const std::vector<std::string> qwmm_supported_granularity{"layer"}; + + auto input_dtype = _options->param(Options::AlgorithmParameters::Quantize_input_dtype); + auto output_dtype = _options->param(Options::AlgorithmParameters::Quantize_output_dtype); + auto granularity = _options->param(Options::AlgorithmParameters::Quantize_granularity); + + if (!in_array(to_lower_case(input_dtype), qwmm_supported_input_dtype)) + throw std::runtime_error("Unsupported input type. List of supported input types: " + + to_string(qwmm_supported_input_dtype)); + + if (!in_array(to_lower_case(output_dtype), qwmm_supported_output_dtype)) + throw std::runtime_error("Unsupported output type. 
List of supported output types: " + + to_string(qwmm_supported_output_dtype)); + + if (!in_array(to_lower_case(granularity), qwmm_supported_granularity)) + throw std::runtime_error("Unsupported granularity. List of supported granularity: " + + to_string(qwmm_supported_granularity)); + + luci::QuantizeWithMinMaxPass quantizer(str_to_dtype(input_dtype), str_to_dtype(output_dtype), + str_to_granularity(granularity)); + quantizer.run(g); + } + + logo::Phase phase; + + // Do Shape/Type inference + phase.emplace_back(std::make_unique<luci::ShapeInferencePass>()); + phase.emplace_back(std::make_unique<luci::TypeInferencePass>()); + + ProgressReporter prog(g, logo::PhaseStrategy::Saturate); + logo::PhaseRunner<logo::PhaseStrategy::Saturate> phase_runner{g}; + phase_runner.attach(&prog); + phase_runner.run(phase); +} + } // namespace luci diff --git a/compiler/luci/pass/src/CircleOptimizerUtils.cpp b/compiler/luci/pass/src/CircleOptimizerUtils.cpp new file mode 100644 index 000000000..ffc372392 --- /dev/null +++ b/compiler/luci/pass/src/CircleOptimizerUtils.cpp @@ -0,0 +1,89 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "CircleOptimizerUtils.h" + +namespace luci +{ + +bool in_array(const std::string &str, const std::vector<std::string> &array) +{ + return std::find(array.begin(), array.end(), str) != array.end(); +} + +std::string to_string(const std::vector<std::string> &strings) +{ + assert(!strings.empty()); + + std::string res; + for (unsigned int i = 0; i < strings.size() - 1; i++) + res += strings[i] + ", "; + + res += strings[strings.size() - 1]; + return res; +} + +std::string to_lower_case(std::string s) +{ + std::transform(s.begin(), s.end(), s.begin(), [](unsigned char c) { return std::tolower(c); }); + return s; +} + +loco::DataType str_to_dtype(const std::string &str) +{ + if (to_lower_case(str).compare("uint8") == 0) + return loco::DataType::U8; + if (to_lower_case(str).compare("uint16") == 0) + return loco::DataType::U16; + if (to_lower_case(str).compare("uint32") == 0) + return loco::DataType::U32; + if (to_lower_case(str).compare("uint64") == 0) + return loco::DataType::U64; + + if (to_lower_case(str).compare("int8") == 0) + return loco::DataType::S8; + if (to_lower_case(str).compare("int16") == 0) + return loco::DataType::S16; + if (to_lower_case(str).compare("int32") == 0) + return loco::DataType::S32; + if (to_lower_case(str).compare("int64") == 0) + return loco::DataType::S64; + + if (to_lower_case(str).compare("float16") == 0) + return loco::DataType::FLOAT16; + if (to_lower_case(str).compare("float32") == 0) + return loco::DataType::FLOAT32; + if (to_lower_case(str).compare("float64") == 0) + return loco::DataType::FLOAT64; + + if (to_lower_case(str).compare("bool") == 0) + return loco::DataType::BOOL; + + return loco::DataType::Unknown; +} + +QuantizationGranularity str_to_granularity(const std::string &str) +{ + if (to_lower_case(str).compare("layer") == 0) + return QuantizationGranularity::LayerWise; + + if (to_lower_case(str).compare("channel") == 0) + return QuantizationGranularity::ChannelWise; + + throw 
std::runtime_error("Quantization granularity must be either 'layer' or 'channel'"); +} + +} // namespace luci diff --git a/compiler/luci/pass/src/CircleOptimizerUtils.h b/compiler/luci/pass/src/CircleOptimizerUtils.h new file mode 100644 index 000000000..7e577a05f --- /dev/null +++ b/compiler/luci/pass/src/CircleOptimizerUtils.h @@ -0,0 +1,42 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __LUCI_CIRCLE_OPTIMIZER_UTILS_H__ +#define __LUCI_CIRCLE_OPTIMIZER_UTILS_H__ + +#include "luci/Pass/QuantizeDequantizeWeightsPass.h" +#include "luci/Pass/QuantizeWithMinMaxPass.h" + +#include <loco.h> + +#include <algorithm> + +namespace luci +{ + +bool in_array(const std::string &, const std::vector<std::string> &); + +std::string to_string(const std::vector<std::string> &); + +std::string to_lower_case(std::string); + +loco::DataType str_to_dtype(const std::string &); + +QuantizationGranularity str_to_granularity(const std::string &); + +} // namespace luci + +#endif // __LUCI_CIRCLE_OPTIMIZER_UTILS_H__ diff --git a/compiler/luci/pass/src/FuseBCQPass.cpp b/compiler/luci/pass/src/FuseBCQPass.cpp new file mode 100644 index 000000000..b81db8827 --- /dev/null +++ b/compiler/luci/pass/src/FuseBCQPass.cpp @@ -0,0 +1,405 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. 
All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "luci/Pass/FuseBCQPass.h" + +#include <luci/IR/CircleNodes.h> + +#include <cassert> +#include <string> +#include <set> + +namespace +{ + +/** + * @brief Circle nodes including BCQ information and a circle node to which BCQ will be applied + * are connected with their name. And their names include common prefix. + * However, after pb file is converted to tflite file, some nodes' name are changed. + * Thus this function will return original common prefix. + * + * @note All the re-naming rule of TFLite converter is not figured out. + * Therefore, if new naming rule is detected, this function should be updated. 
+ */ +const std::string node_name_prefix(luci::NodeName node_name) +{ + std::string prefix = node_name; + + if (prefix.find("ReadVariableOp/resource/") != std::string::npos) + { + const auto start_index = prefix.find("ReadVariableOp/resource/"); + + const auto left_prefix = prefix.substr(0, start_index); + const auto right_prefix = prefix.substr(start_index + 24); + + prefix = left_prefix + right_prefix; + } + + if (prefix.find("Tensordot/") != std::string::npos) + { + const auto index = prefix.find("Tensordot/"); + prefix = prefix.substr(0, index - 1); + } + else if (prefix.find("kernel/") != std::string::npos) + { + const auto index = prefix.find("kernel/"); + prefix = prefix.substr(0, index - 1); + } + else if (prefix.find("/bcqinfo_") != std::string::npos) + { + const auto index = prefix.find("/bcqinfo_"); + prefix = prefix.substr(0, index); + } + + return prefix; +} + +} // namespace + +namespace +{ + +class BCQConverter final +{ +public: + void add_BCQ_info_node(luci::CircleConst *node) + { + const auto node_name = node->name(); + const auto prefix = node_name_prefix(node_name); + + // If bcqinfo_* nodes are held by Reshape operation, + // shape of bcqinfo_* nodes are copied to `shape` input of Reshape operation. + // Then the name becomes bcqinfo_*_copy_shape. + // We should prevent such nodes from being added to BCQ information. 
+ if (node_name.find("_copy_shape") != std::string::npos) + return; + + if (node_name.find("bcqinfo_do_w_x") != std::string::npos) + _do_w_x[prefix] = node; + else if (node_name.find("bcqinfo_alpha") != std::string::npos) + _alpha[prefix] = node; + else if (node_name.find("bcqinfo_packed_binary_code") != std::string::npos) + _packed_binary_code[prefix] = node; + else if (node_name.find("bcqinfo_number_of_clusters") != std::string::npos) + _number_of_clusters[prefix] = node; + else if (node_name.find("bcqinfo_size_of_clusters") != std::string::npos) + _size_of_clusters[prefix] = node; + else if (node_name.find("bcqinfo_qbits_of_clusters") != std::string::npos) + _qbits_of_clusters[prefix] = node; + else if (node_name.find("bcqinfo_dequant_weight") != std::string::npos) + _dequant_weight[prefix] = node; + } + + bool has_BCQ_info(luci::CircleConst *node) + { + const auto prefix = node_name_prefix(node->name()); + bool has_info = true; + + has_info &= (_do_w_x.find(prefix) != _do_w_x.end()); + has_info &= (_alpha.find(prefix) != _alpha.end()); + has_info &= (_packed_binary_code.find(prefix) != _packed_binary_code.end()); + has_info &= (_number_of_clusters.find(prefix) != _number_of_clusters.end()); + has_info &= (_size_of_clusters.find(prefix) != _size_of_clusters.end()); + has_info &= (_qbits_of_clusters.find(prefix) != _qbits_of_clusters.end()); + // bcqinfo_dequant_weight is just for validation, so not always exists. 
+ + return has_info; + } + + bool do_w_x(luci::CircleConst *node) + { + const auto prefix = node_name_prefix(node->name()); + + if (_do_w_x[prefix]->dtype() == loco::DataType::S32) + return _do_w_x[prefix]->at<loco::DataType::S32>(0) == 1; + else if (_do_w_x[prefix]->dtype() == loco::DataType::BOOL) + return _do_w_x[prefix]->at<loco::DataType::BOOL>(0); + else + throw std::runtime_error("do_w_x should be int or bool"); + } + + luci::CircleConst *get_alpha(luci::CircleConst *node) + { + const auto prefix = node_name_prefix(node->name()); + return _alpha[prefix]; + } + + luci::CircleConst *get_packed_binary_code(luci::CircleConst *node) + { + const auto prefix = node_name_prefix(node->name()); + return _packed_binary_code[prefix]; + } + + luci::CircleConst *get_number_of_clusters(luci::CircleConst *node) + { + const auto prefix = node_name_prefix(node->name()); + return _number_of_clusters[prefix]; + } + + luci::CircleConst *get_size_of_clusters(luci::CircleConst *node) + { + const auto prefix = node_name_prefix(node->name()); + return _size_of_clusters[prefix]; + } + + luci::CircleConst *get_qbits_of_clusters(luci::CircleConst *node) + { + const auto prefix = node_name_prefix(node->name()); + return _qbits_of_clusters[prefix]; + } + + luci::CircleConst *packed_clusters(luci::CircleConst *node) + { + auto graph = node->graph(); + auto qbits_of_clusters = get_qbits_of_clusters(node); + auto size_of_clusters = get_size_of_clusters(node); + const auto number_of_clusters = get_number_of_clusters(node)->at<loco::DataType::S32>(0); + + auto packed_clusters = graph->nodes()->create<luci::CircleConst>(); + packed_clusters->dtype(loco::DataType::S32); + packed_clusters->size<loco::DataType::S32>(number_of_clusters * 2); + packed_clusters->rank(2); + packed_clusters->dim(0) = number_of_clusters; + packed_clusters->dim(1) = 2; + packed_clusters->shape_status(luci::ShapeStatus::VALID); + + for (int i = 0; i < number_of_clusters; ++i) + { + 
packed_clusters->at<loco::DataType::S32>(i * 2) = + qbits_of_clusters->at<loco::DataType::S32>(i); + packed_clusters->at<loco::DataType::S32>(i * 2 + 1) = + size_of_clusters->at<loco::DataType::S32>(i); + } + + return packed_clusters; + } + + /** + * @brief Exclude BCQ information nodes which are used for fusing BCQ operations + * from graph output by using CircleOutputExclude + */ + void clear_BCQ_nodes() + { + auto createNoOp = [](luci::CircleNode *circle_node) { + auto graph = circle_node->graph(); + auto noOp = graph->nodes()->create<luci::CircleOutputExclude>(); + + if (circle_node->shape_status() == luci::ShapeStatus::VALID) + { + noOp->dtype(circle_node->dtype()); + noOp->rank(circle_node->rank()); + for (uint32_t i = 0; i < circle_node->rank(); ++i) + noOp->dim(i) = circle_node->dim(i); + } + else + { + // For type inference + noOp->dtype(loco::DataType::FLOAT32); + } + + return noOp; + }; + + auto clear_nodes = [createNoOp](std::map<std::string, luci::CircleConst *> &nodes) { + for (auto &n : nodes) + { + auto node = n.second; + + for (auto s : loco::succs(node)) + { + if (auto outnode = dynamic_cast<luci::CircleOutput *>(s)) + { + outnode->from(createNoOp(node)); + } + else if (auto reshape_node = dynamic_cast<luci::CircleReshape *>(s)) + { + for (auto o : loco::succs(reshape_node)) + { + auto circle_output = loco::must_cast<luci::CircleOutput *>(o); + circle_output->from(createNoOp(reshape_node)); + } + } + } + } + }; + + clear_nodes(_do_w_x); + clear_nodes(_alpha); + clear_nodes(_packed_binary_code); + clear_nodes(_number_of_clusters); + clear_nodes(_size_of_clusters); + clear_nodes(_qbits_of_clusters); + clear_nodes(_dequant_weight); + } + +private: + std::map<std::string, luci::CircleConst *> _do_w_x; + std::map<std::string, luci::CircleConst *> _alpha; + std::map<std::string, luci::CircleConst *> _packed_binary_code; + std::map<std::string, luci::CircleConst *> _number_of_clusters; + std::map<std::string, luci::CircleConst *> _size_of_clusters; + 
std::map<std::string, luci::CircleConst *> _qbits_of_clusters; + std::map<std::string, luci::CircleConst *> _dequant_weight; +}; + +} // namespace + +namespace luci +{ + +bool FuseBCQPass::run(loco::Graph *g) +{ + BCQConverter converter; + + bool changed = false; + + for (auto node : loco::all_nodes(g)) + { + if (auto circle_const = dynamic_cast<luci::CircleConst *>(node)) + { + converter.add_BCQ_info_node(circle_const); + } + } + + for (auto node : loco::active_nodes(loco::output_nodes(g))) + { + if (auto gather = dynamic_cast<luci::CircleGather *>(node)) + { + auto params = dynamic_cast<luci::CircleConst *>(gather->params()); + if (params != nullptr && converter.has_BCQ_info(params)) + { + auto bcq_gather = g->nodes()->create<luci::CircleBCQGather>(); + + bcq_gather->input_scales(converter.get_alpha(params)); + bcq_gather->input_binary(converter.get_packed_binary_code(params)); + bcq_gather->indices(gather->indices()); + bcq_gather->input_clusters(converter.packed_clusters(params)); + + const auto binary_hidden_size = + loco::must_cast<luci::CircleConst *>(bcq_gather->input_binary())->dim(1).value() * 32; + bcq_gather->input_hidden_size(binary_hidden_size); + + if (converter.do_w_x(params)) + { + bcq_gather->axis(gather->axis()); + } + else + { + const auto axis_transpose = (gather->axis() == 0) ? 
1 : 0; + bcq_gather->axis(axis_transpose); + } + + loco::replace(gather).with(bcq_gather); + + changed = true; + } + } + else if (auto fully_connected = dynamic_cast<luci::CircleFullyConnected *>(node)) + { + auto weights = dynamic_cast<luci::CircleConst *>(fully_connected->weights()); + if (weights != nullptr && converter.has_BCQ_info(weights)) + { + auto bcq_fc = g->nodes()->create<luci::CircleBCQFullyConnected>(); + + bcq_fc->weights_scales(converter.get_alpha(weights)); + bcq_fc->weights_binary(converter.get_packed_binary_code(weights)); + bcq_fc->bias(fully_connected->bias()); + bcq_fc->weights_clusters(converter.packed_clusters(weights)); + bcq_fc->fusedActivationFunction(fully_connected->fusedActivationFunction()); + + loco::Node *bcq_input = fully_connected->input(); + int32_t batch_rank = 0; + + // If input of BCQFullyConnected has more than rank 2, we should reshape it as rank 2 + const auto original_input = loco::must_cast<luci::CircleNode *>(fully_connected->input()); + if (original_input->shape_status() == ShapeStatus::VALID && original_input->rank() > 2) + { + auto new_shape = g->nodes()->create<luci::CircleConst>(); + new_shape->dtype(loco::DataType::S32); + new_shape->size<loco::DataType::S32>(2); + new_shape->rank(1); + new_shape->dim(0) = 2; + + auto batch_size = 1; + for (uint32_t i = 0; i < original_input->rank() - 1; ++i) + batch_size *= original_input->dim(i).value(); + + new_shape->at<loco::DataType::S32>(0) = batch_size; + new_shape->at<loco::DataType::S32>(1) = + original_input->dim(original_input->rank() - 1).value(); + new_shape->shape_status(ShapeStatus::VALID); + + auto reshape = g->nodes()->create<luci::CircleReshape>(); + reshape->tensor(original_input); + reshape->shape(new_shape); + + bcq_input = reshape; + batch_rank = original_input->rank() - 2; + } + + // If x_w formation, we should insert Transpose in front and back of BCQFullyConnected + if (converter.do_w_x(weights)) + { + const auto binary_hidden_size = + 
loco::must_cast<luci::CircleNode *>(fully_connected->input()) + ->dim(batch_rank) + .value(); + bcq_fc->weights_hidden_size(binary_hidden_size); + bcq_fc->input(bcq_input); + loco::replace(fully_connected).with(bcq_fc); + } + else + { + const auto binary_hidden_size = + loco::must_cast<luci::CircleNode *>(fully_connected->input()) + ->dim(1 + batch_rank) + .value(); + bcq_fc->weights_hidden_size(binary_hidden_size); + + auto perm = g->nodes()->create<luci::CircleConst>(); + perm->dtype(loco::DataType::S32); + perm->size<loco::DataType::S32>(2); + perm->rank(1); + perm->dim(0) = 2; + perm->at<loco::DataType::S32>(0) = 1; + perm->at<loco::DataType::S32>(1) = 0; + perm->shape_status(ShapeStatus::VALID); + + auto input_transpose = g->nodes()->create<luci::CircleTranspose>(); + input_transpose->a(bcq_input); + input_transpose->perm(perm); + + bcq_fc->input(input_transpose); + + auto output_transpose = g->nodes()->create<luci::CircleTranspose>(); + output_transpose->a(bcq_fc); + output_transpose->perm(perm); + + loco::replace(fully_connected).with(output_transpose); + } + + changed = true; + } + } + } + + if (changed) + converter.clear_BCQ_nodes(); + + return changed; +} + +} // namespace luci diff --git a/compiler/luci/pass/src/FuseInstanceNormPass.cpp b/compiler/luci/pass/src/FuseInstanceNormPass.cpp index 180b5bbef..ad8765c41 100644 --- a/compiler/luci/pass/src/FuseInstanceNormPass.cpp +++ b/compiler/luci/pass/src/FuseInstanceNormPass.cpp @@ -15,6 +15,7 @@ */ #include "luci/Pass/FuseInstanceNormPass.h" +#include "FuseInstanceNormPassInternal.h" #include <luci/IR/CircleNodes.h> @@ -114,8 +115,6 @@ bool NodeFiller<ARG_TYPE_1, ARG_TYPE_2>::with_commutative_args_of(const COMM_NOD } // namespace // Helper to check detail -namespace -{ /// @return true When node has shape of '1 x .. 
x 1 x depth' bool is_1D_with_dummy_dim(luci::CircleConst *node, uint32_t depth) @@ -130,7 +129,23 @@ bool is_1D_with_dummy_dim(luci::CircleConst *node, uint32_t depth) return node->dim(axis).value() == depth; } -bool is_instance_mean(luci::CircleMean *mean) +/// @return true if node shape consists of ones, except the one before the last dim: 1,...1,depth,1 +bool is_quasi_1D_with_dummy_dim(luci::CircleConst *node, uint32_t depth) +{ + auto rank = node->rank(); + // minimal accepted shape is [1 x depth x 1] + if (rank < 3) + return false; + const auto depth_axis = rank - 2; + for (uint32_t axis = 0; axis < rank; ++axis) + { + if (axis != depth_axis && node->dim(axis).value() != 1) + return false; + } + return node->dim(depth_axis).value() == depth; +} + +bool is_instance_mean_v0(luci::CircleMean *mean) { // // CHECK 1) input is rank 4 @@ -175,7 +190,53 @@ bool is_instance_mean(luci::CircleMean *mean) return mean->keep_dims(); } -} // namespace +bool is_instance_mean_v1(luci::CircleMean *mean) +{ + // + // CHECK 1) input is rank 5 (NHWCX) + // + auto input = mean->input(); + if (not loco::shape_known(input)) + return false; + auto input_shape = loco::shape_get(input).as<loco::TensorShape>(); + if (input_shape.rank() != 5) + return false; + + // + // CHECK 2) 'reduction indices' is CircleConst of value [1,2,4], that is HWX of NHWCX input shape + // + // TODO Support equivalent case, like [-3,-2] + // TODO Support non-Const case? + // TODO What if input is NCHW format in Circle? 
+ auto red_indices = dynamic_cast<luci::CircleConst *>(mean->reduction_indices()); + if (not red_indices) + return false; + if (red_indices->rank() != 1) + return false; + std::set<int32_t> red_indices_set; + + // TODO Currently only support S32, support other types + if (red_indices->dtype() != loco::DataType::S32) + return false; + for (uint32_t i = 0; i < red_indices->dim(0).value(); ++i) + red_indices_set.insert(red_indices->at<loco::DataType::S32>(i)); + + if (red_indices_set.size() != 3) + return false; + if (red_indices_set.find(1) == red_indices_set.end()) + return false; + if (red_indices_set.find(2) == red_indices_set.end()) + return false; + if (red_indices_set.find(4) == red_indices_set.end()) + return false; + + // + // CHECK 3) keep_dims == true (?) + // + // We only have case of 'keep_dims == true' so far, but it might be okay with 'keep_dims == false' + // TODO Check this fact, and if true, return true regardless of keep_dims + return mean->keep_dims(); +} // Helper to fuse Instance Norm namespace @@ -227,14 +288,61 @@ namespace * | * V * [Out] + *------------------------------------------------------------------- + * [In] + * | + * V + * ifm + * | + * V + * +---------reshape_of_ifm ----+ (reduction indicies) + * | | | | + * | | V V + * | | mean_of_reshape -------------+ + * | V | | + * | sqdiff <--+ (reduction indicies) | + * | | | | + * | V | | + * | mean_as_variance <---+ const_as_epsilon | + * | | | | + * | V | | + * | add_as_variance <--------+ | + * | | | + * | V | + * | rsqrt const_as_gamma | + * | | | | + * | V | | + * | mul_gamma <--+ | + * | | | | + * V V V | + * mul_as_scaled_reshape mul_as_scaled_mean <-----------+ + * | | + * | const_as_beta | + * | | V + * | +------> sub + * V | + * add_as_terminal <----------+ + * | + * V + * reshape_as_terminal + * | + * V + * [Out] */ class InstanceNormPattern final { public: - InstanceNormPattern(luci::CircleAdd *candidate) + enum PatternVersion + { + Version_0, + Version_1 + }; + + 
InstanceNormPattern(luci::CircleAdd *candidate, PatternVersion pv) { assert(candidate); add_as_terminal = candidate; + _pv = pv; } public: @@ -244,7 +352,9 @@ public: public: // Context loco::Node *ifm = nullptr; + luci::CircleReshape *reshape_of_ifm = nullptr; luci::CircleMean *mean_of_ifm = nullptr; + luci::CircleMean *mean_of_reshape = nullptr; luci::CircleSquaredDifference *sqdiff = nullptr; luci::CircleMean *mean_as_variance = nullptr; luci::CircleConst *const_as_epsilon = nullptr; @@ -254,12 +364,14 @@ public: luci::CircleMul *mul_gamma = nullptr; luci::CircleMul *mul_as_scaled_ifm = nullptr; luci::CircleMul *mul_as_scaled_mean = nullptr; + luci::CircleMul *mul_as_scaled_reshape = nullptr; luci::CircleConst *const_as_beta = nullptr; luci::CircleSub *sub = nullptr; luci::CircleAdd *add_as_terminal = nullptr; private: bool _matched = false; + PatternVersion _pv; }; bool InstanceNormPattern::matched() @@ -273,8 +385,18 @@ bool InstanceNormPattern::matched() // Check order is DFS - CHECK_OR_FALSE(fill(&mul_as_scaled_ifm, &sub).with_commutative_args_of(add_as_terminal)); - CHECK_OR_FALSE(fill(&ifm, &mul_gamma).with_commutative_args_of(mul_as_scaled_ifm)); + if (_pv == PatternVersion::Version_0) + { + CHECK_OR_FALSE(fill(&mul_as_scaled_ifm, &sub).with_commutative_args_of(add_as_terminal)); + CHECK_OR_FALSE(fill(&ifm, &mul_gamma).with_commutative_args_of(mul_as_scaled_ifm)); + } + if (_pv == PatternVersion::Version_1) + { + CHECK_OR_FALSE(fill(&mul_as_scaled_reshape, &sub).with_commutative_args_of(add_as_terminal)); + CHECK_OR_FALSE( + fill(&reshape_of_ifm, &mul_gamma).with_commutative_args_of(mul_as_scaled_reshape)); + ifm = reshape_of_ifm->tensor(); + } CHECK_OR_FALSE(loco::shape_known(ifm)); auto ifm_shape = loco::shape_get(ifm); @@ -284,7 +406,15 @@ bool InstanceNormPattern::matched() uint32_t ifm_channel_depth = ifm_tensor_shape.dim(3).value(); CHECK_OR_FALSE(fill(&rsqrt, &const_as_gamma).with_commutative_args_of(mul_gamma)); - 
CHECK_OR_FALSE(is_1D_with_dummy_dim(const_as_gamma, ifm_channel_depth)); + + if (_pv == PatternVersion::Version_0) + { + CHECK_OR_FALSE(is_1D_with_dummy_dim(const_as_gamma, ifm_channel_depth)); + } + if (_pv == PatternVersion::Version_1) + { + CHECK_OR_FALSE(is_quasi_1D_with_dummy_dim(const_as_gamma, ifm_channel_depth)); + } add_as_variance = dynamic_cast<luci::CircleAdd *>(rsqrt->x()); CHECK_OR_FALSE(add_as_variance); @@ -296,29 +426,69 @@ bool InstanceNormPattern::matched() // TODO Support regarding broadcast CHECK_OR_FALSE(const_as_epsilon->size<loco::DataType::FLOAT32>() == 1); - CHECK_OR_FALSE(is_instance_mean(mean_as_variance)); + if (_pv == PatternVersion::Version_0) + { + CHECK_OR_FALSE(is_instance_mean_v0(mean_as_variance)); + } + if (_pv == PatternVersion::Version_1) + { + CHECK_OR_FALSE(is_instance_mean_v1(mean_as_variance)); + } + sqdiff = dynamic_cast<luci::CircleSquaredDifference *>(mean_as_variance->input()); CHECK_OR_FALSE(sqdiff); - loco::Node *ifm_should_be = nullptr; - CHECK_OR_FALSE(fill(&ifm_should_be, &mean_of_ifm).with_commutative_args_of(sqdiff)); - CHECK_OR_FALSE(ifm == ifm_should_be); - CHECK_OR_FALSE(is_instance_mean(mean_of_ifm)); - CHECK_OR_FALSE(ifm == mean_of_ifm->input()); + if (_pv == PatternVersion::Version_0) + { + loco::Node *ifm_should_be = nullptr; + CHECK_OR_FALSE(fill(&ifm_should_be, &mean_of_ifm).with_commutative_args_of(sqdiff)); + CHECK_OR_FALSE(ifm == ifm_should_be); + CHECK_OR_FALSE(is_instance_mean_v0(mean_of_ifm)); + CHECK_OR_FALSE(ifm == mean_of_ifm->input()); + } + if (_pv == PatternVersion::Version_1) + { + loco::Node *reshape_should_be = nullptr; + CHECK_OR_FALSE(fill(&reshape_should_be, &mean_of_reshape).with_commutative_args_of(sqdiff)); + CHECK_OR_FALSE(reshape_of_ifm == reshape_should_be); + CHECK_OR_FALSE(is_instance_mean_v1(mean_of_reshape)); + CHECK_OR_FALSE(reshape_of_ifm == mean_of_reshape->input()); + } const_as_beta = dynamic_cast<luci::CircleConst *>(sub->x()); CHECK_OR_FALSE(const_as_beta); - 
CHECK_OR_FALSE(is_1D_with_dummy_dim(const_as_beta, ifm_channel_depth)); + + if (_pv == PatternVersion::Version_0) + { + CHECK_OR_FALSE(is_1D_with_dummy_dim(const_as_beta, ifm_channel_depth)); + } + if (_pv == PatternVersion::Version_1) + { + CHECK_OR_FALSE(is_quasi_1D_with_dummy_dim(const_as_beta, ifm_channel_depth)); + } mul_as_scaled_mean = dynamic_cast<luci::CircleMul *>(sub->y()); CHECK_OR_FALSE(mul_as_scaled_mean); luci::CircleMul *mul_gamma_should_be = nullptr; luci::CircleMean *mean_of_ifm_should_be = nullptr; - CHECK_OR_FALSE(fill(&mul_gamma_should_be, &mean_of_ifm_should_be) - .with_commutative_args_of(mul_as_scaled_mean)); - CHECK_OR_FALSE(mul_gamma == mul_gamma_should_be); - CHECK_OR_FALSE(mean_of_ifm == mean_of_ifm_should_be); + luci::CircleMean *mean_of_reshape_should_be = nullptr; + + if (_pv == PatternVersion::Version_0) + { + CHECK_OR_FALSE(fill(&mul_gamma_should_be, &mean_of_ifm_should_be) + .with_commutative_args_of(mul_as_scaled_mean)); + CHECK_OR_FALSE(mul_gamma == mul_gamma_should_be); + CHECK_OR_FALSE(mean_of_ifm == mean_of_ifm_should_be); + } + if (_pv == PatternVersion::Version_1) + { + CHECK_OR_FALSE(fill(&mul_gamma_should_be, &mean_of_reshape_should_be) + .with_commutative_args_of(mul_as_scaled_mean)); + CHECK_OR_FALSE(mul_gamma == mul_gamma_should_be); + CHECK_OR_FALSE(mean_of_reshape == mean_of_reshape_should_be); + } + #undef CHECK_OR_FALSE _matched = true; return true; @@ -381,13 +551,28 @@ namespace luci bool FuseInstanceNormPass::run(loco::Graph *g) { bool changed = false; + luci::CircleAdd *add; + InstanceNormPattern::PatternVersion pv; + for (auto node : loco::active_nodes(loco::output_nodes(g))) { - auto add = dynamic_cast<luci::CircleAdd *>(node); - if (not add) - continue; + auto reshape = dynamic_cast<luci::CircleReshape *>(node); + if (not reshape) + { + add = dynamic_cast<luci::CircleAdd *>(node); + if (not add) + continue; + pv = InstanceNormPattern::PatternVersion::Version_0; + } + else + { + add = 
dynamic_cast<luci::CircleAdd *>(reshape->tensor()); + if (not add) + continue; + pv = InstanceNormPattern::PatternVersion::Version_1; + } - InstanceNormPattern pattern(add); + InstanceNormPattern pattern(add, pv); if (not pattern.matched()) continue; diff --git a/compiler/luci/pass/src/FuseInstanceNormPass.test.cpp b/compiler/luci/pass/src/FuseInstanceNormPass.test.cpp new file mode 100644 index 000000000..3037f3def --- /dev/null +++ b/compiler/luci/pass/src/FuseInstanceNormPass.test.cpp @@ -0,0 +1,64 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "FuseInstanceNormPassInternal.h" + +#include <vector> + +#include <gtest/gtest.h> + +namespace +{ + +void setShape(luci::CircleNode &node, const std::vector<int> &v) +{ + node.rank(v.size()); + for (int i = 0; i < v.size(); ++i) + { + node.dim(i) = v[i]; + } +} + +} // namespace + +TEST(FuseInstanceNormPass, is_quasi_1D_with_dummy_dim) +{ + luci::CircleConst const_node; + + setShape(const_node, {}); + EXPECT_FALSE(is_quasi_1D_with_dummy_dim(&const_node, 8)); + + setShape(const_node, {1}); + EXPECT_FALSE(is_quasi_1D_with_dummy_dim(&const_node, 8)); + + setShape(const_node, {8}); + EXPECT_FALSE(is_quasi_1D_with_dummy_dim(&const_node, 8)); + + setShape(const_node, {1, 2, 1, 8, 1}); + EXPECT_FALSE(is_quasi_1D_with_dummy_dim(&const_node, 8)); + + setShape(const_node, {8, 3}); + EXPECT_FALSE(is_quasi_1D_with_dummy_dim(&const_node, 8)); + + setShape(const_node, {8, 1}); + EXPECT_FALSE(is_quasi_1D_with_dummy_dim(&const_node, 8)); + + setShape(const_node, {1, 8, 1}); + EXPECT_TRUE(is_quasi_1D_with_dummy_dim(&const_node, 8)); + + setShape(const_node, {1, 1, 1, 8, 1}); + EXPECT_TRUE(is_quasi_1D_with_dummy_dim(&const_node, 8)); +} diff --git a/compiler/luci/pass/src/FuseInstanceNormPassInternal.h b/compiler/luci/pass/src/FuseInstanceNormPassInternal.h new file mode 100644 index 000000000..32b638ba5 --- /dev/null +++ b/compiler/luci/pass/src/FuseInstanceNormPassInternal.h @@ -0,0 +1,28 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __LUCI_CIRCLE_FUSE_INSTANCE_NORM_PASS_INTERNAL_H__ +#define __LUCI_CIRCLE_FUSE_INSTANCE_NORM_PASS_INTERNAL_H__ + +#include <luci/IR/CircleNodes.h> + +/// @return true When node has shape of '1 x .. x 1 x depth' +bool is_1D_with_dummy_dim(luci::CircleConst *node, uint32_t depth); + +/// @return true When node has shape of '1 x .. x depth x 1' +bool is_quasi_1D_with_dummy_dim(luci::CircleConst *node, uint32_t depth); + +#endif // __LUCI_CIRCLE_FUSE_INSTANCE_NORM_PASS_INTERNAL_H__ diff --git a/compiler/luci/pass/src/QuantizationUtils.cpp b/compiler/luci/pass/src/QuantizationUtils.cpp new file mode 100644 index 000000000..6726ce746 --- /dev/null +++ b/compiler/luci/pass/src/QuantizationUtils.cpp @@ -0,0 +1,172 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "QuantizationUtils.h" + +#include <luci/Log.h> + +#include <iostream> +#include <cmath> + +namespace luci +{ + +void compute_sym_scale_zp(float min, float max, float &scaling_factor, int64_t &zp, + float &nudged_min, float &nudged_max) +{ + assert(min != max); + + const int32_t kMaxScale = std::numeric_limits<int16_t>::max(); + const int32_t kMinScale = -kMaxScale; + const double qmin_double = kMinScale; + const double qmax_double = kMaxScale; + const double rmin = std::fmin(0, min); + const double rmax = std::fmax(0, max); + double scale_factor_from_min_side{0}; + double scale_factor_from_max_side{0}; + + if ((qmin_double * rmin) > 0) + scale_factor_from_min_side = rmin / qmin_double; + + if ((qmax_double * rmax) > 0) + scale_factor_from_max_side = rmax / qmax_double; + + scaling_factor = scale_factor_from_min_side > scale_factor_from_max_side + ? scale_factor_from_min_side + : scale_factor_from_max_side; + zp = 0; + nudged_min = static_cast<float>(qmin_double * scaling_factor); + nudged_max = static_cast<float>(qmax_double * scaling_factor); +} + +void compute_asym_scale_zp(float min, float max, float &scaling_factor, int64_t &zp, + float &nudged_min, float &nudged_max) +{ + LOGGER(l); + + assert(min <= max); + const int32_t kMinScale = 0; + const int32_t kMaxScale = 255; + const double qmin_double = kMinScale; + const double qmax_double = kMaxScale; + const double rmin = std::fmin(0, min); + const double rmax = std::fmax(0, max); + + double scale = (rmax - rmin) / (qmax_double - qmin_double); + double zero_point_double = 0; + uint8_t nudged_zero_point = 0; + if (scale == 0) + { + WARN(l) << "The minimum and maximum values are the same." 
<< std::endl; + if (min >= 0 && max >= 0) + zero_point_double = kMinScale; + else + zero_point_double = kMaxScale; + } + else + zero_point_double = qmin_double - rmin / scale; + if (zero_point_double <= qmin_double) + { + assert(min >= 0 && max >= 0); + nudged_zero_point = kMinScale; + scale = max / (qmax_double - qmin_double); + if (min > 0 && max > 0) + WARN(l) << "The minimum and maximum values are all positive." << std::endl; + } + else if (zero_point_double >= qmax_double) + { + assert(min < 0 && max < 0); + nudged_zero_point = kMaxScale; + scale = -min / (qmax_double - qmin_double); + WARN(l) << "The minimum and maximum values are all negative." << std::endl; + } + else + { + assert(min < 0 && max >= 0); + nudged_zero_point = static_cast<uint8_t>(std::round(zero_point_double)); + } + + nudged_min = static_cast<float>((qmin_double - nudged_zero_point) * scale); + nudged_max = static_cast<float>((qmax_double - nudged_zero_point) * scale); + + scaling_factor = scale; + zp = nudged_zero_point; +} + +bool get_channel_dim_index(CircleConst *node, loco::TensorShape &dimension, int &channel_dim_index) +{ + auto succs = loco::succs(node); + if (succs.size() != 1) // assume weights is used by only one node + return false; + + for (auto out : succs) + { + auto conv = dynamic_cast<CircleConv2D *>(out); + auto dw_conv = dynamic_cast<CircleDepthwiseConv2D *>(out); + auto tw_conv = dynamic_cast<CircleTransposeConv *>(out); + auto fc = dynamic_cast<CircleFullyConnected *>(out); + + // Refer to https://github.com/Samsung/ONE/pull/2448. 
+ if ((conv != nullptr && conv->filter() == node) || + (tw_conv != nullptr && tw_conv->filter() == node)) // OHWI + { + assert(node->rank() == 4); + dimension.dim(0).set(node->dim(0).value()); + dimension.dim(1).set(node->dim(1).value()); + dimension.dim(2).set(node->dim(2).value()); + dimension.dim(3).set(node->dim(3).value()); + channel_dim_index = 0; // Set channel_dim_index based on "O" + return true; + } + else if (dw_conv != nullptr && dw_conv->filter() == node) // IHWC + { + assert(node->rank() == 4); + dimension.dim(0).set(node->dim(0).value()); + dimension.dim(1).set(node->dim(1).value()); + dimension.dim(2).set(node->dim(2).value()); + dimension.dim(3).set(node->dim(3).value()); + channel_dim_index = 3; // Set channel_dim_index based on "C" + return true; + } + else if (fc != nullptr && fc->weights() == node) // OI + { + assert(node->rank() == 2); + dimension.dim(0).set(node->dim(0).value()); + dimension.dim(1).set(1); // Set FC layer like CONV + dimension.dim(2).set(1); + dimension.dim(3).set(node->dim(1).value()); + channel_dim_index = 0; // Set channel_dim_index based on "O" + return true; + } + else + { + // node does not support channle-wise quantization + assert(false); + } + } + + return false; +} + +uint32_t cal_offset(loco::TensorShape &dimension, uint32_t *indices) +{ + return indices[0] * dimension.dim(1).value() * dimension.dim(2).value() * + dimension.dim(3).value() + + indices[1] * dimension.dim(2).value() * dimension.dim(3).value() + + indices[2] * dimension.dim(3).value() + indices[3]; +} + +} // namespace luci diff --git a/compiler/luci/pass/src/QuantizationUtils.h b/compiler/luci/pass/src/QuantizationUtils.h new file mode 100644 index 000000000..ec0e86df8 --- /dev/null +++ b/compiler/luci/pass/src/QuantizationUtils.h @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. 
All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __LUCI_QUANTIZATION_UTILS_H__ +#define __LUCI_QUANTIZATION_UTILS_H__ + +#include <luci/IR/CircleNodes.h> +#include <loco/IR/TensorShape.h> + +namespace luci +{ + +void compute_sym_scale_zp(float min, float max, float &scaling_factor, int64_t &zp, + float &nudged_min, float &nudged_max); + +void compute_asym_scale_zp(float min, float max, float &scaling_factor, int64_t &zp, + float &nudged_min, float &nudged_max); + +bool get_channel_dim_index(CircleConst *node, loco::TensorShape &dimension, int &channel_dim_index); + +uint32_t cal_offset(loco::TensorShape &dimension, uint32_t *indices); + +} // namespace luci + +#endif // __LUCI_QUANTIZATION_UTILS_H__ diff --git a/compiler/luci/pass/src/QuantizeDequantizeWeightsPass.cpp b/compiler/luci/pass/src/QuantizeDequantizeWeightsPass.cpp new file mode 100644 index 000000000..c492234c7 --- /dev/null +++ b/compiler/luci/pass/src/QuantizeDequantizeWeightsPass.cpp @@ -0,0 +1,495 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2019 The TensorFlow Authors. All Rights Reserved. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "luci/Pass/QuantizeDequantizeWeightsPass.h" +#include "QuantizationUtils.h" + +#include <luci/IR/CircleNodes.h> +#include <luci/IR/CircleNodeVisitor.h> +#include <luci/Log.h> +#include <loco/IR/TensorShape.h> + +#include <iostream> +#include <cmath> + +namespace luci +{ + +namespace +{ + +void cal_minmax_per_channel(CircleConst *node, std::vector<float> &min, std::vector<float> &max) +{ + loco::TensorShape dimension; + dimension.rank(4); + uint32_t indices[4] = { + 0, + }; + int channel_dim_index{0}; + int size{0}; + + if (!get_channel_dim_index(node, dimension, channel_dim_index)) + { + assert(false); + return; + } + size = dimension.dim(channel_dim_index).value(); + + std::vector<bool> has_min_max_value(size, false); + min.resize(size); + max.resize(size); + for (indices[0] = 0; indices[0] < dimension.dim(0).value(); indices[0]++) + { + for (indices[1] = 0; indices[1] < dimension.dim(1).value(); indices[1]++) + { + for (indices[2] = 0; indices[2] < dimension.dim(2).value(); indices[2]++) + { + for (indices[3] = 0; indices[3] < dimension.dim(3).value(); indices[3]++) + { + int channel_idx = indices[channel_dim_index]; + auto data = node->at<loco::DataType::FLOAT32>(cal_offset(dimension, indices)); + if (has_min_max_value[channel_idx]) + { + min[channel_idx] = data < min[channel_idx] ? data : min[channel_idx]; + max[channel_idx] = data > max[channel_idx] ? 
data : max[channel_idx]; + } + else + { + min[channel_idx] = data; + max[channel_idx] = data; + has_min_max_value[channel_idx] = true; + } + } + } + } + } +} + +void sym_wquant_per_channel(CircleConst *node, std::vector<float> &min, std::vector<float> &max, + std::vector<float> &scaling_factor, std::vector<int64_t> &zp, + std::vector<float> &nudged_min, std::vector<float> &nudged_max) +{ + assert(node->dtype() == loco::DataType::FLOAT32); + const int32_t kMaxScale = std::numeric_limits<int16_t>::max(); + const int32_t kMinScale = -kMaxScale; + + uint32_t size = node->size<loco::DataType::FLOAT32>(); + std::vector<int32_t> quantized_values(size); + + for (size_t i = 0; i < min.size(); ++i) + { + compute_sym_scale_zp(min[i], max[i], scaling_factor[i], zp[i], nudged_min[i], nudged_max[i]); + } + + loco::TensorShape dimension; + dimension.rank(4); + uint32_t indices[4] = { + 0, + }; + int channel_dim_index{0}; + + if (!get_channel_dim_index(node, dimension, channel_dim_index)) + { + assert(false); + return; + } + + for (indices[0] = 0; indices[0] < dimension.dim(0).value(); indices[0]++) + { + for (indices[1] = 0; indices[1] < dimension.dim(1).value(); indices[1]++) + { + for (indices[2] = 0; indices[2] < dimension.dim(2).value(); indices[2]++) + { + for (indices[3] = 0; indices[3] < dimension.dim(3).value(); indices[3]++) + { + int channel_idx = indices[channel_dim_index]; + const float scaling_factor_inv = 1.0 / scaling_factor[channel_idx]; + auto data = node->at<loco::DataType::FLOAT32>(cal_offset(dimension, indices)); + data = data < nudged_min[channel_idx] ? nudged_min[channel_idx] : data; + data = data > nudged_max[channel_idx] ? 
nudged_max[channel_idx] : data; + quantized_values[cal_offset(dimension, indices)] = + static_cast<int32_t>(std::round(data * scaling_factor_inv)); + } + } + } + } + + node->dtype(loco::DataType::S16); // change the type of tensor + node->size<loco::DataType::S16>(size); // resize tensor + for (uint32_t i = 0; i < size; ++i) + { + node->at<loco::DataType::S16>(i) = + std::min(kMaxScale, std::max(kMinScale, quantized_values[i])); + } +} + +void sym_wdequant_per_channel(CircleConst *node, std::vector<float> &scaling_factor) +{ + assert(node->dtype() == loco::DataType::S16); + uint32_t size = node->size<loco::DataType::S16>(); + std::vector<float> dequantized_values(size); + + loco::TensorShape dimension; + dimension.rank(4); + uint32_t indices[4] = { + 0, + }; + int channel_dim_index{0}; + + if (!get_channel_dim_index(node, dimension, channel_dim_index)) + { + assert(false); + return; + } + + for (indices[0] = 0; indices[0] < dimension.dim(0).value(); indices[0]++) + { + for (indices[1] = 0; indices[1] < dimension.dim(1).value(); indices[1]++) + { + for (indices[2] = 0; indices[2] < dimension.dim(2).value(); indices[2]++) + { + for (indices[3] = 0; indices[3] < dimension.dim(3).value(); indices[3]++) + { + int channel_idx = indices[channel_dim_index]; + auto data = node->at<loco::DataType::S16>(cal_offset(dimension, indices)); + dequantized_values[cal_offset(dimension, indices)] = + static_cast<float>(data) * scaling_factor[channel_idx]; + } + } + } + } + + node->dtype(loco::DataType::FLOAT32); // change the type of tensor + node->size<loco::DataType::FLOAT32>(size); // resize tensor + for (uint32_t i = 0; i < size; ++i) + { + node->at<loco::DataType::FLOAT32>(i) = dequantized_values[i]; + } +} + +void asymmetric_wquant_per_channel(CircleConst *node, std::vector<float> &min, + std::vector<float> &max, std::vector<float> &scaling_factor, + std::vector<int64_t> &zp, std::vector<float> &nudged_min, + std::vector<float> &nudged_max) +{ + assert(node->dtype() == 
loco::DataType::FLOAT32); + + const int32_t kMinScale = 0; + const int32_t kMaxScale = 255; + + uint32_t size = node->size<loco::DataType::FLOAT32>(); + std::vector<int32_t> quantized_values(size); + + for (size_t i = 0; i < min.size(); ++i) + { + compute_asym_scale_zp(min[i], max[i], scaling_factor[i], zp[i], nudged_min[i], nudged_max[i]); + } + + loco::TensorShape dimension; + dimension.rank(4); + uint32_t indices[4] = { + 0, + }; + int channel_dim_index{0}; + + if (!get_channel_dim_index(node, dimension, channel_dim_index)) + { + assert(false); + return; + } + + for (indices[0] = 0; indices[0] < dimension.dim(0).value(); indices[0]++) + { + for (indices[1] = 0; indices[1] < dimension.dim(1).value(); indices[1]++) + { + for (indices[2] = 0; indices[2] < dimension.dim(2).value(); indices[2]++) + { + for (indices[3] = 0; indices[3] < dimension.dim(3).value(); indices[3]++) + { + int channel_idx = indices[channel_dim_index]; + const float scaling_factor_inv = 1.0 / scaling_factor[channel_idx]; + auto data = node->at<loco::DataType::FLOAT32>(cal_offset(dimension, indices)); + data = data < nudged_min[channel_idx] ? nudged_min[channel_idx] : data; + data = data > nudged_max[channel_idx] ? 
nudged_max[channel_idx] : data; + quantized_values[cal_offset(dimension, indices)] = static_cast<int32_t>( + std::round((data - nudged_min[channel_idx]) * scaling_factor_inv)); + } + } + } + } + + node->dtype(loco::DataType::U8); // change the type of tensor + node->size<loco::DataType::U8>(size); // resize tensor + for (uint32_t i = 0; i < size; ++i) + { + node->at<loco::DataType::U8>(i) = std::min(kMaxScale, std::max(kMinScale, quantized_values[i])); + } +} + +void asymmetric_wdequant_per_channel(CircleConst *node, std::vector<float> &scaling_factor, + std::vector<float> &nudged_min) +{ + assert(node->dtype() == loco::DataType::U8); + uint32_t size = node->size<loco::DataType::U8>(); + std::vector<float> dequantized_values(size); + + loco::TensorShape dimension; + dimension.rank(4); + uint32_t indices[4] = { + 0, + }; + int channel_dim_index{0}; + + if (!get_channel_dim_index(node, dimension, channel_dim_index)) + { + assert(false); + return; + } + + for (indices[0] = 0; indices[0] < dimension.dim(0).value(); indices[0]++) + { + for (indices[1] = 0; indices[1] < dimension.dim(1).value(); indices[1]++) + { + for (indices[2] = 0; indices[2] < dimension.dim(2).value(); indices[2]++) + { + for (indices[3] = 0; indices[3] < dimension.dim(3).value(); indices[3]++) + { + int channel_idx = indices[channel_dim_index]; + auto data = node->at<loco::DataType::U8>(cal_offset(dimension, indices)); + dequantized_values[cal_offset(dimension, indices)] = + static_cast<float>(data) * scaling_factor[channel_idx] + nudged_min[channel_idx]; + } + } + } + } + + node->dtype(loco::DataType::FLOAT32); // change the type of tensor + node->size<loco::DataType::FLOAT32>(size); // resize tensor + for (uint32_t i = 0; i < size; ++i) + { + node->at<loco::DataType::FLOAT32>(i) = dequantized_values[i]; + } +} + +void asymmetric_wquant_with_minmax_per_layer(CircleConst *node, float min, float max, + float &scaling_factor, int64_t &zp, float &nudged_min, + float &nudged_max) +{ + + const int32_t 
kMinScale = 0; + const int32_t kMaxScale = 255; + + uint32_t size = node->size<loco::DataType::FLOAT32>(); + compute_asym_scale_zp(min, max, scaling_factor, zp, nudged_min, nudged_max); + const float scaling_factor_inv = 1.0 / scaling_factor; + std::vector<int32_t> quantized_values(size); + for (uint32_t i = 0; i < size; ++i) + { + // clipping + auto data = node->at<loco::DataType::FLOAT32>(i); + data = data < nudged_min ? nudged_min : data; + data = data > nudged_max ? nudged_max : data; + quantized_values[i] = + static_cast<int32_t>(std::round((data - nudged_min) * scaling_factor_inv)); + } + + node->dtype(loco::DataType::U8); // change the type of tensor + node->size<loco::DataType::U8>(size); // resize tensor + for (uint32_t i = 0; i < size; ++i) + { + node->at<loco::DataType::U8>(i) = std::min(kMaxScale, std::max(kMinScale, quantized_values[i])); + } +} + +void asymmetric_wdequant_with_minmax_per_layer(CircleConst *node, float scaling_factor, + float nudged_min) +{ + uint32_t size = node->size<loco::DataType::U8>(); + std::vector<float> dequantized_values(size); + for (uint32_t i = 0; i < size; ++i) + { + auto data = node->at<loco::DataType::U8>(i); + dequantized_values[i] = static_cast<float>(data) * scaling_factor + nudged_min; + } + + node->dtype(loco::DataType::FLOAT32); // change the type of tensor + node->size<loco::DataType::FLOAT32>(size); // resize tensor + for (uint32_t i = 0; i < size; ++i) + { + node->at<loco::DataType::FLOAT32>(i) = dequantized_values[i]; + } +} + +bool is_quantized(const CircleNode *node) +{ + return node->dtype() == loco::DataType::U8 || // activation, weight + node->dtype() == loco::DataType::S16 || // activation, weight + node->dtype() == loco::DataType::S32; // bias +} + +// Check if node is weights of conv2d, transepose_conv2d, depthwise_conv2d, or fully_connected layer +bool is_weights(CircleNode *node) +{ + auto circle_const = dynamic_cast<CircleConst *>(node); + if (circle_const == nullptr) + return false; + + auto succs 
= loco::succs(node); + if (succs.size() != 1) // assume weights is used by only one node + return false; + + for (auto out : succs) + { + auto conv = dynamic_cast<CircleConv2D *>(out); + if (conv != nullptr && conv->filter() == circle_const && circle_const->rank() == 4) + return true; + + auto dw_conv = dynamic_cast<CircleDepthwiseConv2D *>(out); + if (dw_conv != nullptr && dw_conv->filter() == circle_const && circle_const->rank() == 4) + return true; + + auto tw_conv = dynamic_cast<CircleTransposeConv *>(out); + if (tw_conv != nullptr && tw_conv->filter() == circle_const && circle_const->rank() == 4) + return true; + + auto fc = dynamic_cast<CircleFullyConnected *>(out); + if (fc != nullptr && fc->weights() == circle_const && circle_const->rank() == 2) + return true; + } + return false; +} + +/** + * @brief QuantizeDequantizeWeights quantizes and dequantizes tensors for weights + * @details Find min/max values on the fly, quantize the model, and dequantize the model + */ +struct QuantizeDequantizeWeights final : public luci::CircleNodeMutableVisitor<bool> +{ + QuantizeDequantizeWeights(loco::DataType input, loco::DataType output, + QuantizationGranularity granularity) + : input_type(input), output_type(output), granularity(granularity) + { + } + + loco::DataType input_type; + loco::DataType output_type; + QuantizationGranularity granularity; + + // Quantize and dequantize input tensors of each node + bool visit(luci::CircleNode *node) + { + assert(output_type == loco::DataType::U8 || output_type == loco::DataType::S16); + LOGGER(l); + INFO(l) << "QuantizeDequantizeWeights visit node: " << node->name() << std::endl; + auto arity = node->arity(); + for (uint32_t i = 0; i < arity; i++) + { + auto input_node = node->arg(i); + auto circle_node = loco::must_cast<luci::CircleNode *>(input_node); + + // Check if this is already quantized + if (is_quantized(circle_node)) + continue; + + if (is_weights(circle_node)) + { + auto circle_const = 
loco::must_cast<luci::CircleConst *>(circle_node); + + // Find min/max per channel-wise + if (granularity == QuantizationGranularity::ChannelWise) + { + std::vector<float> min; + std::vector<float> max; + + cal_minmax_per_channel(circle_const, min, max); + + std::vector<float> nudged_min(min.size()); + std::vector<float> nudged_max(min.size()); + std::vector<float> scaling_factor(min.size()); + std::vector<int64_t> zp(min.size()); + + if (output_type == loco::DataType::U8) + { + asymmetric_wquant_per_channel(circle_const, min, max, scaling_factor, zp, nudged_min, + nudged_max); + asymmetric_wdequant_per_channel(circle_const, scaling_factor, nudged_min); + } + else + { + sym_wquant_per_channel(circle_const, min, max, scaling_factor, zp, nudged_min, + nudged_max); + sym_wdequant_per_channel(circle_const, scaling_factor); + } + + auto quantparam = std::make_unique<CircleQuantParam>(); + quantparam->min = nudged_min; + quantparam->max = nudged_max; + quantparam->scale = scaling_factor; + quantparam->zerop = zp; + circle_node->quantparam(std::move(quantparam)); + } + // Find min/max per layer-wise + else + { + float min = std::numeric_limits<float>::max(); + float max = std::numeric_limits<float>::lowest(); + for (uint32_t i = 0; i < circle_const->size<loco::DataType::FLOAT32>(); i++) + { + auto data = circle_const->at<loco::DataType::FLOAT32>(i); + min = data < min ? data : min; + max = data > max ? 
data : max; + } + float scaling_factor{0}; + int64_t zp{0}; + float nudged_min{0}; + float nudged_max{0}; + + asymmetric_wquant_with_minmax_per_layer(circle_const, min, max, scaling_factor, zp, + nudged_min, nudged_max); + asymmetric_wdequant_with_minmax_per_layer(circle_const, scaling_factor, nudged_min); + auto quantparam = std::make_unique<CircleQuantParam>(); + quantparam->min.push_back(nudged_min); + quantparam->max.push_back(nudged_max); + quantparam->scale.push_back(scaling_factor); + quantparam->zerop.push_back(zp); + circle_node->quantparam(std::move(quantparam)); + } + } + } + return false; + } +}; + +} // namespace + +bool QuantizeDequantizeWeightsPass::run(loco::Graph *g) +{ + LOGGER(l); + INFO(l) << "QuantizeDequantizeWeightsPass Start" << std::endl; + + // Quantize weights + for (auto node : loco::active_nodes(loco::output_nodes(g))) + { + QuantizeDequantizeWeights qw(_input_dtype, _output_dtype, _granularity); + auto circle_node = loco::must_cast<luci::CircleNode *>(node); + circle_node->accept(&qw); + } + + INFO(l) << "QuantizeDequantizeWeightsPass End" << std::endl; + return false; // one time run +} + +} // namespace luci diff --git a/compiler/luci/pass/src/QuantizeWithMinMaxPass.cpp b/compiler/luci/pass/src/QuantizeWithMinMaxPass.cpp new file mode 100644 index 000000000..f8abee751 --- /dev/null +++ b/compiler/luci/pass/src/QuantizeWithMinMaxPass.cpp @@ -0,0 +1,551 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2019 The TensorFlow Authors. All Rights Reserved. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "luci/Pass/QuantizeWithMinMaxPass.h" +#include "QuantizationUtils.h" + +#include <luci/IR/CircleNodes.h> +#include <luci/IR/CircleNodeVisitor.h> +#include <luci/Log.h> + +#include <oops/UserExn.h> + +#include <iostream> +#include <cmath> + +namespace luci +{ + +namespace +{ + +// Check if the node is the bias of Conv2D, DepthwiseConv2D, or FullyConnected layer +// If true, return <input, weight> pair of the successor node (used to quantize bias) +// If flase, return <nullptr, nullptr> +std::pair<loco::Node *, loco::Node *> get_input_weight_of_bias(CircleNode *node) +{ + auto circle_const = dynamic_cast<CircleConst *>(node); + if (circle_const == nullptr) + return std::make_pair(nullptr, nullptr); + + auto succs = loco::succs(node); + if (succs.size() != 1) // assume bias is used by only one node + return std::make_pair(nullptr, nullptr); + + for (auto out : succs) + { + auto conv = dynamic_cast<CircleConv2D *>(out); + if (conv != nullptr && conv->bias() == circle_const) + { + assert(conv->input() != nullptr); + assert(conv->filter() != nullptr); + return std::make_pair(conv->input(), conv->filter()); + } + auto dw_conv = dynamic_cast<CircleDepthwiseConv2D *>(out); + if (dw_conv != nullptr && dw_conv->bias() == circle_const) + { + assert(dw_conv->input() != nullptr); + assert(dw_conv->filter() != nullptr); + return std::make_pair(dw_conv->input(), dw_conv->filter()); + } + auto fc = dynamic_cast<CircleFullyConnected *>(out); + if (fc != nullptr && fc->bias() == circle_const) + { + assert(fc->input() != nullptr); + 
assert(fc->weights() != nullptr); + return std::make_pair(fc->input(), fc->weights()); + } + } + return std::make_pair(nullptr, nullptr); +} + +void asym_quant_bias_per_layer(CircleConst *node, float input_scale, float weight_scale, + float *scaling_factor, int64_t *zp) +{ + float scale = input_scale * weight_scale; + const float scaling_factor_inv = (scale == 0) ? 0 : 1.0 / scale; + + uint32_t size = node->size<loco::DataType::FLOAT32>(); + std::vector<int32_t> quantized_values(size); + for (uint32_t i = 0; i < size; ++i) + { + quantized_values[i] = + static_cast<int32_t>(std::round(node->at<loco::DataType::FLOAT32>(i) * scaling_factor_inv)); + } + + node->dtype(loco::DataType::S32); // change the type of tensor + node->size<loco::DataType::S32>(size); // resize tensor + const int32_t kMinScale = std::numeric_limits<int32_t>::lowest(); + const int32_t kMaxScale = std::numeric_limits<int32_t>::max(); + for (uint32_t i = 0; i < size; ++i) + { + node->at<loco::DataType::S32>(i) = + std::min(kMaxScale, std::max(kMinScale, quantized_values[i])); + } + *scaling_factor = scale; + *zp = 0; +} + +void quant_bias_per_channel(CircleConst *node, float input_scale, std::vector<float> &weight_scale, + std::vector<float> &scaling_factor, std::vector<int64_t> &zp) +{ + float scaling_factor_inv{0}; + + uint32_t size = node->size<loco::DataType::FLOAT32>(); + std::vector<int32_t> quantized_values(size); + + for (uint32_t i = 0; i < size; ++i) + { + scaling_factor[i] = input_scale * weight_scale[i]; + scaling_factor_inv = (scaling_factor[i] == 0) ? 
0 : 1.0 / scaling_factor[i]; + quantized_values[i] = + static_cast<int32_t>(std::round(node->at<loco::DataType::FLOAT32>(i) * scaling_factor_inv)); + zp[i] = 0; + } + + node->dtype(loco::DataType::S32); // change the type of tensor + node->size<loco::DataType::S32>(size); // resize tensor + const int32_t kMinScale = std::numeric_limits<int32_t>::lowest(); + const int32_t kMaxScale = std::numeric_limits<int32_t>::max(); + for (uint32_t i = 0; i < size; ++i) + { + node->at<loco::DataType::S32>(i) = + std::min(kMaxScale, std::max(kMinScale, quantized_values[i])); + } +} + +bool has_min_max(const CircleNode *node) +{ + return node->quantparam() && !node->quantparam()->min.empty() && !node->quantparam()->max.empty(); +} + +bool is_quantized(const CircleNode *node) +{ + return node->dtype() == loco::DataType::U8 || // activation, weight + node->dtype() == loco::DataType::S32; // bias +} + +void sym_wquant_per_channel(CircleConst *node, std::vector<float> &scaling_factor) +{ + assert(node->dtype() == loco::DataType::FLOAT32); + + const int32_t kMaxScale = std::numeric_limits<int16_t>::max(); + const int32_t kMinScale = -kMaxScale; + + uint32_t size = node->size<loco::DataType::FLOAT32>(); + std::vector<int32_t> quantized_values(size); + + loco::TensorShape dimension; + dimension.rank(4); + uint32_t indices[4] = { + 0, + }; + int channel_dim_index{0}; + + if (!get_channel_dim_index(node, dimension, channel_dim_index)) + { + assert(false); + return; + } + + for (indices[0] = 0; indices[0] < dimension.dim(0).value(); indices[0]++) + { + for (indices[1] = 0; indices[1] < dimension.dim(1).value(); indices[1]++) + { + for (indices[2] = 0; indices[2] < dimension.dim(2).value(); indices[2]++) + { + for (indices[3] = 0; indices[3] < dimension.dim(3).value(); indices[3]++) + { + int channel_idx = indices[channel_dim_index]; + const float scaling_factor_inv = 1.0 / scaling_factor[channel_idx]; + auto data = node->at<loco::DataType::FLOAT32>(cal_offset(dimension, indices)); + 
quantized_values[cal_offset(dimension, indices)] = + static_cast<int32_t>(std::round(data * scaling_factor_inv)); + } + } + } + } + + node->dtype(loco::DataType::S16); // change the type of tensor + node->size<loco::DataType::S16>(size); // resize tensor + for (uint32_t i = 0; i < size; ++i) + { + node->at<loco::DataType::S16>(i) = + std::min(kMaxScale, std::max(kMinScale, quantized_values[i])); + } +} + +void asym_wquant_per_channel(CircleConst *node, std::vector<float> &min, + std::vector<float> &scaling_factor) +{ + assert(node->dtype() == loco::DataType::FLOAT32); + + const int32_t kMinScale = 0; + const int32_t kMaxScale = 255; + + uint32_t size = node->size<loco::DataType::FLOAT32>(); + std::vector<int32_t> quantized_values(size); + + loco::TensorShape dimension; + dimension.rank(4); + uint32_t indices[4] = { + 0, + }; + int channel_dim_index{0}; + + if (!get_channel_dim_index(node, dimension, channel_dim_index)) + { + assert(false); + return; + } + + for (indices[0] = 0; indices[0] < dimension.dim(0).value(); indices[0]++) + { + for (indices[1] = 0; indices[1] < dimension.dim(1).value(); indices[1]++) + { + for (indices[2] = 0; indices[2] < dimension.dim(2).value(); indices[2]++) + { + for (indices[3] = 0; indices[3] < dimension.dim(3).value(); indices[3]++) + { + int channel_idx = indices[channel_dim_index]; + const float scaling_factor_inv = 1.0 / scaling_factor[channel_idx]; + auto data = node->at<loco::DataType::FLOAT32>(cal_offset(dimension, indices)); + quantized_values[cal_offset(dimension, indices)] = + static_cast<int32_t>(std::round((data - min[channel_idx]) * scaling_factor_inv)); + } + } + } + } + + node->dtype(loco::DataType::U8); // change the type of tensor + node->size<loco::DataType::U8>(size); // resize tensor + for (uint32_t i = 0; i < size; ++i) + { + node->at<loco::DataType::U8>(i) = std::min(kMaxScale, std::max(kMinScale, quantized_values[i])); + } +} + +void asym_wquant_per_layer(CircleConst *node, float min, float scaling_factor) +{ + 
const int32_t kMinScale = 0; + const int32_t kMaxScale = 255; + + uint32_t size = node->size<loco::DataType::FLOAT32>(); + + const float scaling_factor_inv = 1.0 / scaling_factor; + std::vector<int32_t> quantized_values(size); + for (uint32_t i = 0; i < size; ++i) + { + auto data = node->at<loco::DataType::FLOAT32>(i); + quantized_values[i] = static_cast<int32_t>(std::round((data - min) * scaling_factor_inv)); + } + + node->dtype(loco::DataType::U8); // change the type of tensor + node->size<loco::DataType::U8>(size); // resize tensor + for (uint32_t i = 0; i < size; ++i) + { + node->at<loco::DataType::U8>(i) = std::min(kMaxScale, std::max(kMinScale, quantized_values[i])); + } +} + +// Check if node is weights of conv2d, depthwise_conv2d, or fully_connected layer +bool is_weights(CircleNode *node) +{ + auto circle_const = dynamic_cast<CircleConst *>(node); + if (circle_const == nullptr) + return false; + + auto succs = loco::succs(node); + if (succs.size() != 1) // assume weights is used by only one node + return false; + + for (auto out : succs) + { + auto conv = dynamic_cast<CircleConv2D *>(out); + if (conv != nullptr && conv->filter() == circle_const) + return true; + + auto dw_conv = dynamic_cast<CircleDepthwiseConv2D *>(out); + if (dw_conv != nullptr && dw_conv->filter() == circle_const) + return true; + + auto fc = dynamic_cast<CircleFullyConnected *>(out); + if (fc != nullptr && fc->weights() == circle_const) + return true; + } + return false; +} + +/** + * @brief QuantizeActivation quantizes tensors for activations + * @details Quantize using recorded min/max values + */ +struct QuantizeActivation final : public luci::CircleNodeMutableVisitor<bool> +{ + QuantizeActivation(loco::DataType input, loco::DataType output) + : input_type(input), output_type(output) + { + } + + loco::DataType input_type; + loco::DataType output_type; + + // Quantize input tensors of each node + bool visit(luci::CircleNode *node) + { + LOGGER(l); + INFO(l) << "QuantizeActivation 
visit node: " << node->name() << std::endl; + auto arity = node->arity(); + for (uint32_t i = 0; i < arity; i++) + { + auto input_node = node->arg(i); + auto circle_node = loco::must_cast<luci::CircleNode *>(input_node); + + // Check if this is already quantized + if (is_quantized(circle_node)) + continue; + + // Check if this is bias (bias is quantized later) + auto iw = get_input_weight_of_bias(circle_node); + if (iw.first != nullptr && iw.second != nullptr) + continue; + + // Check if this is activation + // We assume min/max are recorded only for activations + if (has_min_max(circle_node) && !is_weights(circle_node)) + { + // Quantize using recorded min/max + auto quantparam = circle_node->quantparam(); + assert(quantparam->min.size() == 1); // only support layer-wise quant + assert(quantparam->max.size() == 1); // only support layer-wise quant + auto min = quantparam->min[0]; + auto max = quantparam->max[0]; + + float scaling_factor{0}; + int64_t zp{0}; + float nudged_min{0}; + float nudged_max{0}; + + if (output_type == loco::DataType::U8) + { + compute_asym_scale_zp(min, max, scaling_factor, zp, nudged_min, nudged_max); + circle_node->dtype(loco::DataType::U8); + } + else + { + compute_sym_scale_zp(min, max, scaling_factor, zp, nudged_min, nudged_max); + circle_node->dtype(loco::DataType::S16); + } + + circle_node->quantparam()->max[0] = nudged_max; + circle_node->quantparam()->min[0] = nudged_min; + circle_node->quantparam()->scale.push_back(scaling_factor); + circle_node->quantparam()->zerop.push_back(zp); + } + } + return false; + } +}; + +struct QuantizeBias final : public luci::CircleNodeMutableVisitor<bool> +{ + QuantizeBias(loco::DataType input, loco::DataType output, QuantizationGranularity gr) + : input_type(input), output_type(output), granularity(gr) + { + } + + loco::DataType input_type; + loco::DataType output_type; + QuantizationGranularity granularity; + + // Quantize bias node + bool visit(luci::CircleNode *node) + { + // Check if this is 
already quantized + if (is_quantized(node)) + return false; + + // Check if this is bias + auto iw = get_input_weight_of_bias(node); + if (iw.first == nullptr || iw.second == nullptr) + return false; + + auto input = loco::must_cast<luci::CircleNode *>(iw.first); + auto weight = loco::must_cast<luci::CircleNode *>(iw.second); + + if (granularity == QuantizationGranularity::ChannelWise) + { + assert(input->quantparam()->scale.size() == 1); // input scale's layer-wise + auto input_scale = input->quantparam()->scale[0]; + + assert(weight->quantparam() != nullptr); // weight scale's channel-wise + auto weight_scale = weight->quantparam()->scale; + + auto circle_const = loco::must_cast<luci::CircleConst *>(node); + + uint32_t size = circle_const->size<loco::DataType::FLOAT32>(); + assert(size == weight_scale.size()); + std::vector<float> scaling_factor(size); + std::vector<int64_t> zp(size); + + quant_bias_per_channel(circle_const, input_scale, weight_scale, scaling_factor, zp); + + auto quantparam = std::make_unique<CircleQuantParam>(); + quantparam->scale = scaling_factor; + quantparam->zerop = zp; + assert(circle_const->quantparam() == nullptr); // bias should not be quantized before + circle_const->quantparam(std::move(quantparam)); + } + else + { + assert(input->quantparam()->scale.size() == 1); // Only support per-layer quant + auto input_scale = input->quantparam()->scale[0]; + + assert(weight->quantparam()->scale.size() == 1); // Only support per-layer quant + auto weight_scale = weight->quantparam()->scale[0]; + + auto circle_const = loco::must_cast<luci::CircleConst *>(node); + float scaling_factor{0}; + int64_t zp{0}; + asym_quant_bias_per_layer(circle_const, input_scale, weight_scale, &scaling_factor, &zp); + auto quantparam = std::make_unique<CircleQuantParam>(); + quantparam->scale.push_back(scaling_factor); + quantparam->zerop.push_back(zp); + assert(circle_const->quantparam() == nullptr); // bias should not be quantized before + 
circle_const->quantparam(std::move(quantparam)); + } + return false; + } +}; + +/** + * @brief QuantizeWeights quantizes tensors for weights + * @details Find min/max values on the fly and then quantize + */ +struct QuantizeWeights final : public luci::CircleNodeMutableVisitor<bool> +{ + QuantizeWeights(loco::DataType input, loco::DataType output, QuantizationGranularity gr) + : input_type(input), output_type(output), granularity(gr) + { + } + + loco::DataType input_type; + loco::DataType output_type; + QuantizationGranularity granularity; + + // Quantize input tensors of each node + bool visit(luci::CircleNode *node) + { + LOGGER(l); + INFO(l) << "QuantizeWeights visit node: " << node->name() << std::endl; + auto arity = node->arity(); + for (uint32_t i = 0; i < arity; i++) + { + auto input_node = node->arg(i); + auto circle_node = loco::must_cast<luci::CircleNode *>(input_node); + + // Check if this is already quantized + if (is_quantized(circle_node)) + continue; + + if (is_weights(circle_node)) + { + auto circle_const = loco::must_cast<luci::CircleConst *>(circle_node); + + // Find min/max per channel-wise + if (granularity == QuantizationGranularity::ChannelWise) + { + auto quantparam = circle_node->quantparam(); + assert(quantparam != nullptr); + auto min = quantparam->min; + auto scaling_factor = quantparam->scale; + + if (output_type == loco::DataType::U8) + { + asym_wquant_per_channel(circle_const, min, scaling_factor); + } + else + { + sym_wquant_per_channel(circle_const, scaling_factor); + } + } + // Find min/max per layer-wise + else + { + // Quantize using recorded quantparam + auto quantparam = circle_node->quantparam(); + assert(quantparam != nullptr); + assert(quantparam->min.size() == 1); // only support layer-wise quant + assert(quantparam->scale.size() == 1); // only support layer-wise quant + auto min = quantparam->min[0]; + auto scaling_factor = quantparam->scale[0]; + asym_wquant_per_layer(circle_const, min, scaling_factor); + } + } + } + 
return false; + } +}; + +} // namespace + +bool QuantizeWithMinMaxPass::run(loco::Graph *g) +{ + LOGGER(l); + INFO(l) << "QuantizeWithMinMaxPass Start" << std::endl; + + // Quantize activation + for (auto node : loco::active_nodes(loco::output_nodes(g))) + { + QuantizeActivation qa(_input_dtype, _output_dtype); + auto circle_node = loco::must_cast<luci::CircleNode *>(node); + circle_node->accept(&qa); + } + + // Quantize weights + for (auto node : loco::active_nodes(loco::output_nodes(g))) + { + QuantizeWeights qw(_input_dtype, _output_dtype, _granularity); + auto circle_node = loco::must_cast<luci::CircleNode *>(node); + circle_node->accept(&qw); + } + + // Quantize bias + for (auto node : loco::active_nodes(loco::output_nodes(g))) + { + QuantizeBias qb(_input_dtype, _output_dtype, _granularity); + auto circle_node = loco::must_cast<luci::CircleNode *>(node); + circle_node->accept(&qb); + } + + // Update output dtype + auto graph_outputs = g->outputs(); + for (auto node : loco::output_nodes(g)) + { + auto circle_node = loco::must_cast<luci::CircleOutput *>(node); + if (static_cast<luci::CircleNode *>(circle_node->from())->dtype() == _output_dtype) + { + circle_node->dtype(_output_dtype); + auto graph_output = graph_outputs->at(circle_node->index()); + graph_output->dtype(_output_dtype); + } + } + + INFO(l) << "QuantizeWithMinMaxPass End" << std::endl; + return false; // one time run +} + +} // namespace luci diff --git a/compiler/luci/pass/src/ResolveCustomOpAddPass.cpp b/compiler/luci/pass/src/ResolveCustomOpAddPass.cpp new file mode 100644 index 000000000..e52d667d7 --- /dev/null +++ b/compiler/luci/pass/src/ResolveCustomOpAddPass.cpp @@ -0,0 +1,124 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "luci/Pass/ResolveCustomOpAddPass.h" + +#include "flatbuffers/flexbuffers.h" + +#include <luci/IR/CircleNodes.h> +#include <luci/IR/AttrFusedActFunc.h> + +namespace +{ + +/// @brief Returns the index of BroadcastTo node among cop's inputs. +// NOTE This function assumes there is only one BroadcastTo node among its inputs. +int32_t get_broadcastTo_index_among_inputs_of(luci::CircleCustom *cop) +{ + for (uint32_t idx = 0; idx < cop->numInputs(); idx++) + { + auto input = dynamic_cast<const luci::CircleCustomOut *>(cop->inputs(idx)); + if (input) + { + auto broadcastTo = loco::must_cast<luci::CircleCustom *>(input->input()); + if (broadcastTo->custom_code() == "BroadcastTo") + return idx; + } + } + + return -1; +} + +/** BEFORE + * [CircleConst] + * | + * [CircleNode] [BroadcastTo(CircleCustom)] + * \ | + * \ [CircleCustomOut] + * \ / + * [AddV2(CircleCustom)] + * AFTER + * + * [CircleConst] [CircleNode] + * \ / + * \ / + * [CircleAdd] + */ +bool resolve_with_BroadcastTo(luci::CircleCustom *addv2) +{ + int32_t broadcastTo_idx = get_broadcastTo_index_among_inputs_of(addv2); + + if (broadcastTo_idx == -1) + return false; + + auto input = loco::must_cast<const luci::CircleCustomOut *>(addv2->inputs(broadcastTo_idx)); + auto broadcastTo = loco::must_cast<luci::CircleCustom *>(input->input()); + + auto add = addv2->graph()->nodes()->create<luci::CircleAdd>(); + add->fusedActivationFunction(luci::FusedActFunc::NONE); + add->x(addv2->inputs(1 - broadcastTo_idx)); + add->y(broadcastTo->inputs(0)); + auto customOut = loco::succs(addv2); + 
assert(customOut.size() == 1); + replace(*customOut.begin()).with(add); + + return true; +} + +bool resolve_custom_op(luci::CircleCustom *addv2) +{ + const std::string custom_code = addv2->custom_code(); + const std::vector<uint8_t> custom_options = addv2->custom_options(); + + if (custom_code != "AddV2") + return false; + + if (resolve_with_BroadcastTo(addv2)) + return true; + + auto add = addv2->graph()->nodes()->create<luci::CircleAdd>(); + add->fusedActivationFunction(luci::FusedActFunc::NONE); + add->x(addv2->inputs(0)); + add->y(addv2->inputs(1)); + auto customOut = loco::succs(addv2); + assert(customOut.size() == 1); + replace(*customOut.begin()).with(add); + + return true; +} + +} // namespace + +namespace luci +{ + +bool ResolveCustomOpAddPass::run(loco::Graph *g) +{ + bool changed = false; + + for (auto node : loco::active_nodes(loco::output_nodes(g))) + { + auto cop = dynamic_cast<luci::CircleCustom *>(node); + if (not cop) + continue; + + changed |= resolve_custom_op(cop); + } + + return changed; +} + +} // namespace luci diff --git a/compiler/luci/pass/src/ResolveCustomOpBatchMatMulPass.cpp b/compiler/luci/pass/src/ResolveCustomOpBatchMatMulPass.cpp new file mode 100644 index 000000000..145e9cb62 --- /dev/null +++ b/compiler/luci/pass/src/ResolveCustomOpBatchMatMulPass.cpp @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "luci/Pass/ResolveCustomOpBatchMatMulPass.h" + +#include "flatbuffers/flexbuffers.h" + +#include <luci/IR/CircleNodes.h> + +namespace +{ + +bool resolve_custom_op(luci::CircleCustom *cop) +{ + const std::string custom_code = cop->custom_code(); + const std::vector<uint8_t> custom_options = cop->custom_options(); + + if (custom_code == "BatchMatMulV2") + { + auto batch_matmul = cop->graph()->nodes()->create<luci::CircleBatchMatMul>(); + // input + batch_matmul->x(cop->inputs(0)); + batch_matmul->y(cop->inputs(1)); + // TODO find much better way of parsing custom_options + // adj + auto map = flexbuffers::GetRoot(custom_options).AsMap(); + batch_matmul->adj_x(map["adj_x"].AsBool()); + batch_matmul->adj_y(map["adj_y"].AsBool()); + + replace(cop).with(batch_matmul); + return true; + } + return false; +} + +} // namespace + +namespace luci +{ + +bool ResolveCustomOpBatchMatMulPass::run(loco::Graph *g) +{ + bool changed = false; + for (auto node : loco::active_nodes(loco::output_nodes(g))) + { + auto cop = dynamic_cast<luci::CircleCustom *>(node); + if (not cop) + continue; + + changed |= resolve_custom_op(cop); + } + + return changed; +} + +} // namespace luci diff --git a/compiler/luci/pass/src/ResolveCustomOpMatMulPass.cpp b/compiler/luci/pass/src/ResolveCustomOpMatMulPass.cpp new file mode 100644 index 000000000..547fd22fc --- /dev/null +++ b/compiler/luci/pass/src/ResolveCustomOpMatMulPass.cpp @@ -0,0 +1,185 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "luci/Pass/ResolveCustomOpMatMulPass.h" + +#include "flatbuffers/flexbuffers.h" +#include <loco/IR/DataTypeTraits.h> + +#include <luci/IR/CircleNodes.h> + +#include <loco.h> +#include <oops/InternalExn.h> +#include <loco/Service/ShapeInference.h> +#include <loco/Service/TypeInference.h> + +namespace +{ + +template <typename T> +luci::CircleConst *create_const_node(loco::Graph *g, const loco::DataType dtype, + const std::vector<uint32_t> &shape, + const std::vector<T> &values) +{ + auto node = g->nodes()->create<luci::CircleConst>(); + node->dtype(dtype); + node->rank(shape.size()); + + uint32_t size = 1; + for (uint32_t i = 0; i < shape.size(); ++i) + { + node->dim(i) = shape.at(i); + size *= shape.at(i); + } + +#define INIT_VALUES(DT) \ + { \ + node->size<DT>(size); \ + for (uint32_t i = 0; i < values.size(); ++i) \ + node->at<DT>(i) = values[i]; \ + } + + switch (dtype) + { + case loco::DataType::U8: + INIT_VALUES(loco::DataType::U8); + break; + case loco::DataType::S16: + INIT_VALUES(loco::DataType::S16); + break; + case loco::DataType::S32: + INIT_VALUES(loco::DataType::S32); + break; + case loco::DataType::FLOAT32: + INIT_VALUES(loco::DataType::FLOAT32) + break; + default: + INTERNAL_EXN("create_const_node called with unsupported type"); + break; + } + return node; +} + +bool resolve_matmul(luci::CircleCustom *cop) +{ +#define CHECK_OR_FALSE(condition) \ + if (not(condition)) \ + return false; +#define CHECK_OR_THROW(condition, message) \ + if (not(condition)) \ + INTERNAL_EXN(message); + + auto graph = cop->graph(); + const std::vector<uint8_t> custom_options = cop->custom_options(); + auto map = flexbuffers::GetRoot(custom_options).AsMap(); + const auto U8 = loco::DataType::U8; + const auto S16 = loco::DataType::S16; + const auto S32 = loco::DataType::S32; + const auto FLOAT32 = loco::DataType::FLOAT32; + + bool transpose_a = 
map["transpose_a"].AsBool(); + bool transpose_b = map["transpose_b"].AsBool(); + + loco::Node *lhs = cop->inputs(0); + loco::Node *rhs = cop->inputs(1); + + // Check that the type of the first input is known + CHECK_OR_FALSE(loco::dtype_known(lhs)); + auto lhs_dtype = loco::dtype_get(cop->inputs(0)); + + // If transpose of first input is requested, its shape must be known + CHECK_OR_FALSE(!transpose_a || loco::shape_known(lhs)); + // and its rank should be at least 2 + CHECK_OR_FALSE(!transpose_a || loco::shape_get(lhs).as<loco::TensorShape>().rank() >= 2); + // Check that the shape of the 2nd input is known + CHECK_OR_FALSE(loco::shape_known(rhs)); + // TODO as of 06/23/20 TFLite only supports rank 2 for 2nd input. Fix this once that changes! + CHECK_OR_FALSE(loco::shape_get(rhs).as<loco::TensorShape>().rank() == 2); + // Check that input data type is supported + CHECK_OR_THROW(lhs_dtype == U8 || lhs_dtype == S16 || lhs_dtype == FLOAT32, + "Only UInt8, Int16 and Float32 data types are supported by MatMul"); + + if (transpose_a) + { + auto a_shape = loco::shape_get(lhs).as<loco::TensorShape>(); + // Create a permutation constant node + std::vector<uint32_t> perm; + for (uint32_t i = 0; i < a_shape.rank(); ++i) + perm.push_back(i); + std::swap(perm[a_shape.rank() - 1], perm[a_shape.rank() - 2]); + auto perm_node = create_const_node(graph, S32, {a_shape.rank()}, perm); + // Now make a transpose node + auto transpose_node = graph->nodes()->create<luci::CircleTranspose>(); + transpose_node->a(lhs); + transpose_node->perm(perm_node); + lhs = transpose_node; + } + + // Transpose the second input if needed. TFLite FullyConnected operator + // assumes the second input is in column-major order, but the input is + // in row-major order, thus we need to convert between them. 
+ if (!transpose_b) + { + const std::vector<uint32_t> perm{1, 0}; + auto perm_node = create_const_node(graph, S32, {2}, perm); + auto transpose_node = graph->nodes()->create<luci::CircleTranspose>(); + transpose_node->a(rhs); + transpose_node->perm(perm_node); + rhs = transpose_node; + } + + // Make a constant zero-filled bias node + auto b_shape = loco::shape_get(cop->inputs(1)).as<loco::TensorShape>(); + uint32_t bias_size = b_shape.dim(transpose_b ? 1 : 0).value(); + const std::vector<float> val(bias_size, .0f); + auto bias_node = create_const_node(graph, lhs_dtype, {bias_size}, val); + auto fc_node = graph->nodes()->create<luci::CircleFullyConnected>(); + fc_node->input(lhs); + fc_node->weights(rhs); + fc_node->bias(bias_node); + fc_node->fusedActivationFunction(luci::FusedActFunc::NONE); + + replace(cop).with(fc_node); + return true; +} + +} // namespace + +namespace luci +{ + +bool ResolveCustomOpMatMulPass::run(loco::Graph *g) +{ + bool changed = false; + for (auto node : loco::active_nodes(loco::output_nodes(g))) + { + auto cop = dynamic_cast<luci::CircleCustom *>(node); + if (not cop) + continue; + + if (cop->custom_code() != "MatMul") + continue; + + if (!resolve_matmul(cop)) + continue; + + changed = true; + } + + return changed; +} + +} // namespace luci |