diff options
Diffstat (limited to 'compiler/luci/pass/src')
-rw-r--r-- | compiler/luci/pass/src/CircleOptimizer.cpp | 121
-rw-r--r-- | compiler/luci/pass/src/CircleOptimizerUtils.cpp | 89
-rw-r--r-- | compiler/luci/pass/src/CircleOptimizerUtils.h | 42
-rw-r--r-- | compiler/luci/pass/src/FuseBCQPass.cpp | 405
-rw-r--r-- | compiler/luci/pass/src/FuseInstanceNormPass.cpp | 231
-rw-r--r-- | compiler/luci/pass/src/FuseInstanceNormPass.test.cpp | 64
-rw-r--r-- | compiler/luci/pass/src/FuseInstanceNormPassInternal.h | 28
-rw-r--r-- | compiler/luci/pass/src/QuantizationUtils.cpp | 172
-rw-r--r-- | compiler/luci/pass/src/QuantizationUtils.h | 38
-rw-r--r-- | compiler/luci/pass/src/QuantizeDequantizeWeightsPass.cpp | 495
-rw-r--r-- | compiler/luci/pass/src/QuantizeWithMinMaxPass.cpp | 551
-rw-r--r-- | compiler/luci/pass/src/ResolveCustomOpAddPass.cpp | 124
-rw-r--r-- | compiler/luci/pass/src/ResolveCustomOpBatchMatMulPass.cpp | 69
-rw-r--r-- | compiler/luci/pass/src/ResolveCustomOpMatMulPass.cpp | 185 |
14 files changed, 2588 insertions, 26 deletions
diff --git a/compiler/luci/pass/src/CircleOptimizer.cpp b/compiler/luci/pass/src/CircleOptimizer.cpp index dcb05a0b5..90fbe9009 100644 --- a/compiler/luci/pass/src/CircleOptimizer.cpp +++ b/compiler/luci/pass/src/CircleOptimizer.cpp @@ -16,16 +16,23 @@ #include "luci/CircleOptimizer.h" +#include "luci/Pass/FuseBCQPass.h" #include "luci/Pass/FuseInstanceNormPass.h" +#include "luci/Pass/ResolveCustomOpAddPass.h" +#include "luci/Pass/ResolveCustomOpBatchMatMulPass.h" +#include "luci/Pass/ResolveCustomOpMatMulPass.h" +#include "luci/Pass/QuantizeWithMinMaxPass.h" +#include "luci/Pass/QuantizeDequantizeWeightsPass.h" // TODO add more passes #include "luci/Pass/ShapeInferencePass.h" #include "luci/Pass/TypeInferencePass.h" // logo passes -#include <logo/RemoveDeadNodePass.h> +#include <logo/RemoveDeadNodeWithQueryPass.h> #include "ProgressReporter.h" +#include "CircleOptimizerUtils.h" #include <logo/Phase.h> @@ -36,18 +43,39 @@ namespace using namespace luci; -class OptimizeOptionsImpl : public luci::CircleOptimizer::Options +class OptimizeOptionsImpl final : public luci::CircleOptimizer::Options { public: void enable(Algorithm) final; + void param(AlgorithmParameters, const std::string &) final; + const std::string param(AlgorithmParameters) const final; bool query(Algorithm) final; private: std::vector<Algorithm> _algorithms; + std::map<AlgorithmParameters, const std::string> _algorithm_params; }; void OptimizeOptionsImpl::enable(Algorithm algo) { _algorithms.push_back(algo); } +void OptimizeOptionsImpl::param(AlgorithmParameters param, const std::string &str) +{ + _algorithm_params.insert(std::pair<AlgorithmParameters, const std::string>(param, str)); +} + +const std::string OptimizeOptionsImpl::param(AlgorithmParameters param) const +{ + auto param_str = _algorithm_params.find(param); + if (param_str != _algorithm_params.end()) + { + return param_str->second; + } + else + { + return std::string(); + } +} + bool OptimizeOptionsImpl::query(Algorithm algo) { 
std::vector<Algorithm>::iterator it = std::find(_algorithms.begin(), _algorithms.end(), algo); @@ -77,14 +105,31 @@ void CircleOptimizer::optimize(loco::Graph *g) const logo::Phase phase; /* TRANSFORM DECLARATION BEGIN */ + if (_options->query(Options::Algorithm::ResolveCustomOpAdd)) + { + phase.emplace_back(std::make_unique<luci::ResolveCustomOpAddPass>()); + } + if (_options->query(Options::Algorithm::ResolveCustomOpBatchMatMul)) + { + phase.emplace_back(std::make_unique<luci::ResolveCustomOpBatchMatMulPass>()); + } + if (_options->query(Options::Algorithm::ResolveCustomOpMatMul)) + { + phase.emplace_back(std::make_unique<luci::ResolveCustomOpMatMulPass>()); + } if (_options->query(Options::Algorithm::FuseInstanceNorm)) { phase.emplace_back(std::make_unique<FuseInstanceNormPass>()); } + if (_options->query(Options::Algorithm::FuseBCQ)) + { + phase.emplace_back(std::make_unique<FuseBCQPass>()); + } + // Shape inference is needed for added nodes doing above transformations phase.emplace_back(std::make_unique<luci::ShapeInferencePass>()); phase.emplace_back(std::make_unique<luci::TypeInferencePass>()); - phase.emplace_back(std::make_unique<logo::RemoveDeadNodePass>()); + phase.emplace_back(std::make_unique<logo::RemoveDeadNodeWithQueryPass>()); /* TRANSFORM DECLARATION END */ ProgressReporter prog(g, logo::PhaseStrategy::Saturate); @@ -93,4 +138,74 @@ void CircleOptimizer::optimize(loco::Graph *g) const phase_runner.run(phase); } +void CircleOptimizer::quantize(loco::Graph *g) const +{ + // Fake quantization of weights + if (_options->query(Options::Algorithm::QuantizeDequantizeWeights)) + { + static const std::vector<std::string> fakeq_supported_input_dtype{"float32"}; + static const std::vector<std::string> fakeq_supported_output_dtype{"uint8"}; + static const std::vector<std::string> fakeq_supported_granularity{"layer"}; + + auto input_dtype = _options->param(Options::AlgorithmParameters::Quantize_input_dtype); + auto output_dtype = 
_options->param(Options::AlgorithmParameters::Quantize_output_dtype); + auto granularity = _options->param(Options::AlgorithmParameters::Quantize_granularity); + + if (!in_array(to_lower_case(input_dtype), fakeq_supported_input_dtype)) + throw std::runtime_error("Unsupported input type. List of supported input type: " + + to_string(fakeq_supported_input_dtype)); + + if (!in_array(to_lower_case(output_dtype), fakeq_supported_output_dtype)) + throw std::runtime_error("Unsupported output type. List of supported output type: " + + to_string(fakeq_supported_output_dtype)); + + if (!in_array(to_lower_case(granularity), fakeq_supported_granularity)) + throw std::runtime_error("Unsupported granularity. List of supported granularity: " + + to_string(fakeq_supported_granularity)); + + luci::QuantizeDequantizeWeightsPass fake_quantizer( + str_to_dtype(input_dtype), str_to_dtype(output_dtype), str_to_granularity(granularity)); + fake_quantizer.run(g); + } + + // Actual quantization of weights, bias, and activation + if (_options->query(Options::Algorithm::QuantizeWithMinMax)) + { + static const std::vector<std::string> qwmm_supported_input_dtype{"float32"}; + static const std::vector<std::string> qwmm_supported_output_dtype{"uint8"}; + static const std::vector<std::string> qwmm_supported_granularity{"layer"}; + + auto input_dtype = _options->param(Options::AlgorithmParameters::Quantize_input_dtype); + auto output_dtype = _options->param(Options::AlgorithmParameters::Quantize_output_dtype); + auto granularity = _options->param(Options::AlgorithmParameters::Quantize_granularity); + + if (!in_array(to_lower_case(input_dtype), qwmm_supported_input_dtype)) + throw std::runtime_error("Unsupported input type. List of supported input types: " + + to_string(qwmm_supported_input_dtype)); + + if (!in_array(to_lower_case(output_dtype), qwmm_supported_output_dtype)) + throw std::runtime_error("Unsupported output type. 
List of supported output types: " + + to_string(qwmm_supported_output_dtype)); + + if (!in_array(to_lower_case(granularity), qwmm_supported_granularity)) + throw std::runtime_error("Unsupported granularity. List of supported granularity: " + + to_string(qwmm_supported_granularity)); + + luci::QuantizeWithMinMaxPass quantizer(str_to_dtype(input_dtype), str_to_dtype(output_dtype), + str_to_granularity(granularity)); + quantizer.run(g); + } + + logo::Phase phase; + + // Do Shape/Type inference + phase.emplace_back(std::make_unique<luci::ShapeInferencePass>()); + phase.emplace_back(std::make_unique<luci::TypeInferencePass>()); + + ProgressReporter prog(g, logo::PhaseStrategy::Saturate); + logo::PhaseRunner<logo::PhaseStrategy::Saturate> phase_runner{g}; + phase_runner.attach(&prog); + phase_runner.run(phase); +} + } // namespace luci diff --git a/compiler/luci/pass/src/CircleOptimizerUtils.cpp b/compiler/luci/pass/src/CircleOptimizerUtils.cpp new file mode 100644 index 000000000..ffc372392 --- /dev/null +++ b/compiler/luci/pass/src/CircleOptimizerUtils.cpp @@ -0,0 +1,89 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "CircleOptimizerUtils.h" + +namespace luci +{ + +bool in_array(const std::string &str, const std::vector<std::string> &array) +{ + return std::find(array.begin(), array.end(), str) != array.end(); +} + +std::string to_string(const std::vector<std::string> &strings) +{ + assert(!strings.empty()); + + std::string res; + for (unsigned int i = 0; i < strings.size() - 1; i++) + res += strings[i] + ", "; + + res += strings[strings.size() - 1]; + return res; +} + +std::string to_lower_case(std::string s) +{ + std::transform(s.begin(), s.end(), s.begin(), [](unsigned char c) { return std::tolower(c); }); + return s; +} + +loco::DataType str_to_dtype(const std::string &str) +{ + if (to_lower_case(str).compare("uint8") == 0) + return loco::DataType::U8; + if (to_lower_case(str).compare("uint16") == 0) + return loco::DataType::U16; + if (to_lower_case(str).compare("uint32") == 0) + return loco::DataType::U32; + if (to_lower_case(str).compare("uint64") == 0) + return loco::DataType::U64; + + if (to_lower_case(str).compare("int8") == 0) + return loco::DataType::S8; + if (to_lower_case(str).compare("int16") == 0) + return loco::DataType::S16; + if (to_lower_case(str).compare("int32") == 0) + return loco::DataType::S32; + if (to_lower_case(str).compare("int64") == 0) + return loco::DataType::S64; + + if (to_lower_case(str).compare("float16") == 0) + return loco::DataType::FLOAT16; + if (to_lower_case(str).compare("float32") == 0) + return loco::DataType::FLOAT32; + if (to_lower_case(str).compare("float64") == 0) + return loco::DataType::FLOAT64; + + if (to_lower_case(str).compare("bool") == 0) + return loco::DataType::BOOL; + + return loco::DataType::Unknown; +} + +QuantizationGranularity str_to_granularity(const std::string &str) +{ + if (to_lower_case(str).compare("layer") == 0) + return QuantizationGranularity::LayerWise; + + if (to_lower_case(str).compare("channel") == 0) + return QuantizationGranularity::ChannelWise; + + throw 
std::runtime_error("Quantization granularity must be either 'layer' or 'channel'"); +} + +} // namespace luci diff --git a/compiler/luci/pass/src/CircleOptimizerUtils.h b/compiler/luci/pass/src/CircleOptimizerUtils.h new file mode 100644 index 000000000..7e577a05f --- /dev/null +++ b/compiler/luci/pass/src/CircleOptimizerUtils.h @@ -0,0 +1,42 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __LUCI_CIRCLE_OPTIMIZER_UTILS_H__ +#define __LUCI_CIRCLE_OPTIMIZER_UTILS_H__ + +#include "luci/Pass/QuantizeDequantizeWeightsPass.h" +#include "luci/Pass/QuantizeWithMinMaxPass.h" + +#include <loco.h> + +#include <algorithm> + +namespace luci +{ + +bool in_array(const std::string &, const std::vector<std::string> &); + +std::string to_string(const std::vector<std::string> &); + +std::string to_lower_case(std::string); + +loco::DataType str_to_dtype(const std::string &); + +QuantizationGranularity str_to_granularity(const std::string &); + +} // namespace luci + +#endif // __LUCI_CIRCLE_OPTIMIZER_UTILS_H__ diff --git a/compiler/luci/pass/src/FuseBCQPass.cpp b/compiler/luci/pass/src/FuseBCQPass.cpp new file mode 100644 index 000000000..b81db8827 --- /dev/null +++ b/compiler/luci/pass/src/FuseBCQPass.cpp @@ -0,0 +1,405 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. 
All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "luci/Pass/FuseBCQPass.h" + +#include <luci/IR/CircleNodes.h> + +#include <cassert> +#include <string> +#include <set> + +namespace +{ + +/** + * @brief Circle nodes including BCQ information and a circle node to which BCQ will be applied + * are connected with their name. And their names include common prefix. + * However, after pb file is converted to tflite file, some nodes' name are changed. + * Thus this function will return original common prefix. + * + * @note All the re-naming rule of TFLite converter is not figured out. + * Therefore, if new naming rule is detected, this function should be updated. 
+ */ +const std::string node_name_prefix(luci::NodeName node_name) +{ + std::string prefix = node_name; + + if (prefix.find("ReadVariableOp/resource/") != std::string::npos) + { + const auto start_index = prefix.find("ReadVariableOp/resource/"); + + const auto left_prefix = prefix.substr(0, start_index); + const auto right_prefix = prefix.substr(start_index + 24); + + prefix = left_prefix + right_prefix; + } + + if (prefix.find("Tensordot/") != std::string::npos) + { + const auto index = prefix.find("Tensordot/"); + prefix = prefix.substr(0, index - 1); + } + else if (prefix.find("kernel/") != std::string::npos) + { + const auto index = prefix.find("kernel/"); + prefix = prefix.substr(0, index - 1); + } + else if (prefix.find("/bcqinfo_") != std::string::npos) + { + const auto index = prefix.find("/bcqinfo_"); + prefix = prefix.substr(0, index); + } + + return prefix; +} + +} // namespace + +namespace +{ + +class BCQConverter final +{ +public: + void add_BCQ_info_node(luci::CircleConst *node) + { + const auto node_name = node->name(); + const auto prefix = node_name_prefix(node_name); + + // If bcqinfo_* nodes are held by Reshape operation, + // shape of bcqinfo_* nodes are copied to `shape` input of Reshape operation. + // Then the name becomes bcqinfo_*_copy_shape. + // We should prevent such nodes from being added to BCQ information. 
+ if (node_name.find("_copy_shape") != std::string::npos) + return; + + if (node_name.find("bcqinfo_do_w_x") != std::string::npos) + _do_w_x[prefix] = node; + else if (node_name.find("bcqinfo_alpha") != std::string::npos) + _alpha[prefix] = node; + else if (node_name.find("bcqinfo_packed_binary_code") != std::string::npos) + _packed_binary_code[prefix] = node; + else if (node_name.find("bcqinfo_number_of_clusters") != std::string::npos) + _number_of_clusters[prefix] = node; + else if (node_name.find("bcqinfo_size_of_clusters") != std::string::npos) + _size_of_clusters[prefix] = node; + else if (node_name.find("bcqinfo_qbits_of_clusters") != std::string::npos) + _qbits_of_clusters[prefix] = node; + else if (node_name.find("bcqinfo_dequant_weight") != std::string::npos) + _dequant_weight[prefix] = node; + } + + bool has_BCQ_info(luci::CircleConst *node) + { + const auto prefix = node_name_prefix(node->name()); + bool has_info = true; + + has_info &= (_do_w_x.find(prefix) != _do_w_x.end()); + has_info &= (_alpha.find(prefix) != _alpha.end()); + has_info &= (_packed_binary_code.find(prefix) != _packed_binary_code.end()); + has_info &= (_number_of_clusters.find(prefix) != _number_of_clusters.end()); + has_info &= (_size_of_clusters.find(prefix) != _size_of_clusters.end()); + has_info &= (_qbits_of_clusters.find(prefix) != _qbits_of_clusters.end()); + // bcqinfo_dequant_weight is just for validation, so not always exists. 
+ + return has_info; + } + + bool do_w_x(luci::CircleConst *node) + { + const auto prefix = node_name_prefix(node->name()); + + if (_do_w_x[prefix]->dtype() == loco::DataType::S32) + return _do_w_x[prefix]->at<loco::DataType::S32>(0) == 1; + else if (_do_w_x[prefix]->dtype() == loco::DataType::BOOL) + return _do_w_x[prefix]->at<loco::DataType::BOOL>(0); + else + throw std::runtime_error("do_w_x should be int or bool"); + } + + luci::CircleConst *get_alpha(luci::CircleConst *node) + { + const auto prefix = node_name_prefix(node->name()); + return _alpha[prefix]; + } + + luci::CircleConst *get_packed_binary_code(luci::CircleConst *node) + { + const auto prefix = node_name_prefix(node->name()); + return _packed_binary_code[prefix]; + } + + luci::CircleConst *get_number_of_clusters(luci::CircleConst *node) + { + const auto prefix = node_name_prefix(node->name()); + return _number_of_clusters[prefix]; + } + + luci::CircleConst *get_size_of_clusters(luci::CircleConst *node) + { + const auto prefix = node_name_prefix(node->name()); + return _size_of_clusters[prefix]; + } + + luci::CircleConst *get_qbits_of_clusters(luci::CircleConst *node) + { + const auto prefix = node_name_prefix(node->name()); + return _qbits_of_clusters[prefix]; + } + + luci::CircleConst *packed_clusters(luci::CircleConst *node) + { + auto graph = node->graph(); + auto qbits_of_clusters = get_qbits_of_clusters(node); + auto size_of_clusters = get_size_of_clusters(node); + const auto number_of_clusters = get_number_of_clusters(node)->at<loco::DataType::S32>(0); + + auto packed_clusters = graph->nodes()->create<luci::CircleConst>(); + packed_clusters->dtype(loco::DataType::S32); + packed_clusters->size<loco::DataType::S32>(number_of_clusters * 2); + packed_clusters->rank(2); + packed_clusters->dim(0) = number_of_clusters; + packed_clusters->dim(1) = 2; + packed_clusters->shape_status(luci::ShapeStatus::VALID); + + for (int i = 0; i < number_of_clusters; ++i) + { + 
packed_clusters->at<loco::DataType::S32>(i * 2) = + qbits_of_clusters->at<loco::DataType::S32>(i); + packed_clusters->at<loco::DataType::S32>(i * 2 + 1) = + size_of_clusters->at<loco::DataType::S32>(i); + } + + return packed_clusters; + } + + /** + * @brief Exclude BCQ information nodes which are used for fusing BCQ operations + * from graph output by using CircleOutputExclude + */ + void clear_BCQ_nodes() + { + auto createNoOp = [](luci::CircleNode *circle_node) { + auto graph = circle_node->graph(); + auto noOp = graph->nodes()->create<luci::CircleOutputExclude>(); + + if (circle_node->shape_status() == luci::ShapeStatus::VALID) + { + noOp->dtype(circle_node->dtype()); + noOp->rank(circle_node->rank()); + for (uint32_t i = 0; i < circle_node->rank(); ++i) + noOp->dim(i) = circle_node->dim(i); + } + else + { + // For type inference + noOp->dtype(loco::DataType::FLOAT32); + } + + return noOp; + }; + + auto clear_nodes = [createNoOp](std::map<std::string, luci::CircleConst *> &nodes) { + for (auto &n : nodes) + { + auto node = n.second; + + for (auto s : loco::succs(node)) + { + if (auto outnode = dynamic_cast<luci::CircleOutput *>(s)) + { + outnode->from(createNoOp(node)); + } + else if (auto reshape_node = dynamic_cast<luci::CircleReshape *>(s)) + { + for (auto o : loco::succs(reshape_node)) + { + auto circle_output = loco::must_cast<luci::CircleOutput *>(o); + circle_output->from(createNoOp(reshape_node)); + } + } + } + } + }; + + clear_nodes(_do_w_x); + clear_nodes(_alpha); + clear_nodes(_packed_binary_code); + clear_nodes(_number_of_clusters); + clear_nodes(_size_of_clusters); + clear_nodes(_qbits_of_clusters); + clear_nodes(_dequant_weight); + } + +private: + std::map<std::string, luci::CircleConst *> _do_w_x; + std::map<std::string, luci::CircleConst *> _alpha; + std::map<std::string, luci::CircleConst *> _packed_binary_code; + std::map<std::string, luci::CircleConst *> _number_of_clusters; + std::map<std::string, luci::CircleConst *> _size_of_clusters; + 
std::map<std::string, luci::CircleConst *> _qbits_of_clusters; + std::map<std::string, luci::CircleConst *> _dequant_weight; +}; + +} // namespace + +namespace luci +{ + +bool FuseBCQPass::run(loco::Graph *g) +{ + BCQConverter converter; + + bool changed = false; + + for (auto node : loco::all_nodes(g)) + { + if (auto circle_const = dynamic_cast<luci::CircleConst *>(node)) + { + converter.add_BCQ_info_node(circle_const); + } + } + + for (auto node : loco::active_nodes(loco::output_nodes(g))) + { + if (auto gather = dynamic_cast<luci::CircleGather *>(node)) + { + auto params = dynamic_cast<luci::CircleConst *>(gather->params()); + if (params != nullptr && converter.has_BCQ_info(params)) + { + auto bcq_gather = g->nodes()->create<luci::CircleBCQGather>(); + + bcq_gather->input_scales(converter.get_alpha(params)); + bcq_gather->input_binary(converter.get_packed_binary_code(params)); + bcq_gather->indices(gather->indices()); + bcq_gather->input_clusters(converter.packed_clusters(params)); + + const auto binary_hidden_size = + loco::must_cast<luci::CircleConst *>(bcq_gather->input_binary())->dim(1).value() * 32; + bcq_gather->input_hidden_size(binary_hidden_size); + + if (converter.do_w_x(params)) + { + bcq_gather->axis(gather->axis()); + } + else + { + const auto axis_transpose = (gather->axis() == 0) ? 
1 : 0; + bcq_gather->axis(axis_transpose); + } + + loco::replace(gather).with(bcq_gather); + + changed = true; + } + } + else if (auto fully_connected = dynamic_cast<luci::CircleFullyConnected *>(node)) + { + auto weights = dynamic_cast<luci::CircleConst *>(fully_connected->weights()); + if (weights != nullptr && converter.has_BCQ_info(weights)) + { + auto bcq_fc = g->nodes()->create<luci::CircleBCQFullyConnected>(); + + bcq_fc->weights_scales(converter.get_alpha(weights)); + bcq_fc->weights_binary(converter.get_packed_binary_code(weights)); + bcq_fc->bias(fully_connected->bias()); + bcq_fc->weights_clusters(converter.packed_clusters(weights)); + bcq_fc->fusedActivationFunction(fully_connected->fusedActivationFunction()); + + loco::Node *bcq_input = fully_connected->input(); + int32_t batch_rank = 0; + + // If input of BCQFullyConnected has more than rank 2, we should reshape it as rank 2 + const auto original_input = loco::must_cast<luci::CircleNode *>(fully_connected->input()); + if (original_input->shape_status() == ShapeStatus::VALID && original_input->rank() > 2) + { + auto new_shape = g->nodes()->create<luci::CircleConst>(); + new_shape->dtype(loco::DataType::S32); + new_shape->size<loco::DataType::S32>(2); + new_shape->rank(1); + new_shape->dim(0) = 2; + + auto batch_size = 1; + for (uint32_t i = 0; i < original_input->rank() - 1; ++i) + batch_size *= original_input->dim(i).value(); + + new_shape->at<loco::DataType::S32>(0) = batch_size; + new_shape->at<loco::DataType::S32>(1) = + original_input->dim(original_input->rank() - 1).value(); + new_shape->shape_status(ShapeStatus::VALID); + + auto reshape = g->nodes()->create<luci::CircleReshape>(); + reshape->tensor(original_input); + reshape->shape(new_shape); + + bcq_input = reshape; + batch_rank = original_input->rank() - 2; + } + + // If x_w formation, we should insert Transpose in front and back of BCQFullyConnected + if (converter.do_w_x(weights)) + { + const auto binary_hidden_size = + 
loco::must_cast<luci::CircleNode *>(fully_connected->input()) + ->dim(batch_rank) + .value(); + bcq_fc->weights_hidden_size(binary_hidden_size); + bcq_fc->input(bcq_input); + loco::replace(fully_connected).with(bcq_fc); + } + else + { + const auto binary_hidden_size = + loco::must_cast<luci::CircleNode *>(fully_connected->input()) + ->dim(1 + batch_rank) + .value(); + bcq_fc->weights_hidden_size(binary_hidden_size); + + auto perm = g->nodes()->create<luci::CircleConst>(); + perm->dtype(loco::DataType::S32); + perm->size<loco::DataType::S32>(2); + perm->rank(1); + perm->dim(0) = 2; + perm->at<loco::DataType::S32>(0) = 1; + perm->at<loco::DataType::S32>(1) = 0; + perm->shape_status(ShapeStatus::VALID); + + auto input_transpose = g->nodes()->create<luci::CircleTranspose>(); + input_transpose->a(bcq_input); + input_transpose->perm(perm); + + bcq_fc->input(input_transpose); + + auto output_transpose = g->nodes()->create<luci::CircleTranspose>(); + output_transpose->a(bcq_fc); + output_transpose->perm(perm); + + loco::replace(fully_connected).with(output_transpose); + } + + changed = true; + } + } + } + + if (changed) + converter.clear_BCQ_nodes(); + + return changed; +} + +} // namespace luci diff --git a/compiler/luci/pass/src/FuseInstanceNormPass.cpp b/compiler/luci/pass/src/FuseInstanceNormPass.cpp index 180b5bbef..ad8765c41 100644 --- a/compiler/luci/pass/src/FuseInstanceNormPass.cpp +++ b/compiler/luci/pass/src/FuseInstanceNormPass.cpp @@ -15,6 +15,7 @@ */ #include "luci/Pass/FuseInstanceNormPass.h" +#include "FuseInstanceNormPassInternal.h" #include <luci/IR/CircleNodes.h> @@ -114,8 +115,6 @@ bool NodeFiller<ARG_TYPE_1, ARG_TYPE_2>::with_commutative_args_of(const COMM_NOD } // namespace // Helper to check detail -namespace -{ /// @return true When node has shape of '1 x .. 
x 1 x depth' bool is_1D_with_dummy_dim(luci::CircleConst *node, uint32_t depth) @@ -130,7 +129,23 @@ bool is_1D_with_dummy_dim(luci::CircleConst *node, uint32_t depth) return node->dim(axis).value() == depth; } -bool is_instance_mean(luci::CircleMean *mean) +/// @return true if node shape consists of ones, except the one before the last dim: 1,...1,depth,1 +bool is_quasi_1D_with_dummy_dim(luci::CircleConst *node, uint32_t depth) +{ + auto rank = node->rank(); + // minimal accepted shape is [1 x depth x 1] + if (rank < 3) + return false; + const auto depth_axis = rank - 2; + for (uint32_t axis = 0; axis < rank; ++axis) + { + if (axis != depth_axis && node->dim(axis).value() != 1) + return false; + } + return node->dim(depth_axis).value() == depth; +} + +bool is_instance_mean_v0(luci::CircleMean *mean) { // // CHECK 1) input is rank 4 @@ -175,7 +190,53 @@ bool is_instance_mean(luci::CircleMean *mean) return mean->keep_dims(); } -} // namespace +bool is_instance_mean_v1(luci::CircleMean *mean) +{ + // + // CHECK 1) input is rank 5 (NHWCX) + // + auto input = mean->input(); + if (not loco::shape_known(input)) + return false; + auto input_shape = loco::shape_get(input).as<loco::TensorShape>(); + if (input_shape.rank() != 5) + return false; + + // + // CHECK 2) 'reduction indices' is CircleConst of value [1,2,4], that is HWX of NHWCX input shape + // + // TODO Support equivalent case, like [-3,-2] + // TODO Support non-Const case? + // TODO What if input is NCHW format in Circle? 
+ auto red_indices = dynamic_cast<luci::CircleConst *>(mean->reduction_indices()); + if (not red_indices) + return false; + if (red_indices->rank() != 1) + return false; + std::set<int32_t> red_indices_set; + + // TODO Currently only support S32, support other types + if (red_indices->dtype() != loco::DataType::S32) + return false; + for (uint32_t i = 0; i < red_indices->dim(0).value(); ++i) + red_indices_set.insert(red_indices->at<loco::DataType::S32>(i)); + + if (red_indices_set.size() != 3) + return false; + if (red_indices_set.find(1) == red_indices_set.end()) + return false; + if (red_indices_set.find(2) == red_indices_set.end()) + return false; + if (red_indices_set.find(4) == red_indices_set.end()) + return false; + + // + // CHECK 3) keep_dims == true (?) + // + // We only have case of 'keep_dims == true' so far, but it might be okay with 'keep_dims == false' + // TODO Check this fact, and if true, return true regardless of keep_dims + return mean->keep_dims(); +} // Helper to fuse Instance Norm namespace @@ -227,14 +288,61 @@ namespace * | * V * [Out] + *------------------------------------------------------------------- + * [In] + * | + * V + * ifm + * | + * V + * +---------reshape_of_ifm ----+ (reduction indicies) + * | | | | + * | | V V + * | | mean_of_reshape -------------+ + * | V | | + * | sqdiff <--+ (reduction indicies) | + * | | | | + * | V | | + * | mean_as_variance <---+ const_as_epsilon | + * | | | | + * | V | | + * | add_as_variance <--------+ | + * | | | + * | V | + * | rsqrt const_as_gamma | + * | | | | + * | V | | + * | mul_gamma <--+ | + * | | | | + * V V V | + * mul_as_scaled_reshape mul_as_scaled_mean <-----------+ + * | | + * | const_as_beta | + * | | V + * | +------> sub + * V | + * add_as_terminal <----------+ + * | + * V + * reshape_as_terminal + * | + * V + * [Out] */ class InstanceNormPattern final { public: - InstanceNormPattern(luci::CircleAdd *candidate) + enum PatternVersion + { + Version_0, + Version_1 + }; + + 
InstanceNormPattern(luci::CircleAdd *candidate, PatternVersion pv) { assert(candidate); add_as_terminal = candidate; + _pv = pv; } public: @@ -244,7 +352,9 @@ public: public: // Context loco::Node *ifm = nullptr; + luci::CircleReshape *reshape_of_ifm = nullptr; luci::CircleMean *mean_of_ifm = nullptr; + luci::CircleMean *mean_of_reshape = nullptr; luci::CircleSquaredDifference *sqdiff = nullptr; luci::CircleMean *mean_as_variance = nullptr; luci::CircleConst *const_as_epsilon = nullptr; @@ -254,12 +364,14 @@ public: luci::CircleMul *mul_gamma = nullptr; luci::CircleMul *mul_as_scaled_ifm = nullptr; luci::CircleMul *mul_as_scaled_mean = nullptr; + luci::CircleMul *mul_as_scaled_reshape = nullptr; luci::CircleConst *const_as_beta = nullptr; luci::CircleSub *sub = nullptr; luci::CircleAdd *add_as_terminal = nullptr; private: bool _matched = false; + PatternVersion _pv; }; bool InstanceNormPattern::matched() @@ -273,8 +385,18 @@ bool InstanceNormPattern::matched() // Check order is DFS - CHECK_OR_FALSE(fill(&mul_as_scaled_ifm, &sub).with_commutative_args_of(add_as_terminal)); - CHECK_OR_FALSE(fill(&ifm, &mul_gamma).with_commutative_args_of(mul_as_scaled_ifm)); + if (_pv == PatternVersion::Version_0) + { + CHECK_OR_FALSE(fill(&mul_as_scaled_ifm, &sub).with_commutative_args_of(add_as_terminal)); + CHECK_OR_FALSE(fill(&ifm, &mul_gamma).with_commutative_args_of(mul_as_scaled_ifm)); + } + if (_pv == PatternVersion::Version_1) + { + CHECK_OR_FALSE(fill(&mul_as_scaled_reshape, &sub).with_commutative_args_of(add_as_terminal)); + CHECK_OR_FALSE( + fill(&reshape_of_ifm, &mul_gamma).with_commutative_args_of(mul_as_scaled_reshape)); + ifm = reshape_of_ifm->tensor(); + } CHECK_OR_FALSE(loco::shape_known(ifm)); auto ifm_shape = loco::shape_get(ifm); @@ -284,7 +406,15 @@ bool InstanceNormPattern::matched() uint32_t ifm_channel_depth = ifm_tensor_shape.dim(3).value(); CHECK_OR_FALSE(fill(&rsqrt, &const_as_gamma).with_commutative_args_of(mul_gamma)); - 
CHECK_OR_FALSE(is_1D_with_dummy_dim(const_as_gamma, ifm_channel_depth)); + + if (_pv == PatternVersion::Version_0) + { + CHECK_OR_FALSE(is_1D_with_dummy_dim(const_as_gamma, ifm_channel_depth)); + } + if (_pv == PatternVersion::Version_1) + { + CHECK_OR_FALSE(is_quasi_1D_with_dummy_dim(const_as_gamma, ifm_channel_depth)); + } add_as_variance = dynamic_cast<luci::CircleAdd *>(rsqrt->x()); CHECK_OR_FALSE(add_as_variance); @@ -296,29 +426,69 @@ bool InstanceNormPattern::matched() // TODO Support regarding broadcast CHECK_OR_FALSE(const_as_epsilon->size<loco::DataType::FLOAT32>() == 1); - CHECK_OR_FALSE(is_instance_mean(mean_as_variance)); + if (_pv == PatternVersion::Version_0) + { + CHECK_OR_FALSE(is_instance_mean_v0(mean_as_variance)); + } + if (_pv == PatternVersion::Version_1) + { + CHECK_OR_FALSE(is_instance_mean_v1(mean_as_variance)); + } + sqdiff = dynamic_cast<luci::CircleSquaredDifference *>(mean_as_variance->input()); CHECK_OR_FALSE(sqdiff); - loco::Node *ifm_should_be = nullptr; - CHECK_OR_FALSE(fill(&ifm_should_be, &mean_of_ifm).with_commutative_args_of(sqdiff)); - CHECK_OR_FALSE(ifm == ifm_should_be); - CHECK_OR_FALSE(is_instance_mean(mean_of_ifm)); - CHECK_OR_FALSE(ifm == mean_of_ifm->input()); + if (_pv == PatternVersion::Version_0) + { + loco::Node *ifm_should_be = nullptr; + CHECK_OR_FALSE(fill(&ifm_should_be, &mean_of_ifm).with_commutative_args_of(sqdiff)); + CHECK_OR_FALSE(ifm == ifm_should_be); + CHECK_OR_FALSE(is_instance_mean_v0(mean_of_ifm)); + CHECK_OR_FALSE(ifm == mean_of_ifm->input()); + } + if (_pv == PatternVersion::Version_1) + { + loco::Node *reshape_should_be = nullptr; + CHECK_OR_FALSE(fill(&reshape_should_be, &mean_of_reshape).with_commutative_args_of(sqdiff)); + CHECK_OR_FALSE(reshape_of_ifm == reshape_should_be); + CHECK_OR_FALSE(is_instance_mean_v1(mean_of_reshape)); + CHECK_OR_FALSE(reshape_of_ifm == mean_of_reshape->input()); + } const_as_beta = dynamic_cast<luci::CircleConst *>(sub->x()); CHECK_OR_FALSE(const_as_beta); - 
CHECK_OR_FALSE(is_1D_with_dummy_dim(const_as_beta, ifm_channel_depth)); + + if (_pv == PatternVersion::Version_0) + { + CHECK_OR_FALSE(is_1D_with_dummy_dim(const_as_beta, ifm_channel_depth)); + } + if (_pv == PatternVersion::Version_1) + { + CHECK_OR_FALSE(is_quasi_1D_with_dummy_dim(const_as_beta, ifm_channel_depth)); + } mul_as_scaled_mean = dynamic_cast<luci::CircleMul *>(sub->y()); CHECK_OR_FALSE(mul_as_scaled_mean); luci::CircleMul *mul_gamma_should_be = nullptr; luci::CircleMean *mean_of_ifm_should_be = nullptr; - CHECK_OR_FALSE(fill(&mul_gamma_should_be, &mean_of_ifm_should_be) - .with_commutative_args_of(mul_as_scaled_mean)); - CHECK_OR_FALSE(mul_gamma == mul_gamma_should_be); - CHECK_OR_FALSE(mean_of_ifm == mean_of_ifm_should_be); + luci::CircleMean *mean_of_reshape_should_be = nullptr; + + if (_pv == PatternVersion::Version_0) + { + CHECK_OR_FALSE(fill(&mul_gamma_should_be, &mean_of_ifm_should_be) + .with_commutative_args_of(mul_as_scaled_mean)); + CHECK_OR_FALSE(mul_gamma == mul_gamma_should_be); + CHECK_OR_FALSE(mean_of_ifm == mean_of_ifm_should_be); + } + if (_pv == PatternVersion::Version_1) + { + CHECK_OR_FALSE(fill(&mul_gamma_should_be, &mean_of_reshape_should_be) + .with_commutative_args_of(mul_as_scaled_mean)); + CHECK_OR_FALSE(mul_gamma == mul_gamma_should_be); + CHECK_OR_FALSE(mean_of_reshape == mean_of_reshape_should_be); + } + #undef CHECK_OR_FALSE _matched = true; return true; @@ -381,13 +551,28 @@ namespace luci bool FuseInstanceNormPass::run(loco::Graph *g) { bool changed = false; + luci::CircleAdd *add; + InstanceNormPattern::PatternVersion pv; + for (auto node : loco::active_nodes(loco::output_nodes(g))) { - auto add = dynamic_cast<luci::CircleAdd *>(node); - if (not add) - continue; + auto reshape = dynamic_cast<luci::CircleReshape *>(node); + if (not reshape) + { + add = dynamic_cast<luci::CircleAdd *>(node); + if (not add) + continue; + pv = InstanceNormPattern::PatternVersion::Version_0; + } + else + { + add = 
dynamic_cast<luci::CircleAdd *>(reshape->tensor()); + if (not add) + continue; + pv = InstanceNormPattern::PatternVersion::Version_1; + } - InstanceNormPattern pattern(add); + InstanceNormPattern pattern(add, pv); if (not pattern.matched()) continue; diff --git a/compiler/luci/pass/src/FuseInstanceNormPass.test.cpp b/compiler/luci/pass/src/FuseInstanceNormPass.test.cpp new file mode 100644 index 000000000..3037f3def --- /dev/null +++ b/compiler/luci/pass/src/FuseInstanceNormPass.test.cpp @@ -0,0 +1,64 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "FuseInstanceNormPassInternal.h" + +#include <vector> + +#include <gtest/gtest.h> + +namespace +{ + +void setShape(luci::CircleNode &node, const std::vector<int> &v) +{ + node.rank(v.size()); + for (int i = 0; i < v.size(); ++i) + { + node.dim(i) = v[i]; + } +} + +} // namespace + +TEST(FuseInstanceNormPass, is_quasi_1D_with_dummy_dim) +{ + luci::CircleConst const_node; + + setShape(const_node, {}); + EXPECT_FALSE(is_quasi_1D_with_dummy_dim(&const_node, 8)); + + setShape(const_node, {1}); + EXPECT_FALSE(is_quasi_1D_with_dummy_dim(&const_node, 8)); + + setShape(const_node, {8}); + EXPECT_FALSE(is_quasi_1D_with_dummy_dim(&const_node, 8)); + + setShape(const_node, {1, 2, 1, 8, 1}); + EXPECT_FALSE(is_quasi_1D_with_dummy_dim(&const_node, 8)); + + setShape(const_node, {8, 3}); + EXPECT_FALSE(is_quasi_1D_with_dummy_dim(&const_node, 8)); + + setShape(const_node, {8, 1}); + EXPECT_FALSE(is_quasi_1D_with_dummy_dim(&const_node, 8)); + + setShape(const_node, {1, 8, 1}); + EXPECT_TRUE(is_quasi_1D_with_dummy_dim(&const_node, 8)); + + setShape(const_node, {1, 1, 1, 8, 1}); + EXPECT_TRUE(is_quasi_1D_with_dummy_dim(&const_node, 8)); +} diff --git a/compiler/luci/pass/src/FuseInstanceNormPassInternal.h b/compiler/luci/pass/src/FuseInstanceNormPassInternal.h new file mode 100644 index 000000000..32b638ba5 --- /dev/null +++ b/compiler/luci/pass/src/FuseInstanceNormPassInternal.h @@ -0,0 +1,28 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __LUCI_CIRCLE_FUSE_INSTANCE_NORM_PASS_INTERNAL_H__ +#define __LUCI_CIRCLE_FUSE_INSTANCE_NORM_PASS_INTERNAL_H__ + +#include <luci/IR/CircleNodes.h> + +/// @return true When node has shape of '1 x .. x 1 x depth' +bool is_1D_with_dummy_dim(luci::CircleConst *node, uint32_t depth); + +/// @return true When node has shape of '1 x .. x depth x 1' +bool is_quasi_1D_with_dummy_dim(luci::CircleConst *node, uint32_t depth); + +#endif // __LUCI_CIRCLE_FUSE_INSTANCE_NORM_PASS_INTERNAL_H__ diff --git a/compiler/luci/pass/src/QuantizationUtils.cpp b/compiler/luci/pass/src/QuantizationUtils.cpp new file mode 100644 index 000000000..6726ce746 --- /dev/null +++ b/compiler/luci/pass/src/QuantizationUtils.cpp @@ -0,0 +1,172 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "QuantizationUtils.h" + +#include <luci/Log.h> + +#include <iostream> +#include <cmath> + +namespace luci +{ + +void compute_sym_scale_zp(float min, float max, float &scaling_factor, int64_t &zp, + float &nudged_min, float &nudged_max) +{ + assert(min != max); + + const int32_t kMaxScale = std::numeric_limits<int16_t>::max(); + const int32_t kMinScale = -kMaxScale; + const double qmin_double = kMinScale; + const double qmax_double = kMaxScale; + const double rmin = std::fmin(0, min); + const double rmax = std::fmax(0, max); + double scale_factor_from_min_side{0}; + double scale_factor_from_max_side{0}; + + if ((qmin_double * rmin) > 0) + scale_factor_from_min_side = rmin / qmin_double; + + if ((qmax_double * rmax) > 0) + scale_factor_from_max_side = rmax / qmax_double; + + scaling_factor = scale_factor_from_min_side > scale_factor_from_max_side + ? scale_factor_from_min_side + : scale_factor_from_max_side; + zp = 0; + nudged_min = static_cast<float>(qmin_double * scaling_factor); + nudged_max = static_cast<float>(qmax_double * scaling_factor); +} + +void compute_asym_scale_zp(float min, float max, float &scaling_factor, int64_t &zp, + float &nudged_min, float &nudged_max) +{ + LOGGER(l); + + assert(min <= max); + const int32_t kMinScale = 0; + const int32_t kMaxScale = 255; + const double qmin_double = kMinScale; + const double qmax_double = kMaxScale; + const double rmin = std::fmin(0, min); + const double rmax = std::fmax(0, max); + + double scale = (rmax - rmin) / (qmax_double - qmin_double); + double zero_point_double = 0; + uint8_t nudged_zero_point = 0; + if (scale == 0) + { + WARN(l) << "The minimum and maximum values are the same." 
<< std::endl; + if (min >= 0 && max >= 0) + zero_point_double = kMinScale; + else + zero_point_double = kMaxScale; + } + else + zero_point_double = qmin_double - rmin / scale; + if (zero_point_double <= qmin_double) + { + assert(min >= 0 && max >= 0); + nudged_zero_point = kMinScale; + scale = max / (qmax_double - qmin_double); + if (min > 0 && max > 0) + WARN(l) << "The minimum and maximum values are all positive." << std::endl; + } + else if (zero_point_double >= qmax_double) + { + assert(min < 0 && max < 0); + nudged_zero_point = kMaxScale; + scale = -min / (qmax_double - qmin_double); + WARN(l) << "The minimum and maximum values are all negative." << std::endl; + } + else + { + assert(min < 0 && max >= 0); + nudged_zero_point = static_cast<uint8_t>(std::round(zero_point_double)); + } + + nudged_min = static_cast<float>((qmin_double - nudged_zero_point) * scale); + nudged_max = static_cast<float>((qmax_double - nudged_zero_point) * scale); + + scaling_factor = scale; + zp = nudged_zero_point; +} + +bool get_channel_dim_index(CircleConst *node, loco::TensorShape &dimension, int &channel_dim_index) +{ + auto succs = loco::succs(node); + if (succs.size() != 1) // assume weights is used by only one node + return false; + + for (auto out : succs) + { + auto conv = dynamic_cast<CircleConv2D *>(out); + auto dw_conv = dynamic_cast<CircleDepthwiseConv2D *>(out); + auto tw_conv = dynamic_cast<CircleTransposeConv *>(out); + auto fc = dynamic_cast<CircleFullyConnected *>(out); + + // Refer to https://github.com/Samsung/ONE/pull/2448. 
+ if ((conv != nullptr && conv->filter() == node) || + (tw_conv != nullptr && tw_conv->filter() == node)) // OHWI + { + assert(node->rank() == 4); + dimension.dim(0).set(node->dim(0).value()); + dimension.dim(1).set(node->dim(1).value()); + dimension.dim(2).set(node->dim(2).value()); + dimension.dim(3).set(node->dim(3).value()); + channel_dim_index = 0; // Set channel_dim_index based on "O" + return true; + } + else if (dw_conv != nullptr && dw_conv->filter() == node) // IHWC + { + assert(node->rank() == 4); + dimension.dim(0).set(node->dim(0).value()); + dimension.dim(1).set(node->dim(1).value()); + dimension.dim(2).set(node->dim(2).value()); + dimension.dim(3).set(node->dim(3).value()); + channel_dim_index = 3; // Set channel_dim_index based on "C" + return true; + } + else if (fc != nullptr && fc->weights() == node) // OI + { + assert(node->rank() == 2); + dimension.dim(0).set(node->dim(0).value()); + dimension.dim(1).set(1); // Set FC layer like CONV + dimension.dim(2).set(1); + dimension.dim(3).set(node->dim(1).value()); + channel_dim_index = 0; // Set channel_dim_index based on "O" + return true; + } + else + { + // node does not support channle-wise quantization + assert(false); + } + } + + return false; +} + +uint32_t cal_offset(loco::TensorShape &dimension, uint32_t *indices) +{ + return indices[0] * dimension.dim(1).value() * dimension.dim(2).value() * + dimension.dim(3).value() + + indices[1] * dimension.dim(2).value() * dimension.dim(3).value() + + indices[2] * dimension.dim(3).value() + indices[3]; +} + +} // namespace luci diff --git a/compiler/luci/pass/src/QuantizationUtils.h b/compiler/luci/pass/src/QuantizationUtils.h new file mode 100644 index 000000000..ec0e86df8 --- /dev/null +++ b/compiler/luci/pass/src/QuantizationUtils.h @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. 
All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __LUCI_QUANTIZATION_UTILS_H__ +#define __LUCI_QUANTIZATION_UTILS_H__ + +#include <luci/IR/CircleNodes.h> +#include <loco/IR/TensorShape.h> + +namespace luci +{ + +void compute_sym_scale_zp(float min, float max, float &scaling_factor, int64_t &zp, + float &nudged_min, float &nudged_max); + +void compute_asym_scale_zp(float min, float max, float &scaling_factor, int64_t &zp, + float &nudged_min, float &nudged_max); + +bool get_channel_dim_index(CircleConst *node, loco::TensorShape &dimension, int &channel_dim_index); + +uint32_t cal_offset(loco::TensorShape &dimension, uint32_t *indices); + +} // namespace luci + +#endif // __LUCI_QUANTIZATION_UTILS_H__ diff --git a/compiler/luci/pass/src/QuantizeDequantizeWeightsPass.cpp b/compiler/luci/pass/src/QuantizeDequantizeWeightsPass.cpp new file mode 100644 index 000000000..c492234c7 --- /dev/null +++ b/compiler/luci/pass/src/QuantizeDequantizeWeightsPass.cpp @@ -0,0 +1,495 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2019 The TensorFlow Authors. All Rights Reserved. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "luci/Pass/QuantizeDequantizeWeightsPass.h" +#include "QuantizationUtils.h" + +#include <luci/IR/CircleNodes.h> +#include <luci/IR/CircleNodeVisitor.h> +#include <luci/Log.h> +#include <loco/IR/TensorShape.h> + +#include <iostream> +#include <cmath> + +namespace luci +{ + +namespace +{ + +void cal_minmax_per_channel(CircleConst *node, std::vector<float> &min, std::vector<float> &max) +{ + loco::TensorShape dimension; + dimension.rank(4); + uint32_t indices[4] = { + 0, + }; + int channel_dim_index{0}; + int size{0}; + + if (!get_channel_dim_index(node, dimension, channel_dim_index)) + { + assert(false); + return; + } + size = dimension.dim(channel_dim_index).value(); + + std::vector<bool> has_min_max_value(size, false); + min.resize(size); + max.resize(size); + for (indices[0] = 0; indices[0] < dimension.dim(0).value(); indices[0]++) + { + for (indices[1] = 0; indices[1] < dimension.dim(1).value(); indices[1]++) + { + for (indices[2] = 0; indices[2] < dimension.dim(2).value(); indices[2]++) + { + for (indices[3] = 0; indices[3] < dimension.dim(3).value(); indices[3]++) + { + int channel_idx = indices[channel_dim_index]; + auto data = node->at<loco::DataType::FLOAT32>(cal_offset(dimension, indices)); + if (has_min_max_value[channel_idx]) + { + min[channel_idx] = data < min[channel_idx] ? data : min[channel_idx]; + max[channel_idx] = data > max[channel_idx] ? 
data : max[channel_idx]; + } + else + { + min[channel_idx] = data; + max[channel_idx] = data; + has_min_max_value[channel_idx] = true; + } + } + } + } + } +} + +void sym_wquant_per_channel(CircleConst *node, std::vector<float> &min, std::vector<float> &max, + std::vector<float> &scaling_factor, std::vector<int64_t> &zp, + std::vector<float> &nudged_min, std::vector<float> &nudged_max) +{ + assert(node->dtype() == loco::DataType::FLOAT32); + const int32_t kMaxScale = std::numeric_limits<int16_t>::max(); + const int32_t kMinScale = -kMaxScale; + + uint32_t size = node->size<loco::DataType::FLOAT32>(); + std::vector<int32_t> quantized_values(size); + + for (size_t i = 0; i < min.size(); ++i) + { + compute_sym_scale_zp(min[i], max[i], scaling_factor[i], zp[i], nudged_min[i], nudged_max[i]); + } + + loco::TensorShape dimension; + dimension.rank(4); + uint32_t indices[4] = { + 0, + }; + int channel_dim_index{0}; + + if (!get_channel_dim_index(node, dimension, channel_dim_index)) + { + assert(false); + return; + } + + for (indices[0] = 0; indices[0] < dimension.dim(0).value(); indices[0]++) + { + for (indices[1] = 0; indices[1] < dimension.dim(1).value(); indices[1]++) + { + for (indices[2] = 0; indices[2] < dimension.dim(2).value(); indices[2]++) + { + for (indices[3] = 0; indices[3] < dimension.dim(3).value(); indices[3]++) + { + int channel_idx = indices[channel_dim_index]; + const float scaling_factor_inv = 1.0 / scaling_factor[channel_idx]; + auto data = node->at<loco::DataType::FLOAT32>(cal_offset(dimension, indices)); + data = data < nudged_min[channel_idx] ? nudged_min[channel_idx] : data; + data = data > nudged_max[channel_idx] ? 
nudged_max[channel_idx] : data; + quantized_values[cal_offset(dimension, indices)] = + static_cast<int32_t>(std::round(data * scaling_factor_inv)); + } + } + } + } + + node->dtype(loco::DataType::S16); // change the type of tensor + node->size<loco::DataType::S16>(size); // resize tensor + for (uint32_t i = 0; i < size; ++i) + { + node->at<loco::DataType::S16>(i) = + std::min(kMaxScale, std::max(kMinScale, quantized_values[i])); + } +} + +void sym_wdequant_per_channel(CircleConst *node, std::vector<float> &scaling_factor) +{ + assert(node->dtype() == loco::DataType::S16); + uint32_t size = node->size<loco::DataType::S16>(); + std::vector<float> dequantized_values(size); + + loco::TensorShape dimension; + dimension.rank(4); + uint32_t indices[4] = { + 0, + }; + int channel_dim_index{0}; + + if (!get_channel_dim_index(node, dimension, channel_dim_index)) + { + assert(false); + return; + } + + for (indices[0] = 0; indices[0] < dimension.dim(0).value(); indices[0]++) + { + for (indices[1] = 0; indices[1] < dimension.dim(1).value(); indices[1]++) + { + for (indices[2] = 0; indices[2] < dimension.dim(2).value(); indices[2]++) + { + for (indices[3] = 0; indices[3] < dimension.dim(3).value(); indices[3]++) + { + int channel_idx = indices[channel_dim_index]; + auto data = node->at<loco::DataType::S16>(cal_offset(dimension, indices)); + dequantized_values[cal_offset(dimension, indices)] = + static_cast<float>(data) * scaling_factor[channel_idx]; + } + } + } + } + + node->dtype(loco::DataType::FLOAT32); // change the type of tensor + node->size<loco::DataType::FLOAT32>(size); // resize tensor + for (uint32_t i = 0; i < size; ++i) + { + node->at<loco::DataType::FLOAT32>(i) = dequantized_values[i]; + } +} + +void asymmetric_wquant_per_channel(CircleConst *node, std::vector<float> &min, + std::vector<float> &max, std::vector<float> &scaling_factor, + std::vector<int64_t> &zp, std::vector<float> &nudged_min, + std::vector<float> &nudged_max) +{ + assert(node->dtype() == 
loco::DataType::FLOAT32); + + const int32_t kMinScale = 0; + const int32_t kMaxScale = 255; + + uint32_t size = node->size<loco::DataType::FLOAT32>(); + std::vector<int32_t> quantized_values(size); + + for (size_t i = 0; i < min.size(); ++i) + { + compute_asym_scale_zp(min[i], max[i], scaling_factor[i], zp[i], nudged_min[i], nudged_max[i]); + } + + loco::TensorShape dimension; + dimension.rank(4); + uint32_t indices[4] = { + 0, + }; + int channel_dim_index{0}; + + if (!get_channel_dim_index(node, dimension, channel_dim_index)) + { + assert(false); + return; + } + + for (indices[0] = 0; indices[0] < dimension.dim(0).value(); indices[0]++) + { + for (indices[1] = 0; indices[1] < dimension.dim(1).value(); indices[1]++) + { + for (indices[2] = 0; indices[2] < dimension.dim(2).value(); indices[2]++) + { + for (indices[3] = 0; indices[3] < dimension.dim(3).value(); indices[3]++) + { + int channel_idx = indices[channel_dim_index]; + const float scaling_factor_inv = 1.0 / scaling_factor[channel_idx]; + auto data = node->at<loco::DataType::FLOAT32>(cal_offset(dimension, indices)); + data = data < nudged_min[channel_idx] ? nudged_min[channel_idx] : data; + data = data > nudged_max[channel_idx] ? 
nudged_max[channel_idx] : data; + quantized_values[cal_offset(dimension, indices)] = static_cast<int32_t>( + std::round((data - nudged_min[channel_idx]) * scaling_factor_inv)); + } + } + } + } + + node->dtype(loco::DataType::U8); // change the type of tensor + node->size<loco::DataType::U8>(size); // resize tensor + for (uint32_t i = 0; i < size; ++i) + { + node->at<loco::DataType::U8>(i) = std::min(kMaxScale, std::max(kMinScale, quantized_values[i])); + } +} + +void asymmetric_wdequant_per_channel(CircleConst *node, std::vector<float> &scaling_factor, + std::vector<float> &nudged_min) +{ + assert(node->dtype() == loco::DataType::U8); + uint32_t size = node->size<loco::DataType::U8>(); + std::vector<float> dequantized_values(size); + + loco::TensorShape dimension; + dimension.rank(4); + uint32_t indices[4] = { + 0, + }; + int channel_dim_index{0}; + + if (!get_channel_dim_index(node, dimension, channel_dim_index)) + { + assert(false); + return; + } + + for (indices[0] = 0; indices[0] < dimension.dim(0).value(); indices[0]++) + { + for (indices[1] = 0; indices[1] < dimension.dim(1).value(); indices[1]++) + { + for (indices[2] = 0; indices[2] < dimension.dim(2).value(); indices[2]++) + { + for (indices[3] = 0; indices[3] < dimension.dim(3).value(); indices[3]++) + { + int channel_idx = indices[channel_dim_index]; + auto data = node->at<loco::DataType::U8>(cal_offset(dimension, indices)); + dequantized_values[cal_offset(dimension, indices)] = + static_cast<float>(data) * scaling_factor[channel_idx] + nudged_min[channel_idx]; + } + } + } + } + + node->dtype(loco::DataType::FLOAT32); // change the type of tensor + node->size<loco::DataType::FLOAT32>(size); // resize tensor + for (uint32_t i = 0; i < size; ++i) + { + node->at<loco::DataType::FLOAT32>(i) = dequantized_values[i]; + } +} + +void asymmetric_wquant_with_minmax_per_layer(CircleConst *node, float min, float max, + float &scaling_factor, int64_t &zp, float &nudged_min, + float &nudged_max) +{ + + const int32_t 
kMinScale = 0; + const int32_t kMaxScale = 255; + + uint32_t size = node->size<loco::DataType::FLOAT32>(); + compute_asym_scale_zp(min, max, scaling_factor, zp, nudged_min, nudged_max); + const float scaling_factor_inv = 1.0 / scaling_factor; + std::vector<int32_t> quantized_values(size); + for (uint32_t i = 0; i < size; ++i) + { + // clipping + auto data = node->at<loco::DataType::FLOAT32>(i); + data = data < nudged_min ? nudged_min : data; + data = data > nudged_max ? nudged_max : data; + quantized_values[i] = + static_cast<int32_t>(std::round((data - nudged_min) * scaling_factor_inv)); + } + + node->dtype(loco::DataType::U8); // change the type of tensor + node->size<loco::DataType::U8>(size); // resize tensor + for (uint32_t i = 0; i < size; ++i) + { + node->at<loco::DataType::U8>(i) = std::min(kMaxScale, std::max(kMinScale, quantized_values[i])); + } +} + +void asymmetric_wdequant_with_minmax_per_layer(CircleConst *node, float scaling_factor, + float nudged_min) +{ + uint32_t size = node->size<loco::DataType::U8>(); + std::vector<float> dequantized_values(size); + for (uint32_t i = 0; i < size; ++i) + { + auto data = node->at<loco::DataType::U8>(i); + dequantized_values[i] = static_cast<float>(data) * scaling_factor + nudged_min; + } + + node->dtype(loco::DataType::FLOAT32); // change the type of tensor + node->size<loco::DataType::FLOAT32>(size); // resize tensor + for (uint32_t i = 0; i < size; ++i) + { + node->at<loco::DataType::FLOAT32>(i) = dequantized_values[i]; + } +} + +bool is_quantized(const CircleNode *node) +{ + return node->dtype() == loco::DataType::U8 || // activation, weight + node->dtype() == loco::DataType::S16 || // activation, weight + node->dtype() == loco::DataType::S32; // bias +} + +// Check if node is weights of conv2d, transepose_conv2d, depthwise_conv2d, or fully_connected layer +bool is_weights(CircleNode *node) +{ + auto circle_const = dynamic_cast<CircleConst *>(node); + if (circle_const == nullptr) + return false; + + auto succs 
= loco::succs(node); + if (succs.size() != 1) // assume weights is used by only one node + return false; + + for (auto out : succs) + { + auto conv = dynamic_cast<CircleConv2D *>(out); + if (conv != nullptr && conv->filter() == circle_const && circle_const->rank() == 4) + return true; + + auto dw_conv = dynamic_cast<CircleDepthwiseConv2D *>(out); + if (dw_conv != nullptr && dw_conv->filter() == circle_const && circle_const->rank() == 4) + return true; + + auto tw_conv = dynamic_cast<CircleTransposeConv *>(out); + if (tw_conv != nullptr && tw_conv->filter() == circle_const && circle_const->rank() == 4) + return true; + + auto fc = dynamic_cast<CircleFullyConnected *>(out); + if (fc != nullptr && fc->weights() == circle_const && circle_const->rank() == 2) + return true; + } + return false; +} + +/** + * @brief QuantizeDequantizeWeights quantizes and dequantizes tensors for weights + * @details Find min/max values on the fly, quantize the model, and dequantize the model + */ +struct QuantizeDequantizeWeights final : public luci::CircleNodeMutableVisitor<bool> +{ + QuantizeDequantizeWeights(loco::DataType input, loco::DataType output, + QuantizationGranularity granularity) + : input_type(input), output_type(output), granularity(granularity) + { + } + + loco::DataType input_type; + loco::DataType output_type; + QuantizationGranularity granularity; + + // Quantize and dequantize input tensors of each node + bool visit(luci::CircleNode *node) + { + assert(output_type == loco::DataType::U8 || output_type == loco::DataType::S16); + LOGGER(l); + INFO(l) << "QuantizeDequantizeWeights visit node: " << node->name() << std::endl; + auto arity = node->arity(); + for (uint32_t i = 0; i < arity; i++) + { + auto input_node = node->arg(i); + auto circle_node = loco::must_cast<luci::CircleNode *>(input_node); + + // Check if this is already quantized + if (is_quantized(circle_node)) + continue; + + if (is_weights(circle_node)) + { + auto circle_const = 
loco::must_cast<luci::CircleConst *>(circle_node); + + // Find min/max per channel-wise + if (granularity == QuantizationGranularity::ChannelWise) + { + std::vector<float> min; + std::vector<float> max; + + cal_minmax_per_channel(circle_const, min, max); + + std::vector<float> nudged_min(min.size()); + std::vector<float> nudged_max(min.size()); + std::vector<float> scaling_factor(min.size()); + std::vector<int64_t> zp(min.size()); + + if (output_type == loco::DataType::U8) + { + asymmetric_wquant_per_channel(circle_const, min, max, scaling_factor, zp, nudged_min, + nudged_max); + asymmetric_wdequant_per_channel(circle_const, scaling_factor, nudged_min); + } + else + { + sym_wquant_per_channel(circle_const, min, max, scaling_factor, zp, nudged_min, + nudged_max); + sym_wdequant_per_channel(circle_const, scaling_factor); + } + + auto quantparam = std::make_unique<CircleQuantParam>(); + quantparam->min = nudged_min; + quantparam->max = nudged_max; + quantparam->scale = scaling_factor; + quantparam->zerop = zp; + circle_node->quantparam(std::move(quantparam)); + } + // Find min/max per layer-wise + else + { + float min = std::numeric_limits<float>::max(); + float max = std::numeric_limits<float>::lowest(); + for (uint32_t i = 0; i < circle_const->size<loco::DataType::FLOAT32>(); i++) + { + auto data = circle_const->at<loco::DataType::FLOAT32>(i); + min = data < min ? data : min; + max = data > max ? 
data : max; + } + float scaling_factor{0}; + int64_t zp{0}; + float nudged_min{0}; + float nudged_max{0}; + + asymmetric_wquant_with_minmax_per_layer(circle_const, min, max, scaling_factor, zp, + nudged_min, nudged_max); + asymmetric_wdequant_with_minmax_per_layer(circle_const, scaling_factor, nudged_min); + auto quantparam = std::make_unique<CircleQuantParam>(); + quantparam->min.push_back(nudged_min); + quantparam->max.push_back(nudged_max); + quantparam->scale.push_back(scaling_factor); + quantparam->zerop.push_back(zp); + circle_node->quantparam(std::move(quantparam)); + } + } + } + return false; + } +}; + +} // namespace + +bool QuantizeDequantizeWeightsPass::run(loco::Graph *g) +{ + LOGGER(l); + INFO(l) << "QuantizeDequantizeWeightsPass Start" << std::endl; + + // Quantize weights + for (auto node : loco::active_nodes(loco::output_nodes(g))) + { + QuantizeDequantizeWeights qw(_input_dtype, _output_dtype, _granularity); + auto circle_node = loco::must_cast<luci::CircleNode *>(node); + circle_node->accept(&qw); + } + + INFO(l) << "QuantizeDequantizeWeightsPass End" << std::endl; + return false; // one time run +} + +} // namespace luci diff --git a/compiler/luci/pass/src/QuantizeWithMinMaxPass.cpp b/compiler/luci/pass/src/QuantizeWithMinMaxPass.cpp new file mode 100644 index 000000000..f8abee751 --- /dev/null +++ b/compiler/luci/pass/src/QuantizeWithMinMaxPass.cpp @@ -0,0 +1,551 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2019 The TensorFlow Authors. All Rights Reserved. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "luci/Pass/QuantizeWithMinMaxPass.h" +#include "QuantizationUtils.h" + +#include <luci/IR/CircleNodes.h> +#include <luci/IR/CircleNodeVisitor.h> +#include <luci/Log.h> + +#include <oops/UserExn.h> + +#include <iostream> +#include <cmath> + +namespace luci +{ + +namespace +{ + +// Check if the node is the bias of Conv2D, DepthwiseConv2D, or FullyConnected layer +// If true, return <input, weight> pair of the successor node (used to quantize bias) +// If flase, return <nullptr, nullptr> +std::pair<loco::Node *, loco::Node *> get_input_weight_of_bias(CircleNode *node) +{ + auto circle_const = dynamic_cast<CircleConst *>(node); + if (circle_const == nullptr) + return std::make_pair(nullptr, nullptr); + + auto succs = loco::succs(node); + if (succs.size() != 1) // assume bias is used by only one node + return std::make_pair(nullptr, nullptr); + + for (auto out : succs) + { + auto conv = dynamic_cast<CircleConv2D *>(out); + if (conv != nullptr && conv->bias() == circle_const) + { + assert(conv->input() != nullptr); + assert(conv->filter() != nullptr); + return std::make_pair(conv->input(), conv->filter()); + } + auto dw_conv = dynamic_cast<CircleDepthwiseConv2D *>(out); + if (dw_conv != nullptr && dw_conv->bias() == circle_const) + { + assert(dw_conv->input() != nullptr); + assert(dw_conv->filter() != nullptr); + return std::make_pair(dw_conv->input(), dw_conv->filter()); + } + auto fc = dynamic_cast<CircleFullyConnected *>(out); + if (fc != nullptr && fc->bias() == circle_const) + { + assert(fc->input() != nullptr); + 
assert(fc->weights() != nullptr); + return std::make_pair(fc->input(), fc->weights()); + } + } + return std::make_pair(nullptr, nullptr); +} + +void asym_quant_bias_per_layer(CircleConst *node, float input_scale, float weight_scale, + float *scaling_factor, int64_t *zp) +{ + float scale = input_scale * weight_scale; + const float scaling_factor_inv = (scale == 0) ? 0 : 1.0 / scale; + + uint32_t size = node->size<loco::DataType::FLOAT32>(); + std::vector<int32_t> quantized_values(size); + for (uint32_t i = 0; i < size; ++i) + { + quantized_values[i] = + static_cast<int32_t>(std::round(node->at<loco::DataType::FLOAT32>(i) * scaling_factor_inv)); + } + + node->dtype(loco::DataType::S32); // change the type of tensor + node->size<loco::DataType::S32>(size); // resize tensor + const int32_t kMinScale = std::numeric_limits<int32_t>::lowest(); + const int32_t kMaxScale = std::numeric_limits<int32_t>::max(); + for (uint32_t i = 0; i < size; ++i) + { + node->at<loco::DataType::S32>(i) = + std::min(kMaxScale, std::max(kMinScale, quantized_values[i])); + } + *scaling_factor = scale; + *zp = 0; +} + +void quant_bias_per_channel(CircleConst *node, float input_scale, std::vector<float> &weight_scale, + std::vector<float> &scaling_factor, std::vector<int64_t> &zp) +{ + float scaling_factor_inv{0}; + + uint32_t size = node->size<loco::DataType::FLOAT32>(); + std::vector<int32_t> quantized_values(size); + + for (uint32_t i = 0; i < size; ++i) + { + scaling_factor[i] = input_scale * weight_scale[i]; + scaling_factor_inv = (scaling_factor[i] == 0) ? 
0 : 1.0 / scaling_factor[i]; + quantized_values[i] = + static_cast<int32_t>(std::round(node->at<loco::DataType::FLOAT32>(i) * scaling_factor_inv)); + zp[i] = 0; + } + + node->dtype(loco::DataType::S32); // change the type of tensor + node->size<loco::DataType::S32>(size); // resize tensor + const int32_t kMinScale = std::numeric_limits<int32_t>::lowest(); + const int32_t kMaxScale = std::numeric_limits<int32_t>::max(); + for (uint32_t i = 0; i < size; ++i) + { + node->at<loco::DataType::S32>(i) = + std::min(kMaxScale, std::max(kMinScale, quantized_values[i])); + } +} + +bool has_min_max(const CircleNode *node) +{ + return node->quantparam() && !node->quantparam()->min.empty() && !node->quantparam()->max.empty(); +} + +bool is_quantized(const CircleNode *node) +{ + return node->dtype() == loco::DataType::U8 || // activation, weight + node->dtype() == loco::DataType::S32; // bias +} + +void sym_wquant_per_channel(CircleConst *node, std::vector<float> &scaling_factor) +{ + assert(node->dtype() == loco::DataType::FLOAT32); + + const int32_t kMaxScale = std::numeric_limits<int16_t>::max(); + const int32_t kMinScale = -kMaxScale; + + uint32_t size = node->size<loco::DataType::FLOAT32>(); + std::vector<int32_t> quantized_values(size); + + loco::TensorShape dimension; + dimension.rank(4); + uint32_t indices[4] = { + 0, + }; + int channel_dim_index{0}; + + if (!get_channel_dim_index(node, dimension, channel_dim_index)) + { + assert(false); + return; + } + + for (indices[0] = 0; indices[0] < dimension.dim(0).value(); indices[0]++) + { + for (indices[1] = 0; indices[1] < dimension.dim(1).value(); indices[1]++) + { + for (indices[2] = 0; indices[2] < dimension.dim(2).value(); indices[2]++) + { + for (indices[3] = 0; indices[3] < dimension.dim(3).value(); indices[3]++) + { + int channel_idx = indices[channel_dim_index]; + const float scaling_factor_inv = 1.0 / scaling_factor[channel_idx]; + auto data = node->at<loco::DataType::FLOAT32>(cal_offset(dimension, indices)); + 
quantized_values[cal_offset(dimension, indices)] = + static_cast<int32_t>(std::round(data * scaling_factor_inv)); + } + } + } + } + + node->dtype(loco::DataType::S16); // change the type of tensor + node->size<loco::DataType::S16>(size); // resize tensor + for (uint32_t i = 0; i < size; ++i) + { + node->at<loco::DataType::S16>(i) = + std::min(kMaxScale, std::max(kMinScale, quantized_values[i])); + } +} + +void asym_wquant_per_channel(CircleConst *node, std::vector<float> &min, + std::vector<float> &scaling_factor) +{ + assert(node->dtype() == loco::DataType::FLOAT32); + + const int32_t kMinScale = 0; + const int32_t kMaxScale = 255; + + uint32_t size = node->size<loco::DataType::FLOAT32>(); + std::vector<int32_t> quantized_values(size); + + loco::TensorShape dimension; + dimension.rank(4); + uint32_t indices[4] = { + 0, + }; + int channel_dim_index{0}; + + if (!get_channel_dim_index(node, dimension, channel_dim_index)) + { + assert(false); + return; + } + + for (indices[0] = 0; indices[0] < dimension.dim(0).value(); indices[0]++) + { + for (indices[1] = 0; indices[1] < dimension.dim(1).value(); indices[1]++) + { + for (indices[2] = 0; indices[2] < dimension.dim(2).value(); indices[2]++) + { + for (indices[3] = 0; indices[3] < dimension.dim(3).value(); indices[3]++) + { + int channel_idx = indices[channel_dim_index]; + const float scaling_factor_inv = 1.0 / scaling_factor[channel_idx]; + auto data = node->at<loco::DataType::FLOAT32>(cal_offset(dimension, indices)); + quantized_values[cal_offset(dimension, indices)] = + static_cast<int32_t>(std::round((data - min[channel_idx]) * scaling_factor_inv)); + } + } + } + } + + node->dtype(loco::DataType::U8); // change the type of tensor + node->size<loco::DataType::U8>(size); // resize tensor + for (uint32_t i = 0; i < size; ++i) + { + node->at<loco::DataType::U8>(i) = std::min(kMaxScale, std::max(kMinScale, quantized_values[i])); + } +} + +void asym_wquant_per_layer(CircleConst *node, float min, float scaling_factor) +{ + 
const int32_t kMinScale = 0; + const int32_t kMaxScale = 255; + + uint32_t size = node->size<loco::DataType::FLOAT32>(); + + const float scaling_factor_inv = 1.0 / scaling_factor; + std::vector<int32_t> quantized_values(size); + for (uint32_t i = 0; i < size; ++i) + { + auto data = node->at<loco::DataType::FLOAT32>(i); + quantized_values[i] = static_cast<int32_t>(std::round((data - min) * scaling_factor_inv)); + } + + node->dtype(loco::DataType::U8); // change the type of tensor + node->size<loco::DataType::U8>(size); // resize tensor + for (uint32_t i = 0; i < size; ++i) + { + node->at<loco::DataType::U8>(i) = std::min(kMaxScale, std::max(kMinScale, quantized_values[i])); + } +} + +// Check if node is weights of conv2d, depthwise_conv2d, or fully_connected layer +bool is_weights(CircleNode *node) +{ + auto circle_const = dynamic_cast<CircleConst *>(node); + if (circle_const == nullptr) + return false; + + auto succs = loco::succs(node); + if (succs.size() != 1) // assume weights is used by only one node + return false; + + for (auto out : succs) + { + auto conv = dynamic_cast<CircleConv2D *>(out); + if (conv != nullptr && conv->filter() == circle_const) + return true; + + auto dw_conv = dynamic_cast<CircleDepthwiseConv2D *>(out); + if (dw_conv != nullptr && dw_conv->filter() == circle_const) + return true; + + auto fc = dynamic_cast<CircleFullyConnected *>(out); + if (fc != nullptr && fc->weights() == circle_const) + return true; + } + return false; +} + +/** + * @brief QuantizeActivation quantizes tensors for activations + * @details Quantize using recorded min/max values + */ +struct QuantizeActivation final : public luci::CircleNodeMutableVisitor<bool> +{ + QuantizeActivation(loco::DataType input, loco::DataType output) + : input_type(input), output_type(output) + { + } + + loco::DataType input_type; + loco::DataType output_type; + + // Quantize input tensors of each node + bool visit(luci::CircleNode *node) + { + LOGGER(l); + INFO(l) << "QuantizeActivation 
visit node: " << node->name() << std::endl; + auto arity = node->arity(); + for (uint32_t i = 0; i < arity; i++) + { + auto input_node = node->arg(i); + auto circle_node = loco::must_cast<luci::CircleNode *>(input_node); + + // Check if this is already quantized + if (is_quantized(circle_node)) + continue; + + // Check if this is bias (bias is quantized later) + auto iw = get_input_weight_of_bias(circle_node); + if (iw.first != nullptr && iw.second != nullptr) + continue; + + // Check if this is activation + // We assume min/max are recorded only for activations + if (has_min_max(circle_node) && !is_weights(circle_node)) + { + // Quantize using recorded min/max + auto quantparam = circle_node->quantparam(); + assert(quantparam->min.size() == 1); // only support layer-wise quant + assert(quantparam->max.size() == 1); // only support layer-wise quant + auto min = quantparam->min[0]; + auto max = quantparam->max[0]; + + float scaling_factor{0}; + int64_t zp{0}; + float nudged_min{0}; + float nudged_max{0}; + + if (output_type == loco::DataType::U8) + { + compute_asym_scale_zp(min, max, scaling_factor, zp, nudged_min, nudged_max); + circle_node->dtype(loco::DataType::U8); + } + else + { + compute_sym_scale_zp(min, max, scaling_factor, zp, nudged_min, nudged_max); + circle_node->dtype(loco::DataType::S16); + } + + circle_node->quantparam()->max[0] = nudged_max; + circle_node->quantparam()->min[0] = nudged_min; + circle_node->quantparam()->scale.push_back(scaling_factor); + circle_node->quantparam()->zerop.push_back(zp); + } + } + return false; + } +}; + +struct QuantizeBias final : public luci::CircleNodeMutableVisitor<bool> +{ + QuantizeBias(loco::DataType input, loco::DataType output, QuantizationGranularity gr) + : input_type(input), output_type(output), granularity(gr) + { + } + + loco::DataType input_type; + loco::DataType output_type; + QuantizationGranularity granularity; + + // Quantize bias node + bool visit(luci::CircleNode *node) + { + // Check if this is 
already quantized + if (is_quantized(node)) + return false; + + // Check if this is bias + auto iw = get_input_weight_of_bias(node); + if (iw.first == nullptr || iw.second == nullptr) + return false; + + auto input = loco::must_cast<luci::CircleNode *>(iw.first); + auto weight = loco::must_cast<luci::CircleNode *>(iw.second); + + if (granularity == QuantizationGranularity::ChannelWise) + { + assert(input->quantparam()->scale.size() == 1); // input scale's layer-wise + auto input_scale = input->quantparam()->scale[0]; + + assert(weight->quantparam() != nullptr); // weight scale's channel-wise + auto weight_scale = weight->quantparam()->scale; + + auto circle_const = loco::must_cast<luci::CircleConst *>(node); + + uint32_t size = circle_const->size<loco::DataType::FLOAT32>(); + assert(size == weight_scale.size()); + std::vector<float> scaling_factor(size); + std::vector<int64_t> zp(size); + + quant_bias_per_channel(circle_const, input_scale, weight_scale, scaling_factor, zp); + + auto quantparam = std::make_unique<CircleQuantParam>(); + quantparam->scale = scaling_factor; + quantparam->zerop = zp; + assert(circle_const->quantparam() == nullptr); // bias should not be quantized before + circle_const->quantparam(std::move(quantparam)); + } + else + { + assert(input->quantparam()->scale.size() == 1); // Only support per-layer quant + auto input_scale = input->quantparam()->scale[0]; + + assert(weight->quantparam()->scale.size() == 1); // Only support per-layer quant + auto weight_scale = weight->quantparam()->scale[0]; + + auto circle_const = loco::must_cast<luci::CircleConst *>(node); + float scaling_factor{0}; + int64_t zp{0}; + asym_quant_bias_per_layer(circle_const, input_scale, weight_scale, &scaling_factor, &zp); + auto quantparam = std::make_unique<CircleQuantParam>(); + quantparam->scale.push_back(scaling_factor); + quantparam->zerop.push_back(zp); + assert(circle_const->quantparam() == nullptr); // bias should not be quantized before + 
circle_const->quantparam(std::move(quantparam)); + } + return false; + } +}; + +/** + * @brief QuantizeWeights quantizes tensors for weights + * @details Find min/max values on the fly and then quantize + */ +struct QuantizeWeights final : public luci::CircleNodeMutableVisitor<bool> +{ + QuantizeWeights(loco::DataType input, loco::DataType output, QuantizationGranularity gr) + : input_type(input), output_type(output), granularity(gr) + { + } + + loco::DataType input_type; + loco::DataType output_type; + QuantizationGranularity granularity; + + // Quantize input tensors of each node + bool visit(luci::CircleNode *node) + { + LOGGER(l); + INFO(l) << "QuantizeWeights visit node: " << node->name() << std::endl; + auto arity = node->arity(); + for (uint32_t i = 0; i < arity; i++) + { + auto input_node = node->arg(i); + auto circle_node = loco::must_cast<luci::CircleNode *>(input_node); + + // Check if this is already quantized + if (is_quantized(circle_node)) + continue; + + if (is_weights(circle_node)) + { + auto circle_const = loco::must_cast<luci::CircleConst *>(circle_node); + + // Find min/max per channel-wise + if (granularity == QuantizationGranularity::ChannelWise) + { + auto quantparam = circle_node->quantparam(); + assert(quantparam != nullptr); + auto min = quantparam->min; + auto scaling_factor = quantparam->scale; + + if (output_type == loco::DataType::U8) + { + asym_wquant_per_channel(circle_const, min, scaling_factor); + } + else + { + sym_wquant_per_channel(circle_const, scaling_factor); + } + } + // Find min/max per layer-wise + else + { + // Quantize using recorded quantparam + auto quantparam = circle_node->quantparam(); + assert(quantparam != nullptr); + assert(quantparam->min.size() == 1); // only support layer-wise quant + assert(quantparam->scale.size() == 1); // only support layer-wise quant + auto min = quantparam->min[0]; + auto scaling_factor = quantparam->scale[0]; + asym_wquant_per_layer(circle_const, min, scaling_factor); + } + } + } + 
return false; + } +}; + +} // namespace + +bool QuantizeWithMinMaxPass::run(loco::Graph *g) +{ + LOGGER(l); + INFO(l) << "QuantizeWithMinMaxPass Start" << std::endl; + + // Quantize activation + for (auto node : loco::active_nodes(loco::output_nodes(g))) + { + QuantizeActivation qa(_input_dtype, _output_dtype); + auto circle_node = loco::must_cast<luci::CircleNode *>(node); + circle_node->accept(&qa); + } + + // Quantize weights + for (auto node : loco::active_nodes(loco::output_nodes(g))) + { + QuantizeWeights qw(_input_dtype, _output_dtype, _granularity); + auto circle_node = loco::must_cast<luci::CircleNode *>(node); + circle_node->accept(&qw); + } + + // Quantize bias + for (auto node : loco::active_nodes(loco::output_nodes(g))) + { + QuantizeBias qb(_input_dtype, _output_dtype, _granularity); + auto circle_node = loco::must_cast<luci::CircleNode *>(node); + circle_node->accept(&qb); + } + + // Update output dtype + auto graph_outputs = g->outputs(); + for (auto node : loco::output_nodes(g)) + { + auto circle_node = loco::must_cast<luci::CircleOutput *>(node); + if (static_cast<luci::CircleNode *>(circle_node->from())->dtype() == _output_dtype) + { + circle_node->dtype(_output_dtype); + auto graph_output = graph_outputs->at(circle_node->index()); + graph_output->dtype(_output_dtype); + } + } + + INFO(l) << "QuantizeWithMinMaxPass End" << std::endl; + return false; // one time run +} + +} // namespace luci diff --git a/compiler/luci/pass/src/ResolveCustomOpAddPass.cpp b/compiler/luci/pass/src/ResolveCustomOpAddPass.cpp new file mode 100644 index 000000000..e52d667d7 --- /dev/null +++ b/compiler/luci/pass/src/ResolveCustomOpAddPass.cpp @@ -0,0 +1,124 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "luci/Pass/ResolveCustomOpAddPass.h" + +#include "flatbuffers/flexbuffers.h" + +#include <luci/IR/CircleNodes.h> +#include <luci/IR/AttrFusedActFunc.h> + +namespace +{ + +/// @brief Returns the index of BroadcastTo node among cop's inputs. +// NOTE This function assumes there is only one BroadcastTo node among its inputs. +int32_t get_broadcastTo_index_among_inputs_of(luci::CircleCustom *cop) +{ + for (uint32_t idx = 0; idx < cop->numInputs(); idx++) + { + auto input = dynamic_cast<const luci::CircleCustomOut *>(cop->inputs(idx)); + if (input) + { + auto broadcastTo = loco::must_cast<luci::CircleCustom *>(input->input()); + if (broadcastTo->custom_code() == "BroadcastTo") + return idx; + } + } + + return -1; +} + +/** BEFORE + * [CircleConst] + * | + * [CircleNode] [BroadcastTo(CircleCustom)] + * \ | + * \ [CircleCustomOut] + * \ / + * [AddV2(CircleCustom)] + * AFTER + * + * [CircleConst] [CircleNode] + * \ / + * \ / + * [CircleAdd] + */ +bool resolve_with_BroadcastTo(luci::CircleCustom *addv2) +{ + int32_t broadcastTo_idx = get_broadcastTo_index_among_inputs_of(addv2); + + if (broadcastTo_idx == -1) + return false; + + auto input = loco::must_cast<const luci::CircleCustomOut *>(addv2->inputs(broadcastTo_idx)); + auto broadcastTo = loco::must_cast<luci::CircleCustom *>(input->input()); + + auto add = addv2->graph()->nodes()->create<luci::CircleAdd>(); + add->fusedActivationFunction(luci::FusedActFunc::NONE); + add->x(addv2->inputs(1 - broadcastTo_idx)); + add->y(broadcastTo->inputs(0)); + auto customOut = loco::succs(addv2); + 
assert(customOut.size() == 1); + replace(*customOut.begin()).with(add); + + return true; +} + +bool resolve_custom_op(luci::CircleCustom *addv2) +{ + const std::string custom_code = addv2->custom_code(); + const std::vector<uint8_t> custom_options = addv2->custom_options(); + + if (custom_code != "AddV2") + return false; + + if (resolve_with_BroadcastTo(addv2)) + return true; + + auto add = addv2->graph()->nodes()->create<luci::CircleAdd>(); + add->fusedActivationFunction(luci::FusedActFunc::NONE); + add->x(addv2->inputs(0)); + add->y(addv2->inputs(1)); + auto customOut = loco::succs(addv2); + assert(customOut.size() == 1); + replace(*customOut.begin()).with(add); + + return true; +} + +} // namespace + +namespace luci +{ + +bool ResolveCustomOpAddPass::run(loco::Graph *g) +{ + bool changed = false; + + for (auto node : loco::active_nodes(loco::output_nodes(g))) + { + auto cop = dynamic_cast<luci::CircleCustom *>(node); + if (not cop) + continue; + + changed |= resolve_custom_op(cop); + } + + return changed; +} + +} // namespace luci diff --git a/compiler/luci/pass/src/ResolveCustomOpBatchMatMulPass.cpp b/compiler/luci/pass/src/ResolveCustomOpBatchMatMulPass.cpp new file mode 100644 index 000000000..145e9cb62 --- /dev/null +++ b/compiler/luci/pass/src/ResolveCustomOpBatchMatMulPass.cpp @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "luci/Pass/ResolveCustomOpBatchMatMulPass.h" + +#include "flatbuffers/flexbuffers.h" + +#include <luci/IR/CircleNodes.h> + +namespace +{ + +bool resolve_custom_op(luci::CircleCustom *cop) +{ + const std::string custom_code = cop->custom_code(); + const std::vector<uint8_t> custom_options = cop->custom_options(); + + if (custom_code == "BatchMatMulV2") + { + auto batch_matmul = cop->graph()->nodes()->create<luci::CircleBatchMatMul>(); + // input + batch_matmul->x(cop->inputs(0)); + batch_matmul->y(cop->inputs(1)); + // TODO find much better way of parsing custom_options + // adj + auto map = flexbuffers::GetRoot(custom_options).AsMap(); + batch_matmul->adj_x(map["adj_x"].AsBool()); + batch_matmul->adj_y(map["adj_y"].AsBool()); + + replace(cop).with(batch_matmul); + return true; + } + return false; +} + +} // namespace + +namespace luci +{ + +bool ResolveCustomOpBatchMatMulPass::run(loco::Graph *g) +{ + bool changed = false; + for (auto node : loco::active_nodes(loco::output_nodes(g))) + { + auto cop = dynamic_cast<luci::CircleCustom *>(node); + if (not cop) + continue; + + changed |= resolve_custom_op(cop); + } + + return changed; +} + +} // namespace luci diff --git a/compiler/luci/pass/src/ResolveCustomOpMatMulPass.cpp b/compiler/luci/pass/src/ResolveCustomOpMatMulPass.cpp new file mode 100644 index 000000000..547fd22fc --- /dev/null +++ b/compiler/luci/pass/src/ResolveCustomOpMatMulPass.cpp @@ -0,0 +1,185 @@ +/* + * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "luci/Pass/ResolveCustomOpMatMulPass.h" + +#include "flatbuffers/flexbuffers.h" +#include <loco/IR/DataTypeTraits.h> + +#include <luci/IR/CircleNodes.h> + +#include <loco.h> +#include <oops/InternalExn.h> +#include <loco/Service/ShapeInference.h> +#include <loco/Service/TypeInference.h> + +namespace +{ + +template <typename T> +luci::CircleConst *create_const_node(loco::Graph *g, const loco::DataType dtype, + const std::vector<uint32_t> &shape, + const std::vector<T> &values) +{ + auto node = g->nodes()->create<luci::CircleConst>(); + node->dtype(dtype); + node->rank(shape.size()); + + uint32_t size = 1; + for (uint32_t i = 0; i < shape.size(); ++i) + { + node->dim(i) = shape.at(i); + size *= shape.at(i); + } + +#define INIT_VALUES(DT) \ + { \ + node->size<DT>(size); \ + for (uint32_t i = 0; i < values.size(); ++i) \ + node->at<DT>(i) = values[i]; \ + } + + switch (dtype) + { + case loco::DataType::U8: + INIT_VALUES(loco::DataType::U8); + break; + case loco::DataType::S16: + INIT_VALUES(loco::DataType::S16); + break; + case loco::DataType::S32: + INIT_VALUES(loco::DataType::S32); + break; + case loco::DataType::FLOAT32: + INIT_VALUES(loco::DataType::FLOAT32) + break; + default: + INTERNAL_EXN("create_const_node called with unsupported type"); + break; + } + return node; +} + +bool resolve_matmul(luci::CircleCustom *cop) +{ +#define CHECK_OR_FALSE(condition) \ + if (not(condition)) \ + return false; +#define CHECK_OR_THROW(condition, message) \ + if (not(condition)) \ + INTERNAL_EXN(message); + + auto graph = cop->graph(); + const std::vector<uint8_t> custom_options = cop->custom_options(); + auto map = flexbuffers::GetRoot(custom_options).AsMap(); + const auto U8 = loco::DataType::U8; + const auto S16 = loco::DataType::S16; + const auto S32 = loco::DataType::S32; + const auto FLOAT32 = loco::DataType::FLOAT32; + + bool transpose_a = 
map["transpose_a"].AsBool(); + bool transpose_b = map["transpose_b"].AsBool(); + + loco::Node *lhs = cop->inputs(0); + loco::Node *rhs = cop->inputs(1); + + // Check that the type of the first input is known + CHECK_OR_FALSE(loco::dtype_known(lhs)); + auto lhs_dtype = loco::dtype_get(cop->inputs(0)); + + // If transpose of first input is requested, its shape must be known + CHECK_OR_FALSE(!transpose_a || loco::shape_known(lhs)); + // and its rank should be at least 2 + CHECK_OR_FALSE(!transpose_a || loco::shape_get(lhs).as<loco::TensorShape>().rank() >= 2); + // Check that the shape of the 2nd input is known + CHECK_OR_FALSE(loco::shape_known(rhs)); + // TODO as of 06/23/20 TFLite only supports rank 2 for 2nd input. Fix this once that changes! + CHECK_OR_FALSE(loco::shape_get(rhs).as<loco::TensorShape>().rank() == 2); + // Check that input data type is supported + CHECK_OR_THROW(lhs_dtype == U8 || lhs_dtype == S16 || lhs_dtype == FLOAT32, + "Only UInt8, Int16 and Float32 data types are supported by MatMul"); + + if (transpose_a) + { + auto a_shape = loco::shape_get(lhs).as<loco::TensorShape>(); + // Create a permutation constant node + std::vector<uint32_t> perm; + for (uint32_t i = 0; i < a_shape.rank(); ++i) + perm.push_back(i); + std::swap(perm[a_shape.rank() - 1], perm[a_shape.rank() - 2]); + auto perm_node = create_const_node(graph, S32, {a_shape.rank()}, perm); + // Now make a transpose node + auto transpose_node = graph->nodes()->create<luci::CircleTranspose>(); + transpose_node->a(lhs); + transpose_node->perm(perm_node); + lhs = transpose_node; + } + + // Transpose the second input if needed. TFLite FullyConnected operator + // assumes the second input is in column-major order, but the input is + // in row-major order, thus we need to convert between them. 
+ if (!transpose_b) + { + const std::vector<uint32_t> perm{1, 0}; + auto perm_node = create_const_node(graph, S32, {2}, perm); + auto transpose_node = graph->nodes()->create<luci::CircleTranspose>(); + transpose_node->a(rhs); + transpose_node->perm(perm_node); + rhs = transpose_node; + } + + // Make a constant zero-filled bias node + auto b_shape = loco::shape_get(cop->inputs(1)).as<loco::TensorShape>(); + uint32_t bias_size = b_shape.dim(transpose_b ? 1 : 0).value(); + const std::vector<float> val(bias_size, .0f); + auto bias_node = create_const_node(graph, lhs_dtype, {bias_size}, val); + auto fc_node = graph->nodes()->create<luci::CircleFullyConnected>(); + fc_node->input(lhs); + fc_node->weights(rhs); + fc_node->bias(bias_node); + fc_node->fusedActivationFunction(luci::FusedActFunc::NONE); + + replace(cop).with(fc_node); + return true; +} + +} // namespace + +namespace luci +{ + +bool ResolveCustomOpMatMulPass::run(loco::Graph *g) +{ + bool changed = false; + for (auto node : loco::active_nodes(loco::output_nodes(g))) + { + auto cop = dynamic_cast<luci::CircleCustom *>(node); + if (not cop) + continue; + + if (cop->custom_code() != "MatMul") + continue; + + if (!resolve_matmul(cop)) + continue; + + changed = true; + } + + return changed; +} + +} // namespace luci |