author     huifang <huifangzhang@openailab>    2018-01-31 19:00:24 +0800
committer  huifang <huifangzhang@openailab>    2018-01-31 19:00:24 +0800
commit     1224a143fc631f9f004881dba7a32c6f7ed5e1a5 (patch)
tree       2f41c58f0d40c9c76b4df1762c09597397629648
parent     7e51f0ff47b9a74f949f5b9ca448d56b6ba3eba5 (diff)
download   caffeonacl-1224a143fc631f9f004881dba7a32c6f7ed5e1a5.tar.gz
           caffeonacl-1224a143fc631f9f004881dba7a32c6f7ed5e1a5.tar.bz2
           caffeonacl-1224a143fc631f9f004881dba7a32c6f7ed5e1a5.zip
v0.5.0
-rw-r--r--  Makefile | 6
-rw-r--r--  Makefile.config.acl | 15
-rw-r--r--  README.md | 8
-rw-r--r--  acl_openailab/accuracy_report.pdf | bin 0 -> 949615 bytes
-rw-r--r--  acl_openailab/installation.md | 9
-rw-r--r--  acl_openailab/performance_report.pdf | bin 769748 -> 1433625 bytes
-rw-r--r--  acl_openailab/user_manual.pdf | bin 948890 -> 992120 bytes
-rw-r--r--  data/ilsvrc12/get_ilsvrc_aux.sh | 2
-rw-r--r--  examples/cpp_classification/classification_profiling_schedule.cpp | 547
-rw-r--r--  include/caffe/acl_layer.hpp | 278
-rw-r--r--  include/caffe/acl_operator.hpp | 718
-rw-r--r--  include/caffe/acl_tensor.hpp | 114
-rw-r--r--  include/caffe/layer.hpp | 1
-rw-r--r--  include/caffe/layers/acl_absval_layer.hpp | 7
-rw-r--r--  include/caffe/layers/acl_base_activation_layer.hpp | 10
-rw-r--r--  include/caffe/layers/acl_base_conv_layer.hpp | 61
-rw-r--r--  include/caffe/layers/acl_batch_norm_layer.hpp | 9
-rw-r--r--  include/caffe/layers/acl_bnll_layer.hpp | 7
-rw-r--r--  include/caffe/layers/acl_concat_layer.hpp | 12
-rw-r--r--  include/caffe/layers/acl_conv_layer.hpp | 72
-rw-r--r--  include/caffe/layers/acl_inner_product_layer.hpp | 10
-rw-r--r--  include/caffe/layers/acl_local_connect_layer.hpp | 10
-rw-r--r--  include/caffe/layers/acl_lrn_layer.hpp | 9
-rw-r--r--  include/caffe/layers/acl_pooling_layer.hpp | 9
-rw-r--r--  include/caffe/layers/acl_relu_layer.hpp | 5
-rw-r--r--  include/caffe/layers/acl_sigmoid_layer.hpp | 7
-rw-r--r--  include/caffe/layers/acl_softmax_layer.hpp | 9
-rw-r--r--  include/caffe/layers/acl_tanh_layer.hpp | 7
-rw-r--r--  src/caffe/acl_layer.cpp | 289
-rw-r--r--  src/caffe/acl_operator.cpp | 227
-rw-r--r--  src/caffe/acl_tensor.cpp | 138
-rw-r--r--  src/caffe/common.cpp | 16
-rw-r--r--  src/caffe/layer_factory.cpp | 2
-rw-r--r--  src/caffe/layers/acl_absval_layer.cpp | 19
-rw-r--r--  src/caffe/layers/acl_base_activation_layer.cpp | 71
-rw-r--r--  src/caffe/layers/acl_base_conv_layer.cpp | 222
-rw-r--r--  src/caffe/layers/acl_batch_norm_layer.cpp | 140
-rw-r--r--  src/caffe/layers/acl_bnll_layer.cpp | 19
-rw-r--r--  src/caffe/layers/acl_concat_layer.cpp | 97
-rw-r--r--  src/caffe/layers/acl_conv_layer.cpp | 147
-rw-r--r--  src/caffe/layers/acl_inner_product_layer.cpp | 111
-rw-r--r--  src/caffe/layers/acl_local_connect_layer.cpp | 149
-rw-r--r--  src/caffe/layers/acl_lrn_layer.cpp | 84
-rw-r--r--  src/caffe/layers/acl_pooling_layer.cpp | 115
-rw-r--r--  src/caffe/layers/acl_relu_layer.cpp | 35
-rw-r--r--  src/caffe/layers/acl_sigmoid_layer.cpp | 19
-rw-r--r--  src/caffe/layers/acl_softmax_layer.cpp | 64
-rw-r--r--  src/caffe/layers/acl_tanh_layer.cpp | 18
-rw-r--r--  tools/extra/tpi.py | 213
49 files changed, 2591 insertions, 1546 deletions
diff --git a/Makefile b/Makefile
index 2afeae34..8f0c65fa 100644
--- a/Makefile
+++ b/Makefile
@@ -329,7 +329,7 @@ ifeq ($(DEBUG), 1)
COMMON_FLAGS += -DDEBUG -g -O0
NVCCFLAGS += -G
else
- COMMON_FLAGS += -DNDEBUG -O2
+ COMMON_FLAGS += -DNDEBUG -O3
endif
# cuDNN acceleration configuration.
@@ -352,6 +352,10 @@ ifeq ($(USE_ACL), 1)
COMMON_FLAGS += -DUSE_ACL -std=c++11
endif
+ifeq ($(USE_OPENCL), 1)
+ COMMON_FLAGS += -DUSE_OPENCL
+endif
+
#USE_PROFILING -- get profiling information, controlled by LOGACL
#LAYER_PERF_STAT -- haitao's net profiling information
ifeq ($(USE_PROFILING), 1)
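The hunks above only flip build switches; the rest of the commit keys off the resulting preprocessor defines (USE_ACL, USE_OPENCL, USE_PROFILING, NDEBUG). A minimal standalone sketch, illustrative and not part of this commit, that reports which of those defines a translation unit was built with:

    #include <cstdio>

    // Illustrative only: shows how the Makefile switches above reach the sources
    // as preprocessor defines (-DUSE_ACL, -DUSE_OPENCL, -DUSE_PROFILING, -DNDEBUG).
    int main() {
    #ifdef USE_ACL
      std::puts("USE_ACL: ACL-backed layers compiled in (built with -std=c++11)");
    #endif
    #ifdef USE_OPENCL
      std::puts("USE_OPENCL: OpenCL/GPU code paths available");
    #endif
    #ifdef USE_PROFILING
      std::puts("USE_PROFILING: per-layer timing compiled in (see LOGACL / LAYER_PERF_STAT)");
    #endif
    #ifdef NDEBUG
      std::puts("Release build: -DNDEBUG -O3");
    #endif
      return 0;
    }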
diff --git a/Makefile.config.acl b/Makefile.config.acl
index f20326f9..8bbce0a2 100644
--- a/Makefile.config.acl
+++ b/Makefile.config.acl
@@ -1,18 +1,19 @@
## Refer to http://caffe.berkeleyvision.org/installation.html
# Contributions simplifying and improving our build system are welcome!
-AIDDIR=/usr/local/AID
-export PKG_CONFIG_PATH=${AIDDIR}/opencv3.3.0/lib/pkgconfig
-
# cuDNN acceleration switch (uncomment to build with cuDNN).
# USE_CUDNN := 1
+AIDDIR=/usr/local/AID
+export PKG_CONFIG_PATH=${AIDDIR}/opencv3.3.0/lib/pkgconfig
+
# CPU-only switch (uncomment to build without GPU support).
CPU_ONLY := 1
USE_PROFILING := 0
USE_ACL :=1
+USE_OPENCL:=1
ACL_ROOT :=$(AIDDIR)/ComputeLibrary
ACL_INCS :=$(ACL_ROOT)/include
ACL_INCS +=$(ACL_ROOT)
@@ -143,3 +144,11 @@ TEST_GPUID := 0
# enable pretty build (comment to see full commands)
Q ?= @
+
+ifeq ($(wildcard $(AIDDIR)),)
+ACL_ROOT :=$(shell pwd)/../ComputeLibrary
+ACL_INCS :=$(ACL_ROOT)/include
+ACL_INCS +=$(ACL_ROOT)
+ACL_LIBS_DIR :=$(ACL_ROOT)/build
+OPENCV_VERSION :=
+endif
diff --git a/README.md b/README.md
index c4c1deb0..cd1ffe27 100644
--- a/README.md
+++ b/README.md
@@ -1,5 +1,4 @@
-
# CaffeOnACL
[![License](https://img.shields.io/badge/license-BSD-blue.svg)](LICENSE)
@@ -11,9 +10,10 @@ The release version is 0.4.0, is based on [Rockchip RK3399](http://www.rock-chip
* Caffe is a fast open framework for deep learning. See also [Caffe](https://github.com/BVLC/caffe).
### Documents
-* [Installation instructions](https://github.com/OAID/CaffeOnACL/blob/master/acl_openailab/installation.md)
-* [User Manuals PDF](https://github.com/OAID/CaffeOnACL/blob/master/acl_openailab/user_manual.pdf)
-* [Performance Report PDF](https://github.com/OAID/CaffeOnACL/blob/master/acl_openailab/performance_report.pdf)
+* [Installation instructions](acl_openailab/installation.md)
+* [User Manuals PDF](acl_openailab/user_manual.pdf)
+* [Performance Report PDF](acl_openailab/performance_report.pdf)
+* [Accuracy Report PDF](acl_openailab/accuracy_report.pdf)
### Arm Compute Library Compatibility Issues :
There are some compatibility issues between ACL and Caffe layers; we bypass them to Caffe's original layer classes as a workaround for the issues below
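The bypass mentioned here is driven by a runtime bitmask: the new acl_operator.hpp later in this commit declares extern unsigned int bypass_acl_class_layer plus one FLAGS_ENABLE_ACL_* bit per layer type. A minimal sketch of that pattern follows; the bit values are copied from the header, while treating a set bit as "fall back to the original Caffe layer" and populating the mask from an environment variable are assumptions:

    #include <cstdio>

    // Bit values copied from include/caffe/acl_operator.hpp in this commit; how the mask
    // is populated at startup (e.g. from an environment variable) is an assumption here.
    static unsigned int bypass_acl_class_layer = 0;
    #define FLAGS_ENABLE_ACL_CONV   0x00000004
    #define FLAGS_ENABLE_ACL_CONCAT 0x00001000

    static bool use_acl(unsigned int flag) {
      // Assumed convention: a set bit routes that layer type back to the plain Caffe class.
      return (bypass_acl_class_layer & flag) == 0;
    }

    int main() {
      bypass_acl_class_layer |= FLAGS_ENABLE_ACL_CONCAT;  // e.g. work around an ACL concat issue
      std::printf("concat via ACL: %s\n", use_acl(FLAGS_ENABLE_ACL_CONCAT) ? "yes" : "no");
      std::printf("conv   via ACL: %s\n", use_acl(FLAGS_ENABLE_ACL_CONV) ? "yes" : "no");
      return 0;
    }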
diff --git a/acl_openailab/accuracy_report.pdf b/acl_openailab/accuracy_report.pdf
new file mode 100644
index 00000000..75d9c08f
--- /dev/null
+++ b/acl_openailab/accuracy_report.pdf
Binary files differ
diff --git a/acl_openailab/installation.md b/acl_openailab/installation.md
index 75bb62bb..f97c1040 100644
--- a/acl_openailab/installation.md
+++ b/acl_openailab/installation.md
@@ -21,10 +21,11 @@ This Installation will help you get started to setup CaffeOnACL on RK3399 quickl
wget --no-check-certificate https://github.com/opencv/opencv/archive/3.3.0.tar.gz
tar -xvf 3.3.0.tar.gz
#### Download "gen-pkg-config-pc"
- wget https://github.com/OAID/AID-tools/raw/master/script/gen-pkg-config-pc.sh
+ wget ftp://ftp.openailab.net/tools/script/gen-pkg-config-pc.sh
+ chmod +x ./gen-pkg-config-pc.sh
#### Download "ACL"
git clone https://github.com/ARM-software/ComputeLibrary.git
- git checkout bf8b01d
+ git checkout 48bc34e
#### Download "CaffeOnACL" :
git clone https://github.com/OAID/CaffeOnACL.git
#### Download "Googletest" :
@@ -45,14 +46,16 @@ This Installation will help you get started to setup CaffeOnACL on RK3399 quickl
mkdir build
aarch64-linux-gnu-gcc opencl-1.2-stubs/opencl_stubs.c -Iinclude -shared -o build/libOpenCL.so
scons Werror=1 -j4 debug=0 asserts=1 neon=1 opencl=1 embed_kernels=1 os=linux arch=arm64-v8a
- wget https://github.com/OAID/AID-tools/raw/master/script/Computelibrary/Makefile
+ wget ftp://ftp.openailab.net/tools/script/Computelibrary/Makefile
sudo make install
+ sudo ~/gen-pkg-config-pc.sh /usr/local/AID
## 3.3 Build Caffe :
cd ~/CaffeOnACL
make all
make distribute
sudo make install
+ sudo ~/gen-pkg-config-pc.sh /usr/local/AID
## 3.4 Build Unit tests
##### Build the gtest libraries
diff --git a/acl_openailab/performance_report.pdf b/acl_openailab/performance_report.pdf
index 7bffc1d8..4cc1b531 100644
--- a/acl_openailab/performance_report.pdf
+++ b/acl_openailab/performance_report.pdf
Binary files differ
diff --git a/acl_openailab/user_manual.pdf b/acl_openailab/user_manual.pdf
index 25530f04..aadfe38e 100644
--- a/acl_openailab/user_manual.pdf
+++ b/acl_openailab/user_manual.pdf
Binary files differ
diff --git a/data/ilsvrc12/get_ilsvrc_aux.sh b/data/ilsvrc12/get_ilsvrc_aux.sh
index dc0d0a72..835212bd 100644
--- a/data/ilsvrc12/get_ilsvrc_aux.sh
+++ b/data/ilsvrc12/get_ilsvrc_aux.sh
@@ -18,4 +18,4 @@ echo "Unzipping..."
tar -xf caffe_ilsvrc12.tar.gz && rm -f caffe_ilsvrc12.tar.gz
-echo "Done."
+echo "Done. " \ No newline at end of file
diff --git a/examples/cpp_classification/classification_profiling_schedule.cpp b/examples/cpp_classification/classification_profiling_schedule.cpp
new file mode 100644
index 00000000..91fff5f7
--- /dev/null
+++ b/examples/cpp_classification/classification_profiling_schedule.cpp
@@ -0,0 +1,547 @@
+#include <caffe/caffe.hpp>
+#ifdef USE_OPENCV
+#include <opencv2/core/core.hpp>
+#include <opencv2/highgui/highgui.hpp>
+#include <opencv2/imgproc/imgproc.hpp>
+#endif // USE_OPENCV
+#include <algorithm>
+#include <iosfwd>
+#include <memory>
+#include <string>
+#include <utility>
+#include <vector>
+
+#ifdef USE_PROFILING
+
+#include <iostream>
+
+#include <time.h>
+
+#define REPEAT_TEST
+
+unsigned long get_cur_time(void)
+{
+ struct timespec tm;
+
+ clock_gettime(CLOCK_MONOTONIC_COARSE, &tm);
+
+ return (tm.tv_sec*1000+tm.tv_nsec/1000000);
+}
+
+#endif //USE_PROFILING
+
+#ifdef USE_OPENCV
+using namespace caffe; // NOLINT(build/namespaces)
+using std::string;
+
+/* Pair (label, confidence) representing a prediction. */
+typedef std::pair<string, float> Prediction;
+
+class Classifier {
+ public:
+ Classifier(const string& model_file,
+ const string& trained_file,
+ const string& mean_file,
+ const string& label_file);
+
+ std::vector<Prediction> Classify(const cv::Mat& img, int N = 5);
+
+#ifdef USE_PROFILING
+
+#ifdef LAYER_PERF_STAT
+ void dump_perf_stat(void);
+ void dump_single_layer_io(int idx, Layer<float> * p_layer);
+ void dump_single_layer_perf(int idx, Layer<float> * p_layer,uint64_t total_net_time);
+#ifdef REPEAT_TEST
+ void collect_layer_stat(vector<vector<perf_stat> * > & all_stat);
+ void dump_all_stat(vector <vector<perf_stat>*>& all_stat);
+ void reset_layer_stat();
+#endif
+#endif
+
+#endif //USE_PROFILING
+
+ private:
+ void SetMean(const string& mean_file);
+
+ std::vector<float> Predict(const cv::Mat& img);
+
+ void WrapInputLayer(std::vector<cv::Mat>* input_channels);
+
+ void Preprocess(const cv::Mat& img,
+ std::vector<cv::Mat>* input_channels);
+
+ private:
+ shared_ptr<Net<float> > net_;
+ cv::Size input_geometry_;
+ int num_channels_;
+ cv::Mat mean_;
+ std::vector<string> labels_;
+};
+
+Classifier::Classifier(const string& model_file,
+ const string& trained_file,
+ const string& mean_file,
+ const string& label_file) {
+#ifdef CPU_ONLY
+ Caffe::set_mode(Caffe::CPU);
+#else
+ Caffe::set_mode(Caffe::GPU);
+#endif
+
+ AclEnableSchedule();
+ /* Load the network. */
+ net_.reset(new Net<float>(model_file, TEST));
+ net_->CopyTrainedLayersFrom(trained_file);
+
+ CHECK_EQ(net_->num_inputs(), 1) << "Network should have exactly one input.";
+ CHECK_EQ(net_->num_outputs(), 1) << "Network should have exactly one output.";
+
+ Blob<float>* input_layer = net_->input_blobs()[0];
+ num_channels_ = input_layer->channels();
+ CHECK(num_channels_ == 3 || num_channels_ == 1)
+ << "Input layer should have 1 or 3 channels.";
+ input_geometry_ = cv::Size(input_layer->width(), input_layer->height());
+
+ /* Load the binaryproto mean file. */
+ SetMean(mean_file);
+
+ /* Load labels. */
+ std::ifstream labels(label_file.c_str());
+ CHECK(labels) << "Unable to open labels file " << label_file;
+ string line;
+ while (std::getline(labels, line))
+ labels_.push_back(string(line));
+
+ Blob<float>* output_layer = net_->output_blobs()[0];
+ CHECK_EQ(labels_.size(), output_layer->channels())
+ << "Number of labels is different from the output layer dimension.";
+}
+
+static bool PairCompare(const std::pair<float, int>& lhs,
+ const std::pair<float, int>& rhs) {
+ return lhs.first > rhs.first;
+}
+
+/* Return the indices of the top N values of vector v. */
+static std::vector<int> Argmax(const std::vector<float>& v, int N) {
+ std::vector<std::pair<float, int> > pairs;
+ for (size_t i = 0; i < v.size(); ++i)
+ pairs.push_back(std::make_pair(v[i], i));
+ std::partial_sort(pairs.begin(), pairs.begin() + N, pairs.end(), PairCompare);
+
+ std::vector<int> result;
+ for (int i = 0; i < N; ++i)
+ result.push_back(pairs[i].second);
+ return result;
+}
+
+/* Return the top N predictions. */
+std::vector<Prediction> Classifier::Classify(const cv::Mat& img, int N) {
+ std::vector<float> output = Predict(img);
+
+ N = std::min<int>(labels_.size(), N);
+ std::vector<int> maxN = Argmax(output, N);
+ std::vector<Prediction> predictions;
+ for (int i = 0; i < N; ++i) {
+ int idx = maxN[i];
+ predictions.push_back(std::make_pair(labels_[idx], output[idx]));
+ }
+
+ return predictions;
+}
+
+/* Load the mean file in binaryproto format. */
+void Classifier::SetMean(const string& mean_file) {
+ BlobProto blob_proto;
+ ReadProtoFromBinaryFileOrDie(mean_file.c_str(), &blob_proto);
+
+ /* Convert from BlobProto to Blob<float> */
+ Blob<float> mean_blob;
+ mean_blob.FromProto(blob_proto);
+ CHECK_EQ(mean_blob.channels(), num_channels_)
+ << "Number of channels of mean file doesn't match input layer.";
+
+ /* The format of the mean file is planar 32-bit float BGR or grayscale. */
+ std::vector<cv::Mat> channels;
+ float* data = mean_blob.mutable_cpu_data();
+ for (int i = 0; i < num_channels_; ++i) {
+ /* Extract an individual channel. */
+ cv::Mat channel(mean_blob.height(), mean_blob.width(), CV_32FC1, data);
+ channels.push_back(channel);
+ data += mean_blob.height() * mean_blob.width();
+ }
+
+ /* Merge the separate channels into a single image. */
+ cv::Mat mean;
+ cv::merge(channels, mean);
+
+ /* Compute the global mean pixel value and create a mean image
+ * filled with this value. */
+ cv::Scalar channel_mean = cv::mean(mean);
+ mean_ = cv::Mat(input_geometry_, mean.type(), channel_mean);
+}
+
+std::vector<float> Classifier::Predict(const cv::Mat& img) {
+ Blob<float>* input_layer = net_->input_blobs()[0];
+ input_layer->Reshape(1, num_channels_,
+ input_geometry_.height, input_geometry_.width);
+ /* Forward dimension change to all layers. */
+ net_->Reshape();
+
+ std::vector<cv::Mat> input_channels;
+ WrapInputLayer(&input_channels);
+
+ Preprocess(img, &input_channels);
+
+#ifdef USE_PROFILING
+ unsigned long tstart=get_cur_time();
+#endif //USE_PROFILING
+
+ net_->Forward();
+
+#ifdef USE_PROFILING
+
+ unsigned long tend=get_cur_time();
+
+ std::cout<<"used time: "<<tend-tstart<<std::endl;
+
+#ifdef LAYER_PERF_STAT
+ dump_perf_stat();
+#ifdef REPEAT_TEST
+
+ reset_layer_stat();
+
+ vector<vector<perf_stat>* > all_stat;
+ int rep_number=10;
+
+ for(int i=0;i<rep_number;i++)
+ {
+ net_->Forward();
+ collect_layer_stat(all_stat);
+ reset_layer_stat();
+ }
+
+ //dump stats
+ dump_all_stat(all_stat);
+
+ for(int i=0;i<all_stat.size();i++)
+ delete all_stat[i];
+
+#endif //REPEAT_TEST
+#endif //LAYER_PERF_STAT
+#endif //USE_PROFILING
+
+ /* Copy the output layer to a std::vector */
+ Blob<float>* output_layer = net_->output_blobs()[0];
+ const float* begin = output_layer->cpu_data();
+ const float* end = begin + output_layer->channels();
+ return std::vector<float>(begin, end);
+}
+
+#ifdef USE_PROFILING
+
+#ifdef LAYER_PERF_STAT
+
+#ifdef REPEAT_TEST
+void Classifier::collect_layer_stat(vector<vector<perf_stat>*>& all_stat)
+{
+ vector<perf_stat > * p_stat;
+ perf_stat * p_time_stat;
+ const vector<shared_ptr<Layer<float> > >& layers=net_->layers();
+
+
+ p_stat=new vector<perf_stat>;
+
+ for (int i =0;i< layers.size(); i++) {
+ p_time_stat=layers[i]->get_time_stat();
+ p_stat->push_back(*p_time_stat);
+
+ }
+
+ all_stat.push_back(p_stat);
+}
+
+void Classifier::reset_layer_stat(void)
+{
+ const vector<shared_ptr<Layer<float> > >& layers=net_->layers();
+ perf_stat * p_time_stat;
+
+ for (int i =0;i< layers.size(); i++) {
+ p_time_stat=layers[i]->get_time_stat();
+
+ p_time_stat->count=0;
+ p_time_stat->total=0;
+ p_time_stat->used=p_time_stat->start=p_time_stat->end=0;
+ }
+}
+
+void Classifier::dump_all_stat(vector<vector<perf_stat>*>& all_stat)
+{
+
+ struct new_perf_stat {
+ perf_stat stat;
+ int idx;
+ };
+
+ vector<new_perf_stat > layer_stat;
+ perf_stat * p_stat;
+
+ uint64_t total_time=0;
+
+ layer_stat.resize(all_stat[0]->size());
+
+ for(int i=0;i<all_stat.size();i++)
+ {
+ for(int j=0;j<layer_stat.size();j++)
+ {
+ p_stat=&layer_stat[j].stat;
+
+ p_stat->total+=(*all_stat[i])[j].total;
+ p_stat->count+=(*all_stat[i])[j].count;
+ total_time+=(*all_stat[i])[j].total;
+ }
+ }
+
+ total_time=total_time/all_stat.size();
+
+ std::cout<<std::endl<<"----------------------------------"<<std::endl;
+ std::cout<<"STATS for "<<all_stat.size()<<" reptitions: ..."<<std::endl;
+ std::cout<<"Total time: "<<total_time<<" per forward"<<std::endl;
+ std::cout<<"Each layer stats: ..."<<std::endl;
+
+
+ for(int i=layer_stat.size()-1;i>=0;i--)
+ {
+ p_stat=&layer_stat[i].stat;
+
+ layer_stat[i].idx=i;
+
+ std::cout<<" "<<i<<": used time: "<<p_stat->total/all_stat.size();
+ std::cout<<" ratio: "<<((float)p_stat->total)/all_stat.size()/total_time*100;
+ std::cout<<" enter count: "<<p_stat->count/all_stat.size()<<std::endl;
+ }
+
+ std::cout<<std::endl;
+
+ std::cout<<"time cost top 10 layers are: ..."<<std::endl;
+
+ std::sort(layer_stat.begin(),layer_stat.end(),[](const new_perf_stat& a, const new_perf_stat& b)
+ {
+ if(a.stat.total>b.stat.total)
+ return true;
+ else
+ return false;
+ });
+
+ uint64_t top_total_time=0;
+
+ for(int i=0; i<10; i++)
+ {
+ p_stat=&layer_stat[i].stat;
+
+ std::cout<<" "<<layer_stat[i].idx<<": used time: "<<p_stat->total/all_stat.size();
+ std::cout<<" ratio: "<<((float)p_stat->total)/all_stat.size()/total_time*100;
+ std::cout<<" enter count: "<<p_stat->count/all_stat.size()<<std::endl;
+ top_total_time+=p_stat->total;
+ }
+
+ std::cout<<"Top cost layers occupied: "<<(float)top_total_time/all_stat.size()/total_time*100<<std::endl;
+
+ std::cout<<std::endl;
+}
+
+#endif
+
+void Classifier::dump_single_layer_io(int idx, Layer<float> * p_layer)
+{
+ const LayerParameter& layer_param=p_layer->layer_param();
+
+ std::cout<<std::endl<<"LAYER IDX: "<<idx<<" name: "<<layer_param.name();
+ std::cout<<" type: "<<layer_param.type()<<std::endl;
+
+ const vector<Blob<float>*> *p_bottom_vec=p_layer->saved_bottom;
+
+ for(int i=0;i<layer_param.bottom_size(); i++)
+ {
+ std::cout<<"bottom "<<layer_param.bottom(i)<<": ";
+
+ Blob<float> * p_blob=(*p_bottom_vec)[i];
+
+ for(int j=0;j<p_blob->num_axes();j++)
+ {
+ std::cout<<p_blob->shape(j)<<" ";
+ }
+ std::cout<<std::endl;
+ }
+
+ const vector<Blob<float>*> *p_top_vec=p_layer->saved_top;
+ for(int i=0;i<layer_param.top_size(); i++)
+ {
+ std::cout<<"top "<<layer_param.top(i)<<": ";
+ Blob<float> * p_blob=(*p_top_vec)[i];
+
+ for(int j=0;j<p_blob->num_axes();j++)
+ {
+ std::cout<<p_blob->shape(j)<<" ";
+ }
+ std::cout<<std::endl;
+ }
+}
+
+void Classifier::dump_single_layer_perf(int idx, Layer<float> * p_layer, uint64_t total_net_time)
+{
+ const LayerParameter& layer_param=p_layer->layer_param();
+ perf_stat * p_time_stat;
+
+ p_time_stat=p_layer->get_time_stat();
+
+ std::cout<<std::endl<<"LAYER IDX: "<<idx<<" name: "<<layer_param.name();
+ std::cout<<" type: "<<layer_param.type();
+ std::cout<<" ratio: "<<(float)p_time_stat->total/total_net_time*100<<std::endl;
+
+
+ std::cout<<"time stat: total: "<<p_time_stat->total<<" count: "<<p_time_stat->count;
+ if(p_time_stat->count)
+ {
+ std::cout<<" average: "<<((float)p_time_stat->total)/p_time_stat->count;
+ }
+
+ std::cout<<" start: "<<p_time_stat->start<<" end: "<<p_time_stat->end;
+ std::cout<<std::endl;
+
+
+}
+
+void Classifier::dump_perf_stat(void)
+{
+ uint64_t total_net_time=0;
+
+ const vector<shared_ptr<Layer<float> > >& layers=net_->layers();
+
+ std::cout<<"Input/output shape for each layer ... total: "<<layers.size()<<std::endl;
+
+ for (int i = layers.size() - 1; i >= 0; --i) {
+ dump_single_layer_io(i,layers[i].get());
+ }
+
+
+ for (int i = layers.size() - 1; i >= 0; --i) {
+
+ perf_stat * p_time_stat;
+
+ p_time_stat=layers[i]->get_time_stat();
+
+ total_net_time+=p_time_stat->total;
+
+ }
+
+ std::cout<<"Time for each layer ... sum of all layers is : ";
+ std::cout<<total_net_time<<std::endl;
+
+ for (int i = layers.size() - 1; i >= 0; --i) {
+
+ dump_single_layer_perf(i,layers[i].get(),total_net_time);
+ }
+
+}
+
+#endif
+
+#endif //USE_PROFILING
+
+/* Wrap the input layer of the network in separate cv::Mat objects
+ * (one per channel). This way we save one memcpy operation and we
+ * don't need to rely on cudaMemcpy2D. The last preprocessing
+ * operation will write the separate channels directly to the input
+ * layer. */
+void Classifier::WrapInputLayer(std::vector<cv::Mat>* input_channels) {
+ Blob<float>* input_layer = net_->input_blobs()[0];
+
+ int width = input_layer->width();
+ int height = input_layer->height();
+ float* input_data = input_layer->mutable_cpu_data();
+ for (int i = 0; i < input_layer->channels(); ++i) {
+ cv::Mat channel(height, width, CV_32FC1, input_data);
+ input_channels->push_back(channel);
+ input_data += width * height;
+ }
+}
+
+void Classifier::Preprocess(const cv::Mat& img,
+ std::vector<cv::Mat>* input_channels) {
+ /* Convert the input image to the input image format of the network. */
+ cv::Mat sample;
+ if (img.channels() == 3 && num_channels_ == 1)
+ cv::cvtColor(img, sample, cv::COLOR_BGR2GRAY);
+ else if (img.channels() == 4 && num_channels_ == 1)
+ cv::cvtColor(img, sample, cv::COLOR_BGRA2GRAY);
+ else if (img.channels() == 4 && num_channels_ == 3)
+ cv::cvtColor(img, sample, cv::COLOR_BGRA2BGR);
+ else if (img.channels() == 1 && num_channels_ == 3)
+ cv::cvtColor(img, sample, cv::COLOR_GRAY2BGR);
+ else
+ sample = img;
+
+ cv::Mat sample_resized;
+ if (sample.size() != input_geometry_)
+ cv::resize(sample, sample_resized, input_geometry_);
+ else
+ sample_resized = sample;
+
+ cv::Mat sample_float;
+ if (num_channels_ == 3)
+ sample_resized.convertTo(sample_float, CV_32FC3);
+ else
+ sample_resized.convertTo(sample_float, CV_32FC1);
+
+ cv::Mat sample_normalized;
+ cv::subtract(sample_float, mean_, sample_normalized);
+
+ /* This operation will write the separate BGR planes directly to the
+ * input layer of the network because it is wrapped by the cv::Mat
+ * objects in input_channels. */
+ cv::split(sample_normalized, *input_channels);
+
+ CHECK(reinterpret_cast<float*>(input_channels->at(0).data)
+ == net_->input_blobs()[0]->cpu_data())
+ << "Input channels are not wrapping the input layer of the network.";
+}
+
+int main(int argc, char** argv) {
+ if (argc != 6) {
+ std::cerr << "Usage: " << argv[0]
+ << " deploy.prototxt network.caffemodel"
+ << " mean.binaryproto labels.txt img.jpg" << std::endl;
+ return 1;
+ }
+
+ ::google::InitGoogleLogging(argv[0]);
+
+ string model_file = argv[1];
+ string trained_file = argv[2];
+ string mean_file = argv[3];
+ string label_file = argv[4];
+ Classifier classifier(model_file, trained_file, mean_file, label_file);
+
+ string file = argv[5];
+
+ std::cout << "---------- Prediction for "
+ << file << " ----------" << std::endl;
+
+ cv::Mat img = cv::imread(file, -1);
+ CHECK(!img.empty()) << "Unable to decode image " << file;
+ std::vector<Prediction> predictions = classifier.Classify(img);
+
+ /* Print the top N predictions. */
+ for (size_t i = 0; i < predictions.size(); ++i) {
+ Prediction p = predictions[i];
+ std::cout << std::fixed << std::setprecision(4) << p.second << " - \""
+ << p.first << "\"" << std::endl;
+ }
+}
+#else
+int main(int argc, char** argv) {
+ LOG(FATAL) << "This example requires OpenCV; compile with USE_OPENCV.";
+}
+#endif // USE_OPENCV
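With LAYER_PERF_STAT enabled, the example above repeats the forward pass rep_number (10) times, snapshots every layer's perf_stat after each run, and prints per-layer averages plus the ten most expensive layers. A condensed sketch of that aggregation, assuming a perf_stat layout of {start, end, used, total, count} as implied by reset_layer_stat() above:

    #include <cstdint>
    #include <cstdio>
    #include <vector>

    // Assumed field layout, matching the members touched in reset_layer_stat().
    struct perf_stat { uint64_t start, end, used, total, count; };

    int main() {
      // One snapshot of a single layer per repeated forward pass (times are illustrative).
      std::vector<perf_stat> reps = {{0, 0, 0, 1200, 1}, {0, 0, 0, 1100, 1}, {0, 0, 0, 1300, 1}};

      uint64_t total = 0, count = 0;
      for (const perf_stat& s : reps) { total += s.total; count += s.count; }

      // Same arithmetic as dump_all_stat(): averages are taken over the number of repetitions.
      std::printf("avg time per forward: %llu\n", (unsigned long long)(total / reps.size()));
      std::printf("avg enter count:      %llu\n", (unsigned long long)(count / reps.size()));
      return 0;
    }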
diff --git a/include/caffe/acl_layer.hpp b/include/caffe/acl_layer.hpp
deleted file mode 100644
index b188bb8c..00000000
--- a/include/caffe/acl_layer.hpp
+++ /dev/null
@@ -1,278 +0,0 @@
-#ifndef CAFFE_ACL_LAYER_HPP_
-#define CAFFE_ACL_LAYER_HPP_
-
-#ifdef USE_ACL
-#include "arm_compute/runtime/NEON/functions/NEConvolutionLayer.h"
-#include "arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h"
-#include "arm_compute/runtime/CL/functions/CLConvolutionLayer.h"
-#include "arm_compute/runtime/NEON/functions/NEActivationLayer.h"
-#include "arm_compute/runtime/CL/functions/CLActivationLayer.h"
-#include "arm_compute/runtime/NEON/functions/NENormalizationLayer.h"
-#include "arm_compute/runtime/CL/functions/CLNormalizationLayer.h"
-#include "arm_compute/runtime/NEON/functions/NEPoolingLayer.h"
-#include "arm_compute/runtime/CL/functions/CLPoolingLayer.h"
-#include "arm_compute/runtime/NEON/functions/NESoftmaxLayer.h"
-#include "arm_compute/runtime/CL/functions/CLSoftmaxLayer.h"
-#include "arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h"
-#include "arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h"
-#include "arm_compute/runtime/NEON/functions/NELocallyConnectedLayer.h"
-#include "arm_compute/runtime/CL/functions/CLLocallyConnectedLayer.h"
-#include "arm_compute/runtime/NEON/functions/NEBatchNormalizationLayer.h"
-#include "arm_compute/runtime/CL/functions/CLBatchNormalizationLayer.h"
-#include "arm_compute/core/NEON/kernels/NEDepthConcatenateKernel.h"
-#include "arm_compute/runtime/NEON/functions/NEDepthConcatenate.h"
-#include "arm_compute/core/CL/kernels/CLDepthConcatenateKernel.h"
-#include "arm_compute/runtime/CL/functions/CLDepthConcatenate.h"
-#include "arm_compute/runtime/CL/CLTensor.h"
-#include "arm_compute/runtime/Tensor.h"
-#include "arm_compute/runtime/CL/CLScheduler.h"
-using namespace arm_compute;
-#define FLAGS_ENABLE_ACL_ABSVAL 0x00000001
-#define FLAGS_ENABLE_ACL_BNLL 0x00000002
-#define FLAGS_ENABLE_ACL_CONV 0x00000004
-#define FLAGS_ENABLE_ACL_FC 0x00000008
-#define FLAGS_ENABLE_ACL_LRN 0x00000010
-#define FLAGS_ENABLE_ACL_POOLING 0x00000020
-#define FLAGS_ENABLE_ACL_RELU 0x00000040
-#define FLAGS_ENABLE_ACL_SIGMOID 0x00000080
-#define FLAGS_ENABLE_ACL_SOFTMAX 0x00000100
-#define FLAGS_ENABLE_ACL_TANH 0x00000200
-#define FLAGS_ENABLE_ACL_LC 0x00000400
-#define FLAGS_ENABLE_ACL_BN 0x00000800
-#define FLAGS_ENABLE_ACL_CONCAT 0x00001000
-extern unsigned int bypass_acl_class_layer;
-#endif
-#ifdef USE_PROFILING
-#include "layer.hpp"
-
-#define MASK_LOG_APP_TIME 0x00000001
-#define MASK_LOG_ALLOCATE 0x00000002
-#define MASK_LOG_RUN 0x00000004
-#define MASK_LOG_CONFIG 0x00000008
-#define MASK_LOG_COPY 0x00000010
-#define MASK_LOG_ABSVAL 0x00000020
-#define MASK_LOG_BNLL 0x00000040
-#define MASK_LOG_CONV 0x00000080
-#define MASK_LOG_FC 0x00000100
-#define MASK_LOG_LRN 0x00000200
-#define MASK_LOG_POOLING 0x00000400
-#define MASK_LOG_RELU 0x00000800
-#define MASK_LOG_SIGMOID 0x00001000
-#define MASK_LOG_SOFTMAX 0x00002000
-#define MASK_LOG_TANH 0x00004000
-#define MASK_LOG_LC 0x00008000
-#define MASK_LOG_BN 0x00010000
-#define MASK_LOG_CONCAT 0x00020000
-#define APP_TIME_INFO MASK_LOG_APP_TIME,"time: \t"
-#define ACL_ALLOCATE_INFO MASK_LOG_ALLOCATE,"allocate: \t\t"
-#define ACL_RUN_INFO MASK_LOG_RUN, "run: \t\t\t"
-#define ACL_CONFIG_INFO MASK_LOG_CONFIG, "configure: \t\t\t\t"
-#define ACL_COPY_INFO MASK_LOG_COPY, "tensor_copy:\t\t\t\t\t"
-#define ACL_ABSVAL_INFO MASK_LOG_ABSVAL, "ACL_ABSVAL :\t\t\t\t\t\t"
-#define ACL_BNLL_INFO MASK_LOG_BNLL, "ACL_BNLL :\t\t\t\t\t\t\t"
-#define ACL_CONV_INFO MASK_LOG_CONV, "ACL_CONV :\t\t\t\t\t\t\t\t"
-#define ACL_FC_INFO MASK_LOG_FC, "ACL_FC :\t\t\t\t\t\t\t\t\t"
-#define ACL_LRN_INFO MASK_LOG_LRN, "ACL_LRN :\t\t\t\t\t\t\t\t\t\t"
-#define ACL_POOLING_INFO MASK_LOG_POOLING, "ACL_POOLING:\t\t\t\t\t\t\t\t\t\t\t"
-#define ACL_RELU_INFO MASK_LOG_RELU, "ACL_RELU :\t\t\t\t\t\t\t\t\t\t\t\t"
-#define ACL_SIGMOID_INFO MASK_LOG_SIGMOID, "ACL_SIGMOID:\t\t\t\t\t\t\t\t\t\t\t\t\t"
-#define ACL_SOFTMAX_INFO MASK_LOG_SOFTMAX, "ACL_SOFTMAX:\t\t\t\t\t\t\t\t\t\t\t\t\t\t"
-#define ACL_TANH_INFO MASK_LOG_TANH, "ACL_TANH :\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t"
-#define ACL_LC_INFO MASK_LOG_LC, "ACL_LC :\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t"
-#define ACL_BN_INFO MASK_LOG_BN, "ACL_BN :\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t"
-#define ACL_CONCAT_INFO MASK_LOG_CONCAT, "ACL_CONCAT :\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t"
-extern unsigned int acl_log_flags;
-#endif //USE_PROFILING
-namespace caffe {
-#ifdef USE_ACL
-enum TensorType{
- tensor_input,
- tensor_output,
- tensor_weights,
- tensor_biases,
- tensor_data,
-};
-template <typename ACLTensor>
-class BaseTensor:public ACLTensor{
-public:
- BaseTensor(bool share)
- :share_(share),type_(tensor_input),allocate_(false){
- }
- virtual void bindmem(void *mem,bool share){
- mem_=mem;
- share_=share;
- }
- virtual void settensortype(TensorType type){
- type_=type;
- };
- virtual void map(bool blocking = true){}
- virtual void unmap(){}
- virtual void commit(TensorType type=tensor_data);
- int tensor_copy(void * mem, bool toTensor=true);
-protected:
- void* mem_;
- bool share_;
- TensorType type_;
- bool allocate_;
-};
-class GPUTensor:public BaseTensor<CLTensor>{
-public:
- explicit GPUTensor(bool share)
- :BaseTensor(share){}
- virtual void map(bool blocking = true){
- if (!allocate_){
- CLTensor::allocator()->allocate();
- allocate_=true;
- }
- CLTensor::map(blocking);
- }
- virtual void unmap(){
- CLTensor::unmap();
- }
-};
-class CPUTensor:public BaseTensor<Tensor>{
-public:
- explicit CPUTensor(bool share)
- :BaseTensor(share){}
- virtual void map(bool blocking = true){
- if (!allocate_){
- Tensor::allocator()->allocate();
- allocate_=true;
- }
- }
- virtual void unmap(){
- }
-};
-template <typename ACLLayer,typename ACLTensor>
-class ACLXPUBaseLayer{
-public:
- virtual void commit(){
- if (input) {
- input->commit(tensor_input);
- }
- if (output){
- output->commit(tensor_output);
- }
- if (weights){
- weights->commit(tensor_weights);
- }
- if (biases){
- biases->commit(tensor_biases);
- }
- }
- virtual void run(bool gpu){
- commit();
-#ifdef USE_PROFILING
- logtime_util log_time(ACL_RUN_INFO);
-#endif //USE_PROFILING
- layer->run();
- if (gpu) {
- // Make sure all the OpenCL jobs are done executing:
- CLScheduler::get().sync();
- }
- }
- virtual bool reshape(TensorShape &shape,TensorType type);
- explicit ACLXPUBaseLayer(){
- layer=nullptr;
- input=nullptr;
- output=nullptr;
- weights=nullptr;
- biases=nullptr;
- mean=nullptr;
- var=nullptr;
- beta=nullptr;
- gamma=nullptr;
-#ifdef USE_CONV_CACHE
- for(int i = 0; i < 16; ++i){
- cache.layer[i] = nullptr;
- cache.input[i] = nullptr;
- cache.output[i] = nullptr;
- cache.weights[i] = nullptr;
- cache.biases[i] = nullptr;
- }
-#endif //USE_CONV_CACHE
- }
- virtual void freelayer(){
-#ifndef USE_CONV_CACHE
- if (layer) delete layer;
- if (input) delete input;
- if (output) delete output;
- if (weights) delete weights;
- if (biases) delete biases;
- if (mean) delete mean;
- if (var) delete var;
- if (beta) delete beta;
- if (gamma) delete gamma;
-#endif //USE_CONV_CACHE
- layer=nullptr;
- input=nullptr;
- output=nullptr;
- weights=nullptr;
- biases=nullptr;
- mean=nullptr;
- var=nullptr;
- beta=nullptr;
- gamma=nullptr;
- }
- virtual ~ACLXPUBaseLayer(){
- freelayer();
- }
- ACLLayer *layer;
- ACLTensor *input;
- ACLTensor *output;
- ACLTensor *weights;
- ACLTensor *biases;
- //for BN
- ACLTensor *mean;
- ACLTensor *var;
- ACLTensor *beta;
- ACLTensor *gamma;
-#ifdef USE_CONV_CACHE
- struct{
- ACLLayer *layer[16];
- ACLTensor *input[16];
- ACLTensor *output[16];
- ACLTensor *weights[16];
- ACLTensor *biases[16];
- }cache;
-#endif //USE_CONV_CACHE
-};
-template <typename GPULayer, typename CPULayer>
-class ACLBaseLayer {
-public:
- explicit ACLBaseLayer();
- virtual void gpu_run();
- virtual void cpu_run();
- virtual ~ACLBaseLayer();
- virtual GPULayer * new_gpulayer();
- virtual CPULayer * new_cpulayer();
- ACLXPUBaseLayer<GPULayer,GPUTensor>& gpu(){
- return gpu_;
- }
- ACLXPUBaseLayer<CPULayer,CPUTensor>& cpu(){
- return cpu_;
- }
- bool checkreshape(TensorShape shape,bool gpu=false, TensorType type=tensor_input);
- template <typename ACLTensor> bool tensor_mem(ACLTensor *tensor,void *mem,bool share=false);
- template <typename ACLTensor> bool tensor_mem(void *mem,ACLTensor *tensor,bool share=false);
- template <typename ACLTensor> bool new_tensor(ACLTensor *&tensor,TensorShape shape,void *mem=nullptr,bool share=false);
-protected:
- ACLXPUBaseLayer<GPULayer,GPUTensor> gpu_;
- ACLXPUBaseLayer<CPULayer,CPUTensor> cpu_;
- bool init_layer_;
- bool force_bypass_acl_path_;
-
-};
-
-#endif
-}
-#define INSTANTIATE_ACLBASECLASS(GPULayer,CPULayer) \
- template class ACLBaseLayer<GPULayer,CPULayer>;
-
-#define INSTANTIATE_ACLBASE_FUNCTION(GPULayer,CPULayer,ACLTensor) \
- template bool ACLBaseLayer<GPULayer,CPULayer>::tensor_mem(ACLTensor *tensor,void *mem,bool share); \
- template bool ACLBaseLayer<GPULayer,CPULayer>::tensor_mem(void *mem,ACLTensor *tensor,bool share); \
- template bool ACLBaseLayer<GPULayer,CPULayer>::new_tensor(ACLTensor *&tensor,TensorShape shape,void *mem,bool share); \
-
-
-#endif
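The ACLBaseLayer/ACLXPUBaseLayer template pair deleted above is replaced by the single ACLOperator class in the new acl_operator.hpp below, which keeps all of an operator's tensors in one pool and fetches them by (TensorType, index) through tensor_item(). A standalone sketch of that lookup pattern with simplified stand-in types (not part of the commit):

    #include <cassert>
    #include <memory>
    #include <vector>

    enum TensorType { tensor_input, tensor_output, tensor_weights };

    struct Slot {                    // stands in for TensorPair<ACLTensor>
      TensorType type;
      std::unique_ptr<int> tensor;   // stand-in for std::unique_ptr<ACLTensor>
    };

    // Same behaviour as tensor_item(): return the idx-th tensor of the requested type,
    // appending an empty slot of that type when none exists yet.
    std::unique_ptr<int>& slot(std::vector<Slot>& pool, TensorType type, int idx) {
      int count = 0;
      for (Slot& item : pool) {
        if (item.type == type) {
          if (idx == count) return item.tensor;
          ++count;
        }
      }
      pool.push_back(Slot{type, nullptr});
      return pool.back().tensor;
    }

    int main() {
      std::vector<Slot> pool;
      slot(pool, tensor_input, 0) = std::unique_ptr<int>(new int(42));  // first lookup creates the slot
      assert(*slot(pool, tensor_input, 0) == 42);                       // second lookup hits the same slot
      return 0;
    }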
diff --git a/include/caffe/acl_operator.hpp b/include/caffe/acl_operator.hpp
new file mode 100644
index 00000000..90051054
--- /dev/null
+++ b/include/caffe/acl_operator.hpp
@@ -0,0 +1,718 @@
+#ifndef CAFFE_ACL_LAYER_HPP_
+#define CAFFE_ACL_LAYER_HPP_
+
+#ifdef USE_ACL
+#include "arm_compute/runtime/NEON/functions/NEDepthConcatenateLayer.h"
+#include "arm_compute/runtime/NEON/functions/NEConvolutionLayer.h"
+#include "arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h"
+#include "arm_compute/runtime/NEON/functions/NEActivationLayer.h"
+#include "arm_compute/runtime/NEON/functions/NENormalizationLayer.h"
+#include "arm_compute/runtime/NEON/functions/NEPoolingLayer.h"
+#include "arm_compute/runtime/NEON/functions/NESoftmaxLayer.h"
+#include "arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h"
+#include "arm_compute/runtime/NEON/functions/NELocallyConnectedLayer.h"
+#include "arm_compute/runtime/NEON/functions/NEBatchNormalizationLayer.h"
+#include "arm_compute/runtime/Tensor.h"
+
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/runtime/CL/functions/CLDepthConcatenateLayer.h"
+#include "arm_compute/runtime/CL/functions/CLDirectConvolutionLayer.h"
+#include "arm_compute/runtime/CL/functions/CLConvolutionLayer.h"
+#include "arm_compute/runtime/CL/functions/CLActivationLayer.h"
+#include "arm_compute/runtime/CL/functions/CLNormalizationLayer.h"
+#include "arm_compute/runtime/CL/functions/CLPoolingLayer.h"
+#include "arm_compute/runtime/CL/functions/CLSoftmaxLayer.h"
+#include "arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h"
+#include "arm_compute/runtime/CL/functions/CLLocallyConnectedLayer.h"
+#include "arm_compute/runtime/CL/functions/CLBatchNormalizationLayer.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "acl_tensor.hpp"
+#include "caffe/common.hpp"
+#include "caffe/layer.hpp"
+#define FLAGS_ENABLE_ACL_ABSVAL 0x00000001
+#define FLAGS_ENABLE_ACL_BNLL 0x00000002
+#define FLAGS_ENABLE_ACL_CONV 0x00000004
+#define FLAGS_ENABLE_ACL_FC 0x00000008
+#define FLAGS_ENABLE_ACL_LRN 0x00000010
+#define FLAGS_ENABLE_ACL_POOLING 0x00000020
+#define FLAGS_ENABLE_ACL_RELU 0x00000040
+#define FLAGS_ENABLE_ACL_SIGMOID 0x00000080
+#define FLAGS_ENABLE_ACL_SOFTMAX 0x00000100
+#define FLAGS_ENABLE_ACL_TANH 0x00000200
+#define FLAGS_ENABLE_ACL_LC 0x00000400
+#define FLAGS_ENABLE_ACL_BN 0x00000800
+#define FLAGS_ENABLE_ACL_CONCAT 0x00001000
+extern unsigned int bypass_acl_class_layer;
+extern unsigned int openailab_intfp;
+#endif
+#ifdef USE_PROFILING
+#include "layer.hpp"
+
+#define MASK_LOG_APP_TIME 0x00000001
+#define MASK_LOG_ALLOCATE 0x00000002
+#define MASK_LOG_RUN 0x00000004
+#define MASK_LOG_CONFIG 0x00000008
+#define MASK_LOG_COPY 0x00000010
+#define MASK_LOG_ABSVAL 0x00000020
+#define MASK_LOG_BNLL 0x00000040
+#define MASK_LOG_CONV 0x00000080
+#define MASK_LOG_FC 0x00000100
+#define MASK_LOG_LRN 0x00000200
+#define MASK_LOG_POOLING 0x00000400
+#define MASK_LOG_RELU 0x00000800
+#define MASK_LOG_SIGMOID 0x00001000
+#define MASK_LOG_SOFTMAX 0x00002000
+#define MASK_LOG_TANH 0x00004000
+#define MASK_LOG_LC 0x00008000
+#define MASK_LOG_BN 0x00010000
+#define MASK_LOG_CONCAT 0x00020000
+#define APP_TIME_INFO MASK_LOG_APP_TIME,"time: \t"
+#define ACL_ALLOCATE_INFO MASK_LOG_ALLOCATE,"allocate: \t\t"
+#define ACL_RUN_INFO MASK_LOG_RUN, "run: \t\t\t"
+#define ACL_CONFIG_INFO MASK_LOG_CONFIG, "configure: \t\t\t\t"
+#define ACL_COPY_INFO MASK_LOG_COPY, "tensor_copy:\t\t\t\t\t"
+#define ACL_ABSVAL_INFO MASK_LOG_ABSVAL, "ACL_ABSVAL :\t\t\t\t\t\t"
+#define ACL_BNLL_INFO MASK_LOG_BNLL, "ACL_BNLL :\t\t\t\t\t\t\t"
+#define ACL_CONV_INFO MASK_LOG_CONV, "ACL_CONV :\t\t\t\t\t\t\t\t"
+#define ACL_FC_INFO MASK_LOG_FC, "ACL_FC :\t\t\t\t\t\t\t\t\t"
+#define ACL_LRN_INFO MASK_LOG_LRN, "ACL_LRN :\t\t\t\t\t\t\t\t\t\t"
+#define ACL_POOLING_INFO MASK_LOG_POOLING, "ACL_POOLING:\t\t\t\t\t\t\t\t\t\t\t"
+#define ACL_RELU_INFO MASK_LOG_RELU, "ACL_RELU :\t\t\t\t\t\t\t\t\t\t\t\t"
+#define ACL_SIGMOID_INFO MASK_LOG_SIGMOID, "ACL_SIGMOID:\t\t\t\t\t\t\t\t\t\t\t\t\t"
+#define ACL_SOFTMAX_INFO MASK_LOG_SOFTMAX, "ACL_SOFTMAX:\t\t\t\t\t\t\t\t\t\t\t\t\t\t"
+#define ACL_TANH_INFO MASK_LOG_TANH, "ACL_TANH :\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t"
+#define ACL_LC_INFO MASK_LOG_LC, "ACL_LC :\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t"
+#define ACL_BN_INFO MASK_LOG_BN, "ACL_BN :\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t"
+#define ACL_CONCAT_INFO MASK_LOG_CONCAT, "ACL_CONCAT :\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t"
+extern unsigned int acl_log_flags;
+#endif //USE_PROFILING
+namespace caffe {
+#ifdef USE_ACL
+enum TensorType{
+ tensor_input,
+ tensor_output,
+ tensor_weights,
+ tensor_biases,
+ tensor_mean,
+ tensor_var,
+ tensor_beta,
+ tensor_gamma,
+ tensor_concat,
+ tensor_data,
+};
+enum OperatorState{
+ operator_not_init,
+ operator_init_done,
+ operator_reinit,
+};
+enum OperateType{
+ operate_type_pooling,
+ operate_type_activation,
+ operate_type_lrn,
+ operate_type_conv,
+ operate_type_lc,
+ operate_type_fc,
+ operate_type_bn,
+ operate_type_softmax,
+ operate_type_concat,
+};
+class BaseACLTensor{
+public:
+ BaseACLTensor()
+ :type_(tensor_input),allocate_(false){
+ }
+ virtual void bindmem(void *mem){
+ mem_=mem;
+ }
+ virtual void settensortype(TensorType type){
+ type_=type;
+ };
+ virtual void map(bool blocking = true){
+ }
+ virtual void unmap(){}
+ virtual void commit(TensorType type=tensor_data){}
+ int tensor_copy(arm_compute::ITensor* tensor,void * mem, bool toTensor=true);
+protected:
+ void* mem_;
+ TensorType type_;
+ bool allocate_;
+};
+class ACLTensor:public BaseACLTensor,public Tensor{
+public:
+ ACLTensor(arm_compute::TensorInfo &&info)
+ :Tensor(info){
+ }
+ virtual void map(bool blocking = true){
+ if (!allocate_){
+ Tensor::allocate();
+ allocate_=true;
+ }
+ Tensor::map(blocking);
+ }
+ virtual int tensor_copy(void * mem, bool toTensor=true){
+ auto acl_tensor=this;
+ arm_compute::ITensor* tensor=acl_tensor->tensor();
+ BaseACLTensor::tensor_copy(tensor,mem,toTensor);
+ return 0;
+ }
+ virtual void unmap(){Tensor::unmap();}
+ virtual void commit(TensorType type=tensor_data);
+};
+class ACLSubTensor:public BaseACLTensor,public SubTensor{
+public:
+ ACLSubTensor(std::unique_ptr<ACLTensor> &parent,arm_compute::TensorShape &shape,arm_compute::Coordinates& coord)
+ :SubTensor(parent.get(),shape,coord){
+ }
+ virtual int tensor_copy(void * mem, bool toTensor=true){
+ return 0;
+ }
+};
+
+template <typename T>
+class TensorPair{
+public:
+ TensorPair(){}
+ ~TensorPair(){}
+ TensorType type;
+ std::unique_ptr<T> tensor;
+};
+template <typename T>
+std::unique_ptr<T> &tensor_item(std::vector<std::unique_ptr<TensorPair<T>>>& pool,TensorType type,int idx){
+ int count=0;
+ for (auto &item: pool) {
+ if(item.get()->type==type){
+ ++count;
+ }
+ if(item.get()->type==type && idx==count-1){
+ return item.get()->tensor;
+ }
+ }
+ pool.push_back((std::unique_ptr<TensorPair<T>>)std::move(new TensorPair<T>));
+ auto item=pool[pool.size()-1].get();
+ item->type=type;
+ item->tensor=NULL;
+ return item->tensor;
+}
+class ACLOperator {
+public:
+ virtual void commit(){
+ for (auto & item: tensor_pool_) {
+ if(item.get()->tensor)item.get()->tensor->commit(item.get()->type);
+ }
+ }
+ inline void run(){
+ commit();
+ #ifdef USE_PROFILING
+ logtime_util log_time(ACL_RUN_INFO);
+ #endif //USE_PROFILING
+ for(auto &c : funcs_)
+ {
+ c->run();
+ }
+ }
+
+ inline std::vector<std::unique_ptr<arm_compute::IFunction>> &funcs(){return funcs_;}
+
+ inline std::unique_ptr<ACLSubTensor> &sinput(int idx=0){return tensor_item(subtensor_pool_,tensor_input,idx);}
+ inline std::unique_ptr<ACLSubTensor> &soutput(int idx=0){return tensor_item(subtensor_pool_,tensor_output,idx);}
+ inline std::unique_ptr<ACLSubTensor> &sweights(int idx=0){return tensor_item(subtensor_pool_,tensor_weights,idx);}
+ inline std::unique_ptr<ACLSubTensor> &sbiases(int idx=0){return tensor_item(subtensor_pool_,tensor_biases,idx);}
+
+ inline std::unique_ptr<ACLTensor> &cinput(int idx=0){return tensor_item(tensor_pool_,tensor_concat,idx);}
+ inline std::unique_ptr<ACLTensor> &input(int idx=0){return tensor_item(tensor_pool_,tensor_input,idx);}
+ inline std::unique_ptr<ACLTensor> &output(int idx=0){return tensor_item(tensor_pool_,tensor_output,idx);}
+ inline std::unique_ptr<ACLTensor> &weights(int idx=0){return tensor_item(tensor_pool_,tensor_weights,idx);}
+ inline std::unique_ptr<ACLTensor> &biases(int idx=0){return tensor_item(tensor_pool_,tensor_biases,idx);}
+ inline std::unique_ptr<ACLTensor> &mean(int idx=0){return tensor_item(tensor_pool_,tensor_mean,idx);}
+ inline std::unique_ptr<ACLTensor> &var(int idx=0){return tensor_item(tensor_pool_,tensor_var,idx);}
+ inline std::unique_ptr<ACLTensor> &beta(int idx=0){return tensor_item(tensor_pool_,tensor_beta,idx);}
+ inline std::unique_ptr<ACLTensor> &gamma(int idx=0){return tensor_item(tensor_pool_,tensor_gamma,idx);}
+ inline std::unique_ptr<ACLTensor> &tensor(TensorType type){
+ switch (type) {
+ case tensor_biases:
+ return biases();
+ break;
+ case tensor_weights:
+ return weights();
+ break;
+ case tensor_output:
+ return output();
+ break;
+ default:
+ case tensor_input:
+ return input();
+ break;
+ }
+ return input();
+ }
+
+
+ explicit ACLOperator(const LayerParameter& param);
+ virtual ~ACLOperator();
+ inline TargetHint getTargetHint(){
+#ifdef USE_OPENCL
+ if (target_hint_==TargetHint::DONT_CARE) {
+ if (Caffe::arm_gpu_mode()) {
+ return TargetHint::OPENCL;
+ }
+ return TargetHint::NEON;
+ }
+ return target_hint_;
+#else
+ return TargetHint::NEON;
+#endif
+ }
+ inline void setTargetHint(TargetHint hint){
+ target_hint_=hint;
+ }
+ inline ConvolutionMethodHint & getConvMethod(){ return convolution_method_hint_;}
+ inline bool tensor_mem(std::unique_ptr<ACLTensor> &tensor,void *mem){
+ tensor->bindmem(mem);
+ return true;
+ }
+ inline bool tensor_mem(void *mem,std::unique_ptr<ACLTensor> &tensor){
+ tensor->tensor_copy(mem,false);
+ return true;
+ }
+ bool new_tensor(std::unique_ptr<ACLTensor> &tensor,arm_compute::TensorShape &shape,void *mem=nullptr,bool commit=false);
+ bool new_tensor(std::unique_ptr<ACLSubTensor> &tensor,std::unique_ptr<ACLTensor> &parent,arm_compute::TensorShape &shape,arm_compute::Coordinates& coord);
+ inline int & group(){return _group;}
+ inline void set_operator_property(OperateType type,const char*name){
+ name_=name;
+ type_=type;
+ }
+ inline void acl_run(void *input_data, void *output_data){
+ if(input_data)tensor_mem(input(),input_data);
+ run();
+ tensor_mem(output_data,output());
+ }
+
+
+protected:
+ inline bool isGPUMode(){
+ if (!support_opencl_) return false;
+ return getTargetHint()==TargetHint::OPENCL;
+ }
+ inline OperatorState & opstate(){return operator_state_;}
+ inline bool is_operator_init_done(arm_compute::TensorShape shape,TensorType type=tensor_input){
+ checkreshape(shape,type);
+ return operator_state_==operator_init_done;
+ }
+ inline void set_operator_init_done(){
+ opstate()=operator_init_done;
+ set_bypass_state(false);
+ }
+ inline void set_bypass_state(bool state=false){
+ force_bypass_acl_path_=state;
+ }
+ inline OperatorState checkreshape(arm_compute::TensorShape shape,TensorType type=tensor_input){
+ opstate()=reshape(shape,type);
+ if (opstate()==operator_reinit) {
+ freeres();
+ }
+ return opstate();
+ }
+ inline OperatorState reshape(arm_compute::TensorShape &shape,TensorType type){
+ arm_compute::TensorShape _shape;
+ std::unique_ptr<ACLTensor> &acl_tensor=tensor(type);
+ if (!acl_tensor.get()) return operator_not_init;
+ _shape = acl_tensor->info().tensor_shape();
+ if (_shape.total_size()==shape.total_size() && _shape[0]==shape[0] && _shape[1]==shape[1]) {
+ return operator_init_done;
+ }
+ return operator_reinit;
+ }
+ inline void freeres(){
+ tensor_pool_.clear();
+ subtensor_pool_.clear();
+ funcs_.clear();
+ }
+ inline const char* &name(){return name_;}
+
+protected:
+ std::vector<std::unique_ptr<TensorPair<ACLTensor>>>tensor_pool_;
+ std::vector<std::unique_ptr<TensorPair<ACLSubTensor>>>subtensor_pool_;
+ std::vector<std::unique_ptr<arm_compute::IFunction>> funcs_;
+ OperatorState operator_state_;
+ bool force_bypass_acl_path_;
+ TargetHint target_hint_;
+ ConvolutionMethodHint convolution_method_hint_;
+ static bool support_opencl_;
+ static bool init_cl_env;
+ int _group;
+ const char* name_;
+ OperateType type_;
+};
+
+int isScheduleEnable();
+
+template <typename OperatorType, typename TensorType>
+std::unique_ptr<arm_compute::IFunction> instantiate_function(arm_compute::ITensor *input, arm_compute::ITensor *output){
+ auto op = cpp14::make_unique<OperatorType>();
+ op->configure(
+ dynamic_cast<TensorType *>(input),
+ dynamic_cast<TensorType *>(output)
+ );
+
+ return std::move(op);
+}
+
+template <typename OperatorType, typename TensorType>
+std::unique_ptr<arm_compute::IFunction> instantiate(arm_compute::ITensor *input, arm_compute::ITensor *output)
+{
+ return instantiate_function<OperatorType, TensorType>(input, output);
+}
+
+template <typename GPUOpType,typename GPUTensor,typename CPUOpType,typename CPUTensor>
+std::unique_ptr<arm_compute::IFunction> instantiate_op_func(std::unique_ptr<ACLTensor>& input, std::unique_ptr<ACLTensor>& output,TargetHint& hint){
+ std::unique_ptr<arm_compute::IFunction> func;
+#ifdef USE_OPENCL
+ if(hint == TargetHint::OPENCL)
+ {
+ func = instantiate<GPUOpType, GPUTensor>(input->tensor(), output->tensor());
+ }
+ else
+#endif
+ {
+ func = instantiate<CPUOpType, CPUTensor>(input->tensor(), output->tensor());
+ }
+ return func;
+}
+
+
+template <typename OperatorType, typename TensorType,typename VectorTensor>
+std::unique_ptr<arm_compute::IFunction> instantiate_function(VectorTensor inputs, arm_compute::ITensor *output){
+ auto op = cpp14::make_unique<OperatorType>();
+ op->configure(
+ inputs,
+ dynamic_cast<TensorType *>(output)
+ );
+
+ return std::move(op);
+}
+
+template <typename OperatorType, typename TensorType,typename VectorTensor>
+std::unique_ptr<arm_compute::IFunction> instantiate(VectorTensor inputs, arm_compute::ITensor *output)
+{
+ return instantiate_function<OperatorType, TensorType,VectorTensor>(inputs, output);
+}
+
+template <typename GPUOpType,typename GPUTensor,typename CPUOpType,typename CPUTensor>
+std::unique_ptr<arm_compute::IFunction> instantiate_op_func_lists(ACLOperator*& acl_op, std::unique_ptr<ACLTensor>& output,int num,TargetHint& hint){
+ std::unique_ptr<arm_compute::IFunction> func;
+#ifdef USE_OPENCL
+ if(hint == TargetHint::OPENCL)
+ {
+ static std::vector<arm_compute::ICLTensor*> tensors;
+ tensors.clear();
+ for (int i=0;i<num;++i) {
+ tensors.push_back(dynamic_cast<arm_compute::ICLTensor*>(acl_op->cinput(i).get()->tensor()));
+ }
+ func = instantiate<GPUOpType, GPUTensor, std::vector<arm_compute::ICLTensor *>>(tensors, output->tensor());
+ }
+ else
+#endif
+ {
+ static std::vector<arm_compute::ITensor*> tensors;
+ tensors.clear();
+ for (int i=0;i<num;++i) {
+ tensors.push_back(dynamic_cast<arm_compute::ITensor*>(acl_op->cinput(i).get()->tensor()));
+ }
+ func = instantiate<CPUOpType, CPUTensor,std::vector<arm_compute::ITensor*>>(tensors, output->tensor());
+ }
+ return func;
+}
+
+template <typename OperatorType, typename TensorType,typename OperatorInfo>
+std::unique_ptr<arm_compute::IFunction> instantiate_function(arm_compute::ITensor *input, arm_compute::ITensor *output, const OperatorInfo &info){
+ auto op = cpp14::make_unique<OperatorType>();
+ op->configure(
+ dynamic_cast<TensorType *>(input),
+ dynamic_cast<TensorType *>(output),
+ info);
+
+ return std::move(op);
+}
+
+template <typename OperatorType, typename TensorType,typename OperatorInfo>
+std::unique_ptr<arm_compute::IFunction> instantiate(arm_compute::ITensor *input, arm_compute::ITensor *output, const OperatorInfo &info)
+{
+ return instantiate_function<OperatorType, TensorType, OperatorInfo>(input, output, info);
+}
+
+template <typename GPUOpType,typename GPUTensor,typename CPUOpType,typename CPUTensor, typename OperatorInfo>
+std::unique_ptr<arm_compute::IFunction> instantiate_op_func(std::unique_ptr<ACLTensor>& input, std::unique_ptr<ACLTensor>& output, const OperatorInfo &info,TargetHint& hint){
+ std::unique_ptr<arm_compute::IFunction> func;
+#ifdef USE_OPENCL
+ if(hint == TargetHint::OPENCL)
+ {
+ func = instantiate<GPUOpType, GPUTensor,OperatorInfo>(input->tensor(), output->tensor(), info);
+ }
+ else
+#endif
+ {
+ func = instantiate<CPUOpType, CPUTensor,OperatorInfo>(input->tensor(), output->tensor(), info);
+ }
+ return func;
+}
+
+
+template <typename OperatorType, typename TensorType,typename OperatorInfo>
+std::unique_ptr<arm_compute::IFunction> instantiate_function(arm_compute::ITensor *input,arm_compute::ITensor *weights,arm_compute::ITensor *biases, arm_compute::ITensor *output, const OperatorInfo &info){
+ auto op = cpp14::make_unique<OperatorType>();
+ op->configure(
+ dynamic_cast<TensorType *>(input),
+ dynamic_cast<TensorType *>(weights),
+ dynamic_cast<TensorType *>(biases),
+ dynamic_cast<TensorType *>(output),
+ info);
+ return std::move(op);
+}
+
+template <typename OperatorType, typename TensorType,typename OperatorInfo>
+std::unique_ptr<arm_compute::IFunction> instantiate(arm_compute::ITensor *input,arm_compute::ITensor *weights,arm_compute::ITensor *biases, arm_compute::ITensor *output, const OperatorInfo &info)
+{
+ return instantiate_function<OperatorType, TensorType, OperatorInfo>(input,weights,biases,output, info);
+}
+
+template <typename GPUOpType,typename GPUTensor,typename CPUOpType,typename CPUTensor, typename OperatorInfo,typename ACLTensor>
+std::unique_ptr<arm_compute::IFunction> instantiate_op_func(std::unique_ptr<ACLTensor>& input,std::unique_ptr<ACLTensor>& weights,std::unique_ptr<ACLTensor>& biases, std::unique_ptr<ACLTensor>& output, const OperatorInfo &info,TargetHint& hint){
+ std::unique_ptr<arm_compute::IFunction> func;
+ arm_compute::ITensor * biases_tensor=NULL;
+
+ if (biases.get()) {
+ biases_tensor=biases->tensor();
+ }
+#ifdef USE_OPENCL
+ if (hint == TargetHint::OPENCL)
+ {
+ func = instantiate<GPUOpType, GPUTensor,OperatorInfo>(input->tensor(), weights->tensor(),biases_tensor,output->tensor(), info);
+ }
+ else
+#endif
+ {
+ func = instantiate<CPUOpType, CPUTensor,OperatorInfo>(input->tensor(), weights->tensor(),biases_tensor, output->tensor(), info);
+ }
+ return func;
+}
+
+
+
+template <typename Dtype,typename OperatorType, typename TensorType>
+std::unique_ptr<arm_compute::IFunction> instantiate_function(arm_compute::ITensor *input, arm_compute::ITensor *output,
+ arm_compute::ITensor *mean,arm_compute::ITensor *var,arm_compute::ITensor *beta,arm_compute::ITensor *gamma,Dtype & eps){
+ auto op = cpp14::make_unique<OperatorType>();
+ op->configure(
+ dynamic_cast<TensorType *>(input),
+ dynamic_cast<TensorType *>(output),
+ dynamic_cast<TensorType *>(mean),
+ dynamic_cast<TensorType *>(var),
+ dynamic_cast<TensorType *>(beta),
+ dynamic_cast<TensorType *>(gamma),
+ eps);
+
+ return std::move(op);
+}
+
+template <typename Dtype,typename OperatorType, typename TensorType>
+std::unique_ptr<arm_compute::IFunction> instantiate(arm_compute::ITensor * input,arm_compute::ITensor * output,
+ arm_compute::ITensor * mean,arm_compute::ITensor * var,arm_compute::ITensor * beta,arm_compute::ITensor * gamma,Dtype eps){
+ return instantiate_function<Dtype,OperatorType, TensorType>(input,output, mean,var,beta,gamma,eps);
+}
+
+template <typename Dtype,typename GPUOpType,typename GPUTensor,typename CPUOpType,typename CPUTensor>
+std::unique_ptr<arm_compute::IFunction> instantiate_op_func(std::unique_ptr<ACLTensor>& input,std::unique_ptr<ACLTensor>& output,
+ std::unique_ptr<ACLTensor>& mean,std::unique_ptr<ACLTensor>& var,std::unique_ptr<ACLTensor>& beta,std::unique_ptr<ACLTensor>& gamma,Dtype eps,TargetHint hint){
+ std::unique_ptr<arm_compute::IFunction> func;
+#ifdef USE_OPENCL
+ if(hint == TargetHint::OPENCL)
+ {
+ func = instantiate<Dtype,GPUOpType, GPUTensor>(input->tensor(),output->tensor(), mean->tensor(),var->tensor(),beta->tensor(),gamma->tensor(),eps);
+ }
+ else
+#endif
+ {
+ func = instantiate<Dtype,CPUOpType, CPUTensor>(input->tensor(),output->tensor(), mean->tensor(),var->tensor(),beta->tensor(),gamma->tensor(),eps);
+ }
+ return func;
+}
+
+
+template <typename OperatorInfo>
+bool instantiate_op_pooling(ACLOperator* acl_op,std::vector<std::unique_ptr<arm_compute::IFunction>> & func,std::unique_ptr<ACLTensor> & input, std::unique_ptr<ACLTensor> & output,TargetHint hint, const OperatorInfo &info){
+ func.push_back(instantiate_op_func<arm_compute::CLPoolingLayer, arm_compute::ICLTensor, arm_compute::NEPoolingLayer, arm_compute::ITensor, arm_compute::PoolingLayerInfo>(input, output, info, hint));
+ return true;
+}
+template <typename OperatorInfo>
+bool instantiate_op_activation(ACLOperator* acl_op,std::vector<std::unique_ptr<arm_compute::IFunction>> & func,std::unique_ptr<ACLTensor> & input,std::unique_ptr<ACLTensor> & output,TargetHint hint, const OperatorInfo &info){
+ func.push_back(instantiate_op_func<arm_compute::CLActivationLayer,arm_compute::ICLTensor,arm_compute::NEActivationLayer,arm_compute::ITensor, arm_compute::ActivationLayerInfo>(input, output, info, hint));
+ return true;
+}
+template <typename OperatorInfo>
+bool instantiate_op_lrn(ACLOperator* acl_op,std::vector<std::unique_ptr<arm_compute::IFunction>> & func,std::unique_ptr<ACLTensor> & input,std::unique_ptr<ACLTensor> & output,TargetHint hint, const OperatorInfo &info){
+ func.push_back(instantiate_op_func<arm_compute::CLNormalizationLayer,arm_compute::ICLTensor,arm_compute::NENormalizationLayer,arm_compute::ITensor, arm_compute::NormalizationLayerInfo>(input, output, info, hint));
+ return true;
+}
+template <typename OperatorInfo>
+bool instantiate_op_conv(ACLOperator* acl_op,std::vector<std::unique_ptr<arm_compute::IFunction>> & func,std::unique_ptr<ACLTensor> & input,std::unique_ptr<ACLTensor> & output,TargetHint hint,const OperatorInfo &info){
+ std::unique_ptr<ACLTensor> & weights=acl_op->weights();
+ std::unique_ptr<ACLTensor> & biases=acl_op->biases();
+ ConvolutionMethodHint& conv_method=acl_op->getConvMethod();
+ bool has_biases=biases.get()?true:false;
+ int& groups=acl_op->group();
+ arm_compute::TensorShape input_shape=input->info().tensor_shape();
+ arm_compute::TensorShape weights_shape=weights->info().tensor_shape();
+ arm_compute::TensorShape biases_shape;
+ if (has_biases) {
+ biases_shape = biases->info().tensor_shape();
+ }
+ arm_compute::TensorShape output_shape=output->info().tensor_shape();
+
+ if (groups==1) {
+ if (conv_method == ConvolutionMethodHint::GEMM) {
+ func.push_back(instantiate_op_func<arm_compute::CLConvolutionLayer, arm_compute::ICLTensor, arm_compute::NEConvolutionLayer, arm_compute::ITensor, arm_compute::PadStrideInfo>(acl_op->input(), acl_op->weights(), acl_op->biases(), acl_op->output(), info, hint));
+ }else{
+ func.push_back(instantiate_op_func<arm_compute::CLDirectConvolutionLayer, arm_compute::ICLTensor, arm_compute::NEDirectConvolutionLayer, arm_compute::ITensor, arm_compute::PadStrideInfo>(acl_op->input(), acl_op->weights(), acl_op->biases(), acl_op->output(), info, hint));
+ }
+ return true;
+ }
+
+ // Calculate sub-tensor splits
+ const int input_split = input_shape.z() / groups;
+ const int output_split = output_shape.z() / groups;
+ const int weights_split = weights_shape[3] / groups;
+ const int biases_split = biases_shape.x() / groups;
+
+ // Calculate sub-tensor shapes
+ input_shape.set(2, input_split);
+ output_shape.set(2, output_split);
+ weights_shape.set(3, weights_split);
+ biases_shape.set(0, biases_split);
+
+ for (auto i = 0; i < groups; ++i) {
+ // Calculate sub-tensors starting coordinates
+ arm_compute::Coordinates input_coord(0, 0, input_split * i);
+ arm_compute::Coordinates output_coord(0, 0, output_split * i);
+ arm_compute::Coordinates weights_coord(0, 0, 0, weights_split * i);
+ arm_compute::Coordinates biases_coord(biases_split * i);
+
+ // Create sub-tensors for input, output, weights and bias
+ acl_op->new_tensor(acl_op->sinput(i), acl_op->input(), input_shape, input_coord);
+ acl_op->new_tensor(acl_op->soutput(i),acl_op->output(),output_shape, output_coord);
+ acl_op->new_tensor(acl_op->sweights(i),acl_op->weights(), weights_shape, weights_coord);
+ if (has_biases) {
+ acl_op->new_tensor(acl_op->sbiases(i),acl_op->biases(), biases_shape, biases_coord);
+ }
+
+ if (conv_method == ConvolutionMethodHint::GEMM) {
+ func.push_back(instantiate_op_func<arm_compute::CLConvolutionLayer, arm_compute::ICLTensor, arm_compute::NEConvolutionLayer, arm_compute::ITensor, arm_compute::PadStrideInfo,ACLSubTensor>(acl_op->sinput(i), acl_op->sweights(i), acl_op->sbiases(i), acl_op->soutput(i), info, hint));
+ }else{
+ func.push_back(instantiate_op_func<arm_compute::CLDirectConvolutionLayer, arm_compute::ICLTensor, arm_compute::NEDirectConvolutionLayer, arm_compute::ITensor, arm_compute::PadStrideInfo,ACLSubTensor>(acl_op->sinput(i), acl_op->sweights(i), acl_op->sbiases(i), acl_op->soutput(i), info, hint));
+ }
+ }
+ return true;
+}
+template <typename OperatorInfo>
+bool instantiate_op_lc(ACLOperator* acl_op,std::vector<std::unique_ptr<arm_compute::IFunction>> & func,std::unique_ptr<ACLTensor> & input,std::unique_ptr<ACLTensor> & output,TargetHint hint, const OperatorInfo &info){
+ std::unique_ptr<ACLTensor> & weights=acl_op->weights();
+ std::unique_ptr<ACLTensor> & biases=acl_op->biases();
+ func.push_back(instantiate_op_func<arm_compute::CLLocallyConnectedLayer,arm_compute::ICLTensor,arm_compute::NELocallyConnectedLayer,arm_compute::ITensor, arm_compute::PadStrideInfo>(input, weights,biases,output,info, hint));
+ return true;
+}
+template <typename OperatorInfo>
+bool instantiate_op_fc(ACLOperator* acl_op,std::vector<std::unique_ptr<arm_compute::IFunction>> & func,std::unique_ptr<ACLTensor> & input,std::unique_ptr<ACLTensor> & output,TargetHint hint, const OperatorInfo &info){
+ std::unique_ptr<ACLTensor> & weights=acl_op->weights();
+ std::unique_ptr<ACLTensor> & biases=acl_op->biases();
+ func.push_back(instantiate_op_func<arm_compute::CLFullyConnectedLayer,arm_compute::ICLTensor,arm_compute::NEFullyConnectedLayer,arm_compute::ITensor, bool>(input, weights,biases,output,info, hint));
+ return true;
+}
+template <typename Dtype>
+bool instantiate_op_bn(ACLOperator* acl_op,std::vector<std::unique_ptr<arm_compute::IFunction>> & func,std::unique_ptr<ACLTensor> & input,std::unique_ptr<ACLTensor> & output,TargetHint hint, Dtype eps){
+ std::unique_ptr<ACLTensor> & mean=acl_op->mean();
+ std::unique_ptr<ACLTensor> & var=acl_op->var();
+ std::unique_ptr<ACLTensor> & beta=acl_op->beta();
+ std::unique_ptr<ACLTensor> & gamma=acl_op->gamma();
+ func.push_back(instantiate_op_func<Dtype,arm_compute::CLBatchNormalizationLayer,arm_compute::ICLTensor,arm_compute::NEBatchNormalizationLayer,arm_compute::ITensor>(input, output, mean,var,beta,gamma,eps, hint));
+ return true;
+}
+inline bool instantiate_op_softmax(ACLOperator* acl_op,std::vector<std::unique_ptr<arm_compute::IFunction>> & func,std::unique_ptr<ACLTensor> & input,std::unique_ptr<ACLTensor> & output,TargetHint hint,void *data){
+ func.push_back(instantiate_op_func<arm_compute::CLSoftmaxLayer,arm_compute::ICLTensor,arm_compute::NESoftmaxLayer,arm_compute::ITensor>(input, output, hint));
+ return true;
+}
+inline bool instantiate_op_concat(ACLOperator* acl_op,std::vector<std::unique_ptr<arm_compute::IFunction>> & func,std::unique_ptr<ACLTensor> & input,std::unique_ptr<ACLTensor> & output,TargetHint hint,int num){
+ func.push_back(instantiate_op_func_lists<arm_compute::CLDepthConcatenateLayer,arm_compute::ICLTensor,arm_compute::NEDepthConcatenateLayer,arm_compute::ITensor>(acl_op, output, num,hint));
+ return true;
+}
+template <typename Dtype>
+Dtype* GetDataPtr(ACLOperator* op,Blob<Dtype>* const &blob,bool isconst=false){
+ if (!isconst) {
+ if (op->getTargetHint() == TargetHint::NEON) {
+ return blob->mutable_cpu_data();
+ }
+ return blob->mutable_gpu_data();
+ }
+ if (op->getTargetHint()==TargetHint::NEON) {
+ return (Dtype*)blob->cpu_data();
+ }
+ return (Dtype*)blob->gpu_data();
+}
+
+template <typename Dtype>
+Dtype* InputdataPtr(ACLOperator* op,const vector<Blob<Dtype>*>& bottom,int index=-1){
+ if (index==-1) index=0;
+ return GetDataPtr(op, bottom[index], true);
+}
+template <typename Dtype>
+Dtype* OutputdataPtr(ACLOperator* op,const vector<Blob<Dtype>*>& top){
+ return GetDataPtr(op,top[0]);
+}
+
+template <typename Dtype>
+void acl_run(ACLOperator* op,const vector<Blob<Dtype>*>& bottom,const vector<Blob<Dtype>*>& top,bool multi_input_run=true){
+ if (multi_input_run) {
+ for (int i = 0; i < bottom.size(); ++i) {
+ const Dtype* bottom_data = bottom[i]->cpu_data();
+ Dtype* top_data = top[i]->mutable_cpu_data();
+ op->acl_run((void*)bottom_data,(void*)top_data);
+ }
+ return ;
+ }
+ for (int i = 0; i < bottom.size(); ++i) {
+ op->tensor_mem(op->cinput(i),InputdataPtr(op,bottom,i));
+ }
+ op->acl_run(NULL,OutputdataPtr(op,top));
+}
+}
+
+#define INIT_GLOBAL_FUNCS_TYPE(Dtype) \
+template <> \
+Dtype* InputdataPtr(ACLOperator* op,const vector<Blob<Dtype>*>& bottom,int index); \
+template <> \
+Dtype* OutputdataPtr(ACLOperator* op,const vector<Blob<Dtype>*>& top); \
+template <> \
+Dtype* GetDataPtr(ACLOperator* op,Blob<Dtype>* const & blob,bool isconst); \
+
+#define INIT_GLOBAL_FUNCS() \
+INIT_GLOBAL_FUNCS_TYPE(double); \
+INIT_GLOBAL_FUNCS_TYPE(float); \
+
+
+#ifdef USE_PROFILING
+#define acl_configure(opname,acl_op,args...)\
+{\
+ set_operator_property(operate_type_##opname,#opname); \
+ logtime_util log_time(ACL_CONFIG_INFO); \
+ instantiate_op_##opname(acl_op,acl_op->funcs(),acl_op->input(),acl_op->output(),acl_op->getTargetHint(),args);\
+}
+#else
+#define acl_configure(opname,acl_op,args...)\
+{\
+ set_operator_property(operate_type_##opname,#opname); \
+ instantiate_op_##opname(acl_op,acl_op->funcs(),acl_op->input(),acl_op->output(),acl_op->getTargetHint(),args);\
+}
+#endif
+
+#endif
+
+#endif
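For reference, the grouped path of instantiate_op_conv above carves each parent tensor into per-group sub-tensors before instantiating one convolution function per group. Below is a minimal standalone sketch of that split arithmetic, assuming a 2-group convolution with a 64-channel input, 128 kernels, and the W,H,C,N / Kx,Ky,Cin,Cout shape layout used above; the concrete sizes are illustrative only (note that the Caffe weight blob already stores Cin per group, which is why only dimension 3 of the weights is split).

    #include "arm_compute/core/TensorShape.h"
    #include "arm_compute/core/Coordinates.h"
    #include <cstdio>

    int main() {
      const int groups = 2;
      arm_compute::TensorShape input_shape(56, 56, 64, 1);    // W, H, C, N
      arm_compute::TensorShape weights_shape(3, 3, 32, 128);  // Kx, Ky, Cin (already per group), Cout
      arm_compute::TensorShape output_shape(56, 56, 128, 1);

      const int input_split   = input_shape.z()  / groups;    // channels fed to each group
      const int output_split  = output_shape.z() / groups;    // channels produced by each group
      const int weights_split = weights_shape[3] / groups;    // kernels owned by each group

      // Every per-group sub-tensor gets the reduced shape ...
      input_shape.set(2, input_split);
      output_shape.set(2, output_split);
      weights_shape.set(3, weights_split);

      // ... and starts at an offset that is a multiple of the split size.
      for (int i = 0; i < groups; ++i) {
        arm_compute::Coordinates input_coord(0, 0, input_split * i);
        arm_compute::Coordinates weights_coord(0, 0, 0, weights_split * i);
        std::printf("group %d: input channel offset %d, kernel offset %d\n",
                    i, (int)input_coord[2], (int)weights_coord[3]);
      }
      return 0;
    }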
diff --git a/include/caffe/acl_tensor.hpp b/include/caffe/acl_tensor.hpp
new file mode 100644
index 00000000..89466de3
--- /dev/null
+++ b/include/caffe/acl_tensor.hpp
@@ -0,0 +1,114 @@
+#ifndef __TENSOR_H__
+#define __TENSOR_H__
+
+#ifdef USE_ACL
+#include "arm_compute/runtime/CL/CLSubTensor.h"
+#include "arm_compute/runtime/SubTensor.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/runtime/Tensor.h"
+
+#include <memory>
+
+namespace caffe{
+enum class TargetHint{
+ DONT_CARE,
+ OPENCL,
+ NEON,
+};
+
+enum class ConvolutionMethodHint{
+ GEMM,
+ DIRECT,
+};
+namespace cpp14{
+template <class T>
+struct _Unique_if{
+ typedef std::unique_ptr<T> _Single_object;
+};
+
+template <class T>
+struct _Unique_if<T[]>{
+ typedef std::unique_ptr<T[]> _Unknown_bound;
+};
+
+template <class T, size_t N>
+struct _Unique_if<T[N]>{
+ typedef void _Known_bound;
+};
+
+template <class T, class... Args>
+typename _Unique_if<T>::_Single_object
+make_unique(Args &&... args){
+ return std::unique_ptr<T>(new T(std::forward<Args>(args)...));
+}
+
+template <class T>
+typename _Unique_if<T>::_Unknown_bound
+make_unique(size_t n){
+ typedef typename std::remove_extent<T>::type U;
+ return std::unique_ptr<T>(new U[n]());
+}
+
+template <class T, class... Args>
+typename _Unique_if<T>::_Known_bound
+make_unique(Args &&...) ;
+}
+
+class Tensor {
+public:
+ Tensor(arm_compute::TensorInfo &info) noexcept;
+ ~Tensor(){
+ }
+ Tensor(Tensor &&src) noexcept ;
+ void set_info(arm_compute::TensorInfo &&info){
+ _info = info;
+ }
+ arm_compute::ITensor *set_target(TargetHint target);
+ const arm_compute::TensorInfo &info() const{
+ return _info;
+ }
+ arm_compute::ITensor * tensor(){
+ return _tensor.get();
+ }
+ void allocate();
+ void init(){
+
+ }
+ TargetHint target() const{
+ return _target;
+ }
+ virtual void map(bool blocking = true);
+ virtual void unmap();
+
+private:
+ TargetHint _target;
+ arm_compute::TensorInfo _info;
+ std::unique_ptr<arm_compute::ITensor> _tensor;
+};
+
+class SubTensor
+{
+public:
+ SubTensor(Tensor* parent, arm_compute::TensorShape& tensor_shape, arm_compute::Coordinates& coords)noexcept;
+ ~SubTensor(){}
+ arm_compute::ITensor *tensor() ;
+ const arm_compute::ITensor *tensor() const ;
+ TargetHint target() const ;
+ void allocate() ;
+ arm_compute::ITensor *set_target(TargetHint target);
+
+private:
+ /** Instantiates a sub-tensor */
+ void instantiate_subtensor();
+
+private:
+ TargetHint _target; /**< Target that this tensor is pinned on */
+ arm_compute::TensorShape _tensor_shape; /**< SubTensor shape */
+ arm_compute::Coordinates _coords; /**< SubTensor Coordinates */
+ arm_compute::ITensor *_parent; /**< Parent tensor */
+ std::unique_ptr<arm_compute::ITensor> _subtensor; /**< SubTensor */
+};
+
+}
+#endif
+#endif //__TENSOR_H__
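For reference, a minimal sketch of how the Tensor wrapper declared above is used: it holds only TensorInfo metadata until set_target() backs it with a NEON Tensor or an OpenCL CLTensor, after which allocate() creates the buffer; map()/unmap() only matter for the OpenCL case. The shape and format below are illustrative only.

    #include "caffe/acl_tensor.hpp"

    void tensor_wrapper_sketch() {
      // Describe a 224x224x3 F32 tensor; nothing is allocated yet.
      arm_compute::TensorInfo info(arm_compute::TensorShape(224, 224, 3),
                                   arm_compute::Format::F32);
      caffe::Tensor t(info);

      // Back the wrapper with a NEON tensor and allocate its buffer.
      t.set_target(caffe::TargetHint::NEON);
      t.allocate();

      // With TargetHint::OPENCL the buffer lives on the device, so host-side
      // access must be bracketed by map()/unmap():
      //   t.map();  ...read or write through t.tensor()->buffer()...  t.unmap();
    }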
diff --git a/include/caffe/layer.hpp b/include/caffe/layer.hpp
index 49b1e695..4ffeb68c 100644
--- a/include/caffe/layer.hpp
+++ b/include/caffe/layer.hpp
@@ -22,6 +22,7 @@ extern unsigned int acl_log_flags;
namespace boost { class mutex; }
namespace caffe {
+bool AclEnableSchedule(int enable=1);
#ifdef USE_PROFILING
class logtime_util
{
diff --git a/include/caffe/layers/acl_absval_layer.hpp b/include/caffe/layers/acl_absval_layer.hpp
index c1655404..9eba67de 100644
--- a/include/caffe/layers/acl_absval_layer.hpp
+++ b/include/caffe/layers/acl_absval_layer.hpp
@@ -11,7 +11,7 @@
#include "caffe/layers/absval_layer.hpp"
#ifdef USE_ACL
-#include "caffe/acl_layer.hpp"
+#include "caffe/acl_operator.hpp"
#include "caffe/layers/acl_base_activation_layer.hpp"
#endif
@@ -46,9 +46,10 @@ class ACLAbsValLayer : public ACLBaseActivationLayer<Dtype>,public AbsValLayer<D
const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom){
NOT_IMPLEMENTED;
}
- virtual void SetupACLLayer(const vector<Blob<Dtype>*>& bottom,
- const vector<Blob<Dtype>*>& top, ActivationLayerInfo::ActivationFunction type);
+ virtual void SetupACLOperator(const vector<Blob<Dtype>*>& bottom,
+ const vector<Blob<Dtype>*>& top, arm_compute::ActivationLayerInfo::ActivationFunction type);
+ virtual bool Bypass_acl(const vector<Blob<Dtype>*>& bottom,const vector<Blob<Dtype>*>& top);
};
#endif
diff --git a/include/caffe/layers/acl_base_activation_layer.hpp b/include/caffe/layers/acl_base_activation_layer.hpp
index e2abdafa..39643f33 100644
--- a/include/caffe/layers/acl_base_activation_layer.hpp
+++ b/include/caffe/layers/acl_base_activation_layer.hpp
@@ -10,7 +10,7 @@
#include "caffe/layers/neuron_layer.hpp"
#ifdef USE_ACL
-#include "caffe/acl_layer.hpp"
+#include "caffe/acl_operator.hpp"
#include "caffe/layers/acl_base_activation_layer.hpp"
#endif
@@ -23,9 +23,9 @@ namespace caffe {
* Fallback to BNLLLayer for some corner cases.
*/
template <typename Dtype>
-class ACLBaseActivationLayer : public ACLBaseLayer<CLActivationLayer,NEActivationLayer> {
+class ACLBaseActivationLayer : public ACLOperator {
public:
- explicit ACLBaseActivationLayer(const LayerParameter& param)
+ explicit ACLBaseActivationLayer(const LayerParameter& param):ACLOperator(param)
{}
virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top);
@@ -46,8 +46,8 @@ class ACLBaseActivationLayer : public ACLBaseLayer<CLActivationLayer,NEActivatio
const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom){
NOT_IMPLEMENTED;
}
- virtual void SetupACLLayer(const vector<Blob<Dtype>*>& bottom,
- const vector<Blob<Dtype>*>& top,ActivationLayerInfo::ActivationFunction type=ActivationLayerInfo::ActivationFunction::RELU);
+ virtual void SetupACLOperator(const vector<Blob<Dtype>*>& bottom,
+ const vector<Blob<Dtype>*>& top,arm_compute::ActivationLayerInfo::ActivationFunction type=arm_compute::ActivationLayerInfo::ActivationFunction::RELU);
};
#endif
diff --git a/include/caffe/layers/acl_base_conv_layer.hpp b/include/caffe/layers/acl_base_conv_layer.hpp
deleted file mode 100644
index 6b38eb28..00000000
--- a/include/caffe/layers/acl_base_conv_layer.hpp
+++ /dev/null
@@ -1,61 +0,0 @@
-#ifndef CAFFE_ACL_BASE_CONV_LAYER_HPP_
-#define CAFFE_ACL_BASE_CONV_LAYER_HPP_
-
-#include <vector>
-
-#include "caffe/blob.hpp"
-#include "caffe/layer.hpp"
-#include "caffe/proto/caffe.pb.h"
-
-#include "caffe/layers/conv_layer.hpp"
-
-#ifdef USE_ACL
-#include "caffe/acl_layer.hpp"
-#endif
-
-namespace caffe {
-
-#ifdef USE_ACL
-/*
- * @brief ACL implementation of ConvolutionLayer.
- * Fallback to ConvolutionLayer for some corner cases.
- *
-*/
-template <typename Dtype,typename GPUConvLayer,typename CPUConvLayer>
-class ACLConvolutionLayer : public ACLBaseLayer<GPUConvLayer,CPUConvLayer>,public ConvolutionLayer<Dtype> {
- public:
- explicit ACLConvolutionLayer(const LayerParameter& param)
- : ConvolutionLayer<Dtype>(param) {}
- virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
- const vector<Blob<Dtype>*>& top);
- virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
- const vector<Blob<Dtype>*>& top);
- virtual ~ACLConvolutionLayer();
-
- protected:
- virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
- const vector<Blob<Dtype>*>& top);
- virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
- const vector<Blob<Dtype>*>& top);
- virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
- const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom){
- NOT_IMPLEMENTED;
- }
- virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
- const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom){
- NOT_IMPLEMENTED;
- }
- virtual void SetupACLLayer(const vector<Blob<Dtype>*>& bottom,
- const vector<Blob<Dtype>*>& top);
-
-};
-#endif
-
-} // namespace caffe
-
-// Instantiate a class with float and double specifications.
-#define INSTANTIATE_CONV_CLASS(classname,GPUConvLayer,CPUConvLayer) \
- template class classname<float,GPUConvLayer,CPUConvLayer>; \
- template class classname<double,GPUConvLayer,CPUConvLayer>
-
-#endif // CAFFE_ACL_BASE_CONV_LAYER_HPP_
diff --git a/include/caffe/layers/acl_batch_norm_layer.hpp b/include/caffe/layers/acl_batch_norm_layer.hpp
index e899804f..97dcab3d 100644
--- a/include/caffe/layers/acl_batch_norm_layer.hpp
+++ b/include/caffe/layers/acl_batch_norm_layer.hpp
@@ -10,7 +10,7 @@
#include "caffe/layers/batch_norm_layer.hpp"
#ifdef USE_ACL
-#include "caffe/acl_layer.hpp"
+#include "caffe/acl_operator.hpp"
#endif
namespace caffe {
@@ -21,10 +21,10 @@ namespace caffe {
* Fallback to BatchNormLayer for some corner cases.
*/
template <typename Dtype>
-class ACLBatchNormLayer : public ACLBaseLayer<CLBatchNormalizationLayer,NEBatchNormalizationLayer>,public BatchNormLayer<Dtype> {
+class ACLBatchNormLayer : public ACLOperator,public BatchNormLayer<Dtype> {
public:
explicit ACLBatchNormLayer(const LayerParameter& param)
- : BatchNormLayer<Dtype>(param) {}
+ : ACLOperator(param),BatchNormLayer<Dtype>(param) {}
virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top);
virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
@@ -44,8 +44,9 @@ class ACLBatchNormLayer : public ACLBaseLayer<CLBatchNormalizationLayer,NEBatchN
const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom){
NOT_IMPLEMENTED;
}
- virtual void SetupACLLayer(const vector<Blob<Dtype>*>& bottom,
+ virtual void SetupACLOperator(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top);
+ virtual bool Bypass_acl(const vector<Blob<Dtype>*>& bottom,const vector<Blob<Dtype>*>& top);
};
#endif
diff --git a/include/caffe/layers/acl_bnll_layer.hpp b/include/caffe/layers/acl_bnll_layer.hpp
index ea2f8a16..9cf607e1 100644
--- a/include/caffe/layers/acl_bnll_layer.hpp
+++ b/include/caffe/layers/acl_bnll_layer.hpp
@@ -11,7 +11,7 @@
#include "caffe/layers/bnll_layer.hpp"
#ifdef USE_ACL
-#include "caffe/acl_layer.hpp"
+#include "caffe/acl_operator.hpp"
#include "caffe/layers/acl_base_activation_layer.hpp"
#endif
@@ -47,8 +47,9 @@ class ACLBNLLLayer : public ACLBaseActivationLayer<Dtype>,public BNLLLayer<Dtype
const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom){
NOT_IMPLEMENTED;
}
- virtual void SetupACLLayer(const vector<Blob<Dtype>*>& bottom,
- const vector<Blob<Dtype>*>& top, ActivationLayerInfo::ActivationFunction type);
+ virtual void SetupACLOperator(const vector<Blob<Dtype>*>& bottom,
+ const vector<Blob<Dtype>*>& top, arm_compute::ActivationLayerInfo::ActivationFunction type);
+ virtual bool Bypass_acl(const vector<Blob<Dtype>*>& bottom,const vector<Blob<Dtype>*>& top);
};
#endif
diff --git a/include/caffe/layers/acl_concat_layer.hpp b/include/caffe/layers/acl_concat_layer.hpp
index 90212192..bc1c9173 100644
--- a/include/caffe/layers/acl_concat_layer.hpp
+++ b/include/caffe/layers/acl_concat_layer.hpp
@@ -10,7 +10,7 @@
#include "caffe/layers/concat_layer.hpp"
#ifdef USE_ACL
-#include "caffe/acl_layer.hpp"
+#include "caffe/acl_operator.hpp"
#endif
namespace caffe {
@@ -21,10 +21,10 @@ namespace caffe {
* Fallback to ConcatLayer for some corner cases.
*/
template <typename Dtype>
-class ACLConcatLayer : public ACLBaseLayer<CLDepthConcatenate,NEDepthConcatenate>,public ConcatLayer<Dtype> {
+class ACLConcatLayer : public ACLOperator,public ConcatLayer<Dtype> {
public:
explicit ACLConcatLayer(const LayerParameter& param)
- : ConcatLayer<Dtype>(param) {}
+ : ACLOperator(param),ConcatLayer<Dtype>(param) {}
virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top);
virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
@@ -44,11 +44,9 @@ class ACLConcatLayer : public ACLBaseLayer<CLDepthConcatenate,NEDepthConcatenate
const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom){
NOT_IMPLEMENTED;
}
- virtual void SetupACLLayer(const vector<Blob<Dtype>*>& bottom,
+ virtual void SetupACLOperator(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top);
- private:
- std::vector<ITensor *> cpu_vectors;
- std::vector<ICLTensor *> gpu_vectors;
+ virtual bool Bypass_acl(const vector<Blob<Dtype>*>& bottom,const vector<Blob<Dtype>*>& top);
};
#endif
diff --git a/include/caffe/layers/acl_conv_layer.hpp b/include/caffe/layers/acl_conv_layer.hpp
index 2fd795c9..21ca6aa6 100644
--- a/include/caffe/layers/acl_conv_layer.hpp
+++ b/include/caffe/layers/acl_conv_layer.hpp
@@ -1,46 +1,56 @@
#ifndef CAFFE_ACL_CONV_LAYER_HPP_
#define CAFFE_ACL_CONV_LAYER_HPP_
+#include <vector>
+
+#include "caffe/blob.hpp"
+#include "caffe/layer.hpp"
+#include "caffe/proto/caffe.pb.h"
+
+#include "caffe/layers/conv_layer.hpp"
+
#ifdef USE_ACL
-#include "caffe/layers/acl_base_conv_layer.hpp"
+#include "caffe/acl_operator.hpp"
#endif
namespace caffe {
-extern bool use_direct_conv_;
#ifdef USE_ACL
+/*
+ * @brief ACL implementation of ConvolutionLayer.
+ * Fallback to ConvolutionLayer for some corner cases.
+ *
+*/
template <typename Dtype>
-inline shared_ptr<Layer<Dtype> > GetACLConvolutionLayer(
- const LayerParameter& param) {
- ConvolutionParameter conv_param = param.convolution_param();
- const char* pDirectConv;
- pDirectConv = getenv ("DIRECTCONV");
- if (pDirectConv){
- unsigned int bdirectconv;
- sscanf(pDirectConv,"%i", &bdirectconv);
- if(bdirectconv != use_direct_conv_){
- use_direct_conv_ = bdirectconv;
- printf("DIRECTCONV<%s>\n", pDirectConv);
- printf("DIRECTCONV: %x\n", use_direct_conv_);
+class ACLConvolutionLayer : public ACLOperator,public ConvolutionLayer<Dtype> {
+ public:
+ explicit ACLConvolutionLayer(const LayerParameter& param)
+ : ACLOperator(param),ConvolutionLayer<Dtype>(param) {
+ }
+ virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
+ const vector<Blob<Dtype>*>& top);
+ virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
+ const vector<Blob<Dtype>*>& top);
+ virtual ~ACLConvolutionLayer();
+
+ protected:
+ virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
+ const vector<Blob<Dtype>*>& top);
+ virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
+ const vector<Blob<Dtype>*>& top);
+ virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
+ const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom){
+ NOT_IMPLEMENTED;
}
- }
- int pad_data[3];
- if (conv_param.has_pad_h() || conv_param.has_pad_w()) {
- pad_data[0] = conv_param.pad_h();
- pad_data[1] = conv_param.pad_w();
- } else {
- const int kDefaultPad = 0;
- const int num_pad_dims = conv_param.pad_size();
- for (int i = 0; i < 2; ++i) {
- pad_data[i] = (num_pad_dims == 0) ? kDefaultPad :
- conv_param.pad((num_pad_dims == 1) ? 0 : i);
+ virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
+ const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom){
+ NOT_IMPLEMENTED;
}
- }
- if (use_direct_conv_ && ( (conv_param.kernel_size(0)==1 &&pad_data[0]==0 && pad_data[1]==0) || (conv_param.kernel_size(0)==3 && pad_data[0]<=1 && pad_data[1] <=1 ) )) {
- return shared_ptr<Layer<Dtype> >(new ACLConvolutionLayer<Dtype, CLConvolutionLayer, NEDirectConvolutionLayer>(param)); //NEDirectConvolutionLayer only for 1x1 and 3x3
- }
- return shared_ptr<Layer<Dtype> >(new ACLConvolutionLayer<Dtype, CLConvolutionLayer, NEConvolutionLayer>(param));
-}
+ virtual void SetupACLOperator(const vector<Blob<Dtype>*>& bottom,
+ const vector<Blob<Dtype>*>& top);
+ virtual bool Bypass_acl(const vector<Blob<Dtype>*>& bottom,const vector<Blob<Dtype>*>& top);
+};
+
#endif
} // namespace caffe
diff --git a/include/caffe/layers/acl_inner_product_layer.hpp b/include/caffe/layers/acl_inner_product_layer.hpp
index f42becb0..67a29914 100644
--- a/include/caffe/layers/acl_inner_product_layer.hpp
+++ b/include/caffe/layers/acl_inner_product_layer.hpp
@@ -10,7 +10,7 @@
#include "caffe/layers/inner_product_layer.hpp"
#ifdef USE_ACL
-#include "caffe/acl_layer.hpp"
+#include "caffe/acl_operator.hpp"
#endif
namespace caffe {
@@ -21,10 +21,11 @@ namespace caffe {
* Fallback to InnerProductLayer for some corner cases.
*/
template <typename Dtype>
-class ACLInnerProductLayer : public ACLBaseLayer<CLFullyConnectedLayer,NEFullyConnectedLayer>,public InnerProductLayer<Dtype> {
+class ACLInnerProductLayer : public ACLOperator,public InnerProductLayer<Dtype> {
public:
explicit ACLInnerProductLayer(const LayerParameter& param)
- : InnerProductLayer<Dtype>(param) {}
+ : ACLOperator(param),InnerProductLayer<Dtype>(param) {
+ }
virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top);
virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
@@ -44,8 +45,9 @@ class ACLInnerProductLayer : public ACLBaseLayer<CLFullyConnectedLayer,NEFullyCo
const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom){
NOT_IMPLEMENTED;
}
- virtual void SetupACLLayer(const vector<Blob<Dtype>*>& bottom,
+ virtual void SetupACLOperator(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top);
+ virtual bool Bypass_acl(const vector<Blob<Dtype>*>& bottom,const vector<Blob<Dtype>*>& top);
};
#endif
diff --git a/include/caffe/layers/acl_local_connect_layer.hpp b/include/caffe/layers/acl_local_connect_layer.hpp
index fdb30757..b3b3a9e3 100644
--- a/include/caffe/layers/acl_local_connect_layer.hpp
+++ b/include/caffe/layers/acl_local_connect_layer.hpp
@@ -10,7 +10,7 @@
#include "caffe/layers/local_connect_layer.hpp"
#ifdef USE_ACL
-#include "caffe/acl_layer.hpp"
+#include "caffe/acl_operator.hpp"
#endif
namespace caffe {
@@ -22,10 +22,10 @@ namespace caffe {
*
*/
template <typename Dtype>
-class ACLLocalConnectLayer : public ACLBaseLayer<CLLocallyConnectedLayer,NELocallyConnectedLayer>,public LocalConnectLayer<Dtype> {
+class ACLLocalConnectLayer : public ACLOperator,public LocalConnectLayer<Dtype> {
public:
explicit ACLLocalConnectLayer(const LayerParameter& param)
- : LocalConnectLayer<Dtype>(param) {}
+ : ACLOperator(param),LocalConnectLayer<Dtype>(param) {}
virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top);
virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
@@ -45,9 +45,9 @@ class ACLLocalConnectLayer : public ACLBaseLayer<CLLocallyConnectedLayer,NELocal
const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom){
NOT_IMPLEMENTED;
}
- virtual void SetupACLLayer(const vector<Blob<Dtype>*>& bottom,
+ virtual void SetupACLOperator(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top);
-
+ virtual bool Bypass_acl(const vector<Blob<Dtype>*>& bottom,const vector<Blob<Dtype>*>& top);
};
#endif
diff --git a/include/caffe/layers/acl_lrn_layer.hpp b/include/caffe/layers/acl_lrn_layer.hpp
index 6fd9fbc8..1a47f804 100644
--- a/include/caffe/layers/acl_lrn_layer.hpp
+++ b/include/caffe/layers/acl_lrn_layer.hpp
@@ -10,7 +10,7 @@
#include "caffe/layers/lrn_layer.hpp"
#ifdef USE_ACL
-#include "caffe/acl_layer.hpp"
+#include "caffe/acl_operator.hpp"
#endif
namespace caffe {
@@ -21,10 +21,10 @@ namespace caffe {
* Fallback to LRNLayer for some corner cases.
*/
template <typename Dtype>
-class ACLLRNLayer : public ACLBaseLayer<CLNormalizationLayer,NENormalizationLayer>,public LRNLayer<Dtype> {
+class ACLLRNLayer : public ACLOperator,public LRNLayer<Dtype> {
public:
explicit ACLLRNLayer(const LayerParameter& param)
- : LRNLayer<Dtype>(param) {}
+ : ACLOperator(param),LRNLayer<Dtype>(param) {}
virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top);
virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
@@ -44,8 +44,9 @@ class ACLLRNLayer : public ACLBaseLayer<CLNormalizationLayer,NENormalizationLaye
const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom){
NOT_IMPLEMENTED;
}
- virtual void SetupACLLayer(const vector<Blob<Dtype>*>& bottom,
+ virtual void SetupACLOperator(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top);
+ virtual bool Bypass_acl(const vector<Blob<Dtype>*>& bottom,const vector<Blob<Dtype>*>& top);
};
#endif
diff --git a/include/caffe/layers/acl_pooling_layer.hpp b/include/caffe/layers/acl_pooling_layer.hpp
index acca35cf..b7f94497 100644
--- a/include/caffe/layers/acl_pooling_layer.hpp
+++ b/include/caffe/layers/acl_pooling_layer.hpp
@@ -10,7 +10,7 @@
#include "caffe/layers/pooling_layer.hpp"
#ifdef USE_ACL
-#include "caffe/acl_layer.hpp"
+#include "caffe/acl_operator.hpp"
#endif
namespace caffe {
@@ -21,10 +21,10 @@ namespace caffe {
* Fallback to PoolingLayer for some corner cases.
*/
template <typename Dtype>
-class ACLPoolingLayer : public ACLBaseLayer<CLPoolingLayer,NEPoolingLayer>,public PoolingLayer<Dtype> {
+class ACLPoolingLayer : public ACLOperator,public PoolingLayer<Dtype> {
public:
explicit ACLPoolingLayer(const LayerParameter& param)
- : PoolingLayer<Dtype>(param) {}
+ : ACLOperator(param),PoolingLayer<Dtype>(param) {}
virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top);
virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
@@ -44,8 +44,9 @@ class ACLPoolingLayer : public ACLBaseLayer<CLPoolingLayer,NEPoolingLayer>,publi
const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom){
NOT_IMPLEMENTED;
}
- virtual void SetupACLLayer(const vector<Blob<Dtype>*>& bottom,
+ virtual void SetupACLOperator(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top);
+ virtual bool Bypass_acl(const vector<Blob<Dtype>*>& bottom,const vector<Blob<Dtype>*>& top);
};
#endif
diff --git a/include/caffe/layers/acl_relu_layer.hpp b/include/caffe/layers/acl_relu_layer.hpp
index 041dbecf..8bbb7267 100644
--- a/include/caffe/layers/acl_relu_layer.hpp
+++ b/include/caffe/layers/acl_relu_layer.hpp
@@ -11,7 +11,7 @@
#include "caffe/layers/relu_layer.hpp"
#ifdef USE_ACL
-#include "caffe/acl_layer.hpp"
+#include "caffe/acl_operator.hpp"
#include "caffe/layers/acl_base_activation_layer.hpp"
#endif
@@ -46,8 +46,9 @@ class ACLReLULayer : public ACLBaseActivationLayer<Dtype>,public ReLULayer<Dtype
const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom){
NOT_IMPLEMENTED;
}
- virtual void SetupACLLayer(const vector<Blob<Dtype>*>& bottom,
+ virtual void SetupACLOperator(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top);
+ virtual bool Bypass_acl(const vector<Blob<Dtype>*>& bottom,const vector<Blob<Dtype>*>& top);
};
#endif
diff --git a/include/caffe/layers/acl_sigmoid_layer.hpp b/include/caffe/layers/acl_sigmoid_layer.hpp
index 8638f73e..0e4bcbaf 100644
--- a/include/caffe/layers/acl_sigmoid_layer.hpp
+++ b/include/caffe/layers/acl_sigmoid_layer.hpp
@@ -11,7 +11,7 @@
#include "caffe/layers/sigmoid_layer.hpp"
#ifdef USE_ACL
-#include "caffe/acl_layer.hpp"
+#include "caffe/acl_operator.hpp"
#include "caffe/layers/acl_base_activation_layer.hpp"
#endif
@@ -45,8 +45,9 @@ class ACLSigmoidLayer : public ACLBaseActivationLayer<Dtype>,public SigmoidLayer
const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom){
NOT_IMPLEMENTED;
}
- virtual void SetupACLLayer(const vector<Blob<Dtype>*>& bottom,
- const vector<Blob<Dtype>*>& top, ActivationLayerInfo::ActivationFunction type);
+ virtual void SetupACLOperator(const vector<Blob<Dtype>*>& bottom,
+ const vector<Blob<Dtype>*>& top, arm_compute::ActivationLayerInfo::ActivationFunction type);
+ virtual bool Bypass_acl(const vector<Blob<Dtype>*>& bottom,const vector<Blob<Dtype>*>& top);
};
#endif
diff --git a/include/caffe/layers/acl_softmax_layer.hpp b/include/caffe/layers/acl_softmax_layer.hpp
index 9e450f5d..7e69992c 100644
--- a/include/caffe/layers/acl_softmax_layer.hpp
+++ b/include/caffe/layers/acl_softmax_layer.hpp
@@ -10,7 +10,7 @@
#include "caffe/layers/softmax_layer.hpp"
#ifdef USE_ACL
-#include "caffe/acl_layer.hpp"
+#include "caffe/acl_operator.hpp"
#endif
namespace caffe {
@@ -21,10 +21,10 @@ namespace caffe {
* Fallback to SoftmaxLayer for some corner cases.
*/
template <typename Dtype>
-class ACLSoftmaxLayer : public ACLBaseLayer<CLSoftmaxLayer,NESoftmaxLayer>,public SoftmaxLayer<Dtype> {
+class ACLSoftmaxLayer : public ACLOperator,public SoftmaxLayer<Dtype> {
public:
explicit ACLSoftmaxLayer(const LayerParameter& param)
- : SoftmaxLayer<Dtype>(param) {}
+ : ACLOperator(param),SoftmaxLayer<Dtype>(param) {}
virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top);
virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
@@ -44,8 +44,9 @@ class ACLSoftmaxLayer : public ACLBaseLayer<CLSoftmaxLayer,NESoftmaxLayer>,publi
const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom){
NOT_IMPLEMENTED;
}
- virtual void SetupACLLayer(const vector<Blob<Dtype>*>& bottom,
+ virtual void SetupACLOperator(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top);
+ virtual bool Bypass_acl(const vector<Blob<Dtype>*>& bottom,const vector<Blob<Dtype>*>& top);
};
#endif
diff --git a/include/caffe/layers/acl_tanh_layer.hpp b/include/caffe/layers/acl_tanh_layer.hpp
index 5a74ce5a..717ef012 100644
--- a/include/caffe/layers/acl_tanh_layer.hpp
+++ b/include/caffe/layers/acl_tanh_layer.hpp
@@ -11,7 +11,7 @@
#include "caffe/layers/tanh_layer.hpp"
#ifdef USE_ACL
-#include "caffe/acl_layer.hpp"
+#include "caffe/acl_operator.hpp"
#include "caffe/layers/acl_base_activation_layer.hpp"
#endif
@@ -46,8 +46,9 @@ class ACLTanHLayer : public ACLBaseActivationLayer<Dtype>,public TanHLayer<Dtype
const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom){
NOT_IMPLEMENTED;
}
- virtual void SetupACLLayer(const vector<Blob<Dtype>*>& bottom,
- const vector<Blob<Dtype>*>& top, ActivationLayerInfo::ActivationFunction type);
+ virtual void SetupACLOperator(const vector<Blob<Dtype>*>& bottom,
+ const vector<Blob<Dtype>*>& top, arm_compute::ActivationLayerInfo::ActivationFunction type);
+ virtual bool Bypass_acl(const vector<Blob<Dtype>*>& bottom,const vector<Blob<Dtype>*>& top);
};
#endif
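For reference, every header above gains the same pair of hooks: SetupACLOperator() replaces SetupACLLayer() and configures the ACL function, while Bypass_acl() decides per call whether to hand the blobs back to the stock Caffe layer. A minimal sketch of the fallback idiom follows, using a hypothetical ACLExampleLayer; the extra shape check is illustrative and not part of this patch (the actual implementations, such as the AbsVal one later in this diff, may only test the force-bypass flag).

    template <typename Dtype>
    bool ACLExampleLayer<Dtype>::Bypass_acl(const vector<Blob<Dtype>*>& bottom,
                                            const vector<Blob<Dtype>*>& top) {
      bool bypass_acl = false;
      if (this->force_bypass_acl_path_) {   // set from the BYPASSACL environment variable
        bypass_acl = true;
      }
      if (bottom[0]->num_axes() != 4) {     // illustrative: unusual blobs fall back to Caffe
        bypass_acl = true;
      }
      return bypass_acl;
    }

    // Forward_cpu/Forward_gpu then start with:
    //   if (Bypass_acl(bottom, top)) { ExampleLayer<Dtype>::Forward_cpu(bottom, top); return; }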
diff --git a/src/caffe/acl_layer.cpp b/src/caffe/acl_layer.cpp
deleted file mode 100644
index 879b6701..00000000
--- a/src/caffe/acl_layer.cpp
+++ /dev/null
@@ -1,289 +0,0 @@
-#ifdef USE_ACL
-#include "caffe/acl_layer.hpp"
-
-unsigned int bypass_acl_class_layer = (0 | \
- /*0xffffffff |*/ \
- /*FLAGS_ENABLE_ACL_FC |*/ \
- /*FLAGS_ENABLE_ACL_LRN |*/ \
- 0 );
-
-#ifdef USE_PROFILING
-
-#include "arm_neon.h"
-
-unsigned int acl_log_flags = (0 | \
- MASK_LOG_APP_TIME | \
- /*MASK_LOG_ALLOCATE | */\
- /*MASK_LOG_ALLOCATE | */\
- /*MASK_LOG_RUN | */\
- /*MASK_LOG_CONFIG | */\
- /*MASK_LOG_COPY | */\
- MASK_LOG_ABSVAL | \
- MASK_LOG_BNLL | \
- MASK_LOG_CONV | \
- MASK_LOG_FC | \
- MASK_LOG_LRN | \
- MASK_LOG_POOLING | \
- MASK_LOG_RELU | \
- MASK_LOG_SIGMOID | \
- MASK_LOG_SOFTMAX | \
- MASK_LOG_TANH | \
- MASK_LOG_LC | \
- MASK_LOG_BN | \
- MASK_LOG_CONCAT | \
- 0);
-#include <stdio.h> /* printf */
-#include <stdlib.h> /* getenv */
-#endif //USE_PROFILING
-
-namespace caffe {
-template <typename GPULayer, typename CPULayer>
-ACLBaseLayer<GPULayer,CPULayer>::ACLBaseLayer()
- :init_layer_(true),force_bypass_acl_path_(false){
- const char* pBypassACL;
- pBypassACL = getenv ("BYPASSACL");
- if (pBypassACL){
- unsigned int bacl;
- sscanf(pBypassACL,"%i", &bacl);
- if(bacl != bypass_acl_class_layer){
- bypass_acl_class_layer = bacl;
- printf("BYPASSACL<%s>\n", pBypassACL);
- printf("BYPASSACL: %x\n", bypass_acl_class_layer);
- }
- }
-#ifdef USE_PROFILING
- const char* pLogACL;
- pLogACL = getenv("LOGACL");
- if (pLogACL){
- unsigned int alf;
- sscanf(pLogACL,"%i", &alf);
- if (alf != acl_log_flags){
- acl_log_flags = alf;
- printf("LOGACL<%s>\n", pLogACL);
- printf("LOGACL: %x\n", acl_log_flags);
- }
- }
-#endif //USE_PROFILING
-}
-template <typename GPULayer, typename CPULayer>
-void ACLBaseLayer<GPULayer,CPULayer>::gpu_run() {
- gpu_.run(true);
-}
-template <typename GPULayer, typename CPULayer>
-void ACLBaseLayer<GPULayer,CPULayer>::cpu_run() {
- cpu_.run(false);
-}
-
-template <typename GPULayer, typename CPULayer>
-ACLBaseLayer<GPULayer,CPULayer>::~ACLBaseLayer(){
-}
-template <typename GPULayer, typename CPULayer>
-template <typename ACLTensor> bool ACLBaseLayer<GPULayer,CPULayer>::new_tensor(ACLTensor *&tensor,TensorShape shape,void *mem,bool share)
-{
- tensor=new ACLTensor(share);
-#if 1 //F32
- tensor->allocator()->init(TensorInfo(shape, Format::F32));
-#else //F16
- tensor->allocator()->init(TensorInfo(shape, Format::F16));
-#endif
- tensor->bindmem(mem,share);
- return true;
-}
-
-template <typename ACLTensor>
-void BaseTensor<ACLTensor>::commit(TensorType type){
- settensortype(type);
- if (!share_&&mem_) {
- if (!allocate_){
-#ifdef USE_PROFILING
- logtime_util log_time(ACL_ALLOCATE_INFO);
-#endif //USE_PROFILING
- ACLTensor::allocator()->allocate();
- allocate_=true;
- }
- if (type_!= tensor_output) {
- tensor_copy(mem_);
- }
- mem_=nullptr;
- }
-}
-
-template <typename ACLTensor>
-int BaseTensor<ACLTensor>::tensor_copy(void * mem,bool toTensor)
-{
-#ifdef USE_PROFILING
- logtime_util log_time(ACL_COPY_INFO);
-#endif //USE_PROFILING
- arm_compute::Window window;
- ACLTensor* tensor=this;
- window.use_tensor_dimensions(tensor->info()->tensor_shape(), /* first_dimension =*/Window::DimY); // Iterate through the rows (not each element)
- int width = tensor->info()->tensor_shape()[0]; //->dimension(0); //window.x().end() - window.x().start(); // + 1;
- int height = tensor->info()->tensor_shape()[1]; //->dimension(1); //window.y().end() - window.y().start(); // + 1;
- int deepth = tensor->info()->tensor_shape()[2];
- map();
- // Create an iterator:
- arm_compute::Iterator it(tensor, window);
- // Except it works for an arbitrary number of dimensions
- if (toTensor) { //mem->tensor
- arm_compute::execute_window_loop(window, [&](const arm_compute::Coordinates & id)
- {
-#if 0 //F16
- if (tensor->info()->element_size() ==2)
- {
- for(int i = 0; i < width; i+= 4){
- auto pa = (float32x4_t*)((char*)mem) + ((id[3] * (width * height * deepth) + id.z() * (width * height) + id.y() * width + id.x() + i) * 4);
- *(float16x4_t*)(((char*)it.ptr()) + i*2) = vcvt_f16_f32(*pa);
- }
- }
- else{
-#endif
- memcpy(it.ptr(), ((char*)mem) + ((id[3] * (width * height * deepth) + id.z() * (width * height) + id.y() * width + id.x()) * tensor->info()->element_size()), width * tensor->info()->element_size());
-#if 0 //F16
- }
-#endif
- },
- it);
- }else{ //tensor-->mem
- arm_compute::execute_window_loop(window, [&](const arm_compute::Coordinates & id)
- {
-#if 0 //F16
- if (tensor->info()->element_size() ==2)
- {
- for(int i = 0; i < width; i+= 4){
- auto pa = (float32x4_t*)(((char*)mem) + ((id[3] * (width * height * deepth) + id.z() * (width * height) + id.y() * width + id.x() + i) * 4));
- *pa = vcvt_f32_f16(*(float16x4_t*)(((char*)it.ptr()) + i*2));
- }
- }
- else{
-#endif
- memcpy(((char*)mem) + ((id[3] * (width * height * deepth) + id.z() * (width * height) + id.y() * width) * tensor->info()->element_size()), it.ptr(), width * tensor->info()->element_size());
-#if 0 //F16
- }
-#endif
- },
- it);
- }
- unmap();
-
- return 0;
-}
-
-template <typename GPULayer, typename CPULayer>
-template <typename ACLTensor> bool ACLBaseLayer<GPULayer,CPULayer>::tensor_mem(ACLTensor *tensor,void *mem,bool share)
-{
- tensor->bindmem(mem,share);
- return true;
-}
-
-template <typename GPULayer, typename CPULayer>
-template <typename ACLTensor> bool ACLBaseLayer<GPULayer,CPULayer>::tensor_mem(void *mem,ACLTensor *tensor,bool share)
-{
- if (mem==tensor->buffer()) return true;
- if (!share) {
- tensor->tensor_copy(mem,false);
- }
- return true;
-}
-
-
-template <typename GPULayer, typename CPULayer>
-bool ACLBaseLayer<GPULayer,CPULayer>::checkreshape(TensorShape shape,bool gpu, TensorType type)
-{
- if (gpu) {
- init_layer_ = gpu_.reshape(shape,type);
- }else{
- init_layer_ = cpu_.reshape(shape,type);
- }
- return init_layer_;
-}
-
-template <typename GPULayer, typename CPULayer>
-GPULayer * ACLBaseLayer<GPULayer,CPULayer>::new_gpulayer(){
- gpu_.layer= new GPULayer;
- return gpu_.layer;
-}
-template <typename GPULayer, typename CPULayer>
-CPULayer * ACLBaseLayer<GPULayer,CPULayer>::new_cpulayer(){
- cpu_.layer= new CPULayer;
- return cpu_.layer;
-}
-template <typename ACLLayer,typename ACLTensor>
-bool ACLXPUBaseLayer<ACLLayer,ACLTensor>::reshape(TensorShape &shape,TensorType type)
-{
- TensorShape _shape;
- if (!layer) return true;
-#ifdef USE_CONV_CACHE
- if (tensor_input == type){
- _shape = input->info()->tensor_shape();
- if (_shape.total_size()==shape.total_size() && _shape[0]==shape[0] && _shape[1]==shape[1]) {
- return false;
- }
- for(int i = 0; i < 16; ++i){
- if(cache.input[i] == nullptr) break;
- _shape = cache.input[i]->info()->tensor_shape();
- if (_shape.total_size()==shape.total_size() && _shape[0]==shape[0] && _shape[1]==shape[1]) {
- this->layer = cache.layer[i];
- this->input = cache.input[i];
- this->output = cache.output[i];
- this->weights = cache.weights[i];
- this->biases = cache.biases[i];
- return false;
- }
- }
- }
-#endif //USE_CONV_CACHE
- switch (type) {
- case tensor_biases:
- _shape = biases->info()->tensor_shape();
- break;
- case tensor_weights:
- _shape = weights->info()->tensor_shape();
- break;
- case tensor_output:
- _shape = output->info()->tensor_shape();
- break;
- case tensor_input:
- default:
- _shape = input->info()->tensor_shape();
- break;
- }
- if (_shape.total_size()==shape.total_size() && _shape[0]==shape[0] && _shape[1]==shape[1]) {
- return false;
- }
- freelayer();
- return true;
-}
-
-INSTANTIATE_ACLBASECLASS(CLNormalizationLayer,NENormalizationLayer);
- INSTANTIATE_ACLBASE_FUNCTION(CLNormalizationLayer,NENormalizationLayer,GPUTensor);
- INSTANTIATE_ACLBASE_FUNCTION(CLNormalizationLayer,NENormalizationLayer,CPUTensor);
-INSTANTIATE_ACLBASECLASS(CLActivationLayer,NEActivationLayer);
- INSTANTIATE_ACLBASE_FUNCTION(CLActivationLayer,NEActivationLayer,GPUTensor);
- INSTANTIATE_ACLBASE_FUNCTION(CLActivationLayer,NEActivationLayer,CPUTensor);
-INSTANTIATE_ACLBASECLASS(CLPoolingLayer,NEPoolingLayer);
- INSTANTIATE_ACLBASE_FUNCTION(CLPoolingLayer,NEPoolingLayer,GPUTensor);
- INSTANTIATE_ACLBASE_FUNCTION(CLPoolingLayer,NEPoolingLayer,CPUTensor);
-INSTANTIATE_ACLBASECLASS(CLSoftmaxLayer,NESoftmaxLayer);
- INSTANTIATE_ACLBASE_FUNCTION(CLSoftmaxLayer,NESoftmaxLayer,GPUTensor);
- INSTANTIATE_ACLBASE_FUNCTION(CLSoftmaxLayer,NESoftmaxLayer,CPUTensor);
-INSTANTIATE_ACLBASECLASS(CLFullyConnectedLayer,NEFullyConnectedLayer);
- INSTANTIATE_ACLBASE_FUNCTION(CLFullyConnectedLayer,NEFullyConnectedLayer,GPUTensor);
- INSTANTIATE_ACLBASE_FUNCTION(CLFullyConnectedLayer,NEFullyConnectedLayer,CPUTensor);
-INSTANTIATE_ACLBASECLASS(CLConvolutionLayer,NEConvolutionLayer);
- INSTANTIATE_ACLBASE_FUNCTION(CLConvolutionLayer,NEConvolutionLayer,GPUTensor);
- INSTANTIATE_ACLBASE_FUNCTION(CLConvolutionLayer,NEConvolutionLayer,CPUTensor);
-INSTANTIATE_ACLBASECLASS(CLConvolutionLayer,NEDirectConvolutionLayer);
- INSTANTIATE_ACLBASE_FUNCTION(CLConvolutionLayer,NEDirectConvolutionLayer,GPUTensor);
- INSTANTIATE_ACLBASE_FUNCTION(CLConvolutionLayer,NEDirectConvolutionLayer,CPUTensor);
-INSTANTIATE_ACLBASECLASS(CLBatchNormalizationLayer,NEBatchNormalizationLayer);
- INSTANTIATE_ACLBASE_FUNCTION(CLBatchNormalizationLayer,NEBatchNormalizationLayer,GPUTensor);
- INSTANTIATE_ACLBASE_FUNCTION(CLBatchNormalizationLayer,NEBatchNormalizationLayer,CPUTensor);
-INSTANTIATE_ACLBASECLASS(CLLocallyConnectedLayer,NELocallyConnectedLayer);
- INSTANTIATE_ACLBASE_FUNCTION(CLLocallyConnectedLayer,NELocallyConnectedLayer,GPUTensor);
- INSTANTIATE_ACLBASE_FUNCTION(CLLocallyConnectedLayer,NELocallyConnectedLayer,CPUTensor);
-INSTANTIATE_ACLBASECLASS(CLDepthConcatenate,NEDepthConcatenate);
- INSTANTIATE_ACLBASE_FUNCTION(CLDepthConcatenate,NEDepthConcatenate,GPUTensor);
- INSTANTIATE_ACLBASE_FUNCTION(CLDepthConcatenate,NEDepthConcatenate,CPUTensor);
-}
-
-#endif
diff --git a/src/caffe/acl_operator.cpp b/src/caffe/acl_operator.cpp
new file mode 100644
index 00000000..b35265a8
--- /dev/null
+++ b/src/caffe/acl_operator.cpp
@@ -0,0 +1,227 @@
+#ifdef USE_ACL
+#include "caffe/acl_operator.hpp"
+#include "caffe/common.hpp"
+
+unsigned int bypass_acl_class_layer = (0 | \
+ FLAGS_ENABLE_ACL_CONCAT | \
+ /*0xffffffff |*/ \
+ /*FLAGS_ENABLE_ACL_FC |*/ \
+ /*FLAGS_ENABLE_ACL_LRN |*/ \
+ 0 );
+
+unsigned int openailab_intfp = 0;
+int enable_schedule=0;
+
+#ifdef USE_PROFILING
+
+#include "arm_neon.h"
+
+unsigned int acl_log_flags = (0 | \
+ MASK_LOG_APP_TIME | \
+ /*MASK_LOG_ALLOCATE | */\
+ /*MASK_LOG_ALLOCATE | */\
+ /*MASK_LOG_RUN | */\
+ /*MASK_LOG_CONFIG | */\
+ /*MASK_LOG_COPY | */\
+ MASK_LOG_ABSVAL | \
+ MASK_LOG_BNLL | \
+ MASK_LOG_CONV | \
+ MASK_LOG_FC | \
+ MASK_LOG_LRN | \
+ MASK_LOG_POOLING | \
+ MASK_LOG_RELU | \
+ MASK_LOG_SIGMOID | \
+ MASK_LOG_SOFTMAX | \
+ MASK_LOG_TANH | \
+ MASK_LOG_LC | \
+ MASK_LOG_BN | \
+ MASK_LOG_CONCAT | \
+ 0);
+#include <stdio.h> /* printf */
+#include <stdlib.h> /* getenv */
+#endif //USE_PROFILING
+
+namespace caffe {
+bool AclEnableSchedule(int enable){
+ enable_schedule=enable;
+ if (enable) {
+ Caffe::set_mode(Caffe::GPU);
+ }
+ return true;
+}
+int isScheduleEnable()
+{
+ return enable_schedule;
+}
+bool ACLOperator::init_cl_env=true;
+bool ACLOperator::support_opencl_=false;
+bool opencl_is_available()
+{
+ return arm_compute::opencl_is_available();
+}
+ACLOperator::ACLOperator(const LayerParameter& param)
+ :operator_state_(operator_not_init),force_bypass_acl_path_(false),
+ target_hint_(TargetHint::DONT_CARE),
+ convolution_method_hint_(ConvolutionMethodHint::GEMM),
+ _group(1),name_(""){
+ const char* pBypassACL;
+ if(init_cl_env){
+#ifdef USE_OPENCL
+ try {
+ if (opencl_is_available()) {
+ arm_compute::CLScheduler::get().default_init();
+ support_opencl_=true;
+ }
+ }catch(std::exception& e){
+ support_opencl_=false;
+ }
+#endif
+ init_cl_env=false;
+ }
+ pBypassACL = getenv ("BYPASSACL");
+ if (pBypassACL){
+ unsigned int bacl;
+ sscanf(pBypassACL,"%i", &bacl);
+ if(bacl != bypass_acl_class_layer){
+ bypass_acl_class_layer = bacl;
+ printf("BYPASSACL<%s>\n", pBypassACL);
+ printf("BYPASSACL: %x\n", bypass_acl_class_layer);
+ }
+ }
+
+ const string& layer_type = param.type();
+ if (layer_type=="Convolution") {
+ ConvolutionParameter conv_param = param.convolution_param();
+ const char* pDirectConv;
+ unsigned int use_direct_conv=0;
+ pDirectConv = getenv ("DIRECTCONV");
+ if (pDirectConv){
+ unsigned int bdirectconv;
+ sscanf(pDirectConv,"%i", &bdirectconv);
+ if(bdirectconv != use_direct_conv){
+ use_direct_conv = bdirectconv;
+ printf("DIRECTCONV<%s>\n", pDirectConv);
+ printf("DIRECTCONV: %x\n", use_direct_conv);
+ }
+ }
+ int pad_data[3];
+ if (conv_param.has_pad_h() || conv_param.has_pad_w()) {
+ pad_data[0] = conv_param.pad_h();
+ pad_data[1] = conv_param.pad_w();
+ } else {
+ const int kDefaultPad = 0;
+ const int num_pad_dims = conv_param.pad_size();
+ for (int i = 0; i < 2; ++i) {
+ pad_data[i] = (num_pad_dims == 0) ? kDefaultPad :
+ conv_param.pad((num_pad_dims == 1) ? 0 : i);
+ }
+ }
+ if (use_direct_conv && ( (conv_param.kernel_size(0)==1 &&pad_data[0]==0 && pad_data[1]==0) || (conv_param.kernel_size(0)==3 && pad_data[0]<=1 && pad_data[1] <=1 ) )) {
+ convolution_method_hint_=ConvolutionMethodHint::DIRECT; //NEDirectConvolutionLayer only for 1x1 and 3x3
+ }
+ }
+
+#ifdef USE_PROFILING
+ const char* pLogACL;
+ pLogACL = getenv("LOGACL");
+ if (pLogACL){
+ unsigned int alf;
+ sscanf(pLogACL,"%i", &alf);
+ if (alf != acl_log_flags){
+ acl_log_flags = alf;
+ printf("LOGACL<%s>\n", pLogACL);
+ printf("LOGACL: %x\n", acl_log_flags);
+ }
+ }
+#endif //USE_PROFILING
+ const char* pEnableSchedule;
+ pEnableSchedule = getenv ("ENABLESCHEDULE");
+ if (pEnableSchedule){
+ unsigned int bschedule;
+ sscanf(pEnableSchedule,"%i", &bschedule);
+ if(bschedule != enable_schedule){
+ enable_schedule = bschedule;
+ printf("ENABLESCHEDULE<%s>\n", pEnableSchedule);
+ printf("ENABLESCHEDULE: %x\n", enable_schedule);
+ }
+ if (enable_schedule) {
+ AclEnableSchedule(1);
+ }
+ }
+}
+ACLOperator::~ACLOperator() {
+}
+
+bool ACLOperator::new_tensor(std::unique_ptr<ACLTensor> &tensor,arm_compute::TensorShape &shape,void *mem,bool commit)
+{
+ auto acl_tensor=new ACLTensor(arm_compute::TensorInfo(shape, arm_compute::Format::F32));
+ acl_tensor->set_target(getTargetHint());
+ acl_tensor->bindmem(mem);
+ if (commit) acl_tensor->commit();
+ tensor.reset(acl_tensor);
+ return true;
+}
+bool ACLOperator::new_tensor(std::unique_ptr<ACLSubTensor> &tensor,std::unique_ptr<ACLTensor> &parent,arm_compute::TensorShape &shape,arm_compute::Coordinates& coord)
+{
+ auto acl_tensor=new ACLSubTensor(parent,shape, coord);
+ acl_tensor->set_target(getTargetHint());
+ tensor.reset(acl_tensor);
+ return true;
+}
+
+void ACLTensor::commit(TensorType type)
+{
+ settensortype(type);
+ if (mem_) {
+ if (!allocate_){
+#ifdef USE_PROFILING
+ logtime_util log_time(ACL_ALLOCATE_INFO);
+#endif //USE_PROFILING
+ allocate();
+ allocate_=true;
+ }
+ if (type_!= tensor_output) {
+ tensor_copy(mem_);
+ }
+ mem_=nullptr;
+ }
+}
+
+int BaseACLTensor::tensor_copy(arm_compute::ITensor* tensor,void * mem,bool toTensor)
+{
+#ifdef USE_PROFILING
+ logtime_util log_time(ACL_COPY_INFO);
+#endif //USE_PROFILING
+ arm_compute::Window window;
+ window.use_tensor_dimensions(tensor->info()->tensor_shape(), /* first_dimension =*/arm_compute::Window::DimY); // Iterate through the rows (not each element)
+ int width = tensor->info()->tensor_shape()[0];
+ int height = tensor->info()->tensor_shape()[1];
+ int depth = tensor->info()->tensor_shape()[2];
+ map();
+ // Create an iterator:
+ arm_compute::Iterator it(tensor, window);
+ // The window loop below handles an arbitrary number of tensor dimensions
+ if (toTensor) { //mem->tensor
+ arm_compute::execute_window_loop(window, [&](const arm_compute::Coordinates & id)
+ {
+ memcpy(it.ptr(), ((char*)mem) + ((id[3] * (width * height * depth) + id.z() * (width * height) + id.y() * width + id.x()) * tensor->info()->element_size()), width * tensor->info()->element_size());
+ },
+ it);
+ }else{ //tensor-->mem
+ arm_compute::execute_window_loop(window, [&](const arm_compute::Coordinates & id)
+ {
+ memcpy(((char*)mem) + ((id[3] * (width * height * depth) + id.z() * (width * height) + id.y() * width) * tensor->info()->element_size()), it.ptr(), width * tensor->info()->element_size());
+ },
+ it);
+ }
+ unmap();
+
+ return 0;
+}
+
+INIT_GLOBAL_FUNCS();
+
+}
+
+
+#endif
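For reference, all of the run-time knobs read by the ACLOperator constructor above come from the environment and are parsed with sscanf("%i"), so plain decimal or 0x-prefixed hex both work. A minimal sketch of setting them from the hosting process before the first ACL layer is constructed; the concrete values are placeholders, and the meaningful bits are the FLAGS_ENABLE_ACL_* and MASK_LOG_* masks from acl_operator.hpp.

    #include <cstdlib>  // setenv

    void configure_acl_env() {
      // Bitmask of layer types forced back onto the plain Caffe implementation.
      setenv("BYPASSACL", "0x0", 1);
      // Prefer the direct convolution kernels for 1x1 (pad 0) and 3x3 (pad <= 1) convolutions.
      setenv("DIRECTCONV", "1", 1);
      // Ask AclEnableSchedule() to switch Caffe into GPU (OpenCL) mode.
      setenv("ENABLESCHEDULE", "1", 1);
    #ifdef USE_PROFILING
      // Bitmask of MASK_LOG_* profiling categories reported by logtime_util.
      setenv("LOGACL", "0x1", 1);
    #endif
      // Must run before the first ACL layer is created: the values are read once,
      // inside the ACLOperator constructor.
    }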
diff --git a/src/caffe/acl_tensor.cpp b/src/caffe/acl_tensor.cpp
new file mode 100644
index 00000000..1ab82330
--- /dev/null
+++ b/src/caffe/acl_tensor.cpp
@@ -0,0 +1,138 @@
+#include "caffe/acl_tensor.hpp"
+
+namespace caffe {
+
+#ifdef USE_ACL
+template <typename TensorType>
+std::unique_ptr<arm_compute::ITensor> initialise_tensor(arm_compute::TensorInfo &info)
+{
+ auto tensor = cpp14::make_unique<TensorType>();
+ tensor->allocator()->init(info);
+ return std::move(tensor);
+}
+
+template <typename TensorType>
+void tensor_allocate(arm_compute::ITensor &tensor)
+{
+ auto itensor = dynamic_cast<TensorType *>(&tensor);
+ itensor->allocator()->allocate();
+}
+
+Tensor::Tensor(arm_compute::TensorInfo &info) noexcept
+ : _target(TargetHint::DONT_CARE), _info(info), _tensor(nullptr)
+{
+}
+
+Tensor::Tensor(Tensor &&src) noexcept
+ : _target(src._target),
+ _info(std::move(src._info)),
+ _tensor(std::move(src._tensor))
+{
+}
+
+arm_compute::ITensor *Tensor::set_target(TargetHint target)
+{
+ switch(target)
+ {
+#ifdef USE_OPENCL
+ case TargetHint::OPENCL:
+ _tensor = initialise_tensor<arm_compute::CLTensor>(_info);
+ break;
+#endif
+ case TargetHint::NEON:
+ _tensor = initialise_tensor<arm_compute::Tensor>(_info);
+ break;
+ default:
+ break;
+ }
+ _target = target;
+ return _tensor.get();
+}
+
+void Tensor::allocate()
+{
+ switch(_target)
+ {
+#ifdef USE_OPENCL
+ case TargetHint::OPENCL:
+ tensor_allocate<arm_compute::CLTensor>(*_tensor);
+ break;
+#endif
+ case TargetHint::NEON:
+ tensor_allocate<arm_compute::Tensor>(*_tensor);
+ break;
+ default:
+ break;
+ }
+}
+void Tensor::map(bool blocking){
+#ifdef USE_OPENCL
+ if (_target==TargetHint::OPENCL)
+ dynamic_cast<arm_compute::CLTensor *>(tensor())->map(blocking);
+#endif
+}
+void Tensor::unmap(){
+#ifdef USE_OPENCL
+ if (_target==TargetHint::OPENCL)
+ dynamic_cast<arm_compute::CLTensor *>(tensor())->unmap();
+#endif
+}
+
+template <typename SubTensorType, typename ParentTensorType>
+std::unique_ptr<arm_compute::ITensor> initialise_subtensor(arm_compute::ITensor *parent, arm_compute::TensorShape shape, arm_compute::Coordinates coords)
+{
+ auto ptensor = dynamic_cast<ParentTensorType *>(parent);
+ auto subtensor = cpp14::make_unique<SubTensorType>(ptensor, shape, coords);
+ return std::move(subtensor);
+}
+SubTensor::SubTensor(Tensor* parent, arm_compute::TensorShape& tensor_shape, arm_compute::Coordinates& coords) noexcept
+ : _target(TargetHint::DONT_CARE), _tensor_shape(tensor_shape), _coords(coords), _parent(nullptr), _subtensor(nullptr)
+{
+ _parent = parent->tensor();
+ _target = parent->target();
+
+ instantiate_subtensor();
+}
+arm_compute::ITensor *SubTensor::set_target(TargetHint target)
+{
+ return (target == _target) ? _subtensor.get() : nullptr;
+}
+
+arm_compute::ITensor *SubTensor::tensor()
+{
+ return _subtensor.get();
+}
+
+const arm_compute::ITensor *SubTensor::tensor() const
+{
+ return _subtensor.get();
+}
+
+TargetHint SubTensor::target() const
+{
+ return _target;
+}
+
+void SubTensor::allocate()
+{
+ // NOP for sub-tensors
+}
+
+void SubTensor::instantiate_subtensor()
+{
+ switch(_target)
+ {
+#ifdef USE_OPENCL
+ case TargetHint::OPENCL:
+ _subtensor = initialise_subtensor<arm_compute::CLSubTensor, arm_compute::ICLTensor>(_parent, _tensor_shape, _coords);
+ break;
+#endif
+ default:
+ case TargetHint::NEON:
+ _subtensor = initialise_subtensor<arm_compute::SubTensor, arm_compute::ITensor>(_parent, _tensor_shape, _coords);
+ break;
+ }
+}
+
+#endif
+}
diff --git a/src/caffe/common.cpp b/src/caffe/common.cpp
index f1db7dd4..dd800b04 100644
--- a/src/caffe/common.cpp
+++ b/src/caffe/common.cpp
@@ -7,10 +7,6 @@
#include "caffe/common.hpp"
#include "caffe/util/rng.hpp"
-#ifdef USE_ACL
-#include "arm_compute/runtime/CL/CLScheduler.h"
-using namespace arm_compute;
-#endif
namespace caffe {
@@ -58,18 +54,8 @@ void GlobalInit(int* pargc, char*** pargv) {
Caffe::Caffe()
: random_generator_(), mode_(Caffe::CPU),use_mali_gpu_(false),
- solver_count_(1), solver_rank_(0), multiprocess_(false) {
-#ifdef USE_ACL
-
- try {
- CLScheduler::get().default_init();
- }
- catch(std::exception& e)
+ solver_count_(1), solver_rank_(0), multiprocess_(false)
{
- std::cout << "OPENCL initialization failed"<< std::endl;
- }
-
-#endif
}
Caffe::~Caffe() { }
diff --git a/src/caffe/layer_factory.cpp b/src/caffe/layer_factory.cpp
index f9e2908c..5e1011e6 100644
--- a/src/caffe/layer_factory.cpp
+++ b/src/caffe/layer_factory.cpp
@@ -56,7 +56,7 @@ shared_ptr<Layer<Dtype> > GetConvolutionLayer(
ConvolutionParameter conv_param = param.convolution_param();
ConvolutionParameter_Engine engine = conv_param.engine();
#ifdef USE_ACL
- return GetACLConvolutionLayer<Dtype>(param);
+ return shared_ptr<Layer<Dtype> >(new ACLConvolutionLayer<Dtype>(param));
#endif
#ifdef USE_CUDNN
bool use_dilation = false;
diff --git a/src/caffe/layers/acl_absval_layer.cpp b/src/caffe/layers/acl_absval_layer.cpp
index b0b0304d..deea89c7 100644
--- a/src/caffe/layers/acl_absval_layer.cpp
+++ b/src/caffe/layers/acl_absval_layer.cpp
@@ -15,9 +15,9 @@ void ACLAbsValLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
}
template <typename Dtype>
-void ACLAbsValLayer<Dtype>::SetupACLLayer(const vector<Blob<Dtype>*>& bottom,
- const vector<Blob<Dtype>*>& top,ActivationLayerInfo::ActivationFunction type){
- ACLBaseActivationLayer<Dtype>::SetupACLLayer(bottom, top,ActivationLayerInfo::ActivationFunction::ABS);
+void ACLAbsValLayer<Dtype>::SetupACLOperator(const vector<Blob<Dtype>*>& bottom,
+ const vector<Blob<Dtype>*>& top,arm_compute::ActivationLayerInfo::ActivationFunction type){
+ ACLBaseActivationLayer<Dtype>::SetupACLOperator(bottom, top,arm_compute::ActivationLayerInfo::ActivationFunction::ABS);
}
template <typename Dtype>
@@ -28,12 +28,21 @@ void ACLAbsValLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
}
template <typename Dtype>
+bool ACLAbsValLayer<Dtype>::Bypass_acl(const vector<Blob<Dtype>*>& bottom,const vector<Blob<Dtype>*>& top){
+ bool bypass_acl=false;
+ if (this->force_bypass_acl_path_) {
+ bypass_acl=true;
+ }
+ return bypass_acl;
+}
+
+template <typename Dtype>
void ACLAbsValLayer<Dtype>::Forward_cpu(
const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
#ifdef USE_PROFILING
logtime_util log_time(ACL_ABSVAL_INFO);
#endif //USE_PROFILING
- if (this->force_bypass_acl_path_) {
+ if (Bypass_acl(bottom,top)) {
AbsValLayer<Dtype>::Forward_cpu(bottom,top);
return;
}
@@ -46,7 +55,7 @@ void ACLAbsValLayer<Dtype>::Forward_gpu(
#ifdef USE_PROFILING
logtime_util log_time(ACL_ABSVAL_INFO);
#endif //USE_PROFILING
- if (this->force_bypass_acl_path_) {
+ if (Bypass_acl(bottom,top)) {
AbsValLayer<Dtype>::Forward_cpu(bottom,top);
return;
}
diff --git a/src/caffe/layers/acl_base_activation_layer.cpp b/src/caffe/layers/acl_base_activation_layer.cpp
index 1fcd682c..cac524fb 100644
--- a/src/caffe/layers/acl_base_activation_layer.cpp
+++ b/src/caffe/layers/acl_base_activation_layer.cpp
@@ -11,50 +11,25 @@ void ACLBaseActivationLayer<Dtype>::LayerSetUp(
const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
}
template <typename Dtype>
-void ACLBaseActivationLayer<Dtype>::SetupACLLayer(const vector<Blob<Dtype>*>& bottom,
- const vector<Blob<Dtype>*>& top,ActivationLayerInfo::ActivationFunction type){
+void ACLBaseActivationLayer<Dtype>::SetupACLOperator(const vector<Blob<Dtype>*>& bottom,
+ const vector<Blob<Dtype>*>& top,arm_compute::ActivationLayerInfo::ActivationFunction type){
const unsigned int count = bottom[0]->count();
const unsigned int count_ = top[0]->count();
- TensorShape input_shape(count);
- TensorShape output_shape(count_);
- checkreshape(input_shape,Caffe::arm_gpu_mode());
- if (!this->init_layer_) return;
- this->init_layer_=false;
- // Initialize ACL.
- if (Caffe::arm_gpu_mode()) {
- new_gpulayer();
- }else{
- new_cpulayer();
- }
+ arm_compute::TensorShape input_shape(count);
+ arm_compute::TensorShape output_shape(count_);
+ if (is_operator_init_done(input_shape)) return;
+ set_operator_init_done();
- this->force_bypass_acl_path_=false;
- ActivationLayerInfo act_info(type);
+ // Initialize ACL.
+ arm_compute::ActivationLayerInfo act_info(type);
- if(type== ActivationLayerInfo::ActivationFunction::TANH)
- act_info=ActivationLayerInfo(type,1.0,1.0);
-
-
+ if(type== arm_compute::ActivationLayerInfo::ActivationFunction::TANH)
+ act_info=arm_compute::ActivationLayerInfo(type,1.0,1.0);
- if (Caffe::arm_gpu_mode()) {
- Dtype *top_data = top[0]->mutable_gpu_data();
- const Dtype* bottom_data = bottom[0]->gpu_data();
- new_tensor(this->gpu().input,input_shape,(void*)bottom_data);
- new_tensor(this->gpu().output,output_shape,(void*)top_data);
-#ifdef USE_PROFILING
- logtime_util log_time(ACL_CONFIG_INFO);
-#endif //USE_PROFILING
- this->gpu().layer->configure(this->gpu().input,this->gpu().output,act_info);
- }else{
- Dtype *top_data = top[0]->mutable_cpu_data();
- const Dtype* bottom_data = bottom[0]->cpu_data();
- new_tensor(this->cpu().input,input_shape,(void*)bottom_data);
- new_tensor(this->cpu().output,output_shape,(void*)top_data);
-#ifdef USE_PROFILING
- logtime_util log_time(ACL_CONFIG_INFO);
-#endif //USE_PROFILING
- this->cpu().layer->configure(this->cpu().input,this->cpu().output,act_info);
- }
+ new_tensor(input(),input_shape,(void*)InputdataPtr(this,bottom));
+ new_tensor(output(),output_shape,(void*)OutputdataPtr(this,top));
+ acl_configure(activation,this,act_info);
}
template <typename Dtype>
void ACLBaseActivationLayer<Dtype>::Reshape(
@@ -64,27 +39,19 @@ void ACLBaseActivationLayer<Dtype>::Reshape(
template <typename Dtype>
void ACLBaseActivationLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top) {
- if(Caffe::arm_gpu_mode()){
- Forward_gpu(bottom, top);
+ if(isGPUMode()){
+ ACLBaseActivationLayer<Dtype>::Forward_gpu(bottom, top);
return;
}
- Dtype* top_data = top[0]->mutable_cpu_data();
- const Dtype* bottom_data = bottom[0]->cpu_data();
- SetupACLLayer(bottom,top);
- tensor_mem(this->cpu().input,(void*)(bottom_data));
- cpu_run();
- tensor_mem((void*)(top_data),this->cpu().output);
+ SetupACLOperator(bottom,top);
+ caffe::acl_run(this,bottom,top);
}
template <typename Dtype>
void ACLBaseActivationLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top) {
- Dtype* top_data = top[0]->mutable_gpu_data();
- const Dtype* bottom_data = bottom[0]->gpu_data();
- SetupACLLayer(bottom,top);
- tensor_mem(this->gpu().input,(void*)(bottom_data));
- gpu_run();
- tensor_mem((void*)(top_data),this->gpu().output);
+ SetupACLOperator(bottom,top);
+ caffe::acl_run(this,bottom,top);
}
template <typename Dtype>
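
Note: the new SetupACLOperator bodies in this commit guard configuration with is_operator_init_done()/set_operator_init_done(), keyed on the input TensorShape, so the operator is only rebuilt when the shape changes. The following is a small from-scratch sketch of such a guard; it only approximates what the real acl_operator helpers do.

#include <array>
#include <cstdio>

// Hypothetical, simplified shape (the real type is arm_compute::TensorShape).
struct Shape {
  std::array<unsigned, 4> dims;
  bool operator==(const Shape& o) const { return dims == o.dims; }
};

class Operator {
 public:
  // Returns true when the operator was already configured for this shape.
  bool is_operator_init_done(const Shape& s) {
    if (init_done_ && s == cached_shape_) return true;
    cached_shape_ = s;   // remember the shape we are about to configure for
    init_done_ = false;  // a shape change forces reconfiguration
    return false;
  }
  void set_operator_init_done() { init_done_ = true; }

  void Setup(const Shape& input) {
    if (is_operator_init_done(input)) return;  // cheap early-out on reruns
    set_operator_init_done();
    std::printf("configuring operator for %ux%ux%ux%u\n",
                input.dims[0], input.dims[1], input.dims[2], input.dims[3]);
  }

 private:
  bool init_done_ = false;
  Shape cached_shape_{};
};

int main() {
  Operator op;
  Shape a{{224, 224, 3, 1}}, b{{112, 112, 3, 1}};
  op.Setup(a);  // configures
  op.Setup(a);  // skipped, same shape
  op.Setup(b);  // reconfigures, shape changed
}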
diff --git a/src/caffe/layers/acl_base_conv_layer.cpp b/src/caffe/layers/acl_base_conv_layer.cpp
deleted file mode 100644
index e3c58996..00000000
--- a/src/caffe/layers/acl_base_conv_layer.cpp
+++ /dev/null
@@ -1,222 +0,0 @@
-#ifdef USE_ACL
-#include <algorithm>
-#include <vector>
-
-#include "caffe/filler.hpp"
-#include "caffe/layers/acl_conv_layer.hpp"
-
-namespace caffe {
-
-bool use_direct_conv_=false;
-template <typename Dtype,typename GPUConvLayer,typename CPUConvLayer>
-void ACLConvolutionLayer<Dtype,GPUConvLayer,CPUConvLayer>::LayerSetUp(
- const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
- ConvolutionLayer<Dtype>::LayerSetUp(bottom, top);
- this->force_bypass_acl_path_= bypass_acl_class_layer & FLAGS_ENABLE_ACL_CONV;
-}
-
-template <typename Dtype,typename GPUConvLayer,typename CPUConvLayer>
-void ACLConvolutionLayer<Dtype,GPUConvLayer,CPUConvLayer>::SetupACLLayer(const vector<Blob<Dtype>*>& bottom,
- const vector<Blob<Dtype>*>& top){
-
- TensorShape input_shape((unsigned int)bottom[0]->width(), (unsigned int)bottom[0]->height(),(unsigned int)bottom[0]->channels(),(unsigned int)bottom[0]->num());
- ACLBaseLayer<GPUConvLayer,CPUConvLayer>::checkreshape(input_shape,Caffe::arm_gpu_mode());
- if (!this->init_layer_) return;
- this->init_layer_=false;
- // Initialize ACL.
- if (Caffe::arm_gpu_mode()) {
- ACLBaseLayer<GPUConvLayer,CPUConvLayer>::new_gpulayer();
- }else{
- ACLBaseLayer<GPUConvLayer,CPUConvLayer>::new_cpulayer();
- }
- this->force_bypass_acl_path_=false;
- ConvolutionParameter conv_param = this->layer_param_.convolution_param();
- int stride_x =this->stride_.mutable_cpu_data()[1];
- int stride_y =this->stride_.mutable_cpu_data()[0];
- int pad_x=this->pad_.mutable_cpu_data()[1];
- int pad_y=this->pad_.mutable_cpu_data()[0];
- unsigned int kernel_x=this->kernel_shape_.mutable_cpu_data()[1];
- unsigned int kernel_y=this->kernel_shape_.mutable_cpu_data()[0];
- PadStrideInfo conv_info(stride_x,stride_y,pad_x,pad_y);
- TensorShape weights_shape(kernel_x,kernel_y,(unsigned int)this->channels_, (unsigned int)this->num_output_);
- TensorShape biases_shape ((unsigned int)this->num_output_);
- TensorShape output_shape((unsigned int)top[0]->width(), (unsigned int)top[0]->height(),(unsigned int)top[0]->channels(),(unsigned int)top[0]->num());
-
- if (Caffe::arm_gpu_mode()) {
- Dtype *top_data = top[0]->mutable_gpu_data();
- const Dtype* bottom_data = bottom[0]->gpu_data();
- //[kernel_x, kernel_y, IFM, OFM]
- ACLBaseLayer<GPUConvLayer,CPUConvLayer>::new_tensor(this->gpu().weights,weights_shape,(void*)(this->blobs_[0].get()->mutable_gpu_data()));
- ACLBaseLayer<GPUConvLayer,CPUConvLayer>::tensor_mem(this->gpu().weights,(void*)(this->blobs_[0].get()->mutable_gpu_data()));
- //[OFM]
- if (this->bias_term_) {
- ACLBaseLayer<GPUConvLayer,CPUConvLayer>::new_tensor(this->gpu().biases,biases_shape,(void*)(this->blobs_[1].get()->mutable_gpu_data()));
- ACLBaseLayer<GPUConvLayer,CPUConvLayer>::tensor_mem(this->gpu().biases,(void*)(this->blobs_[1].get()->mutable_gpu_data()));
- }
-
- //[width, height, IFM]
- ACLBaseLayer<GPUConvLayer,CPUConvLayer>::new_tensor(this->gpu().input,input_shape,(void*)bottom_data);
- //[width, height, OFM]
- ACLBaseLayer<GPUConvLayer,CPUConvLayer>::new_tensor(this->gpu().output,output_shape,(void*)top_data);
-#ifdef USE_PROFILING
- {
- logtime_util log_time(ACL_CONFIG_INFO);
-#endif //USE_PROFILING
- this->gpu().layer->configure(this->gpu().input,this->gpu().weights,this->gpu().biases,this->gpu().output,conv_info);
-#ifdef USE_PROFILING
- }
-#endif //USE_PROFILING
-#ifdef USE_CONV_CACHE
- for(int i = 0; i < 16; ++i){
- fprintf(stderr, "<GPU>check cache[%d]\n", i);
- if(this->gpu().cache.layer[i] == nullptr){
- this->gpu().cache.layer[i] = this->gpu().layer;
- this->gpu().cache.input[i] = this->gpu().input;
- this->gpu().cache.output[i] = this->gpu().output;
- this->gpu().cache.weights[i] = this->gpu().weights;
- this->gpu().cache.biases[i] = this->gpu().biases;
- break;
- }
- }
-#endif //USE_CONV_CACHE
- }else{
- Dtype *top_data = top[0]->mutable_cpu_data();
- const Dtype* bottom_data = bottom[0]->cpu_data();
- //[kernel_x, kernel_y, IFM, OFM]
- ACLBaseLayer<GPUConvLayer,CPUConvLayer>::new_tensor(this->cpu().weights,weights_shape,(void*)(this->blobs_[0].get()->mutable_cpu_data()));
- ACLBaseLayer<GPUConvLayer,CPUConvLayer>::tensor_mem(this->cpu().weights,(void*)(this->blobs_[0].get()->mutable_cpu_data()));
- //[OFM]
- if (this->bias_term_) {
- ACLBaseLayer<GPUConvLayer,CPUConvLayer>::new_tensor(this->cpu().biases,biases_shape,(void*)(this->blobs_[1].get()->mutable_cpu_data()));
- ACLBaseLayer<GPUConvLayer,CPUConvLayer>::tensor_mem(this->cpu().biases,(void*)(this->blobs_[1].get()->mutable_cpu_data()));
- }
-
- //[width, height, IFM]
- ACLBaseLayer<GPUConvLayer,CPUConvLayer>::new_tensor(this->cpu().input,input_shape,(void*)bottom_data);
- //[width, height, OFM]
- ACLBaseLayer<GPUConvLayer,CPUConvLayer>::new_tensor(this->cpu().output,output_shape,(void*)top_data);
-#ifdef USE_PROFILING
- {
- logtime_util log_time(ACL_CONFIG_INFO);
-#endif //USE_PROFILING
- this->cpu().layer->configure(this->cpu().input,this->cpu().weights,this->cpu().biases,this->cpu().output,conv_info);
-#ifdef USE_PROFILING
- }
-#endif //USE_PROFILING
-#ifdef USE_CONV_CACHE
- for(int i = 0; i < 16; ++i){
- fprintf(stderr, "<CPU>check cache[%d]\n", i);
- if(this->cpu().cache.layer[i] == nullptr){
- this->cpu().cache.layer[i] = this->cpu().layer;
- this->cpu().cache.input[i] = this->cpu().input;
- this->cpu().cache.output[i] = this->cpu().output;
- this->cpu().cache.weights[i] = this->cpu().weights;
- this->cpu().cache.biases[i] = this->cpu().biases;
- break;
- }
- }
-#endif //USE_CONV_CACHE
- }
-}
-template <typename Dtype,typename GPUConvLayer,typename CPUConvLayer>
-void ACLConvolutionLayer<Dtype,GPUConvLayer,CPUConvLayer>::Reshape(
- const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
- ConvolutionLayer<Dtype>::Reshape(bottom, top);
-}
-
-template <typename Dtype,typename GPUConvLayer,typename CPUConvLayer>
-void ACLConvolutionLayer<Dtype,GPUConvLayer,CPUConvLayer>::Forward_cpu(
- const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
- if(Caffe::arm_gpu_mode()){
- Forward_gpu(bottom, top);
- return;
- }
-#ifdef USE_PROFILING
- logtime_util log_time(ACL_CONV_INFO);
-#endif //USE_PROFILING
- if (this->force_bypass_acl_path_|| this->group_!=1) {
- ConvolutionLayer<Dtype>::Forward_cpu(bottom,top);
- return;
- }
-
- ConvolutionParameter conv_param = this->layer_param_.convolution_param();
- if (conv_param.kernel_size_size()>2 || this->num_spatial_axes_>2 || this->num_spatial_axes_==0) {
- ConvolutionLayer<Dtype>::Forward_cpu(bottom,top);
- return;
- }
- /* check dilation */
- int dilated=0;
-
- for(int i=0;i<this->num_spatial_axes_;i++)
- {
- const int *p=this->dilation_.cpu_data();
-
- if(p[i]!=1)
- dilated=1;
- }
- if(dilated) {
- ConvolutionLayer<Dtype>::Forward_cpu(bottom,top);
- return;
- }
-
- SetupACLLayer(bottom,top);
- for (int i = 0; i < bottom.size(); ++i) {
- const Dtype* bottom_data = bottom[i]->cpu_data();
- Dtype* top_data = top[i]->mutable_cpu_data();
- ACLBaseLayer<GPUConvLayer,CPUConvLayer>::tensor_mem(this->cpu().input,(void*)bottom_data);
- ACLBaseLayer<GPUConvLayer,CPUConvLayer>::cpu_run();
- ACLBaseLayer<GPUConvLayer,CPUConvLayer>::tensor_mem((void*)top_data,this->cpu().output);
- }
-}
-
-template <typename Dtype,typename GPUConvLayer,typename CPUConvLayer>
-void ACLConvolutionLayer<Dtype,GPUConvLayer,CPUConvLayer>::Forward_gpu(
- const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
-#ifdef USE_PROFILING
- logtime_util log_time(ACL_CONV_INFO);
-#endif //USE_PROFILING
- ConvolutionParameter conv_param = this->layer_param_.convolution_param();
- if (this->force_bypass_acl_path_|| this->group_!=1) {
- ConvolutionLayer<Dtype>::Forward_cpu(bottom,top);
- return;
- }
- if (conv_param.kernel_size_size()>2 || this->num_spatial_axes_>2 ) {
- ConvolutionLayer<Dtype>::Forward_cpu(bottom,top);
- return;
- }
- /* check dilation */
- int dilated=0;
-
- for(int i=0;i<this->num_spatial_axes_;i++)
- {
- const int *p=this->dilation_.gpu_data();
-
- if(p[i]!=1)
- dilated=1;
- }
-
- if(dilated) {
- ConvolutionLayer<Dtype>::Forward_cpu(bottom,top);
- return;
- }
- SetupACLLayer(bottom,top);
- for (int i = 0; i < bottom.size(); ++i) {
- const Dtype* bottom_data = bottom[i]->gpu_data();
- Dtype* top_data = top[i]->mutable_gpu_data();
- ACLBaseLayer<GPUConvLayer,CPUConvLayer>::tensor_mem(this->gpu().input,(void*)bottom_data);
- ACLBaseLayer<GPUConvLayer,CPUConvLayer>::gpu_run();
- ACLBaseLayer<GPUConvLayer,CPUConvLayer>::tensor_mem((void*)top_data,this->gpu().output);
- }
-}
-
-template <typename Dtype,typename GPUConvLayer,typename CPUConvLayer>
-ACLConvolutionLayer<Dtype,GPUConvLayer,CPUConvLayer>::~ACLConvolutionLayer() {
-}
-
-#ifdef USE_ACL
-INSTANTIATE_CONV_CLASS(ACLConvolutionLayer,CLConvolutionLayer,NEDirectConvolutionLayer);
-INSTANTIATE_CONV_CLASS(ACLConvolutionLayer,CLConvolutionLayer,NEConvolutionLayer);
-#endif
-
-} // namespace caffe
-#endif // USE_ACL
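
Note: both the deleted acl_base_conv_layer.cpp above and the new acl_conv_layer.cpp later in this patch read stride/pad/kernel with index [1] for the x component and [0] for the y component, because Caffe stores spatial parameters as [height, width] while PadStrideInfo takes x (width) first. A minimal sketch of that index mapping, using hypothetical struct names rather than the real Caffe/ACL types:

#include <cstdio>

// Hypothetical holder mirroring Caffe's [dim0, dim1] = [height, width] storage.
struct CaffeConvParams {
  int kernel[2];  // {kernel_h, kernel_w}
  int stride[2];  // {stride_h, stride_w}
  int pad[2];     // {pad_h, pad_w}
};

struct PadStride {  // stand-in for arm_compute::PadStrideInfo's constructor args
  int stride_x, stride_y, pad_x, pad_y;
};

PadStride ToAclPadStride(const CaffeConvParams& p) {
  // Index 1 is the width/x component, index 0 the height/y component,
  // matching stride_.cpu_data()[1] / [0] in the layer code.
  return PadStride{p.stride[1], p.stride[0], p.pad[1], p.pad[0]};
}

int main() {
  CaffeConvParams p{{3, 5}, {2, 1}, {1, 0}};
  PadStride info = ToAclPadStride(p);
  std::printf("stride_x=%d stride_y=%d pad_x=%d pad_y=%d\n",
              info.stride_x, info.stride_y, info.pad_x, info.pad_y);
  // prints stride_x=1 stride_y=2 pad_x=0 pad_y=1
}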
diff --git a/src/caffe/layers/acl_batch_norm_layer.cpp b/src/caffe/layers/acl_batch_norm_layer.cpp
index a6bc16d1..15df15cb 100644
--- a/src/caffe/layers/acl_batch_norm_layer.cpp
+++ b/src/caffe/layers/acl_batch_norm_layer.cpp
@@ -12,94 +12,43 @@ void ACLBatchNormLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
this->force_bypass_acl_path_= bypass_acl_class_layer & FLAGS_ENABLE_ACL_BN;
}
template <typename Dtype>
-void ACLBatchNormLayer<Dtype>::SetupACLLayer(const vector<Blob<Dtype>*>& bottom,
+void ACLBatchNormLayer<Dtype>::SetupACLOperator(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top){
- if (!this->init_layer_) return;
- this->init_layer_=false;
- // Initialize ACL.
- if (Caffe::arm_gpu_mode()) {
- new_gpulayer();
- }else{
- new_cpulayer();
- }
-
- this->force_bypass_acl_path_=false;
+ arm_compute::TensorShape in_shape ((unsigned int)bottom[0]->width(), (unsigned int)bottom[0]->height(),(unsigned int)bottom[0]->channels(),(unsigned int)bottom[0]->num());
+ if (is_operator_init_done(in_shape)) return;
+ set_operator_init_done();
- TensorShape in_shape ((unsigned int)bottom[0]->width(), (unsigned int)bottom[0]->height(),(unsigned int)bottom[0]->channels(),(unsigned int)bottom[0]->num());
- TensorShape out_shape((unsigned int)top[0]->width(), (unsigned int)top[0]->height(),(unsigned int)top[0]->channels(),(unsigned int)top[0]->num());
- TensorShape mean_shape((unsigned int)this->channels_);
- TensorShape var_shape=mean_shape;
- TensorShape beta_shape=mean_shape;
- TensorShape gamma_shape=mean_shape;
+ // Initialize ACL.
+ arm_compute::TensorShape out_shape((unsigned int)top[0]->width(), (unsigned int)top[0]->height(),(unsigned int)top[0]->channels(),(unsigned int)top[0]->num());
+ arm_compute::TensorShape mean_shape((unsigned int)this->channels_);
+ arm_compute::TensorShape var_shape=mean_shape;
+ arm_compute::TensorShape beta_shape=mean_shape;
+ arm_compute::TensorShape gamma_shape=mean_shape;
Dtype beta_val[beta_shape.total_size()];
Dtype gamma_val[gamma_shape.total_size()];
-
for (int i=0;i<beta_shape.total_size();++i) {
beta_val[i]=0.0;
}
for (int i=0;i<gamma_shape.total_size();++i) {
gamma_val[i]=1.0;
}
- if (Caffe::arm_gpu_mode()) {
- Dtype *top_data = top[0]->mutable_gpu_data();
- const Dtype* bottom_data = bottom[0]->gpu_data();
- // use the stored mean/variance estimates.
- const Dtype scale_factor = this->blobs_[2]->cpu_data()[0] == 0 ?
- 0 : 1 / this->blobs_[2]->cpu_data()[0];
- caffe_cpu_scale(this->variance_.count(), scale_factor,
- this->blobs_[0]->gpu_data(), this->mean_.mutable_gpu_data());
- caffe_cpu_scale(this->variance_.count(), scale_factor,
- this->blobs_[1]->gpu_data(), this->variance_.mutable_gpu_data());
- new_tensor(this->gpu().input,in_shape,(void*)bottom_data);
- new_tensor(this->gpu().output,out_shape,(void*)top_data);
- new_tensor(this->gpu().mean,mean_shape);
- new_tensor(this->gpu().var,var_shape);
- new_tensor(this->gpu().beta,beta_shape);
- new_tensor(this->gpu().gamma,gamma_shape);
- tensor_mem(this->gpu().mean,(void*)this->mean_.mutable_gpu_data());
- tensor_mem(this->gpu().var,(void*)this->variance_.mutable_gpu_data());
- tensor_mem(this->gpu().beta,(void*)beta_val);
- tensor_mem(this->gpu().gamma,(void*)gamma_val);
- this->gpu().mean->commit();
- this->gpu().var->commit();
- this->gpu().beta->commit();
- this->gpu().gamma->commit();
-#ifdef USE_PROFILING
- logtime_util log_time(ACL_CONFIG_INFO);
-#endif //USE_PROFILING
- this->gpu().layer->configure(this->gpu().input,this->gpu().output,this->gpu().mean,this->gpu().var,this->gpu().beta,this->gpu().gamma,this->eps_);
- }else{
- Dtype *top_data = top[0]->mutable_cpu_data();
- const Dtype* bottom_data = bottom[0]->cpu_data();
- // use the stored mean/variance estimates.
- const Dtype scale_factor = this->blobs_[2]->cpu_data()[0] == 0 ?
- 0 : 1 / this->blobs_[2]->cpu_data()[0];
- caffe_cpu_scale(this->variance_.count(), scale_factor,
- this->blobs_[0]->cpu_data(), this->mean_.mutable_cpu_data());
- caffe_cpu_scale(this->variance_.count(), scale_factor,
- this->blobs_[1]->cpu_data(), this->variance_.mutable_cpu_data());
- new_tensor(this->cpu().input,in_shape,(void*)bottom_data);
- new_tensor(this->cpu().output,out_shape,(void*)top_data);
- new_tensor(this->cpu().mean,mean_shape);
- new_tensor(this->cpu().var,var_shape);
- new_tensor(this->cpu().beta,beta_shape);
- new_tensor(this->cpu().gamma,gamma_shape);
- tensor_mem(this->cpu().mean,(void*)this->mean_.mutable_cpu_data());
- tensor_mem(this->cpu().var,(void*)this->variance_.mutable_cpu_data());
- tensor_mem(this->cpu().beta,(void*)beta_val);
- tensor_mem(this->cpu().gamma,(void*)gamma_val);
- this->cpu().mean->commit();
- this->cpu().var->commit();
- this->cpu().beta->commit();
- this->cpu().gamma->commit();
+ new_tensor(input(),in_shape,InputdataPtr(this,bottom));
+ new_tensor(output(),out_shape,OutputdataPtr(this,top));
+ // use the stored mean/variance estimates.
+ const Dtype scale_factor = this->blobs_[2]->cpu_data()[0] == 0 ?
+ 0 : 1 / this->blobs_[2]->cpu_data()[0];
+ caffe_cpu_scale(this->variance_.count(), scale_factor,
+ this->blobs_[0]->cpu_data(), GetDataPtr(this,&this->mean_));
+ caffe_cpu_scale(this->variance_.count(), scale_factor,
+ this->blobs_[1]->cpu_data(), GetDataPtr(this,&this->variance_));
-#ifdef USE_PROFILING
- logtime_util log_time(ACL_CONFIG_INFO);
-#endif //USE_PROFILING
- this->cpu().layer->configure(this->cpu().input,this->cpu().output,this->cpu().mean,this->cpu().var,this->cpu().beta,this->cpu().gamma,this->eps_);
- }
+ new_tensor(mean(),mean_shape,GetDataPtr(this,&this->mean_));
+ new_tensor(var(),var_shape,GetDataPtr(this,&this->variance_));
+ new_tensor(beta(),beta_shape,(void*)beta_val,true);
+ new_tensor(gamma(),gamma_shape,(void*)gamma_val,true);
+ acl_configure(bn,this,this->eps_);
}
template <typename Dtype>
void ACLBatchNormLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
@@ -109,43 +58,48 @@ void ACLBatchNormLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
}
template <typename Dtype>
+bool ACLBatchNormLayer<Dtype>::Bypass_acl(const vector<Blob<Dtype>*>& bottom,
+ const vector<Blob<Dtype>*>& top){
+ bool bypass_acl=false;
+ if (this->force_bypass_acl_path_||!this->use_global_stats_) {
+ bypass_acl=true;
+ }
+ if (isScheduleEnable()) {
+ bypass_acl=true;
+ }
+ return bypass_acl;
+}
+
+template <typename Dtype>
void ACLBatchNormLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top) {
- if(Caffe::arm_gpu_mode()){
+ if(isGPUMode()){
Forward_gpu(bottom, top);
return;
}
#ifdef USE_PROFILING
logtime_util log_time(ACL_BN_INFO);
#endif //USE_PROFILING
- if (this->force_bypass_acl_path_||!this->use_global_stats_) {
+ if (Bypass_acl(bottom,top)) {
BatchNormLayer<Dtype>::Forward_cpu(bottom,top);
return;
}
- const Dtype* bottom_data = bottom[0]->cpu_data();
- Dtype* top_data = top[0]->mutable_cpu_data();
- SetupACLLayer(bottom,top);
- tensor_mem(this->cpu().input,(void*)(bottom_data));
- cpu_run();
- tensor_mem((void*)(top_data),this->cpu().output);
+ SetupACLOperator(bottom,top);
+ caffe::acl_run(this,bottom,top);
}
template <typename Dtype>
void ACLBatchNormLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top) {
- if (this->force_bypass_acl_path_||!this->use_global_stats_) {
- BatchNormLayer<Dtype>::Forward_cpu(bottom,top);
- return;
- }
#ifdef USE_PROFILING
logtime_util log_time(ACL_BN_INFO);
#endif //USE_PROFILING
- const Dtype* bottom_data = bottom[0]->gpu_data();
- Dtype* top_data = top[0]->mutable_gpu_data();
- SetupACLLayer(bottom,top);
- tensor_mem(this->gpu().input,(void*)(bottom_data));
- gpu_run();
- tensor_mem((void*)(top_data),this->gpu().output);
+ if (Bypass_acl(bottom,top)) {
+ BatchNormLayer<Dtype>::Forward_cpu(bottom,top);
+ return;
+ }
+ SetupACLOperator(bottom,top);
+ caffe::acl_run(this,bottom,top);
}
template <typename Dtype>
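
Note: the batch-norm setup above rescales the stored running sums by scale_factor = 1 / blobs_[2][0] before handing them to ACL, and feeds constant beta = 0 and gamma = 1 so the ACL kernel performs pure normalization with the global statistics. A standalone sketch of that preparation step (plain C++, no Caffe/ACL types; PrepareGlobalStats is a made-up name):

#include <cmath>
#include <cstdio>
#include <vector>

// Rescale accumulated sums into usable mean/variance, as the layer does with
// caffe_cpu_scale: the stored blobs hold moving-average-weighted sums.
void PrepareGlobalStats(const std::vector<float>& mean_sum,
                        const std::vector<float>& var_sum,
                        float accumulation,            // blobs_[2][0]
                        std::vector<float>* mean,
                        std::vector<float>* variance) {
  const float scale = accumulation == 0.f ? 0.f : 1.f / accumulation;
  mean->resize(mean_sum.size());
  variance->resize(var_sum.size());
  for (size_t i = 0; i < mean_sum.size(); ++i) (*mean)[i] = scale * mean_sum[i];
  for (size_t i = 0; i < var_sum.size(); ++i) (*variance)[i] = scale * var_sum[i];
}

int main() {
  std::vector<float> mean_sum{2.0f, 4.0f}, var_sum{8.0f, 2.0f}, mean, var;
  PrepareGlobalStats(mean_sum, var_sum, /*accumulation=*/2.0f, &mean, &var);
  // With beta=0 and gamma=1 the normalized output is (x - mean) / sqrt(var + eps).
  const float eps = 1e-5f, x = 3.0f;
  std::printf("mean=%.1f var=%.1f y=%.4f\n", mean[0], var[0],
              (x - mean[0]) / std::sqrt(var[0] + eps));
}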
diff --git a/src/caffe/layers/acl_bnll_layer.cpp b/src/caffe/layers/acl_bnll_layer.cpp
index 86f09831..20903cda 100644
--- a/src/caffe/layers/acl_bnll_layer.cpp
+++ b/src/caffe/layers/acl_bnll_layer.cpp
@@ -14,9 +14,9 @@ void ACLBNLLLayer<Dtype>::LayerSetUp(
this->force_bypass_acl_path_= bypass_acl_class_layer & FLAGS_ENABLE_ACL_BNLL;
}
template <typename Dtype>
-void ACLBNLLLayer<Dtype>::SetupACLLayer(const vector<Blob<Dtype>*>& bottom,
- const vector<Blob<Dtype>*>& top, ActivationLayerInfo::ActivationFunction type){
- ACLBaseActivationLayer<Dtype>::SetupACLLayer(bottom, top,ActivationLayerInfo::ActivationFunction::SOFT_RELU);
+void ACLBNLLLayer<Dtype>::SetupACLOperator(const vector<Blob<Dtype>*>& bottom,
+ const vector<Blob<Dtype>*>& top, arm_compute::ActivationLayerInfo::ActivationFunction type){
+ ACLBaseActivationLayer<Dtype>::SetupACLOperator(bottom, top,arm_compute::ActivationLayerInfo::ActivationFunction::SOFT_RELU);
}
template <typename Dtype>
void ACLBNLLLayer<Dtype>::Reshape(
@@ -26,12 +26,21 @@ void ACLBNLLLayer<Dtype>::Reshape(
}
template <typename Dtype>
+bool ACLBNLLLayer<Dtype>::Bypass_acl(const vector<Blob<Dtype>*>& bottom,const vector<Blob<Dtype>*>& top){
+ bool bypass_acl=false;
+ if (this->force_bypass_acl_path_) {
+ bypass_acl=true;
+ }
+ return bypass_acl;
+}
+
+template <typename Dtype>
void ACLBNLLLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top) {
#ifdef USE_PROFILING
logtime_util log_time(ACL_BNLL_INFO);
#endif //USE_PROFILING
- if (this->force_bypass_acl_path_) {
+ if (Bypass_acl(bottom,top)) {
BNLLLayer<Dtype>::Forward_cpu(bottom,top);
return;
}
@@ -44,7 +53,7 @@ void ACLBNLLLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
#ifdef USE_PROFILING
logtime_util log_time(ACL_BNLL_INFO);
#endif //USE_PROFILING
- if (this->force_bypass_acl_path_) {
+ if (Bypass_acl(bottom,top)) {
BNLLLayer<Dtype>::Forward_cpu(bottom,top);
return;
}
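
Note: the BNLL layer above forwards its setup to the base activation layer with SOFT_RELU, which works because BNLL computes log(1 + e^x), the same function ACL documents for its soft-ReLU activation. A quick numerical check (standalone code, not the ACL kernel itself):

#include <cmath>
#include <cstdio>

// BNLL as computed by the reference Caffe layer (numerically stabilized form).
float bnll(float x) {
  return x > 0 ? x + std::log1p(std::exp(-x)) : std::log1p(std::exp(x));
}

// Soft-ReLU: f(x) = log(1 + e^x).
float soft_relu(float x) { return std::log1p(std::exp(x)); }

int main() {
  for (float x : {-4.f, -1.f, 0.f, 1.f, 4.f})
    std::printf("x=% .1f  bnll=%.6f  soft_relu=%.6f\n", x, bnll(x), soft_relu(x));
  // The two columns agree, which is why SetupACLOperator maps BNLL to SOFT_RELU.
}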
diff --git a/src/caffe/layers/acl_concat_layer.cpp b/src/caffe/layers/acl_concat_layer.cpp
index 57a14126..d849a9e0 100644
--- a/src/caffe/layers/acl_concat_layer.cpp
+++ b/src/caffe/layers/acl_concat_layer.cpp
@@ -9,90 +9,66 @@ template <typename Dtype>
void ACLConcatLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top) {
ConcatLayer<Dtype>::LayerSetUp(bottom, top);
- //this->force_bypass_acl_path_= bypass_acl_class_layer & FLAGS_ENABLE_ACL_CONCAT;
- this->force_bypass_acl_path_= true;
+ this->force_bypass_acl_path_= bypass_acl_class_layer & FLAGS_ENABLE_ACL_CONCAT;
}
template <typename Dtype>
-void ACLConcatLayer<Dtype>::SetupACLLayer(const vector<Blob<Dtype>*>& bottom,
+void ACLConcatLayer<Dtype>::SetupACLOperator(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top){
unsigned int channels=0;
for (int i = 0; i < bottom.size(); ++i) {
channels+=bottom[i]->channels();
}
- TensorShape out_shape((unsigned int)top[0]->width(), (unsigned int)top[0]->height(),channels);
+ arm_compute::TensorShape out_shape((unsigned int)top[0]->width(), (unsigned int)top[0]->height(),channels);
+
+ if (is_operator_init_done(out_shape,tensor_output)) return;
+ set_operator_init_done();
- if (!this->init_layer_) return;
- this->init_layer_=false;
// Initialize ACL.
- if (Caffe::arm_gpu_mode()) {
- new_gpulayer();
- }else{
- new_cpulayer();
+ std::vector<arm_compute::TensorShape> shapes;
+ for (int i = 0; i < bottom.size(); ++i) {
+ arm_compute::TensorShape in_shape((unsigned int)bottom[i]->width(), (unsigned int)bottom[i]->height(),(unsigned int)bottom[i]->channels());
+ new_tensor(cinput(i),in_shape,InputdataPtr(this,bottom,i));
}
+ new_tensor(output(),out_shape,OutputdataPtr(this,top));
+ acl_configure(concat,this,bottom.size());
- this->force_bypass_acl_path_=false;
-
- if (Caffe::arm_gpu_mode()) {
- Dtype *top_data = top[0]->mutable_gpu_data();
- for (int i = 0; i < bottom.size(); ++i) {
- const Dtype* bottom_data = bottom[i]->gpu_data();
- TensorShape vec_shape((unsigned int)bottom[i]->width(), (unsigned int)bottom[i]->height(),(unsigned int)bottom[0]->channels());
- GPUTensor *vector;
- new_tensor(vector,vec_shape,(void*)bottom_data);
- tensor_mem(vector,(void*)bottom_data);
- vector->commit();
- gpu_vectors.push_back(vector);
- }
- new_tensor(this->gpu().output,out_shape,(void*)top_data);
-#ifdef USE_PROFILING
- logtime_util log_time(ACL_CONFIG_INFO);
-#endif //USE_PROFILING
- this->gpu().layer->configure(gpu_vectors,this->gpu().output);
- }else{
- Dtype *top_data = top[0]->mutable_cpu_data();
- for (int i = 0; i < bottom.size(); ++i) {
- const Dtype* bottom_data = bottom[i]->cpu_data();
- TensorShape vec_shape((unsigned int)bottom[i]->width(), (unsigned int)bottom[i]->height(),(unsigned int)bottom[0]->channels());
- CPUTensor *vector;
- new_tensor(vector,vec_shape,(void*)bottom_data);
- tensor_mem(vector,(void*)bottom_data);
- vector->commit();
- cpu_vectors.push_back(vector);
- }
- new_tensor(this->cpu().output,out_shape,(void*)top_data);
-#ifdef USE_PROFILING
- logtime_util log_time(ACL_CONFIG_INFO);
-#endif //USE_PROFILING
- this->cpu().layer->configure(cpu_vectors,this->cpu().output);
- }
}
template <typename Dtype>
void ACLConcatLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top) {
- ConcatLayer<Dtype>::Reshape(bottom, top);
+ ConcatLayer<Dtype>::Reshape(bottom, top);
+}
+template <typename Dtype>
+bool ACLConcatLayer<Dtype>::Bypass_acl(const vector<Blob<Dtype>*>& bottom,const vector<Blob<Dtype>*>& top){
+ bool bypass_acl=false;
+ if (this->force_bypass_acl_path_||this->concat_axis_==0) {
+ bypass_acl=true;
+ }
+ if(isScheduleEnable()){
+ bypass_acl=true;
+ }
+ return bypass_acl;
}
template <typename Dtype>
void ACLConcatLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top) {
- if(Caffe::arm_gpu_mode()){
+ if(isGPUMode()){
Forward_gpu(bottom, top);
return;
}
#ifdef USE_PROFILING
logtime_util log_time(ACL_CONCAT_INFO);
#endif //USE_PROFILING
- if (this->force_bypass_acl_path_||this->concat_axis_==0) {
+ if (Bypass_acl(bottom,top)) {
ConcatLayer<Dtype>::Forward_cpu(bottom,top);
return;
}
- Dtype* top_data = top[0]->mutable_cpu_data();
- SetupACLLayer(bottom,top);
- cpu_run();
- tensor_mem((void*)(top_data),this->cpu().output);
+ SetupACLOperator(bottom,top);
+ caffe::acl_run(this,bottom,top,false);
}
template <typename Dtype>
@@ -101,27 +77,16 @@ void ACLConcatLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
#ifdef USE_PROFILING
logtime_util log_time(ACL_CONCAT_INFO);
#endif //USE_PROFILING
- if (this->force_bypass_acl_path_||this->concat_axis_==0) {
+ if (Bypass_acl(bottom,top)) {
ConcatLayer<Dtype>::Forward_cpu(bottom,top);
return;
}
- Dtype* top_data = top[0]->mutable_gpu_data();
- SetupACLLayer(bottom,top);
- gpu_run();
- tensor_mem((void*)(top_data),this->gpu().output);
+ SetupACLOperator(bottom,top);
+ caffe::acl_run(this,bottom,top,false);
}
template <typename Dtype>
ACLConcatLayer<Dtype>::~ACLConcatLayer() {
- if(this->force_bypass_acl_path_)return;
- for (int i =0; i < cpu_vectors.size(); i ++) {
- delete cpu_vectors[i];
- }
- for (int i =0; i < gpu_vectors.size(); i ++) {
- delete gpu_vectors[i];
- }
- cpu_vectors.erase(cpu_vectors.begin());
- gpu_vectors.erase(gpu_vectors.begin());
}
INSTANTIATE_CLASS(ACLConcatLayer);
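
Note: the concat setup above builds one input tensor per bottom blob and a single output tensor whose channel count is the sum over the inputs, with width and height unchanged. A sketch of just that shape calculation (BlobShape and ConcatOutputShape are illustrative names):

#include <cstdio>
#include <vector>

// Hypothetical blob shape; only the fields used by the concat setup are kept.
struct BlobShape { unsigned width, height, channels; };

// Channel-axis concatenation: keep width/height, sum channels, mirroring the
// loop at the top of ACLConcatLayer::SetupACLOperator.
BlobShape ConcatOutputShape(const std::vector<BlobShape>& bottoms) {
  BlobShape out{bottoms[0].width, bottoms[0].height, 0};
  for (const BlobShape& b : bottoms) out.channels += b.channels;
  return out;
}

int main() {
  std::vector<BlobShape> bottoms{{13, 13, 256}, {13, 13, 128}, {13, 13, 64}};
  BlobShape out = ConcatOutputShape(bottoms);
  std::printf("%ux%ux%u\n", out.width, out.height, out.channels);  // 13x13x448
}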
diff --git a/src/caffe/layers/acl_conv_layer.cpp b/src/caffe/layers/acl_conv_layer.cpp
new file mode 100644
index 00000000..5cc6fcd9
--- /dev/null
+++ b/src/caffe/layers/acl_conv_layer.cpp
@@ -0,0 +1,147 @@
+#ifdef USE_ACL
+#include <algorithm>
+#include <vector>
+#include "caffe/filler.hpp"
+#include "caffe/layers/acl_conv_layer.hpp"
+
+namespace caffe {
+
+bool use_direct_conv_=false;
+template <typename Dtype>
+void ACLConvolutionLayer<Dtype>::LayerSetUp(
+ const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
+ ConvolutionLayer<Dtype>::LayerSetUp(bottom, top);
+ this->force_bypass_acl_path_= bypass_acl_class_layer & FLAGS_ENABLE_ACL_CONV;
+}
+
+template <typename Dtype>
+void ACLConvolutionLayer<Dtype>::SetupACLOperator(const vector<Blob<Dtype>*>& bottom,
+ const vector<Blob<Dtype>*>& top){
+
+ arm_compute::TensorShape input_shape((unsigned int)bottom[0]->width(), (unsigned int)bottom[0]->height(),(unsigned int)bottom[0]->channels(),(unsigned int)bottom[0]->num());
+ if (is_operator_init_done(input_shape)) return;
+ set_operator_init_done();
+
+ // Initialize ACL.
+ ConvolutionParameter conv_param = this->layer_param_.convolution_param();
+ int stride_x =this->stride_.mutable_cpu_data()[1];
+ int stride_y =this->stride_.mutable_cpu_data()[0];
+ int pad_x=this->pad_.mutable_cpu_data()[1];
+ int pad_y=this->pad_.mutable_cpu_data()[0];
+ unsigned int kernel_x=this->kernel_shape_.mutable_cpu_data()[1];
+ unsigned int kernel_y=this->kernel_shape_.mutable_cpu_data()[0];
+ arm_compute::PadStrideInfo conv_info(stride_x,stride_y,pad_x,pad_y);
+ arm_compute::TensorShape weights_shape(kernel_x,kernel_y,(unsigned int)this->channels_/this->group_, (unsigned int)this->num_output_);
+ arm_compute::TensorShape biases_shape ((unsigned int)this->num_output_);
+ arm_compute::TensorShape output_shape((unsigned int)top[0]->width(), (unsigned int)top[0]->height(),(unsigned int)top[0]->channels(),(unsigned int)top[0]->num());
+ group()=this->group_;
+
+ //[kernel_x, kernel_y, IFM, OFM]
+ new_tensor(weights(),weights_shape,GetDataPtr(this,this->blobs_[0].get()));
+ //[OFM]
+ if (this->bias_term_) {
+ new_tensor(biases(),biases_shape,GetDataPtr(this,this->blobs_[1].get()));
+ }
+
+ //[width, height, IFM]
+ new_tensor(input(),input_shape,InputdataPtr(this,bottom));
+ //[width, height, OFM]
+ new_tensor(output(),output_shape,OutputdataPtr(this,top));
+
+ acl_configure(conv,this,conv_info);
+}
+template <typename Dtype>
+void ACLConvolutionLayer<Dtype>::Reshape(
+ const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
+ ConvolutionLayer<Dtype>::Reshape(bottom, top);
+}
+
+template <typename Dtype>
+bool ACLConvolutionLayer<Dtype>::Bypass_acl(
+ const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
+ bool bypass_acl=false;
+  if (this->force_bypass_acl_path_|| ((openailab_intfp==0) && (this->group_>=5)) // for performance: a high group count degrades GPU throughput
+      || ((openailab_intfp != 0 && (top[0]->channels() / this->group_ == 1)))) {
+ bypass_acl=true;
+ }
+
+ ConvolutionParameter conv_param = this->layer_param_.convolution_param();
+ if (conv_param.kernel_size_size()>2 || this->num_spatial_axes_>2 || this->num_spatial_axes_==0) {
+ bypass_acl=true;
+ }
+ /* check dilation */
+ int dilated=0;
+
+ for(int i=0;i<this->num_spatial_axes_;i++)
+ {
+ const int *p=this->dilation_.cpu_data();
+
+ if(p[i]!=1)
+ dilated=1;
+ }
+ if(dilated) {
+ bypass_acl=true;
+ }
+
+
+ if((this->kernel_shape_.mutable_cpu_data()[1]==1||this->kernel_shape_.mutable_cpu_data()[0]==1) &&
+ isScheduleEnable()){
+ bypass_acl=true;
+ }
+ if((this->kernel_shape_.mutable_cpu_data()[1]==3||this->kernel_shape_.mutable_cpu_data()[0]==3) &&
+ (bottom[0]->channels()<150) && isScheduleEnable()){
+ bypass_acl=true;
+ }
+
+ return bypass_acl;
+}
+
+template <typename Dtype>
+void ACLConvolutionLayer<Dtype>::Forward_cpu(
+ const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
+ if(isGPUMode()){
+ Forward_gpu(bottom, top);
+ return;
+ }
+#ifdef USE_PROFILING
+ logtime_util log_time(ACL_CONV_INFO);
+#endif //USE_PROFILING
+
+ if (Bypass_acl(bottom,top)) {
+ ConvolutionLayer<Dtype>::Forward_cpu(bottom,top);
+ return;
+ }
+
+ SetupACLOperator(bottom,top);
+
+  // Run the ACL floating-point path.
+ if (openailab_intfp==0){
+ caffe::acl_run(this,bottom,top);
+ }
+ return;
+}
+
+template <typename Dtype>
+void ACLConvolutionLayer<Dtype>::Forward_gpu(
+ const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
+#ifdef USE_PROFILING
+ logtime_util log_time(ACL_CONV_INFO);
+#endif //USE_PROFILING
+ if (Bypass_acl(bottom,top)) {
+ ConvolutionLayer<Dtype>::Forward_cpu(bottom,top);
+ return;
+ }
+ SetupACLOperator(bottom,top);
+ caffe::acl_run(this,bottom,top);
+}
+
+template <typename Dtype>
+ACLConvolutionLayer<Dtype>::~ACLConvolutionLayer() {
+}
+
+#ifdef USE_ACL
+INSTANTIATE_CLASS(ACLConvolutionLayer);
+#endif
+
+} // namespace caffe
+#endif // USE_ACL
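
Note: the new ACLConvolutionLayer::Bypass_acl above bundles every condition that sends the layer back to the reference Caffe implementation: forced bypass, large group counts, more than two spatial axes, oversized kernel lists, dilation, and some small-kernel cases when the scheduler is active. A reduced standalone version of that decision, with hypothetical field names in place of the real layer members:

#include <cstdio>
#include <vector>

// Hypothetical description of a convolution; not the Caffe layer itself.
struct ConvDesc {
  bool force_bypass;
  int group;
  int num_spatial_axes;
  std::vector<int> kernel;    // per-axis kernel sizes
  std::vector<int> dilation;  // per-axis dilation factors
};

// Mirrors the shape of the checks in ACLConvolutionLayer::Bypass_acl: any
// condition ACL cannot (or should not) handle falls back to plain Caffe.
bool BypassAcl(const ConvDesc& c) {
  if (c.force_bypass) return true;
  if (c.group >= 5) return true;             // many groups hurt GPU performance
  if (c.num_spatial_axes != 2) return true;  // only 2-D convolutions offloaded
  if (c.kernel.size() > 2) return true;
  for (int d : c.dilation)
    if (d != 1) return true;                 // dilated convolution not offloaded
  return false;
}

int main() {
  ConvDesc plain{false, 1, 2, {3, 3}, {1, 1}};
  ConvDesc dilated{false, 1, 2, {3, 3}, {2, 2}};
  std::printf("plain:   %s\n", BypassAcl(plain) ? "bypass" : "ACL");
  std::printf("dilated: %s\n", BypassAcl(dilated) ? "bypass" : "ACL");
}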
diff --git a/src/caffe/layers/acl_inner_product_layer.cpp b/src/caffe/layers/acl_inner_product_layer.cpp
index 47d1011f..bb819fcc 100644
--- a/src/caffe/layers/acl_inner_product_layer.cpp
+++ b/src/caffe/layers/acl_inner_product_layer.cpp
@@ -1,6 +1,5 @@
#ifdef USE_ACL
#include <vector>
-
#include "caffe/filler.hpp"
#include "caffe/layers/acl_inner_product_layer.hpp"
#include "caffe/util/math_functions.hpp"
@@ -15,65 +14,30 @@ void ACLInnerProductLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
this->force_bypass_acl_path_= bypass_acl_class_layer & FLAGS_ENABLE_ACL_FC;
}
template <typename Dtype>
-void ACLInnerProductLayer<Dtype>::SetupACLLayer(const vector<Blob<Dtype>*>& bottom,
+void ACLInnerProductLayer<Dtype>::SetupACLOperator(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top){
- TensorShape weights_shape_t((unsigned int)this->K_, (unsigned int)this->N_);
- TensorShape weights_shape((unsigned int)this->N_, (unsigned int)this->K_);
- TensorShape biases_shape((unsigned int)this->N_);
- TensorShape input_shape((unsigned int)this->K_, (unsigned int)this->M_);
- TensorShape output_shape((unsigned int)this->N_, (unsigned int)this->M_);
- checkreshape(input_shape,Caffe::arm_gpu_mode());
- if (!this->init_layer_) return;
- this->init_layer_=false;
+ arm_compute::TensorShape weights_shape_t((unsigned int)this->K_, (unsigned int)this->N_);
+ arm_compute::TensorShape weights_shape((unsigned int)this->N_, (unsigned int)this->K_);
+ arm_compute::TensorShape biases_shape((unsigned int)this->N_);
+ arm_compute::TensorShape input_shape((unsigned int)this->K_, (unsigned int)this->M_);
+ arm_compute::TensorShape output_shape((unsigned int)this->N_, (unsigned int)this->M_);
+ if (is_operator_init_done(input_shape)) return;
+ set_operator_init_done();
// Initialize ACL.
- if (Caffe::arm_gpu_mode()) {
- new_gpulayer();
- }else{
- new_cpulayer();
- }
bool transpose = !this->layer_param_.inner_product_param().transpose();
- this->force_bypass_acl_path_ = false;
- if (Caffe::arm_gpu_mode()) {
- Dtype *top_data = top[0]->mutable_gpu_data();
- const Dtype* bottom_data = bottom[0]->gpu_data();
- if (transpose) {
- new_tensor(this->gpu().weights,weights_shape_t,(void*)(this->blobs_[0].get()->mutable_gpu_data()));
- }else{
- new_tensor(this->gpu().weights,weights_shape,(void*)(this->blobs_[0].get()->mutable_gpu_data()));
- }
- tensor_mem(this->gpu().weights,(void*)(this->blobs_[0].get()->mutable_gpu_data()));
- if (this->bias_term_) {
- new_tensor(this->gpu().biases,biases_shape,(void*)(this->blobs_[1].get()->mutable_gpu_data()));
- tensor_mem(this->gpu().biases,(void*)(this->blobs_[1].get()->mutable_gpu_data()));
- }
- new_tensor(this->gpu().input,input_shape,(void*)bottom_data);
- new_tensor(this->gpu().output,output_shape,(void*)top_data);
-#ifdef USE_PROFILING
- logtime_util log_time(ACL_CONFIG_INFO);
-#endif //USE_PROFILING
- this->gpu().layer->configure(this->gpu().input,this->gpu().weights,this->gpu().biases,this->gpu().output,transpose);
+ if (transpose) {
+ new_tensor(weights(),weights_shape_t,GetDataPtr(this,this->blobs_[0].get()));
}else{
- Dtype *top_data = top[0]->mutable_cpu_data();
- const Dtype* bottom_data = bottom[0]->cpu_data();
- if (transpose) {
- new_tensor(this->cpu().weights,weights_shape_t,(void*)(this->blobs_[0].get()->mutable_cpu_data()));
- }else{
- new_tensor(this->cpu().weights,weights_shape,(void*)(this->blobs_[0].get()->mutable_cpu_data()));
- }
- tensor_mem(this->cpu().weights,(void*)(this->blobs_[0].get()->mutable_cpu_data()));
- if (this->bias_term_) {
- new_tensor(this->cpu().biases,biases_shape,(void*)(this->blobs_[1].get()->mutable_cpu_data()));
- tensor_mem(this->cpu().biases,(void*)(this->blobs_[1].get()->mutable_cpu_data()));
- }
- new_tensor(this->cpu().input,input_shape,(void*)bottom_data);
- new_tensor(this->cpu().output,output_shape,(void*)top_data);
-#ifdef USE_PROFILING
- logtime_util log_time(ACL_CONFIG_INFO);
-#endif //USE_PROFILING
- this->cpu().layer->configure(this->cpu().input,this->cpu().weights,this->cpu().biases,this->cpu().output,transpose);
+ new_tensor(weights(),weights_shape,GetDataPtr(this,this->blobs_[0].get()));
+ }
+ if (this->bias_term_) {
+ new_tensor(biases(),biases_shape,GetDataPtr(this,this->blobs_[1].get()));
}
+ new_tensor(input(),input_shape,InputdataPtr(this,bottom));
+ new_tensor(output(),output_shape,OutputdataPtr(this,top));
+ acl_configure(fc,this,transpose);
}
template <typename Dtype>
void ACLInnerProductLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
@@ -82,25 +46,40 @@ void ACLInnerProductLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
}
template <typename Dtype>
+bool ACLInnerProductLayer<Dtype>::Bypass_acl(const vector<Blob<Dtype>*>& bottom,const vector<Blob<Dtype>*>& top){
+ bool bypass_acl=false;
+ if (this->force_bypass_acl_path_) {
+ bypass_acl=true;
+ }
+ return bypass_acl;
+}
+
+template <typename Dtype>
void ACLInnerProductLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top) {
- if(Caffe::arm_gpu_mode()){
+ if(isGPUMode()){
Forward_gpu(bottom, top);
return;
}
#ifdef USE_PROFILING
logtime_util log_time(ACL_FC_INFO);
#endif //USE_PROFILING
- if (this->force_bypass_acl_path_) {
+ if (Bypass_acl(bottom, top)) {
InnerProductLayer<Dtype>::Forward_cpu(bottom,top);
return;
}
- Dtype* top_data = top[0]->mutable_cpu_data();
- const Dtype* bottom_data = bottom[0]->cpu_data();
- SetupACLLayer(bottom,top);
- tensor_mem(this->cpu().input,(void*)(bottom_data));
- cpu_run();
- tensor_mem((void*)(top_data),this->cpu().output);
+ SetupACLOperator(bottom,top);
+
+ if (this->M_ != 1 && openailab_intfp != 0){
+ InnerProductLayer<Dtype>::Forward_cpu(bottom,top);
+ return;
+ }
+
+  // Run the ACL floating-point path.
+ if(openailab_intfp == 0){
+ caffe::acl_run(this,bottom,top);
+ }
+ return;
}
template <typename Dtype>
@@ -109,16 +88,12 @@ void ACLInnerProductLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom
#ifdef USE_PROFILING
logtime_util log_time(ACL_FC_INFO);
#endif //USE_PROFILING
- if (this->force_bypass_acl_path_) {
+ if (Bypass_acl(bottom, top)) {
InnerProductLayer<Dtype>::Forward_cpu(bottom,top);
return;
}
- Dtype* top_data = top[0]->mutable_gpu_data();
- const Dtype* bottom_data = bottom[0]->gpu_data();
- SetupACLLayer(bottom,top);
- tensor_mem(this->gpu().input,(void*)(bottom_data));
- gpu_run();
- tensor_mem((void*)(top_data),this->gpu().output);
+ SetupACLOperator(bottom,top);
+ caffe::acl_run(this,bottom,top);
}
template <typename Dtype>
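
Note: in the inner-product setup above the ACL transpose flag is the negation of the prototxt transpose parameter, and it selects between a (K, N) and an (N, K) weight shape. The sketch below restates only that selection; Shape2D and FcWeightShape are made-up names.

#include <cstdio>

// Stand-in for the two candidate FC weight shapes:
// (K, N) when the blob is treated as transposed, (N, K) otherwise.
struct Shape2D { unsigned x, y; };

Shape2D FcWeightShape(unsigned K, unsigned N, bool prototxt_transpose) {
  // Matches "bool transpose = !this->layer_param_...transpose();" above:
  // the flag passed to ACL is the negation of the Caffe parameter.
  const bool acl_transpose = !prototxt_transpose;
  return acl_transpose ? Shape2D{K, N} : Shape2D{N, K};
}

int main() {
  Shape2D a = FcWeightShape(4096, 1000, /*prototxt_transpose=*/false);
  Shape2D b = FcWeightShape(4096, 1000, /*prototxt_transpose=*/true);
  std::printf("default:    %ux%u\n", a.x, a.y);  // 4096x1000
  std::printf("transposed: %ux%u\n", b.x, b.y);  // 1000x4096
}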
diff --git a/src/caffe/layers/acl_local_connect_layer.cpp b/src/caffe/layers/acl_local_connect_layer.cpp
index 4eed72f9..1846faf4 100644
--- a/src/caffe/layers/acl_local_connect_layer.cpp
+++ b/src/caffe/layers/acl_local_connect_layer.cpp
@@ -15,20 +15,14 @@ void ACLLocalConnectLayer<Dtype>::LayerSetUp(
}
template <typename Dtype>
-void ACLLocalConnectLayer<Dtype>::SetupACLLayer(const vector<Blob<Dtype>*>& bottom,
+void ACLLocalConnectLayer<Dtype>::SetupACLOperator(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top){
- TensorShape input_shape((unsigned int)bottom[0]->width(), (unsigned int)bottom[0]->height(),(unsigned int)bottom[0]->channels(),(unsigned int)bottom[0]->num());
- checkreshape(input_shape,Caffe::arm_gpu_mode());
- if (!this->init_layer_) return;
- this->init_layer_=false;
- // Initialize ACL.
- if (Caffe::arm_gpu_mode()) {
- new_gpulayer();
- }else{
- new_cpulayer();
- }
- this->force_bypass_acl_path_=false;
+ arm_compute::TensorShape input_shape((unsigned int)bottom[0]->width(), (unsigned int)bottom[0]->height(),(unsigned int)bottom[0]->channels(),(unsigned int)bottom[0]->num());
+ if (is_operator_init_done(input_shape)) return;
+ set_operator_init_done();
+
+ // Initialize ACL.
ConvolutionParameter conv_param = this->layer_param_.convolution_param();
int stride_x =this->stride_;
int stride_y =this->stride_;
@@ -36,73 +30,23 @@ void ACLLocalConnectLayer<Dtype>::SetupACLLayer(const vector<Blob<Dtype>*>& bott
int pad_y=this->pad_;
unsigned int kernel_x=this->kernel_size_;
unsigned int kernel_y=this->kernel_size_;
- PadStrideInfo conv_info(stride_x,stride_y,pad_x,pad_y);
- TensorShape weights_shape(kernel_x,kernel_y,(unsigned int)this->channels_, (unsigned int)this->num_output_);
- TensorShape biases_shape ((unsigned int)this->num_output_);
- TensorShape output_shape((unsigned int)top[0]->width(), (unsigned int)top[0]->height(),(unsigned int)top[0]->channels(),(unsigned int)top[0]->num());
-
- if (Caffe::arm_gpu_mode()) {
- Dtype *top_data = top[0]->mutable_gpu_data();
- const Dtype* bottom_data = bottom[0]->gpu_data();
- //[kernel_x, kernel_y, IFM, OFM]
- new_tensor(this->gpu().weights,weights_shape,(void*)(this->blobs_[0].get()->mutable_gpu_data()));
- tensor_mem(this->gpu().weights,(void*)(this->blobs_[0].get()->mutable_gpu_data()));
- //[OFM]
- if (this->bias_term_) {
- new_tensor(this->gpu().biases,biases_shape,(void*)(this->blobs_[1].get()->mutable_gpu_data()));
- tensor_mem(this->gpu().biases,(void*)(this->blobs_[1].get()->mutable_gpu_data()));
- }
+ arm_compute::PadStrideInfo conv_info(stride_x,stride_y,pad_x,pad_y);
+ arm_compute::TensorShape weights_shape(kernel_x,kernel_y,(unsigned int)this->channels_, (unsigned int)this->num_output_);
+ arm_compute::TensorShape biases_shape ((unsigned int)this->num_output_);
+ arm_compute::TensorShape output_shape((unsigned int)top[0]->width(), (unsigned int)top[0]->height(),(unsigned int)top[0]->channels(),(unsigned int)top[0]->num());
- //[width, height, IFM]
- new_tensor(this->gpu().input,input_shape,(void*)bottom_data);
- //[width, height, OFM]
- new_tensor(this->gpu().output,output_shape,(void*)top_data);
-#ifdef USE_PROFILING
- {
- logtime_util log_time(ACL_CONFIG_INFO);
-#endif //USE_PROFILING
- this->gpu().layer->configure(this->gpu().input,this->gpu().weights,this->gpu().biases,this->gpu().output,conv_info);
-#ifdef USE_PROFILING
- }
-#endif //USE_PROFILING
-#ifdef USE_CONV_CACHE
- for(int i = 0; i < 16; ++i){
- fprintf(stderr, "<GPU>check cache[%d]\n", i);
- if(this->gpu().cache.layer[i] == nullptr){
- this->gpu().cache.layer[i] = this->gpu().layer;
- this->gpu().cache.input[i] = this->gpu().input;
- this->gpu().cache.output[i] = this->gpu().output;
- this->gpu().cache.weights[i] = this->gpu().weights;
- this->gpu().cache.biases[i] = this->gpu().biases;
- break;
- }
- }
-#endif //USE_CONV_CACHE
- }else{
- Dtype *top_data = top[0]->mutable_cpu_data();
- const Dtype* bottom_data = bottom[0]->cpu_data();
- //[kernel_x, kernel_y, IFM, OFM]
- new_tensor(this->cpu().weights,weights_shape,(void*)(this->blobs_[0].get()->mutable_cpu_data()));
- tensor_mem(this->cpu().weights,(void*)(this->blobs_[0].get()->mutable_cpu_data()));
- //[OFM]
- if (this->bias_term_) {
- new_tensor(this->cpu().biases,biases_shape,(void*)(this->blobs_[1].get()->mutable_cpu_data()));
- tensor_mem(this->cpu().biases,(void*)(this->blobs_[1].get()->mutable_cpu_data()));
- }
-
- //[width, height, IFM]
- new_tensor(this->cpu().input,input_shape,(void*)bottom_data);
- //[width, height, OFM]
- new_tensor(this->cpu().output,output_shape,(void*)top_data);
-#ifdef USE_PROFILING
- {
- logtime_util log_time(ACL_CONFIG_INFO);
-#endif //USE_PROFILING
- this->cpu().layer->configure(this->cpu().input,this->cpu().weights,this->cpu().biases,this->cpu().output,conv_info);
-#ifdef USE_PROFILING
- }
-#endif //USE_PROFILING
+ //[kernel_x, kernel_y, IFM, OFM]
+ new_tensor(weights(),weights_shape,GetDataPtr(this,this->blobs_[0].get()));
+ //[OFM]
+ if (this->bias_term_) {
+ new_tensor(biases(),biases_shape,GetDataPtr(this,this->blobs_[1].get()));
}
+
+ //[width, height, IFM]
+ new_tensor(input(),input_shape,InputdataPtr(this,bottom));
+ //[width, height, OFM]
+ new_tensor(output(),output_shape,OutputdataPtr(this,top));
+ acl_configure(lc,this,conv_info);
}
template <typename Dtype>
void ACLLocalConnectLayer<Dtype>::Reshape(
@@ -111,34 +55,37 @@ void ACLLocalConnectLayer<Dtype>::Reshape(
}
template <typename Dtype>
+bool ACLLocalConnectLayer<Dtype>::Bypass_acl(
+ const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top){
+ bool bypass_acl=false;
+ if (this->force_bypass_acl_path_) {
+ bypass_acl=true;
+ }
+
+ ConvolutionParameter conv_param = this->layer_param_.convolution_param();
+ if (conv_param.kernel_size_size()>2 ) {
+ bypass_acl=true;
+ }
+ return bypass_acl;
+}
+
+template <typename Dtype>
void ACLLocalConnectLayer<Dtype>::Forward_cpu(
const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
- if(Caffe::arm_gpu_mode()){
+ if(isGPUMode()){
Forward_gpu(bottom, top);
return;
}
#ifdef USE_PROFILING
logtime_util log_time(ACL_LC_INFO);
#endif //USE_PROFILING
- if (this->force_bypass_acl_path_) {
- LocalConnectLayer<Dtype>::Forward_cpu(bottom,top);
- return;
- }
-
- ConvolutionParameter conv_param = this->layer_param_.convolution_param();
- if (conv_param.kernel_size_size()>2 ) {
+ if (Bypass_acl(bottom,top)) {
LocalConnectLayer<Dtype>::Forward_cpu(bottom,top);
return;
}
- SetupACLLayer(bottom,top);
- for (int i = 0; i < bottom.size(); ++i) {
- const Dtype* bottom_data = bottom[i]->cpu_data();
- Dtype* top_data = top[i]->mutable_cpu_data();
- tensor_mem(this->cpu().input,(void*)bottom_data);
- cpu_run();
- tensor_mem((void*)top_data,this->cpu().output);
- }
+ SetupACLOperator(bottom,top);
+ caffe::acl_run(this,bottom,top);
}
template <typename Dtype>
@@ -148,22 +95,12 @@ void ACLLocalConnectLayer<Dtype>::Forward_gpu(
logtime_util log_time(ACL_LC_INFO);
#endif //USE_PROFILING
ConvolutionParameter conv_param = this->layer_param_.convolution_param();
- if (this->force_bypass_acl_path_) {
- LocalConnectLayer<Dtype>::Forward_cpu(bottom,top);
- return;
- }
- if (conv_param.kernel_size_size()>2 ) {
+ if (Bypass_acl(bottom,top)) {
LocalConnectLayer<Dtype>::Forward_cpu(bottom,top);
return;
}
- SetupACLLayer(bottom,top);
- for (int i = 0; i < bottom.size(); ++i) {
- const Dtype* bottom_data = bottom[i]->gpu_data();
- Dtype* top_data = top[i]->mutable_gpu_data();
- tensor_mem(this->gpu().input,(void*)bottom_data);
- gpu_run();
- tensor_mem((void*)top_data,this->gpu().output);
- }
+ SetupACLOperator(bottom,top);
+ caffe::acl_run(this,bottom,top);
}
template <typename Dtype>
diff --git a/src/caffe/layers/acl_lrn_layer.cpp b/src/caffe/layers/acl_lrn_layer.cpp
index db9630da..2a94e010 100644
--- a/src/caffe/layers/acl_lrn_layer.cpp
+++ b/src/caffe/layers/acl_lrn_layer.cpp
@@ -5,7 +5,7 @@
namespace caffe {
-const NormType IN_MAP=(arm_compute::NormType)0;
+const arm_compute::NormType IN_MAP=(arm_compute::NormType)0;
template <typename Dtype>
void ACLLRNLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top) {
@@ -13,46 +13,24 @@ void ACLLRNLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
this->force_bypass_acl_path_= bypass_acl_class_layer & FLAGS_ENABLE_ACL_LRN;
}
template <typename Dtype>
-void ACLLRNLayer<Dtype>::SetupACLLayer(const vector<Blob<Dtype>*>& bottom,
+void ACLLRNLayer<Dtype>::SetupACLOperator(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top){
- TensorShape shape((unsigned int)this->width_,(unsigned int)this->height_, (unsigned int)this->channels_);
- checkreshape(shape,Caffe::arm_gpu_mode());
- if (!this->init_layer_) return;
- // Initialize ACL.
- if (Caffe::arm_gpu_mode()) {
- new_gpulayer();
- }else{
- new_cpulayer();
- }
+ arm_compute::TensorShape shape((unsigned int)this->width_,(unsigned int)this->height_, (unsigned int)this->channels_);
+ if (is_operator_init_done(shape)) return;
+ set_operator_init_done();
- //this->force_bypass_acl_path_=false;
- NormalizationLayerInfo *norm_info;
+ // Initialize ACL.
+ arm_compute::NormalizationLayerInfo norm_info(IN_MAP, this->size_, this->alpha_, this->beta_, this->k_);
if(this->layer_param_.lrn_param().norm_region() == LRNParameter_NormRegion_WITHIN_CHANNEL)
- norm_info=new NormalizationLayerInfo(IN_MAP, this->size_, this->alpha_, this->beta_, this->k_);
+ norm_info=arm_compute::NormalizationLayerInfo(IN_MAP, this->size_, this->alpha_, this->beta_, this->k_);
else
- norm_info=new NormalizationLayerInfo(NormType::CROSS_MAP, this->size_, this->alpha_, this->beta_, this->k_);
+ norm_info=arm_compute::NormalizationLayerInfo(arm_compute::NormType::CROSS_MAP, this->size_, this->alpha_, this->beta_, this->k_);
+
+ new_tensor(input(),shape,InputdataPtr(this,bottom));
+ new_tensor(output(),shape,OutputdataPtr(this,top));
+ acl_configure(lrn,this,norm_info);
- if (Caffe::arm_gpu_mode()) {
- Dtype *top_data = top[0]->mutable_gpu_data();
- const Dtype* bottom_data = bottom[0]->gpu_data();
- new_tensor(this->gpu().input,shape,(void*)bottom_data);
- new_tensor(this->gpu().output,shape,(void*)top_data);
-#ifdef USE_PROFILING
- logtime_util log_time(ACL_CONFIG_INFO);
-#endif //USE_PROFILING
- this->gpu().layer->configure(this->gpu().input,this->gpu().output,*norm_info);
- }else{
- Dtype *top_data = top[0]->mutable_cpu_data();
- const Dtype* bottom_data = bottom[0]->cpu_data();
- new_tensor(this->cpu().input,shape,(void*)bottom_data);
- new_tensor(this->cpu().output,shape,(void*)top_data);
-#ifdef USE_PROFILING
- logtime_util log_time(ACL_CONFIG_INFO);
-#endif //USE_PROFILING
- this->cpu().layer->configure(this->cpu().input,this->cpu().output,*norm_info);
- }
- delete norm_info;
}
template <typename Dtype>
void ACLLRNLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
@@ -62,35 +40,41 @@ void ACLLRNLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
}
template <typename Dtype>
+bool ACLLRNLayer<Dtype>::Bypass_acl(const vector<Blob<Dtype>*>& bottom,
+ const vector<Blob<Dtype>*>& top){
+ bool bypass_acl=false;
+ if (this->force_bypass_acl_path_ || this->layer_param_.lrn_param().norm_region() == LRNParameter_NormRegion_WITHIN_CHANNEL) {
+ bypass_acl=true;
+ }
+ return bypass_acl;
+}
+
+template <typename Dtype>
void ACLLRNLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top) {
- if(Caffe::arm_gpu_mode()){
+ if(isGPUMode()){
Forward_gpu(bottom, top);
return;
}
#ifdef USE_PROFILING
logtime_util log_time(ACL_LRN_INFO);
#endif //USE_PROFILING
- if (this->force_bypass_acl_path_ || this->layer_param_.lrn_param().norm_region() == LRNParameter_NormRegion_WITHIN_CHANNEL) {
+ if (Bypass_acl(bottom, top)) {
LRNLayer<Dtype>::Forward_cpu(bottom,top);
return;
}
const Dtype* bottom_data = bottom[0]->cpu_data();
Dtype* top_data = top[0]->mutable_cpu_data();
- SetupACLLayer(bottom,top);
+ SetupACLOperator(bottom,top);
switch (this->layer_param_.lrn_param().norm_region()) {
case LRNParameter_NormRegion_ACROSS_CHANNELS:
for (int n = 0; n < this->num_; ++n) {
- tensor_mem(this->cpu().input,(void*)(bottom_data+ bottom[0]->offset(n)));
- cpu_run();
- tensor_mem((void*)(top_data + top[0]->offset(n)),this->cpu().output);
+ acl_run((void*)(bottom_data+ bottom[0]->offset(n)),(void*)(top_data + top[0]->offset(n)));
}
break;
case LRNParameter_NormRegion_WITHIN_CHANNEL:
for (int n = 0; n < bottom[0]->num(); ++n) {
- tensor_mem(this->cpu().input,(void*)(bottom_data));
- cpu_run();
- tensor_mem((void*)(top_data),this->cpu().output);
+ acl_run((void*)bottom_data,(void*)top_data);
bottom_data += bottom[0]->offset(0, 1);
top_data += top[0]->offset(0, 1);
}
@@ -106,26 +90,22 @@ void ACLLRNLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
#ifdef USE_PROFILING
logtime_util log_time(ACL_LRN_INFO);
#endif //USE_PROFILING
- if (this->force_bypass_acl_path_) {
+ if (Bypass_acl(bottom, top)) {
LRNLayer<Dtype>::Forward_cpu(bottom,top);
return;
}
const Dtype* bottom_data = bottom[0]->gpu_data();
Dtype* top_data = top[0]->mutable_gpu_data();
- SetupACLLayer(bottom,top);
+ SetupACLOperator(bottom,top);
switch (this->layer_param_.lrn_param().norm_region()) {
case LRNParameter_NormRegion_ACROSS_CHANNELS:
for (int n = 0; n < this->num_; ++n) {
- tensor_mem(this->gpu().input,(void*)(bottom_data+ bottom[0]->offset(n)));
- gpu_run();
- tensor_mem((void*)(top_data + top[0]->offset(n)),this->gpu().output);
+ acl_run((void*)(bottom_data+ bottom[0]->offset(n)),(void*)(top_data + top[0]->offset(n)));
}
break;
case LRNParameter_NormRegion_WITHIN_CHANNEL:
for (int n = 0; n < bottom[0]->num(); ++n) {
- tensor_mem(this->gpu().input,(void*)(bottom_data));
- gpu_run();
- tensor_mem((void*)(top_data),this->gpu().output);
+ acl_run((void*)bottom_data,(void*)top_data);
bottom_data += bottom[0]->offset(0, 1);
top_data += top[0]->offset(0, 1);
}
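
Note: for ACROSS_CHANNELS LRN the forward paths above run the configured ACL operator once per image, advancing the input and output pointers by the blob's per-image offset. A compact sketch of that batching structure (acl_run_once is a placeholder, not the real runner):

#include <cstdio>
#include <vector>

// Hypothetical runner: processes one image's worth of data per call.
void acl_run_once(const float* in, float* out, size_t per_image) {
  for (size_t i = 0; i < per_image; ++i) out[i] = in[i];  // placeholder kernel
}

int main() {
  const int num = 2, channels = 3, height = 2, width = 2;
  const size_t per_image = channels * height * width;   // == blob->offset(1)
  std::vector<float> bottom(num * per_image, 1.f), top(num * per_image, 0.f);

  // Same structure as ACLLRNLayer::Forward_cpu: one run per image n, with the
  // data pointers advanced by the per-image offset each iteration.
  for (int n = 0; n < num; ++n)
    acl_run_once(bottom.data() + n * per_image, top.data() + n * per_image,
                 per_image);

  std::printf("processed %zu values per image, %d images\n", per_image, num);
}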
diff --git a/src/caffe/layers/acl_pooling_layer.cpp b/src/caffe/layers/acl_pooling_layer.cpp
index f72b2235..f62fb5d4 100644
--- a/src/caffe/layers/acl_pooling_layer.cpp
+++ b/src/caffe/layers/acl_pooling_layer.cpp
@@ -12,48 +12,25 @@ void ACLPoolingLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
this->force_bypass_acl_path_= bypass_acl_class_layer & FLAGS_ENABLE_ACL_POOLING;
}
template <typename Dtype>
-void ACLPoolingLayer<Dtype>::SetupACLLayer(const vector<Blob<Dtype>*>& bottom,
+void ACLPoolingLayer<Dtype>::SetupACLOperator(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top){
- TensorShape in_shape ((unsigned int)this->width_, (unsigned int)this->height_,(unsigned int)this->channels_);
- TensorShape out_shape((unsigned int)this->pooled_width_, (unsigned int)this->pooled_height_,(unsigned int)this->channels_);
- checkreshape(in_shape,Caffe::arm_gpu_mode());
- if (!this->init_layer_) return;
- this->init_layer_=false;
- // Initialize ACL.
- if (Caffe::arm_gpu_mode()) {
- new_gpulayer();
- }else{
- new_cpulayer();
- }
+ arm_compute::TensorShape in_shape ((unsigned int)this->width_, (unsigned int)this->height_,(unsigned int)this->channels_);
+ arm_compute::TensorShape out_shape((unsigned int)this->pooled_width_, (unsigned int)this->pooled_height_,(unsigned int)this->channels_);
+ if (is_operator_init_done(in_shape)) return;
+ set_operator_init_done();
- this->force_bypass_acl_path_=false;
- PoolingLayerInfo *pool_info;
+ // Initialize ACL.
+ arm_compute::PoolingLayerInfo pool_info;
if(this->layer_param_.pooling_param().pool()==PoolingParameter_PoolMethod_MAX)
- pool_info=new PoolingLayerInfo(PoolingType::MAX, this->kernel_w_, PadStrideInfo(this->stride_w_,this->stride_h_,this->pad_w_,this->pad_h_,DimensionRoundingType::CEIL));
+ pool_info=arm_compute::PoolingLayerInfo(arm_compute::PoolingType::MAX, this->kernel_w_, arm_compute::PadStrideInfo(this->stride_w_,this->stride_h_,this->pad_w_,this->pad_h_,arm_compute::DimensionRoundingType::CEIL));
else
- pool_info=new PoolingLayerInfo(PoolingType::AVG, this->kernel_w_, PadStrideInfo(this->stride_w_,this->stride_h_,this->pad_w_,this->pad_h_,DimensionRoundingType::CEIL));
+ pool_info=arm_compute::PoolingLayerInfo(arm_compute::PoolingType::AVG, this->kernel_w_, arm_compute::PadStrideInfo(this->stride_w_,this->stride_h_,this->pad_w_,this->pad_h_,arm_compute::DimensionRoundingType::CEIL));
+
+ new_tensor(input(),in_shape,InputdataPtr(this,bottom));
+ new_tensor(output(),out_shape,OutputdataPtr(this,top));
+ acl_configure(pooling,this,pool_info);
- if (Caffe::arm_gpu_mode()) {
- Dtype *top_data = top[0]->mutable_gpu_data();
- const Dtype* bottom_data = bottom[0]->gpu_data();
- new_tensor(this->gpu().input,in_shape,(void*)bottom_data);
- new_tensor(this->gpu().output,out_shape,(void*)top_data);
-#ifdef USE_PROFILING
- logtime_util log_time(ACL_CONFIG_INFO);
-#endif //USE_PROFILING
- this->gpu().layer->configure(this->gpu().input,this->gpu().output,*pool_info);
- }else{
- Dtype *top_data = top[0]->mutable_cpu_data();
- const Dtype* bottom_data = bottom[0]->cpu_data();
- new_tensor(this->cpu().input,in_shape,(void*)bottom_data);
- new_tensor(this->cpu().output,out_shape,(void*)top_data);
-#ifdef USE_PROFILING
- logtime_util log_time(ACL_CONFIG_INFO);
-#endif //USE_PROFILING
- this->cpu().layer->configure(this->cpu().input,this->cpu().output,*pool_info);
- }
- delete pool_info;
}
template <typename Dtype>
void ACLPoolingLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
@@ -63,39 +40,44 @@ void ACLPoolingLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
}
template <typename Dtype>
+bool ACLPoolingLayer<Dtype>::Bypass_acl(const vector<Blob<Dtype>*>& bottom,
+ const vector<Blob<Dtype>*>& top){
+ bool bypass_acl=false;
+ if (this->force_bypass_acl_path_|| this->layer_param_.pooling_param().global_pooling()) {
+ bypass_acl=true;
+ }
+ if (this->layer_param_.pooling_param().pool()!=PoolingParameter_PoolMethod_MAX &&
+ this->layer_param_.pooling_param().pool()!=PoolingParameter_PoolMethod_AVE) {
+ bypass_acl=true;
+ }
+ if (this->kernel_h_!=this->kernel_w_) {
+ bypass_acl=true;
+ }
+ if (this->kernel_h_!=2 && this->kernel_h_!=3) {
+ bypass_acl=true;
+ }
+ return bypass_acl;
+}
+
+template <typename Dtype>
void ACLPoolingLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top) {
- if(Caffe::arm_gpu_mode()){
+ if(isGPUMode()){
Forward_gpu(bottom, top);
return;
}
#ifdef USE_PROFILING
logtime_util log_time(ACL_POOLING_INFO);
#endif //USE_PROFILING
- if (this->force_bypass_acl_path_|| this->layer_param_.pooling_param().global_pooling()) {
+ if (Bypass_acl(bottom,top)) {
PoolingLayer<Dtype>::Forward_cpu(bottom,top);
return;
}
const Dtype* bottom_data = bottom[0]->cpu_data();
Dtype* top_data = top[0]->mutable_cpu_data();
- if (this->layer_param_.pooling_param().pool()!=PoolingParameter_PoolMethod_MAX &&
- this->layer_param_.pooling_param().pool()!=PoolingParameter_PoolMethod_AVE) {
- PoolingLayer<Dtype>::Forward_cpu(bottom,top);
- return ;
- }
- if (this->kernel_h_!=this->kernel_w_ || top.size()>1) {
- PoolingLayer<Dtype>::Forward_cpu(bottom,top);
- return ;
- }
- if (this->kernel_h_!=2 && this->kernel_h_!=3) {
- PoolingLayer<Dtype>::Forward_cpu(bottom,top);
- return ;
- }
- SetupACLLayer(bottom,top);
+ SetupACLOperator(bottom,top);
for (int n = 0; n < bottom[0]->num(); ++n) {
- tensor_mem(this->cpu().input,(void*)(bottom_data));
- cpu_run();
- tensor_mem((void*)(top_data),this->cpu().output);
+ acl_run((void*)bottom_data,(void*)top_data);
bottom_data += bottom[0]->offset(1);
top_data += top[0]->offset(1);
}
@@ -107,30 +89,15 @@ void ACLPoolingLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
#ifdef USE_PROFILING
logtime_util log_time(ACL_POOLING_INFO);
#endif //USE_PROFILING
- if (this->force_bypass_acl_path_|| this->layer_param_.pooling_param().global_pooling()) {
+ if (Bypass_acl(bottom,top)) {
PoolingLayer<Dtype>::Forward_cpu(bottom,top);
- return;
+ return ;
}
const Dtype* bottom_data = bottom[0]->gpu_data();
Dtype* top_data = top[0]->mutable_gpu_data();
- if (this->layer_param_.pooling_param().pool()!=PoolingParameter_PoolMethod_MAX &&
- this->layer_param_.pooling_param().pool()!=PoolingParameter_PoolMethod_AVE) {
- PoolingLayer<Dtype>::Forward_cpu(bottom,top);
- return ;
- }
- if (this->kernel_h_!=this->kernel_w_) {
- PoolingLayer<Dtype>::Forward_cpu(bottom,top);
- return ;
- }
- if (this->kernel_h_!=2 && this->kernel_h_!=3) {
- PoolingLayer<Dtype>::Forward_cpu(bottom,top);
- return ;
- }
- SetupACLLayer(bottom,top);
+ SetupACLOperator(bottom,top);
for (int n = 0; n < bottom[0]->num(); ++n) {
- tensor_mem(this->gpu().input,(void*)(bottom_data));
- gpu_run();
- tensor_mem((void*)(top_data),this->gpu().output);
+ acl_run((void*)bottom_data,(void*)top_data);
bottom_data += bottom[0]->offset(1);
top_data += top[0]->offset(1);
}
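
The pooling hunks above show the shape of the refactor this patch applies to each ACL layer: a single Bypass_acl() predicate gathers the fall-back-to-plain-Caffe conditions, SetupACLOperator() replaces the per-device SetupACLLayer(), and the batch loop calls one acl_run() instead of tensor_mem()/cpu_run()/gpu_run(). Below is a minimal standalone sketch of that dispatch, using toy types rather than the real Caffe/ACL classes.

#include <cstddef>
#include <cstdio>
#include <vector>

struct ToyPoolingOp {
  bool force_bypass = false;
  int kernel_h = 2, kernel_w = 2;

  // Mirrors Bypass_acl() above: forced flag, asymmetric kernel,
  // or a kernel size the ACL path does not handle.
  bool Bypass_acl() const {
    if (force_bypass) return true;
    if (kernel_h != kernel_w) return true;
    if (kernel_h != 2 && kernel_h != 3) return true;
    return false;
  }

  // Stand-ins for PoolingLayer<Dtype>::Forward_cpu and the ACL kernel.
  void reference_forward(const std::vector<float>& in, std::vector<float>& out) { out = in; }
  void acl_run(const float* in, float* out, std::size_t n) {
    for (std::size_t i = 0; i < n; ++i) out[i] = in[i];
  }

  void Forward(const std::vector<float>& bottom, std::vector<float>& top,
               int num, std::size_t per_image) {
    if (Bypass_acl()) { reference_forward(bottom, top); return; }
    top.resize(bottom.size());
    const float* src = bottom.data();
    float* dst = top.data();
    for (int n = 0; n < num; ++n) {  // one acl_run per image, as in the patch
      acl_run(src, dst, per_image);
      src += per_image;
      dst += per_image;
    }
  }
};

int main() {
  ToyPoolingOp op;
  std::vector<float> bottom(8, 1.0f), top;
  op.Forward(bottom, top, /*num=*/2, /*per_image=*/4);
  std::printf("used %s path\n", op.Bypass_acl() ? "bypass" : "acl");
  return 0;
}
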
diff --git a/src/caffe/layers/acl_relu_layer.cpp b/src/caffe/layers/acl_relu_layer.cpp
index 03194539..2b712dda 100644
--- a/src/caffe/layers/acl_relu_layer.cpp
+++ b/src/caffe/layers/acl_relu_layer.cpp
@@ -13,9 +13,9 @@ void ACLReLULayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
this->force_bypass_acl_path_= bypass_acl_class_layer & FLAGS_ENABLE_ACL_RELU;
}
template <typename Dtype>
-void ACLReLULayer<Dtype>::SetupACLLayer(const vector<Blob<Dtype>*>& bottom,
+void ACLReLULayer<Dtype>::SetupACLOperator(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top){
- ACLBaseActivationLayer<Dtype>::SetupACLLayer(bottom, top,ActivationLayerInfo::ActivationFunction::RELU);
+ ACLBaseActivationLayer<Dtype>::SetupACLOperator(bottom, top,arm_compute::ActivationLayerInfo::ActivationFunction::RELU);
}
template <typename Dtype>
void ACLReLULayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
@@ -25,20 +25,32 @@ void ACLReLULayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
}
template <typename Dtype>
+bool ACLReLULayer<Dtype>::Bypass_acl(const vector<Blob<Dtype>*>& bottom,
+ const vector<Blob<Dtype>*>& top) {
+ bool bypass_acl=false;
+ if (this->force_bypass_acl_path_) {
+ bypass_acl=true;
+ }
+ // Fallback to standard Caffe for leaky ReLU.
+ if (ReLULayer<Dtype>::layer_param_.relu_param().negative_slope() != 0) {
+ bypass_acl=true;
+ }
+ if (isScheduleEnable()) {
+ bypass_acl=true;
+ }
+ return bypass_acl;
+}
+
+template <typename Dtype>
void ACLReLULayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top) {
#ifdef USE_PROFILING
logtime_util log_time(ACL_RELU_INFO);
#endif //USE_PROFILING
- if (this->force_bypass_acl_path_) {
+ if (Bypass_acl(bottom,top)) {
ReLULayer<Dtype>::Forward_cpu(bottom,top);
return;
}
- // Fallback to standard Caffe for leaky ReLU.
- if (ReLULayer<Dtype>::layer_param_.relu_param().negative_slope() != 0) {
- ReLULayer<Dtype>::Forward_cpu(bottom, top);
- return;
- }
ACLBaseActivationLayer<Dtype>::Forward_cpu(bottom,top);
}
@@ -48,12 +60,7 @@ void ACLReLULayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
#ifdef USE_PROFILING
logtime_util log_time(ACL_RELU_INFO);
#endif //USE_PROFILING
- if (this->force_bypass_acl_path_) {
- ReLULayer<Dtype>::Forward_cpu(bottom,top);
- return;
- }
- // Fallback to standard Caffe for leaky ReLU.
- if (ReLULayer<Dtype>::layer_param_.relu_param().negative_slope() != 0) {
+ if (Bypass_acl(bottom,top)) {
ReLULayer<Dtype>::Forward_cpu(bottom, top);
return;
}
diff --git a/src/caffe/layers/acl_sigmoid_layer.cpp b/src/caffe/layers/acl_sigmoid_layer.cpp
index eac15651..4b3f660c 100644
--- a/src/caffe/layers/acl_sigmoid_layer.cpp
+++ b/src/caffe/layers/acl_sigmoid_layer.cpp
@@ -14,9 +14,9 @@ void ACLSigmoidLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
}
template <typename Dtype>
-void ACLSigmoidLayer<Dtype>::SetupACLLayer(const vector<Blob<Dtype>*>& bottom,
- const vector<Blob<Dtype>*>& top,ActivationLayerInfo::ActivationFunction type){
- ACLBaseActivationLayer<Dtype>::SetupACLLayer(bottom, top,ActivationLayerInfo::ActivationFunction::LOGISTIC);
+void ACLSigmoidLayer<Dtype>::SetupACLOperator(const vector<Blob<Dtype>*>& bottom,
+ const vector<Blob<Dtype>*>& top,arm_compute::ActivationLayerInfo::ActivationFunction type){
+ ACLBaseActivationLayer<Dtype>::SetupACLOperator(bottom, top,arm_compute::ActivationLayerInfo::ActivationFunction::LOGISTIC);
}
template <typename Dtype>
void ACLSigmoidLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
@@ -26,12 +26,21 @@ void ACLSigmoidLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
}
template <typename Dtype>
+bool ACLSigmoidLayer<Dtype>::Bypass_acl(const vector<Blob<Dtype>*>& bottom,const vector<Blob<Dtype>*>& top){
+ bool bypass_acl=false;
+ if (this->force_bypass_acl_path_) {
+ bypass_acl=true;
+ }
+ return bypass_acl;
+}
+
+template <typename Dtype>
void ACLSigmoidLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top) {
#ifdef USE_PROFILING
logtime_util log_time(ACL_SIGMOID_INFO);
#endif //USE_PROFILING
- if (this->force_bypass_acl_path_) {
+ if (Bypass_acl(bottom,top)) {
SigmoidLayer<Dtype>::Forward_cpu(bottom,top);
return;
}
@@ -44,7 +53,7 @@ void ACLSigmoidLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
#ifdef USE_PROFILING
logtime_util log_time(ACL_SIGMOID_INFO);
#endif //USE_PROFILING
- if (this->force_bypass_acl_path_) {
+ if (Bypass_acl(bottom,top)) {
SigmoidLayer<Dtype>::Forward_cpu(bottom,top);
return;
}
diff --git a/src/caffe/layers/acl_softmax_layer.cpp b/src/caffe/layers/acl_softmax_layer.cpp
index 1568d3aa..6d3cd93e 100644
--- a/src/caffe/layers/acl_softmax_layer.cpp
+++ b/src/caffe/layers/acl_softmax_layer.cpp
@@ -13,42 +13,19 @@ void ACLSoftmaxLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
this->force_bypass_acl_path_= bypass_acl_class_layer & FLAGS_ENABLE_ACL_SOFTMAX;
}
template <typename Dtype>
-void ACLSoftmaxLayer<Dtype>::SetupACLLayer(const vector<Blob<Dtype>*>& bottom,
+void ACLSoftmaxLayer<Dtype>::SetupACLOperator(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top){
unsigned int channels = bottom[0]->shape(this->softmax_axis_);
- TensorShape shape(channels*this->inner_num_);
- checkreshape(shape,Caffe::arm_gpu_mode());
- if (!this->init_layer_) return;
- this->init_layer_=false;
+ arm_compute::TensorShape shape(channels*this->inner_num_);
+ if (is_operator_init_done(shape)) return;
+ set_operator_init_done();
// Initialize ACL.
- if (Caffe::arm_gpu_mode()) {
- new_gpulayer();
- }else{
- new_cpulayer();
- }
+ new_tensor(input(),shape,InputdataPtr(this,bottom));
+ new_tensor(output(),shape,OutputdataPtr(this,top));
+ acl_configure(softmax,this,NULL);
- //this->force_bypass_acl_path_=false;
- if (Caffe::arm_gpu_mode()) {
- Dtype *top_data = top[0]->mutable_gpu_data();
- const Dtype* bottom_data = bottom[0]->gpu_data();
- new_tensor(this->gpu().input,shape,(void*)bottom_data);
- new_tensor(this->gpu().output,shape,(void*)top_data);
-#ifdef USE_PROFILING
- logtime_util log_time(ACL_CONFIG_INFO);
-#endif //USE_PROFILING
- this->gpu().layer->configure(this->gpu().input,this->gpu().output);
- }else{
- Dtype *top_data = top[0]->mutable_cpu_data();
- const Dtype* bottom_data = bottom[0]->cpu_data();
- new_tensor(this->cpu().input,shape,(void*)bottom_data);
- new_tensor(this->cpu().output,shape,(void*)top_data);
-#ifdef USE_PROFILING
- logtime_util log_time(ACL_CONFIG_INFO);
-#endif //USE_PROFILING
- this->cpu().layer->configure(this->cpu().input,this->cpu().output);
- }
}
template <typename Dtype>
void ACLSoftmaxLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
@@ -57,29 +34,36 @@ void ACLSoftmaxLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
}
template <typename Dtype>
+bool ACLSoftmaxLayer<Dtype>::Bypass_acl(const vector<Blob<Dtype>*>& bottom,const vector<Blob<Dtype>*>& top){
+ bool bypass_acl=false;
+ if (this->force_bypass_acl_path_ || this->inner_num_>1) {
+ bypass_acl=true;
+ }
+ return bypass_acl;
+}
+
+template <typename Dtype>
void ACLSoftmaxLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top) {
- if(Caffe::arm_gpu_mode()){
+ if(isGPUMode()){
Forward_gpu(bottom, top);
return;
}
#ifdef USE_PROFILING
logtime_util log_time(ACL_SOFTMAX_INFO);
#endif //USE_PROFILING
- if (this->force_bypass_acl_path_ || this->inner_num_>1) {
+ if (Bypass_acl(bottom,top)) {
SoftmaxLayer<Dtype>::Forward_cpu(bottom,top);
return ;
}
const Dtype* bottom_data = bottom[0]->cpu_data();
Dtype* top_data = top[0]->mutable_cpu_data();
- SetupACLLayer(bottom,top);
+ SetupACLOperator(bottom,top);
int channels = bottom[0]->shape(this->softmax_axis_);
for (int i = 0; i < this->outer_num_; ++i) {
- tensor_mem(this->cpu().input,(void*)(bottom_data));
- cpu_run();
- tensor_mem((void*)(top_data),this->cpu().output);
+ acl_run((void*)bottom_data,(void*)top_data);
top_data += channels;
bottom_data += channels;
}
@@ -91,17 +75,15 @@ void ACLSoftmaxLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
#ifdef USE_PROFILING
logtime_util log_time(ACL_SOFTMAX_INFO);
#endif //USE_PROFILING
- if (this->force_bypass_acl_path_|| this->inner_num_>1) {
+ if (Bypass_acl(bottom,top)) {
SoftmaxLayer<Dtype>::Forward_cpu(bottom,top);
return;
}
const Dtype* bottom_data = bottom[0]->gpu_data();
Dtype* top_data = top[0]->mutable_gpu_data();
- SetupACLLayer(bottom,top);
+ SetupACLOperator(bottom,top);
for (int i = 0; i < this->outer_num_; ++i) {
- tensor_mem(this->gpu().input,(void*)(bottom_data));
- gpu_run();
- tensor_mem((void*)(top_data),this->gpu().output);
+ acl_run((void*)bottom_data,(void*)top_data);
top_data += this->inner_num_;
bottom_data += this->inner_num_;
}
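
The softmax setup above replaces the old checkreshape()/init_layer_ bookkeeping with an is_operator_init_done(shape)/set_operator_init_done() guard, so tensors are created and acl_configure() runs only when the input shape actually changes between forwards. Below is a self-contained sketch of that guard, with invented names rather than the real acl_operator API.

#include <cstdio>
#include <vector>

class LazyOperator {
 public:
  // Returns true when the operator was already configured for this shape.
  bool is_operator_init_done(const std::vector<unsigned>& shape) const {
    return init_done_ && shape == configured_shape_;
  }
  void configure(const std::vector<unsigned>& shape) {
    if (is_operator_init_done(shape)) return;  // fast path on repeated forwards
    configured_shape_ = shape;                 // remember what was configured
    init_done_ = true;                         // set_operator_init_done()
    std::puts("configure: create tensors, call acl_configure(...)");
  }
 private:
  std::vector<unsigned> configured_shape_;
  bool init_done_ = false;
};

int main() {
  LazyOperator op;
  op.configure({10});  // first forward: configures
  op.configure({10});  // same shape: skipped
  op.configure({20});  // shape changed (e.g. after Reshape): reconfigures
  return 0;
}
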
diff --git a/src/caffe/layers/acl_tanh_layer.cpp b/src/caffe/layers/acl_tanh_layer.cpp
index a1bb632c..9c1066e1 100644
--- a/src/caffe/layers/acl_tanh_layer.cpp
+++ b/src/caffe/layers/acl_tanh_layer.cpp
@@ -14,9 +14,9 @@ void ACLTanHLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
}
template <typename Dtype>
-void ACLTanHLayer<Dtype>::SetupACLLayer(const vector<Blob<Dtype>*>& bottom,
- const vector<Blob<Dtype>*>& top, ActivationLayerInfo::ActivationFunction type){
- ACLBaseActivationLayer<Dtype>::SetupACLLayer(bottom, top,ActivationLayerInfo::ActivationFunction::TANH);
+void ACLTanHLayer<Dtype>::SetupACLOperator(const vector<Blob<Dtype>*>& bottom,
+ const vector<Blob<Dtype>*>& top, arm_compute::ActivationLayerInfo::ActivationFunction type){
+ ACLBaseActivationLayer<Dtype>::SetupACLOperator(bottom, top,arm_compute::ActivationLayerInfo::ActivationFunction::TANH);
}
template <typename Dtype>
@@ -27,12 +27,20 @@ void ACLTanHLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
}
template <typename Dtype>
+bool ACLTanHLayer<Dtype>::Bypass_acl(const vector<Blob<Dtype>*>& bottom,const vector<Blob<Dtype>*>& top){
+ bool bypass_acl=false;
+ if (this->force_bypass_acl_path_) {
+ bypass_acl=true;
+ }
+ return bypass_acl;
+}
+template <typename Dtype>
void ACLTanHLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top) {
#ifdef USE_PROFILING
logtime_util log_time(ACL_TANH_INFO);
#endif //USE_PROFILING
- if (this->force_bypass_acl_path_) {
+ if (Bypass_acl(bottom,top)) {
TanHLayer<Dtype>::Forward_cpu(bottom,top);
return;
}
@@ -45,7 +53,7 @@ void ACLTanHLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
#ifdef USE_PROFILING
logtime_util log_time(ACL_TANH_INFO);
#endif //USE_PROFILING
- if (this->force_bypass_acl_path_) {
+ if (Bypass_acl(bottom,top)) {
TanHLayer<Dtype>::Forward_cpu(bottom,top);
return;
}
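
The ReLU, Sigmoid and TanH wrappers differ only in which arm_compute::ActivationLayerInfo::ActivationFunction value they pass down to the shared ACLBaseActivationLayer. Below is a toy illustration of that delegation; the enum and classes are illustrative, not the ACL types.

#include <cmath>
#include <cstdio>

enum class Act { RELU, LOGISTIC, TANH };

struct BaseActivationOp {
  void SetupOperator(Act type) { type_ = type; }
  float run(float x) const {
    switch (type_) {
      case Act::RELU:     return x > 0.f ? x : 0.f;
      case Act::LOGISTIC: return 1.f / (1.f + std::exp(-x));
      case Act::TANH:     return std::tanh(x);
    }
    return x;
  }
  Act type_ = Act::RELU;
};

// Each wrapper only picks its activation, as SetupACLOperator() does above.
struct ReluOp    : BaseActivationOp { void Setup() { SetupOperator(Act::RELU); } };
struct SigmoidOp : BaseActivationOp { void Setup() { SetupOperator(Act::LOGISTIC); } };
struct TanhOp    : BaseActivationOp { void Setup() { SetupOperator(Act::TANH); } };

int main() {
  SigmoidOp s; s.Setup();
  std::printf("sigmoid(0) = %.2f\n", s.run(0.f));  // prints 0.50
  return 0;
}
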
diff --git a/tools/extra/tpi.py b/tools/extra/tpi.py
new file mode 100644
index 00000000..f455ed9b
--- /dev/null
+++ b/tools/extra/tpi.py
@@ -0,0 +1,213 @@
+import sys
+import os
+import re
+import pdb
+import xlwt
+
+help_ = '''
+Usage:
+ python tpi.py log.txt
+'''
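+# Output: the timing tables are printed to stdout and also written to <logfile>.xls via xlwt.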
+
+#data_list= {}
+data_list1= []
+data_list2= []
+cnt=0
+times=0.0
+table_val=''
+name_list1= ['allocate','run','configure','tensor_copy','ACL_CONV','ACL_FC','ACL_LRN','ACL_POOLING','ACL_RELU','ACL_SOFTMAX']
+
+
+
+
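+# Return the first two non-empty tokens of a split log line as a (name, value) pair.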
+def getvalpairs(words):
+ name=''
+ val=''
+ for word in words:
+ if word=='':
+ continue
+ if name=='':
+ name=word
+ else:
+ val=word
+ break;
+ #print word,
+ #print ''
+ return (name,val)
+
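+# Accumulate val into the existing entry with this name, or append a new {'idx','val','name'} record.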
+def addpairstolist(db,name,val,idx):
+ #pdb.set_trace()
+ #if idx in db:
+ # db[idx]['val'] += val
+ #else:
+ # db[idx] = {'val':val,'name':name}
+
+ #pdb.set_trace()
+
+ for i in db:
+ if i['name']==name:
+ i['val'] += val
+ return
+ db.append({'idx':idx,'val':val,'name':name})
+
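+# Return the tab depth of a timing line: the 1-based index of the first non-empty tab-separated field after the colon.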
+def gettabnum(line):
+ start=line.find(':')
+ if start==-1:
+ start=0
+ else:
+ start+=1
+ #pdb.set_trace()
+    rest=line[start:-1].lstrip(' ')
+    words=re.split('\t',rest)
+ idx=0
+ for word in words:
+ idx+=1
+ if word=='':
+ continue
+ break
+ return idx
+
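+# Parse the profiling log: collect (name, time) entries into data_list1, then into data_list2 once a "used time" marker line is seen.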
+def decodefile(logfile):
+ data_list=data_list1
+ for line in open(logfile):
+ if line.find(':')==-1:
+ continue
+ #pdb.set_trace()
+ #print line,
+ idx=gettabnum(line)
+ words=re.split('\t|:| |\r|\n',line)
+ #print(words)
+ (name,val)=getvalpairs(words)
+ #print (name,float(val),eval(val))
+ if name == 'used' and val == 'time':
+ data_list=data_list2
+ try:
+ addpairstolist(data_list,name,float(val),idx)
+ except ValueError as e:
+ #print(line)
+ continue
+
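+# Sort entries by tab depth, sum the ACL layer times into TPI, print the averages (divided by 'times') and write them to the xls sheet.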
+def printresult(db):
+ #for i in db:
+ # print i, db[i]['idx'],db[i]['val']
+ #pdb.set_trace()
+ db.sort(key=lambda obj:obj.get('idx'), reverse=False)
+ tpi_start=0
+ conv_str='ACL_CONV'
+ find_acl = 0
+ name_index=0
+ global trow
+ global tcol
+ for i in db:
+ if i['name']==conv_str:
+ tpi_start=i['idx']
+
+ tpi=0
+ for i in db:
+ if i['idx']>=tpi_start:
+ tpi+=i['val']
+
+ start=len('ACL_')
+
+ table_head='TPI'+'\t'
+ table_val='%.4f' % (tpi/times)+'\t'
+
+ for i in db:
+ #print i
+ if i['idx']<tpi_start:
+ if i['name'].find('ACL_')==0:
+ table_head+=i['name'][start:]+'\t'
+ else:
+ table_head+=i['name']+'\t'
+ table_val+='%.4f' % (i['val']/times)+'\t'
+
+ print(table_head)
+ print(table_val)
+
+ table_head='TPI'+'\t'
+ table_val='%.4f' % (tpi/times)+'\t'
+
+ for i in db:
+ if i['idx']>=tpi_start:
+ if i['name'].find('ACL_')==0:
+ #pdb.set_trace()
+ table_head+=i['name'][start:]+'\t'
+ else:
+ table_head+=i['name']+'\t'
+ table_val+='%.4f' % (i['val']/times)+'\t'
+
+ print(table_head)
+ print(table_val)
+
+ ws.write(trow, tcol, 'TPI')
+ ws.write(trow+1,tcol,'%.4f' % (tpi/times))
+ tcol+=1
+
+ temp_row=trow
+ temp_col=tcol
+ for i in name_list1:
+ if i.find('ACL_')==0 and find_acl==0:
+ temp_row+=2
+ temp_col=2
+ find_acl=1
+ ws.write(temp_row,temp_col,i)
+ ws.write(temp_row+1,temp_col,'0')
+ temp_col+=1
+ find_acl=0
+
+ for i in db:
+ curname=i['name']
+ curvalue='%.4f' % (i['val']/times)
+ if curname == 'ACL_BN':
+ ws.write(trow+2,7,curname)
+ ws.write(trow+3,7,curvalue)
+
+ if curname in name_list1:
+ val_col=name_list1.index(curname)+2
+ val_row=trow
+ # print ('name found'+ curname + curvalue)
+ # print(val_col)
+ # print (val_row)
+ if val_col>5:
+ val_col-=4
+ val_row+=2
+ ws.write(val_row,val_col,curname)
+ ws.write(val_row+1,val_col,curvalue)
+
+ tcol=0
+ trow+=4
+
+
+if __name__ == '__main__' :
+ if len(sys.argv) < 2:
+ print(help_)
+ sys.exit()
+ else:
+ logfile = sys.argv[1]
+
+ filename = os.path.basename(logfile)
+ decodefile(logfile)
+
+ wb = xlwt.Workbook()
+ ws = wb.add_sheet('testsheet',True)
+ trow = 0
+ tcol = 0
+ cnt=1
+ times=1.0
+ table_val=''
+ print('1st time:')
+ ws.write(trow,tcol,'1st time')
+ tcol+=1
+ printresult(data_list1)
+
+ cnt=2
+ times=10.0
+ table_val=''
+ print('\nAverage of 2-11 times:')
+ ws.write(trow, tcol, '2-11 times')
+ tcol+=1
+ printresult(data_list2)
+ wb.save(filename+'.xls')
+ print ('Xls file generated')
+
+